1 /* 2 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com 3 * Written by Alex Tomas <alex@clusterfs.com> 4 * 5 * Architecture independence: 6 * Copyright (c) 2005, Bull S.A. 7 * Written by Pierre Peiffer <pierre.peiffer@bull.net> 8 * 9 * This program is free software; you can redistribute it and/or modify 10 * it under the terms of the GNU General Public License version 2 as 11 * published by the Free Software Foundation. 12 * 13 * This program is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU General Public License for more details. 17 * 18 * You should have received a copy of the GNU General Public Licens 19 * along with this program; if not, write to the Free Software 20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- 21 */ 22 23 /* 24 * Extents support for EXT4 25 * 26 * TODO: 27 * - ext4*_error() should be used in some situations 28 * - analyze all BUG()/BUG_ON(), use -EIO where appropriate 29 * - smart tree reduction 30 */ 31 32 #include <linux/module.h> 33 #include <linux/fs.h> 34 #include <linux/time.h> 35 #include <linux/jbd2.h> 36 #include <linux/highuid.h> 37 #include <linux/pagemap.h> 38 #include <linux/quotaops.h> 39 #include <linux/string.h> 40 #include <linux/slab.h> 41 #include <linux/falloc.h> 42 #include <asm/uaccess.h> 43 #include <linux/fiemap.h> 44 #include "ext4_jbd2.h" 45 #include "ext4_extents.h" 46 47 #include <trace/events/ext4.h> 48 49 static int ext4_split_extent(handle_t *handle, 50 struct inode *inode, 51 struct ext4_ext_path *path, 52 struct ext4_map_blocks *map, 53 int split_flag, 54 int flags); 55 56 static int ext4_ext_truncate_extend_restart(handle_t *handle, 57 struct inode *inode, 58 int needed) 59 { 60 int err; 61 62 if (!ext4_handle_valid(handle)) 63 return 0; 64 if (handle->h_buffer_credits > needed) 65 return 0; 66 err = ext4_journal_extend(handle, needed); 67 if (err <= 0) 68 return err; 69 err = ext4_truncate_restart_trans(handle, inode, needed); 70 if (err == 0) 71 err = -EAGAIN; 72 73 return err; 74 } 75 76 /* 77 * could return: 78 * - EROFS 79 * - ENOMEM 80 */ 81 static int ext4_ext_get_access(handle_t *handle, struct inode *inode, 82 struct ext4_ext_path *path) 83 { 84 if (path->p_bh) { 85 /* path points to block */ 86 return ext4_journal_get_write_access(handle, path->p_bh); 87 } 88 /* path points to leaf/index in inode body */ 89 /* we use in-core data, no need to protect them */ 90 return 0; 91 } 92 93 /* 94 * could return: 95 * - EROFS 96 * - ENOMEM 97 * - EIO 98 */ 99 static int ext4_ext_dirty(handle_t *handle, struct inode *inode, 100 struct ext4_ext_path *path) 101 { 102 int err; 103 if (path->p_bh) { 104 /* path points to block */ 105 err = ext4_handle_dirty_metadata(handle, inode, path->p_bh); 106 } else { 107 /* path points to leaf/index in inode body */ 108 err = ext4_mark_inode_dirty(handle, inode); 109 } 110 return err; 111 } 112 113 static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, 114 struct ext4_ext_path *path, 115 ext4_lblk_t block) 116 { 117 struct ext4_inode_info *ei = EXT4_I(inode); 118 ext4_fsblk_t bg_start; 119 ext4_fsblk_t last_block; 120 ext4_grpblk_t colour; 121 ext4_group_t block_group; 122 int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); 123 int depth; 124 125 if (path) { 126 struct ext4_extent *ex; 127 depth = path->p_depth; 128 129 /* 130 * Try to predict block placement assuming that we are 131 * filling in a file which will eventually be 132 * non-sparse --- i.e., in the case of libbfd writing 133 * an ELF object sections out-of-order but in a way 134 * the eventually results in a contiguous object or 135 * executable file, or some database extending a table 136 * space file. However, this is actually somewhat 137 * non-ideal if we are writing a sparse file such as 138 * qemu or KVM writing a raw image file that is going 139 * to stay fairly sparse, since it will end up 140 * fragmenting the file system's free space. Maybe we 141 * should have some hueristics or some way to allow 142 * userspace to pass a hint to file system, 143 * especially if the latter case turns out to be 144 * common. 145 */ 146 ex = path[depth].p_ext; 147 if (ex) { 148 ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex); 149 ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block); 150 151 if (block > ext_block) 152 return ext_pblk + (block - ext_block); 153 else 154 return ext_pblk - (ext_block - block); 155 } 156 157 /* it looks like index is empty; 158 * try to find starting block from index itself */ 159 if (path[depth].p_bh) 160 return path[depth].p_bh->b_blocknr; 161 } 162 163 /* OK. use inode's group */ 164 block_group = ei->i_block_group; 165 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { 166 /* 167 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 168 * block groups per flexgroup, reserve the first block 169 * group for directories and special files. Regular 170 * files will start at the second block group. This 171 * tends to speed up directory access and improves 172 * fsck times. 173 */ 174 block_group &= ~(flex_size-1); 175 if (S_ISREG(inode->i_mode)) 176 block_group++; 177 } 178 bg_start = ext4_group_first_block_no(inode->i_sb, block_group); 179 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; 180 181 /* 182 * If we are doing delayed allocation, we don't need take 183 * colour into account. 184 */ 185 if (test_opt(inode->i_sb, DELALLOC)) 186 return bg_start; 187 188 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) 189 colour = (current->pid % 16) * 190 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); 191 else 192 colour = (current->pid % 16) * ((last_block - bg_start) / 16); 193 return bg_start + colour + block; 194 } 195 196 /* 197 * Allocation for a meta data block 198 */ 199 static ext4_fsblk_t 200 ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, 201 struct ext4_ext_path *path, 202 struct ext4_extent *ex, int *err, unsigned int flags) 203 { 204 ext4_fsblk_t goal, newblock; 205 206 goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); 207 newblock = ext4_new_meta_blocks(handle, inode, goal, flags, 208 NULL, err); 209 return newblock; 210 } 211 212 static inline int ext4_ext_space_block(struct inode *inode, int check) 213 { 214 int size; 215 216 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) 217 / sizeof(struct ext4_extent); 218 if (!check) { 219 #ifdef AGGRESSIVE_TEST 220 if (size > 6) 221 size = 6; 222 #endif 223 } 224 return size; 225 } 226 227 static inline int ext4_ext_space_block_idx(struct inode *inode, int check) 228 { 229 int size; 230 231 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) 232 / sizeof(struct ext4_extent_idx); 233 if (!check) { 234 #ifdef AGGRESSIVE_TEST 235 if (size > 5) 236 size = 5; 237 #endif 238 } 239 return size; 240 } 241 242 static inline int ext4_ext_space_root(struct inode *inode, int check) 243 { 244 int size; 245 246 size = sizeof(EXT4_I(inode)->i_data); 247 size -= sizeof(struct ext4_extent_header); 248 size /= sizeof(struct ext4_extent); 249 if (!check) { 250 #ifdef AGGRESSIVE_TEST 251 if (size > 3) 252 size = 3; 253 #endif 254 } 255 return size; 256 } 257 258 static inline int ext4_ext_space_root_idx(struct inode *inode, int check) 259 { 260 int size; 261 262 size = sizeof(EXT4_I(inode)->i_data); 263 size -= sizeof(struct ext4_extent_header); 264 size /= sizeof(struct ext4_extent_idx); 265 if (!check) { 266 #ifdef AGGRESSIVE_TEST 267 if (size > 4) 268 size = 4; 269 #endif 270 } 271 return size; 272 } 273 274 /* 275 * Calculate the number of metadata blocks needed 276 * to allocate @blocks 277 * Worse case is one block per extent 278 */ 279 int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) 280 { 281 struct ext4_inode_info *ei = EXT4_I(inode); 282 int idxs, num = 0; 283 284 idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) 285 / sizeof(struct ext4_extent_idx)); 286 287 /* 288 * If the new delayed allocation block is contiguous with the 289 * previous da block, it can share index blocks with the 290 * previous block, so we only need to allocate a new index 291 * block every idxs leaf blocks. At ldxs**2 blocks, we need 292 * an additional index block, and at ldxs**3 blocks, yet 293 * another index blocks. 294 */ 295 if (ei->i_da_metadata_calc_len && 296 ei->i_da_metadata_calc_last_lblock+1 == lblock) { 297 if ((ei->i_da_metadata_calc_len % idxs) == 0) 298 num++; 299 if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0) 300 num++; 301 if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) { 302 num++; 303 ei->i_da_metadata_calc_len = 0; 304 } else 305 ei->i_da_metadata_calc_len++; 306 ei->i_da_metadata_calc_last_lblock++; 307 return num; 308 } 309 310 /* 311 * In the worst case we need a new set of index blocks at 312 * every level of the inode's extent tree. 313 */ 314 ei->i_da_metadata_calc_len = 1; 315 ei->i_da_metadata_calc_last_lblock = lblock; 316 return ext_depth(inode) + 1; 317 } 318 319 static int 320 ext4_ext_max_entries(struct inode *inode, int depth) 321 { 322 int max; 323 324 if (depth == ext_depth(inode)) { 325 if (depth == 0) 326 max = ext4_ext_space_root(inode, 1); 327 else 328 max = ext4_ext_space_root_idx(inode, 1); 329 } else { 330 if (depth == 0) 331 max = ext4_ext_space_block(inode, 1); 332 else 333 max = ext4_ext_space_block_idx(inode, 1); 334 } 335 336 return max; 337 } 338 339 static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) 340 { 341 ext4_fsblk_t block = ext4_ext_pblock(ext); 342 int len = ext4_ext_get_actual_len(ext); 343 344 return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); 345 } 346 347 static int ext4_valid_extent_idx(struct inode *inode, 348 struct ext4_extent_idx *ext_idx) 349 { 350 ext4_fsblk_t block = ext4_idx_pblock(ext_idx); 351 352 return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1); 353 } 354 355 static int ext4_valid_extent_entries(struct inode *inode, 356 struct ext4_extent_header *eh, 357 int depth) 358 { 359 struct ext4_extent *ext; 360 struct ext4_extent_idx *ext_idx; 361 unsigned short entries; 362 if (eh->eh_entries == 0) 363 return 1; 364 365 entries = le16_to_cpu(eh->eh_entries); 366 367 if (depth == 0) { 368 /* leaf entries */ 369 ext = EXT_FIRST_EXTENT(eh); 370 while (entries) { 371 if (!ext4_valid_extent(inode, ext)) 372 return 0; 373 ext++; 374 entries--; 375 } 376 } else { 377 ext_idx = EXT_FIRST_INDEX(eh); 378 while (entries) { 379 if (!ext4_valid_extent_idx(inode, ext_idx)) 380 return 0; 381 ext_idx++; 382 entries--; 383 } 384 } 385 return 1; 386 } 387 388 static int __ext4_ext_check(const char *function, unsigned int line, 389 struct inode *inode, struct ext4_extent_header *eh, 390 int depth) 391 { 392 const char *error_msg; 393 int max = 0; 394 395 if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) { 396 error_msg = "invalid magic"; 397 goto corrupted; 398 } 399 if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) { 400 error_msg = "unexpected eh_depth"; 401 goto corrupted; 402 } 403 if (unlikely(eh->eh_max == 0)) { 404 error_msg = "invalid eh_max"; 405 goto corrupted; 406 } 407 max = ext4_ext_max_entries(inode, depth); 408 if (unlikely(le16_to_cpu(eh->eh_max) > max)) { 409 error_msg = "too large eh_max"; 410 goto corrupted; 411 } 412 if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) { 413 error_msg = "invalid eh_entries"; 414 goto corrupted; 415 } 416 if (!ext4_valid_extent_entries(inode, eh, depth)) { 417 error_msg = "invalid extent entries"; 418 goto corrupted; 419 } 420 return 0; 421 422 corrupted: 423 ext4_error_inode(inode, function, line, 0, 424 "bad header/extent: %s - magic %x, " 425 "entries %u, max %u(%u), depth %u(%u)", 426 error_msg, le16_to_cpu(eh->eh_magic), 427 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), 428 max, le16_to_cpu(eh->eh_depth), depth); 429 430 return -EIO; 431 } 432 433 #define ext4_ext_check(inode, eh, depth) \ 434 __ext4_ext_check(__func__, __LINE__, inode, eh, depth) 435 436 int ext4_ext_check_inode(struct inode *inode) 437 { 438 return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode)); 439 } 440 441 #ifdef EXT_DEBUG 442 static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) 443 { 444 int k, l = path->p_depth; 445 446 ext_debug("path:"); 447 for (k = 0; k <= l; k++, path++) { 448 if (path->p_idx) { 449 ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block), 450 ext4_idx_pblock(path->p_idx)); 451 } else if (path->p_ext) { 452 ext_debug(" %d:[%d]%d:%llu ", 453 le32_to_cpu(path->p_ext->ee_block), 454 ext4_ext_is_uninitialized(path->p_ext), 455 ext4_ext_get_actual_len(path->p_ext), 456 ext4_ext_pblock(path->p_ext)); 457 } else 458 ext_debug(" []"); 459 } 460 ext_debug("\n"); 461 } 462 463 static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) 464 { 465 int depth = ext_depth(inode); 466 struct ext4_extent_header *eh; 467 struct ext4_extent *ex; 468 int i; 469 470 if (!path) 471 return; 472 473 eh = path[depth].p_hdr; 474 ex = EXT_FIRST_EXTENT(eh); 475 476 ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino); 477 478 for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { 479 ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), 480 ext4_ext_is_uninitialized(ex), 481 ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); 482 } 483 ext_debug("\n"); 484 } 485 486 static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, 487 ext4_fsblk_t newblock, int level) 488 { 489 int depth = ext_depth(inode); 490 struct ext4_extent *ex; 491 492 if (depth != level) { 493 struct ext4_extent_idx *idx; 494 idx = path[level].p_idx; 495 while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) { 496 ext_debug("%d: move %d:%llu in new index %llu\n", level, 497 le32_to_cpu(idx->ei_block), 498 ext4_idx_pblock(idx), 499 newblock); 500 idx++; 501 } 502 503 return; 504 } 505 506 ex = path[depth].p_ext; 507 while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) { 508 ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n", 509 le32_to_cpu(ex->ee_block), 510 ext4_ext_pblock(ex), 511 ext4_ext_is_uninitialized(ex), 512 ext4_ext_get_actual_len(ex), 513 newblock); 514 ex++; 515 } 516 } 517 518 #else 519 #define ext4_ext_show_path(inode, path) 520 #define ext4_ext_show_leaf(inode, path) 521 #define ext4_ext_show_move(inode, path, newblock, level) 522 #endif 523 524 void ext4_ext_drop_refs(struct ext4_ext_path *path) 525 { 526 int depth = path->p_depth; 527 int i; 528 529 for (i = 0; i <= depth; i++, path++) 530 if (path->p_bh) { 531 brelse(path->p_bh); 532 path->p_bh = NULL; 533 } 534 } 535 536 /* 537 * ext4_ext_binsearch_idx: 538 * binary search for the closest index of the given block 539 * the header must be checked before calling this 540 */ 541 static void 542 ext4_ext_binsearch_idx(struct inode *inode, 543 struct ext4_ext_path *path, ext4_lblk_t block) 544 { 545 struct ext4_extent_header *eh = path->p_hdr; 546 struct ext4_extent_idx *r, *l, *m; 547 548 549 ext_debug("binsearch for %u(idx): ", block); 550 551 l = EXT_FIRST_INDEX(eh) + 1; 552 r = EXT_LAST_INDEX(eh); 553 while (l <= r) { 554 m = l + (r - l) / 2; 555 if (block < le32_to_cpu(m->ei_block)) 556 r = m - 1; 557 else 558 l = m + 1; 559 ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block), 560 m, le32_to_cpu(m->ei_block), 561 r, le32_to_cpu(r->ei_block)); 562 } 563 564 path->p_idx = l - 1; 565 ext_debug(" -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block), 566 ext4_idx_pblock(path->p_idx)); 567 568 #ifdef CHECK_BINSEARCH 569 { 570 struct ext4_extent_idx *chix, *ix; 571 int k; 572 573 chix = ix = EXT_FIRST_INDEX(eh); 574 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) { 575 if (k != 0 && 576 le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) { 577 printk(KERN_DEBUG "k=%d, ix=0x%p, " 578 "first=0x%p\n", k, 579 ix, EXT_FIRST_INDEX(eh)); 580 printk(KERN_DEBUG "%u <= %u\n", 581 le32_to_cpu(ix->ei_block), 582 le32_to_cpu(ix[-1].ei_block)); 583 } 584 BUG_ON(k && le32_to_cpu(ix->ei_block) 585 <= le32_to_cpu(ix[-1].ei_block)); 586 if (block < le32_to_cpu(ix->ei_block)) 587 break; 588 chix = ix; 589 } 590 BUG_ON(chix != path->p_idx); 591 } 592 #endif 593 594 } 595 596 /* 597 * ext4_ext_binsearch: 598 * binary search for closest extent of the given block 599 * the header must be checked before calling this 600 */ 601 static void 602 ext4_ext_binsearch(struct inode *inode, 603 struct ext4_ext_path *path, ext4_lblk_t block) 604 { 605 struct ext4_extent_header *eh = path->p_hdr; 606 struct ext4_extent *r, *l, *m; 607 608 if (eh->eh_entries == 0) { 609 /* 610 * this leaf is empty: 611 * we get such a leaf in split/add case 612 */ 613 return; 614 } 615 616 ext_debug("binsearch for %u: ", block); 617 618 l = EXT_FIRST_EXTENT(eh) + 1; 619 r = EXT_LAST_EXTENT(eh); 620 621 while (l <= r) { 622 m = l + (r - l) / 2; 623 if (block < le32_to_cpu(m->ee_block)) 624 r = m - 1; 625 else 626 l = m + 1; 627 ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block), 628 m, le32_to_cpu(m->ee_block), 629 r, le32_to_cpu(r->ee_block)); 630 } 631 632 path->p_ext = l - 1; 633 ext_debug(" -> %d:%llu:[%d]%d ", 634 le32_to_cpu(path->p_ext->ee_block), 635 ext4_ext_pblock(path->p_ext), 636 ext4_ext_is_uninitialized(path->p_ext), 637 ext4_ext_get_actual_len(path->p_ext)); 638 639 #ifdef CHECK_BINSEARCH 640 { 641 struct ext4_extent *chex, *ex; 642 int k; 643 644 chex = ex = EXT_FIRST_EXTENT(eh); 645 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) { 646 BUG_ON(k && le32_to_cpu(ex->ee_block) 647 <= le32_to_cpu(ex[-1].ee_block)); 648 if (block < le32_to_cpu(ex->ee_block)) 649 break; 650 chex = ex; 651 } 652 BUG_ON(chex != path->p_ext); 653 } 654 #endif 655 656 } 657 658 int ext4_ext_tree_init(handle_t *handle, struct inode *inode) 659 { 660 struct ext4_extent_header *eh; 661 662 eh = ext_inode_hdr(inode); 663 eh->eh_depth = 0; 664 eh->eh_entries = 0; 665 eh->eh_magic = EXT4_EXT_MAGIC; 666 eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); 667 ext4_mark_inode_dirty(handle, inode); 668 ext4_ext_invalidate_cache(inode); 669 return 0; 670 } 671 672 struct ext4_ext_path * 673 ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, 674 struct ext4_ext_path *path) 675 { 676 struct ext4_extent_header *eh; 677 struct buffer_head *bh; 678 short int depth, i, ppos = 0, alloc = 0; 679 680 eh = ext_inode_hdr(inode); 681 depth = ext_depth(inode); 682 683 /* account possible depth increase */ 684 if (!path) { 685 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2), 686 GFP_NOFS); 687 if (!path) 688 return ERR_PTR(-ENOMEM); 689 alloc = 1; 690 } 691 path[0].p_hdr = eh; 692 path[0].p_bh = NULL; 693 694 i = depth; 695 /* walk through the tree */ 696 while (i) { 697 int need_to_validate = 0; 698 699 ext_debug("depth %d: num %d, max %d\n", 700 ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); 701 702 ext4_ext_binsearch_idx(inode, path + ppos, block); 703 path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx); 704 path[ppos].p_depth = i; 705 path[ppos].p_ext = NULL; 706 707 bh = sb_getblk(inode->i_sb, path[ppos].p_block); 708 if (unlikely(!bh)) 709 goto err; 710 if (!bh_uptodate_or_lock(bh)) { 711 trace_ext4_ext_load_extent(inode, block, 712 path[ppos].p_block); 713 if (bh_submit_read(bh) < 0) { 714 put_bh(bh); 715 goto err; 716 } 717 /* validate the extent entries */ 718 need_to_validate = 1; 719 } 720 eh = ext_block_hdr(bh); 721 ppos++; 722 if (unlikely(ppos > depth)) { 723 put_bh(bh); 724 EXT4_ERROR_INODE(inode, 725 "ppos %d > depth %d", ppos, depth); 726 goto err; 727 } 728 path[ppos].p_bh = bh; 729 path[ppos].p_hdr = eh; 730 i--; 731 732 if (need_to_validate && ext4_ext_check(inode, eh, i)) 733 goto err; 734 } 735 736 path[ppos].p_depth = i; 737 path[ppos].p_ext = NULL; 738 path[ppos].p_idx = NULL; 739 740 /* find extent */ 741 ext4_ext_binsearch(inode, path + ppos, block); 742 /* if not an empty leaf */ 743 if (path[ppos].p_ext) 744 path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext); 745 746 ext4_ext_show_path(inode, path); 747 748 return path; 749 750 err: 751 ext4_ext_drop_refs(path); 752 if (alloc) 753 kfree(path); 754 return ERR_PTR(-EIO); 755 } 756 757 /* 758 * ext4_ext_insert_index: 759 * insert new index [@logical;@ptr] into the block at @curp; 760 * check where to insert: before @curp or after @curp 761 */ 762 static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, 763 struct ext4_ext_path *curp, 764 int logical, ext4_fsblk_t ptr) 765 { 766 struct ext4_extent_idx *ix; 767 int len, err; 768 769 err = ext4_ext_get_access(handle, inode, curp); 770 if (err) 771 return err; 772 773 if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) { 774 EXT4_ERROR_INODE(inode, 775 "logical %d == ei_block %d!", 776 logical, le32_to_cpu(curp->p_idx->ei_block)); 777 return -EIO; 778 } 779 len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; 780 if (logical > le32_to_cpu(curp->p_idx->ei_block)) { 781 /* insert after */ 782 if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { 783 len = (len - 1) * sizeof(struct ext4_extent_idx); 784 len = len < 0 ? 0 : len; 785 ext_debug("insert new index %d after: %llu. " 786 "move %d from 0x%p to 0x%p\n", 787 logical, ptr, len, 788 (curp->p_idx + 1), (curp->p_idx + 2)); 789 memmove(curp->p_idx + 2, curp->p_idx + 1, len); 790 } 791 ix = curp->p_idx + 1; 792 } else { 793 /* insert before */ 794 len = len * sizeof(struct ext4_extent_idx); 795 len = len < 0 ? 0 : len; 796 ext_debug("insert new index %d before: %llu. " 797 "move %d from 0x%p to 0x%p\n", 798 logical, ptr, len, 799 curp->p_idx, (curp->p_idx + 1)); 800 memmove(curp->p_idx + 1, curp->p_idx, len); 801 ix = curp->p_idx; 802 } 803 804 ix->ei_block = cpu_to_le32(logical); 805 ext4_idx_store_pblock(ix, ptr); 806 le16_add_cpu(&curp->p_hdr->eh_entries, 1); 807 808 if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) 809 > le16_to_cpu(curp->p_hdr->eh_max))) { 810 EXT4_ERROR_INODE(inode, 811 "logical %d == ei_block %d!", 812 logical, le32_to_cpu(curp->p_idx->ei_block)); 813 return -EIO; 814 } 815 if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) { 816 EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!"); 817 return -EIO; 818 } 819 820 err = ext4_ext_dirty(handle, inode, curp); 821 ext4_std_error(inode->i_sb, err); 822 823 return err; 824 } 825 826 /* 827 * ext4_ext_split: 828 * inserts new subtree into the path, using free index entry 829 * at depth @at: 830 * - allocates all needed blocks (new leaf and all intermediate index blocks) 831 * - makes decision where to split 832 * - moves remaining extents and index entries (right to the split point) 833 * into the newly allocated blocks 834 * - initializes subtree 835 */ 836 static int ext4_ext_split(handle_t *handle, struct inode *inode, 837 unsigned int flags, 838 struct ext4_ext_path *path, 839 struct ext4_extent *newext, int at) 840 { 841 struct buffer_head *bh = NULL; 842 int depth = ext_depth(inode); 843 struct ext4_extent_header *neh; 844 struct ext4_extent_idx *fidx; 845 int i = at, k, m, a; 846 ext4_fsblk_t newblock, oldblock; 847 __le32 border; 848 ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */ 849 int err = 0; 850 851 /* make decision: where to split? */ 852 /* FIXME: now decision is simplest: at current extent */ 853 854 /* if current leaf will be split, then we should use 855 * border from split point */ 856 if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) { 857 EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!"); 858 return -EIO; 859 } 860 if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { 861 border = path[depth].p_ext[1].ee_block; 862 ext_debug("leaf will be split." 863 " next leaf starts at %d\n", 864 le32_to_cpu(border)); 865 } else { 866 border = newext->ee_block; 867 ext_debug("leaf will be added." 868 " next leaf starts at %d\n", 869 le32_to_cpu(border)); 870 } 871 872 /* 873 * If error occurs, then we break processing 874 * and mark filesystem read-only. index won't 875 * be inserted and tree will be in consistent 876 * state. Next mount will repair buffers too. 877 */ 878 879 /* 880 * Get array to track all allocated blocks. 881 * We need this to handle errors and free blocks 882 * upon them. 883 */ 884 ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS); 885 if (!ablocks) 886 return -ENOMEM; 887 888 /* allocate all needed blocks */ 889 ext_debug("allocate %d blocks for indexes/leaf\n", depth - at); 890 for (a = 0; a < depth - at; a++) { 891 newblock = ext4_ext_new_meta_block(handle, inode, path, 892 newext, &err, flags); 893 if (newblock == 0) 894 goto cleanup; 895 ablocks[a] = newblock; 896 } 897 898 /* initialize new leaf */ 899 newblock = ablocks[--a]; 900 if (unlikely(newblock == 0)) { 901 EXT4_ERROR_INODE(inode, "newblock == 0!"); 902 err = -EIO; 903 goto cleanup; 904 } 905 bh = sb_getblk(inode->i_sb, newblock); 906 if (!bh) { 907 err = -EIO; 908 goto cleanup; 909 } 910 lock_buffer(bh); 911 912 err = ext4_journal_get_create_access(handle, bh); 913 if (err) 914 goto cleanup; 915 916 neh = ext_block_hdr(bh); 917 neh->eh_entries = 0; 918 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); 919 neh->eh_magic = EXT4_EXT_MAGIC; 920 neh->eh_depth = 0; 921 922 /* move remainder of path[depth] to the new leaf */ 923 if (unlikely(path[depth].p_hdr->eh_entries != 924 path[depth].p_hdr->eh_max)) { 925 EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!", 926 path[depth].p_hdr->eh_entries, 927 path[depth].p_hdr->eh_max); 928 err = -EIO; 929 goto cleanup; 930 } 931 /* start copy from next extent */ 932 m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++; 933 ext4_ext_show_move(inode, path, newblock, depth); 934 if (m) { 935 struct ext4_extent *ex; 936 ex = EXT_FIRST_EXTENT(neh); 937 memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m); 938 le16_add_cpu(&neh->eh_entries, m); 939 } 940 941 set_buffer_uptodate(bh); 942 unlock_buffer(bh); 943 944 err = ext4_handle_dirty_metadata(handle, inode, bh); 945 if (err) 946 goto cleanup; 947 brelse(bh); 948 bh = NULL; 949 950 /* correct old leaf */ 951 if (m) { 952 err = ext4_ext_get_access(handle, inode, path + depth); 953 if (err) 954 goto cleanup; 955 le16_add_cpu(&path[depth].p_hdr->eh_entries, -m); 956 err = ext4_ext_dirty(handle, inode, path + depth); 957 if (err) 958 goto cleanup; 959 960 } 961 962 /* create intermediate indexes */ 963 k = depth - at - 1; 964 if (unlikely(k < 0)) { 965 EXT4_ERROR_INODE(inode, "k %d < 0!", k); 966 err = -EIO; 967 goto cleanup; 968 } 969 if (k) 970 ext_debug("create %d intermediate indices\n", k); 971 /* insert new index into current index block */ 972 /* current depth stored in i var */ 973 i = depth - 1; 974 while (k--) { 975 oldblock = newblock; 976 newblock = ablocks[--a]; 977 bh = sb_getblk(inode->i_sb, newblock); 978 if (!bh) { 979 err = -EIO; 980 goto cleanup; 981 } 982 lock_buffer(bh); 983 984 err = ext4_journal_get_create_access(handle, bh); 985 if (err) 986 goto cleanup; 987 988 neh = ext_block_hdr(bh); 989 neh->eh_entries = cpu_to_le16(1); 990 neh->eh_magic = EXT4_EXT_MAGIC; 991 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); 992 neh->eh_depth = cpu_to_le16(depth - i); 993 fidx = EXT_FIRST_INDEX(neh); 994 fidx->ei_block = border; 995 ext4_idx_store_pblock(fidx, oldblock); 996 997 ext_debug("int.index at %d (block %llu): %u -> %llu\n", 998 i, newblock, le32_to_cpu(border), oldblock); 999 1000 /* move remainder of path[i] to the new index block */ 1001 if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) != 1002 EXT_LAST_INDEX(path[i].p_hdr))) { 1003 EXT4_ERROR_INODE(inode, 1004 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!", 1005 le32_to_cpu(path[i].p_ext->ee_block)); 1006 err = -EIO; 1007 goto cleanup; 1008 } 1009 /* start copy indexes */ 1010 m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++; 1011 ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, 1012 EXT_MAX_INDEX(path[i].p_hdr)); 1013 ext4_ext_show_move(inode, path, newblock, i); 1014 if (m) { 1015 memmove(++fidx, path[i].p_idx, 1016 sizeof(struct ext4_extent_idx) * m); 1017 le16_add_cpu(&neh->eh_entries, m); 1018 } 1019 set_buffer_uptodate(bh); 1020 unlock_buffer(bh); 1021 1022 err = ext4_handle_dirty_metadata(handle, inode, bh); 1023 if (err) 1024 goto cleanup; 1025 brelse(bh); 1026 bh = NULL; 1027 1028 /* correct old index */ 1029 if (m) { 1030 err = ext4_ext_get_access(handle, inode, path + i); 1031 if (err) 1032 goto cleanup; 1033 le16_add_cpu(&path[i].p_hdr->eh_entries, -m); 1034 err = ext4_ext_dirty(handle, inode, path + i); 1035 if (err) 1036 goto cleanup; 1037 } 1038 1039 i--; 1040 } 1041 1042 /* insert new index */ 1043 err = ext4_ext_insert_index(handle, inode, path + at, 1044 le32_to_cpu(border), newblock); 1045 1046 cleanup: 1047 if (bh) { 1048 if (buffer_locked(bh)) 1049 unlock_buffer(bh); 1050 brelse(bh); 1051 } 1052 1053 if (err) { 1054 /* free all allocated blocks in error case */ 1055 for (i = 0; i < depth; i++) { 1056 if (!ablocks[i]) 1057 continue; 1058 ext4_free_blocks(handle, inode, NULL, ablocks[i], 1, 1059 EXT4_FREE_BLOCKS_METADATA); 1060 } 1061 } 1062 kfree(ablocks); 1063 1064 return err; 1065 } 1066 1067 /* 1068 * ext4_ext_grow_indepth: 1069 * implements tree growing procedure: 1070 * - allocates new block 1071 * - moves top-level data (index block or leaf) into the new block 1072 * - initializes new top-level, creating index that points to the 1073 * just created block 1074 */ 1075 static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, 1076 unsigned int flags, 1077 struct ext4_ext_path *path, 1078 struct ext4_extent *newext) 1079 { 1080 struct ext4_ext_path *curp = path; 1081 struct ext4_extent_header *neh; 1082 struct buffer_head *bh; 1083 ext4_fsblk_t newblock; 1084 int err = 0; 1085 1086 newblock = ext4_ext_new_meta_block(handle, inode, path, 1087 newext, &err, flags); 1088 if (newblock == 0) 1089 return err; 1090 1091 bh = sb_getblk(inode->i_sb, newblock); 1092 if (!bh) { 1093 err = -EIO; 1094 ext4_std_error(inode->i_sb, err); 1095 return err; 1096 } 1097 lock_buffer(bh); 1098 1099 err = ext4_journal_get_create_access(handle, bh); 1100 if (err) { 1101 unlock_buffer(bh); 1102 goto out; 1103 } 1104 1105 /* move top-level index/leaf into new block */ 1106 memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data)); 1107 1108 /* set size of new block */ 1109 neh = ext_block_hdr(bh); 1110 /* old root could have indexes or leaves 1111 * so calculate e_max right way */ 1112 if (ext_depth(inode)) 1113 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); 1114 else 1115 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); 1116 neh->eh_magic = EXT4_EXT_MAGIC; 1117 set_buffer_uptodate(bh); 1118 unlock_buffer(bh); 1119 1120 err = ext4_handle_dirty_metadata(handle, inode, bh); 1121 if (err) 1122 goto out; 1123 1124 /* create index in new top-level index: num,max,pointer */ 1125 err = ext4_ext_get_access(handle, inode, curp); 1126 if (err) 1127 goto out; 1128 1129 curp->p_hdr->eh_magic = EXT4_EXT_MAGIC; 1130 curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0)); 1131 curp->p_hdr->eh_entries = cpu_to_le16(1); 1132 curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); 1133 1134 if (path[0].p_hdr->eh_depth) 1135 curp->p_idx->ei_block = 1136 EXT_FIRST_INDEX(path[0].p_hdr)->ei_block; 1137 else 1138 curp->p_idx->ei_block = 1139 EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block; 1140 ext4_idx_store_pblock(curp->p_idx, newblock); 1141 1142 neh = ext_inode_hdr(inode); 1143 ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n", 1144 le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), 1145 le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), 1146 ext4_idx_pblock(EXT_FIRST_INDEX(neh))); 1147 1148 neh->eh_depth = cpu_to_le16(path->p_depth + 1); 1149 err = ext4_ext_dirty(handle, inode, curp); 1150 out: 1151 brelse(bh); 1152 1153 return err; 1154 } 1155 1156 /* 1157 * ext4_ext_create_new_leaf: 1158 * finds empty index and adds new leaf. 1159 * if no free index is found, then it requests in-depth growing. 1160 */ 1161 static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, 1162 unsigned int flags, 1163 struct ext4_ext_path *path, 1164 struct ext4_extent *newext) 1165 { 1166 struct ext4_ext_path *curp; 1167 int depth, i, err = 0; 1168 1169 repeat: 1170 i = depth = ext_depth(inode); 1171 1172 /* walk up to the tree and look for free index entry */ 1173 curp = path + depth; 1174 while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { 1175 i--; 1176 curp--; 1177 } 1178 1179 /* we use already allocated block for index block, 1180 * so subsequent data blocks should be contiguous */ 1181 if (EXT_HAS_FREE_INDEX(curp)) { 1182 /* if we found index with free entry, then use that 1183 * entry: create all needed subtree and add new leaf */ 1184 err = ext4_ext_split(handle, inode, flags, path, newext, i); 1185 if (err) 1186 goto out; 1187 1188 /* refill path */ 1189 ext4_ext_drop_refs(path); 1190 path = ext4_ext_find_extent(inode, 1191 (ext4_lblk_t)le32_to_cpu(newext->ee_block), 1192 path); 1193 if (IS_ERR(path)) 1194 err = PTR_ERR(path); 1195 } else { 1196 /* tree is full, time to grow in depth */ 1197 err = ext4_ext_grow_indepth(handle, inode, flags, 1198 path, newext); 1199 if (err) 1200 goto out; 1201 1202 /* refill path */ 1203 ext4_ext_drop_refs(path); 1204 path = ext4_ext_find_extent(inode, 1205 (ext4_lblk_t)le32_to_cpu(newext->ee_block), 1206 path); 1207 if (IS_ERR(path)) { 1208 err = PTR_ERR(path); 1209 goto out; 1210 } 1211 1212 /* 1213 * only first (depth 0 -> 1) produces free space; 1214 * in all other cases we have to split the grown tree 1215 */ 1216 depth = ext_depth(inode); 1217 if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { 1218 /* now we need to split */ 1219 goto repeat; 1220 } 1221 } 1222 1223 out: 1224 return err; 1225 } 1226 1227 /* 1228 * search the closest allocated block to the left for *logical 1229 * and returns it at @logical + it's physical address at @phys 1230 * if *logical is the smallest allocated block, the function 1231 * returns 0 at @phys 1232 * return value contains 0 (success) or error code 1233 */ 1234 static int ext4_ext_search_left(struct inode *inode, 1235 struct ext4_ext_path *path, 1236 ext4_lblk_t *logical, ext4_fsblk_t *phys) 1237 { 1238 struct ext4_extent_idx *ix; 1239 struct ext4_extent *ex; 1240 int depth, ee_len; 1241 1242 if (unlikely(path == NULL)) { 1243 EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); 1244 return -EIO; 1245 } 1246 depth = path->p_depth; 1247 *phys = 0; 1248 1249 if (depth == 0 && path->p_ext == NULL) 1250 return 0; 1251 1252 /* usually extent in the path covers blocks smaller 1253 * then *logical, but it can be that extent is the 1254 * first one in the file */ 1255 1256 ex = path[depth].p_ext; 1257 ee_len = ext4_ext_get_actual_len(ex); 1258 if (*logical < le32_to_cpu(ex->ee_block)) { 1259 if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { 1260 EXT4_ERROR_INODE(inode, 1261 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!", 1262 *logical, le32_to_cpu(ex->ee_block)); 1263 return -EIO; 1264 } 1265 while (--depth >= 0) { 1266 ix = path[depth].p_idx; 1267 if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { 1268 EXT4_ERROR_INODE(inode, 1269 "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!", 1270 ix != NULL ? ix->ei_block : 0, 1271 EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ? 1272 EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block : 0, 1273 depth); 1274 return -EIO; 1275 } 1276 } 1277 return 0; 1278 } 1279 1280 if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { 1281 EXT4_ERROR_INODE(inode, 1282 "logical %d < ee_block %d + ee_len %d!", 1283 *logical, le32_to_cpu(ex->ee_block), ee_len); 1284 return -EIO; 1285 } 1286 1287 *logical = le32_to_cpu(ex->ee_block) + ee_len - 1; 1288 *phys = ext4_ext_pblock(ex) + ee_len - 1; 1289 return 0; 1290 } 1291 1292 /* 1293 * search the closest allocated block to the right for *logical 1294 * and returns it at @logical + it's physical address at @phys 1295 * if *logical is the smallest allocated block, the function 1296 * returns 0 at @phys 1297 * return value contains 0 (success) or error code 1298 */ 1299 static int ext4_ext_search_right(struct inode *inode, 1300 struct ext4_ext_path *path, 1301 ext4_lblk_t *logical, ext4_fsblk_t *phys) 1302 { 1303 struct buffer_head *bh = NULL; 1304 struct ext4_extent_header *eh; 1305 struct ext4_extent_idx *ix; 1306 struct ext4_extent *ex; 1307 ext4_fsblk_t block; 1308 int depth; /* Note, NOT eh_depth; depth from top of tree */ 1309 int ee_len; 1310 1311 if (unlikely(path == NULL)) { 1312 EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); 1313 return -EIO; 1314 } 1315 depth = path->p_depth; 1316 *phys = 0; 1317 1318 if (depth == 0 && path->p_ext == NULL) 1319 return 0; 1320 1321 /* usually extent in the path covers blocks smaller 1322 * then *logical, but it can be that extent is the 1323 * first one in the file */ 1324 1325 ex = path[depth].p_ext; 1326 ee_len = ext4_ext_get_actual_len(ex); 1327 if (*logical < le32_to_cpu(ex->ee_block)) { 1328 if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { 1329 EXT4_ERROR_INODE(inode, 1330 "first_extent(path[%d].p_hdr) != ex", 1331 depth); 1332 return -EIO; 1333 } 1334 while (--depth >= 0) { 1335 ix = path[depth].p_idx; 1336 if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { 1337 EXT4_ERROR_INODE(inode, 1338 "ix != EXT_FIRST_INDEX *logical %d!", 1339 *logical); 1340 return -EIO; 1341 } 1342 } 1343 *logical = le32_to_cpu(ex->ee_block); 1344 *phys = ext4_ext_pblock(ex); 1345 return 0; 1346 } 1347 1348 if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { 1349 EXT4_ERROR_INODE(inode, 1350 "logical %d < ee_block %d + ee_len %d!", 1351 *logical, le32_to_cpu(ex->ee_block), ee_len); 1352 return -EIO; 1353 } 1354 1355 if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { 1356 /* next allocated block in this leaf */ 1357 ex++; 1358 *logical = le32_to_cpu(ex->ee_block); 1359 *phys = ext4_ext_pblock(ex); 1360 return 0; 1361 } 1362 1363 /* go up and search for index to the right */ 1364 while (--depth >= 0) { 1365 ix = path[depth].p_idx; 1366 if (ix != EXT_LAST_INDEX(path[depth].p_hdr)) 1367 goto got_index; 1368 } 1369 1370 /* we've gone up to the root and found no index to the right */ 1371 return 0; 1372 1373 got_index: 1374 /* we've found index to the right, let's 1375 * follow it and find the closest allocated 1376 * block to the right */ 1377 ix++; 1378 block = ext4_idx_pblock(ix); 1379 while (++depth < path->p_depth) { 1380 bh = sb_bread(inode->i_sb, block); 1381 if (bh == NULL) 1382 return -EIO; 1383 eh = ext_block_hdr(bh); 1384 /* subtract from p_depth to get proper eh_depth */ 1385 if (ext4_ext_check(inode, eh, path->p_depth - depth)) { 1386 put_bh(bh); 1387 return -EIO; 1388 } 1389 ix = EXT_FIRST_INDEX(eh); 1390 block = ext4_idx_pblock(ix); 1391 put_bh(bh); 1392 } 1393 1394 bh = sb_bread(inode->i_sb, block); 1395 if (bh == NULL) 1396 return -EIO; 1397 eh = ext_block_hdr(bh); 1398 if (ext4_ext_check(inode, eh, path->p_depth - depth)) { 1399 put_bh(bh); 1400 return -EIO; 1401 } 1402 ex = EXT_FIRST_EXTENT(eh); 1403 *logical = le32_to_cpu(ex->ee_block); 1404 *phys = ext4_ext_pblock(ex); 1405 put_bh(bh); 1406 return 0; 1407 } 1408 1409 /* 1410 * ext4_ext_next_allocated_block: 1411 * returns allocated block in subsequent extent or EXT_MAX_BLOCKS. 1412 * NOTE: it considers block number from index entry as 1413 * allocated block. Thus, index entries have to be consistent 1414 * with leaves. 1415 */ 1416 static ext4_lblk_t 1417 ext4_ext_next_allocated_block(struct ext4_ext_path *path) 1418 { 1419 int depth; 1420 1421 BUG_ON(path == NULL); 1422 depth = path->p_depth; 1423 1424 if (depth == 0 && path->p_ext == NULL) 1425 return EXT_MAX_BLOCKS; 1426 1427 while (depth >= 0) { 1428 if (depth == path->p_depth) { 1429 /* leaf */ 1430 if (path[depth].p_ext != 1431 EXT_LAST_EXTENT(path[depth].p_hdr)) 1432 return le32_to_cpu(path[depth].p_ext[1].ee_block); 1433 } else { 1434 /* index */ 1435 if (path[depth].p_idx != 1436 EXT_LAST_INDEX(path[depth].p_hdr)) 1437 return le32_to_cpu(path[depth].p_idx[1].ei_block); 1438 } 1439 depth--; 1440 } 1441 1442 return EXT_MAX_BLOCKS; 1443 } 1444 1445 /* 1446 * ext4_ext_next_leaf_block: 1447 * returns first allocated block from next leaf or EXT_MAX_BLOCKS 1448 */ 1449 static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode, 1450 struct ext4_ext_path *path) 1451 { 1452 int depth; 1453 1454 BUG_ON(path == NULL); 1455 depth = path->p_depth; 1456 1457 /* zero-tree has no leaf blocks at all */ 1458 if (depth == 0) 1459 return EXT_MAX_BLOCKS; 1460 1461 /* go to index block */ 1462 depth--; 1463 1464 while (depth >= 0) { 1465 if (path[depth].p_idx != 1466 EXT_LAST_INDEX(path[depth].p_hdr)) 1467 return (ext4_lblk_t) 1468 le32_to_cpu(path[depth].p_idx[1].ei_block); 1469 depth--; 1470 } 1471 1472 return EXT_MAX_BLOCKS; 1473 } 1474 1475 /* 1476 * ext4_ext_correct_indexes: 1477 * if leaf gets modified and modified extent is first in the leaf, 1478 * then we have to correct all indexes above. 1479 * TODO: do we need to correct tree in all cases? 1480 */ 1481 static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, 1482 struct ext4_ext_path *path) 1483 { 1484 struct ext4_extent_header *eh; 1485 int depth = ext_depth(inode); 1486 struct ext4_extent *ex; 1487 __le32 border; 1488 int k, err = 0; 1489 1490 eh = path[depth].p_hdr; 1491 ex = path[depth].p_ext; 1492 1493 if (unlikely(ex == NULL || eh == NULL)) { 1494 EXT4_ERROR_INODE(inode, 1495 "ex %p == NULL or eh %p == NULL", ex, eh); 1496 return -EIO; 1497 } 1498 1499 if (depth == 0) { 1500 /* there is no tree at all */ 1501 return 0; 1502 } 1503 1504 if (ex != EXT_FIRST_EXTENT(eh)) { 1505 /* we correct tree if first leaf got modified only */ 1506 return 0; 1507 } 1508 1509 /* 1510 * TODO: we need correction if border is smaller than current one 1511 */ 1512 k = depth - 1; 1513 border = path[depth].p_ext->ee_block; 1514 err = ext4_ext_get_access(handle, inode, path + k); 1515 if (err) 1516 return err; 1517 path[k].p_idx->ei_block = border; 1518 err = ext4_ext_dirty(handle, inode, path + k); 1519 if (err) 1520 return err; 1521 1522 while (k--) { 1523 /* change all left-side indexes */ 1524 if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) 1525 break; 1526 err = ext4_ext_get_access(handle, inode, path + k); 1527 if (err) 1528 break; 1529 path[k].p_idx->ei_block = border; 1530 err = ext4_ext_dirty(handle, inode, path + k); 1531 if (err) 1532 break; 1533 } 1534 1535 return err; 1536 } 1537 1538 int 1539 ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, 1540 struct ext4_extent *ex2) 1541 { 1542 unsigned short ext1_ee_len, ext2_ee_len, max_len; 1543 1544 /* 1545 * Make sure that either both extents are uninitialized, or 1546 * both are _not_. 1547 */ 1548 if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2)) 1549 return 0; 1550 1551 if (ext4_ext_is_uninitialized(ex1)) 1552 max_len = EXT_UNINIT_MAX_LEN; 1553 else 1554 max_len = EXT_INIT_MAX_LEN; 1555 1556 ext1_ee_len = ext4_ext_get_actual_len(ex1); 1557 ext2_ee_len = ext4_ext_get_actual_len(ex2); 1558 1559 if (le32_to_cpu(ex1->ee_block) + ext1_ee_len != 1560 le32_to_cpu(ex2->ee_block)) 1561 return 0; 1562 1563 /* 1564 * To allow future support for preallocated extents to be added 1565 * as an RO_COMPAT feature, refuse to merge to extents if 1566 * this can result in the top bit of ee_len being set. 1567 */ 1568 if (ext1_ee_len + ext2_ee_len > max_len) 1569 return 0; 1570 #ifdef AGGRESSIVE_TEST 1571 if (ext1_ee_len >= 4) 1572 return 0; 1573 #endif 1574 1575 if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2)) 1576 return 1; 1577 return 0; 1578 } 1579 1580 /* 1581 * This function tries to merge the "ex" extent to the next extent in the tree. 1582 * It always tries to merge towards right. If you want to merge towards 1583 * left, pass "ex - 1" as argument instead of "ex". 1584 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns 1585 * 1 if they got merged. 1586 */ 1587 static int ext4_ext_try_to_merge_right(struct inode *inode, 1588 struct ext4_ext_path *path, 1589 struct ext4_extent *ex) 1590 { 1591 struct ext4_extent_header *eh; 1592 unsigned int depth, len; 1593 int merge_done = 0; 1594 int uninitialized = 0; 1595 1596 depth = ext_depth(inode); 1597 BUG_ON(path[depth].p_hdr == NULL); 1598 eh = path[depth].p_hdr; 1599 1600 while (ex < EXT_LAST_EXTENT(eh)) { 1601 if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) 1602 break; 1603 /* merge with next extent! */ 1604 if (ext4_ext_is_uninitialized(ex)) 1605 uninitialized = 1; 1606 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) 1607 + ext4_ext_get_actual_len(ex + 1)); 1608 if (uninitialized) 1609 ext4_ext_mark_uninitialized(ex); 1610 1611 if (ex + 1 < EXT_LAST_EXTENT(eh)) { 1612 len = (EXT_LAST_EXTENT(eh) - ex - 1) 1613 * sizeof(struct ext4_extent); 1614 memmove(ex + 1, ex + 2, len); 1615 } 1616 le16_add_cpu(&eh->eh_entries, -1); 1617 merge_done = 1; 1618 WARN_ON(eh->eh_entries == 0); 1619 if (!eh->eh_entries) 1620 EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!"); 1621 } 1622 1623 return merge_done; 1624 } 1625 1626 /* 1627 * This function tries to merge the @ex extent to neighbours in the tree. 1628 * return 1 if merge left else 0. 1629 */ 1630 static int ext4_ext_try_to_merge(struct inode *inode, 1631 struct ext4_ext_path *path, 1632 struct ext4_extent *ex) { 1633 struct ext4_extent_header *eh; 1634 unsigned int depth; 1635 int merge_done = 0; 1636 int ret = 0; 1637 1638 depth = ext_depth(inode); 1639 BUG_ON(path[depth].p_hdr == NULL); 1640 eh = path[depth].p_hdr; 1641 1642 if (ex > EXT_FIRST_EXTENT(eh)) 1643 merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1); 1644 1645 if (!merge_done) 1646 ret = ext4_ext_try_to_merge_right(inode, path, ex); 1647 1648 return ret; 1649 } 1650 1651 /* 1652 * check if a portion of the "newext" extent overlaps with an 1653 * existing extent. 1654 * 1655 * If there is an overlap discovered, it updates the length of the newext 1656 * such that there will be no overlap, and then returns 1. 1657 * If there is no overlap found, it returns 0. 1658 */ 1659 static unsigned int ext4_ext_check_overlap(struct inode *inode, 1660 struct ext4_extent *newext, 1661 struct ext4_ext_path *path) 1662 { 1663 ext4_lblk_t b1, b2; 1664 unsigned int depth, len1; 1665 unsigned int ret = 0; 1666 1667 b1 = le32_to_cpu(newext->ee_block); 1668 len1 = ext4_ext_get_actual_len(newext); 1669 depth = ext_depth(inode); 1670 if (!path[depth].p_ext) 1671 goto out; 1672 b2 = le32_to_cpu(path[depth].p_ext->ee_block); 1673 1674 /* 1675 * get the next allocated block if the extent in the path 1676 * is before the requested block(s) 1677 */ 1678 if (b2 < b1) { 1679 b2 = ext4_ext_next_allocated_block(path); 1680 if (b2 == EXT_MAX_BLOCKS) 1681 goto out; 1682 } 1683 1684 /* check for wrap through zero on extent logical start block*/ 1685 if (b1 + len1 < b1) { 1686 len1 = EXT_MAX_BLOCKS - b1; 1687 newext->ee_len = cpu_to_le16(len1); 1688 ret = 1; 1689 } 1690 1691 /* check for overlap */ 1692 if (b1 + len1 > b2) { 1693 newext->ee_len = cpu_to_le16(b2 - b1); 1694 ret = 1; 1695 } 1696 out: 1697 return ret; 1698 } 1699 1700 /* 1701 * ext4_ext_insert_extent: 1702 * tries to merge requsted extent into the existing extent or 1703 * inserts requested extent as new one into the tree, 1704 * creating new leaf in the no-space case. 1705 */ 1706 int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, 1707 struct ext4_ext_path *path, 1708 struct ext4_extent *newext, int flag) 1709 { 1710 struct ext4_extent_header *eh; 1711 struct ext4_extent *ex, *fex; 1712 struct ext4_extent *nearex; /* nearest extent */ 1713 struct ext4_ext_path *npath = NULL; 1714 int depth, len, err; 1715 ext4_lblk_t next; 1716 unsigned uninitialized = 0; 1717 int flags = 0; 1718 1719 if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { 1720 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); 1721 return -EIO; 1722 } 1723 depth = ext_depth(inode); 1724 ex = path[depth].p_ext; 1725 if (unlikely(path[depth].p_hdr == NULL)) { 1726 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); 1727 return -EIO; 1728 } 1729 1730 /* try to insert block into found extent and return */ 1731 if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO) 1732 && ext4_can_extents_be_merged(inode, ex, newext)) { 1733 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", 1734 ext4_ext_is_uninitialized(newext), 1735 ext4_ext_get_actual_len(newext), 1736 le32_to_cpu(ex->ee_block), 1737 ext4_ext_is_uninitialized(ex), 1738 ext4_ext_get_actual_len(ex), 1739 ext4_ext_pblock(ex)); 1740 err = ext4_ext_get_access(handle, inode, path + depth); 1741 if (err) 1742 return err; 1743 1744 /* 1745 * ext4_can_extents_be_merged should have checked that either 1746 * both extents are uninitialized, or both aren't. Thus we 1747 * need to check only one of them here. 1748 */ 1749 if (ext4_ext_is_uninitialized(ex)) 1750 uninitialized = 1; 1751 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) 1752 + ext4_ext_get_actual_len(newext)); 1753 if (uninitialized) 1754 ext4_ext_mark_uninitialized(ex); 1755 eh = path[depth].p_hdr; 1756 nearex = ex; 1757 goto merge; 1758 } 1759 1760 repeat: 1761 depth = ext_depth(inode); 1762 eh = path[depth].p_hdr; 1763 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) 1764 goto has_space; 1765 1766 /* probably next leaf has space for us? */ 1767 fex = EXT_LAST_EXTENT(eh); 1768 next = ext4_ext_next_leaf_block(inode, path); 1769 if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block) 1770 && next != EXT_MAX_BLOCKS) { 1771 ext_debug("next leaf block - %d\n", next); 1772 BUG_ON(npath != NULL); 1773 npath = ext4_ext_find_extent(inode, next, NULL); 1774 if (IS_ERR(npath)) 1775 return PTR_ERR(npath); 1776 BUG_ON(npath->p_depth != path->p_depth); 1777 eh = npath[depth].p_hdr; 1778 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) { 1779 ext_debug("next leaf isn't full(%d)\n", 1780 le16_to_cpu(eh->eh_entries)); 1781 path = npath; 1782 goto repeat; 1783 } 1784 ext_debug("next leaf has no free space(%d,%d)\n", 1785 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); 1786 } 1787 1788 /* 1789 * There is no free space in the found leaf. 1790 * We're gonna add a new leaf in the tree. 1791 */ 1792 if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) 1793 flags = EXT4_MB_USE_ROOT_BLOCKS; 1794 err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext); 1795 if (err) 1796 goto cleanup; 1797 depth = ext_depth(inode); 1798 eh = path[depth].p_hdr; 1799 1800 has_space: 1801 nearex = path[depth].p_ext; 1802 1803 err = ext4_ext_get_access(handle, inode, path + depth); 1804 if (err) 1805 goto cleanup; 1806 1807 if (!nearex) { 1808 /* there is no extent in this leaf, create first one */ 1809 ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n", 1810 le32_to_cpu(newext->ee_block), 1811 ext4_ext_pblock(newext), 1812 ext4_ext_is_uninitialized(newext), 1813 ext4_ext_get_actual_len(newext)); 1814 path[depth].p_ext = EXT_FIRST_EXTENT(eh); 1815 } else if (le32_to_cpu(newext->ee_block) 1816 > le32_to_cpu(nearex->ee_block)) { 1817 /* BUG_ON(newext->ee_block == nearex->ee_block); */ 1818 if (nearex != EXT_LAST_EXTENT(eh)) { 1819 len = EXT_MAX_EXTENT(eh) - nearex; 1820 len = (len - 1) * sizeof(struct ext4_extent); 1821 len = len < 0 ? 0 : len; 1822 ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, " 1823 "move %d from 0x%p to 0x%p\n", 1824 le32_to_cpu(newext->ee_block), 1825 ext4_ext_pblock(newext), 1826 ext4_ext_is_uninitialized(newext), 1827 ext4_ext_get_actual_len(newext), 1828 nearex, len, nearex + 1, nearex + 2); 1829 memmove(nearex + 2, nearex + 1, len); 1830 } 1831 path[depth].p_ext = nearex + 1; 1832 } else { 1833 BUG_ON(newext->ee_block == nearex->ee_block); 1834 len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent); 1835 len = len < 0 ? 0 : len; 1836 ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, " 1837 "move %d from 0x%p to 0x%p\n", 1838 le32_to_cpu(newext->ee_block), 1839 ext4_ext_pblock(newext), 1840 ext4_ext_is_uninitialized(newext), 1841 ext4_ext_get_actual_len(newext), 1842 nearex, len, nearex + 1, nearex + 2); 1843 memmove(nearex + 1, nearex, len); 1844 path[depth].p_ext = nearex; 1845 } 1846 1847 le16_add_cpu(&eh->eh_entries, 1); 1848 nearex = path[depth].p_ext; 1849 nearex->ee_block = newext->ee_block; 1850 ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext)); 1851 nearex->ee_len = newext->ee_len; 1852 1853 merge: 1854 /* try to merge extents to the right */ 1855 if (!(flag & EXT4_GET_BLOCKS_PRE_IO)) 1856 ext4_ext_try_to_merge(inode, path, nearex); 1857 1858 /* try to merge extents to the left */ 1859 1860 /* time to correct all indexes above */ 1861 err = ext4_ext_correct_indexes(handle, inode, path); 1862 if (err) 1863 goto cleanup; 1864 1865 err = ext4_ext_dirty(handle, inode, path + depth); 1866 1867 cleanup: 1868 if (npath) { 1869 ext4_ext_drop_refs(npath); 1870 kfree(npath); 1871 } 1872 ext4_ext_invalidate_cache(inode); 1873 return err; 1874 } 1875 1876 static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, 1877 ext4_lblk_t num, ext_prepare_callback func, 1878 void *cbdata) 1879 { 1880 struct ext4_ext_path *path = NULL; 1881 struct ext4_ext_cache cbex; 1882 struct ext4_extent *ex; 1883 ext4_lblk_t next, start = 0, end = 0; 1884 ext4_lblk_t last = block + num; 1885 int depth, exists, err = 0; 1886 1887 BUG_ON(func == NULL); 1888 BUG_ON(inode == NULL); 1889 1890 while (block < last && block != EXT_MAX_BLOCKS) { 1891 num = last - block; 1892 /* find extent for this block */ 1893 down_read(&EXT4_I(inode)->i_data_sem); 1894 path = ext4_ext_find_extent(inode, block, path); 1895 up_read(&EXT4_I(inode)->i_data_sem); 1896 if (IS_ERR(path)) { 1897 err = PTR_ERR(path); 1898 path = NULL; 1899 break; 1900 } 1901 1902 depth = ext_depth(inode); 1903 if (unlikely(path[depth].p_hdr == NULL)) { 1904 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); 1905 err = -EIO; 1906 break; 1907 } 1908 ex = path[depth].p_ext; 1909 next = ext4_ext_next_allocated_block(path); 1910 1911 exists = 0; 1912 if (!ex) { 1913 /* there is no extent yet, so try to allocate 1914 * all requested space */ 1915 start = block; 1916 end = block + num; 1917 } else if (le32_to_cpu(ex->ee_block) > block) { 1918 /* need to allocate space before found extent */ 1919 start = block; 1920 end = le32_to_cpu(ex->ee_block); 1921 if (block + num < end) 1922 end = block + num; 1923 } else if (block >= le32_to_cpu(ex->ee_block) 1924 + ext4_ext_get_actual_len(ex)) { 1925 /* need to allocate space after found extent */ 1926 start = block; 1927 end = block + num; 1928 if (end >= next) 1929 end = next; 1930 } else if (block >= le32_to_cpu(ex->ee_block)) { 1931 /* 1932 * some part of requested space is covered 1933 * by found extent 1934 */ 1935 start = block; 1936 end = le32_to_cpu(ex->ee_block) 1937 + ext4_ext_get_actual_len(ex); 1938 if (block + num < end) 1939 end = block + num; 1940 exists = 1; 1941 } else { 1942 BUG(); 1943 } 1944 BUG_ON(end <= start); 1945 1946 if (!exists) { 1947 cbex.ec_block = start; 1948 cbex.ec_len = end - start; 1949 cbex.ec_start = 0; 1950 } else { 1951 cbex.ec_block = le32_to_cpu(ex->ee_block); 1952 cbex.ec_len = ext4_ext_get_actual_len(ex); 1953 cbex.ec_start = ext4_ext_pblock(ex); 1954 } 1955 1956 if (unlikely(cbex.ec_len == 0)) { 1957 EXT4_ERROR_INODE(inode, "cbex.ec_len == 0"); 1958 err = -EIO; 1959 break; 1960 } 1961 err = func(inode, next, &cbex, ex, cbdata); 1962 ext4_ext_drop_refs(path); 1963 1964 if (err < 0) 1965 break; 1966 1967 if (err == EXT_REPEAT) 1968 continue; 1969 else if (err == EXT_BREAK) { 1970 err = 0; 1971 break; 1972 } 1973 1974 if (ext_depth(inode) != depth) { 1975 /* depth was changed. we have to realloc path */ 1976 kfree(path); 1977 path = NULL; 1978 } 1979 1980 block = cbex.ec_block + cbex.ec_len; 1981 } 1982 1983 if (path) { 1984 ext4_ext_drop_refs(path); 1985 kfree(path); 1986 } 1987 1988 return err; 1989 } 1990 1991 static void 1992 ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, 1993 __u32 len, ext4_fsblk_t start) 1994 { 1995 struct ext4_ext_cache *cex; 1996 BUG_ON(len == 0); 1997 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1998 cex = &EXT4_I(inode)->i_cached_extent; 1999 cex->ec_block = block; 2000 cex->ec_len = len; 2001 cex->ec_start = start; 2002 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 2003 } 2004 2005 /* 2006 * ext4_ext_put_gap_in_cache: 2007 * calculate boundaries of the gap that the requested block fits into 2008 * and cache this gap 2009 */ 2010 static void 2011 ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, 2012 ext4_lblk_t block) 2013 { 2014 int depth = ext_depth(inode); 2015 unsigned long len; 2016 ext4_lblk_t lblock; 2017 struct ext4_extent *ex; 2018 2019 ex = path[depth].p_ext; 2020 if (ex == NULL) { 2021 /* there is no extent yet, so gap is [0;-] */ 2022 lblock = 0; 2023 len = EXT_MAX_BLOCKS; 2024 ext_debug("cache gap(whole file):"); 2025 } else if (block < le32_to_cpu(ex->ee_block)) { 2026 lblock = block; 2027 len = le32_to_cpu(ex->ee_block) - block; 2028 ext_debug("cache gap(before): %u [%u:%u]", 2029 block, 2030 le32_to_cpu(ex->ee_block), 2031 ext4_ext_get_actual_len(ex)); 2032 } else if (block >= le32_to_cpu(ex->ee_block) 2033 + ext4_ext_get_actual_len(ex)) { 2034 ext4_lblk_t next; 2035 lblock = le32_to_cpu(ex->ee_block) 2036 + ext4_ext_get_actual_len(ex); 2037 2038 next = ext4_ext_next_allocated_block(path); 2039 ext_debug("cache gap(after): [%u:%u] %u", 2040 le32_to_cpu(ex->ee_block), 2041 ext4_ext_get_actual_len(ex), 2042 block); 2043 BUG_ON(next == lblock); 2044 len = next - lblock; 2045 } else { 2046 lblock = len = 0; 2047 BUG(); 2048 } 2049 2050 ext_debug(" -> %u:%lu\n", lblock, len); 2051 ext4_ext_put_in_cache(inode, lblock, len, 0); 2052 } 2053 2054 /* 2055 * ext4_ext_in_cache() 2056 * Checks to see if the given block is in the cache. 2057 * If it is, the cached extent is stored in the given 2058 * cache extent pointer. If the cached extent is a hole, 2059 * this routine should be used instead of 2060 * ext4_ext_in_cache if the calling function needs to 2061 * know the size of the hole. 2062 * 2063 * @inode: The files inode 2064 * @block: The block to look for in the cache 2065 * @ex: Pointer where the cached extent will be stored 2066 * if it contains block 2067 * 2068 * Return 0 if cache is invalid; 1 if the cache is valid 2069 */ 2070 static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block, 2071 struct ext4_ext_cache *ex){ 2072 struct ext4_ext_cache *cex; 2073 struct ext4_sb_info *sbi; 2074 int ret = 0; 2075 2076 /* 2077 * We borrow i_block_reservation_lock to protect i_cached_extent 2078 */ 2079 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 2080 cex = &EXT4_I(inode)->i_cached_extent; 2081 sbi = EXT4_SB(inode->i_sb); 2082 2083 /* has cache valid data? */ 2084 if (cex->ec_len == 0) 2085 goto errout; 2086 2087 if (in_range(block, cex->ec_block, cex->ec_len)) { 2088 memcpy(ex, cex, sizeof(struct ext4_ext_cache)); 2089 ext_debug("%u cached by %u:%u:%llu\n", 2090 block, 2091 cex->ec_block, cex->ec_len, cex->ec_start); 2092 ret = 1; 2093 } 2094 errout: 2095 if (!ret) 2096 sbi->extent_cache_misses++; 2097 else 2098 sbi->extent_cache_hits++; 2099 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 2100 return ret; 2101 } 2102 2103 /* 2104 * ext4_ext_in_cache() 2105 * Checks to see if the given block is in the cache. 2106 * If it is, the cached extent is stored in the given 2107 * extent pointer. 2108 * 2109 * @inode: The files inode 2110 * @block: The block to look for in the cache 2111 * @ex: Pointer where the cached extent will be stored 2112 * if it contains block 2113 * 2114 * Return 0 if cache is invalid; 1 if the cache is valid 2115 */ 2116 static int 2117 ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, 2118 struct ext4_extent *ex) 2119 { 2120 struct ext4_ext_cache cex; 2121 int ret = 0; 2122 2123 if (ext4_ext_check_cache(inode, block, &cex)) { 2124 ex->ee_block = cpu_to_le32(cex.ec_block); 2125 ext4_ext_store_pblock(ex, cex.ec_start); 2126 ex->ee_len = cpu_to_le16(cex.ec_len); 2127 ret = 1; 2128 } 2129 2130 return ret; 2131 } 2132 2133 2134 /* 2135 * ext4_ext_rm_idx: 2136 * removes index from the index block. 2137 * It's used in truncate case only, thus all requests are for 2138 * last index in the block only. 2139 */ 2140 static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, 2141 struct ext4_ext_path *path) 2142 { 2143 int err; 2144 ext4_fsblk_t leaf; 2145 2146 /* free index block */ 2147 path--; 2148 leaf = ext4_idx_pblock(path->p_idx); 2149 if (unlikely(path->p_hdr->eh_entries == 0)) { 2150 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); 2151 return -EIO; 2152 } 2153 err = ext4_ext_get_access(handle, inode, path); 2154 if (err) 2155 return err; 2156 le16_add_cpu(&path->p_hdr->eh_entries, -1); 2157 err = ext4_ext_dirty(handle, inode, path); 2158 if (err) 2159 return err; 2160 ext_debug("index is empty, remove it, free block %llu\n", leaf); 2161 ext4_free_blocks(handle, inode, NULL, leaf, 1, 2162 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); 2163 return err; 2164 } 2165 2166 /* 2167 * ext4_ext_calc_credits_for_single_extent: 2168 * This routine returns max. credits that needed to insert an extent 2169 * to the extent tree. 2170 * When pass the actual path, the caller should calculate credits 2171 * under i_data_sem. 2172 */ 2173 int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, 2174 struct ext4_ext_path *path) 2175 { 2176 if (path) { 2177 int depth = ext_depth(inode); 2178 int ret = 0; 2179 2180 /* probably there is space in leaf? */ 2181 if (le16_to_cpu(path[depth].p_hdr->eh_entries) 2182 < le16_to_cpu(path[depth].p_hdr->eh_max)) { 2183 2184 /* 2185 * There are some space in the leaf tree, no 2186 * need to account for leaf block credit 2187 * 2188 * bitmaps and block group descriptor blocks 2189 * and other metadat blocks still need to be 2190 * accounted. 2191 */ 2192 /* 1 bitmap, 1 block group descriptor */ 2193 ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); 2194 return ret; 2195 } 2196 } 2197 2198 return ext4_chunk_trans_blocks(inode, nrblocks); 2199 } 2200 2201 /* 2202 * How many index/leaf blocks need to change/allocate to modify nrblocks? 2203 * 2204 * if nrblocks are fit in a single extent (chunk flag is 1), then 2205 * in the worse case, each tree level index/leaf need to be changed 2206 * if the tree split due to insert a new extent, then the old tree 2207 * index/leaf need to be updated too 2208 * 2209 * If the nrblocks are discontiguous, they could cause 2210 * the whole tree split more than once, but this is really rare. 2211 */ 2212 int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 2213 { 2214 int index; 2215 int depth = ext_depth(inode); 2216 2217 if (chunk) 2218 index = depth * 2; 2219 else 2220 index = depth * 3; 2221 2222 return index; 2223 } 2224 2225 static int ext4_remove_blocks(handle_t *handle, struct inode *inode, 2226 struct ext4_extent *ex, 2227 ext4_lblk_t from, ext4_lblk_t to) 2228 { 2229 unsigned short ee_len = ext4_ext_get_actual_len(ex); 2230 int flags = EXT4_FREE_BLOCKS_FORGET; 2231 2232 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 2233 flags |= EXT4_FREE_BLOCKS_METADATA; 2234 #ifdef EXTENTS_STATS 2235 { 2236 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2237 spin_lock(&sbi->s_ext_stats_lock); 2238 sbi->s_ext_blocks += ee_len; 2239 sbi->s_ext_extents++; 2240 if (ee_len < sbi->s_ext_min) 2241 sbi->s_ext_min = ee_len; 2242 if (ee_len > sbi->s_ext_max) 2243 sbi->s_ext_max = ee_len; 2244 if (ext_depth(inode) > sbi->s_depth_max) 2245 sbi->s_depth_max = ext_depth(inode); 2246 spin_unlock(&sbi->s_ext_stats_lock); 2247 } 2248 #endif 2249 if (from >= le32_to_cpu(ex->ee_block) 2250 && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { 2251 /* tail removal */ 2252 ext4_lblk_t num; 2253 ext4_fsblk_t start; 2254 2255 num = le32_to_cpu(ex->ee_block) + ee_len - from; 2256 start = ext4_ext_pblock(ex) + ee_len - num; 2257 ext_debug("free last %u blocks starting %llu\n", num, start); 2258 ext4_free_blocks(handle, inode, NULL, start, num, flags); 2259 } else if (from == le32_to_cpu(ex->ee_block) 2260 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { 2261 /* head removal */ 2262 ext4_lblk_t num; 2263 ext4_fsblk_t start; 2264 2265 num = to - from; 2266 start = ext4_ext_pblock(ex); 2267 2268 ext_debug("free first %u blocks starting %llu\n", num, start); 2269 ext4_free_blocks(handle, inode, 0, start, num, flags); 2270 2271 } else { 2272 printk(KERN_INFO "strange request: removal(2) " 2273 "%u-%u from %u:%u\n", 2274 from, to, le32_to_cpu(ex->ee_block), ee_len); 2275 } 2276 return 0; 2277 } 2278 2279 2280 /* 2281 * ext4_ext_rm_leaf() Removes the extents associated with the 2282 * blocks appearing between "start" and "end", and splits the extents 2283 * if "start" and "end" appear in the same extent 2284 * 2285 * @handle: The journal handle 2286 * @inode: The files inode 2287 * @path: The path to the leaf 2288 * @start: The first block to remove 2289 * @end: The last block to remove 2290 */ 2291 static int 2292 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, 2293 struct ext4_ext_path *path, ext4_lblk_t start, 2294 ext4_lblk_t end) 2295 { 2296 int err = 0, correct_index = 0; 2297 int depth = ext_depth(inode), credits; 2298 struct ext4_extent_header *eh; 2299 ext4_lblk_t a, b, block; 2300 unsigned num; 2301 ext4_lblk_t ex_ee_block; 2302 unsigned short ex_ee_len; 2303 unsigned uninitialized = 0; 2304 struct ext4_extent *ex; 2305 struct ext4_map_blocks map; 2306 2307 /* the header must be checked already in ext4_ext_remove_space() */ 2308 ext_debug("truncate since %u in leaf\n", start); 2309 if (!path[depth].p_hdr) 2310 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); 2311 eh = path[depth].p_hdr; 2312 if (unlikely(path[depth].p_hdr == NULL)) { 2313 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); 2314 return -EIO; 2315 } 2316 /* find where to start removing */ 2317 ex = EXT_LAST_EXTENT(eh); 2318 2319 ex_ee_block = le32_to_cpu(ex->ee_block); 2320 ex_ee_len = ext4_ext_get_actual_len(ex); 2321 2322 while (ex >= EXT_FIRST_EXTENT(eh) && 2323 ex_ee_block + ex_ee_len > start) { 2324 2325 if (ext4_ext_is_uninitialized(ex)) 2326 uninitialized = 1; 2327 else 2328 uninitialized = 0; 2329 2330 ext_debug("remove ext %u:[%d]%d\n", ex_ee_block, 2331 uninitialized, ex_ee_len); 2332 path[depth].p_ext = ex; 2333 2334 a = ex_ee_block > start ? ex_ee_block : start; 2335 b = ex_ee_block+ex_ee_len - 1 < end ? 2336 ex_ee_block+ex_ee_len - 1 : end; 2337 2338 ext_debug(" border %u:%u\n", a, b); 2339 2340 /* If this extent is beyond the end of the hole, skip it */ 2341 if (end <= ex_ee_block) { 2342 ex--; 2343 ex_ee_block = le32_to_cpu(ex->ee_block); 2344 ex_ee_len = ext4_ext_get_actual_len(ex); 2345 continue; 2346 } else if (a != ex_ee_block && 2347 b != ex_ee_block + ex_ee_len - 1) { 2348 /* 2349 * If this is a truncate, then this condition should 2350 * never happen because at least one of the end points 2351 * needs to be on the edge of the extent. 2352 */ 2353 if (end == EXT_MAX_BLOCKS - 1) { 2354 ext_debug(" bad truncate %u:%u\n", 2355 start, end); 2356 block = 0; 2357 num = 0; 2358 err = -EIO; 2359 goto out; 2360 } 2361 /* 2362 * else this is a hole punch, so the extent needs to 2363 * be split since neither edge of the hole is on the 2364 * extent edge 2365 */ 2366 else{ 2367 map.m_pblk = ext4_ext_pblock(ex); 2368 map.m_lblk = ex_ee_block; 2369 map.m_len = b - ex_ee_block; 2370 2371 err = ext4_split_extent(handle, 2372 inode, path, &map, 0, 2373 EXT4_GET_BLOCKS_PUNCH_OUT_EXT | 2374 EXT4_GET_BLOCKS_PRE_IO); 2375 2376 if (err < 0) 2377 goto out; 2378 2379 ex_ee_len = ext4_ext_get_actual_len(ex); 2380 2381 b = ex_ee_block+ex_ee_len - 1 < end ? 2382 ex_ee_block+ex_ee_len - 1 : end; 2383 2384 /* Then remove tail of this extent */ 2385 block = ex_ee_block; 2386 num = a - block; 2387 } 2388 } else if (a != ex_ee_block) { 2389 /* remove tail of the extent */ 2390 block = ex_ee_block; 2391 num = a - block; 2392 } else if (b != ex_ee_block + ex_ee_len - 1) { 2393 /* remove head of the extent */ 2394 block = b; 2395 num = ex_ee_block + ex_ee_len - b; 2396 2397 /* 2398 * If this is a truncate, this condition 2399 * should never happen 2400 */ 2401 if (end == EXT_MAX_BLOCKS - 1) { 2402 ext_debug(" bad truncate %u:%u\n", 2403 start, end); 2404 err = -EIO; 2405 goto out; 2406 } 2407 } else { 2408 /* remove whole extent: excellent! */ 2409 block = ex_ee_block; 2410 num = 0; 2411 if (a != ex_ee_block) { 2412 ext_debug(" bad truncate %u:%u\n", 2413 start, end); 2414 err = -EIO; 2415 goto out; 2416 } 2417 2418 if (b != ex_ee_block + ex_ee_len - 1) { 2419 ext_debug(" bad truncate %u:%u\n", 2420 start, end); 2421 err = -EIO; 2422 goto out; 2423 } 2424 } 2425 2426 /* 2427 * 3 for leaf, sb, and inode plus 2 (bmap and group 2428 * descriptor) for each block group; assume two block 2429 * groups plus ex_ee_len/blocks_per_block_group for 2430 * the worst case 2431 */ 2432 credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb)); 2433 if (ex == EXT_FIRST_EXTENT(eh)) { 2434 correct_index = 1; 2435 credits += (ext_depth(inode)) + 1; 2436 } 2437 credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); 2438 2439 err = ext4_ext_truncate_extend_restart(handle, inode, credits); 2440 if (err) 2441 goto out; 2442 2443 err = ext4_ext_get_access(handle, inode, path + depth); 2444 if (err) 2445 goto out; 2446 2447 err = ext4_remove_blocks(handle, inode, ex, a, b); 2448 if (err) 2449 goto out; 2450 2451 if (num == 0) { 2452 /* this extent is removed; mark slot entirely unused */ 2453 ext4_ext_store_pblock(ex, 0); 2454 } else if (block != ex_ee_block) { 2455 /* 2456 * If this was a head removal, then we need to update 2457 * the physical block since it is now at a different 2458 * location 2459 */ 2460 ext4_ext_store_pblock(ex, ext4_ext_pblock(ex) + (b-a)); 2461 } 2462 2463 ex->ee_block = cpu_to_le32(block); 2464 ex->ee_len = cpu_to_le16(num); 2465 /* 2466 * Do not mark uninitialized if all the blocks in the 2467 * extent have been removed. 2468 */ 2469 if (uninitialized && num) 2470 ext4_ext_mark_uninitialized(ex); 2471 2472 err = ext4_ext_dirty(handle, inode, path + depth); 2473 if (err) 2474 goto out; 2475 2476 /* 2477 * If the extent was completely released, 2478 * we need to remove it from the leaf 2479 */ 2480 if (num == 0) { 2481 if (end != EXT_MAX_BLOCKS - 1) { 2482 /* 2483 * For hole punching, we need to scoot all the 2484 * extents up when an extent is removed so that 2485 * we dont have blank extents in the middle 2486 */ 2487 memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) * 2488 sizeof(struct ext4_extent)); 2489 2490 /* Now get rid of the one at the end */ 2491 memset(EXT_LAST_EXTENT(eh), 0, 2492 sizeof(struct ext4_extent)); 2493 } 2494 le16_add_cpu(&eh->eh_entries, -1); 2495 } 2496 2497 ext_debug("new extent: %u:%u:%llu\n", block, num, 2498 ext4_ext_pblock(ex)); 2499 ex--; 2500 ex_ee_block = le32_to_cpu(ex->ee_block); 2501 ex_ee_len = ext4_ext_get_actual_len(ex); 2502 } 2503 2504 if (correct_index && eh->eh_entries) 2505 err = ext4_ext_correct_indexes(handle, inode, path); 2506 2507 /* if this leaf is free, then we should 2508 * remove it from index block above */ 2509 if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) 2510 err = ext4_ext_rm_idx(handle, inode, path + depth); 2511 2512 out: 2513 return err; 2514 } 2515 2516 /* 2517 * ext4_ext_more_to_rm: 2518 * returns 1 if current index has to be freed (even partial) 2519 */ 2520 static int 2521 ext4_ext_more_to_rm(struct ext4_ext_path *path) 2522 { 2523 BUG_ON(path->p_idx == NULL); 2524 2525 if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) 2526 return 0; 2527 2528 /* 2529 * if truncate on deeper level happened, it wasn't partial, 2530 * so we have to consider current index for truncation 2531 */ 2532 if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block) 2533 return 0; 2534 return 1; 2535 } 2536 2537 static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, 2538 ext4_lblk_t end) 2539 { 2540 struct super_block *sb = inode->i_sb; 2541 int depth = ext_depth(inode); 2542 struct ext4_ext_path *path; 2543 handle_t *handle; 2544 int i, err; 2545 2546 ext_debug("truncate since %u\n", start); 2547 2548 /* probably first extent we're gonna free will be last in block */ 2549 handle = ext4_journal_start(inode, depth + 1); 2550 if (IS_ERR(handle)) 2551 return PTR_ERR(handle); 2552 2553 again: 2554 ext4_ext_invalidate_cache(inode); 2555 2556 /* 2557 * We start scanning from right side, freeing all the blocks 2558 * after i_size and walking into the tree depth-wise. 2559 */ 2560 depth = ext_depth(inode); 2561 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS); 2562 if (path == NULL) { 2563 ext4_journal_stop(handle); 2564 return -ENOMEM; 2565 } 2566 path[0].p_depth = depth; 2567 path[0].p_hdr = ext_inode_hdr(inode); 2568 if (ext4_ext_check(inode, path[0].p_hdr, depth)) { 2569 err = -EIO; 2570 goto out; 2571 } 2572 i = err = 0; 2573 2574 while (i >= 0 && err == 0) { 2575 if (i == depth) { 2576 /* this is leaf block */ 2577 err = ext4_ext_rm_leaf(handle, inode, path, 2578 start, end); 2579 /* root level has p_bh == NULL, brelse() eats this */ 2580 brelse(path[i].p_bh); 2581 path[i].p_bh = NULL; 2582 i--; 2583 continue; 2584 } 2585 2586 /* this is index block */ 2587 if (!path[i].p_hdr) { 2588 ext_debug("initialize header\n"); 2589 path[i].p_hdr = ext_block_hdr(path[i].p_bh); 2590 } 2591 2592 if (!path[i].p_idx) { 2593 /* this level hasn't been touched yet */ 2594 path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr); 2595 path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1; 2596 ext_debug("init index ptr: hdr 0x%p, num %d\n", 2597 path[i].p_hdr, 2598 le16_to_cpu(path[i].p_hdr->eh_entries)); 2599 } else { 2600 /* we were already here, see at next index */ 2601 path[i].p_idx--; 2602 } 2603 2604 ext_debug("level %d - index, first 0x%p, cur 0x%p\n", 2605 i, EXT_FIRST_INDEX(path[i].p_hdr), 2606 path[i].p_idx); 2607 if (ext4_ext_more_to_rm(path + i)) { 2608 struct buffer_head *bh; 2609 /* go to the next level */ 2610 ext_debug("move to level %d (block %llu)\n", 2611 i + 1, ext4_idx_pblock(path[i].p_idx)); 2612 memset(path + i + 1, 0, sizeof(*path)); 2613 bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx)); 2614 if (!bh) { 2615 /* should we reset i_size? */ 2616 err = -EIO; 2617 break; 2618 } 2619 if (WARN_ON(i + 1 > depth)) { 2620 err = -EIO; 2621 break; 2622 } 2623 if (ext4_ext_check(inode, ext_block_hdr(bh), 2624 depth - i - 1)) { 2625 err = -EIO; 2626 break; 2627 } 2628 path[i + 1].p_bh = bh; 2629 2630 /* save actual number of indexes since this 2631 * number is changed at the next iteration */ 2632 path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries); 2633 i++; 2634 } else { 2635 /* we finished processing this index, go up */ 2636 if (path[i].p_hdr->eh_entries == 0 && i > 0) { 2637 /* index is empty, remove it; 2638 * handle must be already prepared by the 2639 * truncatei_leaf() */ 2640 err = ext4_ext_rm_idx(handle, inode, path + i); 2641 } 2642 /* root level has p_bh == NULL, brelse() eats this */ 2643 brelse(path[i].p_bh); 2644 path[i].p_bh = NULL; 2645 i--; 2646 ext_debug("return to level %d\n", i); 2647 } 2648 } 2649 2650 /* TODO: flexible tree reduction should be here */ 2651 if (path->p_hdr->eh_entries == 0) { 2652 /* 2653 * truncate to zero freed all the tree, 2654 * so we need to correct eh_depth 2655 */ 2656 err = ext4_ext_get_access(handle, inode, path); 2657 if (err == 0) { 2658 ext_inode_hdr(inode)->eh_depth = 0; 2659 ext_inode_hdr(inode)->eh_max = 2660 cpu_to_le16(ext4_ext_space_root(inode, 0)); 2661 err = ext4_ext_dirty(handle, inode, path); 2662 } 2663 } 2664 out: 2665 ext4_ext_drop_refs(path); 2666 kfree(path); 2667 if (err == -EAGAIN) 2668 goto again; 2669 ext4_journal_stop(handle); 2670 2671 return err; 2672 } 2673 2674 /* 2675 * called at mount time 2676 */ 2677 void ext4_ext_init(struct super_block *sb) 2678 { 2679 /* 2680 * possible initialization would be here 2681 */ 2682 2683 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { 2684 #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) 2685 printk(KERN_INFO "EXT4-fs: file extents enabled"); 2686 #ifdef AGGRESSIVE_TEST 2687 printk(", aggressive tests"); 2688 #endif 2689 #ifdef CHECK_BINSEARCH 2690 printk(", check binsearch"); 2691 #endif 2692 #ifdef EXTENTS_STATS 2693 printk(", stats"); 2694 #endif 2695 printk("\n"); 2696 #endif 2697 #ifdef EXTENTS_STATS 2698 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); 2699 EXT4_SB(sb)->s_ext_min = 1 << 30; 2700 EXT4_SB(sb)->s_ext_max = 0; 2701 #endif 2702 } 2703 } 2704 2705 /* 2706 * called at umount time 2707 */ 2708 void ext4_ext_release(struct super_block *sb) 2709 { 2710 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) 2711 return; 2712 2713 #ifdef EXTENTS_STATS 2714 if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) { 2715 struct ext4_sb_info *sbi = EXT4_SB(sb); 2716 printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n", 2717 sbi->s_ext_blocks, sbi->s_ext_extents, 2718 sbi->s_ext_blocks / sbi->s_ext_extents); 2719 printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n", 2720 sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max); 2721 } 2722 #endif 2723 } 2724 2725 /* FIXME!! we need to try to merge to left or right after zero-out */ 2726 static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) 2727 { 2728 ext4_fsblk_t ee_pblock; 2729 unsigned int ee_len; 2730 int ret; 2731 2732 ee_len = ext4_ext_get_actual_len(ex); 2733 ee_pblock = ext4_ext_pblock(ex); 2734 2735 ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS); 2736 if (ret > 0) 2737 ret = 0; 2738 2739 return ret; 2740 } 2741 2742 /* 2743 * used by extent splitting. 2744 */ 2745 #define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ 2746 due to ENOSPC */ 2747 #define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */ 2748 #define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */ 2749 2750 /* 2751 * ext4_split_extent_at() splits an extent at given block. 2752 * 2753 * @handle: the journal handle 2754 * @inode: the file inode 2755 * @path: the path to the extent 2756 * @split: the logical block where the extent is splitted. 2757 * @split_flags: indicates if the extent could be zeroout if split fails, and 2758 * the states(init or uninit) of new extents. 2759 * @flags: flags used to insert new extent to extent tree. 2760 * 2761 * 2762 * Splits extent [a, b] into two extents [a, @split) and [@split, b], states 2763 * of which are deterimined by split_flag. 2764 * 2765 * There are two cases: 2766 * a> the extent are splitted into two extent. 2767 * b> split is not needed, and just mark the extent. 2768 * 2769 * return 0 on success. 2770 */ 2771 static int ext4_split_extent_at(handle_t *handle, 2772 struct inode *inode, 2773 struct ext4_ext_path *path, 2774 ext4_lblk_t split, 2775 int split_flag, 2776 int flags) 2777 { 2778 ext4_fsblk_t newblock; 2779 ext4_lblk_t ee_block; 2780 struct ext4_extent *ex, newex, orig_ex; 2781 struct ext4_extent *ex2 = NULL; 2782 unsigned int ee_len, depth; 2783 int err = 0; 2784 2785 ext_debug("ext4_split_extents_at: inode %lu, logical" 2786 "block %llu\n", inode->i_ino, (unsigned long long)split); 2787 2788 ext4_ext_show_leaf(inode, path); 2789 2790 depth = ext_depth(inode); 2791 ex = path[depth].p_ext; 2792 ee_block = le32_to_cpu(ex->ee_block); 2793 ee_len = ext4_ext_get_actual_len(ex); 2794 newblock = split - ee_block + ext4_ext_pblock(ex); 2795 2796 BUG_ON(split < ee_block || split >= (ee_block + ee_len)); 2797 2798 err = ext4_ext_get_access(handle, inode, path + depth); 2799 if (err) 2800 goto out; 2801 2802 if (split == ee_block) { 2803 /* 2804 * case b: block @split is the block that the extent begins with 2805 * then we just change the state of the extent, and splitting 2806 * is not needed. 2807 */ 2808 if (split_flag & EXT4_EXT_MARK_UNINIT2) 2809 ext4_ext_mark_uninitialized(ex); 2810 else 2811 ext4_ext_mark_initialized(ex); 2812 2813 if (!(flags & EXT4_GET_BLOCKS_PRE_IO)) 2814 ext4_ext_try_to_merge(inode, path, ex); 2815 2816 err = ext4_ext_dirty(handle, inode, path + depth); 2817 goto out; 2818 } 2819 2820 /* case a */ 2821 memcpy(&orig_ex, ex, sizeof(orig_ex)); 2822 ex->ee_len = cpu_to_le16(split - ee_block); 2823 if (split_flag & EXT4_EXT_MARK_UNINIT1) 2824 ext4_ext_mark_uninitialized(ex); 2825 2826 /* 2827 * path may lead to new leaf, not to original leaf any more 2828 * after ext4_ext_insert_extent() returns, 2829 */ 2830 err = ext4_ext_dirty(handle, inode, path + depth); 2831 if (err) 2832 goto fix_extent_len; 2833 2834 ex2 = &newex; 2835 ex2->ee_block = cpu_to_le32(split); 2836 ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block)); 2837 ext4_ext_store_pblock(ex2, newblock); 2838 if (split_flag & EXT4_EXT_MARK_UNINIT2) 2839 ext4_ext_mark_uninitialized(ex2); 2840 2841 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 2842 if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { 2843 err = ext4_ext_zeroout(inode, &orig_ex); 2844 if (err) 2845 goto fix_extent_len; 2846 /* update the extent length and mark as initialized */ 2847 ex->ee_len = cpu_to_le32(ee_len); 2848 ext4_ext_try_to_merge(inode, path, ex); 2849 err = ext4_ext_dirty(handle, inode, path + depth); 2850 goto out; 2851 } else if (err) 2852 goto fix_extent_len; 2853 2854 out: 2855 ext4_ext_show_leaf(inode, path); 2856 return err; 2857 2858 fix_extent_len: 2859 ex->ee_len = orig_ex.ee_len; 2860 ext4_ext_dirty(handle, inode, path + depth); 2861 return err; 2862 } 2863 2864 /* 2865 * ext4_split_extents() splits an extent and mark extent which is covered 2866 * by @map as split_flags indicates 2867 * 2868 * It may result in splitting the extent into multiple extents (upto three) 2869 * There are three possibilities: 2870 * a> There is no split required 2871 * b> Splits in two extents: Split is happening at either end of the extent 2872 * c> Splits in three extents: Somone is splitting in middle of the extent 2873 * 2874 */ 2875 static int ext4_split_extent(handle_t *handle, 2876 struct inode *inode, 2877 struct ext4_ext_path *path, 2878 struct ext4_map_blocks *map, 2879 int split_flag, 2880 int flags) 2881 { 2882 ext4_lblk_t ee_block; 2883 struct ext4_extent *ex; 2884 unsigned int ee_len, depth; 2885 int err = 0; 2886 int uninitialized; 2887 int split_flag1, flags1; 2888 2889 depth = ext_depth(inode); 2890 ex = path[depth].p_ext; 2891 ee_block = le32_to_cpu(ex->ee_block); 2892 ee_len = ext4_ext_get_actual_len(ex); 2893 uninitialized = ext4_ext_is_uninitialized(ex); 2894 2895 if (map->m_lblk + map->m_len < ee_block + ee_len) { 2896 split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ? 2897 EXT4_EXT_MAY_ZEROOUT : 0; 2898 flags1 = flags | EXT4_GET_BLOCKS_PRE_IO; 2899 if (uninitialized) 2900 split_flag1 |= EXT4_EXT_MARK_UNINIT1 | 2901 EXT4_EXT_MARK_UNINIT2; 2902 err = ext4_split_extent_at(handle, inode, path, 2903 map->m_lblk + map->m_len, split_flag1, flags1); 2904 if (err) 2905 goto out; 2906 } 2907 2908 ext4_ext_drop_refs(path); 2909 path = ext4_ext_find_extent(inode, map->m_lblk, path); 2910 if (IS_ERR(path)) 2911 return PTR_ERR(path); 2912 2913 if (map->m_lblk >= ee_block) { 2914 split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ? 2915 EXT4_EXT_MAY_ZEROOUT : 0; 2916 if (uninitialized) 2917 split_flag1 |= EXT4_EXT_MARK_UNINIT1; 2918 if (split_flag & EXT4_EXT_MARK_UNINIT2) 2919 split_flag1 |= EXT4_EXT_MARK_UNINIT2; 2920 err = ext4_split_extent_at(handle, inode, path, 2921 map->m_lblk, split_flag1, flags); 2922 if (err) 2923 goto out; 2924 } 2925 2926 ext4_ext_show_leaf(inode, path); 2927 out: 2928 return err ? err : map->m_len; 2929 } 2930 2931 #define EXT4_EXT_ZERO_LEN 7 2932 /* 2933 * This function is called by ext4_ext_map_blocks() if someone tries to write 2934 * to an uninitialized extent. It may result in splitting the uninitialized 2935 * extent into multiple extents (up to three - one initialized and two 2936 * uninitialized). 2937 * There are three possibilities: 2938 * a> There is no split required: Entire extent should be initialized 2939 * b> Splits in two extents: Write is happening at either end of the extent 2940 * c> Splits in three extents: Somone is writing in middle of the extent 2941 */ 2942 static int ext4_ext_convert_to_initialized(handle_t *handle, 2943 struct inode *inode, 2944 struct ext4_map_blocks *map, 2945 struct ext4_ext_path *path) 2946 { 2947 struct ext4_map_blocks split_map; 2948 struct ext4_extent zero_ex; 2949 struct ext4_extent *ex; 2950 ext4_lblk_t ee_block, eof_block; 2951 unsigned int allocated, ee_len, depth; 2952 int err = 0; 2953 int split_flag = 0; 2954 2955 ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" 2956 "block %llu, max_blocks %u\n", inode->i_ino, 2957 (unsigned long long)map->m_lblk, map->m_len); 2958 2959 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> 2960 inode->i_sb->s_blocksize_bits; 2961 if (eof_block < map->m_lblk + map->m_len) 2962 eof_block = map->m_lblk + map->m_len; 2963 2964 depth = ext_depth(inode); 2965 ex = path[depth].p_ext; 2966 ee_block = le32_to_cpu(ex->ee_block); 2967 ee_len = ext4_ext_get_actual_len(ex); 2968 allocated = ee_len - (map->m_lblk - ee_block); 2969 2970 WARN_ON(map->m_lblk < ee_block); 2971 /* 2972 * It is safe to convert extent to initialized via explicit 2973 * zeroout only if extent is fully insde i_size or new_size. 2974 */ 2975 split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; 2976 2977 /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */ 2978 if (ee_len <= 2*EXT4_EXT_ZERO_LEN && 2979 (EXT4_EXT_MAY_ZEROOUT & split_flag)) { 2980 err = ext4_ext_zeroout(inode, ex); 2981 if (err) 2982 goto out; 2983 2984 err = ext4_ext_get_access(handle, inode, path + depth); 2985 if (err) 2986 goto out; 2987 ext4_ext_mark_initialized(ex); 2988 ext4_ext_try_to_merge(inode, path, ex); 2989 err = ext4_ext_dirty(handle, inode, path + depth); 2990 goto out; 2991 } 2992 2993 /* 2994 * four cases: 2995 * 1. split the extent into three extents. 2996 * 2. split the extent into two extents, zeroout the first half. 2997 * 3. split the extent into two extents, zeroout the second half. 2998 * 4. split the extent into two extents with out zeroout. 2999 */ 3000 split_map.m_lblk = map->m_lblk; 3001 split_map.m_len = map->m_len; 3002 3003 if (allocated > map->m_len) { 3004 if (allocated <= EXT4_EXT_ZERO_LEN && 3005 (EXT4_EXT_MAY_ZEROOUT & split_flag)) { 3006 /* case 3 */ 3007 zero_ex.ee_block = 3008 cpu_to_le32(map->m_lblk); 3009 zero_ex.ee_len = cpu_to_le16(allocated); 3010 ext4_ext_store_pblock(&zero_ex, 3011 ext4_ext_pblock(ex) + map->m_lblk - ee_block); 3012 err = ext4_ext_zeroout(inode, &zero_ex); 3013 if (err) 3014 goto out; 3015 split_map.m_lblk = map->m_lblk; 3016 split_map.m_len = allocated; 3017 } else if ((map->m_lblk - ee_block + map->m_len < 3018 EXT4_EXT_ZERO_LEN) && 3019 (EXT4_EXT_MAY_ZEROOUT & split_flag)) { 3020 /* case 2 */ 3021 if (map->m_lblk != ee_block) { 3022 zero_ex.ee_block = ex->ee_block; 3023 zero_ex.ee_len = cpu_to_le16(map->m_lblk - 3024 ee_block); 3025 ext4_ext_store_pblock(&zero_ex, 3026 ext4_ext_pblock(ex)); 3027 err = ext4_ext_zeroout(inode, &zero_ex); 3028 if (err) 3029 goto out; 3030 } 3031 3032 split_map.m_lblk = ee_block; 3033 split_map.m_len = map->m_lblk - ee_block + map->m_len; 3034 allocated = map->m_len; 3035 } 3036 } 3037 3038 allocated = ext4_split_extent(handle, inode, path, 3039 &split_map, split_flag, 0); 3040 if (allocated < 0) 3041 err = allocated; 3042 3043 out: 3044 return err ? err : allocated; 3045 } 3046 3047 /* 3048 * This function is called by ext4_ext_map_blocks() from 3049 * ext4_get_blocks_dio_write() when DIO to write 3050 * to an uninitialized extent. 3051 * 3052 * Writing to an uninitialized extent may result in splitting the uninitialized 3053 * extent into multiple /initialized uninitialized extents (up to three) 3054 * There are three possibilities: 3055 * a> There is no split required: Entire extent should be uninitialized 3056 * b> Splits in two extents: Write is happening at either end of the extent 3057 * c> Splits in three extents: Somone is writing in middle of the extent 3058 * 3059 * One of more index blocks maybe needed if the extent tree grow after 3060 * the uninitialized extent split. To prevent ENOSPC occur at the IO 3061 * complete, we need to split the uninitialized extent before DIO submit 3062 * the IO. The uninitialized extent called at this time will be split 3063 * into three uninitialized extent(at most). After IO complete, the part 3064 * being filled will be convert to initialized by the end_io callback function 3065 * via ext4_convert_unwritten_extents(). 3066 * 3067 * Returns the size of uninitialized extent to be written on success. 3068 */ 3069 static int ext4_split_unwritten_extents(handle_t *handle, 3070 struct inode *inode, 3071 struct ext4_map_blocks *map, 3072 struct ext4_ext_path *path, 3073 int flags) 3074 { 3075 ext4_lblk_t eof_block; 3076 ext4_lblk_t ee_block; 3077 struct ext4_extent *ex; 3078 unsigned int ee_len; 3079 int split_flag = 0, depth; 3080 3081 ext_debug("ext4_split_unwritten_extents: inode %lu, logical" 3082 "block %llu, max_blocks %u\n", inode->i_ino, 3083 (unsigned long long)map->m_lblk, map->m_len); 3084 3085 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> 3086 inode->i_sb->s_blocksize_bits; 3087 if (eof_block < map->m_lblk + map->m_len) 3088 eof_block = map->m_lblk + map->m_len; 3089 /* 3090 * It is safe to convert extent to initialized via explicit 3091 * zeroout only if extent is fully insde i_size or new_size. 3092 */ 3093 depth = ext_depth(inode); 3094 ex = path[depth].p_ext; 3095 ee_block = le32_to_cpu(ex->ee_block); 3096 ee_len = ext4_ext_get_actual_len(ex); 3097 3098 split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; 3099 split_flag |= EXT4_EXT_MARK_UNINIT2; 3100 3101 flags |= EXT4_GET_BLOCKS_PRE_IO; 3102 return ext4_split_extent(handle, inode, path, map, split_flag, flags); 3103 } 3104 3105 static int ext4_convert_unwritten_extents_endio(handle_t *handle, 3106 struct inode *inode, 3107 struct ext4_ext_path *path) 3108 { 3109 struct ext4_extent *ex; 3110 struct ext4_extent_header *eh; 3111 int depth; 3112 int err = 0; 3113 3114 depth = ext_depth(inode); 3115 eh = path[depth].p_hdr; 3116 ex = path[depth].p_ext; 3117 3118 ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical" 3119 "block %llu, max_blocks %u\n", inode->i_ino, 3120 (unsigned long long)le32_to_cpu(ex->ee_block), 3121 ext4_ext_get_actual_len(ex)); 3122 3123 err = ext4_ext_get_access(handle, inode, path + depth); 3124 if (err) 3125 goto out; 3126 /* first mark the extent as initialized */ 3127 ext4_ext_mark_initialized(ex); 3128 3129 /* note: ext4_ext_correct_indexes() isn't needed here because 3130 * borders are not changed 3131 */ 3132 ext4_ext_try_to_merge(inode, path, ex); 3133 3134 /* Mark modified extent as dirty */ 3135 err = ext4_ext_dirty(handle, inode, path + depth); 3136 out: 3137 ext4_ext_show_leaf(inode, path); 3138 return err; 3139 } 3140 3141 static void unmap_underlying_metadata_blocks(struct block_device *bdev, 3142 sector_t block, int count) 3143 { 3144 int i; 3145 for (i = 0; i < count; i++) 3146 unmap_underlying_metadata(bdev, block + i); 3147 } 3148 3149 /* 3150 * Handle EOFBLOCKS_FL flag, clearing it if necessary 3151 */ 3152 static int check_eofblocks_fl(handle_t *handle, struct inode *inode, 3153 ext4_lblk_t lblk, 3154 struct ext4_ext_path *path, 3155 unsigned int len) 3156 { 3157 int i, depth; 3158 struct ext4_extent_header *eh; 3159 struct ext4_extent *last_ex; 3160 3161 if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) 3162 return 0; 3163 3164 depth = ext_depth(inode); 3165 eh = path[depth].p_hdr; 3166 3167 if (unlikely(!eh->eh_entries)) { 3168 EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and " 3169 "EOFBLOCKS_FL set"); 3170 return -EIO; 3171 } 3172 last_ex = EXT_LAST_EXTENT(eh); 3173 /* 3174 * We should clear the EOFBLOCKS_FL flag if we are writing the 3175 * last block in the last extent in the file. We test this by 3176 * first checking to see if the caller to 3177 * ext4_ext_get_blocks() was interested in the last block (or 3178 * a block beyond the last block) in the current extent. If 3179 * this turns out to be false, we can bail out from this 3180 * function immediately. 3181 */ 3182 if (lblk + len < le32_to_cpu(last_ex->ee_block) + 3183 ext4_ext_get_actual_len(last_ex)) 3184 return 0; 3185 /* 3186 * If the caller does appear to be planning to write at or 3187 * beyond the end of the current extent, we then test to see 3188 * if the current extent is the last extent in the file, by 3189 * checking to make sure it was reached via the rightmost node 3190 * at each level of the tree. 3191 */ 3192 for (i = depth-1; i >= 0; i--) 3193 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr)) 3194 return 0; 3195 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); 3196 return ext4_mark_inode_dirty(handle, inode); 3197 } 3198 3199 static int 3200 ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, 3201 struct ext4_map_blocks *map, 3202 struct ext4_ext_path *path, int flags, 3203 unsigned int allocated, ext4_fsblk_t newblock) 3204 { 3205 int ret = 0; 3206 int err = 0; 3207 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3208 3209 ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical" 3210 "block %llu, max_blocks %u, flags %d, allocated %u", 3211 inode->i_ino, (unsigned long long)map->m_lblk, map->m_len, 3212 flags, allocated); 3213 ext4_ext_show_leaf(inode, path); 3214 3215 /* get_block() before submit the IO, split the extent */ 3216 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3217 ret = ext4_split_unwritten_extents(handle, inode, map, 3218 path, flags); 3219 /* 3220 * Flag the inode(non aio case) or end_io struct (aio case) 3221 * that this IO needs to conversion to written when IO is 3222 * completed 3223 */ 3224 if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) { 3225 io->flag = EXT4_IO_END_UNWRITTEN; 3226 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); 3227 } else 3228 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3229 if (ext4_should_dioread_nolock(inode)) 3230 map->m_flags |= EXT4_MAP_UNINIT; 3231 goto out; 3232 } 3233 /* IO end_io complete, convert the filled extent to written */ 3234 if ((flags & EXT4_GET_BLOCKS_CONVERT)) { 3235 ret = ext4_convert_unwritten_extents_endio(handle, inode, 3236 path); 3237 if (ret >= 0) { 3238 ext4_update_inode_fsync_trans(handle, inode, 1); 3239 err = check_eofblocks_fl(handle, inode, map->m_lblk, 3240 path, map->m_len); 3241 } else 3242 err = ret; 3243 goto out2; 3244 } 3245 /* buffered IO case */ 3246 /* 3247 * repeat fallocate creation request 3248 * we already have an unwritten extent 3249 */ 3250 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) 3251 goto map_out; 3252 3253 /* buffered READ or buffered write_begin() lookup */ 3254 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3255 /* 3256 * We have blocks reserved already. We 3257 * return allocated blocks so that delalloc 3258 * won't do block reservation for us. But 3259 * the buffer head will be unmapped so that 3260 * a read from the block returns 0s. 3261 */ 3262 map->m_flags |= EXT4_MAP_UNWRITTEN; 3263 goto out1; 3264 } 3265 3266 /* buffered write, writepage time, convert*/ 3267 ret = ext4_ext_convert_to_initialized(handle, inode, map, path); 3268 if (ret >= 0) { 3269 ext4_update_inode_fsync_trans(handle, inode, 1); 3270 err = check_eofblocks_fl(handle, inode, map->m_lblk, path, 3271 map->m_len); 3272 if (err < 0) 3273 goto out2; 3274 } 3275 3276 out: 3277 if (ret <= 0) { 3278 err = ret; 3279 goto out2; 3280 } else 3281 allocated = ret; 3282 map->m_flags |= EXT4_MAP_NEW; 3283 /* 3284 * if we allocated more blocks than requested 3285 * we need to make sure we unmap the extra block 3286 * allocated. The actual needed block will get 3287 * unmapped later when we find the buffer_head marked 3288 * new. 3289 */ 3290 if (allocated > map->m_len) { 3291 unmap_underlying_metadata_blocks(inode->i_sb->s_bdev, 3292 newblock + map->m_len, 3293 allocated - map->m_len); 3294 allocated = map->m_len; 3295 } 3296 3297 /* 3298 * If we have done fallocate with the offset that is already 3299 * delayed allocated, we would have block reservation 3300 * and quota reservation done in the delayed write path. 3301 * But fallocate would have already updated quota and block 3302 * count for this offset. So cancel these reservation 3303 */ 3304 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 3305 ext4_da_update_reserve_space(inode, allocated, 0); 3306 3307 map_out: 3308 map->m_flags |= EXT4_MAP_MAPPED; 3309 out1: 3310 if (allocated > map->m_len) 3311 allocated = map->m_len; 3312 ext4_ext_show_leaf(inode, path); 3313 map->m_pblk = newblock; 3314 map->m_len = allocated; 3315 out2: 3316 if (path) { 3317 ext4_ext_drop_refs(path); 3318 kfree(path); 3319 } 3320 return err ? err : allocated; 3321 } 3322 3323 /* 3324 * Block allocation/map/preallocation routine for extents based files 3325 * 3326 * 3327 * Need to be called with 3328 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block 3329 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) 3330 * 3331 * return > 0, number of of blocks already mapped/allocated 3332 * if create == 0 and these are pre-allocated blocks 3333 * buffer head is unmapped 3334 * otherwise blocks are mapped 3335 * 3336 * return = 0, if plain look up failed (blocks have not been allocated) 3337 * buffer head is unmapped 3338 * 3339 * return < 0, error case. 3340 */ 3341 int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, 3342 struct ext4_map_blocks *map, int flags) 3343 { 3344 struct ext4_ext_path *path = NULL; 3345 struct ext4_extent newex, *ex; 3346 ext4_fsblk_t newblock = 0; 3347 int err = 0, depth, ret; 3348 unsigned int allocated = 0; 3349 unsigned int punched_out = 0; 3350 unsigned int result = 0; 3351 struct ext4_allocation_request ar; 3352 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3353 struct ext4_map_blocks punch_map; 3354 3355 ext_debug("blocks %u/%u requested for inode %lu\n", 3356 map->m_lblk, map->m_len, inode->i_ino); 3357 trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); 3358 3359 /* check in cache */ 3360 if (ext4_ext_in_cache(inode, map->m_lblk, &newex) && 3361 ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0)) { 3362 if (!newex.ee_start_lo && !newex.ee_start_hi) { 3363 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3364 /* 3365 * block isn't allocated yet and 3366 * user doesn't want to allocate it 3367 */ 3368 goto out2; 3369 } 3370 /* we should allocate requested block */ 3371 } else { 3372 /* block is already allocated */ 3373 newblock = map->m_lblk 3374 - le32_to_cpu(newex.ee_block) 3375 + ext4_ext_pblock(&newex); 3376 /* number of remaining blocks in the extent */ 3377 allocated = ext4_ext_get_actual_len(&newex) - 3378 (map->m_lblk - le32_to_cpu(newex.ee_block)); 3379 goto out; 3380 } 3381 } 3382 3383 /* find extent for this block */ 3384 path = ext4_ext_find_extent(inode, map->m_lblk, NULL); 3385 if (IS_ERR(path)) { 3386 err = PTR_ERR(path); 3387 path = NULL; 3388 goto out2; 3389 } 3390 3391 depth = ext_depth(inode); 3392 3393 /* 3394 * consistent leaf must not be empty; 3395 * this situation is possible, though, _during_ tree modification; 3396 * this is why assert can't be put in ext4_ext_find_extent() 3397 */ 3398 if (unlikely(path[depth].p_ext == NULL && depth != 0)) { 3399 EXT4_ERROR_INODE(inode, "bad extent address " 3400 "lblock: %lu, depth: %d pblock %lld", 3401 (unsigned long) map->m_lblk, depth, 3402 path[depth].p_block); 3403 err = -EIO; 3404 goto out2; 3405 } 3406 3407 ex = path[depth].p_ext; 3408 if (ex) { 3409 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); 3410 ext4_fsblk_t ee_start = ext4_ext_pblock(ex); 3411 unsigned short ee_len; 3412 3413 /* 3414 * Uninitialized extents are treated as holes, except that 3415 * we split out initialized portions during a write. 3416 */ 3417 ee_len = ext4_ext_get_actual_len(ex); 3418 /* if found extent covers block, simply return it */ 3419 if (in_range(map->m_lblk, ee_block, ee_len)) { 3420 newblock = map->m_lblk - ee_block + ee_start; 3421 /* number of remaining blocks in the extent */ 3422 allocated = ee_len - (map->m_lblk - ee_block); 3423 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, 3424 ee_block, ee_len, newblock); 3425 3426 if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) { 3427 /* 3428 * Do not put uninitialized extent 3429 * in the cache 3430 */ 3431 if (!ext4_ext_is_uninitialized(ex)) { 3432 ext4_ext_put_in_cache(inode, ee_block, 3433 ee_len, ee_start); 3434 goto out; 3435 } 3436 ret = ext4_ext_handle_uninitialized_extents( 3437 handle, inode, map, path, flags, 3438 allocated, newblock); 3439 return ret; 3440 } 3441 3442 /* 3443 * Punch out the map length, but only to the 3444 * end of the extent 3445 */ 3446 punched_out = allocated < map->m_len ? 3447 allocated : map->m_len; 3448 3449 /* 3450 * Sense extents need to be converted to 3451 * uninitialized, they must fit in an 3452 * uninitialized extent 3453 */ 3454 if (punched_out > EXT_UNINIT_MAX_LEN) 3455 punched_out = EXT_UNINIT_MAX_LEN; 3456 3457 punch_map.m_lblk = map->m_lblk; 3458 punch_map.m_pblk = newblock; 3459 punch_map.m_len = punched_out; 3460 punch_map.m_flags = 0; 3461 3462 /* Check to see if the extent needs to be split */ 3463 if (punch_map.m_len != ee_len || 3464 punch_map.m_lblk != ee_block) { 3465 3466 ret = ext4_split_extent(handle, inode, 3467 path, &punch_map, 0, 3468 EXT4_GET_BLOCKS_PUNCH_OUT_EXT | 3469 EXT4_GET_BLOCKS_PRE_IO); 3470 3471 if (ret < 0) { 3472 err = ret; 3473 goto out2; 3474 } 3475 /* 3476 * find extent for the block at 3477 * the start of the hole 3478 */ 3479 ext4_ext_drop_refs(path); 3480 kfree(path); 3481 3482 path = ext4_ext_find_extent(inode, 3483 map->m_lblk, NULL); 3484 if (IS_ERR(path)) { 3485 err = PTR_ERR(path); 3486 path = NULL; 3487 goto out2; 3488 } 3489 3490 depth = ext_depth(inode); 3491 ex = path[depth].p_ext; 3492 ee_len = ext4_ext_get_actual_len(ex); 3493 ee_block = le32_to_cpu(ex->ee_block); 3494 ee_start = ext4_ext_pblock(ex); 3495 3496 } 3497 3498 ext4_ext_mark_uninitialized(ex); 3499 3500 err = ext4_ext_remove_space(inode, map->m_lblk, 3501 map->m_lblk + punched_out); 3502 3503 goto out2; 3504 } 3505 } 3506 3507 /* 3508 * requested block isn't allocated yet; 3509 * we couldn't try to create block if create flag is zero 3510 */ 3511 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3512 /* 3513 * put just found gap into cache to speed up 3514 * subsequent requests 3515 */ 3516 ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); 3517 goto out2; 3518 } 3519 /* 3520 * Okay, we need to do block allocation. 3521 */ 3522 3523 /* find neighbour allocated blocks */ 3524 ar.lleft = map->m_lblk; 3525 err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); 3526 if (err) 3527 goto out2; 3528 ar.lright = map->m_lblk; 3529 err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright); 3530 if (err) 3531 goto out2; 3532 3533 /* 3534 * See if request is beyond maximum number of blocks we can have in 3535 * a single extent. For an initialized extent this limit is 3536 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is 3537 * EXT_UNINIT_MAX_LEN. 3538 */ 3539 if (map->m_len > EXT_INIT_MAX_LEN && 3540 !(flags & EXT4_GET_BLOCKS_UNINIT_EXT)) 3541 map->m_len = EXT_INIT_MAX_LEN; 3542 else if (map->m_len > EXT_UNINIT_MAX_LEN && 3543 (flags & EXT4_GET_BLOCKS_UNINIT_EXT)) 3544 map->m_len = EXT_UNINIT_MAX_LEN; 3545 3546 /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */ 3547 newex.ee_block = cpu_to_le32(map->m_lblk); 3548 newex.ee_len = cpu_to_le16(map->m_len); 3549 err = ext4_ext_check_overlap(inode, &newex, path); 3550 if (err) 3551 allocated = ext4_ext_get_actual_len(&newex); 3552 else 3553 allocated = map->m_len; 3554 3555 /* allocate new block */ 3556 ar.inode = inode; 3557 ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk); 3558 ar.logical = map->m_lblk; 3559 ar.len = allocated; 3560 if (S_ISREG(inode->i_mode)) 3561 ar.flags = EXT4_MB_HINT_DATA; 3562 else 3563 /* disable in-core preallocation for non-regular files */ 3564 ar.flags = 0; 3565 if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE) 3566 ar.flags |= EXT4_MB_HINT_NOPREALLOC; 3567 newblock = ext4_mb_new_blocks(handle, &ar, &err); 3568 if (!newblock) 3569 goto out2; 3570 ext_debug("allocate new block: goal %llu, found %llu/%u\n", 3571 ar.goal, newblock, allocated); 3572 3573 /* try to insert new extent into found leaf and return */ 3574 ext4_ext_store_pblock(&newex, newblock); 3575 newex.ee_len = cpu_to_le16(ar.len); 3576 /* Mark uninitialized */ 3577 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ 3578 ext4_ext_mark_uninitialized(&newex); 3579 /* 3580 * io_end structure was created for every IO write to an 3581 * uninitialized extent. To avoid unnecessary conversion, 3582 * here we flag the IO that really needs the conversion. 3583 * For non asycn direct IO case, flag the inode state 3584 * that we need to perform conversion when IO is done. 3585 */ 3586 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3587 if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) { 3588 io->flag = EXT4_IO_END_UNWRITTEN; 3589 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); 3590 } else 3591 ext4_set_inode_state(inode, 3592 EXT4_STATE_DIO_UNWRITTEN); 3593 } 3594 if (ext4_should_dioread_nolock(inode)) 3595 map->m_flags |= EXT4_MAP_UNINIT; 3596 } 3597 3598 err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len); 3599 if (err) 3600 goto out2; 3601 3602 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3603 if (err) { 3604 /* free data blocks we just allocated */ 3605 /* not a good idea to call discard here directly, 3606 * but otherwise we'd need to call it every free() */ 3607 ext4_discard_preallocations(inode); 3608 ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex), 3609 ext4_ext_get_actual_len(&newex), 0); 3610 goto out2; 3611 } 3612 3613 /* previous routine could use block we allocated */ 3614 newblock = ext4_ext_pblock(&newex); 3615 allocated = ext4_ext_get_actual_len(&newex); 3616 if (allocated > map->m_len) 3617 allocated = map->m_len; 3618 map->m_flags |= EXT4_MAP_NEW; 3619 3620 /* 3621 * Update reserved blocks/metadata blocks after successful 3622 * block allocation which had been deferred till now. 3623 */ 3624 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 3625 ext4_da_update_reserve_space(inode, allocated, 1); 3626 3627 /* 3628 * Cache the extent and update transaction to commit on fdatasync only 3629 * when it is _not_ an uninitialized extent. 3630 */ 3631 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { 3632 ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock); 3633 ext4_update_inode_fsync_trans(handle, inode, 1); 3634 } else 3635 ext4_update_inode_fsync_trans(handle, inode, 0); 3636 out: 3637 if (allocated > map->m_len) 3638 allocated = map->m_len; 3639 ext4_ext_show_leaf(inode, path); 3640 map->m_flags |= EXT4_MAP_MAPPED; 3641 map->m_pblk = newblock; 3642 map->m_len = allocated; 3643 out2: 3644 if (path) { 3645 ext4_ext_drop_refs(path); 3646 kfree(path); 3647 } 3648 trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, 3649 newblock, map->m_len, err ? err : allocated); 3650 3651 result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ? 3652 punched_out : allocated; 3653 3654 return err ? err : result; 3655 } 3656 3657 void ext4_ext_truncate(struct inode *inode) 3658 { 3659 struct address_space *mapping = inode->i_mapping; 3660 struct super_block *sb = inode->i_sb; 3661 ext4_lblk_t last_block; 3662 handle_t *handle; 3663 int err = 0; 3664 3665 /* 3666 * finish any pending end_io work so we won't run the risk of 3667 * converting any truncated blocks to initialized later 3668 */ 3669 ext4_flush_completed_IO(inode); 3670 3671 /* 3672 * probably first extent we're gonna free will be last in block 3673 */ 3674 err = ext4_writepage_trans_blocks(inode); 3675 handle = ext4_journal_start(inode, err); 3676 if (IS_ERR(handle)) 3677 return; 3678 3679 if (inode->i_size & (sb->s_blocksize - 1)) 3680 ext4_block_truncate_page(handle, mapping, inode->i_size); 3681 3682 if (ext4_orphan_add(handle, inode)) 3683 goto out_stop; 3684 3685 down_write(&EXT4_I(inode)->i_data_sem); 3686 ext4_ext_invalidate_cache(inode); 3687 3688 ext4_discard_preallocations(inode); 3689 3690 /* 3691 * TODO: optimization is possible here. 3692 * Probably we need not scan at all, 3693 * because page truncation is enough. 3694 */ 3695 3696 /* we have to know where to truncate from in crash case */ 3697 EXT4_I(inode)->i_disksize = inode->i_size; 3698 ext4_mark_inode_dirty(handle, inode); 3699 3700 last_block = (inode->i_size + sb->s_blocksize - 1) 3701 >> EXT4_BLOCK_SIZE_BITS(sb); 3702 err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); 3703 3704 /* In a multi-transaction truncate, we only make the final 3705 * transaction synchronous. 3706 */ 3707 if (IS_SYNC(inode)) 3708 ext4_handle_sync(handle); 3709 3710 up_write(&EXT4_I(inode)->i_data_sem); 3711 3712 out_stop: 3713 /* 3714 * If this was a simple ftruncate() and the file will remain alive, 3715 * then we need to clear up the orphan record which we created above. 3716 * However, if this was a real unlink then we were called by 3717 * ext4_delete_inode(), and we allow that function to clean up the 3718 * orphan info for us. 3719 */ 3720 if (inode->i_nlink) 3721 ext4_orphan_del(handle, inode); 3722 3723 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 3724 ext4_mark_inode_dirty(handle, inode); 3725 ext4_journal_stop(handle); 3726 } 3727 3728 static void ext4_falloc_update_inode(struct inode *inode, 3729 int mode, loff_t new_size, int update_ctime) 3730 { 3731 struct timespec now; 3732 3733 if (update_ctime) { 3734 now = current_fs_time(inode->i_sb); 3735 if (!timespec_equal(&inode->i_ctime, &now)) 3736 inode->i_ctime = now; 3737 } 3738 /* 3739 * Update only when preallocation was requested beyond 3740 * the file size. 3741 */ 3742 if (!(mode & FALLOC_FL_KEEP_SIZE)) { 3743 if (new_size > i_size_read(inode)) 3744 i_size_write(inode, new_size); 3745 if (new_size > EXT4_I(inode)->i_disksize) 3746 ext4_update_i_disksize(inode, new_size); 3747 } else { 3748 /* 3749 * Mark that we allocate beyond EOF so the subsequent truncate 3750 * can proceed even if the new size is the same as i_size. 3751 */ 3752 if (new_size > i_size_read(inode)) 3753 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); 3754 } 3755 3756 } 3757 3758 /* 3759 * preallocate space for a file. This implements ext4's fallocate file 3760 * operation, which gets called from sys_fallocate system call. 3761 * For block-mapped files, posix_fallocate should fall back to the method 3762 * of writing zeroes to the required new blocks (the same behavior which is 3763 * expected for file systems which do not support fallocate() system call). 3764 */ 3765 long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) 3766 { 3767 struct inode *inode = file->f_path.dentry->d_inode; 3768 handle_t *handle; 3769 loff_t new_size; 3770 unsigned int max_blocks; 3771 int ret = 0; 3772 int ret2 = 0; 3773 int retries = 0; 3774 struct ext4_map_blocks map; 3775 unsigned int credits, blkbits = inode->i_blkbits; 3776 3777 /* 3778 * currently supporting (pre)allocate mode for extent-based 3779 * files _only_ 3780 */ 3781 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 3782 return -EOPNOTSUPP; 3783 3784 /* Return error if mode is not supported */ 3785 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 3786 return -EOPNOTSUPP; 3787 3788 if (mode & FALLOC_FL_PUNCH_HOLE) 3789 return ext4_punch_hole(file, offset, len); 3790 3791 trace_ext4_fallocate_enter(inode, offset, len, mode); 3792 map.m_lblk = offset >> blkbits; 3793 /* 3794 * We can't just convert len to max_blocks because 3795 * If blocksize = 4096 offset = 3072 and len = 2048 3796 */ 3797 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) 3798 - map.m_lblk; 3799 /* 3800 * credits to insert 1 extent into extent tree 3801 */ 3802 credits = ext4_chunk_trans_blocks(inode, max_blocks); 3803 mutex_lock(&inode->i_mutex); 3804 ret = inode_newsize_ok(inode, (len + offset)); 3805 if (ret) { 3806 mutex_unlock(&inode->i_mutex); 3807 trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); 3808 return ret; 3809 } 3810 retry: 3811 while (ret >= 0 && ret < max_blocks) { 3812 map.m_lblk = map.m_lblk + ret; 3813 map.m_len = max_blocks = max_blocks - ret; 3814 handle = ext4_journal_start(inode, credits); 3815 if (IS_ERR(handle)) { 3816 ret = PTR_ERR(handle); 3817 break; 3818 } 3819 ret = ext4_map_blocks(handle, inode, &map, 3820 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT | 3821 EXT4_GET_BLOCKS_NO_NORMALIZE); 3822 if (ret <= 0) { 3823 #ifdef EXT4FS_DEBUG 3824 WARN_ON(ret <= 0); 3825 printk(KERN_ERR "%s: ext4_ext_map_blocks " 3826 "returned error inode#%lu, block=%u, " 3827 "max_blocks=%u", __func__, 3828 inode->i_ino, map.m_lblk, max_blocks); 3829 #endif 3830 ext4_mark_inode_dirty(handle, inode); 3831 ret2 = ext4_journal_stop(handle); 3832 break; 3833 } 3834 if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len, 3835 blkbits) >> blkbits)) 3836 new_size = offset + len; 3837 else 3838 new_size = (map.m_lblk + ret) << blkbits; 3839 3840 ext4_falloc_update_inode(inode, mode, new_size, 3841 (map.m_flags & EXT4_MAP_NEW)); 3842 ext4_mark_inode_dirty(handle, inode); 3843 ret2 = ext4_journal_stop(handle); 3844 if (ret2) 3845 break; 3846 } 3847 if (ret == -ENOSPC && 3848 ext4_should_retry_alloc(inode->i_sb, &retries)) { 3849 ret = 0; 3850 goto retry; 3851 } 3852 mutex_unlock(&inode->i_mutex); 3853 trace_ext4_fallocate_exit(inode, offset, max_blocks, 3854 ret > 0 ? ret2 : ret); 3855 return ret > 0 ? ret2 : ret; 3856 } 3857 3858 /* 3859 * This function convert a range of blocks to written extents 3860 * The caller of this function will pass the start offset and the size. 3861 * all unwritten extents within this range will be converted to 3862 * written extents. 3863 * 3864 * This function is called from the direct IO end io call back 3865 * function, to convert the fallocated extents after IO is completed. 3866 * Returns 0 on success. 3867 */ 3868 int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, 3869 ssize_t len) 3870 { 3871 handle_t *handle; 3872 unsigned int max_blocks; 3873 int ret = 0; 3874 int ret2 = 0; 3875 struct ext4_map_blocks map; 3876 unsigned int credits, blkbits = inode->i_blkbits; 3877 3878 map.m_lblk = offset >> blkbits; 3879 /* 3880 * We can't just convert len to max_blocks because 3881 * If blocksize = 4096 offset = 3072 and len = 2048 3882 */ 3883 max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - 3884 map.m_lblk); 3885 /* 3886 * credits to insert 1 extent into extent tree 3887 */ 3888 credits = ext4_chunk_trans_blocks(inode, max_blocks); 3889 while (ret >= 0 && ret < max_blocks) { 3890 map.m_lblk += ret; 3891 map.m_len = (max_blocks -= ret); 3892 handle = ext4_journal_start(inode, credits); 3893 if (IS_ERR(handle)) { 3894 ret = PTR_ERR(handle); 3895 break; 3896 } 3897 ret = ext4_map_blocks(handle, inode, &map, 3898 EXT4_GET_BLOCKS_IO_CONVERT_EXT); 3899 if (ret <= 0) { 3900 WARN_ON(ret <= 0); 3901 printk(KERN_ERR "%s: ext4_ext_map_blocks " 3902 "returned error inode#%lu, block=%u, " 3903 "max_blocks=%u", __func__, 3904 inode->i_ino, map.m_lblk, map.m_len); 3905 } 3906 ext4_mark_inode_dirty(handle, inode); 3907 ret2 = ext4_journal_stop(handle); 3908 if (ret <= 0 || ret2 ) 3909 break; 3910 } 3911 return ret > 0 ? ret2 : ret; 3912 } 3913 3914 /* 3915 * Callback function called for each extent to gather FIEMAP information. 3916 */ 3917 static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next, 3918 struct ext4_ext_cache *newex, struct ext4_extent *ex, 3919 void *data) 3920 { 3921 __u64 logical; 3922 __u64 physical; 3923 __u64 length; 3924 __u32 flags = 0; 3925 int ret = 0; 3926 struct fiemap_extent_info *fieinfo = data; 3927 unsigned char blksize_bits; 3928 3929 blksize_bits = inode->i_sb->s_blocksize_bits; 3930 logical = (__u64)newex->ec_block << blksize_bits; 3931 3932 if (newex->ec_start == 0) { 3933 /* 3934 * No extent in extent-tree contains block @newex->ec_start, 3935 * then the block may stay in 1)a hole or 2)delayed-extent. 3936 * 3937 * Holes or delayed-extents are processed as follows. 3938 * 1. lookup dirty pages with specified range in pagecache. 3939 * If no page is got, then there is no delayed-extent and 3940 * return with EXT_CONTINUE. 3941 * 2. find the 1st mapped buffer, 3942 * 3. check if the mapped buffer is both in the request range 3943 * and a delayed buffer. If not, there is no delayed-extent, 3944 * then return. 3945 * 4. a delayed-extent is found, the extent will be collected. 3946 */ 3947 ext4_lblk_t end = 0; 3948 pgoff_t last_offset; 3949 pgoff_t offset; 3950 pgoff_t index; 3951 pgoff_t start_index = 0; 3952 struct page **pages = NULL; 3953 struct buffer_head *bh = NULL; 3954 struct buffer_head *head = NULL; 3955 unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *); 3956 3957 pages = kmalloc(PAGE_SIZE, GFP_KERNEL); 3958 if (pages == NULL) 3959 return -ENOMEM; 3960 3961 offset = logical >> PAGE_SHIFT; 3962 repeat: 3963 last_offset = offset; 3964 head = NULL; 3965 ret = find_get_pages_tag(inode->i_mapping, &offset, 3966 PAGECACHE_TAG_DIRTY, nr_pages, pages); 3967 3968 if (!(flags & FIEMAP_EXTENT_DELALLOC)) { 3969 /* First time, try to find a mapped buffer. */ 3970 if (ret == 0) { 3971 out: 3972 for (index = 0; index < ret; index++) 3973 page_cache_release(pages[index]); 3974 /* just a hole. */ 3975 kfree(pages); 3976 return EXT_CONTINUE; 3977 } 3978 index = 0; 3979 3980 next_page: 3981 /* Try to find the 1st mapped buffer. */ 3982 end = ((__u64)pages[index]->index << PAGE_SHIFT) >> 3983 blksize_bits; 3984 if (!page_has_buffers(pages[index])) 3985 goto out; 3986 head = page_buffers(pages[index]); 3987 if (!head) 3988 goto out; 3989 3990 index++; 3991 bh = head; 3992 do { 3993 if (end >= newex->ec_block + 3994 newex->ec_len) 3995 /* The buffer is out of 3996 * the request range. 3997 */ 3998 goto out; 3999 4000 if (buffer_mapped(bh) && 4001 end >= newex->ec_block) { 4002 start_index = index - 1; 4003 /* get the 1st mapped buffer. */ 4004 goto found_mapped_buffer; 4005 } 4006 4007 bh = bh->b_this_page; 4008 end++; 4009 } while (bh != head); 4010 4011 /* No mapped buffer in the range found in this page, 4012 * We need to look up next page. 4013 */ 4014 if (index >= ret) { 4015 /* There is no page left, but we need to limit 4016 * newex->ec_len. 4017 */ 4018 newex->ec_len = end - newex->ec_block; 4019 goto out; 4020 } 4021 goto next_page; 4022 } else { 4023 /*Find contiguous delayed buffers. */ 4024 if (ret > 0 && pages[0]->index == last_offset) 4025 head = page_buffers(pages[0]); 4026 bh = head; 4027 index = 1; 4028 start_index = 0; 4029 } 4030 4031 found_mapped_buffer: 4032 if (bh != NULL && buffer_delay(bh)) { 4033 /* 1st or contiguous delayed buffer found. */ 4034 if (!(flags & FIEMAP_EXTENT_DELALLOC)) { 4035 /* 4036 * 1st delayed buffer found, record 4037 * the start of extent. 4038 */ 4039 flags |= FIEMAP_EXTENT_DELALLOC; 4040 newex->ec_block = end; 4041 logical = (__u64)end << blksize_bits; 4042 } 4043 /* Find contiguous delayed buffers. */ 4044 do { 4045 if (!buffer_delay(bh)) 4046 goto found_delayed_extent; 4047 bh = bh->b_this_page; 4048 end++; 4049 } while (bh != head); 4050 4051 for (; index < ret; index++) { 4052 if (!page_has_buffers(pages[index])) { 4053 bh = NULL; 4054 break; 4055 } 4056 head = page_buffers(pages[index]); 4057 if (!head) { 4058 bh = NULL; 4059 break; 4060 } 4061 4062 if (pages[index]->index != 4063 pages[start_index]->index + index 4064 - start_index) { 4065 /* Blocks are not contiguous. */ 4066 bh = NULL; 4067 break; 4068 } 4069 bh = head; 4070 do { 4071 if (!buffer_delay(bh)) 4072 /* Delayed-extent ends. */ 4073 goto found_delayed_extent; 4074 bh = bh->b_this_page; 4075 end++; 4076 } while (bh != head); 4077 } 4078 } else if (!(flags & FIEMAP_EXTENT_DELALLOC)) 4079 /* a hole found. */ 4080 goto out; 4081 4082 found_delayed_extent: 4083 newex->ec_len = min(end - newex->ec_block, 4084 (ext4_lblk_t)EXT_INIT_MAX_LEN); 4085 if (ret == nr_pages && bh != NULL && 4086 newex->ec_len < EXT_INIT_MAX_LEN && 4087 buffer_delay(bh)) { 4088 /* Have not collected an extent and continue. */ 4089 for (index = 0; index < ret; index++) 4090 page_cache_release(pages[index]); 4091 goto repeat; 4092 } 4093 4094 for (index = 0; index < ret; index++) 4095 page_cache_release(pages[index]); 4096 kfree(pages); 4097 } 4098 4099 physical = (__u64)newex->ec_start << blksize_bits; 4100 length = (__u64)newex->ec_len << blksize_bits; 4101 4102 if (ex && ext4_ext_is_uninitialized(ex)) 4103 flags |= FIEMAP_EXTENT_UNWRITTEN; 4104 4105 if (next == EXT_MAX_BLOCKS) 4106 flags |= FIEMAP_EXTENT_LAST; 4107 4108 ret = fiemap_fill_next_extent(fieinfo, logical, physical, 4109 length, flags); 4110 if (ret < 0) 4111 return ret; 4112 if (ret == 1) 4113 return EXT_BREAK; 4114 return EXT_CONTINUE; 4115 } 4116 4117 /* fiemap flags we can handle specified here */ 4118 #define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) 4119 4120 static int ext4_xattr_fiemap(struct inode *inode, 4121 struct fiemap_extent_info *fieinfo) 4122 { 4123 __u64 physical = 0; 4124 __u64 length; 4125 __u32 flags = FIEMAP_EXTENT_LAST; 4126 int blockbits = inode->i_sb->s_blocksize_bits; 4127 int error = 0; 4128 4129 /* in-inode? */ 4130 if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { 4131 struct ext4_iloc iloc; 4132 int offset; /* offset of xattr in inode */ 4133 4134 error = ext4_get_inode_loc(inode, &iloc); 4135 if (error) 4136 return error; 4137 physical = iloc.bh->b_blocknr << blockbits; 4138 offset = EXT4_GOOD_OLD_INODE_SIZE + 4139 EXT4_I(inode)->i_extra_isize; 4140 physical += offset; 4141 length = EXT4_SB(inode->i_sb)->s_inode_size - offset; 4142 flags |= FIEMAP_EXTENT_DATA_INLINE; 4143 brelse(iloc.bh); 4144 } else { /* external block */ 4145 physical = EXT4_I(inode)->i_file_acl << blockbits; 4146 length = inode->i_sb->s_blocksize; 4147 } 4148 4149 if (physical) 4150 error = fiemap_fill_next_extent(fieinfo, 0, physical, 4151 length, flags); 4152 return (error < 0 ? error : 0); 4153 } 4154 4155 /* 4156 * ext4_ext_punch_hole 4157 * 4158 * Punches a hole of "length" bytes in a file starting 4159 * at byte "offset" 4160 * 4161 * @inode: The inode of the file to punch a hole in 4162 * @offset: The starting byte offset of the hole 4163 * @length: The length of the hole 4164 * 4165 * Returns the number of blocks removed or negative on err 4166 */ 4167 int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) 4168 { 4169 struct inode *inode = file->f_path.dentry->d_inode; 4170 struct super_block *sb = inode->i_sb; 4171 struct ext4_ext_cache cache_ex; 4172 ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks; 4173 struct address_space *mapping = inode->i_mapping; 4174 struct ext4_map_blocks map; 4175 handle_t *handle; 4176 loff_t first_block_offset, last_block_offset, block_len; 4177 loff_t first_page, last_page, first_page_offset, last_page_offset; 4178 int ret, credits, blocks_released, err = 0; 4179 4180 first_block = (offset + sb->s_blocksize - 1) >> 4181 EXT4_BLOCK_SIZE_BITS(sb); 4182 last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); 4183 4184 first_block_offset = first_block << EXT4_BLOCK_SIZE_BITS(sb); 4185 last_block_offset = last_block << EXT4_BLOCK_SIZE_BITS(sb); 4186 4187 first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 4188 last_page = (offset + length) >> PAGE_CACHE_SHIFT; 4189 4190 first_page_offset = first_page << PAGE_CACHE_SHIFT; 4191 last_page_offset = last_page << PAGE_CACHE_SHIFT; 4192 4193 /* 4194 * Write out all dirty pages to avoid race conditions 4195 * Then release them. 4196 */ 4197 if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 4198 err = filemap_write_and_wait_range(mapping, 4199 first_page_offset == 0 ? 0 : first_page_offset-1, 4200 last_page_offset); 4201 4202 if (err) 4203 return err; 4204 } 4205 4206 /* Now release the pages */ 4207 if (last_page_offset > first_page_offset) { 4208 truncate_inode_pages_range(mapping, first_page_offset, 4209 last_page_offset-1); 4210 } 4211 4212 /* finish any pending end_io work */ 4213 ext4_flush_completed_IO(inode); 4214 4215 credits = ext4_writepage_trans_blocks(inode); 4216 handle = ext4_journal_start(inode, credits); 4217 if (IS_ERR(handle)) 4218 return PTR_ERR(handle); 4219 4220 err = ext4_orphan_add(handle, inode); 4221 if (err) 4222 goto out; 4223 4224 /* 4225 * Now we need to zero out the un block aligned data. 4226 * If the file is smaller than a block, just 4227 * zero out the middle 4228 */ 4229 if (first_block > last_block) 4230 ext4_block_zero_page_range(handle, mapping, offset, length); 4231 else { 4232 /* zero out the head of the hole before the first block */ 4233 block_len = first_block_offset - offset; 4234 if (block_len > 0) 4235 ext4_block_zero_page_range(handle, mapping, 4236 offset, block_len); 4237 4238 /* zero out the tail of the hole after the last block */ 4239 block_len = offset + length - last_block_offset; 4240 if (block_len > 0) { 4241 ext4_block_zero_page_range(handle, mapping, 4242 last_block_offset, block_len); 4243 } 4244 } 4245 4246 /* If there are no blocks to remove, return now */ 4247 if (first_block >= last_block) 4248 goto out; 4249 4250 down_write(&EXT4_I(inode)->i_data_sem); 4251 ext4_ext_invalidate_cache(inode); 4252 ext4_discard_preallocations(inode); 4253 4254 /* 4255 * Loop over all the blocks and identify blocks 4256 * that need to be punched out 4257 */ 4258 iblock = first_block; 4259 blocks_released = 0; 4260 while (iblock < last_block) { 4261 max_blocks = last_block - iblock; 4262 num_blocks = 1; 4263 memset(&map, 0, sizeof(map)); 4264 map.m_lblk = iblock; 4265 map.m_len = max_blocks; 4266 ret = ext4_ext_map_blocks(handle, inode, &map, 4267 EXT4_GET_BLOCKS_PUNCH_OUT_EXT); 4268 4269 if (ret > 0) { 4270 blocks_released += ret; 4271 num_blocks = ret; 4272 } else if (ret == 0) { 4273 /* 4274 * If map blocks could not find the block, 4275 * then it is in a hole. If the hole was 4276 * not already cached, then map blocks should 4277 * put it in the cache. So we can get the hole 4278 * out of the cache 4279 */ 4280 memset(&cache_ex, 0, sizeof(cache_ex)); 4281 if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) && 4282 !cache_ex.ec_start) { 4283 4284 /* The hole is cached */ 4285 num_blocks = cache_ex.ec_block + 4286 cache_ex.ec_len - iblock; 4287 4288 } else { 4289 /* The block could not be identified */ 4290 err = -EIO; 4291 break; 4292 } 4293 } else { 4294 /* Map blocks error */ 4295 err = ret; 4296 break; 4297 } 4298 4299 if (num_blocks == 0) { 4300 /* This condition should never happen */ 4301 ext_debug("Block lookup failed"); 4302 err = -EIO; 4303 break; 4304 } 4305 4306 iblock += num_blocks; 4307 } 4308 4309 if (blocks_released > 0) { 4310 ext4_ext_invalidate_cache(inode); 4311 ext4_discard_preallocations(inode); 4312 } 4313 4314 if (IS_SYNC(inode)) 4315 ext4_handle_sync(handle); 4316 4317 up_write(&EXT4_I(inode)->i_data_sem); 4318 4319 out: 4320 ext4_orphan_del(handle, inode); 4321 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 4322 ext4_mark_inode_dirty(handle, inode); 4323 ext4_journal_stop(handle); 4324 return err; 4325 } 4326 int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 4327 __u64 start, __u64 len) 4328 { 4329 ext4_lblk_t start_blk; 4330 int error = 0; 4331 4332 /* fallback to generic here if not in extents fmt */ 4333 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 4334 return generic_block_fiemap(inode, fieinfo, start, len, 4335 ext4_get_block); 4336 4337 if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS)) 4338 return -EBADR; 4339 4340 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { 4341 error = ext4_xattr_fiemap(inode, fieinfo); 4342 } else { 4343 ext4_lblk_t len_blks; 4344 __u64 last_blk; 4345 4346 start_blk = start >> inode->i_sb->s_blocksize_bits; 4347 last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; 4348 if (last_blk >= EXT_MAX_BLOCKS) 4349 last_blk = EXT_MAX_BLOCKS-1; 4350 len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; 4351 4352 /* 4353 * Walk the extent tree gathering extent information. 4354 * ext4_ext_fiemap_cb will push extents back to user. 4355 */ 4356 error = ext4_ext_walk_space(inode, start_blk, len_blks, 4357 ext4_ext_fiemap_cb, fieinfo); 4358 } 4359 4360 return error; 4361 } 4362