/*
 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
 * Written by Alex Tomas <alex@clusterfs.com>
 *
 * Architecture independence:
 *   Copyright (c) 2005, Bull S.A.
 *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
 */

/*
 * Extents support for EXT4
 *
 * TODO:
 *   - ext4*_error() should be used in some situations
 *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
 *   - smart tree reduction
 */

#include <linux/fs.h>
#include <linux/time.h>
#include <linux/jbd2.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/falloc.h>
#include <asm/uaccess.h>
#include <linux/fiemap.h>
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "xattr.h"

#include <trace/events/ext4.h>

/*
 * used by extent splitting.
 */
#define EXT4_EXT_MAY_ZEROOUT	0x1  /* safe to zeroout if split fails \
					due to ENOSPC */
#define EXT4_EXT_MARK_UNINIT1	0x2  /* mark first half uninitialized */
#define EXT4_EXT_MARK_UNINIT2	0x4  /* mark second half uninitialized */

#define EXT4_EXT_DATA_VALID1	0x8  /* first half contains valid data */
#define EXT4_EXT_DATA_VALID2	0x10 /* second half contains valid data */

static __le32 ext4_extent_block_csum(struct inode *inode,
				     struct ext4_extent_header *eh)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	__u32 csum;

	csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
			   EXT4_EXTENT_TAIL_OFFSET(eh));
	return cpu_to_le32(csum);
}

static int ext4_extent_block_csum_verify(struct inode *inode,
					 struct ext4_extent_header *eh)
{
	struct ext4_extent_tail *et;

	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
		EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
		return 1;

	et = find_ext4_extent_tail(eh);
	if (et->et_checksum != ext4_extent_block_csum(inode, eh))
		return 0;
	return 1;
}

static void ext4_extent_block_csum_set(struct inode *inode,
				       struct ext4_extent_header *eh)
{
	struct ext4_extent_tail *et;

	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
		EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
		return;

	et = find_ext4_extent_tail(eh);
	et->et_checksum = ext4_extent_block_csum(inode, eh);
}

static int ext4_split_extent(handle_t *handle,
				struct inode *inode,
				struct ext4_ext_path *path,
				struct ext4_map_blocks *map,
				int split_flag,
				int flags);

static int ext4_split_extent_at(handle_t *handle,
			     struct inode *inode,
			     struct ext4_ext_path *path,
			     ext4_lblk_t split,
			     int split_flag,
			     int flags);

static int ext4_find_delayed_extent(struct inode *inode,
				    struct extent_status *newes);
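/*
 * Illustrative sketch (not part of the original code): the split flags
 * defined above are OR-ed together by callers of ext4_split_extent_at().
 * For example, a hypothetical caller that wants to split an uninitialized
 * extent at logical block @split, keep the first half uninitialized, and
 * allow a zeroout fallback if the split fails with ENOSPC, might pass:
 *
 *	int split_flag = EXT4_EXT_MARK_UNINIT1 | EXT4_EXT_MAY_ZEROOUT;
 *	err = ext4_split_extent_at(handle, inode, path, split,
 *				   split_flag, flags);
 *
 * The combinations actually used are chosen by the conversion helpers
 * elsewhere in this file.
 */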
static int
ext4_ext_truncate_extend_restart(handle_t *handle,
				 struct inode *inode,
				 int needed)
{
	int err;

	if (!ext4_handle_valid(handle))
		return 0;
	if (handle->h_buffer_credits > needed)
		return 0;
	err = ext4_journal_extend(handle, needed);
	if (err <= 0)
		return err;
	err = ext4_truncate_restart_trans(handle, inode, needed);
	if (err == 0)
		err = -EAGAIN;

	return err;
}

/*
 * could return:
 *  - EROFS
 *  - ENOMEM
 */
static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path)
{
	if (path->p_bh) {
		/* path points to block */
		return ext4_journal_get_write_access(handle, path->p_bh);
	}
	/* path points to leaf/index in inode body */
	/* we use in-core data, no need to protect them */
	return 0;
}

/*
 * could return:
 *  - EROFS
 *  - ENOMEM
 *  - EIO
 */
int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
		     struct inode *inode, struct ext4_ext_path *path)
{
	int err;
	if (path->p_bh) {
		ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
		/* path points to block */
		err = __ext4_handle_dirty_metadata(where, line, handle,
						   inode, path->p_bh);
	} else {
		/* path points to leaf/index in inode body */
		err = ext4_mark_inode_dirty(handle, inode);
	}
	return err;
}

static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
			      struct ext4_ext_path *path,
			      ext4_lblk_t block)
{
	if (path) {
		int depth = path->p_depth;
		struct ext4_extent *ex;

		/*
		 * Try to predict block placement assuming that we are
		 * filling in a file which will eventually be
		 * non-sparse --- i.e., in the case of libbfd writing
		 * an ELF object sections out-of-order but in a way
		 * that eventually results in a contiguous object or
		 * executable file, or some database extending a table
		 * space file.  However, this is actually somewhat
		 * non-ideal if we are writing a sparse file such as
		 * qemu or KVM writing a raw image file that is going
		 * to stay fairly sparse, since it will end up
		 * fragmenting the file system's free space.  Maybe we
		 * should have some heuristics or some way to allow
		 * userspace to pass a hint to the file system,
		 * especially if the latter case turns out to be
		 * common.
		 */
		ex = path[depth].p_ext;
		if (ex) {
			ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
			ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);

			if (block > ext_block)
				return ext_pblk + (block - ext_block);
			else
				return ext_pblk - (ext_block - block);
		}

		/* it looks like index is empty;
		 * try to find starting block from index itself */
		if (path[depth].p_bh)
			return path[depth].p_bh->b_blocknr;
	}

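	/*
	 * Worked example (illustrative, not from the original source): if
	 * the extent found in the path above maps logical block 100 to
	 * physical block 5000, then a request for logical block 130 yields
	 * a goal of 5000 + (130 - 100) = 5030, and a request for logical
	 * block 90 yields 5000 - (100 - 90) = 4990.  The result is only an
	 * allocation hint for the block allocator.
	 */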
	/* OK. use inode's group */
	return ext4_inode_to_goal_block(inode);
}

/*
 * Allocation for a meta data block
 */
static ext4_fsblk_t
ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
			struct ext4_ext_path *path,
			struct ext4_extent *ex, int *err, unsigned int flags)
{
	ext4_fsblk_t goal, newblock;

	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
	newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
					NULL, err);
	return newblock;
}

static inline int ext4_ext_space_block(struct inode *inode, int check)
{
	int size;

	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
			/ sizeof(struct ext4_extent);
#ifdef AGGRESSIVE_TEST
	if (!check && size > 6)
		size = 6;
#endif
	return size;
}

static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
{
	int size;

	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
			/ sizeof(struct ext4_extent_idx);
#ifdef AGGRESSIVE_TEST
	if (!check && size > 5)
		size = 5;
#endif
	return size;
}

static inline int ext4_ext_space_root(struct inode *inode, int check)
{
	int size;

	size = sizeof(EXT4_I(inode)->i_data);
	size -= sizeof(struct ext4_extent_header);
	size /= sizeof(struct ext4_extent);
#ifdef AGGRESSIVE_TEST
	if (!check && size > 3)
		size = 3;
#endif
	return size;
}

static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
{
	int size;

	size = sizeof(EXT4_I(inode)->i_data);
	size -= sizeof(struct ext4_extent_header);
	size /= sizeof(struct ext4_extent_idx);
#ifdef AGGRESSIVE_TEST
	if (!check && size > 4)
		size = 4;
#endif
	return size;
}

/*
 * Calculate the number of metadata blocks needed
 * to allocate @blocks
 * Worst case is one block per extent
 */
int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int idxs;

	idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
		/ sizeof(struct ext4_extent_idx));

	/*
	 * If the new delayed allocation block is contiguous with the
	 * previous da block, it can share index blocks with the
	 * previous block, so we only need to allocate a new index
	 * block every idxs leaf blocks.  At idxs**2 blocks, we need
	 * an additional index block, and at idxs**3 blocks, yet
	 * another index block.
	 */
	if (ei->i_da_metadata_calc_len &&
	    ei->i_da_metadata_calc_last_lblock+1 == lblock) {
		int num = 0;

		if ((ei->i_da_metadata_calc_len % idxs) == 0)
			num++;
		if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
			num++;
		if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
			num++;
			ei->i_da_metadata_calc_len = 0;
		} else
			ei->i_da_metadata_calc_len++;
		ei->i_da_metadata_calc_last_lblock++;
		return num;
	}

	/*
	 * In the worst case we need a new set of index blocks at
	 * every level of the inode's extent tree.
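	 *
	 * Worked example (illustrative): with a 4 KiB block size, a 12-byte
	 * extent header and 12-byte index entries, idxs = (4096 - 12) / 12
	 * = 340, so a run of contiguous delayed blocks is charged one extra
	 * index block every 340 blocks, another every 340*340 blocks, and
	 * so on; the fallback below charges one block per tree level.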
333 */ 334 ei->i_da_metadata_calc_len = 1; 335 ei->i_da_metadata_calc_last_lblock = lblock; 336 return ext_depth(inode) + 1; 337 } 338 339 static int 340 ext4_ext_max_entries(struct inode *inode, int depth) 341 { 342 int max; 343 344 if (depth == ext_depth(inode)) { 345 if (depth == 0) 346 max = ext4_ext_space_root(inode, 1); 347 else 348 max = ext4_ext_space_root_idx(inode, 1); 349 } else { 350 if (depth == 0) 351 max = ext4_ext_space_block(inode, 1); 352 else 353 max = ext4_ext_space_block_idx(inode, 1); 354 } 355 356 return max; 357 } 358 359 static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) 360 { 361 ext4_fsblk_t block = ext4_ext_pblock(ext); 362 int len = ext4_ext_get_actual_len(ext); 363 364 if (len == 0) 365 return 0; 366 return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); 367 } 368 369 static int ext4_valid_extent_idx(struct inode *inode, 370 struct ext4_extent_idx *ext_idx) 371 { 372 ext4_fsblk_t block = ext4_idx_pblock(ext_idx); 373 374 return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1); 375 } 376 377 static int ext4_valid_extent_entries(struct inode *inode, 378 struct ext4_extent_header *eh, 379 int depth) 380 { 381 unsigned short entries; 382 if (eh->eh_entries == 0) 383 return 1; 384 385 entries = le16_to_cpu(eh->eh_entries); 386 387 if (depth == 0) { 388 /* leaf entries */ 389 struct ext4_extent *ext = EXT_FIRST_EXTENT(eh); 390 while (entries) { 391 if (!ext4_valid_extent(inode, ext)) 392 return 0; 393 ext++; 394 entries--; 395 } 396 } else { 397 struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh); 398 while (entries) { 399 if (!ext4_valid_extent_idx(inode, ext_idx)) 400 return 0; 401 ext_idx++; 402 entries--; 403 } 404 } 405 return 1; 406 } 407 408 static int __ext4_ext_check(const char *function, unsigned int line, 409 struct inode *inode, struct ext4_extent_header *eh, 410 int depth, ext4_fsblk_t pblk) 411 { 412 const char *error_msg; 413 int max = 0; 414 415 if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) { 416 error_msg = "invalid magic"; 417 goto corrupted; 418 } 419 if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) { 420 error_msg = "unexpected eh_depth"; 421 goto corrupted; 422 } 423 if (unlikely(eh->eh_max == 0)) { 424 error_msg = "invalid eh_max"; 425 goto corrupted; 426 } 427 max = ext4_ext_max_entries(inode, depth); 428 if (unlikely(le16_to_cpu(eh->eh_max) > max)) { 429 error_msg = "too large eh_max"; 430 goto corrupted; 431 } 432 if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) { 433 error_msg = "invalid eh_entries"; 434 goto corrupted; 435 } 436 if (!ext4_valid_extent_entries(inode, eh, depth)) { 437 error_msg = "invalid extent entries"; 438 goto corrupted; 439 } 440 /* Verify checksum on non-root extent tree nodes */ 441 if (ext_depth(inode) != depth && 442 !ext4_extent_block_csum_verify(inode, eh)) { 443 error_msg = "extent tree corrupted"; 444 goto corrupted; 445 } 446 return 0; 447 448 corrupted: 449 ext4_error_inode(inode, function, line, 0, 450 "pblk %llu bad header/extent: %s - magic %x, " 451 "entries %u, max %u(%u), depth %u(%u)", 452 (unsigned long long) pblk, error_msg, 453 le16_to_cpu(eh->eh_magic), 454 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), 455 max, le16_to_cpu(eh->eh_depth), depth); 456 return -EIO; 457 } 458 459 #define ext4_ext_check(inode, eh, depth, pblk) \ 460 __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk)) 461 462 int ext4_ext_check_inode(struct inode *inode) 463 { 464 return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0); 
465 } 466 467 static struct buffer_head * 468 __read_extent_tree_block(const char *function, unsigned int line, 469 struct inode *inode, ext4_fsblk_t pblk, int depth, 470 int flags) 471 { 472 struct buffer_head *bh; 473 int err; 474 475 bh = sb_getblk(inode->i_sb, pblk); 476 if (unlikely(!bh)) 477 return ERR_PTR(-ENOMEM); 478 479 if (!bh_uptodate_or_lock(bh)) { 480 trace_ext4_ext_load_extent(inode, pblk, _RET_IP_); 481 err = bh_submit_read(bh); 482 if (err < 0) 483 goto errout; 484 } 485 if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE)) 486 return bh; 487 err = __ext4_ext_check(function, line, inode, 488 ext_block_hdr(bh), depth, pblk); 489 if (err) 490 goto errout; 491 set_buffer_verified(bh); 492 /* 493 * If this is a leaf block, cache all of its entries 494 */ 495 if (!(flags & EXT4_EX_NOCACHE) && depth == 0) { 496 struct ext4_extent_header *eh = ext_block_hdr(bh); 497 struct ext4_extent *ex = EXT_FIRST_EXTENT(eh); 498 ext4_lblk_t prev = 0; 499 int i; 500 501 for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) { 502 unsigned int status = EXTENT_STATUS_WRITTEN; 503 ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); 504 int len = ext4_ext_get_actual_len(ex); 505 506 if (prev && (prev != lblk)) 507 ext4_es_cache_extent(inode, prev, 508 lblk - prev, ~0, 509 EXTENT_STATUS_HOLE); 510 511 if (ext4_ext_is_uninitialized(ex)) 512 status = EXTENT_STATUS_UNWRITTEN; 513 ext4_es_cache_extent(inode, lblk, len, 514 ext4_ext_pblock(ex), status); 515 prev = lblk + len; 516 } 517 } 518 return bh; 519 errout: 520 put_bh(bh); 521 return ERR_PTR(err); 522 523 } 524 525 #define read_extent_tree_block(inode, pblk, depth, flags) \ 526 __read_extent_tree_block(__func__, __LINE__, (inode), (pblk), \ 527 (depth), (flags)) 528 529 /* 530 * This function is called to cache a file's extent information in the 531 * extent status tree 532 */ 533 int ext4_ext_precache(struct inode *inode) 534 { 535 struct ext4_inode_info *ei = EXT4_I(inode); 536 struct ext4_ext_path *path = NULL; 537 struct buffer_head *bh; 538 int i = 0, depth, ret = 0; 539 540 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 541 return 0; /* not an extent-mapped inode */ 542 543 down_read(&ei->i_data_sem); 544 depth = ext_depth(inode); 545 546 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), 547 GFP_NOFS); 548 if (path == NULL) { 549 up_read(&ei->i_data_sem); 550 return -ENOMEM; 551 } 552 553 /* Don't cache anything if there are no external extent blocks */ 554 if (depth == 0) 555 goto out; 556 path[0].p_hdr = ext_inode_hdr(inode); 557 ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0); 558 if (ret) 559 goto out; 560 path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr); 561 while (i >= 0) { 562 /* 563 * If this is a leaf block or we've reached the end of 564 * the index block, go up 565 */ 566 if ((i == depth) || 567 path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) { 568 brelse(path[i].p_bh); 569 path[i].p_bh = NULL; 570 i--; 571 continue; 572 } 573 bh = read_extent_tree_block(inode, 574 ext4_idx_pblock(path[i].p_idx++), 575 depth - i - 1, 576 EXT4_EX_FORCE_CACHE); 577 if (IS_ERR(bh)) { 578 ret = PTR_ERR(bh); 579 break; 580 } 581 i++; 582 path[i].p_bh = bh; 583 path[i].p_hdr = ext_block_hdr(bh); 584 path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr); 585 } 586 ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED); 587 out: 588 up_read(&ei->i_data_sem); 589 ext4_ext_drop_refs(path); 590 kfree(path); 591 return ret; 592 } 593 594 #ifdef EXT_DEBUG 595 static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) 596 { 597 
int k, l = path->p_depth; 598 599 ext_debug("path:"); 600 for (k = 0; k <= l; k++, path++) { 601 if (path->p_idx) { 602 ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block), 603 ext4_idx_pblock(path->p_idx)); 604 } else if (path->p_ext) { 605 ext_debug(" %d:[%d]%d:%llu ", 606 le32_to_cpu(path->p_ext->ee_block), 607 ext4_ext_is_uninitialized(path->p_ext), 608 ext4_ext_get_actual_len(path->p_ext), 609 ext4_ext_pblock(path->p_ext)); 610 } else 611 ext_debug(" []"); 612 } 613 ext_debug("\n"); 614 } 615 616 static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) 617 { 618 int depth = ext_depth(inode); 619 struct ext4_extent_header *eh; 620 struct ext4_extent *ex; 621 int i; 622 623 if (!path) 624 return; 625 626 eh = path[depth].p_hdr; 627 ex = EXT_FIRST_EXTENT(eh); 628 629 ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino); 630 631 for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { 632 ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), 633 ext4_ext_is_uninitialized(ex), 634 ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); 635 } 636 ext_debug("\n"); 637 } 638 639 static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, 640 ext4_fsblk_t newblock, int level) 641 { 642 int depth = ext_depth(inode); 643 struct ext4_extent *ex; 644 645 if (depth != level) { 646 struct ext4_extent_idx *idx; 647 idx = path[level].p_idx; 648 while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) { 649 ext_debug("%d: move %d:%llu in new index %llu\n", level, 650 le32_to_cpu(idx->ei_block), 651 ext4_idx_pblock(idx), 652 newblock); 653 idx++; 654 } 655 656 return; 657 } 658 659 ex = path[depth].p_ext; 660 while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) { 661 ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n", 662 le32_to_cpu(ex->ee_block), 663 ext4_ext_pblock(ex), 664 ext4_ext_is_uninitialized(ex), 665 ext4_ext_get_actual_len(ex), 666 newblock); 667 ex++; 668 } 669 } 670 671 #else 672 #define ext4_ext_show_path(inode, path) 673 #define ext4_ext_show_leaf(inode, path) 674 #define ext4_ext_show_move(inode, path, newblock, level) 675 #endif 676 677 void ext4_ext_drop_refs(struct ext4_ext_path *path) 678 { 679 int depth = path->p_depth; 680 int i; 681 682 for (i = 0; i <= depth; i++, path++) 683 if (path->p_bh) { 684 brelse(path->p_bh); 685 path->p_bh = NULL; 686 } 687 } 688 689 /* 690 * ext4_ext_binsearch_idx: 691 * binary search for the closest index of the given block 692 * the header must be checked before calling this 693 */ 694 static void 695 ext4_ext_binsearch_idx(struct inode *inode, 696 struct ext4_ext_path *path, ext4_lblk_t block) 697 { 698 struct ext4_extent_header *eh = path->p_hdr; 699 struct ext4_extent_idx *r, *l, *m; 700 701 702 ext_debug("binsearch for %u(idx): ", block); 703 704 l = EXT_FIRST_INDEX(eh) + 1; 705 r = EXT_LAST_INDEX(eh); 706 while (l <= r) { 707 m = l + (r - l) / 2; 708 if (block < le32_to_cpu(m->ei_block)) 709 r = m - 1; 710 else 711 l = m + 1; 712 ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block), 713 m, le32_to_cpu(m->ei_block), 714 r, le32_to_cpu(r->ei_block)); 715 } 716 717 path->p_idx = l - 1; 718 ext_debug(" -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block), 719 ext4_idx_pblock(path->p_idx)); 720 721 #ifdef CHECK_BINSEARCH 722 { 723 struct ext4_extent_idx *chix, *ix; 724 int k; 725 726 chix = ix = EXT_FIRST_INDEX(eh); 727 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) { 728 if (k != 0 && 729 le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) { 730 printk(KERN_DEBUG "k=%d, ix=0x%p, " 
731 "first=0x%p\n", k, 732 ix, EXT_FIRST_INDEX(eh)); 733 printk(KERN_DEBUG "%u <= %u\n", 734 le32_to_cpu(ix->ei_block), 735 le32_to_cpu(ix[-1].ei_block)); 736 } 737 BUG_ON(k && le32_to_cpu(ix->ei_block) 738 <= le32_to_cpu(ix[-1].ei_block)); 739 if (block < le32_to_cpu(ix->ei_block)) 740 break; 741 chix = ix; 742 } 743 BUG_ON(chix != path->p_idx); 744 } 745 #endif 746 747 } 748 749 /* 750 * ext4_ext_binsearch: 751 * binary search for closest extent of the given block 752 * the header must be checked before calling this 753 */ 754 static void 755 ext4_ext_binsearch(struct inode *inode, 756 struct ext4_ext_path *path, ext4_lblk_t block) 757 { 758 struct ext4_extent_header *eh = path->p_hdr; 759 struct ext4_extent *r, *l, *m; 760 761 if (eh->eh_entries == 0) { 762 /* 763 * this leaf is empty: 764 * we get such a leaf in split/add case 765 */ 766 return; 767 } 768 769 ext_debug("binsearch for %u: ", block); 770 771 l = EXT_FIRST_EXTENT(eh) + 1; 772 r = EXT_LAST_EXTENT(eh); 773 774 while (l <= r) { 775 m = l + (r - l) / 2; 776 if (block < le32_to_cpu(m->ee_block)) 777 r = m - 1; 778 else 779 l = m + 1; 780 ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block), 781 m, le32_to_cpu(m->ee_block), 782 r, le32_to_cpu(r->ee_block)); 783 } 784 785 path->p_ext = l - 1; 786 ext_debug(" -> %d:%llu:[%d]%d ", 787 le32_to_cpu(path->p_ext->ee_block), 788 ext4_ext_pblock(path->p_ext), 789 ext4_ext_is_uninitialized(path->p_ext), 790 ext4_ext_get_actual_len(path->p_ext)); 791 792 #ifdef CHECK_BINSEARCH 793 { 794 struct ext4_extent *chex, *ex; 795 int k; 796 797 chex = ex = EXT_FIRST_EXTENT(eh); 798 for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) { 799 BUG_ON(k && le32_to_cpu(ex->ee_block) 800 <= le32_to_cpu(ex[-1].ee_block)); 801 if (block < le32_to_cpu(ex->ee_block)) 802 break; 803 chex = ex; 804 } 805 BUG_ON(chex != path->p_ext); 806 } 807 #endif 808 809 } 810 811 int ext4_ext_tree_init(handle_t *handle, struct inode *inode) 812 { 813 struct ext4_extent_header *eh; 814 815 eh = ext_inode_hdr(inode); 816 eh->eh_depth = 0; 817 eh->eh_entries = 0; 818 eh->eh_magic = EXT4_EXT_MAGIC; 819 eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); 820 ext4_mark_inode_dirty(handle, inode); 821 return 0; 822 } 823 824 struct ext4_ext_path * 825 ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, 826 struct ext4_ext_path *path, int flags) 827 { 828 struct ext4_extent_header *eh; 829 struct buffer_head *bh; 830 short int depth, i, ppos = 0, alloc = 0; 831 int ret; 832 833 eh = ext_inode_hdr(inode); 834 depth = ext_depth(inode); 835 836 /* account possible depth increase */ 837 if (!path) { 838 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2), 839 GFP_NOFS); 840 if (!path) 841 return ERR_PTR(-ENOMEM); 842 alloc = 1; 843 } 844 path[0].p_hdr = eh; 845 path[0].p_bh = NULL; 846 847 i = depth; 848 /* walk through the tree */ 849 while (i) { 850 ext_debug("depth %d: num %d, max %d\n", 851 ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); 852 853 ext4_ext_binsearch_idx(inode, path + ppos, block); 854 path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx); 855 path[ppos].p_depth = i; 856 path[ppos].p_ext = NULL; 857 858 bh = read_extent_tree_block(inode, path[ppos].p_block, --i, 859 flags); 860 if (IS_ERR(bh)) { 861 ret = PTR_ERR(bh); 862 goto err; 863 } 864 865 eh = ext_block_hdr(bh); 866 ppos++; 867 if (unlikely(ppos > depth)) { 868 put_bh(bh); 869 EXT4_ERROR_INODE(inode, 870 "ppos %d > depth %d", ppos, depth); 871 ret = -EIO; 872 goto err; 873 } 874 path[ppos].p_bh = bh; 875 
path[ppos].p_hdr = eh; 876 } 877 878 path[ppos].p_depth = i; 879 path[ppos].p_ext = NULL; 880 path[ppos].p_idx = NULL; 881 882 /* find extent */ 883 ext4_ext_binsearch(inode, path + ppos, block); 884 /* if not an empty leaf */ 885 if (path[ppos].p_ext) 886 path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext); 887 888 ext4_ext_show_path(inode, path); 889 890 return path; 891 892 err: 893 ext4_ext_drop_refs(path); 894 if (alloc) 895 kfree(path); 896 return ERR_PTR(ret); 897 } 898 899 /* 900 * ext4_ext_insert_index: 901 * insert new index [@logical;@ptr] into the block at @curp; 902 * check where to insert: before @curp or after @curp 903 */ 904 static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, 905 struct ext4_ext_path *curp, 906 int logical, ext4_fsblk_t ptr) 907 { 908 struct ext4_extent_idx *ix; 909 int len, err; 910 911 err = ext4_ext_get_access(handle, inode, curp); 912 if (err) 913 return err; 914 915 if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) { 916 EXT4_ERROR_INODE(inode, 917 "logical %d == ei_block %d!", 918 logical, le32_to_cpu(curp->p_idx->ei_block)); 919 return -EIO; 920 } 921 922 if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) 923 >= le16_to_cpu(curp->p_hdr->eh_max))) { 924 EXT4_ERROR_INODE(inode, 925 "eh_entries %d >= eh_max %d!", 926 le16_to_cpu(curp->p_hdr->eh_entries), 927 le16_to_cpu(curp->p_hdr->eh_max)); 928 return -EIO; 929 } 930 931 if (logical > le32_to_cpu(curp->p_idx->ei_block)) { 932 /* insert after */ 933 ext_debug("insert new index %d after: %llu\n", logical, ptr); 934 ix = curp->p_idx + 1; 935 } else { 936 /* insert before */ 937 ext_debug("insert new index %d before: %llu\n", logical, ptr); 938 ix = curp->p_idx; 939 } 940 941 len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1; 942 BUG_ON(len < 0); 943 if (len > 0) { 944 ext_debug("insert new index %d: " 945 "move %d indices from 0x%p to 0x%p\n", 946 logical, len, ix, ix + 1); 947 memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx)); 948 } 949 950 if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) { 951 EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!"); 952 return -EIO; 953 } 954 955 ix->ei_block = cpu_to_le32(logical); 956 ext4_idx_store_pblock(ix, ptr); 957 le16_add_cpu(&curp->p_hdr->eh_entries, 1); 958 959 if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) { 960 EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!"); 961 return -EIO; 962 } 963 964 err = ext4_ext_dirty(handle, inode, curp); 965 ext4_std_error(inode->i_sb, err); 966 967 return err; 968 } 969 970 /* 971 * ext4_ext_split: 972 * inserts new subtree into the path, using free index entry 973 * at depth @at: 974 * - allocates all needed blocks (new leaf and all intermediate index blocks) 975 * - makes decision where to split 976 * - moves remaining extents and index entries (right to the split point) 977 * into the newly allocated blocks 978 * - initializes subtree 979 */ 980 static int ext4_ext_split(handle_t *handle, struct inode *inode, 981 unsigned int flags, 982 struct ext4_ext_path *path, 983 struct ext4_extent *newext, int at) 984 { 985 struct buffer_head *bh = NULL; 986 int depth = ext_depth(inode); 987 struct ext4_extent_header *neh; 988 struct ext4_extent_idx *fidx; 989 int i = at, k, m, a; 990 ext4_fsblk_t newblock, oldblock; 991 __le32 border; 992 ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */ 993 int err = 0; 994 995 /* make decision: where to split? 
*/ 996 /* FIXME: now decision is simplest: at current extent */ 997 998 /* if current leaf will be split, then we should use 999 * border from split point */ 1000 if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) { 1001 EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!"); 1002 return -EIO; 1003 } 1004 if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { 1005 border = path[depth].p_ext[1].ee_block; 1006 ext_debug("leaf will be split." 1007 " next leaf starts at %d\n", 1008 le32_to_cpu(border)); 1009 } else { 1010 border = newext->ee_block; 1011 ext_debug("leaf will be added." 1012 " next leaf starts at %d\n", 1013 le32_to_cpu(border)); 1014 } 1015 1016 /* 1017 * If error occurs, then we break processing 1018 * and mark filesystem read-only. index won't 1019 * be inserted and tree will be in consistent 1020 * state. Next mount will repair buffers too. 1021 */ 1022 1023 /* 1024 * Get array to track all allocated blocks. 1025 * We need this to handle errors and free blocks 1026 * upon them. 1027 */ 1028 ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS); 1029 if (!ablocks) 1030 return -ENOMEM; 1031 1032 /* allocate all needed blocks */ 1033 ext_debug("allocate %d blocks for indexes/leaf\n", depth - at); 1034 for (a = 0; a < depth - at; a++) { 1035 newblock = ext4_ext_new_meta_block(handle, inode, path, 1036 newext, &err, flags); 1037 if (newblock == 0) 1038 goto cleanup; 1039 ablocks[a] = newblock; 1040 } 1041 1042 /* initialize new leaf */ 1043 newblock = ablocks[--a]; 1044 if (unlikely(newblock == 0)) { 1045 EXT4_ERROR_INODE(inode, "newblock == 0!"); 1046 err = -EIO; 1047 goto cleanup; 1048 } 1049 bh = sb_getblk(inode->i_sb, newblock); 1050 if (unlikely(!bh)) { 1051 err = -ENOMEM; 1052 goto cleanup; 1053 } 1054 lock_buffer(bh); 1055 1056 err = ext4_journal_get_create_access(handle, bh); 1057 if (err) 1058 goto cleanup; 1059 1060 neh = ext_block_hdr(bh); 1061 neh->eh_entries = 0; 1062 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); 1063 neh->eh_magic = EXT4_EXT_MAGIC; 1064 neh->eh_depth = 0; 1065 1066 /* move remainder of path[depth] to the new leaf */ 1067 if (unlikely(path[depth].p_hdr->eh_entries != 1068 path[depth].p_hdr->eh_max)) { 1069 EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!", 1070 path[depth].p_hdr->eh_entries, 1071 path[depth].p_hdr->eh_max); 1072 err = -EIO; 1073 goto cleanup; 1074 } 1075 /* start copy from next extent */ 1076 m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++; 1077 ext4_ext_show_move(inode, path, newblock, depth); 1078 if (m) { 1079 struct ext4_extent *ex; 1080 ex = EXT_FIRST_EXTENT(neh); 1081 memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m); 1082 le16_add_cpu(&neh->eh_entries, m); 1083 } 1084 1085 ext4_extent_block_csum_set(inode, neh); 1086 set_buffer_uptodate(bh); 1087 unlock_buffer(bh); 1088 1089 err = ext4_handle_dirty_metadata(handle, inode, bh); 1090 if (err) 1091 goto cleanup; 1092 brelse(bh); 1093 bh = NULL; 1094 1095 /* correct old leaf */ 1096 if (m) { 1097 err = ext4_ext_get_access(handle, inode, path + depth); 1098 if (err) 1099 goto cleanup; 1100 le16_add_cpu(&path[depth].p_hdr->eh_entries, -m); 1101 err = ext4_ext_dirty(handle, inode, path + depth); 1102 if (err) 1103 goto cleanup; 1104 1105 } 1106 1107 /* create intermediate indexes */ 1108 k = depth - at - 1; 1109 if (unlikely(k < 0)) { 1110 EXT4_ERROR_INODE(inode, "k %d < 0!", k); 1111 err = -EIO; 1112 goto cleanup; 1113 } 1114 if (k) 1115 ext_debug("create %d intermediate indices\n", k); 1116 /* insert new index into 
current index block */ 1117 /* current depth stored in i var */ 1118 i = depth - 1; 1119 while (k--) { 1120 oldblock = newblock; 1121 newblock = ablocks[--a]; 1122 bh = sb_getblk(inode->i_sb, newblock); 1123 if (unlikely(!bh)) { 1124 err = -ENOMEM; 1125 goto cleanup; 1126 } 1127 lock_buffer(bh); 1128 1129 err = ext4_journal_get_create_access(handle, bh); 1130 if (err) 1131 goto cleanup; 1132 1133 neh = ext_block_hdr(bh); 1134 neh->eh_entries = cpu_to_le16(1); 1135 neh->eh_magic = EXT4_EXT_MAGIC; 1136 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); 1137 neh->eh_depth = cpu_to_le16(depth - i); 1138 fidx = EXT_FIRST_INDEX(neh); 1139 fidx->ei_block = border; 1140 ext4_idx_store_pblock(fidx, oldblock); 1141 1142 ext_debug("int.index at %d (block %llu): %u -> %llu\n", 1143 i, newblock, le32_to_cpu(border), oldblock); 1144 1145 /* move remainder of path[i] to the new index block */ 1146 if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) != 1147 EXT_LAST_INDEX(path[i].p_hdr))) { 1148 EXT4_ERROR_INODE(inode, 1149 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!", 1150 le32_to_cpu(path[i].p_ext->ee_block)); 1151 err = -EIO; 1152 goto cleanup; 1153 } 1154 /* start copy indexes */ 1155 m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++; 1156 ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, 1157 EXT_MAX_INDEX(path[i].p_hdr)); 1158 ext4_ext_show_move(inode, path, newblock, i); 1159 if (m) { 1160 memmove(++fidx, path[i].p_idx, 1161 sizeof(struct ext4_extent_idx) * m); 1162 le16_add_cpu(&neh->eh_entries, m); 1163 } 1164 ext4_extent_block_csum_set(inode, neh); 1165 set_buffer_uptodate(bh); 1166 unlock_buffer(bh); 1167 1168 err = ext4_handle_dirty_metadata(handle, inode, bh); 1169 if (err) 1170 goto cleanup; 1171 brelse(bh); 1172 bh = NULL; 1173 1174 /* correct old index */ 1175 if (m) { 1176 err = ext4_ext_get_access(handle, inode, path + i); 1177 if (err) 1178 goto cleanup; 1179 le16_add_cpu(&path[i].p_hdr->eh_entries, -m); 1180 err = ext4_ext_dirty(handle, inode, path + i); 1181 if (err) 1182 goto cleanup; 1183 } 1184 1185 i--; 1186 } 1187 1188 /* insert new index */ 1189 err = ext4_ext_insert_index(handle, inode, path + at, 1190 le32_to_cpu(border), newblock); 1191 1192 cleanup: 1193 if (bh) { 1194 if (buffer_locked(bh)) 1195 unlock_buffer(bh); 1196 brelse(bh); 1197 } 1198 1199 if (err) { 1200 /* free all allocated blocks in error case */ 1201 for (i = 0; i < depth; i++) { 1202 if (!ablocks[i]) 1203 continue; 1204 ext4_free_blocks(handle, inode, NULL, ablocks[i], 1, 1205 EXT4_FREE_BLOCKS_METADATA); 1206 } 1207 } 1208 kfree(ablocks); 1209 1210 return err; 1211 } 1212 1213 /* 1214 * ext4_ext_grow_indepth: 1215 * implements tree growing procedure: 1216 * - allocates new block 1217 * - moves top-level data (index block or leaf) into the new block 1218 * - initializes new top-level, creating index that points to the 1219 * just created block 1220 */ 1221 static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, 1222 unsigned int flags, 1223 struct ext4_extent *newext) 1224 { 1225 struct ext4_extent_header *neh; 1226 struct buffer_head *bh; 1227 ext4_fsblk_t newblock; 1228 int err = 0; 1229 1230 newblock = ext4_ext_new_meta_block(handle, inode, NULL, 1231 newext, &err, flags); 1232 if (newblock == 0) 1233 return err; 1234 1235 bh = sb_getblk(inode->i_sb, newblock); 1236 if (unlikely(!bh)) 1237 return -ENOMEM; 1238 lock_buffer(bh); 1239 1240 err = ext4_journal_get_create_access(handle, bh); 1241 if (err) { 1242 unlock_buffer(bh); 1243 goto out; 1244 } 1245 1246 /* move top-level index/leaf 
into new block */ 1247 memmove(bh->b_data, EXT4_I(inode)->i_data, 1248 sizeof(EXT4_I(inode)->i_data)); 1249 1250 /* set size of new block */ 1251 neh = ext_block_hdr(bh); 1252 /* old root could have indexes or leaves 1253 * so calculate e_max right way */ 1254 if (ext_depth(inode)) 1255 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); 1256 else 1257 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); 1258 neh->eh_magic = EXT4_EXT_MAGIC; 1259 ext4_extent_block_csum_set(inode, neh); 1260 set_buffer_uptodate(bh); 1261 unlock_buffer(bh); 1262 1263 err = ext4_handle_dirty_metadata(handle, inode, bh); 1264 if (err) 1265 goto out; 1266 1267 /* Update top-level index: num,max,pointer */ 1268 neh = ext_inode_hdr(inode); 1269 neh->eh_entries = cpu_to_le16(1); 1270 ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock); 1271 if (neh->eh_depth == 0) { 1272 /* Root extent block becomes index block */ 1273 neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0)); 1274 EXT_FIRST_INDEX(neh)->ei_block = 1275 EXT_FIRST_EXTENT(neh)->ee_block; 1276 } 1277 ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n", 1278 le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), 1279 le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), 1280 ext4_idx_pblock(EXT_FIRST_INDEX(neh))); 1281 1282 le16_add_cpu(&neh->eh_depth, 1); 1283 ext4_mark_inode_dirty(handle, inode); 1284 out: 1285 brelse(bh); 1286 1287 return err; 1288 } 1289 1290 /* 1291 * ext4_ext_create_new_leaf: 1292 * finds empty index and adds new leaf. 1293 * if no free index is found, then it requests in-depth growing. 1294 */ 1295 static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, 1296 unsigned int mb_flags, 1297 unsigned int gb_flags, 1298 struct ext4_ext_path *path, 1299 struct ext4_extent *newext) 1300 { 1301 struct ext4_ext_path *curp; 1302 int depth, i, err = 0; 1303 1304 repeat: 1305 i = depth = ext_depth(inode); 1306 1307 /* walk up to the tree and look for free index entry */ 1308 curp = path + depth; 1309 while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { 1310 i--; 1311 curp--; 1312 } 1313 1314 /* we use already allocated block for index block, 1315 * so subsequent data blocks should be contiguous */ 1316 if (EXT_HAS_FREE_INDEX(curp)) { 1317 /* if we found index with free entry, then use that 1318 * entry: create all needed subtree and add new leaf */ 1319 err = ext4_ext_split(handle, inode, mb_flags, path, newext, i); 1320 if (err) 1321 goto out; 1322 1323 /* refill path */ 1324 ext4_ext_drop_refs(path); 1325 path = ext4_ext_find_extent(inode, 1326 (ext4_lblk_t)le32_to_cpu(newext->ee_block), 1327 path, gb_flags); 1328 if (IS_ERR(path)) 1329 err = PTR_ERR(path); 1330 } else { 1331 /* tree is full, time to grow in depth */ 1332 err = ext4_ext_grow_indepth(handle, inode, mb_flags, newext); 1333 if (err) 1334 goto out; 1335 1336 /* refill path */ 1337 ext4_ext_drop_refs(path); 1338 path = ext4_ext_find_extent(inode, 1339 (ext4_lblk_t)le32_to_cpu(newext->ee_block), 1340 path, gb_flags); 1341 if (IS_ERR(path)) { 1342 err = PTR_ERR(path); 1343 goto out; 1344 } 1345 1346 /* 1347 * only first (depth 0 -> 1) produces free space; 1348 * in all other cases we have to split the grown tree 1349 */ 1350 depth = ext_depth(inode); 1351 if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { 1352 /* now we need to split */ 1353 goto repeat; 1354 } 1355 } 1356 1357 out: 1358 return err; 1359 } 1360 1361 /* 1362 * search the closest allocated block to the left for *logical 1363 * and returns it at @logical + it's physical 
address at @phys 1364 * if *logical is the smallest allocated block, the function 1365 * returns 0 at @phys 1366 * return value contains 0 (success) or error code 1367 */ 1368 static int ext4_ext_search_left(struct inode *inode, 1369 struct ext4_ext_path *path, 1370 ext4_lblk_t *logical, ext4_fsblk_t *phys) 1371 { 1372 struct ext4_extent_idx *ix; 1373 struct ext4_extent *ex; 1374 int depth, ee_len; 1375 1376 if (unlikely(path == NULL)) { 1377 EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); 1378 return -EIO; 1379 } 1380 depth = path->p_depth; 1381 *phys = 0; 1382 1383 if (depth == 0 && path->p_ext == NULL) 1384 return 0; 1385 1386 /* usually extent in the path covers blocks smaller 1387 * then *logical, but it can be that extent is the 1388 * first one in the file */ 1389 1390 ex = path[depth].p_ext; 1391 ee_len = ext4_ext_get_actual_len(ex); 1392 if (*logical < le32_to_cpu(ex->ee_block)) { 1393 if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { 1394 EXT4_ERROR_INODE(inode, 1395 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!", 1396 *logical, le32_to_cpu(ex->ee_block)); 1397 return -EIO; 1398 } 1399 while (--depth >= 0) { 1400 ix = path[depth].p_idx; 1401 if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { 1402 EXT4_ERROR_INODE(inode, 1403 "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!", 1404 ix != NULL ? le32_to_cpu(ix->ei_block) : 0, 1405 EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ? 1406 le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0, 1407 depth); 1408 return -EIO; 1409 } 1410 } 1411 return 0; 1412 } 1413 1414 if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { 1415 EXT4_ERROR_INODE(inode, 1416 "logical %d < ee_block %d + ee_len %d!", 1417 *logical, le32_to_cpu(ex->ee_block), ee_len); 1418 return -EIO; 1419 } 1420 1421 *logical = le32_to_cpu(ex->ee_block) + ee_len - 1; 1422 *phys = ext4_ext_pblock(ex) + ee_len - 1; 1423 return 0; 1424 } 1425 1426 /* 1427 * search the closest allocated block to the right for *logical 1428 * and returns it at @logical + it's physical address at @phys 1429 * if *logical is the largest allocated block, the function 1430 * returns 0 at @phys 1431 * return value contains 0 (success) or error code 1432 */ 1433 static int ext4_ext_search_right(struct inode *inode, 1434 struct ext4_ext_path *path, 1435 ext4_lblk_t *logical, ext4_fsblk_t *phys, 1436 struct ext4_extent **ret_ex) 1437 { 1438 struct buffer_head *bh = NULL; 1439 struct ext4_extent_header *eh; 1440 struct ext4_extent_idx *ix; 1441 struct ext4_extent *ex; 1442 ext4_fsblk_t block; 1443 int depth; /* Note, NOT eh_depth; depth from top of tree */ 1444 int ee_len; 1445 1446 if (unlikely(path == NULL)) { 1447 EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); 1448 return -EIO; 1449 } 1450 depth = path->p_depth; 1451 *phys = 0; 1452 1453 if (depth == 0 && path->p_ext == NULL) 1454 return 0; 1455 1456 /* usually extent in the path covers blocks smaller 1457 * then *logical, but it can be that extent is the 1458 * first one in the file */ 1459 1460 ex = path[depth].p_ext; 1461 ee_len = ext4_ext_get_actual_len(ex); 1462 if (*logical < le32_to_cpu(ex->ee_block)) { 1463 if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { 1464 EXT4_ERROR_INODE(inode, 1465 "first_extent(path[%d].p_hdr) != ex", 1466 depth); 1467 return -EIO; 1468 } 1469 while (--depth >= 0) { 1470 ix = path[depth].p_idx; 1471 if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { 1472 EXT4_ERROR_INODE(inode, 1473 "ix != EXT_FIRST_INDEX *logical %d!", 1474 *logical); 1475 
return -EIO; 1476 } 1477 } 1478 goto found_extent; 1479 } 1480 1481 if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { 1482 EXT4_ERROR_INODE(inode, 1483 "logical %d < ee_block %d + ee_len %d!", 1484 *logical, le32_to_cpu(ex->ee_block), ee_len); 1485 return -EIO; 1486 } 1487 1488 if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { 1489 /* next allocated block in this leaf */ 1490 ex++; 1491 goto found_extent; 1492 } 1493 1494 /* go up and search for index to the right */ 1495 while (--depth >= 0) { 1496 ix = path[depth].p_idx; 1497 if (ix != EXT_LAST_INDEX(path[depth].p_hdr)) 1498 goto got_index; 1499 } 1500 1501 /* we've gone up to the root and found no index to the right */ 1502 return 0; 1503 1504 got_index: 1505 /* we've found index to the right, let's 1506 * follow it and find the closest allocated 1507 * block to the right */ 1508 ix++; 1509 block = ext4_idx_pblock(ix); 1510 while (++depth < path->p_depth) { 1511 /* subtract from p_depth to get proper eh_depth */ 1512 bh = read_extent_tree_block(inode, block, 1513 path->p_depth - depth, 0); 1514 if (IS_ERR(bh)) 1515 return PTR_ERR(bh); 1516 eh = ext_block_hdr(bh); 1517 ix = EXT_FIRST_INDEX(eh); 1518 block = ext4_idx_pblock(ix); 1519 put_bh(bh); 1520 } 1521 1522 bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0); 1523 if (IS_ERR(bh)) 1524 return PTR_ERR(bh); 1525 eh = ext_block_hdr(bh); 1526 ex = EXT_FIRST_EXTENT(eh); 1527 found_extent: 1528 *logical = le32_to_cpu(ex->ee_block); 1529 *phys = ext4_ext_pblock(ex); 1530 *ret_ex = ex; 1531 if (bh) 1532 put_bh(bh); 1533 return 0; 1534 } 1535 1536 /* 1537 * ext4_ext_next_allocated_block: 1538 * returns allocated block in subsequent extent or EXT_MAX_BLOCKS. 1539 * NOTE: it considers block number from index entry as 1540 * allocated block. Thus, index entries have to be consistent 1541 * with leaves. 1542 */ 1543 static ext4_lblk_t 1544 ext4_ext_next_allocated_block(struct ext4_ext_path *path) 1545 { 1546 int depth; 1547 1548 BUG_ON(path == NULL); 1549 depth = path->p_depth; 1550 1551 if (depth == 0 && path->p_ext == NULL) 1552 return EXT_MAX_BLOCKS; 1553 1554 while (depth >= 0) { 1555 if (depth == path->p_depth) { 1556 /* leaf */ 1557 if (path[depth].p_ext && 1558 path[depth].p_ext != 1559 EXT_LAST_EXTENT(path[depth].p_hdr)) 1560 return le32_to_cpu(path[depth].p_ext[1].ee_block); 1561 } else { 1562 /* index */ 1563 if (path[depth].p_idx != 1564 EXT_LAST_INDEX(path[depth].p_hdr)) 1565 return le32_to_cpu(path[depth].p_idx[1].ei_block); 1566 } 1567 depth--; 1568 } 1569 1570 return EXT_MAX_BLOCKS; 1571 } 1572 1573 /* 1574 * ext4_ext_next_leaf_block: 1575 * returns first allocated block from next leaf or EXT_MAX_BLOCKS 1576 */ 1577 static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path) 1578 { 1579 int depth; 1580 1581 BUG_ON(path == NULL); 1582 depth = path->p_depth; 1583 1584 /* zero-tree has no leaf blocks at all */ 1585 if (depth == 0) 1586 return EXT_MAX_BLOCKS; 1587 1588 /* go to index block */ 1589 depth--; 1590 1591 while (depth >= 0) { 1592 if (path[depth].p_idx != 1593 EXT_LAST_INDEX(path[depth].p_hdr)) 1594 return (ext4_lblk_t) 1595 le32_to_cpu(path[depth].p_idx[1].ei_block); 1596 depth--; 1597 } 1598 1599 return EXT_MAX_BLOCKS; 1600 } 1601 1602 /* 1603 * ext4_ext_correct_indexes: 1604 * if leaf gets modified and modified extent is first in the leaf, 1605 * then we have to correct all indexes above. 1606 * TODO: do we need to correct tree in all cases? 
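 *
 * Illustrative example (not from the original comment): if the leftmost
 * extent of a leaf now starts at logical block 500 while the parent index
 * entry still says 200, that index entry must be rewritten to 500, and the
 * same fix is propagated upward for as long as we stay on the leftmost
 * index at each level.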
1607 */ 1608 static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, 1609 struct ext4_ext_path *path) 1610 { 1611 struct ext4_extent_header *eh; 1612 int depth = ext_depth(inode); 1613 struct ext4_extent *ex; 1614 __le32 border; 1615 int k, err = 0; 1616 1617 eh = path[depth].p_hdr; 1618 ex = path[depth].p_ext; 1619 1620 if (unlikely(ex == NULL || eh == NULL)) { 1621 EXT4_ERROR_INODE(inode, 1622 "ex %p == NULL or eh %p == NULL", ex, eh); 1623 return -EIO; 1624 } 1625 1626 if (depth == 0) { 1627 /* there is no tree at all */ 1628 return 0; 1629 } 1630 1631 if (ex != EXT_FIRST_EXTENT(eh)) { 1632 /* we correct tree if first leaf got modified only */ 1633 return 0; 1634 } 1635 1636 /* 1637 * TODO: we need correction if border is smaller than current one 1638 */ 1639 k = depth - 1; 1640 border = path[depth].p_ext->ee_block; 1641 err = ext4_ext_get_access(handle, inode, path + k); 1642 if (err) 1643 return err; 1644 path[k].p_idx->ei_block = border; 1645 err = ext4_ext_dirty(handle, inode, path + k); 1646 if (err) 1647 return err; 1648 1649 while (k--) { 1650 /* change all left-side indexes */ 1651 if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) 1652 break; 1653 err = ext4_ext_get_access(handle, inode, path + k); 1654 if (err) 1655 break; 1656 path[k].p_idx->ei_block = border; 1657 err = ext4_ext_dirty(handle, inode, path + k); 1658 if (err) 1659 break; 1660 } 1661 1662 return err; 1663 } 1664 1665 int 1666 ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, 1667 struct ext4_extent *ex2) 1668 { 1669 unsigned short ext1_ee_len, ext2_ee_len; 1670 1671 /* 1672 * Make sure that both extents are initialized. We don't merge 1673 * uninitialized extents so that we can be sure that end_io code has 1674 * the extent that was written properly split out and conversion to 1675 * initialized is trivial. 1676 */ 1677 if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2)) 1678 return 0; 1679 1680 ext1_ee_len = ext4_ext_get_actual_len(ex1); 1681 ext2_ee_len = ext4_ext_get_actual_len(ex2); 1682 1683 if (le32_to_cpu(ex1->ee_block) + ext1_ee_len != 1684 le32_to_cpu(ex2->ee_block)) 1685 return 0; 1686 1687 /* 1688 * To allow future support for preallocated extents to be added 1689 * as an RO_COMPAT feature, refuse to merge to extents if 1690 * this can result in the top bit of ee_len being set. 1691 */ 1692 if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) 1693 return 0; 1694 #ifdef AGGRESSIVE_TEST 1695 if (ext1_ee_len >= 4) 1696 return 0; 1697 #endif 1698 1699 if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2)) 1700 return 1; 1701 return 0; 1702 } 1703 1704 /* 1705 * This function tries to merge the "ex" extent to the next extent in the tree. 1706 * It always tries to merge towards right. If you want to merge towards 1707 * left, pass "ex - 1" as argument instead of "ex". 1708 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns 1709 * 1 if they got merged. 1710 */ 1711 static int ext4_ext_try_to_merge_right(struct inode *inode, 1712 struct ext4_ext_path *path, 1713 struct ext4_extent *ex) 1714 { 1715 struct ext4_extent_header *eh; 1716 unsigned int depth, len; 1717 int merge_done = 0; 1718 1719 depth = ext_depth(inode); 1720 BUG_ON(path[depth].p_hdr == NULL); 1721 eh = path[depth].p_hdr; 1722 1723 while (ex < EXT_LAST_EXTENT(eh)) { 1724 if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) 1725 break; 1726 /* merge with next extent! 
*/ 1727 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) 1728 + ext4_ext_get_actual_len(ex + 1)); 1729 1730 if (ex + 1 < EXT_LAST_EXTENT(eh)) { 1731 len = (EXT_LAST_EXTENT(eh) - ex - 1) 1732 * sizeof(struct ext4_extent); 1733 memmove(ex + 1, ex + 2, len); 1734 } 1735 le16_add_cpu(&eh->eh_entries, -1); 1736 merge_done = 1; 1737 WARN_ON(eh->eh_entries == 0); 1738 if (!eh->eh_entries) 1739 EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!"); 1740 } 1741 1742 return merge_done; 1743 } 1744 1745 /* 1746 * This function does a very simple check to see if we can collapse 1747 * an extent tree with a single extent tree leaf block into the inode. 1748 */ 1749 static void ext4_ext_try_to_merge_up(handle_t *handle, 1750 struct inode *inode, 1751 struct ext4_ext_path *path) 1752 { 1753 size_t s; 1754 unsigned max_root = ext4_ext_space_root(inode, 0); 1755 ext4_fsblk_t blk; 1756 1757 if ((path[0].p_depth != 1) || 1758 (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) || 1759 (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root)) 1760 return; 1761 1762 /* 1763 * We need to modify the block allocation bitmap and the block 1764 * group descriptor to release the extent tree block. If we 1765 * can't get the journal credits, give up. 1766 */ 1767 if (ext4_journal_extend(handle, 2)) 1768 return; 1769 1770 /* 1771 * Copy the extent data up to the inode 1772 */ 1773 blk = ext4_idx_pblock(path[0].p_idx); 1774 s = le16_to_cpu(path[1].p_hdr->eh_entries) * 1775 sizeof(struct ext4_extent_idx); 1776 s += sizeof(struct ext4_extent_header); 1777 1778 memcpy(path[0].p_hdr, path[1].p_hdr, s); 1779 path[0].p_depth = 0; 1780 path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) + 1781 (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr)); 1782 path[0].p_hdr->eh_max = cpu_to_le16(max_root); 1783 1784 brelse(path[1].p_bh); 1785 ext4_free_blocks(handle, inode, NULL, blk, 1, 1786 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET | 1787 EXT4_FREE_BLOCKS_RESERVE); 1788 } 1789 1790 /* 1791 * This function tries to merge the @ex extent to neighbours in the tree. 1792 * return 1 if merge left else 0. 1793 */ 1794 static void ext4_ext_try_to_merge(handle_t *handle, 1795 struct inode *inode, 1796 struct ext4_ext_path *path, 1797 struct ext4_extent *ex) { 1798 struct ext4_extent_header *eh; 1799 unsigned int depth; 1800 int merge_done = 0; 1801 1802 depth = ext_depth(inode); 1803 BUG_ON(path[depth].p_hdr == NULL); 1804 eh = path[depth].p_hdr; 1805 1806 if (ex > EXT_FIRST_EXTENT(eh)) 1807 merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1); 1808 1809 if (!merge_done) 1810 (void) ext4_ext_try_to_merge_right(inode, path, ex); 1811 1812 ext4_ext_try_to_merge_up(handle, inode, path); 1813 } 1814 1815 /* 1816 * check if a portion of the "newext" extent overlaps with an 1817 * existing extent. 1818 * 1819 * If there is an overlap discovered, it updates the length of the newext 1820 * such that there will be no overlap, and then returns 1. 1821 * If there is no overlap found, it returns 0. 
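 *
 * For example (illustrative, assuming a cluster ratio of 1): if newext
 * covers logical blocks 100..149 and an existing extent starts at block
 * 120, newext->ee_len is trimmed to 20 so that the insertion covers only
 * blocks 100..119.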
 */
static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
					   struct inode *inode,
					   struct ext4_extent *newext,
					   struct ext4_ext_path *path)
{
	ext4_lblk_t b1, b2;
	unsigned int depth, len1;
	unsigned int ret = 0;

	b1 = le32_to_cpu(newext->ee_block);
	len1 = ext4_ext_get_actual_len(newext);
	depth = ext_depth(inode);
	if (!path[depth].p_ext)
		goto out;
	b2 = le32_to_cpu(path[depth].p_ext->ee_block);
	b2 &= ~(sbi->s_cluster_ratio - 1);

	/*
	 * get the next allocated block if the extent in the path
	 * is before the requested block(s)
	 */
	if (b2 < b1) {
		b2 = ext4_ext_next_allocated_block(path);
		if (b2 == EXT_MAX_BLOCKS)
			goto out;
		b2 &= ~(sbi->s_cluster_ratio - 1);
	}

	/* check for wrap through zero on extent logical start block*/
	if (b1 + len1 < b1) {
		len1 = EXT_MAX_BLOCKS - b1;
		newext->ee_len = cpu_to_le16(len1);
		ret = 1;
	}

	/* check for overlap */
	if (b1 + len1 > b2) {
		newext->ee_len = cpu_to_le16(b2 - b1);
		ret = 1;
	}
out:
	return ret;
}

/*
 * ext4_ext_insert_extent:
 * tries to merge requested extent into the existing extent or
 * inserts requested extent as new one into the tree,
 * creating new leaf in the no-space case.
 */
int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path,
				struct ext4_extent *newext, int gb_flags)
{
	struct ext4_extent_header *eh;
	struct ext4_extent *ex, *fex;
	struct ext4_extent *nearex; /* nearest extent */
	struct ext4_ext_path *npath = NULL;
	int depth, len, err;
	ext4_lblk_t next;
	int mb_flags = 0;

	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
		return -EIO;
	}
	depth = ext_depth(inode);
	ex = path[depth].p_ext;
	eh = path[depth].p_hdr;
	if (unlikely(path[depth].p_hdr == NULL)) {
		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
		return -EIO;
	}

	/* try to insert block into found extent and return */
	if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) {

		/*
		 * Try to see whether we should rather test the extent on
		 * right from ex, or from the left of ex. This is because
		 * ext4_ext_find_extent() can return either extent on the
		 * left, or on the right from the searched position. This
		 * will make merging more effective.
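		 *
		 * For example (illustrative): if ex covers logical blocks
		 * 100..107 and newext starts at block 200, the extent to
		 * the right of ex (if any) is the better candidate, so ex
		 * is advanced before the append/prepend attempts below.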
1906 */ 1907 if (ex < EXT_LAST_EXTENT(eh) && 1908 (le32_to_cpu(ex->ee_block) + 1909 ext4_ext_get_actual_len(ex) < 1910 le32_to_cpu(newext->ee_block))) { 1911 ex += 1; 1912 goto prepend; 1913 } else if ((ex > EXT_FIRST_EXTENT(eh)) && 1914 (le32_to_cpu(newext->ee_block) + 1915 ext4_ext_get_actual_len(newext) < 1916 le32_to_cpu(ex->ee_block))) 1917 ex -= 1; 1918 1919 /* Try to append newex to the ex */ 1920 if (ext4_can_extents_be_merged(inode, ex, newext)) { 1921 ext_debug("append [%d]%d block to %u:[%d]%d" 1922 "(from %llu)\n", 1923 ext4_ext_is_uninitialized(newext), 1924 ext4_ext_get_actual_len(newext), 1925 le32_to_cpu(ex->ee_block), 1926 ext4_ext_is_uninitialized(ex), 1927 ext4_ext_get_actual_len(ex), 1928 ext4_ext_pblock(ex)); 1929 err = ext4_ext_get_access(handle, inode, 1930 path + depth); 1931 if (err) 1932 return err; 1933 1934 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) 1935 + ext4_ext_get_actual_len(newext)); 1936 eh = path[depth].p_hdr; 1937 nearex = ex; 1938 goto merge; 1939 } 1940 1941 prepend: 1942 /* Try to prepend newex to the ex */ 1943 if (ext4_can_extents_be_merged(inode, newext, ex)) { 1944 ext_debug("prepend %u[%d]%d block to %u:[%d]%d" 1945 "(from %llu)\n", 1946 le32_to_cpu(newext->ee_block), 1947 ext4_ext_is_uninitialized(newext), 1948 ext4_ext_get_actual_len(newext), 1949 le32_to_cpu(ex->ee_block), 1950 ext4_ext_is_uninitialized(ex), 1951 ext4_ext_get_actual_len(ex), 1952 ext4_ext_pblock(ex)); 1953 err = ext4_ext_get_access(handle, inode, 1954 path + depth); 1955 if (err) 1956 return err; 1957 1958 ex->ee_block = newext->ee_block; 1959 ext4_ext_store_pblock(ex, ext4_ext_pblock(newext)); 1960 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) 1961 + ext4_ext_get_actual_len(newext)); 1962 eh = path[depth].p_hdr; 1963 nearex = ex; 1964 goto merge; 1965 } 1966 } 1967 1968 depth = ext_depth(inode); 1969 eh = path[depth].p_hdr; 1970 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) 1971 goto has_space; 1972 1973 /* probably next leaf has space for us? */ 1974 fex = EXT_LAST_EXTENT(eh); 1975 next = EXT_MAX_BLOCKS; 1976 if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)) 1977 next = ext4_ext_next_leaf_block(path); 1978 if (next != EXT_MAX_BLOCKS) { 1979 ext_debug("next leaf block - %u\n", next); 1980 BUG_ON(npath != NULL); 1981 npath = ext4_ext_find_extent(inode, next, NULL, 0); 1982 if (IS_ERR(npath)) 1983 return PTR_ERR(npath); 1984 BUG_ON(npath->p_depth != path->p_depth); 1985 eh = npath[depth].p_hdr; 1986 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) { 1987 ext_debug("next leaf isn't full(%d)\n", 1988 le16_to_cpu(eh->eh_entries)); 1989 path = npath; 1990 goto has_space; 1991 } 1992 ext_debug("next leaf has no free space(%d,%d)\n", 1993 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); 1994 } 1995 1996 /* 1997 * There is no free space in the found leaf. 1998 * We're gonna add a new leaf in the tree. 
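	 * (Illustrative note, not from the original comment: the helper
	 * called below first walks up the path looking for an index block
	 * with a free slot and splits the tree from there; only when every
	 * level is full does it grow the tree by one level in depth.)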
1999 */ 2000 if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) 2001 mb_flags = EXT4_MB_USE_RESERVED; 2002 err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags, 2003 path, newext); 2004 if (err) 2005 goto cleanup; 2006 depth = ext_depth(inode); 2007 eh = path[depth].p_hdr; 2008 2009 has_space: 2010 nearex = path[depth].p_ext; 2011 2012 err = ext4_ext_get_access(handle, inode, path + depth); 2013 if (err) 2014 goto cleanup; 2015 2016 if (!nearex) { 2017 /* there is no extent in this leaf, create first one */ 2018 ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n", 2019 le32_to_cpu(newext->ee_block), 2020 ext4_ext_pblock(newext), 2021 ext4_ext_is_uninitialized(newext), 2022 ext4_ext_get_actual_len(newext)); 2023 nearex = EXT_FIRST_EXTENT(eh); 2024 } else { 2025 if (le32_to_cpu(newext->ee_block) 2026 > le32_to_cpu(nearex->ee_block)) { 2027 /* Insert after */ 2028 ext_debug("insert %u:%llu:[%d]%d before: " 2029 "nearest %p\n", 2030 le32_to_cpu(newext->ee_block), 2031 ext4_ext_pblock(newext), 2032 ext4_ext_is_uninitialized(newext), 2033 ext4_ext_get_actual_len(newext), 2034 nearex); 2035 nearex++; 2036 } else { 2037 /* Insert before */ 2038 BUG_ON(newext->ee_block == nearex->ee_block); 2039 ext_debug("insert %u:%llu:[%d]%d after: " 2040 "nearest %p\n", 2041 le32_to_cpu(newext->ee_block), 2042 ext4_ext_pblock(newext), 2043 ext4_ext_is_uninitialized(newext), 2044 ext4_ext_get_actual_len(newext), 2045 nearex); 2046 } 2047 len = EXT_LAST_EXTENT(eh) - nearex + 1; 2048 if (len > 0) { 2049 ext_debug("insert %u:%llu:[%d]%d: " 2050 "move %d extents from 0x%p to 0x%p\n", 2051 le32_to_cpu(newext->ee_block), 2052 ext4_ext_pblock(newext), 2053 ext4_ext_is_uninitialized(newext), 2054 ext4_ext_get_actual_len(newext), 2055 len, nearex, nearex + 1); 2056 memmove(nearex + 1, nearex, 2057 len * sizeof(struct ext4_extent)); 2058 } 2059 } 2060 2061 le16_add_cpu(&eh->eh_entries, 1); 2062 path[depth].p_ext = nearex; 2063 nearex->ee_block = newext->ee_block; 2064 ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext)); 2065 nearex->ee_len = newext->ee_len; 2066 2067 merge: 2068 /* try to merge extents */ 2069 if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) 2070 ext4_ext_try_to_merge(handle, inode, path, nearex); 2071 2072 2073 /* time to correct all indexes above */ 2074 err = ext4_ext_correct_indexes(handle, inode, path); 2075 if (err) 2076 goto cleanup; 2077 2078 err = ext4_ext_dirty(handle, inode, path + path->p_depth); 2079 2080 cleanup: 2081 if (npath) { 2082 ext4_ext_drop_refs(npath); 2083 kfree(npath); 2084 } 2085 return err; 2086 } 2087 2088 static int ext4_fill_fiemap_extents(struct inode *inode, 2089 ext4_lblk_t block, ext4_lblk_t num, 2090 struct fiemap_extent_info *fieinfo) 2091 { 2092 struct ext4_ext_path *path = NULL; 2093 struct ext4_extent *ex; 2094 struct extent_status es; 2095 ext4_lblk_t next, next_del, start = 0, end = 0; 2096 ext4_lblk_t last = block + num; 2097 int exists, depth = 0, err = 0; 2098 unsigned int flags = 0; 2099 unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; 2100 2101 while (block < last && block != EXT_MAX_BLOCKS) { 2102 num = last - block; 2103 /* find extent for this block */ 2104 down_read(&EXT4_I(inode)->i_data_sem); 2105 2106 if (path && ext_depth(inode) != depth) { 2107 /* depth was changed. 
we have to realloc path */ 2108 kfree(path); 2109 path = NULL; 2110 } 2111 2112 path = ext4_ext_find_extent(inode, block, path, 0); 2113 if (IS_ERR(path)) { 2114 up_read(&EXT4_I(inode)->i_data_sem); 2115 err = PTR_ERR(path); 2116 path = NULL; 2117 break; 2118 } 2119 2120 depth = ext_depth(inode); 2121 if (unlikely(path[depth].p_hdr == NULL)) { 2122 up_read(&EXT4_I(inode)->i_data_sem); 2123 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); 2124 err = -EIO; 2125 break; 2126 } 2127 ex = path[depth].p_ext; 2128 next = ext4_ext_next_allocated_block(path); 2129 ext4_ext_drop_refs(path); 2130 2131 flags = 0; 2132 exists = 0; 2133 if (!ex) { 2134 /* there is no extent yet, so try to allocate 2135 * all requested space */ 2136 start = block; 2137 end = block + num; 2138 } else if (le32_to_cpu(ex->ee_block) > block) { 2139 /* need to allocate space before found extent */ 2140 start = block; 2141 end = le32_to_cpu(ex->ee_block); 2142 if (block + num < end) 2143 end = block + num; 2144 } else if (block >= le32_to_cpu(ex->ee_block) 2145 + ext4_ext_get_actual_len(ex)) { 2146 /* need to allocate space after found extent */ 2147 start = block; 2148 end = block + num; 2149 if (end >= next) 2150 end = next; 2151 } else if (block >= le32_to_cpu(ex->ee_block)) { 2152 /* 2153 * some part of requested space is covered 2154 * by found extent 2155 */ 2156 start = block; 2157 end = le32_to_cpu(ex->ee_block) 2158 + ext4_ext_get_actual_len(ex); 2159 if (block + num < end) 2160 end = block + num; 2161 exists = 1; 2162 } else { 2163 BUG(); 2164 } 2165 BUG_ON(end <= start); 2166 2167 if (!exists) { 2168 es.es_lblk = start; 2169 es.es_len = end - start; 2170 es.es_pblk = 0; 2171 } else { 2172 es.es_lblk = le32_to_cpu(ex->ee_block); 2173 es.es_len = ext4_ext_get_actual_len(ex); 2174 es.es_pblk = ext4_ext_pblock(ex); 2175 if (ext4_ext_is_uninitialized(ex)) 2176 flags |= FIEMAP_EXTENT_UNWRITTEN; 2177 } 2178 2179 /* 2180 * Find delayed extent and update es accordingly. We call 2181 * it even in !exists case to find out whether es is the 2182 * last existing extent or not. 2183 */ 2184 next_del = ext4_find_delayed_extent(inode, &es); 2185 if (!exists && next_del) { 2186 exists = 1; 2187 flags |= (FIEMAP_EXTENT_DELALLOC | 2188 FIEMAP_EXTENT_UNKNOWN); 2189 } 2190 up_read(&EXT4_I(inode)->i_data_sem); 2191 2192 if (unlikely(es.es_len == 0)) { 2193 EXT4_ERROR_INODE(inode, "es.es_len == 0"); 2194 err = -EIO; 2195 break; 2196 } 2197 2198 /* 2199 * This is possible iff next == next_del == EXT_MAX_BLOCKS. 2200 * we need to check next == EXT_MAX_BLOCKS because it is 2201 * possible that an extent is with unwritten and delayed 2202 * status due to when an extent is delayed allocated and 2203 * is allocated by fallocate status tree will track both of 2204 * them in a extent. 2205 * 2206 * So we could return a unwritten and delayed extent, and 2207 * its block is equal to 'next'. 
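 *
 * (As an example of how that can happen: a range first dirtied through the
 *  delayed-allocation write path and then preallocated with fallocate() can
 *  end up as a single extent in the status tree that carries both the
 *  delayed and the unwritten state.)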
2208 */ 2209 if (next == next_del && next == EXT_MAX_BLOCKS) { 2210 flags |= FIEMAP_EXTENT_LAST; 2211 if (unlikely(next_del != EXT_MAX_BLOCKS || 2212 next != EXT_MAX_BLOCKS)) { 2213 EXT4_ERROR_INODE(inode, 2214 "next extent == %u, next " 2215 "delalloc extent = %u", 2216 next, next_del); 2217 err = -EIO; 2218 break; 2219 } 2220 } 2221 2222 if (exists) { 2223 err = fiemap_fill_next_extent(fieinfo, 2224 (__u64)es.es_lblk << blksize_bits, 2225 (__u64)es.es_pblk << blksize_bits, 2226 (__u64)es.es_len << blksize_bits, 2227 flags); 2228 if (err < 0) 2229 break; 2230 if (err == 1) { 2231 err = 0; 2232 break; 2233 } 2234 } 2235 2236 block = es.es_lblk + es.es_len; 2237 } 2238 2239 if (path) { 2240 ext4_ext_drop_refs(path); 2241 kfree(path); 2242 } 2243 2244 return err; 2245 } 2246 2247 /* 2248 * ext4_ext_put_gap_in_cache: 2249 * calculate boundaries of the gap that the requested block fits into 2250 * and cache this gap 2251 */ 2252 static void 2253 ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, 2254 ext4_lblk_t block) 2255 { 2256 int depth = ext_depth(inode); 2257 unsigned long len = 0; 2258 ext4_lblk_t lblock = 0; 2259 struct ext4_extent *ex; 2260 2261 ex = path[depth].p_ext; 2262 if (ex == NULL) { 2263 /* 2264 * there is no extent yet, so gap is [0;-] and we 2265 * don't cache it 2266 */ 2267 ext_debug("cache gap(whole file):"); 2268 } else if (block < le32_to_cpu(ex->ee_block)) { 2269 lblock = block; 2270 len = le32_to_cpu(ex->ee_block) - block; 2271 ext_debug("cache gap(before): %u [%u:%u]", 2272 block, 2273 le32_to_cpu(ex->ee_block), 2274 ext4_ext_get_actual_len(ex)); 2275 if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1)) 2276 ext4_es_insert_extent(inode, lblock, len, ~0, 2277 EXTENT_STATUS_HOLE); 2278 } else if (block >= le32_to_cpu(ex->ee_block) 2279 + ext4_ext_get_actual_len(ex)) { 2280 ext4_lblk_t next; 2281 lblock = le32_to_cpu(ex->ee_block) 2282 + ext4_ext_get_actual_len(ex); 2283 2284 next = ext4_ext_next_allocated_block(path); 2285 ext_debug("cache gap(after): [%u:%u] %u", 2286 le32_to_cpu(ex->ee_block), 2287 ext4_ext_get_actual_len(ex), 2288 block); 2289 BUG_ON(next == lblock); 2290 len = next - lblock; 2291 if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1)) 2292 ext4_es_insert_extent(inode, lblock, len, ~0, 2293 EXTENT_STATUS_HOLE); 2294 } else { 2295 BUG(); 2296 } 2297 2298 ext_debug(" -> %u:%lu\n", lblock, len); 2299 } 2300 2301 /* 2302 * ext4_ext_rm_idx: 2303 * removes index from the index block. 
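 *
 * In short: the index entry at path[depth-1] is dropped from its index
 * block, the (now empty) child block it referenced is freed, and if the
 * removed entry was the leftmost one in its node, the new first logical
 * block number is copied up into the parent index blocks as far as needed.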
2304 */ 2305 static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, 2306 struct ext4_ext_path *path, int depth) 2307 { 2308 int err; 2309 ext4_fsblk_t leaf; 2310 2311 /* free index block */ 2312 depth--; 2313 path = path + depth; 2314 leaf = ext4_idx_pblock(path->p_idx); 2315 if (unlikely(path->p_hdr->eh_entries == 0)) { 2316 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); 2317 return -EIO; 2318 } 2319 err = ext4_ext_get_access(handle, inode, path); 2320 if (err) 2321 return err; 2322 2323 if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) { 2324 int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx; 2325 len *= sizeof(struct ext4_extent_idx); 2326 memmove(path->p_idx, path->p_idx + 1, len); 2327 } 2328 2329 le16_add_cpu(&path->p_hdr->eh_entries, -1); 2330 err = ext4_ext_dirty(handle, inode, path); 2331 if (err) 2332 return err; 2333 ext_debug("index is empty, remove it, free block %llu\n", leaf); 2334 trace_ext4_ext_rm_idx(inode, leaf); 2335 2336 ext4_free_blocks(handle, inode, NULL, leaf, 1, 2337 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); 2338 2339 while (--depth >= 0) { 2340 if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr)) 2341 break; 2342 path--; 2343 err = ext4_ext_get_access(handle, inode, path); 2344 if (err) 2345 break; 2346 path->p_idx->ei_block = (path+1)->p_idx->ei_block; 2347 err = ext4_ext_dirty(handle, inode, path); 2348 if (err) 2349 break; 2350 } 2351 return err; 2352 } 2353 2354 /* 2355 * ext4_ext_calc_credits_for_single_extent: 2356 * This routine returns max. credits that needed to insert an extent 2357 * to the extent tree. 2358 * When pass the actual path, the caller should calculate credits 2359 * under i_data_sem. 2360 */ 2361 int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, 2362 struct ext4_ext_path *path) 2363 { 2364 if (path) { 2365 int depth = ext_depth(inode); 2366 int ret = 0; 2367 2368 /* probably there is space in leaf? */ 2369 if (le16_to_cpu(path[depth].p_hdr->eh_entries) 2370 < le16_to_cpu(path[depth].p_hdr->eh_max)) { 2371 2372 /* 2373 * There are some space in the leaf tree, no 2374 * need to account for leaf block credit 2375 * 2376 * bitmaps and block group descriptor blocks 2377 * and other metadata blocks still need to be 2378 * accounted. 2379 */ 2380 /* 1 bitmap, 1 block group descriptor */ 2381 ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); 2382 return ret; 2383 } 2384 } 2385 2386 return ext4_chunk_trans_blocks(inode, nrblocks); 2387 } 2388 2389 /* 2390 * How many index/leaf blocks need to change/allocate to add @extents extents? 2391 * 2392 * If we add a single extent, then in the worse case, each tree level 2393 * index/leaf need to be changed in case of the tree split. 2394 * 2395 * If more extents are inserted, they could cause the whole tree split more 2396 * than once, but this is really rare. 2397 */ 2398 int ext4_ext_index_trans_blocks(struct inode *inode, int extents) 2399 { 2400 int index; 2401 int depth; 2402 2403 /* If we are converting the inline data, only one is needed here. 
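 * Otherwise, a rough example of the accounting below: with a tree of
 * depth 2, a single-extent insert is charged 2 * 2 = 4 index/leaf blocks
 * (worst case every level is modified during a split), while inserting
 * several extents is charged 2 * 3 = 6.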
*/ 2404 if (ext4_has_inline_data(inode)) 2405 return 1; 2406 2407 depth = ext_depth(inode); 2408 2409 if (extents <= 1) 2410 index = depth * 2; 2411 else 2412 index = depth * 3; 2413 2414 return index; 2415 } 2416 2417 static inline int get_default_free_blocks_flags(struct inode *inode) 2418 { 2419 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 2420 return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; 2421 else if (ext4_should_journal_data(inode)) 2422 return EXT4_FREE_BLOCKS_FORGET; 2423 return 0; 2424 } 2425 2426 static int ext4_remove_blocks(handle_t *handle, struct inode *inode, 2427 struct ext4_extent *ex, 2428 long long *partial_cluster, 2429 ext4_lblk_t from, ext4_lblk_t to) 2430 { 2431 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2432 unsigned short ee_len = ext4_ext_get_actual_len(ex); 2433 ext4_fsblk_t pblk; 2434 int flags = get_default_free_blocks_flags(inode); 2435 2436 /* 2437 * For bigalloc file systems, we never free a partial cluster 2438 * at the beginning of the extent. Instead, we make a note 2439 * that we tried freeing the cluster, and check to see if we 2440 * need to free it on a subsequent call to ext4_remove_blocks, 2441 * or at the end of the ext4_truncate() operation. 2442 */ 2443 flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; 2444 2445 trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster); 2446 /* 2447 * If we have a partial cluster, and it's different from the 2448 * cluster of the last block, we need to explicitly free the 2449 * partial cluster here. 2450 */ 2451 pblk = ext4_ext_pblock(ex) + ee_len - 1; 2452 if ((*partial_cluster > 0) && 2453 (EXT4_B2C(sbi, pblk) != *partial_cluster)) { 2454 ext4_free_blocks(handle, inode, NULL, 2455 EXT4_C2B(sbi, *partial_cluster), 2456 sbi->s_cluster_ratio, flags); 2457 *partial_cluster = 0; 2458 } 2459 2460 #ifdef EXTENTS_STATS 2461 { 2462 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2463 spin_lock(&sbi->s_ext_stats_lock); 2464 sbi->s_ext_blocks += ee_len; 2465 sbi->s_ext_extents++; 2466 if (ee_len < sbi->s_ext_min) 2467 sbi->s_ext_min = ee_len; 2468 if (ee_len > sbi->s_ext_max) 2469 sbi->s_ext_max = ee_len; 2470 if (ext_depth(inode) > sbi->s_depth_max) 2471 sbi->s_depth_max = ext_depth(inode); 2472 spin_unlock(&sbi->s_ext_stats_lock); 2473 } 2474 #endif 2475 if (from >= le32_to_cpu(ex->ee_block) 2476 && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { 2477 /* tail removal */ 2478 ext4_lblk_t num; 2479 unsigned int unaligned; 2480 2481 num = le32_to_cpu(ex->ee_block) + ee_len - from; 2482 pblk = ext4_ext_pblock(ex) + ee_len - num; 2483 /* 2484 * Usually we want to free partial cluster at the end of the 2485 * extent, except for the situation when the cluster is still 2486 * used by any other extent (partial_cluster is negative). 2487 */ 2488 if (*partial_cluster < 0 && 2489 -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1)) 2490 flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; 2491 2492 ext_debug("free last %u blocks starting %llu partial %lld\n", 2493 num, pblk, *partial_cluster); 2494 ext4_free_blocks(handle, inode, NULL, pblk, num, flags); 2495 /* 2496 * If the block range to be freed didn't start at the 2497 * beginning of a cluster, and we removed the entire 2498 * extent and the cluster is not used by any other extent, 2499 * save the partial cluster here, since we might need to 2500 * delete if we determine that the truncate operation has 2501 * removed all of the blocks in the cluster. 
2502 * 2503 * On the other hand, if we did not manage to free the whole 2504 * extent, we have to mark the cluster as used (store negative 2505 * cluster number in partial_cluster). 2506 */ 2507 unaligned = pblk & (sbi->s_cluster_ratio - 1); 2508 if (unaligned && (ee_len == num) && 2509 (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk)))) 2510 *partial_cluster = EXT4_B2C(sbi, pblk); 2511 else if (unaligned) 2512 *partial_cluster = -((long long)EXT4_B2C(sbi, pblk)); 2513 else if (*partial_cluster > 0) 2514 *partial_cluster = 0; 2515 } else 2516 ext4_error(sbi->s_sb, "strange request: removal(2) " 2517 "%u-%u from %u:%u\n", 2518 from, to, le32_to_cpu(ex->ee_block), ee_len); 2519 return 0; 2520 } 2521 2522 2523 /* 2524 * ext4_ext_rm_leaf() Removes the extents associated with the 2525 * blocks appearing between "start" and "end", and splits the extents 2526 * if "start" and "end" appear in the same extent 2527 * 2528 * @handle: The journal handle 2529 * @inode: The files inode 2530 * @path: The path to the leaf 2531 * @partial_cluster: The cluster which we'll have to free if all extents 2532 * has been released from it. It gets negative in case 2533 * that the cluster is still used. 2534 * @start: The first block to remove 2535 * @end: The last block to remove 2536 */ 2537 static int 2538 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, 2539 struct ext4_ext_path *path, 2540 long long *partial_cluster, 2541 ext4_lblk_t start, ext4_lblk_t end) 2542 { 2543 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2544 int err = 0, correct_index = 0; 2545 int depth = ext_depth(inode), credits; 2546 struct ext4_extent_header *eh; 2547 ext4_lblk_t a, b; 2548 unsigned num; 2549 ext4_lblk_t ex_ee_block; 2550 unsigned short ex_ee_len; 2551 unsigned uninitialized = 0; 2552 struct ext4_extent *ex; 2553 ext4_fsblk_t pblk; 2554 2555 /* the header must be checked already in ext4_ext_remove_space() */ 2556 ext_debug("truncate since %u in leaf to %u\n", start, end); 2557 if (!path[depth].p_hdr) 2558 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); 2559 eh = path[depth].p_hdr; 2560 if (unlikely(path[depth].p_hdr == NULL)) { 2561 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); 2562 return -EIO; 2563 } 2564 /* find where to start removing */ 2565 ex = path[depth].p_ext; 2566 if (!ex) 2567 ex = EXT_LAST_EXTENT(eh); 2568 2569 ex_ee_block = le32_to_cpu(ex->ee_block); 2570 ex_ee_len = ext4_ext_get_actual_len(ex); 2571 2572 trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); 2573 2574 while (ex >= EXT_FIRST_EXTENT(eh) && 2575 ex_ee_block + ex_ee_len > start) { 2576 2577 if (ext4_ext_is_uninitialized(ex)) 2578 uninitialized = 1; 2579 else 2580 uninitialized = 0; 2581 2582 ext_debug("remove ext %u:[%d]%d\n", ex_ee_block, 2583 uninitialized, ex_ee_len); 2584 path[depth].p_ext = ex; 2585 2586 a = ex_ee_block > start ? ex_ee_block : start; 2587 b = ex_ee_block+ex_ee_len - 1 < end ? 
2588 ex_ee_block+ex_ee_len - 1 : end; 2589 2590 ext_debug(" border %u:%u\n", a, b); 2591 2592 /* If this extent is beyond the end of the hole, skip it */ 2593 if (end < ex_ee_block) { 2594 /* 2595 * We're going to skip this extent and move to another, 2596 * so if this extent is not cluster aligned we have 2597 * to mark the current cluster as used to avoid 2598 * accidentally freeing it later on 2599 */ 2600 pblk = ext4_ext_pblock(ex); 2601 if (pblk & (sbi->s_cluster_ratio - 1)) 2602 *partial_cluster = 2603 -((long long)EXT4_B2C(sbi, pblk)); 2604 ex--; 2605 ex_ee_block = le32_to_cpu(ex->ee_block); 2606 ex_ee_len = ext4_ext_get_actual_len(ex); 2607 continue; 2608 } else if (b != ex_ee_block + ex_ee_len - 1) { 2609 EXT4_ERROR_INODE(inode, 2610 "can not handle truncate %u:%u " 2611 "on extent %u:%u", 2612 start, end, ex_ee_block, 2613 ex_ee_block + ex_ee_len - 1); 2614 err = -EIO; 2615 goto out; 2616 } else if (a != ex_ee_block) { 2617 /* remove tail of the extent */ 2618 num = a - ex_ee_block; 2619 } else { 2620 /* remove whole extent: excellent! */ 2621 num = 0; 2622 } 2623 /* 2624 * 3 for leaf, sb, and inode plus 2 (bmap and group 2625 * descriptor) for each block group; assume two block 2626 * groups plus ex_ee_len/blocks_per_block_group for 2627 * the worst case 2628 */ 2629 credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb)); 2630 if (ex == EXT_FIRST_EXTENT(eh)) { 2631 correct_index = 1; 2632 credits += (ext_depth(inode)) + 1; 2633 } 2634 credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); 2635 2636 err = ext4_ext_truncate_extend_restart(handle, inode, credits); 2637 if (err) 2638 goto out; 2639 2640 err = ext4_ext_get_access(handle, inode, path + depth); 2641 if (err) 2642 goto out; 2643 2644 err = ext4_remove_blocks(handle, inode, ex, partial_cluster, 2645 a, b); 2646 if (err) 2647 goto out; 2648 2649 if (num == 0) 2650 /* this extent is removed; mark slot entirely unused */ 2651 ext4_ext_store_pblock(ex, 0); 2652 2653 ex->ee_len = cpu_to_le16(num); 2654 /* 2655 * Do not mark uninitialized if all the blocks in the 2656 * extent have been removed. 2657 */ 2658 if (uninitialized && num) 2659 ext4_ext_mark_uninitialized(ex); 2660 /* 2661 * If the extent was completely released, 2662 * we need to remove it from the leaf 2663 */ 2664 if (num == 0) { 2665 if (end != EXT_MAX_BLOCKS - 1) { 2666 /* 2667 * For hole punching, we need to scoot all the 2668 * extents up when an extent is removed so that 2669 * we dont have blank extents in the middle 2670 */ 2671 memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) * 2672 sizeof(struct ext4_extent)); 2673 2674 /* Now get rid of the one at the end */ 2675 memset(EXT_LAST_EXTENT(eh), 0, 2676 sizeof(struct ext4_extent)); 2677 } 2678 le16_add_cpu(&eh->eh_entries, -1); 2679 } else if (*partial_cluster > 0) 2680 *partial_cluster = 0; 2681 2682 err = ext4_ext_dirty(handle, inode, path + depth); 2683 if (err) 2684 goto out; 2685 2686 ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num, 2687 ext4_ext_pblock(ex)); 2688 ex--; 2689 ex_ee_block = le32_to_cpu(ex->ee_block); 2690 ex_ee_len = ext4_ext_get_actual_len(ex); 2691 } 2692 2693 if (correct_index && eh->eh_entries) 2694 err = ext4_ext_correct_indexes(handle, inode, path); 2695 2696 /* 2697 * Free the partial cluster only if the current extent does not 2698 * reference it. Otherwise we might free used cluster. 
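 * For example, with a bigalloc cluster ratio of 16: if the leftover
 * partial_cluster is the very cluster that holds the last block of the
 * extent the scan stopped on, it is still partially in use and must be
 * kept rather than freed here.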
2699 */ 2700 if (*partial_cluster > 0 && 2701 (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) != 2702 *partial_cluster)) { 2703 int flags = get_default_free_blocks_flags(inode); 2704 2705 ext4_free_blocks(handle, inode, NULL, 2706 EXT4_C2B(sbi, *partial_cluster), 2707 sbi->s_cluster_ratio, flags); 2708 *partial_cluster = 0; 2709 } 2710 2711 /* if this leaf is free, then we should 2712 * remove it from index block above */ 2713 if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) 2714 err = ext4_ext_rm_idx(handle, inode, path, depth); 2715 2716 out: 2717 return err; 2718 } 2719 2720 /* 2721 * ext4_ext_more_to_rm: 2722 * returns 1 if current index has to be freed (even partial) 2723 */ 2724 static int 2725 ext4_ext_more_to_rm(struct ext4_ext_path *path) 2726 { 2727 BUG_ON(path->p_idx == NULL); 2728 2729 if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) 2730 return 0; 2731 2732 /* 2733 * if truncate on deeper level happened, it wasn't partial, 2734 * so we have to consider current index for truncation 2735 */ 2736 if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block) 2737 return 0; 2738 return 1; 2739 } 2740 2741 int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, 2742 ext4_lblk_t end) 2743 { 2744 struct super_block *sb = inode->i_sb; 2745 int depth = ext_depth(inode); 2746 struct ext4_ext_path *path = NULL; 2747 long long partial_cluster = 0; 2748 handle_t *handle; 2749 int i = 0, err = 0; 2750 2751 ext_debug("truncate since %u to %u\n", start, end); 2752 2753 /* probably first extent we're gonna free will be last in block */ 2754 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1); 2755 if (IS_ERR(handle)) 2756 return PTR_ERR(handle); 2757 2758 again: 2759 trace_ext4_ext_remove_space(inode, start, end, depth); 2760 2761 /* 2762 * Check if we are removing extents inside the extent tree. If that 2763 * is the case, we are going to punch a hole inside the extent tree 2764 * so we have to check whether we need to split the extent covering 2765 * the last block to remove so we can easily remove the part of it 2766 * in ext4_ext_rm_leaf(). 2767 */ 2768 if (end < EXT_MAX_BLOCKS - 1) { 2769 struct ext4_extent *ex; 2770 ext4_lblk_t ee_block; 2771 2772 /* find extent for this block */ 2773 path = ext4_ext_find_extent(inode, end, NULL, EXT4_EX_NOCACHE); 2774 if (IS_ERR(path)) { 2775 ext4_journal_stop(handle); 2776 return PTR_ERR(path); 2777 } 2778 depth = ext_depth(inode); 2779 /* Leaf not may not exist only if inode has no blocks at all */ 2780 ex = path[depth].p_ext; 2781 if (!ex) { 2782 if (depth) { 2783 EXT4_ERROR_INODE(inode, 2784 "path[%d].p_hdr == NULL", 2785 depth); 2786 err = -EIO; 2787 } 2788 goto out; 2789 } 2790 2791 ee_block = le32_to_cpu(ex->ee_block); 2792 2793 /* 2794 * See if the last block is inside the extent, if so split 2795 * the extent at 'end' block so we can easily remove the 2796 * tail of the first part of the split extent in 2797 * ext4_ext_rm_leaf(). 2798 */ 2799 if (end >= ee_block && 2800 end < ee_block + ext4_ext_get_actual_len(ex) - 1) { 2801 int split_flag = 0; 2802 2803 if (ext4_ext_is_uninitialized(ex)) 2804 split_flag = EXT4_EXT_MARK_UNINIT1 | 2805 EXT4_EXT_MARK_UNINIT2; 2806 2807 /* 2808 * Split the extent in two so that 'end' is the last 2809 * block in the first new extent. Also we should not 2810 * fail removing space due to ENOSPC so try to use 2811 * reserved block if that happens. 
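 *
 * For instance, when punching blocks 10..19 out of an extent covering
 * 5..30, the extent is first split at block 20 ('end' + 1) so that the
 * leaf scan only has to trim the tail of the 5..19 half while the
 * 20..30 half survives untouched.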
2812 */ 2813 err = ext4_split_extent_at(handle, inode, path, 2814 end + 1, split_flag, 2815 EXT4_EX_NOCACHE | 2816 EXT4_GET_BLOCKS_PRE_IO | 2817 EXT4_GET_BLOCKS_METADATA_NOFAIL); 2818 2819 if (err < 0) 2820 goto out; 2821 } 2822 } 2823 /* 2824 * We start scanning from right side, freeing all the blocks 2825 * after i_size and walking into the tree depth-wise. 2826 */ 2827 depth = ext_depth(inode); 2828 if (path) { 2829 int k = i = depth; 2830 while (--k > 0) 2831 path[k].p_block = 2832 le16_to_cpu(path[k].p_hdr->eh_entries)+1; 2833 } else { 2834 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), 2835 GFP_NOFS); 2836 if (path == NULL) { 2837 ext4_journal_stop(handle); 2838 return -ENOMEM; 2839 } 2840 path[0].p_depth = depth; 2841 path[0].p_hdr = ext_inode_hdr(inode); 2842 i = 0; 2843 2844 if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) { 2845 err = -EIO; 2846 goto out; 2847 } 2848 } 2849 err = 0; 2850 2851 while (i >= 0 && err == 0) { 2852 if (i == depth) { 2853 /* this is leaf block */ 2854 err = ext4_ext_rm_leaf(handle, inode, path, 2855 &partial_cluster, start, 2856 end); 2857 /* root level has p_bh == NULL, brelse() eats this */ 2858 brelse(path[i].p_bh); 2859 path[i].p_bh = NULL; 2860 i--; 2861 continue; 2862 } 2863 2864 /* this is index block */ 2865 if (!path[i].p_hdr) { 2866 ext_debug("initialize header\n"); 2867 path[i].p_hdr = ext_block_hdr(path[i].p_bh); 2868 } 2869 2870 if (!path[i].p_idx) { 2871 /* this level hasn't been touched yet */ 2872 path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr); 2873 path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1; 2874 ext_debug("init index ptr: hdr 0x%p, num %d\n", 2875 path[i].p_hdr, 2876 le16_to_cpu(path[i].p_hdr->eh_entries)); 2877 } else { 2878 /* we were already here, see at next index */ 2879 path[i].p_idx--; 2880 } 2881 2882 ext_debug("level %d - index, first 0x%p, cur 0x%p\n", 2883 i, EXT_FIRST_INDEX(path[i].p_hdr), 2884 path[i].p_idx); 2885 if (ext4_ext_more_to_rm(path + i)) { 2886 struct buffer_head *bh; 2887 /* go to the next level */ 2888 ext_debug("move to level %d (block %llu)\n", 2889 i + 1, ext4_idx_pblock(path[i].p_idx)); 2890 memset(path + i + 1, 0, sizeof(*path)); 2891 bh = read_extent_tree_block(inode, 2892 ext4_idx_pblock(path[i].p_idx), depth - i - 1, 2893 EXT4_EX_NOCACHE); 2894 if (IS_ERR(bh)) { 2895 /* should we reset i_size? */ 2896 err = PTR_ERR(bh); 2897 break; 2898 } 2899 /* Yield here to deal with large extent trees. 2900 * Should be a no-op if we did IO above. 
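 * (cond_resched() only reschedules when a reschedule is pending, so it
 *  costs essentially nothing when the read above already slept.)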
*/ 2901 cond_resched(); 2902 if (WARN_ON(i + 1 > depth)) { 2903 err = -EIO; 2904 break; 2905 } 2906 path[i + 1].p_bh = bh; 2907 2908 /* save actual number of indexes since this 2909 * number is changed at the next iteration */ 2910 path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries); 2911 i++; 2912 } else { 2913 /* we finished processing this index, go up */ 2914 if (path[i].p_hdr->eh_entries == 0 && i > 0) { 2915 /* index is empty, remove it; 2916 * handle must be already prepared by the 2917 * truncatei_leaf() */ 2918 err = ext4_ext_rm_idx(handle, inode, path, i); 2919 } 2920 /* root level has p_bh == NULL, brelse() eats this */ 2921 brelse(path[i].p_bh); 2922 path[i].p_bh = NULL; 2923 i--; 2924 ext_debug("return to level %d\n", i); 2925 } 2926 } 2927 2928 trace_ext4_ext_remove_space_done(inode, start, end, depth, 2929 partial_cluster, path->p_hdr->eh_entries); 2930 2931 /* If we still have something in the partial cluster and we have removed 2932 * even the first extent, then we should free the blocks in the partial 2933 * cluster as well. */ 2934 if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) { 2935 int flags = get_default_free_blocks_flags(inode); 2936 2937 ext4_free_blocks(handle, inode, NULL, 2938 EXT4_C2B(EXT4_SB(sb), partial_cluster), 2939 EXT4_SB(sb)->s_cluster_ratio, flags); 2940 partial_cluster = 0; 2941 } 2942 2943 /* TODO: flexible tree reduction should be here */ 2944 if (path->p_hdr->eh_entries == 0) { 2945 /* 2946 * truncate to zero freed all the tree, 2947 * so we need to correct eh_depth 2948 */ 2949 err = ext4_ext_get_access(handle, inode, path); 2950 if (err == 0) { 2951 ext_inode_hdr(inode)->eh_depth = 0; 2952 ext_inode_hdr(inode)->eh_max = 2953 cpu_to_le16(ext4_ext_space_root(inode, 0)); 2954 err = ext4_ext_dirty(handle, inode, path); 2955 } 2956 } 2957 out: 2958 ext4_ext_drop_refs(path); 2959 kfree(path); 2960 if (err == -EAGAIN) { 2961 path = NULL; 2962 goto again; 2963 } 2964 ext4_journal_stop(handle); 2965 2966 return err; 2967 } 2968 2969 /* 2970 * called at mount time 2971 */ 2972 void ext4_ext_init(struct super_block *sb) 2973 { 2974 /* 2975 * possible initialization would be here 2976 */ 2977 2978 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { 2979 #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) 2980 printk(KERN_INFO "EXT4-fs: file extents enabled" 2981 #ifdef AGGRESSIVE_TEST 2982 ", aggressive tests" 2983 #endif 2984 #ifdef CHECK_BINSEARCH 2985 ", check binsearch" 2986 #endif 2987 #ifdef EXTENTS_STATS 2988 ", stats" 2989 #endif 2990 "\n"); 2991 #endif 2992 #ifdef EXTENTS_STATS 2993 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); 2994 EXT4_SB(sb)->s_ext_min = 1 << 30; 2995 EXT4_SB(sb)->s_ext_max = 0; 2996 #endif 2997 } 2998 } 2999 3000 /* 3001 * called at umount time 3002 */ 3003 void ext4_ext_release(struct super_block *sb) 3004 { 3005 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) 3006 return; 3007 3008 #ifdef EXTENTS_STATS 3009 if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) { 3010 struct ext4_sb_info *sbi = EXT4_SB(sb); 3011 printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n", 3012 sbi->s_ext_blocks, sbi->s_ext_extents, 3013 sbi->s_ext_blocks / sbi->s_ext_extents); 3014 printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n", 3015 sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max); 3016 } 3017 #endif 3018 } 3019 3020 static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) 3021 { 3022 ext4_lblk_t ee_block; 3023 
ext4_fsblk_t ee_pblock; 3024 unsigned int ee_len; 3025 3026 ee_block = le32_to_cpu(ex->ee_block); 3027 ee_len = ext4_ext_get_actual_len(ex); 3028 ee_pblock = ext4_ext_pblock(ex); 3029 3030 if (ee_len == 0) 3031 return 0; 3032 3033 return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, 3034 EXTENT_STATUS_WRITTEN); 3035 } 3036 3037 /* FIXME!! we need to try to merge to left or right after zero-out */ 3038 static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) 3039 { 3040 ext4_fsblk_t ee_pblock; 3041 unsigned int ee_len; 3042 int ret; 3043 3044 ee_len = ext4_ext_get_actual_len(ex); 3045 ee_pblock = ext4_ext_pblock(ex); 3046 3047 ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS); 3048 if (ret > 0) 3049 ret = 0; 3050 3051 return ret; 3052 } 3053 3054 /* 3055 * ext4_split_extent_at() splits an extent at given block. 3056 * 3057 * @handle: the journal handle 3058 * @inode: the file inode 3059 * @path: the path to the extent 3060 * @split: the logical block where the extent is splitted. 3061 * @split_flags: indicates if the extent could be zeroout if split fails, and 3062 * the states(init or uninit) of new extents. 3063 * @flags: flags used to insert new extent to extent tree. 3064 * 3065 * 3066 * Splits extent [a, b] into two extents [a, @split) and [@split, b], states 3067 * of which are deterimined by split_flag. 3068 * 3069 * There are two cases: 3070 * a> the extent are splitted into two extent. 3071 * b> split is not needed, and just mark the extent. 3072 * 3073 * return 0 on success. 3074 */ 3075 static int ext4_split_extent_at(handle_t *handle, 3076 struct inode *inode, 3077 struct ext4_ext_path *path, 3078 ext4_lblk_t split, 3079 int split_flag, 3080 int flags) 3081 { 3082 ext4_fsblk_t newblock; 3083 ext4_lblk_t ee_block; 3084 struct ext4_extent *ex, newex, orig_ex, zero_ex; 3085 struct ext4_extent *ex2 = NULL; 3086 unsigned int ee_len, depth; 3087 int err = 0; 3088 3089 BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) == 3090 (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)); 3091 3092 ext_debug("ext4_split_extents_at: inode %lu, logical" 3093 "block %llu\n", inode->i_ino, (unsigned long long)split); 3094 3095 ext4_ext_show_leaf(inode, path); 3096 3097 depth = ext_depth(inode); 3098 ex = path[depth].p_ext; 3099 ee_block = le32_to_cpu(ex->ee_block); 3100 ee_len = ext4_ext_get_actual_len(ex); 3101 newblock = split - ee_block + ext4_ext_pblock(ex); 3102 3103 BUG_ON(split < ee_block || split >= (ee_block + ee_len)); 3104 BUG_ON(!ext4_ext_is_uninitialized(ex) && 3105 split_flag & (EXT4_EXT_MAY_ZEROOUT | 3106 EXT4_EXT_MARK_UNINIT1 | 3107 EXT4_EXT_MARK_UNINIT2)); 3108 3109 err = ext4_ext_get_access(handle, inode, path + depth); 3110 if (err) 3111 goto out; 3112 3113 if (split == ee_block) { 3114 /* 3115 * case b: block @split is the block that the extent begins with 3116 * then we just change the state of the extent, and splitting 3117 * is not needed. 
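 * For example, splitting an uninitialized extent covering 100..119 at
 * block 100 with EXT4_EXT_MARK_UNINIT2 not set simply marks the whole
 * extent initialized in place; no second extent is inserted.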
3118 */ 3119 if (split_flag & EXT4_EXT_MARK_UNINIT2) 3120 ext4_ext_mark_uninitialized(ex); 3121 else 3122 ext4_ext_mark_initialized(ex); 3123 3124 if (!(flags & EXT4_GET_BLOCKS_PRE_IO)) 3125 ext4_ext_try_to_merge(handle, inode, path, ex); 3126 3127 err = ext4_ext_dirty(handle, inode, path + path->p_depth); 3128 goto out; 3129 } 3130 3131 /* case a */ 3132 memcpy(&orig_ex, ex, sizeof(orig_ex)); 3133 ex->ee_len = cpu_to_le16(split - ee_block); 3134 if (split_flag & EXT4_EXT_MARK_UNINIT1) 3135 ext4_ext_mark_uninitialized(ex); 3136 3137 /* 3138 * path may lead to new leaf, not to original leaf any more 3139 * after ext4_ext_insert_extent() returns, 3140 */ 3141 err = ext4_ext_dirty(handle, inode, path + depth); 3142 if (err) 3143 goto fix_extent_len; 3144 3145 ex2 = &newex; 3146 ex2->ee_block = cpu_to_le32(split); 3147 ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block)); 3148 ext4_ext_store_pblock(ex2, newblock); 3149 if (split_flag & EXT4_EXT_MARK_UNINIT2) 3150 ext4_ext_mark_uninitialized(ex2); 3151 3152 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3153 if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { 3154 if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { 3155 if (split_flag & EXT4_EXT_DATA_VALID1) { 3156 err = ext4_ext_zeroout(inode, ex2); 3157 zero_ex.ee_block = ex2->ee_block; 3158 zero_ex.ee_len = cpu_to_le16( 3159 ext4_ext_get_actual_len(ex2)); 3160 ext4_ext_store_pblock(&zero_ex, 3161 ext4_ext_pblock(ex2)); 3162 } else { 3163 err = ext4_ext_zeroout(inode, ex); 3164 zero_ex.ee_block = ex->ee_block; 3165 zero_ex.ee_len = cpu_to_le16( 3166 ext4_ext_get_actual_len(ex)); 3167 ext4_ext_store_pblock(&zero_ex, 3168 ext4_ext_pblock(ex)); 3169 } 3170 } else { 3171 err = ext4_ext_zeroout(inode, &orig_ex); 3172 zero_ex.ee_block = orig_ex.ee_block; 3173 zero_ex.ee_len = cpu_to_le16( 3174 ext4_ext_get_actual_len(&orig_ex)); 3175 ext4_ext_store_pblock(&zero_ex, 3176 ext4_ext_pblock(&orig_ex)); 3177 } 3178 3179 if (err) 3180 goto fix_extent_len; 3181 /* update the extent length and mark as initialized */ 3182 ex->ee_len = cpu_to_le16(ee_len); 3183 ext4_ext_try_to_merge(handle, inode, path, ex); 3184 err = ext4_ext_dirty(handle, inode, path + path->p_depth); 3185 if (err) 3186 goto fix_extent_len; 3187 3188 /* update extent status tree */ 3189 err = ext4_zeroout_es(inode, &zero_ex); 3190 3191 goto out; 3192 } else if (err) 3193 goto fix_extent_len; 3194 3195 out: 3196 ext4_ext_show_leaf(inode, path); 3197 return err; 3198 3199 fix_extent_len: 3200 ex->ee_len = orig_ex.ee_len; 3201 ext4_ext_dirty(handle, inode, path + depth); 3202 return err; 3203 } 3204 3205 /* 3206 * ext4_split_extents() splits an extent and mark extent which is covered 3207 * by @map as split_flags indicates 3208 * 3209 * It may result in splitting the extent into multiple extents (up to three) 3210 * There are three possibilities: 3211 * a> There is no split required 3212 * b> Splits in two extents: Split is happening at either end of the extent 3213 * c> Splits in three extents: Somone is splitting in middle of the extent 3214 * 3215 */ 3216 static int ext4_split_extent(handle_t *handle, 3217 struct inode *inode, 3218 struct ext4_ext_path *path, 3219 struct ext4_map_blocks *map, 3220 int split_flag, 3221 int flags) 3222 { 3223 ext4_lblk_t ee_block; 3224 struct ext4_extent *ex; 3225 unsigned int ee_len, depth; 3226 int err = 0; 3227 int uninitialized; 3228 int split_flag1, flags1; 3229 int allocated = map->m_len; 3230 3231 depth = ext_depth(inode); 3232 ex = path[depth].p_ext; 3233 
ee_block = le32_to_cpu(ex->ee_block); 3234 ee_len = ext4_ext_get_actual_len(ex); 3235 uninitialized = ext4_ext_is_uninitialized(ex); 3236 3237 if (map->m_lblk + map->m_len < ee_block + ee_len) { 3238 split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT; 3239 flags1 = flags | EXT4_GET_BLOCKS_PRE_IO; 3240 if (uninitialized) 3241 split_flag1 |= EXT4_EXT_MARK_UNINIT1 | 3242 EXT4_EXT_MARK_UNINIT2; 3243 if (split_flag & EXT4_EXT_DATA_VALID2) 3244 split_flag1 |= EXT4_EXT_DATA_VALID1; 3245 err = ext4_split_extent_at(handle, inode, path, 3246 map->m_lblk + map->m_len, split_flag1, flags1); 3247 if (err) 3248 goto out; 3249 } else { 3250 allocated = ee_len - (map->m_lblk - ee_block); 3251 } 3252 /* 3253 * Update path is required because previous ext4_split_extent_at() may 3254 * result in split of original leaf or extent zeroout. 3255 */ 3256 ext4_ext_drop_refs(path); 3257 path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); 3258 if (IS_ERR(path)) 3259 return PTR_ERR(path); 3260 depth = ext_depth(inode); 3261 ex = path[depth].p_ext; 3262 uninitialized = ext4_ext_is_uninitialized(ex); 3263 split_flag1 = 0; 3264 3265 if (map->m_lblk >= ee_block) { 3266 split_flag1 = split_flag & EXT4_EXT_DATA_VALID2; 3267 if (uninitialized) { 3268 split_flag1 |= EXT4_EXT_MARK_UNINIT1; 3269 split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | 3270 EXT4_EXT_MARK_UNINIT2); 3271 } 3272 err = ext4_split_extent_at(handle, inode, path, 3273 map->m_lblk, split_flag1, flags); 3274 if (err) 3275 goto out; 3276 } 3277 3278 ext4_ext_show_leaf(inode, path); 3279 out: 3280 return err ? err : allocated; 3281 } 3282 3283 /* 3284 * This function is called by ext4_ext_map_blocks() if someone tries to write 3285 * to an uninitialized extent. It may result in splitting the uninitialized 3286 * extent into multiple extents (up to three - one initialized and two 3287 * uninitialized). 3288 * There are three possibilities: 3289 * a> There is no split required: Entire extent should be initialized 3290 * b> Splits in two extents: Write is happening at either end of the extent 3291 * c> Splits in three extents: Somone is writing in middle of the extent 3292 * 3293 * Pre-conditions: 3294 * - The extent pointed to by 'path' is uninitialized. 3295 * - The extent pointed to by 'path' contains a superset 3296 * of the logical span [map->m_lblk, map->m_lblk + map->m_len). 3297 * 3298 * Post-conditions on success: 3299 * - the returned value is the number of blocks beyond map->l_lblk 3300 * that are allocated and initialized. 3301 * It is guaranteed to be >= map->m_len. 
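 *
 * A sketch of case c> below: if the map covers blocks 104..107 of an
 * uninitialized extent 100..109, the result is 100..103 uninitialized,
 * 104..107 initialized and 108..109 uninitialized, unless a half is small
 * enough to be zeroed out instead (see s_extent_max_zeroout_kb).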
3302 */ 3303 static int ext4_ext_convert_to_initialized(handle_t *handle, 3304 struct inode *inode, 3305 struct ext4_map_blocks *map, 3306 struct ext4_ext_path *path, 3307 int flags) 3308 { 3309 struct ext4_sb_info *sbi; 3310 struct ext4_extent_header *eh; 3311 struct ext4_map_blocks split_map; 3312 struct ext4_extent zero_ex; 3313 struct ext4_extent *ex, *abut_ex; 3314 ext4_lblk_t ee_block, eof_block; 3315 unsigned int ee_len, depth, map_len = map->m_len; 3316 int allocated = 0, max_zeroout = 0; 3317 int err = 0; 3318 int split_flag = 0; 3319 3320 ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" 3321 "block %llu, max_blocks %u\n", inode->i_ino, 3322 (unsigned long long)map->m_lblk, map_len); 3323 3324 sbi = EXT4_SB(inode->i_sb); 3325 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> 3326 inode->i_sb->s_blocksize_bits; 3327 if (eof_block < map->m_lblk + map_len) 3328 eof_block = map->m_lblk + map_len; 3329 3330 depth = ext_depth(inode); 3331 eh = path[depth].p_hdr; 3332 ex = path[depth].p_ext; 3333 ee_block = le32_to_cpu(ex->ee_block); 3334 ee_len = ext4_ext_get_actual_len(ex); 3335 zero_ex.ee_len = 0; 3336 3337 trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); 3338 3339 /* Pre-conditions */ 3340 BUG_ON(!ext4_ext_is_uninitialized(ex)); 3341 BUG_ON(!in_range(map->m_lblk, ee_block, ee_len)); 3342 3343 /* 3344 * Attempt to transfer newly initialized blocks from the currently 3345 * uninitialized extent to its neighbor. This is much cheaper 3346 * than an insertion followed by a merge as those involve costly 3347 * memmove() calls. Transferring to the left is the common case in 3348 * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE) 3349 * followed by append writes. 3350 * 3351 * Limitations of the current logic: 3352 * - L1: we do not deal with writes covering the whole extent. 3353 * This would require removing the extent if the transfer 3354 * is possible. 3355 * - L2: we only attempt to merge with an extent stored in the 3356 * same extent tree node. 3357 */ 3358 if ((map->m_lblk == ee_block) && 3359 /* See if we can merge left */ 3360 (map_len < ee_len) && /*L1*/ 3361 (ex > EXT_FIRST_EXTENT(eh))) { /*L2*/ 3362 ext4_lblk_t prev_lblk; 3363 ext4_fsblk_t prev_pblk, ee_pblk; 3364 unsigned int prev_len; 3365 3366 abut_ex = ex - 1; 3367 prev_lblk = le32_to_cpu(abut_ex->ee_block); 3368 prev_len = ext4_ext_get_actual_len(abut_ex); 3369 prev_pblk = ext4_ext_pblock(abut_ex); 3370 ee_pblk = ext4_ext_pblock(ex); 3371 3372 /* 3373 * A transfer of blocks from 'ex' to 'abut_ex' is allowed 3374 * upon those conditions: 3375 * - C1: abut_ex is initialized, 3376 * - C2: abut_ex is logically abutting ex, 3377 * - C3: abut_ex is physically abutting ex, 3378 * - C4: abut_ex can receive the additional blocks without 3379 * overflowing the (initialized) length limit. 
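 *
 * Example of a transfer that satisfies C1-C4: abut_ex covers logical
 * blocks 100..109 (initialized) and ex covers 110..129 (uninitialized and
 * physically contiguous with abut_ex); a 4-block write starting at 110
 * shrinks ex to 114..129 and extends abut_ex to 100..113, with no extent
 * inserted or merged.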
3380 */ 3381 if ((!ext4_ext_is_uninitialized(abut_ex)) && /*C1*/ 3382 ((prev_lblk + prev_len) == ee_block) && /*C2*/ 3383 ((prev_pblk + prev_len) == ee_pblk) && /*C3*/ 3384 (prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ 3385 err = ext4_ext_get_access(handle, inode, path + depth); 3386 if (err) 3387 goto out; 3388 3389 trace_ext4_ext_convert_to_initialized_fastpath(inode, 3390 map, ex, abut_ex); 3391 3392 /* Shift the start of ex by 'map_len' blocks */ 3393 ex->ee_block = cpu_to_le32(ee_block + map_len); 3394 ext4_ext_store_pblock(ex, ee_pblk + map_len); 3395 ex->ee_len = cpu_to_le16(ee_len - map_len); 3396 ext4_ext_mark_uninitialized(ex); /* Restore the flag */ 3397 3398 /* Extend abut_ex by 'map_len' blocks */ 3399 abut_ex->ee_len = cpu_to_le16(prev_len + map_len); 3400 3401 /* Result: number of initialized blocks past m_lblk */ 3402 allocated = map_len; 3403 } 3404 } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) && 3405 (map_len < ee_len) && /*L1*/ 3406 ex < EXT_LAST_EXTENT(eh)) { /*L2*/ 3407 /* See if we can merge right */ 3408 ext4_lblk_t next_lblk; 3409 ext4_fsblk_t next_pblk, ee_pblk; 3410 unsigned int next_len; 3411 3412 abut_ex = ex + 1; 3413 next_lblk = le32_to_cpu(abut_ex->ee_block); 3414 next_len = ext4_ext_get_actual_len(abut_ex); 3415 next_pblk = ext4_ext_pblock(abut_ex); 3416 ee_pblk = ext4_ext_pblock(ex); 3417 3418 /* 3419 * A transfer of blocks from 'ex' to 'abut_ex' is allowed 3420 * upon those conditions: 3421 * - C1: abut_ex is initialized, 3422 * - C2: abut_ex is logically abutting ex, 3423 * - C3: abut_ex is physically abutting ex, 3424 * - C4: abut_ex can receive the additional blocks without 3425 * overflowing the (initialized) length limit. 3426 */ 3427 if ((!ext4_ext_is_uninitialized(abut_ex)) && /*C1*/ 3428 ((map->m_lblk + map_len) == next_lblk) && /*C2*/ 3429 ((ee_pblk + ee_len) == next_pblk) && /*C3*/ 3430 (next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ 3431 err = ext4_ext_get_access(handle, inode, path + depth); 3432 if (err) 3433 goto out; 3434 3435 trace_ext4_ext_convert_to_initialized_fastpath(inode, 3436 map, ex, abut_ex); 3437 3438 /* Shift the start of abut_ex by 'map_len' blocks */ 3439 abut_ex->ee_block = cpu_to_le32(next_lblk - map_len); 3440 ext4_ext_store_pblock(abut_ex, next_pblk - map_len); 3441 ex->ee_len = cpu_to_le16(ee_len - map_len); 3442 ext4_ext_mark_uninitialized(ex); /* Restore the flag */ 3443 3444 /* Extend abut_ex by 'map_len' blocks */ 3445 abut_ex->ee_len = cpu_to_le16(next_len + map_len); 3446 3447 /* Result: number of initialized blocks past m_lblk */ 3448 allocated = map_len; 3449 } 3450 } 3451 if (allocated) { 3452 /* Mark the block containing both extents as dirty */ 3453 ext4_ext_dirty(handle, inode, path + depth); 3454 3455 /* Update path to point to the right extent */ 3456 path[depth].p_ext = abut_ex; 3457 goto out; 3458 } else 3459 allocated = ee_len - (map->m_lblk - ee_block); 3460 3461 WARN_ON(map->m_lblk < ee_block); 3462 /* 3463 * It is safe to convert extent to initialized via explicit 3464 * zeroout only if extent is fully insde i_size or new_size. 3465 */ 3466 split_flag |= ee_block + ee_len <= eof_block ? 
EXT4_EXT_MAY_ZEROOUT : 0; 3467 3468 if (EXT4_EXT_MAY_ZEROOUT & split_flag) 3469 max_zeroout = sbi->s_extent_max_zeroout_kb >> 3470 (inode->i_sb->s_blocksize_bits - 10); 3471 3472 /* If extent is less than s_max_zeroout_kb, zeroout directly */ 3473 if (max_zeroout && (ee_len <= max_zeroout)) { 3474 err = ext4_ext_zeroout(inode, ex); 3475 if (err) 3476 goto out; 3477 zero_ex.ee_block = ex->ee_block; 3478 zero_ex.ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)); 3479 ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex)); 3480 3481 err = ext4_ext_get_access(handle, inode, path + depth); 3482 if (err) 3483 goto out; 3484 ext4_ext_mark_initialized(ex); 3485 ext4_ext_try_to_merge(handle, inode, path, ex); 3486 err = ext4_ext_dirty(handle, inode, path + path->p_depth); 3487 goto out; 3488 } 3489 3490 /* 3491 * four cases: 3492 * 1. split the extent into three extents. 3493 * 2. split the extent into two extents, zeroout the first half. 3494 * 3. split the extent into two extents, zeroout the second half. 3495 * 4. split the extent into two extents with out zeroout. 3496 */ 3497 split_map.m_lblk = map->m_lblk; 3498 split_map.m_len = map->m_len; 3499 3500 if (max_zeroout && (allocated > map->m_len)) { 3501 if (allocated <= max_zeroout) { 3502 /* case 3 */ 3503 zero_ex.ee_block = 3504 cpu_to_le32(map->m_lblk); 3505 zero_ex.ee_len = cpu_to_le16(allocated); 3506 ext4_ext_store_pblock(&zero_ex, 3507 ext4_ext_pblock(ex) + map->m_lblk - ee_block); 3508 err = ext4_ext_zeroout(inode, &zero_ex); 3509 if (err) 3510 goto out; 3511 split_map.m_lblk = map->m_lblk; 3512 split_map.m_len = allocated; 3513 } else if (map->m_lblk - ee_block + map->m_len < max_zeroout) { 3514 /* case 2 */ 3515 if (map->m_lblk != ee_block) { 3516 zero_ex.ee_block = ex->ee_block; 3517 zero_ex.ee_len = cpu_to_le16(map->m_lblk - 3518 ee_block); 3519 ext4_ext_store_pblock(&zero_ex, 3520 ext4_ext_pblock(ex)); 3521 err = ext4_ext_zeroout(inode, &zero_ex); 3522 if (err) 3523 goto out; 3524 } 3525 3526 split_map.m_lblk = ee_block; 3527 split_map.m_len = map->m_lblk - ee_block + map->m_len; 3528 allocated = map->m_len; 3529 } 3530 } 3531 3532 allocated = ext4_split_extent(handle, inode, path, 3533 &split_map, split_flag, flags); 3534 if (allocated < 0) 3535 err = allocated; 3536 3537 out: 3538 /* If we have gotten a failure, don't zero out status tree */ 3539 if (!err) 3540 err = ext4_zeroout_es(inode, &zero_ex); 3541 return err ? err : allocated; 3542 } 3543 3544 /* 3545 * This function is called by ext4_ext_map_blocks() from 3546 * ext4_get_blocks_dio_write() when DIO to write 3547 * to an uninitialized extent. 3548 * 3549 * Writing to an uninitialized extent may result in splitting the uninitialized 3550 * extent into multiple initialized/uninitialized extents (up to three) 3551 * There are three possibilities: 3552 * a> There is no split required: Entire extent should be uninitialized 3553 * b> Splits in two extents: Write is happening at either end of the extent 3554 * c> Splits in three extents: Somone is writing in middle of the extent 3555 * 3556 * One of more index blocks maybe needed if the extent tree grow after 3557 * the uninitialized extent split. To prevent ENOSPC occur at the IO 3558 * complete, we need to split the uninitialized extent before DIO submit 3559 * the IO. The uninitialized extent called at this time will be split 3560 * into three uninitialized extent(at most). After IO complete, the part 3561 * being filled will be convert to initialized by the end_io callback function 3562 * via ext4_convert_unwritten_extents(). 
3563 * 3564 * Returns the size of uninitialized extent to be written on success. 3565 */ 3566 static int ext4_split_unwritten_extents(handle_t *handle, 3567 struct inode *inode, 3568 struct ext4_map_blocks *map, 3569 struct ext4_ext_path *path, 3570 int flags) 3571 { 3572 ext4_lblk_t eof_block; 3573 ext4_lblk_t ee_block; 3574 struct ext4_extent *ex; 3575 unsigned int ee_len; 3576 int split_flag = 0, depth; 3577 3578 ext_debug("ext4_split_unwritten_extents: inode %lu, logical" 3579 "block %llu, max_blocks %u\n", inode->i_ino, 3580 (unsigned long long)map->m_lblk, map->m_len); 3581 3582 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> 3583 inode->i_sb->s_blocksize_bits; 3584 if (eof_block < map->m_lblk + map->m_len) 3585 eof_block = map->m_lblk + map->m_len; 3586 /* 3587 * It is safe to convert extent to initialized via explicit 3588 * zeroout only if extent is fully insde i_size or new_size. 3589 */ 3590 depth = ext_depth(inode); 3591 ex = path[depth].p_ext; 3592 ee_block = le32_to_cpu(ex->ee_block); 3593 ee_len = ext4_ext_get_actual_len(ex); 3594 3595 split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; 3596 split_flag |= EXT4_EXT_MARK_UNINIT2; 3597 if (flags & EXT4_GET_BLOCKS_CONVERT) 3598 split_flag |= EXT4_EXT_DATA_VALID2; 3599 flags |= EXT4_GET_BLOCKS_PRE_IO; 3600 return ext4_split_extent(handle, inode, path, map, split_flag, flags); 3601 } 3602 3603 static int ext4_convert_unwritten_extents_endio(handle_t *handle, 3604 struct inode *inode, 3605 struct ext4_map_blocks *map, 3606 struct ext4_ext_path *path) 3607 { 3608 struct ext4_extent *ex; 3609 ext4_lblk_t ee_block; 3610 unsigned int ee_len; 3611 int depth; 3612 int err = 0; 3613 3614 depth = ext_depth(inode); 3615 ex = path[depth].p_ext; 3616 ee_block = le32_to_cpu(ex->ee_block); 3617 ee_len = ext4_ext_get_actual_len(ex); 3618 3619 ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical" 3620 "block %llu, max_blocks %u\n", inode->i_ino, 3621 (unsigned long long)ee_block, ee_len); 3622 3623 /* If extent is larger than requested it is a clear sign that we still 3624 * have some extent state machine issues left. So extent_split is still 3625 * required. 3626 * TODO: Once all related issues will be fixed this situation should be 3627 * illegal. 
3628 */ 3629 if (ee_block != map->m_lblk || ee_len > map->m_len) { 3630 #ifdef EXT4_DEBUG 3631 ext4_warning("Inode (%ld) finished: extent logical block %llu," 3632 " len %u; IO logical block %llu, len %u\n", 3633 inode->i_ino, (unsigned long long)ee_block, ee_len, 3634 (unsigned long long)map->m_lblk, map->m_len); 3635 #endif 3636 err = ext4_split_unwritten_extents(handle, inode, map, path, 3637 EXT4_GET_BLOCKS_CONVERT); 3638 if (err < 0) 3639 goto out; 3640 ext4_ext_drop_refs(path); 3641 path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); 3642 if (IS_ERR(path)) { 3643 err = PTR_ERR(path); 3644 goto out; 3645 } 3646 depth = ext_depth(inode); 3647 ex = path[depth].p_ext; 3648 } 3649 3650 err = ext4_ext_get_access(handle, inode, path + depth); 3651 if (err) 3652 goto out; 3653 /* first mark the extent as initialized */ 3654 ext4_ext_mark_initialized(ex); 3655 3656 /* note: ext4_ext_correct_indexes() isn't needed here because 3657 * borders are not changed 3658 */ 3659 ext4_ext_try_to_merge(handle, inode, path, ex); 3660 3661 /* Mark modified extent as dirty */ 3662 err = ext4_ext_dirty(handle, inode, path + path->p_depth); 3663 out: 3664 ext4_ext_show_leaf(inode, path); 3665 return err; 3666 } 3667 3668 static void unmap_underlying_metadata_blocks(struct block_device *bdev, 3669 sector_t block, int count) 3670 { 3671 int i; 3672 for (i = 0; i < count; i++) 3673 unmap_underlying_metadata(bdev, block + i); 3674 } 3675 3676 /* 3677 * Handle EOFBLOCKS_FL flag, clearing it if necessary 3678 */ 3679 static int check_eofblocks_fl(handle_t *handle, struct inode *inode, 3680 ext4_lblk_t lblk, 3681 struct ext4_ext_path *path, 3682 unsigned int len) 3683 { 3684 int i, depth; 3685 struct ext4_extent_header *eh; 3686 struct ext4_extent *last_ex; 3687 3688 if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) 3689 return 0; 3690 3691 depth = ext_depth(inode); 3692 eh = path[depth].p_hdr; 3693 3694 /* 3695 * We're going to remove EOFBLOCKS_FL entirely in future so we 3696 * do not care for this case anymore. Simply remove the flag 3697 * if there are no extents. 3698 */ 3699 if (unlikely(!eh->eh_entries)) 3700 goto out; 3701 last_ex = EXT_LAST_EXTENT(eh); 3702 /* 3703 * We should clear the EOFBLOCKS_FL flag if we are writing the 3704 * last block in the last extent in the file. We test this by 3705 * first checking to see if the caller to 3706 * ext4_ext_get_blocks() was interested in the last block (or 3707 * a block beyond the last block) in the current extent. If 3708 * this turns out to be false, we can bail out from this 3709 * function immediately. 3710 */ 3711 if (lblk + len < le32_to_cpu(last_ex->ee_block) + 3712 ext4_ext_get_actual_len(last_ex)) 3713 return 0; 3714 /* 3715 * If the caller does appear to be planning to write at or 3716 * beyond the end of the current extent, we then test to see 3717 * if the current extent is the last extent in the file, by 3718 * checking to make sure it was reached via the rightmost node 3719 * at each level of the tree. 3720 */ 3721 for (i = depth-1; i >= 0; i--) 3722 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr)) 3723 return 0; 3724 out: 3725 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); 3726 return ext4_mark_inode_dirty(handle, inode); 3727 } 3728 3729 /** 3730 * ext4_find_delalloc_range: find delayed allocated block in the given range. 3731 * 3732 * Return 1 if there is a delalloc block in the range, otherwise 0. 
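 *
 * (The check below asks the extent status tree for the first delayed
 *  extent at or after lblk_start and then tests whether it overlaps
 *  the range [lblk_start, lblk_end].)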
3733 */ 3734 int ext4_find_delalloc_range(struct inode *inode, 3735 ext4_lblk_t lblk_start, 3736 ext4_lblk_t lblk_end) 3737 { 3738 struct extent_status es; 3739 3740 ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es); 3741 if (es.es_len == 0) 3742 return 0; /* there is no delay extent in this tree */ 3743 else if (es.es_lblk <= lblk_start && 3744 lblk_start < es.es_lblk + es.es_len) 3745 return 1; 3746 else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end) 3747 return 1; 3748 else 3749 return 0; 3750 } 3751 3752 int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk) 3753 { 3754 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 3755 ext4_lblk_t lblk_start, lblk_end; 3756 lblk_start = lblk & (~(sbi->s_cluster_ratio - 1)); 3757 lblk_end = lblk_start + sbi->s_cluster_ratio - 1; 3758 3759 return ext4_find_delalloc_range(inode, lblk_start, lblk_end); 3760 } 3761 3762 /** 3763 * Determines how many complete clusters (out of those specified by the 'map') 3764 * are under delalloc and were reserved quota for. 3765 * This function is called when we are writing out the blocks that were 3766 * originally written with their allocation delayed, but then the space was 3767 * allocated using fallocate() before the delayed allocation could be resolved. 3768 * The cases to look for are: 3769 * ('=' indicated delayed allocated blocks 3770 * '-' indicates non-delayed allocated blocks) 3771 * (a) partial clusters towards beginning and/or end outside of allocated range 3772 * are not delalloc'ed. 3773 * Ex: 3774 * |----c---=|====c====|====c====|===-c----| 3775 * |++++++ allocated ++++++| 3776 * ==> 4 complete clusters in above example 3777 * 3778 * (b) partial cluster (outside of allocated range) towards either end is 3779 * marked for delayed allocation. In this case, we will exclude that 3780 * cluster. 3781 * Ex: 3782 * |----====c========|========c========| 3783 * |++++++ allocated ++++++| 3784 * ==> 1 complete clusters in above example 3785 * 3786 * Ex: 3787 * |================c================| 3788 * |++++++ allocated ++++++| 3789 * ==> 0 complete clusters in above example 3790 * 3791 * The ext4_da_update_reserve_space will be called only if we 3792 * determine here that there were some "entire" clusters that span 3793 * this 'allocated' range. 3794 * In the non-bigalloc case, this function will just end up returning num_blks 3795 * without ever calling ext4_find_delalloc_range. 3796 */ 3797 static unsigned int 3798 get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, 3799 unsigned int num_blks) 3800 { 3801 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 3802 ext4_lblk_t alloc_cluster_start, alloc_cluster_end; 3803 ext4_lblk_t lblk_from, lblk_to, c_offset; 3804 unsigned int allocated_clusters = 0; 3805 3806 alloc_cluster_start = EXT4_B2C(sbi, lblk_start); 3807 alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1); 3808 3809 /* max possible clusters for this allocation */ 3810 allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1; 3811 3812 trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks); 3813 3814 /* Check towards left side */ 3815 c_offset = lblk_start & (sbi->s_cluster_ratio - 1); 3816 if (c_offset) { 3817 lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1)); 3818 lblk_to = lblk_from + c_offset - 1; 3819 3820 if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) 3821 allocated_clusters--; 3822 } 3823 3824 /* Now check towards right. 
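 * For example, with a cluster ratio of 4 and an allocation covering
 * blocks 6..13: c_offset is 2, so blocks 14..15 complete the last
 * cluster, and if either of them is delalloc that boundary cluster is
 * excluded from the count (case (b) above).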
*/ 3825 c_offset = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1); 3826 if (allocated_clusters && c_offset) { 3827 lblk_from = lblk_start + num_blks; 3828 lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; 3829 3830 if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) 3831 allocated_clusters--; 3832 } 3833 3834 return allocated_clusters; 3835 } 3836 3837 static int 3838 ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, 3839 struct ext4_map_blocks *map, 3840 struct ext4_ext_path *path, int flags, 3841 unsigned int allocated, ext4_fsblk_t newblock) 3842 { 3843 int ret = 0; 3844 int err = 0; 3845 ext4_io_end_t *io = ext4_inode_aio(inode); 3846 3847 ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical " 3848 "block %llu, max_blocks %u, flags %x, allocated %u\n", 3849 inode->i_ino, (unsigned long long)map->m_lblk, map->m_len, 3850 flags, allocated); 3851 ext4_ext_show_leaf(inode, path); 3852 3853 /* 3854 * When writing into uninitialized space, we should not fail to 3855 * allocate metadata blocks for the new extent block if needed. 3856 */ 3857 flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL; 3858 3859 trace_ext4_ext_handle_uninitialized_extents(inode, map, flags, 3860 allocated, newblock); 3861 3862 /* get_block() before submit the IO, split the extent */ 3863 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3864 ret = ext4_split_unwritten_extents(handle, inode, map, 3865 path, flags); 3866 if (ret <= 0) 3867 goto out; 3868 /* 3869 * Flag the inode(non aio case) or end_io struct (aio case) 3870 * that this IO needs to conversion to written when IO is 3871 * completed 3872 */ 3873 if (io) 3874 ext4_set_io_unwritten_flag(inode, io); 3875 else 3876 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3877 map->m_flags |= EXT4_MAP_UNWRITTEN; 3878 if (ext4_should_dioread_nolock(inode)) 3879 map->m_flags |= EXT4_MAP_UNINIT; 3880 goto out; 3881 } 3882 /* IO end_io complete, convert the filled extent to written */ 3883 if ((flags & EXT4_GET_BLOCKS_CONVERT)) { 3884 ret = ext4_convert_unwritten_extents_endio(handle, inode, map, 3885 path); 3886 if (ret >= 0) { 3887 ext4_update_inode_fsync_trans(handle, inode, 1); 3888 err = check_eofblocks_fl(handle, inode, map->m_lblk, 3889 path, map->m_len); 3890 } else 3891 err = ret; 3892 map->m_flags |= EXT4_MAP_MAPPED; 3893 if (allocated > map->m_len) 3894 allocated = map->m_len; 3895 map->m_len = allocated; 3896 goto out2; 3897 } 3898 /* buffered IO case */ 3899 /* 3900 * repeat fallocate creation request 3901 * we already have an unwritten extent 3902 */ 3903 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) { 3904 map->m_flags |= EXT4_MAP_UNWRITTEN; 3905 goto map_out; 3906 } 3907 3908 /* buffered READ or buffered write_begin() lookup */ 3909 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3910 /* 3911 * We have blocks reserved already. We 3912 * return allocated blocks so that delalloc 3913 * won't do block reservation for us. But 3914 * the buffer head will be unmapped so that 3915 * a read from the block returns 0s. 
3916 */ 3917 map->m_flags |= EXT4_MAP_UNWRITTEN; 3918 goto out1; 3919 } 3920 3921 /* buffered write, writepage time, convert*/ 3922 ret = ext4_ext_convert_to_initialized(handle, inode, map, path, flags); 3923 if (ret >= 0) 3924 ext4_update_inode_fsync_trans(handle, inode, 1); 3925 out: 3926 if (ret <= 0) { 3927 err = ret; 3928 goto out2; 3929 } else 3930 allocated = ret; 3931 map->m_flags |= EXT4_MAP_NEW; 3932 /* 3933 * if we allocated more blocks than requested 3934 * we need to make sure we unmap the extra block 3935 * allocated. The actual needed block will get 3936 * unmapped later when we find the buffer_head marked 3937 * new. 3938 */ 3939 if (allocated > map->m_len) { 3940 unmap_underlying_metadata_blocks(inode->i_sb->s_bdev, 3941 newblock + map->m_len, 3942 allocated - map->m_len); 3943 allocated = map->m_len; 3944 } 3945 map->m_len = allocated; 3946 3947 /* 3948 * If we have done fallocate with the offset that is already 3949 * delayed allocated, we would have block reservation 3950 * and quota reservation done in the delayed write path. 3951 * But fallocate would have already updated quota and block 3952 * count for this offset. So cancel these reservation 3953 */ 3954 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { 3955 unsigned int reserved_clusters; 3956 reserved_clusters = get_reserved_cluster_alloc(inode, 3957 map->m_lblk, map->m_len); 3958 if (reserved_clusters) 3959 ext4_da_update_reserve_space(inode, 3960 reserved_clusters, 3961 0); 3962 } 3963 3964 map_out: 3965 map->m_flags |= EXT4_MAP_MAPPED; 3966 if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) { 3967 err = check_eofblocks_fl(handle, inode, map->m_lblk, path, 3968 map->m_len); 3969 if (err < 0) 3970 goto out2; 3971 } 3972 out1: 3973 if (allocated > map->m_len) 3974 allocated = map->m_len; 3975 ext4_ext_show_leaf(inode, path); 3976 map->m_pblk = newblock; 3977 map->m_len = allocated; 3978 out2: 3979 if (path) { 3980 ext4_ext_drop_refs(path); 3981 kfree(path); 3982 } 3983 return err ? err : allocated; 3984 } 3985 3986 /* 3987 * get_implied_cluster_alloc - check to see if the requested 3988 * allocation (in the map structure) overlaps with a cluster already 3989 * allocated in an extent. 3990 * @sb The filesystem superblock structure 3991 * @map The requested lblk->pblk mapping 3992 * @ex The extent structure which might contain an implied 3993 * cluster allocation 3994 * 3995 * This function is called by ext4_ext_map_blocks() after we failed to 3996 * find blocks that were already in the inode's extent tree. Hence, 3997 * we know that the beginning of the requested region cannot overlap 3998 * the extent from the inode's extent tree. There are three cases we 3999 * want to catch. 
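 */

/*
 * A minimal, self-contained sketch of the cluster arithmetic that the
 * three cases below share, assuming a power-of-two cluster ratio.  This
 * is for illustration only: the demo_* names are placeholders and the
 * map->m_len clamping done by the real function is omitted.
 */
#include <stdint.h>

typedef uint64_t demo_blk_t;	/* stands in for ext4_lblk_t/ext4_fsblk_t */

/* block number -> cluster number, for a power-of-two cluster ratio */
static inline demo_blk_t demo_b2c(demo_blk_t block, unsigned int ratio)
{
	return block / ratio;
}

/*
 * If the requested logical block lands in the same cluster as the first
 * or the last block of the extent, derive its physical block from that
 * cluster, mirroring how the function below fills in map->m_pblk.
 *
 * E.g. ratio 4, extent: logical 8..9 at physical 100..101, request for
 * logical block 11: both live in cluster 2, so the result is
 * (101 & ~3) + (11 & 3) = 100 + 3 = 103.
 */
static int demo_implied_cluster_pblk(demo_blk_t lblk, demo_blk_t ee_block,
				     demo_blk_t ee_len, demo_blk_t ee_start,
				     unsigned int ratio, demo_blk_t *pblk)
{
	demo_blk_t c_offset = lblk & (ratio - 1);
	demo_blk_t ex_cluster_start = demo_b2c(ee_block, ratio);
	demo_blk_t ex_cluster_end = demo_b2c(ee_block + ee_len - 1, ratio);
	demo_blk_t rr_cluster_start = demo_b2c(lblk, ratio);

	if (rr_cluster_start != ex_cluster_start &&
	    rr_cluster_start != ex_cluster_end)
		return 0;

	if (rr_cluster_start == ex_cluster_end)
		ee_start += ee_len - 1;
	*pblk = (ee_start & ~((demo_blk_t)ratio - 1)) + c_offset;
	return 1;
}

/*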
The first is this case: 4000 * 4001 * |--- cluster # N--| 4002 * |--- extent ---| |---- requested region ---| 4003 * |==========| 4004 * 4005 * The second case that we need to test for is this one: 4006 * 4007 * |--------- cluster # N ----------------| 4008 * |--- requested region --| |------- extent ----| 4009 * |=======================| 4010 * 4011 * The third case is when the requested region lies between two extents 4012 * within the same cluster: 4013 * |------------- cluster # N-------------| 4014 * |----- ex -----| |---- ex_right ----| 4015 * |------ requested region ------| 4016 * |================| 4017 * 4018 * In each of the above cases, we need to set the map->m_pblk and 4019 * map->m_len so it corresponds to the return the extent labelled as 4020 * "|====|" from cluster #N, since it is already in use for data in 4021 * cluster EXT4_B2C(sbi, map->m_lblk). We will then return 1 to 4022 * signal to ext4_ext_map_blocks() that map->m_pblk should be treated 4023 * as a new "allocated" block region. Otherwise, we will return 0 and 4024 * ext4_ext_map_blocks() will then allocate one or more new clusters 4025 * by calling ext4_mb_new_blocks(). 4026 */ 4027 static int get_implied_cluster_alloc(struct super_block *sb, 4028 struct ext4_map_blocks *map, 4029 struct ext4_extent *ex, 4030 struct ext4_ext_path *path) 4031 { 4032 struct ext4_sb_info *sbi = EXT4_SB(sb); 4033 ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1); 4034 ext4_lblk_t ex_cluster_start, ex_cluster_end; 4035 ext4_lblk_t rr_cluster_start; 4036 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); 4037 ext4_fsblk_t ee_start = ext4_ext_pblock(ex); 4038 unsigned short ee_len = ext4_ext_get_actual_len(ex); 4039 4040 /* The extent passed in that we are trying to match */ 4041 ex_cluster_start = EXT4_B2C(sbi, ee_block); 4042 ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1); 4043 4044 /* The requested region passed into ext4_map_blocks() */ 4045 rr_cluster_start = EXT4_B2C(sbi, map->m_lblk); 4046 4047 if ((rr_cluster_start == ex_cluster_end) || 4048 (rr_cluster_start == ex_cluster_start)) { 4049 if (rr_cluster_start == ex_cluster_end) 4050 ee_start += ee_len - 1; 4051 map->m_pblk = (ee_start & ~(sbi->s_cluster_ratio - 1)) + 4052 c_offset; 4053 map->m_len = min(map->m_len, 4054 (unsigned) sbi->s_cluster_ratio - c_offset); 4055 /* 4056 * Check for and handle this case: 4057 * 4058 * |--------- cluster # N-------------| 4059 * |------- extent ----| 4060 * |--- requested region ---| 4061 * |===========| 4062 */ 4063 4064 if (map->m_lblk < ee_block) 4065 map->m_len = min(map->m_len, ee_block - map->m_lblk); 4066 4067 /* 4068 * Check for the case where there is already another allocated 4069 * block to the right of 'ex' but before the end of the cluster. 4070 * 4071 * |------------- cluster # N-------------| 4072 * |----- ex -----| |---- ex_right ----| 4073 * |------ requested region ------| 4074 * |================| 4075 */ 4076 if (map->m_lblk > ee_block) { 4077 ext4_lblk_t next = ext4_ext_next_allocated_block(path); 4078 map->m_len = min(map->m_len, next - map->m_lblk); 4079 } 4080 4081 trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1); 4082 return 1; 4083 } 4084 4085 trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0); 4086 return 0; 4087 } 4088 4089 4090 /* 4091 * Block allocation/map/preallocation routine for extents based files 4092 * 4093 * 4094 * Need to be called with 4095 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block 4096 * (ie, create is zero). 
Otherwise down_write(&EXT4_I(inode)->i_data_sem) 4097 * 4098 * return > 0, number of of blocks already mapped/allocated 4099 * if create == 0 and these are pre-allocated blocks 4100 * buffer head is unmapped 4101 * otherwise blocks are mapped 4102 * 4103 * return = 0, if plain look up failed (blocks have not been allocated) 4104 * buffer head is unmapped 4105 * 4106 * return < 0, error case. 4107 */ 4108 int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, 4109 struct ext4_map_blocks *map, int flags) 4110 { 4111 struct ext4_ext_path *path = NULL; 4112 struct ext4_extent newex, *ex, *ex2; 4113 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 4114 ext4_fsblk_t newblock = 0; 4115 int free_on_err = 0, err = 0, depth; 4116 unsigned int allocated = 0, offset = 0; 4117 unsigned int allocated_clusters = 0; 4118 struct ext4_allocation_request ar; 4119 ext4_io_end_t *io = ext4_inode_aio(inode); 4120 ext4_lblk_t cluster_offset; 4121 int set_unwritten = 0; 4122 4123 ext_debug("blocks %u/%u requested for inode %lu\n", 4124 map->m_lblk, map->m_len, inode->i_ino); 4125 trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); 4126 4127 /* find extent for this block */ 4128 path = ext4_ext_find_extent(inode, map->m_lblk, NULL, 0); 4129 if (IS_ERR(path)) { 4130 err = PTR_ERR(path); 4131 path = NULL; 4132 goto out2; 4133 } 4134 4135 depth = ext_depth(inode); 4136 4137 /* 4138 * consistent leaf must not be empty; 4139 * this situation is possible, though, _during_ tree modification; 4140 * this is why assert can't be put in ext4_ext_find_extent() 4141 */ 4142 if (unlikely(path[depth].p_ext == NULL && depth != 0)) { 4143 EXT4_ERROR_INODE(inode, "bad extent address " 4144 "lblock: %lu, depth: %d pblock %lld", 4145 (unsigned long) map->m_lblk, depth, 4146 path[depth].p_block); 4147 err = -EIO; 4148 goto out2; 4149 } 4150 4151 ex = path[depth].p_ext; 4152 if (ex) { 4153 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); 4154 ext4_fsblk_t ee_start = ext4_ext_pblock(ex); 4155 unsigned short ee_len; 4156 4157 /* 4158 * Uninitialized extents are treated as holes, except that 4159 * we split out initialized portions during a write. 4160 */ 4161 ee_len = ext4_ext_get_actual_len(ex); 4162 4163 trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len); 4164 4165 /* if found extent covers block, simply return it */ 4166 if (in_range(map->m_lblk, ee_block, ee_len)) { 4167 newblock = map->m_lblk - ee_block + ee_start; 4168 /* number of remaining blocks in the extent */ 4169 allocated = ee_len - (map->m_lblk - ee_block); 4170 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, 4171 ee_block, ee_len, newblock); 4172 4173 if (!ext4_ext_is_uninitialized(ex)) 4174 goto out; 4175 4176 allocated = ext4_ext_handle_uninitialized_extents( 4177 handle, inode, map, path, flags, 4178 allocated, newblock); 4179 goto out3; 4180 } 4181 } 4182 4183 if ((sbi->s_cluster_ratio > 1) && 4184 ext4_find_delalloc_cluster(inode, map->m_lblk)) 4185 map->m_flags |= EXT4_MAP_FROM_CLUSTER; 4186 4187 /* 4188 * requested block isn't allocated yet; 4189 * we couldn't try to create block if create flag is zero 4190 */ 4191 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 4192 /* 4193 * put just found gap into cache to speed up 4194 * subsequent requests 4195 */ 4196 if ((flags & EXT4_GET_BLOCKS_NO_PUT_HOLE) == 0) 4197 ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); 4198 goto out2; 4199 } 4200 4201 /* 4202 * Okay, we need to do block allocation. 
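 * In outline: 1) with bigalloc, see whether the extent found above
 * already implies a usable cluster (get_implied_cluster_alloc());
 * 2) look up the allocated neighbours on the left and right, and give
 * the extent found on the right the same implied-cluster check;
 * 3) cap the request at EXT_INIT_MAX_LEN/EXT_UNINIT_MAX_LEN and at the
 * next existing extent; 4) pick a goal, align the request to the
 * cluster boundary and call ext4_mb_new_blocks(); 5) insert the new
 * extent and settle any delalloc reservation and quota.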
4203 */ 4204 map->m_flags &= ~EXT4_MAP_FROM_CLUSTER; 4205 newex.ee_block = cpu_to_le32(map->m_lblk); 4206 cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1); 4207 4208 /* 4209 * If we are doing bigalloc, check to see if the extent returned 4210 * by ext4_ext_find_extent() implies a cluster we can use. 4211 */ 4212 if (cluster_offset && ex && 4213 get_implied_cluster_alloc(inode->i_sb, map, ex, path)) { 4214 ar.len = allocated = map->m_len; 4215 newblock = map->m_pblk; 4216 map->m_flags |= EXT4_MAP_FROM_CLUSTER; 4217 goto got_allocated_blocks; 4218 } 4219 4220 /* find neighbour allocated blocks */ 4221 ar.lleft = map->m_lblk; 4222 err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); 4223 if (err) 4224 goto out2; 4225 ar.lright = map->m_lblk; 4226 ex2 = NULL; 4227 err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2); 4228 if (err) 4229 goto out2; 4230 4231 /* Check if the extent after searching to the right implies a 4232 * cluster we can use. */ 4233 if ((sbi->s_cluster_ratio > 1) && ex2 && 4234 get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) { 4235 ar.len = allocated = map->m_len; 4236 newblock = map->m_pblk; 4237 map->m_flags |= EXT4_MAP_FROM_CLUSTER; 4238 goto got_allocated_blocks; 4239 } 4240 4241 /* 4242 * See if request is beyond maximum number of blocks we can have in 4243 * a single extent. For an initialized extent this limit is 4244 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is 4245 * EXT_UNINIT_MAX_LEN. 4246 */ 4247 if (map->m_len > EXT_INIT_MAX_LEN && 4248 !(flags & EXT4_GET_BLOCKS_UNINIT_EXT)) 4249 map->m_len = EXT_INIT_MAX_LEN; 4250 else if (map->m_len > EXT_UNINIT_MAX_LEN && 4251 (flags & EXT4_GET_BLOCKS_UNINIT_EXT)) 4252 map->m_len = EXT_UNINIT_MAX_LEN; 4253 4254 /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */ 4255 newex.ee_len = cpu_to_le16(map->m_len); 4256 err = ext4_ext_check_overlap(sbi, inode, &newex, path); 4257 if (err) 4258 allocated = ext4_ext_get_actual_len(&newex); 4259 else 4260 allocated = map->m_len; 4261 4262 /* allocate new block */ 4263 ar.inode = inode; 4264 ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk); 4265 ar.logical = map->m_lblk; 4266 /* 4267 * We calculate the offset from the beginning of the cluster 4268 * for the logical block number, since when we allocate a 4269 * physical cluster, the physical block should start at the 4270 * same offset from the beginning of the cluster. This is 4271 * needed so that future calls to get_implied_cluster_alloc() 4272 * work correctly. 
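 * As a worked example: with s_cluster_ratio = 16 and map->m_lblk = 36,
 * offset = 36 & 15 = 4.  If 10 blocks are needed, EXT4_NUM_B2C() asks
 * for ceil((4 + 10) / 16) = 1 cluster, and goal and logical are pulled
 * back by 4 blocks so the cluster gets allocated aligned; afterwards
 * ar.len becomes 16 - 4 = 12 usable blocks, is trimmed back to the 10
 * requested, and the block that maps m_lblk is newblock + offset.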
4273 */ 4274 offset = map->m_lblk & (sbi->s_cluster_ratio - 1); 4275 ar.len = EXT4_NUM_B2C(sbi, offset+allocated); 4276 ar.goal -= offset; 4277 ar.logical -= offset; 4278 if (S_ISREG(inode->i_mode)) 4279 ar.flags = EXT4_MB_HINT_DATA; 4280 else 4281 /* disable in-core preallocation for non-regular files */ 4282 ar.flags = 0; 4283 if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE) 4284 ar.flags |= EXT4_MB_HINT_NOPREALLOC; 4285 newblock = ext4_mb_new_blocks(handle, &ar, &err); 4286 if (!newblock) 4287 goto out2; 4288 ext_debug("allocate new block: goal %llu, found %llu/%u\n", 4289 ar.goal, newblock, allocated); 4290 free_on_err = 1; 4291 allocated_clusters = ar.len; 4292 ar.len = EXT4_C2B(sbi, ar.len) - offset; 4293 if (ar.len > allocated) 4294 ar.len = allocated; 4295 4296 got_allocated_blocks: 4297 /* try to insert new extent into found leaf and return */ 4298 ext4_ext_store_pblock(&newex, newblock + offset); 4299 newex.ee_len = cpu_to_le16(ar.len); 4300 /* Mark uninitialized */ 4301 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ 4302 ext4_ext_mark_uninitialized(&newex); 4303 map->m_flags |= EXT4_MAP_UNWRITTEN; 4304 /* 4305 * io_end structure was created for every IO write to an 4306 * uninitialized extent. To avoid unnecessary conversion, 4307 * here we flag the IO that really needs the conversion. 4308 * For non asycn direct IO case, flag the inode state 4309 * that we need to perform conversion when IO is done. 4310 */ 4311 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) 4312 set_unwritten = 1; 4313 if (ext4_should_dioread_nolock(inode)) 4314 map->m_flags |= EXT4_MAP_UNINIT; 4315 } 4316 4317 err = 0; 4318 if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) 4319 err = check_eofblocks_fl(handle, inode, map->m_lblk, 4320 path, ar.len); 4321 if (!err) 4322 err = ext4_ext_insert_extent(handle, inode, path, 4323 &newex, flags); 4324 4325 if (!err && set_unwritten) { 4326 if (io) 4327 ext4_set_io_unwritten_flag(inode, io); 4328 else 4329 ext4_set_inode_state(inode, 4330 EXT4_STATE_DIO_UNWRITTEN); 4331 } 4332 4333 if (err && free_on_err) { 4334 int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? 4335 EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0; 4336 /* free data blocks we just allocated */ 4337 /* not a good idea to call discard here directly, 4338 * but otherwise we'd need to call it every free() */ 4339 ext4_discard_preallocations(inode); 4340 ext4_free_blocks(handle, inode, NULL, newblock, 4341 EXT4_C2B(sbi, allocated_clusters), fb_flags); 4342 goto out2; 4343 } 4344 4345 /* previous routine could use block we allocated */ 4346 newblock = ext4_ext_pblock(&newex); 4347 allocated = ext4_ext_get_actual_len(&newex); 4348 if (allocated > map->m_len) 4349 allocated = map->m_len; 4350 map->m_flags |= EXT4_MAP_NEW; 4351 4352 /* 4353 * Update reserved blocks/metadata blocks after successful 4354 * block allocation which had been deferred till now. 4355 */ 4356 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { 4357 unsigned int reserved_clusters; 4358 /* 4359 * Check how many clusters we had reserved this allocated range 4360 */ 4361 reserved_clusters = get_reserved_cluster_alloc(inode, 4362 map->m_lblk, allocated); 4363 if (map->m_flags & EXT4_MAP_FROM_CLUSTER) { 4364 if (reserved_clusters) { 4365 /* 4366 * We have clusters reserved for this range. 4367 * But since we are not doing actual allocation 4368 * and are simply using blocks from previously 4369 * allocated cluster, we should release the 4370 * reservation and not claim quota. 
4371 */ 4372 ext4_da_update_reserve_space(inode, 4373 reserved_clusters, 0); 4374 } 4375 } else { 4376 BUG_ON(allocated_clusters < reserved_clusters); 4377 if (reserved_clusters < allocated_clusters) { 4378 struct ext4_inode_info *ei = EXT4_I(inode); 4379 int reservation = allocated_clusters - 4380 reserved_clusters; 4381 /* 4382 * It seems we claimed few clusters outside of 4383 * the range of this allocation. We should give 4384 * it back to the reservation pool. This can 4385 * happen in the following case: 4386 * 4387 * * Suppose s_cluster_ratio is 4 (i.e., each 4388 * cluster has 4 blocks. Thus, the clusters 4389 * are [0-3],[4-7],[8-11]... 4390 * * First comes delayed allocation write for 4391 * logical blocks 10 & 11. Since there were no 4392 * previous delayed allocated blocks in the 4393 * range [8-11], we would reserve 1 cluster 4394 * for this write. 4395 * * Next comes write for logical blocks 3 to 8. 4396 * In this case, we will reserve 2 clusters 4397 * (for [0-3] and [4-7]; and not for [8-11] as 4398 * that range has a delayed allocated blocks. 4399 * Thus total reserved clusters now becomes 3. 4400 * * Now, during the delayed allocation writeout 4401 * time, we will first write blocks [3-8] and 4402 * allocate 3 clusters for writing these 4403 * blocks. Also, we would claim all these 4404 * three clusters above. 4405 * * Now when we come here to writeout the 4406 * blocks [10-11], we would expect to claim 4407 * the reservation of 1 cluster we had made 4408 * (and we would claim it since there are no 4409 * more delayed allocated blocks in the range 4410 * [8-11]. But our reserved cluster count had 4411 * already gone to 0. 4412 * 4413 * Thus, at the step 4 above when we determine 4414 * that there are still some unwritten delayed 4415 * allocated blocks outside of our current 4416 * block range, we should increment the 4417 * reserved clusters count so that when the 4418 * remaining blocks finally gets written, we 4419 * could claim them. 4420 */ 4421 dquot_reserve_block(inode, 4422 EXT4_C2B(sbi, reservation)); 4423 spin_lock(&ei->i_block_reservation_lock); 4424 ei->i_reserved_data_blocks += reservation; 4425 spin_unlock(&ei->i_block_reservation_lock); 4426 } 4427 /* 4428 * We will claim quota for all newly allocated blocks. 4429 * We're updating the reserved space *after* the 4430 * correction above so we do not accidentally free 4431 * all the metadata reservation because we might 4432 * actually need it later on. 4433 */ 4434 ext4_da_update_reserve_space(inode, allocated_clusters, 4435 1); 4436 } 4437 } 4438 4439 /* 4440 * Cache the extent and update transaction to commit on fdatasync only 4441 * when it is _not_ an uninitialized extent. 4442 */ 4443 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) 4444 ext4_update_inode_fsync_trans(handle, inode, 1); 4445 else 4446 ext4_update_inode_fsync_trans(handle, inode, 0); 4447 out: 4448 if (allocated > map->m_len) 4449 allocated = map->m_len; 4450 ext4_ext_show_leaf(inode, path); 4451 map->m_flags |= EXT4_MAP_MAPPED; 4452 map->m_pblk = newblock; 4453 map->m_len = allocated; 4454 out2: 4455 if (path) { 4456 ext4_ext_drop_refs(path); 4457 kfree(path); 4458 } 4459 4460 out3: 4461 trace_ext4_ext_map_blocks_exit(inode, flags, map, 4462 err ? err : allocated); 4463 ext4_es_lru_add(inode); 4464 return err ? 
err : allocated; 4465 } 4466 4467 void ext4_ext_truncate(handle_t *handle, struct inode *inode) 4468 { 4469 struct super_block *sb = inode->i_sb; 4470 ext4_lblk_t last_block; 4471 int err = 0; 4472 4473 /* 4474 * TODO: optimization is possible here. 4475 * Probably we need not scan at all, 4476 * because page truncation is enough. 4477 */ 4478 4479 /* we have to know where to truncate from in crash case */ 4480 EXT4_I(inode)->i_disksize = inode->i_size; 4481 ext4_mark_inode_dirty(handle, inode); 4482 4483 last_block = (inode->i_size + sb->s_blocksize - 1) 4484 >> EXT4_BLOCK_SIZE_BITS(sb); 4485 retry: 4486 err = ext4_es_remove_extent(inode, last_block, 4487 EXT_MAX_BLOCKS - last_block); 4488 if (err == -ENOMEM) { 4489 cond_resched(); 4490 congestion_wait(BLK_RW_ASYNC, HZ/50); 4491 goto retry; 4492 } 4493 if (err) { 4494 ext4_std_error(inode->i_sb, err); 4495 return; 4496 } 4497 err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); 4498 ext4_std_error(inode->i_sb, err); 4499 } 4500 4501 static void ext4_falloc_update_inode(struct inode *inode, 4502 int mode, loff_t new_size, int update_ctime) 4503 { 4504 struct timespec now; 4505 4506 if (update_ctime) { 4507 now = current_fs_time(inode->i_sb); 4508 if (!timespec_equal(&inode->i_ctime, &now)) 4509 inode->i_ctime = now; 4510 } 4511 /* 4512 * Update only when preallocation was requested beyond 4513 * the file size. 4514 */ 4515 if (!(mode & FALLOC_FL_KEEP_SIZE)) { 4516 if (new_size > i_size_read(inode)) 4517 i_size_write(inode, new_size); 4518 if (new_size > EXT4_I(inode)->i_disksize) 4519 ext4_update_i_disksize(inode, new_size); 4520 } else { 4521 /* 4522 * Mark that we allocate beyond EOF so the subsequent truncate 4523 * can proceed even if the new size is the same as i_size. 4524 */ 4525 if (new_size > i_size_read(inode)) 4526 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); 4527 } 4528 4529 } 4530 4531 /* 4532 * preallocate space for a file. This implements ext4's fallocate file 4533 * operation, which gets called from sys_fallocate system call. 4534 * For block-mapped files, posix_fallocate should fall back to the method 4535 * of writing zeroes to the required new blocks (the same behavior which is 4536 * expected for file systems which do not support fallocate() system call). 
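 */

/*
 * For illustration, a minimal userspace caller of the interface
 * implemented below.  It exercises the two kinds of requests accepted
 * here: preallocation (optionally with FALLOC_FL_KEEP_SIZE) and hole
 * punching.  The file name and sizes are arbitrary placeholders and
 * error handling is abbreviated.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR | O_CREAT, 0644);

	if (fd < 0)
		return 1;

	/* preallocate 1 MiB at offset 0 without changing i_size */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20))
		perror("fallocate(FALLOC_FL_KEEP_SIZE)");

	/* punch a 4 KiB hole at 64 KiB; routed to ext4_punch_hole() */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      64 << 10, 4 << 10))
		perror("fallocate(FALLOC_FL_PUNCH_HOLE)");

	close(fd);
	return 0;
}

/*
 * The in-kernel entry point follows.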
4537 */ 4538 long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) 4539 { 4540 struct inode *inode = file_inode(file); 4541 handle_t *handle; 4542 loff_t new_size; 4543 unsigned int max_blocks; 4544 int ret = 0; 4545 int ret2 = 0; 4546 int retries = 0; 4547 int flags; 4548 struct ext4_map_blocks map; 4549 unsigned int credits, blkbits = inode->i_blkbits; 4550 4551 /* Return error if mode is not supported */ 4552 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 4553 return -EOPNOTSUPP; 4554 4555 if (mode & FALLOC_FL_PUNCH_HOLE) 4556 return ext4_punch_hole(inode, offset, len); 4557 4558 ret = ext4_convert_inline_data(inode); 4559 if (ret) 4560 return ret; 4561 4562 /* 4563 * currently supporting (pre)allocate mode for extent-based 4564 * files _only_ 4565 */ 4566 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 4567 return -EOPNOTSUPP; 4568 4569 trace_ext4_fallocate_enter(inode, offset, len, mode); 4570 map.m_lblk = offset >> blkbits; 4571 /* 4572 * We can't just convert len to max_blocks because 4573 * If blocksize = 4096 offset = 3072 and len = 2048 4574 */ 4575 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) 4576 - map.m_lblk; 4577 /* 4578 * credits to insert 1 extent into extent tree 4579 */ 4580 credits = ext4_chunk_trans_blocks(inode, max_blocks); 4581 mutex_lock(&inode->i_mutex); 4582 ret = inode_newsize_ok(inode, (len + offset)); 4583 if (ret) { 4584 mutex_unlock(&inode->i_mutex); 4585 trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); 4586 return ret; 4587 } 4588 flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT; 4589 if (mode & FALLOC_FL_KEEP_SIZE) 4590 flags |= EXT4_GET_BLOCKS_KEEP_SIZE; 4591 /* 4592 * Don't normalize the request if it can fit in one extent so 4593 * that it doesn't get unnecessarily split into multiple 4594 * extents. 4595 */ 4596 if (len <= EXT_UNINIT_MAX_LEN << blkbits) 4597 flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; 4598 4599 retry: 4600 while (ret >= 0 && ret < max_blocks) { 4601 map.m_lblk = map.m_lblk + ret; 4602 map.m_len = max_blocks = max_blocks - ret; 4603 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, 4604 credits); 4605 if (IS_ERR(handle)) { 4606 ret = PTR_ERR(handle); 4607 break; 4608 } 4609 ret = ext4_map_blocks(handle, inode, &map, flags); 4610 if (ret <= 0) { 4611 #ifdef EXT4FS_DEBUG 4612 ext4_warning(inode->i_sb, 4613 "inode #%lu: block %u: len %u: " 4614 "ext4_ext_map_blocks returned %d", 4615 inode->i_ino, map.m_lblk, 4616 map.m_len, ret); 4617 #endif 4618 ext4_mark_inode_dirty(handle, inode); 4619 ret2 = ext4_journal_stop(handle); 4620 break; 4621 } 4622 if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len, 4623 blkbits) >> blkbits)) 4624 new_size = offset + len; 4625 else 4626 new_size = ((loff_t) map.m_lblk + ret) << blkbits; 4627 4628 ext4_falloc_update_inode(inode, mode, new_size, 4629 (map.m_flags & EXT4_MAP_NEW)); 4630 ext4_mark_inode_dirty(handle, inode); 4631 if ((file->f_flags & O_SYNC) && ret >= max_blocks) 4632 ext4_handle_sync(handle); 4633 ret2 = ext4_journal_stop(handle); 4634 if (ret2) 4635 break; 4636 } 4637 if (ret == -ENOSPC && 4638 ext4_should_retry_alloc(inode->i_sb, &retries)) { 4639 ret = 0; 4640 goto retry; 4641 } 4642 mutex_unlock(&inode->i_mutex); 4643 trace_ext4_fallocate_exit(inode, offset, max_blocks, 4644 ret > 0 ? ret2 : ret); 4645 return ret > 0 ? ret2 : ret; 4646 } 4647 4648 /* 4649 * This function convert a range of blocks to written extents 4650 * The caller of this function will pass the start offset and the size. 
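 */

/*
 * The offset/len to block-count rounding used by ext4_fallocate() above
 * and by the conversion loop below, shown as a small standalone helper
 * for illustration (the demo_* name is a placeholder), together with
 * the worked example from the comments: blocksize 4096, offset 3072 and
 * len 2048 need two blocks, not one.
 */
#include <stdint.h>

static unsigned int demo_range_to_blocks(uint64_t offset, uint64_t len,
					 unsigned int blkbits)
{
	uint64_t blocksize = 1ULL << blkbits;
	uint64_t first_blk = offset >> blkbits;	/* block holding offset */
	uint64_t end_blk = (offset + len + blocksize - 1) >> blkbits;

	return (unsigned int)(end_blk - first_blk);	/* 2 in the example */
}

/*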
4651 * all unwritten extents within this range will be converted to 4652 * written extents. 4653 * 4654 * This function is called from the direct IO end io call back 4655 * function, to convert the fallocated extents after IO is completed. 4656 * Returns 0 on success. 4657 */ 4658 int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, 4659 loff_t offset, ssize_t len) 4660 { 4661 unsigned int max_blocks; 4662 int ret = 0; 4663 int ret2 = 0; 4664 struct ext4_map_blocks map; 4665 unsigned int credits, blkbits = inode->i_blkbits; 4666 4667 map.m_lblk = offset >> blkbits; 4668 /* 4669 * We can't just convert len to max_blocks because 4670 * If blocksize = 4096 offset = 3072 and len = 2048 4671 */ 4672 max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - 4673 map.m_lblk); 4674 /* 4675 * This is somewhat ugly but the idea is clear: When transaction is 4676 * reserved, everything goes into it. Otherwise we rather start several 4677 * smaller transactions for conversion of each extent separately. 4678 */ 4679 if (handle) { 4680 handle = ext4_journal_start_reserved(handle, 4681 EXT4_HT_EXT_CONVERT); 4682 if (IS_ERR(handle)) 4683 return PTR_ERR(handle); 4684 credits = 0; 4685 } else { 4686 /* 4687 * credits to insert 1 extent into extent tree 4688 */ 4689 credits = ext4_chunk_trans_blocks(inode, max_blocks); 4690 } 4691 while (ret >= 0 && ret < max_blocks) { 4692 map.m_lblk += ret; 4693 map.m_len = (max_blocks -= ret); 4694 if (credits) { 4695 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, 4696 credits); 4697 if (IS_ERR(handle)) { 4698 ret = PTR_ERR(handle); 4699 break; 4700 } 4701 } 4702 ret = ext4_map_blocks(handle, inode, &map, 4703 EXT4_GET_BLOCKS_IO_CONVERT_EXT); 4704 if (ret <= 0) 4705 ext4_warning(inode->i_sb, 4706 "inode #%lu: block %u: len %u: " 4707 "ext4_ext_map_blocks returned %d", 4708 inode->i_ino, map.m_lblk, 4709 map.m_len, ret); 4710 ext4_mark_inode_dirty(handle, inode); 4711 if (credits) 4712 ret2 = ext4_journal_stop(handle); 4713 if (ret <= 0 || ret2) 4714 break; 4715 } 4716 if (!credits) 4717 ret2 = ext4_journal_stop(handle); 4718 return ret > 0 ? ret2 : ret; 4719 } 4720 4721 /* 4722 * If newes is not existing extent (newes->ec_pblk equals zero) find 4723 * delayed extent at start of newes and update newes accordingly and 4724 * return start of the next delayed extent. 4725 * 4726 * If newes is existing extent (newes->ec_pblk is not equal zero) 4727 * return start of next delayed extent or EXT_MAX_BLOCKS if no delayed 4728 * extent found. Leave newes unmodified. 4729 */ 4730 static int ext4_find_delayed_extent(struct inode *inode, 4731 struct extent_status *newes) 4732 { 4733 struct extent_status es; 4734 ext4_lblk_t block, next_del; 4735 4736 if (newes->es_pblk == 0) { 4737 ext4_es_find_delayed_extent_range(inode, newes->es_lblk, 4738 newes->es_lblk + newes->es_len - 1, &es); 4739 4740 /* 4741 * No extent in extent-tree contains block @newes->es_pblk, 4742 * then the block may stay in 1)a hole or 2)delayed-extent. 4743 */ 4744 if (es.es_len == 0) 4745 /* A hole found. */ 4746 return 0; 4747 4748 if (es.es_lblk > newes->es_lblk) { 4749 /* A hole found. 
*/ 4750 newes->es_len = min(es.es_lblk - newes->es_lblk, 4751 newes->es_len); 4752 return 0; 4753 } 4754 4755 newes->es_len = es.es_lblk + es.es_len - newes->es_lblk; 4756 } 4757 4758 block = newes->es_lblk + newes->es_len; 4759 ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es); 4760 if (es.es_len == 0) 4761 next_del = EXT_MAX_BLOCKS; 4762 else 4763 next_del = es.es_lblk; 4764 4765 return next_del; 4766 } 4767 /* fiemap flags we can handle specified here */ 4768 #define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) 4769 4770 static int ext4_xattr_fiemap(struct inode *inode, 4771 struct fiemap_extent_info *fieinfo) 4772 { 4773 __u64 physical = 0; 4774 __u64 length; 4775 __u32 flags = FIEMAP_EXTENT_LAST; 4776 int blockbits = inode->i_sb->s_blocksize_bits; 4777 int error = 0; 4778 4779 /* in-inode? */ 4780 if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { 4781 struct ext4_iloc iloc; 4782 int offset; /* offset of xattr in inode */ 4783 4784 error = ext4_get_inode_loc(inode, &iloc); 4785 if (error) 4786 return error; 4787 physical = (__u64)iloc.bh->b_blocknr << blockbits; 4788 offset = EXT4_GOOD_OLD_INODE_SIZE + 4789 EXT4_I(inode)->i_extra_isize; 4790 physical += offset; 4791 length = EXT4_SB(inode->i_sb)->s_inode_size - offset; 4792 flags |= FIEMAP_EXTENT_DATA_INLINE; 4793 brelse(iloc.bh); 4794 } else { /* external block */ 4795 physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits; 4796 length = inode->i_sb->s_blocksize; 4797 } 4798 4799 if (physical) 4800 error = fiemap_fill_next_extent(fieinfo, 0, physical, 4801 length, flags); 4802 return (error < 0 ? error : 0); 4803 } 4804 4805 int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 4806 __u64 start, __u64 len) 4807 { 4808 ext4_lblk_t start_blk; 4809 int error = 0; 4810 4811 if (ext4_has_inline_data(inode)) { 4812 int has_inline = 1; 4813 4814 error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline); 4815 4816 if (has_inline) 4817 return error; 4818 } 4819 4820 if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { 4821 error = ext4_ext_precache(inode); 4822 if (error) 4823 return error; 4824 } 4825 4826 /* fallback to generic here if not in extents fmt */ 4827 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 4828 return generic_block_fiemap(inode, fieinfo, start, len, 4829 ext4_get_block); 4830 4831 if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS)) 4832 return -EBADR; 4833 4834 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { 4835 error = ext4_xattr_fiemap(inode, fieinfo); 4836 } else { 4837 ext4_lblk_t len_blks; 4838 __u64 last_blk; 4839 4840 start_blk = start >> inode->i_sb->s_blocksize_bits; 4841 last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; 4842 if (last_blk >= EXT_MAX_BLOCKS) 4843 last_blk = EXT_MAX_BLOCKS-1; 4844 len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; 4845 4846 /* 4847 * Walk the extent tree gathering extent information 4848 * and pushing extents back to the user. 4849 */ 4850 error = ext4_fill_fiemap_extents(inode, start_blk, 4851 len_blks, fieinfo); 4852 } 4853 ext4_es_lru_add(inode); 4854 return error; 4855 } 4856
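/*
 * For illustration, a minimal userspace consumer of the fiemap
 * interface implemented above, driven through the FS_IOC_FIEMAP ioctl.
 * The path argument and the fixed extent count are arbitrary and error
 * handling is abbreviated.
 */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
	struct fiemap *fm;
	unsigned int i, n = 32;		/* report at most 32 extents */
	int fd;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;

	fm = calloc(1, sizeof(*fm) + n * sizeof(struct fiemap_extent));
	if (!fm)
		return 1;
	fm->fm_start = 0;
	fm->fm_length = ~0ULL;			/* whole file */
	fm->fm_flags = FIEMAP_FLAG_SYNC;	/* one of EXT4_FIEMAP_FLAGS */
	fm->fm_extent_count = n;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
		perror("FS_IOC_FIEMAP");
		return 1;
	}

	for (i = 0; i < fm->fm_mapped_extents; i++) {
		struct fiemap_extent *fe = &fm->fm_extents[i];

		printf("logical %llu physical %llu len %llu%s\n",
		       (unsigned long long)fe->fe_logical,
		       (unsigned long long)fe->fe_physical,
		       (unsigned long long)fe->fe_length,
		       (fe->fe_flags & FIEMAP_EXTENT_UNWRITTEN) ?
					" (unwritten)" : "");
	}

	close(fd);
	free(fm);
	return 0;
}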