// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
 * Written by Alex Tomas <alex@clusterfs.com>
 *
 * Architecture independence:
 *   Copyright (c) 2005, Bull S.A.
 *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
 */

/*
 * Extents support for EXT4
 *
 * TODO:
 *   - ext4*_error() should be used in some situations
 *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
 *   - smart tree reduction
 */

#include <linux/fs.h>
#include <linux/time.h>
#include <linux/jbd2.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/fiemap.h>
#include <linux/backing-dev.h>
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "xattr.h"

#include <trace/events/ext4.h>

/*
 * used by extent splitting.
 */
#define EXT4_EXT_MAY_ZEROOUT	0x1  /* safe to zeroout if split fails \
					due to ENOSPC */
#define EXT4_EXT_MARK_UNWRIT1	0x2  /* mark first half unwritten */
#define EXT4_EXT_MARK_UNWRIT2	0x4  /* mark second half unwritten */

#define EXT4_EXT_DATA_VALID1	0x8  /* first half contains valid data */
#define EXT4_EXT_DATA_VALID2	0x10 /* second half contains valid data */

/*
 * Compute the checksum of the extent tree block whose header is @eh.
 *
 * The checksum is seeded with the inode's i_csum_seed and covers the
 * first EXT4_EXTENT_TAIL_OFFSET(eh) bytes starting at @eh.  The result
 * is returned in little-endian form, ready to be compared with or
 * stored into the extent tail.
 */
static __le32 ext4_extent_block_csum(struct inode *inode,
				     struct ext4_extent_header *eh)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	__u32 csum;

	csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
			   EXT4_EXTENT_TAIL_OFFSET(eh));
	return cpu_to_le32(csum);
}

/*
 * Check the checksum stored in the tail of the extent block headed by @eh.
 *
 * Returns 1 if the filesystem does not use metadata checksums or if the
 * stored checksum matches the computed one, 0 on mismatch.
 */
static int ext4_extent_block_csum_verify(struct inode *inode,
					 struct ext4_extent_header *eh)
{
	struct ext4_extent_tail *et;

	if (!ext4_has_metadata_csum(inode->i_sb))
		return 1;

	et = find_ext4_extent_tail(eh);
	if (et->et_checksum !=
ext4_extent_block_csum(inode, eh))
		return 0;
	return 1;
}

/*
 * Recompute the checksum of the extent block headed by @eh and store it
 * in the block's extent tail.  Does nothing when the filesystem does not
 * use metadata checksums.
 */
static void ext4_extent_block_csum_set(struct inode *inode,
				       struct ext4_extent_header *eh)
{
	struct ext4_extent_tail *et;

	if (!ext4_has_metadata_csum(inode->i_sb))
		return;

	et = find_ext4_extent_tail(eh);
	et->et_checksum = ext4_extent_block_csum(inode, eh);
}

/* Forward declarations of helpers used before their definitions. */
static int ext4_split_extent(handle_t *handle,
				struct inode *inode,
				struct ext4_ext_path **ppath,
				struct ext4_map_blocks *map,
				int split_flag,
				int flags);

static int ext4_split_extent_at(handle_t *handle,
			     struct inode *inode,
			     struct ext4_ext_path **ppath,
			     ext4_lblk_t split,
			     int split_flag,
			     int flags);

static int ext4_find_delayed_extent(struct inode *inode,
				    struct extent_status *newes);

/*
 * Make sure @handle has at least @needed buffer credits.
 *
 * Returns 0 when the handle is not valid or already holds enough
 * credits.  Otherwise the handle is first extended; if
 * ext4_journal_extend() returns a value <= 0 that value is returned
 * as is.  If extending was not possible the transaction is restarted
 * through ext4_truncate_restart_trans(); a successful restart is
 * reported as -EAGAIN, a failed one by its error code.
 */
static int ext4_ext_truncate_extend_restart(handle_t *handle,
					    struct inode *inode,
					    int needed)
{
	int err;

	if (!ext4_handle_valid(handle))
		return 0;
	if (handle->h_buffer_credits >= needed)
		return 0;
	/*
	 * If we need to extend the journal get a few extra blocks
	 * while we're at it for efficiency's sake.
*/
	needed += 3;
	err = ext4_journal_extend(handle, needed - handle->h_buffer_credits);
	if (err <= 0)
		return err;
	err = ext4_truncate_restart_trans(handle, inode, needed);
	/* a successful restart is reported to the caller as -EAGAIN */
	if (err == 0)
		err = -EAGAIN;

	return err;
}

/*
 * Get journal write access to the tree node that @path points to.
 * Nodes living in the inode body (no buffer head) need no journal
 * access, so 0 is returned for them.
 *
 * could return:
 *  - EROFS
 *  - ENOMEM
 */
static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path)
{
	if (path->p_bh) {
		/* path points to block */
		BUFFER_TRACE(path->p_bh, "get_write_access");
		return ext4_journal_get_write_access(handle, path->p_bh);
	}
	/* path points to leaf/index in inode body */
	/* we use in-core data, no need to protect them */
	return 0;
}

/*
 * Mark the tree node that @path points to as dirty.  For a node in a
 * separate block the block checksum is refreshed first and the buffer
 * is handed to the journal; for a node in the inode body the inode
 * itself is marked dirty.  Warns if i_data_sem is not held.
 *
 * @where and @line identify the caller and are passed through to
 * __ext4_handle_dirty_metadata().
 *
 * could return:
 *  - EROFS
 *  - ENOMEM
 *  - EIO
 */
int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
		     struct inode *inode, struct ext4_ext_path *path)
{
	int err;

	WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
	if (path->p_bh) {
		ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
		/* path points to block */
		err = __ext4_handle_dirty_metadata(where, line, handle,
						   inode, path->p_bh);
	} else {
		/* path points to leaf/index in inode body */
		err = ext4_mark_inode_dirty(handle, inode);
	}
	return err;
}

/*
 * Choose a goal physical block for allocating logical block @block.
 *
 * With a @path: if the leaf level holds an extent, the goal is that
 * extent's physical start shifted by the logical distance between
 * @block and the extent's first block; otherwise, if the leaf level
 * has a buffer head, the goal is that buffer's block number.
 * Without a usable path the inode's goal block is used.
 */
static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
			      struct ext4_ext_path *path,
			      ext4_lblk_t block)
{
	if (path) {
		int depth = path->p_depth;
		struct ext4_extent *ex;

		/*
		 * Try to predict block placement assuming that we are
		 * filling in a file which will eventually be
		 * non-sparse --- i.e., in the case of libbfd writing
		 * an ELF object sections out-of-order but in a way
		 * that eventually results in a contiguous object or
		 * executable file, or some database extending a table
		 * space file.
* However, this is actually somewhat
		 * non-ideal if we are writing a sparse file such as
		 * qemu or KVM writing a raw image file that is going
		 * to stay fairly sparse, since it will end up
		 * fragmenting the file system's free space. Maybe we
		 * should have some heuristics or some way to allow
		 * userspace to pass a hint to file system,
		 * especially if the latter case turns out to be
		 * common.
		 */
		ex = path[depth].p_ext;
		if (ex) {
			ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
			ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);

			/* keep the same logical->physical offset as @ex */
			if (block > ext_block)
				return ext_pblk + (block - ext_block);
			else
				return ext_pblk - (ext_block - block);
		}

		/* it looks like index is empty;
		 * try to find starting block from index itself */
		if (path[depth].p_bh)
			return path[depth].p_bh->b_blocknr;
	}

	/* OK. use inode's group */
	return ext4_inode_to_goal_block(inode);
}

/*
 * Allocation for a meta data block
 *
 * The goal is derived from @path and the logical start of @ex.
 * Returns the new block number; the error status of the allocation
 * is stored through @err by ext4_new_meta_blocks().
 */
static ext4_fsblk_t
ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
			struct ext4_ext_path *path,
			struct ext4_extent *ex, int *err, unsigned int flags)
{
	ext4_fsblk_t goal, newblock;

	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
	newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
					NULL, err);
	return newblock;
}

/*
 * Number of extents that fit into a leaf block: the block size minus
 * the extent header, divided by the size of one extent.  When built
 * with AGGRESSIVE_TEST and @check is zero the result is capped at 6.
 */
static inline int ext4_ext_space_block(struct inode *inode, int check)
{
	int size;

	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
			/ sizeof(struct ext4_extent);
#ifdef AGGRESSIVE_TEST
	if (!check && size > 6)
		size = 6;
#endif
	return size;
}

/*
 * Number of index entries that fit into an index block: the block size
 * minus the extent header, divided by the size of one index entry.
 * When built with AGGRESSIVE_TEST and @check is zero the result is
 * capped at 5.
 */
static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
{
	int size;

	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
			/ sizeof(struct ext4_extent_idx);
#ifdef AGGRESSIVE_TEST
	if (!check &&
size > 5)
		size = 5;
#endif
	return size;
}

/*
 * Number of extents that fit into the root node stored in the inode's
 * i_data area (its size minus the extent header, divided by the size
 * of one extent).  When built with AGGRESSIVE_TEST and @check is zero
 * the result is capped at 3.
 */
static inline int ext4_ext_space_root(struct inode *inode, int check)
{
	int size;

	size = sizeof(EXT4_I(inode)->i_data);
	size -= sizeof(struct ext4_extent_header);
	size /= sizeof(struct ext4_extent);
#ifdef AGGRESSIVE_TEST
	if (!check && size > 3)
		size = 3;
#endif
	return size;
}

/*
 * Number of index entries that fit into the root node stored in the
 * inode's i_data area.  When built with AGGRESSIVE_TEST and @check is
 * zero the result is capped at 4.
 */
static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
{
	int size;

	size = sizeof(EXT4_I(inode)->i_data);
	size -= sizeof(struct ext4_extent_header);
	size /= sizeof(struct ext4_extent_idx);
#ifdef AGGRESSIVE_TEST
	if (!check && size > 4)
		size = 4;
#endif
	return size;
}

/*
 * Split the extent found at the leaf level of *@ppath at logical block
 * @lblk.  If that extent is unwritten both halves are marked unwritten.
 * The split is requested with EXT4_EX_NOCACHE and
 * EXT4_GET_BLOCKS_PRE_IO, plus EXT4_GET_BLOCKS_METADATA_NOFAIL when
 * @nofail is non-zero.  Returns the result of ext4_split_extent_at().
 */
static inline int
ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
			   struct ext4_ext_path **ppath, ext4_lblk_t lblk,
			   int nofail)
{
	struct ext4_ext_path *path = *ppath;
	int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);

	return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ?
			EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0,
			EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO |
			(nofail ? EXT4_GET_BLOCKS_METADATA_NOFAIL:0));
}

/*
 * Calculate the number of metadata blocks needed
 * to allocate the delayed-allocation block @lblock
 * Worst case is one block per extent
 *
 * The running state (i_da_metadata_calc_len and
 * i_da_metadata_calc_last_lblock) is kept in the in-core inode and is
 * updated by every call.
 */
int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int idxs;

	/* index entries per index block */
	idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
		/ sizeof(struct ext4_extent_idx));

	/*
	 * If the new delayed allocation block is contiguous with the
	 * previous da block, it can share index blocks with the
	 * previous block, so we only need to allocate a new index
	 * block every idxs leaf blocks. At idxs**2 blocks, we need
	 * an additional index block, and at idxs**3 blocks, yet
	 * another index block.
*/
	if (ei->i_da_metadata_calc_len &&
	    ei->i_da_metadata_calc_last_lblock+1 == lblock) {
		int num = 0;

		if ((ei->i_da_metadata_calc_len % idxs) == 0)
			num++;
		if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
			num++;
		if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
			num++;
			/* restart the run length after idxs**3 blocks */
			ei->i_da_metadata_calc_len = 0;
		} else
			ei->i_da_metadata_calc_len++;
		ei->i_da_metadata_calc_last_lblock++;
		return num;
	}

	/*
	 * In the worst case we need a new set of index blocks at
	 * every level of the inode's extent tree.
	 */
	ei->i_da_metadata_calc_len = 1;
	ei->i_da_metadata_calc_last_lblock = lblock;
	return ext_depth(inode) + 1;
}

/*
 * Maximum number of entries a tree node with header depth @depth may
 * hold.  A node whose depth equals the tree depth is the root in the
 * inode body, any other node lives in its own block; depth 0 nodes hold
 * extents, all others hold index entries.
 */
static int
ext4_ext_max_entries(struct inode *inode, int depth)
{
	int max;

	if (depth == ext_depth(inode)) {
		if (depth == 0)
			max = ext4_ext_space_root(inode, 1);
		else
			max = ext4_ext_space_root_idx(inode, 1);
	} else {
		if (depth == 0)
			max = ext4_ext_space_block(inode, 1);
		else
			max = ext4_ext_space_block_idx(inode, 1);
	}

	return max;
}

/*
 * Validate a single extent.  Returns 0 if its length is zero or its
 * logical range wraps around, otherwise the result of
 * ext4_data_block_valid() for its physical range.
 */
static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
{
	ext4_fsblk_t block = ext4_ext_pblock(ext);
	int len = ext4_ext_get_actual_len(ext);
	ext4_lblk_t lblock = le32_to_cpu(ext->ee_block);

	/*
	 * We allow neither:
	 *  - zero length
	 *  - overflow/wrap-around
	 */
	if (lblock + len <= lblock)
		return 0;
	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
}

/*
 * Validate a single index entry: the block it points to must pass
 * ext4_data_block_valid().
 */
static int ext4_valid_extent_idx(struct inode *inode,
				struct ext4_extent_idx *ext_idx)
{
	ext4_fsblk_t block = ext4_idx_pblock(ext_idx);

	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
}

/*
 * Validate every entry of the tree node headed by @eh.  @depth selects
 * between extent entries (0) and index entries (non-zero).
 * Returns 1 if all entries are valid (or the node is empty), 0 otherwise.
 */
static int ext4_valid_extent_entries(struct inode *inode,
				struct ext4_extent_header *eh,
				int depth)
{
	unsigned short entries;
	if (eh->eh_entries == 0)
		return 1;
entries = le16_to_cpu(eh->eh_entries);

	if (depth == 0) {
		/* leaf entries */
		struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
		struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
		ext4_fsblk_t pblock = 0;
		ext4_lblk_t lblock = 0;
		ext4_lblk_t prev = 0;	/* last logical block of previous extent */
		int len = 0;
		while (entries) {
			if (!ext4_valid_extent(inode, ext))
				return 0;

			/* Check for overlapping extents */
			lblock = le32_to_cpu(ext->ee_block);
			len = ext4_ext_get_actual_len(ext);
			if ((lblock <= prev) && prev) {
				/* record the offending physical block */
				pblock = ext4_ext_pblock(ext);
				es->s_last_error_block = cpu_to_le64(pblock);
				return 0;
			}
			ext++;
			entries--;
			prev = lblock + len - 1;
		}
	} else {
		struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
		while (entries) {
			if (!ext4_valid_extent_idx(inode, ext_idx))
				return 0;
			ext_idx++;
			entries--;
		}
	}
	return 1;
}

/*
 * Sanity-check the tree node headed by @eh, which is expected to be at
 * depth @depth and was read from physical block @pblk (only used for
 * the error report).
 *
 * The magic, depth, eh_max, eh_entries, the individual entries and,
 * for non-root nodes, the block checksum are verified.  Returns 0 if
 * the node looks sane.  Otherwise the problem is reported through
 * ext4_error_inode() on behalf of @function/@line and -EFSCORRUPTED
 * (or -EFSBADCRC for a checksum mismatch) is returned.
 */
static int __ext4_ext_check(const char *function, unsigned int line,
			    struct inode *inode, struct ext4_extent_header *eh,
			    int depth, ext4_fsblk_t pblk)
{
	const char *error_msg;
	int max = 0, err = -EFSCORRUPTED;

	if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
		error_msg = "invalid magic";
		goto corrupted;
	}
	if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
		error_msg = "unexpected eh_depth";
		goto corrupted;
	}
	if (unlikely(eh->eh_max == 0)) {
		error_msg = "invalid eh_max";
		goto corrupted;
	}
	max = ext4_ext_max_entries(inode, depth);
	if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
		error_msg = "too large eh_max";
		goto corrupted;
	}
	if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
		error_msg = "invalid eh_entries";
		goto corrupted;
	}
	if (!ext4_valid_extent_entries(inode, eh, depth)) {
		error_msg = "invalid extent entries";
		goto corrupted;
	}
	if (unlikely(depth > 32)) {
		error_msg = "too large eh_depth";
goto corrupted;
	}
	/* Verify checksum on non-root extent tree nodes */
	if (ext_depth(inode) != depth &&
	    !ext4_extent_block_csum_verify(inode, eh)) {
		error_msg = "extent tree corrupted";
		err = -EFSBADCRC;
		goto corrupted;
	}
	return 0;

corrupted:
	ext4_error_inode(inode, function, line, 0,
			 "pblk %llu bad header/extent: %s - magic %x, "
			 "entries %u, max %u(%u), depth %u(%u)",
			 (unsigned long long) pblk, error_msg,
			 le16_to_cpu(eh->eh_magic),
			 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
			 max, le16_to_cpu(eh->eh_depth), depth);
	return err;
}

/* Wrapper that records the calling function and line for error reports. */
#define ext4_ext_check(inode, eh, depth, pblk)	\
	__ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk))

/*
 * Check the root node of the extent tree stored in the inode body.
 * Returns 0 if it is sane or a negative error from __ext4_ext_check().
 */
int ext4_ext_check_inode(struct inode *inode)
{
	return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0);
}

/*
 * Get the buffer for the extent tree block at @pblk, which is expected
 * to be a node of depth @depth.
 *
 * The block is read if it is not up to date.  A buffer that is already
 * marked verified is returned right away unless EXT4_EX_FORCE_CACHE is
 * set in @flags; otherwise the node is checked, marked verified and,
 * for leaves, its extents (and the holes between them) are inserted
 * into the extent status tree unless EXT4_EX_NOCACHE is set.
 *
 * Returns the buffer head with a reference held, or an ERR_PTR() on
 * allocation, read or validation failure.
 */
static struct buffer_head *
__read_extent_tree_block(const char *function, unsigned int line,
			 struct inode *inode, ext4_fsblk_t pblk, int depth,
			 int flags)
{
	struct buffer_head		*bh;
	int				err;

	bh = sb_getblk_gfp(inode->i_sb, pblk, __GFP_MOVABLE | GFP_NOFS);
	if (unlikely(!bh))
		return ERR_PTR(-ENOMEM);

	if (!bh_uptodate_or_lock(bh)) {
		trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
		err = bh_submit_read(bh);
		if (err < 0)
			goto errout;
	}
	if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
		return bh;
	err = __ext4_ext_check(function, line, inode,
			       ext_block_hdr(bh), depth, pblk);
	if (err)
		goto errout;
	set_buffer_verified(bh);
	/*
	 * If this is a leaf block, cache all of its entries
	 */
	if (!(flags & EXT4_EX_NOCACHE) && depth == 0) {
		struct ext4_extent_header *eh = ext_block_hdr(bh);
		struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
		ext4_lblk_t prev = 0;	/* first block after previous extent */
		int i;

		for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
			unsigned int status =
EXTENT_STATUS_WRITTEN; 537 ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); 538 int len = ext4_ext_get_actual_len(ex); 539 540 if (prev && (prev != lblk)) 541 ext4_es_cache_extent(inode, prev, 542 lblk - prev, ~0, 543 EXTENT_STATUS_HOLE); 544 545 if (ext4_ext_is_unwritten(ex)) 546 status = EXTENT_STATUS_UNWRITTEN; 547 ext4_es_cache_extent(inode, lblk, len, 548 ext4_ext_pblock(ex), status); 549 prev = lblk + len; 550 } 551 } 552 return bh; 553 errout: 554 put_bh(bh); 555 return ERR_PTR(err); 556 557 } 558 559 #define read_extent_tree_block(inode, pblk, depth, flags) \ 560 __read_extent_tree_block(__func__, __LINE__, (inode), (pblk), \ 561 (depth), (flags)) 562 563 /* 564 * This function is called to cache a file's extent information in the 565 * extent status tree 566 */ 567 int ext4_ext_precache(struct inode *inode) 568 { 569 struct ext4_inode_info *ei = EXT4_I(inode); 570 struct ext4_ext_path *path = NULL; 571 struct buffer_head *bh; 572 int i = 0, depth, ret = 0; 573 574 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 575 return 0; /* not an extent-mapped inode */ 576 577 down_read(&ei->i_data_sem); 578 depth = ext_depth(inode); 579 580 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), 581 GFP_NOFS); 582 if (path == NULL) { 583 up_read(&ei->i_data_sem); 584 return -ENOMEM; 585 } 586 587 /* Don't cache anything if there are no external extent blocks */ 588 if (depth == 0) 589 goto out; 590 path[0].p_hdr = ext_inode_hdr(inode); 591 ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0); 592 if (ret) 593 goto out; 594 path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr); 595 while (i >= 0) { 596 /* 597 * If this is a leaf block or we've reached the end of 598 * the index block, go up 599 */ 600 if ((i == depth) || 601 path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) { 602 brelse(path[i].p_bh); 603 path[i].p_bh = NULL; 604 i--; 605 continue; 606 } 607 bh = read_extent_tree_block(inode, 608 ext4_idx_pblock(path[i].p_idx++), 609 depth - i - 1, 610 
EXT4_EX_FORCE_CACHE);
		if (IS_ERR(bh)) {
			ret = PTR_ERR(bh);
			break;
		}
		i++;
		path[i].p_bh = bh;
		path[i].p_hdr = ext_block_hdr(bh);
		path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr);
	}
	ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
out:
	up_read(&ei->i_data_sem);
	ext4_ext_drop_refs(path);
	kfree(path);
	return ret;
}

#ifdef EXT_DEBUG
/*
 * Debug helper: print, for every level of @path, the index entry or
 * extent the path currently points to.
 */
static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
{
	int k, l = path->p_depth;

	ext_debug("path:");
	for (k = 0; k <= l; k++, path++) {
		if (path->p_idx) {
		  ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block),
			    ext4_idx_pblock(path->p_idx));
		} else if (path->p_ext) {
			ext_debug(" %d:[%d]%d:%llu ",
				  le32_to_cpu(path->p_ext->ee_block),
				  ext4_ext_is_unwritten(path->p_ext),
				  ext4_ext_get_actual_len(path->p_ext),
				  ext4_ext_pblock(path->p_ext));
		} else
			ext_debug(" []");
	}
	ext_debug("\n");
}

/*
 * Debug helper: print all extents of the leaf that @path ends in.
 * Does nothing for a NULL @path.
 */
static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
{
	int depth = ext_depth(inode);
	struct ext4_extent_header *eh;
	struct ext4_extent *ex;
	int i;

	if (!path)
		return;

	eh = path[depth].p_hdr;
	ex = EXT_FIRST_EXTENT(eh);

	ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);

	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
		ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
			  ext4_ext_is_unwritten(ex),
			  ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
	}
	ext_debug("\n");
}

/*
 * Debug helper: print the entries of path[@level], from the current
 * position up to the last slot of the node, that are about to be moved
 * into @newblock.  Index entries are printed when @level is not the
 * leaf level, extents otherwise.
 */
static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
			ext4_fsblk_t newblock, int level)
{
	int depth = ext_depth(inode);
	struct ext4_extent *ex;

	if (depth != level) {
		struct ext4_extent_idx *idx;
		idx = path[level].p_idx;
		while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
			ext_debug("%d: move %d:%llu in new index %llu\n", level,
					le32_to_cpu(idx->ei_block),
					ext4_idx_pblock(idx),
					newblock);
			idx++;
		}

		return;
	}

	ex = path[depth].p_ext;
	while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
				le32_to_cpu(ex->ee_block),
				ext4_ext_pblock(ex),
				ext4_ext_is_unwritten(ex),
				ext4_ext_get_actual_len(ex),
				newblock);
		ex++;
	}
}

#else
/* Without EXT_DEBUG the debug helpers compile to nothing. */
#define ext4_ext_show_path(inode, path)
#define ext4_ext_show_leaf(inode, path)
#define ext4_ext_show_move(inode, path, newblock, level)
#endif

/*
 * Release the buffer heads referenced by path[0..path->p_depth] and
 * clear the pointers.  The path array itself is not freed.  A NULL
 * @path is accepted and ignored.
 */
void ext4_ext_drop_refs(struct ext4_ext_path *path)
{
	int depth, i;

	if (!path)
		return;
	depth = path->p_depth;
	for (i = 0; i <= depth; i++, path++)
		if (path->p_bh) {
			brelse(path->p_bh);
			path->p_bh = NULL;
		}
}

/*
 * ext4_ext_binsearch_idx:
 * binary search for the closest index of the given block
 * the header must be checked before calling this
 *
 * On return path->p_idx points to the last index entry whose ei_block
 * is not greater than @block, or to the first entry if there is none.
 */
static void
ext4_ext_binsearch_idx(struct inode *inode,
			struct ext4_ext_path *path, ext4_lblk_t block)
{
	struct ext4_extent_header *eh = path->p_hdr;
	struct ext4_extent_idx *r, *l, *m;


	ext_debug("binsearch for %u(idx): ", block);

	/* the first entry is the fallback result, so start at the second */
	l = EXT_FIRST_INDEX(eh) + 1;
	r = EXT_LAST_INDEX(eh);
	while (l <= r) {
		m = l + (r - l) / 2;
		if (block < le32_to_cpu(m->ei_block))
			r = m - 1;
		else
			l = m + 1;
		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
				m, le32_to_cpu(m->ei_block),
				r, le32_to_cpu(r->ei_block));
	}

	path->p_idx = l - 1;
	ext_debug(" -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
		  ext4_idx_pblock(path->p_idx));

#ifdef CHECK_BINSEARCH
	/* cross-check the result with a linear scan */
	{
		struct ext4_extent_idx *chix, *ix;
		int k;

		chix = ix = EXT_FIRST_INDEX(eh);
		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
		  if (k != 0 &&
le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
				printk(KERN_DEBUG "k=%d, ix=0x%p, "
				       "first=0x%p\n", k,
				       ix, EXT_FIRST_INDEX(eh));
				printk(KERN_DEBUG "%u <= %u\n",
				       le32_to_cpu(ix->ei_block),
				       le32_to_cpu(ix[-1].ei_block));
			}
			/* index entries must be strictly increasing */
			BUG_ON(k && le32_to_cpu(ix->ei_block)
					   <= le32_to_cpu(ix[-1].ei_block));
			if (block < le32_to_cpu(ix->ei_block))
				break;
			chix = ix;
		}
		BUG_ON(chix != path->p_idx);
	}
#endif

}

/*
 * ext4_ext_binsearch:
 * binary search for closest extent of the given block
 * the header must be checked before calling this
 *
 * On return path->p_ext points to the last extent whose ee_block is
 * not greater than @block, or to the first extent if there is none.
 * For an empty leaf path->p_ext is left untouched.
 */
static void
ext4_ext_binsearch(struct inode *inode,
		struct ext4_ext_path *path, ext4_lblk_t block)
{
	struct ext4_extent_header *eh = path->p_hdr;
	struct ext4_extent *r, *l, *m;

	if (eh->eh_entries == 0) {
		/*
		 * this leaf is empty:
		 * we get such a leaf in split/add case
		 */
		return;
	}

	ext_debug("binsearch for %u: ", block);

	/* the first extent is the fallback result, so start at the second */
	l = EXT_FIRST_EXTENT(eh) + 1;
	r = EXT_LAST_EXTENT(eh);

	while (l <= r) {
		m = l + (r - l) / 2;
		if (block < le32_to_cpu(m->ee_block))
			r = m - 1;
		else
			l = m + 1;
		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
				m, le32_to_cpu(m->ee_block),
				r, le32_to_cpu(r->ee_block));
	}

	path->p_ext = l - 1;
	ext_debug(" -> %d:%llu:[%d]%d ",
			le32_to_cpu(path->p_ext->ee_block),
			ext4_ext_pblock(path->p_ext),
			ext4_ext_is_unwritten(path->p_ext),
			ext4_ext_get_actual_len(path->p_ext));

#ifdef CHECK_BINSEARCH
	/* cross-check the result with a linear scan */
	{
		struct ext4_extent *chex, *ex;
		int k;

		chex = ex = EXT_FIRST_EXTENT(eh);
		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
			BUG_ON(k && le32_to_cpu(ex->ee_block)
					  <= le32_to_cpu(ex[-1].ee_block));
			if (block < le32_to_cpu(ex->ee_block))
				break;
			chex = ex;
		}
		BUG_ON(chex != path->p_ext);
	}
#endif

}

int
ext4_ext_tree_init(handle_t *handle, struct inode *inode) 848 { 849 struct ext4_extent_header *eh; 850 851 eh = ext_inode_hdr(inode); 852 eh->eh_depth = 0; 853 eh->eh_entries = 0; 854 eh->eh_magic = EXT4_EXT_MAGIC; 855 eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); 856 ext4_mark_inode_dirty(handle, inode); 857 return 0; 858 } 859 860 struct ext4_ext_path * 861 ext4_find_extent(struct inode *inode, ext4_lblk_t block, 862 struct ext4_ext_path **orig_path, int flags) 863 { 864 struct ext4_extent_header *eh; 865 struct buffer_head *bh; 866 struct ext4_ext_path *path = orig_path ? *orig_path : NULL; 867 short int depth, i, ppos = 0; 868 int ret; 869 870 eh = ext_inode_hdr(inode); 871 depth = ext_depth(inode); 872 873 if (path) { 874 ext4_ext_drop_refs(path); 875 if (depth > path[0].p_maxdepth) { 876 kfree(path); 877 *orig_path = path = NULL; 878 } 879 } 880 if (!path) { 881 /* account possible depth increase */ 882 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2), 883 GFP_NOFS); 884 if (unlikely(!path)) 885 return ERR_PTR(-ENOMEM); 886 path[0].p_maxdepth = depth + 1; 887 } 888 path[0].p_hdr = eh; 889 path[0].p_bh = NULL; 890 891 i = depth; 892 /* walk through the tree */ 893 while (i) { 894 ext_debug("depth %d: num %d, max %d\n", 895 ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); 896 897 ext4_ext_binsearch_idx(inode, path + ppos, block); 898 path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx); 899 path[ppos].p_depth = i; 900 path[ppos].p_ext = NULL; 901 902 bh = read_extent_tree_block(inode, path[ppos].p_block, --i, 903 flags); 904 if (IS_ERR(bh)) { 905 ret = PTR_ERR(bh); 906 goto err; 907 } 908 909 eh = ext_block_hdr(bh); 910 ppos++; 911 path[ppos].p_bh = bh; 912 path[ppos].p_hdr = eh; 913 } 914 915 path[ppos].p_depth = i; 916 path[ppos].p_ext = NULL; 917 path[ppos].p_idx = NULL; 918 919 /* find extent */ 920 ext4_ext_binsearch(inode, path + ppos, block); 921 /* if not an empty leaf */ 922 if (path[ppos].p_ext) 923 path[ppos].p_block 
= ext4_ext_pblock(path[ppos].p_ext);

	ext4_ext_show_path(inode, path);

	return path;

err:
	ext4_ext_drop_refs(path);
	kfree(path);
	/* the caller's pointer would otherwise dangle */
	if (orig_path)
		*orig_path = NULL;
	return ERR_PTR(ret);
}

/*
 * ext4_ext_insert_index:
 * insert new index [@logical;@ptr] into the block at @curp;
 * check where to insert: before @curp or after @curp
 *
 * Returns 0 on success, -EFSCORRUPTED if the node is found to be
 * inconsistent (duplicate logical block, node already full, slot out
 * of range), or the error from getting journal access to / dirtying
 * the node.
 */
static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
				 struct ext4_ext_path *curp,
				 int logical, ext4_fsblk_t ptr)
{
	struct ext4_extent_idx *ix;
	int len, err;

	err = ext4_ext_get_access(handle, inode, curp);
	if (err)
		return err;

	if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
		EXT4_ERROR_INODE(inode,
				 "logical %d == ei_block %d!",
				 logical, le32_to_cpu(curp->p_idx->ei_block));
		return -EFSCORRUPTED;
	}

	if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
			     >= le16_to_cpu(curp->p_hdr->eh_max))) {
		EXT4_ERROR_INODE(inode,
				 "eh_entries %d >= eh_max %d!",
				 le16_to_cpu(curp->p_hdr->eh_entries),
				 le16_to_cpu(curp->p_hdr->eh_max));
		return -EFSCORRUPTED;
	}

	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
		/* insert after */
		ext_debug("insert new index %d after: %llu\n", logical, ptr);
		ix = curp->p_idx + 1;
	} else {
		/* insert before */
		ext_debug("insert new index %d before: %llu\n", logical, ptr);
		ix = curp->p_idx;
	}

	/* number of entries at and after the insertion slot */
	len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
	BUG_ON(len < 0);
	if (len > 0) {
		ext_debug("insert new index %d: "
				"move %d indices from 0x%p to 0x%p\n",
				logical, len, ix, ix + 1);
		memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
	}

	if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
		EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
		return -EFSCORRUPTED;
	}

	ix->ei_block = cpu_to_le32(logical);
	ext4_idx_store_pblock(ix, ptr);
le16_add_cpu(&curp->p_hdr->eh_entries, 1); 996 997 if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) { 998 EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!"); 999 return -EFSCORRUPTED; 1000 } 1001 1002 err = ext4_ext_dirty(handle, inode, curp); 1003 ext4_std_error(inode->i_sb, err); 1004 1005 return err; 1006 } 1007 1008 /* 1009 * ext4_ext_split: 1010 * inserts new subtree into the path, using free index entry 1011 * at depth @at: 1012 * - allocates all needed blocks (new leaf and all intermediate index blocks) 1013 * - makes decision where to split 1014 * - moves remaining extents and index entries (right to the split point) 1015 * into the newly allocated blocks 1016 * - initializes subtree 1017 */ 1018 static int ext4_ext_split(handle_t *handle, struct inode *inode, 1019 unsigned int flags, 1020 struct ext4_ext_path *path, 1021 struct ext4_extent *newext, int at) 1022 { 1023 struct buffer_head *bh = NULL; 1024 int depth = ext_depth(inode); 1025 struct ext4_extent_header *neh; 1026 struct ext4_extent_idx *fidx; 1027 int i = at, k, m, a; 1028 ext4_fsblk_t newblock, oldblock; 1029 __le32 border; 1030 ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */ 1031 int err = 0; 1032 1033 /* make decision: where to split? */ 1034 /* FIXME: now decision is simplest: at current extent */ 1035 1036 /* if current leaf will be split, then we should use 1037 * border from split point */ 1038 if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) { 1039 EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!"); 1040 return -EFSCORRUPTED; 1041 } 1042 if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { 1043 border = path[depth].p_ext[1].ee_block; 1044 ext_debug("leaf will be split." 1045 " next leaf starts at %d\n", 1046 le32_to_cpu(border)); 1047 } else { 1048 border = newext->ee_block; 1049 ext_debug("leaf will be added." 
1050 " next leaf starts at %d\n", 1051 le32_to_cpu(border)); 1052 } 1053 1054 /* 1055 * If error occurs, then we break processing 1056 * and mark filesystem read-only. index won't 1057 * be inserted and tree will be in consistent 1058 * state. Next mount will repair buffers too. 1059 */ 1060 1061 /* 1062 * Get array to track all allocated blocks. 1063 * We need this to handle errors and free blocks 1064 * upon them. 1065 */ 1066 ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS); 1067 if (!ablocks) 1068 return -ENOMEM; 1069 1070 /* allocate all needed blocks */ 1071 ext_debug("allocate %d blocks for indexes/leaf\n", depth - at); 1072 for (a = 0; a < depth - at; a++) { 1073 newblock = ext4_ext_new_meta_block(handle, inode, path, 1074 newext, &err, flags); 1075 if (newblock == 0) 1076 goto cleanup; 1077 ablocks[a] = newblock; 1078 } 1079 1080 /* initialize new leaf */ 1081 newblock = ablocks[--a]; 1082 if (unlikely(newblock == 0)) { 1083 EXT4_ERROR_INODE(inode, "newblock == 0!"); 1084 err = -EFSCORRUPTED; 1085 goto cleanup; 1086 } 1087 bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS); 1088 if (unlikely(!bh)) { 1089 err = -ENOMEM; 1090 goto cleanup; 1091 } 1092 lock_buffer(bh); 1093 1094 err = ext4_journal_get_create_access(handle, bh); 1095 if (err) 1096 goto cleanup; 1097 1098 neh = ext_block_hdr(bh); 1099 neh->eh_entries = 0; 1100 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); 1101 neh->eh_magic = EXT4_EXT_MAGIC; 1102 neh->eh_depth = 0; 1103 1104 /* move remainder of path[depth] to the new leaf */ 1105 if (unlikely(path[depth].p_hdr->eh_entries != 1106 path[depth].p_hdr->eh_max)) { 1107 EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!", 1108 path[depth].p_hdr->eh_entries, 1109 path[depth].p_hdr->eh_max); 1110 err = -EFSCORRUPTED; 1111 goto cleanup; 1112 } 1113 /* start copy from next extent */ 1114 m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++; 1115 ext4_ext_show_move(inode, path, newblock, depth); 1116 if (m) 
{ 1117 struct ext4_extent *ex; 1118 ex = EXT_FIRST_EXTENT(neh); 1119 memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m); 1120 le16_add_cpu(&neh->eh_entries, m); 1121 } 1122 1123 ext4_extent_block_csum_set(inode, neh); 1124 set_buffer_uptodate(bh); 1125 unlock_buffer(bh); 1126 1127 err = ext4_handle_dirty_metadata(handle, inode, bh); 1128 if (err) 1129 goto cleanup; 1130 brelse(bh); 1131 bh = NULL; 1132 1133 /* correct old leaf */ 1134 if (m) { 1135 err = ext4_ext_get_access(handle, inode, path + depth); 1136 if (err) 1137 goto cleanup; 1138 le16_add_cpu(&path[depth].p_hdr->eh_entries, -m); 1139 err = ext4_ext_dirty(handle, inode, path + depth); 1140 if (err) 1141 goto cleanup; 1142 1143 } 1144 1145 /* create intermediate indexes */ 1146 k = depth - at - 1; 1147 if (unlikely(k < 0)) { 1148 EXT4_ERROR_INODE(inode, "k %d < 0!", k); 1149 err = -EFSCORRUPTED; 1150 goto cleanup; 1151 } 1152 if (k) 1153 ext_debug("create %d intermediate indices\n", k); 1154 /* insert new index into current index block */ 1155 /* current depth stored in i var */ 1156 i = depth - 1; 1157 while (k--) { 1158 oldblock = newblock; 1159 newblock = ablocks[--a]; 1160 bh = sb_getblk(inode->i_sb, newblock); 1161 if (unlikely(!bh)) { 1162 err = -ENOMEM; 1163 goto cleanup; 1164 } 1165 lock_buffer(bh); 1166 1167 err = ext4_journal_get_create_access(handle, bh); 1168 if (err) 1169 goto cleanup; 1170 1171 neh = ext_block_hdr(bh); 1172 neh->eh_entries = cpu_to_le16(1); 1173 neh->eh_magic = EXT4_EXT_MAGIC; 1174 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); 1175 neh->eh_depth = cpu_to_le16(depth - i); 1176 fidx = EXT_FIRST_INDEX(neh); 1177 fidx->ei_block = border; 1178 ext4_idx_store_pblock(fidx, oldblock); 1179 1180 ext_debug("int.index at %d (block %llu): %u -> %llu\n", 1181 i, newblock, le32_to_cpu(border), oldblock); 1182 1183 /* move remainder of path[i] to the new index block */ 1184 if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) != 1185 EXT_LAST_INDEX(path[i].p_hdr))) { 1186 
EXT4_ERROR_INODE(inode, 1187 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!", 1188 le32_to_cpu(path[i].p_ext->ee_block)); 1189 err = -EFSCORRUPTED; 1190 goto cleanup; 1191 } 1192 /* start copy indexes */ 1193 m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++; 1194 ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, 1195 EXT_MAX_INDEX(path[i].p_hdr)); 1196 ext4_ext_show_move(inode, path, newblock, i); 1197 if (m) { 1198 memmove(++fidx, path[i].p_idx, 1199 sizeof(struct ext4_extent_idx) * m); 1200 le16_add_cpu(&neh->eh_entries, m); 1201 } 1202 ext4_extent_block_csum_set(inode, neh); 1203 set_buffer_uptodate(bh); 1204 unlock_buffer(bh); 1205 1206 err = ext4_handle_dirty_metadata(handle, inode, bh); 1207 if (err) 1208 goto cleanup; 1209 brelse(bh); 1210 bh = NULL; 1211 1212 /* correct old index */ 1213 if (m) { 1214 err = ext4_ext_get_access(handle, inode, path + i); 1215 if (err) 1216 goto cleanup; 1217 le16_add_cpu(&path[i].p_hdr->eh_entries, -m); 1218 err = ext4_ext_dirty(handle, inode, path + i); 1219 if (err) 1220 goto cleanup; 1221 } 1222 1223 i--; 1224 } 1225 1226 /* insert new index */ 1227 err = ext4_ext_insert_index(handle, inode, path + at, 1228 le32_to_cpu(border), newblock); 1229 1230 cleanup: 1231 if (bh) { 1232 if (buffer_locked(bh)) 1233 unlock_buffer(bh); 1234 brelse(bh); 1235 } 1236 1237 if (err) { 1238 /* free all allocated blocks in error case */ 1239 for (i = 0; i < depth; i++) { 1240 if (!ablocks[i]) 1241 continue; 1242 ext4_free_blocks(handle, inode, NULL, ablocks[i], 1, 1243 EXT4_FREE_BLOCKS_METADATA); 1244 } 1245 } 1246 kfree(ablocks); 1247 1248 return err; 1249 } 1250 1251 /* 1252 * ext4_ext_grow_indepth: 1253 * implements tree growing procedure: 1254 * - allocates new block 1255 * - moves top-level data (index block or leaf) into the new block 1256 * - initializes new top-level, creating index that points to the 1257 * just created block 1258 */ 1259 static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, 1260 unsigned int 
flags) 1261 { 1262 struct ext4_extent_header *neh; 1263 struct buffer_head *bh; 1264 ext4_fsblk_t newblock, goal = 0; 1265 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; 1266 int err = 0; 1267 1268 /* Try to prepend new index to old one */ 1269 if (ext_depth(inode)) 1270 goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode))); 1271 if (goal > le32_to_cpu(es->s_first_data_block)) { 1272 flags |= EXT4_MB_HINT_TRY_GOAL; 1273 goal--; 1274 } else 1275 goal = ext4_inode_to_goal_block(inode); 1276 newblock = ext4_new_meta_blocks(handle, inode, goal, flags, 1277 NULL, &err); 1278 if (newblock == 0) 1279 return err; 1280 1281 bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS); 1282 if (unlikely(!bh)) 1283 return -ENOMEM; 1284 lock_buffer(bh); 1285 1286 err = ext4_journal_get_create_access(handle, bh); 1287 if (err) { 1288 unlock_buffer(bh); 1289 goto out; 1290 } 1291 1292 /* move top-level index/leaf into new block */ 1293 memmove(bh->b_data, EXT4_I(inode)->i_data, 1294 sizeof(EXT4_I(inode)->i_data)); 1295 1296 /* set size of new block */ 1297 neh = ext_block_hdr(bh); 1298 /* old root could have indexes or leaves 1299 * so calculate e_max right way */ 1300 if (ext_depth(inode)) 1301 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); 1302 else 1303 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); 1304 neh->eh_magic = EXT4_EXT_MAGIC; 1305 ext4_extent_block_csum_set(inode, neh); 1306 set_buffer_uptodate(bh); 1307 unlock_buffer(bh); 1308 1309 err = ext4_handle_dirty_metadata(handle, inode, bh); 1310 if (err) 1311 goto out; 1312 1313 /* Update top-level index: num,max,pointer */ 1314 neh = ext_inode_hdr(inode); 1315 neh->eh_entries = cpu_to_le16(1); 1316 ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock); 1317 if (neh->eh_depth == 0) { 1318 /* Root extent block becomes index block */ 1319 neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0)); 1320 EXT_FIRST_INDEX(neh)->ei_block = 1321 
EXT_FIRST_EXTENT(neh)->ee_block; 1322 } 1323 ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n", 1324 le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), 1325 le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), 1326 ext4_idx_pblock(EXT_FIRST_INDEX(neh))); 1327 1328 le16_add_cpu(&neh->eh_depth, 1); 1329 ext4_mark_inode_dirty(handle, inode); 1330 out: 1331 brelse(bh); 1332 1333 return err; 1334 } 1335 1336 /* 1337 * ext4_ext_create_new_leaf: 1338 * finds empty index and adds new leaf. 1339 * if no free index is found, then it requests in-depth growing. 1340 */ 1341 static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, 1342 unsigned int mb_flags, 1343 unsigned int gb_flags, 1344 struct ext4_ext_path **ppath, 1345 struct ext4_extent *newext) 1346 { 1347 struct ext4_ext_path *path = *ppath; 1348 struct ext4_ext_path *curp; 1349 int depth, i, err = 0; 1350 1351 repeat: 1352 i = depth = ext_depth(inode); 1353 1354 /* walk up to the tree and look for free index entry */ 1355 curp = path + depth; 1356 while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { 1357 i--; 1358 curp--; 1359 } 1360 1361 /* we use already allocated block for index block, 1362 * so subsequent data blocks should be contiguous */ 1363 if (EXT_HAS_FREE_INDEX(curp)) { 1364 /* if we found index with free entry, then use that 1365 * entry: create all needed subtree and add new leaf */ 1366 err = ext4_ext_split(handle, inode, mb_flags, path, newext, i); 1367 if (err) 1368 goto out; 1369 1370 /* refill path */ 1371 path = ext4_find_extent(inode, 1372 (ext4_lblk_t)le32_to_cpu(newext->ee_block), 1373 ppath, gb_flags); 1374 if (IS_ERR(path)) 1375 err = PTR_ERR(path); 1376 } else { 1377 /* tree is full, time to grow in depth */ 1378 err = ext4_ext_grow_indepth(handle, inode, mb_flags); 1379 if (err) 1380 goto out; 1381 1382 /* refill path */ 1383 path = ext4_find_extent(inode, 1384 (ext4_lblk_t)le32_to_cpu(newext->ee_block), 1385 ppath, gb_flags); 1386 if (IS_ERR(path)) { 1387 err = PTR_ERR(path); 
1388 goto out; 1389 } 1390 1391 /* 1392 * only first (depth 0 -> 1) produces free space; 1393 * in all other cases we have to split the grown tree 1394 */ 1395 depth = ext_depth(inode); 1396 if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) { 1397 /* now we need to split */ 1398 goto repeat; 1399 } 1400 } 1401 1402 out: 1403 return err; 1404 } 1405 1406 /* 1407 * search the closest allocated block to the left for *logical 1408 * and returns it at @logical + it's physical address at @phys 1409 * if *logical is the smallest allocated block, the function 1410 * returns 0 at @phys 1411 * return value contains 0 (success) or error code 1412 */ 1413 static int ext4_ext_search_left(struct inode *inode, 1414 struct ext4_ext_path *path, 1415 ext4_lblk_t *logical, ext4_fsblk_t *phys) 1416 { 1417 struct ext4_extent_idx *ix; 1418 struct ext4_extent *ex; 1419 int depth, ee_len; 1420 1421 if (unlikely(path == NULL)) { 1422 EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); 1423 return -EFSCORRUPTED; 1424 } 1425 depth = path->p_depth; 1426 *phys = 0; 1427 1428 if (depth == 0 && path->p_ext == NULL) 1429 return 0; 1430 1431 /* usually extent in the path covers blocks smaller 1432 * then *logical, but it can be that extent is the 1433 * first one in the file */ 1434 1435 ex = path[depth].p_ext; 1436 ee_len = ext4_ext_get_actual_len(ex); 1437 if (*logical < le32_to_cpu(ex->ee_block)) { 1438 if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { 1439 EXT4_ERROR_INODE(inode, 1440 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!", 1441 *logical, le32_to_cpu(ex->ee_block)); 1442 return -EFSCORRUPTED; 1443 } 1444 while (--depth >= 0) { 1445 ix = path[depth].p_idx; 1446 if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { 1447 EXT4_ERROR_INODE(inode, 1448 "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!", 1449 ix != NULL ? le32_to_cpu(ix->ei_block) : 0, 1450 EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ? 
1451 le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0, 1452 depth); 1453 return -EFSCORRUPTED; 1454 } 1455 } 1456 return 0; 1457 } 1458 1459 if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { 1460 EXT4_ERROR_INODE(inode, 1461 "logical %d < ee_block %d + ee_len %d!", 1462 *logical, le32_to_cpu(ex->ee_block), ee_len); 1463 return -EFSCORRUPTED; 1464 } 1465 1466 *logical = le32_to_cpu(ex->ee_block) + ee_len - 1; 1467 *phys = ext4_ext_pblock(ex) + ee_len - 1; 1468 return 0; 1469 } 1470 1471 /* 1472 * search the closest allocated block to the right for *logical 1473 * and returns it at @logical + it's physical address at @phys 1474 * if *logical is the largest allocated block, the function 1475 * returns 0 at @phys 1476 * return value contains 0 (success) or error code 1477 */ 1478 static int ext4_ext_search_right(struct inode *inode, 1479 struct ext4_ext_path *path, 1480 ext4_lblk_t *logical, ext4_fsblk_t *phys, 1481 struct ext4_extent **ret_ex) 1482 { 1483 struct buffer_head *bh = NULL; 1484 struct ext4_extent_header *eh; 1485 struct ext4_extent_idx *ix; 1486 struct ext4_extent *ex; 1487 ext4_fsblk_t block; 1488 int depth; /* Note, NOT eh_depth; depth from top of tree */ 1489 int ee_len; 1490 1491 if (unlikely(path == NULL)) { 1492 EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); 1493 return -EFSCORRUPTED; 1494 } 1495 depth = path->p_depth; 1496 *phys = 0; 1497 1498 if (depth == 0 && path->p_ext == NULL) 1499 return 0; 1500 1501 /* usually extent in the path covers blocks smaller 1502 * then *logical, but it can be that extent is the 1503 * first one in the file */ 1504 1505 ex = path[depth].p_ext; 1506 ee_len = ext4_ext_get_actual_len(ex); 1507 if (*logical < le32_to_cpu(ex->ee_block)) { 1508 if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { 1509 EXT4_ERROR_INODE(inode, 1510 "first_extent(path[%d].p_hdr) != ex", 1511 depth); 1512 return -EFSCORRUPTED; 1513 } 1514 while (--depth >= 0) { 1515 ix = path[depth].p_idx; 
1516 if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { 1517 EXT4_ERROR_INODE(inode, 1518 "ix != EXT_FIRST_INDEX *logical %d!", 1519 *logical); 1520 return -EFSCORRUPTED; 1521 } 1522 } 1523 goto found_extent; 1524 } 1525 1526 if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { 1527 EXT4_ERROR_INODE(inode, 1528 "logical %d < ee_block %d + ee_len %d!", 1529 *logical, le32_to_cpu(ex->ee_block), ee_len); 1530 return -EFSCORRUPTED; 1531 } 1532 1533 if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { 1534 /* next allocated block in this leaf */ 1535 ex++; 1536 goto found_extent; 1537 } 1538 1539 /* go up and search for index to the right */ 1540 while (--depth >= 0) { 1541 ix = path[depth].p_idx; 1542 if (ix != EXT_LAST_INDEX(path[depth].p_hdr)) 1543 goto got_index; 1544 } 1545 1546 /* we've gone up to the root and found no index to the right */ 1547 return 0; 1548 1549 got_index: 1550 /* we've found index to the right, let's 1551 * follow it and find the closest allocated 1552 * block to the right */ 1553 ix++; 1554 block = ext4_idx_pblock(ix); 1555 while (++depth < path->p_depth) { 1556 /* subtract from p_depth to get proper eh_depth */ 1557 bh = read_extent_tree_block(inode, block, 1558 path->p_depth - depth, 0); 1559 if (IS_ERR(bh)) 1560 return PTR_ERR(bh); 1561 eh = ext_block_hdr(bh); 1562 ix = EXT_FIRST_INDEX(eh); 1563 block = ext4_idx_pblock(ix); 1564 put_bh(bh); 1565 } 1566 1567 bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0); 1568 if (IS_ERR(bh)) 1569 return PTR_ERR(bh); 1570 eh = ext_block_hdr(bh); 1571 ex = EXT_FIRST_EXTENT(eh); 1572 found_extent: 1573 *logical = le32_to_cpu(ex->ee_block); 1574 *phys = ext4_ext_pblock(ex); 1575 *ret_ex = ex; 1576 if (bh) 1577 put_bh(bh); 1578 return 0; 1579 } 1580 1581 /* 1582 * ext4_ext_next_allocated_block: 1583 * returns allocated block in subsequent extent or EXT_MAX_BLOCKS. 1584 * NOTE: it considers block number from index entry as 1585 * allocated block. 
Thus, index entries have to be consistent 1586 * with leaves. 1587 */ 1588 ext4_lblk_t 1589 ext4_ext_next_allocated_block(struct ext4_ext_path *path) 1590 { 1591 int depth; 1592 1593 BUG_ON(path == NULL); 1594 depth = path->p_depth; 1595 1596 if (depth == 0 && path->p_ext == NULL) 1597 return EXT_MAX_BLOCKS; 1598 1599 while (depth >= 0) { 1600 if (depth == path->p_depth) { 1601 /* leaf */ 1602 if (path[depth].p_ext && 1603 path[depth].p_ext != 1604 EXT_LAST_EXTENT(path[depth].p_hdr)) 1605 return le32_to_cpu(path[depth].p_ext[1].ee_block); 1606 } else { 1607 /* index */ 1608 if (path[depth].p_idx != 1609 EXT_LAST_INDEX(path[depth].p_hdr)) 1610 return le32_to_cpu(path[depth].p_idx[1].ei_block); 1611 } 1612 depth--; 1613 } 1614 1615 return EXT_MAX_BLOCKS; 1616 } 1617 1618 /* 1619 * ext4_ext_next_leaf_block: 1620 * returns first allocated block from next leaf or EXT_MAX_BLOCKS 1621 */ 1622 static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path) 1623 { 1624 int depth; 1625 1626 BUG_ON(path == NULL); 1627 depth = path->p_depth; 1628 1629 /* zero-tree has no leaf blocks at all */ 1630 if (depth == 0) 1631 return EXT_MAX_BLOCKS; 1632 1633 /* go to index block */ 1634 depth--; 1635 1636 while (depth >= 0) { 1637 if (path[depth].p_idx != 1638 EXT_LAST_INDEX(path[depth].p_hdr)) 1639 return (ext4_lblk_t) 1640 le32_to_cpu(path[depth].p_idx[1].ei_block); 1641 depth--; 1642 } 1643 1644 return EXT_MAX_BLOCKS; 1645 } 1646 1647 /* 1648 * ext4_ext_correct_indexes: 1649 * if leaf gets modified and modified extent is first in the leaf, 1650 * then we have to correct all indexes above. 1651 * TODO: do we need to correct tree in all cases? 
1652 */ 1653 static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, 1654 struct ext4_ext_path *path) 1655 { 1656 struct ext4_extent_header *eh; 1657 int depth = ext_depth(inode); 1658 struct ext4_extent *ex; 1659 __le32 border; 1660 int k, err = 0; 1661 1662 eh = path[depth].p_hdr; 1663 ex = path[depth].p_ext; 1664 1665 if (unlikely(ex == NULL || eh == NULL)) { 1666 EXT4_ERROR_INODE(inode, 1667 "ex %p == NULL or eh %p == NULL", ex, eh); 1668 return -EFSCORRUPTED; 1669 } 1670 1671 if (depth == 0) { 1672 /* there is no tree at all */ 1673 return 0; 1674 } 1675 1676 if (ex != EXT_FIRST_EXTENT(eh)) { 1677 /* we correct tree if first leaf got modified only */ 1678 return 0; 1679 } 1680 1681 /* 1682 * TODO: we need correction if border is smaller than current one 1683 */ 1684 k = depth - 1; 1685 border = path[depth].p_ext->ee_block; 1686 err = ext4_ext_get_access(handle, inode, path + k); 1687 if (err) 1688 return err; 1689 path[k].p_idx->ei_block = border; 1690 err = ext4_ext_dirty(handle, inode, path + k); 1691 if (err) 1692 return err; 1693 1694 while (k--) { 1695 /* change all left-side indexes */ 1696 if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) 1697 break; 1698 err = ext4_ext_get_access(handle, inode, path + k); 1699 if (err) 1700 break; 1701 path[k].p_idx->ei_block = border; 1702 err = ext4_ext_dirty(handle, inode, path + k); 1703 if (err) 1704 break; 1705 } 1706 1707 return err; 1708 } 1709 1710 int 1711 ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, 1712 struct ext4_extent *ex2) 1713 { 1714 unsigned short ext1_ee_len, ext2_ee_len; 1715 1716 if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2)) 1717 return 0; 1718 1719 ext1_ee_len = ext4_ext_get_actual_len(ex1); 1720 ext2_ee_len = ext4_ext_get_actual_len(ex2); 1721 1722 if (le32_to_cpu(ex1->ee_block) + ext1_ee_len != 1723 le32_to_cpu(ex2->ee_block)) 1724 return 0; 1725 1726 /* 1727 * To allow future support for preallocated extents to be added 1728 
* as an RO_COMPAT feature, refuse to merge to extents if 1729 * this can result in the top bit of ee_len being set. 1730 */ 1731 if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) 1732 return 0; 1733 /* 1734 * The check for IO to unwritten extent is somewhat racy as we 1735 * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after 1736 * dropping i_data_sem. But reserved blocks should save us in that 1737 * case. 1738 */ 1739 if (ext4_ext_is_unwritten(ex1) && 1740 (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) || 1741 atomic_read(&EXT4_I(inode)->i_unwritten) || 1742 (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN))) 1743 return 0; 1744 #ifdef AGGRESSIVE_TEST 1745 if (ext1_ee_len >= 4) 1746 return 0; 1747 #endif 1748 1749 if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2)) 1750 return 1; 1751 return 0; 1752 } 1753 1754 /* 1755 * This function tries to merge the "ex" extent to the next extent in the tree. 1756 * It always tries to merge towards right. If you want to merge towards 1757 * left, pass "ex - 1" as argument instead of "ex". 1758 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns 1759 * 1 if they got merged. 1760 */ 1761 static int ext4_ext_try_to_merge_right(struct inode *inode, 1762 struct ext4_ext_path *path, 1763 struct ext4_extent *ex) 1764 { 1765 struct ext4_extent_header *eh; 1766 unsigned int depth, len; 1767 int merge_done = 0, unwritten; 1768 1769 depth = ext_depth(inode); 1770 BUG_ON(path[depth].p_hdr == NULL); 1771 eh = path[depth].p_hdr; 1772 1773 while (ex < EXT_LAST_EXTENT(eh)) { 1774 if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) 1775 break; 1776 /* merge with next extent! 
*/ 1777 unwritten = ext4_ext_is_unwritten(ex); 1778 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) 1779 + ext4_ext_get_actual_len(ex + 1)); 1780 if (unwritten) 1781 ext4_ext_mark_unwritten(ex); 1782 1783 if (ex + 1 < EXT_LAST_EXTENT(eh)) { 1784 len = (EXT_LAST_EXTENT(eh) - ex - 1) 1785 * sizeof(struct ext4_extent); 1786 memmove(ex + 1, ex + 2, len); 1787 } 1788 le16_add_cpu(&eh->eh_entries, -1); 1789 merge_done = 1; 1790 WARN_ON(eh->eh_entries == 0); 1791 if (!eh->eh_entries) 1792 EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!"); 1793 } 1794 1795 return merge_done; 1796 } 1797 1798 /* 1799 * This function does a very simple check to see if we can collapse 1800 * an extent tree with a single extent tree leaf block into the inode. 1801 */ 1802 static void ext4_ext_try_to_merge_up(handle_t *handle, 1803 struct inode *inode, 1804 struct ext4_ext_path *path) 1805 { 1806 size_t s; 1807 unsigned max_root = ext4_ext_space_root(inode, 0); 1808 ext4_fsblk_t blk; 1809 1810 if ((path[0].p_depth != 1) || 1811 (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) || 1812 (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root)) 1813 return; 1814 1815 /* 1816 * We need to modify the block allocation bitmap and the block 1817 * group descriptor to release the extent tree block. If we 1818 * can't get the journal credits, give up. 
1819 */ 1820 if (ext4_journal_extend(handle, 2)) 1821 return; 1822 1823 /* 1824 * Copy the extent data up to the inode 1825 */ 1826 blk = ext4_idx_pblock(path[0].p_idx); 1827 s = le16_to_cpu(path[1].p_hdr->eh_entries) * 1828 sizeof(struct ext4_extent_idx); 1829 s += sizeof(struct ext4_extent_header); 1830 1831 path[1].p_maxdepth = path[0].p_maxdepth; 1832 memcpy(path[0].p_hdr, path[1].p_hdr, s); 1833 path[0].p_depth = 0; 1834 path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) + 1835 (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr)); 1836 path[0].p_hdr->eh_max = cpu_to_le16(max_root); 1837 1838 brelse(path[1].p_bh); 1839 ext4_free_blocks(handle, inode, NULL, blk, 1, 1840 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); 1841 } 1842 1843 /* 1844 * This function tries to merge the @ex extent to neighbours in the tree. 1845 * return 1 if merge left else 0. 1846 */ 1847 static void ext4_ext_try_to_merge(handle_t *handle, 1848 struct inode *inode, 1849 struct ext4_ext_path *path, 1850 struct ext4_extent *ex) { 1851 struct ext4_extent_header *eh; 1852 unsigned int depth; 1853 int merge_done = 0; 1854 1855 depth = ext_depth(inode); 1856 BUG_ON(path[depth].p_hdr == NULL); 1857 eh = path[depth].p_hdr; 1858 1859 if (ex > EXT_FIRST_EXTENT(eh)) 1860 merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1); 1861 1862 if (!merge_done) 1863 (void) ext4_ext_try_to_merge_right(inode, path, ex); 1864 1865 ext4_ext_try_to_merge_up(handle, inode, path); 1866 } 1867 1868 /* 1869 * check if a portion of the "newext" extent overlaps with an 1870 * existing extent. 1871 * 1872 * If there is an overlap discovered, it updates the length of the newext 1873 * such that there will be no overlap, and then returns 1. 1874 * If there is no overlap found, it returns 0. 
1875 */ 1876 static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi, 1877 struct inode *inode, 1878 struct ext4_extent *newext, 1879 struct ext4_ext_path *path) 1880 { 1881 ext4_lblk_t b1, b2; 1882 unsigned int depth, len1; 1883 unsigned int ret = 0; 1884 1885 b1 = le32_to_cpu(newext->ee_block); 1886 len1 = ext4_ext_get_actual_len(newext); 1887 depth = ext_depth(inode); 1888 if (!path[depth].p_ext) 1889 goto out; 1890 b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block)); 1891 1892 /* 1893 * get the next allocated block if the extent in the path 1894 * is before the requested block(s) 1895 */ 1896 if (b2 < b1) { 1897 b2 = ext4_ext_next_allocated_block(path); 1898 if (b2 == EXT_MAX_BLOCKS) 1899 goto out; 1900 b2 = EXT4_LBLK_CMASK(sbi, b2); 1901 } 1902 1903 /* check for wrap through zero on extent logical start block*/ 1904 if (b1 + len1 < b1) { 1905 len1 = EXT_MAX_BLOCKS - b1; 1906 newext->ee_len = cpu_to_le16(len1); 1907 ret = 1; 1908 } 1909 1910 /* check for overlap */ 1911 if (b1 + len1 > b2) { 1912 newext->ee_len = cpu_to_le16(b2 - b1); 1913 ret = 1; 1914 } 1915 out: 1916 return ret; 1917 } 1918 1919 /* 1920 * ext4_ext_insert_extent: 1921 * tries to merge requsted extent into the existing extent or 1922 * inserts requested extent as new one into the tree, 1923 * creating new leaf in the no-space case. 
1924 */ 1925 int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, 1926 struct ext4_ext_path **ppath, 1927 struct ext4_extent *newext, int gb_flags) 1928 { 1929 struct ext4_ext_path *path = *ppath; 1930 struct ext4_extent_header *eh; 1931 struct ext4_extent *ex, *fex; 1932 struct ext4_extent *nearex; /* nearest extent */ 1933 struct ext4_ext_path *npath = NULL; 1934 int depth, len, err; 1935 ext4_lblk_t next; 1936 int mb_flags = 0, unwritten; 1937 1938 if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 1939 mb_flags |= EXT4_MB_DELALLOC_RESERVED; 1940 if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { 1941 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); 1942 return -EFSCORRUPTED; 1943 } 1944 depth = ext_depth(inode); 1945 ex = path[depth].p_ext; 1946 eh = path[depth].p_hdr; 1947 if (unlikely(path[depth].p_hdr == NULL)) { 1948 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); 1949 return -EFSCORRUPTED; 1950 } 1951 1952 /* try to insert block into found extent and return */ 1953 if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) { 1954 1955 /* 1956 * Try to see whether we should rather test the extent on 1957 * right from ex, or from the left of ex. This is because 1958 * ext4_find_extent() can return either extent on the 1959 * left, or on the right from the searched position. This 1960 * will make merging more effective. 
1961 */ 1962 if (ex < EXT_LAST_EXTENT(eh) && 1963 (le32_to_cpu(ex->ee_block) + 1964 ext4_ext_get_actual_len(ex) < 1965 le32_to_cpu(newext->ee_block))) { 1966 ex += 1; 1967 goto prepend; 1968 } else if ((ex > EXT_FIRST_EXTENT(eh)) && 1969 (le32_to_cpu(newext->ee_block) + 1970 ext4_ext_get_actual_len(newext) < 1971 le32_to_cpu(ex->ee_block))) 1972 ex -= 1; 1973 1974 /* Try to append newex to the ex */ 1975 if (ext4_can_extents_be_merged(inode, ex, newext)) { 1976 ext_debug("append [%d]%d block to %u:[%d]%d" 1977 "(from %llu)\n", 1978 ext4_ext_is_unwritten(newext), 1979 ext4_ext_get_actual_len(newext), 1980 le32_to_cpu(ex->ee_block), 1981 ext4_ext_is_unwritten(ex), 1982 ext4_ext_get_actual_len(ex), 1983 ext4_ext_pblock(ex)); 1984 err = ext4_ext_get_access(handle, inode, 1985 path + depth); 1986 if (err) 1987 return err; 1988 unwritten = ext4_ext_is_unwritten(ex); 1989 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) 1990 + ext4_ext_get_actual_len(newext)); 1991 if (unwritten) 1992 ext4_ext_mark_unwritten(ex); 1993 eh = path[depth].p_hdr; 1994 nearex = ex; 1995 goto merge; 1996 } 1997 1998 prepend: 1999 /* Try to prepend newex to the ex */ 2000 if (ext4_can_extents_be_merged(inode, newext, ex)) { 2001 ext_debug("prepend %u[%d]%d block to %u:[%d]%d" 2002 "(from %llu)\n", 2003 le32_to_cpu(newext->ee_block), 2004 ext4_ext_is_unwritten(newext), 2005 ext4_ext_get_actual_len(newext), 2006 le32_to_cpu(ex->ee_block), 2007 ext4_ext_is_unwritten(ex), 2008 ext4_ext_get_actual_len(ex), 2009 ext4_ext_pblock(ex)); 2010 err = ext4_ext_get_access(handle, inode, 2011 path + depth); 2012 if (err) 2013 return err; 2014 2015 unwritten = ext4_ext_is_unwritten(ex); 2016 ex->ee_block = newext->ee_block; 2017 ext4_ext_store_pblock(ex, ext4_ext_pblock(newext)); 2018 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) 2019 + ext4_ext_get_actual_len(newext)); 2020 if (unwritten) 2021 ext4_ext_mark_unwritten(ex); 2022 eh = path[depth].p_hdr; 2023 nearex = ex; 2024 goto merge; 2025 } 2026 } 
2027 2028 depth = ext_depth(inode); 2029 eh = path[depth].p_hdr; 2030 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) 2031 goto has_space; 2032 2033 /* probably next leaf has space for us? */ 2034 fex = EXT_LAST_EXTENT(eh); 2035 next = EXT_MAX_BLOCKS; 2036 if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)) 2037 next = ext4_ext_next_leaf_block(path); 2038 if (next != EXT_MAX_BLOCKS) { 2039 ext_debug("next leaf block - %u\n", next); 2040 BUG_ON(npath != NULL); 2041 npath = ext4_find_extent(inode, next, NULL, 0); 2042 if (IS_ERR(npath)) 2043 return PTR_ERR(npath); 2044 BUG_ON(npath->p_depth != path->p_depth); 2045 eh = npath[depth].p_hdr; 2046 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) { 2047 ext_debug("next leaf isn't full(%d)\n", 2048 le16_to_cpu(eh->eh_entries)); 2049 path = npath; 2050 goto has_space; 2051 } 2052 ext_debug("next leaf has no free space(%d,%d)\n", 2053 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); 2054 } 2055 2056 /* 2057 * There is no free space in the found leaf. 2058 * We're gonna add a new leaf in the tree. 
2059 */ 2060 if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) 2061 mb_flags |= EXT4_MB_USE_RESERVED; 2062 err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags, 2063 ppath, newext); 2064 if (err) 2065 goto cleanup; 2066 depth = ext_depth(inode); 2067 eh = path[depth].p_hdr; 2068 2069 has_space: 2070 nearex = path[depth].p_ext; 2071 2072 err = ext4_ext_get_access(handle, inode, path + depth); 2073 if (err) 2074 goto cleanup; 2075 2076 if (!nearex) { 2077 /* there is no extent in this leaf, create first one */ 2078 ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n", 2079 le32_to_cpu(newext->ee_block), 2080 ext4_ext_pblock(newext), 2081 ext4_ext_is_unwritten(newext), 2082 ext4_ext_get_actual_len(newext)); 2083 nearex = EXT_FIRST_EXTENT(eh); 2084 } else { 2085 if (le32_to_cpu(newext->ee_block) 2086 > le32_to_cpu(nearex->ee_block)) { 2087 /* Insert after */ 2088 ext_debug("insert %u:%llu:[%d]%d before: " 2089 "nearest %p\n", 2090 le32_to_cpu(newext->ee_block), 2091 ext4_ext_pblock(newext), 2092 ext4_ext_is_unwritten(newext), 2093 ext4_ext_get_actual_len(newext), 2094 nearex); 2095 nearex++; 2096 } else { 2097 /* Insert before */ 2098 BUG_ON(newext->ee_block == nearex->ee_block); 2099 ext_debug("insert %u:%llu:[%d]%d after: " 2100 "nearest %p\n", 2101 le32_to_cpu(newext->ee_block), 2102 ext4_ext_pblock(newext), 2103 ext4_ext_is_unwritten(newext), 2104 ext4_ext_get_actual_len(newext), 2105 nearex); 2106 } 2107 len = EXT_LAST_EXTENT(eh) - nearex + 1; 2108 if (len > 0) { 2109 ext_debug("insert %u:%llu:[%d]%d: " 2110 "move %d extents from 0x%p to 0x%p\n", 2111 le32_to_cpu(newext->ee_block), 2112 ext4_ext_pblock(newext), 2113 ext4_ext_is_unwritten(newext), 2114 ext4_ext_get_actual_len(newext), 2115 len, nearex, nearex + 1); 2116 memmove(nearex + 1, nearex, 2117 len * sizeof(struct ext4_extent)); 2118 } 2119 } 2120 2121 le16_add_cpu(&eh->eh_entries, 1); 2122 path[depth].p_ext = nearex; 2123 nearex->ee_block = newext->ee_block; 2124 ext4_ext_store_pblock(nearex, 
ext4_ext_pblock(newext)); 2125 nearex->ee_len = newext->ee_len; 2126 2127 merge: 2128 /* try to merge extents */ 2129 if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) 2130 ext4_ext_try_to_merge(handle, inode, path, nearex); 2131 2132 2133 /* time to correct all indexes above */ 2134 err = ext4_ext_correct_indexes(handle, inode, path); 2135 if (err) 2136 goto cleanup; 2137 2138 err = ext4_ext_dirty(handle, inode, path + path->p_depth); 2139 2140 cleanup: 2141 ext4_ext_drop_refs(npath); 2142 kfree(npath); 2143 return err; 2144 } 2145 2146 static int ext4_fill_fiemap_extents(struct inode *inode, 2147 ext4_lblk_t block, ext4_lblk_t num, 2148 struct fiemap_extent_info *fieinfo) 2149 { 2150 struct ext4_ext_path *path = NULL; 2151 struct ext4_extent *ex; 2152 struct extent_status es; 2153 ext4_lblk_t next, next_del, start = 0, end = 0; 2154 ext4_lblk_t last = block + num; 2155 int exists, depth = 0, err = 0; 2156 unsigned int flags = 0; 2157 unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; 2158 2159 while (block < last && block != EXT_MAX_BLOCKS) { 2160 num = last - block; 2161 /* find extent for this block */ 2162 down_read(&EXT4_I(inode)->i_data_sem); 2163 2164 path = ext4_find_extent(inode, block, &path, 0); 2165 if (IS_ERR(path)) { 2166 up_read(&EXT4_I(inode)->i_data_sem); 2167 err = PTR_ERR(path); 2168 path = NULL; 2169 break; 2170 } 2171 2172 depth = ext_depth(inode); 2173 if (unlikely(path[depth].p_hdr == NULL)) { 2174 up_read(&EXT4_I(inode)->i_data_sem); 2175 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); 2176 err = -EFSCORRUPTED; 2177 break; 2178 } 2179 ex = path[depth].p_ext; 2180 next = ext4_ext_next_allocated_block(path); 2181 2182 flags = 0; 2183 exists = 0; 2184 if (!ex) { 2185 /* there is no extent yet, so try to allocate 2186 * all requested space */ 2187 start = block; 2188 end = block + num; 2189 } else if (le32_to_cpu(ex->ee_block) > block) { 2190 /* need to allocate space before found extent */ 2191 start = block; 2192 end = 
le32_to_cpu(ex->ee_block); 2193 if (block + num < end) 2194 end = block + num; 2195 } else if (block >= le32_to_cpu(ex->ee_block) 2196 + ext4_ext_get_actual_len(ex)) { 2197 /* need to allocate space after found extent */ 2198 start = block; 2199 end = block + num; 2200 if (end >= next) 2201 end = next; 2202 } else if (block >= le32_to_cpu(ex->ee_block)) { 2203 /* 2204 * some part of requested space is covered 2205 * by found extent 2206 */ 2207 start = block; 2208 end = le32_to_cpu(ex->ee_block) 2209 + ext4_ext_get_actual_len(ex); 2210 if (block + num < end) 2211 end = block + num; 2212 exists = 1; 2213 } else { 2214 BUG(); 2215 } 2216 BUG_ON(end <= start); 2217 2218 if (!exists) { 2219 es.es_lblk = start; 2220 es.es_len = end - start; 2221 es.es_pblk = 0; 2222 } else { 2223 es.es_lblk = le32_to_cpu(ex->ee_block); 2224 es.es_len = ext4_ext_get_actual_len(ex); 2225 es.es_pblk = ext4_ext_pblock(ex); 2226 if (ext4_ext_is_unwritten(ex)) 2227 flags |= FIEMAP_EXTENT_UNWRITTEN; 2228 } 2229 2230 /* 2231 * Find delayed extent and update es accordingly. We call 2232 * it even in !exists case to find out whether es is the 2233 * last existing extent or not. 2234 */ 2235 next_del = ext4_find_delayed_extent(inode, &es); 2236 if (!exists && next_del) { 2237 exists = 1; 2238 flags |= (FIEMAP_EXTENT_DELALLOC | 2239 FIEMAP_EXTENT_UNKNOWN); 2240 } 2241 up_read(&EXT4_I(inode)->i_data_sem); 2242 2243 if (unlikely(es.es_len == 0)) { 2244 EXT4_ERROR_INODE(inode, "es.es_len == 0"); 2245 err = -EFSCORRUPTED; 2246 break; 2247 } 2248 2249 /* 2250 * This is possible iff next == next_del == EXT_MAX_BLOCKS. 2251 * we need to check next == EXT_MAX_BLOCKS because it is 2252 * possible that an extent is with unwritten and delayed 2253 * status due to when an extent is delayed allocated and 2254 * is allocated by fallocate status tree will track both of 2255 * them in a extent. 2256 * 2257 * So we could return a unwritten and delayed extent, and 2258 * its block is equal to 'next'. 
2259 */ 2260 if (next == next_del && next == EXT_MAX_BLOCKS) { 2261 flags |= FIEMAP_EXTENT_LAST; 2262 if (unlikely(next_del != EXT_MAX_BLOCKS || 2263 next != EXT_MAX_BLOCKS)) { 2264 EXT4_ERROR_INODE(inode, 2265 "next extent == %u, next " 2266 "delalloc extent = %u", 2267 next, next_del); 2268 err = -EFSCORRUPTED; 2269 break; 2270 } 2271 } 2272 2273 if (exists) { 2274 err = fiemap_fill_next_extent(fieinfo, 2275 (__u64)es.es_lblk << blksize_bits, 2276 (__u64)es.es_pblk << blksize_bits, 2277 (__u64)es.es_len << blksize_bits, 2278 flags); 2279 if (err < 0) 2280 break; 2281 if (err == 1) { 2282 err = 0; 2283 break; 2284 } 2285 } 2286 2287 block = es.es_lblk + es.es_len; 2288 } 2289 2290 ext4_ext_drop_refs(path); 2291 kfree(path); 2292 return err; 2293 } 2294 2295 /* 2296 * ext4_ext_determine_hole - determine hole around given block 2297 * @inode: inode we lookup in 2298 * @path: path in extent tree to @lblk 2299 * @lblk: pointer to logical block around which we want to determine hole 2300 * 2301 * Determine hole length (and start if easily possible) around given logical 2302 * block. We don't try too hard to find the beginning of the hole but @path 2303 * actually points to extent before @lblk, we provide it. 2304 * 2305 * The function returns the length of a hole starting at @lblk. We update @lblk 2306 * to the beginning of the hole if we managed to find it. 
2307 */ 2308 static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode, 2309 struct ext4_ext_path *path, 2310 ext4_lblk_t *lblk) 2311 { 2312 int depth = ext_depth(inode); 2313 struct ext4_extent *ex; 2314 ext4_lblk_t len; 2315 2316 ex = path[depth].p_ext; 2317 if (ex == NULL) { 2318 /* there is no extent yet, so gap is [0;-] */ 2319 *lblk = 0; 2320 len = EXT_MAX_BLOCKS; 2321 } else if (*lblk < le32_to_cpu(ex->ee_block)) { 2322 len = le32_to_cpu(ex->ee_block) - *lblk; 2323 } else if (*lblk >= le32_to_cpu(ex->ee_block) 2324 + ext4_ext_get_actual_len(ex)) { 2325 ext4_lblk_t next; 2326 2327 *lblk = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex); 2328 next = ext4_ext_next_allocated_block(path); 2329 BUG_ON(next == *lblk); 2330 len = next - *lblk; 2331 } else { 2332 BUG(); 2333 } 2334 return len; 2335 } 2336 2337 /* 2338 * ext4_ext_put_gap_in_cache: 2339 * calculate boundaries of the gap that the requested block fits into 2340 * and cache this gap 2341 */ 2342 static void 2343 ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start, 2344 ext4_lblk_t hole_len) 2345 { 2346 struct extent_status es; 2347 2348 ext4_es_find_delayed_extent_range(inode, hole_start, 2349 hole_start + hole_len - 1, &es); 2350 if (es.es_len) { 2351 /* There's delayed extent containing lblock? */ 2352 if (es.es_lblk <= hole_start) 2353 return; 2354 hole_len = min(es.es_lblk - hole_start, hole_len); 2355 } 2356 ext_debug(" -> %u:%u\n", hole_start, hole_len); 2357 ext4_es_insert_extent(inode, hole_start, hole_len, ~0, 2358 EXTENT_STATUS_HOLE); 2359 } 2360 2361 /* 2362 * ext4_ext_rm_idx: 2363 * removes index from the index block. 
2364 */ 2365 static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, 2366 struct ext4_ext_path *path, int depth) 2367 { 2368 int err; 2369 ext4_fsblk_t leaf; 2370 2371 /* free index block */ 2372 depth--; 2373 path = path + depth; 2374 leaf = ext4_idx_pblock(path->p_idx); 2375 if (unlikely(path->p_hdr->eh_entries == 0)) { 2376 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); 2377 return -EFSCORRUPTED; 2378 } 2379 err = ext4_ext_get_access(handle, inode, path); 2380 if (err) 2381 return err; 2382 2383 if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) { 2384 int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx; 2385 len *= sizeof(struct ext4_extent_idx); 2386 memmove(path->p_idx, path->p_idx + 1, len); 2387 } 2388 2389 le16_add_cpu(&path->p_hdr->eh_entries, -1); 2390 err = ext4_ext_dirty(handle, inode, path); 2391 if (err) 2392 return err; 2393 ext_debug("index is empty, remove it, free block %llu\n", leaf); 2394 trace_ext4_ext_rm_idx(inode, leaf); 2395 2396 ext4_free_blocks(handle, inode, NULL, leaf, 1, 2397 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); 2398 2399 while (--depth >= 0) { 2400 if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr)) 2401 break; 2402 path--; 2403 err = ext4_ext_get_access(handle, inode, path); 2404 if (err) 2405 break; 2406 path->p_idx->ei_block = (path+1)->p_idx->ei_block; 2407 err = ext4_ext_dirty(handle, inode, path); 2408 if (err) 2409 break; 2410 } 2411 return err; 2412 } 2413 2414 /* 2415 * ext4_ext_calc_credits_for_single_extent: 2416 * This routine returns max. credits that needed to insert an extent 2417 * to the extent tree. 2418 * When pass the actual path, the caller should calculate credits 2419 * under i_data_sem. 2420 */ 2421 int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, 2422 struct ext4_ext_path *path) 2423 { 2424 if (path) { 2425 int depth = ext_depth(inode); 2426 int ret = 0; 2427 2428 /* probably there is space in leaf? 
*/ 2429 if (le16_to_cpu(path[depth].p_hdr->eh_entries) 2430 < le16_to_cpu(path[depth].p_hdr->eh_max)) { 2431 2432 /* 2433 * There are some space in the leaf tree, no 2434 * need to account for leaf block credit 2435 * 2436 * bitmaps and block group descriptor blocks 2437 * and other metadata blocks still need to be 2438 * accounted. 2439 */ 2440 /* 1 bitmap, 1 block group descriptor */ 2441 ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); 2442 return ret; 2443 } 2444 } 2445 2446 return ext4_chunk_trans_blocks(inode, nrblocks); 2447 } 2448 2449 /* 2450 * How many index/leaf blocks need to change/allocate to add @extents extents? 2451 * 2452 * If we add a single extent, then in the worse case, each tree level 2453 * index/leaf need to be changed in case of the tree split. 2454 * 2455 * If more extents are inserted, they could cause the whole tree split more 2456 * than once, but this is really rare. 2457 */ 2458 int ext4_ext_index_trans_blocks(struct inode *inode, int extents) 2459 { 2460 int index; 2461 int depth; 2462 2463 /* If we are converting the inline data, only one is needed here. 
*/ 2464 if (ext4_has_inline_data(inode)) 2465 return 1; 2466 2467 depth = ext_depth(inode); 2468 2469 if (extents <= 1) 2470 index = depth * 2; 2471 else 2472 index = depth * 3; 2473 2474 return index; 2475 } 2476 2477 static inline int get_default_free_blocks_flags(struct inode *inode) 2478 { 2479 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) || 2480 ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE)) 2481 return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; 2482 else if (ext4_should_journal_data(inode)) 2483 return EXT4_FREE_BLOCKS_FORGET; 2484 return 0; 2485 } 2486 2487 static int ext4_remove_blocks(handle_t *handle, struct inode *inode, 2488 struct ext4_extent *ex, 2489 long long *partial_cluster, 2490 ext4_lblk_t from, ext4_lblk_t to) 2491 { 2492 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2493 unsigned short ee_len = ext4_ext_get_actual_len(ex); 2494 ext4_fsblk_t pblk; 2495 int flags = get_default_free_blocks_flags(inode); 2496 2497 /* 2498 * For bigalloc file systems, we never free a partial cluster 2499 * at the beginning of the extent. Instead, we make a note 2500 * that we tried freeing the cluster, and check to see if we 2501 * need to free it on a subsequent call to ext4_remove_blocks, 2502 * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space. 2503 */ 2504 flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; 2505 2506 trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster); 2507 /* 2508 * If we have a partial cluster, and it's different from the 2509 * cluster of the last block, we need to explicitly free the 2510 * partial cluster here. 
2511 */ 2512 pblk = ext4_ext_pblock(ex) + ee_len - 1; 2513 if (*partial_cluster > 0 && 2514 *partial_cluster != (long long) EXT4_B2C(sbi, pblk)) { 2515 ext4_free_blocks(handle, inode, NULL, 2516 EXT4_C2B(sbi, *partial_cluster), 2517 sbi->s_cluster_ratio, flags); 2518 *partial_cluster = 0; 2519 } 2520 2521 #ifdef EXTENTS_STATS 2522 { 2523 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2524 spin_lock(&sbi->s_ext_stats_lock); 2525 sbi->s_ext_blocks += ee_len; 2526 sbi->s_ext_extents++; 2527 if (ee_len < sbi->s_ext_min) 2528 sbi->s_ext_min = ee_len; 2529 if (ee_len > sbi->s_ext_max) 2530 sbi->s_ext_max = ee_len; 2531 if (ext_depth(inode) > sbi->s_depth_max) 2532 sbi->s_depth_max = ext_depth(inode); 2533 spin_unlock(&sbi->s_ext_stats_lock); 2534 } 2535 #endif 2536 if (from >= le32_to_cpu(ex->ee_block) 2537 && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { 2538 /* tail removal */ 2539 ext4_lblk_t num; 2540 long long first_cluster; 2541 2542 num = le32_to_cpu(ex->ee_block) + ee_len - from; 2543 pblk = ext4_ext_pblock(ex) + ee_len - num; 2544 /* 2545 * Usually we want to free partial cluster at the end of the 2546 * extent, except for the situation when the cluster is still 2547 * used by any other extent (partial_cluster is negative). 2548 */ 2549 if (*partial_cluster < 0 && 2550 *partial_cluster == -(long long) EXT4_B2C(sbi, pblk+num-1)) 2551 flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; 2552 2553 ext_debug("free last %u blocks starting %llu partial %lld\n", 2554 num, pblk, *partial_cluster); 2555 ext4_free_blocks(handle, inode, NULL, pblk, num, flags); 2556 /* 2557 * If the block range to be freed didn't start at the 2558 * beginning of a cluster, and we removed the entire 2559 * extent and the cluster is not used by any other extent, 2560 * save the partial cluster here, since we might need to 2561 * delete if we determine that the truncate or punch hole 2562 * operation has removed all of the blocks in the cluster. 
2563 * If that cluster is used by another extent, preserve its 2564 * negative value so it isn't freed later on. 2565 * 2566 * If the whole extent wasn't freed, we've reached the 2567 * start of the truncated/punched region and have finished 2568 * removing blocks. If there's a partial cluster here it's 2569 * shared with the remainder of the extent and is no longer 2570 * a candidate for removal. 2571 */ 2572 if (EXT4_PBLK_COFF(sbi, pblk) && ee_len == num) { 2573 first_cluster = (long long) EXT4_B2C(sbi, pblk); 2574 if (first_cluster != -*partial_cluster) 2575 *partial_cluster = first_cluster; 2576 } else { 2577 *partial_cluster = 0; 2578 } 2579 } else 2580 ext4_error(sbi->s_sb, "strange request: removal(2) " 2581 "%u-%u from %u:%u", 2582 from, to, le32_to_cpu(ex->ee_block), ee_len); 2583 return 0; 2584 } 2585 2586 2587 /* 2588 * ext4_ext_rm_leaf() Removes the extents associated with the 2589 * blocks appearing between "start" and "end". Both "start" 2590 * and "end" must appear in the same extent or EIO is returned. 2591 * 2592 * @handle: The journal handle 2593 * @inode: The files inode 2594 * @path: The path to the leaf 2595 * @partial_cluster: The cluster which we'll have to free if all extents 2596 * has been released from it. However, if this value is 2597 * negative, it's a cluster just to the right of the 2598 * punched region and it must not be freed. 
2599 * @start: The first block to remove 2600 * @end: The last block to remove 2601 */ 2602 static int 2603 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, 2604 struct ext4_ext_path *path, 2605 long long *partial_cluster, 2606 ext4_lblk_t start, ext4_lblk_t end) 2607 { 2608 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2609 int err = 0, correct_index = 0; 2610 int depth = ext_depth(inode), credits; 2611 struct ext4_extent_header *eh; 2612 ext4_lblk_t a, b; 2613 unsigned num; 2614 ext4_lblk_t ex_ee_block; 2615 unsigned short ex_ee_len; 2616 unsigned unwritten = 0; 2617 struct ext4_extent *ex; 2618 ext4_fsblk_t pblk; 2619 2620 /* the header must be checked already in ext4_ext_remove_space() */ 2621 ext_debug("truncate since %u in leaf to %u\n", start, end); 2622 if (!path[depth].p_hdr) 2623 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); 2624 eh = path[depth].p_hdr; 2625 if (unlikely(path[depth].p_hdr == NULL)) { 2626 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); 2627 return -EFSCORRUPTED; 2628 } 2629 /* find where to start removing */ 2630 ex = path[depth].p_ext; 2631 if (!ex) 2632 ex = EXT_LAST_EXTENT(eh); 2633 2634 ex_ee_block = le32_to_cpu(ex->ee_block); 2635 ex_ee_len = ext4_ext_get_actual_len(ex); 2636 2637 trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); 2638 2639 while (ex >= EXT_FIRST_EXTENT(eh) && 2640 ex_ee_block + ex_ee_len > start) { 2641 2642 if (ext4_ext_is_unwritten(ex)) 2643 unwritten = 1; 2644 else 2645 unwritten = 0; 2646 2647 ext_debug("remove ext %u:[%d]%d\n", ex_ee_block, 2648 unwritten, ex_ee_len); 2649 path[depth].p_ext = ex; 2650 2651 a = ex_ee_block > start ? ex_ee_block : start; 2652 b = ex_ee_block+ex_ee_len - 1 < end ? 
2653 ex_ee_block+ex_ee_len - 1 : end; 2654 2655 ext_debug(" border %u:%u\n", a, b); 2656 2657 /* If this extent is beyond the end of the hole, skip it */ 2658 if (end < ex_ee_block) { 2659 /* 2660 * We're going to skip this extent and move to another, 2661 * so note that its first cluster is in use to avoid 2662 * freeing it when removing blocks. Eventually, the 2663 * right edge of the truncated/punched region will 2664 * be just to the left. 2665 */ 2666 if (sbi->s_cluster_ratio > 1) { 2667 pblk = ext4_ext_pblock(ex); 2668 *partial_cluster = 2669 -(long long) EXT4_B2C(sbi, pblk); 2670 } 2671 ex--; 2672 ex_ee_block = le32_to_cpu(ex->ee_block); 2673 ex_ee_len = ext4_ext_get_actual_len(ex); 2674 continue; 2675 } else if (b != ex_ee_block + ex_ee_len - 1) { 2676 EXT4_ERROR_INODE(inode, 2677 "can not handle truncate %u:%u " 2678 "on extent %u:%u", 2679 start, end, ex_ee_block, 2680 ex_ee_block + ex_ee_len - 1); 2681 err = -EFSCORRUPTED; 2682 goto out; 2683 } else if (a != ex_ee_block) { 2684 /* remove tail of the extent */ 2685 num = a - ex_ee_block; 2686 } else { 2687 /* remove whole extent: excellent! 
*/ 2688 num = 0; 2689 } 2690 /* 2691 * 3 for leaf, sb, and inode plus 2 (bmap and group 2692 * descriptor) for each block group; assume two block 2693 * groups plus ex_ee_len/blocks_per_block_group for 2694 * the worst case 2695 */ 2696 credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb)); 2697 if (ex == EXT_FIRST_EXTENT(eh)) { 2698 correct_index = 1; 2699 credits += (ext_depth(inode)) + 1; 2700 } 2701 credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); 2702 2703 err = ext4_ext_truncate_extend_restart(handle, inode, credits); 2704 if (err) 2705 goto out; 2706 2707 err = ext4_ext_get_access(handle, inode, path + depth); 2708 if (err) 2709 goto out; 2710 2711 err = ext4_remove_blocks(handle, inode, ex, partial_cluster, 2712 a, b); 2713 if (err) 2714 goto out; 2715 2716 if (num == 0) 2717 /* this extent is removed; mark slot entirely unused */ 2718 ext4_ext_store_pblock(ex, 0); 2719 2720 ex->ee_len = cpu_to_le16(num); 2721 /* 2722 * Do not mark unwritten if all the blocks in the 2723 * extent have been removed. 
2724 */ 2725 if (unwritten && num) 2726 ext4_ext_mark_unwritten(ex); 2727 /* 2728 * If the extent was completely released, 2729 * we need to remove it from the leaf 2730 */ 2731 if (num == 0) { 2732 if (end != EXT_MAX_BLOCKS - 1) { 2733 /* 2734 * For hole punching, we need to scoot all the 2735 * extents up when an extent is removed so that 2736 * we dont have blank extents in the middle 2737 */ 2738 memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) * 2739 sizeof(struct ext4_extent)); 2740 2741 /* Now get rid of the one at the end */ 2742 memset(EXT_LAST_EXTENT(eh), 0, 2743 sizeof(struct ext4_extent)); 2744 } 2745 le16_add_cpu(&eh->eh_entries, -1); 2746 } 2747 2748 err = ext4_ext_dirty(handle, inode, path + depth); 2749 if (err) 2750 goto out; 2751 2752 ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num, 2753 ext4_ext_pblock(ex)); 2754 ex--; 2755 ex_ee_block = le32_to_cpu(ex->ee_block); 2756 ex_ee_len = ext4_ext_get_actual_len(ex); 2757 } 2758 2759 if (correct_index && eh->eh_entries) 2760 err = ext4_ext_correct_indexes(handle, inode, path); 2761 2762 /* 2763 * If there's a partial cluster and at least one extent remains in 2764 * the leaf, free the partial cluster if it isn't shared with the 2765 * current extent. If it is shared with the current extent 2766 * we zero partial_cluster because we've reached the start of the 2767 * truncated/punched region and we're done removing blocks. 
2768 */ 2769 if (*partial_cluster > 0 && ex >= EXT_FIRST_EXTENT(eh)) { 2770 pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; 2771 if (*partial_cluster != (long long) EXT4_B2C(sbi, pblk)) { 2772 ext4_free_blocks(handle, inode, NULL, 2773 EXT4_C2B(sbi, *partial_cluster), 2774 sbi->s_cluster_ratio, 2775 get_default_free_blocks_flags(inode)); 2776 } 2777 *partial_cluster = 0; 2778 } 2779 2780 /* if this leaf is free, then we should 2781 * remove it from index block above */ 2782 if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) 2783 err = ext4_ext_rm_idx(handle, inode, path, depth); 2784 2785 out: 2786 return err; 2787 } 2788 2789 /* 2790 * ext4_ext_more_to_rm: 2791 * returns 1 if current index has to be freed (even partial) 2792 */ 2793 static int 2794 ext4_ext_more_to_rm(struct ext4_ext_path *path) 2795 { 2796 BUG_ON(path->p_idx == NULL); 2797 2798 if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) 2799 return 0; 2800 2801 /* 2802 * if truncate on deeper level happened, it wasn't partial, 2803 * so we have to consider current index for truncation 2804 */ 2805 if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block) 2806 return 0; 2807 return 1; 2808 } 2809 2810 int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, 2811 ext4_lblk_t end) 2812 { 2813 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2814 int depth = ext_depth(inode); 2815 struct ext4_ext_path *path = NULL; 2816 long long partial_cluster = 0; 2817 handle_t *handle; 2818 int i = 0, err = 0; 2819 2820 ext_debug("truncate since %u to %u\n", start, end); 2821 2822 /* probably first extent we're gonna free will be last in block */ 2823 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1); 2824 if (IS_ERR(handle)) 2825 return PTR_ERR(handle); 2826 2827 again: 2828 trace_ext4_ext_remove_space(inode, start, end, depth); 2829 2830 /* 2831 * Check if we are removing extents inside the extent tree. 
If that 2832 * is the case, we are going to punch a hole inside the extent tree 2833 * so we have to check whether we need to split the extent covering 2834 * the last block to remove so we can easily remove the part of it 2835 * in ext4_ext_rm_leaf(). 2836 */ 2837 if (end < EXT_MAX_BLOCKS - 1) { 2838 struct ext4_extent *ex; 2839 ext4_lblk_t ee_block, ex_end, lblk; 2840 ext4_fsblk_t pblk; 2841 2842 /* find extent for or closest extent to this block */ 2843 path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE); 2844 if (IS_ERR(path)) { 2845 ext4_journal_stop(handle); 2846 return PTR_ERR(path); 2847 } 2848 depth = ext_depth(inode); 2849 /* Leaf not may not exist only if inode has no blocks at all */ 2850 ex = path[depth].p_ext; 2851 if (!ex) { 2852 if (depth) { 2853 EXT4_ERROR_INODE(inode, 2854 "path[%d].p_hdr == NULL", 2855 depth); 2856 err = -EFSCORRUPTED; 2857 } 2858 goto out; 2859 } 2860 2861 ee_block = le32_to_cpu(ex->ee_block); 2862 ex_end = ee_block + ext4_ext_get_actual_len(ex) - 1; 2863 2864 /* 2865 * See if the last block is inside the extent, if so split 2866 * the extent at 'end' block so we can easily remove the 2867 * tail of the first part of the split extent in 2868 * ext4_ext_rm_leaf(). 2869 */ 2870 if (end >= ee_block && end < ex_end) { 2871 2872 /* 2873 * If we're going to split the extent, note that 2874 * the cluster containing the block after 'end' is 2875 * in use to avoid freeing it when removing blocks. 2876 */ 2877 if (sbi->s_cluster_ratio > 1) { 2878 pblk = ext4_ext_pblock(ex) + end - ee_block + 2; 2879 partial_cluster = 2880 -(long long) EXT4_B2C(sbi, pblk); 2881 } 2882 2883 /* 2884 * Split the extent in two so that 'end' is the last 2885 * block in the first new extent. Also we should not 2886 * fail removing space due to ENOSPC so try to use 2887 * reserved block if that happens. 
2888 */ 2889 err = ext4_force_split_extent_at(handle, inode, &path, 2890 end + 1, 1); 2891 if (err < 0) 2892 goto out; 2893 2894 } else if (sbi->s_cluster_ratio > 1 && end >= ex_end) { 2895 /* 2896 * If there's an extent to the right its first cluster 2897 * contains the immediate right boundary of the 2898 * truncated/punched region. Set partial_cluster to 2899 * its negative value so it won't be freed if shared 2900 * with the current extent. The end < ee_block case 2901 * is handled in ext4_ext_rm_leaf(). 2902 */ 2903 lblk = ex_end + 1; 2904 err = ext4_ext_search_right(inode, path, &lblk, &pblk, 2905 &ex); 2906 if (err) 2907 goto out; 2908 if (pblk) 2909 partial_cluster = 2910 -(long long) EXT4_B2C(sbi, pblk); 2911 } 2912 } 2913 /* 2914 * We start scanning from right side, freeing all the blocks 2915 * after i_size and walking into the tree depth-wise. 2916 */ 2917 depth = ext_depth(inode); 2918 if (path) { 2919 int k = i = depth; 2920 while (--k > 0) 2921 path[k].p_block = 2922 le16_to_cpu(path[k].p_hdr->eh_entries)+1; 2923 } else { 2924 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), 2925 GFP_NOFS); 2926 if (path == NULL) { 2927 ext4_journal_stop(handle); 2928 return -ENOMEM; 2929 } 2930 path[0].p_maxdepth = path[0].p_depth = depth; 2931 path[0].p_hdr = ext_inode_hdr(inode); 2932 i = 0; 2933 2934 if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) { 2935 err = -EFSCORRUPTED; 2936 goto out; 2937 } 2938 } 2939 err = 0; 2940 2941 while (i >= 0 && err == 0) { 2942 if (i == depth) { 2943 /* this is leaf block */ 2944 err = ext4_ext_rm_leaf(handle, inode, path, 2945 &partial_cluster, start, 2946 end); 2947 /* root level has p_bh == NULL, brelse() eats this */ 2948 brelse(path[i].p_bh); 2949 path[i].p_bh = NULL; 2950 i--; 2951 continue; 2952 } 2953 2954 /* this is index block */ 2955 if (!path[i].p_hdr) { 2956 ext_debug("initialize header\n"); 2957 path[i].p_hdr = ext_block_hdr(path[i].p_bh); 2958 } 2959 2960 if (!path[i].p_idx) { 2961 /* this level hasn't 
been touched yet */ 2962 path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr); 2963 path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1; 2964 ext_debug("init index ptr: hdr 0x%p, num %d\n", 2965 path[i].p_hdr, 2966 le16_to_cpu(path[i].p_hdr->eh_entries)); 2967 } else { 2968 /* we were already here, see at next index */ 2969 path[i].p_idx--; 2970 } 2971 2972 ext_debug("level %d - index, first 0x%p, cur 0x%p\n", 2973 i, EXT_FIRST_INDEX(path[i].p_hdr), 2974 path[i].p_idx); 2975 if (ext4_ext_more_to_rm(path + i)) { 2976 struct buffer_head *bh; 2977 /* go to the next level */ 2978 ext_debug("move to level %d (block %llu)\n", 2979 i + 1, ext4_idx_pblock(path[i].p_idx)); 2980 memset(path + i + 1, 0, sizeof(*path)); 2981 bh = read_extent_tree_block(inode, 2982 ext4_idx_pblock(path[i].p_idx), depth - i - 1, 2983 EXT4_EX_NOCACHE); 2984 if (IS_ERR(bh)) { 2985 /* should we reset i_size? */ 2986 err = PTR_ERR(bh); 2987 break; 2988 } 2989 /* Yield here to deal with large extent trees. 2990 * Should be a no-op if we did IO above. 
*/ 2991 cond_resched(); 2992 if (WARN_ON(i + 1 > depth)) { 2993 err = -EFSCORRUPTED; 2994 break; 2995 } 2996 path[i + 1].p_bh = bh; 2997 2998 /* save actual number of indexes since this 2999 * number is changed at the next iteration */ 3000 path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries); 3001 i++; 3002 } else { 3003 /* we finished processing this index, go up */ 3004 if (path[i].p_hdr->eh_entries == 0 && i > 0) { 3005 /* index is empty, remove it; 3006 * handle must be already prepared by the 3007 * truncatei_leaf() */ 3008 err = ext4_ext_rm_idx(handle, inode, path, i); 3009 } 3010 /* root level has p_bh == NULL, brelse() eats this */ 3011 brelse(path[i].p_bh); 3012 path[i].p_bh = NULL; 3013 i--; 3014 ext_debug("return to level %d\n", i); 3015 } 3016 } 3017 3018 trace_ext4_ext_remove_space_done(inode, start, end, depth, 3019 partial_cluster, path->p_hdr->eh_entries); 3020 3021 /* 3022 * If we still have something in the partial cluster and we have removed 3023 * even the first extent, then we should free the blocks in the partial 3024 * cluster as well. (This code will only run when there are no leaves 3025 * to the immediate left of the truncated/punched region.) 
3026 */ 3027 if (partial_cluster > 0 && err == 0) { 3028 /* don't zero partial_cluster since it's not used afterwards */ 3029 ext4_free_blocks(handle, inode, NULL, 3030 EXT4_C2B(sbi, partial_cluster), 3031 sbi->s_cluster_ratio, 3032 get_default_free_blocks_flags(inode)); 3033 } 3034 3035 /* TODO: flexible tree reduction should be here */ 3036 if (path->p_hdr->eh_entries == 0) { 3037 /* 3038 * truncate to zero freed all the tree, 3039 * so we need to correct eh_depth 3040 */ 3041 err = ext4_ext_get_access(handle, inode, path); 3042 if (err == 0) { 3043 ext_inode_hdr(inode)->eh_depth = 0; 3044 ext_inode_hdr(inode)->eh_max = 3045 cpu_to_le16(ext4_ext_space_root(inode, 0)); 3046 err = ext4_ext_dirty(handle, inode, path); 3047 } 3048 } 3049 out: 3050 ext4_ext_drop_refs(path); 3051 kfree(path); 3052 path = NULL; 3053 if (err == -EAGAIN) 3054 goto again; 3055 ext4_journal_stop(handle); 3056 3057 return err; 3058 } 3059 3060 /* 3061 * called at mount time 3062 */ 3063 void ext4_ext_init(struct super_block *sb) 3064 { 3065 /* 3066 * possible initialization would be here 3067 */ 3068 3069 if (ext4_has_feature_extents(sb)) { 3070 #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) 3071 printk(KERN_INFO "EXT4-fs: file extents enabled" 3072 #ifdef AGGRESSIVE_TEST 3073 ", aggressive tests" 3074 #endif 3075 #ifdef CHECK_BINSEARCH 3076 ", check binsearch" 3077 #endif 3078 #ifdef EXTENTS_STATS 3079 ", stats" 3080 #endif 3081 "\n"); 3082 #endif 3083 #ifdef EXTENTS_STATS 3084 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); 3085 EXT4_SB(sb)->s_ext_min = 1 << 30; 3086 EXT4_SB(sb)->s_ext_max = 0; 3087 #endif 3088 } 3089 } 3090 3091 /* 3092 * called at umount time 3093 */ 3094 void ext4_ext_release(struct super_block *sb) 3095 { 3096 if (!ext4_has_feature_extents(sb)) 3097 return; 3098 3099 #ifdef EXTENTS_STATS 3100 if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) { 3101 struct ext4_sb_info *sbi = EXT4_SB(sb); 3102 printk(KERN_ERR "EXT4-fs: %lu 
blocks in %lu extents (%lu ave)\n", 3103 sbi->s_ext_blocks, sbi->s_ext_extents, 3104 sbi->s_ext_blocks / sbi->s_ext_extents); 3105 printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n", 3106 sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max); 3107 } 3108 #endif 3109 } 3110 3111 static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) 3112 { 3113 ext4_lblk_t ee_block; 3114 ext4_fsblk_t ee_pblock; 3115 unsigned int ee_len; 3116 3117 ee_block = le32_to_cpu(ex->ee_block); 3118 ee_len = ext4_ext_get_actual_len(ex); 3119 ee_pblock = ext4_ext_pblock(ex); 3120 3121 if (ee_len == 0) 3122 return 0; 3123 3124 return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, 3125 EXTENT_STATUS_WRITTEN); 3126 } 3127 3128 /* FIXME!! we need to try to merge to left or right after zero-out */ 3129 static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) 3130 { 3131 ext4_fsblk_t ee_pblock; 3132 unsigned int ee_len; 3133 3134 ee_len = ext4_ext_get_actual_len(ex); 3135 ee_pblock = ext4_ext_pblock(ex); 3136 return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block), ee_pblock, 3137 ee_len); 3138 } 3139 3140 /* 3141 * ext4_split_extent_at() splits an extent at given block. 3142 * 3143 * @handle: the journal handle 3144 * @inode: the file inode 3145 * @path: the path to the extent 3146 * @split: the logical block where the extent is splitted. 3147 * @split_flags: indicates if the extent could be zeroout if split fails, and 3148 * the states(init or unwritten) of new extents. 3149 * @flags: flags used to insert new extent to extent tree. 3150 * 3151 * 3152 * Splits extent [a, b] into two extents [a, @split) and [@split, b], states 3153 * of which are deterimined by split_flag. 3154 * 3155 * There are two cases: 3156 * a> the extent are splitted into two extent. 3157 * b> split is not needed, and just mark the extent. 3158 * 3159 * return 0 on success. 
 */
static int ext4_split_extent_at(handle_t *handle,
			     struct inode *inode,
			     struct ext4_ext_path **ppath,
			     ext4_lblk_t split,
			     int split_flag,
			     int flags)
{
	struct ext4_ext_path *path = *ppath;
	ext4_fsblk_t newblock;
	ext4_lblk_t ee_block;
	struct ext4_extent *ex, newex, orig_ex, zero_ex;
	struct ext4_extent *ex2 = NULL;
	unsigned int ee_len, depth;
	int err = 0;

	/* at most one half may be declared to hold valid data */
	BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) ==
	       (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2));

	ext_debug("ext4_split_extents_at: inode %lu, logical"
		"block %llu\n", inode->i_ino, (unsigned long long)split);

	ext4_ext_show_leaf(inode, path);

	depth = ext_depth(inode);
	ex = path[depth].p_ext;
	ee_block = le32_to_cpu(ex->ee_block);
	ee_len = ext4_ext_get_actual_len(ex);
	/* physical block that corresponds to logical block @split */
	newblock = split - ee_block + ext4_ext_pblock(ex);

	/* @split must lie inside the extent */
	BUG_ON(split < ee_block || split >= (ee_block + ee_len));
	/* zeroout/unwritten-marking flags only make sense on unwritten extents */
	BUG_ON(!ext4_ext_is_unwritten(ex) &&
	       split_flag & (EXT4_EXT_MAY_ZEROOUT |
			     EXT4_EXT_MARK_UNWRIT1 |
			     EXT4_EXT_MARK_UNWRIT2));

	err = ext4_ext_get_access(handle, inode, path + depth);
	if (err)
		goto out;

	if (split == ee_block) {
		/*
		 * case b: block @split is the block that the extent begins with
		 * then we just change the state of the extent, and splitting
		 * is not needed.
		 */
		if (split_flag & EXT4_EXT_MARK_UNWRIT2)
			ext4_ext_mark_unwritten(ex);
		else
			ext4_ext_mark_initialized(ex);

		if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
			ext4_ext_try_to_merge(handle, inode, path, ex);

		err = ext4_ext_dirty(handle, inode, path + path->p_depth);
		goto out;
	}

	/* case a */
	/* keep a copy so the original length can be restored on failure */
	memcpy(&orig_ex, ex, sizeof(orig_ex));
	/* shrink the existing extent to the first half [ee_block, split) */
	ex->ee_len = cpu_to_le16(split - ee_block);
	if (split_flag & EXT4_EXT_MARK_UNWRIT1)
		ext4_ext_mark_unwritten(ex);

	/*
	 * path may lead to new leaf, not to original leaf any more
	 * after ext4_ext_insert_extent() returns,
	 */
	err = ext4_ext_dirty(handle, inode, path + depth);
	if (err)
		goto fix_extent_len;

	/* build the second half [split, ee_block + ee_len) in newex */
	ex2 = &newex;
	ex2->ee_block = cpu_to_le32(split);
	ex2->ee_len   = cpu_to_le16(ee_len - (split - ee_block));
	ext4_ext_store_pblock(ex2, newblock);
	if (split_flag & EXT4_EXT_MARK_UNWRIT2)
		ext4_ext_mark_unwritten(ex2);

	/*
	 * NOTE(review): ext4_ext_insert_extent() is given ppath, and per the
	 * comment above the path may no longer lead to the original leaf
	 * afterwards. The code below keeps using the local 'path' and 'ex'
	 * obtained before the call - confirm they are still valid on the
	 * zeroout and fix_extent_len paths.
	 */
	err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags);
	if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
		/*
		 * No room to insert the second half: zero out blocks instead
		 * of splitting. Which blocks get zeroed depends on which
		 * half (if any) was declared to hold valid data.
		 */
		if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
			if (split_flag & EXT4_EXT_DATA_VALID1) {
				/* first half is valid: zero the second half */
				err = ext4_ext_zeroout(inode, ex2);
				zero_ex.ee_block = ex2->ee_block;
				zero_ex.ee_len = cpu_to_le16(
						ext4_ext_get_actual_len(ex2));
				ext4_ext_store_pblock(&zero_ex,
						      ext4_ext_pblock(ex2));
			} else {
				/* second half is valid: zero the first half */
				err = ext4_ext_zeroout(inode, ex);
				zero_ex.ee_block = ex->ee_block;
				zero_ex.ee_len = cpu_to_le16(
						ext4_ext_get_actual_len(ex));
				ext4_ext_store_pblock(&zero_ex,
						      ext4_ext_pblock(ex));
			}
		} else {
			/* neither half is marked valid: zero the whole extent */
			err = ext4_ext_zeroout(inode, &orig_ex);
			zero_ex.ee_block = orig_ex.ee_block;
			zero_ex.ee_len = cpu_to_le16(
						ext4_ext_get_actual_len(&orig_ex));
			ext4_ext_store_pblock(&zero_ex,
					      ext4_ext_pblock(&orig_ex));
		}

		if (err)
			goto fix_extent_len;
		/* update the extent length and mark as initialized */
		ex->ee_len = cpu_to_le16(ee_len);
		ext4_ext_try_to_merge(handle, inode, path, ex);
		err = ext4_ext_dirty(handle, inode, path + path->p_depth);
		if (err)
			goto fix_extent_len;

		/* update extent status tree */
		err = ext4_zeroout_es(inode, &zero_ex);

		goto out;
	} else if (err)
		goto fix_extent_len;

out:
	ext4_ext_show_leaf(inode, path);
	return err;

fix_extent_len:
	/* undo the shrink of the first half; the original error is returned */
	ex->ee_len = orig_ex.ee_len;
	ext4_ext_dirty(handle, inode, path + path->p_depth);
	return err;
}

/*
 * ext4_split_extents() splits an extent and mark extent which is covered
 * by @map as split_flags indicates
 *
 * It may result in splitting the extent into multiple extents (up to three)
 * There are three possibilities:
 *   a> There is no split required
 *   b> Splits in two extents: Split is happening at either end of the extent
 *   c> Splits in three extents: Someone is splitting in middle of the extent
 *
 * Returns a negative errno on failure. On success it returns the number
 * of blocks from map->m_lblk that are covered: map->m_len when a split at
 * the end of @map was needed, otherwise the distance from map->m_lblk to
 * the end of the original extent.
 */
static int ext4_split_extent(handle_t *handle,
			      struct inode *inode,
			      struct ext4_ext_path **ppath,
			      struct ext4_map_blocks *map,
			      int split_flag,
			      int flags)
{
	struct ext4_ext_path *path = *ppath;
	ext4_lblk_t ee_block;
	struct ext4_extent *ex;
	unsigned int ee_len, depth;
	int err = 0;
	int unwritten;
	int split_flag1, flags1;
	int allocated = map->m_len;

	depth = ext_depth(inode);
	ex = path[depth].p_ext;
	ee_block = le32_to_cpu(ex->ee_block);
	ee_len = ext4_ext_get_actual_len(ex);
	unwritten = ext4_ext_is_unwritten(ex);

	/* first cut: at the end of @map, if @map ends before the extent does */
	if (map->m_lblk + map->m_len < ee_block + ee_len) {
		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
		flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
		if (unwritten)
			split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
				       EXT4_EXT_MARK_UNWRIT2;
		/* the caller's "second half" is the first half of this cut */
		if (split_flag & EXT4_EXT_DATA_VALID2)
			split_flag1 |= EXT4_EXT_DATA_VALID1;
		err = ext4_split_extent_at(handle, inode, ppath,
				map->m_lblk + map->m_len, split_flag1, flags1);
		if (err)
			goto out;
	} else {
		allocated = ee_len - (map->m_lblk - ee_block);
	}
	/*
	 * Update path is required because previous ext4_split_extent_at() may
	 * result in split of original leaf or extent zeroout.
	 */
	path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
	if (IS_ERR(path))
		return PTR_ERR(path);
	depth = ext_depth(inode);
	ex = path[depth].p_ext;
	if (!ex) {
		EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
				 (unsigned long) map->m_lblk);
		return -EFSCORRUPTED;
	}
	unwritten = ext4_ext_is_unwritten(ex);
	split_flag1 = 0;

	/*
	 * second cut: at the start of @map.
	 * NOTE(review): ee_block was read before the path was looked up
	 * again above - confirm it is still the intended bound here.
	 */
	if (map->m_lblk >= ee_block) {
		split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
		if (unwritten) {
			split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
			split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
						     EXT4_EXT_MARK_UNWRIT2);
		}
		err = ext4_split_extent_at(handle, inode, ppath,
				map->m_lblk, split_flag1, flags);
		if (err)
			goto out;
	}

	ext4_ext_show_leaf(inode, path);
out:
	return err ? err : allocated;
}

/*
 * This function is called by ext4_ext_map_blocks() if someone tries to write
 * to an unwritten extent. It may result in splitting the unwritten
 * extent into multiple extents (up to three - one initialized and two
 * unwritten).
 * There are three possibilities:
 *   a> There is no split required: Entire extent should be initialized
 *   b> Splits in two extents: Write is happening at either end of the extent
 *   c> Splits in three extents: Someone is writing in middle of the extent
 *
 * Pre-conditions:
 *  - The extent pointed to by 'path' is unwritten.
3387 * - The extent pointed to by 'path' contains a superset 3388 * of the logical span [map->m_lblk, map->m_lblk + map->m_len). 3389 * 3390 * Post-conditions on success: 3391 * - the returned value is the number of blocks beyond map->l_lblk 3392 * that are allocated and initialized. 3393 * It is guaranteed to be >= map->m_len. 3394 */ 3395 static int ext4_ext_convert_to_initialized(handle_t *handle, 3396 struct inode *inode, 3397 struct ext4_map_blocks *map, 3398 struct ext4_ext_path **ppath, 3399 int flags) 3400 { 3401 struct ext4_ext_path *path = *ppath; 3402 struct ext4_sb_info *sbi; 3403 struct ext4_extent_header *eh; 3404 struct ext4_map_blocks split_map; 3405 struct ext4_extent zero_ex1, zero_ex2; 3406 struct ext4_extent *ex, *abut_ex; 3407 ext4_lblk_t ee_block, eof_block; 3408 unsigned int ee_len, depth, map_len = map->m_len; 3409 int allocated = 0, max_zeroout = 0; 3410 int err = 0; 3411 int split_flag = EXT4_EXT_DATA_VALID2; 3412 3413 ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" 3414 "block %llu, max_blocks %u\n", inode->i_ino, 3415 (unsigned long long)map->m_lblk, map_len); 3416 3417 sbi = EXT4_SB(inode->i_sb); 3418 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> 3419 inode->i_sb->s_blocksize_bits; 3420 if (eof_block < map->m_lblk + map_len) 3421 eof_block = map->m_lblk + map_len; 3422 3423 depth = ext_depth(inode); 3424 eh = path[depth].p_hdr; 3425 ex = path[depth].p_ext; 3426 ee_block = le32_to_cpu(ex->ee_block); 3427 ee_len = ext4_ext_get_actual_len(ex); 3428 zero_ex1.ee_len = 0; 3429 zero_ex2.ee_len = 0; 3430 3431 trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); 3432 3433 /* Pre-conditions */ 3434 BUG_ON(!ext4_ext_is_unwritten(ex)); 3435 BUG_ON(!in_range(map->m_lblk, ee_block, ee_len)); 3436 3437 /* 3438 * Attempt to transfer newly initialized blocks from the currently 3439 * unwritten extent to its neighbor. 
This is much cheaper 3440 * than an insertion followed by a merge as those involve costly 3441 * memmove() calls. Transferring to the left is the common case in 3442 * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE) 3443 * followed by append writes. 3444 * 3445 * Limitations of the current logic: 3446 * - L1: we do not deal with writes covering the whole extent. 3447 * This would require removing the extent if the transfer 3448 * is possible. 3449 * - L2: we only attempt to merge with an extent stored in the 3450 * same extent tree node. 3451 */ 3452 if ((map->m_lblk == ee_block) && 3453 /* See if we can merge left */ 3454 (map_len < ee_len) && /*L1*/ 3455 (ex > EXT_FIRST_EXTENT(eh))) { /*L2*/ 3456 ext4_lblk_t prev_lblk; 3457 ext4_fsblk_t prev_pblk, ee_pblk; 3458 unsigned int prev_len; 3459 3460 abut_ex = ex - 1; 3461 prev_lblk = le32_to_cpu(abut_ex->ee_block); 3462 prev_len = ext4_ext_get_actual_len(abut_ex); 3463 prev_pblk = ext4_ext_pblock(abut_ex); 3464 ee_pblk = ext4_ext_pblock(ex); 3465 3466 /* 3467 * A transfer of blocks from 'ex' to 'abut_ex' is allowed 3468 * upon those conditions: 3469 * - C1: abut_ex is initialized, 3470 * - C2: abut_ex is logically abutting ex, 3471 * - C3: abut_ex is physically abutting ex, 3472 * - C4: abut_ex can receive the additional blocks without 3473 * overflowing the (initialized) length limit. 
3474 */ 3475 if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/ 3476 ((prev_lblk + prev_len) == ee_block) && /*C2*/ 3477 ((prev_pblk + prev_len) == ee_pblk) && /*C3*/ 3478 (prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ 3479 err = ext4_ext_get_access(handle, inode, path + depth); 3480 if (err) 3481 goto out; 3482 3483 trace_ext4_ext_convert_to_initialized_fastpath(inode, 3484 map, ex, abut_ex); 3485 3486 /* Shift the start of ex by 'map_len' blocks */ 3487 ex->ee_block = cpu_to_le32(ee_block + map_len); 3488 ext4_ext_store_pblock(ex, ee_pblk + map_len); 3489 ex->ee_len = cpu_to_le16(ee_len - map_len); 3490 ext4_ext_mark_unwritten(ex); /* Restore the flag */ 3491 3492 /* Extend abut_ex by 'map_len' blocks */ 3493 abut_ex->ee_len = cpu_to_le16(prev_len + map_len); 3494 3495 /* Result: number of initialized blocks past m_lblk */ 3496 allocated = map_len; 3497 } 3498 } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) && 3499 (map_len < ee_len) && /*L1*/ 3500 ex < EXT_LAST_EXTENT(eh)) { /*L2*/ 3501 /* See if we can merge right */ 3502 ext4_lblk_t next_lblk; 3503 ext4_fsblk_t next_pblk, ee_pblk; 3504 unsigned int next_len; 3505 3506 abut_ex = ex + 1; 3507 next_lblk = le32_to_cpu(abut_ex->ee_block); 3508 next_len = ext4_ext_get_actual_len(abut_ex); 3509 next_pblk = ext4_ext_pblock(abut_ex); 3510 ee_pblk = ext4_ext_pblock(ex); 3511 3512 /* 3513 * A transfer of blocks from 'ex' to 'abut_ex' is allowed 3514 * upon those conditions: 3515 * - C1: abut_ex is initialized, 3516 * - C2: abut_ex is logically abutting ex, 3517 * - C3: abut_ex is physically abutting ex, 3518 * - C4: abut_ex can receive the additional blocks without 3519 * overflowing the (initialized) length limit. 
3520 */ 3521 if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/ 3522 ((map->m_lblk + map_len) == next_lblk) && /*C2*/ 3523 ((ee_pblk + ee_len) == next_pblk) && /*C3*/ 3524 (next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ 3525 err = ext4_ext_get_access(handle, inode, path + depth); 3526 if (err) 3527 goto out; 3528 3529 trace_ext4_ext_convert_to_initialized_fastpath(inode, 3530 map, ex, abut_ex); 3531 3532 /* Shift the start of abut_ex by 'map_len' blocks */ 3533 abut_ex->ee_block = cpu_to_le32(next_lblk - map_len); 3534 ext4_ext_store_pblock(abut_ex, next_pblk - map_len); 3535 ex->ee_len = cpu_to_le16(ee_len - map_len); 3536 ext4_ext_mark_unwritten(ex); /* Restore the flag */ 3537 3538 /* Extend abut_ex by 'map_len' blocks */ 3539 abut_ex->ee_len = cpu_to_le16(next_len + map_len); 3540 3541 /* Result: number of initialized blocks past m_lblk */ 3542 allocated = map_len; 3543 } 3544 } 3545 if (allocated) { 3546 /* Mark the block containing both extents as dirty */ 3547 ext4_ext_dirty(handle, inode, path + depth); 3548 3549 /* Update path to point to the right extent */ 3550 path[depth].p_ext = abut_ex; 3551 goto out; 3552 } else 3553 allocated = ee_len - (map->m_lblk - ee_block); 3554 3555 WARN_ON(map->m_lblk < ee_block); 3556 /* 3557 * It is safe to convert extent to initialized via explicit 3558 * zeroout only if extent is fully inside i_size or new_size. 3559 */ 3560 split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; 3561 3562 if (EXT4_EXT_MAY_ZEROOUT & split_flag) 3563 max_zeroout = sbi->s_extent_max_zeroout_kb >> 3564 (inode->i_sb->s_blocksize_bits - 10); 3565 3566 if (ext4_encrypted_inode(inode)) 3567 max_zeroout = 0; 3568 3569 /* 3570 * five cases: 3571 * 1. split the extent into three extents. 3572 * 2. split the extent into two extents, zeroout the head of the first 3573 * extent. 3574 * 3. split the extent into two extents, zeroout the tail of the second 3575 * extent. 3576 * 4. split the extent into two extents with out zeroout. 
3577 * 5. no splitting needed, just possibly zeroout the head and / or the 3578 * tail of the extent. 3579 */ 3580 split_map.m_lblk = map->m_lblk; 3581 split_map.m_len = map->m_len; 3582 3583 if (max_zeroout && (allocated > split_map.m_len)) { 3584 if (allocated <= max_zeroout) { 3585 /* case 3 or 5 */ 3586 zero_ex1.ee_block = 3587 cpu_to_le32(split_map.m_lblk + 3588 split_map.m_len); 3589 zero_ex1.ee_len = 3590 cpu_to_le16(allocated - split_map.m_len); 3591 ext4_ext_store_pblock(&zero_ex1, 3592 ext4_ext_pblock(ex) + split_map.m_lblk + 3593 split_map.m_len - ee_block); 3594 err = ext4_ext_zeroout(inode, &zero_ex1); 3595 if (err) 3596 goto out; 3597 split_map.m_len = allocated; 3598 } 3599 if (split_map.m_lblk - ee_block + split_map.m_len < 3600 max_zeroout) { 3601 /* case 2 or 5 */ 3602 if (split_map.m_lblk != ee_block) { 3603 zero_ex2.ee_block = ex->ee_block; 3604 zero_ex2.ee_len = cpu_to_le16(split_map.m_lblk - 3605 ee_block); 3606 ext4_ext_store_pblock(&zero_ex2, 3607 ext4_ext_pblock(ex)); 3608 err = ext4_ext_zeroout(inode, &zero_ex2); 3609 if (err) 3610 goto out; 3611 } 3612 3613 split_map.m_len += split_map.m_lblk - ee_block; 3614 split_map.m_lblk = ee_block; 3615 allocated = map->m_len; 3616 } 3617 } 3618 3619 err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag, 3620 flags); 3621 if (err > 0) 3622 err = 0; 3623 out: 3624 /* If we have gotten a failure, don't zero out status tree */ 3625 if (!err) { 3626 err = ext4_zeroout_es(inode, &zero_ex1); 3627 if (!err) 3628 err = ext4_zeroout_es(inode, &zero_ex2); 3629 } 3630 return err ? err : allocated; 3631 } 3632 3633 /* 3634 * This function is called by ext4_ext_map_blocks() from 3635 * ext4_get_blocks_dio_write() when DIO to write 3636 * to an unwritten extent. 
3637 * 3638 * Writing to an unwritten extent may result in splitting the unwritten 3639 * extent into multiple initialized/unwritten extents (up to three) 3640 * There are three possibilities: 3641 * a> There is no split required: Entire extent should be unwritten 3642 * b> Splits in two extents: Write is happening at either end of the extent 3643 * c> Splits in three extents: Somone is writing in middle of the extent 3644 * 3645 * This works the same way in the case of initialized -> unwritten conversion. 3646 * 3647 * One of more index blocks maybe needed if the extent tree grow after 3648 * the unwritten extent split. To prevent ENOSPC occur at the IO 3649 * complete, we need to split the unwritten extent before DIO submit 3650 * the IO. The unwritten extent called at this time will be split 3651 * into three unwritten extent(at most). After IO complete, the part 3652 * being filled will be convert to initialized by the end_io callback function 3653 * via ext4_convert_unwritten_extents(). 3654 * 3655 * Returns the size of unwritten extent to be written on success. 3656 */ 3657 static int ext4_split_convert_extents(handle_t *handle, 3658 struct inode *inode, 3659 struct ext4_map_blocks *map, 3660 struct ext4_ext_path **ppath, 3661 int flags) 3662 { 3663 struct ext4_ext_path *path = *ppath; 3664 ext4_lblk_t eof_block; 3665 ext4_lblk_t ee_block; 3666 struct ext4_extent *ex; 3667 unsigned int ee_len; 3668 int split_flag = 0, depth; 3669 3670 ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n", 3671 __func__, inode->i_ino, 3672 (unsigned long long)map->m_lblk, map->m_len); 3673 3674 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> 3675 inode->i_sb->s_blocksize_bits; 3676 if (eof_block < map->m_lblk + map->m_len) 3677 eof_block = map->m_lblk + map->m_len; 3678 /* 3679 * It is safe to convert extent to initialized via explicit 3680 * zeroout only if extent is fully insde i_size or new_size. 
3681 */ 3682 depth = ext_depth(inode); 3683 ex = path[depth].p_ext; 3684 ee_block = le32_to_cpu(ex->ee_block); 3685 ee_len = ext4_ext_get_actual_len(ex); 3686 3687 /* Convert to unwritten */ 3688 if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) { 3689 split_flag |= EXT4_EXT_DATA_VALID1; 3690 /* Convert to initialized */ 3691 } else if (flags & EXT4_GET_BLOCKS_CONVERT) { 3692 split_flag |= ee_block + ee_len <= eof_block ? 3693 EXT4_EXT_MAY_ZEROOUT : 0; 3694 split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2); 3695 } 3696 flags |= EXT4_GET_BLOCKS_PRE_IO; 3697 return ext4_split_extent(handle, inode, ppath, map, split_flag, flags); 3698 } 3699 3700 static int ext4_convert_unwritten_extents_endio(handle_t *handle, 3701 struct inode *inode, 3702 struct ext4_map_blocks *map, 3703 struct ext4_ext_path **ppath) 3704 { 3705 struct ext4_ext_path *path = *ppath; 3706 struct ext4_extent *ex; 3707 ext4_lblk_t ee_block; 3708 unsigned int ee_len; 3709 int depth; 3710 int err = 0; 3711 3712 depth = ext_depth(inode); 3713 ex = path[depth].p_ext; 3714 ee_block = le32_to_cpu(ex->ee_block); 3715 ee_len = ext4_ext_get_actual_len(ex); 3716 3717 ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical" 3718 "block %llu, max_blocks %u\n", inode->i_ino, 3719 (unsigned long long)ee_block, ee_len); 3720 3721 /* If extent is larger than requested it is a clear sign that we still 3722 * have some extent state machine issues left. So extent_split is still 3723 * required. 3724 * TODO: Once all related issues will be fixed this situation should be 3725 * illegal. 
3726 */ 3727 if (ee_block != map->m_lblk || ee_len > map->m_len) { 3728 #ifdef EXT4_DEBUG 3729 ext4_warning("Inode (%ld) finished: extent logical block %llu," 3730 " len %u; IO logical block %llu, len %u", 3731 inode->i_ino, (unsigned long long)ee_block, ee_len, 3732 (unsigned long long)map->m_lblk, map->m_len); 3733 #endif 3734 err = ext4_split_convert_extents(handle, inode, map, ppath, 3735 EXT4_GET_BLOCKS_CONVERT); 3736 if (err < 0) 3737 return err; 3738 path = ext4_find_extent(inode, map->m_lblk, ppath, 0); 3739 if (IS_ERR(path)) 3740 return PTR_ERR(path); 3741 depth = ext_depth(inode); 3742 ex = path[depth].p_ext; 3743 } 3744 3745 err = ext4_ext_get_access(handle, inode, path + depth); 3746 if (err) 3747 goto out; 3748 /* first mark the extent as initialized */ 3749 ext4_ext_mark_initialized(ex); 3750 3751 /* note: ext4_ext_correct_indexes() isn't needed here because 3752 * borders are not changed 3753 */ 3754 ext4_ext_try_to_merge(handle, inode, path, ex); 3755 3756 /* Mark modified extent as dirty */ 3757 err = ext4_ext_dirty(handle, inode, path + path->p_depth); 3758 out: 3759 ext4_ext_show_leaf(inode, path); 3760 return err; 3761 } 3762 3763 /* 3764 * Handle EOFBLOCKS_FL flag, clearing it if necessary 3765 */ 3766 static int check_eofblocks_fl(handle_t *handle, struct inode *inode, 3767 ext4_lblk_t lblk, 3768 struct ext4_ext_path *path, 3769 unsigned int len) 3770 { 3771 int i, depth; 3772 struct ext4_extent_header *eh; 3773 struct ext4_extent *last_ex; 3774 3775 if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) 3776 return 0; 3777 3778 depth = ext_depth(inode); 3779 eh = path[depth].p_hdr; 3780 3781 /* 3782 * We're going to remove EOFBLOCKS_FL entirely in future so we 3783 * do not care for this case anymore. Simply remove the flag 3784 * if there are no extents. 
3785 */ 3786 if (unlikely(!eh->eh_entries)) 3787 goto out; 3788 last_ex = EXT_LAST_EXTENT(eh); 3789 /* 3790 * We should clear the EOFBLOCKS_FL flag if we are writing the 3791 * last block in the last extent in the file. We test this by 3792 * first checking to see if the caller to 3793 * ext4_ext_get_blocks() was interested in the last block (or 3794 * a block beyond the last block) in the current extent. If 3795 * this turns out to be false, we can bail out from this 3796 * function immediately. 3797 */ 3798 if (lblk + len < le32_to_cpu(last_ex->ee_block) + 3799 ext4_ext_get_actual_len(last_ex)) 3800 return 0; 3801 /* 3802 * If the caller does appear to be planning to write at or 3803 * beyond the end of the current extent, we then test to see 3804 * if the current extent is the last extent in the file, by 3805 * checking to make sure it was reached via the rightmost node 3806 * at each level of the tree. 3807 */ 3808 for (i = depth-1; i >= 0; i--) 3809 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr)) 3810 return 0; 3811 out: 3812 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); 3813 return ext4_mark_inode_dirty(handle, inode); 3814 } 3815 3816 /** 3817 * ext4_find_delalloc_range: find delayed allocated block in the given range. 3818 * 3819 * Return 1 if there is a delalloc block in the range, otherwise 0. 
3820 */ 3821 int ext4_find_delalloc_range(struct inode *inode, 3822 ext4_lblk_t lblk_start, 3823 ext4_lblk_t lblk_end) 3824 { 3825 struct extent_status es; 3826 3827 ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es); 3828 if (es.es_len == 0) 3829 return 0; /* there is no delay extent in this tree */ 3830 else if (es.es_lblk <= lblk_start && 3831 lblk_start < es.es_lblk + es.es_len) 3832 return 1; 3833 else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end) 3834 return 1; 3835 else 3836 return 0; 3837 } 3838 3839 int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk) 3840 { 3841 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 3842 ext4_lblk_t lblk_start, lblk_end; 3843 lblk_start = EXT4_LBLK_CMASK(sbi, lblk); 3844 lblk_end = lblk_start + sbi->s_cluster_ratio - 1; 3845 3846 return ext4_find_delalloc_range(inode, lblk_start, lblk_end); 3847 } 3848 3849 /** 3850 * Determines how many complete clusters (out of those specified by the 'map') 3851 * are under delalloc and were reserved quota for. 3852 * This function is called when we are writing out the blocks that were 3853 * originally written with their allocation delayed, but then the space was 3854 * allocated using fallocate() before the delayed allocation could be resolved. 3855 * The cases to look for are: 3856 * ('=' indicated delayed allocated blocks 3857 * '-' indicates non-delayed allocated blocks) 3858 * (a) partial clusters towards beginning and/or end outside of allocated range 3859 * are not delalloc'ed. 3860 * Ex: 3861 * |----c---=|====c====|====c====|===-c----| 3862 * |++++++ allocated ++++++| 3863 * ==> 4 complete clusters in above example 3864 * 3865 * (b) partial cluster (outside of allocated range) towards either end is 3866 * marked for delayed allocation. In this case, we will exclude that 3867 * cluster. 
3868 * Ex: 3869 * |----====c========|========c========| 3870 * |++++++ allocated ++++++| 3871 * ==> 1 complete clusters in above example 3872 * 3873 * Ex: 3874 * |================c================| 3875 * |++++++ allocated ++++++| 3876 * ==> 0 complete clusters in above example 3877 * 3878 * The ext4_da_update_reserve_space will be called only if we 3879 * determine here that there were some "entire" clusters that span 3880 * this 'allocated' range. 3881 * In the non-bigalloc case, this function will just end up returning num_blks 3882 * without ever calling ext4_find_delalloc_range. 3883 */ 3884 static unsigned int 3885 get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, 3886 unsigned int num_blks) 3887 { 3888 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 3889 ext4_lblk_t alloc_cluster_start, alloc_cluster_end; 3890 ext4_lblk_t lblk_from, lblk_to, c_offset; 3891 unsigned int allocated_clusters = 0; 3892 3893 alloc_cluster_start = EXT4_B2C(sbi, lblk_start); 3894 alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1); 3895 3896 /* max possible clusters for this allocation */ 3897 allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1; 3898 3899 trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks); 3900 3901 /* Check towards left side */ 3902 c_offset = EXT4_LBLK_COFF(sbi, lblk_start); 3903 if (c_offset) { 3904 lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start); 3905 lblk_to = lblk_from + c_offset - 1; 3906 3907 if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) 3908 allocated_clusters--; 3909 } 3910 3911 /* Now check towards right. 
*/ 3912 c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks); 3913 if (allocated_clusters && c_offset) { 3914 lblk_from = lblk_start + num_blks; 3915 lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; 3916 3917 if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) 3918 allocated_clusters--; 3919 } 3920 3921 return allocated_clusters; 3922 } 3923 3924 static int 3925 convert_initialized_extent(handle_t *handle, struct inode *inode, 3926 struct ext4_map_blocks *map, 3927 struct ext4_ext_path **ppath, 3928 unsigned int allocated) 3929 { 3930 struct ext4_ext_path *path = *ppath; 3931 struct ext4_extent *ex; 3932 ext4_lblk_t ee_block; 3933 unsigned int ee_len; 3934 int depth; 3935 int err = 0; 3936 3937 /* 3938 * Make sure that the extent is no bigger than we support with 3939 * unwritten extent 3940 */ 3941 if (map->m_len > EXT_UNWRITTEN_MAX_LEN) 3942 map->m_len = EXT_UNWRITTEN_MAX_LEN / 2; 3943 3944 depth = ext_depth(inode); 3945 ex = path[depth].p_ext; 3946 ee_block = le32_to_cpu(ex->ee_block); 3947 ee_len = ext4_ext_get_actual_len(ex); 3948 3949 ext_debug("%s: inode %lu, logical" 3950 "block %llu, max_blocks %u\n", __func__, inode->i_ino, 3951 (unsigned long long)ee_block, ee_len); 3952 3953 if (ee_block != map->m_lblk || ee_len > map->m_len) { 3954 err = ext4_split_convert_extents(handle, inode, map, ppath, 3955 EXT4_GET_BLOCKS_CONVERT_UNWRITTEN); 3956 if (err < 0) 3957 return err; 3958 path = ext4_find_extent(inode, map->m_lblk, ppath, 0); 3959 if (IS_ERR(path)) 3960 return PTR_ERR(path); 3961 depth = ext_depth(inode); 3962 ex = path[depth].p_ext; 3963 if (!ex) { 3964 EXT4_ERROR_INODE(inode, "unexpected hole at %lu", 3965 (unsigned long) map->m_lblk); 3966 return -EFSCORRUPTED; 3967 } 3968 } 3969 3970 err = ext4_ext_get_access(handle, inode, path + depth); 3971 if (err) 3972 return err; 3973 /* first mark the extent as unwritten */ 3974 ext4_ext_mark_unwritten(ex); 3975 3976 /* note: ext4_ext_correct_indexes() isn't needed here because 3977 * borders 
are not changed 3978 */ 3979 ext4_ext_try_to_merge(handle, inode, path, ex); 3980 3981 /* Mark modified extent as dirty */ 3982 err = ext4_ext_dirty(handle, inode, path + path->p_depth); 3983 if (err) 3984 return err; 3985 ext4_ext_show_leaf(inode, path); 3986 3987 ext4_update_inode_fsync_trans(handle, inode, 1); 3988 err = check_eofblocks_fl(handle, inode, map->m_lblk, path, map->m_len); 3989 if (err) 3990 return err; 3991 map->m_flags |= EXT4_MAP_UNWRITTEN; 3992 if (allocated > map->m_len) 3993 allocated = map->m_len; 3994 map->m_len = allocated; 3995 return allocated; 3996 } 3997 3998 static int 3999 ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, 4000 struct ext4_map_blocks *map, 4001 struct ext4_ext_path **ppath, int flags, 4002 unsigned int allocated, ext4_fsblk_t newblock) 4003 { 4004 struct ext4_ext_path *path = *ppath; 4005 int ret = 0; 4006 int err = 0; 4007 4008 ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical " 4009 "block %llu, max_blocks %u, flags %x, allocated %u\n", 4010 inode->i_ino, (unsigned long long)map->m_lblk, map->m_len, 4011 flags, allocated); 4012 ext4_ext_show_leaf(inode, path); 4013 4014 /* 4015 * When writing into unwritten space, we should not fail to 4016 * allocate metadata blocks for the new extent block if needed. 
4017 */ 4018 flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL; 4019 4020 trace_ext4_ext_handle_unwritten_extents(inode, map, flags, 4021 allocated, newblock); 4022 4023 /* get_block() before submit the IO, split the extent */ 4024 if (flags & EXT4_GET_BLOCKS_PRE_IO) { 4025 ret = ext4_split_convert_extents(handle, inode, map, ppath, 4026 flags | EXT4_GET_BLOCKS_CONVERT); 4027 if (ret <= 0) 4028 goto out; 4029 map->m_flags |= EXT4_MAP_UNWRITTEN; 4030 goto out; 4031 } 4032 /* IO end_io complete, convert the filled extent to written */ 4033 if (flags & EXT4_GET_BLOCKS_CONVERT) { 4034 if (flags & EXT4_GET_BLOCKS_ZERO) { 4035 if (allocated > map->m_len) 4036 allocated = map->m_len; 4037 err = ext4_issue_zeroout(inode, map->m_lblk, newblock, 4038 allocated); 4039 if (err < 0) 4040 goto out2; 4041 } 4042 ret = ext4_convert_unwritten_extents_endio(handle, inode, map, 4043 ppath); 4044 if (ret >= 0) { 4045 ext4_update_inode_fsync_trans(handle, inode, 1); 4046 err = check_eofblocks_fl(handle, inode, map->m_lblk, 4047 path, map->m_len); 4048 } else 4049 err = ret; 4050 map->m_flags |= EXT4_MAP_MAPPED; 4051 map->m_pblk = newblock; 4052 if (allocated > map->m_len) 4053 allocated = map->m_len; 4054 map->m_len = allocated; 4055 goto out2; 4056 } 4057 /* buffered IO case */ 4058 /* 4059 * repeat fallocate creation request 4060 * we already have an unwritten extent 4061 */ 4062 if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) { 4063 map->m_flags |= EXT4_MAP_UNWRITTEN; 4064 goto map_out; 4065 } 4066 4067 /* buffered READ or buffered write_begin() lookup */ 4068 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 4069 /* 4070 * We have blocks reserved already. We 4071 * return allocated blocks so that delalloc 4072 * won't do block reservation for us. But 4073 * the buffer head will be unmapped so that 4074 * a read from the block returns 0s. 
4075 */ 4076 map->m_flags |= EXT4_MAP_UNWRITTEN; 4077 goto out1; 4078 } 4079 4080 /* buffered write, writepage time, convert*/ 4081 ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags); 4082 if (ret >= 0) 4083 ext4_update_inode_fsync_trans(handle, inode, 1); 4084 out: 4085 if (ret <= 0) { 4086 err = ret; 4087 goto out2; 4088 } else 4089 allocated = ret; 4090 map->m_flags |= EXT4_MAP_NEW; 4091 /* 4092 * if we allocated more blocks than requested 4093 * we need to make sure we unmap the extra block 4094 * allocated. The actual needed block will get 4095 * unmapped later when we find the buffer_head marked 4096 * new. 4097 */ 4098 if (allocated > map->m_len) { 4099 clean_bdev_aliases(inode->i_sb->s_bdev, newblock + map->m_len, 4100 allocated - map->m_len); 4101 allocated = map->m_len; 4102 } 4103 map->m_len = allocated; 4104 4105 /* 4106 * If we have done fallocate with the offset that is already 4107 * delayed allocated, we would have block reservation 4108 * and quota reservation done in the delayed write path. 4109 * But fallocate would have already updated quota and block 4110 * count for this offset. So cancel these reservation 4111 */ 4112 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { 4113 unsigned int reserved_clusters; 4114 reserved_clusters = get_reserved_cluster_alloc(inode, 4115 map->m_lblk, map->m_len); 4116 if (reserved_clusters) 4117 ext4_da_update_reserve_space(inode, 4118 reserved_clusters, 4119 0); 4120 } 4121 4122 map_out: 4123 map->m_flags |= EXT4_MAP_MAPPED; 4124 if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) { 4125 err = check_eofblocks_fl(handle, inode, map->m_lblk, path, 4126 map->m_len); 4127 if (err < 0) 4128 goto out2; 4129 } 4130 out1: 4131 if (allocated > map->m_len) 4132 allocated = map->m_len; 4133 ext4_ext_show_leaf(inode, path); 4134 map->m_pblk = newblock; 4135 map->m_len = allocated; 4136 out2: 4137 return err ? 
err : allocated; 4138 } 4139 4140 /* 4141 * get_implied_cluster_alloc - check to see if the requested 4142 * allocation (in the map structure) overlaps with a cluster already 4143 * allocated in an extent. 4144 * @sb The filesystem superblock structure 4145 * @map The requested lblk->pblk mapping 4146 * @ex The extent structure which might contain an implied 4147 * cluster allocation 4148 * 4149 * This function is called by ext4_ext_map_blocks() after we failed to 4150 * find blocks that were already in the inode's extent tree. Hence, 4151 * we know that the beginning of the requested region cannot overlap 4152 * the extent from the inode's extent tree. There are three cases we 4153 * want to catch. The first is this case: 4154 * 4155 * |--- cluster # N--| 4156 * |--- extent ---| |---- requested region ---| 4157 * |==========| 4158 * 4159 * The second case that we need to test for is this one: 4160 * 4161 * |--------- cluster # N ----------------| 4162 * |--- requested region --| |------- extent ----| 4163 * |=======================| 4164 * 4165 * The third case is when the requested region lies between two extents 4166 * within the same cluster: 4167 * |------------- cluster # N-------------| 4168 * |----- ex -----| |---- ex_right ----| 4169 * |------ requested region ------| 4170 * |================| 4171 * 4172 * In each of the above cases, we need to set the map->m_pblk and 4173 * map->m_len so it corresponds to the return the extent labelled as 4174 * "|====|" from cluster #N, since it is already in use for data in 4175 * cluster EXT4_B2C(sbi, map->m_lblk). We will then return 1 to 4176 * signal to ext4_ext_map_blocks() that map->m_pblk should be treated 4177 * as a new "allocated" block region. Otherwise, we will return 0 and 4178 * ext4_ext_map_blocks() will then allocate one or more new clusters 4179 * by calling ext4_mb_new_blocks(). 
4180 */ 4181 static int get_implied_cluster_alloc(struct super_block *sb, 4182 struct ext4_map_blocks *map, 4183 struct ext4_extent *ex, 4184 struct ext4_ext_path *path) 4185 { 4186 struct ext4_sb_info *sbi = EXT4_SB(sb); 4187 ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); 4188 ext4_lblk_t ex_cluster_start, ex_cluster_end; 4189 ext4_lblk_t rr_cluster_start; 4190 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); 4191 ext4_fsblk_t ee_start = ext4_ext_pblock(ex); 4192 unsigned short ee_len = ext4_ext_get_actual_len(ex); 4193 4194 /* The extent passed in that we are trying to match */ 4195 ex_cluster_start = EXT4_B2C(sbi, ee_block); 4196 ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1); 4197 4198 /* The requested region passed into ext4_map_blocks() */ 4199 rr_cluster_start = EXT4_B2C(sbi, map->m_lblk); 4200 4201 if ((rr_cluster_start == ex_cluster_end) || 4202 (rr_cluster_start == ex_cluster_start)) { 4203 if (rr_cluster_start == ex_cluster_end) 4204 ee_start += ee_len - 1; 4205 map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset; 4206 map->m_len = min(map->m_len, 4207 (unsigned) sbi->s_cluster_ratio - c_offset); 4208 /* 4209 * Check for and handle this case: 4210 * 4211 * |--------- cluster # N-------------| 4212 * |------- extent ----| 4213 * |--- requested region ---| 4214 * |===========| 4215 */ 4216 4217 if (map->m_lblk < ee_block) 4218 map->m_len = min(map->m_len, ee_block - map->m_lblk); 4219 4220 /* 4221 * Check for the case where there is already another allocated 4222 * block to the right of 'ex' but before the end of the cluster. 
4223 * 4224 * |------------- cluster # N-------------| 4225 * |----- ex -----| |---- ex_right ----| 4226 * |------ requested region ------| 4227 * |================| 4228 */ 4229 if (map->m_lblk > ee_block) { 4230 ext4_lblk_t next = ext4_ext_next_allocated_block(path); 4231 map->m_len = min(map->m_len, next - map->m_lblk); 4232 } 4233 4234 trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1); 4235 return 1; 4236 } 4237 4238 trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0); 4239 return 0; 4240 } 4241 4242 4243 /* 4244 * Block allocation/map/preallocation routine for extents based files 4245 * 4246 * 4247 * Need to be called with 4248 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block 4249 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) 4250 * 4251 * return > 0, number of of blocks already mapped/allocated 4252 * if create == 0 and these are pre-allocated blocks 4253 * buffer head is unmapped 4254 * otherwise blocks are mapped 4255 * 4256 * return = 0, if plain look up failed (blocks have not been allocated) 4257 * buffer head is unmapped 4258 * 4259 * return < 0, error case. 
 */
int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
			struct ext4_map_blocks *map, int flags)
{
	struct ext4_ext_path *path = NULL;
	struct ext4_extent newex, *ex, *ex2;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	ext4_fsblk_t newblock = 0;
	int free_on_err = 0, err = 0, depth, ret;
	unsigned int allocated = 0, offset = 0;
	unsigned int allocated_clusters = 0;
	struct ext4_allocation_request ar;
	ext4_lblk_t cluster_offset;
	bool map_from_cluster = false;

	ext_debug("blocks %u/%u requested for inode %lu\n",
		  map->m_lblk, map->m_len, inode->i_ino);
	trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);

	/* find extent for this block */
	path = ext4_find_extent(inode, map->m_lblk, NULL, 0);
	if (IS_ERR(path)) {
		err = PTR_ERR(path);
		/* out2 drops refs on and frees 'path'; hand it NULL, not an ERR_PTR */
		path = NULL;
		goto out2;
	}

	depth = ext_depth(inode);

	/*
	 * consistent leaf must not be empty;
	 * this situation is possible, though, _during_ tree modification;
	 * this is why assert can't be put in ext4_find_extent()
	 */
	if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
		EXT4_ERROR_INODE(inode, "bad extent address "
				 "lblock: %lu, depth: %d pblock %lld",
				 (unsigned long) map->m_lblk, depth,
				 path[depth].p_block);
		err = -EFSCORRUPTED;
		goto out2;
	}

	ex = path[depth].p_ext;
	if (ex) {
		ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
		ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
		unsigned short ee_len;


		/*
		 * unwritten extents are treated as holes, except that
		 * we split out initialized portions during a write.
		 */
		ee_len = ext4_ext_get_actual_len(ex);

		trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);

		/* if found extent covers block, simply return it */
		if (in_range(map->m_lblk, ee_block, ee_len)) {
			newblock = map->m_lblk - ee_block + ee_start;
			/* number of remaining blocks in the extent */
			allocated = ee_len - (map->m_lblk - ee_block);
			ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
				  ee_block, ee_len, newblock);

			/*
			 * If the extent is initialized check whether the
			 * caller wants to convert it to unwritten.
			 */
			if ((!ext4_ext_is_unwritten(ex)) &&
			    (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
				allocated = convert_initialized_extent(
						handle, inode, map, &path,
						allocated);
				goto out2;
			} else if (!ext4_ext_is_unwritten(ex))
				/* plain initialized extent: report it as mapped */
				goto out;

			/* unwritten extent: the helper's result is returned as-is */
			ret = ext4_ext_handle_unwritten_extents(
				handle, inode, map, &path, flags,
				allocated, newblock);
			if (ret < 0)
				err = ret;
			else
				allocated = ret;
			goto out2;
		}
	}

	/*
	 * requested block isn't allocated yet;
	 * we couldn't try to create block if create flag is zero
	 */
	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
		ext4_lblk_t hole_start, hole_len;

		hole_start = map->m_lblk;
		hole_len = ext4_ext_determine_hole(inode, path, &hole_start);
		/*
		 * put just found gap into cache to speed up
		 * subsequent requests
		 */
		ext4_ext_put_gap_in_cache(inode, hole_start, hole_len);

		/* Update hole_len to reflect hole size after map->m_lblk */
		if (hole_start != map->m_lblk)
			hole_len -= map->m_lblk - hole_start;
		map->m_pblk = 0;
		map->m_len = min_t(unsigned int, map->m_len, hole_len);

		/* 'allocated' and 'err' are still 0 here, so 0 is returned */
		goto out2;
	}

	/*
	 * Okay, we need to do block allocation.
	 */
	newex.ee_block = cpu_to_le32(map->m_lblk);
	cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);

	/*
	 * If we are doing bigalloc, check to see if the extent returned
	 * by ext4_find_extent() implies a cluster we can use.
	 */
	if (cluster_offset && ex &&
	    get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
		ar.len = allocated = map->m_len;
		newblock = map->m_pblk;
		map_from_cluster = true;
		goto got_allocated_blocks;
	}

	/* find neighbour allocated blocks */
	ar.lleft = map->m_lblk;
	err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
	if (err)
		goto out2;
	ar.lright = map->m_lblk;
	ex2 = NULL;
	err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
	if (err)
		goto out2;

	/* Check if the extent after searching to the right implies a
	 * cluster we can use. */
	if ((sbi->s_cluster_ratio > 1) && ex2 &&
	    get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
		ar.len = allocated = map->m_len;
		newblock = map->m_pblk;
		map_from_cluster = true;
		goto got_allocated_blocks;
	}

	/*
	 * See if request is beyond maximum number of blocks we can have in
	 * a single extent. For an initialized extent this limit is
	 * EXT_INIT_MAX_LEN and for an unwritten extent this limit is
	 * EXT_UNWRITTEN_MAX_LEN.
	 */
	if (map->m_len > EXT_INIT_MAX_LEN &&
	    !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
		map->m_len = EXT_INIT_MAX_LEN;
	else if (map->m_len > EXT_UNWRITTEN_MAX_LEN &&
		 (flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
		map->m_len = EXT_UNWRITTEN_MAX_LEN;

	/* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
	newex.ee_len = cpu_to_le16(map->m_len);
	err = ext4_ext_check_overlap(sbi, inode, &newex, path);
	/*
	 * A non-zero result is not treated as an error here: the length is
	 * simply re-read from newex in that case.
	 */
	if (err)
		allocated = ext4_ext_get_actual_len(&newex);
	else
		allocated = map->m_len;

	/* allocate new block */
	ar.inode = inode;
	ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
	ar.logical = map->m_lblk;
	/*
	 * We calculate the offset from the beginning of the cluster
	 * for the logical block number, since when we allocate a
	 * physical cluster, the physical block should start at the
	 * same offset from the beginning of the cluster.  This is
	 * needed so that future calls to get_implied_cluster_alloc()
	 * work correctly.
	 */
	offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
	ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
	ar.goal -= offset;
	ar.logical -= offset;
	if (S_ISREG(inode->i_mode))
		ar.flags = EXT4_MB_HINT_DATA;
	else
		/* disable in-core preallocation for non-regular files */
		ar.flags = 0;
	if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
		ar.flags |= EXT4_MB_HINT_NOPREALLOC;
	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
		ar.flags |= EXT4_MB_DELALLOC_RESERVED;
	if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
		ar.flags |= EXT4_MB_USE_RESERVED;
	newblock = ext4_mb_new_blocks(handle, &ar, &err);
	if (!newblock)
		goto out2;
	ext_debug("allocate new block: goal %llu, found %llu/%u\n",
		  ar.goal, newblock, allocated);
	/* from here on a failure must give the new blocks back */
	free_on_err = 1;
	allocated_clusters = ar.len;
	/* convert the cluster count back to blocks usable from 'offset' on */
	ar.len = EXT4_C2B(sbi, ar.len) - offset;
	if (ar.len > allocated)
		ar.len = allocated;

got_allocated_blocks:
	/* try to insert new extent into found leaf and return */
	/* 'offset' is still 0 when we arrive via an implied-cluster match */
	ext4_ext_store_pblock(&newex, newblock + offset);
	newex.ee_len = cpu_to_le16(ar.len);
	/* Mark unwritten */
	if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){
		ext4_ext_mark_unwritten(&newex);
		map->m_flags |= EXT4_MAP_UNWRITTEN;
	}

	err = 0;
	if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0)
		err = check_eofblocks_fl(handle, inode, map->m_lblk,
					 path, ar.len);
	if (!err)
		err = ext4_ext_insert_extent(handle, inode, &path,
					     &newex, flags);

	if (err && free_on_err) {
		int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
			EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
		/* free data blocks we just allocated */
		/* not a good idea to call discard here directly,
		 * but otherwise we'd need to call it every free() */
		ext4_discard_preallocations(inode);
		ext4_free_blocks(handle, inode, NULL, newblock,
				 EXT4_C2B(sbi, allocated_clusters), fb_flags);
		goto out2;
	}

	/* previous routine could use block we allocated */
	newblock = ext4_ext_pblock(&newex);
	allocated = ext4_ext_get_actual_len(&newex);
	if (allocated > map->m_len)
		allocated = map->m_len;
	map->m_flags |= EXT4_MAP_NEW;

	/*
	 * Update reserved blocks/metadata blocks after successful
	 * block allocation which had been deferred till now.
	 */
	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
		unsigned int reserved_clusters;
		/*
		 * Check how many clusters we had reserved for this
		 * allocated range
		 */
		reserved_clusters = get_reserved_cluster_alloc(inode,
						map->m_lblk, allocated);
		if (!map_from_cluster) {
			BUG_ON(allocated_clusters < reserved_clusters);
			if (reserved_clusters < allocated_clusters) {
				struct ext4_inode_info *ei = EXT4_I(inode);
				int reservation = allocated_clusters -
						  reserved_clusters;
				/*
				 * It seems we claimed few clusters outside of
				 * the range of this allocation. We should give
				 * it back to the reservation pool. This can
				 * happen in the following case:
				 *
				 * * Suppose s_cluster_ratio is 4 (i.e., each
				 *   cluster has 4 blocks. Thus, the clusters
				 *   are [0-3],[4-7],[8-11]...
				 * * First comes delayed allocation write for
				 *   logical blocks 10 & 11. Since there were no
				 *   previous delayed allocated blocks in the
				 *   range [8-11], we would reserve 1 cluster
				 *   for this write.
				 * * Next comes write for logical blocks 3 to 8.
				 *   In this case, we will reserve 2 clusters
				 *   (for [0-3] and [4-7]; and not for [8-11] as
				 *   that range has a delayed allocated blocks.
				 *   Thus total reserved clusters now becomes 3.
				 * * Now, during the delayed allocation writeout
				 *   time, we will first write blocks [3-8] and
				 *   allocate 3 clusters for writing these
				 *   blocks. Also, we would claim all these
				 *   three clusters above.
				 * * Now when we come here to writeout the
				 *   blocks [10-11], we would expect to claim
				 *   the reservation of 1 cluster we had made
				 *   (and we would claim it since there are no
				 *   more delayed allocated blocks in the range
				 *   [8-11]. But our reserved cluster count had
				 *   already gone to 0.
				 *
				 *   Thus, at the step 4 above when we determine
				 *   that there are still some unwritten delayed
				 *   allocated blocks outside of our current
				 *   block range, we should increment the
				 *   reserved clusters count so that when the
				 *   remaining blocks finally gets written, we
				 *   could claim them.
				 */
				dquot_reserve_block(inode,
						EXT4_C2B(sbi, reservation));
				spin_lock(&ei->i_block_reservation_lock);
				ei->i_reserved_data_blocks += reservation;
				spin_unlock(&ei->i_block_reservation_lock);
			}
			/*
			 * We will claim quota for all newly allocated blocks.
			 * We're updating the reserved space *after* the
			 * correction above so we do not accidentally free
			 * all the metadata reservation because we might
			 * actually need it later on.
			 */
			ext4_da_update_reserve_space(inode, allocated_clusters,
							1);
		}
	}

	/*
	 * Cache the extent and update transaction to commit on fdatasync only
	 * when it is _not_ an unwritten extent.
	 */
	if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0)
		ext4_update_inode_fsync_trans(handle, inode, 1);
	else
		ext4_update_inode_fsync_trans(handle, inode, 0);
out:
	/* success path: publish the mapping, clamped to the requested length */
	if (allocated > map->m_len)
		allocated = map->m_len;
	ext4_ext_show_leaf(inode, path);
	map->m_flags |= EXT4_MAP_MAPPED;
	map->m_pblk = newblock;
	map->m_len = allocated;
out2:
	/* common exit: release the path; an error takes priority over a count */
	ext4_ext_drop_refs(path);
	kfree(path);

	trace_ext4_ext_map_blocks_exit(inode, flags, map,
				       err ? err : allocated);
	return err ? err : allocated;
}

/*
 * ext4_ext_truncate:
 * Drop everything past i_size from an extent-mapped inode: record i_size
 * as the on-disk size, forget extent-status entries from the first block
 * past EOF onwards, then call ext4_ext_remove_space() for that block
 * through EXT_MAX_BLOCKS - 1.
 * Returns 0 or the first error reported by one of those steps.
 */
int ext4_ext_truncate(handle_t *handle, struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	ext4_lblk_t last_block;
	int err = 0;

	/*
	 * TODO: optimization is possible here.
	 * Probably we need not scan at all,
	 * because page truncation is enough.
	 */

	/* we have to know where to truncate from in crash case */
	EXT4_I(inode)->i_disksize = inode->i_size;
	err = ext4_mark_inode_dirty(handle, inode);
	if (err)
		return err;

	/* first logical block that lies entirely past i_size (size rounded up) */
	last_block = (inode->i_size + sb->s_blocksize - 1)
			>> EXT4_BLOCK_SIZE_BITS(sb);
retry:
	err = ext4_es_remove_extent(inode, last_block,
				    EXT_MAX_BLOCKS - last_block);
	/* -ENOMEM is retried indefinitely after a short back-off */
	if (err == -ENOMEM) {
		cond_resched();
		congestion_wait(BLK_RW_ASYNC, HZ/50);
		goto retry;
	}
	if (err)
		return err;
	return ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
}

/*
 * ext4_alloc_file_blocks:
 * Map @len blocks starting at logical block @offset by calling
 * ext4_map_blocks() with @flags in a loop, using one journal handle per
 * iteration.
 *
 * After every successful mapping ctime is refreshed and either the inode
 * size is advanced to the end of the mapped range, capped at @new_size
 * (when @new_size is non-zero), or EXT4_INODE_EOFBLOCKS is set when the
 * mapped range ends past i_size (when @new_size is zero).
 *
 * An -ENOSPC result is retried for as long as ext4_should_retry_alloc()
 * agrees.
 *
 * Returns the result of the last ext4_journal_stop() if the last mapping
 * call returned a positive count, otherwise the (<= 0) mapping/journal
 * start result.
 */
static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
				  ext4_lblk_t len, loff_t new_size,
				  int flags)
{
	struct inode *inode = file_inode(file);
	handle_t *handle;
	int ret = 0;
	int ret2 = 0;
	int retries = 0;
	int depth = 0;
	struct ext4_map_blocks map;
	unsigned int credits;
	loff_t epos;

	BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS));
	map.m_lblk = offset;
	map.m_len = len;
	/*
	 * Don't normalize the request if it can fit in one extent so
	 * that it doesn't get unnecessarily split into multiple
	 * extents.
	 */
	if (len <= EXT_UNWRITTEN_MAX_LEN)
		flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;

	/*
	 * credits to insert 1 extent into extent tree
	 */
	credits = ext4_chunk_trans_blocks(inode, len);
	depth = ext_depth(inode);

retry:
	while (ret >= 0 && len) {
		/*
		 * Recalculate credits when extent tree depth changes.
		 */
		if (depth != ext_depth(inode)) {
			credits = ext4_chunk_trans_blocks(inode, len);
			depth = ext_depth(inode);
		}

		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
					    credits);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			break;
		}
		ret = ext4_map_blocks(handle, inode, &map, flags);
		if (ret <= 0) {
			ext4_debug("inode #%lu: block %u: len %u: "
				   "ext4_ext_map_blocks returned %d",
				   inode->i_ino, map.m_lblk,
				   map.m_len, ret);
			ext4_mark_inode_dirty(handle, inode);
			ret2 = ext4_journal_stop(handle);
			break;
		}
		/* advance past the blocks that were just mapped */
		map.m_lblk += ret;
		map.m_len = len = len - ret;
		/* byte offset of the end of the range mapped so far */
		epos = (loff_t)map.m_lblk << inode->i_blkbits;
		inode->i_ctime = current_time(inode);
		if (new_size) {
			if (epos > new_size)
				epos = new_size;
			if (ext4_update_inode_size(inode, epos) & 0x1)
				inode->i_mtime = inode->i_ctime;
		} else {
			if (epos > inode->i_size)
				ext4_set_inode_flag(inode,
						    EXT4_INODE_EOFBLOCKS);
		}
		ext4_mark_inode_dirty(handle, inode);
		ext4_update_inode_fsync_trans(handle, inode, 1);
		ret2 = ext4_journal_stop(handle);
		if (ret2)
			break;
	}
	if (ret == -ENOSPC &&
			ext4_should_retry_alloc(inode->i_sb, &retries)) {
		/* reset so the 'ret >= 0' loop condition lets us back in */
		ret = 0;
		goto retry;
	}

	return ret > 0 ? ret2 : ret;
}

/*
 * ext4_zero_range:
 * Handles the FALLOC_FL_ZERO_RANGE request forwarded by ext4_fallocate()
 * for the byte range [@offset, @offset + @len).
 *
 * The block-aligned interior of the range is converted to unwritten
 * extents; partial blocks at either edge are zeroed explicitly with
 * ext4_zero_partial_blocks(). Unless FALLOC_FL_KEEP_SIZE is set in @mode,
 * the inode size is extended when the range ends past i_size/i_disksize.
 *
 * Only regular, extent-mapped files are accepted (-EINVAL / -EOPNOTSUPP
 * otherwise). Returns 0 on success or a negative errno.
 */
static long ext4_zero_range(struct file *file, loff_t offset,
			    loff_t len, int mode)
{
	struct inode *inode = file_inode(file);
	handle_t *handle = NULL;
	unsigned int max_blocks;
	loff_t new_size = 0;
	int ret = 0;
	int flags;
	int credits;
	int partial_begin, partial_end;
	loff_t start, end;
	ext4_lblk_t lblk;
	unsigned int blkbits = inode->i_blkbits;

	trace_ext4_zero_range(inode, offset, len, mode);

	if (!S_ISREG(inode->i_mode))
		return -EINVAL;

	/* Call ext4_force_commit to flush all data in case of data=journal. */
	if (ext4_should_journal_data(inode)) {
		ret = ext4_force_commit(inode->i_sb);
		if (ret)
			return ret;
	}

	/*
	 * Round up offset. This is not fallocate, we need to zero out
	 * blocks, so convert interior block aligned part of the range to
	 * unwritten and possibly manually zero out unaligned parts of the
	 * range.
	 */
	start = round_up(offset, 1 << blkbits);
	end = round_down((offset + len), 1 << blkbits);

	if (start < offset || end > offset + len)
		return -EINVAL;
	/* non-zero when the corresponding edge is not block aligned */
	partial_begin = offset & ((1 << blkbits) - 1);
	partial_end = (offset + len) & ((1 << blkbits) - 1);

	/* number of whole blocks inside the range; 0 if there are none */
	lblk = start >> blkbits;
	max_blocks = (end >> blkbits);
	if (max_blocks < lblk)
		max_blocks = 0;
	else
		max_blocks -= lblk;

	inode_lock(inode);

	/*
	 * Indirect files do not support unwritten extents
	 */
	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
		ret = -EOPNOTSUPP;
		goto out_mutex;
	}

	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
	    (offset + len > i_size_read(inode) ||
	     offset + len > EXT4_I(inode)->i_disksize)) {
		new_size = offset + len;
		ret = inode_newsize_ok(inode, new_size);
		if (ret)
			goto out_mutex;
	}

	flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
	if (mode & FALLOC_FL_KEEP_SIZE)
		flags |= EXT4_GET_BLOCKS_KEEP_SIZE;

	/* Wait all existing dio workers, newcomers will block on i_mutex */
	ext4_inode_block_unlocked_dio(inode);
	inode_dio_wait(inode);

	/* Preallocate the range including the unaligned edges */
	if (partial_begin || partial_end) {
		ret = ext4_alloc_file_blocks(file,
				round_down(offset, 1 << blkbits) >> blkbits,
				(round_up((offset + len), 1 << blkbits) -
				 round_down(offset, 1 << blkbits)) >> blkbits,
				new_size, flags);
		if (ret)
			goto out_dio;

	}

	/* Zero range excluding the unaligned edges */
	if (max_blocks > 0) {
		flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
			  EXT4_EX_NOCACHE);

		/*
		 * Prevent page faults from reinstantiating pages we have
		 * released from page cache.
		 */
		down_write(&EXT4_I(inode)->i_mmap_sem);
		ret = ext4_update_disksize_before_punch(inode, offset, len);
		if (ret) {
			up_write(&EXT4_I(inode)->i_mmap_sem);
			goto out_dio;
		}
		/* Now release the pages and zero block aligned part of pages */
		truncate_pagecache_range(inode, start, end - 1);
		inode->i_mtime = inode->i_ctime = current_time(inode);

		ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
					     flags);
		up_write(&EXT4_I(inode)->i_mmap_sem);
		if (ret)
			goto out_dio;
	}
	/* fully block-aligned request: nothing left to zero by hand */
	if (!partial_begin && !partial_end)
		goto out_dio;

	/*
	 * In worst case we have to writeout two nonadjacent unwritten
	 * blocks and update the inode
	 */
	credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1;
	if (ext4_should_journal_data(inode))
		credits += 2;
	handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		ext4_std_error(inode->i_sb, ret);
		goto out_dio;
	}

	inode->i_mtime = inode->i_ctime = current_time(inode);
	if (new_size) {
		ext4_update_inode_size(inode, new_size);
	} else {
		/*
		* Mark that we allocate beyond EOF so the subsequent truncate
		* can proceed even if the new size is the same as i_size.
		*/
		if ((offset + len) > i_size_read(inode))
			ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
	}
	ext4_mark_inode_dirty(handle, inode);

	/* Zero out partial block at the edges of the range */
	ret = ext4_zero_partial_blocks(handle, inode, offset, len);
	if (ret >= 0)
		ext4_update_inode_fsync_trans(handle, inode, 1);

	if (file->f_flags & O_SYNC)
		ext4_handle_sync(handle);

	ext4_journal_stop(handle);
out_dio:
	ext4_inode_resume_unlocked_dio(inode);
out_mutex:
	inode_unlock(inode);
	return ret;
}

/*
 * preallocate space for a file. This implements ext4's fallocate file
 * operation, which gets called from sys_fallocate system call.
 * For block-mapped files, posix_fallocate should fall back to the method
 * of writing zeroes to the required new blocks (the same behavior which is
 * expected for file systems which do not support fallocate() system call).
 *
 * Punch-hole, collapse-range, insert-range and zero-range requests are
 * dispatched to their own helpers; everything else is a plain
 * preallocation of unwritten extents via ext4_alloc_file_blocks().
 */
long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
	struct inode *inode = file_inode(file);
	loff_t new_size = 0;
	unsigned int max_blocks;
	int ret = 0;
	int flags;
	ext4_lblk_t lblk;
	unsigned int blkbits = inode->i_blkbits;

	/*
	 * Encrypted inodes can't handle collapse range or insert
	 * range since we would need to re-encrypt blocks with a
	 * different IV or XTS tweak (which are based on the logical
	 * block number).
	 *
	 * XXX It's not clear why zero range isn't working, but we'll
	 * leave it disabled for encrypted inodes for now.  This is a
	 * bug we should fix....
	 */
	if (ext4_encrypted_inode(inode) &&
	    (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE |
		     FALLOC_FL_ZERO_RANGE)))
		return -EOPNOTSUPP;

	/* Return error if mode is not supported */
	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
		     FALLOC_FL_INSERT_RANGE))
		return -EOPNOTSUPP;

	if (mode & FALLOC_FL_PUNCH_HOLE)
		return ext4_punch_hole(inode, offset, len);

	/* every remaining mode goes through inline-data conversion first */
	ret = ext4_convert_inline_data(inode);
	if (ret)
		return ret;

	if (mode & FALLOC_FL_COLLAPSE_RANGE)
		return ext4_collapse_range(inode, offset, len);

	if (mode & FALLOC_FL_INSERT_RANGE)
		return ext4_insert_range(inode, offset, len);

	if (mode & FALLOC_FL_ZERO_RANGE)
		return ext4_zero_range(file, offset, len, mode);

	trace_ext4_fallocate_enter(inode, offset, len, mode);
	lblk = offset >> blkbits;

	max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
	flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
	if (mode & FALLOC_FL_KEEP_SIZE)
		flags |= EXT4_GET_BLOCKS_KEEP_SIZE;

	inode_lock(inode);

	/*
	 * We only support preallocation for extent-based files only
	 */
	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
		ret = -EOPNOTSUPP;
		goto out;
	}

	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
	    (offset + len > i_size_read(inode) ||
	     offset + len > EXT4_I(inode)->i_disksize)) {
		new_size = offset + len;
		ret = inode_newsize_ok(inode, new_size);
		if (ret)
			goto out;
	}

	/* Wait all existing dio workers, newcomers will block on i_mutex */
	ext4_inode_block_unlocked_dio(inode);
	inode_dio_wait(inode);

	ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags);
	ext4_inode_resume_unlocked_dio(inode);
	if (ret)
		goto out;

	/* O_SYNC on a journalled filesystem: complete the inode's sync transaction */
	if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
		ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
						EXT4_I(inode)->i_sync_tid);
	}
out:
	inode_unlock(inode);
	trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
	return ret;
}

/*
 * This function convert a range of blocks to written extents
 * The caller of this function will pass the start offset and the size.
 * all unwritten extents within this range will be converted to
 * written extents.
 *
 * This function is called from the direct IO end io call back
 * function, to convert the fallocated extents after IO is completed.
 * Returns 0 on success.
 */
int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
				   loff_t offset, ssize_t len)
{
	unsigned int max_blocks;
	int ret = 0;
	int ret2 = 0;
	struct ext4_map_blocks map;
	unsigned int credits, blkbits = inode->i_blkbits;

	map.m_lblk = offset >> blkbits;
	max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);

	/*
	 * This is somewhat ugly but the idea is clear: When transaction is
	 * reserved, everything goes into it. Otherwise we rather start several
	 * smaller transactions for conversion of each extent separately.
	 */
	if (handle) {
		handle = ext4_journal_start_reserved(handle,
						     EXT4_HT_EXT_CONVERT);
		if (IS_ERR(handle))
			return PTR_ERR(handle);
		/* credits == 0 doubles as "use the single reserved handle" */
		credits = 0;
	} else {
		/*
		 * credits to insert 1 extent into extent tree
		 */
		credits = ext4_chunk_trans_blocks(inode, max_blocks);
	}
	while (ret >= 0 && ret < max_blocks) {
		/* skip what the previous iteration converted (ret is 0 at first) */
		map.m_lblk += ret;
		map.m_len = (max_blocks -= ret);
		if (credits) {
			handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
						    credits);
			if (IS_ERR(handle)) {
				ret = PTR_ERR(handle);
				break;
			}
		}
		ret = ext4_map_blocks(handle, inode, &map,
				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
		if (ret <= 0)
			ext4_warning(inode->i_sb,
				     "inode #%lu: block %u: len %u: "
				     "ext4_ext_map_blocks returned %d",
				     inode->i_ino, map.m_lblk,
				     map.m_len, ret);
		ext4_mark_inode_dirty(handle, inode);
		if (credits)
			ret2 = ext4_journal_stop(handle);
		if (ret <= 0 || ret2)
			break;
	}
	/* the reserved handle is stopped once, after the whole range */
	if (!credits)
		ret2 = ext4_journal_stop(handle);
	return ret > 0 ? ret2 : ret;
}

/*
 * If newes is not existing extent (newes->es_pblk equals zero) find
 * delayed extent at start of newes and update newes accordingly and
 * return start of the next delayed extent.
 *
 * If newes is existing extent (newes->es_pblk is not equal zero)
 * return start of next delayed extent or EXT_MAX_BLOCKS if no delayed
 * extent found. Leave newes unmodified.
 */
static int ext4_find_delayed_extent(struct inode *inode,
				    struct extent_status *newes)
{
	struct extent_status es;
	ext4_lblk_t block, next_del;

	if (newes->es_pblk == 0) {
		ext4_es_find_delayed_extent_range(inode, newes->es_lblk,
				newes->es_lblk + newes->es_len - 1, &es);

		/*
		 * No extent in extent-tree contains block @newes->es_pblk,
		 * then the block may stay in 1)a hole or 2)delayed-extent.
		 */
		if (es.es_len == 0)
			/* A hole found. */
			return 0;

		if (es.es_lblk > newes->es_lblk) {
			/* A hole found. */
			newes->es_len = min(es.es_lblk - newes->es_lblk,
					    newes->es_len);
			return 0;
		}

		/* delayed extent covers the start: stretch newes to its end */
		newes->es_len = es.es_lblk + es.es_len - newes->es_lblk;
	}

	/* look for the next delayed extent after the end of newes */
	block = newes->es_lblk + newes->es_len;
	ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es);
	if (es.es_len == 0)
		next_del = EXT_MAX_BLOCKS;
	else
		next_del = es.es_lblk;

	/* NOTE(review): next_del is an ext4_lblk_t but the return type is int */
	return next_del;
}
/* fiemap flags we can handle specified here */
#define EXT4_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)

/*
 * ext4_xattr_fiemap:
 * Report where the inode's extended attributes live as a single fiemap
 * extent flagged FIEMAP_EXTENT_LAST: either the area of the on-disk
 * inode that follows the extra inode fields (also flagged
 * FIEMAP_EXTENT_DATA_INLINE) or the block referenced by i_file_acl.
 * Nothing is reported when the computed physical address is 0.
 * Returns 0, or a negative error from ext4_get_inode_loc() or
 * fiemap_fill_next_extent().
 */
static int ext4_xattr_fiemap(struct inode *inode,
				struct fiemap_extent_info *fieinfo)
{
	__u64 physical = 0;
	__u64 length;
	__u32 flags = FIEMAP_EXTENT_LAST;
	int blockbits = inode->i_sb->s_blocksize_bits;
	int error = 0;

	/* in-inode? */
	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
		struct ext4_iloc iloc;
		int offset;	/* offset of xattr in inode */

		error = ext4_get_inode_loc(inode, &iloc);
		if (error)
			return error;
		physical = (__u64)iloc.bh->b_blocknr << blockbits;
		offset = EXT4_GOOD_OLD_INODE_SIZE +
				EXT4_I(inode)->i_extra_isize;
		physical += offset;
		length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
		flags |= FIEMAP_EXTENT_DATA_INLINE;
		brelse(iloc.bh);
	} else { /* external block */
		physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
		length = inode->i_sb->s_blocksize;
	}

	if (physical)
		error = fiemap_fill_next_extent(fieinfo, 0, physical,
						length, flags);
	/* positive results from fiemap_fill_next_extent() are reported as 0 */
	return (error < 0 ? error : 0);
}

/*
 * ext4_fiemap:
 * fiemap entry point for ext4 covering the byte range
 * [@start, @start + @len).
 *
 * Inline-data inodes are answered by ext4_inline_data_fiemap() when it
 * reports the data is still inline; non-extent inodes fall back to
 * generic_block_fiemap(). For extent-mapped inodes either the xattr
 * location (FIEMAP_FLAG_XATTR) or the extents of the requested range are
 * reported.
 */
int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		__u64 start, __u64 len)
{
	ext4_lblk_t start_blk;
	int error = 0;

	if (ext4_has_inline_data(inode)) {
		int has_inline = 1;

		error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline,
						start, len);

		if (has_inline)
			return error;
	}

	if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
		error = ext4_ext_precache(inode);
		if (error)
			return error;
	}

	/* fallback to generic here if not in extents fmt */
	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
		return generic_block_fiemap(inode, fieinfo, start, len,
			ext4_get_block);

	if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
		return -EBADR;

	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
		error = ext4_xattr_fiemap(inode, fieinfo);
	} else {
		ext4_lblk_t len_blks;
		__u64 last_blk;

		/* convert the byte range to blocks, clamped to the last valid block */
		start_blk = start >> inode->i_sb->s_blocksize_bits;
		last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
		if (last_blk >= EXT_MAX_BLOCKS)
			last_blk = EXT_MAX_BLOCKS-1;
		len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;

		/*
		 * Walk the extent tree gathering extent information
		 * and pushing extents back to the user.
		 */
		error = ext4_fill_fiemap_extents(inode, start_blk,
						 len_blks, fieinfo);
	}
	return error;
}

/*
 * ext4_access_path:
 * Function to access the path buffer for marking it dirty.
 * It also checks if there are sufficient credits left in the journal handle
 * to update path.
5195 */ 5196 static int 5197 ext4_access_path(handle_t *handle, struct inode *inode, 5198 struct ext4_ext_path *path) 5199 { 5200 int credits, err; 5201 5202 if (!ext4_handle_valid(handle)) 5203 return 0; 5204 5205 /* 5206 * Check if need to extend journal credits 5207 * 3 for leaf, sb, and inode plus 2 (bmap and group 5208 * descriptor) for each block group; assume two block 5209 * groups 5210 */ 5211 if (handle->h_buffer_credits < 7) { 5212 credits = ext4_writepage_trans_blocks(inode); 5213 err = ext4_ext_truncate_extend_restart(handle, inode, credits); 5214 /* EAGAIN is success */ 5215 if (err && err != -EAGAIN) 5216 return err; 5217 } 5218 5219 err = ext4_ext_get_access(handle, inode, path); 5220 return err; 5221 } 5222 5223 /* 5224 * ext4_ext_shift_path_extents: 5225 * Shift the extents of a path structure lying between path[depth].p_ext 5226 * and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells 5227 * if it is right shift or left shift operation. 5228 */ 5229 static int 5230 ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, 5231 struct inode *inode, handle_t *handle, 5232 enum SHIFT_DIRECTION SHIFT) 5233 { 5234 int depth, err = 0; 5235 struct ext4_extent *ex_start, *ex_last; 5236 bool update = 0; 5237 depth = path->p_depth; 5238 5239 while (depth >= 0) { 5240 if (depth == path->p_depth) { 5241 ex_start = path[depth].p_ext; 5242 if (!ex_start) 5243 return -EFSCORRUPTED; 5244 5245 ex_last = EXT_LAST_EXTENT(path[depth].p_hdr); 5246 5247 err = ext4_access_path(handle, inode, path + depth); 5248 if (err) 5249 goto out; 5250 5251 if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) 5252 update = 1; 5253 5254 while (ex_start <= ex_last) { 5255 if (SHIFT == SHIFT_LEFT) { 5256 le32_add_cpu(&ex_start->ee_block, 5257 -shift); 5258 /* Try to merge to the left. 
*/ 5259 if ((ex_start > 5260 EXT_FIRST_EXTENT(path[depth].p_hdr)) 5261 && 5262 ext4_ext_try_to_merge_right(inode, 5263 path, ex_start - 1)) 5264 ex_last--; 5265 else 5266 ex_start++; 5267 } else { 5268 le32_add_cpu(&ex_last->ee_block, shift); 5269 ext4_ext_try_to_merge_right(inode, path, 5270 ex_last); 5271 ex_last--; 5272 } 5273 } 5274 err = ext4_ext_dirty(handle, inode, path + depth); 5275 if (err) 5276 goto out; 5277 5278 if (--depth < 0 || !update) 5279 break; 5280 } 5281 5282 /* Update index too */ 5283 err = ext4_access_path(handle, inode, path + depth); 5284 if (err) 5285 goto out; 5286 5287 if (SHIFT == SHIFT_LEFT) 5288 le32_add_cpu(&path[depth].p_idx->ei_block, -shift); 5289 else 5290 le32_add_cpu(&path[depth].p_idx->ei_block, shift); 5291 err = ext4_ext_dirty(handle, inode, path + depth); 5292 if (err) 5293 goto out; 5294 5295 /* we are done if current index is not a starting index */ 5296 if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr)) 5297 break; 5298 5299 depth--; 5300 } 5301 5302 out: 5303 return err; 5304 } 5305 5306 /* 5307 * ext4_ext_shift_extents: 5308 * All the extents which lies in the range from @start to the last allocated 5309 * block for the @inode are shifted either towards left or right (depending 5310 * upon @SHIFT) by @shift blocks. 5311 * On success, 0 is returned, error otherwise. 
5312 */ 5313 static int 5314 ext4_ext_shift_extents(struct inode *inode, handle_t *handle, 5315 ext4_lblk_t start, ext4_lblk_t shift, 5316 enum SHIFT_DIRECTION SHIFT) 5317 { 5318 struct ext4_ext_path *path; 5319 int ret = 0, depth; 5320 struct ext4_extent *extent; 5321 ext4_lblk_t stop, *iterator, ex_start, ex_end; 5322 5323 /* Let path point to the last extent */ 5324 path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 5325 EXT4_EX_NOCACHE); 5326 if (IS_ERR(path)) 5327 return PTR_ERR(path); 5328 5329 depth = path->p_depth; 5330 extent = path[depth].p_ext; 5331 if (!extent) 5332 goto out; 5333 5334 stop = le32_to_cpu(extent->ee_block); 5335 5336 /* 5337 * In case of left shift, Don't start shifting extents until we make 5338 * sure the hole is big enough to accommodate the shift. 5339 */ 5340 if (SHIFT == SHIFT_LEFT) { 5341 path = ext4_find_extent(inode, start - 1, &path, 5342 EXT4_EX_NOCACHE); 5343 if (IS_ERR(path)) 5344 return PTR_ERR(path); 5345 depth = path->p_depth; 5346 extent = path[depth].p_ext; 5347 if (extent) { 5348 ex_start = le32_to_cpu(extent->ee_block); 5349 ex_end = le32_to_cpu(extent->ee_block) + 5350 ext4_ext_get_actual_len(extent); 5351 } else { 5352 ex_start = 0; 5353 ex_end = 0; 5354 } 5355 5356 if ((start == ex_start && shift > ex_start) || 5357 (shift > start - ex_end)) { 5358 ext4_ext_drop_refs(path); 5359 kfree(path); 5360 return -EINVAL; 5361 } 5362 } 5363 5364 /* 5365 * In case of left shift, iterator points to start and it is increased 5366 * till we reach stop. In case of right shift, iterator points to stop 5367 * and it is decreased till we reach start. 5368 */ 5369 if (SHIFT == SHIFT_LEFT) 5370 iterator = &start; 5371 else 5372 iterator = &stop; 5373 5374 /* 5375 * Its safe to start updating extents. Start and stop are unsigned, so 5376 * in case of right shift if extent with 0 block is reached, iterator 5377 * becomes NULL to indicate the end of the loop. 
5378 */ 5379 while (iterator && start <= stop) { 5380 path = ext4_find_extent(inode, *iterator, &path, 5381 EXT4_EX_NOCACHE); 5382 if (IS_ERR(path)) 5383 return PTR_ERR(path); 5384 depth = path->p_depth; 5385 extent = path[depth].p_ext; 5386 if (!extent) { 5387 EXT4_ERROR_INODE(inode, "unexpected hole at %lu", 5388 (unsigned long) *iterator); 5389 return -EFSCORRUPTED; 5390 } 5391 if (SHIFT == SHIFT_LEFT && *iterator > 5392 le32_to_cpu(extent->ee_block)) { 5393 /* Hole, move to the next extent */ 5394 if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) { 5395 path[depth].p_ext++; 5396 } else { 5397 *iterator = ext4_ext_next_allocated_block(path); 5398 continue; 5399 } 5400 } 5401 5402 if (SHIFT == SHIFT_LEFT) { 5403 extent = EXT_LAST_EXTENT(path[depth].p_hdr); 5404 *iterator = le32_to_cpu(extent->ee_block) + 5405 ext4_ext_get_actual_len(extent); 5406 } else { 5407 extent = EXT_FIRST_EXTENT(path[depth].p_hdr); 5408 if (le32_to_cpu(extent->ee_block) > 0) 5409 *iterator = le32_to_cpu(extent->ee_block) - 1; 5410 else 5411 /* Beginning is reached, end of the loop */ 5412 iterator = NULL; 5413 /* Update path extent in case we need to stop */ 5414 while (le32_to_cpu(extent->ee_block) < start) 5415 extent++; 5416 path[depth].p_ext = extent; 5417 } 5418 ret = ext4_ext_shift_path_extents(path, shift, inode, 5419 handle, SHIFT); 5420 if (ret) 5421 break; 5422 } 5423 out: 5424 ext4_ext_drop_refs(path); 5425 kfree(path); 5426 return ret; 5427 } 5428 5429 /* 5430 * ext4_collapse_range: 5431 * This implements the fallocate's collapse range functionality for ext4 5432 * Returns: 0 and non-zero on error. 
5433 */ 5434 int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) 5435 { 5436 struct super_block *sb = inode->i_sb; 5437 ext4_lblk_t punch_start, punch_stop; 5438 handle_t *handle; 5439 unsigned int credits; 5440 loff_t new_size, ioffset; 5441 int ret; 5442 5443 /* 5444 * We need to test this early because xfstests assumes that a 5445 * collapse range of (0, 1) will return EOPNOTSUPP if the file 5446 * system does not support collapse range. 5447 */ 5448 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 5449 return -EOPNOTSUPP; 5450 5451 /* Collapse range works only on fs block size aligned offsets. */ 5452 if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) || 5453 len & (EXT4_CLUSTER_SIZE(sb) - 1)) 5454 return -EINVAL; 5455 5456 if (!S_ISREG(inode->i_mode)) 5457 return -EINVAL; 5458 5459 trace_ext4_collapse_range(inode, offset, len); 5460 5461 punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); 5462 punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); 5463 5464 /* Call ext4_force_commit to flush all data in case of data=journal. */ 5465 if (ext4_should_journal_data(inode)) { 5466 ret = ext4_force_commit(inode->i_sb); 5467 if (ret) 5468 return ret; 5469 } 5470 5471 inode_lock(inode); 5472 /* 5473 * There is no need to overlap collapse range with EOF, in which case 5474 * it is effectively a truncate operation 5475 */ 5476 if (offset + len >= i_size_read(inode)) { 5477 ret = -EINVAL; 5478 goto out_mutex; 5479 } 5480 5481 /* Currently just for extent based files */ 5482 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 5483 ret = -EOPNOTSUPP; 5484 goto out_mutex; 5485 } 5486 5487 /* Wait for existing dio to complete */ 5488 ext4_inode_block_unlocked_dio(inode); 5489 inode_dio_wait(inode); 5490 5491 /* 5492 * Prevent page faults from reinstantiating pages we have released from 5493 * page cache. 
5494 */ 5495 down_write(&EXT4_I(inode)->i_mmap_sem); 5496 /* 5497 * Need to round down offset to be aligned with page size boundary 5498 * for page size > block size. 5499 */ 5500 ioffset = round_down(offset, PAGE_SIZE); 5501 /* 5502 * Write tail of the last page before removed range since it will get 5503 * removed from the page cache below. 5504 */ 5505 ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset); 5506 if (ret) 5507 goto out_mmap; 5508 /* 5509 * Write data that will be shifted to preserve them when discarding 5510 * page cache below. We are also protected from pages becoming dirty 5511 * by i_mmap_sem. 5512 */ 5513 ret = filemap_write_and_wait_range(inode->i_mapping, offset + len, 5514 LLONG_MAX); 5515 if (ret) 5516 goto out_mmap; 5517 truncate_pagecache(inode, ioffset); 5518 5519 credits = ext4_writepage_trans_blocks(inode); 5520 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); 5521 if (IS_ERR(handle)) { 5522 ret = PTR_ERR(handle); 5523 goto out_mmap; 5524 } 5525 5526 down_write(&EXT4_I(inode)->i_data_sem); 5527 ext4_discard_preallocations(inode); 5528 5529 ret = ext4_es_remove_extent(inode, punch_start, 5530 EXT_MAX_BLOCKS - punch_start); 5531 if (ret) { 5532 up_write(&EXT4_I(inode)->i_data_sem); 5533 goto out_stop; 5534 } 5535 5536 ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1); 5537 if (ret) { 5538 up_write(&EXT4_I(inode)->i_data_sem); 5539 goto out_stop; 5540 } 5541 ext4_discard_preallocations(inode); 5542 5543 ret = ext4_ext_shift_extents(inode, handle, punch_stop, 5544 punch_stop - punch_start, SHIFT_LEFT); 5545 if (ret) { 5546 up_write(&EXT4_I(inode)->i_data_sem); 5547 goto out_stop; 5548 } 5549 5550 new_size = i_size_read(inode) - len; 5551 i_size_write(inode, new_size); 5552 EXT4_I(inode)->i_disksize = new_size; 5553 5554 up_write(&EXT4_I(inode)->i_data_sem); 5555 if (IS_SYNC(inode)) 5556 ext4_handle_sync(handle); 5557 inode->i_mtime = inode->i_ctime = current_time(inode); 5558 
ext4_mark_inode_dirty(handle, inode); 5559 ext4_update_inode_fsync_trans(handle, inode, 1); 5560 5561 out_stop: 5562 ext4_journal_stop(handle); 5563 out_mmap: 5564 up_write(&EXT4_I(inode)->i_mmap_sem); 5565 ext4_inode_resume_unlocked_dio(inode); 5566 out_mutex: 5567 inode_unlock(inode); 5568 return ret; 5569 } 5570 5571 /* 5572 * ext4_insert_range: 5573 * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate. 5574 * The data blocks starting from @offset to the EOF are shifted by @len 5575 * towards right to create a hole in the @inode. Inode size is increased 5576 * by len bytes. 5577 * Returns 0 on success, error otherwise. 5578 */ 5579 int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) 5580 { 5581 struct super_block *sb = inode->i_sb; 5582 handle_t *handle; 5583 struct ext4_ext_path *path; 5584 struct ext4_extent *extent; 5585 ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0; 5586 unsigned int credits, ee_len; 5587 int ret = 0, depth, split_flag = 0; 5588 loff_t ioffset; 5589 5590 /* 5591 * We need to test this early because xfstests assumes that an 5592 * insert range of (0, 1) will return EOPNOTSUPP if the file 5593 * system does not support insert range. 5594 */ 5595 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 5596 return -EOPNOTSUPP; 5597 5598 /* Insert range works only on fs block size aligned offsets. 
*/ 5599 if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) || 5600 len & (EXT4_CLUSTER_SIZE(sb) - 1)) 5601 return -EINVAL; 5602 5603 if (!S_ISREG(inode->i_mode)) 5604 return -EOPNOTSUPP; 5605 5606 trace_ext4_insert_range(inode, offset, len); 5607 5608 offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb); 5609 len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb); 5610 5611 /* Call ext4_force_commit to flush all data in case of data=journal */ 5612 if (ext4_should_journal_data(inode)) { 5613 ret = ext4_force_commit(inode->i_sb); 5614 if (ret) 5615 return ret; 5616 } 5617 5618 inode_lock(inode); 5619 /* Currently just for extent based files */ 5620 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 5621 ret = -EOPNOTSUPP; 5622 goto out_mutex; 5623 } 5624 5625 /* Check for wrap through zero */ 5626 if (inode->i_size + len > inode->i_sb->s_maxbytes) { 5627 ret = -EFBIG; 5628 goto out_mutex; 5629 } 5630 5631 /* Offset should be less than i_size */ 5632 if (offset >= i_size_read(inode)) { 5633 ret = -EINVAL; 5634 goto out_mutex; 5635 } 5636 5637 /* Wait for existing dio to complete */ 5638 ext4_inode_block_unlocked_dio(inode); 5639 inode_dio_wait(inode); 5640 5641 /* 5642 * Prevent page faults from reinstantiating pages we have released from 5643 * page cache. 5644 */ 5645 down_write(&EXT4_I(inode)->i_mmap_sem); 5646 /* 5647 * Need to round down to align start offset to page size boundary 5648 * for page size > block size. 
5649 */ 5650 ioffset = round_down(offset, PAGE_SIZE); 5651 /* Write out all dirty pages */ 5652 ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, 5653 LLONG_MAX); 5654 if (ret) 5655 goto out_mmap; 5656 truncate_pagecache(inode, ioffset); 5657 5658 credits = ext4_writepage_trans_blocks(inode); 5659 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); 5660 if (IS_ERR(handle)) { 5661 ret = PTR_ERR(handle); 5662 goto out_mmap; 5663 } 5664 5665 /* Expand file to avoid data loss if there is error while shifting */ 5666 inode->i_size += len; 5667 EXT4_I(inode)->i_disksize += len; 5668 inode->i_mtime = inode->i_ctime = current_time(inode); 5669 ret = ext4_mark_inode_dirty(handle, inode); 5670 if (ret) 5671 goto out_stop; 5672 5673 down_write(&EXT4_I(inode)->i_data_sem); 5674 ext4_discard_preallocations(inode); 5675 5676 path = ext4_find_extent(inode, offset_lblk, NULL, 0); 5677 if (IS_ERR(path)) { 5678 up_write(&EXT4_I(inode)->i_data_sem); 5679 goto out_stop; 5680 } 5681 5682 depth = ext_depth(inode); 5683 extent = path[depth].p_ext; 5684 if (extent) { 5685 ee_start_lblk = le32_to_cpu(extent->ee_block); 5686 ee_len = ext4_ext_get_actual_len(extent); 5687 5688 /* 5689 * If offset_lblk is not the starting block of extent, split 5690 * the extent @offset_lblk 5691 */ 5692 if ((offset_lblk > ee_start_lblk) && 5693 (offset_lblk < (ee_start_lblk + ee_len))) { 5694 if (ext4_ext_is_unwritten(extent)) 5695 split_flag = EXT4_EXT_MARK_UNWRIT1 | 5696 EXT4_EXT_MARK_UNWRIT2; 5697 ret = ext4_split_extent_at(handle, inode, &path, 5698 offset_lblk, split_flag, 5699 EXT4_EX_NOCACHE | 5700 EXT4_GET_BLOCKS_PRE_IO | 5701 EXT4_GET_BLOCKS_METADATA_NOFAIL); 5702 } 5703 5704 ext4_ext_drop_refs(path); 5705 kfree(path); 5706 if (ret < 0) { 5707 up_write(&EXT4_I(inode)->i_data_sem); 5708 goto out_stop; 5709 } 5710 } else { 5711 ext4_ext_drop_refs(path); 5712 kfree(path); 5713 } 5714 5715 ret = ext4_es_remove_extent(inode, offset_lblk, 5716 EXT_MAX_BLOCKS - offset_lblk); 5717 if 
(ret) { 5718 up_write(&EXT4_I(inode)->i_data_sem); 5719 goto out_stop; 5720 } 5721 5722 /* 5723 * if offset_lblk lies in a hole which is at start of file, use 5724 * ee_start_lblk to shift extents 5725 */ 5726 ret = ext4_ext_shift_extents(inode, handle, 5727 ee_start_lblk > offset_lblk ? ee_start_lblk : offset_lblk, 5728 len_lblk, SHIFT_RIGHT); 5729 5730 up_write(&EXT4_I(inode)->i_data_sem); 5731 if (IS_SYNC(inode)) 5732 ext4_handle_sync(handle); 5733 if (ret >= 0) 5734 ext4_update_inode_fsync_trans(handle, inode, 1); 5735 5736 out_stop: 5737 ext4_journal_stop(handle); 5738 out_mmap: 5739 up_write(&EXT4_I(inode)->i_mmap_sem); 5740 ext4_inode_resume_unlocked_dio(inode); 5741 out_mutex: 5742 inode_unlock(inode); 5743 return ret; 5744 } 5745 5746 /** 5747 * ext4_swap_extents - Swap extents between two inodes 5748 * 5749 * @inode1: First inode 5750 * @inode2: Second inode 5751 * @lblk1: Start block for first inode 5752 * @lblk2: Start block for second inode 5753 * @count: Number of blocks to swap 5754 * @mark_unwritten: Mark second inode's extents as unwritten after swap 5755 * @erp: Pointer to save error value 5756 * 5757 * This helper routine does exactly what is promise "swap extents". All other 5758 * stuff such as page-cache locking consistency, bh mapping consistency or 5759 * extent's data copying must be performed by caller. 
5760 * Locking: 5761 * i_mutex is held for both inodes 5762 * i_data_sem is locked for write for both inodes 5763 * Assumptions: 5764 * All pages from requested range are locked for both inodes 5765 */ 5766 int 5767 ext4_swap_extents(handle_t *handle, struct inode *inode1, 5768 struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, 5769 ext4_lblk_t count, int unwritten, int *erp) 5770 { 5771 struct ext4_ext_path *path1 = NULL; 5772 struct ext4_ext_path *path2 = NULL; 5773 int replaced_count = 0; 5774 5775 BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem)); 5776 BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem)); 5777 BUG_ON(!inode_is_locked(inode1)); 5778 BUG_ON(!inode_is_locked(inode2)); 5779 5780 *erp = ext4_es_remove_extent(inode1, lblk1, count); 5781 if (unlikely(*erp)) 5782 return 0; 5783 *erp = ext4_es_remove_extent(inode2, lblk2, count); 5784 if (unlikely(*erp)) 5785 return 0; 5786 5787 while (count) { 5788 struct ext4_extent *ex1, *ex2, tmp_ex; 5789 ext4_lblk_t e1_blk, e2_blk; 5790 int e1_len, e2_len, len; 5791 int split = 0; 5792 5793 path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE); 5794 if (IS_ERR(path1)) { 5795 *erp = PTR_ERR(path1); 5796 path1 = NULL; 5797 finish: 5798 count = 0; 5799 goto repeat; 5800 } 5801 path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE); 5802 if (IS_ERR(path2)) { 5803 *erp = PTR_ERR(path2); 5804 path2 = NULL; 5805 goto finish; 5806 } 5807 ex1 = path1[path1->p_depth].p_ext; 5808 ex2 = path2[path2->p_depth].p_ext; 5809 /* Do we have somthing to swap ? 
*/ 5810 if (unlikely(!ex2 || !ex1)) 5811 goto finish; 5812 5813 e1_blk = le32_to_cpu(ex1->ee_block); 5814 e2_blk = le32_to_cpu(ex2->ee_block); 5815 e1_len = ext4_ext_get_actual_len(ex1); 5816 e2_len = ext4_ext_get_actual_len(ex2); 5817 5818 /* Hole handling */ 5819 if (!in_range(lblk1, e1_blk, e1_len) || 5820 !in_range(lblk2, e2_blk, e2_len)) { 5821 ext4_lblk_t next1, next2; 5822 5823 /* if hole after extent, then go to next extent */ 5824 next1 = ext4_ext_next_allocated_block(path1); 5825 next2 = ext4_ext_next_allocated_block(path2); 5826 /* If hole before extent, then shift to that extent */ 5827 if (e1_blk > lblk1) 5828 next1 = e1_blk; 5829 if (e2_blk > lblk2) 5830 next2 = e2_blk; 5831 /* Do we have something to swap */ 5832 if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS) 5833 goto finish; 5834 /* Move to the rightest boundary */ 5835 len = next1 - lblk1; 5836 if (len < next2 - lblk2) 5837 len = next2 - lblk2; 5838 if (len > count) 5839 len = count; 5840 lblk1 += len; 5841 lblk2 += len; 5842 count -= len; 5843 goto repeat; 5844 } 5845 5846 /* Prepare left boundary */ 5847 if (e1_blk < lblk1) { 5848 split = 1; 5849 *erp = ext4_force_split_extent_at(handle, inode1, 5850 &path1, lblk1, 0); 5851 if (unlikely(*erp)) 5852 goto finish; 5853 } 5854 if (e2_blk < lblk2) { 5855 split = 1; 5856 *erp = ext4_force_split_extent_at(handle, inode2, 5857 &path2, lblk2, 0); 5858 if (unlikely(*erp)) 5859 goto finish; 5860 } 5861 /* ext4_split_extent_at() may result in leaf extent split, 5862 * path must to be revalidated. 
*/ 5863 if (split) 5864 goto repeat; 5865 5866 /* Prepare right boundary */ 5867 len = count; 5868 if (len > e1_blk + e1_len - lblk1) 5869 len = e1_blk + e1_len - lblk1; 5870 if (len > e2_blk + e2_len - lblk2) 5871 len = e2_blk + e2_len - lblk2; 5872 5873 if (len != e1_len) { 5874 split = 1; 5875 *erp = ext4_force_split_extent_at(handle, inode1, 5876 &path1, lblk1 + len, 0); 5877 if (unlikely(*erp)) 5878 goto finish; 5879 } 5880 if (len != e2_len) { 5881 split = 1; 5882 *erp = ext4_force_split_extent_at(handle, inode2, 5883 &path2, lblk2 + len, 0); 5884 if (*erp) 5885 goto finish; 5886 } 5887 /* ext4_split_extent_at() may result in leaf extent split, 5888 * path must to be revalidated. */ 5889 if (split) 5890 goto repeat; 5891 5892 BUG_ON(e2_len != e1_len); 5893 *erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth); 5894 if (unlikely(*erp)) 5895 goto finish; 5896 *erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth); 5897 if (unlikely(*erp)) 5898 goto finish; 5899 5900 /* Both extents are fully inside boundaries. Swap it now */ 5901 tmp_ex = *ex1; 5902 ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2)); 5903 ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex)); 5904 ex1->ee_len = cpu_to_le16(e2_len); 5905 ex2->ee_len = cpu_to_le16(e1_len); 5906 if (unwritten) 5907 ext4_ext_mark_unwritten(ex2); 5908 if (ext4_ext_is_unwritten(&tmp_ex)) 5909 ext4_ext_mark_unwritten(ex1); 5910 5911 ext4_ext_try_to_merge(handle, inode2, path2, ex2); 5912 ext4_ext_try_to_merge(handle, inode1, path1, ex1); 5913 *erp = ext4_ext_dirty(handle, inode2, path2 + 5914 path2->p_depth); 5915 if (unlikely(*erp)) 5916 goto finish; 5917 *erp = ext4_ext_dirty(handle, inode1, path1 + 5918 path1->p_depth); 5919 /* 5920 * Looks scarry ah..? second inode already points to new blocks, 5921 * and it was successfully dirtied. But luckily error may happen 5922 * only due to journal error, so full transaction will be 5923 * aborted anyway. 
5924 */ 5925 if (unlikely(*erp)) 5926 goto finish; 5927 lblk1 += len; 5928 lblk2 += len; 5929 replaced_count += len; 5930 count -= len; 5931 5932 repeat: 5933 ext4_ext_drop_refs(path1); 5934 kfree(path1); 5935 ext4_ext_drop_refs(path2); 5936 kfree(path2); 5937 path1 = path2 = NULL; 5938 } 5939 return replaced_count; 5940 } 5941