/*
 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
 * Written by Alex Tomas <alex@clusterfs.com>
 *
 * Architecture independence:
 *   Copyright (c) 2005, Bull S.A.
 *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

/*
 * Extents support for EXT4
 *
 * TODO:
 *   - ext4*_error() should be used in some situations
 *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
 *   - smart tree reduction
 */

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/jbd2.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/falloc.h>
#include <asm/uaccess.h>
#include <linux/fiemap.h>
#include "ext4_jbd2.h"
#include "ext4_extents.h"

#include <trace/events/ext4.h>

static int ext4_ext_truncate_extend_restart(handle_t *handle,
					    struct inode *inode,
					    int needed)
{
	int err;

	if (!ext4_handle_valid(handle))
		return 0;
	if (handle->h_buffer_credits > needed)
		return 0;
	err = ext4_journal_extend(handle, needed);
	if (err <= 0)
		return err;
	err = ext4_truncate_restart_trans(handle, inode, needed);
	if (err == 0)
		err = -EAGAIN;

	return err;
}

/*
 * could return:
 *  - EROFS
 *  - ENOMEM
 */
static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path)
{
	if (path->p_bh) {
		/* path points to block */
		return ext4_journal_get_write_access(handle, path->p_bh);
	}
	/* path points to leaf/index in inode body */
	/* we use in-core data, no need to protect them */
	return 0;
}

/*
 * could return:
 *  - EROFS
 *  - ENOMEM
 *  - EIO
 */
static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path)
{
	int err;
	if (path->p_bh) {
		/* path points to block */
		err = ext4_handle_dirty_metadata(handle, inode, path->p_bh);
	} else {
		/* path points to leaf/index in inode body */
		err = ext4_mark_inode_dirty(handle, inode);
	}
	return err;
}

static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
			      struct ext4_ext_path *path,
			      ext4_lblk_t block)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_fsblk_t bg_start;
	ext4_fsblk_t last_block;
	ext4_grpblk_t colour;
	ext4_group_t block_group;
	int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
	int depth;

	if (path) {
		struct ext4_extent *ex;
		depth = path->p_depth;

		/*
		 * Try to predict block placement assuming that we are
		 * filling in a file which will eventually be
		 * non-sparse --- i.e., in the case of libbfd writing
		 * an ELF object sections out-of-order but in a way
		 * that eventually results in a contiguous object or
		 * executable file, or some database extending a table
		 * space file.  However, this is actually somewhat
		 * non-ideal if we are writing a sparse file such as
		 * qemu or KVM writing a raw image file that is going
		 * to stay fairly sparse, since it will end up
		 * fragmenting the file system's free space.  Maybe we
		 * should have some heuristics or some way to allow
		 * userspace to pass a hint to the file system,
		 * especially if the latter case turns out to be
		 * common.
		 */
		ex = path[depth].p_ext;
		if (ex) {
			ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
			ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);

			if (block > ext_block)
				return ext_pblk + (block - ext_block);
			else
				return ext_pblk - (ext_block - block);
		}

		/* it looks like index is empty;
		 * try to find starting block from index itself */
		if (path[depth].p_bh)
			return path[depth].p_bh->b_blocknr;
	}

	/* OK. use inode's group */
	block_group = ei->i_block_group;
	if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
		/*
		 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
		 * block groups per flexgroup, reserve the first block
		 * group for directories and special files.  Regular
		 * files will start at the second block group.  This
		 * tends to speed up directory access and improves
		 * fsck times.
		 */
		block_group &= ~(flex_size-1);
		if (S_ISREG(inode->i_mode))
			block_group++;
	}
	bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;

	/*
	 * If we are doing delayed allocation, we don't need to take
	 * colour into account.
	 */
	if (test_opt(inode->i_sb, DELALLOC))
		return bg_start;

	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
		colour = (current->pid % 16) *
			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
	else
		colour = (current->pid % 16) * ((last_block - bg_start) / 16);
	return bg_start + colour + block;
}

/*
 * Allocation for a meta data block
 */
static ext4_fsblk_t
ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
			struct ext4_ext_path *path,
			struct ext4_extent *ex, int *err)
{
	ext4_fsblk_t goal, newblock;

	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
	newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err);
	return newblock;
}

static inline int ext4_ext_space_block(struct inode *inode, int check)
{
	int size;

	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
			/ sizeof(struct ext4_extent);
	if (!check) {
#ifdef AGGRESSIVE_TEST
		if (size > 6)
			size = 6;
#endif
	}
	return size;
}

static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
{
	int size;

	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
			/ sizeof(struct ext4_extent_idx);
	if (!check) {
#ifdef AGGRESSIVE_TEST
		if (size > 5)
			size = 5;
#endif
	}
	return size;
}

static inline int ext4_ext_space_root(struct inode *inode, int check)
{
	int size;

	size = sizeof(EXT4_I(inode)->i_data);
	size -= sizeof(struct ext4_extent_header);
	size /= sizeof(struct ext4_extent);
	if (!check) {
#ifdef AGGRESSIVE_TEST
		if (size > 3)
			size = 3;
#endif
	}
	return size;
}

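/*
 * Rough capacities implied by the ext4_ext_space_*() helpers (illustrative
 * figures only, assuming the usual 12-byte ext4_extent, ext4_extent_idx and
 * ext4_extent_header on-disk structures): a 4 KiB block holds
 * (4096 - 12) / 12 = 340 extents or indexes, while the 60-byte i_data root
 * holds (60 - 12) / 12 = 4 entries.
 */
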
static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
{
	int size;

	size = sizeof(EXT4_I(inode)->i_data);
	size -= sizeof(struct ext4_extent_header);
	size /= sizeof(struct ext4_extent_idx);
	if (!check) {
#ifdef AGGRESSIVE_TEST
		if (size > 4)
			size = 4;
#endif
	}
	return size;
}

/*
 * Calculate the number of metadata blocks needed
 * to allocate @blocks
 * Worst case is one block per extent
 */
int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int idxs, num = 0;

	idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
		/ sizeof(struct ext4_extent_idx));

	/*
	 * If the new delayed allocation block is contiguous with the
	 * previous da block, it can share index blocks with the
	 * previous block, so we only need to allocate a new index
	 * block every idxs leaf blocks.  At idxs**2 blocks, we need
	 * an additional index block, and at idxs**3 blocks, yet
	 * another index block.
	 */
	if (ei->i_da_metadata_calc_len &&
	    ei->i_da_metadata_calc_last_lblock+1 == lblock) {
		if ((ei->i_da_metadata_calc_len % idxs) == 0)
			num++;
		if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
			num++;
		if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
			num++;
			ei->i_da_metadata_calc_len = 0;
		} else
			ei->i_da_metadata_calc_len++;
		ei->i_da_metadata_calc_last_lblock++;
		return num;
	}

	/*
	 * In the worst case we need a new set of index blocks at
	 * every level of the inode's extent tree.
	 */
	ei->i_da_metadata_calc_len = 1;
	ei->i_da_metadata_calc_last_lblock = lblock;
	return ext_depth(inode) + 1;
}

static int
ext4_ext_max_entries(struct inode *inode, int depth)
{
	int max;

	if (depth == ext_depth(inode)) {
		if (depth == 0)
			max = ext4_ext_space_root(inode, 1);
		else
			max = ext4_ext_space_root_idx(inode, 1);
	} else {
		if (depth == 0)
			max = ext4_ext_space_block(inode, 1);
		else
			max = ext4_ext_space_block_idx(inode, 1);
	}

	return max;
}

static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
{
	ext4_fsblk_t block = ext4_ext_pblock(ext);
	int len = ext4_ext_get_actual_len(ext);

	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
}

static int ext4_valid_extent_idx(struct inode *inode,
				struct ext4_extent_idx *ext_idx)
{
	ext4_fsblk_t block = ext4_idx_pblock(ext_idx);

	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
}

static int ext4_valid_extent_entries(struct inode *inode,
				struct ext4_extent_header *eh,
				int depth)
{
	struct ext4_extent *ext;
	struct ext4_extent_idx *ext_idx;
	unsigned short entries;
	if (eh->eh_entries == 0)
		return 1;

	entries = le16_to_cpu(eh->eh_entries);

	if (depth == 0) {
		/* leaf entries */
		ext = EXT_FIRST_EXTENT(eh);
		while (entries) {
			if (!ext4_valid_extent(inode, ext))
				return 0;
			ext++;
			entries--;
		}
	} else {
		ext_idx = EXT_FIRST_INDEX(eh);
		while (entries) {
			if (!ext4_valid_extent_idx(inode, ext_idx))
				return 0;
			ext_idx++;
			entries--;
		}
	}
	return 1;
}

static int __ext4_ext_check(const char *function, unsigned int line,
			    struct inode *inode, struct ext4_extent_header *eh,
			    int depth)
{
	const char *error_msg;
	int max = 0;

	if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
		error_msg = "invalid magic";
		goto corrupted;
	}
	if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
		error_msg = "unexpected eh_depth";
		goto corrupted;
	}
	if (unlikely(eh->eh_max == 0)) {
		error_msg = "invalid eh_max";
		goto corrupted;
	}
	max = ext4_ext_max_entries(inode, depth);
	if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
		error_msg = "too large eh_max";
		goto corrupted;
	}
	if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
		error_msg = "invalid eh_entries";
		goto corrupted;
	}
	if (!ext4_valid_extent_entries(inode, eh, depth)) {
		error_msg = "invalid extent entries";
		goto corrupted;
	}
	return 0;

corrupted:
	ext4_error_inode(inode, function, line, 0,
			"bad header/extent: %s - magic %x, "
			"entries %u, max %u(%u), depth %u(%u)",
			error_msg, le16_to_cpu(eh->eh_magic),
			le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
			max, le16_to_cpu(eh->eh_depth), depth);

	return -EIO;
}

#define ext4_ext_check(inode, eh, depth)	\
	__ext4_ext_check(__func__, __LINE__, inode, eh, depth)

int ext4_ext_check_inode(struct inode *inode)
{
	return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
}

#ifdef EXT_DEBUG
static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
{
	int k, l = path->p_depth;

	ext_debug("path:");
	for (k = 0; k <= l; k++, path++) {
		if (path->p_idx) {
			ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
				  ext4_idx_pblock(path->p_idx));
		} else if (path->p_ext) {
			ext_debug("  %d:[%d]%d:%llu ",
				  le32_to_cpu(path->p_ext->ee_block),
				  ext4_ext_is_uninitialized(path->p_ext),
				  ext4_ext_get_actual_len(path->p_ext),
				  ext4_ext_pblock(path->p_ext));
		} else
			ext_debug("  []");
	}
	ext_debug("\n");
}

static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
{
	int depth = ext_depth(inode);
	struct ext4_extent_header *eh;
	struct ext4_extent *ex;
	int i;

	if (!path)
		return;

	eh = path[depth].p_hdr;
	ex = EXT_FIRST_EXTENT(eh);

	ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);

	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
		ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
			  ext4_ext_is_uninitialized(ex),
			  ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
	}
	ext_debug("\n");
}
#else
#define ext4_ext_show_path(inode, path)
#define ext4_ext_show_leaf(inode, path)
#endif

void ext4_ext_drop_refs(struct ext4_ext_path *path)
{
	int depth = path->p_depth;
	int i;

	for (i = 0; i <= depth; i++, path++)
		if (path->p_bh) {
			brelse(path->p_bh);
			path->p_bh = NULL;
		}
}

/*
 * ext4_ext_binsearch_idx:
 * binary search for the closest index of the given block
 * the header must be checked before calling this
 */
static void
ext4_ext_binsearch_idx(struct inode *inode,
			struct ext4_ext_path *path, ext4_lblk_t block)
{
	struct ext4_extent_header *eh = path->p_hdr;
	struct ext4_extent_idx *r, *l, *m;


	ext_debug("binsearch for %u(idx):  ", block);

	l = EXT_FIRST_INDEX(eh) + 1;
	r = EXT_LAST_INDEX(eh);
	while (l <= r) {
		m = l + (r - l) / 2;
		if (block < le32_to_cpu(m->ei_block))
			r = m - 1;
		else
			l = m + 1;
		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
				m, le32_to_cpu(m->ei_block),
				r, le32_to_cpu(r->ei_block));
	}

	path->p_idx = l - 1;
	ext_debug("  -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
		  ext4_idx_pblock(path->p_idx));

#ifdef CHECK_BINSEARCH
	{
		struct ext4_extent_idx *chix, *ix;
		int k;

		chix = ix = EXT_FIRST_INDEX(eh);
		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
			if (k != 0 &&
			    le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
				printk(KERN_DEBUG "k=%d, ix=0x%p, "
				       "first=0x%p\n", k,
				       ix, EXT_FIRST_INDEX(eh));
				printk(KERN_DEBUG "%u <= %u\n",
				       le32_to_cpu(ix->ei_block),
				       le32_to_cpu(ix[-1].ei_block));
			}
			BUG_ON(k && le32_to_cpu(ix->ei_block)
					   <= le32_to_cpu(ix[-1].ei_block));
			if (block < le32_to_cpu(ix->ei_block))
				break;
			chix = ix;
		}
		BUG_ON(chix != path->p_idx);
	}
#endif

}

/*
 * ext4_ext_binsearch:
 * binary search for closest extent of the given block
 * the header must be checked before calling this
 */
static void
ext4_ext_binsearch(struct inode *inode,
		struct ext4_ext_path *path, ext4_lblk_t block)
{
	struct ext4_extent_header *eh = path->p_hdr;
	struct ext4_extent *r, *l, *m;

	if (eh->eh_entries == 0) {
		/*
		 * this leaf is empty:
		 * we get such a leaf in split/add case
		 */
		return;
	}

	ext_debug("binsearch for %u:  ", block);

	l = EXT_FIRST_EXTENT(eh) + 1;
	r = EXT_LAST_EXTENT(eh);

	while (l <= r) {
		m = l + (r - l) / 2;
		if (block < le32_to_cpu(m->ee_block))
			r = m - 1;
		else
			l = m + 1;
		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
				m, le32_to_cpu(m->ee_block),
				r, le32_to_cpu(r->ee_block));
	}

	path->p_ext = l - 1;
	ext_debug("  -> %d:%llu:[%d]%d ",
			le32_to_cpu(path->p_ext->ee_block),
			ext4_ext_pblock(path->p_ext),
			ext4_ext_is_uninitialized(path->p_ext),
			ext4_ext_get_actual_len(path->p_ext));

#ifdef CHECK_BINSEARCH
	{
		struct ext4_extent *chex, *ex;
		int k;

		chex = ex = EXT_FIRST_EXTENT(eh);
		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
			BUG_ON(k && le32_to_cpu(ex->ee_block)
					  <= le32_to_cpu(ex[-1].ee_block));
			if (block < le32_to_cpu(ex->ee_block))
				break;
			chex = ex;
		}
		BUG_ON(chex != path->p_ext);
	}
#endif

}

int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
{
	struct ext4_extent_header *eh;

	eh = ext_inode_hdr(inode);
	eh->eh_depth = 0;
	eh->eh_entries = 0;
	eh->eh_magic = EXT4_EXT_MAGIC;
	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
	ext4_mark_inode_dirty(handle, inode);
	ext4_ext_invalidate_cache(inode);
	return 0;
}

struct ext4_ext_path *
ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
					struct ext4_ext_path *path)
{
	struct ext4_extent_header *eh;
	struct buffer_head *bh;
	short int depth, i, ppos = 0, alloc = 0;

	eh = ext_inode_hdr(inode);
	depth = ext_depth(inode);

	/* account possible depth increase */
	if (!path) {
		path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
				GFP_NOFS);
		if (!path)
			return ERR_PTR(-ENOMEM);
		alloc = 1;
	}
	path[0].p_hdr = eh;
	path[0].p_bh = NULL;

	i = depth;
	/* walk through the tree */
	while (i) {
		int need_to_validate = 0;

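		/*
		 * At each level, binary-search the index entries for @block
		 * and descend into the child block they point to; entries of
		 * a block that had to be read from disk are validated with
		 * ext4_ext_check() once its header has been mapped.
		 */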
		ext_debug("depth %d: num %d, max %d\n",
			  ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));

		ext4_ext_binsearch_idx(inode, path + ppos, block);
		path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
		path[ppos].p_depth = i;
		path[ppos].p_ext = NULL;

		bh = sb_getblk(inode->i_sb, path[ppos].p_block);
		if (unlikely(!bh))
			goto err;
		if (!bh_uptodate_or_lock(bh)) {
			trace_ext4_ext_load_extent(inode, block,
						path[ppos].p_block);
			if (bh_submit_read(bh) < 0) {
				put_bh(bh);
				goto err;
			}
			/* validate the extent entries */
			need_to_validate = 1;
		}
		eh = ext_block_hdr(bh);
		ppos++;
		if (unlikely(ppos > depth)) {
			put_bh(bh);
			EXT4_ERROR_INODE(inode,
					 "ppos %d > depth %d", ppos, depth);
			goto err;
		}
		path[ppos].p_bh = bh;
		path[ppos].p_hdr = eh;
		i--;

		if (need_to_validate && ext4_ext_check(inode, eh, i))
			goto err;
	}

	path[ppos].p_depth = i;
	path[ppos].p_ext = NULL;
	path[ppos].p_idx = NULL;

	/* find extent */
	ext4_ext_binsearch(inode, path + ppos, block);
	/* if not an empty leaf */
	if (path[ppos].p_ext)
		path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);

	ext4_ext_show_path(inode, path);

	return path;

err:
	ext4_ext_drop_refs(path);
	if (alloc)
		kfree(path);
	return ERR_PTR(-EIO);
}

/*
 * ext4_ext_insert_index:
 * insert new index [@logical;@ptr] into the block at @curp;
 * check where to insert: before @curp or after @curp
 */
static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
				 struct ext4_ext_path *curp,
				 int logical, ext4_fsblk_t ptr)
{
	struct ext4_extent_idx *ix;
	int len, err;

	err = ext4_ext_get_access(handle, inode, curp);
	if (err)
		return err;

	if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
		EXT4_ERROR_INODE(inode,
				 "logical %d == ei_block %d!",
				 logical, le32_to_cpu(curp->p_idx->ei_block));
		return -EIO;
	}
	len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
		/* insert after */
		if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
			len = (len - 1) * sizeof(struct ext4_extent_idx);
			len = len < 0 ? 0 : len;
			ext_debug("insert new index %d after: %llu. "
					"move %d from 0x%p to 0x%p\n",
					logical, ptr, len,
					(curp->p_idx + 1), (curp->p_idx + 2));
			memmove(curp->p_idx + 2, curp->p_idx + 1, len);
		}
		ix = curp->p_idx + 1;
	} else {
		/* insert before */
		len = len * sizeof(struct ext4_extent_idx);
		len = len < 0 ? 0 : len;
		ext_debug("insert new index %d before: %llu. "
				"move %d from 0x%p to 0x%p\n",
				logical, ptr, len,
				curp->p_idx, (curp->p_idx + 1));
		memmove(curp->p_idx + 1, curp->p_idx, len);
		ix = curp->p_idx;
	}

	ix->ei_block = cpu_to_le32(logical);
	ext4_idx_store_pblock(ix, ptr);
	le16_add_cpu(&curp->p_hdr->eh_entries, 1);

	if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
			     > le16_to_cpu(curp->p_hdr->eh_max))) {
		EXT4_ERROR_INODE(inode,
				 "logical %d == ei_block %d!",
				 logical, le32_to_cpu(curp->p_idx->ei_block));
		return -EIO;
	}
	if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
		EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
		return -EIO;
	}

	err = ext4_ext_dirty(handle, inode, curp);
	ext4_std_error(inode->i_sb, err);

	return err;
}

/*
 * ext4_ext_split:
 * inserts new subtree into the path, using free index entry
 * at depth @at:
 * - allocates all needed blocks (new leaf and all intermediate index blocks)
 * - makes decision where to split
 * - moves remaining extents and index entries (right to the split point)
 *   into the newly allocated blocks
 * - initializes subtree
 */
static int ext4_ext_split(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path,
				struct ext4_extent *newext, int at)
{
	struct buffer_head *bh = NULL;
	int depth = ext_depth(inode);
	struct ext4_extent_header *neh;
	struct ext4_extent_idx *fidx;
	struct ext4_extent *ex;
	int i = at, k, m, a;
	ext4_fsblk_t newblock, oldblock;
	__le32 border;
	ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
	int err = 0;

	/* make decision: where to split? */
	/* FIXME: now decision is simplest: at current extent */

	/* if current leaf will be split, then we should use
	 * border from split point */
	if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
		EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
		return -EIO;
	}
	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
		border = path[depth].p_ext[1].ee_block;
		ext_debug("leaf will be split."
				" next leaf starts at %d\n",
				  le32_to_cpu(border));
	} else {
		border = newext->ee_block;
		ext_debug("leaf will be added."
				" next leaf starts at %d\n",
				le32_to_cpu(border));
	}

	/*
	 * If error occurs, then we break processing
	 * and mark filesystem read-only. index won't
	 * be inserted and tree will be in consistent
	 * state. Next mount will repair buffers too.
	 */

	/*
	 * Get array to track all allocated blocks.
	 * We need this to handle errors and free blocks
	 * upon them.
	 */
	ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
	if (!ablocks)
		return -ENOMEM;

	/* allocate all needed blocks */
	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
	for (a = 0; a < depth - at; a++) {
		newblock = ext4_ext_new_meta_block(handle, inode, path,
						   newext, &err);
		if (newblock == 0)
			goto cleanup;
		ablocks[a] = newblock;
	}

	/* initialize new leaf */
	newblock = ablocks[--a];
	if (unlikely(newblock == 0)) {
		EXT4_ERROR_INODE(inode, "newblock == 0!");
		err = -EIO;
		goto cleanup;
	}
	bh = sb_getblk(inode->i_sb, newblock);
	if (!bh) {
		err = -EIO;
		goto cleanup;
	}
	lock_buffer(bh);

	err = ext4_journal_get_create_access(handle, bh);
	if (err)
		goto cleanup;

	neh = ext_block_hdr(bh);
	neh->eh_entries = 0;
	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
	neh->eh_magic = EXT4_EXT_MAGIC;
	neh->eh_depth = 0;
	ex = EXT_FIRST_EXTENT(neh);

	/* move remainder of path[depth] to the new leaf */
	if (unlikely(path[depth].p_hdr->eh_entries !=
		     path[depth].p_hdr->eh_max)) {
		EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
				 path[depth].p_hdr->eh_entries,
				 path[depth].p_hdr->eh_max);
		err = -EIO;
		goto cleanup;
	}
	/* start copy from next extent */
	/* TODO: we could do it by single memmove */
	m = 0;
	path[depth].p_ext++;
	while (path[depth].p_ext <=
			EXT_MAX_EXTENT(path[depth].p_hdr)) {
		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
				le32_to_cpu(path[depth].p_ext->ee_block),
				ext4_ext_pblock(path[depth].p_ext),
				ext4_ext_is_uninitialized(path[depth].p_ext),
				ext4_ext_get_actual_len(path[depth].p_ext),
				newblock);
		/*memmove(ex++, path[depth].p_ext++,
				sizeof(struct ext4_extent));
		neh->eh_entries++;*/
		path[depth].p_ext++;
		m++;
	}
	if (m) {
		memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m);
		le16_add_cpu(&neh->eh_entries, m);
	}

	set_buffer_uptodate(bh);
	unlock_buffer(bh);

	err = ext4_handle_dirty_metadata(handle, inode, bh);
	if (err)
		goto cleanup;
	brelse(bh);
	bh = NULL;

	/* correct old leaf */
	if (m) {
		err = ext4_ext_get_access(handle, inode, path + depth);
		if (err)
			goto cleanup;
		le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
		err = ext4_ext_dirty(handle, inode, path + depth);
		if (err)
			goto cleanup;

	}

	/* create intermediate indexes */
	k = depth - at - 1;
	if (unlikely(k < 0)) {
		EXT4_ERROR_INODE(inode, "k %d < 0!", k);
		err = -EIO;
		goto cleanup;
	}
	if (k)
		ext_debug("create %d intermediate indices\n", k);
	/* insert new index into current index block */
	/* current depth stored in i var */
	i = depth - 1;
	while (k--) {
		oldblock = newblock;
		newblock = ablocks[--a];
		bh = sb_getblk(inode->i_sb, newblock);
		if (!bh) {
			err = -EIO;
			goto cleanup;
		}
		lock_buffer(bh);

		err = ext4_journal_get_create_access(handle, bh);
		if (err)
			goto cleanup;

		neh = ext_block_hdr(bh);
		neh->eh_entries = cpu_to_le16(1);
		neh->eh_magic = EXT4_EXT_MAGIC;
		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
		neh->eh_depth = cpu_to_le16(depth - i);
		fidx = EXT_FIRST_INDEX(neh);
		fidx->ei_block = border;
		ext4_idx_store_pblock(fidx, oldblock);

		ext_debug("int.index at %d (block %llu): %u -> %llu\n",
				i, newblock, le32_to_cpu(border), oldblock);
		/* copy indexes */
		m = 0;
		path[i].p_idx++;

		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
				EXT_MAX_INDEX(path[i].p_hdr));
		if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
					EXT_LAST_INDEX(path[i].p_hdr))) {
			EXT4_ERROR_INODE(inode,
					 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
					 le32_to_cpu(path[i].p_ext->ee_block));
			err = -EIO;
			goto cleanup;
		}
		while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
			ext_debug("%d: move %d:%llu in new index %llu\n", i,
					le32_to_cpu(path[i].p_idx->ei_block),
					ext4_idx_pblock(path[i].p_idx),
					newblock);
			/*memmove(++fidx, path[i].p_idx++,
					sizeof(struct ext4_extent_idx));
			neh->eh_entries++;
			BUG_ON(neh->eh_entries > neh->eh_max);*/
			path[i].p_idx++;
			m++;
		}
		if (m) {
			memmove(++fidx, path[i].p_idx - m,
				sizeof(struct ext4_extent_idx) * m);
			le16_add_cpu(&neh->eh_entries, m);
		}
		set_buffer_uptodate(bh);
		unlock_buffer(bh);

		err = ext4_handle_dirty_metadata(handle, inode, bh);
		if (err)
			goto cleanup;
		brelse(bh);
		bh = NULL;

		/* correct old index */
		if (m) {
			err = ext4_ext_get_access(handle, inode, path + i);
			if (err)
				goto cleanup;
			le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
			err = ext4_ext_dirty(handle, inode, path + i);
			if (err)
				goto cleanup;
		}

		i--;
	}

	/* insert new index */
	err = ext4_ext_insert_index(handle, inode, path + at,
				    le32_to_cpu(border), newblock);

cleanup:
	if (bh) {
		if (buffer_locked(bh))
			unlock_buffer(bh);
		brelse(bh);
	}

	if (err) {
		/* free all allocated blocks in error case */
		for (i = 0; i < depth; i++) {
			if (!ablocks[i])
				continue;
			ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
					 EXT4_FREE_BLOCKS_METADATA);
		}
	}
	kfree(ablocks);

	return err;
}

/*
 * ext4_ext_grow_indepth:
 * implements tree growing procedure:
 * - allocates new block
 * - moves top-level data (index block or leaf) into the new block
 * - initializes new top-level, creating index that points to the
 *   just created block
 */
static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
					struct ext4_ext_path *path,
					struct ext4_extent *newext)
{
	struct ext4_ext_path *curp = path;
	struct ext4_extent_header *neh;
	struct buffer_head *bh;
	ext4_fsblk_t newblock;
	int err = 0;

	newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
	if (newblock == 0)
		return err;

	bh = sb_getblk(inode->i_sb, newblock);
	if (!bh) {
		err = -EIO;
		ext4_std_error(inode->i_sb, err);
		return err;
	}
	lock_buffer(bh);

	err = ext4_journal_get_create_access(handle, bh);
	if (err) {
		unlock_buffer(bh);
		goto out;
	}

	/* move top-level index/leaf into new block */
	memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data));

	/* set size of new block */
	neh = ext_block_hdr(bh);
	/* old root could have indexes or leaves
	 * so calculate e_max right way */
	if (ext_depth(inode))
		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
	else
		neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
	neh->eh_magic = EXT4_EXT_MAGIC;
	set_buffer_uptodate(bh);
	unlock_buffer(bh);

	err = ext4_handle_dirty_metadata(handle, inode, bh);
	if (err)
		goto out;

	/* create index in new top-level index: num,max,pointer */
	err = ext4_ext_get_access(handle, inode, curp);
	if (err)
		goto out;

	curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
	curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
	curp->p_hdr->eh_entries = cpu_to_le16(1);
	curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);

	if (path[0].p_hdr->eh_depth)
		curp->p_idx->ei_block =
			EXT_FIRST_INDEX(path[0].p_hdr)->ei_block;
	else
		curp->p_idx->ei_block =
			EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
	ext4_idx_store_pblock(curp->p_idx, newblock);

	neh = ext_inode_hdr(inode);
	ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
		  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
		  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));

	neh->eh_depth = cpu_to_le16(path->p_depth + 1);
	err = ext4_ext_dirty(handle, inode, curp);
out:
	brelse(bh);

	return err;
}

/*
 * ext4_ext_create_new_leaf:
 * finds empty index and adds new leaf.
 * if no free index is found, then it requests in-depth growing.
 */
static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
					struct ext4_ext_path *path,
					struct ext4_extent *newext)
{
	struct ext4_ext_path *curp;
	int depth, i, err = 0;

repeat:
	i = depth = ext_depth(inode);

	/* walk up to the tree and look for free index entry */
	curp = path + depth;
	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
		i--;
		curp--;
	}

	/* we use already allocated block for index block,
	 * so subsequent data blocks should be contiguous */
	if (EXT_HAS_FREE_INDEX(curp)) {
		/* if we found index with free entry, then use that
		 * entry: create all needed subtree and add new leaf */
		err = ext4_ext_split(handle, inode, path, newext, i);
		if (err)
			goto out;

		/* refill path */
		ext4_ext_drop_refs(path);
		path = ext4_ext_find_extent(inode,
				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
				    path);
		if (IS_ERR(path))
			err = PTR_ERR(path);
	} else {
		/* tree is full, time to grow in depth */
		err = ext4_ext_grow_indepth(handle, inode, path, newext);
		if (err)
			goto out;

		/* refill path */
		ext4_ext_drop_refs(path);
		path = ext4_ext_find_extent(inode,
				   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
				    path);
		if (IS_ERR(path)) {
			err = PTR_ERR(path);
			goto out;
		}

		/*
		 * only first (depth 0 -> 1) produces free space;
		 * in all other cases we have to split the grown tree
		 */
		depth = ext_depth(inode);
		if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
			/* now we need to split */
			goto repeat;
		}
	}

out:
	return err;
}

/*
 * search the closest allocated block to the left for *logical
 * and returns it at @logical + its physical address at @phys
 * if *logical is the smallest allocated block, the function
 * returns 0 at @phys
 * return value contains 0 (success) or error code
 */
static int ext4_ext_search_left(struct inode *inode,
				struct ext4_ext_path *path,
				ext4_lblk_t *logical, ext4_fsblk_t *phys)
{
	struct ext4_extent_idx *ix;
	struct ext4_extent *ex;
	int depth, ee_len;

	if (unlikely(path == NULL)) {
		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
		return -EIO;
	}
	depth = path->p_depth;
	*phys = 0;

	if (depth == 0 && path->p_ext == NULL)
		return 0;

	/* usually extent in the path covers blocks smaller
	 * than *logical, but it can be that extent is the
	 * first one in the file */

	ex = path[depth].p_ext;
	ee_len = ext4_ext_get_actual_len(ex);
	if (*logical < le32_to_cpu(ex->ee_block)) {
		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
			EXT4_ERROR_INODE(inode,
					 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
					 *logical, le32_to_cpu(ex->ee_block));
			return -EIO;
		}
		while (--depth >= 0) {
			ix = path[depth].p_idx;
			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
				EXT4_ERROR_INODE(inode,
				  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
				  ix != NULL ? ix->ei_block : 0,
				  EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
				    EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block : 0,
				  depth);
				return -EIO;
			}
		}
		return 0;
	}

	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
		EXT4_ERROR_INODE(inode,
				 "logical %d < ee_block %d + ee_len %d!",
				 *logical, le32_to_cpu(ex->ee_block), ee_len);
		return -EIO;
	}

	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
	*phys = ext4_ext_pblock(ex) + ee_len - 1;
	return 0;
}

/*
 * search the closest allocated block to the right for *logical
 * and returns it at @logical + its physical address at @phys
 * if *logical is the largest allocated block, the function
 * returns 0 at @phys
 * return value contains 0 (success) or error code
 */
static int ext4_ext_search_right(struct inode *inode,
				 struct ext4_ext_path *path,
				 ext4_lblk_t *logical, ext4_fsblk_t *phys)
{
	struct buffer_head *bh = NULL;
	struct ext4_extent_header *eh;
	struct ext4_extent_idx *ix;
	struct ext4_extent *ex;
	ext4_fsblk_t block;
	int depth;	/* Note, NOT eh_depth; depth from top of tree */
	int ee_len;

	if (unlikely(path == NULL)) {
		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
		return -EIO;
	}
	depth = path->p_depth;
	*phys = 0;

	if (depth == 0 && path->p_ext == NULL)
		return 0;

	/* usually extent in the path covers blocks smaller
	 * than *logical, but it can be that extent is the
	 * first one in the file */

	ex = path[depth].p_ext;
	ee_len = ext4_ext_get_actual_len(ex);
	if (*logical < le32_to_cpu(ex->ee_block)) {
		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
			EXT4_ERROR_INODE(inode,
					 "first_extent(path[%d].p_hdr) != ex",
					 depth);
			return -EIO;
		}
		while (--depth >= 0) {
			ix = path[depth].p_idx;
			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
				EXT4_ERROR_INODE(inode,
						 "ix != EXT_FIRST_INDEX *logical %d!",
						 *logical);
				return -EIO;
			}
		}
		*logical = le32_to_cpu(ex->ee_block);
		*phys = ext4_ext_pblock(ex);
		return 0;
	}

	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
		EXT4_ERROR_INODE(inode,
				 "logical %d < ee_block %d + ee_len %d!",
				 *logical, le32_to_cpu(ex->ee_block), ee_len);
		return -EIO;
	}

	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
		/* next allocated block in this leaf */
		ex++;
		*logical = le32_to_cpu(ex->ee_block);
		*phys = ext4_ext_pblock(ex);
		return 0;
	}

	/* go up and search for index to the right */
	while (--depth >= 0) {
		ix = path[depth].p_idx;
		if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
			goto got_index;
	}

	/* we've gone up to the root and found no index to the right */
	return 0;

got_index:
	/* we've found index to the right, let's
	 * follow it and find the closest allocated
	 * block to the right */
	ix++;
	block = ext4_idx_pblock(ix);
	while (++depth < path->p_depth) {
		bh = sb_bread(inode->i_sb, block);
		if (bh == NULL)
			return -EIO;
		eh = ext_block_hdr(bh);
		/* subtract from p_depth to get proper eh_depth */
		if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
			put_bh(bh);
			return -EIO;
		}
		ix = EXT_FIRST_INDEX(eh);
		block = ext4_idx_pblock(ix);
		put_bh(bh);
	}

	bh = sb_bread(inode->i_sb, block);
	if (bh == NULL)
		return -EIO;
	eh = ext_block_hdr(bh);
	if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
		put_bh(bh);
		return -EIO;
	}
	ex = EXT_FIRST_EXTENT(eh);
	*logical = le32_to_cpu(ex->ee_block);
	*phys = ext4_ext_pblock(ex);
	put_bh(bh);
	return 0;
}

/*
 * ext4_ext_next_allocated_block:
 * returns allocated block in subsequent extent or EXT_MAX_BLOCK.
 * NOTE: it considers block number from index entry as
 * allocated block. Thus, index entries have to be consistent
 * with leaves.
 */
static ext4_lblk_t
ext4_ext_next_allocated_block(struct ext4_ext_path *path)
{
	int depth;

	BUG_ON(path == NULL);
	depth = path->p_depth;

	if (depth == 0 && path->p_ext == NULL)
		return EXT_MAX_BLOCK;

	while (depth >= 0) {
		if (depth == path->p_depth) {
			/* leaf */
			if (path[depth].p_ext !=
					EXT_LAST_EXTENT(path[depth].p_hdr))
				return le32_to_cpu(path[depth].p_ext[1].ee_block);
		} else {
			/* index */
			if (path[depth].p_idx !=
					EXT_LAST_INDEX(path[depth].p_hdr))
				return le32_to_cpu(path[depth].p_idx[1].ei_block);
		}
		depth--;
	}

	return EXT_MAX_BLOCK;
}

/*
 * ext4_ext_next_leaf_block:
 * returns first allocated block from next leaf or EXT_MAX_BLOCK
 */
static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
					struct ext4_ext_path *path)
{
	int depth;

	BUG_ON(path == NULL);
	depth = path->p_depth;

	/* zero-tree has no leaf blocks at all */
	if (depth == 0)
		return EXT_MAX_BLOCK;

	/* go to index block */
	depth--;

	while (depth >= 0) {
		if (path[depth].p_idx !=
				EXT_LAST_INDEX(path[depth].p_hdr))
			return (ext4_lblk_t)
				le32_to_cpu(path[depth].p_idx[1].ei_block);
		depth--;
	}

	return EXT_MAX_BLOCK;
}

/*
 * ext4_ext_correct_indexes:
 * if leaf gets modified and modified extent is first in the leaf,
 * then we have to correct all indexes above.
 * TODO: do we need to correct tree in all cases?
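 *
 * Only the indexes on the path from the modified leaf up to the root are
 * touched, and the update stops as soon as the modified entry is no longer
 * the first one at a given level.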
 */
static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path)
{
	struct ext4_extent_header *eh;
	int depth = ext_depth(inode);
	struct ext4_extent *ex;
	__le32 border;
	int k, err = 0;

	eh = path[depth].p_hdr;
	ex = path[depth].p_ext;

	if (unlikely(ex == NULL || eh == NULL)) {
		EXT4_ERROR_INODE(inode,
				 "ex %p == NULL or eh %p == NULL", ex, eh);
		return -EIO;
	}

	if (depth == 0) {
		/* there is no tree at all */
		return 0;
	}

	if (ex != EXT_FIRST_EXTENT(eh)) {
		/* we correct tree if first leaf got modified only */
		return 0;
	}

	/*
	 * TODO: we need correction if border is smaller than current one
	 */
	k = depth - 1;
	border = path[depth].p_ext->ee_block;
	err = ext4_ext_get_access(handle, inode, path + k);
	if (err)
		return err;
	path[k].p_idx->ei_block = border;
	err = ext4_ext_dirty(handle, inode, path + k);
	if (err)
		return err;

	while (k--) {
		/* change all left-side indexes */
		if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
			break;
		err = ext4_ext_get_access(handle, inode, path + k);
		if (err)
			break;
		path[k].p_idx->ei_block = border;
		err = ext4_ext_dirty(handle, inode, path + k);
		if (err)
			break;
	}

	return err;
}

int
ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
				struct ext4_extent *ex2)
{
	unsigned short ext1_ee_len, ext2_ee_len, max_len;

	/*
	 * Make sure that either both extents are uninitialized, or
	 * both are _not_.
	 */
	if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
		return 0;

	if (ext4_ext_is_uninitialized(ex1))
		max_len = EXT_UNINIT_MAX_LEN;
	else
		max_len = EXT_INIT_MAX_LEN;

	ext1_ee_len = ext4_ext_get_actual_len(ex1);
	ext2_ee_len = ext4_ext_get_actual_len(ex2);

	if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
			le32_to_cpu(ex2->ee_block))
		return 0;

	/*
	 * To allow future support for preallocated extents to be added
	 * as an RO_COMPAT feature, refuse to merge two extents if
	 * this can result in the top bit of ee_len being set.
	 */
	if (ext1_ee_len + ext2_ee_len > max_len)
		return 0;
#ifdef AGGRESSIVE_TEST
	if (ext1_ee_len >= 4)
		return 0;
#endif

	if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
		return 1;
	return 0;
}

/*
 * This function tries to merge the "ex" extent to the next extent in the tree.
 * It always tries to merge towards right. If you want to merge towards
 * left, pass "ex - 1" as argument instead of "ex".
 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
 * 1 if they got merged.
 */
static int ext4_ext_try_to_merge_right(struct inode *inode,
				 struct ext4_ext_path *path,
				 struct ext4_extent *ex)
{
	struct ext4_extent_header *eh;
	unsigned int depth, len;
	int merge_done = 0;
	int uninitialized = 0;

	depth = ext_depth(inode);
	BUG_ON(path[depth].p_hdr == NULL);
	eh = path[depth].p_hdr;

	while (ex < EXT_LAST_EXTENT(eh)) {
		if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
			break;
		/* merge with next extent! */
		if (ext4_ext_is_uninitialized(ex))
			uninitialized = 1;
		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
				+ ext4_ext_get_actual_len(ex + 1));
		if (uninitialized)
			ext4_ext_mark_uninitialized(ex);

		if (ex + 1 < EXT_LAST_EXTENT(eh)) {
			len = (EXT_LAST_EXTENT(eh) - ex - 1)
				* sizeof(struct ext4_extent);
			memmove(ex + 1, ex + 2, len);
		}
		le16_add_cpu(&eh->eh_entries, -1);
		merge_done = 1;
		WARN_ON(eh->eh_entries == 0);
		if (!eh->eh_entries)
			EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
	}

	return merge_done;
}

/*
 * This function tries to merge the @ex extent to neighbours in the tree.
 * return 1 if merge left else 0.
 */
static int ext4_ext_try_to_merge(struct inode *inode,
				  struct ext4_ext_path *path,
				  struct ext4_extent *ex) {
	struct ext4_extent_header *eh;
	unsigned int depth;
	int merge_done = 0;
	int ret = 0;

	depth = ext_depth(inode);
	BUG_ON(path[depth].p_hdr == NULL);
	eh = path[depth].p_hdr;

	if (ex > EXT_FIRST_EXTENT(eh))
		merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);

	if (!merge_done)
		ret = ext4_ext_try_to_merge_right(inode, path, ex);

	return ret;
}

/*
 * check if a portion of the "newext" extent overlaps with an
 * existing extent.
 *
 * If there is an overlap discovered, it updates the length of the newext
 * such that there will be no overlap, and then returns 1.
 * If there is no overlap found, it returns 0.
 */
static unsigned int ext4_ext_check_overlap(struct inode *inode,
					   struct ext4_extent *newext,
					   struct ext4_ext_path *path)
{
	ext4_lblk_t b1, b2;
	unsigned int depth, len1;
	unsigned int ret = 0;

	b1 = le32_to_cpu(newext->ee_block);
	len1 = ext4_ext_get_actual_len(newext);
	depth = ext_depth(inode);
	if (!path[depth].p_ext)
		goto out;
	b2 = le32_to_cpu(path[depth].p_ext->ee_block);

	/*
	 * get the next allocated block if the extent in the path
	 * is before the requested block(s)
	 */
	if (b2 < b1) {
		b2 = ext4_ext_next_allocated_block(path);
		if (b2 == EXT_MAX_BLOCK)
			goto out;
	}

	/* check for wrap through zero on extent logical start block*/
	if (b1 + len1 < b1) {
		len1 = EXT_MAX_BLOCK - b1;
		newext->ee_len = cpu_to_le16(len1);
		ret = 1;
	}

	/* check for overlap */
	if (b1 + len1 > b2) {
		newext->ee_len = cpu_to_le16(b2 - b1);
		ret = 1;
	}
out:
	return ret;
}

/*
 * ext4_ext_insert_extent:
 * tries to merge requested extent into the existing extent or
 * inserts requested extent as new one into the tree,
 * creating new leaf in the no-space case.
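 *
 * When EXT4_GET_BLOCKS_PRE_IO is set in @flag, the extent is inserted
 * as-is and is not merged with its neighbours.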
 */
int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path,
				struct ext4_extent *newext, int flag)
{
	struct ext4_extent_header *eh;
	struct ext4_extent *ex, *fex;
	struct ext4_extent *nearex; /* nearest extent */
	struct ext4_ext_path *npath = NULL;
	int depth, len, err;
	ext4_lblk_t next;
	unsigned uninitialized = 0;

	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
		return -EIO;
	}
	depth = ext_depth(inode);
	ex = path[depth].p_ext;
	if (unlikely(path[depth].p_hdr == NULL)) {
		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
		return -EIO;
	}

	/* try to insert block into found extent and return */
	if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
		&& ext4_can_extents_be_merged(inode, ex, newext)) {
		ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
			  ext4_ext_is_uninitialized(newext),
			  ext4_ext_get_actual_len(newext),
			  le32_to_cpu(ex->ee_block),
			  ext4_ext_is_uninitialized(ex),
			  ext4_ext_get_actual_len(ex),
			  ext4_ext_pblock(ex));
		err = ext4_ext_get_access(handle, inode, path + depth);
		if (err)
			return err;

		/*
		 * ext4_can_extents_be_merged should have checked that either
		 * both extents are uninitialized, or both aren't. Thus we
		 * need to check only one of them here.
		 */
		if (ext4_ext_is_uninitialized(ex))
			uninitialized = 1;
		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
					+ ext4_ext_get_actual_len(newext));
		if (uninitialized)
			ext4_ext_mark_uninitialized(ex);
		eh = path[depth].p_hdr;
		nearex = ex;
		goto merge;
	}

repeat:
	depth = ext_depth(inode);
	eh = path[depth].p_hdr;
	if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
		goto has_space;

	/* probably next leaf has space for us? */
	fex = EXT_LAST_EXTENT(eh);
	next = ext4_ext_next_leaf_block(inode, path);
	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)
	    && next != EXT_MAX_BLOCK) {
		ext_debug("next leaf block - %d\n", next);
		BUG_ON(npath != NULL);
		npath = ext4_ext_find_extent(inode, next, NULL);
		if (IS_ERR(npath))
			return PTR_ERR(npath);
		BUG_ON(npath->p_depth != path->p_depth);
		eh = npath[depth].p_hdr;
		if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
			ext_debug("next leaf isn't full(%d)\n",
				  le16_to_cpu(eh->eh_entries));
			path = npath;
			goto repeat;
		}
		ext_debug("next leaf has no free space(%d,%d)\n",
			  le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
	}

	/*
	 * There is no free space in the found leaf.
	 * We're gonna add a new leaf in the tree.
	 */
	err = ext4_ext_create_new_leaf(handle, inode, path, newext);
	if (err)
		goto cleanup;
	depth = ext_depth(inode);
	eh = path[depth].p_hdr;

has_space:
	nearex = path[depth].p_ext;

	err = ext4_ext_get_access(handle, inode, path + depth);
	if (err)
		goto cleanup;

	if (!nearex) {
		/* there is no extent in this leaf, create first one */
		ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
				le32_to_cpu(newext->ee_block),
				ext4_ext_pblock(newext),
				ext4_ext_is_uninitialized(newext),
				ext4_ext_get_actual_len(newext));
		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
	} else if (le32_to_cpu(newext->ee_block)
			   > le32_to_cpu(nearex->ee_block)) {
		/* BUG_ON(newext->ee_block == nearex->ee_block); */
		if (nearex != EXT_LAST_EXTENT(eh)) {
			len = EXT_MAX_EXTENT(eh) - nearex;
			len = (len - 1) * sizeof(struct ext4_extent);
			len = len < 0 ? 0 : len;
			ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
					"move %d from 0x%p to 0x%p\n",
					le32_to_cpu(newext->ee_block),
					ext4_ext_pblock(newext),
					ext4_ext_is_uninitialized(newext),
					ext4_ext_get_actual_len(newext),
					nearex, len, nearex + 1, nearex + 2);
			memmove(nearex + 2, nearex + 1, len);
		}
		path[depth].p_ext = nearex + 1;
	} else {
		BUG_ON(newext->ee_block == nearex->ee_block);
		len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
		len = len < 0 ? 0 : len;
		ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
				"move %d from 0x%p to 0x%p\n",
				le32_to_cpu(newext->ee_block),
				ext4_ext_pblock(newext),
				ext4_ext_is_uninitialized(newext),
				ext4_ext_get_actual_len(newext),
				nearex, len, nearex + 1, nearex + 2);
		memmove(nearex + 1, nearex, len);
		path[depth].p_ext = nearex;
	}

	le16_add_cpu(&eh->eh_entries, 1);
	nearex = path[depth].p_ext;
	nearex->ee_block = newext->ee_block;
	ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
	nearex->ee_len = newext->ee_len;

merge:
	/* try to merge extents to the right */
	if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
		ext4_ext_try_to_merge(inode, path, nearex);

	/* try to merge extents to the left */

	/* time to correct all indexes above */
	err = ext4_ext_correct_indexes(handle, inode, path);
	if (err)
		goto cleanup;

	err = ext4_ext_dirty(handle, inode, path + depth);

cleanup:
	if (npath) {
		ext4_ext_drop_refs(npath);
		kfree(npath);
	}
	ext4_ext_invalidate_cache(inode);
	return err;
}

static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
			       ext4_lblk_t num, ext_prepare_callback func,
			       void *cbdata)
{
	struct ext4_ext_path *path = NULL;
	struct ext4_ext_cache cbex;
	struct ext4_extent *ex;
	ext4_lblk_t next, start = 0, end = 0;
	ext4_lblk_t last = block + num;
	int depth, exists, err = 0;

	BUG_ON(func == NULL);
	BUG_ON(inode == NULL);

	while (block < last && block != EXT_MAX_BLOCK) {
		num = last - block;
		/* find extent for this block */
		down_read(&EXT4_I(inode)->i_data_sem);
		path = ext4_ext_find_extent(inode, block, path);
		up_read(&EXT4_I(inode)->i_data_sem);
		if (IS_ERR(path)) {
			err = PTR_ERR(path);
			path = NULL;
			break;
		}

		depth = ext_depth(inode);
		if (unlikely(path[depth].p_hdr == NULL)) {
			EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
			err = -EIO;
			break;
		}
		ex = path[depth].p_ext;
		next = ext4_ext_next_allocated_block(path);

		exists = 0;
		if (!ex) {
			/* there is no extent yet, so try to allocate
			 * all requested space */
			start = block;
			end = block + num;
		} else if (le32_to_cpu(ex->ee_block) > block) {
			/* need to allocate space before found extent */
			start = block;
			end = le32_to_cpu(ex->ee_block);
			if (block + num < end)
				end = block + num;
		} else if (block >= le32_to_cpu(ex->ee_block)
					+ ext4_ext_get_actual_len(ex)) {
			/* need to allocate space after found extent */
			start = block;
			end = block + num;
			if (end >= next)
				end = next;
		} else if (block >= le32_to_cpu(ex->ee_block)) {
			/*
			 * some part of requested space is covered
			 * by found extent
			 */
			start = block;
			end = le32_to_cpu(ex->ee_block)
				+ ext4_ext_get_actual_len(ex);
			if (block + num < end)
				end = block + num;
			exists = 1;
		} else {
			BUG();
		}
		BUG_ON(end <= start);

		if (!exists) {
			cbex.ec_block = start;
			cbex.ec_len = end - start;
			cbex.ec_start = 0;
		} else {
			cbex.ec_block = le32_to_cpu(ex->ee_block);
			cbex.ec_len = ext4_ext_get_actual_len(ex);
			cbex.ec_start = ext4_ext_pblock(ex);
		}

		if (unlikely(cbex.ec_len == 0)) {
			EXT4_ERROR_INODE(inode, "cbex.ec_len == 0");
			err = -EIO;
			break;
		}
		err = func(inode, path, &cbex, ex, cbdata);
		ext4_ext_drop_refs(path);

		if (err < 0)
			break;

		if (err == EXT_REPEAT)
			continue;
		else if (err == EXT_BREAK) {
			err = 0;
			break;
		}

		if (ext_depth(inode) != depth) {
			/* depth was changed. we have to realloc path */
			kfree(path);
			path = NULL;
		}

		block = cbex.ec_block + cbex.ec_len;
	}

	if (path) {
		ext4_ext_drop_refs(path);
		kfree(path);
	}

	return err;
}

static void
ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
			__u32 len, ext4_fsblk_t start)
{
	struct ext4_ext_cache *cex;
	BUG_ON(len == 0);
	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
	cex = &EXT4_I(inode)->i_cached_extent;
	cex->ec_block = block;
	cex->ec_len = len;
	cex->ec_start = start;
	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
}

/*
 * ext4_ext_put_gap_in_cache:
 * calculate boundaries of the gap that the requested block fits into
 * and cache this gap
 */
static void
ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
				ext4_lblk_t block)
{
	int depth = ext_depth(inode);
	unsigned long len;
	ext4_lblk_t lblock;
	struct ext4_extent *ex;

	ex = path[depth].p_ext;
	if (ex == NULL) {
		/* there is no extent yet, so gap is [0;-] */
		lblock = 0;
		len = EXT_MAX_BLOCK;
		ext_debug("cache gap(whole file):");
	} else if (block < le32_to_cpu(ex->ee_block)) {
		lblock = block;
		len = le32_to_cpu(ex->ee_block) - block;
		ext_debug("cache gap(before): %u [%u:%u]",
				block,
				le32_to_cpu(ex->ee_block),
				ext4_ext_get_actual_len(ex));
	} else if (block >= le32_to_cpu(ex->ee_block)
			+ ext4_ext_get_actual_len(ex)) {
		ext4_lblk_t next;
		lblock = le32_to_cpu(ex->ee_block)
			+ ext4_ext_get_actual_len(ex);

		next = ext4_ext_next_allocated_block(path);
		ext_debug("cache gap(after): [%u:%u] %u",
				le32_to_cpu(ex->ee_block),
				ext4_ext_get_actual_len(ex),
				block);
		BUG_ON(next == lblock);
		len = next - lblock;
	} else {
		lblock = len = 0;
		BUG();
	}

	ext_debug(" -> %u:%lu\n", lblock, len);
	ext4_ext_put_in_cache(inode, lblock, len, 0);
}

/*
 * Return 0 if cache is invalid; 1 if the cache is valid
 */
static int
ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
			struct ext4_extent *ex)
{
	struct ext4_ext_cache *cex;
	int ret = 0;

	/*
	 * We borrow i_block_reservation_lock to protect i_cached_extent
	 */
	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
	cex = &EXT4_I(inode)->i_cached_extent;

	/* has cache valid data? */
	if (cex->ec_len == 0)
		goto errout;

	if (in_range(block, cex->ec_block, cex->ec_len)) {
		ex->ee_block = cpu_to_le32(cex->ec_block);
		ext4_ext_store_pblock(ex, cex->ec_start);
		ex->ee_len = cpu_to_le16(cex->ec_len);
		ext_debug("%u cached by %u:%u:%llu\n",
				block,
				cex->ec_block, cex->ec_len, cex->ec_start);
		ret = 1;
	}
errout:
	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
	return ret;
}

/*
 * ext4_ext_rm_idx:
 * removes index from the index block.
 * It's used in truncate case only, thus all requests are for
 * last index in the block only.
2069 */ 2070 static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, 2071 struct ext4_ext_path *path) 2072 { 2073 int err; 2074 ext4_fsblk_t leaf; 2075 2076 /* free index block */ 2077 path--; 2078 leaf = ext4_idx_pblock(path->p_idx); 2079 if (unlikely(path->p_hdr->eh_entries == 0)) { 2080 EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); 2081 return -EIO; 2082 } 2083 err = ext4_ext_get_access(handle, inode, path); 2084 if (err) 2085 return err; 2086 le16_add_cpu(&path->p_hdr->eh_entries, -1); 2087 err = ext4_ext_dirty(handle, inode, path); 2088 if (err) 2089 return err; 2090 ext_debug("index is empty, remove it, free block %llu\n", leaf); 2091 ext4_free_blocks(handle, inode, NULL, leaf, 1, 2092 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); 2093 return err; 2094 } 2095 2096 /* 2097 * ext4_ext_calc_credits_for_single_extent: 2098 * This routine returns max. credits that needed to insert an extent 2099 * to the extent tree. 2100 * When pass the actual path, the caller should calculate credits 2101 * under i_data_sem. 2102 */ 2103 int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, 2104 struct ext4_ext_path *path) 2105 { 2106 if (path) { 2107 int depth = ext_depth(inode); 2108 int ret = 0; 2109 2110 /* probably there is space in leaf? */ 2111 if (le16_to_cpu(path[depth].p_hdr->eh_entries) 2112 < le16_to_cpu(path[depth].p_hdr->eh_max)) { 2113 2114 /* 2115 * There are some space in the leaf tree, no 2116 * need to account for leaf block credit 2117 * 2118 * bitmaps and block group descriptor blocks 2119 * and other metadat blocks still need to be 2120 * accounted. 2121 */ 2122 /* 1 bitmap, 1 block group descriptor */ 2123 ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); 2124 return ret; 2125 } 2126 } 2127 2128 return ext4_chunk_trans_blocks(inode, nrblocks); 2129 } 2130 2131 /* 2132 * How many index/leaf blocks need to change/allocate to modify nrblocks? 2133 * 2134 * if nrblocks are fit in a single extent (chunk flag is 1), then 2135 * in the worse case, each tree level index/leaf need to be changed 2136 * if the tree split due to insert a new extent, then the old tree 2137 * index/leaf need to be updated too 2138 * 2139 * If the nrblocks are discontiguous, they could cause 2140 * the whole tree split more than once, but this is really rare. 
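 * For example, at tree depth 2 this works out to 2 * 2 = 4 index/leaf
 * blocks for a single contiguous chunk and 2 * 3 = 6 otherwise.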
2141 */ 2142 int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 2143 { 2144 int index; 2145 int depth = ext_depth(inode); 2146 2147 if (chunk) 2148 index = depth * 2; 2149 else 2150 index = depth * 3; 2151 2152 return index; 2153 } 2154 2155 static int ext4_remove_blocks(handle_t *handle, struct inode *inode, 2156 struct ext4_extent *ex, 2157 ext4_lblk_t from, ext4_lblk_t to) 2158 { 2159 unsigned short ee_len = ext4_ext_get_actual_len(ex); 2160 int flags = EXT4_FREE_BLOCKS_FORGET; 2161 2162 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 2163 flags |= EXT4_FREE_BLOCKS_METADATA; 2164 #ifdef EXTENTS_STATS 2165 { 2166 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2167 spin_lock(&sbi->s_ext_stats_lock); 2168 sbi->s_ext_blocks += ee_len; 2169 sbi->s_ext_extents++; 2170 if (ee_len < sbi->s_ext_min) 2171 sbi->s_ext_min = ee_len; 2172 if (ee_len > sbi->s_ext_max) 2173 sbi->s_ext_max = ee_len; 2174 if (ext_depth(inode) > sbi->s_depth_max) 2175 sbi->s_depth_max = ext_depth(inode); 2176 spin_unlock(&sbi->s_ext_stats_lock); 2177 } 2178 #endif 2179 if (from >= le32_to_cpu(ex->ee_block) 2180 && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { 2181 /* tail removal */ 2182 ext4_lblk_t num; 2183 ext4_fsblk_t start; 2184 2185 num = le32_to_cpu(ex->ee_block) + ee_len - from; 2186 start = ext4_ext_pblock(ex) + ee_len - num; 2187 ext_debug("free last %u blocks starting %llu\n", num, start); 2188 ext4_free_blocks(handle, inode, NULL, start, num, flags); 2189 } else if (from == le32_to_cpu(ex->ee_block) 2190 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { 2191 printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n", 2192 from, to, le32_to_cpu(ex->ee_block), ee_len); 2193 } else { 2194 printk(KERN_INFO "strange request: removal(2) " 2195 "%u-%u from %u:%u\n", 2196 from, to, le32_to_cpu(ex->ee_block), ee_len); 2197 } 2198 return 0; 2199 } 2200 2201 static int 2202 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, 2203 struct ext4_ext_path *path, ext4_lblk_t start) 2204 { 2205 int err = 0, correct_index = 0; 2206 int depth = ext_depth(inode), credits; 2207 struct ext4_extent_header *eh; 2208 ext4_lblk_t a, b, block; 2209 unsigned num; 2210 ext4_lblk_t ex_ee_block; 2211 unsigned short ex_ee_len; 2212 unsigned uninitialized = 0; 2213 struct ext4_extent *ex; 2214 2215 /* the header must be checked already in ext4_ext_remove_space() */ 2216 ext_debug("truncate since %u in leaf\n", start); 2217 if (!path[depth].p_hdr) 2218 path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); 2219 eh = path[depth].p_hdr; 2220 if (unlikely(path[depth].p_hdr == NULL)) { 2221 EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); 2222 return -EIO; 2223 } 2224 /* find where to start removing */ 2225 ex = EXT_LAST_EXTENT(eh); 2226 2227 ex_ee_block = le32_to_cpu(ex->ee_block); 2228 ex_ee_len = ext4_ext_get_actual_len(ex); 2229 2230 while (ex >= EXT_FIRST_EXTENT(eh) && 2231 ex_ee_block + ex_ee_len > start) { 2232 2233 if (ext4_ext_is_uninitialized(ex)) 2234 uninitialized = 1; 2235 else 2236 uninitialized = 0; 2237 2238 ext_debug("remove ext %u:[%d]%d\n", ex_ee_block, 2239 uninitialized, ex_ee_len); 2240 path[depth].p_ext = ex; 2241 2242 a = ex_ee_block > start ? ex_ee_block : start; 2243 b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ? 
2244 ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK; 2245 2246 ext_debug(" border %u:%u\n", a, b); 2247 2248 if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) { 2249 block = 0; 2250 num = 0; 2251 BUG(); 2252 } else if (a != ex_ee_block) { 2253 /* remove tail of the extent */ 2254 block = ex_ee_block; 2255 num = a - block; 2256 } else if (b != ex_ee_block + ex_ee_len - 1) { 2257 /* remove head of the extent */ 2258 block = a; 2259 num = b - a; 2260 /* there is no "make a hole" API yet */ 2261 BUG(); 2262 } else { 2263 /* remove whole extent: excellent! */ 2264 block = ex_ee_block; 2265 num = 0; 2266 BUG_ON(a != ex_ee_block); 2267 BUG_ON(b != ex_ee_block + ex_ee_len - 1); 2268 } 2269 2270 /* 2271 * 3 for leaf, sb, and inode plus 2 (bmap and group 2272 * descriptor) for each block group; assume two block 2273 * groups plus ex_ee_len/blocks_per_block_group for 2274 * the worst case 2275 */ 2276 credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb)); 2277 if (ex == EXT_FIRST_EXTENT(eh)) { 2278 correct_index = 1; 2279 credits += (ext_depth(inode)) + 1; 2280 } 2281 credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); 2282 2283 err = ext4_ext_truncate_extend_restart(handle, inode, credits); 2284 if (err) 2285 goto out; 2286 2287 err = ext4_ext_get_access(handle, inode, path + depth); 2288 if (err) 2289 goto out; 2290 2291 err = ext4_remove_blocks(handle, inode, ex, a, b); 2292 if (err) 2293 goto out; 2294 2295 if (num == 0) { 2296 /* this extent is removed; mark slot entirely unused */ 2297 ext4_ext_store_pblock(ex, 0); 2298 le16_add_cpu(&eh->eh_entries, -1); 2299 } 2300 2301 ex->ee_block = cpu_to_le32(block); 2302 ex->ee_len = cpu_to_le16(num); 2303 /* 2304 * Do not mark uninitialized if all the blocks in the 2305 * extent have been removed. 2306 */ 2307 if (uninitialized && num) 2308 ext4_ext_mark_uninitialized(ex); 2309 2310 err = ext4_ext_dirty(handle, inode, path + depth); 2311 if (err) 2312 goto out; 2313 2314 ext_debug("new extent: %u:%u:%llu\n", block, num, 2315 ext4_ext_pblock(ex)); 2316 ex--; 2317 ex_ee_block = le32_to_cpu(ex->ee_block); 2318 ex_ee_len = ext4_ext_get_actual_len(ex); 2319 } 2320 2321 if (correct_index && eh->eh_entries) 2322 err = ext4_ext_correct_indexes(handle, inode, path); 2323 2324 /* if this leaf is free, then we should 2325 * remove it from index block above */ 2326 if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) 2327 err = ext4_ext_rm_idx(handle, inode, path + depth); 2328 2329 out: 2330 return err; 2331 } 2332 2333 /* 2334 * ext4_ext_more_to_rm: 2335 * returns 1 if current index has to be freed (even partial) 2336 */ 2337 static int 2338 ext4_ext_more_to_rm(struct ext4_ext_path *path) 2339 { 2340 BUG_ON(path->p_idx == NULL); 2341 2342 if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) 2343 return 0; 2344 2345 /* 2346 * if truncate on deeper level happened, it wasn't partial, 2347 * so we have to consider current index for truncation 2348 */ 2349 if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block) 2350 return 0; 2351 return 1; 2352 } 2353 2354 static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) 2355 { 2356 struct super_block *sb = inode->i_sb; 2357 int depth = ext_depth(inode); 2358 struct ext4_ext_path *path; 2359 handle_t *handle; 2360 int i, err; 2361 2362 ext_debug("truncate since %u\n", start); 2363 2364 /* probably first extent we're gonna free will be last in block */ 2365 handle = ext4_journal_start(inode, depth + 1); 2366 if (IS_ERR(handle)) 2367 return PTR_ERR(handle); 2368 2369 again: 2370 
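	/* restart point for -EAGAIN: drop the single-extent cache before
	 * (re)scanning the tree so it cannot point at blocks being freed */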
ext4_ext_invalidate_cache(inode); 2371 2372 /* 2373 * We start scanning from right side, freeing all the blocks 2374 * after i_size and walking into the tree depth-wise. 2375 */ 2376 depth = ext_depth(inode); 2377 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS); 2378 if (path == NULL) { 2379 ext4_journal_stop(handle); 2380 return -ENOMEM; 2381 } 2382 path[0].p_depth = depth; 2383 path[0].p_hdr = ext_inode_hdr(inode); 2384 if (ext4_ext_check(inode, path[0].p_hdr, depth)) { 2385 err = -EIO; 2386 goto out; 2387 } 2388 i = err = 0; 2389 2390 while (i >= 0 && err == 0) { 2391 if (i == depth) { 2392 /* this is leaf block */ 2393 err = ext4_ext_rm_leaf(handle, inode, path, start); 2394 /* root level has p_bh == NULL, brelse() eats this */ 2395 brelse(path[i].p_bh); 2396 path[i].p_bh = NULL; 2397 i--; 2398 continue; 2399 } 2400 2401 /* this is index block */ 2402 if (!path[i].p_hdr) { 2403 ext_debug("initialize header\n"); 2404 path[i].p_hdr = ext_block_hdr(path[i].p_bh); 2405 } 2406 2407 if (!path[i].p_idx) { 2408 /* this level hasn't been touched yet */ 2409 path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr); 2410 path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1; 2411 ext_debug("init index ptr: hdr 0x%p, num %d\n", 2412 path[i].p_hdr, 2413 le16_to_cpu(path[i].p_hdr->eh_entries)); 2414 } else { 2415 /* we were already here, see at next index */ 2416 path[i].p_idx--; 2417 } 2418 2419 ext_debug("level %d - index, first 0x%p, cur 0x%p\n", 2420 i, EXT_FIRST_INDEX(path[i].p_hdr), 2421 path[i].p_idx); 2422 if (ext4_ext_more_to_rm(path + i)) { 2423 struct buffer_head *bh; 2424 /* go to the next level */ 2425 ext_debug("move to level %d (block %llu)\n", 2426 i + 1, ext4_idx_pblock(path[i].p_idx)); 2427 memset(path + i + 1, 0, sizeof(*path)); 2428 bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx)); 2429 if (!bh) { 2430 /* should we reset i_size? 
*/ 2431 err = -EIO; 2432 break; 2433 } 2434 if (WARN_ON(i + 1 > depth)) { 2435 err = -EIO; 2436 break; 2437 } 2438 if (ext4_ext_check(inode, ext_block_hdr(bh), 2439 depth - i - 1)) { 2440 err = -EIO; 2441 break; 2442 } 2443 path[i + 1].p_bh = bh; 2444 2445 /* save actual number of indexes since this 2446 * number is changed at the next iteration */ 2447 path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries); 2448 i++; 2449 } else { 2450 /* we finished processing this index, go up */ 2451 if (path[i].p_hdr->eh_entries == 0 && i > 0) { 2452 /* index is empty, remove it; 2453 * handle must be already prepared by the 2454 * truncatei_leaf() */ 2455 err = ext4_ext_rm_idx(handle, inode, path + i); 2456 } 2457 /* root level has p_bh == NULL, brelse() eats this */ 2458 brelse(path[i].p_bh); 2459 path[i].p_bh = NULL; 2460 i--; 2461 ext_debug("return to level %d\n", i); 2462 } 2463 } 2464 2465 /* TODO: flexible tree reduction should be here */ 2466 if (path->p_hdr->eh_entries == 0) { 2467 /* 2468 * truncate to zero freed all the tree, 2469 * so we need to correct eh_depth 2470 */ 2471 err = ext4_ext_get_access(handle, inode, path); 2472 if (err == 0) { 2473 ext_inode_hdr(inode)->eh_depth = 0; 2474 ext_inode_hdr(inode)->eh_max = 2475 cpu_to_le16(ext4_ext_space_root(inode, 0)); 2476 err = ext4_ext_dirty(handle, inode, path); 2477 } 2478 } 2479 out: 2480 ext4_ext_drop_refs(path); 2481 kfree(path); 2482 if (err == -EAGAIN) 2483 goto again; 2484 ext4_journal_stop(handle); 2485 2486 return err; 2487 } 2488 2489 /* 2490 * called at mount time 2491 */ 2492 void ext4_ext_init(struct super_block *sb) 2493 { 2494 /* 2495 * possible initialization would be here 2496 */ 2497 2498 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { 2499 #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) 2500 printk(KERN_INFO "EXT4-fs: file extents enabled"); 2501 #ifdef AGGRESSIVE_TEST 2502 printk(", aggressive tests"); 2503 #endif 2504 #ifdef CHECK_BINSEARCH 2505 printk(", check binsearch"); 2506 #endif 2507 #ifdef EXTENTS_STATS 2508 printk(", stats"); 2509 #endif 2510 printk("\n"); 2511 #endif 2512 #ifdef EXTENTS_STATS 2513 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); 2514 EXT4_SB(sb)->s_ext_min = 1 << 30; 2515 EXT4_SB(sb)->s_ext_max = 0; 2516 #endif 2517 } 2518 } 2519 2520 /* 2521 * called at umount time 2522 */ 2523 void ext4_ext_release(struct super_block *sb) 2524 { 2525 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) 2526 return; 2527 2528 #ifdef EXTENTS_STATS 2529 if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) { 2530 struct ext4_sb_info *sbi = EXT4_SB(sb); 2531 printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n", 2532 sbi->s_ext_blocks, sbi->s_ext_extents, 2533 sbi->s_ext_blocks / sbi->s_ext_extents); 2534 printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n", 2535 sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max); 2536 } 2537 #endif 2538 } 2539 2540 /* FIXME!! we need to try to merge to left or right after zero-out */ 2541 static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) 2542 { 2543 ext4_fsblk_t ee_pblock; 2544 unsigned int ee_len; 2545 int ret; 2546 2547 ee_len = ext4_ext_get_actual_len(ex); 2548 ee_pblock = ext4_ext_pblock(ex); 2549 2550 ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS); 2551 if (ret > 0) 2552 ret = 0; 2553 2554 return ret; 2555 } 2556 2557 /* 2558 * used by extent splitting. 
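 * The split_flag bits below tell ext4_split_extent_at() whether a failed
 * split may fall back to zeroing the extent and which halves end up
 * marked uninitialized.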
2559 */ 2560 #define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ 2561 due to ENOSPC */ 2562 #define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */ 2563 #define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */ 2564 2565 /* 2566 * ext4_split_extent_at() splits an extent at given block. 2567 * 2568 * @handle: the journal handle 2569 * @inode: the file inode 2570 * @path: the path to the extent 2571 * @split: the logical block where the extent is splitted. 2572 * @split_flags: indicates if the extent could be zeroout if split fails, and 2573 * the states(init or uninit) of new extents. 2574 * @flags: flags used to insert new extent to extent tree. 2575 * 2576 * 2577 * Splits extent [a, b] into two extents [a, @split) and [@split, b], states 2578 * of which are deterimined by split_flag. 2579 * 2580 * There are two cases: 2581 * a> the extent are splitted into two extent. 2582 * b> split is not needed, and just mark the extent. 2583 * 2584 * return 0 on success. 2585 */ 2586 static int ext4_split_extent_at(handle_t *handle, 2587 struct inode *inode, 2588 struct ext4_ext_path *path, 2589 ext4_lblk_t split, 2590 int split_flag, 2591 int flags) 2592 { 2593 ext4_fsblk_t newblock; 2594 ext4_lblk_t ee_block; 2595 struct ext4_extent *ex, newex, orig_ex; 2596 struct ext4_extent *ex2 = NULL; 2597 unsigned int ee_len, depth; 2598 int err = 0; 2599 2600 ext_debug("ext4_split_extents_at: inode %lu, logical" 2601 "block %llu\n", inode->i_ino, (unsigned long long)split); 2602 2603 ext4_ext_show_leaf(inode, path); 2604 2605 depth = ext_depth(inode); 2606 ex = path[depth].p_ext; 2607 ee_block = le32_to_cpu(ex->ee_block); 2608 ee_len = ext4_ext_get_actual_len(ex); 2609 newblock = split - ee_block + ext4_ext_pblock(ex); 2610 2611 BUG_ON(split < ee_block || split >= (ee_block + ee_len)); 2612 2613 err = ext4_ext_get_access(handle, inode, path + depth); 2614 if (err) 2615 goto out; 2616 2617 if (split == ee_block) { 2618 /* 2619 * case b: block @split is the block that the extent begins with 2620 * then we just change the state of the extent, and splitting 2621 * is not needed. 
2622 */ 2623 if (split_flag & EXT4_EXT_MARK_UNINIT2) 2624 ext4_ext_mark_uninitialized(ex); 2625 else 2626 ext4_ext_mark_initialized(ex); 2627 2628 if (!(flags & EXT4_GET_BLOCKS_PRE_IO)) 2629 ext4_ext_try_to_merge(inode, path, ex); 2630 2631 err = ext4_ext_dirty(handle, inode, path + depth); 2632 goto out; 2633 } 2634 2635 /* case a */ 2636 memcpy(&orig_ex, ex, sizeof(orig_ex)); 2637 ex->ee_len = cpu_to_le16(split - ee_block); 2638 if (split_flag & EXT4_EXT_MARK_UNINIT1) 2639 ext4_ext_mark_uninitialized(ex); 2640 2641 /* 2642 * path may lead to new leaf, not to original leaf any more 2643 * after ext4_ext_insert_extent() returns, 2644 */ 2645 err = ext4_ext_dirty(handle, inode, path + depth); 2646 if (err) 2647 goto fix_extent_len; 2648 2649 ex2 = &newex; 2650 ex2->ee_block = cpu_to_le32(split); 2651 ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block)); 2652 ext4_ext_store_pblock(ex2, newblock); 2653 if (split_flag & EXT4_EXT_MARK_UNINIT2) 2654 ext4_ext_mark_uninitialized(ex2); 2655 2656 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 2657 if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { 2658 err = ext4_ext_zeroout(inode, &orig_ex); 2659 if (err) 2660 goto fix_extent_len; 2661 /* update the extent length and mark as initialized */ 2662 ex->ee_len = cpu_to_le32(ee_len); 2663 ext4_ext_try_to_merge(inode, path, ex); 2664 err = ext4_ext_dirty(handle, inode, path + depth); 2665 goto out; 2666 } else if (err) 2667 goto fix_extent_len; 2668 2669 out: 2670 ext4_ext_show_leaf(inode, path); 2671 return err; 2672 2673 fix_extent_len: 2674 ex->ee_len = orig_ex.ee_len; 2675 ext4_ext_dirty(handle, inode, path + depth); 2676 return err; 2677 } 2678 2679 /* 2680 * ext4_split_extents() splits an extent and mark extent which is covered 2681 * by @map as split_flags indicates 2682 * 2683 * It may result in splitting the extent into multiple extents (upto three) 2684 * There are three possibilities: 2685 * a> There is no split required 2686 * b> Splits in two extents: Split is happening at either end of the extent 2687 * c> Splits in three extents: Somone is splitting in middle of the extent 2688 * 2689 */ 2690 static int ext4_split_extent(handle_t *handle, 2691 struct inode *inode, 2692 struct ext4_ext_path *path, 2693 struct ext4_map_blocks *map, 2694 int split_flag, 2695 int flags) 2696 { 2697 ext4_lblk_t ee_block; 2698 struct ext4_extent *ex; 2699 unsigned int ee_len, depth; 2700 int err = 0; 2701 int uninitialized; 2702 int split_flag1, flags1; 2703 2704 depth = ext_depth(inode); 2705 ex = path[depth].p_ext; 2706 ee_block = le32_to_cpu(ex->ee_block); 2707 ee_len = ext4_ext_get_actual_len(ex); 2708 uninitialized = ext4_ext_is_uninitialized(ex); 2709 2710 if (map->m_lblk + map->m_len < ee_block + ee_len) { 2711 split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ? 2712 EXT4_EXT_MAY_ZEROOUT : 0; 2713 flags1 = flags | EXT4_GET_BLOCKS_PRE_IO; 2714 if (uninitialized) 2715 split_flag1 |= EXT4_EXT_MARK_UNINIT1 | 2716 EXT4_EXT_MARK_UNINIT2; 2717 err = ext4_split_extent_at(handle, inode, path, 2718 map->m_lblk + map->m_len, split_flag1, flags1); 2719 } 2720 2721 ext4_ext_drop_refs(path); 2722 path = ext4_ext_find_extent(inode, map->m_lblk, path); 2723 if (IS_ERR(path)) 2724 return PTR_ERR(path); 2725 2726 if (map->m_lblk >= ee_block) { 2727 split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ? 
2728 EXT4_EXT_MAY_ZEROOUT : 0; 2729 if (uninitialized) 2730 split_flag1 |= EXT4_EXT_MARK_UNINIT1; 2731 if (split_flag & EXT4_EXT_MARK_UNINIT2) 2732 split_flag1 |= EXT4_EXT_MARK_UNINIT2; 2733 err = ext4_split_extent_at(handle, inode, path, 2734 map->m_lblk, split_flag1, flags); 2735 if (err) 2736 goto out; 2737 } 2738 2739 ext4_ext_show_leaf(inode, path); 2740 out: 2741 return err ? err : map->m_len; 2742 } 2743 2744 #define EXT4_EXT_ZERO_LEN 7 2745 /* 2746 * This function is called by ext4_ext_map_blocks() if someone tries to write 2747 * to an uninitialized extent. It may result in splitting the uninitialized 2748 * extent into multiple extents (up to three - one initialized and two 2749 * uninitialized). 2750 * There are three possibilities: 2751 * a> There is no split required: Entire extent should be initialized 2752 * b> Splits in two extents: Write is happening at either end of the extent 2753 * c> Splits in three extents: Somone is writing in middle of the extent 2754 */ 2755 static int ext4_ext_convert_to_initialized(handle_t *handle, 2756 struct inode *inode, 2757 struct ext4_map_blocks *map, 2758 struct ext4_ext_path *path) 2759 { 2760 struct ext4_extent *ex, newex, orig_ex; 2761 struct ext4_extent *ex1 = NULL; 2762 struct ext4_extent *ex2 = NULL; 2763 struct ext4_extent *ex3 = NULL; 2764 struct ext4_extent_header *eh; 2765 ext4_lblk_t ee_block, eof_block; 2766 unsigned int allocated, ee_len, depth; 2767 ext4_fsblk_t newblock; 2768 int err = 0; 2769 int ret = 0; 2770 int may_zeroout; 2771 2772 ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" 2773 "block %llu, max_blocks %u\n", inode->i_ino, 2774 (unsigned long long)map->m_lblk, map->m_len); 2775 2776 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> 2777 inode->i_sb->s_blocksize_bits; 2778 if (eof_block < map->m_lblk + map->m_len) 2779 eof_block = map->m_lblk + map->m_len; 2780 2781 depth = ext_depth(inode); 2782 eh = path[depth].p_hdr; 2783 ex = path[depth].p_ext; 2784 ee_block = le32_to_cpu(ex->ee_block); 2785 ee_len = ext4_ext_get_actual_len(ex); 2786 allocated = ee_len - (map->m_lblk - ee_block); 2787 newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex); 2788 2789 ex2 = ex; 2790 orig_ex.ee_block = ex->ee_block; 2791 orig_ex.ee_len = cpu_to_le16(ee_len); 2792 ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex)); 2793 2794 /* 2795 * It is safe to convert extent to initialized via explicit 2796 * zeroout only if extent is fully insde i_size or new_size. 2797 */ 2798 may_zeroout = ee_block + ee_len <= eof_block; 2799 2800 err = ext4_ext_get_access(handle, inode, path + depth); 2801 if (err) 2802 goto out; 2803 /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */ 2804 if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) { 2805 err = ext4_ext_zeroout(inode, &orig_ex); 2806 if (err) 2807 goto fix_extent_len; 2808 /* update the extent length and mark as initialized */ 2809 ex->ee_block = orig_ex.ee_block; 2810 ex->ee_len = orig_ex.ee_len; 2811 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); 2812 ext4_ext_dirty(handle, inode, path + depth); 2813 /* zeroed the full extent */ 2814 return allocated; 2815 } 2816 2817 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */ 2818 if (map->m_lblk > ee_block) { 2819 ex1 = ex; 2820 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); 2821 ext4_ext_mark_uninitialized(ex1); 2822 ex2 = &newex; 2823 } 2824 /* 2825 * for sanity, update the length of the ex2 extent before 2826 * we insert ex3, if ex1 is NULL. 
This is to avoid temporary 2827 * overlap of blocks. 2828 */ 2829 if (!ex1 && allocated > map->m_len) 2830 ex2->ee_len = cpu_to_le16(map->m_len); 2831 /* ex3: to ee_block + ee_len : uninitialised */ 2832 if (allocated > map->m_len) { 2833 unsigned int newdepth; 2834 /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */ 2835 if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) { 2836 /* 2837 * map->m_lblk == ee_block is handled by the zerouout 2838 * at the beginning. 2839 * Mark first half uninitialized. 2840 * Mark second half initialized and zero out the 2841 * initialized extent 2842 */ 2843 ex->ee_block = orig_ex.ee_block; 2844 ex->ee_len = cpu_to_le16(ee_len - allocated); 2845 ext4_ext_mark_uninitialized(ex); 2846 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); 2847 ext4_ext_dirty(handle, inode, path + depth); 2848 2849 ex3 = &newex; 2850 ex3->ee_block = cpu_to_le32(map->m_lblk); 2851 ext4_ext_store_pblock(ex3, newblock); 2852 ex3->ee_len = cpu_to_le16(allocated); 2853 err = ext4_ext_insert_extent(handle, inode, path, 2854 ex3, 0); 2855 if (err == -ENOSPC) { 2856 err = ext4_ext_zeroout(inode, &orig_ex); 2857 if (err) 2858 goto fix_extent_len; 2859 ex->ee_block = orig_ex.ee_block; 2860 ex->ee_len = orig_ex.ee_len; 2861 ext4_ext_store_pblock(ex, 2862 ext4_ext_pblock(&orig_ex)); 2863 ext4_ext_dirty(handle, inode, path + depth); 2864 /* blocks available from map->m_lblk */ 2865 return allocated; 2866 2867 } else if (err) 2868 goto fix_extent_len; 2869 2870 /* 2871 * We need to zero out the second half because 2872 * an fallocate request can update file size and 2873 * converting the second half to initialized extent 2874 * implies that we can leak some junk data to user 2875 * space. 2876 */ 2877 err = ext4_ext_zeroout(inode, ex3); 2878 if (err) { 2879 /* 2880 * We should actually mark the 2881 * second half as uninit and return error 2882 * Insert would have changed the extent 2883 */ 2884 depth = ext_depth(inode); 2885 ext4_ext_drop_refs(path); 2886 path = ext4_ext_find_extent(inode, map->m_lblk, 2887 path); 2888 if (IS_ERR(path)) { 2889 err = PTR_ERR(path); 2890 return err; 2891 } 2892 /* get the second half extent details */ 2893 ex = path[depth].p_ext; 2894 err = ext4_ext_get_access(handle, inode, 2895 path + depth); 2896 if (err) 2897 return err; 2898 ext4_ext_mark_uninitialized(ex); 2899 ext4_ext_dirty(handle, inode, path + depth); 2900 return err; 2901 } 2902 2903 /* zeroed the second half */ 2904 return allocated; 2905 } 2906 ex3 = &newex; 2907 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len); 2908 ext4_ext_store_pblock(ex3, newblock + map->m_len); 2909 ex3->ee_len = cpu_to_le16(allocated - map->m_len); 2910 ext4_ext_mark_uninitialized(ex3); 2911 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0); 2912 if (err == -ENOSPC && may_zeroout) { 2913 err = ext4_ext_zeroout(inode, &orig_ex); 2914 if (err) 2915 goto fix_extent_len; 2916 /* update the extent length and mark as initialized */ 2917 ex->ee_block = orig_ex.ee_block; 2918 ex->ee_len = orig_ex.ee_len; 2919 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); 2920 ext4_ext_dirty(handle, inode, path + depth); 2921 /* zeroed the full extent */ 2922 /* blocks available from map->m_lblk */ 2923 return allocated; 2924 2925 } else if (err) 2926 goto fix_extent_len; 2927 /* 2928 * The depth, and hence eh & ex might change 2929 * as part of the insert above. 
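 * The path is therefore looked up again below and eh/ex re-read from it.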
2930 */ 2931 newdepth = ext_depth(inode); 2932 /* 2933 * update the extent length after successful insert of the 2934 * split extent 2935 */ 2936 ee_len -= ext4_ext_get_actual_len(ex3); 2937 orig_ex.ee_len = cpu_to_le16(ee_len); 2938 may_zeroout = ee_block + ee_len <= eof_block; 2939 2940 depth = newdepth; 2941 ext4_ext_drop_refs(path); 2942 path = ext4_ext_find_extent(inode, map->m_lblk, path); 2943 if (IS_ERR(path)) { 2944 err = PTR_ERR(path); 2945 goto out; 2946 } 2947 eh = path[depth].p_hdr; 2948 ex = path[depth].p_ext; 2949 if (ex2 != &newex) 2950 ex2 = ex; 2951 2952 err = ext4_ext_get_access(handle, inode, path + depth); 2953 if (err) 2954 goto out; 2955 2956 allocated = map->m_len; 2957 2958 /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying 2959 * to insert a extent in the middle zerout directly 2960 * otherwise give the extent a chance to merge to left 2961 */ 2962 if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN && 2963 map->m_lblk != ee_block && may_zeroout) { 2964 err = ext4_ext_zeroout(inode, &orig_ex); 2965 if (err) 2966 goto fix_extent_len; 2967 /* update the extent length and mark as initialized */ 2968 ex->ee_block = orig_ex.ee_block; 2969 ex->ee_len = orig_ex.ee_len; 2970 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); 2971 ext4_ext_dirty(handle, inode, path + depth); 2972 /* zero out the first half */ 2973 /* blocks available from map->m_lblk */ 2974 return allocated; 2975 } 2976 } 2977 /* 2978 * If there was a change of depth as part of the 2979 * insertion of ex3 above, we need to update the length 2980 * of the ex1 extent again here 2981 */ 2982 if (ex1 && ex1 != ex) { 2983 ex1 = ex; 2984 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); 2985 ext4_ext_mark_uninitialized(ex1); 2986 ex2 = &newex; 2987 } 2988 /* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */ 2989 ex2->ee_block = cpu_to_le32(map->m_lblk); 2990 ext4_ext_store_pblock(ex2, newblock); 2991 ex2->ee_len = cpu_to_le16(allocated); 2992 if (ex2 != ex) 2993 goto insert; 2994 /* 2995 * New (initialized) extent starts from the first block 2996 * in the current extent. i.e., ex2 == ex 2997 * We have to see if it can be merged with the extent 2998 * on the left. 2999 */ 3000 if (ex2 > EXT_FIRST_EXTENT(eh)) { 3001 /* 3002 * To merge left, pass "ex2 - 1" to try_to_merge(), 3003 * since it merges towards right _only_. 3004 */ 3005 ret = ext4_ext_try_to_merge(inode, path, ex2 - 1); 3006 if (ret) { 3007 err = ext4_ext_correct_indexes(handle, inode, path); 3008 if (err) 3009 goto out; 3010 depth = ext_depth(inode); 3011 ex2--; 3012 } 3013 } 3014 /* 3015 * Try to Merge towards right. This might be required 3016 * only when the whole extent is being written to. 3017 * i.e. ex2 == ex and ex3 == NULL. 
3018 */ 3019 if (!ex3) { 3020 ret = ext4_ext_try_to_merge(inode, path, ex2); 3021 if (ret) { 3022 err = ext4_ext_correct_indexes(handle, inode, path); 3023 if (err) 3024 goto out; 3025 } 3026 } 3027 /* Mark modified extent as dirty */ 3028 err = ext4_ext_dirty(handle, inode, path + depth); 3029 goto out; 3030 insert: 3031 err = ext4_ext_insert_extent(handle, inode, path, &newex, 0); 3032 if (err == -ENOSPC && may_zeroout) { 3033 err = ext4_ext_zeroout(inode, &orig_ex); 3034 if (err) 3035 goto fix_extent_len; 3036 /* update the extent length and mark as initialized */ 3037 ex->ee_block = orig_ex.ee_block; 3038 ex->ee_len = orig_ex.ee_len; 3039 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); 3040 ext4_ext_dirty(handle, inode, path + depth); 3041 /* zero out the first half */ 3042 return allocated; 3043 } else if (err) 3044 goto fix_extent_len; 3045 out: 3046 ext4_ext_show_leaf(inode, path); 3047 return err ? err : allocated; 3048 3049 fix_extent_len: 3050 ex->ee_block = orig_ex.ee_block; 3051 ex->ee_len = orig_ex.ee_len; 3052 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); 3053 ext4_ext_mark_uninitialized(ex); 3054 ext4_ext_dirty(handle, inode, path + depth); 3055 return err; 3056 } 3057 3058 /* 3059 * This function is called by ext4_ext_map_blocks() from 3060 * ext4_get_blocks_dio_write() when DIO to write 3061 * to an uninitialized extent. 3062 * 3063 * Writing to an uninitialized extent may result in splitting the uninitialized 3064 * extent into multiple /initialized uninitialized extents (up to three) 3065 * There are three possibilities: 3066 * a> There is no split required: Entire extent should be uninitialized 3067 * b> Splits in two extents: Write is happening at either end of the extent 3068 * c> Splits in three extents: Somone is writing in middle of the extent 3069 * 3070 * One of more index blocks maybe needed if the extent tree grow after 3071 * the uninitialized extent split. To prevent ENOSPC occur at the IO 3072 * complete, we need to split the uninitialized extent before DIO submit 3073 * the IO. The uninitialized extent called at this time will be split 3074 * into three uninitialized extent(at most). After IO complete, the part 3075 * being filled will be convert to initialized by the end_io callback function 3076 * via ext4_convert_unwritten_extents(). 3077 * 3078 * Returns the size of uninitialized extent to be written on success. 
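 *
 * For example, a DIO write of blocks [10, 20) into an uninitialized extent
 * covering [0, 50) leaves [0, 10) and [20, 50) uninitialized, keeps the
 * middle piece [10, 20) uninitialized until end_io converts it, and returns
 * 10 (the number of blocks the write will fill).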
3079 */ 3080 static int ext4_split_unwritten_extents(handle_t *handle, 3081 struct inode *inode, 3082 struct ext4_map_blocks *map, 3083 struct ext4_ext_path *path, 3084 int flags) 3085 { 3086 struct ext4_extent *ex, newex, orig_ex; 3087 struct ext4_extent *ex1 = NULL; 3088 struct ext4_extent *ex2 = NULL; 3089 struct ext4_extent *ex3 = NULL; 3090 ext4_lblk_t ee_block, eof_block; 3091 unsigned int allocated, ee_len, depth; 3092 ext4_fsblk_t newblock; 3093 int err = 0; 3094 int may_zeroout; 3095 3096 ext_debug("ext4_split_unwritten_extents: inode %lu, logical" 3097 "block %llu, max_blocks %u\n", inode->i_ino, 3098 (unsigned long long)map->m_lblk, map->m_len); 3099 3100 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> 3101 inode->i_sb->s_blocksize_bits; 3102 if (eof_block < map->m_lblk + map->m_len) 3103 eof_block = map->m_lblk + map->m_len; 3104 3105 depth = ext_depth(inode); 3106 ex = path[depth].p_ext; 3107 ee_block = le32_to_cpu(ex->ee_block); 3108 ee_len = ext4_ext_get_actual_len(ex); 3109 allocated = ee_len - (map->m_lblk - ee_block); 3110 newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex); 3111 3112 ex2 = ex; 3113 orig_ex.ee_block = ex->ee_block; 3114 orig_ex.ee_len = cpu_to_le16(ee_len); 3115 ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex)); 3116 3117 /* 3118 * It is safe to convert extent to initialized via explicit 3119 * zeroout only if extent is fully insde i_size or new_size. 3120 */ 3121 may_zeroout = ee_block + ee_len <= eof_block; 3122 3123 /* 3124 * If the uninitialized extent begins at the same logical 3125 * block where the write begins, and the write completely 3126 * covers the extent, then we don't need to split it. 3127 */ 3128 if ((map->m_lblk == ee_block) && (allocated <= map->m_len)) 3129 return allocated; 3130 3131 err = ext4_ext_get_access(handle, inode, path + depth); 3132 if (err) 3133 goto out; 3134 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */ 3135 if (map->m_lblk > ee_block) { 3136 ex1 = ex; 3137 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); 3138 ext4_ext_mark_uninitialized(ex1); 3139 ex2 = &newex; 3140 } 3141 /* 3142 * for sanity, update the length of the ex2 extent before 3143 * we insert ex3, if ex1 is NULL. This is to avoid temporary 3144 * overlap of blocks. 3145 */ 3146 if (!ex1 && allocated > map->m_len) 3147 ex2->ee_len = cpu_to_le16(map->m_len); 3148 /* ex3: to ee_block + ee_len : uninitialised */ 3149 if (allocated > map->m_len) { 3150 unsigned int newdepth; 3151 ex3 = &newex; 3152 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len); 3153 ext4_ext_store_pblock(ex3, newblock + map->m_len); 3154 ex3->ee_len = cpu_to_le16(allocated - map->m_len); 3155 ext4_ext_mark_uninitialized(ex3); 3156 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags); 3157 if (err == -ENOSPC && may_zeroout) { 3158 err = ext4_ext_zeroout(inode, &orig_ex); 3159 if (err) 3160 goto fix_extent_len; 3161 /* update the extent length and mark as initialized */ 3162 ex->ee_block = orig_ex.ee_block; 3163 ex->ee_len = orig_ex.ee_len; 3164 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); 3165 ext4_ext_dirty(handle, inode, path + depth); 3166 /* zeroed the full extent */ 3167 /* blocks available from map->m_lblk */ 3168 return allocated; 3169 3170 } else if (err) 3171 goto fix_extent_len; 3172 /* 3173 * The depth, and hence eh & ex might change 3174 * as part of the insert above. 
3175 */ 3176 newdepth = ext_depth(inode); 3177 /* 3178 * update the extent length after successful insert of the 3179 * split extent 3180 */ 3181 ee_len -= ext4_ext_get_actual_len(ex3); 3182 orig_ex.ee_len = cpu_to_le16(ee_len); 3183 may_zeroout = ee_block + ee_len <= eof_block; 3184 3185 depth = newdepth; 3186 ext4_ext_drop_refs(path); 3187 path = ext4_ext_find_extent(inode, map->m_lblk, path); 3188 if (IS_ERR(path)) { 3189 err = PTR_ERR(path); 3190 goto out; 3191 } 3192 ex = path[depth].p_ext; 3193 if (ex2 != &newex) 3194 ex2 = ex; 3195 3196 err = ext4_ext_get_access(handle, inode, path + depth); 3197 if (err) 3198 goto out; 3199 3200 allocated = map->m_len; 3201 } 3202 /* 3203 * If there was a change of depth as part of the 3204 * insertion of ex3 above, we need to update the length 3205 * of the ex1 extent again here 3206 */ 3207 if (ex1 && ex1 != ex) { 3208 ex1 = ex; 3209 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block); 3210 ext4_ext_mark_uninitialized(ex1); 3211 ex2 = &newex; 3212 } 3213 /* 3214 * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written 3215 * using direct I/O, uninitialised still. 3216 */ 3217 ex2->ee_block = cpu_to_le32(map->m_lblk); 3218 ext4_ext_store_pblock(ex2, newblock); 3219 ex2->ee_len = cpu_to_le16(allocated); 3220 ext4_ext_mark_uninitialized(ex2); 3221 if (ex2 != ex) 3222 goto insert; 3223 /* Mark modified extent as dirty */ 3224 err = ext4_ext_dirty(handle, inode, path + depth); 3225 ext_debug("out here\n"); 3226 goto out; 3227 insert: 3228 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3229 if (err == -ENOSPC && may_zeroout) { 3230 err = ext4_ext_zeroout(inode, &orig_ex); 3231 if (err) 3232 goto fix_extent_len; 3233 /* update the extent length and mark as initialized */ 3234 ex->ee_block = orig_ex.ee_block; 3235 ex->ee_len = orig_ex.ee_len; 3236 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); 3237 ext4_ext_dirty(handle, inode, path + depth); 3238 /* zero out the first half */ 3239 return allocated; 3240 } else if (err) 3241 goto fix_extent_len; 3242 out: 3243 ext4_ext_show_leaf(inode, path); 3244 return err ? 
err : allocated; 3245 3246 fix_extent_len: 3247 ex->ee_block = orig_ex.ee_block; 3248 ex->ee_len = orig_ex.ee_len; 3249 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex)); 3250 ext4_ext_mark_uninitialized(ex); 3251 ext4_ext_dirty(handle, inode, path + depth); 3252 return err; 3253 } 3254 3255 static int ext4_convert_unwritten_extents_endio(handle_t *handle, 3256 struct inode *inode, 3257 struct ext4_ext_path *path) 3258 { 3259 struct ext4_extent *ex; 3260 struct ext4_extent_header *eh; 3261 int depth; 3262 int err = 0; 3263 3264 depth = ext_depth(inode); 3265 eh = path[depth].p_hdr; 3266 ex = path[depth].p_ext; 3267 3268 ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical" 3269 "block %llu, max_blocks %u\n", inode->i_ino, 3270 (unsigned long long)le32_to_cpu(ex->ee_block), 3271 ext4_ext_get_actual_len(ex)); 3272 3273 err = ext4_ext_get_access(handle, inode, path + depth); 3274 if (err) 3275 goto out; 3276 /* first mark the extent as initialized */ 3277 ext4_ext_mark_initialized(ex); 3278 3279 /* note: ext4_ext_correct_indexes() isn't needed here because 3280 * borders are not changed 3281 */ 3282 ext4_ext_try_to_merge(inode, path, ex); 3283 3284 /* Mark modified extent as dirty */ 3285 err = ext4_ext_dirty(handle, inode, path + depth); 3286 out: 3287 ext4_ext_show_leaf(inode, path); 3288 return err; 3289 } 3290 3291 static void unmap_underlying_metadata_blocks(struct block_device *bdev, 3292 sector_t block, int count) 3293 { 3294 int i; 3295 for (i = 0; i < count; i++) 3296 unmap_underlying_metadata(bdev, block + i); 3297 } 3298 3299 /* 3300 * Handle EOFBLOCKS_FL flag, clearing it if necessary 3301 */ 3302 static int check_eofblocks_fl(handle_t *handle, struct inode *inode, 3303 ext4_lblk_t lblk, 3304 struct ext4_ext_path *path, 3305 unsigned int len) 3306 { 3307 int i, depth; 3308 struct ext4_extent_header *eh; 3309 struct ext4_extent *last_ex; 3310 3311 if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) 3312 return 0; 3313 3314 depth = ext_depth(inode); 3315 eh = path[depth].p_hdr; 3316 3317 if (unlikely(!eh->eh_entries)) { 3318 EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and " 3319 "EOFBLOCKS_FL set"); 3320 return -EIO; 3321 } 3322 last_ex = EXT_LAST_EXTENT(eh); 3323 /* 3324 * We should clear the EOFBLOCKS_FL flag if we are writing the 3325 * last block in the last extent in the file. We test this by 3326 * first checking to see if the caller to 3327 * ext4_ext_get_blocks() was interested in the last block (or 3328 * a block beyond the last block) in the current extent. If 3329 * this turns out to be false, we can bail out from this 3330 * function immediately. 3331 */ 3332 if (lblk + len < le32_to_cpu(last_ex->ee_block) + 3333 ext4_ext_get_actual_len(last_ex)) 3334 return 0; 3335 /* 3336 * If the caller does appear to be planning to write at or 3337 * beyond the end of the current extent, we then test to see 3338 * if the current extent is the last extent in the file, by 3339 * checking to make sure it was reached via the rightmost node 3340 * at each level of the tree. 
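 * The loop below walks from the level just above the leaf up to the root
 * and bails out (leaving the flag set) if any level is not positioned at
 * its last index.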
3341 */ 3342 for (i = depth-1; i >= 0; i--) 3343 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr)) 3344 return 0; 3345 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); 3346 return ext4_mark_inode_dirty(handle, inode); 3347 } 3348 3349 static int 3350 ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, 3351 struct ext4_map_blocks *map, 3352 struct ext4_ext_path *path, int flags, 3353 unsigned int allocated, ext4_fsblk_t newblock) 3354 { 3355 int ret = 0; 3356 int err = 0; 3357 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3358 3359 ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical" 3360 "block %llu, max_blocks %u, flags %d, allocated %u", 3361 inode->i_ino, (unsigned long long)map->m_lblk, map->m_len, 3362 flags, allocated); 3363 ext4_ext_show_leaf(inode, path); 3364 3365 /* get_block() before submit the IO, split the extent */ 3366 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3367 ret = ext4_split_unwritten_extents(handle, inode, map, 3368 path, flags); 3369 /* 3370 * Flag the inode(non aio case) or end_io struct (aio case) 3371 * that this IO needs to conversion to written when IO is 3372 * completed 3373 */ 3374 if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) { 3375 io->flag = EXT4_IO_END_UNWRITTEN; 3376 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); 3377 } else 3378 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3379 if (ext4_should_dioread_nolock(inode)) 3380 map->m_flags |= EXT4_MAP_UNINIT; 3381 goto out; 3382 } 3383 /* IO end_io complete, convert the filled extent to written */ 3384 if ((flags & EXT4_GET_BLOCKS_CONVERT)) { 3385 ret = ext4_convert_unwritten_extents_endio(handle, inode, 3386 path); 3387 if (ret >= 0) { 3388 ext4_update_inode_fsync_trans(handle, inode, 1); 3389 err = check_eofblocks_fl(handle, inode, map->m_lblk, 3390 path, map->m_len); 3391 } else 3392 err = ret; 3393 goto out2; 3394 } 3395 /* buffered IO case */ 3396 /* 3397 * repeat fallocate creation request 3398 * we already have an unwritten extent 3399 */ 3400 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) 3401 goto map_out; 3402 3403 /* buffered READ or buffered write_begin() lookup */ 3404 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3405 /* 3406 * We have blocks reserved already. We 3407 * return allocated blocks so that delalloc 3408 * won't do block reservation for us. But 3409 * the buffer head will be unmapped so that 3410 * a read from the block returns 0s. 3411 */ 3412 map->m_flags |= EXT4_MAP_UNWRITTEN; 3413 goto out1; 3414 } 3415 3416 /* buffered write, writepage time, convert*/ 3417 ret = ext4_ext_convert_to_initialized(handle, inode, map, path); 3418 if (ret >= 0) { 3419 ext4_update_inode_fsync_trans(handle, inode, 1); 3420 err = check_eofblocks_fl(handle, inode, map->m_lblk, path, 3421 map->m_len); 3422 if (err < 0) 3423 goto out2; 3424 } 3425 3426 out: 3427 if (ret <= 0) { 3428 err = ret; 3429 goto out2; 3430 } else 3431 allocated = ret; 3432 map->m_flags |= EXT4_MAP_NEW; 3433 /* 3434 * if we allocated more blocks than requested 3435 * we need to make sure we unmap the extra block 3436 * allocated. The actual needed block will get 3437 * unmapped later when we find the buffer_head marked 3438 * new. 
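 * unmap_underlying_metadata_blocks() below discards any aliases of those
 * extra blocks that may still exist in the block device's page cache.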
3439 */ 3440 if (allocated > map->m_len) { 3441 unmap_underlying_metadata_blocks(inode->i_sb->s_bdev, 3442 newblock + map->m_len, 3443 allocated - map->m_len); 3444 allocated = map->m_len; 3445 } 3446 3447 /* 3448 * If we have done fallocate with the offset that is already 3449 * delayed allocated, we would have block reservation 3450 * and quota reservation done in the delayed write path. 3451 * But fallocate would have already updated quota and block 3452 * count for this offset. So cancel these reservation 3453 */ 3454 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 3455 ext4_da_update_reserve_space(inode, allocated, 0); 3456 3457 map_out: 3458 map->m_flags |= EXT4_MAP_MAPPED; 3459 out1: 3460 if (allocated > map->m_len) 3461 allocated = map->m_len; 3462 ext4_ext_show_leaf(inode, path); 3463 map->m_pblk = newblock; 3464 map->m_len = allocated; 3465 out2: 3466 if (path) { 3467 ext4_ext_drop_refs(path); 3468 kfree(path); 3469 } 3470 return err ? err : allocated; 3471 } 3472 3473 /* 3474 * Block allocation/map/preallocation routine for extents based files 3475 * 3476 * 3477 * Need to be called with 3478 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block 3479 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) 3480 * 3481 * return > 0, number of of blocks already mapped/allocated 3482 * if create == 0 and these are pre-allocated blocks 3483 * buffer head is unmapped 3484 * otherwise blocks are mapped 3485 * 3486 * return = 0, if plain look up failed (blocks have not been allocated) 3487 * buffer head is unmapped 3488 * 3489 * return < 0, error case. 3490 */ 3491 int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, 3492 struct ext4_map_blocks *map, int flags) 3493 { 3494 struct ext4_ext_path *path = NULL; 3495 struct ext4_extent newex, *ex; 3496 ext4_fsblk_t newblock = 0; 3497 int err = 0, depth, ret; 3498 unsigned int allocated = 0; 3499 struct ext4_allocation_request ar; 3500 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3501 3502 ext_debug("blocks %u/%u requested for inode %lu\n", 3503 map->m_lblk, map->m_len, inode->i_ino); 3504 trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); 3505 3506 /* check in cache */ 3507 if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { 3508 if (!newex.ee_start_lo && !newex.ee_start_hi) { 3509 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3510 /* 3511 * block isn't allocated yet and 3512 * user doesn't want to allocate it 3513 */ 3514 goto out2; 3515 } 3516 /* we should allocate requested block */ 3517 } else { 3518 /* block is already allocated */ 3519 newblock = map->m_lblk 3520 - le32_to_cpu(newex.ee_block) 3521 + ext4_ext_pblock(&newex); 3522 /* number of remaining blocks in the extent */ 3523 allocated = ext4_ext_get_actual_len(&newex) - 3524 (map->m_lblk - le32_to_cpu(newex.ee_block)); 3525 goto out; 3526 } 3527 } 3528 3529 /* find extent for this block */ 3530 path = ext4_ext_find_extent(inode, map->m_lblk, NULL); 3531 if (IS_ERR(path)) { 3532 err = PTR_ERR(path); 3533 path = NULL; 3534 goto out2; 3535 } 3536 3537 depth = ext_depth(inode); 3538 3539 /* 3540 * consistent leaf must not be empty; 3541 * this situation is possible, though, _during_ tree modification; 3542 * this is why assert can't be put in ext4_ext_find_extent() 3543 */ 3544 if (unlikely(path[depth].p_ext == NULL && depth != 0)) { 3545 EXT4_ERROR_INODE(inode, "bad extent address " 3546 "lblock: %lu, depth: %d pblock %lld", 3547 (unsigned long) map->m_lblk, depth, 3548 path[depth].p_block); 3549 err = -EIO; 3550 
goto out2; 3551 } 3552 3553 ex = path[depth].p_ext; 3554 if (ex) { 3555 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); 3556 ext4_fsblk_t ee_start = ext4_ext_pblock(ex); 3557 unsigned short ee_len; 3558 3559 /* 3560 * Uninitialized extents are treated as holes, except that 3561 * we split out initialized portions during a write. 3562 */ 3563 ee_len = ext4_ext_get_actual_len(ex); 3564 /* if found extent covers block, simply return it */ 3565 if (in_range(map->m_lblk, ee_block, ee_len)) { 3566 newblock = map->m_lblk - ee_block + ee_start; 3567 /* number of remaining blocks in the extent */ 3568 allocated = ee_len - (map->m_lblk - ee_block); 3569 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, 3570 ee_block, ee_len, newblock); 3571 3572 /* Do not put uninitialized extent in the cache */ 3573 if (!ext4_ext_is_uninitialized(ex)) { 3574 ext4_ext_put_in_cache(inode, ee_block, 3575 ee_len, ee_start); 3576 goto out; 3577 } 3578 ret = ext4_ext_handle_uninitialized_extents(handle, 3579 inode, map, path, flags, allocated, 3580 newblock); 3581 return ret; 3582 } 3583 } 3584 3585 /* 3586 * requested block isn't allocated yet; 3587 * we couldn't try to create block if create flag is zero 3588 */ 3589 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3590 /* 3591 * put just found gap into cache to speed up 3592 * subsequent requests 3593 */ 3594 ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); 3595 goto out2; 3596 } 3597 /* 3598 * Okay, we need to do block allocation. 3599 */ 3600 3601 /* find neighbour allocated blocks */ 3602 ar.lleft = map->m_lblk; 3603 err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); 3604 if (err) 3605 goto out2; 3606 ar.lright = map->m_lblk; 3607 err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright); 3608 if (err) 3609 goto out2; 3610 3611 /* 3612 * See if request is beyond maximum number of blocks we can have in 3613 * a single extent. For an initialized extent this limit is 3614 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is 3615 * EXT_UNINIT_MAX_LEN. 3616 */ 3617 if (map->m_len > EXT_INIT_MAX_LEN && 3618 !(flags & EXT4_GET_BLOCKS_UNINIT_EXT)) 3619 map->m_len = EXT_INIT_MAX_LEN; 3620 else if (map->m_len > EXT_UNINIT_MAX_LEN && 3621 (flags & EXT4_GET_BLOCKS_UNINIT_EXT)) 3622 map->m_len = EXT_UNINIT_MAX_LEN; 3623 3624 /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */ 3625 newex.ee_block = cpu_to_le32(map->m_lblk); 3626 newex.ee_len = cpu_to_le16(map->m_len); 3627 err = ext4_ext_check_overlap(inode, &newex, path); 3628 if (err) 3629 allocated = ext4_ext_get_actual_len(&newex); 3630 else 3631 allocated = map->m_len; 3632 3633 /* allocate new block */ 3634 ar.inode = inode; 3635 ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk); 3636 ar.logical = map->m_lblk; 3637 ar.len = allocated; 3638 if (S_ISREG(inode->i_mode)) 3639 ar.flags = EXT4_MB_HINT_DATA; 3640 else 3641 /* disable in-core preallocation for non-regular files */ 3642 ar.flags = 0; 3643 newblock = ext4_mb_new_blocks(handle, &ar, &err); 3644 if (!newblock) 3645 goto out2; 3646 ext_debug("allocate new block: goal %llu, found %llu/%u\n", 3647 ar.goal, newblock, allocated); 3648 3649 /* try to insert new extent into found leaf and return */ 3650 ext4_ext_store_pblock(&newex, newblock); 3651 newex.ee_len = cpu_to_le16(ar.len); 3652 /* Mark uninitialized */ 3653 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ 3654 ext4_ext_mark_uninitialized(&newex); 3655 /* 3656 * io_end structure was created for every IO write to an 3657 * uninitialized extent. 
To avoid unnecessary conversion, 3658 * here we flag the IO that really needs the conversion. 3659 * For non asycn direct IO case, flag the inode state 3660 * that we need to perform conversion when IO is done. 3661 */ 3662 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3663 if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) { 3664 io->flag = EXT4_IO_END_UNWRITTEN; 3665 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); 3666 } else 3667 ext4_set_inode_state(inode, 3668 EXT4_STATE_DIO_UNWRITTEN); 3669 } 3670 if (ext4_should_dioread_nolock(inode)) 3671 map->m_flags |= EXT4_MAP_UNINIT; 3672 } 3673 3674 err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len); 3675 if (err) 3676 goto out2; 3677 3678 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3679 if (err) { 3680 /* free data blocks we just allocated */ 3681 /* not a good idea to call discard here directly, 3682 * but otherwise we'd need to call it every free() */ 3683 ext4_discard_preallocations(inode); 3684 ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex), 3685 ext4_ext_get_actual_len(&newex), 0); 3686 goto out2; 3687 } 3688 3689 /* previous routine could use block we allocated */ 3690 newblock = ext4_ext_pblock(&newex); 3691 allocated = ext4_ext_get_actual_len(&newex); 3692 if (allocated > map->m_len) 3693 allocated = map->m_len; 3694 map->m_flags |= EXT4_MAP_NEW; 3695 3696 /* 3697 * Update reserved blocks/metadata blocks after successful 3698 * block allocation which had been deferred till now. 3699 */ 3700 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 3701 ext4_da_update_reserve_space(inode, allocated, 1); 3702 3703 /* 3704 * Cache the extent and update transaction to commit on fdatasync only 3705 * when it is _not_ an uninitialized extent. 3706 */ 3707 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { 3708 ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock); 3709 ext4_update_inode_fsync_trans(handle, inode, 1); 3710 } else 3711 ext4_update_inode_fsync_trans(handle, inode, 0); 3712 out: 3713 if (allocated > map->m_len) 3714 allocated = map->m_len; 3715 ext4_ext_show_leaf(inode, path); 3716 map->m_flags |= EXT4_MAP_MAPPED; 3717 map->m_pblk = newblock; 3718 map->m_len = allocated; 3719 out2: 3720 if (path) { 3721 ext4_ext_drop_refs(path); 3722 kfree(path); 3723 } 3724 trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, 3725 newblock, map->m_len, err ? err : allocated); 3726 return err ? err : allocated; 3727 } 3728 3729 void ext4_ext_truncate(struct inode *inode) 3730 { 3731 struct address_space *mapping = inode->i_mapping; 3732 struct super_block *sb = inode->i_sb; 3733 ext4_lblk_t last_block; 3734 handle_t *handle; 3735 int err = 0; 3736 3737 /* 3738 * finish any pending end_io work so we won't run the risk of 3739 * converting any truncated blocks to initialized later 3740 */ 3741 ext4_flush_completed_IO(inode); 3742 3743 /* 3744 * probably first extent we're gonna free will be last in block 3745 */ 3746 err = ext4_writepage_trans_blocks(inode); 3747 handle = ext4_journal_start(inode, err); 3748 if (IS_ERR(handle)) 3749 return; 3750 3751 if (inode->i_size & (sb->s_blocksize - 1)) 3752 ext4_block_truncate_page(handle, mapping, inode->i_size); 3753 3754 if (ext4_orphan_add(handle, inode)) 3755 goto out_stop; 3756 3757 down_write(&EXT4_I(inode)->i_data_sem); 3758 ext4_ext_invalidate_cache(inode); 3759 3760 ext4_discard_preallocations(inode); 3761 3762 /* 3763 * TODO: optimization is possible here. 3764 * Probably we need not scan at all, 3765 * because page truncation is enough. 
3766 */ 3767 3768 /* we have to know where to truncate from in crash case */ 3769 EXT4_I(inode)->i_disksize = inode->i_size; 3770 ext4_mark_inode_dirty(handle, inode); 3771 3772 last_block = (inode->i_size + sb->s_blocksize - 1) 3773 >> EXT4_BLOCK_SIZE_BITS(sb); 3774 err = ext4_ext_remove_space(inode, last_block); 3775 3776 /* In a multi-transaction truncate, we only make the final 3777 * transaction synchronous. 3778 */ 3779 if (IS_SYNC(inode)) 3780 ext4_handle_sync(handle); 3781 3782 out_stop: 3783 up_write(&EXT4_I(inode)->i_data_sem); 3784 /* 3785 * If this was a simple ftruncate() and the file will remain alive, 3786 * then we need to clear up the orphan record which we created above. 3787 * However, if this was a real unlink then we were called by 3788 * ext4_delete_inode(), and we allow that function to clean up the 3789 * orphan info for us. 3790 */ 3791 if (inode->i_nlink) 3792 ext4_orphan_del(handle, inode); 3793 3794 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 3795 ext4_mark_inode_dirty(handle, inode); 3796 ext4_journal_stop(handle); 3797 } 3798 3799 static void ext4_falloc_update_inode(struct inode *inode, 3800 int mode, loff_t new_size, int update_ctime) 3801 { 3802 struct timespec now; 3803 3804 if (update_ctime) { 3805 now = current_fs_time(inode->i_sb); 3806 if (!timespec_equal(&inode->i_ctime, &now)) 3807 inode->i_ctime = now; 3808 } 3809 /* 3810 * Update only when preallocation was requested beyond 3811 * the file size. 3812 */ 3813 if (!(mode & FALLOC_FL_KEEP_SIZE)) { 3814 if (new_size > i_size_read(inode)) 3815 i_size_write(inode, new_size); 3816 if (new_size > EXT4_I(inode)->i_disksize) 3817 ext4_update_i_disksize(inode, new_size); 3818 } else { 3819 /* 3820 * Mark that we allocate beyond EOF so the subsequent truncate 3821 * can proceed even if the new size is the same as i_size. 3822 */ 3823 if (new_size > i_size_read(inode)) 3824 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); 3825 } 3826 3827 } 3828 3829 /* 3830 * preallocate space for a file. This implements ext4's fallocate file 3831 * operation, which gets called from sys_fallocate system call. 3832 * For block-mapped files, posix_fallocate should fall back to the method 3833 * of writing zeroes to the required new blocks (the same behavior which is 3834 * expected for file systems which do not support fallocate() system call). 
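 *
 * As a worked example of the range-to-blocks conversion done below: with a
 * 4096-byte block size, offset = 3072 and len = 2048 cover bytes
 * [3072, 5120), i.e. logical blocks 0 and 1, so max_blocks must be 2 even
 * though len >> blkbits alone would give 0.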
3835 */ 3836 long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) 3837 { 3838 struct inode *inode = file->f_path.dentry->d_inode; 3839 handle_t *handle; 3840 loff_t new_size; 3841 unsigned int max_blocks; 3842 int ret = 0; 3843 int ret2 = 0; 3844 int retries = 0; 3845 struct ext4_map_blocks map; 3846 unsigned int credits, blkbits = inode->i_blkbits; 3847 3848 /* We only support the FALLOC_FL_KEEP_SIZE mode */ 3849 if (mode & ~FALLOC_FL_KEEP_SIZE) 3850 return -EOPNOTSUPP; 3851 3852 /* 3853 * currently supporting (pre)allocate mode for extent-based 3854 * files _only_ 3855 */ 3856 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 3857 return -EOPNOTSUPP; 3858 3859 trace_ext4_fallocate_enter(inode, offset, len, mode); 3860 map.m_lblk = offset >> blkbits; 3861 /* 3862 * We can't just convert len to max_blocks because 3863 * If blocksize = 4096 offset = 3072 and len = 2048 3864 */ 3865 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) 3866 - map.m_lblk; 3867 /* 3868 * credits to insert 1 extent into extent tree 3869 */ 3870 credits = ext4_chunk_trans_blocks(inode, max_blocks); 3871 mutex_lock(&inode->i_mutex); 3872 ret = inode_newsize_ok(inode, (len + offset)); 3873 if (ret) { 3874 mutex_unlock(&inode->i_mutex); 3875 trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); 3876 return ret; 3877 } 3878 retry: 3879 while (ret >= 0 && ret < max_blocks) { 3880 map.m_lblk = map.m_lblk + ret; 3881 map.m_len = max_blocks = max_blocks - ret; 3882 handle = ext4_journal_start(inode, credits); 3883 if (IS_ERR(handle)) { 3884 ret = PTR_ERR(handle); 3885 break; 3886 } 3887 ret = ext4_map_blocks(handle, inode, &map, 3888 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT); 3889 if (ret <= 0) { 3890 #ifdef EXT4FS_DEBUG 3891 WARN_ON(ret <= 0); 3892 printk(KERN_ERR "%s: ext4_ext_map_blocks " 3893 "returned error inode#%lu, block=%u, " 3894 "max_blocks=%u", __func__, 3895 inode->i_ino, map.m_lblk, max_blocks); 3896 #endif 3897 ext4_mark_inode_dirty(handle, inode); 3898 ret2 = ext4_journal_stop(handle); 3899 break; 3900 } 3901 if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len, 3902 blkbits) >> blkbits)) 3903 new_size = offset + len; 3904 else 3905 new_size = (map.m_lblk + ret) << blkbits; 3906 3907 ext4_falloc_update_inode(inode, mode, new_size, 3908 (map.m_flags & EXT4_MAP_NEW)); 3909 ext4_mark_inode_dirty(handle, inode); 3910 ret2 = ext4_journal_stop(handle); 3911 if (ret2) 3912 break; 3913 } 3914 if (ret == -ENOSPC && 3915 ext4_should_retry_alloc(inode->i_sb, &retries)) { 3916 ret = 0; 3917 goto retry; 3918 } 3919 mutex_unlock(&inode->i_mutex); 3920 trace_ext4_fallocate_exit(inode, offset, max_blocks, 3921 ret > 0 ? ret2 : ret); 3922 return ret > 0 ? ret2 : ret; 3923 } 3924 3925 /* 3926 * This function convert a range of blocks to written extents 3927 * The caller of this function will pass the start offset and the size. 3928 * all unwritten extents within this range will be converted to 3929 * written extents. 3930 * 3931 * This function is called from the direct IO end io call back 3932 * function, to convert the fallocated extents after IO is completed. 3933 * Returns 0 on success. 
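 * The range is converted in pieces: each iteration of the loop below starts
 * its own transaction and asks ext4_map_blocks() to convert up to max_blocks
 * blocks at a time.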
/*
 * This function converts a range of blocks to written extents.
 * The caller passes the start offset and the size; all unwritten
 * extents within this range will be converted to written extents.
 *
 * This function is called from the direct I/O end_io callback
 * function, to convert the fallocated extents after I/O is completed.
 * Returns 0 on success.
 */
int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
				    ssize_t len)
{
	handle_t *handle;
	unsigned int max_blocks;
	int ret = 0;
	int ret2 = 0;
	struct ext4_map_blocks map;
	unsigned int credits, blkbits = inode->i_blkbits;

	map.m_lblk = offset >> blkbits;
	/*
	 * We can't just convert len to max_blocks because the request
	 * may not be block-aligned: e.g. with blocksize = 4096,
	 * offset = 3072 and len = 2048 the range spans two blocks.
	 */
	max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
		      map.m_lblk);
	/*
	 * credits to insert 1 extent into extent tree
	 */
	credits = ext4_chunk_trans_blocks(inode, max_blocks);
	while (ret >= 0 && ret < max_blocks) {
		map.m_lblk += ret;
		map.m_len = (max_blocks -= ret);
		handle = ext4_journal_start(inode, credits);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			break;
		}
		ret = ext4_map_blocks(handle, inode, &map,
				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
		if (ret <= 0) {
			WARN_ON(ret <= 0);
			printk(KERN_ERR "%s: ext4_ext_map_blocks "
				"returned error inode#%lu, block=%u, "
				"max_blocks=%u", __func__,
				inode->i_ino, map.m_lblk, map.m_len);
		}
		ext4_mark_inode_dirty(handle, inode);
		ret2 = ext4_journal_stop(handle);
		if (ret <= 0 || ret2)
			break;
	}
	return ret > 0 ? ret2 : ret;
}

/*
 * Callback function called for each extent to gather FIEMAP information.
 */
static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
		       struct ext4_ext_cache *newex, struct ext4_extent *ex,
		       void *data)
{
	__u64	logical;
	__u64	physical;
	__u64	length;
	loff_t	size;
	__u32	flags = 0;
	int	ret = 0;
	struct fiemap_extent_info *fieinfo = data;
	unsigned char blksize_bits;

	blksize_bits = inode->i_sb->s_blocksize_bits;
	logical = (__u64)newex->ec_block << blksize_bits;

	if (newex->ec_start == 0) {
		/*
		 * No extent in the extent tree covers this block
		 * (newex->ec_start == 0), so the block may lie in
		 * 1) a hole or 2) a delayed extent.
		 *
		 * Holes and delayed extents are processed as follows:
		 * 1. Look up dirty pages in the requested range of the
		 *    page cache. If no page is found, there is no delayed
		 *    extent; return EXT_CONTINUE.
		 * 2. Find the first mapped buffer.
		 * 3. Check that the mapped buffer is both inside the request
		 *    range and delayed. If not, there is no delayed extent;
		 *    return.
		 * 4. Otherwise a delayed extent has been found; collect it.
		 */
		ext4_lblk_t	end = 0;
		pgoff_t		last_offset;
		pgoff_t		offset;
		pgoff_t		index;
		struct page	**pages = NULL;
		struct buffer_head *bh = NULL;
		struct buffer_head *head = NULL;
		unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *);

		pages = kmalloc(PAGE_SIZE, GFP_KERNEL);
		if (pages == NULL)
			return -ENOMEM;

		offset = logical >> PAGE_SHIFT;
repeat:
		last_offset = offset;
		head = NULL;
		ret = find_get_pages_tag(inode->i_mapping, &offset,
					PAGECACHE_TAG_DIRTY, nr_pages, pages);

		if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
			/* First time, try to find a mapped buffer. */
			if (ret == 0) {
out:
				for (index = 0; index < ret; index++)
					page_cache_release(pages[index]);
				/* just a hole. */
				kfree(pages);
				return EXT_CONTINUE;
			}

			/* Try to find the 1st mapped buffer. */
			end = ((__u64)pages[0]->index << PAGE_SHIFT) >>
				blksize_bits;
			if (!page_has_buffers(pages[0]))
				goto out;
			head = page_buffers(pages[0]);
			if (!head)
				goto out;

			bh = head;
			do {
				if (buffer_mapped(bh)) {
					/* get the 1st mapped buffer. */
					if (end > newex->ec_block +
						newex->ec_len)
						/* The buffer is out of
						 * the request range.
						 */
						goto out;
					goto found_mapped_buffer;
				}
				bh = bh->b_this_page;
				end++;
			} while (bh != head);

			/* No mapped buffer found. */
			goto out;
		} else {
			/* Find contiguous delayed buffers. */
			if (ret > 0 && pages[0]->index == last_offset)
				head = page_buffers(pages[0]);
			bh = head;
		}

found_mapped_buffer:
		if (bh != NULL && buffer_delay(bh)) {
			/* 1st or contiguous delayed buffer found. */
			if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
				/*
				 * 1st delayed buffer found, record
				 * the start of extent.
				 */
				flags |= FIEMAP_EXTENT_DELALLOC;
				newex->ec_block = end;
				logical = (__u64)end << blksize_bits;
			}
			/* Find contiguous delayed buffers. */
			do {
				if (!buffer_delay(bh))
					goto found_delayed_extent;
				bh = bh->b_this_page;
				end++;
			} while (bh != head);

			for (index = 1; index < ret; index++) {
				if (!page_has_buffers(pages[index])) {
					bh = NULL;
					break;
				}
				head = page_buffers(pages[index]);
				if (!head) {
					bh = NULL;
					break;
				}
				if (pages[index]->index !=
					pages[0]->index + index) {
					/* Blocks are not contiguous. */
					bh = NULL;
					break;
				}
				bh = head;
				do {
					if (!buffer_delay(bh))
						/* Delayed-extent ends. */
						goto found_delayed_extent;
					bh = bh->b_this_page;
					end++;
				} while (bh != head);
			}
		} else if (!(flags & FIEMAP_EXTENT_DELALLOC))
			/* a hole found. */
			goto out;

found_delayed_extent:
		newex->ec_len = min(end - newex->ec_block,
				    (ext4_lblk_t)EXT_INIT_MAX_LEN);
		if (ret == nr_pages && bh != NULL &&
			newex->ec_len < EXT_INIT_MAX_LEN &&
			buffer_delay(bh)) {
			/* Have not collected an extent and continue. */
			for (index = 0; index < ret; index++)
				page_cache_release(pages[index]);
			goto repeat;
		}

		for (index = 0; index < ret; index++)
			page_cache_release(pages[index]);
		kfree(pages);
	}

	physical = (__u64)newex->ec_start << blksize_bits;
	length = (__u64)newex->ec_len << blksize_bits;

	if (ex && ext4_ext_is_uninitialized(ex))
		flags |= FIEMAP_EXTENT_UNWRITTEN;

	size = i_size_read(inode);
	if (logical + length >= size)
		flags |= FIEMAP_EXTENT_LAST;

	ret = fiemap_fill_next_extent(fieinfo, logical, physical,
					length, flags);
	if (ret < 0)
		return ret;
	if (ret == 1)
		return EXT_BREAK;
	return EXT_CONTINUE;
}

/* fiemap flags we can handle specified here */
#define EXT4_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)

static int ext4_xattr_fiemap(struct inode *inode,
				struct fiemap_extent_info *fieinfo)
{
	__u64 physical = 0;
	__u64 length;
	__u32 flags = FIEMAP_EXTENT_LAST;
	int blockbits = inode->i_sb->s_blocksize_bits;
	int error = 0;

	/* in-inode? */
	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
		struct ext4_iloc iloc;
		int offset;	/* offset of xattr in inode */

		error = ext4_get_inode_loc(inode, &iloc);
		if (error)
			return error;
		physical = iloc.bh->b_blocknr << blockbits;
		offset = EXT4_GOOD_OLD_INODE_SIZE +
				EXT4_I(inode)->i_extra_isize;
		physical += offset;
		length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
		flags |= FIEMAP_EXTENT_DATA_INLINE;
		brelse(iloc.bh);
	} else { /* external block */
		physical = EXT4_I(inode)->i_file_acl << blockbits;
		length = inode->i_sb->s_blocksize;
	}

	if (physical)
		error = fiemap_fill_next_extent(fieinfo, 0, physical,
						length, flags);
	return (error < 0 ? error : 0);
}

int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		__u64 start, __u64 len)
{
	ext4_lblk_t start_blk;
	int error = 0;

	/* fallback to generic here if not in extents fmt */
	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
		return generic_block_fiemap(inode, fieinfo, start, len,
			ext4_get_block);

	if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
		return -EBADR;

	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
		error = ext4_xattr_fiemap(inode, fieinfo);
	} else {
		ext4_lblk_t len_blks;
		__u64 last_blk;

		start_blk = start >> inode->i_sb->s_blocksize_bits;
		last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
		if (last_blk >= EXT_MAX_BLOCK)
			last_blk = EXT_MAX_BLOCK-1;
		len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;

		/*
		 * Walk the extent tree gathering extent information.
		 * ext4_ext_fiemap_cb will push extents back to user.
		 */
		error = ext4_ext_walk_space(inode, start_blk, len_blks,
					  ext4_ext_fiemap_cb, fieinfo);
	}

	return error;
}
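For context (not part of this file): ext4_fiemap() backs the FS_IOC_FIEMAP ioctl, with ext4_ext_fiemap_cb() filling in one struct fiemap_extent per extent (FIEMAP_EXTENT_UNWRITTEN for fallocated but unwritten ranges, FIEMAP_EXTENT_DELALLOC for dirty data found only in the page cache) and ext4_xattr_fiemap() handling FIEMAP_FLAG_XATTR. A minimal user-space caller might look like the following sketch; the path is a placeholder:

/* Illustrative user-space sketch; the file path is arbitrary. */
#include <fcntl.h>
#include <linux/fiemap.h>
#include <linux/fs.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	unsigned int i, count = 32;
	struct fiemap *fm;
	int fd = open("/mnt/ext4/testfile", O_RDONLY);

	if (fd < 0)
		return 1;

	fm = calloc(1, sizeof(*fm) + count * sizeof(struct fiemap_extent));
	if (!fm)
		return 1;

	fm->fm_start = 0;
	fm->fm_length = FIEMAP_MAX_OFFSET;	/* map the whole file */
	fm->fm_flags = FIEMAP_FLAG_SYNC;	/* flush delalloc data first */
	fm->fm_extent_count = count;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0) {
		for (i = 0; i < fm->fm_mapped_extents; i++) {
			struct fiemap_extent *fe = &fm->fm_extents[i];

			printf("logical %llu physical %llu len %llu%s%s\n",
			       (unsigned long long)fe->fe_logical,
			       (unsigned long long)fe->fe_physical,
			       (unsigned long long)fe->fe_length,
			       (fe->fe_flags & FIEMAP_EXTENT_UNWRITTEN) ?
					" [unwritten]" : "",
			       (fe->fe_flags & FIEMAP_EXTENT_LAST) ?
					" [last]" : "");
		}
	}

	free(fm);
	close(fd);
	return 0;
}

Without FIEMAP_FLAG_SYNC, ranges that only exist as dirty pages may be reported through the page-cache walk above and carry FIEMAP_EXTENT_DELALLOC instead of a physical address.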