1 /* 2 * linux/fs/ext4/inode.c 3 * 4 * Copyright (C) 1992, 1993, 1994, 1995 5 * Remy Card (card@masi.ibp.fr) 6 * Laboratoire MASI - Institut Blaise Pascal 7 * Universite Pierre et Marie Curie (Paris VI) 8 * 9 * from 10 * 11 * linux/fs/minix/inode.c 12 * 13 * Copyright (C) 1991, 1992 Linus Torvalds 14 * 15 * Goal-directed block allocation by Stephen Tweedie 16 * (sct@redhat.com), 1993, 1998 17 * Big-endian to little-endian byte-swapping/bitmaps by 18 * David S. Miller (davem@caip.rutgers.edu), 1995 19 * 64-bit file support on 64-bit platforms by Jakub Jelinek 20 * (jj@sunsite.ms.mff.cuni.cz) 21 * 22 * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000 23 */ 24 25 #include <linux/module.h> 26 #include <linux/fs.h> 27 #include <linux/time.h> 28 #include <linux/jbd2.h> 29 #include <linux/highuid.h> 30 #include <linux/pagemap.h> 31 #include <linux/quotaops.h> 32 #include <linux/string.h> 33 #include <linux/buffer_head.h> 34 #include <linux/writeback.h> 35 #include <linux/pagevec.h> 36 #include <linux/mpage.h> 37 #include <linux/namei.h> 38 #include <linux/uio.h> 39 #include <linux/bio.h> 40 #include "ext4_jbd2.h" 41 #include "xattr.h" 42 #include "acl.h" 43 #include "ext4_extents.h" 44 45 #define MPAGE_DA_EXTENT_TAIL 0x01 46 47 static inline int ext4_begin_ordered_truncate(struct inode *inode, 48 loff_t new_size) 49 { 50 return jbd2_journal_begin_ordered_truncate( 51 EXT4_SB(inode->i_sb)->s_journal, 52 &EXT4_I(inode)->jinode, 53 new_size); 54 } 55 56 static void ext4_invalidatepage(struct page *page, unsigned long offset); 57 58 /* 59 * Test whether an inode is a fast symlink. 60 */ 61 static int ext4_inode_is_fast_symlink(struct inode *inode) 62 { 63 int ea_blocks = EXT4_I(inode)->i_file_acl ? 64 (inode->i_sb->s_blocksize >> 9) : 0; 65 66 return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); 67 } 68 69 /* 70 * The ext4 forget function must perform a revoke if we are freeing data 71 * which has been journaled. Metadata (eg. indirect blocks) must be 72 * revoked in all cases. 73 * 74 * "bh" may be NULL: a metadata block may have been freed from memory 75 * but there may still be a record of it in the journal, and that record 76 * still needs to be revoked. 77 * 78 * If the handle isn't valid we're not journaling so there's nothing to do. 79 */ 80 int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, 81 struct buffer_head *bh, ext4_fsblk_t blocknr) 82 { 83 int err; 84 85 if (!ext4_handle_valid(handle)) 86 return 0; 87 88 might_sleep(); 89 90 BUFFER_TRACE(bh, "enter"); 91 92 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " 93 "data mode %lx\n", 94 bh, is_metadata, inode->i_mode, 95 test_opt(inode->i_sb, DATA_FLAGS)); 96 97 /* Never use the revoke function if we are doing full data 98 * journaling: there is no need to, and a V1 superblock won't 99 * support it. Otherwise, only skip the revoke on un-journaled 100 * data blocks. 
*/ 101 102 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || 103 (!is_metadata && !ext4_should_journal_data(inode))) { 104 if (bh) { 105 BUFFER_TRACE(bh, "call jbd2_journal_forget"); 106 return ext4_journal_forget(handle, bh); 107 } 108 return 0; 109 } 110 111 /* 112 * data!=journal && (is_metadata || should_journal_data(inode)) 113 */ 114 BUFFER_TRACE(bh, "call ext4_journal_revoke"); 115 err = ext4_journal_revoke(handle, blocknr, bh); 116 if (err) 117 ext4_abort(inode->i_sb, __func__, 118 "error %d when attempting revoke", err); 119 BUFFER_TRACE(bh, "exit"); 120 return err; 121 } 122 123 /* 124 * Work out how many blocks we need to proceed with the next chunk of a 125 * truncate transaction. 126 */ 127 static unsigned long blocks_for_truncate(struct inode *inode) 128 { 129 ext4_lblk_t needed; 130 131 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); 132 133 /* Give ourselves just enough room to cope with inodes in which 134 * i_blocks is corrupt: we've seen disk corruptions in the past 135 * which resulted in random data in an inode which looked enough 136 * like a regular file for ext4 to try to delete it. Things 137 * will go a bit crazy if that happens, but at least we should 138 * try not to panic the whole kernel. */ 139 if (needed < 2) 140 needed = 2; 141 142 /* But we need to bound the transaction so we don't overflow the 143 * journal. */ 144 if (needed > EXT4_MAX_TRANS_DATA) 145 needed = EXT4_MAX_TRANS_DATA; 146 147 return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; 148 } 149 150 /* 151 * Truncate transactions can be complex and absolutely huge. So we need to 152 * be able to restart the transaction at a conventient checkpoint to make 153 * sure we don't overflow the journal. 154 * 155 * start_transaction gets us a new handle for a truncate transaction, 156 * and extend_transaction tries to extend the existing one a bit. If 157 * extend fails, we need to propagate the failure up and restart the 158 * transaction in the top-level truncate loop. --sct 159 */ 160 static handle_t *start_transaction(struct inode *inode) 161 { 162 handle_t *result; 163 164 result = ext4_journal_start(inode, blocks_for_truncate(inode)); 165 if (!IS_ERR(result)) 166 return result; 167 168 ext4_std_error(inode->i_sb, PTR_ERR(result)); 169 return result; 170 } 171 172 /* 173 * Try to extend this transaction for the purposes of truncation. 174 * 175 * Returns 0 if we managed to create more room. If we can't create more 176 * room, and the transaction must be restarted we return 1. 177 */ 178 static int try_to_extend_transaction(handle_t *handle, struct inode *inode) 179 { 180 if (!ext4_handle_valid(handle)) 181 return 0; 182 if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) 183 return 0; 184 if (!ext4_journal_extend(handle, blocks_for_truncate(inode))) 185 return 0; 186 return 1; 187 } 188 189 /* 190 * Restart the transaction associated with *handle. This does a commit, 191 * so before we call here everything must be consistently dirtied against 192 * this transaction. 193 */ 194 static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) 195 { 196 BUG_ON(EXT4_JOURNAL(inode) == NULL); 197 jbd_debug(2, "restarting handle %p\n", handle); 198 return ext4_journal_restart(handle, blocks_for_truncate(inode)); 199 } 200 201 /* 202 * Called at the last iput() if i_nlink is zero. 
203 */ 204 void ext4_delete_inode(struct inode *inode) 205 { 206 handle_t *handle; 207 int err; 208 209 if (ext4_should_order_data(inode)) 210 ext4_begin_ordered_truncate(inode, 0); 211 truncate_inode_pages(&inode->i_data, 0); 212 213 if (is_bad_inode(inode)) 214 goto no_delete; 215 216 handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3); 217 if (IS_ERR(handle)) { 218 ext4_std_error(inode->i_sb, PTR_ERR(handle)); 219 /* 220 * If we're going to skip the normal cleanup, we still need to 221 * make sure that the in-core orphan linked list is properly 222 * cleaned up. 223 */ 224 ext4_orphan_del(NULL, inode); 225 goto no_delete; 226 } 227 228 if (IS_SYNC(inode)) 229 ext4_handle_sync(handle); 230 inode->i_size = 0; 231 err = ext4_mark_inode_dirty(handle, inode); 232 if (err) { 233 ext4_warning(inode->i_sb, __func__, 234 "couldn't mark inode dirty (err %d)", err); 235 goto stop_handle; 236 } 237 if (inode->i_blocks) 238 ext4_truncate(inode); 239 240 /* 241 * ext4_ext_truncate() doesn't reserve any slop when it 242 * restarts journal transactions; therefore there may not be 243 * enough credits left in the handle to remove the inode from 244 * the orphan list and set the dtime field. 245 */ 246 if (!ext4_handle_has_enough_credits(handle, 3)) { 247 err = ext4_journal_extend(handle, 3); 248 if (err > 0) 249 err = ext4_journal_restart(handle, 3); 250 if (err != 0) { 251 ext4_warning(inode->i_sb, __func__, 252 "couldn't extend journal (err %d)", err); 253 stop_handle: 254 ext4_journal_stop(handle); 255 goto no_delete; 256 } 257 } 258 259 /* 260 * Kill off the orphan record which ext4_truncate created. 261 * AKPM: I think this can be inside the above `if'. 262 * Note that ext4_orphan_del() has to be able to cope with the 263 * deletion of a non-existent orphan - this is because we don't 264 * know if ext4_truncate() actually created an orphan record. 265 * (Well, we could do this if we need to, but heck - it works) 266 */ 267 ext4_orphan_del(handle, inode); 268 EXT4_I(inode)->i_dtime = get_seconds(); 269 270 /* 271 * One subtle ordering requirement: if anything has gone wrong 272 * (transaction abort, IO errors, whatever), then we can still 273 * do these next steps (the fs will already have been marked as 274 * having errors), but we can't free the inode if the mark_dirty 275 * fails. 276 */ 277 if (ext4_mark_inode_dirty(handle, inode)) 278 /* If that failed, just do the required in-core inode clear. */ 279 clear_inode(inode); 280 else 281 ext4_free_inode(handle, inode); 282 ext4_journal_stop(handle); 283 return; 284 no_delete: 285 clear_inode(inode); /* We must guarantee clearing of inode... */ 286 } 287 288 typedef struct { 289 __le32 *p; 290 __le32 key; 291 struct buffer_head *bh; 292 } Indirect; 293 294 static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) 295 { 296 p->key = *(p->p = v); 297 p->bh = bh; 298 } 299 300 /** 301 * ext4_block_to_path - parse the block number into array of offsets 302 * @inode: inode in question (we are only interested in its superblock) 303 * @i_block: block number to be parsed 304 * @offsets: array to store the offsets in 305 * @boundary: set this non-zero if the referred-to block is likely to be 306 * followed (on disk) by an indirect block. 307 * 308 * To store the locations of file's data ext4 uses a data structure common 309 * for UNIX filesystems - tree of pointers anchored in the inode, with 310 * data blocks at leaves and indirect blocks in intermediate nodes. 
311 * This function translates the block number into path in that tree - 312 * return value is the path length and @offsets[n] is the offset of 313 * pointer to (n+1)th node in the nth one. If @block is out of range 314 * (negative or too large) warning is printed and zero returned. 315 * 316 * Note: function doesn't find node addresses, so no IO is needed. All 317 * we need to know is the capacity of indirect blocks (taken from the 318 * inode->i_sb). 319 */ 320 321 /* 322 * Portability note: the last comparison (check that we fit into triple 323 * indirect block) is spelled differently, because otherwise on an 324 * architecture with 32-bit longs and 8Kb pages we might get into trouble 325 * if our filesystem had 8Kb blocks. We might use long long, but that would 326 * kill us on x86. Oh, well, at least the sign propagation does not matter - 327 * i_block would have to be negative in the very beginning, so we would not 328 * get there at all. 329 */ 330 331 static int ext4_block_to_path(struct inode *inode, 332 ext4_lblk_t i_block, 333 ext4_lblk_t offsets[4], int *boundary) 334 { 335 int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); 336 int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); 337 const long direct_blocks = EXT4_NDIR_BLOCKS, 338 indirect_blocks = ptrs, 339 double_blocks = (1 << (ptrs_bits * 2)); 340 int n = 0; 341 int final = 0; 342 343 if (i_block < 0) { 344 ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0"); 345 } else if (i_block < direct_blocks) { 346 offsets[n++] = i_block; 347 final = direct_blocks; 348 } else if ((i_block -= direct_blocks) < indirect_blocks) { 349 offsets[n++] = EXT4_IND_BLOCK; 350 offsets[n++] = i_block; 351 final = ptrs; 352 } else if ((i_block -= indirect_blocks) < double_blocks) { 353 offsets[n++] = EXT4_DIND_BLOCK; 354 offsets[n++] = i_block >> ptrs_bits; 355 offsets[n++] = i_block & (ptrs - 1); 356 final = ptrs; 357 } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { 358 offsets[n++] = EXT4_TIND_BLOCK; 359 offsets[n++] = i_block >> (ptrs_bits * 2); 360 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); 361 offsets[n++] = i_block & (ptrs - 1); 362 final = ptrs; 363 } else { 364 ext4_warning(inode->i_sb, "ext4_block_to_path", 365 "block %lu > max in inode %lu", 366 i_block + direct_blocks + 367 indirect_blocks + double_blocks, inode->i_ino); 368 } 369 if (boundary) 370 *boundary = final - 1 - (i_block & (ptrs - 1)); 371 return n; 372 } 373 374 static int __ext4_check_blockref(const char *function, struct inode *inode, 375 __le32 *p, unsigned int max) { 376 377 unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es); 378 __le32 *bref = p; 379 while (bref < p+max) { 380 if (unlikely(le32_to_cpu(*bref) >= maxblocks)) { 381 ext4_error(inode->i_sb, function, 382 "block reference %u >= max (%u) " 383 "in inode #%lu, offset=%d", 384 le32_to_cpu(*bref), maxblocks, 385 inode->i_ino, (int)(bref-p)); 386 return -EIO; 387 } 388 bref++; 389 } 390 return 0; 391 } 392 393 394 #define ext4_check_indirect_blockref(inode, bh) \ 395 __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data, \ 396 EXT4_ADDR_PER_BLOCK((inode)->i_sb)) 397 398 #define ext4_check_inode_blockref(inode) \ 399 __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data, \ 400 EXT4_NDIR_BLOCKS) 401 402 /** 403 * ext4_get_branch - read the chain of indirect blocks leading to data 404 * @inode: inode in question 405 * @depth: depth of the chain (1 - direct pointer, etc.) 
406 * @offsets: offsets of pointers in inode/indirect blocks 407 * @chain: place to store the result 408 * @err: here we store the error value 409 * 410 * Function fills the array of triples <key, p, bh> and returns %NULL 411 * if everything went OK or the pointer to the last filled triple 412 * (incomplete one) otherwise. Upon the return chain[i].key contains 413 * the number of (i+1)-th block in the chain (as it is stored in memory, 414 * i.e. little-endian 32-bit), chain[i].p contains the address of that 415 * number (it points into struct inode for i==0 and into the bh->b_data 416 * for i>0) and chain[i].bh points to the buffer_head of i-th indirect 417 * block for i>0 and NULL for i==0. In other words, it holds the block 418 * numbers of the chain, addresses they were taken from (and where we can 419 * verify that chain did not change) and buffer_heads hosting these 420 * numbers. 421 * 422 * Function stops when it stumbles upon zero pointer (absent block) 423 * (pointer to last triple returned, *@err == 0) 424 * or when it gets an IO error reading an indirect block 425 * (ditto, *@err == -EIO) 426 * or when it reads all @depth-1 indirect blocks successfully and finds 427 * the whole chain, all way to the data (returns %NULL, *err == 0). 428 * 429 * Need to be called with 430 * down_read(&EXT4_I(inode)->i_data_sem) 431 */ 432 static Indirect *ext4_get_branch(struct inode *inode, int depth, 433 ext4_lblk_t *offsets, 434 Indirect chain[4], int *err) 435 { 436 struct super_block *sb = inode->i_sb; 437 Indirect *p = chain; 438 struct buffer_head *bh; 439 440 *err = 0; 441 /* i_data is not going away, no lock needed */ 442 add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); 443 if (!p->key) 444 goto no_block; 445 while (--depth) { 446 bh = sb_getblk(sb, le32_to_cpu(p->key)); 447 if (unlikely(!bh)) 448 goto failure; 449 450 if (!bh_uptodate_or_lock(bh)) { 451 if (bh_submit_read(bh) < 0) { 452 put_bh(bh); 453 goto failure; 454 } 455 /* validate block references */ 456 if (ext4_check_indirect_blockref(inode, bh)) { 457 put_bh(bh); 458 goto failure; 459 } 460 } 461 462 add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); 463 /* Reader: end */ 464 if (!p->key) 465 goto no_block; 466 } 467 return NULL; 468 469 failure: 470 *err = -EIO; 471 no_block: 472 return p; 473 } 474 475 /** 476 * ext4_find_near - find a place for allocation with sufficient locality 477 * @inode: owner 478 * @ind: descriptor of indirect block. 479 * 480 * This function returns the preferred place for block allocation. 481 * It is used when heuristic for sequential allocation fails. 482 * Rules are: 483 * + if there is a block to the left of our position - allocate near it. 484 * + if pointer will live in indirect block - allocate near that block. 485 * + if pointer will live in inode - allocate in the same 486 * cylinder group. 487 * 488 * In the latter case we colour the starting block by the callers PID to 489 * prevent it from clashing with concurrent allocations for a different inode 490 * in the same block group. The PID is used here so that functionally related 491 * files will be close-by on-disk. 492 * 493 * Caller must make sure that @ind is valid and will stay that way. 494 */ 495 static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) 496 { 497 struct ext4_inode_info *ei = EXT4_I(inode); 498 __le32 *start = ind->bh ? 
(__le32 *) ind->bh->b_data : ei->i_data; 499 __le32 *p; 500 ext4_fsblk_t bg_start; 501 ext4_fsblk_t last_block; 502 ext4_grpblk_t colour; 503 ext4_group_t block_group; 504 int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); 505 506 /* Try to find previous block */ 507 for (p = ind->p - 1; p >= start; p--) { 508 if (*p) 509 return le32_to_cpu(*p); 510 } 511 512 /* No such thing, so let's try location of indirect block */ 513 if (ind->bh) 514 return ind->bh->b_blocknr; 515 516 /* 517 * It is going to be referred to from the inode itself? OK, just put it 518 * into the same cylinder group then. 519 */ 520 block_group = ei->i_block_group; 521 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { 522 block_group &= ~(flex_size-1); 523 if (S_ISREG(inode->i_mode)) 524 block_group++; 525 } 526 bg_start = ext4_group_first_block_no(inode->i_sb, block_group); 527 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; 528 529 /* 530 * If we are doing delayed allocation, we don't need take 531 * colour into account. 532 */ 533 if (test_opt(inode->i_sb, DELALLOC)) 534 return bg_start; 535 536 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) 537 colour = (current->pid % 16) * 538 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); 539 else 540 colour = (current->pid % 16) * ((last_block - bg_start) / 16); 541 return bg_start + colour; 542 } 543 544 /** 545 * ext4_find_goal - find a preferred place for allocation. 546 * @inode: owner 547 * @block: block we want 548 * @partial: pointer to the last triple within a chain 549 * 550 * Normally this function find the preferred place for block allocation, 551 * returns it. 552 */ 553 static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, 554 Indirect *partial) 555 { 556 /* 557 * XXX need to get goal block from mballoc's data structures 558 */ 559 560 return ext4_find_near(inode, partial); 561 } 562 563 /** 564 * ext4_blks_to_allocate: Look up the block map and count the number 565 * of direct blocks need to be allocated for the given branch. 566 * 567 * @branch: chain of indirect blocks 568 * @k: number of blocks need for indirect blocks 569 * @blks: number of data blocks to be mapped. 570 * @blocks_to_boundary: the offset in the indirect block 571 * 572 * return the total number of blocks to be allocate, including the 573 * direct and indirect blocks. 
574 */ 575 static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, 576 int blocks_to_boundary) 577 { 578 unsigned int count = 0; 579 580 /* 581 * Simple case, [t,d]Indirect block(s) has not allocated yet 582 * then it's clear blocks on that path have not allocated 583 */ 584 if (k > 0) { 585 /* right now we don't handle cross boundary allocation */ 586 if (blks < blocks_to_boundary + 1) 587 count += blks; 588 else 589 count += blocks_to_boundary + 1; 590 return count; 591 } 592 593 count++; 594 while (count < blks && count <= blocks_to_boundary && 595 le32_to_cpu(*(branch[0].p + count)) == 0) { 596 count++; 597 } 598 return count; 599 } 600 601 /** 602 * ext4_alloc_blocks: multiple allocate blocks needed for a branch 603 * @indirect_blks: the number of blocks need to allocate for indirect 604 * blocks 605 * 606 * @new_blocks: on return it will store the new block numbers for 607 * the indirect blocks(if needed) and the first direct block, 608 * @blks: on return it will store the total number of allocated 609 * direct blocks 610 */ 611 static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, 612 ext4_lblk_t iblock, ext4_fsblk_t goal, 613 int indirect_blks, int blks, 614 ext4_fsblk_t new_blocks[4], int *err) 615 { 616 struct ext4_allocation_request ar; 617 int target, i; 618 unsigned long count = 0, blk_allocated = 0; 619 int index = 0; 620 ext4_fsblk_t current_block = 0; 621 int ret = 0; 622 623 /* 624 * Here we try to allocate the requested multiple blocks at once, 625 * on a best-effort basis. 626 * To build a branch, we should allocate blocks for 627 * the indirect blocks(if not allocated yet), and at least 628 * the first direct block of this branch. That's the 629 * minimum number of blocks need to allocate(required) 630 */ 631 /* first we try to allocate the indirect blocks */ 632 target = indirect_blks; 633 while (target > 0) { 634 count = target; 635 /* allocating blocks for indirect blocks and direct blocks */ 636 current_block = ext4_new_meta_blocks(handle, inode, 637 goal, &count, err); 638 if (*err) 639 goto failed_out; 640 641 target -= count; 642 /* allocate blocks for indirect blocks */ 643 while (index < indirect_blks && count) { 644 new_blocks[index++] = current_block++; 645 count--; 646 } 647 if (count > 0) { 648 /* 649 * save the new block number 650 * for the first direct block 651 */ 652 new_blocks[index] = current_block; 653 printk(KERN_INFO "%s returned more blocks than " 654 "requested\n", __func__); 655 WARN_ON(1); 656 break; 657 } 658 } 659 660 target = blks - count ; 661 blk_allocated = count; 662 if (!target) 663 goto allocated; 664 /* Now allocate data blocks */ 665 memset(&ar, 0, sizeof(ar)); 666 ar.inode = inode; 667 ar.goal = goal; 668 ar.len = target; 669 ar.logical = iblock; 670 if (S_ISREG(inode->i_mode)) 671 /* enable in-core preallocation only for regular files */ 672 ar.flags = EXT4_MB_HINT_DATA; 673 674 current_block = ext4_mb_new_blocks(handle, &ar, err); 675 676 if (*err && (target == blks)) { 677 /* 678 * if the allocation failed and we didn't allocate 679 * any blocks before 680 */ 681 goto failed_out; 682 } 683 if (!*err) { 684 if (target == blks) { 685 /* 686 * save the new block number 687 * for the first direct block 688 */ 689 new_blocks[index] = current_block; 690 } 691 blk_allocated += ar.len; 692 } 693 allocated: 694 /* total number of blocks allocated for direct blocks */ 695 ret = blk_allocated; 696 *err = 0; 697 return ret; 698 failed_out: 699 for (i = 0; i < index; i++) 700 ext4_free_blocks(handle, inode, 
new_blocks[i], 1, 0); 701 return ret; 702 } 703 704 /** 705 * ext4_alloc_branch - allocate and set up a chain of blocks. 706 * @inode: owner 707 * @indirect_blks: number of allocated indirect blocks 708 * @blks: number of allocated direct blocks 709 * @offsets: offsets (in the blocks) to store the pointers to next. 710 * @branch: place to store the chain in. 711 * 712 * This function allocates blocks, zeroes out all but the last one, 713 * links them into chain and (if we are synchronous) writes them to disk. 714 * In other words, it prepares a branch that can be spliced onto the 715 * inode. It stores the information about that chain in the branch[], in 716 * the same format as ext4_get_branch() would do. We are calling it after 717 * we had read the existing part of chain and partial points to the last 718 * triple of that (one with zero ->key). Upon the exit we have the same 719 * picture as after the successful ext4_get_block(), except that in one 720 * place chain is disconnected - *branch->p is still zero (we did not 721 * set the last link), but branch->key contains the number that should 722 * be placed into *branch->p to fill that gap. 723 * 724 * If allocation fails we free all blocks we've allocated (and forget 725 * their buffer_heads) and return the error value the from failed 726 * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain 727 * as described above and return 0. 728 */ 729 static int ext4_alloc_branch(handle_t *handle, struct inode *inode, 730 ext4_lblk_t iblock, int indirect_blks, 731 int *blks, ext4_fsblk_t goal, 732 ext4_lblk_t *offsets, Indirect *branch) 733 { 734 int blocksize = inode->i_sb->s_blocksize; 735 int i, n = 0; 736 int err = 0; 737 struct buffer_head *bh; 738 int num; 739 ext4_fsblk_t new_blocks[4]; 740 ext4_fsblk_t current_block; 741 742 num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, 743 *blks, new_blocks, &err); 744 if (err) 745 return err; 746 747 branch[0].key = cpu_to_le32(new_blocks[0]); 748 /* 749 * metadata blocks and data blocks are allocated. 750 */ 751 for (n = 1; n <= indirect_blks; n++) { 752 /* 753 * Get buffer_head for parent block, zero it out 754 * and set the pointer to new one, then send 755 * parent to disk. 
756 */ 757 bh = sb_getblk(inode->i_sb, new_blocks[n-1]); 758 branch[n].bh = bh; 759 lock_buffer(bh); 760 BUFFER_TRACE(bh, "call get_create_access"); 761 err = ext4_journal_get_create_access(handle, bh); 762 if (err) { 763 unlock_buffer(bh); 764 brelse(bh); 765 goto failed; 766 } 767 768 memset(bh->b_data, 0, blocksize); 769 branch[n].p = (__le32 *) bh->b_data + offsets[n]; 770 branch[n].key = cpu_to_le32(new_blocks[n]); 771 *branch[n].p = branch[n].key; 772 if (n == indirect_blks) { 773 current_block = new_blocks[n]; 774 /* 775 * End of chain, update the last new metablock of 776 * the chain to point to the new allocated 777 * data blocks numbers 778 */ 779 for (i=1; i < num; i++) 780 *(branch[n].p + i) = cpu_to_le32(++current_block); 781 } 782 BUFFER_TRACE(bh, "marking uptodate"); 783 set_buffer_uptodate(bh); 784 unlock_buffer(bh); 785 786 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 787 err = ext4_handle_dirty_metadata(handle, inode, bh); 788 if (err) 789 goto failed; 790 } 791 *blks = num; 792 return err; 793 failed: 794 /* Allocation failed, free what we already allocated */ 795 for (i = 1; i <= n ; i++) { 796 BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget"); 797 ext4_journal_forget(handle, branch[i].bh); 798 } 799 for (i = 0; i < indirect_blks; i++) 800 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); 801 802 ext4_free_blocks(handle, inode, new_blocks[i], num, 0); 803 804 return err; 805 } 806 807 /** 808 * ext4_splice_branch - splice the allocated branch onto inode. 809 * @inode: owner 810 * @block: (logical) number of block we are adding 811 * @chain: chain of indirect blocks (with a missing link - see 812 * ext4_alloc_branch) 813 * @where: location of missing link 814 * @num: number of indirect blocks we are adding 815 * @blks: number of direct blocks we are adding 816 * 817 * This function fills the missing link and does all housekeeping needed in 818 * inode (->i_blocks, etc.). In case of success we end up with the full 819 * chain to new block and return 0. 820 */ 821 static int ext4_splice_branch(handle_t *handle, struct inode *inode, 822 ext4_lblk_t block, Indirect *where, int num, int blks) 823 { 824 int i; 825 int err = 0; 826 ext4_fsblk_t current_block; 827 828 /* 829 * If we're splicing into a [td]indirect block (as opposed to the 830 * inode) then we need to get write access to the [td]indirect block 831 * before the splice. 832 */ 833 if (where->bh) { 834 BUFFER_TRACE(where->bh, "get_write_access"); 835 err = ext4_journal_get_write_access(handle, where->bh); 836 if (err) 837 goto err_out; 838 } 839 /* That's it */ 840 841 *where->p = where->key; 842 843 /* 844 * Update the host buffer_head or inode to point to more just allocated 845 * direct blocks blocks 846 */ 847 if (num == 0 && blks > 1) { 848 current_block = le32_to_cpu(where->key) + 1; 849 for (i = 1; i < blks; i++) 850 *(where->p + i) = cpu_to_le32(current_block++); 851 } 852 853 /* We are done with atomic stuff, now do the rest of housekeeping */ 854 855 inode->i_ctime = ext4_current_time(inode); 856 ext4_mark_inode_dirty(handle, inode); 857 858 /* had we spliced it onto indirect block? */ 859 if (where->bh) { 860 /* 861 * If we spliced it onto an indirect block, we haven't 862 * altered the inode. Note however that if it is being spliced 863 * onto an indirect block at the very end of the file (the 864 * file is growing) then we *will* alter the inode to reflect 865 * the new i_size. But that is not done here - it is done in 866 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. 
867 */ 868 jbd_debug(5, "splicing indirect only\n"); 869 BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); 870 err = ext4_handle_dirty_metadata(handle, inode, where->bh); 871 if (err) 872 goto err_out; 873 } else { 874 /* 875 * OK, we spliced it into the inode itself on a direct block. 876 * Inode was dirtied above. 877 */ 878 jbd_debug(5, "splicing direct\n"); 879 } 880 return err; 881 882 err_out: 883 for (i = 1; i <= num; i++) { 884 BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget"); 885 ext4_journal_forget(handle, where[i].bh); 886 ext4_free_blocks(handle, inode, 887 le32_to_cpu(where[i-1].key), 1, 0); 888 } 889 ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0); 890 891 return err; 892 } 893 894 /* 895 * The ext4_ind_get_blocks() function handles non-extents inodes 896 * (i.e., using the traditional indirect/double-indirect i_blocks 897 * scheme) for ext4_get_blocks(). 898 * 899 * Allocation strategy is simple: if we have to allocate something, we will 900 * have to go the whole way to leaf. So let's do it before attaching anything 901 * to tree, set linkage between the newborn blocks, write them if sync is 902 * required, recheck the path, free and repeat if check fails, otherwise 903 * set the last missing link (that will protect us from any truncate-generated 904 * removals - all blocks on the path are immune now) and possibly force the 905 * write on the parent block. 906 * That has a nice additional property: no special recovery from the failed 907 * allocations is needed - we simply release blocks and do not touch anything 908 * reachable from inode. 909 * 910 * `handle' can be NULL if create == 0. 911 * 912 * return > 0, # of blocks mapped or allocated. 913 * return = 0, if plain lookup failed. 914 * return < 0, error case. 915 * 916 * The ext4_ind_get_blocks() function should be called with 917 * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem 918 * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or 919 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system 920 * blocks. 
921 */ 922 static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, 923 ext4_lblk_t iblock, unsigned int maxblocks, 924 struct buffer_head *bh_result, 925 int flags) 926 { 927 int err = -EIO; 928 ext4_lblk_t offsets[4]; 929 Indirect chain[4]; 930 Indirect *partial; 931 ext4_fsblk_t goal; 932 int indirect_blks; 933 int blocks_to_boundary = 0; 934 int depth; 935 struct ext4_inode_info *ei = EXT4_I(inode); 936 int count = 0; 937 ext4_fsblk_t first_block = 0; 938 loff_t disksize; 939 940 941 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); 942 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); 943 depth = ext4_block_to_path(inode, iblock, offsets, 944 &blocks_to_boundary); 945 946 if (depth == 0) 947 goto out; 948 949 partial = ext4_get_branch(inode, depth, offsets, chain, &err); 950 951 /* Simplest case - block found, no allocation needed */ 952 if (!partial) { 953 first_block = le32_to_cpu(chain[depth - 1].key); 954 clear_buffer_new(bh_result); 955 count++; 956 /*map more blocks*/ 957 while (count < maxblocks && count <= blocks_to_boundary) { 958 ext4_fsblk_t blk; 959 960 blk = le32_to_cpu(*(chain[depth-1].p + count)); 961 962 if (blk == first_block + count) 963 count++; 964 else 965 break; 966 } 967 goto got_it; 968 } 969 970 /* Next simple case - plain lookup or failed read of indirect block */ 971 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) 972 goto cleanup; 973 974 /* 975 * Okay, we need to do block allocation. 976 */ 977 goal = ext4_find_goal(inode, iblock, partial); 978 979 /* the number of blocks need to allocate for [d,t]indirect blocks */ 980 indirect_blks = (chain + depth) - partial - 1; 981 982 /* 983 * Next look up the indirect map to count the totoal number of 984 * direct blocks to allocate for this branch. 985 */ 986 count = ext4_blks_to_allocate(partial, indirect_blks, 987 maxblocks, blocks_to_boundary); 988 /* 989 * Block out ext4_truncate while we alter the tree 990 */ 991 err = ext4_alloc_branch(handle, inode, iblock, indirect_blks, 992 &count, goal, 993 offsets + (partial - chain), partial); 994 995 /* 996 * The ext4_splice_branch call will free and forget any buffers 997 * on the new chain if there is a failure, but that risks using 998 * up transaction credits, especially for bitmaps where the 999 * credits cannot be returned. Can we handle this somehow? We 1000 * may need to return -EAGAIN upwards in the worst case. --sct 1001 */ 1002 if (!err) 1003 err = ext4_splice_branch(handle, inode, iblock, 1004 partial, indirect_blks, count); 1005 /* 1006 * i_disksize growing is protected by i_data_sem. 
Don't forget to 1007 * protect it if you're about to implement concurrent 1008 * ext4_get_block() -bzzz 1009 */ 1010 if (!err && (flags & EXT4_GET_BLOCKS_EXTEND_DISKSIZE)) { 1011 disksize = ((loff_t) iblock + count) << inode->i_blkbits; 1012 if (disksize > i_size_read(inode)) 1013 disksize = i_size_read(inode); 1014 if (disksize > ei->i_disksize) 1015 ei->i_disksize = disksize; 1016 } 1017 if (err) 1018 goto cleanup; 1019 1020 set_buffer_new(bh_result); 1021 got_it: 1022 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); 1023 if (count > blocks_to_boundary) 1024 set_buffer_boundary(bh_result); 1025 err = count; 1026 /* Clean up and exit */ 1027 partial = chain + depth - 1; /* the whole chain */ 1028 cleanup: 1029 while (partial > chain) { 1030 BUFFER_TRACE(partial->bh, "call brelse"); 1031 brelse(partial->bh); 1032 partial--; 1033 } 1034 BUFFER_TRACE(bh_result, "returned"); 1035 out: 1036 return err; 1037 } 1038 1039 qsize_t ext4_get_reserved_space(struct inode *inode) 1040 { 1041 unsigned long long total; 1042 1043 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1044 total = EXT4_I(inode)->i_reserved_data_blocks + 1045 EXT4_I(inode)->i_reserved_meta_blocks; 1046 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1047 1048 return total; 1049 } 1050 /* 1051 * Calculate the number of metadata blocks need to reserve 1052 * to allocate @blocks for non extent file based file 1053 */ 1054 static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks) 1055 { 1056 int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb); 1057 int ind_blks, dind_blks, tind_blks; 1058 1059 /* number of new indirect blocks needed */ 1060 ind_blks = (blocks + icap - 1) / icap; 1061 1062 dind_blks = (ind_blks + icap - 1) / icap; 1063 1064 tind_blks = 1; 1065 1066 return ind_blks + dind_blks + tind_blks; 1067 } 1068 1069 /* 1070 * Calculate the number of metadata blocks need to reserve 1071 * to allocate given number of blocks 1072 */ 1073 static int ext4_calc_metadata_amount(struct inode *inode, int blocks) 1074 { 1075 if (!blocks) 1076 return 0; 1077 1078 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) 1079 return ext4_ext_calc_metadata_amount(inode, blocks); 1080 1081 return ext4_indirect_calc_metadata_amount(inode, blocks); 1082 } 1083 1084 static void ext4_da_update_reserve_space(struct inode *inode, int used) 1085 { 1086 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1087 int total, mdb, mdb_free; 1088 1089 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1090 /* recalculate the number of metablocks still need to be reserved */ 1091 total = EXT4_I(inode)->i_reserved_data_blocks - used; 1092 mdb = ext4_calc_metadata_amount(inode, total); 1093 1094 /* figure out how many metablocks to release */ 1095 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); 1096 mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; 1097 1098 if (mdb_free) { 1099 /* Account for allocated meta_blocks */ 1100 mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; 1101 1102 /* update fs dirty blocks counter */ 1103 percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free); 1104 EXT4_I(inode)->i_allocated_meta_blocks = 0; 1105 EXT4_I(inode)->i_reserved_meta_blocks = mdb; 1106 } 1107 1108 /* update per-inode reservations */ 1109 BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks); 1110 EXT4_I(inode)->i_reserved_data_blocks -= used; 1111 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1112 1113 /* 1114 * free those over-booking quota for metadata blocks 1115 */ 1116 if (mdb_free) 1117 
vfs_dq_release_reservation_block(inode, mdb_free); 1118 1119 /* 1120 * If we have done all the pending block allocations and if 1121 * there aren't any writers on the inode, we can discard the 1122 * inode's preallocations. 1123 */ 1124 if (!total && (atomic_read(&inode->i_writecount) == 0)) 1125 ext4_discard_preallocations(inode); 1126 } 1127 1128 /* 1129 * The ext4_get_blocks() function tries to look up the requested blocks, 1130 * and returns if the blocks are already mapped. 1131 * 1132 * Otherwise it takes the write lock of the i_data_sem and allocate blocks 1133 * and store the allocated blocks in the result buffer head and mark it 1134 * mapped. 1135 * 1136 * If file type is extents based, it will call ext4_ext_get_blocks(), 1137 * Otherwise, call with ext4_ind_get_blocks() to handle indirect mapping 1138 * based files 1139 * 1140 * On success, it returns the number of blocks being mapped or allocate. 1141 * if create==0 and the blocks are pre-allocated and uninitialized block, 1142 * the result buffer head is unmapped. If the create ==1, it will make sure 1143 * the buffer head is mapped. 1144 * 1145 * It returns 0 if plain look up failed (blocks have not been allocated), in 1146 * that casem, buffer head is unmapped 1147 * 1148 * It returns the error in case of allocation failure. 1149 */ 1150 int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, 1151 unsigned int max_blocks, struct buffer_head *bh, 1152 int flags) 1153 { 1154 int retval; 1155 1156 clear_buffer_mapped(bh); 1157 clear_buffer_unwritten(bh); 1158 1159 /* 1160 * Try to see if we can get the block without requesting a new 1161 * file system block. 1162 */ 1163 down_read((&EXT4_I(inode)->i_data_sem)); 1164 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 1165 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, 1166 bh, 0); 1167 } else { 1168 retval = ext4_ind_get_blocks(handle, inode, block, max_blocks, 1169 bh, 0); 1170 } 1171 up_read((&EXT4_I(inode)->i_data_sem)); 1172 1173 /* If it is only a block(s) look up */ 1174 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) 1175 return retval; 1176 1177 /* 1178 * Returns if the blocks have already allocated 1179 * 1180 * Note that if blocks have been preallocated 1181 * ext4_ext_get_block() returns th create = 0 1182 * with buffer head unmapped. 1183 */ 1184 if (retval > 0 && buffer_mapped(bh)) 1185 return retval; 1186 1187 /* 1188 * When we call get_blocks without the create flag, the 1189 * BH_Unwritten flag could have gotten set if the blocks 1190 * requested were part of a uninitialized extent. We need to 1191 * clear this flag now that we are committed to convert all or 1192 * part of the uninitialized extent to be an initialized 1193 * extent. This is because we need to avoid the combination 1194 * of BH_Unwritten and BH_Mapped flags being simultaneously 1195 * set on the buffer_head. 1196 */ 1197 clear_buffer_unwritten(bh); 1198 1199 /* 1200 * New blocks allocate and/or writing to uninitialized extent 1201 * will possibly result in updating i_data, so we take 1202 * the write lock of i_data_sem, and call get_blocks() 1203 * with create == 1 flag. 
1204 */ 1205 down_write((&EXT4_I(inode)->i_data_sem)); 1206 1207 /* 1208 * if the caller is from delayed allocation writeout path 1209 * we have already reserved fs blocks for allocation 1210 * let the underlying get_block() function know to 1211 * avoid double accounting 1212 */ 1213 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) 1214 EXT4_I(inode)->i_delalloc_reserved_flag = 1; 1215 /* 1216 * We need to check for EXT4 here because migrate 1217 * could have changed the inode type in between 1218 */ 1219 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 1220 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, 1221 bh, flags); 1222 } else { 1223 retval = ext4_ind_get_blocks(handle, inode, block, 1224 max_blocks, bh, flags); 1225 1226 if (retval > 0 && buffer_new(bh)) { 1227 /* 1228 * We allocated new blocks which will result in 1229 * i_data's format changing. Force the migrate 1230 * to fail by clearing migrate flags 1231 */ 1232 EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags & 1233 ~EXT4_EXT_MIGRATE; 1234 } 1235 } 1236 1237 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { 1238 EXT4_I(inode)->i_delalloc_reserved_flag = 0; 1239 /* 1240 * Update reserved blocks/metadata blocks 1241 * after successful block allocation 1242 * which were deferred till now 1243 */ 1244 if ((retval > 0) && buffer_delay(bh)) 1245 ext4_da_update_reserve_space(inode, retval); 1246 } 1247 1248 up_write((&EXT4_I(inode)->i_data_sem)); 1249 return retval; 1250 } 1251 1252 /* Maximum number of blocks we map for direct IO at once. */ 1253 #define DIO_MAX_BLOCKS 4096 1254 1255 int ext4_get_block(struct inode *inode, sector_t iblock, 1256 struct buffer_head *bh_result, int create) 1257 { 1258 handle_t *handle = ext4_journal_current_handle(); 1259 int ret = 0, started = 0; 1260 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 1261 int dio_credits; 1262 1263 if (create && !handle) { 1264 /* Direct IO write... */ 1265 if (max_blocks > DIO_MAX_BLOCKS) 1266 max_blocks = DIO_MAX_BLOCKS; 1267 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); 1268 handle = ext4_journal_start(inode, dio_credits); 1269 if (IS_ERR(handle)) { 1270 ret = PTR_ERR(handle); 1271 goto out; 1272 } 1273 started = 1; 1274 } 1275 1276 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, 1277 create ? EXT4_GET_BLOCKS_CREATE : 0); 1278 if (ret > 0) { 1279 bh_result->b_size = (ret << inode->i_blkbits); 1280 ret = 0; 1281 } 1282 if (started) 1283 ext4_journal_stop(handle); 1284 out: 1285 return ret; 1286 } 1287 1288 /* 1289 * `handle' can be NULL if create is zero 1290 */ 1291 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, 1292 ext4_lblk_t block, int create, int *errp) 1293 { 1294 struct buffer_head dummy; 1295 int fatal = 0, err; 1296 int flags = EXT4_GET_BLOCKS_EXTEND_DISKSIZE; 1297 1298 J_ASSERT(handle != NULL || create == 0); 1299 1300 dummy.b_state = 0; 1301 dummy.b_blocknr = -1000; 1302 buffer_trace_init(&dummy.b_history); 1303 if (create) 1304 flags |= EXT4_GET_BLOCKS_CREATE; 1305 err = ext4_get_blocks(handle, inode, block, 1, &dummy, flags); 1306 /* 1307 * ext4_get_blocks() returns number of blocks mapped. 0 in 1308 * case of a HOLE. 
1309 */ 1310 if (err > 0) { 1311 if (err > 1) 1312 WARN_ON(1); 1313 err = 0; 1314 } 1315 *errp = err; 1316 if (!err && buffer_mapped(&dummy)) { 1317 struct buffer_head *bh; 1318 bh = sb_getblk(inode->i_sb, dummy.b_blocknr); 1319 if (!bh) { 1320 *errp = -EIO; 1321 goto err; 1322 } 1323 if (buffer_new(&dummy)) { 1324 J_ASSERT(create != 0); 1325 J_ASSERT(handle != NULL); 1326 1327 /* 1328 * Now that we do not always journal data, we should 1329 * keep in mind whether this should always journal the 1330 * new buffer as metadata. For now, regular file 1331 * writes use ext4_get_block instead, so it's not a 1332 * problem. 1333 */ 1334 lock_buffer(bh); 1335 BUFFER_TRACE(bh, "call get_create_access"); 1336 fatal = ext4_journal_get_create_access(handle, bh); 1337 if (!fatal && !buffer_uptodate(bh)) { 1338 memset(bh->b_data, 0, inode->i_sb->s_blocksize); 1339 set_buffer_uptodate(bh); 1340 } 1341 unlock_buffer(bh); 1342 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 1343 err = ext4_handle_dirty_metadata(handle, inode, bh); 1344 if (!fatal) 1345 fatal = err; 1346 } else { 1347 BUFFER_TRACE(bh, "not a new buffer"); 1348 } 1349 if (fatal) { 1350 *errp = fatal; 1351 brelse(bh); 1352 bh = NULL; 1353 } 1354 return bh; 1355 } 1356 err: 1357 return NULL; 1358 } 1359 1360 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, 1361 ext4_lblk_t block, int create, int *err) 1362 { 1363 struct buffer_head *bh; 1364 1365 bh = ext4_getblk(handle, inode, block, create, err); 1366 if (!bh) 1367 return bh; 1368 if (buffer_uptodate(bh)) 1369 return bh; 1370 ll_rw_block(READ_META, 1, &bh); 1371 wait_on_buffer(bh); 1372 if (buffer_uptodate(bh)) 1373 return bh; 1374 put_bh(bh); 1375 *err = -EIO; 1376 return NULL; 1377 } 1378 1379 static int walk_page_buffers(handle_t *handle, 1380 struct buffer_head *head, 1381 unsigned from, 1382 unsigned to, 1383 int *partial, 1384 int (*fn)(handle_t *handle, 1385 struct buffer_head *bh)) 1386 { 1387 struct buffer_head *bh; 1388 unsigned block_start, block_end; 1389 unsigned blocksize = head->b_size; 1390 int err, ret = 0; 1391 struct buffer_head *next; 1392 1393 for (bh = head, block_start = 0; 1394 ret == 0 && (bh != head || !block_start); 1395 block_start = block_end, bh = next) 1396 { 1397 next = bh->b_this_page; 1398 block_end = block_start + blocksize; 1399 if (block_end <= from || block_start >= to) { 1400 if (partial && !buffer_uptodate(bh)) 1401 *partial = 1; 1402 continue; 1403 } 1404 err = (*fn)(handle, bh); 1405 if (!ret) 1406 ret = err; 1407 } 1408 return ret; 1409 } 1410 1411 /* 1412 * To preserve ordering, it is essential that the hole instantiation and 1413 * the data write be encapsulated in a single transaction. We cannot 1414 * close off a transaction and start a new one between the ext4_get_block() 1415 * and the commit_write(). So doing the jbd2_journal_start at the start of 1416 * prepare_write() is the right place. 1417 * 1418 * Also, this function can nest inside ext4_writepage() -> 1419 * block_write_full_page(). In that case, we *know* that ext4_writepage() 1420 * has generated enough buffer credits to do the whole page. So we won't 1421 * block on the journal in that case, which is good, because the caller may 1422 * be PF_MEMALLOC. 1423 * 1424 * By accident, ext4 can be reentered when a transaction is open via 1425 * quota file writes. 
If we were to commit the transaction while thus 1426 * reentered, there can be a deadlock - we would be holding a quota 1427 * lock, and the commit would never complete if another thread had a 1428 * transaction open and was blocking on the quota lock - a ranking 1429 * violation. 1430 * 1431 * So what we do is to rely on the fact that jbd2_journal_stop/journal_start 1432 * will _not_ run commit under these circumstances because handle->h_ref 1433 * is elevated. We'll still have enough credits for the tiny quotafile 1434 * write. 1435 */ 1436 static int do_journal_get_write_access(handle_t *handle, 1437 struct buffer_head *bh) 1438 { 1439 if (!buffer_mapped(bh) || buffer_freed(bh)) 1440 return 0; 1441 return ext4_journal_get_write_access(handle, bh); 1442 } 1443 1444 static int ext4_write_begin(struct file *file, struct address_space *mapping, 1445 loff_t pos, unsigned len, unsigned flags, 1446 struct page **pagep, void **fsdata) 1447 { 1448 struct inode *inode = mapping->host; 1449 int ret, needed_blocks = ext4_writepage_trans_blocks(inode); 1450 handle_t *handle; 1451 int retries = 0; 1452 struct page *page; 1453 pgoff_t index; 1454 unsigned from, to; 1455 1456 trace_mark(ext4_write_begin, 1457 "dev %s ino %lu pos %llu len %u flags %u", 1458 inode->i_sb->s_id, inode->i_ino, 1459 (unsigned long long) pos, len, flags); 1460 index = pos >> PAGE_CACHE_SHIFT; 1461 from = pos & (PAGE_CACHE_SIZE - 1); 1462 to = from + len; 1463 1464 retry: 1465 handle = ext4_journal_start(inode, needed_blocks); 1466 if (IS_ERR(handle)) { 1467 ret = PTR_ERR(handle); 1468 goto out; 1469 } 1470 1471 /* We cannot recurse into the filesystem as the transaction is already 1472 * started */ 1473 flags |= AOP_FLAG_NOFS; 1474 1475 page = grab_cache_page_write_begin(mapping, index, flags); 1476 if (!page) { 1477 ext4_journal_stop(handle); 1478 ret = -ENOMEM; 1479 goto out; 1480 } 1481 *pagep = page; 1482 1483 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 1484 ext4_get_block); 1485 1486 if (!ret && ext4_should_journal_data(inode)) { 1487 ret = walk_page_buffers(handle, page_buffers(page), 1488 from, to, NULL, do_journal_get_write_access); 1489 } 1490 1491 if (ret) { 1492 unlock_page(page); 1493 ext4_journal_stop(handle); 1494 page_cache_release(page); 1495 /* 1496 * block_write_begin may have instantiated a few blocks 1497 * outside i_size. Trim these off again. Don't need 1498 * i_size_read because we hold i_mutex. 1499 */ 1500 if (pos + len > inode->i_size) 1501 vmtruncate(inode, inode->i_size); 1502 } 1503 1504 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 1505 goto retry; 1506 out: 1507 return ret; 1508 } 1509 1510 /* For write_end() in data=journal mode */ 1511 static int write_end_fn(handle_t *handle, struct buffer_head *bh) 1512 { 1513 if (!buffer_mapped(bh) || buffer_freed(bh)) 1514 return 0; 1515 set_buffer_uptodate(bh); 1516 return ext4_handle_dirty_metadata(handle, NULL, bh); 1517 } 1518 1519 /* 1520 * We need to pick up the new inode size which generic_commit_write gave us 1521 * `file' can be NULL - eg, when called from page_symlink(). 1522 * 1523 * ext4 never places buffers on inode->i_mapping->private_list. metadata 1524 * buffers are managed internally. 
1525 */ 1526 static int ext4_ordered_write_end(struct file *file, 1527 struct address_space *mapping, 1528 loff_t pos, unsigned len, unsigned copied, 1529 struct page *page, void *fsdata) 1530 { 1531 handle_t *handle = ext4_journal_current_handle(); 1532 struct inode *inode = mapping->host; 1533 int ret = 0, ret2; 1534 1535 trace_mark(ext4_ordered_write_end, 1536 "dev %s ino %lu pos %llu len %u copied %u", 1537 inode->i_sb->s_id, inode->i_ino, 1538 (unsigned long long) pos, len, copied); 1539 ret = ext4_jbd2_file_inode(handle, inode); 1540 1541 if (ret == 0) { 1542 loff_t new_i_size; 1543 1544 new_i_size = pos + copied; 1545 if (new_i_size > EXT4_I(inode)->i_disksize) { 1546 ext4_update_i_disksize(inode, new_i_size); 1547 /* We need to mark inode dirty even if 1548 * new_i_size is less that inode->i_size 1549 * bu greater than i_disksize.(hint delalloc) 1550 */ 1551 ext4_mark_inode_dirty(handle, inode); 1552 } 1553 1554 ret2 = generic_write_end(file, mapping, pos, len, copied, 1555 page, fsdata); 1556 copied = ret2; 1557 if (ret2 < 0) 1558 ret = ret2; 1559 } 1560 ret2 = ext4_journal_stop(handle); 1561 if (!ret) 1562 ret = ret2; 1563 1564 return ret ? ret : copied; 1565 } 1566 1567 static int ext4_writeback_write_end(struct file *file, 1568 struct address_space *mapping, 1569 loff_t pos, unsigned len, unsigned copied, 1570 struct page *page, void *fsdata) 1571 { 1572 handle_t *handle = ext4_journal_current_handle(); 1573 struct inode *inode = mapping->host; 1574 int ret = 0, ret2; 1575 loff_t new_i_size; 1576 1577 trace_mark(ext4_writeback_write_end, 1578 "dev %s ino %lu pos %llu len %u copied %u", 1579 inode->i_sb->s_id, inode->i_ino, 1580 (unsigned long long) pos, len, copied); 1581 new_i_size = pos + copied; 1582 if (new_i_size > EXT4_I(inode)->i_disksize) { 1583 ext4_update_i_disksize(inode, new_i_size); 1584 /* We need to mark inode dirty even if 1585 * new_i_size is less that inode->i_size 1586 * bu greater than i_disksize.(hint delalloc) 1587 */ 1588 ext4_mark_inode_dirty(handle, inode); 1589 } 1590 1591 ret2 = generic_write_end(file, mapping, pos, len, copied, 1592 page, fsdata); 1593 copied = ret2; 1594 if (ret2 < 0) 1595 ret = ret2; 1596 1597 ret2 = ext4_journal_stop(handle); 1598 if (!ret) 1599 ret = ret2; 1600 1601 return ret ? 
ret : copied; 1602 } 1603 1604 static int ext4_journalled_write_end(struct file *file, 1605 struct address_space *mapping, 1606 loff_t pos, unsigned len, unsigned copied, 1607 struct page *page, void *fsdata) 1608 { 1609 handle_t *handle = ext4_journal_current_handle(); 1610 struct inode *inode = mapping->host; 1611 int ret = 0, ret2; 1612 int partial = 0; 1613 unsigned from, to; 1614 loff_t new_i_size; 1615 1616 trace_mark(ext4_journalled_write_end, 1617 "dev %s ino %lu pos %llu len %u copied %u", 1618 inode->i_sb->s_id, inode->i_ino, 1619 (unsigned long long) pos, len, copied); 1620 from = pos & (PAGE_CACHE_SIZE - 1); 1621 to = from + len; 1622 1623 if (copied < len) { 1624 if (!PageUptodate(page)) 1625 copied = 0; 1626 page_zero_new_buffers(page, from+copied, to); 1627 } 1628 1629 ret = walk_page_buffers(handle, page_buffers(page), from, 1630 to, &partial, write_end_fn); 1631 if (!partial) 1632 SetPageUptodate(page); 1633 new_i_size = pos + copied; 1634 if (new_i_size > inode->i_size) 1635 i_size_write(inode, pos+copied); 1636 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; 1637 if (new_i_size > EXT4_I(inode)->i_disksize) { 1638 ext4_update_i_disksize(inode, new_i_size); 1639 ret2 = ext4_mark_inode_dirty(handle, inode); 1640 if (!ret) 1641 ret = ret2; 1642 } 1643 1644 unlock_page(page); 1645 ret2 = ext4_journal_stop(handle); 1646 if (!ret) 1647 ret = ret2; 1648 page_cache_release(page); 1649 1650 return ret ? ret : copied; 1651 } 1652 1653 static int ext4_da_reserve_space(struct inode *inode, int nrblocks) 1654 { 1655 int retries = 0; 1656 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1657 unsigned long md_needed, mdblocks, total = 0; 1658 1659 /* 1660 * recalculate the amount of metadata blocks to reserve 1661 * in order to allocate nrblocks 1662 * worse case is one extent per block 1663 */ 1664 repeat: 1665 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1666 total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks; 1667 mdblocks = ext4_calc_metadata_amount(inode, total); 1668 BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks); 1669 1670 md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks; 1671 total = md_needed + nrblocks; 1672 1673 /* 1674 * Make quota reservation here to prevent quota overflow 1675 * later. Real quota accounting is done at pages writeout 1676 * time. 1677 */ 1678 if (vfs_dq_reserve_block(inode, total)) { 1679 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1680 return -EDQUOT; 1681 } 1682 1683 if (ext4_claim_free_blocks(sbi, total)) { 1684 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1685 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1686 yield(); 1687 goto repeat; 1688 } 1689 vfs_dq_release_reservation_block(inode, total); 1690 return -ENOSPC; 1691 } 1692 EXT4_I(inode)->i_reserved_data_blocks += nrblocks; 1693 EXT4_I(inode)->i_reserved_meta_blocks = mdblocks; 1694 1695 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1696 return 0; /* success */ 1697 } 1698 1699 static void ext4_da_release_space(struct inode *inode, int to_free) 1700 { 1701 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1702 int total, mdb, mdb_free, release; 1703 1704 if (!to_free) 1705 return; /* Nothing to release, exit */ 1706 1707 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1708 1709 if (!EXT4_I(inode)->i_reserved_data_blocks) { 1710 /* 1711 * if there is no reserved blocks, but we try to free some 1712 * then the counter is messed up somewhere. 
1713 * but since this function is called from invalidate 1714 * page, it's harmless to return without any action 1715 */ 1716 printk(KERN_INFO "ext4 delalloc try to release %d reserved " 1717 "blocks for inode %lu, but there is no reserved " 1718 "data blocks\n", to_free, inode->i_ino); 1719 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1720 return; 1721 } 1722 1723 /* recalculate the number of metablocks still need to be reserved */ 1724 total = EXT4_I(inode)->i_reserved_data_blocks - to_free; 1725 mdb = ext4_calc_metadata_amount(inode, total); 1726 1727 /* figure out how many metablocks to release */ 1728 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); 1729 mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; 1730 1731 release = to_free + mdb_free; 1732 1733 /* update fs dirty blocks counter for truncate case */ 1734 percpu_counter_sub(&sbi->s_dirtyblocks_counter, release); 1735 1736 /* update per-inode reservations */ 1737 BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks); 1738 EXT4_I(inode)->i_reserved_data_blocks -= to_free; 1739 1740 BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); 1741 EXT4_I(inode)->i_reserved_meta_blocks = mdb; 1742 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1743 1744 vfs_dq_release_reservation_block(inode, release); 1745 } 1746 1747 static void ext4_da_page_release_reservation(struct page *page, 1748 unsigned long offset) 1749 { 1750 int to_release = 0; 1751 struct buffer_head *head, *bh; 1752 unsigned int curr_off = 0; 1753 1754 head = page_buffers(page); 1755 bh = head; 1756 do { 1757 unsigned int next_off = curr_off + bh->b_size; 1758 1759 if ((offset <= curr_off) && (buffer_delay(bh))) { 1760 to_release++; 1761 clear_buffer_delay(bh); 1762 } 1763 curr_off = next_off; 1764 } while ((bh = bh->b_this_page) != head); 1765 ext4_da_release_space(page->mapping->host, to_release); 1766 } 1767 1768 /* 1769 * Delayed allocation stuff 1770 */ 1771 1772 struct mpage_da_data { 1773 struct inode *inode; 1774 sector_t b_blocknr; /* start block number of extent */ 1775 size_t b_size; /* size of extent */ 1776 unsigned long b_state; /* state of the extent */ 1777 unsigned long first_page, next_page; /* extent of pages */ 1778 struct writeback_control *wbc; 1779 int io_done; 1780 int pages_written; 1781 int retval; 1782 }; 1783 1784 /* 1785 * mpage_da_submit_io - walks through extent of pages and try to write 1786 * them with writepage() call back 1787 * 1788 * @mpd->inode: inode 1789 * @mpd->first_page: first page of the extent 1790 * @mpd->next_page: page after the last page of the extent 1791 * 1792 * By the time mpage_da_submit_io() is called we expect all blocks 1793 * to be allocated. this may be wrong if allocation failed. 1794 * 1795 * As pages are already locked by write_cache_pages(), we can't use it 1796 */ 1797 static int mpage_da_submit_io(struct mpage_da_data *mpd) 1798 { 1799 long pages_skipped; 1800 struct pagevec pvec; 1801 unsigned long index, end; 1802 int ret = 0, err, nr_pages, i; 1803 struct inode *inode = mpd->inode; 1804 struct address_space *mapping = inode->i_mapping; 1805 1806 BUG_ON(mpd->next_page <= mpd->first_page); 1807 /* 1808 * We need to start from the first_page to the next_page - 1 1809 * to make sure we also write the mapped dirty buffer_heads. 1810 * If we look at mpd->b_blocknr we would only be looking 1811 * at the currently mapped buffer_heads. 
1812 */ 1813 index = mpd->first_page; 1814 end = mpd->next_page - 1; 1815 1816 pagevec_init(&pvec, 0); 1817 while (index <= end) { 1818 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 1819 if (nr_pages == 0) 1820 break; 1821 for (i = 0; i < nr_pages; i++) { 1822 struct page *page = pvec.pages[i]; 1823 1824 index = page->index; 1825 if (index > end) 1826 break; 1827 index++; 1828 1829 BUG_ON(!PageLocked(page)); 1830 BUG_ON(PageWriteback(page)); 1831 1832 pages_skipped = mpd->wbc->pages_skipped; 1833 err = mapping->a_ops->writepage(page, mpd->wbc); 1834 if (!err && (pages_skipped == mpd->wbc->pages_skipped)) 1835 /* 1836 * have successfully written the page 1837 * without skipping the same 1838 */ 1839 mpd->pages_written++; 1840 /* 1841 * In error case, we have to continue because 1842 * remaining pages are still locked 1843 * XXX: unlock and re-dirty them? 1844 */ 1845 if (ret == 0) 1846 ret = err; 1847 } 1848 pagevec_release(&pvec); 1849 } 1850 return ret; 1851 } 1852 1853 /* 1854 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers 1855 * 1856 * @mpd->inode - inode to walk through 1857 * @exbh->b_blocknr - first block on a disk 1858 * @exbh->b_size - amount of space in bytes 1859 * @logical - first logical block to start assignment with 1860 * 1861 * the function goes through all passed space and put actual disk 1862 * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten 1863 */ 1864 static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, 1865 struct buffer_head *exbh) 1866 { 1867 struct inode *inode = mpd->inode; 1868 struct address_space *mapping = inode->i_mapping; 1869 int blocks = exbh->b_size >> inode->i_blkbits; 1870 sector_t pblock = exbh->b_blocknr, cur_logical; 1871 struct buffer_head *head, *bh; 1872 pgoff_t index, end; 1873 struct pagevec pvec; 1874 int nr_pages, i; 1875 1876 index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 1877 end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 1878 cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 1879 1880 pagevec_init(&pvec, 0); 1881 1882 while (index <= end) { 1883 /* XXX: optimize tail */ 1884 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 1885 if (nr_pages == 0) 1886 break; 1887 for (i = 0; i < nr_pages; i++) { 1888 struct page *page = pvec.pages[i]; 1889 1890 index = page->index; 1891 if (index > end) 1892 break; 1893 index++; 1894 1895 BUG_ON(!PageLocked(page)); 1896 BUG_ON(PageWriteback(page)); 1897 BUG_ON(!page_has_buffers(page)); 1898 1899 bh = page_buffers(page); 1900 head = bh; 1901 1902 /* skip blocks out of the range */ 1903 do { 1904 if (cur_logical >= logical) 1905 break; 1906 cur_logical++; 1907 } while ((bh = bh->b_this_page) != head); 1908 1909 do { 1910 if (cur_logical >= logical + blocks) 1911 break; 1912 1913 if (buffer_delay(bh) || 1914 buffer_unwritten(bh)) { 1915 1916 BUG_ON(bh->b_bdev != inode->i_sb->s_bdev); 1917 1918 if (buffer_delay(bh)) { 1919 clear_buffer_delay(bh); 1920 bh->b_blocknr = pblock; 1921 } else { 1922 /* 1923 * unwritten already should have 1924 * blocknr assigned. 
Verify that 1925 */ 1926 clear_buffer_unwritten(bh); 1927 BUG_ON(bh->b_blocknr != pblock); 1928 } 1929 1930 } else if (buffer_mapped(bh)) 1931 BUG_ON(bh->b_blocknr != pblock); 1932 1933 cur_logical++; 1934 pblock++; 1935 } while ((bh = bh->b_this_page) != head); 1936 } 1937 pagevec_release(&pvec); 1938 } 1939 } 1940 1941 1942 /* 1943 * __unmap_underlying_blocks - just a helper function to unmap 1944 * set of blocks described by @bh 1945 */ 1946 static inline void __unmap_underlying_blocks(struct inode *inode, 1947 struct buffer_head *bh) 1948 { 1949 struct block_device *bdev = inode->i_sb->s_bdev; 1950 int blocks, i; 1951 1952 blocks = bh->b_size >> inode->i_blkbits; 1953 for (i = 0; i < blocks; i++) 1954 unmap_underlying_metadata(bdev, bh->b_blocknr + i); 1955 } 1956 1957 static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, 1958 sector_t logical, long blk_cnt) 1959 { 1960 int nr_pages, i; 1961 pgoff_t index, end; 1962 struct pagevec pvec; 1963 struct inode *inode = mpd->inode; 1964 struct address_space *mapping = inode->i_mapping; 1965 1966 index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 1967 end = (logical + blk_cnt - 1) >> 1968 (PAGE_CACHE_SHIFT - inode->i_blkbits); 1969 while (index <= end) { 1970 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 1971 if (nr_pages == 0) 1972 break; 1973 for (i = 0; i < nr_pages; i++) { 1974 struct page *page = pvec.pages[i]; 1975 index = page->index; 1976 if (index > end) 1977 break; 1978 index++; 1979 1980 BUG_ON(!PageLocked(page)); 1981 BUG_ON(PageWriteback(page)); 1982 block_invalidatepage(page, 0); 1983 ClearPageUptodate(page); 1984 unlock_page(page); 1985 } 1986 } 1987 return; 1988 } 1989 1990 static void ext4_print_free_blocks(struct inode *inode) 1991 { 1992 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1993 printk(KERN_EMERG "Total free blocks count %lld\n", 1994 ext4_count_free_blocks(inode->i_sb)); 1995 printk(KERN_EMERG "Free/Dirty block details\n"); 1996 printk(KERN_EMERG "free_blocks=%lld\n", 1997 (long long)percpu_counter_sum(&sbi->s_freeblocks_counter)); 1998 printk(KERN_EMERG "dirty_blocks=%lld\n", 1999 (long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter)); 2000 printk(KERN_EMERG "Block reservation details\n"); 2001 printk(KERN_EMERG "i_reserved_data_blocks=%u\n", 2002 EXT4_I(inode)->i_reserved_data_blocks); 2003 printk(KERN_EMERG "i_reserved_meta_blocks=%u\n", 2004 EXT4_I(inode)->i_reserved_meta_blocks); 2005 return; 2006 } 2007 2008 /* 2009 * This function is used by mpage_da_map_blocks(). We separate it out 2010 * as a separate function just to make life easier, and because 2011 * mpage_da_map_blocks() used to be a generic function that took a 2012 * get_block_t. 2013 */ 2014 static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, 2015 struct buffer_head *bh_result) 2016 { 2017 int ret; 2018 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 2019 loff_t disksize = EXT4_I(inode)->i_disksize; 2020 handle_t *handle = NULL; 2021 2022 handle = ext4_journal_current_handle(); 2023 BUG_ON(!handle); 2024 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, 2025 bh_result, EXT4_GET_BLOCKS_CREATE| 2026 EXT4_GET_BLOCKS_DELALLOC_RESERVE); 2027 if (ret <= 0) 2028 return ret; 2029 2030 bh_result->b_size = (ret << inode->i_blkbits); 2031 2032 if (ext4_should_order_data(inode)) { 2033 int retval; 2034 retval = ext4_jbd2_file_inode(handle, inode); 2035 if (retval) 2036 /* 2037 * Failed to add inode for ordered mode. 
Don't 2038 * update file size 2039 */ 2040 return retval; 2041 } 2042 2043 /* 2044 * Update on-disk size along with block allocation we don't 2045 * use EXT4_GET_BLOCKS_EXTEND_DISKSIZE as size may change 2046 * within already allocated block -bzzz 2047 */ 2048 disksize = ((loff_t) iblock + ret) << inode->i_blkbits; 2049 if (disksize > i_size_read(inode)) 2050 disksize = i_size_read(inode); 2051 if (disksize > EXT4_I(inode)->i_disksize) { 2052 ext4_update_i_disksize(inode, disksize); 2053 ret = ext4_mark_inode_dirty(handle, inode); 2054 return ret; 2055 } 2056 return 0; 2057 } 2058 2059 /* 2060 * mpage_da_map_blocks - go through given space 2061 * 2062 * @mpd - bh describing space 2063 * 2064 * The function skips space we know is already mapped to disk blocks. 2065 * 2066 */ 2067 static int mpage_da_map_blocks(struct mpage_da_data *mpd) 2068 { 2069 int err = 0; 2070 struct buffer_head new; 2071 sector_t next; 2072 2073 /* 2074 * We consider only non-mapped and non-allocated blocks 2075 */ 2076 if ((mpd->b_state & (1 << BH_Mapped)) && 2077 !(mpd->b_state & (1 << BH_Delay)) && 2078 !(mpd->b_state & (1 << BH_Unwritten))) 2079 return 0; 2080 /* 2081 * We need to make sure the BH_Delay flag is passed down to 2082 * ext4_da_get_block_write(), since it calls ext4_get_blocks() 2083 * with the EXT4_GET_BLOCKS_DELALLOC_RESERVE flag. This flag 2084 * causes ext4_get_blocks() to call 2085 * ext4_da_update_reserve_space() if the passed buffer head 2086 * has the BH_Delay flag set. In the future, once we clean up 2087 * the interfaces to ext4_get_blocks(), we should pass in a 2088 * separate flag which requests that the delayed allocation 2089 * statistics should be updated, instead of depending on the 2090 * state information getting passed down via the map_bh's 2091 * state bitmasks plus the magic 2092 * EXT4_GET_BLOCKS_DELALLOC_RESERVE flag. 2093 */ 2094 new.b_state = mpd->b_state & (1 << BH_Delay); 2095 new.b_blocknr = 0; 2096 new.b_size = mpd->b_size; 2097 next = mpd->b_blocknr; 2098 /* 2099 * If we didn't accumulate anything 2100 * to write simply return 2101 */ 2102 if (!new.b_size) 2103 return 0; 2104 2105 err = ext4_da_get_block_write(mpd->inode, next, &new); 2106 if (err) { 2107 /* 2108 * If get block returns with error we simply 2109 * return. Later writepage will redirty the page and 2110 * writepages will find the dirty page again 2111 */ 2112 if (err == -EAGAIN) 2113 return 0; 2114 2115 if (err == -ENOSPC && 2116 ext4_count_free_blocks(mpd->inode->i_sb)) { 2117 mpd->retval = err; 2118 return 0; 2119 } 2120 2121 /* 2122 * get block failure will cause us to loop in 2123 * writepages, because a_ops->writepage won't be able 2124 * to make progress. The page will be redirtied by 2125 * writepage and writepages will again try to write 2126 * the same. 2127 */ 2128 printk(KERN_EMERG "%s block allocation failed for inode %lu " 2129 "at logical offset %llu with max blocks " 2130 "%zd with error %d\n", 2131 __func__, mpd->inode->i_ino, 2132 (unsigned long long)next, 2133 mpd->b_size >> mpd->inode->i_blkbits, err); 2134 printk(KERN_EMERG "This should not happen.!! 
" 2135 "Data will be lost\n"); 2136 if (err == -ENOSPC) { 2137 ext4_print_free_blocks(mpd->inode); 2138 } 2139 /* invlaidate all the pages */ 2140 ext4_da_block_invalidatepages(mpd, next, 2141 mpd->b_size >> mpd->inode->i_blkbits); 2142 return err; 2143 } 2144 BUG_ON(new.b_size == 0); 2145 2146 if (buffer_new(&new)) 2147 __unmap_underlying_blocks(mpd->inode, &new); 2148 2149 /* 2150 * If blocks are delayed marked, we need to 2151 * put actual blocknr and drop delayed bit 2152 */ 2153 if ((mpd->b_state & (1 << BH_Delay)) || 2154 (mpd->b_state & (1 << BH_Unwritten))) 2155 mpage_put_bnr_to_bhs(mpd, next, &new); 2156 2157 return 0; 2158 } 2159 2160 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ 2161 (1 << BH_Delay) | (1 << BH_Unwritten)) 2162 2163 /* 2164 * mpage_add_bh_to_extent - try to add one more block to extent of blocks 2165 * 2166 * @mpd->lbh - extent of blocks 2167 * @logical - logical number of the block in the file 2168 * @bh - bh of the block (used to access block's state) 2169 * 2170 * the function is used to collect contig. blocks in same state 2171 */ 2172 static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, 2173 sector_t logical, size_t b_size, 2174 unsigned long b_state) 2175 { 2176 sector_t next; 2177 int nrblocks = mpd->b_size >> mpd->inode->i_blkbits; 2178 2179 /* check if thereserved journal credits might overflow */ 2180 if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { 2181 if (nrblocks >= EXT4_MAX_TRANS_DATA) { 2182 /* 2183 * With non-extent format we are limited by the journal 2184 * credit available. Total credit needed to insert 2185 * nrblocks contiguous blocks is dependent on the 2186 * nrblocks. So limit nrblocks. 2187 */ 2188 goto flush_it; 2189 } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) > 2190 EXT4_MAX_TRANS_DATA) { 2191 /* 2192 * Adding the new buffer_head would make it cross the 2193 * allowed limit for which we have journal credit 2194 * reserved. So limit the new bh->b_size 2195 */ 2196 b_size = (EXT4_MAX_TRANS_DATA - nrblocks) << 2197 mpd->inode->i_blkbits; 2198 /* we will do mpage_da_submit_io in the next loop */ 2199 } 2200 } 2201 /* 2202 * First block in the extent 2203 */ 2204 if (mpd->b_size == 0) { 2205 mpd->b_blocknr = logical; 2206 mpd->b_size = b_size; 2207 mpd->b_state = b_state & BH_FLAGS; 2208 return; 2209 } 2210 2211 next = mpd->b_blocknr + nrblocks; 2212 /* 2213 * Can we merge the block to our big extent? 2214 */ 2215 if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) { 2216 mpd->b_size += b_size; 2217 return; 2218 } 2219 2220 flush_it: 2221 /* 2222 * We couldn't merge the block to our extent, so we 2223 * need to flush current extent and start new one 2224 */ 2225 if (mpage_da_map_blocks(mpd) == 0) 2226 mpage_da_submit_io(mpd); 2227 mpd->io_done = 1; 2228 return; 2229 } 2230 2231 static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) 2232 { 2233 /* 2234 * unmapped buffer is possible for holes. 2235 * delay buffer is possible with delayed allocation. 2236 * We also need to consider unwritten buffer as unmapped. 2237 */ 2238 return (!buffer_mapped(bh) || buffer_delay(bh) || 2239 buffer_unwritten(bh)) && buffer_dirty(bh); 2240 } 2241 2242 /* 2243 * __mpage_da_writepage - finds extent of pages and blocks 2244 * 2245 * @page: page to consider 2246 * @wbc: not used, we just follow rules 2247 * @data: context 2248 * 2249 * The function finds extents of pages and scan them for all blocks. 
2250 */ 2251 static int __mpage_da_writepage(struct page *page, 2252 struct writeback_control *wbc, void *data) 2253 { 2254 struct mpage_da_data *mpd = data; 2255 struct inode *inode = mpd->inode; 2256 struct buffer_head *bh, *head; 2257 sector_t logical; 2258 2259 if (mpd->io_done) { 2260 /* 2261 * Rest of the page in the page_vec 2262 * redirty then and skip then. We will 2263 * try to to write them again after 2264 * starting a new transaction 2265 */ 2266 redirty_page_for_writepage(wbc, page); 2267 unlock_page(page); 2268 return MPAGE_DA_EXTENT_TAIL; 2269 } 2270 /* 2271 * Can we merge this page to current extent? 2272 */ 2273 if (mpd->next_page != page->index) { 2274 /* 2275 * Nope, we can't. So, we map non-allocated blocks 2276 * and start IO on them using writepage() 2277 */ 2278 if (mpd->next_page != mpd->first_page) { 2279 if (mpage_da_map_blocks(mpd) == 0) 2280 mpage_da_submit_io(mpd); 2281 /* 2282 * skip rest of the page in the page_vec 2283 */ 2284 mpd->io_done = 1; 2285 redirty_page_for_writepage(wbc, page); 2286 unlock_page(page); 2287 return MPAGE_DA_EXTENT_TAIL; 2288 } 2289 2290 /* 2291 * Start next extent of pages ... 2292 */ 2293 mpd->first_page = page->index; 2294 2295 /* 2296 * ... and blocks 2297 */ 2298 mpd->b_size = 0; 2299 mpd->b_state = 0; 2300 mpd->b_blocknr = 0; 2301 } 2302 2303 mpd->next_page = page->index + 1; 2304 logical = (sector_t) page->index << 2305 (PAGE_CACHE_SHIFT - inode->i_blkbits); 2306 2307 if (!page_has_buffers(page)) { 2308 mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE, 2309 (1 << BH_Dirty) | (1 << BH_Uptodate)); 2310 if (mpd->io_done) 2311 return MPAGE_DA_EXTENT_TAIL; 2312 } else { 2313 /* 2314 * Page with regular buffer heads, just add all dirty ones 2315 */ 2316 head = page_buffers(page); 2317 bh = head; 2318 do { 2319 BUG_ON(buffer_locked(bh)); 2320 /* 2321 * We need to try to allocate 2322 * unmapped blocks in the same page. 2323 * Otherwise we won't make progress 2324 * with the page in ext4_da_writepage 2325 */ 2326 if (ext4_bh_unmapped_or_delay(NULL, bh)) { 2327 mpage_add_bh_to_extent(mpd, logical, 2328 bh->b_size, 2329 bh->b_state); 2330 if (mpd->io_done) 2331 return MPAGE_DA_EXTENT_TAIL; 2332 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { 2333 /* 2334 * mapped dirty buffer. We need to update 2335 * the b_state because we look at 2336 * b_state in mpage_da_map_blocks. We don't 2337 * update b_size because if we find an 2338 * unmapped buffer_head later we need to 2339 * use the b_state flag of that buffer_head. 2340 */ 2341 if (mpd->b_size == 0) 2342 mpd->b_state = bh->b_state & BH_FLAGS; 2343 } 2344 logical++; 2345 } while ((bh = bh->b_this_page) != head); 2346 } 2347 2348 return 0; 2349 } 2350 2351 /* 2352 * This is a special get_blocks_t callback which is used by 2353 * ext4_da_write_begin(). It will either return mapped block or 2354 * reserve space for a single block. 2355 * 2356 * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set. 2357 * We also have b_blocknr = -1 and b_bdev initialized properly 2358 * 2359 * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set. 2360 * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev 2361 * initialized properly. 
2362  */
2363 static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2364 				   struct buffer_head *bh_result, int create)
2365 {
2366 	int ret = 0;
2367 	sector_t invalid_block = ~((sector_t) 0xffff);
2368 
2369 	if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
2370 		invalid_block = ~0;
2371 
2372 	BUG_ON(create == 0);
2373 	BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
2374 
2375 	/*
2376 	 * first, we need to know whether the block is allocated already;
2377 	 * preallocated blocks are unmapped but should be treated
2378 	 * the same as allocated blocks.
2379 	 */
2380 	ret = ext4_get_blocks(NULL, inode, iblock, 1, bh_result, 0);
2381 	if ((ret == 0) && !buffer_delay(bh_result)) {
2382 		/* the block isn't (pre)allocated yet, let's reserve space */
2383 		/*
2384 		 * XXX: __block_prepare_write() unmaps passed block,
2385 		 * is it OK?
2386 		 */
2387 		ret = ext4_da_reserve_space(inode, 1);
2388 		if (ret)
2389 			/* not enough space to reserve */
2390 			return ret;
2391 
2392 		map_bh(bh_result, inode->i_sb, invalid_block);
2393 		set_buffer_new(bh_result);
2394 		set_buffer_delay(bh_result);
2395 	} else if (ret > 0) {
2396 		bh_result->b_size = (ret << inode->i_blkbits);
2397 		if (buffer_unwritten(bh_result)) {
2398 			/* A delayed write to an unwritten bh should
2399 			 * be marked new and mapped.  Mapped ensures
2400 			 * that we don't do get_block multiple times
2401 			 * when we write to the same offset, and new
2402 			 * ensures that we do a proper zero out for a
2403 			 * partial write.
2404 			 */
2405 			set_buffer_new(bh_result);
2406 			set_buffer_mapped(bh_result);
2407 		}
2408 		ret = 0;
2409 	}
2410 
2411 	return ret;
2412 }
2413 
2414 /*
2415  * This function is used as a standard get_block_t callback function
2416  * when there is no desire to allocate any blocks.  It is used as a
2417  * callback function for block_prepare_write(), nobh_writepage(), and
2418  * block_write_full_page().  These functions should only try to map a
2419  * single block at a time.
2420  *
2421  * Since this function doesn't do block allocations even if the caller
2422  * requests it by passing in create=1, it is critically important that
2423  * any caller checks to make sure that any buffer heads returned by
2424  * this function are either all already mapped or marked for
2425  * delayed allocation before calling nobh_writepage() or
2426  * block_write_full_page().  Otherwise, b_blocknr could be left
2427  * uninitialized, and the page write functions will be taken by
2428  * surprise.
2429  */
2430 static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
2431 		   struct buffer_head *bh_result, int create)
2432 {
2433 	int ret = 0;
2434 	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
2435 
2436 	BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
2437 
2438 	/*
2439 	 * we don't want to do block allocation in writepage,
2440 	 * so call get_block_wrap with create = 0
2441 	 */
2442 	ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0);
2443 	BUG_ON(create && ret == 0);
2444 	if (ret > 0) {
2445 		bh_result->b_size = (ret << inode->i_blkbits);
2446 		ret = 0;
2447 	}
2448 	return ret;
2449 }
2450 
2451 /*
2452  * This function can get called via...
2453  * - ext4_da_writepages after taking page lock (have journal handle)
2454  * - journal_submit_inode_data_buffers (no journal handle)
2455  * - shrink_page_list via pdflush (no journal handle)
2456  * - grab_page_cache when doing write_begin (have journal handle)
2457  */
2458 static int ext4_da_writepage(struct page *page,
2459 				struct writeback_control *wbc)
2460 {
2461 	int ret = 0;
2462 	loff_t size;
2463 	unsigned int len;
2464 	struct buffer_head *page_bufs;
2465 	struct inode *inode = page->mapping->host;
2466 
2467 	trace_mark(ext4_da_writepage,
2468 		   "dev %s ino %lu page_index %lu",
2469 		   inode->i_sb->s_id, inode->i_ino, page->index);
2470 	size = i_size_read(inode);
2471 	if (page->index == size >> PAGE_CACHE_SHIFT)
2472 		len = size & ~PAGE_CACHE_MASK;
2473 	else
2474 		len = PAGE_CACHE_SIZE;
2475 
2476 	if (page_has_buffers(page)) {
2477 		page_bufs = page_buffers(page);
2478 		if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2479 					ext4_bh_unmapped_or_delay)) {
2480 			/*
2481 			 * We don't want to do block allocation,
2482 			 * so redirty the page and return.
2483 			 * We may reach here when we do a journal commit
2484 			 * via journal_submit_inode_data_buffers.
2485 			 * If we don't have a mapped block we just ignore
2486 			 * them. We can also reach here via shrink_page_list.
2487 			 */
2488 			redirty_page_for_writepage(wbc, page);
2489 			unlock_page(page);
2490 			return 0;
2491 		}
2492 	} else {
2493 		/*
2494 		 * The test for page_has_buffers() is subtle:
2495 		 * We know the page is dirty but it lost buffers. That means
2496 		 * that at some moment in time after write_begin()/write_end()
2497 		 * has been called all buffers have been clean and thus they
2498 		 * must have been written at least once. So they are all
2499 		 * mapped and we can happily proceed with mapping them
2500 		 * and writing the page.
2501 		 *
2502 		 * Try to initialize the buffer_heads and check whether
2503 		 * all are mapped and non delay. We don't want to
2504 		 * do block allocation here.
2505 		 */
2506 		ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
2507 					  noalloc_get_block_write);
2508 		if (!ret) {
2509 			page_bufs = page_buffers(page);
2510 			/* check whether all are mapped and non delay */
2511 			if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
2512 						ext4_bh_unmapped_or_delay)) {
2513 				redirty_page_for_writepage(wbc, page);
2514 				unlock_page(page);
2515 				return 0;
2516 			}
2517 		} else {
2518 			/*
2519 			 * We can't do block allocation here,
2520 			 * so just redirty the page and unlock
2521 			 * and return.
2522 			 */
2523 			redirty_page_for_writepage(wbc, page);
2524 			unlock_page(page);
2525 			return 0;
2526 		}
2527 		/* now mark the buffer_heads as dirty and uptodate */
2528 		block_commit_write(page, 0, PAGE_CACHE_SIZE);
2529 	}
2530 
2531 	if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
2532 		ret = nobh_writepage(page, noalloc_get_block_write, wbc);
2533 	else
2534 		ret = block_write_full_page(page, noalloc_get_block_write,
2535 					    wbc);
2536 
2537 	return ret;
2538 }
2539 
2540 /*
2541  * This is called via ext4_da_writepages() to
2542  * calculate the total number of credits to reserve to fit
2543  * a single extent allocation into a single transaction;
2544  * ext4_da_writepages() will loop calling this before
2545  * the block allocation.
2546  */
2547 
2548 static int ext4_da_writepages_trans_blocks(struct inode *inode)
2549 {
2550 	int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;
2551 
2552 	/*
2553 	 * With non-extent format the journal credits needed to
2554 	 * insert nrblocks contiguous blocks are dependent on the
2555 	 * number of contiguous blocks.
So we will limit 2556 * number of contiguous block to a sane value 2557 */ 2558 if (!(inode->i_flags & EXT4_EXTENTS_FL) && 2559 (max_blocks > EXT4_MAX_TRANS_DATA)) 2560 max_blocks = EXT4_MAX_TRANS_DATA; 2561 2562 return ext4_chunk_trans_blocks(inode, max_blocks); 2563 } 2564 2565 static int ext4_da_writepages(struct address_space *mapping, 2566 struct writeback_control *wbc) 2567 { 2568 pgoff_t index; 2569 int range_whole = 0; 2570 handle_t *handle = NULL; 2571 struct mpage_da_data mpd; 2572 struct inode *inode = mapping->host; 2573 int no_nrwrite_index_update; 2574 int pages_written = 0; 2575 long pages_skipped; 2576 int range_cyclic, cycled = 1, io_done = 0; 2577 int needed_blocks, ret = 0, nr_to_writebump = 0; 2578 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2579 2580 trace_mark(ext4_da_writepages, 2581 "dev %s ino %lu nr_t_write %ld " 2582 "pages_skipped %ld range_start %llu " 2583 "range_end %llu nonblocking %d " 2584 "for_kupdate %d for_reclaim %d " 2585 "for_writepages %d range_cyclic %d", 2586 inode->i_sb->s_id, inode->i_ino, 2587 wbc->nr_to_write, wbc->pages_skipped, 2588 (unsigned long long) wbc->range_start, 2589 (unsigned long long) wbc->range_end, 2590 wbc->nonblocking, wbc->for_kupdate, 2591 wbc->for_reclaim, wbc->for_writepages, 2592 wbc->range_cyclic); 2593 2594 /* 2595 * No pages to write? This is mainly a kludge to avoid starting 2596 * a transaction for special inodes like journal inode on last iput() 2597 * because that could violate lock ordering on umount 2598 */ 2599 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 2600 return 0; 2601 2602 /* 2603 * If the filesystem has aborted, it is read-only, so return 2604 * right away instead of dumping stack traces later on that 2605 * will obscure the real source of the problem. We test 2606 * EXT4_MOUNT_ABORT instead of sb->s_flag's MS_RDONLY because 2607 * the latter could be true if the filesystem is mounted 2608 * read-only, and in that case, ext4_da_writepages should 2609 * *never* be called, so if that ever happens, we would want 2610 * the stack trace. 2611 */ 2612 if (unlikely(sbi->s_mount_opt & EXT4_MOUNT_ABORT)) 2613 return -EROFS; 2614 2615 /* 2616 * Make sure nr_to_write is >= sbi->s_mb_stream_request 2617 * This make sure small files blocks are allocated in 2618 * single attempt. This ensure that small files 2619 * get less fragmented. 2620 */ 2621 if (wbc->nr_to_write < sbi->s_mb_stream_request) { 2622 nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; 2623 wbc->nr_to_write = sbi->s_mb_stream_request; 2624 } 2625 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2626 range_whole = 1; 2627 2628 range_cyclic = wbc->range_cyclic; 2629 if (wbc->range_cyclic) { 2630 index = mapping->writeback_index; 2631 if (index) 2632 cycled = 0; 2633 wbc->range_start = index << PAGE_CACHE_SHIFT; 2634 wbc->range_end = LLONG_MAX; 2635 wbc->range_cyclic = 0; 2636 } else 2637 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2638 2639 mpd.wbc = wbc; 2640 mpd.inode = mapping->host; 2641 2642 /* 2643 * we don't want write_cache_pages to update 2644 * nr_to_write and writeback_index 2645 */ 2646 no_nrwrite_index_update = wbc->no_nrwrite_index_update; 2647 wbc->no_nrwrite_index_update = 1; 2648 pages_skipped = wbc->pages_skipped; 2649 2650 retry: 2651 while (!ret && wbc->nr_to_write > 0) { 2652 2653 /* 2654 * we insert one extent at a time. So we need 2655 * credit needed for single extent allocation. 
2656 * journalled mode is currently not supported 2657 * by delalloc 2658 */ 2659 BUG_ON(ext4_should_journal_data(inode)); 2660 needed_blocks = ext4_da_writepages_trans_blocks(inode); 2661 2662 /* start a new transaction*/ 2663 handle = ext4_journal_start(inode, needed_blocks); 2664 if (IS_ERR(handle)) { 2665 ret = PTR_ERR(handle); 2666 printk(KERN_CRIT "%s: jbd2_start: " 2667 "%ld pages, ino %lu; err %d\n", __func__, 2668 wbc->nr_to_write, inode->i_ino, ret); 2669 dump_stack(); 2670 goto out_writepages; 2671 } 2672 2673 /* 2674 * Now call __mpage_da_writepage to find the next 2675 * contiguous region of logical blocks that need 2676 * blocks to be allocated by ext4. We don't actually 2677 * submit the blocks for I/O here, even though 2678 * write_cache_pages thinks it will, and will set the 2679 * pages as clean for write before calling 2680 * __mpage_da_writepage(). 2681 */ 2682 mpd.b_size = 0; 2683 mpd.b_state = 0; 2684 mpd.b_blocknr = 0; 2685 mpd.first_page = 0; 2686 mpd.next_page = 0; 2687 mpd.io_done = 0; 2688 mpd.pages_written = 0; 2689 mpd.retval = 0; 2690 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, 2691 &mpd); 2692 /* 2693 * If we have a contigous extent of pages and we 2694 * haven't done the I/O yet, map the blocks and submit 2695 * them for I/O. 2696 */ 2697 if (!mpd.io_done && mpd.next_page != mpd.first_page) { 2698 if (mpage_da_map_blocks(&mpd) == 0) 2699 mpage_da_submit_io(&mpd); 2700 mpd.io_done = 1; 2701 ret = MPAGE_DA_EXTENT_TAIL; 2702 } 2703 wbc->nr_to_write -= mpd.pages_written; 2704 2705 ext4_journal_stop(handle); 2706 2707 if ((mpd.retval == -ENOSPC) && sbi->s_journal) { 2708 /* commit the transaction which would 2709 * free blocks released in the transaction 2710 * and try again 2711 */ 2712 jbd2_journal_force_commit_nested(sbi->s_journal); 2713 wbc->pages_skipped = pages_skipped; 2714 ret = 0; 2715 } else if (ret == MPAGE_DA_EXTENT_TAIL) { 2716 /* 2717 * got one extent now try with 2718 * rest of the pages 2719 */ 2720 pages_written += mpd.pages_written; 2721 wbc->pages_skipped = pages_skipped; 2722 ret = 0; 2723 io_done = 1; 2724 } else if (wbc->nr_to_write) 2725 /* 2726 * There is no more writeout needed 2727 * or we requested for a noblocking writeout 2728 * and we found the device congested 2729 */ 2730 break; 2731 } 2732 if (!io_done && !cycled) { 2733 cycled = 1; 2734 index = 0; 2735 wbc->range_start = index << PAGE_CACHE_SHIFT; 2736 wbc->range_end = mapping->writeback_index - 1; 2737 goto retry; 2738 } 2739 if (pages_skipped != wbc->pages_skipped) 2740 printk(KERN_EMERG "This should not happen leaving %s " 2741 "with nr_to_write = %ld ret = %d\n", 2742 __func__, wbc->nr_to_write, ret); 2743 2744 /* Update index */ 2745 index += pages_written; 2746 wbc->range_cyclic = range_cyclic; 2747 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 2748 /* 2749 * set the writeback_index so that range_cyclic 2750 * mode will write it back later 2751 */ 2752 mapping->writeback_index = index; 2753 2754 out_writepages: 2755 if (!no_nrwrite_index_update) 2756 wbc->no_nrwrite_index_update = 0; 2757 wbc->nr_to_write -= nr_to_writebump; 2758 trace_mark(ext4_da_writepage_result, 2759 "dev %s ino %lu ret %d pages_written %d " 2760 "pages_skipped %ld congestion %d " 2761 "more_io %d no_nrwrite_index_update %d", 2762 inode->i_sb->s_id, inode->i_ino, ret, 2763 pages_written, wbc->pages_skipped, 2764 wbc->encountered_congestion, wbc->more_io, 2765 wbc->no_nrwrite_index_update); 2766 return ret; 2767 } 2768 2769 #define FALL_BACK_TO_NONDELALLOC 1 2770 static 
int ext4_nonda_switch(struct super_block *sb)
2771 {
2772 	s64 free_blocks, dirty_blocks;
2773 	struct ext4_sb_info *sbi = EXT4_SB(sb);
2774 
2775 	/*
2776 	 * switch to non-delalloc mode if we are running low
2777 	 * on free blocks.  The free block accounting via percpu
2778 	 * counters can get slightly wrong with percpu_counter_batch getting
2779 	 * accumulated on each CPU without updating global counters.
2780 	 * Delalloc needs an accurate free block accounting.  So switch
2781 	 * to non-delalloc when we are near the error range.
2782 	 */
2783 	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
2784 	dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter);
2785 	if (2 * free_blocks < 3 * dirty_blocks ||
2786 		free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
2787 		/*
2788 		 * free block count is less than 150% of dirty blocks
2789 		 * or free blocks is less than the watermark
2790 		 */
2791 		return 1;
2792 	}
2793 	return 0;
2794 }
2795 
2796 static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2797 			       loff_t pos, unsigned len, unsigned flags,
2798 			       struct page **pagep, void **fsdata)
2799 {
2800 	int ret, retries = 0;
2801 	struct page *page;
2802 	pgoff_t index;
2803 	unsigned from, to;
2804 	struct inode *inode = mapping->host;
2805 	handle_t *handle;
2806 
2807 	index = pos >> PAGE_CACHE_SHIFT;
2808 	from = pos & (PAGE_CACHE_SIZE - 1);
2809 	to = from + len;
2810 
2811 	if (ext4_nonda_switch(inode->i_sb)) {
2812 		*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
2813 		return ext4_write_begin(file, mapping, pos,
2814 					len, flags, pagep, fsdata);
2815 	}
2816 	*fsdata = (void *)0;
2817 
2818 	trace_mark(ext4_da_write_begin,
2819 		   "dev %s ino %lu pos %llu len %u flags %u",
2820 		   inode->i_sb->s_id, inode->i_ino,
2821 		   (unsigned long long) pos, len, flags);
2822 retry:
2823 	/*
2824 	 * With delayed allocation, we don't log the i_disksize update
2825 	 * if there is delayed block allocation. But we still need
2826 	 * to journal the i_disksize update if the write extends the end
2827 	 * of the file and lands in an already mapped buffer.
2828 	 */
2829 	handle = ext4_journal_start(inode, 1);
2830 	if (IS_ERR(handle)) {
2831 		ret = PTR_ERR(handle);
2832 		goto out;
2833 	}
2834 	/* We cannot recurse into the filesystem as the transaction is already
2835 	 * started */
2836 	flags |= AOP_FLAG_NOFS;
2837 
2838 	page = grab_cache_page_write_begin(mapping, index, flags);
2839 	if (!page) {
2840 		ext4_journal_stop(handle);
2841 		ret = -ENOMEM;
2842 		goto out;
2843 	}
2844 	*pagep = page;
2845 
2846 	ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
2847 				ext4_da_get_block_prep);
2848 	if (ret < 0) {
2849 		unlock_page(page);
2850 		ext4_journal_stop(handle);
2851 		page_cache_release(page);
2852 		/*
2853 		 * block_write_begin may have instantiated a few blocks
2854 		 * outside i_size.  Trim these off again. Don't need
2855 		 * i_size_read because we hold i_mutex.
2856 */ 2857 if (pos + len > inode->i_size) 2858 vmtruncate(inode, inode->i_size); 2859 } 2860 2861 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 2862 goto retry; 2863 out: 2864 return ret; 2865 } 2866 2867 /* 2868 * Check if we should update i_disksize 2869 * when write to the end of file but not require block allocation 2870 */ 2871 static int ext4_da_should_update_i_disksize(struct page *page, 2872 unsigned long offset) 2873 { 2874 struct buffer_head *bh; 2875 struct inode *inode = page->mapping->host; 2876 unsigned int idx; 2877 int i; 2878 2879 bh = page_buffers(page); 2880 idx = offset >> inode->i_blkbits; 2881 2882 for (i = 0; i < idx; i++) 2883 bh = bh->b_this_page; 2884 2885 if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh)) 2886 return 0; 2887 return 1; 2888 } 2889 2890 static int ext4_da_write_end(struct file *file, 2891 struct address_space *mapping, 2892 loff_t pos, unsigned len, unsigned copied, 2893 struct page *page, void *fsdata) 2894 { 2895 struct inode *inode = mapping->host; 2896 int ret = 0, ret2; 2897 handle_t *handle = ext4_journal_current_handle(); 2898 loff_t new_i_size; 2899 unsigned long start, end; 2900 int write_mode = (int)(unsigned long)fsdata; 2901 2902 if (write_mode == FALL_BACK_TO_NONDELALLOC) { 2903 if (ext4_should_order_data(inode)) { 2904 return ext4_ordered_write_end(file, mapping, pos, 2905 len, copied, page, fsdata); 2906 } else if (ext4_should_writeback_data(inode)) { 2907 return ext4_writeback_write_end(file, mapping, pos, 2908 len, copied, page, fsdata); 2909 } else { 2910 BUG(); 2911 } 2912 } 2913 2914 trace_mark(ext4_da_write_end, 2915 "dev %s ino %lu pos %llu len %u copied %u", 2916 inode->i_sb->s_id, inode->i_ino, 2917 (unsigned long long) pos, len, copied); 2918 start = pos & (PAGE_CACHE_SIZE - 1); 2919 end = start + copied - 1; 2920 2921 /* 2922 * generic_write_end() will run mark_inode_dirty() if i_size 2923 * changes. So let's piggyback the i_disksize mark_inode_dirty 2924 * into that. 2925 */ 2926 2927 new_i_size = pos + copied; 2928 if (new_i_size > EXT4_I(inode)->i_disksize) { 2929 if (ext4_da_should_update_i_disksize(page, end)) { 2930 down_write(&EXT4_I(inode)->i_data_sem); 2931 if (new_i_size > EXT4_I(inode)->i_disksize) { 2932 /* 2933 * Updating i_disksize when extending file 2934 * without needing block allocation 2935 */ 2936 if (ext4_should_order_data(inode)) 2937 ret = ext4_jbd2_file_inode(handle, 2938 inode); 2939 2940 EXT4_I(inode)->i_disksize = new_i_size; 2941 } 2942 up_write(&EXT4_I(inode)->i_data_sem); 2943 /* We need to mark inode dirty even if 2944 * new_i_size is less that inode->i_size 2945 * bu greater than i_disksize.(hint delalloc) 2946 */ 2947 ext4_mark_inode_dirty(handle, inode); 2948 } 2949 } 2950 ret2 = generic_write_end(file, mapping, pos, len, copied, 2951 page, fsdata); 2952 copied = ret2; 2953 if (ret2 < 0) 2954 ret = ret2; 2955 ret2 = ext4_journal_stop(handle); 2956 if (!ret) 2957 ret = ret2; 2958 2959 return ret ? ret : copied; 2960 } 2961 2962 static void ext4_da_invalidatepage(struct page *page, unsigned long offset) 2963 { 2964 /* 2965 * Drop reserved blocks 2966 */ 2967 BUG_ON(!PageLocked(page)); 2968 if (!page_has_buffers(page)) 2969 goto out; 2970 2971 ext4_da_page_release_reservation(page, offset); 2972 2973 out: 2974 ext4_invalidatepage(page, offset); 2975 2976 return; 2977 } 2978 2979 /* 2980 * Force all delayed allocation blocks to be allocated for a given inode. 
2981 */ 2982 int ext4_alloc_da_blocks(struct inode *inode) 2983 { 2984 if (!EXT4_I(inode)->i_reserved_data_blocks && 2985 !EXT4_I(inode)->i_reserved_meta_blocks) 2986 return 0; 2987 2988 /* 2989 * We do something simple for now. The filemap_flush() will 2990 * also start triggering a write of the data blocks, which is 2991 * not strictly speaking necessary (and for users of 2992 * laptop_mode, not even desirable). However, to do otherwise 2993 * would require replicating code paths in: 2994 * 2995 * ext4_da_writepages() -> 2996 * write_cache_pages() ---> (via passed in callback function) 2997 * __mpage_da_writepage() --> 2998 * mpage_add_bh_to_extent() 2999 * mpage_da_map_blocks() 3000 * 3001 * The problem is that write_cache_pages(), located in 3002 * mm/page-writeback.c, marks pages clean in preparation for 3003 * doing I/O, which is not desirable if we're not planning on 3004 * doing I/O at all. 3005 * 3006 * We could call write_cache_pages(), and then redirty all of 3007 * the pages by calling redirty_page_for_writeback() but that 3008 * would be ugly in the extreme. So instead we would need to 3009 * replicate parts of the code in the above functions, 3010 * simplifying them becuase we wouldn't actually intend to 3011 * write out the pages, but rather only collect contiguous 3012 * logical block extents, call the multi-block allocator, and 3013 * then update the buffer heads with the block allocations. 3014 * 3015 * For now, though, we'll cheat by calling filemap_flush(), 3016 * which will map the blocks, and start the I/O, but not 3017 * actually wait for the I/O to complete. 3018 */ 3019 return filemap_flush(inode->i_mapping); 3020 } 3021 3022 /* 3023 * bmap() is special. It gets used by applications such as lilo and by 3024 * the swapper to find the on-disk block of a specific piece of data. 3025 * 3026 * Naturally, this is dangerous if the block concerned is still in the 3027 * journal. If somebody makes a swapfile on an ext4 data-journaling 3028 * filesystem and enables swap, then they may get a nasty shock when the 3029 * data getting swapped to that swapfile suddenly gets overwritten by 3030 * the original zero's written out previously to the journal and 3031 * awaiting writeback in the kernel's buffer cache. 3032 * 3033 * So, if we see any bmap calls here on a modified, data-journaled file, 3034 * take extra steps to flush any blocks which might be in the cache. 3035 */ 3036 static sector_t ext4_bmap(struct address_space *mapping, sector_t block) 3037 { 3038 struct inode *inode = mapping->host; 3039 journal_t *journal; 3040 int err; 3041 3042 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && 3043 test_opt(inode->i_sb, DELALLOC)) { 3044 /* 3045 * With delalloc we want to sync the file 3046 * so that we can make sure we allocate 3047 * blocks for file 3048 */ 3049 filemap_write_and_wait(mapping); 3050 } 3051 3052 if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { 3053 /* 3054 * This is a REALLY heavyweight approach, but the use of 3055 * bmap on dirty files is expected to be extremely rare: 3056 * only if we run lilo or swapon on a freshly made file 3057 * do we expect this to happen. 3058 * 3059 * (bmap requires CAP_SYS_RAWIO so this does not 3060 * represent an unprivileged user DOS attack --- we'd be 3061 * in trouble if mortal users could trigger this path at 3062 * will.) 3063 * 3064 * NB. EXT4_STATE_JDATA is not set on files other than 3065 * regular files. 
If somebody wants to bmap a directory
3066 		 * or symlink and gets confused because the buffer
3067 		 * hasn't yet been flushed to disk, they deserve
3068 		 * everything they get.
3069 		 */
3070 
3071 		EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA;
3072 		journal = EXT4_JOURNAL(inode);
3073 		jbd2_journal_lock_updates(journal);
3074 		err = jbd2_journal_flush(journal);
3075 		jbd2_journal_unlock_updates(journal);
3076 
3077 		if (err)
3078 			return 0;
3079 	}
3080 
3081 	return generic_block_bmap(mapping, block, ext4_get_block);
3082 }
3083 
3084 static int bget_one(handle_t *handle, struct buffer_head *bh)
3085 {
3086 	get_bh(bh);
3087 	return 0;
3088 }
3089 
3090 static int bput_one(handle_t *handle, struct buffer_head *bh)
3091 {
3092 	put_bh(bh);
3093 	return 0;
3094 }
3095 
3096 /*
3097  * Note that we don't need to start a transaction unless we're journaling data
3098  * because we should have holes filled from ext4_page_mkwrite(). We don't even
3099  * need to file the inode to the transaction's list in ordered mode because if
3100  * we are writing back data added by write(), the inode is already there and if
3101  * we are writing back data modified via mmap(), no one guarantees in which
3102  * transaction the data will hit the disk. In case we are journaling data, we
3103  * cannot start a transaction directly because a transaction start ranks above the
3104  * page lock, so we have to do some magic.
3105  *
3106  * In all journaling modes block_write_full_page() will start the I/O.
3107  *
3108  * Problem:
3109  *
3110  * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
3111  * ext4_writepage()
3112  *
3113  * Similar for:
3114  *
3115  * ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ...
3116  *
3117  * Same applies to ext4_get_block().  We will deadlock on various things like
3118  * lock_journal and i_data_sem.
3119  *
3120  * Setting PF_MEMALLOC here doesn't work - too many internal memory
3121  * allocations fail.
3122  *
3123  * 16May01: If we're reentered then journal_current_handle() will be
3124  * non-zero. We simply *return*.
3125  *
3126  * 1 July 2001: @@@ FIXME:
3127  * In journalled data mode, a data buffer may be metadata against the
3128  * current transaction.  But the same file is part of a shared mapping
3129  * and someone does a writepage() on it.
3130  *
3131  * We will move the buffer onto the async_data list, but *after* it has
3132  * been dirtied. So there's a small window where we have dirty data on
3133  * BJ_Metadata.
3134  *
3135  * Note that this only applies to the last partial page in the file.  The
3136  * bit which block_write_full_page() uses prepare/commit for. (That's
3137  * broken code anyway: it's wrong for msync()).
3138  *
3139  * It's a rare case: affects the final partial page, for journalled data
3140  * where the file is subject to both write() and writepage() in the same
3141  * transaction.  To fix it we'll need a custom block_write_full_page().
3142  * We'll probably need that anyway for journalling writepage() output.
3143  *
3144  * We don't honour synchronous mounts for writepage().  That would be
3145  * disastrous.  Any write() or metadata operation will sync the fs for
3146  * us.
3147 * 3148 */ 3149 static int __ext4_normal_writepage(struct page *page, 3150 struct writeback_control *wbc) 3151 { 3152 struct inode *inode = page->mapping->host; 3153 3154 if (test_opt(inode->i_sb, NOBH)) 3155 return nobh_writepage(page, noalloc_get_block_write, wbc); 3156 else 3157 return block_write_full_page(page, noalloc_get_block_write, 3158 wbc); 3159 } 3160 3161 static int ext4_normal_writepage(struct page *page, 3162 struct writeback_control *wbc) 3163 { 3164 struct inode *inode = page->mapping->host; 3165 loff_t size = i_size_read(inode); 3166 loff_t len; 3167 3168 trace_mark(ext4_normal_writepage, 3169 "dev %s ino %lu page_index %lu", 3170 inode->i_sb->s_id, inode->i_ino, page->index); 3171 J_ASSERT(PageLocked(page)); 3172 if (page->index == size >> PAGE_CACHE_SHIFT) 3173 len = size & ~PAGE_CACHE_MASK; 3174 else 3175 len = PAGE_CACHE_SIZE; 3176 3177 if (page_has_buffers(page)) { 3178 /* if page has buffers it should all be mapped 3179 * and allocated. If there are not buffers attached 3180 * to the page we know the page is dirty but it lost 3181 * buffers. That means that at some moment in time 3182 * after write_begin() / write_end() has been called 3183 * all buffers have been clean and thus they must have been 3184 * written at least once. So they are all mapped and we can 3185 * happily proceed with mapping them and writing the page. 3186 */ 3187 BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 3188 ext4_bh_unmapped_or_delay)); 3189 } 3190 3191 if (!ext4_journal_current_handle()) 3192 return __ext4_normal_writepage(page, wbc); 3193 3194 redirty_page_for_writepage(wbc, page); 3195 unlock_page(page); 3196 return 0; 3197 } 3198 3199 static int __ext4_journalled_writepage(struct page *page, 3200 struct writeback_control *wbc) 3201 { 3202 struct address_space *mapping = page->mapping; 3203 struct inode *inode = mapping->host; 3204 struct buffer_head *page_bufs; 3205 handle_t *handle = NULL; 3206 int ret = 0; 3207 int err; 3208 3209 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, 3210 noalloc_get_block_write); 3211 if (ret != 0) 3212 goto out_unlock; 3213 3214 page_bufs = page_buffers(page); 3215 walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, 3216 bget_one); 3217 /* As soon as we unlock the page, it can go away, but we have 3218 * references to buffers so we are safe */ 3219 unlock_page(page); 3220 3221 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); 3222 if (IS_ERR(handle)) { 3223 ret = PTR_ERR(handle); 3224 goto out; 3225 } 3226 3227 ret = walk_page_buffers(handle, page_bufs, 0, 3228 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); 3229 3230 err = walk_page_buffers(handle, page_bufs, 0, 3231 PAGE_CACHE_SIZE, NULL, write_end_fn); 3232 if (ret == 0) 3233 ret = err; 3234 err = ext4_journal_stop(handle); 3235 if (!ret) 3236 ret = err; 3237 3238 walk_page_buffers(handle, page_bufs, 0, 3239 PAGE_CACHE_SIZE, NULL, bput_one); 3240 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; 3241 goto out; 3242 3243 out_unlock: 3244 unlock_page(page); 3245 out: 3246 return ret; 3247 } 3248 3249 static int ext4_journalled_writepage(struct page *page, 3250 struct writeback_control *wbc) 3251 { 3252 struct inode *inode = page->mapping->host; 3253 loff_t size = i_size_read(inode); 3254 loff_t len; 3255 3256 trace_mark(ext4_journalled_writepage, 3257 "dev %s ino %lu page_index %lu", 3258 inode->i_sb->s_id, inode->i_ino, page->index); 3259 J_ASSERT(PageLocked(page)); 3260 if (page->index == size >> PAGE_CACHE_SHIFT) 3261 len = size & 
~PAGE_CACHE_MASK; 3262 else 3263 len = PAGE_CACHE_SIZE; 3264 3265 if (page_has_buffers(page)) { 3266 /* if page has buffers it should all be mapped 3267 * and allocated. If there are not buffers attached 3268 * to the page we know the page is dirty but it lost 3269 * buffers. That means that at some moment in time 3270 * after write_begin() / write_end() has been called 3271 * all buffers have been clean and thus they must have been 3272 * written at least once. So they are all mapped and we can 3273 * happily proceed with mapping them and writing the page. 3274 */ 3275 BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 3276 ext4_bh_unmapped_or_delay)); 3277 } 3278 3279 if (ext4_journal_current_handle()) 3280 goto no_write; 3281 3282 if (PageChecked(page)) { 3283 /* 3284 * It's mmapped pagecache. Add buffers and journal it. There 3285 * doesn't seem much point in redirtying the page here. 3286 */ 3287 ClearPageChecked(page); 3288 return __ext4_journalled_writepage(page, wbc); 3289 } else { 3290 /* 3291 * It may be a page full of checkpoint-mode buffers. We don't 3292 * really know unless we go poke around in the buffer_heads. 3293 * But block_write_full_page will do the right thing. 3294 */ 3295 return block_write_full_page(page, noalloc_get_block_write, 3296 wbc); 3297 } 3298 no_write: 3299 redirty_page_for_writepage(wbc, page); 3300 unlock_page(page); 3301 return 0; 3302 } 3303 3304 static int ext4_readpage(struct file *file, struct page *page) 3305 { 3306 return mpage_readpage(page, ext4_get_block); 3307 } 3308 3309 static int 3310 ext4_readpages(struct file *file, struct address_space *mapping, 3311 struct list_head *pages, unsigned nr_pages) 3312 { 3313 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 3314 } 3315 3316 static void ext4_invalidatepage(struct page *page, unsigned long offset) 3317 { 3318 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3319 3320 /* 3321 * If it's a full truncate we just forget about the pending dirtying 3322 */ 3323 if (offset == 0) 3324 ClearPageChecked(page); 3325 3326 if (journal) 3327 jbd2_journal_invalidatepage(journal, page, offset); 3328 else 3329 block_invalidatepage(page, offset); 3330 } 3331 3332 static int ext4_releasepage(struct page *page, gfp_t wait) 3333 { 3334 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3335 3336 WARN_ON(PageChecked(page)); 3337 if (!page_has_buffers(page)) 3338 return 0; 3339 if (journal) 3340 return jbd2_journal_try_to_free_buffers(journal, page, wait); 3341 else 3342 return try_to_free_buffers(page); 3343 } 3344 3345 /* 3346 * If the O_DIRECT write will extend the file then add this inode to the 3347 * orphan list. So recovery will truncate it back to the original size 3348 * if the machine crashes during the write. 3349 * 3350 * If the O_DIRECT write is intantiating holes inside i_size and the machine 3351 * crashes then stale disk data _may_ be exposed inside the file. But current 3352 * VFS code falls back into buffered path in that case so we are safe. 
3353 */ 3354 static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, 3355 const struct iovec *iov, loff_t offset, 3356 unsigned long nr_segs) 3357 { 3358 struct file *file = iocb->ki_filp; 3359 struct inode *inode = file->f_mapping->host; 3360 struct ext4_inode_info *ei = EXT4_I(inode); 3361 handle_t *handle; 3362 ssize_t ret; 3363 int orphan = 0; 3364 size_t count = iov_length(iov, nr_segs); 3365 3366 if (rw == WRITE) { 3367 loff_t final_size = offset + count; 3368 3369 if (final_size > inode->i_size) { 3370 /* Credits for sb + inode write */ 3371 handle = ext4_journal_start(inode, 2); 3372 if (IS_ERR(handle)) { 3373 ret = PTR_ERR(handle); 3374 goto out; 3375 } 3376 ret = ext4_orphan_add(handle, inode); 3377 if (ret) { 3378 ext4_journal_stop(handle); 3379 goto out; 3380 } 3381 orphan = 1; 3382 ei->i_disksize = inode->i_size; 3383 ext4_journal_stop(handle); 3384 } 3385 } 3386 3387 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 3388 offset, nr_segs, 3389 ext4_get_block, NULL); 3390 3391 if (orphan) { 3392 int err; 3393 3394 /* Credits for sb + inode write */ 3395 handle = ext4_journal_start(inode, 2); 3396 if (IS_ERR(handle)) { 3397 /* This is really bad luck. We've written the data 3398 * but cannot extend i_size. Bail out and pretend 3399 * the write failed... */ 3400 ret = PTR_ERR(handle); 3401 goto out; 3402 } 3403 if (inode->i_nlink) 3404 ext4_orphan_del(handle, inode); 3405 if (ret > 0) { 3406 loff_t end = offset + ret; 3407 if (end > inode->i_size) { 3408 ei->i_disksize = end; 3409 i_size_write(inode, end); 3410 /* 3411 * We're going to return a positive `ret' 3412 * here due to non-zero-length I/O, so there's 3413 * no way of reporting error returns from 3414 * ext4_mark_inode_dirty() to userspace. So 3415 * ignore it. 3416 */ 3417 ext4_mark_inode_dirty(handle, inode); 3418 } 3419 } 3420 err = ext4_journal_stop(handle); 3421 if (ret == 0) 3422 ret = err; 3423 } 3424 out: 3425 return ret; 3426 } 3427 3428 /* 3429 * Pages can be marked dirty completely asynchronously from ext4's journalling 3430 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do 3431 * much here because ->set_page_dirty is called under VFS locks. The page is 3432 * not necessarily locked. 3433 * 3434 * We cannot just dirty the page and leave attached buffers clean, because the 3435 * buffers' dirty state is "definitive". We cannot just set the buffers dirty 3436 * or jbddirty because all the journalling code will explode. 3437 * 3438 * So what we do is to mark the page "pending dirty" and next time writepage 3439 * is called, propagate that into the buffers appropriately. 
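 * In this code the "pending dirty" flag is the page's PG_checked bit:
 * ext4_journalled_set_page_dirty() below sets it with SetPageChecked(),
 * and ext4_journalled_writepage() tests and clears it to decide whether
 * the page's buffers still need to be journalled.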
3440 */ 3441 static int ext4_journalled_set_page_dirty(struct page *page) 3442 { 3443 SetPageChecked(page); 3444 return __set_page_dirty_nobuffers(page); 3445 } 3446 3447 static const struct address_space_operations ext4_ordered_aops = { 3448 .readpage = ext4_readpage, 3449 .readpages = ext4_readpages, 3450 .writepage = ext4_normal_writepage, 3451 .sync_page = block_sync_page, 3452 .write_begin = ext4_write_begin, 3453 .write_end = ext4_ordered_write_end, 3454 .bmap = ext4_bmap, 3455 .invalidatepage = ext4_invalidatepage, 3456 .releasepage = ext4_releasepage, 3457 .direct_IO = ext4_direct_IO, 3458 .migratepage = buffer_migrate_page, 3459 .is_partially_uptodate = block_is_partially_uptodate, 3460 }; 3461 3462 static const struct address_space_operations ext4_writeback_aops = { 3463 .readpage = ext4_readpage, 3464 .readpages = ext4_readpages, 3465 .writepage = ext4_normal_writepage, 3466 .sync_page = block_sync_page, 3467 .write_begin = ext4_write_begin, 3468 .write_end = ext4_writeback_write_end, 3469 .bmap = ext4_bmap, 3470 .invalidatepage = ext4_invalidatepage, 3471 .releasepage = ext4_releasepage, 3472 .direct_IO = ext4_direct_IO, 3473 .migratepage = buffer_migrate_page, 3474 .is_partially_uptodate = block_is_partially_uptodate, 3475 }; 3476 3477 static const struct address_space_operations ext4_journalled_aops = { 3478 .readpage = ext4_readpage, 3479 .readpages = ext4_readpages, 3480 .writepage = ext4_journalled_writepage, 3481 .sync_page = block_sync_page, 3482 .write_begin = ext4_write_begin, 3483 .write_end = ext4_journalled_write_end, 3484 .set_page_dirty = ext4_journalled_set_page_dirty, 3485 .bmap = ext4_bmap, 3486 .invalidatepage = ext4_invalidatepage, 3487 .releasepage = ext4_releasepage, 3488 .is_partially_uptodate = block_is_partially_uptodate, 3489 }; 3490 3491 static const struct address_space_operations ext4_da_aops = { 3492 .readpage = ext4_readpage, 3493 .readpages = ext4_readpages, 3494 .writepage = ext4_da_writepage, 3495 .writepages = ext4_da_writepages, 3496 .sync_page = block_sync_page, 3497 .write_begin = ext4_da_write_begin, 3498 .write_end = ext4_da_write_end, 3499 .bmap = ext4_bmap, 3500 .invalidatepage = ext4_da_invalidatepage, 3501 .releasepage = ext4_releasepage, 3502 .direct_IO = ext4_direct_IO, 3503 .migratepage = buffer_migrate_page, 3504 .is_partially_uptodate = block_is_partially_uptodate, 3505 }; 3506 3507 void ext4_set_aops(struct inode *inode) 3508 { 3509 if (ext4_should_order_data(inode) && 3510 test_opt(inode->i_sb, DELALLOC)) 3511 inode->i_mapping->a_ops = &ext4_da_aops; 3512 else if (ext4_should_order_data(inode)) 3513 inode->i_mapping->a_ops = &ext4_ordered_aops; 3514 else if (ext4_should_writeback_data(inode) && 3515 test_opt(inode->i_sb, DELALLOC)) 3516 inode->i_mapping->a_ops = &ext4_da_aops; 3517 else if (ext4_should_writeback_data(inode)) 3518 inode->i_mapping->a_ops = &ext4_writeback_aops; 3519 else 3520 inode->i_mapping->a_ops = &ext4_journalled_aops; 3521 } 3522 3523 /* 3524 * ext4_block_truncate_page() zeroes out a mapping from file offset `from' 3525 * up to the end of the block which corresponds to `from'. 3526 * This required during truncate. We need to physically zero the tail end 3527 * of that block so it doesn't yield old data if the file is later grown. 
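 * Once the tail has been zeroed, the buffer is made persistent according
 * to the data journaling mode: with data=journal it is journalled as
 * metadata, with data=ordered the inode is first added to the
 * transaction's ordered list and the buffer is then marked dirty, and in
 * writeback mode the buffer is simply marked dirty.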
3528 */ 3529 int ext4_block_truncate_page(handle_t *handle, 3530 struct address_space *mapping, loff_t from) 3531 { 3532 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 3533 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3534 unsigned blocksize, length, pos; 3535 ext4_lblk_t iblock; 3536 struct inode *inode = mapping->host; 3537 struct buffer_head *bh; 3538 struct page *page; 3539 int err = 0; 3540 3541 page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT); 3542 if (!page) 3543 return -EINVAL; 3544 3545 blocksize = inode->i_sb->s_blocksize; 3546 length = blocksize - (offset & (blocksize - 1)); 3547 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 3548 3549 /* 3550 * For "nobh" option, we can only work if we don't need to 3551 * read-in the page - otherwise we create buffers to do the IO. 3552 */ 3553 if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) && 3554 ext4_should_writeback_data(inode) && PageUptodate(page)) { 3555 zero_user(page, offset, length); 3556 set_page_dirty(page); 3557 goto unlock; 3558 } 3559 3560 if (!page_has_buffers(page)) 3561 create_empty_buffers(page, blocksize, 0); 3562 3563 /* Find the buffer that contains "offset" */ 3564 bh = page_buffers(page); 3565 pos = blocksize; 3566 while (offset >= pos) { 3567 bh = bh->b_this_page; 3568 iblock++; 3569 pos += blocksize; 3570 } 3571 3572 err = 0; 3573 if (buffer_freed(bh)) { 3574 BUFFER_TRACE(bh, "freed: skip"); 3575 goto unlock; 3576 } 3577 3578 if (!buffer_mapped(bh)) { 3579 BUFFER_TRACE(bh, "unmapped"); 3580 ext4_get_block(inode, iblock, bh, 0); 3581 /* unmapped? It's a hole - nothing to do */ 3582 if (!buffer_mapped(bh)) { 3583 BUFFER_TRACE(bh, "still unmapped"); 3584 goto unlock; 3585 } 3586 } 3587 3588 /* Ok, it's mapped. Make sure it's up-to-date */ 3589 if (PageUptodate(page)) 3590 set_buffer_uptodate(bh); 3591 3592 if (!buffer_uptodate(bh)) { 3593 err = -EIO; 3594 ll_rw_block(READ, 1, &bh); 3595 wait_on_buffer(bh); 3596 /* Uhhuh. Read error. Complain and punt. */ 3597 if (!buffer_uptodate(bh)) 3598 goto unlock; 3599 } 3600 3601 if (ext4_should_journal_data(inode)) { 3602 BUFFER_TRACE(bh, "get write access"); 3603 err = ext4_journal_get_write_access(handle, bh); 3604 if (err) 3605 goto unlock; 3606 } 3607 3608 zero_user(page, offset, length); 3609 3610 BUFFER_TRACE(bh, "zeroed end of block"); 3611 3612 err = 0; 3613 if (ext4_should_journal_data(inode)) { 3614 err = ext4_handle_dirty_metadata(handle, inode, bh); 3615 } else { 3616 if (ext4_should_order_data(inode)) 3617 err = ext4_jbd2_file_inode(handle, inode); 3618 mark_buffer_dirty(bh); 3619 } 3620 3621 unlock: 3622 unlock_page(page); 3623 page_cache_release(page); 3624 return err; 3625 } 3626 3627 /* 3628 * Probably it should be a library function... search for first non-zero word 3629 * or memcmp with zero_page, whatever is better for particular architecture. 3630 * Linus? 3631 */ 3632 static inline int all_zeroes(__le32 *p, __le32 *q) 3633 { 3634 while (p < q) 3635 if (*p++) 3636 return 0; 3637 return 1; 3638 } 3639 3640 /** 3641 * ext4_find_shared - find the indirect blocks for partial truncation. 3642 * @inode: inode in question 3643 * @depth: depth of the affected branch 3644 * @offsets: offsets of pointers in that branch (see ext4_block_to_path) 3645 * @chain: place to store the pointers to partial indirect blocks 3646 * @top: place to the (detached) top of branch 3647 * 3648 * This is a helper function used by ext4_truncate(). 
3649 * 3650 * When we do truncate() we may have to clean the ends of several 3651 * indirect blocks but leave the blocks themselves alive. A block is 3652 * partially truncated if some data below the new i_size is referred 3653 * from it (and it is on the path to the first completely truncated 3654 * data block, indeed). We have to free the top of that path along 3655 * with everything to the right of the path. Since no allocation 3656 * past the truncation point is possible until ext4_truncate() 3657 * finishes, we may safely do the latter, but the top of the branch may 3658 * require special attention - pageout below the truncation point 3659 * might try to populate it. 3660 * 3661 * We atomically detach the top of branch from the tree, store the 3662 * block number of its root in *@top, pointers to buffer_heads of 3663 * partially truncated blocks - in @chain[].bh and pointers to 3664 * their last elements that should not be removed - in 3665 * @chain[].p. Return value is the pointer to last filled element 3666 * of @chain. 3667 * 3668 * The actual freeing of the subtrees is left to the caller: 3669 * a) free the subtree starting from *@top 3670 * b) free the subtrees whose roots are stored in 3671 * (@chain[i].p+1 .. end of @chain[i].bh->b_data) 3672 * c) free the subtrees growing from the inode past the @chain[0]. 3673 * (no partially truncated stuff there). */ 3674 3675 static Indirect *ext4_find_shared(struct inode *inode, int depth, 3676 ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top) 3677 { 3678 Indirect *partial, *p; 3679 int k, err; 3680 3681 *top = 0; 3682 /* Make k index the deepest non-null offset + 1 */ 3683 for (k = depth; k > 1 && !offsets[k-1]; k--) 3684 ; 3685 partial = ext4_get_branch(inode, k, offsets, chain, &err); 3686 /* Writer: pointers */ 3687 if (!partial) 3688 partial = chain + k-1; 3689 /* 3690 * If the branch acquired continuation since we've looked at it - 3691 * fine, it should all survive and (new) top doesn't belong to us. 3692 */ 3693 if (!partial->key && *partial->p) 3694 /* Writer: end */ 3695 goto no_top; 3696 for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) 3697 ; 3698 /* 3699 * OK, we've found the last block that must survive. The rest of our 3700 * branch should be detached before unlocking. However, if that rest 3701 * of branch is all ours and does not grow immediately from the inode 3702 * it's easier to cheat and just decrement partial->p. 3703 */ 3704 if (p == chain + k - 1 && p > chain) { 3705 p->p--; 3706 } else { 3707 *top = *p->p; 3708 /* Nope, don't do this in ext4. Must leave the tree intact */ 3709 #if 0 3710 *p->p = 0; 3711 #endif 3712 } 3713 /* Writer: end */ 3714 3715 while (partial > p) { 3716 brelse(partial->bh); 3717 partial--; 3718 } 3719 no_top: 3720 return partial; 3721 } 3722 3723 /* 3724 * Zero a number of block pointers in either an inode or an indirect block. 3725 * If we restart the transaction we must again get write access to the 3726 * indirect block for further modification. 3727 * 3728 * We release `count' blocks on disk, but (last - first) may be greater 3729 * than `count' because there can be holes in there.
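 * (Holes appear as zero entries in the pointer range; the forget loop below
 * skips them, and ext4_free_blocks() is only asked to release the `count'
 * blocks that are really allocated.)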
3730 */ 3731 static void ext4_clear_blocks(handle_t *handle, struct inode *inode, 3732 struct buffer_head *bh, ext4_fsblk_t block_to_free, 3733 unsigned long count, __le32 *first, __le32 *last) 3734 { 3735 __le32 *p; 3736 if (try_to_extend_transaction(handle, inode)) { 3737 if (bh) { 3738 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 3739 ext4_handle_dirty_metadata(handle, inode, bh); 3740 } 3741 ext4_mark_inode_dirty(handle, inode); 3742 ext4_journal_test_restart(handle, inode); 3743 if (bh) { 3744 BUFFER_TRACE(bh, "retaking write access"); 3745 ext4_journal_get_write_access(handle, bh); 3746 } 3747 } 3748 3749 /* 3750 * Any buffers which are on the journal will be in memory. We find 3751 * them on the hash table so jbd2_journal_revoke() will run jbd2_journal_forget() 3752 * on them. We've already detached each block from the file, so 3753 * bforget() in jbd2_journal_forget() should be safe. 3754 * 3755 * AKPM: turn on bforget in jbd2_journal_forget()!!! 3756 */ 3757 for (p = first; p < last; p++) { 3758 u32 nr = le32_to_cpu(*p); 3759 if (nr) { 3760 struct buffer_head *tbh; 3761 3762 *p = 0; 3763 tbh = sb_find_get_block(inode->i_sb, nr); 3764 ext4_forget(handle, 0, inode, tbh, nr); 3765 } 3766 } 3767 3768 ext4_free_blocks(handle, inode, block_to_free, count, 0); 3769 } 3770 3771 /** 3772 * ext4_free_data - free a list of data blocks 3773 * @handle: handle for this transaction 3774 * @inode: inode we are dealing with 3775 * @this_bh: indirect buffer_head which contains *@first and *@last 3776 * @first: array of block numbers 3777 * @last: points immediately past the end of array 3778 * 3779 * We are freeing all blocks refered from that array (numbers are stored as 3780 * little-endian 32-bit) and updating @inode->i_blocks appropriately. 3781 * 3782 * We accumulate contiguous runs of blocks to free. Conveniently, if these 3783 * blocks are contiguous then releasing them at one time will only affect one 3784 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't 3785 * actually use a lot of journal space. 3786 * 3787 * @this_bh will be %NULL if @first and @last point into the inode's direct 3788 * block pointers. 3789 */ 3790 static void ext4_free_data(handle_t *handle, struct inode *inode, 3791 struct buffer_head *this_bh, 3792 __le32 *first, __le32 *last) 3793 { 3794 ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ 3795 unsigned long count = 0; /* Number of blocks in the run */ 3796 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind 3797 corresponding to 3798 block_to_free */ 3799 ext4_fsblk_t nr; /* Current block # */ 3800 __le32 *p; /* Pointer into inode/ind 3801 for current block */ 3802 int err; 3803 3804 if (this_bh) { /* For indirect block */ 3805 BUFFER_TRACE(this_bh, "get_write_access"); 3806 err = ext4_journal_get_write_access(handle, this_bh); 3807 /* Important: if we can't update the indirect pointers 3808 * to the blocks, we can't free them. 
*/ 3809 if (err) 3810 return; 3811 } 3812 3813 for (p = first; p < last; p++) { 3814 nr = le32_to_cpu(*p); 3815 if (nr) { 3816 /* accumulate blocks to free if they're contiguous */ 3817 if (count == 0) { 3818 block_to_free = nr; 3819 block_to_free_p = p; 3820 count = 1; 3821 } else if (nr == block_to_free + count) { 3822 count++; 3823 } else { 3824 ext4_clear_blocks(handle, inode, this_bh, 3825 block_to_free, 3826 count, block_to_free_p, p); 3827 block_to_free = nr; 3828 block_to_free_p = p; 3829 count = 1; 3830 } 3831 } 3832 } 3833 3834 if (count > 0) 3835 ext4_clear_blocks(handle, inode, this_bh, block_to_free, 3836 count, block_to_free_p, p); 3837 3838 if (this_bh) { 3839 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); 3840 3841 /* 3842 * The buffer head should have an attached journal head at this 3843 * point. However, if the data is corrupted and an indirect 3844 * block pointed to itself, it would have been detached when 3845 * the block was cleared. Check for this instead of OOPSing. 3846 */ 3847 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) 3848 ext4_handle_dirty_metadata(handle, inode, this_bh); 3849 else 3850 ext4_error(inode->i_sb, __func__, 3851 "circular indirect block detected, " 3852 "inode=%lu, block=%llu", 3853 inode->i_ino, 3854 (unsigned long long) this_bh->b_blocknr); 3855 } 3856 } 3857 3858 /** 3859 * ext4_free_branches - free an array of branches 3860 * @handle: JBD handle for this transaction 3861 * @inode: inode we are dealing with 3862 * @parent_bh: the buffer_head which contains *@first and *@last 3863 * @first: array of block numbers 3864 * @last: pointer immediately past the end of array 3865 * @depth: depth of the branches to free 3866 * 3867 * We are freeing all blocks refered from these branches (numbers are 3868 * stored as little-endian 32-bit) and updating @inode->i_blocks 3869 * appropriately. 3870 */ 3871 static void ext4_free_branches(handle_t *handle, struct inode *inode, 3872 struct buffer_head *parent_bh, 3873 __le32 *first, __le32 *last, int depth) 3874 { 3875 ext4_fsblk_t nr; 3876 __le32 *p; 3877 3878 if (ext4_handle_is_aborted(handle)) 3879 return; 3880 3881 if (depth--) { 3882 struct buffer_head *bh; 3883 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); 3884 p = last; 3885 while (--p >= first) { 3886 nr = le32_to_cpu(*p); 3887 if (!nr) 3888 continue; /* A hole */ 3889 3890 /* Go read the buffer for the next level down */ 3891 bh = sb_bread(inode->i_sb, nr); 3892 3893 /* 3894 * A read failure? Report error and clear slot 3895 * (should be rare). 3896 */ 3897 if (!bh) { 3898 ext4_error(inode->i_sb, "ext4_free_branches", 3899 "Read failure, inode=%lu, block=%llu", 3900 inode->i_ino, nr); 3901 continue; 3902 } 3903 3904 /* This zaps the entire block. Bottom up. */ 3905 BUFFER_TRACE(bh, "free child branches"); 3906 ext4_free_branches(handle, inode, bh, 3907 (__le32 *) bh->b_data, 3908 (__le32 *) bh->b_data + addr_per_block, 3909 depth); 3910 3911 /* 3912 * We've probably journalled the indirect block several 3913 * times during the truncate. But it's no longer 3914 * needed and we now drop it from the transaction via 3915 * jbd2_journal_revoke(). 3916 * 3917 * That's easy if it's exclusively part of this 3918 * transaction. But if it's part of the committing 3919 * transaction then jbd2_journal_forget() will simply 3920 * brelse() it. That means that if the underlying 3921 * block is reallocated in ext4_get_block(), 3922 * unmap_underlying_metadata() will find this block 3923 * and will try to get rid of it. damn, damn. 
3924 * 3925 * If this block has already been committed to the 3926 * journal, a revoke record will be written. And 3927 * revoke records must be emitted *before* clearing 3928 * this block's bit in the bitmaps. 3929 */ 3930 ext4_forget(handle, 1, inode, bh, bh->b_blocknr); 3931 3932 /* 3933 * Everything below this this pointer has been 3934 * released. Now let this top-of-subtree go. 3935 * 3936 * We want the freeing of this indirect block to be 3937 * atomic in the journal with the updating of the 3938 * bitmap block which owns it. So make some room in 3939 * the journal. 3940 * 3941 * We zero the parent pointer *after* freeing its 3942 * pointee in the bitmaps, so if extend_transaction() 3943 * for some reason fails to put the bitmap changes and 3944 * the release into the same transaction, recovery 3945 * will merely complain about releasing a free block, 3946 * rather than leaking blocks. 3947 */ 3948 if (ext4_handle_is_aborted(handle)) 3949 return; 3950 if (try_to_extend_transaction(handle, inode)) { 3951 ext4_mark_inode_dirty(handle, inode); 3952 ext4_journal_test_restart(handle, inode); 3953 } 3954 3955 ext4_free_blocks(handle, inode, nr, 1, 1); 3956 3957 if (parent_bh) { 3958 /* 3959 * The block which we have just freed is 3960 * pointed to by an indirect block: journal it 3961 */ 3962 BUFFER_TRACE(parent_bh, "get_write_access"); 3963 if (!ext4_journal_get_write_access(handle, 3964 parent_bh)){ 3965 *p = 0; 3966 BUFFER_TRACE(parent_bh, 3967 "call ext4_handle_dirty_metadata"); 3968 ext4_handle_dirty_metadata(handle, 3969 inode, 3970 parent_bh); 3971 } 3972 } 3973 } 3974 } else { 3975 /* We have reached the bottom of the tree. */ 3976 BUFFER_TRACE(parent_bh, "free data blocks"); 3977 ext4_free_data(handle, inode, parent_bh, first, last); 3978 } 3979 } 3980 3981 int ext4_can_truncate(struct inode *inode) 3982 { 3983 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 3984 return 0; 3985 if (S_ISREG(inode->i_mode)) 3986 return 1; 3987 if (S_ISDIR(inode->i_mode)) 3988 return 1; 3989 if (S_ISLNK(inode->i_mode)) 3990 return !ext4_inode_is_fast_symlink(inode); 3991 return 0; 3992 } 3993 3994 /* 3995 * ext4_truncate() 3996 * 3997 * We block out ext4_get_block() block instantiations across the entire 3998 * transaction, and VFS/VM ensures that ext4_truncate() cannot run 3999 * simultaneously on behalf of the same inode. 4000 * 4001 * As we work through the truncate and commmit bits of it to the journal there 4002 * is one core, guiding principle: the file's tree must always be consistent on 4003 * disk. We must be able to restart the truncate after a crash. 4004 * 4005 * The file's tree may be transiently inconsistent in memory (although it 4006 * probably isn't), but whenever we close off and commit a journal transaction, 4007 * the contents of (the filesystem + the journal) must be consistent and 4008 * restartable. It's pretty simple, really: bottom up, right to left (although 4009 * left-to-right works OK too). 4010 * 4011 * Note that at recovery time, journal replay occurs *before* the restart of 4012 * truncate against the orphan inode list. 4013 * 4014 * The committed inode has the new, desired i_size (which is the same as 4015 * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see 4016 * that this inode's truncate did not complete and it will again call 4017 * ext4_truncate() to have another go. So there will be instantiated blocks 4018 * to the right of the truncation point in a crashed ext4 filesystem. 
But 4019 * that's fine - as long as they are linked from the inode, the post-crash 4020 * ext4_truncate() run will find them and release them. 4021 */ 4022 void ext4_truncate(struct inode *inode) 4023 { 4024 handle_t *handle; 4025 struct ext4_inode_info *ei = EXT4_I(inode); 4026 __le32 *i_data = ei->i_data; 4027 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); 4028 struct address_space *mapping = inode->i_mapping; 4029 ext4_lblk_t offsets[4]; 4030 Indirect chain[4]; 4031 Indirect *partial; 4032 __le32 nr = 0; 4033 int n; 4034 ext4_lblk_t last_block; 4035 unsigned blocksize = inode->i_sb->s_blocksize; 4036 4037 if (!ext4_can_truncate(inode)) 4038 return; 4039 4040 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) 4041 ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; 4042 4043 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 4044 ext4_ext_truncate(inode); 4045 return; 4046 } 4047 4048 handle = start_transaction(inode); 4049 if (IS_ERR(handle)) 4050 return; /* AKPM: return what? */ 4051 4052 last_block = (inode->i_size + blocksize-1) 4053 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 4054 4055 if (inode->i_size & (blocksize - 1)) 4056 if (ext4_block_truncate_page(handle, mapping, inode->i_size)) 4057 goto out_stop; 4058 4059 n = ext4_block_to_path(inode, last_block, offsets, NULL); 4060 if (n == 0) 4061 goto out_stop; /* error */ 4062 4063 /* 4064 * OK. This truncate is going to happen. We add the inode to the 4065 * orphan list, so that if this truncate spans multiple transactions, 4066 * and we crash, we will resume the truncate when the filesystem 4067 * recovers. It also marks the inode dirty, to catch the new size. 4068 * 4069 * Implication: the file must always be in a sane, consistent 4070 * truncatable state while each transaction commits. 4071 */ 4072 if (ext4_orphan_add(handle, inode)) 4073 goto out_stop; 4074 4075 /* 4076 * From here we block out all ext4_get_block() callers who want to 4077 * modify the block allocation tree. 4078 */ 4079 down_write(&ei->i_data_sem); 4080 4081 ext4_discard_preallocations(inode); 4082 4083 /* 4084 * The orphan list entry will now protect us from any crash which 4085 * occurs before the truncate completes, so it is now safe to propagate 4086 * the new, shorter inode size (held for now in i_size) into the 4087 * on-disk inode. We do this via i_disksize, which is the value which 4088 * ext4 *really* writes onto the disk inode. 4089 */ 4090 ei->i_disksize = inode->i_size; 4091 4092 if (n == 1) { /* direct blocks */ 4093 ext4_free_data(handle, inode, NULL, i_data+offsets[0], 4094 i_data + EXT4_NDIR_BLOCKS); 4095 goto do_indirects; 4096 } 4097 4098 partial = ext4_find_shared(inode, n, offsets, chain, &nr); 4099 /* Kill the top of shared branch (not detached) */ 4100 if (nr) { 4101 if (partial == chain) { 4102 /* Shared branch grows from the inode */ 4103 ext4_free_branches(handle, inode, NULL, 4104 &nr, &nr+1, (chain+n-1) - partial); 4105 *partial->p = 0; 4106 /* 4107 * We mark the inode dirty prior to restart, 4108 * and prior to stop. No need for it here. 
4109 */ 4110 } else { 4111 /* Shared branch grows from an indirect block */ 4112 BUFFER_TRACE(partial->bh, "get_write_access"); 4113 ext4_free_branches(handle, inode, partial->bh, 4114 partial->p, 4115 partial->p+1, (chain+n-1) - partial); 4116 } 4117 } 4118 /* Clear the ends of indirect blocks on the shared branch */ 4119 while (partial > chain) { 4120 ext4_free_branches(handle, inode, partial->bh, partial->p + 1, 4121 (__le32*)partial->bh->b_data+addr_per_block, 4122 (chain+n-1) - partial); 4123 BUFFER_TRACE(partial->bh, "call brelse"); 4124 brelse (partial->bh); 4125 partial--; 4126 } 4127 do_indirects: 4128 /* Kill the remaining (whole) subtrees */ 4129 switch (offsets[0]) { 4130 default: 4131 nr = i_data[EXT4_IND_BLOCK]; 4132 if (nr) { 4133 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); 4134 i_data[EXT4_IND_BLOCK] = 0; 4135 } 4136 case EXT4_IND_BLOCK: 4137 nr = i_data[EXT4_DIND_BLOCK]; 4138 if (nr) { 4139 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); 4140 i_data[EXT4_DIND_BLOCK] = 0; 4141 } 4142 case EXT4_DIND_BLOCK: 4143 nr = i_data[EXT4_TIND_BLOCK]; 4144 if (nr) { 4145 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); 4146 i_data[EXT4_TIND_BLOCK] = 0; 4147 } 4148 case EXT4_TIND_BLOCK: 4149 ; 4150 } 4151 4152 up_write(&ei->i_data_sem); 4153 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 4154 ext4_mark_inode_dirty(handle, inode); 4155 4156 /* 4157 * In a multi-transaction truncate, we only make the final transaction 4158 * synchronous 4159 */ 4160 if (IS_SYNC(inode)) 4161 ext4_handle_sync(handle); 4162 out_stop: 4163 /* 4164 * If this was a simple ftruncate(), and the file will remain alive 4165 * then we need to clear up the orphan record which we created above. 4166 * However, if this was a real unlink then we were called by 4167 * ext4_delete_inode(), and we allow that function to clean up the 4168 * orphan info for us. 4169 */ 4170 if (inode->i_nlink) 4171 ext4_orphan_del(handle, inode); 4172 4173 ext4_journal_stop(handle); 4174 } 4175 4176 /* 4177 * ext4_get_inode_loc returns with an extra refcount against the inode's 4178 * underlying buffer_head on success. If 'in_mem' is true, we have all 4179 * data in memory that is needed to recreate the on-disk version of this 4180 * inode. 
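 * On success the raw on-disk inode can be read at iloc->bh->b_data plus
 * iloc->offset, which is what ext4_raw_inode() computes for the caller.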
4181 */ 4182 static int __ext4_get_inode_loc(struct inode *inode, 4183 struct ext4_iloc *iloc, int in_mem) 4184 { 4185 struct ext4_group_desc *gdp; 4186 struct buffer_head *bh; 4187 struct super_block *sb = inode->i_sb; 4188 ext4_fsblk_t block; 4189 int inodes_per_block, inode_offset; 4190 4191 iloc->bh = NULL; 4192 if (!ext4_valid_inum(sb, inode->i_ino)) 4193 return -EIO; 4194 4195 iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb); 4196 gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); 4197 if (!gdp) 4198 return -EIO; 4199 4200 /* 4201 * Figure out the offset within the block group inode table 4202 */ 4203 inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb)); 4204 inode_offset = ((inode->i_ino - 1) % 4205 EXT4_INODES_PER_GROUP(sb)); 4206 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); 4207 iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); 4208 4209 bh = sb_getblk(sb, block); 4210 if (!bh) { 4211 ext4_error(sb, "ext4_get_inode_loc", "unable to read " 4212 "inode block - inode=%lu, block=%llu", 4213 inode->i_ino, block); 4214 return -EIO; 4215 } 4216 if (!buffer_uptodate(bh)) { 4217 lock_buffer(bh); 4218 4219 /* 4220 * If the buffer has the write error flag, we have failed 4221 * to write out another inode in the same block. In this 4222 * case, we don't have to read the block because we may 4223 * read the old inode data successfully. 4224 */ 4225 if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) 4226 set_buffer_uptodate(bh); 4227 4228 if (buffer_uptodate(bh)) { 4229 /* someone brought it uptodate while we waited */ 4230 unlock_buffer(bh); 4231 goto has_buffer; 4232 } 4233 4234 /* 4235 * If we have all information of the inode in memory and this 4236 * is the only valid inode in the block, we need not read the 4237 * block. 4238 */ 4239 if (in_mem) { 4240 struct buffer_head *bitmap_bh; 4241 int i, start; 4242 4243 start = inode_offset & ~(inodes_per_block - 1); 4244 4245 /* Is the inode bitmap in cache? */ 4246 bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); 4247 if (!bitmap_bh) 4248 goto make_io; 4249 4250 /* 4251 * If the inode bitmap isn't in cache then the 4252 * optimisation may end up performing two reads instead 4253 * of one, so skip it. 4254 */ 4255 if (!buffer_uptodate(bitmap_bh)) { 4256 brelse(bitmap_bh); 4257 goto make_io; 4258 } 4259 for (i = start; i < start + inodes_per_block; i++) { 4260 if (i == inode_offset) 4261 continue; 4262 if (ext4_test_bit(i, bitmap_bh->b_data)) 4263 break; 4264 } 4265 brelse(bitmap_bh); 4266 if (i == start + inodes_per_block) { 4267 /* all other inodes are free, so skip I/O */ 4268 memset(bh->b_data, 0, bh->b_size); 4269 set_buffer_uptodate(bh); 4270 unlock_buffer(bh); 4271 goto has_buffer; 4272 } 4273 } 4274 4275 make_io: 4276 /* 4277 * If we need to do any I/O, try to pre-readahead extra 4278 * blocks from the inode table. 
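 * The readahead window used below is s_inode_readahead_blks blocks, aligned
 * down to that power-of-two boundary and clamped so it does not run past the
 * used portion of this group's inode table.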
4279 */ 4280 if (EXT4_SB(sb)->s_inode_readahead_blks) { 4281 ext4_fsblk_t b, end, table; 4282 unsigned num; 4283 4284 table = ext4_inode_table(sb, gdp); 4285 /* s_inode_readahead_blks is always a power of 2 */ 4286 b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); 4287 if (table > b) 4288 b = table; 4289 end = b + EXT4_SB(sb)->s_inode_readahead_blks; 4290 num = EXT4_INODES_PER_GROUP(sb); 4291 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 4292 EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) 4293 num -= ext4_itable_unused_count(sb, gdp); 4294 table += num / inodes_per_block; 4295 if (end > table) 4296 end = table; 4297 while (b <= end) 4298 sb_breadahead(sb, b++); 4299 } 4300 4301 /* 4302 * There are other valid inodes in the buffer, this inode 4303 * has in-inode xattrs, or we don't have this inode in memory. 4304 * Read the block from disk. 4305 */ 4306 get_bh(bh); 4307 bh->b_end_io = end_buffer_read_sync; 4308 submit_bh(READ_META, bh); 4309 wait_on_buffer(bh); 4310 if (!buffer_uptodate(bh)) { 4311 ext4_error(sb, __func__, 4312 "unable to read inode block - inode=%lu, " 4313 "block=%llu", inode->i_ino, block); 4314 brelse(bh); 4315 return -EIO; 4316 } 4317 } 4318 has_buffer: 4319 iloc->bh = bh; 4320 return 0; 4321 } 4322 4323 int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) 4324 { 4325 /* We have all inode data except xattrs in memory here. */ 4326 return __ext4_get_inode_loc(inode, iloc, 4327 !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)); 4328 } 4329 4330 void ext4_set_inode_flags(struct inode *inode) 4331 { 4332 unsigned int flags = EXT4_I(inode)->i_flags; 4333 4334 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); 4335 if (flags & EXT4_SYNC_FL) 4336 inode->i_flags |= S_SYNC; 4337 if (flags & EXT4_APPEND_FL) 4338 inode->i_flags |= S_APPEND; 4339 if (flags & EXT4_IMMUTABLE_FL) 4340 inode->i_flags |= S_IMMUTABLE; 4341 if (flags & EXT4_NOATIME_FL) 4342 inode->i_flags |= S_NOATIME; 4343 if (flags & EXT4_DIRSYNC_FL) 4344 inode->i_flags |= S_DIRSYNC; 4345 } 4346 4347 /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ 4348 void ext4_get_inode_flags(struct ext4_inode_info *ei) 4349 { 4350 unsigned int flags = ei->vfs_inode.i_flags; 4351 4352 ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL| 4353 EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|EXT4_DIRSYNC_FL); 4354 if (flags & S_SYNC) 4355 ei->i_flags |= EXT4_SYNC_FL; 4356 if (flags & S_APPEND) 4357 ei->i_flags |= EXT4_APPEND_FL; 4358 if (flags & S_IMMUTABLE) 4359 ei->i_flags |= EXT4_IMMUTABLE_FL; 4360 if (flags & S_NOATIME) 4361 ei->i_flags |= EXT4_NOATIME_FL; 4362 if (flags & S_DIRSYNC) 4363 ei->i_flags |= EXT4_DIRSYNC_FL; 4364 } 4365 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, 4366 struct ext4_inode_info *ei) 4367 { 4368 blkcnt_t i_blocks ; 4369 struct inode *inode = &(ei->vfs_inode); 4370 struct super_block *sb = inode->i_sb; 4371 4372 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 4373 EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { 4374 /* we are using combined 48 bit field */ 4375 i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | 4376 le32_to_cpu(raw_inode->i_blocks_lo); 4377 if (ei->i_flags & EXT4_HUGE_FILE_FL) { 4378 /* i_blocks represent file system block size */ 4379 return i_blocks << (inode->i_blkbits - 9); 4380 } else { 4381 return i_blocks; 4382 } 4383 } else { 4384 return le32_to_cpu(raw_inode->i_blocks_lo); 4385 } 4386 } 4387 4388 struct inode *ext4_iget(struct super_block *sb, unsigned long ino) 4389 { 4390 struct ext4_iloc iloc; 4391 struct ext4_inode *raw_inode; 4392 struct ext4_inode_info *ei; 4393 
struct buffer_head *bh; 4394 struct inode *inode; 4395 long ret; 4396 int block; 4397 4398 inode = iget_locked(sb, ino); 4399 if (!inode) 4400 return ERR_PTR(-ENOMEM); 4401 if (!(inode->i_state & I_NEW)) 4402 return inode; 4403 4404 ei = EXT4_I(inode); 4405 #ifdef CONFIG_EXT4_FS_POSIX_ACL 4406 ei->i_acl = EXT4_ACL_NOT_CACHED; 4407 ei->i_default_acl = EXT4_ACL_NOT_CACHED; 4408 #endif 4409 4410 ret = __ext4_get_inode_loc(inode, &iloc, 0); 4411 if (ret < 0) 4412 goto bad_inode; 4413 bh = iloc.bh; 4414 raw_inode = ext4_raw_inode(&iloc); 4415 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 4416 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); 4417 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); 4418 if (!(test_opt(inode->i_sb, NO_UID32))) { 4419 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; 4420 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; 4421 } 4422 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 4423 4424 ei->i_state = 0; 4425 ei->i_dir_start_lookup = 0; 4426 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 4427 /* We now have enough fields to check if the inode was active or not. 4428 * This is needed because nfsd might try to access dead inodes 4429 * the test is that same one that e2fsck uses 4430 * NeilBrown 1999oct15 4431 */ 4432 if (inode->i_nlink == 0) { 4433 if (inode->i_mode == 0 || 4434 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { 4435 /* this inode is deleted */ 4436 brelse(bh); 4437 ret = -ESTALE; 4438 goto bad_inode; 4439 } 4440 /* The only unlinked inodes we let through here have 4441 * valid i_mode and are being read by the orphan 4442 * recovery code: that's fine, we're about to complete 4443 * the process of deleting those. */ 4444 } 4445 ei->i_flags = le32_to_cpu(raw_inode->i_flags); 4446 inode->i_blocks = ext4_inode_blocks(raw_inode, ei); 4447 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); 4448 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) 4449 ei->i_file_acl |= 4450 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; 4451 inode->i_size = ext4_isize(raw_inode); 4452 ei->i_disksize = inode->i_size; 4453 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 4454 ei->i_block_group = iloc.block_group; 4455 ei->i_last_alloc_group = ~0; 4456 /* 4457 * NOTE! The in-memory inode i_data array is in little-endian order 4458 * even on big-endian machines: we do NOT byteswap the block numbers! 4459 */ 4460 for (block = 0; block < EXT4_N_BLOCKS; block++) 4461 ei->i_data[block] = raw_inode->i_block[block]; 4462 INIT_LIST_HEAD(&ei->i_orphan); 4463 4464 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4465 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); 4466 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > 4467 EXT4_INODE_SIZE(inode->i_sb)) { 4468 brelse(bh); 4469 ret = -EIO; 4470 goto bad_inode; 4471 } 4472 if (ei->i_extra_isize == 0) { 4473 /* The extra space is currently unused. Use it. 
*/ 4474 ei->i_extra_isize = sizeof(struct ext4_inode) - 4475 EXT4_GOOD_OLD_INODE_SIZE; 4476 } else { 4477 __le32 *magic = (void *)raw_inode + 4478 EXT4_GOOD_OLD_INODE_SIZE + 4479 ei->i_extra_isize; 4480 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) 4481 ei->i_state |= EXT4_STATE_XATTR; 4482 } 4483 } else 4484 ei->i_extra_isize = 0; 4485 4486 EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); 4487 EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); 4488 EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); 4489 EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); 4490 4491 inode->i_version = le32_to_cpu(raw_inode->i_disk_version); 4492 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4493 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 4494 inode->i_version |= 4495 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; 4496 } 4497 4498 ret = 0; 4499 if (ei->i_file_acl && 4500 ((ei->i_file_acl < 4501 (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + 4502 EXT4_SB(sb)->s_gdb_count)) || 4503 (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) { 4504 ext4_error(sb, __func__, 4505 "bad extended attribute block %llu in inode #%lu", 4506 ei->i_file_acl, inode->i_ino); 4507 ret = -EIO; 4508 goto bad_inode; 4509 } else if (ei->i_flags & EXT4_EXTENTS_FL) { 4510 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 4511 (S_ISLNK(inode->i_mode) && 4512 !ext4_inode_is_fast_symlink(inode))) 4513 /* Validate extent which is part of inode */ 4514 ret = ext4_ext_check_inode(inode); 4515 } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 4516 (S_ISLNK(inode->i_mode) && 4517 !ext4_inode_is_fast_symlink(inode))) { 4518 /* Validate block references which are part of inode */ 4519 ret = ext4_check_inode_blockref(inode); 4520 } 4521 if (ret) { 4522 brelse(bh); 4523 goto bad_inode; 4524 } 4525 4526 if (S_ISREG(inode->i_mode)) { 4527 inode->i_op = &ext4_file_inode_operations; 4528 inode->i_fop = &ext4_file_operations; 4529 ext4_set_aops(inode); 4530 } else if (S_ISDIR(inode->i_mode)) { 4531 inode->i_op = &ext4_dir_inode_operations; 4532 inode->i_fop = &ext4_dir_operations; 4533 } else if (S_ISLNK(inode->i_mode)) { 4534 if (ext4_inode_is_fast_symlink(inode)) { 4535 inode->i_op = &ext4_fast_symlink_inode_operations; 4536 nd_terminate_link(ei->i_data, inode->i_size, 4537 sizeof(ei->i_data) - 1); 4538 } else { 4539 inode->i_op = &ext4_symlink_inode_operations; 4540 ext4_set_aops(inode); 4541 } 4542 } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || 4543 S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { 4544 inode->i_op = &ext4_special_inode_operations; 4545 if (raw_inode->i_block[0]) 4546 init_special_inode(inode, inode->i_mode, 4547 old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); 4548 else 4549 init_special_inode(inode, inode->i_mode, 4550 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 4551 } else { 4552 brelse(bh); 4553 ret = -EIO; 4554 ext4_error(inode->i_sb, __func__, 4555 "bogus i_mode (%o) for inode=%lu", 4556 inode->i_mode, inode->i_ino); 4557 goto bad_inode; 4558 } 4559 brelse(iloc.bh); 4560 ext4_set_inode_flags(inode); 4561 unlock_new_inode(inode); 4562 return inode; 4563 4564 bad_inode: 4565 iget_failed(inode); 4566 return ERR_PTR(ret); 4567 } 4568 4569 static int ext4_inode_blocks_set(handle_t *handle, 4570 struct ext4_inode *raw_inode, 4571 struct ext4_inode_info *ei) 4572 { 4573 struct inode *inode = &(ei->vfs_inode); 4574 u64 i_blocks = inode->i_blocks; 4575 struct super_block *sb = inode->i_sb; 4576 4577 if (i_blocks <= ~0U) { 4578 /* 4579 * i_blocks 
can be represented in a 32 bit variable 4580 * as a multiple of 512 bytes 4581 */ 4582 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 4583 raw_inode->i_blocks_high = 0; 4584 ei->i_flags &= ~EXT4_HUGE_FILE_FL; 4585 return 0; 4586 } 4587 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) 4588 return -EFBIG; 4589 4590 if (i_blocks <= 0xffffffffffffULL) { 4591 /* 4592 * i_blocks can be represented in a 48 bit variable 4593 * as a multiple of 512 bytes 4594 */ 4595 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 4596 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 4597 ei->i_flags &= ~EXT4_HUGE_FILE_FL; 4598 } else { 4599 ei->i_flags |= EXT4_HUGE_FILE_FL; 4600 /* i_blocks is stored in units of the file system block size */ 4601 i_blocks = i_blocks >> (inode->i_blkbits - 9); 4602 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 4603 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 4604 } 4605 return 0; 4606 } 4607 4608 /* 4609 * Post the struct inode info into an on-disk inode location in the 4610 * buffer-cache. This gobbles the caller's reference to the 4611 * buffer_head in the inode location struct. 4612 * 4613 * The caller must have write access to iloc->bh. 4614 */ 4615 static int ext4_do_update_inode(handle_t *handle, 4616 struct inode *inode, 4617 struct ext4_iloc *iloc) 4618 { 4619 struct ext4_inode *raw_inode = ext4_raw_inode(iloc); 4620 struct ext4_inode_info *ei = EXT4_I(inode); 4621 struct buffer_head *bh = iloc->bh; 4622 int err = 0, rc, block; 4623 4624 /* For fields not tracked in the in-memory inode, 4625 * initialise them to zero for new inodes. */ 4626 if (ei->i_state & EXT4_STATE_NEW) 4627 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); 4628 4629 ext4_get_inode_flags(ei); 4630 raw_inode->i_mode = cpu_to_le16(inode->i_mode); 4631 if (!(test_opt(inode->i_sb, NO_UID32))) { 4632 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); 4633 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); 4634 /* 4635 * Fix up interoperability with old kernels.
Otherwise, old inodes get 4636 * re-used with the upper 16 bits of the uid/gid intact 4637 */ 4638 if (!ei->i_dtime) { 4639 raw_inode->i_uid_high = 4640 cpu_to_le16(high_16_bits(inode->i_uid)); 4641 raw_inode->i_gid_high = 4642 cpu_to_le16(high_16_bits(inode->i_gid)); 4643 } else { 4644 raw_inode->i_uid_high = 0; 4645 raw_inode->i_gid_high = 0; 4646 } 4647 } else { 4648 raw_inode->i_uid_low = 4649 cpu_to_le16(fs_high2lowuid(inode->i_uid)); 4650 raw_inode->i_gid_low = 4651 cpu_to_le16(fs_high2lowgid(inode->i_gid)); 4652 raw_inode->i_uid_high = 0; 4653 raw_inode->i_gid_high = 0; 4654 } 4655 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); 4656 4657 EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); 4658 EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); 4659 EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); 4660 EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); 4661 4662 if (ext4_inode_blocks_set(handle, raw_inode, ei)) 4663 goto out_brelse; 4664 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 4665 /* clear the migrate flag in the raw_inode */ 4666 raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE); 4667 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 4668 cpu_to_le32(EXT4_OS_HURD)) 4669 raw_inode->i_file_acl_high = 4670 cpu_to_le16(ei->i_file_acl >> 32); 4671 raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); 4672 ext4_isize_set(raw_inode, ei->i_disksize); 4673 if (ei->i_disksize > 0x7fffffffULL) { 4674 struct super_block *sb = inode->i_sb; 4675 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, 4676 EXT4_FEATURE_RO_COMPAT_LARGE_FILE) || 4677 EXT4_SB(sb)->s_es->s_rev_level == 4678 cpu_to_le32(EXT4_GOOD_OLD_REV)) { 4679 /* If this is the first large file 4680 * created, add a flag to the superblock. 4681 */ 4682 err = ext4_journal_get_write_access(handle, 4683 EXT4_SB(sb)->s_sbh); 4684 if (err) 4685 goto out_brelse; 4686 ext4_update_dynamic_rev(sb); 4687 EXT4_SET_RO_COMPAT_FEATURE(sb, 4688 EXT4_FEATURE_RO_COMPAT_LARGE_FILE); 4689 sb->s_dirt = 1; 4690 ext4_handle_sync(handle); 4691 err = ext4_handle_dirty_metadata(handle, inode, 4692 EXT4_SB(sb)->s_sbh); 4693 } 4694 } 4695 raw_inode->i_generation = cpu_to_le32(inode->i_generation); 4696 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { 4697 if (old_valid_dev(inode->i_rdev)) { 4698 raw_inode->i_block[0] = 4699 cpu_to_le32(old_encode_dev(inode->i_rdev)); 4700 raw_inode->i_block[1] = 0; 4701 } else { 4702 raw_inode->i_block[0] = 0; 4703 raw_inode->i_block[1] = 4704 cpu_to_le32(new_encode_dev(inode->i_rdev)); 4705 raw_inode->i_block[2] = 0; 4706 } 4707 } else for (block = 0; block < EXT4_N_BLOCKS; block++) 4708 raw_inode->i_block[block] = ei->i_data[block]; 4709 4710 raw_inode->i_disk_version = cpu_to_le32(inode->i_version); 4711 if (ei->i_extra_isize) { 4712 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 4713 raw_inode->i_version_hi = 4714 cpu_to_le32(inode->i_version >> 32); 4715 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 4716 } 4717 4718 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4719 rc = ext4_handle_dirty_metadata(handle, inode, bh); 4720 if (!err) 4721 err = rc; 4722 ei->i_state &= ~EXT4_STATE_NEW; 4723 4724 out_brelse: 4725 brelse(bh); 4726 ext4_std_error(inode->i_sb, err); 4727 return err; 4728 } 4729 4730 /* 4731 * ext4_write_inode() 4732 * 4733 * We are called from a few places: 4734 * 4735 * - Within generic_file_write() for O_SYNC files. 4736 * Here, there will be no transaction running. We wait for any running 4737 * trasnaction to commit. 4738 * 4739 * - Within sys_sync(), kupdate and such. 
4740 * We wait on commit, if told to. 4741 * 4742 * - Within prune_icache() (PF_MEMALLOC == true) 4743 * Here we simply return. We can't afford to block kswapd on the 4744 * journal commit. 4745 * 4746 * In all cases it is actually safe for us to return without doing anything, 4747 * because the inode has been copied into a raw inode buffer in 4748 * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for 4749 * knfsd. 4750 * 4751 * Note that we are absolutely dependent upon all inode dirtiers doing the 4752 * right thing: they *must* call mark_inode_dirty() after dirtying info in 4753 * which we are interested. 4754 * 4755 * It would be a bug for them to not do this. The code: 4756 * 4757 * mark_inode_dirty(inode) 4758 * stuff(); 4759 * inode->i_size = expr; 4760 * 4761 * is in error because a kswapd-driven write_inode() could occur while 4762 * `stuff()' is running, and the new i_size will be lost. Plus the inode 4763 * will no longer be on the superblock's dirty inode list. 4764 */ 4765 int ext4_write_inode(struct inode *inode, int wait) 4766 { 4767 if (current->flags & PF_MEMALLOC) 4768 return 0; 4769 4770 if (ext4_journal_current_handle()) { 4771 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); 4772 dump_stack(); 4773 return -EIO; 4774 } 4775 4776 if (!wait) 4777 return 0; 4778 4779 return ext4_force_commit(inode->i_sb); 4780 } 4781 4782 int __ext4_write_dirty_metadata(struct inode *inode, struct buffer_head *bh) 4783 { 4784 int err = 0; 4785 4786 mark_buffer_dirty(bh); 4787 if (inode && inode_needs_sync(inode)) { 4788 sync_dirty_buffer(bh); 4789 if (buffer_req(bh) && !buffer_uptodate(bh)) { 4790 ext4_error(inode->i_sb, __func__, 4791 "IO error syncing inode, " 4792 "inode=%lu, block=%llu", 4793 inode->i_ino, 4794 (unsigned long long)bh->b_blocknr); 4795 err = -EIO; 4796 } 4797 } 4798 return err; 4799 } 4800 4801 /* 4802 * ext4_setattr() 4803 * 4804 * Called from notify_change. 4805 * 4806 * We want to trap VFS attempts to truncate the file as soon as 4807 * possible. In particular, we want to make sure that when the VFS 4808 * shrinks i_size, we put the inode on the orphan list and modify 4809 * i_disksize immediately, so that during the subsequent flushing of 4810 * dirty pages and freeing of disk blocks, we can guarantee that any 4811 * commit will leave the blocks being flushed in an unused state on 4812 * disk. (On recovery, the inode will get truncated and the blocks will 4813 * be freed, so we have a strong guarantee that no future commit will 4814 * leave these blocks visible to the user.) 4815 * 4816 * Another thing we have to assure is that if we are in ordered mode 4817 * and the inode is still attached to the committing transaction, we must 4818 * start writeout of all the dirty pages which are being truncated. 4819 * This way we are sure that all the data written in the previous 4820 * transaction is already on disk (truncate waits for pages under 4821 * writeback). 4822 * 4823 * Called with inode->i_mutex down. 4824 */ 4825 int ext4_setattr(struct dentry *dentry, struct iattr *attr) 4826 { 4827 struct inode *inode = dentry->d_inode; 4828 int error, rc = 0; 4829 const unsigned int ia_valid = attr->ia_valid; 4830 4831 error = inode_change_ok(inode, attr); 4832 if (error) 4833 return error; 4834 4835 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 4836 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 4837 handle_t *handle; 4838 4839 /* (user+group)*(old+new) structure, inode write (sb, 4840 * inode block, ?
- but truncate inode update has it) */ 4841 handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+ 4842 EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3); 4843 if (IS_ERR(handle)) { 4844 error = PTR_ERR(handle); 4845 goto err_out; 4846 } 4847 error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; 4848 if (error) { 4849 ext4_journal_stop(handle); 4850 return error; 4851 } 4852 /* Update corresponding info in inode so that everything is in 4853 * one transaction */ 4854 if (attr->ia_valid & ATTR_UID) 4855 inode->i_uid = attr->ia_uid; 4856 if (attr->ia_valid & ATTR_GID) 4857 inode->i_gid = attr->ia_gid; 4858 error = ext4_mark_inode_dirty(handle, inode); 4859 ext4_journal_stop(handle); 4860 } 4861 4862 if (attr->ia_valid & ATTR_SIZE) { 4863 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { 4864 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 4865 4866 if (attr->ia_size > sbi->s_bitmap_maxbytes) { 4867 error = -EFBIG; 4868 goto err_out; 4869 } 4870 } 4871 } 4872 4873 if (S_ISREG(inode->i_mode) && 4874 attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { 4875 handle_t *handle; 4876 4877 handle = ext4_journal_start(inode, 3); 4878 if (IS_ERR(handle)) { 4879 error = PTR_ERR(handle); 4880 goto err_out; 4881 } 4882 4883 error = ext4_orphan_add(handle, inode); 4884 EXT4_I(inode)->i_disksize = attr->ia_size; 4885 rc = ext4_mark_inode_dirty(handle, inode); 4886 if (!error) 4887 error = rc; 4888 ext4_journal_stop(handle); 4889 4890 if (ext4_should_order_data(inode)) { 4891 error = ext4_begin_ordered_truncate(inode, 4892 attr->ia_size); 4893 if (error) { 4894 /* Do as much error cleanup as possible */ 4895 handle = ext4_journal_start(inode, 3); 4896 if (IS_ERR(handle)) { 4897 ext4_orphan_del(NULL, inode); 4898 goto err_out; 4899 } 4900 ext4_orphan_del(handle, inode); 4901 ext4_journal_stop(handle); 4902 goto err_out; 4903 } 4904 } 4905 } 4906 4907 rc = inode_setattr(inode, attr); 4908 4909 /* If inode_setattr's call to ext4_truncate failed to get a 4910 * transaction handle at all, we need to clean up the in-core 4911 * orphan list manually. */ 4912 if (inode->i_nlink) 4913 ext4_orphan_del(NULL, inode); 4914 4915 if (!rc && (ia_valid & ATTR_MODE)) 4916 rc = ext4_acl_chmod(inode); 4917 4918 err_out: 4919 ext4_std_error(inode->i_sb, error); 4920 if (!error) 4921 error = rc; 4922 return error; 4923 } 4924 4925 int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, 4926 struct kstat *stat) 4927 { 4928 struct inode *inode; 4929 unsigned long delalloc_blocks; 4930 4931 inode = dentry->d_inode; 4932 generic_fillattr(inode, stat); 4933 4934 /* 4935 * We can't update i_blocks if the block allocation is delayed 4936 * otherwise in the case of system crash before the real block 4937 * allocation is done, we will have i_blocks inconsistent with 4938 * on-disk file blocks. 4939 * We always keep i_blocks updated together with real 4940 * allocation. But to not confuse with user, stat 4941 * will return the blocks that include the delayed allocation 4942 * blocks for this file. 
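 * (The reserved delayed-allocation blocks are converted from filesystem
 * blocks into the 512-byte units that stat reports before being added to
 * stat->blocks below.)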
4943 */ 4944 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 4945 delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; 4946 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 4947 4948 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; 4949 return 0; 4950 } 4951 4952 static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, 4953 int chunk) 4954 { 4955 int indirects; 4956 4957 /* if nrblocks are contiguous */ 4958 if (chunk) { 4959 /* 4960 * With N contiguous data blocks, it needs at most 4961 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks 4962 * 2 dindirect blocks 4963 * 1 tindirect block 4964 */ 4965 indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb); 4966 return indirects + 3; 4967 } 4968 /* 4969 * if nrblocks are not contiguous, then in the worst case each block 4970 * touches an indirect block, and each indirect block touches a double 4971 * indirect block, plus a triple indirect block 4972 */ 4973 indirects = nrblocks * 2 + 1; 4974 return indirects; 4975 } 4976 4977 static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 4978 { 4979 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 4980 return ext4_indirect_trans_blocks(inode, nrblocks, chunk); 4981 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); 4982 } 4983 4984 /* 4985 * Account for index blocks, block group bitmaps and block group 4986 * descriptor blocks when modifying data blocks and index blocks; 4987 * in the worst case the index blocks are spread over different block groups. 4988 * 4989 * If data blocks are discontiguous, they may spread over 4990 * different block groups too. If they are contiguous, with flexbg 4991 * they could still cross a block group boundary. 4992 * 4993 * Also account for superblock, inode, quota and xattr blocks 4994 */ 4995 int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) 4996 { 4997 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); 4998 int gdpblocks; 4999 int idxblocks; 5000 int ret = 0; 5001 5002 /* 5003 * How many index blocks do we need to touch to modify nrblocks? 5004 * The "chunk" flag indicates whether the nrblocks are 5005 * physically contiguous on disk. 5006 * 5007 * Direct IO and fallocate call get_block to allocate 5008 * one single extent at a time, so they can set the "chunk" flag 5009 */ 5010 idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); 5011 5012 ret = idxblocks; 5013 5014 /* 5015 * Now let's see how many group bitmaps and group descriptors we need 5016 * to account for 5017 */ 5018 groups = idxblocks; 5019 if (chunk) 5020 groups += 1; 5021 else 5022 groups += nrblocks; 5023 5024 gdpblocks = groups; 5025 if (groups > ngroups) 5026 groups = ngroups; 5027 if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) 5028 gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; 5029 5030 /* bitmaps and block group descriptor blocks */ 5031 ret += groups + gdpblocks; 5032 5033 /* Blocks for super block, inode, quota and xattr blocks */ 5034 ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); 5035 5036 return ret; 5037 } 5038 5039 /* 5040 * Calculate the total number of credits to reserve to fit 5041 * the modification of a single page into a single transaction, 5042 * which may include multiple chunks of block allocations. 5043 * 5044 * This could be called via ext4_write_begin() 5045 * 5046 * We need to consider the worst case, when 5047 * there is one new block per extent.
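 * For example, with 4k blocks and 4k pages there is a single block per page,
 * so this comes out to ext4_meta_trans_blocks(inode, 1, 0) credits, plus one
 * more credit for the data block itself in data=journal mode.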
5048 */ 5049 int ext4_writepage_trans_blocks(struct inode *inode) 5050 { 5051 int bpp = ext4_journal_blocks_per_page(inode); 5052 int ret; 5053 5054 ret = ext4_meta_trans_blocks(inode, bpp, 0); 5055 5056 /* Account for data blocks for journalled mode */ 5057 if (ext4_should_journal_data(inode)) 5058 ret += bpp; 5059 return ret; 5060 } 5061 5062 /* 5063 * Calculate the journal credits for a chunk of data modification. 5064 * 5065 * This is called from DIO, fallocate or whoever calling 5066 * ext4_get_blocks() to map/allocate a chunk of contigous disk blocks. 5067 * 5068 * journal buffers for data blocks are not included here, as DIO 5069 * and fallocate do no need to journal data buffers. 5070 */ 5071 int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) 5072 { 5073 return ext4_meta_trans_blocks(inode, nrblocks, 1); 5074 } 5075 5076 /* 5077 * The caller must have previously called ext4_reserve_inode_write(). 5078 * Give this, we know that the caller already has write access to iloc->bh. 5079 */ 5080 int ext4_mark_iloc_dirty(handle_t *handle, 5081 struct inode *inode, struct ext4_iloc *iloc) 5082 { 5083 int err = 0; 5084 5085 if (test_opt(inode->i_sb, I_VERSION)) 5086 inode_inc_iversion(inode); 5087 5088 /* the do_update_inode consumes one bh->b_count */ 5089 get_bh(iloc->bh); 5090 5091 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ 5092 err = ext4_do_update_inode(handle, inode, iloc); 5093 put_bh(iloc->bh); 5094 return err; 5095 } 5096 5097 /* 5098 * On success, We end up with an outstanding reference count against 5099 * iloc->bh. This _must_ be cleaned up later. 5100 */ 5101 5102 int 5103 ext4_reserve_inode_write(handle_t *handle, struct inode *inode, 5104 struct ext4_iloc *iloc) 5105 { 5106 int err; 5107 5108 err = ext4_get_inode_loc(inode, iloc); 5109 if (!err) { 5110 BUFFER_TRACE(iloc->bh, "get_write_access"); 5111 err = ext4_journal_get_write_access(handle, iloc->bh); 5112 if (err) { 5113 brelse(iloc->bh); 5114 iloc->bh = NULL; 5115 } 5116 } 5117 ext4_std_error(inode->i_sb, err); 5118 return err; 5119 } 5120 5121 /* 5122 * Expand an inode by new_extra_isize bytes. 5123 * Returns 0 on success or negative error number on failure. 5124 */ 5125 static int ext4_expand_extra_isize(struct inode *inode, 5126 unsigned int new_extra_isize, 5127 struct ext4_iloc iloc, 5128 handle_t *handle) 5129 { 5130 struct ext4_inode *raw_inode; 5131 struct ext4_xattr_ibody_header *header; 5132 struct ext4_xattr_entry *entry; 5133 5134 if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) 5135 return 0; 5136 5137 raw_inode = ext4_raw_inode(&iloc); 5138 5139 header = IHDR(inode, raw_inode); 5140 entry = IFIRST(header); 5141 5142 /* No extended attributes present */ 5143 if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) || 5144 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { 5145 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, 5146 new_extra_isize); 5147 EXT4_I(inode)->i_extra_isize = new_extra_isize; 5148 return 0; 5149 } 5150 5151 /* try to expand with EAs present */ 5152 return ext4_expand_extra_isize_ea(inode, new_extra_isize, 5153 raw_inode, handle); 5154 } 5155 5156 /* 5157 * What we do here is to mark the in-core inode as clean with respect to inode 5158 * dirtiness (it may still be data-dirty). 5159 * This means that the in-core inode may be reaped by prune_icache 5160 * without having to perform any I/O. This is a very good thing, 5161 * because *any* task may call prune_icache - even ones which 5162 * have a transaction open against a different journal. 
5163 * 5164 * Is this cheating? Not really. Sure, we haven't written the 5165 * inode out, but prune_icache isn't a user-visible syncing function. 5166 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) 5167 * we start and wait on commits. 5168 * 5169 * Is this efficient/effective? Well, we're being nice to the system 5170 * by cleaning up our inodes proactively so they can be reaped 5171 * without I/O. But we are potentially leaving up to five seconds' 5172 * worth of inodes floating about which prune_icache wants us to 5173 * write out. One way to fix that would be to get prune_icache() 5174 * to do a write_super() to free up some memory. It has the desired 5175 * effect. 5176 */ 5177 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) 5178 { 5179 struct ext4_iloc iloc; 5180 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 5181 static unsigned int mnt_count; 5182 int err, ret; 5183 5184 might_sleep(); 5185 err = ext4_reserve_inode_write(handle, inode, &iloc); 5186 if (ext4_handle_valid(handle) && 5187 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 5188 !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) { 5189 /* 5190 * We need extra buffer credits since we may write into EA block 5191 * with this same handle. If journal_extend fails, then it will 5192 * only result in a minor loss of functionality for that inode. 5193 * If this is felt to be critical, then e2fsck should be run to 5194 * force a large enough s_min_extra_isize. 5195 */ 5196 if ((jbd2_journal_extend(handle, 5197 EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) { 5198 ret = ext4_expand_extra_isize(inode, 5199 sbi->s_want_extra_isize, 5200 iloc, handle); 5201 if (ret) { 5202 EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; 5203 if (mnt_count != 5204 le16_to_cpu(sbi->s_es->s_mnt_count)) { 5205 ext4_warning(inode->i_sb, __func__, 5206 "Unable to expand inode %lu. Delete" 5207 " some EAs or run e2fsck.", 5208 inode->i_ino); 5209 mnt_count = 5210 le16_to_cpu(sbi->s_es->s_mnt_count); 5211 } 5212 } 5213 } 5214 } 5215 if (!err) 5216 err = ext4_mark_iloc_dirty(handle, inode, &iloc); 5217 return err; 5218 } 5219 5220 /* 5221 * ext4_dirty_inode() is called from __mark_inode_dirty() 5222 * 5223 * We're really interested in the case where a file is being extended. 5224 * i_size has been changed by generic_commit_write() and we thus need 5225 * to include the updated inode in the current transaction. 5226 * 5227 * Also, vfs_dq_alloc_block() will always dirty the inode when blocks 5228 * are allocated to the file. 5229 * 5230 * If the inode is marked synchronous, we don't honour that here - doing 5231 * so would cause a commit on atime updates, which we don't bother doing. 5232 * We handle synchronous inodes at the highest possible level. 5233 */ 5234 void ext4_dirty_inode(struct inode *inode) 5235 { 5236 handle_t *current_handle = ext4_journal_current_handle(); 5237 handle_t *handle; 5238 5239 if (!ext4_handle_valid(current_handle)) { 5240 ext4_mark_inode_dirty(current_handle, inode); 5241 return; 5242 } 5243 5244 handle = ext4_journal_start(inode, 2); 5245 if (IS_ERR(handle)) 5246 goto out; 5247 if (current_handle && 5248 current_handle->h_transaction != handle->h_transaction) { 5249 /* This task has a transaction open against a different fs */ 5250 printk(KERN_EMERG "%s: transactions do not match!\n", 5251 __func__); 5252 } else { 5253 jbd_debug(5, "marking dirty. 
outer handle=%p\n", 5254 current_handle); 5255 ext4_mark_inode_dirty(handle, inode); 5256 } 5257 ext4_journal_stop(handle); 5258 out: 5259 return; 5260 } 5261 5262 #if 0 5263 /* 5264 * Bind an inode's backing buffer_head into this transaction, to prevent 5265 * it from being flushed to disk early. Unlike 5266 * ext4_reserve_inode_write, this leaves behind no bh reference and 5267 * returns no iloc structure, so the caller needs to repeat the iloc 5268 * lookup to mark the inode dirty later. 5269 */ 5270 static int ext4_pin_inode(handle_t *handle, struct inode *inode) 5271 { 5272 struct ext4_iloc iloc; 5273 5274 int err = 0; 5275 if (handle) { 5276 err = ext4_get_inode_loc(inode, &iloc); 5277 if (!err) { 5278 BUFFER_TRACE(iloc.bh, "get_write_access"); 5279 err = jbd2_journal_get_write_access(handle, iloc.bh); 5280 if (!err) 5281 err = ext4_handle_dirty_metadata(handle, 5282 inode, 5283 iloc.bh); 5284 brelse(iloc.bh); 5285 } 5286 } 5287 ext4_std_error(inode->i_sb, err); 5288 return err; 5289 } 5290 #endif 5291 5292 int ext4_change_inode_journal_flag(struct inode *inode, int val) 5293 { 5294 journal_t *journal; 5295 handle_t *handle; 5296 int err; 5297 5298 /* 5299 * We have to be very careful here: changing a data block's 5300 * journaling status dynamically is dangerous. If we write a 5301 * data block to the journal, change the status and then delete 5302 * that block, we risk forgetting to revoke the old log record 5303 * from the journal and so a subsequent replay can corrupt data. 5304 * So, first we make sure that the journal is empty and that 5305 * nobody is changing anything. 5306 */ 5307 5308 journal = EXT4_JOURNAL(inode); 5309 if (!journal) 5310 return 0; 5311 if (is_journal_aborted(journal)) 5312 return -EROFS; 5313 5314 jbd2_journal_lock_updates(journal); 5315 jbd2_journal_flush(journal); 5316 5317 /* 5318 * OK, there are no updates running now, and all cached data is 5319 * synced to disk. We are now in a completely consistent state 5320 * which doesn't have anything in the journal, and we know that 5321 * no filesystem updates are running, so it is safe to modify 5322 * the inode's in-core data-journaling state flag now. 5323 */ 5324 5325 if (val) 5326 EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL; 5327 else 5328 EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL; 5329 ext4_set_aops(inode); 5330 5331 jbd2_journal_unlock_updates(journal); 5332 5333 /* Finally we can mark the inode as dirty. */ 5334 5335 handle = ext4_journal_start(inode, 1); 5336 if (IS_ERR(handle)) 5337 return PTR_ERR(handle); 5338 5339 err = ext4_mark_inode_dirty(handle, inode); 5340 ext4_handle_sync(handle); 5341 ext4_journal_stop(handle); 5342 ext4_std_error(inode->i_sb, err); 5343 5344 return err; 5345 } 5346 5347 static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) 5348 { 5349 return !buffer_mapped(bh); 5350 } 5351 5352 int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 5353 { 5354 struct page *page = vmf->page; 5355 loff_t size; 5356 unsigned long len; 5357 int ret = -EINVAL; 5358 void *fsdata; 5359 struct file *file = vma->vm_file; 5360 struct inode *inode = file->f_path.dentry->d_inode; 5361 struct address_space *mapping = inode->i_mapping; 5362 5363 /* 5364 * Get i_alloc_sem to stop truncates messing with the inode. We cannot 5365 * get i_mutex because we are already holding mmap_sem. 
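 * (Taking i_mutex while holding mmap_sem would invert the i_mutex ->
 * mmap_sem ordering established by the write path, so the fault path makes
 * do with i_alloc_sem instead.)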
5366 */ 5367 down_read(&inode->i_alloc_sem); 5368 size = i_size_read(inode); 5369 if (page->mapping != mapping || size <= page_offset(page) 5370 || !PageUptodate(page)) { 5371 /* page got truncated from under us? */ 5372 goto out_unlock; 5373 } 5374 ret = 0; 5375 if (PageMappedToDisk(page)) 5376 goto out_unlock; 5377 5378 if (page->index == size >> PAGE_CACHE_SHIFT) 5379 len = size & ~PAGE_CACHE_MASK; 5380 else 5381 len = PAGE_CACHE_SIZE; 5382 5383 if (page_has_buffers(page)) { 5384 /* return if we have all the buffers mapped */ 5385 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 5386 ext4_bh_unmapped)) 5387 goto out_unlock; 5388 } 5389 /* 5390 * OK, we need to fill the hole... Do write_begin/write_end 5391 * to do the block allocation/reservation. We are not holding 5392 * the inode's i_mutex here, which allows parallel write_begin and 5393 * write_end calls; lock_page prevents that from happening 5394 * on the same page, though. 5395 */ 5396 ret = mapping->a_ops->write_begin(file, mapping, page_offset(page), 5397 len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); 5398 if (ret < 0) 5399 goto out_unlock; 5400 ret = mapping->a_ops->write_end(file, mapping, page_offset(page), 5401 len, len, page, fsdata); 5402 if (ret < 0) 5403 goto out_unlock; 5404 ret = 0; 5405 out_unlock: 5406 if (ret) 5407 ret = VM_FAULT_SIGBUS; 5408 up_read(&inode->i_alloc_sem); 5409 return ret; 5410 } 5411