/*
 *  linux/fs/ext4/inode.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  Goal-directed block allocation by Stephen Tweedie
 *	(sct@redhat.com), 1993, 1998
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *	David S. Miller (davem@caip.rutgers.edu), 1995
 *  64-bit file support on 64-bit platforms by Jakub Jelinek
 *	(jj@sunsite.ms.mff.cuni.cz)
 *
 *  Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
 */

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/jbd2.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
#include "ext4_extents.h"

#define MPAGE_DA_EXTENT_TAIL 0x01

static inline int ext4_begin_ordered_truncate(struct inode *inode,
					      loff_t new_size)
{
	return jbd2_journal_begin_ordered_truncate(
					EXT4_SB(inode->i_sb)->s_journal,
					&EXT4_I(inode)->jinode,
					new_size);
}

static void ext4_invalidatepage(struct page *page, unsigned long offset);

/*
 * Test whether an inode is a fast symlink.
 */
static int ext4_inode_is_fast_symlink(struct inode *inode)
{
	int ea_blocks = EXT4_I(inode)->i_file_acl ?
		(inode->i_sb->s_blocksize >> 9) : 0;

	return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
}

/*
 * The ext4 forget function must perform a revoke if we are freeing data
 * which has been journaled.  Metadata (eg. indirect blocks) must be
 * revoked in all cases.
 *
 * "bh" may be NULL: a metadata block may have been freed from memory
 * but there may still be a record of it in the journal, and that record
 * still needs to be revoked.
 *
 * If the handle isn't valid we're not journaling so there's nothing to do.
 */
int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
		struct buffer_head *bh, ext4_fsblk_t blocknr)
{
	int err;

	if (!ext4_handle_valid(handle))
		return 0;

	might_sleep();

	BUFFER_TRACE(bh, "enter");

	jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
		  "data mode %lx\n",
		  bh, is_metadata, inode->i_mode,
		  test_opt(inode->i_sb, DATA_FLAGS));

	/* Never use the revoke function if we are doing full data
	 * journaling: there is no need to, and a V1 superblock won't
	 * support it.  Otherwise, only skip the revoke on un-journaled
	 * data blocks.
	 */

	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
	    (!is_metadata && !ext4_should_journal_data(inode))) {
		if (bh) {
			BUFFER_TRACE(bh, "call jbd2_journal_forget");
			return ext4_journal_forget(handle, bh);
		}
		return 0;
	}

	/*
	 * data!=journal && (is_metadata || should_journal_data(inode))
	 */
	BUFFER_TRACE(bh, "call ext4_journal_revoke");
	err = ext4_journal_revoke(handle, blocknr, bh);
	if (err)
		ext4_abort(inode->i_sb, __func__,
			   "error %d when attempting revoke", err);
	BUFFER_TRACE(bh, "exit");
	return err;
}

/*
 * Work out how many blocks we need to proceed with the next chunk of a
 * truncate transaction.
 */
static unsigned long blocks_for_truncate(struct inode *inode)
{
	ext4_lblk_t needed;

	needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);

	/* Give ourselves just enough room to cope with inodes in which
	 * i_blocks is corrupt: we've seen disk corruptions in the past
	 * which resulted in random data in an inode which looked enough
	 * like a regular file for ext4 to try to delete it.  Things
	 * will go a bit crazy if that happens, but at least we should
	 * try not to panic the whole kernel. */
	if (needed < 2)
		needed = 2;

	/* But we need to bound the transaction so we don't overflow the
	 * journal. */
	if (needed > EXT4_MAX_TRANS_DATA)
		needed = EXT4_MAX_TRANS_DATA;

	return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
}

/*
 * Truncate transactions can be complex and absolutely huge.  So we need to
 * be able to restart the transaction at a convenient checkpoint to make
 * sure we don't overflow the journal.
 *
 * start_transaction gets us a new handle for a truncate transaction,
 * and extend_transaction tries to extend the existing one a bit.  If
 * extend fails, we need to propagate the failure up and restart the
 * transaction in the top-level truncate loop. --sct
 */
static handle_t *start_transaction(struct inode *inode)
{
	handle_t *result;

	result = ext4_journal_start(inode, blocks_for_truncate(inode));
	if (!IS_ERR(result))
		return result;

	ext4_std_error(inode->i_sb, PTR_ERR(result));
	return result;
}

/*
 * Try to extend this transaction for the purposes of truncation.
 *
 * Returns 0 if we managed to create more room.  If we can't create more
 * room, and the transaction must be restarted we return 1.
 */
static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
{
	if (!ext4_handle_valid(handle))
		return 0;
	if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
		return 0;
	if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
		return 0;
	return 1;
}

/*
 * Restart the transaction associated with *handle.  This does a commit,
 * so before we call here everything must be consistently dirtied against
 * this transaction.
 */
static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
{
	BUG_ON(EXT4_JOURNAL(inode) == NULL);
	jbd_debug(2, "restarting handle %p\n", handle);
	return ext4_journal_restart(handle, blocks_for_truncate(inode));
}
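/*
 * Usage sketch for the three helpers above (illustrative only; the
 * truncate code later in this file combines them roughly along these
 * lines): a long-running truncate starts a handle, tries to extend it
 * before each chunk of work, and restarts it at a safe checkpoint when
 * the extension fails:
 *
 *	handle = start_transaction(inode);
 *	...
 *	if (try_to_extend_transaction(handle, inode))
 *		ext4_journal_test_restart(handle, inode);
 *	...
 *	ext4_journal_stop(handle);
 */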
/*
 * Called at the last iput() if i_nlink is zero.
 */
void ext4_delete_inode(struct inode *inode)
{
	handle_t *handle;
	int err;

	if (ext4_should_order_data(inode))
		ext4_begin_ordered_truncate(inode, 0);
	truncate_inode_pages(&inode->i_data, 0);

	if (is_bad_inode(inode))
		goto no_delete;

	handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3);
	if (IS_ERR(handle)) {
		ext4_std_error(inode->i_sb, PTR_ERR(handle));
		/*
		 * If we're going to skip the normal cleanup, we still need to
		 * make sure that the in-core orphan linked list is properly
		 * cleaned up.
		 */
		ext4_orphan_del(NULL, inode);
		goto no_delete;
	}

	if (IS_SYNC(inode))
		ext4_handle_sync(handle);
	inode->i_size = 0;
	err = ext4_mark_inode_dirty(handle, inode);
	if (err) {
		ext4_warning(inode->i_sb, __func__,
			     "couldn't mark inode dirty (err %d)", err);
		goto stop_handle;
	}
	if (inode->i_blocks)
		ext4_truncate(inode);

	/*
	 * ext4_ext_truncate() doesn't reserve any slop when it
	 * restarts journal transactions; therefore there may not be
	 * enough credits left in the handle to remove the inode from
	 * the orphan list and set the dtime field.
	 */
	if (!ext4_handle_has_enough_credits(handle, 3)) {
		err = ext4_journal_extend(handle, 3);
		if (err > 0)
			err = ext4_journal_restart(handle, 3);
		if (err != 0) {
			ext4_warning(inode->i_sb, __func__,
				     "couldn't extend journal (err %d)", err);
		stop_handle:
			ext4_journal_stop(handle);
			goto no_delete;
		}
	}

	/*
	 * Kill off the orphan record which ext4_truncate created.
	 * AKPM: I think this can be inside the above `if'.
	 * Note that ext4_orphan_del() has to be able to cope with the
	 * deletion of a non-existent orphan - this is because we don't
	 * know if ext4_truncate() actually created an orphan record.
	 * (Well, we could do this if we need to, but heck - it works)
	 */
	ext4_orphan_del(handle, inode);
	EXT4_I(inode)->i_dtime = get_seconds();

	/*
	 * One subtle ordering requirement: if anything has gone wrong
	 * (transaction abort, IO errors, whatever), then we can still
	 * do these next steps (the fs will already have been marked as
	 * having errors), but we can't free the inode if the mark_dirty
	 * fails.
	 */
	if (ext4_mark_inode_dirty(handle, inode))
		/* If that failed, just do the required in-core inode clear. */
		clear_inode(inode);
	else
		ext4_free_inode(handle, inode);
	ext4_journal_stop(handle);
	return;
no_delete:
	clear_inode(inode);	/* We must guarantee clearing of inode... */
}

typedef struct {
	__le32	*p;
	__le32	key;
	struct buffer_head *bh;
} Indirect;

static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
{
	p->key = *(p->p = v);
	p->bh = bh;
}
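/*
 * Brief note on the Indirect helper above (the behaviour is spelled out in
 * the ext4_get_branch() comment further down): each triple caches one step
 * of a lookup - @p points at the pointer slot (in the inode's i_data or in
 * an indirect block's b_data), @key is the block number read from that
 * slot, and @bh is the buffer_head holding the indirect block, or NULL for
 * a slot in the inode itself.  add_chain() simply records one such step.
 */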
/**
 *	ext4_block_to_path - parse the block number into array of offsets
 *	@inode: inode in question (we are only interested in its superblock)
 *	@i_block: block number to be parsed
 *	@offsets: array to store the offsets in
 *	@boundary: set this non-zero if the referred-to block is likely to be
 *	       followed (on disk) by an indirect block.
 *
 *	To store the locations of a file's data ext4 uses a data structure
 *	common for UNIX filesystems - a tree of pointers anchored in the
 *	inode, with data blocks at leaves and indirect blocks in intermediate
 *	nodes.  This function translates the block number into a path in that
 *	tree - the return value is the path length and @offsets[n] is the
 *	offset of the pointer to the (n+1)th node in the nth one.  If @block
 *	is out of range (negative or too large) a warning is printed and zero
 *	is returned.
 *
 *	Note: function doesn't find node addresses, so no IO is needed.  All
 *	we need to know is the capacity of indirect blocks (taken from the
 *	inode->i_sb).
 */

/*
 * Portability note: the last comparison (check that we fit into triple
 * indirect block) is spelled differently, because otherwise on an
 * architecture with 32-bit longs and 8Kb pages we might get into trouble
 * if our filesystem had 8Kb blocks.  We might use long long, but that would
 * kill us on x86.  Oh, well, at least the sign propagation does not matter -
 * i_block would have to be negative in the very beginning, so we would not
 * get there at all.
 */

static int ext4_block_to_path(struct inode *inode,
			      ext4_lblk_t i_block,
			      ext4_lblk_t offsets[4], int *boundary)
{
	int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
	int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
	const long direct_blocks = EXT4_NDIR_BLOCKS,
		indirect_blocks = ptrs,
		double_blocks = (1 << (ptrs_bits * 2));
	int n = 0;
	int final = 0;

	if (i_block < 0) {
		ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0");
	} else if (i_block < direct_blocks) {
		offsets[n++] = i_block;
		final = direct_blocks;
	} else if ((i_block -= direct_blocks) < indirect_blocks) {
		offsets[n++] = EXT4_IND_BLOCK;
		offsets[n++] = i_block;
		final = ptrs;
	} else if ((i_block -= indirect_blocks) < double_blocks) {
		offsets[n++] = EXT4_DIND_BLOCK;
		offsets[n++] = i_block >> ptrs_bits;
		offsets[n++] = i_block & (ptrs - 1);
		final = ptrs;
	} else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
		offsets[n++] = EXT4_TIND_BLOCK;
		offsets[n++] = i_block >> (ptrs_bits * 2);
		offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
		offsets[n++] = i_block & (ptrs - 1);
		final = ptrs;
	} else {
		ext4_warning(inode->i_sb, "ext4_block_to_path",
			     "block %lu > max in inode %lu",
			     i_block + direct_blocks +
			     indirect_blocks + double_blocks, inode->i_ino);
	}
	if (boundary)
		*boundary = final - 1 - (i_block & (ptrs - 1));
	return n;
}
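/*
 * Worked example (assuming 4KB blocks, i.e. 1024 block pointers per
 * indirect block and EXT4_NDIR_BLOCKS == 12 direct pointers):
 *
 *	i_block = 5    -> depth 1, offsets = { 5 }
 *	i_block = 12   -> depth 2, offsets = { EXT4_IND_BLOCK, 0 }
 *	i_block = 5000 -> depth 3, offsets = { EXT4_DIND_BLOCK, 3, 892 }
 *	                  (5000 - 12 - 1024 = 3964 = 3 * 1024 + 892)
 *
 * This is only an illustration of the mapping above; the actual limits
 * depend on the filesystem block size.
 */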
393 * 394 * Function stops when it stumbles upon zero pointer (absent block) 395 * (pointer to last triple returned, *@err == 0) 396 * or when it gets an IO error reading an indirect block 397 * (ditto, *@err == -EIO) 398 * or when it reads all @depth-1 indirect blocks successfully and finds 399 * the whole chain, all way to the data (returns %NULL, *err == 0). 400 * 401 * Need to be called with 402 * down_read(&EXT4_I(inode)->i_data_sem) 403 */ 404 static Indirect *ext4_get_branch(struct inode *inode, int depth, 405 ext4_lblk_t *offsets, 406 Indirect chain[4], int *err) 407 { 408 struct super_block *sb = inode->i_sb; 409 Indirect *p = chain; 410 struct buffer_head *bh; 411 412 *err = 0; 413 /* i_data is not going away, no lock needed */ 414 add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); 415 if (!p->key) 416 goto no_block; 417 while (--depth) { 418 bh = sb_bread(sb, le32_to_cpu(p->key)); 419 if (!bh) 420 goto failure; 421 add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); 422 /* Reader: end */ 423 if (!p->key) 424 goto no_block; 425 } 426 return NULL; 427 428 failure: 429 *err = -EIO; 430 no_block: 431 return p; 432 } 433 434 /** 435 * ext4_find_near - find a place for allocation with sufficient locality 436 * @inode: owner 437 * @ind: descriptor of indirect block. 438 * 439 * This function returns the preferred place for block allocation. 440 * It is used when heuristic for sequential allocation fails. 441 * Rules are: 442 * + if there is a block to the left of our position - allocate near it. 443 * + if pointer will live in indirect block - allocate near that block. 444 * + if pointer will live in inode - allocate in the same 445 * cylinder group. 446 * 447 * In the latter case we colour the starting block by the callers PID to 448 * prevent it from clashing with concurrent allocations for a different inode 449 * in the same block group. The PID is used here so that functionally related 450 * files will be close-by on-disk. 451 * 452 * Caller must make sure that @ind is valid and will stay that way. 453 */ 454 static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) 455 { 456 struct ext4_inode_info *ei = EXT4_I(inode); 457 __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; 458 __le32 *p; 459 ext4_fsblk_t bg_start; 460 ext4_fsblk_t last_block; 461 ext4_grpblk_t colour; 462 463 /* Try to find previous block */ 464 for (p = ind->p - 1; p >= start; p--) { 465 if (*p) 466 return le32_to_cpu(*p); 467 } 468 469 /* No such thing, so let's try location of indirect block */ 470 if (ind->bh) 471 return ind->bh->b_blocknr; 472 473 /* 474 * It is going to be referred to from the inode itself? OK, just put it 475 * into the same cylinder group then. 476 */ 477 bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group); 478 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; 479 480 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) 481 colour = (current->pid % 16) * 482 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); 483 else 484 colour = (current->pid % 16) * ((last_block - bg_start) / 16); 485 return bg_start + colour; 486 } 487 488 /** 489 * ext4_find_goal - find a preferred place for allocation. 490 * @inode: owner 491 * @block: block we want 492 * @partial: pointer to the last triple within a chain 493 * 494 * Normally this function find the preferred place for block allocation, 495 * returns it. 
496 */ 497 static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, 498 Indirect *partial) 499 { 500 /* 501 * XXX need to get goal block from mballoc's data structures 502 */ 503 504 return ext4_find_near(inode, partial); 505 } 506 507 /** 508 * ext4_blks_to_allocate: Look up the block map and count the number 509 * of direct blocks need to be allocated for the given branch. 510 * 511 * @branch: chain of indirect blocks 512 * @k: number of blocks need for indirect blocks 513 * @blks: number of data blocks to be mapped. 514 * @blocks_to_boundary: the offset in the indirect block 515 * 516 * return the total number of blocks to be allocate, including the 517 * direct and indirect blocks. 518 */ 519 static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, 520 int blocks_to_boundary) 521 { 522 unsigned int count = 0; 523 524 /* 525 * Simple case, [t,d]Indirect block(s) has not allocated yet 526 * then it's clear blocks on that path have not allocated 527 */ 528 if (k > 0) { 529 /* right now we don't handle cross boundary allocation */ 530 if (blks < blocks_to_boundary + 1) 531 count += blks; 532 else 533 count += blocks_to_boundary + 1; 534 return count; 535 } 536 537 count++; 538 while (count < blks && count <= blocks_to_boundary && 539 le32_to_cpu(*(branch[0].p + count)) == 0) { 540 count++; 541 } 542 return count; 543 } 544 545 /** 546 * ext4_alloc_blocks: multiple allocate blocks needed for a branch 547 * @indirect_blks: the number of blocks need to allocate for indirect 548 * blocks 549 * 550 * @new_blocks: on return it will store the new block numbers for 551 * the indirect blocks(if needed) and the first direct block, 552 * @blks: on return it will store the total number of allocated 553 * direct blocks 554 */ 555 static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, 556 ext4_lblk_t iblock, ext4_fsblk_t goal, 557 int indirect_blks, int blks, 558 ext4_fsblk_t new_blocks[4], int *err) 559 { 560 struct ext4_allocation_request ar; 561 int target, i; 562 unsigned long count = 0, blk_allocated = 0; 563 int index = 0; 564 ext4_fsblk_t current_block = 0; 565 int ret = 0; 566 567 /* 568 * Here we try to allocate the requested multiple blocks at once, 569 * on a best-effort basis. 570 * To build a branch, we should allocate blocks for 571 * the indirect blocks(if not allocated yet), and at least 572 * the first direct block of this branch. 
/**
 *	ext4_alloc_blocks: multiple allocate blocks needed for a branch
 *	@indirect_blks: the number of blocks needed to allocate for indirect
 *			blocks
 *
 *	@new_blocks: on return it will store the new block numbers for
 *	the indirect blocks (if needed) and the first direct block,
 *	@blks:	on return it will store the total number of allocated
 *		direct blocks
 */
static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
			     ext4_lblk_t iblock, ext4_fsblk_t goal,
			     int indirect_blks, int blks,
			     ext4_fsblk_t new_blocks[4], int *err)
{
	struct ext4_allocation_request ar;
	int target, i;
	unsigned long count = 0, blk_allocated = 0;
	int index = 0;
	ext4_fsblk_t current_block = 0;
	int ret = 0;

	/*
	 * Here we try to allocate the requested multiple blocks at once,
	 * on a best-effort basis.
	 * To build a branch, we should allocate blocks for
	 * the indirect blocks (if not allocated yet), and at least
	 * the first direct block of this branch.  That's the minimum
	 * number of blocks that need to be allocated (required).
	 */
	/* first we try to allocate the indirect blocks */
	target = indirect_blks;
	while (target > 0) {
		count = target;
		/* allocating blocks for indirect blocks and direct blocks */
		current_block = ext4_new_meta_blocks(handle, inode,
						     goal, &count, err);
		if (*err)
			goto failed_out;

		target -= count;
		/* allocate blocks for indirect blocks */
		while (index < indirect_blks && count) {
			new_blocks[index++] = current_block++;
			count--;
		}
		if (count > 0) {
			/*
			 * save the new block number
			 * for the first direct block
			 */
			new_blocks[index] = current_block;
			printk(KERN_INFO "%s returned more blocks than "
						"requested\n", __func__);
			WARN_ON(1);
			break;
		}
	}

	target = blks - count;
	blk_allocated = count;
	if (!target)
		goto allocated;
	/* Now allocate data blocks */
	memset(&ar, 0, sizeof(ar));
	ar.inode = inode;
	ar.goal = goal;
	ar.len = target;
	ar.logical = iblock;
	if (S_ISREG(inode->i_mode))
		/* enable in-core preallocation only for regular files */
		ar.flags = EXT4_MB_HINT_DATA;

	current_block = ext4_mb_new_blocks(handle, &ar, err);

	if (*err && (target == blks)) {
		/*
		 * if the allocation failed and we didn't allocate
		 * any blocks before
		 */
		goto failed_out;
	}
	if (!*err) {
		if (target == blks) {
			/*
			 * save the new block number
			 * for the first direct block
			 */
			new_blocks[index] = current_block;
		}
		blk_allocated += ar.len;
	}
allocated:
	/* total number of blocks allocated for direct blocks */
	ret = blk_allocated;
	*err = 0;
	return ret;
failed_out:
	for (i = 0; i < index; i++)
		ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
	return ret;
}

/**
 *	ext4_alloc_branch - allocate and set up a chain of blocks.
 *	@inode: owner
 *	@indirect_blks: number of allocated indirect blocks
 *	@blks: number of allocated direct blocks
 *	@offsets: offsets (in the blocks) to store the pointers to next.
 *	@branch: place to store the chain in.
 *
 *	This function allocates blocks, zeroes out all but the last one,
 *	links them into chain and (if we are synchronous) writes them to disk.
 *	In other words, it prepares a branch that can be spliced onto the
 *	inode.  It stores the information about that chain in the branch[], in
 *	the same format as ext4_get_branch() would do.  We are calling it after
 *	we had read the existing part of chain and partial points to the last
 *	triple of that (one with zero ->key).  Upon the exit we have the same
 *	picture as after the successful ext4_get_block(), except that in one
 *	place chain is disconnected - *branch->p is still zero (we did not
 *	set the last link), but branch->key contains the number that should
 *	be placed into *branch->p to fill that gap.
 *
 *	If allocation fails we free all blocks we've allocated (and forget
 *	their buffer_heads) and return the error value from the failed
 *	ext4_alloc_block() (normally -ENOSPC).  Otherwise we set the chain
 *	as described above and return 0.
672 */ 673 static int ext4_alloc_branch(handle_t *handle, struct inode *inode, 674 ext4_lblk_t iblock, int indirect_blks, 675 int *blks, ext4_fsblk_t goal, 676 ext4_lblk_t *offsets, Indirect *branch) 677 { 678 int blocksize = inode->i_sb->s_blocksize; 679 int i, n = 0; 680 int err = 0; 681 struct buffer_head *bh; 682 int num; 683 ext4_fsblk_t new_blocks[4]; 684 ext4_fsblk_t current_block; 685 686 num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, 687 *blks, new_blocks, &err); 688 if (err) 689 return err; 690 691 branch[0].key = cpu_to_le32(new_blocks[0]); 692 /* 693 * metadata blocks and data blocks are allocated. 694 */ 695 for (n = 1; n <= indirect_blks; n++) { 696 /* 697 * Get buffer_head for parent block, zero it out 698 * and set the pointer to new one, then send 699 * parent to disk. 700 */ 701 bh = sb_getblk(inode->i_sb, new_blocks[n-1]); 702 branch[n].bh = bh; 703 lock_buffer(bh); 704 BUFFER_TRACE(bh, "call get_create_access"); 705 err = ext4_journal_get_create_access(handle, bh); 706 if (err) { 707 unlock_buffer(bh); 708 brelse(bh); 709 goto failed; 710 } 711 712 memset(bh->b_data, 0, blocksize); 713 branch[n].p = (__le32 *) bh->b_data + offsets[n]; 714 branch[n].key = cpu_to_le32(new_blocks[n]); 715 *branch[n].p = branch[n].key; 716 if (n == indirect_blks) { 717 current_block = new_blocks[n]; 718 /* 719 * End of chain, update the last new metablock of 720 * the chain to point to the new allocated 721 * data blocks numbers 722 */ 723 for (i=1; i < num; i++) 724 *(branch[n].p + i) = cpu_to_le32(++current_block); 725 } 726 BUFFER_TRACE(bh, "marking uptodate"); 727 set_buffer_uptodate(bh); 728 unlock_buffer(bh); 729 730 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 731 err = ext4_handle_dirty_metadata(handle, inode, bh); 732 if (err) 733 goto failed; 734 } 735 *blks = num; 736 return err; 737 failed: 738 /* Allocation failed, free what we already allocated */ 739 for (i = 1; i <= n ; i++) { 740 BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget"); 741 ext4_journal_forget(handle, branch[i].bh); 742 } 743 for (i = 0; i < indirect_blks; i++) 744 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); 745 746 ext4_free_blocks(handle, inode, new_blocks[i], num, 0); 747 748 return err; 749 } 750 751 /** 752 * ext4_splice_branch - splice the allocated branch onto inode. 753 * @inode: owner 754 * @block: (logical) number of block we are adding 755 * @chain: chain of indirect blocks (with a missing link - see 756 * ext4_alloc_branch) 757 * @where: location of missing link 758 * @num: number of indirect blocks we are adding 759 * @blks: number of direct blocks we are adding 760 * 761 * This function fills the missing link and does all housekeeping needed in 762 * inode (->i_blocks, etc.). In case of success we end up with the full 763 * chain to new block and return 0. 764 */ 765 static int ext4_splice_branch(handle_t *handle, struct inode *inode, 766 ext4_lblk_t block, Indirect *where, int num, int blks) 767 { 768 int i; 769 int err = 0; 770 ext4_fsblk_t current_block; 771 772 /* 773 * If we're splicing into a [td]indirect block (as opposed to the 774 * inode) then we need to get write access to the [td]indirect block 775 * before the splice. 
	 */
	if (where->bh) {
		BUFFER_TRACE(where->bh, "get_write_access");
		err = ext4_journal_get_write_access(handle, where->bh);
		if (err)
			goto err_out;
	}
	/* That's it */

	*where->p = where->key;

	/*
	 * Update the host buffer_head or inode to point to the just
	 * allocated direct blocks
	 */
	if (num == 0 && blks > 1) {
		current_block = le32_to_cpu(where->key) + 1;
		for (i = 1; i < blks; i++)
			*(where->p + i) = cpu_to_le32(current_block++);
	}

	/* We are done with atomic stuff, now do the rest of housekeeping */

	inode->i_ctime = ext4_current_time(inode);
	ext4_mark_inode_dirty(handle, inode);

	/* had we spliced it onto indirect block? */
	if (where->bh) {
		/*
		 * If we spliced it onto an indirect block, we haven't
		 * altered the inode.  Note however that if it is being spliced
		 * onto an indirect block at the very end of the file (the
		 * file is growing) then we *will* alter the inode to reflect
		 * the new i_size.  But that is not done here - it is done in
		 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
		 */
		jbd_debug(5, "splicing indirect only\n");
		BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
		err = ext4_handle_dirty_metadata(handle, inode, where->bh);
		if (err)
			goto err_out;
	} else {
		/*
		 * OK, we spliced it into the inode itself on a direct block.
		 * Inode was dirtied above.
		 */
		jbd_debug(5, "splicing direct\n");
	}
	return err;

err_out:
	for (i = 1; i <= num; i++) {
		BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
		ext4_journal_forget(handle, where[i].bh);
		ext4_free_blocks(handle, inode,
				 le32_to_cpu(where[i-1].key), 1, 0);
	}
	ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0);

	return err;
}
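/*
 * Note (summarising the two helpers above): ext4_alloc_branch() builds a
 * complete but still detached branch, and ext4_splice_branch() then makes
 * it visible by filling in the single missing pointer *where->p.  Until
 * that store happens the new blocks are unreachable from the inode, which
 * is what keeps the failure handling described in the next comment simple.
 */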
/*
 * Allocation strategy is simple: if we have to allocate something, we will
 * have to go the whole way to leaf.  So let's do it before attaching anything
 * to tree, set linkage between the newborn blocks, write them if sync is
 * required, recheck the path, free and repeat if check fails, otherwise
 * set the last missing link (that will protect us from any truncate-generated
 * removals - all blocks on the path are immune now) and possibly force the
 * write on the parent block.
 * That has a nice additional property: no special recovery from the failed
 * allocations is needed - we simply release blocks and do not touch anything
 * reachable from inode.
 *
 * `handle' can be NULL if create == 0.
 *
 * return > 0, # of blocks mapped or allocated.
 * return = 0, if plain lookup failed.
 * return < 0, error case.
 *
 *
 * Need to be called with
 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
 * (ie, create is zero).  Otherwise down_write(&EXT4_I(inode)->i_data_sem)
 */
static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
				  ext4_lblk_t iblock, unsigned int maxblocks,
				  struct buffer_head *bh_result,
				  int create, int extend_disksize)
{
	int err = -EIO;
	ext4_lblk_t offsets[4];
	Indirect chain[4];
	Indirect *partial;
	ext4_fsblk_t goal;
	int indirect_blks;
	int blocks_to_boundary = 0;
	int depth;
	struct ext4_inode_info *ei = EXT4_I(inode);
	int count = 0;
	ext4_fsblk_t first_block = 0;
	loff_t disksize;


	J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
	J_ASSERT(handle != NULL || create == 0);
	depth = ext4_block_to_path(inode, iblock, offsets,
				   &blocks_to_boundary);

	if (depth == 0)
		goto out;

	partial = ext4_get_branch(inode, depth, offsets, chain, &err);

	/* Simplest case - block found, no allocation needed */
	if (!partial) {
		first_block = le32_to_cpu(chain[depth - 1].key);
		clear_buffer_new(bh_result);
		count++;
		/* map more blocks */
		while (count < maxblocks && count <= blocks_to_boundary) {
			ext4_fsblk_t blk;

			blk = le32_to_cpu(*(chain[depth-1].p + count));

			if (blk == first_block + count)
				count++;
			else
				break;
		}
		goto got_it;
	}

	/* Next simple case - plain lookup or failed read of indirect block */
	if (!create || err == -EIO)
		goto cleanup;

	/*
	 * Okay, we need to do block allocation.
	 */
	goal = ext4_find_goal(inode, iblock, partial);

	/* the number of blocks needed to allocate for [d,t]indirect blocks */
	indirect_blks = (chain + depth) - partial - 1;

	/*
	 * Next look up the indirect map to count the total number of
	 * direct blocks to allocate for this branch.
	 */
	count = ext4_blks_to_allocate(partial, indirect_blks,
				      maxblocks, blocks_to_boundary);
	/*
	 * Block out ext4_truncate while we alter the tree
	 */
	err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
				&count, goal,
				offsets + (partial - chain), partial);

	/*
	 * The ext4_splice_branch call will free and forget any buffers
	 * on the new chain if there is a failure, but that risks using
	 * up transaction credits, especially for bitmaps where the
	 * credits cannot be returned.  Can we handle this somehow?  We
	 * may need to return -EAGAIN upwards in the worst case.  --sct
	 */
	if (!err)
		err = ext4_splice_branch(handle, inode, iblock,
					 partial, indirect_blks, count);
	/*
	 * i_disksize growing is protected by i_data_sem.
	 * Don't forget to protect it if you're about to implement
	 * concurrent ext4_get_block() -bzzz
	 */
	if (!err && extend_disksize) {
		disksize = ((loff_t) iblock + count) << inode->i_blkbits;
		if (disksize > i_size_read(inode))
			disksize = i_size_read(inode);
		if (disksize > ei->i_disksize)
			ei->i_disksize = disksize;
	}
	if (err)
		goto cleanup;

	set_buffer_new(bh_result);
got_it:
	map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
	if (count > blocks_to_boundary)
		set_buffer_boundary(bh_result);
	err = count;
	/* Clean up and exit */
	partial = chain + depth - 1;	/* the whole chain */
cleanup:
	while (partial > chain) {
		BUFFER_TRACE(partial->bh, "call brelse");
		brelse(partial->bh);
		partial--;
	}
	BUFFER_TRACE(bh_result, "returned");
out:
	return err;
}

/*
 * Calculate the number of metadata blocks that need to be reserved
 * to allocate @blocks for a non-extent-based file.
 */
static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
{
	int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
	int ind_blks, dind_blks, tind_blks;

	/* number of new indirect blocks needed */
	ind_blks = (blocks + icap - 1) / icap;

	dind_blks = (ind_blks + icap - 1) / icap;

	tind_blks = 1;

	return ind_blks + dind_blks + tind_blks;
}

/*
 * Calculate the number of metadata blocks that need to be reserved
 * to allocate the given number of blocks.
 */
static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
{
	if (!blocks)
		return 0;

	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
		return ext4_ext_calc_metadata_amount(inode, blocks);

	return ext4_indirect_calc_metadata_amount(inode, blocks);
}

static void ext4_da_update_reserve_space(struct inode *inode, int used)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	int total, mdb, mdb_free;

	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
	/* recalculate the number of metablocks that still need to be reserved */
	total = EXT4_I(inode)->i_reserved_data_blocks - used;
	mdb = ext4_calc_metadata_amount(inode, total);

	/* figure out how many metablocks to release */
	BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
	mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;

	if (mdb_free) {
		/* Account for allocated meta_blocks */
		mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;

		/* update fs dirty blocks counter */
		percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
		EXT4_I(inode)->i_allocated_meta_blocks = 0;
		EXT4_I(inode)->i_reserved_meta_blocks = mdb;
	}

	/* update per-inode reservations */
	BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks);
	EXT4_I(inode)->i_reserved_data_blocks -= used;

	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
}
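/*
 * Example of the reservation arithmetic above (illustrative, assuming 4KB
 * blocks, i.e. icap == 1024 pointers per indirect block): for a delalloc
 * reservation covering 3000 data blocks of an indirect-mapped file,
 * ext4_indirect_calc_metadata_amount() reserves
 *
 *	ind_blks  = DIV_ROUND_UP(3000, 1024) = 3
 *	dind_blks = DIV_ROUND_UP(3, 1024)    = 1
 *	tind_blks = 1
 *
 * i.e. 5 metadata blocks.  ext4_da_update_reserve_space() later redoes the
 * same calculation for the blocks that remain reserved and returns the
 * difference to the dirty-blocks counter.
 */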
/*
 * The ext4_get_blocks_wrap() function tries to look up the requested blocks,
 * and returns if the blocks are already mapped.
 *
 * Otherwise it takes the write lock of i_data_sem, allocates blocks,
 * stores the allocated blocks in the result buffer head and marks it
 * mapped.
 *
 * If the file is extent based, it will call ext4_ext_get_blocks();
 * otherwise it calls ext4_get_blocks_handle() to handle indirect-mapping
 * based files.
 *
 * On success, it returns the number of blocks being mapped or allocated.
 * If create == 0 and the blocks are pre-allocated and uninitialized,
 * the result buffer head is unmapped.  If create == 1, it will make sure
 * the buffer head is mapped.
 *
 * It returns 0 if plain look up failed (blocks have not been allocated);
 * in that case, the buffer head is unmapped.
 *
 * It returns the error in case of allocation failure.
 */
int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
			 unsigned int max_blocks, struct buffer_head *bh,
			 int create, int extend_disksize, int flag)
{
	int retval;

	clear_buffer_mapped(bh);

	/*
	 * Try to see if we can get the block without requesting
	 * a new file system block.
	 */
	down_read((&EXT4_I(inode)->i_data_sem));
	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
		retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,
					     bh, 0, 0);
	} else {
		retval = ext4_get_blocks_handle(handle,
				inode, block, max_blocks, bh, 0, 0);
	}
	up_read((&EXT4_I(inode)->i_data_sem));

	/* If it is only a block(s) look up */
	if (!create)
		return retval;

	/*
	 * Returns if the blocks have already been allocated.
	 *
	 * Note that if blocks have been preallocated, the
	 * create == 0 call above returns with the buffer head
	 * unmapped.
	 */
	if (retval > 0 && buffer_mapped(bh))
		return retval;

	/*
	 * New block allocation and/or writing to an uninitialized extent
	 * will possibly result in updating i_data, so we take
	 * the write lock of i_data_sem, and call get_blocks()
	 * with create == 1 flag.
	 */
	down_write((&EXT4_I(inode)->i_data_sem));

	/*
	 * if the caller is from delayed allocation writeout path
	 * we have already reserved fs blocks for allocation
	 * let the underlying get_block() function know to
	 * avoid double accounting
	 */
	if (flag)
		EXT4_I(inode)->i_delalloc_reserved_flag = 1;
	/*
	 * We need to check for EXT4 here because migrate
	 * could have changed the inode type in between
	 */
	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
		retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,
					     bh, create, extend_disksize);
	} else {
		retval = ext4_get_blocks_handle(handle, inode, block,
				max_blocks, bh, create, extend_disksize);

		if (retval > 0 && buffer_new(bh)) {
			/*
			 * We allocated new blocks which will result in
			 * i_data's format changing.  Force the migrate
			 * to fail by clearing migrate flags
			 */
			EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags &
							~EXT4_EXT_MIGRATE;
		}
	}

	if (flag) {
		EXT4_I(inode)->i_delalloc_reserved_flag = 0;
		/*
		 * Update reserved blocks/metadata blocks
		 * after successful block allocation
		 * which were deferred till now
		 */
		if ((retval > 0) && buffer_delay(bh))
			ext4_da_update_reserve_space(inode, retval);
	}

	up_write((&EXT4_I(inode)->i_data_sem));
	return retval;
}
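/*
 * Illustrative call patterns for the wrapper above, taken from its two
 * callers later in this file: a plain lookup passes create == 0 and then
 * checks buffer_mapped(), while an allocating caller passes create == 1
 * (ext4_getblk() additionally asks for i_disksize extension):
 *
 *	ret = ext4_get_blocks_wrap(handle, inode, iblock,
 *				   max_blocks, bh_result, create, 0, 0);
 *	err = ext4_get_blocks_wrap(handle, inode, block, 1,
 *				   &dummy, create, 1, 0);
 *
 * A positive return is the number of contiguous blocks mapped into the
 * buffer head, 0 means a hole, and a negative value is an error.
 */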
/* Maximum number of blocks we map for direct IO at once. */
#define DIO_MAX_BLOCKS 4096

int ext4_get_block(struct inode *inode, sector_t iblock,
		   struct buffer_head *bh_result, int create)
{
	handle_t *handle = ext4_journal_current_handle();
	int ret = 0, started = 0;
	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
	int dio_credits;

	if (create && !handle) {
		/* Direct IO write... */
		if (max_blocks > DIO_MAX_BLOCKS)
			max_blocks = DIO_MAX_BLOCKS;
		dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
		handle = ext4_journal_start(inode, dio_credits);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			goto out;
		}
		started = 1;
	}

	ret = ext4_get_blocks_wrap(handle, inode, iblock,
				   max_blocks, bh_result, create, 0, 0);
	if (ret > 0) {
		bh_result->b_size = (ret << inode->i_blkbits);
		ret = 0;
	}
	if (started)
		ext4_journal_stop(handle);
out:
	return ret;
}

/*
 * `handle' can be NULL if create is zero
 */
struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
				ext4_lblk_t block, int create, int *errp)
{
	struct buffer_head dummy;
	int fatal = 0, err;

	J_ASSERT(handle != NULL || create == 0);

	dummy.b_state = 0;
	dummy.b_blocknr = -1000;
	buffer_trace_init(&dummy.b_history);
	err = ext4_get_blocks_wrap(handle, inode, block, 1,
				   &dummy, create, 1, 0);
	/*
	 * ext4_get_blocks_handle() returns number of blocks
	 * mapped. 0 in case of a HOLE.
	 */
	if (err > 0) {
		if (err > 1)
			WARN_ON(1);
		err = 0;
	}
	*errp = err;
	if (!err && buffer_mapped(&dummy)) {
		struct buffer_head *bh;
		bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
		if (!bh) {
			*errp = -EIO;
			goto err;
		}
		if (buffer_new(&dummy)) {
			J_ASSERT(create != 0);
			J_ASSERT(handle != NULL);

			/*
			 * Now that we do not always journal data, we should
			 * keep in mind whether this should always journal the
			 * new buffer as metadata.  For now, regular file
			 * writes use ext4_get_block instead, so it's not a
			 * problem.
			 */
			lock_buffer(bh);
			BUFFER_TRACE(bh, "call get_create_access");
			fatal = ext4_journal_get_create_access(handle, bh);
			if (!fatal && !buffer_uptodate(bh)) {
				memset(bh->b_data, 0, inode->i_sb->s_blocksize);
				set_buffer_uptodate(bh);
			}
			unlock_buffer(bh);
			BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
			err = ext4_handle_dirty_metadata(handle, inode, bh);
			if (!fatal)
				fatal = err;
		} else {
			BUFFER_TRACE(bh, "not a new buffer");
		}
		if (fatal) {
			*errp = fatal;
			brelse(bh);
			bh = NULL;
		}
		return bh;
	}
err:
	return NULL;
}

struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
			       ext4_lblk_t block, int create, int *err)
{
	struct buffer_head *bh;

	bh = ext4_getblk(handle, inode, block, create, err);
	if (!bh)
		return bh;
	if (buffer_uptodate(bh))
		return bh;
	ll_rw_block(READ_META, 1, &bh);
	wait_on_buffer(bh);
	if (buffer_uptodate(bh))
		return bh;
	put_bh(bh);
	*err = -EIO;
	return NULL;
}

static int walk_page_buffers(handle_t *handle,
			     struct buffer_head *head,
			     unsigned from,
			     unsigned to,
			     int *partial,
			     int (*fn)(handle_t *handle,
				       struct buffer_head *bh))
{
	struct buffer_head *bh;
	unsigned block_start, block_end;
	unsigned blocksize = head->b_size;
	int err, ret = 0;
	struct buffer_head *next;

	for (bh = head, block_start = 0;
	     ret == 0 && (bh != head || !block_start);
	     block_start = block_end, bh = next)
	{
		next = bh->b_this_page;
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			if (partial && !buffer_uptodate(bh))
				*partial = 1;
			continue;
		}
		err = (*fn)(handle, bh);
		if (!ret)
			ret = err;
	}
	return ret;
}
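/*
 * walk_page_buffers() has no kernel-doc of its own, so a short usage note
 * (based on its callers later in this file): it iterates over the
 * buffer_heads of one page, applies @fn to every buffer that overlaps the
 * byte range [@from, @to) and, if @partial is non-NULL, records whether any
 * buffer outside that range is not uptodate.  For example, the data=journal
 * write path uses
 *
 *	walk_page_buffers(handle, page_buffers(page), from, to,
 *			  NULL, do_journal_get_write_access);
 *
 * to take journal write access on exactly the buffers a write will touch.
 */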
1335 */ 1336 static int do_journal_get_write_access(handle_t *handle, 1337 struct buffer_head *bh) 1338 { 1339 if (!buffer_mapped(bh) || buffer_freed(bh)) 1340 return 0; 1341 return ext4_journal_get_write_access(handle, bh); 1342 } 1343 1344 static int ext4_write_begin(struct file *file, struct address_space *mapping, 1345 loff_t pos, unsigned len, unsigned flags, 1346 struct page **pagep, void **fsdata) 1347 { 1348 struct inode *inode = mapping->host; 1349 int ret, needed_blocks = ext4_writepage_trans_blocks(inode); 1350 handle_t *handle; 1351 int retries = 0; 1352 struct page *page; 1353 pgoff_t index; 1354 unsigned from, to; 1355 1356 trace_mark(ext4_write_begin, 1357 "dev %s ino %lu pos %llu len %u flags %u", 1358 inode->i_sb->s_id, inode->i_ino, 1359 (unsigned long long) pos, len, flags); 1360 index = pos >> PAGE_CACHE_SHIFT; 1361 from = pos & (PAGE_CACHE_SIZE - 1); 1362 to = from + len; 1363 1364 retry: 1365 handle = ext4_journal_start(inode, needed_blocks); 1366 if (IS_ERR(handle)) { 1367 ret = PTR_ERR(handle); 1368 goto out; 1369 } 1370 1371 page = grab_cache_page_write_begin(mapping, index, flags); 1372 if (!page) { 1373 ext4_journal_stop(handle); 1374 ret = -ENOMEM; 1375 goto out; 1376 } 1377 *pagep = page; 1378 1379 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 1380 ext4_get_block); 1381 1382 if (!ret && ext4_should_journal_data(inode)) { 1383 ret = walk_page_buffers(handle, page_buffers(page), 1384 from, to, NULL, do_journal_get_write_access); 1385 } 1386 1387 if (ret) { 1388 unlock_page(page); 1389 ext4_journal_stop(handle); 1390 page_cache_release(page); 1391 /* 1392 * block_write_begin may have instantiated a few blocks 1393 * outside i_size. Trim these off again. Don't need 1394 * i_size_read because we hold i_mutex. 1395 */ 1396 if (pos + len > inode->i_size) 1397 vmtruncate(inode, inode->i_size); 1398 } 1399 1400 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 1401 goto retry; 1402 out: 1403 return ret; 1404 } 1405 1406 /* For write_end() in data=journal mode */ 1407 static int write_end_fn(handle_t *handle, struct buffer_head *bh) 1408 { 1409 if (!buffer_mapped(bh) || buffer_freed(bh)) 1410 return 0; 1411 set_buffer_uptodate(bh); 1412 return ext4_handle_dirty_metadata(handle, NULL, bh); 1413 } 1414 1415 /* 1416 * We need to pick up the new inode size which generic_commit_write gave us 1417 * `file' can be NULL - eg, when called from page_symlink(). 1418 * 1419 * ext4 never places buffers on inode->i_mapping->private_list. metadata 1420 * buffers are managed internally. 
1421 */ 1422 static int ext4_ordered_write_end(struct file *file, 1423 struct address_space *mapping, 1424 loff_t pos, unsigned len, unsigned copied, 1425 struct page *page, void *fsdata) 1426 { 1427 handle_t *handle = ext4_journal_current_handle(); 1428 struct inode *inode = mapping->host; 1429 int ret = 0, ret2; 1430 1431 trace_mark(ext4_ordered_write_end, 1432 "dev %s ino %lu pos %llu len %u copied %u", 1433 inode->i_sb->s_id, inode->i_ino, 1434 (unsigned long long) pos, len, copied); 1435 ret = ext4_jbd2_file_inode(handle, inode); 1436 1437 if (ret == 0) { 1438 loff_t new_i_size; 1439 1440 new_i_size = pos + copied; 1441 if (new_i_size > EXT4_I(inode)->i_disksize) { 1442 ext4_update_i_disksize(inode, new_i_size); 1443 /* We need to mark inode dirty even if 1444 * new_i_size is less that inode->i_size 1445 * bu greater than i_disksize.(hint delalloc) 1446 */ 1447 ext4_mark_inode_dirty(handle, inode); 1448 } 1449 1450 ret2 = generic_write_end(file, mapping, pos, len, copied, 1451 page, fsdata); 1452 copied = ret2; 1453 if (ret2 < 0) 1454 ret = ret2; 1455 } 1456 ret2 = ext4_journal_stop(handle); 1457 if (!ret) 1458 ret = ret2; 1459 1460 return ret ? ret : copied; 1461 } 1462 1463 static int ext4_writeback_write_end(struct file *file, 1464 struct address_space *mapping, 1465 loff_t pos, unsigned len, unsigned copied, 1466 struct page *page, void *fsdata) 1467 { 1468 handle_t *handle = ext4_journal_current_handle(); 1469 struct inode *inode = mapping->host; 1470 int ret = 0, ret2; 1471 loff_t new_i_size; 1472 1473 trace_mark(ext4_writeback_write_end, 1474 "dev %s ino %lu pos %llu len %u copied %u", 1475 inode->i_sb->s_id, inode->i_ino, 1476 (unsigned long long) pos, len, copied); 1477 new_i_size = pos + copied; 1478 if (new_i_size > EXT4_I(inode)->i_disksize) { 1479 ext4_update_i_disksize(inode, new_i_size); 1480 /* We need to mark inode dirty even if 1481 * new_i_size is less that inode->i_size 1482 * bu greater than i_disksize.(hint delalloc) 1483 */ 1484 ext4_mark_inode_dirty(handle, inode); 1485 } 1486 1487 ret2 = generic_write_end(file, mapping, pos, len, copied, 1488 page, fsdata); 1489 copied = ret2; 1490 if (ret2 < 0) 1491 ret = ret2; 1492 1493 ret2 = ext4_journal_stop(handle); 1494 if (!ret) 1495 ret = ret2; 1496 1497 return ret ? 

	return ret ? ret : copied;
}

static int ext4_journalled_write_end(struct file *file,
				     struct address_space *mapping,
				     loff_t pos, unsigned len, unsigned copied,
				     struct page *page, void *fsdata)
{
	handle_t *handle = ext4_journal_current_handle();
	struct inode *inode = mapping->host;
	int ret = 0, ret2;
	int partial = 0;
	unsigned from, to;
	loff_t new_i_size;

	trace_mark(ext4_journalled_write_end,
		   "dev %s ino %lu pos %llu len %u copied %u",
		   inode->i_sb->s_id, inode->i_ino,
		   (unsigned long long) pos, len, copied);
	from = pos & (PAGE_CACHE_SIZE - 1);
	to = from + len;

	if (copied < len) {
		if (!PageUptodate(page))
			copied = 0;
		page_zero_new_buffers(page, from+copied, to);
	}

	ret = walk_page_buffers(handle, page_buffers(page), from,
				to, &partial, write_end_fn);
	if (!partial)
		SetPageUptodate(page);
	new_i_size = pos + copied;
	if (new_i_size > inode->i_size)
		i_size_write(inode, pos+copied);
	EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
	if (new_i_size > EXT4_I(inode)->i_disksize) {
		ext4_update_i_disksize(inode, new_i_size);
		ret2 = ext4_mark_inode_dirty(handle, inode);
		if (!ret)
			ret = ret2;
	}

	unlock_page(page);
	ret2 = ext4_journal_stop(handle);
	if (!ret)
		ret = ret2;
	page_cache_release(page);

	return ret ? ret : copied;
}

static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
{
	int retries = 0;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	unsigned long md_needed, mdblocks, total = 0;

	/*
	 * recalculate the amount of metadata blocks to reserve
	 * in order to allocate nrblocks
	 * worst case is one extent per block
	 */
repeat:
	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
	total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
	mdblocks = ext4_calc_metadata_amount(inode, total);
	BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);

	md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
	total = md_needed + nrblocks;

	if (ext4_claim_free_blocks(sbi, total)) {
		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
			yield();
			goto repeat;
		}
		return -ENOSPC;
	}
	EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
	EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;

	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
	return 0;	/* success */
}

static void ext4_da_release_space(struct inode *inode, int to_free)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	int total, mdb, mdb_free, release;

	if (!to_free)
		return;		/* Nothing to release, exit */

	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);

	if (!EXT4_I(inode)->i_reserved_data_blocks) {
		/*
		 * if there are no reserved blocks, but we try to free some
		 * then the counter is messed up somewhere.
		 * but since this function is called from invalidate
		 * page, it's harmless to return without any action
		 */
		printk(KERN_INFO "ext4 delalloc try to release %d reserved "
			    "blocks for inode %lu, but there is no reserved "
			    "data blocks\n", to_free, inode->i_ino);
		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
		return;
	}

	/* recalculate the number of metablocks that still need to be reserved */
	total = EXT4_I(inode)->i_reserved_data_blocks - to_free;
	mdb = ext4_calc_metadata_amount(inode, total);

	/* figure out how many metablocks to release */
	BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
	mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;

	release = to_free + mdb_free;

	/* update fs dirty blocks counter for truncate case */
	percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);

	/* update per-inode reservations */
	BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
	EXT4_I(inode)->i_reserved_data_blocks -= to_free;

	BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
	EXT4_I(inode)->i_reserved_meta_blocks = mdb;
	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
}

static void ext4_da_page_release_reservation(struct page *page,
					     unsigned long offset)
{
	int to_release = 0;
	struct buffer_head *head, *bh;
	unsigned int curr_off = 0;

	head = page_buffers(page);
	bh = head;
	do {
		unsigned int next_off = curr_off + bh->b_size;

		if ((offset <= curr_off) && (buffer_delay(bh))) {
			to_release++;
			clear_buffer_delay(bh);
		}
		curr_off = next_off;
	} while ((bh = bh->b_this_page) != head);
	ext4_da_release_space(page->mapping->host, to_release);
}

/*
 * Delayed allocation stuff
 */

struct mpage_da_data {
	struct inode *inode;
	struct buffer_head lbh;			/* extent of blocks */
	unsigned long first_page, next_page;	/* extent of pages */
	get_block_t *get_block;
	struct writeback_control *wbc;
	int io_done;
	int pages_written;
	int retval;
};

/*
 * mpage_da_submit_io - walks through the extent of pages and tries to
 * write them with the writepage() callback
 *
 * @mpd->inode: inode
 * @mpd->first_page: first page of the extent
 * @mpd->next_page: page after the last page of the extent
 * @mpd->get_block: the filesystem's block mapper function
 *
 * By the time mpage_da_submit_io() is called we expect all blocks
 * to be allocated; this may be wrong if allocation failed.
 *
 * As pages are already locked by write_cache_pages(), we can't use it
 */
static int mpage_da_submit_io(struct mpage_da_data *mpd)
{
	long pages_skipped;
	struct pagevec pvec;
	unsigned long index, end;
	int ret = 0, err, nr_pages, i;
	struct inode *inode = mpd->inode;
	struct address_space *mapping = inode->i_mapping;

	BUG_ON(mpd->next_page <= mpd->first_page);
	/*
	 * We need to start from the first_page to the next_page - 1
	 * to make sure we also write the mapped dirty buffer_heads.
	 * If we look at mpd->lbh.b_blocknr we would only be looking
	 * at the currently mapped buffer_heads.
	 */
	index = mpd->first_page;
	end = mpd->next_page - 1;

	pagevec_init(&pvec, 0);
	while (index <= end) {
		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
		if (nr_pages == 0)
			break;
		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			index = page->index;
			if (index > end)
				break;
			index++;

			BUG_ON(!PageLocked(page));
			BUG_ON(PageWriteback(page));

			pages_skipped = mpd->wbc->pages_skipped;
			err = mapping->a_ops->writepage(page, mpd->wbc);
			if (!err && (pages_skipped == mpd->wbc->pages_skipped))
				/*
				 * have successfully written the page
				 * without skipping it
				 */
				mpd->pages_written++;
			/*
			 * In error case, we have to continue because
			 * remaining pages are still locked
			 * XXX: unlock and re-dirty them?
			 */
			if (ret == 0)
				ret = err;
		}
		pagevec_release(&pvec);
	}
	return ret;
}

/*
 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
 *
 * @mpd->inode - inode to walk through
 * @exbh->b_blocknr - first block on a disk
 * @exbh->b_size - amount of space in bytes
 * @logical - first logical block to start assignment with
 *
 * the function goes through all the passed space and puts actual disk
 * block numbers into buffer heads, dropping BH_Delay
 */
static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
				 struct buffer_head *exbh)
{
	struct inode *inode = mpd->inode;
	struct address_space *mapping = inode->i_mapping;
	int blocks = exbh->b_size >> inode->i_blkbits;
	sector_t pblock = exbh->b_blocknr, cur_logical;
	struct buffer_head *head, *bh;
	pgoff_t index, end;
	struct pagevec pvec;
	int nr_pages, i;

	index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
	end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
	cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);

	pagevec_init(&pvec, 0);

	while (index <= end) {
		/* XXX: optimize tail */
		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
		if (nr_pages == 0)
			break;
		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			index = page->index;
			if (index > end)
				break;
			index++;

			BUG_ON(!PageLocked(page));
			BUG_ON(PageWriteback(page));
			BUG_ON(!page_has_buffers(page));

			bh = page_buffers(page);
			head = bh;

			/* skip blocks out of the range */
			do {
				if (cur_logical >= logical)
					break;
				cur_logical++;
			} while ((bh = bh->b_this_page) != head);

			do {
				if (cur_logical >= logical + blocks)
					break;
				if (buffer_delay(bh)) {
					bh->b_blocknr = pblock;
					clear_buffer_delay(bh);
					bh->b_bdev = inode->i_sb->s_bdev;
				} else if (buffer_unwritten(bh)) {
					bh->b_blocknr = pblock;
					clear_buffer_unwritten(bh);
					set_buffer_mapped(bh);
					set_buffer_new(bh);
					bh->b_bdev = inode->i_sb->s_bdev;
				} else if (buffer_mapped(bh))
					BUG_ON(bh->b_blocknr != pblock);

				cur_logical++;
				pblock++;
			} while ((bh = bh->b_this_page) != head);
		}
		pagevec_release(&pvec);
	}
}


/*
 * __unmap_underlying_blocks - just a helper function to unmap
 * a set of blocks described by @bh
 */
static inline void __unmap_underlying_blocks(struct inode
*inode, 1822 struct buffer_head *bh) 1823 { 1824 struct block_device *bdev = inode->i_sb->s_bdev; 1825 int blocks, i; 1826 1827 blocks = bh->b_size >> inode->i_blkbits; 1828 for (i = 0; i < blocks; i++) 1829 unmap_underlying_metadata(bdev, bh->b_blocknr + i); 1830 } 1831 1832 static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, 1833 sector_t logical, long blk_cnt) 1834 { 1835 int nr_pages, i; 1836 pgoff_t index, end; 1837 struct pagevec pvec; 1838 struct inode *inode = mpd->inode; 1839 struct address_space *mapping = inode->i_mapping; 1840 1841 index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 1842 end = (logical + blk_cnt - 1) >> 1843 (PAGE_CACHE_SHIFT - inode->i_blkbits); 1844 while (index <= end) { 1845 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 1846 if (nr_pages == 0) 1847 break; 1848 for (i = 0; i < nr_pages; i++) { 1849 struct page *page = pvec.pages[i]; 1850 index = page->index; 1851 if (index > end) 1852 break; 1853 index++; 1854 1855 BUG_ON(!PageLocked(page)); 1856 BUG_ON(PageWriteback(page)); 1857 block_invalidatepage(page, 0); 1858 ClearPageUptodate(page); 1859 unlock_page(page); 1860 } 1861 } 1862 return; 1863 } 1864 1865 static void ext4_print_free_blocks(struct inode *inode) 1866 { 1867 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1868 printk(KERN_EMERG "Total free blocks count %lld\n", 1869 ext4_count_free_blocks(inode->i_sb)); 1870 printk(KERN_EMERG "Free/Dirty block details\n"); 1871 printk(KERN_EMERG "free_blocks=%lld\n", 1872 (long long)percpu_counter_sum(&sbi->s_freeblocks_counter)); 1873 printk(KERN_EMERG "dirty_blocks=%lld\n", 1874 (long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter)); 1875 printk(KERN_EMERG "Block reservation details\n"); 1876 printk(KERN_EMERG "i_reserved_data_blocks=%u\n", 1877 EXT4_I(inode)->i_reserved_data_blocks); 1878 printk(KERN_EMERG "i_reserved_meta_blocks=%u\n", 1879 EXT4_I(inode)->i_reserved_meta_blocks); 1880 return; 1881 } 1882 1883 /* 1884 * mpage_da_map_blocks - go through given space 1885 * 1886 * @mpd->lbh - bh describing space 1887 * @mpd->get_block - the filesystem's block mapper function 1888 * 1889 * The function skips space we know is already mapped to disk blocks. 1890 * 1891 */ 1892 static int mpage_da_map_blocks(struct mpage_da_data *mpd) 1893 { 1894 int err = 0; 1895 struct buffer_head new; 1896 struct buffer_head *lbh = &mpd->lbh; 1897 sector_t next; 1898 1899 /* 1900 * We consider only non-mapped and non-allocated blocks 1901 */ 1902 if (buffer_mapped(lbh) && !buffer_delay(lbh)) 1903 return 0; 1904 new.b_state = lbh->b_state; 1905 new.b_blocknr = 0; 1906 new.b_size = lbh->b_size; 1907 next = lbh->b_blocknr; 1908 /* 1909 * If we didn't accumulate anything 1910 * to write simply return 1911 */ 1912 if (!new.b_size) 1913 return 0; 1914 err = mpd->get_block(mpd->inode, next, &new, 1); 1915 if (err) { 1916 1917 /* If get block returns with error 1918 * we simply return. Later writepage 1919 * will redirty the page and writepages 1920 * will find the dirty page again 1921 */ 1922 if (err == -EAGAIN) 1923 return 0; 1924 1925 if (err == -ENOSPC && 1926 ext4_count_free_blocks(mpd->inode->i_sb)) { 1927 mpd->retval = err; 1928 return 0; 1929 } 1930 1931 /* 1932 * get block failure will cause us 1933 * to loop in writepages. Because 1934 * a_ops->writepage won't be able to 1935 * make progress. The page will be redirtied 1936 * by writepage and writepages will again 1937 * try to write the same. 
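 * For any other error we print a warning and invalidate the pages of
 * this extent below, so that writepages does not keep retrying a
 * range that cannot make progress.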
1938 */ 1939 printk(KERN_EMERG "%s block allocation failed for inode %lu " 1940 "at logical offset %llu with max blocks " 1941 "%zd with error %d\n", 1942 __func__, mpd->inode->i_ino, 1943 (unsigned long long)next, 1944 lbh->b_size >> mpd->inode->i_blkbits, err); 1945 printk(KERN_EMERG "This should not happen.!! " 1946 "Data will be lost\n"); 1947 if (err == -ENOSPC) { 1948 ext4_print_free_blocks(mpd->inode); 1949 } 1950 /* invlaidate all the pages */ 1951 ext4_da_block_invalidatepages(mpd, next, 1952 lbh->b_size >> mpd->inode->i_blkbits); 1953 return err; 1954 } 1955 BUG_ON(new.b_size == 0); 1956 1957 if (buffer_new(&new)) 1958 __unmap_underlying_blocks(mpd->inode, &new); 1959 1960 /* 1961 * If blocks are delayed marked, we need to 1962 * put actual blocknr and drop delayed bit 1963 */ 1964 if (buffer_delay(lbh) || buffer_unwritten(lbh)) 1965 mpage_put_bnr_to_bhs(mpd, next, &new); 1966 1967 return 0; 1968 } 1969 1970 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ 1971 (1 << BH_Delay) | (1 << BH_Unwritten)) 1972 1973 /* 1974 * mpage_add_bh_to_extent - try to add one more block to extent of blocks 1975 * 1976 * @mpd->lbh - extent of blocks 1977 * @logical - logical number of the block in the file 1978 * @bh - bh of the block (used to access block's state) 1979 * 1980 * the function is used to collect contig. blocks in same state 1981 */ 1982 static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, 1983 sector_t logical, struct buffer_head *bh) 1984 { 1985 sector_t next; 1986 size_t b_size = bh->b_size; 1987 struct buffer_head *lbh = &mpd->lbh; 1988 int nrblocks = lbh->b_size >> mpd->inode->i_blkbits; 1989 1990 /* check if thereserved journal credits might overflow */ 1991 if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { 1992 if (nrblocks >= EXT4_MAX_TRANS_DATA) { 1993 /* 1994 * With non-extent format we are limited by the journal 1995 * credit available. Total credit needed to insert 1996 * nrblocks contiguous blocks is dependent on the 1997 * nrblocks. So limit nrblocks. 1998 */ 1999 goto flush_it; 2000 } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) > 2001 EXT4_MAX_TRANS_DATA) { 2002 /* 2003 * Adding the new buffer_head would make it cross the 2004 * allowed limit for which we have journal credit 2005 * reserved. So limit the new bh->b_size 2006 */ 2007 b_size = (EXT4_MAX_TRANS_DATA - nrblocks) << 2008 mpd->inode->i_blkbits; 2009 /* we will do mpage_da_submit_io in the next loop */ 2010 } 2011 } 2012 /* 2013 * First block in the extent 2014 */ 2015 if (lbh->b_size == 0) { 2016 lbh->b_blocknr = logical; 2017 lbh->b_size = b_size; 2018 lbh->b_state = bh->b_state & BH_FLAGS; 2019 return; 2020 } 2021 2022 next = lbh->b_blocknr + nrblocks; 2023 /* 2024 * Can we merge the block to our big extent? 2025 */ 2026 if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { 2027 lbh->b_size += b_size; 2028 return; 2029 } 2030 2031 flush_it: 2032 /* 2033 * We couldn't merge the block to our extent, so we 2034 * need to flush current extent and start new one 2035 */ 2036 if (mpage_da_map_blocks(mpd) == 0) 2037 mpage_da_submit_io(mpd); 2038 mpd->io_done = 1; 2039 return; 2040 } 2041 2042 /* 2043 * __mpage_da_writepage - finds extent of pages and blocks 2044 * 2045 * @page: page to consider 2046 * @wbc: not used, we just follow rules 2047 * @data: context 2048 * 2049 * The function finds extents of pages and scan them for all blocks. 
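 * Dirty blocks that still need allocation are collected into mpd->lbh
 * by mpage_add_bh_to_extent(); whenever a page or block cannot be
 * merged into the current extent, the accumulated extent is mapped
 * and submitted first.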
2050 */ 2051 static int __mpage_da_writepage(struct page *page, 2052 struct writeback_control *wbc, void *data) 2053 { 2054 struct mpage_da_data *mpd = data; 2055 struct inode *inode = mpd->inode; 2056 struct buffer_head *bh, *head, fake; 2057 sector_t logical; 2058 2059 if (mpd->io_done) { 2060 /* 2061 * Rest of the page in the page_vec 2062 * redirty then and skip then. We will 2063 * try to to write them again after 2064 * starting a new transaction 2065 */ 2066 redirty_page_for_writepage(wbc, page); 2067 unlock_page(page); 2068 return MPAGE_DA_EXTENT_TAIL; 2069 } 2070 /* 2071 * Can we merge this page to current extent? 2072 */ 2073 if (mpd->next_page != page->index) { 2074 /* 2075 * Nope, we can't. So, we map non-allocated blocks 2076 * and start IO on them using writepage() 2077 */ 2078 if (mpd->next_page != mpd->first_page) { 2079 if (mpage_da_map_blocks(mpd) == 0) 2080 mpage_da_submit_io(mpd); 2081 /* 2082 * skip rest of the page in the page_vec 2083 */ 2084 mpd->io_done = 1; 2085 redirty_page_for_writepage(wbc, page); 2086 unlock_page(page); 2087 return MPAGE_DA_EXTENT_TAIL; 2088 } 2089 2090 /* 2091 * Start next extent of pages ... 2092 */ 2093 mpd->first_page = page->index; 2094 2095 /* 2096 * ... and blocks 2097 */ 2098 mpd->lbh.b_size = 0; 2099 mpd->lbh.b_state = 0; 2100 mpd->lbh.b_blocknr = 0; 2101 } 2102 2103 mpd->next_page = page->index + 1; 2104 logical = (sector_t) page->index << 2105 (PAGE_CACHE_SHIFT - inode->i_blkbits); 2106 2107 if (!page_has_buffers(page)) { 2108 /* 2109 * There is no attached buffer heads yet (mmap?) 2110 * we treat the page asfull of dirty blocks 2111 */ 2112 bh = &fake; 2113 bh->b_size = PAGE_CACHE_SIZE; 2114 bh->b_state = 0; 2115 set_buffer_dirty(bh); 2116 set_buffer_uptodate(bh); 2117 mpage_add_bh_to_extent(mpd, logical, bh); 2118 if (mpd->io_done) 2119 return MPAGE_DA_EXTENT_TAIL; 2120 } else { 2121 /* 2122 * Page with regular buffer heads, just add all dirty ones 2123 */ 2124 head = page_buffers(page); 2125 bh = head; 2126 do { 2127 BUG_ON(buffer_locked(bh)); 2128 /* 2129 * We need to try to allocate 2130 * unmapped blocks in the same page. 2131 * Otherwise we won't make progress 2132 * with the page in ext4_da_writepage 2133 */ 2134 if (buffer_dirty(bh) && 2135 (!buffer_mapped(bh) || buffer_delay(bh))) { 2136 mpage_add_bh_to_extent(mpd, logical, bh); 2137 if (mpd->io_done) 2138 return MPAGE_DA_EXTENT_TAIL; 2139 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { 2140 /* 2141 * mapped dirty buffer. We need to update 2142 * the b_state because we look at 2143 * b_state in mpage_da_map_blocks. We don't 2144 * update b_size because if we find an 2145 * unmapped buffer_head later we need to 2146 * use the b_state flag of that buffer_head. 2147 */ 2148 if (mpd->lbh.b_size == 0) 2149 mpd->lbh.b_state = 2150 bh->b_state & BH_FLAGS; 2151 } 2152 logical++; 2153 } while ((bh = bh->b_this_page) != head); 2154 } 2155 2156 return 0; 2157 } 2158 2159 /* 2160 * mpage_da_writepages - walk the list of dirty pages of the given 2161 * address space, allocates non-allocated blocks, maps newly-allocated 2162 * blocks to existing bhs and issue IO them 2163 * 2164 * @mapping: address space structure to write 2165 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 2166 * @get_block: the filesystem's block mapper function. 2167 * 2168 * This is a library function, which implements the writepages() 2169 * address_space_operation. 
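 * It hands __mpage_da_writepage() to write_cache_pages() and, once
 * the walk is finished, maps and submits whatever extent is still
 * pending in @mpd.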
2170 */ 2171 static int mpage_da_writepages(struct address_space *mapping, 2172 struct writeback_control *wbc, 2173 struct mpage_da_data *mpd) 2174 { 2175 int ret; 2176 2177 if (!mpd->get_block) 2178 return generic_writepages(mapping, wbc); 2179 2180 mpd->lbh.b_size = 0; 2181 mpd->lbh.b_state = 0; 2182 mpd->lbh.b_blocknr = 0; 2183 mpd->first_page = 0; 2184 mpd->next_page = 0; 2185 mpd->io_done = 0; 2186 mpd->pages_written = 0; 2187 mpd->retval = 0; 2188 2189 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd); 2190 /* 2191 * Handle last extent of pages 2192 */ 2193 if (!mpd->io_done && mpd->next_page != mpd->first_page) { 2194 if (mpage_da_map_blocks(mpd) == 0) 2195 mpage_da_submit_io(mpd); 2196 2197 mpd->io_done = 1; 2198 ret = MPAGE_DA_EXTENT_TAIL; 2199 } 2200 wbc->nr_to_write -= mpd->pages_written; 2201 return ret; 2202 } 2203 2204 /* 2205 * this is a special callback for ->write_begin() only 2206 * it's intention is to return mapped block or reserve space 2207 */ 2208 static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, 2209 struct buffer_head *bh_result, int create) 2210 { 2211 int ret = 0; 2212 2213 BUG_ON(create == 0); 2214 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); 2215 2216 /* 2217 * first, we need to know whether the block is allocated already 2218 * preallocated blocks are unmapped but should treated 2219 * the same as allocated blocks. 2220 */ 2221 ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0); 2222 if ((ret == 0) && !buffer_delay(bh_result)) { 2223 /* the block isn't (pre)allocated yet, let's reserve space */ 2224 /* 2225 * XXX: __block_prepare_write() unmaps passed block, 2226 * is it OK? 2227 */ 2228 ret = ext4_da_reserve_space(inode, 1); 2229 if (ret) 2230 /* not enough space to reserve */ 2231 return ret; 2232 2233 map_bh(bh_result, inode->i_sb, 0); 2234 set_buffer_new(bh_result); 2235 set_buffer_delay(bh_result); 2236 } else if (ret > 0) { 2237 bh_result->b_size = (ret << inode->i_blkbits); 2238 ret = 0; 2239 } 2240 2241 return ret; 2242 } 2243 #define EXT4_DELALLOC_RSVED 1 2244 static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, 2245 struct buffer_head *bh_result, int create) 2246 { 2247 int ret; 2248 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 2249 loff_t disksize = EXT4_I(inode)->i_disksize; 2250 handle_t *handle = NULL; 2251 2252 handle = ext4_journal_current_handle(); 2253 BUG_ON(!handle); 2254 ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, 2255 bh_result, create, 0, EXT4_DELALLOC_RSVED); 2256 if (ret > 0) { 2257 2258 bh_result->b_size = (ret << inode->i_blkbits); 2259 2260 if (ext4_should_order_data(inode)) { 2261 int retval; 2262 retval = ext4_jbd2_file_inode(handle, inode); 2263 if (retval) 2264 /* 2265 * Failed to add inode for ordered 2266 * mode. 
Don't update file size 2267 */ 2268 return retval; 2269 } 2270 2271 /* 2272 * Update on-disk size along with block allocation 2273 * we don't use 'extend_disksize' as size may change 2274 * within already allocated block -bzzz 2275 */ 2276 disksize = ((loff_t) iblock + ret) << inode->i_blkbits; 2277 if (disksize > i_size_read(inode)) 2278 disksize = i_size_read(inode); 2279 if (disksize > EXT4_I(inode)->i_disksize) { 2280 ext4_update_i_disksize(inode, disksize); 2281 ret = ext4_mark_inode_dirty(handle, inode); 2282 return ret; 2283 } 2284 ret = 0; 2285 } 2286 return ret; 2287 } 2288 2289 static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) 2290 { 2291 /* 2292 * unmapped buffer is possible for holes. 2293 * delay buffer is possible with delayed allocation 2294 */ 2295 return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh)); 2296 } 2297 2298 static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock, 2299 struct buffer_head *bh_result, int create) 2300 { 2301 int ret = 0; 2302 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; 2303 2304 /* 2305 * we don't want to do block allocation in writepage 2306 * so call get_block_wrap with create = 0 2307 */ 2308 ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks, 2309 bh_result, 0, 0, 0); 2310 if (ret > 0) { 2311 bh_result->b_size = (ret << inode->i_blkbits); 2312 ret = 0; 2313 } 2314 return ret; 2315 } 2316 2317 /* 2318 * get called vi ext4_da_writepages after taking page lock (have journal handle) 2319 * get called via journal_submit_inode_data_buffers (no journal handle) 2320 * get called via shrink_page_list via pdflush (no journal handle) 2321 * or grab_page_cache when doing write_begin (have journal handle) 2322 */ 2323 static int ext4_da_writepage(struct page *page, 2324 struct writeback_control *wbc) 2325 { 2326 int ret = 0; 2327 loff_t size; 2328 unsigned int len; 2329 struct buffer_head *page_bufs; 2330 struct inode *inode = page->mapping->host; 2331 2332 trace_mark(ext4_da_writepage, 2333 "dev %s ino %lu page_index %lu", 2334 inode->i_sb->s_id, inode->i_ino, page->index); 2335 size = i_size_read(inode); 2336 if (page->index == size >> PAGE_CACHE_SHIFT) 2337 len = size & ~PAGE_CACHE_MASK; 2338 else 2339 len = PAGE_CACHE_SIZE; 2340 2341 if (page_has_buffers(page)) { 2342 page_bufs = page_buffers(page); 2343 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, 2344 ext4_bh_unmapped_or_delay)) { 2345 /* 2346 * We don't want to do block allocation 2347 * So redirty the page and return 2348 * We may reach here when we do a journal commit 2349 * via journal_submit_inode_data_buffers. 2350 * If we don't have mapping block we just ignore 2351 * them. We can also reach here via shrink_page_list 2352 */ 2353 redirty_page_for_writepage(wbc, page); 2354 unlock_page(page); 2355 return 0; 2356 } 2357 } else { 2358 /* 2359 * The test for page_has_buffers() is subtle: 2360 * We know the page is dirty but it lost buffers. That means 2361 * that at some moment in time after write_begin()/write_end() 2362 * has been called all buffers have been clean and thus they 2363 * must have been written at least once. So they are all 2364 * mapped and we can happily proceed with mapping them 2365 * and writing the page. 2366 * 2367 * Try to initialize the buffer_heads and check whether 2368 * all are mapped and non delay. We don't want to 2369 * do block allocation here. 
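 * (ext4_normal_get_block_write never allocates: it calls the block
 * mapping code with create == 0, so block_prepare_write() below can
 * only attach blocks that already exist on disk.)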
2370 */ 2371 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, 2372 ext4_normal_get_block_write); 2373 if (!ret) { 2374 page_bufs = page_buffers(page); 2375 /* check whether all are mapped and non delay */ 2376 if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, 2377 ext4_bh_unmapped_or_delay)) { 2378 redirty_page_for_writepage(wbc, page); 2379 unlock_page(page); 2380 return 0; 2381 } 2382 } else { 2383 /* 2384 * We can't do block allocation here 2385 * so just redity the page and unlock 2386 * and return 2387 */ 2388 redirty_page_for_writepage(wbc, page); 2389 unlock_page(page); 2390 return 0; 2391 } 2392 /* now mark the buffer_heads as dirty and uptodate */ 2393 block_commit_write(page, 0, PAGE_CACHE_SIZE); 2394 } 2395 2396 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) 2397 ret = nobh_writepage(page, ext4_normal_get_block_write, wbc); 2398 else 2399 ret = block_write_full_page(page, 2400 ext4_normal_get_block_write, 2401 wbc); 2402 2403 return ret; 2404 } 2405 2406 /* 2407 * This is called via ext4_da_writepages() to 2408 * calulate the total number of credits to reserve to fit 2409 * a single extent allocation into a single transaction, 2410 * ext4_da_writpeages() will loop calling this before 2411 * the block allocation. 2412 */ 2413 2414 static int ext4_da_writepages_trans_blocks(struct inode *inode) 2415 { 2416 int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; 2417 2418 /* 2419 * With non-extent format the journal credit needed to 2420 * insert nrblocks contiguous block is dependent on 2421 * number of contiguous block. So we will limit 2422 * number of contiguous block to a sane value 2423 */ 2424 if (!(inode->i_flags & EXT4_EXTENTS_FL) && 2425 (max_blocks > EXT4_MAX_TRANS_DATA)) 2426 max_blocks = EXT4_MAX_TRANS_DATA; 2427 2428 return ext4_chunk_trans_blocks(inode, max_blocks); 2429 } 2430 2431 static int ext4_da_writepages(struct address_space *mapping, 2432 struct writeback_control *wbc) 2433 { 2434 pgoff_t index; 2435 int range_whole = 0; 2436 handle_t *handle = NULL; 2437 struct mpage_da_data mpd; 2438 struct inode *inode = mapping->host; 2439 int no_nrwrite_index_update; 2440 int pages_written = 0; 2441 long pages_skipped; 2442 int range_cyclic, cycled = 1, io_done = 0; 2443 int needed_blocks, ret = 0, nr_to_writebump = 0; 2444 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2445 2446 trace_mark(ext4_da_writepages, 2447 "dev %s ino %lu nr_t_write %ld " 2448 "pages_skipped %ld range_start %llu " 2449 "range_end %llu nonblocking %d " 2450 "for_kupdate %d for_reclaim %d " 2451 "for_writepages %d range_cyclic %d", 2452 inode->i_sb->s_id, inode->i_ino, 2453 wbc->nr_to_write, wbc->pages_skipped, 2454 (unsigned long long) wbc->range_start, 2455 (unsigned long long) wbc->range_end, 2456 wbc->nonblocking, wbc->for_kupdate, 2457 wbc->for_reclaim, wbc->for_writepages, 2458 wbc->range_cyclic); 2459 2460 /* 2461 * No pages to write? This is mainly a kludge to avoid starting 2462 * a transaction for special inodes like journal inode on last iput() 2463 * because that could violate lock ordering on umount 2464 */ 2465 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 2466 return 0; 2467 2468 /* 2469 * If the filesystem has aborted, it is read-only, so return 2470 * right away instead of dumping stack traces later on that 2471 * will obscure the real source of the problem. 
We test 2472 * EXT4_MOUNT_ABORT instead of sb->s_flag's MS_RDONLY because 2473 * the latter could be true if the filesystem is mounted 2474 * read-only, and in that case, ext4_da_writepages should 2475 * *never* be called, so if that ever happens, we would want 2476 * the stack trace. 2477 */ 2478 if (unlikely(sbi->s_mount_opt & EXT4_MOUNT_ABORT)) 2479 return -EROFS; 2480 2481 /* 2482 * Make sure nr_to_write is >= sbi->s_mb_stream_request 2483 * This make sure small files blocks are allocated in 2484 * single attempt. This ensure that small files 2485 * get less fragmented. 2486 */ 2487 if (wbc->nr_to_write < sbi->s_mb_stream_request) { 2488 nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; 2489 wbc->nr_to_write = sbi->s_mb_stream_request; 2490 } 2491 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2492 range_whole = 1; 2493 2494 range_cyclic = wbc->range_cyclic; 2495 if (wbc->range_cyclic) { 2496 index = mapping->writeback_index; 2497 if (index) 2498 cycled = 0; 2499 wbc->range_start = index << PAGE_CACHE_SHIFT; 2500 wbc->range_end = LLONG_MAX; 2501 wbc->range_cyclic = 0; 2502 } else 2503 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2504 2505 mpd.wbc = wbc; 2506 mpd.inode = mapping->host; 2507 2508 /* 2509 * we don't want write_cache_pages to update 2510 * nr_to_write and writeback_index 2511 */ 2512 no_nrwrite_index_update = wbc->no_nrwrite_index_update; 2513 wbc->no_nrwrite_index_update = 1; 2514 pages_skipped = wbc->pages_skipped; 2515 2516 retry: 2517 while (!ret && wbc->nr_to_write > 0) { 2518 2519 /* 2520 * we insert one extent at a time. So we need 2521 * credit needed for single extent allocation. 2522 * journalled mode is currently not supported 2523 * by delalloc 2524 */ 2525 BUG_ON(ext4_should_journal_data(inode)); 2526 needed_blocks = ext4_da_writepages_trans_blocks(inode); 2527 2528 /* start a new transaction*/ 2529 handle = ext4_journal_start(inode, needed_blocks); 2530 if (IS_ERR(handle)) { 2531 ret = PTR_ERR(handle); 2532 printk(KERN_CRIT "%s: jbd2_start: " 2533 "%ld pages, ino %lu; err %d\n", __func__, 2534 wbc->nr_to_write, inode->i_ino, ret); 2535 dump_stack(); 2536 goto out_writepages; 2537 } 2538 mpd.get_block = ext4_da_get_block_write; 2539 ret = mpage_da_writepages(mapping, wbc, &mpd); 2540 2541 ext4_journal_stop(handle); 2542 2543 if (mpd.retval == -ENOSPC) { 2544 /* commit the transaction which would 2545 * free blocks released in the transaction 2546 * and try again 2547 */ 2548 jbd2_journal_force_commit_nested(sbi->s_journal); 2549 wbc->pages_skipped = pages_skipped; 2550 ret = 0; 2551 } else if (ret == MPAGE_DA_EXTENT_TAIL) { 2552 /* 2553 * got one extent now try with 2554 * rest of the pages 2555 */ 2556 pages_written += mpd.pages_written; 2557 wbc->pages_skipped = pages_skipped; 2558 ret = 0; 2559 io_done = 1; 2560 } else if (wbc->nr_to_write) 2561 /* 2562 * There is no more writeout needed 2563 * or we requested for a noblocking writeout 2564 * and we found the device congested 2565 */ 2566 break; 2567 } 2568 if (!io_done && !cycled) { 2569 cycled = 1; 2570 index = 0; 2571 wbc->range_start = index << PAGE_CACHE_SHIFT; 2572 wbc->range_end = mapping->writeback_index - 1; 2573 goto retry; 2574 } 2575 if (pages_skipped != wbc->pages_skipped) 2576 printk(KERN_EMERG "This should not happen leaving %s " 2577 "with nr_to_write = %ld ret = %d\n", 2578 __func__, wbc->nr_to_write, ret); 2579 2580 /* Update index */ 2581 index += pages_written; 2582 wbc->range_cyclic = range_cyclic; 2583 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 
0)) 2584 /* 2585 * set the writeback_index so that range_cyclic 2586 * mode will write it back later 2587 */ 2588 mapping->writeback_index = index; 2589 2590 out_writepages: 2591 if (!no_nrwrite_index_update) 2592 wbc->no_nrwrite_index_update = 0; 2593 wbc->nr_to_write -= nr_to_writebump; 2594 trace_mark(ext4_da_writepage_result, 2595 "dev %s ino %lu ret %d pages_written %d " 2596 "pages_skipped %ld congestion %d " 2597 "more_io %d no_nrwrite_index_update %d", 2598 inode->i_sb->s_id, inode->i_ino, ret, 2599 pages_written, wbc->pages_skipped, 2600 wbc->encountered_congestion, wbc->more_io, 2601 wbc->no_nrwrite_index_update); 2602 return ret; 2603 } 2604 2605 #define FALL_BACK_TO_NONDELALLOC 1 2606 static int ext4_nonda_switch(struct super_block *sb) 2607 { 2608 s64 free_blocks, dirty_blocks; 2609 struct ext4_sb_info *sbi = EXT4_SB(sb); 2610 2611 /* 2612 * switch to non delalloc mode if we are running low 2613 * on free block. The free block accounting via percpu 2614 * counters can get slightly wrong with percpu_counter_batch getting 2615 * accumulated on each CPU without updating global counters 2616 * Delalloc need an accurate free block accounting. So switch 2617 * to non delalloc when we are near to error range. 2618 */ 2619 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); 2620 dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter); 2621 if (2 * free_blocks < 3 * dirty_blocks || 2622 free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) { 2623 /* 2624 * free block count is less that 150% of dirty blocks 2625 * or free blocks is less that watermark 2626 */ 2627 return 1; 2628 } 2629 return 0; 2630 } 2631 2632 static int ext4_da_write_begin(struct file *file, struct address_space *mapping, 2633 loff_t pos, unsigned len, unsigned flags, 2634 struct page **pagep, void **fsdata) 2635 { 2636 int ret, retries = 0; 2637 struct page *page; 2638 pgoff_t index; 2639 unsigned from, to; 2640 struct inode *inode = mapping->host; 2641 handle_t *handle; 2642 2643 index = pos >> PAGE_CACHE_SHIFT; 2644 from = pos & (PAGE_CACHE_SIZE - 1); 2645 to = from + len; 2646 2647 if (ext4_nonda_switch(inode->i_sb)) { 2648 *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; 2649 return ext4_write_begin(file, mapping, pos, 2650 len, flags, pagep, fsdata); 2651 } 2652 *fsdata = (void *)0; 2653 2654 trace_mark(ext4_da_write_begin, 2655 "dev %s ino %lu pos %llu len %u flags %u", 2656 inode->i_sb->s_id, inode->i_ino, 2657 (unsigned long long) pos, len, flags); 2658 retry: 2659 /* 2660 * With delayed allocation, we don't log the i_disksize update 2661 * if there is delayed block allocation. But we still need 2662 * to journalling the i_disksize update if writes to the end 2663 * of file which has an already mapped buffer. 2664 */ 2665 handle = ext4_journal_start(inode, 1); 2666 if (IS_ERR(handle)) { 2667 ret = PTR_ERR(handle); 2668 goto out; 2669 } 2670 2671 page = grab_cache_page_write_begin(mapping, index, flags); 2672 if (!page) { 2673 ext4_journal_stop(handle); 2674 ret = -ENOMEM; 2675 goto out; 2676 } 2677 *pagep = page; 2678 2679 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 2680 ext4_da_get_block_prep); 2681 if (ret < 0) { 2682 unlock_page(page); 2683 ext4_journal_stop(handle); 2684 page_cache_release(page); 2685 /* 2686 * block_write_begin may have instantiated a few blocks 2687 * outside i_size. Trim these off again. Don't need 2688 * i_size_read because we hold i_mutex. 
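 * (The vmtruncate() call below drops both the page cache and any
 * blocks instantiated beyond i_size.)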
2689 */ 2690 if (pos + len > inode->i_size) 2691 vmtruncate(inode, inode->i_size); 2692 } 2693 2694 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 2695 goto retry; 2696 out: 2697 return ret; 2698 } 2699 2700 /* 2701 * Check if we should update i_disksize 2702 * when write to the end of file but not require block allocation 2703 */ 2704 static int ext4_da_should_update_i_disksize(struct page *page, 2705 unsigned long offset) 2706 { 2707 struct buffer_head *bh; 2708 struct inode *inode = page->mapping->host; 2709 unsigned int idx; 2710 int i; 2711 2712 bh = page_buffers(page); 2713 idx = offset >> inode->i_blkbits; 2714 2715 for (i = 0; i < idx; i++) 2716 bh = bh->b_this_page; 2717 2718 if (!buffer_mapped(bh) || (buffer_delay(bh))) 2719 return 0; 2720 return 1; 2721 } 2722 2723 static int ext4_da_write_end(struct file *file, 2724 struct address_space *mapping, 2725 loff_t pos, unsigned len, unsigned copied, 2726 struct page *page, void *fsdata) 2727 { 2728 struct inode *inode = mapping->host; 2729 int ret = 0, ret2; 2730 handle_t *handle = ext4_journal_current_handle(); 2731 loff_t new_i_size; 2732 unsigned long start, end; 2733 int write_mode = (int)(unsigned long)fsdata; 2734 2735 if (write_mode == FALL_BACK_TO_NONDELALLOC) { 2736 if (ext4_should_order_data(inode)) { 2737 return ext4_ordered_write_end(file, mapping, pos, 2738 len, copied, page, fsdata); 2739 } else if (ext4_should_writeback_data(inode)) { 2740 return ext4_writeback_write_end(file, mapping, pos, 2741 len, copied, page, fsdata); 2742 } else { 2743 BUG(); 2744 } 2745 } 2746 2747 trace_mark(ext4_da_write_end, 2748 "dev %s ino %lu pos %llu len %u copied %u", 2749 inode->i_sb->s_id, inode->i_ino, 2750 (unsigned long long) pos, len, copied); 2751 start = pos & (PAGE_CACHE_SIZE - 1); 2752 end = start + copied - 1; 2753 2754 /* 2755 * generic_write_end() will run mark_inode_dirty() if i_size 2756 * changes. So let's piggyback the i_disksize mark_inode_dirty 2757 * into that. 2758 */ 2759 2760 new_i_size = pos + copied; 2761 if (new_i_size > EXT4_I(inode)->i_disksize) { 2762 if (ext4_da_should_update_i_disksize(page, end)) { 2763 down_write(&EXT4_I(inode)->i_data_sem); 2764 if (new_i_size > EXT4_I(inode)->i_disksize) { 2765 /* 2766 * Updating i_disksize when extending file 2767 * without needing block allocation 2768 */ 2769 if (ext4_should_order_data(inode)) 2770 ret = ext4_jbd2_file_inode(handle, 2771 inode); 2772 2773 EXT4_I(inode)->i_disksize = new_i_size; 2774 } 2775 up_write(&EXT4_I(inode)->i_data_sem); 2776 /* We need to mark inode dirty even if 2777 * new_i_size is less that inode->i_size 2778 * bu greater than i_disksize.(hint delalloc) 2779 */ 2780 ext4_mark_inode_dirty(handle, inode); 2781 } 2782 } 2783 ret2 = generic_write_end(file, mapping, pos, len, copied, 2784 page, fsdata); 2785 copied = ret2; 2786 if (ret2 < 0) 2787 ret = ret2; 2788 ret2 = ext4_journal_stop(handle); 2789 if (!ret) 2790 ret = ret2; 2791 2792 return ret ? ret : copied; 2793 } 2794 2795 static void ext4_da_invalidatepage(struct page *page, unsigned long offset) 2796 { 2797 /* 2798 * Drop reserved blocks 2799 */ 2800 BUG_ON(!PageLocked(page)); 2801 if (!page_has_buffers(page)) 2802 goto out; 2803 2804 ext4_da_page_release_reservation(page, offset); 2805 2806 out: 2807 ext4_invalidatepage(page, offset); 2808 2809 return; 2810 } 2811 2812 2813 /* 2814 * bmap() is special. It gets used by applications such as lilo and by 2815 * the swapper to find the on-disk block of a specific piece of data. 
2816 * 2817 * Naturally, this is dangerous if the block concerned is still in the 2818 * journal. If somebody makes a swapfile on an ext4 data-journaling 2819 * filesystem and enables swap, then they may get a nasty shock when the 2820 * data getting swapped to that swapfile suddenly gets overwritten by 2821 * the original zero's written out previously to the journal and 2822 * awaiting writeback in the kernel's buffer cache. 2823 * 2824 * So, if we see any bmap calls here on a modified, data-journaled file, 2825 * take extra steps to flush any blocks which might be in the cache. 2826 */ 2827 static sector_t ext4_bmap(struct address_space *mapping, sector_t block) 2828 { 2829 struct inode *inode = mapping->host; 2830 journal_t *journal; 2831 int err; 2832 2833 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && 2834 test_opt(inode->i_sb, DELALLOC)) { 2835 /* 2836 * With delalloc we want to sync the file 2837 * so that we can make sure we allocate 2838 * blocks for file 2839 */ 2840 filemap_write_and_wait(mapping); 2841 } 2842 2843 if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { 2844 /* 2845 * This is a REALLY heavyweight approach, but the use of 2846 * bmap on dirty files is expected to be extremely rare: 2847 * only if we run lilo or swapon on a freshly made file 2848 * do we expect this to happen. 2849 * 2850 * (bmap requires CAP_SYS_RAWIO so this does not 2851 * represent an unprivileged user DOS attack --- we'd be 2852 * in trouble if mortal users could trigger this path at 2853 * will.) 2854 * 2855 * NB. EXT4_STATE_JDATA is not set on files other than 2856 * regular files. If somebody wants to bmap a directory 2857 * or symlink and gets confused because the buffer 2858 * hasn't yet been flushed to disk, they deserve 2859 * everything they get. 2860 */ 2861 2862 EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA; 2863 journal = EXT4_JOURNAL(inode); 2864 jbd2_journal_lock_updates(journal); 2865 err = jbd2_journal_flush(journal); 2866 jbd2_journal_unlock_updates(journal); 2867 2868 if (err) 2869 return 0; 2870 } 2871 2872 return generic_block_bmap(mapping, block, ext4_get_block); 2873 } 2874 2875 static int bget_one(handle_t *handle, struct buffer_head *bh) 2876 { 2877 get_bh(bh); 2878 return 0; 2879 } 2880 2881 static int bput_one(handle_t *handle, struct buffer_head *bh) 2882 { 2883 put_bh(bh); 2884 return 0; 2885 } 2886 2887 /* 2888 * Note that we don't need to start a transaction unless we're journaling data 2889 * because we should have holes filled from ext4_page_mkwrite(). We even don't 2890 * need to file the inode to the transaction's list in ordered mode because if 2891 * we are writing back data added by write(), the inode is already there and if 2892 * we are writing back data modified via mmap(), noone guarantees in which 2893 * transaction the data will hit the disk. In case we are journaling data, we 2894 * cannot start transaction directly because transaction start ranks above page 2895 * lock so we have to do some magic. 2896 * 2897 * In all journaling modes block_write_full_page() will start the I/O. 2898 * 2899 * Problem: 2900 * 2901 * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> 2902 * ext4_writepage() 2903 * 2904 * Similar for: 2905 * 2906 * ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ... 2907 * 2908 * Same applies to ext4_get_block(). 
We will deadlock on various things like 2909 * lock_journal and i_data_sem 2910 * 2911 * Setting PF_MEMALLOC here doesn't work - too many internal memory 2912 * allocations fail. 2913 * 2914 * 16May01: If we're reentered then journal_current_handle() will be 2915 * non-zero. We simply *return*. 2916 * 2917 * 1 July 2001: @@@ FIXME: 2918 * In journalled data mode, a data buffer may be metadata against the 2919 * current transaction. But the same file is part of a shared mapping 2920 * and someone does a writepage() on it. 2921 * 2922 * We will move the buffer onto the async_data list, but *after* it has 2923 * been dirtied. So there's a small window where we have dirty data on 2924 * BJ_Metadata. 2925 * 2926 * Note that this only applies to the last partial page in the file. The 2927 * bit which block_write_full_page() uses prepare/commit for. (That's 2928 * broken code anyway: it's wrong for msync()). 2929 * 2930 * It's a rare case: affects the final partial page, for journalled data 2931 * where the file is subject to bith write() and writepage() in the same 2932 * transction. To fix it we'll need a custom block_write_full_page(). 2933 * We'll probably need that anyway for journalling writepage() output. 2934 * 2935 * We don't honour synchronous mounts for writepage(). That would be 2936 * disastrous. Any write() or metadata operation will sync the fs for 2937 * us. 2938 * 2939 */ 2940 static int __ext4_normal_writepage(struct page *page, 2941 struct writeback_control *wbc) 2942 { 2943 struct inode *inode = page->mapping->host; 2944 2945 if (test_opt(inode->i_sb, NOBH)) 2946 return nobh_writepage(page, 2947 ext4_normal_get_block_write, wbc); 2948 else 2949 return block_write_full_page(page, 2950 ext4_normal_get_block_write, 2951 wbc); 2952 } 2953 2954 static int ext4_normal_writepage(struct page *page, 2955 struct writeback_control *wbc) 2956 { 2957 struct inode *inode = page->mapping->host; 2958 loff_t size = i_size_read(inode); 2959 loff_t len; 2960 2961 trace_mark(ext4_normal_writepage, 2962 "dev %s ino %lu page_index %lu", 2963 inode->i_sb->s_id, inode->i_ino, page->index); 2964 J_ASSERT(PageLocked(page)); 2965 if (page->index == size >> PAGE_CACHE_SHIFT) 2966 len = size & ~PAGE_CACHE_MASK; 2967 else 2968 len = PAGE_CACHE_SIZE; 2969 2970 if (page_has_buffers(page)) { 2971 /* if page has buffers it should all be mapped 2972 * and allocated. If there are not buffers attached 2973 * to the page we know the page is dirty but it lost 2974 * buffers. That means that at some moment in time 2975 * after write_begin() / write_end() has been called 2976 * all buffers have been clean and thus they must have been 2977 * written at least once. So they are all mapped and we can 2978 * happily proceed with mapping them and writing the page. 
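 * The BUG_ON() below asserts exactly that: no dirty buffer on this
 * page may still be unmapped or marked delayed.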
2979 */ 2980 BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 2981 ext4_bh_unmapped_or_delay)); 2982 } 2983 2984 if (!ext4_journal_current_handle()) 2985 return __ext4_normal_writepage(page, wbc); 2986 2987 redirty_page_for_writepage(wbc, page); 2988 unlock_page(page); 2989 return 0; 2990 } 2991 2992 static int __ext4_journalled_writepage(struct page *page, 2993 struct writeback_control *wbc) 2994 { 2995 struct address_space *mapping = page->mapping; 2996 struct inode *inode = mapping->host; 2997 struct buffer_head *page_bufs; 2998 handle_t *handle = NULL; 2999 int ret = 0; 3000 int err; 3001 3002 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, 3003 ext4_normal_get_block_write); 3004 if (ret != 0) 3005 goto out_unlock; 3006 3007 page_bufs = page_buffers(page); 3008 walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, 3009 bget_one); 3010 /* As soon as we unlock the page, it can go away, but we have 3011 * references to buffers so we are safe */ 3012 unlock_page(page); 3013 3014 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); 3015 if (IS_ERR(handle)) { 3016 ret = PTR_ERR(handle); 3017 goto out; 3018 } 3019 3020 ret = walk_page_buffers(handle, page_bufs, 0, 3021 PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); 3022 3023 err = walk_page_buffers(handle, page_bufs, 0, 3024 PAGE_CACHE_SIZE, NULL, write_end_fn); 3025 if (ret == 0) 3026 ret = err; 3027 err = ext4_journal_stop(handle); 3028 if (!ret) 3029 ret = err; 3030 3031 walk_page_buffers(handle, page_bufs, 0, 3032 PAGE_CACHE_SIZE, NULL, bput_one); 3033 EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; 3034 goto out; 3035 3036 out_unlock: 3037 unlock_page(page); 3038 out: 3039 return ret; 3040 } 3041 3042 static int ext4_journalled_writepage(struct page *page, 3043 struct writeback_control *wbc) 3044 { 3045 struct inode *inode = page->mapping->host; 3046 loff_t size = i_size_read(inode); 3047 loff_t len; 3048 3049 trace_mark(ext4_journalled_writepage, 3050 "dev %s ino %lu page_index %lu", 3051 inode->i_sb->s_id, inode->i_ino, page->index); 3052 J_ASSERT(PageLocked(page)); 3053 if (page->index == size >> PAGE_CACHE_SHIFT) 3054 len = size & ~PAGE_CACHE_MASK; 3055 else 3056 len = PAGE_CACHE_SIZE; 3057 3058 if (page_has_buffers(page)) { 3059 /* if page has buffers it should all be mapped 3060 * and allocated. If there are not buffers attached 3061 * to the page we know the page is dirty but it lost 3062 * buffers. That means that at some moment in time 3063 * after write_begin() / write_end() has been called 3064 * all buffers have been clean and thus they must have been 3065 * written at least once. So they are all mapped and we can 3066 * happily proceed with mapping them and writing the page. 3067 */ 3068 BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 3069 ext4_bh_unmapped_or_delay)); 3070 } 3071 3072 if (ext4_journal_current_handle()) 3073 goto no_write; 3074 3075 if (PageChecked(page)) { 3076 /* 3077 * It's mmapped pagecache. Add buffers and journal it. There 3078 * doesn't seem much point in redirtying the page here. 3079 */ 3080 ClearPageChecked(page); 3081 return __ext4_journalled_writepage(page, wbc); 3082 } else { 3083 /* 3084 * It may be a page full of checkpoint-mode buffers. We don't 3085 * really know unless we go poke around in the buffer_heads. 3086 * But block_write_full_page will do the right thing. 
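 * Writing those back needs no journal handle, so we can submit them
 * directly from here.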
3087 */ 3088 return block_write_full_page(page, 3089 ext4_normal_get_block_write, 3090 wbc); 3091 } 3092 no_write: 3093 redirty_page_for_writepage(wbc, page); 3094 unlock_page(page); 3095 return 0; 3096 } 3097 3098 static int ext4_readpage(struct file *file, struct page *page) 3099 { 3100 return mpage_readpage(page, ext4_get_block); 3101 } 3102 3103 static int 3104 ext4_readpages(struct file *file, struct address_space *mapping, 3105 struct list_head *pages, unsigned nr_pages) 3106 { 3107 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 3108 } 3109 3110 static void ext4_invalidatepage(struct page *page, unsigned long offset) 3111 { 3112 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3113 3114 /* 3115 * If it's a full truncate we just forget about the pending dirtying 3116 */ 3117 if (offset == 0) 3118 ClearPageChecked(page); 3119 3120 if (journal) 3121 jbd2_journal_invalidatepage(journal, page, offset); 3122 else 3123 block_invalidatepage(page, offset); 3124 } 3125 3126 static int ext4_releasepage(struct page *page, gfp_t wait) 3127 { 3128 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3129 3130 WARN_ON(PageChecked(page)); 3131 if (!page_has_buffers(page)) 3132 return 0; 3133 if (journal) 3134 return jbd2_journal_try_to_free_buffers(journal, page, wait); 3135 else 3136 return try_to_free_buffers(page); 3137 } 3138 3139 /* 3140 * If the O_DIRECT write will extend the file then add this inode to the 3141 * orphan list. So recovery will truncate it back to the original size 3142 * if the machine crashes during the write. 3143 * 3144 * If the O_DIRECT write is intantiating holes inside i_size and the machine 3145 * crashes then stale disk data _may_ be exposed inside the file. But current 3146 * VFS code falls back into buffered path in that case so we are safe. 3147 */ 3148 static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, 3149 const struct iovec *iov, loff_t offset, 3150 unsigned long nr_segs) 3151 { 3152 struct file *file = iocb->ki_filp; 3153 struct inode *inode = file->f_mapping->host; 3154 struct ext4_inode_info *ei = EXT4_I(inode); 3155 handle_t *handle; 3156 ssize_t ret; 3157 int orphan = 0; 3158 size_t count = iov_length(iov, nr_segs); 3159 3160 if (rw == WRITE) { 3161 loff_t final_size = offset + count; 3162 3163 if (final_size > inode->i_size) { 3164 /* Credits for sb + inode write */ 3165 handle = ext4_journal_start(inode, 2); 3166 if (IS_ERR(handle)) { 3167 ret = PTR_ERR(handle); 3168 goto out; 3169 } 3170 ret = ext4_orphan_add(handle, inode); 3171 if (ret) { 3172 ext4_journal_stop(handle); 3173 goto out; 3174 } 3175 orphan = 1; 3176 ei->i_disksize = inode->i_size; 3177 ext4_journal_stop(handle); 3178 } 3179 } 3180 3181 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 3182 offset, nr_segs, 3183 ext4_get_block, NULL); 3184 3185 if (orphan) { 3186 int err; 3187 3188 /* Credits for sb + inode write */ 3189 handle = ext4_journal_start(inode, 2); 3190 if (IS_ERR(handle)) { 3191 /* This is really bad luck. We've written the data 3192 * but cannot extend i_size. Bail out and pretend 3193 * the write failed... 
*/ 3194 ret = PTR_ERR(handle); 3195 goto out; 3196 } 3197 if (inode->i_nlink) 3198 ext4_orphan_del(handle, inode); 3199 if (ret > 0) { 3200 loff_t end = offset + ret; 3201 if (end > inode->i_size) { 3202 ei->i_disksize = end; 3203 i_size_write(inode, end); 3204 /* 3205 * We're going to return a positive `ret' 3206 * here due to non-zero-length I/O, so there's 3207 * no way of reporting error returns from 3208 * ext4_mark_inode_dirty() to userspace. So 3209 * ignore it. 3210 */ 3211 ext4_mark_inode_dirty(handle, inode); 3212 } 3213 } 3214 err = ext4_journal_stop(handle); 3215 if (ret == 0) 3216 ret = err; 3217 } 3218 out: 3219 return ret; 3220 } 3221 3222 /* 3223 * Pages can be marked dirty completely asynchronously from ext4's journalling 3224 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do 3225 * much here because ->set_page_dirty is called under VFS locks. The page is 3226 * not necessarily locked. 3227 * 3228 * We cannot just dirty the page and leave attached buffers clean, because the 3229 * buffers' dirty state is "definitive". We cannot just set the buffers dirty 3230 * or jbddirty because all the journalling code will explode. 3231 * 3232 * So what we do is to mark the page "pending dirty" and next time writepage 3233 * is called, propagate that into the buffers appropriately. 3234 */ 3235 static int ext4_journalled_set_page_dirty(struct page *page) 3236 { 3237 SetPageChecked(page); 3238 return __set_page_dirty_nobuffers(page); 3239 } 3240 3241 static const struct address_space_operations ext4_ordered_aops = { 3242 .readpage = ext4_readpage, 3243 .readpages = ext4_readpages, 3244 .writepage = ext4_normal_writepage, 3245 .sync_page = block_sync_page, 3246 .write_begin = ext4_write_begin, 3247 .write_end = ext4_ordered_write_end, 3248 .bmap = ext4_bmap, 3249 .invalidatepage = ext4_invalidatepage, 3250 .releasepage = ext4_releasepage, 3251 .direct_IO = ext4_direct_IO, 3252 .migratepage = buffer_migrate_page, 3253 .is_partially_uptodate = block_is_partially_uptodate, 3254 }; 3255 3256 static const struct address_space_operations ext4_writeback_aops = { 3257 .readpage = ext4_readpage, 3258 .readpages = ext4_readpages, 3259 .writepage = ext4_normal_writepage, 3260 .sync_page = block_sync_page, 3261 .write_begin = ext4_write_begin, 3262 .write_end = ext4_writeback_write_end, 3263 .bmap = ext4_bmap, 3264 .invalidatepage = ext4_invalidatepage, 3265 .releasepage = ext4_releasepage, 3266 .direct_IO = ext4_direct_IO, 3267 .migratepage = buffer_migrate_page, 3268 .is_partially_uptodate = block_is_partially_uptodate, 3269 }; 3270 3271 static const struct address_space_operations ext4_journalled_aops = { 3272 .readpage = ext4_readpage, 3273 .readpages = ext4_readpages, 3274 .writepage = ext4_journalled_writepage, 3275 .sync_page = block_sync_page, 3276 .write_begin = ext4_write_begin, 3277 .write_end = ext4_journalled_write_end, 3278 .set_page_dirty = ext4_journalled_set_page_dirty, 3279 .bmap = ext4_bmap, 3280 .invalidatepage = ext4_invalidatepage, 3281 .releasepage = ext4_releasepage, 3282 .is_partially_uptodate = block_is_partially_uptodate, 3283 }; 3284 3285 static const struct address_space_operations ext4_da_aops = { 3286 .readpage = ext4_readpage, 3287 .readpages = ext4_readpages, 3288 .writepage = ext4_da_writepage, 3289 .writepages = ext4_da_writepages, 3290 .sync_page = block_sync_page, 3291 .write_begin = ext4_da_write_begin, 3292 .write_end = ext4_da_write_end, 3293 .bmap = ext4_bmap, 3294 .invalidatepage = ext4_da_invalidatepage, 3295 .releasepage = 
ext4_releasepage, 3296 .direct_IO = ext4_direct_IO, 3297 .migratepage = buffer_migrate_page, 3298 .is_partially_uptodate = block_is_partially_uptodate, 3299 }; 3300 3301 void ext4_set_aops(struct inode *inode) 3302 { 3303 if (ext4_should_order_data(inode) && 3304 test_opt(inode->i_sb, DELALLOC)) 3305 inode->i_mapping->a_ops = &ext4_da_aops; 3306 else if (ext4_should_order_data(inode)) 3307 inode->i_mapping->a_ops = &ext4_ordered_aops; 3308 else if (ext4_should_writeback_data(inode) && 3309 test_opt(inode->i_sb, DELALLOC)) 3310 inode->i_mapping->a_ops = &ext4_da_aops; 3311 else if (ext4_should_writeback_data(inode)) 3312 inode->i_mapping->a_ops = &ext4_writeback_aops; 3313 else 3314 inode->i_mapping->a_ops = &ext4_journalled_aops; 3315 } 3316 3317 /* 3318 * ext4_block_truncate_page() zeroes out a mapping from file offset `from' 3319 * up to the end of the block which corresponds to `from'. 3320 * This required during truncate. We need to physically zero the tail end 3321 * of that block so it doesn't yield old data if the file is later grown. 3322 */ 3323 int ext4_block_truncate_page(handle_t *handle, 3324 struct address_space *mapping, loff_t from) 3325 { 3326 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 3327 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3328 unsigned blocksize, length, pos; 3329 ext4_lblk_t iblock; 3330 struct inode *inode = mapping->host; 3331 struct buffer_head *bh; 3332 struct page *page; 3333 int err = 0; 3334 3335 page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT); 3336 if (!page) 3337 return -EINVAL; 3338 3339 blocksize = inode->i_sb->s_blocksize; 3340 length = blocksize - (offset & (blocksize - 1)); 3341 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 3342 3343 /* 3344 * For "nobh" option, we can only work if we don't need to 3345 * read-in the page - otherwise we create buffers to do the IO. 3346 */ 3347 if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) && 3348 ext4_should_writeback_data(inode) && PageUptodate(page)) { 3349 zero_user(page, offset, length); 3350 set_page_dirty(page); 3351 goto unlock; 3352 } 3353 3354 if (!page_has_buffers(page)) 3355 create_empty_buffers(page, blocksize, 0); 3356 3357 /* Find the buffer that contains "offset" */ 3358 bh = page_buffers(page); 3359 pos = blocksize; 3360 while (offset >= pos) { 3361 bh = bh->b_this_page; 3362 iblock++; 3363 pos += blocksize; 3364 } 3365 3366 err = 0; 3367 if (buffer_freed(bh)) { 3368 BUFFER_TRACE(bh, "freed: skip"); 3369 goto unlock; 3370 } 3371 3372 if (!buffer_mapped(bh)) { 3373 BUFFER_TRACE(bh, "unmapped"); 3374 ext4_get_block(inode, iblock, bh, 0); 3375 /* unmapped? It's a hole - nothing to do */ 3376 if (!buffer_mapped(bh)) { 3377 BUFFER_TRACE(bh, "still unmapped"); 3378 goto unlock; 3379 } 3380 } 3381 3382 /* Ok, it's mapped. Make sure it's up-to-date */ 3383 if (PageUptodate(page)) 3384 set_buffer_uptodate(bh); 3385 3386 if (!buffer_uptodate(bh)) { 3387 err = -EIO; 3388 ll_rw_block(READ, 1, &bh); 3389 wait_on_buffer(bh); 3390 /* Uhhuh. Read error. Complain and punt. 
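 * (err was set to -EIO above; if the read did not bring the buffer
 * uptodate we bail out through the unlock path and return that
 * error.)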
*/ 3391 if (!buffer_uptodate(bh)) 3392 goto unlock; 3393 } 3394 3395 if (ext4_should_journal_data(inode)) { 3396 BUFFER_TRACE(bh, "get write access"); 3397 err = ext4_journal_get_write_access(handle, bh); 3398 if (err) 3399 goto unlock; 3400 } 3401 3402 zero_user(page, offset, length); 3403 3404 BUFFER_TRACE(bh, "zeroed end of block"); 3405 3406 err = 0; 3407 if (ext4_should_journal_data(inode)) { 3408 err = ext4_handle_dirty_metadata(handle, inode, bh); 3409 } else { 3410 if (ext4_should_order_data(inode)) 3411 err = ext4_jbd2_file_inode(handle, inode); 3412 mark_buffer_dirty(bh); 3413 } 3414 3415 unlock: 3416 unlock_page(page); 3417 page_cache_release(page); 3418 return err; 3419 } 3420 3421 /* 3422 * Probably it should be a library function... search for first non-zero word 3423 * or memcmp with zero_page, whatever is better for particular architecture. 3424 * Linus? 3425 */ 3426 static inline int all_zeroes(__le32 *p, __le32 *q) 3427 { 3428 while (p < q) 3429 if (*p++) 3430 return 0; 3431 return 1; 3432 } 3433 3434 /** 3435 * ext4_find_shared - find the indirect blocks for partial truncation. 3436 * @inode: inode in question 3437 * @depth: depth of the affected branch 3438 * @offsets: offsets of pointers in that branch (see ext4_block_to_path) 3439 * @chain: place to store the pointers to partial indirect blocks 3440 * @top: place to the (detached) top of branch 3441 * 3442 * This is a helper function used by ext4_truncate(). 3443 * 3444 * When we do truncate() we may have to clean the ends of several 3445 * indirect blocks but leave the blocks themselves alive. Block is 3446 * partially truncated if some data below the new i_size is refered 3447 * from it (and it is on the path to the first completely truncated 3448 * data block, indeed). We have to free the top of that path along 3449 * with everything to the right of the path. Since no allocation 3450 * past the truncation point is possible until ext4_truncate() 3451 * finishes, we may safely do the latter, but top of branch may 3452 * require special attention - pageout below the truncation point 3453 * might try to populate it. 3454 * 3455 * We atomically detach the top of branch from the tree, store the 3456 * block number of its root in *@top, pointers to buffer_heads of 3457 * partially truncated blocks - in @chain[].bh and pointers to 3458 * their last elements that should not be removed - in 3459 * @chain[].p. Return value is the pointer to last filled element 3460 * of @chain. 3461 * 3462 * The work left to caller to do the actual freeing of subtrees: 3463 * a) free the subtree starting from *@top 3464 * b) free the subtrees whose roots are stored in 3465 * (@chain[i].p+1 .. end of @chain[i].bh->b_data) 3466 * c) free the subtrees growing from the inode past the @chain[0]. 3467 * (no partially truncated stuff there). */ 3468 3469 static Indirect *ext4_find_shared(struct inode *inode, int depth, 3470 ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top) 3471 { 3472 Indirect *partial, *p; 3473 int k, err; 3474 3475 *top = 0; 3476 /* Make k index the deepest non-null offest + 1 */ 3477 for (k = depth; k > 1 && !offsets[k-1]; k--) 3478 ; 3479 partial = ext4_get_branch(inode, k, offsets, chain, &err); 3480 /* Writer: pointers */ 3481 if (!partial) 3482 partial = chain + k-1; 3483 /* 3484 * If the branch acquired continuation since we've looked at it - 3485 * fine, it should all survive and (new) top doesn't belong to us. 
3486 */ 3487 if (!partial->key && *partial->p) 3488 /* Writer: end */ 3489 goto no_top; 3490 for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) 3491 ; 3492 /* 3493 * OK, we've found the last block that must survive. The rest of our 3494 * branch should be detached before unlocking. However, if that rest 3495 * of branch is all ours and does not grow immediately from the inode 3496 * it's easier to cheat and just decrement partial->p. 3497 */ 3498 if (p == chain + k - 1 && p > chain) { 3499 p->p--; 3500 } else { 3501 *top = *p->p; 3502 /* Nope, don't do this in ext4. Must leave the tree intact */ 3503 #if 0 3504 *p->p = 0; 3505 #endif 3506 } 3507 /* Writer: end */ 3508 3509 while (partial > p) { 3510 brelse(partial->bh); 3511 partial--; 3512 } 3513 no_top: 3514 return partial; 3515 } 3516 3517 /* 3518 * Zero a number of block pointers in either an inode or an indirect block. 3519 * If we restart the transaction we must again get write access to the 3520 * indirect block for further modification. 3521 * 3522 * We release `count' blocks on disk, but (last - first) may be greater 3523 * than `count' because there can be holes in there. 3524 */ 3525 static void ext4_clear_blocks(handle_t *handle, struct inode *inode, 3526 struct buffer_head *bh, ext4_fsblk_t block_to_free, 3527 unsigned long count, __le32 *first, __le32 *last) 3528 { 3529 __le32 *p; 3530 if (try_to_extend_transaction(handle, inode)) { 3531 if (bh) { 3532 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 3533 ext4_handle_dirty_metadata(handle, inode, bh); 3534 } 3535 ext4_mark_inode_dirty(handle, inode); 3536 ext4_journal_test_restart(handle, inode); 3537 if (bh) { 3538 BUFFER_TRACE(bh, "retaking write access"); 3539 ext4_journal_get_write_access(handle, bh); 3540 } 3541 } 3542 3543 /* 3544 * Any buffers which are on the journal will be in memory. We find 3545 * them on the hash table so jbd2_journal_revoke() will run jbd2_journal_forget() 3546 * on them. We've already detached each block from the file, so 3547 * bforget() in jbd2_journal_forget() should be safe. 3548 * 3549 * AKPM: turn on bforget in jbd2_journal_forget()!!! 3550 */ 3551 for (p = first; p < last; p++) { 3552 u32 nr = le32_to_cpu(*p); 3553 if (nr) { 3554 struct buffer_head *tbh; 3555 3556 *p = 0; 3557 tbh = sb_find_get_block(inode->i_sb, nr); 3558 ext4_forget(handle, 0, inode, tbh, nr); 3559 } 3560 } 3561 3562 ext4_free_blocks(handle, inode, block_to_free, count, 0); 3563 } 3564 3565 /** 3566 * ext4_free_data - free a list of data blocks 3567 * @handle: handle for this transaction 3568 * @inode: inode we are dealing with 3569 * @this_bh: indirect buffer_head which contains *@first and *@last 3570 * @first: array of block numbers 3571 * @last: points immediately past the end of array 3572 * 3573 * We are freeing all blocks refered from that array (numbers are stored as 3574 * little-endian 32-bit) and updating @inode->i_blocks appropriately. 3575 * 3576 * We accumulate contiguous runs of blocks to free. Conveniently, if these 3577 * blocks are contiguous then releasing them at one time will only affect one 3578 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't 3579 * actually use a lot of journal space. 3580 * 3581 * @this_bh will be %NULL if @first and @last point into the inode's direct 3582 * block pointers. 
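 * A run is flushed through ext4_clear_blocks() whenever a
 * discontiguity is found, and once more at the end of the scan for
 * the final partial run.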
3583 */ 3584 static void ext4_free_data(handle_t *handle, struct inode *inode, 3585 struct buffer_head *this_bh, 3586 __le32 *first, __le32 *last) 3587 { 3588 ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ 3589 unsigned long count = 0; /* Number of blocks in the run */ 3590 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind 3591 corresponding to 3592 block_to_free */ 3593 ext4_fsblk_t nr; /* Current block # */ 3594 __le32 *p; /* Pointer into inode/ind 3595 for current block */ 3596 int err; 3597 3598 if (this_bh) { /* For indirect block */ 3599 BUFFER_TRACE(this_bh, "get_write_access"); 3600 err = ext4_journal_get_write_access(handle, this_bh); 3601 /* Important: if we can't update the indirect pointers 3602 * to the blocks, we can't free them. */ 3603 if (err) 3604 return; 3605 } 3606 3607 for (p = first; p < last; p++) { 3608 nr = le32_to_cpu(*p); 3609 if (nr) { 3610 /* accumulate blocks to free if they're contiguous */ 3611 if (count == 0) { 3612 block_to_free = nr; 3613 block_to_free_p = p; 3614 count = 1; 3615 } else if (nr == block_to_free + count) { 3616 count++; 3617 } else { 3618 ext4_clear_blocks(handle, inode, this_bh, 3619 block_to_free, 3620 count, block_to_free_p, p); 3621 block_to_free = nr; 3622 block_to_free_p = p; 3623 count = 1; 3624 } 3625 } 3626 } 3627 3628 if (count > 0) 3629 ext4_clear_blocks(handle, inode, this_bh, block_to_free, 3630 count, block_to_free_p, p); 3631 3632 if (this_bh) { 3633 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); 3634 3635 /* 3636 * The buffer head should have an attached journal head at this 3637 * point. However, if the data is corrupted and an indirect 3638 * block pointed to itself, it would have been detached when 3639 * the block was cleared. Check for this instead of OOPSing. 3640 */ 3641 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) 3642 ext4_handle_dirty_metadata(handle, inode, this_bh); 3643 else 3644 ext4_error(inode->i_sb, __func__, 3645 "circular indirect block detected, " 3646 "inode=%lu, block=%llu", 3647 inode->i_ino, 3648 (unsigned long long) this_bh->b_blocknr); 3649 } 3650 } 3651 3652 /** 3653 * ext4_free_branches - free an array of branches 3654 * @handle: JBD handle for this transaction 3655 * @inode: inode we are dealing with 3656 * @parent_bh: the buffer_head which contains *@first and *@last 3657 * @first: array of block numbers 3658 * @last: pointer immediately past the end of array 3659 * @depth: depth of the branches to free 3660 * 3661 * We are freeing all blocks refered from these branches (numbers are 3662 * stored as little-endian 32-bit) and updating @inode->i_blocks 3663 * appropriately. 3664 */ 3665 static void ext4_free_branches(handle_t *handle, struct inode *inode, 3666 struct buffer_head *parent_bh, 3667 __le32 *first, __le32 *last, int depth) 3668 { 3669 ext4_fsblk_t nr; 3670 __le32 *p; 3671 3672 if (ext4_handle_is_aborted(handle)) 3673 return; 3674 3675 if (depth--) { 3676 struct buffer_head *bh; 3677 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); 3678 p = last; 3679 while (--p >= first) { 3680 nr = le32_to_cpu(*p); 3681 if (!nr) 3682 continue; /* A hole */ 3683 3684 /* Go read the buffer for the next level down */ 3685 bh = sb_bread(inode->i_sb, nr); 3686 3687 /* 3688 * A read failure? Report error and clear slot 3689 * (should be rare). 3690 */ 3691 if (!bh) { 3692 ext4_error(inode->i_sb, "ext4_free_branches", 3693 "Read failure, inode=%lu, block=%llu", 3694 inode->i_ino, nr); 3695 continue; 3696 } 3697 3698 /* This zaps the entire block. Bottom up. 
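 * (free the child branches first, then revoke and release the
 * indirect block itself)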
*/ 3699 BUFFER_TRACE(bh, "free child branches"); 3700 ext4_free_branches(handle, inode, bh, 3701 (__le32 *) bh->b_data, 3702 (__le32 *) bh->b_data + addr_per_block, 3703 depth); 3704 3705 /* 3706 * We've probably journalled the indirect block several 3707 * times during the truncate. But it's no longer 3708 * needed and we now drop it from the transaction via 3709 * jbd2_journal_revoke(). 3710 * 3711 * That's easy if it's exclusively part of this 3712 * transaction. But if it's part of the committing 3713 * transaction then jbd2_journal_forget() will simply 3714 * brelse() it. That means that if the underlying 3715 * block is reallocated in ext4_get_block(), 3716 * unmap_underlying_metadata() will find this block 3717 * and will try to get rid of it. damn, damn. 3718 * 3719 * If this block has already been committed to the 3720 * journal, a revoke record will be written. And 3721 * revoke records must be emitted *before* clearing 3722 * this block's bit in the bitmaps. 3723 */ 3724 ext4_forget(handle, 1, inode, bh, bh->b_blocknr); 3725 3726 /* 3727 * Everything below this this pointer has been 3728 * released. Now let this top-of-subtree go. 3729 * 3730 * We want the freeing of this indirect block to be 3731 * atomic in the journal with the updating of the 3732 * bitmap block which owns it. So make some room in 3733 * the journal. 3734 * 3735 * We zero the parent pointer *after* freeing its 3736 * pointee in the bitmaps, so if extend_transaction() 3737 * for some reason fails to put the bitmap changes and 3738 * the release into the same transaction, recovery 3739 * will merely complain about releasing a free block, 3740 * rather than leaking blocks. 3741 */ 3742 if (ext4_handle_is_aborted(handle)) 3743 return; 3744 if (try_to_extend_transaction(handle, inode)) { 3745 ext4_mark_inode_dirty(handle, inode); 3746 ext4_journal_test_restart(handle, inode); 3747 } 3748 3749 ext4_free_blocks(handle, inode, nr, 1, 1); 3750 3751 if (parent_bh) { 3752 /* 3753 * The block which we have just freed is 3754 * pointed to by an indirect block: journal it 3755 */ 3756 BUFFER_TRACE(parent_bh, "get_write_access"); 3757 if (!ext4_journal_get_write_access(handle, 3758 parent_bh)){ 3759 *p = 0; 3760 BUFFER_TRACE(parent_bh, 3761 "call ext4_handle_dirty_metadata"); 3762 ext4_handle_dirty_metadata(handle, 3763 inode, 3764 parent_bh); 3765 } 3766 } 3767 } 3768 } else { 3769 /* We have reached the bottom of the tree. */ 3770 BUFFER_TRACE(parent_bh, "free data blocks"); 3771 ext4_free_data(handle, inode, parent_bh, first, last); 3772 } 3773 } 3774 3775 int ext4_can_truncate(struct inode *inode) 3776 { 3777 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 3778 return 0; 3779 if (S_ISREG(inode->i_mode)) 3780 return 1; 3781 if (S_ISDIR(inode->i_mode)) 3782 return 1; 3783 if (S_ISLNK(inode->i_mode)) 3784 return !ext4_inode_is_fast_symlink(inode); 3785 return 0; 3786 } 3787 3788 /* 3789 * ext4_truncate() 3790 * 3791 * We block out ext4_get_block() block instantiations across the entire 3792 * transaction, and VFS/VM ensures that ext4_truncate() cannot run 3793 * simultaneously on behalf of the same inode. 3794 * 3795 * As we work through the truncate and commmit bits of it to the journal there 3796 * is one core, guiding principle: the file's tree must always be consistent on 3797 * disk. We must be able to restart the truncate after a crash. 
3798 * 3799 * The file's tree may be transiently inconsistent in memory (although it 3800 * probably isn't), but whenever we close off and commit a journal transaction, 3801 * the contents of (the filesystem + the journal) must be consistent and 3802 * restartable. It's pretty simple, really: bottom up, right to left (although 3803 * left-to-right works OK too). 3804 * 3805 * Note that at recovery time, journal replay occurs *before* the restart of 3806 * truncate against the orphan inode list. 3807 * 3808 * The committed inode has the new, desired i_size (which is the same as 3809 * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see 3810 * that this inode's truncate did not complete and it will again call 3811 * ext4_truncate() to have another go. So there will be instantiated blocks 3812 * to the right of the truncation point in a crashed ext4 filesystem. But 3813 * that's fine - as long as they are linked from the inode, the post-crash 3814 * ext4_truncate() run will find them and release them. 3815 */ 3816 void ext4_truncate(struct inode *inode) 3817 { 3818 handle_t *handle; 3819 struct ext4_inode_info *ei = EXT4_I(inode); 3820 __le32 *i_data = ei->i_data; 3821 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); 3822 struct address_space *mapping = inode->i_mapping; 3823 ext4_lblk_t offsets[4]; 3824 Indirect chain[4]; 3825 Indirect *partial; 3826 __le32 nr = 0; 3827 int n; 3828 ext4_lblk_t last_block; 3829 unsigned blocksize = inode->i_sb->s_blocksize; 3830 3831 if (!ext4_can_truncate(inode)) 3832 return; 3833 3834 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 3835 ext4_ext_truncate(inode); 3836 return; 3837 } 3838 3839 handle = start_transaction(inode); 3840 if (IS_ERR(handle)) 3841 return; /* AKPM: return what? */ 3842 3843 last_block = (inode->i_size + blocksize-1) 3844 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 3845 3846 if (inode->i_size & (blocksize - 1)) 3847 if (ext4_block_truncate_page(handle, mapping, inode->i_size)) 3848 goto out_stop; 3849 3850 n = ext4_block_to_path(inode, last_block, offsets, NULL); 3851 if (n == 0) 3852 goto out_stop; /* error */ 3853 3854 /* 3855 * OK. This truncate is going to happen. We add the inode to the 3856 * orphan list, so that if this truncate spans multiple transactions, 3857 * and we crash, we will resume the truncate when the filesystem 3858 * recovers. It also marks the inode dirty, to catch the new size. 3859 * 3860 * Implication: the file must always be in a sane, consistent 3861 * truncatable state while each transaction commits. 3862 */ 3863 if (ext4_orphan_add(handle, inode)) 3864 goto out_stop; 3865 3866 /* 3867 * From here we block out all ext4_get_block() callers who want to 3868 * modify the block allocation tree. 3869 */ 3870 down_write(&ei->i_data_sem); 3871 3872 ext4_discard_preallocations(inode); 3873 3874 /* 3875 * The orphan list entry will now protect us from any crash which 3876 * occurs before the truncate completes, so it is now safe to propagate 3877 * the new, shorter inode size (held for now in i_size) into the 3878 * on-disk inode. We do this via i_disksize, which is the value which 3879 * ext4 *really* writes onto the disk inode. 
3880 */ 3881 ei->i_disksize = inode->i_size; 3882 3883 if (n == 1) { /* direct blocks */ 3884 ext4_free_data(handle, inode, NULL, i_data+offsets[0], 3885 i_data + EXT4_NDIR_BLOCKS); 3886 goto do_indirects; 3887 } 3888 3889 partial = ext4_find_shared(inode, n, offsets, chain, &nr); 3890 /* Kill the top of shared branch (not detached) */ 3891 if (nr) { 3892 if (partial == chain) { 3893 /* Shared branch grows from the inode */ 3894 ext4_free_branches(handle, inode, NULL, 3895 &nr, &nr+1, (chain+n-1) - partial); 3896 *partial->p = 0; 3897 /* 3898 * We mark the inode dirty prior to restart, 3899 * and prior to stop. No need for it here. 3900 */ 3901 } else { 3902 /* Shared branch grows from an indirect block */ 3903 BUFFER_TRACE(partial->bh, "get_write_access"); 3904 ext4_free_branches(handle, inode, partial->bh, 3905 partial->p, 3906 partial->p+1, (chain+n-1) - partial); 3907 } 3908 } 3909 /* Clear the ends of indirect blocks on the shared branch */ 3910 while (partial > chain) { 3911 ext4_free_branches(handle, inode, partial->bh, partial->p + 1, 3912 (__le32*)partial->bh->b_data+addr_per_block, 3913 (chain+n-1) - partial); 3914 BUFFER_TRACE(partial->bh, "call brelse"); 3915 brelse (partial->bh); 3916 partial--; 3917 } 3918 do_indirects: 3919 /* Kill the remaining (whole) subtrees */ 3920 switch (offsets[0]) { 3921 default: 3922 nr = i_data[EXT4_IND_BLOCK]; 3923 if (nr) { 3924 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); 3925 i_data[EXT4_IND_BLOCK] = 0; 3926 } 3927 case EXT4_IND_BLOCK: 3928 nr = i_data[EXT4_DIND_BLOCK]; 3929 if (nr) { 3930 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); 3931 i_data[EXT4_DIND_BLOCK] = 0; 3932 } 3933 case EXT4_DIND_BLOCK: 3934 nr = i_data[EXT4_TIND_BLOCK]; 3935 if (nr) { 3936 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); 3937 i_data[EXT4_TIND_BLOCK] = 0; 3938 } 3939 case EXT4_TIND_BLOCK: 3940 ; 3941 } 3942 3943 up_write(&ei->i_data_sem); 3944 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 3945 ext4_mark_inode_dirty(handle, inode); 3946 3947 /* 3948 * In a multi-transaction truncate, we only make the final transaction 3949 * synchronous 3950 */ 3951 if (IS_SYNC(inode)) 3952 ext4_handle_sync(handle); 3953 out_stop: 3954 /* 3955 * If this was a simple ftruncate(), and the file will remain alive 3956 * then we need to clear up the orphan record which we created above. 3957 * However, if this was a real unlink then we were called by 3958 * ext4_delete_inode(), and we allow that function to clean up the 3959 * orphan info for us. 3960 */ 3961 if (inode->i_nlink) 3962 ext4_orphan_del(handle, inode); 3963 3964 ext4_journal_stop(handle); 3965 } 3966 3967 /* 3968 * ext4_get_inode_loc returns with an extra refcount against the inode's 3969 * underlying buffer_head on success. If 'in_mem' is true, we have all 3970 * data in memory that is needed to recreate the on-disk version of this 3971 * inode. 
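 *
 * The location itself is plain arithmetic.  As an illustration only: with
 * 8192 inodes per group and 256-byte inodes in 4K blocks (16 inodes per
 * block), inode 12 is the 12th inode of group 0, so it sits in the first
 * block of that group's inode table at byte offset (11 % 16) * 256 = 2816.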
3972 */ 3973 static int __ext4_get_inode_loc(struct inode *inode, 3974 struct ext4_iloc *iloc, int in_mem) 3975 { 3976 struct ext4_group_desc *gdp; 3977 struct buffer_head *bh; 3978 struct super_block *sb = inode->i_sb; 3979 ext4_fsblk_t block; 3980 int inodes_per_block, inode_offset; 3981 3982 iloc->bh = NULL; 3983 if (!ext4_valid_inum(sb, inode->i_ino)) 3984 return -EIO; 3985 3986 iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb); 3987 gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); 3988 if (!gdp) 3989 return -EIO; 3990 3991 /* 3992 * Figure out the offset within the block group inode table 3993 */ 3994 inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb)); 3995 inode_offset = ((inode->i_ino - 1) % 3996 EXT4_INODES_PER_GROUP(sb)); 3997 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); 3998 iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); 3999 4000 bh = sb_getblk(sb, block); 4001 if (!bh) { 4002 ext4_error(sb, "ext4_get_inode_loc", "unable to read " 4003 "inode block - inode=%lu, block=%llu", 4004 inode->i_ino, block); 4005 return -EIO; 4006 } 4007 if (!buffer_uptodate(bh)) { 4008 lock_buffer(bh); 4009 4010 /* 4011 * If the buffer has the write error flag, we have failed 4012 * to write out another inode in the same block. In this 4013 * case, we don't have to read the block because we may 4014 * read the old inode data successfully. 4015 */ 4016 if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) 4017 set_buffer_uptodate(bh); 4018 4019 if (buffer_uptodate(bh)) { 4020 /* someone brought it uptodate while we waited */ 4021 unlock_buffer(bh); 4022 goto has_buffer; 4023 } 4024 4025 /* 4026 * If we have all information of the inode in memory and this 4027 * is the only valid inode in the block, we need not read the 4028 * block. 4029 */ 4030 if (in_mem) { 4031 struct buffer_head *bitmap_bh; 4032 int i, start; 4033 4034 start = inode_offset & ~(inodes_per_block - 1); 4035 4036 /* Is the inode bitmap in cache? */ 4037 bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); 4038 if (!bitmap_bh) 4039 goto make_io; 4040 4041 /* 4042 * If the inode bitmap isn't in cache then the 4043 * optimisation may end up performing two reads instead 4044 * of one, so skip it. 4045 */ 4046 if (!buffer_uptodate(bitmap_bh)) { 4047 brelse(bitmap_bh); 4048 goto make_io; 4049 } 4050 for (i = start; i < start + inodes_per_block; i++) { 4051 if (i == inode_offset) 4052 continue; 4053 if (ext4_test_bit(i, bitmap_bh->b_data)) 4054 break; 4055 } 4056 brelse(bitmap_bh); 4057 if (i == start + inodes_per_block) { 4058 /* all other inodes are free, so skip I/O */ 4059 memset(bh->b_data, 0, bh->b_size); 4060 set_buffer_uptodate(bh); 4061 unlock_buffer(bh); 4062 goto has_buffer; 4063 } 4064 } 4065 4066 make_io: 4067 /* 4068 * If we need to do any I/O, try to pre-readahead extra 4069 * blocks from the inode table. 
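 *
 * The readahead below covers a window of s_inode_readahead_blks table
 * blocks (rounded down to a power of two): the aligned window containing
 * the block we need, clamped to the start of the inode table and to the
 * part of the table this group actually uses (unused inodes are skipped
 * when the GDT_CSUM feature lets us know about them).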
4070 */ 4071 if (EXT4_SB(sb)->s_inode_readahead_blks) { 4072 ext4_fsblk_t b, end, table; 4073 unsigned num; 4074 4075 table = ext4_inode_table(sb, gdp); 4076 /* Make sure s_inode_readahead_blks is a power of 2 */ 4077 while (EXT4_SB(sb)->s_inode_readahead_blks & 4078 (EXT4_SB(sb)->s_inode_readahead_blks-1)) 4079 EXT4_SB(sb)->s_inode_readahead_blks = 4080 (EXT4_SB(sb)->s_inode_readahead_blks & 4081 (EXT4_SB(sb)->s_inode_readahead_blks-1)); 4082 b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); 4083 if (table > b) 4084 b = table; 4085 end = b + EXT4_SB(sb)->s_inode_readahead_blks; 4086 num = EXT4_INODES_PER_GROUP(sb); 4087 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 4088 EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) 4089 num -= ext4_itable_unused_count(sb, gdp); 4090 table += num / inodes_per_block; 4091 if (end > table) 4092 end = table; 4093 while (b <= end) 4094 sb_breadahead(sb, b++); 4095 } 4096 4097 /* 4098 * There are other valid inodes in the buffer, this inode 4099 * has in-inode xattrs, or we don't have this inode in memory. 4100 * Read the block from disk. 4101 */ 4102 get_bh(bh); 4103 bh->b_end_io = end_buffer_read_sync; 4104 submit_bh(READ_META, bh); 4105 wait_on_buffer(bh); 4106 if (!buffer_uptodate(bh)) { 4107 ext4_error(sb, __func__, 4108 "unable to read inode block - inode=%lu, " 4109 "block=%llu", inode->i_ino, block); 4110 brelse(bh); 4111 return -EIO; 4112 } 4113 } 4114 has_buffer: 4115 iloc->bh = bh; 4116 return 0; 4117 } 4118 4119 int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) 4120 { 4121 /* We have all inode data except xattrs in memory here. */ 4122 return __ext4_get_inode_loc(inode, iloc, 4123 !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)); 4124 } 4125 4126 void ext4_set_inode_flags(struct inode *inode) 4127 { 4128 unsigned int flags = EXT4_I(inode)->i_flags; 4129 4130 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); 4131 if (flags & EXT4_SYNC_FL) 4132 inode->i_flags |= S_SYNC; 4133 if (flags & EXT4_APPEND_FL) 4134 inode->i_flags |= S_APPEND; 4135 if (flags & EXT4_IMMUTABLE_FL) 4136 inode->i_flags |= S_IMMUTABLE; 4137 if (flags & EXT4_NOATIME_FL) 4138 inode->i_flags |= S_NOATIME; 4139 if (flags & EXT4_DIRSYNC_FL) 4140 inode->i_flags |= S_DIRSYNC; 4141 } 4142 4143 /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ 4144 void ext4_get_inode_flags(struct ext4_inode_info *ei) 4145 { 4146 unsigned int flags = ei->vfs_inode.i_flags; 4147 4148 ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL| 4149 EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|EXT4_DIRSYNC_FL); 4150 if (flags & S_SYNC) 4151 ei->i_flags |= EXT4_SYNC_FL; 4152 if (flags & S_APPEND) 4153 ei->i_flags |= EXT4_APPEND_FL; 4154 if (flags & S_IMMUTABLE) 4155 ei->i_flags |= EXT4_IMMUTABLE_FL; 4156 if (flags & S_NOATIME) 4157 ei->i_flags |= EXT4_NOATIME_FL; 4158 if (flags & S_DIRSYNC) 4159 ei->i_flags |= EXT4_DIRSYNC_FL; 4160 } 4161 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, 4162 struct ext4_inode_info *ei) 4163 { 4164 blkcnt_t i_blocks ; 4165 struct inode *inode = &(ei->vfs_inode); 4166 struct super_block *sb = inode->i_sb; 4167 4168 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 4169 EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { 4170 /* we are using combined 48 bit field */ 4171 i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | 4172 le32_to_cpu(raw_inode->i_blocks_lo); 4173 if (ei->i_flags & EXT4_HUGE_FILE_FL) { 4174 /* i_blocks represent file system block size */ 4175 return i_blocks << (inode->i_blkbits - 9); 4176 } else { 4177 return i_blocks; 4178 } 4179 } else { 4180 return 
le32_to_cpu(raw_inode->i_blocks_lo); 4181 } 4182 } 4183 4184 struct inode *ext4_iget(struct super_block *sb, unsigned long ino) 4185 { 4186 struct ext4_iloc iloc; 4187 struct ext4_inode *raw_inode; 4188 struct ext4_inode_info *ei; 4189 struct buffer_head *bh; 4190 struct inode *inode; 4191 long ret; 4192 int block; 4193 4194 inode = iget_locked(sb, ino); 4195 if (!inode) 4196 return ERR_PTR(-ENOMEM); 4197 if (!(inode->i_state & I_NEW)) 4198 return inode; 4199 4200 ei = EXT4_I(inode); 4201 #ifdef CONFIG_EXT4_FS_POSIX_ACL 4202 ei->i_acl = EXT4_ACL_NOT_CACHED; 4203 ei->i_default_acl = EXT4_ACL_NOT_CACHED; 4204 #endif 4205 4206 ret = __ext4_get_inode_loc(inode, &iloc, 0); 4207 if (ret < 0) 4208 goto bad_inode; 4209 bh = iloc.bh; 4210 raw_inode = ext4_raw_inode(&iloc); 4211 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 4212 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); 4213 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); 4214 if (!(test_opt(inode->i_sb, NO_UID32))) { 4215 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; 4216 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; 4217 } 4218 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); 4219 4220 ei->i_state = 0; 4221 ei->i_dir_start_lookup = 0; 4222 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 4223 /* We now have enough fields to check if the inode was active or not. 4224 * This is needed because nfsd might try to access dead inodes 4225 * the test is that same one that e2fsck uses 4226 * NeilBrown 1999oct15 4227 */ 4228 if (inode->i_nlink == 0) { 4229 if (inode->i_mode == 0 || 4230 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { 4231 /* this inode is deleted */ 4232 brelse(bh); 4233 ret = -ESTALE; 4234 goto bad_inode; 4235 } 4236 /* The only unlinked inodes we let through here have 4237 * valid i_mode and are being read by the orphan 4238 * recovery code: that's fine, we're about to complete 4239 * the process of deleting those. */ 4240 } 4241 ei->i_flags = le32_to_cpu(raw_inode->i_flags); 4242 inode->i_blocks = ext4_inode_blocks(raw_inode, ei); 4243 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); 4244 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 4245 cpu_to_le32(EXT4_OS_HURD)) { 4246 ei->i_file_acl |= 4247 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; 4248 } 4249 inode->i_size = ext4_isize(raw_inode); 4250 ei->i_disksize = inode->i_size; 4251 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 4252 ei->i_block_group = iloc.block_group; 4253 /* 4254 * NOTE! The in-memory inode i_data array is in little-endian order 4255 * even on big-endian machines: we do NOT byteswap the block numbers! 4256 */ 4257 for (block = 0; block < EXT4_N_BLOCKS; block++) 4258 ei->i_data[block] = raw_inode->i_block[block]; 4259 INIT_LIST_HEAD(&ei->i_orphan); 4260 4261 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4262 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); 4263 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > 4264 EXT4_INODE_SIZE(inode->i_sb)) { 4265 brelse(bh); 4266 ret = -EIO; 4267 goto bad_inode; 4268 } 4269 if (ei->i_extra_isize == 0) { 4270 /* The extra space is currently unused. Use it. 
*/ 4271 ei->i_extra_isize = sizeof(struct ext4_inode) - 4272 EXT4_GOOD_OLD_INODE_SIZE; 4273 } else { 4274 __le32 *magic = (void *)raw_inode + 4275 EXT4_GOOD_OLD_INODE_SIZE + 4276 ei->i_extra_isize; 4277 if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) 4278 ei->i_state |= EXT4_STATE_XATTR; 4279 } 4280 } else 4281 ei->i_extra_isize = 0; 4282 4283 EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); 4284 EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); 4285 EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); 4286 EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); 4287 4288 inode->i_version = le32_to_cpu(raw_inode->i_disk_version); 4289 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4290 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 4291 inode->i_version |= 4292 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; 4293 } 4294 4295 if (S_ISREG(inode->i_mode)) { 4296 inode->i_op = &ext4_file_inode_operations; 4297 inode->i_fop = &ext4_file_operations; 4298 ext4_set_aops(inode); 4299 } else if (S_ISDIR(inode->i_mode)) { 4300 inode->i_op = &ext4_dir_inode_operations; 4301 inode->i_fop = &ext4_dir_operations; 4302 } else if (S_ISLNK(inode->i_mode)) { 4303 if (ext4_inode_is_fast_symlink(inode)) { 4304 inode->i_op = &ext4_fast_symlink_inode_operations; 4305 nd_terminate_link(ei->i_data, inode->i_size, 4306 sizeof(ei->i_data) - 1); 4307 } else { 4308 inode->i_op = &ext4_symlink_inode_operations; 4309 ext4_set_aops(inode); 4310 } 4311 } else { 4312 inode->i_op = &ext4_special_inode_operations; 4313 if (raw_inode->i_block[0]) 4314 init_special_inode(inode, inode->i_mode, 4315 old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); 4316 else 4317 init_special_inode(inode, inode->i_mode, 4318 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 4319 } 4320 brelse(iloc.bh); 4321 ext4_set_inode_flags(inode); 4322 unlock_new_inode(inode); 4323 return inode; 4324 4325 bad_inode: 4326 iget_failed(inode); 4327 return ERR_PTR(ret); 4328 } 4329 4330 static int ext4_inode_blocks_set(handle_t *handle, 4331 struct ext4_inode *raw_inode, 4332 struct ext4_inode_info *ei) 4333 { 4334 struct inode *inode = &(ei->vfs_inode); 4335 u64 i_blocks = inode->i_blocks; 4336 struct super_block *sb = inode->i_sb; 4337 4338 if (i_blocks <= ~0U) { 4339 /* 4340 * i_blocks can be represnted in a 32 bit variable 4341 * as multiple of 512 bytes 4342 */ 4343 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 4344 raw_inode->i_blocks_high = 0; 4345 ei->i_flags &= ~EXT4_HUGE_FILE_FL; 4346 return 0; 4347 } 4348 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) 4349 return -EFBIG; 4350 4351 if (i_blocks <= 0xffffffffffffULL) { 4352 /* 4353 * i_blocks can be represented in a 48 bit variable 4354 * as multiple of 512 bytes 4355 */ 4356 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 4357 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 4358 ei->i_flags &= ~EXT4_HUGE_FILE_FL; 4359 } else { 4360 ei->i_flags |= EXT4_HUGE_FILE_FL; 4361 /* i_block is stored in file system block size */ 4362 i_blocks = i_blocks >> (inode->i_blkbits - 9); 4363 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 4364 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 4365 } 4366 return 0; 4367 } 4368 4369 /* 4370 * Post the struct inode info into an on-disk inode location in the 4371 * buffer-cache. This gobbles the caller's reference to the 4372 * buffer_head in the inode location struct. 4373 * 4374 * The caller must have write access to iloc->bh. 
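 *
 * For illustration, the usual sequence (see ext4_reserve_inode_write() and
 * ext4_mark_iloc_dirty() further down) is:
 *
 *	ext4_reserve_inode_write(handle, inode, &iloc);
 *	... modify the in-core inode ...
 *	ext4_mark_iloc_dirty(handle, inode, &iloc);
 *
 * by which point write access to iloc->bh has already been taken and the
 * caller's reference is consumed here.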
 */
static int ext4_do_update_inode(handle_t *handle,
				struct inode *inode,
				struct ext4_iloc *iloc)
{
	struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct buffer_head *bh = iloc->bh;
	int err = 0, rc, block;

	/* For fields not tracked in the in-memory inode,
	 * initialise them to zero for new inodes. */
	if (ei->i_state & EXT4_STATE_NEW)
		memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);

	ext4_get_inode_flags(ei);
	raw_inode->i_mode = cpu_to_le16(inode->i_mode);
	if (!(test_opt(inode->i_sb, NO_UID32))) {
		raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
		raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
		/*
		 * Fix up interoperability with old kernels. Otherwise, old
		 * inodes get re-used with the upper 16 bits of the uid/gid
		 * intact.
		 */
		if (!ei->i_dtime) {
			raw_inode->i_uid_high =
				cpu_to_le16(high_16_bits(inode->i_uid));
			raw_inode->i_gid_high =
				cpu_to_le16(high_16_bits(inode->i_gid));
		} else {
			raw_inode->i_uid_high = 0;
			raw_inode->i_gid_high = 0;
		}
	} else {
		raw_inode->i_uid_low =
			cpu_to_le16(fs_high2lowuid(inode->i_uid));
		raw_inode->i_gid_low =
			cpu_to_le16(fs_high2lowgid(inode->i_gid));
		raw_inode->i_uid_high = 0;
		raw_inode->i_gid_high = 0;
	}
	raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);

	EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
	EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
	EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
	EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);

	if (ext4_inode_blocks_set(handle, raw_inode, ei))
		goto out_brelse;
	raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
	/* clear the migrate flag in the raw_inode */
	raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE);
	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
	    cpu_to_le32(EXT4_OS_HURD))
		raw_inode->i_file_acl_high =
			cpu_to_le16(ei->i_file_acl >> 32);
	raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
	ext4_isize_set(raw_inode, ei->i_disksize);
	if (ei->i_disksize > 0x7fffffffULL) {
		struct super_block *sb = inode->i_sb;
		if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
				EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
		    EXT4_SB(sb)->s_es->s_rev_level ==
					cpu_to_le32(EXT4_GOOD_OLD_REV)) {
			/* If this is the first large file
			 * created, add a flag to the superblock.
			 */
			err = ext4_journal_get_write_access(handle,
					EXT4_SB(sb)->s_sbh);
			if (err)
				goto out_brelse;
			ext4_update_dynamic_rev(sb);
			EXT4_SET_RO_COMPAT_FEATURE(sb,
					EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
			sb->s_dirt = 1;
			ext4_handle_sync(handle);
			err = ext4_handle_dirty_metadata(handle, inode,
					EXT4_SB(sb)->s_sbh);
		}
	}
	raw_inode->i_generation = cpu_to_le32(inode->i_generation);
	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
		if (old_valid_dev(inode->i_rdev)) {
			raw_inode->i_block[0] =
				cpu_to_le32(old_encode_dev(inode->i_rdev));
			raw_inode->i_block[1] = 0;
		} else {
			raw_inode->i_block[0] = 0;
			raw_inode->i_block[1] =
				cpu_to_le32(new_encode_dev(inode->i_rdev));
			raw_inode->i_block[2] = 0;
		}
	} else for (block = 0; block < EXT4_N_BLOCKS; block++)
		raw_inode->i_block[block] = ei->i_data[block];

	raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
	if (ei->i_extra_isize) {
		if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
			raw_inode->i_version_hi =
				cpu_to_le32(inode->i_version >> 32);
		raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
	}

	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
	rc = ext4_handle_dirty_metadata(handle, inode, bh);
	if (!err)
		err = rc;
	ei->i_state &= ~EXT4_STATE_NEW;

out_brelse:
	brelse(bh);
	ext4_std_error(inode->i_sb, err);
	return err;
}

/*
 * ext4_write_inode()
 *
 * We are called from a few places:
 *
 * - Within generic_file_write() for O_SYNC files.
 *   Here, there will be no transaction running. We wait for any running
 *   transaction to commit.
 *
 * - Within sys_sync(), kupdate and such.
 *   We wait on commit, if told to.
 *
 * - Within prune_icache() (PF_MEMALLOC == true)
 *   Here we simply return.  We can't afford to block kswapd on the
 *   journal commit.
 *
 * In all cases it is actually safe for us to return without doing anything,
 * because the inode has been copied into a raw inode buffer in
 * ext4_mark_inode_dirty().  This is a correctness thing for O_SYNC and for
 * knfsd.
 *
 * Note that we are absolutely dependent upon all inode dirtiers doing the
 * right thing: they *must* call mark_inode_dirty() after dirtying info in
 * which we are interested.
 *
 * It would be a bug for them to not do this.  The code:
 *
 *	mark_inode_dirty(inode)
 *	stuff();
 *	inode->i_size = expr;
 *
 * is in error because a kswapd-driven write_inode() could occur while
 * `stuff()' is running, and the new i_size will be lost.  Plus the inode
 * will no longer be on the superblock's dirty inode list.
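 *
 * The safe ordering, shown here only as an illustration, is to make every
 * in-core change first and call mark_inode_dirty() last:
 *
 *	stuff();
 *	inode->i_size = expr;
 *	mark_inode_dirty(inode);
 *
 * so a racing write_inode() sees either the old or the fully updated
 * fields, and the inode is back on the dirty list after the final change.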
4525 */ 4526 int ext4_write_inode(struct inode *inode, int wait) 4527 { 4528 if (current->flags & PF_MEMALLOC) 4529 return 0; 4530 4531 if (ext4_journal_current_handle()) { 4532 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); 4533 dump_stack(); 4534 return -EIO; 4535 } 4536 4537 if (!wait) 4538 return 0; 4539 4540 return ext4_force_commit(inode->i_sb); 4541 } 4542 4543 int __ext4_write_dirty_metadata(struct inode *inode, struct buffer_head *bh) 4544 { 4545 int err = 0; 4546 4547 mark_buffer_dirty(bh); 4548 if (inode && inode_needs_sync(inode)) { 4549 sync_dirty_buffer(bh); 4550 if (buffer_req(bh) && !buffer_uptodate(bh)) { 4551 ext4_error(inode->i_sb, __func__, 4552 "IO error syncing inode, " 4553 "inode=%lu, block=%llu", 4554 inode->i_ino, 4555 (unsigned long long)bh->b_blocknr); 4556 err = -EIO; 4557 } 4558 } 4559 return err; 4560 } 4561 4562 /* 4563 * ext4_setattr() 4564 * 4565 * Called from notify_change. 4566 * 4567 * We want to trap VFS attempts to truncate the file as soon as 4568 * possible. In particular, we want to make sure that when the VFS 4569 * shrinks i_size, we put the inode on the orphan list and modify 4570 * i_disksize immediately, so that during the subsequent flushing of 4571 * dirty pages and freeing of disk blocks, we can guarantee that any 4572 * commit will leave the blocks being flushed in an unused state on 4573 * disk. (On recovery, the inode will get truncated and the blocks will 4574 * be freed, so we have a strong guarantee that no future commit will 4575 * leave these blocks visible to the user.) 4576 * 4577 * Another thing we have to assure is that if we are in ordered mode 4578 * and inode is still attached to the committing transaction, we must 4579 * we start writeout of all the dirty pages which are being truncated. 4580 * This way we are sure that all the data written in the previous 4581 * transaction are already on disk (truncate waits for pages under 4582 * writeback). 4583 * 4584 * Called with inode->i_mutex down. 4585 */ 4586 int ext4_setattr(struct dentry *dentry, struct iattr *attr) 4587 { 4588 struct inode *inode = dentry->d_inode; 4589 int error, rc = 0; 4590 const unsigned int ia_valid = attr->ia_valid; 4591 4592 error = inode_change_ok(inode, attr); 4593 if (error) 4594 return error; 4595 4596 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || 4597 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { 4598 handle_t *handle; 4599 4600 /* (user+group)*(old+new) structure, inode write (sb, 4601 * inode block, ? - but truncate inode update has it) */ 4602 handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+ 4603 EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3); 4604 if (IS_ERR(handle)) { 4605 error = PTR_ERR(handle); 4606 goto err_out; 4607 } 4608 error = DQUOT_TRANSFER(inode, attr) ? 
-EDQUOT : 0; 4609 if (error) { 4610 ext4_journal_stop(handle); 4611 return error; 4612 } 4613 /* Update corresponding info in inode so that everything is in 4614 * one transaction */ 4615 if (attr->ia_valid & ATTR_UID) 4616 inode->i_uid = attr->ia_uid; 4617 if (attr->ia_valid & ATTR_GID) 4618 inode->i_gid = attr->ia_gid; 4619 error = ext4_mark_inode_dirty(handle, inode); 4620 ext4_journal_stop(handle); 4621 } 4622 4623 if (attr->ia_valid & ATTR_SIZE) { 4624 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { 4625 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 4626 4627 if (attr->ia_size > sbi->s_bitmap_maxbytes) { 4628 error = -EFBIG; 4629 goto err_out; 4630 } 4631 } 4632 } 4633 4634 if (S_ISREG(inode->i_mode) && 4635 attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { 4636 handle_t *handle; 4637 4638 handle = ext4_journal_start(inode, 3); 4639 if (IS_ERR(handle)) { 4640 error = PTR_ERR(handle); 4641 goto err_out; 4642 } 4643 4644 error = ext4_orphan_add(handle, inode); 4645 EXT4_I(inode)->i_disksize = attr->ia_size; 4646 rc = ext4_mark_inode_dirty(handle, inode); 4647 if (!error) 4648 error = rc; 4649 ext4_journal_stop(handle); 4650 4651 if (ext4_should_order_data(inode)) { 4652 error = ext4_begin_ordered_truncate(inode, 4653 attr->ia_size); 4654 if (error) { 4655 /* Do as much error cleanup as possible */ 4656 handle = ext4_journal_start(inode, 3); 4657 if (IS_ERR(handle)) { 4658 ext4_orphan_del(NULL, inode); 4659 goto err_out; 4660 } 4661 ext4_orphan_del(handle, inode); 4662 ext4_journal_stop(handle); 4663 goto err_out; 4664 } 4665 } 4666 } 4667 4668 rc = inode_setattr(inode, attr); 4669 4670 /* If inode_setattr's call to ext4_truncate failed to get a 4671 * transaction handle at all, we need to clean up the in-core 4672 * orphan list manually. */ 4673 if (inode->i_nlink) 4674 ext4_orphan_del(NULL, inode); 4675 4676 if (!rc && (ia_valid & ATTR_MODE)) 4677 rc = ext4_acl_chmod(inode); 4678 4679 err_out: 4680 ext4_std_error(inode->i_sb, error); 4681 if (!error) 4682 error = rc; 4683 return error; 4684 } 4685 4686 int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, 4687 struct kstat *stat) 4688 { 4689 struct inode *inode; 4690 unsigned long delalloc_blocks; 4691 4692 inode = dentry->d_inode; 4693 generic_fillattr(inode, stat); 4694 4695 /* 4696 * We can't update i_blocks if the block allocation is delayed 4697 * otherwise in the case of system crash before the real block 4698 * allocation is done, we will have i_blocks inconsistent with 4699 * on-disk file blocks. 4700 * We always keep i_blocks updated together with real 4701 * allocation. But to not confuse with user, stat 4702 * will return the blocks that include the delayed allocation 4703 * blocks for this file. 
 */
	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
	delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);

	stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
	return 0;
}

static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
				      int chunk)
{
	int indirects;

	/* if nrblocks are contiguous */
	if (chunk) {
		/*
		 * With N contiguous data blocks, it needs at most
		 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks,
		 * 2 dindirect blocks and 1 tindirect block
		 */
		indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb);
		return indirects + 3;
	}
	/*
	 * if nrblocks are not contiguous, worst case each block touches
	 * an indirect block, and each indirect block touches a double
	 * indirect block, plus a triple indirect block
	 */
	indirects = nrblocks * 2 + 1;
	return indirects;
}

static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
{
	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
		return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
	return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
}

/*
 * Account for index blocks, block group bitmaps and block group
 * descriptor blocks when we modify data blocks and index blocks.
 * In the worst case the index blocks spread over different block groups.
 *
 * If the data blocks are discontiguous, they may spread over different
 * block groups too.  If they are contiguous, with flexbg they could
 * still cross a block group boundary.
 *
 * Also account for superblock, inode, quota and xattr blocks.
 */
int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
{
	int groups, gdpblocks;
	int idxblocks;
	int ret = 0;

	/*
	 * How many index blocks do we need to touch to modify nrblocks?
	 * The "chunk" flag indicates whether nrblocks is physically
	 * contiguous on disk.
	 *
	 * Direct IO and fallocate call get_block to allocate one single
	 * extent at a time, so they can set the "chunk" flag.
	 */
	idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk);

	ret = idxblocks;

	/*
	 * Now let's see how many group bitmaps and group descriptors
	 * we need to account for.
	 */
	groups = idxblocks;
	if (chunk)
		groups += 1;
	else
		groups += nrblocks;

	gdpblocks = groups;
	if (groups > EXT4_SB(inode->i_sb)->s_groups_count)
		groups = EXT4_SB(inode->i_sb)->s_groups_count;
	if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
		gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;

	/* bitmaps and block group descriptor blocks */
	ret += groups + gdpblocks;

	/* Blocks for super block, inode, quota and xattr blocks */
	ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);

	return ret;
}
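/*
 * Purely illustrative sketch (never compiled, mirroring the #if 0 blocks
 * used elsewhere in this file): how a write path might size its handle
 * with the credit helpers above.  The function name my_write_one_page()
 * is hypothetical; ext4_writepage_trans_blocks() is defined just below.
 */
#if 0
static int my_write_one_page(struct inode *inode)
{
	handle_t *handle;
	int credits = ext4_writepage_trans_blocks(inode);

	handle = ext4_journal_start(inode, credits);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	/* ... allocate blocks and dirty metadata under this handle ... */

	return ext4_journal_stop(handle);
}
#endif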
/*
 * Calculate the total number of credits to reserve to fit the
 * modification of a single page into a single transaction, which may
 * include multiple chunks of block allocations.
 *
 * This could be called via ext4_write_begin().
 *
 * We need to consider the worst case, when one new block is allocated
 * per extent.
 */
int ext4_writepage_trans_blocks(struct inode *inode)
{
	int bpp = ext4_journal_blocks_per_page(inode);
	int ret;

	ret = ext4_meta_trans_blocks(inode, bpp, 0);

	/* Account for data blocks for journalled mode */
	if (ext4_should_journal_data(inode))
		ret += bpp;
	return ret;
}

/*
 * Calculate the journal credits for a chunk of data modification.
 *
 * This is called from DIO, fallocate or whoever is calling
 * ext4_get_blocks_wrap() to map/allocate a chunk of contiguous disk blocks.
 *
 * Journal buffers for data blocks are not included here, as DIO and
 * fallocate do not need to journal data buffers.
 */
int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
{
	return ext4_meta_trans_blocks(inode, nrblocks, 1);
}

/*
 * The caller must have previously called ext4_reserve_inode_write().
 * Given this, we know that the caller already has write access to iloc->bh.
 */
int ext4_mark_iloc_dirty(handle_t *handle,
			 struct inode *inode, struct ext4_iloc *iloc)
{
	int err = 0;

	if (test_opt(inode->i_sb, I_VERSION))
		inode_inc_iversion(inode);

	/* the do_update_inode consumes one bh->b_count */
	get_bh(iloc->bh);

	/* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
	err = ext4_do_update_inode(handle, inode, iloc);
	put_bh(iloc->bh);
	return err;
}

/*
 * On success, we end up with an outstanding reference count against
 * iloc->bh.  This _must_ be cleaned up later.
 */
int
ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
			 struct ext4_iloc *iloc)
{
	int err;

	err = ext4_get_inode_loc(inode, iloc);
	if (!err) {
		BUFFER_TRACE(iloc->bh, "get_write_access");
		err = ext4_journal_get_write_access(handle, iloc->bh);
		if (err) {
			brelse(iloc->bh);
			iloc->bh = NULL;
		}
	}
	ext4_std_error(inode->i_sb, err);
	return err;
}

/*
 * Expand an inode by new_extra_isize bytes.
 * Returns 0 on success or negative error number on failure.
 */
static int ext4_expand_extra_isize(struct inode *inode,
				   unsigned int new_extra_isize,
				   struct ext4_iloc iloc,
				   handle_t *handle)
{
	struct ext4_inode *raw_inode;
	struct ext4_xattr_ibody_header *header;
	struct ext4_xattr_entry *entry;

	if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
		return 0;

	raw_inode = ext4_raw_inode(&iloc);

	header = IHDR(inode, raw_inode);
	entry = IFIRST(header);

	/* No extended attributes present */
	if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) ||
	    header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
		memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
		       new_extra_isize);
		EXT4_I(inode)->i_extra_isize = new_extra_isize;
		return 0;
	}

	/* try to expand with EAs present */
	return ext4_expand_extra_isize_ea(inode, new_extra_isize,
					  raw_inode, handle);
}

/*
 * What we do here is to mark the in-core inode as clean with respect to inode
 * dirtiness (it may still be data-dirty).
 * This means that the in-core inode may be reaped by prune_icache
 * without having to perform any I/O.  This is a very good thing,
 * because *any* task may call prune_icache - even ones which
 * have a transaction open against a different journal.
4923 * 4924 * Is this cheating? Not really. Sure, we haven't written the 4925 * inode out, but prune_icache isn't a user-visible syncing function. 4926 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) 4927 * we start and wait on commits. 4928 * 4929 * Is this efficient/effective? Well, we're being nice to the system 4930 * by cleaning up our inodes proactively so they can be reaped 4931 * without I/O. But we are potentially leaving up to five seconds' 4932 * worth of inodes floating about which prune_icache wants us to 4933 * write out. One way to fix that would be to get prune_icache() 4934 * to do a write_super() to free up some memory. It has the desired 4935 * effect. 4936 */ 4937 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) 4938 { 4939 struct ext4_iloc iloc; 4940 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 4941 static unsigned int mnt_count; 4942 int err, ret; 4943 4944 might_sleep(); 4945 err = ext4_reserve_inode_write(handle, inode, &iloc); 4946 if (ext4_handle_valid(handle) && 4947 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 4948 !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) { 4949 /* 4950 * We need extra buffer credits since we may write into EA block 4951 * with this same handle. If journal_extend fails, then it will 4952 * only result in a minor loss of functionality for that inode. 4953 * If this is felt to be critical, then e2fsck should be run to 4954 * force a large enough s_min_extra_isize. 4955 */ 4956 if ((jbd2_journal_extend(handle, 4957 EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) { 4958 ret = ext4_expand_extra_isize(inode, 4959 sbi->s_want_extra_isize, 4960 iloc, handle); 4961 if (ret) { 4962 EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; 4963 if (mnt_count != 4964 le16_to_cpu(sbi->s_es->s_mnt_count)) { 4965 ext4_warning(inode->i_sb, __func__, 4966 "Unable to expand inode %lu. Delete" 4967 " some EAs or run e2fsck.", 4968 inode->i_ino); 4969 mnt_count = 4970 le16_to_cpu(sbi->s_es->s_mnt_count); 4971 } 4972 } 4973 } 4974 } 4975 if (!err) 4976 err = ext4_mark_iloc_dirty(handle, inode, &iloc); 4977 return err; 4978 } 4979 4980 /* 4981 * ext4_dirty_inode() is called from __mark_inode_dirty() 4982 * 4983 * We're really interested in the case where a file is being extended. 4984 * i_size has been changed by generic_commit_write() and we thus need 4985 * to include the updated inode in the current transaction. 4986 * 4987 * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks 4988 * are allocated to the file. 4989 * 4990 * If the inode is marked synchronous, we don't honour that here - doing 4991 * so would cause a commit on atime updates, which we don't bother doing. 4992 * We handle synchronous inodes at the highest possible level. 4993 */ 4994 void ext4_dirty_inode(struct inode *inode) 4995 { 4996 handle_t *current_handle = ext4_journal_current_handle(); 4997 handle_t *handle; 4998 4999 if (!ext4_handle_valid(current_handle)) { 5000 ext4_mark_inode_dirty(current_handle, inode); 5001 return; 5002 } 5003 5004 handle = ext4_journal_start(inode, 2); 5005 if (IS_ERR(handle)) 5006 goto out; 5007 if (current_handle && 5008 current_handle->h_transaction != handle->h_transaction) { 5009 /* This task has a transaction open against a different fs */ 5010 printk(KERN_EMERG "%s: transactions do not match!\n", 5011 __func__); 5012 } else { 5013 jbd_debug(5, "marking dirty. 
outer handle=%p\n", 5014 current_handle); 5015 ext4_mark_inode_dirty(handle, inode); 5016 } 5017 ext4_journal_stop(handle); 5018 out: 5019 return; 5020 } 5021 5022 #if 0 5023 /* 5024 * Bind an inode's backing buffer_head into this transaction, to prevent 5025 * it from being flushed to disk early. Unlike 5026 * ext4_reserve_inode_write, this leaves behind no bh reference and 5027 * returns no iloc structure, so the caller needs to repeat the iloc 5028 * lookup to mark the inode dirty later. 5029 */ 5030 static int ext4_pin_inode(handle_t *handle, struct inode *inode) 5031 { 5032 struct ext4_iloc iloc; 5033 5034 int err = 0; 5035 if (handle) { 5036 err = ext4_get_inode_loc(inode, &iloc); 5037 if (!err) { 5038 BUFFER_TRACE(iloc.bh, "get_write_access"); 5039 err = jbd2_journal_get_write_access(handle, iloc.bh); 5040 if (!err) 5041 err = ext4_handle_dirty_metadata(handle, 5042 inode, 5043 iloc.bh); 5044 brelse(iloc.bh); 5045 } 5046 } 5047 ext4_std_error(inode->i_sb, err); 5048 return err; 5049 } 5050 #endif 5051 5052 int ext4_change_inode_journal_flag(struct inode *inode, int val) 5053 { 5054 journal_t *journal; 5055 handle_t *handle; 5056 int err; 5057 5058 /* 5059 * We have to be very careful here: changing a data block's 5060 * journaling status dynamically is dangerous. If we write a 5061 * data block to the journal, change the status and then delete 5062 * that block, we risk forgetting to revoke the old log record 5063 * from the journal and so a subsequent replay can corrupt data. 5064 * So, first we make sure that the journal is empty and that 5065 * nobody is changing anything. 5066 */ 5067 5068 journal = EXT4_JOURNAL(inode); 5069 if (!journal) 5070 return 0; 5071 if (is_journal_aborted(journal)) 5072 return -EROFS; 5073 5074 jbd2_journal_lock_updates(journal); 5075 jbd2_journal_flush(journal); 5076 5077 /* 5078 * OK, there are no updates running now, and all cached data is 5079 * synced to disk. We are now in a completely consistent state 5080 * which doesn't have anything in the journal, and we know that 5081 * no filesystem updates are running, so it is safe to modify 5082 * the inode's in-core data-journaling state flag now. 5083 */ 5084 5085 if (val) 5086 EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL; 5087 else 5088 EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL; 5089 ext4_set_aops(inode); 5090 5091 jbd2_journal_unlock_updates(journal); 5092 5093 /* Finally we can mark the inode as dirty. */ 5094 5095 handle = ext4_journal_start(inode, 1); 5096 if (IS_ERR(handle)) 5097 return PTR_ERR(handle); 5098 5099 err = ext4_mark_inode_dirty(handle, inode); 5100 ext4_handle_sync(handle); 5101 ext4_journal_stop(handle); 5102 ext4_std_error(inode->i_sb, err); 5103 5104 return err; 5105 } 5106 5107 static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) 5108 { 5109 return !buffer_mapped(bh); 5110 } 5111 5112 int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page) 5113 { 5114 loff_t size; 5115 unsigned long len; 5116 int ret = -EINVAL; 5117 void *fsdata; 5118 struct file *file = vma->vm_file; 5119 struct inode *inode = file->f_path.dentry->d_inode; 5120 struct address_space *mapping = inode->i_mapping; 5121 5122 /* 5123 * Get i_alloc_sem to stop truncates messing with the inode. We cannot 5124 * get i_mutex because we are already holding mmap_sem. 5125 */ 5126 down_read(&inode->i_alloc_sem); 5127 size = i_size_read(inode); 5128 if (page->mapping != mapping || size <= page_offset(page) 5129 || !PageUptodate(page)) { 5130 /* page got truncated from under us? 
		 */
		goto out_unlock;
	}
	ret = 0;
	if (PageMappedToDisk(page))
		goto out_unlock;

	if (page->index == size >> PAGE_CACHE_SHIFT)
		len = size & ~PAGE_CACHE_MASK;
	else
		len = PAGE_CACHE_SIZE;

	if (page_has_buffers(page)) {
		/* return if we have all the buffers mapped */
		if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
				       ext4_bh_unmapped))
			goto out_unlock;
	}
	/*
	 * OK, we need to fill the hole...  Do write_begin/write_end
	 * to do the block allocation/reservation.  We are not holding
	 * inode->i_mutex here; that allows parallel write_begin/write_end
	 * calls.  lock_page prevents this from happening on the same page,
	 * though.
	 */
	ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
			len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
	if (ret < 0)
		goto out_unlock;
	ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
			len, len, page, fsdata);
	if (ret < 0)
		goto out_unlock;
	ret = 0;
out_unlock:
	up_read(&inode->i_alloc_sem);
	return ret;
}
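/*
 * Purely illustrative sketch (never compiled): driving
 * ext4_change_inode_journal_flag() from a hypothetical caller named
 * my_set_data_journaling().  In the real kernel the ioctl path performs
 * this flag flip; the locking shown here is only indicative.
 */
#if 0
static int my_set_data_journaling(struct inode *inode, int enable)
{
	int err;

	mutex_lock(&inode->i_mutex);
	err = ext4_change_inode_journal_flag(inode, enable);
	mutex_unlock(&inode->i_mutex);

	return err;
}
#endif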