// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/inode.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  64-bit file support on 64-bit platforms by Jakub Jelinek
 *	(jj@sunsite.ms.mff.cuni.cz)
 *
 *  Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
 */

#include <linux/fs.h>
#include <linux/time.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include <linux/workqueue.h>
#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/iomap.h>
#include <linux/iversion.h>

#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
#include "truncate.h"

#include <trace/events/ext4.h>

static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
			     struct ext4_inode_info *ei)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	__u32 csum;
	__u16 dummy_csum = 0;
	int offset = offsetof(struct ext4_inode, i_checksum_lo);
	unsigned int csum_size = sizeof(dummy_csum);

	csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset);
	csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size);
	offset += csum_size;
	csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
			   EXT4_GOOD_OLD_INODE_SIZE - offset);

	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
		offset = offsetof(struct ext4_inode, i_checksum_hi);
		csum = ext4_chksum(sbi, csum, (__u8 *)raw +
				   EXT4_GOOD_OLD_INODE_SIZE,
				   offset - EXT4_GOOD_OLD_INODE_SIZE);
		if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
			csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum,
					   csum_size);
			offset += csum_size;
		}
		csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
				   EXT4_INODE_SIZE(inode->i_sb) - offset);
	}

	return csum;
}

static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
				  struct ext4_inode_info *ei)
{
	__u32 provided, calculated;

	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
	    cpu_to_le32(EXT4_OS_LINUX) ||
	    !ext4_has_metadata_csum(inode->i_sb))
		return 1;

	provided = le16_to_cpu(raw->i_checksum_lo);
	calculated = ext4_inode_csum(inode, raw, ei);
	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
	    EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
		provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16;
	else
		calculated &= 0xFFFF;

	return provided == calculated;
}

static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
				struct ext4_inode_info *ei)
{
	__u32 csum;

	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
	    cpu_to_le32(EXT4_OS_LINUX) ||
	    !ext4_has_metadata_csum(inode->i_sb))
		return;

	csum = ext4_inode_csum(inode, raw, ei);
	raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF);
	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
	    EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
		raw->i_checksum_hi = cpu_to_le16(csum >> 16);
}

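/*
 * Editor's note (summary of the helpers above, added for clarity): the
 * seed-based checksum from ext4_inode_csum() covers the on-disk inode with
 * both checksum fields treated as zero, and the 32-bit result is split so
 * that i_checksum_lo always holds the low 16 bits while i_checksum_hi holds
 * the high 16 bits only when the inode is large enough to contain that
 * field.  Old 128-byte inodes therefore store and verify just 16 bits of
 * the checksum, which is why ext4_inode_csum_verify() masks 'calculated'
 * with 0xFFFF in that case.
 */
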
static inline int ext4_begin_ordered_truncate(struct inode *inode,
					      loff_t new_size)
{
	trace_ext4_begin_ordered_truncate(inode, new_size);
	/*
	 * If jinode is zero, then we never opened the file for
	 * writing, so there's no need to call
	 * jbd2_journal_begin_ordered_truncate() since there's no
	 * outstanding writes we need to flush.
	 */
	if (!EXT4_I(inode)->jinode)
		return 0;
	return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
						   EXT4_I(inode)->jinode,
						   new_size);
}

static void ext4_invalidatepage(struct page *page, unsigned int offset,
				unsigned int length);
static int __ext4_journalled_writepage(struct page *page, unsigned int len);
static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
				  int pextents);

/*
 * Test whether an inode is a fast symlink.
 * A fast symlink has its symlink data stored in ext4_inode_info->i_data.
 */
int ext4_inode_is_fast_symlink(struct inode *inode)
{
	if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
		int ea_blocks = EXT4_I(inode)->i_file_acl ?
				EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0;

		if (ext4_has_inline_data(inode))
			return 0;

		return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
	}
	return S_ISLNK(inode->i_mode) && inode->i_size &&
	       (inode->i_size < EXT4_N_BLOCKS * 4);
}

/*
 * Called at the last iput() if i_nlink is zero.
 */
void ext4_evict_inode(struct inode *inode)
{
	handle_t *handle;
	int err;
	/*
	 * Credits for final inode cleanup and freeing:
	 * sb + inode (ext4_orphan_del()), block bitmap, group descriptor
	 * (xattr block freeing), bitmap, group descriptor (inode freeing)
	 */
	int extra_credits = 6;
	struct ext4_xattr_inode_array *ea_inode_array = NULL;

	trace_ext4_evict_inode(inode);

	if (inode->i_nlink) {
		/*
		 * When journalling data dirty buffers are tracked only in the
		 * journal. So although mm thinks everything is clean and
		 * ready for reaping the inode might still have some pages to
		 * write in the running transaction or waiting to be
		 * checkpointed. Thus calling jbd2_journal_invalidatepage()
		 * (via truncate_inode_pages()) to discard these buffers can
		 * cause data loss. Also even if we did not discard these
		 * buffers, we would have no way to find them after the inode
		 * is reaped and thus user could see stale data if he tries to
		 * read them before the transaction is checkpointed. So be
		 * careful and force everything to disk here... We use
		 * ei->i_datasync_tid to store the newest transaction
		 * containing inode's data.
		 *
		 * Note that directories do not have this problem because they
		 * don't use page cache.
		 */
		if (inode->i_ino != EXT4_JOURNAL_INO &&
		    ext4_should_journal_data(inode) &&
		    (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) &&
		    inode->i_data.nrpages) {
			journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
			tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;

			jbd2_complete_transaction(journal, commit_tid);
			filemap_write_and_wait(&inode->i_data);
		}
		truncate_inode_pages_final(&inode->i_data);

		goto no_delete;
	}

	if (is_bad_inode(inode))
		goto no_delete;
	dquot_initialize(inode);

	if (ext4_should_order_data(inode))
		ext4_begin_ordered_truncate(inode, 0);
	truncate_inode_pages_final(&inode->i_data);

	/*
	 * Protect us against freezing - iput() caller didn't have to have any
	 * protection against it
	 */
	sb_start_intwrite(inode->i_sb);

	if (!IS_NOQUOTA(inode))
		extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb);

	/*
	 * Block bitmap, group descriptor, and inode are accounted in both
	 * ext4_blocks_for_truncate() and extra_credits. So subtract 3.
	 */
	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
			 ext4_blocks_for_truncate(inode) + extra_credits - 3);
	if (IS_ERR(handle)) {
		ext4_std_error(inode->i_sb, PTR_ERR(handle));
		/*
		 * If we're going to skip the normal cleanup, we still need to
		 * make sure that the in-core orphan linked list is properly
		 * cleaned up.
		 */
		ext4_orphan_del(NULL, inode);
		sb_end_intwrite(inode->i_sb);
		goto no_delete;
	}

	if (IS_SYNC(inode))
		ext4_handle_sync(handle);

	/*
	 * Set inode->i_size to 0 before calling ext4_truncate(). We need
	 * special handling of symlinks here because i_size is used to
	 * determine whether ext4_inode_info->i_data contains symlink data or
	 * block mappings. Setting i_size to 0 will remove its fast symlink
	 * status. Erase i_data so that it becomes a valid empty block map.
	 */
	if (ext4_inode_is_fast_symlink(inode))
		memset(EXT4_I(inode)->i_data, 0, sizeof(EXT4_I(inode)->i_data));
	inode->i_size = 0;
	err = ext4_mark_inode_dirty(handle, inode);
	if (err) {
		ext4_warning(inode->i_sb,
			     "couldn't mark inode dirty (err %d)", err);
		goto stop_handle;
	}
	if (inode->i_blocks) {
		err = ext4_truncate(inode);
		if (err) {
			ext4_error_err(inode->i_sb, -err,
				       "couldn't truncate inode %lu (err %d)",
				       inode->i_ino, err);
			goto stop_handle;
		}
	}

	/* Remove xattr references. */
	err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array,
				      extra_credits);
	if (err) {
		ext4_warning(inode->i_sb, "xattr delete (err %d)", err);
stop_handle:
		ext4_journal_stop(handle);
		ext4_orphan_del(NULL, inode);
		sb_end_intwrite(inode->i_sb);
		ext4_xattr_inode_array_free(ea_inode_array);
		goto no_delete;
	}

	/*
	 * Kill off the orphan record which ext4_truncate created.
	 * AKPM: I think this can be inside the above `if'.
	 * Note that ext4_orphan_del() has to be able to cope with the
	 * deletion of a non-existent orphan - this is because we don't
	 * know if ext4_truncate() actually created an orphan record.
	 * (Well, we could do this if we need to, but heck - it works)
	 */
	ext4_orphan_del(handle, inode);
	EXT4_I(inode)->i_dtime = (__u32)ktime_get_real_seconds();

	/*
	 * One subtle ordering requirement: if anything has gone wrong
	 * (transaction abort, IO errors, whatever), then we can still
	 * do these next steps (the fs will already have been marked as
	 * having errors), but we can't free the inode if the mark_dirty
	 * fails.
	 */
	if (ext4_mark_inode_dirty(handle, inode))
		/* If that failed, just do the required in-core inode clear. */
		ext4_clear_inode(inode);
	else
		ext4_free_inode(handle, inode);
	ext4_journal_stop(handle);
	sb_end_intwrite(inode->i_sb);
	ext4_xattr_inode_array_free(ea_inode_array);
	return;
no_delete:
	ext4_clear_inode(inode);	/* We must guarantee clearing of inode... */
}

#ifdef CONFIG_QUOTA
qsize_t *ext4_get_reserved_space(struct inode *inode)
{
	return &EXT4_I(inode)->i_reserved_quota;
}
#endif

/*
 * Called with i_data_sem down, which is important since we can call
 * ext4_discard_preallocations() from here.
 */
void ext4_da_update_reserve_space(struct inode *inode,
					int used, int quota_claim)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct ext4_inode_info *ei = EXT4_I(inode);

	spin_lock(&ei->i_block_reservation_lock);
	trace_ext4_da_update_reserve_space(inode, used, quota_claim);
	if (unlikely(used > ei->i_reserved_data_blocks)) {
		ext4_warning(inode->i_sb, "%s: ino %lu, used %d "
			     "with only %d reserved data blocks",
			     __func__, inode->i_ino, used,
			     ei->i_reserved_data_blocks);
		WARN_ON(1);
		used = ei->i_reserved_data_blocks;
	}

	/* Update per-inode reservations */
	ei->i_reserved_data_blocks -= used;
	percpu_counter_sub(&sbi->s_dirtyclusters_counter, used);

	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);

	/* Update quota subsystem for data blocks */
	if (quota_claim)
		dquot_claim_block(inode, EXT4_C2B(sbi, used));
	else {
		/*
		 * We did fallocate with an offset that is already delayed
		 * allocated. So on delayed allocated writeback we should
		 * not re-claim the quota for fallocated blocks.
		 */
		dquot_release_reservation_block(inode, EXT4_C2B(sbi, used));
	}

	/*
	 * If we have done all the pending block allocations and if
	 * there aren't any writers on the inode, we can discard the
	 * inode's preallocations.
	 */
	if ((ei->i_reserved_data_blocks == 0) &&
	    !inode_is_open_for_write(inode))
		ext4_discard_preallocations(inode);
}

static int __check_block_validity(struct inode *inode, const char *func,
				  unsigned int line,
				  struct ext4_map_blocks *map)
{
	if (ext4_has_feature_journal(inode->i_sb) &&
	    (inode->i_ino ==
	     le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
		return 0;
	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
				   map->m_len)) {
		ext4_error_inode(inode, func, line, map->m_pblk,
				 "lblock %lu mapped to illegal pblock %llu "
				 "(length %d)", (unsigned long) map->m_lblk,
				 map->m_pblk, map->m_len);
		return -EFSCORRUPTED;
	}
	return 0;
}

int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
		       ext4_lblk_t len)
{
	int ret;

	if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
		return fscrypt_zeroout_range(inode, lblk, pblk, len);

	ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS);
	if (ret > 0)
		ret = 0;

	return ret;
}

#define check_block_validity(inode, map)	\
	__check_block_validity((inode), __func__, __LINE__, (map))

#ifdef ES_AGGRESSIVE_TEST
static void ext4_map_blocks_es_recheck(handle_t *handle,
				       struct inode *inode,
				       struct ext4_map_blocks *es_map,
				       struct ext4_map_blocks *map,
				       int flags)
{
	int retval;

	map->m_flags = 0;
	/*
	 * There is a race window in which the result is not the same.
	 * e.g. xfstests #223 when dioread_nolock is enabled.  The reason
	 * is that we look up a block mapping in the extent status tree
	 * without taking i_data_sem.  So at that time the unwritten extent
	 * could already have been converted.
	 */
	down_read(&EXT4_I(inode)->i_data_sem);
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
		retval = ext4_ext_map_blocks(handle, inode, map, 0);
	} else {
		retval = ext4_ind_map_blocks(handle, inode, map, 0);
	}
	up_read((&EXT4_I(inode)->i_data_sem));

	/*
	 * We don't check m_len because the extent will be collapsed in the
	 * status tree, so the lengths might not be equal.
	 */
	if (es_map->m_lblk != map->m_lblk ||
	    es_map->m_flags != map->m_flags ||
	    es_map->m_pblk != map->m_pblk) {
		printk("ES cache assertion failed for inode: %lu "
		       "es_cached ex [%d/%d/%llu/%x] != "
		       "found ex [%d/%d/%llu/%x] retval %d flags %x\n",
		       inode->i_ino, es_map->m_lblk, es_map->m_len,
		       es_map->m_pblk, es_map->m_flags, map->m_lblk,
		       map->m_len, map->m_pblk, map->m_flags,
		       retval, flags);
	}
}
#endif /* ES_AGGRESSIVE_TEST */

/*
 * The ext4_map_blocks() function tries to look up the requested blocks,
 * and returns if the blocks are already mapped.
 *
 * Otherwise it takes the write lock of i_data_sem, allocates blocks,
 * stores the allocated blocks in the result buffer head and marks it
 * mapped.
 *
 * If the file type is extent-based, it will call ext4_ext_map_blocks();
 * otherwise it will call ext4_ind_map_blocks() to handle indirect-mapping
 * based files.
 *
 * On success, it returns the number of blocks being mapped or allocated.  If
 * create==0 and the blocks are pre-allocated and unwritten, the resulting @map
 * is marked as unwritten.  If create == 1, it will mark @map as mapped.
 *
 * It returns 0 if a plain lookup failed (blocks have not been allocated);
 * in that case @map is returned as unmapped, but we still fill map->m_len to
 * indicate the length of a hole starting at map->m_lblk.
 *
 * It returns the error in case of allocation failure.
 */
int ext4_map_blocks(handle_t *handle, struct inode *inode,
		    struct ext4_map_blocks *map, int flags)
{
	struct extent_status es;
	int retval;
	int ret = 0;
#ifdef ES_AGGRESSIVE_TEST
	struct ext4_map_blocks orig_map;

	memcpy(&orig_map, map, sizeof(*map));
#endif

	map->m_flags = 0;
	ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
		  "logical block %lu\n", inode->i_ino, flags, map->m_len,
		  (unsigned long) map->m_lblk);

	/*
	 * ext4_map_blocks returns an int, and m_len is an unsigned int
	 */
	if (unlikely(map->m_len > INT_MAX))
		map->m_len = INT_MAX;

	/* We can handle the block number less than EXT_MAX_BLOCKS */
	if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
		return -EFSCORRUPTED;

	/* Lookup extent status tree firstly */
	if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
		if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
			map->m_pblk = ext4_es_pblock(&es) +
					map->m_lblk - es.es_lblk;
			map->m_flags |= ext4_es_is_written(&es) ?
					EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
			retval = es.es_len - (map->m_lblk - es.es_lblk);
			if (retval > map->m_len)
				retval = map->m_len;
			map->m_len = retval;
		} else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
			map->m_pblk = 0;
			retval = es.es_len - (map->m_lblk - es.es_lblk);
			if (retval > map->m_len)
				retval = map->m_len;
			map->m_len = retval;
			retval = 0;
		} else {
			BUG();
		}
#ifdef ES_AGGRESSIVE_TEST
		ext4_map_blocks_es_recheck(handle, inode, map,
					   &orig_map, flags);
#endif
		goto found;
	}

	/*
	 * Try to see if we can get the block without requesting a new
	 * file system block.
	 */
	down_read(&EXT4_I(inode)->i_data_sem);
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
		retval = ext4_ext_map_blocks(handle, inode, map, 0);
	} else {
		retval = ext4_ind_map_blocks(handle, inode, map, 0);
	}
	if (retval > 0) {
		unsigned int status;

		if (unlikely(retval != map->m_len)) {
			ext4_warning(inode->i_sb,
				     "ES len assertion failed for inode "
				     "%lu: retval %d != map->m_len %d",
				     inode->i_ino, retval, map->m_len);
			WARN_ON(1);
		}

		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
		if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
		    !(status & EXTENT_STATUS_WRITTEN) &&
		    ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
				       map->m_lblk + map->m_len - 1))
			status |= EXTENT_STATUS_DELAYED;
		ret = ext4_es_insert_extent(inode, map->m_lblk,
					    map->m_len, map->m_pblk, status);
		if (ret < 0)
			retval = ret;
	}
	up_read((&EXT4_I(inode)->i_data_sem));

found:
	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
		ret = check_block_validity(inode, map);
		if (ret != 0)
			return ret;
	}

	/* If it is only a block(s) look up */
	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
		return retval;

	/*
	 * Return if the blocks have already been allocated.
	 *
	 * Note that if blocks have been preallocated,
	 * ext4_ext_get_block() returns with create == 0
	 * and the buffer head unmapped.
	 */
	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
		/*
		 * If we need to convert the extent to unwritten
		 * we continue and do the actual work in
		 * ext4_ext_map_blocks()
		 */
		if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
			return retval;

	/*
	 * Here we clear m_flags because after allocating a new extent,
	 * it will be set again.
	 */
	map->m_flags &= ~EXT4_MAP_FLAGS;

	/*
	 * New block allocation and/or writing to an unwritten extent
	 * will possibly result in updating i_data, so we take
	 * the write lock of i_data_sem, and call get_block()
	 * with create == 1 flag.
	 */
	down_write(&EXT4_I(inode)->i_data_sem);

	/*
	 * We need to check for EXT4 here because migrate
	 * could have changed the inode type in between
	 */
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
		retval = ext4_ext_map_blocks(handle, inode, map, flags);
	} else {
		retval = ext4_ind_map_blocks(handle, inode, map, flags);

		if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
			/*
			 * We allocated new blocks which will result in
			 * i_data's format changing.  Force the migrate
			 * to fail by clearing migrate flags
			 */
			ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
		}

		/*
		 * Update reserved blocks/metadata blocks after successful
		 * block allocation which had been deferred till now. We don't
		 * support fallocate for non extent files. So we can update
		 * reserve space here.
		 */
		if ((retval > 0) &&
		    (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
			ext4_da_update_reserve_space(inode, retval, 1);
	}

	if (retval > 0) {
		unsigned int status;

		if (unlikely(retval != map->m_len)) {
			ext4_warning(inode->i_sb,
				     "ES len assertion failed for inode "
				     "%lu: retval %d != map->m_len %d",
				     inode->i_ino, retval, map->m_len);
			WARN_ON(1);
		}

		/*
		 * We have to zeroout blocks before inserting them into extent
		 * status tree. Otherwise someone could look them up there and
		 * use them before they are really zeroed. We also have to
		 * unmap metadata before zeroing as otherwise writeback can
		 * overwrite zeros with stale data from block device.
		 */
		if (flags & EXT4_GET_BLOCKS_ZERO &&
		    map->m_flags & EXT4_MAP_MAPPED &&
		    map->m_flags & EXT4_MAP_NEW) {
			ret = ext4_issue_zeroout(inode, map->m_lblk,
						 map->m_pblk, map->m_len);
			if (ret) {
				retval = ret;
				goto out_sem;
			}
		}

		/*
		 * If the extent has been zeroed out, we don't need to update
		 * extent status tree.
		 */
		if ((flags & EXT4_GET_BLOCKS_PRE_IO) &&
		    ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
			if (ext4_es_is_written(&es))
				goto out_sem;
		}
		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
		if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
		    !(status & EXTENT_STATUS_WRITTEN) &&
		    ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
				       map->m_lblk + map->m_len - 1))
			status |= EXTENT_STATUS_DELAYED;
		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
					    map->m_pblk, status);
		if (ret < 0) {
			retval = ret;
			goto out_sem;
		}
	}

out_sem:
	up_write((&EXT4_I(inode)->i_data_sem));
	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
		ret = check_block_validity(inode, map);
		if (ret != 0)
			return ret;

		/*
		 * Inodes with freshly allocated blocks where contents will be
		 * visible after transaction commit must be on transaction's
		 * ordered data list.
		 */
		if (map->m_flags & EXT4_MAP_NEW &&
		    !(map->m_flags & EXT4_MAP_UNWRITTEN) &&
		    !(flags & EXT4_GET_BLOCKS_ZERO) &&
		    !ext4_is_quota_file(inode) &&
		    ext4_should_order_data(inode)) {
			loff_t start_byte =
				(loff_t)map->m_lblk << inode->i_blkbits;
			loff_t length = (loff_t)map->m_len << inode->i_blkbits;

			if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
				ret = ext4_jbd2_inode_add_wait(handle, inode,
						start_byte, length);
			else
				ret = ext4_jbd2_inode_add_write(handle, inode,
						start_byte, length);
			if (ret)
				return ret;
		}
	}
	return retval;
}

/*
 * Update EXT4_MAP_FLAGS in bh->b_state. For buffer heads attached to pages
 * we have to be careful as someone else may be manipulating b_state as well.
 */
static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
{
	unsigned long old_state;
	unsigned long new_state;

	flags &= EXT4_MAP_FLAGS;

	/* Dummy buffer_head? Set non-atomically. */
	if (!bh->b_page) {
		bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags;
		return;
	}
	/*
	 * Someone else may be modifying b_state. Be careful! This is ugly but
	 * once we get rid of using bh as a container for mapping information
	 * to pass to / from get_block functions, this can go away.
	 */
	do {
		old_state = READ_ONCE(bh->b_state);
		new_state = (old_state & ~EXT4_MAP_FLAGS) | flags;
	} while (unlikely(
		 cmpxchg(&bh->b_state, old_state, new_state) != old_state));
}

static int _ext4_get_block(struct inode *inode, sector_t iblock,
			   struct buffer_head *bh, int flags)
{
	struct ext4_map_blocks map;
	int ret = 0;

	if (ext4_has_inline_data(inode))
		return -ERANGE;

	map.m_lblk = iblock;
	map.m_len = bh->b_size >> inode->i_blkbits;

	ret = ext4_map_blocks(ext4_journal_current_handle(), inode, &map,
			      flags);
	if (ret > 0) {
		map_bh(bh, inode->i_sb, map.m_pblk);
		ext4_update_bh_state(bh, map.m_flags);
		bh->b_size = inode->i_sb->s_blocksize * map.m_len;
		ret = 0;
	} else if (ret == 0) {
		/* hole case, need to fill in bh->b_size */
		bh->b_size = inode->i_sb->s_blocksize * map.m_len;
	}
	return ret;
}

int ext4_get_block(struct inode *inode, sector_t iblock,
		   struct buffer_head *bh, int create)
{
	return _ext4_get_block(inode, iblock, bh,
			       create ? EXT4_GET_BLOCKS_CREATE : 0);
}

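/*
 * Editor's note: a quick reference for the calling convention used by the
 * get_block helpers above (descriptive only, derived from ext4_map_blocks()
 * and _ext4_get_block()).  A caller fills in the logical range and then
 * interprets the return value, roughly:
 *
 *	struct ext4_map_blocks map = { .m_lblk = lblk, .m_len = len };
 *	ret = ext4_map_blocks(handle, inode, &map, flags);
 *	ret > 0:  'ret' blocks are mapped/allocated starting at map.m_pblk,
 *		  with EXT4_MAP_MAPPED/UNWRITTEN/NEW set in map.m_flags;
 *	ret == 0: a hole at map.m_lblk, map.m_len giving the hole length;
 *	ret < 0:  error.
 */
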
/*
 * Get block function used when preparing for buffered write if we require
 * creating an unwritten extent if blocks haven't been allocated.  The extent
 * will be converted to written after the IO is complete.
 */
int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
			     struct buffer_head *bh_result, int create)
{
	ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n",
		   inode->i_ino, create);
	return _ext4_get_block(inode, iblock, bh_result,
			       EXT4_GET_BLOCKS_IO_CREATE_EXT);
}

/* Maximum number of blocks we map for direct IO at once. */
#define DIO_MAX_BLOCKS 4096

/*
 * `handle' can be NULL if create is zero
 */
struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
				ext4_lblk_t block, int map_flags)
{
	struct ext4_map_blocks map;
	struct buffer_head *bh;
	int create = map_flags & EXT4_GET_BLOCKS_CREATE;
	int err;

	J_ASSERT(handle != NULL || create == 0);

	map.m_lblk = block;
	map.m_len = 1;
	err = ext4_map_blocks(handle, inode, &map, map_flags);

	if (err == 0)
		return create ? ERR_PTR(-ENOSPC) : NULL;
	if (err < 0)
		return ERR_PTR(err);

	bh = sb_getblk(inode->i_sb, map.m_pblk);
	if (unlikely(!bh))
		return ERR_PTR(-ENOMEM);
	if (map.m_flags & EXT4_MAP_NEW) {
		J_ASSERT(create != 0);
		J_ASSERT(handle != NULL);

		/*
		 * Now that we do not always journal data, we should
		 * keep in mind whether this should always journal the
		 * new buffer as metadata.  For now, regular file
		 * writes use ext4_get_block instead, so it's not a
		 * problem.
		 */
		lock_buffer(bh);
		BUFFER_TRACE(bh, "call get_create_access");
		err = ext4_journal_get_create_access(handle, bh);
		if (unlikely(err)) {
			unlock_buffer(bh);
			goto errout;
		}
		if (!buffer_uptodate(bh)) {
			memset(bh->b_data, 0, inode->i_sb->s_blocksize);
			set_buffer_uptodate(bh);
		}
		unlock_buffer(bh);
		BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
		err = ext4_handle_dirty_metadata(handle, inode, bh);
		if (unlikely(err))
			goto errout;
	} else
		BUFFER_TRACE(bh, "not a new buffer");
	return bh;
errout:
	brelse(bh);
	return ERR_PTR(err);
}

struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
			       ext4_lblk_t block, int map_flags)
{
	struct buffer_head *bh;

	bh = ext4_getblk(handle, inode, block, map_flags);
	if (IS_ERR(bh))
		return bh;
	if (!bh || ext4_buffer_uptodate(bh))
		return bh;
	ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh);
	wait_on_buffer(bh);
	if (buffer_uptodate(bh))
		return bh;
	put_bh(bh);
	return ERR_PTR(-EIO);
}

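/*
 * Editor's note on the return convention of ext4_getblk()/ext4_bread()
 * (summary of the code above): an ERR_PTR() is returned on error, NULL is
 * returned for a hole when the caller did not ask for block creation, and a
 * valid buffer_head is returned otherwise.  With EXT4_GET_BLOCKS_CREATE a
 * hole cannot be returned, so a zero mapping count becomes ERR_PTR(-ENOSPC).
 * ext4_bread() additionally reads the buffer from disk if it is not already
 * up to date.
 */
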
/* Read a contiguous batch of blocks. */
int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count,
		     bool wait, struct buffer_head **bhs)
{
	int i, err;

	for (i = 0; i < bh_count; i++) {
		bhs[i] = ext4_getblk(NULL, inode, block + i, 0 /* map_flags */);
		if (IS_ERR(bhs[i])) {
			err = PTR_ERR(bhs[i]);
			bh_count = i;
			goto out_brelse;
		}
	}

	for (i = 0; i < bh_count; i++)
		/* Note that NULL bhs[i] is valid because of holes. */
		if (bhs[i] && !ext4_buffer_uptodate(bhs[i]))
			ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1,
				    &bhs[i]);

	if (!wait)
		return 0;

	for (i = 0; i < bh_count; i++)
		if (bhs[i])
			wait_on_buffer(bhs[i]);

	for (i = 0; i < bh_count; i++) {
		if (bhs[i] && !buffer_uptodate(bhs[i])) {
			err = -EIO;
			goto out_brelse;
		}
	}
	return 0;

out_brelse:
	for (i = 0; i < bh_count; i++) {
		brelse(bhs[i]);
		bhs[i] = NULL;
	}
	return err;
}

int ext4_walk_page_buffers(handle_t *handle,
			   struct buffer_head *head,
			   unsigned from,
			   unsigned to,
			   int *partial,
			   int (*fn)(handle_t *handle,
				     struct buffer_head *bh))
{
	struct buffer_head *bh;
	unsigned block_start, block_end;
	unsigned blocksize = head->b_size;
	int err, ret = 0;
	struct buffer_head *next;

	for (bh = head, block_start = 0;
	     ret == 0 && (bh != head || !block_start);
	     block_start = block_end, bh = next) {
		next = bh->b_this_page;
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			if (partial && !buffer_uptodate(bh))
				*partial = 1;
			continue;
		}
		err = (*fn)(handle, bh);
		if (!ret)
			ret = err;
	}
	return ret;
}

/*
 * To preserve ordering, it is essential that the hole instantiation and
 * the data write be encapsulated in a single transaction.  We cannot
 * close off a transaction and start a new one between the ext4_get_block()
 * and the commit_write().  So doing the jbd2_journal_start at the start of
 * prepare_write() is the right place.
 *
 * Also, this function can nest inside ext4_writepage().  In that case, we
 * *know* that ext4_writepage() has generated enough buffer credits to do the
 * whole page.  So we won't block on the journal in that case, which is good,
 * because the caller may be PF_MEMALLOC.
 *
 * By accident, ext4 can be reentered when a transaction is open via
 * quota file writes.  If we were to commit the transaction while thus
 * reentered, there can be a deadlock - we would be holding a quota
 * lock, and the commit would never complete if another thread had a
 * transaction open and was blocking on the quota lock - a ranking
 * violation.
 *
 * So what we do is to rely on the fact that jbd2_journal_stop/journal_start
 * will _not_ run commit under these circumstances because handle->h_ref
 * is elevated.  We'll still have enough credits for the tiny quotafile
 * write.
 */
int do_journal_get_write_access(handle_t *handle,
				struct buffer_head *bh)
{
	int dirty = buffer_dirty(bh);
	int ret;

	if (!buffer_mapped(bh) || buffer_freed(bh))
		return 0;
	/*
	 * __block_write_begin() could have dirtied some buffers. Clean
	 * the dirty bit as jbd2_journal_get_write_access() could complain
	 * otherwise about fs integrity issues. Setting of the dirty bit
	 * by __block_write_begin() isn't a real problem here as we clear
	 * the bit before releasing a page lock and thus writeback cannot
	 * ever write the buffer.
	 */
	if (dirty)
		clear_buffer_dirty(bh);
	BUFFER_TRACE(bh, "get write access");
	ret = ext4_journal_get_write_access(handle, bh);
	if (!ret && dirty)
		ret = ext4_handle_dirty_metadata(handle, NULL, bh);
	return ret;
}

#ifdef CONFIG_FS_ENCRYPTION
static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
				  get_block_t *get_block)
{
	unsigned from = pos & (PAGE_SIZE - 1);
	unsigned to = from + len;
	struct inode *inode = page->mapping->host;
	unsigned block_start, block_end;
	sector_t block;
	int err = 0;
	unsigned blocksize = inode->i_sb->s_blocksize;
	unsigned bbits;
	struct buffer_head *bh, *head, *wait[2];
	int nr_wait = 0;
	int i;

	BUG_ON(!PageLocked(page));
	BUG_ON(from > PAGE_SIZE);
	BUG_ON(to > PAGE_SIZE);
	BUG_ON(from > to);

	if (!page_has_buffers(page))
		create_empty_buffers(page, blocksize, 0);
	head = page_buffers(page);
	bbits = ilog2(blocksize);
	block = (sector_t)page->index << (PAGE_SHIFT - bbits);

	for (bh = head, block_start = 0; bh != head || !block_start;
	     block++, block_start = block_end, bh = bh->b_this_page) {
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			if (PageUptodate(page)) {
				if (!buffer_uptodate(bh))
					set_buffer_uptodate(bh);
			}
			continue;
		}
		if (buffer_new(bh))
			clear_buffer_new(bh);
		if (!buffer_mapped(bh)) {
			WARN_ON(bh->b_size != blocksize);
			err = get_block(inode, block, bh, 1);
			if (err)
				break;
			if (buffer_new(bh)) {
				if (PageUptodate(page)) {
					clear_buffer_new(bh);
					set_buffer_uptodate(bh);
					mark_buffer_dirty(bh);
					continue;
				}
				if (block_end > to || block_start < from)
					zero_user_segments(page, to, block_end,
							   block_start, from);
				continue;
			}
		}
		if (PageUptodate(page)) {
			if (!buffer_uptodate(bh))
				set_buffer_uptodate(bh);
			continue;
		}
		if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
		    !buffer_unwritten(bh) &&
		    (block_start < from || block_end > to)) {
			ll_rw_block(REQ_OP_READ, 0, 1, &bh);
			wait[nr_wait++] = bh;
		}
	}
	/*
	 * If we issued read requests, let them complete.
	 */
	for (i = 0; i < nr_wait; i++) {
		wait_on_buffer(wait[i]);
		if (!buffer_uptodate(wait[i]))
			err = -EIO;
	}
	if (unlikely(err)) {
		page_zero_new_buffers(page, from, to);
	} else if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) {
		for (i = 0; i < nr_wait; i++) {
			int err2;

			err2 = fscrypt_decrypt_pagecache_blocks(page, blocksize,
								bh_offset(wait[i]));
			if (err2) {
				clear_buffer_uptodate(wait[i]);
				err = err2;
			}
		}
	}

	return err;
}
#endif

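/*
 * Editor's note: ext4_block_write_begin() above mirrors the generic
 * __block_write_begin(), with one ext4-specific addition - when the inode is
 * encrypted, blocks that had to be read from disk are passed through
 * fscrypt_decrypt_pagecache_blocks() before the write can proceed.  Without
 * CONFIG_FS_ENCRYPTION, ext4_write_begin() below simply calls
 * __block_write_begin() instead.
 */
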
static int ext4_write_begin(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned len, unsigned flags,
			    struct page **pagep, void **fsdata)
{
	struct inode *inode = mapping->host;
	int ret, needed_blocks;
	handle_t *handle;
	int retries = 0;
	struct page *page;
	pgoff_t index;
	unsigned from, to;

	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
		return -EIO;

	trace_ext4_write_begin(inode, pos, len, flags);
	/*
	 * Reserve one block more for addition to orphan list in case
	 * we allocate blocks but write fails for some reason
	 */
	needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
	index = pos >> PAGE_SHIFT;
	from = pos & (PAGE_SIZE - 1);
	to = from + len;

	if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
		ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
						    flags, pagep);
		if (ret < 0)
			return ret;
		if (ret == 1)
			return 0;
	}

	/*
	 * grab_cache_page_write_begin() can take a long time if the
	 * system is thrashing due to memory pressure, or if the page
	 * is being written back.  So grab it first before we start
	 * the transaction handle.  This also allows us to allocate
	 * the page (if needed) without using GFP_NOFS.
	 */
retry_grab:
	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page)
		return -ENOMEM;
	unlock_page(page);

retry_journal:
	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
	if (IS_ERR(handle)) {
		put_page(page);
		return PTR_ERR(handle);
	}

	lock_page(page);
	if (page->mapping != mapping) {
		/* The page got truncated from under us */
		unlock_page(page);
		put_page(page);
		ext4_journal_stop(handle);
		goto retry_grab;
	}
	/* In case writeback began while the page was unlocked */
	wait_for_stable_page(page);

#ifdef CONFIG_FS_ENCRYPTION
	if (ext4_should_dioread_nolock(inode))
		ret = ext4_block_write_begin(page, pos, len,
					     ext4_get_block_unwritten);
	else
		ret = ext4_block_write_begin(page, pos, len,
					     ext4_get_block);
#else
	if (ext4_should_dioread_nolock(inode))
		ret = __block_write_begin(page, pos, len,
					  ext4_get_block_unwritten);
	else
		ret = __block_write_begin(page, pos, len, ext4_get_block);
#endif
	if (!ret && ext4_should_journal_data(inode)) {
		ret = ext4_walk_page_buffers(handle, page_buffers(page),
					     from, to, NULL,
					     do_journal_get_write_access);
	}

	if (ret) {
		bool extended = (pos + len > inode->i_size) &&
				!ext4_verity_in_progress(inode);

		unlock_page(page);
		/*
		 * __block_write_begin may have instantiated a few blocks
		 * outside i_size.  Trim these off again. Don't need
		 * i_size_read because we hold i_mutex.
		 *
		 * Add inode to orphan list in case we crash before
		 * truncate finishes
		 */
		if (extended && ext4_can_truncate(inode))
			ext4_orphan_add(handle, inode);

		ext4_journal_stop(handle);
		if (extended) {
			ext4_truncate_failed_write(inode);
			/*
			 * If truncate failed early the inode might
			 * still be on the orphan list; we need to
			 * make sure the inode is removed from the
			 * orphan list in that case.
			 */
			if (inode->i_nlink)
				ext4_orphan_del(NULL, inode);
		}

		if (ret == -ENOSPC &&
		    ext4_should_retry_alloc(inode->i_sb, &retries))
			goto retry_journal;
		put_page(page);
		return ret;
	}
	*pagep = page;
	return ret;
}

/* For write_end() in data=journal mode */
static int write_end_fn(handle_t *handle, struct buffer_head *bh)
{
	int ret;
	if (!buffer_mapped(bh) || buffer_freed(bh))
		return 0;
	set_buffer_uptodate(bh);
	ret = ext4_handle_dirty_metadata(handle, NULL, bh);
	clear_buffer_meta(bh);
	clear_buffer_prio(bh);
	return ret;
}

/*
 * We need to pick up the new inode size which generic_commit_write gave us.
 * `file' can be NULL - eg, when called from page_symlink().
 *
 * ext4 never places buffers on inode->i_mapping->private_list.  Metadata
 * buffers are managed internally.
 */
static int ext4_write_end(struct file *file,
			  struct address_space *mapping,
			  loff_t pos, unsigned len, unsigned copied,
			  struct page *page, void *fsdata)
{
	handle_t *handle = ext4_journal_current_handle();
	struct inode *inode = mapping->host;
	loff_t old_size = inode->i_size;
	int ret = 0, ret2;
	int i_size_changed = 0;
	int inline_data = ext4_has_inline_data(inode);
	bool verity = ext4_verity_in_progress(inode);

	trace_ext4_write_end(inode, pos, len, copied);
	if (inline_data) {
		ret = ext4_write_inline_data_end(inode, pos, len,
						 copied, page);
		if (ret < 0) {
			unlock_page(page);
			put_page(page);
			goto errout;
		}
		copied = ret;
	} else
		copied = block_write_end(file, mapping, pos,
					 len, copied, page, fsdata);
	/*
	 * It's important to update i_size while still holding page lock:
	 * page writeout could otherwise come in and zero beyond i_size.
	 *
	 * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree
	 * blocks are being written past EOF, so skip the i_size update.
	 */
	if (!verity)
		i_size_changed = ext4_update_inode_size(inode, pos + copied);
	unlock_page(page);
	put_page(page);

	if (old_size < pos && !verity)
		pagecache_isize_extended(inode, old_size, pos);
	/*
	 * Don't mark the inode dirty under page lock. First, it unnecessarily
	 * makes the holding time of page lock longer. Second, it forces lock
	 * ordering of page lock and transaction start for journaling
	 * filesystems.
	 */
	if (i_size_changed || inline_data)
		ext4_mark_inode_dirty(handle, inode);

	if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
		/* If we have allocated more blocks and copied less, we
		 * will have blocks allocated outside inode->i_size.
		 * So truncate them.
		 */
So truncate them 1301 */ 1302 ext4_orphan_add(handle, inode); 1303 errout: 1304 ret2 = ext4_journal_stop(handle); 1305 if (!ret) 1306 ret = ret2; 1307 1308 if (pos + len > inode->i_size && !verity) { 1309 ext4_truncate_failed_write(inode); 1310 /* 1311 * If truncate failed early the inode might still be 1312 * on the orphan list; we need to make sure the inode 1313 * is removed from the orphan list in that case. 1314 */ 1315 if (inode->i_nlink) 1316 ext4_orphan_del(NULL, inode); 1317 } 1318 1319 return ret ? ret : copied; 1320 } 1321 1322 /* 1323 * This is a private version of page_zero_new_buffers() which doesn't 1324 * set the buffer to be dirty, since in data=journalled mode we need 1325 * to call ext4_handle_dirty_metadata() instead. 1326 */ 1327 static void ext4_journalled_zero_new_buffers(handle_t *handle, 1328 struct page *page, 1329 unsigned from, unsigned to) 1330 { 1331 unsigned int block_start = 0, block_end; 1332 struct buffer_head *head, *bh; 1333 1334 bh = head = page_buffers(page); 1335 do { 1336 block_end = block_start + bh->b_size; 1337 if (buffer_new(bh)) { 1338 if (block_end > from && block_start < to) { 1339 if (!PageUptodate(page)) { 1340 unsigned start, size; 1341 1342 start = max(from, block_start); 1343 size = min(to, block_end) - start; 1344 1345 zero_user(page, start, size); 1346 write_end_fn(handle, bh); 1347 } 1348 clear_buffer_new(bh); 1349 } 1350 } 1351 block_start = block_end; 1352 bh = bh->b_this_page; 1353 } while (bh != head); 1354 } 1355 1356 static int ext4_journalled_write_end(struct file *file, 1357 struct address_space *mapping, 1358 loff_t pos, unsigned len, unsigned copied, 1359 struct page *page, void *fsdata) 1360 { 1361 handle_t *handle = ext4_journal_current_handle(); 1362 struct inode *inode = mapping->host; 1363 loff_t old_size = inode->i_size; 1364 int ret = 0, ret2; 1365 int partial = 0; 1366 unsigned from, to; 1367 int size_changed = 0; 1368 int inline_data = ext4_has_inline_data(inode); 1369 bool verity = ext4_verity_in_progress(inode); 1370 1371 trace_ext4_journalled_write_end(inode, pos, len, copied); 1372 from = pos & (PAGE_SIZE - 1); 1373 to = from + len; 1374 1375 BUG_ON(!ext4_handle_valid(handle)); 1376 1377 if (inline_data) { 1378 ret = ext4_write_inline_data_end(inode, pos, len, 1379 copied, page); 1380 if (ret < 0) { 1381 unlock_page(page); 1382 put_page(page); 1383 goto errout; 1384 } 1385 copied = ret; 1386 } else if (unlikely(copied < len) && !PageUptodate(page)) { 1387 copied = 0; 1388 ext4_journalled_zero_new_buffers(handle, page, from, to); 1389 } else { 1390 if (unlikely(copied < len)) 1391 ext4_journalled_zero_new_buffers(handle, page, 1392 from + copied, to); 1393 ret = ext4_walk_page_buffers(handle, page_buffers(page), from, 1394 from + copied, &partial, 1395 write_end_fn); 1396 if (!partial) 1397 SetPageUptodate(page); 1398 } 1399 if (!verity) 1400 size_changed = ext4_update_inode_size(inode, pos + copied); 1401 ext4_set_inode_state(inode, EXT4_STATE_JDATA); 1402 EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; 1403 unlock_page(page); 1404 put_page(page); 1405 1406 if (old_size < pos && !verity) 1407 pagecache_isize_extended(inode, old_size, pos); 1408 1409 if (size_changed || inline_data) { 1410 ret2 = ext4_mark_inode_dirty(handle, inode); 1411 if (!ret) 1412 ret = ret2; 1413 } 1414 1415 if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) 1416 /* if we have allocated more blocks and copied 1417 * less. We will have blocks allocated outside 1418 * inode->i_size. 
So truncate them 1419 */ 1420 ext4_orphan_add(handle, inode); 1421 1422 errout: 1423 ret2 = ext4_journal_stop(handle); 1424 if (!ret) 1425 ret = ret2; 1426 if (pos + len > inode->i_size && !verity) { 1427 ext4_truncate_failed_write(inode); 1428 /* 1429 * If truncate failed early the inode might still be 1430 * on the orphan list; we need to make sure the inode 1431 * is removed from the orphan list in that case. 1432 */ 1433 if (inode->i_nlink) 1434 ext4_orphan_del(NULL, inode); 1435 } 1436 1437 return ret ? ret : copied; 1438 } 1439 1440 /* 1441 * Reserve space for a single cluster 1442 */ 1443 static int ext4_da_reserve_space(struct inode *inode) 1444 { 1445 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1446 struct ext4_inode_info *ei = EXT4_I(inode); 1447 int ret; 1448 1449 /* 1450 * We will charge metadata quota at writeout time; this saves 1451 * us from metadata over-estimation, though we may go over by 1452 * a small amount in the end. Here we just reserve for data. 1453 */ 1454 ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1)); 1455 if (ret) 1456 return ret; 1457 1458 spin_lock(&ei->i_block_reservation_lock); 1459 if (ext4_claim_free_clusters(sbi, 1, 0)) { 1460 spin_unlock(&ei->i_block_reservation_lock); 1461 dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); 1462 return -ENOSPC; 1463 } 1464 ei->i_reserved_data_blocks++; 1465 trace_ext4_da_reserve_space(inode); 1466 spin_unlock(&ei->i_block_reservation_lock); 1467 1468 return 0; /* success */ 1469 } 1470 1471 void ext4_da_release_space(struct inode *inode, int to_free) 1472 { 1473 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1474 struct ext4_inode_info *ei = EXT4_I(inode); 1475 1476 if (!to_free) 1477 return; /* Nothing to release, exit */ 1478 1479 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1480 1481 trace_ext4_da_release_space(inode, to_free); 1482 if (unlikely(to_free > ei->i_reserved_data_blocks)) { 1483 /* 1484 * if there aren't enough reserved blocks, then the 1485 * counter is messed up somewhere. Since this 1486 * function is called from invalidate page, it's 1487 * harmless to return without any action. 1488 */ 1489 ext4_warning(inode->i_sb, "ext4_da_release_space: " 1490 "ino %lu, to_free %d with only %d reserved " 1491 "data blocks", inode->i_ino, to_free, 1492 ei->i_reserved_data_blocks); 1493 WARN_ON(1); 1494 to_free = ei->i_reserved_data_blocks; 1495 } 1496 ei->i_reserved_data_blocks -= to_free; 1497 1498 /* update fs dirty data blocks counter */ 1499 percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free); 1500 1501 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1502 1503 dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free)); 1504 } 1505 1506 /* 1507 * Delayed allocation stuff 1508 */ 1509 1510 struct mpage_da_data { 1511 struct inode *inode; 1512 struct writeback_control *wbc; 1513 1514 pgoff_t first_page; /* The first page to write */ 1515 pgoff_t next_page; /* Current page to examine */ 1516 pgoff_t last_page; /* Last page to examine */ 1517 /* 1518 * Extent to map - this can be after first_page because that can be 1519 * fully mapped. We somewhat abuse m_flags to store whether the extent 1520 * is delalloc or unwritten. 
/*
 * Delayed allocation stuff
 */

struct mpage_da_data {
	struct inode *inode;
	struct writeback_control *wbc;

	pgoff_t first_page;	/* The first page to write */
	pgoff_t next_page;	/* Current page to examine */
	pgoff_t last_page;	/* Last page to examine */
	/*
	 * Extent to map - this can be after first_page because that can be
	 * fully mapped. We somewhat abuse m_flags to store whether the extent
	 * is delalloc or unwritten.
	 */
	struct ext4_map_blocks map;
	struct ext4_io_submit io_submit;	/* IO submission data */
	unsigned int do_map:1;
};

static void mpage_release_unused_pages(struct mpage_da_data *mpd,
				       bool invalidate)
{
	int nr_pages, i;
	pgoff_t index, end;
	struct pagevec pvec;
	struct inode *inode = mpd->inode;
	struct address_space *mapping = inode->i_mapping;

	/* This is necessary when next_page == 0. */
	if (mpd->first_page >= mpd->next_page)
		return;

	index = mpd->first_page;
	end   = mpd->next_page - 1;
	if (invalidate) {
		ext4_lblk_t start, last;
		start = index << (PAGE_SHIFT - inode->i_blkbits);
		last = end << (PAGE_SHIFT - inode->i_blkbits);
		ext4_es_remove_extent(inode, start, last - start + 1);
	}

	pagevec_init(&pvec);
	while (index <= end) {
		nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end);
		if (nr_pages == 0)
			break;
		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			BUG_ON(!PageLocked(page));
			BUG_ON(PageWriteback(page));
			if (invalidate) {
				if (page_mapped(page))
					clear_page_dirty_for_io(page);
				block_invalidatepage(page, 0, PAGE_SIZE);
				ClearPageUptodate(page);
			}
			unlock_page(page);
		}
		pagevec_release(&pvec);
	}
}

static void ext4_print_free_blocks(struct inode *inode)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct super_block *sb = inode->i_sb;
	struct ext4_inode_info *ei = EXT4_I(inode);

	ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld",
		 EXT4_C2B(EXT4_SB(inode->i_sb),
			  ext4_count_free_clusters(sb)));
	ext4_msg(sb, KERN_CRIT, "Free/Dirty block details");
	ext4_msg(sb, KERN_CRIT, "free_blocks=%lld",
		 (long long) EXT4_C2B(EXT4_SB(sb),
			percpu_counter_sum(&sbi->s_freeclusters_counter)));
	ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld",
		 (long long) EXT4_C2B(EXT4_SB(sb),
			percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
	ext4_msg(sb, KERN_CRIT, "Block reservation details");
	ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u",
		 ei->i_reserved_data_blocks);
	return;
}

static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
{
	return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
}

/*
 * ext4_insert_delayed_block - adds a delayed block to the extents status
 *                             tree, incrementing the reserved cluster/block
 *                             count or making a pending reservation
 *                             where needed
 *
 * @inode - file containing the newly added block
 * @lblk - logical block to be added
 *
 * Returns 0 on success, negative error code on failure.
 */
static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	int ret;
	bool allocated = false;

	/*
	 * If the cluster containing lblk is shared with a delayed,
	 * written, or unwritten extent in a bigalloc file system, it's
	 * already been accounted for and does not need to be reserved.
	 * A pending reservation must be made for the cluster if it's
	 * shared with a written or unwritten extent and doesn't already
	 * have one.  Written and unwritten extents can be purged from the
	 * extents status tree if the system is under memory pressure, so
	 * it's necessary to examine the extent tree if a search of the
	 * extents status tree doesn't get a match.
	 */
	if (sbi->s_cluster_ratio == 1) {
		ret = ext4_da_reserve_space(inode);
		if (ret != 0)   /* ENOSPC */
			goto errout;
	} else {   /* bigalloc */
		if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
			if (!ext4_es_scan_clu(inode,
					      &ext4_es_is_mapped, lblk)) {
				ret = ext4_clu_mapped(inode,
						      EXT4_B2C(sbi, lblk));
				if (ret < 0)
					goto errout;
				if (ret == 0) {
					ret = ext4_da_reserve_space(inode);
					if (ret != 0)   /* ENOSPC */
						goto errout;
				} else {
					allocated = true;
				}
			} else {
				allocated = true;
			}
		}
	}

	ret = ext4_es_insert_delayed_block(inode, lblk, allocated);

errout:
	return ret;
}

/*
 * This function grabs code from the very beginning of ext4_map_blocks,
 * but assumes that the caller is coming from the delayed write path.
 * It looks up the requested blocks and sets the buffer delay bit under
 * the protection of i_data_sem.
 */
static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
			      struct ext4_map_blocks *map,
			      struct buffer_head *bh)
{
	struct extent_status es;
	int retval;
	sector_t invalid_block = ~((sector_t) 0xffff);
#ifdef ES_AGGRESSIVE_TEST
	struct ext4_map_blocks orig_map;

	memcpy(&orig_map, map, sizeof(*map));
#endif

	if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
		invalid_block = ~0;

	map->m_flags = 0;
	ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"
		  "logical block %lu\n", inode->i_ino, map->m_len,
		  (unsigned long) map->m_lblk);

	/* Lookup extent status tree firstly */
	if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) {
		if (ext4_es_is_hole(&es)) {
			retval = 0;
			down_read(&EXT4_I(inode)->i_data_sem);
			goto add_delayed;
		}

		/*
		 * Delayed extent could be allocated by fallocate.
		 * So we need to check it.
		 */
		if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) {
			map_bh(bh, inode->i_sb, invalid_block);
			set_buffer_new(bh);
			set_buffer_delay(bh);
			return 0;
		}

		map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk;
		retval = es.es_len - (iblock - es.es_lblk);
		if (retval > map->m_len)
			retval = map->m_len;
		map->m_len = retval;
		if (ext4_es_is_written(&es))
			map->m_flags |= EXT4_MAP_MAPPED;
		else if (ext4_es_is_unwritten(&es))
			map->m_flags |= EXT4_MAP_UNWRITTEN;
		else
			BUG();

#ifdef ES_AGGRESSIVE_TEST
		ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0);
#endif
		return retval;
	}

	/*
	 * Try to see if we can get the block without requesting a new
	 * file system block.
	 */
	down_read(&EXT4_I(inode)->i_data_sem);
	if (ext4_has_inline_data(inode))
		retval = 0;
	else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
		retval = ext4_ext_map_blocks(NULL, inode, map, 0);
	else
		retval = ext4_ind_map_blocks(NULL, inode, map, 0);

add_delayed:
	if (retval == 0) {
		int ret;

		/*
		 * XXX: __block_prepare_write() unmaps passed block,
		 * is it OK?
		 */

		ret = ext4_insert_delayed_block(inode, map->m_lblk);
		if (ret != 0) {
			retval = ret;
			goto out_unlock;
		}

		map_bh(bh, inode->i_sb, invalid_block);
		set_buffer_new(bh);
		set_buffer_delay(bh);
	} else if (retval > 0) {
		int ret;
		unsigned int status;

		if (unlikely(retval != map->m_len)) {
			ext4_warning(inode->i_sb,
				     "ES len assertion failed for inode "
				     "%lu: retval %d != map->m_len %d",
				     inode->i_ino, retval, map->m_len);
			WARN_ON(1);
		}

		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
					    map->m_pblk, status);
		if (ret != 0)
			retval = ret;
	}

out_unlock:
	up_read((&EXT4_I(inode)->i_data_sem));

	return retval;
}

/*
 * This is a special get_block_t callback which is used by
 * ext4_da_write_begin().  It will either return mapped block or
 * reserve space for a single block.
 *
 * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set.
 * We also have b_blocknr = -1 and b_bdev initialized properly
 *
 * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set.
 * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev
 * initialized properly.
 */
int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
			   struct buffer_head *bh, int create)
{
	struct ext4_map_blocks map;
	int ret = 0;

	BUG_ON(create == 0);
	BUG_ON(bh->b_size != inode->i_sb->s_blocksize);

	map.m_lblk = iblock;
	map.m_len = 1;

	/*
	 * first, we need to know whether the block is allocated already;
	 * preallocated blocks are unmapped but should be treated
	 * the same as allocated blocks.
	 */
	ret = ext4_da_map_blocks(inode, iblock, &map, bh);
	if (ret <= 0)
		return ret;

	map_bh(bh, inode->i_sb, map.m_pblk);
	ext4_update_bh_state(bh, map.m_flags);

	if (buffer_unwritten(bh)) {
		/* A delayed write to unwritten bh should be marked
		 * new and mapped.  Mapped ensures that we don't do
		 * get_block multiple times when we write to the same
		 * offset and new ensures that we do proper zero out
		 * for partial write.
		 */
		set_buffer_new(bh);
		set_buffer_mapped(bh);
	}
	return 0;
}

static int bget_one(handle_t *handle, struct buffer_head *bh)
{
	get_bh(bh);
	return 0;
}

static int bput_one(handle_t *handle, struct buffer_head *bh)
{
	put_bh(bh);
	return 0;
}

static int __ext4_journalled_writepage(struct page *page,
				       unsigned int len)
{
	struct address_space *mapping = page->mapping;
	struct inode *inode = mapping->host;
	struct buffer_head *page_bufs = NULL;
	handle_t *handle = NULL;
	int ret = 0, err = 0;
	int inline_data = ext4_has_inline_data(inode);
	struct buffer_head *inode_bh = NULL;

	ClearPageChecked(page);

	if (inline_data) {
		BUG_ON(page->index != 0);
		BUG_ON(len > ext4_get_max_inline_size(inode));
		inode_bh = ext4_journalled_write_inline_data(inode, len, page);
		if (inode_bh == NULL)
			goto out;
	} else {
		page_bufs = page_buffers(page);
		if (!page_bufs) {
			BUG();
			goto out;
		}
		ext4_walk_page_buffers(handle, page_bufs, 0, len,
				       NULL, bget_one);
	}
	/*
	 * We need to release the page lock before we start the
	 * journal, so grab a reference so the page won't disappear
	 * out from under us.
	 */
	get_page(page);
	unlock_page(page);

	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
				    ext4_writepage_trans_blocks(inode));
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		put_page(page);
		goto out_no_pagelock;
	}
	BUG_ON(!ext4_handle_valid(handle));

	lock_page(page);
	put_page(page);
	if (page->mapping != mapping) {
		/* The page got truncated from under us */
		ext4_journal_stop(handle);
		ret = 0;
		goto out;
	}

	if (inline_data) {
		ret = ext4_mark_inode_dirty(handle, inode);
	} else {
		ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
					     do_journal_get_write_access);

		err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
					     write_end_fn);
	}
	if (ret == 0)
		ret = err;
	EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
	err = ext4_journal_stop(handle);
	if (!ret)
		ret = err;

	if (!ext4_has_inline_data(inode))
		ext4_walk_page_buffers(NULL, page_bufs, 0, len,
				       NULL, bput_one);
	ext4_set_inode_state(inode, EXT4_STATE_JDATA);
out:
	unlock_page(page);
out_no_pagelock:
	brelse(inode_bh);
	return ret;
}

/*
 * Note that we don't need to start a transaction unless we're journaling data
 * because we should have holes filled from ext4_page_mkwrite(). We don't even
 * need to add the inode to the transaction's list in ordered mode because if
 * we are writing back data added by write(), the inode is already there and if
 * we are writing back data modified via mmap(), no one guarantees in which
 * transaction the data will hit the disk. In case we are journaling data, we
 * cannot start a transaction directly because transaction start ranks above
 * page lock so we have to do some magic.
 *
 * This function can get called via...
1930 * - ext4_writepages after taking page lock (have journal handle) 1931 * - journal_submit_inode_data_buffers (no journal handle) 1932 * - shrink_page_list via the kswapd/direct reclaim (no journal handle) 1933 * - grab_page_cache when doing write_begin (have journal handle) 1934 * 1935 * We don't do any block allocation in this function. If we have a page with 1936 * multiple blocks, we need to write those buffer_heads that are mapped. This 1937 * is important for mmap-based writes. So if, with a 1K blocksize, we do 1938 * truncate(f, 1024); 1939 * a = mmap(f, 0, 4096); 1940 * a[0] = 'a'; 1941 * truncate(f, 4096); 1942 * then the first buffer_head in the page is mapped via the page_mkwrite 1943 * callback, but the other buffer_heads are unmapped yet dirty (dirtied via 1944 * do_wp_page). So writepage should write the first block. If we modify 1945 * the mmap area beyond 1024 we will again get a page fault and the 1946 * page_mkwrite callback will do the block allocation and mark the 1947 * buffer_heads mapped. 1948 * 1949 * We redirty the page if it has any buffer_heads that are either delayed or 1950 * unwritten. 1951 * 1952 * We can get recursively called as shown below: 1953 * 1954 * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> 1955 * ext4_writepage() 1956 * 1957 * But since we don't do any block allocation we should not deadlock. 1958 * The page also has its dirty flag cleared, so we don't take the page lock recursively. 1959 */ 1960 static int ext4_writepage(struct page *page, 1961 struct writeback_control *wbc) 1962 { 1963 int ret = 0; 1964 loff_t size; 1965 unsigned int len; 1966 struct buffer_head *page_bufs = NULL; 1967 struct inode *inode = page->mapping->host; 1968 struct ext4_io_submit io_submit; 1969 bool keep_towrite = false; 1970 1971 if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) { 1972 inode->i_mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); 1973 unlock_page(page); 1974 return -EIO; 1975 } 1976 1977 trace_ext4_writepage(page); 1978 size = i_size_read(inode); 1979 if (page->index == size >> PAGE_SHIFT && 1980 !ext4_verity_in_progress(inode)) 1981 len = size & ~PAGE_MASK; 1982 else 1983 len = PAGE_SIZE; 1984 1985 page_bufs = page_buffers(page); 1986 /* 1987 * We cannot do block allocation or other extent handling in this 1988 * function. If there are buffers needing that, we have to redirty 1989 * the page. But we may reach here when we do a journal commit via 1990 * journal_submit_inode_data_buffers() and in that case we must write 1991 * allocated buffers to achieve data=ordered mode guarantees. 1992 * 1993 * Also, if there is only one buffer per page (the fs block 1994 * size == the page size) and that buffer needs block 1995 * allocation or needs to modify the extent tree to clear the 1996 * unwritten flag, we know that the page can't be written at 1997 * all, so we might as well refuse the write immediately. 1998 * Unfortunately if the block size != page size, we can't as 1999 * easily detect this case using ext4_walk_page_buffers(), but 2000 * for the extremely common case, this is an optimization that 2001 * skips a useless round trip through ext4_bio_write_page(). 2002 */ 2003 if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL, 2004 ext4_bh_delay_or_unwritten)) { 2005 redirty_page_for_writepage(wbc, page); 2006 if ((current->flags & PF_MEMALLOC) || 2007 (inode->i_sb->s_blocksize == PAGE_SIZE)) { 2008 /* 2009 * For memory cleaning there's no point in writing only 2010 * some buffers. So just bail out.
Warn if we came here 2011 * from direct reclaim. 2012 */ 2013 WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) 2014 == PF_MEMALLOC); 2015 unlock_page(page); 2016 return 0; 2017 } 2018 keep_towrite = true; 2019 } 2020 2021 if (PageChecked(page) && ext4_should_journal_data(inode)) 2022 /* 2023 * It's mmapped pagecache. Add buffers and journal it. There 2024 * doesn't seem much point in redirtying the page here. 2025 */ 2026 return __ext4_journalled_writepage(page, len); 2027 2028 ext4_io_submit_init(&io_submit, wbc); 2029 io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); 2030 if (!io_submit.io_end) { 2031 redirty_page_for_writepage(wbc, page); 2032 unlock_page(page); 2033 return -ENOMEM; 2034 } 2035 ret = ext4_bio_write_page(&io_submit, page, len, wbc, keep_towrite); 2036 ext4_io_submit(&io_submit); 2037 /* Drop io_end reference we got from init */ 2038 ext4_put_io_end_defer(io_submit.io_end); 2039 return ret; 2040 } 2041 2042 static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) 2043 { 2044 int len; 2045 loff_t size; 2046 int err; 2047 2048 BUG_ON(page->index != mpd->first_page); 2049 clear_page_dirty_for_io(page); 2050 /* 2051 * We have to be very careful here! Nothing protects writeback path 2052 * against i_size changes and the page can be writeably mapped into 2053 * page tables. So an application can be growing i_size and writing 2054 * data through mmap while writeback runs. clear_page_dirty_for_io() 2055 * write-protects our page in page tables and the page cannot get 2056 * written to again until we release page lock. So only after 2057 * clear_page_dirty_for_io() we are safe to sample i_size for 2058 * ext4_bio_write_page() to zero-out tail of the written page. We rely 2059 * on the barrier provided by TestClearPageDirty in 2060 * clear_page_dirty_for_io() to make sure i_size is really sampled only 2061 * after page tables are updated. 2062 */ 2063 size = i_size_read(mpd->inode); 2064 if (page->index == size >> PAGE_SHIFT && 2065 !ext4_verity_in_progress(mpd->inode)) 2066 len = size & ~PAGE_MASK; 2067 else 2068 len = PAGE_SIZE; 2069 err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false); 2070 if (!err) 2071 mpd->wbc->nr_to_write--; 2072 mpd->first_page++; 2073 2074 return err; 2075 } 2076 2077 #define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay)) 2078 2079 /* 2080 * mballoc gives us at most this number of blocks... 2081 * XXX: That seems to be only a limitation of ext4_mb_normalize_request(). 2082 * The rest of mballoc seems to handle chunks up to full group size. 2083 */ 2084 #define MAX_WRITEPAGES_EXTENT_LEN 2048 2085 2086 /* 2087 * mpage_add_bh_to_extent - try to add bh to extent of blocks to map 2088 * 2089 * @mpd - extent of blocks 2090 * @lblk - logical number of the block in the file 2091 * @bh - buffer head we want to add to the extent 2092 * 2093 * The function is used to collect contig. blocks in the same state. If the 2094 * buffer doesn't require mapping for writeback and we haven't started the 2095 * extent of buffers to map yet, the function returns 'true' immediately - the 2096 * caller can write the buffer right away. Otherwise the function returns true 2097 * if the block has been added to the extent, false if the block couldn't be 2098 * added. 2099 */ 2100 static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, 2101 struct buffer_head *bh) 2102 { 2103 struct ext4_map_blocks *map = &mpd->map; 2104 2105 /* Buffer that doesn't need mapping for writeback? 
*/ 2106 if (!buffer_dirty(bh) || !buffer_mapped(bh) || 2107 (!buffer_delay(bh) && !buffer_unwritten(bh))) { 2108 /* So far no extent to map => we write the buffer right away */ 2109 if (map->m_len == 0) 2110 return true; 2111 return false; 2112 } 2113 2114 /* First block in the extent? */ 2115 if (map->m_len == 0) { 2116 /* We cannot map unless handle is started... */ 2117 if (!mpd->do_map) 2118 return false; 2119 map->m_lblk = lblk; 2120 map->m_len = 1; 2121 map->m_flags = bh->b_state & BH_FLAGS; 2122 return true; 2123 } 2124 2125 /* Don't go larger than mballoc is willing to allocate */ 2126 if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN) 2127 return false; 2128 2129 /* Can we merge the block to our big extent? */ 2130 if (lblk == map->m_lblk + map->m_len && 2131 (bh->b_state & BH_FLAGS) == map->m_flags) { 2132 map->m_len++; 2133 return true; 2134 } 2135 return false; 2136 } 2137 2138 /* 2139 * mpage_process_page_bufs - submit page buffers for IO or add them to extent 2140 * 2141 * @mpd - extent of blocks for mapping 2142 * @head - the first buffer in the page 2143 * @bh - buffer we should start processing from 2144 * @lblk - logical number of the block in the file corresponding to @bh 2145 * 2146 * Walk through page buffers from @bh upto @head (exclusive) and either submit 2147 * the page for IO if all buffers in this page were mapped and there's no 2148 * accumulated extent of buffers to map or add buffers in the page to the 2149 * extent of buffers to map. The function returns 1 if the caller can continue 2150 * by processing the next page, 0 if it should stop adding buffers to the 2151 * extent to map because we cannot extend it anymore. It can also return value 2152 * < 0 in case of error during IO submission. 2153 */ 2154 static int mpage_process_page_bufs(struct mpage_da_data *mpd, 2155 struct buffer_head *head, 2156 struct buffer_head *bh, 2157 ext4_lblk_t lblk) 2158 { 2159 struct inode *inode = mpd->inode; 2160 int err; 2161 ext4_lblk_t blocks = (i_size_read(inode) + i_blocksize(inode) - 1) 2162 >> inode->i_blkbits; 2163 2164 if (ext4_verity_in_progress(inode)) 2165 blocks = EXT_MAX_BLOCKS; 2166 2167 do { 2168 BUG_ON(buffer_locked(bh)); 2169 2170 if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) { 2171 /* Found extent to map? */ 2172 if (mpd->map.m_len) 2173 return 0; 2174 /* Buffer needs mapping and handle is not started? */ 2175 if (!mpd->do_map) 2176 return 0; 2177 /* Everything mapped so far and we hit EOF */ 2178 break; 2179 } 2180 } while (lblk++, (bh = bh->b_this_page) != head); 2181 /* So far everything mapped? Submit the page for IO. */ 2182 if (mpd->map.m_len == 0) { 2183 err = mpage_submit_page(mpd, head->b_page); 2184 if (err < 0) 2185 return err; 2186 } 2187 return lblk < blocks; 2188 } 2189 2190 /* 2191 * mpage_process_page - update page buffers corresponding to changed extent and 2192 * may submit fully mapped page for IO 2193 * 2194 * @mpd - description of extent to map, on return next extent to map 2195 * @m_lblk - logical block mapping. 2196 * @m_pblk - corresponding physical mapping. 2197 * @map_bh - determines on return whether this page requires any further 2198 * mapping or not. 2199 * Scan given page buffers corresponding to changed extent and update buffer 2200 * state according to new extent state. 2201 * We map delalloc buffers to their physical location, clear unwritten bits. 2202 * If the given page is not fully mapped, we update @map to the next extent in 2203 * the given page that needs mapping & return @map_bh as true. 
2204 */ 2205 static int mpage_process_page(struct mpage_da_data *mpd, struct page *page, 2206 ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk, 2207 bool *map_bh) 2208 { 2209 struct buffer_head *head, *bh; 2210 ext4_io_end_t *io_end = mpd->io_submit.io_end; 2211 ext4_lblk_t lblk = *m_lblk; 2212 ext4_fsblk_t pblock = *m_pblk; 2213 int err = 0; 2214 int blkbits = mpd->inode->i_blkbits; 2215 ssize_t io_end_size = 0; 2216 struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end); 2217 2218 bh = head = page_buffers(page); 2219 do { 2220 if (lblk < mpd->map.m_lblk) 2221 continue; 2222 if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { 2223 /* 2224 * Buffer after end of mapped extent. 2225 * Find next buffer in the page to map. 2226 */ 2227 mpd->map.m_len = 0; 2228 mpd->map.m_flags = 0; 2229 io_end_vec->size += io_end_size; 2230 io_end_size = 0; 2231 2232 err = mpage_process_page_bufs(mpd, head, bh, lblk); 2233 if (err > 0) 2234 err = 0; 2235 if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) { 2236 io_end_vec = ext4_alloc_io_end_vec(io_end); 2237 if (IS_ERR(io_end_vec)) { 2238 err = PTR_ERR(io_end_vec); 2239 goto out; 2240 } 2241 io_end_vec->offset = mpd->map.m_lblk << blkbits; 2242 } 2243 *map_bh = true; 2244 goto out; 2245 } 2246 if (buffer_delay(bh)) { 2247 clear_buffer_delay(bh); 2248 bh->b_blocknr = pblock++; 2249 } 2250 clear_buffer_unwritten(bh); 2251 io_end_size += (1 << blkbits); 2252 } while (lblk++, (bh = bh->b_this_page) != head); 2253 2254 io_end_vec->size += io_end_size; 2255 io_end_size = 0; 2256 *map_bh = false; 2257 out: 2258 *m_lblk = lblk; 2259 *m_pblk = pblock; 2260 return err; 2261 } 2262 2263 /* 2264 * mpage_map_buffers - update buffers corresponding to changed extent and 2265 * submit fully mapped pages for IO 2266 * 2267 * @mpd - description of extent to map, on return next extent to map 2268 * 2269 * Scan buffers corresponding to changed extent (we expect corresponding pages 2270 * to be already locked) and update buffer state according to new extent state. 2271 * We map delalloc buffers to their physical location, clear unwritten bits, 2272 * and mark buffers as uninit when we perform writes to unwritten extents 2273 * and do extent conversion after IO is finished. If the last page is not fully 2274 * mapped, we update @map to the next extent in the last page that needs 2275 * mapping. Otherwise we submit the page for IO. 2276 */ 2277 static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) 2278 { 2279 struct pagevec pvec; 2280 int nr_pages, i; 2281 struct inode *inode = mpd->inode; 2282 int bpp_bits = PAGE_SHIFT - inode->i_blkbits; 2283 pgoff_t start, end; 2284 ext4_lblk_t lblk; 2285 ext4_fsblk_t pblock; 2286 int err; 2287 bool map_bh = false; 2288 2289 start = mpd->map.m_lblk >> bpp_bits; 2290 end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; 2291 lblk = start << bpp_bits; 2292 pblock = mpd->map.m_pblk; 2293 2294 pagevec_init(&pvec); 2295 while (start <= end) { 2296 nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, 2297 &start, end); 2298 if (nr_pages == 0) 2299 break; 2300 for (i = 0; i < nr_pages; i++) { 2301 struct page *page = pvec.pages[i]; 2302 2303 err = mpage_process_page(mpd, page, &lblk, &pblock, 2304 &map_bh); 2305 /* 2306 * If map_bh is true, means page may require further bh 2307 * mapping, or maybe the page was submitted for IO. 2308 * So we return to call further extent mapping. 2309 */ 2310 if (err < 0 || map_bh == true) 2311 goto out; 2312 /* Page fully mapped - let IO run! 
*/ 2313 err = mpage_submit_page(mpd, page); 2314 if (err < 0) 2315 goto out; 2316 } 2317 pagevec_release(&pvec); 2318 } 2319 /* Extent fully mapped and matches with page boundary. We are done. */ 2320 mpd->map.m_len = 0; 2321 mpd->map.m_flags = 0; 2322 return 0; 2323 out: 2324 pagevec_release(&pvec); 2325 return err; 2326 } 2327 2328 static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) 2329 { 2330 struct inode *inode = mpd->inode; 2331 struct ext4_map_blocks *map = &mpd->map; 2332 int get_blocks_flags; 2333 int err, dioread_nolock; 2334 2335 trace_ext4_da_write_pages_extent(inode, map); 2336 /* 2337 * Call ext4_map_blocks() to allocate any delayed allocation blocks, or 2338 * to convert an unwritten extent to be initialized (in the case 2339 * where we have written into one or more preallocated blocks). It is 2340 * possible that we're going to need more metadata blocks than 2341 * previously reserved. However we must not fail because we're in 2342 * writeback and there is nothing we can do about it so it might result 2343 * in data loss. So use reserved blocks to allocate metadata if 2344 * possible. 2345 * 2346 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if 2347 * the blocks in question are delalloc blocks. This indicates 2348 * that the blocks and quotas has already been checked when 2349 * the data was copied into the page cache. 2350 */ 2351 get_blocks_flags = EXT4_GET_BLOCKS_CREATE | 2352 EXT4_GET_BLOCKS_METADATA_NOFAIL | 2353 EXT4_GET_BLOCKS_IO_SUBMIT; 2354 dioread_nolock = ext4_should_dioread_nolock(inode); 2355 if (dioread_nolock) 2356 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; 2357 if (map->m_flags & (1 << BH_Delay)) 2358 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; 2359 2360 err = ext4_map_blocks(handle, inode, map, get_blocks_flags); 2361 if (err < 0) 2362 return err; 2363 if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) { 2364 if (!mpd->io_submit.io_end->handle && 2365 ext4_handle_valid(handle)) { 2366 mpd->io_submit.io_end->handle = handle->h_rsv_handle; 2367 handle->h_rsv_handle = NULL; 2368 } 2369 ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end); 2370 } 2371 2372 BUG_ON(map->m_len == 0); 2373 return 0; 2374 } 2375 2376 /* 2377 * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length 2378 * mpd->len and submit pages underlying it for IO 2379 * 2380 * @handle - handle for journal operations 2381 * @mpd - extent to map 2382 * @give_up_on_write - we set this to true iff there is a fatal error and there 2383 * is no hope of writing the data. The caller should discard 2384 * dirty pages to avoid infinite loops. 2385 * 2386 * The function maps extent starting at mpd->lblk of length mpd->len. If it is 2387 * delayed, blocks are allocated, if it is unwritten, we may need to convert 2388 * them to initialized or split the described range from larger unwritten 2389 * extent. Note that we need not map all the described range since allocation 2390 * can return less blocks or the range is covered by more unwritten extents. We 2391 * cannot map more because we are limited by reserved transaction credits. On 2392 * the other hand we always make sure that the last touched page is fully 2393 * mapped so that it can be written out (and thus forward progress is 2394 * guaranteed). After mapping we submit all mapped pages for IO. 
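 *
 * Rough call sequence (a summary of the code below, for orientation only):
 * each loop iteration calls mpage_map_one_extent() to allocate or convert one
 * extent and then mpage_map_and_submit_buffers() to update buffer state and
 * submit the fully mapped pages, until map->m_len drops to zero; finally
 * i_disksize is pushed forward under i_data_sem and the inode is marked dirty.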
2395 */ 2396 static int mpage_map_and_submit_extent(handle_t *handle, 2397 struct mpage_da_data *mpd, 2398 bool *give_up_on_write) 2399 { 2400 struct inode *inode = mpd->inode; 2401 struct ext4_map_blocks *map = &mpd->map; 2402 int err; 2403 loff_t disksize; 2404 int progress = 0; 2405 ext4_io_end_t *io_end = mpd->io_submit.io_end; 2406 struct ext4_io_end_vec *io_end_vec; 2407 2408 io_end_vec = ext4_alloc_io_end_vec(io_end); 2409 if (IS_ERR(io_end_vec)) 2410 return PTR_ERR(io_end_vec); 2411 io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits; 2412 do { 2413 err = mpage_map_one_extent(handle, mpd); 2414 if (err < 0) { 2415 struct super_block *sb = inode->i_sb; 2416 2417 if (ext4_forced_shutdown(EXT4_SB(sb)) || 2418 EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) 2419 goto invalidate_dirty_pages; 2420 /* 2421 * Let the uper layers retry transient errors. 2422 * In the case of ENOSPC, if ext4_count_free_blocks() 2423 * is non-zero, a commit should free up blocks. 2424 */ 2425 if ((err == -ENOMEM) || 2426 (err == -ENOSPC && ext4_count_free_clusters(sb))) { 2427 if (progress) 2428 goto update_disksize; 2429 return err; 2430 } 2431 ext4_msg(sb, KERN_CRIT, 2432 "Delayed block allocation failed for " 2433 "inode %lu at logical offset %llu with" 2434 " max blocks %u with error %d", 2435 inode->i_ino, 2436 (unsigned long long)map->m_lblk, 2437 (unsigned)map->m_len, -err); 2438 ext4_msg(sb, KERN_CRIT, 2439 "This should not happen!! Data will " 2440 "be lost\n"); 2441 if (err == -ENOSPC) 2442 ext4_print_free_blocks(inode); 2443 invalidate_dirty_pages: 2444 *give_up_on_write = true; 2445 return err; 2446 } 2447 progress = 1; 2448 /* 2449 * Update buffer state, submit mapped pages, and get us new 2450 * extent to map 2451 */ 2452 err = mpage_map_and_submit_buffers(mpd); 2453 if (err < 0) 2454 goto update_disksize; 2455 } while (map->m_len); 2456 2457 update_disksize: 2458 /* 2459 * Update on-disk size after IO is submitted. Races with 2460 * truncate are avoided by checking i_size under i_data_sem. 2461 */ 2462 disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT; 2463 if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) { 2464 int err2; 2465 loff_t i_size; 2466 2467 down_write(&EXT4_I(inode)->i_data_sem); 2468 i_size = i_size_read(inode); 2469 if (disksize > i_size) 2470 disksize = i_size; 2471 if (disksize > EXT4_I(inode)->i_disksize) 2472 EXT4_I(inode)->i_disksize = disksize; 2473 up_write(&EXT4_I(inode)->i_data_sem); 2474 err2 = ext4_mark_inode_dirty(handle, inode); 2475 if (err2) { 2476 ext4_error_err(inode->i_sb, -err2, 2477 "Failed to mark inode %lu dirty", 2478 inode->i_ino); 2479 } 2480 if (!err) 2481 err = err2; 2482 } 2483 return err; 2484 } 2485 2486 /* 2487 * Calculate the total number of credits to reserve for one writepages 2488 * iteration. This is called from ext4_writepages(). We map an extent of 2489 * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping 2490 * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN + 2491 * bpp - 1 blocks in bpp different extents. 2492 */ 2493 static int ext4_da_writepages_trans_blocks(struct inode *inode) 2494 { 2495 int bpp = ext4_journal_blocks_per_page(inode); 2496 2497 return ext4_meta_trans_blocks(inode, 2498 MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); 2499 } 2500 2501 /* 2502 * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages 2503 * and underlying extent to map 2504 * 2505 * @mpd - where to look for pages 2506 * 2507 * Walk dirty pages in the mapping. 
If they are fully mapped, submit them for 2508 * IO immediately. When we find a page which isn't mapped we start accumulating 2509 * extent of buffers underlying these pages that needs mapping (formed by 2510 * either delayed or unwritten buffers). We also lock the pages containing 2511 * these buffers. The extent found is returned in @mpd structure (starting at 2512 * mpd->lblk with length mpd->len blocks). 2513 * 2514 * Note that this function can attach bios to one io_end structure which are 2515 * neither logically nor physically contiguous. Although it may seem as an 2516 * unnecessary complication, it is actually inevitable in blocksize < pagesize 2517 * case as we need to track IO to all buffers underlying a page in one io_end. 2518 */ 2519 static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) 2520 { 2521 struct address_space *mapping = mpd->inode->i_mapping; 2522 struct pagevec pvec; 2523 unsigned int nr_pages; 2524 long left = mpd->wbc->nr_to_write; 2525 pgoff_t index = mpd->first_page; 2526 pgoff_t end = mpd->last_page; 2527 xa_mark_t tag; 2528 int i, err = 0; 2529 int blkbits = mpd->inode->i_blkbits; 2530 ext4_lblk_t lblk; 2531 struct buffer_head *head; 2532 2533 if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) 2534 tag = PAGECACHE_TAG_TOWRITE; 2535 else 2536 tag = PAGECACHE_TAG_DIRTY; 2537 2538 pagevec_init(&pvec); 2539 mpd->map.m_len = 0; 2540 mpd->next_page = index; 2541 while (index <= end) { 2542 nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, 2543 tag); 2544 if (nr_pages == 0) 2545 goto out; 2546 2547 for (i = 0; i < nr_pages; i++) { 2548 struct page *page = pvec.pages[i]; 2549 2550 /* 2551 * Accumulated enough dirty pages? This doesn't apply 2552 * to WB_SYNC_ALL mode. For integrity sync we have to 2553 * keep going because someone may be concurrently 2554 * dirtying pages, and we might have synced a lot of 2555 * newly appeared dirty pages, but have not synced all 2556 * of the old dirty pages. 2557 */ 2558 if (mpd->wbc->sync_mode == WB_SYNC_NONE && left <= 0) 2559 goto out; 2560 2561 /* If we can't merge this page, we are done. 
*/ 2562 if (mpd->map.m_len > 0 && mpd->next_page != page->index) 2563 goto out; 2564 2565 lock_page(page); 2566 /* 2567 * If the page is no longer dirty, or its mapping no 2568 * longer corresponds to inode we are writing (which 2569 * means it has been truncated or invalidated), or the 2570 * page is already under writeback and we are not doing 2571 * a data integrity writeback, skip the page 2572 */ 2573 if (!PageDirty(page) || 2574 (PageWriteback(page) && 2575 (mpd->wbc->sync_mode == WB_SYNC_NONE)) || 2576 unlikely(page->mapping != mapping)) { 2577 unlock_page(page); 2578 continue; 2579 } 2580 2581 wait_on_page_writeback(page); 2582 BUG_ON(PageWriteback(page)); 2583 2584 if (mpd->map.m_len == 0) 2585 mpd->first_page = page->index; 2586 mpd->next_page = page->index + 1; 2587 /* Add all dirty buffers to mpd */ 2588 lblk = ((ext4_lblk_t)page->index) << 2589 (PAGE_SHIFT - blkbits); 2590 head = page_buffers(page); 2591 err = mpage_process_page_bufs(mpd, head, head, lblk); 2592 if (err <= 0) 2593 goto out; 2594 err = 0; 2595 left--; 2596 } 2597 pagevec_release(&pvec); 2598 cond_resched(); 2599 } 2600 return 0; 2601 out: 2602 pagevec_release(&pvec); 2603 return err; 2604 } 2605 2606 static int ext4_writepages(struct address_space *mapping, 2607 struct writeback_control *wbc) 2608 { 2609 pgoff_t writeback_index = 0; 2610 long nr_to_write = wbc->nr_to_write; 2611 int range_whole = 0; 2612 int cycled = 1; 2613 handle_t *handle = NULL; 2614 struct mpage_da_data mpd; 2615 struct inode *inode = mapping->host; 2616 int needed_blocks, rsv_blocks = 0, ret = 0; 2617 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2618 bool done; 2619 struct blk_plug plug; 2620 bool give_up_on_write = false; 2621 2622 if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) 2623 return -EIO; 2624 2625 percpu_down_read(&sbi->s_writepages_rwsem); 2626 trace_ext4_writepages(inode, wbc); 2627 2628 /* 2629 * No pages to write? This is mainly a kludge to avoid starting 2630 * a transaction for special inodes like journal inode on last iput() 2631 * because that could violate lock ordering on umount 2632 */ 2633 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 2634 goto out_writepages; 2635 2636 if (ext4_should_journal_data(inode)) { 2637 ret = generic_writepages(mapping, wbc); 2638 goto out_writepages; 2639 } 2640 2641 /* 2642 * If the filesystem has aborted, it is read-only, so return 2643 * right away instead of dumping stack traces later on that 2644 * will obscure the real source of the problem. We test 2645 * EXT4_MF_FS_ABORTED instead of sb->s_flag's SB_RDONLY because 2646 * the latter could be true if the filesystem is mounted 2647 * read-only, and in that case, ext4_writepages should 2648 * *never* be called, so if that ever happens, we would want 2649 * the stack trace. 2650 */ 2651 if (unlikely(ext4_forced_shutdown(EXT4_SB(mapping->host->i_sb)) || 2652 sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) { 2653 ret = -EROFS; 2654 goto out_writepages; 2655 } 2656 2657 /* 2658 * If we have inline data and arrive here, it means that 2659 * we will soon create the block for the 1st page, so 2660 * we'd better clear the inline data here. 2661 */ 2662 if (ext4_has_inline_data(inode)) { 2663 /* Just inode will be modified... 
*/ 2664 handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); 2665 if (IS_ERR(handle)) { 2666 ret = PTR_ERR(handle); 2667 goto out_writepages; 2668 } 2669 BUG_ON(ext4_test_inode_state(inode, 2670 EXT4_STATE_MAY_INLINE_DATA)); 2671 ext4_destroy_inline_data(handle, inode); 2672 ext4_journal_stop(handle); 2673 } 2674 2675 if (ext4_should_dioread_nolock(inode)) { 2676 /* 2677 * We may need to convert up to one extent per block in 2678 * the page and we may dirty the inode. 2679 */ 2680 rsv_blocks = 1 + ext4_chunk_trans_blocks(inode, 2681 PAGE_SIZE >> inode->i_blkbits); 2682 } 2683 2684 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2685 range_whole = 1; 2686 2687 if (wbc->range_cyclic) { 2688 writeback_index = mapping->writeback_index; 2689 if (writeback_index) 2690 cycled = 0; 2691 mpd.first_page = writeback_index; 2692 mpd.last_page = -1; 2693 } else { 2694 mpd.first_page = wbc->range_start >> PAGE_SHIFT; 2695 mpd.last_page = wbc->range_end >> PAGE_SHIFT; 2696 } 2697 2698 mpd.inode = inode; 2699 mpd.wbc = wbc; 2700 ext4_io_submit_init(&mpd.io_submit, wbc); 2701 retry: 2702 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 2703 tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page); 2704 done = false; 2705 blk_start_plug(&plug); 2706 2707 /* 2708 * First writeback pages that don't need mapping - we can avoid 2709 * starting a transaction unnecessarily and also avoid being blocked 2710 * in the block layer on device congestion while having transaction 2711 * started. 2712 */ 2713 mpd.do_map = 0; 2714 mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); 2715 if (!mpd.io_submit.io_end) { 2716 ret = -ENOMEM; 2717 goto unplug; 2718 } 2719 ret = mpage_prepare_extent_to_map(&mpd); 2720 /* Unlock pages we didn't use */ 2721 mpage_release_unused_pages(&mpd, false); 2722 /* Submit prepared bio */ 2723 ext4_io_submit(&mpd.io_submit); 2724 ext4_put_io_end_defer(mpd.io_submit.io_end); 2725 mpd.io_submit.io_end = NULL; 2726 if (ret < 0) 2727 goto unplug; 2728 2729 while (!done && mpd.first_page <= mpd.last_page) { 2730 /* For each extent of pages we use new io_end */ 2731 mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); 2732 if (!mpd.io_submit.io_end) { 2733 ret = -ENOMEM; 2734 break; 2735 } 2736 2737 /* 2738 * We have two constraints: We find one extent to map and we 2739 * must always write out whole page (makes a difference when 2740 * blocksize < pagesize) so that we don't block on IO when we 2741 * try to write out the rest of the page. Journalled mode is 2742 * not supported by delalloc. 
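 *
 * Note: the credit estimate used below, ext4_da_writepages_trans_blocks(),
 * is sized for mapping up to MAX_WRITEPAGES_EXTENT_LEN blocks plus finishing
 * the last partial page, as described in the comment above that helper.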
2743 */ 2744 BUG_ON(ext4_should_journal_data(inode)); 2745 needed_blocks = ext4_da_writepages_trans_blocks(inode); 2746 2747 /* start a new transaction */ 2748 handle = ext4_journal_start_with_reserve(inode, 2749 EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks); 2750 if (IS_ERR(handle)) { 2751 ret = PTR_ERR(handle); 2752 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " 2753 "%ld pages, ino %lu; err %d", __func__, 2754 wbc->nr_to_write, inode->i_ino, ret); 2755 /* Release allocated io_end */ 2756 ext4_put_io_end(mpd.io_submit.io_end); 2757 mpd.io_submit.io_end = NULL; 2758 break; 2759 } 2760 mpd.do_map = 1; 2761 2762 trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc); 2763 ret = mpage_prepare_extent_to_map(&mpd); 2764 if (!ret) { 2765 if (mpd.map.m_len) 2766 ret = mpage_map_and_submit_extent(handle, &mpd, 2767 &give_up_on_write); 2768 else { 2769 /* 2770 * We scanned the whole range (or exhausted 2771 * nr_to_write), submitted what was mapped and 2772 * didn't find anything needing mapping. We are 2773 * done. 2774 */ 2775 done = true; 2776 } 2777 } 2778 /* 2779 * Caution: If the handle is synchronous, 2780 * ext4_journal_stop() can wait for transaction commit 2781 * to finish which may depend on writeback of pages to 2782 * complete or on page lock to be released. In that 2783 * case, we have to wait until after after we have 2784 * submitted all the IO, released page locks we hold, 2785 * and dropped io_end reference (for extent conversion 2786 * to be able to complete) before stopping the handle. 2787 */ 2788 if (!ext4_handle_valid(handle) || handle->h_sync == 0) { 2789 ext4_journal_stop(handle); 2790 handle = NULL; 2791 mpd.do_map = 0; 2792 } 2793 /* Unlock pages we didn't use */ 2794 mpage_release_unused_pages(&mpd, give_up_on_write); 2795 /* Submit prepared bio */ 2796 ext4_io_submit(&mpd.io_submit); 2797 2798 /* 2799 * Drop our io_end reference we got from init. We have 2800 * to be careful and use deferred io_end finishing if 2801 * we are still holding the transaction as we can 2802 * release the last reference to io_end which may end 2803 * up doing unwritten extent conversion. 2804 */ 2805 if (handle) { 2806 ext4_put_io_end_defer(mpd.io_submit.io_end); 2807 ext4_journal_stop(handle); 2808 } else 2809 ext4_put_io_end(mpd.io_submit.io_end); 2810 mpd.io_submit.io_end = NULL; 2811 2812 if (ret == -ENOSPC && sbi->s_journal) { 2813 /* 2814 * Commit the transaction which would 2815 * free blocks released in the transaction 2816 * and try again 2817 */ 2818 jbd2_journal_force_commit_nested(sbi->s_journal); 2819 ret = 0; 2820 continue; 2821 } 2822 /* Fatal error - ENOMEM, EIO... 
*/ 2823 if (ret) 2824 break; 2825 } 2826 unplug: 2827 blk_finish_plug(&plug); 2828 if (!ret && !cycled && wbc->nr_to_write > 0) { 2829 cycled = 1; 2830 mpd.last_page = writeback_index - 1; 2831 mpd.first_page = 0; 2832 goto retry; 2833 } 2834 2835 /* Update index */ 2836 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 2837 /* 2838 * Set the writeback_index so that range_cyclic 2839 * mode will write it back later 2840 */ 2841 mapping->writeback_index = mpd.first_page; 2842 2843 out_writepages: 2844 trace_ext4_writepages_result(inode, wbc, ret, 2845 nr_to_write - wbc->nr_to_write); 2846 percpu_up_read(&sbi->s_writepages_rwsem); 2847 return ret; 2848 } 2849 2850 static int ext4_dax_writepages(struct address_space *mapping, 2851 struct writeback_control *wbc) 2852 { 2853 int ret; 2854 long nr_to_write = wbc->nr_to_write; 2855 struct inode *inode = mapping->host; 2856 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2857 2858 if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) 2859 return -EIO; 2860 2861 percpu_down_read(&sbi->s_writepages_rwsem); 2862 trace_ext4_writepages(inode, wbc); 2863 2864 ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc); 2865 trace_ext4_writepages_result(inode, wbc, ret, 2866 nr_to_write - wbc->nr_to_write); 2867 percpu_up_read(&sbi->s_writepages_rwsem); 2868 return ret; 2869 } 2870 2871 static int ext4_nonda_switch(struct super_block *sb) 2872 { 2873 s64 free_clusters, dirty_clusters; 2874 struct ext4_sb_info *sbi = EXT4_SB(sb); 2875 2876 /* 2877 * Switch to non-delalloc mode if we are running low on free 2878 * blocks. The free block accounting via percpu counters can get 2879 * slightly wrong, since up to percpu_counter_batch can accumulate 2880 * on each CPU without updating the global counters. 2881 * Delalloc needs accurate free block accounting, so switch 2882 * to non-delalloc mode when we are near the error range. 2883 */ 2884 free_clusters = 2885 percpu_counter_read_positive(&sbi->s_freeclusters_counter); 2886 dirty_clusters = 2887 percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); 2888 /* 2889 * Start pushing delalloc when 1/2 of free blocks are dirty.
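 *
 * Illustrative example (made-up numbers, not from any measurement):
 * with free_clusters = 800 and dirty_clusters = 600, the first check below
 * sees 800 < 2 * 600 and kicks background writeback; the second check sees
 * 2 * 800 < 3 * 600, so we return 1 and this write falls back to
 * non-delalloc mode.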
2890 */ 2891 if (dirty_clusters && (free_clusters < 2 * dirty_clusters)) 2892 try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); 2893 2894 if (2 * free_clusters < 3 * dirty_clusters || 2895 free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) { 2896 /* 2897 * free block count is less than 150% of dirty blocks 2898 * or free blocks is less than watermark 2899 */ 2900 return 1; 2901 } 2902 return 0; 2903 } 2904 2905 /* We always reserve for an inode update; the superblock could be there too */ 2906 static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len) 2907 { 2908 if (likely(ext4_has_feature_large_file(inode->i_sb))) 2909 return 1; 2910 2911 if (pos + len <= 0x7fffffffULL) 2912 return 1; 2913 2914 /* We might need to update the superblock to set LARGE_FILE */ 2915 return 2; 2916 } 2917 2918 static int ext4_da_write_begin(struct file *file, struct address_space *mapping, 2919 loff_t pos, unsigned len, unsigned flags, 2920 struct page **pagep, void **fsdata) 2921 { 2922 int ret, retries = 0; 2923 struct page *page; 2924 pgoff_t index; 2925 struct inode *inode = mapping->host; 2926 handle_t *handle; 2927 2928 if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) 2929 return -EIO; 2930 2931 index = pos >> PAGE_SHIFT; 2932 2933 if (ext4_nonda_switch(inode->i_sb) || S_ISLNK(inode->i_mode) || 2934 ext4_verity_in_progress(inode)) { 2935 *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; 2936 return ext4_write_begin(file, mapping, pos, 2937 len, flags, pagep, fsdata); 2938 } 2939 *fsdata = (void *)0; 2940 trace_ext4_da_write_begin(inode, pos, len, flags); 2941 2942 if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { 2943 ret = ext4_da_write_inline_data_begin(mapping, inode, 2944 pos, len, flags, 2945 pagep, fsdata); 2946 if (ret < 0) 2947 return ret; 2948 if (ret == 1) 2949 return 0; 2950 } 2951 2952 /* 2953 * grab_cache_page_write_begin() can take a long time if the 2954 * system is thrashing due to memory pressure, or if the page 2955 * is being written back. So grab it first before we start 2956 * the transaction handle. This also allows us to allocate 2957 * the page (if needed) without using GFP_NOFS. 2958 */ 2959 retry_grab: 2960 page = grab_cache_page_write_begin(mapping, index, flags); 2961 if (!page) 2962 return -ENOMEM; 2963 unlock_page(page); 2964 2965 /* 2966 * With delayed allocation, we don't log the i_disksize update 2967 * if there is delayed block allocation. But we still need 2968 * to journalling the i_disksize update if writes to the end 2969 * of file which has an already mapped buffer. 2970 */ 2971 retry_journal: 2972 handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 2973 ext4_da_write_credits(inode, pos, len)); 2974 if (IS_ERR(handle)) { 2975 put_page(page); 2976 return PTR_ERR(handle); 2977 } 2978 2979 lock_page(page); 2980 if (page->mapping != mapping) { 2981 /* The page got truncated from under us */ 2982 unlock_page(page); 2983 put_page(page); 2984 ext4_journal_stop(handle); 2985 goto retry_grab; 2986 } 2987 /* In case writeback began while the page was unlocked */ 2988 wait_for_stable_page(page); 2989 2990 #ifdef CONFIG_FS_ENCRYPTION 2991 ret = ext4_block_write_begin(page, pos, len, 2992 ext4_da_get_block_prep); 2993 #else 2994 ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep); 2995 #endif 2996 if (ret < 0) { 2997 unlock_page(page); 2998 ext4_journal_stop(handle); 2999 /* 3000 * block_write_begin may have instantiated a few blocks 3001 * outside i_size. Trim these off again. 
Don't need 3002 * i_size_read because we hold i_mutex. 3003 */ 3004 if (pos + len > inode->i_size) 3005 ext4_truncate_failed_write(inode); 3006 3007 if (ret == -ENOSPC && 3008 ext4_should_retry_alloc(inode->i_sb, &retries)) 3009 goto retry_journal; 3010 3011 put_page(page); 3012 return ret; 3013 } 3014 3015 *pagep = page; 3016 return ret; 3017 } 3018 3019 /* 3020 * Check if we should update i_disksize 3021 * when write to the end of file but not require block allocation 3022 */ 3023 static int ext4_da_should_update_i_disksize(struct page *page, 3024 unsigned long offset) 3025 { 3026 struct buffer_head *bh; 3027 struct inode *inode = page->mapping->host; 3028 unsigned int idx; 3029 int i; 3030 3031 bh = page_buffers(page); 3032 idx = offset >> inode->i_blkbits; 3033 3034 for (i = 0; i < idx; i++) 3035 bh = bh->b_this_page; 3036 3037 if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh)) 3038 return 0; 3039 return 1; 3040 } 3041 3042 static int ext4_da_write_end(struct file *file, 3043 struct address_space *mapping, 3044 loff_t pos, unsigned len, unsigned copied, 3045 struct page *page, void *fsdata) 3046 { 3047 struct inode *inode = mapping->host; 3048 int ret = 0, ret2; 3049 handle_t *handle = ext4_journal_current_handle(); 3050 loff_t new_i_size; 3051 unsigned long start, end; 3052 int write_mode = (int)(unsigned long)fsdata; 3053 3054 if (write_mode == FALL_BACK_TO_NONDELALLOC) 3055 return ext4_write_end(file, mapping, pos, 3056 len, copied, page, fsdata); 3057 3058 trace_ext4_da_write_end(inode, pos, len, copied); 3059 start = pos & (PAGE_SIZE - 1); 3060 end = start + copied - 1; 3061 3062 /* 3063 * generic_write_end() will run mark_inode_dirty() if i_size 3064 * changes. So let's piggyback the i_disksize mark_inode_dirty 3065 * into that. 3066 */ 3067 new_i_size = pos + copied; 3068 if (copied && new_i_size > EXT4_I(inode)->i_disksize) { 3069 if (ext4_has_inline_data(inode) || 3070 ext4_da_should_update_i_disksize(page, end)) { 3071 ext4_update_i_disksize(inode, new_i_size); 3072 /* We need to mark inode dirty even if 3073 * new_i_size is less that inode->i_size 3074 * bu greater than i_disksize.(hint delalloc) 3075 */ 3076 ext4_mark_inode_dirty(handle, inode); 3077 } 3078 } 3079 3080 if (write_mode != CONVERT_INLINE_DATA && 3081 ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && 3082 ext4_has_inline_data(inode)) 3083 ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied, 3084 page); 3085 else 3086 ret2 = generic_write_end(file, mapping, pos, len, copied, 3087 page, fsdata); 3088 3089 copied = ret2; 3090 if (ret2 < 0) 3091 ret = ret2; 3092 ret2 = ext4_journal_stop(handle); 3093 if (!ret) 3094 ret = ret2; 3095 3096 return ret ? ret : copied; 3097 } 3098 3099 /* 3100 * Force all delayed allocation blocks to be allocated for a given inode. 3101 */ 3102 int ext4_alloc_da_blocks(struct inode *inode) 3103 { 3104 trace_ext4_alloc_da_blocks(inode); 3105 3106 if (!EXT4_I(inode)->i_reserved_data_blocks) 3107 return 0; 3108 3109 /* 3110 * We do something simple for now. The filemap_flush() will 3111 * also start triggering a write of the data blocks, which is 3112 * not strictly speaking necessary (and for users of 3113 * laptop_mode, not even desirable). 
However, to do otherwise 3114 * would require replicating code paths in: 3115 * 3116 * ext4_writepages() -> 3117 * write_cache_pages() ---> (via passed in callback function) 3118 * __mpage_da_writepage() --> 3119 * mpage_add_bh_to_extent() 3120 * mpage_da_map_blocks() 3121 * 3122 * The problem is that write_cache_pages(), located in 3123 * mm/page-writeback.c, marks pages clean in preparation for 3124 * doing I/O, which is not desirable if we're not planning on 3125 * doing I/O at all. 3126 * 3127 * We could call write_cache_pages(), and then redirty all of 3128 * the pages by calling redirty_page_for_writepage() but that 3129 * would be ugly in the extreme. So instead we would need to 3130 * replicate parts of the code in the above functions, 3131 * simplifying them because we wouldn't actually intend to 3132 * write out the pages, but rather only collect contiguous 3133 * logical block extents, call the multi-block allocator, and 3134 * then update the buffer heads with the block allocations. 3135 * 3136 * For now, though, we'll cheat by calling filemap_flush(), 3137 * which will map the blocks, and start the I/O, but not 3138 * actually wait for the I/O to complete. 3139 */ 3140 return filemap_flush(inode->i_mapping); 3141 } 3142 3143 /* 3144 * bmap() is special. It gets used by applications such as lilo and by 3145 * the swapper to find the on-disk block of a specific piece of data. 3146 * 3147 * Naturally, this is dangerous if the block concerned is still in the 3148 * journal. If somebody makes a swapfile on an ext4 data-journaling 3149 * filesystem and enables swap, then they may get a nasty shock when the 3150 * data getting swapped to that swapfile suddenly gets overwritten by 3151 * the original zero's written out previously to the journal and 3152 * awaiting writeback in the kernel's buffer cache. 3153 * 3154 * So, if we see any bmap calls here on a modified, data-journaled file, 3155 * take extra steps to flush any blocks which might be in the cache. 3156 */ 3157 static sector_t ext4_bmap(struct address_space *mapping, sector_t block) 3158 { 3159 struct inode *inode = mapping->host; 3160 journal_t *journal; 3161 int err; 3162 3163 /* 3164 * We can get here for an inline file via the FIBMAP ioctl 3165 */ 3166 if (ext4_has_inline_data(inode)) 3167 return 0; 3168 3169 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && 3170 test_opt(inode->i_sb, DELALLOC)) { 3171 /* 3172 * With delalloc we want to sync the file 3173 * so that we can make sure we allocate 3174 * blocks for file 3175 */ 3176 filemap_write_and_wait(mapping); 3177 } 3178 3179 if (EXT4_JOURNAL(inode) && 3180 ext4_test_inode_state(inode, EXT4_STATE_JDATA)) { 3181 /* 3182 * This is a REALLY heavyweight approach, but the use of 3183 * bmap on dirty files is expected to be extremely rare: 3184 * only if we run lilo or swapon on a freshly made file 3185 * do we expect this to happen. 3186 * 3187 * (bmap requires CAP_SYS_RAWIO so this does not 3188 * represent an unprivileged user DOS attack --- we'd be 3189 * in trouble if mortal users could trigger this path at 3190 * will.) 3191 * 3192 * NB. EXT4_STATE_JDATA is not set on files other than 3193 * regular files. If somebody wants to bmap a directory 3194 * or symlink and gets confused because the buffer 3195 * hasn't yet been flushed to disk, they deserve 3196 * everything they get. 
3197 */ 3198 3199 ext4_clear_inode_state(inode, EXT4_STATE_JDATA); 3200 journal = EXT4_JOURNAL(inode); 3201 jbd2_journal_lock_updates(journal); 3202 err = jbd2_journal_flush(journal); 3203 jbd2_journal_unlock_updates(journal); 3204 3205 if (err) 3206 return 0; 3207 } 3208 3209 return iomap_bmap(mapping, block, &ext4_iomap_ops); 3210 } 3211 3212 static int ext4_readpage(struct file *file, struct page *page) 3213 { 3214 int ret = -EAGAIN; 3215 struct inode *inode = page->mapping->host; 3216 3217 trace_ext4_readpage(page); 3218 3219 if (ext4_has_inline_data(inode)) 3220 ret = ext4_readpage_inline(inode, page); 3221 3222 if (ret == -EAGAIN) 3223 return ext4_mpage_readpages(page->mapping, NULL, page, 1, 3224 false); 3225 3226 return ret; 3227 } 3228 3229 static int 3230 ext4_readpages(struct file *file, struct address_space *mapping, 3231 struct list_head *pages, unsigned nr_pages) 3232 { 3233 struct inode *inode = mapping->host; 3234 3235 /* If the file has inline data, no need to do readpages. */ 3236 if (ext4_has_inline_data(inode)) 3237 return 0; 3238 3239 return ext4_mpage_readpages(mapping, pages, NULL, nr_pages, true); 3240 } 3241 3242 static void ext4_invalidatepage(struct page *page, unsigned int offset, 3243 unsigned int length) 3244 { 3245 trace_ext4_invalidatepage(page, offset, length); 3246 3247 /* No journalling happens on data buffers when this function is used */ 3248 WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page))); 3249 3250 block_invalidatepage(page, offset, length); 3251 } 3252 3253 static int __ext4_journalled_invalidatepage(struct page *page, 3254 unsigned int offset, 3255 unsigned int length) 3256 { 3257 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3258 3259 trace_ext4_journalled_invalidatepage(page, offset, length); 3260 3261 /* 3262 * If it's a full truncate we just forget about the pending dirtying 3263 */ 3264 if (offset == 0 && length == PAGE_SIZE) 3265 ClearPageChecked(page); 3266 3267 return jbd2_journal_invalidatepage(journal, page, offset, length); 3268 } 3269 3270 /* Wrapper for aops... */ 3271 static void ext4_journalled_invalidatepage(struct page *page, 3272 unsigned int offset, 3273 unsigned int length) 3274 { 3275 WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0); 3276 } 3277 3278 static int ext4_releasepage(struct page *page, gfp_t wait) 3279 { 3280 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3281 3282 trace_ext4_releasepage(page); 3283 3284 /* Page has dirty journalled data -> cannot release */ 3285 if (PageChecked(page)) 3286 return 0; 3287 if (journal) 3288 return jbd2_journal_try_to_free_buffers(journal, page, wait); 3289 else 3290 return try_to_free_buffers(page); 3291 } 3292 3293 static bool ext4_inode_datasync_dirty(struct inode *inode) 3294 { 3295 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 3296 3297 if (journal) 3298 return !jbd2_transaction_committed(journal, 3299 EXT4_I(inode)->i_datasync_tid); 3300 /* Any metadata buffers to write? */ 3301 if (!list_empty(&inode->i_mapping->private_list)) 3302 return true; 3303 return inode->i_state & I_DIRTY_DATASYNC; 3304 } 3305 3306 static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, 3307 struct ext4_map_blocks *map, loff_t offset, 3308 loff_t length) 3309 { 3310 u8 blkbits = inode->i_blkbits; 3311 3312 /* 3313 * Writes that span EOF might trigger an I/O size update on completion, 3314 * so consider them to be dirty for the purpose of O_DSYNC, even if 3315 * there is no other metadata changes being made or are pending. 
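 *
 * In summary, the code below sets IOMAP_F_DIRTY when the inode is
 * datasync-dirty or the write spans EOF, IOMAP_F_NEW when blocks were freshly
 * allocated, and picks iomap->type (UNWRITTEN, MAPPED or HOLE) from m_flags,
 * checking the unwritten bit first (see the comment further down).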
3316 */ 3317 iomap->flags = 0; 3318 if (ext4_inode_datasync_dirty(inode) || 3319 offset + length > i_size_read(inode)) 3320 iomap->flags |= IOMAP_F_DIRTY; 3321 3322 if (map->m_flags & EXT4_MAP_NEW) 3323 iomap->flags |= IOMAP_F_NEW; 3324 3325 iomap->bdev = inode->i_sb->s_bdev; 3326 iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; 3327 iomap->offset = (u64) map->m_lblk << blkbits; 3328 iomap->length = (u64) map->m_len << blkbits; 3329 3330 if ((map->m_flags & EXT4_MAP_MAPPED) && 3331 !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3332 iomap->flags |= IOMAP_F_MERGED; 3333 3334 /* 3335 * Flags passed to ext4_map_blocks() for direct I/O writes can result 3336 * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits 3337 * set. In order for any allocated unwritten extents to be converted 3338 * into written extents correctly within the ->end_io() handler, we 3339 * need to ensure that the iomap->type is set appropriately. Hence, the 3340 * reason why we need to check whether the EXT4_MAP_UNWRITTEN bit has 3341 * been set first. 3342 */ 3343 if (map->m_flags & EXT4_MAP_UNWRITTEN) { 3344 iomap->type = IOMAP_UNWRITTEN; 3345 iomap->addr = (u64) map->m_pblk << blkbits; 3346 } else if (map->m_flags & EXT4_MAP_MAPPED) { 3347 iomap->type = IOMAP_MAPPED; 3348 iomap->addr = (u64) map->m_pblk << blkbits; 3349 } else { 3350 iomap->type = IOMAP_HOLE; 3351 iomap->addr = IOMAP_NULL_ADDR; 3352 } 3353 } 3354 3355 static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, 3356 unsigned int flags) 3357 { 3358 handle_t *handle; 3359 u8 blkbits = inode->i_blkbits; 3360 int ret, dio_credits, m_flags = 0, retries = 0; 3361 3362 /* 3363 * Trim the mapping request to the maximum value that we can map at 3364 * once for direct I/O. 3365 */ 3366 if (map->m_len > DIO_MAX_BLOCKS) 3367 map->m_len = DIO_MAX_BLOCKS; 3368 dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); 3369 3370 retry: 3371 /* 3372 * Either we allocate blocks and then don't get an unwritten extent, so 3373 * in that case we have reserved enough credits. Or, the blocks are 3374 * already allocated and unwritten. In that case, the extent conversion 3375 * fits into the credits as well. 3376 */ 3377 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits); 3378 if (IS_ERR(handle)) 3379 return PTR_ERR(handle); 3380 3381 /* 3382 * DAX and direct I/O are the only two operations that are currently 3383 * supported with IOMAP_WRITE. 3384 */ 3385 WARN_ON(!IS_DAX(inode) && !(flags & IOMAP_DIRECT)); 3386 if (IS_DAX(inode)) 3387 m_flags = EXT4_GET_BLOCKS_CREATE_ZERO; 3388 /* 3389 * We use i_size instead of i_disksize here because delalloc writeback 3390 * can complete at any point during the I/O and subsequently push the 3391 * i_disksize out to i_size. This could be beyond where direct I/O is 3392 * happening and thus expose allocated blocks to direct I/O reads. 3393 */ 3394 else if ((map->m_lblk * (1 << blkbits)) >= i_size_read(inode)) 3395 m_flags = EXT4_GET_BLOCKS_CREATE; 3396 else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3397 m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT; 3398 3399 ret = ext4_map_blocks(handle, inode, map, m_flags); 3400 3401 /* 3402 * We cannot fill holes in indirect tree based inodes as that could 3403 * expose stale data in the case of a crash. Use the magic error code 3404 * to fallback to buffered I/O. 
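 *
 * (The "magic" error code used below is -ENOTBLK, which ext4's direct I/O
 * path treats as a request to fall back to buffered I/O rather than as a
 * hard error.)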
3405 */ 3406 if (!m_flags && !ret) 3407 ret = -ENOTBLK; 3408 3409 ext4_journal_stop(handle); 3410 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3411 goto retry; 3412 3413 return ret; 3414 } 3415 3416 3417 static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, 3418 unsigned flags, struct iomap *iomap, struct iomap *srcmap) 3419 { 3420 int ret; 3421 struct ext4_map_blocks map; 3422 u8 blkbits = inode->i_blkbits; 3423 3424 if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) 3425 return -EINVAL; 3426 3427 if (WARN_ON_ONCE(ext4_has_inline_data(inode))) 3428 return -ERANGE; 3429 3430 /* 3431 * Calculate the first and last logical blocks respectively. 3432 */ 3433 map.m_lblk = offset >> blkbits; 3434 map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, 3435 EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; 3436 3437 if (flags & IOMAP_WRITE) 3438 ret = ext4_iomap_alloc(inode, &map, flags); 3439 else 3440 ret = ext4_map_blocks(NULL, inode, &map, 0); 3441 3442 if (ret < 0) 3443 return ret; 3444 3445 ext4_set_iomap(inode, iomap, &map, offset, length); 3446 3447 return 0; 3448 } 3449 3450 static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset, 3451 loff_t length, unsigned flags, struct iomap *iomap, 3452 struct iomap *srcmap) 3453 { 3454 int ret; 3455 3456 /* 3457 * Even for writes we don't need to allocate blocks, so just pretend 3458 * we are reading to save overhead of starting a transaction. 3459 */ 3460 flags &= ~IOMAP_WRITE; 3461 ret = ext4_iomap_begin(inode, offset, length, flags, iomap, srcmap); 3462 WARN_ON_ONCE(iomap->type != IOMAP_MAPPED); 3463 return ret; 3464 } 3465 3466 static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, 3467 ssize_t written, unsigned flags, struct iomap *iomap) 3468 { 3469 /* 3470 * Check to see whether an error occurred while writing out the data to 3471 * the allocated blocks. If so, return the magic error code so that we 3472 * fallback to buffered I/O and attempt to complete the remainder of 3473 * the I/O. Any blocks that may have been allocated in preparation for 3474 * the direct I/O will be reused during buffered I/O. 
3475 */ 3476 if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0) 3477 return -ENOTBLK; 3478 3479 return 0; 3480 } 3481 3482 const struct iomap_ops ext4_iomap_ops = { 3483 .iomap_begin = ext4_iomap_begin, 3484 .iomap_end = ext4_iomap_end, 3485 }; 3486 3487 const struct iomap_ops ext4_iomap_overwrite_ops = { 3488 .iomap_begin = ext4_iomap_overwrite_begin, 3489 .iomap_end = ext4_iomap_end, 3490 }; 3491 3492 static bool ext4_iomap_is_delalloc(struct inode *inode, 3493 struct ext4_map_blocks *map) 3494 { 3495 struct extent_status es; 3496 ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1; 3497 3498 ext4_es_find_extent_range(inode, &ext4_es_is_delayed, 3499 map->m_lblk, end, &es); 3500 3501 if (!es.es_len || es.es_lblk > end) 3502 return false; 3503 3504 if (es.es_lblk > map->m_lblk) { 3505 map->m_len = es.es_lblk - map->m_lblk; 3506 return false; 3507 } 3508 3509 offset = map->m_lblk - es.es_lblk; 3510 map->m_len = es.es_len - offset; 3511 3512 return true; 3513 } 3514 3515 static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, 3516 loff_t length, unsigned int flags, 3517 struct iomap *iomap, struct iomap *srcmap) 3518 { 3519 int ret; 3520 bool delalloc = false; 3521 struct ext4_map_blocks map; 3522 u8 blkbits = inode->i_blkbits; 3523 3524 if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) 3525 return -EINVAL; 3526 3527 if (ext4_has_inline_data(inode)) { 3528 ret = ext4_inline_data_iomap(inode, iomap); 3529 if (ret != -EAGAIN) { 3530 if (ret == 0 && offset >= iomap->length) 3531 ret = -ENOENT; 3532 return ret; 3533 } 3534 } 3535 3536 /* 3537 * Calculate the first and last logical block respectively. 3538 */ 3539 map.m_lblk = offset >> blkbits; 3540 map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, 3541 EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; 3542 3543 /* 3544 * Fiemap callers may call for offset beyond s_bitmap_maxbytes. 3545 * So handle it here itself instead of querying ext4_map_blocks(). 3546 * Since ext4_map_blocks() will warn about it and will return 3547 * -EIO error. 3548 */ 3549 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { 3550 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 3551 3552 if (offset >= sbi->s_bitmap_maxbytes) { 3553 map.m_flags = 0; 3554 goto set_iomap; 3555 } 3556 } 3557 3558 ret = ext4_map_blocks(NULL, inode, &map, 0); 3559 if (ret < 0) 3560 return ret; 3561 if (ret == 0) 3562 delalloc = ext4_iomap_is_delalloc(inode, &map); 3563 3564 set_iomap: 3565 ext4_set_iomap(inode, iomap, &map, offset, length); 3566 if (delalloc && iomap->type == IOMAP_HOLE) 3567 iomap->type = IOMAP_DELALLOC; 3568 3569 return 0; 3570 } 3571 3572 const struct iomap_ops ext4_iomap_report_ops = { 3573 .iomap_begin = ext4_iomap_begin_report, 3574 }; 3575 3576 /* 3577 * Pages can be marked dirty completely asynchronously from ext4's journalling 3578 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do 3579 * much here because ->set_page_dirty is called under VFS locks. The page is 3580 * not necessarily locked. 3581 * 3582 * We cannot just dirty the page and leave attached buffers clean, because the 3583 * buffers' dirty state is "definitive". We cannot just set the buffers dirty 3584 * or jbddirty because all the journalling code will explode. 3585 * 3586 * So what we do is to mark the page "pending dirty" and next time writepage 3587 * is called, propagate that into the buffers appropriately. 
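 *
 * "Pending dirty" here is the PageChecked flag:
 * ext4_journalled_set_page_dirty() below sets it, and ext4_writepage()
 * routes such pages through __ext4_journalled_writepage() when the inode
 * is in data=journal mode.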
3588 */ 3589 static int ext4_journalled_set_page_dirty(struct page *page) 3590 { 3591 SetPageChecked(page); 3592 return __set_page_dirty_nobuffers(page); 3593 } 3594 3595 static int ext4_set_page_dirty(struct page *page) 3596 { 3597 WARN_ON_ONCE(!PageLocked(page) && !PageDirty(page)); 3598 WARN_ON_ONCE(!page_has_buffers(page)); 3599 return __set_page_dirty_buffers(page); 3600 } 3601 3602 static const struct address_space_operations ext4_aops = { 3603 .readpage = ext4_readpage, 3604 .readpages = ext4_readpages, 3605 .writepage = ext4_writepage, 3606 .writepages = ext4_writepages, 3607 .write_begin = ext4_write_begin, 3608 .write_end = ext4_write_end, 3609 .set_page_dirty = ext4_set_page_dirty, 3610 .bmap = ext4_bmap, 3611 .invalidatepage = ext4_invalidatepage, 3612 .releasepage = ext4_releasepage, 3613 .direct_IO = noop_direct_IO, 3614 .migratepage = buffer_migrate_page, 3615 .is_partially_uptodate = block_is_partially_uptodate, 3616 .error_remove_page = generic_error_remove_page, 3617 }; 3618 3619 static const struct address_space_operations ext4_journalled_aops = { 3620 .readpage = ext4_readpage, 3621 .readpages = ext4_readpages, 3622 .writepage = ext4_writepage, 3623 .writepages = ext4_writepages, 3624 .write_begin = ext4_write_begin, 3625 .write_end = ext4_journalled_write_end, 3626 .set_page_dirty = ext4_journalled_set_page_dirty, 3627 .bmap = ext4_bmap, 3628 .invalidatepage = ext4_journalled_invalidatepage, 3629 .releasepage = ext4_releasepage, 3630 .direct_IO = noop_direct_IO, 3631 .is_partially_uptodate = block_is_partially_uptodate, 3632 .error_remove_page = generic_error_remove_page, 3633 }; 3634 3635 static const struct address_space_operations ext4_da_aops = { 3636 .readpage = ext4_readpage, 3637 .readpages = ext4_readpages, 3638 .writepage = ext4_writepage, 3639 .writepages = ext4_writepages, 3640 .write_begin = ext4_da_write_begin, 3641 .write_end = ext4_da_write_end, 3642 .set_page_dirty = ext4_set_page_dirty, 3643 .bmap = ext4_bmap, 3644 .invalidatepage = ext4_invalidatepage, 3645 .releasepage = ext4_releasepage, 3646 .direct_IO = noop_direct_IO, 3647 .migratepage = buffer_migrate_page, 3648 .is_partially_uptodate = block_is_partially_uptodate, 3649 .error_remove_page = generic_error_remove_page, 3650 }; 3651 3652 static const struct address_space_operations ext4_dax_aops = { 3653 .writepages = ext4_dax_writepages, 3654 .direct_IO = noop_direct_IO, 3655 .set_page_dirty = noop_set_page_dirty, 3656 .bmap = ext4_bmap, 3657 .invalidatepage = noop_invalidatepage, 3658 }; 3659 3660 void ext4_set_aops(struct inode *inode) 3661 { 3662 switch (ext4_inode_journal_mode(inode)) { 3663 case EXT4_INODE_ORDERED_DATA_MODE: 3664 case EXT4_INODE_WRITEBACK_DATA_MODE: 3665 break; 3666 case EXT4_INODE_JOURNAL_DATA_MODE: 3667 inode->i_mapping->a_ops = &ext4_journalled_aops; 3668 return; 3669 default: 3670 BUG(); 3671 } 3672 if (IS_DAX(inode)) 3673 inode->i_mapping->a_ops = &ext4_dax_aops; 3674 else if (test_opt(inode->i_sb, DELALLOC)) 3675 inode->i_mapping->a_ops = &ext4_da_aops; 3676 else 3677 inode->i_mapping->a_ops = &ext4_aops; 3678 } 3679 3680 static int __ext4_block_zero_page_range(handle_t *handle, 3681 struct address_space *mapping, loff_t from, loff_t length) 3682 { 3683 ext4_fsblk_t index = from >> PAGE_SHIFT; 3684 unsigned offset = from & (PAGE_SIZE-1); 3685 unsigned blocksize, pos; 3686 ext4_lblk_t iblock; 3687 struct inode *inode = mapping->host; 3688 struct buffer_head *bh; 3689 struct page *page; 3690 int err = 0; 3691 3692 page = find_or_create_page(mapping, from >> PAGE_SHIFT, 
3693 mapping_gfp_constraint(mapping, ~__GFP_FS)); 3694 if (!page) 3695 return -ENOMEM; 3696 3697 blocksize = inode->i_sb->s_blocksize; 3698 3699 iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); 3700 3701 if (!page_has_buffers(page)) 3702 create_empty_buffers(page, blocksize, 0); 3703 3704 /* Find the buffer that contains "offset" */ 3705 bh = page_buffers(page); 3706 pos = blocksize; 3707 while (offset >= pos) { 3708 bh = bh->b_this_page; 3709 iblock++; 3710 pos += blocksize; 3711 } 3712 if (buffer_freed(bh)) { 3713 BUFFER_TRACE(bh, "freed: skip"); 3714 goto unlock; 3715 } 3716 if (!buffer_mapped(bh)) { 3717 BUFFER_TRACE(bh, "unmapped"); 3718 ext4_get_block(inode, iblock, bh, 0); 3719 /* unmapped? It's a hole - nothing to do */ 3720 if (!buffer_mapped(bh)) { 3721 BUFFER_TRACE(bh, "still unmapped"); 3722 goto unlock; 3723 } 3724 } 3725 3726 /* Ok, it's mapped. Make sure it's up-to-date */ 3727 if (PageUptodate(page)) 3728 set_buffer_uptodate(bh); 3729 3730 if (!buffer_uptodate(bh)) { 3731 err = -EIO; 3732 ll_rw_block(REQ_OP_READ, 0, 1, &bh); 3733 wait_on_buffer(bh); 3734 /* Uhhuh. Read error. Complain and punt. */ 3735 if (!buffer_uptodate(bh)) 3736 goto unlock; 3737 if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) { 3738 /* We expect the key to be set. */ 3739 BUG_ON(!fscrypt_has_encryption_key(inode)); 3740 err = fscrypt_decrypt_pagecache_blocks(page, blocksize, 3741 bh_offset(bh)); 3742 if (err) { 3743 clear_buffer_uptodate(bh); 3744 goto unlock; 3745 } 3746 } 3747 } 3748 if (ext4_should_journal_data(inode)) { 3749 BUFFER_TRACE(bh, "get write access"); 3750 err = ext4_journal_get_write_access(handle, bh); 3751 if (err) 3752 goto unlock; 3753 } 3754 zero_user(page, offset, length); 3755 BUFFER_TRACE(bh, "zeroed end of block"); 3756 3757 if (ext4_should_journal_data(inode)) { 3758 err = ext4_handle_dirty_metadata(handle, inode, bh); 3759 } else { 3760 err = 0; 3761 mark_buffer_dirty(bh); 3762 if (ext4_should_order_data(inode)) 3763 err = ext4_jbd2_inode_add_write(handle, inode, from, 3764 length); 3765 } 3766 3767 unlock: 3768 unlock_page(page); 3769 put_page(page); 3770 return err; 3771 } 3772 3773 /* 3774 * ext4_block_zero_page_range() zeros out a mapping of length 'length' 3775 * starting from file offset 'from'. The range to be zeroed must 3776 * be contained within one block. If the specified range exceeds 3777 * the end of the block, it will be shortened to the end of the block 3778 * that corresponds to 'from'. 3779 */ 3780 static int ext4_block_zero_page_range(handle_t *handle, 3781 struct address_space *mapping, loff_t from, loff_t length) 3782 { 3783 struct inode *inode = mapping->host; 3784 unsigned offset = from & (PAGE_SIZE-1); 3785 unsigned blocksize = inode->i_sb->s_blocksize; 3786 unsigned max = blocksize - (offset & (blocksize - 1)); 3787 3788 /* 3789 * Correct the length if it does not fall between 3790 * 'from' and the end of the block 3791 */ 3792 if (length > max || length < 0) 3793 length = max; 3794 3795 if (IS_DAX(inode)) { 3796 return iomap_zero_range(inode, from, length, NULL, 3797 &ext4_iomap_ops); 3798 } 3799 return __ext4_block_zero_page_range(handle, mapping, from, length); 3800 } 3801 3802 /* 3803 * ext4_block_truncate_page() zeroes out a mapping from file offset `from' 3804 * up to the end of the block which corresponds to `from'. 3805 * This is required during truncate. We need to physically zero the tail end 3806 * of that block so it doesn't yield old data if the file is later grown.
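 *
 * Worked example (illustrative, assuming a 4K page size and a 1K block size):
 * for from = 5000, offset = 5000 & (PAGE_SIZE - 1) = 904 and
 * length = blocksize - (offset & (blocksize - 1)) = 1024 - 904 = 120, so
 * bytes 5000..5119, the tail of the 1K block containing 'from', are zeroed
 * via ext4_block_zero_page_range().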
3807 */ 3808 static int ext4_block_truncate_page(handle_t *handle, 3809 struct address_space *mapping, loff_t from) 3810 { 3811 unsigned offset = from & (PAGE_SIZE-1); 3812 unsigned length; 3813 unsigned blocksize; 3814 struct inode *inode = mapping->host; 3815 3816 /* If we are processing an encrypted inode during orphan list handling */ 3817 if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode)) 3818 return 0; 3819 3820 blocksize = inode->i_sb->s_blocksize; 3821 length = blocksize - (offset & (blocksize - 1)); 3822 3823 return ext4_block_zero_page_range(handle, mapping, from, length); 3824 } 3825 3826 int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, 3827 loff_t lstart, loff_t length) 3828 { 3829 struct super_block *sb = inode->i_sb; 3830 struct address_space *mapping = inode->i_mapping; 3831 unsigned partial_start, partial_end; 3832 ext4_fsblk_t start, end; 3833 loff_t byte_end = (lstart + length - 1); 3834 int err = 0; 3835 3836 partial_start = lstart & (sb->s_blocksize - 1); 3837 partial_end = byte_end & (sb->s_blocksize - 1); 3838 3839 start = lstart >> sb->s_blocksize_bits; 3840 end = byte_end >> sb->s_blocksize_bits; 3841 3842 /* Handle partial zero within the single block */ 3843 if (start == end && 3844 (partial_start || (partial_end != sb->s_blocksize - 1))) { 3845 err = ext4_block_zero_page_range(handle, mapping, 3846 lstart, length); 3847 return err; 3848 } 3849 /* Handle partial zero out on the start of the range */ 3850 if (partial_start) { 3851 err = ext4_block_zero_page_range(handle, mapping, 3852 lstart, sb->s_blocksize); 3853 if (err) 3854 return err; 3855 } 3856 /* Handle partial zero out on the end of the range */ 3857 if (partial_end != sb->s_blocksize - 1) 3858 err = ext4_block_zero_page_range(handle, mapping, 3859 byte_end - partial_end, 3860 partial_end + 1); 3861 return err; 3862 } 3863 3864 int ext4_can_truncate(struct inode *inode) 3865 { 3866 if (S_ISREG(inode->i_mode)) 3867 return 1; 3868 if (S_ISDIR(inode->i_mode)) 3869 return 1; 3870 if (S_ISLNK(inode->i_mode)) 3871 return !ext4_inode_is_fast_symlink(inode); 3872 return 0; 3873 } 3874 3875 /* 3876 * We have to make sure i_disksize gets properly updated before we truncate 3877 * page cache due to hole punching or zero range. Otherwise i_disksize update 3878 * can get lost as it may have been postponed to submission of writeback but 3879 * that will never happen after we truncate page cache. 
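 *
 * A sketch of the expected calling pattern (illustrative only; see
 * ext4_punch_hole() below for the real user):
 *
 *	ext4_update_disksize_before_punch(inode, offset, len);
 *	truncate_pagecache_range(inode, first_block_offset, last_block_offset);
 *
 * i.e. i_disksize reaches the journal before the affected page cache pages
 * are dropped.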
3880 */ 3881 int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, 3882 loff_t len) 3883 { 3884 handle_t *handle; 3885 loff_t size = i_size_read(inode); 3886 3887 WARN_ON(!inode_is_locked(inode)); 3888 if (offset > size || offset + len < size) 3889 return 0; 3890 3891 if (EXT4_I(inode)->i_disksize >= size) 3892 return 0; 3893 3894 handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); 3895 if (IS_ERR(handle)) 3896 return PTR_ERR(handle); 3897 ext4_update_i_disksize(inode, size); 3898 ext4_mark_inode_dirty(handle, inode); 3899 ext4_journal_stop(handle); 3900 3901 return 0; 3902 } 3903 3904 static void ext4_wait_dax_page(struct ext4_inode_info *ei) 3905 { 3906 up_write(&ei->i_mmap_sem); 3907 schedule(); 3908 down_write(&ei->i_mmap_sem); 3909 } 3910 3911 int ext4_break_layouts(struct inode *inode) 3912 { 3913 struct ext4_inode_info *ei = EXT4_I(inode); 3914 struct page *page; 3915 int error; 3916 3917 if (WARN_ON_ONCE(!rwsem_is_locked(&ei->i_mmap_sem))) 3918 return -EINVAL; 3919 3920 do { 3921 page = dax_layout_busy_page(inode->i_mapping); 3922 if (!page) 3923 return 0; 3924 3925 error = ___wait_var_event(&page->_refcount, 3926 atomic_read(&page->_refcount) == 1, 3927 TASK_INTERRUPTIBLE, 0, 0, 3928 ext4_wait_dax_page(ei)); 3929 } while (error == 0); 3930 3931 return error; 3932 } 3933 3934 /* 3935 * ext4_punch_hole: punches a hole in a file by releasing the blocks 3936 * associated with the given offset and length 3937 * 3938 * @inode: File inode 3939 * @offset: The offset where the hole will begin 3940 * @len: The length of the hole 3941 * 3942 * Returns: 0 on success or negative on failure 3943 */ 3944 3945 int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) 3946 { 3947 struct super_block *sb = inode->i_sb; 3948 ext4_lblk_t first_block, stop_block; 3949 struct address_space *mapping = inode->i_mapping; 3950 loff_t first_block_offset, last_block_offset; 3951 handle_t *handle; 3952 unsigned int credits; 3953 int ret = 0; 3954 3955 trace_ext4_punch_hole(inode, offset, length, 0); 3956 3957 ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); 3958 if (ext4_has_inline_data(inode)) { 3959 down_write(&EXT4_I(inode)->i_mmap_sem); 3960 ret = ext4_convert_inline_data(inode); 3961 up_write(&EXT4_I(inode)->i_mmap_sem); 3962 if (ret) 3963 return ret; 3964 } 3965 3966 /* 3967 * Write out all dirty pages to avoid race conditions 3968 * Then release them. 3969 */ 3970 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 3971 ret = filemap_write_and_wait_range(mapping, offset, 3972 offset + length - 1); 3973 if (ret) 3974 return ret; 3975 } 3976 3977 inode_lock(inode); 3978 3979 /* No need to punch hole beyond i_size */ 3980 if (offset >= inode->i_size) 3981 goto out_mutex; 3982 3983 /* 3984 * If the hole extends beyond i_size, set the hole 3985 * to end after the page that contains i_size 3986 */ 3987 if (offset + length > inode->i_size) { 3988 length = inode->i_size + 3989 PAGE_SIZE - (inode->i_size & (PAGE_SIZE - 1)) - 3990 offset; 3991 } 3992 3993 if (offset & (sb->s_blocksize - 1) || 3994 (offset + length) & (sb->s_blocksize - 1)) { 3995 /* 3996 * Attach jinode to inode for jbd2 if we do any zeroing of 3997 * partial block 3998 */ 3999 ret = ext4_inode_attach_jinode(inode); 4000 if (ret < 0) 4001 goto out_mutex; 4002 4003 } 4004 4005 /* Wait all existing dio workers, newcomers will block on i_mutex */ 4006 inode_dio_wait(inode); 4007 4008 /* 4009 * Prevent page faults from reinstantiating pages we have released from 4010 * page cache. 
4011 */ 4012 down_write(&EXT4_I(inode)->i_mmap_sem); 4013 4014 ret = ext4_break_layouts(inode); 4015 if (ret) 4016 goto out_dio; 4017 4018 first_block_offset = round_up(offset, sb->s_blocksize); 4019 last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; 4020 4021 /* Now release the pages and zero block aligned part of pages*/ 4022 if (last_block_offset > first_block_offset) { 4023 ret = ext4_update_disksize_before_punch(inode, offset, length); 4024 if (ret) 4025 goto out_dio; 4026 truncate_pagecache_range(inode, first_block_offset, 4027 last_block_offset); 4028 } 4029 4030 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 4031 credits = ext4_writepage_trans_blocks(inode); 4032 else 4033 credits = ext4_blocks_for_truncate(inode); 4034 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); 4035 if (IS_ERR(handle)) { 4036 ret = PTR_ERR(handle); 4037 ext4_std_error(sb, ret); 4038 goto out_dio; 4039 } 4040 4041 ret = ext4_zero_partial_blocks(handle, inode, offset, 4042 length); 4043 if (ret) 4044 goto out_stop; 4045 4046 first_block = (offset + sb->s_blocksize - 1) >> 4047 EXT4_BLOCK_SIZE_BITS(sb); 4048 stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); 4049 4050 /* If there are blocks to remove, do it */ 4051 if (stop_block > first_block) { 4052 4053 down_write(&EXT4_I(inode)->i_data_sem); 4054 ext4_discard_preallocations(inode); 4055 4056 ret = ext4_es_remove_extent(inode, first_block, 4057 stop_block - first_block); 4058 if (ret) { 4059 up_write(&EXT4_I(inode)->i_data_sem); 4060 goto out_stop; 4061 } 4062 4063 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 4064 ret = ext4_ext_remove_space(inode, first_block, 4065 stop_block - 1); 4066 else 4067 ret = ext4_ind_remove_space(handle, inode, first_block, 4068 stop_block); 4069 4070 up_write(&EXT4_I(inode)->i_data_sem); 4071 } 4072 if (IS_SYNC(inode)) 4073 ext4_handle_sync(handle); 4074 4075 inode->i_mtime = inode->i_ctime = current_time(inode); 4076 ext4_mark_inode_dirty(handle, inode); 4077 if (ret >= 0) 4078 ext4_update_inode_fsync_trans(handle, inode, 1); 4079 out_stop: 4080 ext4_journal_stop(handle); 4081 out_dio: 4082 up_write(&EXT4_I(inode)->i_mmap_sem); 4083 out_mutex: 4084 inode_unlock(inode); 4085 return ret; 4086 } 4087 4088 int ext4_inode_attach_jinode(struct inode *inode) 4089 { 4090 struct ext4_inode_info *ei = EXT4_I(inode); 4091 struct jbd2_inode *jinode; 4092 4093 if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal) 4094 return 0; 4095 4096 jinode = jbd2_alloc_inode(GFP_KERNEL); 4097 spin_lock(&inode->i_lock); 4098 if (!ei->jinode) { 4099 if (!jinode) { 4100 spin_unlock(&inode->i_lock); 4101 return -ENOMEM; 4102 } 4103 ei->jinode = jinode; 4104 jbd2_journal_init_jbd_inode(ei->jinode, inode); 4105 jinode = NULL; 4106 } 4107 spin_unlock(&inode->i_lock); 4108 if (unlikely(jinode != NULL)) 4109 jbd2_free_inode(jinode); 4110 return 0; 4111 } 4112 4113 /* 4114 * ext4_truncate() 4115 * 4116 * We block out ext4_get_block() block instantiations across the entire 4117 * transaction, and VFS/VM ensures that ext4_truncate() cannot run 4118 * simultaneously on behalf of the same inode. 4119 * 4120 * As we work through the truncate and commit bits of it to the journal there 4121 * is one core, guiding principle: the file's tree must always be consistent on 4122 * disk. We must be able to restart the truncate after a crash. 
4123 * 4124 * The file's tree may be transiently inconsistent in memory (although it 4125 * probably isn't), but whenever we close off and commit a journal transaction, 4126 * the contents of (the filesystem + the journal) must be consistent and 4127 * restartable. It's pretty simple, really: bottom up, right to left (although 4128 * left-to-right works OK too). 4129 * 4130 * Note that at recovery time, journal replay occurs *before* the restart of 4131 * truncate against the orphan inode list. 4132 * 4133 * The committed inode has the new, desired i_size (which is the same as 4134 * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see 4135 * that this inode's truncate did not complete and it will again call 4136 * ext4_truncate() to have another go. So there will be instantiated blocks 4137 * to the right of the truncation point in a crashed ext4 filesystem. But 4138 * that's fine - as long as they are linked from the inode, the post-crash 4139 * ext4_truncate() run will find them and release them. 4140 */ 4141 int ext4_truncate(struct inode *inode) 4142 { 4143 struct ext4_inode_info *ei = EXT4_I(inode); 4144 unsigned int credits; 4145 int err = 0; 4146 handle_t *handle; 4147 struct address_space *mapping = inode->i_mapping; 4148 4149 /* 4150 * There is a possibility that we're either freeing the inode 4151 * or it's a completely new inode. In those cases we might not 4152 * have i_mutex locked because it's not necessary. 4153 */ 4154 if (!(inode->i_state & (I_NEW|I_FREEING))) 4155 WARN_ON(!inode_is_locked(inode)); 4156 trace_ext4_truncate_enter(inode); 4157 4158 if (!ext4_can_truncate(inode)) 4159 return 0; 4160 4161 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) 4162 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); 4163 4164 if (ext4_has_inline_data(inode)) { 4165 int has_inline = 1; 4166 4167 err = ext4_inline_data_truncate(inode, &has_inline); 4168 if (err) 4169 return err; 4170 if (has_inline) 4171 return 0; 4172 } 4173 4174 /* If we zero-out tail of the page, we have to create jinode for jbd2 */ 4175 if (inode->i_size & (inode->i_sb->s_blocksize - 1)) { 4176 if (ext4_inode_attach_jinode(inode) < 0) 4177 return 0; 4178 } 4179 4180 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 4181 credits = ext4_writepage_trans_blocks(inode); 4182 else 4183 credits = ext4_blocks_for_truncate(inode); 4184 4185 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); 4186 if (IS_ERR(handle)) 4187 return PTR_ERR(handle); 4188 4189 if (inode->i_size & (inode->i_sb->s_blocksize - 1)) 4190 ext4_block_truncate_page(handle, mapping, inode->i_size); 4191 4192 /* 4193 * We add the inode to the orphan list, so that if this 4194 * truncate spans multiple transactions, and we crash, we will 4195 * resume the truncate when the filesystem recovers. It also 4196 * marks the inode dirty, to catch the new size. 4197 * 4198 * Implication: the file must always be in a sane, consistent 4199 * truncatable state while each transaction commits. 
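 *
 * (Once the inode is on the on-disk orphan list, a crash is handled by
 * ext4_orphan_cleanup(): at the next mount it calls ext4_truncate() again to
 * finish the job, as described in the comment above this function.)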
4200 */ 4201 err = ext4_orphan_add(handle, inode); 4202 if (err) 4203 goto out_stop; 4204 4205 down_write(&EXT4_I(inode)->i_data_sem); 4206 4207 ext4_discard_preallocations(inode); 4208 4209 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 4210 err = ext4_ext_truncate(handle, inode); 4211 else 4212 ext4_ind_truncate(handle, inode); 4213 4214 up_write(&ei->i_data_sem); 4215 if (err) 4216 goto out_stop; 4217 4218 if (IS_SYNC(inode)) 4219 ext4_handle_sync(handle); 4220 4221 out_stop: 4222 /* 4223 * If this was a simple ftruncate() and the file will remain alive, 4224 * then we need to clear up the orphan record which we created above. 4225 * However, if this was a real unlink then we were called by 4226 * ext4_evict_inode(), and we allow that function to clean up the 4227 * orphan info for us. 4228 */ 4229 if (inode->i_nlink) 4230 ext4_orphan_del(handle, inode); 4231 4232 inode->i_mtime = inode->i_ctime = current_time(inode); 4233 ext4_mark_inode_dirty(handle, inode); 4234 ext4_journal_stop(handle); 4235 4236 trace_ext4_truncate_exit(inode); 4237 return err; 4238 } 4239 4240 /* 4241 * ext4_get_inode_loc returns with an extra refcount against the inode's 4242 * underlying buffer_head on success. If 'in_mem' is true, we have all 4243 * data in memory that is needed to recreate the on-disk version of this 4244 * inode. 4245 */ 4246 static int __ext4_get_inode_loc(struct inode *inode, 4247 struct ext4_iloc *iloc, int in_mem) 4248 { 4249 struct ext4_group_desc *gdp; 4250 struct buffer_head *bh; 4251 struct super_block *sb = inode->i_sb; 4252 ext4_fsblk_t block; 4253 struct blk_plug plug; 4254 int inodes_per_block, inode_offset; 4255 4256 iloc->bh = NULL; 4257 if (inode->i_ino < EXT4_ROOT_INO || 4258 inode->i_ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)) 4259 return -EFSCORRUPTED; 4260 4261 iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb); 4262 gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); 4263 if (!gdp) 4264 return -EIO; 4265 4266 /* 4267 * Figure out the offset within the block group inode table 4268 */ 4269 inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; 4270 inode_offset = ((inode->i_ino - 1) % 4271 EXT4_INODES_PER_GROUP(sb)); 4272 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); 4273 iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); 4274 4275 bh = sb_getblk(sb, block); 4276 if (unlikely(!bh)) 4277 return -ENOMEM; 4278 if (ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO)) 4279 goto simulate_eio; 4280 if (!buffer_uptodate(bh)) { 4281 lock_buffer(bh); 4282 4283 /* 4284 * If the buffer has the write error flag, we have failed 4285 * to write out another inode in the same block. In this 4286 * case, we don't have to read the block because we may 4287 * read the old inode data successfully. 4288 */ 4289 if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) 4290 set_buffer_uptodate(bh); 4291 4292 if (buffer_uptodate(bh)) { 4293 /* someone brought it uptodate while we waited */ 4294 unlock_buffer(bh); 4295 goto has_buffer; 4296 } 4297 4298 /* 4299 * If we have all information of the inode in memory and this 4300 * is the only valid inode in the block, we need not read the 4301 * block. 4302 */ 4303 if (in_mem) { 4304 struct buffer_head *bitmap_bh; 4305 int i, start; 4306 4307 start = inode_offset & ~(inodes_per_block - 1); 4308 4309 /* Is the inode bitmap in cache? 
*/ 4310 bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); 4311 if (unlikely(!bitmap_bh)) 4312 goto make_io; 4313 4314 /* 4315 * If the inode bitmap isn't in cache then the 4316 * optimisation may end up performing two reads instead 4317 * of one, so skip it. 4318 */ 4319 if (!buffer_uptodate(bitmap_bh)) { 4320 brelse(bitmap_bh); 4321 goto make_io; 4322 } 4323 for (i = start; i < start + inodes_per_block; i++) { 4324 if (i == inode_offset) 4325 continue; 4326 if (ext4_test_bit(i, bitmap_bh->b_data)) 4327 break; 4328 } 4329 brelse(bitmap_bh); 4330 if (i == start + inodes_per_block) { 4331 /* all other inodes are free, so skip I/O */ 4332 memset(bh->b_data, 0, bh->b_size); 4333 set_buffer_uptodate(bh); 4334 unlock_buffer(bh); 4335 goto has_buffer; 4336 } 4337 } 4338 4339 make_io: 4340 /* 4341 * If we need to do any I/O, try to pre-readahead extra 4342 * blocks from the inode table. 4343 */ 4344 blk_start_plug(&plug); 4345 if (EXT4_SB(sb)->s_inode_readahead_blks) { 4346 ext4_fsblk_t b, end, table; 4347 unsigned num; 4348 __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks; 4349 4350 table = ext4_inode_table(sb, gdp); 4351 /* s_inode_readahead_blks is always a power of 2 */ 4352 b = block & ~((ext4_fsblk_t) ra_blks - 1); 4353 if (table > b) 4354 b = table; 4355 end = b + ra_blks; 4356 num = EXT4_INODES_PER_GROUP(sb); 4357 if (ext4_has_group_desc_csum(sb)) 4358 num -= ext4_itable_unused_count(sb, gdp); 4359 table += num / inodes_per_block; 4360 if (end > table) 4361 end = table; 4362 while (b <= end) 4363 sb_breadahead_unmovable(sb, b++); 4364 } 4365 4366 /* 4367 * There are other valid inodes in the buffer, this inode 4368 * has in-inode xattrs, or we don't have this inode in memory. 4369 * Read the block from disk. 4370 */ 4371 trace_ext4_load_inode(inode); 4372 get_bh(bh); 4373 bh->b_end_io = end_buffer_read_sync; 4374 submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh); 4375 blk_finish_plug(&plug); 4376 wait_on_buffer(bh); 4377 if (!buffer_uptodate(bh)) { 4378 simulate_eio: 4379 ext4_error_inode_block(inode, block, EIO, 4380 "unable to read itable block"); 4381 brelse(bh); 4382 return -EIO; 4383 } 4384 } 4385 has_buffer: 4386 iloc->bh = bh; 4387 return 0; 4388 } 4389 4390 int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) 4391 { 4392 /* We have all inode data except xattrs in memory here. 
*/ 4393 return __ext4_get_inode_loc(inode, iloc, 4394 !ext4_test_inode_state(inode, EXT4_STATE_XATTR)); 4395 } 4396 4397 static bool ext4_should_use_dax(struct inode *inode) 4398 { 4399 if (!test_opt(inode->i_sb, DAX)) 4400 return false; 4401 if (!S_ISREG(inode->i_mode)) 4402 return false; 4403 if (ext4_should_journal_data(inode)) 4404 return false; 4405 if (ext4_has_inline_data(inode)) 4406 return false; 4407 if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT)) 4408 return false; 4409 if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY)) 4410 return false; 4411 return true; 4412 } 4413 4414 void ext4_set_inode_flags(struct inode *inode) 4415 { 4416 unsigned int flags = EXT4_I(inode)->i_flags; 4417 unsigned int new_fl = 0; 4418 4419 if (flags & EXT4_SYNC_FL) 4420 new_fl |= S_SYNC; 4421 if (flags & EXT4_APPEND_FL) 4422 new_fl |= S_APPEND; 4423 if (flags & EXT4_IMMUTABLE_FL) 4424 new_fl |= S_IMMUTABLE; 4425 if (flags & EXT4_NOATIME_FL) 4426 new_fl |= S_NOATIME; 4427 if (flags & EXT4_DIRSYNC_FL) 4428 new_fl |= S_DIRSYNC; 4429 if (ext4_should_use_dax(inode)) 4430 new_fl |= S_DAX; 4431 if (flags & EXT4_ENCRYPT_FL) 4432 new_fl |= S_ENCRYPTED; 4433 if (flags & EXT4_CASEFOLD_FL) 4434 new_fl |= S_CASEFOLD; 4435 if (flags & EXT4_VERITY_FL) 4436 new_fl |= S_VERITY; 4437 inode_set_flags(inode, new_fl, 4438 S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX| 4439 S_ENCRYPTED|S_CASEFOLD|S_VERITY); 4440 } 4441 4442 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, 4443 struct ext4_inode_info *ei) 4444 { 4445 blkcnt_t i_blocks ; 4446 struct inode *inode = &(ei->vfs_inode); 4447 struct super_block *sb = inode->i_sb; 4448 4449 if (ext4_has_feature_huge_file(sb)) { 4450 /* we are using combined 48 bit field */ 4451 i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | 4452 le32_to_cpu(raw_inode->i_blocks_lo); 4453 if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) { 4454 /* i_blocks represent file system block size */ 4455 return i_blocks << (inode->i_blkbits - 9); 4456 } else { 4457 return i_blocks; 4458 } 4459 } else { 4460 return le32_to_cpu(raw_inode->i_blocks_lo); 4461 } 4462 } 4463 4464 static inline int ext4_iget_extra_inode(struct inode *inode, 4465 struct ext4_inode *raw_inode, 4466 struct ext4_inode_info *ei) 4467 { 4468 __le32 *magic = (void *)raw_inode + 4469 EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; 4470 4471 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize + sizeof(__le32) <= 4472 EXT4_INODE_SIZE(inode->i_sb) && 4473 *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { 4474 ext4_set_inode_state(inode, EXT4_STATE_XATTR); 4475 return ext4_find_inline_data_nolock(inode); 4476 } else 4477 EXT4_I(inode)->i_inline_off = 0; 4478 return 0; 4479 } 4480 4481 int ext4_get_projid(struct inode *inode, kprojid_t *projid) 4482 { 4483 if (!ext4_has_feature_project(inode->i_sb)) 4484 return -EOPNOTSUPP; 4485 *projid = EXT4_I(inode)->i_projid; 4486 return 0; 4487 } 4488 4489 /* 4490 * ext4 has self-managed i_version for ea inodes, it stores the lower 32bit of 4491 * refcount in i_version, so use raw values if inode has EXT4_EA_INODE_FL flag 4492 * set. 
4493 */ 4494 static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val) 4495 { 4496 if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) 4497 inode_set_iversion_raw(inode, val); 4498 else 4499 inode_set_iversion_queried(inode, val); 4500 } 4501 static inline u64 ext4_inode_peek_iversion(const struct inode *inode) 4502 { 4503 if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) 4504 return inode_peek_iversion_raw(inode); 4505 else 4506 return inode_peek_iversion(inode); 4507 } 4508 4509 struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, 4510 ext4_iget_flags flags, const char *function, 4511 unsigned int line) 4512 { 4513 struct ext4_iloc iloc; 4514 struct ext4_inode *raw_inode; 4515 struct ext4_inode_info *ei; 4516 struct inode *inode; 4517 journal_t *journal = EXT4_SB(sb)->s_journal; 4518 long ret; 4519 loff_t size; 4520 int block; 4521 uid_t i_uid; 4522 gid_t i_gid; 4523 projid_t i_projid; 4524 4525 if ((!(flags & EXT4_IGET_SPECIAL) && 4526 (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)) || 4527 (ino < EXT4_ROOT_INO) || 4528 (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) { 4529 if (flags & EXT4_IGET_HANDLE) 4530 return ERR_PTR(-ESTALE); 4531 __ext4_error(sb, function, line, EFSCORRUPTED, 0, 4532 "inode #%lu: comm %s: iget: illegal inode #", 4533 ino, current->comm); 4534 return ERR_PTR(-EFSCORRUPTED); 4535 } 4536 4537 inode = iget_locked(sb, ino); 4538 if (!inode) 4539 return ERR_PTR(-ENOMEM); 4540 if (!(inode->i_state & I_NEW)) 4541 return inode; 4542 4543 ei = EXT4_I(inode); 4544 iloc.bh = NULL; 4545 4546 ret = __ext4_get_inode_loc(inode, &iloc, 0); 4547 if (ret < 0) 4548 goto bad_inode; 4549 raw_inode = ext4_raw_inode(&iloc); 4550 4551 if ((ino == EXT4_ROOT_INO) && (raw_inode->i_links_count == 0)) { 4552 ext4_error_inode(inode, function, line, 0, 4553 "iget: root inode unallocated"); 4554 ret = -EFSCORRUPTED; 4555 goto bad_inode; 4556 } 4557 4558 if ((flags & EXT4_IGET_HANDLE) && 4559 (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) { 4560 ret = -ESTALE; 4561 goto bad_inode; 4562 } 4563 4564 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4565 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); 4566 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > 4567 EXT4_INODE_SIZE(inode->i_sb) || 4568 (ei->i_extra_isize & 3)) { 4569 ext4_error_inode(inode, function, line, 0, 4570 "iget: bad extra_isize %u " 4571 "(inode size %u)", 4572 ei->i_extra_isize, 4573 EXT4_INODE_SIZE(inode->i_sb)); 4574 ret = -EFSCORRUPTED; 4575 goto bad_inode; 4576 } 4577 } else 4578 ei->i_extra_isize = 0; 4579 4580 /* Precompute checksum seed for inode metadata */ 4581 if (ext4_has_metadata_csum(sb)) { 4582 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 4583 __u32 csum; 4584 __le32 inum = cpu_to_le32(inode->i_ino); 4585 __le32 gen = raw_inode->i_generation; 4586 csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, 4587 sizeof(inum)); 4588 ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, 4589 sizeof(gen)); 4590 } 4591 4592 if (!ext4_inode_csum_verify(inode, raw_inode, ei) || 4593 ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) { 4594 ext4_error_inode_err(inode, function, line, 0, EFSBADCRC, 4595 "iget: checksum invalid"); 4596 ret = -EFSBADCRC; 4597 goto bad_inode; 4598 } 4599 4600 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 4601 i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); 4602 i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); 4603 if (ext4_has_feature_project(sb) && 4604 EXT4_INODE_SIZE(sb) > 
EXT4_GOOD_OLD_INODE_SIZE && 4605 EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) 4606 i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid); 4607 else 4608 i_projid = EXT4_DEF_PROJID; 4609 4610 if (!(test_opt(inode->i_sb, NO_UID32))) { 4611 i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; 4612 i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; 4613 } 4614 i_uid_write(inode, i_uid); 4615 i_gid_write(inode, i_gid); 4616 ei->i_projid = make_kprojid(&init_user_ns, i_projid); 4617 set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); 4618 4619 ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ 4620 ei->i_inline_off = 0; 4621 ei->i_dir_start_lookup = 0; 4622 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); 4623 /* We now have enough fields to check if the inode was active or not. 4624 * This is needed because nfsd might try to access dead inodes 4625 * the test is that same one that e2fsck uses 4626 * NeilBrown 1999oct15 4627 */ 4628 if (inode->i_nlink == 0) { 4629 if ((inode->i_mode == 0 || 4630 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) && 4631 ino != EXT4_BOOT_LOADER_INO) { 4632 /* this inode is deleted */ 4633 ret = -ESTALE; 4634 goto bad_inode; 4635 } 4636 /* The only unlinked inodes we let through here have 4637 * valid i_mode and are being read by the orphan 4638 * recovery code: that's fine, we're about to complete 4639 * the process of deleting those. 4640 * OR it is the EXT4_BOOT_LOADER_INO which is 4641 * not initialized on a new filesystem. */ 4642 } 4643 ei->i_flags = le32_to_cpu(raw_inode->i_flags); 4644 ext4_set_inode_flags(inode); 4645 inode->i_blocks = ext4_inode_blocks(raw_inode, ei); 4646 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); 4647 if (ext4_has_feature_64bit(sb)) 4648 ei->i_file_acl |= 4649 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; 4650 inode->i_size = ext4_isize(sb, raw_inode); 4651 if ((size = i_size_read(inode)) < 0) { 4652 ext4_error_inode(inode, function, line, 0, 4653 "iget: bad i_size value: %lld", size); 4654 ret = -EFSCORRUPTED; 4655 goto bad_inode; 4656 } 4657 /* 4658 * If dir_index is not enabled but there's dir with INDEX flag set, 4659 * we'd normally treat htree data as empty space. But with metadata 4660 * checksumming that corrupts checksums so forbid that. 4661 */ 4662 if (!ext4_has_feature_dir_index(sb) && ext4_has_metadata_csum(sb) && 4663 ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { 4664 ext4_error_inode(inode, function, line, 0, 4665 "iget: Dir with htree data on filesystem without dir_index feature."); 4666 ret = -EFSCORRUPTED; 4667 goto bad_inode; 4668 } 4669 ei->i_disksize = inode->i_size; 4670 #ifdef CONFIG_QUOTA 4671 ei->i_reserved_quota = 0; 4672 #endif 4673 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 4674 ei->i_block_group = iloc.block_group; 4675 ei->i_last_alloc_group = ~0; 4676 /* 4677 * NOTE! The in-memory inode i_data array is in little-endian order 4678 * even on big-endian machines: we do NOT byteswap the block numbers! 4679 */ 4680 for (block = 0; block < EXT4_N_BLOCKS; block++) 4681 ei->i_data[block] = raw_inode->i_block[block]; 4682 INIT_LIST_HEAD(&ei->i_orphan); 4683 4684 /* 4685 * Set transaction id's of transactions that have to be committed 4686 * to finish f[data]sync. We set them to currently running transaction 4687 * as we cannot be sure that the inode or some of its metadata isn't 4688 * part of the transaction - the inode could have been reclaimed and 4689 * now it is reread from disk. 
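 *
 * (The ids recorded here, i_sync_tid and i_datasync_tid, are what
 * fsync/fdatasync later wait on, e.g. via jbd2_complete_transaction().)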
4690 */ 4691 if (journal) { 4692 transaction_t *transaction; 4693 tid_t tid; 4694 4695 read_lock(&journal->j_state_lock); 4696 if (journal->j_running_transaction) 4697 transaction = journal->j_running_transaction; 4698 else 4699 transaction = journal->j_committing_transaction; 4700 if (transaction) 4701 tid = transaction->t_tid; 4702 else 4703 tid = journal->j_commit_sequence; 4704 read_unlock(&journal->j_state_lock); 4705 ei->i_sync_tid = tid; 4706 ei->i_datasync_tid = tid; 4707 } 4708 4709 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4710 if (ei->i_extra_isize == 0) { 4711 /* The extra space is currently unused. Use it. */ 4712 BUILD_BUG_ON(sizeof(struct ext4_inode) & 3); 4713 ei->i_extra_isize = sizeof(struct ext4_inode) - 4714 EXT4_GOOD_OLD_INODE_SIZE; 4715 } else { 4716 ret = ext4_iget_extra_inode(inode, raw_inode, ei); 4717 if (ret) 4718 goto bad_inode; 4719 } 4720 } 4721 4722 EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); 4723 EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); 4724 EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); 4725 EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); 4726 4727 if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { 4728 u64 ivers = le32_to_cpu(raw_inode->i_disk_version); 4729 4730 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4731 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 4732 ivers |= 4733 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; 4734 } 4735 ext4_inode_set_iversion_queried(inode, ivers); 4736 } 4737 4738 ret = 0; 4739 if (ei->i_file_acl && 4740 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { 4741 ext4_error_inode(inode, function, line, 0, 4742 "iget: bad extended attribute block %llu", 4743 ei->i_file_acl); 4744 ret = -EFSCORRUPTED; 4745 goto bad_inode; 4746 } else if (!ext4_has_inline_data(inode)) { 4747 /* validate the block references in the inode */ 4748 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 4749 (S_ISLNK(inode->i_mode) && 4750 !ext4_inode_is_fast_symlink(inode))) { 4751 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 4752 ret = ext4_ext_check_inode(inode); 4753 else 4754 ret = ext4_ind_check_inode(inode); 4755 } 4756 } 4757 if (ret) 4758 goto bad_inode; 4759 4760 if (S_ISREG(inode->i_mode)) { 4761 inode->i_op = &ext4_file_inode_operations; 4762 inode->i_fop = &ext4_file_operations; 4763 ext4_set_aops(inode); 4764 } else if (S_ISDIR(inode->i_mode)) { 4765 inode->i_op = &ext4_dir_inode_operations; 4766 inode->i_fop = &ext4_dir_operations; 4767 } else if (S_ISLNK(inode->i_mode)) { 4768 /* VFS does not allow setting these so must be corruption */ 4769 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { 4770 ext4_error_inode(inode, function, line, 0, 4771 "iget: immutable or append flags " 4772 "not allowed on symlinks"); 4773 ret = -EFSCORRUPTED; 4774 goto bad_inode; 4775 } 4776 if (IS_ENCRYPTED(inode)) { 4777 inode->i_op = &ext4_encrypted_symlink_inode_operations; 4778 ext4_set_aops(inode); 4779 } else if (ext4_inode_is_fast_symlink(inode)) { 4780 inode->i_link = (char *)ei->i_data; 4781 inode->i_op = &ext4_fast_symlink_inode_operations; 4782 nd_terminate_link(ei->i_data, inode->i_size, 4783 sizeof(ei->i_data) - 1); 4784 } else { 4785 inode->i_op = &ext4_symlink_inode_operations; 4786 ext4_set_aops(inode); 4787 } 4788 inode_nohighmem(inode); 4789 } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || 4790 S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { 4791 inode->i_op = &ext4_special_inode_operations; 4792 if (raw_inode->i_block[0]) 4793 
init_special_inode(inode, inode->i_mode, 4794 old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); 4795 else 4796 init_special_inode(inode, inode->i_mode, 4797 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 4798 } else if (ino == EXT4_BOOT_LOADER_INO) { 4799 make_bad_inode(inode); 4800 } else { 4801 ret = -EFSCORRUPTED; 4802 ext4_error_inode(inode, function, line, 0, 4803 "iget: bogus i_mode (%o)", inode->i_mode); 4804 goto bad_inode; 4805 } 4806 if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) 4807 ext4_error_inode(inode, function, line, 0, 4808 "casefold flag without casefold feature"); 4809 brelse(iloc.bh); 4810 4811 unlock_new_inode(inode); 4812 return inode; 4813 4814 bad_inode: 4815 brelse(iloc.bh); 4816 iget_failed(inode); 4817 return ERR_PTR(ret); 4818 } 4819 4820 static int ext4_inode_blocks_set(handle_t *handle, 4821 struct ext4_inode *raw_inode, 4822 struct ext4_inode_info *ei) 4823 { 4824 struct inode *inode = &(ei->vfs_inode); 4825 u64 i_blocks = READ_ONCE(inode->i_blocks); 4826 struct super_block *sb = inode->i_sb; 4827 4828 if (i_blocks <= ~0U) { 4829 /* 4830 * i_blocks can be represented in a 32 bit variable 4831 * as multiple of 512 bytes 4832 */ 4833 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 4834 raw_inode->i_blocks_high = 0; 4835 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); 4836 return 0; 4837 } 4838 if (!ext4_has_feature_huge_file(sb)) 4839 return -EFBIG; 4840 4841 if (i_blocks <= 0xffffffffffffULL) { 4842 /* 4843 * i_blocks can be represented in a 48 bit variable 4844 * as multiple of 512 bytes 4845 */ 4846 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 4847 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 4848 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); 4849 } else { 4850 ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE); 4851 /* i_block is stored in file system block size */ 4852 i_blocks = i_blocks >> (inode->i_blkbits - 9); 4853 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 4854 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 4855 } 4856 return 0; 4857 } 4858 4859 struct other_inode { 4860 unsigned long orig_ino; 4861 struct ext4_inode *raw_inode; 4862 }; 4863 4864 static int other_inode_match(struct inode * inode, unsigned long ino, 4865 void *data) 4866 { 4867 struct other_inode *oi = (struct other_inode *) data; 4868 4869 if ((inode->i_ino != ino) || 4870 (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW | 4871 I_DIRTY_INODE)) || 4872 ((inode->i_state & I_DIRTY_TIME) == 0)) 4873 return 0; 4874 spin_lock(&inode->i_lock); 4875 if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW | 4876 I_DIRTY_INODE)) == 0) && 4877 (inode->i_state & I_DIRTY_TIME)) { 4878 struct ext4_inode_info *ei = EXT4_I(inode); 4879 4880 inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED); 4881 spin_unlock(&inode->i_lock); 4882 4883 spin_lock(&ei->i_raw_lock); 4884 EXT4_INODE_SET_XTIME(i_ctime, inode, oi->raw_inode); 4885 EXT4_INODE_SET_XTIME(i_mtime, inode, oi->raw_inode); 4886 EXT4_INODE_SET_XTIME(i_atime, inode, oi->raw_inode); 4887 ext4_inode_csum_set(inode, oi->raw_inode, ei); 4888 spin_unlock(&ei->i_raw_lock); 4889 trace_ext4_other_inode_update_time(inode, oi->orig_ino); 4890 return -1; 4891 } 4892 spin_unlock(&inode->i_lock); 4893 return -1; 4894 } 4895 4896 /* 4897 * Opportunistically update the other time fields for other inodes in 4898 * the same inode table block. 
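 *
 * Worked example (illustrative, assuming 16 inodes per inode table block):
 * for orig_ino = 35, ((35 - 1) & ~15) + 1 = 33, so inodes 33..48 share the
 * block and every inode in that range except #35 is a candidate for the
 * timestamp copy-out below.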
4899 */ 4900 static void ext4_update_other_inodes_time(struct super_block *sb, 4901 unsigned long orig_ino, char *buf) 4902 { 4903 struct other_inode oi; 4904 unsigned long ino; 4905 int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; 4906 int inode_size = EXT4_INODE_SIZE(sb); 4907 4908 oi.orig_ino = orig_ino; 4909 /* 4910 * Calculate the first inode in the inode table block. Inode 4911 * numbers are one-based. That is, the first inode in a block 4912 * (assuming 4k blocks and 256 byte inodes) is (n*16 + 1). 4913 */ 4914 ino = ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1; 4915 for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) { 4916 if (ino == orig_ino) 4917 continue; 4918 oi.raw_inode = (struct ext4_inode *) buf; 4919 (void) find_inode_nowait(sb, ino, other_inode_match, &oi); 4920 } 4921 } 4922 4923 /* 4924 * Post the struct inode info into an on-disk inode location in the 4925 * buffer-cache. This gobbles the caller's reference to the 4926 * buffer_head in the inode location struct. 4927 * 4928 * The caller must have write access to iloc->bh. 4929 */ 4930 static int ext4_do_update_inode(handle_t *handle, 4931 struct inode *inode, 4932 struct ext4_iloc *iloc) 4933 { 4934 struct ext4_inode *raw_inode = ext4_raw_inode(iloc); 4935 struct ext4_inode_info *ei = EXT4_I(inode); 4936 struct buffer_head *bh = iloc->bh; 4937 struct super_block *sb = inode->i_sb; 4938 int err = 0, rc, block; 4939 int need_datasync = 0, set_large_file = 0; 4940 uid_t i_uid; 4941 gid_t i_gid; 4942 projid_t i_projid; 4943 4944 spin_lock(&ei->i_raw_lock); 4945 4946 /* For fields not tracked in the in-memory inode, 4947 * initialise them to zero for new inodes. */ 4948 if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) 4949 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); 4950 4951 raw_inode->i_mode = cpu_to_le16(inode->i_mode); 4952 i_uid = i_uid_read(inode); 4953 i_gid = i_gid_read(inode); 4954 i_projid = from_kprojid(&init_user_ns, ei->i_projid); 4955 if (!(test_opt(inode->i_sb, NO_UID32))) { 4956 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); 4957 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); 4958 /* 4959 * Fix up interoperability with old kernels. 
Otherwise, old inodes get 4960 * re-used with the upper 16 bits of the uid/gid intact 4961 */ 4962 if (ei->i_dtime && list_empty(&ei->i_orphan)) { 4963 raw_inode->i_uid_high = 0; 4964 raw_inode->i_gid_high = 0; 4965 } else { 4966 raw_inode->i_uid_high = 4967 cpu_to_le16(high_16_bits(i_uid)); 4968 raw_inode->i_gid_high = 4969 cpu_to_le16(high_16_bits(i_gid)); 4970 } 4971 } else { 4972 raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid)); 4973 raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid)); 4974 raw_inode->i_uid_high = 0; 4975 raw_inode->i_gid_high = 0; 4976 } 4977 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); 4978 4979 EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); 4980 EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); 4981 EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); 4982 EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); 4983 4984 err = ext4_inode_blocks_set(handle, raw_inode, ei); 4985 if (err) { 4986 spin_unlock(&ei->i_raw_lock); 4987 goto out_brelse; 4988 } 4989 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 4990 raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); 4991 if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) 4992 raw_inode->i_file_acl_high = 4993 cpu_to_le16(ei->i_file_acl >> 32); 4994 raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); 4995 if (READ_ONCE(ei->i_disksize) != ext4_isize(inode->i_sb, raw_inode)) { 4996 ext4_isize_set(raw_inode, ei->i_disksize); 4997 need_datasync = 1; 4998 } 4999 if (ei->i_disksize > 0x7fffffffULL) { 5000 if (!ext4_has_feature_large_file(sb) || 5001 EXT4_SB(sb)->s_es->s_rev_level == 5002 cpu_to_le32(EXT4_GOOD_OLD_REV)) 5003 set_large_file = 1; 5004 } 5005 raw_inode->i_generation = cpu_to_le32(inode->i_generation); 5006 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { 5007 if (old_valid_dev(inode->i_rdev)) { 5008 raw_inode->i_block[0] = 5009 cpu_to_le32(old_encode_dev(inode->i_rdev)); 5010 raw_inode->i_block[1] = 0; 5011 } else { 5012 raw_inode->i_block[0] = 0; 5013 raw_inode->i_block[1] = 5014 cpu_to_le32(new_encode_dev(inode->i_rdev)); 5015 raw_inode->i_block[2] = 0; 5016 } 5017 } else if (!ext4_has_inline_data(inode)) { 5018 for (block = 0; block < EXT4_N_BLOCKS; block++) 5019 raw_inode->i_block[block] = ei->i_data[block]; 5020 } 5021 5022 if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { 5023 u64 ivers = ext4_inode_peek_iversion(inode); 5024 5025 raw_inode->i_disk_version = cpu_to_le32(ivers); 5026 if (ei->i_extra_isize) { 5027 if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) 5028 raw_inode->i_version_hi = 5029 cpu_to_le32(ivers >> 32); 5030 raw_inode->i_extra_isize = 5031 cpu_to_le16(ei->i_extra_isize); 5032 } 5033 } 5034 5035 BUG_ON(!ext4_has_feature_project(inode->i_sb) && 5036 i_projid != EXT4_DEF_PROJID); 5037 5038 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && 5039 EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) 5040 raw_inode->i_projid = cpu_to_le32(i_projid); 5041 5042 ext4_inode_csum_set(inode, raw_inode, ei); 5043 spin_unlock(&ei->i_raw_lock); 5044 if (inode->i_sb->s_flags & SB_LAZYTIME) 5045 ext4_update_other_inodes_time(inode->i_sb, inode->i_ino, 5046 bh->b_data); 5047 5048 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 5049 rc = ext4_handle_dirty_metadata(handle, NULL, bh); 5050 if (!err) 5051 err = rc; 5052 ext4_clear_inode_state(inode, EXT4_STATE_NEW); 5053 if (set_large_file) { 5054 BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access"); 5055 err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); 5056 if (err) 5057 goto out_brelse; 5058 
ext4_set_feature_large_file(sb); 5059 ext4_handle_sync(handle); 5060 err = ext4_handle_dirty_super(handle, sb); 5061 } 5062 ext4_update_inode_fsync_trans(handle, inode, need_datasync); 5063 out_brelse: 5064 brelse(bh); 5065 ext4_std_error(inode->i_sb, err); 5066 return err; 5067 } 5068 5069 /* 5070 * ext4_write_inode() 5071 * 5072 * We are called from a few places: 5073 * 5074 * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. 5075 * Here, there will be no transaction running. We wait for any running 5076 * transaction to commit. 5077 * 5078 * - Within flush work (sys_sync(), kupdate and such). 5079 * We wait on commit, if told to. 5080 * 5081 * - Within iput_final() -> write_inode_now() 5082 * We wait on commit, if told to. 5083 * 5084 * In all cases it is actually safe for us to return without doing anything, 5085 * because the inode has been copied into a raw inode buffer in 5086 * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL 5087 * writeback. 5088 * 5089 * Note that we are absolutely dependent upon all inode dirtiers doing the 5090 * right thing: they *must* call mark_inode_dirty() after dirtying info in 5091 * which we are interested. 5092 * 5093 * It would be a bug for them to not do this. The code: 5094 * 5095 * mark_inode_dirty(inode) 5096 * stuff(); 5097 * inode->i_size = expr; 5098 * 5099 * is in error because write_inode() could occur while `stuff()' is running, 5100 * and the new i_size will be lost. Plus the inode will no longer be on the 5101 * superblock's dirty inode list. 5102 */ 5103 int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) 5104 { 5105 int err; 5106 5107 if (WARN_ON_ONCE(current->flags & PF_MEMALLOC) || 5108 sb_rdonly(inode->i_sb)) 5109 return 0; 5110 5111 if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) 5112 return -EIO; 5113 5114 if (EXT4_SB(inode->i_sb)->s_journal) { 5115 if (ext4_journal_current_handle()) { 5116 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); 5117 dump_stack(); 5118 return -EIO; 5119 } 5120 5121 /* 5122 * No need to force transaction in WB_SYNC_NONE mode. Also 5123 * ext4_sync_fs() will force the commit after everything is 5124 * written. 5125 */ 5126 if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) 5127 return 0; 5128 5129 err = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal, 5130 EXT4_I(inode)->i_sync_tid); 5131 } else { 5132 struct ext4_iloc iloc; 5133 5134 err = __ext4_get_inode_loc(inode, &iloc, 0); 5135 if (err) 5136 return err; 5137 /* 5138 * sync(2) will flush the whole buffer cache. No need to do 5139 * it here separately for each inode. 5140 */ 5141 if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) 5142 sync_dirty_buffer(iloc.bh); 5143 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { 5144 ext4_error_inode_block(inode, iloc.bh->b_blocknr, EIO, 5145 "IO error syncing inode"); 5146 err = -EIO; 5147 } 5148 brelse(iloc.bh); 5149 } 5150 return err; 5151 } 5152 5153 /* 5154 * In data=journal mode ext4_journalled_invalidatepage() may fail to invalidate 5155 * buffers that are attached to a page stradding i_size and are undergoing 5156 * commit. In that case we have to wait for commit to finish and try again. 
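 *
 * A sketch of the retry loop used below (illustrative summary): invalidate
 * the partial tail page; if that returns -EBUSY, read the committing
 * transaction's tid under j_state_lock, jbd2_log_wait_commit() on it, and
 * then retry the invalidation.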
5157 */ 5158 static void ext4_wait_for_tail_page_commit(struct inode *inode) 5159 { 5160 struct page *page; 5161 unsigned offset; 5162 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 5163 tid_t commit_tid = 0; 5164 int ret; 5165 5166 offset = inode->i_size & (PAGE_SIZE - 1); 5167 /* 5168 * If the page is fully truncated, we don't need to wait for any commit 5169 * (and we even should not as __ext4_journalled_invalidatepage() may 5170 * strip all buffers from the page but keep the page dirty which can then 5171 * confuse e.g. concurrent ext4_writepage() seeing dirty page without 5172 * buffers). Also we don't need to wait for any commit if all buffers in 5173 * the page remain valid. This is most beneficial for the common case of 5174 * blocksize == PAGESIZE. 5175 */ 5176 if (!offset || offset > (PAGE_SIZE - i_blocksize(inode))) 5177 return; 5178 while (1) { 5179 page = find_lock_page(inode->i_mapping, 5180 inode->i_size >> PAGE_SHIFT); 5181 if (!page) 5182 return; 5183 ret = __ext4_journalled_invalidatepage(page, offset, 5184 PAGE_SIZE - offset); 5185 unlock_page(page); 5186 put_page(page); 5187 if (ret != -EBUSY) 5188 return; 5189 commit_tid = 0; 5190 read_lock(&journal->j_state_lock); 5191 if (journal->j_committing_transaction) 5192 commit_tid = journal->j_committing_transaction->t_tid; 5193 read_unlock(&journal->j_state_lock); 5194 if (commit_tid) 5195 jbd2_log_wait_commit(journal, commit_tid); 5196 } 5197 } 5198 5199 /* 5200 * ext4_setattr() 5201 * 5202 * Called from notify_change. 5203 * 5204 * We want to trap VFS attempts to truncate the file as soon as 5205 * possible. In particular, we want to make sure that when the VFS 5206 * shrinks i_size, we put the inode on the orphan list and modify 5207 * i_disksize immediately, so that during the subsequent flushing of 5208 * dirty pages and freeing of disk blocks, we can guarantee that any 5209 * commit will leave the blocks being flushed in an unused state on 5210 * disk. (On recovery, the inode will get truncated and the blocks will 5211 * be freed, so we have a strong guarantee that no future commit will 5212 * leave these blocks visible to the user.) 5213 * 5214 * Another thing we have to ensure is that if we are in ordered mode 5215 * and the inode is still attached to the committing transaction, we 5216 * must start writeout of all the dirty pages which are being truncated. 5217 * This way we are sure that all the data written in the previous 5218 * transaction is already on disk (truncate waits for pages under 5219 * writeback). 5220 * 5221 * Called with inode->i_mutex down.
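 *
 * A rough sketch of the ordering used for a shrinking truncate (illustrative
 * only; the real sequence is in the ATTR_SIZE branch below):
 *
 *	ext4_orphan_add(handle, inode);			(survive a crash mid-truncate)
 *	EXT4_I(inode)->i_disksize = attr->ia_size;
 *	i_size_write(inode, attr->ia_size);		(both under i_data_sem)
 *	...						(commit / wait for tail page)
 *	truncate_pagecache(inode, inode->i_size);
 *	ext4_truncate(inode);				(free the blocks)
 *	ext4_orphan_del(NULL, inode);			(if the inode stays allocated)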
5222 */ 5223 int ext4_setattr(struct dentry *dentry, struct iattr *attr) 5224 { 5225 struct inode *inode = d_inode(dentry); 5226 int error, rc = 0; 5227 int orphan = 0; 5228 const unsigned int ia_valid = attr->ia_valid; 5229 5230 if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) 5231 return -EIO; 5232 5233 if (unlikely(IS_IMMUTABLE(inode))) 5234 return -EPERM; 5235 5236 if (unlikely(IS_APPEND(inode) && 5237 (ia_valid & (ATTR_MODE | ATTR_UID | 5238 ATTR_GID | ATTR_TIMES_SET)))) 5239 return -EPERM; 5240 5241 error = setattr_prepare(dentry, attr); 5242 if (error) 5243 return error; 5244 5245 error = fscrypt_prepare_setattr(dentry, attr); 5246 if (error) 5247 return error; 5248 5249 error = fsverity_prepare_setattr(dentry, attr); 5250 if (error) 5251 return error; 5252 5253 if (is_quota_modification(inode, attr)) { 5254 error = dquot_initialize(inode); 5255 if (error) 5256 return error; 5257 } 5258 if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || 5259 (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { 5260 handle_t *handle; 5261 5262 /* (user+group)*(old+new) structure, inode write (sb, 5263 * inode block, ? - but truncate inode update has it) */ 5264 handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 5265 (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) + 5266 EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3); 5267 if (IS_ERR(handle)) { 5268 error = PTR_ERR(handle); 5269 goto err_out; 5270 } 5271 5272 /* dquot_transfer() calls back ext4_get_inode_usage() which 5273 * counts xattr inode references. 5274 */ 5275 down_read(&EXT4_I(inode)->xattr_sem); 5276 error = dquot_transfer(inode, attr); 5277 up_read(&EXT4_I(inode)->xattr_sem); 5278 5279 if (error) { 5280 ext4_journal_stop(handle); 5281 return error; 5282 } 5283 /* Update corresponding info in inode so that everything is in 5284 * one transaction */ 5285 if (attr->ia_valid & ATTR_UID) 5286 inode->i_uid = attr->ia_uid; 5287 if (attr->ia_valid & ATTR_GID) 5288 inode->i_gid = attr->ia_gid; 5289 error = ext4_mark_inode_dirty(handle, inode); 5290 ext4_journal_stop(handle); 5291 } 5292 5293 if (attr->ia_valid & ATTR_SIZE) { 5294 handle_t *handle; 5295 loff_t oldsize = inode->i_size; 5296 int shrink = (attr->ia_size < inode->i_size); 5297 5298 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { 5299 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 5300 5301 if (attr->ia_size > sbi->s_bitmap_maxbytes) 5302 return -EFBIG; 5303 } 5304 if (!S_ISREG(inode->i_mode)) 5305 return -EINVAL; 5306 5307 if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size) 5308 inode_inc_iversion(inode); 5309 5310 if (shrink) { 5311 if (ext4_should_order_data(inode)) { 5312 error = ext4_begin_ordered_truncate(inode, 5313 attr->ia_size); 5314 if (error) 5315 goto err_out; 5316 } 5317 /* 5318 * Blocks are going to be removed from the inode. Wait 5319 * for dio in flight. 
5320 */ 5321 inode_dio_wait(inode); 5322 } 5323 5324 down_write(&EXT4_I(inode)->i_mmap_sem); 5325 5326 rc = ext4_break_layouts(inode); 5327 if (rc) { 5328 up_write(&EXT4_I(inode)->i_mmap_sem); 5329 return rc; 5330 } 5331 5332 if (attr->ia_size != inode->i_size) { 5333 handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); 5334 if (IS_ERR(handle)) { 5335 error = PTR_ERR(handle); 5336 goto out_mmap_sem; 5337 } 5338 if (ext4_handle_valid(handle) && shrink) { 5339 error = ext4_orphan_add(handle, inode); 5340 orphan = 1; 5341 } 5342 /* 5343 * Update c/mtime on truncate up, ext4_truncate() will 5344 * update c/mtime in shrink case below 5345 */ 5346 if (!shrink) { 5347 inode->i_mtime = current_time(inode); 5348 inode->i_ctime = inode->i_mtime; 5349 } 5350 down_write(&EXT4_I(inode)->i_data_sem); 5351 EXT4_I(inode)->i_disksize = attr->ia_size; 5352 rc = ext4_mark_inode_dirty(handle, inode); 5353 if (!error) 5354 error = rc; 5355 /* 5356 * We have to update i_size under i_data_sem together 5357 * with i_disksize to avoid races with writeback code 5358 * running ext4_wb_update_i_disksize(). 5359 */ 5360 if (!error) 5361 i_size_write(inode, attr->ia_size); 5362 up_write(&EXT4_I(inode)->i_data_sem); 5363 ext4_journal_stop(handle); 5364 if (error) 5365 goto out_mmap_sem; 5366 if (!shrink) { 5367 pagecache_isize_extended(inode, oldsize, 5368 inode->i_size); 5369 } else if (ext4_should_journal_data(inode)) { 5370 ext4_wait_for_tail_page_commit(inode); 5371 } 5372 } 5373 5374 /* 5375 * Truncate pagecache after we've waited for commit 5376 * in data=journal mode to make pages freeable. 5377 */ 5378 truncate_pagecache(inode, inode->i_size); 5379 /* 5380 * Call ext4_truncate() even if i_size didn't change to 5381 * truncate possible preallocated blocks. 5382 */ 5383 if (attr->ia_size <= oldsize) { 5384 rc = ext4_truncate(inode); 5385 if (rc) 5386 error = rc; 5387 } 5388 out_mmap_sem: 5389 up_write(&EXT4_I(inode)->i_mmap_sem); 5390 } 5391 5392 if (!error) { 5393 setattr_copy(inode, attr); 5394 mark_inode_dirty(inode); 5395 } 5396 5397 /* 5398 * If the call to ext4_truncate failed to get a transaction handle at 5399 * all, we need to clean up the in-core orphan list manually. 
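 * (Passing a NULL handle to ext4_orphan_del() below does just that: without
 * a handle it can only drop the inode from the in-core orphan list.)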
5400 */ 5401 if (orphan && inode->i_nlink) 5402 ext4_orphan_del(NULL, inode); 5403 5404 if (!error && (ia_valid & ATTR_MODE)) 5405 rc = posix_acl_chmod(inode, inode->i_mode); 5406 5407 err_out: 5408 ext4_std_error(inode->i_sb, error); 5409 if (!error) 5410 error = rc; 5411 return error; 5412 } 5413 5414 int ext4_getattr(const struct path *path, struct kstat *stat, 5415 u32 request_mask, unsigned int query_flags) 5416 { 5417 struct inode *inode = d_inode(path->dentry); 5418 struct ext4_inode *raw_inode; 5419 struct ext4_inode_info *ei = EXT4_I(inode); 5420 unsigned int flags; 5421 5422 if ((request_mask & STATX_BTIME) && 5423 EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) { 5424 stat->result_mask |= STATX_BTIME; 5425 stat->btime.tv_sec = ei->i_crtime.tv_sec; 5426 stat->btime.tv_nsec = ei->i_crtime.tv_nsec; 5427 } 5428 5429 flags = ei->i_flags & EXT4_FL_USER_VISIBLE; 5430 if (flags & EXT4_APPEND_FL) 5431 stat->attributes |= STATX_ATTR_APPEND; 5432 if (flags & EXT4_COMPR_FL) 5433 stat->attributes |= STATX_ATTR_COMPRESSED; 5434 if (flags & EXT4_ENCRYPT_FL) 5435 stat->attributes |= STATX_ATTR_ENCRYPTED; 5436 if (flags & EXT4_IMMUTABLE_FL) 5437 stat->attributes |= STATX_ATTR_IMMUTABLE; 5438 if (flags & EXT4_NODUMP_FL) 5439 stat->attributes |= STATX_ATTR_NODUMP; 5440 if (flags & EXT4_VERITY_FL) 5441 stat->attributes |= STATX_ATTR_VERITY; 5442 5443 stat->attributes_mask |= (STATX_ATTR_APPEND | 5444 STATX_ATTR_COMPRESSED | 5445 STATX_ATTR_ENCRYPTED | 5446 STATX_ATTR_IMMUTABLE | 5447 STATX_ATTR_NODUMP | 5448 STATX_ATTR_VERITY); 5449 5450 generic_fillattr(inode, stat); 5451 return 0; 5452 } 5453 5454 int ext4_file_getattr(const struct path *path, struct kstat *stat, 5455 u32 request_mask, unsigned int query_flags) 5456 { 5457 struct inode *inode = d_inode(path->dentry); 5458 u64 delalloc_blocks; 5459 5460 ext4_getattr(path, stat, request_mask, query_flags); 5461 5462 /* 5463 * If there is inline data in the inode, the inode will normally not 5464 * have data blocks allocated (it may have an external xattr block). 5465 * Report at least one sector for such files, so tools like tar, rsync 5466 * and others don't incorrectly think the file is completely sparse. 5467 */ 5468 if (unlikely(ext4_has_inline_data(inode))) 5469 stat->blocks += (stat->size + 511) >> 9; 5470 5471 /* 5472 * We can't update i_blocks if the block allocation is delayed, 5473 * otherwise in the case of a system crash before the real block 5474 * allocation is done we would have i_blocks inconsistent with 5475 * the on-disk file blocks. 5476 * We always keep i_blocks updated together with the real 5477 * allocation. But so as not to confuse userspace, stat 5478 * will report block counts that include the delayed allocation 5479 * blocks for this file. 5480 */ 5481 delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb), 5482 EXT4_I(inode)->i_reserved_data_blocks); 5483 stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits - 9); 5484 return 0; 5485 } 5486 5487 static int ext4_index_trans_blocks(struct inode *inode, int lblocks, 5488 int pextents) 5489 { 5490 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 5491 return ext4_ind_trans_blocks(inode, lblocks); 5492 return ext4_ext_index_trans_blocks(inode, pextents); 5493 } 5494 5495 /* 5496 * Account for index blocks, block group bitmaps and block group 5497 * descriptor blocks when we modify data blocks and index blocks. 5498 * In the worst case, the index blocks are spread over different block groups. 5499 * 5500 * If the data blocks are discontiguous, they may spread over 5501 * different block groups too.
5502  * block groups too. Even contiguous data blocks can cross a block
5503  * group boundary when flex_bg is enabled.
5504  * Also account for superblock, inode, quota and xattr blocks.
5505  */
5506 static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
5507 				  int pextents)
5508 {
5509 	ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
5510 	int gdpblocks;
5511 	int idxblocks;
5512 	int ret = 0;
5513 
5514 	/*
5515 	 * How many index blocks do we need to touch to map @lblocks logical
5516 	 * blocks to @pextents physical extents?
5517 	 */
5518 	idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);
5519 
5520 	ret = idxblocks;
5521 
5522 	/*
5523 	 * Now let's see how many group bitmaps and group descriptors need
5524 	 * to be accounted for.
5525 	 */
5526 	groups = idxblocks + pextents;
5527 	gdpblocks = groups;
5528 	if (groups > ngroups)
5529 		groups = ngroups;
5530 	if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
5531 		gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
5532 
5533 	/* bitmaps and block group descriptor blocks */
5534 	ret += groups + gdpblocks;
5535 
5536 	/* Blocks for super block, inode, quota and xattr blocks */
5537 	ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);
5538 
5539 	return ret;
5540 }
5541 
5542 /*
5543  * Calculate the total number of credits to reserve to fit
5544  * the modification of a single page into a single transaction,
5545  * which may include multiple chunks of block allocations.
5546  *
5547  * This could be called via ext4_write_begin().
5548  *
5549  * We need to consider the worst case, when
5550  * one new block is allocated per extent.
5551  */
5552 int ext4_writepage_trans_blocks(struct inode *inode)
5553 {
5554 	int bpp = ext4_journal_blocks_per_page(inode);
5555 	int ret;
5556 
5557 	ret = ext4_meta_trans_blocks(inode, bpp, bpp);
5558 
5559 	/* Account for data blocks for journalled mode */
5560 	if (ext4_should_journal_data(inode))
5561 		ret += bpp;
5562 	return ret;
5563 }
5564 
5565 /*
5566  * Calculate the journal credits for a chunk of data modification.
5567  *
5568  * This is called from DIO, fallocate or whatever else calls
5569  * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks.
5570  *
5571  * Journal buffers for data blocks are not included here, as DIO
5572  * and fallocate do not need to journal data buffers.
5573  */
5574 int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
5575 {
5576 	return ext4_meta_trans_blocks(inode, nrblocks, 1);
5577 }
5578 
5579 /*
5580  * The caller must have previously called ext4_reserve_inode_write().
5581  * Given this, we know that the caller already has write access to iloc->bh.
5582  */
5583 int ext4_mark_iloc_dirty(handle_t *handle,
5584 			 struct inode *inode, struct ext4_iloc *iloc)
5585 {
5586 	int err = 0;
5587 
5588 	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) {
5589 		put_bh(iloc->bh);
5590 		return -EIO;
5591 	}
5592 	if (IS_I_VERSION(inode))
5593 		inode_inc_iversion(inode);
5594 
5595 	/* ext4_do_update_inode() consumes one bh->b_count reference */
5596 	get_bh(iloc->bh);
5597 
5598 	/* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
5599 	err = ext4_do_update_inode(handle, inode, iloc);
5600 	put_bh(iloc->bh);
5601 	return err;
5602 }
5603 
5604 /*
5605  * On success, we end up with an outstanding reference count against
5606  * iloc->bh. This _must_ be cleaned up later.
5607 */ 5608 5609 int 5610 ext4_reserve_inode_write(handle_t *handle, struct inode *inode, 5611 struct ext4_iloc *iloc) 5612 { 5613 int err; 5614 5615 if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) 5616 return -EIO; 5617 5618 err = ext4_get_inode_loc(inode, iloc); 5619 if (!err) { 5620 BUFFER_TRACE(iloc->bh, "get_write_access"); 5621 err = ext4_journal_get_write_access(handle, iloc->bh); 5622 if (err) { 5623 brelse(iloc->bh); 5624 iloc->bh = NULL; 5625 } 5626 } 5627 ext4_std_error(inode->i_sb, err); 5628 return err; 5629 } 5630 5631 static int __ext4_expand_extra_isize(struct inode *inode, 5632 unsigned int new_extra_isize, 5633 struct ext4_iloc *iloc, 5634 handle_t *handle, int *no_expand) 5635 { 5636 struct ext4_inode *raw_inode; 5637 struct ext4_xattr_ibody_header *header; 5638 unsigned int inode_size = EXT4_INODE_SIZE(inode->i_sb); 5639 struct ext4_inode_info *ei = EXT4_I(inode); 5640 int error; 5641 5642 /* this was checked at iget time, but double check for good measure */ 5643 if ((EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > inode_size) || 5644 (ei->i_extra_isize & 3)) { 5645 EXT4_ERROR_INODE(inode, "bad extra_isize %u (inode size %u)", 5646 ei->i_extra_isize, 5647 EXT4_INODE_SIZE(inode->i_sb)); 5648 return -EFSCORRUPTED; 5649 } 5650 if ((new_extra_isize < ei->i_extra_isize) || 5651 (new_extra_isize < 4) || 5652 (new_extra_isize > inode_size - EXT4_GOOD_OLD_INODE_SIZE)) 5653 return -EINVAL; /* Should never happen */ 5654 5655 raw_inode = ext4_raw_inode(iloc); 5656 5657 header = IHDR(inode, raw_inode); 5658 5659 /* No extended attributes present */ 5660 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || 5661 header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { 5662 memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + 5663 EXT4_I(inode)->i_extra_isize, 0, 5664 new_extra_isize - EXT4_I(inode)->i_extra_isize); 5665 EXT4_I(inode)->i_extra_isize = new_extra_isize; 5666 return 0; 5667 } 5668 5669 /* try to expand with EAs present */ 5670 error = ext4_expand_extra_isize_ea(inode, new_extra_isize, 5671 raw_inode, handle); 5672 if (error) { 5673 /* 5674 * Inode size expansion failed; don't try again 5675 */ 5676 *no_expand = 1; 5677 } 5678 5679 return error; 5680 } 5681 5682 /* 5683 * Expand an inode by new_extra_isize bytes. 5684 * Returns 0 on success or negative error number on failure. 5685 */ 5686 static int ext4_try_to_expand_extra_isize(struct inode *inode, 5687 unsigned int new_extra_isize, 5688 struct ext4_iloc iloc, 5689 handle_t *handle) 5690 { 5691 int no_expand; 5692 int error; 5693 5694 if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) 5695 return -EOVERFLOW; 5696 5697 /* 5698 * In nojournal mode, we can immediately attempt to expand 5699 * the inode. When journaled, we first need to obtain extra 5700 * buffer credits since we may write into the EA block 5701 * with this same handle. If journal_extend fails, then it will 5702 * only result in a minor loss of functionality for that inode. 5703 * If this is felt to be critical, then e2fsck should be run to 5704 * force a large enough s_min_extra_isize. 
5705 */ 5706 if (ext4_journal_extend(handle, 5707 EXT4_DATA_TRANS_BLOCKS(inode->i_sb), 0) != 0) 5708 return -ENOSPC; 5709 5710 if (ext4_write_trylock_xattr(inode, &no_expand) == 0) 5711 return -EBUSY; 5712 5713 error = __ext4_expand_extra_isize(inode, new_extra_isize, &iloc, 5714 handle, &no_expand); 5715 ext4_write_unlock_xattr(inode, &no_expand); 5716 5717 return error; 5718 } 5719 5720 int ext4_expand_extra_isize(struct inode *inode, 5721 unsigned int new_extra_isize, 5722 struct ext4_iloc *iloc) 5723 { 5724 handle_t *handle; 5725 int no_expand; 5726 int error, rc; 5727 5728 if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { 5729 brelse(iloc->bh); 5730 return -EOVERFLOW; 5731 } 5732 5733 handle = ext4_journal_start(inode, EXT4_HT_INODE, 5734 EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); 5735 if (IS_ERR(handle)) { 5736 error = PTR_ERR(handle); 5737 brelse(iloc->bh); 5738 return error; 5739 } 5740 5741 ext4_write_lock_xattr(inode, &no_expand); 5742 5743 BUFFER_TRACE(iloc->bh, "get_write_access"); 5744 error = ext4_journal_get_write_access(handle, iloc->bh); 5745 if (error) { 5746 brelse(iloc->bh); 5747 goto out_unlock; 5748 } 5749 5750 error = __ext4_expand_extra_isize(inode, new_extra_isize, iloc, 5751 handle, &no_expand); 5752 5753 rc = ext4_mark_iloc_dirty(handle, inode, iloc); 5754 if (!error) 5755 error = rc; 5756 5757 out_unlock: 5758 ext4_write_unlock_xattr(inode, &no_expand); 5759 ext4_journal_stop(handle); 5760 return error; 5761 } 5762 5763 /* 5764 * What we do here is to mark the in-core inode as clean with respect to inode 5765 * dirtiness (it may still be data-dirty). 5766 * This means that the in-core inode may be reaped by prune_icache 5767 * without having to perform any I/O. This is a very good thing, 5768 * because *any* task may call prune_icache - even ones which 5769 * have a transaction open against a different journal. 5770 * 5771 * Is this cheating? Not really. Sure, we haven't written the 5772 * inode out, but prune_icache isn't a user-visible syncing function. 5773 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) 5774 * we start and wait on commits. 5775 */ 5776 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) 5777 { 5778 struct ext4_iloc iloc; 5779 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 5780 int err; 5781 5782 might_sleep(); 5783 trace_ext4_mark_inode_dirty(inode, _RET_IP_); 5784 err = ext4_reserve_inode_write(handle, inode, &iloc); 5785 if (err) 5786 return err; 5787 5788 if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize) 5789 ext4_try_to_expand_extra_isize(inode, sbi->s_want_extra_isize, 5790 iloc, handle); 5791 5792 return ext4_mark_iloc_dirty(handle, inode, &iloc); 5793 } 5794 5795 /* 5796 * ext4_dirty_inode() is called from __mark_inode_dirty() 5797 * 5798 * We're really interested in the case where a file is being extended. 5799 * i_size has been changed by generic_commit_write() and we thus need 5800 * to include the updated inode in the current transaction. 5801 * 5802 * Also, dquot_alloc_block() will always dirty the inode when blocks 5803 * are allocated to the file. 5804 * 5805 * If the inode is marked synchronous, we don't honour that here - doing 5806 * so would cause a commit on atime updates, which we don't bother doing. 5807 * We handle synchronous inodes at the highest possible level. 5808 * 5809 * If only the I_DIRTY_TIME flag is set, we can skip everything. 
If
5810  * I_DIRTY_TIME and I_DIRTY_SYNC are set, the only inode fields we need
5811  * to copy into the on-disk inode structure are the timestamp fields.
5812  */
5813 void ext4_dirty_inode(struct inode *inode, int flags)
5814 {
5815 	handle_t *handle;
5816 
5817 	if (flags == I_DIRTY_TIME)
5818 		return;
5819 	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
5820 	if (IS_ERR(handle))
5821 		goto out;
5822 
5823 	ext4_mark_inode_dirty(handle, inode);
5824 
5825 	ext4_journal_stop(handle);
5826 out:
5827 	return;
5828 }
5829 
5830 int ext4_change_inode_journal_flag(struct inode *inode, int val)
5831 {
5832 	journal_t *journal;
5833 	handle_t *handle;
5834 	int err;
5835 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5836 
5837 	/*
5838 	 * We have to be very careful here: changing a data block's
5839 	 * journaling status dynamically is dangerous. If we write a
5840 	 * data block to the journal, change the status and then delete
5841 	 * that block, we risk forgetting to revoke the old log record
5842 	 * from the journal and so a subsequent replay can corrupt data.
5843 	 * So, first we make sure that the journal is empty and that
5844 	 * nobody is changing anything.
5845 	 */
5846 
5847 	journal = EXT4_JOURNAL(inode);
5848 	if (!journal)
5849 		return 0;
5850 	if (is_journal_aborted(journal))
5851 		return -EROFS;
5852 
5853 	/* Wait for all existing dio workers */
5854 	inode_dio_wait(inode);
5855 
5856 	/*
5857 	 * Before flushing the journal and switching the inode's aops, we have
5858 	 * to flush all dirty data the inode has. There can be outstanding
5859 	 * delayed allocations, and there can be unwritten extents created by
5860 	 * fallocate or buffered writes in dioread_nolock mode covered by
5861 	 * dirty data which can be converted only after flushing the dirty
5862 	 * data (and journalled aops don't know how to handle these cases).
5863 	 */
5864 	if (val) {
5865 		down_write(&EXT4_I(inode)->i_mmap_sem);
5866 		err = filemap_write_and_wait(inode->i_mapping);
5867 		if (err < 0) {
5868 			up_write(&EXT4_I(inode)->i_mmap_sem);
5869 			return err;
5870 		}
5871 	}
5872 
5873 	percpu_down_write(&sbi->s_writepages_rwsem);
5874 	jbd2_journal_lock_updates(journal);
5875 
5876 	/*
5877 	 * OK, there are no updates running now, and all cached data is
5878 	 * synced to disk. We are now in a completely consistent state
5879 	 * which doesn't have anything in the journal, and we know that
5880 	 * no filesystem updates are running, so it is safe to modify
5881 	 * the inode's in-core data-journaling state flag now.
5882 	 */
5883 
5884 	if (val)
5885 		ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5886 	else {
5887 		err = jbd2_journal_flush(journal);
5888 		if (err < 0) {
5889 			jbd2_journal_unlock_updates(journal);
5890 			percpu_up_write(&sbi->s_writepages_rwsem);
5891 			return err;
5892 		}
5893 		ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5894 	}
5895 	ext4_set_aops(inode);
5896 
5897 	jbd2_journal_unlock_updates(journal);
5898 	percpu_up_write(&sbi->s_writepages_rwsem);
5899 
5900 	if (val)
5901 		up_write(&EXT4_I(inode)->i_mmap_sem);
5902 
5903 	/* Finally we can mark the inode as dirty.
*/ 5904 5905 handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); 5906 if (IS_ERR(handle)) 5907 return PTR_ERR(handle); 5908 5909 err = ext4_mark_inode_dirty(handle, inode); 5910 ext4_handle_sync(handle); 5911 ext4_journal_stop(handle); 5912 ext4_std_error(inode->i_sb, err); 5913 5914 return err; 5915 } 5916 5917 static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) 5918 { 5919 return !buffer_mapped(bh); 5920 } 5921 5922 vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) 5923 { 5924 struct vm_area_struct *vma = vmf->vma; 5925 struct page *page = vmf->page; 5926 loff_t size; 5927 unsigned long len; 5928 int err; 5929 vm_fault_t ret; 5930 struct file *file = vma->vm_file; 5931 struct inode *inode = file_inode(file); 5932 struct address_space *mapping = inode->i_mapping; 5933 handle_t *handle; 5934 get_block_t *get_block; 5935 int retries = 0; 5936 5937 if (unlikely(IS_IMMUTABLE(inode))) 5938 return VM_FAULT_SIGBUS; 5939 5940 sb_start_pagefault(inode->i_sb); 5941 file_update_time(vma->vm_file); 5942 5943 down_read(&EXT4_I(inode)->i_mmap_sem); 5944 5945 err = ext4_convert_inline_data(inode); 5946 if (err) 5947 goto out_ret; 5948 5949 /* Delalloc case is easy... */ 5950 if (test_opt(inode->i_sb, DELALLOC) && 5951 !ext4_should_journal_data(inode) && 5952 !ext4_nonda_switch(inode->i_sb)) { 5953 do { 5954 err = block_page_mkwrite(vma, vmf, 5955 ext4_da_get_block_prep); 5956 } while (err == -ENOSPC && 5957 ext4_should_retry_alloc(inode->i_sb, &retries)); 5958 goto out_ret; 5959 } 5960 5961 lock_page(page); 5962 size = i_size_read(inode); 5963 /* Page got truncated from under us? */ 5964 if (page->mapping != mapping || page_offset(page) > size) { 5965 unlock_page(page); 5966 ret = VM_FAULT_NOPAGE; 5967 goto out; 5968 } 5969 5970 if (page->index == size >> PAGE_SHIFT) 5971 len = size & ~PAGE_MASK; 5972 else 5973 len = PAGE_SIZE; 5974 /* 5975 * Return if we have all the buffers mapped. This avoids the need to do 5976 * journal_start/journal_stop which can block and take a long time 5977 */ 5978 if (page_has_buffers(page)) { 5979 if (!ext4_walk_page_buffers(NULL, page_buffers(page), 5980 0, len, NULL, 5981 ext4_bh_unmapped)) { 5982 /* Wait so that we don't change page under IO */ 5983 wait_for_stable_page(page); 5984 ret = VM_FAULT_LOCKED; 5985 goto out; 5986 } 5987 } 5988 unlock_page(page); 5989 /* OK, we need to fill the hole... 
*/ 5990 if (ext4_should_dioread_nolock(inode)) 5991 get_block = ext4_get_block_unwritten; 5992 else 5993 get_block = ext4_get_block; 5994 retry_alloc: 5995 handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 5996 ext4_writepage_trans_blocks(inode)); 5997 if (IS_ERR(handle)) { 5998 ret = VM_FAULT_SIGBUS; 5999 goto out; 6000 } 6001 err = block_page_mkwrite(vma, vmf, get_block); 6002 if (!err && ext4_should_journal_data(inode)) { 6003 if (ext4_walk_page_buffers(handle, page_buffers(page), 0, 6004 PAGE_SIZE, NULL, do_journal_get_write_access)) { 6005 unlock_page(page); 6006 ret = VM_FAULT_SIGBUS; 6007 ext4_journal_stop(handle); 6008 goto out; 6009 } 6010 ext4_set_inode_state(inode, EXT4_STATE_JDATA); 6011 } 6012 ext4_journal_stop(handle); 6013 if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 6014 goto retry_alloc; 6015 out_ret: 6016 ret = block_page_mkwrite_return(err); 6017 out: 6018 up_read(&EXT4_I(inode)->i_mmap_sem); 6019 sb_end_pagefault(inode->i_sb); 6020 return ret; 6021 } 6022 6023 vm_fault_t ext4_filemap_fault(struct vm_fault *vmf) 6024 { 6025 struct inode *inode = file_inode(vmf->vma->vm_file); 6026 vm_fault_t ret; 6027 6028 down_read(&EXT4_I(inode)->i_mmap_sem); 6029 ret = filemap_fault(vmf); 6030 up_read(&EXT4_I(inode)->i_mmap_sem); 6031 6032 return ret; 6033 } 6034
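/*
 * Illustrative sketch, not part of the ext4 sources: the credit estimate in
 * ext4_meta_trans_blocks() and ext4_writepage_trans_blocks() above is plain
 * arithmetic over a few per-filesystem constants.  The standalone user-space
 * program below mirrors that worst-case calculation for a single page.  The
 * constants (EXAMPLE_NGROUPS, EXAMPLE_GDB_COUNT, EXAMPLE_META_OVERHEAD) and
 * the helper names are hypothetical stand-ins chosen only for this example,
 * not values or symbols taken from any real filesystem.
 */
#include <stdio.h>

/* hypothetical per-filesystem values, chosen only for the example */
#define EXAMPLE_NGROUPS		128	/* stands in for ext4_get_groups_count() */
#define EXAMPLE_GDB_COUNT	2	/* stands in for sbi->s_gdb_count */
#define EXAMPLE_META_OVERHEAD	62	/* stands in for EXT4_META_TRANS_BLOCKS() */

/* mirrors the shape of ext4_meta_trans_blocks() above */
static int example_meta_trans_blocks(int idxblocks, int pextents)
{
	int groups = idxblocks + pextents;	/* bitmaps possibly touched */
	int gdpblocks = groups;			/* descriptors possibly touched */
	int ret = idxblocks;

	/* clamp to what actually exists on the example filesystem */
	if (groups > EXAMPLE_NGROUPS)
		groups = EXAMPLE_NGROUPS;
	if (groups > EXAMPLE_GDB_COUNT)
		gdpblocks = EXAMPLE_GDB_COUNT;

	ret += groups + gdpblocks;		/* bitmaps + group descriptors */
	ret += EXAMPLE_META_OVERHEAD;		/* sb, inode, quota, xattr blocks */
	return ret;
}

int main(void)
{
	/*
	 * One page holding a single block: one data block, and we assume one
	 * index/extent-tree block is touched in the worst case.  A data=journal
	 * mount would add one more credit per data block on top of this.
	 */
	int bpp = 1;
	int idxblocks = 1;
	int credits = example_meta_trans_blocks(idxblocks, bpp);

	printf("worst-case credits for one page: %d\n", credits);	/* prints 67 */
	return 0;
}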