1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * linux/fs/ext4/super.c 4 * 5 * Copyright (C) 1992, 1993, 1994, 1995 6 * Remy Card (card@masi.ibp.fr) 7 * Laboratoire MASI - Institut Blaise Pascal 8 * Universite Pierre et Marie Curie (Paris VI) 9 * 10 * from 11 * 12 * linux/fs/minix/inode.c 13 * 14 * Copyright (C) 1991, 1992 Linus Torvalds 15 * 16 * Big-endian to little-endian byte-swapping/bitmaps by 17 * David S. Miller (davem@caip.rutgers.edu), 1995 18 */ 19 20 #include <linux/module.h> 21 #include <linux/string.h> 22 #include <linux/fs.h> 23 #include <linux/time.h> 24 #include <linux/vmalloc.h> 25 #include <linux/slab.h> 26 #include <linux/init.h> 27 #include <linux/blkdev.h> 28 #include <linux/backing-dev.h> 29 #include <linux/parser.h> 30 #include <linux/buffer_head.h> 31 #include <linux/exportfs.h> 32 #include <linux/vfs.h> 33 #include <linux/random.h> 34 #include <linux/mount.h> 35 #include <linux/namei.h> 36 #include <linux/quotaops.h> 37 #include <linux/seq_file.h> 38 #include <linux/ctype.h> 39 #include <linux/log2.h> 40 #include <linux/crc16.h> 41 #include <linux/dax.h> 42 #include <linux/cleancache.h> 43 #include <linux/uaccess.h> 44 #include <linux/iversion.h> 45 #include <linux/unicode.h> 46 #include <linux/part_stat.h> 47 #include <linux/kthread.h> 48 #include <linux/freezer.h> 49 50 #include "ext4.h" 51 #include "ext4_extents.h" /* Needed for trace points definition */ 52 #include "ext4_jbd2.h" 53 #include "xattr.h" 54 #include "acl.h" 55 #include "mballoc.h" 56 #include "fsmap.h" 57 58 #define CREATE_TRACE_POINTS 59 #include <trace/events/ext4.h> 60 61 static struct ext4_lazy_init *ext4_li_info; 62 static DEFINE_MUTEX(ext4_li_mtx); 63 static struct ratelimit_state ext4_mount_msg_ratelimit; 64 65 static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 66 unsigned long journal_devnum); 67 static int ext4_show_options(struct seq_file *seq, struct dentry *root); 68 static void ext4_update_super(struct super_block *sb); 69 static int 
ext4_commit_super(struct super_block *sb); 70 static int ext4_mark_recovery_complete(struct super_block *sb, 71 struct ext4_super_block *es); 72 static int ext4_clear_journal_err(struct super_block *sb, 73 struct ext4_super_block *es); 74 static int ext4_sync_fs(struct super_block *sb, int wait); 75 static int ext4_remount(struct super_block *sb, int *flags, char *data); 76 static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); 77 static int ext4_unfreeze(struct super_block *sb); 78 static int ext4_freeze(struct super_block *sb); 79 static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, 80 const char *dev_name, void *data); 81 static inline int ext2_feature_set_ok(struct super_block *sb); 82 static inline int ext3_feature_set_ok(struct super_block *sb); 83 static int ext4_feature_set_ok(struct super_block *sb, int readonly); 84 static void ext4_destroy_lazyinit_thread(void); 85 static void ext4_unregister_li_request(struct super_block *sb); 86 static void ext4_clear_request_list(void); 87 static struct inode *ext4_get_journal_inode(struct super_block *sb, 88 unsigned int journal_inum); 89 90 /* 91 * Lock ordering 92 * 93 * Note the difference between i_mmap_sem (EXT4_I(inode)->i_mmap_sem) and 94 * i_mmap_rwsem (inode->i_mmap_rwsem)! 
95 * 96 * page fault path: 97 * mmap_lock -> sb_start_pagefault -> i_mmap_sem (r) -> transaction start -> 98 * page lock -> i_data_sem (rw) 99 * 100 * buffered write path: 101 * sb_start_write -> i_mutex -> mmap_lock 102 * sb_start_write -> i_mutex -> transaction start -> page lock -> 103 * i_data_sem (rw) 104 * 105 * truncate: 106 * sb_start_write -> i_mutex -> i_mmap_sem (w) -> i_mmap_rwsem (w) -> page lock 107 * sb_start_write -> i_mutex -> i_mmap_sem (w) -> transaction start -> 108 * i_data_sem (rw) 109 * 110 * direct IO: 111 * sb_start_write -> i_mutex -> mmap_lock 112 * sb_start_write -> i_mutex -> transaction start -> i_data_sem (rw) 113 * 114 * writepages: 115 * transaction start -> page lock(s) -> i_data_sem (rw) 116 */ 117 118 #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2) 119 static struct file_system_type ext2_fs_type = { 120 .owner = THIS_MODULE, 121 .name = "ext2", 122 .mount = ext4_mount, 123 .kill_sb = kill_block_super, 124 .fs_flags = FS_REQUIRES_DEV, 125 }; 126 MODULE_ALIAS_FS("ext2"); 127 MODULE_ALIAS("ext2"); 128 #define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type) 129 #else 130 #define IS_EXT2_SB(sb) (0) 131 #endif 132 133 134 static struct file_system_type ext3_fs_type = { 135 .owner = THIS_MODULE, 136 .name = "ext3", 137 .mount = ext4_mount, 138 .kill_sb = kill_block_super, 139 .fs_flags = FS_REQUIRES_DEV, 140 }; 141 MODULE_ALIAS_FS("ext3"); 142 MODULE_ALIAS("ext3"); 143 #define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type) 144 145 146 static inline void __ext4_read_bh(struct buffer_head *bh, int op_flags, 147 bh_end_io_t *end_io) 148 { 149 /* 150 * buffer's verified bit is no longer valid after reading from 151 * disk again due to write out error, clear it to make sure we 152 * recheck the buffer contents. 153 */ 154 clear_buffer_verified(bh); 155 156 bh->b_end_io = end_io ? 
end_io : end_buffer_read_sync; 157 get_bh(bh); 158 submit_bh(REQ_OP_READ, op_flags, bh); 159 } 160 161 void ext4_read_bh_nowait(struct buffer_head *bh, int op_flags, 162 bh_end_io_t *end_io) 163 { 164 BUG_ON(!buffer_locked(bh)); 165 166 if (ext4_buffer_uptodate(bh)) { 167 unlock_buffer(bh); 168 return; 169 } 170 __ext4_read_bh(bh, op_flags, end_io); 171 } 172 173 int ext4_read_bh(struct buffer_head *bh, int op_flags, bh_end_io_t *end_io) 174 { 175 BUG_ON(!buffer_locked(bh)); 176 177 if (ext4_buffer_uptodate(bh)) { 178 unlock_buffer(bh); 179 return 0; 180 } 181 182 __ext4_read_bh(bh, op_flags, end_io); 183 184 wait_on_buffer(bh); 185 if (buffer_uptodate(bh)) 186 return 0; 187 return -EIO; 188 } 189 190 int ext4_read_bh_lock(struct buffer_head *bh, int op_flags, bool wait) 191 { 192 if (trylock_buffer(bh)) { 193 if (wait) 194 return ext4_read_bh(bh, op_flags, NULL); 195 ext4_read_bh_nowait(bh, op_flags, NULL); 196 return 0; 197 } 198 if (wait) { 199 wait_on_buffer(bh); 200 if (buffer_uptodate(bh)) 201 return 0; 202 return -EIO; 203 } 204 return 0; 205 } 206 207 /* 208 * This works like __bread_gfp() except it uses ERR_PTR for error 209 * returns. Currently with sb_bread it's impossible to distinguish 210 * between ENOMEM and EIO situations (since both result in a NULL 211 * return. 
212 */ 213 static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb, 214 sector_t block, int op_flags, 215 gfp_t gfp) 216 { 217 struct buffer_head *bh; 218 int ret; 219 220 bh = sb_getblk_gfp(sb, block, gfp); 221 if (bh == NULL) 222 return ERR_PTR(-ENOMEM); 223 if (ext4_buffer_uptodate(bh)) 224 return bh; 225 226 ret = ext4_read_bh_lock(bh, REQ_META | op_flags, true); 227 if (ret) { 228 put_bh(bh); 229 return ERR_PTR(ret); 230 } 231 return bh; 232 } 233 234 struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block, 235 int op_flags) 236 { 237 return __ext4_sb_bread_gfp(sb, block, op_flags, __GFP_MOVABLE); 238 } 239 240 struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, 241 sector_t block) 242 { 243 return __ext4_sb_bread_gfp(sb, block, 0, 0); 244 } 245 246 void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block) 247 { 248 struct buffer_head *bh = sb_getblk_gfp(sb, block, 0); 249 250 if (likely(bh)) { 251 ext4_read_bh_lock(bh, REQ_RAHEAD, false); 252 brelse(bh); 253 } 254 } 255 256 static int ext4_verify_csum_type(struct super_block *sb, 257 struct ext4_super_block *es) 258 { 259 if (!ext4_has_feature_metadata_csum(sb)) 260 return 1; 261 262 return es->s_checksum_type == EXT4_CRC32C_CHKSUM; 263 } 264 265 static __le32 ext4_superblock_csum(struct super_block *sb, 266 struct ext4_super_block *es) 267 { 268 struct ext4_sb_info *sbi = EXT4_SB(sb); 269 int offset = offsetof(struct ext4_super_block, s_checksum); 270 __u32 csum; 271 272 csum = ext4_chksum(sbi, ~0, (char *)es, offset); 273 274 return cpu_to_le32(csum); 275 } 276 277 static int ext4_superblock_csum_verify(struct super_block *sb, 278 struct ext4_super_block *es) 279 { 280 if (!ext4_has_metadata_csum(sb)) 281 return 1; 282 283 return es->s_checksum == ext4_superblock_csum(sb, es); 284 } 285 286 void ext4_superblock_csum_set(struct super_block *sb) 287 { 288 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 289 290 if (!ext4_has_metadata_csum(sb)) 291 
return; 292 293 es->s_checksum = ext4_superblock_csum(sb, es); 294 } 295 296 ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, 297 struct ext4_group_desc *bg) 298 { 299 return le32_to_cpu(bg->bg_block_bitmap_lo) | 300 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? 301 (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0); 302 } 303 304 ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, 305 struct ext4_group_desc *bg) 306 { 307 return le32_to_cpu(bg->bg_inode_bitmap_lo) | 308 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? 309 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0); 310 } 311 312 ext4_fsblk_t ext4_inode_table(struct super_block *sb, 313 struct ext4_group_desc *bg) 314 { 315 return le32_to_cpu(bg->bg_inode_table_lo) | 316 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? 317 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); 318 } 319 320 __u32 ext4_free_group_clusters(struct super_block *sb, 321 struct ext4_group_desc *bg) 322 { 323 return le16_to_cpu(bg->bg_free_blocks_count_lo) | 324 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? 325 (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0); 326 } 327 328 __u32 ext4_free_inodes_count(struct super_block *sb, 329 struct ext4_group_desc *bg) 330 { 331 return le16_to_cpu(bg->bg_free_inodes_count_lo) | 332 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? 333 (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0); 334 } 335 336 __u32 ext4_used_dirs_count(struct super_block *sb, 337 struct ext4_group_desc *bg) 338 { 339 return le16_to_cpu(bg->bg_used_dirs_count_lo) | 340 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? 341 (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0); 342 } 343 344 __u32 ext4_itable_unused_count(struct super_block *sb, 345 struct ext4_group_desc *bg) 346 { 347 return le16_to_cpu(bg->bg_itable_unused_lo) | 348 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? 
349 (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0); 350 } 351 352 void ext4_block_bitmap_set(struct super_block *sb, 353 struct ext4_group_desc *bg, ext4_fsblk_t blk) 354 { 355 bg->bg_block_bitmap_lo = cpu_to_le32((u32)blk); 356 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) 357 bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32); 358 } 359 360 void ext4_inode_bitmap_set(struct super_block *sb, 361 struct ext4_group_desc *bg, ext4_fsblk_t blk) 362 { 363 bg->bg_inode_bitmap_lo = cpu_to_le32((u32)blk); 364 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) 365 bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32); 366 } 367 368 void ext4_inode_table_set(struct super_block *sb, 369 struct ext4_group_desc *bg, ext4_fsblk_t blk) 370 { 371 bg->bg_inode_table_lo = cpu_to_le32((u32)blk); 372 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) 373 bg->bg_inode_table_hi = cpu_to_le32(blk >> 32); 374 } 375 376 void ext4_free_group_clusters_set(struct super_block *sb, 377 struct ext4_group_desc *bg, __u32 count) 378 { 379 bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count); 380 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) 381 bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16); 382 } 383 384 void ext4_free_inodes_set(struct super_block *sb, 385 struct ext4_group_desc *bg, __u32 count) 386 { 387 bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count); 388 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) 389 bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16); 390 } 391 392 void ext4_used_dirs_set(struct super_block *sb, 393 struct ext4_group_desc *bg, __u32 count) 394 { 395 bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count); 396 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) 397 bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16); 398 } 399 400 void ext4_itable_unused_set(struct super_block *sb, 401 struct ext4_group_desc *bg, __u32 count) 402 { 403 bg->bg_itable_unused_lo = cpu_to_le16((__u16)count); 404 if (EXT4_DESC_SIZE(sb) >= 
EXT4_MIN_DESC_SIZE_64BIT) 405 bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); 406 } 407 408 static void __ext4_update_tstamp(__le32 *lo, __u8 *hi, time64_t now) 409 { 410 now = clamp_val(now, 0, (1ull << 40) - 1); 411 412 *lo = cpu_to_le32(lower_32_bits(now)); 413 *hi = upper_32_bits(now); 414 } 415 416 static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi) 417 { 418 return ((time64_t)(*hi) << 32) + le32_to_cpu(*lo); 419 } 420 #define ext4_update_tstamp(es, tstamp) \ 421 __ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi, \ 422 ktime_get_real_seconds()) 423 #define ext4_get_tstamp(es, tstamp) \ 424 __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) 425 426 /* 427 * The del_gendisk() function uninitializes the disk-specific data 428 * structures, including the bdi structure, without telling anyone 429 * else. Once this happens, any attempt to call mark_buffer_dirty() 430 * (for example, by ext4_commit_super), will cause a kernel OOPS. 431 * This is a kludge to prevent these oops until we can put in a proper 432 * hook in del_gendisk() to inform the VFS and file system layers. 
433 */ 434 static int block_device_ejected(struct super_block *sb) 435 { 436 struct inode *bd_inode = sb->s_bdev->bd_inode; 437 struct backing_dev_info *bdi = inode_to_bdi(bd_inode); 438 439 return bdi->dev == NULL; 440 } 441 442 static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) 443 { 444 struct super_block *sb = journal->j_private; 445 struct ext4_sb_info *sbi = EXT4_SB(sb); 446 int error = is_journal_aborted(journal); 447 struct ext4_journal_cb_entry *jce; 448 449 BUG_ON(txn->t_state == T_FINISHED); 450 451 ext4_process_freed_data(sb, txn->t_tid); 452 453 spin_lock(&sbi->s_md_lock); 454 while (!list_empty(&txn->t_private_list)) { 455 jce = list_entry(txn->t_private_list.next, 456 struct ext4_journal_cb_entry, jce_list); 457 list_del_init(&jce->jce_list); 458 spin_unlock(&sbi->s_md_lock); 459 jce->jce_func(sb, jce, error); 460 spin_lock(&sbi->s_md_lock); 461 } 462 spin_unlock(&sbi->s_md_lock); 463 } 464 465 /* 466 * This writepage callback for write_cache_pages() 467 * takes care of a few cases after page cleaning. 468 * 469 * write_cache_pages() already checks for dirty pages 470 * and calls clear_page_dirty_for_io(), which we want, 471 * to write protect the pages. 472 * 473 * However, we may have to redirty a page (see below.) 474 */ 475 static int ext4_journalled_writepage_callback(struct page *page, 476 struct writeback_control *wbc, 477 void *data) 478 { 479 transaction_t *transaction = (transaction_t *) data; 480 struct buffer_head *bh, *head; 481 struct journal_head *jh; 482 483 bh = head = page_buffers(page); 484 do { 485 /* 486 * We have to redirty a page in these cases: 487 * 1) If buffer is dirty, it means the page was dirty because it 488 * contains a buffer that needs checkpointing. So the dirty bit 489 * needs to be preserved so that checkpointing writes the buffer 490 * properly. 
491 * 2) If buffer is not part of the committing transaction 492 * (we may have just accidentally come across this buffer because 493 * inode range tracking is not exact) or if the currently running 494 * transaction already contains this buffer as well, dirty bit 495 * needs to be preserved so that the buffer gets writeprotected 496 * properly on running transaction's commit. 497 */ 498 jh = bh2jh(bh); 499 if (buffer_dirty(bh) || 500 (jh && (jh->b_transaction != transaction || 501 jh->b_next_transaction))) { 502 redirty_page_for_writepage(wbc, page); 503 goto out; 504 } 505 } while ((bh = bh->b_this_page) != head); 506 507 out: 508 return AOP_WRITEPAGE_ACTIVATE; 509 } 510 511 static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode) 512 { 513 struct address_space *mapping = jinode->i_vfs_inode->i_mapping; 514 struct writeback_control wbc = { 515 .sync_mode = WB_SYNC_ALL, 516 .nr_to_write = LONG_MAX, 517 .range_start = jinode->i_dirty_start, 518 .range_end = jinode->i_dirty_end, 519 }; 520 521 return write_cache_pages(mapping, &wbc, 522 ext4_journalled_writepage_callback, 523 jinode->i_transaction); 524 } 525 526 static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) 527 { 528 int ret; 529 530 if (ext4_should_journal_data(jinode->i_vfs_inode)) 531 ret = ext4_journalled_submit_inode_data_buffers(jinode); 532 else 533 ret = jbd2_journal_submit_inode_data_buffers(jinode); 534 535 return ret; 536 } 537 538 static int ext4_journal_finish_inode_data_buffers(struct jbd2_inode *jinode) 539 { 540 int ret = 0; 541 542 if (!ext4_should_journal_data(jinode->i_vfs_inode)) 543 ret = jbd2_journal_finish_inode_data_buffers(jinode); 544 545 return ret; 546 } 547 548 static bool system_going_down(void) 549 { 550 return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF 551 || system_state == SYSTEM_RESTART; 552 } 553 554 struct ext4_err_translation { 555 int code; 556 int errno; 557 }; 558 559 #define EXT4_ERR_TRANSLATE(err) { 
.code = EXT4_ERR_##err, .errno = err } 560 561 static struct ext4_err_translation err_translation[] = { 562 EXT4_ERR_TRANSLATE(EIO), 563 EXT4_ERR_TRANSLATE(ENOMEM), 564 EXT4_ERR_TRANSLATE(EFSBADCRC), 565 EXT4_ERR_TRANSLATE(EFSCORRUPTED), 566 EXT4_ERR_TRANSLATE(ENOSPC), 567 EXT4_ERR_TRANSLATE(ENOKEY), 568 EXT4_ERR_TRANSLATE(EROFS), 569 EXT4_ERR_TRANSLATE(EFBIG), 570 EXT4_ERR_TRANSLATE(EEXIST), 571 EXT4_ERR_TRANSLATE(ERANGE), 572 EXT4_ERR_TRANSLATE(EOVERFLOW), 573 EXT4_ERR_TRANSLATE(EBUSY), 574 EXT4_ERR_TRANSLATE(ENOTDIR), 575 EXT4_ERR_TRANSLATE(ENOTEMPTY), 576 EXT4_ERR_TRANSLATE(ESHUTDOWN), 577 EXT4_ERR_TRANSLATE(EFAULT), 578 }; 579 580 static int ext4_errno_to_code(int errno) 581 { 582 int i; 583 584 for (i = 0; i < ARRAY_SIZE(err_translation); i++) 585 if (err_translation[i].errno == errno) 586 return err_translation[i].code; 587 return EXT4_ERR_UNKNOWN; 588 } 589 590 static void save_error_info(struct super_block *sb, int error, 591 __u32 ino, __u64 block, 592 const char *func, unsigned int line) 593 { 594 struct ext4_sb_info *sbi = EXT4_SB(sb); 595 596 /* We default to EFSCORRUPTED error... */ 597 if (error == 0) 598 error = EFSCORRUPTED; 599 600 spin_lock(&sbi->s_error_lock); 601 sbi->s_add_error_count++; 602 sbi->s_last_error_code = error; 603 sbi->s_last_error_line = line; 604 sbi->s_last_error_ino = ino; 605 sbi->s_last_error_block = block; 606 sbi->s_last_error_func = func; 607 sbi->s_last_error_time = ktime_get_real_seconds(); 608 if (!sbi->s_first_error_time) { 609 sbi->s_first_error_code = error; 610 sbi->s_first_error_line = line; 611 sbi->s_first_error_ino = ino; 612 sbi->s_first_error_block = block; 613 sbi->s_first_error_func = func; 614 sbi->s_first_error_time = sbi->s_last_error_time; 615 } 616 spin_unlock(&sbi->s_error_lock); 617 } 618 619 /* Deal with the reporting of failure conditions on a filesystem such as 620 * inconsistencies detected or read IO failures. 
621 * 622 * On ext2, we can store the error state of the filesystem in the 623 * superblock. That is not possible on ext4, because we may have other 624 * write ordering constraints on the superblock which prevent us from 625 * writing it out straight away; and given that the journal is about to 626 * be aborted, we can't rely on the current, or future, transactions to 627 * write out the superblock safely. 628 * 629 * We'll just use the jbd2_journal_abort() error code to record an error in 630 * the journal instead. On recovery, the journal will complain about 631 * that error until we've noted it down and cleared it. 632 * 633 * If force_ro is set, we unconditionally force the filesystem into an 634 * ABORT|READONLY state, unless the error response on the fs has been set to 635 * panic in which case we take the easy way out and panic immediately. This is 636 * used to deal with unrecoverable failures such as journal IO errors or ENOMEM 637 * at a critical moment in log management. 638 */ 639 static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, 640 __u32 ino, __u64 block, 641 const char *func, unsigned int line) 642 { 643 journal_t *journal = EXT4_SB(sb)->s_journal; 644 bool continue_fs = !force_ro && test_opt(sb, ERRORS_CONT); 645 646 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; 647 if (test_opt(sb, WARN_ON_ERROR)) 648 WARN_ON_ONCE(1); 649 650 if (!continue_fs && !sb_rdonly(sb)) { 651 ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED); 652 if (journal) 653 jbd2_journal_abort(journal, -EIO); 654 } 655 656 if (!bdev_read_only(sb->s_bdev)) { 657 save_error_info(sb, error, ino, block, func, line); 658 /* 659 * In case the fs should keep running, we need to writeout 660 * superblock through the journal. Due to lock ordering 661 * constraints, it may not be safe to do it right here so we 662 * defer superblock flushing to a workqueue. 
663 */ 664 if (continue_fs) 665 schedule_work(&EXT4_SB(sb)->s_error_work); 666 else 667 ext4_commit_super(sb); 668 } 669 670 /* 671 * We force ERRORS_RO behavior when system is rebooting. Otherwise we 672 * could panic during 'reboot -f' as the underlying device got already 673 * disabled. 674 */ 675 if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) { 676 panic("EXT4-fs (device %s): panic forced after error\n", 677 sb->s_id); 678 } 679 680 if (sb_rdonly(sb) || continue_fs) 681 return; 682 683 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); 684 /* 685 * Make sure updated value of ->s_mount_flags will be visible before 686 * ->s_flags update 687 */ 688 smp_wmb(); 689 sb->s_flags |= SB_RDONLY; 690 } 691 692 static void flush_stashed_error_work(struct work_struct *work) 693 { 694 struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info, 695 s_error_work); 696 journal_t *journal = sbi->s_journal; 697 handle_t *handle; 698 699 /* 700 * If the journal is still running, we have to write out superblock 701 * through the journal to avoid collisions of other journalled sb 702 * updates. 703 * 704 * We use directly jbd2 functions here to avoid recursing back into 705 * ext4 error handling code during handling of previous errors. 706 */ 707 if (!sb_rdonly(sbi->s_sb) && journal) { 708 handle = jbd2_journal_start(journal, 1); 709 if (IS_ERR(handle)) 710 goto write_directly; 711 if (jbd2_journal_get_write_access(handle, sbi->s_sbh)) { 712 jbd2_journal_stop(handle); 713 goto write_directly; 714 } 715 ext4_update_super(sbi->s_sb); 716 if (jbd2_journal_dirty_metadata(handle, sbi->s_sbh)) { 717 jbd2_journal_stop(handle); 718 goto write_directly; 719 } 720 jbd2_journal_stop(handle); 721 return; 722 } 723 write_directly: 724 /* 725 * Write through journal failed. Write sb directly to get error info 726 * out and hope for the best. 
727 */ 728 ext4_commit_super(sbi->s_sb); 729 } 730 731 #define ext4_error_ratelimit(sb) \ 732 ___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state), \ 733 "EXT4-fs error") 734 735 void __ext4_error(struct super_block *sb, const char *function, 736 unsigned int line, bool force_ro, int error, __u64 block, 737 const char *fmt, ...) 738 { 739 struct va_format vaf; 740 va_list args; 741 742 if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) 743 return; 744 745 trace_ext4_error(sb, function, line); 746 if (ext4_error_ratelimit(sb)) { 747 va_start(args, fmt); 748 vaf.fmt = fmt; 749 vaf.va = &args; 750 printk(KERN_CRIT 751 "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n", 752 sb->s_id, function, line, current->comm, &vaf); 753 va_end(args); 754 } 755 ext4_handle_error(sb, force_ro, error, 0, block, function, line); 756 } 757 758 void __ext4_error_inode(struct inode *inode, const char *function, 759 unsigned int line, ext4_fsblk_t block, int error, 760 const char *fmt, ...) 761 { 762 va_list args; 763 struct va_format vaf; 764 765 if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) 766 return; 767 768 trace_ext4_error(inode->i_sb, function, line); 769 if (ext4_error_ratelimit(inode->i_sb)) { 770 va_start(args, fmt); 771 vaf.fmt = fmt; 772 vaf.va = &args; 773 if (block) 774 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " 775 "inode #%lu: block %llu: comm %s: %pV\n", 776 inode->i_sb->s_id, function, line, inode->i_ino, 777 block, current->comm, &vaf); 778 else 779 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " 780 "inode #%lu: comm %s: %pV\n", 781 inode->i_sb->s_id, function, line, inode->i_ino, 782 current->comm, &vaf); 783 va_end(args); 784 } 785 ext4_handle_error(inode->i_sb, false, error, inode->i_ino, block, 786 function, line); 787 } 788 789 void __ext4_error_file(struct file *file, const char *function, 790 unsigned int line, ext4_fsblk_t block, 791 const char *fmt, ...) 
792 { 793 va_list args; 794 struct va_format vaf; 795 struct inode *inode = file_inode(file); 796 char pathname[80], *path; 797 798 if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) 799 return; 800 801 trace_ext4_error(inode->i_sb, function, line); 802 if (ext4_error_ratelimit(inode->i_sb)) { 803 path = file_path(file, pathname, sizeof(pathname)); 804 if (IS_ERR(path)) 805 path = "(unknown)"; 806 va_start(args, fmt); 807 vaf.fmt = fmt; 808 vaf.va = &args; 809 if (block) 810 printk(KERN_CRIT 811 "EXT4-fs error (device %s): %s:%d: inode #%lu: " 812 "block %llu: comm %s: path %s: %pV\n", 813 inode->i_sb->s_id, function, line, inode->i_ino, 814 block, current->comm, path, &vaf); 815 else 816 printk(KERN_CRIT 817 "EXT4-fs error (device %s): %s:%d: inode #%lu: " 818 "comm %s: path %s: %pV\n", 819 inode->i_sb->s_id, function, line, inode->i_ino, 820 current->comm, path, &vaf); 821 va_end(args); 822 } 823 ext4_handle_error(inode->i_sb, false, EFSCORRUPTED, inode->i_ino, block, 824 function, line); 825 } 826 827 const char *ext4_decode_error(struct super_block *sb, int errno, 828 char nbuf[16]) 829 { 830 char *errstr = NULL; 831 832 switch (errno) { 833 case -EFSCORRUPTED: 834 errstr = "Corrupt filesystem"; 835 break; 836 case -EFSBADCRC: 837 errstr = "Filesystem failed CRC"; 838 break; 839 case -EIO: 840 errstr = "IO failure"; 841 break; 842 case -ENOMEM: 843 errstr = "Out of memory"; 844 break; 845 case -EROFS: 846 if (!sb || (EXT4_SB(sb)->s_journal && 847 EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)) 848 errstr = "Journal has aborted"; 849 else 850 errstr = "Readonly filesystem"; 851 break; 852 default: 853 /* If the caller passed in an extra buffer for unknown 854 * errors, textualise them now. Else we just return 855 * NULL. */ 856 if (nbuf) { 857 /* Check for truncated error codes... 
*/ 858 if (snprintf(nbuf, 16, "error %d", -errno) >= 0) 859 errstr = nbuf; 860 } 861 break; 862 } 863 864 return errstr; 865 } 866 867 /* __ext4_std_error decodes expected errors from journaling functions 868 * automatically and invokes the appropriate error response. */ 869 870 void __ext4_std_error(struct super_block *sb, const char *function, 871 unsigned int line, int errno) 872 { 873 char nbuf[16]; 874 const char *errstr; 875 876 if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) 877 return; 878 879 /* Special case: if the error is EROFS, and we're not already 880 * inside a transaction, then there's really no point in logging 881 * an error. */ 882 if (errno == -EROFS && journal_current_handle() == NULL && sb_rdonly(sb)) 883 return; 884 885 if (ext4_error_ratelimit(sb)) { 886 errstr = ext4_decode_error(sb, errno, nbuf); 887 printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n", 888 sb->s_id, function, line, errstr); 889 } 890 891 ext4_handle_error(sb, false, -errno, 0, 0, function, line); 892 } 893 894 void __ext4_msg(struct super_block *sb, 895 const char *prefix, const char *fmt, ...) 896 { 897 struct va_format vaf; 898 va_list args; 899 900 atomic_inc(&EXT4_SB(sb)->s_msg_count); 901 if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs")) 902 return; 903 904 va_start(args, fmt); 905 vaf.fmt = fmt; 906 vaf.va = &args; 907 printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf); 908 va_end(args); 909 } 910 911 static int ext4_warning_ratelimit(struct super_block *sb) 912 { 913 atomic_inc(&EXT4_SB(sb)->s_warning_count); 914 return ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), 915 "EXT4-fs warning"); 916 } 917 918 void __ext4_warning(struct super_block *sb, const char *function, 919 unsigned int line, const char *fmt, ...) 
920 { 921 struct va_format vaf; 922 va_list args; 923 924 if (!ext4_warning_ratelimit(sb)) 925 return; 926 927 va_start(args, fmt); 928 vaf.fmt = fmt; 929 vaf.va = &args; 930 printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n", 931 sb->s_id, function, line, &vaf); 932 va_end(args); 933 } 934 935 void __ext4_warning_inode(const struct inode *inode, const char *function, 936 unsigned int line, const char *fmt, ...) 937 { 938 struct va_format vaf; 939 va_list args; 940 941 if (!ext4_warning_ratelimit(inode->i_sb)) 942 return; 943 944 va_start(args, fmt); 945 vaf.fmt = fmt; 946 vaf.va = &args; 947 printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: " 948 "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id, 949 function, line, inode->i_ino, current->comm, &vaf); 950 va_end(args); 951 } 952 953 void __ext4_grp_locked_error(const char *function, unsigned int line, 954 struct super_block *sb, ext4_group_t grp, 955 unsigned long ino, ext4_fsblk_t block, 956 const char *fmt, ...) 957 __releases(bitlock) 958 __acquires(bitlock) 959 { 960 struct va_format vaf; 961 va_list args; 962 963 if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) 964 return; 965 966 trace_ext4_error(sb, function, line); 967 if (ext4_error_ratelimit(sb)) { 968 va_start(args, fmt); 969 vaf.fmt = fmt; 970 vaf.va = &args; 971 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ", 972 sb->s_id, function, line, grp); 973 if (ino) 974 printk(KERN_CONT "inode %lu: ", ino); 975 if (block) 976 printk(KERN_CONT "block %llu:", 977 (unsigned long long) block); 978 printk(KERN_CONT "%pV\n", &vaf); 979 va_end(args); 980 } 981 982 if (test_opt(sb, ERRORS_CONT)) { 983 if (test_opt(sb, WARN_ON_ERROR)) 984 WARN_ON_ONCE(1); 985 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; 986 if (!bdev_read_only(sb->s_bdev)) { 987 save_error_info(sb, EFSCORRUPTED, ino, block, function, 988 line); 989 schedule_work(&EXT4_SB(sb)->s_error_work); 990 } 991 return; 992 } 993 ext4_unlock_group(sb, grp); 994 
ext4_handle_error(sb, false, EFSCORRUPTED, ino, block, function, line); 995 /* 996 * We only get here in the ERRORS_RO case; relocking the group 997 * may be dangerous, but nothing bad will happen since the 998 * filesystem will have already been marked read/only and the 999 * journal has been aborted. We return 1 as a hint to callers 1000 * who might what to use the return value from 1001 * ext4_grp_locked_error() to distinguish between the 1002 * ERRORS_CONT and ERRORS_RO case, and perhaps return more 1003 * aggressively from the ext4 function in question, with a 1004 * more appropriate error code. 1005 */ 1006 ext4_lock_group(sb, grp); 1007 return; 1008 } 1009 1010 void ext4_mark_group_bitmap_corrupted(struct super_block *sb, 1011 ext4_group_t group, 1012 unsigned int flags) 1013 { 1014 struct ext4_sb_info *sbi = EXT4_SB(sb); 1015 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 1016 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); 1017 int ret; 1018 1019 if (flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) { 1020 ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, 1021 &grp->bb_state); 1022 if (!ret) 1023 percpu_counter_sub(&sbi->s_freeclusters_counter, 1024 grp->bb_free); 1025 } 1026 1027 if (flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) { 1028 ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, 1029 &grp->bb_state); 1030 if (!ret && gdp) { 1031 int count; 1032 1033 count = ext4_free_inodes_count(sb, gdp); 1034 percpu_counter_sub(&sbi->s_freeinodes_counter, 1035 count); 1036 } 1037 } 1038 } 1039 1040 void ext4_update_dynamic_rev(struct super_block *sb) 1041 { 1042 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 1043 1044 if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV) 1045 return; 1046 1047 ext4_warning(sb, 1048 "updating to rev %d because of new feature flag, " 1049 "running e2fsck is recommended", 1050 EXT4_DYNAMIC_REV); 1051 1052 es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO); 1053 es->s_inode_size 
= cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE); 1054 es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV); 1055 /* leave es->s_feature_*compat flags alone */ 1056 /* es->s_uuid will be set by e2fsck if empty */ 1057 1058 /* 1059 * The rest of the superblock fields should be zero, and if not it 1060 * means they are likely already in use, so leave them alone. We 1061 * can leave it up to e2fsck to clean up any inconsistencies there. 1062 */ 1063 } 1064 1065 /* 1066 * Open the external journal device 1067 */ 1068 static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) 1069 { 1070 struct block_device *bdev; 1071 1072 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); 1073 if (IS_ERR(bdev)) 1074 goto fail; 1075 return bdev; 1076 1077 fail: 1078 ext4_msg(sb, KERN_ERR, 1079 "failed to open journal device unknown-block(%u,%u) %ld", 1080 MAJOR(dev), MINOR(dev), PTR_ERR(bdev)); 1081 return NULL; 1082 } 1083 1084 /* 1085 * Release the journal device 1086 */ 1087 static void ext4_blkdev_put(struct block_device *bdev) 1088 { 1089 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); 1090 } 1091 1092 static void ext4_blkdev_remove(struct ext4_sb_info *sbi) 1093 { 1094 struct block_device *bdev; 1095 bdev = sbi->s_journal_bdev; 1096 if (bdev) { 1097 ext4_blkdev_put(bdev); 1098 sbi->s_journal_bdev = NULL; 1099 } 1100 } 1101 1102 static inline struct inode *orphan_list_entry(struct list_head *l) 1103 { 1104 return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode; 1105 } 1106 1107 static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi) 1108 { 1109 struct list_head *l; 1110 1111 ext4_msg(sb, KERN_ERR, "sb orphan head is %d", 1112 le32_to_cpu(sbi->s_es->s_last_orphan)); 1113 1114 printk(KERN_ERR "sb_info orphan list:\n"); 1115 list_for_each(l, &sbi->s_orphan) { 1116 struct inode *inode = orphan_list_entry(l); 1117 printk(KERN_ERR " " 1118 "inode %s:%lu at %p: mode %o, nlink %d, next %d\n", 1119 inode->i_sb->s_id, 
/*
 * Tear down the ext4 state attached to @sb.  Installed as the ->put_super
 * hook in ext4_sops.
 *
 * The steps below are order-sensitive: background users (lazy init, quota,
 * error work, reserved-conversion workqueue, sysfs) are stopped first, then
 * the journal is destroyed, the superblock is written back on writable
 * mounts, and only then are the in-memory structures and finally the
 * ext4_sb_info itself released.
 */
static void ext4_put_super(struct super_block *sb)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_super_block *es = sbi->s_es;
	struct buffer_head **group_desc;
	struct flex_groups **flex_groups;
	int aborted = 0;
	int i, err;

	ext4_unregister_li_request(sb);
	ext4_quota_off_umount(sb);

	flush_work(&sbi->s_error_work);
	destroy_workqueue(sbi->rsv_conversion_wq);

	/*
	 * Unregister sysfs before destroying jbd2 journal.
	 * Since we could still access attr_journal_task attribute via sysfs
	 * path which could have sbi->s_journal->j_task as NULL
	 */
	ext4_unregister_sysfs(sb);

	if (sbi->s_journal) {
		jbd2_journal_unregister_shrinker(sbi->s_journal);
		/*
		 * Sample the abort state before the journal goes away; it is
		 * still needed below, after sbi->s_journal has been cleared.
		 */
		aborted = is_journal_aborted(sbi->s_journal);
		err = jbd2_journal_destroy(sbi->s_journal);
		sbi->s_journal = NULL;
		/* Only report a destroy failure if the journal was not already aborted. */
		if ((err < 0) && !aborted) {
			ext4_abort(sb, -err, "Couldn't clean up the journal");
		}
	}

	ext4_es_unregister_shrinker(sbi);
	del_timer_sync(&sbi->s_err_report);
	ext4_release_system_zone(sb);
	ext4_mb_release(sb);
	ext4_ext_release(sb);

	/*
	 * On a writable mount whose journal was not aborted, put the saved
	 * mount state back into the superblock image before it is written.
	 */
	if (!sb_rdonly(sb) && !aborted) {
		ext4_clear_feature_journal_needs_recovery(sb);
		es->s_state = cpu_to_le16(sbi->s_mount_state);
	}
	if (!sb_rdonly(sb))
		ext4_commit_super(sb);

	/*
	 * Release the group descriptor buffers and their pointer array, then
	 * the flex group array and each of its entries.
	 */
	rcu_read_lock();
	group_desc = rcu_dereference(sbi->s_group_desc);
	for (i = 0; i < sbi->s_gdb_count; i++)
		brelse(group_desc[i]);
	kvfree(group_desc);
	flex_groups = rcu_dereference(sbi->s_flex_groups);
	if (flex_groups) {
		for (i = 0; i < sbi->s_flex_groups_allocated; i++)
			kvfree(flex_groups[i]);
		kvfree(flex_groups);
	}
	rcu_read_unlock();
	percpu_counter_destroy(&sbi->s_freeclusters_counter);
	percpu_counter_destroy(&sbi->s_freeinodes_counter);
	percpu_counter_destroy(&sbi->s_dirs_counter);
	percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
	percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit);
	percpu_free_rwsem(&sbi->s_writepages_rwsem);
#ifdef CONFIG_QUOTA
	/* Free the stored quota file names, one per quota type. */
	for (i = 0; i < EXT4_MAXQUOTAS; i++)
		kfree(get_qf_name(sb, sbi, i));
#endif

	/* Debugging code just in case the in-memory inode orphan list
	 * isn't empty.  The on-disk one can be non-empty if we've
	 * detected an error and taken the fs readonly, but the
	 * in-memory list had better be clean by this point. */
	if (!list_empty(&sbi->s_orphan))
		dump_orphan_list(sb, sbi);
	ASSERT(list_empty(&sbi->s_orphan));

	sync_blockdev(sb->s_bdev);
	invalidate_bdev(sb->s_bdev);
	if (sbi->s_journal_bdev && sbi->s_journal_bdev != sb->s_bdev) {
		/*
		 * Invalidate the journal device's buffers.  We don't want them
		 * floating about in memory - the physical journal device may
		 * be hotswapped, and it breaks the `ro-after' testing code.
		 */
		sync_blockdev(sbi->s_journal_bdev);
		invalidate_bdev(sbi->s_journal_bdev);
		ext4_blkdev_remove(sbi);
	}

	ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
	sbi->s_ea_inode_cache = NULL;

	ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
	sbi->s_ea_block_cache = NULL;

	ext4_stop_mmpd(sbi);

	brelse(sbi->s_sbh);
	sb->s_fs_info = NULL;
	/*
	 * Now that we are completely done shutting down the
	 * superblock, we need to actually destroy the kobject.
	 */
	kobject_put(&sbi->s_kobj);
	/* Wait for the kobject release to complete before freeing sbi below. */
	wait_for_completion(&sbi->s_kobj_unregister);
	if (sbi->s_chksum_driver)
		crypto_free_shash(sbi->s_chksum_driver);
	kfree(sbi->s_blockgroup_lock);
	fs_put_dax(sbi->s_daxdev);
	fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
#ifdef CONFIG_UNICODE
	utf8_unload(sb->s_encoding);
#endif
	kfree(sbi);
}
1290 INIT_LIST_HEAD(&ei->i_es_list); 1291 ei->i_es_all_nr = 0; 1292 ei->i_es_shk_nr = 0; 1293 ei->i_es_shrink_lblk = 0; 1294 ei->i_reserved_data_blocks = 0; 1295 spin_lock_init(&(ei->i_block_reservation_lock)); 1296 ext4_init_pending_tree(&ei->i_pending_tree); 1297 #ifdef CONFIG_QUOTA 1298 ei->i_reserved_quota = 0; 1299 memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); 1300 #endif 1301 ei->jinode = NULL; 1302 INIT_LIST_HEAD(&ei->i_rsv_conversion_list); 1303 spin_lock_init(&ei->i_completed_io_lock); 1304 ei->i_sync_tid = 0; 1305 ei->i_datasync_tid = 0; 1306 atomic_set(&ei->i_unwritten, 0); 1307 INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); 1308 ext4_fc_init_inode(&ei->vfs_inode); 1309 mutex_init(&ei->i_fc_lock); 1310 return &ei->vfs_inode; 1311 } 1312 1313 static int ext4_drop_inode(struct inode *inode) 1314 { 1315 int drop = generic_drop_inode(inode); 1316 1317 if (!drop) 1318 drop = fscrypt_drop_inode(inode); 1319 1320 trace_ext4_drop_inode(inode, drop); 1321 return drop; 1322 } 1323 1324 static void ext4_free_in_core_inode(struct inode *inode) 1325 { 1326 fscrypt_free_inode(inode); 1327 if (!list_empty(&(EXT4_I(inode)->i_fc_list))) { 1328 pr_warn("%s: inode %ld still in fc list", 1329 __func__, inode->i_ino); 1330 } 1331 kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); 1332 } 1333 1334 static void ext4_destroy_inode(struct inode *inode) 1335 { 1336 if (!list_empty(&(EXT4_I(inode)->i_orphan))) { 1337 ext4_msg(inode->i_sb, KERN_ERR, 1338 "Inode %lu (%p): orphan list check failed!", 1339 inode->i_ino, EXT4_I(inode)); 1340 print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, 1341 EXT4_I(inode), sizeof(struct ext4_inode_info), 1342 true); 1343 dump_stack(); 1344 } 1345 } 1346 1347 static void init_once(void *foo) 1348 { 1349 struct ext4_inode_info *ei = (struct ext4_inode_info *) foo; 1350 1351 INIT_LIST_HEAD(&ei->i_orphan); 1352 init_rwsem(&ei->xattr_sem); 1353 init_rwsem(&ei->i_data_sem); 1354 init_rwsem(&ei->i_mmap_sem); 1355 
inode_init_once(&ei->vfs_inode); 1356 ext4_fc_init_inode(&ei->vfs_inode); 1357 } 1358 1359 static int __init init_inodecache(void) 1360 { 1361 ext4_inode_cachep = kmem_cache_create_usercopy("ext4_inode_cache", 1362 sizeof(struct ext4_inode_info), 0, 1363 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| 1364 SLAB_ACCOUNT), 1365 offsetof(struct ext4_inode_info, i_data), 1366 sizeof_field(struct ext4_inode_info, i_data), 1367 init_once); 1368 if (ext4_inode_cachep == NULL) 1369 return -ENOMEM; 1370 return 0; 1371 } 1372 1373 static void destroy_inodecache(void) 1374 { 1375 /* 1376 * Make sure all delayed rcu free inodes are flushed before we 1377 * destroy cache. 1378 */ 1379 rcu_barrier(); 1380 kmem_cache_destroy(ext4_inode_cachep); 1381 } 1382 1383 void ext4_clear_inode(struct inode *inode) 1384 { 1385 ext4_fc_del(inode); 1386 invalidate_inode_buffers(inode); 1387 clear_inode(inode); 1388 ext4_discard_preallocations(inode, 0); 1389 ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); 1390 dquot_drop(inode); 1391 if (EXT4_I(inode)->jinode) { 1392 jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), 1393 EXT4_I(inode)->jinode); 1394 jbd2_free_inode(EXT4_I(inode)->jinode); 1395 EXT4_I(inode)->jinode = NULL; 1396 } 1397 fscrypt_put_encryption_info(inode); 1398 fsverity_cleanup_inode(inode); 1399 } 1400 1401 static struct inode *ext4_nfs_get_inode(struct super_block *sb, 1402 u64 ino, u32 generation) 1403 { 1404 struct inode *inode; 1405 1406 /* 1407 * Currently we don't know the generation for parent directory, so 1408 * a generation of 0 means "accept any" 1409 */ 1410 inode = ext4_iget(sb, ino, EXT4_IGET_HANDLE); 1411 if (IS_ERR(inode)) 1412 return ERR_CAST(inode); 1413 if (generation && inode->i_generation != generation) { 1414 iput(inode); 1415 return ERR_PTR(-ESTALE); 1416 } 1417 1418 return inode; 1419 } 1420 1421 static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid, 1422 int fh_len, int fh_type) 1423 { 1424 return generic_fh_to_dentry(sb, fid, fh_len, 
/*
 * Store the encryption context @ctx (@len bytes) for @inode as an xattr
 * and mark the inode encrypted.  Installed as the ->set_context hook in
 * ext4_cryptops.
 *
 * @fs_data, when non-NULL, is the journal handle of a transaction the
 * caller already has open; the xattr is then set inside that transaction.
 * When it is NULL this function starts and stops its own transaction and
 * retries on -ENOSPC for as long as ext4_should_retry_alloc() allows.
 *
 * Returns 0 on success or a negative errno.
 */
static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
							void *fs_data)
{
	handle_t *handle = fs_data;
	int res, res2, credits, retries = 0;

	/*
	 * Encrypting the root directory is not allowed because e2fsck expects
	 * lost+found to exist and be unencrypted, and encrypting the root
	 * directory would imply encrypting the lost+found directory as well as
	 * the filename "lost+found" itself.
	 */
	if (inode->i_ino == EXT4_ROOT_INO)
		return -EPERM;

	/* A DAX inode that already has data is not expected here: warn once and refuse. */
	if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode)))
		return -EINVAL;

	/* Inodes carrying the on-disk DAX flag are rejected. */
	if (ext4_test_inode_flag(inode, EXT4_INODE_DAX))
		return -EOPNOTSUPP;

	res = ext4_convert_inline_data(inode);
	if (res)
		return res;

	/*
	 * If a journal handle was specified, then the encryption context is
	 * being set on a new inode via inheritance and is part of a larger
	 * transaction to create the inode.  Otherwise the encryption context is
	 * being set on an existing inode in its own transaction.  Only in the
	 * latter case should the "retry on ENOSPC" logic be used.
	 */

	if (handle) {
		res = ext4_xattr_set_handle(handle, inode,
					    EXT4_XATTR_INDEX_ENCRYPTION,
					    EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
					    ctx, len, 0);
		if (!res) {
			ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
			ext4_clear_inode_state(inode,
					EXT4_STATE_MAY_INLINE_DATA);
			/*
			 * Update inode->i_flags - S_ENCRYPTED will be enabled,
			 * S_DAX may be disabled
			 */
			ext4_set_inode_flags(inode, false);
		}
		/* The handle belongs to the caller; it is neither dirtied nor stopped here. */
		return res;
	}

	res = dquot_initialize(inode);
	if (res)
		return res;
retry:
	/* Credits are recomputed on every attempt, including retries. */
	res = ext4_xattr_set_credits(inode, len, false /* is_create */,
				     &credits);
	if (res)
		return res;

	handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION,
				    EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
				    ctx, len, 0);
	if (!res) {
		ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
		/*
		 * Update inode->i_flags - S_ENCRYPTED will be enabled,
		 * S_DAX may be disabled
		 */
		ext4_set_inode_flags(inode, false);
		res = ext4_mark_inode_dirty(handle, inode);
		if (res)
			EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");
	}
	/* Always stop the handle we started, even when setting the xattr failed. */
	res2 = ext4_journal_stop(handle);

	if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
		goto retry;
	/* An earlier error takes precedence over the journal-stop result. */
	if (!res)
		res = res2;
	return res;
}
ext4_quota_enable(struct super_block *sb, int type, int format_id, 1605 unsigned int flags); 1606 static int ext4_enable_quotas(struct super_block *sb); 1607 1608 static struct dquot **ext4_get_dquots(struct inode *inode) 1609 { 1610 return EXT4_I(inode)->i_dquot; 1611 } 1612 1613 static const struct dquot_operations ext4_quota_operations = { 1614 .get_reserved_space = ext4_get_reserved_space, 1615 .write_dquot = ext4_write_dquot, 1616 .acquire_dquot = ext4_acquire_dquot, 1617 .release_dquot = ext4_release_dquot, 1618 .mark_dirty = ext4_mark_dquot_dirty, 1619 .write_info = ext4_write_info, 1620 .alloc_dquot = dquot_alloc, 1621 .destroy_dquot = dquot_destroy, 1622 .get_projid = ext4_get_projid, 1623 .get_inode_usage = ext4_get_inode_usage, 1624 .get_next_id = dquot_get_next_id, 1625 }; 1626 1627 static const struct quotactl_ops ext4_qctl_operations = { 1628 .quota_on = ext4_quota_on, 1629 .quota_off = ext4_quota_off, 1630 .quota_sync = dquot_quota_sync, 1631 .get_state = dquot_get_state, 1632 .set_info = dquot_set_dqinfo, 1633 .get_dqblk = dquot_get_dqblk, 1634 .set_dqblk = dquot_set_dqblk, 1635 .get_nextdqblk = dquot_get_next_dqblk, 1636 }; 1637 #endif 1638 1639 static const struct super_operations ext4_sops = { 1640 .alloc_inode = ext4_alloc_inode, 1641 .free_inode = ext4_free_in_core_inode, 1642 .destroy_inode = ext4_destroy_inode, 1643 .write_inode = ext4_write_inode, 1644 .dirty_inode = ext4_dirty_inode, 1645 .drop_inode = ext4_drop_inode, 1646 .evict_inode = ext4_evict_inode, 1647 .put_super = ext4_put_super, 1648 .sync_fs = ext4_sync_fs, 1649 .freeze_fs = ext4_freeze, 1650 .unfreeze_fs = ext4_unfreeze, 1651 .statfs = ext4_statfs, 1652 .remount_fs = ext4_remount, 1653 .show_options = ext4_show_options, 1654 #ifdef CONFIG_QUOTA 1655 .quota_read = ext4_quota_read, 1656 .quota_write = ext4_quota_write, 1657 .get_dquots = ext4_get_dquots, 1658 #endif 1659 .bdev_try_to_free_page = bdev_try_to_free_page, 1660 }; 1661 1662 static const struct export_operations 
/*
 * Mount option token identifiers.
 *
 * Each value is used as the token of one or more entries in the tokens[]
 * match table (option string -> token) and as the key of an entry in
 * ext4_mount_opts[] (token -> mount flag / handling flags).  Opt_err is the
 * terminator of both tables.  Opt_removed is shared by every option string
 * that is still accepted but ignored.
 *
 * Opt_mblk_io_submit and Opt_nomblk_io_submit are not used by tokens[];
 * the corresponding option strings map to Opt_removed there.
 *
 * The two fast-commit debug tokens exist only with CONFIG_EXT4_DEBUG.
 */
enum {
	Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
	Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
	Opt_nouid32, Opt_debug, Opt_removed,
	Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
	Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload,
	Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev,
	Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit,
	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
	Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption,
	Opt_inlinecrypt,
	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
	Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
	Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version,
	Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never,
	Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error,
	Opt_nowarn_on_error, Opt_mblk_io_submit,
	Opt_lazytime, Opt_nolazytime, Opt_debug_want_extra_isize,
	Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
	Opt_inode_readahead_blks, Opt_journal_ioprio,
	Opt_dioread_nolock, Opt_dioread_lock,
	Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
	Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
	Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan,
#ifdef CONFIG_EXT4_DEBUG
	Opt_fc_debug_max_replay, Opt_fc_debug_force
#endif
};
{Opt_resgid, "resgid=%u"}, 1707 {Opt_resuid, "resuid=%u"}, 1708 {Opt_sb, "sb=%u"}, 1709 {Opt_err_cont, "errors=continue"}, 1710 {Opt_err_panic, "errors=panic"}, 1711 {Opt_err_ro, "errors=remount-ro"}, 1712 {Opt_nouid32, "nouid32"}, 1713 {Opt_debug, "debug"}, 1714 {Opt_removed, "oldalloc"}, 1715 {Opt_removed, "orlov"}, 1716 {Opt_user_xattr, "user_xattr"}, 1717 {Opt_nouser_xattr, "nouser_xattr"}, 1718 {Opt_acl, "acl"}, 1719 {Opt_noacl, "noacl"}, 1720 {Opt_noload, "norecovery"}, 1721 {Opt_noload, "noload"}, 1722 {Opt_removed, "nobh"}, 1723 {Opt_removed, "bh"}, 1724 {Opt_commit, "commit=%u"}, 1725 {Opt_min_batch_time, "min_batch_time=%u"}, 1726 {Opt_max_batch_time, "max_batch_time=%u"}, 1727 {Opt_journal_dev, "journal_dev=%u"}, 1728 {Opt_journal_path, "journal_path=%s"}, 1729 {Opt_journal_checksum, "journal_checksum"}, 1730 {Opt_nojournal_checksum, "nojournal_checksum"}, 1731 {Opt_journal_async_commit, "journal_async_commit"}, 1732 {Opt_abort, "abort"}, 1733 {Opt_data_journal, "data=journal"}, 1734 {Opt_data_ordered, "data=ordered"}, 1735 {Opt_data_writeback, "data=writeback"}, 1736 {Opt_data_err_abort, "data_err=abort"}, 1737 {Opt_data_err_ignore, "data_err=ignore"}, 1738 {Opt_offusrjquota, "usrjquota="}, 1739 {Opt_usrjquota, "usrjquota=%s"}, 1740 {Opt_offgrpjquota, "grpjquota="}, 1741 {Opt_grpjquota, "grpjquota=%s"}, 1742 {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, 1743 {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, 1744 {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, 1745 {Opt_grpquota, "grpquota"}, 1746 {Opt_noquota, "noquota"}, 1747 {Opt_quota, "quota"}, 1748 {Opt_usrquota, "usrquota"}, 1749 {Opt_prjquota, "prjquota"}, 1750 {Opt_barrier, "barrier=%u"}, 1751 {Opt_barrier, "barrier"}, 1752 {Opt_nobarrier, "nobarrier"}, 1753 {Opt_i_version, "i_version"}, 1754 {Opt_dax, "dax"}, 1755 {Opt_dax_always, "dax=always"}, 1756 {Opt_dax_inode, "dax=inode"}, 1757 {Opt_dax_never, "dax=never"}, 1758 {Opt_stripe, "stripe=%u"}, 1759 {Opt_delalloc, "delalloc"}, 1760 {Opt_warn_on_error, "warn_on_error"}, 1761 
{Opt_nowarn_on_error, "nowarn_on_error"}, 1762 {Opt_lazytime, "lazytime"}, 1763 {Opt_nolazytime, "nolazytime"}, 1764 {Opt_debug_want_extra_isize, "debug_want_extra_isize=%u"}, 1765 {Opt_nodelalloc, "nodelalloc"}, 1766 {Opt_removed, "mblk_io_submit"}, 1767 {Opt_removed, "nomblk_io_submit"}, 1768 {Opt_block_validity, "block_validity"}, 1769 {Opt_noblock_validity, "noblock_validity"}, 1770 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, 1771 {Opt_journal_ioprio, "journal_ioprio=%u"}, 1772 {Opt_auto_da_alloc, "auto_da_alloc=%u"}, 1773 {Opt_auto_da_alloc, "auto_da_alloc"}, 1774 {Opt_noauto_da_alloc, "noauto_da_alloc"}, 1775 {Opt_dioread_nolock, "dioread_nolock"}, 1776 {Opt_dioread_lock, "nodioread_nolock"}, 1777 {Opt_dioread_lock, "dioread_lock"}, 1778 {Opt_discard, "discard"}, 1779 {Opt_nodiscard, "nodiscard"}, 1780 {Opt_init_itable, "init_itable=%u"}, 1781 {Opt_init_itable, "init_itable"}, 1782 {Opt_noinit_itable, "noinit_itable"}, 1783 #ifdef CONFIG_EXT4_DEBUG 1784 {Opt_fc_debug_force, "fc_debug_force"}, 1785 {Opt_fc_debug_max_replay, "fc_debug_max_replay=%u"}, 1786 #endif 1787 {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, 1788 {Opt_test_dummy_encryption, "test_dummy_encryption=%s"}, 1789 {Opt_test_dummy_encryption, "test_dummy_encryption"}, 1790 {Opt_inlinecrypt, "inlinecrypt"}, 1791 {Opt_nombcache, "nombcache"}, 1792 {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */ 1793 {Opt_removed, "prefetch_block_bitmaps"}, 1794 {Opt_no_prefetch_block_bitmaps, "no_prefetch_block_bitmaps"}, 1795 {Opt_mb_optimize_scan, "mb_optimize_scan=%d"}, 1796 {Opt_removed, "check=none"}, /* mount option from ext2/3 */ 1797 {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ 1798 {Opt_removed, "reservation"}, /* mount option from ext2/3 */ 1799 {Opt_removed, "noreservation"}, /* mount option from ext2/3 */ 1800 {Opt_removed, "journal=%u"}, /* mount option from ext2/3 */ 1801 {Opt_err, NULL}, 1802 }; 1803 1804 static ext4_fsblk_t get_sb_block(void **data) 1805 { 1806 
ext4_fsblk_t sb_block; 1807 char *options = (char *) *data; 1808 1809 if (!options || strncmp(options, "sb=", 3) != 0) 1810 return 1; /* Default location */ 1811 1812 options += 3; 1813 /* TODO: use simple_strtoll with >32bit ext4 */ 1814 sb_block = simple_strtoul(options, &options, 0); 1815 if (*options && *options != ',') { 1816 printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n", 1817 (char *) *data); 1818 return 1; 1819 } 1820 if (*options == ',') 1821 options++; 1822 *data = (void *) options; 1823 1824 return sb_block; 1825 } 1826 1827 #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) 1828 #define DEFAULT_MB_OPTIMIZE_SCAN (-1) 1829 1830 static const char deprecated_msg[] = 1831 "Mount option \"%s\" will be removed by %s\n" 1832 "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n"; 1833 1834 #ifdef CONFIG_QUOTA 1835 static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) 1836 { 1837 struct ext4_sb_info *sbi = EXT4_SB(sb); 1838 char *qname, *old_qname = get_qf_name(sb, sbi, qtype); 1839 int ret = -1; 1840 1841 if (sb_any_quota_loaded(sb) && !old_qname) { 1842 ext4_msg(sb, KERN_ERR, 1843 "Cannot change journaled " 1844 "quota options when quota turned on"); 1845 return -1; 1846 } 1847 if (ext4_has_feature_quota(sb)) { 1848 ext4_msg(sb, KERN_INFO, "Journaled quota options " 1849 "ignored when QUOTA feature is enabled"); 1850 return 1; 1851 } 1852 qname = match_strdup(args); 1853 if (!qname) { 1854 ext4_msg(sb, KERN_ERR, 1855 "Not enough memory for storing quotafile name"); 1856 return -1; 1857 } 1858 if (old_qname) { 1859 if (strcmp(old_qname, qname) == 0) 1860 ret = 1; 1861 else 1862 ext4_msg(sb, KERN_ERR, 1863 "%s quota file already specified", 1864 QTYPE2NAME(qtype)); 1865 goto errout; 1866 } 1867 if (strchr(qname, '/')) { 1868 ext4_msg(sb, KERN_ERR, 1869 "quotafile must be on filesystem root"); 1870 goto errout; 1871 } 1872 rcu_assign_pointer(sbi->s_qf_names[qtype], qname); 1873 set_opt(sb, 
QUOTA); 1874 return 1; 1875 errout: 1876 kfree(qname); 1877 return ret; 1878 } 1879 1880 static int clear_qf_name(struct super_block *sb, int qtype) 1881 { 1882 1883 struct ext4_sb_info *sbi = EXT4_SB(sb); 1884 char *old_qname = get_qf_name(sb, sbi, qtype); 1885 1886 if (sb_any_quota_loaded(sb) && old_qname) { 1887 ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options" 1888 " when quota turned on"); 1889 return -1; 1890 } 1891 rcu_assign_pointer(sbi->s_qf_names[qtype], NULL); 1892 synchronize_rcu(); 1893 kfree(old_qname); 1894 return 1; 1895 } 1896 #endif 1897 1898 #define MOPT_SET 0x0001 1899 #define MOPT_CLEAR 0x0002 1900 #define MOPT_NOSUPPORT 0x0004 1901 #define MOPT_EXPLICIT 0x0008 1902 #define MOPT_CLEAR_ERR 0x0010 1903 #define MOPT_GTE0 0x0020 1904 #ifdef CONFIG_QUOTA 1905 #define MOPT_Q 0 1906 #define MOPT_QFMT 0x0040 1907 #else 1908 #define MOPT_Q MOPT_NOSUPPORT 1909 #define MOPT_QFMT MOPT_NOSUPPORT 1910 #endif 1911 #define MOPT_DATAJ 0x0080 1912 #define MOPT_NO_EXT2 0x0100 1913 #define MOPT_NO_EXT3 0x0200 1914 #define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3) 1915 #define MOPT_STRING 0x0400 1916 #define MOPT_SKIP 0x0800 1917 #define MOPT_2 0x1000 1918 1919 static const struct mount_opts { 1920 int token; 1921 int mount_opt; 1922 int flags; 1923 } ext4_mount_opts[] = { 1924 {Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET}, 1925 {Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR}, 1926 {Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET}, 1927 {Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR}, 1928 {Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET}, 1929 {Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR}, 1930 {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, 1931 MOPT_EXT4_ONLY | MOPT_SET}, 1932 {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, 1933 MOPT_EXT4_ONLY | MOPT_CLEAR}, 1934 {Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET}, 1935 {Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR}, 1936 {Opt_delalloc, EXT4_MOUNT_DELALLOC, 1937 MOPT_EXT4_ONLY | MOPT_SET | 
MOPT_EXPLICIT}, 1938 {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, 1939 MOPT_EXT4_ONLY | MOPT_CLEAR}, 1940 {Opt_warn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_SET}, 1941 {Opt_nowarn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_CLEAR}, 1942 {Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, 1943 MOPT_EXT4_ONLY | MOPT_CLEAR}, 1944 {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, 1945 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, 1946 {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | 1947 EXT4_MOUNT_JOURNAL_CHECKSUM), 1948 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, 1949 {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET}, 1950 {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR}, 1951 {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR}, 1952 {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR}, 1953 {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, 1954 MOPT_NO_EXT2}, 1955 {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, 1956 MOPT_NO_EXT2}, 1957 {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET}, 1958 {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR}, 1959 {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET}, 1960 {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR}, 1961 {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR}, 1962 {Opt_commit, 0, MOPT_GTE0}, 1963 {Opt_max_batch_time, 0, MOPT_GTE0}, 1964 {Opt_min_batch_time, 0, MOPT_GTE0}, 1965 {Opt_inode_readahead_blks, 0, MOPT_GTE0}, 1966 {Opt_init_itable, 0, MOPT_GTE0}, 1967 {Opt_dax, EXT4_MOUNT_DAX_ALWAYS, MOPT_SET | MOPT_SKIP}, 1968 {Opt_dax_always, EXT4_MOUNT_DAX_ALWAYS, 1969 MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP}, 1970 {Opt_dax_inode, EXT4_MOUNT2_DAX_INODE, 1971 MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP}, 1972 {Opt_dax_never, EXT4_MOUNT2_DAX_NEVER, 1973 MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP}, 1974 {Opt_stripe, 0, MOPT_GTE0}, 1975 {Opt_resuid, 0, MOPT_GTE0}, 1976 {Opt_resgid, 0, MOPT_GTE0}, 1977 {Opt_journal_dev, 0, MOPT_NO_EXT2 | MOPT_GTE0}, 1978 
{Opt_journal_path, 0, MOPT_NO_EXT2 | MOPT_STRING}, 1979 {Opt_journal_ioprio, 0, MOPT_NO_EXT2 | MOPT_GTE0}, 1980 {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ}, 1981 {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ}, 1982 {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, 1983 MOPT_NO_EXT2 | MOPT_DATAJ}, 1984 {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET}, 1985 {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR}, 1986 #ifdef CONFIG_EXT4_FS_POSIX_ACL 1987 {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET}, 1988 {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR}, 1989 #else 1990 {Opt_acl, 0, MOPT_NOSUPPORT}, 1991 {Opt_noacl, 0, MOPT_NOSUPPORT}, 1992 #endif 1993 {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET}, 1994 {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET}, 1995 {Opt_debug_want_extra_isize, 0, MOPT_GTE0}, 1996 {Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q}, 1997 {Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, 1998 MOPT_SET | MOPT_Q}, 1999 {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA, 2000 MOPT_SET | MOPT_Q}, 2001 {Opt_prjquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_PRJQUOTA, 2002 MOPT_SET | MOPT_Q}, 2003 {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | 2004 EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA), 2005 MOPT_CLEAR | MOPT_Q}, 2006 {Opt_usrjquota, 0, MOPT_Q | MOPT_STRING}, 2007 {Opt_grpjquota, 0, MOPT_Q | MOPT_STRING}, 2008 {Opt_offusrjquota, 0, MOPT_Q}, 2009 {Opt_offgrpjquota, 0, MOPT_Q}, 2010 {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT}, 2011 {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT}, 2012 {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, 2013 {Opt_max_dir_size_kb, 0, MOPT_GTE0}, 2014 {Opt_test_dummy_encryption, 0, MOPT_STRING}, 2015 {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, 2016 {Opt_no_prefetch_block_bitmaps, EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS, 2017 MOPT_SET}, 2018 {Opt_mb_optimize_scan, EXT4_MOUNT2_MB_OPTIMIZE_SCAN, MOPT_GTE0}, 2019 #ifdef CONFIG_EXT4_DEBUG 2020 {Opt_fc_debug_force, 
#ifdef CONFIG_UNICODE
/* Maps the on-disk encoding magic to a charset name and version string. */
static const struct ext4_sb_encodings {
	__u16 magic;
	char *name;
	char *version;
} ext4_sb_encoding_map[] = {
	{EXT4_ENC_UTF8_12_1, "utf8", "12.1.0"},
};

/*
 * Find the encoding table entry selected by the superblock.
 *
 * @es:       on-disk superblock; s_encoding and s_encoding_flags are read
 * @encoding: on success, set to the matching ext4_sb_encoding_map[] entry
 * @flags:    on success, set to the CPU-endian value of s_encoding_flags
 *
 * Returns 0 on success.  Returns -EINVAL, leaving both output arguments
 * untouched, when s_encoding matches no table entry.
 */
static int ext4_sb_read_encoding(const struct ext4_super_block *es,
				 const struct ext4_sb_encodings **encoding,
				 __u16 *flags)
{
	const __u16 wanted = le16_to_cpu(es->s_encoding);
	const struct ext4_sb_encodings *entry = ext4_sb_encoding_map;
	const struct ext4_sb_encodings *end =
		ext4_sb_encoding_map + ARRAY_SIZE(ext4_sb_encoding_map);

	for (; entry < end; entry++) {
		if (entry->magic != wanted)
			continue;

		*encoding = entry;
		*flags = le16_to_cpu(es->s_encoding_flags);
		return 0;
	}

	return -EINVAL;
}
#endif
/*
 * Option values that handle_mount_opt() hands back to its caller instead of
 * storing them in the ext4_sb_info.
 */
struct ext4_parsed_options {
	unsigned long journal_devnum;	/* from journal_dev= or journal_path= */
	unsigned int journal_ioprio;	/* IOPRIO_PRIO_VALUE() built from journal_ioprio= */
	int mb_optimize_scan;		/* 0 or 1, from mb_optimize_scan= */
};

/*
 * Apply one already-tokenized mount option to @sb.
 *
 * @sb:          superblock being mounted or remounted
 * @opt:         text of the option, used only in messages
 * @token:       Opt_* value identifying the option
 * @args:        match arguments; args->from is non-NULL when a value was given
 * @parsed_opts: receives the journal device number, journal I/O priority and
 *               mb_optimize_scan value
 * @is_remount:  non-zero on remount; several options are then refused or may
 *               only restate the current setting
 *
 * Returns 1 when the option was accepted and -1 when it was rejected.  With
 * CONFIG_QUOTA, the four journaled-quota-file tokens return whatever
 * set_qf_name()/clear_qf_name() return.
 */
static int handle_mount_opt(struct super_block *sb, char *opt, int token,
			    substring_t *args, struct ext4_parsed_options *parsed_opts,
			    int is_remount)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	const struct mount_opts *m;
	kuid_t uid;
	kgid_t gid;
	int arg = 0;

#ifdef CONFIG_QUOTA
	/* Quota file names are handled entirely by their own helpers. */
	if (token == Opt_usrjquota)
		return set_qf_name(sb, USRQUOTA, &args[0]);
	else if (token == Opt_grpjquota)
		return set_qf_name(sb, GRPQUOTA, &args[0]);
	else if (token == Opt_offusrjquota)
		return clear_qf_name(sb, USRQUOTA);
	else if (token == Opt_offgrpjquota)
		return clear_qf_name(sb, GRPQUOTA);
#endif
	/* Options that are not driven by the ext4_mount_opts[] table. */
	switch (token) {
	case Opt_noacl:
	case Opt_nouser_xattr:
		/* Warn only; the option is still processed via the table below. */
		ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5");
		break;
	case Opt_sb:
		return 1;	/* handled by get_sb_block() */
	case Opt_removed:
		ext4_msg(sb, KERN_WARNING, "Ignoring removed %s option", opt);
		return 1;
	case Opt_abort:
		ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED);
		return 1;
	case Opt_i_version:
		sb->s_flags |= SB_I_VERSION;
		return 1;
	case Opt_lazytime:
		sb->s_flags |= SB_LAZYTIME;
		return 1;
	case Opt_nolazytime:
		sb->s_flags &= ~SB_LAZYTIME;
		return 1;
	case Opt_inlinecrypt:
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
		sb->s_flags |= SB_INLINECRYPT;
#else
		/* Logged as an error, but the option is still accepted. */
		ext4_msg(sb, KERN_ERR, "inline encryption not supported");
#endif
		return 1;
	}

	/* Find the table entry for this token; Opt_err terminates the table. */
	for (m = ext4_mount_opts; m->token != Opt_err; m++)
		if (token == m->token)
			break;

	if (m->token == Opt_err) {
		ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" "
			 "or missing value", opt);
		return -1;
	}

	if ((m->flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) {
		ext4_msg(sb, KERN_ERR,
			 "Mount option \"%s\" incompatible with ext2", opt);
		return -1;
	}
	if ((m->flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) {
		ext4_msg(sb, KERN_ERR,
			 "Mount option \"%s\" incompatible with ext3", opt);
		return -1;
	}

	/*
	 * A supplied value is parsed as an integer unless the entry is marked
	 * MOPT_STRING; MOPT_GTE0 entries additionally refuse negative values.
	 */
	if (args->from && !(m->flags & MOPT_STRING) && match_int(args, &arg))
		return -1;
	if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
		return -1;
	/* Record that delalloc / journal checksumming was requested explicitly. */
	if (m->flags & MOPT_EXPLICIT) {
		if (m->mount_opt & EXT4_MOUNT_DELALLOC) {
			set_opt2(sb, EXPLICIT_DELALLOC);
		} else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) {
			set_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM);
		} else
			return -1;
	}
	/* errors= options replace each other: drop the old one first. */
	if (m->flags & MOPT_CLEAR_ERR)
		clear_opt(sb, ERRORS_MASK);
	if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
		ext4_msg(sb, KERN_ERR, "Cannot change quota "
			 "options when quota turned on");
		return -1;
	}

	if (m->flags & MOPT_NOSUPPORT) {
		/* Only logged: control reaches the final "return 1". */
		ext4_msg(sb, KERN_ERR, "%s option not supported", opt);
	} else if (token == Opt_commit) {
		/* 0 selects the default; the bound keeps HZ * arg within int. */
		if (arg == 0)
			arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
		else if (arg > INT_MAX / HZ) {
			ext4_msg(sb, KERN_ERR,
				 "Invalid commit interval %d, "
				 "must be smaller than %d",
				 arg, INT_MAX / HZ);
			return -1;
		}
		sbi->s_commit_interval = HZ * arg;
	} else if (token == Opt_debug_want_extra_isize) {
		/* Must be even, at least 4, and fit in the inode's extra space. */
		if ((arg & 1) ||
		    (arg < 4) ||
		    (arg > (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE))) {
			ext4_msg(sb, KERN_ERR,
				 "Invalid want_extra_isize %d", arg);
			return -1;
		}
		sbi->s_want_extra_isize = arg;
	} else if (token == Opt_max_batch_time) {
		sbi->s_max_batch_time = arg;
	} else if (token == Opt_min_batch_time) {
		sbi->s_min_batch_time = arg;
	} else if (token == Opt_inode_readahead_blks) {
		/* Accept 0, or a power of two no larger than 1 << 30. */
		if (arg && (arg > (1 << 30) || !is_power_of_2(arg))) {
			ext4_msg(sb, KERN_ERR,
				 "EXT4-fs: inode_readahead_blks must be "
				 "0 or a power of 2 smaller than 2^31");
			return -1;
		}
		sbi->s_inode_readahead_blks = arg;
	} else if (token == Opt_init_itable) {
		set_opt(sb, INIT_INODE_TABLE);
		/* The value is optional; fall back to the default multiplier. */
		if (!args->from)
			arg = EXT4_DEF_LI_WAIT_MULT;
		sbi->s_li_wait_mult = arg;
	} else if (token == Opt_max_dir_size_kb) {
		sbi->s_max_dir_size_kb = arg;
#ifdef CONFIG_EXT4_DEBUG
	} else if (token == Opt_fc_debug_max_replay) {
		sbi->s_fc_debug_max_replay = arg;
#endif
	} else if (token == Opt_stripe) {
		sbi->s_stripe = arg;
	} else if (token == Opt_resuid) {
		/* The id is interpreted in the mounting task's user namespace. */
		uid = make_kuid(current_user_ns(), arg);
		if (!uid_valid(uid)) {
			ext4_msg(sb, KERN_ERR, "Invalid uid value %d", arg);
			return -1;
		}
		sbi->s_resuid = uid;
	} else if (token == Opt_resgid) {
		gid = make_kgid(current_user_ns(), arg);
		if (!gid_valid(gid)) {
			ext4_msg(sb, KERN_ERR, "Invalid gid value %d", arg);
			return -1;
		}
		sbi->s_resgid = gid;
	} else if (token == Opt_journal_dev) {
		if (is_remount) {
			ext4_msg(sb, KERN_ERR,
				 "Cannot specify journal on remount");
			return -1;
		}
		parsed_opts->journal_devnum = arg;
	} else if (token == Opt_journal_path) {
		char *journal_path;
		struct inode *journal_inode;
		struct path path;
		int error;

		if (is_remount) {
			ext4_msg(sb, KERN_ERR,
				 "Cannot specify journal on remount");
			return -1;
		}
		/* Private copy of the path; freed on every exit below. */
		journal_path = match_strdup(&args[0]);
		if (!journal_path) {
			ext4_msg(sb, KERN_ERR, "error: could not dup "
				"journal device string");
			return -1;
		}

		error = kern_path(journal_path, LOOKUP_FOLLOW, &path);
		if (error) {
			ext4_msg(sb, KERN_ERR, "error: could not find "
				"journal device path: error %d", error);
			kfree(journal_path);
			return -1;
		}

		/* The path must name a block device node. */
		journal_inode = d_inode(path.dentry);
		if (!S_ISBLK(journal_inode->i_mode)) {
			ext4_msg(sb, KERN_ERR, "error: journal path %s "
				"is not a block device", journal_path);
			path_put(&path);
			kfree(journal_path);
			return -1;
		}

		parsed_opts->journal_devnum = new_encode_dev(journal_inode->i_rdev);
		path_put(&path);
		kfree(journal_path);
	} else if (token == Opt_journal_ioprio) {
		/* Negative values were already refused through MOPT_GTE0. */
		if (arg > 7) {
			ext4_msg(sb, KERN_ERR, "Invalid journal IO priority"
				 " (must be 0-7)");
			return -1;
		}
		parsed_opts->journal_ioprio =
			IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
	} else if (token == Opt_test_dummy_encryption) {
		return ext4_set_test_dummy_encryption(sb, opt, &args[0],
						      is_remount);
	} else if (m->flags & MOPT_DATAJ) {
		/*
		 * data= mode: may only restate the current mode on remount
		 * (ignored with a warning when there is no journal); on a
		 * fresh mount it replaces whatever mode was set.
		 */
		if (is_remount) {
			if (!sbi->s_journal)
				ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
			else if (test_opt(sb, DATA_FLAGS) != m->mount_opt) {
				ext4_msg(sb, KERN_ERR,
					 "Cannot change data mode on remount");
				return -1;
			}
		} else {
			clear_opt(sb, DATA_FLAGS);
			sbi->s_mount_opt |= m->mount_opt;
		}
#ifdef CONFIG_QUOTA
	} else if (m->flags & MOPT_QFMT) {
		if (sb_any_quota_loaded(sb) &&
		    sbi->s_jquota_fmt != m->mount_opt) {
			ext4_msg(sb, KERN_ERR, "Cannot change journaled "
				 "quota options when quota turned on");
			return -1;
		}
		/* With the quota feature the format option is accepted but unused. */
		if (ext4_has_feature_quota(sb)) {
			ext4_msg(sb, KERN_INFO,
				 "Quota format mount options ignored "
				 "when QUOTA feature is enabled");
			return 1;
		}
		sbi->s_jquota_fmt = m->mount_opt;
#endif
	} else if (token == Opt_dax || token == Opt_dax_always ||
		   token == Opt_dax_inode || token == Opt_dax_never) {
#ifdef CONFIG_FS_DAX
		/*
		 * On remount each dax variant is accepted only when it
		 * restates the mode already in effect; otherwise all three
		 * cases share the fail_dax_change_remount error path.
		 */
		switch (token) {
		case Opt_dax:
		case Opt_dax_always:
			if (is_remount &&
			    (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
			     (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) {
			fail_dax_change_remount:
				ext4_msg(sb, KERN_ERR, "can't change "
					 "dax mount option while remounting");
				return -1;
			}
			/* Note: this data=journal conflict is tested on remount only. */
			if (is_remount &&
			    (test_opt(sb, DATA_FLAGS) ==
			     EXT4_MOUNT_JOURNAL_DATA)) {
				    ext4_msg(sb, KERN_ERR, "can't mount with "
					     "both data=journal and dax");
				    return -1;
			}
			ext4_msg(sb, KERN_WARNING,
				"DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
			sbi->s_mount_opt |= EXT4_MOUNT_DAX_ALWAYS;
			sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER;
			break;
		case Opt_dax_never:
			if (is_remount &&
			    (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
			     (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS)))
				goto fail_dax_change_remount;
			sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER;
			sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
			break;
		case Opt_dax_inode:
			if (is_remount &&
			    ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
			     (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
			     !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE)))
				goto fail_dax_change_remount;
			sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
			sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER;
			/* Strictly for printing options */
			sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_INODE;
			break;
		}
#else
		/* Without DAX support: force "never" and reject the option. */
		ext4_msg(sb, KERN_INFO, "dax option not supported");
		sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER;
		sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
		return -1;
#endif
	} else if (token == Opt_data_err_abort) {
		sbi->s_mount_opt |= m->mount_opt;
	} else if (token == Opt_data_err_ignore) {
		sbi->s_mount_opt &= ~m->mount_opt;
	} else if (token == Opt_mb_optimize_scan) {
		if (arg != 0 && arg != 1) {
			ext4_msg(sb, KERN_WARNING,
				 "mb_optimize_scan should be set to 0 or 1.");
			return -1;
		}
		parsed_opts->mb_optimize_scan = arg;
	} else {
		/*
		 * Generic flag option.  No value means "on"; MOPT_CLEAR
		 * entries invert the sense; MOPT_2 entries live in
		 * s_mount_opt2 rather than s_mount_opt.
		 */
		if (!args->from)
			arg = 1;
		if (m->flags & MOPT_CLEAR)
			arg = !arg;
		else if (unlikely(!(m->flags & MOPT_SET))) {
			/* Table entry is neither SET nor CLEAR: a coding error. */
			ext4_msg(sb, KERN_WARNING,
				 "buggy handling of option %s", opt);
			WARN_ON(1);
			return -1;
		}
		if (m->flags & MOPT_2) {
			if (arg != 0)
				sbi->s_mount_opt2 |= m->mount_opt;
			else
				sbi->s_mount_opt2 &= ~m->mount_opt;
		} else {
			if (arg != 0)
				sbi->s_mount_opt |= m->mount_opt;
			else
				sbi->s_mount_opt &= ~m->mount_opt;
		}
	}
	return 1;
}

/*
 * Parse a comma-separated mount option string and apply each option to @sb.
 *
 * @options:    option string; it is split in place by strsep().  NULL means
 *              "no options" and succeeds immediately.
 * @sb:         superblock being mounted or remounted
 * @ret_opts:   passed through to handle_mount_opt() to collect values
 * @is_remount: passed through to handle_mount_opt()
 *
 * Returns 1 on success and 0 on failure (note: not the 1 / -1 convention of
 * handle_mount_opt()).
 */
static int parse_options(char *options, struct super_block *sb,
			 struct ext4_parsed_options *ret_opts,
			 int is_remount)
{
	struct ext4_sb_info __maybe_unused *sbi = EXT4_SB(sb);
	char *p, __maybe_unused *usr_qf_name, __maybe_unused *grp_qf_name;
	substring_t args[MAX_OPT_ARGS];
	int token;

	if (!options)
		return 1;

	while ((p = strsep(&options, ",")) != NULL) {
		/* Skip empty items such as those produced by ",,". */
		if (!*p)
			continue;
		/*
		 * Initialize args struct so we know whether arg was
		 * found; some options take optional arguments.
		 */
		args[0].to = args[0].from = NULL;
		token = match_token(p, tokens, args);
		if (handle_mount_opt(sb, p, token, args, ret_opts,
				     is_remount) < 0)
			return 0;
	}
#ifdef CONFIG_QUOTA
	/*
	 * We do the test below only for project quotas. 'usrquota' and
	 * 'grpquota' mount options are allowed even without quota feature
	 * to support legacy quotas in quota files.
	 */
	if (test_opt(sb, PRJQUOTA) && !ext4_has_feature_project(sb)) {
		ext4_msg(sb, KERN_ERR, "Project quota feature not enabled. "
			 "Cannot enable project quota enforcement.");
		return 0;
	}
	usr_qf_name = get_qf_name(sb, sbi, USRQUOTA);
	grp_qf_name = get_qf_name(sb, sbi, GRPQUOTA);
	if (usr_qf_name || grp_qf_name) {
		/*
		 * A journaled quota file name overrides the plain option for
		 * the same quota type; a plain option left over for the other
		 * type is a mix of the two styles and is refused.
		 */
		if (test_opt(sb, USRQUOTA) && usr_qf_name)
			clear_opt(sb, USRQUOTA);

		if (test_opt(sb, GRPQUOTA) && grp_qf_name)
			clear_opt(sb, GRPQUOTA);

		if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
			ext4_msg(sb, KERN_ERR, "old and new quota "
					"format mixing");
			return 0;
		}

		/* Journaled quota files need an explicit jqfmt=. */
		if (!sbi->s_jquota_fmt) {
			ext4_msg(sb, KERN_ERR, "journaled quota format "
					"not specified");
			return 0;
		}
	}
#endif
	if (test_opt(sb, DIOREAD_NOLOCK)) {
		/* Block size derived from the superblock's log2 block size. */
		int blocksize =
			BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
		if (blocksize < PAGE_SIZE)
			ext4_msg(sb, KERN_WARNING, "Warning: mounting with an "
				 "experimental mount option 'dioread_nolock' "
				 "for blocksize < PAGE_SIZE");
	}
	return 1;
}

/*
 * Append the journaled quota options of @sb (",jqfmt=...", "usrjquota",
 * "grpjquota") to @seq.  Does nothing when CONFIG_QUOTA is not defined.
 */
static inline void ext4_show_quota_options(struct seq_file *seq,
					   struct super_block *sb)
{
#if defined(CONFIG_QUOTA)
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	char *usr_qf_name, *grp_qf_name;

	if (sbi->s_jquota_fmt) {
		/* Stays empty for a format value not listed below. */
		char *fmtname = "";

		switch (sbi->s_jquota_fmt) {
		case QFMT_VFS_OLD:
			fmtname = "vfsold";
			break;
		case QFMT_VFS_V0:
			fmtname = "vfsv0";
			break;
		case QFMT_VFS_V1:
			fmtname = "vfsv1";
			break;
		}
		seq_printf(seq, ",jqfmt=%s", fmtname);
	}

	/* The quota file names are read, and used, under rcu_read_lock(). */
	rcu_read_lock();
	usr_qf_name = rcu_dereference(sbi->s_qf_names[USRQUOTA]);
	grp_qf_name = rcu_dereference(sbi->s_qf_names[GRPQUOTA]);
	if (usr_qf_name)
		seq_show_option(seq, "usrjquota", usr_qf_name);
	if (grp_qf_name)
		seq_show_option(seq, "grpjquota", grp_qf_name);
	rcu_read_unlock();
#endif
}
struct match_token *t; 2540 2541 for (t = tokens; t->token != Opt_err; t++) 2542 if (t->token == token && !strchr(t->pattern, '=')) 2543 break; 2544 return t->pattern; 2545 } 2546 2547 /* 2548 * Show an option if 2549 * - it's set to a non-default value OR 2550 * - if the per-sb default is different from the global default 2551 */ 2552 static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, 2553 int nodefs) 2554 { 2555 struct ext4_sb_info *sbi = EXT4_SB(sb); 2556 struct ext4_super_block *es = sbi->s_es; 2557 int def_errors, def_mount_opt = sbi->s_def_mount_opt; 2558 const struct mount_opts *m; 2559 char sep = nodefs ? '\n' : ','; 2560 2561 #define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep) 2562 #define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg) 2563 2564 if (sbi->s_sb_block != 1) 2565 SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block); 2566 2567 for (m = ext4_mount_opts; m->token != Opt_err; m++) { 2568 int want_set = m->flags & MOPT_SET; 2569 if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) || 2570 (m->flags & MOPT_CLEAR_ERR) || m->flags & MOPT_SKIP) 2571 continue; 2572 if (!nodefs && !(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt))) 2573 continue; /* skip if same as the default */ 2574 if ((want_set && 2575 (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) || 2576 (!want_set && (sbi->s_mount_opt & m->mount_opt))) 2577 continue; /* select Opt_noFoo vs Opt_Foo */ 2578 SEQ_OPTS_PRINT("%s", token2str(m->token)); 2579 } 2580 2581 if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) || 2582 le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) 2583 SEQ_OPTS_PRINT("resuid=%u", 2584 from_kuid_munged(&init_user_ns, sbi->s_resuid)); 2585 if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) || 2586 le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) 2587 SEQ_OPTS_PRINT("resgid=%u", 2588 from_kgid_munged(&init_user_ns, sbi->s_resgid)); 2589 def_errors = nodefs ? 
-1 : le16_to_cpu(es->s_errors); 2590 if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO) 2591 SEQ_OPTS_PUTS("errors=remount-ro"); 2592 if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE) 2593 SEQ_OPTS_PUTS("errors=continue"); 2594 if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC) 2595 SEQ_OPTS_PUTS("errors=panic"); 2596 if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) 2597 SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ); 2598 if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) 2599 SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time); 2600 if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) 2601 SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time); 2602 if (sb->s_flags & SB_I_VERSION) 2603 SEQ_OPTS_PUTS("i_version"); 2604 if (nodefs || sbi->s_stripe) 2605 SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe); 2606 if (nodefs || EXT4_MOUNT_DATA_FLAGS & 2607 (sbi->s_mount_opt ^ def_mount_opt)) { 2608 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) 2609 SEQ_OPTS_PUTS("data=journal"); 2610 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) 2611 SEQ_OPTS_PUTS("data=ordered"); 2612 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) 2613 SEQ_OPTS_PUTS("data=writeback"); 2614 } 2615 if (nodefs || 2616 sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS) 2617 SEQ_OPTS_PRINT("inode_readahead_blks=%u", 2618 sbi->s_inode_readahead_blks); 2619 2620 if (test_opt(sb, INIT_INODE_TABLE) && (nodefs || 2621 (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT))) 2622 SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); 2623 if (nodefs || sbi->s_max_dir_size_kb) 2624 SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb); 2625 if (test_opt(sb, DATA_ERR_ABORT)) 2626 SEQ_OPTS_PUTS("data_err=abort"); 2627 2628 fscrypt_show_test_dummy_encryption(seq, sep, sb); 2629 2630 if (sb->s_flags & SB_INLINECRYPT) 2631 SEQ_OPTS_PUTS("inlinecrypt"); 2632 
2633 if (test_opt(sb, DAX_ALWAYS)) { 2634 if (IS_EXT2_SB(sb)) 2635 SEQ_OPTS_PUTS("dax"); 2636 else 2637 SEQ_OPTS_PUTS("dax=always"); 2638 } else if (test_opt2(sb, DAX_NEVER)) { 2639 SEQ_OPTS_PUTS("dax=never"); 2640 } else if (test_opt2(sb, DAX_INODE)) { 2641 SEQ_OPTS_PUTS("dax=inode"); 2642 } 2643 ext4_show_quota_options(seq, sb); 2644 return 0; 2645 } 2646 2647 static int ext4_show_options(struct seq_file *seq, struct dentry *root) 2648 { 2649 return _ext4_show_options(seq, root->d_sb, 0); 2650 } 2651 2652 int ext4_seq_options_show(struct seq_file *seq, void *offset) 2653 { 2654 struct super_block *sb = seq->private; 2655 int rc; 2656 2657 seq_puts(seq, sb_rdonly(sb) ? "ro" : "rw"); 2658 rc = _ext4_show_options(seq, sb, 1); 2659 seq_puts(seq, "\n"); 2660 return rc; 2661 } 2662 2663 static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, 2664 int read_only) 2665 { 2666 struct ext4_sb_info *sbi = EXT4_SB(sb); 2667 int err = 0; 2668 2669 if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { 2670 ext4_msg(sb, KERN_ERR, "revision level too high, " 2671 "forcing read-only mode"); 2672 err = -EROFS; 2673 goto done; 2674 } 2675 if (read_only) 2676 goto done; 2677 if (!(sbi->s_mount_state & EXT4_VALID_FS)) 2678 ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, " 2679 "running e2fsck is recommended"); 2680 else if (sbi->s_mount_state & EXT4_ERROR_FS) 2681 ext4_msg(sb, KERN_WARNING, 2682 "warning: mounting fs with errors, " 2683 "running e2fsck is recommended"); 2684 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 && 2685 le16_to_cpu(es->s_mnt_count) >= 2686 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) 2687 ext4_msg(sb, KERN_WARNING, 2688 "warning: maximal mount count reached, " 2689 "running e2fsck is recommended"); 2690 else if (le32_to_cpu(es->s_checkinterval) && 2691 (ext4_get_tstamp(es, s_lastcheck) + 2692 le32_to_cpu(es->s_checkinterval) <= ktime_get_real_seconds())) 2693 ext4_msg(sb, KERN_WARNING, 2694 "warning: 
checktime reached, " 2695 "running e2fsck is recommended"); 2696 if (!sbi->s_journal) 2697 es->s_state &= cpu_to_le16(~EXT4_VALID_FS); 2698 if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) 2699 es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); 2700 le16_add_cpu(&es->s_mnt_count, 1); 2701 ext4_update_tstamp(es, s_mtime); 2702 if (sbi->s_journal) 2703 ext4_set_feature_journal_needs_recovery(sb); 2704 2705 err = ext4_commit_super(sb); 2706 done: 2707 if (test_opt(sb, DEBUG)) 2708 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " 2709 "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n", 2710 sb->s_blocksize, 2711 sbi->s_groups_count, 2712 EXT4_BLOCKS_PER_GROUP(sb), 2713 EXT4_INODES_PER_GROUP(sb), 2714 sbi->s_mount_opt, sbi->s_mount_opt2); 2715 2716 cleancache_init_fs(sb); 2717 return err; 2718 } 2719 2720 int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) 2721 { 2722 struct ext4_sb_info *sbi = EXT4_SB(sb); 2723 struct flex_groups **old_groups, **new_groups; 2724 int size, i, j; 2725 2726 if (!sbi->s_log_groups_per_flex) 2727 return 0; 2728 2729 size = ext4_flex_group(sbi, ngroup - 1) + 1; 2730 if (size <= sbi->s_flex_groups_allocated) 2731 return 0; 2732 2733 new_groups = kvzalloc(roundup_pow_of_two(size * 2734 sizeof(*sbi->s_flex_groups)), GFP_KERNEL); 2735 if (!new_groups) { 2736 ext4_msg(sb, KERN_ERR, 2737 "not enough memory for %d flex group pointers", size); 2738 return -ENOMEM; 2739 } 2740 for (i = sbi->s_flex_groups_allocated; i < size; i++) { 2741 new_groups[i] = kvzalloc(roundup_pow_of_two( 2742 sizeof(struct flex_groups)), 2743 GFP_KERNEL); 2744 if (!new_groups[i]) { 2745 for (j = sbi->s_flex_groups_allocated; j < i; j++) 2746 kvfree(new_groups[j]); 2747 kvfree(new_groups); 2748 ext4_msg(sb, KERN_ERR, 2749 "not enough memory for %d flex groups", size); 2750 return -ENOMEM; 2751 } 2752 } 2753 rcu_read_lock(); 2754 old_groups = rcu_dereference(sbi->s_flex_groups); 2755 if (old_groups) 2756 memcpy(new_groups, old_groups, 2757 
(sbi->s_flex_groups_allocated * 2758 sizeof(struct flex_groups *))); 2759 rcu_read_unlock(); 2760 rcu_assign_pointer(sbi->s_flex_groups, new_groups); 2761 sbi->s_flex_groups_allocated = size; 2762 if (old_groups) 2763 ext4_kvfree_array_rcu(old_groups); 2764 return 0; 2765 } 2766 2767 static int ext4_fill_flex_info(struct super_block *sb) 2768 { 2769 struct ext4_sb_info *sbi = EXT4_SB(sb); 2770 struct ext4_group_desc *gdp = NULL; 2771 struct flex_groups *fg; 2772 ext4_group_t flex_group; 2773 int i, err; 2774 2775 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; 2776 if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) { 2777 sbi->s_log_groups_per_flex = 0; 2778 return 1; 2779 } 2780 2781 err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count); 2782 if (err) 2783 goto failed; 2784 2785 for (i = 0; i < sbi->s_groups_count; i++) { 2786 gdp = ext4_get_group_desc(sb, i, NULL); 2787 2788 flex_group = ext4_flex_group(sbi, i); 2789 fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group); 2790 atomic_add(ext4_free_inodes_count(sb, gdp), &fg->free_inodes); 2791 atomic64_add(ext4_free_group_clusters(sb, gdp), 2792 &fg->free_clusters); 2793 atomic_add(ext4_used_dirs_count(sb, gdp), &fg->used_dirs); 2794 } 2795 2796 return 1; 2797 failed: 2798 return 0; 2799 } 2800 2801 static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group, 2802 struct ext4_group_desc *gdp) 2803 { 2804 int offset = offsetof(struct ext4_group_desc, bg_checksum); 2805 __u16 crc = 0; 2806 __le32 le_group = cpu_to_le32(block_group); 2807 struct ext4_sb_info *sbi = EXT4_SB(sb); 2808 2809 if (ext4_has_metadata_csum(sbi->s_sb)) { 2810 /* Use new metadata_csum algorithm */ 2811 __u32 csum32; 2812 __u16 dummy_csum = 0; 2813 2814 csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, 2815 sizeof(le_group)); 2816 csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, offset); 2817 csum32 = ext4_chksum(sbi, csum32, (__u8 *)&dummy_csum, 2818 sizeof(dummy_csum)); 2819 
/*
 * Check the stored checksum of group descriptor @gdp of group @block_group.
 *
 * Returns 1 when the filesystem has no group descriptor checksum or when the
 * stored bg_checksum equals the freshly computed one, and 0 on a mismatch.
 */
int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group,
				struct ext4_group_desc *gdp)
{
	if (ext4_has_group_desc_csum(sb) &&
	    (gdp->bg_checksum != ext4_group_desc_csum(sb, block_group, gdp)))
		return 0;

	return 1;
}

/*
 * Recompute the checksum of group descriptor @gdp of group @block_group and
 * store it in gdp->bg_checksum.  Does nothing when the filesystem has no
 * group descriptor checksum.
 */
void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group,
			      struct ext4_group_desc *gdp)
{
	if (!ext4_has_group_desc_csum(sb))
		return;
	gdp->bg_checksum = ext4_group_desc_csum(sb, block_group, gdp);
}

/* Called at mount-time, super-block is locked */
/*
 * Sanity-check every group descriptor of @sb.
 *
 * @sb_block:         block number of the superblock in use
 * @first_not_zeroed: if not NULL, receives the index of the first group
 *                    whose descriptor lacks EXT4_BG_INODE_ZEROED, or
 *                    s_groups_count when every group has the flag
 *
 * For each group, the block bitmap, inode bitmap and inode table locations
 * are checked and the descriptor checksum is verified.  A location that
 * overlaps the superblock or the group descriptor blocks, or a checksum
 * mismatch, is logged and fails the check only when the filesystem is not
 * read-only.  A location outside the permitted block range always fails.
 *
 * Returns 1 when the descriptors are acceptable and 0 otherwise.
 */
static int ext4_check_descriptors(struct super_block *sb,
				  ext4_fsblk_t sb_block,
				  ext4_group_t *first_not_zeroed)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
	ext4_fsblk_t last_block;
	/* Last block of the descriptor area following the superblock. */
	ext4_fsblk_t last_bg_block = sb_block + ext4_bg_num_gdb(sb, 0);
	ext4_fsblk_t block_bitmap;
	ext4_fsblk_t inode_bitmap;
	ext4_fsblk_t inode_table;
	int flexbg_flag = 0;
	/* grp == s_groups_count means "no un-zeroed group found yet". */
	ext4_group_t i, grp = sbi->s_groups_count;

	if (ext4_has_feature_flex_bg(sb))
		flexbg_flag = 1;

	ext4_debug("Checking group descriptors");

	for (i = 0; i < sbi->s_groups_count; i++) {
		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);

		/*
		 * Permitted range for this group's metadata: with flex_bg
		 * (and for the last group) it extends to the last block of
		 * the filesystem, otherwise to the last block of the group.
		 */
		if (i == sbi->s_groups_count - 1 || flexbg_flag)
			last_block = ext4_blocks_count(sbi->s_es) - 1;
		else
			last_block = first_block +
				(EXT4_BLOCKS_PER_GROUP(sb) - 1);

		/* Remember the first group without EXT4_BG_INODE_ZEROED. */
		if ((grp == sbi->s_groups_count) &&
		   !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
			grp = i;

		block_bitmap = ext4_block_bitmap(sb, gdp);
		if (block_bitmap == sb_block) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
				 "Block bitmap for group %u overlaps "
				 "superblock", i);
			if (!sb_rdonly(sb))
				return 0;
		}
		if (block_bitmap >= sb_block + 1 &&
		    block_bitmap <= last_bg_block) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
				 "Block bitmap for group %u overlaps "
				 "block group descriptors", i);
			if (!sb_rdonly(sb))
				return 0;
		}
		if (block_bitmap < first_block || block_bitmap > last_block) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
			       "Block bitmap for group %u not in group "
			       "(block %llu)!", i, block_bitmap);
			return 0;
		}
		inode_bitmap = ext4_inode_bitmap(sb, gdp);
		if (inode_bitmap == sb_block) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
				 "Inode bitmap for group %u overlaps "
				 "superblock", i);
			if (!sb_rdonly(sb))
				return 0;
		}
		if (inode_bitmap >= sb_block + 1 &&
		    inode_bitmap <= last_bg_block) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
				 "Inode bitmap for group %u overlaps "
				 "block group descriptors", i);
			if (!sb_rdonly(sb))
				return 0;
		}
		if (inode_bitmap < first_block || inode_bitmap > last_block) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
			       "Inode bitmap for group %u not in group "
			       "(block %llu)!", i, inode_bitmap);
			return 0;
		}
		inode_table = ext4_inode_table(sb, gdp);
		if (inode_table == sb_block) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
				 "Inode table for group %u overlaps "
				 "superblock", i);
			if (!sb_rdonly(sb))
				return 0;
		}
		if (inode_table >= sb_block + 1 &&
		    inode_table <= last_bg_block) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
				 "Inode table for group %u overlaps "
				 "block group descriptors", i);
			if (!sb_rdonly(sb))
				return 0;
		}
		/* The whole table (s_itb_per_group blocks) must be in range. */
		if (inode_table < first_block ||
		    inode_table + sbi->s_itb_per_group - 1 > last_block) {
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
			       "Inode table for group %u not in group "
			       "(block %llu)!", i, inode_table);
			return 0;
		}
		/* The checksum is verified with the group lock held. */
		ext4_lock_group(sb, i);
		if (!ext4_group_desc_csum_verify(sb, i, gdp)) {
			/* The message prints the computed value, then the stored one. */
			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
				 "Checksum for group %u failed (%u!=%u)",
				 i, le16_to_cpu(ext4_group_desc_csum(sb, i,
				     gdp)), le16_to_cpu(gdp->bg_checksum));
			if (!sb_rdonly(sb)) {
				ext4_unlock_group(sb, i);
				return 0;
			}
		}
		ext4_unlock_group(sb, i);
		/* Without flex_bg the permitted range moves on to the next group. */
		if (!flexbg_flag)
			first_block += EXT4_BLOCKS_PER_GROUP(sb);
	}
	if (NULL != first_not_zeroed)
		*first_not_zeroed = grp;
	return 1;
}
/*
 * Process the on-disk orphan inode list of @sb at mount time.
 *
 * @sb: superblock being mounted
 * @es: its on-disk superblock; es->s_last_orphan heads the orphan list
 *
 * Nothing is done when the list is empty, the block device is read-only,
 * the feature set does not allow a read-write mount, or the filesystem is
 * flagged with errors (in which case the list head is merely cleared unless
 * the mount is read-only).  Otherwise SB_RDONLY is temporarily cleared,
 * quotas are switched on where needed, and each orphan is either truncated
 * to i_size (still linked) or deleted by the final iput() (no links left).
 * The original s_flags are restored before returning.
 */
static void ext4_orphan_cleanup(struct super_block *sb,
				struct ext4_super_block *es)
{
	/* Saved so the SB_RDONLY state can be restored at the end. */
	unsigned int s_flags = sb->s_flags;
	int ret, nr_orphans = 0, nr_truncates = 0;
#ifdef CONFIG_QUOTA
	int quota_update = 0;
	int i;
#endif
	if (!es->s_last_orphan) {
		jbd_debug(4, "no orphan inodes to clean up\n");
		return;
	}

	if (bdev_read_only(sb->s_bdev)) {
		ext4_msg(sb, KERN_ERR, "write access "
			"unavailable, skipping orphan cleanup");
		return;
	}

	/* Check if feature set would not allow a r/w mount */
	if (!ext4_feature_set_ok(sb, 0)) {
		ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
			 "unknown ROCOMPAT features");
		return;
	}

	if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
		/* don't clear list on RO mount w/ errors */
		if (es->s_last_orphan && !(s_flags & SB_RDONLY)) {
			ext4_msg(sb, KERN_INFO, "Errors on filesystem, "
				  "clearing orphan list.\n");
			es->s_last_orphan = 0;
		}
		jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
		return;
	}

	/* Cleanup needs write access: drop SB_RDONLY until we are done. */
	if (s_flags & SB_RDONLY) {
		ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
		sb->s_flags &= ~SB_RDONLY;
	}
#ifdef CONFIG_QUOTA
	/*
	 * Turn on quotas which were not enabled for read-only mounts if
	 * filesystem has quota feature, so that they are updated correctly.
	 */
	if (ext4_has_feature_quota(sb) && (s_flags & SB_RDONLY)) {
		/* This ret shadows the function-level one. */
		int ret = ext4_enable_quotas(sb);

		if (!ret)
			quota_update = 1;
		else
			ext4_msg(sb, KERN_ERR,
				"Cannot turn on quotas: error %d", ret);
	}

	/* Turn on journaled quotas used for old style */
	for (i = 0; i < EXT4_MAXQUOTAS; i++) {
		if (EXT4_SB(sb)->s_qf_names[i]) {
			int ret = ext4_quota_on_mount(sb, i);

			if (!ret)
				quota_update = 1;
			else
				ext4_msg(sb, KERN_ERR,
					"Cannot turn on journaled "
					"quota: type %d: error %d", i, ret);
		}
	}
#endif

	/* The loop ends when the on-disk list head becomes zero. */
	while (es->s_last_orphan) {
		struct inode *inode;

		/*
		 * We may have encountered an error during cleanup; if
		 * so, skip the rest.
		 */
		if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
			jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
			es->s_last_orphan = 0;
			break;
		}

		inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan));
		if (IS_ERR(inode)) {
			/* Unusable list head: give up on the rest of the list. */
			es->s_last_orphan = 0;
			break;
		}

		/* Put the inode on the in-core orphan list as well. */
		list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
		dquot_initialize(inode);
		if (inode->i_nlink) {
			/* Still linked: truncate the inode to i_size. */
			if (test_opt(sb, DEBUG))
				ext4_msg(sb, KERN_DEBUG,
					"%s: truncating inode %lu to %lld bytes",
					__func__, inode->i_ino, inode->i_size);
			jbd_debug(2, "truncating inode %lu to %lld bytes\n",
				  inode->i_ino, inode->i_size);
			inode_lock(inode);
			truncate_inode_pages(inode->i_mapping, inode->i_size);
			ret = ext4_truncate(inode);
			if (ret) {
				/*
				 * We need to clean up the in-core orphan list
				 * manually if ext4_truncate() failed to get a
				 * transaction handle.
				 */
				ext4_orphan_del(NULL, inode);
				ext4_std_error(inode->i_sb, ret);
			}
			inode_unlock(inode);
			nr_truncates++;
		} else {
			/* No links left: the iput() below deletes the inode. */
			if (test_opt(sb, DEBUG))
				ext4_msg(sb, KERN_DEBUG,
					"%s: deleting unreferenced inode %lu",
					__func__, inode->i_ino);
			jbd_debug(2, "deleting unreferenced inode %lu\n",
				  inode->i_ino);
			nr_orphans++;
		}
		iput(inode);  /* The delete magic happens here! */
	}

/* Expands to two arguments: the count and its plural suffix. */
#define PLURAL(x) (x), ((x) == 1) ? "" : "s"

	if (nr_orphans)
		ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
		       PLURAL(nr_orphans));
	if (nr_truncates)
		ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
		       PLURAL(nr_truncates));
#ifdef CONFIG_QUOTA
	/* Turn off quotas if they were enabled for orphan cleanup */
	if (quota_update) {
		for (i = 0; i < EXT4_MAXQUOTAS; i++) {
			if (sb_dqopt(sb)->files[i])
				dquot_quota_off(sb, i);
		}
	}
#endif
	sb->s_flags = s_flags; /* Restore SB_RDONLY status */
}
3162 */ 3163 static loff_t ext4_max_size(int blkbits, int has_huge_files) 3164 { 3165 loff_t res; 3166 loff_t upper_limit = MAX_LFS_FILESIZE; 3167 3168 BUILD_BUG_ON(sizeof(blkcnt_t) < sizeof(u64)); 3169 3170 if (!has_huge_files) { 3171 upper_limit = (1LL << 32) - 1; 3172 3173 /* total blocks in file system block size */ 3174 upper_limit >>= (blkbits - 9); 3175 upper_limit <<= blkbits; 3176 } 3177 3178 /* 3179 * 32-bit extent-start container, ee_block. We lower the maxbytes 3180 * by one fs block, so ee_len can cover the extent of maximum file 3181 * size 3182 */ 3183 res = (1LL << 32) - 1; 3184 res <<= blkbits; 3185 3186 /* Sanity check against vm- & vfs- imposed limits */ 3187 if (res > upper_limit) 3188 res = upper_limit; 3189 3190 return res; 3191 } 3192 3193 /* 3194 * Maximal bitmap file size. There is a direct, and {,double-,triple-}indirect 3195 * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks. 3196 * We need to be 1 filesystem block less than the 2^48 sector limit. 3197 */ 3198 static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) 3199 { 3200 loff_t res = EXT4_NDIR_BLOCKS; 3201 int meta_blocks; 3202 loff_t upper_limit; 3203 /* This is calculated to be the largest file size for a dense, block 3204 * mapped file such that the file's total number of 512-byte sectors, 3205 * including data and all indirect blocks, does not exceed (2^48 - 1). 3206 * 3207 * __u32 i_blocks_lo and _u16 i_blocks_high represent the total 3208 * number of 512-byte sectors of the file. 
3209 */ 3210 3211 if (!has_huge_files) { 3212 /* 3213 * !has_huge_files or implies that the inode i_block field 3214 * represents total file blocks in 2^32 512-byte sectors == 3215 * size of vfs inode i_blocks * 8 3216 */ 3217 upper_limit = (1LL << 32) - 1; 3218 3219 /* total blocks in file system block size */ 3220 upper_limit >>= (bits - 9); 3221 3222 } else { 3223 /* 3224 * We use 48 bit ext4_inode i_blocks 3225 * With EXT4_HUGE_FILE_FL set the i_blocks 3226 * represent total number of blocks in 3227 * file system block size 3228 */ 3229 upper_limit = (1LL << 48) - 1; 3230 3231 } 3232 3233 /* indirect blocks */ 3234 meta_blocks = 1; 3235 /* double indirect blocks */ 3236 meta_blocks += 1 + (1LL << (bits-2)); 3237 /* tripple indirect blocks */ 3238 meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2))); 3239 3240 upper_limit -= meta_blocks; 3241 upper_limit <<= bits; 3242 3243 res += 1LL << (bits-2); 3244 res += 1LL << (2*(bits-2)); 3245 res += 1LL << (3*(bits-2)); 3246 res <<= bits; 3247 if (res > upper_limit) 3248 res = upper_limit; 3249 3250 if (res > MAX_LFS_FILESIZE) 3251 res = MAX_LFS_FILESIZE; 3252 3253 return res; 3254 } 3255 3256 static ext4_fsblk_t descriptor_loc(struct super_block *sb, 3257 ext4_fsblk_t logical_sb_block, int nr) 3258 { 3259 struct ext4_sb_info *sbi = EXT4_SB(sb); 3260 ext4_group_t bg, first_meta_bg; 3261 int has_super = 0; 3262 3263 first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); 3264 3265 if (!ext4_has_feature_meta_bg(sb) || nr < first_meta_bg) 3266 return logical_sb_block + nr + 1; 3267 bg = sbi->s_desc_per_block * nr; 3268 if (ext4_bg_has_super(sb, bg)) 3269 has_super = 1; 3270 3271 /* 3272 * If we have a meta_bg fs with 1k blocks, group 0's GDT is at 3273 * block 2, not 1. If s_first_data_block == 0 (bigalloc is enabled 3274 * on modern mke2fs or blksize > 1k on older mke2fs) then we must 3275 * compensate. 
3276 */ 3277 if (sb->s_blocksize == 1024 && nr == 0 && 3278 le32_to_cpu(sbi->s_es->s_first_data_block) == 0) 3279 has_super++; 3280 3281 return (has_super + ext4_group_first_block_no(sb, bg)); 3282 } 3283 3284 /** 3285 * ext4_get_stripe_size: Get the stripe size. 3286 * @sbi: In memory super block info 3287 * 3288 * If we have specified it via mount option, then 3289 * use the mount option value. If the value specified at mount time is 3290 * greater than the blocks per group use the super block value. 3291 * If the super block value is greater than blocks per group return 0. 3292 * Allocator needs it be less than blocks per group. 3293 * 3294 */ 3295 static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) 3296 { 3297 unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride); 3298 unsigned long stripe_width = 3299 le32_to_cpu(sbi->s_es->s_raid_stripe_width); 3300 int ret; 3301 3302 if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) 3303 ret = sbi->s_stripe; 3304 else if (stripe_width && stripe_width <= sbi->s_blocks_per_group) 3305 ret = stripe_width; 3306 else if (stride && stride <= sbi->s_blocks_per_group) 3307 ret = stride; 3308 else 3309 ret = 0; 3310 3311 /* 3312 * If the stripe width is 1, this makes no sense and 3313 * we set it to 0 to turn off stripe handling code. 3314 */ 3315 if (ret <= 1) 3316 ret = 0; 3317 3318 return ret; 3319 } 3320 3321 /* 3322 * Check whether this filesystem can be mounted based on 3323 * the features present and the RDONLY/RDWR mount requested. 3324 * Returns 1 if this filesystem can be mounted as requested, 3325 * 0 if it cannot be. 
3326 */ 3327 static int ext4_feature_set_ok(struct super_block *sb, int readonly) 3328 { 3329 if (ext4_has_unknown_ext4_incompat_features(sb)) { 3330 ext4_msg(sb, KERN_ERR, 3331 "Couldn't mount because of " 3332 "unsupported optional features (%x)", 3333 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & 3334 ~EXT4_FEATURE_INCOMPAT_SUPP)); 3335 return 0; 3336 } 3337 3338 #ifndef CONFIG_UNICODE 3339 if (ext4_has_feature_casefold(sb)) { 3340 ext4_msg(sb, KERN_ERR, 3341 "Filesystem with casefold feature cannot be " 3342 "mounted without CONFIG_UNICODE"); 3343 return 0; 3344 } 3345 #endif 3346 3347 if (readonly) 3348 return 1; 3349 3350 if (ext4_has_feature_readonly(sb)) { 3351 ext4_msg(sb, KERN_INFO, "filesystem is read-only"); 3352 sb->s_flags |= SB_RDONLY; 3353 return 1; 3354 } 3355 3356 /* Check that feature set is OK for a read-write mount */ 3357 if (ext4_has_unknown_ext4_ro_compat_features(sb)) { 3358 ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of " 3359 "unsupported optional features (%x)", 3360 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & 3361 ~EXT4_FEATURE_RO_COMPAT_SUPP)); 3362 return 0; 3363 } 3364 if (ext4_has_feature_bigalloc(sb) && !ext4_has_feature_extents(sb)) { 3365 ext4_msg(sb, KERN_ERR, 3366 "Can't support bigalloc feature without " 3367 "extents feature\n"); 3368 return 0; 3369 } 3370 3371 #if !IS_ENABLED(CONFIG_QUOTA) || !IS_ENABLED(CONFIG_QFMT_V2) 3372 if (!readonly && (ext4_has_feature_quota(sb) || 3373 ext4_has_feature_project(sb))) { 3374 ext4_msg(sb, KERN_ERR, 3375 "The kernel was not built with CONFIG_QUOTA and CONFIG_QFMT_V2"); 3376 return 0; 3377 } 3378 #endif /* CONFIG_QUOTA */ 3379 return 1; 3380 } 3381 3382 /* 3383 * This function is called once a day if we have errors logged 3384 * on the file system 3385 */ 3386 static void print_daily_error_info(struct timer_list *t) 3387 { 3388 struct ext4_sb_info *sbi = from_timer(sbi, t, s_err_report); 3389 struct super_block *sb = sbi->s_sb; 3390 struct ext4_super_block *es = 
sbi->s_es; 3391 3392 if (es->s_error_count) 3393 /* fsck newer than v1.41.13 is needed to clean this condition. */ 3394 ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u", 3395 le32_to_cpu(es->s_error_count)); 3396 if (es->s_first_error_time) { 3397 printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %llu: %.*s:%d", 3398 sb->s_id, 3399 ext4_get_tstamp(es, s_first_error_time), 3400 (int) sizeof(es->s_first_error_func), 3401 es->s_first_error_func, 3402 le32_to_cpu(es->s_first_error_line)); 3403 if (es->s_first_error_ino) 3404 printk(KERN_CONT ": inode %u", 3405 le32_to_cpu(es->s_first_error_ino)); 3406 if (es->s_first_error_block) 3407 printk(KERN_CONT ": block %llu", (unsigned long long) 3408 le64_to_cpu(es->s_first_error_block)); 3409 printk(KERN_CONT "\n"); 3410 } 3411 if (es->s_last_error_time) { 3412 printk(KERN_NOTICE "EXT4-fs (%s): last error at time %llu: %.*s:%d", 3413 sb->s_id, 3414 ext4_get_tstamp(es, s_last_error_time), 3415 (int) sizeof(es->s_last_error_func), 3416 es->s_last_error_func, 3417 le32_to_cpu(es->s_last_error_line)); 3418 if (es->s_last_error_ino) 3419 printk(KERN_CONT ": inode %u", 3420 le32_to_cpu(es->s_last_error_ino)); 3421 if (es->s_last_error_block) 3422 printk(KERN_CONT ": block %llu", (unsigned long long) 3423 le64_to_cpu(es->s_last_error_block)); 3424 printk(KERN_CONT "\n"); 3425 } 3426 mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ 3427 } 3428 3429 /* Find next suitable group and run ext4_init_inode_table */ 3430 static int ext4_run_li_request(struct ext4_li_request *elr) 3431 { 3432 struct ext4_group_desc *gdp = NULL; 3433 struct super_block *sb = elr->lr_super; 3434 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; 3435 ext4_group_t group = elr->lr_next_group; 3436 unsigned long timeout = 0; 3437 unsigned int prefetch_ios = 0; 3438 int ret = 0; 3439 3440 if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) { 3441 elr->lr_next_group = ext4_mb_prefetch(sb, group, 3442 EXT4_SB(sb)->s_mb_prefetch, 
&prefetch_ios); 3443 if (prefetch_ios) 3444 ext4_mb_prefetch_fini(sb, elr->lr_next_group, 3445 prefetch_ios); 3446 trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group, 3447 prefetch_ios); 3448 if (group >= elr->lr_next_group) { 3449 ret = 1; 3450 if (elr->lr_first_not_zeroed != ngroups && 3451 !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) { 3452 elr->lr_next_group = elr->lr_first_not_zeroed; 3453 elr->lr_mode = EXT4_LI_MODE_ITABLE; 3454 ret = 0; 3455 } 3456 } 3457 return ret; 3458 } 3459 3460 for (; group < ngroups; group++) { 3461 gdp = ext4_get_group_desc(sb, group, NULL); 3462 if (!gdp) { 3463 ret = 1; 3464 break; 3465 } 3466 3467 if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) 3468 break; 3469 } 3470 3471 if (group >= ngroups) 3472 ret = 1; 3473 3474 if (!ret) { 3475 timeout = jiffies; 3476 ret = ext4_init_inode_table(sb, group, 3477 elr->lr_timeout ? 0 : 1); 3478 trace_ext4_lazy_itable_init(sb, group); 3479 if (elr->lr_timeout == 0) { 3480 timeout = (jiffies - timeout) * 3481 EXT4_SB(elr->lr_super)->s_li_wait_mult; 3482 elr->lr_timeout = timeout; 3483 } 3484 elr->lr_next_sched = jiffies + elr->lr_timeout; 3485 elr->lr_next_group = group + 1; 3486 } 3487 return ret; 3488 } 3489 3490 /* 3491 * Remove lr_request from the list_request and free the 3492 * request structure. 
Should be called with li_list_mtx held 3493 */ 3494 static void ext4_remove_li_request(struct ext4_li_request *elr) 3495 { 3496 if (!elr) 3497 return; 3498 3499 list_del(&elr->lr_request); 3500 EXT4_SB(elr->lr_super)->s_li_request = NULL; 3501 kfree(elr); 3502 } 3503 3504 static void ext4_unregister_li_request(struct super_block *sb) 3505 { 3506 mutex_lock(&ext4_li_mtx); 3507 if (!ext4_li_info) { 3508 mutex_unlock(&ext4_li_mtx); 3509 return; 3510 } 3511 3512 mutex_lock(&ext4_li_info->li_list_mtx); 3513 ext4_remove_li_request(EXT4_SB(sb)->s_li_request); 3514 mutex_unlock(&ext4_li_info->li_list_mtx); 3515 mutex_unlock(&ext4_li_mtx); 3516 } 3517 3518 static struct task_struct *ext4_lazyinit_task; 3519 3520 /* 3521 * This is the function where ext4lazyinit thread lives. It walks 3522 * through the request list searching for next scheduled filesystem. 3523 * When such a fs is found, run the lazy initialization request 3524 * (ext4_rn_li_request) and keep track of the time spend in this 3525 * function. Based on that time we compute next schedule time of 3526 * the request. When walking through the list is complete, compute 3527 * next waking time and put itself into sleep. 
 */
static int ext4_lazyinit_thread(void *arg)
{
	struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
	struct list_head *pos, *n;
	struct ext4_li_request *elr;
	unsigned long next_wakeup, cur;

	BUG_ON(NULL == eli);

cont_thread:
	while (true) {
		/* Sentinel meaning "no request has a pending schedule time". */
		next_wakeup = MAX_JIFFY_OFFSET;

		mutex_lock(&eli->li_list_mtx);
		if (list_empty(&eli->li_request_list)) {
			mutex_unlock(&eli->li_list_mtx);
			goto exit_thread;
		}
		list_for_each_safe(pos, n, &eli->li_request_list) {
			int err = 0;
			int progress = 0;
			elr = list_entry(pos, struct ext4_li_request,
					 lr_request);

			/* Not due yet: only remember when it becomes due. */
			if (time_before(jiffies, elr->lr_next_sched)) {
				if (time_before(elr->lr_next_sched, next_wakeup))
					next_wakeup = elr->lr_next_sched;
				continue;
			}
			/* Trylocks only: never block the whole list on one sb. */
			if (down_read_trylock(&elr->lr_super->s_umount)) {
				if (sb_start_write_trylock(elr->lr_super)) {
					progress = 1;
					/*
					 * We hold sb->s_umount, sb can not
					 * be removed from the list, it is
					 * now safe to drop li_list_mtx
					 */
					mutex_unlock(&eli->li_list_mtx);
					err = ext4_run_li_request(elr);
					sb_end_write(elr->lr_super);
					mutex_lock(&eli->li_list_mtx);
					/*
					 * The list mutex was dropped above, so
					 * re-read the successor of this entry.
					 */
					n = pos->next;
				}
				up_read((&elr->lr_super->s_umount));
			}
			/* error, remove the lazy_init job */
			/* (non-zero is also returned when the request is done) */
			if (err) {
				ext4_remove_li_request(elr);
				continue;
			}
			/* Could not get the locks: retry after a random delay. */
			if (!progress) {
				elr->lr_next_sched = jiffies +
					(prandom_u32()
					 % (EXT4_DEF_LI_MAX_START_DELAY * HZ));
			}
			if (time_before(elr->lr_next_sched, next_wakeup))
				next_wakeup = elr->lr_next_sched;
		}
		mutex_unlock(&eli->li_list_mtx);

		try_to_freeze();

		cur = jiffies;
		/* Something is already due, or nothing is scheduled: rescan. */
		if ((time_after_eq(cur, next_wakeup)) ||
		    (MAX_JIFFY_OFFSET == next_wakeup)) {
			cond_resched();
			continue;
		}

		schedule_timeout_interruptible(next_wakeup - cur);

		if (kthread_should_stop()) {
			ext4_clear_request_list();
			goto exit_thread;
		}
	}

exit_thread:
	/*
	 * It looks like the request list is empty, but we need
	 * to check it under the li_list_mtx lock, to prevent any
	 * additions into it, and of course we should lock ext4_li_mtx
	 * to atomically free the list and ext4_li_info, because at
	 * this point another ext4 filesystem could be registering
	 * new one.
	 */
	mutex_lock(&ext4_li_mtx);
	mutex_lock(&eli->li_list_mtx);
	if (!list_empty(&eli->li_request_list)) {
		/* A request was queued in the meantime: keep running. */
		mutex_unlock(&eli->li_list_mtx);
		mutex_unlock(&ext4_li_mtx);
		goto cont_thread;
	}
	mutex_unlock(&eli->li_list_mtx);
	kfree(ext4_li_info);
	ext4_li_info = NULL;
	mutex_unlock(&ext4_li_mtx);

	return 0;
}

/*
 * Remove and free every request on ext4_li_info's request list.
 * Takes the list mutex itself; ext4_li_info must be non-NULL.
 */
static void ext4_clear_request_list(void)
{
	struct list_head *pos, *n;
	struct ext4_li_request *elr;

	mutex_lock(&ext4_li_info->li_list_mtx);
	list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
		elr = list_entry(pos, struct ext4_li_request,
				 lr_request);
		ext4_remove_li_request(elr);
	}
	mutex_unlock(&ext4_li_info->li_list_mtx);
}

/*
 * Start the "ext4lazyinit" kernel thread for ext4_li_info.
 *
 * On failure all queued requests and ext4_li_info itself are freed and the
 * error from kthread_run() is returned; on success EXT4_LAZYINIT_RUNNING is
 * set in li_state and 0 is returned.
 */
static int ext4_run_lazyinit_thread(void)
{
	ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread,
					 ext4_li_info, "ext4lazyinit");
	if (IS_ERR(ext4_lazyinit_task)) {
		int err = PTR_ERR(ext4_lazyinit_task);
		ext4_clear_request_list();
		kfree(ext4_li_info);
		ext4_li_info = NULL;
		printk(KERN_CRIT "EXT4-fs: error %d creating inode table "
				 "initialization thread\n",
				 err);
		return err;
	}
	ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
	return 0;
}

/*
 * Check whether it make sense to run itable init. thread or not.
 * If there is at least one uninitialized inode table, return
 * corresponding group number, else the loop goes through all
 * groups and return total number of groups.
3667 */ 3668 static ext4_group_t ext4_has_uninit_itable(struct super_block *sb) 3669 { 3670 ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count; 3671 struct ext4_group_desc *gdp = NULL; 3672 3673 if (!ext4_has_group_desc_csum(sb)) 3674 return ngroups; 3675 3676 for (group = 0; group < ngroups; group++) { 3677 gdp = ext4_get_group_desc(sb, group, NULL); 3678 if (!gdp) 3679 continue; 3680 3681 if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) 3682 break; 3683 } 3684 3685 return group; 3686 } 3687 3688 static int ext4_li_info_new(void) 3689 { 3690 struct ext4_lazy_init *eli = NULL; 3691 3692 eli = kzalloc(sizeof(*eli), GFP_KERNEL); 3693 if (!eli) 3694 return -ENOMEM; 3695 3696 INIT_LIST_HEAD(&eli->li_request_list); 3697 mutex_init(&eli->li_list_mtx); 3698 3699 eli->li_state |= EXT4_LAZYINIT_QUIT; 3700 3701 ext4_li_info = eli; 3702 3703 return 0; 3704 } 3705 3706 static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, 3707 ext4_group_t start) 3708 { 3709 struct ext4_li_request *elr; 3710 3711 elr = kzalloc(sizeof(*elr), GFP_KERNEL); 3712 if (!elr) 3713 return NULL; 3714 3715 elr->lr_super = sb; 3716 elr->lr_first_not_zeroed = start; 3717 if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) { 3718 elr->lr_mode = EXT4_LI_MODE_ITABLE; 3719 elr->lr_next_group = start; 3720 } else { 3721 elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP; 3722 } 3723 3724 /* 3725 * Randomize first schedule time of the request to 3726 * spread the inode table initialization requests 3727 * better. 
3728 */ 3729 elr->lr_next_sched = jiffies + (prandom_u32() % 3730 (EXT4_DEF_LI_MAX_START_DELAY * HZ)); 3731 return elr; 3732 } 3733 3734 int ext4_register_li_request(struct super_block *sb, 3735 ext4_group_t first_not_zeroed) 3736 { 3737 struct ext4_sb_info *sbi = EXT4_SB(sb); 3738 struct ext4_li_request *elr = NULL; 3739 ext4_group_t ngroups = sbi->s_groups_count; 3740 int ret = 0; 3741 3742 mutex_lock(&ext4_li_mtx); 3743 if (sbi->s_li_request != NULL) { 3744 /* 3745 * Reset timeout so it can be computed again, because 3746 * s_li_wait_mult might have changed. 3747 */ 3748 sbi->s_li_request->lr_timeout = 0; 3749 goto out; 3750 } 3751 3752 if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) && 3753 (first_not_zeroed == ngroups || sb_rdonly(sb) || 3754 !test_opt(sb, INIT_INODE_TABLE))) 3755 goto out; 3756 3757 elr = ext4_li_request_new(sb, first_not_zeroed); 3758 if (!elr) { 3759 ret = -ENOMEM; 3760 goto out; 3761 } 3762 3763 if (NULL == ext4_li_info) { 3764 ret = ext4_li_info_new(); 3765 if (ret) 3766 goto out; 3767 } 3768 3769 mutex_lock(&ext4_li_info->li_list_mtx); 3770 list_add(&elr->lr_request, &ext4_li_info->li_request_list); 3771 mutex_unlock(&ext4_li_info->li_list_mtx); 3772 3773 sbi->s_li_request = elr; 3774 /* 3775 * set elr to NULL here since it has been inserted to 3776 * the request_list and the removal and free of it is 3777 * handled by ext4_clear_request_list from now on. 3778 */ 3779 elr = NULL; 3780 3781 if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) { 3782 ret = ext4_run_lazyinit_thread(); 3783 if (ret) 3784 goto out; 3785 } 3786 out: 3787 mutex_unlock(&ext4_li_mtx); 3788 if (ret) 3789 kfree(elr); 3790 return ret; 3791 } 3792 3793 /* 3794 * We do not need to lock anything since this is called on 3795 * module unload. 3796 */ 3797 static void ext4_destroy_lazyinit_thread(void) 3798 { 3799 /* 3800 * If thread exited earlier 3801 * there's nothing to be done. 
3802 */ 3803 if (!ext4_li_info || !ext4_lazyinit_task) 3804 return; 3805 3806 kthread_stop(ext4_lazyinit_task); 3807 } 3808 3809 static int set_journal_csum_feature_set(struct super_block *sb) 3810 { 3811 int ret = 1; 3812 int compat, incompat; 3813 struct ext4_sb_info *sbi = EXT4_SB(sb); 3814 3815 if (ext4_has_metadata_csum(sb)) { 3816 /* journal checksum v3 */ 3817 compat = 0; 3818 incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3; 3819 } else { 3820 /* journal checksum v1 */ 3821 compat = JBD2_FEATURE_COMPAT_CHECKSUM; 3822 incompat = 0; 3823 } 3824 3825 jbd2_journal_clear_features(sbi->s_journal, 3826 JBD2_FEATURE_COMPAT_CHECKSUM, 0, 3827 JBD2_FEATURE_INCOMPAT_CSUM_V3 | 3828 JBD2_FEATURE_INCOMPAT_CSUM_V2); 3829 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { 3830 ret = jbd2_journal_set_features(sbi->s_journal, 3831 compat, 0, 3832 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | 3833 incompat); 3834 } else if (test_opt(sb, JOURNAL_CHECKSUM)) { 3835 ret = jbd2_journal_set_features(sbi->s_journal, 3836 compat, 0, 3837 incompat); 3838 jbd2_journal_clear_features(sbi->s_journal, 0, 0, 3839 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); 3840 } else { 3841 jbd2_journal_clear_features(sbi->s_journal, 0, 0, 3842 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); 3843 } 3844 3845 return ret; 3846 } 3847 3848 /* 3849 * Note: calculating the overhead so we can be compatible with 3850 * historical BSD practice is quite difficult in the face of 3851 * clusters/bigalloc. This is because multiple metadata blocks from 3852 * different block group can end up in the same allocation cluster. 3853 * Calculating the exact overhead in the face of clustered allocation 3854 * requires either O(all block bitmaps) in memory or O(number of block 3855 * groups**2) in time. 
 * We will still calculate the superblock for
 * older file systems --- and if we come across with a bigalloc file
 * system with zero in s_overhead_clusters the estimate will be close to
 * correct especially for very large cluster sizes --- but for newer
 * file systems, it's better to calculate this figure once at mkfs
 * time, and store it in the superblock.  If the superblock value is
 * present (even for non-bigalloc file systems), we will use it.
 *
 * count_overhead() returns the overhead of block group @grp.  @buf is a
 * scratch bitmap with one bit per cluster of the group; the bigalloc path
 * sets a bit for every cluster that holds metadata and the caller is
 * expected to pass it in zeroed.
 */
static int count_overhead(struct super_block *sb, ext4_group_t grp,
			  char *buf)
{
	struct ext4_sb_info	*sbi = EXT4_SB(sb);
	struct ext4_group_desc	*gdp;
	ext4_fsblk_t		first_block, last_block, b;
	ext4_group_t		i, ngroups = ext4_get_groups_count(sb);
	int			s, j, count = 0;

	/*
	 * Without bigalloc the answer is a plain sum; the "+ 2" is presumably
	 * the block bitmap and the inode bitmap -- TODO confirm.
	 */
	if (!ext4_has_feature_bigalloc(sb))
		return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) +
			sbi->s_itb_per_group + 2);

	/* Block range [first_block, last_block] covered by group @grp. */
	first_block = le32_to_cpu(sbi->s_es->s_first_data_block) +
		(grp * EXT4_BLOCKS_PER_GROUP(sb));
	last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1;
	/*
	 * Metadata of any group may live inside @grp, so every descriptor is
	 * examined.
	 * NOTE(review): gdp is used without a NULL check here, unlike the
	 * other ext4_get_group_desc() callers in this file -- confirm that it
	 * cannot be NULL at this point.
	 */
	for (i = 0; i < ngroups; i++) {
		gdp = ext4_get_group_desc(sb, i, NULL);
		b = ext4_block_bitmap(sb, gdp);
		if (b >= first_block && b <= last_block) {
			ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf);
			count++;
		}
		b = ext4_inode_bitmap(sb, gdp);
		if (b >= first_block && b <= last_block) {
			ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf);
			count++;
		}
		b = ext4_inode_table(sb, gdp);
		if (b >= first_block && b + sbi->s_itb_per_group <= last_block)
			for (j = 0; j < sbi->s_itb_per_group; j++, b++) {
				int c = EXT4_B2C(sbi, b - first_block);
				ext4_set_bit(c, buf);
				count++;
			}
		/* Superblock and GDT blocks are only accounted for @grp itself. */
		if (i != grp)
			continue;
		s = 0;
		if (ext4_bg_has_super(sb, grp)) {
			ext4_set_bit(s++, buf);
			count++;
		}
		j = ext4_bg_num_gdb(sb, grp);
		if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) {
			ext4_error(sb, "Invalid number of block group "
				   "descriptor blocks: %d", j);
			/* Clamp so the bits set below stay inside the group. */
			j = EXT4_BLOCKS_PER_GROUP(sb) - s;
		}
		count += j;
		for (; j > 0; j--)
			ext4_set_bit(EXT4_B2C(sbi, s++), buf);
	}
	if (!count)
		return 0;
	/* Overhead in clusters = clusters per group minus clear bits in buf. */
	return EXT4_CLUSTERS_PER_GROUP(sb) -
		ext4_count_free(buf, EXT4_CLUSTERS_PER_GROUP(sb) / 8);
}

/*
 * Compute the overhead and stash it in sbi->s_overhead
 *
 * Sums the clusters before s_first_data_block, the per-group overhead from
 * count_overhead() and the size of an internal journal.  Returns 0 on
 * success or -ENOMEM if the scratch page cannot be allocated.
 */
int ext4_calculate_overhead(struct super_block *sb)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_super_block *es = sbi->s_es;
	struct inode *j_inode;
	unsigned int j_blocks, j_inum = le32_to_cpu(es->s_journal_inum);
	ext4_group_t i, ngroups = ext4_get_groups_count(sb);
	ext4_fsblk_t overhead = 0;
	/* One zeroed page used as the per-group scratch bitmap. */
	char *buf = (char *) get_zeroed_page(GFP_NOFS);

	if (!buf)
		return -ENOMEM;

	/*
	 * Compute the overhead (FS structures).  This is constant
	 * for a given filesystem unless the number of block groups
	 * changes so we cache the previous value until it does.
	 */

	/*
	 * All of the blocks before first_data_block are overhead
	 */
	overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block));

	/*
	 * Add the overhead found in each block group
	 */
	for (i = 0; i < ngroups; i++) {
		int blks;

		blks = count_overhead(sb, i, buf);
		overhead += blks;
		/* A zero result means no bit was set, so skip the re-zeroing. */
		if (blks)
			memset(buf, 0, PAGE_SIZE);
		cond_resched();
	}

	/*
	 * Add the internal journal blocks whether the journal has been
	 * loaded or not
	 */
	if (sbi->s_journal && !sbi->s_journal_bdev)
		overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len);
	else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) {
		/* j_inum for internal journal is non-zero */
		j_inode = ext4_get_journal_inode(sb, j_inum);
		if (j_inode) {
			j_blocks = j_inode->i_size >> sb->s_blocksize_bits;
			overhead += EXT4_NUM_B2C(sbi, j_blocks);
			iput(j_inode);
		} else {
			ext4_msg(sb, KERN_ERR, "can't get journal size");
		}
	}
	sbi->s_overhead = overhead;
	/* NOTE(review): presumably pairs with a read barrier in a reader of s_overhead -- confirm */
	smp_wmb();
	free_page((unsigned long) buf);
	return 0;
}

/*
 * Set sbi->s_resv_clusters, the number of clusters held in reserve, on
 * filesystems that use extents.
 */
static void ext4_set_resv_clusters(struct super_block *sb)
{
	ext4_fsblk_t resv_clusters;
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	/*
	 * There's no need to reserve anything when we aren't using extents.
	 * The space estimates are exact, there are no unwritten extents,
	 * hole punching doesn't need new metadata... This is needed especially
	 * to keep ext2/3 backward compatibility.
	 */
	if (!ext4_has_feature_extents(sb))
		return;
	/*
	 * By default we reserve 2% or 4096 clusters, whichever is smaller.
	 * This should cover the situations where we can not afford to run
	 * out of space like for example punch hole, or converting
	 * unwritten extents in delalloc path.
In most cases such 4002 * allocation would require 1, or 2 blocks, higher numbers are 4003 * very rare. 4004 */ 4005 resv_clusters = (ext4_blocks_count(sbi->s_es) >> 4006 sbi->s_cluster_bits); 4007 4008 do_div(resv_clusters, 50); 4009 resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096); 4010 4011 atomic64_set(&sbi->s_resv_clusters, resv_clusters); 4012 } 4013 4014 static const char *ext4_quota_mode(struct super_block *sb) 4015 { 4016 #ifdef CONFIG_QUOTA 4017 if (!ext4_quota_capable(sb)) 4018 return "none"; 4019 4020 if (EXT4_SB(sb)->s_journal && ext4_is_quota_journalled(sb)) 4021 return "journalled"; 4022 else 4023 return "writeback"; 4024 #else 4025 return "disabled"; 4026 #endif 4027 } 4028 4029 static int ext4_fill_super(struct super_block *sb, void *data, int silent) 4030 { 4031 struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev); 4032 char *orig_data = kstrdup(data, GFP_KERNEL); 4033 struct buffer_head *bh, **group_desc; 4034 struct ext4_super_block *es = NULL; 4035 struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 4036 struct flex_groups **flex_groups; 4037 ext4_fsblk_t block; 4038 ext4_fsblk_t sb_block = get_sb_block(&data); 4039 ext4_fsblk_t logical_sb_block; 4040 unsigned long offset = 0; 4041 unsigned long def_mount_opts; 4042 struct inode *root; 4043 const char *descr; 4044 int ret = -ENOMEM; 4045 int blocksize, clustersize; 4046 unsigned int db_count; 4047 unsigned int i; 4048 int needs_recovery, has_huge_files; 4049 __u64 blocks_count; 4050 int err = 0; 4051 ext4_group_t first_not_zeroed; 4052 struct ext4_parsed_options parsed_opts; 4053 4054 /* Set defaults for the variables that will be set during parsing */ 4055 parsed_opts.journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 4056 parsed_opts.journal_devnum = 0; 4057 parsed_opts.mb_optimize_scan = DEFAULT_MB_OPTIMIZE_SCAN; 4058 4059 if ((data && !orig_data) || !sbi) 4060 goto out_free_base; 4061 4062 sbi->s_daxdev = dax_dev; 4063 sbi->s_blockgroup_lock = 4064 kzalloc(sizeof(struct 
blockgroup_lock), GFP_KERNEL); 4065 if (!sbi->s_blockgroup_lock) 4066 goto out_free_base; 4067 4068 sb->s_fs_info = sbi; 4069 sbi->s_sb = sb; 4070 sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; 4071 sbi->s_sb_block = sb_block; 4072 sbi->s_sectors_written_start = 4073 part_stat_read(sb->s_bdev, sectors[STAT_WRITE]); 4074 4075 /* Cleanup superblock name */ 4076 strreplace(sb->s_id, '/', '!'); 4077 4078 /* -EINVAL is default */ 4079 ret = -EINVAL; 4080 blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); 4081 if (!blocksize) { 4082 ext4_msg(sb, KERN_ERR, "unable to set blocksize"); 4083 goto out_fail; 4084 } 4085 4086 /* 4087 * The ext4 superblock will not be buffer aligned for other than 1kB 4088 * block sizes. We need to calculate the offset from buffer start. 4089 */ 4090 if (blocksize != EXT4_MIN_BLOCK_SIZE) { 4091 logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; 4092 offset = do_div(logical_sb_block, blocksize); 4093 } else { 4094 logical_sb_block = sb_block; 4095 } 4096 4097 bh = ext4_sb_bread_unmovable(sb, logical_sb_block); 4098 if (IS_ERR(bh)) { 4099 ext4_msg(sb, KERN_ERR, "unable to read superblock"); 4100 ret = PTR_ERR(bh); 4101 goto out_fail; 4102 } 4103 /* 4104 * Note: s_es must be initialized as soon as possible because 4105 * some ext4 macro-instructions depend on its value 4106 */ 4107 es = (struct ext4_super_block *) (bh->b_data + offset); 4108 sbi->s_es = es; 4109 sb->s_magic = le16_to_cpu(es->s_magic); 4110 if (sb->s_magic != EXT4_SUPER_MAGIC) 4111 goto cantfind_ext4; 4112 sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written); 4113 4114 /* Warn if metadata_csum and gdt_csum are both set. 
*/ 4115 if (ext4_has_feature_metadata_csum(sb) && 4116 ext4_has_feature_gdt_csum(sb)) 4117 ext4_warning(sb, "metadata_csum and uninit_bg are " 4118 "redundant flags; please run fsck."); 4119 4120 /* Check for a known checksum algorithm */ 4121 if (!ext4_verify_csum_type(sb, es)) { 4122 ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " 4123 "unknown checksum algorithm."); 4124 silent = 1; 4125 goto cantfind_ext4; 4126 } 4127 4128 /* Load the checksum driver */ 4129 sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); 4130 if (IS_ERR(sbi->s_chksum_driver)) { 4131 ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver."); 4132 ret = PTR_ERR(sbi->s_chksum_driver); 4133 sbi->s_chksum_driver = NULL; 4134 goto failed_mount; 4135 } 4136 4137 /* Check superblock checksum */ 4138 if (!ext4_superblock_csum_verify(sb, es)) { 4139 ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " 4140 "invalid superblock checksum. Run e2fsck?"); 4141 silent = 1; 4142 ret = -EFSBADCRC; 4143 goto cantfind_ext4; 4144 } 4145 4146 /* Precompute checksum seed for all metadata */ 4147 if (ext4_has_feature_csum_seed(sb)) 4148 sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed); 4149 else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb)) 4150 sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, 4151 sizeof(es->s_uuid)); 4152 4153 /* Set defaults before we parse the mount options */ 4154 def_mount_opts = le32_to_cpu(es->s_default_mount_opts); 4155 set_opt(sb, INIT_INODE_TABLE); 4156 if (def_mount_opts & EXT4_DEFM_DEBUG) 4157 set_opt(sb, DEBUG); 4158 if (def_mount_opts & EXT4_DEFM_BSDGROUPS) 4159 set_opt(sb, GRPID); 4160 if (def_mount_opts & EXT4_DEFM_UID16) 4161 set_opt(sb, NO_UID32); 4162 /* xattr user namespace & acls are now defaulted on */ 4163 set_opt(sb, XATTR_USER); 4164 #ifdef CONFIG_EXT4_FS_POSIX_ACL 4165 set_opt(sb, POSIX_ACL); 4166 #endif 4167 if (ext4_has_feature_fast_commit(sb)) 4168 set_opt2(sb, JOURNAL_FAST_COMMIT); 4169 /* don't forget to enable journal_csum 
when metadata_csum is enabled. */ 4170 if (ext4_has_metadata_csum(sb)) 4171 set_opt(sb, JOURNAL_CHECKSUM); 4172 4173 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) 4174 set_opt(sb, JOURNAL_DATA); 4175 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) 4176 set_opt(sb, ORDERED_DATA); 4177 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) 4178 set_opt(sb, WRITEBACK_DATA); 4179 4180 if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) 4181 set_opt(sb, ERRORS_PANIC); 4182 else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE) 4183 set_opt(sb, ERRORS_CONT); 4184 else 4185 set_opt(sb, ERRORS_RO); 4186 /* block_validity enabled by default; disable with noblock_validity */ 4187 set_opt(sb, BLOCK_VALIDITY); 4188 if (def_mount_opts & EXT4_DEFM_DISCARD) 4189 set_opt(sb, DISCARD); 4190 4191 sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid)); 4192 sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid)); 4193 sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; 4194 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; 4195 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; 4196 4197 if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) 4198 set_opt(sb, BARRIER); 4199 4200 /* 4201 * enable delayed allocation by default 4202 * Use -o nodelalloc to turn it off 4203 */ 4204 if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) && 4205 ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) 4206 set_opt(sb, DELALLOC); 4207 4208 /* 4209 * set default s_li_wait_mult for lazyinit, for the case there is 4210 * no mount option specified. 
4211 */ 4212 sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; 4213 4214 if (le32_to_cpu(es->s_log_block_size) > 4215 (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { 4216 ext4_msg(sb, KERN_ERR, 4217 "Invalid log block size: %u", 4218 le32_to_cpu(es->s_log_block_size)); 4219 goto failed_mount; 4220 } 4221 if (le32_to_cpu(es->s_log_cluster_size) > 4222 (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { 4223 ext4_msg(sb, KERN_ERR, 4224 "Invalid log cluster size: %u", 4225 le32_to_cpu(es->s_log_cluster_size)); 4226 goto failed_mount; 4227 } 4228 4229 blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); 4230 4231 if (blocksize == PAGE_SIZE) 4232 set_opt(sb, DIOREAD_NOLOCK); 4233 4234 if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { 4235 sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; 4236 sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO; 4237 } else { 4238 sbi->s_inode_size = le16_to_cpu(es->s_inode_size); 4239 sbi->s_first_ino = le32_to_cpu(es->s_first_ino); 4240 if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) { 4241 ext4_msg(sb, KERN_ERR, "invalid first ino: %u", 4242 sbi->s_first_ino); 4243 goto failed_mount; 4244 } 4245 if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || 4246 (!is_power_of_2(sbi->s_inode_size)) || 4247 (sbi->s_inode_size > blocksize)) { 4248 ext4_msg(sb, KERN_ERR, 4249 "unsupported inode size: %d", 4250 sbi->s_inode_size); 4251 ext4_msg(sb, KERN_ERR, "blocksize: %d", blocksize); 4252 goto failed_mount; 4253 } 4254 /* 4255 * i_atime_extra is the last extra field available for 4256 * [acm]times in struct ext4_inode. Checking for that 4257 * field should suffice to ensure we have extra space 4258 * for all three. 
4259 */ 4260 if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) + 4261 sizeof(((struct ext4_inode *)0)->i_atime_extra)) { 4262 sb->s_time_gran = 1; 4263 sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX; 4264 } else { 4265 sb->s_time_gran = NSEC_PER_SEC; 4266 sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX; 4267 } 4268 sb->s_time_min = EXT4_TIMESTAMP_MIN; 4269 } 4270 if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { 4271 sbi->s_want_extra_isize = sizeof(struct ext4_inode) - 4272 EXT4_GOOD_OLD_INODE_SIZE; 4273 if (ext4_has_feature_extra_isize(sb)) { 4274 unsigned v, max = (sbi->s_inode_size - 4275 EXT4_GOOD_OLD_INODE_SIZE); 4276 4277 v = le16_to_cpu(es->s_want_extra_isize); 4278 if (v > max) { 4279 ext4_msg(sb, KERN_ERR, 4280 "bad s_want_extra_isize: %d", v); 4281 goto failed_mount; 4282 } 4283 if (sbi->s_want_extra_isize < v) 4284 sbi->s_want_extra_isize = v; 4285 4286 v = le16_to_cpu(es->s_min_extra_isize); 4287 if (v > max) { 4288 ext4_msg(sb, KERN_ERR, 4289 "bad s_min_extra_isize: %d", v); 4290 goto failed_mount; 4291 } 4292 if (sbi->s_want_extra_isize < v) 4293 sbi->s_want_extra_isize = v; 4294 } 4295 } 4296 4297 if (sbi->s_es->s_mount_opts[0]) { 4298 char *s_mount_opts = kstrndup(sbi->s_es->s_mount_opts, 4299 sizeof(sbi->s_es->s_mount_opts), 4300 GFP_KERNEL); 4301 if (!s_mount_opts) 4302 goto failed_mount; 4303 if (!parse_options(s_mount_opts, sb, &parsed_opts, 0)) { 4304 ext4_msg(sb, KERN_WARNING, 4305 "failed to parse options in superblock: %s", 4306 s_mount_opts); 4307 } 4308 kfree(s_mount_opts); 4309 } 4310 sbi->s_def_mount_opt = sbi->s_mount_opt; 4311 if (!parse_options((char *) data, sb, &parsed_opts, 0)) 4312 goto failed_mount; 4313 4314 #ifdef CONFIG_UNICODE 4315 if (ext4_has_feature_casefold(sb) && !sb->s_encoding) { 4316 const struct ext4_sb_encodings *encoding_info; 4317 struct unicode_map *encoding; 4318 __u16 encoding_flags; 4319 4320 if (ext4_sb_read_encoding(es, &encoding_info, 4321 &encoding_flags)) { 4322 ext4_msg(sb, KERN_ERR, 4323 
"Encoding requested by superblock is unknown"); 4324 goto failed_mount; 4325 } 4326 4327 encoding = utf8_load(encoding_info->version); 4328 if (IS_ERR(encoding)) { 4329 ext4_msg(sb, KERN_ERR, 4330 "can't mount with superblock charset: %s-%s " 4331 "not supported by the kernel. flags: 0x%x.", 4332 encoding_info->name, encoding_info->version, 4333 encoding_flags); 4334 goto failed_mount; 4335 } 4336 ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: " 4337 "%s-%s with flags 0x%hx", encoding_info->name, 4338 encoding_info->version?:"\b", encoding_flags); 4339 4340 sb->s_encoding = encoding; 4341 sb->s_encoding_flags = encoding_flags; 4342 } 4343 #endif 4344 4345 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { 4346 printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, O_DIRECT and fast_commit support!\n"); 4347 /* can't mount with both data=journal and dioread_nolock. */ 4348 clear_opt(sb, DIOREAD_NOLOCK); 4349 clear_opt2(sb, JOURNAL_FAST_COMMIT); 4350 if (test_opt2(sb, EXPLICIT_DELALLOC)) { 4351 ext4_msg(sb, KERN_ERR, "can't mount with " 4352 "both data=journal and delalloc"); 4353 goto failed_mount; 4354 } 4355 if (test_opt(sb, DAX_ALWAYS)) { 4356 ext4_msg(sb, KERN_ERR, "can't mount with " 4357 "both data=journal and dax"); 4358 goto failed_mount; 4359 } 4360 if (ext4_has_feature_encrypt(sb)) { 4361 ext4_msg(sb, KERN_WARNING, 4362 "encrypted files will use data=ordered " 4363 "instead of data journaling mode"); 4364 } 4365 if (test_opt(sb, DELALLOC)) 4366 clear_opt(sb, DELALLOC); 4367 } else { 4368 sb->s_iflags |= SB_I_CGROUPWB; 4369 } 4370 4371 sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | 4372 (test_opt(sb, POSIX_ACL) ? 
SB_POSIXACL : 0); 4373 4374 if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && 4375 (ext4_has_compat_features(sb) || 4376 ext4_has_ro_compat_features(sb) || 4377 ext4_has_incompat_features(sb))) 4378 ext4_msg(sb, KERN_WARNING, 4379 "feature flags set on rev 0 fs, " 4380 "running e2fsck is recommended"); 4381 4382 if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) { 4383 set_opt2(sb, HURD_COMPAT); 4384 if (ext4_has_feature_64bit(sb)) { 4385 ext4_msg(sb, KERN_ERR, 4386 "The Hurd can't support 64-bit file systems"); 4387 goto failed_mount; 4388 } 4389 4390 /* 4391 * ea_inode feature uses l_i_version field which is not 4392 * available in HURD_COMPAT mode. 4393 */ 4394 if (ext4_has_feature_ea_inode(sb)) { 4395 ext4_msg(sb, KERN_ERR, 4396 "ea_inode feature is not supported for Hurd"); 4397 goto failed_mount; 4398 } 4399 } 4400 4401 if (IS_EXT2_SB(sb)) { 4402 if (ext2_feature_set_ok(sb)) 4403 ext4_msg(sb, KERN_INFO, "mounting ext2 file system " 4404 "using the ext4 subsystem"); 4405 else { 4406 /* 4407 * If we're probing be silent, if this looks like 4408 * it's actually an ext[34] filesystem. 4409 */ 4410 if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb))) 4411 goto failed_mount; 4412 ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due " 4413 "to feature incompatibilities"); 4414 goto failed_mount; 4415 } 4416 } 4417 4418 if (IS_EXT3_SB(sb)) { 4419 if (ext3_feature_set_ok(sb)) 4420 ext4_msg(sb, KERN_INFO, "mounting ext3 file system " 4421 "using the ext4 subsystem"); 4422 else { 4423 /* 4424 * If we're probing be silent, if this looks like 4425 * it's actually an ext4 filesystem. 
4426 */ 4427 if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb))) 4428 goto failed_mount; 4429 ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due " 4430 "to feature incompatibilities"); 4431 goto failed_mount; 4432 } 4433 } 4434 4435 /* 4436 * Check feature flags regardless of the revision level, since we 4437 * previously didn't change the revision level when setting the flags, 4438 * so there is a chance incompat flags are set on a rev 0 filesystem. 4439 */ 4440 if (!ext4_feature_set_ok(sb, (sb_rdonly(sb)))) 4441 goto failed_mount; 4442 4443 if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (blocksize / 4)) { 4444 ext4_msg(sb, KERN_ERR, 4445 "Number of reserved GDT blocks insanely large: %d", 4446 le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks)); 4447 goto failed_mount; 4448 } 4449 4450 if (bdev_dax_supported(sb->s_bdev, blocksize)) 4451 set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags); 4452 4453 if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) { 4454 if (ext4_has_feature_inline_data(sb)) { 4455 ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem" 4456 " that may contain inline data"); 4457 goto failed_mount; 4458 } 4459 if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) { 4460 ext4_msg(sb, KERN_ERR, 4461 "DAX unsupported by block device."); 4462 goto failed_mount; 4463 } 4464 } 4465 4466 if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) { 4467 ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d", 4468 es->s_encryption_level); 4469 goto failed_mount; 4470 } 4471 4472 if (sb->s_blocksize != blocksize) { 4473 /* 4474 * bh must be released before kill_bdev(), otherwise 4475 * it won't be freed and its page also. kill_bdev() 4476 * is called by sb_set_blocksize(). 
4477 */ 4478 brelse(bh); 4479 /* Validate the filesystem blocksize */ 4480 if (!sb_set_blocksize(sb, blocksize)) { 4481 ext4_msg(sb, KERN_ERR, "bad block size %d", 4482 blocksize); 4483 bh = NULL; 4484 goto failed_mount; 4485 } 4486 4487 logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; 4488 offset = do_div(logical_sb_block, blocksize); 4489 bh = ext4_sb_bread_unmovable(sb, logical_sb_block); 4490 if (IS_ERR(bh)) { 4491 ext4_msg(sb, KERN_ERR, 4492 "Can't read superblock on 2nd try"); 4493 ret = PTR_ERR(bh); 4494 bh = NULL; 4495 goto failed_mount; 4496 } 4497 es = (struct ext4_super_block *)(bh->b_data + offset); 4498 sbi->s_es = es; 4499 if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { 4500 ext4_msg(sb, KERN_ERR, 4501 "Magic mismatch, very weird!"); 4502 goto failed_mount; 4503 } 4504 } 4505 4506 has_huge_files = ext4_has_feature_huge_file(sb); 4507 sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, 4508 has_huge_files); 4509 sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); 4510 4511 sbi->s_desc_size = le16_to_cpu(es->s_desc_size); 4512 if (ext4_has_feature_64bit(sb)) { 4513 if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || 4514 sbi->s_desc_size > EXT4_MAX_DESC_SIZE || 4515 !is_power_of_2(sbi->s_desc_size)) { 4516 ext4_msg(sb, KERN_ERR, 4517 "unsupported descriptor size %lu", 4518 sbi->s_desc_size); 4519 goto failed_mount; 4520 } 4521 } else 4522 sbi->s_desc_size = EXT4_MIN_DESC_SIZE; 4523 4524 sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); 4525 sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); 4526 4527 sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb); 4528 if (sbi->s_inodes_per_block == 0) 4529 goto cantfind_ext4; 4530 if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || 4531 sbi->s_inodes_per_group > blocksize * 8) { 4532 ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n", 4533 sbi->s_inodes_per_group); 4534 goto failed_mount; 4535 } 4536 sbi->s_itb_per_group = 
sbi->s_inodes_per_group / 4537 sbi->s_inodes_per_block; 4538 sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb); 4539 sbi->s_sbh = bh; 4540 sbi->s_mount_state = le16_to_cpu(es->s_state); 4541 sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); 4542 sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); 4543 4544 for (i = 0; i < 4; i++) 4545 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); 4546 sbi->s_def_hash_version = es->s_def_hash_version; 4547 if (ext4_has_feature_dir_index(sb)) { 4548 i = le32_to_cpu(es->s_flags); 4549 if (i & EXT2_FLAGS_UNSIGNED_HASH) 4550 sbi->s_hash_unsigned = 3; 4551 else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { 4552 #ifdef __CHAR_UNSIGNED__ 4553 if (!sb_rdonly(sb)) 4554 es->s_flags |= 4555 cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); 4556 sbi->s_hash_unsigned = 3; 4557 #else 4558 if (!sb_rdonly(sb)) 4559 es->s_flags |= 4560 cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); 4561 #endif 4562 } 4563 } 4564 4565 /* Handle clustersize */ 4566 clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size); 4567 if (ext4_has_feature_bigalloc(sb)) { 4568 if (clustersize < blocksize) { 4569 ext4_msg(sb, KERN_ERR, 4570 "cluster size (%d) smaller than " 4571 "block size (%d)", clustersize, blocksize); 4572 goto failed_mount; 4573 } 4574 sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) - 4575 le32_to_cpu(es->s_log_block_size); 4576 sbi->s_clusters_per_group = 4577 le32_to_cpu(es->s_clusters_per_group); 4578 if (sbi->s_clusters_per_group > blocksize * 8) { 4579 ext4_msg(sb, KERN_ERR, 4580 "#clusters per group too big: %lu", 4581 sbi->s_clusters_per_group); 4582 goto failed_mount; 4583 } 4584 if (sbi->s_blocks_per_group != 4585 (sbi->s_clusters_per_group * (clustersize / blocksize))) { 4586 ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and " 4587 "clusters per group (%lu) inconsistent", 4588 sbi->s_blocks_per_group, 4589 sbi->s_clusters_per_group); 4590 goto failed_mount; 4591 } 4592 } else { 4593 if (clustersize != blocksize) { 4594 
ext4_msg(sb, KERN_ERR, 4595 "fragment/cluster size (%d) != " 4596 "block size (%d)", clustersize, blocksize); 4597 goto failed_mount; 4598 } 4599 if (sbi->s_blocks_per_group > blocksize * 8) { 4600 ext4_msg(sb, KERN_ERR, 4601 "#blocks per group too big: %lu", 4602 sbi->s_blocks_per_group); 4603 goto failed_mount; 4604 } 4605 sbi->s_clusters_per_group = sbi->s_blocks_per_group; 4606 sbi->s_cluster_bits = 0; 4607 } 4608 sbi->s_cluster_ratio = clustersize / blocksize; 4609 4610 /* Do we have standard group size of clustersize * 8 blocks ? */ 4611 if (sbi->s_blocks_per_group == clustersize << 3) 4612 set_opt2(sb, STD_GROUP_SIZE); 4613 4614 /* 4615 * Test whether we have more sectors than will fit in sector_t, 4616 * and whether the max offset is addressable by the page cache. 4617 */ 4618 err = generic_check_addressable(sb->s_blocksize_bits, 4619 ext4_blocks_count(es)); 4620 if (err) { 4621 ext4_msg(sb, KERN_ERR, "filesystem" 4622 " too large to mount safely on this system"); 4623 goto failed_mount; 4624 } 4625 4626 if (EXT4_BLOCKS_PER_GROUP(sb) == 0) 4627 goto cantfind_ext4; 4628 4629 /* check blocks count against device size */ 4630 blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; 4631 if (blocks_count && ext4_blocks_count(es) > blocks_count) { 4632 ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu " 4633 "exceeds size of device (%llu blocks)", 4634 ext4_blocks_count(es), blocks_count); 4635 goto failed_mount; 4636 } 4637 4638 /* 4639 * It makes no sense for the first data block to be beyond the end 4640 * of the filesystem. 
4641 */ 4642 if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { 4643 ext4_msg(sb, KERN_WARNING, "bad geometry: first data " 4644 "block %u is beyond end of filesystem (%llu)", 4645 le32_to_cpu(es->s_first_data_block), 4646 ext4_blocks_count(es)); 4647 goto failed_mount; 4648 } 4649 if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) && 4650 (sbi->s_cluster_ratio == 1)) { 4651 ext4_msg(sb, KERN_WARNING, "bad geometry: first data " 4652 "block is 0 with a 1k block and cluster size"); 4653 goto failed_mount; 4654 } 4655 4656 blocks_count = (ext4_blocks_count(es) - 4657 le32_to_cpu(es->s_first_data_block) + 4658 EXT4_BLOCKS_PER_GROUP(sb) - 1); 4659 do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); 4660 if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { 4661 ext4_msg(sb, KERN_WARNING, "groups count too large: %llu " 4662 "(block count %llu, first data block %u, " 4663 "blocks per group %lu)", blocks_count, 4664 ext4_blocks_count(es), 4665 le32_to_cpu(es->s_first_data_block), 4666 EXT4_BLOCKS_PER_GROUP(sb)); 4667 goto failed_mount; 4668 } 4669 sbi->s_groups_count = blocks_count; 4670 sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, 4671 (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); 4672 if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) != 4673 le32_to_cpu(es->s_inodes_count)) { 4674 ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu", 4675 le32_to_cpu(es->s_inodes_count), 4676 ((u64)sbi->s_groups_count * sbi->s_inodes_per_group)); 4677 ret = -EINVAL; 4678 goto failed_mount; 4679 } 4680 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / 4681 EXT4_DESC_PER_BLOCK(sb); 4682 if (ext4_has_feature_meta_bg(sb)) { 4683 if (le32_to_cpu(es->s_first_meta_bg) > db_count) { 4684 ext4_msg(sb, KERN_WARNING, 4685 "first meta block group too large: %u " 4686 "(group descriptor block count %u)", 4687 le32_to_cpu(es->s_first_meta_bg), db_count); 4688 goto failed_mount; 4689 } 4690 } 4691 
rcu_assign_pointer(sbi->s_group_desc, 4692 kvmalloc_array(db_count, 4693 sizeof(struct buffer_head *), 4694 GFP_KERNEL)); 4695 if (sbi->s_group_desc == NULL) { 4696 ext4_msg(sb, KERN_ERR, "not enough memory"); 4697 ret = -ENOMEM; 4698 goto failed_mount; 4699 } 4700 4701 bgl_lock_init(sbi->s_blockgroup_lock); 4702 4703 /* Pre-read the descriptors into the buffer cache */ 4704 for (i = 0; i < db_count; i++) { 4705 block = descriptor_loc(sb, logical_sb_block, i); 4706 ext4_sb_breadahead_unmovable(sb, block); 4707 } 4708 4709 for (i = 0; i < db_count; i++) { 4710 struct buffer_head *bh; 4711 4712 block = descriptor_loc(sb, logical_sb_block, i); 4713 bh = ext4_sb_bread_unmovable(sb, block); 4714 if (IS_ERR(bh)) { 4715 ext4_msg(sb, KERN_ERR, 4716 "can't read group descriptor %d", i); 4717 db_count = i; 4718 ret = PTR_ERR(bh); 4719 goto failed_mount2; 4720 } 4721 rcu_read_lock(); 4722 rcu_dereference(sbi->s_group_desc)[i] = bh; 4723 rcu_read_unlock(); 4724 } 4725 sbi->s_gdb_count = db_count; 4726 if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) { 4727 ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); 4728 ret = -EFSCORRUPTED; 4729 goto failed_mount2; 4730 } 4731 4732 timer_setup(&sbi->s_err_report, print_daily_error_info, 0); 4733 spin_lock_init(&sbi->s_error_lock); 4734 INIT_WORK(&sbi->s_error_work, flush_stashed_error_work); 4735 4736 /* Register extent status tree shrinker */ 4737 if (ext4_es_register_shrinker(sbi)) 4738 goto failed_mount3; 4739 4740 sbi->s_stripe = ext4_get_stripe_size(sbi); 4741 sbi->s_extent_max_zeroout_kb = 32; 4742 4743 /* 4744 * set up enough so that it can read an inode 4745 */ 4746 sb->s_op = &ext4_sops; 4747 sb->s_export_op = &ext4_export_ops; 4748 sb->s_xattr = ext4_xattr_handlers; 4749 #ifdef CONFIG_FS_ENCRYPTION 4750 sb->s_cop = &ext4_cryptops; 4751 #endif 4752 #ifdef CONFIG_FS_VERITY 4753 sb->s_vop = &ext4_verityops; 4754 #endif 4755 #ifdef CONFIG_QUOTA 4756 sb->dq_op = &ext4_quota_operations; 4757 if 
(ext4_has_feature_quota(sb)) 4758 sb->s_qcop = &dquot_quotactl_sysfile_ops; 4759 else 4760 sb->s_qcop = &ext4_qctl_operations; 4761 sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; 4762 #endif 4763 memcpy(&sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); 4764 4765 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ 4766 mutex_init(&sbi->s_orphan_lock); 4767 4768 /* Initialize fast commit stuff */ 4769 atomic_set(&sbi->s_fc_subtid, 0); 4770 atomic_set(&sbi->s_fc_ineligible_updates, 0); 4771 INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]); 4772 INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]); 4773 INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]); 4774 INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]); 4775 sbi->s_fc_bytes = 0; 4776 ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); 4777 ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING); 4778 spin_lock_init(&sbi->s_fc_lock); 4779 memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); 4780 sbi->s_fc_replay_state.fc_regions = NULL; 4781 sbi->s_fc_replay_state.fc_regions_size = 0; 4782 sbi->s_fc_replay_state.fc_regions_used = 0; 4783 sbi->s_fc_replay_state.fc_regions_valid = 0; 4784 sbi->s_fc_replay_state.fc_modified_inodes = NULL; 4785 sbi->s_fc_replay_state.fc_modified_inodes_size = 0; 4786 sbi->s_fc_replay_state.fc_modified_inodes_used = 0; 4787 4788 sb->s_root = NULL; 4789 4790 needs_recovery = (es->s_last_orphan != 0 || 4791 ext4_has_feature_journal_needs_recovery(sb)); 4792 4793 if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) 4794 if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) 4795 goto failed_mount3a; 4796 4797 /* 4798 * The first inode we look at is the journal inode. Don't try 4799 * root first: it may be modified in the journal! 
4800 */ 4801 if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) { 4802 err = ext4_load_journal(sb, es, parsed_opts.journal_devnum); 4803 if (err) 4804 goto failed_mount3a; 4805 } else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) && 4806 ext4_has_feature_journal_needs_recovery(sb)) { 4807 ext4_msg(sb, KERN_ERR, "required journal recovery " 4808 "suppressed and not mounted read-only"); 4809 goto failed_mount_wq; 4810 } else { 4811 /* Nojournal mode, all journal mount options are illegal */ 4812 if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) { 4813 ext4_msg(sb, KERN_ERR, "can't mount with " 4814 "journal_checksum, fs mounted w/o journal"); 4815 goto failed_mount_wq; 4816 } 4817 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { 4818 ext4_msg(sb, KERN_ERR, "can't mount with " 4819 "journal_async_commit, fs mounted w/o journal"); 4820 goto failed_mount_wq; 4821 } 4822 if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { 4823 ext4_msg(sb, KERN_ERR, "can't mount with " 4824 "commit=%lu, fs mounted w/o journal", 4825 sbi->s_commit_interval / HZ); 4826 goto failed_mount_wq; 4827 } 4828 if (EXT4_MOUNT_DATA_FLAGS & 4829 (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) { 4830 ext4_msg(sb, KERN_ERR, "can't mount with " 4831 "data=, fs mounted w/o journal"); 4832 goto failed_mount_wq; 4833 } 4834 sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM; 4835 clear_opt(sb, JOURNAL_CHECKSUM); 4836 clear_opt(sb, DATA_FLAGS); 4837 clear_opt2(sb, JOURNAL_FAST_COMMIT); 4838 sbi->s_journal = NULL; 4839 needs_recovery = 0; 4840 goto no_journal; 4841 } 4842 4843 if (ext4_has_feature_64bit(sb) && 4844 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, 4845 JBD2_FEATURE_INCOMPAT_64BIT)) { 4846 ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); 4847 goto failed_mount_wq; 4848 } 4849 4850 if (!set_journal_csum_feature_set(sb)) { 4851 ext4_msg(sb, KERN_ERR, "Failed to set journal checksum " 4852 "feature set"); 4853 goto failed_mount_wq; 4854 } 4855 4856 if (test_opt2(sb, 
JOURNAL_FAST_COMMIT) && 4857 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, 4858 JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) { 4859 ext4_msg(sb, KERN_ERR, 4860 "Failed to set fast commit journal feature"); 4861 goto failed_mount_wq; 4862 } 4863 4864 /* We have now updated the journal if required, so we can 4865 * validate the data journaling mode. */ 4866 switch (test_opt(sb, DATA_FLAGS)) { 4867 case 0: 4868 /* No mode set, assume a default based on the journal 4869 * capabilities: ORDERED_DATA if the journal can 4870 * cope, else JOURNAL_DATA 4871 */ 4872 if (jbd2_journal_check_available_features 4873 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { 4874 set_opt(sb, ORDERED_DATA); 4875 sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA; 4876 } else { 4877 set_opt(sb, JOURNAL_DATA); 4878 sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA; 4879 } 4880 break; 4881 4882 case EXT4_MOUNT_ORDERED_DATA: 4883 case EXT4_MOUNT_WRITEBACK_DATA: 4884 if (!jbd2_journal_check_available_features 4885 (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { 4886 ext4_msg(sb, KERN_ERR, "Journal does not support " 4887 "requested data journaling mode"); 4888 goto failed_mount_wq; 4889 } 4890 break; 4891 default: 4892 break; 4893 } 4894 4895 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA && 4896 test_opt(sb, JOURNAL_ASYNC_COMMIT)) { 4897 ext4_msg(sb, KERN_ERR, "can't mount with " 4898 "journal_async_commit in data=ordered mode"); 4899 goto failed_mount_wq; 4900 } 4901 4902 set_task_ioprio(sbi->s_journal->j_task, parsed_opts.journal_ioprio); 4903 4904 sbi->s_journal->j_submit_inode_data_buffers = 4905 ext4_journal_submit_inode_data_buffers; 4906 sbi->s_journal->j_finish_inode_data_buffers = 4907 ext4_journal_finish_inode_data_buffers; 4908 4909 no_journal: 4910 if (!test_opt(sb, NO_MBCACHE)) { 4911 sbi->s_ea_block_cache = ext4_xattr_create_cache(); 4912 if (!sbi->s_ea_block_cache) { 4913 ext4_msg(sb, KERN_ERR, 4914 "Failed to create ea_block_cache"); 4915 goto 
failed_mount_wq; 4916 } 4917 4918 if (ext4_has_feature_ea_inode(sb)) { 4919 sbi->s_ea_inode_cache = ext4_xattr_create_cache(); 4920 if (!sbi->s_ea_inode_cache) { 4921 ext4_msg(sb, KERN_ERR, 4922 "Failed to create ea_inode_cache"); 4923 goto failed_mount_wq; 4924 } 4925 } 4926 } 4927 4928 if (ext4_has_feature_verity(sb) && blocksize != PAGE_SIZE) { 4929 ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity"); 4930 goto failed_mount_wq; 4931 } 4932 4933 if (DUMMY_ENCRYPTION_ENABLED(sbi) && !sb_rdonly(sb) && 4934 !ext4_has_feature_encrypt(sb)) { 4935 ext4_set_feature_encrypt(sb); 4936 ext4_commit_super(sb); 4937 } 4938 4939 /* 4940 * Get the # of file system overhead blocks from the 4941 * superblock if present. 4942 */ 4943 if (es->s_overhead_clusters) 4944 sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); 4945 else { 4946 err = ext4_calculate_overhead(sb); 4947 if (err) 4948 goto failed_mount_wq; 4949 } 4950 4951 /* 4952 * The maximum number of concurrent works can be high and 4953 * concurrency isn't really necessary. Limit it to 1. 4954 */ 4955 EXT4_SB(sb)->rsv_conversion_wq = 4956 alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); 4957 if (!EXT4_SB(sb)->rsv_conversion_wq) { 4958 printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); 4959 ret = -ENOMEM; 4960 goto failed_mount4; 4961 } 4962 4963 /* 4964 * The jbd2_journal_load will have done any necessary log recovery, 4965 * so we can safely mount the rest of the filesystem now. 
4966 */ 4967 4968 root = ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL); 4969 if (IS_ERR(root)) { 4970 ext4_msg(sb, KERN_ERR, "get root inode failed"); 4971 ret = PTR_ERR(root); 4972 root = NULL; 4973 goto failed_mount4; 4974 } 4975 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { 4976 ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); 4977 iput(root); 4978 goto failed_mount4; 4979 } 4980 4981 sb->s_root = d_make_root(root); 4982 if (!sb->s_root) { 4983 ext4_msg(sb, KERN_ERR, "get root dentry failed"); 4984 ret = -ENOMEM; 4985 goto failed_mount4; 4986 } 4987 4988 ret = ext4_setup_super(sb, es, sb_rdonly(sb)); 4989 if (ret == -EROFS) { 4990 sb->s_flags |= SB_RDONLY; 4991 ret = 0; 4992 } else if (ret) 4993 goto failed_mount4a; 4994 4995 ext4_set_resv_clusters(sb); 4996 4997 if (test_opt(sb, BLOCK_VALIDITY)) { 4998 err = ext4_setup_system_zone(sb); 4999 if (err) { 5000 ext4_msg(sb, KERN_ERR, "failed to initialize system " 5001 "zone (%d)", err); 5002 goto failed_mount4a; 5003 } 5004 } 5005 ext4_fc_replay_cleanup(sb); 5006 5007 ext4_ext_init(sb); 5008 5009 /* 5010 * Enable optimize_scan if number of groups is > threshold. This can be 5011 * turned off by passing "mb_optimize_scan=0". This can also be 5012 * turned on forcefully by passing "mb_optimize_scan=1". 
5013 */ 5014 if (parsed_opts.mb_optimize_scan == 1) 5015 set_opt2(sb, MB_OPTIMIZE_SCAN); 5016 else if (parsed_opts.mb_optimize_scan == 0) 5017 clear_opt2(sb, MB_OPTIMIZE_SCAN); 5018 else if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD) 5019 set_opt2(sb, MB_OPTIMIZE_SCAN); 5020 5021 err = ext4_mb_init(sb); 5022 if (err) { 5023 ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", 5024 err); 5025 goto failed_mount5; 5026 } 5027 5028 /* 5029 * We can only set up the journal commit callback once 5030 * mballoc is initialized 5031 */ 5032 if (sbi->s_journal) 5033 sbi->s_journal->j_commit_callback = 5034 ext4_journal_commit_callback; 5035 5036 block = ext4_count_free_clusters(sb); 5037 ext4_free_blocks_count_set(sbi->s_es, 5038 EXT4_C2B(sbi, block)); 5039 err = percpu_counter_init(&sbi->s_freeclusters_counter, block, 5040 GFP_KERNEL); 5041 if (!err) { 5042 unsigned long freei = ext4_count_free_inodes(sb); 5043 sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); 5044 err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, 5045 GFP_KERNEL); 5046 } 5047 if (!err) 5048 err = percpu_counter_init(&sbi->s_dirs_counter, 5049 ext4_count_dirs(sb), GFP_KERNEL); 5050 if (!err) 5051 err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, 5052 GFP_KERNEL); 5053 if (!err) 5054 err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0, 5055 GFP_KERNEL); 5056 if (!err) 5057 err = percpu_init_rwsem(&sbi->s_writepages_rwsem); 5058 5059 if (err) { 5060 ext4_msg(sb, KERN_ERR, "insufficient memory"); 5061 goto failed_mount6; 5062 } 5063 5064 if (ext4_has_feature_flex_bg(sb)) 5065 if (!ext4_fill_flex_info(sb)) { 5066 ext4_msg(sb, KERN_ERR, 5067 "unable to initialize " 5068 "flex_bg meta info!"); 5069 ret = -ENOMEM; 5070 goto failed_mount6; 5071 } 5072 5073 err = ext4_register_li_request(sb, first_not_zeroed); 5074 if (err) 5075 goto failed_mount6; 5076 5077 err = ext4_register_sysfs(sb); 5078 if (err) 5079 goto failed_mount7; 5080 5081 #ifdef CONFIG_QUOTA 5082 
/* Enable quota usage during mount. */ 5083 if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) { 5084 err = ext4_enable_quotas(sb); 5085 if (err) 5086 goto failed_mount8; 5087 } 5088 #endif /* CONFIG_QUOTA */ 5089 5090 /* 5091 * Save the original bdev mapping's wb_err value which could be 5092 * used to detect the metadata async write error. 5093 */ 5094 spin_lock_init(&sbi->s_bdev_wb_lock); 5095 errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err, 5096 &sbi->s_bdev_wb_err); 5097 sb->s_bdev->bd_super = sb; 5098 EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; 5099 ext4_orphan_cleanup(sb, es); 5100 EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; 5101 if (needs_recovery) { 5102 ext4_msg(sb, KERN_INFO, "recovery complete"); 5103 err = ext4_mark_recovery_complete(sb, es); 5104 if (err) 5105 goto failed_mount8; 5106 } 5107 if (EXT4_SB(sb)->s_journal) { 5108 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) 5109 descr = " journalled data mode"; 5110 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) 5111 descr = " ordered data mode"; 5112 else 5113 descr = " writeback data mode"; 5114 } else 5115 descr = "out journal"; 5116 5117 if (test_opt(sb, DISCARD)) { 5118 struct request_queue *q = bdev_get_queue(sb->s_bdev); 5119 if (!blk_queue_discard(q)) 5120 ext4_msg(sb, KERN_WARNING, 5121 "mounting with \"discard\" option, but " 5122 "the device does not support discard"); 5123 } 5124 5125 if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount")) 5126 ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " 5127 "Opts: %.*s%s%s. Quota mode: %s.", descr, 5128 (int) sizeof(sbi->s_es->s_mount_opts), 5129 sbi->s_es->s_mount_opts, 5130 *sbi->s_es->s_mount_opts ? "; " : "", orig_data, 5131 ext4_quota_mode(sb)); 5132 5133 if (es->s_error_count) 5134 mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ 5135 5136 /* Enable message ratelimiting. Default is 10 messages per 5 secs. 
*/ 5137 ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10); 5138 ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10); 5139 ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10); 5140 atomic_set(&sbi->s_warning_count, 0); 5141 atomic_set(&sbi->s_msg_count, 0); 5142 5143 kfree(orig_data); 5144 return 0; 5145 5146 cantfind_ext4: 5147 if (!silent) 5148 ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); 5149 goto failed_mount; 5150 5151 failed_mount8: 5152 ext4_unregister_sysfs(sb); 5153 kobject_put(&sbi->s_kobj); 5154 failed_mount7: 5155 ext4_unregister_li_request(sb); 5156 failed_mount6: 5157 ext4_mb_release(sb); 5158 rcu_read_lock(); 5159 flex_groups = rcu_dereference(sbi->s_flex_groups); 5160 if (flex_groups) { 5161 for (i = 0; i < sbi->s_flex_groups_allocated; i++) 5162 kvfree(flex_groups[i]); 5163 kvfree(flex_groups); 5164 } 5165 rcu_read_unlock(); 5166 percpu_counter_destroy(&sbi->s_freeclusters_counter); 5167 percpu_counter_destroy(&sbi->s_freeinodes_counter); 5168 percpu_counter_destroy(&sbi->s_dirs_counter); 5169 percpu_counter_destroy(&sbi->s_dirtyclusters_counter); 5170 percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit); 5171 percpu_free_rwsem(&sbi->s_writepages_rwsem); 5172 failed_mount5: 5173 ext4_ext_release(sb); 5174 ext4_release_system_zone(sb); 5175 failed_mount4a: 5176 dput(sb->s_root); 5177 sb->s_root = NULL; 5178 failed_mount4: 5179 ext4_msg(sb, KERN_ERR, "mount failed"); 5180 if (EXT4_SB(sb)->rsv_conversion_wq) 5181 destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); 5182 failed_mount_wq: 5183 ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); 5184 sbi->s_ea_inode_cache = NULL; 5185 5186 ext4_xattr_destroy_cache(sbi->s_ea_block_cache); 5187 sbi->s_ea_block_cache = NULL; 5188 5189 if (sbi->s_journal) { 5190 jbd2_journal_unregister_shrinker(sbi->s_journal); 5191 jbd2_journal_destroy(sbi->s_journal); 5192 sbi->s_journal = NULL; 5193 } 5194 failed_mount3a: 5195 ext4_es_unregister_shrinker(sbi); 5196 
failed_mount3: 5197 flush_work(&sbi->s_error_work); 5198 del_timer_sync(&sbi->s_err_report); 5199 ext4_stop_mmpd(sbi); 5200 failed_mount2: 5201 rcu_read_lock(); 5202 group_desc = rcu_dereference(sbi->s_group_desc); 5203 for (i = 0; i < db_count; i++) 5204 brelse(group_desc[i]); 5205 kvfree(group_desc); 5206 rcu_read_unlock(); 5207 failed_mount: 5208 if (sbi->s_chksum_driver) 5209 crypto_free_shash(sbi->s_chksum_driver); 5210 5211 #ifdef CONFIG_UNICODE 5212 utf8_unload(sb->s_encoding); 5213 #endif 5214 5215 #ifdef CONFIG_QUOTA 5216 for (i = 0; i < EXT4_MAXQUOTAS; i++) 5217 kfree(get_qf_name(sb, sbi, i)); 5218 #endif 5219 fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); 5220 /* ext4_blkdev_remove() calls kill_bdev(), release bh before it. */ 5221 brelse(bh); 5222 ext4_blkdev_remove(sbi); 5223 out_fail: 5224 sb->s_fs_info = NULL; 5225 kfree(sbi->s_blockgroup_lock); 5226 out_free_base: 5227 kfree(sbi); 5228 kfree(orig_data); 5229 fs_put_dax(dax_dev); 5230 return err ? err : ret; 5231 } 5232 5233 /* 5234 * Setup any per-fs journal parameters now. We'll do this both on 5235 * initial mount, once the journal has been initialised but before we've 5236 * done any recovery; and again on any subsequent remount. 
5237 */ 5238 static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) 5239 { 5240 struct ext4_sb_info *sbi = EXT4_SB(sb); 5241 5242 journal->j_commit_interval = sbi->s_commit_interval; 5243 journal->j_min_batch_time = sbi->s_min_batch_time; 5244 journal->j_max_batch_time = sbi->s_max_batch_time; 5245 ext4_fc_init(sb, journal); 5246 5247 write_lock(&journal->j_state_lock); 5248 if (test_opt(sb, BARRIER)) 5249 journal->j_flags |= JBD2_BARRIER; 5250 else 5251 journal->j_flags &= ~JBD2_BARRIER; 5252 if (test_opt(sb, DATA_ERR_ABORT)) 5253 journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; 5254 else 5255 journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; 5256 write_unlock(&journal->j_state_lock); 5257 } 5258 5259 static struct inode *ext4_get_journal_inode(struct super_block *sb, 5260 unsigned int journal_inum) 5261 { 5262 struct inode *journal_inode; 5263 5264 /* 5265 * Test for the existence of a valid inode on disk. Bad things 5266 * happen if we iget() an unused inode, as the subsequent iput() 5267 * will try to delete it. 
5268 */ 5269 journal_inode = ext4_iget(sb, journal_inum, EXT4_IGET_SPECIAL); 5270 if (IS_ERR(journal_inode)) { 5271 ext4_msg(sb, KERN_ERR, "no journal found"); 5272 return NULL; 5273 } 5274 if (!journal_inode->i_nlink) { 5275 make_bad_inode(journal_inode); 5276 iput(journal_inode); 5277 ext4_msg(sb, KERN_ERR, "journal inode is deleted"); 5278 return NULL; 5279 } 5280 5281 jbd_debug(2, "Journal inode found at %p: %lld bytes\n", 5282 journal_inode, journal_inode->i_size); 5283 if (!S_ISREG(journal_inode->i_mode)) { 5284 ext4_msg(sb, KERN_ERR, "invalid journal inode"); 5285 iput(journal_inode); 5286 return NULL; 5287 } 5288 return journal_inode; 5289 } 5290 5291 static journal_t *ext4_get_journal(struct super_block *sb, 5292 unsigned int journal_inum) 5293 { 5294 struct inode *journal_inode; 5295 journal_t *journal; 5296 5297 if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) 5298 return NULL; 5299 5300 journal_inode = ext4_get_journal_inode(sb, journal_inum); 5301 if (!journal_inode) 5302 return NULL; 5303 5304 journal = jbd2_journal_init_inode(journal_inode); 5305 if (!journal) { 5306 ext4_msg(sb, KERN_ERR, "Could not load journal inode"); 5307 iput(journal_inode); 5308 return NULL; 5309 } 5310 journal->j_private = sb; 5311 ext4_init_journal_params(sb, journal); 5312 return journal; 5313 } 5314 5315 static journal_t *ext4_get_dev_journal(struct super_block *sb, 5316 dev_t j_dev) 5317 { 5318 struct buffer_head *bh; 5319 journal_t *journal; 5320 ext4_fsblk_t start; 5321 ext4_fsblk_t len; 5322 int hblock, blocksize; 5323 ext4_fsblk_t sb_block; 5324 unsigned long offset; 5325 struct ext4_super_block *es; 5326 struct block_device *bdev; 5327 5328 if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) 5329 return NULL; 5330 5331 bdev = ext4_blkdev_get(j_dev, sb); 5332 if (bdev == NULL) 5333 return NULL; 5334 5335 blocksize = sb->s_blocksize; 5336 hblock = bdev_logical_block_size(bdev); 5337 if (blocksize < hblock) { 5338 ext4_msg(sb, KERN_ERR, 5339 "blocksize too small for 
journal device"); 5340 goto out_bdev; 5341 } 5342 5343 sb_block = EXT4_MIN_BLOCK_SIZE / blocksize; 5344 offset = EXT4_MIN_BLOCK_SIZE % blocksize; 5345 set_blocksize(bdev, blocksize); 5346 if (!(bh = __bread(bdev, sb_block, blocksize))) { 5347 ext4_msg(sb, KERN_ERR, "couldn't read superblock of " 5348 "external journal"); 5349 goto out_bdev; 5350 } 5351 5352 es = (struct ext4_super_block *) (bh->b_data + offset); 5353 if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) || 5354 !(le32_to_cpu(es->s_feature_incompat) & 5355 EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { 5356 ext4_msg(sb, KERN_ERR, "external journal has " 5357 "bad superblock"); 5358 brelse(bh); 5359 goto out_bdev; 5360 } 5361 5362 if ((le32_to_cpu(es->s_feature_ro_compat) & 5363 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && 5364 es->s_checksum != ext4_superblock_csum(sb, es)) { 5365 ext4_msg(sb, KERN_ERR, "external journal has " 5366 "corrupt superblock"); 5367 brelse(bh); 5368 goto out_bdev; 5369 } 5370 5371 if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { 5372 ext4_msg(sb, KERN_ERR, "journal UUID does not match"); 5373 brelse(bh); 5374 goto out_bdev; 5375 } 5376 5377 len = ext4_blocks_count(es); 5378 start = sb_block + 1; 5379 brelse(bh); /* we're done with the superblock */ 5380 5381 journal = jbd2_journal_init_dev(bdev, sb->s_bdev, 5382 start, len, blocksize); 5383 if (!journal) { 5384 ext4_msg(sb, KERN_ERR, "failed to create device journal"); 5385 goto out_bdev; 5386 } 5387 journal->j_private = sb; 5388 if (ext4_read_bh_lock(journal->j_sb_buffer, REQ_META | REQ_PRIO, true)) { 5389 ext4_msg(sb, KERN_ERR, "I/O error on journal device"); 5390 goto out_journal; 5391 } 5392 if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { 5393 ext4_msg(sb, KERN_ERR, "External journal has more than one " 5394 "user (unsupported) - %d", 5395 be32_to_cpu(journal->j_superblock->s_nr_users)); 5396 goto out_journal; 5397 } 5398 EXT4_SB(sb)->s_journal_bdev = bdev; 5399 ext4_init_journal_params(sb, journal); 5400 
return journal; 5401 5402 out_journal: 5403 jbd2_journal_destroy(journal); 5404 out_bdev: 5405 ext4_blkdev_put(bdev); 5406 return NULL; 5407 } 5408 5409 static int ext4_load_journal(struct super_block *sb, 5410 struct ext4_super_block *es, 5411 unsigned long journal_devnum) 5412 { 5413 journal_t *journal; 5414 unsigned int journal_inum = le32_to_cpu(es->s_journal_inum); 5415 dev_t journal_dev; 5416 int err = 0; 5417 int really_read_only; 5418 int journal_dev_ro; 5419 5420 if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) 5421 return -EFSCORRUPTED; 5422 5423 if (journal_devnum && 5424 journal_devnum != le32_to_cpu(es->s_journal_dev)) { 5425 ext4_msg(sb, KERN_INFO, "external journal device major/minor " 5426 "numbers have changed"); 5427 journal_dev = new_decode_dev(journal_devnum); 5428 } else 5429 journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); 5430 5431 if (journal_inum && journal_dev) { 5432 ext4_msg(sb, KERN_ERR, 5433 "filesystem has both journal inode and journal device!"); 5434 return -EINVAL; 5435 } 5436 5437 if (journal_inum) { 5438 journal = ext4_get_journal(sb, journal_inum); 5439 if (!journal) 5440 return -EINVAL; 5441 } else { 5442 journal = ext4_get_dev_journal(sb, journal_dev); 5443 if (!journal) 5444 return -EINVAL; 5445 } 5446 5447 journal_dev_ro = bdev_read_only(journal->j_dev); 5448 really_read_only = bdev_read_only(sb->s_bdev) | journal_dev_ro; 5449 5450 if (journal_dev_ro && !sb_rdonly(sb)) { 5451 ext4_msg(sb, KERN_ERR, 5452 "journal device read-only, try mounting with '-o ro'"); 5453 err = -EROFS; 5454 goto err_out; 5455 } 5456 5457 /* 5458 * Are we loading a blank journal or performing recovery after a 5459 * crash? For recovery, we need to check in advance whether we 5460 * can get read-write access to the device. 
5461 */ 5462 if (ext4_has_feature_journal_needs_recovery(sb)) { 5463 if (sb_rdonly(sb)) { 5464 ext4_msg(sb, KERN_INFO, "INFO: recovery " 5465 "required on readonly filesystem"); 5466 if (really_read_only) { 5467 ext4_msg(sb, KERN_ERR, "write access " 5468 "unavailable, cannot proceed " 5469 "(try mounting with noload)"); 5470 err = -EROFS; 5471 goto err_out; 5472 } 5473 ext4_msg(sb, KERN_INFO, "write access will " 5474 "be enabled during recovery"); 5475 } 5476 } 5477 5478 if (!(journal->j_flags & JBD2_BARRIER)) 5479 ext4_msg(sb, KERN_INFO, "barriers disabled"); 5480 5481 if (!ext4_has_feature_journal_needs_recovery(sb)) 5482 err = jbd2_journal_wipe(journal, !really_read_only); 5483 if (!err) { 5484 char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL); 5485 if (save) 5486 memcpy(save, ((char *) es) + 5487 EXT4_S_ERR_START, EXT4_S_ERR_LEN); 5488 err = jbd2_journal_load(journal); 5489 if (save) 5490 memcpy(((char *) es) + EXT4_S_ERR_START, 5491 save, EXT4_S_ERR_LEN); 5492 kfree(save); 5493 } 5494 5495 if (err) { 5496 ext4_msg(sb, KERN_ERR, "error loading journal"); 5497 goto err_out; 5498 } 5499 5500 EXT4_SB(sb)->s_journal = journal; 5501 err = ext4_clear_journal_err(sb, es); 5502 if (err) { 5503 EXT4_SB(sb)->s_journal = NULL; 5504 jbd2_journal_destroy(journal); 5505 return err; 5506 } 5507 5508 if (!really_read_only && journal_devnum && 5509 journal_devnum != le32_to_cpu(es->s_journal_dev)) { 5510 es->s_journal_dev = cpu_to_le32(journal_devnum); 5511 5512 /* Make sure we flush the recovery flag to disk. 
*/ 5513 ext4_commit_super(sb); 5514 } 5515 5516 err = jbd2_journal_register_shrinker(journal); 5517 if (err) { 5518 EXT4_SB(sb)->s_journal = NULL; 5519 goto err_out; 5520 } 5521 5522 return 0; 5523 5524 err_out: 5525 jbd2_journal_destroy(journal); 5526 return err; 5527 } 5528 5529 /* Copy state of EXT4_SB(sb) into buffer for on-disk superblock */ 5530 static void ext4_update_super(struct super_block *sb) 5531 { 5532 struct ext4_sb_info *sbi = EXT4_SB(sb); 5533 struct ext4_super_block *es = sbi->s_es; 5534 struct buffer_head *sbh = sbi->s_sbh; 5535 5536 lock_buffer(sbh); 5537 /* 5538 * If the file system is mounted read-only, don't update the 5539 * superblock write time. This avoids updating the superblock 5540 * write time when we are mounting the root file system 5541 * read/only but we need to replay the journal; at that point, 5542 * for people who are east of GMT and who make their clock 5543 * tick in localtime for Windows bug-for-bug compatibility, 5544 * the clock is set in the future, and this will cause e2fsck 5545 * to complain and force a full file system check. 
5546 */ 5547 if (!(sb->s_flags & SB_RDONLY)) 5548 ext4_update_tstamp(es, s_wtime); 5549 es->s_kbytes_written = 5550 cpu_to_le64(sbi->s_kbytes_written + 5551 ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - 5552 sbi->s_sectors_written_start) >> 1)); 5553 if (percpu_counter_initialized(&sbi->s_freeclusters_counter)) 5554 ext4_free_blocks_count_set(es, 5555 EXT4_C2B(sbi, percpu_counter_sum_positive( 5556 &sbi->s_freeclusters_counter))); 5557 if (percpu_counter_initialized(&sbi->s_freeinodes_counter)) 5558 es->s_free_inodes_count = 5559 cpu_to_le32(percpu_counter_sum_positive( 5560 &sbi->s_freeinodes_counter)); 5561 /* Copy error information to the on-disk superblock */ 5562 spin_lock(&sbi->s_error_lock); 5563 if (sbi->s_add_error_count > 0) { 5564 es->s_state |= cpu_to_le16(EXT4_ERROR_FS); 5565 if (!es->s_first_error_time && !es->s_first_error_time_hi) { 5566 __ext4_update_tstamp(&es->s_first_error_time, 5567 &es->s_first_error_time_hi, 5568 sbi->s_first_error_time); 5569 strncpy(es->s_first_error_func, sbi->s_first_error_func, 5570 sizeof(es->s_first_error_func)); 5571 es->s_first_error_line = 5572 cpu_to_le32(sbi->s_first_error_line); 5573 es->s_first_error_ino = 5574 cpu_to_le32(sbi->s_first_error_ino); 5575 es->s_first_error_block = 5576 cpu_to_le64(sbi->s_first_error_block); 5577 es->s_first_error_errcode = 5578 ext4_errno_to_code(sbi->s_first_error_code); 5579 } 5580 __ext4_update_tstamp(&es->s_last_error_time, 5581 &es->s_last_error_time_hi, 5582 sbi->s_last_error_time); 5583 strncpy(es->s_last_error_func, sbi->s_last_error_func, 5584 sizeof(es->s_last_error_func)); 5585 es->s_last_error_line = cpu_to_le32(sbi->s_last_error_line); 5586 es->s_last_error_ino = cpu_to_le32(sbi->s_last_error_ino); 5587 es->s_last_error_block = cpu_to_le64(sbi->s_last_error_block); 5588 es->s_last_error_errcode = 5589 ext4_errno_to_code(sbi->s_last_error_code); 5590 /* 5591 * Start the daily error reporting function if it hasn't been 5592 * started already 5593 */ 5594 if 
(!es->s_error_count) 5595 mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); 5596 le32_add_cpu(&es->s_error_count, sbi->s_add_error_count); 5597 sbi->s_add_error_count = 0; 5598 } 5599 spin_unlock(&sbi->s_error_lock); 5600 5601 ext4_superblock_csum_set(sb); 5602 unlock_buffer(sbh); 5603 } 5604 5605 static int ext4_commit_super(struct super_block *sb) 5606 { 5607 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; 5608 int error = 0; 5609 5610 if (!sbh) 5611 return -EINVAL; 5612 if (block_device_ejected(sb)) 5613 return -ENODEV; 5614 5615 ext4_update_super(sb); 5616 5617 if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) { 5618 /* 5619 * Oh, dear. A previous attempt to write the 5620 * superblock failed. This could happen because the 5621 * USB device was yanked out. Or it could happen to 5622 * be a transient write error and maybe the block will 5623 * be remapped. Nothing we can do but to retry the 5624 * write and hope for the best. 5625 */ 5626 ext4_msg(sb, KERN_ERR, "previous I/O error to " 5627 "superblock detected"); 5628 clear_buffer_write_io_error(sbh); 5629 set_buffer_uptodate(sbh); 5630 } 5631 BUFFER_TRACE(sbh, "marking dirty"); 5632 mark_buffer_dirty(sbh); 5633 error = __sync_dirty_buffer(sbh, 5634 REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0)); 5635 if (buffer_write_io_error(sbh)) { 5636 ext4_msg(sb, KERN_ERR, "I/O error while writing " 5637 "superblock"); 5638 clear_buffer_write_io_error(sbh); 5639 set_buffer_uptodate(sbh); 5640 } 5641 return error; 5642 } 5643 5644 /* 5645 * Have we just finished recovery? If so, and if we are mounting (or 5646 * remounting) the filesystem readonly, then we will end up with a 5647 * consistent fs on disk. Record that fact. 
5648 */ 5649 static int ext4_mark_recovery_complete(struct super_block *sb, 5650 struct ext4_super_block *es) 5651 { 5652 int err; 5653 journal_t *journal = EXT4_SB(sb)->s_journal; 5654 5655 if (!ext4_has_feature_journal(sb)) { 5656 if (journal != NULL) { 5657 ext4_error(sb, "Journal got removed while the fs was " 5658 "mounted!"); 5659 return -EFSCORRUPTED; 5660 } 5661 return 0; 5662 } 5663 jbd2_journal_lock_updates(journal); 5664 err = jbd2_journal_flush(journal, 0); 5665 if (err < 0) 5666 goto out; 5667 5668 if (ext4_has_feature_journal_needs_recovery(sb) && sb_rdonly(sb)) { 5669 ext4_clear_feature_journal_needs_recovery(sb); 5670 ext4_commit_super(sb); 5671 } 5672 out: 5673 jbd2_journal_unlock_updates(journal); 5674 return err; 5675 } 5676 5677 /* 5678 * If we are mounting (or read-write remounting) a filesystem whose journal 5679 * has recorded an error from a previous lifetime, move that error to the 5680 * main filesystem now. 5681 */ 5682 static int ext4_clear_journal_err(struct super_block *sb, 5683 struct ext4_super_block *es) 5684 { 5685 journal_t *journal; 5686 int j_errno; 5687 const char *errstr; 5688 5689 if (!ext4_has_feature_journal(sb)) { 5690 ext4_error(sb, "Journal got removed while the fs was mounted!"); 5691 return -EFSCORRUPTED; 5692 } 5693 5694 journal = EXT4_SB(sb)->s_journal; 5695 5696 /* 5697 * Now check for any error status which may have been recorded in the 5698 * journal by a prior ext4_error() or ext4_abort() 5699 */ 5700 5701 j_errno = jbd2_journal_errno(journal); 5702 if (j_errno) { 5703 char nbuf[16]; 5704 5705 errstr = ext4_decode_error(sb, j_errno, nbuf); 5706 ext4_warning(sb, "Filesystem error recorded " 5707 "from previous mount: %s", errstr); 5708 ext4_warning(sb, "Marking fs in need of filesystem check."); 5709 5710 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; 5711 es->s_state |= cpu_to_le16(EXT4_ERROR_FS); 5712 ext4_commit_super(sb); 5713 5714 jbd2_journal_clear_err(journal); 5715 jbd2_journal_update_sb_errno(journal); 5716 
} 5717 return 0; 5718 } 5719 5720 /* 5721 * Force the running and committing transactions to commit, 5722 * and wait on the commit. 5723 */ 5724 int ext4_force_commit(struct super_block *sb) 5725 { 5726 journal_t *journal; 5727 5728 if (sb_rdonly(sb)) 5729 return 0; 5730 5731 journal = EXT4_SB(sb)->s_journal; 5732 return ext4_journal_force_commit(journal); 5733 } 5734 5735 static int ext4_sync_fs(struct super_block *sb, int wait) 5736 { 5737 int ret = 0; 5738 tid_t target; 5739 bool needs_barrier = false; 5740 struct ext4_sb_info *sbi = EXT4_SB(sb); 5741 5742 if (unlikely(ext4_forced_shutdown(sbi))) 5743 return 0; 5744 5745 trace_ext4_sync_fs(sb, wait); 5746 flush_workqueue(sbi->rsv_conversion_wq); 5747 /* 5748 * Writeback quota in non-journalled quota case - journalled quota has 5749 * no dirty dquots 5750 */ 5751 dquot_writeback_dquots(sb, -1); 5752 /* 5753 * Data writeback is possible w/o journal transaction, so barrier must 5754 * being sent at the end of the function. But we can skip it if 5755 * transaction_commit will do it for us. 5756 */ 5757 if (sbi->s_journal) { 5758 target = jbd2_get_latest_transaction(sbi->s_journal); 5759 if (wait && sbi->s_journal->j_flags & JBD2_BARRIER && 5760 !jbd2_trans_will_send_data_barrier(sbi->s_journal, target)) 5761 needs_barrier = true; 5762 5763 if (jbd2_journal_start_commit(sbi->s_journal, &target)) { 5764 if (wait) 5765 ret = jbd2_log_wait_commit(sbi->s_journal, 5766 target); 5767 } 5768 } else if (wait && test_opt(sb, BARRIER)) 5769 needs_barrier = true; 5770 if (needs_barrier) { 5771 int err; 5772 err = blkdev_issue_flush(sb->s_bdev); 5773 if (!ret) 5774 ret = err; 5775 } 5776 5777 return ret; 5778 } 5779 5780 /* 5781 * LVM calls this function before a (read-only) snapshot is created. This 5782 * gives us a chance to flush the journal completely and mark the fs clean. 5783 * 5784 * Note that only this function cannot bring a filesystem to be in a clean 5785 * state independently. 
It relies on upper layer to stop all data & metadata 5786 * modifications. 5787 */ 5788 static int ext4_freeze(struct super_block *sb) 5789 { 5790 int error = 0; 5791 journal_t *journal; 5792 5793 if (sb_rdonly(sb)) 5794 return 0; 5795 5796 journal = EXT4_SB(sb)->s_journal; 5797 5798 if (journal) { 5799 /* Now we set up the journal barrier. */ 5800 jbd2_journal_lock_updates(journal); 5801 5802 /* 5803 * Don't clear the needs_recovery flag if we failed to 5804 * flush the journal. 5805 */ 5806 error = jbd2_journal_flush(journal, 0); 5807 if (error < 0) 5808 goto out; 5809 5810 /* Journal blocked and flushed, clear needs_recovery flag. */ 5811 ext4_clear_feature_journal_needs_recovery(sb); 5812 } 5813 5814 error = ext4_commit_super(sb); 5815 out: 5816 if (journal) 5817 /* we rely on upper layer to stop further updates */ 5818 jbd2_journal_unlock_updates(journal); 5819 return error; 5820 } 5821 5822 /* 5823 * Called by LVM after the snapshot is done. We need to reset the RECOVER 5824 * flag here, even though the filesystem is not technically dirty yet. 5825 */ 5826 static int ext4_unfreeze(struct super_block *sb) 5827 { 5828 if (sb_rdonly(sb) || ext4_forced_shutdown(EXT4_SB(sb))) 5829 return 0; 5830 5831 if (EXT4_SB(sb)->s_journal) { 5832 /* Reset the needs_recovery flag before the fs is unlocked. 
*/ 5833 ext4_set_feature_journal_needs_recovery(sb); 5834 } 5835 5836 ext4_commit_super(sb); 5837 return 0; 5838 } 5839 5840 /* 5841 * Structure to save mount options for ext4_remount's benefit 5842 */ 5843 struct ext4_mount_options { 5844 unsigned long s_mount_opt; 5845 unsigned long s_mount_opt2; 5846 kuid_t s_resuid; 5847 kgid_t s_resgid; 5848 unsigned long s_commit_interval; 5849 u32 s_min_batch_time, s_max_batch_time; 5850 #ifdef CONFIG_QUOTA 5851 int s_jquota_fmt; 5852 char *s_qf_names[EXT4_MAXQUOTAS]; 5853 #endif 5854 }; 5855 5856 static int ext4_remount(struct super_block *sb, int *flags, char *data) 5857 { 5858 struct ext4_super_block *es; 5859 struct ext4_sb_info *sbi = EXT4_SB(sb); 5860 unsigned long old_sb_flags, vfs_flags; 5861 struct ext4_mount_options old_opts; 5862 int enable_quota = 0; 5863 ext4_group_t g; 5864 int err = 0; 5865 #ifdef CONFIG_QUOTA 5866 int i, j; 5867 char *to_free[EXT4_MAXQUOTAS]; 5868 #endif 5869 char *orig_data = kstrdup(data, GFP_KERNEL); 5870 struct ext4_parsed_options parsed_opts; 5871 5872 parsed_opts.journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 5873 parsed_opts.journal_devnum = 0; 5874 5875 if (data && !orig_data) 5876 return -ENOMEM; 5877 5878 /* Store the original options */ 5879 old_sb_flags = sb->s_flags; 5880 old_opts.s_mount_opt = sbi->s_mount_opt; 5881 old_opts.s_mount_opt2 = sbi->s_mount_opt2; 5882 old_opts.s_resuid = sbi->s_resuid; 5883 old_opts.s_resgid = sbi->s_resgid; 5884 old_opts.s_commit_interval = sbi->s_commit_interval; 5885 old_opts.s_min_batch_time = sbi->s_min_batch_time; 5886 old_opts.s_max_batch_time = sbi->s_max_batch_time; 5887 #ifdef CONFIG_QUOTA 5888 old_opts.s_jquota_fmt = sbi->s_jquota_fmt; 5889 for (i = 0; i < EXT4_MAXQUOTAS; i++) 5890 if (sbi->s_qf_names[i]) { 5891 char *qf_name = get_qf_name(sb, sbi, i); 5892 5893 old_opts.s_qf_names[i] = kstrdup(qf_name, GFP_KERNEL); 5894 if (!old_opts.s_qf_names[i]) { 5895 for (j = 0; j < i; j++) 5896 kfree(old_opts.s_qf_names[j]); 5897 kfree(orig_data); 5898 
return -ENOMEM; 5899 } 5900 } else 5901 old_opts.s_qf_names[i] = NULL; 5902 #endif 5903 if (sbi->s_journal && sbi->s_journal->j_task->io_context) 5904 parsed_opts.journal_ioprio = 5905 sbi->s_journal->j_task->io_context->ioprio; 5906 5907 /* 5908 * Some options can be enabled by ext4 and/or by VFS mount flag 5909 * either way we need to make sure it matches in both *flags and 5910 * s_flags. Copy those selected flags from *flags to s_flags 5911 */ 5912 vfs_flags = SB_LAZYTIME | SB_I_VERSION; 5913 sb->s_flags = (sb->s_flags & ~vfs_flags) | (*flags & vfs_flags); 5914 5915 if (!parse_options(data, sb, &parsed_opts, 1)) { 5916 err = -EINVAL; 5917 goto restore_opts; 5918 } 5919 5920 if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^ 5921 test_opt(sb, JOURNAL_CHECKSUM)) { 5922 ext4_msg(sb, KERN_ERR, "changing journal_checksum " 5923 "during remount not supported; ignoring"); 5924 sbi->s_mount_opt ^= EXT4_MOUNT_JOURNAL_CHECKSUM; 5925 } 5926 5927 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { 5928 if (test_opt2(sb, EXPLICIT_DELALLOC)) { 5929 ext4_msg(sb, KERN_ERR, "can't mount with " 5930 "both data=journal and delalloc"); 5931 err = -EINVAL; 5932 goto restore_opts; 5933 } 5934 if (test_opt(sb, DIOREAD_NOLOCK)) { 5935 ext4_msg(sb, KERN_ERR, "can't mount with " 5936 "both data=journal and dioread_nolock"); 5937 err = -EINVAL; 5938 goto restore_opts; 5939 } 5940 } else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) { 5941 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { 5942 ext4_msg(sb, KERN_ERR, "can't mount with " 5943 "journal_async_commit in data=ordered mode"); 5944 err = -EINVAL; 5945 goto restore_opts; 5946 } 5947 } 5948 5949 if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_NO_MBCACHE) { 5950 ext4_msg(sb, KERN_ERR, "can't enable nombcache during remount"); 5951 err = -EINVAL; 5952 goto restore_opts; 5953 } 5954 5955 if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) 5956 ext4_abort(sb, EXT4_ERR_ESHUTDOWN, "Abort forced by user"); 5957 
5958 sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | 5959 (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0); 5960 5961 es = sbi->s_es; 5962 5963 if (sbi->s_journal) { 5964 ext4_init_journal_params(sb, sbi->s_journal); 5965 set_task_ioprio(sbi->s_journal->j_task, parsed_opts.journal_ioprio); 5966 } 5967 5968 /* Flush outstanding errors before changing fs state */ 5969 flush_work(&sbi->s_error_work); 5970 5971 if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) { 5972 if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) { 5973 err = -EROFS; 5974 goto restore_opts; 5975 } 5976 5977 if (*flags & SB_RDONLY) { 5978 err = sync_filesystem(sb); 5979 if (err < 0) 5980 goto restore_opts; 5981 err = dquot_suspend(sb, -1); 5982 if (err < 0) 5983 goto restore_opts; 5984 5985 /* 5986 * First of all, the unconditional stuff we have to do 5987 * to disable replay of the journal when we next remount 5988 */ 5989 sb->s_flags |= SB_RDONLY; 5990 5991 /* 5992 * OK, test if we are remounting a valid rw partition 5993 * readonly, and if so set the rdonly flag and then 5994 * mark the partition as valid again. 5995 */ 5996 if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) && 5997 (sbi->s_mount_state & EXT4_VALID_FS)) 5998 es->s_state = cpu_to_le16(sbi->s_mount_state); 5999 6000 if (sbi->s_journal) { 6001 /* 6002 * We let remount-ro finish even if marking fs 6003 * as clean failed... 6004 */ 6005 ext4_mark_recovery_complete(sb, es); 6006 } 6007 ext4_stop_mmpd(sbi); 6008 } else { 6009 /* Make sure we can mount this feature set readwrite */ 6010 if (ext4_has_feature_readonly(sb) || 6011 !ext4_feature_set_ok(sb, 0)) { 6012 err = -EROFS; 6013 goto restore_opts; 6014 } 6015 /* 6016 * Make sure the group descriptor checksums 6017 * are sane. If they aren't, refuse to remount r/w. 
6018 */ 6019 for (g = 0; g < sbi->s_groups_count; g++) { 6020 struct ext4_group_desc *gdp = 6021 ext4_get_group_desc(sb, g, NULL); 6022 6023 if (!ext4_group_desc_csum_verify(sb, g, gdp)) { 6024 ext4_msg(sb, KERN_ERR, 6025 "ext4_remount: Checksum for group %u failed (%u!=%u)", 6026 g, le16_to_cpu(ext4_group_desc_csum(sb, g, gdp)), 6027 le16_to_cpu(gdp->bg_checksum)); 6028 err = -EFSBADCRC; 6029 goto restore_opts; 6030 } 6031 } 6032 6033 /* 6034 * If we have an unprocessed orphan list hanging 6035 * around from a previously readonly bdev mount, 6036 * require a full umount/remount for now. 6037 */ 6038 if (es->s_last_orphan) { 6039 ext4_msg(sb, KERN_WARNING, "Couldn't " 6040 "remount RDWR because of unprocessed " 6041 "orphan inode list. Please " 6042 "umount/remount instead"); 6043 err = -EINVAL; 6044 goto restore_opts; 6045 } 6046 6047 /* 6048 * Mounting a RDONLY partition read-write, so reread 6049 * and store the current valid flag. (It may have 6050 * been changed by e2fsck since we originally mounted 6051 * the partition.) 6052 */ 6053 if (sbi->s_journal) { 6054 err = ext4_clear_journal_err(sb, es); 6055 if (err) 6056 goto restore_opts; 6057 } 6058 sbi->s_mount_state = le16_to_cpu(es->s_state); 6059 6060 err = ext4_setup_super(sb, es, 0); 6061 if (err) 6062 goto restore_opts; 6063 6064 sb->s_flags &= ~SB_RDONLY; 6065 if (ext4_has_feature_mmp(sb)) 6066 if (ext4_multi_mount_protect(sb, 6067 le64_to_cpu(es->s_mmp_block))) { 6068 err = -EROFS; 6069 goto restore_opts; 6070 } 6071 enable_quota = 1; 6072 } 6073 } 6074 6075 /* 6076 * Reinitialize lazy itable initialization thread based on 6077 * current settings 6078 */ 6079 if (sb_rdonly(sb) || !test_opt(sb, INIT_INODE_TABLE)) 6080 ext4_unregister_li_request(sb); 6081 else { 6082 ext4_group_t first_not_zeroed; 6083 first_not_zeroed = ext4_has_uninit_itable(sb); 6084 ext4_register_li_request(sb, first_not_zeroed); 6085 } 6086 6087 /* 6088 * Handle creation of system zone data early because it can fail. 
6089 * Releasing of existing data is done when we are sure remount will 6090 * succeed. 6091 */ 6092 if (test_opt(sb, BLOCK_VALIDITY) && !sbi->s_system_blks) { 6093 err = ext4_setup_system_zone(sb); 6094 if (err) 6095 goto restore_opts; 6096 } 6097 6098 if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) { 6099 err = ext4_commit_super(sb); 6100 if (err) 6101 goto restore_opts; 6102 } 6103 6104 #ifdef CONFIG_QUOTA 6105 /* Release old quota file names */ 6106 for (i = 0; i < EXT4_MAXQUOTAS; i++) 6107 kfree(old_opts.s_qf_names[i]); 6108 if (enable_quota) { 6109 if (sb_any_quota_suspended(sb)) 6110 dquot_resume(sb, -1); 6111 else if (ext4_has_feature_quota(sb)) { 6112 err = ext4_enable_quotas(sb); 6113 if (err) 6114 goto restore_opts; 6115 } 6116 } 6117 #endif 6118 if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks) 6119 ext4_release_system_zone(sb); 6120 6121 /* 6122 * Some options can be enabled by ext4 and/or by VFS mount flag 6123 * either way we need to make sure it matches in both *flags and 6124 * s_flags. Copy those selected flags from s_flags to *flags 6125 */ 6126 *flags = (*flags & ~vfs_flags) | (sb->s_flags & vfs_flags); 6127 6128 ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s. 
Quota mode: %s.", 6129 orig_data, ext4_quota_mode(sb)); 6130 kfree(orig_data); 6131 return 0; 6132 6133 restore_opts: 6134 sb->s_flags = old_sb_flags; 6135 sbi->s_mount_opt = old_opts.s_mount_opt; 6136 sbi->s_mount_opt2 = old_opts.s_mount_opt2; 6137 sbi->s_resuid = old_opts.s_resuid; 6138 sbi->s_resgid = old_opts.s_resgid; 6139 sbi->s_commit_interval = old_opts.s_commit_interval; 6140 sbi->s_min_batch_time = old_opts.s_min_batch_time; 6141 sbi->s_max_batch_time = old_opts.s_max_batch_time; 6142 if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks) 6143 ext4_release_system_zone(sb); 6144 #ifdef CONFIG_QUOTA 6145 sbi->s_jquota_fmt = old_opts.s_jquota_fmt; 6146 for (i = 0; i < EXT4_MAXQUOTAS; i++) { 6147 to_free[i] = get_qf_name(sb, sbi, i); 6148 rcu_assign_pointer(sbi->s_qf_names[i], old_opts.s_qf_names[i]); 6149 } 6150 synchronize_rcu(); 6151 for (i = 0; i < EXT4_MAXQUOTAS; i++) 6152 kfree(to_free[i]); 6153 #endif 6154 kfree(orig_data); 6155 return err; 6156 } 6157 6158 #ifdef CONFIG_QUOTA 6159 static int ext4_statfs_project(struct super_block *sb, 6160 kprojid_t projid, struct kstatfs *buf) 6161 { 6162 struct kqid qid; 6163 struct dquot *dquot; 6164 u64 limit; 6165 u64 curblock; 6166 6167 qid = make_kqid_projid(projid); 6168 dquot = dqget(sb, qid); 6169 if (IS_ERR(dquot)) 6170 return PTR_ERR(dquot); 6171 spin_lock(&dquot->dq_dqb_lock); 6172 6173 limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit, 6174 dquot->dq_dqb.dqb_bhardlimit); 6175 limit >>= sb->s_blocksize_bits; 6176 6177 if (limit && buf->f_blocks > limit) { 6178 curblock = (dquot->dq_dqb.dqb_curspace + 6179 dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits; 6180 buf->f_blocks = limit; 6181 buf->f_bfree = buf->f_bavail = 6182 (buf->f_blocks > curblock) ? 
6183 (buf->f_blocks - curblock) : 0; 6184 } 6185 6186 limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit, 6187 dquot->dq_dqb.dqb_ihardlimit); 6188 if (limit && buf->f_files > limit) { 6189 buf->f_files = limit; 6190 buf->f_ffree = 6191 (buf->f_files > dquot->dq_dqb.dqb_curinodes) ? 6192 (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; 6193 } 6194 6195 spin_unlock(&dquot->dq_dqb_lock); 6196 dqput(dquot); 6197 return 0; 6198 } 6199 #endif 6200 6201 static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) 6202 { 6203 struct super_block *sb = dentry->d_sb; 6204 struct ext4_sb_info *sbi = EXT4_SB(sb); 6205 struct ext4_super_block *es = sbi->s_es; 6206 ext4_fsblk_t overhead = 0, resv_blocks; 6207 s64 bfree; 6208 resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters)); 6209 6210 if (!test_opt(sb, MINIX_DF)) 6211 overhead = sbi->s_overhead; 6212 6213 buf->f_type = EXT4_SUPER_MAGIC; 6214 buf->f_bsize = sb->s_blocksize; 6215 buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead); 6216 bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) - 6217 percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); 6218 /* prevent underflow in case that few free space is available */ 6219 buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0)); 6220 buf->f_bavail = buf->f_bfree - 6221 (ext4_r_blocks_count(es) + resv_blocks); 6222 if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks)) 6223 buf->f_bavail = 0; 6224 buf->f_files = le32_to_cpu(es->s_inodes_count); 6225 buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); 6226 buf->f_namelen = EXT4_NAME_LEN; 6227 buf->f_fsid = uuid_to_fsid(es->s_uuid); 6228 6229 #ifdef CONFIG_QUOTA 6230 if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) && 6231 sb_has_quota_limits_enabled(sb, PRJQUOTA)) 6232 ext4_statfs_project(sb, EXT4_I(dentry->d_inode)->i_projid, buf); 6233 #endif 6234 return 0; 6235 } 6236 6237 6238 #ifdef CONFIG_QUOTA 6239 6240 /* 6241 * Helper functions so that 
transaction is started before we acquire dqio_sem 6242 * to keep correct lock ordering of transaction > dqio_sem 6243 */ 6244 static inline struct inode *dquot_to_inode(struct dquot *dquot) 6245 { 6246 return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type]; 6247 } 6248 6249 static int ext4_write_dquot(struct dquot *dquot) 6250 { 6251 int ret, err; 6252 handle_t *handle; 6253 struct inode *inode; 6254 6255 inode = dquot_to_inode(dquot); 6256 handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 6257 EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); 6258 if (IS_ERR(handle)) 6259 return PTR_ERR(handle); 6260 ret = dquot_commit(dquot); 6261 err = ext4_journal_stop(handle); 6262 if (!ret) 6263 ret = err; 6264 return ret; 6265 } 6266 6267 static int ext4_acquire_dquot(struct dquot *dquot) 6268 { 6269 int ret, err; 6270 handle_t *handle; 6271 6272 handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, 6273 EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb)); 6274 if (IS_ERR(handle)) 6275 return PTR_ERR(handle); 6276 ret = dquot_acquire(dquot); 6277 err = ext4_journal_stop(handle); 6278 if (!ret) 6279 ret = err; 6280 return ret; 6281 } 6282 6283 static int ext4_release_dquot(struct dquot *dquot) 6284 { 6285 int ret, err; 6286 handle_t *handle; 6287 6288 handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, 6289 EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); 6290 if (IS_ERR(handle)) { 6291 /* Release dquot anyway to avoid endless cycle in dqput() */ 6292 dquot_release(dquot); 6293 return PTR_ERR(handle); 6294 } 6295 ret = dquot_release(dquot); 6296 err = ext4_journal_stop(handle); 6297 if (!ret) 6298 ret = err; 6299 return ret; 6300 } 6301 6302 static int ext4_mark_dquot_dirty(struct dquot *dquot) 6303 { 6304 struct super_block *sb = dquot->dq_sb; 6305 6306 if (ext4_is_quota_journalled(sb)) { 6307 dquot_mark_dquot_dirty(dquot); 6308 return ext4_write_dquot(dquot); 6309 } else { 6310 return dquot_mark_dquot_dirty(dquot); 6311 } 6312 } 6313 6314 static int ext4_write_info(struct 
super_block *sb, int type) 6315 { 6316 int ret, err; 6317 handle_t *handle; 6318 6319 /* Data block + inode block */ 6320 handle = ext4_journal_start(d_inode(sb->s_root), EXT4_HT_QUOTA, 2); 6321 if (IS_ERR(handle)) 6322 return PTR_ERR(handle); 6323 ret = dquot_commit_info(sb, type); 6324 err = ext4_journal_stop(handle); 6325 if (!ret) 6326 ret = err; 6327 return ret; 6328 } 6329 6330 /* 6331 * Turn on quotas during mount time - we need to find 6332 * the quota file and such... 6333 */ 6334 static int ext4_quota_on_mount(struct super_block *sb, int type) 6335 { 6336 return dquot_quota_on_mount(sb, get_qf_name(sb, EXT4_SB(sb), type), 6337 EXT4_SB(sb)->s_jquota_fmt, type); 6338 } 6339 6340 static void lockdep_set_quota_inode(struct inode *inode, int subclass) 6341 { 6342 struct ext4_inode_info *ei = EXT4_I(inode); 6343 6344 /* The first argument of lockdep_set_subclass has to be 6345 * *exactly* the same as the argument to init_rwsem() --- in 6346 * this case, in init_once() --- or lockdep gets unhappy 6347 * because the name of the lock is set using the 6348 * stringification of the argument to init_rwsem(). 6349 */ 6350 (void) ei; /* shut up clang warning if !CONFIG_LOCKDEP */ 6351 lockdep_set_subclass(&ei->i_data_sem, subclass); 6352 } 6353 6354 /* 6355 * Standard function to be called on quota_on 6356 */ 6357 static int ext4_quota_on(struct super_block *sb, int type, int format_id, 6358 const struct path *path) 6359 { 6360 int err; 6361 6362 if (!test_opt(sb, QUOTA)) 6363 return -EINVAL; 6364 6365 /* Quotafile not on the same filesystem? */ 6366 if (path->dentry->d_sb != sb) 6367 return -EXDEV; 6368 6369 /* Quota already enabled for this file? */ 6370 if (IS_NOQUOTA(d_inode(path->dentry))) 6371 return -EBUSY; 6372 6373 /* Journaling quota? */ 6374 if (EXT4_SB(sb)->s_qf_names[type]) { 6375 /* Quotafile not in fs root? */ 6376 if (path->dentry->d_parent != sb->s_root) 6377 ext4_msg(sb, KERN_WARNING, 6378 "Quota file not on filesystem root. 
" 6379 "Journaled quota will not work"); 6380 sb_dqopt(sb)->flags |= DQUOT_NOLIST_DIRTY; 6381 } else { 6382 /* 6383 * Clear the flag just in case mount options changed since 6384 * last time. 6385 */ 6386 sb_dqopt(sb)->flags &= ~DQUOT_NOLIST_DIRTY; 6387 } 6388 6389 /* 6390 * When we journal data on quota file, we have to flush journal to see 6391 * all updates to the file when we bypass pagecache... 6392 */ 6393 if (EXT4_SB(sb)->s_journal && 6394 ext4_should_journal_data(d_inode(path->dentry))) { 6395 /* 6396 * We don't need to lock updates but journal_flush() could 6397 * otherwise be livelocked... 6398 */ 6399 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 6400 err = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); 6401 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 6402 if (err) 6403 return err; 6404 } 6405 6406 lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA); 6407 err = dquot_quota_on(sb, type, format_id, path); 6408 if (err) { 6409 lockdep_set_quota_inode(path->dentry->d_inode, 6410 I_DATA_SEM_NORMAL); 6411 } else { 6412 struct inode *inode = d_inode(path->dentry); 6413 handle_t *handle; 6414 6415 /* 6416 * Set inode flags to prevent userspace from messing with quota 6417 * files. If this fails, we return success anyway since quotas 6418 * are already enabled and this is not a hard failure. 
6419 */ 6420 inode_lock(inode); 6421 handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); 6422 if (IS_ERR(handle)) 6423 goto unlock_inode; 6424 EXT4_I(inode)->i_flags |= EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL; 6425 inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, 6426 S_NOATIME | S_IMMUTABLE); 6427 err = ext4_mark_inode_dirty(handle, inode); 6428 ext4_journal_stop(handle); 6429 unlock_inode: 6430 inode_unlock(inode); 6431 } 6432 return err; 6433 } 6434 6435 static int ext4_quota_enable(struct super_block *sb, int type, int format_id, 6436 unsigned int flags) 6437 { 6438 int err; 6439 struct inode *qf_inode; 6440 unsigned long qf_inums[EXT4_MAXQUOTAS] = { 6441 le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), 6442 le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum), 6443 le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum) 6444 }; 6445 6446 BUG_ON(!ext4_has_feature_quota(sb)); 6447 6448 if (!qf_inums[type]) 6449 return -EPERM; 6450 6451 qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL); 6452 if (IS_ERR(qf_inode)) { 6453 ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]); 6454 return PTR_ERR(qf_inode); 6455 } 6456 6457 /* Don't account quota for quota files to avoid recursion */ 6458 qf_inode->i_flags |= S_NOQUOTA; 6459 lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA); 6460 err = dquot_load_quota_inode(qf_inode, type, format_id, flags); 6461 if (err) 6462 lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL); 6463 iput(qf_inode); 6464 6465 return err; 6466 } 6467 6468 /* Enable usage tracking for all quota types. 
*/ 6469 static int ext4_enable_quotas(struct super_block *sb) 6470 { 6471 int type, err = 0; 6472 unsigned long qf_inums[EXT4_MAXQUOTAS] = { 6473 le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), 6474 le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum), 6475 le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum) 6476 }; 6477 bool quota_mopt[EXT4_MAXQUOTAS] = { 6478 test_opt(sb, USRQUOTA), 6479 test_opt(sb, GRPQUOTA), 6480 test_opt(sb, PRJQUOTA), 6481 }; 6482 6483 sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY; 6484 for (type = 0; type < EXT4_MAXQUOTAS; type++) { 6485 if (qf_inums[type]) { 6486 err = ext4_quota_enable(sb, type, QFMT_VFS_V1, 6487 DQUOT_USAGE_ENABLED | 6488 (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0)); 6489 if (err) { 6490 ext4_warning(sb, 6491 "Failed to enable quota tracking " 6492 "(type=%d, err=%d). Please run " 6493 "e2fsck to fix.", type, err); 6494 for (type--; type >= 0; type--) 6495 dquot_quota_off(sb, type); 6496 6497 return err; 6498 } 6499 } 6500 } 6501 return 0; 6502 } 6503 6504 static int ext4_quota_off(struct super_block *sb, int type) 6505 { 6506 struct inode *inode = sb_dqopt(sb)->files[type]; 6507 handle_t *handle; 6508 int err; 6509 6510 /* Force all delayed allocation blocks to be allocated. 6511 * Caller already holds s_umount sem */ 6512 if (test_opt(sb, DELALLOC)) 6513 sync_filesystem(sb); 6514 6515 if (!inode || !igrab(inode)) 6516 goto out; 6517 6518 err = dquot_quota_off(sb, type); 6519 if (err || ext4_has_feature_quota(sb)) 6520 goto out_put; 6521 6522 inode_lock(inode); 6523 /* 6524 * Update modification times of quota files when userspace can 6525 * start looking at them. If we fail, we return success anyway since 6526 * this is not a hard failure and quotas are already disabled. 
6527 */ 6528 handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); 6529 if (IS_ERR(handle)) { 6530 err = PTR_ERR(handle); 6531 goto out_unlock; 6532 } 6533 EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL); 6534 inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); 6535 inode->i_mtime = inode->i_ctime = current_time(inode); 6536 err = ext4_mark_inode_dirty(handle, inode); 6537 ext4_journal_stop(handle); 6538 out_unlock: 6539 inode_unlock(inode); 6540 out_put: 6541 lockdep_set_quota_inode(inode, I_DATA_SEM_NORMAL); 6542 iput(inode); 6543 return err; 6544 out: 6545 return dquot_quota_off(sb, type); 6546 } 6547 6548 /* Read data from quotafile - avoid pagecache and such because we cannot afford 6549 * acquiring the locks... As quota files are never truncated and quota code 6550 * itself serializes the operations (and no one else should touch the files) 6551 * we don't have to be afraid of races */ 6552 static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, 6553 size_t len, loff_t off) 6554 { 6555 struct inode *inode = sb_dqopt(sb)->files[type]; 6556 ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); 6557 int offset = off & (sb->s_blocksize - 1); 6558 int tocopy; 6559 size_t toread; 6560 struct buffer_head *bh; 6561 loff_t i_size = i_size_read(inode); 6562 6563 if (off > i_size) 6564 return 0; 6565 if (off+len > i_size) 6566 len = i_size-off; 6567 toread = len; 6568 while (toread > 0) { 6569 tocopy = sb->s_blocksize - offset < toread ? 6570 sb->s_blocksize - offset : toread; 6571 bh = ext4_bread(NULL, inode, blk, 0); 6572 if (IS_ERR(bh)) 6573 return PTR_ERR(bh); 6574 if (!bh) /* A hole? 
*/ 6575 memset(data, 0, tocopy); 6576 else 6577 memcpy(data, bh->b_data+offset, tocopy); 6578 brelse(bh); 6579 offset = 0; 6580 toread -= tocopy; 6581 data += tocopy; 6582 blk++; 6583 } 6584 return len; 6585 } 6586 6587 /* Write to quotafile (we know the transaction is already started and has 6588 * enough credits) */ 6589 static ssize_t ext4_quota_write(struct super_block *sb, int type, 6590 const char *data, size_t len, loff_t off) 6591 { 6592 struct inode *inode = sb_dqopt(sb)->files[type]; 6593 ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); 6594 int err = 0, err2 = 0, offset = off & (sb->s_blocksize - 1); 6595 int retries = 0; 6596 struct buffer_head *bh; 6597 handle_t *handle = journal_current_handle(); 6598 6599 if (EXT4_SB(sb)->s_journal && !handle) { 6600 ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" 6601 " cancelled because transaction is not started", 6602 (unsigned long long)off, (unsigned long long)len); 6603 return -EIO; 6604 } 6605 /* 6606 * Since we account only one data block in transaction credits, 6607 * then it is impossible to cross a block boundary. 
6608 */ 6609 if (sb->s_blocksize - offset < len) { 6610 ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" 6611 " cancelled because not block aligned", 6612 (unsigned long long)off, (unsigned long long)len); 6613 return -EIO; 6614 } 6615 6616 do { 6617 bh = ext4_bread(handle, inode, blk, 6618 EXT4_GET_BLOCKS_CREATE | 6619 EXT4_GET_BLOCKS_METADATA_NOFAIL); 6620 } while (PTR_ERR(bh) == -ENOSPC && 6621 ext4_should_retry_alloc(inode->i_sb, &retries)); 6622 if (IS_ERR(bh)) 6623 return PTR_ERR(bh); 6624 if (!bh) 6625 goto out; 6626 BUFFER_TRACE(bh, "get write access"); 6627 err = ext4_journal_get_write_access(handle, bh); 6628 if (err) { 6629 brelse(bh); 6630 return err; 6631 } 6632 lock_buffer(bh); 6633 memcpy(bh->b_data+offset, data, len); 6634 flush_dcache_page(bh->b_page); 6635 unlock_buffer(bh); 6636 err = ext4_handle_dirty_metadata(handle, NULL, bh); 6637 brelse(bh); 6638 out: 6639 if (inode->i_size < off + len) { 6640 i_size_write(inode, off + len); 6641 EXT4_I(inode)->i_disksize = inode->i_size; 6642 err2 = ext4_mark_inode_dirty(handle, inode); 6643 if (unlikely(err2 && !err)) 6644 err = err2; 6645 } 6646 return err ? 
err : len; 6647 } 6648 #endif 6649 6650 static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, 6651 const char *dev_name, void *data) 6652 { 6653 return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super); 6654 } 6655 6656 #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2) 6657 static inline void register_as_ext2(void) 6658 { 6659 int err = register_filesystem(&ext2_fs_type); 6660 if (err) 6661 printk(KERN_WARNING 6662 "EXT4-fs: Unable to register as ext2 (%d)\n", err); 6663 } 6664 6665 static inline void unregister_as_ext2(void) 6666 { 6667 unregister_filesystem(&ext2_fs_type); 6668 } 6669 6670 static inline int ext2_feature_set_ok(struct super_block *sb) 6671 { 6672 if (ext4_has_unknown_ext2_incompat_features(sb)) 6673 return 0; 6674 if (sb_rdonly(sb)) 6675 return 1; 6676 if (ext4_has_unknown_ext2_ro_compat_features(sb)) 6677 return 0; 6678 return 1; 6679 } 6680 #else 6681 static inline void register_as_ext2(void) { } 6682 static inline void unregister_as_ext2(void) { } 6683 static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; } 6684 #endif 6685 6686 static inline void register_as_ext3(void) 6687 { 6688 int err = register_filesystem(&ext3_fs_type); 6689 if (err) 6690 printk(KERN_WARNING 6691 "EXT4-fs: Unable to register as ext3 (%d)\n", err); 6692 } 6693 6694 static inline void unregister_as_ext3(void) 6695 { 6696 unregister_filesystem(&ext3_fs_type); 6697 } 6698 6699 static inline int ext3_feature_set_ok(struct super_block *sb) 6700 { 6701 if (ext4_has_unknown_ext3_incompat_features(sb)) 6702 return 0; 6703 if (!ext4_has_feature_journal(sb)) 6704 return 0; 6705 if (sb_rdonly(sb)) 6706 return 1; 6707 if (ext4_has_unknown_ext3_ro_compat_features(sb)) 6708 return 0; 6709 return 1; 6710 } 6711 6712 static struct file_system_type ext4_fs_type = { 6713 .owner = THIS_MODULE, 6714 .name = "ext4", 6715 .mount = ext4_mount, 6716 .kill_sb = kill_block_super, 6717 .fs_flags 
= FS_REQUIRES_DEV | FS_ALLOW_IDMAP, 6718 }; 6719 MODULE_ALIAS_FS("ext4"); 6720 6721 /* Shared across all ext4 file systems */ 6722 wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; 6723 6724 static int __init ext4_init_fs(void) 6725 { 6726 int i, err; 6727 6728 ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64); 6729 ext4_li_info = NULL; 6730 6731 /* Build-time check for flags consistency */ 6732 ext4_check_flag_values(); 6733 6734 for (i = 0; i < EXT4_WQ_HASH_SZ; i++) 6735 init_waitqueue_head(&ext4__ioend_wq[i]); 6736 6737 err = ext4_init_es(); 6738 if (err) 6739 return err; 6740 6741 err = ext4_init_pending(); 6742 if (err) 6743 goto out7; 6744 6745 err = ext4_init_post_read_processing(); 6746 if (err) 6747 goto out6; 6748 6749 err = ext4_init_pageio(); 6750 if (err) 6751 goto out5; 6752 6753 err = ext4_init_system_zone(); 6754 if (err) 6755 goto out4; 6756 6757 err = ext4_init_sysfs(); 6758 if (err) 6759 goto out3; 6760 6761 err = ext4_init_mballoc(); 6762 if (err) 6763 goto out2; 6764 err = init_inodecache(); 6765 if (err) 6766 goto out1; 6767 6768 err = ext4_fc_init_dentry_cache(); 6769 if (err) 6770 goto out05; 6771 6772 register_as_ext3(); 6773 register_as_ext2(); 6774 err = register_filesystem(&ext4_fs_type); 6775 if (err) 6776 goto out; 6777 6778 return 0; 6779 out: 6780 unregister_as_ext2(); 6781 unregister_as_ext3(); 6782 out05: 6783 destroy_inodecache(); 6784 out1: 6785 ext4_exit_mballoc(); 6786 out2: 6787 ext4_exit_sysfs(); 6788 out3: 6789 ext4_exit_system_zone(); 6790 out4: 6791 ext4_exit_pageio(); 6792 out5: 6793 ext4_exit_post_read_processing(); 6794 out6: 6795 ext4_exit_pending(); 6796 out7: 6797 ext4_exit_es(); 6798 6799 return err; 6800 } 6801 6802 static void __exit ext4_exit_fs(void) 6803 { 6804 ext4_destroy_lazyinit_thread(); 6805 unregister_as_ext2(); 6806 unregister_as_ext3(); 6807 unregister_filesystem(&ext4_fs_type); 6808 destroy_inodecache(); 6809 ext4_exit_mballoc(); 6810 ext4_exit_sysfs(); 6811 ext4_exit_system_zone(); 6812 
ext4_exit_pageio(); 6813 ext4_exit_post_read_processing(); 6814 ext4_exit_es(); 6815 ext4_exit_pending(); 6816 } 6817 6818 MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); 6819 MODULE_DESCRIPTION("Fourth Extended Filesystem"); 6820 MODULE_LICENSE("GPL"); 6821 MODULE_SOFTDEP("pre: crc32c"); 6822 module_init(ext4_init_fs) 6823 module_exit(ext4_exit_fs) 6824