xref: /openbmc/linux/fs/ext4/super.c (revision 6ecc07b9)
1 /*
2  *  linux/fs/ext4/super.c
3  *
4  * Copyright (C) 1992, 1993, 1994, 1995
5  * Remy Card (card@masi.ibp.fr)
6  * Laboratoire MASI - Institut Blaise Pascal
7  * Universite Pierre et Marie Curie (Paris VI)
8  *
9  *  from
10  *
11  *  linux/fs/minix/inode.c
12  *
13  *  Copyright (C) 1991, 1992  Linus Torvalds
14  *
15  *  Big-endian to little-endian byte-swapping/bitmaps by
16  *        David S. Miller (davem@caip.rutgers.edu), 1995
17  */
18 
19 #include <linux/module.h>
20 #include <linux/string.h>
21 #include <linux/fs.h>
22 #include <linux/time.h>
23 #include <linux/vmalloc.h>
24 #include <linux/jbd2.h>
25 #include <linux/slab.h>
26 #include <linux/init.h>
27 #include <linux/blkdev.h>
28 #include <linux/parser.h>
29 #include <linux/buffer_head.h>
30 #include <linux/exportfs.h>
31 #include <linux/vfs.h>
32 #include <linux/random.h>
33 #include <linux/mount.h>
34 #include <linux/namei.h>
35 #include <linux/quotaops.h>
36 #include <linux/seq_file.h>
37 #include <linux/proc_fs.h>
38 #include <linux/ctype.h>
39 #include <linux/log2.h>
40 #include <linux/crc16.h>
41 #include <linux/cleancache.h>
42 #include <asm/uaccess.h>
43 
44 #include <linux/kthread.h>
45 #include <linux/freezer.h>
46 
47 #include "ext4.h"
48 #include "ext4_jbd2.h"
49 #include "xattr.h"
50 #include "acl.h"
51 #include "mballoc.h"
52 
53 #define CREATE_TRACE_POINTS
54 #include <trace/events/ext4.h>
55 
/*
 * File-scope state.  Only ext4_proc_root is used in the part of the file
 * reviewed here (ext4_put_super() removes the per-superblock proc entry
 * from under it).
 */
static struct proc_dir_entry *ext4_proc_root;
/* NOTE(review): presumably the sysfs kset for per-sb kobjects -- confirm */
static struct kset *ext4_kset;
/* NOTE(review): presumably lazy-init thread state and its mutex -- confirm */
static struct ext4_lazy_init *ext4_li_info;
static struct mutex ext4_li_mtx;
/* NOTE(review): presumably the sysfs "features" object -- confirm */
static struct ext4_features *ext4_feat;

/*
 * Forward declarations for functions that are referenced before their
 * definitions later in this file.
 */
static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
			     unsigned long journal_devnum);
static int ext4_commit_super(struct super_block *sb, int sync);
static void ext4_mark_recovery_complete(struct super_block *sb,
					struct ext4_super_block *es);
static void ext4_clear_journal_err(struct super_block *sb,
				   struct ext4_super_block *es);
static int ext4_sync_fs(struct super_block *sb, int wait);
static const char *ext4_decode_error(struct super_block *sb, int errno,
				     char nbuf[16]);
static int ext4_remount(struct super_block *sb, int *flags, char *data);
static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
static int ext4_unfreeze(struct super_block *sb);
static void ext4_write_super(struct super_block *sb);
static int ext4_freeze(struct super_block *sb);
static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
		       const char *dev_name, void *data);
static inline int ext2_feature_set_ok(struct super_block *sb);
static inline int ext3_feature_set_ok(struct super_block *sb);
static int ext4_feature_set_ok(struct super_block *sb, int readonly);
static void ext4_destroy_lazyinit_thread(void);
static void ext4_unregister_li_request(struct super_block *sb);
static void ext4_clear_request_list(void);
85 
/*
 * When no native ext2 driver is configured (neither built in nor as a
 * module) and CONFIG_EXT4_USE_FOR_EXT23 is set, define an "ext2"
 * filesystem type whose mount entry point is ext4_mount().
 *
 * IS_EXT2_SB() is true when the superblock's block device is held by that
 * type; without the type it is a constant 0.
 */
#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
static struct file_system_type ext2_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "ext2",
	.mount		= ext4_mount,
	.kill_sb	= kill_block_super,
	.fs_flags	= FS_REQUIRES_DEV,
};
#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type)
#else
#define IS_EXT2_SB(sb) (0)
#endif
98 
99 
/*
 * When no native ext3 driver is configured (neither built in nor as a
 * module) and CONFIG_EXT4_USE_FOR_EXT23 is set, define an "ext3"
 * filesystem type whose mount entry point is ext4_mount().
 *
 * IS_EXT3_SB() is true when the superblock's block device is held by that
 * type; without the type it is a constant 0.
 */
#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
static struct file_system_type ext3_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "ext3",
	.mount		= ext4_mount,
	.kill_sb	= kill_block_super,
	.fs_flags	= FS_REQUIRES_DEV,
};
#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
#else
#define IS_EXT3_SB(sb) (0)
#endif
112 
113 void *ext4_kvmalloc(size_t size, gfp_t flags)
114 {
115 	void *ret;
116 
117 	ret = kmalloc(size, flags);
118 	if (!ret)
119 		ret = __vmalloc(size, flags, PAGE_KERNEL);
120 	return ret;
121 }
122 
123 void *ext4_kvzalloc(size_t size, gfp_t flags)
124 {
125 	void *ret;
126 
127 	ret = kzalloc(size, flags);
128 	if (!ret)
129 		ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL);
130 	return ret;
131 }
132 
/*
 * Free a buffer obtained from ext4_kvmalloc()/ext4_kvzalloc(), choosing
 * vfree() or kfree() according to where the address lives.
 */
void ext4_kvfree(void *ptr)
{
	if (!is_vmalloc_addr(ptr)) {
		kfree(ptr);
		return;
	}
	vfree(ptr);
}
141 
142 ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
143 			       struct ext4_group_desc *bg)
144 {
145 	return le32_to_cpu(bg->bg_block_bitmap_lo) |
146 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
147 		 (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
148 }
149 
150 ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
151 			       struct ext4_group_desc *bg)
152 {
153 	return le32_to_cpu(bg->bg_inode_bitmap_lo) |
154 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
155 		 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
156 }
157 
158 ext4_fsblk_t ext4_inode_table(struct super_block *sb,
159 			      struct ext4_group_desc *bg)
160 {
161 	return le32_to_cpu(bg->bg_inode_table_lo) |
162 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
163 		 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
164 }
165 
166 __u32 ext4_free_blks_count(struct super_block *sb,
167 			      struct ext4_group_desc *bg)
168 {
169 	return le16_to_cpu(bg->bg_free_blocks_count_lo) |
170 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
171 		 (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
172 }
173 
174 __u32 ext4_free_inodes_count(struct super_block *sb,
175 			      struct ext4_group_desc *bg)
176 {
177 	return le16_to_cpu(bg->bg_free_inodes_count_lo) |
178 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
179 		 (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
180 }
181 
182 __u32 ext4_used_dirs_count(struct super_block *sb,
183 			      struct ext4_group_desc *bg)
184 {
185 	return le16_to_cpu(bg->bg_used_dirs_count_lo) |
186 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
187 		 (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
188 }
189 
190 __u32 ext4_itable_unused_count(struct super_block *sb,
191 			      struct ext4_group_desc *bg)
192 {
193 	return le16_to_cpu(bg->bg_itable_unused_lo) |
194 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
195 		 (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
196 }
197 
198 void ext4_block_bitmap_set(struct super_block *sb,
199 			   struct ext4_group_desc *bg, ext4_fsblk_t blk)
200 {
201 	bg->bg_block_bitmap_lo = cpu_to_le32((u32)blk);
202 	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
203 		bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32);
204 }
205 
206 void ext4_inode_bitmap_set(struct super_block *sb,
207 			   struct ext4_group_desc *bg, ext4_fsblk_t blk)
208 {
209 	bg->bg_inode_bitmap_lo  = cpu_to_le32((u32)blk);
210 	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
211 		bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32);
212 }
213 
214 void ext4_inode_table_set(struct super_block *sb,
215 			  struct ext4_group_desc *bg, ext4_fsblk_t blk)
216 {
217 	bg->bg_inode_table_lo = cpu_to_le32((u32)blk);
218 	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
219 		bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
220 }
221 
222 void ext4_free_blks_set(struct super_block *sb,
223 			  struct ext4_group_desc *bg, __u32 count)
224 {
225 	bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count);
226 	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
227 		bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16);
228 }
229 
230 void ext4_free_inodes_set(struct super_block *sb,
231 			  struct ext4_group_desc *bg, __u32 count)
232 {
233 	bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count);
234 	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
235 		bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16);
236 }
237 
238 void ext4_used_dirs_set(struct super_block *sb,
239 			  struct ext4_group_desc *bg, __u32 count)
240 {
241 	bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count);
242 	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
243 		bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16);
244 }
245 
246 void ext4_itable_unused_set(struct super_block *sb,
247 			  struct ext4_group_desc *bg, __u32 count)
248 {
249 	bg->bg_itable_unused_lo = cpu_to_le16((__u16)count);
250 	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
251 		bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
252 }
253 
254 
255 /* Just increment the non-pointer handle value */
256 static handle_t *ext4_get_nojournal(void)
257 {
258 	handle_t *handle = current->journal_info;
259 	unsigned long ref_cnt = (unsigned long)handle;
260 
261 	BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT);
262 
263 	ref_cnt++;
264 	handle = (handle_t *)ref_cnt;
265 
266 	current->journal_info = handle;
267 	return handle;
268 }
269 
270 
271 /* Decrement the non-pointer handle value */
272 static void ext4_put_nojournal(handle_t *handle)
273 {
274 	unsigned long ref_cnt = (unsigned long)handle;
275 
276 	BUG_ON(ref_cnt == 0);
277 
278 	ref_cnt--;
279 	handle = (handle_t *)ref_cnt;
280 
281 	current->journal_info = handle;
282 }
283 
284 /*
285  * Wrappers for jbd2_journal_start/end.
286  *
287  * The only special thing we need to do here is to make sure that all
288  * journal_end calls result in the superblock being marked dirty, so
289  * that sync() will call the filesystem's write_super callback if
290  * appropriate.
291  *
292  * To avoid j_barrier hold in userspace when a user calls freeze(),
293  * ext4 prevents a new handle from being started by s_frozen, which
294  * is in an upper layer.
295  */
296 handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
297 {
298 	journal_t *journal;
299 	handle_t  *handle;
300 
301 	trace_ext4_journal_start(sb, nblocks, _RET_IP_);
302 	if (sb->s_flags & MS_RDONLY)
303 		return ERR_PTR(-EROFS);
304 
305 	journal = EXT4_SB(sb)->s_journal;
306 	handle = ext4_journal_current_handle();
307 
308 	/*
309 	 * If a handle has been started, it should be allowed to
310 	 * finish, otherwise deadlock could happen between freeze
311 	 * and others(e.g. truncate) due to the restart of the
312 	 * journal handle if the filesystem is forzen and active
313 	 * handles are not stopped.
314 	 */
315 	if (!handle)
316 		vfs_check_frozen(sb, SB_FREEZE_TRANS);
317 
318 	if (!journal)
319 		return ext4_get_nojournal();
320 	/*
321 	 * Special case here: if the journal has aborted behind our
322 	 * backs (eg. EIO in the commit thread), then we still need to
323 	 * take the FS itself readonly cleanly.
324 	 */
325 	if (is_journal_aborted(journal)) {
326 		ext4_abort(sb, "Detected aborted journal");
327 		return ERR_PTR(-EROFS);
328 	}
329 	return jbd2_journal_start(journal, nblocks);
330 }
331 
332 /*
333  * The only special thing we need to do here is to make sure that all
334  * jbd2_journal_stop calls result in the superblock being marked dirty, so
335  * that sync() will call the filesystem's write_super callback if
336  * appropriate.
337  */
338 int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
339 {
340 	struct super_block *sb;
341 	int err;
342 	int rc;
343 
344 	if (!ext4_handle_valid(handle)) {
345 		ext4_put_nojournal(handle);
346 		return 0;
347 	}
348 	sb = handle->h_transaction->t_journal->j_private;
349 	err = handle->h_err;
350 	rc = jbd2_journal_stop(handle);
351 
352 	if (!err)
353 		err = rc;
354 	if (err)
355 		__ext4_std_error(sb, where, line, err);
356 	return err;
357 }
358 
359 void ext4_journal_abort_handle(const char *caller, unsigned int line,
360 			       const char *err_fn, struct buffer_head *bh,
361 			       handle_t *handle, int err)
362 {
363 	char nbuf[16];
364 	const char *errstr = ext4_decode_error(NULL, err, nbuf);
365 
366 	BUG_ON(!ext4_handle_valid(handle));
367 
368 	if (bh)
369 		BUFFER_TRACE(bh, "abort");
370 
371 	if (!handle->h_err)
372 		handle->h_err = err;
373 
374 	if (is_handle_aborted(handle))
375 		return;
376 
377 	printk(KERN_ERR "%s:%d: aborting transaction: %s in %s\n",
378 	       caller, line, errstr, err_fn);
379 
380 	jbd2_journal_abort_handle(handle);
381 }
382 
383 static void __save_error_info(struct super_block *sb, const char *func,
384 			    unsigned int line)
385 {
386 	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
387 
388 	EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
389 	es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
390 	es->s_last_error_time = cpu_to_le32(get_seconds());
391 	strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
392 	es->s_last_error_line = cpu_to_le32(line);
393 	if (!es->s_first_error_time) {
394 		es->s_first_error_time = es->s_last_error_time;
395 		strncpy(es->s_first_error_func, func,
396 			sizeof(es->s_first_error_func));
397 		es->s_first_error_line = cpu_to_le32(line);
398 		es->s_first_error_ino = es->s_last_error_ino;
399 		es->s_first_error_block = es->s_last_error_block;
400 	}
401 	/*
402 	 * Start the daily error reporting function if it hasn't been
403 	 * started already
404 	 */
405 	if (!es->s_error_count)
406 		mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ);
407 	es->s_error_count = cpu_to_le32(le32_to_cpu(es->s_error_count) + 1);
408 }
409 
/*
 * Record the error raised at @func:@line in the superblock and then
 * commit the superblock synchronously.
 */
static void save_error_info(struct super_block *sb, const char *func,
			    unsigned int line)
{
	const int sync = 1;

	__save_error_info(sb, func, line);
	ext4_commit_super(sb, sync);
}
416 
417 
418 /* Deal with the reporting of failure conditions on a filesystem such as
419  * inconsistencies detected or read IO failures.
420  *
421  * On ext2, we can store the error state of the filesystem in the
422  * superblock.  That is not possible on ext4, because we may have other
423  * write ordering constraints on the superblock which prevent us from
424  * writing it out straight away; and given that the journal is about to
425  * be aborted, we can't rely on the current, or future, transactions to
426  * write out the superblock safely.
427  *
428  * We'll just use the jbd2_journal_abort() error code to record an error in
429  * the journal instead.  On recovery, the journal will complain about
430  * that error until we've noted it down and cleared it.
431  */
432 
433 static void ext4_handle_error(struct super_block *sb)
434 {
435 	if (sb->s_flags & MS_RDONLY)
436 		return;
437 
438 	if (!test_opt(sb, ERRORS_CONT)) {
439 		journal_t *journal = EXT4_SB(sb)->s_journal;
440 
441 		EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
442 		if (journal)
443 			jbd2_journal_abort(journal, -EIO);
444 	}
445 	if (test_opt(sb, ERRORS_RO)) {
446 		ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
447 		sb->s_flags |= MS_RDONLY;
448 	}
449 	if (test_opt(sb, ERRORS_PANIC))
450 		panic("EXT4-fs (device %s): panic forced after error\n",
451 			sb->s_id);
452 }
453 
454 void __ext4_error(struct super_block *sb, const char *function,
455 		  unsigned int line, const char *fmt, ...)
456 {
457 	struct va_format vaf;
458 	va_list args;
459 
460 	va_start(args, fmt);
461 	vaf.fmt = fmt;
462 	vaf.va = &args;
463 	printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
464 	       sb->s_id, function, line, current->comm, &vaf);
465 	va_end(args);
466 
467 	ext4_handle_error(sb);
468 }
469 
470 void ext4_error_inode(struct inode *inode, const char *function,
471 		      unsigned int line, ext4_fsblk_t block,
472 		      const char *fmt, ...)
473 {
474 	va_list args;
475 	struct va_format vaf;
476 	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
477 
478 	es->s_last_error_ino = cpu_to_le32(inode->i_ino);
479 	es->s_last_error_block = cpu_to_le64(block);
480 	save_error_info(inode->i_sb, function, line);
481 	va_start(args, fmt);
482 	vaf.fmt = fmt;
483 	vaf.va = &args;
484 	printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
485 	       inode->i_sb->s_id, function, line, inode->i_ino);
486 	if (block)
487 		printk(KERN_CONT "block %llu: ", block);
488 	printk(KERN_CONT "comm %s: %pV\n", current->comm, &vaf);
489 	va_end(args);
490 
491 	ext4_handle_error(inode->i_sb);
492 }
493 
494 void ext4_error_file(struct file *file, const char *function,
495 		     unsigned int line, ext4_fsblk_t block,
496 		     const char *fmt, ...)
497 {
498 	va_list args;
499 	struct va_format vaf;
500 	struct ext4_super_block *es;
501 	struct inode *inode = file->f_dentry->d_inode;
502 	char pathname[80], *path;
503 
504 	es = EXT4_SB(inode->i_sb)->s_es;
505 	es->s_last_error_ino = cpu_to_le32(inode->i_ino);
506 	save_error_info(inode->i_sb, function, line);
507 	path = d_path(&(file->f_path), pathname, sizeof(pathname));
508 	if (IS_ERR(path))
509 		path = "(unknown)";
510 	printk(KERN_CRIT
511 	       "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
512 	       inode->i_sb->s_id, function, line, inode->i_ino);
513 	if (block)
514 		printk(KERN_CONT "block %llu: ", block);
515 	va_start(args, fmt);
516 	vaf.fmt = fmt;
517 	vaf.va = &args;
518 	printk(KERN_CONT "comm %s: path %s: %pV\n", current->comm, path, &vaf);
519 	va_end(args);
520 
521 	ext4_handle_error(inode->i_sb);
522 }
523 
524 static const char *ext4_decode_error(struct super_block *sb, int errno,
525 				     char nbuf[16])
526 {
527 	char *errstr = NULL;
528 
529 	switch (errno) {
530 	case -EIO:
531 		errstr = "IO failure";
532 		break;
533 	case -ENOMEM:
534 		errstr = "Out of memory";
535 		break;
536 	case -EROFS:
537 		if (!sb || (EXT4_SB(sb)->s_journal &&
538 			    EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT))
539 			errstr = "Journal has aborted";
540 		else
541 			errstr = "Readonly filesystem";
542 		break;
543 	default:
544 		/* If the caller passed in an extra buffer for unknown
545 		 * errors, textualise them now.  Else we just return
546 		 * NULL. */
547 		if (nbuf) {
548 			/* Check for truncated error codes... */
549 			if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
550 				errstr = nbuf;
551 		}
552 		break;
553 	}
554 
555 	return errstr;
556 }
557 
558 /* __ext4_std_error decodes expected errors from journaling functions
559  * automatically and invokes the appropriate error response.  */
560 
561 void __ext4_std_error(struct super_block *sb, const char *function,
562 		      unsigned int line, int errno)
563 {
564 	char nbuf[16];
565 	const char *errstr;
566 
567 	/* Special case: if the error is EROFS, and we're not already
568 	 * inside a transaction, then there's really no point in logging
569 	 * an error. */
570 	if (errno == -EROFS && journal_current_handle() == NULL &&
571 	    (sb->s_flags & MS_RDONLY))
572 		return;
573 
574 	errstr = ext4_decode_error(sb, errno, nbuf);
575 	printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
576 	       sb->s_id, function, line, errstr);
577 	save_error_info(sb, function, line);
578 
579 	ext4_handle_error(sb);
580 }
581 
582 /*
583  * ext4_abort is a much stronger failure handler than ext4_error.  The
584  * abort function may be used to deal with unrecoverable failures such
585  * as journal IO errors or ENOMEM at a critical moment in log management.
586  *
587  * We unconditionally force the filesystem into an ABORT|READONLY state,
588  * unless the error response on the fs has been set to panic in which
589  * case we take the easy way out and panic immediately.
590  */
591 
592 void __ext4_abort(struct super_block *sb, const char *function,
593 		unsigned int line, const char *fmt, ...)
594 {
595 	va_list args;
596 
597 	save_error_info(sb, function, line);
598 	va_start(args, fmt);
599 	printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: ", sb->s_id,
600 	       function, line);
601 	vprintk(fmt, args);
602 	printk("\n");
603 	va_end(args);
604 
605 	if ((sb->s_flags & MS_RDONLY) == 0) {
606 		ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
607 		sb->s_flags |= MS_RDONLY;
608 		EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
609 		if (EXT4_SB(sb)->s_journal)
610 			jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
611 		save_error_info(sb, function, line);
612 	}
613 	if (test_opt(sb, ERRORS_PANIC))
614 		panic("EXT4-fs panic from previous error\n");
615 }
616 
617 void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
618 {
619 	struct va_format vaf;
620 	va_list args;
621 
622 	va_start(args, fmt);
623 	vaf.fmt = fmt;
624 	vaf.va = &args;
625 	printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
626 	va_end(args);
627 }
628 
629 void __ext4_warning(struct super_block *sb, const char *function,
630 		    unsigned int line, const char *fmt, ...)
631 {
632 	struct va_format vaf;
633 	va_list args;
634 
635 	va_start(args, fmt);
636 	vaf.fmt = fmt;
637 	vaf.va = &args;
638 	printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n",
639 	       sb->s_id, function, line, &vaf);
640 	va_end(args);
641 }
642 
643 void __ext4_grp_locked_error(const char *function, unsigned int line,
644 			     struct super_block *sb, ext4_group_t grp,
645 			     unsigned long ino, ext4_fsblk_t block,
646 			     const char *fmt, ...)
647 __releases(bitlock)
648 __acquires(bitlock)
649 {
650 	struct va_format vaf;
651 	va_list args;
652 	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
653 
654 	es->s_last_error_ino = cpu_to_le32(ino);
655 	es->s_last_error_block = cpu_to_le64(block);
656 	__save_error_info(sb, function, line);
657 
658 	va_start(args, fmt);
659 
660 	vaf.fmt = fmt;
661 	vaf.va = &args;
662 	printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ",
663 	       sb->s_id, function, line, grp);
664 	if (ino)
665 		printk(KERN_CONT "inode %lu: ", ino);
666 	if (block)
667 		printk(KERN_CONT "block %llu:", (unsigned long long) block);
668 	printk(KERN_CONT "%pV\n", &vaf);
669 	va_end(args);
670 
671 	if (test_opt(sb, ERRORS_CONT)) {
672 		ext4_commit_super(sb, 0);
673 		return;
674 	}
675 
676 	ext4_unlock_group(sb, grp);
677 	ext4_handle_error(sb);
678 	/*
679 	 * We only get here in the ERRORS_RO case; relocking the group
680 	 * may be dangerous, but nothing bad will happen since the
681 	 * filesystem will have already been marked read/only and the
682 	 * journal has been aborted.  We return 1 as a hint to callers
683 	 * who might what to use the return value from
684 	 * ext4_grp_locked_error() to distinguish between the
685 	 * ERRORS_CONT and ERRORS_RO case, and perhaps return more
686 	 * aggressively from the ext4 function in question, with a
687 	 * more appropriate error code.
688 	 */
689 	ext4_lock_group(sb, grp);
690 	return;
691 }
692 
693 void ext4_update_dynamic_rev(struct super_block *sb)
694 {
695 	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
696 
697 	if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV)
698 		return;
699 
700 	ext4_warning(sb,
701 		     "updating to rev %d because of new feature flag, "
702 		     "running e2fsck is recommended",
703 		     EXT4_DYNAMIC_REV);
704 
705 	es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO);
706 	es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE);
707 	es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV);
708 	/* leave es->s_feature_*compat flags alone */
709 	/* es->s_uuid will be set by e2fsck if empty */
710 
711 	/*
712 	 * The rest of the superblock fields should be zero, and if not it
713 	 * means they are likely already in use, so leave them alone.  We
714 	 * can leave it up to e2fsck to clean up any inconsistencies there.
715 	 */
716 }
717 
718 /*
719  * Open the external journal device
720  */
721 static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
722 {
723 	struct block_device *bdev;
724 	char b[BDEVNAME_SIZE];
725 
726 	bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
727 	if (IS_ERR(bdev))
728 		goto fail;
729 	return bdev;
730 
731 fail:
732 	ext4_msg(sb, KERN_ERR, "failed to open journal device %s: %ld",
733 			__bdevname(dev, b), PTR_ERR(bdev));
734 	return NULL;
735 }
736 
737 /*
738  * Release the journal device
739  */
740 static int ext4_blkdev_put(struct block_device *bdev)
741 {
742 	return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
743 }
744 
745 static int ext4_blkdev_remove(struct ext4_sb_info *sbi)
746 {
747 	struct block_device *bdev;
748 	int ret = -ENODEV;
749 
750 	bdev = sbi->journal_bdev;
751 	if (bdev) {
752 		ret = ext4_blkdev_put(bdev);
753 		sbi->journal_bdev = NULL;
754 	}
755 	return ret;
756 }
757 
758 static inline struct inode *orphan_list_entry(struct list_head *l)
759 {
760 	return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode;
761 }
762 
763 static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
764 {
765 	struct list_head *l;
766 
767 	ext4_msg(sb, KERN_ERR, "sb orphan head is %d",
768 		 le32_to_cpu(sbi->s_es->s_last_orphan));
769 
770 	printk(KERN_ERR "sb_info orphan list:\n");
771 	list_for_each(l, &sbi->s_orphan) {
772 		struct inode *inode = orphan_list_entry(l);
773 		printk(KERN_ERR "  "
774 		       "inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
775 		       inode->i_sb->s_id, inode->i_ino, inode,
776 		       inode->i_mode, inode->i_nlink,
777 		       NEXT_ORPHAN(inode));
778 	}
779 }
780 
/*
 * Tear down the ext4-private state of @sb.
 *
 * The steps below run in a deliberate order: background work is stopped
 * first, then (under lock_super) the journal is destroyed, the in-memory
 * subsystems are released, the final superblock state is written for a
 * read-write filesystem, and the remaining buffers, counters and devices
 * are released.  The sbi structure itself is freed only after its kobject
 * has signalled s_kobj_unregister.
 */
static void ext4_put_super(struct super_block *sb)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_super_block *es = sbi->s_es;
	int i, err;

	/* Stop lazy-init requests and quota for this superblock. */
	ext4_unregister_li_request(sb);
	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);

	/* Drain the workqueue before destroying it. */
	flush_workqueue(sbi->dio_unwritten_wq);
	destroy_workqueue(sbi->dio_unwritten_wq);

	lock_super(sb);
	if (sb->s_dirt)
		ext4_commit_super(sb, 1);

	if (sbi->s_journal) {
		err = jbd2_journal_destroy(sbi->s_journal);
		/* Cleared first, so ext4_abort() below sees no journal. */
		sbi->s_journal = NULL;
		if (err < 0)
			ext4_abort(sb, "Couldn't clean up the journal");
	}

	/*
	 * NOTE(review): del_timer() does not wait for a handler that is
	 * already running on another CPU, and sbi is freed at the end of
	 * this function -- consider whether del_timer_sync() is needed.
	 */
	del_timer(&sbi->s_err_report);
	ext4_release_system_zone(sb);
	ext4_mb_release(sb);
	ext4_ext_release(sb);
	ext4_xattr_put_super(sb);

	/*
	 * On a read-write filesystem, clear the RECOVER feature flag and
	 * write the in-memory mount state back to disk synchronously.
	 */
	if (!(sb->s_flags & MS_RDONLY)) {
		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
		es->s_state = cpu_to_le16(sbi->s_mount_state);
		ext4_commit_super(sb, 1);
	}
	if (sbi->s_proc) {
		remove_proc_entry(sb->s_id, ext4_proc_root);
	}
	kobject_del(&sbi->s_kobj);

	/* Release the group descriptor buffers and per-sb tables. */
	for (i = 0; i < sbi->s_gdb_count; i++)
		brelse(sbi->s_group_desc[i]);
	ext4_kvfree(sbi->s_group_desc);
	ext4_kvfree(sbi->s_flex_groups);
	percpu_counter_destroy(&sbi->s_freeblocks_counter);
	percpu_counter_destroy(&sbi->s_freeinodes_counter);
	percpu_counter_destroy(&sbi->s_dirs_counter);
	percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
	/* es points at sbi->s_es and is not used after this point. */
	brelse(sbi->s_sbh);
#ifdef CONFIG_QUOTA
	for (i = 0; i < MAXQUOTAS; i++)
		kfree(sbi->s_qf_names[i]);
#endif

	/* Debugging code just in case the in-memory inode orphan list
	 * isn't empty.  The on-disk one can be non-empty if we've
	 * detected an error and taken the fs readonly, but the
	 * in-memory list had better be clean by this point. */
	if (!list_empty(&sbi->s_orphan))
		dump_orphan_list(sb, sbi);
	J_ASSERT(list_empty(&sbi->s_orphan));

	invalidate_bdev(sb->s_bdev);
	if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
		/*
		 * Invalidate the journal device's buffers.  We don't want them
		 * floating about in memory - the physical journal device may
		 * be hotswapped, and it breaks the `ro-after' testing code.
		 */
		sync_blockdev(sbi->journal_bdev);
		invalidate_bdev(sbi->journal_bdev);
		ext4_blkdev_remove(sbi);
	}
	if (sbi->s_mmp_tsk)
		kthread_stop(sbi->s_mmp_tsk);
	sb->s_fs_info = NULL;
	/*
	 * Now that we are completely done shutting down the
	 * superblock, we need to actually destroy the kobject.
	 */
	unlock_super(sb);
	kobject_put(&sbi->s_kobj);
	/* Wait for the kobject to signal completion before freeing sbi. */
	wait_for_completion(&sbi->s_kobj_unregister);
	kfree(sbi->s_blockgroup_lock);
	kfree(sbi);
}
866 
867 static struct kmem_cache *ext4_inode_cachep;
868 
869 /*
870  * Called inside transaction, so use GFP_NOFS
871  */
872 static struct inode *ext4_alloc_inode(struct super_block *sb)
873 {
874 	struct ext4_inode_info *ei;
875 
876 	ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
877 	if (!ei)
878 		return NULL;
879 
880 	ei->vfs_inode.i_version = 1;
881 	ei->vfs_inode.i_data.writeback_index = 0;
882 	memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
883 	INIT_LIST_HEAD(&ei->i_prealloc_list);
884 	spin_lock_init(&ei->i_prealloc_lock);
885 	ei->i_reserved_data_blocks = 0;
886 	ei->i_reserved_meta_blocks = 0;
887 	ei->i_allocated_meta_blocks = 0;
888 	ei->i_da_metadata_calc_len = 0;
889 	spin_lock_init(&(ei->i_block_reservation_lock));
890 #ifdef CONFIG_QUOTA
891 	ei->i_reserved_quota = 0;
892 #endif
893 	ei->jinode = NULL;
894 	INIT_LIST_HEAD(&ei->i_completed_io_list);
895 	spin_lock_init(&ei->i_completed_io_lock);
896 	ei->cur_aio_dio = NULL;
897 	ei->i_sync_tid = 0;
898 	ei->i_datasync_tid = 0;
899 	atomic_set(&ei->i_ioend_count, 0);
900 	atomic_set(&ei->i_aiodio_unwritten, 0);
901 
902 	return &ei->vfs_inode;
903 }
904 
/*
 * Defer the decision to generic_drop_inode(); the only addition is a
 * tracepoint recording the outcome.
 */
static int ext4_drop_inode(struct inode *inode)
{
	const int should_drop = generic_drop_inode(inode);

	trace_ext4_drop_inode(inode, should_drop);
	return should_drop;
}
912 
913 static void ext4_i_callback(struct rcu_head *head)
914 {
915 	struct inode *inode = container_of(head, struct inode, i_rcu);
916 	INIT_LIST_HEAD(&inode->i_dentry);
917 	kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
918 }
919 
920 static void ext4_destroy_inode(struct inode *inode)
921 {
922 	if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
923 		ext4_msg(inode->i_sb, KERN_ERR,
924 			 "Inode %lu (%p): orphan list check failed!",
925 			 inode->i_ino, EXT4_I(inode));
926 		print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
927 				EXT4_I(inode), sizeof(struct ext4_inode_info),
928 				true);
929 		dump_stack();
930 	}
931 	call_rcu(&inode->i_rcu, ext4_i_callback);
932 }
933 
934 static void init_once(void *foo)
935 {
936 	struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
937 
938 	INIT_LIST_HEAD(&ei->i_orphan);
939 #ifdef CONFIG_EXT4_FS_XATTR
940 	init_rwsem(&ei->xattr_sem);
941 #endif
942 	init_rwsem(&ei->i_data_sem);
943 	inode_init_once(&ei->vfs_inode);
944 }
945 
946 static int init_inodecache(void)
947 {
948 	ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
949 					     sizeof(struct ext4_inode_info),
950 					     0, (SLAB_RECLAIM_ACCOUNT|
951 						SLAB_MEM_SPREAD),
952 					     init_once);
953 	if (ext4_inode_cachep == NULL)
954 		return -ENOMEM;
955 	return 0;
956 }
957 
958 static void destroy_inodecache(void)
959 {
960 	kmem_cache_destroy(ext4_inode_cachep);
961 }
962 
963 void ext4_clear_inode(struct inode *inode)
964 {
965 	invalidate_inode_buffers(inode);
966 	end_writeback(inode);
967 	dquot_drop(inode);
968 	ext4_discard_preallocations(inode);
969 	if (EXT4_I(inode)->jinode) {
970 		jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
971 					       EXT4_I(inode)->jinode);
972 		jbd2_free_inode(EXT4_I(inode)->jinode);
973 		EXT4_I(inode)->jinode = NULL;
974 	}
975 }
976 
/*
 * Append the quota-related mount options of @sb to @seq: the journaled
 * quota format, the journaled quota file names and the usrquota/grpquota
 * flags.  Emits nothing when CONFIG_QUOTA is not defined.
 */
static inline void ext4_show_quota_options(struct seq_file *seq,
					   struct super_block *sb)
{
#if defined(CONFIG_QUOTA)
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	const char *usr_file = sbi->s_qf_names[USRQUOTA];
	const char *grp_file = sbi->s_qf_names[GRPQUOTA];

	if (sbi->s_jquota_fmt) {
		const char *fmtname;

		/* An unrecognised format is shown with an empty name. */
		if (sbi->s_jquota_fmt == QFMT_VFS_OLD)
			fmtname = "vfsold";
		else if (sbi->s_jquota_fmt == QFMT_VFS_V0)
			fmtname = "vfsv0";
		else if (sbi->s_jquota_fmt == QFMT_VFS_V1)
			fmtname = "vfsv1";
		else
			fmtname = "";
		seq_printf(seq, ",jqfmt=%s", fmtname);
	}

	if (usr_file)
		seq_printf(seq, ",usrjquota=%s", usr_file);

	if (grp_file)
		seq_printf(seq, ",grpjquota=%s", grp_file);

	if (test_opt(sb, USRQUOTA))
		seq_puts(seq, ",usrquota");

	if (test_opt(sb, GRPQUOTA))
		seq_puts(seq, ",grpquota");
#endif
}
1013 
1014 /*
1015  * Show an option if
1016  *  - it's set to a non-default value OR
1017  *  - if the per-sb default is different from the global default
1018  */
1019 static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
1020 {
1021 	int def_errors;
1022 	unsigned long def_mount_opts;
1023 	struct super_block *sb = vfs->mnt_sb;
1024 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1025 	struct ext4_super_block *es = sbi->s_es;
1026 
1027 	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
1028 	def_errors     = le16_to_cpu(es->s_errors);
1029 
1030 	if (sbi->s_sb_block != 1)
1031 		seq_printf(seq, ",sb=%llu", sbi->s_sb_block);
1032 	if (test_opt(sb, MINIX_DF))
1033 		seq_puts(seq, ",minixdf");
1034 	if (test_opt(sb, GRPID) && !(def_mount_opts & EXT4_DEFM_BSDGROUPS))
1035 		seq_puts(seq, ",grpid");
1036 	if (!test_opt(sb, GRPID) && (def_mount_opts & EXT4_DEFM_BSDGROUPS))
1037 		seq_puts(seq, ",nogrpid");
1038 	if (sbi->s_resuid != EXT4_DEF_RESUID ||
1039 	    le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) {
1040 		seq_printf(seq, ",resuid=%u", sbi->s_resuid);
1041 	}
1042 	if (sbi->s_resgid != EXT4_DEF_RESGID ||
1043 	    le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) {
1044 		seq_printf(seq, ",resgid=%u", sbi->s_resgid);
1045 	}
1046 	if (test_opt(sb, ERRORS_RO)) {
1047 		if (def_errors == EXT4_ERRORS_PANIC ||
1048 		    def_errors == EXT4_ERRORS_CONTINUE) {
1049 			seq_puts(seq, ",errors=remount-ro");
1050 		}
1051 	}
1052 	if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
1053 		seq_puts(seq, ",errors=continue");
1054 	if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
1055 		seq_puts(seq, ",errors=panic");
1056 	if (test_opt(sb, NO_UID32) && !(def_mount_opts & EXT4_DEFM_UID16))
1057 		seq_puts(seq, ",nouid32");
1058 	if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG))
1059 		seq_puts(seq, ",debug");
1060 	if (test_opt(sb, OLDALLOC))
1061 		seq_puts(seq, ",oldalloc");
1062 #ifdef CONFIG_EXT4_FS_XATTR
1063 	if (test_opt(sb, XATTR_USER))
1064 		seq_puts(seq, ",user_xattr");
1065 	if (!test_opt(sb, XATTR_USER))
1066 		seq_puts(seq, ",nouser_xattr");
1067 #endif
1068 #ifdef CONFIG_EXT4_FS_POSIX_ACL
1069 	if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL))
1070 		seq_puts(seq, ",acl");
1071 	if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
1072 		seq_puts(seq, ",noacl");
1073 #endif
1074 	if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
1075 		seq_printf(seq, ",commit=%u",
1076 			   (unsigned) (sbi->s_commit_interval / HZ));
1077 	}
1078 	if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) {
1079 		seq_printf(seq, ",min_batch_time=%u",
1080 			   (unsigned) sbi->s_min_batch_time);
1081 	}
1082 	if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) {
1083 		seq_printf(seq, ",max_batch_time=%u",
1084 			   (unsigned) sbi->s_min_batch_time);
1085 	}
1086 
1087 	/*
1088 	 * We're changing the default of barrier mount option, so
1089 	 * let's always display its mount state so it's clear what its
1090 	 * status is.
1091 	 */
1092 	seq_puts(seq, ",barrier=");
1093 	seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
1094 	if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
1095 		seq_puts(seq, ",journal_async_commit");
1096 	else if (test_opt(sb, JOURNAL_CHECKSUM))
1097 		seq_puts(seq, ",journal_checksum");
1098 	if (test_opt(sb, I_VERSION))
1099 		seq_puts(seq, ",i_version");
1100 	if (!test_opt(sb, DELALLOC) &&
1101 	    !(def_mount_opts & EXT4_DEFM_NODELALLOC))
1102 		seq_puts(seq, ",nodelalloc");
1103 
1104 	if (!test_opt(sb, MBLK_IO_SUBMIT))
1105 		seq_puts(seq, ",nomblk_io_submit");
1106 	if (sbi->s_stripe)
1107 		seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
1108 	/*
1109 	 * journal mode get enabled in different ways
1110 	 * So just print the value even if we didn't specify it
1111 	 */
1112 	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
1113 		seq_puts(seq, ",data=journal");
1114 	else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
1115 		seq_puts(seq, ",data=ordered");
1116 	else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
1117 		seq_puts(seq, ",data=writeback");
1118 
1119 	if (sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
1120 		seq_printf(seq, ",inode_readahead_blks=%u",
1121 			   sbi->s_inode_readahead_blks);
1122 
1123 	if (test_opt(sb, DATA_ERR_ABORT))
1124 		seq_puts(seq, ",data_err=abort");
1125 
1126 	if (test_opt(sb, NO_AUTO_DA_ALLOC))
1127 		seq_puts(seq, ",noauto_da_alloc");
1128 
1129 	if (test_opt(sb, DISCARD) && !(def_mount_opts & EXT4_DEFM_DISCARD))
1130 		seq_puts(seq, ",discard");
1131 
1132 	if (test_opt(sb, NOLOAD))
1133 		seq_puts(seq, ",norecovery");
1134 
1135 	if (test_opt(sb, DIOREAD_NOLOCK))
1136 		seq_puts(seq, ",dioread_nolock");
1137 
1138 	if (test_opt(sb, BLOCK_VALIDITY) &&
1139 	    !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY))
1140 		seq_puts(seq, ",block_validity");
1141 
1142 	if (!test_opt(sb, INIT_INODE_TABLE))
1143 		seq_puts(seq, ",noinit_inode_table");
1144 	else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)
1145 		seq_printf(seq, ",init_inode_table=%u",
1146 			   (unsigned) sbi->s_li_wait_mult);
1147 
1148 	ext4_show_quota_options(seq, sb);
1149 
1150 	return 0;
1151 }
1152 
1153 static struct inode *ext4_nfs_get_inode(struct super_block *sb,
1154 					u64 ino, u32 generation)
1155 {
1156 	struct inode *inode;
1157 
1158 	if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)
1159 		return ERR_PTR(-ESTALE);
1160 	if (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
1161 		return ERR_PTR(-ESTALE);
1162 
1163 	/* iget isn't really right if the inode is currently unallocated!!
1164 	 *
1165 	 * ext4_read_inode will return a bad_inode if the inode had been
1166 	 * deleted, so we should be safe.
1167 	 *
1168 	 * Currently we don't know the generation for parent directory, so
1169 	 * a generation of 0 means "accept any"
1170 	 */
1171 	inode = ext4_iget(sb, ino);
1172 	if (IS_ERR(inode))
1173 		return ERR_CAST(inode);
1174 	if (generation && inode->i_generation != generation) {
1175 		iput(inode);
1176 		return ERR_PTR(-ESTALE);
1177 	}
1178 
1179 	return inode;
1180 }
1181 
1182 static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid,
1183 					int fh_len, int fh_type)
1184 {
1185 	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
1186 				    ext4_nfs_get_inode);
1187 }
1188 
1189 static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
1190 					int fh_len, int fh_type)
1191 {
1192 	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
1193 				    ext4_nfs_get_inode);
1194 }
1195 
1196 /*
1197  * Try to release metadata pages (indirect blocks, directories) which are
1198  * mapped via the block device.  Since these pages could have journal heads
1199  * which would prevent try_to_free_buffers() from freeing them, we must use
1200  * jbd2 layer's try_to_free_buffers() function to release them.
1201  */
1202 static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
1203 				 gfp_t wait)
1204 {
1205 	journal_t *journal = EXT4_SB(sb)->s_journal;
1206 
1207 	WARN_ON(PageChecked(page));
1208 	if (!page_has_buffers(page))
1209 		return 0;
1210 	if (journal)
1211 		return jbd2_journal_try_to_free_buffers(journal, page,
1212 							wait & ~__GFP_WAIT);
1213 	return try_to_free_buffers(page);
1214 }
1215 
#ifdef CONFIG_QUOTA
/* Human-readable name of a quota type, used in log messages. */
#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
/*
 * Selects between the USRJQUOTA and GRPJQUOTA variants of a token by
 * pasting them onto @on.
 * NOTE(review): no use of this macro is visible in this part of the file.
 */
#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))

/*
 * Forward declarations of the static quota callbacks referenced by the
 * operation tables below and by the super_operations tables.
 */
static int ext4_write_dquot(struct dquot *dquot);
static int ext4_acquire_dquot(struct dquot *dquot);
static int ext4_release_dquot(struct dquot *dquot);
static int ext4_mark_dquot_dirty(struct dquot *dquot);
static int ext4_write_info(struct super_block *sb, int type);
static int ext4_quota_on(struct super_block *sb, int type, int format_id,
			 struct path *path);
static int ext4_quota_off(struct super_block *sb, int type);
static int ext4_quota_on_mount(struct super_block *sb, int type);
static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
			       size_t len, loff_t off);
static ssize_t ext4_quota_write(struct super_block *sb, int type,
				const char *data, size_t len, loff_t off);

/*
 * dquot callbacks: ext4-specific handlers for most entries, with the
 * generic dquot_alloc()/dquot_destroy() for allocation and destruction.
 */
static const struct dquot_operations ext4_quota_operations = {
	.get_reserved_space = ext4_get_reserved_space,
	.write_dquot	= ext4_write_dquot,
	.acquire_dquot	= ext4_acquire_dquot,
	.release_dquot	= ext4_release_dquot,
	.mark_dirty	= ext4_mark_dquot_dirty,
	.write_info	= ext4_write_info,
	.alloc_dquot	= dquot_alloc,
	.destroy_dquot	= dquot_destroy,
};

/*
 * quotactl callbacks: only quota_on/quota_off are ext4-specific; the
 * remaining entries use the generic dquot_* implementations.
 */
static const struct quotactl_ops ext4_qctl_operations = {
	.quota_on	= ext4_quota_on,
	.quota_off	= ext4_quota_off,
	.quota_sync	= dquot_quota_sync,
	.get_info	= dquot_get_dqinfo,
	.set_info	= dquot_set_dqinfo,
	.get_dqblk	= dquot_get_dqblk,
	.set_dqblk	= dquot_set_dqblk
};
#endif
1255 
/*
 * Super operations table that provides sync_fs and freeze/unfreeze
 * handlers (and no write_super).
 * NOTE(review): presumably installed for journaled mounts, by contrast
 * with ext4_nojournal_sops -- confirm at the point where s_op is set.
 */
static const struct super_operations ext4_sops = {
	.alloc_inode	= ext4_alloc_inode,
	.destroy_inode	= ext4_destroy_inode,
	.write_inode	= ext4_write_inode,
	.dirty_inode	= ext4_dirty_inode,
	.drop_inode	= ext4_drop_inode,
	.evict_inode	= ext4_evict_inode,
	.put_super	= ext4_put_super,
	.sync_fs	= ext4_sync_fs,
	.freeze_fs	= ext4_freeze,
	.unfreeze_fs	= ext4_unfreeze,
	.statfs		= ext4_statfs,
	.remount_fs	= ext4_remount,
	.show_options	= ext4_show_options,
#ifdef CONFIG_QUOTA
	/* Quota file I/O hooks exist only when quota support is built in. */
	.quota_read	= ext4_quota_read,
	.quota_write	= ext4_quota_write,
#endif
	.bdev_try_to_free_page = bdev_try_to_free_page,
};
1276 
/*
 * Variant of the super operations table that supplies write_super and
 * omits the sync_fs, freeze_fs and unfreeze_fs handlers; all other
 * entries match ext4_sops.
 * NOTE(review): presumably installed when no journal is in use, going by
 * the name -- confirm at the point where s_op is set.
 */
static const struct super_operations ext4_nojournal_sops = {
	.alloc_inode	= ext4_alloc_inode,
	.destroy_inode	= ext4_destroy_inode,
	.write_inode	= ext4_write_inode,
	.dirty_inode	= ext4_dirty_inode,
	.drop_inode	= ext4_drop_inode,
	.evict_inode	= ext4_evict_inode,
	.write_super	= ext4_write_super,
	.put_super	= ext4_put_super,
	.statfs		= ext4_statfs,
	.remount_fs	= ext4_remount,
	.show_options	= ext4_show_options,
#ifdef CONFIG_QUOTA
	/* Quota file I/O hooks exist only when quota support is built in. */
	.quota_read	= ext4_quota_read,
	.quota_write	= ext4_quota_write,
#endif
	.bdev_try_to_free_page = bdev_try_to_free_page,
};
1295 
/*
 * Export operations: file-handle decoding goes through the
 * ext4_fh_to_dentry()/ext4_fh_to_parent() wrappers, and parent lookup
 * through ext4_get_parent().
 */
static const struct export_operations ext4_export_ops = {
	.fh_to_dentry = ext4_fh_to_dentry,
	.fh_to_parent = ext4_fh_to_parent,
	.get_parent = ext4_get_parent,
};
1301 
/*
 * Identifiers for the mount options understood by parse_options().  The
 * "tokens" table maps option strings onto these values; Opt_err is the
 * value of the table's terminating (NULL pattern) entry.
 */
enum {
	Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
	Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
	Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
	Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
	Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_nobh, Opt_bh,
	Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
	Opt_journal_update, Opt_journal_dev,
	Opt_journal_checksum, Opt_journal_async_commit,
	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
	Opt_data_err_abort, Opt_data_err_ignore,
	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
	Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err,
	Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version,
	Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
	Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
	Opt_inode_readahead_blks, Opt_journal_ioprio,
	Opt_dioread_nolock, Opt_dioread_lock,
	Opt_discard, Opt_nodiscard,
	Opt_init_inode_table, Opt_noinit_inode_table,
};
1324 
/*
 * Option-string patterns handed to match_token() by parse_options().
 * Several options have aliases that map to the same token (for example
 * "grpid"/"bsdgroups" and "noload"/"norecovery"), and options with an
 * optional argument are listed both with and without "=%u".
 */
static const match_table_t tokens = {
	{Opt_bsd_df, "bsddf"},
	{Opt_minix_df, "minixdf"},
	{Opt_grpid, "grpid"},
	{Opt_grpid, "bsdgroups"},
	{Opt_nogrpid, "nogrpid"},
	{Opt_nogrpid, "sysvgroups"},
	{Opt_resgid, "resgid=%u"},
	{Opt_resuid, "resuid=%u"},
	{Opt_sb, "sb=%u"},
	{Opt_err_cont, "errors=continue"},
	{Opt_err_panic, "errors=panic"},
	{Opt_err_ro, "errors=remount-ro"},
	{Opt_nouid32, "nouid32"},
	{Opt_debug, "debug"},
	{Opt_oldalloc, "oldalloc"},
	{Opt_orlov, "orlov"},
	{Opt_user_xattr, "user_xattr"},
	{Opt_nouser_xattr, "nouser_xattr"},
	{Opt_acl, "acl"},
	{Opt_noacl, "noacl"},
	{Opt_noload, "noload"},
	{Opt_noload, "norecovery"},
	{Opt_nobh, "nobh"},
	{Opt_bh, "bh"},
	{Opt_commit, "commit=%u"},
	{Opt_min_batch_time, "min_batch_time=%u"},
	{Opt_max_batch_time, "max_batch_time=%u"},
	{Opt_journal_update, "journal=update"},
	{Opt_journal_dev, "journal_dev=%u"},
	{Opt_journal_checksum, "journal_checksum"},
	{Opt_journal_async_commit, "journal_async_commit"},
	{Opt_abort, "abort"},
	{Opt_data_journal, "data=journal"},
	{Opt_data_ordered, "data=ordered"},
	{Opt_data_writeback, "data=writeback"},
	{Opt_data_err_abort, "data_err=abort"},
	{Opt_data_err_ignore, "data_err=ignore"},
	/* An empty "usrjquota="/"grpjquota=" turns the quota file name off. */
	{Opt_offusrjquota, "usrjquota="},
	{Opt_usrjquota, "usrjquota=%s"},
	{Opt_offgrpjquota, "grpjquota="},
	{Opt_grpjquota, "grpjquota=%s"},
	{Opt_jqfmt_vfsold, "jqfmt=vfsold"},
	{Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
	{Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
	{Opt_grpquota, "grpquota"},
	{Opt_noquota, "noquota"},
	{Opt_quota, "quota"},
	{Opt_usrquota, "usrquota"},
	{Opt_barrier, "barrier=%u"},
	{Opt_barrier, "barrier"},
	{Opt_nobarrier, "nobarrier"},
	{Opt_i_version, "i_version"},
	{Opt_stripe, "stripe=%u"},
	{Opt_resize, "resize"},
	{Opt_delalloc, "delalloc"},
	{Opt_nodelalloc, "nodelalloc"},
	{Opt_mblk_io_submit, "mblk_io_submit"},
	{Opt_nomblk_io_submit, "nomblk_io_submit"},
	{Opt_block_validity, "block_validity"},
	{Opt_noblock_validity, "noblock_validity"},
	{Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
	{Opt_journal_ioprio, "journal_ioprio=%u"},
	{Opt_auto_da_alloc, "auto_da_alloc=%u"},
	{Opt_auto_da_alloc, "auto_da_alloc"},
	{Opt_noauto_da_alloc, "noauto_da_alloc"},
	{Opt_dioread_nolock, "dioread_nolock"},
	{Opt_dioread_lock, "dioread_lock"},
	{Opt_discard, "discard"},
	{Opt_nodiscard, "nodiscard"},
	{Opt_init_inode_table, "init_itable=%u"},
	{Opt_init_inode_table, "init_itable"},
	{Opt_noinit_inode_table, "noinit_itable"},
	/* Terminator: returned for anything that matches no pattern above. */
	{Opt_err, NULL},
};
1400 
1401 static ext4_fsblk_t get_sb_block(void **data)
1402 {
1403 	ext4_fsblk_t	sb_block;
1404 	char		*options = (char *) *data;
1405 
1406 	if (!options || strncmp(options, "sb=", 3) != 0)
1407 		return 1;	/* Default location */
1408 
1409 	options += 3;
1410 	/* TODO: use simple_strtoll with >32bit ext4 */
1411 	sb_block = simple_strtoul(options, &options, 0);
1412 	if (*options && *options != ',') {
1413 		printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n",
1414 		       (char *) *data);
1415 		return 1;
1416 	}
1417 	if (*options == ',')
1418 		options++;
1419 	*data = (void *) options;
1420 
1421 	return sb_block;
1422 }
1423 
/* Journal I/O priority constant: best-effort class, priority level 3. */
#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
/*
 * Format string for deprecated mount option warnings; takes the option
 * text and the release by which the option is to be removed.
 */
static char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n"
	"Contact linux-ext4@vger.kernel.org if you think we should keep it.\n";
1427 
1428 #ifdef CONFIG_QUOTA
1429 static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
1430 {
1431 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1432 	char *qname;
1433 
1434 	if (sb_any_quota_loaded(sb) &&
1435 		!sbi->s_qf_names[qtype]) {
1436 		ext4_msg(sb, KERN_ERR,
1437 			"Cannot change journaled "
1438 			"quota options when quota turned on");
1439 		return 0;
1440 	}
1441 	qname = match_strdup(args);
1442 	if (!qname) {
1443 		ext4_msg(sb, KERN_ERR,
1444 			"Not enough memory for storing quotafile name");
1445 		return 0;
1446 	}
1447 	if (sbi->s_qf_names[qtype] &&
1448 		strcmp(sbi->s_qf_names[qtype], qname)) {
1449 		ext4_msg(sb, KERN_ERR,
1450 			"%s quota file already specified", QTYPE2NAME(qtype));
1451 		kfree(qname);
1452 		return 0;
1453 	}
1454 	sbi->s_qf_names[qtype] = qname;
1455 	if (strchr(sbi->s_qf_names[qtype], '/')) {
1456 		ext4_msg(sb, KERN_ERR,
1457 			"quotafile must be on filesystem root");
1458 		kfree(sbi->s_qf_names[qtype]);
1459 		sbi->s_qf_names[qtype] = NULL;
1460 		return 0;
1461 	}
1462 	set_opt(sb, QUOTA);
1463 	return 1;
1464 }
1465 
1466 static int clear_qf_name(struct super_block *sb, int qtype)
1467 {
1468 
1469 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1470 
1471 	if (sb_any_quota_loaded(sb) &&
1472 		sbi->s_qf_names[qtype]) {
1473 		ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options"
1474 			" when quota turned on");
1475 		return 0;
1476 	}
1477 	/*
1478 	 * The space will be released later when all options are confirmed
1479 	 * to be correct
1480 	 */
1481 	sbi->s_qf_names[qtype] = NULL;
1482 	return 1;
1483 }
1484 #endif
1485 
/*
 * Parse a comma-separated mount option string and apply it to @sb.
 *
 * @options:        option string; it is split in place by strsep().  A
 *                  NULL string is accepted and treated as "no options".
 * @sb:             superblock whose ext4_sb_info is updated
 * @journal_devnum: receives the value of "journal_dev=<n>"
 * @journal_ioprio: receives the I/O priority built from
 *                  "journal_ioprio=<n>" when n is in the range 0..7
 * @n_blocks_count: receives the value of the "resize" option
 * @is_remount:     non-zero when called for a remount; some options are
 *                  only valid, or only invalid, in that case
 *
 * Returns 1 when every option was accepted and 0 on the first error
 * (unknown option, bad value, or an inconsistent quota configuration).
 * Options handled before the failing one have already been applied.
 */
static int parse_options(char *options, struct super_block *sb,
			 unsigned long *journal_devnum,
			 unsigned int *journal_ioprio,
			 ext4_fsblk_t *n_blocks_count, int is_remount)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	char *p;
	substring_t args[MAX_OPT_ARGS];
	int data_opt = 0;
	int option;
#ifdef CONFIG_QUOTA
	int qfmt;
#endif

	if (!options)
		return 1;

	while ((p = strsep(&options, ",")) != NULL) {
		int token;
		/* Skip empty fields such as those produced by ",,". */
		if (!*p)
			continue;

		/*
		 * Initialize args struct so we know whether arg was
		 * found; some options take optional arguments.
		 */
		args[0].to = args[0].from = NULL;
		token = match_token(p, tokens, args);
		switch (token) {
		case Opt_bsd_df:
			ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
			clear_opt(sb, MINIX_DF);
			break;
		case Opt_minix_df:
			ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
			set_opt(sb, MINIX_DF);

			break;
		case Opt_grpid:
			ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
			set_opt(sb, GRPID);

			break;
		case Opt_nogrpid:
			ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
			clear_opt(sb, GRPID);

			break;
		case Opt_resuid:
			if (match_int(&args[0], &option))
				return 0;
			sbi->s_resuid = option;
			break;
		case Opt_resgid:
			if (match_int(&args[0], &option))
				return 0;
			sbi->s_resgid = option;
			break;
		case Opt_sb:
			/* handled by get_sb_block() instead of here */
			/* *sb_block = match_int(&args[0]); */
			break;
		/* The three errors= modes are mutually exclusive. */
		case Opt_err_panic:
			clear_opt(sb, ERRORS_CONT);
			clear_opt(sb, ERRORS_RO);
			set_opt(sb, ERRORS_PANIC);
			break;
		case Opt_err_ro:
			clear_opt(sb, ERRORS_CONT);
			clear_opt(sb, ERRORS_PANIC);
			set_opt(sb, ERRORS_RO);
			break;
		case Opt_err_cont:
			clear_opt(sb, ERRORS_RO);
			clear_opt(sb, ERRORS_PANIC);
			set_opt(sb, ERRORS_CONT);
			break;
		case Opt_nouid32:
			set_opt(sb, NO_UID32);
			break;
		case Opt_debug:
			set_opt(sb, DEBUG);
			break;
		case Opt_oldalloc:
			set_opt(sb, OLDALLOC);
			break;
		case Opt_orlov:
			clear_opt(sb, OLDALLOC);
			break;
#ifdef CONFIG_EXT4_FS_XATTR
		case Opt_user_xattr:
			set_opt(sb, XATTR_USER);
			break;
		case Opt_nouser_xattr:
			clear_opt(sb, XATTR_USER);
			break;
#else
		/* Without xattr support the options are logged and ignored. */
		case Opt_user_xattr:
		case Opt_nouser_xattr:
			ext4_msg(sb, KERN_ERR, "(no)user_xattr options not supported");
			break;
#endif
#ifdef CONFIG_EXT4_FS_POSIX_ACL
		case Opt_acl:
			set_opt(sb, POSIX_ACL);
			break;
		case Opt_noacl:
			clear_opt(sb, POSIX_ACL);
			break;
#else
		/* Without ACL support the options are logged and ignored. */
		case Opt_acl:
		case Opt_noacl:
			ext4_msg(sb, KERN_ERR, "(no)acl options not supported");
			break;
#endif
		case Opt_journal_update:
			/* @@@ FIXME */
			/* Eventually we will want to be able to create
			   a journal file here.  For now, only allow the
			   user to specify an existing inode to be the
			   journal file. */
			if (is_remount) {
				ext4_msg(sb, KERN_ERR,
					 "Cannot specify journal on remount");
				return 0;
			}
			set_opt(sb, UPDATE_JOURNAL);
			break;
		case Opt_journal_dev:
			if (is_remount) {
				ext4_msg(sb, KERN_ERR,
					"Cannot specify journal on remount");
				return 0;
			}
			if (match_int(&args[0], &option))
				return 0;
			*journal_devnum = option;
			break;
		case Opt_journal_checksum:
			set_opt(sb, JOURNAL_CHECKSUM);
			break;
		case Opt_journal_async_commit:
			/* Async commit also turns on journal checksumming. */
			set_opt(sb, JOURNAL_ASYNC_COMMIT);
			set_opt(sb, JOURNAL_CHECKSUM);
			break;
		case Opt_noload:
			set_opt(sb, NOLOAD);
			break;
		case Opt_commit:
			if (match_int(&args[0], &option))
				return 0;
			if (option < 0)
				return 0;
			/* commit=0 selects the jbd2 default interval. */
			if (option == 0)
				option = JBD2_DEFAULT_MAX_COMMIT_AGE;
			sbi->s_commit_interval = HZ * option;
			break;
		case Opt_max_batch_time:
			if (match_int(&args[0], &option))
				return 0;
			if (option < 0)
				return 0;
			/* max_batch_time=0 selects the default value. */
			if (option == 0)
				option = EXT4_DEF_MAX_BATCH_TIME;
			sbi->s_max_batch_time = option;
			break;
		case Opt_min_batch_time:
			if (match_int(&args[0], &option))
				return 0;
			if (option < 0)
				return 0;
			sbi->s_min_batch_time = option;
			break;
		/*
		 * The three data= modes share the code at "datacheck":
		 * the first two jump to it, the last falls into it.
		 */
		case Opt_data_journal:
			data_opt = EXT4_MOUNT_JOURNAL_DATA;
			goto datacheck;
		case Opt_data_ordered:
			data_opt = EXT4_MOUNT_ORDERED_DATA;
			goto datacheck;
		case Opt_data_writeback:
			data_opt = EXT4_MOUNT_WRITEBACK_DATA;
		datacheck:
			if (is_remount) {
				/* Restating the current mode is allowed. */
				if (test_opt(sb, DATA_FLAGS) != data_opt) {
					ext4_msg(sb, KERN_ERR,
						"Cannot change data mode on remount");
					return 0;
				}
			} else {
				clear_opt(sb, DATA_FLAGS);
				sbi->s_mount_opt |= data_opt;
			}
			break;
		case Opt_data_err_abort:
			set_opt(sb, DATA_ERR_ABORT);
			break;
		case Opt_data_err_ignore:
			clear_opt(sb, DATA_ERR_ABORT);
			break;
#ifdef CONFIG_QUOTA
		case Opt_usrjquota:
			if (!set_qf_name(sb, USRQUOTA, &args[0]))
				return 0;
			break;
		case Opt_grpjquota:
			if (!set_qf_name(sb, GRPQUOTA, &args[0]))
				return 0;
			break;
		case Opt_offusrjquota:
			if (!clear_qf_name(sb, USRQUOTA))
				return 0;
			break;
		case Opt_offgrpjquota:
			if (!clear_qf_name(sb, GRPQUOTA))
				return 0;
			break;

		/*
		 * The jqfmt= variants share the code at "set_qf_format",
		 * in the same way as the data= modes above.
		 */
		case Opt_jqfmt_vfsold:
			qfmt = QFMT_VFS_OLD;
			goto set_qf_format;
		case Opt_jqfmt_vfsv0:
			qfmt = QFMT_VFS_V0;
			goto set_qf_format;
		case Opt_jqfmt_vfsv1:
			qfmt = QFMT_VFS_V1;
set_qf_format:
			if (sb_any_quota_loaded(sb) &&
			    sbi->s_jquota_fmt != qfmt) {
				ext4_msg(sb, KERN_ERR, "Cannot change "
					"journaled quota options when "
					"quota turned on");
				return 0;
			}
			sbi->s_jquota_fmt = qfmt;
			break;
		case Opt_quota:
		case Opt_usrquota:
			set_opt(sb, QUOTA);
			set_opt(sb, USRQUOTA);
			break;
		case Opt_grpquota:
			set_opt(sb, QUOTA);
			set_opt(sb, GRPQUOTA);
			break;
		case Opt_noquota:
			if (sb_any_quota_loaded(sb)) {
				ext4_msg(sb, KERN_ERR, "Cannot change quota "
					"options when quota turned on");
				return 0;
			}
			clear_opt(sb, QUOTA);
			clear_opt(sb, USRQUOTA);
			clear_opt(sb, GRPQUOTA);
			break;
#else
		/* Without quota support the options are logged and ignored. */
		case Opt_quota:
		case Opt_usrquota:
		case Opt_grpquota:
			ext4_msg(sb, KERN_ERR,
				"quota options not supported");
			break;
		case Opt_usrjquota:
		case Opt_grpjquota:
		case Opt_offusrjquota:
		case Opt_offgrpjquota:
		case Opt_jqfmt_vfsold:
		case Opt_jqfmt_vfsv0:
		case Opt_jqfmt_vfsv1:
			ext4_msg(sb, KERN_ERR,
				"journaled quota options not supported");
			break;
		case Opt_noquota:
			break;
#endif
		case Opt_abort:
			sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
			break;
		case Opt_nobarrier:
			clear_opt(sb, BARRIER);
			break;
		case Opt_barrier:
			if (args[0].from) {
				if (match_int(&args[0], &option))
					return 0;
			} else
				option = 1;	/* No argument, default to 1 */
			if (option)
				set_opt(sb, BARRIER);
			else
				clear_opt(sb, BARRIER);
			break;
		case Opt_ignore:
			break;
		case Opt_resize:
			if (!is_remount) {
				ext4_msg(sb, KERN_ERR,
					"resize option only available "
					"for remount");
				return 0;
			}
			/*
			 * NOTE(review): the "resize" pattern in the token
			 * table has no "=%u", so args[0] looks to be empty
			 * here -- confirm this option can ever succeed.
			 */
			if (match_int(&args[0], &option) != 0)
				return 0;
			*n_blocks_count = option;
			break;
		case Opt_nobh:
			ext4_msg(sb, KERN_WARNING,
				 "Ignoring deprecated nobh option");
			break;
		case Opt_bh:
			ext4_msg(sb, KERN_WARNING,
				 "Ignoring deprecated bh option");
			break;
		case Opt_i_version:
			set_opt(sb, I_VERSION);
			sb->s_flags |= MS_I_VERSION;
			break;
		case Opt_nodelalloc:
			clear_opt(sb, DELALLOC);
			break;
		case Opt_mblk_io_submit:
			set_opt(sb, MBLK_IO_SUBMIT);
			break;
		case Opt_nomblk_io_submit:
			clear_opt(sb, MBLK_IO_SUBMIT);
			break;
		case Opt_stripe:
			if (match_int(&args[0], &option))
				return 0;
			if (option < 0)
				return 0;
			sbi->s_stripe = option;
			break;
		case Opt_delalloc:
			set_opt(sb, DELALLOC);
			break;
		case Opt_block_validity:
			set_opt(sb, BLOCK_VALIDITY);
			break;
		case Opt_noblock_validity:
			clear_opt(sb, BLOCK_VALIDITY);
			break;
		case Opt_inode_readahead_blks:
			if (match_int(&args[0], &option))
				return 0;
			if (option < 0 || option > (1 << 30))
				return 0;
			/*
			 * Zero is accepted as-is; anything else must be a
			 * power of two.
			 * NOTE(review): the message below carries its own
			 * "EXT4-fs: " prefix, unlike the other ext4_msg()
			 * calls in this function -- check for a doubled
			 * prefix in the log output.
			 */
			if (option && !is_power_of_2(option)) {
				ext4_msg(sb, KERN_ERR,
					 "EXT4-fs: inode_readahead_blks"
					 " must be a power of 2");
				return 0;
			}
			sbi->s_inode_readahead_blks = option;
			break;
		case Opt_journal_ioprio:
			if (match_int(&args[0], &option))
				return 0;
			/*
			 * Out-of-range priorities are ignored without an
			 * error or a message.
			 * NOTE(review): confirm that is intended.
			 */
			if (option < 0 || option > 7)
				break;
			*journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE,
							    option);
			break;
		case Opt_noauto_da_alloc:
			set_opt(sb, NO_AUTO_DA_ALLOC);
			break;
		case Opt_auto_da_alloc:
			if (args[0].from) {
				if (match_int(&args[0], &option))
					return 0;
			} else
				option = 1;	/* No argument, default to 1 */
			/* The stored flag is the negation of the option. */
			if (option)
				clear_opt(sb, NO_AUTO_DA_ALLOC);
			else
				set_opt(sb,NO_AUTO_DA_ALLOC);
			break;
		case Opt_discard:
			set_opt(sb, DISCARD);
			break;
		case Opt_nodiscard:
			clear_opt(sb, DISCARD);
			break;
		case Opt_dioread_nolock:
			set_opt(sb, DIOREAD_NOLOCK);
			break;
		case Opt_dioread_lock:
			clear_opt(sb, DIOREAD_NOLOCK);
			break;
		case Opt_init_inode_table:
			set_opt(sb, INIT_INODE_TABLE);
			if (args[0].from) {
				if (match_int(&args[0], &option))
					return 0;
			} else
				option = EXT4_DEF_LI_WAIT_MULT;
			if (option < 0)
				return 0;
			sbi->s_li_wait_mult = option;
			break;
		case Opt_noinit_inode_table:
			clear_opt(sb, INIT_INODE_TABLE);
			break;
		default:
			ext4_msg(sb, KERN_ERR,
			       "Unrecognized mount option \"%s\" "
			       "or missing value", p);
			return 0;
		}
	}
#ifdef CONFIG_QUOTA
	/*
	 * Cross-check the quota options once the whole string has been
	 * parsed: a journaled quota file name replaces the plain flag for
	 * the same type, the two styles must not be mixed across types,
	 * and a journaled quota format must be given exactly when a
	 * journaled quota file name is set.
	 */
	if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
		if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
			clear_opt(sb, USRQUOTA);

		if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
			clear_opt(sb, GRPQUOTA);

		if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
			ext4_msg(sb, KERN_ERR, "old and new quota "
					"format mixing");
			return 0;
		}

		if (!sbi->s_jquota_fmt) {
			ext4_msg(sb, KERN_ERR, "journaled quota format "
					"not specified");
			return 0;
		}
	} else {
		if (sbi->s_jquota_fmt) {
			ext4_msg(sb, KERN_ERR, "journaled quota format "
					"specified with no journaling "
					"enabled");
			return 0;
		}
	}
#endif
	return 1;
}
1925 
1926 static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1927 			    int read_only)
1928 {
1929 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1930 	int res = 0;
1931 
1932 	if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
1933 		ext4_msg(sb, KERN_ERR, "revision level too high, "
1934 			 "forcing read-only mode");
1935 		res = MS_RDONLY;
1936 	}
1937 	if (read_only)
1938 		return res;
1939 	if (!(sbi->s_mount_state & EXT4_VALID_FS))
1940 		ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "
1941 			 "running e2fsck is recommended");
1942 	else if ((sbi->s_mount_state & EXT4_ERROR_FS))
1943 		ext4_msg(sb, KERN_WARNING,
1944 			 "warning: mounting fs with errors, "
1945 			 "running e2fsck is recommended");
1946 	else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
1947 		 le16_to_cpu(es->s_mnt_count) >=
1948 		 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
1949 		ext4_msg(sb, KERN_WARNING,
1950 			 "warning: maximal mount count reached, "
1951 			 "running e2fsck is recommended");
1952 	else if (le32_to_cpu(es->s_checkinterval) &&
1953 		(le32_to_cpu(es->s_lastcheck) +
1954 			le32_to_cpu(es->s_checkinterval) <= get_seconds()))
1955 		ext4_msg(sb, KERN_WARNING,
1956 			 "warning: checktime reached, "
1957 			 "running e2fsck is recommended");
1958 	if (!sbi->s_journal)
1959 		es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
1960 	if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
1961 		es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
1962 	le16_add_cpu(&es->s_mnt_count, 1);
1963 	es->s_mtime = cpu_to_le32(get_seconds());
1964 	ext4_update_dynamic_rev(sb);
1965 	if (sbi->s_journal)
1966 		EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
1967 
1968 	ext4_commit_super(sb, 1);
1969 	if (test_opt(sb, DEBUG))
1970 		printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
1971 				"bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n",
1972 			sb->s_blocksize,
1973 			sbi->s_groups_count,
1974 			EXT4_BLOCKS_PER_GROUP(sb),
1975 			EXT4_INODES_PER_GROUP(sb),
1976 			sbi->s_mount_opt, sbi->s_mount_opt2);
1977 
1978 	cleancache_init_fs(sb);
1979 	return res;
1980 }
1981 
1982 static int ext4_fill_flex_info(struct super_block *sb)
1983 {
1984 	struct ext4_sb_info *sbi = EXT4_SB(sb);
1985 	struct ext4_group_desc *gdp = NULL;
1986 	ext4_group_t flex_group_count;
1987 	ext4_group_t flex_group;
1988 	int groups_per_flex = 0;
1989 	size_t size;
1990 	int i;
1991 
1992 	sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
1993 	groups_per_flex = 1 << sbi->s_log_groups_per_flex;
1994 
1995 	if (groups_per_flex < 2) {
1996 		sbi->s_log_groups_per_flex = 0;
1997 		return 1;
1998 	}
1999 
2000 	/* We allocate both existing and potentially added groups */
2001 	flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
2002 			((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
2003 			      EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex;
2004 	size = flex_group_count * sizeof(struct flex_groups);
2005 	sbi->s_flex_groups = ext4_kvzalloc(size, GFP_KERNEL);
2006 	if (sbi->s_flex_groups == NULL) {
2007 		ext4_msg(sb, KERN_ERR, "not enough memory for %u flex groups",
2008 			 flex_group_count);
2009 		goto failed;
2010 	}
2011 
2012 	for (i = 0; i < sbi->s_groups_count; i++) {
2013 		gdp = ext4_get_group_desc(sb, i, NULL);
2014 
2015 		flex_group = ext4_flex_group(sbi, i);
2016 		atomic_add(ext4_free_inodes_count(sb, gdp),
2017 			   &sbi->s_flex_groups[flex_group].free_inodes);
2018 		atomic_add(ext4_free_blks_count(sb, gdp),
2019 			   &sbi->s_flex_groups[flex_group].free_blocks);
2020 		atomic_add(ext4_used_dirs_count(sb, gdp),
2021 			   &sbi->s_flex_groups[flex_group].used_dirs);
2022 	}
2023 
2024 	return 1;
2025 failed:
2026 	return 0;
2027 }
2028 
2029 __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
2030 			    struct ext4_group_desc *gdp)
2031 {
2032 	__u16 crc = 0;
2033 
2034 	if (sbi->s_es->s_feature_ro_compat &
2035 	    cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
2036 		int offset = offsetof(struct ext4_group_desc, bg_checksum);
2037 		__le32 le_group = cpu_to_le32(block_group);
2038 
2039 		crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
2040 		crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group));
2041 		crc = crc16(crc, (__u8 *)gdp, offset);
2042 		offset += sizeof(gdp->bg_checksum); /* skip checksum */
2043 		/* for checksum of struct ext4_group_desc do the rest...*/
2044 		if ((sbi->s_es->s_feature_incompat &
2045 		     cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) &&
2046 		    offset < le16_to_cpu(sbi->s_es->s_desc_size))
2047 			crc = crc16(crc, (__u8 *)gdp + offset,
2048 				    le16_to_cpu(sbi->s_es->s_desc_size) -
2049 					offset);
2050 	}
2051 
2052 	return cpu_to_le16(crc);
2053 }
2054 
2055 int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 block_group,
2056 				struct ext4_group_desc *gdp)
2057 {
2058 	if ((sbi->s_es->s_feature_ro_compat &
2059 	     cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) &&
2060 	    (gdp->bg_checksum != ext4_group_desc_csum(sbi, block_group, gdp)))
2061 		return 0;
2062 
2063 	return 1;
2064 }
2065 
2066 /* Called at mount-time, super-block is locked */
2067 static int ext4_check_descriptors(struct super_block *sb,
2068 				  ext4_group_t *first_not_zeroed)
2069 {
2070 	struct ext4_sb_info *sbi = EXT4_SB(sb);
2071 	ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
2072 	ext4_fsblk_t last_block;
2073 	ext4_fsblk_t block_bitmap;
2074 	ext4_fsblk_t inode_bitmap;
2075 	ext4_fsblk_t inode_table;
2076 	int flexbg_flag = 0;
2077 	ext4_group_t i, grp = sbi->s_groups_count;
2078 
2079 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
2080 		flexbg_flag = 1;
2081 
2082 	ext4_debug("Checking group descriptors");
2083 
2084 	for (i = 0; i < sbi->s_groups_count; i++) {
2085 		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
2086 
2087 		if (i == sbi->s_groups_count - 1 || flexbg_flag)
2088 			last_block = ext4_blocks_count(sbi->s_es) - 1;
2089 		else
2090 			last_block = first_block +
2091 				(EXT4_BLOCKS_PER_GROUP(sb) - 1);
2092 
2093 		if ((grp == sbi->s_groups_count) &&
2094 		   !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
2095 			grp = i;
2096 
2097 		block_bitmap = ext4_block_bitmap(sb, gdp);
2098 		if (block_bitmap < first_block || block_bitmap > last_block) {
2099 			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2100 			       "Block bitmap for group %u not in group "
2101 			       "(block %llu)!", i, block_bitmap);
2102 			return 0;
2103 		}
2104 		inode_bitmap = ext4_inode_bitmap(sb, gdp);
2105 		if (inode_bitmap < first_block || inode_bitmap > last_block) {
2106 			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2107 			       "Inode bitmap for group %u not in group "
2108 			       "(block %llu)!", i, inode_bitmap);
2109 			return 0;
2110 		}
2111 		inode_table = ext4_inode_table(sb, gdp);
2112 		if (inode_table < first_block ||
2113 		    inode_table + sbi->s_itb_per_group - 1 > last_block) {
2114 			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2115 			       "Inode table for group %u not in group "
2116 			       "(block %llu)!", i, inode_table);
2117 			return 0;
2118 		}
2119 		ext4_lock_group(sb, i);
2120 		if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
2121 			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2122 				 "Checksum for group %u failed (%u!=%u)",
2123 				 i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
2124 				     gdp)), le16_to_cpu(gdp->bg_checksum));
2125 			if (!(sb->s_flags & MS_RDONLY)) {
2126 				ext4_unlock_group(sb, i);
2127 				return 0;
2128 			}
2129 		}
2130 		ext4_unlock_group(sb, i);
2131 		if (!flexbg_flag)
2132 			first_block += EXT4_BLOCKS_PER_GROUP(sb);
2133 	}
2134 	if (NULL != first_not_zeroed)
2135 		*first_not_zeroed = grp;
2136 
2137 	ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
2138 	sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb));
2139 	return 1;
2140 }
2141 
2142 /* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at
2143  * the superblock) which were deleted from all directories, but held open by
2144  * a process at the time of a crash.  We walk the list and try to delete these
2145  * inodes at recovery time (only with a read-write filesystem).
2146  *
2147  * In order to keep the orphan inode chain consistent during traversal (in
2148  * case of crash during recovery), we link each inode into the superblock
2149  * orphan list_head and handle it the same way as an inode deletion during
2150  * normal operation (which journals the operations for us).
2151  *
2152  * We only do an iget() and an iput() on each inode, which is very safe if we
2153  * accidentally point at an in-use or already deleted inode.  The worst that
2154  * can happen in this case is that we get a "bit already cleared" message from
2155  * ext4_free_inode().  The only reason we would point at a wrong inode is if
2156  * e2fsck was run on this filesystem, and it must have already done the orphan
2157  * inode cleanup for us, so we can safely abort without any further action.
2158  */
2159 static void ext4_orphan_cleanup(struct super_block *sb,
2160 				struct ext4_super_block *es)
2161 {
2162 	unsigned int s_flags = sb->s_flags;
2163 	int nr_orphans = 0, nr_truncates = 0;
2164 #ifdef CONFIG_QUOTA
2165 	int i;
2166 #endif
2167 	if (!es->s_last_orphan) {
2168 		jbd_debug(4, "no orphan inodes to clean up\n");
2169 		return;
2170 	}
2171 
2172 	if (bdev_read_only(sb->s_bdev)) {
2173 		ext4_msg(sb, KERN_ERR, "write access "
2174 			"unavailable, skipping orphan cleanup");
2175 		return;
2176 	}
2177 
2178 	/* Check if feature set would not allow a r/w mount */
2179 	if (!ext4_feature_set_ok(sb, 0)) {
2180 		ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
2181 			 "unknown ROCOMPAT features");
2182 		return;
2183 	}
2184 
2185 	if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
2186 		if (es->s_last_orphan)
2187 			jbd_debug(1, "Errors on filesystem, "
2188 				  "clearing orphan list.\n");
2189 		es->s_last_orphan = 0;
2190 		jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
2191 		return;
2192 	}
2193 
2194 	if (s_flags & MS_RDONLY) {
2195 		ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
2196 		sb->s_flags &= ~MS_RDONLY;
2197 	}
2198 #ifdef CONFIG_QUOTA
2199 	/* Needed for iput() to work correctly and not trash data */
2200 	sb->s_flags |= MS_ACTIVE;
2201 	/* Turn on quotas so that they are updated correctly */
2202 	for (i = 0; i < MAXQUOTAS; i++) {
2203 		if (EXT4_SB(sb)->s_qf_names[i]) {
2204 			int ret = ext4_quota_on_mount(sb, i);
2205 			if (ret < 0)
2206 				ext4_msg(sb, KERN_ERR,
2207 					"Cannot turn on journaled "
2208 					"quota: error %d", ret);
2209 		}
2210 	}
2211 #endif
2212 
2213 	while (es->s_last_orphan) {
2214 		struct inode *inode;
2215 
2216 		inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan));
2217 		if (IS_ERR(inode)) {
2218 			es->s_last_orphan = 0;
2219 			break;
2220 		}
2221 
2222 		list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
2223 		dquot_initialize(inode);
2224 		if (inode->i_nlink) {
2225 			ext4_msg(sb, KERN_DEBUG,
2226 				"%s: truncating inode %lu to %lld bytes",
2227 				__func__, inode->i_ino, inode->i_size);
2228 			jbd_debug(2, "truncating inode %lu to %lld bytes\n",
2229 				  inode->i_ino, inode->i_size);
2230 			ext4_truncate(inode);
2231 			nr_truncates++;
2232 		} else {
2233 			ext4_msg(sb, KERN_DEBUG,
2234 				"%s: deleting unreferenced inode %lu",
2235 				__func__, inode->i_ino);
2236 			jbd_debug(2, "deleting unreferenced inode %lu\n",
2237 				  inode->i_ino);
2238 			nr_orphans++;
2239 		}
2240 		iput(inode);  /* The delete magic happens here! */
2241 	}
2242 
2243 #define PLURAL(x) (x), ((x) == 1) ? "" : "s"
2244 
2245 	if (nr_orphans)
2246 		ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
2247 		       PLURAL(nr_orphans));
2248 	if (nr_truncates)
2249 		ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
2250 		       PLURAL(nr_truncates));
2251 #ifdef CONFIG_QUOTA
2252 	/* Turn quotas off */
2253 	for (i = 0; i < MAXQUOTAS; i++) {
2254 		if (sb_dqopt(sb)->files[i])
2255 			dquot_quota_off(sb, i);
2256 	}
2257 #endif
2258 	sb->s_flags = s_flags; /* Restore MS_RDONLY status */
2259 }
2260 
2261 /*
2262  * Maximal extent format file size.
2263  * Resulting logical blkno at s_maxbytes must fit in our on-disk
2264  * extent format containers, within a sector_t, and within i_blocks
2265  * in the vfs.  ext4 inode has 48 bits of i_block in fsblock units,
2266  * so that won't be a limiting factor.
2267  *
2268  * However there is other limiting factor. We do store extents in the form
2269  * of starting block and length, hence the resulting length of the extent
2270  * covering maximum file size must fit into on-disk format containers as
2271  * well. Given that length is always by 1 unit bigger than max unit (because
2272  * we count 0 as well) we have to lower the s_maxbytes by one fs block.
2273  *
2274  * Note, this does *not* consider any metadata overhead for vfs i_blocks.
2275  */
2276 static loff_t ext4_max_size(int blkbits, int has_huge_files)
2277 {
2278 	loff_t res;
2279 	loff_t upper_limit = MAX_LFS_FILESIZE;
2280 
2281 	/* small i_blocks in vfs inode? */
2282 	if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
2283 		/*
2284 		 * CONFIG_LBDAF is not enabled implies the inode
2285 		 * i_block represent total blocks in 512 bytes
2286 		 * 32 == size of vfs inode i_blocks * 8
2287 		 */
2288 		upper_limit = (1LL << 32) - 1;
2289 
2290 		/* total blocks in file system block size */
2291 		upper_limit >>= (blkbits - 9);
2292 		upper_limit <<= blkbits;
2293 	}
2294 
2295 	/*
2296 	 * 32-bit extent-start container, ee_block. We lower the maxbytes
2297 	 * by one fs block, so ee_len can cover the extent of maximum file
2298 	 * size
2299 	 */
2300 	res = (1LL << 32) - 1;
2301 	res <<= blkbits;
2302 
2303 	/* Sanity check against vm- & vfs- imposed limits */
2304 	if (res > upper_limit)
2305 		res = upper_limit;
2306 
2307 	return res;
2308 }
2309 
2310 /*
2311  * Maximal bitmap file size.  There is a direct, and {,double-,triple-}indirect
2312  * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks.
2313  * We need to be 1 filesystem block less than the 2^48 sector limit.
2314  */
2315 static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
2316 {
2317 	loff_t res = EXT4_NDIR_BLOCKS;
2318 	int meta_blocks;
2319 	loff_t upper_limit;
2320 	/* This is calculated to be the largest file size for a dense, block
2321 	 * mapped file such that the file's total number of 512-byte sectors,
2322 	 * including data and all indirect blocks, does not exceed (2^48 - 1).
2323 	 *
2324 	 * __u32 i_blocks_lo and _u16 i_blocks_high represent the total
2325 	 * number of 512-byte sectors of the file.
2326 	 */
2327 
2328 	if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
2329 		/*
2330 		 * !has_huge_files or CONFIG_LBDAF not enabled implies that
2331 		 * the inode i_block field represents total file blocks in
2332 		 * 2^32 512-byte sectors == size of vfs inode i_blocks * 8
2333 		 */
2334 		upper_limit = (1LL << 32) - 1;
2335 
2336 		/* total blocks in file system block size */
2337 		upper_limit >>= (bits - 9);
2338 
2339 	} else {
2340 		/*
2341 		 * We use 48 bit ext4_inode i_blocks
2342 		 * With EXT4_HUGE_FILE_FL set the i_blocks
2343 		 * represent total number of blocks in
2344 		 * file system block size
2345 		 */
2346 		upper_limit = (1LL << 48) - 1;
2347 
2348 	}
2349 
2350 	/* indirect blocks */
2351 	meta_blocks = 1;
2352 	/* double indirect blocks */
2353 	meta_blocks += 1 + (1LL << (bits-2));
2354 	/* tripple indirect blocks */
2355 	meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
2356 
2357 	upper_limit -= meta_blocks;
2358 	upper_limit <<= bits;
2359 
2360 	res += 1LL << (bits-2);
2361 	res += 1LL << (2*(bits-2));
2362 	res += 1LL << (3*(bits-2));
2363 	res <<= bits;
2364 	if (res > upper_limit)
2365 		res = upper_limit;
2366 
2367 	if (res > MAX_LFS_FILESIZE)
2368 		res = MAX_LFS_FILESIZE;
2369 
2370 	return res;
2371 }
2372 
2373 static ext4_fsblk_t descriptor_loc(struct super_block *sb,
2374 				   ext4_fsblk_t logical_sb_block, int nr)
2375 {
2376 	struct ext4_sb_info *sbi = EXT4_SB(sb);
2377 	ext4_group_t bg, first_meta_bg;
2378 	int has_super = 0;
2379 
2380 	first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
2381 
2382 	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
2383 	    nr < first_meta_bg)
2384 		return logical_sb_block + nr + 1;
2385 	bg = sbi->s_desc_per_block * nr;
2386 	if (ext4_bg_has_super(sb, bg))
2387 		has_super = 1;
2388 
2389 	return (has_super + ext4_group_first_block_no(sb, bg));
2390 }
2391 
2392 /**
2393  * ext4_get_stripe_size: Get the stripe size.
2394  * @sbi: In memory super block info
2395  *
2396  * If we have specified it via mount option, then
2397  * use the mount option value. If the value specified at mount time is
2398  * greater than the blocks per group use the super block value.
2399  * If the super block value is greater than blocks per group return 0.
2400  * Allocator needs it be less than blocks per group.
2401  *
2402  */
2403 static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
2404 {
2405 	unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride);
2406 	unsigned long stripe_width =
2407 			le32_to_cpu(sbi->s_es->s_raid_stripe_width);
2408 	int ret;
2409 
2410 	if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group)
2411 		ret = sbi->s_stripe;
2412 	else if (stripe_width <= sbi->s_blocks_per_group)
2413 		ret = stripe_width;
2414 	else if (stride <= sbi->s_blocks_per_group)
2415 		ret = stride;
2416 	else
2417 		ret = 0;
2418 
2419 	/*
2420 	 * If the stripe width is 1, this makes no sense and
2421 	 * we set it to 0 to turn off stripe handling code.
2422 	 */
2423 	if (ret <= 1)
2424 		ret = 0;
2425 
2426 	return ret;
2427 }
2428 
2429 /* sysfs support */
2430 
/*
 * One ext4 sysfs attribute.
 *
 * attr:   the generic sysfs attribute (name and mode).
 * show:   formats the value into the supplied buffer; may be NULL.
 * store:  parses a value from the supplied buffer; may be NULL.
 * offset: byte offset of the backing field inside struct ext4_sb_info,
 *         used by the generic sbi_ui_show()/sbi_ui_store() accessors.
 */
2431 struct ext4_attr {
2432 	struct attribute attr;
2433 	ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
2434 	ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
2435 			 const char *, size_t);
2436 	int offset;
2437 };
2438 
2439 static int parse_strtoul(const char *buf,
2440 		unsigned long max, unsigned long *value)
2441 {
2442 	char *endp;
2443 
2444 	*value = simple_strtoul(skip_spaces(buf), &endp, 0);
2445 	endp = skip_spaces(endp);
2446 	if (*endp || *value > max)
2447 		return -EINVAL;
2448 
2449 	return 0;
2450 }
2451 
2452 static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
2453 					      struct ext4_sb_info *sbi,
2454 					      char *buf)
2455 {
2456 	return snprintf(buf, PAGE_SIZE, "%llu\n",
2457 			(s64) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
2458 }
2459 
2460 static ssize_t session_write_kbytes_show(struct ext4_attr *a,
2461 					 struct ext4_sb_info *sbi, char *buf)
2462 {
2463 	struct super_block *sb = sbi->s_buddy_cache->i_sb;
2464 
2465 	if (!sb->s_bdev->bd_part)
2466 		return snprintf(buf, PAGE_SIZE, "0\n");
2467 	return snprintf(buf, PAGE_SIZE, "%lu\n",
2468 			(part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
2469 			 sbi->s_sectors_written_start) >> 1);
2470 }
2471 
2472 static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
2473 					  struct ext4_sb_info *sbi, char *buf)
2474 {
2475 	struct super_block *sb = sbi->s_buddy_cache->i_sb;
2476 
2477 	if (!sb->s_bdev->bd_part)
2478 		return snprintf(buf, PAGE_SIZE, "0\n");
2479 	return snprintf(buf, PAGE_SIZE, "%llu\n",
2480 			(unsigned long long)(sbi->s_kbytes_written +
2481 			((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
2482 			  EXT4_SB(sb)->s_sectors_written_start) >> 1)));
2483 }
2484 
2485 static ssize_t extent_cache_hits_show(struct ext4_attr *a,
2486 				      struct ext4_sb_info *sbi, char *buf)
2487 {
2488 	return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_hits);
2489 }
2490 
2491 static ssize_t extent_cache_misses_show(struct ext4_attr *a,
2492 					struct ext4_sb_info *sbi, char *buf)
2493 {
2494 	return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_misses);
2495 }
2496 
2497 static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
2498 					  struct ext4_sb_info *sbi,
2499 					  const char *buf, size_t count)
2500 {
2501 	unsigned long t;
2502 
2503 	if (parse_strtoul(buf, 0x40000000, &t))
2504 		return -EINVAL;
2505 
2506 	if (t && !is_power_of_2(t))
2507 		return -EINVAL;
2508 
2509 	sbi->s_inode_readahead_blks = t;
2510 	return count;
2511 }
2512 
2513 static ssize_t sbi_ui_show(struct ext4_attr *a,
2514 			   struct ext4_sb_info *sbi, char *buf)
2515 {
2516 	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
2517 
2518 	return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
2519 }
2520 
2521 static ssize_t sbi_ui_store(struct ext4_attr *a,
2522 			    struct ext4_sb_info *sbi,
2523 			    const char *buf, size_t count)
2524 {
2525 	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
2526 	unsigned long t;
2527 
2528 	if (parse_strtoul(buf, 0xffffffff, &t))
2529 		return -EINVAL;
2530 	*ui = t;
2531 	return count;
2532 }
2533 
/*
 * Helpers for declaring ext4 sysfs attributes.  Each one defines a static
 * struct ext4_attr named ext4_attr_<name>.
 *
 * EXT4_ATTR_OFFSET additionally records the offset of field _elname in
 * struct ext4_sb_info, for use with sbi_ui_show()/sbi_ui_store().
 */
2534 #define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
2535 static struct ext4_attr ext4_attr_##_name = {			\
2536 	.attr = {.name = __stringify(_name), .mode = _mode },	\
2537 	.show	= _show,					\
2538 	.store	= _store,					\
2539 	.offset = offsetof(struct ext4_sb_info, _elname),	\
2540 }
/* Plain attribute with explicit mode and show/store callbacks. */
2541 #define EXT4_ATTR(name, mode, show, store) \
2542 static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
2543 
/*
 * EXT4_INFO_ATTR:      mode 0444, no callbacks (reads produce no data).
 * EXT4_RO_ATTR:        mode 0444, show callback <name>_show.
 * EXT4_RW_ATTR:        mode 0644, callbacks <name>_show and <name>_store.
 * EXT4_RW_ATTR_SBI_UI: mode 0644, backed directly by the unsigned int
 *                      field elname of struct ext4_sb_info.
 * ATTR_LIST:           address of the embedded struct attribute, for use
 *                      in attribute arrays.
 */
2544 #define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL)
2545 #define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
2546 #define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
2547 #define EXT4_RW_ATTR_SBI_UI(name, elname)	\
2548 	EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
2549 #define ATTR_LIST(name) &ext4_attr_##name.attr
2550 
/*
 * Per-filesystem attribute definitions.  The first five are read-only and
 * use dedicated show functions.  inode_readahead_blks is shown through
 * the generic unsigned int accessor but has its own validating store
 * function.  The rest are read-write and map directly onto unsigned int
 * fields of struct ext4_sb_info.
 */
2551 EXT4_RO_ATTR(delayed_allocation_blocks);
2552 EXT4_RO_ATTR(session_write_kbytes);
2553 EXT4_RO_ATTR(lifetime_write_kbytes);
2554 EXT4_RO_ATTR(extent_cache_hits);
2555 EXT4_RO_ATTR(extent_cache_misses);
2556 EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
2557 		 inode_readahead_blks_store, s_inode_readahead_blks);
2558 EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
2559 EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
2560 EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
2561 EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
2562 EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
2563 EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
2564 EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
2565 EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
2566 
/*
 * NULL-terminated list of the per-filesystem attributes; installed as the
 * default attributes of ext4_ktype.
 */
2567 static struct attribute *ext4_attrs[] = {
2568 	ATTR_LIST(delayed_allocation_blocks),
2569 	ATTR_LIST(session_write_kbytes),
2570 	ATTR_LIST(lifetime_write_kbytes),
2571 	ATTR_LIST(extent_cache_hits),
2572 	ATTR_LIST(extent_cache_misses),
2573 	ATTR_LIST(inode_readahead_blks),
2574 	ATTR_LIST(inode_goal),
2575 	ATTR_LIST(mb_stats),
2576 	ATTR_LIST(mb_max_to_scan),
2577 	ATTR_LIST(mb_min_to_scan),
2578 	ATTR_LIST(mb_order2_req),
2579 	ATTR_LIST(mb_stream_req),
2580 	ATTR_LIST(mb_group_prealloc),
2581 	ATTR_LIST(max_writeback_mb_bump),
2582 	NULL,
2583 };
2584 
2585 /*
 * Features this copy of ext4 supports.  These are informational
 * attributes: they have no show or store callback, so only their
 * presence carries meaning.
 */
2586 EXT4_INFO_ATTR(lazy_itable_init);
2587 EXT4_INFO_ATTR(batched_discard);
2588 
/* NULL-terminated list installed as the default attributes of ext4_feat_ktype. */
2589 static struct attribute *ext4_feat_attrs[] = {
2590 	ATTR_LIST(lazy_itable_init),
2591 	ATTR_LIST(batched_discard),
2592 	NULL,
2593 };
2594 
2595 static ssize_t ext4_attr_show(struct kobject *kobj,
2596 			      struct attribute *attr, char *buf)
2597 {
2598 	struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
2599 						s_kobj);
2600 	struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
2601 
2602 	return a->show ? a->show(a, sbi, buf) : 0;
2603 }
2604 
2605 static ssize_t ext4_attr_store(struct kobject *kobj,
2606 			       struct attribute *attr,
2607 			       const char *buf, size_t len)
2608 {
2609 	struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
2610 						s_kobj);
2611 	struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
2612 
2613 	return a->store ? a->store(a, sbi, buf, len) : 0;
2614 }
2615 
2616 static void ext4_sb_release(struct kobject *kobj)
2617 {
2618 	struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
2619 						s_kobj);
2620 	complete(&sbi->s_kobj_unregister);
2621 }
2622 
/* sysfs operations shared by ext4_ktype and ext4_feat_ktype. */
2623 static const struct sysfs_ops ext4_attr_ops = {
2624 	.show	= ext4_attr_show,
2625 	.store	= ext4_attr_store,
2626 };
2627 
/*
 * kobject type of the per-filesystem sysfs object: exposes ext4_attrs and
 * signals s_kobj_unregister on release.
 */
2628 static struct kobj_type ext4_ktype = {
2629 	.default_attrs	= ext4_attrs,
2630 	.sysfs_ops	= &ext4_attr_ops,
2631 	.release	= ext4_sb_release,
2632 };
2633 
2634 static void ext4_feat_release(struct kobject *kobj)
2635 {
2636 	complete(&ext4_feat->f_kobj_unregister);
2637 }
2638 
/*
 * kobject type of the features sysfs object: exposes ext4_feat_attrs and
 * signals ext4_feat->f_kobj_unregister on release.
 */
2639 static struct kobj_type ext4_feat_ktype = {
2640 	.default_attrs	= ext4_feat_attrs,
2641 	.sysfs_ops	= &ext4_attr_ops,
2642 	.release	= ext4_feat_release,
2643 };
2644 
2645 /*
2646  * Check whether this filesystem can be mounted based on
2647  * the features present and the RDONLY/RDWR mount requested.
2648  * Returns 1 if this filesystem can be mounted as requested,
2649  * 0 if it cannot be.
2650  */
2651 static int ext4_feature_set_ok(struct super_block *sb, int readonly)
2652 {
2653 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) {
2654 		ext4_msg(sb, KERN_ERR,
2655 			"Couldn't mount because of "
2656 			"unsupported optional features (%x)",
2657 			(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
2658 			~EXT4_FEATURE_INCOMPAT_SUPP));
2659 		return 0;
2660 	}
2661 
2662 	if (readonly)
2663 		return 1;
2664 
2665 	/* Check that feature set is OK for a read-write mount */
2666 	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) {
2667 		ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
2668 			 "unsupported optional features (%x)",
2669 			 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
2670 				~EXT4_FEATURE_RO_COMPAT_SUPP));
2671 		return 0;
2672 	}
2673 	/*
2674 	 * Large file size enabled file system can only be mounted
2675 	 * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF
2676 	 */
2677 	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
2678 		if (sizeof(blkcnt_t) < sizeof(u64)) {
2679 			ext4_msg(sb, KERN_ERR, "Filesystem with huge files "
2680 				 "cannot be mounted RDWR without "
2681 				 "CONFIG_LBDAF");
2682 			return 0;
2683 		}
2684 	}
2685 	return 1;
2686 }
2687 
2688 /*
2689  * This function is called once a day if we have errors logged
2690  * on the file system
2691  */
2692 static void print_daily_error_info(unsigned long arg)
2693 {
2694 	struct super_block *sb = (struct super_block *) arg;
2695 	struct ext4_sb_info *sbi;
2696 	struct ext4_super_block *es;
2697 
2698 	sbi = EXT4_SB(sb);
2699 	es = sbi->s_es;
2700 
2701 	if (es->s_error_count)
2702 		ext4_msg(sb, KERN_NOTICE, "error count: %u",
2703 			 le32_to_cpu(es->s_error_count));
2704 	if (es->s_first_error_time) {
2705 		printk(KERN_NOTICE "EXT4-fs (%s): initial error at %u: %.*s:%d",
2706 		       sb->s_id, le32_to_cpu(es->s_first_error_time),
2707 		       (int) sizeof(es->s_first_error_func),
2708 		       es->s_first_error_func,
2709 		       le32_to_cpu(es->s_first_error_line));
2710 		if (es->s_first_error_ino)
2711 			printk(": inode %u",
2712 			       le32_to_cpu(es->s_first_error_ino));
2713 		if (es->s_first_error_block)
2714 			printk(": block %llu", (unsigned long long)
2715 			       le64_to_cpu(es->s_first_error_block));
2716 		printk("\n");
2717 	}
2718 	if (es->s_last_error_time) {
2719 		printk(KERN_NOTICE "EXT4-fs (%s): last error at %u: %.*s:%d",
2720 		       sb->s_id, le32_to_cpu(es->s_last_error_time),
2721 		       (int) sizeof(es->s_last_error_func),
2722 		       es->s_last_error_func,
2723 		       le32_to_cpu(es->s_last_error_line));
2724 		if (es->s_last_error_ino)
2725 			printk(": inode %u",
2726 			       le32_to_cpu(es->s_last_error_ino));
2727 		if (es->s_last_error_block)
2728 			printk(": block %llu", (unsigned long long)
2729 			       le64_to_cpu(es->s_last_error_block));
2730 		printk("\n");
2731 	}
2732 	mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);  /* Once a day */
2733 }
2734 
2735 /* Find next suitable group and run ext4_init_inode_table */
2736 static int ext4_run_li_request(struct ext4_li_request *elr)
2737 {
2738 	struct ext4_group_desc *gdp = NULL;
2739 	ext4_group_t group, ngroups;
2740 	struct super_block *sb;
2741 	unsigned long timeout = 0;
2742 	int ret = 0;
2743 
2744 	sb = elr->lr_super;
2745 	ngroups = EXT4_SB(sb)->s_groups_count;
2746 
2747 	for (group = elr->lr_next_group; group < ngroups; group++) {
2748 		gdp = ext4_get_group_desc(sb, group, NULL);
2749 		if (!gdp) {
2750 			ret = 1;
2751 			break;
2752 		}
2753 
2754 		if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
2755 			break;
2756 	}
2757 
2758 	if (group == ngroups)
2759 		ret = 1;
2760 
2761 	if (!ret) {
2762 		timeout = jiffies;
2763 		ret = ext4_init_inode_table(sb, group,
2764 					    elr->lr_timeout ? 0 : 1);
2765 		if (elr->lr_timeout == 0) {
2766 			timeout = (jiffies - timeout) *
2767 				  elr->lr_sbi->s_li_wait_mult;
2768 			elr->lr_timeout = timeout;
2769 		}
2770 		elr->lr_next_sched = jiffies + elr->lr_timeout;
2771 		elr->lr_next_group = group + 1;
2772 	}
2773 
2774 	return ret;
2775 }
2776 
2777 /*
2778  * Remove lr_request from the list_request and free the
2779  * request structure. Should be called with li_list_mtx held
2780  */
2781 static void ext4_remove_li_request(struct ext4_li_request *elr)
2782 {
2783 	struct ext4_sb_info *sbi;
2784 
2785 	if (!elr)
2786 		return;
2787 
2788 	sbi = elr->lr_sbi;
2789 
2790 	list_del(&elr->lr_request);
2791 	sbi->s_li_request = NULL;
2792 	kfree(elr);
2793 }
2794 
2795 static void ext4_unregister_li_request(struct super_block *sb)
2796 {
2797 	mutex_lock(&ext4_li_mtx);
2798 	if (!ext4_li_info) {
2799 		mutex_unlock(&ext4_li_mtx);
2800 		return;
2801 	}
2802 
2803 	mutex_lock(&ext4_li_info->li_list_mtx);
2804 	ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
2805 	mutex_unlock(&ext4_li_info->li_list_mtx);
2806 	mutex_unlock(&ext4_li_mtx);
2807 }
2808 
/* Result of kthread_run() for the "ext4lazyinit" thread; set in ext4_run_lazyinit_thread(). */
2809 static struct task_struct *ext4_lazyinit_task;
2810 
2811 /*
2812  * This is the function where ext4lazyinit thread lives. It walks
2813  * through the request list searching for next scheduled filesystem.
2814  * When such a fs is found, run the lazy initialization request
2815  * (ext4_rn_li_request) and keep track of the time spend in this
2816  * function. Based on that time we compute next schedule time of
2817  * the request. When walking through the list is complete, compute
2818  * next waking time and put itself into sleep.
2819  */
2820 static int ext4_lazyinit_thread(void *arg)
2821 {
2822 	struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
2823 	struct list_head *pos, *n;
2824 	struct ext4_li_request *elr;
2825 	unsigned long next_wakeup, cur;
2826 
2827 	BUG_ON(NULL == eli);
2828 
2829 cont_thread:
2830 	while (true) {
2831 		next_wakeup = MAX_JIFFY_OFFSET;
2832 
2833 		mutex_lock(&eli->li_list_mtx);
2834 		if (list_empty(&eli->li_request_list)) {
2835 			mutex_unlock(&eli->li_list_mtx);
2836 			goto exit_thread;
2837 		}
2838 
2839 		list_for_each_safe(pos, n, &eli->li_request_list) {
2840 			elr = list_entry(pos, struct ext4_li_request,
2841 					 lr_request);
2842 
2843 			if (time_after_eq(jiffies, elr->lr_next_sched)) {
2844 				if (ext4_run_li_request(elr) != 0) {
2845 					/* error, remove the lazy_init job */
2846 					ext4_remove_li_request(elr);
2847 					continue;
2848 				}
2849 			}
2850 
2851 			if (time_before(elr->lr_next_sched, next_wakeup))
2852 				next_wakeup = elr->lr_next_sched;
2853 		}
2854 		mutex_unlock(&eli->li_list_mtx);
2855 
2856 		if (freezing(current))
2857 			refrigerator();
2858 
2859 		cur = jiffies;
2860 		if ((time_after_eq(cur, next_wakeup)) ||
2861 		    (MAX_JIFFY_OFFSET == next_wakeup)) {
2862 			cond_resched();
2863 			continue;
2864 		}
2865 
2866 		schedule_timeout_interruptible(next_wakeup - cur);
2867 
2868 		if (kthread_should_stop()) {
2869 			ext4_clear_request_list();
2870 			goto exit_thread;
2871 		}
2872 	}
2873 
2874 exit_thread:
2875 	/*
2876 	 * It looks like the request list is empty, but we need
2877 	 * to check it under the li_list_mtx lock, to prevent any
2878 	 * additions into it, and of course we should lock ext4_li_mtx
2879 	 * to atomically free the list and ext4_li_info, because at
2880 	 * this point another ext4 filesystem could be registering
2881 	 * new one.
2882 	 */
2883 	mutex_lock(&ext4_li_mtx);
2884 	mutex_lock(&eli->li_list_mtx);
2885 	if (!list_empty(&eli->li_request_list)) {
2886 		mutex_unlock(&eli->li_list_mtx);
2887 		mutex_unlock(&ext4_li_mtx);
2888 		goto cont_thread;
2889 	}
2890 	mutex_unlock(&eli->li_list_mtx);
2891 	kfree(ext4_li_info);
2892 	ext4_li_info = NULL;
2893 	mutex_unlock(&ext4_li_mtx);
2894 
2895 	return 0;
2896 }
2897 
2898 static void ext4_clear_request_list(void)
2899 {
2900 	struct list_head *pos, *n;
2901 	struct ext4_li_request *elr;
2902 
2903 	mutex_lock(&ext4_li_info->li_list_mtx);
2904 	list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
2905 		elr = list_entry(pos, struct ext4_li_request,
2906 				 lr_request);
2907 		ext4_remove_li_request(elr);
2908 	}
2909 	mutex_unlock(&ext4_li_info->li_list_mtx);
2910 }
2911 
2912 static int ext4_run_lazyinit_thread(void)
2913 {
2914 	ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread,
2915 					 ext4_li_info, "ext4lazyinit");
2916 	if (IS_ERR(ext4_lazyinit_task)) {
2917 		int err = PTR_ERR(ext4_lazyinit_task);
2918 		ext4_clear_request_list();
2919 		kfree(ext4_li_info);
2920 		ext4_li_info = NULL;
2921 		printk(KERN_CRIT "EXT4: error %d creating inode table "
2922 				 "initialization thread\n",
2923 				 err);
2924 		return err;
2925 	}
2926 	ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
2927 	return 0;
2928 }
2929 
2930 /*
2931  * Check whether it make sense to run itable init. thread or not.
2932  * If there is at least one uninitialized inode table, return
2933  * corresponding group number, else the loop goes through all
2934  * groups and return total number of groups.
2935  */
2936 static ext4_group_t ext4_has_uninit_itable(struct super_block *sb)
2937 {
2938 	ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count;
2939 	struct ext4_group_desc *gdp = NULL;
2940 
2941 	for (group = 0; group < ngroups; group++) {
2942 		gdp = ext4_get_group_desc(sb, group, NULL);
2943 		if (!gdp)
2944 			continue;
2945 
2946 		if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
2947 			break;
2948 	}
2949 
2950 	return group;
2951 }
2952 
2953 static int ext4_li_info_new(void)
2954 {
2955 	struct ext4_lazy_init *eli = NULL;
2956 
2957 	eli = kzalloc(sizeof(*eli), GFP_KERNEL);
2958 	if (!eli)
2959 		return -ENOMEM;
2960 
2961 	INIT_LIST_HEAD(&eli->li_request_list);
2962 	mutex_init(&eli->li_list_mtx);
2963 
2964 	eli->li_state |= EXT4_LAZYINIT_QUIT;
2965 
2966 	ext4_li_info = eli;
2967 
2968 	return 0;
2969 }
2970 
2971 static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
2972 					    ext4_group_t start)
2973 {
2974 	struct ext4_sb_info *sbi = EXT4_SB(sb);
2975 	struct ext4_li_request *elr;
2976 	unsigned long rnd;
2977 
2978 	elr = kzalloc(sizeof(*elr), GFP_KERNEL);
2979 	if (!elr)
2980 		return NULL;
2981 
2982 	elr->lr_super = sb;
2983 	elr->lr_sbi = sbi;
2984 	elr->lr_next_group = start;
2985 
2986 	/*
2987 	 * Randomize first schedule time of the request to
2988 	 * spread the inode table initialization requests
2989 	 * better.
2990 	 */
2991 	get_random_bytes(&rnd, sizeof(rnd));
2992 	elr->lr_next_sched = jiffies + (unsigned long)rnd %
2993 			     (EXT4_DEF_LI_MAX_START_DELAY * HZ);
2994 
2995 	return elr;
2996 }
2997 
2998 static int ext4_register_li_request(struct super_block *sb,
2999 				    ext4_group_t first_not_zeroed)
3000 {
3001 	struct ext4_sb_info *sbi = EXT4_SB(sb);
3002 	struct ext4_li_request *elr;
3003 	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
3004 	int ret = 0;
3005 
3006 	if (sbi->s_li_request != NULL) {
3007 		/*
3008 		 * Reset timeout so it can be computed again, because
3009 		 * s_li_wait_mult might have changed.
3010 		 */
3011 		sbi->s_li_request->lr_timeout = 0;
3012 		return 0;
3013 	}
3014 
3015 	if (first_not_zeroed == ngroups ||
3016 	    (sb->s_flags & MS_RDONLY) ||
3017 	    !test_opt(sb, INIT_INODE_TABLE))
3018 		return 0;
3019 
3020 	elr = ext4_li_request_new(sb, first_not_zeroed);
3021 	if (!elr)
3022 		return -ENOMEM;
3023 
3024 	mutex_lock(&ext4_li_mtx);
3025 
3026 	if (NULL == ext4_li_info) {
3027 		ret = ext4_li_info_new();
3028 		if (ret)
3029 			goto out;
3030 	}
3031 
3032 	mutex_lock(&ext4_li_info->li_list_mtx);
3033 	list_add(&elr->lr_request, &ext4_li_info->li_request_list);
3034 	mutex_unlock(&ext4_li_info->li_list_mtx);
3035 
3036 	sbi->s_li_request = elr;
3037 	/*
3038 	 * set elr to NULL here since it has been inserted to
3039 	 * the request_list and the removal and free of it is
3040 	 * handled by ext4_clear_request_list from now on.
3041 	 */
3042 	elr = NULL;
3043 
3044 	if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
3045 		ret = ext4_run_lazyinit_thread();
3046 		if (ret)
3047 			goto out;
3048 	}
3049 out:
3050 	mutex_unlock(&ext4_li_mtx);
3051 	if (ret)
3052 		kfree(elr);
3053 	return ret;
3054 }
3055 
3056 /*
3057  * We do not need to lock anything since this is called on
3058  * module unload.
3059  */
3060 static void ext4_destroy_lazyinit_thread(void)
3061 {
3062 	/*
3063 	 * If thread exited earlier
3064 	 * there's nothing to be done.
3065 	 */
3066 	if (!ext4_li_info || !ext4_lazyinit_task)
3067 		return;
3068 
3069 	kthread_stop(ext4_lazyinit_task);
3070 }
3071 
3072 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3073 				__releases(kernel_lock)
3074 				__acquires(kernel_lock)
3075 {
3076 	char *orig_data = kstrdup(data, GFP_KERNEL);
3077 	struct buffer_head *bh;
3078 	struct ext4_super_block *es = NULL;
3079 	struct ext4_sb_info *sbi;
3080 	ext4_fsblk_t block;
3081 	ext4_fsblk_t sb_block = get_sb_block(&data);
3082 	ext4_fsblk_t logical_sb_block;
3083 	unsigned long offset = 0;
3084 	unsigned long journal_devnum = 0;
3085 	unsigned long def_mount_opts;
3086 	struct inode *root;
3087 	char *cp;
3088 	const char *descr;
3089 	int ret = -ENOMEM;
3090 	int blocksize;
3091 	unsigned int db_count;
3092 	unsigned int i;
3093 	int needs_recovery, has_huge_files;
3094 	__u64 blocks_count;
3095 	int err;
3096 	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
3097 	ext4_group_t first_not_zeroed;
3098 
3099 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
3100 	if (!sbi)
3101 		goto out_free_orig;
3102 
3103 	sbi->s_blockgroup_lock =
3104 		kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
3105 	if (!sbi->s_blockgroup_lock) {
3106 		kfree(sbi);
3107 		goto out_free_orig;
3108 	}
3109 	sb->s_fs_info = sbi;
3110 	sbi->s_mount_opt = 0;
3111 	sbi->s_resuid = EXT4_DEF_RESUID;
3112 	sbi->s_resgid = EXT4_DEF_RESGID;
3113 	sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
3114 	sbi->s_sb_block = sb_block;
3115 	if (sb->s_bdev->bd_part)
3116 		sbi->s_sectors_written_start =
3117 			part_stat_read(sb->s_bdev->bd_part, sectors[1]);
3118 
3119 	/* Cleanup superblock name */
3120 	for (cp = sb->s_id; (cp = strchr(cp, '/'));)
3121 		*cp = '!';
3122 
3123 	ret = -EINVAL;
3124 	blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
3125 	if (!blocksize) {
3126 		ext4_msg(sb, KERN_ERR, "unable to set blocksize");
3127 		goto out_fail;
3128 	}
3129 
3130 	/*
3131 	 * The ext4 superblock will not be buffer aligned for other than 1kB
3132 	 * block sizes.  We need to calculate the offset from buffer start.
3133 	 */
3134 	if (blocksize != EXT4_MIN_BLOCK_SIZE) {
3135 		logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
3136 		offset = do_div(logical_sb_block, blocksize);
3137 	} else {
3138 		logical_sb_block = sb_block;
3139 	}
3140 
3141 	if (!(bh = sb_bread(sb, logical_sb_block))) {
3142 		ext4_msg(sb, KERN_ERR, "unable to read superblock");
3143 		goto out_fail;
3144 	}
3145 	/*
3146 	 * Note: s_es must be initialized as soon as possible because
3147 	 *       some ext4 macro-instructions depend on its value
3148 	 */
3149 	es = (struct ext4_super_block *) (((char *)bh->b_data) + offset);
3150 	sbi->s_es = es;
3151 	sb->s_magic = le16_to_cpu(es->s_magic);
3152 	if (sb->s_magic != EXT4_SUPER_MAGIC)
3153 		goto cantfind_ext4;
3154 	sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
3155 
3156 	/* Set defaults before we parse the mount options */
3157 	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
3158 	set_opt(sb, INIT_INODE_TABLE);
3159 	if (def_mount_opts & EXT4_DEFM_DEBUG)
3160 		set_opt(sb, DEBUG);
3161 	if (def_mount_opts & EXT4_DEFM_BSDGROUPS) {
3162 		ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups",
3163 			"2.6.38");
3164 		set_opt(sb, GRPID);
3165 	}
3166 	if (def_mount_opts & EXT4_DEFM_UID16)
3167 		set_opt(sb, NO_UID32);
3168 	/* xattr user namespace & acls are now defaulted on */
3169 #ifdef CONFIG_EXT4_FS_XATTR
3170 	set_opt(sb, XATTR_USER);
3171 #endif
3172 #ifdef CONFIG_EXT4_FS_POSIX_ACL
3173 	set_opt(sb, POSIX_ACL);
3174 #endif
3175 	set_opt(sb, MBLK_IO_SUBMIT);
3176 	if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
3177 		set_opt(sb, JOURNAL_DATA);
3178 	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
3179 		set_opt(sb, ORDERED_DATA);
3180 	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
3181 		set_opt(sb, WRITEBACK_DATA);
3182 
3183 	if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
3184 		set_opt(sb, ERRORS_PANIC);
3185 	else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
3186 		set_opt(sb, ERRORS_CONT);
3187 	else
3188 		set_opt(sb, ERRORS_RO);
3189 	if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)
3190 		set_opt(sb, BLOCK_VALIDITY);
3191 	if (def_mount_opts & EXT4_DEFM_DISCARD)
3192 		set_opt(sb, DISCARD);
3193 
3194 	sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
3195 	sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
3196 	sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
3197 	sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
3198 	sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
3199 
3200 	if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
3201 		set_opt(sb, BARRIER);
3202 
3203 	/*
3204 	 * enable delayed allocation by default
3205 	 * Use -o nodelalloc to turn it off
3206 	 */
3207 	if (!IS_EXT3_SB(sb) &&
3208 	    ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
3209 		set_opt(sb, DELALLOC);
3210 
3211 	/*
3212 	 * set default s_li_wait_mult for lazyinit, for the case there is
3213 	 * no mount option specified.
3214 	 */
3215 	sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
3216 
3217 	if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
3218 			   &journal_devnum, &journal_ioprio, NULL, 0)) {
3219 		ext4_msg(sb, KERN_WARNING,
3220 			 "failed to parse options in superblock: %s",
3221 			 sbi->s_es->s_mount_opts);
3222 	}
3223 	if (!parse_options((char *) data, sb, &journal_devnum,
3224 			   &journal_ioprio, NULL, 0))
3225 		goto failed_mount;
3226 
3227 	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
3228 		(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
3229 
3230 	if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
3231 	    (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
3232 	     EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
3233 	     EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U)))
3234 		ext4_msg(sb, KERN_WARNING,
3235 		       "feature flags set on rev 0 fs, "
3236 		       "running e2fsck is recommended");
3237 
3238 	if (IS_EXT2_SB(sb)) {
3239 		if (ext2_feature_set_ok(sb))
3240 			ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
3241 				 "using the ext4 subsystem");
3242 		else {
3243 			ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
3244 				 "to feature incompatibilities");
3245 			goto failed_mount;
3246 		}
3247 	}
3248 
3249 	if (IS_EXT3_SB(sb)) {
3250 		if (ext3_feature_set_ok(sb))
3251 			ext4_msg(sb, KERN_INFO, "mounting ext3 file system "
3252 				 "using the ext4 subsystem");
3253 		else {
3254 			ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
3255 				 "to feature incompatibilities");
3256 			goto failed_mount;
3257 		}
3258 	}
3259 
3260 	/*
3261 	 * Check feature flags regardless of the revision level, since we
3262 	 * previously didn't change the revision level when setting the flags,
3263 	 * so there is a chance incompat flags are set on a rev 0 filesystem.
3264 	 */
3265 	if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))
3266 		goto failed_mount;
3267 
3268 	blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
3269 
3270 	if (blocksize < EXT4_MIN_BLOCK_SIZE ||
3271 	    blocksize > EXT4_MAX_BLOCK_SIZE) {
3272 		ext4_msg(sb, KERN_ERR,
3273 		       "Unsupported filesystem blocksize %d", blocksize);
3274 		goto failed_mount;
3275 	}
3276 
3277 	if (sb->s_blocksize != blocksize) {
3278 		/* Validate the filesystem blocksize */
3279 		if (!sb_set_blocksize(sb, blocksize)) {
3280 			ext4_msg(sb, KERN_ERR, "bad block size %d",
3281 					blocksize);
3282 			goto failed_mount;
3283 		}
3284 
3285 		brelse(bh);
3286 		logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
3287 		offset = do_div(logical_sb_block, blocksize);
3288 		bh = sb_bread(sb, logical_sb_block);
3289 		if (!bh) {
3290 			ext4_msg(sb, KERN_ERR,
3291 			       "Can't read superblock on 2nd try");
3292 			goto failed_mount;
3293 		}
3294 		es = (struct ext4_super_block *)(((char *)bh->b_data) + offset);
3295 		sbi->s_es = es;
3296 		if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
3297 			ext4_msg(sb, KERN_ERR,
3298 			       "Magic mismatch, very weird!");
3299 			goto failed_mount;
3300 		}
3301 	}
3302 
3303 	has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
3304 				EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
3305 	sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
3306 						      has_huge_files);
3307 	sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
3308 
3309 	if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
3310 		sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
3311 		sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO;
3312 	} else {
3313 		sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
3314 		sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
3315 		if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
3316 		    (!is_power_of_2(sbi->s_inode_size)) ||
3317 		    (sbi->s_inode_size > blocksize)) {
3318 			ext4_msg(sb, KERN_ERR,
3319 			       "unsupported inode size: %d",
3320 			       sbi->s_inode_size);
3321 			goto failed_mount;
3322 		}
3323 		if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE)
3324 			sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2);
3325 	}
3326 
3327 	sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
3328 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) {
3329 		if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
3330 		    sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
3331 		    !is_power_of_2(sbi->s_desc_size)) {
3332 			ext4_msg(sb, KERN_ERR,
3333 			       "unsupported descriptor size %lu",
3334 			       sbi->s_desc_size);
3335 			goto failed_mount;
3336 		}
3337 	} else
3338 		sbi->s_desc_size = EXT4_MIN_DESC_SIZE;
3339 
3340 	sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
3341 	sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
3342 	if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0)
3343 		goto cantfind_ext4;
3344 
3345 	sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
3346 	if (sbi->s_inodes_per_block == 0)
3347 		goto cantfind_ext4;
3348 	sbi->s_itb_per_group = sbi->s_inodes_per_group /
3349 					sbi->s_inodes_per_block;
3350 	sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb);
3351 	sbi->s_sbh = bh;
3352 	sbi->s_mount_state = le16_to_cpu(es->s_state);
3353 	sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
3354 	sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
3355 
3356 	for (i = 0; i < 4; i++)
3357 		sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
3358 	sbi->s_def_hash_version = es->s_def_hash_version;
3359 	i = le32_to_cpu(es->s_flags);
3360 	if (i & EXT2_FLAGS_UNSIGNED_HASH)
3361 		sbi->s_hash_unsigned = 3;
3362 	else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
3363 #ifdef __CHAR_UNSIGNED__
3364 		es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
3365 		sbi->s_hash_unsigned = 3;
3366 #else
3367 		es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
3368 #endif
3369 		sb->s_dirt = 1;
3370 	}
3371 
3372 	if (sbi->s_blocks_per_group > blocksize * 8) {
3373 		ext4_msg(sb, KERN_ERR,
3374 		       "#blocks per group too big: %lu",
3375 		       sbi->s_blocks_per_group);
3376 		goto failed_mount;
3377 	}
3378 	if (sbi->s_inodes_per_group > blocksize * 8) {
3379 		ext4_msg(sb, KERN_ERR,
3380 		       "#inodes per group too big: %lu",
3381 		       sbi->s_inodes_per_group);
3382 		goto failed_mount;
3383 	}
3384 
3385 	/*
3386 	 * Test whether we have more sectors than will fit in sector_t,
3387 	 * and whether the max offset is addressable by the page cache.
3388 	 */
3389 	err = generic_check_addressable(sb->s_blocksize_bits,
3390 					ext4_blocks_count(es));
3391 	if (err) {
3392 		ext4_msg(sb, KERN_ERR, "filesystem"
3393 			 " too large to mount safely on this system");
3394 		if (sizeof(sector_t) < 8)
3395 			ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
3396 		ret = err;
3397 		goto failed_mount;
3398 	}
3399 
3400 	if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
3401 		goto cantfind_ext4;
3402 
3403 	/* check blocks count against device size */
3404 	blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
3405 	if (blocks_count && ext4_blocks_count(es) > blocks_count) {
3406 		ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
3407 		       "exceeds size of device (%llu blocks)",
3408 		       ext4_blocks_count(es), blocks_count);
3409 		goto failed_mount;
3410 	}
3411 
3412 	/*
3413 	 * It makes no sense for the first data block to be beyond the end
3414 	 * of the filesystem.
3415 	 */
3416 	if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
3417                 ext4_msg(sb, KERN_WARNING, "bad geometry: first data"
3418 			 "block %u is beyond end of filesystem (%llu)",
3419 			 le32_to_cpu(es->s_first_data_block),
3420 			 ext4_blocks_count(es));
3421 		goto failed_mount;
3422 	}
3423 	blocks_count = (ext4_blocks_count(es) -
3424 			le32_to_cpu(es->s_first_data_block) +
3425 			EXT4_BLOCKS_PER_GROUP(sb) - 1);
3426 	do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
3427 	if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
3428 		ext4_msg(sb, KERN_WARNING, "groups count too large: %u "
3429 		       "(block count %llu, first data block %u, "
3430 		       "blocks per group %lu)", sbi->s_groups_count,
3431 		       ext4_blocks_count(es),
3432 		       le32_to_cpu(es->s_first_data_block),
3433 		       EXT4_BLOCKS_PER_GROUP(sb));
3434 		goto failed_mount;
3435 	}
3436 	sbi->s_groups_count = blocks_count;
3437 	sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
3438 			(EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
3439 	db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
3440 		   EXT4_DESC_PER_BLOCK(sb);
3441 	sbi->s_group_desc = ext4_kvmalloc(db_count *
3442 					  sizeof(struct buffer_head *),
3443 					  GFP_KERNEL);
3444 	if (sbi->s_group_desc == NULL) {
3445 		ext4_msg(sb, KERN_ERR, "not enough memory");
3446 		goto failed_mount;
3447 	}
3448 
3449 #ifdef CONFIG_PROC_FS
3450 	if (ext4_proc_root)
3451 		sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
3452 #endif
3453 
3454 	bgl_lock_init(sbi->s_blockgroup_lock);
3455 
3456 	for (i = 0; i < db_count; i++) {
3457 		block = descriptor_loc(sb, logical_sb_block, i);
3458 		sbi->s_group_desc[i] = sb_bread(sb, block);
3459 		if (!sbi->s_group_desc[i]) {
3460 			ext4_msg(sb, KERN_ERR,
3461 			       "can't read group descriptor %d", i);
3462 			db_count = i;
3463 			goto failed_mount2;
3464 		}
3465 	}
3466 	if (!ext4_check_descriptors(sb, &first_not_zeroed)) {
3467 		ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
3468 		goto failed_mount2;
3469 	}
3470 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
3471 		if (!ext4_fill_flex_info(sb)) {
3472 			ext4_msg(sb, KERN_ERR,
3473 			       "unable to initialize "
3474 			       "flex_bg meta info!");
3475 			goto failed_mount2;
3476 		}
3477 
3478 	sbi->s_gdb_count = db_count;
3479 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
3480 	spin_lock_init(&sbi->s_next_gen_lock);
3481 
3482 	init_timer(&sbi->s_err_report);
3483 	sbi->s_err_report.function = print_daily_error_info;
3484 	sbi->s_err_report.data = (unsigned long) sb;
3485 
3486 	err = percpu_counter_init(&sbi->s_freeblocks_counter,
3487 			ext4_count_free_blocks(sb));
3488 	if (!err) {
3489 		err = percpu_counter_init(&sbi->s_freeinodes_counter,
3490 				ext4_count_free_inodes(sb));
3491 	}
3492 	if (!err) {
3493 		err = percpu_counter_init(&sbi->s_dirs_counter,
3494 				ext4_count_dirs(sb));
3495 	}
3496 	if (!err) {
3497 		err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
3498 	}
3499 	if (err) {
3500 		ext4_msg(sb, KERN_ERR, "insufficient memory");
3501 		goto failed_mount3;
3502 	}
3503 
3504 	sbi->s_stripe = ext4_get_stripe_size(sbi);
3505 	sbi->s_max_writeback_mb_bump = 128;
3506 
3507 	/*
3508 	 * set up enough so that it can read an inode
3509 	 */
3510 	if (!test_opt(sb, NOLOAD) &&
3511 	    EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
3512 		sb->s_op = &ext4_sops;
3513 	else
3514 		sb->s_op = &ext4_nojournal_sops;
3515 	sb->s_export_op = &ext4_export_ops;
3516 	sb->s_xattr = ext4_xattr_handlers;
3517 #ifdef CONFIG_QUOTA
3518 	sb->s_qcop = &ext4_qctl_operations;
3519 	sb->dq_op = &ext4_quota_operations;
3520 #endif
3521 	memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
3522 
3523 	INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
3524 	mutex_init(&sbi->s_orphan_lock);
3525 	sbi->s_resize_flags = 0;
3526 
3527 	sb->s_root = NULL;
3528 
3529 	needs_recovery = (es->s_last_orphan != 0 ||
3530 			  EXT4_HAS_INCOMPAT_FEATURE(sb,
3531 				    EXT4_FEATURE_INCOMPAT_RECOVER));
3532 
3533 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
3534 	    !(sb->s_flags & MS_RDONLY))
3535 		if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
3536 			goto failed_mount3;
3537 
3538 	/*
3539 	 * The first inode we look at is the journal inode.  Don't try
3540 	 * root first: it may be modified in the journal!
3541 	 */
3542 	if (!test_opt(sb, NOLOAD) &&
3543 	    EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
3544 		if (ext4_load_journal(sb, es, journal_devnum))
3545 			goto failed_mount3;
3546 	} else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
3547 	      EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
3548 		ext4_msg(sb, KERN_ERR, "required journal recovery "
3549 		       "suppressed and not mounted read-only");
3550 		goto failed_mount_wq;
3551 	} else {
3552 		clear_opt(sb, DATA_FLAGS);
3553 		sbi->s_journal = NULL;
3554 		needs_recovery = 0;
3555 		goto no_journal;
3556 	}
3557 
3558 	if (ext4_blocks_count(es) > 0xffffffffULL &&
3559 	    !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
3560 				       JBD2_FEATURE_INCOMPAT_64BIT)) {
3561 		ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
3562 		goto failed_mount_wq;
3563 	}
3564 
3565 	if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
3566 		jbd2_journal_set_features(sbi->s_journal,
3567 				JBD2_FEATURE_COMPAT_CHECKSUM, 0,
3568 				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
3569 	} else if (test_opt(sb, JOURNAL_CHECKSUM)) {
3570 		jbd2_journal_set_features(sbi->s_journal,
3571 				JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
3572 		jbd2_journal_clear_features(sbi->s_journal, 0, 0,
3573 				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
3574 	} else {
3575 		jbd2_journal_clear_features(sbi->s_journal,
3576 				JBD2_FEATURE_COMPAT_CHECKSUM, 0,
3577 				JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
3578 	}
3579 
3580 	/* We have now updated the journal if required, so we can
3581 	 * validate the data journaling mode. */
3582 	switch (test_opt(sb, DATA_FLAGS)) {
3583 	case 0:
3584 		/* No mode set, assume a default based on the journal
3585 		 * capabilities: ORDERED_DATA if the journal can
3586 		 * cope, else JOURNAL_DATA
3587 		 */
3588 		if (jbd2_journal_check_available_features
3589 		    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE))
3590 			set_opt(sb, ORDERED_DATA);
3591 		else
3592 			set_opt(sb, JOURNAL_DATA);
3593 		break;
3594 
3595 	case EXT4_MOUNT_ORDERED_DATA:
3596 	case EXT4_MOUNT_WRITEBACK_DATA:
3597 		if (!jbd2_journal_check_available_features
3598 		    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
3599 			ext4_msg(sb, KERN_ERR, "Journal does not support "
3600 			       "requested data journaling mode");
3601 			goto failed_mount_wq;
3602 		}
3603 	default:
3604 		break;
3605 	}
3606 	set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
3607 
3608 	/*
3609 	 * The journal may have updated the bg summary counts, so we
3610 	 * need to update the global counters.
3611 	 */
3612 	percpu_counter_set(&sbi->s_freeblocks_counter,
3613 			   ext4_count_free_blocks(sb));
3614 	percpu_counter_set(&sbi->s_freeinodes_counter,
3615 			   ext4_count_free_inodes(sb));
3616 	percpu_counter_set(&sbi->s_dirs_counter,
3617 			   ext4_count_dirs(sb));
3618 	percpu_counter_set(&sbi->s_dirtyblocks_counter, 0);
3619 
3620 no_journal:
3621 	/*
3622 	 * The maximum number of concurrent works can be high and
3623 	 * concurrency isn't really necessary.  Limit it to 1.
3624 	 */
3625 	EXT4_SB(sb)->dio_unwritten_wq =
3626 		alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
3627 	if (!EXT4_SB(sb)->dio_unwritten_wq) {
3628 		printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
3629 		goto failed_mount_wq;
3630 	}
3631 
3632 	/*
3633 	 * The jbd2_journal_load will have done any necessary log recovery,
3634 	 * so we can safely mount the rest of the filesystem now.
3635 	 */
3636 
3637 	root = ext4_iget(sb, EXT4_ROOT_INO);
3638 	if (IS_ERR(root)) {
3639 		ext4_msg(sb, KERN_ERR, "get root inode failed");
3640 		ret = PTR_ERR(root);
3641 		root = NULL;
3642 		goto failed_mount4;
3643 	}
3644 	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
3645 		ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
3646 		goto failed_mount4;
3647 	}
3648 	sb->s_root = d_alloc_root(root);
3649 	if (!sb->s_root) {
3650 		ext4_msg(sb, KERN_ERR, "get root dentry failed");
3651 		ret = -ENOMEM;
3652 		goto failed_mount4;
3653 	}
3654 
3655 	ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY);
3656 
3657 	/* determine the minimum size of new large inodes, if present */
3658 	if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
3659 		sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
3660 						     EXT4_GOOD_OLD_INODE_SIZE;
3661 		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
3662 				       EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE)) {
3663 			if (sbi->s_want_extra_isize <
3664 			    le16_to_cpu(es->s_want_extra_isize))
3665 				sbi->s_want_extra_isize =
3666 					le16_to_cpu(es->s_want_extra_isize);
3667 			if (sbi->s_want_extra_isize <
3668 			    le16_to_cpu(es->s_min_extra_isize))
3669 				sbi->s_want_extra_isize =
3670 					le16_to_cpu(es->s_min_extra_isize);
3671 		}
3672 	}
3673 	/* Check if enough inode space is available */
3674 	if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize >
3675 							sbi->s_inode_size) {
3676 		sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
3677 						       EXT4_GOOD_OLD_INODE_SIZE;
3678 		ext4_msg(sb, KERN_INFO, "required extra inode space not"
3679 			 "available");
3680 	}
3681 
3682 	if (test_opt(sb, DELALLOC) &&
3683 	    (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
3684 		ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
3685 			 "requested data journaling mode");
3686 		clear_opt(sb, DELALLOC);
3687 	}
3688 	if (test_opt(sb, DIOREAD_NOLOCK)) {
3689 		if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
3690 			ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
3691 				"option - requested data journaling mode");
3692 			clear_opt(sb, DIOREAD_NOLOCK);
3693 		}
3694 		if (sb->s_blocksize < PAGE_SIZE) {
3695 			ext4_msg(sb, KERN_WARNING, "Ignoring dioread_nolock "
3696 				"option - block size is too small");
3697 			clear_opt(sb, DIOREAD_NOLOCK);
3698 		}
3699 	}
3700 
3701 	err = ext4_setup_system_zone(sb);
3702 	if (err) {
3703 		ext4_msg(sb, KERN_ERR, "failed to initialize system "
3704 			 "zone (%d)", err);
3705 		goto failed_mount4;
3706 	}
3707 
3708 	ext4_ext_init(sb);
3709 	err = ext4_mb_init(sb, needs_recovery);
3710 	if (err) {
3711 		ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
3712 			 err);
3713 		goto failed_mount4;
3714 	}
3715 
3716 	err = ext4_register_li_request(sb, first_not_zeroed);
3717 	if (err)
3718 		goto failed_mount4;
3719 
3720 	sbi->s_kobj.kset = ext4_kset;
3721 	init_completion(&sbi->s_kobj_unregister);
3722 	err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
3723 				   "%s", sb->s_id);
3724 	if (err) {
3725 		ext4_mb_release(sb);
3726 		ext4_ext_release(sb);
3727 		goto failed_mount4;
3728 	};
3729 
3730 	EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
3731 	ext4_orphan_cleanup(sb, es);
3732 	EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
3733 	if (needs_recovery) {
3734 		ext4_msg(sb, KERN_INFO, "recovery complete");
3735 		ext4_mark_recovery_complete(sb, es);
3736 	}
3737 	if (EXT4_SB(sb)->s_journal) {
3738 		if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
3739 			descr = " journalled data mode";
3740 		else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
3741 			descr = " ordered data mode";
3742 		else
3743 			descr = " writeback data mode";
3744 	} else
3745 		descr = "out journal";
3746 
3747 	ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
3748 		 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
3749 		 *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
3750 
3751 	if (es->s_error_count)
3752 		mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
3753 
3754 	kfree(orig_data);
3755 	return 0;
3756 
3757 cantfind_ext4:
3758 	if (!silent)
3759 		ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
3760 	goto failed_mount;
3761 
3762 failed_mount4:
3763 	iput(root);
3764 	sb->s_root = NULL;
3765 	ext4_msg(sb, KERN_ERR, "mount failed");
3766 	destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
3767 failed_mount_wq:
3768 	ext4_release_system_zone(sb);
3769 	if (sbi->s_journal) {
3770 		jbd2_journal_destroy(sbi->s_journal);
3771 		sbi->s_journal = NULL;
3772 	}
3773 failed_mount3:
3774 	del_timer(&sbi->s_err_report);
3775 	if (sbi->s_flex_groups)
3776 		ext4_kvfree(sbi->s_flex_groups);
3777 	percpu_counter_destroy(&sbi->s_freeblocks_counter);
3778 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
3779 	percpu_counter_destroy(&sbi->s_dirs_counter);
3780 	percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
3781 	if (sbi->s_mmp_tsk)
3782 		kthread_stop(sbi->s_mmp_tsk);
3783 failed_mount2:
3784 	for (i = 0; i < db_count; i++)
3785 		brelse(sbi->s_group_desc[i]);
3786 	ext4_kvfree(sbi->s_group_desc);
3787 failed_mount:
3788 	if (sbi->s_proc) {
3789 		remove_proc_entry(sb->s_id, ext4_proc_root);
3790 	}
3791 #ifdef CONFIG_QUOTA
3792 	for (i = 0; i < MAXQUOTAS; i++)
3793 		kfree(sbi->s_qf_names[i]);
3794 #endif
3795 	ext4_blkdev_remove(sbi);
3796 	brelse(bh);
3797 out_fail:
3798 	sb->s_fs_info = NULL;
3799 	kfree(sbi->s_blockgroup_lock);
3800 	kfree(sbi);
3801 out_free_orig:
3802 	kfree(orig_data);
3803 	return ret;
3804 }
3805 
3806 /*
3807  * Setup any per-fs journal parameters now.  We'll do this both on
3808  * initial mount, once the journal has been initialised but before we've
3809  * done any recovery; and again on any subsequent remount.
3810  */
3811 static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
3812 {
3813 	struct ext4_sb_info *sbi = EXT4_SB(sb);
3814 
3815 	journal->j_commit_interval = sbi->s_commit_interval;
3816 	journal->j_min_batch_time = sbi->s_min_batch_time;
3817 	journal->j_max_batch_time = sbi->s_max_batch_time;
3818 
3819 	write_lock(&journal->j_state_lock);
3820 	if (test_opt(sb, BARRIER))
3821 		journal->j_flags |= JBD2_BARRIER;
3822 	else
3823 		journal->j_flags &= ~JBD2_BARRIER;
3824 	if (test_opt(sb, DATA_ERR_ABORT))
3825 		journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;
3826 	else
3827 		journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
3828 	write_unlock(&journal->j_state_lock);
3829 }
3830 
3831 static journal_t *ext4_get_journal(struct super_block *sb,
3832 				   unsigned int journal_inum)
3833 {
3834 	struct inode *journal_inode;
3835 	journal_t *journal;
3836 
3837 	BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
3838 
3839 	/* First, test for the existence of a valid inode on disk.  Bad
3840 	 * things happen if we iget() an unused inode, as the subsequent
3841 	 * iput() will try to delete it. */
3842 
3843 	journal_inode = ext4_iget(sb, journal_inum);
3844 	if (IS_ERR(journal_inode)) {
3845 		ext4_msg(sb, KERN_ERR, "no journal found");
3846 		return NULL;
3847 	}
3848 	if (!journal_inode->i_nlink) {
3849 		make_bad_inode(journal_inode);
3850 		iput(journal_inode);
3851 		ext4_msg(sb, KERN_ERR, "journal inode is deleted");
3852 		return NULL;
3853 	}
3854 
3855 	jbd_debug(2, "Journal inode found at %p: %lld bytes\n",
3856 		  journal_inode, journal_inode->i_size);
3857 	if (!S_ISREG(journal_inode->i_mode)) {
3858 		ext4_msg(sb, KERN_ERR, "invalid journal inode");
3859 		iput(journal_inode);
3860 		return NULL;
3861 	}
3862 
3863 	journal = jbd2_journal_init_inode(journal_inode);
3864 	if (!journal) {
3865 		ext4_msg(sb, KERN_ERR, "Could not load journal inode");
3866 		iput(journal_inode);
3867 		return NULL;
3868 	}
3869 	journal->j_private = sb;
3870 	ext4_init_journal_params(sb, journal);
3871 	return journal;
3872 }
3873 
3874 static journal_t *ext4_get_dev_journal(struct super_block *sb,
3875 				       dev_t j_dev)
3876 {
3877 	struct buffer_head *bh;
3878 	journal_t *journal;
3879 	ext4_fsblk_t start;
3880 	ext4_fsblk_t len;
3881 	int hblock, blocksize;
3882 	ext4_fsblk_t sb_block;
3883 	unsigned long offset;
3884 	struct ext4_super_block *es;
3885 	struct block_device *bdev;
3886 
3887 	BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
3888 
3889 	bdev = ext4_blkdev_get(j_dev, sb);
3890 	if (bdev == NULL)
3891 		return NULL;
3892 
3893 	blocksize = sb->s_blocksize;
3894 	hblock = bdev_logical_block_size(bdev);
3895 	if (blocksize < hblock) {
3896 		ext4_msg(sb, KERN_ERR,
3897 			"blocksize too small for journal device");
3898 		goto out_bdev;
3899 	}
3900 
3901 	sb_block = EXT4_MIN_BLOCK_SIZE / blocksize;
3902 	offset = EXT4_MIN_BLOCK_SIZE % blocksize;
3903 	set_blocksize(bdev, blocksize);
3904 	if (!(bh = __bread(bdev, sb_block, blocksize))) {
3905 		ext4_msg(sb, KERN_ERR, "couldn't read superblock of "
3906 		       "external journal");
3907 		goto out_bdev;
3908 	}
3909 
3910 	es = (struct ext4_super_block *) (((char *)bh->b_data) + offset);
3911 	if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
3912 	    !(le32_to_cpu(es->s_feature_incompat) &
3913 	      EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
3914 		ext4_msg(sb, KERN_ERR, "external journal has "
3915 					"bad superblock");
3916 		brelse(bh);
3917 		goto out_bdev;
3918 	}
3919 
3920 	if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
3921 		ext4_msg(sb, KERN_ERR, "journal UUID does not match");
3922 		brelse(bh);
3923 		goto out_bdev;
3924 	}
3925 
3926 	len = ext4_blocks_count(es);
3927 	start = sb_block + 1;
3928 	brelse(bh);	/* we're done with the superblock */
3929 
3930 	journal = jbd2_journal_init_dev(bdev, sb->s_bdev,
3931 					start, len, blocksize);
3932 	if (!journal) {
3933 		ext4_msg(sb, KERN_ERR, "failed to create device journal");
3934 		goto out_bdev;
3935 	}
3936 	journal->j_private = sb;
3937 	ll_rw_block(READ, 1, &journal->j_sb_buffer);
3938 	wait_on_buffer(journal->j_sb_buffer);
3939 	if (!buffer_uptodate(journal->j_sb_buffer)) {
3940 		ext4_msg(sb, KERN_ERR, "I/O error on journal device");
3941 		goto out_journal;
3942 	}
3943 	if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
3944 		ext4_msg(sb, KERN_ERR, "External journal has more than one "
3945 					"user (unsupported) - %d",
3946 			be32_to_cpu(journal->j_superblock->s_nr_users));
3947 		goto out_journal;
3948 	}
3949 	EXT4_SB(sb)->journal_bdev = bdev;
3950 	ext4_init_journal_params(sb, journal);
3951 	return journal;
3952 
3953 out_journal:
3954 	jbd2_journal_destroy(journal);
3955 out_bdev:
3956 	ext4_blkdev_put(bdev);
3957 	return NULL;
3958 }
3959 
3960 static int ext4_load_journal(struct super_block *sb,
3961 			     struct ext4_super_block *es,
3962 			     unsigned long journal_devnum)
3963 {
3964 	journal_t *journal;
3965 	unsigned int journal_inum = le32_to_cpu(es->s_journal_inum);
3966 	dev_t journal_dev;
3967 	int err = 0;
3968 	int really_read_only;
3969 
3970 	BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
3971 
3972 	if (journal_devnum &&
3973 	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
3974 		ext4_msg(sb, KERN_INFO, "external journal device major/minor "
3975 			"numbers have changed");
3976 		journal_dev = new_decode_dev(journal_devnum);
3977 	} else
3978 		journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
3979 
3980 	really_read_only = bdev_read_only(sb->s_bdev);
3981 
3982 	/*
3983 	 * Are we loading a blank journal or performing recovery after a
3984 	 * crash?  For recovery, we need to check in advance whether we
3985 	 * can get read-write access to the device.
3986 	 */
3987 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
3988 		if (sb->s_flags & MS_RDONLY) {
3989 			ext4_msg(sb, KERN_INFO, "INFO: recovery "
3990 					"required on readonly filesystem");
3991 			if (really_read_only) {
3992 				ext4_msg(sb, KERN_ERR, "write access "
3993 					"unavailable, cannot proceed");
3994 				return -EROFS;
3995 			}
3996 			ext4_msg(sb, KERN_INFO, "write access will "
3997 			       "be enabled during recovery");
3998 		}
3999 	}
4000 
4001 	if (journal_inum && journal_dev) {
4002 		ext4_msg(sb, KERN_ERR, "filesystem has both journal "
4003 		       "and inode journals!");
4004 		return -EINVAL;
4005 	}
4006 
4007 	if (journal_inum) {
4008 		if (!(journal = ext4_get_journal(sb, journal_inum)))
4009 			return -EINVAL;
4010 	} else {
4011 		if (!(journal = ext4_get_dev_journal(sb, journal_dev)))
4012 			return -EINVAL;
4013 	}
4014 
4015 	if (!(journal->j_flags & JBD2_BARRIER))
4016 		ext4_msg(sb, KERN_INFO, "barriers disabled");
4017 
4018 	if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
4019 		err = jbd2_journal_update_format(journal);
4020 		if (err)  {
4021 			ext4_msg(sb, KERN_ERR, "error updating journal");
4022 			jbd2_journal_destroy(journal);
4023 			return err;
4024 		}
4025 	}
4026 
4027 	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER))
4028 		err = jbd2_journal_wipe(journal, !really_read_only);
4029 	if (!err) {
4030 		char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL);
4031 		if (save)
4032 			memcpy(save, ((char *) es) +
4033 			       EXT4_S_ERR_START, EXT4_S_ERR_LEN);
4034 		err = jbd2_journal_load(journal);
4035 		if (save)
4036 			memcpy(((char *) es) + EXT4_S_ERR_START,
4037 			       save, EXT4_S_ERR_LEN);
4038 		kfree(save);
4039 	}
4040 
4041 	if (err) {
4042 		ext4_msg(sb, KERN_ERR, "error loading journal");
4043 		jbd2_journal_destroy(journal);
4044 		return err;
4045 	}
4046 
4047 	EXT4_SB(sb)->s_journal = journal;
4048 	ext4_clear_journal_err(sb, es);
4049 
4050 	if (!really_read_only && journal_devnum &&
4051 	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
4052 		es->s_journal_dev = cpu_to_le32(journal_devnum);
4053 
4054 		/* Make sure we flush the recovery flag to disk. */
4055 		ext4_commit_super(sb, 1);
4056 	}
4057 
4058 	return 0;
4059 }
4060 
4061 static int ext4_commit_super(struct super_block *sb, int sync)
4062 {
4063 	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
4064 	struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
4065 	int error = 0;
4066 
4067 	if (!sbh)
4068 		return error;
4069 	if (buffer_write_io_error(sbh)) {
4070 		/*
4071 		 * Oh, dear.  A previous attempt to write the
4072 		 * superblock failed.  This could happen because the
4073 		 * USB device was yanked out.  Or it could happen to
4074 		 * be a transient write error and maybe the block will
4075 		 * be remapped.  Nothing we can do but to retry the
4076 		 * write and hope for the best.
4077 		 */
4078 		ext4_msg(sb, KERN_ERR, "previous I/O error to "
4079 		       "superblock detected");
4080 		clear_buffer_write_io_error(sbh);
4081 		set_buffer_uptodate(sbh);
4082 	}
4083 	/*
4084 	 * If the file system is mounted read-only, don't update the
4085 	 * superblock write time.  This avoids updating the superblock
4086 	 * write time when we are mounting the root file system
4087 	 * read/only but we need to replay the journal; at that point,
4088 	 * for people who are east of GMT and who make their clock
4089 	 * tick in localtime for Windows bug-for-bug compatibility,
4090 	 * the clock is set in the future, and this will cause e2fsck
4091 	 * to complain and force a full file system check.
4092 	 */
4093 	if (!(sb->s_flags & MS_RDONLY))
4094 		es->s_wtime = cpu_to_le32(get_seconds());
4095 	if (sb->s_bdev->bd_part)
4096 		es->s_kbytes_written =
4097 			cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
4098 			    ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
4099 			      EXT4_SB(sb)->s_sectors_written_start) >> 1));
4100 	else
4101 		es->s_kbytes_written =
4102 			cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
4103 	ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
4104 					   &EXT4_SB(sb)->s_freeblocks_counter));
4105 	es->s_free_inodes_count =
4106 		cpu_to_le32(percpu_counter_sum_positive(
4107 				&EXT4_SB(sb)->s_freeinodes_counter));
4108 	sb->s_dirt = 0;
4109 	BUFFER_TRACE(sbh, "marking dirty");
4110 	mark_buffer_dirty(sbh);
4111 	if (sync) {
4112 		error = sync_dirty_buffer(sbh);
4113 		if (error)
4114 			return error;
4115 
4116 		error = buffer_write_io_error(sbh);
4117 		if (error) {
4118 			ext4_msg(sb, KERN_ERR, "I/O error while writing "
4119 			       "superblock");
4120 			clear_buffer_write_io_error(sbh);
4121 			set_buffer_uptodate(sbh);
4122 		}
4123 	}
4124 	return error;
4125 }
4126 
4127 /*
4128  * Have we just finished recovery?  If so, and if we are mounting (or
4129  * remounting) the filesystem readonly, then we will end up with a
4130  * consistent fs on disk.  Record that fact.
4131  */
4132 static void ext4_mark_recovery_complete(struct super_block *sb,
4133 					struct ext4_super_block *es)
4134 {
4135 	journal_t *journal = EXT4_SB(sb)->s_journal;
4136 
4137 	if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
4138 		BUG_ON(journal != NULL);
4139 		return;
4140 	}
4141 	jbd2_journal_lock_updates(journal);
4142 	if (jbd2_journal_flush(journal) < 0)
4143 		goto out;
4144 
4145 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
4146 	    sb->s_flags & MS_RDONLY) {
4147 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
4148 		ext4_commit_super(sb, 1);
4149 	}
4150 
4151 out:
4152 	jbd2_journal_unlock_updates(journal);
4153 }
4154 
4155 /*
4156  * If we are mounting (or read-write remounting) a filesystem whose journal
4157  * has recorded an error from a previous lifetime, move that error to the
4158  * main filesystem now.
4159  */
4160 static void ext4_clear_journal_err(struct super_block *sb,
4161 				   struct ext4_super_block *es)
4162 {
4163 	journal_t *journal;
4164 	int j_errno;
4165 	const char *errstr;
4166 
4167 	BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
4168 
4169 	journal = EXT4_SB(sb)->s_journal;
4170 
4171 	/*
4172 	 * Now check for any error status which may have been recorded in the
4173 	 * journal by a prior ext4_error() or ext4_abort()
4174 	 */
4175 
4176 	j_errno = jbd2_journal_errno(journal);
4177 	if (j_errno) {
4178 		char nbuf[16];
4179 
4180 		errstr = ext4_decode_error(sb, j_errno, nbuf);
4181 		ext4_warning(sb, "Filesystem error recorded "
4182 			     "from previous mount: %s", errstr);
4183 		ext4_warning(sb, "Marking fs in need of filesystem check.");
4184 
4185 		EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
4186 		es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
4187 		ext4_commit_super(sb, 1);
4188 
4189 		jbd2_journal_clear_err(journal);
4190 	}
4191 }
4192 
4193 /*
4194  * Force the running and committing transactions to commit,
4195  * and wait on the commit.
4196  */
4197 int ext4_force_commit(struct super_block *sb)
4198 {
4199 	journal_t *journal;
4200 	int ret = 0;
4201 
4202 	if (sb->s_flags & MS_RDONLY)
4203 		return 0;
4204 
4205 	journal = EXT4_SB(sb)->s_journal;
4206 	if (journal) {
4207 		vfs_check_frozen(sb, SB_FREEZE_TRANS);
4208 		ret = ext4_journal_force_commit(journal);
4209 	}
4210 
4211 	return ret;
4212 }
4213 
/*
 * Write the superblock out (sync flag set) while holding the superblock
 * lock.  The result of the write is not reported to the caller.
 */
static void ext4_write_super(struct super_block *sb)
{
	lock_super(sb);
	(void) ext4_commit_super(sb, 1);
	unlock_super(sb);
}
4220 
4221 static int ext4_sync_fs(struct super_block *sb, int wait)
4222 {
4223 	int ret = 0;
4224 	tid_t target;
4225 	struct ext4_sb_info *sbi = EXT4_SB(sb);
4226 
4227 	trace_ext4_sync_fs(sb, wait);
4228 	flush_workqueue(sbi->dio_unwritten_wq);
4229 	if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
4230 		if (wait)
4231 			jbd2_log_wait_commit(sbi->s_journal, target);
4232 	}
4233 	return ret;
4234 }
4235 
4236 /*
4237  * LVM calls this function before a (read-only) snapshot is created.  This
4238  * gives us a chance to flush the journal completely and mark the fs clean.
4239  *
4240  * Note that only this function cannot bring a filesystem to be in a clean
4241  * state independently, because ext4 prevents a new handle from being started
4242  * by @sb->s_frozen, which stays in an upper layer.  It thus needs help from
4243  * the upper layer.
4244  */
4245 static int ext4_freeze(struct super_block *sb)
4246 {
4247 	int error = 0;
4248 	journal_t *journal;
4249 
4250 	if (sb->s_flags & MS_RDONLY)
4251 		return 0;
4252 
4253 	journal = EXT4_SB(sb)->s_journal;
4254 
4255 	/* Now we set up the journal barrier. */
4256 	jbd2_journal_lock_updates(journal);
4257 
4258 	/*
4259 	 * Don't clear the needs_recovery flag if we failed to flush
4260 	 * the journal.
4261 	 */
4262 	error = jbd2_journal_flush(journal);
4263 	if (error < 0)
4264 		goto out;
4265 
4266 	/* Journal blocked and flushed, clear needs_recovery flag. */
4267 	EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
4268 	error = ext4_commit_super(sb, 1);
4269 out:
4270 	/* we rely on s_frozen to stop further updates */
4271 	jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
4272 	return error;
4273 }
4274 
4275 /*
4276  * Called by LVM after the snapshot is done.  We need to reset the RECOVER
4277  * flag here, even though the filesystem is not technically dirty yet.
4278  */
4279 static int ext4_unfreeze(struct super_block *sb)
4280 {
4281 	if (sb->s_flags & MS_RDONLY)
4282 		return 0;
4283 
4284 	lock_super(sb);
4285 	/* Reset the needs_recovery flag before the fs is unlocked. */
4286 	EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
4287 	ext4_commit_super(sb, 1);
4288 	unlock_super(sb);
4289 	return 0;
4290 }
4291 
4292 /*
4293  * Structure to save mount options for ext4_remount's benefit
4294  */
4295 struct ext4_mount_options {
4296 	unsigned long s_mount_opt;
4297 	unsigned long s_mount_opt2;
4298 	uid_t s_resuid;
4299 	gid_t s_resgid;
4300 	unsigned long s_commit_interval;
4301 	u32 s_min_batch_time, s_max_batch_time;
4302 #ifdef CONFIG_QUOTA
4303 	int s_jquota_fmt;
4304 	char *s_qf_names[MAXQUOTAS];
4305 #endif
4306 };
4307 
/*
 * Remount handler: apply a new option string and/or a change of
 * read-only state to an already mounted filesystem.
 *
 * @sb:    the mounted superblock
 * @flags: requested mount flags; only MS_RDONLY is examined here
 * @data:  option string handed to parse_options(); a copy is kept for
 *         the "re-mounted" log message
 *
 * The current sb flags and mount options are saved first.  If option
 * parsing or any later step fails, control jumps to restore_opts, which
 * puts the saved values back and returns the error.
 *
 * Read-write -> read-only: quotas are suspended, MS_RDONLY is set, the
 * on-disk state is updated from s_mount_state when that still says
 * "valid", and recovery is marked complete if there is a journal.
 *
 * Read-only -> read-write: the feature set, the group descriptor
 * checksums and the orphan list are checked, any journal error is
 * moved to the filesystem, the group is extended if requested,
 * ext4_setup_super() is run, MMP is started when that feature is set,
 * and quotas are resumed after the superblock lock is dropped.
 *
 * Returns 0 on success or a negative errno.
 */
4308 static int ext4_remount(struct super_block *sb, int *flags, char *data)
4309 {
4310 	struct ext4_super_block *es;
4311 	struct ext4_sb_info *sbi = EXT4_SB(sb);
4312 	ext4_fsblk_t n_blocks_count = 0;
4313 	unsigned long old_sb_flags;
4314 	struct ext4_mount_options old_opts;
4315 	int enable_quota = 0;
4316 	ext4_group_t g;
4317 	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
4318 	int err = 0;
4319 #ifdef CONFIG_QUOTA
4320 	int i;
4321 #endif
4322 	char *orig_data = kstrdup(data, GFP_KERNEL);
4323 
4324 	/* Store the original options */
4325 	lock_super(sb);
4326 	old_sb_flags = sb->s_flags;
4327 	old_opts.s_mount_opt = sbi->s_mount_opt;
4328 	old_opts.s_mount_opt2 = sbi->s_mount_opt2;
4329 	old_opts.s_resuid = sbi->s_resuid;
4330 	old_opts.s_resgid = sbi->s_resgid;
4331 	old_opts.s_commit_interval = sbi->s_commit_interval;
4332 	old_opts.s_min_batch_time = sbi->s_min_batch_time;
4333 	old_opts.s_max_batch_time = sbi->s_max_batch_time;
4334 #ifdef CONFIG_QUOTA
4335 	old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
4336 	for (i = 0; i < MAXQUOTAS; i++)
4337 		old_opts.s_qf_names[i] = sbi->s_qf_names[i];
4338 #endif
	/* Start from the journal thread's current I/O priority, if it has one. */
4339 	if (sbi->s_journal && sbi->s_journal->j_task->io_context)
4340 		journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
4341 
4342 	/*
4343 	 * Allow the "check" option to be passed as a remount option.
4344 	 */
4345 	if (!parse_options(data, sb, NULL, &journal_ioprio,
4346 			   &n_blocks_count, 1)) {
4347 		err = -EINVAL;
4348 		goto restore_opts;
4349 	}
4350 
4351 	if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
4352 		ext4_abort(sb, "Abort forced by user");
4353 
	/* Mirror the POSIX_ACL mount option into the MS_POSIXACL sb flag. */
4354 	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
4355 		(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
4356 
4357 	es = sbi->s_es;
4358 
	/* Push the current option values into the live journal. */
4359 	if (sbi->s_journal) {
4360 		ext4_init_journal_params(sb, sbi->s_journal);
4361 		set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
4362 	}
4363 
	/*
	 * The work below is needed only when the read-only state changes
	 * or a block count larger than the current one was requested.
	 */
4364 	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
4365 		n_blocks_count > ext4_blocks_count(es)) {
4366 		if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) {
4367 			err = -EROFS;
4368 			goto restore_opts;
4369 		}
4370 
4371 		if (*flags & MS_RDONLY) {
4372 			err = dquot_suspend(sb, -1);
4373 			if (err < 0)
4374 				goto restore_opts;
4375 
4376 			/*
4377 			 * First of all, the unconditional stuff we have to do
4378 			 * to disable replay of the journal when we next remount
4379 			 */
4380 			sb->s_flags |= MS_RDONLY;
4381 
4382 			/*
4383 			 * OK, test if we are remounting a valid rw partition
4384 			 * readonly, and if so set the rdonly flag and then
4385 			 * mark the partition as valid again.
4386 			 */
4387 			if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) &&
4388 			    (sbi->s_mount_state & EXT4_VALID_FS))
4389 				es->s_state = cpu_to_le16(sbi->s_mount_state);
4390 
4391 			if (sbi->s_journal)
4392 				ext4_mark_recovery_complete(sb, es);
4393 		} else {
4394 			/* Make sure we can mount this feature set readwrite */
4395 			if (!ext4_feature_set_ok(sb, 0)) {
4396 				err = -EROFS;
4397 				goto restore_opts;
4398 			}
4399 			/*
4400 			 * Make sure the group descriptor checksums
4401 			 * are sane.  If they aren't, refuse to remount r/w.
4402 			 */
4403 			for (g = 0; g < sbi->s_groups_count; g++) {
4404 				struct ext4_group_desc *gdp =
4405 					ext4_get_group_desc(sb, g, NULL);
4406 
4407 				if (!ext4_group_desc_csum_verify(sbi, g, gdp)) {
4408 					ext4_msg(sb, KERN_ERR,
4409 	       "ext4_remount: Checksum for group %u failed (%u!=%u)",
4410 		g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
4411 					       le16_to_cpu(gdp->bg_checksum));
4412 					err = -EINVAL;
4413 					goto restore_opts;
4414 				}
4415 			}
4416 
4417 			/*
4418 			 * If we have an unprocessed orphan list hanging
4419 			 * around from a previously readonly bdev mount,
4420 			 * require a full umount/remount for now.
4421 			 */
4422 			if (es->s_last_orphan) {
4423 				ext4_msg(sb, KERN_WARNING, "Couldn't "
4424 				       "remount RDWR because of unprocessed "
4425 				       "orphan inode list.  Please "
4426 				       "umount/remount instead");
4427 				err = -EINVAL;
4428 				goto restore_opts;
4429 			}
4430 
4431 			/*
4432 			 * Mounting a RDONLY partition read-write, so reread
4433 			 * and store the current valid flag.  (It may have
4434 			 * been changed by e2fsck since we originally mounted
4435 			 * the partition.)
4436 			 */
4437 			if (sbi->s_journal)
4438 				ext4_clear_journal_err(sb, es);
4439 			sbi->s_mount_state = le16_to_cpu(es->s_state);
4440 			if ((err = ext4_group_extend(sb, es, n_blocks_count)))
4441 				goto restore_opts;
			/* MS_RDONLY is cleared only if ext4_setup_super() returns 0. */
4442 			if (!ext4_setup_super(sb, es, 0))
4443 				sb->s_flags &= ~MS_RDONLY;
4444 			if (EXT4_HAS_INCOMPAT_FEATURE(sb,
4445 						     EXT4_FEATURE_INCOMPAT_MMP))
4446 				if (ext4_multi_mount_protect(sb,
4447 						le64_to_cpu(es->s_mmp_block))) {
4448 					err = -EROFS;
4449 					goto restore_opts;
4450 				}
			/* Quotas are resumed below, after unlock_super(). */
4451 			enable_quota = 1;
4452 		}
4453 	}
4454 
4455 	/*
4456 	 * Reinitialize lazy itable initialization thread based on
4457 	 * current settings
4458 	 */
4459 	if ((sb->s_flags & MS_RDONLY) || !test_opt(sb, INIT_INODE_TABLE))
4460 		ext4_unregister_li_request(sb);
4461 	else {
4462 		ext4_group_t first_not_zeroed;
4463 		first_not_zeroed = ext4_has_uninit_itable(sb);
4464 		ext4_register_li_request(sb, first_not_zeroed);
4465 	}
4466 
4467 	ext4_setup_system_zone(sb);
	/* With no journal, write the superblock out here. */
4468 	if (sbi->s_journal == NULL)
4469 		ext4_commit_super(sb, 1);
4470 
4471 #ifdef CONFIG_QUOTA
4472 	/* Release old quota file names */
4473 	for (i = 0; i < MAXQUOTAS; i++)
4474 		if (old_opts.s_qf_names[i] &&
4475 		    old_opts.s_qf_names[i] != sbi->s_qf_names[i])
4476 			kfree(old_opts.s_qf_names[i]);
4477 #endif
4478 	unlock_super(sb);
4479 	if (enable_quota)
4480 		dquot_resume(sb, -1);
4481 
4482 	ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
4483 	kfree(orig_data);
4484 	return 0;
4485 
	/* Failure: put back everything that was saved on entry. */
4486 restore_opts:
4487 	sb->s_flags = old_sb_flags;
4488 	sbi->s_mount_opt = old_opts.s_mount_opt;
4489 	sbi->s_mount_opt2 = old_opts.s_mount_opt2;
4490 	sbi->s_resuid = old_opts.s_resuid;
4491 	sbi->s_resgid = old_opts.s_resgid;
4492 	sbi->s_commit_interval = old_opts.s_commit_interval;
4493 	sbi->s_min_batch_time = old_opts.s_min_batch_time;
4494 	sbi->s_max_batch_time = old_opts.s_max_batch_time;
4495 #ifdef CONFIG_QUOTA
4496 	sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
	/*
	 * Free any quota file name that differs from the saved one
	 * (presumably installed by parse_options()), then restore the
	 * saved pointer.
	 */
4497 	for (i = 0; i < MAXQUOTAS; i++) {
4498 		if (sbi->s_qf_names[i] &&
4499 		    old_opts.s_qf_names[i] != sbi->s_qf_names[i])
4500 			kfree(sbi->s_qf_names[i]);
4501 		sbi->s_qf_names[i] = old_opts.s_qf_names[i];
4502 	}
4503 #endif
4504 	unlock_super(sb);
4505 	kfree(orig_data);
4506 	return err;
4507 }
4508 
4509 static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
4510 {
4511 	struct super_block *sb = dentry->d_sb;
4512 	struct ext4_sb_info *sbi = EXT4_SB(sb);
4513 	struct ext4_super_block *es = sbi->s_es;
4514 	u64 fsid;
4515 	s64 bfree;
4516 
4517 	if (test_opt(sb, MINIX_DF)) {
4518 		sbi->s_overhead_last = 0;
4519 	} else if (sbi->s_blocks_last != ext4_blocks_count(es)) {
4520 		ext4_group_t i, ngroups = ext4_get_groups_count(sb);
4521 		ext4_fsblk_t overhead = 0;
4522 
4523 		/*
4524 		 * Compute the overhead (FS structures).  This is constant
4525 		 * for a given filesystem unless the number of block groups
4526 		 * changes so we cache the previous value until it does.
4527 		 */
4528 
4529 		/*
4530 		 * All of the blocks before first_data_block are
4531 		 * overhead
4532 		 */
4533 		overhead = le32_to_cpu(es->s_first_data_block);
4534 
4535 		/*
4536 		 * Add the overhead attributed to the superblock and
4537 		 * block group descriptors.  If the sparse superblocks
4538 		 * feature is turned on, then not all groups have this.
4539 		 */
4540 		for (i = 0; i < ngroups; i++) {
4541 			overhead += ext4_bg_has_super(sb, i) +
4542 				ext4_bg_num_gdb(sb, i);
4543 			cond_resched();
4544 		}
4545 
4546 		/*
4547 		 * Every block group has an inode bitmap, a block
4548 		 * bitmap, and an inode table.
4549 		 */
4550 		overhead += ngroups * (2 + sbi->s_itb_per_group);
4551 		sbi->s_overhead_last = overhead;
4552 		smp_wmb();
4553 		sbi->s_blocks_last = ext4_blocks_count(es);
4554 	}
4555 
4556 	buf->f_type = EXT4_SUPER_MAGIC;
4557 	buf->f_bsize = sb->s_blocksize;
4558 	buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
4559 	bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
4560 		       percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
4561 	/* prevent underflow in case that few free space is available */
4562 	buf->f_bfree = max_t(s64, bfree, 0);
4563 	buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
4564 	if (buf->f_bfree < ext4_r_blocks_count(es))
4565 		buf->f_bavail = 0;
4566 	buf->f_files = le32_to_cpu(es->s_inodes_count);
4567 	buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
4568 	buf->f_namelen = EXT4_NAME_LEN;
4569 	fsid = le64_to_cpup((void *)es->s_uuid) ^
4570 	       le64_to_cpup((void *)es->s_uuid + sizeof(u64));
4571 	buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
4572 	buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
4573 
4574 	return 0;
4575 }
4576 
4577 /* Helper function for writing quotas on sync - we need to start transaction
4578  * before quota file is locked for write. Otherwise there are possible deadlocks:
4579  * Process 1                         Process 2
4580  * ext4_create()                     quota_sync()
4581  *   jbd2_journal_start()                  write_dquot()
4582  *   dquot_initialize()                         down(dqio_mutex)
4583  *     down(dqio_mutex)                    jbd2_journal_start()
4584  *
4585  */
4586 
4587 #ifdef CONFIG_QUOTA
4588 
4589 static inline struct inode *dquot_to_inode(struct dquot *dquot)
4590 {
4591 	return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type];
4592 }
4593 
4594 static int ext4_write_dquot(struct dquot *dquot)
4595 {
4596 	int ret, err;
4597 	handle_t *handle;
4598 	struct inode *inode;
4599 
4600 	inode = dquot_to_inode(dquot);
4601 	handle = ext4_journal_start(inode,
4602 				    EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
4603 	if (IS_ERR(handle))
4604 		return PTR_ERR(handle);
4605 	ret = dquot_commit(dquot);
4606 	err = ext4_journal_stop(handle);
4607 	if (!ret)
4608 		ret = err;
4609 	return ret;
4610 }
4611 
4612 static int ext4_acquire_dquot(struct dquot *dquot)
4613 {
4614 	int ret, err;
4615 	handle_t *handle;
4616 
4617 	handle = ext4_journal_start(dquot_to_inode(dquot),
4618 				    EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
4619 	if (IS_ERR(handle))
4620 		return PTR_ERR(handle);
4621 	ret = dquot_acquire(dquot);
4622 	err = ext4_journal_stop(handle);
4623 	if (!ret)
4624 		ret = err;
4625 	return ret;
4626 }
4627 
4628 static int ext4_release_dquot(struct dquot *dquot)
4629 {
4630 	int ret, err;
4631 	handle_t *handle;
4632 
4633 	handle = ext4_journal_start(dquot_to_inode(dquot),
4634 				    EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
4635 	if (IS_ERR(handle)) {
4636 		/* Release dquot anyway to avoid endless cycle in dqput() */
4637 		dquot_release(dquot);
4638 		return PTR_ERR(handle);
4639 	}
4640 	ret = dquot_release(dquot);
4641 	err = ext4_journal_stop(handle);
4642 	if (!ret)
4643 		ret = err;
4644 	return ret;
4645 }
4646 
4647 static int ext4_mark_dquot_dirty(struct dquot *dquot)
4648 {
4649 	/* Are we journaling quotas? */
4650 	if (EXT4_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
4651 	    EXT4_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
4652 		dquot_mark_dquot_dirty(dquot);
4653 		return ext4_write_dquot(dquot);
4654 	} else {
4655 		return dquot_mark_dquot_dirty(dquot);
4656 	}
4657 }
4658 
4659 static int ext4_write_info(struct super_block *sb, int type)
4660 {
4661 	int ret, err;
4662 	handle_t *handle;
4663 
4664 	/* Data block + inode block */
4665 	handle = ext4_journal_start(sb->s_root->d_inode, 2);
4666 	if (IS_ERR(handle))
4667 		return PTR_ERR(handle);
4668 	ret = dquot_commit_info(sb, type);
4669 	err = ext4_journal_stop(handle);
4670 	if (!ret)
4671 		ret = err;
4672 	return ret;
4673 }
4674 
4675 /*
4676  * Turn on quotas during mount time - we need to find
4677  * the quota file and such...
4678  */
4679 static int ext4_quota_on_mount(struct super_block *sb, int type)
4680 {
4681 	return dquot_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
4682 					EXT4_SB(sb)->s_jquota_fmt, type);
4683 }
4684 
4685 /*
4686  * Standard function to be called on quota_on
4687  */
4688 static int ext4_quota_on(struct super_block *sb, int type, int format_id,
4689 			 struct path *path)
4690 {
4691 	int err;
4692 
4693 	if (!test_opt(sb, QUOTA))
4694 		return -EINVAL;
4695 
4696 	/* Quotafile not on the same filesystem? */
4697 	if (path->mnt->mnt_sb != sb)
4698 		return -EXDEV;
4699 	/* Journaling quota? */
4700 	if (EXT4_SB(sb)->s_qf_names[type]) {
4701 		/* Quotafile not in fs root? */
4702 		if (path->dentry->d_parent != sb->s_root)
4703 			ext4_msg(sb, KERN_WARNING,
4704 				"Quota file not on filesystem root. "
4705 				"Journaled quota will not work");
4706 	}
4707 
4708 	/*
4709 	 * When we journal data on quota file, we have to flush journal to see
4710 	 * all updates to the file when we bypass pagecache...
4711 	 */
4712 	if (EXT4_SB(sb)->s_journal &&
4713 	    ext4_should_journal_data(path->dentry->d_inode)) {
4714 		/*
4715 		 * We don't need to lock updates but journal_flush() could
4716 		 * otherwise be livelocked...
4717 		 */
4718 		jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
4719 		err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
4720 		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
4721 		if (err)
4722 			return err;
4723 	}
4724 
4725 	return dquot_quota_on(sb, type, format_id, path);
4726 }
4727 
4728 static int ext4_quota_off(struct super_block *sb, int type)
4729 {
4730 	struct inode *inode = sb_dqopt(sb)->files[type];
4731 	handle_t *handle;
4732 
4733 	/* Force all delayed allocation blocks to be allocated.
4734 	 * Caller already holds s_umount sem */
4735 	if (test_opt(sb, DELALLOC))
4736 		sync_filesystem(sb);
4737 
4738 	if (!inode)
4739 		goto out;
4740 
4741 	/* Update modification times of quota files when userspace can
4742 	 * start looking at them */
4743 	handle = ext4_journal_start(inode, 1);
4744 	if (IS_ERR(handle))
4745 		goto out;
4746 	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
4747 	ext4_mark_inode_dirty(handle, inode);
4748 	ext4_journal_stop(handle);
4749 
4750 out:
4751 	return dquot_quota_off(sb, type);
4752 }
4753 
4754 /* Read data from quotafile - avoid pagecache and such because we cannot afford
4755  * acquiring the locks... As quota files are never truncated and quota code
4756  * itself serializes the operations (and no one else should touch the files)
4757  * we don't have to be afraid of races */
4758 static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
4759 			       size_t len, loff_t off)
4760 {
4761 	struct inode *inode = sb_dqopt(sb)->files[type];
4762 	ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
4763 	int err = 0;
4764 	int offset = off & (sb->s_blocksize - 1);
4765 	int tocopy;
4766 	size_t toread;
4767 	struct buffer_head *bh;
4768 	loff_t i_size = i_size_read(inode);
4769 
4770 	if (off > i_size)
4771 		return 0;
4772 	if (off+len > i_size)
4773 		len = i_size-off;
4774 	toread = len;
4775 	while (toread > 0) {
4776 		tocopy = sb->s_blocksize - offset < toread ?
4777 				sb->s_blocksize - offset : toread;
4778 		bh = ext4_bread(NULL, inode, blk, 0, &err);
4779 		if (err)
4780 			return err;
4781 		if (!bh)	/* A hole? */
4782 			memset(data, 0, tocopy);
4783 		else
4784 			memcpy(data, bh->b_data+offset, tocopy);
4785 		brelse(bh);
4786 		offset = 0;
4787 		toread -= tocopy;
4788 		data += tocopy;
4789 		blk++;
4790 	}
4791 	return len;
4792 }
4793 
4794 /* Write to quotafile (we know the transaction is already started and has
4795  * enough credits) */
4796 static ssize_t ext4_quota_write(struct super_block *sb, int type,
4797 				const char *data, size_t len, loff_t off)
4798 {
4799 	struct inode *inode = sb_dqopt(sb)->files[type];
4800 	ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
4801 	int err = 0;
4802 	int offset = off & (sb->s_blocksize - 1);
4803 	struct buffer_head *bh;
4804 	handle_t *handle = journal_current_handle();
4805 
4806 	if (EXT4_SB(sb)->s_journal && !handle) {
4807 		ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
4808 			" cancelled because transaction is not started",
4809 			(unsigned long long)off, (unsigned long long)len);
4810 		return -EIO;
4811 	}
4812 	/*
4813 	 * Since we account only one data block in transaction credits,
4814 	 * then it is impossible to cross a block boundary.
4815 	 */
4816 	if (sb->s_blocksize - offset < len) {
4817 		ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
4818 			" cancelled because not block aligned",
4819 			(unsigned long long)off, (unsigned long long)len);
4820 		return -EIO;
4821 	}
4822 
4823 	mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
4824 	bh = ext4_bread(handle, inode, blk, 1, &err);
4825 	if (!bh)
4826 		goto out;
4827 	err = ext4_journal_get_write_access(handle, bh);
4828 	if (err) {
4829 		brelse(bh);
4830 		goto out;
4831 	}
4832 	lock_buffer(bh);
4833 	memcpy(bh->b_data+offset, data, len);
4834 	flush_dcache_page(bh->b_page);
4835 	unlock_buffer(bh);
4836 	err = ext4_handle_dirty_metadata(handle, NULL, bh);
4837 	brelse(bh);
4838 out:
4839 	if (err) {
4840 		mutex_unlock(&inode->i_mutex);
4841 		return err;
4842 	}
4843 	if (inode->i_size < off + len) {
4844 		i_size_write(inode, off + len);
4845 		EXT4_I(inode)->i_disksize = inode->i_size;
4846 		ext4_mark_inode_dirty(handle, inode);
4847 	}
4848 	mutex_unlock(&inode->i_mutex);
4849 	return len;
4850 }
4851 
4852 #endif
4853 
4854 static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
4855 		       const char *dev_name, void *data)
4856 {
4857 	return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super);
4858 }
4859 
#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
/*
 * Register ext2_fs_type.  A failure is only logged; nothing is returned
 * to the caller.
 */
static inline void register_as_ext2(void)
{
	int ret;

	ret = register_filesystem(&ext2_fs_type);
	if (!ret)
		return;
	printk(KERN_WARNING
	       "EXT4-fs: Unable to register as ext2 (%d)\n", ret);
}

/* Undo register_as_ext2(). */
static inline void unregister_as_ext2(void)
{
	unregister_filesystem(&ext2_fs_type);
}

/*
 * Return 1 if the feature flags of @sb are acceptable for an ext2 mount,
 * 0 otherwise.  Any INCOMPAT feature outside EXT2_FEATURE_INCOMPAT_SUPP
 * is rejected; RO_COMPAT features outside EXT2_FEATURE_RO_COMPAT_SUPP
 * are rejected only when the mount is not read-only.
 */
static inline int ext2_feature_set_ok(struct super_block *sb)
{
	if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP))
		return 0;
	return (sb->s_flags & MS_RDONLY) ||
	       !EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP);
}
MODULE_ALIAS("ext2");
#else
/* ext2 emulation not configured: no-op stubs, feature check always fails. */
static inline void register_as_ext2(void) { }
static inline void unregister_as_ext2(void) { }
static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; }
#endif
4890 
#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
/*
 * Register ext3_fs_type.  A failure is only logged; nothing is returned
 * to the caller.
 */
static inline void register_as_ext3(void)
{
	int ret;

	ret = register_filesystem(&ext3_fs_type);
	if (!ret)
		return;
	printk(KERN_WARNING
	       "EXT4-fs: Unable to register as ext3 (%d)\n", ret);
}

/* Undo register_as_ext3(). */
static inline void unregister_as_ext3(void)
{
	unregister_filesystem(&ext3_fs_type);
}

/*
 * Return 1 if the feature flags of @sb are acceptable for an ext3 mount,
 * 0 otherwise.  The filesystem must carry the HAS_JOURNAL compat feature
 * and no INCOMPAT feature outside EXT3_FEATURE_INCOMPAT_SUPP; RO_COMPAT
 * features outside EXT3_FEATURE_RO_COMPAT_SUPP are rejected only when the
 * mount is not read-only.
 */
static inline int ext3_feature_set_ok(struct super_block *sb)
{
	if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP) ||
	    !EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
		return 0;
	return (sb->s_flags & MS_RDONLY) ||
	       !EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP);
}
MODULE_ALIAS("ext3");
#else
/* ext3 emulation not configured: no-op stubs, feature check always fails. */
static inline void register_as_ext3(void) { }
static inline void unregister_as_ext3(void) { }
static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; }
#endif
4923 
4924 static struct file_system_type ext4_fs_type = {
4925 	.owner		= THIS_MODULE,
4926 	.name		= "ext4",
4927 	.mount		= ext4_mount,
4928 	.kill_sb	= kill_block_super,
4929 	.fs_flags	= FS_REQUIRES_DEV,
4930 };
4931 
4932 static int __init ext4_init_feat_adverts(void)
4933 {
4934 	struct ext4_features *ef;
4935 	int ret = -ENOMEM;
4936 
4937 	ef = kzalloc(sizeof(struct ext4_features), GFP_KERNEL);
4938 	if (!ef)
4939 		goto out;
4940 
4941 	ef->f_kobj.kset = ext4_kset;
4942 	init_completion(&ef->f_kobj_unregister);
4943 	ret = kobject_init_and_add(&ef->f_kobj, &ext4_feat_ktype, NULL,
4944 				   "features");
4945 	if (ret) {
4946 		kfree(ef);
4947 		goto out;
4948 	}
4949 
4950 	ext4_feat = ef;
4951 	ret = 0;
4952 out:
4953 	return ret;
4954 }
4955 
4956 static void ext4_exit_feat_adverts(void)
4957 {
4958 	kobject_put(&ext4_feat->f_kobj);
4959 	wait_for_completion(&ext4_feat->f_kobj_unregister);
4960 	kfree(ext4_feat);
4961 }
4962 
/*
 * Shared across all ext4 file systems.  Both arrays have EXT4_WQ_HASH_SZ
 * entries; every entry is initialised in ext4_init_fs().
 */
wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
4966 
4967 static int __init ext4_init_fs(void)
4968 {
4969 	int i, err;
4970 
4971 	ext4_check_flag_values();
4972 
4973 	for (i = 0; i < EXT4_WQ_HASH_SZ; i++) {
4974 		mutex_init(&ext4__aio_mutex[i]);
4975 		init_waitqueue_head(&ext4__ioend_wq[i]);
4976 	}
4977 
4978 	err = ext4_init_pageio();
4979 	if (err)
4980 		return err;
4981 	err = ext4_init_system_zone();
4982 	if (err)
4983 		goto out7;
4984 	ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
4985 	if (!ext4_kset)
4986 		goto out6;
4987 	ext4_proc_root = proc_mkdir("fs/ext4", NULL);
4988 	if (!ext4_proc_root)
4989 		goto out5;
4990 
4991 	err = ext4_init_feat_adverts();
4992 	if (err)
4993 		goto out4;
4994 
4995 	err = ext4_init_mballoc();
4996 	if (err)
4997 		goto out3;
4998 
4999 	err = ext4_init_xattr();
5000 	if (err)
5001 		goto out2;
5002 	err = init_inodecache();
5003 	if (err)
5004 		goto out1;
5005 	register_as_ext3();
5006 	register_as_ext2();
5007 	err = register_filesystem(&ext4_fs_type);
5008 	if (err)
5009 		goto out;
5010 
5011 	ext4_li_info = NULL;
5012 	mutex_init(&ext4_li_mtx);
5013 	return 0;
5014 out:
5015 	unregister_as_ext2();
5016 	unregister_as_ext3();
5017 	destroy_inodecache();
5018 out1:
5019 	ext4_exit_xattr();
5020 out2:
5021 	ext4_exit_mballoc();
5022 out3:
5023 	ext4_exit_feat_adverts();
5024 out4:
5025 	remove_proc_entry("fs/ext4", NULL);
5026 out5:
5027 	kset_unregister(ext4_kset);
5028 out6:
5029 	ext4_exit_system_zone();
5030 out7:
5031 	ext4_exit_pageio();
5032 	return err;
5033 }
5034 
/*
 * Module exit: calls ext4_destroy_lazyinit_thread(), unregisters the
 * ext2/ext3 aliases and the ext4 filesystem type, then releases the
 * module-wide resources in the same order as the error-unwind path of
 * ext4_init_fs().
 */
static void __exit ext4_exit_fs(void)
{
	ext4_destroy_lazyinit_thread();
	unregister_as_ext2();
	unregister_as_ext3();
	unregister_filesystem(&ext4_fs_type);
	destroy_inodecache();
	ext4_exit_xattr();
	ext4_exit_mballoc();
	ext4_exit_feat_adverts();
	remove_proc_entry("fs/ext4", NULL);
	kset_unregister(ext4_kset);
	ext4_exit_system_zone();
	ext4_exit_pageio();
}
5050 
/* Module metadata, plus the init/exit entry points defined in this file. */
MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
MODULE_DESCRIPTION("Fourth Extended Filesystem");
MODULE_LICENSE("GPL");
module_init(ext4_init_fs)
module_exit(ext4_exit_fs)
5056