xref: /openbmc/linux/fs/ext4/extents.c (revision 4209ae12b12265d475bba28634184423149bd14f)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
4  * Written by Alex Tomas <alex@clusterfs.com>
5  *
6  * Architecture independence:
7  *   Copyright (c) 2005, Bull S.A.
8  *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
9  */
10 
11 /*
12  * Extents support for EXT4
13  *
14  * TODO:
15  *   - ext4*_error() should be used in some situations
16  *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
17  *   - smart tree reduction
18  */
19 
20 #include <linux/fs.h>
21 #include <linux/time.h>
22 #include <linux/jbd2.h>
23 #include <linux/highuid.h>
24 #include <linux/pagemap.h>
25 #include <linux/quotaops.h>
26 #include <linux/string.h>
27 #include <linux/slab.h>
28 #include <linux/uaccess.h>
29 #include <linux/fiemap.h>
30 #include <linux/backing-dev.h>
31 #include <linux/iomap.h>
32 #include "ext4_jbd2.h"
33 #include "ext4_extents.h"
34 #include "xattr.h"
35 
36 #include <trace/events/ext4.h>
37 
38 /*
39  * used by extent splitting.
40  */
41 #define EXT4_EXT_MAY_ZEROOUT	0x1  /* safe to zeroout if split fails \
42 					due to ENOSPC */
43 #define EXT4_EXT_MARK_UNWRIT1	0x2  /* mark first half unwritten */
44 #define EXT4_EXT_MARK_UNWRIT2	0x4  /* mark second half unwritten */
45 
46 #define EXT4_EXT_DATA_VALID1	0x8  /* first half contains valid data */
47 #define EXT4_EXT_DATA_VALID2	0x10 /* second half contains valid data */
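
/*
 * These flags are OR'd together into the split_flag argument of
 * ext4_split_extent_at() (declared below): the MAY_ZEROOUT bit allows
 * falling back to zeroing out the extent if the split runs out of
 * space, while the UNWRIT/DATA_VALID bits describe how each half of
 * the split should be marked.
 */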
48 
49 static __le32 ext4_extent_block_csum(struct inode *inode,
50 				     struct ext4_extent_header *eh)
51 {
52 	struct ext4_inode_info *ei = EXT4_I(inode);
53 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
54 	__u32 csum;
55 
56 	csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
57 			   EXT4_EXTENT_TAIL_OFFSET(eh));
58 	return cpu_to_le32(csum);
59 }
60 
61 static int ext4_extent_block_csum_verify(struct inode *inode,
62 					 struct ext4_extent_header *eh)
63 {
64 	struct ext4_extent_tail *et;
65 
66 	if (!ext4_has_metadata_csum(inode->i_sb))
67 		return 1;
68 
69 	et = find_ext4_extent_tail(eh);
70 	if (et->et_checksum != ext4_extent_block_csum(inode, eh))
71 		return 0;
72 	return 1;
73 }
74 
75 static void ext4_extent_block_csum_set(struct inode *inode,
76 				       struct ext4_extent_header *eh)
77 {
78 	struct ext4_extent_tail *et;
79 
80 	if (!ext4_has_metadata_csum(inode->i_sb))
81 		return;
82 
83 	et = find_ext4_extent_tail(eh);
84 	et->et_checksum = ext4_extent_block_csum(inode, eh);
85 }
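
/*
 * Extent block checksums: the checksum covers the extent header and its
 * entries (everything up to EXT4_EXTENT_TAIL_OFFSET()) and is stored in
 * the struct ext4_extent_tail at the very end of the block.  Nothing is
 * computed or verified on filesystems without metadata checksums.
 */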
86 
87 static int ext4_split_extent_at(handle_t *handle,
88 			     struct inode *inode,
89 			     struct ext4_ext_path **ppath,
90 			     ext4_lblk_t split,
91 			     int split_flag,
92 			     int flags);
93 
94 static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
95 {
96 	/*
97 	 * Drop i_data_sem to avoid deadlock with ext4_map_blocks.  At this
98 	 * moment, get_block can be called only for blocks inside i_size since
99 	 * page cache has already been dropped and writes are blocked by
100 	 * i_rwsem. So we can safely drop the i_data_sem here.
101 	 */
102 	BUG_ON(EXT4_JOURNAL(inode) == NULL);
103 	ext4_discard_preallocations(inode);
104 	up_write(&EXT4_I(inode)->i_data_sem);
105 	*dropped = 1;
106 	return 0;
107 }
108 
109 /*
110  * Make sure 'handle' has at least 'check_cred' credits. If not, restart the
111  * transaction with 'restart_cred' credits and 'revoke_cred' revoke credits.
112  * The function drops i_data_sem while the transaction is being restarted and
113  * re-acquires it afterwards.
114  *
115  * The function returns 0 on success, 1 if the transaction had to be restarted,
116  * and < 0 in case of a fatal error.
116  */
117 int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
118 				int check_cred, int restart_cred,
119 				int revoke_cred)
120 {
121 	int ret;
122 	int dropped = 0;
123 
124 	ret = ext4_journal_ensure_credits_fn(handle, check_cred, restart_cred,
125 		revoke_cred, ext4_ext_trunc_restart_fn(inode, &dropped));
126 	if (dropped)
127 		down_write(&EXT4_I(inode)->i_data_sem);
128 	return ret;
129 }
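
/*
 * A minimal usage sketch (hypothetical caller, not from this file): a
 * loop that removes extents under i_data_sem would top up its handle
 * before each iteration and cope with the transaction being restarted:
 *
 *	err = ext4_datasem_ensure_credits(handle, inode, needed,
 *					  needed, revoke);
 *	if (err < 0)
 *		goto out;	// fatal error
 *	// err == 1 means the transaction was restarted and i_data_sem was
 *	// dropped and re-acquired; cached extent paths must be revalidated.
 */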
130 
131 /*
132  * could return:
133  *  - EROFS
134  *  - ENOMEM
135  */
136 static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
137 				struct ext4_ext_path *path)
138 {
139 	if (path->p_bh) {
140 		/* path points to block */
141 		BUFFER_TRACE(path->p_bh, "get_write_access");
142 		return ext4_journal_get_write_access(handle, path->p_bh);
143 	}
144 	/* path points to leaf/index in inode body */
145 	/* we use in-core data, no need to protect them */
146 	return 0;
147 }
148 
149 /*
150  * could return:
151  *  - EROFS
152  *  - ENOMEM
153  *  - EIO
154  */
155 static int __ext4_ext_dirty(const char *where, unsigned int line,
156 			    handle_t *handle, struct inode *inode,
157 			    struct ext4_ext_path *path)
158 {
159 	int err;
160 
161 	WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
162 	if (path->p_bh) {
163 		ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
164 		/* path points to block */
165 		err = __ext4_handle_dirty_metadata(where, line, handle,
166 						   inode, path->p_bh);
167 	} else {
168 		/* path points to leaf/index in inode body */
169 		err = ext4_mark_inode_dirty(handle, inode);
170 	}
171 	return err;
172 }
173 
174 #define ext4_ext_dirty(handle, inode, path) \
175 		__ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
176 
177 static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
178 			      struct ext4_ext_path *path,
179 			      ext4_lblk_t block)
180 {
181 	if (path) {
182 		int depth = path->p_depth;
183 		struct ext4_extent *ex;
184 
185 		/*
186 		 * Try to predict block placement assuming that we are
187 		 * filling in a file which will eventually be
188 		 * non-sparse --- i.e., in the case of libbfd writing
189 		 * an ELF object's sections out-of-order but in a way
190 		 * that eventually results in a contiguous object or
191 		 * executable file, or some database extending a table
192 		 * space file.  However, this is actually somewhat
193 		 * non-ideal if we are writing a sparse file such as
194 		 * qemu or KVM writing a raw image file that is going
195 		 * to stay fairly sparse, since it will end up
196 		 * fragmenting the file system's free space.  Maybe we
197 		 * should have some heuristics or some way to allow
198 		 * userspace to pass a hint to the file system,
199 		 * especially if the latter case turns out to be
200 		 * common.
201 		 */
202 		ex = path[depth].p_ext;
203 		if (ex) {
204 			ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
205 			ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
206 
207 			if (block > ext_block)
208 				return ext_pblk + (block - ext_block);
209 			else
210 				return ext_pblk - (ext_block - block);
211 		}
212 
213 		/* it looks like the index is empty;
214 		 * try to find the starting block from the index itself */
215 		if (path[depth].p_bh)
216 			return path[depth].p_bh->b_blocknr;
217 	}
218 
219 	/* OK. use inode's group */
220 	return ext4_inode_to_goal_block(inode);
221 }
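
/*
 * In short, the goal heuristic above prefers a physical block adjacent
 * to the nearest extent found in the path, then falls back to the
 * location of the index block itself, and finally to the inode's
 * block-group goal returned by ext4_inode_to_goal_block().
 */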
222 
223 /*
224  * Allocation for a metadata block
225  */
226 static ext4_fsblk_t
227 ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
228 			struct ext4_ext_path *path,
229 			struct ext4_extent *ex, int *err, unsigned int flags)
230 {
231 	ext4_fsblk_t goal, newblock;
232 
233 	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
234 	newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
235 					NULL, err);
236 	return newblock;
237 }
238 
239 static inline int ext4_ext_space_block(struct inode *inode, int check)
240 {
241 	int size;
242 
243 	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
244 			/ sizeof(struct ext4_extent);
245 #ifdef AGGRESSIVE_TEST
246 	if (!check && size > 6)
247 		size = 6;
248 #endif
249 	return size;
250 }
251 
252 static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
253 {
254 	int size;
255 
256 	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
257 			/ sizeof(struct ext4_extent_idx);
258 #ifdef AGGRESSIVE_TEST
259 	if (!check && size > 5)
260 		size = 5;
261 #endif
262 	return size;
263 }
264 
265 static inline int ext4_ext_space_root(struct inode *inode, int check)
266 {
267 	int size;
268 
269 	size = sizeof(EXT4_I(inode)->i_data);
270 	size -= sizeof(struct ext4_extent_header);
271 	size /= sizeof(struct ext4_extent);
272 #ifdef AGGRESSIVE_TEST
273 	if (!check && size > 3)
274 		size = 3;
275 #endif
276 	return size;
277 }
278 
279 static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
280 {
281 	int size;
282 
283 	size = sizeof(EXT4_I(inode)->i_data);
284 	size -= sizeof(struct ext4_extent_header);
285 	size /= sizeof(struct ext4_extent_idx);
286 #ifdef AGGRESSIVE_TEST
287 	if (!check && size > 4)
288 		size = 4;
289 #endif
290 	return size;
291 }
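
/*
 * Worked example (assuming a 4KiB block size and the usual 12-byte
 * on-disk structures): an extent tree block holds
 * (4096 - sizeof(header)) / 12 = 340 extents or indexes, while the
 * 60-byte i_data root holds (60 - 12) / 12 = 4 entries.  The
 * AGGRESSIVE_TEST limits above shrink these numbers so that deep
 * trees can be exercised with tiny files.
 */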
292 
293 static inline int
294 ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
295 			   struct ext4_ext_path **ppath, ext4_lblk_t lblk,
296 			   int nofail)
297 {
298 	struct ext4_ext_path *path = *ppath;
299 	int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);
300 
301 	return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ?
302 			EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0,
303 			EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO |
304 			(nofail ? EXT4_GET_BLOCKS_METADATA_NOFAIL:0));
305 }
306 
307 static int
308 ext4_ext_max_entries(struct inode *inode, int depth)
309 {
310 	int max;
311 
312 	if (depth == ext_depth(inode)) {
313 		if (depth == 0)
314 			max = ext4_ext_space_root(inode, 1);
315 		else
316 			max = ext4_ext_space_root_idx(inode, 1);
317 	} else {
318 		if (depth == 0)
319 			max = ext4_ext_space_block(inode, 1);
320 		else
321 			max = ext4_ext_space_block_idx(inode, 1);
322 	}
323 
324 	return max;
325 }
326 
327 static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
328 {
329 	ext4_fsblk_t block = ext4_ext_pblock(ext);
330 	int len = ext4_ext_get_actual_len(ext);
331 	ext4_lblk_t lblock = le32_to_cpu(ext->ee_block);
332 
333 	/*
334 	 * We allow neither:
335 	 *  - zero length
336 	 *  - overflow/wrap-around
337 	 */
338 	if (lblock + len <= lblock)
339 		return 0;
340 	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
341 }
342 
343 static int ext4_valid_extent_idx(struct inode *inode,
344 				struct ext4_extent_idx *ext_idx)
345 {
346 	ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
347 
348 	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
349 }
350 
351 static int ext4_valid_extent_entries(struct inode *inode,
352 				     struct ext4_extent_header *eh,
353 				     ext4_fsblk_t *pblk, int depth)
354 {
355 	unsigned short entries;
356 	if (eh->eh_entries == 0)
357 		return 1;
358 
359 	entries = le16_to_cpu(eh->eh_entries);
360 
361 	if (depth == 0) {
362 		/* leaf entries */
363 		struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
364 		ext4_lblk_t lblock = 0;
365 		ext4_lblk_t prev = 0;
366 		int len = 0;
367 		while (entries) {
368 			if (!ext4_valid_extent(inode, ext))
369 				return 0;
370 
371 			/* Check for overlapping extents */
372 			lblock = le32_to_cpu(ext->ee_block);
373 			len = ext4_ext_get_actual_len(ext);
374 			if ((lblock <= prev) && prev) {
375 				*pblk = ext4_ext_pblock(ext);
376 				return 0;
377 			}
378 			ext++;
379 			entries--;
380 			prev = lblock + len - 1;
381 		}
382 	} else {
383 		struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
384 		while (entries) {
385 			if (!ext4_valid_extent_idx(inode, ext_idx))
386 				return 0;
387 			ext_idx++;
388 			entries--;
389 		}
390 	}
391 	return 1;
392 }
393 
394 static int __ext4_ext_check(const char *function, unsigned int line,
395 			    struct inode *inode, struct ext4_extent_header *eh,
396 			    int depth, ext4_fsblk_t pblk)
397 {
398 	const char *error_msg;
399 	int max = 0, err = -EFSCORRUPTED;
400 
401 	if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
402 		error_msg = "invalid magic";
403 		goto corrupted;
404 	}
405 	if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
406 		error_msg = "unexpected eh_depth";
407 		goto corrupted;
408 	}
409 	if (unlikely(eh->eh_max == 0)) {
410 		error_msg = "invalid eh_max";
411 		goto corrupted;
412 	}
413 	max = ext4_ext_max_entries(inode, depth);
414 	if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
415 		error_msg = "too large eh_max";
416 		goto corrupted;
417 	}
418 	if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
419 		error_msg = "invalid eh_entries";
420 		goto corrupted;
421 	}
422 	if (!ext4_valid_extent_entries(inode, eh, &pblk, depth)) {
423 		error_msg = "invalid extent entries";
424 		goto corrupted;
425 	}
426 	if (unlikely(depth > 32)) {
427 		error_msg = "too large eh_depth";
428 		goto corrupted;
429 	}
430 	/* Verify checksum on non-root extent tree nodes */
431 	if (ext_depth(inode) != depth &&
432 	    !ext4_extent_block_csum_verify(inode, eh)) {
433 		error_msg = "extent tree corrupted";
434 		err = -EFSBADCRC;
435 		goto corrupted;
436 	}
437 	return 0;
438 
439 corrupted:
440 	ext4_error_inode_err(inode, function, line, 0, -err,
441 			     "pblk %llu bad header/extent: %s - magic %x, "
442 			     "entries %u, max %u(%u), depth %u(%u)",
443 			     (unsigned long long) pblk, error_msg,
444 			     le16_to_cpu(eh->eh_magic),
445 			     le16_to_cpu(eh->eh_entries),
446 			     le16_to_cpu(eh->eh_max),
447 			     max, le16_to_cpu(eh->eh_depth), depth);
448 	return err;
449 }
450 
451 #define ext4_ext_check(inode, eh, depth, pblk)			\
452 	__ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk))
453 
454 int ext4_ext_check_inode(struct inode *inode)
455 {
456 	return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0);
457 }
458 
459 static void ext4_cache_extents(struct inode *inode,
460 			       struct ext4_extent_header *eh)
461 {
462 	struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
463 	ext4_lblk_t prev = 0;
464 	int i;
465 
466 	for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
467 		unsigned int status = EXTENT_STATUS_WRITTEN;
468 		ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
469 		int len = ext4_ext_get_actual_len(ex);
470 
471 		if (prev && (prev != lblk))
472 			ext4_es_cache_extent(inode, prev, lblk - prev, ~0,
473 					     EXTENT_STATUS_HOLE);
474 
475 		if (ext4_ext_is_unwritten(ex))
476 			status = EXTENT_STATUS_UNWRITTEN;
477 		ext4_es_cache_extent(inode, lblk, len,
478 				     ext4_ext_pblock(ex), status);
479 		prev = lblk + len;
480 	}
481 }
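
/*
 * Note that ext4_cache_extents() also records the gaps between
 * consecutive extents as EXTENT_STATUS_HOLE entries, so that later
 * lookups for unmapped blocks covered by this leaf can be answered
 * from the extent status tree without re-reading the leaf.
 */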
482 
483 static struct buffer_head *
484 __read_extent_tree_block(const char *function, unsigned int line,
485 			 struct inode *inode, ext4_fsblk_t pblk, int depth,
486 			 int flags)
487 {
488 	struct buffer_head		*bh;
489 	int				err;
490 
491 	bh = sb_getblk_gfp(inode->i_sb, pblk, __GFP_MOVABLE | GFP_NOFS);
492 	if (unlikely(!bh))
493 		return ERR_PTR(-ENOMEM);
494 
495 	if (!bh_uptodate_or_lock(bh)) {
496 		trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
497 		err = bh_submit_read(bh);
498 		if (err < 0)
499 			goto errout;
500 	}
501 	if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
502 		return bh;
503 	if (!ext4_has_feature_journal(inode->i_sb) ||
504 	    (inode->i_ino !=
505 	     le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) {
506 		err = __ext4_ext_check(function, line, inode,
507 				       ext_block_hdr(bh), depth, pblk);
508 		if (err)
509 			goto errout;
510 	}
511 	set_buffer_verified(bh);
512 	/*
513 	 * If this is a leaf block, cache all of its entries
514 	 */
515 	if (!(flags & EXT4_EX_NOCACHE) && depth == 0) {
516 		struct ext4_extent_header *eh = ext_block_hdr(bh);
517 		ext4_cache_extents(inode, eh);
518 	}
519 	return bh;
520 errout:
521 	put_bh(bh);
522 	return ERR_PTR(err);
523 
524 }
525 
526 #define read_extent_tree_block(inode, pblk, depth, flags)		\
527 	__read_extent_tree_block(__func__, __LINE__, (inode), (pblk),   \
528 				 (depth), (flags))
529 
530 /*
531  * This function is called to cache a file's extent information in the
532  * extent status tree
533  */
534 int ext4_ext_precache(struct inode *inode)
535 {
536 	struct ext4_inode_info *ei = EXT4_I(inode);
537 	struct ext4_ext_path *path = NULL;
538 	struct buffer_head *bh;
539 	int i = 0, depth, ret = 0;
540 
541 	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
542 		return 0;	/* not an extent-mapped inode */
543 
544 	down_read(&ei->i_data_sem);
545 	depth = ext_depth(inode);
546 
547 	/* Don't cache anything if there are no external extent blocks */
548 	if (!depth) {
549 		up_read(&ei->i_data_sem);
550 		return ret;
551 	}
552 
553 	path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
554 		       GFP_NOFS);
555 	if (path == NULL) {
556 		up_read(&ei->i_data_sem);
557 		return -ENOMEM;
558 	}
559 
560 	path[0].p_hdr = ext_inode_hdr(inode);
561 	ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
562 	if (ret)
563 		goto out;
564 	path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr);
565 	while (i >= 0) {
566 		/*
567 		 * If this is a leaf block or we've reached the end of
568 		 * the index block, go up
569 		 */
570 		if ((i == depth) ||
571 		    path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
572 			brelse(path[i].p_bh);
573 			path[i].p_bh = NULL;
574 			i--;
575 			continue;
576 		}
577 		bh = read_extent_tree_block(inode,
578 					    ext4_idx_pblock(path[i].p_idx++),
579 					    depth - i - 1,
580 					    EXT4_EX_FORCE_CACHE);
581 		if (IS_ERR(bh)) {
582 			ret = PTR_ERR(bh);
583 			break;
584 		}
585 		i++;
586 		path[i].p_bh = bh;
587 		path[i].p_hdr = ext_block_hdr(bh);
588 		path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr);
589 	}
590 	ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
591 out:
592 	up_read(&ei->i_data_sem);
593 	ext4_ext_drop_refs(path);
594 	kfree(path);
595 	return ret;
596 }
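
/*
 * The walk above is an iterative depth-first traversal: path[] doubles
 * as an explicit stack and p_idx remembers the next child index to
 * visit at each level, while EXT4_EX_FORCE_CACHE makes every leaf's
 * extents land in the extent status tree even if the buffer had
 * already been verified.
 */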
597 
598 #ifdef EXT_DEBUG
599 static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
600 {
601 	int k, l = path->p_depth;
602 
603 	ext_debug("path:");
604 	for (k = 0; k <= l; k++, path++) {
605 		if (path->p_idx) {
606 			ext_debug("  %d->%llu",
607 				  le32_to_cpu(path->p_idx->ei_block),
608 				  ext4_idx_pblock(path->p_idx));
609 		} else if (path->p_ext) {
610 			ext_debug("  %d:[%d]%d:%llu ",
611 				  le32_to_cpu(path->p_ext->ee_block),
612 				  ext4_ext_is_unwritten(path->p_ext),
613 				  ext4_ext_get_actual_len(path->p_ext),
614 				  ext4_ext_pblock(path->p_ext));
615 		} else
616 			ext_debug("  []");
617 	}
618 	ext_debug("\n");
619 }
620 
621 static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
622 {
623 	int depth = ext_depth(inode);
624 	struct ext4_extent_header *eh;
625 	struct ext4_extent *ex;
626 	int i;
627 
628 	if (!path)
629 		return;
630 
631 	eh = path[depth].p_hdr;
632 	ex = EXT_FIRST_EXTENT(eh);
633 
634 	ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
635 
636 	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
637 		ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
638 			  ext4_ext_is_unwritten(ex),
639 			  ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
640 	}
641 	ext_debug("\n");
642 }
643 
644 static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
645 			ext4_fsblk_t newblock, int level)
646 {
647 	int depth = ext_depth(inode);
648 	struct ext4_extent *ex;
649 
650 	if (depth != level) {
651 		struct ext4_extent_idx *idx;
652 		idx = path[level].p_idx;
653 		while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
654 			ext_debug("%d: move %d:%llu in new index %llu\n", level,
655 					le32_to_cpu(idx->ei_block),
656 					ext4_idx_pblock(idx),
657 					newblock);
658 			idx++;
659 		}
660 
661 		return;
662 	}
663 
664 	ex = path[depth].p_ext;
665 	while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
666 		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
667 				le32_to_cpu(ex->ee_block),
668 				ext4_ext_pblock(ex),
669 				ext4_ext_is_unwritten(ex),
670 				ext4_ext_get_actual_len(ex),
671 				newblock);
672 		ex++;
673 	}
674 }
675 
676 #else
677 #define ext4_ext_show_path(inode, path)
678 #define ext4_ext_show_leaf(inode, path)
679 #define ext4_ext_show_move(inode, path, newblock, level)
680 #endif
681 
682 void ext4_ext_drop_refs(struct ext4_ext_path *path)
683 {
684 	int depth, i;
685 
686 	if (!path)
687 		return;
688 	depth = path->p_depth;
689 	for (i = 0; i <= depth; i++, path++) {
690 		if (path->p_bh) {
691 			brelse(path->p_bh);
692 			path->p_bh = NULL;
693 		}
694 	}
695 }
696 
697 /*
698  * ext4_ext_binsearch_idx:
699  * binary search for the closest index of the given block;
700  * the header must be checked before calling this
701  */
702 static void
703 ext4_ext_binsearch_idx(struct inode *inode,
704 			struct ext4_ext_path *path, ext4_lblk_t block)
705 {
706 	struct ext4_extent_header *eh = path->p_hdr;
707 	struct ext4_extent_idx *r, *l, *m;
708 
709 
710 	ext_debug("binsearch for %u(idx):  ", block);
711 
712 	l = EXT_FIRST_INDEX(eh) + 1;
713 	r = EXT_LAST_INDEX(eh);
714 	while (l <= r) {
715 		m = l + (r - l) / 2;
716 		if (block < le32_to_cpu(m->ei_block))
717 			r = m - 1;
718 		else
719 			l = m + 1;
720 		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
721 				m, le32_to_cpu(m->ei_block),
722 				r, le32_to_cpu(r->ei_block));
723 	}
724 
725 	path->p_idx = l - 1;
726 	ext_debug("  -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
727 		  ext4_idx_pblock(path->p_idx));
728 
729 #ifdef CHECK_BINSEARCH
730 	{
731 		struct ext4_extent_idx *chix, *ix;
732 		int k;
733 
734 		chix = ix = EXT_FIRST_INDEX(eh);
735 		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
736 			if (k != 0 && le32_to_cpu(ix->ei_block) <=
737 			    le32_to_cpu(ix[-1].ei_block)) {
738 				printk(KERN_DEBUG "k=%d, ix=0x%p, "
739 				       "first=0x%p\n", k,
740 				       ix, EXT_FIRST_INDEX(eh));
741 				printk(KERN_DEBUG "%u <= %u\n",
742 				       le32_to_cpu(ix->ei_block),
743 				       le32_to_cpu(ix[-1].ei_block));
744 			}
745 			BUG_ON(k && le32_to_cpu(ix->ei_block)
746 					   <= le32_to_cpu(ix[-1].ei_block));
747 			if (block < le32_to_cpu(ix->ei_block))
748 				break;
749 			chix = ix;
750 		}
751 		BUG_ON(chix != path->p_idx);
752 	}
753 #endif
754 
755 }
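
/*
 * The search above finds the rightmost index whose ei_block does not
 * exceed the target block (or EXT_FIRST_INDEX() if the block precedes
 * every index).  Starting 'l' at EXT_FIRST_INDEX(eh) + 1 guarantees
 * that 'l - 1' is always a valid entry to leave in p_idx.
 */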
756 
757 /*
758  * ext4_ext_binsearch:
759  * binary search for the closest extent of the given block;
760  * the header must be checked before calling this
761  */
762 static void
763 ext4_ext_binsearch(struct inode *inode,
764 		struct ext4_ext_path *path, ext4_lblk_t block)
765 {
766 	struct ext4_extent_header *eh = path->p_hdr;
767 	struct ext4_extent *r, *l, *m;
768 
769 	if (eh->eh_entries == 0) {
770 		/*
771 		 * this leaf is empty:
772 		 * we get such a leaf in split/add case
773 		 */
774 		return;
775 	}
776 
777 	ext_debug("binsearch for %u:  ", block);
778 
779 	l = EXT_FIRST_EXTENT(eh) + 1;
780 	r = EXT_LAST_EXTENT(eh);
781 
782 	while (l <= r) {
783 		m = l + (r - l) / 2;
784 		if (block < le32_to_cpu(m->ee_block))
785 			r = m - 1;
786 		else
787 			l = m + 1;
788 		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
789 				m, le32_to_cpu(m->ee_block),
790 				r, le32_to_cpu(r->ee_block));
791 	}
792 
793 	path->p_ext = l - 1;
794 	ext_debug("  -> %d:%llu:[%d]%d ",
795 			le32_to_cpu(path->p_ext->ee_block),
796 			ext4_ext_pblock(path->p_ext),
797 			ext4_ext_is_unwritten(path->p_ext),
798 			ext4_ext_get_actual_len(path->p_ext));
799 
800 #ifdef CHECK_BINSEARCH
801 	{
802 		struct ext4_extent *chex, *ex;
803 		int k;
804 
805 		chex = ex = EXT_FIRST_EXTENT(eh);
806 		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
807 			BUG_ON(k && le32_to_cpu(ex->ee_block)
808 					  <= le32_to_cpu(ex[-1].ee_block));
809 			if (block < le32_to_cpu(ex->ee_block))
810 				break;
811 			chex = ex;
812 		}
813 		BUG_ON(chex != path->p_ext);
814 	}
815 #endif
816 
817 }
818 
819 void ext4_ext_tree_init(handle_t *handle, struct inode *inode)
820 {
821 	struct ext4_extent_header *eh;
822 
823 	eh = ext_inode_hdr(inode);
824 	eh->eh_depth = 0;
825 	eh->eh_entries = 0;
826 	eh->eh_magic = EXT4_EXT_MAGIC;
827 	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
828 	ext4_mark_inode_dirty(handle, inode);
829 }
830 
831 struct ext4_ext_path *
832 ext4_find_extent(struct inode *inode, ext4_lblk_t block,
833 		 struct ext4_ext_path **orig_path, int flags)
834 {
835 	struct ext4_extent_header *eh;
836 	struct buffer_head *bh;
837 	struct ext4_ext_path *path = orig_path ? *orig_path : NULL;
838 	short int depth, i, ppos = 0;
839 	int ret;
840 
841 	eh = ext_inode_hdr(inode);
842 	depth = ext_depth(inode);
843 	if (depth < 0 || depth > EXT4_MAX_EXTENT_DEPTH) {
844 		EXT4_ERROR_INODE(inode, "inode has invalid extent depth: %d",
845 				 depth);
846 		ret = -EFSCORRUPTED;
847 		goto err;
848 	}
849 
850 	if (path) {
851 		ext4_ext_drop_refs(path);
852 		if (depth > path[0].p_maxdepth) {
853 			kfree(path);
854 			*orig_path = path = NULL;
855 		}
856 	}
857 	if (!path) {
858 		/* account possible depth increase */
859 		path = kcalloc(depth + 2, sizeof(struct ext4_ext_path),
860 				GFP_NOFS);
861 		if (unlikely(!path))
862 			return ERR_PTR(-ENOMEM);
863 		path[0].p_maxdepth = depth + 1;
864 	}
865 	path[0].p_hdr = eh;
866 	path[0].p_bh = NULL;
867 
868 	i = depth;
869 	if (!(flags & EXT4_EX_NOCACHE) && depth == 0)
870 		ext4_cache_extents(inode, eh);
871 	/* walk through the tree */
872 	while (i) {
873 		ext_debug("depth %d: num %d, max %d\n",
874 			  ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
875 
876 		ext4_ext_binsearch_idx(inode, path + ppos, block);
877 		path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
878 		path[ppos].p_depth = i;
879 		path[ppos].p_ext = NULL;
880 
881 		bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
882 					    flags);
883 		if (IS_ERR(bh)) {
884 			ret = PTR_ERR(bh);
885 			goto err;
886 		}
887 
888 		eh = ext_block_hdr(bh);
889 		ppos++;
890 		path[ppos].p_bh = bh;
891 		path[ppos].p_hdr = eh;
892 	}
893 
894 	path[ppos].p_depth = i;
895 	path[ppos].p_ext = NULL;
896 	path[ppos].p_idx = NULL;
897 
898 	/* find extent */
899 	ext4_ext_binsearch(inode, path + ppos, block);
900 	/* if not an empty leaf */
901 	if (path[ppos].p_ext)
902 		path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
903 
904 	ext4_ext_show_path(inode, path);
905 
906 	return path;
907 
908 err:
909 	ext4_ext_drop_refs(path);
910 	kfree(path);
911 	if (orig_path)
912 		*orig_path = NULL;
913 	return ERR_PTR(ret);
914 }
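
/*
 * A minimal usage sketch (hypothetical, for illustration only): callers
 * look up a logical block, inspect the leaf entry, and release the path
 * themselves when they did not pass one back in via @orig_path:
 *
 *	struct ext4_ext_path *path;
 *	struct ext4_extent *ex;
 *
 *	path = ext4_find_extent(inode, lblk, NULL, 0);
 *	if (IS_ERR(path))
 *		return PTR_ERR(path);
 *	ex = path[ext_depth(inode)].p_ext;	// may be NULL (empty leaf)
 *	...
 *	ext4_ext_drop_refs(path);
 *	kfree(path);
 */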
915 
916 /*
917  * ext4_ext_insert_index:
918  * insert new index [@logical;@ptr] into the block at @curp;
919  * check where to insert: before @curp or after @curp
920  */
921 static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
922 				 struct ext4_ext_path *curp,
923 				 int logical, ext4_fsblk_t ptr)
924 {
925 	struct ext4_extent_idx *ix;
926 	int len, err;
927 
928 	err = ext4_ext_get_access(handle, inode, curp);
929 	if (err)
930 		return err;
931 
932 	if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
933 		EXT4_ERROR_INODE(inode,
934 				 "logical %d == ei_block %d!",
935 				 logical, le32_to_cpu(curp->p_idx->ei_block));
936 		return -EFSCORRUPTED;
937 	}
938 
939 	if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
940 			     >= le16_to_cpu(curp->p_hdr->eh_max))) {
941 		EXT4_ERROR_INODE(inode,
942 				 "eh_entries %d >= eh_max %d!",
943 				 le16_to_cpu(curp->p_hdr->eh_entries),
944 				 le16_to_cpu(curp->p_hdr->eh_max));
945 		return -EFSCORRUPTED;
946 	}
947 
948 	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
949 		/* insert after */
950 		ext_debug("insert new index %d after: %llu\n", logical, ptr);
951 		ix = curp->p_idx + 1;
952 	} else {
953 		/* insert before */
954 		ext_debug("insert new index %d before: %llu\n", logical, ptr);
955 		ix = curp->p_idx;
956 	}
957 
958 	len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
959 	BUG_ON(len < 0);
960 	if (len > 0) {
961 		ext_debug("insert new index %d: "
962 				"move %d indices from 0x%p to 0x%p\n",
963 				logical, len, ix, ix + 1);
964 		memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
965 	}
966 
967 	if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
968 		EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
969 		return -EFSCORRUPTED;
970 	}
971 
972 	ix->ei_block = cpu_to_le32(logical);
973 	ext4_idx_store_pblock(ix, ptr);
974 	le16_add_cpu(&curp->p_hdr->eh_entries, 1);
975 
976 	if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
977 		EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
978 		return -EFSCORRUPTED;
979 	}
980 
981 	err = ext4_ext_dirty(handle, inode, curp);
982 	ext4_std_error(inode->i_sb, err);
983 
984 	return err;
985 }
986 
987 /*
988  * ext4_ext_split:
989  * inserts new subtree into the path, using free index entry
990  * at depth @at:
991  * - allocates all needed blocks (new leaf and all intermediate index blocks)
992  * - makes decision where to split
993  * - moves remaining extents and index entries (right to the split point)
994  *   into the newly allocated blocks
995  * - initializes subtree
996  */
997 static int ext4_ext_split(handle_t *handle, struct inode *inode,
998 			  unsigned int flags,
999 			  struct ext4_ext_path *path,
1000 			  struct ext4_extent *newext, int at)
1001 {
1002 	struct buffer_head *bh = NULL;
1003 	int depth = ext_depth(inode);
1004 	struct ext4_extent_header *neh;
1005 	struct ext4_extent_idx *fidx;
1006 	int i = at, k, m, a;
1007 	ext4_fsblk_t newblock, oldblock;
1008 	__le32 border;
1009 	ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
1010 	int err = 0;
1011 	size_t ext_size = 0;
1012 
1013 	/* make decision: where to split? */
1014 	/* FIXME: now decision is simplest: at current extent */
1015 
1016 	/* if current leaf will be split, then we should use
1017 	 * border from split point */
1018 	if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
1019 		EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
1020 		return -EFSCORRUPTED;
1021 	}
1022 	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
1023 		border = path[depth].p_ext[1].ee_block;
1024 		ext_debug("leaf will be split."
1025 				" next leaf starts at %d\n",
1026 				  le32_to_cpu(border));
1027 	} else {
1028 		border = newext->ee_block;
1029 		ext_debug("leaf will be added."
1030 				" next leaf starts at %d\n",
1031 				le32_to_cpu(border));
1032 	}
1033 
1034 	/*
1035 	 * If an error occurs, we stop processing and mark the
1036 	 * filesystem read-only. The index won't be inserted and
1037 	 * the tree will remain in a consistent state. The next
1038 	 * mount will repair the buffers too.
1039 	 */
1040 
1041 	/*
1042 	 * Get an array to track all allocated blocks.
1043 	 * We need this to handle errors and free the
1044 	 * blocks on failure.
1045 	 */
1046 	ablocks = kcalloc(depth, sizeof(ext4_fsblk_t), GFP_NOFS);
1047 	if (!ablocks)
1048 		return -ENOMEM;
1049 
1050 	/* allocate all needed blocks */
1051 	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
1052 	for (a = 0; a < depth - at; a++) {
1053 		newblock = ext4_ext_new_meta_block(handle, inode, path,
1054 						   newext, &err, flags);
1055 		if (newblock == 0)
1056 			goto cleanup;
1057 		ablocks[a] = newblock;
1058 	}
1059 
1060 	/* initialize new leaf */
1061 	newblock = ablocks[--a];
1062 	if (unlikely(newblock == 0)) {
1063 		EXT4_ERROR_INODE(inode, "newblock == 0!");
1064 		err = -EFSCORRUPTED;
1065 		goto cleanup;
1066 	}
1067 	bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
1068 	if (unlikely(!bh)) {
1069 		err = -ENOMEM;
1070 		goto cleanup;
1071 	}
1072 	lock_buffer(bh);
1073 
1074 	err = ext4_journal_get_create_access(handle, bh);
1075 	if (err)
1076 		goto cleanup;
1077 
1078 	neh = ext_block_hdr(bh);
1079 	neh->eh_entries = 0;
1080 	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1081 	neh->eh_magic = EXT4_EXT_MAGIC;
1082 	neh->eh_depth = 0;
1083 
1084 	/* move remainder of path[depth] to the new leaf */
1085 	if (unlikely(path[depth].p_hdr->eh_entries !=
1086 		     path[depth].p_hdr->eh_max)) {
1087 		EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
1088 				 path[depth].p_hdr->eh_entries,
1089 				 path[depth].p_hdr->eh_max);
1090 		err = -EFSCORRUPTED;
1091 		goto cleanup;
1092 	}
1093 	/* start copy from next extent */
1094 	m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
1095 	ext4_ext_show_move(inode, path, newblock, depth);
1096 	if (m) {
1097 		struct ext4_extent *ex;
1098 		ex = EXT_FIRST_EXTENT(neh);
1099 		memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
1100 		le16_add_cpu(&neh->eh_entries, m);
1101 	}
1102 
1103 	/* zero out unused area in the extent block */
1104 	ext_size = sizeof(struct ext4_extent_header) +
1105 		sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries);
1106 	memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);
1107 	ext4_extent_block_csum_set(inode, neh);
1108 	set_buffer_uptodate(bh);
1109 	unlock_buffer(bh);
1110 
1111 	err = ext4_handle_dirty_metadata(handle, inode, bh);
1112 	if (err)
1113 		goto cleanup;
1114 	brelse(bh);
1115 	bh = NULL;
1116 
1117 	/* correct old leaf */
1118 	if (m) {
1119 		err = ext4_ext_get_access(handle, inode, path + depth);
1120 		if (err)
1121 			goto cleanup;
1122 		le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
1123 		err = ext4_ext_dirty(handle, inode, path + depth);
1124 		if (err)
1125 			goto cleanup;
1126 
1127 	}
1128 
1129 	/* create intermediate indexes */
1130 	k = depth - at - 1;
1131 	if (unlikely(k < 0)) {
1132 		EXT4_ERROR_INODE(inode, "k %d < 0!", k);
1133 		err = -EFSCORRUPTED;
1134 		goto cleanup;
1135 	}
1136 	if (k)
1137 		ext_debug("create %d intermediate indices\n", k);
1138 	/* insert new index into current index block */
1139 	/* current depth stored in i var */
1140 	i = depth - 1;
1141 	while (k--) {
1142 		oldblock = newblock;
1143 		newblock = ablocks[--a];
1144 		bh = sb_getblk(inode->i_sb, newblock);
1145 		if (unlikely(!bh)) {
1146 			err = -ENOMEM;
1147 			goto cleanup;
1148 		}
1149 		lock_buffer(bh);
1150 
1151 		err = ext4_journal_get_create_access(handle, bh);
1152 		if (err)
1153 			goto cleanup;
1154 
1155 		neh = ext_block_hdr(bh);
1156 		neh->eh_entries = cpu_to_le16(1);
1157 		neh->eh_magic = EXT4_EXT_MAGIC;
1158 		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1159 		neh->eh_depth = cpu_to_le16(depth - i);
1160 		fidx = EXT_FIRST_INDEX(neh);
1161 		fidx->ei_block = border;
1162 		ext4_idx_store_pblock(fidx, oldblock);
1163 
1164 		ext_debug("int.index at %d (block %llu): %u -> %llu\n",
1165 				i, newblock, le32_to_cpu(border), oldblock);
1166 
1167 		/* move remainder of path[i] to the new index block */
1168 		if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
1169 					EXT_LAST_INDEX(path[i].p_hdr))) {
1170 			EXT4_ERROR_INODE(inode,
1171 					 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
1172 					 le32_to_cpu(path[i].p_ext->ee_block));
1173 			err = -EFSCORRUPTED;
1174 			goto cleanup;
1175 		}
1176 		/* start copy indexes */
1177 		m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
1178 		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
1179 				EXT_MAX_INDEX(path[i].p_hdr));
1180 		ext4_ext_show_move(inode, path, newblock, i);
1181 		if (m) {
1182 			memmove(++fidx, path[i].p_idx,
1183 				sizeof(struct ext4_extent_idx) * m);
1184 			le16_add_cpu(&neh->eh_entries, m);
1185 		}
1186 		/* zero out unused area in the extent block */
1187 		ext_size = sizeof(struct ext4_extent_header) +
1188 		   (sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries));
1189 		memset(bh->b_data + ext_size, 0,
1190 			inode->i_sb->s_blocksize - ext_size);
1191 		ext4_extent_block_csum_set(inode, neh);
1192 		set_buffer_uptodate(bh);
1193 		unlock_buffer(bh);
1194 
1195 		err = ext4_handle_dirty_metadata(handle, inode, bh);
1196 		if (err)
1197 			goto cleanup;
1198 		brelse(bh);
1199 		bh = NULL;
1200 
1201 		/* correct old index */
1202 		if (m) {
1203 			err = ext4_ext_get_access(handle, inode, path + i);
1204 			if (err)
1205 				goto cleanup;
1206 			le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
1207 			err = ext4_ext_dirty(handle, inode, path + i);
1208 			if (err)
1209 				goto cleanup;
1210 		}
1211 
1212 		i--;
1213 	}
1214 
1215 	/* insert new index */
1216 	err = ext4_ext_insert_index(handle, inode, path + at,
1217 				    le32_to_cpu(border), newblock);
1218 
1219 cleanup:
1220 	if (bh) {
1221 		if (buffer_locked(bh))
1222 			unlock_buffer(bh);
1223 		brelse(bh);
1224 	}
1225 
1226 	if (err) {
1227 		/* free all allocated blocks in error case */
1228 		for (i = 0; i < depth; i++) {
1229 			if (!ablocks[i])
1230 				continue;
1231 			ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
1232 					 EXT4_FREE_BLOCKS_METADATA);
1233 		}
1234 	}
1235 	kfree(ablocks);
1236 
1237 	return err;
1238 }
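
/*
 * ext4_ext_split() allocates (depth - at) new blocks: one new leaf plus
 * one new index block for every level between @at and the leaf.  The
 * ablocks[] array exists purely so that all of them can be released if
 * any step fails, leaving the tree in a consistent state.
 */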
1239 
1240 /*
1241  * ext4_ext_grow_indepth:
1242  * implements tree growing procedure:
1243  * - allocates new block
1244  * - moves top-level data (index block or leaf) into the new block
1245  * - initializes new top-level, creating index that points to the
1246  *   just created block
1247  */
1248 static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1249 				 unsigned int flags)
1250 {
1251 	struct ext4_extent_header *neh;
1252 	struct buffer_head *bh;
1253 	ext4_fsblk_t newblock, goal = 0;
1254 	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
1255 	int err = 0;
1256 	size_t ext_size = 0;
1257 
1258 	/* Try to prepend new index to old one */
1259 	if (ext_depth(inode))
1260 		goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode)));
1261 	if (goal > le32_to_cpu(es->s_first_data_block)) {
1262 		flags |= EXT4_MB_HINT_TRY_GOAL;
1263 		goal--;
1264 	} else
1265 		goal = ext4_inode_to_goal_block(inode);
1266 	newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
1267 					NULL, &err);
1268 	if (newblock == 0)
1269 		return err;
1270 
1271 	bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
1272 	if (unlikely(!bh))
1273 		return -ENOMEM;
1274 	lock_buffer(bh);
1275 
1276 	err = ext4_journal_get_create_access(handle, bh);
1277 	if (err) {
1278 		unlock_buffer(bh);
1279 		goto out;
1280 	}
1281 
1282 	ext_size = sizeof(EXT4_I(inode)->i_data);
1283 	/* move top-level index/leaf into new block */
1284 	memmove(bh->b_data, EXT4_I(inode)->i_data, ext_size);
1285 	/* zero out unused area in the extent block */
1286 	memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);
1287 
1288 	/* set size of new block */
1289 	neh = ext_block_hdr(bh);
1290 	/* old root could have indexes or leaves
1291 	 * so calculate eh_max the right way */
1292 	if (ext_depth(inode))
1293 		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1294 	else
1295 		neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1296 	neh->eh_magic = EXT4_EXT_MAGIC;
1297 	ext4_extent_block_csum_set(inode, neh);
1298 	set_buffer_uptodate(bh);
1299 	unlock_buffer(bh);
1300 
1301 	err = ext4_handle_dirty_metadata(handle, inode, bh);
1302 	if (err)
1303 		goto out;
1304 
1305 	/* Update top-level index: num,max,pointer */
1306 	neh = ext_inode_hdr(inode);
1307 	neh->eh_entries = cpu_to_le16(1);
1308 	ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
1309 	if (neh->eh_depth == 0) {
1310 		/* Root extent block becomes index block */
1311 		neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
1312 		EXT_FIRST_INDEX(neh)->ei_block =
1313 			EXT_FIRST_EXTENT(neh)->ee_block;
1314 	}
1315 	ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
1316 		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
1317 		  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
1318 		  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
1319 
1320 	le16_add_cpu(&neh->eh_depth, 1);
1321 	err = ext4_mark_inode_dirty(handle, inode);
1322 out:
1323 	brelse(bh);
1324 
1325 	return err;
1326 }
1327 
1328 /*
1329  * ext4_ext_create_new_leaf:
1330  * finds an empty index and adds a new leaf.
1331  * if no free index is found, then it requests growing the tree in depth.
1332  */
1333 static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
1334 				    unsigned int mb_flags,
1335 				    unsigned int gb_flags,
1336 				    struct ext4_ext_path **ppath,
1337 				    struct ext4_extent *newext)
1338 {
1339 	struct ext4_ext_path *path = *ppath;
1340 	struct ext4_ext_path *curp;
1341 	int depth, i, err = 0;
1342 
1343 repeat:
1344 	i = depth = ext_depth(inode);
1345 
1346 	/* walk up the tree and look for a free index entry */
1347 	curp = path + depth;
1348 	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
1349 		i--;
1350 		curp--;
1351 	}
1352 
1353 	/* we use an already-allocated block for the index block,
1354 	 * so subsequent data blocks should be contiguous */
1355 	if (EXT_HAS_FREE_INDEX(curp)) {
1356 		/* if we found index with free entry, then use that
1357 		 * entry: create all needed subtree and add new leaf */
1358 		err = ext4_ext_split(handle, inode, mb_flags, path, newext, i);
1359 		if (err)
1360 			goto out;
1361 
1362 		/* refill path */
1363 		path = ext4_find_extent(inode,
1364 				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1365 				    ppath, gb_flags);
1366 		if (IS_ERR(path))
1367 			err = PTR_ERR(path);
1368 	} else {
1369 		/* tree is full, time to grow in depth */
1370 		err = ext4_ext_grow_indepth(handle, inode, mb_flags);
1371 		if (err)
1372 			goto out;
1373 
1374 		/* refill path */
1375 		path = ext4_find_extent(inode,
1376 				   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1377 				    ppath, gb_flags);
1378 		if (IS_ERR(path)) {
1379 			err = PTR_ERR(path);
1380 			goto out;
1381 		}
1382 
1383 		/*
1384 		 * only the first grow (depth 0 -> 1) produces free space;
1385 		 * in all other cases we have to split the grown tree
1386 		 */
1387 		depth = ext_depth(inode);
1388 		if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
1389 			/* now we need to split */
1390 			goto repeat;
1391 		}
1392 	}
1393 
1394 out:
1395 	return err;
1396 }
1397 
1398 /*
1399  * search the closest allocated block to the left of *logical
1400  * and return it at @logical + its physical address at @phys.
1401  * if *logical is the smallest allocated block, the function
1402  * returns 0 at @phys.
1403  * the return value contains 0 (success) or an error code
1404  */
1405 static int ext4_ext_search_left(struct inode *inode,
1406 				struct ext4_ext_path *path,
1407 				ext4_lblk_t *logical, ext4_fsblk_t *phys)
1408 {
1409 	struct ext4_extent_idx *ix;
1410 	struct ext4_extent *ex;
1411 	int depth, ee_len;
1412 
1413 	if (unlikely(path == NULL)) {
1414 		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1415 		return -EFSCORRUPTED;
1416 	}
1417 	depth = path->p_depth;
1418 	*phys = 0;
1419 
1420 	if (depth == 0 && path->p_ext == NULL)
1421 		return 0;
1422 
1423 	/* usually the extent in the path covers blocks smaller
1424 	 * than *logical, but the extent may be the
1425 	 * first one in the file */
1426 
1427 	ex = path[depth].p_ext;
1428 	ee_len = ext4_ext_get_actual_len(ex);
1429 	if (*logical < le32_to_cpu(ex->ee_block)) {
1430 		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1431 			EXT4_ERROR_INODE(inode,
1432 					 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
1433 					 *logical, le32_to_cpu(ex->ee_block));
1434 			return -EFSCORRUPTED;
1435 		}
1436 		while (--depth >= 0) {
1437 			ix = path[depth].p_idx;
1438 			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1439 				EXT4_ERROR_INODE(inode,
1440 				  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
1441 				  ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
1442 				  EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
1443 		le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
1444 				  depth);
1445 				return -EFSCORRUPTED;
1446 			}
1447 		}
1448 		return 0;
1449 	}
1450 
1451 	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1452 		EXT4_ERROR_INODE(inode,
1453 				 "logical %d < ee_block %d + ee_len %d!",
1454 				 *logical, le32_to_cpu(ex->ee_block), ee_len);
1455 		return -EFSCORRUPTED;
1456 	}
1457 
1458 	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1459 	*phys = ext4_ext_pblock(ex) + ee_len - 1;
1460 	return 0;
1461 }
1462 
1463 /*
1464  * search the closest allocated block to the right of *logical
1465  * and return it at @logical + its physical address at @phys.
1466  * if *logical is the largest allocated block, the function
1467  * returns 0 at @phys.
1468  * the return value contains 0 (success) or an error code
1469  */
1470 static int ext4_ext_search_right(struct inode *inode,
1471 				 struct ext4_ext_path *path,
1472 				 ext4_lblk_t *logical, ext4_fsblk_t *phys,
1473 				 struct ext4_extent **ret_ex)
1474 {
1475 	struct buffer_head *bh = NULL;
1476 	struct ext4_extent_header *eh;
1477 	struct ext4_extent_idx *ix;
1478 	struct ext4_extent *ex;
1479 	ext4_fsblk_t block;
1480 	int depth;	/* Note, NOT eh_depth; depth from top of tree */
1481 	int ee_len;
1482 
1483 	if (unlikely(path == NULL)) {
1484 		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1485 		return -EFSCORRUPTED;
1486 	}
1487 	depth = path->p_depth;
1488 	*phys = 0;
1489 
1490 	if (depth == 0 && path->p_ext == NULL)
1491 		return 0;
1492 
1493 	/* usually the extent in the path covers blocks smaller
1494 	 * than *logical, but the extent may be the
1495 	 * first one in the file */
1496 
1497 	ex = path[depth].p_ext;
1498 	ee_len = ext4_ext_get_actual_len(ex);
1499 	if (*logical < le32_to_cpu(ex->ee_block)) {
1500 		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1501 			EXT4_ERROR_INODE(inode,
1502 					 "first_extent(path[%d].p_hdr) != ex",
1503 					 depth);
1504 			return -EFSCORRUPTED;
1505 		}
1506 		while (--depth >= 0) {
1507 			ix = path[depth].p_idx;
1508 			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1509 				EXT4_ERROR_INODE(inode,
1510 						 "ix != EXT_FIRST_INDEX *logical %d!",
1511 						 *logical);
1512 				return -EFSCORRUPTED;
1513 			}
1514 		}
1515 		goto found_extent;
1516 	}
1517 
1518 	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1519 		EXT4_ERROR_INODE(inode,
1520 				 "logical %d < ee_block %d + ee_len %d!",
1521 				 *logical, le32_to_cpu(ex->ee_block), ee_len);
1522 		return -EFSCORRUPTED;
1523 	}
1524 
1525 	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1526 		/* next allocated block in this leaf */
1527 		ex++;
1528 		goto found_extent;
1529 	}
1530 
1531 	/* go up and search for index to the right */
1532 	while (--depth >= 0) {
1533 		ix = path[depth].p_idx;
1534 		if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1535 			goto got_index;
1536 	}
1537 
1538 	/* we've gone up to the root and found no index to the right */
1539 	return 0;
1540 
1541 got_index:
1542 	/* we've found index to the right, let's
1543 	 * follow it and find the closest allocated
1544 	 * block to the right */
1545 	ix++;
1546 	block = ext4_idx_pblock(ix);
1547 	while (++depth < path->p_depth) {
1548 		/* subtract from p_depth to get proper eh_depth */
1549 		bh = read_extent_tree_block(inode, block,
1550 					    path->p_depth - depth, 0);
1551 		if (IS_ERR(bh))
1552 			return PTR_ERR(bh);
1553 		eh = ext_block_hdr(bh);
1554 		ix = EXT_FIRST_INDEX(eh);
1555 		block = ext4_idx_pblock(ix);
1556 		put_bh(bh);
1557 	}
1558 
1559 	bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0);
1560 	if (IS_ERR(bh))
1561 		return PTR_ERR(bh);
1562 	eh = ext_block_hdr(bh);
1563 	ex = EXT_FIRST_EXTENT(eh);
1564 found_extent:
1565 	*logical = le32_to_cpu(ex->ee_block);
1566 	*phys = ext4_ext_pblock(ex);
1567 	*ret_ex = ex;
1568 	if (bh)
1569 		put_bh(bh);
1570 	return 0;
1571 }
1572 
1573 /*
1574  * ext4_ext_next_allocated_block:
1575  * returns allocated block in subsequent extent or EXT_MAX_BLOCKS.
1576  * NOTE: it considers block number from index entry as
1577  * allocated block. Thus, index entries have to be consistent
1578  * with leaves.
1579  */
1580 ext4_lblk_t
1581 ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1582 {
1583 	int depth;
1584 
1585 	BUG_ON(path == NULL);
1586 	depth = path->p_depth;
1587 
1588 	if (depth == 0 && path->p_ext == NULL)
1589 		return EXT_MAX_BLOCKS;
1590 
1591 	while (depth >= 0) {
1592 		struct ext4_ext_path *p = &path[depth];
1593 
1594 		if (depth == path->p_depth) {
1595 			/* leaf */
1596 			if (p->p_ext && p->p_ext != EXT_LAST_EXTENT(p->p_hdr))
1597 				return le32_to_cpu(p->p_ext[1].ee_block);
1598 		} else {
1599 			/* index */
1600 			if (p->p_idx != EXT_LAST_INDEX(p->p_hdr))
1601 				return le32_to_cpu(p->p_idx[1].ei_block);
1602 		}
1603 		depth--;
1604 	}
1605 
1606 	return EXT_MAX_BLOCKS;
1607 }
1608 
1609 /*
1610  * ext4_ext_next_leaf_block:
1611  * returns first allocated block from next leaf or EXT_MAX_BLOCKS
1612  */
1613 static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
1614 {
1615 	int depth;
1616 
1617 	BUG_ON(path == NULL);
1618 	depth = path->p_depth;
1619 
1620 	/* zero-tree has no leaf blocks at all */
1621 	if (depth == 0)
1622 		return EXT_MAX_BLOCKS;
1623 
1624 	/* go to index block */
1625 	depth--;
1626 
1627 	while (depth >= 0) {
1628 		if (path[depth].p_idx !=
1629 				EXT_LAST_INDEX(path[depth].p_hdr))
1630 			return (ext4_lblk_t)
1631 				le32_to_cpu(path[depth].p_idx[1].ei_block);
1632 		depth--;
1633 	}
1634 
1635 	return EXT_MAX_BLOCKS;
1636 }
1637 
1638 /*
1639  * ext4_ext_correct_indexes:
1640  * if the leaf gets modified and the modified extent is first in the leaf,
1641  * then we have to correct all indexes above it.
1642  * TODO: do we need to correct the tree in all cases?
1643  */
1644 static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1645 				struct ext4_ext_path *path)
1646 {
1647 	struct ext4_extent_header *eh;
1648 	int depth = ext_depth(inode);
1649 	struct ext4_extent *ex;
1650 	__le32 border;
1651 	int k, err = 0;
1652 
1653 	eh = path[depth].p_hdr;
1654 	ex = path[depth].p_ext;
1655 
1656 	if (unlikely(ex == NULL || eh == NULL)) {
1657 		EXT4_ERROR_INODE(inode,
1658 				 "ex %p == NULL or eh %p == NULL", ex, eh);
1659 		return -EFSCORRUPTED;
1660 	}
1661 
1662 	if (depth == 0) {
1663 		/* there is no tree at all */
1664 		return 0;
1665 	}
1666 
1667 	if (ex != EXT_FIRST_EXTENT(eh)) {
1668 		/* we correct tree if first leaf got modified only */
1669 		return 0;
1670 	}
1671 
1672 	/*
1673 	 * TODO: we need correction if border is smaller than current one
1674 	 */
1675 	k = depth - 1;
1676 	border = path[depth].p_ext->ee_block;
1677 	err = ext4_ext_get_access(handle, inode, path + k);
1678 	if (err)
1679 		return err;
1680 	path[k].p_idx->ei_block = border;
1681 	err = ext4_ext_dirty(handle, inode, path + k);
1682 	if (err)
1683 		return err;
1684 
1685 	while (k--) {
1686 		/* change all left-side indexes */
1687 		if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1688 			break;
1689 		err = ext4_ext_get_access(handle, inode, path + k);
1690 		if (err)
1691 			break;
1692 		path[k].p_idx->ei_block = border;
1693 		err = ext4_ext_dirty(handle, inode, path + k);
1694 		if (err)
1695 			break;
1696 	}
1697 
1698 	return err;
1699 }
1700 
1701 static int ext4_can_extents_be_merged(struct inode *inode,
1702 				      struct ext4_extent *ex1,
1703 				      struct ext4_extent *ex2)
1704 {
1705 	unsigned short ext1_ee_len, ext2_ee_len;
1706 
1707 	if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2))
1708 		return 0;
1709 
1710 	ext1_ee_len = ext4_ext_get_actual_len(ex1);
1711 	ext2_ee_len = ext4_ext_get_actual_len(ex2);
1712 
1713 	if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
1714 			le32_to_cpu(ex2->ee_block))
1715 		return 0;
1716 
1717 	if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
1718 		return 0;
1719 
1720 	if (ext4_ext_is_unwritten(ex1) &&
1721 	    ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)
1722 		return 0;
1723 #ifdef AGGRESSIVE_TEST
1724 	if (ext1_ee_len >= 4)
1725 		return 0;
1726 #endif
1727 
1728 	if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
1729 		return 1;
1730 	return 0;
1731 }
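
/*
 * Note that the length checks above reflect the on-disk encoding of
 * ee_len: an initialized extent can cover at most EXT_INIT_MAX_LEN
 * (32768) blocks, while an unwritten extent, whose high ee_len bit is
 * used as the unwritten marker, is limited to EXT_UNWRITTEN_MAX_LEN
 * (32767) blocks, so a merge is refused whenever it would overflow
 * either limit.
 */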
1732 
1733 /*
1734  * This function tries to merge the "ex" extent with the next extent in the tree.
1735  * It always tries to merge towards the right. If you want to merge towards
1736  * the left, pass "ex - 1" as the argument instead of "ex".
1737  * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1738  * 1 if they got merged.
1739  */
1740 static int ext4_ext_try_to_merge_right(struct inode *inode,
1741 				 struct ext4_ext_path *path,
1742 				 struct ext4_extent *ex)
1743 {
1744 	struct ext4_extent_header *eh;
1745 	unsigned int depth, len;
1746 	int merge_done = 0, unwritten;
1747 
1748 	depth = ext_depth(inode);
1749 	BUG_ON(path[depth].p_hdr == NULL);
1750 	eh = path[depth].p_hdr;
1751 
1752 	while (ex < EXT_LAST_EXTENT(eh)) {
1753 		if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1754 			break;
1755 		/* merge with next extent! */
1756 		unwritten = ext4_ext_is_unwritten(ex);
1757 		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1758 				+ ext4_ext_get_actual_len(ex + 1));
1759 		if (unwritten)
1760 			ext4_ext_mark_unwritten(ex);
1761 
1762 		if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1763 			len = (EXT_LAST_EXTENT(eh) - ex - 1)
1764 				* sizeof(struct ext4_extent);
1765 			memmove(ex + 1, ex + 2, len);
1766 		}
1767 		le16_add_cpu(&eh->eh_entries, -1);
1768 		merge_done = 1;
1769 		WARN_ON(eh->eh_entries == 0);
1770 		if (!eh->eh_entries)
1771 			EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
1772 	}
1773 
1774 	return merge_done;
1775 }
1776 
1777 /*
1778  * This function does a very simple check to see if we can collapse
1779  * an extent tree with a single extent tree leaf block into the inode.
1780  */
1781 static void ext4_ext_try_to_merge_up(handle_t *handle,
1782 				     struct inode *inode,
1783 				     struct ext4_ext_path *path)
1784 {
1785 	size_t s;
1786 	unsigned max_root = ext4_ext_space_root(inode, 0);
1787 	ext4_fsblk_t blk;
1788 
1789 	if ((path[0].p_depth != 1) ||
1790 	    (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) ||
1791 	    (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root))
1792 		return;
1793 
1794 	/*
1795 	 * We need to modify the block allocation bitmap and the block
1796 	 * group descriptor to release the extent tree block.  If we
1797 	 * can't get the journal credits, give up.
1798 	 */
1799 	if (ext4_journal_extend(handle, 2,
1800 			ext4_free_metadata_revoke_credits(inode->i_sb, 1)))
1801 		return;
1802 
1803 	/*
1804 	 * Copy the extent data up to the inode
1805 	 */
1806 	blk = ext4_idx_pblock(path[0].p_idx);
1807 	s = le16_to_cpu(path[1].p_hdr->eh_entries) *
1808 		sizeof(struct ext4_extent_idx);
1809 	s += sizeof(struct ext4_extent_header);
1810 
1811 	path[1].p_maxdepth = path[0].p_maxdepth;
1812 	memcpy(path[0].p_hdr, path[1].p_hdr, s);
1813 	path[0].p_depth = 0;
1814 	path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) +
1815 		(path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr));
1816 	path[0].p_hdr->eh_max = cpu_to_le16(max_root);
1817 
1818 	brelse(path[1].p_bh);
1819 	ext4_free_blocks(handle, inode, NULL, blk, 1,
1820 			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
1821 }
1822 
1823 /*
1824  * This function tries to merge the @ex extent with its neighbours in the tree, then
1825  * tries to collapse the extent tree into the inode.
1826  */
1827 static void ext4_ext_try_to_merge(handle_t *handle,
1828 				  struct inode *inode,
1829 				  struct ext4_ext_path *path,
1830 				  struct ext4_extent *ex)
1831 {
1832 	struct ext4_extent_header *eh;
1833 	unsigned int depth;
1834 	int merge_done = 0;
1835 
1836 	depth = ext_depth(inode);
1837 	BUG_ON(path[depth].p_hdr == NULL);
1838 	eh = path[depth].p_hdr;
1839 
1840 	if (ex > EXT_FIRST_EXTENT(eh))
1841 		merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
1842 
1843 	if (!merge_done)
1844 		(void) ext4_ext_try_to_merge_right(inode, path, ex);
1845 
1846 	ext4_ext_try_to_merge_up(handle, inode, path);
1847 }
1848 
1849 /*
1850  * check if a portion of the "newext" extent overlaps with an
1851  * existing extent.
1852  *
1853  * If there is an overlap discovered, it updates the length of the newext
1854  * such that there will be no overlap, and then returns 1.
1855  * If there is no overlap found, it returns 0.
1856  */
1857 static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
1858 					   struct inode *inode,
1859 					   struct ext4_extent *newext,
1860 					   struct ext4_ext_path *path)
1861 {
1862 	ext4_lblk_t b1, b2;
1863 	unsigned int depth, len1;
1864 	unsigned int ret = 0;
1865 
1866 	b1 = le32_to_cpu(newext->ee_block);
1867 	len1 = ext4_ext_get_actual_len(newext);
1868 	depth = ext_depth(inode);
1869 	if (!path[depth].p_ext)
1870 		goto out;
1871 	b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block));
1872 
1873 	/*
1874 	 * get the next allocated block if the extent in the path
1875 	 * is before the requested block(s)
1876 	 */
1877 	if (b2 < b1) {
1878 		b2 = ext4_ext_next_allocated_block(path);
1879 		if (b2 == EXT_MAX_BLOCKS)
1880 			goto out;
1881 		b2 = EXT4_LBLK_CMASK(sbi, b2);
1882 	}
1883 
1884 	/* check for wrap through zero on extent logical start block*/
1885 	if (b1 + len1 < b1) {
1886 		len1 = EXT_MAX_BLOCKS - b1;
1887 		newext->ee_len = cpu_to_le16(len1);
1888 		ret = 1;
1889 	}
1890 
1891 	/* check for overlap */
1892 	if (b1 + len1 > b2) {
1893 		newext->ee_len = cpu_to_le16(b2 - b1);
1894 		ret = 1;
1895 	}
1896 out:
1897 	return ret;
1898 }
1899 
1900 /*
1901  * ext4_ext_insert_extent:
1902  * tries to merge the requested extent into an existing extent or
1903  * inserts the requested extent as a new one into the tree,
1904  * creating a new leaf in the no-space case.
1905  */
1906 int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1907 				struct ext4_ext_path **ppath,
1908 				struct ext4_extent *newext, int gb_flags)
1909 {
1910 	struct ext4_ext_path *path = *ppath;
1911 	struct ext4_extent_header *eh;
1912 	struct ext4_extent *ex, *fex;
1913 	struct ext4_extent *nearex; /* nearest extent */
1914 	struct ext4_ext_path *npath = NULL;
1915 	int depth, len, err;
1916 	ext4_lblk_t next;
1917 	int mb_flags = 0, unwritten;
1918 
1919 	if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1920 		mb_flags |= EXT4_MB_DELALLOC_RESERVED;
1921 	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1922 		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
1923 		return -EFSCORRUPTED;
1924 	}
1925 	depth = ext_depth(inode);
1926 	ex = path[depth].p_ext;
1927 	eh = path[depth].p_hdr;
1928 	if (unlikely(path[depth].p_hdr == NULL)) {
1929 		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1930 		return -EFSCORRUPTED;
1931 	}
1932 
1933 	/* try to insert block into found extent and return */
1934 	if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) {
1935 
1936 		/*
1937 		 * Try to see whether we should rather test the extent to the
1938 		 * right of ex, or to the left of ex. This is because
1939 		 * ext4_find_extent() can return either extent on the
1940 		 * left, or on the right from the searched position. This
1941 		 * will make merging more effective.
1942 		 */
1943 		if (ex < EXT_LAST_EXTENT(eh) &&
1944 		    (le32_to_cpu(ex->ee_block) +
1945 		    ext4_ext_get_actual_len(ex) <
1946 		    le32_to_cpu(newext->ee_block))) {
1947 			ex += 1;
1948 			goto prepend;
1949 		} else if ((ex > EXT_FIRST_EXTENT(eh)) &&
1950 			   (le32_to_cpu(newext->ee_block) +
1951 			   ext4_ext_get_actual_len(newext) <
1952 			   le32_to_cpu(ex->ee_block)))
1953 			ex -= 1;
1954 
1955 		/* Try to append newex to the ex */
1956 		if (ext4_can_extents_be_merged(inode, ex, newext)) {
1957 			ext_debug("append [%d]%d block to %u:[%d]%d"
1958 				  "(from %llu)\n",
1959 				  ext4_ext_is_unwritten(newext),
1960 				  ext4_ext_get_actual_len(newext),
1961 				  le32_to_cpu(ex->ee_block),
1962 				  ext4_ext_is_unwritten(ex),
1963 				  ext4_ext_get_actual_len(ex),
1964 				  ext4_ext_pblock(ex));
1965 			err = ext4_ext_get_access(handle, inode,
1966 						  path + depth);
1967 			if (err)
1968 				return err;
1969 			unwritten = ext4_ext_is_unwritten(ex);
1970 			ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1971 					+ ext4_ext_get_actual_len(newext));
1972 			if (unwritten)
1973 				ext4_ext_mark_unwritten(ex);
1974 			eh = path[depth].p_hdr;
1975 			nearex = ex;
1976 			goto merge;
1977 		}
1978 
1979 prepend:
1980 		/* Try to prepend newex to the ex */
1981 		if (ext4_can_extents_be_merged(inode, newext, ex)) {
1982 			ext_debug("prepend %u[%d]%d block to %u:[%d]%d"
1983 				  "(from %llu)\n",
1984 				  le32_to_cpu(newext->ee_block),
1985 				  ext4_ext_is_unwritten(newext),
1986 				  ext4_ext_get_actual_len(newext),
1987 				  le32_to_cpu(ex->ee_block),
1988 				  ext4_ext_is_unwritten(ex),
1989 				  ext4_ext_get_actual_len(ex),
1990 				  ext4_ext_pblock(ex));
1991 			err = ext4_ext_get_access(handle, inode,
1992 						  path + depth);
1993 			if (err)
1994 				return err;
1995 
1996 			unwritten = ext4_ext_is_unwritten(ex);
1997 			ex->ee_block = newext->ee_block;
1998 			ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
1999 			ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
2000 					+ ext4_ext_get_actual_len(newext));
2001 			if (unwritten)
2002 				ext4_ext_mark_unwritten(ex);
2003 			eh = path[depth].p_hdr;
2004 			nearex = ex;
2005 			goto merge;
2006 		}
2007 	}
2008 
2009 	depth = ext_depth(inode);
2010 	eh = path[depth].p_hdr;
2011 	if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
2012 		goto has_space;
2013 
2014 	/* probably next leaf has space for us? */
2015 	fex = EXT_LAST_EXTENT(eh);
2016 	next = EXT_MAX_BLOCKS;
2017 	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
2018 		next = ext4_ext_next_leaf_block(path);
2019 	if (next != EXT_MAX_BLOCKS) {
2020 		ext_debug("next leaf block - %u\n", next);
2021 		BUG_ON(npath != NULL);
2022 		npath = ext4_find_extent(inode, next, NULL, 0);
2023 		if (IS_ERR(npath))
2024 			return PTR_ERR(npath);
2025 		BUG_ON(npath->p_depth != path->p_depth);
2026 		eh = npath[depth].p_hdr;
2027 		if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
2028 			ext_debug("next leaf isn't full(%d)\n",
2029 				  le16_to_cpu(eh->eh_entries));
2030 			path = npath;
2031 			goto has_space;
2032 		}
2033 		ext_debug("next leaf has no free space(%d,%d)\n",
2034 			  le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
2035 	}
2036 
2037 	/*
2038 	 * There is no free space in the found leaf.
2039 	 * We're gonna add a new leaf in the tree.
2040 	 */
2041 	if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
2042 		mb_flags |= EXT4_MB_USE_RESERVED;
2043 	err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
2044 				       ppath, newext);
2045 	if (err)
2046 		goto cleanup;
2047 	depth = ext_depth(inode);
2048 	eh = path[depth].p_hdr;
2049 
2050 has_space:
2051 	nearex = path[depth].p_ext;
2052 
2053 	err = ext4_ext_get_access(handle, inode, path + depth);
2054 	if (err)
2055 		goto cleanup;
2056 
2057 	if (!nearex) {
2058 		/* there is no extent in this leaf, create first one */
2059 		ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n",
2060 				le32_to_cpu(newext->ee_block),
2061 				ext4_ext_pblock(newext),
2062 				ext4_ext_is_unwritten(newext),
2063 				ext4_ext_get_actual_len(newext));
2064 		nearex = EXT_FIRST_EXTENT(eh);
2065 	} else {
2066 		if (le32_to_cpu(newext->ee_block)
2067 			   > le32_to_cpu(nearex->ee_block)) {
2068 			/* Insert after */
2069 			ext_debug("insert %u:%llu:[%d]%d before: "
2070 					"nearest %p\n",
2071 					le32_to_cpu(newext->ee_block),
2072 					ext4_ext_pblock(newext),
2073 					ext4_ext_is_unwritten(newext),
2074 					ext4_ext_get_actual_len(newext),
2075 					nearex);
2076 			nearex++;
2077 		} else {
2078 			/* Insert before */
2079 			BUG_ON(newext->ee_block == nearex->ee_block);
2080 			ext_debug("insert %u:%llu:[%d]%d after: "
2081 					"nearest %p\n",
2082 					le32_to_cpu(newext->ee_block),
2083 					ext4_ext_pblock(newext),
2084 					ext4_ext_is_unwritten(newext),
2085 					ext4_ext_get_actual_len(newext),
2086 					nearex);
2087 		}
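		/*
		 * Shift the extents from nearex to the end of the leaf one
		 * slot to the right to make room for the new extent.
		 */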
2088 		len = EXT_LAST_EXTENT(eh) - nearex + 1;
2089 		if (len > 0) {
2090 			ext_debug("insert %u:%llu:[%d]%d: "
2091 					"move %d extents from 0x%p to 0x%p\n",
2092 					le32_to_cpu(newext->ee_block),
2093 					ext4_ext_pblock(newext),
2094 					ext4_ext_is_unwritten(newext),
2095 					ext4_ext_get_actual_len(newext),
2096 					len, nearex, nearex + 1);
2097 			memmove(nearex + 1, nearex,
2098 				len * sizeof(struct ext4_extent));
2099 		}
2100 	}
2101 
2102 	le16_add_cpu(&eh->eh_entries, 1);
2103 	path[depth].p_ext = nearex;
2104 	nearex->ee_block = newext->ee_block;
2105 	ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
2106 	nearex->ee_len = newext->ee_len;
2107 
2108 merge:
2109 	/* try to merge extents */
2110 	if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
2111 		ext4_ext_try_to_merge(handle, inode, path, nearex);
2112 
2113 
2114 	/* time to correct all indexes above */
2115 	err = ext4_ext_correct_indexes(handle, inode, path);
2116 	if (err)
2117 		goto cleanup;
2118 
2119 	err = ext4_ext_dirty(handle, inode, path + path->p_depth);
2120 
2121 cleanup:
2122 	ext4_ext_drop_refs(npath);
2123 	kfree(npath);
2124 	return err;
2125 }
2126 
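/*
 * ext4_fill_es_cache_info - report cached extent status entries via fiemap
 *
 * Walk the extent status cache over the logical range [block, block + num)
 * and emit one fiemap extent per cached entry, stopping at the first
 * uncached gap or when fiemap_fill_next_extent() reports that the user
 * buffer is full.
 */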
2127 static int ext4_fill_es_cache_info(struct inode *inode,
2128 				   ext4_lblk_t block, ext4_lblk_t num,
2129 				   struct fiemap_extent_info *fieinfo)
2130 {
2131 	ext4_lblk_t next, end = block + num - 1;
2132 	struct extent_status es;
2133 	unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
2134 	unsigned int flags;
2135 	int err;
2136 
2137 	while (block <= end) {
2138 		next = 0;
2139 		flags = 0;
2140 		if (!ext4_es_lookup_extent(inode, block, &next, &es))
2141 			break;
2142 		if (ext4_es_is_unwritten(&es))
2143 			flags |= FIEMAP_EXTENT_UNWRITTEN;
2144 		if (ext4_es_is_delayed(&es))
2145 			flags |= (FIEMAP_EXTENT_DELALLOC |
2146 				  FIEMAP_EXTENT_UNKNOWN);
2147 		if (ext4_es_is_hole(&es))
2148 			flags |= EXT4_FIEMAP_EXTENT_HOLE;
2149 		if (next == 0)
2150 			flags |= FIEMAP_EXTENT_LAST;
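		/* delayed and hole extents have no physical block to report */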
2151 		if (flags & (FIEMAP_EXTENT_DELALLOC|
2152 			     EXT4_FIEMAP_EXTENT_HOLE))
2153 			es.es_pblk = 0;
2154 		else
2155 			es.es_pblk = ext4_es_pblock(&es);
2156 		err = fiemap_fill_next_extent(fieinfo,
2157 				(__u64)es.es_lblk << blksize_bits,
2158 				(__u64)es.es_pblk << blksize_bits,
2159 				(__u64)es.es_len << blksize_bits,
2160 				flags);
2161 		if (next == 0)
2162 			break;
2163 		block = next;
2164 		if (err < 0)
2165 			return err;
2166 		if (err == 1)
2167 			return 0;
2168 	}
2169 	return 0;
2170 }
2171 
2172 
2173 /*
2174  * ext4_ext_determine_hole - determine hole around given block
2175  * @inode:	inode we lookup in
2176  * @path:	path in extent tree to @lblk
2177  * @lblk:	pointer to logical block around which we want to determine hole
2178  *
2179  * Determine the hole length (and start, if easily possible) around the given
2180  * logical block. We don't try too hard to find the beginning of the hole but,
2181  * if @path actually points to the extent before @lblk, we provide it.
2182  *
2183  * The function returns the length of a hole starting at @lblk. We update @lblk
2184  * to the beginning of the hole if we managed to find it.
2185  */
2186 static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode,
2187 					   struct ext4_ext_path *path,
2188 					   ext4_lblk_t *lblk)
2189 {
2190 	int depth = ext_depth(inode);
2191 	struct ext4_extent *ex;
2192 	ext4_lblk_t len;
2193 
2194 	ex = path[depth].p_ext;
2195 	if (ex == NULL) {
2196 		/* there is no extent yet, so gap is [0;-] */
2197 		*lblk = 0;
2198 		len = EXT_MAX_BLOCKS;
2199 	} else if (*lblk < le32_to_cpu(ex->ee_block)) {
2200 		len = le32_to_cpu(ex->ee_block) - *lblk;
2201 	} else if (*lblk >= le32_to_cpu(ex->ee_block)
2202 			+ ext4_ext_get_actual_len(ex)) {
2203 		ext4_lblk_t next;
2204 
2205 		*lblk = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
2206 		next = ext4_ext_next_allocated_block(path);
2207 		BUG_ON(next == *lblk);
2208 		len = next - *lblk;
2209 	} else {
2210 		BUG();
2211 	}
2212 	return len;
2213 }
2214 
2215 /*
2216  * ext4_ext_put_gap_in_cache:
2217  * calculate boundaries of the gap that the requested block fits into
2218  * and cache this gap
2219  */
2220 static void
2221 ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start,
2222 			  ext4_lblk_t hole_len)
2223 {
2224 	struct extent_status es;
2225 
2226 	ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start,
2227 				  hole_start + hole_len - 1, &es);
2228 	if (es.es_len) {
2229 		/* Is there a delayed extent containing lblock? */
2230 		if (es.es_lblk <= hole_start)
2231 			return;
2232 		hole_len = min(es.es_lblk - hole_start, hole_len);
2233 	}
2234 	ext_debug(" -> %u:%u\n", hole_start, hole_len);
2235 	ext4_es_insert_extent(inode, hole_start, hole_len, ~0,
2236 			      EXTENT_STATUS_HOLE);
2237 }
2238 
2239 /*
2240  * ext4_ext_rm_idx:
2241  * removes index from the index block.
2242  */
2243 static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2244 			struct ext4_ext_path *path, int depth)
2245 {
2246 	int err;
2247 	ext4_fsblk_t leaf;
2248 
2249 	/* free index block */
2250 	depth--;
2251 	path = path + depth;
2252 	leaf = ext4_idx_pblock(path->p_idx);
2253 	if (unlikely(path->p_hdr->eh_entries == 0)) {
2254 		EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2255 		return -EFSCORRUPTED;
2256 	}
2257 	err = ext4_ext_get_access(handle, inode, path);
2258 	if (err)
2259 		return err;
2260 
2261 	if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
2262 		int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
2263 		len *= sizeof(struct ext4_extent_idx);
2264 		memmove(path->p_idx, path->p_idx + 1, len);
2265 	}
2266 
2267 	le16_add_cpu(&path->p_hdr->eh_entries, -1);
2268 	err = ext4_ext_dirty(handle, inode, path);
2269 	if (err)
2270 		return err;
2271 	ext_debug("index is empty, remove it, free block %llu\n", leaf);
2272 	trace_ext4_ext_rm_idx(inode, leaf);
2273 
2274 	ext4_free_blocks(handle, inode, NULL, leaf, 1,
2275 			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
2276 
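	/*
	 * If the index we updated is the first entry in its block, the keys
	 * in the parent index blocks are now stale; walk up the path and
	 * copy the new first logical block into each affected parent.
	 */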
2277 	while (--depth >= 0) {
2278 		if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr))
2279 			break;
2280 		path--;
2281 		err = ext4_ext_get_access(handle, inode, path);
2282 		if (err)
2283 			break;
2284 		path->p_idx->ei_block = (path+1)->p_idx->ei_block;
2285 		err = ext4_ext_dirty(handle, inode, path);
2286 		if (err)
2287 			break;
2288 	}
2289 	return err;
2290 }
2291 
2292 /*
2293  * ext4_ext_calc_credits_for_single_extent:
2294  * This routine returns max. credits that needed to insert an extent
2295  * This routine returns the maximum number of credits needed to insert
2296  * an extent into the extent tree.
2297  * When passing the actual path, the caller should calculate credits
2298  */
2299 int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
2300 						struct ext4_ext_path *path)
2301 {
2302 	if (path) {
2303 		int depth = ext_depth(inode);
2304 		int ret = 0;
2305 
2306 		/* probably there is space in leaf? */
2307 		if (le16_to_cpu(path[depth].p_hdr->eh_entries)
2308 				< le16_to_cpu(path[depth].p_hdr->eh_max)) {
2309 
2310 			/*
2311 			 *  There is some space in the leaf, so there is
2312 			 *  no need to account for a leaf block credit.
2313 			 *
2314 			 *  Bitmap and block group descriptor blocks and
2315 			 *  other metadata blocks still need to be
2316 			 *  accounted for.
2317 			 */
2318 			/* 1 bitmap, 1 block group descriptor */
2319 			ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
2320 			return ret;
2321 		}
2322 	}
2323 
2324 	return ext4_chunk_trans_blocks(inode, nrblocks);
2325 }
2326 
2327 /*
2328  * How many index/leaf blocks need to be changed/allocated to add @extents extents?
2329  *
2330  * If we add a single extent, then in the worst case, each tree level
2331  * index/leaf needs to be changed in case of a tree split.
2332  *
2333  * If more extents are inserted, they could cause the whole tree to split
2334  * more than once, but this is really rare.
2335  */
2336 int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
2337 {
2338 	int index;
2339 	int depth;
2340 
2341 	/* If we are converting the inline data, only one is needed here. */
2342 	if (ext4_has_inline_data(inode))
2343 		return 1;
2344 
2345 	depth = ext_depth(inode);
2346 
2347 	if (extents <= 1)
2348 		index = depth * 2;
2349 	else
2350 		index = depth * 3;
2351 
2352 	return index;
2353 }
2354 
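/*
 * Pick the ext4_free_blocks() flags for an inode: blocks freed from
 * directories, symlinks and EA inodes are treated as metadata, and blocks
 * from data-journalled files still need to be forgotten in the journal.
 */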
2355 static inline int get_default_free_blocks_flags(struct inode *inode)
2356 {
2357 	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
2358 	    ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE))
2359 		return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
2360 	else if (ext4_should_journal_data(inode))
2361 		return EXT4_FREE_BLOCKS_FORGET;
2362 	return 0;
2363 }
2364 
2365 /*
2366  * ext4_rereserve_cluster - increment the reserved cluster count when
2367  *                          freeing a cluster with a pending reservation
2368  *
2369  * @inode - file containing the cluster
2370  * @lblk - logical block in cluster to be reserved
2371  *
2372  * Increments the reserved cluster count and adjusts quota in a bigalloc
2373  * file system when freeing a partial cluster containing at least one
2374  * delayed and unwritten block.  A partial cluster meeting that
2375  * requirement will have a pending reservation.  If so, the
2376  * RERESERVE_CLUSTER flag is used when calling ext4_free_blocks() to
2377  * defer reserved and allocated space accounting to a subsequent call
2378  * to this function.
2379  */
2380 static void ext4_rereserve_cluster(struct inode *inode, ext4_lblk_t lblk)
2381 {
2382 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2383 	struct ext4_inode_info *ei = EXT4_I(inode);
2384 
2385 	dquot_reclaim_block(inode, EXT4_C2B(sbi, 1));
2386 
2387 	spin_lock(&ei->i_block_reservation_lock);
2388 	ei->i_reserved_data_blocks++;
2389 	percpu_counter_add(&sbi->s_dirtyclusters_counter, 1);
2390 	spin_unlock(&ei->i_block_reservation_lock);
2391 
2392 	percpu_counter_add(&sbi->s_freeclusters_counter, 1);
2393 	ext4_remove_pending(inode, lblk);
2394 }
2395 
2396 static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2397 			      struct ext4_extent *ex,
2398 			      struct partial_cluster *partial,
2399 			      ext4_lblk_t from, ext4_lblk_t to)
2400 {
2401 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2402 	unsigned short ee_len = ext4_ext_get_actual_len(ex);
2403 	ext4_fsblk_t last_pblk, pblk;
2404 	ext4_lblk_t num;
2405 	int flags;
2406 
2407 	/* only extent tail removal is allowed */
2408 	if (from < le32_to_cpu(ex->ee_block) ||
2409 	    to != le32_to_cpu(ex->ee_block) + ee_len - 1) {
2410 		ext4_error(sbi->s_sb,
2411 			   "strange request: removal(2) %u-%u from %u:%u",
2412 			   from, to, le32_to_cpu(ex->ee_block), ee_len);
2413 		return 0;
2414 	}
2415 
2416 #ifdef EXTENTS_STATS
2417 	spin_lock(&sbi->s_ext_stats_lock);
2418 	sbi->s_ext_blocks += ee_len;
2419 	sbi->s_ext_extents++;
2420 	if (ee_len < sbi->s_ext_min)
2421 		sbi->s_ext_min = ee_len;
2422 	if (ee_len > sbi->s_ext_max)
2423 		sbi->s_ext_max = ee_len;
2424 	if (ext_depth(inode) > sbi->s_depth_max)
2425 		sbi->s_depth_max = ext_depth(inode);
2426 	spin_unlock(&sbi->s_ext_stats_lock);
2427 #endif
2428 
2429 	trace_ext4_remove_blocks(inode, ex, from, to, partial);
2430 
2431 	/*
2432 	 * if we have a partial cluster, and it's different from the
2433 	 * cluster of the last block in the extent, we free it
2434 	 */
2435 	last_pblk = ext4_ext_pblock(ex) + ee_len - 1;
2436 
2437 	if (partial->state != initial &&
2438 	    partial->pclu != EXT4_B2C(sbi, last_pblk)) {
2439 		if (partial->state == tofree) {
2440 			flags = get_default_free_blocks_flags(inode);
2441 			if (ext4_is_pending(inode, partial->lblk))
2442 				flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
2443 			ext4_free_blocks(handle, inode, NULL,
2444 					 EXT4_C2B(sbi, partial->pclu),
2445 					 sbi->s_cluster_ratio, flags);
2446 			if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
2447 				ext4_rereserve_cluster(inode, partial->lblk);
2448 		}
2449 		partial->state = initial;
2450 	}
2451 
2452 	num = le32_to_cpu(ex->ee_block) + ee_len - from;
2453 	pblk = ext4_ext_pblock(ex) + ee_len - num;
2454 
2455 	/*
2456 	 * We free the partial cluster at the end of the extent (if any),
2457 	 * unless the cluster is used by another extent (partial_cluster
2458 	 * state is nofree).  If a partial cluster exists here, it must be
2459 	 * shared with the last block in the extent.
2460 	 */
2461 	flags = get_default_free_blocks_flags(inode);
2462 
2463 	/* partial, left end cluster aligned, right end unaligned */
2464 	if ((EXT4_LBLK_COFF(sbi, to) != sbi->s_cluster_ratio - 1) &&
2465 	    (EXT4_LBLK_CMASK(sbi, to) >= from) &&
2466 	    (partial->state != nofree)) {
2467 		if (ext4_is_pending(inode, to))
2468 			flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
2469 		ext4_free_blocks(handle, inode, NULL,
2470 				 EXT4_PBLK_CMASK(sbi, last_pblk),
2471 				 sbi->s_cluster_ratio, flags);
2472 		if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
2473 			ext4_rereserve_cluster(inode, to);
2474 		partial->state = initial;
2475 		flags = get_default_free_blocks_flags(inode);
2476 	}
2477 
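	/*
	 * Never let the free below release a partial cluster at the right
	 * end of the range; it was either freed above or may still be
	 * shared with blocks outside the range being removed.
	 */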
2478 	flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
2479 
2480 	/*
2481 	 * For bigalloc file systems, we never free a partial cluster
2482 	 * at the beginning of the extent.  Instead, we check to see if we
2483 	 * need to free it on a subsequent call to ext4_remove_blocks,
2484 	 * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space.
2485 	 */
2486 	flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
2487 	ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
2488 
2489 	/* reset the partial cluster if we've freed past it */
2490 	if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, pblk))
2491 		partial->state = initial;
2492 
2493 	/*
2494 	 * If we've freed the entire extent but the beginning is not left
2495 	 * cluster aligned and is not marked as ineligible for freeing, we
2496 	 * record the partial cluster at the beginning of the extent.  It
2497 	 * wasn't freed by the preceding ext4_free_blocks() call, and we
2498 	 * need to look farther to the left to determine if it's to be freed
2499 	 * (not shared with another extent). Else, reset the partial
2500 	 * cluster - we're either done freeing or the beginning of the
2501 	 * extent is left cluster aligned.
2502 	 */
2503 	if (EXT4_LBLK_COFF(sbi, from) && num == ee_len) {
2504 		if (partial->state == initial) {
2505 			partial->pclu = EXT4_B2C(sbi, pblk);
2506 			partial->lblk = from;
2507 			partial->state = tofree;
2508 		}
2509 	} else {
2510 		partial->state = initial;
2511 	}
2512 
2513 	return 0;
2514 }
2515 
2516 /*
2517  * ext4_ext_rm_leaf() Removes the extents associated with the
2518  * ext4_ext_rm_leaf() removes the extents associated with the
2519  * and "end" must appear in the same extent or EIO is returned.
2520  *
2521  * @handle: The journal handle
2522  * @inode:  The file's inode
2523  * @path:   The path to the leaf
2524  * @partial: The cluster which we'll have to free if all extents
2525  *           have been released from it.  However, if its state is
2526  *           nofree, it's a cluster just to the right of the
2527  *           punched region and it must not be freed.
2528  * @start:  The first block to remove
2529  * @end:    The last block to remove
2530  */
2531 static int
2532 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2533 		 struct ext4_ext_path *path,
2534 		 struct partial_cluster *partial,
2535 		 ext4_lblk_t start, ext4_lblk_t end)
2536 {
2537 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2538 	int err = 0, correct_index = 0;
2539 	int depth = ext_depth(inode), credits, revoke_credits;
2540 	struct ext4_extent_header *eh;
2541 	ext4_lblk_t a, b;
2542 	unsigned num;
2543 	ext4_lblk_t ex_ee_block;
2544 	unsigned short ex_ee_len;
2545 	unsigned unwritten = 0;
2546 	struct ext4_extent *ex;
2547 	ext4_fsblk_t pblk;
2548 
2549 	/* the header must have been checked already in ext4_ext_remove_space() */
2550 	ext_debug("truncate since %u in leaf to %u\n", start, end);
2551 	if (!path[depth].p_hdr)
2552 		path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
2553 	eh = path[depth].p_hdr;
2554 	if (unlikely(path[depth].p_hdr == NULL)) {
2555 		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2556 		return -EFSCORRUPTED;
2557 	}
2558 	/* find where to start removing */
2559 	ex = path[depth].p_ext;
2560 	if (!ex)
2561 		ex = EXT_LAST_EXTENT(eh);
2562 
2563 	ex_ee_block = le32_to_cpu(ex->ee_block);
2564 	ex_ee_len = ext4_ext_get_actual_len(ex);
2565 
2566 	trace_ext4_ext_rm_leaf(inode, start, ex, partial);
2567 
2568 	while (ex >= EXT_FIRST_EXTENT(eh) &&
2569 			ex_ee_block + ex_ee_len > start) {
2570 
2571 		if (ext4_ext_is_unwritten(ex))
2572 			unwritten = 1;
2573 		else
2574 			unwritten = 0;
2575 
2576 		ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
2577 			  unwritten, ex_ee_len);
2578 		path[depth].p_ext = ex;
2579 
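		/* [a, b] is the part of this extent inside the removed range */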
2580 		a = ex_ee_block > start ? ex_ee_block : start;
2581 		b = ex_ee_block+ex_ee_len - 1 < end ?
2582 			ex_ee_block+ex_ee_len - 1 : end;
2583 
2584 		ext_debug("  border %u:%u\n", a, b);
2585 
2586 		/* If this extent is beyond the end of the hole, skip it */
2587 		if (end < ex_ee_block) {
2588 			/*
2589 			 * We're going to skip this extent and move to another,
2590 			 * so note that its first cluster is in use to avoid
2591 			 * freeing it when removing blocks.  Eventually, the
2592 			 * right edge of the truncated/punched region will
2593 			 * be just to the left.
2594 			 */
2595 			if (sbi->s_cluster_ratio > 1) {
2596 				pblk = ext4_ext_pblock(ex);
2597 				partial->pclu = EXT4_B2C(sbi, pblk);
2598 				partial->state = nofree;
2599 			}
2600 			ex--;
2601 			ex_ee_block = le32_to_cpu(ex->ee_block);
2602 			ex_ee_len = ext4_ext_get_actual_len(ex);
2603 			continue;
2604 		} else if (b != ex_ee_block + ex_ee_len - 1) {
2605 			EXT4_ERROR_INODE(inode,
2606 					 "can not handle truncate %u:%u "
2607 					 "on extent %u:%u",
2608 					 start, end, ex_ee_block,
2609 					 ex_ee_block + ex_ee_len - 1);
2610 			err = -EFSCORRUPTED;
2611 			goto out;
2612 		} else if (a != ex_ee_block) {
2613 			/* remove tail of the extent */
2614 			num = a - ex_ee_block;
2615 		} else {
2616 			/* remove whole extent: excellent! */
2617 			num = 0;
2618 		}
2619 		/*
2620 		 * 3 for leaf, sb, and inode plus 2 (bmap and group
2621 		 * descriptor) for each block group; assume two block
2622 		 * groups plus ex_ee_len/blocks_per_block_group for
2623 		 * the worst case
2624 		 */
2625 		credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
2626 		if (ex == EXT_FIRST_EXTENT(eh)) {
2627 			correct_index = 1;
2628 			credits += (ext_depth(inode)) + 1;
2629 		}
2630 		credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
2631 		/*
2632 		 * We may end up freeing some index blocks and data from the
2633 		 * punched range. Note that partial clusters are accounted for
2634 		 * by ext4_free_data_revoke_credits().
2635 		 */
2636 		revoke_credits =
2637 			ext4_free_metadata_revoke_credits(inode->i_sb,
2638 							  ext_depth(inode)) +
2639 			ext4_free_data_revoke_credits(inode, b - a + 1);
2640 
2641 		err = ext4_datasem_ensure_credits(handle, inode, credits,
2642 						  credits, revoke_credits);
2643 		if (err) {
2644 			if (err > 0)
2645 				err = -EAGAIN;
2646 			goto out;
2647 		}
2648 
2649 		err = ext4_ext_get_access(handle, inode, path + depth);
2650 		if (err)
2651 			goto out;
2652 
2653 		err = ext4_remove_blocks(handle, inode, ex, partial, a, b);
2654 		if (err)
2655 			goto out;
2656 
2657 		if (num == 0)
2658 			/* this extent is removed; mark slot entirely unused */
2659 			ext4_ext_store_pblock(ex, 0);
2660 
2661 		ex->ee_len = cpu_to_le16(num);
2662 		/*
2663 		 * Do not mark unwritten if all the blocks in the
2664 		 * extent have been removed.
2665 		 */
2666 		if (unwritten && num)
2667 			ext4_ext_mark_unwritten(ex);
2668 		/*
2669 		 * If the extent was completely released,
2670 		 * we need to remove it from the leaf
2671 		 */
2672 		if (num == 0) {
2673 			if (end != EXT_MAX_BLOCKS - 1) {
2674 				/*
2675 				 * For hole punching, we need to scoot all the
2676 				 * extents up when an extent is removed so that
2677 				 * we don't have blank extents in the middle
2678 				 */
2679 				memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
2680 					sizeof(struct ext4_extent));
2681 
2682 				/* Now get rid of the one at the end */
2683 				memset(EXT_LAST_EXTENT(eh), 0,
2684 					sizeof(struct ext4_extent));
2685 			}
2686 			le16_add_cpu(&eh->eh_entries, -1);
2687 		}
2688 
2689 		err = ext4_ext_dirty(handle, inode, path + depth);
2690 		if (err)
2691 			goto out;
2692 
2693 		ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num,
2694 				ext4_ext_pblock(ex));
2695 		ex--;
2696 		ex_ee_block = le32_to_cpu(ex->ee_block);
2697 		ex_ee_len = ext4_ext_get_actual_len(ex);
2698 	}
2699 
2700 	if (correct_index && eh->eh_entries)
2701 		err = ext4_ext_correct_indexes(handle, inode, path);
2702 
2703 	/*
2704 	 * If there's a partial cluster and at least one extent remains in
2705 	 * the leaf, free the partial cluster if it isn't shared with the
2706 	 * current extent.  If it is shared with the current extent
2707 	 * we reset the partial cluster because we've reached the start of the
2708 	 * truncated/punched region and we're done removing blocks.
2709 	 */
2710 	if (partial->state == tofree && ex >= EXT_FIRST_EXTENT(eh)) {
2711 		pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
2712 		if (partial->pclu != EXT4_B2C(sbi, pblk)) {
2713 			int flags = get_default_free_blocks_flags(inode);
2714 
2715 			if (ext4_is_pending(inode, partial->lblk))
2716 				flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
2717 			ext4_free_blocks(handle, inode, NULL,
2718 					 EXT4_C2B(sbi, partial->pclu),
2719 					 sbi->s_cluster_ratio, flags);
2720 			if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
2721 				ext4_rereserve_cluster(inode, partial->lblk);
2722 		}
2723 		partial->state = initial;
2724 	}
2725 
2726 	/* if this leaf is free, then we should
2727 	 * remove it from index block above */
2728 	if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
2729 		err = ext4_ext_rm_idx(handle, inode, path, depth);
2730 
2731 out:
2732 	return err;
2733 }
2734 
2735 /*
2736  * ext4_ext_more_to_rm:
2737  * returns 1 if current index has to be freed (even partial)
2738  * returns 1 if the current index has to be freed (even if only partially)
2739 static int
2740 ext4_ext_more_to_rm(struct ext4_ext_path *path)
2741 {
2742 	BUG_ON(path->p_idx == NULL);
2743 
2744 	if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
2745 		return 0;
2746 
2747 	/*
2748 	 * if truncate on deeper level happened, it wasn't partial,
2749 	 * so we have to consider current index for truncation
2750 	 */
2751 	if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
2752 		return 0;
2753 	return 1;
2754 }
2755 
2756 int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
2757 			  ext4_lblk_t end)
2758 {
2759 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2760 	int depth = ext_depth(inode);
2761 	struct ext4_ext_path *path = NULL;
2762 	struct partial_cluster partial;
2763 	handle_t *handle;
2764 	int i = 0, err = 0;
2765 
2766 	partial.pclu = 0;
2767 	partial.lblk = 0;
2768 	partial.state = initial;
2769 
2770 	ext_debug("truncate since %u to %u\n", start, end);
2771 
2772 	/* probably first extent we're gonna free will be last in block */
2773 	handle = ext4_journal_start_with_revoke(inode, EXT4_HT_TRUNCATE,
2774 			depth + 1,
2775 			ext4_free_metadata_revoke_credits(inode->i_sb, depth));
2776 	if (IS_ERR(handle))
2777 		return PTR_ERR(handle);
2778 
2779 again:
2780 	trace_ext4_ext_remove_space(inode, start, end, depth);
2781 
2782 	/*
2783 	 * Check if we are removing extents inside the extent tree. If that
2784 	 * is the case, we are going to punch a hole inside the extent tree
2785 	 * so we have to check whether we need to split the extent covering
2786 	 * the last block to remove so we can easily remove the part of it
2787 	 * in ext4_ext_rm_leaf().
2788 	 */
2789 	if (end < EXT_MAX_BLOCKS - 1) {
2790 		struct ext4_extent *ex;
2791 		ext4_lblk_t ee_block, ex_end, lblk;
2792 		ext4_fsblk_t pblk;
2793 
2794 		/* find extent for or closest extent to this block */
2795 		path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE);
2796 		if (IS_ERR(path)) {
2797 			ext4_journal_stop(handle);
2798 			return PTR_ERR(path);
2799 		}
2800 		depth = ext_depth(inode);
2801 		/* Leaf may not exist only if inode has no blocks at all */
2802 		ex = path[depth].p_ext;
2803 		if (!ex) {
2804 			if (depth) {
2805 				EXT4_ERROR_INODE(inode,
2806 						 "path[%d].p_hdr == NULL",
2807 						 depth);
2808 				err = -EFSCORRUPTED;
2809 			}
2810 			goto out;
2811 		}
2812 
2813 		ee_block = le32_to_cpu(ex->ee_block);
2814 		ex_end = ee_block + ext4_ext_get_actual_len(ex) - 1;
2815 
2816 		/*
2817 		 * See if the last block is inside the extent, if so split
2818 		 * the extent at 'end' block so we can easily remove the
2819 		 * tail of the first part of the split extent in
2820 		 * ext4_ext_rm_leaf().
2821 		 */
2822 		if (end >= ee_block && end < ex_end) {
2823 
2824 			/*
2825 			 * If we're going to split the extent, note that
2826 			 * the cluster containing the block after 'end' is
2827 			 * in use to avoid freeing it when removing blocks.
2828 			 */
2829 			if (sbi->s_cluster_ratio > 1) {
2830 				pblk = ext4_ext_pblock(ex) + end - ee_block + 1;
2831 				partial.pclu = EXT4_B2C(sbi, pblk);
2832 				partial.state = nofree;
2833 			}
2834 
2835 			/*
2836 			 * Split the extent in two so that 'end' is the last
2837 			 * block in the first new extent. Also we should not
2838 			 * fail removing space due to ENOSPC so try to use
2839 			 * reserved block if that happens.
2840 			 */
2841 			err = ext4_force_split_extent_at(handle, inode, &path,
2842 							 end + 1, 1);
2843 			if (err < 0)
2844 				goto out;
2845 
2846 		} else if (sbi->s_cluster_ratio > 1 && end >= ex_end &&
2847 			   partial.state == initial) {
2848 			/*
2849 			 * If we're punching, there's an extent to the right.
2850 			 * If the partial cluster hasn't been set, set it to
2851 			 * that extent's first cluster and its state to nofree
2852 			 * so it won't be freed should it contain blocks to be
2853 			 * removed. If it's already set (tofree/nofree), we're
2854 			 * retrying and keep the original partial cluster info
2855 			 * so a cluster marked tofree as a result of earlier
2856 			 * extent removal is not lost.
2857 			 */
2858 			lblk = ex_end + 1;
2859 			err = ext4_ext_search_right(inode, path, &lblk, &pblk,
2860 						    &ex);
2861 			if (err)
2862 				goto out;
2863 			if (pblk) {
2864 				partial.pclu = EXT4_B2C(sbi, pblk);
2865 				partial.state = nofree;
2866 			}
2867 		}
2868 	}
2869 	/*
2870 	 * We start scanning from the right side, freeing all the blocks
2871 	 * after i_size and walking into the tree depth-wise.
2872 	 */
2873 	depth = ext_depth(inode);
2874 	if (path) {
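		/*
		 * Reuse the path set up by the split above; force p_block to
		 * differ from eh_entries at each index level so that
		 * ext4_ext_more_to_rm() descends through them again.
		 */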
2875 		int k = i = depth;
2876 		while (--k > 0)
2877 			path[k].p_block =
2878 				le16_to_cpu(path[k].p_hdr->eh_entries)+1;
2879 	} else {
2880 		path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
2881 			       GFP_NOFS);
2882 		if (path == NULL) {
2883 			ext4_journal_stop(handle);
2884 			return -ENOMEM;
2885 		}
2886 		path[0].p_maxdepth = path[0].p_depth = depth;
2887 		path[0].p_hdr = ext_inode_hdr(inode);
2888 		i = 0;
2889 
2890 		if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) {
2891 			err = -EFSCORRUPTED;
2892 			goto out;
2893 		}
2894 	}
2895 	err = 0;
2896 
2897 	while (i >= 0 && err == 0) {
2898 		if (i == depth) {
2899 			/* this is leaf block */
2900 			err = ext4_ext_rm_leaf(handle, inode, path,
2901 					       &partial, start, end);
2902 			/* root level has p_bh == NULL, brelse() eats this */
2903 			brelse(path[i].p_bh);
2904 			path[i].p_bh = NULL;
2905 			i--;
2906 			continue;
2907 		}
2908 
2909 		/* this is index block */
2910 		if (!path[i].p_hdr) {
2911 			ext_debug("initialize header\n");
2912 			path[i].p_hdr = ext_block_hdr(path[i].p_bh);
2913 		}
2914 
2915 		if (!path[i].p_idx) {
2916 			/* this level hasn't been touched yet */
2917 			path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
2918 			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
2919 			ext_debug("init index ptr: hdr 0x%p, num %d\n",
2920 				  path[i].p_hdr,
2921 				  le16_to_cpu(path[i].p_hdr->eh_entries));
2922 		} else {
2923 			/* we were already here, see at next index */
2924 			path[i].p_idx--;
2925 		}
2926 
2927 		ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
2928 				i, EXT_FIRST_INDEX(path[i].p_hdr),
2929 				path[i].p_idx);
2930 		if (ext4_ext_more_to_rm(path + i)) {
2931 			struct buffer_head *bh;
2932 			/* go to the next level */
2933 			ext_debug("move to level %d (block %llu)\n",
2934 				  i + 1, ext4_idx_pblock(path[i].p_idx));
2935 			memset(path + i + 1, 0, sizeof(*path));
2936 			bh = read_extent_tree_block(inode,
2937 				ext4_idx_pblock(path[i].p_idx), depth - i - 1,
2938 				EXT4_EX_NOCACHE);
2939 			if (IS_ERR(bh)) {
2940 				/* should we reset i_size? */
2941 				err = PTR_ERR(bh);
2942 				break;
2943 			}
2944 			/* Yield here to deal with large extent trees.
2945 			 * Should be a no-op if we did IO above. */
2946 			cond_resched();
2947 			if (WARN_ON(i + 1 > depth)) {
2948 				err = -EFSCORRUPTED;
2949 				break;
2950 			}
2951 			path[i + 1].p_bh = bh;
2952 
2953 			/* save actual number of indexes since this
2954 			 * number is changed at the next iteration */
2955 			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
2956 			i++;
2957 		} else {
2958 			/* we finished processing this index, go up */
2959 			if (path[i].p_hdr->eh_entries == 0 && i > 0) {
2960 				/* index is empty, remove it;
2961 				 * handle must already be prepared by the
2962 				 * leaf removal above */
2963 				err = ext4_ext_rm_idx(handle, inode, path, i);
2964 			}
2965 			/* root level has p_bh == NULL, brelse() eats this */
2966 			brelse(path[i].p_bh);
2967 			path[i].p_bh = NULL;
2968 			i--;
2969 			ext_debug("return to level %d\n", i);
2970 		}
2971 	}
2972 
2973 	trace_ext4_ext_remove_space_done(inode, start, end, depth, &partial,
2974 					 path->p_hdr->eh_entries);
2975 
2976 	/*
2977 	 * if there's a partial cluster and we have removed the first extent
2978 	 * in the file, then we also free the partial cluster, if any
2979 	 */
2980 	if (partial.state == tofree && err == 0) {
2981 		int flags = get_default_free_blocks_flags(inode);
2982 
2983 		if (ext4_is_pending(inode, partial.lblk))
2984 			flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
2985 		ext4_free_blocks(handle, inode, NULL,
2986 				 EXT4_C2B(sbi, partial.pclu),
2987 				 sbi->s_cluster_ratio, flags);
2988 		if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
2989 			ext4_rereserve_cluster(inode, partial.lblk);
2990 		partial.state = initial;
2991 	}
2992 
2993 	/* TODO: flexible tree reduction should be here */
2994 	if (path->p_hdr->eh_entries == 0) {
2995 		/*
2996 		 * truncate to zero freed all the tree,
2997 		 * so we need to correct eh_depth
2998 		 */
2999 		err = ext4_ext_get_access(handle, inode, path);
3000 		if (err == 0) {
3001 			ext_inode_hdr(inode)->eh_depth = 0;
3002 			ext_inode_hdr(inode)->eh_max =
3003 				cpu_to_le16(ext4_ext_space_root(inode, 0));
3004 			err = ext4_ext_dirty(handle, inode, path);
3005 		}
3006 	}
3007 out:
3008 	ext4_ext_drop_refs(path);
3009 	kfree(path);
3010 	path = NULL;
3011 	if (err == -EAGAIN)
3012 		goto again;
3013 	ext4_journal_stop(handle);
3014 
3015 	return err;
3016 }
3017 
3018 /*
3019  * called at mount time
3020  */
3021 void ext4_ext_init(struct super_block *sb)
3022 {
3023 	/*
3024 	 * possible initialization would be here
3025 	 */
3026 
3027 	if (ext4_has_feature_extents(sb)) {
3028 #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
3029 		printk(KERN_INFO "EXT4-fs: file extents enabled"
3030 #ifdef AGGRESSIVE_TEST
3031 		       ", aggressive tests"
3032 #endif
3033 #ifdef CHECK_BINSEARCH
3034 		       ", check binsearch"
3035 #endif
3036 #ifdef EXTENTS_STATS
3037 		       ", stats"
3038 #endif
3039 		       "\n");
3040 #endif
3041 #ifdef EXTENTS_STATS
3042 		spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
3043 		EXT4_SB(sb)->s_ext_min = 1 << 30;
3044 		EXT4_SB(sb)->s_ext_max = 0;
3045 #endif
3046 	}
3047 }
3048 
3049 /*
3050  * called at umount time
3051  */
3052 void ext4_ext_release(struct super_block *sb)
3053 {
3054 	if (!ext4_has_feature_extents(sb))
3055 		return;
3056 
3057 #ifdef EXTENTS_STATS
3058 	if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
3059 		struct ext4_sb_info *sbi = EXT4_SB(sb);
3060 		printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
3061 			sbi->s_ext_blocks, sbi->s_ext_extents,
3062 			sbi->s_ext_blocks / sbi->s_ext_extents);
3063 		printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
3064 			sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
3065 	}
3066 #endif
3067 }
3068 
3069 static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
3070 {
3071 	ext4_lblk_t  ee_block;
3072 	ext4_fsblk_t ee_pblock;
3073 	unsigned int ee_len;
3074 
3075 	ee_block  = le32_to_cpu(ex->ee_block);
3076 	ee_len    = ext4_ext_get_actual_len(ex);
3077 	ee_pblock = ext4_ext_pblock(ex);
3078 
3079 	if (ee_len == 0)
3080 		return 0;
3081 
3082 	return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
3083 				     EXTENT_STATUS_WRITTEN);
3084 }
3085 
3086 /* FIXME!! we need to try to merge to left or right after zero-out  */
3087 static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
3088 {
3089 	ext4_fsblk_t ee_pblock;
3090 	unsigned int ee_len;
3091 
3092 	ee_len    = ext4_ext_get_actual_len(ex);
3093 	ee_pblock = ext4_ext_pblock(ex);
3094 	return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block), ee_pblock,
3095 				  ee_len);
3096 }
3097 
3098 /*
3099  * ext4_split_extent_at() splits an extent at given block.
3100  *
3101  * @handle: the journal handle
3102  * @inode: the file inode
3103  * @path: the path to the extent
3104  * @split: the logical block where the extent is split.
3105  * @split_flag: indicates if the extent can be zeroed out when the split
3106  *		 fails, and the states (initialized or unwritten) of the new extents.
3107  * @flags: flags used to insert the new extent into the extent tree.
3108  *
3109  *
3110  * Splits extent [a, b] into two extents [a, @split) and [@split, b], the
3111  * states of which are determined by @split_flag.
3112  *
3113  * There are two cases:
3114  *  a> the extent is split into two extents.
3115  *  b> no split is needed, and the extent is just marked.
3116  *
3117  * return 0 on success.
3118  */
3119 static int ext4_split_extent_at(handle_t *handle,
3120 			     struct inode *inode,
3121 			     struct ext4_ext_path **ppath,
3122 			     ext4_lblk_t split,
3123 			     int split_flag,
3124 			     int flags)
3125 {
3126 	struct ext4_ext_path *path = *ppath;
3127 	ext4_fsblk_t newblock;
3128 	ext4_lblk_t ee_block;
3129 	struct ext4_extent *ex, newex, orig_ex, zero_ex;
3130 	struct ext4_extent *ex2 = NULL;
3131 	unsigned int ee_len, depth;
3132 	int err = 0;
3133 
3134 	BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) ==
3135 	       (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2));
3136 
3137 	ext_debug("ext4_split_extent_at: inode %lu, logical "
3138 		"block %llu\n", inode->i_ino, (unsigned long long)split);
3139 
3140 	ext4_ext_show_leaf(inode, path);
3141 
3142 	depth = ext_depth(inode);
3143 	ex = path[depth].p_ext;
3144 	ee_block = le32_to_cpu(ex->ee_block);
3145 	ee_len = ext4_ext_get_actual_len(ex);
3146 	newblock = split - ee_block + ext4_ext_pblock(ex);
3147 
3148 	BUG_ON(split < ee_block || split >= (ee_block + ee_len));
3149 	BUG_ON(!ext4_ext_is_unwritten(ex) &&
3150 	       split_flag & (EXT4_EXT_MAY_ZEROOUT |
3151 			     EXT4_EXT_MARK_UNWRIT1 |
3152 			     EXT4_EXT_MARK_UNWRIT2));
3153 
3154 	err = ext4_ext_get_access(handle, inode, path + depth);
3155 	if (err)
3156 		goto out;
3157 
3158 	if (split == ee_block) {
3159 		/*
3160 		 * case b: block @split is the block that the extent begins with,
3161 		 * so we just change the state of the extent, and splitting
3162 		 * is not needed.
3163 		 */
3164 		if (split_flag & EXT4_EXT_MARK_UNWRIT2)
3165 			ext4_ext_mark_unwritten(ex);
3166 		else
3167 			ext4_ext_mark_initialized(ex);
3168 
3169 		if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
3170 			ext4_ext_try_to_merge(handle, inode, path, ex);
3171 
3172 		err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3173 		goto out;
3174 	}
3175 
3176 	/* case a */
3177 	memcpy(&orig_ex, ex, sizeof(orig_ex));
3178 	ex->ee_len = cpu_to_le16(split - ee_block);
3179 	if (split_flag & EXT4_EXT_MARK_UNWRIT1)
3180 		ext4_ext_mark_unwritten(ex);
3181 
3182 	/*
3183 	 * the path may lead to a new leaf, not to the original leaf
3184 	 * any more, after ext4_ext_insert_extent() returns.
3185 	 */
3186 	err = ext4_ext_dirty(handle, inode, path + depth);
3187 	if (err)
3188 		goto fix_extent_len;
3189 
3190 	ex2 = &newex;
3191 	ex2->ee_block = cpu_to_le32(split);
3192 	ex2->ee_len   = cpu_to_le16(ee_len - (split - ee_block));
3193 	ext4_ext_store_pblock(ex2, newblock);
3194 	if (split_flag & EXT4_EXT_MARK_UNWRIT2)
3195 		ext4_ext_mark_unwritten(ex2);
3196 
3197 	err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags);
3198 	if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3199 		if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
3200 			if (split_flag & EXT4_EXT_DATA_VALID1) {
3201 				err = ext4_ext_zeroout(inode, ex2);
3202 				zero_ex.ee_block = ex2->ee_block;
3203 				zero_ex.ee_len = cpu_to_le16(
3204 						ext4_ext_get_actual_len(ex2));
3205 				ext4_ext_store_pblock(&zero_ex,
3206 						      ext4_ext_pblock(ex2));
3207 			} else {
3208 				err = ext4_ext_zeroout(inode, ex);
3209 				zero_ex.ee_block = ex->ee_block;
3210 				zero_ex.ee_len = cpu_to_le16(
3211 						ext4_ext_get_actual_len(ex));
3212 				ext4_ext_store_pblock(&zero_ex,
3213 						      ext4_ext_pblock(ex));
3214 			}
3215 		} else {
3216 			err = ext4_ext_zeroout(inode, &orig_ex);
3217 			zero_ex.ee_block = orig_ex.ee_block;
3218 			zero_ex.ee_len = cpu_to_le16(
3219 						ext4_ext_get_actual_len(&orig_ex));
3220 			ext4_ext_store_pblock(&zero_ex,
3221 					      ext4_ext_pblock(&orig_ex));
3222 		}
3223 
3224 		if (err)
3225 			goto fix_extent_len;
3226 		/* update the extent length and mark as initialized */
3227 		ex->ee_len = cpu_to_le16(ee_len);
3228 		ext4_ext_try_to_merge(handle, inode, path, ex);
3229 		err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3230 		if (err)
3231 			goto fix_extent_len;
3232 
3233 		/* update extent status tree */
3234 		err = ext4_zeroout_es(inode, &zero_ex);
3235 
3236 		goto out;
3237 	} else if (err)
3238 		goto fix_extent_len;
3239 
3240 out:
3241 	ext4_ext_show_leaf(inode, path);
3242 	return err;
3243 
3244 fix_extent_len:
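	/* restore the original length so the on-disk extent stays consistent */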
3245 	ex->ee_len = orig_ex.ee_len;
3246 	ext4_ext_dirty(handle, inode, path + path->p_depth);
3247 	return err;
3248 }
3249 
3250 /*
3251  * ext4_split_extent() splits an extent and marks the extent covered
3252  * by @map as @split_flag indicates.
3253  *
3254  * It may result in splitting the extent into multiple extents (up to three).
3255  * There are three possibilities:
3256  *   a> There is no split required
3257  *   b> Splits into two extents: The split happens at either end of the extent
3258  *   c> Splits into three extents: Someone is splitting in the middle of the extent
3259  *
3260  */
3261 static int ext4_split_extent(handle_t *handle,
3262 			      struct inode *inode,
3263 			      struct ext4_ext_path **ppath,
3264 			      struct ext4_map_blocks *map,
3265 			      int split_flag,
3266 			      int flags)
3267 {
3268 	struct ext4_ext_path *path = *ppath;
3269 	ext4_lblk_t ee_block;
3270 	struct ext4_extent *ex;
3271 	unsigned int ee_len, depth;
3272 	int err = 0;
3273 	int unwritten;
3274 	int split_flag1, flags1;
3275 	int allocated = map->m_len;
3276 
3277 	depth = ext_depth(inode);
3278 	ex = path[depth].p_ext;
3279 	ee_block = le32_to_cpu(ex->ee_block);
3280 	ee_len = ext4_ext_get_actual_len(ex);
3281 	unwritten = ext4_ext_is_unwritten(ex);
3282 
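	/*
	 * The mapped range ends inside this extent: split off the tail
	 * beyond map->m_lblk + map->m_len first.
	 */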
3283 	if (map->m_lblk + map->m_len < ee_block + ee_len) {
3284 		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
3285 		flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
3286 		if (unwritten)
3287 			split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
3288 				       EXT4_EXT_MARK_UNWRIT2;
3289 		if (split_flag & EXT4_EXT_DATA_VALID2)
3290 			split_flag1 |= EXT4_EXT_DATA_VALID1;
3291 		err = ext4_split_extent_at(handle, inode, ppath,
3292 				map->m_lblk + map->m_len, split_flag1, flags1);
3293 		if (err)
3294 			goto out;
3295 	} else {
3296 		allocated = ee_len - (map->m_lblk - ee_block);
3297 	}
3298 	/*
3299 	 * Updating the path is required because the previous ext4_split_extent_at()
3300 	 * may have resulted in a split of the original leaf or an extent zeroout.
3301 	 */
3302 	path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
3303 	if (IS_ERR(path))
3304 		return PTR_ERR(path);
3305 	depth = ext_depth(inode);
3306 	ex = path[depth].p_ext;
3307 	if (!ex) {
3308 		EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
3309 				 (unsigned long) map->m_lblk);
3310 		return -EFSCORRUPTED;
3311 	}
3312 	unwritten = ext4_ext_is_unwritten(ex);
3313 	split_flag1 = 0;
3314 
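	/*
	 * Split (or just re-mark) at map->m_lblk so that the extent covering
	 * the mapping starts exactly at map->m_lblk.
	 */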
3315 	if (map->m_lblk >= ee_block) {
3316 		split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
3317 		if (unwritten) {
3318 			split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
3319 			split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
3320 						     EXT4_EXT_MARK_UNWRIT2);
3321 		}
3322 		err = ext4_split_extent_at(handle, inode, ppath,
3323 				map->m_lblk, split_flag1, flags);
3324 		if (err)
3325 			goto out;
3326 	}
3327 
3328 	ext4_ext_show_leaf(inode, path);
3329 out:
3330 	return err ? err : allocated;
3331 }
3332 
3333 /*
3334  * This function is called by ext4_ext_map_blocks() if someone tries to write
3335  * to an unwritten extent. It may result in splitting the unwritten
3336  * extent into multiple extents (up to three - one initialized and two
3337  * unwritten).
3338  * There are three possibilities:
3339  *   a> There is no split required: Entire extent should be initialized
3340  *   b> Splits in two extents: Write is happening at either end of the extent
3341  *   c> Splits in three extents: Someone is writing in the middle of the extent
3342  *
3343  * Pre-conditions:
3344  *  - The extent pointed to by 'path' is unwritten.
3345  *  - The extent pointed to by 'path' contains a superset
3346  *    of the logical span [map->m_lblk, map->m_lblk + map->m_len).
3347  *
3348  * Post-conditions on success:
3349  *  - the returned value is the number of blocks beyond map->l_lblk
3350  *  - the returned value is the number of blocks beyond map->m_lblk
3351  *    It is guaranteed to be >= map->m_len.
3352  */
3353 static int ext4_ext_convert_to_initialized(handle_t *handle,
3354 					   struct inode *inode,
3355 					   struct ext4_map_blocks *map,
3356 					   struct ext4_ext_path **ppath,
3357 					   int flags)
3358 {
3359 	struct ext4_ext_path *path = *ppath;
3360 	struct ext4_sb_info *sbi;
3361 	struct ext4_extent_header *eh;
3362 	struct ext4_map_blocks split_map;
3363 	struct ext4_extent zero_ex1, zero_ex2;
3364 	struct ext4_extent *ex, *abut_ex;
3365 	ext4_lblk_t ee_block, eof_block;
3366 	unsigned int ee_len, depth, map_len = map->m_len;
3367 	int allocated = 0, max_zeroout = 0;
3368 	int err = 0;
3369 	int split_flag = EXT4_EXT_DATA_VALID2;
3370 
3371 	ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical "
3372 		"block %llu, max_blocks %u\n", inode->i_ino,
3373 		(unsigned long long)map->m_lblk, map_len);
3374 
3375 	sbi = EXT4_SB(inode->i_sb);
3376 	eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1)
3377 			>> inode->i_sb->s_blocksize_bits;
3378 	if (eof_block < map->m_lblk + map_len)
3379 		eof_block = map->m_lblk + map_len;
3380 
3381 	depth = ext_depth(inode);
3382 	eh = path[depth].p_hdr;
3383 	ex = path[depth].p_ext;
3384 	ee_block = le32_to_cpu(ex->ee_block);
3385 	ee_len = ext4_ext_get_actual_len(ex);
3386 	zero_ex1.ee_len = 0;
3387 	zero_ex2.ee_len = 0;
3388 
3389 	trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
3390 
3391 	/* Pre-conditions */
3392 	BUG_ON(!ext4_ext_is_unwritten(ex));
3393 	BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
3394 
3395 	/*
3396 	 * Attempt to transfer newly initialized blocks from the currently
3397 	 * unwritten extent to its neighbor. This is much cheaper
3398 	 * than an insertion followed by a merge as those involve costly
3399 	 * memmove() calls. Transferring to the left is the common case in
3400 	 * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
3401 	 * followed by append writes.
3402 	 *
3403 	 * Limitations of the current logic:
3404 	 *  - L1: we do not deal with writes covering the whole extent.
3405 	 *    This would require removing the extent if the transfer
3406 	 *    is possible.
3407 	 *  - L2: we only attempt to merge with an extent stored in the
3408 	 *    same extent tree node.
3409 	 */
3410 	if ((map->m_lblk == ee_block) &&
3411 		/* See if we can merge left */
3412 		(map_len < ee_len) &&		/*L1*/
3413 		(ex > EXT_FIRST_EXTENT(eh))) {	/*L2*/
3414 		ext4_lblk_t prev_lblk;
3415 		ext4_fsblk_t prev_pblk, ee_pblk;
3416 		unsigned int prev_len;
3417 
3418 		abut_ex = ex - 1;
3419 		prev_lblk = le32_to_cpu(abut_ex->ee_block);
3420 		prev_len = ext4_ext_get_actual_len(abut_ex);
3421 		prev_pblk = ext4_ext_pblock(abut_ex);
3422 		ee_pblk = ext4_ext_pblock(ex);
3423 
3424 		/*
3425 		 * A transfer of blocks from 'ex' to 'abut_ex' is allowed
3426 		 * upon those conditions:
3427 		 * - C1: abut_ex is initialized,
3428 		 * - C2: abut_ex is logically abutting ex,
3429 		 * - C3: abut_ex is physically abutting ex,
3430 		 * - C4: abut_ex can receive the additional blocks without
3431 		 *   overflowing the (initialized) length limit.
3432 		 */
3433 		if ((!ext4_ext_is_unwritten(abut_ex)) &&		/*C1*/
3434 			((prev_lblk + prev_len) == ee_block) &&		/*C2*/
3435 			((prev_pblk + prev_len) == ee_pblk) &&		/*C3*/
3436 			(prev_len < (EXT_INIT_MAX_LEN - map_len))) {	/*C4*/
3437 			err = ext4_ext_get_access(handle, inode, path + depth);
3438 			if (err)
3439 				goto out;
3440 
3441 			trace_ext4_ext_convert_to_initialized_fastpath(inode,
3442 				map, ex, abut_ex);
3443 
3444 			/* Shift the start of ex by 'map_len' blocks */
3445 			ex->ee_block = cpu_to_le32(ee_block + map_len);
3446 			ext4_ext_store_pblock(ex, ee_pblk + map_len);
3447 			ex->ee_len = cpu_to_le16(ee_len - map_len);
3448 			ext4_ext_mark_unwritten(ex); /* Restore the flag */
3449 
3450 			/* Extend abut_ex by 'map_len' blocks */
3451 			abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
3452 
3453 			/* Result: number of initialized blocks past m_lblk */
3454 			allocated = map_len;
3455 		}
3456 	} else if (((map->m_lblk + map_len) == (ee_block + ee_len)) &&
3457 		   (map_len < ee_len) &&	/*L1*/
3458 		   ex < EXT_LAST_EXTENT(eh)) {	/*L2*/
3459 		/* See if we can merge right */
3460 		ext4_lblk_t next_lblk;
3461 		ext4_fsblk_t next_pblk, ee_pblk;
3462 		unsigned int next_len;
3463 
3464 		abut_ex = ex + 1;
3465 		next_lblk = le32_to_cpu(abut_ex->ee_block);
3466 		next_len = ext4_ext_get_actual_len(abut_ex);
3467 		next_pblk = ext4_ext_pblock(abut_ex);
3468 		ee_pblk = ext4_ext_pblock(ex);
3469 
3470 		/*
3471 		 * A transfer of blocks from 'ex' to 'abut_ex' is allowed
3472 		 * upon those conditions:
3473 		 * - C1: abut_ex is initialized,
3474 		 * - C2: abut_ex is logically abutting ex,
3475 		 * - C3: abut_ex is physically abutting ex,
3476 		 * - C4: abut_ex can receive the additional blocks without
3477 		 *   overflowing the (initialized) length limit.
3478 		 */
3479 		if ((!ext4_ext_is_unwritten(abut_ex)) &&		/*C1*/
3480 		    ((map->m_lblk + map_len) == next_lblk) &&		/*C2*/
3481 		    ((ee_pblk + ee_len) == next_pblk) &&		/*C3*/
3482 		    (next_len < (EXT_INIT_MAX_LEN - map_len))) {	/*C4*/
3483 			err = ext4_ext_get_access(handle, inode, path + depth);
3484 			if (err)
3485 				goto out;
3486 
3487 			trace_ext4_ext_convert_to_initialized_fastpath(inode,
3488 				map, ex, abut_ex);
3489 
3490 			/* Shift the start of abut_ex by 'map_len' blocks */
3491 			abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
3492 			ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
3493 			ex->ee_len = cpu_to_le16(ee_len - map_len);
3494 			ext4_ext_mark_unwritten(ex); /* Restore the flag */
3495 
3496 			/* Extend abut_ex by 'map_len' blocks */
3497 			abut_ex->ee_len = cpu_to_le16(next_len + map_len);
3498 
3499 			/* Result: number of initialized blocks past m_lblk */
3500 			allocated = map_len;
3501 		}
3502 	}
3503 	if (allocated) {
3504 		/* Mark the block containing both extents as dirty */
3505 		ext4_ext_dirty(handle, inode, path + depth);
3506 
3507 		/* Update path to point to the right extent */
3508 		path[depth].p_ext = abut_ex;
3509 		goto out;
3510 	} else
3511 		allocated = ee_len - (map->m_lblk - ee_block);
3512 
3513 	WARN_ON(map->m_lblk < ee_block);
3514 	/*
3515 	 * It is safe to convert extent to initialized via explicit
3516 	 * zeroout only if extent is fully inside i_size or new_size.
3517 	 */
3518 	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
3519 
3520 	if (EXT4_EXT_MAY_ZEROOUT & split_flag)
3521 		max_zeroout = sbi->s_extent_max_zeroout_kb >>
3522 			(inode->i_sb->s_blocksize_bits - 10);
3523 
3524 	/*
3525 	 * five cases:
3526 	 * 1. split the extent into three extents.
3527 	 * 2. split the extent into two extents, zeroout the head of the first
3528 	 *    extent.
3529 	 * 3. split the extent into two extents, zeroout the tail of the second
3530 	 *    extent.
3531 	 * 4. split the extent into two extents without zeroout.
3532 	 * 5. no splitting needed, just possibly zeroout the head and/or the
3533 	 *    tail of the extent.
3534 	 */
3535 	split_map.m_lblk = map->m_lblk;
3536 	split_map.m_len = map->m_len;
3537 
3538 	if (max_zeroout && (allocated > split_map.m_len)) {
3539 		if (allocated <= max_zeroout) {
3540 			/* case 3 or 5 */
3541 			zero_ex1.ee_block =
3542 				 cpu_to_le32(split_map.m_lblk +
3543 					     split_map.m_len);
3544 			zero_ex1.ee_len =
3545 				cpu_to_le16(allocated - split_map.m_len);
3546 			ext4_ext_store_pblock(&zero_ex1,
3547 				ext4_ext_pblock(ex) + split_map.m_lblk +
3548 				split_map.m_len - ee_block);
3549 			err = ext4_ext_zeroout(inode, &zero_ex1);
3550 			if (err)
3551 				goto out;
3552 			split_map.m_len = allocated;
3553 		}
3554 		if (split_map.m_lblk - ee_block + split_map.m_len <
3555 								max_zeroout) {
3556 			/* case 2 or 5 */
3557 			if (split_map.m_lblk != ee_block) {
3558 				zero_ex2.ee_block = ex->ee_block;
3559 				zero_ex2.ee_len = cpu_to_le16(split_map.m_lblk -
3560 							ee_block);
3561 				ext4_ext_store_pblock(&zero_ex2,
3562 						      ext4_ext_pblock(ex));
3563 				err = ext4_ext_zeroout(inode, &zero_ex2);
3564 				if (err)
3565 					goto out;
3566 			}
3567 
3568 			split_map.m_len += split_map.m_lblk - ee_block;
3569 			split_map.m_lblk = ee_block;
3570 			allocated = map->m_len;
3571 		}
3572 	}
3573 
3574 	err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag,
3575 				flags);
3576 	if (err > 0)
3577 		err = 0;
3578 out:
3579 	/* If we have gotten a failure, don't record the zeroouts in the extent status tree */
3580 	if (!err) {
3581 		err = ext4_zeroout_es(inode, &zero_ex1);
3582 		if (!err)
3583 			err = ext4_zeroout_es(inode, &zero_ex2);
3584 	}
3585 	return err ? err : allocated;
3586 }
3587 
3588 /*
3589  * This function is called by ext4_ext_map_blocks() from
3590  * ext4_get_blocks_dio_write() when DIO is used to write
3591  * to an unwritten extent.
3592  *
3593  * Writing to an unwritten extent may result in splitting the unwritten
3594  * extent into multiple initialized/unwritten extents (up to three).
3595  * There are three possibilities:
3596  *   a> No split is required: the entire extent remains unwritten
3597  *   b> Split into two extents: the write happens at either end of the extent
3598  *   c> Split into three extents: someone is writing in the middle of the extent
3599  *
3600  * This works the same way in the case of initialized -> unwritten conversion.
3601  *
3602  * One or more index blocks may be needed if the extent tree grows after
3603  * the unwritten extent split. To prevent ENOSPC from occurring at I/O
3604  * completion, we need to split the unwritten extent before the DIO is
3605  * submitted. The unwritten extent will be split into at most three
3606  * unwritten extents. After the I/O completes, the part that was written
3607  * will be converted to initialized by the end_io callback
3608  * via ext4_convert_unwritten_extents().
3609  *
3610  * Returns the size of the unwritten extent to be written on success.
3611  */
3612 static int ext4_split_convert_extents(handle_t *handle,
3613 					struct inode *inode,
3614 					struct ext4_map_blocks *map,
3615 					struct ext4_ext_path **ppath,
3616 					int flags)
3617 {
3618 	struct ext4_ext_path *path = *ppath;
3619 	ext4_lblk_t eof_block;
3620 	ext4_lblk_t ee_block;
3621 	struct ext4_extent *ex;
3622 	unsigned int ee_len;
3623 	int split_flag = 0, depth;
3624 
3625 	ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
3626 		  __func__, inode->i_ino,
3627 		  (unsigned long long)map->m_lblk, map->m_len);
3628 
3629 	eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1)
3630 			>> inode->i_sb->s_blocksize_bits;
3631 	if (eof_block < map->m_lblk + map->m_len)
3632 		eof_block = map->m_lblk + map->m_len;
3633 	/*
3634 	 * It is safe to convert extent to initialized via explicit
3635 	 * zeroout only if the extent is fully inside i_size or new_size.
3636 	 */
3637 	depth = ext_depth(inode);
3638 	ex = path[depth].p_ext;
3639 	ee_block = le32_to_cpu(ex->ee_block);
3640 	ee_len = ext4_ext_get_actual_len(ex);
3641 
3642 	/* Convert to unwritten */
3643 	if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
3644 		split_flag |= EXT4_EXT_DATA_VALID1;
3645 	/* Convert to initialized */
3646 	} else if (flags & EXT4_GET_BLOCKS_CONVERT) {
3647 		split_flag |= ee_block + ee_len <= eof_block ?
3648 			      EXT4_EXT_MAY_ZEROOUT : 0;
3649 		split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2);
3650 	}
3651 	flags |= EXT4_GET_BLOCKS_PRE_IO;
3652 	return ext4_split_extent(handle, inode, ppath, map, split_flag, flags);
3653 }
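/*
 * Added annotation, summarizing the flag handling above:
 *
 *   EXT4_GET_BLOCKS_CONVERT_UNWRITTEN (initialized -> unwritten):
 *       split_flag gets EXT4_EXT_DATA_VALID1, i.e. the first half of the
 *       split keeps valid data.
 *
 *   EXT4_GET_BLOCKS_CONVERT (unwritten -> initialized at end_io time):
 *       split_flag gets EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2, plus
 *       EXT4_EXT_MAY_ZEROOUT when the extent lies fully below eof_block.
 *
 * Both paths then add EXT4_GET_BLOCKS_PRE_IO and delegate the actual work
 * to ext4_split_extent().
 */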
3654 
3655 static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3656 						struct inode *inode,
3657 						struct ext4_map_blocks *map,
3658 						struct ext4_ext_path **ppath)
3659 {
3660 	struct ext4_ext_path *path = *ppath;
3661 	struct ext4_extent *ex;
3662 	ext4_lblk_t ee_block;
3663 	unsigned int ee_len;
3664 	int depth;
3665 	int err = 0;
3666 
3667 	depth = ext_depth(inode);
3668 	ex = path[depth].p_ext;
3669 	ee_block = le32_to_cpu(ex->ee_block);
3670 	ee_len = ext4_ext_get_actual_len(ex);
3671 
3672 	ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical "
3673 		"block %llu, max_blocks %u\n", inode->i_ino,
3674 		  (unsigned long long)ee_block, ee_len);
3675 
3676 	/* If the extent is larger than requested, it is a clear sign that we
3677 	 * still have some extent state machine issues left. So an extent split
3678 	 * is still required.
3679 	 * TODO: once all related issues are fixed, this situation should be
3680 	 * treated as illegal.
3681 	 */
3682 	if (ee_block != map->m_lblk || ee_len > map->m_len) {
3683 #ifdef CONFIG_EXT4_DEBUG
3684 		ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu,"
3685 			     " len %u; IO logical block %llu, len %u",
3686 			     inode->i_ino, (unsigned long long)ee_block, ee_len,
3687 			     (unsigned long long)map->m_lblk, map->m_len);
3688 #endif
3689 		err = ext4_split_convert_extents(handle, inode, map, ppath,
3690 						 EXT4_GET_BLOCKS_CONVERT);
3691 		if (err < 0)
3692 			return err;
3693 		path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
3694 		if (IS_ERR(path))
3695 			return PTR_ERR(path);
3696 		depth = ext_depth(inode);
3697 		ex = path[depth].p_ext;
3698 	}
3699 
3700 	err = ext4_ext_get_access(handle, inode, path + depth);
3701 	if (err)
3702 		goto out;
3703 	/* first mark the extent as initialized */
3704 	ext4_ext_mark_initialized(ex);
3705 
3706 	/* note: ext4_ext_correct_indexes() isn't needed here because
3707 	 * borders are not changed
3708 	 */
3709 	ext4_ext_try_to_merge(handle, inode, path, ex);
3710 
3711 	/* Mark modified extent as dirty */
3712 	err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3713 out:
3714 	ext4_ext_show_leaf(inode, path);
3715 	return err;
3716 }
3717 
3718 static int
3719 convert_initialized_extent(handle_t *handle, struct inode *inode,
3720 			   struct ext4_map_blocks *map,
3721 			   struct ext4_ext_path **ppath,
3722 			   unsigned int *allocated)
3723 {
3724 	struct ext4_ext_path *path = *ppath;
3725 	struct ext4_extent *ex;
3726 	ext4_lblk_t ee_block;
3727 	unsigned int ee_len;
3728 	int depth;
3729 	int err = 0;
3730 
3731 	/*
3732 	 * Make sure that the extent is no bigger than we support with
3733 	 * unwritten extents
3734 	 */
3735 	if (map->m_len > EXT_UNWRITTEN_MAX_LEN)
3736 		map->m_len = EXT_UNWRITTEN_MAX_LEN / 2;
3737 
3738 	depth = ext_depth(inode);
3739 	ex = path[depth].p_ext;
3740 	ee_block = le32_to_cpu(ex->ee_block);
3741 	ee_len = ext4_ext_get_actual_len(ex);
3742 
3743 	ext_debug("%s: inode %lu, logical "
3744 		"block %llu, max_blocks %u\n", __func__, inode->i_ino,
3745 		  (unsigned long long)ee_block, ee_len);
3746 
3747 	if (ee_block != map->m_lblk || ee_len > map->m_len) {
3748 		err = ext4_split_convert_extents(handle, inode, map, ppath,
3749 				EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
3750 		if (err < 0)
3751 			return err;
3752 		path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
3753 		if (IS_ERR(path))
3754 			return PTR_ERR(path);
3755 		depth = ext_depth(inode);
3756 		ex = path[depth].p_ext;
3757 		if (!ex) {
3758 			EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
3759 					 (unsigned long) map->m_lblk);
3760 			return -EFSCORRUPTED;
3761 		}
3762 	}
3763 
3764 	err = ext4_ext_get_access(handle, inode, path + depth);
3765 	if (err)
3766 		return err;
3767 	/* first mark the extent as unwritten */
3768 	ext4_ext_mark_unwritten(ex);
3769 
3770 	/* note: ext4_ext_correct_indexes() isn't needed here because
3771 	 * borders are not changed
3772 	 */
3773 	ext4_ext_try_to_merge(handle, inode, path, ex);
3774 
3775 	/* Mark modified extent as dirty */
3776 	err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3777 	if (err)
3778 		return err;
3779 	ext4_ext_show_leaf(inode, path);
3780 
3781 	ext4_update_inode_fsync_trans(handle, inode, 1);
3782 
3783 	map->m_flags |= EXT4_MAP_UNWRITTEN;
3784 	if (*allocated > map->m_len)
3785 		*allocated = map->m_len;
3786 	map->m_len = *allocated;
3787 	return 0;
3788 }
3789 
3790 static int
3791 ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
3792 			struct ext4_map_blocks *map,
3793 			struct ext4_ext_path **ppath, int flags,
3794 			unsigned int allocated, ext4_fsblk_t newblock)
3795 {
3796 #ifdef EXT_DEBUG
3797 	struct ext4_ext_path *path = *ppath;
3798 #endif
3799 	int ret = 0;
3800 	int err = 0;
3801 
3802 	ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical "
3803 		  "block %llu, max_blocks %u, flags %x, allocated %u\n",
3804 		  inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
3805 		  flags, allocated);
3806 	ext4_ext_show_leaf(inode, path);
3807 
3808 	/*
3809 	 * When writing into unwritten space, we should not fail to
3810 	 * allocate metadata blocks for the new extent block if needed.
3811 	 */
3812 	flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;
3813 
3814 	trace_ext4_ext_handle_unwritten_extents(inode, map, flags,
3815 						    allocated, newblock);
3816 
3817 	/* get_block() called before submitting the IO: split the extent */
3818 	if (flags & EXT4_GET_BLOCKS_PRE_IO) {
3819 		ret = ext4_split_convert_extents(handle, inode, map, ppath,
3820 					 flags | EXT4_GET_BLOCKS_CONVERT);
3821 		if (ret <= 0)
3822 			goto out;
3823 		map->m_flags |= EXT4_MAP_UNWRITTEN;
3824 		goto out;
3825 	}
3826 	/* IO end_io complete, convert the filled extent to written */
3827 	if (flags & EXT4_GET_BLOCKS_CONVERT) {
3828 		if (flags & EXT4_GET_BLOCKS_ZERO) {
3829 			if (allocated > map->m_len)
3830 				allocated = map->m_len;
3831 			err = ext4_issue_zeroout(inode, map->m_lblk, newblock,
3832 						 allocated);
3833 			if (err < 0)
3834 				goto out2;
3835 		}
3836 		ret = ext4_convert_unwritten_extents_endio(handle, inode, map,
3837 							   ppath);
3838 		if (ret >= 0)
3839 			ext4_update_inode_fsync_trans(handle, inode, 1);
3840 		else
3841 			err = ret;
3842 		map->m_flags |= EXT4_MAP_MAPPED;
3843 		map->m_pblk = newblock;
3844 		if (allocated > map->m_len)
3845 			allocated = map->m_len;
3846 		map->m_len = allocated;
3847 		goto out2;
3848 	}
3849 	/* buffered IO case */
3850 	/*
3851 	 * repeat fallocate creation request
3852 	 * we already have an unwritten extent
3853 	 */
3854 	if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
3855 		map->m_flags |= EXT4_MAP_UNWRITTEN;
3856 		goto map_out;
3857 	}
3858 
3859 	/* buffered READ or buffered write_begin() lookup */
3860 	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3861 		/*
3862 		 * We have blocks reserved already.  We
3863 		 * return allocated blocks so that delalloc
3864 		 * won't do block reservation for us.  But
3865 		 * the buffer head will be unmapped so that
3866 		 * a read from the block returns 0s.
3867 		 */
3868 		map->m_flags |= EXT4_MAP_UNWRITTEN;
3869 		goto out1;
3870 	}
3871 
3872 	/* buffered write, writepage time, convert */
3873 	ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags);
3874 	if (ret >= 0)
3875 		ext4_update_inode_fsync_trans(handle, inode, 1);
3876 out:
3877 	if (ret <= 0) {
3878 		err = ret;
3879 		goto out2;
3880 	} else
3881 		allocated = ret;
3882 	map->m_flags |= EXT4_MAP_NEW;
3883 	if (allocated > map->m_len)
3884 		allocated = map->m_len;
3885 	map->m_len = allocated;
3886 
3887 map_out:
3888 	map->m_flags |= EXT4_MAP_MAPPED;
3889 out1:
3890 	if (allocated > map->m_len)
3891 		allocated = map->m_len;
3892 	ext4_ext_show_leaf(inode, path);
3893 	map->m_pblk = newblock;
3894 	map->m_len = allocated;
3895 out2:
3896 	return err ? err : allocated;
3897 }
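/*
 * Added annotation, summarizing the dispatch above:
 *
 *   EXT4_GET_BLOCKS_PRE_IO     - split the extent around the write range
 *                                before the I/O is submitted; it stays
 *                                unwritten (EXT4_MAP_UNWRITTEN).
 *   EXT4_GET_BLOCKS_CONVERT    - I/O has completed; optionally zero out
 *                                (EXT4_GET_BLOCKS_ZERO) and then mark the
 *                                extent initialized via the _endio() helper.
 *   EXT4_GET_BLOCKS_UNWRIT_EXT - repeated fallocate request; the extent is
 *                                already unwritten, report it as mapped but
 *                                unwritten.
 *   no EXT4_GET_BLOCKS_CREATE  - lookup only; report the range as unwritten
 *                                and leave the buffer unmapped so reads see
 *                                zeroes.
 *   otherwise                  - buffered write at writeback time; convert
 *                                via ext4_ext_convert_to_initialized().
 */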
3898 
3899 /*
3900  * get_implied_cluster_alloc - check to see if the requested
3901  * allocation (in the map structure) overlaps with a cluster already
3902  * allocated in an extent.
3903  *	@sb	The filesystem superblock structure
3904  *	@map	The requested lblk->pblk mapping
3905  *	@ex	The extent structure which might contain an implied
3906  *			cluster allocation
3907  *
3908  * This function is called by ext4_ext_map_blocks() after we failed to
3909  * find blocks that were already in the inode's extent tree.  Hence,
3910  * we know that the beginning of the requested region cannot overlap
3911  * the extent from the inode's extent tree.  There are three cases we
3912  * want to catch.  The first is this case:
3913  *
3914  *		 |--- cluster # N--|
3915  *    |--- extent ---|	|---- requested region ---|
3916  *			|==========|
3917  *
3918  * The second case that we need to test for is this one:
3919  *
3920  *   |--------- cluster # N ----------------|
3921  *	   |--- requested region --|   |------- extent ----|
3922  *	   |=======================|
3923  *
3924  * The third case is when the requested region lies between two extents
3925  * within the same cluster:
3926  *          |------------- cluster # N-------------|
3927  * |----- ex -----|                  |---- ex_right ----|
3928  *                  |------ requested region ------|
3929  *                  |================|
3930  *
3931  * In each of the above cases, we need to set map->m_pblk and
3932  * map->m_len so that they correspond to the extent labelled as
3933  * "|====|" from cluster #N, since it is already in use for data in
3934  * cluster EXT4_B2C(sbi, map->m_lblk).	We will then return 1 to
3935  * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
3936  * as a new "allocated" block region.  Otherwise, we will return 0 and
3937  * ext4_ext_map_blocks() will then allocate one or more new clusters
3938  * by calling ext4_mb_new_blocks().
3939  */
3940 static int get_implied_cluster_alloc(struct super_block *sb,
3941 				     struct ext4_map_blocks *map,
3942 				     struct ext4_extent *ex,
3943 				     struct ext4_ext_path *path)
3944 {
3945 	struct ext4_sb_info *sbi = EXT4_SB(sb);
3946 	ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
3947 	ext4_lblk_t ex_cluster_start, ex_cluster_end;
3948 	ext4_lblk_t rr_cluster_start;
3949 	ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
3950 	ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
3951 	unsigned short ee_len = ext4_ext_get_actual_len(ex);
3952 
3953 	/* The extent passed in that we are trying to match */
3954 	ex_cluster_start = EXT4_B2C(sbi, ee_block);
3955 	ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1);
3956 
3957 	/* The requested region passed into ext4_map_blocks() */
3958 	rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
3959 
3960 	if ((rr_cluster_start == ex_cluster_end) ||
3961 	    (rr_cluster_start == ex_cluster_start)) {
3962 		if (rr_cluster_start == ex_cluster_end)
3963 			ee_start += ee_len - 1;
3964 		map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset;
3965 		map->m_len = min(map->m_len,
3966 				 (unsigned) sbi->s_cluster_ratio - c_offset);
3967 		/*
3968 		 * Check for and handle this case:
3969 		 *
3970 		 *   |--------- cluster # N-------------|
3971 		 *		       |------- extent ----|
3972 		 *	   |--- requested region ---|
3973 		 *	   |===========|
3974 		 */
3975 
3976 		if (map->m_lblk < ee_block)
3977 			map->m_len = min(map->m_len, ee_block - map->m_lblk);
3978 
3979 		/*
3980 		 * Check for the case where there is already another allocated
3981 		 * block to the right of 'ex' but before the end of the cluster.
3982 		 *
3983 		 *          |------------- cluster # N-------------|
3984 		 * |----- ex -----|                  |---- ex_right ----|
3985 		 *                  |------ requested region ------|
3986 		 *                  |================|
3987 		 */
3988 		if (map->m_lblk > ee_block) {
3989 			ext4_lblk_t next = ext4_ext_next_allocated_block(path);
3990 			map->m_len = min(map->m_len, next - map->m_lblk);
3991 		}
3992 
3993 		trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
3994 		return 1;
3995 	}
3996 
3997 	trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
3998 	return 0;
3999 }
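/*
 * Illustrative example (added annotation, values hypothetical): bigalloc
 * with s_cluster_ratio == 16, an extent with ee_block == 96, ee_len == 4,
 * ee_start == 8192, and a request for map->m_lblk == 100, m_len == 8:
 *
 *   ex_cluster_start = 96 >> 4  = 6
 *   ex_cluster_end   = 99 >> 4  = 6
 *   rr_cluster_start = 100 >> 4 = 6   -> matches ex_cluster_end
 *   c_offset         = 100 & 15 = 4
 *
 * ee_start is bumped to 8195 (last block of the extent), so
 * map->m_pblk = (8195 & ~15) + 4 = 8196 and map->m_len is capped at
 * min(8, 16 - 4) = 8.  Since m_lblk > ee_block, m_len is further capped by
 * the next allocated block, and the function returns 1.
 */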
4000 
4001 
4002 /*
4003  * Block allocation/map/preallocation routine for extents based files
4004  *
4005  *
4006  * Need to be called with
4007  * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
4008  * (i.e., create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
4009  *
4010  * return > 0, number of blocks already mapped/allocated
4011  *          if create == 0 and these are pre-allocated blocks
4012  *          	buffer head is unmapped
4013  *          otherwise blocks are mapped
4014  *
4015  * return = 0, if plain look up failed (blocks have not been allocated)
4016  *          buffer head is unmapped
4017  *
4018  * return < 0, error case.
4019  */
4020 int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4021 			struct ext4_map_blocks *map, int flags)
4022 {
4023 	struct ext4_ext_path *path = NULL;
4024 	struct ext4_extent newex, *ex, *ex2;
4025 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4026 	ext4_fsblk_t newblock = 0;
4027 	int err = 0, depth, ret;
4028 	unsigned int allocated = 0, offset = 0;
4029 	unsigned int allocated_clusters = 0;
4030 	struct ext4_allocation_request ar;
4031 	ext4_lblk_t cluster_offset;
4032 
4033 	ext_debug("blocks %u/%u requested for inode %lu\n",
4034 		  map->m_lblk, map->m_len, inode->i_ino);
4035 	trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
4036 
4037 	/* find extent for this block */
4038 	path = ext4_find_extent(inode, map->m_lblk, NULL, 0);
4039 	if (IS_ERR(path)) {
4040 		err = PTR_ERR(path);
4041 		path = NULL;
4042 		goto out2;
4043 	}
4044 
4045 	depth = ext_depth(inode);
4046 
4047 	/*
4048 	 * a consistent leaf must not be empty;
4049 	 * this situation is possible, though, _during_ tree modification;
4050 	 * this is why the assert can't be put in ext4_find_extent()
4051 	 */
4052 	if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
4053 		EXT4_ERROR_INODE(inode, "bad extent address "
4054 				 "lblock: %lu, depth: %d pblock %lld",
4055 				 (unsigned long) map->m_lblk, depth,
4056 				 path[depth].p_block);
4057 		err = -EFSCORRUPTED;
4058 		goto out2;
4059 	}
4060 
4061 	ex = path[depth].p_ext;
4062 	if (ex) {
4063 		ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
4064 		ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
4065 		unsigned short ee_len;
4066 
4067 
4068 		/*
4069 		 * unwritten extents are treated as holes, except that
4070 		 * we split out initialized portions during a write.
4071 		 */
4072 		ee_len = ext4_ext_get_actual_len(ex);
4073 
4074 		trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);
4075 
4076 		/* if found extent covers block, simply return it */
4077 		if (in_range(map->m_lblk, ee_block, ee_len)) {
4078 			newblock = map->m_lblk - ee_block + ee_start;
4079 			/* number of remaining blocks in the extent */
4080 			allocated = ee_len - (map->m_lblk - ee_block);
4081 			ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
4082 				  ee_block, ee_len, newblock);
4083 
4084 			/*
4085 			 * If the extent is initialized check whether the
4086 			 * caller wants to convert it to unwritten.
4087 			 */
4088 			if ((!ext4_ext_is_unwritten(ex)) &&
4089 			    (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
4090 				err = convert_initialized_extent(handle,
4091 					inode, map, &path, &allocated);
4092 				goto out2;
4093 			} else if (!ext4_ext_is_unwritten(ex)) {
4094 				goto out;
4095 			}
4096 
4097 			ret = ext4_ext_handle_unwritten_extents(
4098 				handle, inode, map, &path, flags,
4099 				allocated, newblock);
4100 			if (ret < 0)
4101 				err = ret;
4102 			else
4103 				allocated = ret;
4104 			goto out2;
4105 		}
4106 	}
4107 
4108 	/*
4109 	 * requested block isn't allocated yet;
4110 	 * we can't allocate blocks if the create flag is zero
4111 	 */
4112 	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
4113 		ext4_lblk_t hole_start, hole_len;
4114 
4115 		hole_start = map->m_lblk;
4116 		hole_len = ext4_ext_determine_hole(inode, path, &hole_start);
4117 		/*
4118 		 * put the just-found gap into the cache to speed up
4119 		 * subsequent requests
4120 		 */
4121 		ext4_ext_put_gap_in_cache(inode, hole_start, hole_len);
4122 
4123 		/* Update hole_len to reflect hole size after map->m_lblk */
4124 		if (hole_start != map->m_lblk)
4125 			hole_len -= map->m_lblk - hole_start;
4126 		map->m_pblk = 0;
4127 		map->m_len = min_t(unsigned int, map->m_len, hole_len);
4128 
4129 		goto out2;
4130 	}
4131 
4132 	/*
4133 	 * Okay, we need to do block allocation.
4134 	 */
4135 	newex.ee_block = cpu_to_le32(map->m_lblk);
4136 	cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4137 
4138 	/*
4139 	 * If we are doing bigalloc, check to see if the extent returned
4140 	 * by ext4_find_extent() implies a cluster we can use.
4141 	 */
4142 	if (cluster_offset && ex &&
4143 	    get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
4144 		ar.len = allocated = map->m_len;
4145 		newblock = map->m_pblk;
4146 		goto got_allocated_blocks;
4147 	}
4148 
4149 	/* find neighbour allocated blocks */
4150 	ar.lleft = map->m_lblk;
4151 	err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
4152 	if (err)
4153 		goto out2;
4154 	ar.lright = map->m_lblk;
4155 	ex2 = NULL;
4156 	err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
4157 	if (err)
4158 		goto out2;
4159 
4160 	/* Check if the extent after searching to the right implies a
4161 	 * cluster we can use. */
4162 	if ((sbi->s_cluster_ratio > 1) && ex2 &&
4163 	    get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
4164 		ar.len = allocated = map->m_len;
4165 		newblock = map->m_pblk;
4166 		goto got_allocated_blocks;
4167 	}
4168 
4169 	/*
4170 	 * See if request is beyond maximum number of blocks we can have in
4171 	 * a single extent. For an initialized extent this limit is
4172 	 * EXT_INIT_MAX_LEN and for an unwritten extent this limit is
4173 	 * EXT_UNWRITTEN_MAX_LEN.
4174 	 */
4175 	if (map->m_len > EXT_INIT_MAX_LEN &&
4176 	    !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
4177 		map->m_len = EXT_INIT_MAX_LEN;
4178 	else if (map->m_len > EXT_UNWRITTEN_MAX_LEN &&
4179 		 (flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
4180 		map->m_len = EXT_UNWRITTEN_MAX_LEN;
4181 
4182 	/* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
4183 	newex.ee_len = cpu_to_le16(map->m_len);
4184 	err = ext4_ext_check_overlap(sbi, inode, &newex, path);
4185 	if (err)
4186 		allocated = ext4_ext_get_actual_len(&newex);
4187 	else
4188 		allocated = map->m_len;
4189 
4190 	/* allocate new block */
4191 	ar.inode = inode;
4192 	ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
4193 	ar.logical = map->m_lblk;
4194 	/*
4195 	 * We calculate the offset from the beginning of the cluster
4196 	 * for the logical block number, since when we allocate a
4197 	 * physical cluster, the physical block should start at the
4198 	 * same offset from the beginning of the cluster.  This is
4199 	 * needed so that future calls to get_implied_cluster_alloc()
4200 	 * work correctly.
4201 	 */
4202 	offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4203 	ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
4204 	ar.goal -= offset;
4205 	ar.logical -= offset;
4206 	if (S_ISREG(inode->i_mode))
4207 		ar.flags = EXT4_MB_HINT_DATA;
4208 	else
4209 		/* disable in-core preallocation for non-regular files */
4210 		ar.flags = 0;
4211 	if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
4212 		ar.flags |= EXT4_MB_HINT_NOPREALLOC;
4213 	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
4214 		ar.flags |= EXT4_MB_DELALLOC_RESERVED;
4215 	if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
4216 		ar.flags |= EXT4_MB_USE_RESERVED;
4217 	newblock = ext4_mb_new_blocks(handle, &ar, &err);
4218 	if (!newblock)
4219 		goto out2;
4220 	ext_debug("allocate new block: goal %llu, found %llu/%u\n",
4221 		  ar.goal, newblock, allocated);
4222 	allocated_clusters = ar.len;
4223 	ar.len = EXT4_C2B(sbi, ar.len) - offset;
4224 	if (ar.len > allocated)
4225 		ar.len = allocated;
4226 
4227 got_allocated_blocks:
4228 	/* try to insert new extent into found leaf and return */
4229 	ext4_ext_store_pblock(&newex, newblock + offset);
4230 	newex.ee_len = cpu_to_le16(ar.len);
4231 	/* Mark unwritten */
4232 	if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
4233 		ext4_ext_mark_unwritten(&newex);
4234 		map->m_flags |= EXT4_MAP_UNWRITTEN;
4235 	}
4236 
4237 	err = ext4_ext_insert_extent(handle, inode, &path, &newex, flags);
4238 	if (err) {
4239 		if (allocated_clusters) {
4240 			int fb_flags = 0;
4241 
4242 			/*
4243 			 * free data blocks we just allocated.
4244 			 * not a good idea to call discard here directly,
4245 			 * but otherwise we'd need to call it on every free().
4246 			 */
4247 			ext4_discard_preallocations(inode);
4248 			if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
4249 				fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE;
4250 			ext4_free_blocks(handle, inode, NULL, newblock,
4251 					 EXT4_C2B(sbi, allocated_clusters),
4252 					 fb_flags);
4253 		}
4254 		goto out2;
4255 	}
4256 
4257 	/* previous routine could use block we allocated */
4258 	newblock = ext4_ext_pblock(&newex);
4259 	allocated = ext4_ext_get_actual_len(&newex);
4260 	if (allocated > map->m_len)
4261 		allocated = map->m_len;
4262 	map->m_flags |= EXT4_MAP_NEW;
4263 
4264 	/*
4265 	 * Reduce the reserved cluster count to reflect successful deferred
4266 	 * allocation of delayed allocated clusters or direct allocation of
4267 	 * clusters discovered to be delayed allocated.  Once allocated, a
4268 	 * cluster is not included in the reserved count.
4269 	 */
4270 	if (test_opt(inode->i_sb, DELALLOC) && allocated_clusters) {
4271 		if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
4272 			/*
4273 			 * When allocating delayed allocated clusters, simply
4274 			 * reduce the reserved cluster count and claim quota
4275 			 */
4276 			ext4_da_update_reserve_space(inode, allocated_clusters,
4277 							1);
4278 		} else {
4279 			ext4_lblk_t lblk, len;
4280 			unsigned int n;
4281 
4282 			/*
4283 			 * When allocating non-delayed allocated clusters
4284 			 * (from fallocate, filemap, DIO, or clusters
4285 			 * allocated when delalloc has been disabled by
4286 			 * ext4_nonda_switch), reduce the reserved cluster
4287 			 * count by the number of allocated clusters that
4288 			 * have previously been delayed allocated.  Quota
4289 			 * has been claimed by ext4_mb_new_blocks() above,
4290 			 * so release the quota reservations made for any
4291 			 * previously delayed allocated clusters.
4292 			 */
4293 			lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk);
4294 			len = allocated_clusters << sbi->s_cluster_bits;
4295 			n = ext4_es_delayed_clu(inode, lblk, len);
4296 			if (n > 0)
4297 				ext4_da_update_reserve_space(inode, (int) n, 0);
4298 		}
4299 	}
4300 
4301 	/*
4302 	 * Cache the extent and update transaction to commit on fdatasync only
4303 	 * when it is _not_ an unwritten extent.
4304 	 */
4305 	if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0)
4306 		ext4_update_inode_fsync_trans(handle, inode, 1);
4307 	else
4308 		ext4_update_inode_fsync_trans(handle, inode, 0);
4309 out:
4310 	if (allocated > map->m_len)
4311 		allocated = map->m_len;
4312 	ext4_ext_show_leaf(inode, path);
4313 	map->m_flags |= EXT4_MAP_MAPPED;
4314 	map->m_pblk = newblock;
4315 	map->m_len = allocated;
4316 out2:
4317 	ext4_ext_drop_refs(path);
4318 	kfree(path);
4319 
4320 	trace_ext4_ext_map_blocks_exit(inode, flags, map,
4321 				       err ? err : allocated);
4322 	return err ? err : allocated;
4323 }
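/*
 * Minimal caller-side sketch (added annotation, not part of this file),
 * showing how the return contract above is typically consumed through the
 * ext4_map_blocks() wrapper:
 *
 *	struct ext4_map_blocks map = { .m_lblk = lblk, .m_len = want };
 *	int ret = ext4_map_blocks(handle, inode, &map,
 *				  EXT4_GET_BLOCKS_CREATE);
 *	if (ret > 0)
 *		... map.m_len blocks are mapped at map.m_pblk ...
 *	else if (ret == 0)
 *		... hole: nothing mapped, nothing allocated ...
 *	else
 *		... ret is a negative errno ...
 */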
4324 
4325 int ext4_ext_truncate(handle_t *handle, struct inode *inode)
4326 {
4327 	struct super_block *sb = inode->i_sb;
4328 	ext4_lblk_t last_block;
4329 	int err = 0;
4330 
4331 	/*
4332 	 * TODO: optimization is possible here.
4333 	 * Probably we need not scan at all,
4334 	 * because page truncation is enough.
4335 	 */
4336 
4337 	/* we have to know where to truncate from in crash case */
4338 	EXT4_I(inode)->i_disksize = inode->i_size;
4339 	err = ext4_mark_inode_dirty(handle, inode);
4340 	if (err)
4341 		return err;
4342 
4343 	last_block = (inode->i_size + sb->s_blocksize - 1)
4344 			>> EXT4_BLOCK_SIZE_BITS(sb);
4345 retry:
4346 	err = ext4_es_remove_extent(inode, last_block,
4347 				    EXT_MAX_BLOCKS - last_block);
4348 	if (err == -ENOMEM) {
4349 		cond_resched();
4350 		congestion_wait(BLK_RW_ASYNC, HZ/50);
4351 		goto retry;
4352 	}
4353 	if (err)
4354 		return err;
4355 	return ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
4356 }
4357 
4358 static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
4359 				  ext4_lblk_t len, loff_t new_size,
4360 				  int flags)
4361 {
4362 	struct inode *inode = file_inode(file);
4363 	handle_t *handle;
4364 	int ret = 0;
4365 	int ret2 = 0, ret3 = 0;
4366 	int retries = 0;
4367 	int depth = 0;
4368 	struct ext4_map_blocks map;
4369 	unsigned int credits;
4370 	loff_t epos;
4371 
4372 	BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS));
4373 	map.m_lblk = offset;
4374 	map.m_len = len;
4375 	/*
4376 	 * Don't normalize the request if it can fit in one extent so
4377 	 * that it doesn't get unnecessarily split into multiple
4378 	 * extents.
4379 	 */
4380 	if (len <= EXT_UNWRITTEN_MAX_LEN)
4381 		flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
4382 
4383 	/*
4384 	 * credits to insert 1 extent into extent tree
4385 	 */
4386 	credits = ext4_chunk_trans_blocks(inode, len);
4387 	depth = ext_depth(inode);
4388 
4389 retry:
4390 	while (ret >= 0 && len) {
4391 		/*
4392 		 * Recalculate credits when extent tree depth changes.
4393 		 */
4394 		if (depth != ext_depth(inode)) {
4395 			credits = ext4_chunk_trans_blocks(inode, len);
4396 			depth = ext_depth(inode);
4397 		}
4398 
4399 		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
4400 					    credits);
4401 		if (IS_ERR(handle)) {
4402 			ret = PTR_ERR(handle);
4403 			break;
4404 		}
4405 		ret = ext4_map_blocks(handle, inode, &map, flags);
4406 		if (ret <= 0) {
4407 			ext4_debug("inode #%lu: block %u: len %u: "
4408 				   "ext4_ext_map_blocks returned %d",
4409 				   inode->i_ino, map.m_lblk,
4410 				   map.m_len, ret);
4411 			ext4_mark_inode_dirty(handle, inode);
4412 			ret2 = ext4_journal_stop(handle);
4413 			break;
4414 		}
4415 		map.m_lblk += ret;
4416 		map.m_len = len = len - ret;
4417 		epos = (loff_t)map.m_lblk << inode->i_blkbits;
4418 		inode->i_ctime = current_time(inode);
4419 		if (new_size) {
4420 			if (epos > new_size)
4421 				epos = new_size;
4422 			if (ext4_update_inode_size(inode, epos) & 0x1)
4423 				inode->i_mtime = inode->i_ctime;
4424 		}
4425 		ret2 = ext4_mark_inode_dirty(handle, inode);
4426 		ext4_update_inode_fsync_trans(handle, inode, 1);
4427 		ret3 = ext4_journal_stop(handle);
4428 		ret2 = ret3 ? ret3 : ret2;
4429 		if (unlikely(ret2))
4430 			break;
4431 	}
4432 	if (ret == -ENOSPC &&
4433 			ext4_should_retry_alloc(inode->i_sb, &retries)) {
4434 		ret = 0;
4435 		goto retry;
4436 	}
4437 
4438 	return ret > 0 ? ret2 : ret;
4439 }
4440 
4441 static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
4442 
4443 static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len);
4444 
4445 static long ext4_zero_range(struct file *file, loff_t offset,
4446 			    loff_t len, int mode)
4447 {
4448 	struct inode *inode = file_inode(file);
4449 	handle_t *handle = NULL;
4450 	unsigned int max_blocks;
4451 	loff_t new_size = 0;
4452 	int ret = 0;
4453 	int flags;
4454 	int credits;
4455 	int partial_begin, partial_end;
4456 	loff_t start, end;
4457 	ext4_lblk_t lblk;
4458 	unsigned int blkbits = inode->i_blkbits;
4459 
4460 	trace_ext4_zero_range(inode, offset, len, mode);
4461 
4462 	/* Call ext4_force_commit to flush all data in case of data=journal. */
4463 	if (ext4_should_journal_data(inode)) {
4464 		ret = ext4_force_commit(inode->i_sb);
4465 		if (ret)
4466 			return ret;
4467 	}
4468 
4469 	/*
4470 	 * Round up offset. This is not fallocate; we need to zero out
4471 	 * blocks, so convert interior block aligned part of the range to
4472 	 * unwritten and possibly manually zero out unaligned parts of the
4473 	 * range.
4474 	 */
4475 	start = round_up(offset, 1 << blkbits);
4476 	end = round_down((offset + len), 1 << blkbits);
4477 
4478 	if (start < offset || end > offset + len)
4479 		return -EINVAL;
4480 	partial_begin = offset & ((1 << blkbits) - 1);
4481 	partial_end = (offset + len) & ((1 << blkbits) - 1);
4482 
4483 	lblk = start >> blkbits;
4484 	max_blocks = (end >> blkbits);
4485 	if (max_blocks < lblk)
4486 		max_blocks = 0;
4487 	else
4488 		max_blocks -= lblk;
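	/*
	 * Illustrative arithmetic (added annotation, values hypothetical):
	 * with 4 KiB blocks, offset == 5000 and len == 20000:
	 *   start = round_up(5000, 4096)    = 8192
	 *   end   = round_down(25000, 4096) = 24576
	 *   partial_begin = 5000  & 4095 = 904
	 *   partial_end   = 25000 & 4095 = 424
	 *   lblk = 2, max_blocks = 6 - 2 = 4
	 * Blocks 2-5 are block-aligned and get converted to unwritten below,
	 * while the unaligned head and tail are preallocated and then zeroed
	 * via ext4_zero_partial_blocks().
	 */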
4489 
4490 	inode_lock(inode);
4491 
4492 	/*
4493 	 * Indirect files do not support unwritten extents
4494 	 */
4495 	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
4496 		ret = -EOPNOTSUPP;
4497 		goto out_mutex;
4498 	}
4499 
4500 	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4501 	    (offset + len > inode->i_size ||
4502 	     offset + len > EXT4_I(inode)->i_disksize)) {
4503 		new_size = offset + len;
4504 		ret = inode_newsize_ok(inode, new_size);
4505 		if (ret)
4506 			goto out_mutex;
4507 	}
4508 
4509 	flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
4510 
4511 	/* Wait for all existing dio workers; newcomers will block on i_mutex */
4512 	inode_dio_wait(inode);
4513 
4514 	/* Preallocate the range including the unaligned edges */
4515 	if (partial_begin || partial_end) {
4516 		ret = ext4_alloc_file_blocks(file,
4517 				round_down(offset, 1 << blkbits) >> blkbits,
4518 				(round_up((offset + len), 1 << blkbits) -
4519 				 round_down(offset, 1 << blkbits)) >> blkbits,
4520 				new_size, flags);
4521 		if (ret)
4522 			goto out_mutex;
4523 
4524 	}
4525 
4526 	/* Zero range excluding the unaligned edges */
4527 	if (max_blocks > 0) {
4528 		flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
4529 			  EXT4_EX_NOCACHE);
4530 
4531 		/*
4532 		 * Prevent page faults from reinstantiating pages we have
4533 		 * released from page cache.
4534 		 */
4535 		down_write(&EXT4_I(inode)->i_mmap_sem);
4536 
4537 		ret = ext4_break_layouts(inode);
4538 		if (ret) {
4539 			up_write(&EXT4_I(inode)->i_mmap_sem);
4540 			goto out_mutex;
4541 		}
4542 
4543 		ret = ext4_update_disksize_before_punch(inode, offset, len);
4544 		if (ret) {
4545 			up_write(&EXT4_I(inode)->i_mmap_sem);
4546 			goto out_mutex;
4547 		}
4548 		/* Now release the pages and zero block aligned part of pages */
4549 		truncate_pagecache_range(inode, start, end - 1);
4550 		inode->i_mtime = inode->i_ctime = current_time(inode);
4551 
4552 		ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
4553 					     flags);
4554 		up_write(&EXT4_I(inode)->i_mmap_sem);
4555 		if (ret)
4556 			goto out_mutex;
4557 	}
4558 	if (!partial_begin && !partial_end)
4559 		goto out_mutex;
4560 
4561 	/*
4562 	 * In the worst case we have to write out two non-adjacent unwritten
4563 	 * blocks and update the inode
4564 	 */
4565 	credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1;
4566 	if (ext4_should_journal_data(inode))
4567 		credits += 2;
4568 	handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
4569 	if (IS_ERR(handle)) {
4570 		ret = PTR_ERR(handle);
4571 		ext4_std_error(inode->i_sb, ret);
4572 		goto out_mutex;
4573 	}
4574 
4575 	inode->i_mtime = inode->i_ctime = current_time(inode);
4576 	if (new_size)
4577 		ext4_update_inode_size(inode, new_size);
4578 	ret = ext4_mark_inode_dirty(handle, inode);
4579 	if (unlikely(ret))
4580 		goto out_handle;
4581 
4582 	/* Zero out partial block at the edges of the range */
4583 	ret = ext4_zero_partial_blocks(handle, inode, offset, len);
4584 	if (ret >= 0)
4585 		ext4_update_inode_fsync_trans(handle, inode, 1);
4586 
4587 	if (file->f_flags & O_SYNC)
4588 		ext4_handle_sync(handle);
4589 
4590 out_handle:
4591 	ext4_journal_stop(handle);
4592 out_mutex:
4593 	inode_unlock(inode);
4594 	return ret;
4595 }
4596 
4597 /*
4598  * Preallocate space for a file. This implements ext4's fallocate file
4599  * operation, which gets called from the fallocate() system call.
4600  * For block-mapped files, posix_fallocate should fall back to the method
4601  * of writing zeroes to the required new blocks (the same behavior which is
4602  * expected for file systems which do not support fallocate() system call).
4603  */
4604 long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4605 {
4606 	struct inode *inode = file_inode(file);
4607 	loff_t new_size = 0;
4608 	unsigned int max_blocks;
4609 	int ret = 0;
4610 	int flags;
4611 	ext4_lblk_t lblk;
4612 	unsigned int blkbits = inode->i_blkbits;
4613 
4614 	/*
4615 	 * Encrypted inodes can't handle collapse range or insert
4616 	 * range since we would need to re-encrypt blocks with a
4617 	 * different IV or XTS tweak (which are based on the logical
4618 	 * block number).
4619 	 */
4620 	if (IS_ENCRYPTED(inode) &&
4621 	    (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
4622 		return -EOPNOTSUPP;
4623 
4624 	/* Return error if mode is not supported */
4625 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
4626 		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
4627 		     FALLOC_FL_INSERT_RANGE))
4628 		return -EOPNOTSUPP;
4629 
4630 	if (mode & FALLOC_FL_PUNCH_HOLE)
4631 		return ext4_punch_hole(inode, offset, len);
4632 
4633 	ret = ext4_convert_inline_data(inode);
4634 	if (ret)
4635 		return ret;
4636 
4637 	if (mode & FALLOC_FL_COLLAPSE_RANGE)
4638 		return ext4_collapse_range(inode, offset, len);
4639 
4640 	if (mode & FALLOC_FL_INSERT_RANGE)
4641 		return ext4_insert_range(inode, offset, len);
4642 
4643 	if (mode & FALLOC_FL_ZERO_RANGE)
4644 		return ext4_zero_range(file, offset, len, mode);
4645 
4646 	trace_ext4_fallocate_enter(inode, offset, len, mode);
4647 	lblk = offset >> blkbits;
4648 
4649 	max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
4650 	flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
4651 
4652 	inode_lock(inode);
4653 
4654 	/*
4655 	 * We only support preallocation for extent-based files
4656 	 */
4657 	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
4658 		ret = -EOPNOTSUPP;
4659 		goto out;
4660 	}
4661 
4662 	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4663 	    (offset + len > inode->i_size ||
4664 	     offset + len > EXT4_I(inode)->i_disksize)) {
4665 		new_size = offset + len;
4666 		ret = inode_newsize_ok(inode, new_size);
4667 		if (ret)
4668 			goto out;
4669 	}
4670 
4671 	/* Wait for all existing dio workers; newcomers will block on i_mutex */
4672 	inode_dio_wait(inode);
4673 
4674 	ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags);
4675 	if (ret)
4676 		goto out;
4677 
4678 	if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
4679 		ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
4680 						EXT4_I(inode)->i_sync_tid);
4681 	}
4682 out:
4683 	inode_unlock(inode);
4684 	trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
4685 	return ret;
4686 }
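/*
 * Illustrative userspace call (added annotation, not part of this file):
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <linux/falloc.h>
 *
 *	// Zero 1 MiB at offset 4 MiB without changing i_size; the VFS
 *	// routes this to ext4_fallocate() above, which dispatches to
 *	// ext4_zero_range() for FALLOC_FL_ZERO_RANGE.
 *	fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE,
 *		  4 << 20, 1 << 20);
 */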
4687 
4688 /*
4689  * This function converts a range of blocks to written extents.
4690  * The caller of this function will pass the start offset and the size.
4691  * All unwritten extents within this range will be converted to
4692  * written extents.
4693  *
4694  * This function is called from the direct I/O end io callback
4695  * function, to convert the fallocated extents after I/O is completed.
4696  * Returns 0 on success.
4697  */
4698 int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
4699 				   loff_t offset, ssize_t len)
4700 {
4701 	unsigned int max_blocks;
4702 	int ret = 0, ret2 = 0, ret3 = 0;
4703 	struct ext4_map_blocks map;
4704 	unsigned int blkbits = inode->i_blkbits;
4705 	unsigned int credits = 0;
4706 
4707 	map.m_lblk = offset >> blkbits;
4708 	max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
4709 
4710 	if (!handle) {
4711 		/*
4712 		 * credits to insert 1 extent into extent tree
4713 		 */
4714 		credits = ext4_chunk_trans_blocks(inode, max_blocks);
4715 	}
4716 	while (ret >= 0 && ret < max_blocks) {
4717 		map.m_lblk += ret;
4718 		map.m_len = (max_blocks -= ret);
4719 		if (credits) {
4720 			handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
4721 						    credits);
4722 			if (IS_ERR(handle)) {
4723 				ret = PTR_ERR(handle);
4724 				break;
4725 			}
4726 		}
4727 		ret = ext4_map_blocks(handle, inode, &map,
4728 				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
4729 		if (ret <= 0)
4730 			ext4_warning(inode->i_sb,
4731 				     "inode #%lu: block %u: len %u: "
4732 				     "ext4_ext_map_blocks returned %d",
4733 				     inode->i_ino, map.m_lblk,
4734 				     map.m_len, ret);
4735 		ret2 = ext4_mark_inode_dirty(handle, inode);
4736 		if (credits) {
4737 			ret3 = ext4_journal_stop(handle);
4738 			if (unlikely(ret3))
4739 				ret2 = ret3;
4740 		}
4741 
4742 		if (ret <= 0 || ret2)
4743 			break;
4744 	}
4745 	return ret > 0 ? ret2 : ret;
4746 }
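/*
 * Illustrative arithmetic (added annotation, values hypothetical): with
 * 4 KiB blocks, a completed DIO write covering offset == 4096, len == 8192
 * gives map.m_lblk == 1 and max_blocks == 2, so the loop above converts
 * logical blocks 1 and 2 from unwritten to written, restarting a small
 * transaction per iteration when no reserved handle was passed in.
 */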
4747 
4748 int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end)
4749 {
4750 	int ret, err = 0;
4751 	struct ext4_io_end_vec *io_end_vec;
4752 
4753 	/*
4754 	 * This is somewhat ugly but the idea is clear: when a transaction is
4755 	 * reserved, everything goes into it. Otherwise we'd rather start several
4756 	 * smaller transactions, converting each extent separately.
4757 	 */
4758 	if (handle) {
4759 		handle = ext4_journal_start_reserved(handle,
4760 						     EXT4_HT_EXT_CONVERT);
4761 		if (IS_ERR(handle))
4762 			return PTR_ERR(handle);
4763 	}
4764 
4765 	list_for_each_entry(io_end_vec, &io_end->list_vec, list) {
4766 		ret = ext4_convert_unwritten_extents(handle, io_end->inode,
4767 						     io_end_vec->offset,
4768 						     io_end_vec->size);
4769 		if (ret)
4770 			break;
4771 	}
4772 
4773 	if (handle)
4774 		err = ext4_journal_stop(handle);
4775 
4776 	return ret < 0 ? ret : err;
4777 }
4778 
4779 static int ext4_iomap_xattr_fiemap(struct inode *inode, struct iomap *iomap)
4780 {
4781 	__u64 physical = 0;
4782 	__u64 length = 0;
4783 	int blockbits = inode->i_sb->s_blocksize_bits;
4784 	int error = 0;
4785 	u16 iomap_type;
4786 
4787 	/* in-inode? */
4788 	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
4789 		struct ext4_iloc iloc;
4790 		int offset;	/* offset of xattr in inode */
4791 
4792 		error = ext4_get_inode_loc(inode, &iloc);
4793 		if (error)
4794 			return error;
4795 		physical = (__u64)iloc.bh->b_blocknr << blockbits;
4796 		offset = EXT4_GOOD_OLD_INODE_SIZE +
4797 				EXT4_I(inode)->i_extra_isize;
4798 		physical += offset;
4799 		length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
4800 		brelse(iloc.bh);
4801 		iomap_type = IOMAP_INLINE;
4802 	} else if (EXT4_I(inode)->i_file_acl) { /* external block */
4803 		physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
4804 		length = inode->i_sb->s_blocksize;
4805 		iomap_type = IOMAP_MAPPED;
4806 	} else {
4807 		/* no in-inode or external block for xattr, so return -ENOENT */
4808 		error = -ENOENT;
4809 		goto out;
4810 	}
4811 
4812 	iomap->addr = physical;
4813 	iomap->offset = 0;
4814 	iomap->length = length;
4815 	iomap->type = iomap_type;
4816 	iomap->flags = 0;
4817 out:
4818 	return error;
4819 }
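/*
 * Illustrative arithmetic (added annotation, values hypothetical): for an
 * in-inode xattr with 256-byte on-disk inodes and i_extra_isize == 32,
 * offset == EXT4_GOOD_OLD_INODE_SIZE (128) + 32 == 160 and
 * length == 256 - 160 == 96, i.e. 96 bytes of in-inode xattr space are
 * reported, with physical computed from the block holding the inode plus
 * that offset.
 */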
4820 
4821 static int ext4_iomap_xattr_begin(struct inode *inode, loff_t offset,
4822 				  loff_t length, unsigned flags,
4823 				  struct iomap *iomap, struct iomap *srcmap)
4824 {
4825 	int error;
4826 
4827 	error = ext4_iomap_xattr_fiemap(inode, iomap);
4828 	if (error == 0 && (offset >= iomap->length))
4829 		error = -ENOENT;
4830 	return error;
4831 }
4832 
4833 static const struct iomap_ops ext4_iomap_xattr_ops = {
4834 	.iomap_begin		= ext4_iomap_xattr_begin,
4835 };
4836 
4837 static int _ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4838 			__u64 start, __u64 len, bool from_es_cache)
4839 {
4840 	ext4_lblk_t start_blk;
4841 	u32 ext4_fiemap_flags = FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR;
4842 	int error = 0;
4843 
4844 	if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
4845 		error = ext4_ext_precache(inode);
4846 		if (error)
4847 			return error;
4848 		fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
4849 	}
4850 
4851 	if (from_es_cache)
4852 		ext4_fiemap_flags &= FIEMAP_FLAG_XATTR;
4853 
4854 	if (fiemap_check_flags(fieinfo, ext4_fiemap_flags))
4855 		return -EBADR;
4856 
4857 	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
4858 		fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR;
4859 		error = iomap_fiemap(inode, fieinfo, start, len,
4860 				     &ext4_iomap_xattr_ops);
4861 	} else if (!from_es_cache) {
4862 		error = iomap_fiemap(inode, fieinfo, start, len,
4863 				     &ext4_iomap_report_ops);
4864 	} else {
4865 		ext4_lblk_t len_blks;
4866 		__u64 last_blk;
4867 
4868 		start_blk = start >> inode->i_sb->s_blocksize_bits;
4869 		last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
4870 		if (last_blk >= EXT_MAX_BLOCKS)
4871 			last_blk = EXT_MAX_BLOCKS-1;
4872 		len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
4873 
4874 		/*
4875 		 * Walk the extent tree gathering extent information
4876 		 * and pushing extents back to the user.
4877 		 */
4878 		error = ext4_fill_es_cache_info(inode, start_blk, len_blks,
4879 						fieinfo);
4880 	}
4881 	return error;
4882 }
4883 
4884 int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4885 		__u64 start, __u64 len)
4886 {
4887 	return _ext4_fiemap(inode, fieinfo, start, len, false);
4888 }
4889 
4890 int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
4891 		      __u64 start, __u64 len)
4892 {
4893 	if (ext4_has_inline_data(inode)) {
4894 		int has_inline;
4895 
4896 		down_read(&EXT4_I(inode)->xattr_sem);
4897 		has_inline = ext4_has_inline_data(inode);
4898 		up_read(&EXT4_I(inode)->xattr_sem);
4899 		if (has_inline)
4900 			return 0;
4901 	}
4902 
4903 	return _ext4_fiemap(inode, fieinfo, start, len, true);
4904 }
4905 
4906 
4907 /*
4908  * ext4_access_path:
4909  * Function to access the path buffer for marking it dirty.
4910  * It also checks if there are sufficient credits left in the journal handle
4911  * to update path.
4912  */
4913 static int
4914 ext4_access_path(handle_t *handle, struct inode *inode,
4915 		struct ext4_ext_path *path)
4916 {
4917 	int credits, err;
4918 
4919 	if (!ext4_handle_valid(handle))
4920 		return 0;
4921 
4922 	/*
4923 	 * Check if we need to extend journal credits:
4924 	 * 3 for leaf, sb, and inode plus 2 (bmap and group
4925 	 * descriptor) for each block group; assume two block
4926 	 * groups
4927 	 */
4928 	credits = ext4_writepage_trans_blocks(inode);
4929 	err = ext4_datasem_ensure_credits(handle, inode, 7, credits, 0);
4930 	if (err < 0)
4931 		return err;
4932 
4933 	err = ext4_ext_get_access(handle, inode, path);
4934 	return err;
4935 }
4936 
4937 /*
4938  * ext4_ext_shift_path_extents:
4939  * Shift the extents of a path structure lying between path[depth].p_ext
4940  * and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells
4941  * whether it is a right or left shift operation.
4942  */
4943 static int
4944 ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
4945 			    struct inode *inode, handle_t *handle,
4946 			    enum SHIFT_DIRECTION SHIFT)
4947 {
4948 	int depth, err = 0;
4949 	struct ext4_extent *ex_start, *ex_last;
4950 	bool update = false;
4951 	depth = path->p_depth;
4952 
4953 	while (depth >= 0) {
4954 		if (depth == path->p_depth) {
4955 			ex_start = path[depth].p_ext;
4956 			if (!ex_start)
4957 				return -EFSCORRUPTED;
4958 
4959 			ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
4960 
4961 			err = ext4_access_path(handle, inode, path + depth);
4962 			if (err)
4963 				goto out;
4964 
4965 			if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
4966 				update = true;
4967 
4968 			while (ex_start <= ex_last) {
4969 				if (SHIFT == SHIFT_LEFT) {
4970 					le32_add_cpu(&ex_start->ee_block,
4971 						-shift);
4972 					/* Try to merge to the left. */
4973 					if ((ex_start >
4974 					    EXT_FIRST_EXTENT(path[depth].p_hdr))
4975 					    &&
4976 					    ext4_ext_try_to_merge_right(inode,
4977 					    path, ex_start - 1))
4978 						ex_last--;
4979 					else
4980 						ex_start++;
4981 				} else {
4982 					le32_add_cpu(&ex_last->ee_block, shift);
4983 					ext4_ext_try_to_merge_right(inode, path,
4984 						ex_last);
4985 					ex_last--;
4986 				}
4987 			}
4988 			err = ext4_ext_dirty(handle, inode, path + depth);
4989 			if (err)
4990 				goto out;
4991 
4992 			if (--depth < 0 || !update)
4993 				break;
4994 		}
4995 
4996 		/* Update index too */
4997 		err = ext4_access_path(handle, inode, path + depth);
4998 		if (err)
4999 			goto out;
5000 
5001 		if (SHIFT == SHIFT_LEFT)
5002 			le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
5003 		else
5004 			le32_add_cpu(&path[depth].p_idx->ei_block, shift);
5005 		err = ext4_ext_dirty(handle, inode, path + depth);
5006 		if (err)
5007 			goto out;
5008 
5009 		/* we are done if current index is not a starting index */
5010 		if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
5011 			break;
5012 
5013 		depth--;
5014 	}
5015 
5016 out:
5017 	return err;
5018 }
5019 
5020 /*
5021  * ext4_ext_shift_extents:
5022  * All the extents which lie in the range from @start to the last allocated
5023  * block for the @inode are shifted either towards left or right (depending
5024  * upon @SHIFT) by @shift blocks.
5025  * On success, 0 is returned, error otherwise.
5026  */
5027 static int
5028 ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
5029 		       ext4_lblk_t start, ext4_lblk_t shift,
5030 		       enum SHIFT_DIRECTION SHIFT)
5031 {
5032 	struct ext4_ext_path *path;
5033 	int ret = 0, depth;
5034 	struct ext4_extent *extent;
5035 	ext4_lblk_t stop, *iterator, ex_start, ex_end;
5036 
5037 	/* Let path point to the last extent */
5038 	path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
5039 				EXT4_EX_NOCACHE);
5040 	if (IS_ERR(path))
5041 		return PTR_ERR(path);
5042 
5043 	depth = path->p_depth;
5044 	extent = path[depth].p_ext;
5045 	if (!extent)
5046 		goto out;
5047 
5048 	stop = le32_to_cpu(extent->ee_block);
5049 
5050        /*
5051 	* For left shifts, make sure the hole on the left is big enough to
5052 	* accommodate the shift.  For right shifts, make sure the last extent
5053 	* won't be shifted beyond EXT_MAX_BLOCKS.
5054 	*/
5055 	if (SHIFT == SHIFT_LEFT) {
5056 		path = ext4_find_extent(inode, start - 1, &path,
5057 					EXT4_EX_NOCACHE);
5058 		if (IS_ERR(path))
5059 			return PTR_ERR(path);
5060 		depth = path->p_depth;
5061 		extent =  path[depth].p_ext;
5062 		if (extent) {
5063 			ex_start = le32_to_cpu(extent->ee_block);
5064 			ex_end = le32_to_cpu(extent->ee_block) +
5065 				ext4_ext_get_actual_len(extent);
5066 		} else {
5067 			ex_start = 0;
5068 			ex_end = 0;
5069 		}
5070 
5071 		if ((start == ex_start && shift > ex_start) ||
5072 		    (shift > start - ex_end)) {
5073 			ret = -EINVAL;
5074 			goto out;
5075 		}
5076 	} else {
5077 		if (shift > EXT_MAX_BLOCKS -
5078 		    (stop + ext4_ext_get_actual_len(extent))) {
5079 			ret = -EINVAL;
5080 			goto out;
5081 		}
5082 	}
5083 
5084 	/*
5085 	 * In case of left shift, iterator points to start and it is increased
5086 	 * till we reach stop. In case of right shift, iterator points to stop
5087 	 * and it is decreased till we reach start.
5088 	 */
5089 	if (SHIFT == SHIFT_LEFT)
5090 		iterator = &start;
5091 	else
5092 		iterator = &stop;
5093 
5094 	/*
5095 	 * It's safe to start updating extents.  Start and stop are unsigned, so
5096 	 * in case of right shift if extent with 0 block is reached, iterator
5097 	 * becomes NULL to indicate the end of the loop.
5098 	 */
5099 	while (iterator && start <= stop) {
5100 		path = ext4_find_extent(inode, *iterator, &path,
5101 					EXT4_EX_NOCACHE);
5102 		if (IS_ERR(path))
5103 			return PTR_ERR(path);
5104 		depth = path->p_depth;
5105 		extent = path[depth].p_ext;
5106 		if (!extent) {
5107 			EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
5108 					 (unsigned long) *iterator);
5109 			return -EFSCORRUPTED;
5110 		}
5111 		if (SHIFT == SHIFT_LEFT && *iterator >
5112 		    le32_to_cpu(extent->ee_block)) {
5113 			/* Hole, move to the next extent */
5114 			if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) {
5115 				path[depth].p_ext++;
5116 			} else {
5117 				*iterator = ext4_ext_next_allocated_block(path);
5118 				continue;
5119 			}
5120 		}
5121 
5122 		if (SHIFT == SHIFT_LEFT) {
5123 			extent = EXT_LAST_EXTENT(path[depth].p_hdr);
5124 			*iterator = le32_to_cpu(extent->ee_block) +
5125 					ext4_ext_get_actual_len(extent);
5126 		} else {
5127 			extent = EXT_FIRST_EXTENT(path[depth].p_hdr);
5128 			if (le32_to_cpu(extent->ee_block) > 0)
5129 				*iterator = le32_to_cpu(extent->ee_block) - 1;
5130 			else
5131 				/* Beginning is reached, end of the loop */
5132 				iterator = NULL;
5133 			/* Update path extent in case we need to stop */
5134 			while (le32_to_cpu(extent->ee_block) < start)
5135 				extent++;
5136 			path[depth].p_ext = extent;
5137 		}
5138 		ret = ext4_ext_shift_path_extents(path, shift, inode,
5139 				handle, SHIFT);
5140 		if (ret)
5141 			break;
5142 	}
5143 out:
5144 	ext4_ext_drop_refs(path);
5145 	kfree(path);
5146 	return ret;
5147 }
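/*
 * Illustrative example (added annotation, values hypothetical): for a left
 * shift starting at block 100 by shift == 25, with the extent just before
 * the range ending at ex_end == 80, start - ex_end == 20 < 25, so the
 * shift would collide with existing blocks and -EINVAL is returned; with
 * ex_end == 70 the gap is 30 blocks and the shift proceeds.  For a right
 * shift, -EINVAL is returned when the last extent would be pushed past
 * EXT_MAX_BLOCKS.
 */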
5148 
5149 /*
5150  * ext4_collapse_range:
5151  * This implements the fallocate's collapse range functionality for ext4.
5152  * Returns 0 on success and a non-zero value on error.
5153  */
5154 static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
5155 {
5156 	struct super_block *sb = inode->i_sb;
5157 	ext4_lblk_t punch_start, punch_stop;
5158 	handle_t *handle;
5159 	unsigned int credits;
5160 	loff_t new_size, ioffset;
5161 	int ret;
5162 
5163 	/*
5164 	 * We need to test this early because xfstests assumes that a
5165 	 * collapse range of (0, 1) will return EOPNOTSUPP if the file
5166 	 * system does not support collapse range.
5167 	 */
5168 	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
5169 		return -EOPNOTSUPP;
5170 
5171 	/* Collapse range works only on fs cluster size aligned regions. */
5172 	if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
5173 		return -EINVAL;
5174 
5175 	trace_ext4_collapse_range(inode, offset, len);
5176 
5177 	punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
5178 	punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
5179 
5180 	/* Call ext4_force_commit to flush all data in case of data=journal. */
5181 	if (ext4_should_journal_data(inode)) {
5182 		ret = ext4_force_commit(inode->i_sb);
5183 		if (ret)
5184 			return ret;
5185 	}
5186 
5187 	inode_lock(inode);
5188 	/*
5189 	 * The collapse range must not reach or overlap EOF; that case would
5190 	 * effectively be a truncate operation
5191 	 */
5192 	if (offset + len >= inode->i_size) {
5193 		ret = -EINVAL;
5194 		goto out_mutex;
5195 	}
5196 
5197 	/* Currently just for extent based files */
5198 	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
5199 		ret = -EOPNOTSUPP;
5200 		goto out_mutex;
5201 	}
5202 
5203 	/* Wait for existing dio to complete */
5204 	inode_dio_wait(inode);
5205 
5206 	/*
5207 	 * Prevent page faults from reinstantiating pages we have released from
5208 	 * page cache.
5209 	 */
5210 	down_write(&EXT4_I(inode)->i_mmap_sem);
5211 
5212 	ret = ext4_break_layouts(inode);
5213 	if (ret)
5214 		goto out_mmap;
5215 
5216 	/*
5217 	 * Need to round down offset to be aligned with page size boundary
5218 	 * for page size > block size.
5219 	 */
5220 	ioffset = round_down(offset, PAGE_SIZE);
5221 	/*
5222 	 * Write tail of the last page before removed range since it will get
5223 	 * removed from the page cache below.
5224 	 */
5225 	ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset);
5226 	if (ret)
5227 		goto out_mmap;
5228 	/*
5229 	 * Write out the data that will be shifted so it is preserved when the
5230 	 * page cache is discarded below. i_mmap_sem also protects us from the
5231 	 * pages becoming dirty in the meantime.
5232 	 */
5233 	ret = filemap_write_and_wait_range(inode->i_mapping, offset + len,
5234 					   LLONG_MAX);
5235 	if (ret)
5236 		goto out_mmap;
5237 	truncate_pagecache(inode, ioffset);
5238 
5239 	credits = ext4_writepage_trans_blocks(inode);
5240 	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
5241 	if (IS_ERR(handle)) {
5242 		ret = PTR_ERR(handle);
5243 		goto out_mmap;
5244 	}
5245 
5246 	down_write(&EXT4_I(inode)->i_data_sem);
5247 	ext4_discard_preallocations(inode);
5248 
5249 	ret = ext4_es_remove_extent(inode, punch_start,
5250 				    EXT_MAX_BLOCKS - punch_start);
5251 	if (ret) {
5252 		up_write(&EXT4_I(inode)->i_data_sem);
5253 		goto out_stop;
5254 	}
5255 
5256 	ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
5257 	if (ret) {
5258 		up_write(&EXT4_I(inode)->i_data_sem);
5259 		goto out_stop;
5260 	}
5261 	ext4_discard_preallocations(inode);
5262 
5263 	ret = ext4_ext_shift_extents(inode, handle, punch_stop,
5264 				     punch_stop - punch_start, SHIFT_LEFT);
5265 	if (ret) {
5266 		up_write(&EXT4_I(inode)->i_data_sem);
5267 		goto out_stop;
5268 	}
5269 
5270 	new_size = inode->i_size - len;
5271 	i_size_write(inode, new_size);
5272 	EXT4_I(inode)->i_disksize = new_size;
5273 
5274 	up_write(&EXT4_I(inode)->i_data_sem);
5275 	if (IS_SYNC(inode))
5276 		ext4_handle_sync(handle);
5277 	inode->i_mtime = inode->i_ctime = current_time(inode);
5278 	ret = ext4_mark_inode_dirty(handle, inode);
5279 	ext4_update_inode_fsync_trans(handle, inode, 1);
5280 
5281 out_stop:
5282 	ext4_journal_stop(handle);
5283 out_mmap:
5284 	up_write(&EXT4_I(inode)->i_mmap_sem);
5285 out_mutex:
5286 	inode_unlock(inode);
5287 	return ret;
5288 }
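
/*
 * Usage sketch (an illustrative addition, not part of the original source):
 * userspace reaches ext4_collapse_range() through fallocate(2) with the
 * FALLOC_FL_COLLAPSE_RANGE mode.  Both the offset and the length must be
 * multiples of the filesystem cluster size, or the checks above fail with
 * -EINVAL.
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <linux/falloc.h>
 *
 *	// Remove the byte range [off, off + len) and shift the tail of the
 *	// file left, shrinking it by len bytes.
 *	static int collapse_range(int fd, off_t off, off_t len)
 *	{
 *		return fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, off, len);
 *	}
 */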
5289 
5290 /*
5291  * ext4_insert_range:
5292  * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate.
5293  * The data blocks starting from @offset to the EOF are shifted by @len
5294  * to the right to create a hole in the @inode. The inode size is increased
5295  * by @len bytes (a usage sketch follows the function body).
5296  * Returns 0 on success and a negative error code otherwise.
5297  */
5298 static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
5299 {
5300 	struct super_block *sb = inode->i_sb;
5301 	handle_t *handle;
5302 	struct ext4_ext_path *path;
5303 	struct ext4_extent *extent;
5304 	ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0;
5305 	unsigned int credits, ee_len;
5306 	int ret = 0, depth, split_flag = 0;
5307 	loff_t ioffset;
5308 
5309 	/*
5310 	 * We need to test this early because xfstests assumes that an
5311 	 * insert range of (0, 1) will return EOPNOTSUPP if the file
5312 	 * system does not support insert range.
5313 	 */
5314 	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
5315 		return -EOPNOTSUPP;
5316 
5317 	/* Insert range works only on fs cluster size aligned regions. */
5318 	if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
5319 		return -EINVAL;
5320 
5321 	trace_ext4_insert_range(inode, offset, len);
5322 
5323 	offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb);
5324 	len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb);
5325 
5326 	/* Call ext4_force_commit to flush all data in case of data=journal */
5327 	if (ext4_should_journal_data(inode)) {
5328 		ret = ext4_force_commit(inode->i_sb);
5329 		if (ret)
5330 			return ret;
5331 	}
5332 
5333 	inode_lock(inode);
5334 	/* Currently just for extent based files */
5335 	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
5336 		ret = -EOPNOTSUPP;
5337 		goto out_mutex;
5338 	}
5339 
5340 	/* Check whether the maximum file size would be exceeded */
5341 	if (len > inode->i_sb->s_maxbytes - inode->i_size) {
5342 		ret = -EFBIG;
5343 		goto out_mutex;
5344 	}
5345 
5346 	/* Offset must be less than i_size */
5347 	if (offset >= inode->i_size) {
5348 		ret = -EINVAL;
5349 		goto out_mutex;
5350 	}
5351 
5352 	/* Wait for existing dio to complete */
5353 	inode_dio_wait(inode);
5354 
5355 	/*
5356 	 * Prevent page faults from reinstantiating pages we have released from
5357 	 * page cache.
5358 	 */
5359 	down_write(&EXT4_I(inode)->i_mmap_sem);
5360 
5361 	ret = ext4_break_layouts(inode);
5362 	if (ret)
5363 		goto out_mmap;
5364 
5365 	/*
5366 	 * Need to round down to align start offset to page size boundary
5367 	 * for page size > block size.
5368 	 */
5369 	ioffset = round_down(offset, PAGE_SIZE);
5370 	/* Write out all dirty pages */
5371 	ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
5372 			LLONG_MAX);
5373 	if (ret)
5374 		goto out_mmap;
5375 	truncate_pagecache(inode, ioffset);
5376 
5377 	credits = ext4_writepage_trans_blocks(inode);
5378 	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
5379 	if (IS_ERR(handle)) {
5380 		ret = PTR_ERR(handle);
5381 		goto out_mmap;
5382 	}
5383 
5384 	/* Expand the file to avoid data loss if there is an error while shifting */
5385 	inode->i_size += len;
5386 	EXT4_I(inode)->i_disksize += len;
5387 	inode->i_mtime = inode->i_ctime = current_time(inode);
5388 	ret = ext4_mark_inode_dirty(handle, inode);
5389 	if (ret)
5390 		goto out_stop;
5391 
5392 	down_write(&EXT4_I(inode)->i_data_sem);
5393 	ext4_discard_preallocations(inode);
5394 
5395 	path = ext4_find_extent(inode, offset_lblk, NULL, 0);
5396 	if (IS_ERR(path)) {
5397 		ret = PTR_ERR(path);
5398 		up_write(&EXT4_I(inode)->i_data_sem);
5399 		goto out_stop;
5400 	}
5400 
5401 	depth = ext_depth(inode);
5402 	extent = path[depth].p_ext;
5403 	if (extent) {
5404 		ee_start_lblk = le32_to_cpu(extent->ee_block);
5405 		ee_len = ext4_ext_get_actual_len(extent);
5406 
5407 		/*
5408 		 * If offset_lblk is not the starting block of the extent, split
5409 		 * the extent at @offset_lblk
5410 		 */
5411 		if ((offset_lblk > ee_start_lblk) &&
5412 				(offset_lblk < (ee_start_lblk + ee_len))) {
5413 			if (ext4_ext_is_unwritten(extent))
5414 				split_flag = EXT4_EXT_MARK_UNWRIT1 |
5415 					EXT4_EXT_MARK_UNWRIT2;
5416 			ret = ext4_split_extent_at(handle, inode, &path,
5417 					offset_lblk, split_flag,
5418 					EXT4_EX_NOCACHE |
5419 					EXT4_GET_BLOCKS_PRE_IO |
5420 					EXT4_GET_BLOCKS_METADATA_NOFAIL);
5421 		}
5422 
5423 		ext4_ext_drop_refs(path);
5424 		kfree(path);
5425 		if (ret < 0) {
5426 			up_write(&EXT4_I(inode)->i_data_sem);
5427 			goto out_stop;
5428 		}
5429 	} else {
5430 		ext4_ext_drop_refs(path);
5431 		kfree(path);
5432 	}
5433 
5434 	ret = ext4_es_remove_extent(inode, offset_lblk,
5435 			EXT_MAX_BLOCKS - offset_lblk);
5436 	if (ret) {
5437 		up_write(&EXT4_I(inode)->i_data_sem);
5438 		goto out_stop;
5439 	}
5440 
5441 	/*
5442 	 * If offset_lblk lies in a hole at the start of the file, use
5443 	 * ee_start_lblk to shift the extents
5444 	 */
5445 	ret = ext4_ext_shift_extents(inode, handle,
5446 		ee_start_lblk > offset_lblk ? ee_start_lblk : offset_lblk,
5447 		len_lblk, SHIFT_RIGHT);
5448 
5449 	up_write(&EXT4_I(inode)->i_data_sem);
5450 	if (IS_SYNC(inode))
5451 		ext4_handle_sync(handle);
5452 	if (ret >= 0)
5453 		ext4_update_inode_fsync_trans(handle, inode, 1);
5454 
5455 out_stop:
5456 	ext4_journal_stop(handle);
5457 out_mmap:
5458 	up_write(&EXT4_I(inode)->i_mmap_sem);
5459 out_mutex:
5460 	inode_unlock(inode);
5461 	return ret;
5462 }
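
/*
 * Usage sketch (an illustrative addition, not part of the original source):
 * FALLOC_FL_INSERT_RANGE is the inverse of the collapse operation above; it
 * shifts everything from off onward to the right by len bytes and leaves an
 * unallocated hole behind.  The same cluster-alignment rules apply, and off
 * must be below the current file size.
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <linux/falloc.h>
 *
 *	// Insert a hole of len bytes at off, growing the file by len bytes.
 *	static int insert_range(int fd, off_t off, off_t len)
 *	{
 *		return fallocate(fd, FALLOC_FL_INSERT_RANGE, off, len);
 *	}
 */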
5463 
5464 /**
5465  * ext4_swap_extents() - Swap extents between two inodes
5466  * @handle: handle for this transaction
5467  * @inode1:	First inode
5468  * @inode2:	Second inode
5469  * @lblk1:	Start block for first inode
5470  * @lblk2:	Start block for second inode
5471  * @count:	Number of blocks to swap
5472  * @unwritten: Mark second inode's extents as unwritten after swap
5473  * @erp:	Pointer to save error value
5474  *
5475  * This helper routine does exactly what its name promises: it swaps extents.
5476  * Everything else, such as page-cache locking consistency, bh mapping
5477  * consistency or copying of the extents' data, must be performed by the caller.
5478  * Locking:
5479  * 		i_mutex is held for both inodes
5480  * 		i_data_sem is locked for write for both inodes
5481  * Assumptions:
5482  *		All pages from requested range are locked for both inodes
5483  */
5484 int
5485 ext4_swap_extents(handle_t *handle, struct inode *inode1,
5486 		  struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2,
5487 		  ext4_lblk_t count, int unwritten, int *erp)
5488 {
5489 	struct ext4_ext_path *path1 = NULL;
5490 	struct ext4_ext_path *path2 = NULL;
5491 	int replaced_count = 0;
5492 
5493 	BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
5494 	BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
5495 	BUG_ON(!inode_is_locked(inode1));
5496 	BUG_ON(!inode_is_locked(inode2));
5497 
5498 	*erp = ext4_es_remove_extent(inode1, lblk1, count);
5499 	if (unlikely(*erp))
5500 		return 0;
5501 	*erp = ext4_es_remove_extent(inode2, lblk2, count);
5502 	if (unlikely(*erp))
5503 		return 0;
5504 
5505 	while (count) {
5506 		struct ext4_extent *ex1, *ex2, tmp_ex;
5507 		ext4_lblk_t e1_blk, e2_blk;
5508 		int e1_len, e2_len, len;
5509 		int split = 0;
5510 
5511 		path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
5512 		if (IS_ERR(path1)) {
5513 			*erp = PTR_ERR(path1);
5514 			path1 = NULL;
5515 		finish:
5516 			count = 0;
5517 			goto repeat;
5518 		}
5519 		path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
5520 		if (IS_ERR(path2)) {
5521 			*erp = PTR_ERR(path2);
5522 			path2 = NULL;
5523 			goto finish;
5524 		}
5525 		ex1 = path1[path1->p_depth].p_ext;
5526 		ex2 = path2[path2->p_depth].p_ext;
5527 		/* Do we have something to swap? */
5528 		if (unlikely(!ex2 || !ex1))
5529 			goto finish;
5530 
5531 		e1_blk = le32_to_cpu(ex1->ee_block);
5532 		e2_blk = le32_to_cpu(ex2->ee_block);
5533 		e1_len = ext4_ext_get_actual_len(ex1);
5534 		e2_len = ext4_ext_get_actual_len(ex2);
5535 
5536 		/* Hole handling */
5537 		if (!in_range(lblk1, e1_blk, e1_len) ||
5538 		    !in_range(lblk2, e2_blk, e2_len)) {
5539 			ext4_lblk_t next1, next2;
5540 
5541 			/* if hole after extent, then go to next extent */
5542 			next1 = ext4_ext_next_allocated_block(path1);
5543 			next2 = ext4_ext_next_allocated_block(path2);
5544 			/* If hole before extent, then shift to that extent */
5545 			if (e1_blk > lblk1)
5546 				next1 = e1_blk;
5547 			if (e2_blk > lblk2)
5548 				next2 = e2_blk;
5549 			/* Do we have something to swap? */
5550 			if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS)
5551 				goto finish;
5552 			/* Move to the rightmost boundary */
5553 			len = next1 - lblk1;
5554 			if (len < next2 - lblk2)
5555 				len = next2 - lblk2;
5556 			if (len > count)
5557 				len = count;
5558 			lblk1 += len;
5559 			lblk2 += len;
5560 			count -= len;
5561 			goto repeat;
5562 		}
5563 
5564 		/* Prepare left boundary */
5565 		if (e1_blk < lblk1) {
5566 			split = 1;
5567 			*erp = ext4_force_split_extent_at(handle, inode1,
5568 						&path1, lblk1, 0);
5569 			if (unlikely(*erp))
5570 				goto finish;
5571 		}
5572 		if (e2_blk < lblk2) {
5573 			split = 1;
5574 			*erp = ext4_force_split_extent_at(handle, inode2,
5575 						&path2,  lblk2, 0);
5576 			if (unlikely(*erp))
5577 				goto finish;
5578 		}
5579 		/* ext4_split_extent_at() may result in a leaf extent split;
5580 		 * the path must be revalidated. */
5581 		if (split)
5582 			goto repeat;
5583 
5584 		/* Prepare right boundary */
5585 		len = count;
5586 		if (len > e1_blk + e1_len - lblk1)
5587 			len = e1_blk + e1_len - lblk1;
5588 		if (len > e2_blk + e2_len - lblk2)
5589 			len = e2_blk + e2_len - lblk2;
5590 
5591 		if (len != e1_len) {
5592 			split = 1;
5593 			*erp = ext4_force_split_extent_at(handle, inode1,
5594 						&path1, lblk1 + len, 0);
5595 			if (unlikely(*erp))
5596 				goto finish;
5597 		}
5598 		if (len != e2_len) {
5599 			split = 1;
5600 			*erp = ext4_force_split_extent_at(handle, inode2,
5601 						&path2, lblk2 + len, 0);
5602 			if (*erp)
5603 				goto finish;
5604 		}
5605 		/* ext4_split_extent_at() may result in a leaf extent split;
5606 		 * the path must be revalidated. */
5607 		if (split)
5608 			goto repeat;
5609 
5610 		BUG_ON(e2_len != e1_len);
5611 		*erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth);
5612 		if (unlikely(*erp))
5613 			goto finish;
5614 		*erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth);
5615 		if (unlikely(*erp))
5616 			goto finish;
5617 
5618 		/* Both extents are fully inside boundaries. Swap them now */
5619 		tmp_ex = *ex1;
5620 		ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2));
5621 		ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex));
5622 		ex1->ee_len = cpu_to_le16(e2_len);
5623 		ex2->ee_len = cpu_to_le16(e1_len);
5624 		if (unwritten)
5625 			ext4_ext_mark_unwritten(ex2);
5626 		if (ext4_ext_is_unwritten(&tmp_ex))
5627 			ext4_ext_mark_unwritten(ex1);
5628 
5629 		ext4_ext_try_to_merge(handle, inode2, path2, ex2);
5630 		ext4_ext_try_to_merge(handle, inode1, path1, ex1);
5631 		*erp = ext4_ext_dirty(handle, inode2, path2 +
5632 				      path2->p_depth);
5633 		if (unlikely(*erp))
5634 			goto finish;
5635 		*erp = ext4_ext_dirty(handle, inode1, path1 +
5636 				      path1->p_depth);
5637 		/*
5638 		 * Looks scary, doesn't it? The second inode already points to the
5639 		 * new blocks and was successfully dirtied. Luckily, an error here
5640 		 * can only be a journal error, in which case the whole transaction
5641 		 * will be aborted anyway.
5642 		 */
5643 		if (unlikely(*erp))
5644 			goto finish;
5645 		lblk1 += len;
5646 		lblk2 += len;
5647 		replaced_count += len;
5648 		count -= len;
5649 
5650 	repeat:
5651 		ext4_ext_drop_refs(path1);
5652 		kfree(path1);
5653 		ext4_ext_drop_refs(path2);
5654 		kfree(path2);
5655 		path1 = path2 = NULL;
5656 	}
5657 	return replaced_count;
5658 }
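
/*
 * Calling sketch (an illustrative addition, not part of the original source;
 * the real caller is the online defrag code in move_extent.c, which also
 * moves the data and keeps the page cache coherent).  All identifiers other
 * than ext4_swap_extents() are hypothetical:
 *
 *	int err = 0;
 *	int swapped;
 *
 *	// The caller already holds i_mutex and i_data_sem (for write) on both
 *	// inodes and has the affected pages locked, per the assumptions above.
 *	swapped = ext4_swap_extents(handle, orig_inode, donor_inode,
 *				    orig_lblk, donor_lblk, nr_blocks,
 *				    1, &err);	// mark donor extents unwritten
 *	if (err)
 *		return err;	// fewer than nr_blocks blocks may have moved
 *	// on success, swapped == number of blocks actually exchanged
 */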
5659 
5660 /*
5661  * ext4_clu_mapped - determine whether any block in a logical cluster has
5662  *                   been mapped to a physical cluster
5663  *
5664  * @inode - file containing the logical cluster
5665  * @lclu - logical cluster of interest
5666  *
5667  * Returns 1 if any block in the logical cluster is mapped, signifying
5668  * that a physical cluster has been allocated for it.  Otherwise,
5669  * returns 0.  Can also return negative error codes.  Derived from
5670  * ext4_ext_map_blocks().
5671  */
5672 int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
5673 {
5674 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5675 	struct ext4_ext_path *path;
5676 	int depth, mapped = 0, err = 0;
5677 	struct ext4_extent *extent;
5678 	ext4_lblk_t first_lblk, first_lclu, last_lclu;
5679 
5680 	/* search for the extent closest to the first block in the cluster */
5681 	path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0);
5682 	if (IS_ERR(path)) {
5683 		err = PTR_ERR(path);
5684 		path = NULL;
5685 		goto out;
5686 	}
5687 
5688 	depth = ext_depth(inode);
5689 
5690 	/*
5691 	 * A consistent leaf must not be empty.  This situation is possible,
5692 	 * though, _during_ tree modification, and it's why an assert can't
5693 	 * be put in ext4_find_extent().
5694 	 */
5695 	if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
5696 		EXT4_ERROR_INODE(inode,
5697 		    "bad extent address - lblock: %lu, depth: %d, pblock: %lld",
5698 				 (unsigned long) EXT4_C2B(sbi, lclu),
5699 				 depth, path[depth].p_block);
5700 		err = -EFSCORRUPTED;
5701 		goto out;
5702 	}
5703 
5704 	extent = path[depth].p_ext;
5705 
5706 	/* can't be mapped if the extent tree is empty */
5707 	if (extent == NULL)
5708 		goto out;
5709 
5710 	first_lblk = le32_to_cpu(extent->ee_block);
5711 	first_lclu = EXT4_B2C(sbi, first_lblk);
5712 
5713 	/*
5714 	 * Three possible outcomes at this point: the extent found spans the
5715 	 * target cluster, lies to the left of it, or lies to the right of it.
5716 	 * The first two cases are handled here.  The last case indicates the
5717 	 * target cluster is not mapped.
5718 	 */
5719 	if (lclu >= first_lclu) {
5720 		last_lclu = EXT4_B2C(sbi, first_lblk +
5721 				     ext4_ext_get_actual_len(extent) - 1);
5722 		if (lclu <= last_lclu) {
5723 			mapped = 1;
5724 		} else {
5725 			first_lblk = ext4_ext_next_allocated_block(path);
5726 			first_lclu = EXT4_B2C(sbi, first_lblk);
5727 			if (lclu == first_lclu)
5728 				mapped = 1;
5729 		}
5730 	}
5731 
5732 out:
5733 	ext4_ext_drop_refs(path);
5734 	kfree(path);
5735 
5736 	return err ? err : mapped;
5737 }
5738
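
/*
 * Usage sketch (an illustrative addition, not part of the original source):
 * bigalloc accounting paths can ask whether the cluster containing a given
 * logical block already has backing storage.  "lblk" is hypothetical:
 *
 *	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 *	int ret = ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk));
 *
 *	if (ret < 0)
 *		return ret;	// lookup failed, propagate the error
 *	// ret == 1: some block in the cluster is already mapped
 *	// ret == 0: no block in the cluster is mapped
 */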