xref: /openbmc/linux/fs/ext4/extents.c (revision 189e868fa8fdca702eb9db9d8afc46b5cb9144c9)
1 /*
2  * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
3  * Written by Alex Tomas <alex@clusterfs.com>
4  *
5  * Architecture independence:
6  *   Copyright (c) 2005, Bull S.A.
7  *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License version 2 as
11  * published by the Free Software Foundation.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public Licens
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-
21  */
22 
23 /*
24  * Extents support for EXT4
25  *
26  * TODO:
27  *   - ext4*_error() should be used in some situations
28  *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
29  *   - smart tree reduction
30  */
31 
32 #include <linux/module.h>
33 #include <linux/fs.h>
34 #include <linux/time.h>
35 #include <linux/jbd2.h>
36 #include <linux/highuid.h>
37 #include <linux/pagemap.h>
38 #include <linux/quotaops.h>
39 #include <linux/string.h>
40 #include <linux/slab.h>
41 #include <linux/falloc.h>
42 #include <asm/uaccess.h>
43 #include <linux/fiemap.h>
44 #include "ext4_jbd2.h"
45 #include "ext4_extents.h"
46 
47 #include <trace/events/ext4.h>
48 
49 static int ext4_split_extent(handle_t *handle,
50 				struct inode *inode,
51 				struct ext4_ext_path *path,
52 				struct ext4_map_blocks *map,
53 				int split_flag,
54 				int flags);
55 
56 static int ext4_ext_truncate_extend_restart(handle_t *handle,
57 					    struct inode *inode,
58 					    int needed)
59 {
60 	int err;
61 
62 	if (!ext4_handle_valid(handle))
63 		return 0;
64 	if (handle->h_buffer_credits > needed)
65 		return 0;
66 	err = ext4_journal_extend(handle, needed);
67 	if (err <= 0)
68 		return err;
69 	err = ext4_truncate_restart_trans(handle, inode, needed);
70 	if (err == 0)
71 		err = -EAGAIN;
72 
73 	return err;
74 }
75 
76 /*
77  * could return:
78  *  - EROFS
79  *  - ENOMEM
80  */
81 static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
82 				struct ext4_ext_path *path)
83 {
84 	if (path->p_bh) {
85 		/* path points to block */
86 		return ext4_journal_get_write_access(handle, path->p_bh);
87 	}
88 	/* path points to leaf/index in inode body */
89 	/* we use in-core data, no need to protect them */
90 	return 0;
91 }
92 
93 /*
94  * could return:
95  *  - EROFS
96  *  - ENOMEM
97  *  - EIO
98  */
99 #define ext4_ext_dirty(handle, inode, path) \
100 		__ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
101 static int __ext4_ext_dirty(const char *where, unsigned int line,
102 			    handle_t *handle, struct inode *inode,
103 			    struct ext4_ext_path *path)
104 {
105 	int err;
106 	if (path->p_bh) {
107 		/* path points to block */
108 		err = __ext4_handle_dirty_metadata(where, line, handle,
109 						   inode, path->p_bh);
110 	} else {
111 		/* path points to leaf/index in inode body */
112 		err = ext4_mark_inode_dirty(handle, inode);
113 	}
114 	return err;
115 }
116 
117 static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
118 			      struct ext4_ext_path *path,
119 			      ext4_lblk_t block)
120 {
121 	int depth;
122 
123 	if (path) {
124 		struct ext4_extent *ex;
125 		depth = path->p_depth;
126 
127 		/*
128 		 * Try to predict block placement assuming that we are
129 		 * filling in a file which will eventually be
130 		 * non-sparse --- i.e., in the case of libbfd writing
131 		 * an ELF object sections out-of-order but in a way
132 		 * the eventually results in a contiguous object or
133 		 * executable file, or some database extending a table
134 		 * space file.  However, this is actually somewhat
135 		 * non-ideal if we are writing a sparse file such as
136 		 * qemu or KVM writing a raw image file that is going
137 		 * to stay fairly sparse, since it will end up
138 		 * fragmenting the file system's free space.  Maybe we
139 		 * should have some hueristics or some way to allow
140 		 * userspace to pass a hint to file system,
141 		 * especially if the latter case turns out to be
142 		 * common.
143 		 */
144 		ex = path[depth].p_ext;
145 		if (ex) {
146 			ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
147 			ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
148 
149 			if (block > ext_block)
150 				return ext_pblk + (block - ext_block);
151 			else
152 				return ext_pblk - (ext_block - block);
153 		}
154 
155 		/* it looks like index is empty;
156 		 * try to find starting block from index itself */
157 		if (path[depth].p_bh)
158 			return path[depth].p_bh->b_blocknr;
159 	}
160 
161 	/* OK. use inode's group */
162 	return ext4_inode_to_goal_block(inode);
163 }
164 
165 /*
166  * Allocation for a meta data block
167  */
168 static ext4_fsblk_t
169 ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
170 			struct ext4_ext_path *path,
171 			struct ext4_extent *ex, int *err, unsigned int flags)
172 {
173 	ext4_fsblk_t goal, newblock;
174 
175 	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
176 	newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
177 					NULL, err);
178 	return newblock;
179 }
180 
181 static inline int ext4_ext_space_block(struct inode *inode, int check)
182 {
183 	int size;
184 
185 	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
186 			/ sizeof(struct ext4_extent);
187 	if (!check) {
188 #ifdef AGGRESSIVE_TEST
189 		if (size > 6)
190 			size = 6;
191 #endif
192 	}
193 	return size;
194 }
195 
196 static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
197 {
198 	int size;
199 
200 	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
201 			/ sizeof(struct ext4_extent_idx);
202 	if (!check) {
203 #ifdef AGGRESSIVE_TEST
204 		if (size > 5)
205 			size = 5;
206 #endif
207 	}
208 	return size;
209 }
210 
211 static inline int ext4_ext_space_root(struct inode *inode, int check)
212 {
213 	int size;
214 
215 	size = sizeof(EXT4_I(inode)->i_data);
216 	size -= sizeof(struct ext4_extent_header);
217 	size /= sizeof(struct ext4_extent);
218 	if (!check) {
219 #ifdef AGGRESSIVE_TEST
220 		if (size > 3)
221 			size = 3;
222 #endif
223 	}
224 	return size;
225 }
226 
227 static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
228 {
229 	int size;
230 
231 	size = sizeof(EXT4_I(inode)->i_data);
232 	size -= sizeof(struct ext4_extent_header);
233 	size /= sizeof(struct ext4_extent_idx);
234 	if (!check) {
235 #ifdef AGGRESSIVE_TEST
236 		if (size > 4)
237 			size = 4;
238 #endif
239 	}
240 	return size;
241 }
242 
243 /*
244  * Calculate the number of metadata blocks needed
245  * to allocate @blocks
246  * Worse case is one block per extent
247  */
248 int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
249 {
250 	struct ext4_inode_info *ei = EXT4_I(inode);
251 	int idxs, num = 0;
252 
253 	idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
254 		/ sizeof(struct ext4_extent_idx));
255 
256 	/*
257 	 * If the new delayed allocation block is contiguous with the
258 	 * previous da block, it can share index blocks with the
259 	 * previous block, so we only need to allocate a new index
260 	 * block every idxs leaf blocks.  At ldxs**2 blocks, we need
261 	 * an additional index block, and at ldxs**3 blocks, yet
262 	 * another index blocks.
263 	 */
264 	if (ei->i_da_metadata_calc_len &&
265 	    ei->i_da_metadata_calc_last_lblock+1 == lblock) {
266 		if ((ei->i_da_metadata_calc_len % idxs) == 0)
267 			num++;
268 		if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
269 			num++;
270 		if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
271 			num++;
272 			ei->i_da_metadata_calc_len = 0;
273 		} else
274 			ei->i_da_metadata_calc_len++;
275 		ei->i_da_metadata_calc_last_lblock++;
276 		return num;
277 	}
278 
279 	/*
280 	 * In the worst case we need a new set of index blocks at
281 	 * every level of the inode's extent tree.
282 	 */
283 	ei->i_da_metadata_calc_len = 1;
284 	ei->i_da_metadata_calc_last_lblock = lblock;
285 	return ext_depth(inode) + 1;
286 }
287 
288 static int
289 ext4_ext_max_entries(struct inode *inode, int depth)
290 {
291 	int max;
292 
293 	if (depth == ext_depth(inode)) {
294 		if (depth == 0)
295 			max = ext4_ext_space_root(inode, 1);
296 		else
297 			max = ext4_ext_space_root_idx(inode, 1);
298 	} else {
299 		if (depth == 0)
300 			max = ext4_ext_space_block(inode, 1);
301 		else
302 			max = ext4_ext_space_block_idx(inode, 1);
303 	}
304 
305 	return max;
306 }
307 
308 static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
309 {
310 	ext4_fsblk_t block = ext4_ext_pblock(ext);
311 	int len = ext4_ext_get_actual_len(ext);
312 
313 	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
314 }
315 
316 static int ext4_valid_extent_idx(struct inode *inode,
317 				struct ext4_extent_idx *ext_idx)
318 {
319 	ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
320 
321 	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
322 }
323 
324 static int ext4_valid_extent_entries(struct inode *inode,
325 				struct ext4_extent_header *eh,
326 				int depth)
327 {
328 	struct ext4_extent *ext;
329 	struct ext4_extent_idx *ext_idx;
330 	unsigned short entries;
331 	if (eh->eh_entries == 0)
332 		return 1;
333 
334 	entries = le16_to_cpu(eh->eh_entries);
335 
336 	if (depth == 0) {
337 		/* leaf entries */
338 		ext = EXT_FIRST_EXTENT(eh);
339 		while (entries) {
340 			if (!ext4_valid_extent(inode, ext))
341 				return 0;
342 			ext++;
343 			entries--;
344 		}
345 	} else {
346 		ext_idx = EXT_FIRST_INDEX(eh);
347 		while (entries) {
348 			if (!ext4_valid_extent_idx(inode, ext_idx))
349 				return 0;
350 			ext_idx++;
351 			entries--;
352 		}
353 	}
354 	return 1;
355 }
356 
357 static int __ext4_ext_check(const char *function, unsigned int line,
358 			    struct inode *inode, struct ext4_extent_header *eh,
359 			    int depth)
360 {
361 	const char *error_msg;
362 	int max = 0;
363 
364 	if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
365 		error_msg = "invalid magic";
366 		goto corrupted;
367 	}
368 	if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
369 		error_msg = "unexpected eh_depth";
370 		goto corrupted;
371 	}
372 	if (unlikely(eh->eh_max == 0)) {
373 		error_msg = "invalid eh_max";
374 		goto corrupted;
375 	}
376 	max = ext4_ext_max_entries(inode, depth);
377 	if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
378 		error_msg = "too large eh_max";
379 		goto corrupted;
380 	}
381 	if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
382 		error_msg = "invalid eh_entries";
383 		goto corrupted;
384 	}
385 	if (!ext4_valid_extent_entries(inode, eh, depth)) {
386 		error_msg = "invalid extent entries";
387 		goto corrupted;
388 	}
389 	return 0;
390 
391 corrupted:
392 	ext4_error_inode(inode, function, line, 0,
393 			"bad header/extent: %s - magic %x, "
394 			"entries %u, max %u(%u), depth %u(%u)",
395 			error_msg, le16_to_cpu(eh->eh_magic),
396 			le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
397 			max, le16_to_cpu(eh->eh_depth), depth);
398 
399 	return -EIO;
400 }
401 
402 #define ext4_ext_check(inode, eh, depth)	\
403 	__ext4_ext_check(__func__, __LINE__, inode, eh, depth)
404 
405 int ext4_ext_check_inode(struct inode *inode)
406 {
407 	return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
408 }
409 
410 #ifdef EXT_DEBUG
411 static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
412 {
413 	int k, l = path->p_depth;
414 
415 	ext_debug("path:");
416 	for (k = 0; k <= l; k++, path++) {
417 		if (path->p_idx) {
418 		  ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
419 			    ext4_idx_pblock(path->p_idx));
420 		} else if (path->p_ext) {
421 			ext_debug("  %d:[%d]%d:%llu ",
422 				  le32_to_cpu(path->p_ext->ee_block),
423 				  ext4_ext_is_uninitialized(path->p_ext),
424 				  ext4_ext_get_actual_len(path->p_ext),
425 				  ext4_ext_pblock(path->p_ext));
426 		} else
427 			ext_debug("  []");
428 	}
429 	ext_debug("\n");
430 }
431 
432 static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
433 {
434 	int depth = ext_depth(inode);
435 	struct ext4_extent_header *eh;
436 	struct ext4_extent *ex;
437 	int i;
438 
439 	if (!path)
440 		return;
441 
442 	eh = path[depth].p_hdr;
443 	ex = EXT_FIRST_EXTENT(eh);
444 
445 	ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
446 
447 	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
448 		ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
449 			  ext4_ext_is_uninitialized(ex),
450 			  ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
451 	}
452 	ext_debug("\n");
453 }
454 
455 static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
456 			ext4_fsblk_t newblock, int level)
457 {
458 	int depth = ext_depth(inode);
459 	struct ext4_extent *ex;
460 
461 	if (depth != level) {
462 		struct ext4_extent_idx *idx;
463 		idx = path[level].p_idx;
464 		while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
465 			ext_debug("%d: move %d:%llu in new index %llu\n", level,
466 					le32_to_cpu(idx->ei_block),
467 					ext4_idx_pblock(idx),
468 					newblock);
469 			idx++;
470 		}
471 
472 		return;
473 	}
474 
475 	ex = path[depth].p_ext;
476 	while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
477 		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
478 				le32_to_cpu(ex->ee_block),
479 				ext4_ext_pblock(ex),
480 				ext4_ext_is_uninitialized(ex),
481 				ext4_ext_get_actual_len(ex),
482 				newblock);
483 		ex++;
484 	}
485 }
486 
487 #else
488 #define ext4_ext_show_path(inode, path)
489 #define ext4_ext_show_leaf(inode, path)
490 #define ext4_ext_show_move(inode, path, newblock, level)
491 #endif
492 
493 void ext4_ext_drop_refs(struct ext4_ext_path *path)
494 {
495 	int depth = path->p_depth;
496 	int i;
497 
498 	for (i = 0; i <= depth; i++, path++)
499 		if (path->p_bh) {
500 			brelse(path->p_bh);
501 			path->p_bh = NULL;
502 		}
503 }
504 
505 /*
506  * ext4_ext_binsearch_idx:
507  * binary search for the closest index of the given block
508  * the header must be checked before calling this
509  */
510 static void
511 ext4_ext_binsearch_idx(struct inode *inode,
512 			struct ext4_ext_path *path, ext4_lblk_t block)
513 {
514 	struct ext4_extent_header *eh = path->p_hdr;
515 	struct ext4_extent_idx *r, *l, *m;
516 
517 
518 	ext_debug("binsearch for %u(idx):  ", block);
519 
520 	l = EXT_FIRST_INDEX(eh) + 1;
521 	r = EXT_LAST_INDEX(eh);
522 	while (l <= r) {
523 		m = l + (r - l) / 2;
524 		if (block < le32_to_cpu(m->ei_block))
525 			r = m - 1;
526 		else
527 			l = m + 1;
528 		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
529 				m, le32_to_cpu(m->ei_block),
530 				r, le32_to_cpu(r->ei_block));
531 	}
532 
533 	path->p_idx = l - 1;
534 	ext_debug("  -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
535 		  ext4_idx_pblock(path->p_idx));
536 
537 #ifdef CHECK_BINSEARCH
538 	{
539 		struct ext4_extent_idx *chix, *ix;
540 		int k;
541 
542 		chix = ix = EXT_FIRST_INDEX(eh);
543 		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
544 		  if (k != 0 &&
545 		      le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
546 				printk(KERN_DEBUG "k=%d, ix=0x%p, "
547 				       "first=0x%p\n", k,
548 				       ix, EXT_FIRST_INDEX(eh));
549 				printk(KERN_DEBUG "%u <= %u\n",
550 				       le32_to_cpu(ix->ei_block),
551 				       le32_to_cpu(ix[-1].ei_block));
552 			}
553 			BUG_ON(k && le32_to_cpu(ix->ei_block)
554 					   <= le32_to_cpu(ix[-1].ei_block));
555 			if (block < le32_to_cpu(ix->ei_block))
556 				break;
557 			chix = ix;
558 		}
559 		BUG_ON(chix != path->p_idx);
560 	}
561 #endif
562 
563 }
564 
565 /*
566  * ext4_ext_binsearch:
567  * binary search for closest extent of the given block
568  * the header must be checked before calling this
569  */
570 static void
571 ext4_ext_binsearch(struct inode *inode,
572 		struct ext4_ext_path *path, ext4_lblk_t block)
573 {
574 	struct ext4_extent_header *eh = path->p_hdr;
575 	struct ext4_extent *r, *l, *m;
576 
577 	if (eh->eh_entries == 0) {
578 		/*
579 		 * this leaf is empty:
580 		 * we get such a leaf in split/add case
581 		 */
582 		return;
583 	}
584 
585 	ext_debug("binsearch for %u:  ", block);
586 
587 	l = EXT_FIRST_EXTENT(eh) + 1;
588 	r = EXT_LAST_EXTENT(eh);
589 
590 	while (l <= r) {
591 		m = l + (r - l) / 2;
592 		if (block < le32_to_cpu(m->ee_block))
593 			r = m - 1;
594 		else
595 			l = m + 1;
596 		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
597 				m, le32_to_cpu(m->ee_block),
598 				r, le32_to_cpu(r->ee_block));
599 	}
600 
601 	path->p_ext = l - 1;
602 	ext_debug("  -> %d:%llu:[%d]%d ",
603 			le32_to_cpu(path->p_ext->ee_block),
604 			ext4_ext_pblock(path->p_ext),
605 			ext4_ext_is_uninitialized(path->p_ext),
606 			ext4_ext_get_actual_len(path->p_ext));
607 
608 #ifdef CHECK_BINSEARCH
609 	{
610 		struct ext4_extent *chex, *ex;
611 		int k;
612 
613 		chex = ex = EXT_FIRST_EXTENT(eh);
614 		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
615 			BUG_ON(k && le32_to_cpu(ex->ee_block)
616 					  <= le32_to_cpu(ex[-1].ee_block));
617 			if (block < le32_to_cpu(ex->ee_block))
618 				break;
619 			chex = ex;
620 		}
621 		BUG_ON(chex != path->p_ext);
622 	}
623 #endif
624 
625 }
626 
627 int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
628 {
629 	struct ext4_extent_header *eh;
630 
631 	eh = ext_inode_hdr(inode);
632 	eh->eh_depth = 0;
633 	eh->eh_entries = 0;
634 	eh->eh_magic = EXT4_EXT_MAGIC;
635 	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
636 	ext4_mark_inode_dirty(handle, inode);
637 	ext4_ext_invalidate_cache(inode);
638 	return 0;
639 }
640 
641 struct ext4_ext_path *
642 ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
643 					struct ext4_ext_path *path)
644 {
645 	struct ext4_extent_header *eh;
646 	struct buffer_head *bh;
647 	short int depth, i, ppos = 0, alloc = 0;
648 
649 	eh = ext_inode_hdr(inode);
650 	depth = ext_depth(inode);
651 
652 	/* account possible depth increase */
653 	if (!path) {
654 		path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
655 				GFP_NOFS);
656 		if (!path)
657 			return ERR_PTR(-ENOMEM);
658 		alloc = 1;
659 	}
660 	path[0].p_hdr = eh;
661 	path[0].p_bh = NULL;
662 
663 	i = depth;
664 	/* walk through the tree */
665 	while (i) {
666 		int need_to_validate = 0;
667 
668 		ext_debug("depth %d: num %d, max %d\n",
669 			  ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
670 
671 		ext4_ext_binsearch_idx(inode, path + ppos, block);
672 		path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
673 		path[ppos].p_depth = i;
674 		path[ppos].p_ext = NULL;
675 
676 		bh = sb_getblk(inode->i_sb, path[ppos].p_block);
677 		if (unlikely(!bh))
678 			goto err;
679 		if (!bh_uptodate_or_lock(bh)) {
680 			trace_ext4_ext_load_extent(inode, block,
681 						path[ppos].p_block);
682 			if (bh_submit_read(bh) < 0) {
683 				put_bh(bh);
684 				goto err;
685 			}
686 			/* validate the extent entries */
687 			need_to_validate = 1;
688 		}
689 		eh = ext_block_hdr(bh);
690 		ppos++;
691 		if (unlikely(ppos > depth)) {
692 			put_bh(bh);
693 			EXT4_ERROR_INODE(inode,
694 					 "ppos %d > depth %d", ppos, depth);
695 			goto err;
696 		}
697 		path[ppos].p_bh = bh;
698 		path[ppos].p_hdr = eh;
699 		i--;
700 
701 		if (need_to_validate && ext4_ext_check(inode, eh, i))
702 			goto err;
703 	}
704 
705 	path[ppos].p_depth = i;
706 	path[ppos].p_ext = NULL;
707 	path[ppos].p_idx = NULL;
708 
709 	/* find extent */
710 	ext4_ext_binsearch(inode, path + ppos, block);
711 	/* if not an empty leaf */
712 	if (path[ppos].p_ext)
713 		path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
714 
715 	ext4_ext_show_path(inode, path);
716 
717 	return path;
718 
719 err:
720 	ext4_ext_drop_refs(path);
721 	if (alloc)
722 		kfree(path);
723 	return ERR_PTR(-EIO);
724 }
725 
726 /*
727  * ext4_ext_insert_index:
728  * insert new index [@logical;@ptr] into the block at @curp;
729  * check where to insert: before @curp or after @curp
730  */
731 static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
732 				 struct ext4_ext_path *curp,
733 				 int logical, ext4_fsblk_t ptr)
734 {
735 	struct ext4_extent_idx *ix;
736 	int len, err;
737 
738 	err = ext4_ext_get_access(handle, inode, curp);
739 	if (err)
740 		return err;
741 
742 	if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
743 		EXT4_ERROR_INODE(inode,
744 				 "logical %d == ei_block %d!",
745 				 logical, le32_to_cpu(curp->p_idx->ei_block));
746 		return -EIO;
747 	}
748 
749 	if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
750 			     >= le16_to_cpu(curp->p_hdr->eh_max))) {
751 		EXT4_ERROR_INODE(inode,
752 				 "eh_entries %d >= eh_max %d!",
753 				 le16_to_cpu(curp->p_hdr->eh_entries),
754 				 le16_to_cpu(curp->p_hdr->eh_max));
755 		return -EIO;
756 	}
757 
758 	len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
759 	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
760 		/* insert after */
761 		if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
762 			len = (len - 1) * sizeof(struct ext4_extent_idx);
763 			len = len < 0 ? 0 : len;
764 			ext_debug("insert new index %d after: %llu. "
765 					"move %d from 0x%p to 0x%p\n",
766 					logical, ptr, len,
767 					(curp->p_idx + 1), (curp->p_idx + 2));
768 			memmove(curp->p_idx + 2, curp->p_idx + 1, len);
769 		}
770 		ix = curp->p_idx + 1;
771 	} else {
772 		/* insert before */
773 		len = len * sizeof(struct ext4_extent_idx);
774 		len = len < 0 ? 0 : len;
775 		ext_debug("insert new index %d before: %llu. "
776 				"move %d from 0x%p to 0x%p\n",
777 				logical, ptr, len,
778 				curp->p_idx, (curp->p_idx + 1));
779 		memmove(curp->p_idx + 1, curp->p_idx, len);
780 		ix = curp->p_idx;
781 	}
782 
783 	ix->ei_block = cpu_to_le32(logical);
784 	ext4_idx_store_pblock(ix, ptr);
785 	le16_add_cpu(&curp->p_hdr->eh_entries, 1);
786 
787 	if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
788 		EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
789 		return -EIO;
790 	}
791 
792 	err = ext4_ext_dirty(handle, inode, curp);
793 	ext4_std_error(inode->i_sb, err);
794 
795 	return err;
796 }
797 
798 /*
799  * ext4_ext_split:
800  * inserts new subtree into the path, using free index entry
801  * at depth @at:
802  * - allocates all needed blocks (new leaf and all intermediate index blocks)
803  * - makes decision where to split
804  * - moves remaining extents and index entries (right to the split point)
805  *   into the newly allocated blocks
806  * - initializes subtree
807  */
808 static int ext4_ext_split(handle_t *handle, struct inode *inode,
809 			  unsigned int flags,
810 			  struct ext4_ext_path *path,
811 			  struct ext4_extent *newext, int at)
812 {
813 	struct buffer_head *bh = NULL;
814 	int depth = ext_depth(inode);
815 	struct ext4_extent_header *neh;
816 	struct ext4_extent_idx *fidx;
817 	int i = at, k, m, a;
818 	ext4_fsblk_t newblock, oldblock;
819 	__le32 border;
820 	ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
821 	int err = 0;
822 
823 	/* make decision: where to split? */
824 	/* FIXME: now decision is simplest: at current extent */
825 
826 	/* if current leaf will be split, then we should use
827 	 * border from split point */
828 	if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
829 		EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
830 		return -EIO;
831 	}
832 	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
833 		border = path[depth].p_ext[1].ee_block;
834 		ext_debug("leaf will be split."
835 				" next leaf starts at %d\n",
836 				  le32_to_cpu(border));
837 	} else {
838 		border = newext->ee_block;
839 		ext_debug("leaf will be added."
840 				" next leaf starts at %d\n",
841 				le32_to_cpu(border));
842 	}
843 
844 	/*
845 	 * If error occurs, then we break processing
846 	 * and mark filesystem read-only. index won't
847 	 * be inserted and tree will be in consistent
848 	 * state. Next mount will repair buffers too.
849 	 */
850 
851 	/*
852 	 * Get array to track all allocated blocks.
853 	 * We need this to handle errors and free blocks
854 	 * upon them.
855 	 */
856 	ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
857 	if (!ablocks)
858 		return -ENOMEM;
859 
860 	/* allocate all needed blocks */
861 	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
862 	for (a = 0; a < depth - at; a++) {
863 		newblock = ext4_ext_new_meta_block(handle, inode, path,
864 						   newext, &err, flags);
865 		if (newblock == 0)
866 			goto cleanup;
867 		ablocks[a] = newblock;
868 	}
869 
870 	/* initialize new leaf */
871 	newblock = ablocks[--a];
872 	if (unlikely(newblock == 0)) {
873 		EXT4_ERROR_INODE(inode, "newblock == 0!");
874 		err = -EIO;
875 		goto cleanup;
876 	}
877 	bh = sb_getblk(inode->i_sb, newblock);
878 	if (!bh) {
879 		err = -EIO;
880 		goto cleanup;
881 	}
882 	lock_buffer(bh);
883 
884 	err = ext4_journal_get_create_access(handle, bh);
885 	if (err)
886 		goto cleanup;
887 
888 	neh = ext_block_hdr(bh);
889 	neh->eh_entries = 0;
890 	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
891 	neh->eh_magic = EXT4_EXT_MAGIC;
892 	neh->eh_depth = 0;
893 
894 	/* move remainder of path[depth] to the new leaf */
895 	if (unlikely(path[depth].p_hdr->eh_entries !=
896 		     path[depth].p_hdr->eh_max)) {
897 		EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
898 				 path[depth].p_hdr->eh_entries,
899 				 path[depth].p_hdr->eh_max);
900 		err = -EIO;
901 		goto cleanup;
902 	}
903 	/* start copy from next extent */
904 	m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
905 	ext4_ext_show_move(inode, path, newblock, depth);
906 	if (m) {
907 		struct ext4_extent *ex;
908 		ex = EXT_FIRST_EXTENT(neh);
909 		memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
910 		le16_add_cpu(&neh->eh_entries, m);
911 	}
912 
913 	set_buffer_uptodate(bh);
914 	unlock_buffer(bh);
915 
916 	err = ext4_handle_dirty_metadata(handle, inode, bh);
917 	if (err)
918 		goto cleanup;
919 	brelse(bh);
920 	bh = NULL;
921 
922 	/* correct old leaf */
923 	if (m) {
924 		err = ext4_ext_get_access(handle, inode, path + depth);
925 		if (err)
926 			goto cleanup;
927 		le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
928 		err = ext4_ext_dirty(handle, inode, path + depth);
929 		if (err)
930 			goto cleanup;
931 
932 	}
933 
934 	/* create intermediate indexes */
935 	k = depth - at - 1;
936 	if (unlikely(k < 0)) {
937 		EXT4_ERROR_INODE(inode, "k %d < 0!", k);
938 		err = -EIO;
939 		goto cleanup;
940 	}
941 	if (k)
942 		ext_debug("create %d intermediate indices\n", k);
943 	/* insert new index into current index block */
944 	/* current depth stored in i var */
945 	i = depth - 1;
946 	while (k--) {
947 		oldblock = newblock;
948 		newblock = ablocks[--a];
949 		bh = sb_getblk(inode->i_sb, newblock);
950 		if (!bh) {
951 			err = -EIO;
952 			goto cleanup;
953 		}
954 		lock_buffer(bh);
955 
956 		err = ext4_journal_get_create_access(handle, bh);
957 		if (err)
958 			goto cleanup;
959 
960 		neh = ext_block_hdr(bh);
961 		neh->eh_entries = cpu_to_le16(1);
962 		neh->eh_magic = EXT4_EXT_MAGIC;
963 		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
964 		neh->eh_depth = cpu_to_le16(depth - i);
965 		fidx = EXT_FIRST_INDEX(neh);
966 		fidx->ei_block = border;
967 		ext4_idx_store_pblock(fidx, oldblock);
968 
969 		ext_debug("int.index at %d (block %llu): %u -> %llu\n",
970 				i, newblock, le32_to_cpu(border), oldblock);
971 
972 		/* move remainder of path[i] to the new index block */
973 		if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
974 					EXT_LAST_INDEX(path[i].p_hdr))) {
975 			EXT4_ERROR_INODE(inode,
976 					 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
977 					 le32_to_cpu(path[i].p_ext->ee_block));
978 			err = -EIO;
979 			goto cleanup;
980 		}
981 		/* start copy indexes */
982 		m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
983 		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
984 				EXT_MAX_INDEX(path[i].p_hdr));
985 		ext4_ext_show_move(inode, path, newblock, i);
986 		if (m) {
987 			memmove(++fidx, path[i].p_idx,
988 				sizeof(struct ext4_extent_idx) * m);
989 			le16_add_cpu(&neh->eh_entries, m);
990 		}
991 		set_buffer_uptodate(bh);
992 		unlock_buffer(bh);
993 
994 		err = ext4_handle_dirty_metadata(handle, inode, bh);
995 		if (err)
996 			goto cleanup;
997 		brelse(bh);
998 		bh = NULL;
999 
1000 		/* correct old index */
1001 		if (m) {
1002 			err = ext4_ext_get_access(handle, inode, path + i);
1003 			if (err)
1004 				goto cleanup;
1005 			le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
1006 			err = ext4_ext_dirty(handle, inode, path + i);
1007 			if (err)
1008 				goto cleanup;
1009 		}
1010 
1011 		i--;
1012 	}
1013 
1014 	/* insert new index */
1015 	err = ext4_ext_insert_index(handle, inode, path + at,
1016 				    le32_to_cpu(border), newblock);
1017 
1018 cleanup:
1019 	if (bh) {
1020 		if (buffer_locked(bh))
1021 			unlock_buffer(bh);
1022 		brelse(bh);
1023 	}
1024 
1025 	if (err) {
1026 		/* free all allocated blocks in error case */
1027 		for (i = 0; i < depth; i++) {
1028 			if (!ablocks[i])
1029 				continue;
1030 			ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
1031 					 EXT4_FREE_BLOCKS_METADATA);
1032 		}
1033 	}
1034 	kfree(ablocks);
1035 
1036 	return err;
1037 }
1038 
1039 /*
1040  * ext4_ext_grow_indepth:
1041  * implements tree growing procedure:
1042  * - allocates new block
1043  * - moves top-level data (index block or leaf) into the new block
1044  * - initializes new top-level, creating index that points to the
1045  *   just created block
1046  */
1047 static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1048 				 unsigned int flags,
1049 				 struct ext4_ext_path *path,
1050 				 struct ext4_extent *newext)
1051 {
1052 	struct ext4_ext_path *curp = path;
1053 	struct ext4_extent_header *neh;
1054 	struct buffer_head *bh;
1055 	ext4_fsblk_t newblock;
1056 	int err = 0;
1057 
1058 	newblock = ext4_ext_new_meta_block(handle, inode, path,
1059 		newext, &err, flags);
1060 	if (newblock == 0)
1061 		return err;
1062 
1063 	bh = sb_getblk(inode->i_sb, newblock);
1064 	if (!bh) {
1065 		err = -EIO;
1066 		ext4_std_error(inode->i_sb, err);
1067 		return err;
1068 	}
1069 	lock_buffer(bh);
1070 
1071 	err = ext4_journal_get_create_access(handle, bh);
1072 	if (err) {
1073 		unlock_buffer(bh);
1074 		goto out;
1075 	}
1076 
1077 	/* move top-level index/leaf into new block */
1078 	memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data));
1079 
1080 	/* set size of new block */
1081 	neh = ext_block_hdr(bh);
1082 	/* old root could have indexes or leaves
1083 	 * so calculate e_max right way */
1084 	if (ext_depth(inode))
1085 		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1086 	else
1087 		neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1088 	neh->eh_magic = EXT4_EXT_MAGIC;
1089 	set_buffer_uptodate(bh);
1090 	unlock_buffer(bh);
1091 
1092 	err = ext4_handle_dirty_metadata(handle, inode, bh);
1093 	if (err)
1094 		goto out;
1095 
1096 	/* create index in new top-level index: num,max,pointer */
1097 	err = ext4_ext_get_access(handle, inode, curp);
1098 	if (err)
1099 		goto out;
1100 
1101 	curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
1102 	curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
1103 	curp->p_hdr->eh_entries = cpu_to_le16(1);
1104 	curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
1105 
1106 	if (path[0].p_hdr->eh_depth)
1107 		curp->p_idx->ei_block =
1108 			EXT_FIRST_INDEX(path[0].p_hdr)->ei_block;
1109 	else
1110 		curp->p_idx->ei_block =
1111 			EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
1112 	ext4_idx_store_pblock(curp->p_idx, newblock);
1113 
1114 	neh = ext_inode_hdr(inode);
1115 	ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
1116 		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
1117 		  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
1118 		  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
1119 
1120 	neh->eh_depth = cpu_to_le16(path->p_depth + 1);
1121 	err = ext4_ext_dirty(handle, inode, curp);
1122 out:
1123 	brelse(bh);
1124 
1125 	return err;
1126 }
1127 
1128 /*
1129  * ext4_ext_create_new_leaf:
1130  * finds empty index and adds new leaf.
1131  * if no free index is found, then it requests in-depth growing.
1132  */
1133 static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
1134 				    unsigned int flags,
1135 				    struct ext4_ext_path *path,
1136 				    struct ext4_extent *newext)
1137 {
1138 	struct ext4_ext_path *curp;
1139 	int depth, i, err = 0;
1140 
1141 repeat:
1142 	i = depth = ext_depth(inode);
1143 
1144 	/* walk up to the tree and look for free index entry */
1145 	curp = path + depth;
1146 	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
1147 		i--;
1148 		curp--;
1149 	}
1150 
1151 	/* we use already allocated block for index block,
1152 	 * so subsequent data blocks should be contiguous */
1153 	if (EXT_HAS_FREE_INDEX(curp)) {
1154 		/* if we found index with free entry, then use that
1155 		 * entry: create all needed subtree and add new leaf */
1156 		err = ext4_ext_split(handle, inode, flags, path, newext, i);
1157 		if (err)
1158 			goto out;
1159 
1160 		/* refill path */
1161 		ext4_ext_drop_refs(path);
1162 		path = ext4_ext_find_extent(inode,
1163 				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1164 				    path);
1165 		if (IS_ERR(path))
1166 			err = PTR_ERR(path);
1167 	} else {
1168 		/* tree is full, time to grow in depth */
1169 		err = ext4_ext_grow_indepth(handle, inode, flags,
1170 					    path, newext);
1171 		if (err)
1172 			goto out;
1173 
1174 		/* refill path */
1175 		ext4_ext_drop_refs(path);
1176 		path = ext4_ext_find_extent(inode,
1177 				   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1178 				    path);
1179 		if (IS_ERR(path)) {
1180 			err = PTR_ERR(path);
1181 			goto out;
1182 		}
1183 
1184 		/*
1185 		 * only first (depth 0 -> 1) produces free space;
1186 		 * in all other cases we have to split the grown tree
1187 		 */
1188 		depth = ext_depth(inode);
1189 		if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
1190 			/* now we need to split */
1191 			goto repeat;
1192 		}
1193 	}
1194 
1195 out:
1196 	return err;
1197 }
1198 
1199 /*
1200  * search the closest allocated block to the left for *logical
1201  * and returns it at @logical + it's physical address at @phys
1202  * if *logical is the smallest allocated block, the function
1203  * returns 0 at @phys
1204  * return value contains 0 (success) or error code
1205  */
1206 static int ext4_ext_search_left(struct inode *inode,
1207 				struct ext4_ext_path *path,
1208 				ext4_lblk_t *logical, ext4_fsblk_t *phys)
1209 {
1210 	struct ext4_extent_idx *ix;
1211 	struct ext4_extent *ex;
1212 	int depth, ee_len;
1213 
1214 	if (unlikely(path == NULL)) {
1215 		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1216 		return -EIO;
1217 	}
1218 	depth = path->p_depth;
1219 	*phys = 0;
1220 
1221 	if (depth == 0 && path->p_ext == NULL)
1222 		return 0;
1223 
1224 	/* usually extent in the path covers blocks smaller
1225 	 * then *logical, but it can be that extent is the
1226 	 * first one in the file */
1227 
1228 	ex = path[depth].p_ext;
1229 	ee_len = ext4_ext_get_actual_len(ex);
1230 	if (*logical < le32_to_cpu(ex->ee_block)) {
1231 		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1232 			EXT4_ERROR_INODE(inode,
1233 					 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
1234 					 *logical, le32_to_cpu(ex->ee_block));
1235 			return -EIO;
1236 		}
1237 		while (--depth >= 0) {
1238 			ix = path[depth].p_idx;
1239 			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1240 				EXT4_ERROR_INODE(inode,
1241 				  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
1242 				  ix != NULL ? ix->ei_block : 0,
1243 				  EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
1244 				    EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block : 0,
1245 				  depth);
1246 				return -EIO;
1247 			}
1248 		}
1249 		return 0;
1250 	}
1251 
1252 	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1253 		EXT4_ERROR_INODE(inode,
1254 				 "logical %d < ee_block %d + ee_len %d!",
1255 				 *logical, le32_to_cpu(ex->ee_block), ee_len);
1256 		return -EIO;
1257 	}
1258 
1259 	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1260 	*phys = ext4_ext_pblock(ex) + ee_len - 1;
1261 	return 0;
1262 }
1263 
1264 /*
1265  * search the closest allocated block to the right for *logical
1266  * and returns it at @logical + it's physical address at @phys
1267  * if *logical is the smallest allocated block, the function
1268  * returns 0 at @phys
1269  * return value contains 0 (success) or error code
1270  */
1271 static int ext4_ext_search_right(struct inode *inode,
1272 				 struct ext4_ext_path *path,
1273 				 ext4_lblk_t *logical, ext4_fsblk_t *phys)
1274 {
1275 	struct buffer_head *bh = NULL;
1276 	struct ext4_extent_header *eh;
1277 	struct ext4_extent_idx *ix;
1278 	struct ext4_extent *ex;
1279 	ext4_fsblk_t block;
1280 	int depth;	/* Note, NOT eh_depth; depth from top of tree */
1281 	int ee_len;
1282 
1283 	if (unlikely(path == NULL)) {
1284 		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1285 		return -EIO;
1286 	}
1287 	depth = path->p_depth;
1288 	*phys = 0;
1289 
1290 	if (depth == 0 && path->p_ext == NULL)
1291 		return 0;
1292 
1293 	/* usually extent in the path covers blocks smaller
1294 	 * then *logical, but it can be that extent is the
1295 	 * first one in the file */
1296 
1297 	ex = path[depth].p_ext;
1298 	ee_len = ext4_ext_get_actual_len(ex);
1299 	if (*logical < le32_to_cpu(ex->ee_block)) {
1300 		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1301 			EXT4_ERROR_INODE(inode,
1302 					 "first_extent(path[%d].p_hdr) != ex",
1303 					 depth);
1304 			return -EIO;
1305 		}
1306 		while (--depth >= 0) {
1307 			ix = path[depth].p_idx;
1308 			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1309 				EXT4_ERROR_INODE(inode,
1310 						 "ix != EXT_FIRST_INDEX *logical %d!",
1311 						 *logical);
1312 				return -EIO;
1313 			}
1314 		}
1315 		*logical = le32_to_cpu(ex->ee_block);
1316 		*phys = ext4_ext_pblock(ex);
1317 		return 0;
1318 	}
1319 
1320 	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1321 		EXT4_ERROR_INODE(inode,
1322 				 "logical %d < ee_block %d + ee_len %d!",
1323 				 *logical, le32_to_cpu(ex->ee_block), ee_len);
1324 		return -EIO;
1325 	}
1326 
1327 	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1328 		/* next allocated block in this leaf */
1329 		ex++;
1330 		*logical = le32_to_cpu(ex->ee_block);
1331 		*phys = ext4_ext_pblock(ex);
1332 		return 0;
1333 	}
1334 
1335 	/* go up and search for index to the right */
1336 	while (--depth >= 0) {
1337 		ix = path[depth].p_idx;
1338 		if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1339 			goto got_index;
1340 	}
1341 
1342 	/* we've gone up to the root and found no index to the right */
1343 	return 0;
1344 
1345 got_index:
1346 	/* we've found index to the right, let's
1347 	 * follow it and find the closest allocated
1348 	 * block to the right */
1349 	ix++;
1350 	block = ext4_idx_pblock(ix);
1351 	while (++depth < path->p_depth) {
1352 		bh = sb_bread(inode->i_sb, block);
1353 		if (bh == NULL)
1354 			return -EIO;
1355 		eh = ext_block_hdr(bh);
1356 		/* subtract from p_depth to get proper eh_depth */
1357 		if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
1358 			put_bh(bh);
1359 			return -EIO;
1360 		}
1361 		ix = EXT_FIRST_INDEX(eh);
1362 		block = ext4_idx_pblock(ix);
1363 		put_bh(bh);
1364 	}
1365 
1366 	bh = sb_bread(inode->i_sb, block);
1367 	if (bh == NULL)
1368 		return -EIO;
1369 	eh = ext_block_hdr(bh);
1370 	if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
1371 		put_bh(bh);
1372 		return -EIO;
1373 	}
1374 	ex = EXT_FIRST_EXTENT(eh);
1375 	*logical = le32_to_cpu(ex->ee_block);
1376 	*phys = ext4_ext_pblock(ex);
1377 	put_bh(bh);
1378 	return 0;
1379 }
1380 
1381 /*
1382  * ext4_ext_next_allocated_block:
1383  * returns allocated block in subsequent extent or EXT_MAX_BLOCKS.
1384  * NOTE: it considers block number from index entry as
1385  * allocated block. Thus, index entries have to be consistent
1386  * with leaves.
1387  */
1388 static ext4_lblk_t
1389 ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1390 {
1391 	int depth;
1392 
1393 	BUG_ON(path == NULL);
1394 	depth = path->p_depth;
1395 
1396 	if (depth == 0 && path->p_ext == NULL)
1397 		return EXT_MAX_BLOCKS;
1398 
1399 	while (depth >= 0) {
1400 		if (depth == path->p_depth) {
1401 			/* leaf */
1402 			if (path[depth].p_ext !=
1403 					EXT_LAST_EXTENT(path[depth].p_hdr))
1404 			  return le32_to_cpu(path[depth].p_ext[1].ee_block);
1405 		} else {
1406 			/* index */
1407 			if (path[depth].p_idx !=
1408 					EXT_LAST_INDEX(path[depth].p_hdr))
1409 			  return le32_to_cpu(path[depth].p_idx[1].ei_block);
1410 		}
1411 		depth--;
1412 	}
1413 
1414 	return EXT_MAX_BLOCKS;
1415 }
1416 
1417 /*
1418  * ext4_ext_next_leaf_block:
1419  * returns first allocated block from next leaf or EXT_MAX_BLOCKS
1420  */
1421 static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
1422 {
1423 	int depth;
1424 
1425 	BUG_ON(path == NULL);
1426 	depth = path->p_depth;
1427 
1428 	/* zero-tree has no leaf blocks at all */
1429 	if (depth == 0)
1430 		return EXT_MAX_BLOCKS;
1431 
1432 	/* go to index block */
1433 	depth--;
1434 
1435 	while (depth >= 0) {
1436 		if (path[depth].p_idx !=
1437 				EXT_LAST_INDEX(path[depth].p_hdr))
1438 			return (ext4_lblk_t)
1439 				le32_to_cpu(path[depth].p_idx[1].ei_block);
1440 		depth--;
1441 	}
1442 
1443 	return EXT_MAX_BLOCKS;
1444 }
1445 
1446 /*
1447  * ext4_ext_correct_indexes:
1448  * if leaf gets modified and modified extent is first in the leaf,
1449  * then we have to correct all indexes above.
1450  * TODO: do we need to correct tree in all cases?
1451  */
1452 static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1453 				struct ext4_ext_path *path)
1454 {
1455 	struct ext4_extent_header *eh;
1456 	int depth = ext_depth(inode);
1457 	struct ext4_extent *ex;
1458 	__le32 border;
1459 	int k, err = 0;
1460 
1461 	eh = path[depth].p_hdr;
1462 	ex = path[depth].p_ext;
1463 
1464 	if (unlikely(ex == NULL || eh == NULL)) {
1465 		EXT4_ERROR_INODE(inode,
1466 				 "ex %p == NULL or eh %p == NULL", ex, eh);
1467 		return -EIO;
1468 	}
1469 
1470 	if (depth == 0) {
1471 		/* there is no tree at all */
1472 		return 0;
1473 	}
1474 
1475 	if (ex != EXT_FIRST_EXTENT(eh)) {
1476 		/* we correct tree if first leaf got modified only */
1477 		return 0;
1478 	}
1479 
1480 	/*
1481 	 * TODO: we need correction if border is smaller than current one
1482 	 */
1483 	k = depth - 1;
1484 	border = path[depth].p_ext->ee_block;
1485 	err = ext4_ext_get_access(handle, inode, path + k);
1486 	if (err)
1487 		return err;
1488 	path[k].p_idx->ei_block = border;
1489 	err = ext4_ext_dirty(handle, inode, path + k);
1490 	if (err)
1491 		return err;
1492 
1493 	while (k--) {
1494 		/* change all left-side indexes */
1495 		if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1496 			break;
1497 		err = ext4_ext_get_access(handle, inode, path + k);
1498 		if (err)
1499 			break;
1500 		path[k].p_idx->ei_block = border;
1501 		err = ext4_ext_dirty(handle, inode, path + k);
1502 		if (err)
1503 			break;
1504 	}
1505 
1506 	return err;
1507 }
1508 
1509 int
1510 ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1511 				struct ext4_extent *ex2)
1512 {
1513 	unsigned short ext1_ee_len, ext2_ee_len, max_len;
1514 
1515 	/*
1516 	 * Make sure that either both extents are uninitialized, or
1517 	 * both are _not_.
1518 	 */
1519 	if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
1520 		return 0;
1521 
1522 	if (ext4_ext_is_uninitialized(ex1))
1523 		max_len = EXT_UNINIT_MAX_LEN;
1524 	else
1525 		max_len = EXT_INIT_MAX_LEN;
1526 
1527 	ext1_ee_len = ext4_ext_get_actual_len(ex1);
1528 	ext2_ee_len = ext4_ext_get_actual_len(ex2);
1529 
1530 	if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
1531 			le32_to_cpu(ex2->ee_block))
1532 		return 0;
1533 
1534 	/*
1535 	 * To allow future support for preallocated extents to be added
1536 	 * as an RO_COMPAT feature, refuse to merge to extents if
1537 	 * this can result in the top bit of ee_len being set.
1538 	 */
1539 	if (ext1_ee_len + ext2_ee_len > max_len)
1540 		return 0;
1541 #ifdef AGGRESSIVE_TEST
1542 	if (ext1_ee_len >= 4)
1543 		return 0;
1544 #endif
1545 
1546 	if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
1547 		return 1;
1548 	return 0;
1549 }
1550 
1551 /*
1552  * This function tries to merge the "ex" extent to the next extent in the tree.
1553  * It always tries to merge towards right. If you want to merge towards
1554  * left, pass "ex - 1" as argument instead of "ex".
1555  * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1556  * 1 if they got merged.
1557  */
1558 static int ext4_ext_try_to_merge_right(struct inode *inode,
1559 				 struct ext4_ext_path *path,
1560 				 struct ext4_extent *ex)
1561 {
1562 	struct ext4_extent_header *eh;
1563 	unsigned int depth, len;
1564 	int merge_done = 0;
1565 	int uninitialized = 0;
1566 
1567 	depth = ext_depth(inode);
1568 	BUG_ON(path[depth].p_hdr == NULL);
1569 	eh = path[depth].p_hdr;
1570 
1571 	while (ex < EXT_LAST_EXTENT(eh)) {
1572 		if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1573 			break;
1574 		/* merge with next extent! */
1575 		if (ext4_ext_is_uninitialized(ex))
1576 			uninitialized = 1;
1577 		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1578 				+ ext4_ext_get_actual_len(ex + 1));
1579 		if (uninitialized)
1580 			ext4_ext_mark_uninitialized(ex);
1581 
1582 		if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1583 			len = (EXT_LAST_EXTENT(eh) - ex - 1)
1584 				* sizeof(struct ext4_extent);
1585 			memmove(ex + 1, ex + 2, len);
1586 		}
1587 		le16_add_cpu(&eh->eh_entries, -1);
1588 		merge_done = 1;
1589 		WARN_ON(eh->eh_entries == 0);
1590 		if (!eh->eh_entries)
1591 			EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
1592 	}
1593 
1594 	return merge_done;
1595 }
1596 
1597 /*
1598  * This function tries to merge the @ex extent to neighbours in the tree.
1599  * return 1 if merge left else 0.
1600  */
1601 static int ext4_ext_try_to_merge(struct inode *inode,
1602 				  struct ext4_ext_path *path,
1603 				  struct ext4_extent *ex) {
1604 	struct ext4_extent_header *eh;
1605 	unsigned int depth;
1606 	int merge_done = 0;
1607 	int ret = 0;
1608 
1609 	depth = ext_depth(inode);
1610 	BUG_ON(path[depth].p_hdr == NULL);
1611 	eh = path[depth].p_hdr;
1612 
1613 	if (ex > EXT_FIRST_EXTENT(eh))
1614 		merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
1615 
1616 	if (!merge_done)
1617 		ret = ext4_ext_try_to_merge_right(inode, path, ex);
1618 
1619 	return ret;
1620 }
1621 
1622 /*
1623  * check if a portion of the "newext" extent overlaps with an
1624  * existing extent.
1625  *
1626  * If there is an overlap discovered, it updates the length of the newext
1627  * such that there will be no overlap, and then returns 1.
1628  * If there is no overlap found, it returns 0.
1629  */
1630 static unsigned int ext4_ext_check_overlap(struct inode *inode,
1631 					   struct ext4_extent *newext,
1632 					   struct ext4_ext_path *path)
1633 {
1634 	ext4_lblk_t b1, b2;
1635 	unsigned int depth, len1;
1636 	unsigned int ret = 0;
1637 
1638 	b1 = le32_to_cpu(newext->ee_block);
1639 	len1 = ext4_ext_get_actual_len(newext);
1640 	depth = ext_depth(inode);
1641 	if (!path[depth].p_ext)
1642 		goto out;
1643 	b2 = le32_to_cpu(path[depth].p_ext->ee_block);
1644 
1645 	/*
1646 	 * get the next allocated block if the extent in the path
1647 	 * is before the requested block(s)
1648 	 */
1649 	if (b2 < b1) {
1650 		b2 = ext4_ext_next_allocated_block(path);
1651 		if (b2 == EXT_MAX_BLOCKS)
1652 			goto out;
1653 	}
1654 
1655 	/* check for wrap through zero on extent logical start block*/
1656 	if (b1 + len1 < b1) {
1657 		len1 = EXT_MAX_BLOCKS - b1;
1658 		newext->ee_len = cpu_to_le16(len1);
1659 		ret = 1;
1660 	}
1661 
1662 	/* check for overlap */
1663 	if (b1 + len1 > b2) {
1664 		newext->ee_len = cpu_to_le16(b2 - b1);
1665 		ret = 1;
1666 	}
1667 out:
1668 	return ret;
1669 }
1670 
1671 /*
1672  * ext4_ext_insert_extent:
1673  * tries to merge requsted extent into the existing extent or
1674  * inserts requested extent as new one into the tree,
1675  * creating new leaf in the no-space case.
1676  */
1677 int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1678 				struct ext4_ext_path *path,
1679 				struct ext4_extent *newext, int flag)
1680 {
1681 	struct ext4_extent_header *eh;
1682 	struct ext4_extent *ex, *fex;
1683 	struct ext4_extent *nearex; /* nearest extent */
1684 	struct ext4_ext_path *npath = NULL;
1685 	int depth, len, err;
1686 	ext4_lblk_t next;
1687 	unsigned uninitialized = 0;
1688 	int flags = 0;
1689 
1690 	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1691 		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
1692 		return -EIO;
1693 	}
1694 	depth = ext_depth(inode);
1695 	ex = path[depth].p_ext;
1696 	if (unlikely(path[depth].p_hdr == NULL)) {
1697 		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1698 		return -EIO;
1699 	}
1700 
1701 	/* try to insert block into found extent and return */
1702 	if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
1703 		&& ext4_can_extents_be_merged(inode, ex, newext)) {
1704 		ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
1705 			  ext4_ext_is_uninitialized(newext),
1706 			  ext4_ext_get_actual_len(newext),
1707 			  le32_to_cpu(ex->ee_block),
1708 			  ext4_ext_is_uninitialized(ex),
1709 			  ext4_ext_get_actual_len(ex),
1710 			  ext4_ext_pblock(ex));
1711 		err = ext4_ext_get_access(handle, inode, path + depth);
1712 		if (err)
1713 			return err;
1714 
1715 		/*
1716 		 * ext4_can_extents_be_merged should have checked that either
1717 		 * both extents are uninitialized, or both aren't. Thus we
1718 		 * need to check only one of them here.
1719 		 */
1720 		if (ext4_ext_is_uninitialized(ex))
1721 			uninitialized = 1;
1722 		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1723 					+ ext4_ext_get_actual_len(newext));
1724 		if (uninitialized)
1725 			ext4_ext_mark_uninitialized(ex);
1726 		eh = path[depth].p_hdr;
1727 		nearex = ex;
1728 		goto merge;
1729 	}
1730 
1731 	depth = ext_depth(inode);
1732 	eh = path[depth].p_hdr;
1733 	if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
1734 		goto has_space;
1735 
1736 	/* probably next leaf has space for us? */
1737 	fex = EXT_LAST_EXTENT(eh);
1738 	next = EXT_MAX_BLOCKS;
1739 	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
1740 		next = ext4_ext_next_leaf_block(path);
1741 	if (next != EXT_MAX_BLOCKS) {
1742 		ext_debug("next leaf block - %d\n", next);
1743 		BUG_ON(npath != NULL);
1744 		npath = ext4_ext_find_extent(inode, next, NULL);
1745 		if (IS_ERR(npath))
1746 			return PTR_ERR(npath);
1747 		BUG_ON(npath->p_depth != path->p_depth);
1748 		eh = npath[depth].p_hdr;
1749 		if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
1750 			ext_debug("next leaf isn't full(%d)\n",
1751 				  le16_to_cpu(eh->eh_entries));
1752 			path = npath;
1753 			goto has_space;
1754 		}
1755 		ext_debug("next leaf has no free space(%d,%d)\n",
1756 			  le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
1757 	}
1758 
1759 	/*
1760 	 * There is no free space in the found leaf.
1761 	 * We're gonna add a new leaf in the tree.
1762 	 */
1763 	if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT)
1764 		flags = EXT4_MB_USE_ROOT_BLOCKS;
1765 	err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
1766 	if (err)
1767 		goto cleanup;
1768 	depth = ext_depth(inode);
1769 	eh = path[depth].p_hdr;
1770 
1771 has_space:
1772 	nearex = path[depth].p_ext;
1773 
1774 	err = ext4_ext_get_access(handle, inode, path + depth);
1775 	if (err)
1776 		goto cleanup;
1777 
1778 	if (!nearex) {
1779 		/* there is no extent in this leaf, create first one */
1780 		ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
1781 				le32_to_cpu(newext->ee_block),
1782 				ext4_ext_pblock(newext),
1783 				ext4_ext_is_uninitialized(newext),
1784 				ext4_ext_get_actual_len(newext));
1785 		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
1786 	} else if (le32_to_cpu(newext->ee_block)
1787 			   > le32_to_cpu(nearex->ee_block)) {
1788 /*		BUG_ON(newext->ee_block == nearex->ee_block); */
1789 		if (nearex != EXT_LAST_EXTENT(eh)) {
1790 			len = EXT_MAX_EXTENT(eh) - nearex;
1791 			len = (len - 1) * sizeof(struct ext4_extent);
1792 			len = len < 0 ? 0 : len;
1793 			ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
1794 					"move %d from 0x%p to 0x%p\n",
1795 					le32_to_cpu(newext->ee_block),
1796 					ext4_ext_pblock(newext),
1797 					ext4_ext_is_uninitialized(newext),
1798 					ext4_ext_get_actual_len(newext),
1799 					nearex, len, nearex + 1, nearex + 2);
1800 			memmove(nearex + 2, nearex + 1, len);
1801 		}
1802 		path[depth].p_ext = nearex + 1;
1803 	} else {
1804 		BUG_ON(newext->ee_block == nearex->ee_block);
1805 		len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
1806 		len = len < 0 ? 0 : len;
1807 		ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
1808 				"move %d from 0x%p to 0x%p\n",
1809 				le32_to_cpu(newext->ee_block),
1810 				ext4_ext_pblock(newext),
1811 				ext4_ext_is_uninitialized(newext),
1812 				ext4_ext_get_actual_len(newext),
1813 				nearex, len, nearex, nearex + 1);
1814 		memmove(nearex + 1, nearex, len);
1815 		path[depth].p_ext = nearex;
1816 	}
1817 
1818 	le16_add_cpu(&eh->eh_entries, 1);
1819 	nearex = path[depth].p_ext;
1820 	nearex->ee_block = newext->ee_block;
1821 	ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
1822 	nearex->ee_len = newext->ee_len;
1823 
1824 merge:
1825 	/* try to merge extents to the right */
1826 	if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
1827 		ext4_ext_try_to_merge(inode, path, nearex);
1828 
1829 	/* try to merge extents to the left */
1830 
1831 	/* time to correct all indexes above */
1832 	err = ext4_ext_correct_indexes(handle, inode, path);
1833 	if (err)
1834 		goto cleanup;
1835 
1836 	err = ext4_ext_dirty(handle, inode, path + depth);
1837 
1838 cleanup:
1839 	if (npath) {
1840 		ext4_ext_drop_refs(npath);
1841 		kfree(npath);
1842 	}
1843 	ext4_ext_invalidate_cache(inode);
1844 	return err;
1845 }
1846 
1847 static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1848 			       ext4_lblk_t num, ext_prepare_callback func,
1849 			       void *cbdata)
1850 {
1851 	struct ext4_ext_path *path = NULL;
1852 	struct ext4_ext_cache cbex;
1853 	struct ext4_extent *ex;
1854 	ext4_lblk_t next, start = 0, end = 0;
1855 	ext4_lblk_t last = block + num;
1856 	int depth, exists, err = 0;
1857 
1858 	BUG_ON(func == NULL);
1859 	BUG_ON(inode == NULL);
1860 
1861 	while (block < last && block != EXT_MAX_BLOCKS) {
1862 		num = last - block;
1863 		/* find extent for this block */
1864 		down_read(&EXT4_I(inode)->i_data_sem);
1865 		path = ext4_ext_find_extent(inode, block, path);
1866 		up_read(&EXT4_I(inode)->i_data_sem);
1867 		if (IS_ERR(path)) {
1868 			err = PTR_ERR(path);
1869 			path = NULL;
1870 			break;
1871 		}
1872 
1873 		depth = ext_depth(inode);
1874 		if (unlikely(path[depth].p_hdr == NULL)) {
1875 			EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1876 			err = -EIO;
1877 			break;
1878 		}
1879 		ex = path[depth].p_ext;
1880 		next = ext4_ext_next_allocated_block(path);
1881 
1882 		exists = 0;
1883 		if (!ex) {
1884 			/* there is no extent yet, so try to allocate
1885 			 * all requested space */
1886 			start = block;
1887 			end = block + num;
1888 		} else if (le32_to_cpu(ex->ee_block) > block) {
1889 			/* need to allocate space before found extent */
1890 			start = block;
1891 			end = le32_to_cpu(ex->ee_block);
1892 			if (block + num < end)
1893 				end = block + num;
1894 		} else if (block >= le32_to_cpu(ex->ee_block)
1895 					+ ext4_ext_get_actual_len(ex)) {
1896 			/* need to allocate space after found extent */
1897 			start = block;
1898 			end = block + num;
1899 			if (end >= next)
1900 				end = next;
1901 		} else if (block >= le32_to_cpu(ex->ee_block)) {
1902 			/*
1903 			 * some part of requested space is covered
1904 			 * by found extent
1905 			 */
1906 			start = block;
1907 			end = le32_to_cpu(ex->ee_block)
1908 				+ ext4_ext_get_actual_len(ex);
1909 			if (block + num < end)
1910 				end = block + num;
1911 			exists = 1;
1912 		} else {
1913 			BUG();
1914 		}
1915 		BUG_ON(end <= start);
1916 
1917 		if (!exists) {
1918 			cbex.ec_block = start;
1919 			cbex.ec_len = end - start;
1920 			cbex.ec_start = 0;
1921 		} else {
1922 			cbex.ec_block = le32_to_cpu(ex->ee_block);
1923 			cbex.ec_len = ext4_ext_get_actual_len(ex);
1924 			cbex.ec_start = ext4_ext_pblock(ex);
1925 		}
1926 
1927 		if (unlikely(cbex.ec_len == 0)) {
1928 			EXT4_ERROR_INODE(inode, "cbex.ec_len == 0");
1929 			err = -EIO;
1930 			break;
1931 		}
1932 		err = func(inode, next, &cbex, ex, cbdata);
1933 		ext4_ext_drop_refs(path);
1934 
1935 		if (err < 0)
1936 			break;
1937 
1938 		if (err == EXT_REPEAT)
1939 			continue;
1940 		else if (err == EXT_BREAK) {
1941 			err = 0;
1942 			break;
1943 		}
1944 
1945 		if (ext_depth(inode) != depth) {
1946 			/* depth was changed. we have to realloc path */
1947 			kfree(path);
1948 			path = NULL;
1949 		}
1950 
1951 		block = cbex.ec_block + cbex.ec_len;
1952 	}
1953 
1954 	if (path) {
1955 		ext4_ext_drop_refs(path);
1956 		kfree(path);
1957 	}
1958 
1959 	return err;
1960 }
1961 
1962 static void
1963 ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
1964 			__u32 len, ext4_fsblk_t start)
1965 {
1966 	struct ext4_ext_cache *cex;
1967 	BUG_ON(len == 0);
1968 	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1969 	cex = &EXT4_I(inode)->i_cached_extent;
1970 	cex->ec_block = block;
1971 	cex->ec_len = len;
1972 	cex->ec_start = start;
1973 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1974 }
1975 
1976 /*
1977  * ext4_ext_put_gap_in_cache:
1978  * calculate boundaries of the gap that the requested block fits into
1979  * and cache this gap
1980  */
1981 static void
1982 ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
1983 				ext4_lblk_t block)
1984 {
1985 	int depth = ext_depth(inode);
1986 	unsigned long len;
1987 	ext4_lblk_t lblock;
1988 	struct ext4_extent *ex;
1989 
1990 	ex = path[depth].p_ext;
1991 	if (ex == NULL) {
1992 		/* there is no extent yet, so gap is [0;-] */
1993 		lblock = 0;
1994 		len = EXT_MAX_BLOCKS;
1995 		ext_debug("cache gap(whole file):");
1996 	} else if (block < le32_to_cpu(ex->ee_block)) {
1997 		lblock = block;
1998 		len = le32_to_cpu(ex->ee_block) - block;
1999 		ext_debug("cache gap(before): %u [%u:%u]",
2000 				block,
2001 				le32_to_cpu(ex->ee_block),
2002 				 ext4_ext_get_actual_len(ex));
2003 	} else if (block >= le32_to_cpu(ex->ee_block)
2004 			+ ext4_ext_get_actual_len(ex)) {
2005 		ext4_lblk_t next;
2006 		lblock = le32_to_cpu(ex->ee_block)
2007 			+ ext4_ext_get_actual_len(ex);
2008 
2009 		next = ext4_ext_next_allocated_block(path);
2010 		ext_debug("cache gap(after): [%u:%u] %u",
2011 				le32_to_cpu(ex->ee_block),
2012 				ext4_ext_get_actual_len(ex),
2013 				block);
2014 		BUG_ON(next == lblock);
2015 		len = next - lblock;
2016 	} else {
2017 		lblock = len = 0;
2018 		BUG();
2019 	}
2020 
2021 	ext_debug(" -> %u:%lu\n", lblock, len);
2022 	ext4_ext_put_in_cache(inode, lblock, len, 0);
2023 }
2024 
2025 /*
2026  * ext4_ext_check_cache()
2027  * Checks to see if the given block is in the cache.
2028  * If it is, the cached extent is stored in the given
2029  * cache extent pointer.  If the cached extent is a hole,
2030  * this routine should be used instead of
2031  * ext4_ext_in_cache if the calling function needs to
2032  * know the size of the hole.
2033  *
2034  * @inode: The files inode
2035  * @block: The block to look for in the cache
2036  * @ex:    Pointer where the cached extent will be stored
2037  *         if it contains block
2038  *
2039  * Return 0 if cache is invalid; 1 if the cache is valid
2040  */
2041 static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block,
2042 	struct ext4_ext_cache *ex){
2043 	struct ext4_ext_cache *cex;
2044 	struct ext4_sb_info *sbi;
2045 	int ret = 0;
2046 
2047 	/*
2048 	 * We borrow i_block_reservation_lock to protect i_cached_extent
2049 	 */
2050 	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
2051 	cex = &EXT4_I(inode)->i_cached_extent;
2052 	sbi = EXT4_SB(inode->i_sb);
2053 
2054 	/* has cache valid data? */
2055 	if (cex->ec_len == 0)
2056 		goto errout;
2057 
2058 	if (in_range(block, cex->ec_block, cex->ec_len)) {
2059 		memcpy(ex, cex, sizeof(struct ext4_ext_cache));
2060 		ext_debug("%u cached by %u:%u:%llu\n",
2061 				block,
2062 				cex->ec_block, cex->ec_len, cex->ec_start);
2063 		ret = 1;
2064 	}
2065 errout:
2066 	if (!ret)
2067 		sbi->extent_cache_misses++;
2068 	else
2069 		sbi->extent_cache_hits++;
2070 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2071 	return ret;
2072 }
2073 
2074 /*
2075  * ext4_ext_in_cache()
2076  * Checks to see if the given block is in the cache.
2077  * If it is, the cached extent is stored in the given
2078  * extent pointer.
2079  *
2080  * @inode: The files inode
2081  * @block: The block to look for in the cache
2082  * @ex:    Pointer where the cached extent will be stored
2083  *         if it contains block
2084  *
2085  * Return 0 if cache is invalid; 1 if the cache is valid
2086  */
2087 static int
2088 ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2089 			struct ext4_extent *ex)
2090 {
2091 	struct ext4_ext_cache cex;
2092 	int ret = 0;
2093 
2094 	if (ext4_ext_check_cache(inode, block, &cex)) {
2095 		ex->ee_block = cpu_to_le32(cex.ec_block);
2096 		ext4_ext_store_pblock(ex, cex.ec_start);
2097 		ex->ee_len = cpu_to_le16(cex.ec_len);
2098 		ret = 1;
2099 	}
2100 
2101 	return ret;
2102 }
2103 
2104 
2105 /*
2106  * ext4_ext_rm_idx:
2107  * removes index from the index block.
2108  */
2109 static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2110 			struct ext4_ext_path *path)
2111 {
2112 	int err;
2113 	ext4_fsblk_t leaf;
2114 
2115 	/* free index block */
2116 	path--;
2117 	leaf = ext4_idx_pblock(path->p_idx);
2118 	if (unlikely(path->p_hdr->eh_entries == 0)) {
2119 		EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2120 		return -EIO;
2121 	}
2122 	err = ext4_ext_get_access(handle, inode, path);
2123 	if (err)
2124 		return err;
2125 
2126 	if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
2127 		int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
2128 		len *= sizeof(struct ext4_extent_idx);
2129 		memmove(path->p_idx, path->p_idx + 1, len);
2130 	}
2131 
2132 	le16_add_cpu(&path->p_hdr->eh_entries, -1);
2133 	err = ext4_ext_dirty(handle, inode, path);
2134 	if (err)
2135 		return err;
2136 	ext_debug("index is empty, remove it, free block %llu\n", leaf);
2137 	ext4_free_blocks(handle, inode, NULL, leaf, 1,
2138 			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
2139 	return err;
2140 }
2141 
2142 /*
2143  * ext4_ext_calc_credits_for_single_extent:
2144  * This routine returns max. credits that needed to insert an extent
2145  * to the extent tree.
2146  * When pass the actual path, the caller should calculate credits
2147  * under i_data_sem.
2148  */
2149 int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
2150 						struct ext4_ext_path *path)
2151 {
2152 	if (path) {
2153 		int depth = ext_depth(inode);
2154 		int ret = 0;
2155 
2156 		/* probably there is space in leaf? */
2157 		if (le16_to_cpu(path[depth].p_hdr->eh_entries)
2158 				< le16_to_cpu(path[depth].p_hdr->eh_max)) {
2159 
2160 			/*
2161 			 *  There are some space in the leaf tree, no
2162 			 *  need to account for leaf block credit
2163 			 *
2164 			 *  bitmaps and block group descriptor blocks
2165 			 *  and other metadat blocks still need to be
2166 			 *  accounted.
2167 			 */
2168 			/* 1 bitmap, 1 block group descriptor */
2169 			ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
2170 			return ret;
2171 		}
2172 	}
2173 
2174 	return ext4_chunk_trans_blocks(inode, nrblocks);
2175 }
2176 
2177 /*
2178  * How many index/leaf blocks need to change/allocate to modify nrblocks?
2179  *
2180  * if nrblocks are fit in a single extent (chunk flag is 1), then
2181  * in the worse case, each tree level index/leaf need to be changed
2182  * if the tree split due to insert a new extent, then the old tree
2183  * index/leaf need to be updated too
2184  *
2185  * If the nrblocks are discontiguous, they could cause
2186  * the whole tree split more than once, but this is really rare.
2187  */
2188 int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
2189 {
2190 	int index;
2191 	int depth = ext_depth(inode);
2192 
2193 	if (chunk)
2194 		index = depth * 2;
2195 	else
2196 		index = depth * 3;
2197 
2198 	return index;
2199 }
2200 
2201 static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2202 				struct ext4_extent *ex,
2203 				ext4_lblk_t from, ext4_lblk_t to)
2204 {
2205 	unsigned short ee_len =  ext4_ext_get_actual_len(ex);
2206 	int flags = EXT4_FREE_BLOCKS_FORGET;
2207 
2208 	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2209 		flags |= EXT4_FREE_BLOCKS_METADATA;
2210 #ifdef EXTENTS_STATS
2211 	{
2212 		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2213 		spin_lock(&sbi->s_ext_stats_lock);
2214 		sbi->s_ext_blocks += ee_len;
2215 		sbi->s_ext_extents++;
2216 		if (ee_len < sbi->s_ext_min)
2217 			sbi->s_ext_min = ee_len;
2218 		if (ee_len > sbi->s_ext_max)
2219 			sbi->s_ext_max = ee_len;
2220 		if (ext_depth(inode) > sbi->s_depth_max)
2221 			sbi->s_depth_max = ext_depth(inode);
2222 		spin_unlock(&sbi->s_ext_stats_lock);
2223 	}
2224 #endif
2225 	if (from >= le32_to_cpu(ex->ee_block)
2226 	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
2227 		/* tail removal */
2228 		ext4_lblk_t num;
2229 		ext4_fsblk_t start;
2230 
2231 		num = le32_to_cpu(ex->ee_block) + ee_len - from;
2232 		start = ext4_ext_pblock(ex) + ee_len - num;
2233 		ext_debug("free last %u blocks starting %llu\n", num, start);
2234 		ext4_free_blocks(handle, inode, NULL, start, num, flags);
2235 	} else if (from == le32_to_cpu(ex->ee_block)
2236 		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
2237 		/* head removal */
2238 		ext4_lblk_t num;
2239 		ext4_fsblk_t start;
2240 
2241 		num = to - from;
2242 		start = ext4_ext_pblock(ex);
2243 
2244 		ext_debug("free first %u blocks starting %llu\n", num, start);
2245 		ext4_free_blocks(handle, inode, 0, start, num, flags);
2246 
2247 	} else {
2248 		printk(KERN_INFO "strange request: removal(2) "
2249 				"%u-%u from %u:%u\n",
2250 				from, to, le32_to_cpu(ex->ee_block), ee_len);
2251 	}
2252 	return 0;
2253 }
2254 
2255 
2256 /*
2257  * ext4_ext_rm_leaf() Removes the extents associated with the
2258  * blocks appearing between "start" and "end", and splits the extents
2259  * if "start" and "end" appear in the same extent
2260  *
2261  * @handle: The journal handle
2262  * @inode:  The files inode
2263  * @path:   The path to the leaf
2264  * @start:  The first block to remove
2265  * @end:   The last block to remove
2266  */
2267 static int
2268 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2269 		struct ext4_ext_path *path, ext4_lblk_t start,
2270 		ext4_lblk_t end)
2271 {
2272 	int err = 0, correct_index = 0;
2273 	int depth = ext_depth(inode), credits;
2274 	struct ext4_extent_header *eh;
2275 	ext4_lblk_t a, b, block;
2276 	unsigned num;
2277 	ext4_lblk_t ex_ee_block;
2278 	unsigned short ex_ee_len;
2279 	unsigned uninitialized = 0;
2280 	struct ext4_extent *ex;
2281 	struct ext4_map_blocks map;
2282 
2283 	/* the header must be checked already in ext4_ext_remove_space() */
2284 	ext_debug("truncate since %u in leaf\n", start);
2285 	if (!path[depth].p_hdr)
2286 		path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
2287 	eh = path[depth].p_hdr;
2288 	if (unlikely(path[depth].p_hdr == NULL)) {
2289 		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2290 		return -EIO;
2291 	}
2292 	/* find where to start removing */
2293 	ex = EXT_LAST_EXTENT(eh);
2294 
2295 	ex_ee_block = le32_to_cpu(ex->ee_block);
2296 	ex_ee_len = ext4_ext_get_actual_len(ex);
2297 
2298 	while (ex >= EXT_FIRST_EXTENT(eh) &&
2299 			ex_ee_block + ex_ee_len > start) {
2300 
2301 		if (ext4_ext_is_uninitialized(ex))
2302 			uninitialized = 1;
2303 		else
2304 			uninitialized = 0;
2305 
2306 		ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
2307 			 uninitialized, ex_ee_len);
2308 		path[depth].p_ext = ex;
2309 
2310 		a = ex_ee_block > start ? ex_ee_block : start;
2311 		b = ex_ee_block+ex_ee_len - 1 < end ?
2312 			ex_ee_block+ex_ee_len - 1 : end;
2313 
2314 		ext_debug("  border %u:%u\n", a, b);
2315 
2316 		/* If this extent is beyond the end of the hole, skip it */
2317 		if (end <= ex_ee_block) {
2318 			ex--;
2319 			ex_ee_block = le32_to_cpu(ex->ee_block);
2320 			ex_ee_len = ext4_ext_get_actual_len(ex);
2321 			continue;
2322 		} else if (a != ex_ee_block &&
2323 			b != ex_ee_block + ex_ee_len - 1) {
2324 			/*
2325 			 * If this is a truncate, then this condition should
2326 			 * never happen because at least one of the end points
2327 			 * needs to be on the edge of the extent.
2328 			 */
2329 			if (end == EXT_MAX_BLOCKS - 1) {
2330 				ext_debug("  bad truncate %u:%u\n",
2331 						start, end);
2332 				block = 0;
2333 				num = 0;
2334 				err = -EIO;
2335 				goto out;
2336 			}
2337 			/*
2338 			 * else this is a hole punch, so the extent needs to
2339 			 * be split since neither edge of the hole is on the
2340 			 * extent edge
2341 			 */
2342 			else{
2343 				map.m_pblk = ext4_ext_pblock(ex);
2344 				map.m_lblk = ex_ee_block;
2345 				map.m_len = b - ex_ee_block;
2346 
2347 				err = ext4_split_extent(handle,
2348 					inode, path, &map, 0,
2349 					EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
2350 					EXT4_GET_BLOCKS_PRE_IO);
2351 
2352 				if (err < 0)
2353 					goto out;
2354 
2355 				ex_ee_len = ext4_ext_get_actual_len(ex);
2356 
2357 				b = ex_ee_block+ex_ee_len - 1 < end ?
2358 					ex_ee_block+ex_ee_len - 1 : end;
2359 
2360 				/* Then remove tail of this extent */
2361 				block = ex_ee_block;
2362 				num = a - block;
2363 			}
2364 		} else if (a != ex_ee_block) {
2365 			/* remove tail of the extent */
2366 			block = ex_ee_block;
2367 			num = a - block;
2368 		} else if (b != ex_ee_block + ex_ee_len - 1) {
2369 			/* remove head of the extent */
2370 			block = b;
2371 			num =  ex_ee_block + ex_ee_len - b;
2372 
2373 			/*
2374 			 * If this is a truncate, this condition
2375 			 * should never happen
2376 			 */
2377 			if (end == EXT_MAX_BLOCKS - 1) {
2378 				ext_debug("  bad truncate %u:%u\n",
2379 					start, end);
2380 				err = -EIO;
2381 				goto out;
2382 			}
2383 		} else {
2384 			/* remove whole extent: excellent! */
2385 			block = ex_ee_block;
2386 			num = 0;
2387 			if (a != ex_ee_block) {
2388 				ext_debug("  bad truncate %u:%u\n",
2389 					start, end);
2390 				err = -EIO;
2391 				goto out;
2392 			}
2393 
2394 			if (b != ex_ee_block + ex_ee_len - 1) {
2395 				ext_debug("  bad truncate %u:%u\n",
2396 					start, end);
2397 				err = -EIO;
2398 				goto out;
2399 			}
2400 		}
2401 
2402 		/*
2403 		 * 3 for leaf, sb, and inode plus 2 (bmap and group
2404 		 * descriptor) for each block group; assume two block
2405 		 * groups plus ex_ee_len/blocks_per_block_group for
2406 		 * the worst case
2407 		 */
2408 		credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
2409 		if (ex == EXT_FIRST_EXTENT(eh)) {
2410 			correct_index = 1;
2411 			credits += (ext_depth(inode)) + 1;
2412 		}
2413 		credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
2414 
2415 		err = ext4_ext_truncate_extend_restart(handle, inode, credits);
2416 		if (err)
2417 			goto out;
2418 
2419 		err = ext4_ext_get_access(handle, inode, path + depth);
2420 		if (err)
2421 			goto out;
2422 
2423 		err = ext4_remove_blocks(handle, inode, ex, a, b);
2424 		if (err)
2425 			goto out;
2426 
2427 		if (num == 0) {
2428 			/* this extent is removed; mark slot entirely unused */
2429 			ext4_ext_store_pblock(ex, 0);
2430 		} else if (block != ex_ee_block) {
2431 			/*
2432 			 * If this was a head removal, then we need to update
2433 			 * the physical block since it is now at a different
2434 			 * location
2435 			 */
2436 			ext4_ext_store_pblock(ex, ext4_ext_pblock(ex) + (b-a));
2437 		}
2438 
2439 		ex->ee_block = cpu_to_le32(block);
2440 		ex->ee_len = cpu_to_le16(num);
2441 		/*
2442 		 * Do not mark uninitialized if all the blocks in the
2443 		 * extent have been removed.
2444 		 */
2445 		if (uninitialized && num)
2446 			ext4_ext_mark_uninitialized(ex);
2447 
2448 		err = ext4_ext_dirty(handle, inode, path + depth);
2449 		if (err)
2450 			goto out;
2451 
2452 		/*
2453 		 * If the extent was completely released,
2454 		 * we need to remove it from the leaf
2455 		 */
2456 		if (num == 0) {
2457 			if (end != EXT_MAX_BLOCKS - 1) {
2458 				/*
2459 				 * For hole punching, we need to scoot all the
2460 				 * extents up when an extent is removed so that
2461 				 * we dont have blank extents in the middle
2462 				 */
2463 				memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
2464 					sizeof(struct ext4_extent));
2465 
2466 				/* Now get rid of the one at the end */
2467 				memset(EXT_LAST_EXTENT(eh), 0,
2468 					sizeof(struct ext4_extent));
2469 			}
2470 			le16_add_cpu(&eh->eh_entries, -1);
2471 		}
2472 
2473 		ext_debug("new extent: %u:%u:%llu\n", block, num,
2474 				ext4_ext_pblock(ex));
2475 		ex--;
2476 		ex_ee_block = le32_to_cpu(ex->ee_block);
2477 		ex_ee_len = ext4_ext_get_actual_len(ex);
2478 	}
2479 
2480 	if (correct_index && eh->eh_entries)
2481 		err = ext4_ext_correct_indexes(handle, inode, path);
2482 
2483 	/* if this leaf is free, then we should
2484 	 * remove it from index block above */
2485 	if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
2486 		err = ext4_ext_rm_idx(handle, inode, path + depth);
2487 
2488 out:
2489 	return err;
2490 }
2491 
2492 /*
2493  * ext4_ext_more_to_rm:
2494  * returns 1 if current index has to be freed (even partial)
2495  */
2496 static int
2497 ext4_ext_more_to_rm(struct ext4_ext_path *path)
2498 {
2499 	BUG_ON(path->p_idx == NULL);
2500 
2501 	if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
2502 		return 0;
2503 
2504 	/*
2505 	 * if truncate on deeper level happened, it wasn't partial,
2506 	 * so we have to consider current index for truncation
2507 	 */
2508 	if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
2509 		return 0;
2510 	return 1;
2511 }
2512 
2513 static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2514 {
2515 	struct super_block *sb = inode->i_sb;
2516 	int depth = ext_depth(inode);
2517 	struct ext4_ext_path *path;
2518 	handle_t *handle;
2519 	int i, err;
2520 
2521 	ext_debug("truncate since %u\n", start);
2522 
2523 	/* probably first extent we're gonna free will be last in block */
2524 	handle = ext4_journal_start(inode, depth + 1);
2525 	if (IS_ERR(handle))
2526 		return PTR_ERR(handle);
2527 
2528 again:
2529 	ext4_ext_invalidate_cache(inode);
2530 
2531 	/*
2532 	 * We start scanning from right side, freeing all the blocks
2533 	 * after i_size and walking into the tree depth-wise.
2534 	 */
2535 	depth = ext_depth(inode);
2536 	path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
2537 	if (path == NULL) {
2538 		ext4_journal_stop(handle);
2539 		return -ENOMEM;
2540 	}
2541 	path[0].p_depth = depth;
2542 	path[0].p_hdr = ext_inode_hdr(inode);
2543 	if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
2544 		err = -EIO;
2545 		goto out;
2546 	}
2547 	i = err = 0;
2548 
2549 	while (i >= 0 && err == 0) {
2550 		if (i == depth) {
2551 			/* this is leaf block */
2552 			err = ext4_ext_rm_leaf(handle, inode, path,
2553 					start, EXT_MAX_BLOCKS - 1);
2554 			/* root level has p_bh == NULL, brelse() eats this */
2555 			brelse(path[i].p_bh);
2556 			path[i].p_bh = NULL;
2557 			i--;
2558 			continue;
2559 		}
2560 
2561 		/* this is index block */
2562 		if (!path[i].p_hdr) {
2563 			ext_debug("initialize header\n");
2564 			path[i].p_hdr = ext_block_hdr(path[i].p_bh);
2565 		}
2566 
2567 		if (!path[i].p_idx) {
2568 			/* this level hasn't been touched yet */
2569 			path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
2570 			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
2571 			ext_debug("init index ptr: hdr 0x%p, num %d\n",
2572 				  path[i].p_hdr,
2573 				  le16_to_cpu(path[i].p_hdr->eh_entries));
2574 		} else {
2575 			/* we were already here, see at next index */
2576 			path[i].p_idx--;
2577 		}
2578 
2579 		ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
2580 				i, EXT_FIRST_INDEX(path[i].p_hdr),
2581 				path[i].p_idx);
2582 		if (ext4_ext_more_to_rm(path + i)) {
2583 			struct buffer_head *bh;
2584 			/* go to the next level */
2585 			ext_debug("move to level %d (block %llu)\n",
2586 				  i + 1, ext4_idx_pblock(path[i].p_idx));
2587 			memset(path + i + 1, 0, sizeof(*path));
2588 			bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx));
2589 			if (!bh) {
2590 				/* should we reset i_size? */
2591 				err = -EIO;
2592 				break;
2593 			}
2594 			if (WARN_ON(i + 1 > depth)) {
2595 				err = -EIO;
2596 				break;
2597 			}
2598 			if (ext4_ext_check(inode, ext_block_hdr(bh),
2599 							depth - i - 1)) {
2600 				err = -EIO;
2601 				break;
2602 			}
2603 			path[i + 1].p_bh = bh;
2604 
2605 			/* save actual number of indexes since this
2606 			 * number is changed at the next iteration */
2607 			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
2608 			i++;
2609 		} else {
2610 			/* we finished processing this index, go up */
2611 			if (path[i].p_hdr->eh_entries == 0 && i > 0) {
2612 				/* index is empty, remove it;
2613 				 * handle must be already prepared by the
2614 				 * truncatei_leaf() */
2615 				err = ext4_ext_rm_idx(handle, inode, path + i);
2616 			}
2617 			/* root level has p_bh == NULL, brelse() eats this */
2618 			brelse(path[i].p_bh);
2619 			path[i].p_bh = NULL;
2620 			i--;
2621 			ext_debug("return to level %d\n", i);
2622 		}
2623 	}
2624 
2625 	/* TODO: flexible tree reduction should be here */
2626 	if (path->p_hdr->eh_entries == 0) {
2627 		/*
2628 		 * truncate to zero freed all the tree,
2629 		 * so we need to correct eh_depth
2630 		 */
2631 		err = ext4_ext_get_access(handle, inode, path);
2632 		if (err == 0) {
2633 			ext_inode_hdr(inode)->eh_depth = 0;
2634 			ext_inode_hdr(inode)->eh_max =
2635 				cpu_to_le16(ext4_ext_space_root(inode, 0));
2636 			err = ext4_ext_dirty(handle, inode, path);
2637 		}
2638 	}
2639 out:
2640 	ext4_ext_drop_refs(path);
2641 	kfree(path);
2642 	if (err == -EAGAIN)
2643 		goto again;
2644 	ext4_journal_stop(handle);
2645 
2646 	return err;
2647 }
2648 
2649 /*
2650  * called at mount time
2651  */
2652 void ext4_ext_init(struct super_block *sb)
2653 {
2654 	/*
2655 	 * possible initialization would be here
2656 	 */
2657 
2658 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
2659 #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
2660 		printk(KERN_INFO "EXT4-fs: file extents enabled");
2661 #ifdef AGGRESSIVE_TEST
2662 		printk(", aggressive tests");
2663 #endif
2664 #ifdef CHECK_BINSEARCH
2665 		printk(", check binsearch");
2666 #endif
2667 #ifdef EXTENTS_STATS
2668 		printk(", stats");
2669 #endif
2670 		printk("\n");
2671 #endif
2672 #ifdef EXTENTS_STATS
2673 		spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
2674 		EXT4_SB(sb)->s_ext_min = 1 << 30;
2675 		EXT4_SB(sb)->s_ext_max = 0;
2676 #endif
2677 	}
2678 }
2679 
2680 /*
2681  * called at umount time
2682  */
2683 void ext4_ext_release(struct super_block *sb)
2684 {
2685 	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
2686 		return;
2687 
2688 #ifdef EXTENTS_STATS
2689 	if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
2690 		struct ext4_sb_info *sbi = EXT4_SB(sb);
2691 		printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
2692 			sbi->s_ext_blocks, sbi->s_ext_extents,
2693 			sbi->s_ext_blocks / sbi->s_ext_extents);
2694 		printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
2695 			sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
2696 	}
2697 #endif
2698 }
2699 
2700 /* FIXME!! we need to try to merge to left or right after zero-out  */
2701 static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2702 {
2703 	ext4_fsblk_t ee_pblock;
2704 	unsigned int ee_len;
2705 	int ret;
2706 
2707 	ee_len    = ext4_ext_get_actual_len(ex);
2708 	ee_pblock = ext4_ext_pblock(ex);
2709 
2710 	ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
2711 	if (ret > 0)
2712 		ret = 0;
2713 
2714 	return ret;
2715 }
2716 
2717 /*
2718  * used by extent splitting.
2719  */
2720 #define EXT4_EXT_MAY_ZEROOUT	0x1  /* safe to zeroout if split fails \
2721 					due to ENOSPC */
2722 #define EXT4_EXT_MARK_UNINIT1	0x2  /* mark first half uninitialized */
2723 #define EXT4_EXT_MARK_UNINIT2	0x4  /* mark second half uninitialized */
2724 
2725 /*
2726  * ext4_split_extent_at() splits an extent at given block.
2727  *
2728  * @handle: the journal handle
2729  * @inode: the file inode
2730  * @path: the path to the extent
2731  * @split: the logical block where the extent is splitted.
2732  * @split_flags: indicates if the extent could be zeroout if split fails, and
2733  *		 the states(init or uninit) of new extents.
2734  * @flags: flags used to insert new extent to extent tree.
2735  *
2736  *
2737  * Splits extent [a, b] into two extents [a, @split) and [@split, b], states
2738  * of which are deterimined by split_flag.
2739  *
2740  * There are two cases:
2741  *  a> the extent are splitted into two extent.
2742  *  b> split is not needed, and just mark the extent.
2743  *
2744  * return 0 on success.
2745  */
2746 static int ext4_split_extent_at(handle_t *handle,
2747 			     struct inode *inode,
2748 			     struct ext4_ext_path *path,
2749 			     ext4_lblk_t split,
2750 			     int split_flag,
2751 			     int flags)
2752 {
2753 	ext4_fsblk_t newblock;
2754 	ext4_lblk_t ee_block;
2755 	struct ext4_extent *ex, newex, orig_ex;
2756 	struct ext4_extent *ex2 = NULL;
2757 	unsigned int ee_len, depth;
2758 	int err = 0;
2759 
2760 	ext_debug("ext4_split_extents_at: inode %lu, logical"
2761 		"block %llu\n", inode->i_ino, (unsigned long long)split);
2762 
2763 	ext4_ext_show_leaf(inode, path);
2764 
2765 	depth = ext_depth(inode);
2766 	ex = path[depth].p_ext;
2767 	ee_block = le32_to_cpu(ex->ee_block);
2768 	ee_len = ext4_ext_get_actual_len(ex);
2769 	newblock = split - ee_block + ext4_ext_pblock(ex);
2770 
2771 	BUG_ON(split < ee_block || split >= (ee_block + ee_len));
2772 
2773 	err = ext4_ext_get_access(handle, inode, path + depth);
2774 	if (err)
2775 		goto out;
2776 
2777 	if (split == ee_block) {
2778 		/*
2779 		 * case b: block @split is the block that the extent begins with
2780 		 * then we just change the state of the extent, and splitting
2781 		 * is not needed.
2782 		 */
2783 		if (split_flag & EXT4_EXT_MARK_UNINIT2)
2784 			ext4_ext_mark_uninitialized(ex);
2785 		else
2786 			ext4_ext_mark_initialized(ex);
2787 
2788 		if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
2789 			ext4_ext_try_to_merge(inode, path, ex);
2790 
2791 		err = ext4_ext_dirty(handle, inode, path + depth);
2792 		goto out;
2793 	}
2794 
2795 	/* case a */
2796 	memcpy(&orig_ex, ex, sizeof(orig_ex));
2797 	ex->ee_len = cpu_to_le16(split - ee_block);
2798 	if (split_flag & EXT4_EXT_MARK_UNINIT1)
2799 		ext4_ext_mark_uninitialized(ex);
2800 
2801 	/*
2802 	 * path may lead to new leaf, not to original leaf any more
2803 	 * after ext4_ext_insert_extent() returns,
2804 	 */
2805 	err = ext4_ext_dirty(handle, inode, path + depth);
2806 	if (err)
2807 		goto fix_extent_len;
2808 
2809 	ex2 = &newex;
2810 	ex2->ee_block = cpu_to_le32(split);
2811 	ex2->ee_len   = cpu_to_le16(ee_len - (split - ee_block));
2812 	ext4_ext_store_pblock(ex2, newblock);
2813 	if (split_flag & EXT4_EXT_MARK_UNINIT2)
2814 		ext4_ext_mark_uninitialized(ex2);
2815 
2816 	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
2817 	if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
2818 		err = ext4_ext_zeroout(inode, &orig_ex);
2819 		if (err)
2820 			goto fix_extent_len;
2821 		/* update the extent length and mark as initialized */
2822 		ex->ee_len = cpu_to_le32(ee_len);
2823 		ext4_ext_try_to_merge(inode, path, ex);
2824 		err = ext4_ext_dirty(handle, inode, path + depth);
2825 		goto out;
2826 	} else if (err)
2827 		goto fix_extent_len;
2828 
2829 out:
2830 	ext4_ext_show_leaf(inode, path);
2831 	return err;
2832 
2833 fix_extent_len:
2834 	ex->ee_len = orig_ex.ee_len;
2835 	ext4_ext_dirty(handle, inode, path + depth);
2836 	return err;
2837 }
2838 
2839 /*
2840  * ext4_split_extents() splits an extent and mark extent which is covered
2841  * by @map as split_flags indicates
2842  *
2843  * It may result in splitting the extent into multiple extents (upto three)
2844  * There are three possibilities:
2845  *   a> There is no split required
2846  *   b> Splits in two extents: Split is happening at either end of the extent
2847  *   c> Splits in three extents: Somone is splitting in middle of the extent
2848  *
2849  */
2850 static int ext4_split_extent(handle_t *handle,
2851 			      struct inode *inode,
2852 			      struct ext4_ext_path *path,
2853 			      struct ext4_map_blocks *map,
2854 			      int split_flag,
2855 			      int flags)
2856 {
2857 	ext4_lblk_t ee_block;
2858 	struct ext4_extent *ex;
2859 	unsigned int ee_len, depth;
2860 	int err = 0;
2861 	int uninitialized;
2862 	int split_flag1, flags1;
2863 
2864 	depth = ext_depth(inode);
2865 	ex = path[depth].p_ext;
2866 	ee_block = le32_to_cpu(ex->ee_block);
2867 	ee_len = ext4_ext_get_actual_len(ex);
2868 	uninitialized = ext4_ext_is_uninitialized(ex);
2869 
2870 	if (map->m_lblk + map->m_len < ee_block + ee_len) {
2871 		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
2872 			      EXT4_EXT_MAY_ZEROOUT : 0;
2873 		flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
2874 		if (uninitialized)
2875 			split_flag1 |= EXT4_EXT_MARK_UNINIT1 |
2876 				       EXT4_EXT_MARK_UNINIT2;
2877 		err = ext4_split_extent_at(handle, inode, path,
2878 				map->m_lblk + map->m_len, split_flag1, flags1);
2879 		if (err)
2880 			goto out;
2881 	}
2882 
2883 	ext4_ext_drop_refs(path);
2884 	path = ext4_ext_find_extent(inode, map->m_lblk, path);
2885 	if (IS_ERR(path))
2886 		return PTR_ERR(path);
2887 
2888 	if (map->m_lblk >= ee_block) {
2889 		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
2890 			      EXT4_EXT_MAY_ZEROOUT : 0;
2891 		if (uninitialized)
2892 			split_flag1 |= EXT4_EXT_MARK_UNINIT1;
2893 		if (split_flag & EXT4_EXT_MARK_UNINIT2)
2894 			split_flag1 |= EXT4_EXT_MARK_UNINIT2;
2895 		err = ext4_split_extent_at(handle, inode, path,
2896 				map->m_lblk, split_flag1, flags);
2897 		if (err)
2898 			goto out;
2899 	}
2900 
2901 	ext4_ext_show_leaf(inode, path);
2902 out:
2903 	return err ? err : map->m_len;
2904 }
2905 
2906 #define EXT4_EXT_ZERO_LEN 7
2907 /*
2908  * This function is called by ext4_ext_map_blocks() if someone tries to write
2909  * to an uninitialized extent. It may result in splitting the uninitialized
2910  * extent into multiple extents (up to three - one initialized and two
2911  * uninitialized).
2912  * There are three possibilities:
2913  *   a> There is no split required: Entire extent should be initialized
2914  *   b> Splits in two extents: Write is happening at either end of the extent
2915  *   c> Splits in three extents: Somone is writing in middle of the extent
2916  */
2917 static int ext4_ext_convert_to_initialized(handle_t *handle,
2918 					   struct inode *inode,
2919 					   struct ext4_map_blocks *map,
2920 					   struct ext4_ext_path *path)
2921 {
2922 	struct ext4_map_blocks split_map;
2923 	struct ext4_extent zero_ex;
2924 	struct ext4_extent *ex;
2925 	ext4_lblk_t ee_block, eof_block;
2926 	unsigned int allocated, ee_len, depth;
2927 	int err = 0;
2928 	int split_flag = 0;
2929 
2930 	ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
2931 		"block %llu, max_blocks %u\n", inode->i_ino,
2932 		(unsigned long long)map->m_lblk, map->m_len);
2933 
2934 	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
2935 		inode->i_sb->s_blocksize_bits;
2936 	if (eof_block < map->m_lblk + map->m_len)
2937 		eof_block = map->m_lblk + map->m_len;
2938 
2939 	depth = ext_depth(inode);
2940 	ex = path[depth].p_ext;
2941 	ee_block = le32_to_cpu(ex->ee_block);
2942 	ee_len = ext4_ext_get_actual_len(ex);
2943 	allocated = ee_len - (map->m_lblk - ee_block);
2944 
2945 	WARN_ON(map->m_lblk < ee_block);
2946 	/*
2947 	 * It is safe to convert extent to initialized via explicit
2948 	 * zeroout only if extent is fully insde i_size or new_size.
2949 	 */
2950 	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
2951 
2952 	/* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
2953 	if (ee_len <= 2*EXT4_EXT_ZERO_LEN &&
2954 	    (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
2955 		err = ext4_ext_zeroout(inode, ex);
2956 		if (err)
2957 			goto out;
2958 
2959 		err = ext4_ext_get_access(handle, inode, path + depth);
2960 		if (err)
2961 			goto out;
2962 		ext4_ext_mark_initialized(ex);
2963 		ext4_ext_try_to_merge(inode, path, ex);
2964 		err = ext4_ext_dirty(handle, inode, path + depth);
2965 		goto out;
2966 	}
2967 
2968 	/*
2969 	 * four cases:
2970 	 * 1. split the extent into three extents.
2971 	 * 2. split the extent into two extents, zeroout the first half.
2972 	 * 3. split the extent into two extents, zeroout the second half.
2973 	 * 4. split the extent into two extents with out zeroout.
2974 	 */
2975 	split_map.m_lblk = map->m_lblk;
2976 	split_map.m_len = map->m_len;
2977 
2978 	if (allocated > map->m_len) {
2979 		if (allocated <= EXT4_EXT_ZERO_LEN &&
2980 		    (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
2981 			/* case 3 */
2982 			zero_ex.ee_block =
2983 					 cpu_to_le32(map->m_lblk);
2984 			zero_ex.ee_len = cpu_to_le16(allocated);
2985 			ext4_ext_store_pblock(&zero_ex,
2986 				ext4_ext_pblock(ex) + map->m_lblk - ee_block);
2987 			err = ext4_ext_zeroout(inode, &zero_ex);
2988 			if (err)
2989 				goto out;
2990 			split_map.m_lblk = map->m_lblk;
2991 			split_map.m_len = allocated;
2992 		} else if ((map->m_lblk - ee_block + map->m_len <
2993 			   EXT4_EXT_ZERO_LEN) &&
2994 			   (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
2995 			/* case 2 */
2996 			if (map->m_lblk != ee_block) {
2997 				zero_ex.ee_block = ex->ee_block;
2998 				zero_ex.ee_len = cpu_to_le16(map->m_lblk -
2999 							ee_block);
3000 				ext4_ext_store_pblock(&zero_ex,
3001 						      ext4_ext_pblock(ex));
3002 				err = ext4_ext_zeroout(inode, &zero_ex);
3003 				if (err)
3004 					goto out;
3005 			}
3006 
3007 			split_map.m_lblk = ee_block;
3008 			split_map.m_len = map->m_lblk - ee_block + map->m_len;
3009 			allocated = map->m_len;
3010 		}
3011 	}
3012 
3013 	allocated = ext4_split_extent(handle, inode, path,
3014 				       &split_map, split_flag, 0);
3015 	if (allocated < 0)
3016 		err = allocated;
3017 
3018 out:
3019 	return err ? err : allocated;
3020 }
3021 
3022 /*
3023  * This function is called by ext4_ext_map_blocks() from
3024  * ext4_get_blocks_dio_write() when DIO to write
3025  * to an uninitialized extent.
3026  *
3027  * Writing to an uninitialized extent may result in splitting the uninitialized
3028  * extent into multiple /initialized uninitialized extents (up to three)
3029  * There are three possibilities:
3030  *   a> There is no split required: Entire extent should be uninitialized
3031  *   b> Splits in two extents: Write is happening at either end of the extent
3032  *   c> Splits in three extents: Somone is writing in middle of the extent
3033  *
3034  * One of more index blocks maybe needed if the extent tree grow after
3035  * the uninitialized extent split. To prevent ENOSPC occur at the IO
3036  * complete, we need to split the uninitialized extent before DIO submit
3037  * the IO. The uninitialized extent called at this time will be split
3038  * into three uninitialized extent(at most). After IO complete, the part
3039  * being filled will be convert to initialized by the end_io callback function
3040  * via ext4_convert_unwritten_extents().
3041  *
3042  * Returns the size of uninitialized extent to be written on success.
3043  */
3044 static int ext4_split_unwritten_extents(handle_t *handle,
3045 					struct inode *inode,
3046 					struct ext4_map_blocks *map,
3047 					struct ext4_ext_path *path,
3048 					int flags)
3049 {
3050 	ext4_lblk_t eof_block;
3051 	ext4_lblk_t ee_block;
3052 	struct ext4_extent *ex;
3053 	unsigned int ee_len;
3054 	int split_flag = 0, depth;
3055 
3056 	ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
3057 		"block %llu, max_blocks %u\n", inode->i_ino,
3058 		(unsigned long long)map->m_lblk, map->m_len);
3059 
3060 	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
3061 		inode->i_sb->s_blocksize_bits;
3062 	if (eof_block < map->m_lblk + map->m_len)
3063 		eof_block = map->m_lblk + map->m_len;
3064 	/*
3065 	 * It is safe to convert extent to initialized via explicit
3066 	 * zeroout only if extent is fully insde i_size or new_size.
3067 	 */
3068 	depth = ext_depth(inode);
3069 	ex = path[depth].p_ext;
3070 	ee_block = le32_to_cpu(ex->ee_block);
3071 	ee_len = ext4_ext_get_actual_len(ex);
3072 
3073 	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
3074 	split_flag |= EXT4_EXT_MARK_UNINIT2;
3075 
3076 	flags |= EXT4_GET_BLOCKS_PRE_IO;
3077 	return ext4_split_extent(handle, inode, path, map, split_flag, flags);
3078 }
3079 
3080 static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3081 					      struct inode *inode,
3082 					      struct ext4_ext_path *path)
3083 {
3084 	struct ext4_extent *ex;
3085 	int depth;
3086 	int err = 0;
3087 
3088 	depth = ext_depth(inode);
3089 	ex = path[depth].p_ext;
3090 
3091 	ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical"
3092 		"block %llu, max_blocks %u\n", inode->i_ino,
3093 		(unsigned long long)le32_to_cpu(ex->ee_block),
3094 		ext4_ext_get_actual_len(ex));
3095 
3096 	err = ext4_ext_get_access(handle, inode, path + depth);
3097 	if (err)
3098 		goto out;
3099 	/* first mark the extent as initialized */
3100 	ext4_ext_mark_initialized(ex);
3101 
3102 	/* note: ext4_ext_correct_indexes() isn't needed here because
3103 	 * borders are not changed
3104 	 */
3105 	ext4_ext_try_to_merge(inode, path, ex);
3106 
3107 	/* Mark modified extent as dirty */
3108 	err = ext4_ext_dirty(handle, inode, path + depth);
3109 out:
3110 	ext4_ext_show_leaf(inode, path);
3111 	return err;
3112 }
3113 
3114 static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3115 			sector_t block, int count)
3116 {
3117 	int i;
3118 	for (i = 0; i < count; i++)
3119                 unmap_underlying_metadata(bdev, block + i);
3120 }
3121 
3122 /*
3123  * Handle EOFBLOCKS_FL flag, clearing it if necessary
3124  */
3125 static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
3126 			      ext4_lblk_t lblk,
3127 			      struct ext4_ext_path *path,
3128 			      unsigned int len)
3129 {
3130 	int i, depth;
3131 	struct ext4_extent_header *eh;
3132 	struct ext4_extent *last_ex;
3133 
3134 	if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
3135 		return 0;
3136 
3137 	depth = ext_depth(inode);
3138 	eh = path[depth].p_hdr;
3139 
3140 	if (unlikely(!eh->eh_entries)) {
3141 		EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "
3142 				 "EOFBLOCKS_FL set");
3143 		return -EIO;
3144 	}
3145 	last_ex = EXT_LAST_EXTENT(eh);
3146 	/*
3147 	 * We should clear the EOFBLOCKS_FL flag if we are writing the
3148 	 * last block in the last extent in the file.  We test this by
3149 	 * first checking to see if the caller to
3150 	 * ext4_ext_get_blocks() was interested in the last block (or
3151 	 * a block beyond the last block) in the current extent.  If
3152 	 * this turns out to be false, we can bail out from this
3153 	 * function immediately.
3154 	 */
3155 	if (lblk + len < le32_to_cpu(last_ex->ee_block) +
3156 	    ext4_ext_get_actual_len(last_ex))
3157 		return 0;
3158 	/*
3159 	 * If the caller does appear to be planning to write at or
3160 	 * beyond the end of the current extent, we then test to see
3161 	 * if the current extent is the last extent in the file, by
3162 	 * checking to make sure it was reached via the rightmost node
3163 	 * at each level of the tree.
3164 	 */
3165 	for (i = depth-1; i >= 0; i--)
3166 		if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3167 			return 0;
3168 	ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3169 	return ext4_mark_inode_dirty(handle, inode);
3170 }
3171 
3172 static int
3173 ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3174 			struct ext4_map_blocks *map,
3175 			struct ext4_ext_path *path, int flags,
3176 			unsigned int allocated, ext4_fsblk_t newblock)
3177 {
3178 	int ret = 0;
3179 	int err = 0;
3180 	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3181 
3182 	ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
3183 		  "block %llu, max_blocks %u, flags %d, allocated %u",
3184 		  inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
3185 		  flags, allocated);
3186 	ext4_ext_show_leaf(inode, path);
3187 
3188 	/* get_block() before submit the IO, split the extent */
3189 	if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3190 		ret = ext4_split_unwritten_extents(handle, inode, map,
3191 						   path, flags);
3192 		/*
3193 		 * Flag the inode(non aio case) or end_io struct (aio case)
3194 		 * that this IO needs to conversion to written when IO is
3195 		 * completed
3196 		 */
3197 		if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
3198 			io->flag = EXT4_IO_END_UNWRITTEN;
3199 			atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
3200 		} else
3201 			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3202 		if (ext4_should_dioread_nolock(inode))
3203 			map->m_flags |= EXT4_MAP_UNINIT;
3204 		goto out;
3205 	}
3206 	/* IO end_io complete, convert the filled extent to written */
3207 	if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
3208 		ret = ext4_convert_unwritten_extents_endio(handle, inode,
3209 							path);
3210 		if (ret >= 0) {
3211 			ext4_update_inode_fsync_trans(handle, inode, 1);
3212 			err = check_eofblocks_fl(handle, inode, map->m_lblk,
3213 						 path, map->m_len);
3214 		} else
3215 			err = ret;
3216 		goto out2;
3217 	}
3218 	/* buffered IO case */
3219 	/*
3220 	 * repeat fallocate creation request
3221 	 * we already have an unwritten extent
3222 	 */
3223 	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
3224 		goto map_out;
3225 
3226 	/* buffered READ or buffered write_begin() lookup */
3227 	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3228 		/*
3229 		 * We have blocks reserved already.  We
3230 		 * return allocated blocks so that delalloc
3231 		 * won't do block reservation for us.  But
3232 		 * the buffer head will be unmapped so that
3233 		 * a read from the block returns 0s.
3234 		 */
3235 		map->m_flags |= EXT4_MAP_UNWRITTEN;
3236 		goto out1;
3237 	}
3238 
3239 	/* buffered write, writepage time, convert*/
3240 	ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
3241 	if (ret >= 0) {
3242 		ext4_update_inode_fsync_trans(handle, inode, 1);
3243 		err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
3244 					 map->m_len);
3245 		if (err < 0)
3246 			goto out2;
3247 	}
3248 
3249 out:
3250 	if (ret <= 0) {
3251 		err = ret;
3252 		goto out2;
3253 	} else
3254 		allocated = ret;
3255 	map->m_flags |= EXT4_MAP_NEW;
3256 	/*
3257 	 * if we allocated more blocks than requested
3258 	 * we need to make sure we unmap the extra block
3259 	 * allocated. The actual needed block will get
3260 	 * unmapped later when we find the buffer_head marked
3261 	 * new.
3262 	 */
3263 	if (allocated > map->m_len) {
3264 		unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
3265 					newblock + map->m_len,
3266 					allocated - map->m_len);
3267 		allocated = map->m_len;
3268 	}
3269 
3270 	/*
3271 	 * If we have done fallocate with the offset that is already
3272 	 * delayed allocated, we would have block reservation
3273 	 * and quota reservation done in the delayed write path.
3274 	 * But fallocate would have already updated quota and block
3275 	 * count for this offset. So cancel these reservation
3276 	 */
3277 	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
3278 		ext4_da_update_reserve_space(inode, allocated, 0);
3279 
3280 map_out:
3281 	map->m_flags |= EXT4_MAP_MAPPED;
3282 out1:
3283 	if (allocated > map->m_len)
3284 		allocated = map->m_len;
3285 	ext4_ext_show_leaf(inode, path);
3286 	map->m_pblk = newblock;
3287 	map->m_len = allocated;
3288 out2:
3289 	if (path) {
3290 		ext4_ext_drop_refs(path);
3291 		kfree(path);
3292 	}
3293 	return err ? err : allocated;
3294 }
3295 
3296 /*
3297  * Block allocation/map/preallocation routine for extents based files
3298  *
3299  *
3300  * Need to be called with
3301  * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
3302  * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
3303  *
3304  * return > 0, number of of blocks already mapped/allocated
3305  *          if create == 0 and these are pre-allocated blocks
3306  *          	buffer head is unmapped
3307  *          otherwise blocks are mapped
3308  *
3309  * return = 0, if plain look up failed (blocks have not been allocated)
3310  *          buffer head is unmapped
3311  *
3312  * return < 0, error case.
3313  */
3314 int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3315 			struct ext4_map_blocks *map, int flags)
3316 {
3317 	struct ext4_ext_path *path = NULL;
3318 	struct ext4_extent newex, *ex;
3319 	ext4_fsblk_t newblock = 0;
3320 	int err = 0, depth, ret;
3321 	unsigned int allocated = 0;
3322 	unsigned int punched_out = 0;
3323 	unsigned int result = 0;
3324 	struct ext4_allocation_request ar;
3325 	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3326 	struct ext4_map_blocks punch_map;
3327 
3328 	ext_debug("blocks %u/%u requested for inode %lu\n",
3329 		  map->m_lblk, map->m_len, inode->i_ino);
3330 	trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
3331 
3332 	/* check in cache */
3333 	if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) &&
3334 		ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
3335 		if (!newex.ee_start_lo && !newex.ee_start_hi) {
3336 			if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3337 				/*
3338 				 * block isn't allocated yet and
3339 				 * user doesn't want to allocate it
3340 				 */
3341 				goto out2;
3342 			}
3343 			/* we should allocate requested block */
3344 		} else {
3345 			/* block is already allocated */
3346 			newblock = map->m_lblk
3347 				   - le32_to_cpu(newex.ee_block)
3348 				   + ext4_ext_pblock(&newex);
3349 			/* number of remaining blocks in the extent */
3350 			allocated = ext4_ext_get_actual_len(&newex) -
3351 				(map->m_lblk - le32_to_cpu(newex.ee_block));
3352 			goto out;
3353 		}
3354 	}
3355 
3356 	/* find extent for this block */
3357 	path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
3358 	if (IS_ERR(path)) {
3359 		err = PTR_ERR(path);
3360 		path = NULL;
3361 		goto out2;
3362 	}
3363 
3364 	depth = ext_depth(inode);
3365 
3366 	/*
3367 	 * consistent leaf must not be empty;
3368 	 * this situation is possible, though, _during_ tree modification;
3369 	 * this is why assert can't be put in ext4_ext_find_extent()
3370 	 */
3371 	if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
3372 		EXT4_ERROR_INODE(inode, "bad extent address "
3373 				 "lblock: %lu, depth: %d pblock %lld",
3374 				 (unsigned long) map->m_lblk, depth,
3375 				 path[depth].p_block);
3376 		err = -EIO;
3377 		goto out2;
3378 	}
3379 
3380 	ex = path[depth].p_ext;
3381 	if (ex) {
3382 		ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
3383 		ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
3384 		unsigned short ee_len;
3385 
3386 		/*
3387 		 * Uninitialized extents are treated as holes, except that
3388 		 * we split out initialized portions during a write.
3389 		 */
3390 		ee_len = ext4_ext_get_actual_len(ex);
3391 		/* if found extent covers block, simply return it */
3392 		if (in_range(map->m_lblk, ee_block, ee_len)) {
3393 			newblock = map->m_lblk - ee_block + ee_start;
3394 			/* number of remaining blocks in the extent */
3395 			allocated = ee_len - (map->m_lblk - ee_block);
3396 			ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
3397 				  ee_block, ee_len, newblock);
3398 
3399 			if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) {
3400 				/*
3401 				 * Do not put uninitialized extent
3402 				 * in the cache
3403 				 */
3404 				if (!ext4_ext_is_uninitialized(ex)) {
3405 					ext4_ext_put_in_cache(inode, ee_block,
3406 						ee_len, ee_start);
3407 					goto out;
3408 				}
3409 				ret = ext4_ext_handle_uninitialized_extents(
3410 					handle, inode, map, path, flags,
3411 					allocated, newblock);
3412 				return ret;
3413 			}
3414 
3415 			/*
3416 			 * Punch out the map length, but only to the
3417 			 * end of the extent
3418 			 */
3419 			punched_out = allocated < map->m_len ?
3420 				allocated : map->m_len;
3421 
3422 			/*
3423 			 * Sense extents need to be converted to
3424 			 * uninitialized, they must fit in an
3425 			 * uninitialized extent
3426 			 */
3427 			if (punched_out > EXT_UNINIT_MAX_LEN)
3428 				punched_out = EXT_UNINIT_MAX_LEN;
3429 
3430 			punch_map.m_lblk = map->m_lblk;
3431 			punch_map.m_pblk = newblock;
3432 			punch_map.m_len = punched_out;
3433 			punch_map.m_flags = 0;
3434 
3435 			/* Check to see if the extent needs to be split */
3436 			if (punch_map.m_len != ee_len ||
3437 				punch_map.m_lblk != ee_block) {
3438 
3439 				ret = ext4_split_extent(handle, inode,
3440 				path, &punch_map, 0,
3441 				EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
3442 				EXT4_GET_BLOCKS_PRE_IO);
3443 
3444 				if (ret < 0) {
3445 					err = ret;
3446 					goto out2;
3447 				}
3448 				/*
3449 				 * find extent for the block at
3450 				 * the start of the hole
3451 				 */
3452 				ext4_ext_drop_refs(path);
3453 				kfree(path);
3454 
3455 				path = ext4_ext_find_extent(inode,
3456 				map->m_lblk, NULL);
3457 				if (IS_ERR(path)) {
3458 					err = PTR_ERR(path);
3459 					path = NULL;
3460 					goto out2;
3461 				}
3462 
3463 				depth = ext_depth(inode);
3464 				ex = path[depth].p_ext;
3465 				ee_len = ext4_ext_get_actual_len(ex);
3466 				ee_block = le32_to_cpu(ex->ee_block);
3467 				ee_start = ext4_ext_pblock(ex);
3468 
3469 			}
3470 
3471 			ext4_ext_mark_uninitialized(ex);
3472 
3473 			ext4_ext_invalidate_cache(inode);
3474 
3475 			err = ext4_ext_rm_leaf(handle, inode, path,
3476 				map->m_lblk, map->m_lblk + punched_out);
3477 
3478 			if (!err && path->p_hdr->eh_entries == 0) {
3479 				/*
3480 				 * Punch hole freed all of this sub tree,
3481 				 * so we need to correct eh_depth
3482 				 */
3483 				err = ext4_ext_get_access(handle, inode, path);
3484 				if (err == 0) {
3485 					ext_inode_hdr(inode)->eh_depth = 0;
3486 					ext_inode_hdr(inode)->eh_max =
3487 					cpu_to_le16(ext4_ext_space_root(
3488 						inode, 0));
3489 
3490 					err = ext4_ext_dirty(
3491 						handle, inode, path);
3492 				}
3493 			}
3494 
3495 			goto out2;
3496 		}
3497 	}
3498 
3499 	/*
3500 	 * requested block isn't allocated yet;
3501 	 * we couldn't try to create block if create flag is zero
3502 	 */
3503 	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3504 		/*
3505 		 * put just found gap into cache to speed up
3506 		 * subsequent requests
3507 		 */
3508 		ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
3509 		goto out2;
3510 	}
3511 	/*
3512 	 * Okay, we need to do block allocation.
3513 	 */
3514 
3515 	/* find neighbour allocated blocks */
3516 	ar.lleft = map->m_lblk;
3517 	err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
3518 	if (err)
3519 		goto out2;
3520 	ar.lright = map->m_lblk;
3521 	err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
3522 	if (err)
3523 		goto out2;
3524 
3525 	/*
3526 	 * See if request is beyond maximum number of blocks we can have in
3527 	 * a single extent. For an initialized extent this limit is
3528 	 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
3529 	 * EXT_UNINIT_MAX_LEN.
3530 	 */
3531 	if (map->m_len > EXT_INIT_MAX_LEN &&
3532 	    !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3533 		map->m_len = EXT_INIT_MAX_LEN;
3534 	else if (map->m_len > EXT_UNINIT_MAX_LEN &&
3535 		 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3536 		map->m_len = EXT_UNINIT_MAX_LEN;
3537 
3538 	/* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
3539 	newex.ee_block = cpu_to_le32(map->m_lblk);
3540 	newex.ee_len = cpu_to_le16(map->m_len);
3541 	err = ext4_ext_check_overlap(inode, &newex, path);
3542 	if (err)
3543 		allocated = ext4_ext_get_actual_len(&newex);
3544 	else
3545 		allocated = map->m_len;
3546 
3547 	/* allocate new block */
3548 	ar.inode = inode;
3549 	ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
3550 	ar.logical = map->m_lblk;
3551 	ar.len = allocated;
3552 	if (S_ISREG(inode->i_mode))
3553 		ar.flags = EXT4_MB_HINT_DATA;
3554 	else
3555 		/* disable in-core preallocation for non-regular files */
3556 		ar.flags = 0;
3557 	if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
3558 		ar.flags |= EXT4_MB_HINT_NOPREALLOC;
3559 	newblock = ext4_mb_new_blocks(handle, &ar, &err);
3560 	if (!newblock)
3561 		goto out2;
3562 	ext_debug("allocate new block: goal %llu, found %llu/%u\n",
3563 		  ar.goal, newblock, allocated);
3564 
3565 	/* try to insert new extent into found leaf and return */
3566 	ext4_ext_store_pblock(&newex, newblock);
3567 	newex.ee_len = cpu_to_le16(ar.len);
3568 	/* Mark uninitialized */
3569 	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
3570 		ext4_ext_mark_uninitialized(&newex);
3571 		/*
3572 		 * io_end structure was created for every IO write to an
3573 		 * uninitialized extent. To avoid unnecessary conversion,
3574 		 * here we flag the IO that really needs the conversion.
3575 		 * For non asycn direct IO case, flag the inode state
3576 		 * that we need to perform conversion when IO is done.
3577 		 */
3578 		if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3579 			if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
3580 				io->flag = EXT4_IO_END_UNWRITTEN;
3581 				atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
3582 			} else
3583 				ext4_set_inode_state(inode,
3584 						     EXT4_STATE_DIO_UNWRITTEN);
3585 		}
3586 		if (ext4_should_dioread_nolock(inode))
3587 			map->m_flags |= EXT4_MAP_UNINIT;
3588 	}
3589 
3590 	err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len);
3591 	if (!err)
3592 		err = ext4_ext_insert_extent(handle, inode, path,
3593 					     &newex, flags);
3594 	if (err) {
3595 		int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
3596 			EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
3597 		/* free data blocks we just allocated */
3598 		/* not a good idea to call discard here directly,
3599 		 * but otherwise we'd need to call it every free() */
3600 		ext4_discard_preallocations(inode);
3601 		ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex),
3602 				 ext4_ext_get_actual_len(&newex), fb_flags);
3603 		goto out2;
3604 	}
3605 
3606 	/* previous routine could use block we allocated */
3607 	newblock = ext4_ext_pblock(&newex);
3608 	allocated = ext4_ext_get_actual_len(&newex);
3609 	if (allocated > map->m_len)
3610 		allocated = map->m_len;
3611 	map->m_flags |= EXT4_MAP_NEW;
3612 
3613 	/*
3614 	 * Update reserved blocks/metadata blocks after successful
3615 	 * block allocation which had been deferred till now.
3616 	 */
3617 	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
3618 		ext4_da_update_reserve_space(inode, allocated, 1);
3619 
3620 	/*
3621 	 * Cache the extent and update transaction to commit on fdatasync only
3622 	 * when it is _not_ an uninitialized extent.
3623 	 */
3624 	if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
3625 		ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock);
3626 		ext4_update_inode_fsync_trans(handle, inode, 1);
3627 	} else
3628 		ext4_update_inode_fsync_trans(handle, inode, 0);
3629 out:
3630 	if (allocated > map->m_len)
3631 		allocated = map->m_len;
3632 	ext4_ext_show_leaf(inode, path);
3633 	map->m_flags |= EXT4_MAP_MAPPED;
3634 	map->m_pblk = newblock;
3635 	map->m_len = allocated;
3636 out2:
3637 	if (path) {
3638 		ext4_ext_drop_refs(path);
3639 		kfree(path);
3640 	}
3641 	trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
3642 		newblock, map->m_len, err ? err : allocated);
3643 
3644 	result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ?
3645 			punched_out : allocated;
3646 
3647 	return err ? err : result;
3648 }
3649 
3650 void ext4_ext_truncate(struct inode *inode)
3651 {
3652 	struct address_space *mapping = inode->i_mapping;
3653 	struct super_block *sb = inode->i_sb;
3654 	ext4_lblk_t last_block;
3655 	handle_t *handle;
3656 	loff_t page_len;
3657 	int err = 0;
3658 
3659 	/*
3660 	 * finish any pending end_io work so we won't run the risk of
3661 	 * converting any truncated blocks to initialized later
3662 	 */
3663 	ext4_flush_completed_IO(inode);
3664 
3665 	/*
3666 	 * probably first extent we're gonna free will be last in block
3667 	 */
3668 	err = ext4_writepage_trans_blocks(inode);
3669 	handle = ext4_journal_start(inode, err);
3670 	if (IS_ERR(handle))
3671 		return;
3672 
3673 	if (inode->i_size % PAGE_CACHE_SIZE != 0) {
3674 		page_len = PAGE_CACHE_SIZE -
3675 			(inode->i_size & (PAGE_CACHE_SIZE - 1));
3676 
3677 		err = ext4_discard_partial_page_buffers(handle,
3678 			mapping, inode->i_size, page_len, 0);
3679 
3680 		if (err)
3681 			goto out_stop;
3682 	}
3683 
3684 	if (ext4_orphan_add(handle, inode))
3685 		goto out_stop;
3686 
3687 	down_write(&EXT4_I(inode)->i_data_sem);
3688 	ext4_ext_invalidate_cache(inode);
3689 
3690 	ext4_discard_preallocations(inode);
3691 
3692 	/*
3693 	 * TODO: optimization is possible here.
3694 	 * Probably we need not scan at all,
3695 	 * because page truncation is enough.
3696 	 */
3697 
3698 	/* we have to know where to truncate from in crash case */
3699 	EXT4_I(inode)->i_disksize = inode->i_size;
3700 	ext4_mark_inode_dirty(handle, inode);
3701 
3702 	last_block = (inode->i_size + sb->s_blocksize - 1)
3703 			>> EXT4_BLOCK_SIZE_BITS(sb);
3704 	err = ext4_ext_remove_space(inode, last_block);
3705 
3706 	/* In a multi-transaction truncate, we only make the final
3707 	 * transaction synchronous.
3708 	 */
3709 	if (IS_SYNC(inode))
3710 		ext4_handle_sync(handle);
3711 
3712 	up_write(&EXT4_I(inode)->i_data_sem);
3713 
3714 out_stop:
3715 	/*
3716 	 * If this was a simple ftruncate() and the file will remain alive,
3717 	 * then we need to clear up the orphan record which we created above.
3718 	 * However, if this was a real unlink then we were called by
3719 	 * ext4_delete_inode(), and we allow that function to clean up the
3720 	 * orphan info for us.
3721 	 */
3722 	if (inode->i_nlink)
3723 		ext4_orphan_del(handle, inode);
3724 
3725 	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
3726 	ext4_mark_inode_dirty(handle, inode);
3727 	ext4_journal_stop(handle);
3728 }
3729 
3730 static void ext4_falloc_update_inode(struct inode *inode,
3731 				int mode, loff_t new_size, int update_ctime)
3732 {
3733 	struct timespec now;
3734 
3735 	if (update_ctime) {
3736 		now = current_fs_time(inode->i_sb);
3737 		if (!timespec_equal(&inode->i_ctime, &now))
3738 			inode->i_ctime = now;
3739 	}
3740 	/*
3741 	 * Update only when preallocation was requested beyond
3742 	 * the file size.
3743 	 */
3744 	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
3745 		if (new_size > i_size_read(inode))
3746 			i_size_write(inode, new_size);
3747 		if (new_size > EXT4_I(inode)->i_disksize)
3748 			ext4_update_i_disksize(inode, new_size);
3749 	} else {
3750 		/*
3751 		 * Mark that we allocate beyond EOF so the subsequent truncate
3752 		 * can proceed even if the new size is the same as i_size.
3753 		 */
3754 		if (new_size > i_size_read(inode))
3755 			ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3756 	}
3757 
3758 }
3759 
3760 /*
3761  * preallocate space for a file. This implements ext4's fallocate file
3762  * operation, which gets called from sys_fallocate system call.
3763  * For block-mapped files, posix_fallocate should fall back to the method
3764  * of writing zeroes to the required new blocks (the same behavior which is
3765  * expected for file systems which do not support fallocate() system call).
3766  */
3767 long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
3768 {
3769 	struct inode *inode = file->f_path.dentry->d_inode;
3770 	handle_t *handle;
3771 	loff_t new_size;
3772 	unsigned int max_blocks;
3773 	int ret = 0;
3774 	int ret2 = 0;
3775 	int retries = 0;
3776 	struct ext4_map_blocks map;
3777 	unsigned int credits, blkbits = inode->i_blkbits;
3778 
3779 	/*
3780 	 * currently supporting (pre)allocate mode for extent-based
3781 	 * files _only_
3782 	 */
3783 	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3784 		return -EOPNOTSUPP;
3785 
3786 	/* Return error if mode is not supported */
3787 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
3788 		return -EOPNOTSUPP;
3789 
3790 	if (mode & FALLOC_FL_PUNCH_HOLE)
3791 		return ext4_punch_hole(file, offset, len);
3792 
3793 	trace_ext4_fallocate_enter(inode, offset, len, mode);
3794 	map.m_lblk = offset >> blkbits;
3795 	/*
3796 	 * We can't just convert len to max_blocks because
3797 	 * If blocksize = 4096 offset = 3072 and len = 2048
3798 	 */
3799 	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
3800 		- map.m_lblk;
3801 	/*
3802 	 * credits to insert 1 extent into extent tree
3803 	 */
3804 	credits = ext4_chunk_trans_blocks(inode, max_blocks);
3805 	mutex_lock(&inode->i_mutex);
3806 	ret = inode_newsize_ok(inode, (len + offset));
3807 	if (ret) {
3808 		mutex_unlock(&inode->i_mutex);
3809 		trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
3810 		return ret;
3811 	}
3812 retry:
3813 	while (ret >= 0 && ret < max_blocks) {
3814 		map.m_lblk = map.m_lblk + ret;
3815 		map.m_len = max_blocks = max_blocks - ret;
3816 		handle = ext4_journal_start(inode, credits);
3817 		if (IS_ERR(handle)) {
3818 			ret = PTR_ERR(handle);
3819 			break;
3820 		}
3821 		ret = ext4_map_blocks(handle, inode, &map,
3822 				      EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
3823 				      EXT4_GET_BLOCKS_NO_NORMALIZE);
3824 		if (ret <= 0) {
3825 #ifdef EXT4FS_DEBUG
3826 			WARN_ON(ret <= 0);
3827 			printk(KERN_ERR "%s: ext4_ext_map_blocks "
3828 				    "returned error inode#%lu, block=%u, "
3829 				    "max_blocks=%u", __func__,
3830 				    inode->i_ino, map.m_lblk, max_blocks);
3831 #endif
3832 			ext4_mark_inode_dirty(handle, inode);
3833 			ret2 = ext4_journal_stop(handle);
3834 			break;
3835 		}
3836 		if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
3837 						blkbits) >> blkbits))
3838 			new_size = offset + len;
3839 		else
3840 			new_size = ((loff_t) map.m_lblk + ret) << blkbits;
3841 
3842 		ext4_falloc_update_inode(inode, mode, new_size,
3843 					 (map.m_flags & EXT4_MAP_NEW));
3844 		ext4_mark_inode_dirty(handle, inode);
3845 		ret2 = ext4_journal_stop(handle);
3846 		if (ret2)
3847 			break;
3848 	}
3849 	if (ret == -ENOSPC &&
3850 			ext4_should_retry_alloc(inode->i_sb, &retries)) {
3851 		ret = 0;
3852 		goto retry;
3853 	}
3854 	mutex_unlock(&inode->i_mutex);
3855 	trace_ext4_fallocate_exit(inode, offset, max_blocks,
3856 				ret > 0 ? ret2 : ret);
3857 	return ret > 0 ? ret2 : ret;
3858 }
3859 
3860 /*
3861  * This function convert a range of blocks to written extents
3862  * The caller of this function will pass the start offset and the size.
3863  * all unwritten extents within this range will be converted to
3864  * written extents.
3865  *
3866  * This function is called from the direct IO end io call back
3867  * function, to convert the fallocated extents after IO is completed.
3868  * Returns 0 on success.
3869  */
3870 int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3871 				    ssize_t len)
3872 {
3873 	handle_t *handle;
3874 	unsigned int max_blocks;
3875 	int ret = 0;
3876 	int ret2 = 0;
3877 	struct ext4_map_blocks map;
3878 	unsigned int credits, blkbits = inode->i_blkbits;
3879 
3880 	map.m_lblk = offset >> blkbits;
3881 	/*
3882 	 * We can't just convert len to max_blocks because
3883 	 * If blocksize = 4096 offset = 3072 and len = 2048
3884 	 */
3885 	max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
3886 		      map.m_lblk);
3887 	/*
3888 	 * credits to insert 1 extent into extent tree
3889 	 */
3890 	credits = ext4_chunk_trans_blocks(inode, max_blocks);
3891 	while (ret >= 0 && ret < max_blocks) {
3892 		map.m_lblk += ret;
3893 		map.m_len = (max_blocks -= ret);
3894 		handle = ext4_journal_start(inode, credits);
3895 		if (IS_ERR(handle)) {
3896 			ret = PTR_ERR(handle);
3897 			break;
3898 		}
3899 		ret = ext4_map_blocks(handle, inode, &map,
3900 				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
3901 		if (ret <= 0) {
3902 			WARN_ON(ret <= 0);
3903 			printk(KERN_ERR "%s: ext4_ext_map_blocks "
3904 				    "returned error inode#%lu, block=%u, "
3905 				    "max_blocks=%u", __func__,
3906 				    inode->i_ino, map.m_lblk, map.m_len);
3907 		}
3908 		ext4_mark_inode_dirty(handle, inode);
3909 		ret2 = ext4_journal_stop(handle);
3910 		if (ret <= 0 || ret2 )
3911 			break;
3912 	}
3913 	return ret > 0 ? ret2 : ret;
3914 }
3915 
3916 /*
3917  * Callback function called for each extent to gather FIEMAP information.
3918  */
3919 static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next,
3920 		       struct ext4_ext_cache *newex, struct ext4_extent *ex,
3921 		       void *data)
3922 {
3923 	__u64	logical;
3924 	__u64	physical;
3925 	__u64	length;
3926 	__u32	flags = 0;
3927 	int		ret = 0;
3928 	struct fiemap_extent_info *fieinfo = data;
3929 	unsigned char blksize_bits;
3930 
3931 	blksize_bits = inode->i_sb->s_blocksize_bits;
3932 	logical = (__u64)newex->ec_block << blksize_bits;
3933 
3934 	if (newex->ec_start == 0) {
3935 		/*
3936 		 * No extent in extent-tree contains block @newex->ec_start,
3937 		 * then the block may stay in 1)a hole or 2)delayed-extent.
3938 		 *
3939 		 * Holes or delayed-extents are processed as follows.
3940 		 * 1. lookup dirty pages with specified range in pagecache.
3941 		 *    If no page is got, then there is no delayed-extent and
3942 		 *    return with EXT_CONTINUE.
3943 		 * 2. find the 1st mapped buffer,
3944 		 * 3. check if the mapped buffer is both in the request range
3945 		 *    and a delayed buffer. If not, there is no delayed-extent,
3946 		 *    then return.
3947 		 * 4. a delayed-extent is found, the extent will be collected.
3948 		 */
3949 		ext4_lblk_t	end = 0;
3950 		pgoff_t		last_offset;
3951 		pgoff_t		offset;
3952 		pgoff_t		index;
3953 		pgoff_t		start_index = 0;
3954 		struct page	**pages = NULL;
3955 		struct buffer_head *bh = NULL;
3956 		struct buffer_head *head = NULL;
3957 		unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *);
3958 
3959 		pages = kmalloc(PAGE_SIZE, GFP_KERNEL);
3960 		if (pages == NULL)
3961 			return -ENOMEM;
3962 
3963 		offset = logical >> PAGE_SHIFT;
3964 repeat:
3965 		last_offset = offset;
3966 		head = NULL;
3967 		ret = find_get_pages_tag(inode->i_mapping, &offset,
3968 					PAGECACHE_TAG_DIRTY, nr_pages, pages);
3969 
3970 		if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
3971 			/* First time, try to find a mapped buffer. */
3972 			if (ret == 0) {
3973 out:
3974 				for (index = 0; index < ret; index++)
3975 					page_cache_release(pages[index]);
3976 				/* just a hole. */
3977 				kfree(pages);
3978 				return EXT_CONTINUE;
3979 			}
3980 			index = 0;
3981 
3982 next_page:
3983 			/* Try to find the 1st mapped buffer. */
3984 			end = ((__u64)pages[index]->index << PAGE_SHIFT) >>
3985 				  blksize_bits;
3986 			if (!page_has_buffers(pages[index]))
3987 				goto out;
3988 			head = page_buffers(pages[index]);
3989 			if (!head)
3990 				goto out;
3991 
3992 			index++;
3993 			bh = head;
3994 			do {
3995 				if (end >= newex->ec_block +
3996 					newex->ec_len)
3997 					/* The buffer is out of
3998 					 * the request range.
3999 					 */
4000 					goto out;
4001 
4002 				if (buffer_mapped(bh) &&
4003 				    end >= newex->ec_block) {
4004 					start_index = index - 1;
4005 					/* get the 1st mapped buffer. */
4006 					goto found_mapped_buffer;
4007 				}
4008 
4009 				bh = bh->b_this_page;
4010 				end++;
4011 			} while (bh != head);
4012 
4013 			/* No mapped buffer in the range found in this page,
4014 			 * We need to look up next page.
4015 			 */
4016 			if (index >= ret) {
4017 				/* There is no page left, but we need to limit
4018 				 * newex->ec_len.
4019 				 */
4020 				newex->ec_len = end - newex->ec_block;
4021 				goto out;
4022 			}
4023 			goto next_page;
4024 		} else {
4025 			/*Find contiguous delayed buffers. */
4026 			if (ret > 0 && pages[0]->index == last_offset)
4027 				head = page_buffers(pages[0]);
4028 			bh = head;
4029 			index = 1;
4030 			start_index = 0;
4031 		}
4032 
4033 found_mapped_buffer:
4034 		if (bh != NULL && buffer_delay(bh)) {
4035 			/* 1st or contiguous delayed buffer found. */
4036 			if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
4037 				/*
4038 				 * 1st delayed buffer found, record
4039 				 * the start of extent.
4040 				 */
4041 				flags |= FIEMAP_EXTENT_DELALLOC;
4042 				newex->ec_block = end;
4043 				logical = (__u64)end << blksize_bits;
4044 			}
4045 			/* Find contiguous delayed buffers. */
4046 			do {
4047 				if (!buffer_delay(bh))
4048 					goto found_delayed_extent;
4049 				bh = bh->b_this_page;
4050 				end++;
4051 			} while (bh != head);
4052 
4053 			for (; index < ret; index++) {
4054 				if (!page_has_buffers(pages[index])) {
4055 					bh = NULL;
4056 					break;
4057 				}
4058 				head = page_buffers(pages[index]);
4059 				if (!head) {
4060 					bh = NULL;
4061 					break;
4062 				}
4063 
4064 				if (pages[index]->index !=
4065 				    pages[start_index]->index + index
4066 				    - start_index) {
4067 					/* Blocks are not contiguous. */
4068 					bh = NULL;
4069 					break;
4070 				}
4071 				bh = head;
4072 				do {
4073 					if (!buffer_delay(bh))
4074 						/* Delayed-extent ends. */
4075 						goto found_delayed_extent;
4076 					bh = bh->b_this_page;
4077 					end++;
4078 				} while (bh != head);
4079 			}
4080 		} else if (!(flags & FIEMAP_EXTENT_DELALLOC))
4081 			/* a hole found. */
4082 			goto out;
4083 
4084 found_delayed_extent:
4085 		newex->ec_len = min(end - newex->ec_block,
4086 						(ext4_lblk_t)EXT_INIT_MAX_LEN);
4087 		if (ret == nr_pages && bh != NULL &&
4088 			newex->ec_len < EXT_INIT_MAX_LEN &&
4089 			buffer_delay(bh)) {
4090 			/* Have not collected an extent and continue. */
4091 			for (index = 0; index < ret; index++)
4092 				page_cache_release(pages[index]);
4093 			goto repeat;
4094 		}
4095 
4096 		for (index = 0; index < ret; index++)
4097 			page_cache_release(pages[index]);
4098 		kfree(pages);
4099 	}
4100 
4101 	physical = (__u64)newex->ec_start << blksize_bits;
4102 	length =   (__u64)newex->ec_len << blksize_bits;
4103 
4104 	if (ex && ext4_ext_is_uninitialized(ex))
4105 		flags |= FIEMAP_EXTENT_UNWRITTEN;
4106 
4107 	if (next == EXT_MAX_BLOCKS)
4108 		flags |= FIEMAP_EXTENT_LAST;
4109 
4110 	ret = fiemap_fill_next_extent(fieinfo, logical, physical,
4111 					length, flags);
4112 	if (ret < 0)
4113 		return ret;
4114 	if (ret == 1)
4115 		return EXT_BREAK;
4116 	return EXT_CONTINUE;
4117 }
4118 
4119 /* fiemap flags we can handle specified here */
4120 #define EXT4_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
4121 
4122 static int ext4_xattr_fiemap(struct inode *inode,
4123 				struct fiemap_extent_info *fieinfo)
4124 {
4125 	__u64 physical = 0;
4126 	__u64 length;
4127 	__u32 flags = FIEMAP_EXTENT_LAST;
4128 	int blockbits = inode->i_sb->s_blocksize_bits;
4129 	int error = 0;
4130 
4131 	/* in-inode? */
4132 	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
4133 		struct ext4_iloc iloc;
4134 		int offset;	/* offset of xattr in inode */
4135 
4136 		error = ext4_get_inode_loc(inode, &iloc);
4137 		if (error)
4138 			return error;
4139 		physical = iloc.bh->b_blocknr << blockbits;
4140 		offset = EXT4_GOOD_OLD_INODE_SIZE +
4141 				EXT4_I(inode)->i_extra_isize;
4142 		physical += offset;
4143 		length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
4144 		flags |= FIEMAP_EXTENT_DATA_INLINE;
4145 		brelse(iloc.bh);
4146 	} else { /* external block */
4147 		physical = EXT4_I(inode)->i_file_acl << blockbits;
4148 		length = inode->i_sb->s_blocksize;
4149 	}
4150 
4151 	if (physical)
4152 		error = fiemap_fill_next_extent(fieinfo, 0, physical,
4153 						length, flags);
4154 	return (error < 0 ? error : 0);
4155 }
4156 
4157 /*
4158  * ext4_ext_punch_hole
4159  *
4160  * Punches a hole of "length" bytes in a file starting
4161  * at byte "offset"
4162  *
4163  * @inode:  The inode of the file to punch a hole in
4164  * @offset: The starting byte offset of the hole
4165  * @length: The length of the hole
4166  *
4167  * Returns the number of blocks removed or negative on err
4168  */
4169 int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4170 {
4171 	struct inode *inode = file->f_path.dentry->d_inode;
4172 	struct super_block *sb = inode->i_sb;
4173 	struct ext4_ext_cache cache_ex;
4174 	ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks;
4175 	struct address_space *mapping = inode->i_mapping;
4176 	struct ext4_map_blocks map;
4177 	handle_t *handle;
4178 	loff_t first_page, last_page, page_len;
4179 	loff_t first_page_offset, last_page_offset;
4180 	int ret, credits, blocks_released, err = 0;
4181 
4182 	/* No need to punch hole beyond i_size */
4183 	if (offset >= inode->i_size)
4184 		return 0;
4185 
4186 	/*
4187 	 * If the hole extends beyond i_size, set the hole
4188 	 * to end after the page that contains i_size
4189 	 */
4190 	if (offset + length > inode->i_size) {
4191 		length = inode->i_size +
4192 		   PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
4193 		   offset;
4194 	}
4195 
4196 	first_block = (offset + sb->s_blocksize - 1) >>
4197 		EXT4_BLOCK_SIZE_BITS(sb);
4198 	last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
4199 
4200 	first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
4201 	last_page = (offset + length) >> PAGE_CACHE_SHIFT;
4202 
4203 	first_page_offset = first_page << PAGE_CACHE_SHIFT;
4204 	last_page_offset = last_page << PAGE_CACHE_SHIFT;
4205 
4206 	/*
4207 	 * Write out all dirty pages to avoid race conditions
4208 	 * Then release them.
4209 	 */
4210 	if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
4211 		err = filemap_write_and_wait_range(mapping,
4212 			offset, offset + length - 1);
4213 
4214 		if (err)
4215 			return err;
4216 	}
4217 
4218 	/* Now release the pages */
4219 	if (last_page_offset > first_page_offset) {
4220 		truncate_inode_pages_range(mapping, first_page_offset,
4221 					   last_page_offset-1);
4222 	}
4223 
4224 	/* finish any pending end_io work */
4225 	ext4_flush_completed_IO(inode);
4226 
4227 	credits = ext4_writepage_trans_blocks(inode);
4228 	handle = ext4_journal_start(inode, credits);
4229 	if (IS_ERR(handle))
4230 		return PTR_ERR(handle);
4231 
4232 	err = ext4_orphan_add(handle, inode);
4233 	if (err)
4234 		goto out;
4235 
4236 	/*
4237 	 * Now we need to zero out the non-page-aligned data in the
4238 	 * pages at the start and tail of the hole, and unmap the buffer
4239 	 * heads for the block aligned regions of the page that were
4240 	 * completely zeroed.
4241 	 */
4242 	if (first_page > last_page) {
4243 		/*
4244 		 * If the file space being truncated is contained within a page
4245 		 * just zero out and unmap the middle of that page
4246 		 */
4247 		err = ext4_discard_partial_page_buffers(handle,
4248 			mapping, offset, length, 0);
4249 
4250 		if (err)
4251 			goto out;
4252 	} else {
4253 		/*
4254 		 * zero out and unmap the partial page that contains
4255 		 * the start of the hole
4256 		 */
4257 		page_len  = first_page_offset - offset;
4258 		if (page_len > 0) {
4259 			err = ext4_discard_partial_page_buffers(handle, mapping,
4260 						   offset, page_len, 0);
4261 			if (err)
4262 				goto out;
4263 		}
4264 
4265 		/*
4266 		 * zero out and unmap the partial page that contains
4267 		 * the end of the hole
4268 		 */
4269 		page_len = offset + length - last_page_offset;
4270 		if (page_len > 0) {
4271 			err = ext4_discard_partial_page_buffers(handle, mapping,
4272 					last_page_offset, page_len, 0);
4273 			if (err)
4274 				goto out;
4275 		}
4276 	}
4277 
4278 
4279 	/*
4280 	 * If i_size is contained in the last page, we need to
4281 	 * unmap and zero the partial page after i_size
4282 	 */
4283 	if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
4284 	   inode->i_size % PAGE_CACHE_SIZE != 0) {
4285 
4286 		page_len = PAGE_CACHE_SIZE -
4287 			(inode->i_size & (PAGE_CACHE_SIZE - 1));
4288 
4289 		if (page_len > 0) {
4290 			err = ext4_discard_partial_page_buffers(handle,
4291 			  mapping, inode->i_size, page_len, 0);
4292 
4293 			if (err)
4294 				goto out;
4295 		}
4296 	}
4297 
4298 	/* If there are no blocks to remove, return now */
4299 	if (first_block >= last_block)
4300 		goto out;
4301 
4302 	down_write(&EXT4_I(inode)->i_data_sem);
4303 	ext4_ext_invalidate_cache(inode);
4304 	ext4_discard_preallocations(inode);
4305 
4306 	/*
4307 	 * Loop over all the blocks and identify blocks
4308 	 * that need to be punched out
4309 	 */
4310 	iblock = first_block;
4311 	blocks_released = 0;
4312 	while (iblock < last_block) {
4313 		max_blocks = last_block - iblock;
4314 		num_blocks = 1;
4315 		memset(&map, 0, sizeof(map));
4316 		map.m_lblk = iblock;
4317 		map.m_len = max_blocks;
4318 		ret = ext4_ext_map_blocks(handle, inode, &map,
4319 			EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
4320 
4321 		if (ret > 0) {
4322 			blocks_released += ret;
4323 			num_blocks = ret;
4324 		} else if (ret == 0) {
4325 			/*
4326 			 * If map blocks could not find the block,
4327 			 * then it is in a hole.  If the hole was
4328 			 * not already cached, then map blocks should
4329 			 * put it in the cache.  So we can get the hole
4330 			 * out of the cache
4331 			 */
4332 			memset(&cache_ex, 0, sizeof(cache_ex));
4333 			if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) &&
4334 				!cache_ex.ec_start) {
4335 
4336 				/* The hole is cached */
4337 				num_blocks = cache_ex.ec_block +
4338 				cache_ex.ec_len - iblock;
4339 
4340 			} else {
4341 				/* The block could not be identified */
4342 				err = -EIO;
4343 				break;
4344 			}
4345 		} else {
4346 			/* Map blocks error */
4347 			err = ret;
4348 			break;
4349 		}
4350 
4351 		if (num_blocks == 0) {
4352 			/* This condition should never happen */
4353 			ext_debug("Block lookup failed");
4354 			err = -EIO;
4355 			break;
4356 		}
4357 
4358 		iblock += num_blocks;
4359 	}
4360 
4361 	if (blocks_released > 0) {
4362 		ext4_ext_invalidate_cache(inode);
4363 		ext4_discard_preallocations(inode);
4364 	}
4365 
4366 	if (IS_SYNC(inode))
4367 		ext4_handle_sync(handle);
4368 
4369 	up_write(&EXT4_I(inode)->i_data_sem);
4370 
4371 out:
4372 	ext4_orphan_del(handle, inode);
4373 	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4374 	ext4_mark_inode_dirty(handle, inode);
4375 	ext4_journal_stop(handle);
4376 	return err;
4377 }
4378 int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4379 		__u64 start, __u64 len)
4380 {
4381 	ext4_lblk_t start_blk;
4382 	int error = 0;
4383 
4384 	/* fallback to generic here if not in extents fmt */
4385 	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
4386 		return generic_block_fiemap(inode, fieinfo, start, len,
4387 			ext4_get_block);
4388 
4389 	if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
4390 		return -EBADR;
4391 
4392 	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
4393 		error = ext4_xattr_fiemap(inode, fieinfo);
4394 	} else {
4395 		ext4_lblk_t len_blks;
4396 		__u64 last_blk;
4397 
4398 		start_blk = start >> inode->i_sb->s_blocksize_bits;
4399 		last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
4400 		if (last_blk >= EXT_MAX_BLOCKS)
4401 			last_blk = EXT_MAX_BLOCKS-1;
4402 		len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
4403 
4404 		/*
4405 		 * Walk the extent tree gathering extent information.
4406 		 * ext4_ext_fiemap_cb will push extents back to user.
4407 		 */
4408 		error = ext4_ext_walk_space(inode, start_blk, len_blks,
4409 					  ext4_ext_fiemap_cb, fieinfo);
4410 	}
4411 
4412 	return error;
4413 }
4414