xref: /openbmc/linux/fs/ext4/extents.c (revision 47ea3bb59cf47298b1cbb4d7bef056b3afea0879)
1 /*
2  * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
3  * Written by Alex Tomas <alex@clusterfs.com>
4  *
5  * Architecture independence:
6  *   Copyright (c) 2005, Bull S.A.
7  *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License version 2 as
11  * published by the Free Software Foundation.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307, USA.
21  */
22 
23 /*
24  * Extents support for EXT4
25  *
26  * TODO:
27  *   - ext4*_error() should be used in some situations
28  *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
29  *   - smart tree reduction
30  */
31 
32 #include <linux/module.h>
33 #include <linux/fs.h>
34 #include <linux/time.h>
35 #include <linux/jbd2.h>
36 #include <linux/highuid.h>
37 #include <linux/pagemap.h>
38 #include <linux/quotaops.h>
39 #include <linux/string.h>
40 #include <linux/slab.h>
41 #include <linux/falloc.h>
42 #include <asm/uaccess.h>
43 #include <linux/fiemap.h>
44 #include "ext4_jbd2.h"
45 #include "ext4_extents.h"
46 
47 #include <trace/events/ext4.h>
48 
49 static int ext4_ext_truncate_extend_restart(handle_t *handle,
50 					    struct inode *inode,
51 					    int needed)
52 {
53 	int err;
54 
55 	if (!ext4_handle_valid(handle))
56 		return 0;
57 	if (handle->h_buffer_credits > needed)
58 		return 0;
59 	err = ext4_journal_extend(handle, needed);
60 	if (err <= 0)
61 		return err;
62 	err = ext4_truncate_restart_trans(handle, inode, needed);
63 	if (err == 0)
64 		err = -EAGAIN;
65 
66 	return err;
67 }
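
/*
 * A worked example of the flow above, with hypothetical numbers.
 * Suppose the caller still needs 5 journal credits but the handle has
 * only 3 left (h_buffer_credits <= needed), so we call
 * ext4_journal_extend(handle, 5):
 *
 *   err == 0:  the transaction had room; keep using this handle.
 *   err  < 0:  hard error; propagate it.
 *   err  > 0:  the transaction cannot be extended, so the helper
 *              restarts it via ext4_truncate_restart_trans() and
 *              returns -EAGAIN, telling the caller to re-read and
 *              re-validate its extent path, since the tree may have
 *              changed while the handle was not held.
 */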
68 
69 /*
70  * could return:
71  *  - EROFS
72  *  - ENOMEM
73  */
74 static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
75 				struct ext4_ext_path *path)
76 {
77 	if (path->p_bh) {
78 		/* path points to block */
79 		return ext4_journal_get_write_access(handle, path->p_bh);
80 	}
81 	/* path points to leaf/index in inode body */
82 	/* we use in-core data, no need to protect them */
83 	return 0;
84 }
85 
86 /*
87  * could return:
88  *  - EROFS
89  *  - ENOMEM
90  *  - EIO
91  */
92 static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
93 				struct ext4_ext_path *path)
94 {
95 	int err;
96 	if (path->p_bh) {
97 		/* path points to block */
98 		err = ext4_handle_dirty_metadata(handle, inode, path->p_bh);
99 	} else {
100 		/* path points to leaf/index in inode body */
101 		err = ext4_mark_inode_dirty(handle, inode);
102 	}
103 	return err;
104 }
105 
106 static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
107 			      struct ext4_ext_path *path,
108 			      ext4_lblk_t block)
109 {
110 	struct ext4_inode_info *ei = EXT4_I(inode);
111 	ext4_fsblk_t bg_start;
112 	ext4_fsblk_t last_block;
113 	ext4_grpblk_t colour;
114 	ext4_group_t block_group;
115 	int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
116 	int depth;
117 
118 	if (path) {
119 		struct ext4_extent *ex;
120 		depth = path->p_depth;
121 
122 		/*
123 		 * Try to predict block placement assuming that we are
124 		 * filling in a file which will eventually be
125 		 * non-sparse --- i.e., in the case of libbfd writing
126 		 * ELF object sections out-of-order but in a way that
127 		 * eventually results in a contiguous object or
128 		 * executable file, or some database extending a table
129 		 * space file.  However, this is actually somewhat
130 		 * non-ideal if we are writing a sparse file such as
131 		 * qemu or KVM writing a raw image file that is going
132 		 * to stay fairly sparse, since it will end up
133 		 * fragmenting the file system's free space.  Maybe we
134 		 * should have some heuristics or some way to allow
135 		 * userspace to pass a hint to the file system,
136 		 * especially if the latter case turns out to be
137 		 * common.
138 		 */
139 		ex = path[depth].p_ext;
140 		if (ex) {
141 			ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
142 			ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
143 
144 			if (block > ext_block)
145 				return ext_pblk + (block - ext_block);
146 			else
147 				return ext_pblk - (ext_block - block);
148 		}
149 
150 		/* it looks like index is empty;
151 		 * try to find starting block from index itself */
152 		if (path[depth].p_bh)
153 			return path[depth].p_bh->b_blocknr;
154 	}
155 
156 	/* OK. use inode's group */
157 	block_group = ei->i_block_group;
158 	if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
159 		/*
160 		 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
161 		 * block groups per flexgroup, reserve the first block
162 		 * group for directories and special files.  Regular
163 		 * files will start at the second block group.  This
164 		 * tends to speed up directory access and improves
165 		 * fsck times.
166 		 */
167 		block_group &= ~(flex_size-1);
168 		if (S_ISREG(inode->i_mode))
169 			block_group++;
170 	}
171 	bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
172 	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
173 
174 	/*
175 	 * If we are doing delayed allocation, we don't need to take
176 	 * colour into account.
177 	 */
178 	if (test_opt(inode->i_sb, DELALLOC))
179 		return bg_start;
180 
181 	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
182 		colour = (current->pid % 16) *
183 			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
184 	else
185 		colour = (current->pid % 16) * ((last_block - bg_start) / 16);
186 	return bg_start + colour + block;
187 }
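
/*
 * A worked example of the heuristic above (hypothetical numbers):
 * assume the path ends at an extent mapping logical block 100 to
 * physical block 5000.
 *
 *   requested block 130 (to the right): goal = 5000 + (130 - 100) = 5030
 *   requested block  90 (to the left):  goal = 5000 - (100 -  90) = 4990
 *
 * i.e. the goal preserves the logical-to-physical offset of the
 * neighbouring extent, so a later allocation has a chance of ending up
 * physically contiguous with it.
 */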
188 
189 /*
190  * Allocation for a metadata block
191  */
192 static ext4_fsblk_t
193 ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
194 			struct ext4_ext_path *path,
195 			struct ext4_extent *ex, int *err)
196 {
197 	ext4_fsblk_t goal, newblock;
198 
199 	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
200 	newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err);
201 	return newblock;
202 }
203 
204 static inline int ext4_ext_space_block(struct inode *inode, int check)
205 {
206 	int size;
207 
208 	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
209 			/ sizeof(struct ext4_extent);
210 	if (!check) {
211 #ifdef AGGRESSIVE_TEST
212 		if (size > 6)
213 			size = 6;
214 #endif
215 	}
216 	return size;
217 }
218 
219 static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
220 {
221 	int size;
222 
223 	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
224 			/ sizeof(struct ext4_extent_idx);
225 	if (!check) {
226 #ifdef AGGRESSIVE_TEST
227 		if (size > 5)
228 			size = 5;
229 #endif
230 	}
231 	return size;
232 }
233 
234 static inline int ext4_ext_space_root(struct inode *inode, int check)
235 {
236 	int size;
237 
238 	size = sizeof(EXT4_I(inode)->i_data);
239 	size -= sizeof(struct ext4_extent_header);
240 	size /= sizeof(struct ext4_extent);
241 	if (!check) {
242 #ifdef AGGRESSIVE_TEST
243 		if (size > 3)
244 			size = 3;
245 #endif
246 	}
247 	return size;
248 }
249 
250 static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
251 {
252 	int size;
253 
254 	size = sizeof(EXT4_I(inode)->i_data);
255 	size -= sizeof(struct ext4_extent_header);
256 	size /= sizeof(struct ext4_extent_idx);
257 	if (!check) {
258 #ifdef AGGRESSIVE_TEST
259 		if (size > 4)
260 			size = 4;
261 #endif
262 	}
263 	return size;
264 }
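
/*
 * A quick capacity check for the helpers above, assuming the on-disk
 * layout of this era: the extent header, struct ext4_extent and
 * struct ext4_extent_idx are 12 bytes each, and the in-inode i_data
 * area is 60 bytes.
 *
 *   4096-byte block:  (4096 - 12) / 12 = 340 extents (or indexes)
 *   1024-byte block:  (1024 - 12) / 12 =  84 extents (or indexes)
 *   inode root:       (  60 - 12) / 12 =   4 extents (or indexes)
 *
 * AGGRESSIVE_TEST shrinks these to tiny values (6/5/3/4) purely to
 * exercise tree growth and splitting with small files.
 */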
265 
266 /*
267  * Calculate the number of metadata blocks needed
268  * to allocate @blocks
269  * Worst case is one block per extent
270  */
271 int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
272 {
273 	struct ext4_inode_info *ei = EXT4_I(inode);
274 	int idxs, num = 0;
275 
276 	idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
277 		/ sizeof(struct ext4_extent_idx));
278 
279 	/*
280 	 * If the new delayed allocation block is contiguous with the
281 	 * previous da block, it can share index blocks with the
282 	 * previous block, so we only need to allocate a new index
283 	 * block every idxs leaf blocks.  At idxs**2 blocks, we need
284 	 * an additional index block, and at idxs**3 blocks, yet
285 	 * another index block.
286 	 */
287 	if (ei->i_da_metadata_calc_len &&
288 	    ei->i_da_metadata_calc_last_lblock+1 == lblock) {
289 		if ((ei->i_da_metadata_calc_len % idxs) == 0)
290 			num++;
291 		if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
292 			num++;
293 		if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
294 			num++;
295 			ei->i_da_metadata_calc_len = 0;
296 		} else
297 			ei->i_da_metadata_calc_len++;
298 		ei->i_da_metadata_calc_last_lblock++;
299 		return num;
300 	}
301 
302 	/*
303 	 * In the worst case we need a new set of index blocks at
304 	 * every level of the inode's extent tree.
305 	 */
306 	ei->i_da_metadata_calc_len = 1;
307 	ei->i_da_metadata_calc_last_lblock = lblock;
308 	return ext_depth(inode) + 1;
309 }
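
/*
 * A worked example of the estimate above, assuming 4096-byte blocks,
 * so idxs = (4096 - 12) / 12 = 340: while delayed-allocation blocks
 * stay contiguous, roughly every 340th appended block is charged one
 * new leaf block, every 340^2 = 115600th an additional index block,
 * and every 340^3th yet another level of index.  A non-contiguous
 * block resets the run and charges the full ext_depth(inode) + 1
 * worst case again.
 */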
310 
311 static int
312 ext4_ext_max_entries(struct inode *inode, int depth)
313 {
314 	int max;
315 
316 	if (depth == ext_depth(inode)) {
317 		if (depth == 0)
318 			max = ext4_ext_space_root(inode, 1);
319 		else
320 			max = ext4_ext_space_root_idx(inode, 1);
321 	} else {
322 		if (depth == 0)
323 			max = ext4_ext_space_block(inode, 1);
324 		else
325 			max = ext4_ext_space_block_idx(inode, 1);
326 	}
327 
328 	return max;
329 }
330 
331 static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
332 {
333 	ext4_fsblk_t block = ext4_ext_pblock(ext);
334 	int len = ext4_ext_get_actual_len(ext);
335 
336 	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
337 }
338 
339 static int ext4_valid_extent_idx(struct inode *inode,
340 				struct ext4_extent_idx *ext_idx)
341 {
342 	ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
343 
344 	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
345 }
346 
347 static int ext4_valid_extent_entries(struct inode *inode,
348 				struct ext4_extent_header *eh,
349 				int depth)
350 {
351 	struct ext4_extent *ext;
352 	struct ext4_extent_idx *ext_idx;
353 	unsigned short entries;
354 	if (eh->eh_entries == 0)
355 		return 1;
356 
357 	entries = le16_to_cpu(eh->eh_entries);
358 
359 	if (depth == 0) {
360 		/* leaf entries */
361 		ext = EXT_FIRST_EXTENT(eh);
362 		while (entries) {
363 			if (!ext4_valid_extent(inode, ext))
364 				return 0;
365 			ext++;
366 			entries--;
367 		}
368 	} else {
369 		ext_idx = EXT_FIRST_INDEX(eh);
370 		while (entries) {
371 			if (!ext4_valid_extent_idx(inode, ext_idx))
372 				return 0;
373 			ext_idx++;
374 			entries--;
375 		}
376 	}
377 	return 1;
378 }
379 
380 static int __ext4_ext_check(const char *function, unsigned int line,
381 			    struct inode *inode, struct ext4_extent_header *eh,
382 			    int depth)
383 {
384 	const char *error_msg;
385 	int max = 0;
386 
387 	if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
388 		error_msg = "invalid magic";
389 		goto corrupted;
390 	}
391 	if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
392 		error_msg = "unexpected eh_depth";
393 		goto corrupted;
394 	}
395 	if (unlikely(eh->eh_max == 0)) {
396 		error_msg = "invalid eh_max";
397 		goto corrupted;
398 	}
399 	max = ext4_ext_max_entries(inode, depth);
400 	if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
401 		error_msg = "too large eh_max";
402 		goto corrupted;
403 	}
404 	if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
405 		error_msg = "invalid eh_entries";
406 		goto corrupted;
407 	}
408 	if (!ext4_valid_extent_entries(inode, eh, depth)) {
409 		error_msg = "invalid extent entries";
410 		goto corrupted;
411 	}
412 	return 0;
413 
414 corrupted:
415 	ext4_error_inode(inode, function, line, 0,
416 			"bad header/extent: %s - magic %x, "
417 			"entries %u, max %u(%u), depth %u(%u)",
418 			error_msg, le16_to_cpu(eh->eh_magic),
419 			le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
420 			max, le16_to_cpu(eh->eh_depth), depth);
421 
422 	return -EIO;
423 }
424 
425 #define ext4_ext_check(inode, eh, depth)	\
426 	__ext4_ext_check(__func__, __LINE__, inode, eh, depth)
427 
428 int ext4_ext_check_inode(struct inode *inode)
429 {
430 	return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
431 }
432 
433 #ifdef EXT_DEBUG
434 static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
435 {
436 	int k, l = path->p_depth;
437 
438 	ext_debug("path:");
439 	for (k = 0; k <= l; k++, path++) {
440 		if (path->p_idx) {
441 		  ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
442 			    ext4_idx_pblock(path->p_idx));
443 		} else if (path->p_ext) {
444 			ext_debug("  %d:[%d]%d:%llu ",
445 				  le32_to_cpu(path->p_ext->ee_block),
446 				  ext4_ext_is_uninitialized(path->p_ext),
447 				  ext4_ext_get_actual_len(path->p_ext),
448 				  ext4_ext_pblock(path->p_ext));
449 		} else
450 			ext_debug("  []");
451 	}
452 	ext_debug("\n");
453 }
454 
455 static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
456 {
457 	int depth = ext_depth(inode);
458 	struct ext4_extent_header *eh;
459 	struct ext4_extent *ex;
460 	int i;
461 
462 	if (!path)
463 		return;
464 
465 	eh = path[depth].p_hdr;
466 	ex = EXT_FIRST_EXTENT(eh);
467 
468 	ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
469 
470 	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
471 		ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
472 			  ext4_ext_is_uninitialized(ex),
473 			  ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
474 	}
475 	ext_debug("\n");
476 }
477 #else
478 #define ext4_ext_show_path(inode, path)
479 #define ext4_ext_show_leaf(inode, path)
480 #endif
481 
482 void ext4_ext_drop_refs(struct ext4_ext_path *path)
483 {
484 	int depth = path->p_depth;
485 	int i;
486 
487 	for (i = 0; i <= depth; i++, path++)
488 		if (path->p_bh) {
489 			brelse(path->p_bh);
490 			path->p_bh = NULL;
491 		}
492 }
493 
494 /*
495  * ext4_ext_binsearch_idx:
496  * binary search for the closest index of the given block
497  * the header must be checked before calling this
498  */
499 static void
500 ext4_ext_binsearch_idx(struct inode *inode,
501 			struct ext4_ext_path *path, ext4_lblk_t block)
502 {
503 	struct ext4_extent_header *eh = path->p_hdr;
504 	struct ext4_extent_idx *r, *l, *m;
505 
506 
507 	ext_debug("binsearch for %u(idx):  ", block);
508 
509 	l = EXT_FIRST_INDEX(eh) + 1;
510 	r = EXT_LAST_INDEX(eh);
511 	while (l <= r) {
512 		m = l + (r - l) / 2;
513 		if (block < le32_to_cpu(m->ei_block))
514 			r = m - 1;
515 		else
516 			l = m + 1;
517 		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block),
518 				m, le32_to_cpu(m->ei_block),
519 				r, le32_to_cpu(r->ei_block));
520 	}
521 
522 	path->p_idx = l - 1;
523 	ext_debug("  -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
524 		  ext4_idx_pblock(path->p_idx));
525 
526 #ifdef CHECK_BINSEARCH
527 	{
528 		struct ext4_extent_idx *chix, *ix;
529 		int k;
530 
531 		chix = ix = EXT_FIRST_INDEX(eh);
532 		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
533 		  if (k != 0 &&
534 		      le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
535 				printk(KERN_DEBUG "k=%d, ix=0x%p, "
536 				       "first=0x%p\n", k,
537 				       ix, EXT_FIRST_INDEX(eh));
538 				printk(KERN_DEBUG "%u <= %u\n",
539 				       le32_to_cpu(ix->ei_block),
540 				       le32_to_cpu(ix[-1].ei_block));
541 			}
542 			BUG_ON(k && le32_to_cpu(ix->ei_block)
543 					   <= le32_to_cpu(ix[-1].ei_block));
544 			if (block < le32_to_cpu(ix->ei_block))
545 				break;
546 			chix = ix;
547 		}
548 		BUG_ON(chix != path->p_idx);
549 	}
550 #endif
551 
552 }
553 
554 /*
555  * ext4_ext_binsearch:
556  * binary search for closest extent of the given block
557  * the header must be checked before calling this
558  */
559 static void
560 ext4_ext_binsearch(struct inode *inode,
561 		struct ext4_ext_path *path, ext4_lblk_t block)
562 {
563 	struct ext4_extent_header *eh = path->p_hdr;
564 	struct ext4_extent *r, *l, *m;
565 
566 	if (eh->eh_entries == 0) {
567 		/*
568 		 * this leaf is empty:
569 		 * we get such a leaf in split/add case
570 		 */
571 		return;
572 	}
573 
574 	ext_debug("binsearch for %u:  ", block);
575 
576 	l = EXT_FIRST_EXTENT(eh) + 1;
577 	r = EXT_LAST_EXTENT(eh);
578 
579 	while (l <= r) {
580 		m = l + (r - l) / 2;
581 		if (block < le32_to_cpu(m->ee_block))
582 			r = m - 1;
583 		else
584 			l = m + 1;
585 		ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block),
586 				m, le32_to_cpu(m->ee_block),
587 				r, le32_to_cpu(r->ee_block));
588 	}
589 
590 	path->p_ext = l - 1;
591 	ext_debug("  -> %d:%llu:[%d]%d ",
592 			le32_to_cpu(path->p_ext->ee_block),
593 			ext4_ext_pblock(path->p_ext),
594 			ext4_ext_is_uninitialized(path->p_ext),
595 			ext4_ext_get_actual_len(path->p_ext));
596 
597 #ifdef CHECK_BINSEARCH
598 	{
599 		struct ext4_extent *chex, *ex;
600 		int k;
601 
602 		chex = ex = EXT_FIRST_EXTENT(eh);
603 		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
604 			BUG_ON(k && le32_to_cpu(ex->ee_block)
605 					  <= le32_to_cpu(ex[-1].ee_block));
606 			if (block < le32_to_cpu(ex->ee_block))
607 				break;
608 			chex = ex;
609 		}
610 		BUG_ON(chex != path->p_ext);
611 	}
612 #endif
613 
614 }
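
/*
 * The two binary searches above share the same invariant: they return
 * the rightmost entry whose starting block is <= the target.  A small
 * example with hypothetical leaf entries starting at blocks
 * {0, 100, 200}:
 *
 *   target 150:  l starts at the "100" entry, r at "200"; the loop
 *                ends with l at "200", so p_ext = l - 1 points at the
 *                "100" entry.
 *   target  50:  ends with p_ext pointing at the "0" entry.
 *   target 250:  ends with p_ext pointing at the "200" entry.
 *
 * Whether the target actually falls inside the returned extent is up
 * to the caller to check against ee_len.
 */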
615 
616 int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
617 {
618 	struct ext4_extent_header *eh;
619 
620 	eh = ext_inode_hdr(inode);
621 	eh->eh_depth = 0;
622 	eh->eh_entries = 0;
623 	eh->eh_magic = EXT4_EXT_MAGIC;
624 	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
625 	ext4_mark_inode_dirty(handle, inode);
626 	ext4_ext_invalidate_cache(inode);
627 	return 0;
628 }
629 
630 struct ext4_ext_path *
631 ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
632 					struct ext4_ext_path *path)
633 {
634 	struct ext4_extent_header *eh;
635 	struct buffer_head *bh;
636 	short int depth, i, ppos = 0, alloc = 0;
637 
638 	eh = ext_inode_hdr(inode);
639 	depth = ext_depth(inode);
640 
641 	/* account possible depth increase */
642 	if (!path) {
643 		path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
644 				GFP_NOFS);
645 		if (!path)
646 			return ERR_PTR(-ENOMEM);
647 		alloc = 1;
648 	}
649 	path[0].p_hdr = eh;
650 	path[0].p_bh = NULL;
651 
652 	i = depth;
653 	/* walk through the tree */
654 	while (i) {
655 		int need_to_validate = 0;
656 
657 		ext_debug("depth %d: num %d, max %d\n",
658 			  ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
659 
660 		ext4_ext_binsearch_idx(inode, path + ppos, block);
661 		path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
662 		path[ppos].p_depth = i;
663 		path[ppos].p_ext = NULL;
664 
665 		bh = sb_getblk(inode->i_sb, path[ppos].p_block);
666 		if (unlikely(!bh))
667 			goto err;
668 		if (!bh_uptodate_or_lock(bh)) {
669 			trace_ext4_ext_load_extent(inode, block,
670 						path[ppos].p_block);
671 			if (bh_submit_read(bh) < 0) {
672 				put_bh(bh);
673 				goto err;
674 			}
675 			/* validate the extent entries */
676 			need_to_validate = 1;
677 		}
678 		eh = ext_block_hdr(bh);
679 		ppos++;
680 		if (unlikely(ppos > depth)) {
681 			put_bh(bh);
682 			EXT4_ERROR_INODE(inode,
683 					 "ppos %d > depth %d", ppos, depth);
684 			goto err;
685 		}
686 		path[ppos].p_bh = bh;
687 		path[ppos].p_hdr = eh;
688 		i--;
689 
690 		if (need_to_validate && ext4_ext_check(inode, eh, i))
691 			goto err;
692 	}
693 
694 	path[ppos].p_depth = i;
695 	path[ppos].p_ext = NULL;
696 	path[ppos].p_idx = NULL;
697 
698 	/* find extent */
699 	ext4_ext_binsearch(inode, path + ppos, block);
700 	/* if not an empty leaf */
701 	if (path[ppos].p_ext)
702 		path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
703 
704 	ext4_ext_show_path(inode, path);
705 
706 	return path;
707 
708 err:
709 	ext4_ext_drop_refs(path);
710 	if (alloc)
711 		kfree(path);
712 	return ERR_PTR(-EIO);
713 }
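
/*
 * A sketch of what the path array looks like after a successful
 * lookup in a depth-1 tree:
 *
 *   path[0]: p_hdr = in-inode header, p_bh = NULL,
 *            p_idx -> index entry chosen by ext4_ext_binsearch_idx()
 *   path[1]: p_bh  = buffer of the leaf block the index pointed to,
 *            p_hdr = header inside that buffer,
 *            p_ext -> extent chosen by ext4_ext_binsearch()
 *
 * Callers walk this array to read and to journal the blocks touched
 * by an update, and must release it with ext4_ext_drop_refs().
 */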
714 
715 /*
716  * ext4_ext_insert_index:
717  * insert new index [@logical;@ptr] into the block at @curp;
718  * check where to insert: before @curp or after @curp
719  */
720 static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
721 				 struct ext4_ext_path *curp,
722 				 int logical, ext4_fsblk_t ptr)
723 {
724 	struct ext4_extent_idx *ix;
725 	int len, err;
726 
727 	err = ext4_ext_get_access(handle, inode, curp);
728 	if (err)
729 		return err;
730 
731 	if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
732 		EXT4_ERROR_INODE(inode,
733 				 "logical %d == ei_block %d!",
734 				 logical, le32_to_cpu(curp->p_idx->ei_block));
735 		return -EIO;
736 	}
737 	len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
738 	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
739 		/* insert after */
740 		if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
741 			len = (len - 1) * sizeof(struct ext4_extent_idx);
742 			len = len < 0 ? 0 : len;
743 			ext_debug("insert new index %d after: %llu. "
744 					"move %d from 0x%p to 0x%p\n",
745 					logical, ptr, len,
746 					(curp->p_idx + 1), (curp->p_idx + 2));
747 			memmove(curp->p_idx + 2, curp->p_idx + 1, len);
748 		}
749 		ix = curp->p_idx + 1;
750 	} else {
751 		/* insert before */
752 		len = len * sizeof(struct ext4_extent_idx);
753 		len = len < 0 ? 0 : len;
754 		ext_debug("insert new index %d before: %llu. "
755 				"move %d from 0x%p to 0x%p\n",
756 				logical, ptr, len,
757 				curp->p_idx, (curp->p_idx + 1));
758 		memmove(curp->p_idx + 1, curp->p_idx, len);
759 		ix = curp->p_idx;
760 	}
761 
762 	ix->ei_block = cpu_to_le32(logical);
763 	ext4_idx_store_pblock(ix, ptr);
764 	le16_add_cpu(&curp->p_hdr->eh_entries, 1);
765 
766 	if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
767 			     > le16_to_cpu(curp->p_hdr->eh_max))) {
768 		EXT4_ERROR_INODE(inode, "eh_entries %d > eh_max %d!",
769 				 le16_to_cpu(curp->p_hdr->eh_entries),
770 				 le16_to_cpu(curp->p_hdr->eh_max));
771 		return -EIO;
772 	}
773 	if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
774 		EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
775 		return -EIO;
776 	}
777 
778 	err = ext4_ext_dirty(handle, inode, curp);
779 	ext4_std_error(inode->i_sb, err);
780 
781 	return err;
782 }
783 
784 /*
785  * ext4_ext_split:
786  * inserts new subtree into the path, using free index entry
787  * at depth @at:
788  * - allocates all needed blocks (new leaf and all intermediate index blocks)
789  * - makes decision where to split
790  * - moves remaining extents and index entries (right to the split point)
791  *   into the newly allocated blocks
792  * - initializes subtree
793  */
794 static int ext4_ext_split(handle_t *handle, struct inode *inode,
795 				struct ext4_ext_path *path,
796 				struct ext4_extent *newext, int at)
797 {
798 	struct buffer_head *bh = NULL;
799 	int depth = ext_depth(inode);
800 	struct ext4_extent_header *neh;
801 	struct ext4_extent_idx *fidx;
802 	struct ext4_extent *ex;
803 	int i = at, k, m, a;
804 	ext4_fsblk_t newblock, oldblock;
805 	__le32 border;
806 	ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
807 	int err = 0;
808 
809 	/* make decision: where to split? */
810 	/* FIXME: now decision is simplest: at current extent */
811 
812 	/* if current leaf will be split, then we should use
813 	 * border from split point */
814 	if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
815 		EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
816 		return -EIO;
817 	}
818 	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
819 		border = path[depth].p_ext[1].ee_block;
820 		ext_debug("leaf will be split."
821 				" next leaf starts at %d\n",
822 				  le32_to_cpu(border));
823 	} else {
824 		border = newext->ee_block;
825 		ext_debug("leaf will be added."
826 				" next leaf starts at %d\n",
827 				le32_to_cpu(border));
828 	}
829 
830 	/*
831 	 * If an error occurs, then we break processing
832 	 * and mark the filesystem read-only.  The index won't
833 	 * be inserted and the tree will remain in a consistent
834 	 * state.  The next mount will repair the buffers too.
835 	 */
836 
837 	/*
838 	 * Get an array to track all allocated blocks.
839 	 * We need this to handle errors and to free
840 	 * the blocks upon failure.
841 	 */
842 	ablocks = kzalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
843 	if (!ablocks)
844 		return -ENOMEM;
845 
846 	/* allocate all needed blocks */
847 	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
848 	for (a = 0; a < depth - at; a++) {
849 		newblock = ext4_ext_new_meta_block(handle, inode, path,
850 						   newext, &err);
851 		if (newblock == 0)
852 			goto cleanup;
853 		ablocks[a] = newblock;
854 	}
855 
856 	/* initialize new leaf */
857 	newblock = ablocks[--a];
858 	if (unlikely(newblock == 0)) {
859 		EXT4_ERROR_INODE(inode, "newblock == 0!");
860 		err = -EIO;
861 		goto cleanup;
862 	}
863 	bh = sb_getblk(inode->i_sb, newblock);
864 	if (!bh) {
865 		err = -EIO;
866 		goto cleanup;
867 	}
868 	lock_buffer(bh);
869 
870 	err = ext4_journal_get_create_access(handle, bh);
871 	if (err)
872 		goto cleanup;
873 
874 	neh = ext_block_hdr(bh);
875 	neh->eh_entries = 0;
876 	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
877 	neh->eh_magic = EXT4_EXT_MAGIC;
878 	neh->eh_depth = 0;
879 	ex = EXT_FIRST_EXTENT(neh);
880 
881 	/* move remainder of path[depth] to the new leaf */
882 	if (unlikely(path[depth].p_hdr->eh_entries !=
883 		     path[depth].p_hdr->eh_max)) {
884 		EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
885 				 path[depth].p_hdr->eh_entries,
886 				 path[depth].p_hdr->eh_max);
887 		err = -EIO;
888 		goto cleanup;
889 	}
890 	/* start copy from next extent */
891 	/* TODO: we could do it by single memmove */
892 	m = 0;
893 	path[depth].p_ext++;
894 	while (path[depth].p_ext <=
895 			EXT_MAX_EXTENT(path[depth].p_hdr)) {
896 		ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
897 				le32_to_cpu(path[depth].p_ext->ee_block),
898 				ext4_ext_pblock(path[depth].p_ext),
899 				ext4_ext_is_uninitialized(path[depth].p_ext),
900 				ext4_ext_get_actual_len(path[depth].p_ext),
901 				newblock);
902 		/*memmove(ex++, path[depth].p_ext++,
903 				sizeof(struct ext4_extent));
904 		neh->eh_entries++;*/
905 		path[depth].p_ext++;
906 		m++;
907 	}
908 	if (m) {
909 		memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m);
910 		le16_add_cpu(&neh->eh_entries, m);
911 	}
912 
913 	set_buffer_uptodate(bh);
914 	unlock_buffer(bh);
915 
916 	err = ext4_handle_dirty_metadata(handle, inode, bh);
917 	if (err)
918 		goto cleanup;
919 	brelse(bh);
920 	bh = NULL;
921 
922 	/* correct old leaf */
923 	if (m) {
924 		err = ext4_ext_get_access(handle, inode, path + depth);
925 		if (err)
926 			goto cleanup;
927 		le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
928 		err = ext4_ext_dirty(handle, inode, path + depth);
929 		if (err)
930 			goto cleanup;
931 
932 	}
933 
934 	/* create intermediate indexes */
935 	k = depth - at - 1;
936 	if (unlikely(k < 0)) {
937 		EXT4_ERROR_INODE(inode, "k %d < 0!", k);
938 		err = -EIO;
939 		goto cleanup;
940 	}
941 	if (k)
942 		ext_debug("create %d intermediate indices\n", k);
943 	/* insert new index into current index block */
944 	/* current depth stored in i var */
945 	i = depth - 1;
946 	while (k--) {
947 		oldblock = newblock;
948 		newblock = ablocks[--a];
949 		bh = sb_getblk(inode->i_sb, newblock);
950 		if (!bh) {
951 			err = -EIO;
952 			goto cleanup;
953 		}
954 		lock_buffer(bh);
955 
956 		err = ext4_journal_get_create_access(handle, bh);
957 		if (err)
958 			goto cleanup;
959 
960 		neh = ext_block_hdr(bh);
961 		neh->eh_entries = cpu_to_le16(1);
962 		neh->eh_magic = EXT4_EXT_MAGIC;
963 		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
964 		neh->eh_depth = cpu_to_le16(depth - i);
965 		fidx = EXT_FIRST_INDEX(neh);
966 		fidx->ei_block = border;
967 		ext4_idx_store_pblock(fidx, oldblock);
968 
969 		ext_debug("int.index at %d (block %llu): %u -> %llu\n",
970 				i, newblock, le32_to_cpu(border), oldblock);
971 		/* copy indexes */
972 		m = 0;
973 		path[i].p_idx++;
974 
975 		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
976 				EXT_MAX_INDEX(path[i].p_hdr));
977 		if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
978 					EXT_LAST_INDEX(path[i].p_hdr))) {
979 			EXT4_ERROR_INODE(inode,
980 					 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
981 					 le32_to_cpu(path[i].p_ext->ee_block));
982 			err = -EIO;
983 			goto cleanup;
984 		}
985 		while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
986 			ext_debug("%d: move %d:%llu in new index %llu\n", i,
987 					le32_to_cpu(path[i].p_idx->ei_block),
988 					ext4_idx_pblock(path[i].p_idx),
989 					newblock);
990 			/*memmove(++fidx, path[i].p_idx++,
991 					sizeof(struct ext4_extent_idx));
992 			neh->eh_entries++;
993 			BUG_ON(neh->eh_entries > neh->eh_max);*/
994 			path[i].p_idx++;
995 			m++;
996 		}
997 		if (m) {
998 			memmove(++fidx, path[i].p_idx - m,
999 				sizeof(struct ext4_extent_idx) * m);
1000 			le16_add_cpu(&neh->eh_entries, m);
1001 		}
1002 		set_buffer_uptodate(bh);
1003 		unlock_buffer(bh);
1004 
1005 		err = ext4_handle_dirty_metadata(handle, inode, bh);
1006 		if (err)
1007 			goto cleanup;
1008 		brelse(bh);
1009 		bh = NULL;
1010 
1011 		/* correct old index */
1012 		if (m) {
1013 			err = ext4_ext_get_access(handle, inode, path + i);
1014 			if (err)
1015 				goto cleanup;
1016 			le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
1017 			err = ext4_ext_dirty(handle, inode, path + i);
1018 			if (err)
1019 				goto cleanup;
1020 		}
1021 
1022 		i--;
1023 	}
1024 
1025 	/* insert new index */
1026 	err = ext4_ext_insert_index(handle, inode, path + at,
1027 				    le32_to_cpu(border), newblock);
1028 
1029 cleanup:
1030 	if (bh) {
1031 		if (buffer_locked(bh))
1032 			unlock_buffer(bh);
1033 		brelse(bh);
1034 	}
1035 
1036 	if (err) {
1037 		/* free all allocated blocks in error case */
1038 		for (i = 0; i < depth; i++) {
1039 			if (!ablocks[i])
1040 				continue;
1041 			ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
1042 					 EXT4_FREE_BLOCKS_METADATA);
1043 		}
1044 	}
1045 	kfree(ablocks);
1046 
1047 	return err;
1048 }
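
/*
 * A before/after sketch of the split for a depth-1 tree whose leaf is
 * full (hypothetical starting blocks).  With path[depth].p_ext at the
 * extent starting at 200, the border becomes 300 (the next extent):
 *
 *   before:  root index: [0]          leaf A: 0 100 200 300 400 (full)
 *   after:   root index: [0] [300]    leaf A: 0 100 200
 *                                     leaf B: 300 400
 *
 * Everything to the right of the split point moves into the freshly
 * allocated leaf B, and ext4_ext_insert_index() adds the [300] entry
 * into the parent so later lookups can find the new leaf.
 */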
1049 
1050 /*
1051  * ext4_ext_grow_indepth:
1052  * implements tree growing procedure:
1053  * - allocates new block
1054  * - moves top-level data (index block or leaf) into the new block
1055  * - initializes new top-level, creating index that points to the
1056  *   just created block
1057  */
1058 static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1059 					struct ext4_ext_path *path,
1060 					struct ext4_extent *newext)
1061 {
1062 	struct ext4_ext_path *curp = path;
1063 	struct ext4_extent_header *neh;
1064 	struct buffer_head *bh;
1065 	ext4_fsblk_t newblock;
1066 	int err = 0;
1067 
1068 	newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
1069 	if (newblock == 0)
1070 		return err;
1071 
1072 	bh = sb_getblk(inode->i_sb, newblock);
1073 	if (!bh) {
1074 		err = -EIO;
1075 		ext4_std_error(inode->i_sb, err);
1076 		return err;
1077 	}
1078 	lock_buffer(bh);
1079 
1080 	err = ext4_journal_get_create_access(handle, bh);
1081 	if (err) {
1082 		unlock_buffer(bh);
1083 		goto out;
1084 	}
1085 
1086 	/* move top-level index/leaf into new block */
1087 	memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data));
1088 
1089 	/* set size of new block */
1090 	neh = ext_block_hdr(bh);
1091 	/* old root could have indexes or leaves
1092 	 * so calculate eh_max the right way */
1093 	if (ext_depth(inode))
1094 		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1095 	else
1096 		neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1097 	neh->eh_magic = EXT4_EXT_MAGIC;
1098 	set_buffer_uptodate(bh);
1099 	unlock_buffer(bh);
1100 
1101 	err = ext4_handle_dirty_metadata(handle, inode, bh);
1102 	if (err)
1103 		goto out;
1104 
1105 	/* create index in new top-level index: num,max,pointer */
1106 	err = ext4_ext_get_access(handle, inode, curp);
1107 	if (err)
1108 		goto out;
1109 
1110 	curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
1111 	curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
1112 	curp->p_hdr->eh_entries = cpu_to_le16(1);
1113 	curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
1114 
1115 	if (path[0].p_hdr->eh_depth)
1116 		curp->p_idx->ei_block =
1117 			EXT_FIRST_INDEX(path[0].p_hdr)->ei_block;
1118 	else
1119 		curp->p_idx->ei_block =
1120 			EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
1121 	ext4_idx_store_pblock(curp->p_idx, newblock);
1122 
1123 	neh = ext_inode_hdr(inode);
1124 	ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
1125 		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
1126 		  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
1127 		  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
1128 
1129 	neh->eh_depth = cpu_to_le16(path->p_depth + 1);
1130 	err = ext4_ext_dirty(handle, inode, curp);
1131 out:
1132 	brelse(bh);
1133 
1134 	return err;
1135 }
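
/*
 * A sketch of one depth increase:
 *
 *   before:  inode i_data = leaf {ext1 ext2 ext3 ext4}        depth 0
 *   after:   inode i_data = index {[ext1.ee_block] -> block N}
 *            block N      = leaf {ext1 ext2 ext3 ext4}        depth 1
 *
 * The old root is copied verbatim into newly allocated block N, and
 * the in-inode root shrinks to a single index entry pointing at it.
 * Growing from depth 0 moves the full in-inode leaf into a much larger
 * block, so the new leaf immediately has room; growing a deeper tree
 * only relieves the root index, and the full leaf still has to be
 * split afterwards (see ext4_ext_create_new_leaf()).
 */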
1136 
1137 /*
1138  * ext4_ext_create_new_leaf:
1139  * finds an empty index and adds a new leaf.
1140  * if no free index is found, then it requests growing the tree in depth.
1141  */
1142 static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
1143 					struct ext4_ext_path *path,
1144 					struct ext4_extent *newext)
1145 {
1146 	struct ext4_ext_path *curp;
1147 	int depth, i, err = 0;
1148 
1149 repeat:
1150 	i = depth = ext_depth(inode);
1151 
1152 	/* walk up the tree and look for a free index entry */
1153 	curp = path + depth;
1154 	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
1155 		i--;
1156 		curp--;
1157 	}
1158 
1159 	/* we use already allocated block for index block,
1160 	 * so subsequent data blocks should be contiguous */
1161 	if (EXT_HAS_FREE_INDEX(curp)) {
1162 		/* if we found index with free entry, then use that
1163 		 * entry: create all needed subtree and add new leaf */
1164 		err = ext4_ext_split(handle, inode, path, newext, i);
1165 		if (err)
1166 			goto out;
1167 
1168 		/* refill path */
1169 		ext4_ext_drop_refs(path);
1170 		path = ext4_ext_find_extent(inode,
1171 				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1172 				    path);
1173 		if (IS_ERR(path))
1174 			err = PTR_ERR(path);
1175 	} else {
1176 		/* tree is full, time to grow in depth */
1177 		err = ext4_ext_grow_indepth(handle, inode, path, newext);
1178 		if (err)
1179 			goto out;
1180 
1181 		/* refill path */
1182 		ext4_ext_drop_refs(path);
1183 		path = ext4_ext_find_extent(inode,
1184 				   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1185 				    path);
1186 		if (IS_ERR(path)) {
1187 			err = PTR_ERR(path);
1188 			goto out;
1189 		}
1190 
1191 		/*
1192 		 * only first (depth 0 -> 1) produces free space;
1193 		 * in all other cases we have to split the grown tree
1194 		 */
1195 		depth = ext_depth(inode);
1196 		if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
1197 			/* now we need to split */
1198 			goto repeat;
1199 		}
1200 	}
1201 
1202 out:
1203 	return err;
1204 }
1205 
1206 /*
1207  * search for the closest allocated block to the left of *logical
1208  * and return it at @logical + its physical address at @phys
1209  * if *logical is the smallest allocated block, the function
1210  * returns 0 at @phys
1211  * the return value contains 0 (success) or an error code
1212  */
1213 static int ext4_ext_search_left(struct inode *inode,
1214 				struct ext4_ext_path *path,
1215 				ext4_lblk_t *logical, ext4_fsblk_t *phys)
1216 {
1217 	struct ext4_extent_idx *ix;
1218 	struct ext4_extent *ex;
1219 	int depth, ee_len;
1220 
1221 	if (unlikely(path == NULL)) {
1222 		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1223 		return -EIO;
1224 	}
1225 	depth = path->p_depth;
1226 	*phys = 0;
1227 
1228 	if (depth == 0 && path->p_ext == NULL)
1229 		return 0;
1230 
1231 	/* usually the extent in the path covers blocks smaller
1232 	 * than *logical, but it can be that the extent is the
1233 	 * first one in the file */
1234 
1235 	ex = path[depth].p_ext;
1236 	ee_len = ext4_ext_get_actual_len(ex);
1237 	if (*logical < le32_to_cpu(ex->ee_block)) {
1238 		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1239 			EXT4_ERROR_INODE(inode,
1240 					 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
1241 					 *logical, le32_to_cpu(ex->ee_block));
1242 			return -EIO;
1243 		}
1244 		while (--depth >= 0) {
1245 			ix = path[depth].p_idx;
1246 			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1247 				EXT4_ERROR_INODE(inode,
1248 				  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
1249 				  ix != NULL ? ix->ei_block : 0,
1250 				  EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
1251 				    EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block : 0,
1252 				  depth);
1253 				return -EIO;
1254 			}
1255 		}
1256 		return 0;
1257 	}
1258 
1259 	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1260 		EXT4_ERROR_INODE(inode,
1261 				 "logical %d < ee_block %d + ee_len %d!",
1262 				 *logical, le32_to_cpu(ex->ee_block), ee_len);
1263 		return -EIO;
1264 	}
1265 
1266 	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1267 	*phys = ext4_ext_pblock(ex) + ee_len - 1;
1268 	return 0;
1269 }
1270 
1271 /*
1272  * search for the closest allocated block to the right of *logical
1273  * and return it at @logical + its physical address at @phys
1274  * if *logical is the largest allocated block, the function
1275  * returns 0 at @phys
1276  * the return value contains 0 (success) or an error code
1277  */
1278 static int ext4_ext_search_right(struct inode *inode,
1279 				 struct ext4_ext_path *path,
1280 				 ext4_lblk_t *logical, ext4_fsblk_t *phys)
1281 {
1282 	struct buffer_head *bh = NULL;
1283 	struct ext4_extent_header *eh;
1284 	struct ext4_extent_idx *ix;
1285 	struct ext4_extent *ex;
1286 	ext4_fsblk_t block;
1287 	int depth;	/* Note, NOT eh_depth; depth from top of tree */
1288 	int ee_len;
1289 
1290 	if (unlikely(path == NULL)) {
1291 		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1292 		return -EIO;
1293 	}
1294 	depth = path->p_depth;
1295 	*phys = 0;
1296 
1297 	if (depth == 0 && path->p_ext == NULL)
1298 		return 0;
1299 
1300 	/* usually the extent in the path covers blocks smaller
1301 	 * than *logical, but it can be that the extent is the
1302 	 * first one in the file */
1303 
1304 	ex = path[depth].p_ext;
1305 	ee_len = ext4_ext_get_actual_len(ex);
1306 	if (*logical < le32_to_cpu(ex->ee_block)) {
1307 		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1308 			EXT4_ERROR_INODE(inode,
1309 					 "first_extent(path[%d].p_hdr) != ex",
1310 					 depth);
1311 			return -EIO;
1312 		}
1313 		while (--depth >= 0) {
1314 			ix = path[depth].p_idx;
1315 			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1316 				EXT4_ERROR_INODE(inode,
1317 						 "ix != EXT_FIRST_INDEX *logical %d!",
1318 						 *logical);
1319 				return -EIO;
1320 			}
1321 		}
1322 		*logical = le32_to_cpu(ex->ee_block);
1323 		*phys = ext4_ext_pblock(ex);
1324 		return 0;
1325 	}
1326 
1327 	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1328 		EXT4_ERROR_INODE(inode,
1329 				 "logical %d < ee_block %d + ee_len %d!",
1330 				 *logical, le32_to_cpu(ex->ee_block), ee_len);
1331 		return -EIO;
1332 	}
1333 
1334 	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1335 		/* next allocated block in this leaf */
1336 		ex++;
1337 		*logical = le32_to_cpu(ex->ee_block);
1338 		*phys = ext4_ext_pblock(ex);
1339 		return 0;
1340 	}
1341 
1342 	/* go up and search for index to the right */
1343 	while (--depth >= 0) {
1344 		ix = path[depth].p_idx;
1345 		if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1346 			goto got_index;
1347 	}
1348 
1349 	/* we've gone up to the root and found no index to the right */
1350 	return 0;
1351 
1352 got_index:
1353 	/* we've found index to the right, let's
1354 	 * follow it and find the closest allocated
1355 	 * block to the right */
1356 	ix++;
1357 	block = ext4_idx_pblock(ix);
1358 	while (++depth < path->p_depth) {
1359 		bh = sb_bread(inode->i_sb, block);
1360 		if (bh == NULL)
1361 			return -EIO;
1362 		eh = ext_block_hdr(bh);
1363 		/* subtract from p_depth to get proper eh_depth */
1364 		if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
1365 			put_bh(bh);
1366 			return -EIO;
1367 		}
1368 		ix = EXT_FIRST_INDEX(eh);
1369 		block = ext4_idx_pblock(ix);
1370 		put_bh(bh);
1371 	}
1372 
1373 	bh = sb_bread(inode->i_sb, block);
1374 	if (bh == NULL)
1375 		return -EIO;
1376 	eh = ext_block_hdr(bh);
1377 	if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
1378 		put_bh(bh);
1379 		return -EIO;
1380 	}
1381 	ex = EXT_FIRST_EXTENT(eh);
1382 	*logical = le32_to_cpu(ex->ee_block);
1383 	*phys = ext4_ext_pblock(ex);
1384 	put_bh(bh);
1385 	return 0;
1386 }
1387 
1388 /*
1389  * ext4_ext_next_allocated_block:
1390  * returns allocated block in subsequent extent or EXT_MAX_BLOCK.
1391  * NOTE: it considers block number from index entry as
1392  * allocated block. Thus, index entries have to be consistent
1393  * with leaves.
1394  */
1395 static ext4_lblk_t
1396 ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1397 {
1398 	int depth;
1399 
1400 	BUG_ON(path == NULL);
1401 	depth = path->p_depth;
1402 
1403 	if (depth == 0 && path->p_ext == NULL)
1404 		return EXT_MAX_BLOCK;
1405 
1406 	while (depth >= 0) {
1407 		if (depth == path->p_depth) {
1408 			/* leaf */
1409 			if (path[depth].p_ext !=
1410 					EXT_LAST_EXTENT(path[depth].p_hdr))
1411 			  return le32_to_cpu(path[depth].p_ext[1].ee_block);
1412 		} else {
1413 			/* index */
1414 			if (path[depth].p_idx !=
1415 					EXT_LAST_INDEX(path[depth].p_hdr))
1416 			  return le32_to_cpu(path[depth].p_idx[1].ei_block);
1417 		}
1418 		depth--;
1419 	}
1420 
1421 	return EXT_MAX_BLOCK;
1422 }
1423 
1424 /*
1425  * ext4_ext_next_leaf_block:
1426  * returns first allocated block from next leaf or EXT_MAX_BLOCK
1427  */
1428 static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
1429 					struct ext4_ext_path *path)
1430 {
1431 	int depth;
1432 
1433 	BUG_ON(path == NULL);
1434 	depth = path->p_depth;
1435 
1436 	/* a tree of depth zero has no leaf blocks at all */
1437 	if (depth == 0)
1438 		return EXT_MAX_BLOCK;
1439 
1440 	/* go to index block */
1441 	depth--;
1442 
1443 	while (depth >= 0) {
1444 		if (path[depth].p_idx !=
1445 				EXT_LAST_INDEX(path[depth].p_hdr))
1446 			return (ext4_lblk_t)
1447 				le32_to_cpu(path[depth].p_idx[1].ei_block);
1448 		depth--;
1449 	}
1450 
1451 	return EXT_MAX_BLOCK;
1452 }
1453 
1454 /*
1455  * ext4_ext_correct_indexes:
1456  * if leaf gets modified and modified extent is first in the leaf,
1457  * then we have to correct all indexes above.
1458  * TODO: do we need to correct tree in all cases?
1459  */
1460 static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1461 				struct ext4_ext_path *path)
1462 {
1463 	struct ext4_extent_header *eh;
1464 	int depth = ext_depth(inode);
1465 	struct ext4_extent *ex;
1466 	__le32 border;
1467 	int k, err = 0;
1468 
1469 	eh = path[depth].p_hdr;
1470 	ex = path[depth].p_ext;
1471 
1472 	if (unlikely(ex == NULL || eh == NULL)) {
1473 		EXT4_ERROR_INODE(inode,
1474 				 "ex %p == NULL or eh %p == NULL", ex, eh);
1475 		return -EIO;
1476 	}
1477 
1478 	if (depth == 0) {
1479 		/* there is no tree at all */
1480 		return 0;
1481 	}
1482 
1483 	if (ex != EXT_FIRST_EXTENT(eh)) {
1484 		/* we correct tree if first leaf got modified only */
1485 		return 0;
1486 	}
1487 
1488 	/*
1489 	 * TODO: we need correction if border is smaller than current one
1490 	 */
1491 	k = depth - 1;
1492 	border = path[depth].p_ext->ee_block;
1493 	err = ext4_ext_get_access(handle, inode, path + k);
1494 	if (err)
1495 		return err;
1496 	path[k].p_idx->ei_block = border;
1497 	err = ext4_ext_dirty(handle, inode, path + k);
1498 	if (err)
1499 		return err;
1500 
1501 	while (k--) {
1502 		/* change all left-side indexes */
1503 		if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1504 			break;
1505 		err = ext4_ext_get_access(handle, inode, path + k);
1506 		if (err)
1507 			break;
1508 		path[k].p_idx->ei_block = border;
1509 		err = ext4_ext_dirty(handle, inode, path + k);
1510 		if (err)
1511 			break;
1512 	}
1513 
1514 	return err;
1515 }
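
/*
 * Example (hypothetical numbers): if the first extent of a leaf used
 * to start at logical block 100 and an update makes it start at 50,
 * the index entry in the parent that points at this leaf must also be
 * rewritten to 50; and if that index entry is itself the first in its
 * block, the correction keeps propagating towards the root (the while
 * loop above).
 */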
1516 
1517 int
1518 ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1519 				struct ext4_extent *ex2)
1520 {
1521 	unsigned short ext1_ee_len, ext2_ee_len, max_len;
1522 
1523 	/*
1524 	 * Make sure that either both extents are uninitialized, or
1525 	 * both are _not_.
1526 	 */
1527 	if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
1528 		return 0;
1529 
1530 	if (ext4_ext_is_uninitialized(ex1))
1531 		max_len = EXT_UNINIT_MAX_LEN;
1532 	else
1533 		max_len = EXT_INIT_MAX_LEN;
1534 
1535 	ext1_ee_len = ext4_ext_get_actual_len(ex1);
1536 	ext2_ee_len = ext4_ext_get_actual_len(ex2);
1537 
1538 	if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
1539 			le32_to_cpu(ex2->ee_block))
1540 		return 0;
1541 
1542 	/*
1543 	 * To allow future support for preallocated extents to be added
1544 	 * as an RO_COMPAT feature, refuse to merge two extents if
1545 	 * this can result in the top bit of ee_len being set.
1546 	 */
1547 	if (ext1_ee_len + ext2_ee_len > max_len)
1548 		return 0;
1549 #ifdef AGGRESSIVE_TEST
1550 	if (ext1_ee_len >= 4)
1551 		return 0;
1552 #endif
1553 
1554 	if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
1555 		return 1;
1556 	return 0;
1557 }
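
/*
 * A worked example (hypothetical numbers): ex1 maps logical 0..99 to
 * physical 1000..1099 and ex2 maps logical 100..149 to physical
 * 1100..1149, both initialized.
 *
 *   logical check:   0 + 100 == 100                          -> contiguous
 *   length check:    100 + 50 <= EXT_INIT_MAX_LEN (32768)    -> ok
 *   physical check:  1000 + 100 == 1100                      -> contiguous
 *
 * so the pair can be collapsed into one extent: logical 0, length 150,
 * physical 1000.  Had one extent been uninitialized (but not both), or
 * the combined length overflowed into the top bit of ee_len, the merge
 * would be refused.
 */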
1558 
1559 /*
1560  * This function tries to merge the "ex" extent to the next extent in the tree.
1561  * It always tries to merge towards right. If you want to merge towards
1562  * left, pass "ex - 1" as argument instead of "ex".
1563  * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1564  * 1 if they got merged.
1565  */
1566 static int ext4_ext_try_to_merge_right(struct inode *inode,
1567 				 struct ext4_ext_path *path,
1568 				 struct ext4_extent *ex)
1569 {
1570 	struct ext4_extent_header *eh;
1571 	unsigned int depth, len;
1572 	int merge_done = 0;
1573 	int uninitialized = 0;
1574 
1575 	depth = ext_depth(inode);
1576 	BUG_ON(path[depth].p_hdr == NULL);
1577 	eh = path[depth].p_hdr;
1578 
1579 	while (ex < EXT_LAST_EXTENT(eh)) {
1580 		if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1581 			break;
1582 		/* merge with next extent! */
1583 		if (ext4_ext_is_uninitialized(ex))
1584 			uninitialized = 1;
1585 		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1586 				+ ext4_ext_get_actual_len(ex + 1));
1587 		if (uninitialized)
1588 			ext4_ext_mark_uninitialized(ex);
1589 
1590 		if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1591 			len = (EXT_LAST_EXTENT(eh) - ex - 1)
1592 				* sizeof(struct ext4_extent);
1593 			memmove(ex + 1, ex + 2, len);
1594 		}
1595 		le16_add_cpu(&eh->eh_entries, -1);
1596 		merge_done = 1;
1597 		WARN_ON(eh->eh_entries == 0);
1598 		if (!eh->eh_entries)
1599 			EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
1600 	}
1601 
1602 	return merge_done;
1603 }
1604 
1605 /*
1606  * This function tries to merge the @ex extent to neighbours in the tree.
1607  * return 1 if @ex was merged to the right; a left merge returns 0.
1608  */
1609 static int ext4_ext_try_to_merge(struct inode *inode,
1610 				  struct ext4_ext_path *path,
1611 				  struct ext4_extent *ex) {
1612 	struct ext4_extent_header *eh;
1613 	unsigned int depth;
1614 	int merge_done = 0;
1615 	int ret = 0;
1616 
1617 	depth = ext_depth(inode);
1618 	BUG_ON(path[depth].p_hdr == NULL);
1619 	eh = path[depth].p_hdr;
1620 
1621 	if (ex > EXT_FIRST_EXTENT(eh))
1622 		merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
1623 
1624 	if (!merge_done)
1625 		ret = ext4_ext_try_to_merge_right(inode, path, ex);
1626 
1627 	return ret;
1628 }
1629 
1630 /*
1631  * check if a portion of the "newext" extent overlaps with an
1632  * existing extent.
1633  *
1634  * If there is an overlap discovered, it updates the length of the newext
1635  * such that there will be no overlap, and then returns 1.
1636  * If there is no overlap found, it returns 0.
1637  */
1638 static unsigned int ext4_ext_check_overlap(struct inode *inode,
1639 					   struct ext4_extent *newext,
1640 					   struct ext4_ext_path *path)
1641 {
1642 	ext4_lblk_t b1, b2;
1643 	unsigned int depth, len1;
1644 	unsigned int ret = 0;
1645 
1646 	b1 = le32_to_cpu(newext->ee_block);
1647 	len1 = ext4_ext_get_actual_len(newext);
1648 	depth = ext_depth(inode);
1649 	if (!path[depth].p_ext)
1650 		goto out;
1651 	b2 = le32_to_cpu(path[depth].p_ext->ee_block);
1652 
1653 	/*
1654 	 * get the next allocated block if the extent in the path
1655 	 * is before the requested block(s)
1656 	 */
1657 	if (b2 < b1) {
1658 		b2 = ext4_ext_next_allocated_block(path);
1659 		if (b2 == EXT_MAX_BLOCK)
1660 			goto out;
1661 	}
1662 
1663 	/* check for wrap through zero on extent logical start block*/
1664 	if (b1 + len1 < b1) {
1665 		len1 = EXT_MAX_BLOCK - b1;
1666 		newext->ee_len = cpu_to_le16(len1);
1667 		ret = 1;
1668 	}
1669 
1670 	/* check for overlap */
1671 	if (b1 + len1 > b2) {
1672 		newext->ee_len = cpu_to_le16(b2 - b1);
1673 		ret = 1;
1674 	}
1675 out:
1676 	return ret;
1677 }
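
/*
 * A worked example of the trimming above (hypothetical numbers): the
 * caller wants to insert newext covering logical blocks 100..149
 * (b1 = 100, len1 = 50), but the next allocated extent already starts
 * at b2 = 120.  Since b1 + len1 = 150 > 120, newext->ee_len is cut to
 * b2 - b1 = 20 so the insertion covers only 100..119, and 1 is
 * returned so the caller knows the mapping was shortened.
 */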
1678 
1679 /*
1680  * ext4_ext_insert_extent:
1681  * tries to merge the requested extent into an existing extent or
1682  * inserts the requested extent as a new one into the tree,
1683  * creating a new leaf in the no-space case.
1684  */
1685 int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1686 				struct ext4_ext_path *path,
1687 				struct ext4_extent *newext, int flag)
1688 {
1689 	struct ext4_extent_header *eh;
1690 	struct ext4_extent *ex, *fex;
1691 	struct ext4_extent *nearex; /* nearest extent */
1692 	struct ext4_ext_path *npath = NULL;
1693 	int depth, len, err;
1694 	ext4_lblk_t next;
1695 	unsigned uninitialized = 0;
1696 
1697 	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1698 		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
1699 		return -EIO;
1700 	}
1701 	depth = ext_depth(inode);
1702 	ex = path[depth].p_ext;
1703 	if (unlikely(path[depth].p_hdr == NULL)) {
1704 		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1705 		return -EIO;
1706 	}
1707 
1708 	/* try to insert block into found extent and return */
1709 	if (ex && !(flag & EXT4_GET_BLOCKS_PRE_IO)
1710 		&& ext4_can_extents_be_merged(inode, ex, newext)) {
1711 		ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
1712 			  ext4_ext_is_uninitialized(newext),
1713 			  ext4_ext_get_actual_len(newext),
1714 			  le32_to_cpu(ex->ee_block),
1715 			  ext4_ext_is_uninitialized(ex),
1716 			  ext4_ext_get_actual_len(ex),
1717 			  ext4_ext_pblock(ex));
1718 		err = ext4_ext_get_access(handle, inode, path + depth);
1719 		if (err)
1720 			return err;
1721 
1722 		/*
1723 		 * ext4_can_extents_be_merged should have checked that either
1724 		 * both extents are uninitialized, or both aren't. Thus we
1725 		 * need to check only one of them here.
1726 		 */
1727 		if (ext4_ext_is_uninitialized(ex))
1728 			uninitialized = 1;
1729 		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1730 					+ ext4_ext_get_actual_len(newext));
1731 		if (uninitialized)
1732 			ext4_ext_mark_uninitialized(ex);
1733 		eh = path[depth].p_hdr;
1734 		nearex = ex;
1735 		goto merge;
1736 	}
1737 
1738 repeat:
1739 	depth = ext_depth(inode);
1740 	eh = path[depth].p_hdr;
1741 	if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
1742 		goto has_space;
1743 
1744 	/* probably next leaf has space for us? */
1745 	fex = EXT_LAST_EXTENT(eh);
1746 	next = ext4_ext_next_leaf_block(inode, path);
1747 	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)
1748 	    && next != EXT_MAX_BLOCK) {
1749 		ext_debug("next leaf block - %d\n", next);
1750 		BUG_ON(npath != NULL);
1751 		npath = ext4_ext_find_extent(inode, next, NULL);
1752 		if (IS_ERR(npath))
1753 			return PTR_ERR(npath);
1754 		BUG_ON(npath->p_depth != path->p_depth);
1755 		eh = npath[depth].p_hdr;
1756 		if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
1757 			ext_debug("next leaf isn't full(%d)\n",
1758 				  le16_to_cpu(eh->eh_entries));
1759 			path = npath;
1760 			goto repeat;
1761 		}
1762 		ext_debug("next leaf has no free space(%d,%d)\n",
1763 			  le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
1764 	}
1765 
1766 	/*
1767 	 * There is no free space in the found leaf.
1768 	 * We're gonna add a new leaf in the tree.
1769 	 */
1770 	err = ext4_ext_create_new_leaf(handle, inode, path, newext);
1771 	if (err)
1772 		goto cleanup;
1773 	depth = ext_depth(inode);
1774 	eh = path[depth].p_hdr;
1775 
1776 has_space:
1777 	nearex = path[depth].p_ext;
1778 
1779 	err = ext4_ext_get_access(handle, inode, path + depth);
1780 	if (err)
1781 		goto cleanup;
1782 
1783 	if (!nearex) {
1784 		/* there is no extent in this leaf, create first one */
1785 		ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
1786 				le32_to_cpu(newext->ee_block),
1787 				ext4_ext_pblock(newext),
1788 				ext4_ext_is_uninitialized(newext),
1789 				ext4_ext_get_actual_len(newext));
1790 		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
1791 	} else if (le32_to_cpu(newext->ee_block)
1792 			   > le32_to_cpu(nearex->ee_block)) {
1793 /*		BUG_ON(newext->ee_block == nearex->ee_block); */
1794 		if (nearex != EXT_LAST_EXTENT(eh)) {
1795 			len = EXT_MAX_EXTENT(eh) - nearex;
1796 			len = (len - 1) * sizeof(struct ext4_extent);
1797 			len = len < 0 ? 0 : len;
1798 			ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
1799 					"move %d from 0x%p to 0x%p\n",
1800 					le32_to_cpu(newext->ee_block),
1801 					ext4_ext_pblock(newext),
1802 					ext4_ext_is_uninitialized(newext),
1803 					ext4_ext_get_actual_len(newext),
1804 					nearex, len, nearex + 1, nearex + 2);
1805 			memmove(nearex + 2, nearex + 1, len);
1806 		}
1807 		path[depth].p_ext = nearex + 1;
1808 	} else {
1809 		BUG_ON(newext->ee_block == nearex->ee_block);
1810 		len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
1811 		len = len < 0 ? 0 : len;
1812 		ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
1813 				"move %d from 0x%p to 0x%p\n",
1814 				le32_to_cpu(newext->ee_block),
1815 				ext4_ext_pblock(newext),
1816 				ext4_ext_is_uninitialized(newext),
1817 				ext4_ext_get_actual_len(newext),
1818 				nearex, len, nearex + 1, nearex + 2);
1819 		memmove(nearex + 1, nearex, len);
1820 		path[depth].p_ext = nearex;
1821 	}
1822 
1823 	le16_add_cpu(&eh->eh_entries, 1);
1824 	nearex = path[depth].p_ext;
1825 	nearex->ee_block = newext->ee_block;
1826 	ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
1827 	nearex->ee_len = newext->ee_len;
1828 
1829 merge:
1830 	/* try to merge extents to the right */
1831 	if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
1832 		ext4_ext_try_to_merge(inode, path, nearex);
1833 
1834 	/* try to merge extents to the left */
1835 
1836 	/* time to correct all indexes above */
1837 	err = ext4_ext_correct_indexes(handle, inode, path);
1838 	if (err)
1839 		goto cleanup;
1840 
1841 	err = ext4_ext_dirty(handle, inode, path + depth);
1842 
1843 cleanup:
1844 	if (npath) {
1845 		ext4_ext_drop_refs(npath);
1846 		kfree(npath);
1847 	}
1848 	ext4_ext_invalidate_cache(inode);
1849 	return err;
1850 }
1851 
1852 static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1853 			       ext4_lblk_t num, ext_prepare_callback func,
1854 			       void *cbdata)
1855 {
1856 	struct ext4_ext_path *path = NULL;
1857 	struct ext4_ext_cache cbex;
1858 	struct ext4_extent *ex;
1859 	ext4_lblk_t next, start = 0, end = 0;
1860 	ext4_lblk_t last = block + num;
1861 	int depth, exists, err = 0;
1862 
1863 	BUG_ON(func == NULL);
1864 	BUG_ON(inode == NULL);
1865 
1866 	while (block < last && block != EXT_MAX_BLOCK) {
1867 		num = last - block;
1868 		/* find extent for this block */
1869 		down_read(&EXT4_I(inode)->i_data_sem);
1870 		path = ext4_ext_find_extent(inode, block, path);
1871 		up_read(&EXT4_I(inode)->i_data_sem);
1872 		if (IS_ERR(path)) {
1873 			err = PTR_ERR(path);
1874 			path = NULL;
1875 			break;
1876 		}
1877 
1878 		depth = ext_depth(inode);
1879 		if (unlikely(path[depth].p_hdr == NULL)) {
1880 			EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1881 			err = -EIO;
1882 			break;
1883 		}
1884 		ex = path[depth].p_ext;
1885 		next = ext4_ext_next_allocated_block(path);
1886 
1887 		exists = 0;
1888 		if (!ex) {
1889 			/* there is no extent yet, so try to allocate
1890 			 * all requested space */
1891 			start = block;
1892 			end = block + num;
1893 		} else if (le32_to_cpu(ex->ee_block) > block) {
1894 			/* need to allocate space before found extent */
1895 			start = block;
1896 			end = le32_to_cpu(ex->ee_block);
1897 			if (block + num < end)
1898 				end = block + num;
1899 		} else if (block >= le32_to_cpu(ex->ee_block)
1900 					+ ext4_ext_get_actual_len(ex)) {
1901 			/* need to allocate space after found extent */
1902 			start = block;
1903 			end = block + num;
1904 			if (end >= next)
1905 				end = next;
1906 		} else if (block >= le32_to_cpu(ex->ee_block)) {
1907 			/*
1908 			 * some part of requested space is covered
1909 			 * by found extent
1910 			 */
1911 			start = block;
1912 			end = le32_to_cpu(ex->ee_block)
1913 				+ ext4_ext_get_actual_len(ex);
1914 			if (block + num < end)
1915 				end = block + num;
1916 			exists = 1;
1917 		} else {
1918 			BUG();
1919 		}
1920 		BUG_ON(end <= start);
1921 
1922 		if (!exists) {
1923 			cbex.ec_block = start;
1924 			cbex.ec_len = end - start;
1925 			cbex.ec_start = 0;
1926 		} else {
1927 			cbex.ec_block = le32_to_cpu(ex->ee_block);
1928 			cbex.ec_len = ext4_ext_get_actual_len(ex);
1929 			cbex.ec_start = ext4_ext_pblock(ex);
1930 		}
1931 
1932 		if (unlikely(cbex.ec_len == 0)) {
1933 			EXT4_ERROR_INODE(inode, "cbex.ec_len == 0");
1934 			err = -EIO;
1935 			break;
1936 		}
1937 		err = func(inode, path, &cbex, ex, cbdata);
1938 		ext4_ext_drop_refs(path);
1939 
1940 		if (err < 0)
1941 			break;
1942 
1943 		if (err == EXT_REPEAT)
1944 			continue;
1945 		else if (err == EXT_BREAK) {
1946 			err = 0;
1947 			break;
1948 		}
1949 
1950 		if (ext_depth(inode) != depth) {
1951 			/* depth was changed; we have to realloc the path */
1952 			kfree(path);
1953 			path = NULL;
1954 		}
1955 
1956 		block = cbex.ec_block + cbex.ec_len;
1957 	}
1958 
1959 	if (path) {
1960 		ext4_ext_drop_refs(path);
1961 		kfree(path);
1962 	}
1963 
1964 	return err;
1965 }
1966 
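/*
 * Illustrative sketch (not built): a minimal ext_prepare_callback for
 * ext4_ext_walk_space() above.  The walker reports holes as cbex
 * entries with ec_start == 0, so a callback can count unallocated
 * regions.  The callback name and the use of cbdata as a counter are
 * invented for this example; ext4_ext_fiemap_cb() is the real user.
 */
#if 0
static int example_count_holes_cb(struct inode *inode,
				  struct ext4_ext_path *path,
				  struct ext4_ext_cache *cbex,
				  struct ext4_extent *ex, void *cbdata)
{
	unsigned long *nr_holes = cbdata;

	if (cbex->ec_start == 0)	/* gap between allocated extents */
		(*nr_holes)++;
	return EXT_CONTINUE;		/* keep scanning the requested range */
}
#endif
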
1967 static void
1968 ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
1969 			__u32 len, ext4_fsblk_t start)
1970 {
1971 	struct ext4_ext_cache *cex;
1972 	BUG_ON(len == 0);
1973 	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1974 	cex = &EXT4_I(inode)->i_cached_extent;
1975 	cex->ec_block = block;
1976 	cex->ec_len = len;
1977 	cex->ec_start = start;
1978 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1979 }
1980 
1981 /*
1982  * ext4_ext_put_gap_in_cache:
1983  * calculate boundaries of the gap that the requested block fits into
1984  * and cache this gap
1985  */
1986 static void
1987 ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
1988 				ext4_lblk_t block)
1989 {
1990 	int depth = ext_depth(inode);
1991 	unsigned long len;
1992 	ext4_lblk_t lblock;
1993 	struct ext4_extent *ex;
1994 
1995 	ex = path[depth].p_ext;
1996 	if (ex == NULL) {
1997 		/* there is no extent yet, so the gap spans the whole file [0; EXT_MAX_BLOCK] */
1998 		lblock = 0;
1999 		len = EXT_MAX_BLOCK;
2000 		ext_debug("cache gap(whole file):");
2001 	} else if (block < le32_to_cpu(ex->ee_block)) {
2002 		lblock = block;
2003 		len = le32_to_cpu(ex->ee_block) - block;
2004 		ext_debug("cache gap(before): %u [%u:%u]",
2005 				block,
2006 				le32_to_cpu(ex->ee_block),
2007 				 ext4_ext_get_actual_len(ex));
2008 	} else if (block >= le32_to_cpu(ex->ee_block)
2009 			+ ext4_ext_get_actual_len(ex)) {
2010 		ext4_lblk_t next;
2011 		lblock = le32_to_cpu(ex->ee_block)
2012 			+ ext4_ext_get_actual_len(ex);
2013 
2014 		next = ext4_ext_next_allocated_block(path);
2015 		ext_debug("cache gap(after): [%u:%u] %u",
2016 				le32_to_cpu(ex->ee_block),
2017 				ext4_ext_get_actual_len(ex),
2018 				block);
2019 		BUG_ON(next == lblock);
2020 		len = next - lblock;
2021 	} else {
2022 		lblock = len = 0;
2023 		BUG();
2024 	}
2025 
2026 	ext_debug(" -> %u:%lu\n", lblock, len);
2027 	ext4_ext_put_in_cache(inode, lblock, len, 0);
2028 }
2029 
2030 /*
2031  * Return 0 if cache is invalid; 1 if the cache is valid
2032  */
2033 static int
2034 ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2035 			struct ext4_extent *ex)
2036 {
2037 	struct ext4_ext_cache *cex;
2038 	int ret = 0;
2039 
2040 	/*
2041 	 * We borrow i_block_reservation_lock to protect i_cached_extent
2042 	 */
2043 	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
2044 	cex = &EXT4_I(inode)->i_cached_extent;
2045 
2046 	/* has cache valid data? */
2047 	if (cex->ec_len == 0)
2048 		goto errout;
2049 
2050 	if (in_range(block, cex->ec_block, cex->ec_len)) {
2051 		ex->ee_block = cpu_to_le32(cex->ec_block);
2052 		ext4_ext_store_pblock(ex, cex->ec_start);
2053 		ex->ee_len = cpu_to_le16(cex->ec_len);
2054 		ext_debug("%u cached by %u:%u:%llu\n",
2055 				block,
2056 				cex->ec_block, cex->ec_len, cex->ec_start);
2057 		ret = 1;
2058 	}
2059 errout:
2060 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2061 	return ret;
2062 }
2063 
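/*
 * Usage sketch (not built): probing the single-entry extent cache
 * before paying for a full tree walk, much as ext4_ext_map_blocks()
 * does further below.  A cached entry with a zero start block denotes
 * a gap.  The helper name is invented for this example.
 */
#if 0
static ext4_fsblk_t example_cached_lookup(struct inode *inode,
					  ext4_lblk_t lblk)
{
	struct ext4_extent ex;

	if (!ext4_ext_in_cache(inode, lblk, &ex))
		return 0;	/* miss: fall back to ext4_ext_find_extent() */
	if (!ex.ee_start_lo && !ex.ee_start_hi)
		return 0;	/* cached gap: the block is a hole */
	/* hit: the offset into the extent yields the physical block */
	return ext4_ext_pblock(&ex) + (lblk - le32_to_cpu(ex.ee_block));
}
#endif
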
2064 /*
2065  * ext4_ext_rm_idx:
2066  * removes index from the index block.
2067  * It's used only in the truncate case, thus all requests are for
2068  * the last index in the block.
2069  */
2070 static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2071 			struct ext4_ext_path *path)
2072 {
2073 	int err;
2074 	ext4_fsblk_t leaf;
2075 
2076 	/* free index block */
2077 	path--;
2078 	leaf = ext4_idx_pblock(path->p_idx);
2079 	if (unlikely(path->p_hdr->eh_entries == 0)) {
2080 		EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2081 		return -EIO;
2082 	}
2083 	err = ext4_ext_get_access(handle, inode, path);
2084 	if (err)
2085 		return err;
2086 	le16_add_cpu(&path->p_hdr->eh_entries, -1);
2087 	err = ext4_ext_dirty(handle, inode, path);
2088 	if (err)
2089 		return err;
2090 	ext_debug("index is empty, remove it, free block %llu\n", leaf);
2091 	ext4_free_blocks(handle, inode, NULL, leaf, 1,
2092 			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
2093 	return err;
2094 }
2095 
2096 /*
2097  * ext4_ext_calc_credits_for_single_extent:
2098  * This routine returns the maximum credits needed to insert an extent
2099  * into the extent tree.
2100  * When the actual path is passed, the caller should calculate the
2101  * credits under i_data_sem.
2102  */
2103 int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
2104 						struct ext4_ext_path *path)
2105 {
2106 	if (path) {
2107 		int depth = ext_depth(inode);
2108 		int ret = 0;
2109 
2110 		/* probably there is still space in the leaf */
2111 		if (le16_to_cpu(path[depth].p_hdr->eh_entries)
2112 				< le16_to_cpu(path[depth].p_hdr->eh_max)) {
2113 
2114 			/*
2115 			 *  There is some space left in the leaf; no
2116 			 *  need to account for the leaf block credit.
2117 			 *
2118 			 *  Bitmaps, block group descriptor blocks and
2119 			 *  other metadata blocks still need to be
2120 			 *  accounted for.
2121 			 */
2122 			/* 1 bitmap, 1 block group descriptor */
2123 			ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
2124 			return ret;
2125 		}
2126 	}
2127 
2128 	return ext4_chunk_trans_blocks(inode, nrblocks);
2129 }
2130 
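/*
 * Usage sketch (not built): sizing a transaction with the helper
 * above.  Without a path the pessimistic ext4_chunk_trans_blocks()
 * estimate is used; with a path (held under i_data_sem) the estimate
 * can shrink to bitmap + group descriptor + metadata credits.  The
 * wrapper name is invented for this example.
 */
#if 0
static handle_t *example_start_insert(struct inode *inode, int nrblocks)
{
	int credits = ext4_ext_calc_credits_for_single_extent(inode,
							      nrblocks, NULL);

	return ext4_journal_start(inode, credits);
}
#endif
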
2131 /*
2132  * How many index/leaf blocks need to change/allocate to modify nrblocks?
2133  *
2134  * If nrblocks fit in a single extent (chunk flag is 1), then in the
2135  * worst case each tree level's index/leaf needs to be changed, and
2136  * if the tree splits due to inserting a new extent, the old tree's
2137  * index/leaf need to be updated too.
2138  *
2139  * If the nrblocks are discontiguous, they could cause the whole tree
2140  * to split more than once, but this is really rare.
2141  */
2142 int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
2143 {
2144 	int index;
2145 	int depth = ext_depth(inode);
2146 
2147 	if (chunk)
2148 		index = depth * 2;
2149 	else
2150 		index = depth * 3;
2151 
2152 	return index;
2153 }
2154 
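/*
 * Worked example for the estimate above, assuming a tree of depth 2:
 * a contiguous chunk is charged 2 * 2 = 4 index/leaf blocks (roughly,
 * the touched block plus a possible new sibling per level), while
 * discontiguous blocks are charged 2 * 3 = 6 to allow for the tree
 * splitting more than once.
 */
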
2155 static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2156 				struct ext4_extent *ex,
2157 				ext4_lblk_t from, ext4_lblk_t to)
2158 {
2159 	unsigned short ee_len =  ext4_ext_get_actual_len(ex);
2160 	int flags = EXT4_FREE_BLOCKS_FORGET;
2161 
2162 	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2163 		flags |= EXT4_FREE_BLOCKS_METADATA;
2164 #ifdef EXTENTS_STATS
2165 	{
2166 		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2167 		spin_lock(&sbi->s_ext_stats_lock);
2168 		sbi->s_ext_blocks += ee_len;
2169 		sbi->s_ext_extents++;
2170 		if (ee_len < sbi->s_ext_min)
2171 			sbi->s_ext_min = ee_len;
2172 		if (ee_len > sbi->s_ext_max)
2173 			sbi->s_ext_max = ee_len;
2174 		if (ext_depth(inode) > sbi->s_depth_max)
2175 			sbi->s_depth_max = ext_depth(inode);
2176 		spin_unlock(&sbi->s_ext_stats_lock);
2177 	}
2178 #endif
2179 	if (from >= le32_to_cpu(ex->ee_block)
2180 	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
2181 		/* tail removal */
2182 		ext4_lblk_t num;
2183 		ext4_fsblk_t start;
2184 
2185 		num = le32_to_cpu(ex->ee_block) + ee_len - from;
2186 		start = ext4_ext_pblock(ex) + ee_len - num;
2187 		ext_debug("free last %u blocks starting %llu\n", num, start);
2188 		ext4_free_blocks(handle, inode, NULL, start, num, flags);
2189 	} else if (from == le32_to_cpu(ex->ee_block)
2190 		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
2191 		printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
2192 			from, to, le32_to_cpu(ex->ee_block), ee_len);
2193 	} else {
2194 		printk(KERN_INFO "strange request: removal(2) "
2195 				"%u-%u from %u:%u\n",
2196 				from, to, le32_to_cpu(ex->ee_block), ee_len);
2197 	}
2198 	return 0;
2199 }
2200 
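/*
 * Worked example for the tail-removal arithmetic in ext4_remove_blocks()
 * above: an extent with ee_block = 100, ee_len = 8 and pblock = 5000,
 * truncated with from = 103 and to = 107, gives
 * num = 100 + 8 - 103 = 5 blocks to free, starting at
 * start = 5000 + 8 - 5 = 5003, i.e. the physical blocks backing
 * logical blocks 103..107.
 */
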
2201 static int
2202 ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2203 		struct ext4_ext_path *path, ext4_lblk_t start)
2204 {
2205 	int err = 0, correct_index = 0;
2206 	int depth = ext_depth(inode), credits;
2207 	struct ext4_extent_header *eh;
2208 	ext4_lblk_t a, b, block;
2209 	unsigned num;
2210 	ext4_lblk_t ex_ee_block;
2211 	unsigned short ex_ee_len;
2212 	unsigned uninitialized = 0;
2213 	struct ext4_extent *ex;
2214 
2215 	/* the header must be checked already in ext4_ext_remove_space() */
2216 	ext_debug("truncate since %u in leaf\n", start);
2217 	if (!path[depth].p_hdr)
2218 		path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
2219 	eh = path[depth].p_hdr;
2220 	if (unlikely(path[depth].p_hdr == NULL)) {
2221 		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2222 		return -EIO;
2223 	}
2224 	/* find where to start removing */
2225 	ex = EXT_LAST_EXTENT(eh);
2226 
2227 	ex_ee_block = le32_to_cpu(ex->ee_block);
2228 	ex_ee_len = ext4_ext_get_actual_len(ex);
2229 
2230 	while (ex >= EXT_FIRST_EXTENT(eh) &&
2231 			ex_ee_block + ex_ee_len > start) {
2232 
2233 		if (ext4_ext_is_uninitialized(ex))
2234 			uninitialized = 1;
2235 		else
2236 			uninitialized = 0;
2237 
2238 		ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
2239 			 uninitialized, ex_ee_len);
2240 		path[depth].p_ext = ex;
2241 
2242 		a = ex_ee_block > start ? ex_ee_block : start;
2243 		b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ?
2244 			ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK;
2245 
2246 		ext_debug("  border %u:%u\n", a, b);
2247 
2248 		if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) {
2249 			block = 0;
2250 			num = 0;
2251 			BUG();
2252 		} else if (a != ex_ee_block) {
2253 			/* remove tail of the extent */
2254 			block = ex_ee_block;
2255 			num = a - block;
2256 		} else if (b != ex_ee_block + ex_ee_len - 1) {
2257 			/* remove head of the extent */
2258 			block = a;
2259 			num = b - a;
2260 			/* there is no "make a hole" API yet */
2261 			BUG();
2262 		} else {
2263 			/* remove whole extent: excellent! */
2264 			block = ex_ee_block;
2265 			num = 0;
2266 			BUG_ON(a != ex_ee_block);
2267 			BUG_ON(b != ex_ee_block + ex_ee_len - 1);
2268 		}
2269 
2270 		/*
2271 		 * 3 for leaf, sb, and inode plus 2 (bmap and group
2272 		 * descriptor) for each block group; assume two block
2273 		 * groups plus ex_ee_len/blocks_per_block_group for
2274 		 * the worst case
2275 		 */
2276 		credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
2277 		if (ex == EXT_FIRST_EXTENT(eh)) {
2278 			correct_index = 1;
2279 			credits += (ext_depth(inode)) + 1;
2280 		}
2281 		credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
2282 
2283 		err = ext4_ext_truncate_extend_restart(handle, inode, credits);
2284 		if (err)
2285 			goto out;
2286 
2287 		err = ext4_ext_get_access(handle, inode, path + depth);
2288 		if (err)
2289 			goto out;
2290 
2291 		err = ext4_remove_blocks(handle, inode, ex, a, b);
2292 		if (err)
2293 			goto out;
2294 
2295 		if (num == 0) {
2296 			/* this extent is removed; mark slot entirely unused */
2297 			ext4_ext_store_pblock(ex, 0);
2298 			le16_add_cpu(&eh->eh_entries, -1);
2299 		}
2300 
2301 		ex->ee_block = cpu_to_le32(block);
2302 		ex->ee_len = cpu_to_le16(num);
2303 		/*
2304 		 * Do not mark uninitialized if all the blocks in the
2305 		 * extent have been removed.
2306 		 */
2307 		if (uninitialized && num)
2308 			ext4_ext_mark_uninitialized(ex);
2309 
2310 		err = ext4_ext_dirty(handle, inode, path + depth);
2311 		if (err)
2312 			goto out;
2313 
2314 		ext_debug("new extent: %u:%u:%llu\n", block, num,
2315 				ext4_ext_pblock(ex));
2316 		ex--;
2317 		ex_ee_block = le32_to_cpu(ex->ee_block);
2318 		ex_ee_len = ext4_ext_get_actual_len(ex);
2319 	}
2320 
2321 	if (correct_index && eh->eh_entries)
2322 		err = ext4_ext_correct_indexes(handle, inode, path);
2323 
2324 	/* if this leaf is free, then we should
2325 	 * remove it from index block above */
2326 	if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
2327 		err = ext4_ext_rm_idx(handle, inode, path + depth);
2328 
2329 out:
2330 	return err;
2331 }
2332 
2333 /*
2334  * ext4_ext_more_to_rm:
2335  * returns 1 if current index has to be freed (even partial)
2336  */
2337 static int
2338 ext4_ext_more_to_rm(struct ext4_ext_path *path)
2339 {
2340 	BUG_ON(path->p_idx == NULL);
2341 
2342 	if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
2343 		return 0;
2344 
2345 	/*
2346 	 * if truncation on a deeper level happened, it wasn't partial,
2347 	 * so we have to consider the current index for truncation
2348 	 */
2349 	if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
2350 		return 0;
2351 	return 1;
2352 }
2353 
2354 static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2355 {
2356 	struct super_block *sb = inode->i_sb;
2357 	int depth = ext_depth(inode);
2358 	struct ext4_ext_path *path;
2359 	handle_t *handle;
2360 	int i, err;
2361 
2362 	ext_debug("truncate since %u\n", start);
2363 
2364 	/* probably the first extent we are going to free will be the last in the block */
2365 	handle = ext4_journal_start(inode, depth + 1);
2366 	if (IS_ERR(handle))
2367 		return PTR_ERR(handle);
2368 
2369 again:
2370 	ext4_ext_invalidate_cache(inode);
2371 
2372 	/*
2373 	 * We start scanning from right side, freeing all the blocks
2374 	 * after i_size and walking into the tree depth-wise.
2375 	 */
2376 	depth = ext_depth(inode);
2377 	path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
2378 	if (path == NULL) {
2379 		ext4_journal_stop(handle);
2380 		return -ENOMEM;
2381 	}
2382 	path[0].p_depth = depth;
2383 	path[0].p_hdr = ext_inode_hdr(inode);
2384 	if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
2385 		err = -EIO;
2386 		goto out;
2387 	}
2388 	i = err = 0;
2389 
2390 	while (i >= 0 && err == 0) {
2391 		if (i == depth) {
2392 			/* this is leaf block */
2393 			err = ext4_ext_rm_leaf(handle, inode, path, start);
2394 			/* root level has p_bh == NULL, brelse() eats this */
2395 			brelse(path[i].p_bh);
2396 			path[i].p_bh = NULL;
2397 			i--;
2398 			continue;
2399 		}
2400 
2401 		/* this is index block */
2402 		if (!path[i].p_hdr) {
2403 			ext_debug("initialize header\n");
2404 			path[i].p_hdr = ext_block_hdr(path[i].p_bh);
2405 		}
2406 
2407 		if (!path[i].p_idx) {
2408 			/* this level hasn't been touched yet */
2409 			path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
2410 			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
2411 			ext_debug("init index ptr: hdr 0x%p, num %d\n",
2412 				  path[i].p_hdr,
2413 				  le16_to_cpu(path[i].p_hdr->eh_entries));
2414 		} else {
2415 			/* we were already here; move to the next index */
2416 			path[i].p_idx--;
2417 		}
2418 
2419 		ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
2420 				i, EXT_FIRST_INDEX(path[i].p_hdr),
2421 				path[i].p_idx);
2422 		if (ext4_ext_more_to_rm(path + i)) {
2423 			struct buffer_head *bh;
2424 			/* go to the next level */
2425 			ext_debug("move to level %d (block %llu)\n",
2426 				  i + 1, ext4_idx_pblock(path[i].p_idx));
2427 			memset(path + i + 1, 0, sizeof(*path));
2428 			bh = sb_bread(sb, ext4_idx_pblock(path[i].p_idx));
2429 			if (!bh) {
2430 				/* should we reset i_size? */
2431 				err = -EIO;
2432 				break;
2433 			}
2434 			if (WARN_ON(i + 1 > depth)) {
2435 				err = -EIO;
2436 				break;
2437 			}
2438 			if (ext4_ext_check(inode, ext_block_hdr(bh),
2439 							depth - i - 1)) {
2440 				err = -EIO;
2441 				break;
2442 			}
2443 			path[i + 1].p_bh = bh;
2444 
2445 			/* save actual number of indexes since this
2446 			 * number is changed at the next iteration */
2447 			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
2448 			i++;
2449 		} else {
2450 			/* we finished processing this index, go up */
2451 			if (path[i].p_hdr->eh_entries == 0 && i > 0) {
2452 				/* index is empty, remove it;
2453 				 * the handle must already be prepared by
2454 				 * the leaf removal (ext4_ext_rm_leaf()) */
2455 				err = ext4_ext_rm_idx(handle, inode, path + i);
2456 			}
2457 			/* root level has p_bh == NULL, brelse() eats this */
2458 			brelse(path[i].p_bh);
2459 			path[i].p_bh = NULL;
2460 			i--;
2461 			ext_debug("return to level %d\n", i);
2462 		}
2463 	}
2464 
2465 	/* TODO: flexible tree reduction should be here */
2466 	if (path->p_hdr->eh_entries == 0) {
2467 		/*
2468 		 * truncate to zero freed all the tree,
2469 		 * so we need to correct eh_depth
2470 		 */
2471 		err = ext4_ext_get_access(handle, inode, path);
2472 		if (err == 0) {
2473 			ext_inode_hdr(inode)->eh_depth = 0;
2474 			ext_inode_hdr(inode)->eh_max =
2475 				cpu_to_le16(ext4_ext_space_root(inode, 0));
2476 			err = ext4_ext_dirty(handle, inode, path);
2477 		}
2478 	}
2479 out:
2480 	ext4_ext_drop_refs(path);
2481 	kfree(path);
2482 	if (err == -EAGAIN)
2483 		goto again;
2484 	ext4_journal_stop(handle);
2485 
2486 	return err;
2487 }
2488 
2489 /*
2490  * called at mount time
2491  */
2492 void ext4_ext_init(struct super_block *sb)
2493 {
2494 	/*
2495 	 * possible initialization would be here
2496 	 */
2497 
2498 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
2499 #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
2500 		printk(KERN_INFO "EXT4-fs: file extents enabled");
2501 #ifdef AGGRESSIVE_TEST
2502 		printk(", aggressive tests");
2503 #endif
2504 #ifdef CHECK_BINSEARCH
2505 		printk(", check binsearch");
2506 #endif
2507 #ifdef EXTENTS_STATS
2508 		printk(", stats");
2509 #endif
2510 		printk("\n");
2511 #endif
2512 #ifdef EXTENTS_STATS
2513 		spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
2514 		EXT4_SB(sb)->s_ext_min = 1 << 30;
2515 		EXT4_SB(sb)->s_ext_max = 0;
2516 #endif
2517 	}
2518 }
2519 
2520 /*
2521  * called at umount time
2522  */
2523 void ext4_ext_release(struct super_block *sb)
2524 {
2525 	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
2526 		return;
2527 
2528 #ifdef EXTENTS_STATS
2529 	if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
2530 		struct ext4_sb_info *sbi = EXT4_SB(sb);
2531 		printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
2532 			sbi->s_ext_blocks, sbi->s_ext_extents,
2533 			sbi->s_ext_blocks / sbi->s_ext_extents);
2534 		printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
2535 			sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
2536 	}
2537 #endif
2538 }
2539 
2540 /* FIXME!! we need to try to merge to left or right after zero-out  */
2541 static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2542 {
2543 	ext4_fsblk_t ee_pblock;
2544 	unsigned int ee_len;
2545 	int ret;
2546 
2547 	ee_len    = ext4_ext_get_actual_len(ex);
2548 	ee_pblock = ext4_ext_pblock(ex);
2549 
2550 	ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
2551 	if (ret > 0)
2552 		ret = 0;
2553 
2554 	return ret;
2555 }
2556 
2557 /*
2558  * used by extent splitting.
2559  */
2560 #define EXT4_EXT_MAY_ZEROOUT	0x1  /* safe to zeroout if split fails \
2561 					due to ENOSPC */
2562 #define EXT4_EXT_MARK_UNINIT1	0x2  /* mark first half uninitialized */
2563 #define EXT4_EXT_MARK_UNINIT2	0x4  /* mark second half uninitialized */
2564 
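/*
 * Example flag combination (illustrative): to split an uninitialized
 * extent while keeping both halves uninitialized, with zeroout as the
 * ENOSPC fallback -- the combination ext4_split_extent() below builds
 * for the first cut of an uninitialized extent when zeroout is allowed:
 *
 *	split_flag = EXT4_EXT_MAY_ZEROOUT |
 *		     EXT4_EXT_MARK_UNINIT1 |
 *		     EXT4_EXT_MARK_UNINIT2;
 */
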
2565 /*
2566  * ext4_split_extent_at() splits an extent at given block.
2567  *
2568  * @handle: the journal handle
2569  * @inode: the file inode
2570  * @path: the path to the extent
2571  * @split: the logical block where the extent is split.
2572  * @split_flag: indicates if the extent could be zeroed out if the split
2573  *		 fails, and the states (init or uninit) of the new extents.
2574  * @flags: flags used to insert the new extent into the extent tree.
2575  *
2576  *
2577  * Splits extent [a, b] into two extents [a, @split) and [@split, b], states
2578  * of which are deterimined by split_flag.
2579  *
2580  * There are two cases:
2581  *  a> the extent are splitted into two extent.
2582  *  b> split is not needed, and just mark the extent.
2583  *
2584  * return 0 on success.
2585  */
2586 static int ext4_split_extent_at(handle_t *handle,
2587 			     struct inode *inode,
2588 			     struct ext4_ext_path *path,
2589 			     ext4_lblk_t split,
2590 			     int split_flag,
2591 			     int flags)
2592 {
2593 	ext4_fsblk_t newblock;
2594 	ext4_lblk_t ee_block;
2595 	struct ext4_extent *ex, newex, orig_ex;
2596 	struct ext4_extent *ex2 = NULL;
2597 	unsigned int ee_len, depth;
2598 	int err = 0;
2599 
2600 	ext_debug("ext4_split_extent_at: inode %lu, logical "
2601 		"block %llu\n", inode->i_ino, (unsigned long long)split);
2602 
2603 	ext4_ext_show_leaf(inode, path);
2604 
2605 	depth = ext_depth(inode);
2606 	ex = path[depth].p_ext;
2607 	ee_block = le32_to_cpu(ex->ee_block);
2608 	ee_len = ext4_ext_get_actual_len(ex);
2609 	newblock = split - ee_block + ext4_ext_pblock(ex);
2610 
2611 	BUG_ON(split < ee_block || split >= (ee_block + ee_len));
2612 
2613 	err = ext4_ext_get_access(handle, inode, path + depth);
2614 	if (err)
2615 		goto out;
2616 
2617 	if (split == ee_block) {
2618 		/*
2619 		 * case b: block @split is the block that the extent begins with;
2620 		 * then we just change the state of the extent, and splitting
2621 		 * is not needed.
2622 		 */
2623 		if (split_flag & EXT4_EXT_MARK_UNINIT2)
2624 			ext4_ext_mark_uninitialized(ex);
2625 		else
2626 			ext4_ext_mark_initialized(ex);
2627 
2628 		if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
2629 			ext4_ext_try_to_merge(inode, path, ex);
2630 
2631 		err = ext4_ext_dirty(handle, inode, path + depth);
2632 		goto out;
2633 	}
2634 
2635 	/* case a */
2636 	memcpy(&orig_ex, ex, sizeof(orig_ex));
2637 	ex->ee_len = cpu_to_le16(split - ee_block);
2638 	if (split_flag & EXT4_EXT_MARK_UNINIT1)
2639 		ext4_ext_mark_uninitialized(ex);
2640 
2641 	/*
2642 	 * the path may lead to a new leaf, not to the original leaf any more,
2643 	 * after ext4_ext_insert_extent() returns.
2644 	 */
2645 	err = ext4_ext_dirty(handle, inode, path + depth);
2646 	if (err)
2647 		goto fix_extent_len;
2648 
2649 	ex2 = &newex;
2650 	ex2->ee_block = cpu_to_le32(split);
2651 	ex2->ee_len   = cpu_to_le16(ee_len - (split - ee_block));
2652 	ext4_ext_store_pblock(ex2, newblock);
2653 	if (split_flag & EXT4_EXT_MARK_UNINIT2)
2654 		ext4_ext_mark_uninitialized(ex2);
2655 
2656 	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
2657 	if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
2658 		err = ext4_ext_zeroout(inode, &orig_ex);
2659 		if (err)
2660 			goto fix_extent_len;
2661 		/* update the extent length and mark as initialized */
2662 		ex->ee_len = cpu_to_le16(ee_len);
2663 		ext4_ext_try_to_merge(inode, path, ex);
2664 		err = ext4_ext_dirty(handle, inode, path + depth);
2665 		goto out;
2666 	} else if (err)
2667 		goto fix_extent_len;
2668 
2669 out:
2670 	ext4_ext_show_leaf(inode, path);
2671 	return err;
2672 
2673 fix_extent_len:
2674 	ex->ee_len = orig_ex.ee_len;
2675 	ext4_ext_dirty(handle, inode, path + depth);
2676 	return err;
2677 }
2678 
2679 /*
2680  * ext4_split_extent() splits an extent and marks the extent covered
2681  * by @map as split_flag indicates.
2682  *
2683  * It may result in splitting the extent into multiple extents (up to three).
2684  * There are three possibilities:
2685  *   a> There is no split required
2686  *   b> Splits in two extents: Split happens at either end of the extent
2687  *   c> Splits in three extents: Someone is splitting in the middle of the extent
2688  *
2689  */
2690 static int ext4_split_extent(handle_t *handle,
2691 			      struct inode *inode,
2692 			      struct ext4_ext_path *path,
2693 			      struct ext4_map_blocks *map,
2694 			      int split_flag,
2695 			      int flags)
2696 {
2697 	ext4_lblk_t ee_block;
2698 	struct ext4_extent *ex;
2699 	unsigned int ee_len, depth;
2700 	int err = 0;
2701 	int uninitialized;
2702 	int split_flag1, flags1;
2703 
2704 	depth = ext_depth(inode);
2705 	ex = path[depth].p_ext;
2706 	ee_block = le32_to_cpu(ex->ee_block);
2707 	ee_len = ext4_ext_get_actual_len(ex);
2708 	uninitialized = ext4_ext_is_uninitialized(ex);
2709 
2710 	if (map->m_lblk + map->m_len < ee_block + ee_len) {
2711 		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
2712 			      EXT4_EXT_MAY_ZEROOUT : 0;
2713 		flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
2714 		if (uninitialized)
2715 			split_flag1 |= EXT4_EXT_MARK_UNINIT1 |
2716 				       EXT4_EXT_MARK_UNINIT2;
2717 		err = ext4_split_extent_at(handle, inode, path,
2718 				map->m_lblk + map->m_len, split_flag1, flags1);
2719 	}
2720 
2721 	ext4_ext_drop_refs(path);
2722 	path = ext4_ext_find_extent(inode, map->m_lblk, path);
2723 	if (IS_ERR(path))
2724 		return PTR_ERR(path);
2725 
2726 	if (map->m_lblk >= ee_block) {
2727 		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
2728 			      EXT4_EXT_MAY_ZEROOUT : 0;
2729 		if (uninitialized)
2730 			split_flag1 |= EXT4_EXT_MARK_UNINIT1;
2731 		if (split_flag & EXT4_EXT_MARK_UNINIT2)
2732 			split_flag1 |= EXT4_EXT_MARK_UNINIT2;
2733 		err = ext4_split_extent_at(handle, inode, path,
2734 				map->m_lblk, split_flag1, flags);
2735 		if (err)
2736 			goto out;
2737 	}
2738 
2739 	ext4_ext_show_leaf(inode, path);
2740 out:
2741 	return err ? err : map->m_len;
2742 }
2743 
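/*
 * Layout sketch for the three-extent case (c) above: mapping @map into
 * the middle of a single uninitialized extent leaves
 *
 *	[ee_block .. m_lblk-1][m_lblk .. m_lblk+m_len-1][.. ee_block+ee_len-1]
 *	       uninit              @map (per split_flag)        uninit
 *
 * produced by two calls to ext4_split_extent_at(): first at
 * map->m_lblk + map->m_len, then at map->m_lblk.
 */
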
2744 #define EXT4_EXT_ZERO_LEN 7
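/*
 * The idea behind this threshold (as used by the checks below): for a
 * conversion touching at most 2 * EXT4_EXT_ZERO_LEN blocks it is
 * presumed cheaper to zero the blocks on disk via ext4_ext_zeroout()
 * than to split the extent and grow the tree.
 */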
2745 /*
2746  * This function is called by ext4_ext_map_blocks() if someone tries to write
2747  * to an uninitialized extent. It may result in splitting the uninitialized
2748  * extent into multiple extents (up to three - one initialized and two
2749  * uninitialized).
2750  * There are three possibilities:
2751  *   a> There is no split required: Entire extent should be initialized
2752  *   b> Splits in two extents: Write is happening at either end of the extent
2753  *   c> Splits in three extents: Someone is writing in the middle of the extent
2754  */
2755 static int ext4_ext_convert_to_initialized(handle_t *handle,
2756 					   struct inode *inode,
2757 					   struct ext4_map_blocks *map,
2758 					   struct ext4_ext_path *path)
2759 {
2760 	struct ext4_extent *ex, newex, orig_ex;
2761 	struct ext4_extent *ex1 = NULL;
2762 	struct ext4_extent *ex2 = NULL;
2763 	struct ext4_extent *ex3 = NULL;
2764 	struct ext4_extent_header *eh;
2765 	ext4_lblk_t ee_block, eof_block;
2766 	unsigned int allocated, ee_len, depth;
2767 	ext4_fsblk_t newblock;
2768 	int err = 0;
2769 	int ret = 0;
2770 	int may_zeroout;
2771 
2772 	ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical "
2773 		"block %llu, max_blocks %u\n", inode->i_ino,
2774 		(unsigned long long)map->m_lblk, map->m_len);
2775 
2776 	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
2777 		inode->i_sb->s_blocksize_bits;
2778 	if (eof_block < map->m_lblk + map->m_len)
2779 		eof_block = map->m_lblk + map->m_len;
2780 
2781 	depth = ext_depth(inode);
2782 	eh = path[depth].p_hdr;
2783 	ex = path[depth].p_ext;
2784 	ee_block = le32_to_cpu(ex->ee_block);
2785 	ee_len = ext4_ext_get_actual_len(ex);
2786 	allocated = ee_len - (map->m_lblk - ee_block);
2787 	newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
2788 
2789 	ex2 = ex;
2790 	orig_ex.ee_block = ex->ee_block;
2791 	orig_ex.ee_len   = cpu_to_le16(ee_len);
2792 	ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
2793 
2794 	/*
2795 	 * It is safe to convert extent to initialized via explicit
2796 	 * zeroout only if the extent is fully inside i_size or new_size.
2797 	 */
2798 	may_zeroout = ee_block + ee_len <= eof_block;
2799 
2800 	err = ext4_ext_get_access(handle, inode, path + depth);
2801 	if (err)
2802 		goto out;
2803 	/* If the extent is at most 2*EXT4_EXT_ZERO_LEN blocks, zero it out directly */
2804 	if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) {
2805 		err =  ext4_ext_zeroout(inode, &orig_ex);
2806 		if (err)
2807 			goto fix_extent_len;
2808 		/* update the extent length and mark as initialized */
2809 		ex->ee_block = orig_ex.ee_block;
2810 		ex->ee_len   = orig_ex.ee_len;
2811 		ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2812 		ext4_ext_dirty(handle, inode, path + depth);
2813 		/* zeroed the full extent */
2814 		return allocated;
2815 	}
2816 
2817 	/* ex1: ee_block to map->m_lblk - 1 : uninitialized */
2818 	if (map->m_lblk > ee_block) {
2819 		ex1 = ex;
2820 		ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2821 		ext4_ext_mark_uninitialized(ex1);
2822 		ex2 = &newex;
2823 	}
2824 	/*
2825 	 * for sanity, update the length of the ex2 extent before
2826 	 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2827 	 * overlap of blocks.
2828 	 */
2829 	if (!ex1 && allocated > map->m_len)
2830 		ex2->ee_len = cpu_to_le16(map->m_len);
2831 	/* ex3: to ee_block + ee_len : uninitialized */
2832 	if (allocated > map->m_len) {
2833 		unsigned int newdepth;
2834 		/* If the extent tail is at most EXT4_EXT_ZERO_LEN blocks, zero it out directly */
2835 		if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
2836 			/*
2837 			 * map->m_lblk == ee_block is handled by the zeroout
2838 			 * at the beginning.
2839 			 * Mark first half uninitialized.
2840 			 * Mark second half initialized and zero out the
2841 			 * initialized extent
2842 			 */
2843 			ex->ee_block = orig_ex.ee_block;
2844 			ex->ee_len   = cpu_to_le16(ee_len - allocated);
2845 			ext4_ext_mark_uninitialized(ex);
2846 			ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2847 			ext4_ext_dirty(handle, inode, path + depth);
2848 
2849 			ex3 = &newex;
2850 			ex3->ee_block = cpu_to_le32(map->m_lblk);
2851 			ext4_ext_store_pblock(ex3, newblock);
2852 			ex3->ee_len = cpu_to_le16(allocated);
2853 			err = ext4_ext_insert_extent(handle, inode, path,
2854 							ex3, 0);
2855 			if (err == -ENOSPC) {
2856 				err =  ext4_ext_zeroout(inode, &orig_ex);
2857 				if (err)
2858 					goto fix_extent_len;
2859 				ex->ee_block = orig_ex.ee_block;
2860 				ex->ee_len   = orig_ex.ee_len;
2861 				ext4_ext_store_pblock(ex,
2862 					ext4_ext_pblock(&orig_ex));
2863 				ext4_ext_dirty(handle, inode, path + depth);
2864 				/* blocks available from map->m_lblk */
2865 				return allocated;
2866 
2867 			} else if (err)
2868 				goto fix_extent_len;
2869 
2870 			/*
2871 			 * We need to zero out the second half because
2872 			 * a fallocate request can update the file size and
2873 			 * converting the second half to an initialized extent
2874 			 * implies that we could leak junk data to user
2875 			 * space.
2876 			 */
2877 			err =  ext4_ext_zeroout(inode, ex3);
2878 			if (err) {
2879 				/*
2880 				 * We should actually mark the
2881 				 * second half as uninit and return the error;
2882 				 * the insert would have changed the extent.
2883 				 */
2884 				depth = ext_depth(inode);
2885 				ext4_ext_drop_refs(path);
2886 				path = ext4_ext_find_extent(inode, map->m_lblk,
2887 							    path);
2888 				if (IS_ERR(path)) {
2889 					err = PTR_ERR(path);
2890 					return err;
2891 				}
2892 				/* get the second half extent details */
2893 				ex = path[depth].p_ext;
2894 				err = ext4_ext_get_access(handle, inode,
2895 								path + depth);
2896 				if (err)
2897 					return err;
2898 				ext4_ext_mark_uninitialized(ex);
2899 				ext4_ext_dirty(handle, inode, path + depth);
2900 				return err;
2901 			}
2902 
2903 			/* zeroed the second half */
2904 			return allocated;
2905 		}
2906 		ex3 = &newex;
2907 		ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
2908 		ext4_ext_store_pblock(ex3, newblock + map->m_len);
2909 		ex3->ee_len = cpu_to_le16(allocated - map->m_len);
2910 		ext4_ext_mark_uninitialized(ex3);
2911 		err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
2912 		if (err == -ENOSPC && may_zeroout) {
2913 			err =  ext4_ext_zeroout(inode, &orig_ex);
2914 			if (err)
2915 				goto fix_extent_len;
2916 			/* update the extent length and mark as initialized */
2917 			ex->ee_block = orig_ex.ee_block;
2918 			ex->ee_len   = orig_ex.ee_len;
2919 			ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2920 			ext4_ext_dirty(handle, inode, path + depth);
2921 			/* zeroed the full extent */
2922 			/* blocks available from map->m_lblk */
2923 			return allocated;
2924 
2925 		} else if (err)
2926 			goto fix_extent_len;
2927 		/*
2928 		 * The depth, and hence eh & ex might change
2929 		 * as part of the insert above.
2930 		 */
2931 		newdepth = ext_depth(inode);
2932 		/*
2933 		 * update the extent length after successful insert of the
2934 		 * split extent
2935 		 */
2936 		ee_len -= ext4_ext_get_actual_len(ex3);
2937 		orig_ex.ee_len = cpu_to_le16(ee_len);
2938 		may_zeroout = ee_block + ee_len <= eof_block;
2939 
2940 		depth = newdepth;
2941 		ext4_ext_drop_refs(path);
2942 		path = ext4_ext_find_extent(inode, map->m_lblk, path);
2943 		if (IS_ERR(path)) {
2944 			err = PTR_ERR(path);
2945 			goto out;
2946 		}
2947 		eh = path[depth].p_hdr;
2948 		ex = path[depth].p_ext;
2949 		if (ex2 != &newex)
2950 			ex2 = ex;
2951 
2952 		err = ext4_ext_get_access(handle, inode, path + depth);
2953 		if (err)
2954 			goto out;
2955 
2956 		allocated = map->m_len;
2957 
2958 		/* If the extent is at most EXT4_EXT_ZERO_LEN blocks and we are
2959 		 * trying to insert an extent in the middle, zero it out directly;
2960 		 * otherwise give the extent a chance to merge to the left
2961 		 */
2962 		if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
2963 			map->m_lblk != ee_block && may_zeroout) {
2964 			err =  ext4_ext_zeroout(inode, &orig_ex);
2965 			if (err)
2966 				goto fix_extent_len;
2967 			/* update the extent length and mark as initialized */
2968 			ex->ee_block = orig_ex.ee_block;
2969 			ex->ee_len   = orig_ex.ee_len;
2970 			ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2971 			ext4_ext_dirty(handle, inode, path + depth);
2972 			/* zero out the first half */
2973 			/* blocks available from map->m_lblk */
2974 			return allocated;
2975 		}
2976 	}
2977 	/*
2978 	 * If there was a change of depth as part of the
2979 	 * insertion of ex3 above, we need to update the length
2980 	 * of the ex1 extent again here
2981 	 */
2982 	if (ex1 && ex1 != ex) {
2983 		ex1 = ex;
2984 		ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2985 		ext4_ext_mark_uninitialized(ex1);
2986 		ex2 = &newex;
2987 	}
2988 	/* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */
2989 	ex2->ee_block = cpu_to_le32(map->m_lblk);
2990 	ext4_ext_store_pblock(ex2, newblock);
2991 	ex2->ee_len = cpu_to_le16(allocated);
2992 	if (ex2 != ex)
2993 		goto insert;
2994 	/*
2995 	 * New (initialized) extent starts from the first block
2996 	 * in the current extent. i.e., ex2 == ex
2997 	 * We have to see if it can be merged with the extent
2998 	 * on the left.
2999 	 */
3000 	if (ex2 > EXT_FIRST_EXTENT(eh)) {
3001 		/*
3002 		 * To merge left, pass "ex2 - 1" to try_to_merge(),
3003 		 * since it merges towards right _only_.
3004 		 */
3005 		ret = ext4_ext_try_to_merge(inode, path, ex2 - 1);
3006 		if (ret) {
3007 			err = ext4_ext_correct_indexes(handle, inode, path);
3008 			if (err)
3009 				goto out;
3010 			depth = ext_depth(inode);
3011 			ex2--;
3012 		}
3013 	}
3014 	/*
3015 	 * Try to merge towards the right. This might be required
3016 	 * only when the whole extent is being written to.
3017 	 * i.e. ex2 == ex and ex3 == NULL.
3018 	 */
3019 	if (!ex3) {
3020 		ret = ext4_ext_try_to_merge(inode, path, ex2);
3021 		if (ret) {
3022 			err = ext4_ext_correct_indexes(handle, inode, path);
3023 			if (err)
3024 				goto out;
3025 		}
3026 	}
3027 	/* Mark modified extent as dirty */
3028 	err = ext4_ext_dirty(handle, inode, path + depth);
3029 	goto out;
3030 insert:
3031 	err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
3032 	if (err == -ENOSPC && may_zeroout) {
3033 		err =  ext4_ext_zeroout(inode, &orig_ex);
3034 		if (err)
3035 			goto fix_extent_len;
3036 		/* update the extent length and mark as initialized */
3037 		ex->ee_block = orig_ex.ee_block;
3038 		ex->ee_len   = orig_ex.ee_len;
3039 		ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
3040 		ext4_ext_dirty(handle, inode, path + depth);
3041 		/* zero out the first half */
3042 		return allocated;
3043 	} else if (err)
3044 		goto fix_extent_len;
3045 out:
3046 	ext4_ext_show_leaf(inode, path);
3047 	return err ? err : allocated;
3048 
3049 fix_extent_len:
3050 	ex->ee_block = orig_ex.ee_block;
3051 	ex->ee_len   = orig_ex.ee_len;
3052 	ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
3053 	ext4_ext_mark_uninitialized(ex);
3054 	ext4_ext_dirty(handle, inode, path + depth);
3055 	return err;
3056 }
3057 
3058 /*
3059  * This function is called by ext4_ext_map_blocks() from
3060  * ext4_get_blocks_dio_write() when DIO to write
3061  * to an uninitialized extent.
3062  *
3063  * Writing to an uninitialized extent may result in splitting the uninitialized
3064  * extent into multiple initialized/uninitialized extents (up to three).
3065  * There are three possibilities:
3066  *   a> There is no split required: Entire extent should be uninitialized
3067  *   b> Splits in two extents: Write is happening at either end of the extent
3068  *   c> Splits in three extents: Someone is writing in the middle of the extent
3069  *
3070  * One or more index blocks may be needed if the extent tree grows after
3071  * the uninitialized extent is split. To prevent ENOSPC from occurring at
3072  * IO completion, we need to split the uninitialized extent before DIO
3073  * submits the IO. The uninitialized extent will be split into at most
3074  * three uninitialized extents. After IO completes, the part that was
3075  * filled will be converted to initialized by the end_io callback
3076  * via ext4_convert_unwritten_extents().
3077  *
3078  * Returns the size of uninitialized extent to be written on success.
3079  */
3080 static int ext4_split_unwritten_extents(handle_t *handle,
3081 					struct inode *inode,
3082 					struct ext4_map_blocks *map,
3083 					struct ext4_ext_path *path,
3084 					int flags)
3085 {
3086 	struct ext4_extent *ex, newex, orig_ex;
3087 	struct ext4_extent *ex1 = NULL;
3088 	struct ext4_extent *ex2 = NULL;
3089 	struct ext4_extent *ex3 = NULL;
3090 	ext4_lblk_t ee_block, eof_block;
3091 	unsigned int allocated, ee_len, depth;
3092 	ext4_fsblk_t newblock;
3093 	int err = 0;
3094 	int may_zeroout;
3095 
3096 	ext_debug("ext4_split_unwritten_extents: inode %lu, logical "
3097 		"block %llu, max_blocks %u\n", inode->i_ino,
3098 		(unsigned long long)map->m_lblk, map->m_len);
3099 
3100 	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
3101 		inode->i_sb->s_blocksize_bits;
3102 	if (eof_block < map->m_lblk + map->m_len)
3103 		eof_block = map->m_lblk + map->m_len;
3104 
3105 	depth = ext_depth(inode);
3106 	ex = path[depth].p_ext;
3107 	ee_block = le32_to_cpu(ex->ee_block);
3108 	ee_len = ext4_ext_get_actual_len(ex);
3109 	allocated = ee_len - (map->m_lblk - ee_block);
3110 	newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
3111 
3112 	ex2 = ex;
3113 	orig_ex.ee_block = ex->ee_block;
3114 	orig_ex.ee_len   = cpu_to_le16(ee_len);
3115 	ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
3116 
3117 	/*
3118 	 * It is safe to convert extent to initialized via explicit
3119 	 * zeroout only if the extent is fully inside i_size or new_size.
3120 	 */
3121 	may_zeroout = ee_block + ee_len <= eof_block;
3122 
3123 	/*
3124 	 * If the uninitialized extent begins at the same logical
3125 	 * block where the write begins, and the write completely
3126 	 * covers the extent, then we don't need to split it.
3127 	 */
3128 	if ((map->m_lblk == ee_block) && (allocated <= map->m_len))
3129 		return allocated;
3130 
3131 	err = ext4_ext_get_access(handle, inode, path + depth);
3132 	if (err)
3133 		goto out;
3134 	/* ex1: ee_block to map->m_lblk - 1 : uninitialized */
3135 	if (map->m_lblk > ee_block) {
3136 		ex1 = ex;
3137 		ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
3138 		ext4_ext_mark_uninitialized(ex1);
3139 		ex2 = &newex;
3140 	}
3141 	/*
3142 	 * for sanity, update the length of the ex2 extent before
3143 	 * we insert ex3, if ex1 is NULL. This is to avoid temporary
3144 	 * overlap of blocks.
3145 	 */
3146 	if (!ex1 && allocated > map->m_len)
3147 		ex2->ee_len = cpu_to_le16(map->m_len);
3148 	/* ex3: to ee_block + ee_len : uninitialized */
3149 	if (allocated > map->m_len) {
3150 		unsigned int newdepth;
3151 		ex3 = &newex;
3152 		ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
3153 		ext4_ext_store_pblock(ex3, newblock + map->m_len);
3154 		ex3->ee_len = cpu_to_le16(allocated - map->m_len);
3155 		ext4_ext_mark_uninitialized(ex3);
3156 		err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
3157 		if (err == -ENOSPC && may_zeroout) {
3158 			err =  ext4_ext_zeroout(inode, &orig_ex);
3159 			if (err)
3160 				goto fix_extent_len;
3161 			/* update the extent length and mark as initialized */
3162 			ex->ee_block = orig_ex.ee_block;
3163 			ex->ee_len   = orig_ex.ee_len;
3164 			ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
3165 			ext4_ext_dirty(handle, inode, path + depth);
3166 			/* zeroed the full extent */
3167 			/* blocks available from map->m_lblk */
3168 			return allocated;
3169 
3170 		} else if (err)
3171 			goto fix_extent_len;
3172 		/*
3173 		 * The depth, and hence eh & ex might change
3174 		 * as part of the insert above.
3175 		 */
3176 		newdepth = ext_depth(inode);
3177 		/*
3178 		 * update the extent length after successful insert of the
3179 		 * split extent
3180 		 */
3181 		ee_len -= ext4_ext_get_actual_len(ex3);
3182 		orig_ex.ee_len = cpu_to_le16(ee_len);
3183 		may_zeroout = ee_block + ee_len <= eof_block;
3184 
3185 		depth = newdepth;
3186 		ext4_ext_drop_refs(path);
3187 		path = ext4_ext_find_extent(inode, map->m_lblk, path);
3188 		if (IS_ERR(path)) {
3189 			err = PTR_ERR(path);
3190 			goto out;
3191 		}
3192 		ex = path[depth].p_ext;
3193 		if (ex2 != &newex)
3194 			ex2 = ex;
3195 
3196 		err = ext4_ext_get_access(handle, inode, path + depth);
3197 		if (err)
3198 			goto out;
3199 
3200 		allocated = map->m_len;
3201 	}
3202 	/*
3203 	 * If there was a change of depth as part of the
3204 	 * insertion of ex3 above, we need to update the length
3205 	 * of the ex1 extent again here
3206 	 */
3207 	if (ex1 && ex1 != ex) {
3208 		ex1 = ex;
3209 		ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
3210 		ext4_ext_mark_uninitialized(ex1);
3211 		ex2 = &newex;
3212 	}
3213 	/*
3214 	 * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written
3215 	 * using direct I/O, still uninitialized.
3216 	 */
3217 	ex2->ee_block = cpu_to_le32(map->m_lblk);
3218 	ext4_ext_store_pblock(ex2, newblock);
3219 	ex2->ee_len = cpu_to_le16(allocated);
3220 	ext4_ext_mark_uninitialized(ex2);
3221 	if (ex2 != ex)
3222 		goto insert;
3223 	/* Mark modified extent as dirty */
3224 	err = ext4_ext_dirty(handle, inode, path + depth);
3225 	ext_debug("out here\n");
3226 	goto out;
3227 insert:
3228 	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3229 	if (err == -ENOSPC && may_zeroout) {
3230 		err =  ext4_ext_zeroout(inode, &orig_ex);
3231 		if (err)
3232 			goto fix_extent_len;
3233 		/* update the extent length and mark as initialized */
3234 		ex->ee_block = orig_ex.ee_block;
3235 		ex->ee_len   = orig_ex.ee_len;
3236 		ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
3237 		ext4_ext_dirty(handle, inode, path + depth);
3238 		/* zero out the first half */
3239 		return allocated;
3240 	} else if (err)
3241 		goto fix_extent_len;
3242 out:
3243 	ext4_ext_show_leaf(inode, path);
3244 	return err ? err : allocated;
3245 
3246 fix_extent_len:
3247 	ex->ee_block = orig_ex.ee_block;
3248 	ex->ee_len   = orig_ex.ee_len;
3249 	ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
3250 	ext4_ext_mark_uninitialized(ex);
3251 	ext4_ext_dirty(handle, inode, path + depth);
3252 	return err;
3253 }
3254 
3255 static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3256 					      struct inode *inode,
3257 					      struct ext4_ext_path *path)
3258 {
3259 	struct ext4_extent *ex;
3260 	struct ext4_extent_header *eh;
3261 	int depth;
3262 	int err = 0;
3263 
3264 	depth = ext_depth(inode);
3265 	eh = path[depth].p_hdr;
3266 	ex = path[depth].p_ext;
3267 
3268 	ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical "
3269 		"block %llu, max_blocks %u\n", inode->i_ino,
3270 		(unsigned long long)le32_to_cpu(ex->ee_block),
3271 		ext4_ext_get_actual_len(ex));
3272 
3273 	err = ext4_ext_get_access(handle, inode, path + depth);
3274 	if (err)
3275 		goto out;
3276 	/* first mark the extent as initialized */
3277 	ext4_ext_mark_initialized(ex);
3278 
3279 	/* note: ext4_ext_correct_indexes() isn't needed here because
3280 	 * borders are not changed
3281 	 */
3282 	ext4_ext_try_to_merge(inode, path, ex);
3283 
3284 	/* Mark modified extent as dirty */
3285 	err = ext4_ext_dirty(handle, inode, path + depth);
3286 out:
3287 	ext4_ext_show_leaf(inode, path);
3288 	return err;
3289 }
3290 
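/*
 * Forget any buffer_head aliases the block device mapping may still
 * hold for these blocks (e.g. from a previous life as metadata), so
 * that stale buffers are not written out over the new contents;
 * unmap_underlying_metadata() does the per-block work.
 */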
3291 static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3292 			sector_t block, int count)
3293 {
3294 	int i;
3295 	for (i = 0; i < count; i++)
3296 		unmap_underlying_metadata(bdev, block + i);
3297 }
3298 
3299 /*
3300  * Handle EOFBLOCKS_FL flag, clearing it if necessary
3301  */
3302 static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
3303 			      ext4_lblk_t lblk,
3304 			      struct ext4_ext_path *path,
3305 			      unsigned int len)
3306 {
3307 	int i, depth;
3308 	struct ext4_extent_header *eh;
3309 	struct ext4_extent *last_ex;
3310 
3311 	if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
3312 		return 0;
3313 
3314 	depth = ext_depth(inode);
3315 	eh = path[depth].p_hdr;
3316 
3317 	if (unlikely(!eh->eh_entries)) {
3318 		EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "
3319 				 "EOFBLOCKS_FL set");
3320 		return -EIO;
3321 	}
3322 	last_ex = EXT_LAST_EXTENT(eh);
3323 	/*
3324 	 * We should clear the EOFBLOCKS_FL flag if we are writing the
3325 	 * last block in the last extent in the file.  We test this by
3326 	 * first checking to see if the caller to
3327 	 * ext4_ext_get_blocks() was interested in the last block (or
3328 	 * a block beyond the last block) in the current extent.  If
3329 	 * this turns out to be false, we can bail out from this
3330 	 * function immediately.
3331 	 */
3332 	if (lblk + len < le32_to_cpu(last_ex->ee_block) +
3333 	    ext4_ext_get_actual_len(last_ex))
3334 		return 0;
3335 	/*
3336 	 * If the caller does appear to be planning to write at or
3337 	 * beyond the end of the current extent, we then test to see
3338 	 * if the current extent is the last extent in the file, by
3339 	 * checking to make sure it was reached via the rightmost node
3340 	 * at each level of the tree.
3341 	 */
3342 	for (i = depth-1; i >= 0; i--)
3343 		if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3344 			return 0;
3345 	ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3346 	return ext4_mark_inode_dirty(handle, inode);
3347 }
3348 
3349 static int
3350 ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3351 			struct ext4_map_blocks *map,
3352 			struct ext4_ext_path *path, int flags,
3353 			unsigned int allocated, ext4_fsblk_t newblock)
3354 {
3355 	int ret = 0;
3356 	int err = 0;
3357 	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3358 
3359 	ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical "
3360 		  "block %llu, max_blocks %u, flags %d, allocated %u\n",
3361 		  inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
3362 		  flags, allocated);
3363 	ext4_ext_show_leaf(inode, path);
3364 
3365 	/* get_block() before submit the IO, split the extent */
3366 	if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3367 		ret = ext4_split_unwritten_extents(handle, inode, map,
3368 						   path, flags);
3369 		/*
3370 		 * Flag the inode (non-aio case) or the end_io struct (aio case)
3371 		 * that this IO needs conversion to written when the IO is
3372 		 * completed
3373 		 */
3374 		if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
3375 			io->flag = EXT4_IO_END_UNWRITTEN;
3376 			atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
3377 		} else
3378 			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3379 		if (ext4_should_dioread_nolock(inode))
3380 			map->m_flags |= EXT4_MAP_UNINIT;
3381 		goto out;
3382 	}
3383 	/* at IO completion (end_io), convert the filled extent to written */
3384 	if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
3385 		ret = ext4_convert_unwritten_extents_endio(handle, inode,
3386 							path);
3387 		if (ret >= 0) {
3388 			ext4_update_inode_fsync_trans(handle, inode, 1);
3389 			err = check_eofblocks_fl(handle, inode, map->m_lblk,
3390 						 path, map->m_len);
3391 		} else
3392 			err = ret;
3393 		goto out2;
3394 	}
3395 	/* buffered IO case */
3396 	/*
3397 	 * repeated fallocate creation request:
3398 	 * we already have an unwritten extent
3399 	 */
3400 	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
3401 		goto map_out;
3402 
3403 	/* buffered READ or buffered write_begin() lookup */
3404 	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3405 		/*
3406 		 * We have blocks reserved already.  We
3407 		 * return allocated blocks so that delalloc
3408 		 * won't do block reservation for us.  But
3409 		 * the buffer head will be unmapped so that
3410 		 * a read from the block returns 0s.
3411 		 */
3412 		map->m_flags |= EXT4_MAP_UNWRITTEN;
3413 		goto out1;
3414 	}
3415 
3416 	/* buffered write, writepage time, convert */
3417 	ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
3418 	if (ret >= 0) {
3419 		ext4_update_inode_fsync_trans(handle, inode, 1);
3420 		err = check_eofblocks_fl(handle, inode, map->m_lblk, path,
3421 					 map->m_len);
3422 		if (err < 0)
3423 			goto out2;
3424 	}
3425 
3426 out:
3427 	if (ret <= 0) {
3428 		err = ret;
3429 		goto out2;
3430 	} else
3431 		allocated = ret;
3432 	map->m_flags |= EXT4_MAP_NEW;
3433 	/*
3434 	 * if we allocated more blocks than requested,
3435 	 * we need to make sure we unmap the extra blocks
3436 	 * allocated. The actually needed blocks will get
3437 	 * unmapped later when we find the buffer_head marked
3438 	 * new.
3439 	 */
3440 	if (allocated > map->m_len) {
3441 		unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
3442 					newblock + map->m_len,
3443 					allocated - map->m_len);
3444 		allocated = map->m_len;
3445 	}
3446 
3447 	/*
3448 	 * If we have done fallocate with an offset that is already
3449 	 * delayed allocated, we would have block reservation
3450 	 * and quota reservation done in the delayed write path.
3451 	 * But fallocate would have already updated the quota and block
3452 	 * count for this offset. So cancel these reservations.
3453 	 */
3454 	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
3455 		ext4_da_update_reserve_space(inode, allocated, 0);
3456 
3457 map_out:
3458 	map->m_flags |= EXT4_MAP_MAPPED;
3459 out1:
3460 	if (allocated > map->m_len)
3461 		allocated = map->m_len;
3462 	ext4_ext_show_leaf(inode, path);
3463 	map->m_pblk = newblock;
3464 	map->m_len = allocated;
3465 out2:
3466 	if (path) {
3467 		ext4_ext_drop_refs(path);
3468 		kfree(path);
3469 	}
3470 	return err ? err : allocated;
3471 }
3472 
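/*
 * Caller-side sketch (not built) of the return conventions documented
 * below for ext4_ext_map_blocks(): ret > 0 means that many blocks are
 * mapped at map.m_pblk, ret == 0 means a hole on a pure lookup, and
 * ret < 0 is an error.  The caller must hold i_data_sem as described
 * below; the function name here is invented for this example.
 */
#if 0
static int example_lookup_block(struct inode *inode, ext4_lblk_t lblk)
{
	struct ext4_map_blocks map = { .m_lblk = lblk, .m_len = 1 };
	int ret;

	/* flags == 0: pure lookup, no allocation; no handle needed */
	ret = ext4_ext_map_blocks(NULL, inode, &map, 0);
	if (ret > 0)
		ext_debug("block %u -> %llu (%d mapped)\n", lblk,
			  (unsigned long long)map.m_pblk, ret);
	else if (ret == 0)
		ext_debug("block %u is a hole\n", lblk);
	return ret < 0 ? ret : 0;
}
#endif
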
3473 /*
3474  * Block allocation/map/preallocation routine for extent-based files
3475  *
3476  *
3477  * Needs to be called with
3478  * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
3479  * (i.e., create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
3480  *
3481  * return > 0, number of blocks already mapped/allocated
3482  *          if create == 0 and these are pre-allocated blocks
3483  *          	buffer head is unmapped
3484  *          otherwise blocks are mapped
3485  *
3486  * return = 0, if plain lookup failed (blocks have not been allocated)
3487  *          buffer head is unmapped
3488  *
3489  * return < 0, error case.
3490  */
3491 int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3492 			struct ext4_map_blocks *map, int flags)
3493 {
3494 	struct ext4_ext_path *path = NULL;
3495 	struct ext4_extent newex, *ex;
3496 	ext4_fsblk_t newblock = 0;
3497 	int err = 0, depth, ret;
3498 	unsigned int allocated = 0;
3499 	struct ext4_allocation_request ar;
3500 	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3501 
3502 	ext_debug("blocks %u/%u requested for inode %lu\n",
3503 		  map->m_lblk, map->m_len, inode->i_ino);
3504 	trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
3505 
3506 	/* check in cache */
3507 	if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
3508 		if (!newex.ee_start_lo && !newex.ee_start_hi) {
3509 			if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3510 				/*
3511 				 * block isn't allocated yet and
3512 				 * user doesn't want to allocate it
3513 				 */
3514 				goto out2;
3515 			}
3516 			/* we should allocate requested block */
3517 		} else {
3518 			/* block is already allocated */
3519 			newblock = map->m_lblk
3520 				   - le32_to_cpu(newex.ee_block)
3521 				   + ext4_ext_pblock(&newex);
3522 			/* number of remaining blocks in the extent */
3523 			allocated = ext4_ext_get_actual_len(&newex) -
3524 				(map->m_lblk - le32_to_cpu(newex.ee_block));
3525 			goto out;
3526 		}
3527 	}
3528 
3529 	/* find extent for this block */
3530 	path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
3531 	if (IS_ERR(path)) {
3532 		err = PTR_ERR(path);
3533 		path = NULL;
3534 		goto out2;
3535 	}
3536 
3537 	depth = ext_depth(inode);
3538 
3539 	/*
3540 	 * a consistent leaf must not be empty;
3541 	 * this situation is possible, though, _during_ tree modification;
3542 	 * this is why the assert can't be put in ext4_ext_find_extent()
3543 	 */
3544 	if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
3545 		EXT4_ERROR_INODE(inode, "bad extent address "
3546 				 "lblock: %lu, depth: %d pblock %lld",
3547 				 (unsigned long) map->m_lblk, depth,
3548 				 path[depth].p_block);
3549 		err = -EIO;
3550 		goto out2;
3551 	}
3552 
3553 	ex = path[depth].p_ext;
3554 	if (ex) {
3555 		ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
3556 		ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
3557 		unsigned short ee_len;
3558 
3559 		/*
3560 		 * Uninitialized extents are treated as holes, except that
3561 		 * we split out initialized portions during a write.
3562 		 */
3563 		ee_len = ext4_ext_get_actual_len(ex);
3564 		/* if found extent covers block, simply return it */
3565 		if (in_range(map->m_lblk, ee_block, ee_len)) {
3566 			newblock = map->m_lblk - ee_block + ee_start;
3567 			/* number of remaining blocks in the extent */
3568 			allocated = ee_len - (map->m_lblk - ee_block);
3569 			ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
3570 				  ee_block, ee_len, newblock);
3571 
3572 			/* Do not put uninitialized extent in the cache */
3573 			if (!ext4_ext_is_uninitialized(ex)) {
3574 				ext4_ext_put_in_cache(inode, ee_block,
3575 							ee_len, ee_start);
3576 				goto out;
3577 			}
3578 			ret = ext4_ext_handle_uninitialized_extents(handle,
3579 					inode, map, path, flags, allocated,
3580 					newblock);
3581 			return ret;
3582 		}
3583 	}
3584 
3585 	/*
3586 	 * requested block isn't allocated yet;
3587 	 * we must not try to create blocks if the create flag is zero
3588 	 */
3589 	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3590 		/*
3591 		 * put the just-found gap into the cache to speed up
3592 		 * subsequent requests
3593 		 */
3594 		ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
3595 		goto out2;
3596 	}
3597 	/*
3598 	 * Okay, we need to do block allocation.
3599 	 */
3600 
3601 	/* find neighbour allocated blocks */
3602 	ar.lleft = map->m_lblk;
3603 	err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
3604 	if (err)
3605 		goto out2;
3606 	ar.lright = map->m_lblk;
3607 	err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
3608 	if (err)
3609 		goto out2;
3610 
3611 	/*
3612 	 * See if request is beyond maximum number of blocks we can have in
3613 	 * a single extent. For an initialized extent this limit is
3614 	 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
3615 	 * EXT_UNINIT_MAX_LEN.
3616 	 */
3617 	if (map->m_len > EXT_INIT_MAX_LEN &&
3618 	    !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3619 		map->m_len = EXT_INIT_MAX_LEN;
3620 	else if (map->m_len > EXT_UNINIT_MAX_LEN &&
3621 		 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3622 		map->m_len = EXT_UNINIT_MAX_LEN;
3623 
3624 	/* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
3625 	newex.ee_block = cpu_to_le32(map->m_lblk);
3626 	newex.ee_len = cpu_to_le16(map->m_len);
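	/*
	 * A nonzero return from ext4_ext_check_overlap() means the range
	 * collides with an existing extent; newex.ee_len is then trimmed
	 * to the part that still fits in front of it.
	 */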
3627 	err = ext4_ext_check_overlap(inode, &newex, path);
3628 	if (err)
3629 		allocated = ext4_ext_get_actual_len(&newex);
3630 	else
3631 		allocated = map->m_len;
3632 
3633 	/* allocate new block */
3634 	ar.inode = inode;
3635 	ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
3636 	ar.logical = map->m_lblk;
3637 	ar.len = allocated;
3638 	if (S_ISREG(inode->i_mode))
3639 		ar.flags = EXT4_MB_HINT_DATA;
3640 	else
3641 		/* disable in-core preallocation for non-regular files */
3642 		ar.flags = 0;
3643 	newblock = ext4_mb_new_blocks(handle, &ar, &err);
3644 	if (!newblock)
3645 		goto out2;
3646 	ext_debug("allocate new block: goal %llu, found %llu/%u\n",
3647 		  ar.goal, newblock, allocated);
3648 
3649 	/* try to insert new extent into found leaf and return */
3650 	ext4_ext_store_pblock(&newex, newblock);
3651 	newex.ee_len = cpu_to_le16(ar.len);
3652 	/* Mark uninitialized */
3653 	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) {
3654 		ext4_ext_mark_uninitialized(&newex);
3655 		/*
3656 		 * An io_end structure is created for every I/O write to an
3657 		 * uninitialized extent. To avoid unnecessary conversions,
3658 		 * we flag only the I/O that really needs the conversion.
3659 		 * For the non-async direct I/O case, flag the inode state
3660 		 * so that the conversion is performed when I/O completes.
3661 		 */
3662 		if (flags & EXT4_GET_BLOCKS_PRE_IO) {
3663 			if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
3664 				io->flag = EXT4_IO_END_UNWRITTEN;
3665 				atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
3666 			} else
3667 				ext4_set_inode_state(inode,
3668 						     EXT4_STATE_DIO_UNWRITTEN);
3669 		}
3670 		if (ext4_should_dioread_nolock(inode))
3671 			map->m_flags |= EXT4_MAP_UNINIT;
3672 	}
3673 
3674 	err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len);
3675 	if (err)
3676 		goto out2;
3677 
3678 	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3679 	if (err) {
3680 		/* free the data blocks we just allocated */
3681 		/* not a good idea to call discard here directly,
3682 		 * but otherwise we'd need to call it on every free() */
3683 		ext4_discard_preallocations(inode);
3684 		ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex),
3685 				 ext4_ext_get_actual_len(&newex), 0);
3686 		goto out2;
3687 	}
3688 
3689 	/* the insert routine may have changed our extent; re-read its start and length */
3690 	newblock = ext4_ext_pblock(&newex);
3691 	allocated = ext4_ext_get_actual_len(&newex);
3692 	if (allocated > map->m_len)
3693 		allocated = map->m_len;
3694 	map->m_flags |= EXT4_MAP_NEW;
3695 
3696 	/*
3697 	 * Update reserved blocks/metadata blocks after successful
3698 	 * block allocation which had been deferred till now.
3699 	 */
3700 	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
3701 		ext4_da_update_reserve_space(inode, allocated, 1);
3702 
3703 	/*
3704 	 * Cache the extent and update transaction to commit on fdatasync only
3705 	 * when it is _not_ an uninitialized extent.
3706 	 */
3707 	if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
3708 		ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock);
3709 		ext4_update_inode_fsync_trans(handle, inode, 1);
3710 	} else
3711 		ext4_update_inode_fsync_trans(handle, inode, 0);
3712 out:
3713 	if (allocated > map->m_len)
3714 		allocated = map->m_len;
3715 	ext4_ext_show_leaf(inode, path);
3716 	map->m_flags |= EXT4_MAP_MAPPED;
3717 	map->m_pblk = newblock;
3718 	map->m_len = allocated;
3719 out2:
3720 	if (path) {
3721 		ext4_ext_drop_refs(path);
3722 		kfree(path);
3723 	}
3724 	trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
3725 		newblock, map->m_len, err ? err : allocated);
3726 	return err ? err : allocated;
3727 }
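
/*
 * Illustration (not part of this file): how a caller typically consumes
 * the return convention above -- ret > 0 is the number of blocks mapped
 * (map->m_pblk, m_len and m_flags are filled in), ret == 0 means a hole
 * when CREATE is not set, and ret < 0 is an error.  A minimal sketch;
 * the helper name walk_blocks is hypothetical.
 */
#if 0
static int walk_blocks(handle_t *handle, struct inode *inode,
		       ext4_lblk_t lblk, unsigned int len)
{
	struct ext4_map_blocks map;
	int ret;

	while (len > 0) {
		map.m_lblk = lblk;
		map.m_len = len;
		ret = ext4_map_blocks(handle, inode, &map, 0);
		if (ret < 0)
			return ret;		/* I/O or corruption error */
		if (ret == 0) {
			/* hole: step over one block and keep going */
			lblk++;
			len--;
			continue;
		}
		/* blocks lblk..lblk+ret-1 start at physical map.m_pblk */
		lblk += ret;
		len -= ret;
	}
	return 0;
}
#endif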
3728 
3729 void ext4_ext_truncate(struct inode *inode)
3730 {
3731 	struct address_space *mapping = inode->i_mapping;
3732 	struct super_block *sb = inode->i_sb;
3733 	ext4_lblk_t last_block;
3734 	handle_t *handle;
3735 	int err = 0;
3736 
3737 	/*
3738 	 * finish any pending end_io work so we won't run the risk of
3739 	 * converting any truncated blocks to initialized later
3740 	 */
3741 	ext4_flush_completed_IO(inode);
3742 
3743 	/*
3744 	 * probably the first extent we are going to free will be the last one in its block
3745 	 */
3746 	err = ext4_writepage_trans_blocks(inode);
3747 	handle = ext4_journal_start(inode, err);
3748 	if (IS_ERR(handle))
3749 		return;
3750 
3751 	if (inode->i_size & (sb->s_blocksize - 1))
3752 		ext4_block_truncate_page(handle, mapping, inode->i_size);
3753 
3754 	if (ext4_orphan_add(handle, inode))
3755 		goto out_stop;
3756 
3757 	down_write(&EXT4_I(inode)->i_data_sem);
3758 	ext4_ext_invalidate_cache(inode);
3759 
3760 	ext4_discard_preallocations(inode);
3761 
3762 	/*
3763 	 * TODO: optimization is possible here.
3764 	 * Probably we need not scan at all,
3765 	 * because page truncation is enough.
3766 	 */
3767 
3768 	/* we have to know where to truncate from if we crash */
3769 	EXT4_I(inode)->i_disksize = inode->i_size;
3770 	ext4_mark_inode_dirty(handle, inode);
3771 
3772 	last_block = (inode->i_size + sb->s_blocksize - 1)
3773 			>> EXT4_BLOCK_SIZE_BITS(sb);
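	/*
	 * The line above rounds i_size up to a block boundary.  Example
	 * (values assumed): with a 4096-byte block size and i_size = 10000,
	 * last_block = (10000 + 4095) >> 12 = 3, so the removal below starts
	 * at logical block 3; bytes 0..9999 live in blocks 0-2 and are kept.
	 */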
3774 	err = ext4_ext_remove_space(inode, last_block);
3775 
3776 	/* In a multi-transaction truncate, we only make the final
3777 	 * transaction synchronous.
3778 	 */
3779 	if (IS_SYNC(inode))
3780 		ext4_handle_sync(handle);
3781 
3782 out_stop:
3783 	up_write(&EXT4_I(inode)->i_data_sem);
3784 	/*
3785 	 * If this was a simple ftruncate() and the file will remain alive,
3786 	 * then we need to clear up the orphan record which we created above.
3787 	 * However, if this was a real unlink then we were called by
3788 	 * ext4_delete_inode(), and we allow that function to clean up the
3789 	 * orphan info for us.
3790 	 */
3791 	if (inode->i_nlink)
3792 		ext4_orphan_del(handle, inode);
3793 
3794 	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
3795 	ext4_mark_inode_dirty(handle, inode);
3796 	ext4_journal_stop(handle);
3797 }
3798 
3799 static void ext4_falloc_update_inode(struct inode *inode,
3800 				int mode, loff_t new_size, int update_ctime)
3801 {
3802 	struct timespec now;
3803 
3804 	if (update_ctime) {
3805 		now = current_fs_time(inode->i_sb);
3806 		if (!timespec_equal(&inode->i_ctime, &now))
3807 			inode->i_ctime = now;
3808 	}
3809 	/*
3810 	 * Update only when preallocation was requested beyond
3811 	 * the file size.
3812 	 */
3813 	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
3814 		if (new_size > i_size_read(inode))
3815 			i_size_write(inode, new_size);
3816 		if (new_size > EXT4_I(inode)->i_disksize)
3817 			ext4_update_i_disksize(inode, new_size);
3818 	} else {
3819 		/*
3820 		 * Mark that we allocate beyond EOF so the subsequent truncate
3821 		 * can proceed even if the new size is the same as i_size.
3822 		 */
3823 		if (new_size > i_size_read(inode))
3824 			ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3825 	}
3826 
3827 }
3828 
3829 /*
3830  * preallocate space for a file. This implements ext4's fallocate file
3831  * operation, which gets called from the sys_fallocate system call.
3832  * For block-mapped files, posix_fallocate should fall back to the method
3833  * of writing zeroes to the required new blocks (the same behavior that is
3834  * expected of file systems that do not support the fallocate() system call).
3835  */
3836 long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
3837 {
3838 	struct inode *inode = file->f_path.dentry->d_inode;
3839 	handle_t *handle;
3840 	loff_t new_size;
3841 	unsigned int max_blocks;
3842 	int ret = 0;
3843 	int ret2 = 0;
3844 	int retries = 0;
3845 	struct ext4_map_blocks map;
3846 	unsigned int credits, blkbits = inode->i_blkbits;
3847 
3848 	/* We only support the FALLOC_FL_KEEP_SIZE mode */
3849 	if (mode & ~FALLOC_FL_KEEP_SIZE)
3850 		return -EOPNOTSUPP;
3851 
3852 	/*
3853 	 * currently supporting (pre)allocate mode for extent-based
3854 	 * files _only_
3855 	 */
3856 	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3857 		return -EOPNOTSUPP;
3858 
3859 	trace_ext4_fallocate_enter(inode, offset, len, mode);
3860 	map.m_lblk = offset >> blkbits;
3861 	/*
3862 	 * We can't just convert len to max_blocks: with blocksize = 4096,
3863 	 * offset = 3072 and len = 2048, the range spans 2 blocks but len >> blkbits = 0.
3864 	 */
3865 	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
3866 		- map.m_lblk;
3867 	/*
3868 	 * credits to insert 1 extent into extent tree
3869 	 */
3870 	credits = ext4_chunk_trans_blocks(inode, max_blocks);
3871 	mutex_lock(&inode->i_mutex);
3872 	ret = inode_newsize_ok(inode, (len + offset));
3873 	if (ret) {
3874 		mutex_unlock(&inode->i_mutex);
3875 		trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
3876 		return ret;
3877 	}
3878 retry:
3879 	while (ret >= 0 && ret < max_blocks) {
3880 		map.m_lblk = map.m_lblk + ret;
3881 		map.m_len = max_blocks = max_blocks - ret;
3882 		handle = ext4_journal_start(inode, credits);
3883 		if (IS_ERR(handle)) {
3884 			ret = PTR_ERR(handle);
3885 			break;
3886 		}
3887 		ret = ext4_map_blocks(handle, inode, &map,
3888 				      EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
3889 		if (ret <= 0) {
3890 #ifdef EXT4FS_DEBUG
3891 			WARN_ON(ret <= 0);
3892 			printk(KERN_ERR "%s: ext4_ext_map_blocks "
3893 				    "returned error inode#%lu, block=%u, "
3894 				    "max_blocks=%u\n", __func__,
3895 				    inode->i_ino, map.m_lblk, max_blocks);
3896 #endif
3897 			ext4_mark_inode_dirty(handle, inode);
3898 			ret2 = ext4_journal_stop(handle);
3899 			break;
3900 		}
3901 		if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
3902 						blkbits) >> blkbits))
3903 			new_size = offset + len;
3904 		else
3905 			new_size = ((loff_t)map.m_lblk + ret) << blkbits;
3906 
3907 		ext4_falloc_update_inode(inode, mode, new_size,
3908 					 (map.m_flags & EXT4_MAP_NEW));
3909 		ext4_mark_inode_dirty(handle, inode);
3910 		ret2 = ext4_journal_stop(handle);
3911 		if (ret2)
3912 			break;
3913 	}
3914 	if (ret == -ENOSPC &&
3915 			ext4_should_retry_alloc(inode->i_sb, &retries)) {
3916 		ret = 0;
3917 		goto retry;
3918 	}
3919 	mutex_unlock(&inode->i_mutex);
3920 	trace_ext4_fallocate_exit(inode, offset, max_blocks,
3921 				ret > 0 ? ret2 : ret);
3922 	return ret > 0 ? ret2 : ret;
3923 }
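
/*
 * Illustration (userspace, not part of this file): exercising the
 * fallocate path above.  A minimal sketch assuming a file on an ext4
 * filesystem; error handling is abbreviated.
 */
#if 0
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_CREAT | O_RDWR, 0644);

	if (fd < 0)
		return 1;
	/* preallocate 1 MiB at offset 0; i_size grows to 1 MiB */
	if (fallocate(fd, 0, 0, 1 << 20) != 0)
		perror("fallocate");
	/* preallocate a further 1 MiB past EOF without changing i_size */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 1 << 20, 1 << 20) != 0)
		perror("fallocate keep-size");
	close(fd);
	return 0;
}
#endif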
3924 
3925 /*
3926  * This function converts a range of blocks to written extents.
3927  * The caller passes the start offset and the size; all unwritten
3928  * extents within this range will be converted to written
3929  * extents.
3930  *
3931  * This function is called from the direct I/O end_io callback
3932  * function, to convert the fallocated extents after I/O is completed.
3933  * Returns 0 on success.
3934  */
3935 int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3936 				    ssize_t len)
3937 {
3938 	handle_t *handle;
3939 	unsigned int max_blocks;
3940 	int ret = 0;
3941 	int ret2 = 0;
3942 	struct ext4_map_blocks map;
3943 	unsigned int credits, blkbits = inode->i_blkbits;
3944 
3945 	map.m_lblk = offset >> blkbits;
3946 	/*
3947 	 * We can't just convert len to max_blocks: with blocksize = 4096,
3948 	 * offset = 3072 and len = 2048, the range spans 2 blocks but len >> blkbits = 0.
3949 	 */
3950 	max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
3951 		      map.m_lblk);
3952 	/*
3953 	 * credits to insert 1 extent into extent tree
3954 	 */
3955 	credits = ext4_chunk_trans_blocks(inode, max_blocks);
3956 	while (ret >= 0 && ret < max_blocks) {
3957 		map.m_lblk += ret;
3958 		map.m_len = (max_blocks -= ret);
3959 		handle = ext4_journal_start(inode, credits);
3960 		if (IS_ERR(handle)) {
3961 			ret = PTR_ERR(handle);
3962 			break;
3963 		}
3964 		ret = ext4_map_blocks(handle, inode, &map,
3965 				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
3966 		if (ret <= 0) {
3967 			WARN_ON(ret <= 0);
3968 			printk(KERN_ERR "%s: ext4_ext_map_blocks "
3969 				    "returned error inode#%lu, block=%u, "
3970 				    "max_blocks=%u\n", __func__,
3971 				    inode->i_ino, map.m_lblk, map.m_len);
3972 		}
3973 		ext4_mark_inode_dirty(handle, inode);
3974 		ret2 = ext4_journal_stop(handle);
3975 		if (ret <= 0 || ret2)
3976 			break;
3977 	}
3978 	return ret > 0 ? ret2 : ret;
3979 }
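
/*
 * Illustration (not part of this file): the direct I/O completion path
 * calls this function with the byte range the write just filled, e.g.
 *
 *	err = ext4_convert_unwritten_extents(inode, io_end->offset,
 *					     io_end->size);
 *
 * where offset and size are the fields of the ext4_io_end_t describing
 * the completed I/O.
 */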
3980 
3981 /*
3982  * Callback function called for each extent to gather FIEMAP information.
3983  */
3984 static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3985 		       struct ext4_ext_cache *newex, struct ext4_extent *ex,
3986 		       void *data)
3987 {
3988 	__u64	logical;
3989 	__u64	physical;
3990 	__u64	length;
3991 	loff_t	size;
3992 	__u32	flags = 0;
3993 	int		ret = 0;
3994 	struct fiemap_extent_info *fieinfo = data;
3995 	unsigned char blksize_bits;
3996 
3997 	blksize_bits = inode->i_sb->s_blocksize_bits;
3998 	logical = (__u64)newex->ec_block << blksize_bits;
3999 
4000 	if (newex->ec_start == 0) {
4001 		/*
4002 		 * No extent in the extent tree contains block @newex->ec_block,
4003 		 * so the block may lie in 1) a hole or 2) a delayed extent.
4004 		 *
4005 		 * Holes and delayed extents are processed as follows.
4006 		 * 1. look up dirty pages in the specified pagecache range.
4007 		 *    If no page is found, there is no delayed extent;
4008 		 *    return EXT_CONTINUE.
4009 		 * 2. find the first mapped buffer,
4010 		 * 3. check whether the mapped buffer is both in the request
4011 		 *    range and a delayed buffer. If not, there is no delayed
4012 		 *    extent; return.
4013 		 * 4. a delayed extent is found; collect it.
4014 		 */
4015 		ext4_lblk_t	end = 0;
4016 		pgoff_t		last_offset;
4017 		pgoff_t		offset;
4018 		pgoff_t		index;
4019 		struct page	**pages = NULL;
4020 		struct buffer_head *bh = NULL;
4021 		struct buffer_head *head = NULL;
4022 		unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *);
4023 
4024 		pages = kmalloc(PAGE_SIZE, GFP_KERNEL);
4025 		if (pages == NULL)
4026 			return -ENOMEM;
4027 
4028 		offset = logical >> PAGE_SHIFT;
4029 repeat:
4030 		last_offset = offset;
4031 		head = NULL;
4032 		ret = find_get_pages_tag(inode->i_mapping, &offset,
4033 					PAGECACHE_TAG_DIRTY, nr_pages, pages);
4034 
4035 		if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
4036 			/* First time, try to find a mapped buffer. */
4037 			if (ret == 0) {
4038 out:
4039 				for (index = 0; index < ret; index++)
4040 					page_cache_release(pages[index]);
4041 				/* just a hole. */
4042 				kfree(pages);
4043 				return EXT_CONTINUE;
4044 			}
4045 
4046 			/* Try to find the 1st mapped buffer. */
4047 			end = ((__u64)pages[0]->index << PAGE_SHIFT) >>
4048 				  blksize_bits;
4049 			if (!page_has_buffers(pages[0]))
4050 				goto out;
4051 			head = page_buffers(pages[0]);
4052 			if (!head)
4053 				goto out;
4054 
4055 			bh = head;
4056 			do {
4057 				if (buffer_mapped(bh)) {
4058 					/* get the 1st mapped buffer. */
4059 					if (end > newex->ec_block +
4060 						newex->ec_len)
4061 						/* The buffer is out of
4062 						 * the request range.
4063 						 */
4064 						goto out;
4065 					goto found_mapped_buffer;
4066 				}
4067 				bh = bh->b_this_page;
4068 				end++;
4069 			} while (bh != head);
4070 
4071 			/* No mapped buffer found. */
4072 			goto out;
4073 		} else {
4074 			/* Find contiguous delayed buffers. */
4075 			if (ret > 0 && pages[0]->index == last_offset)
4076 				head = page_buffers(pages[0]);
4077 			bh = head;
4078 		}
4079 
4080 found_mapped_buffer:
4081 		if (bh != NULL && buffer_delay(bh)) {
4082 			/* 1st or contiguous delayed buffer found. */
4083 			if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
4084 				/*
4085 				 * 1st delayed buffer found, record
4086 				 * the start of extent.
4087 				 */
4088 				flags |= FIEMAP_EXTENT_DELALLOC;
4089 				newex->ec_block = end;
4090 				logical = (__u64)end << blksize_bits;
4091 			}
4092 			/* Find contiguous delayed buffers. */
4093 			do {
4094 				if (!buffer_delay(bh))
4095 					goto found_delayed_extent;
4096 				bh = bh->b_this_page;
4097 				end++;
4098 			} while (bh != head);
4099 
4100 			for (index = 1; index < ret; index++) {
4101 				if (!page_has_buffers(pages[index])) {
4102 					bh = NULL;
4103 					break;
4104 				}
4105 				head = page_buffers(pages[index]);
4106 				if (!head) {
4107 					bh = NULL;
4108 					break;
4109 				}
4110 				if (pages[index]->index !=
4111 					pages[0]->index + index) {
4112 					/* Blocks are not contiguous. */
4113 					bh = NULL;
4114 					break;
4115 				}
4116 				bh = head;
4117 				do {
4118 					if (!buffer_delay(bh))
4119 						/* Delayed-extent ends. */
4120 						goto found_delayed_extent;
4121 					bh = bh->b_this_page;
4122 					end++;
4123 				} while (bh != head);
4124 			}
4125 		} else if (!(flags & FIEMAP_EXTENT_DELALLOC))
4126 			/* a hole found. */
4127 			goto out;
4128 
4129 found_delayed_extent:
4130 		newex->ec_len = min(end - newex->ec_block,
4131 						(ext4_lblk_t)EXT_INIT_MAX_LEN);
4132 		if (ret == nr_pages && bh != NULL &&
4133 			newex->ec_len < EXT_INIT_MAX_LEN &&
4134 			buffer_delay(bh)) {
4135 			/* The extent may continue beyond this batch of pages; keep scanning. */
4136 			for (index = 0; index < ret; index++)
4137 				page_cache_release(pages[index]);
4138 			goto repeat;
4139 		}
4140 
4141 		for (index = 0; index < ret; index++)
4142 			page_cache_release(pages[index]);
4143 		kfree(pages);
4144 	}
4145 
4146 	physical = (__u64)newex->ec_start << blksize_bits;
4147 	length =   (__u64)newex->ec_len << blksize_bits;
4148 
4149 	if (ex && ext4_ext_is_uninitialized(ex))
4150 		flags |= FIEMAP_EXTENT_UNWRITTEN;
4151 
4152 	size = i_size_read(inode);
4153 	if (logical + length >= size)
4154 		flags |= FIEMAP_EXTENT_LAST;
4155 
4156 	ret = fiemap_fill_next_extent(fieinfo, logical, physical,
4157 					length, flags);
4158 	if (ret < 0)
4159 		return ret;
4160 	if (ret == 1)
4161 		return EXT_BREAK;
4162 	return EXT_CONTINUE;
4163 }
4164 
4165 /* FIEMAP flags we can handle are specified here */
4166 #define EXT4_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
4167 
4168 static int ext4_xattr_fiemap(struct inode *inode,
4169 				struct fiemap_extent_info *fieinfo)
4170 {
4171 	__u64 physical = 0;
4172 	__u64 length;
4173 	__u32 flags = FIEMAP_EXTENT_LAST;
4174 	int blockbits = inode->i_sb->s_blocksize_bits;
4175 	int error = 0;
4176 
4177 	/* in-inode? */
4178 	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
4179 		struct ext4_iloc iloc;
4180 		int offset;	/* offset of xattr in inode */
4181 
4182 		error = ext4_get_inode_loc(inode, &iloc);
4183 		if (error)
4184 			return error;
4185 		physical = iloc.bh->b_blocknr << blockbits;
4186 		offset = EXT4_GOOD_OLD_INODE_SIZE +
4187 				EXT4_I(inode)->i_extra_isize;
4188 		physical += offset;
4189 		length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
4190 		flags |= FIEMAP_EXTENT_DATA_INLINE;
4191 		brelse(iloc.bh);
4192 	} else { /* external block */
4193 		physical = EXT4_I(inode)->i_file_acl << blockbits;
4194 		length = inode->i_sb->s_blocksize;
4195 	}
4196 
4197 	if (physical)
4198 		error = fiemap_fill_next_extent(fieinfo, 0, physical,
4199 						length, flags);
4200 	return (error < 0 ? error : 0);
4201 }
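
/*
 * Worked example for the in-inode branch above (inode size assumed):
 * with 256-byte on-disk inodes, EXT4_GOOD_OLD_INODE_SIZE = 128 and
 * i_extra_isize = 32, the xattr area starts at byte offset
 * 128 + 32 = 160 of the inode, so length = 256 - 160 = 96 bytes.
 */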
4202 
4203 int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4204 		__u64 start, __u64 len)
4205 {
4206 	ext4_lblk_t start_blk;
4207 	int error = 0;
4208 
4209 	/* fall back to the generic path if the inode is not in extents format */
4210 	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
4211 		return generic_block_fiemap(inode, fieinfo, start, len,
4212 			ext4_get_block);
4213 
4214 	if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
4215 		return -EBADR;
4216 
4217 	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
4218 		error = ext4_xattr_fiemap(inode, fieinfo);
4219 	} else {
4220 		ext4_lblk_t len_blks;
4221 		__u64 last_blk;
4222 
4223 		start_blk = start >> inode->i_sb->s_blocksize_bits;
4224 		last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
4225 		if (last_blk >= EXT_MAX_BLOCK)
4226 			last_blk = EXT_MAX_BLOCK-1;
4227 		len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
4228 
4229 		/*
4230 		 * Walk the extent tree gathering extent information.
4231 		 * ext4_ext_fiemap_cb will push extents back to user.
4232 		 */
4233 		error = ext4_ext_walk_space(inode, start_blk, len_blks,
4234 					  ext4_ext_fiemap_cb, fieinfo);
4235 	}
4236 
4237 	return error;
4238 }
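
/*
 * Illustration (userspace, not part of this file): querying the mapping
 * via the FIEMAP ioctl that ends up in ext4_fiemap() above.  A minimal
 * sketch; error handling is abbreviated.
 */
#if 0
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
	struct fiemap *fm;
	unsigned int i;
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;
	/* room for the header plus 32 extent records */
	fm = calloc(1, sizeof(*fm) + 32 * sizeof(struct fiemap_extent));
	if (!fm)
		return 1;
	fm->fm_start = 0;
	fm->fm_length = ~0ULL;		/* whole file */
	fm->fm_flags = FIEMAP_FLAG_SYNC;
	fm->fm_extent_count = 32;
	if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0)
		for (i = 0; i < fm->fm_mapped_extents; i++)
			printf("logical %llu physical %llu len %llu flags 0x%x\n",
			       (unsigned long long)fm->fm_extents[i].fe_logical,
			       (unsigned long long)fm->fm_extents[i].fe_physical,
			       (unsigned long long)fm->fm_extents[i].fe_length,
			       fm->fm_extents[i].fe_flags);
	free(fm);
	close(fd);
	return 0;
}
#endif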
4239 
4240