xref: /openbmc/linux/fs/gfs2/bmap.c (revision 5927145e)
1 /*
2  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
3  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
4  *
5  * This copyrighted material is made available to anyone wishing to use,
6  * modify, copy, or redistribute it subject to the terms and conditions
7  * of the GNU General Public License version 2.
8  */
9 
10 #include <linux/spinlock.h>
11 #include <linux/completion.h>
12 #include <linux/buffer_head.h>
13 #include <linux/blkdev.h>
14 #include <linux/gfs2_ondisk.h>
15 #include <linux/crc32.h>
16 #include <linux/iomap.h>
17 
18 #include "gfs2.h"
19 #include "incore.h"
20 #include "bmap.h"
21 #include "glock.h"
22 #include "inode.h"
23 #include "meta_io.h"
24 #include "quota.h"
25 #include "rgrp.h"
26 #include "log.h"
27 #include "super.h"
28 #include "trans.h"
29 #include "dir.h"
30 #include "util.h"
31 #include "trace_gfs2.h"
32 
33 /* This doesn't need to be that large as max 64 bit pointers in a 4k
34  * block is 512, so __u16 is fine for that. It saves stack space to
35  * keep it small.
36  */
/* Describes one path from the dinode down through the indirect blocks
 * to a particular data block. */
struct metapath {
	struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT]; /* buffer at each level; [0] is the dinode */
	__u16 mp_list[GFS2_MAX_META_HEIGHT]; /* pointer index within each level's block */
	int mp_fheight; /* find_metapath height */
	int mp_aheight; /* actual height (lookup height) */
};
43 
44 /**
45  * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
46  * @ip: the inode
47  * @dibh: the dinode buffer
48  * @block: the block number that was allocated
49  * @page: The (optional) page. This is looked up if @page is NULL
50  *
51  * Returns: errno
52  */
53 
static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
			       u64 block, struct page *page)
{
	struct inode *inode = &ip->i_inode;
	struct buffer_head *bh;
	int release = 0;	/* true if we looked the page up ourselves */

	/* Use page 0 of the mapping: look it up if the caller passed no
	   page, or passed a page with a non-zero index. */
	if (!page || page->index) {
		page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
		if (!page)
			return -ENOMEM;
		release = 1;
	}

	if (!PageUptodate(page)) {
		void *kaddr = kmap(page);
		u64 dsize = i_size_read(inode);

		/* Inline data cannot exceed the space left in the dinode
		   block after its header; clamp in case i_size says more. */
		if (dsize > gfs2_max_stuffed_size(ip))
			dsize = gfs2_max_stuffed_size(ip);

		/* Copy the inline data out of the dinode buffer and zero
		   the rest of the page. */
		memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
		memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
		kunmap(page);

		SetPageUptodate(page);
	}

	if (!page_has_buffers(page))
		create_empty_buffers(page, BIT(inode->i_blkbits),
				     BIT(BH_Uptodate));

	bh = page_buffers(page);

	/* Point the first buffer at the newly allocated disk block. */
	if (!buffer_mapped(bh))
		map_bh(bh, inode->i_sb, block);

	set_buffer_uptodate(bh);
	if (!gfs2_is_jdata(ip))
		mark_buffer_dirty(bh);
	if (!gfs2_is_writeback(ip))
		gfs2_trans_add_data(ip->i_gl, bh);

	/* Only drop the lock/reference if we took them above. */
	if (release) {
		unlock_page(page);
		put_page(page);
	}

	return 0;
}
104 
105 /**
106  * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
107  * @ip: The GFS2 inode to unstuff
108  * @page: The (optional) page. This is looked up if the @page is NULL
109  *
110  * This routine unstuffs a dinode and returns it to a "normal" state such
111  * that the height can be grown in the traditional way.
112  *
113  * Returns: errno
114  */
115 
int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
{
	struct buffer_head *bh, *dibh;
	struct gfs2_dinode *di;
	u64 block = 0;
	int isdir = gfs2_is_dir(ip);
	int error;

	down_write(&ip->i_rw_mutex);

	error = gfs2_meta_inode_buffer(ip, &dibh);
	if (error)
		goto out;

	if (i_size_read(&ip->i_inode)) {
		/* Get a free block, fill it with the stuffed data,
		   and write it out to disk */

		unsigned int n = 1;
		error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
		if (error)
			goto out_brelse;
		if (isdir) {
			/* Directory data carries a metadata header, so it
			   goes through a metadata buffer rather than the
			   page cache. */
			gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1);
			error = gfs2_dir_get_new_buffer(ip, block, &bh);
			if (error)
				goto out_brelse;
			gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
					      dibh, sizeof(struct gfs2_dinode));
			brelse(bh);
		} else {
			error = gfs2_unstuffer_page(ip, dibh, block, page);
			if (error)
				goto out_brelse;
		}
	}

	/*  Set up the pointer to the new block  */

	gfs2_trans_add_meta(ip->i_gl, dibh);
	di = (struct gfs2_dinode *)dibh->b_data;
	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));

	if (i_size_read(&ip->i_inode)) {
		/* The first (and only) block pointer sits directly after
		   the on-disk dinode header. */
		*(__be64 *)(di + 1) = cpu_to_be64(block);
		gfs2_add_inode_blocks(&ip->i_inode, 1);
		di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
	}

	/* The inode now has a one-level metadata tree. */
	ip->i_height = 1;
	di->di_height = cpu_to_be16(1);

out_brelse:
	brelse(dibh);
out:
	up_write(&ip->i_rw_mutex);
	return error;
}
174 
175 
176 /**
177  * find_metapath - Find path through the metadata tree
178  * @sdp: The superblock
179  * @mp: The metapath to return the result in
180  * @block: The disk block to look up
181  * @height: The pre-calculated height of the metadata tree
182  *
183  *   This routine returns a struct metapath structure that defines a path
184  *   through the metadata of inode "ip" to get to block "block".
185  *
186  *   Example:
187  *   Given:  "ip" is a height 3 file, "offset" is 101342453, and this is a
188  *   filesystem with a blocksize of 4096.
189  *
190  *   find_metapath() would return a struct metapath structure set to:
 *   mp_fheight = 3, mp_list[0] = 0, mp_list[1] = 48,
 *   and mp_list[2] = 165.
193  *
194  *   That means that in order to get to the block containing the byte at
195  *   offset 101342453, we would load the indirect block pointed to by pointer
196  *   0 in the dinode.  We would then load the indirect block pointed to by
197  *   pointer 48 in that indirect block.  We would then load the data block
198  *   pointed to by pointer 165 in that indirect block.
199  *
200  *             ----------------------------------------
201  *             | Dinode |                             |
202  *             |        |                            4|
203  *             |        |0 1 2 3 4 5                 9|
204  *             |        |                            6|
205  *             ----------------------------------------
206  *                       |
207  *                       |
208  *                       V
209  *             ----------------------------------------
210  *             | Indirect Block                       |
211  *             |                                     5|
212  *             |            4 4 4 4 4 5 5            1|
213  *             |0           5 6 7 8 9 0 1            2|
214  *             ----------------------------------------
215  *                                |
216  *                                |
217  *                                V
218  *             ----------------------------------------
219  *             | Indirect Block                       |
220  *             |                         1 1 1 1 1   5|
221  *             |                         6 6 6 6 6   1|
222  *             |0                        3 4 5 6 7   2|
223  *             ----------------------------------------
224  *                                           |
225  *                                           |
226  *                                           V
227  *             ----------------------------------------
228  *             | Data block containing offset         |
229  *             |            101342453                 |
230  *             |                                      |
231  *             |                                      |
232  *             ----------------------------------------
233  *
234  */
235 
236 static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
237 			  struct metapath *mp, unsigned int height)
238 {
239 	unsigned int i;
240 
241 	mp->mp_fheight = height;
242 	for (i = height; i--;)
243 		mp->mp_list[i] = do_div(block, sdp->sd_inptrs);
244 }
245 
246 static inline unsigned int metapath_branch_start(const struct metapath *mp)
247 {
248 	if (mp->mp_list[0] == 0)
249 		return 2;
250 	return 1;
251 }
252 
253 /**
254  * metaptr1 - Return the first possible metadata pointer in a metapath buffer
255  * @height: The metadata height (0 = dinode)
256  * @mp: The metapath
257  */
258 static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp)
259 {
260 	struct buffer_head *bh = mp->mp_bh[height];
261 	if (height == 0)
262 		return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)));
263 	return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header)));
264 }
265 
266 /**
267  * metapointer - Return pointer to start of metadata in a buffer
268  * @height: The metadata height (0 = dinode)
269  * @mp: The metapath
270  *
271  * Return a pointer to the block number of the next height of the metadata
272  * tree given a buffer containing the pointer to the current height of the
273  * metadata tree.
274  */
275 
276 static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
277 {
278 	__be64 *p = metaptr1(height, mp);
279 	return p + mp->mp_list[height];
280 }
281 
/*
 * gfs2_metapath_ra - issue readahead for a range of indirect pointers
 * @gl: The inode's glock (used to look up the buffers)
 * @start: First pointer in the range
 * @end: One past the last pointer in the range
 *
 * Best effort only: buffers whose lock cannot be taken without blocking
 * are skipped.
 */
static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end)
{
	const __be64 *t;

	for (t = start; t < end; t++) {
		struct buffer_head *rabh;

		/* Zero pointer means unallocated - nothing to read. */
		if (!*t)
			continue;

		rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE);
		if (trylock_buffer(rabh)) {
			if (!buffer_uptodate(rabh)) {
				/* end_buffer_read_sync() unlocks and drops
				   our reference on completion, hence no
				   brelse on this path. */
				rabh->b_end_io = end_buffer_read_sync;
				submit_bh(REQ_OP_READ,
					  REQ_RAHEAD | REQ_META | REQ_PRIO,
					  rabh);
				continue;
			}
			unlock_buffer(rabh);
		}
		brelse(rabh);
	}
}
306 
/*
 * __fillup_metapath - read indirect buffers for heights (x, h]
 * @ip: The inode
 * @mp: The metapath; mp_bh[x] must already be populated
 * @x: The height to start from
 * @h: The height to stop at
 *
 * Stops early at the first unallocated (zero) pointer; mp_aheight
 * records how far down the tree the walk actually got.
 *
 * Returns: errno
 */
static int __fillup_metapath(struct gfs2_inode *ip, struct metapath *mp,
			     unsigned int x, unsigned int h)
{
	for (; x < h; x++) {
		__be64 *ptr = metapointer(x, mp);
		u64 dblock = be64_to_cpu(*ptr);
		int ret;

		/* Zero pointer: tree is unallocated below this point. */
		if (!dblock)
			break;
		ret = gfs2_meta_indirect_buffer(ip, x + 1, dblock, &mp->mp_bh[x + 1]);
		if (ret)
			return ret;
	}
	mp->mp_aheight = x + 1;
	return 0;
}
324 
325 /**
326  * lookup_metapath - Walk the metadata tree to a specific point
327  * @ip: The inode
328  * @mp: The metapath
329  *
330  * Assumes that the inode's buffer has already been looked up and
331  * hooked onto mp->mp_bh[0] and that the metapath has been initialised
332  * by find_metapath().
333  *
334  * If this function encounters part of the tree which has not been
335  * allocated, it returns the current height of the tree at the point
336  * at which it found the unallocated block. Blocks which are found are
337  * added to the mp->mp_bh[] list.
338  *
339  * Returns: error
340  */
341 
342 static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
343 {
344 	return __fillup_metapath(ip, mp, 0, ip->i_height - 1);
345 }
346 
347 /**
348  * fillup_metapath - fill up buffers for the metadata path to a specific height
349  * @ip: The inode
350  * @mp: The metapath
351  * @h: The height to which it should be mapped
352  *
353  * Similar to lookup_metapath, but does lookups for a range of heights
354  *
355  * Returns: error or the number of buffers filled
356  */
357 
358 static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h)
359 {
360 	unsigned int x = 0;
361 	int ret;
362 
363 	if (h) {
364 		/* find the first buffer we need to look up. */
365 		for (x = h - 1; x > 0; x--) {
366 			if (mp->mp_bh[x])
367 				break;
368 		}
369 	}
370 	ret = __fillup_metapath(ip, mp, x, h);
371 	if (ret)
372 		return ret;
373 	return mp->mp_aheight - x - 1;
374 }
375 
376 static inline void release_metapath(struct metapath *mp)
377 {
378 	int i;
379 
380 	for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) {
381 		if (mp->mp_bh[i] == NULL)
382 			break;
383 		brelse(mp->mp_bh[i]);
384 	}
385 }
386 
387 /**
388  * gfs2_extent_length - Returns length of an extent of blocks
389  * @start: Start of the buffer
390  * @len: Length of the buffer in bytes
391  * @ptr: Current position in the buffer
392  * @limit: Max extent length to return (0 = unlimited)
393  * @eob: Set to 1 if we hit "end of block"
394  *
395  * If the first block is zero (unallocated) it will return the number of
396  * unallocated blocks in the extent, otherwise it will return the number
397  * of contiguous blocks in the extent.
398  *
399  * Returns: The length of the extent (minimum of one block)
400  */
401 
static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, size_t limit, int *eob)
{
	const __be64 *end = (start + len);
	const __be64 *first = ptr;
	u64 d = be64_to_cpu(*ptr);	/* first block; 0 means unallocated */

	*eob = 0;
	do {
		ptr++;
		if (ptr >= end)
			break;
		if (limit && --limit == 0)
			break;
		/* For an allocated extent the next pointer must name the
		   next consecutive disk block; for a hole (d == 0) we just
		   keep matching further zero pointers. */
		if (d)
			d++;
	} while(be64_to_cpu(*ptr) == d);
	/* Reaching the end of the pointer block means the extent may
	   continue in the next block. */
	if (ptr >= end)
		*eob = 1;
	return (ptr - first);
}
422 
423 static inline void bmap_lock(struct gfs2_inode *ip, int create)
424 {
425 	if (create)
426 		down_write(&ip->i_rw_mutex);
427 	else
428 		down_read(&ip->i_rw_mutex);
429 }
430 
431 static inline void bmap_unlock(struct gfs2_inode *ip, int create)
432 {
433 	if (create)
434 		up_write(&ip->i_rw_mutex);
435 	else
436 		up_read(&ip->i_rw_mutex);
437 }
438 
439 static inline __be64 *gfs2_indirect_init(struct metapath *mp,
440 					 struct gfs2_glock *gl, unsigned int i,
441 					 unsigned offset, u64 bn)
442 {
443 	__be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data +
444 		       ((i > 1) ? sizeof(struct gfs2_meta_header) :
445 				 sizeof(struct gfs2_dinode)));
446 	BUG_ON(i < 1);
447 	BUG_ON(mp->mp_bh[i] != NULL);
448 	mp->mp_bh[i] = gfs2_meta_new(gl, bn);
449 	gfs2_trans_add_meta(gl, mp->mp_bh[i]);
450 	gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
451 	gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
452 	ptr += offset;
453 	*ptr = cpu_to_be64(bn);
454 	return ptr;
455 }
456 
/* States of the block-allocation state machine in gfs2_iomap_alloc(). */
enum alloc_state {
	ALLOC_DATA = 0,		/* tree complete; allocating data blocks */
	ALLOC_GROW_DEPTH = 1,	/* filling in indirect blocks below the tree */
	ALLOC_GROW_HEIGHT = 2,	/* adding new levels at the top of the tree */
	/* ALLOC_UNSTUFF = 3,   TBD and rather complicated */
};
463 
/**
 * gfs2_iomap_alloc - Build a metadata tree of the requested height
 * @inode: The GFS2 inode
 * @iomap: The iomap; length describes the extent wanted, addr and
 *         length are updated with the resulting mapping
 * @flags: iomap flags (IOMAP_ZERO requests zeroing of new data blocks)
 * @mp: The metapath, with proper height information calculated
 *
 * In this routine we may have to alloc:
 *   i) Indirect blocks to grow the metadata tree height
 *  ii) Indirect blocks to fill in lower part of the metadata tree
 * iii) Data blocks
 *
 * The function is in two parts. The first part works out the total
 * number of blocks which we need. The second part does the actual
 * allocation asking for an extent at a time (if enough contiguous free
 * blocks are available, there will only be one request per bmap call)
 * and uses the state machine to initialise the blocks in order.
 *
 * Returns: errno on error
 */
488 
static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
			    unsigned flags, struct metapath *mp)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	struct super_block *sb = sdp->sd_vfs;
	struct buffer_head *dibh = mp->mp_bh[0];
	u64 bn;
	unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
	unsigned dblks = 0;	/* number of data blocks to allocate */
	unsigned ptrs_per_blk;
	const unsigned end_of_metadata = mp->mp_fheight - 1;
	int ret;
	enum alloc_state state;
	__be64 *ptr;
	__be64 zero_bn = 0;
	size_t maxlen = iomap->length >> inode->i_blkbits;

	BUG_ON(mp->mp_aheight < 1);
	BUG_ON(dibh == NULL);

	gfs2_trans_add_meta(ip->i_gl, dibh);

	if (mp->mp_fheight == mp->mp_aheight) {
		struct buffer_head *bh;
		int eob;

		/* Bottom indirect block exists, find unalloced extent size */
		ptr = metapointer(end_of_metadata, mp);
		bh = mp->mp_bh[end_of_metadata];
		dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr,
					   maxlen, &eob);
		BUG_ON(dblks < 1);
		state = ALLOC_DATA;
	} else {
		/* Need to allocate indirect blocks */
		ptrs_per_blk = mp->mp_fheight > 1 ? sdp->sd_inptrs :
			sdp->sd_diptrs;
		/* Don't allocate past the end of the bottom pointer block. */
		dblks = min(maxlen, (size_t)(ptrs_per_blk -
					     mp->mp_list[end_of_metadata]));
		if (mp->mp_fheight == ip->i_height) {
			/* Writing into existing tree, extend tree down */
			iblks = mp->mp_fheight - mp->mp_aheight;
			state = ALLOC_GROW_DEPTH;
		} else {
			/* Building up tree height */
			state = ALLOC_GROW_HEIGHT;
			iblks = mp->mp_fheight - ip->i_height;
			branch_start = metapath_branch_start(mp);
			iblks += (mp->mp_fheight - branch_start);
		}
	}

	/* start of the second part of the function (state machine) */

	blks = dblks + iblks;
	i = mp->mp_aheight;
	do {
		int error;
		n = blks - alloced;
		error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
		if (error)
			return error;
		alloced += n;
		if (state != ALLOC_DATA || gfs2_is_jdata(ip))
			gfs2_trans_add_unrevoke(sdp, bn, n);
		switch (state) {
		/* Growing height of tree */
		case ALLOC_GROW_HEIGHT:
			if (i == 1) {
				/* Save the current top-level pointer: it
				   moves down a level when the tree grows. */
				ptr = (__be64 *)(dibh->b_data +
						 sizeof(struct gfs2_dinode));
				zero_bn = *ptr;
			}
			for (; i - 1 < mp->mp_fheight - ip->i_height && n > 0;
			     i++, n--)
				gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++);
			if (i - 1 == mp->mp_fheight - ip->i_height) {
				i--;
				/* Height fully grown: the new top indirect
				   block takes over the old dinode pointers. */
				gfs2_buffer_copy_tail(mp->mp_bh[i],
						sizeof(struct gfs2_meta_header),
						dibh, sizeof(struct gfs2_dinode));
				gfs2_buffer_clear_tail(dibh,
						sizeof(struct gfs2_dinode) +
						sizeof(__be64));
				ptr = (__be64 *)(mp->mp_bh[i]->b_data +
					sizeof(struct gfs2_meta_header));
				*ptr = zero_bn;
				state = ALLOC_GROW_DEPTH;
				/* Drop the buffers along the new branch so
				   they are re-created below. */
				for(i = branch_start; i < mp->mp_fheight; i++) {
					if (mp->mp_bh[i] == NULL)
						break;
					brelse(mp->mp_bh[i]);
					mp->mp_bh[i] = NULL;
				}
				i = branch_start;
			}
			if (n == 0)
				break;
			/* fall through - use remaining blocks for the branch */
		/* Branching from existing tree */
		case ALLOC_GROW_DEPTH:
			if (i > 1 && i < mp->mp_fheight)
				gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]);
			for (; i < mp->mp_fheight && n > 0; i++, n--)
				gfs2_indirect_init(mp, ip->i_gl, i,
						   mp->mp_list[i-1], bn++);
			if (i == mp->mp_fheight)
				state = ALLOC_DATA;
			if (n == 0)
				break;
			/* fall through - use remaining blocks for data */
		/* Tree complete, adding data blocks */
		case ALLOC_DATA:
			BUG_ON(n > dblks);
			BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
			gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]);
			dblks = n;
			ptr = metapointer(end_of_metadata, mp);
			iomap->addr = bn << inode->i_blkbits;
			iomap->flags |= IOMAP_F_NEW;
			while (n-- > 0)
				*ptr++ = cpu_to_be64(bn++);
			if (flags & IOMAP_ZERO) {
				ret = sb_issue_zeroout(sb, iomap->addr >> inode->i_blkbits,
						       dblks, GFP_NOFS);
				if (ret) {
					fs_err(sdp,
					       "Failed to zero data buffers\n");
					flags &= ~IOMAP_ZERO;
				}
			}
			break;
		}
	} while (iomap->addr == IOMAP_NULL_ADDR);

	iomap->length = (u64)dblks << inode->i_blkbits;
	ip->i_height = mp->mp_fheight;
	gfs2_add_inode_blocks(&ip->i_inode, alloced);
	gfs2_dinode_out(ip, mp->mp_bh[0]->b_data);
	return 0;
}
629 
/**
 * hole_size - figure out the size of a hole
 * @inode: The inode
 * @lblock: The logical starting block number
 * @mp: The metapath at the start of the hole (partially populated)
 *
 * Walks the levels of the metadata tree counting consecutive zero
 * pointers following @lblock, never counting past the last block
 * covered by i_size.
 *
 * Returns: The hole size in bytes
 *
 */
static u64 hole_size(struct inode *inode, sector_t lblock, struct metapath *mp)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	struct metapath mp_eof;
	u64 factor = 1;		/* logical blocks per pointer at this height */
	int hgt;
	u64 holesz = 0;		/* hole size so far, in logical blocks */
	const __be64 *first, *end, *ptr;
	const struct buffer_head *bh;
	u64 lblock_stop = (i_size_read(inode) - 1) >> inode->i_blkbits;
	int zeroptrs;
	bool done = false;

	/* Get another metapath, to the very last byte */
	find_metapath(sdp, lblock_stop, &mp_eof, ip->i_height);
	for (hgt = ip->i_height - 1; hgt >= 0 && !done; hgt--) {
		bh = mp->mp_bh[hgt];
		if (bh) {
			zeroptrs = 0;
			first = metapointer(hgt, mp);
			end = (const __be64 *)(bh->b_data + bh->b_size);

			/* Count zero pointers up to the first allocated one. */
			for (ptr = first; ptr < end; ptr++) {
				if (*ptr) {
					done = true;
					break;
				} else {
					zeroptrs++;
				}
			}
		} else {
			/* No buffer at this height: a full block's worth of
			   pointers is unallocated. */
			zeroptrs = sdp->sd_inptrs;
		}
		/* Never report a hole extending past the end of the file. */
		if (factor * zeroptrs >= lblock_stop - lblock + 1) {
			holesz = lblock_stop - lblock + 1;
			break;
		}
		holesz += factor * zeroptrs;

		factor *= sdp->sd_inptrs;
		if (hgt && (mp->mp_list[hgt - 1] < mp_eof.mp_list[hgt - 1]))
			(mp->mp_list[hgt - 1])++;
	}
	return holesz << inode->i_blkbits;
}
685 
686 static void gfs2_stuffed_iomap(struct inode *inode, struct iomap *iomap)
687 {
688 	struct gfs2_inode *ip = GFS2_I(inode);
689 
690 	iomap->addr = (ip->i_no_addr << inode->i_blkbits) +
691 		      sizeof(struct gfs2_dinode);
692 	iomap->offset = 0;
693 	iomap->length = i_size_read(inode);
694 	iomap->type = IOMAP_MAPPED;
695 	iomap->flags = IOMAP_F_DATA_INLINE;
696 }
697 
698 /**
699  * gfs2_iomap_begin - Map blocks from an inode to disk blocks
700  * @inode: The inode
701  * @pos: Starting position in bytes
702  * @length: Length to map, in bytes
703  * @flags: iomap flags
704  * @iomap: The iomap structure
705  *
706  * Returns: errno
707  */
int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
		     unsigned flags, struct iomap *iomap)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	struct metapath mp = { .mp_aheight = 1, };
	unsigned int factor = sdp->sd_sb.sb_bsize;
	const u64 *arr = sdp->sd_heightsize;
	__be64 *ptr;
	sector_t lblock;
	sector_t lend;
	int ret = 0;
	int eob;
	unsigned int len;
	struct buffer_head *bh;
	u8 height;

	trace_gfs2_iomap_start(ip, pos, length, flags);
	if (!length) {
		ret = -EINVAL;
		goto out;
	}

	if (gfs2_is_stuffed(ip)) {
		/* Inline data can be reported directly; writes are
		   expected to have unstuffed the inode first. */
		if (flags & IOMAP_REPORT) {
			gfs2_stuffed_iomap(inode, iomap);
			if (pos >= iomap->length)
				ret = -ENOENT;
			goto out;
		}
		BUG_ON(!(flags & IOMAP_WRITE));
	}

	lblock = pos >> inode->i_blkbits;
	lend = (pos + length + sdp->sd_sb.sb_bsize - 1) >> inode->i_blkbits;

	/* Start out assuming a hole covering the whole request. */
	iomap->offset = lblock << inode->i_blkbits;
	iomap->addr = IOMAP_NULL_ADDR;
	iomap->type = IOMAP_HOLE;
	iomap->length = (u64)(lend - lblock) << inode->i_blkbits;
	iomap->flags = IOMAP_F_MERGED;
	bmap_lock(ip, flags & IOMAP_WRITE);

	/*
	 * Directory data blocks have a struct gfs2_meta_header header, so the
	 * remaining size is smaller than the filesystem block size.  Logical
	 * block numbers for directories are in units of this remaining size!
	 */
	if (gfs2_is_dir(ip)) {
		factor = sdp->sd_jbsize;
		arr = sdp->sd_jheightsize;
	}

	ret = gfs2_meta_inode_buffer(ip, &mp.mp_bh[0]);
	if (ret)
		goto out_release;

	/* Work out the tree height needed to reach lblock. */
	height = ip->i_height;
	while ((lblock + 1) * factor > arr[height])
		height++;
	find_metapath(sdp, lblock, &mp, height);
	if (height > ip->i_height || gfs2_is_stuffed(ip))
		goto do_alloc;

	ret = lookup_metapath(ip, &mp);
	if (ret)
		goto out_release;

	/* A short walk means we hit an unallocated indirect block. */
	if (mp.mp_aheight != ip->i_height)
		goto do_alloc;

	ptr = metapointer(ip->i_height - 1, &mp);
	if (*ptr == 0)
		goto do_alloc;

	iomap->type = IOMAP_MAPPED;
	iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits;

	/* Extend the mapping over contiguously allocated blocks. */
	bh = mp.mp_bh[ip->i_height - 1];
	len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, lend - lblock, &eob);
	if (eob)
		iomap->flags |= IOMAP_F_BOUNDARY;
	iomap->length = (u64)len << inode->i_blkbits;

out_release:
	release_metapath(&mp);
	bmap_unlock(ip, flags & IOMAP_WRITE);
out:
	trace_gfs2_iomap_end(ip, iomap, ret);
	return ret;

do_alloc:
	if (flags & IOMAP_WRITE) {
		/* Allocate whatever the mapping still needs. */
		ret = gfs2_iomap_alloc(inode, iomap, flags, &mp);
	} else if (flags & IOMAP_REPORT) {
		loff_t size = i_size_read(inode);
		if (pos >= size)
			ret = -ENOENT;
		else if (height <= ip->i_height)
			iomap->length = hole_size(inode, lblock, &mp);
		else
			iomap->length = size - pos;
	} else {
		/* Plain read of a hole: report how large it is. */
		if (height <= ip->i_height)
			iomap->length = hole_size(inode, lblock, &mp);
	}
	goto out_release;
}
816 
817 /**
818  * gfs2_block_map - Map a block from an inode to a disk block
819  * @inode: The inode
820  * @lblock: The logical block number
821  * @bh_map: The bh to be mapped
822  * @create: True if its ok to alloc blocks to satify the request
823  *
824  * Sets buffer_mapped() if successful, sets buffer_boundary() if a
825  * read of metadata will be required before the next block can be
826  * mapped. Sets buffer_new() if new blocks were allocated.
827  *
828  * Returns: errno
829  */
830 
int gfs2_block_map(struct inode *inode, sector_t lblock,
		   struct buffer_head *bh_map, int create)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct iomap iomap;
	int ret, flags = 0;

	clear_buffer_mapped(bh_map);
	clear_buffer_new(bh_map);
	clear_buffer_boundary(bh_map);
	trace_gfs2_bmap(ip, bh_map, lblock, create, 1);

	/* Translate the bh-based request into iomap terms. */
	if (create)
		flags |= IOMAP_WRITE;
	if (buffer_zeronew(bh_map))
		flags |= IOMAP_ZERO;
	ret = gfs2_iomap_begin(inode, (loff_t)lblock << inode->i_blkbits,
			       bh_map->b_size, flags, &iomap);
	if (ret) {
		if (!create && ret == -ENOENT) {
			/* Return unmapped buffer beyond the end of file.  */
			ret = 0;
		}
		goto out;
	}

	/* Never report a mapping larger than the caller asked for. */
	if (iomap.length > bh_map->b_size) {
		iomap.length = bh_map->b_size;
		iomap.flags &= ~IOMAP_F_BOUNDARY;
	}
	if (iomap.addr != IOMAP_NULL_ADDR)
		map_bh(bh_map, inode->i_sb, iomap.addr >> inode->i_blkbits);
	bh_map->b_size = iomap.length;
	if (iomap.flags & IOMAP_F_BOUNDARY)
		set_buffer_boundary(bh_map);
	if (iomap.flags & IOMAP_F_NEW)
		set_buffer_new(bh_map);

out:
	trace_gfs2_bmap(ip, bh_map, lblock, create, ret);
	return ret;
}
873 
874 /*
875  * Deprecated: do not use in new code
876  */
877 int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen)
878 {
879 	struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 };
880 	int ret;
881 	int create = *new;
882 
883 	BUG_ON(!extlen);
884 	BUG_ON(!dblock);
885 	BUG_ON(!new);
886 
887 	bh.b_size = BIT(inode->i_blkbits + (create ? 0 : 5));
888 	ret = gfs2_block_map(inode, lblock, &bh, create);
889 	*extlen = bh.b_size >> inode->i_blkbits;
890 	*dblock = bh.b_blocknr;
891 	if (buffer_new(&bh))
892 		*new = 1;
893 	else
894 		*new = 0;
895 	return ret;
896 }
897 
898 /**
899  * gfs2_block_zero_range - Deal with zeroing out data
900  *
901  * This is partly borrowed from ext3.
902  */
static int gfs2_block_zero_range(struct inode *inode, loff_t from,
				 unsigned int length)
{
	struct address_space *mapping = inode->i_mapping;
	struct gfs2_inode *ip = GFS2_I(inode);
	unsigned long index = from >> PAGE_SHIFT;
	unsigned offset = from & (PAGE_SIZE-1);	/* offset within the page */
	unsigned blocksize, iblock, pos;
	struct buffer_head *bh;
	struct page *page;
	int err;

	page = find_or_create_page(mapping, index, GFP_NOFS);
	if (!page)
		return 0;

	blocksize = inode->i_sb->s_blocksize;
	iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);

	if (!page_has_buffers(page))
		create_empty_buffers(page, blocksize, 0);

	/* Find the buffer that contains "offset" */
	bh = page_buffers(page);
	pos = blocksize;
	while (offset >= pos) {
		bh = bh->b_this_page;
		iblock++;
		pos += blocksize;
	}

	err = 0;

	if (!buffer_mapped(bh)) {
		gfs2_block_map(inode, iblock, bh, 0);
		/* unmapped? It's a hole - nothing to do */
		if (!buffer_mapped(bh))
			goto unlock;
	}

	/* Ok, it's mapped. Make sure it's up-to-date */
	if (PageUptodate(page))
		set_buffer_uptodate(bh);

	/* Read the block in if needed, since we only zero part of it. */
	if (!buffer_uptodate(bh)) {
		err = -EIO;
		ll_rw_block(REQ_OP_READ, 0, 1, &bh);
		wait_on_buffer(bh);
		/* Uhhuh. Read error. Complain and punt. */
		if (!buffer_uptodate(bh))
			goto unlock;
		err = 0;
	}

	/* Journal the change unless the inode is in writeback mode. */
	if (!gfs2_is_writeback(ip))
		gfs2_trans_add_data(ip->i_gl, bh);

	zero_user(page, offset, length);
	mark_buffer_dirty(bh);
unlock:
	unlock_page(page);
	put_page(page);
	return err;
}
967 
968 #define GFS2_JTRUNC_REVOKES 8192
969 
970 /**
971  * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files
972  * @inode: The inode being truncated
973  * @oldsize: The original (larger) size
974  * @newsize: The new smaller size
975  *
976  * With jdata files, we have to journal a revoke for each block which is
977  * truncated. As a result, we need to split this into separate transactions
978  * if the number of pages being truncated gets too large.
979  */
980 
static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize)
{
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
	u64 chunk;
	int error;

	/* Truncate from the tail toward newsize, one bounded chunk per
	   transaction. */
	while (oldsize != newsize) {
		struct gfs2_trans *tr;
		unsigned int offs;

		chunk = oldsize - newsize;
		if (chunk > max_chunk)
			chunk = max_chunk;

		/* Align so each step ends on a page boundary (after the
		   first, possibly partial, page). */
		offs = oldsize & ~PAGE_MASK;
		if (offs && chunk > PAGE_SIZE)
			chunk = offs + ((chunk - offs) & PAGE_MASK);

		truncate_pagecache(inode, oldsize - chunk);
		oldsize -= chunk;

		/* Only cycle the transaction if this one was actually used. */
		tr = current->journal_info;
		if (!test_bit(TR_TOUCHED, &tr->tr_flags))
			continue;

		gfs2_trans_end(sdp);
		error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
		if (error)
			return error;
	}

	return 0;
}
1015 
/* First phase of a truncate: zero the partial tail, update i_size and
 * flag the inode so an interrupted truncate can be completed later. */
static int trunc_start(struct inode *inode, u64 newsize)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	struct buffer_head *dibh = NULL;
	int journaled = gfs2_is_jdata(ip);
	u64 oldsize = inode->i_size;
	int error;

	/* jdata truncates need room for per-block revokes. */
	if (journaled)
		error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES);
	else
		error = gfs2_trans_begin(sdp, RES_DINODE, 0);
	if (error)
		return error;

	error = gfs2_meta_inode_buffer(ip, &dibh);
	if (error)
		goto out;

	gfs2_trans_add_meta(ip->i_gl, dibh);

	if (gfs2_is_stuffed(ip)) {
		/* Inline data: just clear everything past the new size. */
		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
	} else {
		unsigned int blocksize = i_blocksize(inode);
		unsigned int offs = newsize & (blocksize - 1);
		/* Zero the tail of the last remaining block. */
		if (offs) {
			error = gfs2_block_zero_range(inode, newsize,
						      blocksize - offs);
			if (error)
				goto out;
		}
		ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
	}

	i_size_write(inode, newsize);
	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
	gfs2_dinode_out(ip, dibh->b_data);

	if (journaled)
		error = gfs2_journaled_truncate(inode, oldsize, newsize);
	else
		truncate_pagecache(inode, newsize);

out:
	brelse(dibh);	/* dibh is still NULL here if the lookup failed; brelse() tolerates NULL */
	if (current->journal_info)
		gfs2_trans_end(sdp);
	return error;
}
1067 
1068 /**
1069  * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein
1070  * @ip: inode
 * @rd_gh: holder of resource group glock
1072  * @bh: buffer head to sweep
1073  * @start: starting point in bh
1074  * @end: end point in bh
1075  * @meta: true if bh points to metadata (rather than data)
1076  * @btotal: place to keep count of total blocks freed
1077  *
1078  * We sweep a metadata buffer (provided by the metapath) for blocks we need to
1079  * free, and free them all. However, we do it one rgrp at a time. If this
1080  * block has references to multiple rgrps, we break it into individual
1081  * transactions. This allows other processes to use the rgrps while we're
1082  * focused on a single one, for better concurrency / performance.
1083  * At every transaction boundary, we rewrite the inode into the journal.
1084  * That way the bitmaps are kept consistent with the inode and we can recover
1085  * if we're interrupted by power-outages.
1086  *
1087  * Returns: 0, or return code if an error occurred.
1088  *          *btotal has the total number of blocks freed
1089  */
static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh,
			      struct buffer_head *bh, __be64 *start, __be64 *end,
			      bool meta, u32 *btotal)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	struct gfs2_rgrpd *rgd;
	struct gfs2_trans *tr;
	__be64 *p;
	int blks_outside_rgrp;
	u64 bn, bstart, isize_blks;
	s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */
	int ret = 0;
	bool buf_in_tr = false; /* buffer was added to transaction */

more_rgrps:
	rgd = NULL;
	/* On re-entry (via the goto below) we may still hold an rgrp glock
	   from a previous pass; keep working on that rgrp. */
	if (gfs2_holder_initialized(rd_gh)) {
		rgd = gfs2_glock2rgrp(rd_gh->gh_gl);
		gfs2_assert_withdraw(sdp,
			     gfs2_glock_is_locked_by_me(rd_gh->gh_gl));
	}
	blks_outside_rgrp = 0;
	bstart = 0;
	blen = 0;

	/* Zero and free every block pointer in [start, end), batching
	   physically contiguous runs into single free calls. */
	for (p = start; p < end; p++) {
		if (!*p)
			continue;
		bn = be64_to_cpu(*p);

		if (rgd) {
			if (!rgrp_contains_block(rgd, bn)) {
				/* Not in this rgrp: handle it on a later pass. */
				blks_outside_rgrp++;
				continue;
			}
		} else {
			/* First block found: lock its rgrp exclusively. */
			rgd = gfs2_blk2rgrpd(sdp, bn, true);
			if (unlikely(!rgd)) {
				ret = -EIO;
				goto out;
			}
			ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
						 0, rd_gh);
			if (ret)
				goto out;

			/* Must be done with the rgrp glock held: */
			if (gfs2_rs_active(&ip->i_res) &&
			    rgd == ip->i_res.rs_rbm.rgd)
				gfs2_rs_deltree(&ip->i_res);
		}

		/* The size of our transactions will be unknown until we
		   actually process all the metadata blocks that relate to
		   the rgrp. So we estimate. We know it can't be more than
		   the dinode's i_blocks and we don't want to exceed the
		   journal flush threshold, sd_log_thresh2. */
		if (current->journal_info == NULL) {
			unsigned int jblocks_rqsted, revokes;

			jblocks_rqsted = rgd->rd_length + RES_DINODE +
				RES_INDIRECT;
			isize_blks = gfs2_get_inode_blocks(&ip->i_inode);
			if (isize_blks > atomic_read(&sdp->sd_log_thresh2))
				jblocks_rqsted +=
					atomic_read(&sdp->sd_log_thresh2);
			else
				jblocks_rqsted += isize_blks;
			revokes = jblocks_rqsted;
			if (meta)
				revokes += end - start;
			else if (ip->i_depth)
				revokes += sdp->sd_inptrs;
			ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
			if (ret)
				goto out_unlock;
			down_write(&ip->i_rw_mutex);
		}
		/* check if we will exceed the transaction blocks requested */
		tr = current->journal_info;
		if (tr->tr_num_buf_new + RES_STATFS +
		    RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
			/* We set blks_outside_rgrp to ensure the loop will
			   be repeated for the same rgrp, but with a new
			   transaction. */
			blks_outside_rgrp++;
			/* This next part is tricky. If the buffer was added
			   to the transaction, we've already set some block
			   pointers to 0, so we better follow through and free
			   them, or we will introduce corruption (so break).
			   This may be impossible, or at least rare, but I
			   decided to cover the case regardless.

			   If the buffer was not added to the transaction
			   (this call), doing so would exceed our transaction
			   size, so we need to end the transaction and start a
			   new one (so goto). */

			if (buf_in_tr)
				break;
			goto out_unlock;
		}

		gfs2_trans_add_meta(ip->i_gl, bh);
		buf_in_tr = true;
		*p = 0;
		if (bstart + blen == bn) {
			/* Extends the current contiguous run. */
			blen++;
			continue;
		}
		if (bstart) {
			/* Flush the previous run before starting a new one. */
			__gfs2_free_blocks(ip, bstart, (u32)blen, meta);
			(*btotal) += blen;
			gfs2_add_inode_blocks(&ip->i_inode, -blen);
		}
		bstart = bn;
		blen = 1;
	}
	/* Free the final run, if any. */
	if (bstart) {
		__gfs2_free_blocks(ip, bstart, (u32)blen, meta);
		(*btotal) += blen;
		gfs2_add_inode_blocks(&ip->i_inode, -blen);
	}
out_unlock:
	if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks
					    outside the rgrp we just processed,
					    do it all over again. */
		if (current->journal_info) {
			struct buffer_head *dibh;

			ret = gfs2_meta_inode_buffer(ip, &dibh);
			if (ret)
				goto out;

			/* Every transaction boundary, we rewrite the dinode
			   to keep its di_blocks current in case of failure. */
			ip->i_inode.i_mtime = ip->i_inode.i_ctime =
				current_time(&ip->i_inode);
			gfs2_trans_add_meta(ip->i_gl, dibh);
			gfs2_dinode_out(ip, dibh->b_data);
			brelse(dibh);
			up_write(&ip->i_rw_mutex);
			gfs2_trans_end(sdp);
		}
		gfs2_glock_dq_uninit(rd_gh);
		cond_resched();
		goto more_rgrps;
	}
out:
	return ret;
}
1241 
1242 static bool mp_eq_to_hgt(struct metapath *mp, __u16 *list, unsigned int h)
1243 {
1244 	if (memcmp(mp->mp_list, list, h * sizeof(mp->mp_list[0])))
1245 		return false;
1246 	return true;
1247 }
1248 
1249 /**
1250  * find_nonnull_ptr - find a non-null pointer given a metapath and height
1251  * @mp: starting metapath
1252  * @h: desired height to search
1253  *
1254  * Assumes the metapath is valid (with buffers) out to height h.
1255  * Returns: true if a non-null pointer was found in the metapath buffer
1256  *          false if all remaining pointers are NULL in the buffer
1257  */
1258 static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp,
1259 			     unsigned int h,
1260 			     __u16 *end_list, unsigned int end_aligned)
1261 {
1262 	struct buffer_head *bh = mp->mp_bh[h];
1263 	__be64 *first, *ptr, *end;
1264 
1265 	first = metaptr1(h, mp);
1266 	ptr = first + mp->mp_list[h];
1267 	end = (__be64 *)(bh->b_data + bh->b_size);
1268 	if (end_list && mp_eq_to_hgt(mp, end_list, h)) {
1269 		bool keep_end = h < end_aligned;
1270 		end = first + end_list[h] + keep_end;
1271 	}
1272 
1273 	while (ptr < end) {
1274 		if (*ptr) { /* if we have a non-null pointer */
1275 			mp->mp_list[h] = ptr - first;
1276 			h++;
1277 			if (h < GFS2_MAX_META_HEIGHT)
1278 				mp->mp_list[h] = 0;
1279 			return true;
1280 		}
1281 		ptr++;
1282 	}
1283 	return false;
1284 }
1285 
/* States of the punch_hole() deallocation state machine. */
enum dealloc_states {
	DEALLOC_MP_FULL = 0,    /* Strip a metapath with all buffers read in */
	DEALLOC_MP_LOWER = 1,   /* lower the metapath strip height */
	DEALLOC_FILL_MP = 2,  /* Fill in the metapath to the given height. */
	DEALLOC_DONE = 3,       /* process complete */
};
1292 
1293 static inline void
1294 metapointer_range(struct metapath *mp, int height,
1295 		  __u16 *start_list, unsigned int start_aligned,
1296 		  __u16 *end_list, unsigned int end_aligned,
1297 		  __be64 **start, __be64 **end)
1298 {
1299 	struct buffer_head *bh = mp->mp_bh[height];
1300 	__be64 *first;
1301 
1302 	first = metaptr1(height, mp);
1303 	*start = first;
1304 	if (mp_eq_to_hgt(mp, start_list, height)) {
1305 		bool keep_start = height < start_aligned;
1306 		*start = first + start_list[height] + keep_start;
1307 	}
1308 	*end = (__be64 *)(bh->b_data + bh->b_size);
1309 	if (end_list && mp_eq_to_hgt(mp, end_list, height)) {
1310 		bool keep_end = height < end_aligned;
1311 		*end = first + end_list[height] + keep_end;
1312 	}
1313 }
1314 
1315 static inline bool walk_done(struct gfs2_sbd *sdp,
1316 			     struct metapath *mp, int height,
1317 			     __u16 *end_list, unsigned int end_aligned)
1318 {
1319 	__u16 end;
1320 
1321 	if (end_list) {
1322 		bool keep_end = height < end_aligned;
1323 		if (!mp_eq_to_hgt(mp, end_list, height))
1324 			return false;
1325 		end = end_list[height] + keep_end;
1326 	} else
1327 		end = (height > 0) ? sdp->sd_inptrs : sdp->sd_diptrs;
1328 	return mp->mp_list[height] >= end;
1329 }
1330 
1331 /**
1332  * punch_hole - deallocate blocks in a file
1333  * @ip: inode to truncate
1334  * @offset: the start of the hole
1335  * @length: the size of the hole (or 0 for truncate)
1336  *
1337  * Punch a hole into a file or truncate a file at a given position.  This
1338  * function operates in whole blocks (@offset and @length are rounded
1339  * accordingly); partially filled blocks must be cleared otherwise.
1340  *
1341  * This function works from the bottom up, and from the right to the left. In
1342  * other words, it strips off the highest layer (data) before stripping any of
1343  * the metadata. Doing it this way is best in case the operation is interrupted
1344  * by power failure, etc.  The dinode is rewritten in every transaction to
1345  * guarantee integrity.
1346  */
static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	struct metapath mp = {};
	struct buffer_head *dibh, *bh;
	struct gfs2_holder rd_gh;
	unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift;
	u64 lblock = (offset + (1 << bsize_shift) - 1) >> bsize_shift;
	__u16 start_list[GFS2_MAX_META_HEIGHT];
	__u16 __end_list[GFS2_MAX_META_HEIGHT], *end_list = NULL;
	unsigned int start_aligned, uninitialized_var(end_aligned);
	unsigned int strip_h = ip->i_height - 1;
	u32 btotal = 0;
	int ret, state;
	int mp_h; /* metapath buffers are read in to this height */
	u64 prev_bnr = 0;
	__be64 *start, *end;

	/*
	 * The start position of the hole is defined by lblock, start_list, and
	 * start_aligned.  The end position of the hole is defined by lend,
	 * end_list, and end_aligned.
	 *
	 * start_aligned and end_aligned define down to which height the start
	 * and end positions are aligned to the metadata tree (i.e., the
	 * position is a multiple of the metadata granularity at the height
	 * above).  This determines at which heights additional meta pointers
	 * needs to be preserved for the remaining data.
	 */

	if (length) {
		u64 maxsize = sdp->sd_heightsize[ip->i_height];
		u64 end_offset = offset + length;
		u64 lend;

		/*
		 * Clip the end at the maximum file size for the given height:
		 * that's how far the metadata goes; files bigger than that
		 * will have additional layers of indirection.
		 */
		if (end_offset > maxsize)
			end_offset = maxsize;
		lend = end_offset >> bsize_shift;

		if (lblock >= lend)
			return 0;

		find_metapath(sdp, lend, &mp, ip->i_height);
		end_list = __end_list;
		memcpy(end_list, mp.mp_list, sizeof(mp.mp_list));

		/* Find the lowest height at which the end index is non-zero;
		   below that the end point is tree-aligned. */
		for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
			if (end_list[mp_h])
				break;
		}
		end_aligned = mp_h;
	}

	find_metapath(sdp, lblock, &mp, ip->i_height);
	memcpy(start_list, mp.mp_list, sizeof(start_list));

	/* Same alignment computation for the start point. */
	for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
		if (start_list[mp_h])
			break;
	}
	start_aligned = mp_h;

	ret = gfs2_meta_inode_buffer(ip, &dibh);
	if (ret)
		return ret;

	mp.mp_bh[0] = dibh;
	ret = lookup_metapath(ip, &mp);
	if (ret)
		goto out_metapath;

	/* issue read-ahead on metadata */
	for (mp_h = 0; mp_h < mp.mp_aheight - 1; mp_h++) {
		metapointer_range(&mp, mp_h, start_list, start_aligned,
				  end_list, end_aligned, &start, &end);
		gfs2_metapath_ra(ip->i_gl, start, end);
	}

	if (mp.mp_aheight == ip->i_height)
		state = DEALLOC_MP_FULL; /* We have a complete metapath */
	else
		state = DEALLOC_FILL_MP; /* deal with partial metapath */

	ret = gfs2_rindex_update(sdp);
	if (ret)
		goto out_metapath;

	ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
	if (ret)
		goto out_metapath;
	gfs2_holder_mark_uninitialized(&rd_gh);

	/* Start stripping at the highest (data) layer and work down. */
	mp_h = strip_h;

	while (state != DEALLOC_DONE) {
		switch (state) {
		/* Truncate a full metapath at the given strip height.
		 * Note that strip_h == mp_h in order to be in this state. */
		case DEALLOC_MP_FULL:
			bh = mp.mp_bh[mp_h];
			gfs2_assert_withdraw(sdp, bh);
			/* Sweeping the same buffer twice would indicate a
			   stuck state machine; warn loudly but continue. */
			if (gfs2_assert_withdraw(sdp,
						 prev_bnr != bh->b_blocknr)) {
				printk(KERN_EMERG "GFS2: fsid=%s:inode %llu, "
				       "block:%llu, i_h:%u, s_h:%u, mp_h:%u\n",
				       sdp->sd_fsname,
				       (unsigned long long)ip->i_no_addr,
				       prev_bnr, ip->i_height, strip_h, mp_h);
			}
			prev_bnr = bh->b_blocknr;

			if (gfs2_metatype_check(sdp, bh,
						(mp_h ? GFS2_METATYPE_IN :
							GFS2_METATYPE_DI))) {
				ret = -EIO;
				goto out;
			}

			/*
			 * Below, passing end_aligned as 0 gives us the
			 * metapointer range excluding the end point: the end
			 * point is the first metapath we must not deallocate!
			 */

			metapointer_range(&mp, mp_h, start_list, start_aligned,
					  end_list, 0 /* end_aligned */,
					  &start, &end);
			ret = sweep_bh_for_rgrps(ip, &rd_gh, mp.mp_bh[mp_h],
						 start, end,
						 mp_h != ip->i_height - 1,
						 &btotal);

			/* If we hit an error or just swept dinode buffer,
			   just exit. */
			if (ret || !mp_h) {
				state = DEALLOC_DONE;
				break;
			}
			state = DEALLOC_MP_LOWER;
			break;

		/* lower the metapath strip height */
		case DEALLOC_MP_LOWER:
			/* We're done with the current buffer, so release it,
			   unless it's the dinode buffer. Then back up to the
			   previous pointer. */
			if (mp_h) {
				brelse(mp.mp_bh[mp_h]);
				mp.mp_bh[mp_h] = NULL;
			}
			/* If we can't get any lower in height, we've stripped
			   off all we can. Next step is to back up and start
			   stripping the previous level of metadata. */
			if (mp_h == 0) {
				strip_h--;
				memcpy(mp.mp_list, start_list, sizeof(start_list));
				mp_h = strip_h;
				state = DEALLOC_FILL_MP;
				break;
			}
			mp.mp_list[mp_h] = 0;
			mp_h--; /* search one metadata height down */
			mp.mp_list[mp_h]++;
			if (walk_done(sdp, &mp, mp_h, end_list, end_aligned))
				break;
			/* Here we've found a part of the metapath that is not
			 * allocated. We need to search at that height for the
			 * next non-null pointer. */
			if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned)) {
				state = DEALLOC_FILL_MP;
				mp_h++;
			}
			/* No more non-null pointers at this height. Back up
			   to the previous height and try again. */
			break; /* loop around in the same state */

		/* Fill the metapath with buffers to the given height. */
		case DEALLOC_FILL_MP:
			/* Fill the buffers out to the current height. */
			ret = fillup_metapath(ip, &mp, mp_h);
			if (ret < 0)
				goto out;

			/* issue read-ahead on metadata */
			if (mp.mp_aheight > 1) {
				for (; ret > 1; ret--) {
					metapointer_range(&mp, mp.mp_aheight - ret,
							  start_list, start_aligned,
							  end_list, end_aligned,
							  &start, &end);
					gfs2_metapath_ra(ip->i_gl, start, end);
				}
			}

			/* If buffers found for the entire strip height */
			if (mp.mp_aheight - 1 == strip_h) {
				state = DEALLOC_MP_FULL;
				break;
			}
			if (mp.mp_aheight < ip->i_height) /* We have a partial height */
				mp_h = mp.mp_aheight - 1;

			/* If we find a non-null block pointer, crawl a bit
			   higher up in the metapath and try again, otherwise
			   we need to look lower for a new starting point. */
			if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned))
				mp_h++;
			else
				state = DEALLOC_MP_LOWER;
			break;
		}
	}

	/* Account for everything we freed: statfs, quota, and the dinode. */
	if (btotal) {
		if (current->journal_info == NULL) {
			ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS +
					       RES_QUOTA, 0);
			if (ret)
				goto out;
			down_write(&ip->i_rw_mutex);
		}
		gfs2_statfs_change(sdp, 0, +btotal, 0);
		gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
				  ip->i_inode.i_gid);
		ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
		gfs2_trans_add_meta(ip->i_gl, dibh);
		gfs2_dinode_out(ip, dibh->b_data);
		up_write(&ip->i_rw_mutex);
		gfs2_trans_end(sdp);
	}

out:
	if (gfs2_holder_initialized(&rd_gh))
		gfs2_glock_dq_uninit(&rd_gh);
	if (current->journal_info) {
		up_write(&ip->i_rw_mutex);
		gfs2_trans_end(sdp);
		cond_resched();
	}
	gfs2_quota_unhold(ip);
out_metapath:
	release_metapath(&mp);
	return ret;
}
1596 
/**
 * trunc_end - finish off a truncate operation
 * @ip: the inode being truncated
 *
 * Clears GFS2_DIF_TRUNC_IN_PROG and, if the file is now empty,
 * collapses it back to a height-0 (stuffed) layout.
 *
 * Returns: errno
 */
static int trunc_end(struct gfs2_inode *ip)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	struct buffer_head *dibh;
	int error;

	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
	if (error)
		return error;

	down_write(&ip->i_rw_mutex);

	error = gfs2_meta_inode_buffer(ip, &dibh);
	if (error)
		goto out;

	if (!i_size_read(&ip->i_inode)) {
		/* Empty file: reset metadata height and allocation goal. */
		ip->i_height = 0;
		ip->i_goal = ip->i_no_addr;
		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
		gfs2_ordered_del_inode(ip);
	}
	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
	ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;

	gfs2_trans_add_meta(ip->i_gl, dibh);
	gfs2_dinode_out(ip, dibh->b_data);
	brelse(dibh);

out:
	up_write(&ip->i_rw_mutex);
	gfs2_trans_end(sdp);
	return error;
}
1631 
1632 /**
1633  * do_shrink - make a file smaller
1634  * @inode: the inode
1635  * @newsize: the size to make the file
1636  *
1637  * Called with an exclusive lock on @inode. The @size must
1638  * be equal to or smaller than the current inode size.
1639  *
1640  * Returns: errno
1641  */
1642 
1643 static int do_shrink(struct inode *inode, u64 newsize)
1644 {
1645 	struct gfs2_inode *ip = GFS2_I(inode);
1646 	int error;
1647 
1648 	error = trunc_start(inode, newsize);
1649 	if (error < 0)
1650 		return error;
1651 	if (gfs2_is_stuffed(ip))
1652 		return 0;
1653 
1654 	error = punch_hole(ip, newsize, 0);
1655 	if (error == 0)
1656 		error = trunc_end(ip);
1657 
1658 	return error;
1659 }
1660 
1661 void gfs2_trim_blocks(struct inode *inode)
1662 {
1663 	int ret;
1664 
1665 	ret = do_shrink(inode, inode->i_size);
1666 	WARN_ON(ret != 0);
1667 }
1668 
1669 /**
1670  * do_grow - Touch and update inode size
1671  * @inode: The inode
1672  * @size: The new size
1673  *
1674  * This function updates the timestamps on the inode and
1675  * may also increase the size of the inode. This function
1676  * must not be called with @size any smaller than the current
1677  * inode size.
1678  *
1679  * Although it is not strictly required to unstuff files here,
1680  * earlier versions of GFS2 have a bug in the stuffed file reading
1681  * code which will result in a buffer overrun if the size is larger
1682  * than the max stuffed file size. In order to prevent this from
1683  * occurring, such files are unstuffed, but in other cases we can
1684  * just update the inode size directly.
1685  *
1686  * Returns: 0 on success, or -ve on error
1687  */
1688 
static int do_grow(struct inode *inode, u64 size)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	struct gfs2_alloc_parms ap = { .target = 1, };
	struct buffer_head *dibh;
	int error;
	int unstuff = 0;

	/* Growing past the stuffed limit requires allocating a data block,
	   so reserve quota and rgrp space up front. */
	if (gfs2_is_stuffed(ip) && size > gfs2_max_stuffed_size(ip)) {
		error = gfs2_quota_lock_check(ip, &ap);
		if (error)
			return error;

		error = gfs2_inplace_reserve(ip, &ap);
		if (error)
			goto do_grow_qunlock;
		unstuff = 1;
	}

	error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
				 (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
				  0 : RES_QUOTA), 0);
	if (error)
		goto do_grow_release;

	if (unstuff) {
		error = gfs2_unstuff_dinode(ip, NULL);
		if (error)
			goto do_end_trans;
	}

	error = gfs2_meta_inode_buffer(ip, &dibh);
	if (error)
		goto do_end_trans;

	i_size_write(inode, size);
	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
	gfs2_trans_add_meta(ip->i_gl, dibh);
	gfs2_dinode_out(ip, dibh->b_data);
	brelse(dibh);

do_end_trans:
	gfs2_trans_end(sdp);
do_grow_release:
	/* do_grow_qunlock sits inside the if so that jumping straight to
	   it skips gfs2_inplace_release when only the quota lock was
	   taken; falling through releases both. */
	if (unstuff) {
		gfs2_inplace_release(ip);
do_grow_qunlock:
		gfs2_quota_unlock(ip);
	}
	return error;
}
1741 
1742 /**
1743  * gfs2_setattr_size - make a file a given size
1744  * @inode: the inode
1745  * @newsize: the size to make the file
1746  *
1747  * The file size can grow, shrink, or stay the same size. This
1748  * is called holding i_mutex and an exclusive glock on the inode
1749  * in question.
1750  *
1751  * Returns: errno
1752  */
1753 
1754 int gfs2_setattr_size(struct inode *inode, u64 newsize)
1755 {
1756 	struct gfs2_inode *ip = GFS2_I(inode);
1757 	int ret;
1758 
1759 	BUG_ON(!S_ISREG(inode->i_mode));
1760 
1761 	ret = inode_newsize_ok(inode, newsize);
1762 	if (ret)
1763 		return ret;
1764 
1765 	inode_dio_wait(inode);
1766 
1767 	ret = gfs2_rsqa_alloc(ip);
1768 	if (ret)
1769 		goto out;
1770 
1771 	if (newsize >= inode->i_size) {
1772 		ret = do_grow(inode, newsize);
1773 		goto out;
1774 	}
1775 
1776 	ret = do_shrink(inode, newsize);
1777 out:
1778 	gfs2_rsqa_delete(ip, NULL);
1779 	return ret;
1780 }
1781 
1782 int gfs2_truncatei_resume(struct gfs2_inode *ip)
1783 {
1784 	int error;
1785 	error = punch_hole(ip, i_size_read(&ip->i_inode), 0);
1786 	if (!error)
1787 		error = trunc_end(ip);
1788 	return error;
1789 }
1790 
/**
 * gfs2_file_dealloc - free all the blocks of an inode
 * @ip: the inode
 *
 * A zero length means "to the end of the file", so this punches a hole
 * over the entire file contents.
 *
 * Returns: errno
 */
int gfs2_file_dealloc(struct gfs2_inode *ip)
{
	return punch_hole(ip, 0, 0);
}
1795 
1796 /**
1797  * gfs2_free_journal_extents - Free cached journal bmap info
1798  * @jd: The journal
1799  *
1800  */
1801 
1802 void gfs2_free_journal_extents(struct gfs2_jdesc *jd)
1803 {
1804 	struct gfs2_journal_extent *jext;
1805 
1806 	while(!list_empty(&jd->extent_list)) {
1807 		jext = list_entry(jd->extent_list.next, struct gfs2_journal_extent, list);
1808 		list_del(&jext->list);
1809 		kfree(jext);
1810 	}
1811 }
1812 
1813 /**
1814  * gfs2_add_jextent - Add or merge a new extent to extent cache
1815  * @jd: The journal descriptor
1816  * @lblock: The logical block at start of new extent
1817  * @dblock: The physical block at start of new extent
1818  * @blocks: Size of extent in fs blocks
1819  *
1820  * Returns: 0 on success or -ENOMEM
1821  */
1822 
1823 static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks)
1824 {
1825 	struct gfs2_journal_extent *jext;
1826 
1827 	if (!list_empty(&jd->extent_list)) {
1828 		jext = list_entry(jd->extent_list.prev, struct gfs2_journal_extent, list);
1829 		if ((jext->dblock + jext->blocks) == dblock) {
1830 			jext->blocks += blocks;
1831 			return 0;
1832 		}
1833 	}
1834 
1835 	jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS);
1836 	if (jext == NULL)
1837 		return -ENOMEM;
1838 	jext->dblock = dblock;
1839 	jext->lblock = lblock;
1840 	jext->blocks = blocks;
1841 	list_add_tail(&jext->list, &jd->extent_list);
1842 	jd->nr_extents++;
1843 	return 0;
1844 }
1845 
1846 /**
1847  * gfs2_map_journal_extents - Cache journal bmap info
1848  * @sdp: The super block
1849  * @jd: The journal to map
1850  *
1851  * Create a reusable "extent" mapping from all logical
1852  * blocks to all physical blocks for the given journal.  This will save
1853  * us time when writing journal blocks.  Most journals will have only one
1854  * extent that maps all their logical blocks.  That's because gfs2.mkfs
1855  * arranges the journal blocks sequentially to maximize performance.
1856  * So the extent would map the first block for the entire file length.
1857  * However, gfs2_jadd can happen while file activity is happening, so
1858  * those journals may not be sequential.  Less likely is the case where
1859  * the users created their own journals by mounting the metafs and
1860  * laying it out.  But it's still possible.  These journals might have
1861  * several extents.
1862  *
1863  * Returns: 0 on success, or error on failure
1864  */
1865 
int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
{
	u64 lblock = 0;
	u64 lblock_stop;
	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
	struct buffer_head bh;	/* on-stack bh used purely as a bmap result */
	unsigned int shift = sdp->sd_sb.sb_bsize_shift;
	u64 size;
	int rc;

	lblock_stop = i_size_read(jd->jd_inode) >> shift;
	size = (lblock_stop - lblock) << shift;
	jd->nr_extents = 0;
	WARN_ON(!list_empty(&jd->extent_list));

	/* Walk the whole journal; bh.b_size is both the maximum mapping
	   size requested and, on return, the size actually mapped. */
	do {
		bh.b_state = 0;
		bh.b_blocknr = 0;
		bh.b_size = size;
		rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0);
		/* A hole in a journal would be a corruption: fail. */
		if (rc || !buffer_mapped(&bh))
			goto fail;
		rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift);
		if (rc)
			goto fail;
		size -= bh.b_size;
		lblock += (bh.b_size >> ip->i_inode.i_blkbits);
	} while(size > 0);

	fs_info(sdp, "journal %d mapped with %u extents\n", jd->jd_jid,
		jd->nr_extents);
	return 0;

fail:
	fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n",
		rc, jd->jd_jid,
		(unsigned long long)(i_size_read(jd->jd_inode) - size),
		jd->nr_extents);
	fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n",
		rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr,
		bh.b_state, (unsigned long long)bh.b_size);
	gfs2_free_journal_extents(jd);
	return rc;
}
1910 
1911 /**
1912  * gfs2_write_alloc_required - figure out if a write will require an allocation
1913  * @ip: the file being written to
1914  * @offset: the offset to write to
1915  * @len: the number of bytes being written
1916  *
1917  * Returns: 1 if an alloc is required, 0 otherwise
1918  */
1919 
int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
			      unsigned int len)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	struct buffer_head bh;	/* on-stack bh used purely as a bmap result */
	unsigned int shift;
	u64 lblock, lblock_stop, size;
	u64 end_of_file;

	if (!len)
		return 0;

	if (gfs2_is_stuffed(ip)) {
		/* Stuffed writes only allocate when they burst the dinode. */
		if (offset + len > gfs2_max_stuffed_size(ip))
			return 1;
		return 0;
	}

	shift = sdp->sd_sb.sb_bsize_shift;
	BUG_ON(gfs2_is_dir(ip));
	end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
	lblock = offset >> shift;
	lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
	/* Anything past the last allocated block needs new blocks. */
	if (lblock_stop > end_of_file)
		return 1;

	/* Within the file: allocation is needed iff the range has a hole. */
	size = (lblock_stop - lblock) << shift;
	do {
		bh.b_state = 0;
		bh.b_size = size;
		gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
		if (!buffer_mapped(&bh))
			return 1;
		size -= bh.b_size;
		lblock += (bh.b_size >> ip->i_inode.i_blkbits);
	} while(size > 0);

	return 0;
}
1959 
1960 static int stuffed_zero_range(struct inode *inode, loff_t offset, loff_t length)
1961 {
1962 	struct gfs2_inode *ip = GFS2_I(inode);
1963 	struct buffer_head *dibh;
1964 	int error;
1965 
1966 	if (offset >= inode->i_size)
1967 		return 0;
1968 	if (offset + length > inode->i_size)
1969 		length = inode->i_size - offset;
1970 
1971 	error = gfs2_meta_inode_buffer(ip, &dibh);
1972 	if (error)
1973 		return error;
1974 	gfs2_trans_add_meta(ip->i_gl, dibh);
1975 	memset(dibh->b_data + sizeof(struct gfs2_dinode) + offset, 0,
1976 	       length);
1977 	brelse(dibh);
1978 	return 0;
1979 }
1980 
/*
 * Truncate the page cache over [offset, offset + length) for a jdata
 * file, in chunks small enough that each transaction stays within the
 * GFS2_JTRUNC_REVOKES revoke budget.  Returns: errno.
 */
static int gfs2_journaled_truncate_range(struct inode *inode, loff_t offset,
					 loff_t length)
{
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	loff_t max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
	int error;

	while (length) {
		struct gfs2_trans *tr;
		loff_t chunk;
		unsigned int offs;

		chunk = length;
		if (chunk > max_chunk)
			chunk = max_chunk;

		/* End the chunk on a page boundary (after the leading
		   partial page, if any) so whole pages are truncated. */
		offs = offset & ~PAGE_MASK;
		if (offs && chunk > PAGE_SIZE)
			chunk = offs + ((chunk - offs) & PAGE_MASK);

		truncate_pagecache_range(inode, offset, chunk);
		offset += chunk;
		length -= chunk;

		/* If nothing was dirtied, keep using this transaction. */
		tr = current->journal_info;
		if (!test_bit(TR_TOUCHED, &tr->tr_flags))
			continue;

		gfs2_trans_end(sdp);
		error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
		if (error)
			return error;
	}
	return 0;
}
2016 
2017 int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length)
2018 {
2019 	struct inode *inode = file_inode(file);
2020 	struct gfs2_inode *ip = GFS2_I(inode);
2021 	struct gfs2_sbd *sdp = GFS2_SB(inode);
2022 	int error;
2023 
2024 	if (gfs2_is_jdata(ip))
2025 		error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_JDATA,
2026 					 GFS2_JTRUNC_REVOKES);
2027 	else
2028 		error = gfs2_trans_begin(sdp, RES_DINODE, 0);
2029 	if (error)
2030 		return error;
2031 
2032 	if (gfs2_is_stuffed(ip)) {
2033 		error = stuffed_zero_range(inode, offset, length);
2034 		if (error)
2035 			goto out;
2036 	} else {
2037 		unsigned int start_off, end_off, blocksize;
2038 
2039 		blocksize = i_blocksize(inode);
2040 		start_off = offset & (blocksize - 1);
2041 		end_off = (offset + length) & (blocksize - 1);
2042 		if (start_off) {
2043 			unsigned int len = length;
2044 			if (length > blocksize - start_off)
2045 				len = blocksize - start_off;
2046 			error = gfs2_block_zero_range(inode, offset, len);
2047 			if (error)
2048 				goto out;
2049 			if (start_off + length < blocksize)
2050 				end_off = 0;
2051 		}
2052 		if (end_off) {
2053 			error = gfs2_block_zero_range(inode,
2054 				offset + length - end_off, end_off);
2055 			if (error)
2056 				goto out;
2057 		}
2058 	}
2059 
2060 	if (gfs2_is_jdata(ip)) {
2061 		BUG_ON(!current->journal_info);
2062 		gfs2_journaled_truncate_range(inode, offset, length);
2063 	} else
2064 		truncate_pagecache_range(inode, offset, offset + length - 1);
2065 
2066 	file_update_time(file);
2067 	mark_inode_dirty(inode);
2068 
2069 	if (current->journal_info)
2070 		gfs2_trans_end(sdp);
2071 
2072 	if (!gfs2_is_stuffed(ip))
2073 		error = punch_hole(ip, offset, length);
2074 
2075 out:
2076 	if (current->journal_info)
2077 		gfs2_trans_end(sdp);
2078 	return error;
2079 }
2080