1 /*
2  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
3  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
4  *
5  * This copyrighted material is made available to anyone wishing to use,
6  * modify, copy, or redistribute it subject to the terms and conditions
7  * of the GNU General Public License version 2.
8  */
9 
10 #include <linux/spinlock.h>
11 #include <linux/completion.h>
12 #include <linux/buffer_head.h>
13 #include <linux/blkdev.h>
14 #include <linux/gfs2_ondisk.h>
15 #include <linux/crc32.h>
16 #include <linux/iomap.h>
17 
18 #include "gfs2.h"
19 #include "incore.h"
20 #include "bmap.h"
21 #include "glock.h"
22 #include "inode.h"
23 #include "meta_io.h"
24 #include "quota.h"
25 #include "rgrp.h"
26 #include "log.h"
27 #include "super.h"
28 #include "trans.h"
29 #include "dir.h"
30 #include "util.h"
31 #include "trace_gfs2.h"
32 
33 /* This doesn't need to be that large: the maximum number of 64-bit
34  * pointers in a 4k block is 512, so a __u16 index is sufficient.
35  * Keeping it small saves stack space.
36  */
37 struct metapath {
38 	struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT];
39 	__u16 mp_list[GFS2_MAX_META_HEIGHT];
40 	int mp_fheight; /* find_metapath height */
41 	int mp_aheight; /* actual height (lookup height) */
42 };
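/*
 * Capacity sketch (assuming the common 4KiB block size): an indirect
 * block holds (4096 - sizeof(struct gfs2_meta_header)) / 8 = 509
 * pointers (sd_inptrs) and the dinode itself holds
 * (4096 - sizeof(struct gfs2_dinode)) / 8 = 483 pointers (sd_diptrs),
 * so each mp_list[] index fits comfortably in a __u16.
 */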
43 
44 /**
45  * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
46  * @ip: the inode
47  * @dibh: the dinode buffer
48  * @block: the block number that was allocated
49  * @page: The (optional) page. Page 0 is looked up if @page is NULL or not page 0
50  *
51  * Returns: errno
52  */
53 
54 static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
55 			       u64 block, struct page *page)
56 {
57 	struct inode *inode = &ip->i_inode;
58 	struct buffer_head *bh;
59 	int release = 0;
60 
61 	if (!page || page->index) {
62 		page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
63 		if (!page)
64 			return -ENOMEM;
65 		release = 1;
66 	}
67 
68 	if (!PageUptodate(page)) {
69 		void *kaddr = kmap(page);
70 		u64 dsize = i_size_read(inode);
71 
72 		if (dsize > gfs2_max_stuffed_size(ip))
73 			dsize = gfs2_max_stuffed_size(ip);
74 
75 		memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
76 		memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
77 		kunmap(page);
78 
79 		SetPageUptodate(page);
80 	}
81 
82 	if (!page_has_buffers(page))
83 		create_empty_buffers(page, BIT(inode->i_blkbits),
84 				     BIT(BH_Uptodate));
85 
86 	bh = page_buffers(page);
87 
88 	if (!buffer_mapped(bh))
89 		map_bh(bh, inode->i_sb, block);
90 
91 	set_buffer_uptodate(bh);
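	/*
	 * Writeback inodes just dirty the buffer here; ordered inodes
	 * dirty it and also add it to the transaction; jdata inodes rely
	 * on the transaction alone to dirty it.
	 */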
92 	if (!gfs2_is_jdata(ip))
93 		mark_buffer_dirty(bh);
94 	if (!gfs2_is_writeback(ip))
95 		gfs2_trans_add_data(ip->i_gl, bh);
96 
97 	if (release) {
98 		unlock_page(page);
99 		put_page(page);
100 	}
101 
102 	return 0;
103 }
104 
105 /**
106  * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
107  * @ip: The GFS2 inode to unstuff
108  * @page: The (optional) page. This is looked up if @page is NULL
109  *
110  * This routine unstuffs a dinode and returns it to a "normal" state such
111  * that the height can be grown in the traditional way.
112  *
113  * Returns: errno
114  */
115 
116 int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
117 {
118 	struct buffer_head *bh, *dibh;
119 	struct gfs2_dinode *di;
120 	u64 block = 0;
121 	int isdir = gfs2_is_dir(ip);
122 	int error;
123 
124 	down_write(&ip->i_rw_mutex);
125 
126 	error = gfs2_meta_inode_buffer(ip, &dibh);
127 	if (error)
128 		goto out;
129 
130 	if (i_size_read(&ip->i_inode)) {
131 		/* Get a free block, fill it with the stuffed data,
132 		   and write it out to disk */
133 
134 		unsigned int n = 1;
135 		error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
136 		if (error)
137 			goto out_brelse;
138 		if (isdir) {
139 			gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1);
140 			error = gfs2_dir_get_new_buffer(ip, block, &bh);
141 			if (error)
142 				goto out_brelse;
143 			gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
144 					      dibh, sizeof(struct gfs2_dinode));
145 			brelse(bh);
146 		} else {
147 			error = gfs2_unstuffer_page(ip, dibh, block, page);
148 			if (error)
149 				goto out_brelse;
150 		}
151 	}
152 
153 	/*  Set up the pointer to the new block  */
154 
155 	gfs2_trans_add_meta(ip->i_gl, dibh);
156 	di = (struct gfs2_dinode *)dibh->b_data;
157 	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
158 
159 	if (i_size_read(&ip->i_inode)) {
160 		*(__be64 *)(di + 1) = cpu_to_be64(block);
161 		gfs2_add_inode_blocks(&ip->i_inode, 1);
162 		di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
163 	}
164 
165 	ip->i_height = 1;
166 	di->di_height = cpu_to_be16(1);
167 
168 out_brelse:
169 	brelse(dibh);
170 out:
171 	up_write(&ip->i_rw_mutex);
172 	return error;
173 }
174 
175 
176 /**
177  * find_metapath - Find path through the metadata tree
178  * @sdp: The superblock
179  * @block: The disk block to look up
180  * @mp: The metapath to return the result in
181  * @height: The pre-calculated height of the metadata tree
182  *
183  *   This routine fills in @mp with the path through the metadata tree
184  *   needed to reach disk block @block for a file of the given height.
185  *
186  *   Example:
187  *   Given:  a height 3 file, an offset of 101342453, and a
188  *   filesystem with a blocksize of 4096.
189  *
190  *   find_metapath() would return a struct metapath structure set to:
191  *   mp_fheight = 3, mp_list[0] = 0, mp_list[1] = 48,
192  *   and mp_list[2] = 165.
193  *
194  *   That means that in order to get to the block containing the byte at
195  *   offset 101342453, we would load the indirect block pointed to by pointer
196  *   0 in the dinode.  We would then load the indirect block pointed to by
197  *   pointer 48 in that indirect block.  We would then load the data block
198  *   pointed to by pointer 165 in that indirect block.
199  *
200  *             ----------------------------------------
201  *             | Dinode |                             |
202  *             |        |                            4|
203  *             |        |0 1 2 3 4 5                 9|
204  *             |        |                            6|
205  *             ----------------------------------------
206  *                       |
207  *                       |
208  *                       V
209  *             ----------------------------------------
210  *             | Indirect Block                       |
211  *             |                                     5|
212  *             |            4 4 4 4 4 5 5            1|
213  *             |0           5 6 7 8 9 0 1            2|
214  *             ----------------------------------------
215  *                                |
216  *                                |
217  *                                V
218  *             ----------------------------------------
219  *             | Indirect Block                       |
220  *             |                         1 1 1 1 1   5|
221  *             |                         6 6 6 6 6   1|
222  *             |0                        3 4 5 6 7   2|
223  *             ----------------------------------------
224  *                                           |
225  *                                           |
226  *                                           V
227  *             ----------------------------------------
228  *             | Data block containing offset         |
229  *             |            101342453                 |
230  *             |                                      |
231  *             |                                      |
232  *             ----------------------------------------
233  *
234  */
235 
236 static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
237 			  struct metapath *mp, unsigned int height)
238 {
239 	unsigned int i;
240 
241 	mp->mp_fheight = height;
242 	for (i = height; i--;)
243 		mp->mp_list[i] = do_div(block, sdp->sd_inptrs);
244 }
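/*
 * Worked example (using the simplified 512-pointers-per-block figure
 * from the comment above): for block 24741 at height 3, the loop peels
 * off remainders bottom-up:
 *
 *	mp_list[2] = 24741 % 512 = 165;  block becomes 48
 *	mp_list[1] =    48 % 512 =  48;  block becomes 0
 *	mp_list[0] =     0 % 512 =   0
 *
 * matching the mp_list[] values in the example above.
 */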
245 
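/*
 * metapath_branch_start - height at which a new branch of indirect
 * blocks must start when growing the tree height (see the
 * ALLOC_GROW_HEIGHT state below).  The new trunk of indirect blocks is
 * linked through pointer 0 at each level, so if the target path also
 * descends through pointer 0 (mp_list[0] == 0), the branch can reuse
 * the trunk block at height 1 and start at height 2 instead.
 */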
246 static inline unsigned int metapath_branch_start(const struct metapath *mp)
247 {
248 	if (mp->mp_list[0] == 0)
249 		return 2;
250 	return 1;
251 }
252 
253 /**
254  * metaptr1 - Return the first possible metadata pointer in a metapath buffer
255  * @height: The metadata height (0 = dinode)
256  * @mp: The metapath
257  */
258 static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp)
259 {
260 	struct buffer_head *bh = mp->mp_bh[height];
261 	if (height == 0)
262 		return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)));
263 	return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header)));
264 }
265 
266 /**
267  * metapointer - Return pointer to start of metadata in a buffer
268  * @height: The metadata height (0 = dinode)
269  * @mp: The metapath
270  *
271  * Return a pointer to the block number of the next height of the metadata
272  * tree given a buffer containing the pointer to the current height of the
273  * metadata tree.
274  */
275 
276 static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
277 {
278 	__be64 *p = metaptr1(height, mp);
279 	return p + mp->mp_list[height];
280 }
281 
282 static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end)
283 {
284 	const __be64 *t;
285 
286 	for (t = start; t < end; t++) {
287 		struct buffer_head *rabh;
288 
289 		if (!*t)
290 			continue;
291 
292 		rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE);
293 		if (trylock_buffer(rabh)) {
294 			if (!buffer_uptodate(rabh)) {
295 				rabh->b_end_io = end_buffer_read_sync;
296 				submit_bh(REQ_OP_READ,
297 					  REQ_RAHEAD | REQ_META | REQ_PRIO,
298 					  rabh);
299 				continue;
300 			}
301 			unlock_buffer(rabh);
302 		}
303 		brelse(rabh);
304 	}
305 }
306 
307 static int __fillup_metapath(struct gfs2_inode *ip, struct metapath *mp,
308 			     unsigned int x, unsigned int h)
309 {
310 	for (; x < h; x++) {
311 		__be64 *ptr = metapointer(x, mp);
312 		u64 dblock = be64_to_cpu(*ptr);
313 		int ret;
314 
315 		if (!dblock)
316 			break;
317 		ret = gfs2_meta_indirect_buffer(ip, x + 1, dblock, &mp->mp_bh[x + 1]);
318 		if (ret)
319 			return ret;
320 	}
321 	mp->mp_aheight = x + 1;
322 	return 0;
323 }
324 
325 /**
326  * lookup_metapath - Walk the metadata tree to a specific point
327  * @ip: The inode
328  * @mp: The metapath
329  *
330  * Assumes that the inode's buffer has already been looked up and
331  * hooked onto mp->mp_bh[0] and that the metapath has been initialised
332  * by find_metapath().
333  *
334  * If this function encounters part of the tree which has not been
335  * allocated, it returns the current height of the tree at the point
336  * at which it found the unallocated block. Blocks which are found are
337  * added to the mp->mp_bh[] list.
338  *
339  * Returns: error
340  */
341 
342 static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
343 {
344 	return __fillup_metapath(ip, mp, 0, ip->i_height - 1);
345 }
346 
347 /**
348  * fillup_metapath - fill up buffers for the metadata path to a specific height
349  * @ip: The inode
350  * @mp: The metapath
351  * @h: The height to which it should be mapped
352  *
353  * Similar to lookup_metapath, but does lookups for a range of heights
354  *
355  * Returns: error or the number of buffers filled
356  */
357 
358 static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h)
359 {
360 	unsigned int x = 0;
361 	int ret;
362 
363 	if (h) {
364 		/* find the first buffer we need to look up. */
365 		for (x = h - 1; x > 0; x--) {
366 			if (mp->mp_bh[x])
367 				break;
368 		}
369 	}
370 	ret = __fillup_metapath(ip, mp, x, h);
371 	if (ret)
372 		return ret;
373 	return mp->mp_aheight - x - 1;
374 }
375 
376 static inline void release_metapath(struct metapath *mp)
377 {
378 	int i;
379 
380 	for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) {
381 		if (mp->mp_bh[i] == NULL)
382 			break;
383 		brelse(mp->mp_bh[i]);
384 	}
385 }
386 
387 /**
388  * gfs2_extent_length - Returns length of an extent of blocks
389  * @start: Start of the buffer
390  * @len: Length of the buffer in bytes
391  * @ptr: Current position in the buffer
392  * @limit: Max extent length to return (0 = unlimited)
393  * @eob: Set to 1 if we hit "end of block"
394  *
395  * If the first block is zero (unallocated) it will return the number of
396  * unallocated blocks in the extent, otherwise it will return the number
397  * of contiguous blocks in the extent.
398  *
399  * Returns: The length of the extent (minimum of one block)
400  */
401 
402 static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, size_t limit, int *eob)
403 {
404 	const __be64 *end = (start + len);
405 	const __be64 *first = ptr;
406 	u64 d = be64_to_cpu(*ptr);
407 
408 	*eob = 0;
409 	do {
410 		ptr++;
411 		if (ptr >= end)
412 			break;
413 		if (limit && --limit == 0)
414 			break;
415 		if (d)
416 			d++;
417 	} while(be64_to_cpu(*ptr) == d);
418 	if (ptr >= end)
419 		*eob = 1;
420 	return (ptr - first);
421 }
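/*
 * Illustration: scanning the pointer run { 100, 101, 102, 0 } from its
 * first entry returns 3 (a three block allocated extent), while
 * { 0, 0, 0, 5 } also returns 3 (three unallocated blocks): d stays 0
 * for an unallocated run and is incremented for an allocated one, so
 * only consecutive values extend the extent.
 */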
422 
423 static inline void bmap_lock(struct gfs2_inode *ip, int create)
424 {
425 	if (create)
426 		down_write(&ip->i_rw_mutex);
427 	else
428 		down_read(&ip->i_rw_mutex);
429 }
430 
431 static inline void bmap_unlock(struct gfs2_inode *ip, int create)
432 {
433 	if (create)
434 		up_write(&ip->i_rw_mutex);
435 	else
436 		up_read(&ip->i_rw_mutex);
437 }
438 
439 static inline __be64 *gfs2_indirect_init(struct metapath *mp,
440 					 struct gfs2_glock *gl, unsigned int i,
441 					 unsigned offset, u64 bn)
442 {
443 	__be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data +
444 		       ((i > 1) ? sizeof(struct gfs2_meta_header) :
445 				 sizeof(struct gfs2_dinode)));
446 	BUG_ON(i < 1);
447 	BUG_ON(mp->mp_bh[i] != NULL);
448 	mp->mp_bh[i] = gfs2_meta_new(gl, bn);
449 	gfs2_trans_add_meta(gl, mp->mp_bh[i]);
450 	gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
451 	gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
452 	ptr += offset;
453 	*ptr = cpu_to_be64(bn);
454 	return ptr;
455 }
456 
457 enum alloc_state {
458 	ALLOC_DATA = 0,
459 	ALLOC_GROW_DEPTH = 1,
460 	ALLOC_GROW_HEIGHT = 2,
461 	/* ALLOC_UNSTUFF = 3,   TBD and rather complicated */
462 };
463 
464 /**
465  * gfs2_iomap_alloc - Build a metadata tree of the requested height
466  * @inode: The GFS2 inode
467  * @iomap: The iomap structure, used to return the resulting mapping
468  * @flags: iomap flags
469  * @mp: The metapath, with proper height information calculated
474  *
475  * In this routine we may have to alloc:
476  *   i) Indirect blocks to grow the metadata tree height
477  *  ii) Indirect blocks to fill in lower part of the metadata tree
478  * iii) Data blocks
479  *
480  * The function is in two parts. The first part works out the total
481  * number of blocks which we need. The second part does the actual
482  * allocation asking for an extent at a time (if enough contiguous free
483  * blocks are available, there will only be one request per bmap call)
484  * and uses the state machine to initialise the blocks in order.
485  *
486  * Returns: errno on error
487  */
488 
489 static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
490 			    unsigned flags, struct metapath *mp)
491 {
492 	struct gfs2_inode *ip = GFS2_I(inode);
493 	struct gfs2_sbd *sdp = GFS2_SB(inode);
494 	struct super_block *sb = sdp->sd_vfs;
495 	struct buffer_head *dibh = mp->mp_bh[0];
496 	u64 bn;
497 	unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
498 	unsigned dblks = 0;
499 	unsigned ptrs_per_blk;
500 	const unsigned end_of_metadata = mp->mp_fheight - 1;
501 	int ret;
502 	enum alloc_state state;
503 	__be64 *ptr;
504 	__be64 zero_bn = 0;
505 	size_t maxlen = iomap->length >> inode->i_blkbits;
506 
507 	BUG_ON(mp->mp_aheight < 1);
508 	BUG_ON(dibh == NULL);
509 
510 	gfs2_trans_add_meta(ip->i_gl, dibh);
511 
512 	if (mp->mp_fheight == mp->mp_aheight) {
513 		struct buffer_head *bh;
514 		int eob;
515 
516 		/* Bottom indirect block exists, find unalloced extent size */
517 		ptr = metapointer(end_of_metadata, mp);
518 		bh = mp->mp_bh[end_of_metadata];
519 		dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr,
520 					   maxlen, &eob);
521 		BUG_ON(dblks < 1);
522 		state = ALLOC_DATA;
523 	} else {
524 		/* Need to allocate indirect blocks */
525 		ptrs_per_blk = mp->mp_fheight > 1 ? sdp->sd_inptrs :
526 			sdp->sd_diptrs;
527 		dblks = min(maxlen, (size_t)(ptrs_per_blk -
528 					     mp->mp_list[end_of_metadata]));
529 		if (mp->mp_fheight == ip->i_height) {
530 			/* Writing into existing tree, extend tree down */
531 			iblks = mp->mp_fheight - mp->mp_aheight;
532 			state = ALLOC_GROW_DEPTH;
533 		} else {
534 			/* Building up tree height */
535 			state = ALLOC_GROW_HEIGHT;
536 			iblks = mp->mp_fheight - ip->i_height;
537 			branch_start = metapath_branch_start(mp);
538 			iblks += (mp->mp_fheight - branch_start);
539 		}
540 	}
541 
542 	/* start of the second part of the function (state machine) */
543 
544 	blks = dblks + iblks;
545 	i = mp->mp_aheight;
546 	do {
547 		int error;
548 		n = blks - alloced;
549 		error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
550 		if (error)
551 			return error;
552 		alloced += n;
553 		if (state != ALLOC_DATA || gfs2_is_jdata(ip))
554 			gfs2_trans_add_unrevoke(sdp, bn, n);
555 		switch (state) {
556 		/* Growing height of tree */
557 		case ALLOC_GROW_HEIGHT:
558 			if (i == 1) {
559 				ptr = (__be64 *)(dibh->b_data +
560 						 sizeof(struct gfs2_dinode));
561 				zero_bn = *ptr;
562 			}
563 			for (; i - 1 < mp->mp_fheight - ip->i_height && n > 0;
564 			     i++, n--)
565 				gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++);
566 			if (i - 1 == mp->mp_fheight - ip->i_height) {
567 				i--;
568 				gfs2_buffer_copy_tail(mp->mp_bh[i],
569 						sizeof(struct gfs2_meta_header),
570 						dibh, sizeof(struct gfs2_dinode));
571 				gfs2_buffer_clear_tail(dibh,
572 						sizeof(struct gfs2_dinode) +
573 						sizeof(__be64));
574 				ptr = (__be64 *)(mp->mp_bh[i]->b_data +
575 					sizeof(struct gfs2_meta_header));
576 				*ptr = zero_bn;
577 				state = ALLOC_GROW_DEPTH;
578 				for(i = branch_start; i < mp->mp_fheight; i++) {
579 					if (mp->mp_bh[i] == NULL)
580 						break;
581 					brelse(mp->mp_bh[i]);
582 					mp->mp_bh[i] = NULL;
583 				}
584 				i = branch_start;
585 			}
586 			if (n == 0)
587 				break;
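			/* fall through */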
588 		/* Branching from existing tree */
589 		case ALLOC_GROW_DEPTH:
590 			if (i > 1 && i < mp->mp_fheight)
591 				gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]);
592 			for (; i < mp->mp_fheight && n > 0; i++, n--)
593 				gfs2_indirect_init(mp, ip->i_gl, i,
594 						   mp->mp_list[i-1], bn++);
595 			if (i == mp->mp_fheight)
596 				state = ALLOC_DATA;
597 			if (n == 0)
598 				break;
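			/* fall through */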
599 		/* Tree complete, adding data blocks */
600 		case ALLOC_DATA:
601 			BUG_ON(n > dblks);
602 			BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
603 			gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]);
604 			dblks = n;
605 			ptr = metapointer(end_of_metadata, mp);
606 			iomap->addr = bn << inode->i_blkbits;
607 			iomap->flags |= IOMAP_F_NEW;
608 			while (n-- > 0)
609 				*ptr++ = cpu_to_be64(bn++);
610 			if (flags & IOMAP_ZERO) {
611 				ret = sb_issue_zeroout(sb, iomap->addr >> inode->i_blkbits,
612 						       dblks, GFP_NOFS);
613 				if (ret) {
614 					fs_err(sdp,
615 					       "Failed to zero data buffers\n");
616 					flags &= ~IOMAP_ZERO;
617 				}
618 			}
619 			break;
620 		}
621 	} while (iomap->addr == IOMAP_NULL_ADDR);
622 
623 	iomap->length = (u64)dblks << inode->i_blkbits;
624 	ip->i_height = mp->mp_fheight;
625 	gfs2_add_inode_blocks(&ip->i_inode, alloced);
626 	gfs2_dinode_out(ip, mp->mp_bh[0]->b_data);
627 	return 0;
628 }
629 
630 /**
631  * hole_size - figure out the size of a hole
632  * @inode: The inode
633  * @lblock: The logical starting block number
634  * @mp: The metapath
635  *
636  * Returns: The hole size in bytes
637  *
638  */
639 static u64 hole_size(struct inode *inode, sector_t lblock, struct metapath *mp)
640 {
641 	struct gfs2_inode *ip = GFS2_I(inode);
642 	struct gfs2_sbd *sdp = GFS2_SB(inode);
643 	struct metapath mp_eof;
644 	u64 factor = 1;
645 	int hgt;
646 	u64 holesz = 0;
647 	const __be64 *first, *end, *ptr;
648 	const struct buffer_head *bh;
649 	u64 lblock_stop = (i_size_read(inode) - 1) >> inode->i_blkbits;
650 	int zeroptrs;
651 	bool done = false;
652 
653 	/* Get another metapath, to the very last byte */
654 	find_metapath(sdp, lblock_stop, &mp_eof, ip->i_height);
655 	for (hgt = ip->i_height - 1; hgt >= 0 && !done; hgt--) {
656 		bh = mp->mp_bh[hgt];
657 		if (bh) {
658 			zeroptrs = 0;
659 			first = metapointer(hgt, mp);
660 			end = (const __be64 *)(bh->b_data + bh->b_size);
661 
662 			for (ptr = first; ptr < end; ptr++) {
663 				if (*ptr) {
664 					done = true;
665 					break;
666 				} else {
667 					zeroptrs++;
668 				}
669 			}
670 		} else {
671 			zeroptrs = sdp->sd_inptrs;
672 		}
673 		if (factor * zeroptrs >= lblock_stop - lblock + 1) {
674 			holesz = lblock_stop - lblock + 1;
675 			break;
676 		}
677 		holesz += factor * zeroptrs;
678 
679 		factor *= sdp->sd_inptrs;
680 		if (hgt && (mp->mp_list[hgt - 1] < mp_eof.mp_list[hgt - 1]))
681 			(mp->mp_list[hgt - 1])++;
682 	}
683 	return holesz << inode->i_blkbits;
684 }
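/*
 * Example (with sd_inptrs == 509): if the bottom indirect block has 10
 * NULL pointers from the start position and the level above has 2
 * further NULL pointers, the hole spans 10 + 2 * 509 blocks, capped at
 * the last block of the file.
 */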
685 
686 static void gfs2_stuffed_iomap(struct inode *inode, struct iomap *iomap)
687 {
688 	struct gfs2_inode *ip = GFS2_I(inode);
689 
690 	iomap->addr = (ip->i_no_addr << inode->i_blkbits) +
691 		      sizeof(struct gfs2_dinode);
692 	iomap->offset = 0;
693 	iomap->length = i_size_read(inode);
694 	iomap->type = IOMAP_MAPPED;
695 	iomap->flags = IOMAP_F_DATA_INLINE;
696 }
697 
698 /**
699  * gfs2_iomap_begin - Map blocks from an inode to disk blocks
700  * @inode: The inode
701  * @pos: Starting position in bytes
702  * @length: Length to map, in bytes
703  * @flags: iomap flags
704  * @iomap: The iomap structure
705  *
706  * Returns: errno
707  */
708 int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
709 		     unsigned flags, struct iomap *iomap)
710 {
711 	struct gfs2_inode *ip = GFS2_I(inode);
712 	struct gfs2_sbd *sdp = GFS2_SB(inode);
713 	struct metapath mp = { .mp_aheight = 1, };
714 	unsigned int factor = sdp->sd_sb.sb_bsize;
715 	const u64 *arr = sdp->sd_heightsize;
716 	__be64 *ptr;
717 	sector_t lblock;
718 	sector_t lend;
719 	int ret;
720 	int eob;
721 	unsigned int len;
722 	struct buffer_head *bh;
723 	u8 height;
724 
725 	trace_gfs2_iomap_start(ip, pos, length, flags);
726 	if (!length) {
727 		ret = -EINVAL;
728 		goto out;
729 	}
730 
731 	if ((flags & IOMAP_REPORT) && gfs2_is_stuffed(ip)) {
732 		gfs2_stuffed_iomap(inode, iomap);
733 		if (pos >= iomap->length) {
734 			ret = -ENOENT;
			goto out;
		}
735 		ret = 0;
736 		goto out;
737 	}
738 
739 	lblock = pos >> inode->i_blkbits;
740 	lend = (pos + length + sdp->sd_sb.sb_bsize - 1) >> inode->i_blkbits;
741 
742 	iomap->offset = lblock << inode->i_blkbits;
743 	iomap->addr = IOMAP_NULL_ADDR;
744 	iomap->type = IOMAP_HOLE;
745 	iomap->length = (u64)(lend - lblock) << inode->i_blkbits;
746 	iomap->flags = IOMAP_F_MERGED;
747 	bmap_lock(ip, 0);
748 
749 	/*
750 	 * Directory data blocks have a struct gfs2_meta_header header, so the
751 	 * remaining size is smaller than the filesystem block size.  Logical
752 	 * block numbers for directories are in units of this remaining size!
753 	 */
754 	if (gfs2_is_dir(ip)) {
755 		factor = sdp->sd_jbsize;
756 		arr = sdp->sd_jheightsize;
757 	}
758 
759 	ret = gfs2_meta_inode_buffer(ip, &mp.mp_bh[0]);
760 	if (ret)
761 		goto out_release;
762 
763 	height = ip->i_height;
764 	while ((lblock + 1) * factor > arr[height])
765 		height++;
766 	find_metapath(sdp, lblock, &mp, height);
767 	if (height > ip->i_height || gfs2_is_stuffed(ip))
768 		goto do_alloc;
769 
770 	ret = lookup_metapath(ip, &mp);
771 	if (ret)
772 		goto out_release;
773 
774 	if (mp.mp_aheight != ip->i_height)
775 		goto do_alloc;
776 
777 	ptr = metapointer(ip->i_height - 1, &mp);
778 	if (*ptr == 0)
779 		goto do_alloc;
780 
781 	iomap->type = IOMAP_MAPPED;
782 	iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits;
783 
784 	bh = mp.mp_bh[ip->i_height - 1];
785 	len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, lend - lblock, &eob);
786 	if (eob)
787 		iomap->flags |= IOMAP_F_BOUNDARY;
788 	iomap->length = (u64)len << inode->i_blkbits;
789 
790 	ret = 0;
791 
792 out_release:
793 	release_metapath(&mp);
794 	bmap_unlock(ip, 0);
795 out:
796 	trace_gfs2_iomap_end(ip, iomap, ret);
797 	return ret;
798 
799 do_alloc:
800 	if (!(flags & IOMAP_WRITE)) {
801 		if (pos >= i_size_read(inode)) {
802 			ret = -ENOENT;
803 			goto out_release;
804 		}
805 		ret = 0;
806 		iomap->length = hole_size(inode, lblock, &mp);
807 		goto out_release;
808 	}
809 
810 	ret = gfs2_iomap_alloc(inode, iomap, flags, &mp);
811 	goto out_release;
812 }
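/*
 * Note that IOMAP_REPORT callers (such as iomap_fiemap) only query the
 * existing mapping, so holes are reported via hole_size() above; only
 * IOMAP_WRITE requests reach gfs2_iomap_alloc().
 */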
813 
814 /**
815  * gfs2_block_map - Map a block from an inode to a disk block
816  * @inode: The inode
817  * @lblock: The logical block number
818  * @bh_map: The bh to be mapped
819  * @create: True if it's ok to allocate blocks to satisfy the request
820  *
821  * Sets buffer_mapped() if successful, sets buffer_boundary() if a
822  * read of metadata will be required before the next block can be
823  * mapped. Sets buffer_new() if new blocks were allocated.
824  *
825  * Returns: errno
826  */
827 
828 int gfs2_block_map(struct inode *inode, sector_t lblock,
829 		   struct buffer_head *bh_map, int create)
830 {
831 	struct gfs2_inode *ip = GFS2_I(inode);
832 	struct iomap iomap;
833 	int ret, flags = 0;
834 
835 	clear_buffer_mapped(bh_map);
836 	clear_buffer_new(bh_map);
837 	clear_buffer_boundary(bh_map);
838 	trace_gfs2_bmap(ip, bh_map, lblock, create, 1);
839 
840 	if (create)
841 		flags |= IOMAP_WRITE;
842 	if (buffer_zeronew(bh_map))
843 		flags |= IOMAP_ZERO;
844 	ret = gfs2_iomap_begin(inode, (loff_t)lblock << inode->i_blkbits,
845 			       bh_map->b_size, flags, &iomap);
846 	if (ret) {
847 		if (!create && ret == -ENOENT) {
848 			/* Return unmapped buffer beyond the end of file.  */
849 			ret = 0;
850 		}
851 		goto out;
852 	}
853 
854 	if (iomap.length > bh_map->b_size) {
855 		iomap.length = bh_map->b_size;
856 		iomap.flags &= ~IOMAP_F_BOUNDARY;
857 	}
858 	if (iomap.addr != IOMAP_NULL_ADDR)
859 		map_bh(bh_map, inode->i_sb, iomap.addr >> inode->i_blkbits);
860 	bh_map->b_size = iomap.length;
861 	if (iomap.flags & IOMAP_F_BOUNDARY)
862 		set_buffer_boundary(bh_map);
863 	if (iomap.flags & IOMAP_F_NEW)
864 		set_buffer_new(bh_map);
865 
866 out:
867 	trace_gfs2_bmap(ip, bh_map, lblock, create, ret);
868 	return ret;
869 }
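/*
 * gfs2_block_map() matches the get_block_t signature, which is what
 * lets GFS2's address space operations use it as their block-mapping
 * callback on top of the iomap machinery above.
 */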
870 
871 /*
872  * Deprecated: do not use in new code
873  */
874 int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen)
875 {
876 	struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 };
877 	int ret;
878 	int create = *new;
879 
880 	BUG_ON(!extlen);
881 	BUG_ON(!dblock);
882 	BUG_ON(!new);
883 
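	/* For plain lookups, request up to 32 (BIT(5)) blocks at once so
	   that a longer extent can be reported back in *extlen. */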
884 	bh.b_size = BIT(inode->i_blkbits + (create ? 0 : 5));
885 	ret = gfs2_block_map(inode, lblock, &bh, create);
886 	*extlen = bh.b_size >> inode->i_blkbits;
887 	*dblock = bh.b_blocknr;
888 	if (buffer_new(&bh))
889 		*new = 1;
890 	else
891 		*new = 0;
892 	return ret;
893 }
894 
895 /**
896  * gfs2_block_zero_range - Deal with zeroing out data
 * @inode: The inode
 * @from: The file offset at which to start zeroing
 * @length: The number of bytes to zero
897  *
898  * This is partly borrowed from ext3.
899  */
900 static int gfs2_block_zero_range(struct inode *inode, loff_t from,
901 				 unsigned int length)
902 {
903 	struct address_space *mapping = inode->i_mapping;
904 	struct gfs2_inode *ip = GFS2_I(inode);
905 	unsigned long index = from >> PAGE_SHIFT;
906 	unsigned offset = from & (PAGE_SIZE-1);
907 	unsigned blocksize, iblock, pos;
908 	struct buffer_head *bh;
909 	struct page *page;
910 	int err;
911 
912 	page = find_or_create_page(mapping, index, GFP_NOFS);
913 	if (!page)
914 		return 0;
915 
916 	blocksize = inode->i_sb->s_blocksize;
917 	iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
918 
919 	if (!page_has_buffers(page))
920 		create_empty_buffers(page, blocksize, 0);
921 
922 	/* Find the buffer that contains "offset" */
923 	bh = page_buffers(page);
924 	pos = blocksize;
925 	while (offset >= pos) {
926 		bh = bh->b_this_page;
927 		iblock++;
928 		pos += blocksize;
929 	}
930 
931 	err = 0;
932 
933 	if (!buffer_mapped(bh)) {
934 		gfs2_block_map(inode, iblock, bh, 0);
935 		/* unmapped? It's a hole - nothing to do */
936 		if (!buffer_mapped(bh))
937 			goto unlock;
938 	}
939 
940 	/* Ok, it's mapped. Make sure it's up-to-date */
941 	if (PageUptodate(page))
942 		set_buffer_uptodate(bh);
943 
944 	if (!buffer_uptodate(bh)) {
945 		err = -EIO;
946 		ll_rw_block(REQ_OP_READ, 0, 1, &bh);
947 		wait_on_buffer(bh);
948 		/* Uhhuh. Read error. Complain and punt. */
949 		if (!buffer_uptodate(bh))
950 			goto unlock;
951 		err = 0;
952 	}
953 
954 	if (!gfs2_is_writeback(ip))
955 		gfs2_trans_add_data(ip->i_gl, bh);
956 
957 	zero_user(page, offset, length);
958 	mark_buffer_dirty(bh);
959 unlock:
960 	unlock_page(page);
961 	put_page(page);
962 	return err;
963 }
964 
965 #define GFS2_JTRUNC_REVOKES 8192
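/*
 * With a 4KiB block size this caps each journaled-truncate transaction
 * at 8192 revokes, i.e. at most 32MiB (8192 * 4096 bytes) of page cache
 * truncated per transaction (see gfs2_journaled_truncate below).
 */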
966 
967 /**
968  * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files
969  * @inode: The inode being truncated
970  * @oldsize: The original (larger) size
971  * @newsize: The new smaller size
972  *
973  * With jdata files, we have to journal a revoke for each block which is
974  * truncated. As a result, we need to split this into separate transactions
975  * if the number of pages being truncated gets too large.
976  */
977 
978 static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize)
979 {
980 	struct gfs2_sbd *sdp = GFS2_SB(inode);
981 	u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
982 	u64 chunk;
983 	int error;
984 
985 	while (oldsize != newsize) {
986 		struct gfs2_trans *tr;
987 		unsigned int offs;
988 
989 		chunk = oldsize - newsize;
990 		if (chunk > max_chunk)
991 			chunk = max_chunk;
992 
993 		offs = oldsize & ~PAGE_MASK;
994 		if (offs && chunk > PAGE_SIZE)
995 			chunk = offs + ((chunk - offs) & PAGE_MASK);
996 
997 		truncate_pagecache(inode, oldsize - chunk);
998 		oldsize -= chunk;
999 
1000 		tr = current->journal_info;
1001 		if (!test_bit(TR_TOUCHED, &tr->tr_flags))
1002 			continue;
1003 
1004 		gfs2_trans_end(sdp);
1005 		error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
1006 		if (error)
1007 			return error;
1008 	}
1009 
1010 	return 0;
1011 }
1012 
1013 static int trunc_start(struct inode *inode, u64 newsize)
1014 {
1015 	struct gfs2_inode *ip = GFS2_I(inode);
1016 	struct gfs2_sbd *sdp = GFS2_SB(inode);
1017 	struct buffer_head *dibh = NULL;
1018 	int journaled = gfs2_is_jdata(ip);
1019 	u64 oldsize = inode->i_size;
1020 	int error;
1021 
1022 	if (journaled)
1023 		error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES);
1024 	else
1025 		error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1026 	if (error)
1027 		return error;
1028 
1029 	error = gfs2_meta_inode_buffer(ip, &dibh);
1030 	if (error)
1031 		goto out;
1032 
1033 	gfs2_trans_add_meta(ip->i_gl, dibh);
1034 
1035 	if (gfs2_is_stuffed(ip)) {
1036 		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
1037 	} else {
1038 		unsigned int blocksize = i_blocksize(inode);
1039 		unsigned int offs = newsize & (blocksize - 1);
1040 		if (offs) {
1041 			error = gfs2_block_zero_range(inode, newsize,
1042 						      blocksize - offs);
1043 			if (error)
1044 				goto out;
1045 		}
1046 		ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
1047 	}
1048 
1049 	i_size_write(inode, newsize);
1050 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1051 	gfs2_dinode_out(ip, dibh->b_data);
1052 
1053 	if (journaled)
1054 		error = gfs2_journaled_truncate(inode, oldsize, newsize);
1055 	else
1056 		truncate_pagecache(inode, newsize);
1057 
1058 out:
1059 	brelse(dibh);
1060 	if (current->journal_info)
1061 		gfs2_trans_end(sdp);
1062 	return error;
1063 }
1064 
1065 /**
1066  * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein
1067  * @ip: inode
1068  * @rd_gh: holder of the resource group glock
1069  * @bh: buffer head to sweep
1070  * @start: starting point in bh
1071  * @end: end point in bh
1072  * @meta: true if bh points to metadata (rather than data)
1073  * @btotal: place to keep count of total blocks freed
1074  *
1075  * We sweep a metadata buffer (provided by the metapath) for blocks we need to
1076  * free, and free them all. However, we do it one rgrp at a time. If this
1077  * block has references to multiple rgrps, we break it into individual
1078  * transactions. This allows other processes to use the rgrps while we're
1079  * focused on a single one, for better concurrency / performance.
1080  * At every transaction boundary, we rewrite the inode into the journal.
1081  * That way the bitmaps are kept consistent with the inode and we can recover
1082  * if we're interrupted by power-outages.
1083  *
1084  * Returns: 0, or return code if an error occurred.
1085  *          *btotal has the total number of blocks freed
1086  */
1087 static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh,
1088 			      struct buffer_head *bh, __be64 *start, __be64 *end,
1089 			      bool meta, u32 *btotal)
1090 {
1091 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1092 	struct gfs2_rgrpd *rgd;
1093 	struct gfs2_trans *tr;
1094 	__be64 *p;
1095 	int blks_outside_rgrp;
1096 	u64 bn, bstart, isize_blks;
1097 	s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */
1098 	int ret = 0;
1099 	bool buf_in_tr = false; /* buffer was added to transaction */
1100 
1101 more_rgrps:
1102 	rgd = NULL;
1103 	if (gfs2_holder_initialized(rd_gh)) {
1104 		rgd = gfs2_glock2rgrp(rd_gh->gh_gl);
1105 		gfs2_assert_withdraw(sdp,
1106 			     gfs2_glock_is_locked_by_me(rd_gh->gh_gl));
1107 	}
1108 	blks_outside_rgrp = 0;
1109 	bstart = 0;
1110 	blen = 0;
1111 
1112 	for (p = start; p < end; p++) {
1113 		if (!*p)
1114 			continue;
1115 		bn = be64_to_cpu(*p);
1116 
1117 		if (rgd) {
1118 			if (!rgrp_contains_block(rgd, bn)) {
1119 				blks_outside_rgrp++;
1120 				continue;
1121 			}
1122 		} else {
1123 			rgd = gfs2_blk2rgrpd(sdp, bn, true);
1124 			if (unlikely(!rgd)) {
1125 				ret = -EIO;
1126 				goto out;
1127 			}
1128 			ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
1129 						 0, rd_gh);
1130 			if (ret)
1131 				goto out;
1132 
1133 			/* Must be done with the rgrp glock held: */
1134 			if (gfs2_rs_active(&ip->i_res) &&
1135 			    rgd == ip->i_res.rs_rbm.rgd)
1136 				gfs2_rs_deltree(&ip->i_res);
1137 		}
1138 
1139 		/* The size of our transactions will be unknown until we
1140 		   actually process all the metadata blocks that relate to
1141 		   the rgrp. So we estimate. We know it can't be more than
1142 		   the dinode's i_blocks and we don't want to exceed the
1143 		   journal flush threshold, sd_log_thresh2. */
1144 		if (current->journal_info == NULL) {
1145 			unsigned int jblocks_rqsted, revokes;
1146 
1147 			jblocks_rqsted = rgd->rd_length + RES_DINODE +
1148 				RES_INDIRECT;
1149 			isize_blks = gfs2_get_inode_blocks(&ip->i_inode);
1150 			if (isize_blks > atomic_read(&sdp->sd_log_thresh2))
1151 				jblocks_rqsted +=
1152 					atomic_read(&sdp->sd_log_thresh2);
1153 			else
1154 				jblocks_rqsted += isize_blks;
1155 			revokes = jblocks_rqsted;
1156 			if (meta)
1157 				revokes += end - start;
1158 			else if (ip->i_depth)
1159 				revokes += sdp->sd_inptrs;
1160 			ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
1161 			if (ret)
1162 				goto out_unlock;
1163 			down_write(&ip->i_rw_mutex);
1164 		}
1165 		/* check if we will exceed the transaction blocks requested */
1166 		tr = current->journal_info;
1167 		if (tr->tr_num_buf_new + RES_STATFS +
1168 		    RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
1169 			/* We set blks_outside_rgrp to ensure the loop will
1170 			   be repeated for the same rgrp, but with a new
1171 			   transaction. */
1172 			blks_outside_rgrp++;
1173 			/* This next part is tricky. If the buffer was added
1174 			   to the transaction, we've already set some block
1175 			   pointers to 0, so we better follow through and free
1176 			   them, or we will introduce corruption (so break).
1177 			   This may be impossible, or at least rare, but I
1178 			   decided to cover the case regardless.
1179 
1180 			   If the buffer was not added to the transaction
1181 			   (this call), doing so would exceed our transaction
1182 			   size, so we need to end the transaction and start a
1183 			   new one (so goto). */
1184 
1185 			if (buf_in_tr)
1186 				break;
1187 			goto out_unlock;
1188 		}
1189 
1190 		gfs2_trans_add_meta(ip->i_gl, bh);
1191 		buf_in_tr = true;
1192 		*p = 0;
1193 		if (bstart + blen == bn) {
1194 			blen++;
1195 			continue;
1196 		}
1197 		if (bstart) {
1198 			__gfs2_free_blocks(ip, bstart, (u32)blen, meta);
1199 			(*btotal) += blen;
1200 			gfs2_add_inode_blocks(&ip->i_inode, -blen);
1201 		}
1202 		bstart = bn;
1203 		blen = 1;
1204 	}
1205 	if (bstart) {
1206 		__gfs2_free_blocks(ip, bstart, (u32)blen, meta);
1207 		(*btotal) += blen;
1208 		gfs2_add_inode_blocks(&ip->i_inode, -blen);
1209 	}
1210 out_unlock:
1211 	if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks
1212 					    outside the rgrp we just processed,
1213 					    do it all over again. */
1214 		if (current->journal_info) {
1215 			struct buffer_head *dibh;
1216 
1217 			ret = gfs2_meta_inode_buffer(ip, &dibh);
1218 			if (ret)
1219 				goto out;
1220 
1221 			/* Every transaction boundary, we rewrite the dinode
1222 			   to keep its di_blocks current in case of failure. */
1223 			ip->i_inode.i_mtime = ip->i_inode.i_ctime =
1224 				current_time(&ip->i_inode);
1225 			gfs2_trans_add_meta(ip->i_gl, dibh);
1226 			gfs2_dinode_out(ip, dibh->b_data);
1227 			brelse(dibh);
1228 			up_write(&ip->i_rw_mutex);
1229 			gfs2_trans_end(sdp);
1230 		}
1231 		gfs2_glock_dq_uninit(rd_gh);
1232 		cond_resched();
1233 		goto more_rgrps;
1234 	}
1235 out:
1236 	return ret;
1237 }
1238 
1239 static bool mp_eq_to_hgt(struct metapath *mp, __u16 *list, unsigned int h)
1240 {
1241 	if (memcmp(mp->mp_list, list, h * sizeof(mp->mp_list[0])))
1242 		return false;
1243 	return true;
1244 }
1245 
1246 /**
1247  * find_nonnull_ptr - find a non-null pointer given a metapath and height
 * @sdp: The superblock
1248  * @mp: starting metapath
1249  * @h: desired height to search
 * @end_list: metapath indexes of the end of the range, or NULL
 * @end_aligned: lowest height at which the end position is aligned
1250  *
1251  * Assumes the metapath is valid (with buffers) out to height h.
1252  * Returns: true if a non-null pointer was found in the metapath buffer
1253  *          false if all remaining pointers are NULL in the buffer
1254  */
1255 static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp,
1256 			     unsigned int h,
1257 			     __u16 *end_list, unsigned int end_aligned)
1258 {
1259 	struct buffer_head *bh = mp->mp_bh[h];
1260 	__be64 *first, *ptr, *end;
1261 
1262 	first = metaptr1(h, mp);
1263 	ptr = first + mp->mp_list[h];
1264 	end = (__be64 *)(bh->b_data + bh->b_size);
1265 	if (end_list && mp_eq_to_hgt(mp, end_list, h)) {
1266 		bool keep_end = h < end_aligned;
1267 		end = first + end_list[h] + keep_end;
1268 	}
1269 
1270 	while (ptr < end) {
1271 		if (*ptr) { /* if we have a non-null pointer */
1272 			mp->mp_list[h] = ptr - first;
1273 			h++;
1274 			if (h < GFS2_MAX_META_HEIGHT)
1275 				mp->mp_list[h] = 0;
1276 			return true;
1277 		}
1278 		ptr++;
1279 	}
1280 	return false;
1281 }
1282 
1283 enum dealloc_states {
1284 	DEALLOC_MP_FULL = 0,    /* Strip a metapath with all buffers read in */
1285 	DEALLOC_MP_LOWER = 1,   /* lower the metapath strip height */
1286 	DEALLOC_FILL_MP = 2,    /* Fill in the metapath to the given height */
1287 	DEALLOC_DONE = 3,       /* process complete */
1288 };
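/*
 * Sketch of the punch_hole() state machine: start in DEALLOC_MP_FULL
 * (or DEALLOC_FILL_MP if the metapath is incomplete), sweep the buffer
 * at the current strip height, then use DEALLOC_MP_LOWER to walk to the
 * next non-null pointer, bouncing back through DEALLOC_FILL_MP until
 * everything in range is stripped and DEALLOC_DONE is reached.
 */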
1289 
1290 static inline void
1291 metapointer_range(struct metapath *mp, int height,
1292 		  __u16 *start_list, unsigned int start_aligned,
1293 		  __u16 *end_list, unsigned int end_aligned,
1294 		  __be64 **start, __be64 **end)
1295 {
1296 	struct buffer_head *bh = mp->mp_bh[height];
1297 	__be64 *first;
1298 
1299 	first = metaptr1(height, mp);
1300 	*start = first;
1301 	if (mp_eq_to_hgt(mp, start_list, height)) {
1302 		bool keep_start = height < start_aligned;
1303 		*start = first + start_list[height] + keep_start;
1304 	}
1305 	*end = (__be64 *)(bh->b_data + bh->b_size);
1306 	if (end_list && mp_eq_to_hgt(mp, end_list, height)) {
1307 		bool keep_end = height < end_aligned;
1308 		*end = first + end_list[height] + keep_end;
1309 	}
1310 }
1311 
1312 static inline bool walk_done(struct gfs2_sbd *sdp,
1313 			     struct metapath *mp, int height,
1314 			     __u16 *end_list, unsigned int end_aligned)
1315 {
1316 	__u16 end;
1317 
1318 	if (end_list) {
1319 		bool keep_end = height < end_aligned;
1320 		if (!mp_eq_to_hgt(mp, end_list, height))
1321 			return false;
1322 		end = end_list[height] + keep_end;
1323 	} else
1324 		end = (height > 0) ? sdp->sd_inptrs : sdp->sd_diptrs;
1325 	return mp->mp_list[height] >= end;
1326 }
1327 
1328 /**
1329  * punch_hole - deallocate blocks in a file
1330  * @ip: inode to truncate
1331  * @offset: the start of the hole
1332  * @length: the size of the hole (or 0 for truncate)
1333  *
1334  * Punch a hole into a file or truncate a file at a given position.  This
1335  * function operates in whole blocks (@offset and @length are rounded
1336  * accordingly); partially filled blocks must be cleared otherwise.
1337  *
1338  * This function works from the bottom up, and from the right to the left. In
1339  * other words, it strips off the highest layer (data) before stripping any of
1340  * the metadata. Doing it this way is best in case the operation is interrupted
1341  * by power failure, etc.  The dinode is rewritten in every transaction to
1342  * guarantee integrity.
1343  */
1344 static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
1345 {
1346 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1347 	struct metapath mp = {};
1348 	struct buffer_head *dibh, *bh;
1349 	struct gfs2_holder rd_gh;
1350 	unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift;
1351 	u64 lblock = (offset + (1 << bsize_shift) - 1) >> bsize_shift;
1352 	__u16 start_list[GFS2_MAX_META_HEIGHT];
1353 	__u16 __end_list[GFS2_MAX_META_HEIGHT], *end_list = NULL;
1354 	unsigned int start_aligned, uninitialized_var(end_aligned);
1355 	unsigned int strip_h = ip->i_height - 1;
1356 	u32 btotal = 0;
1357 	int ret, state;
1358 	int mp_h; /* metapath buffers are read in to this height */
1359 	u64 prev_bnr = 0;
1360 	__be64 *start, *end;
1361 
1362 	/*
1363 	 * The start position of the hole is defined by lblock, start_list, and
1364 	 * start_aligned.  The end position of the hole is defined by lend,
1365 	 * end_list, and end_aligned.
1366 	 *
1367 	 * start_aligned and end_aligned define down to which height the start
1368 	 * and end positions are aligned to the metadata tree (i.e., the
1369 	 * position is a multiple of the metadata granularity at the height
1370 	 * above).  This determines at which heights additional meta pointers
1371 	 * needs to be preserved for the remaining data.
1372 	 */
1373 
1374 	if (length) {
1375 		u64 maxsize = sdp->sd_heightsize[ip->i_height];
1376 		u64 end_offset = offset + length;
1377 		u64 lend;
1378 
1379 		/*
1380 		 * Clip the end at the maximum file size for the given height:
1381 		 * that's how far the metadata goes; files bigger than that
1382 		 * will have additional layers of indirection.
1383 		 */
1384 		if (end_offset > maxsize)
1385 			end_offset = maxsize;
1386 		lend = end_offset >> bsize_shift;
1387 
1388 		if (lblock >= lend)
1389 			return 0;
1390 
1391 		find_metapath(sdp, lend, &mp, ip->i_height);
1392 		end_list = __end_list;
1393 		memcpy(end_list, mp.mp_list, sizeof(mp.mp_list));
1394 
1395 		for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
1396 			if (end_list[mp_h])
1397 				break;
1398 		}
1399 		end_aligned = mp_h;
1400 	}
1401 
1402 	find_metapath(sdp, lblock, &mp, ip->i_height);
1403 	memcpy(start_list, mp.mp_list, sizeof(start_list));
1404 
1405 	for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
1406 		if (start_list[mp_h])
1407 			break;
1408 	}
1409 	start_aligned = mp_h;
1410 
1411 	ret = gfs2_meta_inode_buffer(ip, &dibh);
1412 	if (ret)
1413 		return ret;
1414 
1415 	mp.mp_bh[0] = dibh;
1416 	ret = lookup_metapath(ip, &mp);
1417 	if (ret)
1418 		goto out_metapath;
1419 
1420 	/* issue read-ahead on metadata */
1421 	for (mp_h = 0; mp_h < mp.mp_aheight - 1; mp_h++) {
1422 		metapointer_range(&mp, mp_h, start_list, start_aligned,
1423 				  end_list, end_aligned, &start, &end);
1424 		gfs2_metapath_ra(ip->i_gl, start, end);
1425 	}
1426 
1427 	if (mp.mp_aheight == ip->i_height)
1428 		state = DEALLOC_MP_FULL; /* We have a complete metapath */
1429 	else
1430 		state = DEALLOC_FILL_MP; /* deal with partial metapath */
1431 
1432 	ret = gfs2_rindex_update(sdp);
1433 	if (ret)
1434 		goto out_metapath;
1435 
1436 	ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
1437 	if (ret)
1438 		goto out_metapath;
1439 	gfs2_holder_mark_uninitialized(&rd_gh);
1440 
1441 	mp_h = strip_h;
1442 
1443 	while (state != DEALLOC_DONE) {
1444 		switch (state) {
1445 		/* Truncate a full metapath at the given strip height.
1446 		 * Note that strip_h == mp_h in order to be in this state. */
1447 		case DEALLOC_MP_FULL:
1448 			bh = mp.mp_bh[mp_h];
1449 			gfs2_assert_withdraw(sdp, bh);
1450 			if (gfs2_assert_withdraw(sdp,
1451 						 prev_bnr != bh->b_blocknr)) {
1452 				printk(KERN_EMERG "GFS2: fsid=%s:inode %llu, "
1453 				       "block:%llu, i_h:%u, s_h:%u, mp_h:%u\n",
1454 				       sdp->sd_fsname,
1455 				       (unsigned long long)ip->i_no_addr,
1456 				       prev_bnr, ip->i_height, strip_h, mp_h);
1457 			}
1458 			prev_bnr = bh->b_blocknr;
1459 
1460 			if (gfs2_metatype_check(sdp, bh,
1461 						(mp_h ? GFS2_METATYPE_IN :
1462 							GFS2_METATYPE_DI))) {
1463 				ret = -EIO;
1464 				goto out;
1465 			}
1466 
1467 			/*
1468 			 * Below, passing end_aligned as 0 gives us the
1469 			 * metapointer range excluding the end point: the end
1470 			 * point is the first metapath we must not deallocate!
1471 			 */
1472 
1473 			metapointer_range(&mp, mp_h, start_list, start_aligned,
1474 					  end_list, 0 /* end_aligned */,
1475 					  &start, &end);
1476 			ret = sweep_bh_for_rgrps(ip, &rd_gh, mp.mp_bh[mp_h],
1477 						 start, end,
1478 						 mp_h != ip->i_height - 1,
1479 						 &btotal);
1480 
1481 			/* If we hit an error or just swept dinode buffer,
1482 			   just exit. */
1483 			if (ret || !mp_h) {
1484 				state = DEALLOC_DONE;
1485 				break;
1486 			}
1487 			state = DEALLOC_MP_LOWER;
1488 			break;
1489 
1490 		/* lower the metapath strip height */
1491 		case DEALLOC_MP_LOWER:
1492 			/* We're done with the current buffer, so release it,
1493 			   unless it's the dinode buffer. Then back up to the
1494 			   previous pointer. */
1495 			if (mp_h) {
1496 				brelse(mp.mp_bh[mp_h]);
1497 				mp.mp_bh[mp_h] = NULL;
1498 			}
1499 			/* If we can't get any lower in height, we've stripped
1500 			   off all we can. Next step is to back up and start
1501 			   stripping the previous level of metadata. */
1502 			if (mp_h == 0) {
1503 				strip_h--;
1504 				memcpy(mp.mp_list, start_list, sizeof(start_list));
1505 				mp_h = strip_h;
1506 				state = DEALLOC_FILL_MP;
1507 				break;
1508 			}
1509 			mp.mp_list[mp_h] = 0;
1510 			mp_h--; /* search one metadata height down */
1511 			mp.mp_list[mp_h]++;
1512 			if (walk_done(sdp, &mp, mp_h, end_list, end_aligned))
1513 				break;
1514 			/* Here we've found a part of the metapath that is not
1515 			 * allocated. We need to search at that height for the
1516 			 * next non-null pointer. */
1517 			if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned)) {
1518 				state = DEALLOC_FILL_MP;
1519 				mp_h++;
1520 			}
1521 			/* No more non-null pointers at this height. Back up
1522 			   to the previous height and try again. */
1523 			break; /* loop around in the same state */
1524 
1525 		/* Fill the metapath with buffers to the given height. */
1526 		case DEALLOC_FILL_MP:
1527 			/* Fill the buffers out to the current height. */
1528 			ret = fillup_metapath(ip, &mp, mp_h);
1529 			if (ret < 0)
1530 				goto out;
1531 
1532 			/* issue read-ahead on metadata */
1533 			if (mp.mp_aheight > 1) {
1534 				for (; ret > 1; ret--) {
1535 					metapointer_range(&mp, mp.mp_aheight - ret,
1536 							  start_list, start_aligned,
1537 							  end_list, end_aligned,
1538 							  &start, &end);
1539 					gfs2_metapath_ra(ip->i_gl, start, end);
1540 				}
1541 			}
1542 
1543 			/* If buffers found for the entire strip height */
1544 			if (mp.mp_aheight - 1 == strip_h) {
1545 				state = DEALLOC_MP_FULL;
1546 				break;
1547 			}
1548 			if (mp.mp_aheight < ip->i_height) /* We have a partial height */
1549 				mp_h = mp.mp_aheight - 1;
1550 
1551 			/* If we find a non-null block pointer, crawl a bit
1552 			   higher up in the metapath and try again, otherwise
1553 			   we need to look lower for a new starting point. */
1554 			if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned))
1555 				mp_h++;
1556 			else
1557 				state = DEALLOC_MP_LOWER;
1558 			break;
1559 		}
1560 	}
1561 
1562 	if (btotal) {
1563 		if (current->journal_info == NULL) {
1564 			ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS +
1565 					       RES_QUOTA, 0);
1566 			if (ret)
1567 				goto out;
1568 			down_write(&ip->i_rw_mutex);
1569 		}
1570 		gfs2_statfs_change(sdp, 0, +btotal, 0);
1571 		gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
1572 				  ip->i_inode.i_gid);
1573 		ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1574 		gfs2_trans_add_meta(ip->i_gl, dibh);
1575 		gfs2_dinode_out(ip, dibh->b_data);
1576 		up_write(&ip->i_rw_mutex);
1577 		gfs2_trans_end(sdp);
1578 	}
1579 
1580 out:
1581 	if (gfs2_holder_initialized(&rd_gh))
1582 		gfs2_glock_dq_uninit(&rd_gh);
1583 	if (current->journal_info) {
1584 		up_write(&ip->i_rw_mutex);
1585 		gfs2_trans_end(sdp);
1586 		cond_resched();
1587 	}
1588 	gfs2_quota_unhold(ip);
1589 out_metapath:
1590 	release_metapath(&mp);
1591 	return ret;
1592 }
1593 
1594 static int trunc_end(struct gfs2_inode *ip)
1595 {
1596 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1597 	struct buffer_head *dibh;
1598 	int error;
1599 
1600 	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1601 	if (error)
1602 		return error;
1603 
1604 	down_write(&ip->i_rw_mutex);
1605 
1606 	error = gfs2_meta_inode_buffer(ip, &dibh);
1607 	if (error)
1608 		goto out;
1609 
1610 	if (!i_size_read(&ip->i_inode)) {
1611 		ip->i_height = 0;
1612 		ip->i_goal = ip->i_no_addr;
1613 		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
1614 		gfs2_ordered_del_inode(ip);
1615 	}
1616 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1617 	ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
1618 
1619 	gfs2_trans_add_meta(ip->i_gl, dibh);
1620 	gfs2_dinode_out(ip, dibh->b_data);
1621 	brelse(dibh);
1622 
1623 out:
1624 	up_write(&ip->i_rw_mutex);
1625 	gfs2_trans_end(sdp);
1626 	return error;
1627 }
1628 
1629 /**
1630  * do_shrink - make a file smaller
1631  * @inode: the inode
1632  * @newsize: the size to make the file
1633  *
1634  * Called with an exclusive lock on @inode. @newsize must be
1635  * equal to or smaller than the current inode size.
1636  *
1637  * Returns: errno
1638  */
1639 
1640 static int do_shrink(struct inode *inode, u64 newsize)
1641 {
1642 	struct gfs2_inode *ip = GFS2_I(inode);
1643 	int error;
1644 
1645 	error = trunc_start(inode, newsize);
1646 	if (error < 0)
1647 		return error;
1648 	if (gfs2_is_stuffed(ip))
1649 		return 0;
1650 
1651 	error = punch_hole(ip, newsize, 0);
1652 	if (error == 0)
1653 		error = trunc_end(ip);
1654 
1655 	return error;
1656 }
1657 
1658 void gfs2_trim_blocks(struct inode *inode)
1659 {
1660 	int ret;
1661 
1662 	ret = do_shrink(inode, inode->i_size);
1663 	WARN_ON(ret != 0);
1664 }
1665 
1666 /**
1667  * do_grow - Touch and update inode size
1668  * @inode: The inode
1669  * @size: The new size
1670  *
1671  * This function updates the timestamps on the inode and
1672  * may also increase the size of the inode. This function
1673  * must not be called with @size any smaller than the current
1674  * inode size.
1675  *
1676  * Although it is not strictly required to unstuff files here,
1677  * earlier versions of GFS2 have a bug in the stuffed file reading
1678  * code which will result in a buffer overrun if the size is larger
1679  * than the max stuffed file size. In order to prevent this from
1680  * occurring, such files are unstuffed, but in other cases we can
1681  * just update the inode size directly.
1682  *
1683  * Returns: 0 on success, or -ve on error
1684  */
1685 
1686 static int do_grow(struct inode *inode, u64 size)
1687 {
1688 	struct gfs2_inode *ip = GFS2_I(inode);
1689 	struct gfs2_sbd *sdp = GFS2_SB(inode);
1690 	struct gfs2_alloc_parms ap = { .target = 1, };
1691 	struct buffer_head *dibh;
1692 	int error;
1693 	int unstuff = 0;
1694 
1695 	if (gfs2_is_stuffed(ip) && size > gfs2_max_stuffed_size(ip)) {
1696 		error = gfs2_quota_lock_check(ip, &ap);
1697 		if (error)
1698 			return error;
1699 
1700 		error = gfs2_inplace_reserve(ip, &ap);
1701 		if (error)
1702 			goto do_grow_qunlock;
1703 		unstuff = 1;
1704 	}
1705 
1706 	error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
1707 				 (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
1708 				  0 : RES_QUOTA), 0);
1709 	if (error)
1710 		goto do_grow_release;
1711 
1712 	if (unstuff) {
1713 		error = gfs2_unstuff_dinode(ip, NULL);
1714 		if (error)
1715 			goto do_end_trans;
1716 	}
1717 
1718 	error = gfs2_meta_inode_buffer(ip, &dibh);
1719 	if (error)
1720 		goto do_end_trans;
1721 
1722 	i_size_write(inode, size);
1723 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1724 	gfs2_trans_add_meta(ip->i_gl, dibh);
1725 	gfs2_dinode_out(ip, dibh->b_data);
1726 	brelse(dibh);
1727 
1728 do_end_trans:
1729 	gfs2_trans_end(sdp);
1730 do_grow_release:
1731 	if (unstuff) {
1732 		gfs2_inplace_release(ip);
1733 do_grow_qunlock:
1734 		gfs2_quota_unlock(ip);
1735 	}
1736 	return error;
1737 }
1738 
1739 /**
1740  * gfs2_setattr_size - make a file a given size
1741  * @inode: the inode
1742  * @newsize: the size to make the file
1743  *
1744  * The file size can grow, shrink, or stay the same size. This
1745  * is called holding i_mutex and an exclusive glock on the inode
1746  * in question.
1747  *
1748  * Returns: errno
1749  */
1750 
1751 int gfs2_setattr_size(struct inode *inode, u64 newsize)
1752 {
1753 	struct gfs2_inode *ip = GFS2_I(inode);
1754 	int ret;
1755 
1756 	BUG_ON(!S_ISREG(inode->i_mode));
1757 
1758 	ret = inode_newsize_ok(inode, newsize);
1759 	if (ret)
1760 		return ret;
1761 
1762 	inode_dio_wait(inode);
1763 
1764 	ret = gfs2_rsqa_alloc(ip);
1765 	if (ret)
1766 		goto out;
1767 
1768 	if (newsize >= inode->i_size) {
1769 		ret = do_grow(inode, newsize);
1770 		goto out;
1771 	}
1772 
1773 	ret = do_shrink(inode, newsize);
1774 out:
1775 	gfs2_rsqa_delete(ip, NULL);
1776 	return ret;
1777 }
1778 
1779 int gfs2_truncatei_resume(struct gfs2_inode *ip)
1780 {
1781 	int error;
1782 	error = punch_hole(ip, i_size_read(&ip->i_inode), 0);
1783 	if (!error)
1784 		error = trunc_end(ip);
1785 	return error;
1786 }
1787 
1788 int gfs2_file_dealloc(struct gfs2_inode *ip)
1789 {
1790 	return punch_hole(ip, 0, 0);
1791 }
1792 
1793 /**
1794  * gfs2_free_journal_extents - Free cached journal bmap info
1795  * @jd: The journal
1796  *
1797  */
1798 
1799 void gfs2_free_journal_extents(struct gfs2_jdesc *jd)
1800 {
1801 	struct gfs2_journal_extent *jext;
1802 
1803 	while(!list_empty(&jd->extent_list)) {
1804 		jext = list_entry(jd->extent_list.next, struct gfs2_journal_extent, list);
1805 		list_del(&jext->list);
1806 		kfree(jext);
1807 	}
1808 }
1809 
1810 /**
1811  * gfs2_add_jextent - Add or merge a new extent to extent cache
1812  * @jd: The journal descriptor
1813  * @lblock: The logical block at start of new extent
1814  * @dblock: The physical block at start of new extent
1815  * @blocks: Size of extent in fs blocks
1816  *
1817  * Returns: 0 on success or -ENOMEM
1818  */
1819 
1820 static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks)
1821 {
1822 	struct gfs2_journal_extent *jext;
1823 
1824 	if (!list_empty(&jd->extent_list)) {
1825 		jext = list_last_entry(&jd->extent_list, struct gfs2_journal_extent, list);
1826 		if ((jext->dblock + jext->blocks) == dblock) {
1827 			jext->blocks += blocks;
1828 			return 0;
1829 		}
1830 	}
1831 
1832 	jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS);
1833 	if (jext == NULL)
1834 		return -ENOMEM;
1835 	jext->dblock = dblock;
1836 	jext->lblock = lblock;
1837 	jext->blocks = blocks;
1838 	list_add_tail(&jext->list, &jd->extent_list);
1839 	jd->nr_extents++;
1840 	return 0;
1841 }
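
/*
 * Illustrative sketch (not part of the original file): physically
 * contiguous mappings merge into one extent; a gap in the disk blocks
 * starts a new extent.  Error handling is omitted, and jd->extent_list
 * is assumed to be initialised.
 */
static void __maybe_unused example_jextent_merge(struct gfs2_jdesc *jd)
{
	gfs2_add_jextent(jd, 0, 100, 8);   /* new extent: dblocks 100..107 */
	gfs2_add_jextent(jd, 8, 108, 8);   /* contiguous: grows to 100..115 */
	gfs2_add_jextent(jd, 16, 200, 4);  /* gap, so a second extent */
}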
1842 
1843 /**
1844  * gfs2_map_journal_extents - Cache journal bmap info
1845  * @sdp: The super block
1846  * @jd: The journal to map
1847  *
1848  * Create a reusable "extent" mapping from all logical
1849  * blocks to all physical blocks for the given journal.  This saves
1850  * time when writing journal blocks.  Most journals have only one
1851  * extent that maps all of their logical blocks, because mkfs.gfs2
1852  * lays the journal blocks out sequentially to maximize performance,
1853  * so a single extent usually covers the entire file length.
1854  * However, gfs2_jadd can run while other file activity is going on,
1855  * so the journals it adds may not be sequential.  Less likely, users
1856  * may have created their own journals by mounting the metafs and
1857  * laying them out by hand; such journals can also have several
1858  * extents.
1859  *
1860  * Returns: 0 on success, or a negative errno on failure
1861  */
1862 
1863 int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
1864 {
1865 	u64 lblock = 0;
1866 	u64 lblock_stop;
1867 	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
1868 	struct buffer_head bh;
1869 	unsigned int shift = sdp->sd_sb.sb_bsize_shift;
1870 	u64 size;
1871 	int rc;
1872 
1873 	lblock_stop = i_size_read(jd->jd_inode) >> shift;
1874 	size = (lblock_stop - lblock) << shift;
1875 	jd->nr_extents = 0;
1876 	WARN_ON(!list_empty(&jd->extent_list));
1877 
1878 	do {
1879 		bh.b_state = 0;
1880 		bh.b_blocknr = 0;
1881 		bh.b_size = size;
1882 		rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0);
1883 		if (rc || !buffer_mapped(&bh))
1884 			goto fail;
1885 		rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift);
1886 		if (rc)
1887 			goto fail;
1888 		size -= bh.b_size;
1889 		lblock += (bh.b_size >> ip->i_inode.i_blkbits);
1890 	} while (size > 0);
1891 
1892 	fs_info(sdp, "journal %u mapped with %u extents\n", jd->jd_jid,
1893 		jd->nr_extents);
1894 	return 0;
1895 
1896 fail:
1897 	fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n",
1898 		rc, jd->jd_jid,
1899 		(unsigned long long)(i_size_read(jd->jd_inode) - size),
1900 		jd->nr_extents);
1901 	fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n",
1902 		rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr,
1903 		bh.b_state, (unsigned long long)bh.b_size);
1904 	gfs2_free_journal_extents(jd);
1905 	return rc;
1906 }
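
/*
 * Illustrative sketch (not part of the original file): once the cache
 * is built, a journal-relative logical block can be translated to a
 * disk block by walking the (usually single-entry) extent list instead
 * of calling gfs2_block_map().  "example_jext_lookup" is a hypothetical
 * name; the real lookup lives in the log code.
 */
static u64 __maybe_unused example_jext_lookup(struct gfs2_jdesc *jd, u64 lblock)
{
	struct gfs2_journal_extent *jext;

	list_for_each_entry(jext, &jd->extent_list, list) {
		if (lblock >= jext->lblock &&
		    lblock < jext->lblock + jext->blocks)
			return jext->dblock + (lblock - jext->lblock);
	}
	return 0;	/* not mapped */
}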
1907 
1908 /**
1909  * gfs2_write_alloc_required - figure out if a write will require an allocation
1910  * @ip: the file being written to
1911  * @offset: the offset to write to
1912  * @len: the number of bytes being written
1913  *
1914  * Returns: 1 if an allocation is required, 0 otherwise
1915  */
1916 
1917 int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1918 			      unsigned int len)
1919 {
1920 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1921 	struct buffer_head bh;
1922 	unsigned int shift;
1923 	u64 lblock, lblock_stop, size;
1924 	u64 end_of_file;
1925 
1926 	if (!len)
1927 		return 0;
1928 
1929 	if (gfs2_is_stuffed(ip)) {
1930 		if (offset + len > gfs2_max_stuffed_size(ip))
1931 			return 1;
1932 		return 0;
1933 	}
1934 
1935 	shift = sdp->sd_sb.sb_bsize_shift;
1936 	BUG_ON(gfs2_is_dir(ip));
1937 	end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
1938 	lblock = offset >> shift;
1939 	lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1940 	if (lblock_stop > end_of_file)
1941 		return 1;
1942 
1943 	size = (lblock_stop - lblock) << shift;
1944 	do {
1945 		bh.b_state = 0;
1946 		bh.b_size = size;
1947 		gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
1948 		if (!buffer_mapped(&bh))
1949 			return 1;
1950 		size -= bh.b_size;
1951 		lblock += (bh.b_size >> ip->i_inode.i_blkbits);
1952 	} while (size > 0);
1953 
1954 	return 0;
1955 }
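
/*
 * Illustrative sketch (not part of the original file): a write path can
 * use gfs2_write_alloc_required() to decide whether quota and block
 * reservations are needed before opening a transaction, along the same
 * lines as do_grow() above.  "example_prepare_write" is a hypothetical
 * name and error handling is trimmed.
 */
static int __maybe_unused example_prepare_write(struct gfs2_inode *ip,
						u64 offset, unsigned int len)
{
	/* reserve at least one block; a real caller sizes the target */
	struct gfs2_alloc_parms ap = { .target = 1, };
	int error;

	if (!gfs2_write_alloc_required(ip, offset, len))
		return 0;	/* every block is already allocated */

	error = gfs2_quota_lock_check(ip, &ap);
	if (error)
		return error;
	error = gfs2_inplace_reserve(ip, &ap);
	if (error)
		gfs2_quota_unlock(ip);
	return error;
}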
1956 
1957 static int stuffed_zero_range(struct inode *inode, loff_t offset, loff_t length)
1958 {
1959 	struct gfs2_inode *ip = GFS2_I(inode);
1960 	struct buffer_head *dibh;
1961 	int error;
1962 
1963 	if (offset >= inode->i_size)
1964 		return 0;
1965 	if (offset + length > inode->i_size)
1966 		length = inode->i_size - offset;
1967 
1968 	error = gfs2_meta_inode_buffer(ip, &dibh);
1969 	if (error)
1970 		return error;
1971 	gfs2_trans_add_meta(ip->i_gl, dibh);
1972 	memset(dibh->b_data + sizeof(struct gfs2_dinode) + offset, 0,
1973 	       length);
1974 	brelse(dibh);
1975 	return 0;
1976 }
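
/*
 * Illustrative note (not part of the original file): for a stuffed
 * inode the file contents live in the dinode block itself, directly
 * after the on-disk header, so byte @offset of the file sits at
 * dibh->b_data + sizeof(struct gfs2_dinode) + offset and the memset()
 * above needs no block mapping at all.
 */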
1977 
1978 static int gfs2_journaled_truncate_range(struct inode *inode, loff_t offset,
1979 					 loff_t length)
1980 {
1981 	struct gfs2_sbd *sdp = GFS2_SB(inode);
1982 	loff_t max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
1983 	int error;
1984 
1985 	while (length) {
1986 		struct gfs2_trans *tr;
1987 		loff_t chunk;
1988 		unsigned int offs;
1989 
1990 		chunk = length;
1991 		if (chunk > max_chunk)
1992 			chunk = max_chunk;
1993 
1994 		offs = offset & ~PAGE_MASK;
1995 		if (offs && chunk > PAGE_SIZE)
1996 			chunk = offs + ((chunk - offs) & PAGE_MASK);
1997 
1998 		truncate_pagecache_range(inode, offset, offset + chunk - 1);
1999 		offset += chunk;
2000 		length -= chunk;
2001 
2002 		tr = current->journal_info;
2003 		if (!test_bit(TR_TOUCHED, &tr->tr_flags))
2004 			continue;
2005 
2006 		gfs2_trans_end(sdp);
2007 		error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
2008 		if (error)
2009 			return error;
2010 	}
2011 	return 0;
2012 }
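
/*
 * Illustrative note (not part of the original file): max_chunk above
 * bounds how much page cache a single transaction may truncate.  Every
 * journaled data block removed needs a revoke record, GFS2_JTRUNC_REVOKES
 * caps the revokes per transaction, and the loop renews the transaction
 * between chunks whenever the current one has been touched.
 */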
2013 
2014 int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length)
2015 {
2016 	struct inode *inode = file_inode(file);
2017 	struct gfs2_inode *ip = GFS2_I(inode);
2018 	struct gfs2_sbd *sdp = GFS2_SB(inode);
2019 	int error;
2020 
2021 	if (gfs2_is_jdata(ip))
2022 		error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_JDATA,
2023 					 GFS2_JTRUNC_REVOKES);
2024 	else
2025 		error = gfs2_trans_begin(sdp, RES_DINODE, 0);
2026 	if (error)
2027 		return error;
2028 
2029 	if (gfs2_is_stuffed(ip)) {
2030 		error = stuffed_zero_range(inode, offset, length);
2031 		if (error)
2032 			goto out;
2033 	} else {
2034 		unsigned int start_off, end_off, blocksize;
2035 
2036 		blocksize = i_blocksize(inode);
2037 		start_off = offset & (blocksize - 1);
2038 		end_off = (offset + length) & (blocksize - 1);
2039 		if (start_off) {
2040 			unsigned int len = length;
2041 			if (length > blocksize - start_off)
2042 				len = blocksize - start_off;
2043 			error = gfs2_block_zero_range(inode, offset, len);
2044 			if (error)
2045 				goto out;
2046 			if (start_off + length < blocksize)
2047 				end_off = 0;
2048 		}
2049 		if (end_off) {
2050 			error = gfs2_block_zero_range(inode,
2051 				offset + length - end_off, end_off);
2052 			if (error)
2053 				goto out;
2054 		}
2055 	}
2056 
2057 	if (gfs2_is_jdata(ip)) {
2058 		BUG_ON(!current->journal_info);
2059 		error = gfs2_journaled_truncate_range(inode, offset, length);
		if (error)
			goto out;
2060 	} else {
2061 		truncate_pagecache_range(inode, offset, offset + length - 1);
	}
2062 
2063 	file_update_time(file);
2064 	mark_inode_dirty(inode);
2065 
2066 	if (current->journal_info)
2067 		gfs2_trans_end(sdp);
2068 
2069 	if (!gfs2_is_stuffed(ip))
2070 		error = punch_hole(ip, offset, length);
2071 
2072 out:
2073 	if (current->journal_info)
2074 		gfs2_trans_end(sdp);
2075 	return error;
2076 }
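
/*
 * Illustrative sketch (not part of the original file): this helper
 * implements the backend of fallocate(FALLOC_FL_PUNCH_HOLE); a
 * file-operations wrapper would dispatch to it roughly as below.
 * "example_fallocate" is a hypothetical name, and FALLOC_FL_PUNCH_HOLE
 * comes from <linux/falloc.h>.
 */
static long __maybe_unused example_fallocate(struct file *file, int mode,
					     loff_t offset, loff_t len)
{
	if (mode & FALLOC_FL_PUNCH_HOLE)
		return __gfs2_punch_hole(file, offset, len);
	return -EOPNOTSUPP;	/* other modes are handled elsewhere */
}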
2077