xref: /openbmc/linux/fs/gfs2/bmap.c (revision 91db9311)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
4  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
5  */
6 
7 #include <linux/spinlock.h>
8 #include <linux/completion.h>
9 #include <linux/buffer_head.h>
10 #include <linux/blkdev.h>
11 #include <linux/gfs2_ondisk.h>
12 #include <linux/crc32.h>
13 #include <linux/iomap.h>
14 #include <linux/ktime.h>
15 
16 #include "gfs2.h"
17 #include "incore.h"
18 #include "bmap.h"
19 #include "glock.h"
20 #include "inode.h"
21 #include "meta_io.h"
22 #include "quota.h"
23 #include "rgrp.h"
24 #include "log.h"
25 #include "super.h"
26 #include "trans.h"
27 #include "dir.h"
28 #include "util.h"
29 #include "aops.h"
30 #include "trace_gfs2.h"
31 
32 /* This doesn't need to be that large as max 64 bit pointers in a 4k
33  * block is 512, so __u16 is fine for that. It saves stack space to
34  * keep it small.
35  */
36 struct metapath {
37 	struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT];
38 	__u16 mp_list[GFS2_MAX_META_HEIGHT];
39 	int mp_fheight; /* find_metapath height */
40 	int mp_aheight; /* actual height (lookup height) */
41 };
42 
43 static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length);
44 
45 /**
46  * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
47  * @ip: the inode
48  * @dibh: the dinode buffer
49  * @block: the block number that was allocated
50  * @page: The (optional) page. This is looked up if @page is NULL
51  *
52  * Returns: errno
53  */
54 
55 static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
56 			       u64 block, struct page *page)
57 {
58 	struct inode *inode = &ip->i_inode;
59 	struct buffer_head *bh;
60 	int release = 0;
61 
62 	if (!page || page->index) {
63 		page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
64 		if (!page)
65 			return -ENOMEM;
66 		release = 1;
67 	}
68 
69 	if (!PageUptodate(page)) {
70 		void *kaddr = kmap(page);
71 		u64 dsize = i_size_read(inode);
72 
73 		if (dsize > gfs2_max_stuffed_size(ip))
74 			dsize = gfs2_max_stuffed_size(ip);
75 
76 		memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
77 		memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
78 		kunmap(page);
79 
80 		SetPageUptodate(page);
81 	}
82 
83 	if (!page_has_buffers(page))
84 		create_empty_buffers(page, BIT(inode->i_blkbits),
85 				     BIT(BH_Uptodate));
86 
87 	bh = page_buffers(page);
88 
89 	if (!buffer_mapped(bh))
90 		map_bh(bh, inode->i_sb, block);
91 
92 	set_buffer_uptodate(bh);
93 	if (gfs2_is_jdata(ip))
94 		gfs2_trans_add_data(ip->i_gl, bh);
95 	else {
96 		mark_buffer_dirty(bh);
97 		gfs2_ordered_add_inode(ip);
98 	}
99 
100 	if (release) {
101 		unlock_page(page);
102 		put_page(page);
103 	}
104 
105 	return 0;
106 }
107 
108 /**
109  * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
110  * @ip: The GFS2 inode to unstuff
111  * @page: The (optional) page. This is looked up if the @page is NULL
112  *
113  * This routine unstuffs a dinode and returns it to a "normal" state such
114  * that the height can be grown in the traditional way.
115  *
116  * Returns: errno
117  */
118 
119 int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
120 {
121 	struct buffer_head *bh, *dibh;
122 	struct gfs2_dinode *di;
123 	u64 block = 0;
124 	int isdir = gfs2_is_dir(ip);
125 	int error;
126 
127 	down_write(&ip->i_rw_mutex);
128 
129 	error = gfs2_meta_inode_buffer(ip, &dibh);
130 	if (error)
131 		goto out;
132 
133 	if (i_size_read(&ip->i_inode)) {
134 		/* Get a free block, fill it with the stuffed data,
135 		   and write it out to disk */
136 
137 		unsigned int n = 1;
138 		error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
139 		if (error)
140 			goto out_brelse;
141 		if (isdir) {
142 			gfs2_trans_remove_revoke(GFS2_SB(&ip->i_inode), block, 1);
143 			error = gfs2_dir_get_new_buffer(ip, block, &bh);
144 			if (error)
145 				goto out_brelse;
146 			gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
147 					      dibh, sizeof(struct gfs2_dinode));
148 			brelse(bh);
149 		} else {
150 			error = gfs2_unstuffer_page(ip, dibh, block, page);
151 			if (error)
152 				goto out_brelse;
153 		}
154 	}
155 
156 	/*  Set up the pointer to the new block  */
157 
158 	gfs2_trans_add_meta(ip->i_gl, dibh);
159 	di = (struct gfs2_dinode *)dibh->b_data;
160 	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
161 
162 	if (i_size_read(&ip->i_inode)) {
163 		*(__be64 *)(di + 1) = cpu_to_be64(block);
164 		gfs2_add_inode_blocks(&ip->i_inode, 1);
165 		di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
166 	}
167 
168 	ip->i_height = 1;
169 	di->di_height = cpu_to_be16(1);
170 
171 out_brelse:
172 	brelse(dibh);
173 out:
174 	up_write(&ip->i_rw_mutex);
175 	return error;
176 }
177 
178 
179 /**
180  * find_metapath - Find path through the metadata tree
181  * @sdp: The superblock
182  * @block: The disk block to look up
183  * @mp: The metapath to return the result in
184  * @height: The pre-calculated height of the metadata tree
185  *
186  *   This routine returns a struct metapath structure that defines a path
187  *   through the metadata of inode "ip" to get to block "block".
188  *
189  *   Example:
190  *   Given:  "ip" is a height 3 file, "offset" is 101342453, and this is a
191  *   filesystem with a blocksize of 4096.
192  *
193  *   find_metapath() would return a struct metapath structure set to:
194  *   mp_fheight = 3, mp_list[0] = 0, mp_list[1] = 48, and mp_list[2] = 165.
195  *
196  *   That means that in order to get to the block containing the byte at
197  *   offset 101342453, we would load the indirect block pointed to by pointer
198  *   0 in the dinode.  We would then load the indirect block pointed to by
199  *   pointer 48 in that indirect block.  We would then load the data block
200  *   pointed to by pointer 165 in that indirect block.
201  *
202  *             ----------------------------------------
203  *             | Dinode |                             |
204  *             |        |                            4|
205  *             |        |0 1 2 3 4 5                 9|
206  *             |        |                            6|
207  *             ----------------------------------------
208  *                       |
209  *                       |
210  *                       V
211  *             ----------------------------------------
212  *             | Indirect Block                       |
213  *             |                                     5|
214  *             |            4 4 4 4 4 5 5            1|
215  *             |0           5 6 7 8 9 0 1            2|
216  *             ----------------------------------------
217  *                                |
218  *                                |
219  *                                V
220  *             ----------------------------------------
221  *             | Indirect Block                       |
222  *             |                         1 1 1 1 1   5|
223  *             |                         6 6 6 6 6   1|
224  *             |0                        3 4 5 6 7   2|
225  *             ----------------------------------------
226  *                                           |
227  *                                           |
228  *                                           V
229  *             ----------------------------------------
230  *             | Data block containing offset         |
231  *             |            101342453                 |
232  *             |                                      |
233  *             |                                      |
234  *             ----------------------------------------
235  *
236  */
237 
238 static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
239 			  struct metapath *mp, unsigned int height)
240 {
241 	unsigned int i;
242 
243 	mp->mp_fheight = height;
244 	for (i = height; i--;)
245 		mp->mp_list[i] = do_div(block, sdp->sd_inptrs);
246 }
247 
248 static inline unsigned int metapath_branch_start(const struct metapath *mp)
249 {
250 	if (mp->mp_list[0] == 0)
251 		return 2;
252 	return 1;
253 }
254 
255 /**
256  * metaptr1 - Return the first possible metadata pointer in a metapath buffer
257  * @height: The metadata height (0 = dinode)
258  * @mp: The metapath
259  */
260 static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp)
261 {
262 	struct buffer_head *bh = mp->mp_bh[height];
263 	if (height == 0)
264 		return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)));
265 	return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header)));
266 }
267 
268 /**
269  * metapointer - Return pointer to start of metadata in a buffer
270  * @height: The metadata height (0 = dinode)
271  * @mp: The metapath
272  *
273  * Return a pointer to the block number of the next height of the metadata
274  * tree given a buffer containing the pointer to the current height of the
275  * metadata tree.
276  */
277 
278 static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
279 {
280 	__be64 *p = metaptr1(height, mp);
281 	return p + mp->mp_list[height];
282 }
283 
284 static inline const __be64 *metaend(unsigned int height, const struct metapath *mp)
285 {
286 	const struct buffer_head *bh = mp->mp_bh[height];
287 	return (const __be64 *)(bh->b_data + bh->b_size);
288 }
289 
290 static void clone_metapath(struct metapath *clone, struct metapath *mp)
291 {
292 	unsigned int hgt;
293 
294 	*clone = *mp;
295 	for (hgt = 0; hgt < mp->mp_aheight; hgt++)
296 		get_bh(clone->mp_bh[hgt]);
297 }
298 
299 static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end)
300 {
301 	const __be64 *t;
302 
303 	for (t = start; t < end; t++) {
304 		struct buffer_head *rabh;
305 
306 		if (!*t)
307 			continue;
308 
309 		rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE);
310 		if (trylock_buffer(rabh)) {
311 			if (!buffer_uptodate(rabh)) {
312 				rabh->b_end_io = end_buffer_read_sync;
313 				submit_bh(REQ_OP_READ,
314 					  REQ_RAHEAD | REQ_META | REQ_PRIO,
315 					  rabh);
316 				continue;
317 			}
318 			unlock_buffer(rabh);
319 		}
320 		brelse(rabh);
321 	}
322 }
323 
324 static int __fillup_metapath(struct gfs2_inode *ip, struct metapath *mp,
325 			     unsigned int x, unsigned int h)
326 {
327 	for (; x < h; x++) {
328 		__be64 *ptr = metapointer(x, mp);
329 		u64 dblock = be64_to_cpu(*ptr);
330 		int ret;
331 
332 		if (!dblock)
333 			break;
334 		ret = gfs2_meta_indirect_buffer(ip, x + 1, dblock, &mp->mp_bh[x + 1]);
335 		if (ret)
336 			return ret;
337 	}
338 	mp->mp_aheight = x + 1;
339 	return 0;
340 }
341 
342 /**
343  * lookup_metapath - Walk the metadata tree to a specific point
344  * @ip: The inode
345  * @mp: The metapath
346  *
347  * Assumes that the inode's buffer has already been looked up and
348  * hooked onto mp->mp_bh[0] and that the metapath has been initialised
349  * by find_metapath().
350  *
351  * If this function encounters part of the tree which has not been
352  * allocated, it returns the current height of the tree at the point
353  * at which it found the unallocated block. Blocks which are found are
354  * added to the mp->mp_bh[] list.
355  *
356  * Returns: error
357  */
358 
359 static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
360 {
361 	return __fillup_metapath(ip, mp, 0, ip->i_height - 1);
362 }
363 
364 /**
365  * fillup_metapath - fill up buffers for the metadata path to a specific height
366  * @ip: The inode
367  * @mp: The metapath
368  * @h: The height to which it should be mapped
369  *
370  * Similar to lookup_metapath, but does lookups for a range of heights
371  *
372  * Returns: error or the number of buffers filled
373  */
374 
375 static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h)
376 {
377 	unsigned int x = 0;
378 	int ret;
379 
380 	if (h) {
381 		/* find the first buffer we need to look up. */
382 		for (x = h - 1; x > 0; x--) {
383 			if (mp->mp_bh[x])
384 				break;
385 		}
386 	}
387 	ret = __fillup_metapath(ip, mp, x, h);
388 	if (ret)
389 		return ret;
390 	return mp->mp_aheight - x - 1;
391 }
392 
393 static void release_metapath(struct metapath *mp)
394 {
395 	int i;
396 
397 	for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) {
398 		if (mp->mp_bh[i] == NULL)
399 			break;
400 		brelse(mp->mp_bh[i]);
401 		mp->mp_bh[i] = NULL;
402 	}
403 }
404 
405 /**
406  * gfs2_extent_length - Returns length of an extent of blocks
407  * @bh: The metadata block
408  * @ptr: Current position in @bh
409  * @limit: Max extent length to return
410  * @eob: Set to 1 if we hit "end of block"
411  *
412  * Returns: The length of the extent (minimum of one block)
413  */
414 
415 static inline unsigned int gfs2_extent_length(struct buffer_head *bh, __be64 *ptr, size_t limit, int *eob)
416 {
417 	const __be64 *end = (__be64 *)(bh->b_data + bh->b_size);
418 	const __be64 *first = ptr;
419 	u64 d = be64_to_cpu(*ptr);
420 
421 	*eob = 0;
422 	do {
423 		ptr++;
424 		if (ptr >= end)
425 			break;
426 		d++;
427 	} while(be64_to_cpu(*ptr) == d);
428 	if (ptr >= end)
429 		*eob = 1;
430 	return ptr - first;
431 }
432 
/*
 * Callback invoked by gfs2_walk_metadata() for each run of pointers it
 * visits.  [start, end) is the pointer range to inspect, @factor is the
 * multiplier the walk applies to a pointer count at the current height,
 * and @data is the opaque argument given to gfs2_walk_metadata().
 *
 * The return value steers the walk: WALK_STOP ends it, WALK_NEXT moves on
 * past this range, and any other value is a pointer within the range at
 * which the walk should continue one level further down.
 */
433 typedef const __be64 *(*gfs2_metadata_walker)(
434 		struct metapath *mp,
435 		const __be64 *start, const __be64 *end,
436 		u64 factor, void *data);
437 
/* Sentinel return values for walkers; neither is ever dereferenced here. */
438 #define WALK_STOP ((__be64 *)0)
439 #define WALK_NEXT ((__be64 *)1)
440 
/*
 * gfs2_walk_metadata - walk an inode's metadata pointers, calling @walker
 * @inode: The inode
 * @lblock: Not referenced in the body of this function
 * @len: Remaining distance to cover, in the same units as @factor counts
 *       (blocks, going by gfs2_hole_size()'s description of its @len)
 * @mp: The metapath at which the walk starts
 * @walker: Callback invoked on each pointer range
 * @data: Opaque argument passed through to @walker
 *
 * Once the walk has to move away from the caller's position it continues
 * on a private clone of @mp, which is released before returning.  Note
 * that mp->mp_list[] of the caller's @mp can still be updated before the
 * clone is made.
 *
 * Returns: 0, or a negative error from fillup_metapath()
 */
441 static int gfs2_walk_metadata(struct inode *inode, sector_t lblock,
442 		u64 len, struct metapath *mp, gfs2_metadata_walker walker,
443 		void *data)
444 {
445 	struct metapath clone;
446 	struct gfs2_inode *ip = GFS2_I(inode);
447 	struct gfs2_sbd *sdp = GFS2_SB(inode);
448 	const __be64 *start, *end, *ptr;
449 	u64 factor = 1;
450 	unsigned int hgt;
451 	int ret = 0;
452 
	/*
	 * Scale factor by sd_inptrs for every height between the bottom of
	 * the tree and the height the metapath actually reaches; hgt ends
	 * up at the deepest height for which @mp has a buffer.
	 */
453 	for (hgt = ip->i_height - 1; hgt >= mp->mp_aheight; hgt--)
454 		factor *= sdp->sd_inptrs;
455 
456 	for (;;) {
457 		u64 step;
458 
459 		/* Walk indirect block. */
460 		start = metapointer(hgt, mp);
461 		end = metaend(hgt, mp);
462 
		/* Do not let the walker look beyond the remaining length. */
463 		step = (end - start) * factor;
464 		if (step > len)
465 			end = start + DIV_ROUND_UP_ULL(len, factor);
466 
467 		ptr = walker(mp, start, end, factor, data);
468 		if (ptr == WALK_STOP)
469 			break;
470 		if (step >= len)
471 			break;
472 		len -= step;
473 		if (ptr != WALK_NEXT) {
			/* The walker asked to descend at ptr. */
474 			BUG_ON(!*ptr);
475 			mp->mp_list[hgt] += ptr - start;
476 			goto fill_up_metapath;
477 		}
478 
479 lower_metapath:
480 		/* Decrease height of metapath. */
481 		if (mp != &clone) {
482 			clone_metapath(&clone, mp);
483 			mp = &clone;
484 		}
485 		brelse(mp->mp_bh[hgt]);
486 		mp->mp_bh[hgt] = NULL;
487 		if (!hgt)
488 			break;
489 		hgt--;
490 		factor *= sdp->sd_inptrs;
491 
492 		/* Advance in metadata tree. */
493 		(mp->mp_list[hgt])++;
494 		start = metapointer(hgt, mp);
495 		end = metaend(hgt, mp);
496 		if (start >= end) {
			/* Ran off the end of this block as well: go up again. */
497 			mp->mp_list[hgt] = 0;
498 			if (!hgt)
499 				break;
500 			goto lower_metapath;
501 		}
502 
503 fill_up_metapath:
504 		/* Increase height of metapath. */
505 		if (mp != &clone) {
506 			clone_metapath(&clone, mp);
507 			mp = &clone;
508 		}
509 		ret = fillup_metapath(ip, mp, ip->i_height - 1);
510 		if (ret < 0)
511 			break;
		/*
		 * ret is the number of buffers that were added; undo the
		 * factor scaling once per added level.  This loop also
		 * brings ret back to 0.
		 */
512 		hgt += ret;
513 		for (; ret; ret--)
514 			do_div(factor, sdp->sd_inptrs);
515 		mp->mp_aheight = hgt + 1;
516 	}
	/* Only the private clone's references are dropped here. */
517 	if (mp == &clone)
518 		release_metapath(mp);
519 	return ret;
520 }
521 
522 struct gfs2_hole_walker_args {
523 	u64 blocks;
524 };
525 
526 static const __be64 *gfs2_hole_walker(struct metapath *mp,
527 		const __be64 *start, const __be64 *end,
528 		u64 factor, void *data)
529 {
530 	struct gfs2_hole_walker_args *args = data;
531 	const __be64 *ptr;
532 
533 	for (ptr = start; ptr < end; ptr++) {
534 		if (*ptr) {
535 			args->blocks += (ptr - start) * factor;
536 			if (mp->mp_aheight == mp->mp_fheight)
537 				return WALK_STOP;
538 			return ptr;  /* increase height */
539 		}
540 	}
541 	args->blocks += (end - start) * factor;
542 	return WALK_NEXT;
543 }
544 
545 /**
546  * gfs2_hole_size - figure out the size of a hole
547  * @inode: The inode
548  * @lblock: The logical starting block number
549  * @len: How far to look (in blocks)
550  * @mp: The metapath at lblock
551  * @iomap: The iomap to store the hole size in
552  *
553  * This function modifies @mp.
554  *
555  * Returns: errno on error
556  */
557 static int gfs2_hole_size(struct inode *inode, sector_t lblock, u64 len,
558 			  struct metapath *mp, struct iomap *iomap)
559 {
560 	struct gfs2_hole_walker_args args = { };
561 	int ret = 0;
562 
563 	ret = gfs2_walk_metadata(inode, lblock, len, mp, gfs2_hole_walker, &args);
564 	if (!ret)
565 		iomap->length = args.blocks << inode->i_blkbits;
566 	return ret;
567 }
568 
569 static inline __be64 *gfs2_indirect_init(struct metapath *mp,
570 					 struct gfs2_glock *gl, unsigned int i,
571 					 unsigned offset, u64 bn)
572 {
573 	__be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data +
574 		       ((i > 1) ? sizeof(struct gfs2_meta_header) :
575 				 sizeof(struct gfs2_dinode)));
576 	BUG_ON(i < 1);
577 	BUG_ON(mp->mp_bh[i] != NULL);
578 	mp->mp_bh[i] = gfs2_meta_new(gl, bn);
579 	gfs2_trans_add_meta(gl, mp->mp_bh[i]);
580 	gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
581 	gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
582 	ptr += offset;
583 	*ptr = cpu_to_be64(bn);
584 	return ptr;
585 }
586 
/* States of the allocation state machine in gfs2_iomap_alloc(). */
enum alloc_state {
	ALLOC_DATA = 0,		/* tree complete, adding data blocks */
	ALLOC_GROW_DEPTH = 1,	/* branching down from the existing tree */
	ALLOC_GROW_HEIGHT = 2,	/* growing the height of the tree */
	/* ALLOC_UNSTUFF = 3,   TBD and rather complicated */
};
593 
594 /**
595  * gfs2_iomap_alloc - Build a metadata tree of the requested height
596  * @inode: The GFS2 inode
597  * @iomap: The iomap structure
598  * @mp: The metapath, with proper height information calculated
599  *
600  * In this routine we may have to alloc:
601  *   i) Indirect blocks to grow the metadata tree height
602  *  ii) Indirect blocks to fill in lower part of the metadata tree
603  * iii) Data blocks
604  *
605  * This function is called after gfs2_iomap_get, which works out the
606  * total number of blocks which we need via gfs2_alloc_size.
607  *
608  * We then do the actual allocation asking for an extent at a time (if
609  * enough contiguous free blocks are available, there will only be one
610  * allocation request per call) and uses the state machine to initialise
611  * the blocks in order.
612  *
613  * Right now, this function will allocate at most one indirect block
614  * worth of data -- with a default block size of 4K, that's slightly
615  * less than 2M.  If this limitation is ever removed to allow huge
616  * allocations, we would probably still want to limit the iomap size we
617  * return to avoid stalling other tasks during huge writes; the next
618  * iomap iteration would then find the blocks already allocated.
619  *
620  * Returns: errno on error
621  */
622 
623 static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
624 			    struct metapath *mp)
625 {
626 	struct gfs2_inode *ip = GFS2_I(inode);
627 	struct gfs2_sbd *sdp = GFS2_SB(inode);
628 	struct buffer_head *dibh = mp->mp_bh[0];
629 	u64 bn;
630 	unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
631 	size_t dblks = iomap->length >> inode->i_blkbits;
632 	const unsigned end_of_metadata = mp->mp_fheight - 1;
633 	int ret;
634 	enum alloc_state state;
635 	__be64 *ptr;
636 	__be64 zero_bn = 0;
637 
638 	BUG_ON(mp->mp_aheight < 1);
639 	BUG_ON(dibh == NULL);
640 	BUG_ON(dblks < 1);
641 
642 	gfs2_trans_add_meta(ip->i_gl, dibh);
643 
644 	down_write(&ip->i_rw_mutex);
645 
	/*
	 * First part: pick the initial state and work out how many indirect
	 * blocks (iblks) are needed on top of the dblks data blocks.
	 */
646 	if (mp->mp_fheight == mp->mp_aheight) {
647 		/* Bottom indirect block exists */
648 		state = ALLOC_DATA;
649 	} else {
650 		/* Need to allocate indirect blocks */
651 		if (mp->mp_fheight == ip->i_height) {
652 			/* Writing into existing tree, extend tree down */
653 			iblks = mp->mp_fheight - mp->mp_aheight;
654 			state = ALLOC_GROW_DEPTH;
655 		} else {
656 			/* Building up tree height */
657 			state = ALLOC_GROW_HEIGHT;
658 			iblks = mp->mp_fheight - ip->i_height;
659 			branch_start = metapath_branch_start(mp);
660 			iblks += (mp->mp_fheight - branch_start);
661 		}
662 	}
663 
664 	/* start of the second part of the function (state machine) */
665 
666 	blks = dblks + iblks;
667 	i = mp->mp_aheight;
668 	do {
		/*
		 * Ask for everything still outstanding; n is passed by
		 * pointer and is presumably updated to the size of the
		 * extent actually obtained, starting at bn.
		 */
669 		n = blks - alloced;
670 		ret = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
671 		if (ret)
672 			goto out;
673 		alloced += n;
674 		if (state != ALLOC_DATA || gfs2_is_jdata(ip))
675 			gfs2_trans_remove_revoke(sdp, bn, n);
		/*
		 * Each case consumes blocks from the extent (bn++, n--) and
		 * falls through to the next state while blocks remain.
		 */
676 		switch (state) {
677 		/* Growing height of tree */
678 		case ALLOC_GROW_HEIGHT:
679 			if (i == 1) {
				/* Remember the dinode's first pointer. */
680 				ptr = (__be64 *)(dibh->b_data +
681 						 sizeof(struct gfs2_dinode));
682 				zero_bn = *ptr;
683 			}
684 			for (; i - 1 < mp->mp_fheight - ip->i_height && n > 0;
685 			     i++, n--)
686 				gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++);
687 			if (i - 1 == mp->mp_fheight - ip->i_height) {
				/*
				 * All new top levels exist: move the dinode's
				 * pointers into the lowest new indirect block,
				 * clear the dinode behind its first pointer,
				 * and put the saved pointer into the first
				 * slot of that indirect block.
				 */
688 				i--;
689 				gfs2_buffer_copy_tail(mp->mp_bh[i],
690 						sizeof(struct gfs2_meta_header),
691 						dibh, sizeof(struct gfs2_dinode));
692 				gfs2_buffer_clear_tail(dibh,
693 						sizeof(struct gfs2_dinode) +
694 						sizeof(__be64));
695 				ptr = (__be64 *)(mp->mp_bh[i]->b_data +
696 					sizeof(struct gfs2_meta_header));
697 				*ptr = zero_bn;
698 				state = ALLOC_GROW_DEPTH;
				/* Drop the buffers from branch_start down. */
699 				for(i = branch_start; i < mp->mp_fheight; i++) {
700 					if (mp->mp_bh[i] == NULL)
701 						break;
702 					brelse(mp->mp_bh[i]);
703 					mp->mp_bh[i] = NULL;
704 				}
705 				i = branch_start;
706 			}
707 			if (n == 0)
708 				break;
709 		/* fall through - To branching from existing tree */
710 		case ALLOC_GROW_DEPTH:
711 			if (i > 1 && i < mp->mp_fheight)
712 				gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]);
713 			for (; i < mp->mp_fheight && n > 0; i++, n--)
714 				gfs2_indirect_init(mp, ip->i_gl, i,
715 						   mp->mp_list[i-1], bn++);
716 			if (i == mp->mp_fheight)
717 				state = ALLOC_DATA;
718 			if (n == 0)
719 				break;
720 		/* fall through - To tree complete, adding data blocks */
721 		case ALLOC_DATA:
722 			BUG_ON(n > dblks);
723 			BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
724 			gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]);
			/* The mapping covers only the data blocks of this extent. */
725 			dblks = n;
726 			ptr = metapointer(end_of_metadata, mp);
727 			iomap->addr = bn << inode->i_blkbits;
728 			iomap->flags |= IOMAP_F_MERGED | IOMAP_F_NEW;
729 			while (n-- > 0)
730 				*ptr++ = cpu_to_be64(bn++);
731 			break;
732 		}
	/* Setting iomap->addr in ALLOC_DATA is what ends the loop. */
733 	} while (iomap->addr == IOMAP_NULL_ADDR);
734 
735 	iomap->type = IOMAP_MAPPED;
736 	iomap->length = (u64)dblks << inode->i_blkbits;
737 	ip->i_height = mp->mp_fheight;
738 	gfs2_add_inode_blocks(&ip->i_inode, alloced);
739 	gfs2_dinode_out(ip, dibh->b_data);
	/* An allocation failure arrives here directly, skipping the updates above. */
740 out:
741 	up_write(&ip->i_rw_mutex);
742 	return ret;
743 }
744 
745 #define IOMAP_F_GFS2_BOUNDARY IOMAP_F_PRIVATE
746 
747 /**
748  * gfs2_alloc_size - Compute the maximum allocation size
749  * @inode: The inode
750  * @mp: The metapath
751  * @size: Requested size in blocks
752  *
753  * Compute the maximum size of the next allocation at @mp.
754  *
755  * Returns: size in blocks
756  */
757 static u64 gfs2_alloc_size(struct inode *inode, struct metapath *mp, u64 size)
758 {
759 	struct gfs2_inode *ip = GFS2_I(inode);
760 	struct gfs2_sbd *sdp = GFS2_SB(inode);
761 	const __be64 *first, *ptr, *end;
762 
763 	/*
764 	 * For writes to stuffed files, this function is called twice via
765 	 * gfs2_iomap_get, before and after unstuffing. The size we return the
766 	 * first time needs to be large enough to get the reservation and
767 	 * allocation sizes right.  The size we return the second time must
768 	 * be exact or else gfs2_iomap_alloc won't do the right thing.
769 	 */
770 
771 	if (gfs2_is_stuffed(ip) || mp->mp_fheight != mp->mp_aheight) {
772 		unsigned int maxsize = mp->mp_fheight > 1 ?
773 			sdp->sd_inptrs : sdp->sd_diptrs;
774 		maxsize -= mp->mp_list[mp->mp_fheight - 1];
775 		if (size > maxsize)
776 			size = maxsize;
777 		return size;
778 	}
779 
780 	first = metapointer(ip->i_height - 1, mp);
781 	end = metaend(ip->i_height - 1, mp);
782 	if (end - first > size)
783 		end = first + size;
784 	for (ptr = first; ptr < end; ptr++) {
785 		if (*ptr)
786 			break;
787 	}
788 	return ptr - first;
789 }
790 
791 /**
792  * gfs2_iomap_get - Map blocks from an inode to disk blocks
793  * @inode: The inode
794  * @pos: Starting position in bytes
795  * @length: Length to map, in bytes
796  * @flags: iomap flags
797  * @iomap: The iomap structure
798  * @mp: The metapath
799  *
800  * Returns: errno
801  */
802 static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
803 			  unsigned flags, struct iomap *iomap,
804 			  struct metapath *mp)
805 {
806 	struct gfs2_inode *ip = GFS2_I(inode);
807 	struct gfs2_sbd *sdp = GFS2_SB(inode);
808 	loff_t size = i_size_read(inode);
809 	__be64 *ptr;
810 	sector_t lblock;
811 	sector_t lblock_stop;
812 	int ret;
813 	int eob;
814 	u64 len;
815 	struct buffer_head *dibh = NULL, *bh;
816 	u8 height;
817 
818 	if (!length)
819 		return -EINVAL;
820 
821 	down_read(&ip->i_rw_mutex);
822 
	/*
	 * The dinode buffer is stored in mp->mp_bh[0] and is not released
	 * in this function; callers in this file use release_metapath().
	 */
823 	ret = gfs2_meta_inode_buffer(ip, &dibh);
824 	if (ret)
825 		goto unlock;
826 	mp->mp_bh[0] = dibh;
827 
828 	if (gfs2_is_stuffed(ip)) {
829 		if (flags & IOMAP_WRITE) {
830 			loff_t max_size = gfs2_max_stuffed_size(ip);
831 
			/* The write does not fit into the dinode. */
832 			if (pos + length > max_size)
833 				goto unstuff;
834 			iomap->length = max_size;
835 		} else {
836 			if (pos >= size) {
837 				if (flags & IOMAP_REPORT) {
838 					ret = -ENOENT;
839 					goto unlock;
840 				} else {
					/*
					 * lblock, len and height are still
					 * unset here; with neither
					 * IOMAP_REPORT nor IOMAP_WRITE set
					 * and pos >= size, the code at
					 * do_alloc does not read them.
					 */
841 					/* report a hole */
842 					iomap->offset = pos;
843 					iomap->length = length;
844 					goto do_alloc;
845 				}
846 			}
847 			iomap->length = size;
848 		}
		/* Map the data stored inline behind the dinode header. */
849 		iomap->addr = (ip->i_no_addr << inode->i_blkbits) +
850 			      sizeof(struct gfs2_dinode);
851 		iomap->type = IOMAP_INLINE;
852 		iomap->inline_data = dibh->b_data + sizeof(struct gfs2_dinode);
853 		goto out;
854 	}
855 
	/* Reached for unstuffed inodes, and for stuffed ones that need unstuffing. */
856 unstuff:
	/* Widen the byte range to whole blocks. */
857 	lblock = pos >> inode->i_blkbits;
858 	iomap->offset = lblock << inode->i_blkbits;
859 	lblock_stop = (pos + length - 1) >> inode->i_blkbits;
860 	len = lblock_stop - lblock + 1;
861 	iomap->length = len << inode->i_blkbits;
862 
	/* Raise the height until sd_heightsize[] covers the end of lblock. */
863 	height = ip->i_height;
864 	while ((lblock + 1) * sdp->sd_sb.sb_bsize > sdp->sd_heightsize[height])
865 		height++;
866 	find_metapath(sdp, lblock, mp, height);
	/* Nothing to look up if the tree is too short or still stuffed. */
867 	if (height > ip->i_height || gfs2_is_stuffed(ip))
868 		goto do_alloc;
869 
870 	ret = lookup_metapath(ip, mp);
871 	if (ret)
872 		goto unlock;
873 
	/* The lookup stopped early: part of the path is unallocated. */
874 	if (mp->mp_aheight != ip->i_height)
875 		goto do_alloc;
876 
877 	ptr = metapointer(ip->i_height - 1, mp);
878 	if (*ptr == 0)
879 		goto do_alloc;
880 
	/* Block is allocated: report the extent of consecutive blocks. */
881 	bh = mp->mp_bh[ip->i_height - 1];
882 	len = gfs2_extent_length(bh, ptr, len, &eob);
883 
884 	iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits;
885 	iomap->length = len << inode->i_blkbits;
886 	iomap->type = IOMAP_MAPPED;
887 	iomap->flags |= IOMAP_F_MERGED;
888 	if (eob)
889 		iomap->flags |= IOMAP_F_GFS2_BOUNDARY;
890 
891 out:
892 	iomap->bdev = inode->i_sb->s_bdev;
893 unlock:
894 	up_read(&ip->i_rw_mutex);
895 	return ret;
896 
	/*
	 * No block is mapped at pos.  Despite the label nothing is
	 * allocated here: the range is reported as a hole and only its
	 * length is adjusted according to @flags.
	 */
897 do_alloc:
898 	iomap->addr = IOMAP_NULL_ADDR;
899 	iomap->type = IOMAP_HOLE;
900 	if (flags & IOMAP_REPORT) {
901 		if (pos >= size)
902 			ret = -ENOENT;
903 		else if (height == ip->i_height)
904 			ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
905 		else
906 			iomap->length = size - pos;
907 	} else if (flags & IOMAP_WRITE) {
908 		u64 alloc_size;
909 
910 		if (flags & IOMAP_DIRECT)
911 			goto out;  /* (see gfs2_file_direct_write) */
912 
		/* Trim the mapping to what one allocation can provide. */
913 		len = gfs2_alloc_size(inode, mp, len);
914 		alloc_size = len << inode->i_blkbits;
915 		if (alloc_size < iomap->length)
916 			iomap->length = alloc_size;
917 	} else {
918 		if (pos < size && height == ip->i_height)
919 			ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
920 	}
921 	goto out;
922 }
923 
924 /**
925  * gfs2_lblk_to_dblk - convert logical block to disk block
926  * @inode: the inode of the file we're mapping
927  * @lblock: the block relative to the start of the file
928  * @dblock: the returned dblock, if no error
929  *
930  * This function maps a single block from a file logical block (relative to
931  * the start of the file) to a file system absolute block using iomap.
932  *
933  * Returns: the absolute file system block, or an error
934  */
935 int gfs2_lblk_to_dblk(struct inode *inode, u32 lblock, u64 *dblock)
936 {
937 	struct iomap iomap = { };
938 	struct metapath mp = { .mp_aheight = 1, };
939 	loff_t pos = (loff_t)lblock << inode->i_blkbits;
940 	int ret;
941 
942 	ret = gfs2_iomap_get(inode, pos, i_blocksize(inode), 0, &iomap, &mp);
943 	release_metapath(&mp);
944 	if (ret == 0)
945 		*dblock = iomap.addr >> inode->i_blkbits;
946 
947 	return ret;
948 }
949 
950 static int gfs2_write_lock(struct inode *inode)
951 {
952 	struct gfs2_inode *ip = GFS2_I(inode);
953 	struct gfs2_sbd *sdp = GFS2_SB(inode);
954 	int error;
955 
956 	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
957 	error = gfs2_glock_nq(&ip->i_gh);
958 	if (error)
959 		goto out_uninit;
960 	if (&ip->i_inode == sdp->sd_rindex) {
961 		struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
962 
963 		error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
964 					   GL_NOCACHE, &m_ip->i_gh);
965 		if (error)
966 			goto out_unlock;
967 	}
968 	return 0;
969 
970 out_unlock:
971 	gfs2_glock_dq(&ip->i_gh);
972 out_uninit:
973 	gfs2_holder_uninit(&ip->i_gh);
974 	return error;
975 }
976 
977 static void gfs2_write_unlock(struct inode *inode)
978 {
979 	struct gfs2_inode *ip = GFS2_I(inode);
980 	struct gfs2_sbd *sdp = GFS2_SB(inode);
981 
982 	if (&ip->i_inode == sdp->sd_rindex) {
983 		struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
984 
985 		gfs2_glock_dq_uninit(&m_ip->i_gh);
986 	}
987 	gfs2_glock_dq_uninit(&ip->i_gh);
988 }
989 
990 static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos,
991 				   unsigned len, struct iomap *iomap)
992 {
993 	unsigned int blockmask = i_blocksize(inode) - 1;
994 	struct gfs2_sbd *sdp = GFS2_SB(inode);
995 	unsigned int blocks;
996 
997 	blocks = ((pos & blockmask) + len + blockmask) >> inode->i_blkbits;
998 	return gfs2_trans_begin(sdp, RES_DINODE + blocks, 0);
999 }
1000 
1001 static void gfs2_iomap_page_done(struct inode *inode, loff_t pos,
1002 				 unsigned copied, struct page *page,
1003 				 struct iomap *iomap)
1004 {
1005 	struct gfs2_trans *tr = current->journal_info;
1006 	struct gfs2_inode *ip = GFS2_I(inode);
1007 	struct gfs2_sbd *sdp = GFS2_SB(inode);
1008 
1009 	if (page && !gfs2_is_stuffed(ip))
1010 		gfs2_page_add_databufs(ip, page, offset_in_page(pos), copied);
1011 
1012 	if (tr->tr_num_buf_new)
1013 		__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
1014 
1015 	gfs2_trans_end(sdp);
1016 }
1017 
1018 static const struct iomap_page_ops gfs2_iomap_page_ops = {
1019 	.page_prepare = gfs2_iomap_page_prepare,
1020 	.page_done = gfs2_iomap_page_done,
1021 };
1022 
1023 static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
1024 				  loff_t length, unsigned flags,
1025 				  struct iomap *iomap,
1026 				  struct metapath *mp)
1027 {
1028 	struct gfs2_inode *ip = GFS2_I(inode);
1029 	struct gfs2_sbd *sdp = GFS2_SB(inode);
1030 	unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
1031 	bool unstuff, alloc_required;
1032 	int ret;
1033 
1034 	ret = gfs2_write_lock(inode);
1035 	if (ret)
1036 		return ret;
1037 
1038 	unstuff = gfs2_is_stuffed(ip) &&
1039 		  pos + length > gfs2_max_stuffed_size(ip);
1040 
1041 	ret = gfs2_iomap_get(inode, pos, length, flags, iomap, mp);
1042 	if (ret)
1043 		goto out_unlock;
1044 
1045 	alloc_required = unstuff || iomap->type == IOMAP_HOLE;
1046 
1047 	if (alloc_required || gfs2_is_jdata(ip))
1048 		gfs2_write_calc_reserv(ip, iomap->length, &data_blocks,
1049 				       &ind_blocks);
1050 
1051 	if (alloc_required) {
1052 		struct gfs2_alloc_parms ap = {
1053 			.target = data_blocks + ind_blocks
1054 		};
1055 
1056 		ret = gfs2_quota_lock_check(ip, &ap);
1057 		if (ret)
1058 			goto out_unlock;
1059 
1060 		ret = gfs2_inplace_reserve(ip, &ap);
1061 		if (ret)
1062 			goto out_qunlock;
1063 	}
1064 
1065 	rblocks = RES_DINODE + ind_blocks;
1066 	if (gfs2_is_jdata(ip))
1067 		rblocks += data_blocks;
1068 	if (ind_blocks || data_blocks)
1069 		rblocks += RES_STATFS + RES_QUOTA;
1070 	if (inode == sdp->sd_rindex)
1071 		rblocks += 2 * RES_STATFS;
1072 	if (alloc_required)
1073 		rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks);
1074 
1075 	if (unstuff || iomap->type == IOMAP_HOLE) {
1076 		struct gfs2_trans *tr;
1077 
1078 		ret = gfs2_trans_begin(sdp, rblocks,
1079 				       iomap->length >> inode->i_blkbits);
1080 		if (ret)
1081 			goto out_trans_fail;
1082 
1083 		if (unstuff) {
1084 			ret = gfs2_unstuff_dinode(ip, NULL);
1085 			if (ret)
1086 				goto out_trans_end;
1087 			release_metapath(mp);
1088 			ret = gfs2_iomap_get(inode, iomap->offset,
1089 					     iomap->length, flags, iomap, mp);
1090 			if (ret)
1091 				goto out_trans_end;
1092 		}
1093 
1094 		if (iomap->type == IOMAP_HOLE) {
1095 			ret = gfs2_iomap_alloc(inode, iomap, mp);
1096 			if (ret) {
1097 				gfs2_trans_end(sdp);
1098 				gfs2_inplace_release(ip);
1099 				punch_hole(ip, iomap->offset, iomap->length);
1100 				goto out_qunlock;
1101 			}
1102 		}
1103 
1104 		tr = current->journal_info;
1105 		if (tr->tr_num_buf_new)
1106 			__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
1107 
1108 		gfs2_trans_end(sdp);
1109 	}
1110 
1111 	if (gfs2_is_stuffed(ip) || gfs2_is_jdata(ip))
1112 		iomap->page_ops = &gfs2_iomap_page_ops;
1113 	return 0;
1114 
1115 out_trans_end:
1116 	gfs2_trans_end(sdp);
1117 out_trans_fail:
1118 	if (alloc_required)
1119 		gfs2_inplace_release(ip);
1120 out_qunlock:
1121 	if (alloc_required)
1122 		gfs2_quota_unlock(ip);
1123 out_unlock:
1124 	gfs2_write_unlock(inode);
1125 	return ret;
1126 }
1127 
1128 static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
1129 			    unsigned flags, struct iomap *iomap)
1130 {
1131 	struct gfs2_inode *ip = GFS2_I(inode);
1132 	struct metapath mp = { .mp_aheight = 1, };
1133 	int ret;
1134 
1135 	iomap->flags |= IOMAP_F_BUFFER_HEAD;
1136 
1137 	trace_gfs2_iomap_start(ip, pos, length, flags);
1138 	if ((flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT)) {
1139 		ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap, &mp);
1140 	} else {
1141 		ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
1142 
1143 		/*
1144 		 * Silently fall back to buffered I/O for stuffed files or if
1145 		 * we've hot a hole (see gfs2_file_direct_write).
1146 		 */
1147 		if ((flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT) &&
1148 		    iomap->type != IOMAP_MAPPED)
1149 			ret = -ENOTBLK;
1150 	}
1151 	release_metapath(&mp);
1152 	trace_gfs2_iomap_end(ip, iomap, ret);
1153 	return ret;
1154 }
1155 
1156 static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
1157 			  ssize_t written, unsigned flags, struct iomap *iomap)
1158 {
1159 	struct gfs2_inode *ip = GFS2_I(inode);
1160 	struct gfs2_sbd *sdp = GFS2_SB(inode);
1161 
1162 	if ((flags & (IOMAP_WRITE | IOMAP_DIRECT)) != IOMAP_WRITE)
1163 		goto out;
1164 
1165 	if (!gfs2_is_stuffed(ip))
1166 		gfs2_ordered_add_inode(ip);
1167 
1168 	if (inode == sdp->sd_rindex)
1169 		adjust_fs_space(inode);
1170 
1171 	gfs2_inplace_release(ip);
1172 
1173 	if (length != written && (iomap->flags & IOMAP_F_NEW)) {
1174 		/* Deallocate blocks that were just allocated. */
1175 		loff_t blockmask = i_blocksize(inode) - 1;
1176 		loff_t end = (pos + length) & ~blockmask;
1177 
1178 		pos = (pos + written + blockmask) & ~blockmask;
1179 		if (pos < end) {
1180 			truncate_pagecache_range(inode, pos, end - 1);
1181 			punch_hole(ip, pos, end - pos);
1182 		}
1183 	}
1184 
1185 	if (ip->i_qadata && ip->i_qadata->qa_qd_num)
1186 		gfs2_quota_unlock(ip);
1187 
1188 	if (unlikely(!written))
1189 		goto out_unlock;
1190 
1191 	if (iomap->flags & IOMAP_F_SIZE_CHANGED)
1192 		mark_inode_dirty(inode);
1193 	set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
1194 
1195 out_unlock:
1196 	gfs2_write_unlock(inode);
1197 out:
1198 	return 0;
1199 }
1200 
/*
 * iomap operations for GFS2: gfs2_iomap_begin() produces the mapping and
 * gfs2_iomap_end() cleans up after a buffered write.
 */
const struct iomap_ops gfs2_iomap_ops = {
	.iomap_begin = gfs2_iomap_begin,
	.iomap_end = gfs2_iomap_end,
};
1205 
1206 /**
1207  * gfs2_block_map - Map one or more blocks of an inode to a disk block
1208  * @inode: The inode
1209  * @lblock: The logical block number
1210  * @bh_map: The bh to be mapped
1211  * @create: True if its ok to alloc blocks to satify the request
1212  *
1213  * The size of the requested mapping is defined in bh_map->b_size.
1214  *
1215  * Clears buffer_mapped(bh_map) and leaves bh_map->b_size unchanged
1216  * when @lblock is not mapped.  Sets buffer_mapped(bh_map) and
1217  * bh_map->b_size to indicate the size of the mapping when @lblock and
1218  * successive blocks are mapped, up to the requested size.
1219  *
1220  * Sets buffer_boundary() if a read of metadata will be required
1221  * before the next block can be mapped. Sets buffer_new() if new
1222  * blocks were allocated.
1223  *
1224  * Returns: errno
1225  */
1226 
1227 int gfs2_block_map(struct inode *inode, sector_t lblock,
1228 		   struct buffer_head *bh_map, int create)
1229 {
1230 	struct gfs2_inode *ip = GFS2_I(inode);
1231 	loff_t pos = (loff_t)lblock << inode->i_blkbits;
1232 	loff_t length = bh_map->b_size;
1233 	struct metapath mp = { .mp_aheight = 1, };
1234 	struct iomap iomap = { };
1235 	int ret;
1236 
1237 	clear_buffer_mapped(bh_map);
1238 	clear_buffer_new(bh_map);
1239 	clear_buffer_boundary(bh_map);
1240 	trace_gfs2_bmap(ip, bh_map, lblock, create, 1);
1241 
1242 	if (create) {
1243 		ret = gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, &iomap, &mp);
1244 		if (!ret && iomap.type == IOMAP_HOLE)
1245 			ret = gfs2_iomap_alloc(inode, &iomap, &mp);
1246 		release_metapath(&mp);
1247 	} else {
1248 		ret = gfs2_iomap_get(inode, pos, length, 0, &iomap, &mp);
1249 		release_metapath(&mp);
1250 	}
1251 	if (ret)
1252 		goto out;
1253 
1254 	if (iomap.length > bh_map->b_size) {
1255 		iomap.length = bh_map->b_size;
1256 		iomap.flags &= ~IOMAP_F_GFS2_BOUNDARY;
1257 	}
1258 	if (iomap.addr != IOMAP_NULL_ADDR)
1259 		map_bh(bh_map, inode->i_sb, iomap.addr >> inode->i_blkbits);
1260 	bh_map->b_size = iomap.length;
1261 	if (iomap.flags & IOMAP_F_GFS2_BOUNDARY)
1262 		set_buffer_boundary(bh_map);
1263 	if (iomap.flags & IOMAP_F_NEW)
1264 		set_buffer_new(bh_map);
1265 
1266 out:
1267 	trace_gfs2_bmap(ip, bh_map, lblock, create, ret);
1268 	return ret;
1269 }
1270 
1271 /*
1272  * Deprecated: do not use in new code
1273  */
1274 int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen)
1275 {
1276 	struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 };
1277 	int ret;
1278 	int create = *new;
1279 
1280 	BUG_ON(!extlen);
1281 	BUG_ON(!dblock);
1282 	BUG_ON(!new);
1283 
1284 	bh.b_size = BIT(inode->i_blkbits + (create ? 0 : 5));
1285 	ret = gfs2_block_map(inode, lblock, &bh, create);
1286 	*extlen = bh.b_size >> inode->i_blkbits;
1287 	*dblock = bh.b_blocknr;
1288 	if (buffer_new(&bh))
1289 		*new = 1;
1290 	else
1291 		*new = 0;
1292 	return ret;
1293 }
1294 
1295 /**
1296  * gfs2_block_zero_range - Deal with zeroing out data
1297  *
1298  * This is partly borrowed from ext3.
1299  */
1300 static int gfs2_block_zero_range(struct inode *inode, loff_t from,
1301 				 unsigned int length)
1302 {
1303 	struct address_space *mapping = inode->i_mapping;
1304 	struct gfs2_inode *ip = GFS2_I(inode);
1305 	unsigned long index = from >> PAGE_SHIFT;
1306 	unsigned offset = from & (PAGE_SIZE-1);
1307 	unsigned blocksize, iblock, pos;
1308 	struct buffer_head *bh;
1309 	struct page *page;
1310 	int err;
1311 
1312 	page = find_or_create_page(mapping, index, GFP_NOFS);
1313 	if (!page)
1314 		return 0;
1315 
1316 	blocksize = inode->i_sb->s_blocksize;
1317 	iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
1318 
1319 	if (!page_has_buffers(page))
1320 		create_empty_buffers(page, blocksize, 0);
1321 
1322 	/* Find the buffer that contains "offset" */
1323 	bh = page_buffers(page);
1324 	pos = blocksize;
1325 	while (offset >= pos) {
1326 		bh = bh->b_this_page;
1327 		iblock++;
1328 		pos += blocksize;
1329 	}
1330 
1331 	err = 0;
1332 
1333 	if (!buffer_mapped(bh)) {
1334 		gfs2_block_map(inode, iblock, bh, 0);
1335 		/* unmapped? It's a hole - nothing to do */
1336 		if (!buffer_mapped(bh))
1337 			goto unlock;
1338 	}
1339 
1340 	/* Ok, it's mapped. Make sure it's up-to-date */
1341 	if (PageUptodate(page))
1342 		set_buffer_uptodate(bh);
1343 
1344 	if (!buffer_uptodate(bh)) {
1345 		err = -EIO;
1346 		ll_rw_block(REQ_OP_READ, 0, 1, &bh);
1347 		wait_on_buffer(bh);
1348 		/* Uhhuh. Read error. Complain and punt. */
1349 		if (!buffer_uptodate(bh))
1350 			goto unlock;
1351 		err = 0;
1352 	}
1353 
1354 	if (gfs2_is_jdata(ip))
1355 		gfs2_trans_add_data(ip->i_gl, bh);
1356 	else
1357 		gfs2_ordered_add_inode(ip);
1358 
1359 	zero_user(page, offset, length);
1360 	mark_buffer_dirty(bh);
1361 unlock:
1362 	unlock_page(page);
1363 	put_page(page);
1364 	return err;
1365 }
1366 
1367 #define GFS2_JTRUNC_REVOKES 8192
1368 
1369 /**
1370  * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files
1371  * @inode: The inode being truncated
1372  * @oldsize: The original (larger) size
1373  * @newsize: The new smaller size
1374  *
1375  * With jdata files, we have to journal a revoke for each block which is
1376  * truncated. As a result, we need to split this into separate transactions
1377  * if the number of pages being truncated gets too large.
1378  */
1379 
1380 static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize)
1381 {
1382 	struct gfs2_sbd *sdp = GFS2_SB(inode);
1383 	u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
1384 	u64 chunk;
1385 	int error;
1386 
1387 	while (oldsize != newsize) {
1388 		struct gfs2_trans *tr;
1389 		unsigned int offs;
1390 
1391 		chunk = oldsize - newsize;
1392 		if (chunk > max_chunk)
1393 			chunk = max_chunk;
1394 
1395 		offs = oldsize & ~PAGE_MASK;
1396 		if (offs && chunk > PAGE_SIZE)
1397 			chunk = offs + ((chunk - offs) & PAGE_MASK);
1398 
1399 		truncate_pagecache(inode, oldsize - chunk);
1400 		oldsize -= chunk;
1401 
1402 		tr = current->journal_info;
1403 		if (!test_bit(TR_TOUCHED, &tr->tr_flags))
1404 			continue;
1405 
1406 		gfs2_trans_end(sdp);
1407 		error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
1408 		if (error)
1409 			return error;
1410 	}
1411 
1412 	return 0;
1413 }
1414 
1415 static int trunc_start(struct inode *inode, u64 newsize)
1416 {
1417 	struct gfs2_inode *ip = GFS2_I(inode);
1418 	struct gfs2_sbd *sdp = GFS2_SB(inode);
1419 	struct buffer_head *dibh = NULL;
1420 	int journaled = gfs2_is_jdata(ip);
1421 	u64 oldsize = inode->i_size;
1422 	int error;
1423 
1424 	if (journaled)
1425 		error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES);
1426 	else
1427 		error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1428 	if (error)
1429 		return error;
1430 
1431 	error = gfs2_meta_inode_buffer(ip, &dibh);
1432 	if (error)
1433 		goto out;
1434 
1435 	gfs2_trans_add_meta(ip->i_gl, dibh);
1436 
1437 	if (gfs2_is_stuffed(ip)) {
1438 		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
1439 	} else {
1440 		unsigned int blocksize = i_blocksize(inode);
1441 		unsigned int offs = newsize & (blocksize - 1);
1442 		if (offs) {
1443 			error = gfs2_block_zero_range(inode, newsize,
1444 						      blocksize - offs);
1445 			if (error)
1446 				goto out;
1447 		}
1448 		ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
1449 	}
1450 
1451 	i_size_write(inode, newsize);
1452 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1453 	gfs2_dinode_out(ip, dibh->b_data);
1454 
1455 	if (journaled)
1456 		error = gfs2_journaled_truncate(inode, oldsize, newsize);
1457 	else
1458 		truncate_pagecache(inode, newsize);
1459 
1460 out:
1461 	brelse(dibh);
1462 	if (current->journal_info)
1463 		gfs2_trans_end(sdp);
1464 	return error;
1465 }
1466 
1467 int gfs2_iomap_get_alloc(struct inode *inode, loff_t pos, loff_t length,
1468 			 struct iomap *iomap)
1469 {
1470 	struct metapath mp = { .mp_aheight = 1, };
1471 	int ret;
1472 
1473 	ret = gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, iomap, &mp);
1474 	if (!ret && iomap->type == IOMAP_HOLE)
1475 		ret = gfs2_iomap_alloc(inode, iomap, &mp);
1476 	release_metapath(&mp);
1477 	return ret;
1478 }
1479 
/**
 * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein
 * @ip: inode
 * @rd_gh: holder of resource group glock
 * @bh: buffer head to sweep
 * @start: starting point in bh
 * @end: end point in bh
 * @meta: true if bh points to metadata (rather than data)
 * @btotal: place to keep count of total blocks freed
 *
 * We sweep a metadata buffer (provided by the metapath) for blocks we need to
 * free, and free them all. However, we do it one rgrp at a time. If this
 * block has references to multiple rgrps, we break it into individual
 * transactions. This allows other processes to use the rgrps while we're
 * focused on a single one, for better concurrency / performance.
 * At every transaction boundary, we rewrite the inode into the journal.
 * That way the bitmaps are kept consistent with the inode and we can recover
 * if we're interrupted by power-outages.
 *
 * A transaction started here (together with the write lock on
 * ip->i_rw_mutex taken right after it) may still be open when the function
 * returns; the caller has to finish it.
 *
 * Returns: 0, or return code if an error occurred.
 *          *btotal has the total number of blocks freed
 */
static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh,
			      struct buffer_head *bh, __be64 *start, __be64 *end,
			      bool meta, u32 *btotal)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	struct gfs2_rgrpd *rgd;
	struct gfs2_trans *tr;
	__be64 *p;
	int blks_outside_rgrp;
	u64 bn, bstart, isize_blks;
	s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */
	int ret = 0;
	bool buf_in_tr = false; /* buffer was added to transaction */

more_rgrps:
	/* Start a pass: reuse the rgrp whose glock is already held, if any. */
	rgd = NULL;
	if (gfs2_holder_initialized(rd_gh)) {
		rgd = gfs2_glock2rgrp(rd_gh->gh_gl);
		gfs2_assert_withdraw(sdp,
			     gfs2_glock_is_locked_by_me(rd_gh->gh_gl));
	}
	blks_outside_rgrp = 0;
	/* bstart/blen track the current run of contiguous blocks to free. */
	bstart = 0;
	blen = 0;

	for (p = start; p < end; p++) {
		if (!*p)
			continue;
		bn = be64_to_cpu(*p);

		if (rgd) {
			/* Leave blocks of other rgrps for a later pass. */
			if (!rgrp_contains_block(rgd, bn)) {
				blks_outside_rgrp++;
				continue;
			}
		} else {
			/* First block of this pass: lock the rgrp it is in. */
			rgd = gfs2_blk2rgrpd(sdp, bn, true);
			if (unlikely(!rgd)) {
				ret = -EIO;
				goto out;
			}
			ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
						 0, rd_gh);
			if (ret)
				goto out;

			/* Must be done with the rgrp glock held: */
			if (gfs2_rs_active(&ip->i_res) &&
			    rgd == ip->i_res.rs_rbm.rgd)
				gfs2_rs_deltree(&ip->i_res);
		}

		/* The size of our transactions will be unknown until we
		   actually process all the metadata blocks that relate to
		   the rgrp. So we estimate. We know it can't be more than
		   the dinode's i_blocks and we don't want to exceed the
		   journal flush threshold, sd_log_thresh2. */
		if (current->journal_info == NULL) {
			unsigned int jblocks_rqsted, revokes;

			jblocks_rqsted = rgd->rd_length + RES_DINODE +
				RES_INDIRECT;
			isize_blks = gfs2_get_inode_blocks(&ip->i_inode);
			if (isize_blks > atomic_read(&sdp->sd_log_thresh2))
				jblocks_rqsted +=
					atomic_read(&sdp->sd_log_thresh2);
			else
				jblocks_rqsted += isize_blks;
			revokes = jblocks_rqsted;
			if (meta)
				revokes += end - start;
			else if (ip->i_depth)
				revokes += sdp->sd_inptrs;
			ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
			if (ret)
				goto out_unlock;
			down_write(&ip->i_rw_mutex);
		}
		/* check if we will exceed the transaction blocks requested */
		tr = current->journal_info;
		if (tr->tr_num_buf_new + RES_STATFS +
		    RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
			/* We set blks_outside_rgrp to ensure the loop will
			   be repeated for the same rgrp, but with a new
			   transaction. */
			blks_outside_rgrp++;
			/* This next part is tricky. If the buffer was added
			   to the transaction, we've already set some block
			   pointers to 0, so we better follow through and free
			   them, or we will introduce corruption (so break).
			   This may be impossible, or at least rare, but I
			   decided to cover the case regardless.

			   If the buffer was not added to the transaction
			   (this call), doing so would exceed our transaction
			   size, so we need to end the transaction and start a
			   new one (so goto). */

			if (buf_in_tr)
				break;
			goto out_unlock;
		}

		gfs2_trans_add_meta(ip->i_gl, bh);
		buf_in_tr = true;
		*p = 0;
		/* bn directly follows the current run: just extend it. */
		if (bstart + blen == bn) {
			blen++;
			continue;
		}
		/* Otherwise free the run collected so far, if there is one. */
		if (bstart) {
			__gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta);
			(*btotal) += blen;
			gfs2_add_inode_blocks(&ip->i_inode, -blen);
		}
		/* Start a new run at bn. */
		bstart = bn;
		blen = 1;
	}
	/* Free the last run of the pass. */
	if (bstart) {
		__gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta);
		(*btotal) += blen;
		gfs2_add_inode_blocks(&ip->i_inode, -blen);
	}
out_unlock:
	if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks
					    outside the rgrp we just processed,
					    do it all over again. */
		if (current->journal_info) {
			struct buffer_head *dibh;

			ret = gfs2_meta_inode_buffer(ip, &dibh);
			if (ret)
				goto out;

			/* Every transaction boundary, we rewrite the dinode
			   to keep its di_blocks current in case of failure. */
			ip->i_inode.i_mtime = ip->i_inode.i_ctime =
				current_time(&ip->i_inode);
			gfs2_trans_add_meta(ip->i_gl, dibh);
			gfs2_dinode_out(ip, dibh->b_data);
			brelse(dibh);
			up_write(&ip->i_rw_mutex);
			gfs2_trans_end(sdp);
		}
		/* Drop this rgrp's glock so the next pass locks another one. */
		gfs2_glock_dq_uninit(rd_gh);
		cond_resched();
		goto more_rgrps;
	}
out:
	return ret;
}
1653 
1654 static bool mp_eq_to_hgt(struct metapath *mp, __u16 *list, unsigned int h)
1655 {
1656 	if (memcmp(mp->mp_list, list, h * sizeof(mp->mp_list[0])))
1657 		return false;
1658 	return true;
1659 }
1660 
1661 /**
1662  * find_nonnull_ptr - find a non-null pointer given a metapath and height
1663  * @mp: starting metapath
1664  * @h: desired height to search
1665  *
1666  * Assumes the metapath is valid (with buffers) out to height h.
1667  * Returns: true if a non-null pointer was found in the metapath buffer
1668  *          false if all remaining pointers are NULL in the buffer
1669  */
1670 static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp,
1671 			     unsigned int h,
1672 			     __u16 *end_list, unsigned int end_aligned)
1673 {
1674 	struct buffer_head *bh = mp->mp_bh[h];
1675 	__be64 *first, *ptr, *end;
1676 
1677 	first = metaptr1(h, mp);
1678 	ptr = first + mp->mp_list[h];
1679 	end = (__be64 *)(bh->b_data + bh->b_size);
1680 	if (end_list && mp_eq_to_hgt(mp, end_list, h)) {
1681 		bool keep_end = h < end_aligned;
1682 		end = first + end_list[h] + keep_end;
1683 	}
1684 
1685 	while (ptr < end) {
1686 		if (*ptr) { /* if we have a non-null pointer */
1687 			mp->mp_list[h] = ptr - first;
1688 			h++;
1689 			if (h < GFS2_MAX_META_HEIGHT)
1690 				mp->mp_list[h] = 0;
1691 			return true;
1692 		}
1693 		ptr++;
1694 	}
1695 	return false;
1696 }
1697 
/* States of the deallocation loop in punch_hole(). */
enum dealloc_states {
	DEALLOC_MP_FULL,	/* Strip a metapath with all buffers read in */
	DEALLOC_MP_LOWER,	/* Lower the metapath strip height */
	DEALLOC_FILL_MP,	/* Fill in the metapath to the given height */
	DEALLOC_DONE,		/* Process complete */
};
1704 
1705 static inline void
1706 metapointer_range(struct metapath *mp, int height,
1707 		  __u16 *start_list, unsigned int start_aligned,
1708 		  __u16 *end_list, unsigned int end_aligned,
1709 		  __be64 **start, __be64 **end)
1710 {
1711 	struct buffer_head *bh = mp->mp_bh[height];
1712 	__be64 *first;
1713 
1714 	first = metaptr1(height, mp);
1715 	*start = first;
1716 	if (mp_eq_to_hgt(mp, start_list, height)) {
1717 		bool keep_start = height < start_aligned;
1718 		*start = first + start_list[height] + keep_start;
1719 	}
1720 	*end = (__be64 *)(bh->b_data + bh->b_size);
1721 	if (end_list && mp_eq_to_hgt(mp, end_list, height)) {
1722 		bool keep_end = height < end_aligned;
1723 		*end = first + end_list[height] + keep_end;
1724 	}
1725 }
1726 
1727 static inline bool walk_done(struct gfs2_sbd *sdp,
1728 			     struct metapath *mp, int height,
1729 			     __u16 *end_list, unsigned int end_aligned)
1730 {
1731 	__u16 end;
1732 
1733 	if (end_list) {
1734 		bool keep_end = height < end_aligned;
1735 		if (!mp_eq_to_hgt(mp, end_list, height))
1736 			return false;
1737 		end = end_list[height] + keep_end;
1738 	} else
1739 		end = (height > 0) ? sdp->sd_inptrs : sdp->sd_diptrs;
1740 	return mp->mp_list[height] >= end;
1741 }
1742 
/**
 * punch_hole - deallocate blocks in a file
 * @ip: inode to truncate
 * @offset: the start of the hole
 * @length: the size of the hole (or 0 for truncate)
 *
 * Punch a hole into a file or truncate a file at a given position.  This
 * function operates in whole blocks (@offset and @length are rounded
 * accordingly); partially filled blocks must be cleared otherwise.
 *
 * This function works from the bottom up, and from the right to the left. In
 * other words, it strips off the highest layer (data) before stripping any of
 * the metadata. Doing it this way is best in case the operation is interrupted
 * by power failure, etc.  The dinode is rewritten in every transaction to
 * guarantee integrity.
 *
 * Returns: 0 on success (including when there is nothing to deallocate),
 *          or an error code
 */
static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	u64 maxsize = sdp->sd_heightsize[ip->i_height];
	struct metapath mp = {};
	struct buffer_head *dibh, *bh;
	struct gfs2_holder rd_gh;
	unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift;
	/* First block of the hole: @offset rounded up to a block boundary. */
	u64 lblock = (offset + (1 << bsize_shift) - 1) >> bsize_shift;
	__u16 start_list[GFS2_MAX_META_HEIGHT];
	__u16 __end_list[GFS2_MAX_META_HEIGHT], *end_list = NULL;
	unsigned int start_aligned, uninitialized_var(end_aligned);
	unsigned int strip_h = ip->i_height - 1;
	u32 btotal = 0;
	int ret, state;
	int mp_h; /* metapath buffers are read in to this height */
	u64 prev_bnr = 0;
	__be64 *start, *end;

	if (offset >= maxsize) {
		/*
		 * The starting point lies beyond the allocated meta-data;
		 * there are no blocks to deallocate.
		 */
		return 0;
	}

	/*
	 * The start position of the hole is defined by lblock, start_list, and
	 * start_aligned.  The end position of the hole is defined by lend,
	 * end_list, and end_aligned.
	 *
	 * start_aligned and end_aligned define down to which height the start
	 * and end positions are aligned to the metadata tree (i.e., the
	 * position is a multiple of the metadata granularity at the height
	 * above).  This determines at which heights additional meta pointers
	 * need to be preserved for the remaining data.
	 */

	if (length) {
		u64 end_offset = offset + length;
		u64 lend;

		/*
		 * Clip the end at the maximum file size for the given height:
		 * that's how far the metadata goes; files bigger than that
		 * will have additional layers of indirection.
		 */
		if (end_offset > maxsize)
			end_offset = maxsize;
		/* End of the hole, rounded down to a block boundary. */
		lend = end_offset >> bsize_shift;

		if (lblock >= lend)
			return 0;

		find_metapath(sdp, lend, &mp, ip->i_height);
		end_list = __end_list;
		memcpy(end_list, mp.mp_list, sizeof(mp.mp_list));

		/* Find the highest height with a non-zero end index. */
		for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
			if (end_list[mp_h])
				break;
		}
		end_aligned = mp_h;
	}

	find_metapath(sdp, lblock, &mp, ip->i_height);
	memcpy(start_list, mp.mp_list, sizeof(start_list));

	/* Find the highest height with a non-zero start index. */
	for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
		if (start_list[mp_h])
			break;
	}
	start_aligned = mp_h;

	ret = gfs2_meta_inode_buffer(ip, &dibh);
	if (ret)
		return ret;

	/* The metapath now owns dibh; release_metapath() below drops it. */
	mp.mp_bh[0] = dibh;
	ret = lookup_metapath(ip, &mp);
	if (ret)
		goto out_metapath;

	/* issue read-ahead on metadata */
	for (mp_h = 0; mp_h < mp.mp_aheight - 1; mp_h++) {
		metapointer_range(&mp, mp_h, start_list, start_aligned,
				  end_list, end_aligned, &start, &end);
		gfs2_metapath_ra(ip->i_gl, start, end);
	}

	if (mp.mp_aheight == ip->i_height)
		state = DEALLOC_MP_FULL; /* We have a complete metapath */
	else
		state = DEALLOC_FILL_MP; /* deal with partial metapath */

	ret = gfs2_rindex_update(sdp);
	if (ret)
		goto out_metapath;

	ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
	if (ret)
		goto out_metapath;
	gfs2_holder_mark_uninitialized(&rd_gh);

	mp_h = strip_h;

	while (state != DEALLOC_DONE) {
		switch (state) {
		/* Truncate a full metapath at the given strip height.
		 * Note that strip_h == mp_h in order to be in this state. */
		case DEALLOC_MP_FULL:
			bh = mp.mp_bh[mp_h];
			gfs2_assert_withdraw(sdp, bh);
			/* Sweeping the same block twice in a row is a bug. */
			if (gfs2_assert_withdraw(sdp,
						 prev_bnr != bh->b_blocknr)) {
				fs_emerg(sdp, "inode %llu, block:%llu, i_h:%u,"
					 "s_h:%u, mp_h:%u\n",
				       (unsigned long long)ip->i_no_addr,
				       prev_bnr, ip->i_height, strip_h, mp_h);
			}
			prev_bnr = bh->b_blocknr;

			if (gfs2_metatype_check(sdp, bh,
						(mp_h ? GFS2_METATYPE_IN :
							GFS2_METATYPE_DI))) {
				ret = -EIO;
				goto out;
			}

			/*
			 * Below, passing end_aligned as 0 gives us the
			 * metapointer range excluding the end point: the end
			 * point is the first metapath we must not deallocate!
			 */

			metapointer_range(&mp, mp_h, start_list, start_aligned,
					  end_list, 0 /* end_aligned */,
					  &start, &end);
			ret = sweep_bh_for_rgrps(ip, &rd_gh, mp.mp_bh[mp_h],
						 start, end,
						 mp_h != ip->i_height - 1,
						 &btotal);

			/* If we hit an error or just swept dinode buffer,
			   just exit. */
			if (ret || !mp_h) {
				state = DEALLOC_DONE;
				break;
			}
			state = DEALLOC_MP_LOWER;
			break;

		/* lower the metapath strip height */
		case DEALLOC_MP_LOWER:
			/* We're done with the current buffer, so release it,
			   unless it's the dinode buffer. Then back up to the
			   previous pointer. */
			if (mp_h) {
				brelse(mp.mp_bh[mp_h]);
				mp.mp_bh[mp_h] = NULL;
			}
			/* If we can't get any lower in height, we've stripped
			   off all we can. Next step is to back up and start
			   stripping the previous level of metadata. */
			if (mp_h == 0) {
				strip_h--;
				memcpy(mp.mp_list, start_list, sizeof(start_list));
				mp_h = strip_h;
				state = DEALLOC_FILL_MP;
				break;
			}
			mp.mp_list[mp_h] = 0;
			mp_h--; /* search one metadata height down */
			mp.mp_list[mp_h]++;
			if (walk_done(sdp, &mp, mp_h, end_list, end_aligned))
				break;
			/* Here we've found a part of the metapath that is not
			 * allocated. We need to search at that height for the
			 * next non-null pointer. */
			if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned)) {
				state = DEALLOC_FILL_MP;
				mp_h++;
			}
			/* No more non-null pointers at this height. Back up
			   to the previous height and try again. */
			break; /* loop around in the same state */

		/* Fill the metapath with buffers to the given height. */
		case DEALLOC_FILL_MP:
			/* Fill the buffers out to the current height. */
			ret = fillup_metapath(ip, &mp, mp_h);
			if (ret < 0)
				goto out;

			/* On the first pass, issue read-ahead on metadata. */
			if (mp.mp_aheight > 1 && strip_h == ip->i_height - 1) {
				unsigned int height = mp.mp_aheight - 1;

				/* No read-ahead for data blocks. */
				if (mp.mp_aheight - 1 == strip_h)
					height--;

				for (; height >= mp.mp_aheight - ret; height--) {
					metapointer_range(&mp, height,
							  start_list, start_aligned,
							  end_list, end_aligned,
							  &start, &end);
					gfs2_metapath_ra(ip->i_gl, start, end);
				}
			}

			/* If buffers found for the entire strip height */
			if (mp.mp_aheight - 1 == strip_h) {
				state = DEALLOC_MP_FULL;
				break;
			}
			if (mp.mp_aheight < ip->i_height) /* We have a partial height */
				mp_h = mp.mp_aheight - 1;

			/* If we find a non-null block pointer, crawl a bit
			   higher up in the metapath and try again, otherwise
			   we need to look lower for a new starting point. */
			if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned))
				mp_h++;
			else
				state = DEALLOC_MP_LOWER;
			break;
		}
	}

	/* Account for the freed blocks and write the dinode one last time. */
	if (btotal) {
		/* Reuse a transaction left open by sweep_bh_for_rgrps(). */
		if (current->journal_info == NULL) {
			ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS +
					       RES_QUOTA, 0);
			if (ret)
				goto out;
			down_write(&ip->i_rw_mutex);
		}
		gfs2_statfs_change(sdp, 0, +btotal, 0);
		gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
				  ip->i_inode.i_gid);
		ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
		gfs2_trans_add_meta(ip->i_gl, dibh);
		gfs2_dinode_out(ip, dibh->b_data);
		up_write(&ip->i_rw_mutex);
		gfs2_trans_end(sdp);
	}

out:
	if (gfs2_holder_initialized(&rd_gh))
		gfs2_glock_dq_uninit(&rd_gh);
	/* Finish a transaction that is still open (error or nothing freed). */
	if (current->journal_info) {
		up_write(&ip->i_rw_mutex);
		gfs2_trans_end(sdp);
		cond_resched();
	}
	gfs2_quota_unhold(ip);
out_metapath:
	release_metapath(&mp);
	return ret;
}
2021 
2022 static int trunc_end(struct gfs2_inode *ip)
2023 {
2024 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2025 	struct buffer_head *dibh;
2026 	int error;
2027 
2028 	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
2029 	if (error)
2030 		return error;
2031 
2032 	down_write(&ip->i_rw_mutex);
2033 
2034 	error = gfs2_meta_inode_buffer(ip, &dibh);
2035 	if (error)
2036 		goto out;
2037 
2038 	if (!i_size_read(&ip->i_inode)) {
2039 		ip->i_height = 0;
2040 		ip->i_goal = ip->i_no_addr;
2041 		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
2042 		gfs2_ordered_del_inode(ip);
2043 	}
2044 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
2045 	ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
2046 
2047 	gfs2_trans_add_meta(ip->i_gl, dibh);
2048 	gfs2_dinode_out(ip, dibh->b_data);
2049 	brelse(dibh);
2050 
2051 out:
2052 	up_write(&ip->i_rw_mutex);
2053 	gfs2_trans_end(sdp);
2054 	return error;
2055 }
2056 
2057 /**
2058  * do_shrink - make a file smaller
2059  * @inode: the inode
2060  * @newsize: the size to make the file
2061  *
2062  * Called with an exclusive lock on @inode. The @size must
2063  * be equal to or smaller than the current inode size.
2064  *
2065  * Returns: errno
2066  */
2067 
2068 static int do_shrink(struct inode *inode, u64 newsize)
2069 {
2070 	struct gfs2_inode *ip = GFS2_I(inode);
2071 	int error;
2072 
2073 	error = trunc_start(inode, newsize);
2074 	if (error < 0)
2075 		return error;
2076 	if (gfs2_is_stuffed(ip))
2077 		return 0;
2078 
2079 	error = punch_hole(ip, newsize, 0);
2080 	if (error == 0)
2081 		error = trunc_end(ip);
2082 
2083 	return error;
2084 }
2085 
2086 void gfs2_trim_blocks(struct inode *inode)
2087 {
2088 	int ret;
2089 
2090 	ret = do_shrink(inode, inode->i_size);
2091 	WARN_ON(ret != 0);
2092 }
2093 
2094 /**
2095  * do_grow - Touch and update inode size
2096  * @inode: The inode
2097  * @size: The new size
2098  *
2099  * This function updates the timestamps on the inode and
2100  * may also increase the size of the inode. This function
2101  * must not be called with @size any smaller than the current
2102  * inode size.
2103  *
2104  * Although it is not strictly required to unstuff files here,
2105  * earlier versions of GFS2 have a bug in the stuffed file reading
2106  * code which will result in a buffer overrun if the size is larger
2107  * than the max stuffed file size. In order to prevent this from
2108  * occurring, such files are unstuffed, but in other cases we can
2109  * just update the inode size directly.
2110  *
2111  * Returns: 0 on success, or -ve on error
2112  */
2113 
2114 static int do_grow(struct inode *inode, u64 size)
2115 {
2116 	struct gfs2_inode *ip = GFS2_I(inode);
2117 	struct gfs2_sbd *sdp = GFS2_SB(inode);
2118 	struct gfs2_alloc_parms ap = { .target = 1, };
2119 	struct buffer_head *dibh;
2120 	int error;
2121 	int unstuff = 0;
2122 
2123 	if (gfs2_is_stuffed(ip) && size > gfs2_max_stuffed_size(ip)) {
2124 		error = gfs2_quota_lock_check(ip, &ap);
2125 		if (error)
2126 			return error;
2127 
2128 		error = gfs2_inplace_reserve(ip, &ap);
2129 		if (error)
2130 			goto do_grow_qunlock;
2131 		unstuff = 1;
2132 	}
2133 
2134 	error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
2135 				 (unstuff &&
2136 				  gfs2_is_jdata(ip) ? RES_JDATA : 0) +
2137 				 (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
2138 				  0 : RES_QUOTA), 0);
2139 	if (error)
2140 		goto do_grow_release;
2141 
2142 	if (unstuff) {
2143 		error = gfs2_unstuff_dinode(ip, NULL);
2144 		if (error)
2145 			goto do_end_trans;
2146 	}
2147 
2148 	error = gfs2_meta_inode_buffer(ip, &dibh);
2149 	if (error)
2150 		goto do_end_trans;
2151 
2152 	i_size_write(inode, size);
2153 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
2154 	gfs2_trans_add_meta(ip->i_gl, dibh);
2155 	gfs2_dinode_out(ip, dibh->b_data);
2156 	brelse(dibh);
2157 
2158 do_end_trans:
2159 	gfs2_trans_end(sdp);
2160 do_grow_release:
2161 	if (unstuff) {
2162 		gfs2_inplace_release(ip);
2163 do_grow_qunlock:
2164 		gfs2_quota_unlock(ip);
2165 	}
2166 	return error;
2167 }
2168 
2169 /**
2170  * gfs2_setattr_size - make a file a given size
2171  * @inode: the inode
2172  * @newsize: the size to make the file
2173  *
2174  * The file size can grow, shrink, or stay the same size. This
2175  * is called holding i_rwsem and an exclusive glock on the inode
2176  * in question.
2177  *
2178  * Returns: errno
2179  */
2180 
2181 int gfs2_setattr_size(struct inode *inode, u64 newsize)
2182 {
2183 	struct gfs2_inode *ip = GFS2_I(inode);
2184 	int ret;
2185 
2186 	BUG_ON(!S_ISREG(inode->i_mode));
2187 
2188 	ret = inode_newsize_ok(inode, newsize);
2189 	if (ret)
2190 		return ret;
2191 
2192 	inode_dio_wait(inode);
2193 
2194 	ret = gfs2_rsqa_alloc(ip);
2195 	if (ret)
2196 		goto out;
2197 
2198 	if (newsize >= inode->i_size) {
2199 		ret = do_grow(inode, newsize);
2200 		goto out;
2201 	}
2202 
2203 	ret = do_shrink(inode, newsize);
2204 out:
2205 	gfs2_rsqa_delete(ip, NULL);
2206 	return ret;
2207 }
2208 
2209 int gfs2_truncatei_resume(struct gfs2_inode *ip)
2210 {
2211 	int error;
2212 	error = punch_hole(ip, i_size_read(&ip->i_inode), 0);
2213 	if (!error)
2214 		error = trunc_end(ip);
2215 	return error;
2216 }
2217 
2218 int gfs2_file_dealloc(struct gfs2_inode *ip)
2219 {
2220 	return punch_hole(ip, 0, 0);
2221 }
2222 
2223 /**
2224  * gfs2_free_journal_extents - Free cached journal bmap info
2225  * @jd: The journal
2226  *
2227  */
2228 
2229 void gfs2_free_journal_extents(struct gfs2_jdesc *jd)
2230 {
2231 	struct gfs2_journal_extent *jext;
2232 
2233 	while(!list_empty(&jd->extent_list)) {
2234 		jext = list_entry(jd->extent_list.next, struct gfs2_journal_extent, list);
2235 		list_del(&jext->list);
2236 		kfree(jext);
2237 	}
2238 }
2239 
2240 /**
2241  * gfs2_add_jextent - Add or merge a new extent to extent cache
2242  * @jd: The journal descriptor
2243  * @lblock: The logical block at start of new extent
2244  * @dblock: The physical block at start of new extent
2245  * @blocks: Size of extent in fs blocks
2246  *
2247  * Returns: 0 on success or -ENOMEM
2248  */
2249 
2250 static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks)
2251 {
2252 	struct gfs2_journal_extent *jext;
2253 
2254 	if (!list_empty(&jd->extent_list)) {
2255 		jext = list_entry(jd->extent_list.prev, struct gfs2_journal_extent, list);
2256 		if ((jext->dblock + jext->blocks) == dblock) {
2257 			jext->blocks += blocks;
2258 			return 0;
2259 		}
2260 	}
2261 
2262 	jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS);
2263 	if (jext == NULL)
2264 		return -ENOMEM;
2265 	jext->dblock = dblock;
2266 	jext->lblock = lblock;
2267 	jext->blocks = blocks;
2268 	list_add_tail(&jext->list, &jd->extent_list);
2269 	jd->nr_extents++;
2270 	return 0;
2271 }
2272 
2273 /**
2274  * gfs2_map_journal_extents - Cache journal bmap info
2275  * @sdp: The super block
2276  * @jd: The journal to map
2277  *
2278  * Create a reusable "extent" mapping from all logical
2279  * blocks to all physical blocks for the given journal.  This will save
2280  * us time when writing journal blocks.  Most journals will have only one
2281  * extent that maps all their logical blocks.  That's because gfs2.mkfs
2282  * arranges the journal blocks sequentially to maximize performance.
2283  * So the extent would map the first block for the entire file length.
2284  * However, gfs2_jadd can happen while file activity is happening, so
2285  * those journals may not be sequential.  Less likely is the case where
2286  * the users created their own journals by mounting the metafs and
2287  * laying it out.  But it's still possible.  These journals might have
2288  * several extents.
2289  *
2290  * Returns: 0 on success, or error on failure
2291  */
2292 
2293 int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
2294 {
2295 	u64 lblock = 0;
2296 	u64 lblock_stop;
2297 	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
2298 	struct buffer_head bh;
2299 	unsigned int shift = sdp->sd_sb.sb_bsize_shift;
2300 	u64 size;
2301 	int rc;
2302 	ktime_t start, end;
2303 
2304 	start = ktime_get();
2305 	lblock_stop = i_size_read(jd->jd_inode) >> shift;
2306 	size = (lblock_stop - lblock) << shift;
2307 	jd->nr_extents = 0;
2308 	WARN_ON(!list_empty(&jd->extent_list));
2309 
2310 	do {
2311 		bh.b_state = 0;
2312 		bh.b_blocknr = 0;
2313 		bh.b_size = size;
2314 		rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0);
2315 		if (rc || !buffer_mapped(&bh))
2316 			goto fail;
2317 		rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift);
2318 		if (rc)
2319 			goto fail;
2320 		size -= bh.b_size;
2321 		lblock += (bh.b_size >> ip->i_inode.i_blkbits);
2322 	} while(size > 0);
2323 
2324 	end = ktime_get();
2325 	fs_info(sdp, "journal %d mapped with %u extents in %lldms\n", jd->jd_jid,
2326 		jd->nr_extents, ktime_ms_delta(end, start));
2327 	return 0;
2328 
2329 fail:
2330 	fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n",
2331 		rc, jd->jd_jid,
2332 		(unsigned long long)(i_size_read(jd->jd_inode) - size),
2333 		jd->nr_extents);
2334 	fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n",
2335 		rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr,
2336 		bh.b_state, (unsigned long long)bh.b_size);
2337 	gfs2_free_journal_extents(jd);
2338 	return rc;
2339 }
2340 
2341 /**
2342  * gfs2_write_alloc_required - figure out if a write will require an allocation
2343  * @ip: the file being written to
2344  * @offset: the offset to write to
2345  * @len: the number of bytes being written
2346  *
2347  * Returns: 1 if an alloc is required, 0 otherwise
2348  */
2349 
2350 int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
2351 			      unsigned int len)
2352 {
2353 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2354 	struct buffer_head bh;
2355 	unsigned int shift;
2356 	u64 lblock, lblock_stop, size;
2357 	u64 end_of_file;
2358 
2359 	if (!len)
2360 		return 0;
2361 
2362 	if (gfs2_is_stuffed(ip)) {
2363 		if (offset + len > gfs2_max_stuffed_size(ip))
2364 			return 1;
2365 		return 0;
2366 	}
2367 
2368 	shift = sdp->sd_sb.sb_bsize_shift;
2369 	BUG_ON(gfs2_is_dir(ip));
2370 	end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
2371 	lblock = offset >> shift;
2372 	lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
2373 	if (lblock_stop > end_of_file && ip != GFS2_I(sdp->sd_rindex))
2374 		return 1;
2375 
2376 	size = (lblock_stop - lblock) << shift;
2377 	do {
2378 		bh.b_state = 0;
2379 		bh.b_size = size;
2380 		gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
2381 		if (!buffer_mapped(&bh))
2382 			return 1;
2383 		size -= bh.b_size;
2384 		lblock += (bh.b_size >> ip->i_inode.i_blkbits);
2385 	} while(size > 0);
2386 
2387 	return 0;
2388 }
2389 
2390 static int stuffed_zero_range(struct inode *inode, loff_t offset, loff_t length)
2391 {
2392 	struct gfs2_inode *ip = GFS2_I(inode);
2393 	struct buffer_head *dibh;
2394 	int error;
2395 
2396 	if (offset >= inode->i_size)
2397 		return 0;
2398 	if (offset + length > inode->i_size)
2399 		length = inode->i_size - offset;
2400 
2401 	error = gfs2_meta_inode_buffer(ip, &dibh);
2402 	if (error)
2403 		return error;
2404 	gfs2_trans_add_meta(ip->i_gl, dibh);
2405 	memset(dibh->b_data + sizeof(struct gfs2_dinode) + offset, 0,
2406 	       length);
2407 	brelse(dibh);
2408 	return 0;
2409 }
2410 
2411 static int gfs2_journaled_truncate_range(struct inode *inode, loff_t offset,
2412 					 loff_t length)
2413 {
2414 	struct gfs2_sbd *sdp = GFS2_SB(inode);
2415 	loff_t max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
2416 	int error;
2417 
2418 	while (length) {
2419 		struct gfs2_trans *tr;
2420 		loff_t chunk;
2421 		unsigned int offs;
2422 
2423 		chunk = length;
2424 		if (chunk > max_chunk)
2425 			chunk = max_chunk;
2426 
2427 		offs = offset & ~PAGE_MASK;
2428 		if (offs && chunk > PAGE_SIZE)
2429 			chunk = offs + ((chunk - offs) & PAGE_MASK);
2430 
2431 		truncate_pagecache_range(inode, offset, chunk);
2432 		offset += chunk;
2433 		length -= chunk;
2434 
2435 		tr = current->journal_info;
2436 		if (!test_bit(TR_TOUCHED, &tr->tr_flags))
2437 			continue;
2438 
2439 		gfs2_trans_end(sdp);
2440 		error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
2441 		if (error)
2442 			return error;
2443 	}
2444 	return 0;
2445 }
2446 
2447 int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length)
2448 {
2449 	struct inode *inode = file_inode(file);
2450 	struct gfs2_inode *ip = GFS2_I(inode);
2451 	struct gfs2_sbd *sdp = GFS2_SB(inode);
2452 	int error;
2453 
2454 	if (gfs2_is_jdata(ip))
2455 		error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_JDATA,
2456 					 GFS2_JTRUNC_REVOKES);
2457 	else
2458 		error = gfs2_trans_begin(sdp, RES_DINODE, 0);
2459 	if (error)
2460 		return error;
2461 
2462 	if (gfs2_is_stuffed(ip)) {
2463 		error = stuffed_zero_range(inode, offset, length);
2464 		if (error)
2465 			goto out;
2466 	} else {
2467 		unsigned int start_off, end_len, blocksize;
2468 
2469 		blocksize = i_blocksize(inode);
2470 		start_off = offset & (blocksize - 1);
2471 		end_len = (offset + length) & (blocksize - 1);
2472 		if (start_off) {
2473 			unsigned int len = length;
2474 			if (length > blocksize - start_off)
2475 				len = blocksize - start_off;
2476 			error = gfs2_block_zero_range(inode, offset, len);
2477 			if (error)
2478 				goto out;
2479 			if (start_off + length < blocksize)
2480 				end_len = 0;
2481 		}
2482 		if (end_len) {
2483 			error = gfs2_block_zero_range(inode,
2484 				offset + length - end_len, end_len);
2485 			if (error)
2486 				goto out;
2487 		}
2488 	}
2489 
2490 	if (gfs2_is_jdata(ip)) {
2491 		BUG_ON(!current->journal_info);
2492 		gfs2_journaled_truncate_range(inode, offset, length);
2493 	} else
2494 		truncate_pagecache_range(inode, offset, offset + length - 1);
2495 
2496 	file_update_time(file);
2497 	mark_inode_dirty(inode);
2498 
2499 	if (current->journal_info)
2500 		gfs2_trans_end(sdp);
2501 
2502 	if (!gfs2_is_stuffed(ip))
2503 		error = punch_hole(ip, offset, length);
2504 
2505 out:
2506 	if (current->journal_info)
2507 		gfs2_trans_end(sdp);
2508 	return error;
2509 }
2510