xref: /openbmc/linux/fs/gfs2/bmap.c (revision e65e175b07bef5974045cc42238de99057669ca7)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
4  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
5  */
6 
7 #include <linux/spinlock.h>
8 #include <linux/completion.h>
9 #include <linux/buffer_head.h>
10 #include <linux/blkdev.h>
11 #include <linux/gfs2_ondisk.h>
12 #include <linux/crc32.h>
13 #include <linux/iomap.h>
14 #include <linux/ktime.h>
15 
16 #include "gfs2.h"
17 #include "incore.h"
18 #include "bmap.h"
19 #include "glock.h"
20 #include "inode.h"
21 #include "meta_io.h"
22 #include "quota.h"
23 #include "rgrp.h"
24 #include "log.h"
25 #include "super.h"
26 #include "trans.h"
27 #include "dir.h"
28 #include "util.h"
29 #include "aops.h"
30 #include "trace_gfs2.h"
31 
32 /*
 * A path through an inode's metadata tree, one slot per tree height.
 *
 * mp_list[] doesn't need to be that large: a 4k block holds at most 512
 * 64-bit pointers, so __u16 is fine for that.  Keeping it small saves
 * stack space.
 */
36 struct metapath {
37 	struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT]; /* buffer per height; [0] is the dinode buffer */
38 	__u16 mp_list[GFS2_MAX_META_HEIGHT]; /* pointer index used at each height */
39 	int mp_fheight; /* find_metapath height */
40 	int mp_aheight; /* actual height (lookup height) */
41 };
42 
43 static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length);
44 
45 /**
46  * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
47  * @ip: the inode
48  * @dibh: the dinode buffer
49  * @block: the block number that was allocated
50  * @page: The (optional) page. This is looked up if @page is NULL
51  *
52  * Returns: errno
53  */
54 
55 static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
56 			       u64 block, struct page *page)
57 {
58 	struct inode *inode = &ip->i_inode;
59 
60 	if (!PageUptodate(page)) {
61 		void *kaddr = kmap(page);
62 		u64 dsize = i_size_read(inode);
63 
64 		memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
65 		memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
66 		kunmap(page);
67 
68 		SetPageUptodate(page);
69 	}
70 
71 	if (gfs2_is_jdata(ip)) {
72 		struct buffer_head *bh;
73 
74 		if (!page_has_buffers(page))
75 			create_empty_buffers(page, BIT(inode->i_blkbits),
76 					     BIT(BH_Uptodate));
77 
78 		bh = page_buffers(page);
79 		if (!buffer_mapped(bh))
80 			map_bh(bh, inode->i_sb, block);
81 
82 		set_buffer_uptodate(bh);
83 		gfs2_trans_add_data(ip->i_gl, bh);
84 	} else {
85 		set_page_dirty(page);
86 		gfs2_ordered_add_inode(ip);
87 	}
88 
89 	return 0;
90 }
91 
92 static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct page *page)
93 {
94 	struct buffer_head *bh, *dibh;
95 	struct gfs2_dinode *di;
96 	u64 block = 0;
97 	int isdir = gfs2_is_dir(ip);
98 	int error;
99 
100 	error = gfs2_meta_inode_buffer(ip, &dibh);
101 	if (error)
102 		return error;
103 
104 	if (i_size_read(&ip->i_inode)) {
105 		/* Get a free block, fill it with the stuffed data,
106 		   and write it out to disk */
107 
108 		unsigned int n = 1;
109 		error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
110 		if (error)
111 			goto out_brelse;
112 		if (isdir) {
113 			gfs2_trans_remove_revoke(GFS2_SB(&ip->i_inode), block, 1);
114 			error = gfs2_dir_get_new_buffer(ip, block, &bh);
115 			if (error)
116 				goto out_brelse;
117 			gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
118 					      dibh, sizeof(struct gfs2_dinode));
119 			brelse(bh);
120 		} else {
121 			error = gfs2_unstuffer_page(ip, dibh, block, page);
122 			if (error)
123 				goto out_brelse;
124 		}
125 	}
126 
127 	/*  Set up the pointer to the new block  */
128 
129 	gfs2_trans_add_meta(ip->i_gl, dibh);
130 	di = (struct gfs2_dinode *)dibh->b_data;
131 	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
132 
133 	if (i_size_read(&ip->i_inode)) {
134 		*(__be64 *)(di + 1) = cpu_to_be64(block);
135 		gfs2_add_inode_blocks(&ip->i_inode, 1);
136 		di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
137 	}
138 
139 	ip->i_height = 1;
140 	di->di_height = cpu_to_be16(1);
141 
142 out_brelse:
143 	brelse(dibh);
144 	return error;
145 }
146 
147 /**
148  * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
149  * @ip: The GFS2 inode to unstuff
150  *
151  * This routine unstuffs a dinode and returns it to a "normal" state such
152  * that the height can be grown in the traditional way.
153  *
154  * Returns: errno
155  */
156 
157 int gfs2_unstuff_dinode(struct gfs2_inode *ip)
158 {
159 	struct inode *inode = &ip->i_inode;
160 	struct page *page;
161 	int error;
162 
163 	down_write(&ip->i_rw_mutex);
164 	page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
165 	error = -ENOMEM;
166 	if (!page)
167 		goto out;
168 	error = __gfs2_unstuff_inode(ip, page);
169 	unlock_page(page);
170 	put_page(page);
171 out:
172 	up_write(&ip->i_rw_mutex);
173 	return error;
174 }
175 
176 /**
177  * find_metapath - Find path through the metadata tree
178  * @sdp: The superblock
179  * @block: The disk block to look up
180  * @mp: The metapath to return the result in
181  * @height: The pre-calculated height of the metadata tree
182  *
183  *   This routine returns a struct metapath structure that defines a path
184  *   through the metadata of inode "ip" to get to block "block".
185  *
186  *   Example:
187  *   Given:  "ip" is a height 3 file, "offset" is 101342453, and this is a
188  *   filesystem with a blocksize of 4096.
189  *
190  *   find_metapath() would return a struct metapath structure set to:
191  *   mp_fheight = 3, mp_list[0] = 0, mp_list[1] = 48, and mp_list[2] = 165.
192  *
193  *   That means that in order to get to the block containing the byte at
194  *   offset 101342453, we would load the indirect block pointed to by pointer
195  *   0 in the dinode.  We would then load the indirect block pointed to by
196  *   pointer 48 in that indirect block.  We would then load the data block
197  *   pointed to by pointer 165 in that indirect block.
198  *
199  *             ----------------------------------------
200  *             | Dinode |                             |
201  *             |        |                            4|
202  *             |        |0 1 2 3 4 5                 9|
203  *             |        |                            6|
204  *             ----------------------------------------
205  *                       |
206  *                       |
207  *                       V
208  *             ----------------------------------------
209  *             | Indirect Block                       |
210  *             |                                     5|
211  *             |            4 4 4 4 4 5 5            1|
212  *             |0           5 6 7 8 9 0 1            2|
213  *             ----------------------------------------
214  *                                |
215  *                                |
216  *                                V
217  *             ----------------------------------------
218  *             | Indirect Block                       |
219  *             |                         1 1 1 1 1   5|
220  *             |                         6 6 6 6 6   1|
221  *             |0                        3 4 5 6 7   2|
222  *             ----------------------------------------
223  *                                           |
224  *                                           |
225  *                                           V
226  *             ----------------------------------------
227  *             | Data block containing offset         |
228  *             |            101342453                 |
229  *             |                                      |
230  *             |                                      |
231  *             ----------------------------------------
232  *
233  */
234 
235 static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
236 			  struct metapath *mp, unsigned int height)
237 {
238 	unsigned int i;
239 
240 	mp->mp_fheight = height;
241 	for (i = height; i--;)
242 		mp->mp_list[i] = do_div(block, sdp->sd_inptrs);
243 }
244 
245 static inline unsigned int metapath_branch_start(const struct metapath *mp)
246 {
247 	if (mp->mp_list[0] == 0)
248 		return 2;
249 	return 1;
250 }
251 
252 /**
253  * metaptr1 - Return the first possible metadata pointer in a metapath buffer
254  * @height: The metadata height (0 = dinode)
255  * @mp: The metapath
256  */
257 static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp)
258 {
259 	struct buffer_head *bh = mp->mp_bh[height];
260 	if (height == 0)
261 		return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)));
262 	return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header)));
263 }
264 
265 /**
266  * metapointer - Return pointer to start of metadata in a buffer
267  * @height: The metadata height (0 = dinode)
268  * @mp: The metapath
269  *
270  * Return a pointer to the block number of the next height of the metadata
271  * tree given a buffer containing the pointer to the current height of the
272  * metadata tree.
273  */
274 
275 static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
276 {
277 	__be64 *p = metaptr1(height, mp);
278 	return p + mp->mp_list[height];
279 }
280 
281 static inline const __be64 *metaend(unsigned int height, const struct metapath *mp)
282 {
283 	const struct buffer_head *bh = mp->mp_bh[height];
284 	return (const __be64 *)(bh->b_data + bh->b_size);
285 }
286 
287 static void clone_metapath(struct metapath *clone, struct metapath *mp)
288 {
289 	unsigned int hgt;
290 
291 	*clone = *mp;
292 	for (hgt = 0; hgt < mp->mp_aheight; hgt++)
293 		get_bh(clone->mp_bh[hgt]);
294 }
295 
296 static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end)
297 {
298 	const __be64 *t;
299 
300 	for (t = start; t < end; t++) {
301 		struct buffer_head *rabh;
302 
303 		if (!*t)
304 			continue;
305 
306 		rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE);
307 		if (trylock_buffer(rabh)) {
308 			if (!buffer_uptodate(rabh)) {
309 				rabh->b_end_io = end_buffer_read_sync;
310 				submit_bh(REQ_OP_READ | REQ_RAHEAD | REQ_META |
311 					  REQ_PRIO, rabh);
312 				continue;
313 			}
314 			unlock_buffer(rabh);
315 		}
316 		brelse(rabh);
317 	}
318 }
319 
320 static int __fillup_metapath(struct gfs2_inode *ip, struct metapath *mp,
321 			     unsigned int x, unsigned int h)
322 {
323 	for (; x < h; x++) {
324 		__be64 *ptr = metapointer(x, mp);
325 		u64 dblock = be64_to_cpu(*ptr);
326 		int ret;
327 
328 		if (!dblock)
329 			break;
330 		ret = gfs2_meta_buffer(ip, GFS2_METATYPE_IN, dblock, &mp->mp_bh[x + 1]);
331 		if (ret)
332 			return ret;
333 	}
334 	mp->mp_aheight = x + 1;
335 	return 0;
336 }
337 
338 /**
339  * lookup_metapath - Walk the metadata tree to a specific point
340  * @ip: The inode
341  * @mp: The metapath
342  *
343  * Assumes that the inode's buffer has already been looked up and
344  * hooked onto mp->mp_bh[0] and that the metapath has been initialised
345  * by find_metapath().
346  *
347  * If this function encounters part of the tree which has not been
348  * allocated, it returns the current height of the tree at the point
349  * at which it found the unallocated block. Blocks which are found are
350  * added to the mp->mp_bh[] list.
351  *
352  * Returns: error
353  */
354 
355 static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
356 {
357 	return __fillup_metapath(ip, mp, 0, ip->i_height - 1);
358 }
359 
360 /**
361  * fillup_metapath - fill up buffers for the metadata path to a specific height
362  * @ip: The inode
363  * @mp: The metapath
364  * @h: The height to which it should be mapped
365  *
366  * Similar to lookup_metapath, but does lookups for a range of heights
367  *
368  * Returns: error or the number of buffers filled
369  */
370 
371 static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h)
372 {
373 	unsigned int x = 0;
374 	int ret;
375 
376 	if (h) {
377 		/* find the first buffer we need to look up. */
378 		for (x = h - 1; x > 0; x--) {
379 			if (mp->mp_bh[x])
380 				break;
381 		}
382 	}
383 	ret = __fillup_metapath(ip, mp, x, h);
384 	if (ret)
385 		return ret;
386 	return mp->mp_aheight - x - 1;
387 }
388 
389 static sector_t metapath_to_block(struct gfs2_sbd *sdp, struct metapath *mp)
390 {
391 	sector_t factor = 1, block = 0;
392 	int hgt;
393 
394 	for (hgt = mp->mp_fheight - 1; hgt >= 0; hgt--) {
395 		if (hgt < mp->mp_aheight)
396 			block += mp->mp_list[hgt] * factor;
397 		factor *= sdp->sd_inptrs;
398 	}
399 	return block;
400 }
401 
402 static void release_metapath(struct metapath *mp)
403 {
404 	int i;
405 
406 	for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) {
407 		if (mp->mp_bh[i] == NULL)
408 			break;
409 		brelse(mp->mp_bh[i]);
410 		mp->mp_bh[i] = NULL;
411 	}
412 }
413 
414 /**
415  * gfs2_extent_length - Returns length of an extent of blocks
416  * @bh: The metadata block
417  * @ptr: Current position in @bh
418  * @limit: Max extent length to return
419  * @eob: Set to 1 if we hit "end of block"
420  *
421  * Returns: The length of the extent (minimum of one block)
422  */
423 
424 static inline unsigned int gfs2_extent_length(struct buffer_head *bh, __be64 *ptr, size_t limit, int *eob)
425 {
426 	const __be64 *end = (__be64 *)(bh->b_data + bh->b_size);
427 	const __be64 *first = ptr;
428 	u64 d = be64_to_cpu(*ptr);
429 
430 	*eob = 0;
431 	do {
432 		ptr++;
433 		if (ptr >= end)
434 			break;
435 		d++;
436 	} while(be64_to_cpu(*ptr) == d);
437 	if (ptr >= end)
438 		*eob = 1;
439 	return ptr - first;
440 }
441 
/*
 * Verdict returned by a gfs2_metadata_walker to gfs2_walk_metadata():
 * WALK_STOP ends the walk (gfs2_walk_metadata() then returns 1),
 * WALK_FOLLOW asks it to descend through the pointer the walker selected,
 * and WALK_CONTINUE lets it move on to the next indirect block.
 */
442 enum walker_status { WALK_STOP, WALK_FOLLOW, WALK_CONTINUE };
443 
444 /*
445  * gfs2_metadata_walker - walk an indirect block
446  * @mp: Metapath to indirect block
447  * @ptrs: Number of pointers to look at
448  *
449  * When returning WALK_FOLLOW, the walker must update @mp to point at the right
450  * indirect block to follow.
451  */
452 typedef enum walker_status (*gfs2_metadata_walker)(struct metapath *mp,
453 						   unsigned int ptrs);
454 
455 /*
456  * gfs2_walk_metadata - walk a tree of indirect blocks
457  * @inode: The inode
458  * @mp: Starting point of walk
459  * @max_len: Maximum number of blocks to walk
460  * @walker: Called during the walk
461  *
462  * Returns 1 if the walk was stopped by @walker, 0 if we went past @max_len or
463  * past the end of metadata, and a negative error code otherwise.
 *
 * @mp is modified as the walk proceeds: buffers of levels that have been
 * walked completely are released and their mp_bh[]/mp_list[] slots reset,
 * and mp_aheight is updated whenever the walk descends again.
464  */
465 
466 static int gfs2_walk_metadata(struct inode *inode, struct metapath *mp,
467 		u64 max_len, gfs2_metadata_walker walker)
468 {
469 	struct gfs2_inode *ip = GFS2_I(inode);
470 	struct gfs2_sbd *sdp = GFS2_SB(inode);
	/* Number of data blocks covered by one pointer at height hgt. */
471 	u64 factor = 1;
472 	unsigned int hgt;
473 	int ret;
474 
475 	/*
476 	 * The walk starts in the lowest allocated indirect block, which may be
477 	 * before the position indicated by @mp.  Adjust @max_len accordingly
478 	 * to avoid a short walk.
479 	 */
480 	for (hgt = mp->mp_fheight - 1; hgt >= mp->mp_aheight; hgt--) {
481 		max_len += mp->mp_list[hgt] * factor;
482 		mp->mp_list[hgt] = 0;
483 		factor *= sdp->sd_inptrs;
484 	}
	/* Given mp_aheight <= mp_fheight, hgt is now mp->mp_aheight - 1. */
485 
486 	for (;;) {
487 		u16 start = mp->mp_list[hgt];
488 		enum walker_status status;
489 		unsigned int ptrs;
490 		u64 len;
491 
492 		/* Walk indirect block. */
		/*
		 * Pointers left in this block from @start on: the dinode
		 * (hgt 0) holds sd_diptrs pointers, other heights sd_inptrs.
		 */
493 		ptrs = (hgt >= 1 ? sdp->sd_inptrs : sdp->sd_diptrs) - start;
494 		len = ptrs * factor;
		/* Do not let the walker look past the remaining @max_len. */
495 		if (len > max_len)
496 			ptrs = DIV_ROUND_UP_ULL(max_len, factor);
497 		status = walker(mp, ptrs);
498 		switch (status) {
499 		case WALK_STOP:
500 			return 1;
501 		case WALK_FOLLOW:
502 			BUG_ON(mp->mp_aheight == mp->mp_fheight);
			/* Only the pointers skipped before the one to follow count. */
503 			ptrs = mp->mp_list[hgt] - start;
504 			len = ptrs * factor;
505 			break;
506 		case WALK_CONTINUE:
507 			break;
508 		}
509 		if (len >= max_len)
510 			break;
511 		max_len -= len;
512 		if (status == WALK_FOLLOW)
513 			goto fill_up_metapath;
514 
515 lower_metapath:
516 		/* Decrease height of metapath. */
517 		brelse(mp->mp_bh[hgt]);
518 		mp->mp_bh[hgt] = NULL;
519 		mp->mp_list[hgt] = 0;
		/* The dinode itself has been walked to its end: done. */
520 		if (!hgt)
521 			break;
522 		hgt--;
523 		factor *= sdp->sd_inptrs;
524 
525 		/* Advance in metadata tree. */
526 		(mp->mp_list[hgt])++;
527 		if (hgt) {
			/* This block is exhausted as well: go up another level. */
528 			if (mp->mp_list[hgt] >= sdp->sd_inptrs)
529 				goto lower_metapath;
530 		} else {
531 			if (mp->mp_list[hgt] >= sdp->sd_diptrs)
532 				break;
533 		}
534 
535 fill_up_metapath:
536 		/* Increase height of metapath. */
537 		ret = fillup_metapath(ip, mp, ip->i_height - 1);
538 		if (ret < 0)
539 			return ret;
		/* ret levels were gained; factor shrinks by sd_inptrs per level. */
540 		hgt += ret;
541 		for (; ret; ret--)
542 			do_div(factor, sdp->sd_inptrs);
543 		mp->mp_aheight = hgt + 1;
544 	}
545 	return 0;
546 }
547 
548 static enum walker_status gfs2_hole_walker(struct metapath *mp,
549 					   unsigned int ptrs)
550 {
551 	const __be64 *start, *ptr, *end;
552 	unsigned int hgt;
553 
554 	hgt = mp->mp_aheight - 1;
555 	start = metapointer(hgt, mp);
556 	end = start + ptrs;
557 
558 	for (ptr = start; ptr < end; ptr++) {
559 		if (*ptr) {
560 			mp->mp_list[hgt] += ptr - start;
561 			if (mp->mp_aheight == mp->mp_fheight)
562 				return WALK_STOP;
563 			return WALK_FOLLOW;
564 		}
565 	}
566 	return WALK_CONTINUE;
567 }
568 
569 /**
570  * gfs2_hole_size - figure out the size of a hole
571  * @inode: The inode
572  * @lblock: The logical starting block number
573  * @len: How far to look (in blocks)
574  * @mp: The metapath at lblock
575  * @iomap: The iomap to store the hole size in
576  *
577  * This function modifies @mp.
578  *
579  * Returns: errno on error
580  */
581 static int gfs2_hole_size(struct inode *inode, sector_t lblock, u64 len,
582 			  struct metapath *mp, struct iomap *iomap)
583 {
584 	struct metapath clone;
585 	u64 hole_size;
586 	int ret;
587 
588 	clone_metapath(&clone, mp);
589 	ret = gfs2_walk_metadata(inode, &clone, len, gfs2_hole_walker);
590 	if (ret < 0)
591 		goto out;
592 
593 	if (ret == 1)
594 		hole_size = metapath_to_block(GFS2_SB(inode), &clone) - lblock;
595 	else
596 		hole_size = len;
597 	iomap->length = hole_size << inode->i_blkbits;
598 	ret = 0;
599 
600 out:
601 	release_metapath(&clone);
602 	return ret;
603 }
604 
605 static inline void gfs2_indirect_init(struct metapath *mp,
606 				      struct gfs2_glock *gl, unsigned int i,
607 				      unsigned offset, u64 bn)
608 {
609 	__be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data +
610 		       ((i > 1) ? sizeof(struct gfs2_meta_header) :
611 				 sizeof(struct gfs2_dinode)));
612 	BUG_ON(i < 1);
613 	BUG_ON(mp->mp_bh[i] != NULL);
614 	mp->mp_bh[i] = gfs2_meta_new(gl, bn);
615 	gfs2_trans_add_meta(gl, mp->mp_bh[i]);
616 	gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
617 	gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
618 	ptr += offset;
619 	*ptr = cpu_to_be64(bn);
620 }
621 
/*
 * States of the allocation state machine in __gfs2_iomap_alloc().  A state
 * falls through to the next lower one once its work is done and blocks of
 * the current extent are left over.
 */
622 enum alloc_state {
623 	ALLOC_DATA = 0,		/* tree is complete, add data blocks */
624 	ALLOC_GROW_DEPTH = 1,	/* add indirect blocks below an existing tree */
625 	ALLOC_GROW_HEIGHT = 2,	/* add indirect blocks to make the tree higher */
626 	/* ALLOC_UNSTUFF = 3,   TBD and rather complicated */
627 };
628 
629 /**
630  * __gfs2_iomap_alloc - Build a metadata tree of the requested height
631  * @inode: The GFS2 inode
632  * @iomap: The iomap structure
633  * @mp: The metapath, with proper height information calculated
634  *
635  * In this routine we may have to alloc:
636  *   i) Indirect blocks to grow the metadata tree height
637  *  ii) Indirect blocks to fill in lower part of the metadata tree
638  * iii) Data blocks
639  *
640  * This function is called after __gfs2_iomap_get, which works out the
641  * total number of blocks which we need via gfs2_alloc_size.
642  *
643  * We then do the actual allocation asking for an extent at a time (if
644  * enough contiguous free blocks are available, there will only be one
645  * allocation request per call) and uses the state machine to initialise
646  * the blocks in order.
647  *
648  * Right now, this function will allocate at most one indirect block
649  * worth of data -- with a default block size of 4K, that's slightly
650  * less than 2M.  If this limitation is ever removed to allow huge
651  * allocations, we would probably still want to limit the iomap size we
652  * return to avoid stalling other tasks during huge writes; the next
653  * iomap iteration would then find the blocks already allocated.
654  *
 * The allocation runs with ip->i_rw_mutex held for writing.  On success
 * @iomap describes the newly allocated data extent (addr, length, type
 * IOMAP_MAPPED, flags IOMAP_F_MERGED | IOMAP_F_NEW) and the inode's height
 * and block count have been updated and written into the dinode buffer.
 *
655  * Returns: errno on error
656  */
657 
658 static int __gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
659 			      struct metapath *mp)
660 {
661 	struct gfs2_inode *ip = GFS2_I(inode);
662 	struct gfs2_sbd *sdp = GFS2_SB(inode);
663 	struct buffer_head *dibh = mp->mp_bh[0];
664 	u64 bn;
665 	unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
666 	size_t dblks = iomap->length >> inode->i_blkbits;
667 	const unsigned end_of_metadata = mp->mp_fheight - 1;
668 	int ret;
669 	enum alloc_state state;
670 	__be64 *ptr;
671 	__be64 zero_bn = 0;
672 
673 	BUG_ON(mp->mp_aheight < 1);
674 	BUG_ON(dibh == NULL);
675 	BUG_ON(dblks < 1);
676 
677 	gfs2_trans_add_meta(ip->i_gl, dibh);
678 
679 	down_write(&ip->i_rw_mutex);
680 
	/* Pick the starting state and count the indirect blocks needed. */
681 	if (mp->mp_fheight == mp->mp_aheight) {
682 		/* Bottom indirect block exists */
683 		state = ALLOC_DATA;
684 	} else {
685 		/* Need to allocate indirect blocks */
686 		if (mp->mp_fheight == ip->i_height) {
687 			/* Writing into existing tree, extend tree down */
688 			iblks = mp->mp_fheight - mp->mp_aheight;
689 			state = ALLOC_GROW_DEPTH;
690 		} else {
691 			/* Building up tree height */
692 			state = ALLOC_GROW_HEIGHT;
693 			iblks = mp->mp_fheight - ip->i_height;
694 			branch_start = metapath_branch_start(mp);
695 			iblks += (mp->mp_fheight - branch_start);
696 		}
697 	}
698 
699 	/* start of the second part of the function (state machine) */
700 
701 	blks = dblks + iblks;
702 	i = mp->mp_aheight;
	/*
	 * Each iteration requests everything that is still missing; below, bn
	 * and n are used as the start and length of the extent obtained.  The
	 * loop ends once ALLOC_DATA has stored a real address in iomap->addr.
	 */
703 	do {
704 		n = blks - alloced;
705 		ret = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
706 		if (ret)
707 			goto out;
708 		alloced += n;
709 		if (state != ALLOC_DATA || gfs2_is_jdata(ip))
710 			gfs2_trans_remove_revoke(sdp, bn, n);
711 		switch (state) {
712 		/* Growing height of tree */
713 		case ALLOC_GROW_HEIGHT:
714 			if (i == 1) {
				/* Save the dinode's first pointer; it is restored below. */
715 				ptr = (__be64 *)(dibh->b_data +
716 						 sizeof(struct gfs2_dinode));
717 				zero_bn = *ptr;
718 			}
719 			for (; i - 1 < mp->mp_fheight - ip->i_height && n > 0;
720 			     i++, n--)
721 				gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++);
722 			if (i - 1 == mp->mp_fheight - ip->i_height) {
				/*
				 * All new top levels exist: move the dinode's old
				 * pointer area into the lowest new indirect block
				 * and leave only the first pointer in the dinode.
				 */
723 				i--;
724 				gfs2_buffer_copy_tail(mp->mp_bh[i],
725 						sizeof(struct gfs2_meta_header),
726 						dibh, sizeof(struct gfs2_dinode));
727 				gfs2_buffer_clear_tail(dibh,
728 						sizeof(struct gfs2_dinode) +
729 						sizeof(__be64));
730 				ptr = (__be64 *)(mp->mp_bh[i]->b_data +
731 					sizeof(struct gfs2_meta_header));
732 				*ptr = zero_bn;
733 				state = ALLOC_GROW_DEPTH;
				/* Drop the buffers from branch_start on; that part is rebuilt next. */
734 				for(i = branch_start; i < mp->mp_fheight; i++) {
735 					if (mp->mp_bh[i] == NULL)
736 						break;
737 					brelse(mp->mp_bh[i]);
738 					mp->mp_bh[i] = NULL;
739 				}
740 				i = branch_start;
741 			}
742 			if (n == 0)
743 				break;
744 			fallthrough;	/* To branching from existing tree */
745 		case ALLOC_GROW_DEPTH:
746 			if (i > 1 && i < mp->mp_fheight)
747 				gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]);
748 			for (; i < mp->mp_fheight && n > 0; i++, n--)
749 				gfs2_indirect_init(mp, ip->i_gl, i,
750 						   mp->mp_list[i-1], bn++);
751 			if (i == mp->mp_fheight)
752 				state = ALLOC_DATA;
753 			if (n == 0)
754 				break;
755 			fallthrough;	/* To tree complete, adding data blocks */
756 		case ALLOC_DATA:
757 			BUG_ON(n > dblks);
758 			BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
759 			gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]);
			/* The mapping returned covers only the blocks of this extent. */
760 			dblks = n;
761 			ptr = metapointer(end_of_metadata, mp);
762 			iomap->addr = bn << inode->i_blkbits;
763 			iomap->flags |= IOMAP_F_MERGED | IOMAP_F_NEW;
764 			while (n-- > 0)
765 				*ptr++ = cpu_to_be64(bn++);
766 			break;
767 		}
768 	} while (iomap->addr == IOMAP_NULL_ADDR);
769 
770 	iomap->type = IOMAP_MAPPED;
771 	iomap->length = (u64)dblks << inode->i_blkbits;
772 	ip->i_height = mp->mp_fheight;
773 	gfs2_add_inode_blocks(&ip->i_inode, alloced);
774 	gfs2_dinode_out(ip, dibh->b_data);
775 out:
776 	up_write(&ip->i_rw_mutex);
777 	return ret;
778 }
779 
/*
 * Private iomap flag: set by __gfs2_iomap_get() when the mapped extent runs
 * up to the end of its indirect block.
 */
780 #define IOMAP_F_GFS2_BOUNDARY IOMAP_F_PRIVATE
781 
782 /**
783  * gfs2_alloc_size - Compute the maximum allocation size
784  * @inode: The inode
785  * @mp: The metapath
786  * @size: Requested size in blocks
787  *
788  * Compute the maximum size of the next allocation at @mp.
789  *
790  * Returns: size in blocks
791  */
792 static u64 gfs2_alloc_size(struct inode *inode, struct metapath *mp, u64 size)
793 {
794 	struct gfs2_inode *ip = GFS2_I(inode);
795 	struct gfs2_sbd *sdp = GFS2_SB(inode);
796 	const __be64 *first, *ptr, *end;
797 
798 	/*
799 	 * For writes to stuffed files, this function is called twice via
800 	 * __gfs2_iomap_get, before and after unstuffing. The size we return the
801 	 * first time needs to be large enough to get the reservation and
802 	 * allocation sizes right.  The size we return the second time must
803 	 * be exact or else __gfs2_iomap_alloc won't do the right thing.
804 	 */
805 
806 	if (gfs2_is_stuffed(ip) || mp->mp_fheight != mp->mp_aheight) {
807 		unsigned int maxsize = mp->mp_fheight > 1 ?
808 			sdp->sd_inptrs : sdp->sd_diptrs;
809 		maxsize -= mp->mp_list[mp->mp_fheight - 1];
810 		if (size > maxsize)
811 			size = maxsize;
812 		return size;
813 	}
814 
815 	first = metapointer(ip->i_height - 1, mp);
816 	end = metaend(ip->i_height - 1, mp);
817 	if (end - first > size)
818 		end = first + size;
819 	for (ptr = first; ptr < end; ptr++) {
820 		if (*ptr)
821 			break;
822 	}
823 	return ptr - first;
824 }
825 
826 /**
827  * __gfs2_iomap_get - Map blocks from an inode to disk blocks
828  * @inode: The inode
829  * @pos: Starting position in bytes
830  * @length: Length to map, in bytes
831  * @flags: iomap flags
832  * @iomap: The iomap structure
833  * @mp: The metapath
834  *
 * Runs with ip->i_rw_mutex held for reading.  Nothing is allocated here:
 * the result is an inline mapping (stuffed inode), a mapped extent, or a
 * hole whose length tells the caller how much could be allocated or
 * skipped.  The buffer references stored in @mp are not dropped by this
 * function.
 *
835  * Returns: errno
836  */
837 static int __gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
838 			    unsigned flags, struct iomap *iomap,
839 			    struct metapath *mp)
840 {
841 	struct gfs2_inode *ip = GFS2_I(inode);
842 	struct gfs2_sbd *sdp = GFS2_SB(inode);
843 	loff_t size = i_size_read(inode);
844 	__be64 *ptr;
845 	sector_t lblock;
846 	sector_t lblock_stop;
847 	int ret;
848 	int eob;
849 	u64 len;
850 	struct buffer_head *dibh = NULL, *bh;
851 	u8 height;
852 
853 	if (!length)
854 		return -EINVAL;
855 
856 	down_read(&ip->i_rw_mutex);
857 
858 	ret = gfs2_meta_inode_buffer(ip, &dibh);
859 	if (ret)
860 		goto unlock;
861 	mp->mp_bh[0] = dibh;
862 
863 	if (gfs2_is_stuffed(ip)) {
864 		if (flags & IOMAP_WRITE) {
865 			loff_t max_size = gfs2_max_stuffed_size(ip);
866 
			/* The write does not fit into the dinode: map it as block I/O. */
867 			if (pos + length > max_size)
868 				goto unstuff;
869 			iomap->length = max_size;
870 		} else {
871 			if (pos >= size) {
				/* Beyond EOF: nothing to report, or a hole. */
872 				if (flags & IOMAP_REPORT) {
873 					ret = -ENOENT;
874 					goto unlock;
875 				} else {
876 					iomap->offset = pos;
877 					iomap->length = length;
878 					goto hole_found;
879 				}
880 			}
881 			iomap->length = size;
882 		}
		/* Inline mapping: the data lives right after the dinode header. */
883 		iomap->addr = (ip->i_no_addr << inode->i_blkbits) +
884 			      sizeof(struct gfs2_dinode);
885 		iomap->type = IOMAP_INLINE;
886 		iomap->inline_data = dibh->b_data + sizeof(struct gfs2_dinode);
887 		goto out;
888 	}
889 
890 unstuff:
	/* Round the request out to whole blocks. */
891 	lblock = pos >> inode->i_blkbits;
892 	iomap->offset = lblock << inode->i_blkbits;
893 	lblock_stop = (pos + length - 1) >> inode->i_blkbits;
894 	len = lblock_stop - lblock + 1;
895 	iomap->length = len << inode->i_blkbits;
896 
	/* Find the smallest tree height that can address lblock. */
897 	height = ip->i_height;
898 	while ((lblock + 1) * sdp->sd_sb.sb_bsize > sdp->sd_heightsize[height])
899 		height++;
900 	find_metapath(sdp, lblock, mp, height);
	/* Tree too low, or inode still stuffed: nothing is mapped at lblock. */
901 	if (height > ip->i_height || gfs2_is_stuffed(ip))
902 		goto do_alloc;
903 
904 	ret = lookup_metapath(ip, mp);
905 	if (ret)
906 		goto unlock;
907 
	/* Part of the path is missing, or the data pointer itself is zero. */
908 	if (mp->mp_aheight != ip->i_height)
909 		goto do_alloc;
910 
911 	ptr = metapointer(ip->i_height - 1, mp);
912 	if (*ptr == 0)
913 		goto do_alloc;
914 
915 	bh = mp->mp_bh[ip->i_height - 1];
	/* NOTE(review): gfs2_extent_length() does not use its limit argument. */
916 	len = gfs2_extent_length(bh, ptr, len, &eob);
917 
918 	iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits;
919 	iomap->length = len << inode->i_blkbits;
920 	iomap->type = IOMAP_MAPPED;
921 	iomap->flags |= IOMAP_F_MERGED;
922 	if (eob)
923 		iomap->flags |= IOMAP_F_GFS2_BOUNDARY;
924 
925 out:
926 	iomap->bdev = inode->i_sb->s_bdev;
927 unlock:
928 	up_read(&ip->i_rw_mutex);
929 	return ret;
930 
931 do_alloc:
	/*
	 * Despite the label's name nothing is allocated here; the range is
	 * reported as a hole and its length is adjusted to the kind of request.
	 */
932 	if (flags & IOMAP_REPORT) {
933 		if (pos >= size)
934 			ret = -ENOENT;
935 		else if (height == ip->i_height)
936 			ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
937 		else
938 			iomap->length = size - iomap->offset;
939 	} else if (flags & IOMAP_WRITE) {
940 		u64 alloc_size;
941 
942 		if (flags & IOMAP_DIRECT)
943 			goto out;  /* (see gfs2_file_direct_write) */
944 
		/* Trim the hole to what a single allocation can cover. */
945 		len = gfs2_alloc_size(inode, mp, len);
946 		alloc_size = len << inode->i_blkbits;
947 		if (alloc_size < iomap->length)
948 			iomap->length = alloc_size;
949 	} else {
950 		if (pos < size && height == ip->i_height)
951 			ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
952 	}
953 hole_found:
954 	iomap->addr = IOMAP_NULL_ADDR;
955 	iomap->type = IOMAP_HOLE;
956 	goto out;
957 }
958 
959 static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos,
960 				   unsigned len)
961 {
962 	unsigned int blockmask = i_blocksize(inode) - 1;
963 	struct gfs2_sbd *sdp = GFS2_SB(inode);
964 	unsigned int blocks;
965 
966 	blocks = ((pos & blockmask) + len + blockmask) >> inode->i_blkbits;
967 	return gfs2_trans_begin(sdp, RES_DINODE + blocks, 0);
968 }
969 
970 static void gfs2_iomap_page_done(struct inode *inode, loff_t pos,
971 				 unsigned copied, struct page *page)
972 {
973 	struct gfs2_trans *tr = current->journal_info;
974 	struct gfs2_inode *ip = GFS2_I(inode);
975 	struct gfs2_sbd *sdp = GFS2_SB(inode);
976 
977 	if (page && !gfs2_is_stuffed(ip))
978 		gfs2_page_add_databufs(ip, page, offset_in_page(pos), copied);
979 
980 	if (tr->tr_num_buf_new)
981 		__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
982 
983 	gfs2_trans_end(sdp);
984 }
985 
/*
 * Per-page hooks that wrap each page write in its own transaction;
 * gfs2_iomap_begin_write() installs them for stuffed and jdata inodes.
 */
986 static const struct iomap_page_ops gfs2_iomap_page_ops = {
987 	.page_prepare = gfs2_iomap_page_prepare,
988 	.page_done = gfs2_iomap_page_done,
989 };
990 
/*
 * gfs2_iomap_begin_write - prepare a buffered write for the range in @iomap
 * @inode: The inode
 * @pos: Starting position of the write in bytes
 * @length: Length of the write in bytes
 * @flags: iomap flags
 * @iomap: The mapping previously filled in by __gfs2_iomap_get()
 * @mp: The metapath that belongs to @iomap
 *
 * If the write does not fit into a stuffed inode, the inode is unstuffed;
 * if the mapping is a hole, blocks are allocated for it.  Both happen
 * inside one transaction, after a quota check and a block reservation.
 *
 * Returns: 0 on success or a negative error code
 */
991 static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
992 				  loff_t length, unsigned flags,
993 				  struct iomap *iomap,
994 				  struct metapath *mp)
995 {
996 	struct gfs2_inode *ip = GFS2_I(inode);
997 	struct gfs2_sbd *sdp = GFS2_SB(inode);
998 	bool unstuff;
999 	int ret;
1000 
1001 	unstuff = gfs2_is_stuffed(ip) &&
1002 		  pos + length > gfs2_max_stuffed_size(ip);
1003 
1004 	if (unstuff || iomap->type == IOMAP_HOLE) {
1005 		unsigned int data_blocks, ind_blocks;
1006 		struct gfs2_alloc_parms ap = {};
1007 		unsigned int rblocks;
1008 		struct gfs2_trans *tr;
1009 
		/* Work out how many blocks the write may need and reserve them. */
1010 		gfs2_write_calc_reserv(ip, iomap->length, &data_blocks,
1011 				       &ind_blocks);
1012 		ap.target = data_blocks + ind_blocks;
1013 		ret = gfs2_quota_lock_check(ip, &ap);
1014 		if (ret)
1015 			return ret;
1016 
1017 		ret = gfs2_inplace_reserve(ip, &ap);
1018 		if (ret)
1019 			goto out_qunlock;
1020 
		/* Size the transaction: dinode, indirect blocks, and extras. */
1021 		rblocks = RES_DINODE + ind_blocks;
1022 		if (gfs2_is_jdata(ip))
1023 			rblocks += data_blocks;
1024 		if (ind_blocks || data_blocks)
1025 			rblocks += RES_STATFS + RES_QUOTA;
1026 		if (inode == sdp->sd_rindex)
1027 			rblocks += 2 * RES_STATFS;
1028 		rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks);
1029 
1030 		ret = gfs2_trans_begin(sdp, rblocks,
1031 				       iomap->length >> inode->i_blkbits);
1032 		if (ret)
1033 			goto out_trans_fail;
1034 
1035 		if (unstuff) {
1036 			ret = gfs2_unstuff_dinode(ip);
1037 			if (ret)
1038 				goto out_trans_end;
			/* The old mapping is stale now: drop it and map again. */
1039 			release_metapath(mp);
1040 			ret = __gfs2_iomap_get(inode, iomap->offset,
1041 					       iomap->length, flags, iomap, mp);
1042 			if (ret)
1043 				goto out_trans_end;
1044 		}
1045 
1046 		if (iomap->type == IOMAP_HOLE) {
1047 			ret = __gfs2_iomap_alloc(inode, iomap, mp);
1048 			if (ret) {
				/*
				 * The transaction and reservation are given up
				 * before punch_hole() runs; presumably it removes
				 * blocks a partial allocation left behind -- TODO
				 * confirm.
				 */
1049 				gfs2_trans_end(sdp);
1050 				gfs2_inplace_release(ip);
1051 				punch_hole(ip, iomap->offset, iomap->length);
1052 				goto out_qunlock;
1053 			}
1054 		}
1055 
1056 		tr = current->journal_info;
1057 		if (tr->tr_num_buf_new)
1058 			__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
1059 
		/*
		 * NOTE(review): on success the block reservation and the quota
		 * lock are still held when this function returns; presumably
		 * they are dropped by the matching end-of-write handler --
		 * confirm.
		 */
1060 		gfs2_trans_end(sdp);
1061 	}
1062 
	/* Stuffed and jdata inodes get a transaction around each page write. */
1063 	if (gfs2_is_stuffed(ip) || gfs2_is_jdata(ip))
1064 		iomap->page_ops = &gfs2_iomap_page_ops;
1065 	return 0;
1066 
1067 out_trans_end:
1068 	gfs2_trans_end(sdp);
1069 out_trans_fail:
1070 	gfs2_inplace_release(ip);
1071 out_qunlock:
1072 	gfs2_quota_unlock(ip);
1073 	return ret;
1074 }
1075 
1076 static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
1077 			    unsigned flags, struct iomap *iomap,
1078 			    struct iomap *srcmap)
1079 {
1080 	struct gfs2_inode *ip = GFS2_I(inode);
1081 	struct metapath mp = { .mp_aheight = 1, };
1082 	int ret;
1083 
1084 	if (gfs2_is_jdata(ip))
1085 		iomap->flags |= IOMAP_F_BUFFER_HEAD;
1086 
1087 	trace_gfs2_iomap_start(ip, pos, length, flags);
1088 	ret = __gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
1089 	if (ret)
1090 		goto out_unlock;
1091 
1092 	switch(flags & (IOMAP_WRITE | IOMAP_ZERO)) {
1093 	case IOMAP_WRITE:
1094 		if (flags & IOMAP_DIRECT) {
1095 			/*
1096 			 * Silently fall back to buffered I/O for stuffed files
1097 			 * or if we've got a hole (see gfs2_file_direct_write).
1098 			 */
1099 			if (iomap->type != IOMAP_MAPPED)
1100 				ret = -ENOTBLK;
1101 			goto out_unlock;
1102 		}
1103 		break;
1104 	case IOMAP_ZERO:
1105 		if (iomap->type == IOMAP_HOLE)
1106 			goto out_unlock;
1107 		break;
1108 	default:
1109 		goto out_unlock;
1110 	}
1111 
1112 	ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap, &mp);
1113 
1114 out_unlock:
1115 	release_metapath(&mp);
1116 	trace_gfs2_iomap_end(ip, iomap, ret);
1117 	return ret;
1118 }
1119 
1120 static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
1121 			  ssize_t written, unsigned flags, struct iomap *iomap)
1122 {
1123 	struct gfs2_inode *ip = GFS2_I(inode);
1124 	struct gfs2_sbd *sdp = GFS2_SB(inode);
1125 
1126 	switch (flags & (IOMAP_WRITE | IOMAP_ZERO)) {
1127 	case IOMAP_WRITE:
1128 		if (flags & IOMAP_DIRECT)
1129 			return 0;
1130 		break;
1131 	case IOMAP_ZERO:
1132 		 if (iomap->type == IOMAP_HOLE)
1133 			 return 0;
1134 		 break;
1135 	default:
1136 		 return 0;
1137 	}
1138 
1139 	if (!gfs2_is_stuffed(ip))
1140 		gfs2_ordered_add_inode(ip);
1141 
1142 	if (inode == sdp->sd_rindex)
1143 		adjust_fs_space(inode);
1144 
1145 	gfs2_inplace_release(ip);
1146 
1147 	if (ip->i_qadata && ip->i_qadata->qa_qd_num)
1148 		gfs2_quota_unlock(ip);
1149 
1150 	if (length != written && (iomap->flags & IOMAP_F_NEW)) {
1151 		/* Deallocate blocks that were just allocated. */
1152 		loff_t hstart = round_up(pos + written, i_blocksize(inode));
1153 		loff_t hend = iomap->offset + iomap->length;
1154 
1155 		if (hstart < hend) {
1156 			truncate_pagecache_range(inode, hstart, hend - 1);
1157 			punch_hole(ip, hstart, hend - hstart);
1158 		}
1159 	}
1160 
1161 	if (unlikely(!written))
1162 		return 0;
1163 
1164 	if (iomap->flags & IOMAP_F_SIZE_CHANGED)
1165 		mark_inode_dirty(inode);
1166 	set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
1167 	return 0;
1168 }
1169 
/*
 * The gfs2 iomap operations.  Within this file, gfs2_block_zero_range()
 * passes this table to iomap_zero_range().
 */
const struct iomap_ops gfs2_iomap_ops = {
	.iomap_begin = gfs2_iomap_begin,
	.iomap_end = gfs2_iomap_end,
};
1174 
1175 /**
1176  * gfs2_block_map - Map one or more blocks of an inode to a disk block
1177  * @inode: The inode
1178  * @lblock: The logical block number
1179  * @bh_map: The bh to be mapped
1180  * @create: True if its ok to alloc blocks to satify the request
1181  *
1182  * The size of the requested mapping is defined in bh_map->b_size.
1183  *
1184  * Clears buffer_mapped(bh_map) and leaves bh_map->b_size unchanged
1185  * when @lblock is not mapped.  Sets buffer_mapped(bh_map) and
1186  * bh_map->b_size to indicate the size of the mapping when @lblock and
1187  * successive blocks are mapped, up to the requested size.
1188  *
1189  * Sets buffer_boundary() if a read of metadata will be required
1190  * before the next block can be mapped. Sets buffer_new() if new
1191  * blocks were allocated.
1192  *
1193  * Returns: errno
1194  */
1195 
1196 int gfs2_block_map(struct inode *inode, sector_t lblock,
1197 		   struct buffer_head *bh_map, int create)
1198 {
1199 	struct gfs2_inode *ip = GFS2_I(inode);
1200 	loff_t pos = (loff_t)lblock << inode->i_blkbits;
1201 	loff_t length = bh_map->b_size;
1202 	struct iomap iomap = { };
1203 	int ret;
1204 
1205 	clear_buffer_mapped(bh_map);
1206 	clear_buffer_new(bh_map);
1207 	clear_buffer_boundary(bh_map);
1208 	trace_gfs2_bmap(ip, bh_map, lblock, create, 1);
1209 
1210 	if (!create)
1211 		ret = gfs2_iomap_get(inode, pos, length, &iomap);
1212 	else
1213 		ret = gfs2_iomap_alloc(inode, pos, length, &iomap);
1214 	if (ret)
1215 		goto out;
1216 
1217 	if (iomap.length > bh_map->b_size) {
1218 		iomap.length = bh_map->b_size;
1219 		iomap.flags &= ~IOMAP_F_GFS2_BOUNDARY;
1220 	}
1221 	if (iomap.addr != IOMAP_NULL_ADDR)
1222 		map_bh(bh_map, inode->i_sb, iomap.addr >> inode->i_blkbits);
1223 	bh_map->b_size = iomap.length;
1224 	if (iomap.flags & IOMAP_F_GFS2_BOUNDARY)
1225 		set_buffer_boundary(bh_map);
1226 	if (iomap.flags & IOMAP_F_NEW)
1227 		set_buffer_new(bh_map);
1228 
1229 out:
1230 	trace_gfs2_bmap(ip, bh_map, lblock, create, ret);
1231 	return ret;
1232 }
1233 
1234 int gfs2_get_extent(struct inode *inode, u64 lblock, u64 *dblock,
1235 		    unsigned int *extlen)
1236 {
1237 	unsigned int blkbits = inode->i_blkbits;
1238 	struct iomap iomap = { };
1239 	unsigned int len;
1240 	int ret;
1241 
1242 	ret = gfs2_iomap_get(inode, lblock << blkbits, *extlen << blkbits,
1243 			     &iomap);
1244 	if (ret)
1245 		return ret;
1246 	if (iomap.type != IOMAP_MAPPED)
1247 		return -EIO;
1248 	*dblock = iomap.addr >> blkbits;
1249 	len = iomap.length >> blkbits;
1250 	if (len < *extlen)
1251 		*extlen = len;
1252 	return 0;
1253 }
1254 
1255 int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock,
1256 		      unsigned int *extlen, bool *new)
1257 {
1258 	unsigned int blkbits = inode->i_blkbits;
1259 	struct iomap iomap = { };
1260 	unsigned int len;
1261 	int ret;
1262 
1263 	ret = gfs2_iomap_alloc(inode, lblock << blkbits, *extlen << blkbits,
1264 			       &iomap);
1265 	if (ret)
1266 		return ret;
1267 	if (iomap.type != IOMAP_MAPPED)
1268 		return -EIO;
1269 	*dblock = iomap.addr >> blkbits;
1270 	len = iomap.length >> blkbits;
1271 	if (len < *extlen)
1272 		*extlen = len;
1273 	*new = iomap.flags & IOMAP_F_NEW;
1274 	return 0;
1275 }
1276 
1277 /*
1278  * NOTE: Never call gfs2_block_zero_range with an open transaction because it
1279  * uses iomap write to perform its actions, which begin their own transactions
1280  * (iomap_begin, page_prepare, etc.)
1281  */
1282 static int gfs2_block_zero_range(struct inode *inode, loff_t from,
1283 				 unsigned int length)
1284 {
1285 	BUG_ON(current->journal_info);
1286 	return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops);
1287 }
1288 
1289 #define GFS2_JTRUNC_REVOKES 8192
1290 
1291 /**
1292  * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files
1293  * @inode: The inode being truncated
1294  * @oldsize: The original (larger) size
1295  * @newsize: The new smaller size
1296  *
1297  * With jdata files, we have to journal a revoke for each block which is
1298  * truncated. As a result, we need to split this into separate transactions
1299  * if the number of pages being truncated gets too large.
1300  */
1301 
1302 static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize)
1303 {
1304 	struct gfs2_sbd *sdp = GFS2_SB(inode);
1305 	u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
1306 	u64 chunk;
1307 	int error;
1308 
1309 	while (oldsize != newsize) {
1310 		struct gfs2_trans *tr;
1311 		unsigned int offs;
1312 
1313 		chunk = oldsize - newsize;
1314 		if (chunk > max_chunk)
1315 			chunk = max_chunk;
1316 
1317 		offs = oldsize & ~PAGE_MASK;
1318 		if (offs && chunk > PAGE_SIZE)
1319 			chunk = offs + ((chunk - offs) & PAGE_MASK);
1320 
1321 		truncate_pagecache(inode, oldsize - chunk);
1322 		oldsize -= chunk;
1323 
1324 		tr = current->journal_info;
1325 		if (!test_bit(TR_TOUCHED, &tr->tr_flags))
1326 			continue;
1327 
1328 		gfs2_trans_end(sdp);
1329 		error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
1330 		if (error)
1331 			return error;
1332 	}
1333 
1334 	return 0;
1335 }
1336 
1337 static int trunc_start(struct inode *inode, u64 newsize)
1338 {
1339 	struct gfs2_inode *ip = GFS2_I(inode);
1340 	struct gfs2_sbd *sdp = GFS2_SB(inode);
1341 	struct buffer_head *dibh = NULL;
1342 	int journaled = gfs2_is_jdata(ip);
1343 	u64 oldsize = inode->i_size;
1344 	int error;
1345 
1346 	if (!gfs2_is_stuffed(ip)) {
1347 		unsigned int blocksize = i_blocksize(inode);
1348 		unsigned int offs = newsize & (blocksize - 1);
1349 		if (offs) {
1350 			error = gfs2_block_zero_range(inode, newsize,
1351 						      blocksize - offs);
1352 			if (error)
1353 				return error;
1354 		}
1355 	}
1356 	if (journaled)
1357 		error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES);
1358 	else
1359 		error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1360 	if (error)
1361 		return error;
1362 
1363 	error = gfs2_meta_inode_buffer(ip, &dibh);
1364 	if (error)
1365 		goto out;
1366 
1367 	gfs2_trans_add_meta(ip->i_gl, dibh);
1368 
1369 	if (gfs2_is_stuffed(ip))
1370 		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
1371 	else
1372 		ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
1373 
1374 	i_size_write(inode, newsize);
1375 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1376 	gfs2_dinode_out(ip, dibh->b_data);
1377 
1378 	if (journaled)
1379 		error = gfs2_journaled_truncate(inode, oldsize, newsize);
1380 	else
1381 		truncate_pagecache(inode, newsize);
1382 
1383 out:
1384 	brelse(dibh);
1385 	if (current->journal_info)
1386 		gfs2_trans_end(sdp);
1387 	return error;
1388 }
1389 
1390 int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
1391 		   struct iomap *iomap)
1392 {
1393 	struct metapath mp = { .mp_aheight = 1, };
1394 	int ret;
1395 
1396 	ret = __gfs2_iomap_get(inode, pos, length, 0, iomap, &mp);
1397 	release_metapath(&mp);
1398 	return ret;
1399 }
1400 
1401 int gfs2_iomap_alloc(struct inode *inode, loff_t pos, loff_t length,
1402 		     struct iomap *iomap)
1403 {
1404 	struct metapath mp = { .mp_aheight = 1, };
1405 	int ret;
1406 
1407 	ret = __gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, iomap, &mp);
1408 	if (!ret && iomap->type == IOMAP_HOLE)
1409 		ret = __gfs2_iomap_alloc(inode, iomap, &mp);
1410 	release_metapath(&mp);
1411 	return ret;
1412 }
1413 
/**
 * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein
 * @ip: inode
 * @rd_gh: holder of resource group glock
 * @bh: buffer head to sweep
 * @start: starting point in bh
 * @end: end point in bh
 * @meta: true if bh points to metadata (rather than data)
 * @btotal: place to keep count of total blocks freed
 *
 * We sweep a metadata buffer (provided by the metapath) for blocks we need to
 * free, and free them all. However, we do it one rgrp at a time. If this
 * block has references to multiple rgrps, we break it into individual
 * transactions. This allows other processes to use the rgrps while we're
 * focused on a single one, for better concurrency / performance.
 * At every transaction boundary, we rewrite the inode into the journal.
 * That way the bitmaps are kept consistent with the inode and we can recover
 * if we're interrupted by power-outages.
 *
 * If @rd_gh is already initialized on entry, the rgrp it holds is the one
 * that is processed first.  This function may return with @rd_gh still
 * holding an rgrp glock, and with a transaction open and ip->i_rw_mutex held
 * for writing; the caller is responsible for releasing them.
 *
 * Returns: 0, or return code if an error occurred.
 *          *btotal has the total number of blocks freed
 */
static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh,
			      struct buffer_head *bh, __be64 *start, __be64 *end,
			      bool meta, u32 *btotal)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	struct gfs2_rgrpd *rgd;
	struct gfs2_trans *tr;
	__be64 *p;
	int blks_outside_rgrp;
	u64 bn, bstart, isize_blks;
	s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */
	int ret = 0;
	bool buf_in_tr = false; /* buffer was added to transaction */

	/* Each pass over [start, end) frees the blocks of a single rgrp. */
more_rgrps:
	rgd = NULL;
	if (gfs2_holder_initialized(rd_gh)) {
		rgd = gfs2_glock2rgrp(rd_gh->gh_gl);
		gfs2_assert_withdraw(sdp,
			     gfs2_glock_is_locked_by_me(rd_gh->gh_gl));
	}
	blks_outside_rgrp = 0;
	/* bstart / blen describe the current run of contiguous blocks. */
	bstart = 0;
	blen = 0;

	for (p = start; p < end; p++) {
		/* Skip pointers that are already zero. */
		if (!*p)
			continue;
		bn = be64_to_cpu(*p);

		if (rgd) {
			/* Blocks in other rgrps are left for a later pass. */
			if (!rgrp_contains_block(rgd, bn)) {
				blks_outside_rgrp++;
				continue;
			}
		} else {
			/* No rgrp chosen yet: lock the one this block is in. */
			rgd = gfs2_blk2rgrpd(sdp, bn, true);
			if (unlikely(!rgd)) {
				ret = -EIO;
				goto out;
			}
			ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
						 LM_FLAG_NODE_SCOPE, rd_gh);
			if (ret)
				goto out;

			/* Must be done with the rgrp glock held: */
			if (gfs2_rs_active(&ip->i_res) &&
			    rgd == ip->i_res.rs_rgd)
				gfs2_rs_deltree(&ip->i_res);
		}

		/* The size of our transactions will be unknown until we
		   actually process all the metadata blocks that relate to
		   the rgrp. So we estimate. We know it can't be more than
		   the dinode's i_blocks and we don't want to exceed the
		   journal flush threshold, sd_log_thresh2. */
		if (current->journal_info == NULL) {
			unsigned int jblocks_rqsted, revokes;

			jblocks_rqsted = rgd->rd_length + RES_DINODE +
				RES_INDIRECT;
			isize_blks = gfs2_get_inode_blocks(&ip->i_inode);
			if (isize_blks > atomic_read(&sdp->sd_log_thresh2))
				jblocks_rqsted +=
					atomic_read(&sdp->sd_log_thresh2);
			else
				jblocks_rqsted += isize_blks;
			revokes = jblocks_rqsted;
			if (meta)
				revokes += end - start;
			else if (ip->i_depth)
				revokes += sdp->sd_inptrs;
			ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
			if (ret)
				goto out_unlock;
			/* Held until the transaction is ended. */
			down_write(&ip->i_rw_mutex);
		}
		/* check if we will exceed the transaction blocks requested */
		tr = current->journal_info;
		if (tr->tr_num_buf_new + RES_STATFS +
		    RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
			/* We set blks_outside_rgrp to ensure the loop will
			   be repeated for the same rgrp, but with a new
			   transaction. */
			blks_outside_rgrp++;
			/* This next part is tricky. If the buffer was added
			   to the transaction, we've already set some block
			   pointers to 0, so we better follow through and free
			   them, or we will introduce corruption (so break).
			   This may be impossible, or at least rare, but I
			   decided to cover the case regardless.

			   If the buffer was not added to the transaction
			   (this call), doing so would exceed our transaction
			   size, so we need to end the transaction and start a
			   new one (so goto). */

			if (buf_in_tr)
				break;
			goto out_unlock;
		}

		gfs2_trans_add_meta(ip->i_gl, bh);
		buf_in_tr = true;
		*p = 0;
		/* Extend the current run if this block follows it directly. */
		if (bstart + blen == bn) {
			blen++;
			continue;
		}
		/* Otherwise free the run collected so far and start a new one. */
		if (bstart) {
			__gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta);
			(*btotal) += blen;
			gfs2_add_inode_blocks(&ip->i_inode, -blen);
		}
		bstart = bn;
		blen = 1;
	}
	/* Free the last run. */
	if (bstart) {
		__gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta);
		(*btotal) += blen;
		gfs2_add_inode_blocks(&ip->i_inode, -blen);
	}
out_unlock:
	if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks
					    outside the rgrp we just processed,
					    do it all over again. */
		if (current->journal_info) {
			struct buffer_head *dibh;

			ret = gfs2_meta_inode_buffer(ip, &dibh);
			if (ret)
				goto out;

			/* Every transaction boundary, we rewrite the dinode
			   to keep its di_blocks current in case of failure. */
			ip->i_inode.i_mtime = ip->i_inode.i_ctime =
				current_time(&ip->i_inode);
			gfs2_trans_add_meta(ip->i_gl, dibh);
			gfs2_dinode_out(ip, dibh->b_data);
			brelse(dibh);
			up_write(&ip->i_rw_mutex);
			gfs2_trans_end(sdp);
			buf_in_tr = false;
		}
		/* Drop the rgrp glock so that the next pass picks a new rgrp. */
		gfs2_glock_dq_uninit(rd_gh);
		cond_resched();
		goto more_rgrps;
	}
out:
	return ret;
}
1588 
1589 static bool mp_eq_to_hgt(struct metapath *mp, __u16 *list, unsigned int h)
1590 {
1591 	if (memcmp(mp->mp_list, list, h * sizeof(mp->mp_list[0])))
1592 		return false;
1593 	return true;
1594 }
1595 
1596 /**
1597  * find_nonnull_ptr - find a non-null pointer given a metapath and height
1598  * @sdp: The superblock
1599  * @mp: starting metapath
1600  * @h: desired height to search
1601  * @end_list: See punch_hole().
1602  * @end_aligned: See punch_hole().
1603  *
1604  * Assumes the metapath is valid (with buffers) out to height h.
1605  * Returns: true if a non-null pointer was found in the metapath buffer
1606  *          false if all remaining pointers are NULL in the buffer
1607  */
1608 static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp,
1609 			     unsigned int h,
1610 			     __u16 *end_list, unsigned int end_aligned)
1611 {
1612 	struct buffer_head *bh = mp->mp_bh[h];
1613 	__be64 *first, *ptr, *end;
1614 
1615 	first = metaptr1(h, mp);
1616 	ptr = first + mp->mp_list[h];
1617 	end = (__be64 *)(bh->b_data + bh->b_size);
1618 	if (end_list && mp_eq_to_hgt(mp, end_list, h)) {
1619 		bool keep_end = h < end_aligned;
1620 		end = first + end_list[h] + keep_end;
1621 	}
1622 
1623 	while (ptr < end) {
1624 		if (*ptr) { /* if we have a non-null pointer */
1625 			mp->mp_list[h] = ptr - first;
1626 			h++;
1627 			if (h < GFS2_MAX_META_HEIGHT)
1628 				mp->mp_list[h] = 0;
1629 			return true;
1630 		}
1631 		ptr++;
1632 	}
1633 	return false;
1634 }
1635 
/* States of the deallocation state machine in punch_hole(). */
enum dealloc_states {
	DEALLOC_MP_FULL = 0,    /* Strip a metapath with all buffers read in */
	DEALLOC_MP_LOWER = 1,   /* lower the metapath strip height */
	DEALLOC_FILL_MP = 2,  /* Fill in the metapath to the given height. */
	DEALLOC_DONE = 3,       /* process complete */
};
1642 
1643 static inline void
1644 metapointer_range(struct metapath *mp, int height,
1645 		  __u16 *start_list, unsigned int start_aligned,
1646 		  __u16 *end_list, unsigned int end_aligned,
1647 		  __be64 **start, __be64 **end)
1648 {
1649 	struct buffer_head *bh = mp->mp_bh[height];
1650 	__be64 *first;
1651 
1652 	first = metaptr1(height, mp);
1653 	*start = first;
1654 	if (mp_eq_to_hgt(mp, start_list, height)) {
1655 		bool keep_start = height < start_aligned;
1656 		*start = first + start_list[height] + keep_start;
1657 	}
1658 	*end = (__be64 *)(bh->b_data + bh->b_size);
1659 	if (end_list && mp_eq_to_hgt(mp, end_list, height)) {
1660 		bool keep_end = height < end_aligned;
1661 		*end = first + end_list[height] + keep_end;
1662 	}
1663 }
1664 
1665 static inline bool walk_done(struct gfs2_sbd *sdp,
1666 			     struct metapath *mp, int height,
1667 			     __u16 *end_list, unsigned int end_aligned)
1668 {
1669 	__u16 end;
1670 
1671 	if (end_list) {
1672 		bool keep_end = height < end_aligned;
1673 		if (!mp_eq_to_hgt(mp, end_list, height))
1674 			return false;
1675 		end = end_list[height] + keep_end;
1676 	} else
1677 		end = (height > 0) ? sdp->sd_inptrs : sdp->sd_diptrs;
1678 	return mp->mp_list[height] >= end;
1679 }
1680 
/**
 * punch_hole - deallocate blocks in a file
 * @ip: inode to truncate
 * @offset: the start of the hole
 * @length: the size of the hole (or 0 for truncate)
 *
 * Punch a hole into a file or truncate a file at a given position.  This
 * function operates in whole blocks (@offset and @length are rounded
 * accordingly); partially filled blocks must be cleared otherwise.
 *
 * This function works from the bottom up, and from the right to the left. In
 * other words, it strips off the highest layer (data) before stripping any of
 * the metadata. Doing it this way is best in case the operation is interrupted
 * by power failure, etc.  The dinode is rewritten in every transaction to
 * guarantee integrity.
 *
 * Returns: 0 on success (including when there is nothing to deallocate), or
 *          an errno
 */
static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	u64 maxsize = sdp->sd_heightsize[ip->i_height];
	struct metapath mp = {};
	struct buffer_head *dibh, *bh;
	struct gfs2_holder rd_gh;
	unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift;
	/* First block that starts at or after @offset (rounded up). */
	u64 lblock = (offset + (1 << bsize_shift) - 1) >> bsize_shift;
	__u16 start_list[GFS2_MAX_META_HEIGHT];
	__u16 __end_list[GFS2_MAX_META_HEIGHT], *end_list = NULL;
	unsigned int start_aligned, end_aligned;
	unsigned int strip_h = ip->i_height - 1;
	u32 btotal = 0;
	int ret, state;
	int mp_h; /* metapath buffers are read in to this height */
	u64 prev_bnr = 0;
	__be64 *start, *end;

	if (offset >= maxsize) {
		/*
		 * The starting point lies beyond the allocated meta-data;
		 * there are no blocks to deallocate.
		 */
		return 0;
	}

	/*
	 * The start position of the hole is defined by lblock, start_list, and
	 * start_aligned.  The end position of the hole is defined by lend,
	 * end_list, and end_aligned.
	 *
	 * start_aligned and end_aligned define down to which height the start
	 * and end positions are aligned to the metadata tree (i.e., the
	 * position is a multiple of the metadata granularity at the height
	 * above).  This determines at which heights additional meta pointers
	 * need to be preserved for the remaining data.
	 */

	/* With a zero @length (truncate), end_list stays NULL and end_aligned
	   is never looked at. */
	if (length) {
		u64 end_offset = offset + length;
		u64 lend;

		/*
		 * Clip the end at the maximum file size for the given height:
		 * that's how far the metadata goes; files bigger than that
		 * will have additional layers of indirection.
		 */
		if (end_offset > maxsize)
			end_offset = maxsize;
		/* Last block boundary at or before the end (rounded down). */
		lend = end_offset >> bsize_shift;

		/* The hole does not cover any whole block. */
		if (lblock >= lend)
			return 0;

		find_metapath(sdp, lend, &mp, ip->i_height);
		end_list = __end_list;
		memcpy(end_list, mp.mp_list, sizeof(mp.mp_list));

		/* Find the greatest height with a non-zero end index. */
		for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
			if (end_list[mp_h])
				break;
		}
		end_aligned = mp_h;
	}

	find_metapath(sdp, lblock, &mp, ip->i_height);
	memcpy(start_list, mp.mp_list, sizeof(start_list));

	/* Find the greatest height with a non-zero start index. */
	for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
		if (start_list[mp_h])
			break;
	}
	start_aligned = mp_h;

	ret = gfs2_meta_inode_buffer(ip, &dibh);
	if (ret)
		return ret;

	/* The dinode buffer is now owned by the metapath; it is released by
	   release_metapath() below. */
	mp.mp_bh[0] = dibh;
	ret = lookup_metapath(ip, &mp);
	if (ret)
		goto out_metapath;

	/* issue read-ahead on metadata */
	for (mp_h = 0; mp_h < mp.mp_aheight - 1; mp_h++) {
		metapointer_range(&mp, mp_h, start_list, start_aligned,
				  end_list, end_aligned, &start, &end);
		gfs2_metapath_ra(ip->i_gl, start, end);
	}

	if (mp.mp_aheight == ip->i_height)
		state = DEALLOC_MP_FULL; /* We have a complete metapath */
	else
		state = DEALLOC_FILL_MP; /* deal with partial metapath */

	ret = gfs2_rindex_update(sdp);
	if (ret)
		goto out_metapath;

	ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
	if (ret)
		goto out_metapath;
	gfs2_holder_mark_uninitialized(&rd_gh);

	mp_h = strip_h;

	while (state != DEALLOC_DONE) {
		switch (state) {
		/* Truncate a full metapath at the given strip height.
		 * Note that strip_h == mp_h in order to be in this state. */
		case DEALLOC_MP_FULL:
			bh = mp.mp_bh[mp_h];
			gfs2_assert_withdraw(sdp, bh);
			/* The same buffer must not be swept twice in a row. */
			if (gfs2_assert_withdraw(sdp,
						 prev_bnr != bh->b_blocknr)) {
				fs_emerg(sdp, "inode %llu, block:%llu, i_h:%u,"
					 "s_h:%u, mp_h:%u\n",
				       (unsigned long long)ip->i_no_addr,
				       prev_bnr, ip->i_height, strip_h, mp_h);
			}
			prev_bnr = bh->b_blocknr;

			if (gfs2_metatype_check(sdp, bh,
						(mp_h ? GFS2_METATYPE_IN :
							GFS2_METATYPE_DI))) {
				ret = -EIO;
				goto out;
			}

			/*
			 * Below, passing end_aligned as 0 gives us the
			 * metapointer range excluding the end point: the end
			 * point is the first metapath we must not deallocate!
			 */

			metapointer_range(&mp, mp_h, start_list, start_aligned,
					  end_list, 0 /* end_aligned */,
					  &start, &end);
			/* The pointers refer to metadata unless we are at the
			   greatest height of the tree. */
			ret = sweep_bh_for_rgrps(ip, &rd_gh, mp.mp_bh[mp_h],
						 start, end,
						 mp_h != ip->i_height - 1,
						 &btotal);

			/* If we hit an error or just swept dinode buffer,
			   just exit. */
			if (ret || !mp_h) {
				state = DEALLOC_DONE;
				break;
			}
			state = DEALLOC_MP_LOWER;
			break;

		/* lower the metapath strip height */
		case DEALLOC_MP_LOWER:
			/* We're done with the current buffer, so release it,
			   unless it's the dinode buffer. Then back up to the
			   previous pointer. */
			if (mp_h) {
				brelse(mp.mp_bh[mp_h]);
				mp.mp_bh[mp_h] = NULL;
			}
			/* If we can't get any lower in height, we've stripped
			   off all we can. Next step is to back up and start
			   stripping the previous level of metadata. */
			if (mp_h == 0) {
				strip_h--;
				memcpy(mp.mp_list, start_list, sizeof(start_list));
				mp_h = strip_h;
				state = DEALLOC_FILL_MP;
				break;
			}
			mp.mp_list[mp_h] = 0;
			mp_h--; /* search one metadata height down */
			mp.mp_list[mp_h]++;
			if (walk_done(sdp, &mp, mp_h, end_list, end_aligned))
				break;
			/* Here we've found a part of the metapath that is not
			 * allocated. We need to search at that height for the
			 * next non-null pointer. */
			if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned)) {
				state = DEALLOC_FILL_MP;
				mp_h++;
			}
			/* No more non-null pointers at this height. Back up
			   to the previous height and try again. */
			break; /* loop around in the same state */

		/* Fill the metapath with buffers to the given height. */
		case DEALLOC_FILL_MP:
			/* Fill the buffers out to the current height. */
			ret = fillup_metapath(ip, &mp, mp_h);
			if (ret < 0)
				goto out;

			/* On the first pass, issue read-ahead on metadata. */
			if (mp.mp_aheight > 1 && strip_h == ip->i_height - 1) {
				unsigned int height = mp.mp_aheight - 1;

				/* No read-ahead for data blocks. */
				if (mp.mp_aheight - 1 == strip_h)
					height--;

				for (; height >= mp.mp_aheight - ret; height--) {
					metapointer_range(&mp, height,
							  start_list, start_aligned,
							  end_list, end_aligned,
							  &start, &end);
					gfs2_metapath_ra(ip->i_gl, start, end);
				}
			}

			/* If buffers found for the entire strip height */
			if (mp.mp_aheight - 1 == strip_h) {
				state = DEALLOC_MP_FULL;
				break;
			}
			if (mp.mp_aheight < ip->i_height) /* We have a partial height */
				mp_h = mp.mp_aheight - 1;

			/* If we find a non-null block pointer, crawl a bit
			   higher up in the metapath and try again, otherwise
			   we need to look lower for a new starting point. */
			if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned))
				mp_h++;
			else
				state = DEALLOC_MP_LOWER;
			break;
		}
	}

	/* Account for the freed blocks in statfs and quota, and write out the
	   dinode one last time. */
	if (btotal) {
		if (current->journal_info == NULL) {
			ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS +
					       RES_QUOTA, 0);
			if (ret)
				goto out;
			down_write(&ip->i_rw_mutex);
		}
		gfs2_statfs_change(sdp, 0, +btotal, 0);
		gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
				  ip->i_inode.i_gid);
		ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
		gfs2_trans_add_meta(ip->i_gl, dibh);
		gfs2_dinode_out(ip, dibh->b_data);
		up_write(&ip->i_rw_mutex);
		gfs2_trans_end(sdp);
	}

out:
	if (gfs2_holder_initialized(&rd_gh))
		gfs2_glock_dq_uninit(&rd_gh);
	/* sweep_bh_for_rgrps() may have left a transaction open, with
	   i_rw_mutex held for writing. */
	if (current->journal_info) {
		up_write(&ip->i_rw_mutex);
		gfs2_trans_end(sdp);
		cond_resched();
	}
	gfs2_quota_unhold(ip);
out_metapath:
	release_metapath(&mp);
	return ret;
}
1959 
1960 static int trunc_end(struct gfs2_inode *ip)
1961 {
1962 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1963 	struct buffer_head *dibh;
1964 	int error;
1965 
1966 	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1967 	if (error)
1968 		return error;
1969 
1970 	down_write(&ip->i_rw_mutex);
1971 
1972 	error = gfs2_meta_inode_buffer(ip, &dibh);
1973 	if (error)
1974 		goto out;
1975 
1976 	if (!i_size_read(&ip->i_inode)) {
1977 		ip->i_height = 0;
1978 		ip->i_goal = ip->i_no_addr;
1979 		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
1980 		gfs2_ordered_del_inode(ip);
1981 	}
1982 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1983 	ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
1984 
1985 	gfs2_trans_add_meta(ip->i_gl, dibh);
1986 	gfs2_dinode_out(ip, dibh->b_data);
1987 	brelse(dibh);
1988 
1989 out:
1990 	up_write(&ip->i_rw_mutex);
1991 	gfs2_trans_end(sdp);
1992 	return error;
1993 }
1994 
1995 /**
1996  * do_shrink - make a file smaller
1997  * @inode: the inode
1998  * @newsize: the size to make the file
1999  *
2000  * Called with an exclusive lock on @inode. The @size must
2001  * be equal to or smaller than the current inode size.
2002  *
2003  * Returns: errno
2004  */
2005 
2006 static int do_shrink(struct inode *inode, u64 newsize)
2007 {
2008 	struct gfs2_inode *ip = GFS2_I(inode);
2009 	int error;
2010 
2011 	error = trunc_start(inode, newsize);
2012 	if (error < 0)
2013 		return error;
2014 	if (gfs2_is_stuffed(ip))
2015 		return 0;
2016 
2017 	error = punch_hole(ip, newsize, 0);
2018 	if (error == 0)
2019 		error = trunc_end(ip);
2020 
2021 	return error;
2022 }
2023 
2024 void gfs2_trim_blocks(struct inode *inode)
2025 {
2026 	int ret;
2027 
2028 	ret = do_shrink(inode, inode->i_size);
2029 	WARN_ON(ret != 0);
2030 }
2031 
2032 /**
2033  * do_grow - Touch and update inode size
2034  * @inode: The inode
2035  * @size: The new size
2036  *
2037  * This function updates the timestamps on the inode and
2038  * may also increase the size of the inode. This function
2039  * must not be called with @size any smaller than the current
2040  * inode size.
2041  *
2042  * Although it is not strictly required to unstuff files here,
2043  * earlier versions of GFS2 have a bug in the stuffed file reading
2044  * code which will result in a buffer overrun if the size is larger
2045  * than the max stuffed file size. In order to prevent this from
2046  * occurring, such files are unstuffed, but in other cases we can
2047  * just update the inode size directly.
2048  *
2049  * Returns: 0 on success, or -ve on error
2050  */
2051 
2052 static int do_grow(struct inode *inode, u64 size)
2053 {
2054 	struct gfs2_inode *ip = GFS2_I(inode);
2055 	struct gfs2_sbd *sdp = GFS2_SB(inode);
2056 	struct gfs2_alloc_parms ap = { .target = 1, };
2057 	struct buffer_head *dibh;
2058 	int error;
2059 	int unstuff = 0;
2060 
2061 	if (gfs2_is_stuffed(ip) && size > gfs2_max_stuffed_size(ip)) {
2062 		error = gfs2_quota_lock_check(ip, &ap);
2063 		if (error)
2064 			return error;
2065 
2066 		error = gfs2_inplace_reserve(ip, &ap);
2067 		if (error)
2068 			goto do_grow_qunlock;
2069 		unstuff = 1;
2070 	}
2071 
2072 	error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
2073 				 (unstuff &&
2074 				  gfs2_is_jdata(ip) ? RES_JDATA : 0) +
2075 				 (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
2076 				  0 : RES_QUOTA), 0);
2077 	if (error)
2078 		goto do_grow_release;
2079 
2080 	if (unstuff) {
2081 		error = gfs2_unstuff_dinode(ip);
2082 		if (error)
2083 			goto do_end_trans;
2084 	}
2085 
2086 	error = gfs2_meta_inode_buffer(ip, &dibh);
2087 	if (error)
2088 		goto do_end_trans;
2089 
2090 	truncate_setsize(inode, size);
2091 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
2092 	gfs2_trans_add_meta(ip->i_gl, dibh);
2093 	gfs2_dinode_out(ip, dibh->b_data);
2094 	brelse(dibh);
2095 
2096 do_end_trans:
2097 	gfs2_trans_end(sdp);
2098 do_grow_release:
2099 	if (unstuff) {
2100 		gfs2_inplace_release(ip);
2101 do_grow_qunlock:
2102 		gfs2_quota_unlock(ip);
2103 	}
2104 	return error;
2105 }
2106 
2107 /**
2108  * gfs2_setattr_size - make a file a given size
2109  * @inode: the inode
2110  * @newsize: the size to make the file
2111  *
2112  * The file size can grow, shrink, or stay the same size. This
2113  * is called holding i_rwsem and an exclusive glock on the inode
2114  * in question.
2115  *
2116  * Returns: errno
2117  */
2118 
2119 int gfs2_setattr_size(struct inode *inode, u64 newsize)
2120 {
2121 	struct gfs2_inode *ip = GFS2_I(inode);
2122 	int ret;
2123 
2124 	BUG_ON(!S_ISREG(inode->i_mode));
2125 
2126 	ret = inode_newsize_ok(inode, newsize);
2127 	if (ret)
2128 		return ret;
2129 
2130 	inode_dio_wait(inode);
2131 
2132 	ret = gfs2_qa_get(ip);
2133 	if (ret)
2134 		goto out;
2135 
2136 	if (newsize >= inode->i_size) {
2137 		ret = do_grow(inode, newsize);
2138 		goto out;
2139 	}
2140 
2141 	ret = do_shrink(inode, newsize);
2142 out:
2143 	gfs2_rs_delete(ip);
2144 	gfs2_qa_put(ip);
2145 	return ret;
2146 }
2147 
2148 int gfs2_truncatei_resume(struct gfs2_inode *ip)
2149 {
2150 	int error;
2151 	error = punch_hole(ip, i_size_read(&ip->i_inode), 0);
2152 	if (!error)
2153 		error = trunc_end(ip);
2154 	return error;
2155 }
2156 
2157 int gfs2_file_dealloc(struct gfs2_inode *ip)
2158 {
2159 	return punch_hole(ip, 0, 0);
2160 }
2161 
2162 /**
2163  * gfs2_free_journal_extents - Free cached journal bmap info
2164  * @jd: The journal
2165  *
2166  */
2167 
2168 void gfs2_free_journal_extents(struct gfs2_jdesc *jd)
2169 {
2170 	struct gfs2_journal_extent *jext;
2171 
2172 	while(!list_empty(&jd->extent_list)) {
2173 		jext = list_first_entry(&jd->extent_list, struct gfs2_journal_extent, list);
2174 		list_del(&jext->list);
2175 		kfree(jext);
2176 	}
2177 }
2178 
2179 /**
2180  * gfs2_add_jextent - Add or merge a new extent to extent cache
2181  * @jd: The journal descriptor
2182  * @lblock: The logical block at start of new extent
2183  * @dblock: The physical block at start of new extent
2184  * @blocks: Size of extent in fs blocks
2185  *
2186  * Returns: 0 on success or -ENOMEM
2187  */
2188 
2189 static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks)
2190 {
2191 	struct gfs2_journal_extent *jext;
2192 
2193 	if (!list_empty(&jd->extent_list)) {
2194 		jext = list_last_entry(&jd->extent_list, struct gfs2_journal_extent, list);
2195 		if ((jext->dblock + jext->blocks) == dblock) {
2196 			jext->blocks += blocks;
2197 			return 0;
2198 		}
2199 	}
2200 
2201 	jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS);
2202 	if (jext == NULL)
2203 		return -ENOMEM;
2204 	jext->dblock = dblock;
2205 	jext->lblock = lblock;
2206 	jext->blocks = blocks;
2207 	list_add_tail(&jext->list, &jd->extent_list);
2208 	jd->nr_extents++;
2209 	return 0;
2210 }
2211 
2212 /**
2213  * gfs2_map_journal_extents - Cache journal bmap info
2214  * @sdp: The super block
2215  * @jd: The journal to map
2216  *
2217  * Create a reusable "extent" mapping from all logical
2218  * blocks to all physical blocks for the given journal.  This will save
2219  * us time when writing journal blocks.  Most journals will have only one
2220  * extent that maps all their logical blocks.  That's because gfs2.mkfs
2221  * arranges the journal blocks sequentially to maximize performance.
2222  * So the extent would map the first block for the entire file length.
2223  * However, gfs2_jadd can happen while file activity is happening, so
2224  * those journals may not be sequential.  Less likely is the case where
2225  * the users created their own journals by mounting the metafs and
2226  * laying it out.  But it's still possible.  These journals might have
2227  * several extents.
2228  *
2229  * Returns: 0 on success, or error on failure
2230  */
2231 
2232 int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
2233 {
2234 	u64 lblock = 0;
2235 	u64 lblock_stop;
2236 	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
2237 	struct buffer_head bh;
2238 	unsigned int shift = sdp->sd_sb.sb_bsize_shift;
2239 	u64 size;
2240 	int rc;
2241 	ktime_t start, end;
2242 
2243 	start = ktime_get();
2244 	lblock_stop = i_size_read(jd->jd_inode) >> shift;
2245 	size = (lblock_stop - lblock) << shift;
2246 	jd->nr_extents = 0;
2247 	WARN_ON(!list_empty(&jd->extent_list));
2248 
2249 	do {
2250 		bh.b_state = 0;
2251 		bh.b_blocknr = 0;
2252 		bh.b_size = size;
2253 		rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0);
2254 		if (rc || !buffer_mapped(&bh))
2255 			goto fail;
2256 		rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift);
2257 		if (rc)
2258 			goto fail;
2259 		size -= bh.b_size;
2260 		lblock += (bh.b_size >> ip->i_inode.i_blkbits);
2261 	} while(size > 0);
2262 
2263 	end = ktime_get();
2264 	fs_info(sdp, "journal %d mapped with %u extents in %lldms\n", jd->jd_jid,
2265 		jd->nr_extents, ktime_ms_delta(end, start));
2266 	return 0;
2267 
2268 fail:
2269 	fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n",
2270 		rc, jd->jd_jid,
2271 		(unsigned long long)(i_size_read(jd->jd_inode) - size),
2272 		jd->nr_extents);
2273 	fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n",
2274 		rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr,
2275 		bh.b_state, (unsigned long long)bh.b_size);
2276 	gfs2_free_journal_extents(jd);
2277 	return rc;
2278 }
2279 
2280 /**
2281  * gfs2_write_alloc_required - figure out if a write will require an allocation
2282  * @ip: the file being written to
2283  * @offset: the offset to write to
2284  * @len: the number of bytes being written
2285  *
2286  * Returns: 1 if an alloc is required, 0 otherwise
2287  */
2288 
2289 int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
2290 			      unsigned int len)
2291 {
2292 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2293 	struct buffer_head bh;
2294 	unsigned int shift;
2295 	u64 lblock, lblock_stop, size;
2296 	u64 end_of_file;
2297 
2298 	if (!len)
2299 		return 0;
2300 
2301 	if (gfs2_is_stuffed(ip)) {
2302 		if (offset + len > gfs2_max_stuffed_size(ip))
2303 			return 1;
2304 		return 0;
2305 	}
2306 
2307 	shift = sdp->sd_sb.sb_bsize_shift;
2308 	BUG_ON(gfs2_is_dir(ip));
2309 	end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
2310 	lblock = offset >> shift;
2311 	lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
2312 	if (lblock_stop > end_of_file && ip != GFS2_I(sdp->sd_rindex))
2313 		return 1;
2314 
2315 	size = (lblock_stop - lblock) << shift;
2316 	do {
2317 		bh.b_state = 0;
2318 		bh.b_size = size;
2319 		gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
2320 		if (!buffer_mapped(&bh))
2321 			return 1;
2322 		size -= bh.b_size;
2323 		lblock += (bh.b_size >> ip->i_inode.i_blkbits);
2324 	} while(size > 0);
2325 
2326 	return 0;
2327 }
2328 
2329 static int stuffed_zero_range(struct inode *inode, loff_t offset, loff_t length)
2330 {
2331 	struct gfs2_inode *ip = GFS2_I(inode);
2332 	struct buffer_head *dibh;
2333 	int error;
2334 
2335 	if (offset >= inode->i_size)
2336 		return 0;
2337 	if (offset + length > inode->i_size)
2338 		length = inode->i_size - offset;
2339 
2340 	error = gfs2_meta_inode_buffer(ip, &dibh);
2341 	if (error)
2342 		return error;
2343 	gfs2_trans_add_meta(ip->i_gl, dibh);
2344 	memset(dibh->b_data + sizeof(struct gfs2_dinode) + offset, 0,
2345 	       length);
2346 	brelse(dibh);
2347 	return 0;
2348 }
2349 
2350 static int gfs2_journaled_truncate_range(struct inode *inode, loff_t offset,
2351 					 loff_t length)
2352 {
2353 	struct gfs2_sbd *sdp = GFS2_SB(inode);
2354 	loff_t max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
2355 	int error;
2356 
2357 	while (length) {
2358 		struct gfs2_trans *tr;
2359 		loff_t chunk;
2360 		unsigned int offs;
2361 
2362 		chunk = length;
2363 		if (chunk > max_chunk)
2364 			chunk = max_chunk;
2365 
2366 		offs = offset & ~PAGE_MASK;
2367 		if (offs && chunk > PAGE_SIZE)
2368 			chunk = offs + ((chunk - offs) & PAGE_MASK);
2369 
2370 		truncate_pagecache_range(inode, offset, chunk);
2371 		offset += chunk;
2372 		length -= chunk;
2373 
2374 		tr = current->journal_info;
2375 		if (!test_bit(TR_TOUCHED, &tr->tr_flags))
2376 			continue;
2377 
2378 		gfs2_trans_end(sdp);
2379 		error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
2380 		if (error)
2381 			return error;
2382 	}
2383 	return 0;
2384 }
2385 
2386 int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length)
2387 {
2388 	struct inode *inode = file_inode(file);
2389 	struct gfs2_inode *ip = GFS2_I(inode);
2390 	struct gfs2_sbd *sdp = GFS2_SB(inode);
2391 	unsigned int blocksize = i_blocksize(inode);
2392 	loff_t start, end;
2393 	int error;
2394 
2395 	if (!gfs2_is_stuffed(ip)) {
2396 		unsigned int start_off, end_len;
2397 
2398 		start_off = offset & (blocksize - 1);
2399 		end_len = (offset + length) & (blocksize - 1);
2400 		if (start_off) {
2401 			unsigned int len = length;
2402 			if (length > blocksize - start_off)
2403 				len = blocksize - start_off;
2404 			error = gfs2_block_zero_range(inode, offset, len);
2405 			if (error)
2406 				goto out;
2407 			if (start_off + length < blocksize)
2408 				end_len = 0;
2409 		}
2410 		if (end_len) {
2411 			error = gfs2_block_zero_range(inode,
2412 				offset + length - end_len, end_len);
2413 			if (error)
2414 				goto out;
2415 		}
2416 	}
2417 
2418 	start = round_down(offset, blocksize);
2419 	end = round_up(offset + length, blocksize) - 1;
2420 	error = filemap_write_and_wait_range(inode->i_mapping, start, end);
2421 	if (error)
2422 		return error;
2423 
2424 	if (gfs2_is_jdata(ip))
2425 		error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_JDATA,
2426 					 GFS2_JTRUNC_REVOKES);
2427 	else
2428 		error = gfs2_trans_begin(sdp, RES_DINODE, 0);
2429 	if (error)
2430 		return error;
2431 
2432 	if (gfs2_is_stuffed(ip)) {
2433 		error = stuffed_zero_range(inode, offset, length);
2434 		if (error)
2435 			goto out;
2436 	}
2437 
2438 	if (gfs2_is_jdata(ip)) {
2439 		BUG_ON(!current->journal_info);
2440 		gfs2_journaled_truncate_range(inode, offset, length);
2441 	} else
2442 		truncate_pagecache_range(inode, offset, offset + length - 1);
2443 
2444 	file_update_time(file);
2445 	mark_inode_dirty(inode);
2446 
2447 	if (current->journal_info)
2448 		gfs2_trans_end(sdp);
2449 
2450 	if (!gfs2_is_stuffed(ip))
2451 		error = punch_hole(ip, offset, length);
2452 
2453 out:
2454 	if (current->journal_info)
2455 		gfs2_trans_end(sdp);
2456 	return error;
2457 }
2458 
2459 static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode,
2460 		loff_t offset)
2461 {
2462 	int ret;
2463 
2464 	if (WARN_ON_ONCE(gfs2_is_stuffed(GFS2_I(inode))))
2465 		return -EIO;
2466 
2467 	if (offset >= wpc->iomap.offset &&
2468 	    offset < wpc->iomap.offset + wpc->iomap.length)
2469 		return 0;
2470 
2471 	memset(&wpc->iomap, 0, sizeof(wpc->iomap));
2472 	ret = gfs2_iomap_get(inode, offset, INT_MAX, &wpc->iomap);
2473 	return ret;
2474 }
2475 
2476 const struct iomap_writeback_ops gfs2_writeback_ops = {
2477 	.map_blocks		= gfs2_map_blocks,
2478 };
2479