xref: /openbmc/linux/fs/gfs2/bmap.c (revision e639c869)
1  /*
2   * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
3   * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
4   *
5   * This copyrighted material is made available to anyone wishing to use,
6   * modify, copy, or redistribute it subject to the terms and conditions
7   * of the GNU General Public License version 2.
8   */
9  
10  #include <linux/spinlock.h>
11  #include <linux/completion.h>
12  #include <linux/buffer_head.h>
13  #include <linux/blkdev.h>
14  #include <linux/gfs2_ondisk.h>
15  #include <linux/crc32.h>
16  #include <linux/iomap.h>
17  
18  #include "gfs2.h"
19  #include "incore.h"
20  #include "bmap.h"
21  #include "glock.h"
22  #include "inode.h"
23  #include "meta_io.h"
24  #include "quota.h"
25  #include "rgrp.h"
26  #include "log.h"
27  #include "super.h"
28  #include "trans.h"
29  #include "dir.h"
30  #include "util.h"
31  #include "trace_gfs2.h"
32  
/* This doesn't need to be that large as max 64 bit pointers in a 4k
 * block is 512, so __u16 is fine for that. It saves stack space to
 * keep it small.
 */
struct metapath {
	struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT]; /* buffer read in at each height; [0] is the dinode */
	__u16 mp_list[GFS2_MAX_META_HEIGHT]; /* pointer slot index within the block at each height */
	int mp_fheight; /* find_metapath height */
	int mp_aheight; /* actual height (lookup height) */
};
43  
44  /**
45   * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
46   * @ip: the inode
47   * @dibh: the dinode buffer
48   * @block: the block number that was allocated
49   * @page: The (optional) page. This is looked up if @page is NULL
50   *
51   * Returns: errno
52   */
53  
static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
			       u64 block, struct page *page)
{
	struct inode *inode = &ip->i_inode;
	struct buffer_head *bh;
	int release = 0;	/* set when we looked the page up ourselves */

	/* No page supplied, or not the first page of the file: get page 0,
	   since stuffed data always lives at offset 0. */
	if (!page || page->index) {
		page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
		if (!page)
			return -ENOMEM;
		release = 1;
	}

	if (!PageUptodate(page)) {
		void *kaddr = kmap(page);
		u64 dsize = i_size_read(inode);

		/* The stuffed data cannot exceed the dinode block minus
		   its header; clamp in case i_size claims more. */
		if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))
			dsize = dibh->b_size - sizeof(struct gfs2_dinode);

		/* Copy the inline data out of the dinode and zero the rest
		   of the page. */
		memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
		memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
		kunmap(page);

		SetPageUptodate(page);
	}

	if (!page_has_buffers(page))
		create_empty_buffers(page, BIT(inode->i_blkbits),
				     BIT(BH_Uptodate));

	bh = page_buffers(page);

	/* Point the first buffer at the newly allocated data block. */
	if (!buffer_mapped(bh))
		map_bh(bh, inode->i_sb, block);

	set_buffer_uptodate(bh);
	/* jdata buffers are written through the journal, so they must not
	   be dirtied here; writeback-mode buffers skip the transaction. */
	if (!gfs2_is_jdata(ip))
		mark_buffer_dirty(bh);
	if (!gfs2_is_writeback(ip))
		gfs2_trans_add_data(ip->i_gl, bh);

	/* Only drop the page reference/lock we took ourselves. */
	if (release) {
		unlock_page(page);
		put_page(page);
	}

	return 0;
}
104  
105  /**
106   * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
107   * @ip: The GFS2 inode to unstuff
108   * @page: The (optional) page. This is looked up if the @page is NULL
109   *
110   * This routine unstuffs a dinode and returns it to a "normal" state such
111   * that the height can be grown in the traditional way.
112   *
113   * Returns: errno
114   */
115  
int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
{
	struct buffer_head *bh, *dibh;
	struct gfs2_dinode *di;
	u64 block = 0;		/* newly allocated data/meta block, if any */
	int isdir = gfs2_is_dir(ip);
	int error;

	/* The metadata tree shape is about to change: exclusive access. */
	down_write(&ip->i_rw_mutex);

	error = gfs2_meta_inode_buffer(ip, &dibh);
	if (error)
		goto out;

	/* Only move data out if there is any; a zero-size inode just gets
	   its height bumped below. */
	if (i_size_read(&ip->i_inode)) {
		/* Get a free block, fill it with the stuffed data,
		   and write it out to disk */

		unsigned int n = 1;
		error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
		if (error)
			goto out_brelse;
		if (isdir) {
			/* Directory data is journaled metadata: revoke any
			   pending journal entries for the reused block. */
			gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1);
			error = gfs2_dir_get_new_buffer(ip, block, &bh);
			if (error)
				goto out_brelse;
			gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
					      dibh, sizeof(struct gfs2_dinode));
			brelse(bh);
		} else {
			/* Regular file data goes through the page cache. */
			error = gfs2_unstuffer_page(ip, dibh, block, page);
			if (error)
				goto out_brelse;
		}
	}

	/*  Set up the pointer to the new block  */

	gfs2_trans_add_meta(ip->i_gl, dibh);
	di = (struct gfs2_dinode *)dibh->b_data;
	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));

	if (i_size_read(&ip->i_inode)) {
		/* First (and only) pointer slot now references the block
		   holding the former inline data. */
		*(__be64 *)(di + 1) = cpu_to_be64(block);
		gfs2_add_inode_blocks(&ip->i_inode, 1);
		di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
	}

	/* The inode is now a height-1 tree (dinode -> data). */
	ip->i_height = 1;
	di->di_height = cpu_to_be16(1);

out_brelse:
	brelse(dibh);
out:
	up_write(&ip->i_rw_mutex);
	return error;
}
174  
175  
176  /**
177   * find_metapath - Find path through the metadata tree
178   * @sdp: The superblock
179   * @mp: The metapath to return the result in
180   * @block: The disk block to look up
181   * @height: The pre-calculated height of the metadata tree
182   *
183   *   This routine returns a struct metapath structure that defines a path
184   *   through the metadata of inode "ip" to get to block "block".
185   *
186   *   Example:
187   *   Given:  "ip" is a height 3 file, "offset" is 101342453, and this is a
188   *   filesystem with a blocksize of 4096.
189   *
190   *   find_metapath() would return a struct metapath structure set to:
191   *   mp_offset = 101342453, mp_height = 3, mp_list[0] = 0, mp_list[1] = 48,
192   *   and mp_list[2] = 165.
193   *
194   *   That means that in order to get to the block containing the byte at
195   *   offset 101342453, we would load the indirect block pointed to by pointer
196   *   0 in the dinode.  We would then load the indirect block pointed to by
197   *   pointer 48 in that indirect block.  We would then load the data block
198   *   pointed to by pointer 165 in that indirect block.
199   *
200   *             ----------------------------------------
201   *             | Dinode |                             |
202   *             |        |                            4|
203   *             |        |0 1 2 3 4 5                 9|
204   *             |        |                            6|
205   *             ----------------------------------------
206   *                       |
207   *                       |
208   *                       V
209   *             ----------------------------------------
210   *             | Indirect Block                       |
211   *             |                                     5|
212   *             |            4 4 4 4 4 5 5            1|
213   *             |0           5 6 7 8 9 0 1            2|
214   *             ----------------------------------------
215   *                                |
216   *                                |
217   *                                V
218   *             ----------------------------------------
219   *             | Indirect Block                       |
220   *             |                         1 1 1 1 1   5|
221   *             |                         6 6 6 6 6   1|
222   *             |0                        3 4 5 6 7   2|
223   *             ----------------------------------------
224   *                                           |
225   *                                           |
226   *                                           V
227   *             ----------------------------------------
228   *             | Data block containing offset         |
229   *             |            101342453                 |
230   *             |                                      |
231   *             |                                      |
232   *             ----------------------------------------
233   *
234   */
235  
236  static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
237  			  struct metapath *mp, unsigned int height)
238  {
239  	unsigned int i;
240  
241  	mp->mp_fheight = height;
242  	for (i = height; i--;)
243  		mp->mp_list[i] = do_div(block, sdp->sd_inptrs);
244  }
245  
246  static inline unsigned int metapath_branch_start(const struct metapath *mp)
247  {
248  	if (mp->mp_list[0] == 0)
249  		return 2;
250  	return 1;
251  }
252  
253  /**
254   * metaptr1 - Return the first possible metadata pointer in a metapath buffer
255   * @height: The metadata height (0 = dinode)
256   * @mp: The metapath
257   */
258  static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp)
259  {
260  	struct buffer_head *bh = mp->mp_bh[height];
261  	if (height == 0)
262  		return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)));
263  	return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header)));
264  }
265  
266  /**
267   * metapointer - Return pointer to start of metadata in a buffer
268   * @height: The metadata height (0 = dinode)
269   * @mp: The metapath
270   *
271   * Return a pointer to the block number of the next height of the metadata
272   * tree given a buffer containing the pointer to the current height of the
273   * metadata tree.
274   */
275  
276  static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
277  {
278  	__be64 *p = metaptr1(height, mp);
279  	return p + mp->mp_list[height];
280  }
281  
/* Issue readahead for all non-zero block pointers from @pos to the end
 * of the indirect block @bh.  Best effort: buffers that are already
 * locked or up to date are skipped. */
static void gfs2_metapath_ra(struct gfs2_glock *gl,
			     const struct buffer_head *bh, const __be64 *pos)
{
	struct buffer_head *rabh;
	const __be64 *endp = (const __be64 *)(bh->b_data + bh->b_size);
	const __be64 *t;

	for (t = pos; t < endp; t++) {
		if (!*t)
			continue;	/* unallocated slot, nothing to read */

		rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE);
		/* Non-blocking: if someone else holds the buffer lock,
		   just skip the readahead for this block. */
		if (trylock_buffer(rabh)) {
			if (!buffer_uptodate(rabh)) {
				rabh->b_end_io = end_buffer_read_sync;
				submit_bh(REQ_OP_READ,
					  REQ_RAHEAD | REQ_META | REQ_PRIO,
					  rabh);
				/* b_end_io unlocks; the submitted bh keeps
				   its reference until I/O completes. */
				continue;
			}
			unlock_buffer(rabh);
		}
		brelse(rabh);
	}
}
307  
308  /**
309   * lookup_mp_height - helper function for lookup_metapath
310   * @ip: the inode
311   * @mp: the metapath
312   * @h: the height which needs looking up
313   */
314  static int lookup_mp_height(struct gfs2_inode *ip, struct metapath *mp, int h)
315  {
316  	__be64 *ptr = metapointer(h, mp);
317  	u64 dblock = be64_to_cpu(*ptr);
318  
319  	if (!dblock)
320  		return h + 1;
321  
322  	return gfs2_meta_indirect_buffer(ip, h + 1, dblock, &mp->mp_bh[h + 1]);
323  }
324  
325  /**
326   * lookup_metapath - Walk the metadata tree to a specific point
327   * @ip: The inode
328   * @mp: The metapath
329   *
330   * Assumes that the inode's buffer has already been looked up and
331   * hooked onto mp->mp_bh[0] and that the metapath has been initialised
332   * by find_metapath().
333   *
334   * If this function encounters part of the tree which has not been
335   * allocated, it returns the current height of the tree at the point
336   * at which it found the unallocated block. Blocks which are found are
337   * added to the mp->mp_bh[] list.
338   *
339   * Returns: error or height of metadata tree
340   */
341  
342  static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
343  {
344  	unsigned int end_of_metadata = ip->i_height - 1;
345  	unsigned int x;
346  	int ret;
347  
348  	for (x = 0; x < end_of_metadata; x++) {
349  		ret = lookup_mp_height(ip, mp, x);
350  		if (ret)
351  			goto out;
352  	}
353  
354  	ret = ip->i_height;
355  out:
356  	mp->mp_aheight = ret;
357  	return ret;
358  }
359  
360  /**
361   * fillup_metapath - fill up buffers for the metadata path to a specific height
362   * @ip: The inode
363   * @mp: The metapath
364   * @h: The height to which it should be mapped
365   *
366   * Similar to lookup_metapath, but does lookups for a range of heights
367   *
368   * Returns: error or height of metadata tree
369   */
370  
371  static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h)
372  {
373  	unsigned int start_h = h - 1;
374  	int ret;
375  
376  	if (h) {
377  		/* find the first buffer we need to look up. */
378  		while (start_h > 0 && mp->mp_bh[start_h] == NULL)
379  			start_h--;
380  		for (; start_h < h; start_h++) {
381  			ret = lookup_mp_height(ip, mp, start_h);
382  			if (ret)
383  				return ret;
384  		}
385  	}
386  	return ip->i_height;
387  }
388  
389  static inline void release_metapath(struct metapath *mp)
390  {
391  	int i;
392  
393  	for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) {
394  		if (mp->mp_bh[i] == NULL)
395  			break;
396  		brelse(mp->mp_bh[i]);
397  	}
398  }
399  
400  /**
401   * gfs2_extent_length - Returns length of an extent of blocks
402   * @start: Start of the buffer
403   * @len: Length of the buffer in bytes
404   * @ptr: Current position in the buffer
405   * @limit: Max extent length to return (0 = unlimited)
406   * @eob: Set to 1 if we hit "end of block"
407   *
408   * If the first block is zero (unallocated) it will return the number of
409   * unallocated blocks in the extent, otherwise it will return the number
410   * of contiguous blocks in the extent.
411   *
412   * Returns: The length of the extent (minimum of one block)
413   */
414  
static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, size_t limit, int *eob)
{
	const __be64 *end = (start + len);
	const __be64 *first = ptr;
	/* d == 0 means we are measuring a run of unallocated pointers;
	   otherwise we follow consecutive physical block numbers. */
	u64 d = be64_to_cpu(*ptr);

	*eob = 0;
	do {
		ptr++;
		if (ptr >= end)
			break;	/* ran off the end of this indirect block */
		if (limit && --limit == 0)
			break;	/* caller-imposed maximum reached */
		if (d)
			d++;	/* next expected block in a contiguous run */
	} while(be64_to_cpu(*ptr) == d);
	if (ptr >= end)
		*eob = 1;
	/* Pointer difference == number of blocks in the extent (>= 1). */
	return (ptr - first);
}
435  
/* Take ip->i_rw_mutex: exclusive when allocation may change the
 * metadata tree (@create), shared for read-only lookups. */
static inline void bmap_lock(struct gfs2_inode *ip, int create)
{
	if (create)
		down_write(&ip->i_rw_mutex);
	else
		down_read(&ip->i_rw_mutex);
}
443  
/* Release ip->i_rw_mutex; @create must match the bmap_lock() call. */
static inline void bmap_unlock(struct gfs2_inode *ip, int create)
{
	if (create)
		up_write(&ip->i_rw_mutex);
	else
		up_read(&ip->i_rw_mutex);
}
451  
452  static inline __be64 *gfs2_indirect_init(struct metapath *mp,
453  					 struct gfs2_glock *gl, unsigned int i,
454  					 unsigned offset, u64 bn)
455  {
456  	__be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data +
457  		       ((i > 1) ? sizeof(struct gfs2_meta_header) :
458  				 sizeof(struct gfs2_dinode)));
459  	BUG_ON(i < 1);
460  	BUG_ON(mp->mp_bh[i] != NULL);
461  	mp->mp_bh[i] = gfs2_meta_new(gl, bn);
462  	gfs2_trans_add_meta(gl, mp->mp_bh[i]);
463  	gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
464  	gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
465  	ptr += offset;
466  	*ptr = cpu_to_be64(bn);
467  	return ptr;
468  }
469  
/* States of the allocation state machine in gfs2_iomap_alloc(); the
 * machine only ever moves towards ALLOC_DATA. */
enum alloc_state {
	ALLOC_DATA = 0,		/* allocating data blocks */
	ALLOC_GROW_DEPTH = 1,	/* filling in missing lower indirect blocks */
	ALLOC_GROW_HEIGHT = 2,	/* growing the height of the tree */
	/* ALLOC_UNSTUFF = 3,   TBD and rather complicated */
};
476  
477  static inline unsigned int hptrs(struct gfs2_sbd *sdp, const unsigned int hgt)
478  {
479  	if (hgt)
480  		return sdp->sd_inptrs;
481  	return sdp->sd_diptrs;
482  }
483  
/**
 * gfs2_iomap_alloc - Build a metadata tree of the requested height
 * @inode: The GFS2 inode
 * @iomap: The iomap structure; on entry its length bounds the number of
 *         data blocks to allocate, on success it describes the new extent
 * @flags: iomap flags (IOMAP_ZERO requests zeroing of the new data blocks)
 * @mp: The metapath, with proper height information calculated
 *
 * In this routine we may have to alloc:
 *   i) Indirect blocks to grow the metadata tree height
 *  ii) Indirect blocks to fill in lower part of the metadata tree
 * iii) Data blocks
 *
 * The function is in two parts. The first part works out the total
 * number of blocks which we need. The second part does the actual
 * allocation asking for an extent at a time (if enough contiguous free
 * blocks are available, there will only be one request per bmap call)
 * and uses the state machine to initialise the blocks in order.
 *
 * Returns: errno on error
 */

508  
static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
			    unsigned flags, struct metapath *mp)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	struct super_block *sb = sdp->sd_vfs;
	struct buffer_head *dibh = mp->mp_bh[0];
	u64 bn;
	unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
	unsigned dblks = 0;		/* data blocks to allocate */
	unsigned ptrs_per_blk;
	const unsigned end_of_metadata = mp->mp_fheight - 1;
	int ret;
	enum alloc_state state;
	__be64 *ptr;
	__be64 zero_bn = 0;
	/* Max number of data blocks, from the requested byte length. */
	size_t maxlen = iomap->length >> inode->i_blkbits;

	BUG_ON(mp->mp_aheight < 1);
	BUG_ON(dibh == NULL);

	gfs2_trans_add_meta(ip->i_gl, dibh);

	if (mp->mp_fheight == mp->mp_aheight) {
		struct buffer_head *bh;
		int eob;

		/* Bottom indirect block exists, find unalloced extent size */
		ptr = metapointer(end_of_metadata, mp);
		bh = mp->mp_bh[end_of_metadata];
		dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr,
					   maxlen, &eob);
		BUG_ON(dblks < 1);
		state = ALLOC_DATA;
	} else {
		/* Need to allocate indirect blocks */
		ptrs_per_blk = mp->mp_fheight > 1 ? sdp->sd_inptrs :
			sdp->sd_diptrs;
		/* Only as many data blocks as fit in the bottom block
		   from this slot onwards. */
		dblks = min(maxlen, (size_t)(ptrs_per_blk -
					     mp->mp_list[end_of_metadata]));
		if (mp->mp_fheight == ip->i_height) {
			/* Writing into existing tree, extend tree down */
			iblks = mp->mp_fheight - mp->mp_aheight;
			state = ALLOC_GROW_DEPTH;
		} else {
			/* Building up tree height */
			state = ALLOC_GROW_HEIGHT;
			iblks = mp->mp_fheight - ip->i_height;
			branch_start = metapath_branch_start(mp);
			iblks += (mp->mp_fheight - branch_start);
		}
	}

	/* start of the second part of the function (state machine) */

	blks = dblks + iblks;
	i = mp->mp_aheight;
	do {
		int error;
		n = blks - alloced;
		error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
		if (error)
			return error;
		alloced += n;
		/* Journaled blocks may be reused before their old journal
		   entries are flushed: revoke them. */
		if (state != ALLOC_DATA || gfs2_is_jdata(ip))
			gfs2_trans_add_unrevoke(sdp, bn, n);
		/* The cases below deliberately fall through: one extent of
		   new blocks may serve several phases of tree building. */
		switch (state) {
		/* Growing height of tree */
		case ALLOC_GROW_HEIGHT:
			if (i == 1) {
				/* Save the dinode's first pointer; it moves
				   into the new top-level indirect block. */
				ptr = (__be64 *)(dibh->b_data +
						 sizeof(struct gfs2_dinode));
				zero_bn = *ptr;
			}
			for (; i - 1 < mp->mp_fheight - ip->i_height && n > 0;
			     i++, n--)
				gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++);
			if (i - 1 == mp->mp_fheight - ip->i_height) {
				i--;
				/* Move the old dinode pointers into the new
				   block just below the dinode. */
				gfs2_buffer_copy_tail(mp->mp_bh[i],
						sizeof(struct gfs2_meta_header),
						dibh, sizeof(struct gfs2_dinode));
				gfs2_buffer_clear_tail(dibh,
						sizeof(struct gfs2_dinode) +
						sizeof(__be64));
				ptr = (__be64 *)(mp->mp_bh[i]->b_data +
					sizeof(struct gfs2_meta_header));
				*ptr = zero_bn;
				state = ALLOC_GROW_DEPTH;
				/* Re-read buffers below the branch point on
				   the next pass; drop the stale ones. */
				for(i = branch_start; i < mp->mp_fheight; i++) {
					if (mp->mp_bh[i] == NULL)
						break;
					brelse(mp->mp_bh[i]);
					mp->mp_bh[i] = NULL;
				}
				i = branch_start;
			}
			if (n == 0)
				break;
		/* fall through - Branching from existing tree */
		case ALLOC_GROW_DEPTH:
			if (i > 1 && i < mp->mp_fheight)
				gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]);
			for (; i < mp->mp_fheight && n > 0; i++, n--)
				gfs2_indirect_init(mp, ip->i_gl, i,
						   mp->mp_list[i-1], bn++);
			if (i == mp->mp_fheight)
				state = ALLOC_DATA;
			if (n == 0)
				break;
		/* fall through - Tree complete, adding data blocks */
		case ALLOC_DATA:
			BUG_ON(n > dblks);
			BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
			gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]);
			dblks = n;
			ptr = metapointer(end_of_metadata, mp);
			iomap->addr = bn << inode->i_blkbits;
			iomap->flags |= IOMAP_F_NEW;
			while (n-- > 0)
				*ptr++ = cpu_to_be64(bn++);
			if (flags & IOMAP_ZERO) {
				ret = sb_issue_zeroout(sb, iomap->addr >> inode->i_blkbits,
						       dblks, GFP_NOFS);
				if (ret) {
					/* Best effort: report and carry on
					   without zeroing. */
					fs_err(sdp,
					       "Failed to zero data buffers\n");
					flags &= ~IOMAP_ZERO;
				}
			}
			break;
		}
	} while (iomap->addr == IOMAP_NULL_ADDR);

	iomap->length = (u64)dblks << inode->i_blkbits;
	ip->i_height = mp->mp_fheight;
	gfs2_add_inode_blocks(&ip->i_inode, alloced);
	gfs2_dinode_out(ip, mp->mp_bh[0]->b_data);
	return 0;
}
649  
650  /**
651   * hole_size - figure out the size of a hole
652   * @inode: The inode
653   * @lblock: The logical starting block number
654   * @mp: The metapath
655   *
656   * Returns: The hole size in bytes
657   *
658   */
static u64 hole_size(struct inode *inode, sector_t lblock, struct metapath *mp)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	struct metapath mp_eof;
	u64 factor = 1;		/* data blocks spanned per pointer at this height */
	int hgt;
	u64 holesz = 0;		/* accumulated hole length, in blocks */
	const __be64 *first, *end, *ptr;
	const struct buffer_head *bh;
	u64 lblock_stop = (i_size_read(inode) - 1) >> inode->i_blkbits;
	int zeroptrs;
	bool done = false;

	/* Get another metapath, to the very last byte */
	find_metapath(sdp, lblock_stop, &mp_eof, ip->i_height);
	/* Walk from the bottom of the tree upwards, counting consecutive
	   zero pointers from the start position at each height. */
	for (hgt = ip->i_height - 1; hgt >= 0 && !done; hgt--) {
		bh = mp->mp_bh[hgt];
		if (bh) {
			zeroptrs = 0;
			first = metapointer(hgt, mp);
			end = (const __be64 *)(bh->b_data + bh->b_size);

			for (ptr = first; ptr < end; ptr++) {
				if (*ptr) {
					/* Allocation found: the hole ends
					   within this block's range. */
					done = true;
					break;
				} else {
					zeroptrs++;
				}
			}
		} else {
			/* No buffer at this height: the whole block's worth
			   of pointers is unallocated. */
			zeroptrs = sdp->sd_inptrs;
		}
		/* Never report a hole past the end of the file. */
		if (factor * zeroptrs >= lblock_stop - lblock + 1) {
			holesz = lblock_stop - lblock + 1;
			break;
		}
		holesz += factor * zeroptrs;

		factor *= sdp->sd_inptrs;
		/* Step to the next pointer one level up, if not past EOF. */
		if (hgt && (mp->mp_list[hgt - 1] < mp_eof.mp_list[hgt - 1]))
			(mp->mp_list[hgt - 1])++;
	}
	/* Convert blocks to bytes. */
	return holesz << inode->i_blkbits;
}
705  
706  static void gfs2_stuffed_iomap(struct inode *inode, struct iomap *iomap)
707  {
708  	struct gfs2_inode *ip = GFS2_I(inode);
709  
710  	iomap->addr = (ip->i_no_addr << inode->i_blkbits) +
711  		      sizeof(struct gfs2_dinode);
712  	iomap->offset = 0;
713  	iomap->length = i_size_read(inode);
714  	iomap->type = IOMAP_MAPPED;
715  	iomap->flags = IOMAP_F_DATA_INLINE;
716  }
717  
718  /**
719   * gfs2_iomap_begin - Map blocks from an inode to disk blocks
720   * @inode: The inode
721   * @pos: Starting position in bytes
722   * @length: Length to map, in bytes
723   * @flags: iomap flags
724   * @iomap: The iomap structure
725   *
726   * Returns: errno
727   */
int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
		     unsigned flags, struct iomap *iomap)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	struct metapath mp = { .mp_aheight = 1, };
	unsigned int factor = sdp->sd_sb.sb_bsize;
	const u64 *arr = sdp->sd_heightsize;
	__be64 *ptr;
	sector_t lblock;
	sector_t lend;
	int ret;
	int eob;
	unsigned int len;
	struct buffer_head *bh;
	u8 height;

	trace_gfs2_iomap_start(ip, pos, length, flags);
	if (!length) {
		ret = -EINVAL;
		goto out;
	}

	/* Stuffed inodes are reported inline without taking i_rw_mutex. */
	if ((flags & IOMAP_REPORT) && gfs2_is_stuffed(ip)) {
		gfs2_stuffed_iomap(inode, iomap);
		if (pos >= iomap->length)
			return -ENOENT;
		ret = 0;
		goto out;
	}

	lblock = pos >> inode->i_blkbits;
	lend = (pos + length + sdp->sd_sb.sb_bsize - 1) >> inode->i_blkbits;

	/* Default answer: a hole covering the whole requested range. */
	iomap->offset = lblock << inode->i_blkbits;
	iomap->addr = IOMAP_NULL_ADDR;
	iomap->type = IOMAP_HOLE;
	iomap->length = (u64)(lend - lblock) << inode->i_blkbits;
	iomap->flags = IOMAP_F_MERGED;
	bmap_lock(ip, 0);

	/*
	 * Directory data blocks have a struct gfs2_meta_header header, so the
	 * remaining size is smaller than the filesystem block size.  Logical
	 * block numbers for directories are in units of this remaining size!
	 */
	if (gfs2_is_dir(ip)) {
		factor = sdp->sd_jbsize;
		arr = sdp->sd_jheightsize;
	}

	ret = gfs2_meta_inode_buffer(ip, &mp.mp_bh[0]);
	if (ret)
		goto out_release;

	/* Height needed for the tree to address byte (lblock+1)*factor. */
	height = ip->i_height;
	while ((lblock + 1) * factor > arr[height])
		height++;
	find_metapath(sdp, lblock, &mp, height);
	/* Beyond the current tree, or still stuffed: nothing mapped here. */
	if (height > ip->i_height || gfs2_is_stuffed(ip))
		goto do_alloc;

	ret = lookup_metapath(ip, &mp);
	if (ret < 0)
		goto out_release;

	/* Path hit an unallocated indirect block: it's a hole. */
	if (mp.mp_aheight != ip->i_height)
		goto do_alloc;

	ptr = metapointer(ip->i_height - 1, &mp);
	if (*ptr == 0)
		goto do_alloc;

	iomap->type = IOMAP_MAPPED;
	iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits;

	/* Extend the mapping over as many contiguous blocks as possible. */
	bh = mp.mp_bh[ip->i_height - 1];
	len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, lend - lblock, &eob);
	if (eob)
		iomap->flags |= IOMAP_F_BOUNDARY;
	iomap->length = (u64)len << inode->i_blkbits;

	ret = 0;

out_release:
	release_metapath(&mp);
	bmap_unlock(ip, 0);
out:
	trace_gfs2_iomap_end(ip, iomap, ret);
	return ret;

do_alloc:
	if (!(flags & IOMAP_WRITE)) {
		/* Read/report of a hole: size it, or fail past EOF. */
		if (pos >= i_size_read(inode)) {
			ret = -ENOENT;
			goto out_release;
		}
		ret = 0;
		iomap->length = hole_size(inode, lblock, &mp);
		goto out_release;
	}

	/* NOTE(review): allocation proceeds while holding only the read
	   side of i_rw_mutex taken above; presumably gfs2_iomap_alloc's
	   callers serialise writers elsewhere — confirm against callers. */
	ret = gfs2_iomap_alloc(inode, iomap, flags, &mp);
	goto out_release;
}
833  
834  /**
835   * gfs2_block_map - Map a block from an inode to a disk block
836   * @inode: The inode
837   * @lblock: The logical block number
838   * @bh_map: The bh to be mapped
839   * @create: True if its ok to alloc blocks to satify the request
840   *
841   * Sets buffer_mapped() if successful, sets buffer_boundary() if a
842   * read of metadata will be required before the next block can be
843   * mapped. Sets buffer_new() if new blocks were allocated.
844   *
845   * Returns: errno
846   */
847  
int gfs2_block_map(struct inode *inode, sector_t lblock,
		   struct buffer_head *bh_map, int create)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct iomap iomap;
	int ret, flags = 0;

	/* Start from a clean slate; flags are set again from the iomap. */
	clear_buffer_mapped(bh_map);
	clear_buffer_new(bh_map);
	clear_buffer_boundary(bh_map);
	trace_gfs2_bmap(ip, bh_map, lblock, create, 1);

	if (create)
		flags |= IOMAP_WRITE;
	if (buffer_zeronew(bh_map))
		flags |= IOMAP_ZERO;
	/* Translate the buffer_head request into an iomap request;
	   bh_map->b_size bounds the extent length. */
	ret = gfs2_iomap_begin(inode, (loff_t)lblock << inode->i_blkbits,
			       bh_map->b_size, flags, &iomap);
	if (ret) {
		if (!create && ret == -ENOENT) {
			/* Return unmapped buffer beyond the end of file.  */
			ret = 0;
		}
		goto out;
	}

	/* Clamp the mapping to what the caller asked for; a clamped
	   extent no longer ends at a metadata boundary. */
	if (iomap.length > bh_map->b_size) {
		iomap.length = bh_map->b_size;
		iomap.flags &= ~IOMAP_F_BOUNDARY;
	}
	if (iomap.addr != IOMAP_NULL_ADDR)
		map_bh(bh_map, inode->i_sb, iomap.addr >> inode->i_blkbits);
	bh_map->b_size = iomap.length;
	if (iomap.flags & IOMAP_F_BOUNDARY)
		set_buffer_boundary(bh_map);
	if (iomap.flags & IOMAP_F_NEW)
		set_buffer_new(bh_map);

out:
	trace_gfs2_bmap(ip, bh_map, lblock, create, ret);
	return ret;
}
890  
891  /*
892   * Deprecated: do not use in new code
893   */
894  int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen)
895  {
896  	struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 };
897  	int ret;
898  	int create = *new;
899  
900  	BUG_ON(!extlen);
901  	BUG_ON(!dblock);
902  	BUG_ON(!new);
903  
904  	bh.b_size = BIT(inode->i_blkbits + (create ? 0 : 5));
905  	ret = gfs2_block_map(inode, lblock, &bh, create);
906  	*extlen = bh.b_size >> inode->i_blkbits;
907  	*dblock = bh.b_blocknr;
908  	if (buffer_new(&bh))
909  		*new = 1;
910  	else
911  		*new = 0;
912  	return ret;
913  }
914  
915  /**
916   * gfs2_block_truncate_page - Deal with zeroing out data for truncate
917   *
918   * This is partly borrowed from ext3.
919   */
static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
{
	struct inode *inode = mapping->host;
	struct gfs2_inode *ip = GFS2_I(inode);
	unsigned long index = from >> PAGE_SHIFT;
	unsigned offset = from & (PAGE_SIZE-1);
	unsigned blocksize, iblock, length, pos;
	struct buffer_head *bh;
	struct page *page;
	int err;

	page = find_or_create_page(mapping, index, GFP_NOFS);
	if (!page)
		return 0;	/* best effort: no page, nothing to zero */

	blocksize = inode->i_sb->s_blocksize;
	/* Bytes to zero: from 'offset' to the end of its block. */
	length = blocksize - (offset & (blocksize - 1));
	iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);

	if (!page_has_buffers(page))
		create_empty_buffers(page, blocksize, 0);

	/* Find the buffer that contains "offset" */
	bh = page_buffers(page);
	pos = blocksize;
	while (offset >= pos) {
		bh = bh->b_this_page;
		iblock++;
		pos += blocksize;
	}

	err = 0;

	if (!buffer_mapped(bh)) {
		/* NOTE(review): gfs2_block_map's return value is ignored
		   here; presumably an error leaves bh unmapped and is
		   caught by the check below — confirm. */
		gfs2_block_map(inode, iblock, bh, 0);
		/* unmapped? It's a hole - nothing to do */
		if (!buffer_mapped(bh))
			goto unlock;
	}

	/* Ok, it's mapped. Make sure it's up-to-date */
	if (PageUptodate(page))
		set_buffer_uptodate(bh);

	if (!buffer_uptodate(bh)) {
		err = -EIO;
		ll_rw_block(REQ_OP_READ, 0, 1, &bh);
		wait_on_buffer(bh);
		/* Uhhuh. Read error. Complain and punt. */
		if (!buffer_uptodate(bh))
			goto unlock;
		err = 0;
	}

	/* Journaled/ordered data must go through the transaction. */
	if (!gfs2_is_writeback(ip))
		gfs2_trans_add_data(ip->i_gl, bh);

	zero_user(page, offset, length);
	mark_buffer_dirty(bh);
unlock:
	unlock_page(page);
	put_page(page);
	return err;
}
984  
985  #define GFS2_JTRUNC_REVOKES 8192
986  
987  /**
988   * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files
989   * @inode: The inode being truncated
990   * @oldsize: The original (larger) size
991   * @newsize: The new smaller size
992   *
993   * With jdata files, we have to journal a revoke for each block which is
994   * truncated. As a result, we need to split this into separate transactions
995   * if the number of pages being truncated gets too large.
996   */
997  
static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize)
{
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	/* Largest byte range whose revokes fit in one transaction. */
	u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
	u64 chunk;
	int error;

	/* Truncate from the tail towards newsize, one chunk at a time,
	   ending the current transaction and starting a fresh one after
	   each chunk so the revoke count stays bounded. */
	while (oldsize != newsize) {
		chunk = oldsize - newsize;
		if (chunk > max_chunk)
			chunk = max_chunk;
		truncate_pagecache(inode, oldsize - chunk);
		oldsize -= chunk;
		gfs2_trans_end(sdp);
		error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
		if (error)
			/* Transaction already ended; caller must not end
			   it again on this path. */
			return error;
	}

	return 0;
}
1019  
1020  static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
1021  {
1022  	struct gfs2_inode *ip = GFS2_I(inode);
1023  	struct gfs2_sbd *sdp = GFS2_SB(inode);
1024  	struct address_space *mapping = inode->i_mapping;
1025  	struct buffer_head *dibh;
1026  	int journaled = gfs2_is_jdata(ip);
1027  	int error;
1028  
1029  	if (journaled)
1030  		error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES);
1031  	else
1032  		error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1033  	if (error)
1034  		return error;
1035  
1036  	error = gfs2_meta_inode_buffer(ip, &dibh);
1037  	if (error)
1038  		goto out;
1039  
1040  	gfs2_trans_add_meta(ip->i_gl, dibh);
1041  
1042  	if (gfs2_is_stuffed(ip)) {
1043  		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
1044  	} else {
1045  		if (newsize & (u64)(sdp->sd_sb.sb_bsize - 1)) {
1046  			error = gfs2_block_truncate_page(mapping, newsize);
1047  			if (error)
1048  				goto out_brelse;
1049  		}
1050  		ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
1051  	}
1052  
1053  	i_size_write(inode, newsize);
1054  	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1055  	gfs2_dinode_out(ip, dibh->b_data);
1056  
1057  	if (journaled)
1058  		error = gfs2_journaled_truncate(inode, oldsize, newsize);
1059  	else
1060  		truncate_pagecache(inode, newsize);
1061  
1062  	if (error) {
1063  		brelse(dibh);
1064  		return error;
1065  	}
1066  
1067  out_brelse:
1068  	brelse(dibh);
1069  out:
1070  	gfs2_trans_end(sdp);
1071  	return error;
1072  }
1073  
1074  /**
1075   * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein
1076   * @ip: inode
1077   * @rg_gh: holder of resource group glock
1078   * @mp: current metapath fully populated with buffers
1079   * @btotal: place to keep count of total blocks freed
1080   * @hgt: height we're processing
1081   * @first: true if this is the first call to this function for this height
1082   *
1083   * We sweep a metadata buffer (provided by the metapath) for blocks we need to
1084   * free, and free them all. However, we do it one rgrp at a time. If this
1085   * block has references to multiple rgrps, we break it into individual
1086   * transactions. This allows other processes to use the rgrps while we're
1087   * focused on a single one, for better concurrency / performance.
1088   * At every transaction boundary, we rewrite the inode into the journal.
1089   * That way the bitmaps are kept consistent with the inode and we can recover
1090   * if we're interrupted by power-outages.
1091   *
1092   * Returns: 0, or return code if an error occurred.
1093   *          *btotal has the total number of blocks freed
1094   */
1095  static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh,
1096  			      const struct metapath *mp, u32 *btotal, int hgt,
1097  			      bool preserve1)
1098  {
1099  	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1100  	struct gfs2_rgrpd *rgd;
1101  	struct gfs2_trans *tr;
1102  	struct buffer_head *bh = mp->mp_bh[hgt];
1103  	__be64 *top, *bottom, *p;
1104  	int blks_outside_rgrp;
1105  	u64 bn, bstart, isize_blks;
1106  	s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */
1107  	int meta = ((hgt != ip->i_height - 1) ? 1 : 0);
1108  	int ret = 0;
1109  	bool buf_in_tr = false; /* buffer was added to transaction */
1110  
1111  	if (gfs2_metatype_check(sdp, bh,
1112  				(hgt ? GFS2_METATYPE_IN : GFS2_METATYPE_DI)))
1113  		return -EIO;
1114  
1115  more_rgrps:
1116  	blks_outside_rgrp = 0;
1117  	bstart = 0;
1118  	blen = 0;
1119  	top = metapointer(hgt, mp); /* first ptr from metapath */
1120  	/* If we're keeping some data at the truncation point, we've got to
1121  	   preserve the metadata tree by adding 1 to the starting metapath. */
1122  	if (preserve1)
1123  		top++;
1124  
1125  	bottom = (__be64 *)(bh->b_data + bh->b_size);
1126  
1127  	for (p = top; p < bottom; p++) {
1128  		if (!*p)
1129  			continue;
1130  		bn = be64_to_cpu(*p);
1131  		if (gfs2_holder_initialized(rd_gh)) {
1132  			rgd = gfs2_glock2rgrp(rd_gh->gh_gl);
1133  			gfs2_assert_withdraw(sdp,
1134  				     gfs2_glock_is_locked_by_me(rd_gh->gh_gl));
1135  		} else {
1136  			rgd = gfs2_blk2rgrpd(sdp, bn, false);
1137  			ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
1138  						 0, rd_gh);
1139  			if (ret)
1140  				goto out;
1141  
1142  			/* Must be done with the rgrp glock held: */
1143  			if (gfs2_rs_active(&ip->i_res) &&
1144  			    rgd == ip->i_res.rs_rbm.rgd)
1145  				gfs2_rs_deltree(&ip->i_res);
1146  		}
1147  
1148  		if (!rgrp_contains_block(rgd, bn)) {
1149  			blks_outside_rgrp++;
1150  			continue;
1151  		}
1152  
1153  		/* The size of our transactions will be unknown until we
1154  		   actually process all the metadata blocks that relate to
1155  		   the rgrp. So we estimate. We know it can't be more than
1156  		   the dinode's i_blocks and we don't want to exceed the
1157  		   journal flush threshold, sd_log_thresh2. */
1158  		if (current->journal_info == NULL) {
1159  			unsigned int jblocks_rqsted, revokes;
1160  
1161  			jblocks_rqsted = rgd->rd_length + RES_DINODE +
1162  				RES_INDIRECT;
1163  			isize_blks = gfs2_get_inode_blocks(&ip->i_inode);
1164  			if (isize_blks > atomic_read(&sdp->sd_log_thresh2))
1165  				jblocks_rqsted +=
1166  					atomic_read(&sdp->sd_log_thresh2);
1167  			else
1168  				jblocks_rqsted += isize_blks;
1169  			revokes = jblocks_rqsted;
1170  			if (meta)
1171  				revokes += hptrs(sdp, hgt);
1172  			else if (ip->i_depth)
1173  				revokes += sdp->sd_inptrs;
1174  			ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
1175  			if (ret)
1176  				goto out_unlock;
1177  			down_write(&ip->i_rw_mutex);
1178  		}
1179  		/* check if we will exceed the transaction blocks requested */
1180  		tr = current->journal_info;
1181  		if (tr->tr_num_buf_new + RES_STATFS +
1182  		    RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
1183  			/* We set blks_outside_rgrp to ensure the loop will
1184  			   be repeated for the same rgrp, but with a new
1185  			   transaction. */
1186  			blks_outside_rgrp++;
1187  			/* This next part is tricky. If the buffer was added
1188  			   to the transaction, we've already set some block
1189  			   pointers to 0, so we better follow through and free
1190  			   them, or we will introduce corruption (so break).
1191  			   This may be impossible, or at least rare, but I
1192  			   decided to cover the case regardless.
1193  
1194  			   If the buffer was not added to the transaction
1195  			   (this call), doing so would exceed our transaction
1196  			   size, so we need to end the transaction and start a
1197  			   new one (so goto). */
1198  
1199  			if (buf_in_tr)
1200  				break;
1201  			goto out_unlock;
1202  		}
1203  
1204  		gfs2_trans_add_meta(ip->i_gl, bh);
1205  		buf_in_tr = true;
1206  		*p = 0;
1207  		if (bstart + blen == bn) {
1208  			blen++;
1209  			continue;
1210  		}
1211  		if (bstart) {
1212  			__gfs2_free_blocks(ip, bstart, (u32)blen, meta);
1213  			(*btotal) += blen;
1214  			gfs2_add_inode_blocks(&ip->i_inode, -blen);
1215  		}
1216  		bstart = bn;
1217  		blen = 1;
1218  	}
1219  	if (bstart) {
1220  		__gfs2_free_blocks(ip, bstart, (u32)blen, meta);
1221  		(*btotal) += blen;
1222  		gfs2_add_inode_blocks(&ip->i_inode, -blen);
1223  	}
1224  out_unlock:
1225  	if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks
1226  					    outside the rgrp we just processed,
1227  					    do it all over again. */
1228  		if (current->journal_info) {
1229  			struct buffer_head *dibh = mp->mp_bh[0];
1230  
1231  			/* Every transaction boundary, we rewrite the dinode
1232  			   to keep its di_blocks current in case of failure. */
1233  			ip->i_inode.i_mtime = ip->i_inode.i_ctime =
1234  				current_time(&ip->i_inode);
1235  			gfs2_trans_add_meta(ip->i_gl, dibh);
1236  			gfs2_dinode_out(ip, dibh->b_data);
1237  			up_write(&ip->i_rw_mutex);
1238  			gfs2_trans_end(sdp);
1239  		}
1240  		gfs2_glock_dq_uninit(rd_gh);
1241  		cond_resched();
1242  		goto more_rgrps;
1243  	}
1244  out:
1245  	return ret;
1246  }
1247  
1248  /**
1249   * find_nonnull_ptr - find a non-null pointer given a metapath and height
1250   * assumes the metapath is valid (with buffers) out to height h
1251   * @mp: starting metapath
1252   * @h: desired height to search
1253   *
1254   * Returns: true if a non-null pointer was found in the metapath buffer
1255   *          false if all remaining pointers are NULL in the buffer
1256   */
1257  static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp,
1258  			     unsigned int h)
1259  {
1260  	__be64 *ptr;
1261  	unsigned int ptrs = hptrs(sdp, h) - 1;
1262  
1263  	while (true) {
1264  		ptr = metapointer(h, mp);
1265  		if (*ptr) { /* if we have a non-null pointer */
1266  			/* Now zero the metapath after the current height. */
1267  			h++;
1268  			if (h < GFS2_MAX_META_HEIGHT)
1269  				memset(&mp->mp_list[h], 0,
1270  				       (GFS2_MAX_META_HEIGHT - h) *
1271  				       sizeof(mp->mp_list[0]));
1272  			return true;
1273  		}
1274  
1275  		if (mp->mp_list[h] < ptrs)
1276  			mp->mp_list[h]++;
1277  		else
1278  			return false; /* no more pointers in this buffer */
1279  	}
1280  }
1281  
/* States of the bottom-up deallocation state machine in trunc_dealloc() */
enum dealloc_states {
	DEALLOC_MP_FULL = 0,    /* Strip a metapath with all buffers read in */
	DEALLOC_MP_LOWER = 1,   /* lower the metapath strip height */
	DEALLOC_FILL_MP = 2,  /* Fill in the metapath to the given height. */
	DEALLOC_DONE = 3,       /* process complete */
};
1288  
1289  static bool mp_eq_to_hgt(struct metapath *mp, __u16 *nbof, unsigned int h)
1290  {
1291  	if (memcmp(mp->mp_list, nbof, h * sizeof(mp->mp_list[0])))
1292  		return false;
1293  	return true;
1294  }
1295  
1296  /**
1297   * trunc_dealloc - truncate a file down to a desired size
1298   * @ip: inode to truncate
1299   * @newsize: The desired size of the file
1300   *
1301   * This function truncates a file to newsize. It works from the
1302   * bottom up, and from the right to the left. In other words, it strips off
1303   * the highest layer (data) before stripping any of the metadata. Doing it
1304   * this way is best in case the operation is interrupted by power failure, etc.
1305   * The dinode is rewritten in every transaction to guarantee integrity.
1306   */
1307  static int trunc_dealloc(struct gfs2_inode *ip, u64 newsize)
1308  {
1309  	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1310  	struct metapath mp;
1311  	struct buffer_head *dibh, *bh;
1312  	struct gfs2_holder rd_gh;
1313  	u64 lblock;
1314  	__u16 nbof[GFS2_MAX_META_HEIGHT]; /* new beginning of truncation */
1315  	unsigned int strip_h = ip->i_height - 1;
1316  	u32 btotal = 0;
1317  	int ret, state;
1318  	int mp_h; /* metapath buffers are read in to this height */
1319  	sector_t last_ra = 0;
1320  	u64 prev_bnr = 0;
1321  	bool preserve1; /* need to preserve the first meta pointer? */
1322  
1323  	if (!newsize)
1324  		lblock = 0;
1325  	else
1326  		lblock = (newsize - 1) >> sdp->sd_sb.sb_bsize_shift;
1327  
1328  	memset(&mp, 0, sizeof(mp));
1329  	find_metapath(sdp, lblock, &mp, ip->i_height);
1330  
1331  	memcpy(&nbof, &mp.mp_list, sizeof(nbof));
1332  
1333  	ret = gfs2_meta_inode_buffer(ip, &dibh);
1334  	if (ret)
1335  		return ret;
1336  
1337  	mp.mp_bh[0] = dibh;
1338  	ret = lookup_metapath(ip, &mp);
1339  	if (ret == ip->i_height)
1340  		state = DEALLOC_MP_FULL; /* We have a complete metapath */
1341  	else
1342  		state = DEALLOC_FILL_MP; /* deal with partial metapath */
1343  
1344  	ret = gfs2_rindex_update(sdp);
1345  	if (ret)
1346  		goto out_metapath;
1347  
1348  	ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
1349  	if (ret)
1350  		goto out_metapath;
1351  	gfs2_holder_mark_uninitialized(&rd_gh);
1352  
1353  	mp_h = strip_h;
1354  
1355  	while (state != DEALLOC_DONE) {
1356  		switch (state) {
1357  		/* Truncate a full metapath at the given strip height.
1358  		 * Note that strip_h == mp_h in order to be in this state. */
1359  		case DEALLOC_MP_FULL:
1360  			if (mp_h > 0) { /* issue read-ahead on metadata */
1361  				__be64 *top;
1362  
1363  				bh = mp.mp_bh[mp_h - 1];
1364  				if (bh->b_blocknr != last_ra) {
1365  					last_ra = bh->b_blocknr;
1366  					top = metaptr1(mp_h - 1, &mp);
1367  					gfs2_metapath_ra(ip->i_gl, bh, top);
1368  				}
1369  			}
1370  			/* If we're truncating to a non-zero size and the mp is
1371  			   at the beginning of file for the strip height, we
1372  			   need to preserve the first metadata pointer. */
1373  			preserve1 = (newsize && mp_eq_to_hgt(&mp, nbof, mp_h));
1374  			bh = mp.mp_bh[mp_h];
1375  			gfs2_assert_withdraw(sdp, bh);
1376  			if (gfs2_assert_withdraw(sdp,
1377  						 prev_bnr != bh->b_blocknr)) {
1378  				printk(KERN_EMERG "GFS2: fsid=%s:inode %llu, "
1379  				       "block:%llu, i_h:%u, s_h:%u, mp_h:%u\n",
1380  				       sdp->sd_fsname,
1381  				       (unsigned long long)ip->i_no_addr,
1382  				       prev_bnr, ip->i_height, strip_h, mp_h);
1383  			}
1384  			prev_bnr = bh->b_blocknr;
1385  			ret = sweep_bh_for_rgrps(ip, &rd_gh, &mp, &btotal,
1386  						 mp_h, preserve1);
1387  			/* If we hit an error or just swept dinode buffer,
1388  			   just exit. */
1389  			if (ret || !mp_h) {
1390  				state = DEALLOC_DONE;
1391  				break;
1392  			}
1393  			state = DEALLOC_MP_LOWER;
1394  			break;
1395  
1396  		/* lower the metapath strip height */
1397  		case DEALLOC_MP_LOWER:
1398  			/* We're done with the current buffer, so release it,
1399  			   unless it's the dinode buffer. Then back up to the
1400  			   previous pointer. */
1401  			if (mp_h) {
1402  				brelse(mp.mp_bh[mp_h]);
1403  				mp.mp_bh[mp_h] = NULL;
1404  			}
1405  			/* If we can't get any lower in height, we've stripped
1406  			   off all we can. Next step is to back up and start
1407  			   stripping the previous level of metadata. */
1408  			if (mp_h == 0) {
1409  				strip_h--;
1410  				memcpy(&mp.mp_list, &nbof, sizeof(nbof));
1411  				mp_h = strip_h;
1412  				state = DEALLOC_FILL_MP;
1413  				break;
1414  			}
1415  			mp.mp_list[mp_h] = 0;
1416  			mp_h--; /* search one metadata height down */
1417  			if (mp.mp_list[mp_h] >= hptrs(sdp, mp_h) - 1)
1418  				break; /* loop around in the same state */
1419  			mp.mp_list[mp_h]++;
1420  			/* Here we've found a part of the metapath that is not
1421  			 * allocated. We need to search at that height for the
1422  			 * next non-null pointer. */
1423  			if (find_nonnull_ptr(sdp, &mp, mp_h)) {
1424  				state = DEALLOC_FILL_MP;
1425  				mp_h++;
1426  			}
1427  			/* No more non-null pointers at this height. Back up
1428  			   to the previous height and try again. */
1429  			break; /* loop around in the same state */
1430  
1431  		/* Fill the metapath with buffers to the given height. */
1432  		case DEALLOC_FILL_MP:
1433  			/* Fill the buffers out to the current height. */
1434  			ret = fillup_metapath(ip, &mp, mp_h);
1435  			if (ret < 0)
1436  				goto out;
1437  
1438  			/* If buffers found for the entire strip height */
1439  			if ((ret == ip->i_height) && (mp_h == strip_h)) {
1440  				state = DEALLOC_MP_FULL;
1441  				break;
1442  			}
1443  			if (ret < ip->i_height) /* We have a partial height */
1444  				mp_h = ret - 1;
1445  
1446  			/* If we find a non-null block pointer, crawl a bit
1447  			   higher up in the metapath and try again, otherwise
1448  			   we need to look lower for a new starting point. */
1449  			if (find_nonnull_ptr(sdp, &mp, mp_h))
1450  				mp_h++;
1451  			else
1452  				state = DEALLOC_MP_LOWER;
1453  			break;
1454  		}
1455  	}
1456  
1457  	if (btotal) {
1458  		if (current->journal_info == NULL) {
1459  			ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS +
1460  					       RES_QUOTA, 0);
1461  			if (ret)
1462  				goto out;
1463  			down_write(&ip->i_rw_mutex);
1464  		}
1465  		gfs2_statfs_change(sdp, 0, +btotal, 0);
1466  		gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
1467  				  ip->i_inode.i_gid);
1468  		ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1469  		gfs2_trans_add_meta(ip->i_gl, dibh);
1470  		gfs2_dinode_out(ip, dibh->b_data);
1471  		up_write(&ip->i_rw_mutex);
1472  		gfs2_trans_end(sdp);
1473  	}
1474  
1475  out:
1476  	if (gfs2_holder_initialized(&rd_gh))
1477  		gfs2_glock_dq_uninit(&rd_gh);
1478  	if (current->journal_info) {
1479  		up_write(&ip->i_rw_mutex);
1480  		gfs2_trans_end(sdp);
1481  		cond_resched();
1482  	}
1483  	gfs2_quota_unhold(ip);
1484  out_metapath:
1485  	release_metapath(&mp);
1486  	return ret;
1487  }
1488  
/**
 * trunc_end - finish a truncate operation
 * @ip: the inode being truncated
 *
 * Called after trunc_dealloc() has freed the blocks beyond the new size
 * (see do_shrink()). Resets the metadata height and allocation goal if the
 * file is now empty, clears GFS2_DIF_TRUNC_IN_PROG, and writes the dinode.
 *
 * Returns: errno
 */
static int trunc_end(struct gfs2_inode *ip)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	struct buffer_head *dibh;
	int error;

	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
	if (error)
		return error;

	down_write(&ip->i_rw_mutex);

	error = gfs2_meta_inode_buffer(ip, &dibh);
	if (error)
		goto out;

	if (!i_size_read(&ip->i_inode)) {
		/* File is now empty: no metadata tree remains, and the
		   allocation goal returns to the dinode's own address. */
		ip->i_height = 0;
		ip->i_goal = ip->i_no_addr;
		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
		gfs2_ordered_del_inode(ip);
	}
	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
	ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;

	gfs2_trans_add_meta(ip->i_gl, dibh);
	gfs2_dinode_out(ip, dibh->b_data);
	brelse(dibh);

out:
	up_write(&ip->i_rw_mutex);
	gfs2_trans_end(sdp);
	return error;
}
1523  
1524  /**
1525   * do_shrink - make a file smaller
1526   * @inode: the inode
1527   * @oldsize: the current inode size
1528   * @newsize: the size to make the file
1529   *
1530   * Called with an exclusive lock on @inode. The @size must
1531   * be equal to or smaller than the current inode size.
1532   *
1533   * Returns: errno
1534   */
1535  
1536  static int do_shrink(struct inode *inode, u64 oldsize, u64 newsize)
1537  {
1538  	struct gfs2_inode *ip = GFS2_I(inode);
1539  	int error;
1540  
1541  	error = trunc_start(inode, oldsize, newsize);
1542  	if (error < 0)
1543  		return error;
1544  	if (gfs2_is_stuffed(ip))
1545  		return 0;
1546  
1547  	error = trunc_dealloc(ip, newsize);
1548  	if (error == 0)
1549  		error = trunc_end(ip);
1550  
1551  	return error;
1552  }
1553  
1554  void gfs2_trim_blocks(struct inode *inode)
1555  {
1556  	u64 size = inode->i_size;
1557  	int ret;
1558  
1559  	ret = do_shrink(inode, size, size);
1560  	WARN_ON(ret != 0);
1561  }
1562  
1563  /**
1564   * do_grow - Touch and update inode size
1565   * @inode: The inode
1566   * @size: The new size
1567   *
1568   * This function updates the timestamps on the inode and
1569   * may also increase the size of the inode. This function
1570   * must not be called with @size any smaller than the current
1571   * inode size.
1572   *
1573   * Although it is not strictly required to unstuff files here,
1574   * earlier versions of GFS2 have a bug in the stuffed file reading
1575   * code which will result in a buffer overrun if the size is larger
1576   * than the max stuffed file size. In order to prevent this from
1577   * occurring, such files are unstuffed, but in other cases we can
1578   * just update the inode size directly.
1579   *
1580   * Returns: 0 on success, or -ve on error
1581   */
1582  
1583  static int do_grow(struct inode *inode, u64 size)
1584  {
1585  	struct gfs2_inode *ip = GFS2_I(inode);
1586  	struct gfs2_sbd *sdp = GFS2_SB(inode);
1587  	struct gfs2_alloc_parms ap = { .target = 1, };
1588  	struct buffer_head *dibh;
1589  	int error;
1590  	int unstuff = 0;
1591  
1592  	if (gfs2_is_stuffed(ip) &&
1593  	    (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) {
1594  		error = gfs2_quota_lock_check(ip, &ap);
1595  		if (error)
1596  			return error;
1597  
1598  		error = gfs2_inplace_reserve(ip, &ap);
1599  		if (error)
1600  			goto do_grow_qunlock;
1601  		unstuff = 1;
1602  	}
1603  
1604  	error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
1605  				 (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
1606  				  0 : RES_QUOTA), 0);
1607  	if (error)
1608  		goto do_grow_release;
1609  
1610  	if (unstuff) {
1611  		error = gfs2_unstuff_dinode(ip, NULL);
1612  		if (error)
1613  			goto do_end_trans;
1614  	}
1615  
1616  	error = gfs2_meta_inode_buffer(ip, &dibh);
1617  	if (error)
1618  		goto do_end_trans;
1619  
1620  	i_size_write(inode, size);
1621  	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1622  	gfs2_trans_add_meta(ip->i_gl, dibh);
1623  	gfs2_dinode_out(ip, dibh->b_data);
1624  	brelse(dibh);
1625  
1626  do_end_trans:
1627  	gfs2_trans_end(sdp);
1628  do_grow_release:
1629  	if (unstuff) {
1630  		gfs2_inplace_release(ip);
1631  do_grow_qunlock:
1632  		gfs2_quota_unlock(ip);
1633  	}
1634  	return error;
1635  }
1636  
1637  /**
1638   * gfs2_setattr_size - make a file a given size
1639   * @inode: the inode
1640   * @newsize: the size to make the file
1641   *
1642   * The file size can grow, shrink, or stay the same size. This
1643   * is called holding i_mutex and an exclusive glock on the inode
1644   * in question.
1645   *
1646   * Returns: errno
1647   */
1648  
1649  int gfs2_setattr_size(struct inode *inode, u64 newsize)
1650  {
1651  	struct gfs2_inode *ip = GFS2_I(inode);
1652  	int ret;
1653  	u64 oldsize;
1654  
1655  	BUG_ON(!S_ISREG(inode->i_mode));
1656  
1657  	ret = inode_newsize_ok(inode, newsize);
1658  	if (ret)
1659  		return ret;
1660  
1661  	inode_dio_wait(inode);
1662  
1663  	ret = gfs2_rsqa_alloc(ip);
1664  	if (ret)
1665  		goto out;
1666  
1667  	oldsize = inode->i_size;
1668  	if (newsize >= oldsize) {
1669  		ret = do_grow(inode, newsize);
1670  		goto out;
1671  	}
1672  
1673  	ret = do_shrink(inode, oldsize, newsize);
1674  out:
1675  	gfs2_rsqa_delete(ip, NULL);
1676  	return ret;
1677  }
1678  
1679  int gfs2_truncatei_resume(struct gfs2_inode *ip)
1680  {
1681  	int error;
1682  	error = trunc_dealloc(ip, i_size_read(&ip->i_inode));
1683  	if (!error)
1684  		error = trunc_end(ip);
1685  	return error;
1686  }
1687  
/* Free every data and metadata block of the file by truncating to zero. */
int gfs2_file_dealloc(struct gfs2_inode *ip)
{
	return trunc_dealloc(ip, 0);
}
1692  
1693  /**
1694   * gfs2_free_journal_extents - Free cached journal bmap info
1695   * @jd: The journal
1696   *
1697   */
1698  
1699  void gfs2_free_journal_extents(struct gfs2_jdesc *jd)
1700  {
1701  	struct gfs2_journal_extent *jext;
1702  
1703  	while(!list_empty(&jd->extent_list)) {
1704  		jext = list_entry(jd->extent_list.next, struct gfs2_journal_extent, list);
1705  		list_del(&jext->list);
1706  		kfree(jext);
1707  	}
1708  }
1709  
1710  /**
1711   * gfs2_add_jextent - Add or merge a new extent to extent cache
1712   * @jd: The journal descriptor
1713   * @lblock: The logical block at start of new extent
1714   * @dblock: The physical block at start of new extent
1715   * @blocks: Size of extent in fs blocks
1716   *
1717   * Returns: 0 on success or -ENOMEM
1718   */
1719  
1720  static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks)
1721  {
1722  	struct gfs2_journal_extent *jext;
1723  
1724  	if (!list_empty(&jd->extent_list)) {
1725  		jext = list_entry(jd->extent_list.prev, struct gfs2_journal_extent, list);
1726  		if ((jext->dblock + jext->blocks) == dblock) {
1727  			jext->blocks += blocks;
1728  			return 0;
1729  		}
1730  	}
1731  
1732  	jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS);
1733  	if (jext == NULL)
1734  		return -ENOMEM;
1735  	jext->dblock = dblock;
1736  	jext->lblock = lblock;
1737  	jext->blocks = blocks;
1738  	list_add_tail(&jext->list, &jd->extent_list);
1739  	jd->nr_extents++;
1740  	return 0;
1741  }
1742  
1743  /**
1744   * gfs2_map_journal_extents - Cache journal bmap info
1745   * @sdp: The super block
1746   * @jd: The journal to map
1747   *
1748   * Create a reusable "extent" mapping from all logical
1749   * blocks to all physical blocks for the given journal.  This will save
1750   * us time when writing journal blocks.  Most journals will have only one
1751   * extent that maps all their logical blocks.  That's because gfs2.mkfs
1752   * arranges the journal blocks sequentially to maximize performance.
1753   * So the extent would map the first block for the entire file length.
1754   * However, gfs2_jadd can happen while file activity is happening, so
1755   * those journals may not be sequential.  Less likely is the case where
1756   * the users created their own journals by mounting the metafs and
1757   * laying it out.  But it's still possible.  These journals might have
1758   * several extents.
1759   *
1760   * Returns: 0 on success, or error on failure
1761   */
1762  
1763  int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
1764  {
1765  	u64 lblock = 0;
1766  	u64 lblock_stop;
1767  	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
1768  	struct buffer_head bh;
1769  	unsigned int shift = sdp->sd_sb.sb_bsize_shift;
1770  	u64 size;
1771  	int rc;
1772  
1773  	lblock_stop = i_size_read(jd->jd_inode) >> shift;
1774  	size = (lblock_stop - lblock) << shift;
1775  	jd->nr_extents = 0;
1776  	WARN_ON(!list_empty(&jd->extent_list));
1777  
1778  	do {
1779  		bh.b_state = 0;
1780  		bh.b_blocknr = 0;
1781  		bh.b_size = size;
1782  		rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0);
1783  		if (rc || !buffer_mapped(&bh))
1784  			goto fail;
1785  		rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift);
1786  		if (rc)
1787  			goto fail;
1788  		size -= bh.b_size;
1789  		lblock += (bh.b_size >> ip->i_inode.i_blkbits);
1790  	} while(size > 0);
1791  
1792  	fs_info(sdp, "journal %d mapped with %u extents\n", jd->jd_jid,
1793  		jd->nr_extents);
1794  	return 0;
1795  
1796  fail:
1797  	fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n",
1798  		rc, jd->jd_jid,
1799  		(unsigned long long)(i_size_read(jd->jd_inode) - size),
1800  		jd->nr_extents);
1801  	fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n",
1802  		rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr,
1803  		bh.b_state, (unsigned long long)bh.b_size);
1804  	gfs2_free_journal_extents(jd);
1805  	return rc;
1806  }
1807  
1808  /**
1809   * gfs2_write_alloc_required - figure out if a write will require an allocation
1810   * @ip: the file being written to
1811   * @offset: the offset to write to
1812   * @len: the number of bytes being written
1813   *
1814   * Returns: 1 if an alloc is required, 0 otherwise
1815   */
1816  
1817  int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1818  			      unsigned int len)
1819  {
1820  	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1821  	struct buffer_head bh;
1822  	unsigned int shift;
1823  	u64 lblock, lblock_stop, size;
1824  	u64 end_of_file;
1825  
1826  	if (!len)
1827  		return 0;
1828  
1829  	if (gfs2_is_stuffed(ip)) {
1830  		if (offset + len >
1831  		    sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
1832  			return 1;
1833  		return 0;
1834  	}
1835  
1836  	shift = sdp->sd_sb.sb_bsize_shift;
1837  	BUG_ON(gfs2_is_dir(ip));
1838  	end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
1839  	lblock = offset >> shift;
1840  	lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1841  	if (lblock_stop > end_of_file)
1842  		return 1;
1843  
1844  	size = (lblock_stop - lblock) << shift;
1845  	do {
1846  		bh.b_state = 0;
1847  		bh.b_size = size;
1848  		gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
1849  		if (!buffer_mapped(&bh))
1850  			return 1;
1851  		size -= bh.b_size;
1852  		lblock += (bh.b_size >> ip->i_inode.i_blkbits);
1853  	} while(size > 0);
1854  
1855  	return 0;
1856  }
1857  
1858