xref: /openbmc/linux/fs/gfs2/bmap.c (revision c8dbaa22)
1 /*
2  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
3  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
4  *
5  * This copyrighted material is made available to anyone wishing to use,
6  * modify, copy, or redistribute it subject to the terms and conditions
7  * of the GNU General Public License version 2.
8  */
9 
10 #include <linux/spinlock.h>
11 #include <linux/completion.h>
12 #include <linux/buffer_head.h>
13 #include <linux/blkdev.h>
14 #include <linux/gfs2_ondisk.h>
15 #include <linux/crc32.h>
16 
17 #include "gfs2.h"
18 #include "incore.h"
19 #include "bmap.h"
20 #include "glock.h"
21 #include "inode.h"
22 #include "meta_io.h"
23 #include "quota.h"
24 #include "rgrp.h"
25 #include "log.h"
26 #include "super.h"
27 #include "trans.h"
28 #include "dir.h"
29 #include "util.h"
30 #include "trace_gfs2.h"
31 
/*
 * A path through an inode's metadata tree, indexed by height (index 0 is
 * the dinode level).
 *
 * mp_list doesn't need to be that large as max 64 bit pointers in a 4k
 * block is 512, so __u16 is fine for that. It saves stack space to
 * keep it small.
 */
struct metapath {
	/* Buffer for each height; mp_bh[0] is the dinode buffer */
	struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT];
	/* Index of the pointer to follow within the block at each height */
	__u16 mp_list[GFS2_MAX_META_HEIGHT];
};
40 
41 /**
42  * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
43  * @ip: the inode
44  * @dibh: the dinode buffer
45  * @block: the block number that was allocated
46  * @page: The (optional) page. This is looked up if @page is NULL
47  *
48  * Returns: errno
49  */
50 
51 static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
52 			       u64 block, struct page *page)
53 {
54 	struct inode *inode = &ip->i_inode;
55 	struct buffer_head *bh;
56 	int release = 0;
57 
58 	if (!page || page->index) {
59 		page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
60 		if (!page)
61 			return -ENOMEM;
62 		release = 1;
63 	}
64 
65 	if (!PageUptodate(page)) {
66 		void *kaddr = kmap(page);
67 		u64 dsize = i_size_read(inode);
68 
69 		if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))
70 			dsize = dibh->b_size - sizeof(struct gfs2_dinode);
71 
72 		memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
73 		memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
74 		kunmap(page);
75 
76 		SetPageUptodate(page);
77 	}
78 
79 	if (!page_has_buffers(page))
80 		create_empty_buffers(page, BIT(inode->i_blkbits),
81 				     BIT(BH_Uptodate));
82 
83 	bh = page_buffers(page);
84 
85 	if (!buffer_mapped(bh))
86 		map_bh(bh, inode->i_sb, block);
87 
88 	set_buffer_uptodate(bh);
89 	if (!gfs2_is_jdata(ip))
90 		mark_buffer_dirty(bh);
91 	if (!gfs2_is_writeback(ip))
92 		gfs2_trans_add_data(ip->i_gl, bh);
93 
94 	if (release) {
95 		unlock_page(page);
96 		put_page(page);
97 	}
98 
99 	return 0;
100 }
101 
102 /**
103  * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
104  * @ip: The GFS2 inode to unstuff
105  * @page: The (optional) page. This is looked up if the @page is NULL
106  *
107  * This routine unstuffs a dinode and returns it to a "normal" state such
108  * that the height can be grown in the traditional way.
109  *
110  * Returns: errno
111  */
112 
113 int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
114 {
115 	struct buffer_head *bh, *dibh;
116 	struct gfs2_dinode *di;
117 	u64 block = 0;
118 	int isdir = gfs2_is_dir(ip);
119 	int error;
120 
121 	down_write(&ip->i_rw_mutex);
122 
123 	error = gfs2_meta_inode_buffer(ip, &dibh);
124 	if (error)
125 		goto out;
126 
127 	if (i_size_read(&ip->i_inode)) {
128 		/* Get a free block, fill it with the stuffed data,
129 		   and write it out to disk */
130 
131 		unsigned int n = 1;
132 		error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
133 		if (error)
134 			goto out_brelse;
135 		if (isdir) {
136 			gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1);
137 			error = gfs2_dir_get_new_buffer(ip, block, &bh);
138 			if (error)
139 				goto out_brelse;
140 			gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
141 					      dibh, sizeof(struct gfs2_dinode));
142 			brelse(bh);
143 		} else {
144 			error = gfs2_unstuffer_page(ip, dibh, block, page);
145 			if (error)
146 				goto out_brelse;
147 		}
148 	}
149 
150 	/*  Set up the pointer to the new block  */
151 
152 	gfs2_trans_add_meta(ip->i_gl, dibh);
153 	di = (struct gfs2_dinode *)dibh->b_data;
154 	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
155 
156 	if (i_size_read(&ip->i_inode)) {
157 		*(__be64 *)(di + 1) = cpu_to_be64(block);
158 		gfs2_add_inode_blocks(&ip->i_inode, 1);
159 		di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
160 	}
161 
162 	ip->i_height = 1;
163 	di->di_height = cpu_to_be16(1);
164 
165 out_brelse:
166 	brelse(dibh);
167 out:
168 	up_write(&ip->i_rw_mutex);
169 	return error;
170 }
171 
172 
173 /**
174  * find_metapath - Find path through the metadata tree
 * @sdp: The superblock
 * @block: The logical block to look up
 * @mp: The metapath to return the result in
 * @height: The pre-calculated height of the metadata tree
 *
 *   This routine fills in a struct metapath structure that defines a path
 *   through the metadata of an inode to get to logical block "block".
 *
 *   Example:
 *   Given:  a height 3 file, a byte offset of 101342453 (which lies in
 *   logical block 24741), and a filesystem with a blocksize of 4096.
 *   For simplicity the example treats every block as holding 512 pointers.
 *
 *   find_metapath() would fill in the metapath's pointer indices as:
 *   mp_list[0] = 0, mp_list[1] = 48,
 *   and mp_list[2] = 165.
190  *
191  *   That means that in order to get to the block containing the byte at
192  *   offset 101342453, we would load the indirect block pointed to by pointer
193  *   0 in the dinode.  We would then load the indirect block pointed to by
194  *   pointer 48 in that indirect block.  We would then load the data block
195  *   pointed to by pointer 165 in that indirect block.
196  *
197  *             ----------------------------------------
198  *             | Dinode |                             |
199  *             |        |                            4|
200  *             |        |0 1 2 3 4 5                 9|
201  *             |        |                            6|
202  *             ----------------------------------------
203  *                       |
204  *                       |
205  *                       V
206  *             ----------------------------------------
207  *             | Indirect Block                       |
208  *             |                                     5|
209  *             |            4 4 4 4 4 5 5            1|
210  *             |0           5 6 7 8 9 0 1            2|
211  *             ----------------------------------------
212  *                                |
213  *                                |
214  *                                V
215  *             ----------------------------------------
216  *             | Indirect Block                       |
217  *             |                         1 1 1 1 1   5|
218  *             |                         6 6 6 6 6   1|
219  *             |0                        3 4 5 6 7   2|
220  *             ----------------------------------------
221  *                                           |
222  *                                           |
223  *                                           V
224  *             ----------------------------------------
225  *             | Data block containing offset         |
226  *             |            101342453                 |
227  *             |                                      |
228  *             |                                      |
229  *             ----------------------------------------
230  *
231  */
232 
233 static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
234 			  struct metapath *mp, unsigned int height)
235 {
236 	unsigned int i;
237 
238 	for (i = height; i--;)
239 		mp->mp_list[i] = do_div(block, sdp->sd_inptrs);
240 
241 }
242 
243 static inline unsigned int metapath_branch_start(const struct metapath *mp)
244 {
245 	if (mp->mp_list[0] == 0)
246 		return 2;
247 	return 1;
248 }
249 
250 /**
251  * metaptr1 - Return the first possible metadata pointer in a metaath buffer
252  * @height: The metadata height (0 = dinode)
253  * @mp: The metapath
254  */
255 static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp)
256 {
257 	struct buffer_head *bh = mp->mp_bh[height];
258 	if (height == 0)
259 		return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)));
260 	return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header)));
261 }
262 
263 /**
264  * metapointer - Return pointer to start of metadata in a buffer
265  * @height: The metadata height (0 = dinode)
266  * @mp: The metapath
267  *
268  * Return a pointer to the block number of the next height of the metadata
269  * tree given a buffer containing the pointer to the current height of the
270  * metadata tree.
271  */
272 
273 static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
274 {
275 	__be64 *p = metaptr1(height, mp);
276 	return p + mp->mp_list[height];
277 }
278 
279 static void gfs2_metapath_ra(struct gfs2_glock *gl,
280 			     const struct buffer_head *bh, const __be64 *pos)
281 {
282 	struct buffer_head *rabh;
283 	const __be64 *endp = (const __be64 *)(bh->b_data + bh->b_size);
284 	const __be64 *t;
285 
286 	for (t = pos; t < endp; t++) {
287 		if (!*t)
288 			continue;
289 
290 		rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE);
291 		if (trylock_buffer(rabh)) {
292 			if (!buffer_uptodate(rabh)) {
293 				rabh->b_end_io = end_buffer_read_sync;
294 				submit_bh(REQ_OP_READ, REQ_RAHEAD | REQ_META,
295 						rabh);
296 				continue;
297 			}
298 			unlock_buffer(rabh);
299 		}
300 		brelse(rabh);
301 	}
302 }
303 
304 /**
305  * lookup_mp_height - helper function for lookup_metapath
306  * @ip: the inode
307  * @mp: the metapath
308  * @h: the height which needs looking up
309  */
310 static int lookup_mp_height(struct gfs2_inode *ip, struct metapath *mp, int h)
311 {
312 	__be64 *ptr = metapointer(h, mp);
313 	u64 dblock = be64_to_cpu(*ptr);
314 
315 	if (!dblock)
316 		return h + 1;
317 
318 	return gfs2_meta_indirect_buffer(ip, h + 1, dblock, &mp->mp_bh[h + 1]);
319 }
320 
321 /**
322  * lookup_metapath - Walk the metadata tree to a specific point
323  * @ip: The inode
324  * @mp: The metapath
325  *
326  * Assumes that the inode's buffer has already been looked up and
327  * hooked onto mp->mp_bh[0] and that the metapath has been initialised
328  * by find_metapath().
329  *
330  * If this function encounters part of the tree which has not been
331  * allocated, it returns the current height of the tree at the point
332  * at which it found the unallocated block. Blocks which are found are
333  * added to the mp->mp_bh[] list.
334  *
335  * Returns: error or height of metadata tree
336  */
337 
338 static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
339 {
340 	unsigned int end_of_metadata = ip->i_height - 1;
341 	unsigned int x;
342 	int ret;
343 
344 	for (x = 0; x < end_of_metadata; x++) {
345 		ret = lookup_mp_height(ip, mp, x);
346 		if (ret)
347 			return ret;
348 	}
349 
350 	return ip->i_height;
351 }
352 
353 /**
354  * fillup_metapath - fill up buffers for the metadata path to a specific height
355  * @ip: The inode
356  * @mp: The metapath
357  * @h: The height to which it should be mapped
358  *
359  * Similar to lookup_metapath, but does lookups for a range of heights
360  *
361  * Returns: error or height of metadata tree
362  */
363 
364 static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h)
365 {
366 	unsigned int start_h = h - 1;
367 	int ret;
368 
369 	if (h) {
370 		/* find the first buffer we need to look up. */
371 		while (start_h > 0 && mp->mp_bh[start_h] == NULL)
372 			start_h--;
373 		for (; start_h < h; start_h++) {
374 			ret = lookup_mp_height(ip, mp, start_h);
375 			if (ret)
376 				return ret;
377 		}
378 	}
379 	return ip->i_height;
380 }
381 
382 static inline void release_metapath(struct metapath *mp)
383 {
384 	int i;
385 
386 	for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) {
387 		if (mp->mp_bh[i] == NULL)
388 			break;
389 		brelse(mp->mp_bh[i]);
390 	}
391 }
392 
393 /**
394  * gfs2_extent_length - Returns length of an extent of blocks
395  * @start: Start of the buffer
396  * @len: Length of the buffer in bytes
397  * @ptr: Current position in the buffer
398  * @limit: Max extent length to return (0 = unlimited)
399  * @eob: Set to 1 if we hit "end of block"
400  *
401  * If the first block is zero (unallocated) it will return the number of
402  * unallocated blocks in the extent, otherwise it will return the number
403  * of contiguous blocks in the extent.
404  *
405  * Returns: The length of the extent (minimum of one block)
406  */
407 
408 static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, size_t limit, int *eob)
409 {
410 	const __be64 *end = (start + len);
411 	const __be64 *first = ptr;
412 	u64 d = be64_to_cpu(*ptr);
413 
414 	*eob = 0;
415 	do {
416 		ptr++;
417 		if (ptr >= end)
418 			break;
419 		if (limit && --limit == 0)
420 			break;
421 		if (d)
422 			d++;
423 	} while(be64_to_cpu(*ptr) == d);
424 	if (ptr >= end)
425 		*eob = 1;
426 	return (ptr - first);
427 }
428 
429 static inline void bmap_lock(struct gfs2_inode *ip, int create)
430 {
431 	if (create)
432 		down_write(&ip->i_rw_mutex);
433 	else
434 		down_read(&ip->i_rw_mutex);
435 }
436 
437 static inline void bmap_unlock(struct gfs2_inode *ip, int create)
438 {
439 	if (create)
440 		up_write(&ip->i_rw_mutex);
441 	else
442 		up_read(&ip->i_rw_mutex);
443 }
444 
445 static inline __be64 *gfs2_indirect_init(struct metapath *mp,
446 					 struct gfs2_glock *gl, unsigned int i,
447 					 unsigned offset, u64 bn)
448 {
449 	__be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data +
450 		       ((i > 1) ? sizeof(struct gfs2_meta_header) :
451 				 sizeof(struct gfs2_dinode)));
452 	BUG_ON(i < 1);
453 	BUG_ON(mp->mp_bh[i] != NULL);
454 	mp->mp_bh[i] = gfs2_meta_new(gl, bn);
455 	gfs2_trans_add_meta(gl, mp->mp_bh[i]);
456 	gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
457 	gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
458 	ptr += offset;
459 	*ptr = cpu_to_be64(bn);
460 	return ptr;
461 }
462 
/* States of the allocation state machine in gfs2_bmap_alloc() */
enum alloc_state {
	/* Tree complete, adding data blocks */
	ALLOC_DATA = 0,
	/* Branching from the existing tree: adding indirect blocks below it */
	ALLOC_GROW_DEPTH = 1,
	/* Growing the height of the tree */
	ALLOC_GROW_HEIGHT = 2,
	/* ALLOC_UNSTUFF = 3,   TBD and rather complicated */
};
469 
470 static inline unsigned int hptrs(struct gfs2_sbd *sdp, const unsigned int hgt)
471 {
472 	if (hgt)
473 		return sdp->sd_inptrs;
474 	return sdp->sd_diptrs;
475 }
476 
/**
 * gfs2_bmap_alloc - Build a metadata tree of the requested height
 * @inode: The GFS2 inode
 * @lblock: The logical starting block of the extent (not referenced in the
 *          body of this function; the position comes from @mp)
 * @bh_map: This is used to return the mapping details
 * @mp: The metapath
 * @sheight: The starting height (i.e. whats already mapped)
 * @height: The height to build to
 * @maxlen: The max number of data blocks to alloc
 *
 * In this routine we may have to alloc:
 *   i) Indirect blocks to grow the metadata tree height
 *  ii) Indirect blocks to fill in lower part of the metadata tree
 * iii) Data blocks
 *
 * The function is in two parts. The first part works out the total
 * number of blocks which we need. The second part does the actual
 * allocation asking for an extent at a time (if enough contiguous free
 * blocks are available, there will only be one request per bmap call)
 * and uses the state machine to initialise the blocks in order.
 *
 * On success @bh_map is mapped to the first data block allocated, its
 * b_size is set to the number of data blocks allocated in the final
 * request, and it is marked as new.
 *
 * Returns: 0 on success, or the error from gfs2_alloc_blocks()
 */

static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
			   struct buffer_head *bh_map, struct metapath *mp,
			   const unsigned int sheight,
			   const unsigned int height,
			   const size_t maxlen)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	struct super_block *sb = sdp->sd_vfs;
	struct buffer_head *dibh = mp->mp_bh[0];
	u64 bn, dblock = 0;
	unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
	unsigned dblks = 0;
	unsigned ptrs_per_blk;
	const unsigned end_of_metadata = height - 1;
	int ret;
	int eob = 0;
	enum alloc_state state;
	__be64 *ptr;
	__be64 zero_bn = 0;

	BUG_ON(sheight < 1);
	BUG_ON(dibh == NULL);

	gfs2_trans_add_meta(ip->i_gl, dibh);

	/* First part: work out how many data (dblks) and indirect (iblks)
	   blocks are needed, and pick the initial state */
	if (height == sheight) {
		struct buffer_head *bh;
		/* Bottom indirect block exists, find unalloced extent size */
		ptr = metapointer(end_of_metadata, mp);
		bh = mp->mp_bh[end_of_metadata];
		dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen,
					   &eob);
		BUG_ON(dblks < 1);
		state = ALLOC_DATA;
	} else {
		/* Need to allocate indirect blocks */
		ptrs_per_blk = height > 1 ? sdp->sd_inptrs : sdp->sd_diptrs;
		/* Data blocks are limited to what fits in the rest of the
		   bottom-level block from the metapath position onwards */
		dblks = min(maxlen, (size_t)(ptrs_per_blk -
					     mp->mp_list[end_of_metadata]));
		if (height == ip->i_height) {
			/* Writing into existing tree, extend tree down */
			iblks = height - sheight;
			state = ALLOC_GROW_DEPTH;
		} else {
			/* Building up tree height */
			state = ALLOC_GROW_HEIGHT;
			iblks = height - ip->i_height;
			branch_start = metapath_branch_start(mp);
			iblks += (height - branch_start);
		}
	}

	/* start of the second part of the function (state machine) */

	blks = dblks + iblks;
	i = sheight;
	do {
		int error;
		/* Ask for everything still outstanding; gfs2_alloc_blocks()
		   reports in n how many blocks it actually provided */
		n = blks - alloced;
		error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
		if (error)
			return error;
		alloced += n;
		if (state != ALLOC_DATA || gfs2_is_jdata(ip))
			gfs2_trans_add_unrevoke(sdp, bn, n);
		/* Each case consumes blocks from this extent and, if any are
		   left over (n != 0), deliberately falls through to the
		   next state */
		switch (state) {
		/* Growing height of tree */
		case ALLOC_GROW_HEIGHT:
			if (i == 1) {
				/* Save the dinode's first pointer: the
				   gfs2_indirect_init() call for height 1
				   below overwrites that slot */
				ptr = (__be64 *)(dibh->b_data +
						 sizeof(struct gfs2_dinode));
				zero_bn = *ptr;
			}
			for (; i - 1 < height - ip->i_height && n > 0; i++, n--)
				gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++);
			if (i - 1 == height - ip->i_height) {
				/* All the new upper levels exist: move the
				   dinode's old pointers into the lowest new
				   block, leaving the dinode with only its
				   first pointer */
				i--;
				gfs2_buffer_copy_tail(mp->mp_bh[i],
						sizeof(struct gfs2_meta_header),
						dibh, sizeof(struct gfs2_dinode));
				gfs2_buffer_clear_tail(dibh,
						sizeof(struct gfs2_dinode) +
						sizeof(__be64));
				/* Put back the first pointer saved above */
				ptr = (__be64 *)(mp->mp_bh[i]->b_data +
					sizeof(struct gfs2_meta_header));
				*ptr = zero_bn;
				state = ALLOC_GROW_DEPTH;
				/* Release the buffers from branch_start down
				   so that ALLOC_GROW_DEPTH can create them */
				for(i = branch_start; i < height; i++) {
					if (mp->mp_bh[i] == NULL)
						break;
					brelse(mp->mp_bh[i]);
					mp->mp_bh[i] = NULL;
				}
				i = branch_start;
			}
			if (n == 0)
				break;
		/* fall through */
		/* Branching from existing tree */
		case ALLOC_GROW_DEPTH:
			if (i > 1 && i < height)
				gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]);
			for (; i < height && n > 0; i++, n--)
				gfs2_indirect_init(mp, ip->i_gl, i,
						   mp->mp_list[i-1], bn++);
			if (i == height)
				state = ALLOC_DATA;
			if (n == 0)
				break;
		/* fall through */
		/* Tree complete, adding data blocks */
		case ALLOC_DATA:
			BUG_ON(n > dblks);
			BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
			gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]);
			/* From here on dblks is the number of data blocks
			   actually obtained, which may be fewer than wanted */
			dblks = n;
			ptr = metapointer(end_of_metadata, mp);
			dblock = bn;
			while (n-- > 0)
				*ptr++ = cpu_to_be64(bn++);
			if (buffer_zeronew(bh_map)) {
				ret = sb_issue_zeroout(sb, dblock, dblks,
						       GFP_NOFS);
				if (ret) {
					/* Zeroing failure is logged but is
					   not returned to the caller */
					fs_err(sdp,
					       "Failed to zero data buffers\n");
					clear_buffer_zeronew(bh_map);
				}
			}
			break;
		}
	/* Keep allocating until the data state has been reached and a data
	   block has been assigned */
	} while ((state != ALLOC_DATA) || !dblock);

	ip->i_height = height;
	gfs2_add_inode_blocks(&ip->i_inode, alloced);
	gfs2_dinode_out(ip, mp->mp_bh[0]->b_data);
	map_bh(bh_map, inode->i_sb, dblock);
	bh_map->b_size = dblks << inode->i_blkbits;
	set_buffer_new(bh_map);
	return 0;
}
641 
642 /**
643  * gfs2_block_map - Map a block from an inode to a disk block
644  * @inode: The inode
645  * @lblock: The logical block number
646  * @bh_map: The bh to be mapped
647  * @create: True if its ok to alloc blocks to satify the request
648  *
649  * Sets buffer_mapped() if successful, sets buffer_boundary() if a
650  * read of metadata will be required before the next block can be
651  * mapped. Sets buffer_new() if new blocks were allocated.
652  *
653  * Returns: errno
654  */
655 
656 int gfs2_block_map(struct inode *inode, sector_t lblock,
657 		   struct buffer_head *bh_map, int create)
658 {
659 	struct gfs2_inode *ip = GFS2_I(inode);
660 	struct gfs2_sbd *sdp = GFS2_SB(inode);
661 	unsigned int bsize = sdp->sd_sb.sb_bsize;
662 	const size_t maxlen = bh_map->b_size >> inode->i_blkbits;
663 	const u64 *arr = sdp->sd_heightsize;
664 	__be64 *ptr;
665 	u64 size;
666 	struct metapath mp;
667 	int ret;
668 	int eob;
669 	unsigned int len;
670 	struct buffer_head *bh;
671 	u8 height;
672 
673 	BUG_ON(maxlen == 0);
674 
675 	memset(&mp, 0, sizeof(mp));
676 	bmap_lock(ip, create);
677 	clear_buffer_mapped(bh_map);
678 	clear_buffer_new(bh_map);
679 	clear_buffer_boundary(bh_map);
680 	trace_gfs2_bmap(ip, bh_map, lblock, create, 1);
681 	if (gfs2_is_dir(ip)) {
682 		bsize = sdp->sd_jbsize;
683 		arr = sdp->sd_jheightsize;
684 	}
685 
686 	ret = gfs2_meta_inode_buffer(ip, &mp.mp_bh[0]);
687 	if (ret)
688 		goto out;
689 
690 	height = ip->i_height;
691 	size = (lblock + 1) * bsize;
692 	while (size > arr[height])
693 		height++;
694 	find_metapath(sdp, lblock, &mp, height);
695 	ret = 1;
696 	if (height > ip->i_height || gfs2_is_stuffed(ip))
697 		goto do_alloc;
698 	ret = lookup_metapath(ip, &mp);
699 	if (ret < 0)
700 		goto out;
701 	if (ret != ip->i_height)
702 		goto do_alloc;
703 	ptr = metapointer(ip->i_height - 1, &mp);
704 	if (*ptr == 0)
705 		goto do_alloc;
706 	map_bh(bh_map, inode->i_sb, be64_to_cpu(*ptr));
707 	bh = mp.mp_bh[ip->i_height - 1];
708 	len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen, &eob);
709 	bh_map->b_size = (len << inode->i_blkbits);
710 	if (eob)
711 		set_buffer_boundary(bh_map);
712 	ret = 0;
713 out:
714 	release_metapath(&mp);
715 	trace_gfs2_bmap(ip, bh_map, lblock, create, ret);
716 	bmap_unlock(ip, create);
717 	return ret;
718 
719 do_alloc:
720 	/* All allocations are done here, firstly check create flag */
721 	if (!create) {
722 		BUG_ON(gfs2_is_stuffed(ip));
723 		ret = 0;
724 		goto out;
725 	}
726 
727 	/* At this point ret is the tree depth of already allocated blocks */
728 	ret = gfs2_bmap_alloc(inode, lblock, bh_map, &mp, ret, height, maxlen);
729 	goto out;
730 }
731 
732 /*
733  * Deprecated: do not use in new code
734  */
735 int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen)
736 {
737 	struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 };
738 	int ret;
739 	int create = *new;
740 
741 	BUG_ON(!extlen);
742 	BUG_ON(!dblock);
743 	BUG_ON(!new);
744 
745 	bh.b_size = BIT(inode->i_blkbits + (create ? 0 : 5));
746 	ret = gfs2_block_map(inode, lblock, &bh, create);
747 	*extlen = bh.b_size >> inode->i_blkbits;
748 	*dblock = bh.b_blocknr;
749 	if (buffer_new(&bh))
750 		*new = 1;
751 	else
752 		*new = 0;
753 	return ret;
754 }
755 
756 /**
757  * gfs2_block_truncate_page - Deal with zeroing out data for truncate
758  *
759  * This is partly borrowed from ext3.
760  */
761 static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
762 {
763 	struct inode *inode = mapping->host;
764 	struct gfs2_inode *ip = GFS2_I(inode);
765 	unsigned long index = from >> PAGE_SHIFT;
766 	unsigned offset = from & (PAGE_SIZE-1);
767 	unsigned blocksize, iblock, length, pos;
768 	struct buffer_head *bh;
769 	struct page *page;
770 	int err;
771 
772 	page = find_or_create_page(mapping, index, GFP_NOFS);
773 	if (!page)
774 		return 0;
775 
776 	blocksize = inode->i_sb->s_blocksize;
777 	length = blocksize - (offset & (blocksize - 1));
778 	iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
779 
780 	if (!page_has_buffers(page))
781 		create_empty_buffers(page, blocksize, 0);
782 
783 	/* Find the buffer that contains "offset" */
784 	bh = page_buffers(page);
785 	pos = blocksize;
786 	while (offset >= pos) {
787 		bh = bh->b_this_page;
788 		iblock++;
789 		pos += blocksize;
790 	}
791 
792 	err = 0;
793 
794 	if (!buffer_mapped(bh)) {
795 		gfs2_block_map(inode, iblock, bh, 0);
796 		/* unmapped? It's a hole - nothing to do */
797 		if (!buffer_mapped(bh))
798 			goto unlock;
799 	}
800 
801 	/* Ok, it's mapped. Make sure it's up-to-date */
802 	if (PageUptodate(page))
803 		set_buffer_uptodate(bh);
804 
805 	if (!buffer_uptodate(bh)) {
806 		err = -EIO;
807 		ll_rw_block(REQ_OP_READ, 0, 1, &bh);
808 		wait_on_buffer(bh);
809 		/* Uhhuh. Read error. Complain and punt. */
810 		if (!buffer_uptodate(bh))
811 			goto unlock;
812 		err = 0;
813 	}
814 
815 	if (!gfs2_is_writeback(ip))
816 		gfs2_trans_add_data(ip->i_gl, bh);
817 
818 	zero_user(page, offset, length);
819 	mark_buffer_dirty(bh);
820 unlock:
821 	unlock_page(page);
822 	put_page(page);
823 	return err;
824 }
825 
826 #define GFS2_JTRUNC_REVOKES 8192
827 
828 /**
829  * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files
830  * @inode: The inode being truncated
831  * @oldsize: The original (larger) size
832  * @newsize: The new smaller size
833  *
834  * With jdata files, we have to journal a revoke for each block which is
835  * truncated. As a result, we need to split this into separate transactions
836  * if the number of pages being truncated gets too large.
837  */
838 
839 static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize)
840 {
841 	struct gfs2_sbd *sdp = GFS2_SB(inode);
842 	u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
843 	u64 chunk;
844 	int error;
845 
846 	while (oldsize != newsize) {
847 		chunk = oldsize - newsize;
848 		if (chunk > max_chunk)
849 			chunk = max_chunk;
850 		truncate_pagecache(inode, oldsize - chunk);
851 		oldsize -= chunk;
852 		gfs2_trans_end(sdp);
853 		error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
854 		if (error)
855 			return error;
856 	}
857 
858 	return 0;
859 }
860 
861 static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
862 {
863 	struct gfs2_inode *ip = GFS2_I(inode);
864 	struct gfs2_sbd *sdp = GFS2_SB(inode);
865 	struct address_space *mapping = inode->i_mapping;
866 	struct buffer_head *dibh;
867 	int journaled = gfs2_is_jdata(ip);
868 	int error;
869 
870 	if (journaled)
871 		error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES);
872 	else
873 		error = gfs2_trans_begin(sdp, RES_DINODE, 0);
874 	if (error)
875 		return error;
876 
877 	error = gfs2_meta_inode_buffer(ip, &dibh);
878 	if (error)
879 		goto out;
880 
881 	gfs2_trans_add_meta(ip->i_gl, dibh);
882 
883 	if (gfs2_is_stuffed(ip)) {
884 		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
885 	} else {
886 		if (newsize & (u64)(sdp->sd_sb.sb_bsize - 1)) {
887 			error = gfs2_block_truncate_page(mapping, newsize);
888 			if (error)
889 				goto out_brelse;
890 		}
891 		ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
892 	}
893 
894 	i_size_write(inode, newsize);
895 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
896 	gfs2_dinode_out(ip, dibh->b_data);
897 
898 	if (journaled)
899 		error = gfs2_journaled_truncate(inode, oldsize, newsize);
900 	else
901 		truncate_pagecache(inode, newsize);
902 
903 	if (error) {
904 		brelse(dibh);
905 		return error;
906 	}
907 
908 out_brelse:
909 	brelse(dibh);
910 out:
911 	gfs2_trans_end(sdp);
912 	return error;
913 }
914 
915 /**
916  * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein
 * @ip: inode
 * @rd_gh: holder of resource group glock
 * @mp: current metapath fully populated with buffers
 * @btotal: place to keep count of total blocks freed
 * @hgt: height we're processing
 * @preserve1: true if the pointer at the metapath position for this height
 *             must be preserved, so the sweep starts at the pointer after it
923  *
924  * We sweep a metadata buffer (provided by the metapath) for blocks we need to
925  * free, and free them all. However, we do it one rgrp at a time. If this
926  * block has references to multiple rgrps, we break it into individual
927  * transactions. This allows other processes to use the rgrps while we're
928  * focused on a single one, for better concurrency / performance.
929  * At every transaction boundary, we rewrite the inode into the journal.
930  * That way the bitmaps are kept consistent with the inode and we can recover
931  * if we're interrupted by power-outages.
932  *
933  * Returns: 0, or return code if an error occurred.
934  *          *btotal has the total number of blocks freed
935  */
936 static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh,
937 			      const struct metapath *mp, u32 *btotal, int hgt,
938 			      bool preserve1)
939 {
940 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
941 	struct gfs2_rgrpd *rgd;
942 	struct gfs2_trans *tr;
943 	struct buffer_head *bh = mp->mp_bh[hgt];
944 	__be64 *top, *bottom, *p;
945 	int blks_outside_rgrp;
946 	u64 bn, bstart, isize_blks;
947 	s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */
948 	int meta = ((hgt != ip->i_height - 1) ? 1 : 0);
949 	int ret = 0;
950 	bool buf_in_tr = false; /* buffer was added to transaction */
951 
952 	if (gfs2_metatype_check(sdp, bh,
953 				(hgt ? GFS2_METATYPE_IN : GFS2_METATYPE_DI)))
954 		return -EIO;
955 
956 more_rgrps:
957 	blks_outside_rgrp = 0;
958 	bstart = 0;
959 	blen = 0;
960 	top = metapointer(hgt, mp); /* first ptr from metapath */
961 	/* If we're keeping some data at the truncation point, we've got to
962 	   preserve the metadata tree by adding 1 to the starting metapath. */
963 	if (preserve1)
964 		top++;
965 
966 	bottom = (__be64 *)(bh->b_data + bh->b_size);
967 
968 	for (p = top; p < bottom; p++) {
969 		if (!*p)
970 			continue;
971 		bn = be64_to_cpu(*p);
972 		if (gfs2_holder_initialized(rd_gh)) {
973 			rgd = gfs2_glock2rgrp(rd_gh->gh_gl);
974 			gfs2_assert_withdraw(sdp,
975 				     gfs2_glock_is_locked_by_me(rd_gh->gh_gl));
976 		} else {
977 			rgd = gfs2_blk2rgrpd(sdp, bn, false);
978 			ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
979 						 0, rd_gh);
980 			if (ret)
981 				goto out;
982 
983 			/* Must be done with the rgrp glock held: */
984 			if (gfs2_rs_active(&ip->i_res) &&
985 			    rgd == ip->i_res.rs_rbm.rgd)
986 				gfs2_rs_deltree(&ip->i_res);
987 		}
988 
989 		if (!rgrp_contains_block(rgd, bn)) {
990 			blks_outside_rgrp++;
991 			continue;
992 		}
993 
994 		/* The size of our transactions will be unknown until we
995 		   actually process all the metadata blocks that relate to
996 		   the rgrp. So we estimate. We know it can't be more than
997 		   the dinode's i_blocks and we don't want to exceed the
998 		   journal flush threshold, sd_log_thresh2. */
999 		if (current->journal_info == NULL) {
1000 			unsigned int jblocks_rqsted, revokes;
1001 
1002 			jblocks_rqsted = rgd->rd_length + RES_DINODE +
1003 				RES_INDIRECT;
1004 			isize_blks = gfs2_get_inode_blocks(&ip->i_inode);
1005 			if (isize_blks > atomic_read(&sdp->sd_log_thresh2))
1006 				jblocks_rqsted +=
1007 					atomic_read(&sdp->sd_log_thresh2);
1008 			else
1009 				jblocks_rqsted += isize_blks;
1010 			revokes = jblocks_rqsted;
1011 			if (meta)
1012 				revokes += hptrs(sdp, hgt);
1013 			else if (ip->i_depth)
1014 				revokes += sdp->sd_inptrs;
1015 			ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
1016 			if (ret)
1017 				goto out_unlock;
1018 			down_write(&ip->i_rw_mutex);
1019 		}
1020 		/* check if we will exceed the transaction blocks requested */
1021 		tr = current->journal_info;
1022 		if (tr->tr_num_buf_new + RES_STATFS +
1023 		    RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
1024 			/* We set blks_outside_rgrp to ensure the loop will
1025 			   be repeated for the same rgrp, but with a new
1026 			   transaction. */
1027 			blks_outside_rgrp++;
1028 			/* This next part is tricky. If the buffer was added
1029 			   to the transaction, we've already set some block
1030 			   pointers to 0, so we better follow through and free
1031 			   them, or we will introduce corruption (so break).
1032 			   This may be impossible, or at least rare, but I
1033 			   decided to cover the case regardless.
1034 
1035 			   If the buffer was not added to the transaction
1036 			   (this call), doing so would exceed our transaction
1037 			   size, so we need to end the transaction and start a
1038 			   new one (so goto). */
1039 
1040 			if (buf_in_tr)
1041 				break;
1042 			goto out_unlock;
1043 		}
1044 
1045 		gfs2_trans_add_meta(ip->i_gl, bh);
1046 		buf_in_tr = true;
1047 		*p = 0;
1048 		if (bstart + blen == bn) {
1049 			blen++;
1050 			continue;
1051 		}
1052 		if (bstart) {
1053 			__gfs2_free_blocks(ip, bstart, (u32)blen, meta);
1054 			(*btotal) += blen;
1055 			gfs2_add_inode_blocks(&ip->i_inode, -blen);
1056 		}
1057 		bstart = bn;
1058 		blen = 1;
1059 	}
1060 	if (bstart) {
1061 		__gfs2_free_blocks(ip, bstart, (u32)blen, meta);
1062 		(*btotal) += blen;
1063 		gfs2_add_inode_blocks(&ip->i_inode, -blen);
1064 	}
1065 out_unlock:
1066 	if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks
1067 					    outside the rgrp we just processed,
1068 					    do it all over again. */
1069 		if (current->journal_info) {
1070 			struct buffer_head *dibh = mp->mp_bh[0];
1071 
1072 			/* Every transaction boundary, we rewrite the dinode
1073 			   to keep its di_blocks current in case of failure. */
1074 			ip->i_inode.i_mtime = ip->i_inode.i_ctime =
1075 				current_time(&ip->i_inode);
1076 			gfs2_trans_add_meta(ip->i_gl, dibh);
1077 			gfs2_dinode_out(ip, dibh->b_data);
1078 			up_write(&ip->i_rw_mutex);
1079 			gfs2_trans_end(sdp);
1080 		}
1081 		gfs2_glock_dq_uninit(rd_gh);
1082 		cond_resched();
1083 		goto more_rgrps;
1084 	}
1085 out:
1086 	return ret;
1087 }
1088 
1089 /**
1090  * find_nonnull_ptr - find a non-null pointer given a metapath and height
1091  * assumes the metapath is valid (with buffers) out to height h
1092  * @mp: starting metapath
1093  * @h: desired height to search
1094  *
1095  * Returns: true if a non-null pointer was found in the metapath buffer
1096  *          false if all remaining pointers are NULL in the buffer
1097  */
1098 static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp,
1099 			     unsigned int h)
1100 {
1101 	__be64 *ptr;
1102 	unsigned int ptrs = hptrs(sdp, h) - 1;
1103 
1104 	while (true) {
1105 		ptr = metapointer(h, mp);
1106 		if (*ptr) /* if we have a non-null pointer */
1107 			return true;
1108 
1109 		if (mp->mp_list[h] < ptrs)
1110 			mp->mp_list[h]++;
1111 		else
1112 			return false; /* no more pointers in this buffer */
1113 	}
1114 }
1115 
/* States of the state machine driven by trunc_dealloc(). */
enum dealloc_states {
	/* Strip a metapath with all buffers read in */
	DEALLOC_MP_FULL,
	/* Lower the metapath strip height */
	DEALLOC_MP_LOWER,
	/* Fill in the metapath to the given height */
	DEALLOC_FILL_MP,
	/* Process complete */
	DEALLOC_DONE,
};
1122 
1123 /**
1124  * trunc_dealloc - truncate a file down to a desired size
1125  * @ip: inode to truncate
1126  * @newsize: The desired size of the file
1127  *
1128  * This function truncates a file to newsize. It works from the
1129  * bottom up, and from the right to the left. In other words, it strips off
1130  * the highest layer (data) before stripping any of the metadata. Doing it
1131  * this way is best in case the operation is interrupted by power failure, etc.
1132  * The dinode is rewritten in every transaction to guarantee integrity.
1133  */
1134 static int trunc_dealloc(struct gfs2_inode *ip, u64 newsize)
1135 {
1136 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1137 	struct metapath mp;
1138 	struct buffer_head *dibh, *bh;
1139 	struct gfs2_holder rd_gh;
1140 	u64 lblock;
1141 	__u16 nbof[GFS2_MAX_META_HEIGHT]; /* new beginning of truncation */
1142 	unsigned int strip_h = ip->i_height - 1;
1143 	u32 btotal = 0;
1144 	int ret, state;
1145 	int mp_h; /* metapath buffers are read in to this height */
1146 	sector_t last_ra = 0;
1147 	u64 prev_bnr = 0;
1148 	bool preserve1; /* need to preserve the first meta pointer? */
1149 
1150 	if (!newsize)
1151 		lblock = 0;
1152 	else
1153 		lblock = (newsize - 1) >> sdp->sd_sb.sb_bsize_shift;
1154 
1155 	memset(&mp, 0, sizeof(mp));
1156 	find_metapath(sdp, lblock, &mp, ip->i_height);
1157 
1158 	memcpy(&nbof, &mp.mp_list, sizeof(nbof));
1159 
1160 	ret = gfs2_meta_inode_buffer(ip, &dibh);
1161 	if (ret)
1162 		return ret;
1163 
1164 	mp.mp_bh[0] = dibh;
1165 	ret = lookup_metapath(ip, &mp);
1166 	if (ret == ip->i_height)
1167 		state = DEALLOC_MP_FULL; /* We have a complete metapath */
1168 	else
1169 		state = DEALLOC_FILL_MP; /* deal with partial metapath */
1170 
1171 	ret = gfs2_rindex_update(sdp);
1172 	if (ret)
1173 		goto out_metapath;
1174 
1175 	ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
1176 	if (ret)
1177 		goto out_metapath;
1178 	gfs2_holder_mark_uninitialized(&rd_gh);
1179 
1180 	mp_h = strip_h;
1181 
1182 	while (state != DEALLOC_DONE) {
1183 		switch (state) {
1184 		/* Truncate a full metapath at the given strip height.
1185 		 * Note that strip_h == mp_h in order to be in this state. */
1186 		case DEALLOC_MP_FULL:
1187 			if (mp_h > 0) { /* issue read-ahead on metadata */
1188 				__be64 *top;
1189 
1190 				bh = mp.mp_bh[mp_h - 1];
1191 				if (bh->b_blocknr != last_ra) {
1192 					last_ra = bh->b_blocknr;
1193 					top = metaptr1(mp_h - 1, &mp);
1194 					gfs2_metapath_ra(ip->i_gl, bh, top);
1195 				}
1196 			}
1197 			/* If we're truncating to a non-zero size and the mp is
1198 			   at the beginning of file for the strip height, we
1199 			   need to preserve the first metadata pointer. */
1200 			preserve1 = (newsize &&
1201 				     (mp.mp_list[mp_h] == nbof[mp_h]));
1202 			bh = mp.mp_bh[mp_h];
1203 			gfs2_assert_withdraw(sdp, bh);
1204 			if (gfs2_assert_withdraw(sdp,
1205 						 prev_bnr != bh->b_blocknr)) {
1206 				printk(KERN_EMERG "GFS2: fsid=%s:inode %llu, "
1207 				       "block:%llu, i_h:%u, s_h:%u, mp_h:%u\n",
1208 				       sdp->sd_fsname,
1209 				       (unsigned long long)ip->i_no_addr,
1210 				       prev_bnr, ip->i_height, strip_h, mp_h);
1211 			}
1212 			prev_bnr = bh->b_blocknr;
1213 			ret = sweep_bh_for_rgrps(ip, &rd_gh, &mp, &btotal,
1214 						 mp_h, preserve1);
1215 			/* If we hit an error or just swept dinode buffer,
1216 			   just exit. */
1217 			if (ret || !mp_h) {
1218 				state = DEALLOC_DONE;
1219 				break;
1220 			}
1221 			state = DEALLOC_MP_LOWER;
1222 			break;
1223 
1224 		/* lower the metapath strip height */
1225 		case DEALLOC_MP_LOWER:
1226 			/* We're done with the current buffer, so release it,
1227 			   unless it's the dinode buffer. Then back up to the
1228 			   previous pointer. */
1229 			if (mp_h) {
1230 				brelse(mp.mp_bh[mp_h]);
1231 				mp.mp_bh[mp_h] = NULL;
1232 			}
1233 			/* If we can't get any lower in height, we've stripped
1234 			   off all we can. Next step is to back up and start
1235 			   stripping the previous level of metadata. */
1236 			if (mp_h == 0) {
1237 				strip_h--;
1238 				memcpy(&mp.mp_list, &nbof, sizeof(nbof));
1239 				mp_h = strip_h;
1240 				state = DEALLOC_FILL_MP;
1241 				break;
1242 			}
1243 			mp.mp_list[mp_h] = 0;
1244 			mp_h--; /* search one metadata height down */
1245 			if (mp.mp_list[mp_h] >= hptrs(sdp, mp_h) - 1)
1246 				break; /* loop around in the same state */
1247 			mp.mp_list[mp_h]++;
1248 			/* Here we've found a part of the metapath that is not
1249 			 * allocated. We need to search at that height for the
1250 			 * next non-null pointer. */
1251 			if (find_nonnull_ptr(sdp, &mp, mp_h)) {
1252 				state = DEALLOC_FILL_MP;
1253 				mp_h++;
1254 			}
1255 			/* No more non-null pointers at this height. Back up
1256 			   to the previous height and try again. */
1257 			break; /* loop around in the same state */
1258 
1259 		/* Fill the metapath with buffers to the given height. */
1260 		case DEALLOC_FILL_MP:
1261 			/* Fill the buffers out to the current height. */
1262 			ret = fillup_metapath(ip, &mp, mp_h);
1263 			if (ret < 0)
1264 				goto out;
1265 
1266 			/* If buffers found for the entire strip height */
1267 			if ((ret == ip->i_height) && (mp_h == strip_h)) {
1268 				state = DEALLOC_MP_FULL;
1269 				break;
1270 			}
1271 			if (ret < ip->i_height) /* We have a partial height */
1272 				mp_h = ret - 1;
1273 
1274 			/* If we find a non-null block pointer, crawl a bit
1275 			   higher up in the metapath and try again, otherwise
1276 			   we need to look lower for a new starting point. */
1277 			if (find_nonnull_ptr(sdp, &mp, mp_h))
1278 				mp_h++;
1279 			else
1280 				state = DEALLOC_MP_LOWER;
1281 			break;
1282 		}
1283 	}
1284 
1285 	if (btotal) {
1286 		if (current->journal_info == NULL) {
1287 			ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS +
1288 					       RES_QUOTA, 0);
1289 			if (ret)
1290 				goto out;
1291 			down_write(&ip->i_rw_mutex);
1292 		}
1293 		gfs2_statfs_change(sdp, 0, +btotal, 0);
1294 		gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
1295 				  ip->i_inode.i_gid);
1296 		ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1297 		gfs2_trans_add_meta(ip->i_gl, dibh);
1298 		gfs2_dinode_out(ip, dibh->b_data);
1299 		up_write(&ip->i_rw_mutex);
1300 		gfs2_trans_end(sdp);
1301 	}
1302 
1303 out:
1304 	if (gfs2_holder_initialized(&rd_gh))
1305 		gfs2_glock_dq_uninit(&rd_gh);
1306 	if (current->journal_info) {
1307 		up_write(&ip->i_rw_mutex);
1308 		gfs2_trans_end(sdp);
1309 		cond_resched();
1310 	}
1311 	gfs2_quota_unhold(ip);
1312 out_metapath:
1313 	release_metapath(&mp);
1314 	return ret;
1315 }
1316 
1317 static int trunc_end(struct gfs2_inode *ip)
1318 {
1319 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1320 	struct buffer_head *dibh;
1321 	int error;
1322 
1323 	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1324 	if (error)
1325 		return error;
1326 
1327 	down_write(&ip->i_rw_mutex);
1328 
1329 	error = gfs2_meta_inode_buffer(ip, &dibh);
1330 	if (error)
1331 		goto out;
1332 
1333 	if (!i_size_read(&ip->i_inode)) {
1334 		ip->i_height = 0;
1335 		ip->i_goal = ip->i_no_addr;
1336 		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
1337 		gfs2_ordered_del_inode(ip);
1338 	}
1339 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1340 	ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
1341 
1342 	gfs2_trans_add_meta(ip->i_gl, dibh);
1343 	gfs2_dinode_out(ip, dibh->b_data);
1344 	brelse(dibh);
1345 
1346 out:
1347 	up_write(&ip->i_rw_mutex);
1348 	gfs2_trans_end(sdp);
1349 	return error;
1350 }
1351 
1352 /**
1353  * do_shrink - make a file smaller
1354  * @inode: the inode
1355  * @oldsize: the current inode size
1356  * @newsize: the size to make the file
1357  *
1358  * Called with an exclusive lock on @inode. The @size must
1359  * be equal to or smaller than the current inode size.
1360  *
1361  * Returns: errno
1362  */
1363 
1364 static int do_shrink(struct inode *inode, u64 oldsize, u64 newsize)
1365 {
1366 	struct gfs2_inode *ip = GFS2_I(inode);
1367 	int error;
1368 
1369 	error = trunc_start(inode, oldsize, newsize);
1370 	if (error < 0)
1371 		return error;
1372 	if (gfs2_is_stuffed(ip))
1373 		return 0;
1374 
1375 	error = trunc_dealloc(ip, newsize);
1376 	if (error == 0)
1377 		error = trunc_end(ip);
1378 
1379 	return error;
1380 }
1381 
1382 void gfs2_trim_blocks(struct inode *inode)
1383 {
1384 	u64 size = inode->i_size;
1385 	int ret;
1386 
1387 	ret = do_shrink(inode, size, size);
1388 	WARN_ON(ret != 0);
1389 }
1390 
1391 /**
1392  * do_grow - Touch and update inode size
1393  * @inode: The inode
1394  * @size: The new size
1395  *
1396  * This function updates the timestamps on the inode and
1397  * may also increase the size of the inode. This function
1398  * must not be called with @size any smaller than the current
1399  * inode size.
1400  *
1401  * Although it is not strictly required to unstuff files here,
1402  * earlier versions of GFS2 have a bug in the stuffed file reading
1403  * code which will result in a buffer overrun if the size is larger
1404  * than the max stuffed file size. In order to prevent this from
1405  * occurring, such files are unstuffed, but in other cases we can
1406  * just update the inode size directly.
1407  *
1408  * Returns: 0 on success, or -ve on error
1409  */
1410 
1411 static int do_grow(struct inode *inode, u64 size)
1412 {
1413 	struct gfs2_inode *ip = GFS2_I(inode);
1414 	struct gfs2_sbd *sdp = GFS2_SB(inode);
1415 	struct gfs2_alloc_parms ap = { .target = 1, };
1416 	struct buffer_head *dibh;
1417 	int error;
1418 	int unstuff = 0;
1419 
1420 	if (gfs2_is_stuffed(ip) &&
1421 	    (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) {
1422 		error = gfs2_quota_lock_check(ip, &ap);
1423 		if (error)
1424 			return error;
1425 
1426 		error = gfs2_inplace_reserve(ip, &ap);
1427 		if (error)
1428 			goto do_grow_qunlock;
1429 		unstuff = 1;
1430 	}
1431 
1432 	error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
1433 				 (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
1434 				  0 : RES_QUOTA), 0);
1435 	if (error)
1436 		goto do_grow_release;
1437 
1438 	if (unstuff) {
1439 		error = gfs2_unstuff_dinode(ip, NULL);
1440 		if (error)
1441 			goto do_end_trans;
1442 	}
1443 
1444 	error = gfs2_meta_inode_buffer(ip, &dibh);
1445 	if (error)
1446 		goto do_end_trans;
1447 
1448 	i_size_write(inode, size);
1449 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1450 	gfs2_trans_add_meta(ip->i_gl, dibh);
1451 	gfs2_dinode_out(ip, dibh->b_data);
1452 	brelse(dibh);
1453 
1454 do_end_trans:
1455 	gfs2_trans_end(sdp);
1456 do_grow_release:
1457 	if (unstuff) {
1458 		gfs2_inplace_release(ip);
1459 do_grow_qunlock:
1460 		gfs2_quota_unlock(ip);
1461 	}
1462 	return error;
1463 }
1464 
1465 /**
1466  * gfs2_setattr_size - make a file a given size
1467  * @inode: the inode
1468  * @newsize: the size to make the file
1469  *
1470  * The file size can grow, shrink, or stay the same size. This
1471  * is called holding i_mutex and an exclusive glock on the inode
1472  * in question.
1473  *
1474  * Returns: errno
1475  */
1476 
1477 int gfs2_setattr_size(struct inode *inode, u64 newsize)
1478 {
1479 	struct gfs2_inode *ip = GFS2_I(inode);
1480 	int ret;
1481 	u64 oldsize;
1482 
1483 	BUG_ON(!S_ISREG(inode->i_mode));
1484 
1485 	ret = inode_newsize_ok(inode, newsize);
1486 	if (ret)
1487 		return ret;
1488 
1489 	inode_dio_wait(inode);
1490 
1491 	ret = gfs2_rsqa_alloc(ip);
1492 	if (ret)
1493 		goto out;
1494 
1495 	oldsize = inode->i_size;
1496 	if (newsize >= oldsize) {
1497 		ret = do_grow(inode, newsize);
1498 		goto out;
1499 	}
1500 
1501 	ret = do_shrink(inode, oldsize, newsize);
1502 out:
1503 	gfs2_rsqa_delete(ip, NULL);
1504 	return ret;
1505 }
1506 
1507 int gfs2_truncatei_resume(struct gfs2_inode *ip)
1508 {
1509 	int error;
1510 	error = trunc_dealloc(ip, i_size_read(&ip->i_inode));
1511 	if (!error)
1512 		error = trunc_end(ip);
1513 	return error;
1514 }
1515 
1516 int gfs2_file_dealloc(struct gfs2_inode *ip)
1517 {
1518 	return trunc_dealloc(ip, 0);
1519 }
1520 
1521 /**
1522  * gfs2_free_journal_extents - Free cached journal bmap info
1523  * @jd: The journal
1524  *
1525  */
1526 
1527 void gfs2_free_journal_extents(struct gfs2_jdesc *jd)
1528 {
1529 	struct gfs2_journal_extent *jext;
1530 
1531 	while(!list_empty(&jd->extent_list)) {
1532 		jext = list_entry(jd->extent_list.next, struct gfs2_journal_extent, list);
1533 		list_del(&jext->list);
1534 		kfree(jext);
1535 	}
1536 }
1537 
1538 /**
1539  * gfs2_add_jextent - Add or merge a new extent to extent cache
1540  * @jd: The journal descriptor
1541  * @lblock: The logical block at start of new extent
1542  * @dblock: The physical block at start of new extent
1543  * @blocks: Size of extent in fs blocks
1544  *
1545  * Returns: 0 on success or -ENOMEM
1546  */
1547 
1548 static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks)
1549 {
1550 	struct gfs2_journal_extent *jext;
1551 
1552 	if (!list_empty(&jd->extent_list)) {
1553 		jext = list_entry(jd->extent_list.prev, struct gfs2_journal_extent, list);
1554 		if ((jext->dblock + jext->blocks) == dblock) {
1555 			jext->blocks += blocks;
1556 			return 0;
1557 		}
1558 	}
1559 
1560 	jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS);
1561 	if (jext == NULL)
1562 		return -ENOMEM;
1563 	jext->dblock = dblock;
1564 	jext->lblock = lblock;
1565 	jext->blocks = blocks;
1566 	list_add_tail(&jext->list, &jd->extent_list);
1567 	jd->nr_extents++;
1568 	return 0;
1569 }
1570 
1571 /**
1572  * gfs2_map_journal_extents - Cache journal bmap info
1573  * @sdp: The super block
1574  * @jd: The journal to map
1575  *
1576  * Create a reusable "extent" mapping from all logical
1577  * blocks to all physical blocks for the given journal.  This will save
1578  * us time when writing journal blocks.  Most journals will have only one
1579  * extent that maps all their logical blocks.  That's because gfs2.mkfs
1580  * arranges the journal blocks sequentially to maximize performance.
1581  * So the extent would map the first block for the entire file length.
1582  * However, gfs2_jadd can happen while file activity is happening, so
1583  * those journals may not be sequential.  Less likely is the case where
1584  * the users created their own journals by mounting the metafs and
1585  * laying it out.  But it's still possible.  These journals might have
1586  * several extents.
1587  *
1588  * Returns: 0 on success, or error on failure
1589  */
1590 
1591 int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
1592 {
1593 	u64 lblock = 0;
1594 	u64 lblock_stop;
1595 	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
1596 	struct buffer_head bh;
1597 	unsigned int shift = sdp->sd_sb.sb_bsize_shift;
1598 	u64 size;
1599 	int rc;
1600 
1601 	lblock_stop = i_size_read(jd->jd_inode) >> shift;
1602 	size = (lblock_stop - lblock) << shift;
1603 	jd->nr_extents = 0;
1604 	WARN_ON(!list_empty(&jd->extent_list));
1605 
1606 	do {
1607 		bh.b_state = 0;
1608 		bh.b_blocknr = 0;
1609 		bh.b_size = size;
1610 		rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0);
1611 		if (rc || !buffer_mapped(&bh))
1612 			goto fail;
1613 		rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift);
1614 		if (rc)
1615 			goto fail;
1616 		size -= bh.b_size;
1617 		lblock += (bh.b_size >> ip->i_inode.i_blkbits);
1618 	} while(size > 0);
1619 
1620 	fs_info(sdp, "journal %d mapped with %u extents\n", jd->jd_jid,
1621 		jd->nr_extents);
1622 	return 0;
1623 
1624 fail:
1625 	fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n",
1626 		rc, jd->jd_jid,
1627 		(unsigned long long)(i_size_read(jd->jd_inode) - size),
1628 		jd->nr_extents);
1629 	fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n",
1630 		rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr,
1631 		bh.b_state, (unsigned long long)bh.b_size);
1632 	gfs2_free_journal_extents(jd);
1633 	return rc;
1634 }
1635 
1636 /**
1637  * gfs2_write_alloc_required - figure out if a write will require an allocation
1638  * @ip: the file being written to
1639  * @offset: the offset to write to
1640  * @len: the number of bytes being written
1641  *
1642  * Returns: 1 if an alloc is required, 0 otherwise
1643  */
1644 
1645 int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1646 			      unsigned int len)
1647 {
1648 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1649 	struct buffer_head bh;
1650 	unsigned int shift;
1651 	u64 lblock, lblock_stop, size;
1652 	u64 end_of_file;
1653 
1654 	if (!len)
1655 		return 0;
1656 
1657 	if (gfs2_is_stuffed(ip)) {
1658 		if (offset + len >
1659 		    sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
1660 			return 1;
1661 		return 0;
1662 	}
1663 
1664 	shift = sdp->sd_sb.sb_bsize_shift;
1665 	BUG_ON(gfs2_is_dir(ip));
1666 	end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
1667 	lblock = offset >> shift;
1668 	lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1669 	if (lblock_stop > end_of_file)
1670 		return 1;
1671 
1672 	size = (lblock_stop - lblock) << shift;
1673 	do {
1674 		bh.b_state = 0;
1675 		bh.b_size = size;
1676 		gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
1677 		if (!buffer_mapped(&bh))
1678 			return 1;
1679 		size -= bh.b_size;
1680 		lblock += (bh.b_size >> ip->i_inode.i_blkbits);
1681 	} while(size > 0);
1682 
1683 	return 0;
1684 }
1685 
1686