xref: /openbmc/linux/fs/gfs2/rgrp.c (revision c5c87812)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
4  * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
5  */
6 
7 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
8 
9 #include <linux/slab.h>
10 #include <linux/spinlock.h>
11 #include <linux/completion.h>
12 #include <linux/buffer_head.h>
13 #include <linux/fs.h>
14 #include <linux/gfs2_ondisk.h>
15 #include <linux/prefetch.h>
16 #include <linux/blkdev.h>
17 #include <linux/rbtree.h>
18 #include <linux/random.h>
19 
20 #include "gfs2.h"
21 #include "incore.h"
22 #include "glock.h"
23 #include "glops.h"
24 #include "lops.h"
25 #include "meta_io.h"
26 #include "quota.h"
27 #include "rgrp.h"
28 #include "super.h"
29 #include "trans.h"
30 #include "util.h"
31 #include "log.h"
32 #include "inode.h"
33 #include "trace_gfs2.h"
34 #include "dir.h"
35 
36 #define BFITNOENT ((u32)~0)
37 #define NO_BLOCK ((u64)~0)
38 
39 /*
40  * These routines are used by the resource group routines (rgrp.c)
41  * to keep track of block allocation.  Each block is represented by two
42  * bits.  So, each byte represents GFS2_NBBY (i.e. 4) blocks.
43  *
44  * 0 = Free
45  * 1 = Used (not metadata)
46  * 2 = Unlinked (still in use) inode
47  * 3 = Used (metadata)
48  */
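
/*
 * Illustrative sketch (not part of this file's logic): recovering the
 * two-bit state of block "blk" from a packed bitmap, using the same
 * GFS2_NBBY / GFS2_BIT_SIZE arithmetic as the helpers below.
 */
static inline u8 example_block_state(const u8 *bitmap, u32 blk)
{
	const u8 byte = bitmap[blk / GFS2_NBBY];	/* 4 blocks per byte */
	const unsigned int bit = (blk % GFS2_NBBY) * GFS2_BIT_SIZE;

	return (byte >> bit) & GFS2_BIT_MASK;	/* one of the states above */
}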
49 
50 struct gfs2_extent {
51 	struct gfs2_rbm rbm;
52 	u32 len;
53 };
54 
55 static const char valid_change[16] = {
56 	        /* current */
57 	/* n */ 0, 1, 1, 1,
58 	/* e */ 1, 0, 0, 0,
59 	/* w */ 0, 0, 0, 1,
60 	        1, 0, 0, 0
61 };
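
/*
 * Read as valid_change[new_state * 4 + cur_state]: rows are the new
 * state, columns the current one.  A block may be freed from any in-use
 * state, data and metadata blocks may only be allocated from free, and
 * only a metadata (dinode) block may become unlinked.
 */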
62 
63 static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
64 			 const struct gfs2_inode *ip, bool nowrap);
65 
66 
67 /**
68  * gfs2_setbit - Set a bit in the bitmaps
69  * @rbm: The position of the bit to set
70  * @do_clone: Also set the clone bitmap, if it exists
71  * @new_state: the new state of the block
72  *
73  */
74 
75 static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
76 			       unsigned char new_state)
77 {
78 	unsigned char *byte1, *byte2, *end, cur_state;
79 	struct gfs2_bitmap *bi = rbm_bi(rbm);
80 	unsigned int buflen = bi->bi_bytes;
81 	const unsigned int bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
82 
83 	byte1 = bi->bi_bh->b_data + bi->bi_offset + (rbm->offset / GFS2_NBBY);
84 	end = bi->bi_bh->b_data + bi->bi_offset + buflen;
85 
86 	BUG_ON(byte1 >= end);
87 
88 	cur_state = (*byte1 >> bit) & GFS2_BIT_MASK;
89 
90 	if (unlikely(!valid_change[new_state * 4 + cur_state])) {
91 		struct gfs2_sbd *sdp = rbm->rgd->rd_sbd;
92 
93 		fs_warn(sdp, "buf_blk = 0x%x old_state=%d, new_state=%d\n",
94 			rbm->offset, cur_state, new_state);
95 		fs_warn(sdp, "rgrp=0x%llx bi_start=0x%x biblk: 0x%llx\n",
96 			(unsigned long long)rbm->rgd->rd_addr, bi->bi_start,
97 			(unsigned long long)bi->bi_bh->b_blocknr);
98 		fs_warn(sdp, "bi_offset=0x%x bi_bytes=0x%x block=0x%llx\n",
99 			bi->bi_offset, bi->bi_bytes,
100 			(unsigned long long)gfs2_rbm_to_block(rbm));
101 		dump_stack();
102 		gfs2_consist_rgrpd(rbm->rgd);
103 		return;
104 	}
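	/*
	 * XOR in only the bits that differ: (cur_state ^ new_state) << bit
	 * flips the two-bit entry in place without disturbing the other
	 * three entries packed into the same byte.
	 */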
105 	*byte1 ^= (cur_state ^ new_state) << bit;
106 
107 	if (do_clone && bi->bi_clone) {
108 		byte2 = bi->bi_clone + bi->bi_offset + (rbm->offset / GFS2_NBBY);
109 		cur_state = (*byte2 >> bit) & GFS2_BIT_MASK;
110 		*byte2 ^= (cur_state ^ new_state) << bit;
111 	}
112 }
113 
114 /**
115  * gfs2_testbit - test a bit in the bitmaps
116  * @rbm: The bit to test
117  * @use_clone: If true, test the clone bitmap, not the official bitmap.
118  *
119  * Some callers like gfs2_unaligned_extlen need to test the clone bitmaps,
120  * not the "real" bitmaps, to avoid allocating recently freed blocks.
121  *
122  * Returns: The two bit block state of the requested bit
123  */
124 
125 static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm, bool use_clone)
126 {
127 	struct gfs2_bitmap *bi = rbm_bi(rbm);
128 	const u8 *buffer;
129 	const u8 *byte;
130 	unsigned int bit;
131 
132 	if (use_clone && bi->bi_clone)
133 		buffer = bi->bi_clone;
134 	else
135 		buffer = bi->bi_bh->b_data;
136 	buffer += bi->bi_offset;
137 	byte = buffer + (rbm->offset / GFS2_NBBY);
138 	bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
139 
140 	return (*byte >> bit) & GFS2_BIT_MASK;
141 }
142 
143 /**
144  * gfs2_bit_search
145  * @ptr: Pointer to bitmap data
146  * @mask: Mask to use (normally 0x55555.... but adjusted for search start)
147  * @state: The state we are searching for
148  *
149  * We xor the bitmap data with a pattern which is the bitwise opposite
150  * of what we are looking for; this gives rise to a pattern of ones
151  * wherever there is a match. Since we have two bits per entry, we
152  * take this pattern, shift it down by one place and then AND it with
153  * the original. All the even bit positions (0,2,4, etc) then represent
154  * successful matches, so we mask with 0x55555..... to remove the unwanted
155  * odd bit positions.
156  *
157  * This allows searching of a whole u64 at once (32 blocks) with a
158  * single test (on 64 bit arches).
159  */
160 
161 static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
162 {
163 	u64 tmp;
164 	static const u64 search[] = {
165 		[0] = 0xffffffffffffffffULL,
166 		[1] = 0xaaaaaaaaaaaaaaaaULL,
167 		[2] = 0x5555555555555555ULL,
168 		[3] = 0x0000000000000000ULL,
169 	};
170 	tmp = le64_to_cpu(*ptr) ^ search[state];
171 	tmp &= (tmp >> 1);
172 	tmp &= mask;
173 	return tmp;
174 }
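
/*
 * Worked example (illustrative): take one byte 0x87, which packs the
 * states 3,1,0,2 (LSB first), and search for state 1.  tmp = 0x87 ^
 * search[1] (0xaa) = 0x2d; tmp &= tmp >> 1 leaves 0x04; masking with
 * 0x55 keeps 0x04, and __ffs64(0x04) / 2 = 1, i.e. the second entry
 * is the match.
 */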
175 
176 /**
177  * rs_cmp - multi-block reservation range compare
178  * @blk: absolute file system block number of the new reservation
179  * @len: number of blocks in the new reservation
180  * @rs: existing reservation to compare against
181  *
182  * returns: 1 if the block range is beyond the reach of the reservation
183  *         -1 if the block range is before the start of the reservation
184  *          0 if the block range overlaps with the reservation
185  */
186 static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs)
187 {
188 	u64 startblk = gfs2_rbm_to_block(&rs->rs_rbm);
189 
190 	if (blk >= startblk + rs->rs_free)
191 		return 1;
192 	if (blk + len - 1 < startblk)
193 		return -1;
194 	return 0;
195 }
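
/*
 * Example (illustrative): for a reservation starting at block 1000 with
 * rs_free = 16 (blocks 1000..1015), rs_cmp(990, 8, rs) returns -1 (the
 * range ends at 997, before the start), rs_cmp(1016, 4, rs) returns 1
 * (it starts past the last reserved block), and rs_cmp(998, 4, rs)
 * returns 0 (blocks 998..1001 overlap the reservation).
 */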
196 
197 /**
198  * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
199  *       a block in a given allocation state.
200  * @buf: the buffer that holds the bitmaps
201  * @len: the length (in bytes) of the buffer
202  * @goal: start search at this block's bit-pair (within @buf)
203  * @state: GFS2_BLKST_XXX the state of the block we're looking for.
204  *
205  * Scope of @goal and returned block number is only within this bitmap buffer,
206  * not entire rgrp or filesystem.  @buf will be offset from the actual
207  * beginning of a bitmap block buffer, skipping any header structures, but
208  * headers are always a multiple of 64 bits long so that the buffer is
209  * always aligned to a 64 bit boundary.
210  *
211  * The size of the buffer is in bytes, but it is assumed that it is
212  * always ok to read a complete multiple of 64 bits at the end
213  * of the block in case the end is not aligned to a natural boundary.
214  *
215  * Return: the block number (bitmap buffer scope) that was found
216  */
217 
218 static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
219 		       u32 goal, u8 state)
220 {
221 	u32 spoint = (goal << 1) & ((8*sizeof(u64)) - 1);
222 	const __le64 *ptr = ((__le64 *)buf) + (goal >> 5);
223 	const __le64 *end = (__le64 *)(buf + ALIGN(len, sizeof(u64)));
224 	u64 tmp;
225 	u64 mask = 0x5555555555555555ULL;
226 	u32 bit;
227 
228 	/* Mask off bits we don't care about at the start of the search */
229 	mask <<= spoint;
230 	tmp = gfs2_bit_search(ptr, mask, state);
231 	ptr++;
232 	while(tmp == 0 && ptr < end) {
233 		tmp = gfs2_bit_search(ptr, 0x5555555555555555ULL, state);
234 		ptr++;
235 	}
236 	/* Mask off any bits which are more than len bytes from the start */
237 	if (ptr == end && (len & (sizeof(u64) - 1)))
238 		tmp &= (((u64)~0) >> (64 - 8*(len & (sizeof(u64) - 1))));
239 	/* Didn't find anything, so return */
240 	if (tmp == 0)
241 		return BFITNOENT;
242 	ptr--;
243 	bit = __ffs64(tmp);
244 	bit /= 2;	/* two bits per entry in the bitmap */
245 	return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit;
246 }
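
/*
 * Usage sketch (illustrative; count_unlinked() below follows the same
 * pattern): count every block of a given state in one bitmap by resuming
 * the search one block past each hit until gfs2_bitfit() reports
 * BFITNOENT.
 */
static u32 example_count_state(const struct gfs2_bitmap *bi, u8 state)
{
	const u8 *buffer = bi->bi_bh->b_data + bi->bi_offset;
	u32 goal = 0, count = 0;

	while (goal < bi->bi_blocks) {
		goal = gfs2_bitfit(buffer, bi->bi_bytes, goal, state);
		if (goal == BFITNOENT)
			break;
		count++;
		goal++;
	}
	return count;
}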
247 
248 /**
249  * gfs2_rbm_from_block - Set the rbm based upon rgd and block number
250  * @rbm: The rbm with rgd already set correctly
251  * @block: The block number (filesystem relative)
252  *
253  * This sets the bi and offset members of an rbm based on a
254  * resource group and a filesystem relative block number. The
255  * resource group must be set in the rbm on entry, the bi and
256  * offset members will be set by this function.
257  *
258  * Returns: 0 on success, or an error code
259  */
260 
261 static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
262 {
263 	if (!rgrp_contains_block(rbm->rgd, block))
264 		return -E2BIG;
265 	rbm->bii = 0;
266 	rbm->offset = block - rbm->rgd->rd_data0;
267 	/* Check if the block is within the first bitmap */
268 	if (rbm->offset < rbm_bi(rbm)->bi_blocks)
269 		return 0;
270 
271 	/* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */
272 	rbm->offset += (sizeof(struct gfs2_rgrp) -
273 			sizeof(struct gfs2_meta_header)) * GFS2_NBBY;
274 	rbm->bii = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
275 	rbm->offset -= rbm->bii * rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
276 	return 0;
277 }
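
/*
 * Note on the adjustment above: the first bitmap lives in the rgrp header
 * block, whose struct gfs2_rgrp header is larger than the struct
 * gfs2_meta_header used by the remaining bitmap blocks, so it holds fewer
 * entries.  Biasing the offset by the header size difference (converted
 * to block entries via GFS2_NBBY) makes all bitmaps look uniform, so a
 * single divide/modulo by sd_blocks_per_bitmap locates any block.
 */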
278 
279 /**
280  * gfs2_rbm_incr - increment an rbm structure
281  * @rbm: The rbm with rgd already set correctly
282  *
283  * This function takes an existing rbm structure and increments it to the next
284  * viable block offset.
285  *
286  * Returns: If incrementing the offset would cause the rbm to go past the
287  *          end of the rgrp, true is returned, otherwise false.
288  *
289  */
290 
291 static bool gfs2_rbm_incr(struct gfs2_rbm *rbm)
292 {
293 	if (rbm->offset + 1 < rbm_bi(rbm)->bi_blocks) { /* in the same bitmap */
294 		rbm->offset++;
295 		return false;
296 	}
297 	if (rbm->bii == rbm->rgd->rd_length - 1) /* at the last bitmap */
298 		return true;
299 
300 	rbm->offset = 0;
301 	rbm->bii++;
302 	return false;
303 }
304 
305 /**
306  * gfs2_unaligned_extlen - Look for free blocks which are not byte aligned
307  * @rbm: Position to search (value/result)
308  * @n_unaligned: Number of unaligned blocks to check
309  * @len: Decremented for each block found (terminate on zero)
310  *
311  * Returns: true if a non-free block is encountered
312  */
313 
314 static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *len)
315 {
316 	u32 n;
317 	u8 res;
318 
319 	for (n = 0; n < n_unaligned; n++) {
320 		res = gfs2_testbit(rbm, true);
321 		if (res != GFS2_BLKST_FREE)
322 			return true;
323 		(*len)--;
324 		if (*len == 0)
325 			return true;
326 		if (gfs2_rbm_incr(rbm))
327 			return true;
328 	}
329 
330 	return false;
331 }
332 
333 /**
334  * gfs2_free_extlen - Return extent length of free blocks
335  * @rrbm: Starting position
336  * @len: Max length to check
337  *
338  * Starting at the block specified by the rbm, see how many free blocks
339  * there are, not reading more than len blocks ahead. This can be done
340  * using memchr_inv when the blocks are byte aligned, but has to be done
341  * on a block by block basis in case of unaligned blocks. Also this
342  * function can cope with bitmap boundaries (although it must stop on
343  * a resource group boundary)
344  *
345  * Returns: Number of free blocks in the extent
346  */
347 
348 static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
349 {
350 	struct gfs2_rbm rbm = *rrbm;
351 	u32 n_unaligned = rbm.offset & 3;
352 	u32 size = len;
353 	u32 bytes;
354 	u32 chunk_size;
355 	u8 *ptr, *start, *end;
356 	u64 block;
357 	struct gfs2_bitmap *bi;
358 
359 	if (n_unaligned &&
360 	    gfs2_unaligned_extlen(&rbm, 4 - n_unaligned, &len))
361 		goto out;
362 
363 	n_unaligned = len & 3;
364 	/* Start is now byte aligned */
365 	while (len > 3) {
366 		bi = rbm_bi(&rbm);
367 		start = bi->bi_bh->b_data;
368 		if (bi->bi_clone)
369 			start = bi->bi_clone;
370 		start += bi->bi_offset;
371 		end = start + bi->bi_bytes;
372 		BUG_ON(rbm.offset & 3);
373 		start += (rbm.offset / GFS2_NBBY);
374 		bytes = min_t(u32, len / GFS2_NBBY, (end - start));
375 		ptr = memchr_inv(start, 0, bytes);
376 		chunk_size = ((ptr == NULL) ? bytes : (ptr - start));
377 		chunk_size *= GFS2_NBBY;
378 		BUG_ON(len < chunk_size);
379 		len -= chunk_size;
380 		block = gfs2_rbm_to_block(&rbm);
381 		if (gfs2_rbm_from_block(&rbm, block + chunk_size)) {
382 			n_unaligned = 0;
383 			break;
384 		}
385 		if (ptr) {
386 			n_unaligned = 3;
387 			break;
388 		}
389 		n_unaligned = len & 3;
390 	}
391 
392 	/* Deal with any bits left over at the end */
393 	if (n_unaligned)
394 		gfs2_unaligned_extlen(&rbm, n_unaligned, &len);
395 out:
396 	return size - len;
397 }
398 
399 /**
400  * gfs2_bitcount - count the number of bits in a certain state
401  * @rgd: the resource group descriptor
402  * @buffer: the buffer that holds the bitmaps
403  * @buflen: the length (in bytes) of the buffer
404  * @state: the state of the block we're looking for
405  *
406  * Returns: The number of bits
407  */
408 
409 static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, const u8 *buffer,
410 			 unsigned int buflen, u8 state)
411 {
412 	const u8 *byte = buffer;
413 	const u8 *end = buffer + buflen;
414 	const u8 state1 = state << 2;
415 	const u8 state2 = state << 4;
416 	const u8 state3 = state << 6;
417 	u32 count = 0;
418 
419 	for (; byte < end; byte++) {
420 		if (((*byte) & 0x03) == state)
421 			count++;
422 		if (((*byte) & 0x0C) == state1)
423 			count++;
424 		if (((*byte) & 0x30) == state2)
425 			count++;
426 		if (((*byte) & 0xC0) == state3)
427 			count++;
428 	}
429 
430 	return count;
431 }
432 
433 /**
434  * gfs2_rgrp_verify - Verify that a resource group is consistent
435  * @rgd: the rgrp
436  *
437  */
438 
439 void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
440 {
441 	struct gfs2_sbd *sdp = rgd->rd_sbd;
442 	struct gfs2_bitmap *bi = NULL;
443 	u32 length = rgd->rd_length;
444 	u32 count[4], tmp;
445 	int buf, x;
446 
447 	memset(count, 0, 4 * sizeof(u32));
448 
449 	/* Count # blocks in each of 4 possible allocation states */
450 	for (buf = 0; buf < length; buf++) {
451 		bi = rgd->rd_bits + buf;
452 		for (x = 0; x < 4; x++)
453 			count[x] += gfs2_bitcount(rgd,
454 						  bi->bi_bh->b_data +
455 						  bi->bi_offset,
456 						  bi->bi_bytes, x);
457 	}
458 
459 	if (count[0] != rgd->rd_free) {
460 		gfs2_lm(sdp, "free data mismatch:  %u != %u\n",
461 			count[0], rgd->rd_free);
462 		gfs2_consist_rgrpd(rgd);
463 		return;
464 	}
465 
466 	tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
467 	if (count[1] != tmp) {
468 		gfs2_lm(sdp, "used data mismatch:  %u != %u\n",
469 			count[1], tmp);
470 		gfs2_consist_rgrpd(rgd);
471 		return;
472 	}
473 
474 	if (count[2] + count[3] != rgd->rd_dinodes) {
475 		gfs2_lm(sdp, "used metadata mismatch:  %u != %u\n",
476 			count[2] + count[3], rgd->rd_dinodes);
477 		gfs2_consist_rgrpd(rgd);
478 		return;
479 	}
480 }
481 
482 /**
483  * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
484  * @sdp: The GFS2 superblock
485  * @blk: The data block number
486  * @exact: True if this needs to be an exact match
487  *
488  * The @exact argument should be set to true by most callers. The exception
489  * is when we need to match blocks which are not represented by the rgrp
490  * bitmap, but which are part of the rgrp (i.e. padding blocks) which are
491  * there for alignment purposes. Another way of looking at it is that @exact
492  * matches only valid data/metadata blocks, but with @exact false, it will
493  * match any block within the extent of the rgrp.
494  *
495  * Returns: The resource group, or NULL if not found
496  */
497 
498 struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact)
499 {
500 	struct rb_node *n, *next;
501 	struct gfs2_rgrpd *cur;
502 
503 	spin_lock(&sdp->sd_rindex_spin);
504 	n = sdp->sd_rindex_tree.rb_node;
505 	while (n) {
506 		cur = rb_entry(n, struct gfs2_rgrpd, rd_node);
507 		next = NULL;
508 		if (blk < cur->rd_addr)
509 			next = n->rb_left;
510 		else if (blk >= cur->rd_data0 + cur->rd_data)
511 			next = n->rb_right;
512 		if (next == NULL) {
513 			spin_unlock(&sdp->sd_rindex_spin);
514 			if (exact) {
515 				if (blk < cur->rd_addr)
516 					return NULL;
517 				if (blk >= cur->rd_data0 + cur->rd_data)
518 					return NULL;
519 			}
520 			return cur;
521 		}
522 		n = next;
523 	}
524 	spin_unlock(&sdp->sd_rindex_spin);
525 
526 	return NULL;
527 }
528 
529 /**
530  * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem
531  * @sdp: The GFS2 superblock
532  *
533  * Returns: The first rgrp in the filesystem
534  */
535 
536 struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
537 {
538 	const struct rb_node *n;
539 	struct gfs2_rgrpd *rgd;
540 
541 	spin_lock(&sdp->sd_rindex_spin);
542 	n = rb_first(&sdp->sd_rindex_tree);
543 	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
544 	spin_unlock(&sdp->sd_rindex_spin);
545 
546 	return rgd;
547 }
548 
549 /**
550  * gfs2_rgrpd_get_next - get the next RG
551  * @rgd: the resource group descriptor
552  *
553  * Returns: The next rgrp
554  */
555 
556 struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
557 {
558 	struct gfs2_sbd *sdp = rgd->rd_sbd;
559 	const struct rb_node *n;
560 
561 	spin_lock(&sdp->sd_rindex_spin);
562 	n = rb_next(&rgd->rd_node);
563 	if (n == NULL)
564 		n = rb_first(&sdp->sd_rindex_tree);
565 
566 	if (unlikely(&rgd->rd_node == n)) {
567 		spin_unlock(&sdp->sd_rindex_spin);
568 		return NULL;
569 	}
570 	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
571 	spin_unlock(&sdp->sd_rindex_spin);
572 	return rgd;
573 }
574 
575 void check_and_update_goal(struct gfs2_inode *ip)
576 {
577 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
578 	if (!ip->i_goal || gfs2_blk2rgrpd(sdp, ip->i_goal, 1) == NULL)
579 		ip->i_goal = ip->i_no_addr;
580 }
581 
582 void gfs2_free_clones(struct gfs2_rgrpd *rgd)
583 {
584 	int x;
585 
586 	for (x = 0; x < rgd->rd_length; x++) {
587 		struct gfs2_bitmap *bi = rgd->rd_bits + x;
588 		kfree(bi->bi_clone);
589 		bi->bi_clone = NULL;
590 	}
591 }
592 
593 static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs,
594 		    const char *fs_id_buf)
595 {
596 	struct gfs2_inode *ip = container_of(rs, struct gfs2_inode, i_res);
597 
598 	gfs2_print_dbg(seq, "%s  B: n:%llu s:%llu b:%u f:%u\n", fs_id_buf,
599 		       (unsigned long long)ip->i_no_addr,
600 		       (unsigned long long)gfs2_rbm_to_block(&rs->rs_rbm),
601 		       rs->rs_rbm.offset, rs->rs_free);
602 }
603 
604 /**
605  * __rs_deltree - remove a multi-block reservation from the rgd tree
606  * @rs: The reservation to remove
607  *
608  */
609 static void __rs_deltree(struct gfs2_blkreserv *rs)
610 {
611 	struct gfs2_rgrpd *rgd;
612 
613 	if (!gfs2_rs_active(rs))
614 		return;
615 
616 	rgd = rs->rs_rbm.rgd;
617 	trace_gfs2_rs(rs, TRACE_RS_TREEDEL);
618 	rb_erase(&rs->rs_node, &rgd->rd_rstree);
619 	RB_CLEAR_NODE(&rs->rs_node);
620 
621 	if (rs->rs_free) {
622 		u64 last_block = gfs2_rbm_to_block(&rs->rs_rbm) +
623 				 rs->rs_free - 1;
624 		struct gfs2_rbm last_rbm = { .rgd = rs->rs_rbm.rgd, };
625 		struct gfs2_bitmap *start, *last;
626 
627 		/* return reserved blocks to the rgrp */
628 		BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free);
629 		rs->rs_rbm.rgd->rd_reserved -= rs->rs_free;
630 		/* The rgrp extent failure point is likely not to increase;
631 		   it will only do so if the freed blocks are somehow
632 		   contiguous with a span of free blocks that follows. Still,
633 		   it will force the number to be recalculated later. */
634 		rgd->rd_extfail_pt += rs->rs_free;
635 		rs->rs_free = 0;
636 		if (gfs2_rbm_from_block(&last_rbm, last_block))
637 			return;
638 		start = rbm_bi(&rs->rs_rbm);
639 		last = rbm_bi(&last_rbm);
640 		do
641 			clear_bit(GBF_FULL, &start->bi_flags);
642 		while (start++ != last);
643 	}
644 }
645 
646 /**
647  * gfs2_rs_deltree - remove a multi-block reservation from the rgd tree
648  * @rs: The reservation to remove
649  *
650  */
651 void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
652 {
653 	struct gfs2_rgrpd *rgd;
654 
655 	rgd = rs->rs_rbm.rgd;
656 	if (rgd) {
657 		spin_lock(&rgd->rd_rsspin);
658 		__rs_deltree(rs);
659 		BUG_ON(rs->rs_free);
660 		spin_unlock(&rgd->rd_rsspin);
661 	}
662 }
663 
664 /**
665  * gfs2_rs_delete - delete a multi-block reservation
666  * @ip: The inode for this reservation
667  * @wcount: The inode's write count, or NULL
668  *
669  */
670 void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount)
671 {
672 	down_write(&ip->i_rw_mutex);
673 	if ((wcount == NULL) || (atomic_read(wcount) <= 1))
674 		gfs2_rs_deltree(&ip->i_res);
675 	up_write(&ip->i_rw_mutex);
676 }
677 
678 /**
679  * return_all_reservations - return all reserved blocks back to the rgrp.
680  * @rgd: the rgrp that needs its space back
681  *
682  * We previously reserved a bunch of blocks for allocation. Now we need to
683  * give them back. This leaves the reservation structures intact, but removes
684  * all of their corresponding "no-fly zones".
685  */
686 static void return_all_reservations(struct gfs2_rgrpd *rgd)
687 {
688 	struct rb_node *n;
689 	struct gfs2_blkreserv *rs;
690 
691 	spin_lock(&rgd->rd_rsspin);
692 	while ((n = rb_first(&rgd->rd_rstree))) {
693 		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
694 		__rs_deltree(rs);
695 	}
696 	spin_unlock(&rgd->rd_rsspin);
697 }
698 
699 void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
700 {
701 	struct rb_node *n;
702 	struct gfs2_rgrpd *rgd;
703 	struct gfs2_glock *gl;
704 
705 	while ((n = rb_first(&sdp->sd_rindex_tree))) {
706 		rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
707 		gl = rgd->rd_gl;
708 
709 		rb_erase(n, &sdp->sd_rindex_tree);
710 
711 		if (gl) {
712 			if (gl->gl_state != LM_ST_UNLOCKED) {
713 				gfs2_glock_cb(gl, LM_ST_UNLOCKED);
714 				flush_delayed_work(&gl->gl_work);
715 			}
716 			gfs2_rgrp_brelse(rgd);
717 			glock_clear_object(gl, rgd);
718 			gfs2_glock_put(gl);
719 		}
720 
721 		gfs2_free_clones(rgd);
722 		kfree(rgd->rd_bits);
723 		rgd->rd_bits = NULL;
724 		return_all_reservations(rgd);
725 		kmem_cache_free(gfs2_rgrpd_cachep, rgd);
726 	}
727 }
728 
729 /**
730  * compute_bitstructs - Compute the bitmap sizes
731  * @rgd: The resource group descriptor
732  *
733  * Calculates bitmap descriptors, one for each block that contains bitmap data
734  *
735  * Returns: errno
736  */
737 
738 static int compute_bitstructs(struct gfs2_rgrpd *rgd)
739 {
740 	struct gfs2_sbd *sdp = rgd->rd_sbd;
741 	struct gfs2_bitmap *bi;
742 	u32 length = rgd->rd_length; /* # blocks in hdr & bitmap */
743 	u32 bytes_left, bytes;
744 	int x;
745 
746 	if (!length)
747 		return -EINVAL;
748 
749 	rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_NOFS);
750 	if (!rgd->rd_bits)
751 		return -ENOMEM;
752 
753 	bytes_left = rgd->rd_bitbytes;
754 
755 	for (x = 0; x < length; x++) {
756 		bi = rgd->rd_bits + x;
757 
758 		bi->bi_flags = 0;
759 		/* small rgrp; bitmap stored completely in header block */
760 		if (length == 1) {
761 			bytes = bytes_left;
762 			bi->bi_offset = sizeof(struct gfs2_rgrp);
763 			bi->bi_start = 0;
764 			bi->bi_bytes = bytes;
765 			bi->bi_blocks = bytes * GFS2_NBBY;
766 		/* header block */
767 		} else if (x == 0) {
768 			bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
769 			bi->bi_offset = sizeof(struct gfs2_rgrp);
770 			bi->bi_start = 0;
771 			bi->bi_bytes = bytes;
772 			bi->bi_blocks = bytes * GFS2_NBBY;
773 		/* last block */
774 		} else if (x + 1 == length) {
775 			bytes = bytes_left;
776 			bi->bi_offset = sizeof(struct gfs2_meta_header);
777 			bi->bi_start = rgd->rd_bitbytes - bytes_left;
778 			bi->bi_bytes = bytes;
779 			bi->bi_blocks = bytes * GFS2_NBBY;
780 		/* other blocks */
781 		} else {
782 			bytes = sdp->sd_sb.sb_bsize -
783 				sizeof(struct gfs2_meta_header);
784 			bi->bi_offset = sizeof(struct gfs2_meta_header);
785 			bi->bi_start = rgd->rd_bitbytes - bytes_left;
786 			bi->bi_bytes = bytes;
787 			bi->bi_blocks = bytes * GFS2_NBBY;
788 		}
789 
790 		bytes_left -= bytes;
791 	}
792 
793 	if (bytes_left) {
794 		gfs2_consist_rgrpd(rgd);
795 		return -EIO;
796 	}
797 	bi = rgd->rd_bits + (length - 1);
798 	if ((bi->bi_start + bi->bi_bytes) * GFS2_NBBY != rgd->rd_data) {
799 		gfs2_lm(sdp,
800 			"ri_addr = %llu\n"
801 			"ri_length = %u\n"
802 			"ri_data0 = %llu\n"
803 			"ri_data = %u\n"
804 			"ri_bitbytes = %u\n"
805 			"start=%u len=%u offset=%u\n",
806 			(unsigned long long)rgd->rd_addr,
807 			rgd->rd_length,
808 			(unsigned long long)rgd->rd_data0,
809 			rgd->rd_data,
810 			rgd->rd_bitbytes,
811 			bi->bi_start, bi->bi_bytes, bi->bi_offset);
812 		gfs2_consist_rgrpd(rgd);
813 		return -EIO;
814 	}
815 
816 	return 0;
817 }
818 
819 /**
820  * gfs2_ri_total - Total up the file system space, according to the rindex.
821  * @sdp: the filesystem
822  *
823  */
824 u64 gfs2_ri_total(struct gfs2_sbd *sdp)
825 {
826 	u64 total_data = 0;
827 	struct inode *inode = sdp->sd_rindex;
828 	struct gfs2_inode *ip = GFS2_I(inode);
829 	char buf[sizeof(struct gfs2_rindex)];
830 	int error, rgrps;
831 
832 	for (rgrps = 0;; rgrps++) {
833 		loff_t pos = rgrps * sizeof(struct gfs2_rindex);
834 
835 		if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode))
836 			break;
837 		error = gfs2_internal_read(ip, buf, &pos,
838 					   sizeof(struct gfs2_rindex));
839 		if (error != sizeof(struct gfs2_rindex))
840 			break;
841 		total_data += be32_to_cpu(((struct gfs2_rindex *)buf)->ri_data);
842 	}
843 	return total_data;
844 }
845 
846 static int rgd_insert(struct gfs2_rgrpd *rgd)
847 {
848 	struct gfs2_sbd *sdp = rgd->rd_sbd;
849 	struct rb_node **newn = &sdp->sd_rindex_tree.rb_node, *parent = NULL;
850 
851 	/* Figure out where to put new node */
852 	while (*newn) {
853 		struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd,
854 						  rd_node);
855 
856 		parent = *newn;
857 		if (rgd->rd_addr < cur->rd_addr)
858 			newn = &((*newn)->rb_left);
859 		else if (rgd->rd_addr > cur->rd_addr)
860 			newn = &((*newn)->rb_right);
861 		else
862 			return -EEXIST;
863 	}
864 
865 	rb_link_node(&rgd->rd_node, parent, newn);
866 	rb_insert_color(&rgd->rd_node, &sdp->sd_rindex_tree);
867 	sdp->sd_rgrps++;
868 	return 0;
869 }
870 
871 /**
872  * read_rindex_entry - Pull in a new resource index entry from the disk
873  * @ip: Pointer to the rindex inode
874  *
875  * Returns: 0 on success, > 0 on EOF, error code otherwise
876  */
877 
878 static int read_rindex_entry(struct gfs2_inode *ip)
879 {
880 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
881 	loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
882 	struct gfs2_rindex buf;
883 	int error;
884 	struct gfs2_rgrpd *rgd;
885 
886 	if (pos >= i_size_read(&ip->i_inode))
887 		return 1;
888 
889 	error = gfs2_internal_read(ip, (char *)&buf, &pos,
890 				   sizeof(struct gfs2_rindex));
891 
892 	if (error != sizeof(struct gfs2_rindex))
893 		return (error == 0) ? 1 : error;
894 
895 	rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS);
896 	error = -ENOMEM;
897 	if (!rgd)
898 		return error;
899 
900 	rgd->rd_sbd = sdp;
901 	rgd->rd_addr = be64_to_cpu(buf.ri_addr);
902 	rgd->rd_length = be32_to_cpu(buf.ri_length);
903 	rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
904 	rgd->rd_data = be32_to_cpu(buf.ri_data);
905 	rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
906 	spin_lock_init(&rgd->rd_rsspin);
907 
908 	error = compute_bitstructs(rgd);
909 	if (error)
910 		goto fail;
911 
912 	error = gfs2_glock_get(sdp, rgd->rd_addr,
913 			       &gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
914 	if (error)
915 		goto fail;
916 
917 	rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
918 	rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED);
919 	if (rgd->rd_data > sdp->sd_max_rg_data)
920 		sdp->sd_max_rg_data = rgd->rd_data;
921 	spin_lock(&sdp->sd_rindex_spin);
922 	error = rgd_insert(rgd);
923 	spin_unlock(&sdp->sd_rindex_spin);
924 	if (!error) {
925 		glock_set_object(rgd->rd_gl, rgd);
926 		return 0;
927 	}
928 
929 	error = 0; /* someone else read in the rgrp; free it and ignore it */
930 	gfs2_glock_put(rgd->rd_gl);
931 
932 fail:
933 	kfree(rgd->rd_bits);
934 	rgd->rd_bits = NULL;
935 	kmem_cache_free(gfs2_rgrpd_cachep, rgd);
936 	return error;
937 }
938 
939 /**
940  * set_rgrp_preferences - Run all the rgrps, selecting some we prefer to use
941  * @sdp: the GFS2 superblock
942  *
943  * The purpose of this function is to select a subset of the resource groups
944  * and mark them as PREFERRED. We do it in such a way that each node prefers
945  * to use a unique set of rgrps to minimize glock contention.
946  */
947 static void set_rgrp_preferences(struct gfs2_sbd *sdp)
948 {
949 	struct gfs2_rgrpd *rgd, *first;
950 	int i;
951 
952 	/* Skip an initial number of rgrps, based on this node's journal ID.
953 	   That should start each node out on its own set. */
954 	rgd = gfs2_rgrpd_get_first(sdp);
955 	for (i = 0; i < sdp->sd_lockstruct.ls_jid; i++)
956 		rgd = gfs2_rgrpd_get_next(rgd);
957 	first = rgd;
958 
959 	do {
960 		rgd->rd_flags |= GFS2_RDF_PREFERRED;
961 		for (i = 0; i < sdp->sd_journals; i++) {
962 			rgd = gfs2_rgrpd_get_next(rgd);
963 			if (!rgd || rgd == first)
964 				break;
965 		}
966 	} while (rgd && rgd != first);
967 }
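
/*
 * Example (illustrative): with sd_journals = 3 and twelve rgrps, the node
 * with journal ID 1 starts at rgrp 1 and marks rgrps 1, 4, 7 and 10 as
 * preferred, while journal IDs 0 and 2 claim the other two interleaved
 * sets, so each node gravitates toward a disjoint subset.
 */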
968 
969 /**
970  * gfs2_ri_update - Pull in a new resource index from the disk
971  * @ip: pointer to the rindex inode
972  *
973  * Returns: 0 on successful update, error code otherwise
974  */
975 
976 static int gfs2_ri_update(struct gfs2_inode *ip)
977 {
978 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
979 	int error;
980 
981 	do {
982 		error = read_rindex_entry(ip);
983 	} while (error == 0);
984 
985 	if (error < 0)
986 		return error;
987 
988 	set_rgrp_preferences(sdp);
989 
990 	sdp->sd_rindex_uptodate = 1;
991 	return 0;
992 }
993 
994 /**
995  * gfs2_rindex_update - Update the rindex if required
996  * @sdp: The GFS2 superblock
997  *
998  * We grab a lock on the rindex inode to make sure that it doesn't
999  * change whilst we are performing an operation. We keep this lock
1000  * for quite long periods of time compared to other locks. This
1001  * doesn't matter, since it is shared and it is very, very rarely
1002  * accessed in the exclusive mode (i.e. only when expanding the filesystem).
1003  *
1004  * This makes sure that we're using the latest copy of the resource index
1005  * special file, which might have been updated if someone expanded the
1006  * filesystem (via gfs2_grow utility), which adds new resource groups.
1007  *
1008  * Returns: 0 on success, error code otherwise
1009  */
1010 
1011 int gfs2_rindex_update(struct gfs2_sbd *sdp)
1012 {
1013 	struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
1014 	struct gfs2_glock *gl = ip->i_gl;
1015 	struct gfs2_holder ri_gh;
1016 	int error = 0;
1017 	int unlock_required = 0;
1018 
1019 	/* Read new copy from disk if we don't have the latest */
1020 	if (!sdp->sd_rindex_uptodate) {
1021 		if (!gfs2_glock_is_locked_by_me(gl)) {
1022 			error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh);
1023 			if (error)
1024 				return error;
1025 			unlock_required = 1;
1026 		}
1027 		if (!sdp->sd_rindex_uptodate)
1028 			error = gfs2_ri_update(ip);
1029 		if (unlock_required)
1030 			gfs2_glock_dq_uninit(&ri_gh);
1031 	}
1032 
1033 	return error;
1034 }
1035 
1036 static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
1037 {
1038 	const struct gfs2_rgrp *str = buf;
1039 	u32 rg_flags;
1040 
1041 	rg_flags = be32_to_cpu(str->rg_flags);
1042 	rg_flags &= ~GFS2_RDF_MASK;
1043 	rgd->rd_flags &= GFS2_RDF_MASK;
1044 	rgd->rd_flags |= rg_flags;
1045 	rgd->rd_free = be32_to_cpu(str->rg_free);
1046 	rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
1047 	rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
1048 	/* rd_data0, rd_data and rd_bitbytes already set from rindex */
1049 }
1050 
1051 static void gfs2_rgrp_ondisk2lvb(struct gfs2_rgrp_lvb *rgl, const void *buf)
1052 {
1053 	const struct gfs2_rgrp *str = buf;
1054 
1055 	rgl->rl_magic = cpu_to_be32(GFS2_MAGIC);
1056 	rgl->rl_flags = str->rg_flags;
1057 	rgl->rl_free = str->rg_free;
1058 	rgl->rl_dinodes = str->rg_dinodes;
1059 	rgl->rl_igeneration = str->rg_igeneration;
1060 	rgl->__pad = 0UL;
1061 }
1062 
1063 static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
1064 {
1065 	struct gfs2_rgrpd *next = gfs2_rgrpd_get_next(rgd);
1066 	struct gfs2_rgrp *str = buf;
1067 	u32 crc;
1068 
1069 	str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK);
1070 	str->rg_free = cpu_to_be32(rgd->rd_free);
1071 	str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
1072 	if (next == NULL)
1073 		str->rg_skip = 0;
1074 	else if (next->rd_addr > rgd->rd_addr)
1075 		str->rg_skip = cpu_to_be32(next->rd_addr - rgd->rd_addr);
1076 	str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
1077 	str->rg_data0 = cpu_to_be64(rgd->rd_data0);
1078 	str->rg_data = cpu_to_be32(rgd->rd_data);
1079 	str->rg_bitbytes = cpu_to_be32(rgd->rd_bitbytes);
1080 	str->rg_crc = 0;
1081 	crc = gfs2_disk_hash(buf, sizeof(struct gfs2_rgrp));
1082 	str->rg_crc = cpu_to_be32(crc);
1083 
1084 	memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
1085 	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, buf);
1086 }
1087 
1088 static int gfs2_rgrp_lvb_valid(struct gfs2_rgrpd *rgd)
1089 {
1090 	struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
1091 	struct gfs2_rgrp *str = (struct gfs2_rgrp *)rgd->rd_bits[0].bi_bh->b_data;
1092 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1093 	int valid = 1;
1094 
1095 	if (rgl->rl_flags != str->rg_flags) {
1096 		fs_warn(sdp, "GFS2: rgd: %llu lvb flag mismatch %u/%u",
1097 			(unsigned long long)rgd->rd_addr,
1098 		       be32_to_cpu(rgl->rl_flags), be32_to_cpu(str->rg_flags));
1099 		valid = 0;
1100 	}
1101 	if (rgl->rl_free != str->rg_free) {
1102 		fs_warn(sdp, "GFS2: rgd: %llu lvb free mismatch %u/%u",
1103 			(unsigned long long)rgd->rd_addr,
1104 			be32_to_cpu(rgl->rl_free), be32_to_cpu(str->rg_free));
1105 		valid = 0;
1106 	}
1107 	if (rgl->rl_dinodes != str->rg_dinodes) {
1108 		fs_warn(sdp, "GFS2: rgd: %llu lvb dinode mismatch %u/%u",
1109 			(unsigned long long)rgd->rd_addr,
1110 			be32_to_cpu(rgl->rl_dinodes),
1111 			be32_to_cpu(str->rg_dinodes));
1112 		valid = 0;
1113 	}
1114 	if (rgl->rl_igeneration != str->rg_igeneration) {
1115 		fs_warn(sdp, "GFS2: rgd: %llu lvb igen mismatch %llu/%llu",
1116 			(unsigned long long)rgd->rd_addr,
1117 			(unsigned long long)be64_to_cpu(rgl->rl_igeneration),
1118 			(unsigned long long)be64_to_cpu(str->rg_igeneration));
1119 		valid = 0;
1120 	}
1121 	return valid;
1122 }
1123 
1124 static u32 count_unlinked(struct gfs2_rgrpd *rgd)
1125 {
1126 	struct gfs2_bitmap *bi;
1127 	const u32 length = rgd->rd_length;
1128 	const u8 *buffer = NULL;
1129 	u32 i, goal, count = 0;
1130 
1131 	for (i = 0, bi = rgd->rd_bits; i < length; i++, bi++) {
1132 		goal = 0;
1133 		buffer = bi->bi_bh->b_data + bi->bi_offset;
1134 		WARN_ON(!buffer_uptodate(bi->bi_bh));
1135 		while (goal < bi->bi_blocks) {
1136 			goal = gfs2_bitfit(buffer, bi->bi_bytes, goal,
1137 					   GFS2_BLKST_UNLINKED);
1138 			if (goal == BFITNOENT)
1139 				break;
1140 			count++;
1141 			goal++;
1142 		}
1143 	}
1144 
1145 	return count;
1146 }
1147 
1148 
1149 /**
1150  * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
1151  * @rgd: the struct gfs2_rgrpd describing the RG to read in
1152  *
1153  * Read in all of a Resource Group's header and bitmap blocks.
1154  * Caller must eventually call gfs2_rgrp_brelse() to free the bitmaps.
1155  *
1156  * Returns: errno
1157  */
1158 
1159 static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
1160 {
1161 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1162 	struct gfs2_glock *gl = rgd->rd_gl;
1163 	unsigned int length = rgd->rd_length;
1164 	struct gfs2_bitmap *bi;
1165 	unsigned int x, y;
1166 	int error;
1167 
1168 	if (rgd->rd_bits[0].bi_bh != NULL)
1169 		return 0;
1170 
1171 	for (x = 0; x < length; x++) {
1172 		bi = rgd->rd_bits + x;
1173 		error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, 0, &bi->bi_bh);
1174 		if (error)
1175 			goto fail;
1176 	}
1177 
1178 	for (y = length; y--;) {
1179 		bi = rgd->rd_bits + y;
1180 		error = gfs2_meta_wait(sdp, bi->bi_bh);
1181 		if (error)
1182 			goto fail;
1183 		if (gfs2_metatype_check(sdp, bi->bi_bh, y ? GFS2_METATYPE_RB :
1184 					      GFS2_METATYPE_RG)) {
1185 			error = -EIO;
1186 			goto fail;
1187 		}
1188 	}
1189 
1190 	if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) {
1191 		for (x = 0; x < length; x++)
1192 			clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags);
1193 		gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
1194 		rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
1195 		rgd->rd_free_clone = rgd->rd_free;
1196 		/* max out the rgrp allocation failure point */
1197 		rgd->rd_extfail_pt = rgd->rd_free;
1198 	}
1199 	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
1200 		rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd));
1201 		gfs2_rgrp_ondisk2lvb(rgd->rd_rgl,
1202 				     rgd->rd_bits[0].bi_bh->b_data);
1203 	}
1204 	else if (sdp->sd_args.ar_rgrplvb) {
1205 		if (!gfs2_rgrp_lvb_valid(rgd)) {
1206 			gfs2_consist_rgrpd(rgd);
1207 			error = -EIO;
1208 			goto fail;
1209 		}
1210 		if (rgd->rd_rgl->rl_unlinked == 0)
1211 			rgd->rd_flags &= ~GFS2_RDF_CHECK;
1212 	}
1213 	return 0;
1214 
1215 fail:
1216 	while (x--) {
1217 		bi = rgd->rd_bits + x;
1218 		brelse(bi->bi_bh);
1219 		bi->bi_bh = NULL;
1220 		gfs2_assert_warn(sdp, !bi->bi_clone);
1221 	}
1222 
1223 	return error;
1224 }
1225 
1226 static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
1227 {
1228 	u32 rl_flags;
1229 
1230 	if (rgd->rd_flags & GFS2_RDF_UPTODATE)
1231 		return 0;
1232 
1233 	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic)
1234 		return gfs2_rgrp_bh_get(rgd);
1235 
1236 	rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags);
1237 	rl_flags &= ~GFS2_RDF_MASK;
1238 	rgd->rd_flags &= GFS2_RDF_MASK;
1239 	rgd->rd_flags |= (rl_flags | GFS2_RDF_CHECK);
1240 	if (rgd->rd_rgl->rl_unlinked == 0)
1241 		rgd->rd_flags &= ~GFS2_RDF_CHECK;
1242 	rgd->rd_free = be32_to_cpu(rgd->rd_rgl->rl_free);
1243 	rgd->rd_free_clone = rgd->rd_free;
1244 	rgd->rd_dinodes = be32_to_cpu(rgd->rd_rgl->rl_dinodes);
1245 	rgd->rd_igeneration = be64_to_cpu(rgd->rd_rgl->rl_igeneration);
1246 	return 0;
1247 }
1248 
1249 int gfs2_rgrp_go_lock(struct gfs2_holder *gh)
1250 {
1251 	struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
1252 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1253 
1254 	if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb)
1255 		return 0;
1256 	return gfs2_rgrp_bh_get(rgd);
1257 }
1258 
1259 /**
1260  * gfs2_rgrp_brelse - Release RG bitmaps read in with gfs2_rgrp_bh_get()
1261  * @rgd: The resource group
1262  *
1263  */
1264 
1265 void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd)
1266 {
1267 	int x, length = rgd->rd_length;
1268 
1269 	for (x = 0; x < length; x++) {
1270 		struct gfs2_bitmap *bi = rgd->rd_bits + x;
1271 		if (bi->bi_bh) {
1272 			brelse(bi->bi_bh);
1273 			bi->bi_bh = NULL;
1274 		}
1275 	}
1276 }
1277 
1278 int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
1279 			     struct buffer_head *bh,
1280 			     const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed)
1281 {
1282 	struct super_block *sb = sdp->sd_vfs;
1283 	u64 blk;
1284 	sector_t start = 0;
1285 	sector_t nr_blks = 0;
1286 	int rv;
1287 	unsigned int x;
1288 	u32 trimmed = 0;
1289 	u8 diff;
1290 
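	/*
	 * In the loop below, (x | x >> 1) & 0x55 yields a 1 in the low bit
	 * of each two-bit entry that is in use (state != 0).  With @bh
	 * supplied, diff therefore flags blocks that are free in @bh's
	 * bitmap but still held in the clone, i.e. blocks freed since the
	 * clone was taken; with no @bh it simply flags every free block
	 * (the fstrim case).
	 */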
1291 	for (x = 0; x < bi->bi_bytes; x++) {
1292 		const u8 *clone = bi->bi_clone ? bi->bi_clone : bi->bi_bh->b_data;
1293 		clone += bi->bi_offset;
1294 		clone += x;
1295 		if (bh) {
1296 			const u8 *orig = bh->b_data + bi->bi_offset + x;
1297 			diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1));
1298 		} else {
1299 			diff = ~(*clone | (*clone >> 1));
1300 		}
1301 		diff &= 0x55;
1302 		if (diff == 0)
1303 			continue;
1304 		blk = offset + ((bi->bi_start + x) * GFS2_NBBY);
1305 		while(diff) {
1306 			if (diff & 1) {
1307 				if (nr_blks == 0)
1308 					goto start_new_extent;
1309 				if ((start + nr_blks) != blk) {
1310 					if (nr_blks >= minlen) {
1311 						rv = sb_issue_discard(sb,
1312 							start, nr_blks,
1313 							GFP_NOFS, 0);
1314 						if (rv)
1315 							goto fail;
1316 						trimmed += nr_blks;
1317 					}
1318 					nr_blks = 0;
1319 start_new_extent:
1320 					start = blk;
1321 				}
1322 				nr_blks++;
1323 			}
1324 			diff >>= 2;
1325 			blk++;
1326 		}
1327 	}
1328 	if (nr_blks >= minlen) {
1329 		rv = sb_issue_discard(sb, start, nr_blks, GFP_NOFS, 0);
1330 		if (rv)
1331 			goto fail;
1332 		trimmed += nr_blks;
1333 	}
1334 	if (ptrimmed)
1335 		*ptrimmed = trimmed;
1336 	return 0;
1337 
1338 fail:
1339 	if (sdp->sd_args.ar_discard)
1340 		fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem\n", rv);
1341 	sdp->sd_args.ar_discard = 0;
1342 	return -EIO;
1343 }
1344 
1345 /**
1346  * gfs2_fitrim - Generate discard requests for unused bits of the filesystem
1347  * @filp: Any file on the filesystem
1348  * @argp: Pointer to the arguments (also used to pass result)
1349  *
1350  * Returns: 0 on success, otherwise error code
1351  */
1352 
1353 int gfs2_fitrim(struct file *filp, void __user *argp)
1354 {
1355 	struct inode *inode = file_inode(filp);
1356 	struct gfs2_sbd *sdp = GFS2_SB(inode);
1357 	struct request_queue *q = bdev_get_queue(sdp->sd_vfs->s_bdev);
1358 	struct buffer_head *bh;
1359 	struct gfs2_rgrpd *rgd;
1360 	struct gfs2_rgrpd *rgd_end;
1361 	struct gfs2_holder gh;
1362 	struct fstrim_range r;
1363 	int ret = 0;
1364 	u64 amt;
1365 	u64 trimmed = 0;
1366 	u64 start, end, minlen;
1367 	unsigned int x;
1368 	unsigned bs_shift = sdp->sd_sb.sb_bsize_shift;
1369 
1370 	if (!capable(CAP_SYS_ADMIN))
1371 		return -EPERM;
1372 
1373 	if (!blk_queue_discard(q))
1374 		return -EOPNOTSUPP;
1375 
1376 	if (copy_from_user(&r, argp, sizeof(r)))
1377 		return -EFAULT;
1378 
1379 	ret = gfs2_rindex_update(sdp);
1380 	if (ret)
1381 		return ret;
1382 
1383 	start = r.start >> bs_shift;
1384 	end = start + (r.len >> bs_shift);
1385 	minlen = max_t(u64, r.minlen,
1386 		       q->limits.discard_granularity) >> bs_shift;
1387 
1388 	if (end <= start || minlen > sdp->sd_max_rg_data)
1389 		return -EINVAL;
1390 
1391 	rgd = gfs2_blk2rgrpd(sdp, start, 0);
1392 	rgd_end = gfs2_blk2rgrpd(sdp, end, 0);
1393 
1394 	if ((gfs2_rgrpd_get_first(sdp) == gfs2_rgrpd_get_next(rgd_end))
1395 	    && (start > rgd_end->rd_data0 + rgd_end->rd_data))
1396 		return -EINVAL; /* start is beyond the end of the fs */
1397 
1398 	while (1) {
1399 
1400 		ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh);
1401 		if (ret)
1402 			goto out;
1403 
1404 		if (!(rgd->rd_flags & GFS2_RGF_TRIMMED)) {
1405 			/* Trim each bitmap in the rgrp */
1406 			for (x = 0; x < rgd->rd_length; x++) {
1407 				struct gfs2_bitmap *bi = rgd->rd_bits + x;
1408 				ret = gfs2_rgrp_send_discards(sdp,
1409 						rgd->rd_data0, NULL, bi, minlen,
1410 						&amt);
1411 				if (ret) {
1412 					gfs2_glock_dq_uninit(&gh);
1413 					goto out;
1414 				}
1415 				trimmed += amt;
1416 			}
1417 
1418 			/* Mark rgrp as having been trimmed */
1419 			ret = gfs2_trans_begin(sdp, RES_RG_HDR, 0);
1420 			if (ret == 0) {
1421 				bh = rgd->rd_bits[0].bi_bh;
1422 				rgd->rd_flags |= GFS2_RGF_TRIMMED;
1423 				gfs2_trans_add_meta(rgd->rd_gl, bh);
1424 				gfs2_rgrp_out(rgd, bh->b_data);
1425 				gfs2_trans_end(sdp);
1426 			}
1427 		}
1428 		gfs2_glock_dq_uninit(&gh);
1429 
1430 		if (rgd == rgd_end)
1431 			break;
1432 
1433 		rgd = gfs2_rgrpd_get_next(rgd);
1434 	}
1435 
1436 out:
1437 	r.len = trimmed << bs_shift;
1438 	if (copy_to_user(argp, &r, sizeof(r)))
1439 		return -EFAULT;
1440 
1441 	return ret;
1442 }
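
/*
 * Userspace reaches this through the generic FITRIM ioctl; a minimal
 * sketch (fd being any open file on the gfs2 mount):
 *
 *	struct fstrim_range r = { .start = 0, .len = ULLONG_MAX };
 *	if (ioctl(fd, FITRIM, &r) == 0)
 *		printf("%llu bytes trimmed\n", r.len);
 */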
1443 
1444 /**
1445  * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree
1446  * @ip: the inode structure
1447  *
1448  */
1449 static void rs_insert(struct gfs2_inode *ip)
1450 {
1451 	struct rb_node **newn, *parent = NULL;
1452 	int rc;
1453 	struct gfs2_blkreserv *rs = &ip->i_res;
1454 	struct gfs2_rgrpd *rgd = rs->rs_rbm.rgd;
1455 	u64 fsblock = gfs2_rbm_to_block(&rs->rs_rbm);
1456 
1457 	BUG_ON(gfs2_rs_active(rs));
1458 
1459 	spin_lock(&rgd->rd_rsspin);
1460 	newn = &rgd->rd_rstree.rb_node;
1461 	while (*newn) {
1462 		struct gfs2_blkreserv *cur =
1463 			rb_entry(*newn, struct gfs2_blkreserv, rs_node);
1464 
1465 		parent = *newn;
1466 		rc = rs_cmp(fsblock, rs->rs_free, cur);
1467 		if (rc > 0)
1468 			newn = &((*newn)->rb_right);
1469 		else if (rc < 0)
1470 			newn = &((*newn)->rb_left);
1471 		else {
1472 			spin_unlock(&rgd->rd_rsspin);
1473 			WARN_ON(1);
1474 			return;
1475 		}
1476 	}
1477 
1478 	rb_link_node(&rs->rs_node, parent, newn);
1479 	rb_insert_color(&rs->rs_node, &rgd->rd_rstree);
1480 
1481 	/* Do our rgrp accounting for the reservation */
1482 	rgd->rd_reserved += rs->rs_free; /* blocks reserved */
1483 	spin_unlock(&rgd->rd_rsspin);
1484 	trace_gfs2_rs(rs, TRACE_RS_INSERT);
1485 }
1486 
1487 /**
1488  * rgd_free - return the number of free blocks we can allocate.
1489  * @rgd: the resource group
1490  *
1491  * This function returns the number of free blocks for an rgrp.
1492  * That's the clone-free blocks (blocks that are free, not including those
1493  * still being used for unlinked files that haven't been deleted).
1494  *
1495  * It also subtracts any blocks reserved by someone else, but does not
1496  * include free blocks that are still part of our current reservation,
1497  * because obviously we can (and will) allocate them.
1498  */
1499 static inline u32 rgd_free(struct gfs2_rgrpd *rgd, struct gfs2_blkreserv *rs)
1500 {
1501 	u32 tot_reserved, tot_free;
1502 
1503 	if (WARN_ON_ONCE(rgd->rd_reserved < rs->rs_free))
1504 		return 0;
1505 	tot_reserved = rgd->rd_reserved - rs->rs_free;
1506 
1507 	if (rgd->rd_free_clone < tot_reserved)
1508 		tot_reserved = 0;
1509 
1510 	tot_free = rgd->rd_free_clone - tot_reserved;
1511 
1512 	return tot_free;
1513 }
1514 
1515 /**
1516  * rg_mblk_search - find a group of multiple free blocks to form a reservation
1517  * @rgd: the resource group descriptor
1518  * @ip: pointer to the inode for which we're reserving blocks
1519  * @ap: the allocation parameters
1520  *
1521  */
1522 
1523 static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
1524 			   const struct gfs2_alloc_parms *ap)
1525 {
1526 	struct gfs2_rbm rbm = { .rgd = rgd, };
1527 	u64 goal;
1528 	struct gfs2_blkreserv *rs = &ip->i_res;
1529 	u32 extlen;
1530 	u32 free_blocks = rgd_free(rgd, rs);
1531 	int ret;
1532 	struct inode *inode = &ip->i_inode;
1533 
1534 	if (S_ISDIR(inode->i_mode))
1535 		extlen = 1;
1536 	else {
1537 		extlen = max_t(u32, atomic_read(&ip->i_sizehint), ap->target);
1538 		extlen = clamp(extlen, (u32)RGRP_RSRV_MINBLKS, free_blocks);
1539 	}
1540 	if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen))
1541 		return;
1542 
1543 	/* Find bitmap block that contains bits for goal block */
1544 	if (rgrp_contains_block(rgd, ip->i_goal))
1545 		goal = ip->i_goal;
1546 	else
1547 		goal = rgd->rd_last_alloc + rgd->rd_data0;
1548 
1549 	if (WARN_ON(gfs2_rbm_from_block(&rbm, goal)))
1550 		return;
1551 
1552 	ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true);
1553 	if (ret == 0) {
1554 		rs->rs_rbm = rbm;
1555 		rs->rs_free = extlen;
1556 		rs_insert(ip);
1557 	} else {
1558 		if (goal == rgd->rd_last_alloc + rgd->rd_data0)
1559 			rgd->rd_last_alloc = 0;
1560 	}
1561 }
1562 
1563 /**
1564  * gfs2_next_unreserved_block - Return next block that is not reserved
1565  * @rgd: The resource group
1566  * @block: The starting block
1567  * @length: The required length
1568  * @ip: Ignore any reservations for this inode
1569  *
1570  * If the block does not appear in any reservation, then return the
1571  * block number unchanged. If it does appear in the reservation, then
1572  * keep looking through the tree of reservations in order to find the
1573  * first block number which is not reserved.
1574  */
1575 
1576 static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
1577 				      u32 length,
1578 				      const struct gfs2_inode *ip)
1579 {
1580 	struct gfs2_blkreserv *rs;
1581 	struct rb_node *n;
1582 	int rc;
1583 
1584 	spin_lock(&rgd->rd_rsspin);
1585 	n = rgd->rd_rstree.rb_node;
1586 	while (n) {
1587 		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
1588 		rc = rs_cmp(block, length, rs);
1589 		if (rc < 0)
1590 			n = n->rb_left;
1591 		else if (rc > 0)
1592 			n = n->rb_right;
1593 		else
1594 			break;
1595 	}
1596 
1597 	if (n) {
1598 		while ((rs_cmp(block, length, rs) == 0) && (&ip->i_res != rs)) {
1599 			block = gfs2_rbm_to_block(&rs->rs_rbm) + rs->rs_free;
1600 			n = n->rb_right;
1601 			if (n == NULL)
1602 				break;
1603 			rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
1604 		}
1605 	}
1606 
1607 	spin_unlock(&rgd->rd_rsspin);
1608 	return block;
1609 }
1610 
1611 /**
1612  * gfs2_reservation_check_and_update - Check for reservations during block alloc
1613  * @rbm: The current position in the resource group
1614  * @ip: The inode for which we are searching for blocks
1615  * @minext: The minimum extent length
1616  * @maxext: A pointer to the maximum extent structure
1617  *
1618  * This checks the current position in the rgrp to see whether there is
1619  * a reservation covering this block. If not then this function is a
1620  * no-op. If there is, then the position is moved to the end of the
1621  * contiguous reservation(s) so that we are pointing at the first
1622  * non-reserved block.
1623  *
1624  * Returns: 0 if no reservation, 1 if @rbm has changed, otherwise an error
1625  */
1626 
1627 static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
1628 					     const struct gfs2_inode *ip,
1629 					     u32 minext,
1630 					     struct gfs2_extent *maxext)
1631 {
1632 	u64 block = gfs2_rbm_to_block(rbm);
1633 	u32 extlen = 1;
1634 	u64 nblock;
1635 	int ret;
1636 
1637 	/*
1638 	 * If we have a minimum extent length, then skip over any extent
1639 	 * which is less than the min extent length in size.
1640 	 */
1641 	if (minext) {
1642 		extlen = gfs2_free_extlen(rbm, minext);
1643 		if (extlen <= maxext->len)
1644 			goto fail;
1645 	}
1646 
1647 	/*
1648 	 * Check the extent which has been found against the reservations
1649 	 * and skip if parts of it are already reserved
1650 	 */
1651 	nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip);
1652 	if (nblock == block) {
1653 		if (!minext || extlen >= minext)
1654 			return 0;
1655 
1656 		if (extlen > maxext->len) {
1657 			maxext->len = extlen;
1658 			maxext->rbm = *rbm;
1659 		}
1660 fail:
1661 		nblock = block + extlen;
1662 	}
1663 	ret = gfs2_rbm_from_block(rbm, nblock);
1664 	if (ret < 0)
1665 		return ret;
1666 	return 1;
1667 }
1668 
1669 /**
1670  * gfs2_rbm_find - Look for blocks of a particular state
1671  * @rbm: Value/result starting position and final position
1672  * @state: The state which we want to find
1673  * @minext: Pointer to the requested extent length (NULL for a single block)
1674  *          This is updated to be the actual reservation size.
1675  * @ip: If set, check for reservations
1676  * @nowrap: Stop looking at the end of the rgrp, rather than wrapping
1677  *          around until we've reached the starting point.
1678  *
1679  * Side effects:
1680  * - If looking for free blocks, we set GBF_FULL on each bitmap which
1681  *   has no free blocks in it.
1682  * - If looking for free blocks, we set rd_extfail_pt on each rgrp which
1683  *   has come up short on a free block search.
1684  *
1685  * Returns: 0 on success, -ENOSPC if there is no block of the requested state
1686  */
1687 
1688 static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
1689 			 const struct gfs2_inode *ip, bool nowrap)
1690 {
1691 	bool scan_from_start = rbm->bii == 0 && rbm->offset == 0;
1692 	struct buffer_head *bh;
1693 	int last_bii;
1694 	u32 offset;
1695 	u8 *buffer;
1696 	bool wrapped = false;
1697 	int ret;
1698 	struct gfs2_bitmap *bi;
1699 	struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, };
1700 
1701 	/*
1702 	 * Determine the last bitmap to search.  If we're not starting at the
1703 	 * beginning of a bitmap, we need to search that bitmap twice to scan
1704 	 * the entire resource group.
1705 	 */
1706 	last_bii = rbm->bii - (rbm->offset == 0);
1707 
1708 	while(1) {
1709 		bi = rbm_bi(rbm);
1710 		if ((ip == NULL || !gfs2_rs_active(&ip->i_res)) &&
1711 		    test_bit(GBF_FULL, &bi->bi_flags) &&
1712 		    (state == GFS2_BLKST_FREE))
1713 			goto next_bitmap;
1714 
1715 		bh = bi->bi_bh;
1716 		buffer = bh->b_data + bi->bi_offset;
1717 		WARN_ON(!buffer_uptodate(bh));
1718 		if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
1719 			buffer = bi->bi_clone + bi->bi_offset;
1720 		offset = gfs2_bitfit(buffer, bi->bi_bytes, rbm->offset, state);
1721 		if (offset == BFITNOENT) {
1722 			if (state == GFS2_BLKST_FREE && rbm->offset == 0)
1723 				set_bit(GBF_FULL, &bi->bi_flags);
1724 			goto next_bitmap;
1725 		}
1726 		rbm->offset = offset;
1727 		if (ip == NULL)
1728 			return 0;
1729 
1730 		ret = gfs2_reservation_check_and_update(rbm, ip,
1731 							minext ? *minext : 0,
1732 							&maxext);
1733 		if (ret == 0)
1734 			return 0;
1735 		if (ret > 0)
1736 			goto next_iter;
1737 		if (ret == -E2BIG) {
1738 			rbm->bii = 0;
1739 			rbm->offset = 0;
1740 			goto res_covered_end_of_rgrp;
1741 		}
1742 		return ret;
1743 
1744 next_bitmap:	/* Find next bitmap in the rgrp */
1745 		rbm->offset = 0;
1746 		rbm->bii++;
1747 		if (rbm->bii == rbm->rgd->rd_length)
1748 			rbm->bii = 0;
1749 res_covered_end_of_rgrp:
1750 		if (rbm->bii == 0) {
1751 			if (wrapped)
1752 				break;
1753 			wrapped = true;
1754 			if (nowrap)
1755 				break;
1756 		}
1757 next_iter:
1758 		/* Have we scanned the entire resource group? */
1759 		if (wrapped && rbm->bii > last_bii)
1760 			break;
1761 	}
1762 
1763 	if (minext == NULL || state != GFS2_BLKST_FREE)
1764 		return -ENOSPC;
1765 
1766 	/* If the extent was too small, and it's smaller than the smallest
1767 	   to have failed before, remember for future reference that it's
1768 	   useless to search this rgrp again for this amount or more. */
1769 	if (wrapped && (scan_from_start || rbm->bii > last_bii) &&
1770 	    *minext < rbm->rgd->rd_extfail_pt)
1771 		rbm->rgd->rd_extfail_pt = *minext;
1772 
1773 	/* If the maximum extent we found is big enough to fulfill the
1774 	   minimum requirements, use it anyway. */
1775 	if (maxext.len) {
1776 		*rbm = maxext.rbm;
1777 		*minext = maxext.len;
1778 		return 0;
1779 	}
1780 
1781 	return -ENOSPC;
1782 }
1783 
1784 /**
1785  * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes
1786  * @rgd: The rgrp
1787  * @last_unlinked: block address of the last dinode we unlinked
1788  * @skip: block address we should explicitly not unlink
1789  *
1790  * Returns: nothing; any unlinked but allocated dinodes found are
1791  *          queued for deletion via the iopen glock's delete work.
1792  */
1793 
1794 static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
1795 {
1796 	u64 block;
1797 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1798 	struct gfs2_glock *gl;
1799 	struct gfs2_inode *ip;
1800 	int error;
1801 	int found = 0;
1802 	struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 };
1803 
1804 	while (1) {
1805 		error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL,
1806 				      true);
1807 		if (error == -ENOSPC)
1808 			break;
1809 		if (WARN_ON_ONCE(error))
1810 			break;
1811 
1812 		block = gfs2_rbm_to_block(&rbm);
1813 		if (gfs2_rbm_from_block(&rbm, block + 1))
1814 			break;
1815 		if (*last_unlinked != NO_BLOCK && block <= *last_unlinked)
1816 			continue;
1817 		if (block == skip)
1818 			continue;
1819 		*last_unlinked = block;
1820 
1821 		error = gfs2_glock_get(sdp, block, &gfs2_iopen_glops, CREATE, &gl);
1822 		if (error)
1823 			continue;
1824 
1825 		/* If the inode is already in cache, we can ignore it here
1826 		 * because the existing inode disposal code will deal with
1827 		 * it when all refs have gone away. Accessing gl_object like
1828 		 * this is not safe in general. Here it is ok because we do
1829 		 * not dereference the pointer, and we only need an approx
1830 		 * answer to whether it is NULL or not.
1831 		 */
1832 		ip = gl->gl_object;
1833 
1834 		if (ip || !gfs2_queue_delete_work(gl, 0))
1835 			gfs2_glock_put(gl);
1836 		else
1837 			found++;
1838 
1839 		/* Limit reclaim to sensible number of tasks */
1840 		if (found > NR_CPUS)
1841 			return;
1842 	}
1843 
1844 	rgd->rd_flags &= ~GFS2_RDF_CHECK;
1845 	return;
1846 }
1847 
1848 /**
1849  * gfs2_rgrp_congested - Use stats to figure out whether an rgrp is congested
1850  * @rgd: The rgrp in question
1851  * @loops: An indication of how picky we can be (0=very, 1=less so)
1852  *
1853  * This function uses the recently added glock statistics in order to
1854  * figure out whether a particular resource group is suffering from
1855  * contention from multiple nodes. This is done purely on the basis
1856  * of timings, since this is the only data we have to work with and
1857  * our aim here is to reject a resource group which is highly contended
1858  * but (very important) not to do this too often in order to ensure that
1859  * we do not end up introducing fragmentation by changing resource
1860  * groups when not actually required.
1861  *
1862  * The calculation is fairly simple: we want to know whether the SRTTB
1863  * (i.e. smoothed round trip time for blocking operations) to acquire
1864  * the lock for this rgrp's glock is significantly greater than the
1865  * time taken for resource groups on average. We introduce a margin in
1866  * the form of the variable @var which is computed as the sum of the two
1867  * respective variances, multiplied by a factor depending on @loops
1868  * and whether we have a lot of data to base the decision on. This is
1869  * then tested against the square difference of the means in order to
1870  * decide whether the result is statistically significant or not.
1871  *
1872  * Returns: A boolean verdict on the congestion status
1873  */
1874 
1875 static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
1876 {
1877 	const struct gfs2_glock *gl = rgd->rd_gl;
1878 	const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
1879 	struct gfs2_lkstats *st;
1880 	u64 r_dcount, l_dcount;
1881 	u64 l_srttb, a_srttb = 0;
1882 	s64 srttb_diff;
1883 	u64 sqr_diff;
1884 	u64 var;
1885 	int cpu, nonzero = 0;
1886 
1887 	preempt_disable();
1888 	for_each_present_cpu(cpu) {
1889 		st = &per_cpu_ptr(sdp->sd_lkstats, cpu)->lkstats[LM_TYPE_RGRP];
1890 		if (st->stats[GFS2_LKS_SRTTB]) {
1891 			a_srttb += st->stats[GFS2_LKS_SRTTB];
1892 			nonzero++;
1893 		}
1894 	}
1895 	st = &this_cpu_ptr(sdp->sd_lkstats)->lkstats[LM_TYPE_RGRP];
1896 	if (nonzero)
1897 		do_div(a_srttb, nonzero);
1898 	r_dcount = st->stats[GFS2_LKS_DCOUNT];
1899 	var = st->stats[GFS2_LKS_SRTTVARB] +
1900 	      gl->gl_stats.stats[GFS2_LKS_SRTTVARB];
1901 	preempt_enable();
1902 
1903 	l_srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB];
1904 	l_dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT];
1905 
1906 	if ((l_dcount < 1) || (r_dcount < 1) || (a_srttb == 0))
1907 		return false;
1908 
1909 	srttb_diff = a_srttb - l_srttb;
1910 	sqr_diff = srttb_diff * srttb_diff;
1911 
1912 	var *= 2;
1913 	if (l_dcount < 8 || r_dcount < 8)
1914 		var *= 2;
1915 	if (loops == 1)
1916 		var *= 2;
1917 
1918 	return ((srttb_diff < 0) && (sqr_diff > var));
1919 }
1920 
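/*
 * Illustrative sketch (editor's addition, not compiled): the
 * significance test used by gfs2_rgrp_congested() above, restated over
 * plain integers.  Here a_srttb is the cross-CPU average SRTTB for
 * rgrps, l_srttb and var are this rgrp's smoothed RTT and combined
 * variance, and the *_dcount values are the sample counts.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>

static bool congested_model(uint64_t a_srttb, uint64_t l_srttb,
			    uint64_t var, uint64_t l_dcount,
			    uint64_t r_dcount, int loops)
{
	int64_t srttb_diff;
	uint64_t sqr_diff;

	/* Without data on both sides there is nothing to compare. */
	if (l_dcount < 1 || r_dcount < 1 || a_srttb == 0)
		return false;

	srttb_diff = (int64_t)(a_srttb - l_srttb);
	sqr_diff = (uint64_t)(srttb_diff * srttb_diff);

	var *= 2;		/* base margin: sum of variances, doubled */
	if (l_dcount < 8 || r_dcount < 8)
		var *= 2;	/* few samples: demand a bigger difference */
	if (loops == 1)
		var *= 2;	/* later pass: harder to reject an rgrp */

	/* Congested iff this rgrp is slower than average (diff < 0)
	 * by a statistically significant amount. */
	return srttb_diff < 0 && sqr_diff > var;
}
#endif
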
1921 /**
1922  * gfs2_rgrp_used_recently - test if an rgrp's glock was used recently
1923  * @rs: The block reservation with the rgrp to test
1924  * @msecs: The time limit in milliseconds
1925  *
1926  * Returns: True if the rgrp glock has been used within the time limit
1927  */
1928 static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs,
1929 				    u64 msecs)
1930 {
1931 	u64 tdiff;
1932 
1933 	tdiff = ktime_to_ns(ktime_sub(ktime_get_real(),
1934 				      rs->rs_rbm.rgd->rd_gl->gl_dstamp));
1935 
1936 	return tdiff > (msecs * 1000 * 1000);
1937 }
1938 
1939 static u32 gfs2_orlov_skip(const struct gfs2_inode *ip)
1940 {
1941 	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1942 	u32 skip;
1943 
1944 	get_random_bytes(&skip, sizeof(skip));
1945 	return skip % sdp->sd_rgrps;
1946 }
1947 
1948 static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin)
1949 {
1950 	struct gfs2_rgrpd *rgd = *pos;
1951 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1952 
1953 	rgd = gfs2_rgrpd_get_next(rgd);
1954 	if (rgd == NULL)
1955 		rgd = gfs2_rgrpd_get_first(sdp);
1956 	*pos = rgd;
1957 	if (rgd != begin) /* If we didn't wrap */
1958 		return true;
1959 	return false;
1960 }
1961 
1962 /**
1963  * fast_to_acquire - determine if a resource group will be fast to acquire
1964  * @rgd: The rgrp
1965  * If this is one of our preferred rgrps, it should be quicker to acquire,
1966  * because we tried to set ourselves up as dlm lock master.
1967  */
1968 static inline int fast_to_acquire(struct gfs2_rgrpd *rgd)
1969 {
1970 	struct gfs2_glock *gl = rgd->rd_gl;
1971 
1972 	if (gl->gl_state != LM_ST_UNLOCKED && list_empty(&gl->gl_holders) &&
1973 	    !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
1974 	    !test_bit(GLF_DEMOTE, &gl->gl_flags))
1975 		return 1;
1976 	if (rgd->rd_flags & GFS2_RDF_PREFERRED)
1977 		return 1;
1978 	return 0;
1979 }
1980 
1981 /**
1982  * gfs2_inplace_reserve - Reserve space in the filesystem
1983  * @ip: the inode to reserve space for
1984  * @ap: the allocation parameters
1985  *
1986  * We try our best to find an rgrp that has at least ap->target blocks
1987  * available. After a couple of passes (loops == 2), the prospects of finding
1988  * such an rgrp diminish. At this stage, we return the first rgrp that has
1989  * at least ap->min_target blocks available. Either way, we set ap->allowed to
1990  * the number of blocks available in the chosen rgrp.
1991  *
1992  * Returns: 0 on success,
1993  *          -ENOSPC if a suitable rgrp can't be found
1994  *          errno otherwise
1995  */
1996 
1997 int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
1998 {
1999 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2000 	struct gfs2_rgrpd *begin = NULL;
2001 	struct gfs2_blkreserv *rs = &ip->i_res;
2002 	int error = 0, rg_locked, flags = 0;
2003 	u64 last_unlinked = NO_BLOCK;
2004 	int loops = 0;
2005 	u32 free_blocks, skip = 0;
2006 
2007 	if (sdp->sd_args.ar_rgrplvb)
2008 		flags |= GL_SKIP;
2009 	if (gfs2_assert_warn(sdp, ap->target))
2010 		return -EINVAL;
2011 	if (gfs2_rs_active(rs)) {
2012 		begin = rs->rs_rbm.rgd;
2013 	} else if (rs->rs_rbm.rgd &&
2014 		   rgrp_contains_block(rs->rs_rbm.rgd, ip->i_goal)) {
2015 		begin = rs->rs_rbm.rgd;
2016 	} else {
2017 		check_and_update_goal(ip);
2018 		rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
2019 	}
2020 	if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV))
2021 		skip = gfs2_orlov_skip(ip);
2022 	if (rs->rs_rbm.rgd == NULL)
2023 		return -EBADSLT;
2024 
2025 	while (loops < 3) {
2026 		rg_locked = 1;
2027 
2028 		if (!gfs2_glock_is_locked_by_me(rs->rs_rbm.rgd->rd_gl)) {
2029 			rg_locked = 0;
2030 			if (skip && skip--)
2031 				goto next_rgrp;
2032 			if (!gfs2_rs_active(rs)) {
2033 				if (loops == 0 &&
2034 				    !fast_to_acquire(rs->rs_rbm.rgd))
2035 					goto next_rgrp;
2036 				if ((loops < 2) &&
2037 				    gfs2_rgrp_used_recently(rs, 1000) &&
2038 				    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
2039 					goto next_rgrp;
2040 			}
2041 			error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl,
2042 						   LM_ST_EXCLUSIVE, flags,
2043 						   &ip->i_rgd_gh);
2044 			if (unlikely(error))
2045 				return error;
2046 			if (!gfs2_rs_active(rs) && (loops < 2) &&
2047 			    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
2048 				goto skip_rgrp;
2049 			if (sdp->sd_args.ar_rgrplvb) {
2050 				error = update_rgrp_lvb(rs->rs_rbm.rgd);
2051 				if (unlikely(error)) {
2052 					gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2053 					return error;
2054 				}
2055 			}
2056 		}
2057 
2058 		/* Skip unusable resource groups */
2059 		if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC |
2060 						 GFS2_RDF_ERROR)) ||
2061 		    (loops == 0 && ap->target > rs->rs_rbm.rgd->rd_extfail_pt))
2062 			goto skip_rgrp;
2063 
2064 		if (sdp->sd_args.ar_rgrplvb)
2065 			gfs2_rgrp_bh_get(rs->rs_rbm.rgd);
2066 
2067 		/* Get a reservation if we don't already have one */
2068 		if (!gfs2_rs_active(rs))
2069 			rg_mblk_search(rs->rs_rbm.rgd, ip, ap);
2070 
2071 		/* Skip rgrps when we can't get a reservation on first pass */
2072 		if (!gfs2_rs_active(rs) && (loops < 1))
2073 			goto check_rgrp;
2074 
2075 		/* If rgrp has enough free space, use it */
2076 		free_blocks = rgd_free(rs->rs_rbm.rgd, rs);
2077 		if (free_blocks >= ap->target ||
2078 		    (loops == 2 && ap->min_target &&
2079 		     free_blocks >= ap->min_target)) {
2080 			ap->allowed = free_blocks;
2081 			return 0;
2082 		}
2083 check_rgrp:
2084 		/* Check for unlinked inodes which can be reclaimed */
2085 		if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK)
2086 			try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked,
2087 					ip->i_no_addr);
2088 skip_rgrp:
2089 		/* Drop reservation, if we couldn't use reserved rgrp */
2090 		if (gfs2_rs_active(rs))
2091 			gfs2_rs_deltree(rs);
2092 
2093 		/* Unlock rgrp if required */
2094 		if (!rg_locked)
2095 			gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2096 next_rgrp:
2097 		/* Find the next rgrp, and continue looking */
2098 		if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin))
2099 			continue;
2100 		if (skip)
2101 			continue;
2102 
2103 		/* If we've scanned all the rgrps, but found no free blocks
2104 		 * then this checks for some less likely conditions before
2105 		 * trying again.
2106 		 */
2107 		loops++;
2108 		/* Check that fs hasn't grown if writing to rindex */
2109 		if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) {
2110 			error = gfs2_ri_update(ip);
2111 			if (error)
2112 				return error;
2113 		}
2114 		/* Flushing the log may release space */
2115 		if (loops == 2)
2116 			gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
2117 				       GFS2_LFC_INPLACE_RESERVE);
2118 	}
2119 
2120 	return -ENOSPC;
2121 }
2122 
2123 /**
2124  * gfs2_inplace_release - release an inplace reservation
2125  * @ip: the inode the reservation was taken out on
2126  *
2127  * Release a reservation made by gfs2_inplace_reserve().
2128  */
2129 
2130 void gfs2_inplace_release(struct gfs2_inode *ip)
2131 {
2132 	if (gfs2_holder_initialized(&ip->i_rgd_gh))
2133 		gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2134 }
2135 
2136 /**
2137  * gfs2_alloc_extent - allocate an extent from a given bitmap
2138  * @rbm: the resource group information
2139  * @dinode: TRUE if the first block we allocate is for a dinode
2140  * @n: The extent length (value/result)
2141  *
2142  * Add the bitmap buffer to the transaction.
2143  * Set the found bits to the allocation state selected by @dinode.
2144  */
2145 static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode,
2146 			     unsigned int *n)
2147 {
2148 	struct gfs2_rbm pos = { .rgd = rbm->rgd, };
2149 	const unsigned int elen = *n;
2150 	u64 block;
2151 	int ret;
2152 
2153 	*n = 1;
2154 	block = gfs2_rbm_to_block(rbm);
2155 	gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm_bi(rbm)->bi_bh);
2156 	gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
2157 	block++;
2158 	while (*n < elen) {
2159 		ret = gfs2_rbm_from_block(&pos, block);
2160 		if (ret || gfs2_testbit(&pos, true) != GFS2_BLKST_FREE)
2161 			break;
2162 		gfs2_trans_add_meta(pos.rgd->rd_gl, rbm_bi(&pos)->bi_bh);
2163 		gfs2_setbit(&pos, true, GFS2_BLKST_USED);
2164 		(*n)++;
2165 		block++;
2166 	}
2167 }
2168 
2169 /**
2170  * rgblk_free - Change alloc state of given block(s)
2171  * @sdp: the filesystem
2172  * @rgd: the resource group the blocks are in
2173  * @bstart: the start of a run of blocks to free
2174  * @blen: the length of the block run (all must lie within ONE RG!)
2175  * @new_state: GFS2_BLKST_XXX the after-allocation block state
2176  */
2177 
2178 static void rgblk_free(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd,
2179 		       u64 bstart, u32 blen, unsigned char new_state)
2180 {
2181 	struct gfs2_rbm rbm;
2182 	struct gfs2_bitmap *bi, *bi_prev = NULL;
2183 
2184 	rbm.rgd = rgd;
2185 	if (WARN_ON_ONCE(gfs2_rbm_from_block(&rbm, bstart)))
2186 		return;
2187 	while (blen--) {
2188 		bi = rbm_bi(&rbm);
2189 		if (bi != bi_prev) {
2190 			if (!bi->bi_clone) {
2191 				bi->bi_clone = kmalloc(bi->bi_bh->b_size,
2192 						      GFP_NOFS | __GFP_NOFAIL);
2193 				memcpy(bi->bi_clone + bi->bi_offset,
2194 				       bi->bi_bh->b_data + bi->bi_offset,
2195 				       bi->bi_bytes);
2196 			}
2197 			gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
2198 			bi_prev = bi;
2199 		}
2200 		gfs2_setbit(&rbm, false, new_state);
2201 		gfs2_rbm_incr(&rbm);
2202 	}
2203 }
2204 
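/*
 * Illustrative sketch (editor's addition, not compiled): the lazy
 * bitmap cloning in rgblk_free() above, as a standalone model.  On the
 * first change to a bitmap buffer, its current contents are
 * snapshotted; the allocator then searches the snapshot, so blocks
 * freed in the current transaction are not handed out again before the
 * log has been flushed.  The _model names are hypothetical.
 */
#if 0
#include <stdlib.h>
#include <string.h>

struct bitmap_model {
	unsigned char *data;	/* live, journaled bitmap */
	unsigned char *clone;	/* pre-free snapshot, or NULL */
	size_t bytes;
};

/* Take the snapshot before the bitmap's first modification. */
static int clone_before_free(struct bitmap_model *bi)
{
	if (bi->clone)		/* already cloned in this transaction */
		return 0;
	bi->clone = malloc(bi->bytes);
	if (!bi->clone)
		return -1;
	memcpy(bi->clone, bi->data, bi->bytes);
	return 0;
}
#endif
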
2205 /**
2206  * gfs2_rgrp_dump - print out an rgrp
2207  * @seq: The iterator
2208  * @rgd: The rgrp in question
2209  * @fs_id_buf: pointer to file system id (if requested)
2210  *
2211  */
2212 
2213 void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd,
2214 		    const char *fs_id_buf)
2215 {
2216 	struct gfs2_blkreserv *trs;
2217 	const struct rb_node *n;
2218 
2219 	gfs2_print_dbg(seq, "%s R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n",
2220 		       fs_id_buf,
2221 		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
2222 		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
2223 		       rgd->rd_reserved, rgd->rd_extfail_pt);
2224 	if (rgd->rd_sbd->sd_args.ar_rgrplvb) {
2225 		struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
2226 
2227 		gfs2_print_dbg(seq, "%s  L: f:%02x b:%u i:%u\n", fs_id_buf,
2228 			       be32_to_cpu(rgl->rl_flags),
2229 			       be32_to_cpu(rgl->rl_free),
2230 			       be32_to_cpu(rgl->rl_dinodes));
2231 	}
2232 	spin_lock(&rgd->rd_rsspin);
2233 	for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
2234 		trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
2235 		dump_rs(seq, trs, fs_id_buf);
2236 	}
2237 	spin_unlock(&rgd->rd_rsspin);
2238 }
2239 
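/*
 * Editor's note: given the format strings above, output for a purely
 * hypothetical rgrp (all values invented for illustration) would look
 * like:
 *
 *	fsid=myfs: R: n:65536 f:02 b:1000/1000 i:20 r:64 e:0
 *	fsid=myfs:  L: f:02 b:1000 i:20
 *
 * followed by one line per block reservation in the rbtree.
 */
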
2240 static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
2241 {
2242 	struct gfs2_sbd *sdp = rgd->rd_sbd;
2243 	char fs_id_buf[sizeof(sdp->sd_fsname) + 7];
2244 
2245 	fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
2246 		(unsigned long long)rgd->rd_addr);
2247 	fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
2248 	sprintf(fs_id_buf, "fsid=%s: ", sdp->sd_fsname);
2249 	gfs2_rgrp_dump(NULL, rgd, fs_id_buf);
2250 	rgd->rd_flags |= GFS2_RDF_ERROR;
2251 }
2252 
2253 /**
2254  * gfs2_adjust_reservation - Adjust (or remove) a reservation after allocation
2255  * @ip: The inode we have just allocated blocks for
2256  * @rbm: The start of the allocated blocks
2257  * @len: The extent length
2258  *
2259  * Adjusts a reservation after an allocation has taken place. If the
2260  * reservation does not match the allocation, or if it is now empty
2261  * then it is removed.
2262  */
2263 
2264 static void gfs2_adjust_reservation(struct gfs2_inode *ip,
2265 				    const struct gfs2_rbm *rbm, unsigned len)
2266 {
2267 	struct gfs2_blkreserv *rs = &ip->i_res;
2268 	struct gfs2_rgrpd *rgd = rbm->rgd;
2269 	unsigned rlen;
2270 	u64 block;
2271 	int ret;
2272 
2273 	spin_lock(&rgd->rd_rsspin);
2274 	if (gfs2_rs_active(rs)) {
2275 		if (gfs2_rbm_eq(&rs->rs_rbm, rbm)) {
2276 			block = gfs2_rbm_to_block(rbm);
2277 			ret = gfs2_rbm_from_block(&rs->rs_rbm, block + len);
2278 			rlen = min(rs->rs_free, len);
2279 			rs->rs_free -= rlen;
2280 			rgd->rd_reserved -= rlen;
2281 			trace_gfs2_rs(rs, TRACE_RS_CLAIM);
2282 			if (rs->rs_free && !ret)
2283 				goto out;
2284 			/* We used up our block reservation, so we should
2285 			   reserve more blocks next time. */
2286 			atomic_add(RGRP_RSRV_ADDBLKS, &ip->i_sizehint);
2287 		}
2288 		__rs_deltree(rs);
2289 	}
2290 out:
2291 	spin_unlock(&rgd->rd_rsspin);
2292 }
2293 
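/*
 * Editor's worked example (hypothetical numbers) for the logic above:
 * with an active reservation of rs_free = 32 blocks starting at block
 * B, an allocation of len = 10 blocks starting at B gives
 *
 *	rlen = min(rs_free, len) = min(32, 10) = 10
 *	rs_free: 32 -> 22, rd_reserved reduced by 10,
 *	rs_rbm advanced to block B + 10
 *
 * and the shrunken reservation is kept.  If the allocation had emptied
 * the reservation, i_sizehint would be increased (so the next
 * reservation is bigger) and the reservation removed; an allocation
 * that does not start at the reservation removes it as well.
 */
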
2294 /**
2295  * gfs2_set_alloc_start - Set starting point for block allocation
2296  * @rbm: The rbm which will be set to the required location
2297  * @ip: The gfs2 inode
2298  * @dinode: Flag to say if allocation includes a new inode
2299  *
2300  * This sets the starting point from the reservation if one is active;
2301  * otherwise it falls back to guessing a start point based on the
2302  * inode's goal block or the last allocation point in the rgrp.
2303  */
2304 
2305 static void gfs2_set_alloc_start(struct gfs2_rbm *rbm,
2306 				 const struct gfs2_inode *ip, bool dinode)
2307 {
2308 	u64 goal;
2309 
2310 	if (gfs2_rs_active(&ip->i_res)) {
2311 		*rbm = ip->i_res.rs_rbm;
2312 		return;
2313 	}
2314 
2315 	if (!dinode && rgrp_contains_block(rbm->rgd, ip->i_goal))
2316 		goal = ip->i_goal;
2317 	else
2318 		goal = rbm->rgd->rd_last_alloc + rbm->rgd->rd_data0;
2319 
2320 	if (WARN_ON_ONCE(gfs2_rbm_from_block(rbm, goal))) {
2321 		rbm->bii = 0;
2322 		rbm->offset = 0;
2323 	}
2324 }
2325 
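/*
 * Illustrative sketch (editor's addition, not compiled): the goal
 * selection order implemented above, as a standalone decision
 * function.  All parameters are hypothetical stand-ins for the inode
 * and rgrp state that gfs2_set_alloc_start() reads directly.
 */
#if 0
static u64 alloc_start_model(bool have_rsv, u64 rs_start, bool dinode,
			     bool goal_in_rgrp, u64 i_goal,
			     u64 rd_last_alloc, u64 rd_data0)
{
	if (have_rsv)			/* active reservation wins */
		return rs_start;
	if (!dinode && goal_in_rgrp)	/* stay close to the inode's goal */
		return i_goal;
	/* otherwise continue from the rgrp's last allocation point */
	return rd_last_alloc + rd_data0;
}
#endif
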
2326 /**
2327  * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode
2328  * @ip: the inode to allocate the block for
2329  * @bn: Used to return the starting block number
2330  * @nblocks: requested number of blocks/extent length (value/result)
2331  * @dinode: 1 if we're allocating a dinode block, else 0
2332  * @generation: the generation number of the inode
2333  *
2334  * Returns: 0 or error
2335  */
2336 
2337 int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
2338 		      bool dinode, u64 *generation)
2339 {
2340 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2341 	struct buffer_head *dibh;
2342 	struct gfs2_rbm rbm = { .rgd = ip->i_res.rs_rbm.rgd, };
2343 	unsigned int ndata;
2344 	u64 block; /* block, within the file system scope */
2345 	int error;
2346 
2347 	gfs2_set_alloc_start(&rbm, ip, dinode);
2348 	error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false);
2349 
2350 	if (error == -ENOSPC) {
2351 		gfs2_set_alloc_start(&rbm, ip, dinode);
2352 		error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false);
2353 	}
2354 
2355 	/* Since all blocks are reserved in advance, this shouldn't happen */
2356 	if (error) {
2357 		fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d fail_pt=%d\n",
2358 			(unsigned long long)ip->i_no_addr, error, *nblocks,
2359 			test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags),
2360 			rbm.rgd->rd_extfail_pt);
2361 		goto rgrp_error;
2362 	}
2363 
2364 	gfs2_alloc_extent(&rbm, dinode, nblocks);
2365 	block = gfs2_rbm_to_block(&rbm);
2366 	rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0;
2367 	if (gfs2_rs_active(&ip->i_res))
2368 		gfs2_adjust_reservation(ip, &rbm, *nblocks);
2369 	ndata = *nblocks;
2370 	if (dinode)
2371 		ndata--;
2372 
2373 	if (!dinode) {
2374 		ip->i_goal = block + ndata - 1;
2375 		error = gfs2_meta_inode_buffer(ip, &dibh);
2376 		if (error == 0) {
2377 			struct gfs2_dinode *di =
2378 				(struct gfs2_dinode *)dibh->b_data;
2379 			gfs2_trans_add_meta(ip->i_gl, dibh);
2380 			di->di_goal_meta = di->di_goal_data =
2381 				cpu_to_be64(ip->i_goal);
2382 			brelse(dibh);
2383 		}
2384 	}
2385 	if (rbm.rgd->rd_free < *nblocks) {
2386 		fs_warn(sdp, "nblocks=%u\n", *nblocks);
2387 		goto rgrp_error;
2388 	}
2389 
2390 	rbm.rgd->rd_free -= *nblocks;
2391 	if (dinode) {
2392 		rbm.rgd->rd_dinodes++;
2393 		*generation = rbm.rgd->rd_igeneration++;
2394 		if (*generation == 0)
2395 			*generation = rbm.rgd->rd_igeneration++;
2396 	}
2397 
2398 	gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh);
2399 	gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data);
2400 
2401 	gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0);
2402 	if (dinode)
2403 		gfs2_trans_remove_revoke(sdp, block, *nblocks);
2404 
2405 	gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid);
2406 
2407 	rbm.rgd->rd_free_clone -= *nblocks;
2408 	trace_gfs2_block_alloc(ip, rbm.rgd, block, *nblocks,
2409 			       dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
2410 	*bn = block;
2411 	return 0;
2412 
2413 rgrp_error:
2414 	gfs2_rgrp_error(rbm.rgd);
2415 	return -EIO;
2416 }
2417 
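/*
 * Illustrative sketch (editor's addition, not compiled): the expected
 * calling sequence around the allocator above.  Transaction setup,
 * quota checks and full error handling are elided; "write_blocks" and
 * "wanted" are hypothetical.
 */
#if 0
static int write_blocks(struct gfs2_inode *ip, unsigned int wanted)
{
	struct gfs2_alloc_parms ap = { .target = wanted, .min_target = 1 };
	unsigned int nblocks = wanted;
	u64 bn;
	int error;

	error = gfs2_inplace_reserve(ip, &ap);	/* choose and lock an rgrp */
	if (error)
		return error;
	/* ap.allowed now holds the free blocks in the chosen rgrp */
	error = gfs2_alloc_blocks(ip, &bn, &nblocks, false, NULL);
	gfs2_inplace_release(ip);		/* drop the rgrp glock holder */
	return error;	/* on success, nblocks blocks start at bn */
}
#endif
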
2418 /**
2419  * __gfs2_free_blocks - free a contiguous run of block(s)
2420  * @ip: the inode these blocks are being freed from
2421  * @rgd: the resource group the blocks are in
2422  * @bstart: first block of a run of contiguous blocks
2423  * @blen: the length of the block run
2424  * @meta: 1 if the blocks represent metadata
2425  *
2426  */
2427 
2428 void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
2429 			u64 bstart, u32 blen, int meta)
2430 {
2431 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2432 
2433 	rgblk_free(sdp, rgd, bstart, blen, GFS2_BLKST_FREE);
2434 	trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE);
2435 	rgd->rd_free += blen;
2436 	rgd->rd_flags &= ~GFS2_RGF_TRIMMED;
2437 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2438 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2439 
2440 	/* Directories keep their data in the metadata address space */
2441 	if (meta || ip->i_depth || gfs2_is_jdata(ip))
2442 		gfs2_journal_wipe(ip, bstart, blen);
2443 }
2444 
2445 /**
2446  * gfs2_free_meta - free a contiguous run of metadata block(s)
2447  * @ip: the inode these blocks are being freed from
2448  * @rgd: the resource group the blocks are in
2449  * @bstart: first block of a run of contiguous blocks
2450  * @blen: the length of the block run
2451  *
2452  */
2453 
2454 void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
2455 		    u64 bstart, u32 blen)
2456 {
2457 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2458 
2459 	__gfs2_free_blocks(ip, rgd, bstart, blen, 1);
2460 	gfs2_statfs_change(sdp, 0, +blen, 0);
2461 	gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
2462 }
2463 
2464 void gfs2_unlink_di(struct inode *inode)
2465 {
2466 	struct gfs2_inode *ip = GFS2_I(inode);
2467 	struct gfs2_sbd *sdp = GFS2_SB(inode);
2468 	struct gfs2_rgrpd *rgd;
2469 	u64 blkno = ip->i_no_addr;
2470 
2471 	rgd = gfs2_blk2rgrpd(sdp, blkno, true);
2472 	if (!rgd)
2473 		return;
2474 	rgblk_free(sdp, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
2475 	trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
2476 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2477 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2478 	be32_add_cpu(&rgd->rd_rgl->rl_unlinked, 1);
2479 }
2480 
2481 void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
2482 {
2483 	struct gfs2_sbd *sdp = rgd->rd_sbd;
2484 
2485 	rgblk_free(sdp, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
2486 	if (!rgd->rd_dinodes)
2487 		gfs2_consist_rgrpd(rgd);
2488 	rgd->rd_dinodes--;
2489 	rgd->rd_free++;
2490 
2491 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2492 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2493 	be32_add_cpu(&rgd->rd_rgl->rl_unlinked, -1);
2494 
2495 	gfs2_statfs_change(sdp, 0, +1, -1);
2496 	trace_gfs2_block_alloc(ip, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
2497 	gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid);
2498 	gfs2_journal_wipe(ip, ip->i_no_addr, 1);
2499 }
2500 
2501 /**
2502  * gfs2_check_blk_type - Check the type of a block
2503  * @sdp: The superblock
2504  * @no_addr: The block number to check
2505  * @type: The block type we are looking for
2506  *
2507  * Returns: 0 if the block type matches the expected type
2508  *          -ESTALE if it doesn't match
2509  *          or -ve errno if something went wrong while checking
2510  */
2511 
2512 int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
2513 {
2514 	struct gfs2_rgrpd *rgd;
2515 	struct gfs2_holder rgd_gh;
2516 	struct gfs2_rbm rbm;
2517 	int error = -EINVAL;
2518 
2519 	rgd = gfs2_blk2rgrpd(sdp, no_addr, 1);
2520 	if (!rgd)
2521 		goto fail;
2522 
2523 	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
2524 	if (error)
2525 		goto fail;
2526 
2527 	rbm.rgd = rgd;
2528 	error = gfs2_rbm_from_block(&rbm, no_addr);
2529 	if (WARN_ON_ONCE(error))
2530 		goto fail;
2531 
2532 	if (gfs2_testbit(&rbm, false) != type)
2533 		error = -ESTALE;
2534 
2535 	gfs2_glock_dq_uninit(&rgd_gh);
2536 fail:
2537 	return error;
2538 }
2539 
2540 /**
2541  * gfs2_rlist_add - add a RG to a list of RGs
2542  * @ip: the inode
2543  * @rlist: the list of resource groups
2544  * @block: the block
2545  *
2546  * Figure out what RG a block belongs to and add that RG to the list
2547  *
2548  * FIXME: Don't use NOFAIL
2549  *
2550  */
2551 
2552 void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
2553 		    u64 block)
2554 {
2555 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2556 	struct gfs2_rgrpd *rgd;
2557 	struct gfs2_rgrpd **tmp;
2558 	unsigned int new_space;
2559 	unsigned int x;
2560 
2561 	if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
2562 		return;
2563 
2564 	/*
2565 	 * The resource group last accessed is kept in the last position.
2566 	 */
2567 
2568 	if (rlist->rl_rgrps) {
2569 		rgd = rlist->rl_rgd[rlist->rl_rgrps - 1];
2570 		if (rgrp_contains_block(rgd, block))
2571 			return;
2572 		rgd = gfs2_blk2rgrpd(sdp, block, 1);
2573 	} else {
2574 		rgd = ip->i_res.rs_rbm.rgd;
2575 		if (!rgd || !rgrp_contains_block(rgd, block))
2576 			rgd = gfs2_blk2rgrpd(sdp, block, 1);
2577 	}
2578 
2579 	if (!rgd) {
2580 		fs_err(sdp, "rlist_add: no rgrp for block %llu\n",
2581 		       (unsigned long long)block);
2582 		return;
2583 	}
2584 
2585 	for (x = 0; x < rlist->rl_rgrps; x++) {
2586 		if (rlist->rl_rgd[x] == rgd) {
2587 			swap(rlist->rl_rgd[x],
2588 			     rlist->rl_rgd[rlist->rl_rgrps - 1]);
2589 			return;
2590 		}
2591 	}
2592 
2593 	if (rlist->rl_rgrps == rlist->rl_space) {
2594 		new_space = rlist->rl_space + 10;
2595 
2596 		tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
2597 			      GFP_NOFS | __GFP_NOFAIL);
2598 
2599 		if (rlist->rl_rgd) {
2600 			memcpy(tmp, rlist->rl_rgd,
2601 			       rlist->rl_space * sizeof(struct gfs2_rgrpd *));
2602 			kfree(rlist->rl_rgd);
2603 		}
2604 
2605 		rlist->rl_space = new_space;
2606 		rlist->rl_rgd = tmp;
2607 	}
2608 
2609 	rlist->rl_rgd[rlist->rl_rgrps++] = rgd;
2610 }
2611 
2612 /**
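/*
 * Illustrative sketch (editor's addition, not compiled): the
 * most-recently-used bookkeeping of gfs2_rlist_add() above, reduced to
 * a plain pointer array.  A repeated hit on the same rgrp is the
 * common case, so it is kept in the last slot, where it is checked
 * first; a hit elsewhere is swapped into that slot.
 */
#if 0
static void rlist_touch_model(void *arr[], unsigned int nr, unsigned int hit)
{
	void *tmp = arr[hit];

	/* move the entry just used into the "last accessed" position */
	arr[hit] = arr[nr - 1];
	arr[nr - 1] = tmp;
}
#endif
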
2613  * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate
2614  *      and initialize an array of glock holders for them
2615  * @rlist: the list of resource groups
2616  *
2617  * FIXME: Don't use NOFAIL
2618  *
2619  */
2620 
2621 void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist)
2622 {
2623 	unsigned int x;
2624 
2625 	rlist->rl_ghs = kmalloc_array(rlist->rl_rgrps,
2626 				      sizeof(struct gfs2_holder),
2627 				      GFP_NOFS | __GFP_NOFAIL);
2628 	for (x = 0; x < rlist->rl_rgrps; x++)
2629 		gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
2630 				LM_ST_EXCLUSIVE, 0,
2631 				&rlist->rl_ghs[x]);
2632 }
2633 
2634 /**
2635  * gfs2_rlist_free - free a resource group list
2636  * @rlist: the list of resource groups
2637  *
2638  */
2639 
2640 void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
2641 {
2642 	unsigned int x;
2643 
2644 	kfree(rlist->rl_rgd);
2645 
2646 	if (rlist->rl_ghs) {
2647 		for (x = 0; x < rlist->rl_rgrps; x++)
2648 			gfs2_holder_uninit(&rlist->rl_ghs[x]);
2649 		kfree(rlist->rl_ghs);
2650 		rlist->rl_ghs = NULL;
2651 	}
2652 }
2653 
2654