xref: /openbmc/linux/fs/gfs2/rgrp.c (revision 68198dca)
1 /*
2  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
3  * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
4  *
5  * This copyrighted material is made available to anyone wishing to use,
6  * modify, copy, or redistribute it subject to the terms and conditions
7  * of the GNU General Public License version 2.
8  */
9 
10 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11 
12 #include <linux/slab.h>
13 #include <linux/spinlock.h>
14 #include <linux/completion.h>
15 #include <linux/buffer_head.h>
16 #include <linux/fs.h>
17 #include <linux/gfs2_ondisk.h>
18 #include <linux/prefetch.h>
19 #include <linux/blkdev.h>
20 #include <linux/rbtree.h>
21 #include <linux/random.h>
22 
23 #include "gfs2.h"
24 #include "incore.h"
25 #include "glock.h"
26 #include "glops.h"
27 #include "lops.h"
28 #include "meta_io.h"
29 #include "quota.h"
30 #include "rgrp.h"
31 #include "super.h"
32 #include "trans.h"
33 #include "util.h"
34 #include "log.h"
35 #include "inode.h"
36 #include "trace_gfs2.h"
37 
38 #define BFITNOENT ((u32)~0)
39 #define NO_BLOCK ((u64)~0)
40 
41 #if BITS_PER_LONG == 32
42 #define LBITMASK   (0x55555555UL)
43 #define LBITSKIP55 (0x55555555UL)
44 #define LBITSKIP00 (0x00000000UL)
45 #else
46 #define LBITMASK   (0x5555555555555555UL)
47 #define LBITSKIP55 (0x5555555555555555UL)
48 #define LBITSKIP00 (0x0000000000000000UL)
49 #endif
50 
51 /*
52  * These routines are used by the resource group routines (rgrp.c)
53  * to keep track of block allocation.  Each block is represented by two
54  * bits.  So, each byte represents GFS2_NBBY (i.e. 4) blocks.
55  *
56  * 0 = Free
57  * 1 = Used (not metadata)
58  * 2 = Unlinked (still in use) inode
59  * 3 = Used (metadata)
60  */
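/*
 * For illustration, one hypothetical bitmap byte, reading bit pairs from
 * the least significant end:
 *
 *   0xE4 = 0b11100100:  pair 0 = 00 (free),      pair 1 = 01 (used data),
 *                       pair 2 = 10 (unlinked),  pair 3 = 11 (metadata)
 */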
61 
62 struct gfs2_extent {
63 	struct gfs2_rbm rbm;
64 	u32 len;
65 };
66 
67 static const char valid_change[16] = {
68 	        /* current */
69 	/* n */ 0, 1, 1, 1,
70 	/* e */ 1, 0, 0, 0,
71 	/* w */ 0, 0, 0, 1,
72 	        1, 0, 0, 0
73 };
74 
75 static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
76 			 const struct gfs2_inode *ip, bool nowrap);
77 
78 
79 /**
80  * gfs2_setbit - Set a bit in the bitmaps
81  * @rbm: The position of the bit to set
82  * @do_clone: Also set the clone bitmap, if it exists
83  * @new_state: the new state of the block
84  *
85  */
86 
87 static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
88 			       unsigned char new_state)
89 {
90 	unsigned char *byte1, *byte2, *end, cur_state;
91 	struct gfs2_bitmap *bi = rbm_bi(rbm);
92 	unsigned int buflen = bi->bi_len;
93 	const unsigned int bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
94 
95 	byte1 = bi->bi_bh->b_data + bi->bi_offset + (rbm->offset / GFS2_NBBY);
96 	end = bi->bi_bh->b_data + bi->bi_offset + buflen;
97 
98 	BUG_ON(byte1 >= end);
99 
100 	cur_state = (*byte1 >> bit) & GFS2_BIT_MASK;
101 
102 	if (unlikely(!valid_change[new_state * 4 + cur_state])) {
103 		pr_warn("buf_blk = 0x%x old_state=%d, new_state=%d\n",
104 			rbm->offset, cur_state, new_state);
105 		pr_warn("rgrp=0x%llx bi_start=0x%x\n",
106 			(unsigned long long)rbm->rgd->rd_addr, bi->bi_start);
107 		pr_warn("bi_offset=0x%x bi_len=0x%x\n",
108 			bi->bi_offset, bi->bi_len);
109 		dump_stack();
110 		gfs2_consist_rgrpd(rbm->rgd);
111 		return;
112 	}
113 	*byte1 ^= (cur_state ^ new_state) << bit;
114 
115 	if (do_clone && bi->bi_clone) {
116 		byte2 = bi->bi_clone + bi->bi_offset + (rbm->offset / GFS2_NBBY);
117 		cur_state = (*byte2 >> bit) & GFS2_BIT_MASK;
118 		*byte2 ^= (cur_state ^ new_state) << bit;
119 	}
120 }
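/*
 * The XOR update above, on hypothetical values: changing a bit pair from
 * 01 (used data) to 11 (metadata) computes (01 ^ 11) = 10, shifts that to
 * the pair's position and XORs it in, flipping only the changed bit of
 * the pair while the other three pairs in the byte are XORed with zeros
 * and stay untouched.
 */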
121 
122 /**
123  * gfs2_testbit - test a bit in the bitmaps
124  * @rbm: The bit to test
125  *
126  * Returns: The two bit block state of the requested bit
127  */
128 
129 static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm)
130 {
131 	struct gfs2_bitmap *bi = rbm_bi(rbm);
132 	const u8 *buffer = bi->bi_bh->b_data + bi->bi_offset;
133 	const u8 *byte;
134 	unsigned int bit;
135 
136 	byte = buffer + (rbm->offset / GFS2_NBBY);
137 	bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
138 
139 	return (*byte >> bit) & GFS2_BIT_MASK;
140 }
141 
142 /**
143  * gfs2_bit_search
144  * @ptr: Pointer to bitmap data
145  * @mask: Mask to use (normally 0x55555.... but adjusted for search start)
146  * @state: The state we are searching for
147  *
148  * We xor the bitmap data with a pattern which is the bitwise opposite
149  * of what we are looking for; this gives rise to a pattern of ones
150  * wherever there is a match. Since we have two bits per entry, we
151  * take this pattern, shift it down by one place and then AND it with
152  * the original. All the even bit positions (0,2,4, etc) then represent
153  * successful matches, so we mask with 0x55555..... to remove the unwanted
154  * odd bit positions.
155  *
156  * This allows searching of a whole u64 at once (32 blocks) with a
157  * single test (on 64 bit arches).
158  */
159 
160 static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
161 {
162 	u64 tmp;
163 	static const u64 search[] = {
164 		[0] = 0xffffffffffffffffULL,
165 		[1] = 0xaaaaaaaaaaaaaaaaULL,
166 		[2] = 0x5555555555555555ULL,
167 		[3] = 0x0000000000000000ULL,
168 	};
169 	tmp = le64_to_cpu(*ptr) ^ search[state];
170 	tmp &= (tmp >> 1);
171 	tmp &= mask;
172 	return tmp;
173 }
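/*
 * A worked example of gfs2_bit_search() on a single hypothetical byte
 * (rather than a full __le64), looking for GFS2_BLKST_FREE (state 0):
 *
 *   data = 0xE4  (blocks: free, used data, unlinked, metadata)
 *   tmp  = data ^ search[0] = 0xE4 ^ 0xff = 0x1b   (0b00011011)
 *   tmp &= tmp >> 1                       = 0x09   (0b00001001)
 *   tmp &= 0x55 (the mask)                = 0x01
 *
 * Bit 0 survives, so bit pair 0 is the matching (free) block; the stray
 * bit at odd position 3 was removed by the 0x55 mask.
 */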
174 
175 /**
176  * rs_cmp - multi-block reservation range compare
177  * @blk: absolute file system block number of the new reservation
178  * @len: number of blocks in the new reservation
179  * @rs: existing reservation to compare against
180  *
181  * returns: 1 if the block range is beyond the reach of the reservation
182  *         -1 if the block range is before the start of the reservation
183  *          0 if the block range overlaps with the reservation
184  */
185 static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs)
186 {
187 	u64 startblk = gfs2_rbm_to_block(&rs->rs_rbm);
188 
189 	if (blk >= startblk + rs->rs_free)
190 		return 1;
191 	if (blk + len - 1 < startblk)
192 		return -1;
193 	return 0;
194 }
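/*
 * For example (hypothetical numbers): a reservation starting at block
 * 1000 with rs_free = 50 covers blocks 1000-1049, so rs_cmp(1050, 10, rs)
 * returns 1, rs_cmp(990, 10, rs) returns -1 (the range ends at 999), and
 * rs_cmp(995, 10, rs) returns 0 since blocks 1000-1004 overlap.
 */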
195 
196 /**
197  * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
198  *       a block in a given allocation state.
199  * @buf: the buffer that holds the bitmaps
200  * @len: the length (in bytes) of the buffer
201  * @goal: start search at this block's bit-pair (within @buffer)
202  * @state: GFS2_BLKST_XXX the state of the block we're looking for.
203  *
204  * Scope of @goal and returned block number is only within this bitmap buffer,
205  * not the entire rgrp or filesystem.  @buf will be offset from the actual
206  * beginning of a bitmap block buffer, skipping any header structures, but
207  * headers are always a multiple of 64 bits long so that the buffer is
208  * always aligned to a 64 bit boundary.
209  *
210  * The size of the buffer is in bytes, but it is assumed that it is
211  * always ok to read a complete multiple of 64 bits at the end
212  * of the block in case the end is not aligned to a natural boundary.
213  *
214  * Return: the block number (bitmap buffer scope) that was found, or BFITNOENT if none
215  */
216 
217 static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
218 		       u32 goal, u8 state)
219 {
220 	u32 spoint = (goal << 1) & ((8*sizeof(u64)) - 1);
221 	const __le64 *ptr = ((__le64 *)buf) + (goal >> 5);
222 	const __le64 *end = (__le64 *)(buf + ALIGN(len, sizeof(u64)));
223 	u64 tmp;
224 	u64 mask = 0x5555555555555555ULL;
225 	u32 bit;
226 
227 	/* Mask off bits we don't care about at the start of the search */
228 	mask <<= spoint;
229 	tmp = gfs2_bit_search(ptr, mask, state);
230 	ptr++;
231 	while(tmp == 0 && ptr < end) {
232 		tmp = gfs2_bit_search(ptr, 0x5555555555555555ULL, state);
233 		ptr++;
234 	}
235 	/* Mask off any bits which are more than len bytes from the start */
236 	if (ptr == end && (len & (sizeof(u64) - 1)))
237 		tmp &= (((u64)~0) >> (64 - 8*(len & (sizeof(u64) - 1))));
238 	/* Didn't find anything, so return */
239 	if (tmp == 0)
240 		return BFITNOENT;
241 	ptr--;
242 	bit = __ffs64(tmp);
243 	bit /= 2;	/* two bits per entry in the bitmap */
244 	return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit;
245 }
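/*
 * Illustrative arithmetic for the search above: with goal = 100, the scan
 * starts at the u64 word ptr = (__le64 *)buf + (100 >> 5), i.e. word 3,
 * which covers blocks 96-127, and spoint = (100 << 1) & 63 = 8 shifts the
 * 0x5555... mask so that the bit pairs for blocks 96-99 are ignored and
 * the search effectively begins at block 100.
 */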
246 
247 /**
248  * gfs2_rbm_from_block - Set the rbm based upon rgd and block number
249  * @rbm: The rbm with rgd already set correctly
250  * @block: The block number (filesystem relative)
251  *
252  * This sets the bi and offset members of an rbm based on a
253  * resource group and a filesystem relative block number. The
254  * resource group must be set in the rbm on entry, the bi and
255  * offset members will be set by this function.
256  *
257  * Returns: 0 on success, or an error code
258  */
259 
260 static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
261 {
262 	u64 rblock = block - rbm->rgd->rd_data0;
263 
264 	if (WARN_ON_ONCE(rblock > UINT_MAX))
265 		return -EINVAL;
266 	if (block >= rbm->rgd->rd_data0 + rbm->rgd->rd_data)
267 		return -E2BIG;
268 
269 	rbm->bii = 0;
270 	rbm->offset = (u32)(rblock);
271 	/* Check if the block is within the first bitmap block */
272 	if (rbm->offset < rbm_bi(rbm)->bi_blocks)
273 		return 0;
274 
275 	/* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */
276 	rbm->offset += (sizeof(struct gfs2_rgrp) -
277 			sizeof(struct gfs2_meta_header)) * GFS2_NBBY;
278 	rbm->bii = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
279 	rbm->offset -= rbm->bii * rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
280 	return 0;
281 }
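/*
 * The padding above is what makes the divide work: the first bitmap block
 * carries a struct gfs2_rgrp header while later ones carry only a struct
 * gfs2_meta_header, so the first bitmap holds a few entries fewer than
 * the rest. Inflating the offset by that size difference (expressed in
 * blocks) makes all bitmaps appear to be sd_blocks_per_bitmap long, so
 * bii and the in-bitmap offset fall out of one divide and one subtract.
 */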
282 
283 /**
284  * gfs2_rbm_incr - increment an rbm structure
285  * @rbm: The rbm with rgd already set correctly
286  *
287  * This function takes an existing rbm structure and increments it to the next
288  * viable block offset.
289  *
290  * Returns: If incrementing the offset would cause the rbm to go past the
291  *          end of the rgrp, true is returned, otherwise false.
292  *
293  */
294 
295 static bool gfs2_rbm_incr(struct gfs2_rbm *rbm)
296 {
297 	if (rbm->offset + 1 < rbm_bi(rbm)->bi_blocks) { /* in the same bitmap */
298 		rbm->offset++;
299 		return false;
300 	}
301 	if (rbm->bii == rbm->rgd->rd_length - 1) /* at the last bitmap */
302 		return true;
303 
304 	rbm->offset = 0;
305 	rbm->bii++;
306 	return false;
307 }
308 
309 /**
310  * gfs2_unaligned_extlen - Look for free blocks which are not byte aligned
311  * @rbm: Position to search (value/result)
312  * @n_unaligned: Number of unaligned blocks to check
313  * @len: Decremented for each block found (terminate on zero)
314  *
315  * Returns: true if a non-free block is encountered
316  */
317 
318 static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *len)
319 {
320 	u32 n;
321 	u8 res;
322 
323 	for (n = 0; n < n_unaligned; n++) {
324 		res = gfs2_testbit(rbm);
325 		if (res != GFS2_BLKST_FREE)
326 			return true;
327 		(*len)--;
328 		if (*len == 0)
329 			return true;
330 		if (gfs2_rbm_incr(rbm))
331 			return true;
332 	}
333 
334 	return false;
335 }
336 
337 /**
338  * gfs2_free_extlen - Return extent length of free blocks
339  * @rrbm: Starting position
340  * @len: Max length to check
341  *
342  * Starting at the block specified by the rbm, see how many free blocks
343  * there are, not reading more than len blocks ahead. This can be done
344  * using memchr_inv when the blocks are byte aligned, but has to be done
345  * on a block by block basis in case of unaligned blocks. Also this
346  * function can cope with bitmap boundaries (although it must stop on
347  * a resource group boundary)
348  *
349  * Returns: Number of free blocks in the extent
350  */
351 
352 static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
353 {
354 	struct gfs2_rbm rbm = *rrbm;
355 	u32 n_unaligned = rbm.offset & 3;
356 	u32 size = len;
357 	u32 bytes;
358 	u32 chunk_size;
359 	u8 *ptr, *start, *end;
360 	u64 block;
361 	struct gfs2_bitmap *bi;
362 
363 	if (n_unaligned &&
364 	    gfs2_unaligned_extlen(&rbm, 4 - n_unaligned, &len))
365 		goto out;
366 
367 	n_unaligned = len & 3;
368 	/* Start is now byte aligned */
369 	while (len > 3) {
370 		bi = rbm_bi(&rbm);
371 		start = bi->bi_bh->b_data;
372 		if (bi->bi_clone)
373 			start = bi->bi_clone;
374 		end = start + bi->bi_bh->b_size;
375 		start += bi->bi_offset;
376 		BUG_ON(rbm.offset & 3);
377 		start += (rbm.offset / GFS2_NBBY);
378 		bytes = min_t(u32, len / GFS2_NBBY, (end - start));
379 		ptr = memchr_inv(start, 0, bytes);
380 		chunk_size = ((ptr == NULL) ? bytes : (ptr - start));
381 		chunk_size *= GFS2_NBBY;
382 		BUG_ON(len < chunk_size);
383 		len -= chunk_size;
384 		block = gfs2_rbm_to_block(&rbm);
385 		if (gfs2_rbm_from_block(&rbm, block + chunk_size)) {
386 			n_unaligned = 0;
387 			break;
388 		}
389 		if (ptr) {
390 			n_unaligned = 3;
391 			break;
392 		}
393 		n_unaligned = len & 3;
394 	}
395 
396 	/* Deal with any bits left over at the end */
397 	if (n_unaligned)
398 		gfs2_unaligned_extlen(&rbm, n_unaligned, &len);
399 out:
400 	return size - len;
401 }
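/*
 * Note on the aligned fast path above: a 0x00 bitmap byte encodes four
 * consecutive free blocks, so once rbm.offset is a multiple of GFS2_NBBY,
 * memchr_inv(start, 0, bytes) jumps straight to the first byte containing
 * any non-free block and everything before it counts as 4 free blocks
 * per byte.
 */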
402 
403 /**
404  * gfs2_bitcount - count the number of bits in a certain state
405  * @rgd: the resource group descriptor
406  * @buffer: the buffer that holds the bitmaps
407  * @buflen: the length (in bytes) of the buffer
408  * @state: the state of the block we're looking for
409  *
410  * Returns: The number of bits
411  */
412 
413 static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, const u8 *buffer,
414 			 unsigned int buflen, u8 state)
415 {
416 	const u8 *byte = buffer;
417 	const u8 *end = buffer + buflen;
418 	const u8 state1 = state << 2;
419 	const u8 state2 = state << 4;
420 	const u8 state3 = state << 6;
421 	u32 count = 0;
422 
423 	for (; byte < end; byte++) {
424 		if (((*byte) & 0x03) == state)
425 			count++;
426 		if (((*byte) & 0x0C) == state1)
427 			count++;
428 		if (((*byte) & 0x30) == state2)
429 			count++;
430 		if (((*byte) & 0xC0) == state3)
431 			count++;
432 	}
433 
434 	return count;
435 }
436 
437 /**
438  * gfs2_rgrp_verify - Verify that a resource group is consistent
439  * @rgd: the rgrp
440  *
441  */
442 
443 void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
444 {
445 	struct gfs2_sbd *sdp = rgd->rd_sbd;
446 	struct gfs2_bitmap *bi = NULL;
447 	u32 length = rgd->rd_length;
448 	u32 count[4], tmp;
449 	int buf, x;
450 
451 	memset(count, 0, 4 * sizeof(u32));
452 
453 	/* Count # blocks in each of 4 possible allocation states */
454 	for (buf = 0; buf < length; buf++) {
455 		bi = rgd->rd_bits + buf;
456 		for (x = 0; x < 4; x++)
457 			count[x] += gfs2_bitcount(rgd,
458 						  bi->bi_bh->b_data +
459 						  bi->bi_offset,
460 						  bi->bi_len, x);
461 	}
462 
463 	if (count[0] != rgd->rd_free) {
464 		if (gfs2_consist_rgrpd(rgd))
465 			fs_err(sdp, "free data mismatch:  %u != %u\n",
466 			       count[0], rgd->rd_free);
467 		return;
468 	}
469 
470 	tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
471 	if (count[1] != tmp) {
472 		if (gfs2_consist_rgrpd(rgd))
473 			fs_err(sdp, "used data mismatch:  %u != %u\n",
474 			       count[1], tmp);
475 		return;
476 	}
477 
478 	if (count[2] + count[3] != rgd->rd_dinodes) {
479 		if (gfs2_consist_rgrpd(rgd))
480 			fs_err(sdp, "used metadata mismatch:  %u != %u\n",
481 			       count[2] + count[3], rgd->rd_dinodes);
482 		return;
483 	}
484 }
485 
486 /**
487  * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
488  * @sdp: The GFS2 superblock
489  * @blk: The data block number
490  * @exact: True if this needs to be an exact match
491  *
492  * Returns: The resource group, or NULL if not found
493  */
494 
495 struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact)
496 {
497 	struct rb_node *n, *next;
498 	struct gfs2_rgrpd *cur;
499 
500 	spin_lock(&sdp->sd_rindex_spin);
501 	n = sdp->sd_rindex_tree.rb_node;
502 	while (n) {
503 		cur = rb_entry(n, struct gfs2_rgrpd, rd_node);
504 		next = NULL;
505 		if (blk < cur->rd_addr)
506 			next = n->rb_left;
507 		else if (blk >= cur->rd_data0 + cur->rd_data)
508 			next = n->rb_right;
509 		if (next == NULL) {
510 			spin_unlock(&sdp->sd_rindex_spin);
511 			if (exact) {
512 				if (blk < cur->rd_addr)
513 					return NULL;
514 				if (blk >= cur->rd_data0 + cur->rd_data)
515 					return NULL;
516 			}
517 			return cur;
518 		}
519 		n = next;
520 	}
521 	spin_unlock(&sdp->sd_rindex_spin);
522 
523 	return NULL;
524 }
525 
526 /**
527  * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem
528  * @sdp: The GFS2 superblock
529  *
530  * Returns: The first rgrp in the filesystem
531  */
532 
533 struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
534 {
535 	const struct rb_node *n;
536 	struct gfs2_rgrpd *rgd;
537 
538 	spin_lock(&sdp->sd_rindex_spin);
539 	n = rb_first(&sdp->sd_rindex_tree);
540 	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
541 	spin_unlock(&sdp->sd_rindex_spin);
542 
543 	return rgd;
544 }
545 
546 /**
547  * gfs2_rgrpd_get_next - get the next RG
548  * @rgd: the resource group descriptor
549  *
550  * Returns: The next rgrp
551  */
552 
553 struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
554 {
555 	struct gfs2_sbd *sdp = rgd->rd_sbd;
556 	const struct rb_node *n;
557 
558 	spin_lock(&sdp->sd_rindex_spin);
559 	n = rb_next(&rgd->rd_node);
560 	if (n == NULL)
561 		n = rb_first(&sdp->sd_rindex_tree);
562 
563 	if (unlikely(&rgd->rd_node == n)) {
564 		spin_unlock(&sdp->sd_rindex_spin);
565 		return NULL;
566 	}
567 	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
568 	spin_unlock(&sdp->sd_rindex_spin);
569 	return rgd;
570 }
571 
572 void check_and_update_goal(struct gfs2_inode *ip)
573 {
574 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
575 	if (!ip->i_goal || gfs2_blk2rgrpd(sdp, ip->i_goal, 1) == NULL)
576 		ip->i_goal = ip->i_no_addr;
577 }
578 
579 void gfs2_free_clones(struct gfs2_rgrpd *rgd)
580 {
581 	int x;
582 
583 	for (x = 0; x < rgd->rd_length; x++) {
584 		struct gfs2_bitmap *bi = rgd->rd_bits + x;
585 		kfree(bi->bi_clone);
586 		bi->bi_clone = NULL;
587 	}
588 }
589 
590 /**
591  * gfs2_rsqa_alloc - make sure we have a reservation assigned to the inode
592  *                 plus a quota allocation data structure, if necessary
593  * @ip: the inode for this reservation
594  */
595 int gfs2_rsqa_alloc(struct gfs2_inode *ip)
596 {
597 	return gfs2_qa_alloc(ip);
598 }
599 
600 static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs)
601 {
602 	gfs2_print_dbg(seq, "  B: n:%llu s:%llu b:%u f:%u\n",
603 		       (unsigned long long)rs->rs_inum,
604 		       (unsigned long long)gfs2_rbm_to_block(&rs->rs_rbm),
605 		       rs->rs_rbm.offset, rs->rs_free);
606 }
607 
608 /**
609  * __rs_deltree - remove a multi-block reservation from the rgd tree
610  * @rs: The reservation to remove
611  *
612  */
613 static void __rs_deltree(struct gfs2_blkreserv *rs)
614 {
615 	struct gfs2_rgrpd *rgd;
616 
617 	if (!gfs2_rs_active(rs))
618 		return;
619 
620 	rgd = rs->rs_rbm.rgd;
621 	trace_gfs2_rs(rs, TRACE_RS_TREEDEL);
622 	rb_erase(&rs->rs_node, &rgd->rd_rstree);
623 	RB_CLEAR_NODE(&rs->rs_node);
624 
625 	if (rs->rs_free) {
626 		struct gfs2_bitmap *bi = rbm_bi(&rs->rs_rbm);
627 
628 		/* return reserved blocks to the rgrp */
629 		BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free);
630 		rs->rs_rbm.rgd->rd_reserved -= rs->rs_free;
631 		/* The rgrp extent failure point is likely not to increase;
632 		   it will only do so if the freed blocks are somehow
633 		   contiguous with a span of free blocks that follows. Still,
634 		   it will force the number to be recalculated later. */
635 		rgd->rd_extfail_pt += rs->rs_free;
636 		rs->rs_free = 0;
637 		clear_bit(GBF_FULL, &bi->bi_flags);
638 	}
639 }
640 
641 /**
642  * gfs2_rs_deltree - remove a multi-block reservation from the rgd tree
643  * @rs: The reservation to remove
644  *
645  */
646 void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
647 {
648 	struct gfs2_rgrpd *rgd;
649 
650 	rgd = rs->rs_rbm.rgd;
651 	if (rgd) {
652 		spin_lock(&rgd->rd_rsspin);
653 		__rs_deltree(rs);
654 		BUG_ON(rs->rs_free);
655 		spin_unlock(&rgd->rd_rsspin);
656 	}
657 }
658 
659 /**
660  * gfs2_rsqa_delete - delete a multi-block reservation and quota allocation
661  * @ip: The inode for this reservation
662  * @wcount: The inode's write count, or NULL
663  *
664  */
665 void gfs2_rsqa_delete(struct gfs2_inode *ip, atomic_t *wcount)
666 {
667 	down_write(&ip->i_rw_mutex);
668 	if ((wcount == NULL) || (atomic_read(wcount) <= 1))
669 		gfs2_rs_deltree(&ip->i_res);
670 	up_write(&ip->i_rw_mutex);
671 	gfs2_qa_delete(ip, wcount);
672 }
673 
674 /**
675  * return_all_reservations - return all reserved blocks back to the rgrp.
676  * @rgd: the rgrp that needs its space back
677  *
678  * We previously reserved a bunch of blocks for allocation. Now we need to
679  * give them back. This leaves the reservation structures intact, but removes
680  * all of their corresponding "no-fly zones".
681  */
682 static void return_all_reservations(struct gfs2_rgrpd *rgd)
683 {
684 	struct rb_node *n;
685 	struct gfs2_blkreserv *rs;
686 
687 	spin_lock(&rgd->rd_rsspin);
688 	while ((n = rb_first(&rgd->rd_rstree))) {
689 		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
690 		__rs_deltree(rs);
691 	}
692 	spin_unlock(&rgd->rd_rsspin);
693 }
694 
695 void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
696 {
697 	struct rb_node *n;
698 	struct gfs2_rgrpd *rgd;
699 	struct gfs2_glock *gl;
700 
701 	while ((n = rb_first(&sdp->sd_rindex_tree))) {
702 		rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
703 		gl = rgd->rd_gl;
704 
705 		rb_erase(n, &sdp->sd_rindex_tree);
706 
707 		if (gl) {
708 			glock_clear_object(gl, rgd);
709 			gfs2_glock_put(gl);
710 		}
711 
712 		gfs2_free_clones(rgd);
713 		kfree(rgd->rd_bits);
714 		rgd->rd_bits = NULL;
715 		return_all_reservations(rgd);
716 		kmem_cache_free(gfs2_rgrpd_cachep, rgd);
717 	}
718 }
719 
720 static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd)
721 {
722 	pr_info("ri_addr = %llu\n", (unsigned long long)rgd->rd_addr);
723 	pr_info("ri_length = %u\n", rgd->rd_length);
724 	pr_info("ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0);
725 	pr_info("ri_data = %u\n", rgd->rd_data);
726 	pr_info("ri_bitbytes = %u\n", rgd->rd_bitbytes);
727 }
728 
729 /**
730  * compute_bitstructs - Compute the bitmap sizes
731  * @rgd: The resource group descriptor
732  *
733  * Calculates bitmap descriptors, one for each block that contains bitmap data
734  *
735  * Returns: errno
736  */
737 
738 static int compute_bitstructs(struct gfs2_rgrpd *rgd)
739 {
740 	struct gfs2_sbd *sdp = rgd->rd_sbd;
741 	struct gfs2_bitmap *bi;
742 	u32 length = rgd->rd_length; /* # blocks in hdr & bitmap */
743 	u32 bytes_left, bytes;
744 	int x;
745 
746 	if (!length)
747 		return -EINVAL;
748 
749 	rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_NOFS);
750 	if (!rgd->rd_bits)
751 		return -ENOMEM;
752 
753 	bytes_left = rgd->rd_bitbytes;
754 
755 	for (x = 0; x < length; x++) {
756 		bi = rgd->rd_bits + x;
757 
758 		bi->bi_flags = 0;
759 		/* small rgrp; bitmap stored completely in header block */
760 		if (length == 1) {
761 			bytes = bytes_left;
762 			bi->bi_offset = sizeof(struct gfs2_rgrp);
763 			bi->bi_start = 0;
764 			bi->bi_len = bytes;
765 			bi->bi_blocks = bytes * GFS2_NBBY;
766 		/* header block */
767 		} else if (x == 0) {
768 			bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
769 			bi->bi_offset = sizeof(struct gfs2_rgrp);
770 			bi->bi_start = 0;
771 			bi->bi_len = bytes;
772 			bi->bi_blocks = bytes * GFS2_NBBY;
773 		/* last block */
774 		} else if (x + 1 == length) {
775 			bytes = bytes_left;
776 			bi->bi_offset = sizeof(struct gfs2_meta_header);
777 			bi->bi_start = rgd->rd_bitbytes - bytes_left;
778 			bi->bi_len = bytes;
779 			bi->bi_blocks = bytes * GFS2_NBBY;
780 		/* other blocks */
781 		} else {
782 			bytes = sdp->sd_sb.sb_bsize -
783 				sizeof(struct gfs2_meta_header);
784 			bi->bi_offset = sizeof(struct gfs2_meta_header);
785 			bi->bi_start = rgd->rd_bitbytes - bytes_left;
786 			bi->bi_len = bytes;
787 			bi->bi_blocks = bytes * GFS2_NBBY;
788 		}
789 
790 		bytes_left -= bytes;
791 	}
792 
793 	if (bytes_left) {
794 		gfs2_consist_rgrpd(rgd);
795 		return -EIO;
796 	}
797 	bi = rgd->rd_bits + (length - 1);
798 	if ((bi->bi_start + bi->bi_len) * GFS2_NBBY != rgd->rd_data) {
799 		if (gfs2_consist_rgrpd(rgd)) {
800 			gfs2_rindex_print(rgd);
801 			fs_err(sdp, "start=%u len=%u offset=%u\n",
802 			       bi->bi_start, bi->bi_len, bi->bi_offset);
803 		}
804 		return -EIO;
805 	}
806 
807 	return 0;
808 }
809 
810 /**
811  * gfs2_ri_total - Total up the file system space, according to the rindex.
812  * @sdp: the filesystem
813  *
814  */
815 u64 gfs2_ri_total(struct gfs2_sbd *sdp)
816 {
817 	u64 total_data = 0;
818 	struct inode *inode = sdp->sd_rindex;
819 	struct gfs2_inode *ip = GFS2_I(inode);
820 	char buf[sizeof(struct gfs2_rindex)];
821 	int error, rgrps;
822 
823 	for (rgrps = 0;; rgrps++) {
824 		loff_t pos = rgrps * sizeof(struct gfs2_rindex);
825 
826 		if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode))
827 			break;
828 		error = gfs2_internal_read(ip, buf, &pos,
829 					   sizeof(struct gfs2_rindex));
830 		if (error != sizeof(struct gfs2_rindex))
831 			break;
832 		total_data += be32_to_cpu(((struct gfs2_rindex *)buf)->ri_data);
833 	}
834 	return total_data;
835 }
836 
837 static int rgd_insert(struct gfs2_rgrpd *rgd)
838 {
839 	struct gfs2_sbd *sdp = rgd->rd_sbd;
840 	struct rb_node **newn = &sdp->sd_rindex_tree.rb_node, *parent = NULL;
841 
842 	/* Figure out where to put new node */
843 	while (*newn) {
844 		struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd,
845 						  rd_node);
846 
847 		parent = *newn;
848 		if (rgd->rd_addr < cur->rd_addr)
849 			newn = &((*newn)->rb_left);
850 		else if (rgd->rd_addr > cur->rd_addr)
851 			newn = &((*newn)->rb_right);
852 		else
853 			return -EEXIST;
854 	}
855 
856 	rb_link_node(&rgd->rd_node, parent, newn);
857 	rb_insert_color(&rgd->rd_node, &sdp->sd_rindex_tree);
858 	sdp->sd_rgrps++;
859 	return 0;
860 }
861 
862 /**
863  * read_rindex_entry - Pull in a new resource index entry from the disk
864  * @ip: Pointer to the rindex inode
865  *
866  * Returns: 0 on success, > 0 on EOF, error code otherwise
867  */
868 
869 static int read_rindex_entry(struct gfs2_inode *ip)
870 {
871 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
872 	const unsigned bsize = sdp->sd_sb.sb_bsize;
873 	loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
874 	struct gfs2_rindex buf;
875 	int error;
876 	struct gfs2_rgrpd *rgd;
877 
878 	if (pos >= i_size_read(&ip->i_inode))
879 		return 1;
880 
881 	error = gfs2_internal_read(ip, (char *)&buf, &pos,
882 				   sizeof(struct gfs2_rindex));
883 
884 	if (error != sizeof(struct gfs2_rindex))
885 		return (error == 0) ? 1 : error;
886 
887 	rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS);
888 	error = -ENOMEM;
889 	if (!rgd)
890 		return error;
891 
892 	rgd->rd_sbd = sdp;
893 	rgd->rd_addr = be64_to_cpu(buf.ri_addr);
894 	rgd->rd_length = be32_to_cpu(buf.ri_length);
895 	rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
896 	rgd->rd_data = be32_to_cpu(buf.ri_data);
897 	rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
898 	spin_lock_init(&rgd->rd_rsspin);
899 
900 	error = compute_bitstructs(rgd);
901 	if (error)
902 		goto fail;
903 
904 	error = gfs2_glock_get(sdp, rgd->rd_addr,
905 			       &gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
906 	if (error)
907 		goto fail;
908 
909 	rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
910 	rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED);
911 	if (rgd->rd_data > sdp->sd_max_rg_data)
912 		sdp->sd_max_rg_data = rgd->rd_data;
913 	spin_lock(&sdp->sd_rindex_spin);
914 	error = rgd_insert(rgd);
915 	spin_unlock(&sdp->sd_rindex_spin);
916 	if (!error) {
917 		glock_set_object(rgd->rd_gl, rgd);
918 		rgd->rd_gl->gl_vm.start = (rgd->rd_addr * bsize) & PAGE_MASK;
919 		rgd->rd_gl->gl_vm.end = PAGE_ALIGN((rgd->rd_addr +
920 						    rgd->rd_length) * bsize) - 1;
921 		return 0;
922 	}
923 
924 	error = 0; /* someone else read in the rgrp; free it and ignore it */
925 	gfs2_glock_put(rgd->rd_gl);
926 
927 fail:
928 	kfree(rgd->rd_bits);
929 	rgd->rd_bits = NULL;
930 	kmem_cache_free(gfs2_rgrpd_cachep, rgd);
931 	return error;
932 }
933 
934 /**
935  * set_rgrp_preferences - Walk the rgrps, selecting some we prefer to use
936  * @sdp: the GFS2 superblock
937  *
938  * The purpose of this function is to select a subset of the resource groups
939  * and mark them as PREFERRED. We do it in such a way that each node prefers
940  * to use a unique set of rgrps to minimize glock contention.
941  */
942 static void set_rgrp_preferences(struct gfs2_sbd *sdp)
943 {
944 	struct gfs2_rgrpd *rgd, *first;
945 	int i;
946 
947 	/* Skip an initial number of rgrps, based on this node's journal ID.
948 	   That should start each node out on its own set. */
949 	rgd = gfs2_rgrpd_get_first(sdp);
950 	for (i = 0; i < sdp->sd_lockstruct.ls_jid; i++)
951 		rgd = gfs2_rgrpd_get_next(rgd);
952 	first = rgd;
953 
954 	do {
955 		rgd->rd_flags |= GFS2_RDF_PREFERRED;
956 		for (i = 0; i < sdp->sd_journals; i++) {
957 			rgd = gfs2_rgrpd_get_next(rgd);
958 			if (!rgd || rgd == first)
959 				break;
960 		}
961 	} while (rgd && rgd != first);
962 }
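/*
 * For example (hypothetical layout): with 3 journals and 12 rgrps, the
 * node with journal ID 1 skips one rgrp and then marks every third one,
 * preferring rgrps 1, 4, 7 and 10; journal 0 prefers 0, 3, 6, 9 and
 * journal 2 prefers 2, 5, 8, 11.
 */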
963 
964 /**
965  * gfs2_ri_update - Pull in a new resource index from the disk
966  * @ip: pointer to the rindex inode
967  *
968  * Returns: 0 on successful update, error code otherwise
969  */
970 
971 static int gfs2_ri_update(struct gfs2_inode *ip)
972 {
973 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
974 	int error;
975 
976 	do {
977 		error = read_rindex_entry(ip);
978 	} while (error == 0);
979 
980 	if (error < 0)
981 		return error;
982 
983 	set_rgrp_preferences(sdp);
984 
985 	sdp->sd_rindex_uptodate = 1;
986 	return 0;
987 }
988 
989 /**
990  * gfs2_rindex_update - Update the rindex if required
991  * @sdp: The GFS2 superblock
992  *
993  * We grab a lock on the rindex inode to make sure that it doesn't
994  * change whilst we are performing an operation. We keep this lock
995  * for quite long periods of time compared to other locks. This
996  * doesn't matter, since it is shared and it is very, very rarely
997  * accessed in the exclusive mode (i.e. only when expanding the filesystem).
998  *
999  * This makes sure that we're using the latest copy of the resource index
1000  * special file, which might have been updated if someone expanded the
1001  * filesystem (via gfs2_grow utility), which adds new resource groups.
1002  *
1003  * Returns: 0 on success, error code otherwise
1004  */
1005 
1006 int gfs2_rindex_update(struct gfs2_sbd *sdp)
1007 {
1008 	struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
1009 	struct gfs2_glock *gl = ip->i_gl;
1010 	struct gfs2_holder ri_gh;
1011 	int error = 0;
1012 	int unlock_required = 0;
1013 
1014 	/* Read new copy from disk if we don't have the latest */
1015 	if (!sdp->sd_rindex_uptodate) {
1016 		if (!gfs2_glock_is_locked_by_me(gl)) {
1017 			error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh);
1018 			if (error)
1019 				return error;
1020 			unlock_required = 1;
1021 		}
1022 		if (!sdp->sd_rindex_uptodate)
1023 			error = gfs2_ri_update(ip);
1024 		if (unlock_required)
1025 			gfs2_glock_dq_uninit(&ri_gh);
1026 	}
1027 
1028 	return error;
1029 }
1030 
1031 static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
1032 {
1033 	const struct gfs2_rgrp *str = buf;
1034 	u32 rg_flags;
1035 
1036 	rg_flags = be32_to_cpu(str->rg_flags);
1037 	rg_flags &= ~GFS2_RDF_MASK;
1038 	rgd->rd_flags &= GFS2_RDF_MASK;
1039 	rgd->rd_flags |= rg_flags;
1040 	rgd->rd_free = be32_to_cpu(str->rg_free);
1041 	rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
1042 	rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
1043 }
1044 
1045 static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
1046 {
1047 	struct gfs2_rgrp *str = buf;
1048 
1049 	str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK);
1050 	str->rg_free = cpu_to_be32(rgd->rd_free);
1051 	str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
1052 	str->__pad = cpu_to_be32(0);
1053 	str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
1054 	memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
1055 }
1056 
1057 static int gfs2_rgrp_lvb_valid(struct gfs2_rgrpd *rgd)
1058 {
1059 	struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
1060 	struct gfs2_rgrp *str = (struct gfs2_rgrp *)rgd->rd_bits[0].bi_bh->b_data;
1061 
1062 	if (rgl->rl_flags != str->rg_flags || rgl->rl_free != str->rg_free ||
1063 	    rgl->rl_dinodes != str->rg_dinodes ||
1064 	    rgl->rl_igeneration != str->rg_igeneration)
1065 		return 0;
1066 	return 1;
1067 }
1068 
1069 static void gfs2_rgrp_ondisk2lvb(struct gfs2_rgrp_lvb *rgl, const void *buf)
1070 {
1071 	const struct gfs2_rgrp *str = buf;
1072 
1073 	rgl->rl_magic = cpu_to_be32(GFS2_MAGIC);
1074 	rgl->rl_flags = str->rg_flags;
1075 	rgl->rl_free = str->rg_free;
1076 	rgl->rl_dinodes = str->rg_dinodes;
1077 	rgl->rl_igeneration = str->rg_igeneration;
1078 	rgl->__pad = 0UL;
1079 }
1080 
1081 static void update_rgrp_lvb_unlinked(struct gfs2_rgrpd *rgd, u32 change)
1082 {
1083 	struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
1084 	u32 unlinked = be32_to_cpu(rgl->rl_unlinked) + change;
1085 	rgl->rl_unlinked = cpu_to_be32(unlinked);
1086 }
1087 
1088 static u32 count_unlinked(struct gfs2_rgrpd *rgd)
1089 {
1090 	struct gfs2_bitmap *bi;
1091 	const u32 length = rgd->rd_length;
1092 	const u8 *buffer = NULL;
1093 	u32 i, goal, count = 0;
1094 
1095 	for (i = 0, bi = rgd->rd_bits; i < length; i++, bi++) {
1096 		goal = 0;
1097 		buffer = bi->bi_bh->b_data + bi->bi_offset;
1098 		WARN_ON(!buffer_uptodate(bi->bi_bh));
1099 		while (goal < bi->bi_len * GFS2_NBBY) {
1100 			goal = gfs2_bitfit(buffer, bi->bi_len, goal,
1101 					   GFS2_BLKST_UNLINKED);
1102 			if (goal == BFITNOENT)
1103 				break;
1104 			count++;
1105 			goal++;
1106 		}
1107 	}
1108 
1109 	return count;
1110 }
1111 
1112 
1113 /**
1114  * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
1115  * @rgd: the struct gfs2_rgrpd describing the RG to read in
1116  *
1117  * Read in all of a Resource Group's header and bitmap blocks.
1118  * Caller must eventually call gfs2_rgrp_brelse() to free the bitmaps.
1119  *
1120  * Returns: errno
1121  */
1122 
1123 static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
1124 {
1125 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1126 	struct gfs2_glock *gl = rgd->rd_gl;
1127 	unsigned int length = rgd->rd_length;
1128 	struct gfs2_bitmap *bi;
1129 	unsigned int x, y;
1130 	int error;
1131 
1132 	if (rgd->rd_bits[0].bi_bh != NULL)
1133 		return 0;
1134 
1135 	for (x = 0; x < length; x++) {
1136 		bi = rgd->rd_bits + x;
1137 		error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, 0, &bi->bi_bh);
1138 		if (error)
1139 			goto fail;
1140 	}
1141 
1142 	for (y = length; y--;) {
1143 		bi = rgd->rd_bits + y;
1144 		error = gfs2_meta_wait(sdp, bi->bi_bh);
1145 		if (error)
1146 			goto fail;
1147 		if (gfs2_metatype_check(sdp, bi->bi_bh, y ? GFS2_METATYPE_RB :
1148 					      GFS2_METATYPE_RG)) {
1149 			error = -EIO;
1150 			goto fail;
1151 		}
1152 	}
1153 
1154 	if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) {
1155 		for (x = 0; x < length; x++)
1156 			clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags);
1157 		gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
1158 		rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
1159 		rgd->rd_free_clone = rgd->rd_free;
1160 		/* max out the rgrp allocation failure point */
1161 		rgd->rd_extfail_pt = rgd->rd_free;
1162 	}
1163 	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
1164 		rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd));
1165 		gfs2_rgrp_ondisk2lvb(rgd->rd_rgl,
1166 				     rgd->rd_bits[0].bi_bh->b_data);
1167 	}
1168 	else if (sdp->sd_args.ar_rgrplvb) {
1169 		if (!gfs2_rgrp_lvb_valid(rgd)){
1170 			gfs2_consist_rgrpd(rgd);
1171 			error = -EIO;
1172 			goto fail;
1173 		}
1174 		if (rgd->rd_rgl->rl_unlinked == 0)
1175 			rgd->rd_flags &= ~GFS2_RDF_CHECK;
1176 	}
1177 	return 0;
1178 
1179 fail:
1180 	while (x--) {
1181 		bi = rgd->rd_bits + x;
1182 		brelse(bi->bi_bh);
1183 		bi->bi_bh = NULL;
1184 		gfs2_assert_warn(sdp, !bi->bi_clone);
1185 	}
1186 
1187 	return error;
1188 }
1189 
1190 static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
1191 {
1192 	u32 rl_flags;
1193 
1194 	if (rgd->rd_flags & GFS2_RDF_UPTODATE)
1195 		return 0;
1196 
1197 	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic)
1198 		return gfs2_rgrp_bh_get(rgd);
1199 
1200 	rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags);
1201 	rl_flags &= ~GFS2_RDF_MASK;
1202 	rgd->rd_flags &= GFS2_RDF_MASK;
1203 	rgd->rd_flags |= (rl_flags | GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
1204 	if (rgd->rd_rgl->rl_unlinked == 0)
1205 		rgd->rd_flags &= ~GFS2_RDF_CHECK;
1206 	rgd->rd_free = be32_to_cpu(rgd->rd_rgl->rl_free);
1207 	rgd->rd_free_clone = rgd->rd_free;
1208 	rgd->rd_dinodes = be32_to_cpu(rgd->rd_rgl->rl_dinodes);
1209 	rgd->rd_igeneration = be64_to_cpu(rgd->rd_rgl->rl_igeneration);
1210 	return 0;
1211 }
1212 
1213 int gfs2_rgrp_go_lock(struct gfs2_holder *gh)
1214 {
1215 	struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
1216 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1217 
1218 	if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb)
1219 		return 0;
1220 	return gfs2_rgrp_bh_get(rgd);
1221 }
1222 
1223 /**
1224  * gfs2_rgrp_brelse - Release RG bitmaps read in with gfs2_rgrp_bh_get()
1225  * @rgd: The resource group
1226  *
1227  */
1228 
1229 void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd)
1230 {
1231 	int x, length = rgd->rd_length;
1232 
1233 	for (x = 0; x < length; x++) {
1234 		struct gfs2_bitmap *bi = rgd->rd_bits + x;
1235 		if (bi->bi_bh) {
1236 			brelse(bi->bi_bh);
1237 			bi->bi_bh = NULL;
1238 		}
1239 	}
1240 
1241 }
1242 
1243 /**
1244  * gfs2_rgrp_go_unlock - Unlock a rgrp glock
1245  * @gh: The glock holder for the resource group
1246  *
1247  */
1248 
1249 void gfs2_rgrp_go_unlock(struct gfs2_holder *gh)
1250 {
1251 	struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
1252 	int demote_requested = test_bit(GLF_DEMOTE, &gh->gh_gl->gl_flags) |
1253 		test_bit(GLF_PENDING_DEMOTE, &gh->gh_gl->gl_flags);
1254 
1255 	if (rgd && demote_requested)
1256 		gfs2_rgrp_brelse(rgd);
1257 }
1258 
1259 int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
1260 			     struct buffer_head *bh,
1261 			     const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed)
1262 {
1263 	struct super_block *sb = sdp->sd_vfs;
1264 	u64 blk;
1265 	sector_t start = 0;
1266 	sector_t nr_blks = 0;
1267 	int rv;
1268 	unsigned int x;
1269 	u32 trimmed = 0;
1270 	u8 diff;
1271 
1272 	for (x = 0; x < bi->bi_len; x++) {
1273 		const u8 *clone = bi->bi_clone ? bi->bi_clone : bi->bi_bh->b_data;
1274 		clone += bi->bi_offset;
1275 		clone += x;
1276 		if (bh) {
1277 			const u8 *orig = bh->b_data + bi->bi_offset + x;
1278 			diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1));
1279 		} else {
1280 			diff = ~(*clone | (*clone >> 1));
1281 		}
1282 		diff &= 0x55;
1283 		if (diff == 0)
1284 			continue;
1285 		blk = offset + ((bi->bi_start + x) * GFS2_NBBY);
1286 		while(diff) {
1287 			if (diff & 1) {
1288 				if (nr_blks == 0)
1289 					goto start_new_extent;
1290 				if ((start + nr_blks) != blk) {
1291 					if (nr_blks >= minlen) {
1292 						rv = sb_issue_discard(sb,
1293 							start, nr_blks,
1294 							GFP_NOFS, 0);
1295 						if (rv)
1296 							goto fail;
1297 						trimmed += nr_blks;
1298 					}
1299 					nr_blks = 0;
1300 start_new_extent:
1301 					start = blk;
1302 				}
1303 				nr_blks++;
1304 			}
1305 			diff >>= 2;
1306 			blk++;
1307 		}
1308 	}
1309 	if (nr_blks >= minlen) {
1310 		rv = sb_issue_discard(sb, start, nr_blks, GFP_NOFS, 0);
1311 		if (rv)
1312 			goto fail;
1313 		trimmed += nr_blks;
1314 	}
1315 	if (ptrimmed)
1316 		*ptrimmed = trimmed;
1317 	return 0;
1318 
1319 fail:
1320 	if (sdp->sd_args.ar_discard)
1321 		fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem", rv);
1322 	sdp->sd_args.ar_discard = 0;
1323 	return -EIO;
1324 }
1325 
1326 /**
1327  * gfs2_fitrim - Generate discard requests for unused bits of the filesystem
1328  * @filp: Any file on the filesystem
1329  * @argp: Pointer to the arguments (also used to pass result)
1330  *
1331  * Returns: 0 on success, otherwise error code
1332  */
1333 
1334 int gfs2_fitrim(struct file *filp, void __user *argp)
1335 {
1336 	struct inode *inode = file_inode(filp);
1337 	struct gfs2_sbd *sdp = GFS2_SB(inode);
1338 	struct request_queue *q = bdev_get_queue(sdp->sd_vfs->s_bdev);
1339 	struct buffer_head *bh;
1340 	struct gfs2_rgrpd *rgd;
1341 	struct gfs2_rgrpd *rgd_end;
1342 	struct gfs2_holder gh;
1343 	struct fstrim_range r;
1344 	int ret = 0;
1345 	u64 amt;
1346 	u64 trimmed = 0;
1347 	u64 start, end, minlen;
1348 	unsigned int x;
1349 	unsigned bs_shift = sdp->sd_sb.sb_bsize_shift;
1350 
1351 	if (!capable(CAP_SYS_ADMIN))
1352 		return -EPERM;
1353 
1354 	if (!blk_queue_discard(q))
1355 		return -EOPNOTSUPP;
1356 
1357 	if (copy_from_user(&r, argp, sizeof(r)))
1358 		return -EFAULT;
1359 
1360 	ret = gfs2_rindex_update(sdp);
1361 	if (ret)
1362 		return ret;
1363 
1364 	start = r.start >> bs_shift;
1365 	end = start + (r.len >> bs_shift);
1366 	minlen = max_t(u64, r.minlen,
1367 		       q->limits.discard_granularity) >> bs_shift;
1368 
1369 	if (end <= start || minlen > sdp->sd_max_rg_data)
1370 		return -EINVAL;
1371 
1372 	rgd = gfs2_blk2rgrpd(sdp, start, 0);
1373 	rgd_end = gfs2_blk2rgrpd(sdp, end, 0);
1374 
1375 	if ((gfs2_rgrpd_get_first(sdp) == gfs2_rgrpd_get_next(rgd_end))
1376 	    && (start > rgd_end->rd_data0 + rgd_end->rd_data))
1377 		return -EINVAL; /* start is beyond the end of the fs */
1378 
1379 	while (1) {
1380 
1381 		ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh);
1382 		if (ret)
1383 			goto out;
1384 
1385 		if (!(rgd->rd_flags & GFS2_RGF_TRIMMED)) {
1386 			/* Trim each bitmap in the rgrp */
1387 			for (x = 0; x < rgd->rd_length; x++) {
1388 				struct gfs2_bitmap *bi = rgd->rd_bits + x;
1389 				ret = gfs2_rgrp_send_discards(sdp,
1390 						rgd->rd_data0, NULL, bi, minlen,
1391 						&amt);
1392 				if (ret) {
1393 					gfs2_glock_dq_uninit(&gh);
1394 					goto out;
1395 				}
1396 				trimmed += amt;
1397 			}
1398 
1399 			/* Mark rgrp as having been trimmed */
1400 			ret = gfs2_trans_begin(sdp, RES_RG_HDR, 0);
1401 			if (ret == 0) {
1402 				bh = rgd->rd_bits[0].bi_bh;
1403 				rgd->rd_flags |= GFS2_RGF_TRIMMED;
1404 				gfs2_trans_add_meta(rgd->rd_gl, bh);
1405 				gfs2_rgrp_out(rgd, bh->b_data);
1406 				gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, bh->b_data);
1407 				gfs2_trans_end(sdp);
1408 			}
1409 		}
1410 		gfs2_glock_dq_uninit(&gh);
1411 
1412 		if (rgd == rgd_end)
1413 			break;
1414 
1415 		rgd = gfs2_rgrpd_get_next(rgd);
1416 	}
1417 
1418 out:
1419 	r.len = trimmed << bs_shift;
1420 	if (copy_to_user(argp, &r, sizeof(r)))
1421 		return -EFAULT;
1422 
1423 	return ret;
1424 }
1425 
1426 /**
1427  * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree
1428  * @ip: the inode structure
1429  *
1430  */
1431 static void rs_insert(struct gfs2_inode *ip)
1432 {
1433 	struct rb_node **newn, *parent = NULL;
1434 	int rc;
1435 	struct gfs2_blkreserv *rs = &ip->i_res;
1436 	struct gfs2_rgrpd *rgd = rs->rs_rbm.rgd;
1437 	u64 fsblock = gfs2_rbm_to_block(&rs->rs_rbm);
1438 
1439 	BUG_ON(gfs2_rs_active(rs));
1440 
1441 	spin_lock(&rgd->rd_rsspin);
1442 	newn = &rgd->rd_rstree.rb_node;
1443 	while (*newn) {
1444 		struct gfs2_blkreserv *cur =
1445 			rb_entry(*newn, struct gfs2_blkreserv, rs_node);
1446 
1447 		parent = *newn;
1448 		rc = rs_cmp(fsblock, rs->rs_free, cur);
1449 		if (rc > 0)
1450 			newn = &((*newn)->rb_right);
1451 		else if (rc < 0)
1452 			newn = &((*newn)->rb_left);
1453 		else {
1454 			spin_unlock(&rgd->rd_rsspin);
1455 			WARN_ON(1);
1456 			return;
1457 		}
1458 	}
1459 
1460 	rb_link_node(&rs->rs_node, parent, newn);
1461 	rb_insert_color(&rs->rs_node, &rgd->rd_rstree);
1462 
1463 	/* Do our rgrp accounting for the reservation */
1464 	rgd->rd_reserved += rs->rs_free; /* blocks reserved */
1465 	spin_unlock(&rgd->rd_rsspin);
1466 	trace_gfs2_rs(rs, TRACE_RS_INSERT);
1467 }
1468 
1469 /**
1470  * rg_mblk_search - find a group of multiple free blocks to form a reservation
1471  * @rgd: the resource group descriptor
1472  * @ip: pointer to the inode for which we're reserving blocks
1473  * @ap: the allocation parameters
1474  *
1475  */
1476 
1477 static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
1478 			   const struct gfs2_alloc_parms *ap)
1479 {
1480 	struct gfs2_rbm rbm = { .rgd = rgd, };
1481 	u64 goal;
1482 	struct gfs2_blkreserv *rs = &ip->i_res;
1483 	u32 extlen;
1484 	u32 free_blocks = rgd->rd_free_clone - rgd->rd_reserved;
1485 	int ret;
1486 	struct inode *inode = &ip->i_inode;
1487 
1488 	if (S_ISDIR(inode->i_mode))
1489 		extlen = 1;
1490 	else {
1491 		extlen = max_t(u32, atomic_read(&rs->rs_sizehint), ap->target);
1492 		extlen = clamp(extlen, RGRP_RSRV_MINBLKS, free_blocks);
1493 	}
1494 	if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen))
1495 		return;
1496 
1497 	/* Find bitmap block that contains bits for goal block */
1498 	if (rgrp_contains_block(rgd, ip->i_goal))
1499 		goal = ip->i_goal;
1500 	else
1501 		goal = rgd->rd_last_alloc + rgd->rd_data0;
1502 
1503 	if (WARN_ON(gfs2_rbm_from_block(&rbm, goal)))
1504 		return;
1505 
1506 	ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true);
1507 	if (ret == 0) {
1508 		rs->rs_rbm = rbm;
1509 		rs->rs_free = extlen;
1510 		rs->rs_inum = ip->i_no_addr;
1511 		rs_insert(ip);
1512 	} else {
1513 		if (goal == rgd->rd_last_alloc + rgd->rd_data0)
1514 			rgd->rd_last_alloc = 0;
1515 	}
1516 }
1517 
1518 /**
1519  * gfs2_next_unreserved_block - Return next block that is not reserved
1520  * @rgd: The resource group
1521  * @block: The starting block
1522  * @length: The required length
1523  * @ip: Ignore any reservations for this inode
1524  *
1525  * If the block does not appear in any reservation, then return the
1526  * block number unchanged. If it does appear in the reservation, then
1527  * keep looking through the tree of reservations in order to find the
1528  * first block number which is not reserved.
1529  */
1530 
1531 static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
1532 				      u32 length,
1533 				      const struct gfs2_inode *ip)
1534 {
1535 	struct gfs2_blkreserv *rs;
1536 	struct rb_node *n;
1537 	int rc;
1538 
1539 	spin_lock(&rgd->rd_rsspin);
1540 	n = rgd->rd_rstree.rb_node;
1541 	while (n) {
1542 		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
1543 		rc = rs_cmp(block, length, rs);
1544 		if (rc < 0)
1545 			n = n->rb_left;
1546 		else if (rc > 0)
1547 			n = n->rb_right;
1548 		else
1549 			break;
1550 	}
1551 
1552 	if (n) {
1553 		while ((rs_cmp(block, length, rs) == 0) && (&ip->i_res != rs)) {
1554 			block = gfs2_rbm_to_block(&rs->rs_rbm) + rs->rs_free;
1555 			n = n->rb_right;
1556 			if (n == NULL)
1557 				break;
1558 			rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
1559 		}
1560 	}
1561 
1562 	spin_unlock(&rgd->rd_rsspin);
1563 	return block;
1564 }
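/*
 * For instance (hypothetical tree): asking for 5 blocks from block 2000
 * when reservations cover 1990-2009 and 2010-2029 walks through both and
 * returns 2030, the first block past the contiguous reserved span (the
 * rightward walk relies on abutting reservations being reachable via
 * rb_right from the first match).
 */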
1565 
1566 /**
1567  * gfs2_reservation_check_and_update - Check for reservations during block alloc
1568  * @rbm: The current position in the resource group
1569  * @ip: The inode for which we are searching for blocks
1570  * @minext: The minimum extent length
1571  * @maxext: A pointer to the maximum extent structure
1572  *
1573  * This checks the current position in the rgrp to see whether there is
1574  * a reservation covering this block. If not then this function is a
1575  * no-op. If there is, then the position is moved to the end of the
1576  * contiguous reservation(s) so that we are pointing at the first
1577  * non-reserved block.
1578  *
1579  * Returns: 0 if no reservation, 1 if @rbm has changed, otherwise an error
1580  */
1581 
1582 static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
1583 					     const struct gfs2_inode *ip,
1584 					     u32 minext,
1585 					     struct gfs2_extent *maxext)
1586 {
1587 	u64 block = gfs2_rbm_to_block(rbm);
1588 	u32 extlen = 1;
1589 	u64 nblock;
1590 	int ret;
1591 
1592 	/*
1593 	 * If we have a minimum extent length, then skip over any extent
1594 	 * which is less than the min extent length in size.
1595 	 */
1596 	if (minext) {
1597 		extlen = gfs2_free_extlen(rbm, minext);
1598 		if (extlen <= maxext->len)
1599 			goto fail;
1600 	}
1601 
1602 	/*
1603 	 * Check the extent which has been found against the reservations
1604 	 * and skip if parts of it are already reserved
1605 	 */
1606 	nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip);
1607 	if (nblock == block) {
1608 		if (!minext || extlen >= minext)
1609 			return 0;
1610 
1611 		if (extlen > maxext->len) {
1612 			maxext->len = extlen;
1613 			maxext->rbm = *rbm;
1614 		}
1615 fail:
1616 		nblock = block + extlen;
1617 	}
1618 	ret = gfs2_rbm_from_block(rbm, nblock);
1619 	if (ret < 0)
1620 		return ret;
1621 	return 1;
1622 }
1623 
1624 /**
1625  * gfs2_rbm_find - Look for blocks of a particular state
1626  * @rbm: Value/result starting position and final position
1627  * @state: The state which we want to find
1628  * @minext: Pointer to the requested extent length (NULL for a single block)
1629  *          This is updated to be the actual reservation size.
1630  * @ip: If set, check for reservations
1631  * @nowrap: Stop looking at the end of the rgrp, rather than wrapping
1632  *          around until we've reached the starting point.
1633  *
1634  * Side effects:
1635  * - If looking for free blocks, we set GBF_FULL on each bitmap which
1636  *   has no free blocks in it.
1637  * - If looking for free blocks, we set rd_extfail_pt on each rgrp which
1638  *   has come up short on a free block search.
1639  *
1640  * Returns: 0 on success, -ENOSPC if there is no block of the requested state
1641  */
1642 
1643 static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
1644 			 const struct gfs2_inode *ip, bool nowrap)
1645 {
1646 	struct buffer_head *bh;
1647 	int initial_bii;
1648 	u32 initial_offset;
1649 	int first_bii = rbm->bii;
1650 	u32 first_offset = rbm->offset;
1651 	u32 offset;
1652 	u8 *buffer;
1653 	int n = 0;
1654 	int iters = rbm->rgd->rd_length;
1655 	int ret;
1656 	struct gfs2_bitmap *bi;
1657 	struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, };
1658 
1659 	/* If we are not starting at the beginning of a bitmap, then we
1660 	 * need to add one to the bitmap count to ensure that we search
1661 	 * the starting bitmap twice.
1662 	 */
1663 	if (rbm->offset != 0)
1664 		iters++;
1665 
1666 	while(1) {
1667 		bi = rbm_bi(rbm);
1668 		if (test_bit(GBF_FULL, &bi->bi_flags) &&
1669 		    (state == GFS2_BLKST_FREE))
1670 			goto next_bitmap;
1671 
1672 		bh = bi->bi_bh;
1673 		buffer = bh->b_data + bi->bi_offset;
1674 		WARN_ON(!buffer_uptodate(bh));
1675 		if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
1676 			buffer = bi->bi_clone + bi->bi_offset;
1677 		initial_offset = rbm->offset;
1678 		offset = gfs2_bitfit(buffer, bi->bi_len, rbm->offset, state);
1679 		if (offset == BFITNOENT)
1680 			goto bitmap_full;
1681 		rbm->offset = offset;
1682 		if (ip == NULL)
1683 			return 0;
1684 
1685 		initial_bii = rbm->bii;
1686 		ret = gfs2_reservation_check_and_update(rbm, ip,
1687 							minext ? *minext : 0,
1688 							&maxext);
1689 		if (ret == 0)
1690 			return 0;
1691 		if (ret > 0) {
1692 			n += (rbm->bii - initial_bii);
1693 			goto next_iter;
1694 		}
1695 		if (ret == -E2BIG) {
1696 			rbm->bii = 0;
1697 			rbm->offset = 0;
1698 			n += (rbm->bii - initial_bii);
1699 			goto res_covered_end_of_rgrp;
1700 		}
1701 		return ret;
1702 
1703 bitmap_full:	/* Mark bitmap as full and fall through */
1704 		if ((state == GFS2_BLKST_FREE) && initial_offset == 0)
1705 			set_bit(GBF_FULL, &bi->bi_flags);
1706 
1707 next_bitmap:	/* Find next bitmap in the rgrp */
1708 		rbm->offset = 0;
1709 		rbm->bii++;
1710 		if (rbm->bii == rbm->rgd->rd_length)
1711 			rbm->bii = 0;
1712 res_covered_end_of_rgrp:
1713 		if ((rbm->bii == 0) && nowrap)
1714 			break;
1715 		n++;
1716 next_iter:
1717 		if (n >= iters)
1718 			break;
1719 	}
1720 
1721 	if (minext == NULL || state != GFS2_BLKST_FREE)
1722 		return -ENOSPC;
1723 
1724 	/* If the extent was too small, and it's smaller than the smallest
1725 	   to have failed before, remember for future reference that it's
1726 	   useless to search this rgrp again for this amount or more. */
1727 	if ((first_offset == 0) && (first_bii == 0) &&
1728 	    (*minext < rbm->rgd->rd_extfail_pt))
1729 		rbm->rgd->rd_extfail_pt = *minext;
1730 
1731 	/* If the maximum extent we found is big enough to fulfill the
1732 	   minimum requirements, use it anyway. */
1733 	if (maxext.len) {
1734 		*rbm = maxext.rbm;
1735 		*minext = maxext.len;
1736 		return 0;
1737 	}
1738 
1739 	return -ENOSPC;
1740 }
1741 
1742 /**
1743  * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes
1744  * @rgd: The rgrp
1745  * @last_unlinked: block address of the last dinode we unlinked
1746  * @skip: block address we should explicitly not unlink
1747  *
1748  * Queues delete work for each unused unlinked inode found (up to a
1749  * sensible limit), then clears GFS2_RDF_CHECK on the rgrp.
1750  */
1751 
1752 static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
1753 {
1754 	u64 block;
1755 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1756 	struct gfs2_glock *gl;
1757 	struct gfs2_inode *ip;
1758 	int error;
1759 	int found = 0;
1760 	struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 };
1761 
1762 	while (1) {
1763 		down_write(&sdp->sd_log_flush_lock);
1764 		error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL,
1765 				      true);
1766 		up_write(&sdp->sd_log_flush_lock);
1767 		if (error == -ENOSPC)
1768 			break;
1769 		if (WARN_ON_ONCE(error))
1770 			break;
1771 
1772 		block = gfs2_rbm_to_block(&rbm);
1773 		if (gfs2_rbm_from_block(&rbm, block + 1))
1774 			break;
1775 		if (*last_unlinked != NO_BLOCK && block <= *last_unlinked)
1776 			continue;
1777 		if (block == skip)
1778 			continue;
1779 		*last_unlinked = block;
1780 
1781 		error = gfs2_glock_get(sdp, block, &gfs2_iopen_glops, CREATE, &gl);
1782 		if (error)
1783 			continue;
1784 
1785 		/* If the inode is already in cache, we can ignore it here
1786 		 * because the existing inode disposal code will deal with
1787 		 * it when all refs have gone away. Accessing gl_object like
1788 		 * this is not safe in general. Here it is ok because we do
1789 		 * not dereference the pointer, and we only need an approx
1790 		 * answer to whether it is NULL or not.
1791 		 */
1792 		ip = gl->gl_object;
1793 
1794 		if (ip || queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
1795 			gfs2_glock_put(gl);
1796 		else
1797 			found++;
1798 
1799 		/* Limit reclaim to sensible number of tasks */
1800 		if (found > NR_CPUS)
1801 			return;
1802 	}
1803 
1804 	rgd->rd_flags &= ~GFS2_RDF_CHECK;
1805 	return;
1806 }
1807 
1808 /**
1809  * gfs2_rgrp_congested - Use stats to figure out whether an rgrp is congested
1810  * @rgd: The rgrp in question
1811  * @loops: An indication of how picky we can be (0=very, 1=less so)
1812  *
1813  * This function uses the recently added glock statistics in order to
1814  * figure out whether a particular resource group is suffering from
1815  * contention from multiple nodes. This is done purely on the basis
1816  * of timings, since this is the only data we have to work with and
1817  * our aim here is to reject a resource group which is highly contended
1818  * but (very important) not to do this too often in order to ensure that
1819  * we do not land up introducing fragmentation by changing resource
1820  * groups when not actually required.
1821  *
1822  * The calculation is fairly simple: we want to know whether the SRTTB
1823  * (i.e. smoothed round trip time for blocking operations) to acquire
1824  * the lock for this rgrp's glock is significantly greater than the
1825  * time taken for resource groups on average. We introduce a margin in
1826  * the form of the variable @var which is computed as the sum of the two
1827  * respective variances, and multiplied by a factor depending on @loops
1828  * and whether we have a lot of data to base the decision on. This is
1829  * then tested against the square difference of the means in order to
1830  * decide whether the result is statistically significant or not.
1831  *
1832  * Returns: A boolean verdict on the congestion status
1833  */
1834 
1835 static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
1836 {
1837 	const struct gfs2_glock *gl = rgd->rd_gl;
1838 	const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
1839 	struct gfs2_lkstats *st;
1840 	u64 r_dcount, l_dcount;
1841 	u64 l_srttb, a_srttb = 0;
1842 	s64 srttb_diff;
1843 	u64 sqr_diff;
1844 	u64 var;
1845 	int cpu, nonzero = 0;
1846 
1847 	preempt_disable();
1848 	for_each_present_cpu(cpu) {
1849 		st = &per_cpu_ptr(sdp->sd_lkstats, cpu)->lkstats[LM_TYPE_RGRP];
1850 		if (st->stats[GFS2_LKS_SRTTB]) {
1851 			a_srttb += st->stats[GFS2_LKS_SRTTB];
1852 			nonzero++;
1853 		}
1854 	}
1855 	st = &this_cpu_ptr(sdp->sd_lkstats)->lkstats[LM_TYPE_RGRP];
1856 	if (nonzero)
1857 		do_div(a_srttb, nonzero);
1858 	r_dcount = st->stats[GFS2_LKS_DCOUNT];
1859 	var = st->stats[GFS2_LKS_SRTTVARB] +
1860 	      gl->gl_stats.stats[GFS2_LKS_SRTTVARB];
1861 	preempt_enable();
1862 
1863 	l_srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB];
1864 	l_dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT];
1865 
1866 	if ((l_dcount < 1) || (r_dcount < 1) || (a_srttb == 0))
1867 		return false;
1868 
1869 	srttb_diff = a_srttb - l_srttb;
1870 	sqr_diff = srttb_diff * srttb_diff;
1871 
1872 	var *= 2;
1873 	if (l_dcount < 8 || r_dcount < 8)
1874 		var *= 2;
1875 	if (loops == 1)
1876 		var *= 2;
1877 
1878 	return ((srttb_diff < 0) && (sqr_diff > var));
1879 }
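
/*
 * Illustrative sketch (not part of the file): the statistical test
 * above, reduced to bare arithmetic so the margin doubling is easy to
 * follow.  All parameter names and values here are hypothetical; in
 * the real function they come from the glock statistics sampled above.
 */
static bool congestion_test_sketch(s64 a_srttb, s64 l_srttb, u64 var,
				   u64 l_dcount, u64 r_dcount, int loops)
{
	s64 srttb_diff = a_srttb - l_srttb;	/* average minus local SRTTB */
	u64 sqr_diff = srttb_diff * srttb_diff;

	var *= 2;			/* base margin: 2 * summed variances */
	if (l_dcount < 8 || r_dcount < 8)
		var *= 2;		/* few samples: demand stronger evidence */
	if (loops == 1)
		var *= 2;		/* later pass: harder to reject an rgrp */

	/* congested iff the local SRTTB significantly exceeds the average */
	return (srttb_diff < 0) && (sqr_diff > var);
}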
1880 
1881 /**
1882  * gfs2_rgrp_used_recently - check when the rgrp glock last saw DLM traffic
1883  * @rs: The block reservation with the rgrp to test
1884  * @msecs: The time limit in milliseconds
1885  *
1886  * Returns: True if the rgrp glock has NOT been used within the time limit
1887  */
1888 static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs,
1889 				    u64 msecs)
1890 {
1891 	u64 tdiff;
1892 
1893 	tdiff = ktime_to_ns(ktime_sub(ktime_get_real(),
1894                             rs->rs_rbm.rgd->rd_gl->gl_dstamp));
1895 
1896 	return tdiff > (msecs * 1000 * 1000);
1897 }
1898 
1899 static u32 gfs2_orlov_skip(const struct gfs2_inode *ip)
1900 {
1901 	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1902 	u32 skip;
1903 
1904 	get_random_bytes(&skip, sizeof(skip));
1905 	return skip % sdp->sd_rgrps;
1906 }
1907 
1908 static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin)
1909 {
1910 	struct gfs2_rgrpd *rgd = *pos;
1911 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1912 
1913 	rgd = gfs2_rgrpd_get_next(rgd);
1914 	if (rgd == NULL)
1915 		rgd = gfs2_rgrpd_get_first(sdp);
1916 	*pos = rgd;
1917 	if (rgd != begin) /* If we didn't wrap */
1918 		return true;
1919 	return false;
1920 }
1921 
1922 /**
1923  * fast_to_acquire - determine if a resource group will be fast to acquire
1924  *
1925  * @rgd: The rgrp to test
 *
1926  * because we tried to set ourselves up as dlm lock master.
1927  */
1928 static inline int fast_to_acquire(struct gfs2_rgrpd *rgd)
1929 {
1930 	struct gfs2_glock *gl = rgd->rd_gl;
1931 
1932 	if (gl->gl_state != LM_ST_UNLOCKED && list_empty(&gl->gl_holders) &&
1933 	    !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
1934 	    !test_bit(GLF_DEMOTE, &gl->gl_flags))
1935 		return 1;
1936 	if (rgd->rd_flags & GFS2_RDF_PREFERRED)
1937 		return 1;
1938 	return 0;
1939 }
1940 
1941 /**
1942  * gfs2_inplace_reserve - Reserve space in the filesystem
1943  * @ip: the inode to reserve space for
1944  * @ap: the allocation parameters
1945  *
1946  * We try our best to find an rgrp that has at least ap->target blocks
1947  * available. After a couple of passes (loops == 2), the prospects of finding
1948  * such an rgrp diminish. At this stage, we return the first rgrp that has
1949  * at least ap->min_target blocks available. Either way, we set ap->allowed to
1950  * the number of blocks available in the chosen rgrp.
1951  *
1952  * Returns: 0 on success,
1953  *          -ENOSPC if a suitable rgrp can't be found
1954  *          errno otherwise
1955  */
1956 
1957 int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
1958 {
1959 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1960 	struct gfs2_rgrpd *begin = NULL;
1961 	struct gfs2_blkreserv *rs = &ip->i_res;
1962 	int error = 0, rg_locked, flags = 0;
1963 	u64 last_unlinked = NO_BLOCK;
1964 	int loops = 0;
1965 	u32 skip = 0;
1966 
1967 	if (sdp->sd_args.ar_rgrplvb)
1968 		flags |= GL_SKIP;
1969 	if (gfs2_assert_warn(sdp, ap->target))
1970 		return -EINVAL;
1971 	if (gfs2_rs_active(rs)) {
1972 		begin = rs->rs_rbm.rgd;
1973 	} else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) {
1974 		rs->rs_rbm.rgd = begin = ip->i_rgd;
1975 	} else {
1976 		check_and_update_goal(ip);
1977 		rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
1978 	}
1979 	if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV))
1980 		skip = gfs2_orlov_skip(ip);
1981 	if (rs->rs_rbm.rgd == NULL)
1982 		return -EBADSLT;
1983 
1984 	while (loops < 3) {
1985 		rg_locked = 1;
1986 
1987 		if (!gfs2_glock_is_locked_by_me(rs->rs_rbm.rgd->rd_gl)) {
1988 			rg_locked = 0;
1989 			if (skip && skip--)
1990 				goto next_rgrp;
1991 			if (!gfs2_rs_active(rs)) {
1992 				if (loops == 0 &&
1993 				    !fast_to_acquire(rs->rs_rbm.rgd))
1994 					goto next_rgrp;
1995 				if ((loops < 2) &&
1996 				    gfs2_rgrp_used_recently(rs, 1000) &&
1997 				    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
1998 					goto next_rgrp;
1999 			}
2000 			error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl,
2001 						   LM_ST_EXCLUSIVE, flags,
2002 						   &rs->rs_rgd_gh);
2003 			if (unlikely(error))
2004 				return error;
2005 			if (!gfs2_rs_active(rs) && (loops < 2) &&
2006 			    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
2007 				goto skip_rgrp;
2008 			if (sdp->sd_args.ar_rgrplvb) {
2009 				error = update_rgrp_lvb(rs->rs_rbm.rgd);
2010 				if (unlikely(error)) {
2011 					gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
2012 					return error;
2013 				}
2014 			}
2015 		}
2016 
2017 		/* Skip unusable resource groups */
2018 		if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC |
2019 						 GFS2_RDF_ERROR)) ||
2020 		    (loops == 0 && ap->target > rs->rs_rbm.rgd->rd_extfail_pt))
2021 			goto skip_rgrp;
2022 
2023 		if (sdp->sd_args.ar_rgrplvb)
2024 			gfs2_rgrp_bh_get(rs->rs_rbm.rgd);
2025 
2026 		/* Get a reservation if we don't already have one */
2027 		if (!gfs2_rs_active(rs))
2028 			rg_mblk_search(rs->rs_rbm.rgd, ip, ap);
2029 
2030 		/* Skip rgrps when we can't get a reservation on first pass */
2031 		if (!gfs2_rs_active(rs) && (loops < 1))
2032 			goto check_rgrp;
2033 
2034 		/* If rgrp has enough free space, use it */
2035 		if (rs->rs_rbm.rgd->rd_free_clone >= ap->target ||
2036 		    (loops == 2 && ap->min_target &&
2037 		     rs->rs_rbm.rgd->rd_free_clone >= ap->min_target)) {
2038 			ip->i_rgd = rs->rs_rbm.rgd;
2039 			ap->allowed = ip->i_rgd->rd_free_clone;
2040 			return 0;
2041 		}
2042 check_rgrp:
2043 		/* Check for unlinked inodes which can be reclaimed */
2044 		if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK)
2045 			try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked,
2046 					ip->i_no_addr);
2047 skip_rgrp:
2048 		/* Drop reservation, if we couldn't use reserved rgrp */
2049 		if (gfs2_rs_active(rs))
2050 			gfs2_rs_deltree(rs);
2051 
2052 		/* Unlock rgrp if required */
2053 		if (!rg_locked)
2054 			gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
2055 next_rgrp:
2056 		/* Find the next rgrp, and continue looking */
2057 		if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin))
2058 			continue;
2059 		if (skip)
2060 			continue;
2061 
2062 		/* If we've scanned all the rgrps, but found no free blocks
2063 		 * then this checks for some less likely conditions before
2064 		 * trying again.
2065 		 */
2066 		loops++;
2067 		/* Check that fs hasn't grown if writing to rindex */
2068 		if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) {
2069 			error = gfs2_ri_update(ip);
2070 			if (error)
2071 				return error;
2072 		}
2073 		/* Flushing the log may release space */
2074 		if (loops == 2)
2075 			gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
2076 	}
2077 
2078 	return -ENOSPC;
2079 }
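
/*
 * Hypothetical caller sketch (example_reserve is not a real function):
 * ask for the ideal extent length in ap.target, give the minimum
 * acceptable length in ap.min_target, and read the space available in
 * the chosen rgrp back from ap.allowed.  Transaction setup and most
 * error handling are omitted.
 */
static int example_reserve(struct gfs2_inode *ip, unsigned int want)
{
	struct gfs2_alloc_parms ap = { .target = want, .min_target = 1 };
	int error;

	error = gfs2_inplace_reserve(ip, &ap);	/* up to three passes */
	if (error)
		return error;	/* -ENOSPC: even min_target unavailable */

	/* ap.allowed now holds the free blocks in the chosen rgrp;
	   allocate with gfs2_alloc_blocks(), then drop the rgrp glock
	   with gfs2_inplace_release(ip) */
	return 0;
}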
2080 
2081 /**
2082  * gfs2_inplace_release - release an inplace reservation
2083  * @ip: the inode the reservation was taken out on
2084  *
2085  * Release a reservation made by gfs2_inplace_reserve().
2086  */
2087 
2088 void gfs2_inplace_release(struct gfs2_inode *ip)
2089 {
2090 	struct gfs2_blkreserv *rs = &ip->i_res;
2091 
2092 	if (gfs2_holder_initialized(&rs->rs_rgd_gh))
2093 		gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
2094 }
2095 
2096 /**
2097  * gfs2_get_block_type - Determine the type of a block in an rgrp
2098  * @rgd: the resource group holding the block
2099  * @block: the block number
2100  *
2101  * Returns: The block type (GFS2_BLKST_*)
2102  */
2103 
2104 static unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
2105 {
2106 	struct gfs2_rbm rbm = { .rgd = rgd, };
2107 	int ret;
2108 
2109 	ret = gfs2_rbm_from_block(&rbm, block);
2110 	WARN_ON_ONCE(ret != 0);
2111 
2112 	return gfs2_testbit(&rbm);
2113 }
2114 
2115 
2116 /**
2117  * gfs2_alloc_extent - allocate an extent from a given bitmap
2118  * @rbm: the resource group information
2119  * @dinode: TRUE if the first block we allocate is for a dinode
2120  * @n: The extent length (value/result)
2121  *
2122  * Add the bitmap buffer to the transaction.
2123  * Set the found bits to the new allocation state.
2124  */
2125 static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode,
2126 			     unsigned int *n)
2127 {
2128 	struct gfs2_rbm pos = { .rgd = rbm->rgd, };
2129 	const unsigned int elen = *n;
2130 	u64 block;
2131 	int ret;
2132 
2133 	*n = 1;
2134 	block = gfs2_rbm_to_block(rbm);
2135 	gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm_bi(rbm)->bi_bh);
2136 	gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
2137 	block++;
2138 	while (*n < elen) {
2139 		ret = gfs2_rbm_from_block(&pos, block);
2140 		if (ret || gfs2_testbit(&pos) != GFS2_BLKST_FREE)
2141 			break;
2142 		gfs2_trans_add_meta(pos.rgd->rd_gl, rbm_bi(&pos)->bi_bh);
2143 		gfs2_setbit(&pos, true, GFS2_BLKST_USED);
2144 		(*n)++;
2145 		block++;
2146 	}
2147 }
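
/*
 * Worked example (hypothetical bitmap contents) of the value/result
 * use of @n above: the caller asks for four blocks starting at the rbm
 * position; the first block is always claimed, and the extent grows
 * only while the following blocks are free.
 *
 *   before:  *n = 4,  blocks at rbm: FREE FREE USED FREE
 *   after:   *n = 2,  blocks at rbm: USED USED USED FREE
 */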
2148 
2149 /**
2150  * rgblk_free - Change alloc state of given block(s)
2151  * @sdp: the filesystem
2152  * @bstart: the start of a run of blocks to free
2153  * @blen: the length of the block run (all must lie within ONE RG!)
2154  * @new_state: GFS2_BLKST_XXX the after-allocation block state
2155  *
2156  * Returns:  Resource group containing the block(s)
2157  */
2158 
2159 static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
2160 				     u32 blen, unsigned char new_state)
2161 {
2162 	struct gfs2_rbm rbm;
2163 	struct gfs2_bitmap *bi, *bi_prev = NULL;
2164 
2165 	rbm.rgd = gfs2_blk2rgrpd(sdp, bstart, 1);
2166 	if (!rbm.rgd) {
2167 		if (gfs2_consist(sdp))
2168 			fs_err(sdp, "block = %llu\n", (unsigned long long)bstart);
2169 		return NULL;
2170 	}
2171 
2172 	gfs2_rbm_from_block(&rbm, bstart);
2173 	while (blen--) {
2174 		bi = rbm_bi(&rbm);
2175 		if (bi != bi_prev) {
2176 			if (!bi->bi_clone) {
2177 				bi->bi_clone = kmalloc(bi->bi_bh->b_size,
2178 						      GFP_NOFS | __GFP_NOFAIL);
2179 				memcpy(bi->bi_clone + bi->bi_offset,
2180 				       bi->bi_bh->b_data + bi->bi_offset,
2181 				       bi->bi_len);
2182 			}
2183 			gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
2184 			bi_prev = bi;
2185 		}
2186 		gfs2_setbit(&rbm, false, new_state);
2187 		gfs2_rbm_incr(&rbm);
2188 	}
2189 
2190 	return rbm.rgd;
2191 }
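
/*
 * The clone handling above is copy-on-first-touch: the first time a
 * bitmap buffer is modified in this window, its contents are
 * snapshotted into bi_clone, so allocation searches still see the
 * just-freed blocks as in use until the log has been flushed.  A
 * sketch of that step as a stand-alone helper (hypothetical, for
 * clarity only):
 */
static void bi_snapshot_sketch(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi)
{
	if (!bi->bi_clone) {
		bi->bi_clone = kmalloc(bi->bi_bh->b_size,
				       GFP_NOFS | __GFP_NOFAIL);
		memcpy(bi->bi_clone + bi->bi_offset,
		       bi->bi_bh->b_data + bi->bi_offset, bi->bi_len);
	}
	gfs2_trans_add_meta(rgd->rd_gl, bi->bi_bh);	/* journal the change */
}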
2192 
2193 /**
2194  * gfs2_rgrp_dump - print out an rgrp
2195  * @seq: The iterator
2196  * @gl: The glock in question
2197  *
2198  */
2199 
2200 void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
2201 {
2202 	struct gfs2_rgrpd *rgd = gl->gl_object;
2203 	struct gfs2_blkreserv *trs;
2204 	const struct rb_node *n;
2205 
2206 	if (rgd == NULL)
2207 		return;
2208 	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n",
2209 		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
2210 		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
2211 		       rgd->rd_reserved, rgd->rd_extfail_pt);
2212 	spin_lock(&rgd->rd_rsspin);
2213 	for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
2214 		trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
2215 		dump_rs(seq, trs);
2216 	}
2217 	spin_unlock(&rgd->rd_rsspin);
2218 }
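
/*
 * Example of a header line produced above (all values hypothetical):
 *
 *   R: n:278528 f:01 b:65490/65490 i:12 r:0 e:0
 */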
2219 
2220 static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
2221 {
2222 	struct gfs2_sbd *sdp = rgd->rd_sbd;
2223 	fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
2224 		(unsigned long long)rgd->rd_addr);
2225 	fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
2226 	gfs2_rgrp_dump(NULL, rgd->rd_gl);
2227 	rgd->rd_flags |= GFS2_RDF_ERROR;
2228 }
2229 
2230 /**
2231  * gfs2_adjust_reservation - Adjust (or remove) a reservation after allocation
2232  * @ip: The inode we have just allocated blocks for
2233  * @rbm: The start of the allocated blocks
2234  * @len: The extent length
2235  *
2236  * Adjusts a reservation after an allocation has taken place. If the
2237  * reservation does not match the allocation, or if it is now empty
2238  * then it is removed.
2239  */
2240 
2241 static void gfs2_adjust_reservation(struct gfs2_inode *ip,
2242 				    const struct gfs2_rbm *rbm, unsigned len)
2243 {
2244 	struct gfs2_blkreserv *rs = &ip->i_res;
2245 	struct gfs2_rgrpd *rgd = rbm->rgd;
2246 	unsigned rlen;
2247 	u64 block;
2248 	int ret;
2249 
2250 	spin_lock(&rgd->rd_rsspin);
2251 	if (gfs2_rs_active(rs)) {
2252 		if (gfs2_rbm_eq(&rs->rs_rbm, rbm)) {
2253 			block = gfs2_rbm_to_block(rbm);
2254 			ret = gfs2_rbm_from_block(&rs->rs_rbm, block + len);
2255 			rlen = min(rs->rs_free, len);
2256 			rs->rs_free -= rlen;
2257 			rgd->rd_reserved -= rlen;
2258 			trace_gfs2_rs(rs, TRACE_RS_CLAIM);
2259 			if (rs->rs_free && !ret)
2260 				goto out;
2261 			/* We used up our block reservation, so we should
2262 			   reserve more blocks next time. */
2263 			atomic_add(RGRP_RSRV_ADDBLKS, &rs->rs_sizehint);
2264 		}
2265 		__rs_deltree(rs);
2266 	}
2267 out:
2268 	spin_unlock(&rgd->rd_rsspin);
2269 }
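
/*
 * Worked example for the shrink step above (hypothetical numbers): an
 * allocation of len = 8 blocks at the reservation's position consumes
 * min(rs_free, len) blocks from it.
 *
 *   rs_free = 32:  rs_free becomes 24, reservation kept and advanced
 *   rs_free = 8:   rs_free becomes 0, reservation removed and
 *                  rs_sizehint grown for the next allocation
 */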
2270 
2271 /**
2272  * gfs2_set_alloc_start - Set starting point for block allocation
2273  * @rbm: The rbm which will be set to the required location
2274  * @ip: The gfs2 inode
2275  * @dinode: Flag to say if allocation includes a new inode
2276  *
2277  * This sets the starting point from the reservation if one is active
2278  * otherwise it falls back to guessing a start point based on the
2279  * inode's goal block or the last allocation point in the rgrp.
2280  */
2281 
2282 static void gfs2_set_alloc_start(struct gfs2_rbm *rbm,
2283 				 const struct gfs2_inode *ip, bool dinode)
2284 {
2285 	u64 goal;
2286 
2287 	if (gfs2_rs_active(&ip->i_res)) {
2288 		*rbm = ip->i_res.rs_rbm;
2289 		return;
2290 	}
2291 
2292 	if (!dinode && rgrp_contains_block(rbm->rgd, ip->i_goal))
2293 		goal = ip->i_goal;
2294 	else
2295 		goal = rbm->rgd->rd_last_alloc + rbm->rgd->rd_data0;
2296 
2297 	gfs2_rbm_from_block(rbm, goal);
2298 }
2299 
2300 /**
2301  * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode
2302  * @ip: the inode to allocate the block for
2303  * @bn: Used to return the starting block number
2304  * @nblocks: requested number of blocks/extent length (value/result)
2305  * @dinode: 1 if we're allocating a dinode block, else 0
2306  * @generation: the generation number of the inode
2307  *
2308  * Returns: 0 or error
2309  */
2310 
2311 int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
2312 		      bool dinode, u64 *generation)
2313 {
2314 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2315 	struct buffer_head *dibh;
2316 	struct gfs2_rbm rbm = { .rgd = ip->i_rgd, };
2317 	unsigned int ndata;
2318 	u64 block; /* block, within the file system scope */
2319 	int error;
2320 
2321 	gfs2_set_alloc_start(&rbm, ip, dinode);
2322 	error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false);
2323 
2324 	if (error == -ENOSPC) {
2325 		gfs2_set_alloc_start(&rbm, ip, dinode);
2326 		error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false);
2327 	}
2328 
2329 	/* Since all blocks are reserved in advance, this shouldn't happen */
2330 	if (error) {
2331 		fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d fail_pt=%d\n",
2332 			(unsigned long long)ip->i_no_addr, error, *nblocks,
2333 			test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags),
2334 			rbm.rgd->rd_extfail_pt);
2335 		goto rgrp_error;
2336 	}
2337 
2338 	gfs2_alloc_extent(&rbm, dinode, nblocks);
2339 	block = gfs2_rbm_to_block(&rbm);
2340 	rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0;
2341 	if (gfs2_rs_active(&ip->i_res))
2342 		gfs2_adjust_reservation(ip, &rbm, *nblocks);
2343 	ndata = *nblocks;
2344 	if (dinode)
2345 		ndata--;
2346 
2347 	if (!dinode) {
2348 		ip->i_goal = block + ndata - 1;
2349 		error = gfs2_meta_inode_buffer(ip, &dibh);
2350 		if (error == 0) {
2351 			struct gfs2_dinode *di =
2352 				(struct gfs2_dinode *)dibh->b_data;
2353 			gfs2_trans_add_meta(ip->i_gl, dibh);
2354 			di->di_goal_meta = di->di_goal_data =
2355 				cpu_to_be64(ip->i_goal);
2356 			brelse(dibh);
2357 		}
2358 	}
2359 	if (rbm.rgd->rd_free < *nblocks) {
2360 		pr_warn("nblocks=%u\n", *nblocks);
2361 		goto rgrp_error;
2362 	}
2363 
2364 	rbm.rgd->rd_free -= *nblocks;
2365 	if (dinode) {
2366 		rbm.rgd->rd_dinodes++;
2367 		*generation = rbm.rgd->rd_igeneration++;
2368 		if (*generation == 0)
2369 			*generation = rbm.rgd->rd_igeneration++;
2370 	}
2371 
2372 	gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh);
2373 	gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data);
2374 	gfs2_rgrp_ondisk2lvb(rbm.rgd->rd_rgl, rbm.rgd->rd_bits[0].bi_bh->b_data);
2375 
2376 	gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0);
2377 	if (dinode)
2378 		gfs2_trans_add_unrevoke(sdp, block, *nblocks);
2379 
2380 	gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid);
2381 
2382 	rbm.rgd->rd_free_clone -= *nblocks;
2383 	trace_gfs2_block_alloc(ip, rbm.rgd, block, *nblocks,
2384 			       dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
2385 	*bn = block;
2386 	return 0;
2387 
2388 rgrp_error:
2389 	gfs2_rgrp_error(rbm.rgd);
2390 	return -EIO;
2391 }
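
/*
 * Note on the generation counter above, with hypothetical numbers: a
 * result of 0 is skipped so inode generations are always non-zero.  If
 * rd_igeneration has wrapped to 0, the first increment yields 0 and is
 * retried, so the new dinode receives 1 and the counter moves on to 2.
 */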
2392 
2393 /**
2394  * __gfs2_free_blocks - free a contiguous run of block(s)
2395  * @ip: the inode these blocks are being freed from
2396  * @bstart: first block of a run of contiguous blocks
2397  * @blen: the length of the block run
2398  * @meta: 1 if the blocks represent metadata
2399  *
2400  */
2401 
2402 void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta)
2403 {
2404 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2405 	struct gfs2_rgrpd *rgd;
2406 
2407 	rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
2408 	if (!rgd)
2409 		return;
2410 	trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE);
2411 	rgd->rd_free += blen;
2412 	rgd->rd_flags &= ~GFS2_RGF_TRIMMED;
2413 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2414 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2415 	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data);
2416 
2417 	/* Directories keep their data in the metadata address space */
2418 	if (meta || ip->i_depth)
2419 		gfs2_meta_wipe(ip, bstart, blen);
2420 }
2421 
2422 /**
2423  * gfs2_free_meta - free a contiguous run of metadata block(s)
2424  * @ip: the inode these blocks are being freed from
2425  * @bstart: first block of a run of contiguous blocks
2426  * @blen: the length of the block run
2427  *
2428  */
2429 
2430 void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
2431 {
2432 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2433 
2434 	__gfs2_free_blocks(ip, bstart, blen, 1);
2435 	gfs2_statfs_change(sdp, 0, +blen, 0);
2436 	gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
2437 }
2438 
2439 void gfs2_unlink_di(struct inode *inode)
2440 {
2441 	struct gfs2_inode *ip = GFS2_I(inode);
2442 	struct gfs2_sbd *sdp = GFS2_SB(inode);
2443 	struct gfs2_rgrpd *rgd;
2444 	u64 blkno = ip->i_no_addr;
2445 
2446 	rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED);
2447 	if (!rgd)
2448 		return;
2449 	trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
2450 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2451 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2452 	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data);
2453 	update_rgrp_lvb_unlinked(rgd, 1);
2454 }
2455 
2456 static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
2457 {
2458 	struct gfs2_sbd *sdp = rgd->rd_sbd;
2459 	struct gfs2_rgrpd *tmp_rgd;
2460 
2461 	tmp_rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_FREE);
2462 	if (!tmp_rgd)
2463 		return;
2464 	gfs2_assert_withdraw(sdp, rgd == tmp_rgd);
2465 
2466 	if (!rgd->rd_dinodes)
2467 		gfs2_consist_rgrpd(rgd);
2468 	rgd->rd_dinodes--;
2469 	rgd->rd_free++;
2470 
2471 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2472 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2473 	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data);
2474 	update_rgrp_lvb_unlinked(rgd, -1);
2475 
2476 	gfs2_statfs_change(sdp, 0, +1, -1);
2477 }
2478 
2479 
2480 void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
2481 {
2482 	gfs2_free_uninit_di(rgd, ip->i_no_addr);
2483 	trace_gfs2_block_alloc(ip, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
2484 	gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid);
2485 	gfs2_meta_wipe(ip, ip->i_no_addr, 1);
2486 }
2487 
2488 /**
2489  * gfs2_check_blk_type - Check the type of a block
2490  * @sdp: The superblock
2491  * @no_addr: The block number to check
2492  * @type: The block type we are looking for
2493  *
2494  * Returns: 0 if the block type matches the expected type
2495  *          -ESTALE if it doesn't match
2496  *          or -ve errno if something went wrong while checking
2497  */
2498 
2499 int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
2500 {
2501 	struct gfs2_rgrpd *rgd;
2502 	struct gfs2_holder rgd_gh;
2503 	int error = -EINVAL;
2504 
2505 	rgd = gfs2_blk2rgrpd(sdp, no_addr, 1);
2506 	if (!rgd)
2507 		goto fail;
2508 
2509 	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
2510 	if (error)
2511 		goto fail;
2512 
2513 	if (gfs2_get_block_type(rgd, no_addr) != type)
2514 		error = -ESTALE;
2515 
2516 	gfs2_glock_dq_uninit(&rgd_gh);
2517 fail:
2518 	return error;
2519 }
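
/*
 * Hypothetical usage sketch: confirm that a block believed to hold a
 * dinode is still marked as one before trusting its contents.
 */
static int example_verify_dinode(struct gfs2_sbd *sdp, u64 no_addr)
{
	int error = gfs2_check_blk_type(sdp, no_addr, GFS2_BLKST_DINODE);

	/* 0: type matches; -ESTALE: state changed; other -ve: lookup failed */
	return error;
}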
2520 
2521 /**
2522  * gfs2_rlist_add - add a RG to a list of RGs
2523  * @ip: the inode
2524  * @rlist: the list of resource groups
2525  * @block: the block
2526  *
2527  * Figure out what RG a block belongs to and add that RG to the list
2528  *
2529  * FIXME: Don't use NOFAIL
2530  *
2531  */
2532 
2533 void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
2534 		    u64 block)
2535 {
2536 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2537 	struct gfs2_rgrpd *rgd;
2538 	struct gfs2_rgrpd **tmp;
2539 	unsigned int new_space;
2540 	unsigned int x;
2541 
2542 	if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
2543 		return;
2544 
2545 	if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, block))
2546 		rgd = ip->i_rgd;
2547 	else
2548 		rgd = gfs2_blk2rgrpd(sdp, block, 1);
2549 	if (!rgd) {
2550 		fs_err(sdp, "rlist_add: no rgrp for block %llu\n", (unsigned long long)block);
2551 		return;
2552 	}
2553 	ip->i_rgd = rgd;
2554 
2555 	for (x = 0; x < rlist->rl_rgrps; x++)
2556 		if (rlist->rl_rgd[x] == rgd)
2557 			return;
2558 
2559 	if (rlist->rl_rgrps == rlist->rl_space) {
2560 		new_space = rlist->rl_space + 10;
2561 
2562 		tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
2563 			      GFP_NOFS | __GFP_NOFAIL);
2564 
2565 		if (rlist->rl_rgd) {
2566 			memcpy(tmp, rlist->rl_rgd,
2567 			       rlist->rl_space * sizeof(struct gfs2_rgrpd *));
2568 			kfree(rlist->rl_rgd);
2569 		}
2570 
2571 		rlist->rl_space = new_space;
2572 		rlist->rl_rgd = tmp;
2573 	}
2574 
2575 	rlist->rl_rgd[rlist->rl_rgrps++] = rgd;
2576 }
2577 
2578 /**
2579  * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate
2580  *      and initialize an array of glock holders for them
2581  * @rlist: the list of resource groups
2582  * @state: the lock state to acquire the RG lock in
2583  *
2584  * FIXME: Don't use NOFAIL
2585  *
2586  */
2587 
2588 void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state)
2589 {
2590 	unsigned int x;
2591 
2592 	rlist->rl_ghs = kmalloc(rlist->rl_rgrps * sizeof(struct gfs2_holder),
2593 				GFP_NOFS | __GFP_NOFAIL);
2594 	for (x = 0; x < rlist->rl_rgrps; x++)
2595 		gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
2596 				state, 0,
2597 				&rlist->rl_ghs[x]);
2598 }
2599 
2600 /**
2601  * gfs2_rlist_free - free a resource group list
2602  * @rlist: the list of resource groups
2603  *
2604  */
2605 
2606 void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
2607 {
2608 	unsigned int x;
2609 
2610 	kfree(rlist->rl_rgd);
2611 
2612 	if (rlist->rl_ghs) {
2613 		for (x = 0; x < rlist->rl_rgrps; x++)
2614 			gfs2_holder_uninit(&rlist->rl_ghs[x]);
2615 		kfree(rlist->rl_ghs);
2616 		rlist->rl_ghs = NULL;
2617 	}
2618 }
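
/*
 * Sketch of the rlist lifecycle implied by the helpers above, with
 * error handling trimmed, assuming the batched holder helpers
 * gfs2_glock_nq_m()/gfs2_glock_dq_m() from glock.c.  A caller gathers
 * every rgrp touched by a set of blocks, locks them all in one go,
 * does its work, then tears the list down.  The blocks/nblocks inputs
 * are hypothetical.
 */
static int example_rlist_use(struct gfs2_inode *ip, const u64 *blocks,
			     unsigned int nblocks)
{
	struct gfs2_rgrp_list rlist;
	unsigned int x;
	int error;

	memset(&rlist, 0, sizeof(rlist));
	for (x = 0; x < nblocks; x++)
		gfs2_rlist_add(ip, &rlist, blocks[x]);	/* dedups rgrps */

	gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);	/* init the holders */
	error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
	if (!error) {
		/* ... free/unlink blocks within the locked rgrps ... */
		gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
	}
	gfs2_rlist_free(&rlist);
	return error;
}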
2619 
2620