xref: /openbmc/linux/fs/gfs2/rgrp.c (revision 93f5715e)
1 /*
2  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
3  * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
4  *
5  * This copyrighted material is made available to anyone wishing to use,
6  * modify, copy, or redistribute it subject to the terms and conditions
7  * of the GNU General Public License version 2.
8  */
9 
10 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11 
12 #include <linux/slab.h>
13 #include <linux/spinlock.h>
14 #include <linux/completion.h>
15 #include <linux/buffer_head.h>
16 #include <linux/fs.h>
17 #include <linux/gfs2_ondisk.h>
18 #include <linux/prefetch.h>
19 #include <linux/blkdev.h>
20 #include <linux/rbtree.h>
21 #include <linux/random.h>
22 
23 #include "gfs2.h"
24 #include "incore.h"
25 #include "glock.h"
26 #include "glops.h"
27 #include "lops.h"
28 #include "meta_io.h"
29 #include "quota.h"
30 #include "rgrp.h"
31 #include "super.h"
32 #include "trans.h"
33 #include "util.h"
34 #include "log.h"
35 #include "inode.h"
36 #include "trace_gfs2.h"
37 #include "dir.h"
38 
39 #define BFITNOENT ((u32)~0)
40 #define NO_BLOCK ((u64)~0)
41 
42 #if BITS_PER_LONG == 32
43 #define LBITMASK   (0x55555555UL)
44 #define LBITSKIP55 (0x55555555UL)
45 #define LBITSKIP00 (0x00000000UL)
46 #else
47 #define LBITMASK   (0x5555555555555555UL)
48 #define LBITSKIP55 (0x5555555555555555UL)
49 #define LBITSKIP00 (0x0000000000000000UL)
50 #endif
51 
52 /*
53  * These routines are used by the resource group routines (rgrp.c)
54  * to keep track of block allocation.  Each block is represented by two
55  * bits.  So, each byte represents GFS2_NBBY (i.e. 4) blocks.
56  *
57  * 0 = Free
58  * 1 = Used (not metadata)
59  * 2 = Unlinked (still in use) inode
60  * 3 = Used (metadata)
61  */
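
/*
 * For illustration (not part of the original code): with two bits per
 * block and the least significant pair first, a single bitmap byte of
 * 0xE4 (binary 11 10 01 00) would describe four blocks as:
 *
 *   block 0: 00 = free
 *   block 1: 01 = used (data)
 *   block 2: 10 = unlinked (still in use) inode
 *   block 3: 11 = used (metadata)
 */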
62 
63 struct gfs2_extent {
64 	struct gfs2_rbm rbm;
65 	u32 len;
66 };
67 
68 static const char valid_change[16] = {
69 	        /* current */
70 	/* n */ 0, 1, 1, 1,
71 	/* e */ 1, 0, 0, 0,
72 	/* w */ 0, 0, 0, 1,
73 	        1, 0, 0, 0
74 };
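
/*
 * A note on how valid_change is indexed (see gfs2_setbit below): the
 * table is consulted as valid_change[new_state * 4 + cur_state], so each
 * row is a new state and each column a current state. For example,
 * freeing a used data block (new=0, cur=1) hits entry 1, which is 1
 * (allowed), while "freeing" an already free block (new=0, cur=0) hits
 * entry 0, which is 0 and triggers the consistency error path.
 */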
75 
76 static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
77 			 const struct gfs2_inode *ip, bool nowrap);
78 
79 
80 /**
81  * gfs2_setbit - Set a bit in the bitmaps
82  * @rbm: The position of the bit to set
83  * @do_clone: Also set the clone bitmap, if it exists
84  * @new_state: the new state of the block
85  *
86  */
87 
88 static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
89 			       unsigned char new_state)
90 {
91 	unsigned char *byte1, *byte2, *end, cur_state;
92 	struct gfs2_bitmap *bi = rbm_bi(rbm);
93 	unsigned int buflen = bi->bi_len;
94 	const unsigned int bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
95 
96 	byte1 = bi->bi_bh->b_data + bi->bi_offset + (rbm->offset / GFS2_NBBY);
97 	end = bi->bi_bh->b_data + bi->bi_offset + buflen;
98 
99 	BUG_ON(byte1 >= end);
100 
101 	cur_state = (*byte1 >> bit) & GFS2_BIT_MASK;
102 
103 	if (unlikely(!valid_change[new_state * 4 + cur_state])) {
104 		pr_warn("buf_blk = 0x%x old_state=%d, new_state=%d\n",
105 			rbm->offset, cur_state, new_state);
106 		pr_warn("rgrp=0x%llx bi_start=0x%x\n",
107 			(unsigned long long)rbm->rgd->rd_addr, bi->bi_start);
108 		pr_warn("bi_offset=0x%x bi_len=0x%x\n",
109 			bi->bi_offset, bi->bi_len);
110 		dump_stack();
111 		gfs2_consist_rgrpd(rbm->rgd);
112 		return;
113 	}
114 	*byte1 ^= (cur_state ^ new_state) << bit;
115 
116 	if (do_clone && bi->bi_clone) {
117 		byte2 = bi->bi_clone + bi->bi_offset + (rbm->offset / GFS2_NBBY);
118 		cur_state = (*byte2 >> bit) & GFS2_BIT_MASK;
119 		*byte2 ^= (cur_state ^ new_state) << bit;
120 	}
121 }
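
/*
 * Worked example (illustrative only): the update
 * "*byte1 ^= (cur_state ^ new_state) << bit" flips exactly the bits that
 * differ between the old and new 2-bit states, leaving the other three
 * block states in the byte untouched. E.g. changing block 1 of a byte
 * from used (01) to free (00): cur ^ new = 01, shifted to bit position 2
 * gives 0x04, and the XOR clears just that one bit.
 */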
122 
123 /**
124  * gfs2_testbit - test a bit in the bitmaps
125  * @rbm: The bit to test
126  * @use_clone: If true, test the clone bitmap, not the official bitmap.
127  *
128  * Some callers like gfs2_unaligned_extlen need to test the clone bitmaps,
129  * not the "real" bitmaps, to avoid allocating recently freed blocks.
130  *
131  * Returns: The two bit block state of the requested bit
132  */
133 
134 static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm, bool use_clone)
135 {
136 	struct gfs2_bitmap *bi = rbm_bi(rbm);
137 	const u8 *buffer;
138 	const u8 *byte;
139 	unsigned int bit;
140 
141 	if (use_clone && bi->bi_clone)
142 		buffer = bi->bi_clone;
143 	else
144 		buffer = bi->bi_bh->b_data;
145 	buffer += bi->bi_offset;
146 	byte = buffer + (rbm->offset / GFS2_NBBY);
147 	bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
148 
149 	return (*byte >> bit) & GFS2_BIT_MASK;
150 }
151 
152 /**
153  * gfs2_bit_search
154  * @ptr: Pointer to bitmap data
155  * @mask: Mask to use (normally 0x55555.... but adjusted for search start)
156  * @state: The state we are searching for
157  *
158  * We xor the bitmap data with a pattern which is the bitwise opposite
159  * of what we are looking for; this gives rise to a pattern of ones
160  * wherever there is a match. Since we have two bits per entry, we
161  * take this pattern, shift it down by one place and then AND it with
162  * the original. All the even bit positions (0,2,4, etc) then represent
163  * successful matches, so we mask with 0x55555..... to remove the unwanted
164  * odd bit positions.
165  *
166  * This allows searching of a whole u64 at once (32 blocks) with a
167  * single test (on 64 bit arches).
168  */
169 
170 static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
171 {
172 	u64 tmp;
173 	static const u64 search[] = {
174 		[0] = 0xffffffffffffffffULL,
175 		[1] = 0xaaaaaaaaaaaaaaaaULL,
176 		[2] = 0x5555555555555555ULL,
177 		[3] = 0x0000000000000000ULL,
178 	};
179 	tmp = le64_to_cpu(*ptr) ^ search[state];
180 	tmp &= (tmp >> 1);
181 	tmp &= mask;
182 	return tmp;
183 }
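
/*
 * A worked example of the search trick above (illustrative only), using
 * one byte of bitmap, 0x19 (blocks 0..3 in states 1, 2, 1, 0), searching
 * for GFS2_BLKST_FREE (state 0, search pattern 0xff...):
 *
 *	tmp  = 0x19 ^ 0xff;	// 0xe6 (binary 1110 0110)
 *	tmp &= tmp >> 1;	// 0xe6 & 0x73 = 0x62
 *	tmp &= 0x55;		// 0x40
 *
 * Bit 6 is set, and 6 / 2 = 3, so block 3 is the only free block in
 * this byte, matching the original states above.
 */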
184 
185 /**
186  * rs_cmp - multi-block reservation range compare
187  * @blk: absolute file system block number of the new reservation
188  * @len: number of blocks in the new reservation
189  * @rs: existing reservation to compare against
190  *
191  * returns: 1 if the block range is beyond the reach of the reservation
192  *         -1 if the block range is before the start of the reservation
193  *          0 if the block range overlaps with the reservation
194  */
195 static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs)
196 {
197 	u64 startblk = gfs2_rbm_to_block(&rs->rs_rbm);
198 
199 	if (blk >= startblk + rs->rs_free)
200 		return 1;
201 	if (blk + len - 1 < startblk)
202 		return -1;
203 	return 0;
204 }
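
/*
 * For illustration (hypothetical numbers): with a reservation starting
 * at block 1000 and rs_free = 16, the reserved range is 1000..1015, so:
 *
 *	rs_cmp(1016,  4, rs) ==  1	// 1016..1019 lies after the range
 *	rs_cmp( 990, 10, rs) == -1	//  990.. 999 lies before the range
 *	rs_cmp( 998,  4, rs) ==  0	//  998..1001 overlaps the range
 */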
205 
206 /**
207  * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
208  *       a block in a given allocation state.
209  * @buf: the buffer that holds the bitmaps
210  * @len: the length (in bytes) of the buffer
211  * @goal: start search at this block's bit-pair (within @buf)
212  * @state: GFS2_BLKST_XXX the state of the block we're looking for.
213  *
214  * Scope of @goal and returned block number is only within this bitmap buffer,
215  * not entire rgrp or filesystem.  @buf will be offset from the actual
216  * beginning of a bitmap block buffer, skipping any header structures, but
217  * headers are always a multiple of 64 bits long so that the buffer is
218  * always aligned to a 64 bit boundary.
219  *
220  * The size of the buffer is in bytes, but it is assumed that it is
221  * always ok to read a complete multiple of 64 bits at the end
222  * of the block in case the end is not aligned to a natural boundary.
223  *
224  * Return: the block number (bitmap buffer scope) that was found
225  */
226 
227 static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
228 		       u32 goal, u8 state)
229 {
230 	u32 spoint = (goal << 1) & ((8*sizeof(u64)) - 1);
231 	const __le64 *ptr = ((__le64 *)buf) + (goal >> 5);
232 	const __le64 *end = (__le64 *)(buf + ALIGN(len, sizeof(u64)));
233 	u64 tmp;
234 	u64 mask = 0x5555555555555555ULL;
235 	u32 bit;
236 
237 	/* Mask off bits we don't care about at the start of the search */
238 	mask <<= spoint;
239 	tmp = gfs2_bit_search(ptr, mask, state);
240 	ptr++;
241 	while(tmp == 0 && ptr < end) {
242 		tmp = gfs2_bit_search(ptr, 0x5555555555555555ULL, state);
243 		ptr++;
244 	}
245 	/* Mask off any bits which are more than len bytes from the start */
246 	if (ptr == end && (len & (sizeof(u64) - 1)))
247 		tmp &= (((u64)~0) >> (64 - 8*(len & (sizeof(u64) - 1))));
248 	/* Didn't find anything, so return */
249 	if (tmp == 0)
250 		return BFITNOENT;
251 	ptr--;
252 	bit = __ffs64(tmp);
253 	bit /= 2;	/* two bits per entry in the bitmap */
254 	return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit;
255 }
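
/*
 * Illustrative walk-through of the initial pointer arithmetic above:
 * each u64 word covers 32 bit-pairs, so for goal = 70 the search starts
 * at word 70 >> 5 = 2, with spoint = (70 << 1) & 63 = 12, i.e. bit 12 of
 * that word, which is entry 70 - 64 = 6 within it (6 entries * 2 bits).
 * The mask 0x5555... << 12 then hides all entries before the goal.
 */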
256 
257 /**
258  * gfs2_rbm_from_block - Set the rbm based upon rgd and block number
259  * @rbm: The rbm with rgd already set correctly
260  * @block: The block number (filesystem relative)
261  *
262  * This sets the bi and offset members of an rbm based on a
263  * resource group and a filesystem relative block number. The
264  * resource group must be set in the rbm on entry, the bi and
265  * offset members will be set by this function.
266  *
267  * Returns: 0 on success, or an error code
268  */
269 
270 static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
271 {
272 	u64 rblock = block - rbm->rgd->rd_data0;
273 
274 	if (WARN_ON_ONCE(rblock > UINT_MAX))
275 		return -EINVAL;
276 	if (block >= rbm->rgd->rd_data0 + rbm->rgd->rd_data)
277 		return -E2BIG;
278 
279 	rbm->bii = 0;
280 	rbm->offset = (u32)(rblock);
281 	/* Check if the block is within the first bitmap */
282 	if (rbm->offset < rbm_bi(rbm)->bi_blocks)
283 		return 0;
284 
285 	/* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */
286 	rbm->offset += (sizeof(struct gfs2_rgrp) -
287 			sizeof(struct gfs2_meta_header)) * GFS2_NBBY;
288 	rbm->bii = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
289 	rbm->offset -= rbm->bii * rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
290 	return 0;
291 }
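
/*
 * A sketch of the adjustment above, with hypothetical sizes: if struct
 * gfs2_rgrp were 104 bytes larger than struct gfs2_meta_header, the
 * first bitmap would hold 104 * GFS2_NBBY = 416 fewer blocks than a
 * bitmap-only block. Adding that difference back in converts the offset
 * into one computed as if every bitmap block had a plain metadata
 * header, after which the divide/modulo by sd_blocks_per_bitmap yields
 * the bitmap index and the offset within that bitmap.
 */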
292 
293 /**
294  * gfs2_rbm_incr - increment an rbm structure
295  * @rbm: The rbm with rgd already set correctly
296  *
297  * This function takes an existing rbm structure and increments it to the next
298  * viable block offset.
299  *
300  * Returns: If incrementing the offset would cause the rbm to go past the
301  *          end of the rgrp, true is returned, otherwise false.
302  *
303  */
304 
305 static bool gfs2_rbm_incr(struct gfs2_rbm *rbm)
306 {
307 	if (rbm->offset + 1 < rbm_bi(rbm)->bi_blocks) { /* in the same bitmap */
308 		rbm->offset++;
309 		return false;
310 	}
311 	if (rbm->bii == rbm->rgd->rd_length - 1) /* at the last bitmap */
312 		return true;
313 
314 	rbm->offset = 0;
315 	rbm->bii++;
316 	return false;
317 }
318 
319 /**
320  * gfs2_unaligned_extlen - Look for free blocks which are not byte aligned
321  * @rbm: Position to search (value/result)
322  * @n_unaligned: Number of unaligned blocks to check
323  * @len: Decremented for each block found (terminate on zero)
324  *
325  * Returns: true if a non-free block is encountered
326  */
327 
328 static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *len)
329 {
330 	u32 n;
331 	u8 res;
332 
333 	for (n = 0; n < n_unaligned; n++) {
334 		res = gfs2_testbit(rbm, true);
335 		if (res != GFS2_BLKST_FREE)
336 			return true;
337 		(*len)--;
338 		if (*len == 0)
339 			return true;
340 		if (gfs2_rbm_incr(rbm))
341 			return true;
342 	}
343 
344 	return false;
345 }
346 
347 /**
348  * gfs2_free_extlen - Return extent length of free blocks
349  * @rrbm: Starting position
350  * @len: Max length to check
351  *
352  * Starting at the block specified by the rbm, see how many free blocks
353  * there are, not reading more than len blocks ahead. This can be done
354  * using memchr_inv when the blocks are byte aligned, but has to be done
355  * on a block-by-block basis in case of unaligned blocks. Also this
356  * function can cope with bitmap boundaries (although it must stop on
357  * a resource group boundary).
358  *
359  * Returns: Number of free blocks in the extent
360  */
361 
362 static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
363 {
364 	struct gfs2_rbm rbm = *rrbm;
365 	u32 n_unaligned = rbm.offset & 3;
366 	u32 size = len;
367 	u32 bytes;
368 	u32 chunk_size;
369 	u8 *ptr, *start, *end;
370 	u64 block;
371 	struct gfs2_bitmap *bi;
372 
373 	if (n_unaligned &&
374 	    gfs2_unaligned_extlen(&rbm, 4 - n_unaligned, &len))
375 		goto out;
376 
377 	n_unaligned = len & 3;
378 	/* Start is now byte aligned */
379 	while (len > 3) {
380 		bi = rbm_bi(&rbm);
381 		start = bi->bi_bh->b_data;
382 		if (bi->bi_clone)
383 			start = bi->bi_clone;
384 		start += bi->bi_offset;
385 		end = start + bi->bi_len;
386 		BUG_ON(rbm.offset & 3);
387 		start += (rbm.offset / GFS2_NBBY);
388 		bytes = min_t(u32, len / GFS2_NBBY, (end - start));
389 		ptr = memchr_inv(start, 0, bytes);
390 		chunk_size = ((ptr == NULL) ? bytes : (ptr - start));
391 		chunk_size *= GFS2_NBBY;
392 		BUG_ON(len < chunk_size);
393 		len -= chunk_size;
394 		block = gfs2_rbm_to_block(&rbm);
395 		if (gfs2_rbm_from_block(&rbm, block + chunk_size)) {
396 			n_unaligned = 0;
397 			break;
398 		}
399 		if (ptr) {
400 			n_unaligned = 3;
401 			break;
402 		}
403 		n_unaligned = len & 3;
404 	}
405 
406 	/* Deal with any bits left over at the end */
407 	if (n_unaligned)
408 		gfs2_unaligned_extlen(&rbm, n_unaligned, &len);
409 out:
410 	return size - len;
411 }
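
/*
 * Note on the scan above: a run of four free blocks is a 0x00 byte, so
 * once the position is byte aligned, free space can be measured by
 * finding the first non-zero byte. A minimal sketch of that step:
 *
 *	ptr = memchr_inv(start, 0, bytes);
 *	chunk_size = (ptr ? ptr - start : bytes) * GFS2_NBBY;
 *
 * If a non-zero byte was found, up to three of its blocks may still be
 * free, which is why the remaining tail is re-checked block by block.
 */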
412 
413 /**
414  * gfs2_bitcount - count the number of bits in a certain state
415  * @rgd: the resource group descriptor
416  * @buffer: the buffer that holds the bitmaps
417  * @buflen: the length (in bytes) of the buffer
418  * @state: the state of the block we're looking for
419  *
420  * Returns: The number of bits
421  */
422 
423 static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, const u8 *buffer,
424 			 unsigned int buflen, u8 state)
425 {
426 	const u8 *byte = buffer;
427 	const u8 *end = buffer + buflen;
428 	const u8 state1 = state << 2;
429 	const u8 state2 = state << 4;
430 	const u8 state3 = state << 6;
431 	u32 count = 0;
432 
433 	for (; byte < end; byte++) {
434 		if (((*byte) & 0x03) == state)
435 			count++;
436 		if (((*byte) & 0x0C) == state1)
437 			count++;
438 		if (((*byte) & 0x30) == state2)
439 			count++;
440 		if (((*byte) & 0xC0) == state3)
441 			count++;
442 	}
443 
444 	return count;
445 }
446 
447 /**
448  * gfs2_rgrp_verify - Verify that a resource group is consistent
449  * @rgd: the rgrp
450  *
451  */
452 
453 void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
454 {
455 	struct gfs2_sbd *sdp = rgd->rd_sbd;
456 	struct gfs2_bitmap *bi = NULL;
457 	u32 length = rgd->rd_length;
458 	u32 count[4], tmp;
459 	int buf, x;
460 
461 	memset(count, 0, 4 * sizeof(u32));
462 
463 	/* Count # blocks in each of 4 possible allocation states */
464 	for (buf = 0; buf < length; buf++) {
465 		bi = rgd->rd_bits + buf;
466 		for (x = 0; x < 4; x++)
467 			count[x] += gfs2_bitcount(rgd,
468 						  bi->bi_bh->b_data +
469 						  bi->bi_offset,
470 						  bi->bi_len, x);
471 	}
472 
473 	if (count[0] != rgd->rd_free) {
474 		if (gfs2_consist_rgrpd(rgd))
475 			fs_err(sdp, "free data mismatch:  %u != %u\n",
476 			       count[0], rgd->rd_free);
477 		return;
478 	}
479 
480 	tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
481 	if (count[1] != tmp) {
482 		if (gfs2_consist_rgrpd(rgd))
483 			fs_err(sdp, "used data mismatch:  %u != %u\n",
484 			       count[1], tmp);
485 		return;
486 	}
487 
488 	if (count[2] + count[3] != rgd->rd_dinodes) {
489 		if (gfs2_consist_rgrpd(rgd))
490 			fs_err(sdp, "used metadata mismatch:  %u != %u\n",
491 			       count[2] + count[3], rgd->rd_dinodes);
492 		return;
493 	}
494 }
495 
496 /**
497  * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
498  * @sdp: The GFS2 superblock
499  * @blk: The data block number
500  * @exact: True if this needs to be an exact match
501  *
502  * The @exact argument should be set to true by most callers. The exception
503  * is when we need to match blocks which are not represented by the rgrp
504  * bitmap, but which are part of the rgrp (i.e. padding blocks) which are
505  * there for alignment purposes. Another way of looking at it is that @exact
506  * matches only valid data/metadata blocks, but with @exact false, it will
507  * match any block within the extent of the rgrp.
508  *
509  * Returns: The resource group, or NULL if not found
510  */
511 
512 struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact)
513 {
514 	struct rb_node *n, *next;
515 	struct gfs2_rgrpd *cur;
516 
517 	spin_lock(&sdp->sd_rindex_spin);
518 	n = sdp->sd_rindex_tree.rb_node;
519 	while (n) {
520 		cur = rb_entry(n, struct gfs2_rgrpd, rd_node);
521 		next = NULL;
522 		if (blk < cur->rd_addr)
523 			next = n->rb_left;
524 		else if (blk >= cur->rd_data0 + cur->rd_data)
525 			next = n->rb_right;
526 		if (next == NULL) {
527 			spin_unlock(&sdp->sd_rindex_spin);
528 			if (exact) {
529 				if (blk < cur->rd_addr)
530 					return NULL;
531 				if (blk >= cur->rd_data0 + cur->rd_data)
532 					return NULL;
533 			}
534 			return cur;
535 		}
536 		n = next;
537 	}
538 	spin_unlock(&sdp->sd_rindex_spin);
539 
540 	return NULL;
541 }
542 
543 /**
544  * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem
545  * @sdp: The GFS2 superblock
546  *
547  * Returns: The first rgrp in the filesystem
548  */
549 
550 struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
551 {
552 	const struct rb_node *n;
553 	struct gfs2_rgrpd *rgd;
554 
555 	spin_lock(&sdp->sd_rindex_spin);
556 	n = rb_first(&sdp->sd_rindex_tree);
557 	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
558 	spin_unlock(&sdp->sd_rindex_spin);
559 
560 	return rgd;
561 }
562 
563 /**
564  * gfs2_rgrpd_get_next - get the next RG
565  * @rgd: the resource group descriptor
566  *
567  * Returns: The next rgrp
568  */
569 
570 struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
571 {
572 	struct gfs2_sbd *sdp = rgd->rd_sbd;
573 	const struct rb_node *n;
574 
575 	spin_lock(&sdp->sd_rindex_spin);
576 	n = rb_next(&rgd->rd_node);
577 	if (n == NULL)
578 		n = rb_first(&sdp->sd_rindex_tree);
579 
580 	if (unlikely(&rgd->rd_node == n)) {
581 		spin_unlock(&sdp->sd_rindex_spin);
582 		return NULL;
583 	}
584 	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
585 	spin_unlock(&sdp->sd_rindex_spin);
586 	return rgd;
587 }
588 
589 void check_and_update_goal(struct gfs2_inode *ip)
590 {
591 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
592 	if (!ip->i_goal || gfs2_blk2rgrpd(sdp, ip->i_goal, 1) == NULL)
593 		ip->i_goal = ip->i_no_addr;
594 }
595 
596 void gfs2_free_clones(struct gfs2_rgrpd *rgd)
597 {
598 	int x;
599 
600 	for (x = 0; x < rgd->rd_length; x++) {
601 		struct gfs2_bitmap *bi = rgd->rd_bits + x;
602 		kfree(bi->bi_clone);
603 		bi->bi_clone = NULL;
604 	}
605 }
606 
607 /**
608  * gfs2_rsqa_alloc - make sure we have a reservation assigned to the inode
609  *                 plus a quota allocation data structure, if necessary
610  * @ip: the inode for this reservation
611  */
612 int gfs2_rsqa_alloc(struct gfs2_inode *ip)
613 {
614 	return gfs2_qa_alloc(ip);
615 }
616 
617 static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs)
618 {
619 	struct gfs2_inode *ip = container_of(rs, struct gfs2_inode, i_res);
620 
621 	gfs2_print_dbg(seq, "  B: n:%llu s:%llu b:%u f:%u\n",
622 		       (unsigned long long)ip->i_no_addr,
623 		       (unsigned long long)gfs2_rbm_to_block(&rs->rs_rbm),
624 		       rs->rs_rbm.offset, rs->rs_free);
625 }
626 
627 /**
628  * __rs_deltree - remove a multi-block reservation from the rgd tree
629  * @rs: The reservation to remove
630  *
631  */
632 static void __rs_deltree(struct gfs2_blkreserv *rs)
633 {
634 	struct gfs2_rgrpd *rgd;
635 
636 	if (!gfs2_rs_active(rs))
637 		return;
638 
639 	rgd = rs->rs_rbm.rgd;
640 	trace_gfs2_rs(rs, TRACE_RS_TREEDEL);
641 	rb_erase(&rs->rs_node, &rgd->rd_rstree);
642 	RB_CLEAR_NODE(&rs->rs_node);
643 
644 	if (rs->rs_free) {
645 		struct gfs2_bitmap *bi = rbm_bi(&rs->rs_rbm);
646 
647 		/* return reserved blocks to the rgrp */
648 		BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free);
649 		rs->rs_rbm.rgd->rd_reserved -= rs->rs_free;
650 		/* The rgrp extent failure point is likely not to increase;
651 		   it will only do so if the freed blocks are somehow
652 		   contiguous with a span of free blocks that follows. Still,
653 		   it will force the number to be recalculated later. */
654 		rgd->rd_extfail_pt += rs->rs_free;
655 		rs->rs_free = 0;
656 		clear_bit(GBF_FULL, &bi->bi_flags);
657 	}
658 }
659 
660 /**
661  * gfs2_rs_deltree - remove a multi-block reservation from the rgd tree
662  * @rs: The reservation to remove
663  *
664  */
665 void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
666 {
667 	struct gfs2_rgrpd *rgd;
668 
669 	rgd = rs->rs_rbm.rgd;
670 	if (rgd) {
671 		spin_lock(&rgd->rd_rsspin);
672 		__rs_deltree(rs);
673 		BUG_ON(rs->rs_free);
674 		spin_unlock(&rgd->rd_rsspin);
675 	}
676 }
677 
678 /**
679  * gfs2_rsqa_delete - delete a multi-block reservation and quota allocation
680  * @ip: The inode for this reservation
681  * @wcount: The inode's write count, or NULL
682  *
683  */
684 void gfs2_rsqa_delete(struct gfs2_inode *ip, atomic_t *wcount)
685 {
686 	down_write(&ip->i_rw_mutex);
687 	if ((wcount == NULL) || (atomic_read(wcount) <= 1))
688 		gfs2_rs_deltree(&ip->i_res);
689 	up_write(&ip->i_rw_mutex);
690 	gfs2_qa_delete(ip, wcount);
691 }
692 
693 /**
694  * return_all_reservations - return all reserved blocks back to the rgrp.
695  * @rgd: the rgrp that needs its space back
696  *
697  * We previously reserved a bunch of blocks for allocation. Now we need to
698  * give them back. This leaves the reservation structures intact, but removes
699  * all of their corresponding "no-fly zones".
700  */
701 static void return_all_reservations(struct gfs2_rgrpd *rgd)
702 {
703 	struct rb_node *n;
704 	struct gfs2_blkreserv *rs;
705 
706 	spin_lock(&rgd->rd_rsspin);
707 	while ((n = rb_first(&rgd->rd_rstree))) {
708 		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
709 		__rs_deltree(rs);
710 	}
711 	spin_unlock(&rgd->rd_rsspin);
712 }
713 
714 void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
715 {
716 	struct rb_node *n;
717 	struct gfs2_rgrpd *rgd;
718 	struct gfs2_glock *gl;
719 
720 	while ((n = rb_first(&sdp->sd_rindex_tree))) {
721 		rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
722 		gl = rgd->rd_gl;
723 
724 		rb_erase(n, &sdp->sd_rindex_tree);
725 
726 		if (gl) {
727 			glock_clear_object(gl, rgd);
728 			gfs2_glock_put(gl);
729 		}
730 
731 		gfs2_free_clones(rgd);
732 		kfree(rgd->rd_bits);
733 		rgd->rd_bits = NULL;
734 		return_all_reservations(rgd);
735 		kmem_cache_free(gfs2_rgrpd_cachep, rgd);
736 	}
737 }
738 
739 static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd)
740 {
741 	pr_info("ri_addr = %llu\n", (unsigned long long)rgd->rd_addr);
742 	pr_info("ri_length = %u\n", rgd->rd_length);
743 	pr_info("ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0);
744 	pr_info("ri_data = %u\n", rgd->rd_data);
745 	pr_info("ri_bitbytes = %u\n", rgd->rd_bitbytes);
746 }
747 
748 /**
749  * compute_bitstructs - Compute the bitmap sizes
750  * @rgd: The resource group descriptor
751  *
752  * Calculates bitmap descriptors, one for each block that contains bitmap data
753  *
754  * Returns: errno
755  */
756 
757 static int compute_bitstructs(struct gfs2_rgrpd *rgd)
758 {
759 	struct gfs2_sbd *sdp = rgd->rd_sbd;
760 	struct gfs2_bitmap *bi;
761 	u32 length = rgd->rd_length; /* # blocks in hdr & bitmap */
762 	u32 bytes_left, bytes;
763 	int x;
764 
765 	if (!length)
766 		return -EINVAL;
767 
768 	rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_NOFS);
769 	if (!rgd->rd_bits)
770 		return -ENOMEM;
771 
772 	bytes_left = rgd->rd_bitbytes;
773 
774 	for (x = 0; x < length; x++) {
775 		bi = rgd->rd_bits + x;
776 
777 		bi->bi_flags = 0;
778 		/* small rgrp; bitmap stored completely in header block */
779 		if (length == 1) {
780 			bytes = bytes_left;
781 			bi->bi_offset = sizeof(struct gfs2_rgrp);
782 			bi->bi_start = 0;
783 			bi->bi_len = bytes;
784 			bi->bi_blocks = bytes * GFS2_NBBY;
785 		/* header block */
786 		} else if (x == 0) {
787 			bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
788 			bi->bi_offset = sizeof(struct gfs2_rgrp);
789 			bi->bi_start = 0;
790 			bi->bi_len = bytes;
791 			bi->bi_blocks = bytes * GFS2_NBBY;
792 		/* last block */
793 		} else if (x + 1 == length) {
794 			bytes = bytes_left;
795 			bi->bi_offset = sizeof(struct gfs2_meta_header);
796 			bi->bi_start = rgd->rd_bitbytes - bytes_left;
797 			bi->bi_len = bytes;
798 			bi->bi_blocks = bytes * GFS2_NBBY;
799 		/* other blocks */
800 		} else {
801 			bytes = sdp->sd_sb.sb_bsize -
802 				sizeof(struct gfs2_meta_header);
803 			bi->bi_offset = sizeof(struct gfs2_meta_header);
804 			bi->bi_start = rgd->rd_bitbytes - bytes_left;
805 			bi->bi_len = bytes;
806 			bi->bi_blocks = bytes * GFS2_NBBY;
807 		}
808 
809 		bytes_left -= bytes;
810 	}
811 
812 	if (bytes_left) {
813 		gfs2_consist_rgrpd(rgd);
814 		return -EIO;
815 	}
816 	bi = rgd->rd_bits + (length - 1);
817 	if ((bi->bi_start + bi->bi_len) * GFS2_NBBY != rgd->rd_data) {
818 		if (gfs2_consist_rgrpd(rgd)) {
819 			gfs2_rindex_print(rgd);
820 			fs_err(sdp, "start=%u len=%u offset=%u\n",
821 			       bi->bi_start, bi->bi_len, bi->bi_offset);
822 		}
823 		return -EIO;
824 	}
825 
826 	return 0;
827 }
828 
829 /**
830  * gfs2_ri_total - Total up the file system space, according to the rindex.
831  * @sdp: the filesystem
832  *
833  */
834 u64 gfs2_ri_total(struct gfs2_sbd *sdp)
835 {
836 	u64 total_data = 0;
837 	struct inode *inode = sdp->sd_rindex;
838 	struct gfs2_inode *ip = GFS2_I(inode);
839 	char buf[sizeof(struct gfs2_rindex)];
840 	int error, rgrps;
841 
842 	for (rgrps = 0;; rgrps++) {
843 		loff_t pos = rgrps * sizeof(struct gfs2_rindex);
844 
845 		if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode))
846 			break;
847 		error = gfs2_internal_read(ip, buf, &pos,
848 					   sizeof(struct gfs2_rindex));
849 		if (error != sizeof(struct gfs2_rindex))
850 			break;
851 		total_data += be32_to_cpu(((struct gfs2_rindex *)buf)->ri_data);
852 	}
853 	return total_data;
854 }
855 
856 static int rgd_insert(struct gfs2_rgrpd *rgd)
857 {
858 	struct gfs2_sbd *sdp = rgd->rd_sbd;
859 	struct rb_node **newn = &sdp->sd_rindex_tree.rb_node, *parent = NULL;
860 
861 	/* Figure out where to put new node */
862 	while (*newn) {
863 		struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd,
864 						  rd_node);
865 
866 		parent = *newn;
867 		if (rgd->rd_addr < cur->rd_addr)
868 			newn = &((*newn)->rb_left);
869 		else if (rgd->rd_addr > cur->rd_addr)
870 			newn = &((*newn)->rb_right);
871 		else
872 			return -EEXIST;
873 	}
874 
875 	rb_link_node(&rgd->rd_node, parent, newn);
876 	rb_insert_color(&rgd->rd_node, &sdp->sd_rindex_tree);
877 	sdp->sd_rgrps++;
878 	return 0;
879 }
880 
881 /**
882  * read_rindex_entry - Pull in a new resource index entry from the disk
883  * @ip: Pointer to the rindex inode
884  *
885  * Returns: 0 on success, > 0 on EOF, error code otherwise
886  */
887 
888 static int read_rindex_entry(struct gfs2_inode *ip)
889 {
890 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
891 	const unsigned bsize = sdp->sd_sb.sb_bsize;
892 	loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
893 	struct gfs2_rindex buf;
894 	int error;
895 	struct gfs2_rgrpd *rgd;
896 
897 	if (pos >= i_size_read(&ip->i_inode))
898 		return 1;
899 
900 	error = gfs2_internal_read(ip, (char *)&buf, &pos,
901 				   sizeof(struct gfs2_rindex));
902 
903 	if (error != sizeof(struct gfs2_rindex))
904 		return (error == 0) ? 1 : error;
905 
906 	rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS);
907 	error = -ENOMEM;
908 	if (!rgd)
909 		return error;
910 
911 	rgd->rd_sbd = sdp;
912 	rgd->rd_addr = be64_to_cpu(buf.ri_addr);
913 	rgd->rd_length = be32_to_cpu(buf.ri_length);
914 	rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
915 	rgd->rd_data = be32_to_cpu(buf.ri_data);
916 	rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
917 	spin_lock_init(&rgd->rd_rsspin);
918 
919 	error = compute_bitstructs(rgd);
920 	if (error)
921 		goto fail;
922 
923 	error = gfs2_glock_get(sdp, rgd->rd_addr,
924 			       &gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
925 	if (error)
926 		goto fail;
927 
928 	rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
929 	rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED);
930 	if (rgd->rd_data > sdp->sd_max_rg_data)
931 		sdp->sd_max_rg_data = rgd->rd_data;
932 	spin_lock(&sdp->sd_rindex_spin);
933 	error = rgd_insert(rgd);
934 	spin_unlock(&sdp->sd_rindex_spin);
935 	if (!error) {
936 		glock_set_object(rgd->rd_gl, rgd);
937 		rgd->rd_gl->gl_vm.start = (rgd->rd_addr * bsize) & PAGE_MASK;
938 		rgd->rd_gl->gl_vm.end = PAGE_ALIGN((rgd->rd_addr +
939 						    rgd->rd_length) * bsize) - 1;
940 		return 0;
941 	}
942 
943 	error = 0; /* someone else read in the rgrp; free it and ignore it */
944 	gfs2_glock_put(rgd->rd_gl);
945 
946 fail:
947 	kfree(rgd->rd_bits);
948 	rgd->rd_bits = NULL;
949 	kmem_cache_free(gfs2_rgrpd_cachep, rgd);
950 	return error;
951 }
952 
953 /**
954  * set_rgrp_preferences - Run all the rgrps, selecting some we prefer to use
955  * @sdp: the GFS2 superblock
956  *
957  * The purpose of this function is to select a subset of the resource groups
958  * and mark them as PREFERRED. We do it in such a way that each node prefers
959  * to use a unique set of rgrps to minimize glock contention.
960  */
961 static void set_rgrp_preferences(struct gfs2_sbd *sdp)
962 {
963 	struct gfs2_rgrpd *rgd, *first;
964 	int i;
965 
966 	/* Skip an initial number of rgrps, based on this node's journal ID.
967 	   That should start each node out on its own set. */
968 	rgd = gfs2_rgrpd_get_first(sdp);
969 	for (i = 0; i < sdp->sd_lockstruct.ls_jid; i++)
970 		rgd = gfs2_rgrpd_get_next(rgd);
971 	first = rgd;
972 
973 	do {
974 		rgd->rd_flags |= GFS2_RDF_PREFERRED;
975 		for (i = 0; i < sdp->sd_journals; i++) {
976 			rgd = gfs2_rgrpd_get_next(rgd);
977 			if (!rgd || rgd == first)
978 				break;
979 		}
980 	} while (rgd && rgd != first);
981 }
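
/*
 * For illustration (hypothetical layout): with 4 journals, the loop
 * above makes the node with journal ID 1 start at the second rgrp and
 * mark every 4th one (rgrps 1, 5, 9, ...) as PREFERRED, while the node
 * with journal ID 2 marks rgrps 2, 6, 10, ..., so each node's preferred
 * set is disjoint and cross-node glock contention is reduced.
 */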
982 
983 /**
984  * gfs2_ri_update - Pull in a new resource index from the disk
985  * @ip: pointer to the rindex inode
986  *
987  * Returns: 0 on successful update, error code otherwise
988  */
989 
990 static int gfs2_ri_update(struct gfs2_inode *ip)
991 {
992 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
993 	int error;
994 
995 	do {
996 		error = read_rindex_entry(ip);
997 	} while (error == 0);
998 
999 	if (error < 0)
1000 		return error;
1001 
1002 	set_rgrp_preferences(sdp);
1003 
1004 	sdp->sd_rindex_uptodate = 1;
1005 	return 0;
1006 }
1007 
1008 /**
1009  * gfs2_rindex_update - Update the rindex if required
1010  * @sdp: The GFS2 superblock
1011  *
1012  * We grab a lock on the rindex inode to make sure that it doesn't
1013  * change whilst we are performing an operation. We keep this lock
1014  * for quite long periods of time compared to other locks. This
1015  * doesn't matter, since it is shared and it is very, very rarely
1016  * accessed in the exclusive mode (i.e. only when expanding the filesystem).
1017  *
1018  * This makes sure that we're using the latest copy of the resource index
1019  * special file, which might have been updated if someone expanded the
1020  * filesystem (via gfs2_grow utility), which adds new resource groups.
1021  *
1022  * Returns: 0 on success, error code otherwise
1023  */
1024 
1025 int gfs2_rindex_update(struct gfs2_sbd *sdp)
1026 {
1027 	struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
1028 	struct gfs2_glock *gl = ip->i_gl;
1029 	struct gfs2_holder ri_gh;
1030 	int error = 0;
1031 	int unlock_required = 0;
1032 
1033 	/* Read new copy from disk if we don't have the latest */
1034 	if (!sdp->sd_rindex_uptodate) {
1035 		if (!gfs2_glock_is_locked_by_me(gl)) {
1036 			error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh);
1037 			if (error)
1038 				return error;
1039 			unlock_required = 1;
1040 		}
1041 		if (!sdp->sd_rindex_uptodate)
1042 			error = gfs2_ri_update(ip);
1043 		if (unlock_required)
1044 			gfs2_glock_dq_uninit(&ri_gh);
1045 	}
1046 
1047 	return error;
1048 }
1049 
1050 static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
1051 {
1052 	const struct gfs2_rgrp *str = buf;
1053 	u32 rg_flags;
1054 
1055 	rg_flags = be32_to_cpu(str->rg_flags);
1056 	rg_flags &= ~GFS2_RDF_MASK;
1057 	rgd->rd_flags &= GFS2_RDF_MASK;
1058 	rgd->rd_flags |= rg_flags;
1059 	rgd->rd_free = be32_to_cpu(str->rg_free);
1060 	rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
1061 	rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
1062 	/* rd_data0, rd_data and rd_bitbytes already set from rindex */
1063 }
1064 
1065 static void gfs2_rgrp_ondisk2lvb(struct gfs2_rgrp_lvb *rgl, const void *buf)
1066 {
1067 	const struct gfs2_rgrp *str = buf;
1068 
1069 	rgl->rl_magic = cpu_to_be32(GFS2_MAGIC);
1070 	rgl->rl_flags = str->rg_flags;
1071 	rgl->rl_free = str->rg_free;
1072 	rgl->rl_dinodes = str->rg_dinodes;
1073 	rgl->rl_igeneration = str->rg_igeneration;
1074 	rgl->__pad = 0UL;
1075 }
1076 
1077 static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
1078 {
1079 	struct gfs2_rgrpd *next = gfs2_rgrpd_get_next(rgd);
1080 	struct gfs2_rgrp *str = buf;
1081 	u32 crc;
1082 
1083 	str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK);
1084 	str->rg_free = cpu_to_be32(rgd->rd_free);
1085 	str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
1086 	if (next == NULL)
1087 		str->rg_skip = 0;
1088 	else if (next->rd_addr > rgd->rd_addr)
1089 		str->rg_skip = cpu_to_be32(next->rd_addr - rgd->rd_addr);
1090 	str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
1091 	str->rg_data0 = cpu_to_be64(rgd->rd_data0);
1092 	str->rg_data = cpu_to_be32(rgd->rd_data);
1093 	str->rg_bitbytes = cpu_to_be32(rgd->rd_bitbytes);
1094 	str->rg_crc = 0;
1095 	crc = gfs2_disk_hash(buf, sizeof(struct gfs2_rgrp));
1096 	str->rg_crc = cpu_to_be32(crc);
1097 
1098 	memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
1099 	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, buf);
1100 }
1101 
1102 static int gfs2_rgrp_lvb_valid(struct gfs2_rgrpd *rgd)
1103 {
1104 	struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
1105 	struct gfs2_rgrp *str = (struct gfs2_rgrp *)rgd->rd_bits[0].bi_bh->b_data;
1106 
1107 	if (rgl->rl_flags != str->rg_flags || rgl->rl_free != str->rg_free ||
1108 	    rgl->rl_dinodes != str->rg_dinodes ||
1109 	    rgl->rl_igeneration != str->rg_igeneration)
1110 		return 0;
1111 	return 1;
1112 }
1113 
1114 static u32 count_unlinked(struct gfs2_rgrpd *rgd)
1115 {
1116 	struct gfs2_bitmap *bi;
1117 	const u32 length = rgd->rd_length;
1118 	const u8 *buffer = NULL;
1119 	u32 i, goal, count = 0;
1120 
1121 	for (i = 0, bi = rgd->rd_bits; i < length; i++, bi++) {
1122 		goal = 0;
1123 		buffer = bi->bi_bh->b_data + bi->bi_offset;
1124 		WARN_ON(!buffer_uptodate(bi->bi_bh));
1125 		while (goal < bi->bi_len * GFS2_NBBY) {
1126 			goal = gfs2_bitfit(buffer, bi->bi_len, goal,
1127 					   GFS2_BLKST_UNLINKED);
1128 			if (goal == BFITNOENT)
1129 				break;
1130 			count++;
1131 			goal++;
1132 		}
1133 	}
1134 
1135 	return count;
1136 }
1137 
1138 
1139 /**
1140  * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
1141  * @rgd: the struct gfs2_rgrpd describing the RG to read in
1142  *
1143  * Read in all of a Resource Group's header and bitmap blocks.
1144  * Caller must eventually call gfs2_rgrp_brelse() to free the bitmaps.
1145  *
1146  * Returns: errno
1147  */
1148 
1149 static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
1150 {
1151 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1152 	struct gfs2_glock *gl = rgd->rd_gl;
1153 	unsigned int length = rgd->rd_length;
1154 	struct gfs2_bitmap *bi;
1155 	unsigned int x, y;
1156 	int error;
1157 
1158 	if (rgd->rd_bits[0].bi_bh != NULL)
1159 		return 0;
1160 
1161 	for (x = 0; x < length; x++) {
1162 		bi = rgd->rd_bits + x;
1163 		error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, 0, &bi->bi_bh);
1164 		if (error)
1165 			goto fail;
1166 	}
1167 
1168 	for (y = length; y--;) {
1169 		bi = rgd->rd_bits + y;
1170 		error = gfs2_meta_wait(sdp, bi->bi_bh);
1171 		if (error)
1172 			goto fail;
1173 		if (gfs2_metatype_check(sdp, bi->bi_bh, y ? GFS2_METATYPE_RB :
1174 					      GFS2_METATYPE_RG)) {
1175 			error = -EIO;
1176 			goto fail;
1177 		}
1178 	}
1179 
1180 	if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) {
1181 		for (x = 0; x < length; x++)
1182 			clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags);
1183 		gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
1184 		rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
1185 		rgd->rd_free_clone = rgd->rd_free;
1186 		/* max out the rgrp allocation failure point */
1187 		rgd->rd_extfail_pt = rgd->rd_free;
1188 	}
1189 	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
1190 		rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd));
1191 		gfs2_rgrp_ondisk2lvb(rgd->rd_rgl,
1192 				     rgd->rd_bits[0].bi_bh->b_data);
1193 	}
1194 	else if (sdp->sd_args.ar_rgrplvb) {
1195 		if (!gfs2_rgrp_lvb_valid(rgd)){
1196 			gfs2_consist_rgrpd(rgd);
1197 			error = -EIO;
1198 			goto fail;
1199 		}
1200 		if (rgd->rd_rgl->rl_unlinked == 0)
1201 			rgd->rd_flags &= ~GFS2_RDF_CHECK;
1202 	}
1203 	return 0;
1204 
1205 fail:
1206 	while (x--) {
1207 		bi = rgd->rd_bits + x;
1208 		brelse(bi->bi_bh);
1209 		bi->bi_bh = NULL;
1210 		gfs2_assert_warn(sdp, !bi->bi_clone);
1211 	}
1212 
1213 	return error;
1214 }
1215 
1216 static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
1217 {
1218 	u32 rl_flags;
1219 
1220 	if (rgd->rd_flags & GFS2_RDF_UPTODATE)
1221 		return 0;
1222 
1223 	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic)
1224 		return gfs2_rgrp_bh_get(rgd);
1225 
1226 	rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags);
1227 	rl_flags &= ~GFS2_RDF_MASK;
1228 	rgd->rd_flags &= GFS2_RDF_MASK;
1229 	rgd->rd_flags |= (rl_flags | GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
1230 	if (rgd->rd_rgl->rl_unlinked == 0)
1231 		rgd->rd_flags &= ~GFS2_RDF_CHECK;
1232 	rgd->rd_free = be32_to_cpu(rgd->rd_rgl->rl_free);
1233 	rgd->rd_free_clone = rgd->rd_free;
1234 	rgd->rd_dinodes = be32_to_cpu(rgd->rd_rgl->rl_dinodes);
1235 	rgd->rd_igeneration = be64_to_cpu(rgd->rd_rgl->rl_igeneration);
1236 	return 0;
1237 }
1238 
1239 int gfs2_rgrp_go_lock(struct gfs2_holder *gh)
1240 {
1241 	struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
1242 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1243 
1244 	if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb)
1245 		return 0;
1246 	return gfs2_rgrp_bh_get(rgd);
1247 }
1248 
1249 /**
1250  * gfs2_rgrp_brelse - Release RG bitmaps read in with gfs2_rgrp_bh_get()
1251  * @rgd: The resource group
1252  *
1253  */
1254 
1255 void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd)
1256 {
1257 	int x, length = rgd->rd_length;
1258 
1259 	for (x = 0; x < length; x++) {
1260 		struct gfs2_bitmap *bi = rgd->rd_bits + x;
1261 		if (bi->bi_bh) {
1262 			brelse(bi->bi_bh);
1263 			bi->bi_bh = NULL;
1264 		}
1265 	}
1266 
1267 }
1268 
1269 /**
1270  * gfs2_rgrp_go_unlock - Unlock a rgrp glock
1271  * @gh: The glock holder for the resource group
1272  *
1273  */
1274 
1275 void gfs2_rgrp_go_unlock(struct gfs2_holder *gh)
1276 {
1277 	struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
1278 	int demote_requested = test_bit(GLF_DEMOTE, &gh->gh_gl->gl_flags) |
1279 		test_bit(GLF_PENDING_DEMOTE, &gh->gh_gl->gl_flags);
1280 
1281 	if (rgd && demote_requested)
1282 		gfs2_rgrp_brelse(rgd);
1283 }
1284 
1285 int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
1286 			     struct buffer_head *bh,
1287 			     const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed)
1288 {
1289 	struct super_block *sb = sdp->sd_vfs;
1290 	u64 blk;
1291 	sector_t start = 0;
1292 	sector_t nr_blks = 0;
1293 	int rv;
1294 	unsigned int x;
1295 	u32 trimmed = 0;
1296 	u8 diff;
1297 
1298 	for (x = 0; x < bi->bi_len; x++) {
1299 		const u8 *clone = bi->bi_clone ? bi->bi_clone : bi->bi_bh->b_data;
1300 		clone += bi->bi_offset;
1301 		clone += x;
1302 		if (bh) {
1303 			const u8 *orig = bh->b_data + bi->bi_offset + x;
1304 			diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1));
1305 		} else {
1306 			diff = ~(*clone | (*clone >> 1));
1307 		}
1308 		diff &= 0x55;
1309 		if (diff == 0)
1310 			continue;
1311 		blk = offset + ((bi->bi_start + x) * GFS2_NBBY);
1312 		while(diff) {
1313 			if (diff & 1) {
1314 				if (nr_blks == 0)
1315 					goto start_new_extent;
1316 				if ((start + nr_blks) != blk) {
1317 					if (nr_blks >= minlen) {
1318 						rv = sb_issue_discard(sb,
1319 							start, nr_blks,
1320 							GFP_NOFS, 0);
1321 						if (rv)
1322 							goto fail;
1323 						trimmed += nr_blks;
1324 					}
1325 					nr_blks = 0;
1326 start_new_extent:
1327 					start = blk;
1328 				}
1329 				nr_blks++;
1330 			}
1331 			diff >>= 2;
1332 			blk++;
1333 		}
1334 	}
1335 	if (nr_blks >= minlen) {
1336 		rv = sb_issue_discard(sb, start, nr_blks, GFP_NOFS, 0);
1337 		if (rv)
1338 			goto fail;
1339 		trimmed += nr_blks;
1340 	}
1341 	if (ptrimmed)
1342 		*ptrimmed = trimmed;
1343 	return 0;
1344 
1345 fail:
1346 	if (sdp->sd_args.ar_discard)
1347 		fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem\n", rv);
1348 	sdp->sd_args.ar_discard = 0;
1349 	return -EIO;
1350 }
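
/*
 * Worked example of the diff computation above (illustrative only):
 * (x | (x >> 1)) & 0x55 has a one in an even bit position exactly when
 * that block's 2-bit state is non-zero (in use), so its complement marks
 * free blocks. With a buffer head supplied, suppose one byte has
 * orig = 0x10 (block 2 in use) and clone = 0x14 (blocks 1 and 2 in use):
 *
 *	~(0x10 | 0x10 >> 1) & (0x14 | 0x14 >> 1) & 0x55 == 0x04
 *
 * Only bit 2 (block 1) is set: free on disk but still held in the
 * clone, i.e. newly freed, and therefore a candidate for discard.
 */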
1351 
1352 /**
1353  * gfs2_fitrim - Generate discard requests for unused bits of the filesystem
1354  * @filp: Any file on the filesystem
1355  * @argp: Pointer to the arguments (also used to pass result)
1356  *
1357  * Returns: 0 on success, otherwise error code
1358  */
1359 
1360 int gfs2_fitrim(struct file *filp, void __user *argp)
1361 {
1362 	struct inode *inode = file_inode(filp);
1363 	struct gfs2_sbd *sdp = GFS2_SB(inode);
1364 	struct request_queue *q = bdev_get_queue(sdp->sd_vfs->s_bdev);
1365 	struct buffer_head *bh;
1366 	struct gfs2_rgrpd *rgd;
1367 	struct gfs2_rgrpd *rgd_end;
1368 	struct gfs2_holder gh;
1369 	struct fstrim_range r;
1370 	int ret = 0;
1371 	u64 amt;
1372 	u64 trimmed = 0;
1373 	u64 start, end, minlen;
1374 	unsigned int x;
1375 	unsigned bs_shift = sdp->sd_sb.sb_bsize_shift;
1376 
1377 	if (!capable(CAP_SYS_ADMIN))
1378 		return -EPERM;
1379 
1380 	if (!blk_queue_discard(q))
1381 		return -EOPNOTSUPP;
1382 
1383 	if (copy_from_user(&r, argp, sizeof(r)))
1384 		return -EFAULT;
1385 
1386 	ret = gfs2_rindex_update(sdp);
1387 	if (ret)
1388 		return ret;
1389 
1390 	start = r.start >> bs_shift;
1391 	end = start + (r.len >> bs_shift);
1392 	minlen = max_t(u64, r.minlen,
1393 		       q->limits.discard_granularity) >> bs_shift;
1394 
1395 	if (end <= start || minlen > sdp->sd_max_rg_data)
1396 		return -EINVAL;
1397 
1398 	rgd = gfs2_blk2rgrpd(sdp, start, 0);
1399 	rgd_end = gfs2_blk2rgrpd(sdp, end, 0);
1400 
1401 	if ((gfs2_rgrpd_get_first(sdp) == gfs2_rgrpd_get_next(rgd_end))
1402 	    && (start > rgd_end->rd_data0 + rgd_end->rd_data))
1403 		return -EINVAL; /* start is beyond the end of the fs */
1404 
1405 	while (1) {
1406 
1407 		ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh);
1408 		if (ret)
1409 			goto out;
1410 
1411 		if (!(rgd->rd_flags & GFS2_RGF_TRIMMED)) {
1412 			/* Trim each bitmap in the rgrp */
1413 			for (x = 0; x < rgd->rd_length; x++) {
1414 				struct gfs2_bitmap *bi = rgd->rd_bits + x;
1415 				ret = gfs2_rgrp_send_discards(sdp,
1416 						rgd->rd_data0, NULL, bi, minlen,
1417 						&amt);
1418 				if (ret) {
1419 					gfs2_glock_dq_uninit(&gh);
1420 					goto out;
1421 				}
1422 				trimmed += amt;
1423 			}
1424 
1425 			/* Mark rgrp as having been trimmed */
1426 			ret = gfs2_trans_begin(sdp, RES_RG_HDR, 0);
1427 			if (ret == 0) {
1428 				bh = rgd->rd_bits[0].bi_bh;
1429 				rgd->rd_flags |= GFS2_RGF_TRIMMED;
1430 				gfs2_trans_add_meta(rgd->rd_gl, bh);
1431 				gfs2_rgrp_out(rgd, bh->b_data);
1432 				gfs2_trans_end(sdp);
1433 			}
1434 		}
1435 		gfs2_glock_dq_uninit(&gh);
1436 
1437 		if (rgd == rgd_end)
1438 			break;
1439 
1440 		rgd = gfs2_rgrpd_get_next(rgd);
1441 	}
1442 
1443 out:
1444 	r.len = trimmed << bs_shift;
1445 	if (copy_to_user(argp, &r, sizeof(r)))
1446 		return -EFAULT;
1447 
1448 	return ret;
1449 }
1450 
1451 /**
1452  * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree
1453  * @ip: the inode structure
1454  *
1455  */
1456 static void rs_insert(struct gfs2_inode *ip)
1457 {
1458 	struct rb_node **newn, *parent = NULL;
1459 	int rc;
1460 	struct gfs2_blkreserv *rs = &ip->i_res;
1461 	struct gfs2_rgrpd *rgd = rs->rs_rbm.rgd;
1462 	u64 fsblock = gfs2_rbm_to_block(&rs->rs_rbm);
1463 
1464 	BUG_ON(gfs2_rs_active(rs));
1465 
1466 	spin_lock(&rgd->rd_rsspin);
1467 	newn = &rgd->rd_rstree.rb_node;
1468 	while (*newn) {
1469 		struct gfs2_blkreserv *cur =
1470 			rb_entry(*newn, struct gfs2_blkreserv, rs_node);
1471 
1472 		parent = *newn;
1473 		rc = rs_cmp(fsblock, rs->rs_free, cur);
1474 		if (rc > 0)
1475 			newn = &((*newn)->rb_right);
1476 		else if (rc < 0)
1477 			newn = &((*newn)->rb_left);
1478 		else {
1479 			spin_unlock(&rgd->rd_rsspin);
1480 			WARN_ON(1);
1481 			return;
1482 		}
1483 	}
1484 
1485 	rb_link_node(&rs->rs_node, parent, newn);
1486 	rb_insert_color(&rs->rs_node, &rgd->rd_rstree);
1487 
1488 	/* Do our rgrp accounting for the reservation */
1489 	rgd->rd_reserved += rs->rs_free; /* blocks reserved */
1490 	spin_unlock(&rgd->rd_rsspin);
1491 	trace_gfs2_rs(rs, TRACE_RS_INSERT);
1492 }
1493 
1494 /**
1495  * rgd_free - return the number of free blocks we can allocate.
1496  * @rgd: the resource group
 * @rs: our own block reservation, which is excluded from the count
1497  *
1498  * This function returns the number of free blocks for an rgrp.
1499  * That's the clone-free blocks (blocks that are free, not including those
1500  * still being used for unlinked files that haven't been deleted).
1501  *
1502  * It also subtracts any blocks reserved by someone else, but does not
1503  * include free blocks that are still part of our current reservation,
1504  * because obviously we can (and will) allocate them.
1505  */
1506 static inline u32 rgd_free(struct gfs2_rgrpd *rgd, struct gfs2_blkreserv *rs)
1507 {
1508 	u32 tot_reserved, tot_free;
1509 
1510 	if (WARN_ON_ONCE(rgd->rd_reserved < rs->rs_free))
1511 		return 0;
1512 	tot_reserved = rgd->rd_reserved - rs->rs_free;
1513 
1514 	if (rgd->rd_free_clone < tot_reserved)
1515 		tot_reserved = 0;
1516 
1517 	tot_free = rgd->rd_free_clone - tot_reserved;
1518 
1519 	return tot_free;
1520 }
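
/*
 * For illustration (hypothetical numbers): if rd_free_clone = 100,
 * rd_reserved = 30 and our own rs_free = 10, then other reservations
 * hold 20 blocks and rgd_free() returns 100 - 20 = 80: everything that
 * is clone-free and not promised to someone else.
 */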
1521 
1522 /**
1523  * rg_mblk_search - find a group of multiple free blocks to form a reservation
1524  * @rgd: the resource group descriptor
1525  * @ip: pointer to the inode for which we're reserving blocks
1526  * @ap: the allocation parameters
1527  *
1528  */
1529 
1530 static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
1531 			   const struct gfs2_alloc_parms *ap)
1532 {
1533 	struct gfs2_rbm rbm = { .rgd = rgd, };
1534 	u64 goal;
1535 	struct gfs2_blkreserv *rs = &ip->i_res;
1536 	u32 extlen;
1537 	u32 free_blocks = rgd_free(rgd, rs);
1538 	int ret;
1539 	struct inode *inode = &ip->i_inode;
1540 
1541 	if (S_ISDIR(inode->i_mode))
1542 		extlen = 1;
1543 	else {
1544 		extlen = max_t(u32, atomic_read(&rs->rs_sizehint), ap->target);
1545 		extlen = clamp(extlen, RGRP_RSRV_MINBLKS, free_blocks);
1546 	}
1547 	if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen))
1548 		return;
1549 
1550 	/* Find bitmap block that contains bits for goal block */
1551 	if (rgrp_contains_block(rgd, ip->i_goal))
1552 		goal = ip->i_goal;
1553 	else
1554 		goal = rgd->rd_last_alloc + rgd->rd_data0;
1555 
1556 	if (WARN_ON(gfs2_rbm_from_block(&rbm, goal)))
1557 		return;
1558 
1559 	ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true);
1560 	if (ret == 0) {
1561 		rs->rs_rbm = rbm;
1562 		rs->rs_free = extlen;
1563 		rs_insert(ip);
1564 	} else {
1565 		if (goal == rgd->rd_last_alloc + rgd->rd_data0)
1566 			rgd->rd_last_alloc = 0;
1567 	}
1568 }
1569 
1570 /**
1571  * gfs2_next_unreserved_block - Return next block that is not reserved
1572  * @rgd: The resource group
1573  * @block: The starting block
1574  * @length: The required length
1575  * @ip: Ignore any reservations for this inode
1576  *
1577  * If the block does not appear in any reservation, then return the
1578  * block number unchanged. If it does appear in the reservation, then
1579  * keep looking through the tree of reservations in order to find the
1580  * first block number which is not reserved.
1581  */
1582 
1583 static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
1584 				      u32 length,
1585 				      const struct gfs2_inode *ip)
1586 {
1587 	struct gfs2_blkreserv *rs;
1588 	struct rb_node *n;
1589 	int rc;
1590 
1591 	spin_lock(&rgd->rd_rsspin);
1592 	n = rgd->rd_rstree.rb_node;
1593 	while (n) {
1594 		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
1595 		rc = rs_cmp(block, length, rs);
1596 		if (rc < 0)
1597 			n = n->rb_left;
1598 		else if (rc > 0)
1599 			n = n->rb_right;
1600 		else
1601 			break;
1602 	}
1603 
1604 	if (n) {
1605 		while ((rs_cmp(block, length, rs) == 0) && (&ip->i_res != rs)) {
1606 			block = gfs2_rbm_to_block(&rs->rs_rbm) + rs->rs_free;
1607 			n = n->rb_right;
1608 			if (n == NULL)
1609 				break;
1610 			rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
1611 		}
1612 	}
1613 
1614 	spin_unlock(&rgd->rd_rsspin);
1615 	return block;
1616 }
1617 
1618 /**
1619  * gfs2_reservation_check_and_update - Check for reservations during block alloc
1620  * @rbm: The current position in the resource group
1621  * @ip: The inode for which we are searching for blocks
1622  * @minext: The minimum extent length
1623  * @maxext: A pointer to the maximum extent structure
1624  *
1625  * This checks the current position in the rgrp to see whether there is
1626  * a reservation covering this block. If not then this function is a
1627  * no-op. If there is, then the position is moved to the end of the
1628  * contiguous reservation(s) so that we are pointing at the first
1629  * non-reserved block.
1630  *
1631  * Returns: 0 if no reservation, 1 if @rbm has changed, otherwise an error
1632  */
1633 
1634 static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
1635 					     const struct gfs2_inode *ip,
1636 					     u32 minext,
1637 					     struct gfs2_extent *maxext)
1638 {
1639 	u64 block = gfs2_rbm_to_block(rbm);
1640 	u32 extlen = 1;
1641 	u64 nblock;
1642 	int ret;
1643 
1644 	/*
1645 	 * If we have a minimum extent length, then skip over any extent
1646 	 * which is less than the min extent length in size.
1647 	 */
1648 	if (minext) {
1649 		extlen = gfs2_free_extlen(rbm, minext);
1650 		if (extlen <= maxext->len)
1651 			goto fail;
1652 	}
1653 
1654 	/*
1655 	 * Check the extent which has been found against the reservations
1656 	 * and skip if parts of it are already reserved
1657 	 */
1658 	nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip);
1659 	if (nblock == block) {
1660 		if (!minext || extlen >= minext)
1661 			return 0;
1662 
1663 		if (extlen > maxext->len) {
1664 			maxext->len = extlen;
1665 			maxext->rbm = *rbm;
1666 		}
1667 fail:
1668 		nblock = block + extlen;
1669 	}
1670 	ret = gfs2_rbm_from_block(rbm, nblock);
1671 	if (ret < 0)
1672 		return ret;
1673 	return 1;
1674 }
1675 
1676 /**
1677  * gfs2_rbm_find - Look for blocks of a particular state
1678  * @rbm: Value/result starting position and final position
1679  * @state: The state which we want to find
1680  * @minext: Pointer to the requested extent length (NULL for a single block)
1681  *          This is updated to be the actual reservation size.
1682  * @ip: If set, check for reservations
1683  * @nowrap: Stop looking at the end of the rgrp, rather than wrapping
1684  *          around until we've reached the starting point.
1685  *
1686  * Side effects:
1687  * - If looking for free blocks, we set GBF_FULL on each bitmap which
1688  *   has no free blocks in it.
1689  * - If looking for free blocks, we set rd_extfail_pt on each rgrp which
1690  *   has come up short on a free block search.
1691  *
1692  * Returns: 0 on success, -ENOSPC if there is no block of the requested state
1693  */
1694 
1695 static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
1696 			 const struct gfs2_inode *ip, bool nowrap)
1697 {
1698 	struct buffer_head *bh;
1699 	int initial_bii;
1700 	u32 initial_offset;
1701 	int first_bii = rbm->bii;
1702 	u32 first_offset = rbm->offset;
1703 	u32 offset;
1704 	u8 *buffer;
1705 	int n = 0;
1706 	int iters = rbm->rgd->rd_length;
1707 	int ret;
1708 	struct gfs2_bitmap *bi;
1709 	struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, };
1710 
1711 	/* If we are not starting at the beginning of a bitmap, then we
1712 	 * need to add one to the bitmap count to ensure that we search
1713 	 * the starting bitmap twice.
1714 	 */
1715 	if (rbm->offset != 0)
1716 		iters++;
1717 
1718 	while(1) {
1719 		bi = rbm_bi(rbm);
1720 		if ((ip == NULL || !gfs2_rs_active(&ip->i_res)) &&
1721 		    test_bit(GBF_FULL, &bi->bi_flags) &&
1722 		    (state == GFS2_BLKST_FREE))
1723 			goto next_bitmap;
1724 
1725 		bh = bi->bi_bh;
1726 		buffer = bh->b_data + bi->bi_offset;
1727 		WARN_ON(!buffer_uptodate(bh));
1728 		if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
1729 			buffer = bi->bi_clone + bi->bi_offset;
1730 		initial_offset = rbm->offset;
1731 		offset = gfs2_bitfit(buffer, bi->bi_len, rbm->offset, state);
1732 		if (offset == BFITNOENT)
1733 			goto bitmap_full;
1734 		rbm->offset = offset;
1735 		if (ip == NULL)
1736 			return 0;
1737 
1738 		initial_bii = rbm->bii;
1739 		ret = gfs2_reservation_check_and_update(rbm, ip,
1740 							minext ? *minext : 0,
1741 							&maxext);
1742 		if (ret == 0)
1743 			return 0;
1744 		if (ret > 0) {
1745 			n += (rbm->bii - initial_bii);
1746 			goto next_iter;
1747 		}
1748 		if (ret == -E2BIG) {
1749 			rbm->bii = 0;
1750 			rbm->offset = 0;
1751 			n += (rbm->bii - initial_bii);
1752 			goto res_covered_end_of_rgrp;
1753 		}
1754 		return ret;
1755 
1756 bitmap_full:	/* Mark bitmap as full and fall through */
1757 		if ((state == GFS2_BLKST_FREE) && initial_offset == 0)
1758 			set_bit(GBF_FULL, &bi->bi_flags);
1759 
1760 next_bitmap:	/* Find next bitmap in the rgrp */
1761 		rbm->offset = 0;
1762 		rbm->bii++;
1763 		if (rbm->bii == rbm->rgd->rd_length)
1764 			rbm->bii = 0;
1765 res_covered_end_of_rgrp:
1766 		if ((rbm->bii == 0) && nowrap)
1767 			break;
1768 		n++;
1769 next_iter:
1770 		if (n >= iters)
1771 			break;
1772 	}
1773 
1774 	if (minext == NULL || state != GFS2_BLKST_FREE)
1775 		return -ENOSPC;
1776 
1777 	/* If the extent was too small, and it's smaller than the smallest
1778 	   to have failed before, remember for future reference that it's
1779 	   useless to search this rgrp again for this amount or more. */
1780 	if ((first_offset == 0) && (first_bii == 0) &&
1781 	    (*minext < rbm->rgd->rd_extfail_pt))
1782 		rbm->rgd->rd_extfail_pt = *minext;
1783 
1784 	/* If the maximum extent we found is big enough to fulfill the
1785 	   minimum requirements, use it anyway. */
1786 	if (maxext.len) {
1787 		*rbm = maxext.rbm;
1788 		*minext = maxext.len;
1789 		return 0;
1790 	}
1791 
1792 	return -ENOSPC;
1793 }
1794 
1795 /**
1796  * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes
1797  * @rgd: The rgrp
1798  * @last_unlinked: block address of the last dinode we unlinked
1799  * @skip: block address we should explicitly not unlink
1800  *
1801  * Returns: 0 if no error
1802  *          The inode, if one has been found, in inode.
1803  */
1804 
1805 static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
1806 {
1807 	u64 block;
1808 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1809 	struct gfs2_glock *gl;
1810 	struct gfs2_inode *ip;
1811 	int error;
1812 	int found = 0;
1813 	struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 };
1814 
1815 	while (1) {
1816 		down_write(&sdp->sd_log_flush_lock);
1817 		error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL,
1818 				      true);
1819 		up_write(&sdp->sd_log_flush_lock);
1820 		if (error == -ENOSPC)
1821 			break;
1822 		if (WARN_ON_ONCE(error))
1823 			break;
1824 
1825 		block = gfs2_rbm_to_block(&rbm);
1826 		if (gfs2_rbm_from_block(&rbm, block + 1))
1827 			break;
1828 		if (*last_unlinked != NO_BLOCK && block <= *last_unlinked)
1829 			continue;
1830 		if (block == skip)
1831 			continue;
1832 		*last_unlinked = block;
1833 
1834 		error = gfs2_glock_get(sdp, block, &gfs2_iopen_glops, CREATE, &gl);
1835 		if (error)
1836 			continue;
1837 
1838 		/* If the inode is already in cache, we can ignore it here
1839 		 * because the existing inode disposal code will deal with
1840 		 * it when all refs have gone away. Accessing gl_object like
1841 		 * this is not safe in general. Here it is ok because we do
1842 		 * not dereference the pointer, and we only need an approx
1843 		 * answer to whether it is NULL or not.
1844 		 */
1845 		ip = gl->gl_object;
1846 
1847 		if (ip || queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
1848 			gfs2_glock_put(gl);
1849 		else
1850 			found++;
1851 
1852 		/* Limit reclaim to sensible number of tasks */
1853 		if (found > NR_CPUS)
1854 			return;
1855 	}
1856 
1857 	rgd->rd_flags &= ~GFS2_RDF_CHECK;
1858 	return;
1859 }
1860 
1861 /**
1862  * gfs2_rgrp_congested - Use stats to figure out whether an rgrp is congested
1863  * @rgd: The rgrp in question
1864  * @loops: An indication of how picky we can be (0=very, 1=less so)
1865  *
1866  * This function uses the recently added glock statistics in order to
1867  * figure out whether a parciular resource group is suffering from
1868  * contention from multiple nodes. This is done purely on the basis
1869  * of timings, since this is the only data we have to work with and
1870  * our aim here is to reject a resource group which is highly contended
1871  * but (very importantly) not to do this too often, in order to ensure
1872  * that we do not end up introducing fragmentation by changing resource
1873  * groups when not actually required.
1874  *
1875  * The calculation is fairly simple, we want to know whether the SRTTB
1876  * (i.e. smoothed round trip time for blocking operations) to acquire
1877  * the lock for this rgrp's glock is significantly greater than the
1878  * time taken for resource groups on average. We introduce a margin in
1879  * the form of the variable @var which is computed as the sum of the two
1880  * respective variences, and multiplied by a factor depending on @loops
1881  * and whether we have a lot of data to base the decision on. This is
1882  * then tested against the square difference of the means in order to
1883  * decide whether the result is statistically significant or not.
1884  *
1885  * Returns: A boolean verdict on the congestion status
1886  */
1887 
1888 static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
1889 {
1890 	const struct gfs2_glock *gl = rgd->rd_gl;
1891 	const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
1892 	struct gfs2_lkstats *st;
1893 	u64 r_dcount, l_dcount;
1894 	u64 l_srttb, a_srttb = 0;
1895 	s64 srttb_diff;
1896 	u64 sqr_diff;
1897 	u64 var;
1898 	int cpu, nonzero = 0;
1899 
1900 	preempt_disable();
1901 	for_each_present_cpu(cpu) {
1902 		st = &per_cpu_ptr(sdp->sd_lkstats, cpu)->lkstats[LM_TYPE_RGRP];
1903 		if (st->stats[GFS2_LKS_SRTTB]) {
1904 			a_srttb += st->stats[GFS2_LKS_SRTTB];
1905 			nonzero++;
1906 		}
1907 	}
1908 	st = &this_cpu_ptr(sdp->sd_lkstats)->lkstats[LM_TYPE_RGRP];
1909 	if (nonzero)
1910 		do_div(a_srttb, nonzero);
1911 	r_dcount = st->stats[GFS2_LKS_DCOUNT];
1912 	var = st->stats[GFS2_LKS_SRTTVARB] +
1913 	      gl->gl_stats.stats[GFS2_LKS_SRTTVARB];
1914 	preempt_enable();
1915 
1916 	l_srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB];
1917 	l_dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT];
1918 
1919 	if ((l_dcount < 1) || (r_dcount < 1) || (a_srttb == 0))
1920 		return false;
1921 
1922 	srttb_diff = a_srttb - l_srttb;
1923 	sqr_diff = srttb_diff * srttb_diff;
1924 
1925 	var *= 2;
1926 	if (l_dcount < 8 || r_dcount < 8)
1927 		var *= 2;
1928 	if (loops == 1)
1929 		var *= 2;
1930 
1931 	return ((srttb_diff < 0) && (sqr_diff > var));
1932 }
1933 
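     /*
      * Worked example of the test above, with illustrative numbers:
      * if the cross-CPU average is a_srttb = 100 and the local value
      * is l_srttb = 160, then srttb_diff = -60 and sqr_diff = 3600.
      * With summed variances var = 1000, the unconditional doubling
      * gives var = 2000 (doubled again if either dcount < 8 or
      * loops == 1).  Since srttb_diff < 0 and 3600 > 2000, the rgrp
      * is reported as congested.
      */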
1934 /**
1935  * gfs2_rgrp_used_recently
1936  * @rs: The block reservation with the rgrp to test
1937  * @msecs: The time limit in milliseconds
1938  *
1939  * Returns: True if the rgrp glock has been used within the time limit
1940  */
1941 static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs,
1942 				    u64 msecs)
1943 {
1944 	u64 tdiff;
1945 
1946 	tdiff = ktime_to_ns(ktime_sub(ktime_get_real(),
1947 				      rs->rs_rbm.rgd->rd_gl->gl_dstamp));
1948 
1949 	return tdiff > (msecs * 1000 * 1000);
1950 }
1951 
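     /*
      * For example, with msecs == 1000 the threshold is
      * 1000 * 1000 * 1000 ns, so this returns true only when more
      * than one second has passed since the glock's gl_dstamp was
      * last updated.
      */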
1952 static u32 gfs2_orlov_skip(const struct gfs2_inode *ip)
1953 {
1954 	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1955 	u32 skip;
1956 
1957 	get_random_bytes(&skip, sizeof(skip));
1958 	return skip % sdp->sd_rgrps;
1959 }
1960 
1961 static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin)
1962 {
1963 	struct gfs2_rgrpd *rgd = *pos;
1964 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1965 
1966 	rgd = gfs2_rgrpd_get_next(rgd);
1967 	if (rgd == NULL)
1968 		rgd = gfs2_rgrpd_get_first(sdp);
1969 	*pos = rgd;
1970 	if (rgd != begin) /* If we didn't wrap */
1971 		return true;
1972 	return false;
1973 }
1974 
1975 /**
1976  * fast_to_acquire - determine if a resource group will be fast to acquire
1977  * @rgd: The rgrp to check
      *
1978  * If this is one of our preferred rgrps, it should be quicker to acquire,
1979  * because we tried to set ourselves up as dlm lock master.
1980  */
1981 static inline int fast_to_acquire(struct gfs2_rgrpd *rgd)
1982 {
1983 	struct gfs2_glock *gl = rgd->rd_gl;
1984 
1985 	if (gl->gl_state != LM_ST_UNLOCKED && list_empty(&gl->gl_holders) &&
1986 	    !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
1987 	    !test_bit(GLF_DEMOTE, &gl->gl_flags))
1988 		return 1;
1989 	if (rgd->rd_flags & GFS2_RDF_PREFERRED)
1990 		return 1;
1991 	return 0;
1992 }
1993 
1994 /**
1995  * gfs2_inplace_reserve - Reserve space in the filesystem
1996  * @ip: the inode to reserve space for
1997  * @ap: the allocation parameters
1998  *
1999  * We try our best to find an rgrp that has at least ap->target blocks
2000  * available. After a couple of passes (loops == 2), the prospects of finding
2001  * such an rgrp diminish. At this stage, we return the first rgrp that has
2002  * atleast ap->min_target blocks available. Either way, we set ap->allowed to
2003  * the number of blocks available in the chosen rgrp.
2004  *
2005  * Returns: 0 on success,
2006  *          -ENOSPC if a suitable rgrp can't be found
2007  *          errno otherwise
2008  */
2009 
2010 int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
2011 {
2012 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2013 	struct gfs2_rgrpd *begin = NULL;
2014 	struct gfs2_blkreserv *rs = &ip->i_res;
2015 	int error = 0, rg_locked, flags = 0;
2016 	u64 last_unlinked = NO_BLOCK;
2017 	int loops = 0;
2018 	u32 free_blocks, skip = 0;
2019 
2020 	if (sdp->sd_args.ar_rgrplvb)
2021 		flags |= GL_SKIP;
2022 	if (gfs2_assert_warn(sdp, ap->target))
2023 		return -EINVAL;
2024 	if (gfs2_rs_active(rs)) {
2025 		begin = rs->rs_rbm.rgd;
2026 	} else if (rs->rs_rbm.rgd &&
2027 		   rgrp_contains_block(rs->rs_rbm.rgd, ip->i_goal)) {
2028 		begin = rs->rs_rbm.rgd;
2029 	} else {
2030 		check_and_update_goal(ip);
2031 		rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
2032 	}
2033 	if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV))
2034 		skip = gfs2_orlov_skip(ip);
2035 	if (rs->rs_rbm.rgd == NULL)
2036 		return -EBADSLT;
2037 
2038 	while (loops < 3) {
2039 		rg_locked = 1;
2040 
2041 		if (!gfs2_glock_is_locked_by_me(rs->rs_rbm.rgd->rd_gl)) {
2042 			rg_locked = 0;
2043 			if (skip && skip--)
2044 				goto next_rgrp;
2045 			if (!gfs2_rs_active(rs)) {
2046 				if (loops == 0 &&
2047 				    !fast_to_acquire(rs->rs_rbm.rgd))
2048 					goto next_rgrp;
2049 				if ((loops < 2) &&
2050 				    gfs2_rgrp_used_recently(rs, 1000) &&
2051 				    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
2052 					goto next_rgrp;
2053 			}
2054 			error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl,
2055 						   LM_ST_EXCLUSIVE, flags,
2056 						   &rs->rs_rgd_gh);
2057 			if (unlikely(error))
2058 				return error;
2059 			if (!gfs2_rs_active(rs) && (loops < 2) &&
2060 			    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
2061 				goto skip_rgrp;
2062 			if (sdp->sd_args.ar_rgrplvb) {
2063 				error = update_rgrp_lvb(rs->rs_rbm.rgd);
2064 				if (unlikely(error)) {
2065 					gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
2066 					return error;
2067 				}
2068 			}
2069 		}
2070 
2071 		/* Skip unusable resource groups */
2072 		if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC |
2073 						 GFS2_RDF_ERROR)) ||
2074 		    (loops == 0 && ap->target > rs->rs_rbm.rgd->rd_extfail_pt))
2075 			goto skip_rgrp;
2076 
2077 		if (sdp->sd_args.ar_rgrplvb)
2078 			gfs2_rgrp_bh_get(rs->rs_rbm.rgd);
2079 
2080 		/* Get a reservation if we don't already have one */
2081 		if (!gfs2_rs_active(rs))
2082 			rg_mblk_search(rs->rs_rbm.rgd, ip, ap);
2083 
2084 		/* Skip rgrps when we can't get a reservation on first pass */
2085 		if (!gfs2_rs_active(rs) && (loops < 1))
2086 			goto check_rgrp;
2087 
2088 		/* If rgrp has enough free space, use it */
2089 		free_blocks = rgd_free(rs->rs_rbm.rgd, rs);
2090 		if (free_blocks >= ap->target ||
2091 		    (loops == 2 && ap->min_target &&
2092 		     free_blocks >= ap->min_target)) {
2093 			ap->allowed = free_blocks;
2094 			return 0;
2095 		}
2096 check_rgrp:
2097 		/* Check for unlinked inodes which can be reclaimed */
2098 		if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK)
2099 			try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked,
2100 					ip->i_no_addr);
2101 skip_rgrp:
2102 		/* Drop reservation, if we couldn't use reserved rgrp */
2103 		if (gfs2_rs_active(rs))
2104 			gfs2_rs_deltree(rs);
2105 
2106 		/* Unlock rgrp if required */
2107 		if (!rg_locked)
2108 			gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
2109 next_rgrp:
2110 		/* Find the next rgrp, and continue looking */
2111 		if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin))
2112 			continue;
2113 		if (skip)
2114 			continue;
2115 
2116 		/* If we've scanned all the rgrps but found no free blocks,
2117 		 * check for some less likely conditions before trying
2118 		 * again.
2119 		 */
2120 		loops++;
2121 		/* Check that fs hasn't grown if writing to rindex */
2122 		if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) {
2123 			error = gfs2_ri_update(ip);
2124 			if (error)
2125 				return error;
2126 		}
2127 		/* Flushing the log may release space */
2128 		if (loops == 2)
2129 			gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
2130 				       GFS2_LFC_INPLACE_RESERVE);
2131 	}
2132 
2133 	return -ENOSPC;
2134 }
2135 
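     /*
      * An illustrative sketch of the overall allocation sequence
      * ("wanted" is a placeholder; quota and transaction setup are
      * elided):
      *
      *	struct gfs2_alloc_parms ap = { .target = wanted, };
      *	unsigned int n = wanted;
      *	u64 bn;
      *	int error;
      *
      *	error = gfs2_inplace_reserve(ip, &ap);
      *	if (error)
      *		return error;
      *	// ... start a transaction sized using ap.allowed ...
      *	error = gfs2_alloc_blocks(ip, &bn, &n, false, NULL);
      *	// ... end the transaction ...
      *	gfs2_inplace_release(ip);
      */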
2136 /**
2137  * gfs2_inplace_release - release an inplace reservation
2138  * @ip: the inode the reservation was taken out on
2139  *
2140  * Release a reservation made by gfs2_inplace_reserve().
2141  */
2142 
2143 void gfs2_inplace_release(struct gfs2_inode *ip)
2144 {
2145 	struct gfs2_blkreserv *rs = &ip->i_res;
2146 
2147 	if (gfs2_holder_initialized(&rs->rs_rgd_gh))
2148 		gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
2149 }
2150 
2151 /**
2152  * gfs2_alloc_extent - allocate an extent from a given bitmap
2153  * @rbm: The position of the first block to allocate
2154  * @dinode: TRUE if the first block we allocate is for a dinode
2155  * @n: The extent length (value/result)
2156  *
2157  * Add the bitmap buffer to the transaction.
2158  * Set the found bits to GFS2_BLKST_DINODE or GFS2_BLKST_USED as appropriate.
2159  */
2160 static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode,
2161 			     unsigned int *n)
2162 {
2163 	struct gfs2_rbm pos = { .rgd = rbm->rgd, };
2164 	const unsigned int elen = *n;
2165 	u64 block;
2166 	int ret;
2167 
2168 	*n = 1;
2169 	block = gfs2_rbm_to_block(rbm);
2170 	gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm_bi(rbm)->bi_bh);
2171 	gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
2172 	block++;
2173 	while (*n < elen) {
2174 		ret = gfs2_rbm_from_block(&pos, block);
2175 		if (ret || gfs2_testbit(&pos, true) != GFS2_BLKST_FREE)
2176 			break;
2177 		gfs2_trans_add_meta(pos.rgd->rd_gl, rbm_bi(&pos)->bi_bh);
2178 		gfs2_setbit(&pos, true, GFS2_BLKST_USED);
2179 		(*n)++;
2180 		block++;
2181 	}
2182 }
2183 
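     /*
      * For example, with *n == 4 on entry, the first block is set to
      * the state implied by @dinode and up to three more consecutive
      * free blocks are set to GFS2_BLKST_USED; the loop stops early
      * at the first block that is not free, and *n is returned as
      * the number of blocks actually allocated.
      */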
2184 /**
2185  * rgblk_free - Change alloc state of given block(s)
2186  * @sdp: the filesystem
2187  * @bstart: the start of a run of blocks to free
2188  * @blen: the length of the block run (all must lie within ONE RG!)
2189  * @new_state: GFS2_BLKST_XXX the new state to assign to the block(s)
2190  *
2191  * Returns:  Resource group containing the block(s)
2192  */
2193 
2194 static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
2195 				     u32 blen, unsigned char new_state)
2196 {
2197 	struct gfs2_rbm rbm;
2198 	struct gfs2_bitmap *bi, *bi_prev = NULL;
2199 
2200 	rbm.rgd = gfs2_blk2rgrpd(sdp, bstart, 1);
2201 	if (!rbm.rgd) {
2202 		if (gfs2_consist(sdp))
2203 			fs_err(sdp, "block = %llu\n", (unsigned long long)bstart);
2204 		return NULL;
2205 	}
2206 
2207 	gfs2_rbm_from_block(&rbm, bstart);
2208 	while (blen--) {
2209 		bi = rbm_bi(&rbm);
2210 		if (bi != bi_prev) {
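     			/*
     			 * Lazily clone the bitmap buffer on first
     			 * modification: searches consult bi_clone when it
     			 * exists (see gfs2_rbm_find), so blocks freed here
     			 * are not handed out again before the change has
     			 * been committed by a log flush.
     			 */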
2211 			if (!bi->bi_clone) {
2212 				bi->bi_clone = kmalloc(bi->bi_bh->b_size,
2213 						      GFP_NOFS | __GFP_NOFAIL);
2214 				memcpy(bi->bi_clone + bi->bi_offset,
2215 				       bi->bi_bh->b_data + bi->bi_offset,
2216 				       bi->bi_len);
2217 			}
2218 			gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
2219 			bi_prev = bi;
2220 		}
2221 		gfs2_setbit(&rbm, false, new_state);
2222 		gfs2_rbm_incr(&rbm);
2223 	}
2224 
2225 	return rbm.rgd;
2226 }
2227 
2228 /**
2229  * gfs2_rgrp_dump - print out an rgrp
2230  * @seq: The iterator
2231  * @gl: The glock in question
2232  *
2233  */
2234 
2235 void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
2236 {
2237 	struct gfs2_rgrpd *rgd = gl->gl_object;
2238 	struct gfs2_blkreserv *trs;
2239 	const struct rb_node *n;
2240 
2241 	if (rgd == NULL)
2242 		return;
2243 	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n",
2244 		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
2245 		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
2246 		       rgd->rd_reserved, rgd->rd_extfail_pt);
2247 	spin_lock(&rgd->rd_rsspin);
2248 	for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
2249 		trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
2250 		dump_rs(seq, trs);
2251 	}
2252 	spin_unlock(&rgd->rd_rsspin);
2253 }
2254 
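     /*
      * Example of the output produced above (values illustrative):
      *
      *	 R: n:137216 f:01 b:65490/65490 i:10 r:0 e:0
      *
      * i.e. rgrp address, flags, free and clone-free block counts,
      * dinodes, reserved blocks and the extent failure point, followed
      * by one line per reservation in the rbtree.
      */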
2255 static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
2256 {
2257 	struct gfs2_sbd *sdp = rgd->rd_sbd;
2258 	fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
2259 		(unsigned long long)rgd->rd_addr);
2260 	fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
2261 	gfs2_rgrp_dump(NULL, rgd->rd_gl);
2262 	rgd->rd_flags |= GFS2_RDF_ERROR;
2263 }
2264 
2265 /**
2266  * gfs2_adjust_reservation - Adjust (or remove) a reservation after allocation
2267  * @ip: The inode we have just allocated blocks for
2268  * @rbm: The start of the allocated blocks
2269  * @len: The extent length
2270  *
2271  * Adjusts a reservation after an allocation has taken place. If the
2272  * reservation does not match the allocation, or if it is now empty
2273  * then it is removed.
2274  */
2275 
2276 static void gfs2_adjust_reservation(struct gfs2_inode *ip,
2277 				    const struct gfs2_rbm *rbm, unsigned len)
2278 {
2279 	struct gfs2_blkreserv *rs = &ip->i_res;
2280 	struct gfs2_rgrpd *rgd = rbm->rgd;
2281 	unsigned rlen;
2282 	u64 block;
2283 	int ret;
2284 
2285 	spin_lock(&rgd->rd_rsspin);
2286 	if (gfs2_rs_active(rs)) {
2287 		if (gfs2_rbm_eq(&rs->rs_rbm, rbm)) {
2288 			block = gfs2_rbm_to_block(rbm);
2289 			ret = gfs2_rbm_from_block(&rs->rs_rbm, block + len);
2290 			rlen = min(rs->rs_free, len);
2291 			rs->rs_free -= rlen;
2292 			rgd->rd_reserved -= rlen;
2293 			trace_gfs2_rs(rs, TRACE_RS_CLAIM);
2294 			if (rs->rs_free && !ret)
2295 				goto out;
2296 			/* We used up our block reservation, so we should
2297 			   reserve more blocks next time. */
2298 			atomic_add(RGRP_RSRV_ADDBLKS, &rs->rs_sizehint);
2299 		}
2300 		__rs_deltree(rs);
2301 	}
2302 out:
2303 	spin_unlock(&rgd->rd_rsspin);
2304 }
2305 
2306 /**
2307  * gfs2_set_alloc_start - Set starting point for block allocation
2308  * @rbm: The rbm which will be set to the required location
2309  * @ip: The gfs2 inode
2310  * @dinode: Flag to say if allocation includes a new inode
2311  *
2312  * This sets the starting point from the reservation if one is active;
2313  * otherwise it falls back to guessing a start point based on the
2314  * inode's goal block or the last allocation point in the rgrp.
2315  */
2316 
2317 static void gfs2_set_alloc_start(struct gfs2_rbm *rbm,
2318 				 const struct gfs2_inode *ip, bool dinode)
2319 {
2320 	u64 goal;
2321 
2322 	if (gfs2_rs_active(&ip->i_res)) {
2323 		*rbm = ip->i_res.rs_rbm;
2324 		return;
2325 	}
2326 
2327 	if (!dinode && rgrp_contains_block(rbm->rgd, ip->i_goal))
2328 		goal = ip->i_goal;
2329 	else
2330 		goal = rbm->rgd->rd_last_alloc + rbm->rgd->rd_data0;
2331 
2332 	gfs2_rbm_from_block(rbm, goal);
2333 }
2334 
2335 /**
2336  * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode
2337  * @ip: the inode to allocate the block for
2338  * @bn: Used to return the starting block number
2339  * @nblocks: requested number of blocks/extent length (value/result)
2340  * @dinode: true if we're allocating a dinode block, else false
2341  * @generation: the generation number of the inode
2342  *
2343  * Returns: 0 or error
2344  */
2345 
2346 int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
2347 		      bool dinode, u64 *generation)
2348 {
2349 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2350 	struct buffer_head *dibh;
2351 	struct gfs2_rbm rbm = { .rgd = ip->i_res.rs_rbm.rgd, };
2352 	unsigned int ndata;
2353 	u64 block; /* block, within the file system scope */
2354 	int error;
2355 
2356 	gfs2_set_alloc_start(&rbm, ip, dinode);
2357 	error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false);
2358 
2359 	if (error == -ENOSPC) {
2360 		gfs2_set_alloc_start(&rbm, ip, dinode);
2361 		error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false);
2362 	}
2363 
2364 	/* Since all blocks are reserved in advance, this shouldn't happen */
2365 	if (error) {
2366 		fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d fail_pt=%d\n",
2367 			(unsigned long long)ip->i_no_addr, error, *nblocks,
2368 			test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags),
2369 			rbm.rgd->rd_extfail_pt);
2370 		goto rgrp_error;
2371 	}
2372 
2373 	gfs2_alloc_extent(&rbm, dinode, nblocks);
2374 	block = gfs2_rbm_to_block(&rbm);
2375 	rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0;
2376 	if (gfs2_rs_active(&ip->i_res))
2377 		gfs2_adjust_reservation(ip, &rbm, *nblocks);
2378 	ndata = *nblocks;
2379 	if (dinode)
2380 		ndata--;
2381 
2382 	if (!dinode) {
2383 		ip->i_goal = block + ndata - 1;
2384 		error = gfs2_meta_inode_buffer(ip, &dibh);
2385 		if (error == 0) {
2386 			struct gfs2_dinode *di =
2387 				(struct gfs2_dinode *)dibh->b_data;
2388 			gfs2_trans_add_meta(ip->i_gl, dibh);
2389 			di->di_goal_meta = di->di_goal_data =
2390 				cpu_to_be64(ip->i_goal);
2391 			brelse(dibh);
2392 		}
2393 	}
2394 	if (rbm.rgd->rd_free < *nblocks) {
2395 		pr_warn("nblocks=%u\n", *nblocks);
2396 		goto rgrp_error;
2397 	}
2398 
2399 	rbm.rgd->rd_free -= *nblocks;
2400 	if (dinode) {
2401 		rbm.rgd->rd_dinodes++;
2402 		*generation = rbm.rgd->rd_igeneration++;
2403 		if (*generation == 0)
2404 			*generation = rbm.rgd->rd_igeneration++;
2405 	}
2406 
2407 	gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh);
2408 	gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data);
2409 
2410 	gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0);
2411 	if (dinode)
2412 		gfs2_trans_add_unrevoke(sdp, block, *nblocks);
2413 
2414 	gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid);
2415 
2416 	rbm.rgd->rd_free_clone -= *nblocks;
2417 	trace_gfs2_block_alloc(ip, rbm.rgd, block, *nblocks,
2418 			       dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
2419 	*bn = block;
2420 	return 0;
2421 
2422 rgrp_error:
2423 	gfs2_rgrp_error(rbm.rgd);
2424 	return -EIO;
2425 }
2426 
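     /*
      * When allocating a new dinode, a caller passes dinode == true
      * and a generation pointer instead (sketch; error handling
      * elided):
      *
      *	unsigned int n = 1;
      *	u64 bn, generation;
      *
      *	error = gfs2_alloc_blocks(ip, &bn, &n, true, &generation);
      *
      * On success, bn is the new dinode's block address and
      * generation holds its non-zero inode generation number.
      */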
2427 /**
2428  * __gfs2_free_blocks - free a contiguous run of block(s)
2429  * @ip: the inode these blocks are being freed from
2430  * @bstart: first block of a run of contiguous blocks
2431  * @blen: the length of the block run
2432  * @meta: 1 if the blocks represent metadata
2433  *
2434  */
2435 
2436 void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta)
2437 {
2438 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2439 	struct gfs2_rgrpd *rgd;
2440 
2441 	rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
2442 	if (!rgd)
2443 		return;
2444 	trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE);
2445 	rgd->rd_free += blen;
2446 	rgd->rd_flags &= ~GFS2_RGF_TRIMMED;
2447 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2448 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2449 
2450 	/* Directories keep their data in the metadata address space */
2451 	if (meta || ip->i_depth)
2452 		gfs2_meta_wipe(ip, bstart, blen);
2453 }
2454 
2455 /**
2456  * gfs2_free_meta - free a contiguous run of metadata block(s)
2457  * @ip: the inode these blocks are being freed from
2458  * @bstart: first block of a run of contiguous blocks
2459  * @blen: the length of the block run
2460  *
2461  */
2462 
2463 void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
2464 {
2465 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2466 
2467 	__gfs2_free_blocks(ip, bstart, blen, 1);
2468 	gfs2_statfs_change(sdp, 0, +blen, 0);
2469 	gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
2470 }
2471 
2472 void gfs2_unlink_di(struct inode *inode)
2473 {
2474 	struct gfs2_inode *ip = GFS2_I(inode);
2475 	struct gfs2_sbd *sdp = GFS2_SB(inode);
2476 	struct gfs2_rgrpd *rgd;
2477 	u64 blkno = ip->i_no_addr;
2478 
2479 	rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED);
2480 	if (!rgd)
2481 		return;
2482 	trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
2483 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2484 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2485 	be32_add_cpu(&rgd->rd_rgl->rl_unlinked, 1);
2486 }
2487 
2488 void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
2489 {
2490 	struct gfs2_sbd *sdp = rgd->rd_sbd;
2491 	struct gfs2_rgrpd *tmp_rgd;
2492 
2493 	tmp_rgd = rgblk_free(sdp, ip->i_no_addr, 1, GFS2_BLKST_FREE);
2494 	if (!tmp_rgd)
2495 		return;
2496 	gfs2_assert_withdraw(sdp, rgd == tmp_rgd);
2497 
2498 	if (!rgd->rd_dinodes)
2499 		gfs2_consist_rgrpd(rgd);
2500 	rgd->rd_dinodes--;
2501 	rgd->rd_free++;
2502 
2503 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2504 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2505 	be32_add_cpu(&rgd->rd_rgl->rl_unlinked, -1);
2506 
2507 	gfs2_statfs_change(sdp, 0, +1, -1);
2508 	trace_gfs2_block_alloc(ip, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
2509 	gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid);
2510 	gfs2_meta_wipe(ip, ip->i_no_addr, 1);
2511 }
2512 
2513 /**
2514  * gfs2_check_blk_type - Check the type of a block
2515  * @sdp: The superblock
2516  * @no_addr: The block number to check
2517  * @type: The block type we are looking for
2518  *
2519  * Returns: 0 if the block type matches the expected type
2520  *          -ESTALE if it doesn't match
2521  *          or -ve errno if something went wrong while checking
2522  */
2523 
2524 int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
2525 {
2526 	struct gfs2_rgrpd *rgd;
2527 	struct gfs2_holder rgd_gh;
2528 	struct gfs2_rbm rbm;
2529 	int error = -EINVAL;
2530 
2531 	rgd = gfs2_blk2rgrpd(sdp, no_addr, 1);
2532 	if (!rgd)
2533 		goto fail;
2534 
2535 	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
2536 	if (error)
2537 		goto fail;
2538 
2539 	rbm.rgd = rgd;
2540 	error = gfs2_rbm_from_block(&rbm, no_addr);
2541 	WARN_ON_ONCE(error != 0);
2542 
2543 	if (gfs2_testbit(&rbm, false) != type)
2544 		error = -ESTALE;
2545 
2546 	gfs2_glock_dq_uninit(&rgd_gh);
2547 fail:
2548 	return error;
2549 }
2550 
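     /*
      * For example, to confirm that a block still holds an unlinked
      * dinode before reclaiming it (sketch):
      *
      *	error = gfs2_check_blk_type(sdp, no_addr, GFS2_BLKST_UNLINKED);
      *	if (error == -ESTALE)
      *		// the block is no longer in the expected state
      */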
2551 /**
2552  * gfs2_rlist_add - add a RG to a list of RGs
2553  * @ip: the inode
2554  * @rlist: the list of resource groups
2555  * @block: the block
2556  *
2557  * Figure out what RG a block belongs to and add that RG to the list
2558  *
2559  * FIXME: Don't use NOFAIL
2560  *
2561  */
2562 
2563 void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
2564 		    u64 block)
2565 {
2566 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2567 	struct gfs2_rgrpd *rgd;
2568 	struct gfs2_rgrpd **tmp;
2569 	unsigned int new_space;
2570 	unsigned int x;
2571 
2572 	if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
2573 		return;
2574 
2575 	/*
2576 	 * The resource group last accessed is kept in the last position.
2577 	 */
2578 
2579 	if (rlist->rl_rgrps) {
2580 		rgd = rlist->rl_rgd[rlist->rl_rgrps - 1];
2581 		if (rgrp_contains_block(rgd, block))
2582 			return;
2583 		rgd = gfs2_blk2rgrpd(sdp, block, 1);
2584 	} else {
2585 		rgd = ip->i_res.rs_rbm.rgd;
2586 		if (!rgd || !rgrp_contains_block(rgd, block))
2587 			rgd = gfs2_blk2rgrpd(sdp, block, 1);
2588 	}
2589 
2590 	if (!rgd) {
2591 		fs_err(sdp, "rlist_add: no rgrp for block %llu\n",
2592 		       (unsigned long long)block);
2593 		return;
2594 	}
2595 
2596 	for (x = 0; x < rlist->rl_rgrps; x++) {
2597 		if (rlist->rl_rgd[x] == rgd) {
2598 			swap(rlist->rl_rgd[x],
2599 			     rlist->rl_rgd[rlist->rl_rgrps - 1]);
2600 			return;
2601 		}
2602 	}
2603 
2604 	if (rlist->rl_rgrps == rlist->rl_space) {
2605 		new_space = rlist->rl_space + 10;
2606 
2607 		tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
2608 			      GFP_NOFS | __GFP_NOFAIL);
2609 
2610 		if (rlist->rl_rgd) {
2611 			memcpy(tmp, rlist->rl_rgd,
2612 			       rlist->rl_space * sizeof(struct gfs2_rgrpd *));
2613 			kfree(rlist->rl_rgd);
2614 		}
2615 
2616 		rlist->rl_space = new_space;
2617 		rlist->rl_rgd = tmp;
2618 	}
2619 
2620 	rlist->rl_rgd[rlist->rl_rgrps++] = rgd;
2621 }
2622 
2623 /**
2624  * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate
2625  *      and initialize an array of glock holders for them
2626  * @rlist: the list of resource groups
2627  * @state: the lock state to acquire the RG lock in
2628  *
2629  * FIXME: Don't use NOFAIL
2630  *
2631  */
2632 
2633 void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state)
2634 {
2635 	unsigned int x;
2636 
2637 	rlist->rl_ghs = kmalloc_array(rlist->rl_rgrps,
2638 				      sizeof(struct gfs2_holder),
2639 				      GFP_NOFS | __GFP_NOFAIL);
2640 	for (x = 0; x < rlist->rl_rgrps; x++)
2641 		gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
2642 				state, 0,
2643 				&rlist->rl_ghs[x]);
2644 }
2645 
2646 /**
2647  * gfs2_rlist_free - free a resource group list
2648  * @rlist: the list of resource groups
2649  *
2650  */
2651 
2652 void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
2653 {
2654 	unsigned int x;
2655 
2656 	kfree(rlist->rl_rgd);
2657 
2658 	if (rlist->rl_ghs) {
2659 		for (x = 0; x < rlist->rl_rgrps; x++)
2660 			gfs2_holder_uninit(&rlist->rl_ghs[x]);
2661 		kfree(rlist->rl_ghs);
2662 		rlist->rl_ghs = NULL;
2663 	}
2664 }
2665 
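     /*
      * An illustrative sketch of the rlist life cycle; here "block"
      * stands for each block to be deallocated, and gfs2_glock_nq_m()
      * and gfs2_glock_dq_m() acquire and release all of the holders
      * initialized by gfs2_rlist_alloc():
      *
      *	struct gfs2_rgrp_list rlist;
      *	int error;
      *
      *	memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
      *	gfs2_rlist_add(ip, &rlist, block);	// once per block
      *	gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
      *	error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
      *	// ... modify bitmaps, e.g. via __gfs2_free_blocks() ...
      *	gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
      *	gfs2_rlist_free(&rlist);
      */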
2666