/* xref: /openbmc/linux/fs/gfs2/rgrp.c (revision 2208f39c) */
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
 * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
#include <linux/fs.h>
#include <linux/gfs2_ondisk.h>
#include <linux/prefetch.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/random.h>

#include "gfs2.h"
#include "incore.h"
#include "glock.h"
#include "glops.h"
#include "lops.h"
#include "meta_io.h"
#include "quota.h"
#include "rgrp.h"
#include "super.h"
#include "trans.h"
#include "util.h"
#include "log.h"
#include "inode.h"
#include "trace_gfs2.h"
#include "dir.h"

#define BFITNOENT ((u32)~0)
#define NO_BLOCK ((u64)~0)

/*
 * These routines are used by the resource group routines (rgrp.c)
 * to keep track of block allocation.  Each block is represented by two
 * bits.  So, each byte represents GFS2_NBBY (i.e. 4) blocks.
 *
 * 0 = Free
 * 1 = Used (not metadata)
 * 2 = Unlinked (still in use) inode
 * 3 = Used (metadata)
 */
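
/*
 * Illustrative sketch (not part of the build): unpacking the four two-bit
 * block states packed into one bitmap byte, using GFS2_NBBY, GFS2_BIT_SIZE
 * and GFS2_BIT_MASK from gfs2_ondisk.h. The "example_" name is
 * hypothetical.
 *
 *	static void example_unpack_byte(u8 byte)
 *	{
 *		unsigned int blk;
 *
 *		for (blk = 0; blk < GFS2_NBBY; blk++) {
 *			u8 state = (byte >> (blk * GFS2_BIT_SIZE)) &
 *				   GFS2_BIT_MASK;
 *			pr_info("block %u: state %u\n", blk, state);
 *		}
 *	}
 *
 * For instance, 0x64 (0b01100100) decodes to block 0 free, block 1 used
 * (data), block 2 unlinked and block 3 used (data).
 */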

struct gfs2_extent {
	struct gfs2_rbm rbm;
	u32 len;
};

static const char valid_change[16] = {
	        /* current */
	/* n */ 0, 1, 1, 1,
	/* e */ 1, 0, 0, 0,
	/* w */ 0, 0, 0, 1,
	        1, 0, 0, 0
};
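
/*
 * Note (illustrative): valid_change is indexed as new_state * 4 + cur_state
 * in gfs2_setbit() below, so each row above is a candidate new state and
 * each column the current state. For example, freeing a data block
 * (1 -> 0) reads valid_change[0 * 4 + 1] == 1 and is allowed, while
 * "freeing" an already free block reads valid_change[0] == 0 and takes
 * the consistency error path.
 */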

static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
			 const struct gfs2_inode *ip, bool nowrap);


/**
 * gfs2_setbit - Set a bit in the bitmaps
 * @rbm: The position of the bit to set
 * @do_clone: Also set the clone bitmap, if it exists
 * @new_state: the new state of the block
 *
 */

static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
			       unsigned char new_state)
{
	unsigned char *byte1, *byte2, *end, cur_state;
	struct gfs2_bitmap *bi = rbm_bi(rbm);
	unsigned int buflen = bi->bi_bytes;
	const unsigned int bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;

	byte1 = bi->bi_bh->b_data + bi->bi_offset + (rbm->offset / GFS2_NBBY);
	end = bi->bi_bh->b_data + bi->bi_offset + buflen;

	BUG_ON(byte1 >= end);

	cur_state = (*byte1 >> bit) & GFS2_BIT_MASK;

	if (unlikely(!valid_change[new_state * 4 + cur_state])) {
		struct gfs2_sbd *sdp = rbm->rgd->rd_sbd;

		fs_warn(sdp, "buf_blk = 0x%x old_state=%d, new_state=%d\n",
			rbm->offset, cur_state, new_state);
		fs_warn(sdp, "rgrp=0x%llx bi_start=0x%x biblk: 0x%llx\n",
			(unsigned long long)rbm->rgd->rd_addr, bi->bi_start,
			(unsigned long long)bi->bi_bh->b_blocknr);
		fs_warn(sdp, "bi_offset=0x%x bi_bytes=0x%x block=0x%llx\n",
			bi->bi_offset, bi->bi_bytes,
			(unsigned long long)gfs2_rbm_to_block(rbm));
		dump_stack();
		gfs2_consist_rgrpd(rbm->rgd);
		return;
	}
	*byte1 ^= (cur_state ^ new_state) << bit;

	if (do_clone && bi->bi_clone) {
		byte2 = bi->bi_clone + bi->bi_offset + (rbm->offset / GFS2_NBBY);
		cur_state = (*byte2 >> bit) & GFS2_BIT_MASK;
		*byte2 ^= (cur_state ^ new_state) << bit;
	}
}

/**
 * gfs2_testbit - test a bit in the bitmaps
 * @rbm: The bit to test
 * @use_clone: If true, test the clone bitmap, not the official bitmap.
 *
 * Some callers like gfs2_unaligned_extlen need to test the clone bitmaps,
 * not the "real" bitmaps, to avoid allocating recently freed blocks.
 *
 * Returns: The two bit block state of the requested bit
 */

static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm, bool use_clone)
{
	struct gfs2_bitmap *bi = rbm_bi(rbm);
	const u8 *buffer;
	const u8 *byte;
	unsigned int bit;

	if (use_clone && bi->bi_clone)
		buffer = bi->bi_clone;
	else
		buffer = bi->bi_bh->b_data;
	buffer += bi->bi_offset;
	byte = buffer + (rbm->offset / GFS2_NBBY);
	bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;

	return (*byte >> bit) & GFS2_BIT_MASK;
}

/**
 * gfs2_bit_search
 * @ptr: Pointer to bitmap data
 * @mask: Mask to use (normally 0x55555.... but adjusted for search start)
 * @state: The state we are searching for
 *
 * We xor the bitmap data with a pattern which is the bitwise opposite
 * of what we are looking for; this gives rise to a pattern of ones
 * wherever there is a match. Since we have two bits per entry, we
 * take this pattern, shift it down by one place and then and it with
 * the original. All the even bit positions (0,2,4, etc) then represent
 * successful matches, so we mask with 0x55555..... to remove the unwanted
 * odd bit positions.
 *
 * This allows searching of a whole u64 at once (32 blocks) with a
 * single test (on 64 bit arches).
 */

static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
{
	u64 tmp;
	static const u64 search[] = {
		[0] = 0xffffffffffffffffULL,
		[1] = 0xaaaaaaaaaaaaaaaaULL,
		[2] = 0x5555555555555555ULL,
		[3] = 0x0000000000000000ULL,
	};
	tmp = le64_to_cpu(*ptr) ^ search[state];
	tmp &= (tmp >> 1);
	tmp &= mask;
	return tmp;
}
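
/*
 * Worked example (illustrative): searching a word whose low byte is 0x44
 * (blocks, lowest bits first: free, used, free, used) for state 1:
 *
 *	tmp = 0x44 ^ search[1]        -> 0x44 ^ 0xaa = 0xee
 *	tmp &= (tmp >> 1)             -> 0xee & 0x77 = 0x66
 *	tmp &= 0x5555555555555555ULL  -> 0x66 & 0x55 = 0x44
 *
 * Bits 2 and 6 survive, i.e. entries 1 and 3 hold state 1, matching the
 * encoding; __ffs64() in gfs2_bitfit() then picks the lowest match.
 */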

/**
 * rs_cmp - multi-block reservation range compare
 * @blk: absolute file system block number of the new reservation
 * @len: number of blocks in the new reservation
 * @rs: existing reservation to compare against
 *
 * returns: 1 if the block range is beyond the reach of the reservation
 *         -1 if the block range is before the start of the reservation
 *          0 if the block range overlaps with the reservation
 */
static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs)
{
	u64 startblk = gfs2_rbm_to_block(&rs->rs_rbm);

	if (blk >= startblk + rs->rs_free)
		return 1;
	if (blk + len - 1 < startblk)
		return -1;
	return 0;
}

/**
 * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
 *       a block in a given allocation state.
 * @buf: the buffer that holds the bitmaps
 * @len: the length (in bytes) of the buffer
 * @goal: start search at this block's bit-pair (within @buf)
 * @state: GFS2_BLKST_XXX the state of the block we're looking for.
 *
 * Scope of @goal and returned block number is only within this bitmap buffer,
 * not entire rgrp or filesystem.  @buf will be offset from the actual
 * beginning of a bitmap block buffer, skipping any header structures, but
 * headers are always a multiple of 64 bits long so that the buffer is
 * always aligned to a 64 bit boundary.
 *
 * The size of the buffer is in bytes, but it is assumed that it is
 * always ok to read a complete multiple of 64 bits at the end
 * of the block in case the end is not aligned to a natural boundary.
 *
 * Return: the block number (bitmap buffer scope) that was found
 */

static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
		       u32 goal, u8 state)
{
	u32 spoint = (goal << 1) & ((8*sizeof(u64)) - 1);
	const __le64 *ptr = ((__le64 *)buf) + (goal >> 5);
	const __le64 *end = (__le64 *)(buf + ALIGN(len, sizeof(u64)));
	u64 tmp;
	u64 mask = 0x5555555555555555ULL;
	u32 bit;

	/* Mask off bits we don't care about at the start of the search */
	mask <<= spoint;
	tmp = gfs2_bit_search(ptr, mask, state);
	ptr++;
	while (tmp == 0 && ptr < end) {
		tmp = gfs2_bit_search(ptr, 0x5555555555555555ULL, state);
		ptr++;
	}
	/* Mask off any bits which are more than len bytes from the start */
	if (ptr == end && (len & (sizeof(u64) - 1)))
		tmp &= (((u64)~0) >> (64 - 8*(len & (sizeof(u64) - 1))));
	/* Didn't find anything, so return */
	if (tmp == 0)
		return BFITNOENT;
	ptr--;
	bit = __ffs64(tmp);
	bit /= 2;	/* two bits per entry in the bitmap */
	return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit;
}
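
/*
 * Usage sketch (illustrative): given a bitmap buffer whose first byte is
 * 0x55 (four used data blocks) followed by 0x00 (four free blocks), and
 * padded to a multiple of 64 bits as the comment above requires:
 *
 *	u32 blk = gfs2_bitfit(buf, 2, 0, GFS2_BLKST_FREE);
 *
 * returns 4, the first free bit-pair, while the same search for
 * GFS2_BLKST_UNLINKED returns BFITNOENT.
 */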

/**
 * gfs2_rbm_from_block - Set the rbm based upon rgd and block number
 * @rbm: The rbm with rgd already set correctly
 * @block: The block number (filesystem relative)
 *
 * This sets the bi and offset members of an rbm based on a
 * resource group and a filesystem relative block number. The
 * resource group must be set in the rbm on entry, the bi and
 * offset members will be set by this function.
 *
 * Returns: 0 on success, or an error code
 */

static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
{
	if (!rgrp_contains_block(rbm->rgd, block))
		return -E2BIG;
	rbm->bii = 0;
	rbm->offset = block - rbm->rgd->rd_data0;
	/* Check if the block is within the first bitmap block */
	if (rbm->offset < rbm_bi(rbm)->bi_blocks)
		return 0;

	/* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */
	rbm->offset += (sizeof(struct gfs2_rgrp) -
			sizeof(struct gfs2_meta_header)) * GFS2_NBBY;
	rbm->bii = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
	rbm->offset -= rbm->bii * rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
	return 0;
}
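
/*
 * Worked example (illustrative, assuming a 4KiB block size): the first
 * bitmap block loses sizeof(struct gfs2_rgrp) = 128 bytes to its header
 * while later blocks lose only sizeof(struct gfs2_meta_header) = 24.
 * Padding the offset by (128 - 24) * GFS2_NBBY = 416 blocks lets every
 * bitmap be treated as a uniform sd_blocks_per_bitmap =
 * (4096 - 24) * GFS2_NBBY = 16288 blocks, so bii and offset fall out of
 * a plain division and remainder.
 */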

/**
 * gfs2_rbm_incr - increment an rbm structure
 * @rbm: The rbm with rgd already set correctly
 *
 * This function takes an existing rbm structure and increments it to the next
 * viable block offset.
 *
 * Returns: If incrementing the offset would cause the rbm to go past the
 *          end of the rgrp, true is returned, otherwise false.
 *
 */

static bool gfs2_rbm_incr(struct gfs2_rbm *rbm)
{
	if (rbm->offset + 1 < rbm_bi(rbm)->bi_blocks) { /* in the same bitmap */
		rbm->offset++;
		return false;
	}
	if (rbm->bii == rbm->rgd->rd_length - 1) /* at the last bitmap */
		return true;

	rbm->offset = 0;
	rbm->bii++;
	return false;
}

/**
 * gfs2_unaligned_extlen - Look for free blocks which are not byte aligned
 * @rbm: Position to search (value/result)
 * @n_unaligned: Number of unaligned blocks to check
 * @len: Decremented for each block found (terminate on zero)
 *
 * Returns: true if a non-free block is encountered
 */

static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *len)
{
	u32 n;
	u8 res;

	for (n = 0; n < n_unaligned; n++) {
		res = gfs2_testbit(rbm, true);
		if (res != GFS2_BLKST_FREE)
			return true;
		(*len)--;
		if (*len == 0)
			return true;
		if (gfs2_rbm_incr(rbm))
			return true;
	}

	return false;
}

/**
 * gfs2_free_extlen - Return extent length of free blocks
 * @rrbm: Starting position
 * @len: Max length to check
 *
 * Starting at the block specified by the rbm, see how many free blocks
 * there are, not reading more than len blocks ahead. This can be done
 * using memchr_inv when the blocks are byte aligned, but has to be done
 * on a block by block basis in case of unaligned blocks. Also this
 * function can cope with bitmap boundaries (although it must stop on
 * a resource group boundary)
 *
 * Returns: Number of free blocks in the extent
 */

static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
{
	struct gfs2_rbm rbm = *rrbm;
	u32 n_unaligned = rbm.offset & 3;
	u32 size = len;
	u32 bytes;
	u32 chunk_size;
	u8 *ptr, *start, *end;
	u64 block;
	struct gfs2_bitmap *bi;

	if (n_unaligned &&
	    gfs2_unaligned_extlen(&rbm, 4 - n_unaligned, &len))
		goto out;

	n_unaligned = len & 3;
	/* Start is now byte aligned */
	while (len > 3) {
		bi = rbm_bi(&rbm);
		start = bi->bi_bh->b_data;
		if (bi->bi_clone)
			start = bi->bi_clone;
		start += bi->bi_offset;
		end = start + bi->bi_bytes;
		BUG_ON(rbm.offset & 3);
		start += (rbm.offset / GFS2_NBBY);
		bytes = min_t(u32, len / GFS2_NBBY, (end - start));
		ptr = memchr_inv(start, 0, bytes);
		chunk_size = ((ptr == NULL) ? bytes : (ptr - start));
		chunk_size *= GFS2_NBBY;
		BUG_ON(len < chunk_size);
		len -= chunk_size;
		block = gfs2_rbm_to_block(&rbm);
		if (gfs2_rbm_from_block(&rbm, block + chunk_size)) {
			n_unaligned = 0;
			break;
		}
		if (ptr) {
			n_unaligned = 3;
			break;
		}
		n_unaligned = len & 3;
	}

	/* Deal with any bits left over at the end */
	if (n_unaligned)
		gfs2_unaligned_extlen(&rbm, n_unaligned, &len);
out:
	return size - len;
}
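
/*
 * Illustrative walk-through: with a byte-aligned rbm.offset and bitmap
 * bytes { 0x00, 0x00, 0x10, ... }, memchr_inv() points at the third byte,
 * so chunk_size = 2 * GFS2_NBBY = 8 free blocks are counted at once; the
 * remaining sub-byte run is then finished block by block through
 * gfs2_unaligned_extlen() against the clone bitmap.
 */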

/**
 * gfs2_bitcount - count the number of bits in a certain state
 * @rgd: the resource group descriptor
 * @buffer: the buffer that holds the bitmaps
 * @buflen: the length (in bytes) of the buffer
 * @state: the state of the block we're looking for
 *
 * Returns: The number of bits
 */

static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, const u8 *buffer,
			 unsigned int buflen, u8 state)
{
	const u8 *byte = buffer;
	const u8 *end = buffer + buflen;
	const u8 state1 = state << 2;
	const u8 state2 = state << 4;
	const u8 state3 = state << 6;
	u32 count = 0;

	for (; byte < end; byte++) {
		if (((*byte) & 0x03) == state)
			count++;
		if (((*byte) & 0x0C) == state1)
			count++;
		if (((*byte) & 0x30) == state2)
			count++;
		if (((*byte) & 0xC0) == state3)
			count++;
	}

	return count;
}

/**
 * gfs2_rgrp_verify - Verify that a resource group is consistent
 * @rgd: the rgrp
 *
 */

void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
{
	struct gfs2_sbd *sdp = rgd->rd_sbd;
	struct gfs2_bitmap *bi = NULL;
	u32 length = rgd->rd_length;
	u32 count[4], tmp;
	int buf, x;

	memset(count, 0, 4 * sizeof(u32));

	/* Count # blocks in each of 4 possible allocation states */
	for (buf = 0; buf < length; buf++) {
		bi = rgd->rd_bits + buf;
		for (x = 0; x < 4; x++)
			count[x] += gfs2_bitcount(rgd,
						  bi->bi_bh->b_data +
						  bi->bi_offset,
						  bi->bi_bytes, x);
	}

	if (count[0] != rgd->rd_free) {
		gfs2_lm(sdp, "free data mismatch:  %u != %u\n",
			count[0], rgd->rd_free);
		gfs2_consist_rgrpd(rgd);
		return;
	}

	tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
	if (count[1] != tmp) {
		gfs2_lm(sdp, "used data mismatch:  %u != %u\n",
			count[1], tmp);
		gfs2_consist_rgrpd(rgd);
		return;
	}

	if (count[2] + count[3] != rgd->rd_dinodes) {
		gfs2_lm(sdp, "used metadata mismatch:  %u != %u\n",
			count[2] + count[3], rgd->rd_dinodes);
		gfs2_consist_rgrpd(rgd);
		return;
	}
}

/**
 * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
 * @sdp: The GFS2 superblock
 * @blk: The data block number
 * @exact: True if this needs to be an exact match
 *
 * The @exact argument should be set to true by most callers. The exception
 * is when we need to match blocks which are not represented by the rgrp
 * bitmap, but which are part of the rgrp (i.e. padding blocks) which are
 * there for alignment purposes. Another way of looking at it is that @exact
 * matches only valid data/metadata blocks, but with @exact false, it will
 * match any block within the extent of the rgrp.
 *
 * Returns: The resource group, or NULL if not found
 */

struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact)
{
	struct rb_node *n, *next;
	struct gfs2_rgrpd *cur;

	spin_lock(&sdp->sd_rindex_spin);
	n = sdp->sd_rindex_tree.rb_node;
	while (n) {
		cur = rb_entry(n, struct gfs2_rgrpd, rd_node);
		next = NULL;
		if (blk < cur->rd_addr)
			next = n->rb_left;
		else if (blk >= cur->rd_data0 + cur->rd_data)
			next = n->rb_right;
		if (next == NULL) {
			spin_unlock(&sdp->sd_rindex_spin);
			if (exact) {
				if (blk < cur->rd_addr)
					return NULL;
				if (blk >= cur->rd_data0 + cur->rd_data)
					return NULL;
			}
			return cur;
		}
		n = next;
	}
	spin_unlock(&sdp->sd_rindex_spin);

	return NULL;
}

/**
 * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem
 * @sdp: The GFS2 superblock
 *
 * Returns: The first rgrp in the filesystem
 */

struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
{
	const struct rb_node *n;
	struct gfs2_rgrpd *rgd;

	spin_lock(&sdp->sd_rindex_spin);
	n = rb_first(&sdp->sd_rindex_tree);
	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
	spin_unlock(&sdp->sd_rindex_spin);

	return rgd;
}

/**
 * gfs2_rgrpd_get_next - get the next RG
 * @rgd: the resource group descriptor
 *
 * Returns: The next rgrp
 */

struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
{
	struct gfs2_sbd *sdp = rgd->rd_sbd;
	const struct rb_node *n;

	spin_lock(&sdp->sd_rindex_spin);
	n = rb_next(&rgd->rd_node);
	if (n == NULL)
		n = rb_first(&sdp->sd_rindex_tree);

	if (unlikely(&rgd->rd_node == n)) {
		spin_unlock(&sdp->sd_rindex_spin);
		return NULL;
	}
	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
	spin_unlock(&sdp->sd_rindex_spin);
	return rgd;
}

void check_and_update_goal(struct gfs2_inode *ip)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	if (!ip->i_goal || gfs2_blk2rgrpd(sdp, ip->i_goal, 1) == NULL)
		ip->i_goal = ip->i_no_addr;
}

void gfs2_free_clones(struct gfs2_rgrpd *rgd)
{
	int x;

	for (x = 0; x < rgd->rd_length; x++) {
		struct gfs2_bitmap *bi = rgd->rd_bits + x;
		kfree(bi->bi_clone);
		bi->bi_clone = NULL;
	}
}

static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs,
		    const char *fs_id_buf)
{
	struct gfs2_inode *ip = container_of(rs, struct gfs2_inode, i_res);

	gfs2_print_dbg(seq, "%s  B: n:%llu s:%llu b:%u f:%u\n", fs_id_buf,
		       (unsigned long long)ip->i_no_addr,
		       (unsigned long long)gfs2_rbm_to_block(&rs->rs_rbm),
		       rs->rs_rbm.offset, rs->rs_free);
}

/**
 * __rs_deltree - remove a multi-block reservation from the rgd tree
 * @rs: The reservation to remove
 *
 */
static void __rs_deltree(struct gfs2_blkreserv *rs)
{
	struct gfs2_rgrpd *rgd;

	if (!gfs2_rs_active(rs))
		return;

	rgd = rs->rs_rbm.rgd;
	trace_gfs2_rs(rs, TRACE_RS_TREEDEL);
	rb_erase(&rs->rs_node, &rgd->rd_rstree);
	RB_CLEAR_NODE(&rs->rs_node);

	if (rs->rs_free) {
		u64 last_block = gfs2_rbm_to_block(&rs->rs_rbm) +
				 rs->rs_free - 1;
		struct gfs2_rbm last_rbm = { .rgd = rs->rs_rbm.rgd, };
		struct gfs2_bitmap *start, *last;

		/* return reserved blocks to the rgrp */
		BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free);
		rs->rs_rbm.rgd->rd_reserved -= rs->rs_free;
		/* The rgrp extent failure point is likely not to increase;
		   it will only do so if the freed blocks are somehow
		   contiguous with a span of free blocks that follows. Still,
		   it will force the number to be recalculated later. */
		rgd->rd_extfail_pt += rs->rs_free;
		rs->rs_free = 0;
		if (gfs2_rbm_from_block(&last_rbm, last_block))
			return;
		start = rbm_bi(&rs->rs_rbm);
		last = rbm_bi(&last_rbm);
		do
			clear_bit(GBF_FULL, &start->bi_flags);
		while (start++ != last);
	}
}

/**
 * gfs2_rs_deltree - remove a multi-block reservation from the rgd tree
 * @rs: The reservation to remove
 *
 */
void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
{
	struct gfs2_rgrpd *rgd;

	rgd = rs->rs_rbm.rgd;
	if (rgd) {
		spin_lock(&rgd->rd_rsspin);
		__rs_deltree(rs);
		BUG_ON(rs->rs_free);
		spin_unlock(&rgd->rd_rsspin);
	}
}

/**
 * gfs2_rs_delete - delete a multi-block reservation
 * @ip: The inode for this reservation
 * @wcount: The inode's write count, or NULL
 *
 */
void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount)
{
	down_write(&ip->i_rw_mutex);
	if ((wcount == NULL) || (atomic_read(wcount) <= 1))
		gfs2_rs_deltree(&ip->i_res);
	up_write(&ip->i_rw_mutex);
}

/**
 * return_all_reservations - return all reserved blocks back to the rgrp.
 * @rgd: the rgrp that needs its space back
 *
 * We previously reserved a bunch of blocks for allocation. Now we need to
 * give them back. This leaves the reservation structures intact, but removes
 * all of their corresponding "no-fly zones".
 */
static void return_all_reservations(struct gfs2_rgrpd *rgd)
{
	struct rb_node *n;
	struct gfs2_blkreserv *rs;

	spin_lock(&rgd->rd_rsspin);
	while ((n = rb_first(&rgd->rd_rstree))) {
		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
		__rs_deltree(rs);
	}
	spin_unlock(&rgd->rd_rsspin);
}

void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
{
	struct rb_node *n;
	struct gfs2_rgrpd *rgd;
	struct gfs2_glock *gl;

	while ((n = rb_first(&sdp->sd_rindex_tree))) {
		rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
		gl = rgd->rd_gl;

		rb_erase(n, &sdp->sd_rindex_tree);

		if (gl) {
			if (gl->gl_state != LM_ST_UNLOCKED) {
				gfs2_glock_cb(gl, LM_ST_UNLOCKED);
				flush_delayed_work(&gl->gl_work);
			}
			gfs2_rgrp_brelse(rgd);
			glock_clear_object(gl, rgd);
			gfs2_glock_put(gl);
		}

		gfs2_free_clones(rgd);
		return_all_reservations(rgd);
		kfree(rgd->rd_bits);
		rgd->rd_bits = NULL;
		kmem_cache_free(gfs2_rgrpd_cachep, rgd);
	}
}

/**
 * compute_bitstructs - Compute the bitmap sizes
 * @rgd: The resource group descriptor
 *
 * Calculates bitmap descriptors, one for each block that contains bitmap data
 *
 * Returns: errno
 */

static int compute_bitstructs(struct gfs2_rgrpd *rgd)
{
	struct gfs2_sbd *sdp = rgd->rd_sbd;
	struct gfs2_bitmap *bi;
	u32 length = rgd->rd_length; /* # blocks in hdr & bitmap */
	u32 bytes_left, bytes;
	int x;

	if (!length)
		return -EINVAL;

	rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_NOFS);
	if (!rgd->rd_bits)
		return -ENOMEM;

	bytes_left = rgd->rd_bitbytes;

	for (x = 0; x < length; x++) {
		bi = rgd->rd_bits + x;

		bi->bi_flags = 0;
		/* small rgrp; bitmap stored completely in header block */
		if (length == 1) {
			bytes = bytes_left;
			bi->bi_offset = sizeof(struct gfs2_rgrp);
			bi->bi_start = 0;
			bi->bi_bytes = bytes;
			bi->bi_blocks = bytes * GFS2_NBBY;
		/* header block */
		} else if (x == 0) {
			bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
			bi->bi_offset = sizeof(struct gfs2_rgrp);
			bi->bi_start = 0;
			bi->bi_bytes = bytes;
			bi->bi_blocks = bytes * GFS2_NBBY;
		/* last block */
		} else if (x + 1 == length) {
			bytes = bytes_left;
			bi->bi_offset = sizeof(struct gfs2_meta_header);
			bi->bi_start = rgd->rd_bitbytes - bytes_left;
			bi->bi_bytes = bytes;
			bi->bi_blocks = bytes * GFS2_NBBY;
		/* other blocks */
		} else {
			bytes = sdp->sd_sb.sb_bsize -
				sizeof(struct gfs2_meta_header);
			bi->bi_offset = sizeof(struct gfs2_meta_header);
			bi->bi_start = rgd->rd_bitbytes - bytes_left;
			bi->bi_bytes = bytes;
			bi->bi_blocks = bytes * GFS2_NBBY;
		}

		bytes_left -= bytes;
	}

	if (bytes_left) {
		gfs2_consist_rgrpd(rgd);
		return -EIO;
	}
	bi = rgd->rd_bits + (length - 1);
	if ((bi->bi_start + bi->bi_bytes) * GFS2_NBBY != rgd->rd_data) {
		gfs2_lm(sdp,
			"ri_addr = %llu\n"
			"ri_length = %u\n"
			"ri_data0 = %llu\n"
			"ri_data = %u\n"
			"ri_bitbytes = %u\n"
			"start=%u len=%u offset=%u\n",
			(unsigned long long)rgd->rd_addr,
			rgd->rd_length,
			(unsigned long long)rgd->rd_data0,
			rgd->rd_data,
			rgd->rd_bitbytes,
			bi->bi_start, bi->bi_bytes, bi->bi_offset);
		gfs2_consist_rgrpd(rgd);
		return -EIO;
	}

	return 0;
}
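
/*
 * Worked layout example (illustrative, assuming 4KiB blocks): an rgrp with
 * rd_length = 3 and rd_bitbytes = 9040 would be carved up as
 *
 *	x == 0: bi_offset = 128 (struct gfs2_rgrp), bi_start = 0,
 *		bi_bytes = 4096 - 128 = 3968  -> 15872 blocks
 *	x == 1: bi_offset = 24 (struct gfs2_meta_header), bi_start = 3968,
 *		bi_bytes = 4096 - 24 = 4072   -> 16288 blocks
 *	x == 2: bi_offset = 24, bi_start = 8040,
 *		bi_bytes = 9040 - 3968 - 4072 = 1000 -> 4000 blocks
 *
 * and the final sanity check (8040 + 1000) * GFS2_NBBY == rd_data holds
 * for rd_data = 36160.
 */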

/**
 * gfs2_ri_total - Total up the file system space, according to the rindex.
 * @sdp: the filesystem
 *
 */
u64 gfs2_ri_total(struct gfs2_sbd *sdp)
{
	u64 total_data = 0;
	struct inode *inode = sdp->sd_rindex;
	struct gfs2_inode *ip = GFS2_I(inode);
	char buf[sizeof(struct gfs2_rindex)];
	int error, rgrps;

	for (rgrps = 0;; rgrps++) {
		loff_t pos = rgrps * sizeof(struct gfs2_rindex);

		if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode))
			break;
		error = gfs2_internal_read(ip, buf, &pos,
					   sizeof(struct gfs2_rindex));
		if (error != sizeof(struct gfs2_rindex))
			break;
		total_data += be32_to_cpu(((struct gfs2_rindex *)buf)->ri_data);
	}
	return total_data;
}

static int rgd_insert(struct gfs2_rgrpd *rgd)
{
	struct gfs2_sbd *sdp = rgd->rd_sbd;
	struct rb_node **newn = &sdp->sd_rindex_tree.rb_node, *parent = NULL;

	/* Figure out where to put new node */
	while (*newn) {
		struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd,
						  rd_node);

		parent = *newn;
		if (rgd->rd_addr < cur->rd_addr)
			newn = &((*newn)->rb_left);
		else if (rgd->rd_addr > cur->rd_addr)
			newn = &((*newn)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&rgd->rd_node, parent, newn);
	rb_insert_color(&rgd->rd_node, &sdp->sd_rindex_tree);
	sdp->sd_rgrps++;
	return 0;
}

/**
 * read_rindex_entry - Pull in a new resource index entry from the disk
 * @ip: Pointer to the rindex inode
 *
 * Returns: 0 on success, > 0 on EOF, error code otherwise
 */

static int read_rindex_entry(struct gfs2_inode *ip)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
	struct gfs2_rindex buf;
	int error;
	struct gfs2_rgrpd *rgd;

	if (pos >= i_size_read(&ip->i_inode))
		return 1;

	error = gfs2_internal_read(ip, (char *)&buf, &pos,
				   sizeof(struct gfs2_rindex));

	if (error != sizeof(struct gfs2_rindex))
		return (error == 0) ? 1 : error;

	rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS);
	error = -ENOMEM;
	if (!rgd)
		return error;

	rgd->rd_sbd = sdp;
	rgd->rd_addr = be64_to_cpu(buf.ri_addr);
	rgd->rd_length = be32_to_cpu(buf.ri_length);
	rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
	rgd->rd_data = be32_to_cpu(buf.ri_data);
	rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
	spin_lock_init(&rgd->rd_rsspin);

	error = compute_bitstructs(rgd);
	if (error)
		goto fail;

	error = gfs2_glock_get(sdp, rgd->rd_addr,
			       &gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
	if (error)
		goto fail;

	rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
	rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED);
	if (rgd->rd_data > sdp->sd_max_rg_data)
		sdp->sd_max_rg_data = rgd->rd_data;
	spin_lock(&sdp->sd_rindex_spin);
	error = rgd_insert(rgd);
	spin_unlock(&sdp->sd_rindex_spin);
	if (!error) {
		glock_set_object(rgd->rd_gl, rgd);
		return 0;
	}

	error = 0; /* someone else read in the rgrp; free it and ignore it */
	gfs2_glock_put(rgd->rd_gl);

fail:
	kfree(rgd->rd_bits);
	rgd->rd_bits = NULL;
	kmem_cache_free(gfs2_rgrpd_cachep, rgd);
	return error;
}

/**
 * set_rgrp_preferences - Run all the rgrps, selecting some we prefer to use
 * @sdp: the GFS2 superblock
 *
 * The purpose of this function is to select a subset of the resource groups
 * and mark them as PREFERRED. We do it in such a way that each node prefers
 * to use a unique set of rgrps to minimize glock contention.
 */
static void set_rgrp_preferences(struct gfs2_sbd *sdp)
{
	struct gfs2_rgrpd *rgd, *first;
	int i;

	/* Skip an initial number of rgrps, based on this node's journal ID.
	   That should start each node out on its own set. */
	rgd = gfs2_rgrpd_get_first(sdp);
	for (i = 0; i < sdp->sd_lockstruct.ls_jid; i++)
		rgd = gfs2_rgrpd_get_next(rgd);
	first = rgd;

	do {
		rgd->rd_flags |= GFS2_RDF_PREFERRED;
		for (i = 0; i < sdp->sd_journals; i++) {
			rgd = gfs2_rgrpd_get_next(rgd);
			if (!rgd || rgd == first)
				break;
		}
	} while (rgd && rgd != first);
}

/**
 * gfs2_ri_update - Pull in a new resource index from the disk
 * @ip: pointer to the rindex inode
 *
 * Returns: 0 on successful update, error code otherwise
 */

static int gfs2_ri_update(struct gfs2_inode *ip)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	int error;

	do {
		error = read_rindex_entry(ip);
	} while (error == 0);

	if (error < 0)
		return error;

	set_rgrp_preferences(sdp);

	sdp->sd_rindex_uptodate = 1;
	return 0;
}

/**
 * gfs2_rindex_update - Update the rindex if required
 * @sdp: The GFS2 superblock
 *
 * We grab a lock on the rindex inode to make sure that it doesn't
 * change whilst we are performing an operation. We keep this lock
 * for quite long periods of time compared to other locks. This
 * doesn't matter, since it is shared and it is very, very rarely
 * accessed in the exclusive mode (i.e. only when expanding the filesystem).
 *
 * This makes sure that we're using the latest copy of the resource index
 * special file, which might have been updated if someone expanded the
 * filesystem (via gfs2_grow utility), which adds new resource groups.
 *
 * Returns: 0 on success, error code otherwise
 */

int gfs2_rindex_update(struct gfs2_sbd *sdp)
{
	struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
	struct gfs2_glock *gl = ip->i_gl;
	struct gfs2_holder ri_gh;
	int error = 0;
	int unlock_required = 0;

	/* Read new copy from disk if we don't have the latest */
	if (!sdp->sd_rindex_uptodate) {
		if (!gfs2_glock_is_locked_by_me(gl)) {
			error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh);
			if (error)
				return error;
			unlock_required = 1;
		}
		if (!sdp->sd_rindex_uptodate)
			error = gfs2_ri_update(ip);
		if (unlock_required)
			gfs2_glock_dq_uninit(&ri_gh);
	}

	return error;
}

static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
{
	const struct gfs2_rgrp *str = buf;
	u32 rg_flags;

	rg_flags = be32_to_cpu(str->rg_flags);
	rg_flags &= ~GFS2_RDF_MASK;
	rgd->rd_flags &= GFS2_RDF_MASK;
	rgd->rd_flags |= rg_flags;
	rgd->rd_free = be32_to_cpu(str->rg_free);
	rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
	rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
	/* rd_data0, rd_data and rd_bitbytes already set from rindex */
}

static void gfs2_rgrp_ondisk2lvb(struct gfs2_rgrp_lvb *rgl, const void *buf)
{
	const struct gfs2_rgrp *str = buf;

	rgl->rl_magic = cpu_to_be32(GFS2_MAGIC);
	rgl->rl_flags = str->rg_flags;
	rgl->rl_free = str->rg_free;
	rgl->rl_dinodes = str->rg_dinodes;
	rgl->rl_igeneration = str->rg_igeneration;
	rgl->__pad = 0UL;
}

static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
{
	struct gfs2_rgrpd *next = gfs2_rgrpd_get_next(rgd);
	struct gfs2_rgrp *str = buf;
	u32 crc;

	str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK);
	str->rg_free = cpu_to_be32(rgd->rd_free);
	str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
	if (next == NULL)
		str->rg_skip = 0;
	else if (next->rd_addr > rgd->rd_addr)
		str->rg_skip = cpu_to_be32(next->rd_addr - rgd->rd_addr);
	str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
	str->rg_data0 = cpu_to_be64(rgd->rd_data0);
	str->rg_data = cpu_to_be32(rgd->rd_data);
	str->rg_bitbytes = cpu_to_be32(rgd->rd_bitbytes);
	str->rg_crc = 0;
	crc = gfs2_disk_hash(buf, sizeof(struct gfs2_rgrp));
	str->rg_crc = cpu_to_be32(crc);

	memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, buf);
}

static int gfs2_rgrp_lvb_valid(struct gfs2_rgrpd *rgd)
{
	struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
	struct gfs2_rgrp *str = (struct gfs2_rgrp *)rgd->rd_bits[0].bi_bh->b_data;
	struct gfs2_sbd *sdp = rgd->rd_sbd;
	int valid = 1;

	if (rgl->rl_flags != str->rg_flags) {
		fs_warn(sdp, "GFS2: rgd: %llu lvb flag mismatch %u/%u",
			(unsigned long long)rgd->rd_addr,
			be32_to_cpu(rgl->rl_flags), be32_to_cpu(str->rg_flags));
		valid = 0;
	}
	if (rgl->rl_free != str->rg_free) {
		fs_warn(sdp, "GFS2: rgd: %llu lvb free mismatch %u/%u",
			(unsigned long long)rgd->rd_addr,
			be32_to_cpu(rgl->rl_free), be32_to_cpu(str->rg_free));
		valid = 0;
	}
	if (rgl->rl_dinodes != str->rg_dinodes) {
		fs_warn(sdp, "GFS2: rgd: %llu lvb dinode mismatch %u/%u",
			(unsigned long long)rgd->rd_addr,
			be32_to_cpu(rgl->rl_dinodes),
			be32_to_cpu(str->rg_dinodes));
		valid = 0;
	}
	if (rgl->rl_igeneration != str->rg_igeneration) {
		fs_warn(sdp, "GFS2: rgd: %llu lvb igen mismatch %llu/%llu",
			(unsigned long long)rgd->rd_addr,
			(unsigned long long)be64_to_cpu(rgl->rl_igeneration),
			(unsigned long long)be64_to_cpu(str->rg_igeneration));
		valid = 0;
	}
	return valid;
}

static u32 count_unlinked(struct gfs2_rgrpd *rgd)
{
	struct gfs2_bitmap *bi;
	const u32 length = rgd->rd_length;
	const u8 *buffer = NULL;
	u32 i, goal, count = 0;

	for (i = 0, bi = rgd->rd_bits; i < length; i++, bi++) {
		goal = 0;
		buffer = bi->bi_bh->b_data + bi->bi_offset;
		WARN_ON(!buffer_uptodate(bi->bi_bh));
		while (goal < bi->bi_blocks) {
			goal = gfs2_bitfit(buffer, bi->bi_bytes, goal,
					   GFS2_BLKST_UNLINKED);
			if (goal == BFITNOENT)
				break;
			count++;
			goal++;
		}
	}

	return count;
}


/**
 * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
 * @rgd: the struct gfs2_rgrpd describing the RG to read in
 *
 * Read in all of a Resource Group's header and bitmap blocks.
 * Caller must eventually call gfs2_rgrp_brelse() to free the bitmaps.
 *
 * Returns: errno
 */

static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
{
	struct gfs2_sbd *sdp = rgd->rd_sbd;
	struct gfs2_glock *gl = rgd->rd_gl;
	unsigned int length = rgd->rd_length;
	struct gfs2_bitmap *bi;
	unsigned int x, y;
	int error;

	if (rgd->rd_bits[0].bi_bh != NULL)
		return 0;

	for (x = 0; x < length; x++) {
		bi = rgd->rd_bits + x;
		error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, 0, &bi->bi_bh);
		if (error)
			goto fail;
	}

	for (y = length; y--;) {
		bi = rgd->rd_bits + y;
		error = gfs2_meta_wait(sdp, bi->bi_bh);
		if (error)
			goto fail;
		if (gfs2_metatype_check(sdp, bi->bi_bh, y ? GFS2_METATYPE_RB :
					      GFS2_METATYPE_RG)) {
			error = -EIO;
			goto fail;
		}
	}

	if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) {
		for (x = 0; x < length; x++)
			clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags);
		gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
		rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
		rgd->rd_free_clone = rgd->rd_free;
		/* max out the rgrp allocation failure point */
		rgd->rd_extfail_pt = rgd->rd_free;
	}
	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
		rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd));
		gfs2_rgrp_ondisk2lvb(rgd->rd_rgl,
				     rgd->rd_bits[0].bi_bh->b_data);
	} else if (sdp->sd_args.ar_rgrplvb) {
		if (!gfs2_rgrp_lvb_valid(rgd)) {
			gfs2_consist_rgrpd(rgd);
			error = -EIO;
			goto fail;
		}
		if (rgd->rd_rgl->rl_unlinked == 0)
			rgd->rd_flags &= ~GFS2_RDF_CHECK;
	}
	return 0;

fail:
	while (x--) {
		bi = rgd->rd_bits + x;
		brelse(bi->bi_bh);
		bi->bi_bh = NULL;
		gfs2_assert_warn(sdp, !bi->bi_clone);
	}

	return error;
}

static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
{
	u32 rl_flags;

	if (rgd->rd_flags & GFS2_RDF_UPTODATE)
		return 0;

	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic)
		return gfs2_rgrp_bh_get(rgd);

	rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags);
	rl_flags &= ~GFS2_RDF_MASK;
	rgd->rd_flags &= GFS2_RDF_MASK;
	rgd->rd_flags |= (rl_flags | GFS2_RDF_CHECK);
	if (rgd->rd_rgl->rl_unlinked == 0)
		rgd->rd_flags &= ~GFS2_RDF_CHECK;
	rgd->rd_free = be32_to_cpu(rgd->rd_rgl->rl_free);
	rgd->rd_free_clone = rgd->rd_free;
	rgd->rd_dinodes = be32_to_cpu(rgd->rd_rgl->rl_dinodes);
	rgd->rd_igeneration = be64_to_cpu(rgd->rd_rgl->rl_igeneration);
	return 0;
}

int gfs2_rgrp_go_lock(struct gfs2_holder *gh)
{
	struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
	struct gfs2_sbd *sdp = rgd->rd_sbd;

	if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb)
		return 0;
	return gfs2_rgrp_bh_get(rgd);
}

/**
 * gfs2_rgrp_brelse - Release RG bitmaps read in with gfs2_rgrp_bh_get()
 * @rgd: The resource group
 *
 */

void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd)
{
	int x, length = rgd->rd_length;

	for (x = 0; x < length; x++) {
		struct gfs2_bitmap *bi = rgd->rd_bits + x;
		if (bi->bi_bh) {
			brelse(bi->bi_bh);
			bi->bi_bh = NULL;
		}
	}
}

int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
			     struct buffer_head *bh,
			     const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed)
{
	struct super_block *sb = sdp->sd_vfs;
	u64 blk;
	sector_t start = 0;
	sector_t nr_blks = 0;
	int rv;
	unsigned int x;
	u32 trimmed = 0;
	u8 diff;

	for (x = 0; x < bi->bi_bytes; x++) {
		const u8 *clone = bi->bi_clone ? bi->bi_clone : bi->bi_bh->b_data;
		clone += bi->bi_offset;
		clone += x;
		if (bh) {
			const u8 *orig = bh->b_data + bi->bi_offset + x;
			diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1));
		} else {
			diff = ~(*clone | (*clone >> 1));
		}
		diff &= 0x55;
		if (diff == 0)
			continue;
		blk = offset + ((bi->bi_start + x) * GFS2_NBBY);
		while (diff) {
			if (diff & 1) {
				if (nr_blks == 0)
					goto start_new_extent;
				if ((start + nr_blks) != blk) {
					if (nr_blks >= minlen) {
						rv = sb_issue_discard(sb,
							start, nr_blks,
							GFP_NOFS, 0);
						if (rv)
							goto fail;
						trimmed += nr_blks;
					}
					nr_blks = 0;
start_new_extent:
					start = blk;
				}
				nr_blks++;
			}
			diff >>= 2;
			blk++;
		}
	}
	if (nr_blks >= minlen) {
		rv = sb_issue_discard(sb, start, nr_blks, GFP_NOFS, 0);
		if (rv)
			goto fail;
		trimmed += nr_blks;
	}
	if (ptrimmed)
		*ptrimmed = trimmed;
	return 0;

fail:
	if (sdp->sd_args.ar_discard)
		fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem\n", rv);
	sdp->sd_args.ar_discard = 0;
	return -EIO;
}
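
/*
 * Worked example of the diff mask above (illustrative): with *orig = 0x00
 * (all four blocks free in the bitmap being written) and *clone = 0x05
 * (blocks 0 and 1 still held in the clone until the log is flushed):
 *
 *	~(*orig | (*orig >> 1))   = 0xff   -> free in orig
 *	 (*clone | (*clone >> 1)) = 0x07   -> in use in clone
 *	diff = 0xff & 0x07 & 0x55 = 0x05   -> blocks 0 and 1
 *
 * i.e. diff selects blocks that have just been deallocated, which are
 * exactly the ones worth discarding.
 */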

/**
 * gfs2_fitrim - Generate discard requests for unused bits of the filesystem
 * @filp: Any file on the filesystem
 * @argp: Pointer to the arguments (also used to pass result)
 *
 * Returns: 0 on success, otherwise error code
 */

int gfs2_fitrim(struct file *filp, void __user *argp)
{
	struct inode *inode = file_inode(filp);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	struct request_queue *q = bdev_get_queue(sdp->sd_vfs->s_bdev);
	struct buffer_head *bh;
	struct gfs2_rgrpd *rgd;
	struct gfs2_rgrpd *rgd_end;
	struct gfs2_holder gh;
	struct fstrim_range r;
	int ret = 0;
	u64 amt;
	u64 trimmed = 0;
	u64 start, end, minlen;
	unsigned int x;
	unsigned bs_shift = sdp->sd_sb.sb_bsize_shift;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
		return -EROFS;

	if (!blk_queue_discard(q))
		return -EOPNOTSUPP;

	if (copy_from_user(&r, argp, sizeof(r)))
		return -EFAULT;

	ret = gfs2_rindex_update(sdp);
	if (ret)
		return ret;

	start = r.start >> bs_shift;
	end = start + (r.len >> bs_shift);
	minlen = max_t(u64, r.minlen,
		       q->limits.discard_granularity) >> bs_shift;

	if (end <= start || minlen > sdp->sd_max_rg_data)
		return -EINVAL;

	rgd = gfs2_blk2rgrpd(sdp, start, 0);
	rgd_end = gfs2_blk2rgrpd(sdp, end, 0);

	if ((gfs2_rgrpd_get_first(sdp) == gfs2_rgrpd_get_next(rgd_end))
	    && (start > rgd_end->rd_data0 + rgd_end->rd_data))
		return -EINVAL; /* start is beyond the end of the fs */

	while (1) {

		ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh);
		if (ret)
			goto out;

		if (!(rgd->rd_flags & GFS2_RGF_TRIMMED)) {
			/* Trim each bitmap in the rgrp */
			for (x = 0; x < rgd->rd_length; x++) {
				struct gfs2_bitmap *bi = rgd->rd_bits + x;
				ret = gfs2_rgrp_send_discards(sdp,
						rgd->rd_data0, NULL, bi, minlen,
						&amt);
				if (ret) {
					gfs2_glock_dq_uninit(&gh);
					goto out;
				}
				trimmed += amt;
			}

			/* Mark rgrp as having been trimmed */
			ret = gfs2_trans_begin(sdp, RES_RG_HDR, 0);
			if (ret == 0) {
				bh = rgd->rd_bits[0].bi_bh;
				rgd->rd_flags |= GFS2_RGF_TRIMMED;
				gfs2_trans_add_meta(rgd->rd_gl, bh);
				gfs2_rgrp_out(rgd, bh->b_data);
				gfs2_trans_end(sdp);
			}
		}
		gfs2_glock_dq_uninit(&gh);

		if (rgd == rgd_end)
			break;

		rgd = gfs2_rgrpd_get_next(rgd);
	}

out:
	r.len = trimmed << bs_shift;
	if (copy_to_user(argp, &r, sizeof(r)))
		return -EFAULT;

	return ret;
}

/**
 * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree
 * @ip: the inode structure
 *
 */
static void rs_insert(struct gfs2_inode *ip)
{
	struct rb_node **newn, *parent = NULL;
	int rc;
	struct gfs2_blkreserv *rs = &ip->i_res;
	struct gfs2_rgrpd *rgd = rs->rs_rbm.rgd;
	u64 fsblock = gfs2_rbm_to_block(&rs->rs_rbm);

	BUG_ON(gfs2_rs_active(rs));

	spin_lock(&rgd->rd_rsspin);
	newn = &rgd->rd_rstree.rb_node;
	while (*newn) {
		struct gfs2_blkreserv *cur =
			rb_entry(*newn, struct gfs2_blkreserv, rs_node);

		parent = *newn;
		rc = rs_cmp(fsblock, rs->rs_free, cur);
		if (rc > 0)
			newn = &((*newn)->rb_right);
		else if (rc < 0)
			newn = &((*newn)->rb_left);
		else {
			spin_unlock(&rgd->rd_rsspin);
			WARN_ON(1);
			return;
		}
	}

	rb_link_node(&rs->rs_node, parent, newn);
	rb_insert_color(&rs->rs_node, &rgd->rd_rstree);

	/* Do our rgrp accounting for the reservation */
	rgd->rd_reserved += rs->rs_free; /* blocks reserved */
	spin_unlock(&rgd->rd_rsspin);
	trace_gfs2_rs(rs, TRACE_RS_INSERT);
}

/**
 * rgd_free - return the number of free blocks we can allocate.
 * @rgd: the resource group
 * @rs: the reservation whose blocks should not be counted as reserved
 *
 * This function returns the number of free blocks for an rgrp.
 * That's the clone-free blocks (blocks that are free, not including those
 * still being used for unlinked files that haven't been deleted).
 *
 * It also subtracts any blocks reserved by someone else, but does not
 * include free blocks that are still part of our current reservation,
 * because obviously we can (and will) allocate them.
 */
static inline u32 rgd_free(struct gfs2_rgrpd *rgd, struct gfs2_blkreserv *rs)
{
	u32 tot_reserved, tot_free;

	if (WARN_ON_ONCE(rgd->rd_reserved < rs->rs_free))
		return 0;
	tot_reserved = rgd->rd_reserved - rs->rs_free;

	if (rgd->rd_free_clone < tot_reserved)
		tot_reserved = 0;

	tot_free = rgd->rd_free_clone - tot_reserved;

	return tot_free;
}
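
/*
 * Illustrative numbers: with rd_free_clone = 200, rd_reserved = 100 and
 * rs->rs_free = 30, everyone else holds 70 reserved blocks, so
 * rgd_free() reports 200 - 70 = 130 blocks available to this inode.
 */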

/**
 * rg_mblk_search - find a group of multiple free blocks to form a reservation
 * @rgd: the resource group descriptor
 * @ip: pointer to the inode for which we're reserving blocks
 * @ap: the allocation parameters
 *
 */

static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
			   const struct gfs2_alloc_parms *ap)
{
	struct gfs2_rbm rbm = { .rgd = rgd, };
	u64 goal;
	struct gfs2_blkreserv *rs = &ip->i_res;
	u32 extlen;
	u32 free_blocks = rgd_free(rgd, rs);
	int ret;
	struct inode *inode = &ip->i_inode;

	if (S_ISDIR(inode->i_mode))
		extlen = 1;
	else {
		extlen = max_t(u32, atomic_read(&ip->i_sizehint), ap->target);
		extlen = clamp(extlen, (u32)RGRP_RSRV_MINBLKS, free_blocks);
	}
	if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen))
		return;

	/* Find bitmap block that contains bits for goal block */
	if (rgrp_contains_block(rgd, ip->i_goal))
		goal = ip->i_goal;
	else
		goal = rgd->rd_last_alloc + rgd->rd_data0;

	if (WARN_ON(gfs2_rbm_from_block(&rbm, goal)))
		return;

	ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true);
	if (ret == 0) {
		rs->rs_rbm = rbm;
		rs->rs_free = extlen;
		rs_insert(ip);
	} else {
		if (goal == rgd->rd_last_alloc + rgd->rd_data0)
			rgd->rd_last_alloc = 0;
	}
}

/**
 * gfs2_next_unreserved_block - Return next block that is not reserved
 * @rgd: The resource group
 * @block: The starting block
 * @length: The required length
 * @ip: Ignore any reservations for this inode
 *
 * If the block does not appear in any reservation, then return the
 * block number unchanged. If it does appear in the reservation, then
 * keep looking through the tree of reservations in order to find the
 * first block number which is not reserved.
 */

static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
				      u32 length,
				      const struct gfs2_inode *ip)
{
	struct gfs2_blkreserv *rs;
	struct rb_node *n;
	int rc;

	spin_lock(&rgd->rd_rsspin);
	n = rgd->rd_rstree.rb_node;
	while (n) {
		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
		rc = rs_cmp(block, length, rs);
		if (rc < 0)
			n = n->rb_left;
		else if (rc > 0)
			n = n->rb_right;
		else
			break;
	}

	if (n) {
		while ((rs_cmp(block, length, rs) == 0) && (&ip->i_res != rs)) {
			block = gfs2_rbm_to_block(&rs->rs_rbm) + rs->rs_free;
			n = n->rb_right;
			if (n == NULL)
				break;
			rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
		}
	}

	spin_unlock(&rgd->rd_rsspin);
	return block;
}
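
/*
 * Illustrative example: with a single reservation covering blocks
 * [100, 120) in the tree, gfs2_next_unreserved_block(rgd, 110, 4, ip)
 * finds that [110, 114) overlaps it (rs_cmp() == 0) and jumps past the
 * reservation, returning 120; a query for block 90 of the same length
 * overlaps nothing and returns 90 unchanged.
 */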

/**
 * gfs2_reservation_check_and_update - Check for reservations during block alloc
 * @rbm: The current position in the resource group
 * @ip: The inode for which we are searching for blocks
 * @minext: The minimum extent length
 * @maxext: A pointer to the maximum extent structure
 *
 * This checks the current position in the rgrp to see whether there is
 * a reservation covering this block. If not then this function is a
 * no-op. If there is, then the position is moved to the end of the
 * contiguous reservation(s) so that we are pointing at the first
 * non-reserved block.
 *
 * Returns: 0 if no reservation, 1 if @rbm has changed, otherwise an error
 */

static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
					     const struct gfs2_inode *ip,
					     u32 minext,
					     struct gfs2_extent *maxext)
{
	u64 block = gfs2_rbm_to_block(rbm);
	u32 extlen = 1;
	u64 nblock;
	int ret;

	/*
	 * If we have a minimum extent length, then skip over any extent
	 * which is less than the min extent length in size.
	 */
	if (minext) {
		extlen = gfs2_free_extlen(rbm, minext);
		if (extlen <= maxext->len)
			goto fail;
	}

	/*
	 * Check the extent which has been found against the reservations
	 * and skip if parts of it are already reserved
	 */
	nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip);
	if (nblock == block) {
		if (!minext || extlen >= minext)
			return 0;

		if (extlen > maxext->len) {
			maxext->len = extlen;
			maxext->rbm = *rbm;
		}
fail:
		nblock = block + extlen;
	}
	ret = gfs2_rbm_from_block(rbm, nblock);
	if (ret < 0)
		return ret;
	return 1;
}

/**
 * gfs2_rbm_find - Look for blocks of a particular state
 * @rbm: Value/result starting position and final position
 * @state: The state which we want to find
 * @minext: Pointer to the requested extent length (NULL for a single block)
 *          This is updated to be the actual reservation size.
 * @ip: If set, check for reservations
 * @nowrap: Stop looking at the end of the rgrp, rather than wrapping
 *          around until we've reached the starting point.
 *
 * Side effects:
 * - If looking for free blocks, we set GBF_FULL on each bitmap which
 *   has no free blocks in it.
 * - If looking for free blocks, we set rd_extfail_pt on each rgrp which
 *   has come up short on a free block search.
 *
 * Returns: 0 on success, -ENOSPC if there is no block of the requested state
 */

static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
			 const struct gfs2_inode *ip, bool nowrap)
{
	bool scan_from_start = rbm->bii == 0 && rbm->offset == 0;
	struct buffer_head *bh;
	int last_bii;
	u32 offset;
	u8 *buffer;
	bool wrapped = false;
	int ret;
	struct gfs2_bitmap *bi;
	struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, };

	/*
	 * Determine the last bitmap to search.  If we're not starting at the
	 * beginning of a bitmap, we need to search that bitmap twice to scan
	 * the entire resource group.
	 */
	last_bii = rbm->bii - (rbm->offset == 0);

	while (1) {
		bi = rbm_bi(rbm);
		if ((ip == NULL || !gfs2_rs_active(&ip->i_res)) &&
		    test_bit(GBF_FULL, &bi->bi_flags) &&
		    (state == GFS2_BLKST_FREE))
			goto next_bitmap;

		bh = bi->bi_bh;
		buffer = bh->b_data + bi->bi_offset;
		WARN_ON(!buffer_uptodate(bh));
		if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
			buffer = bi->bi_clone + bi->bi_offset;
		offset = gfs2_bitfit(buffer, bi->bi_bytes, rbm->offset, state);
		if (offset == BFITNOENT) {
			if (state == GFS2_BLKST_FREE && rbm->offset == 0)
				set_bit(GBF_FULL, &bi->bi_flags);
			goto next_bitmap;
		}
		rbm->offset = offset;
		if (ip == NULL)
			return 0;

		ret = gfs2_reservation_check_and_update(rbm, ip,
							minext ? *minext : 0,
							&maxext);
		if (ret == 0)
			return 0;
		if (ret > 0)
			goto next_iter;
		if (ret == -E2BIG) {
			rbm->bii = 0;
			rbm->offset = 0;
			goto res_covered_end_of_rgrp;
		}
		return ret;

next_bitmap:	/* Find next bitmap in the rgrp */
		rbm->offset = 0;
		rbm->bii++;
		if (rbm->bii == rbm->rgd->rd_length)
			rbm->bii = 0;
res_covered_end_of_rgrp:
		if (rbm->bii == 0) {
			if (wrapped)
				break;
			wrapped = true;
			if (nowrap)
				break;
		}
next_iter:
		/* Have we scanned the entire resource group? */
		if (wrapped && rbm->bii > last_bii)
			break;
	}

	if (minext == NULL || state != GFS2_BLKST_FREE)
		return -ENOSPC;

	/* If the extent was too small, and it's smaller than the smallest
	   to have failed before, remember for future reference that it's
	   useless to search this rgrp again for this amount or more. */
	if (wrapped && (scan_from_start || rbm->bii > last_bii) &&
	    *minext < rbm->rgd->rd_extfail_pt)
		rbm->rgd->rd_extfail_pt = *minext;

	/* If the maximum extent we found is big enough to fulfill the
	   minimum requirements, use it anyway. */
	if (maxext.len) {
		*rbm = maxext.rbm;
		*minext = maxext.len;
		return 0;
	}

	return -ENOSPC;
}

/**
 * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes
 * @rgd: The rgrp
 * @last_unlinked: block address of the last dinode we unlinked
 * @skip: block address we should explicitly not unlink
 *
 * Any unlinked inodes found are queued for deletion; the function
 * itself returns nothing.
 */

static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
{
	u64 block;
	struct gfs2_sbd *sdp = rgd->rd_sbd;
	struct gfs2_glock *gl;
	struct gfs2_inode *ip;
	int error;
	int found = 0;
	struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 };

	while (1) {
		error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL,
				      true);
		if (error == -ENOSPC)
			break;
		if (WARN_ON_ONCE(error))
			break;

		block = gfs2_rbm_to_block(&rbm);
		if (gfs2_rbm_from_block(&rbm, block + 1))
			break;
		if (*last_unlinked != NO_BLOCK && block <= *last_unlinked)
			continue;
		if (block == skip)
			continue;
		*last_unlinked = block;

		error = gfs2_glock_get(sdp, block, &gfs2_iopen_glops, CREATE, &gl);
		if (error)
			continue;

		/* If the inode is already in cache, we can ignore it here
		 * because the existing inode disposal code will deal with
		 * it when all refs have gone away. Accessing gl_object like
		 * this is not safe in general. Here it is ok because we do
		 * not dereference the pointer, and we only need an approx
		 * answer to whether it is NULL or not.
		 */
		ip = gl->gl_object;

		if (ip || !gfs2_queue_delete_work(gl, 0))
			gfs2_glock_put(gl);
		else
			found++;

		/* Limit reclaim to sensible number of tasks */
		if (found > NR_CPUS)
			return;
	}

	rgd->rd_flags &= ~GFS2_RDF_CHECK;
	return;
}
1850 
1851 /**
1852  * gfs2_rgrp_congested - Use stats to figure out whether an rgrp is congested
1853  * @rgd: The rgrp in question
1854  * @loops: An indication of how picky we can be (0=very, 1=less so)
1855  *
1856  * This function uses the recently added glock statistics in order to
1857  * figure out whether a particular resource group is suffering from
1858  * contention from multiple nodes. This is done purely on the basis
1859  * of timings, since they are the only data we have to work with. Our
1860  * aim is to reject a resource group which is highly contended, but
1861  * (very importantly) not to do this too often, to ensure that we do
1862  * not end up introducing fragmentation by changing resource groups
1863  * when not actually required.
1864  *
1865  * The calculation is fairly simple, we want to know whether the SRTTB
1866  * (i.e. smoothed round trip time for blocking operations) to acquire
1867  * the lock for this rgrp's glock is significantly greater than the
1868  * time taken for resource groups on average. We introduce a margin in
1869  * the form of the variable @var which is computed as the sum of the two
1870  * respective variances, and multiplied by a factor depending on @loops
1871  * and whether we have a lot of data to base the decision on. This is
1872  * then tested against the square difference of the means in order to
1873  * decide whether the result is statistically significant or not.
1874  *
1875  * Returns: A boolean verdict on the congestion status
1876  */
1877 
1878 static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
1879 {
1880 	const struct gfs2_glock *gl = rgd->rd_gl;
1881 	const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
1882 	struct gfs2_lkstats *st;
1883 	u64 r_dcount, l_dcount;
1884 	u64 l_srttb, a_srttb = 0;
1885 	s64 srttb_diff;
1886 	u64 sqr_diff;
1887 	u64 var;
1888 	int cpu, nonzero = 0;
1889 
1890 	preempt_disable();
1891 	for_each_present_cpu(cpu) {
1892 		st = &per_cpu_ptr(sdp->sd_lkstats, cpu)->lkstats[LM_TYPE_RGRP];
1893 		if (st->stats[GFS2_LKS_SRTTB]) {
1894 			a_srttb += st->stats[GFS2_LKS_SRTTB];
1895 			nonzero++;
1896 		}
1897 	}
1898 	st = &this_cpu_ptr(sdp->sd_lkstats)->lkstats[LM_TYPE_RGRP];
1899 	if (nonzero)
1900 		do_div(a_srttb, nonzero);
1901 	r_dcount = st->stats[GFS2_LKS_DCOUNT];
1902 	var = st->stats[GFS2_LKS_SRTTVARB] +
1903 	      gl->gl_stats.stats[GFS2_LKS_SRTTVARB];
1904 	preempt_enable();
1905 
1906 	l_srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB];
1907 	l_dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT];
1908 
1909 	if ((l_dcount < 1) || (r_dcount < 1) || (a_srttb == 0))
1910 		return false;
1911 
1912 	srttb_diff = a_srttb - l_srttb;
1913 	sqr_diff = srttb_diff * srttb_diff;
1914 
1915 	var *= 2;
1916 	if (l_dcount < 8 || r_dcount < 8)
1917 		var *= 2;
1918 	if (loops == 1)
1919 		var *= 2;
1920 
1921 	return ((srttb_diff < 0) && (sqr_diff > var));
1922 }
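
The decision at the end of gfs2_rgrp_congested() is easier to see with the per-cpu averaging stripped away. Below is a minimal userspace sketch of just the significance test; the standalone form and the name rgrp_congested_sketch are illustrative, not part of rgrp.c:

#include <stdbool.h>
#include <stdint.h>

static bool rgrp_congested_sketch(uint64_t a_srttb, uint64_t l_srttb,
				  uint64_t var, uint64_t l_dcount,
				  uint64_t r_dcount, int loops)
{
	int64_t srttb_diff;
	uint64_t sqr_diff;

	if (l_dcount < 1 || r_dcount < 1 || a_srttb == 0)
		return false;		/* not enough data for a verdict */

	srttb_diff = (int64_t)a_srttb - (int64_t)l_srttb;
	sqr_diff = (uint64_t)(srttb_diff * srttb_diff);

	var *= 2;			/* base margin: sum of variances x2 */
	if (l_dcount < 8 || r_dcount < 8)
		var *= 2;		/* few samples: demand more evidence */
	if (loops == 1)
		var *= 2;		/* later pass: widen margin, be less picky */

	/* Congested only if this rgrp is slower than average (diff < 0)
	   and the squared difference exceeds the margin. */
	return srttb_diff < 0 && sqr_diff > var;
}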
1923 
1924 /**
1925  * gfs2_rgrp_used_recently - check if an rgrp glock was used recently
1926  * @rs: The block reservation with the rgrp to test
1927  * @msecs: The time limit in milliseconds
1928  *
1929  * Returns: True if the rgrp glock has been used within the time limit
1930  */
1931 static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs,
1932 				    u64 msecs)
1933 {
1934 	u64 tdiff;
1935 
1936 	tdiff = ktime_to_ns(ktime_sub(ktime_get_real(),
1937 				      rs->rs_rbm.rgd->rd_gl->gl_dstamp));
1938 
1939 	return tdiff > (msecs * 1000 * 1000);
1940 }
1941 
1942 static u32 gfs2_orlov_skip(const struct gfs2_inode *ip)
1943 {
1944 	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1945 	u32 skip;
1946 
1947 	get_random_bytes(&skip, sizeof(skip));
1948 	return skip % sdp->sd_rgrps;
1949 }
1950 
1951 static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin)
1952 {
1953 	struct gfs2_rgrpd *rgd = *pos;
1954 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1955 
1956 	rgd = gfs2_rgrpd_get_next(rgd);
1957 	if (rgd == NULL)
1958 		rgd = gfs2_rgrpd_get_first(sdp);
1959 	*pos = rgd;
1960 	if (rgd != begin) /* If we didn't wrap */
1961 		return true;
1962 	return false;
1963 }
1964 
1965 /**
1966  * fast_to_acquire - determine if a resource group will be fast to acquire
1967  * @rgd: The rgrp to test
1968  * If this is one of our preferred rgrps, it should be quicker to acquire,
1969  * because we tried to set ourselves up as dlm lock master.
1970  */
1971 static inline int fast_to_acquire(struct gfs2_rgrpd *rgd)
1972 {
1973 	struct gfs2_glock *gl = rgd->rd_gl;
1974 
1975 	if (gl->gl_state != LM_ST_UNLOCKED && list_empty(&gl->gl_holders) &&
1976 	    !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
1977 	    !test_bit(GLF_DEMOTE, &gl->gl_flags))
1978 		return 1;
1979 	if (rgd->rd_flags & GFS2_RDF_PREFERRED)
1980 		return 1;
1981 	return 0;
1982 }
1983 
1984 /**
1985  * gfs2_inplace_reserve - Reserve space in the filesystem
1986  * @ip: the inode to reserve space for
1987  * @ap: the allocation parameters
1988  *
1989  * We try our best to find an rgrp that has at least ap->target blocks
1990  * available. After a couple of passes (loops == 2), the prospects of finding
1991  * such an rgrp diminish. At this stage, we return the first rgrp that has
1992  * at least ap->min_target blocks available. Either way, we set ap->allowed to
1993  * the number of blocks available in the chosen rgrp.
1994  *
1995  * Returns: 0 on success,
1996  *          -ENOSPC if a suitable rgrp can't be found
1997  *          errno otherwise
1998  */
1999 
2000 int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
2001 {
2002 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2003 	struct gfs2_rgrpd *begin = NULL;
2004 	struct gfs2_blkreserv *rs = &ip->i_res;
2005 	int error = 0, rg_locked, flags = 0;
2006 	u64 last_unlinked = NO_BLOCK;
2007 	int loops = 0;
2008 	u32 free_blocks, skip = 0;
2009 
2010 	if (sdp->sd_args.ar_rgrplvb)
2011 		flags |= GL_SKIP;
2012 	if (gfs2_assert_warn(sdp, ap->target))
2013 		return -EINVAL;
2014 	if (gfs2_rs_active(rs)) {
2015 		begin = rs->rs_rbm.rgd;
2016 	} else if (rs->rs_rbm.rgd &&
2017 		   rgrp_contains_block(rs->rs_rbm.rgd, ip->i_goal)) {
2018 		begin = rs->rs_rbm.rgd;
2019 	} else {
2020 		check_and_update_goal(ip);
2021 		rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
2022 	}
2023 	if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV))
2024 		skip = gfs2_orlov_skip(ip);
2025 	if (rs->rs_rbm.rgd == NULL)
2026 		return -EBADSLT;
2027 
2028 	while (loops < 3) {
2029 		rg_locked = 1;
2030 
2031 		if (!gfs2_glock_is_locked_by_me(rs->rs_rbm.rgd->rd_gl)) {
2032 			rg_locked = 0;
2033 			if (skip && skip--)
2034 				goto next_rgrp;
2035 			if (!gfs2_rs_active(rs)) {
2036 				if (loops == 0 &&
2037 				    !fast_to_acquire(rs->rs_rbm.rgd))
2038 					goto next_rgrp;
2039 				if ((loops < 2) &&
2040 				    gfs2_rgrp_used_recently(rs, 1000) &&
2041 				    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
2042 					goto next_rgrp;
2043 			}
2044 			error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl,
2045 						   LM_ST_EXCLUSIVE, flags,
2046 						   &ip->i_rgd_gh);
2047 			if (unlikely(error))
2048 				return error;
2049 			if (!gfs2_rs_active(rs) && (loops < 2) &&
2050 			    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
2051 				goto skip_rgrp;
2052 			if (sdp->sd_args.ar_rgrplvb) {
2053 				error = update_rgrp_lvb(rs->rs_rbm.rgd);
2054 				if (unlikely(error)) {
2055 					gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2056 					return error;
2057 				}
2058 			}
2059 		}
2060 
2061 		/* Skip unusable resource groups */
2062 		if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC |
2063 						 GFS2_RDF_ERROR)) ||
2064 		    (loops == 0 && ap->target > rs->rs_rbm.rgd->rd_extfail_pt))
2065 			goto skip_rgrp;
2066 
2067 		if (sdp->sd_args.ar_rgrplvb)
2068 			gfs2_rgrp_bh_get(rs->rs_rbm.rgd);
2069 
2070 		/* Get a reservation if we don't already have one */
2071 		if (!gfs2_rs_active(rs))
2072 			rg_mblk_search(rs->rs_rbm.rgd, ip, ap);
2073 
2074 		/* Skip rgrps when we can't get a reservation on first pass */
2075 		if (!gfs2_rs_active(rs) && (loops < 1))
2076 			goto check_rgrp;
2077 
2078 		/* If rgrp has enough free space, use it */
2079 		free_blocks = rgd_free(rs->rs_rbm.rgd, rs);
2080 		if (free_blocks >= ap->target ||
2081 		    (loops == 2 && ap->min_target &&
2082 		     free_blocks >= ap->min_target)) {
2083 			ap->allowed = free_blocks;
2084 			return 0;
2085 		}
2086 check_rgrp:
2087 		/* Check for unlinked inodes which can be reclaimed */
2088 		if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK)
2089 			try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked,
2090 					ip->i_no_addr);
2091 skip_rgrp:
2092 		/* Drop reservation, if we couldn't use reserved rgrp */
2093 		if (gfs2_rs_active(rs))
2094 			gfs2_rs_deltree(rs);
2095 
2096 		/* Unlock rgrp if required */
2097 		if (!rg_locked)
2098 			gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2099 next_rgrp:
2100 		/* Find the next rgrp, and continue looking */
2101 		if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin))
2102 			continue;
2103 		if (skip)
2104 			continue;
2105 
2106 		/* If we've scanned all the rgrps, but found no free blocks
2107 		 * then this checks for some less likely conditions before
2108 		 * trying again.
2109 		 */
2110 		loops++;
2111 		/* Check that fs hasn't grown if writing to rindex */
2112 		if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) {
2113 			error = gfs2_ri_update(ip);
2114 			if (error)
2115 				return error;
2116 		}
2117 		/* Flushing the log may release space */
2118 		if (loops == 2)
2119 			gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
2120 				       GFS2_LFC_INPLACE_RESERVE);
2121 	}
2122 
2123 	return -ENOSPC;
2124 }
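
For orientation, gfs2_inplace_reserve() is normally bracketed by an allocation and gfs2_inplace_release(). The sketch below shows the usual shape of that sequence; it is a simplified illustration (quota checks and inode locking elided, helper name invented), not a verbatim caller from the kernel tree:

/* Sketch only: reserve an rgrp, allocate up to *nblocks data blocks
 * from it, then drop the rgrp glock. */
static int alloc_one_extent_sketch(struct gfs2_inode *ip, u64 *bn,
				   unsigned int *nblocks)
{
	struct gfs2_alloc_parms ap = { .target = *nblocks };
	int error;

	error = gfs2_inplace_reserve(ip, &ap);	/* pick and lock an rgrp */
	if (error)
		return error;

	error = gfs2_alloc_blocks(ip, bn, nblocks, false, NULL);
	gfs2_inplace_release(ip);		/* drop the rgrp glock */
	return error;
}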
2125 
2126 /**
2127  * gfs2_inplace_release - release an inplace reservation
2128  * @ip: the inode the reservation was taken out on
2129  *
2130  * Release a reservation made by gfs2_inplace_reserve().
2131  */
2132 
2133 void gfs2_inplace_release(struct gfs2_inode *ip)
2134 {
2135 	if (gfs2_holder_initialized(&ip->i_rgd_gh))
2136 		gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2137 }
2138 
2139 /**
2140  * gfs2_alloc_extent - allocate an extent from a given bitmap
2141  * @rbm: the resource group information
2142  * @dinode: TRUE if the first block we allocate is for a dinode
2143  * @n: The extent length (value/result)
2144  *
2145  * Add the bitmap buffer to the transaction.
2146  * Set the found bits to the new state to change the blocks' allocation state.
2147  */
2148 static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode,
2149 			     unsigned int *n)
2150 {
2151 	struct gfs2_rbm pos = { .rgd = rbm->rgd, };
2152 	const unsigned int elen = *n;
2153 	u64 block;
2154 	int ret;
2155 
2156 	*n = 1;
2157 	block = gfs2_rbm_to_block(rbm);
2158 	gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm_bi(rbm)->bi_bh);
2159 	gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
2160 	block++;
2161 	while (*n < elen) {
2162 		ret = gfs2_rbm_from_block(&pos, block);
2163 		if (ret || gfs2_testbit(&pos, true) != GFS2_BLKST_FREE)
2164 			break;
2165 		gfs2_trans_add_meta(pos.rgd->rd_gl, rbm_bi(&pos)->bi_bh);
2166 		gfs2_setbit(&pos, true, GFS2_BLKST_USED);
2167 		(*n)++;
2168 		block++;
2169 	}
2170 }
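
The gfs2_testbit()/gfs2_setbit() calls above operate on two bits per block, GFS2_NBBY blocks per byte. A standalone sketch of the state extraction, with the GFS2 constants written out as local macros (names invented for illustration):

#include <stdint.h>

#define NBBY_SKETCH	4	/* blocks per byte, as GFS2_NBBY */
#define BIT_SIZE_SKETCH	2	/* bits per block, as GFS2_BIT_SIZE */
#define BIT_MASK_SKETCH	0x3	/* two-bit mask, as GFS2_BIT_MASK */

/* Return the 2-bit allocation state of block @offset within @bitmap. */
static uint8_t blkst_sketch(const uint8_t *bitmap, uint32_t offset)
{
	const uint8_t byte = bitmap[offset / NBBY_SKETCH];
	const unsigned int bit = (offset % NBBY_SKETCH) * BIT_SIZE_SKETCH;

	return (byte >> bit) & BIT_MASK_SKETCH;
}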
2171 
2172 /**
2173  * rgblk_free - Change alloc state of given block(s)
2174  * @sdp: the filesystem
2175  * @rgd: the resource group the blocks are in
2176  * @bstart: the start of a run of blocks to free
2177  * @blen: the length of the block run (all must lie within ONE RG!)
2178  * @new_state: GFS2_BLKST_XXX the after-allocation block state
2179  */
2180 
2181 static void rgblk_free(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd,
2182 		       u64 bstart, u32 blen, unsigned char new_state)
2183 {
2184 	struct gfs2_rbm rbm;
2185 	struct gfs2_bitmap *bi, *bi_prev = NULL;
2186 
2187 	rbm.rgd = rgd;
2188 	if (WARN_ON_ONCE(gfs2_rbm_from_block(&rbm, bstart)))
2189 		return;
2190 	while (blen--) {
2191 		bi = rbm_bi(&rbm);
2192 		if (bi != bi_prev) {
2193 			if (!bi->bi_clone) {
2194 				bi->bi_clone = kmalloc(bi->bi_bh->b_size,
2195 						      GFP_NOFS | __GFP_NOFAIL);
2196 				memcpy(bi->bi_clone + bi->bi_offset,
2197 				       bi->bi_bh->b_data + bi->bi_offset,
2198 				       bi->bi_bytes);
2199 			}
2200 			gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
2201 			bi_prev = bi;
2202 		}
2203 		gfs2_setbit(&rbm, false, new_state);
2204 		gfs2_rbm_incr(&rbm);
2205 	}
2206 }
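
The kmalloc/memcpy above is a lazy copy-on-first-touch: the first time a bitmap buffer is dirtied for a free, a clone of its current contents is made, and gfs2_setbit() is then called with do_clone false so only the live bitmap changes. Allocators consult the clone, which still shows the freed blocks as in use, so those blocks cannot be handed out again until the clones are refreshed after a log flush. A minimal userspace sketch of the lazy-clone step, with invented names:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct bitmap_sketch {
	uint8_t *live;		/* buffer being modified by the free */
	uint8_t *clone;		/* lazy snapshot of pre-free state, or NULL */
	size_t	 bytes;
};

/* Snapshot the buffer once, before its first modification. */
static int ensure_clone_sketch(struct bitmap_sketch *bi)
{
	if (bi->clone)
		return 0;	/* already snapshotted */
	/* The kernel uses __GFP_NOFAIL here; a sketch must handle failure. */
	bi->clone = malloc(bi->bytes);
	if (!bi->clone)
		return -1;
	memcpy(bi->clone, bi->live, bi->bytes);
	return 0;
}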
2207 
2208 /**
2209  * gfs2_rgrp_dump - print out an rgrp
2210  * @seq: The iterator
2211  * @rgd: The rgrp in question
2212  * @fs_id_buf: pointer to file system id (if requested)
2213  *
2214  */
2215 
2216 void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd,
2217 		    const char *fs_id_buf)
2218 {
2219 	struct gfs2_blkreserv *trs;
2220 	const struct rb_node *n;
2221 
2222 	gfs2_print_dbg(seq, "%s R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n",
2223 		       fs_id_buf,
2224 		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
2225 		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
2226 		       rgd->rd_reserved, rgd->rd_extfail_pt);
2227 	if (rgd->rd_sbd->sd_args.ar_rgrplvb) {
2228 		struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
2229 
2230 		gfs2_print_dbg(seq, "%s  L: f:%02x b:%u i:%u\n", fs_id_buf,
2231 			       be32_to_cpu(rgl->rl_flags),
2232 			       be32_to_cpu(rgl->rl_free),
2233 			       be32_to_cpu(rgl->rl_dinodes));
2234 	}
2235 	spin_lock(&rgd->rd_rsspin);
2236 	for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
2237 		trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
2238 		dump_rs(seq, trs, fs_id_buf);
2239 	}
2240 	spin_unlock(&rgd->rd_rsspin);
2241 }
2242 
2243 static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
2244 {
2245 	struct gfs2_sbd *sdp = rgd->rd_sbd;
2246 	char fs_id_buf[sizeof(sdp->sd_fsname) + 7];
2247 
2248 	fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
2249 		(unsigned long long)rgd->rd_addr);
2250 	fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
2251 	sprintf(fs_id_buf, "fsid=%s: ", sdp->sd_fsname);
2252 	gfs2_rgrp_dump(NULL, rgd, fs_id_buf);
2253 	rgd->rd_flags |= GFS2_RDF_ERROR;
2254 }
2255 
2256 /**
2257  * gfs2_adjust_reservation - Adjust (or remove) a reservation after allocation
2258  * @ip: The inode we have just allocated blocks for
2259  * @rbm: The start of the allocated blocks
2260  * @len: The extent length
2261  *
2262  * Adjusts a reservation after an allocation has taken place. If the
2263  * reservation does not match the allocation, or if it is now empty
2264  * then it is removed.
2265  */
2266 
2267 static void gfs2_adjust_reservation(struct gfs2_inode *ip,
2268 				    const struct gfs2_rbm *rbm, unsigned len)
2269 {
2270 	struct gfs2_blkreserv *rs = &ip->i_res;
2271 	struct gfs2_rgrpd *rgd = rbm->rgd;
2272 	unsigned rlen;
2273 	u64 block;
2274 	int ret;
2275 
2276 	spin_lock(&rgd->rd_rsspin);
2277 	if (gfs2_rs_active(rs)) {
2278 		if (gfs2_rbm_eq(&rs->rs_rbm, rbm)) {
2279 			block = gfs2_rbm_to_block(rbm);
2280 			ret = gfs2_rbm_from_block(&rs->rs_rbm, block + len);
2281 			rlen = min(rs->rs_free, len);
2282 			rs->rs_free -= rlen;
2283 			rgd->rd_reserved -= rlen;
2284 			trace_gfs2_rs(rs, TRACE_RS_CLAIM);
2285 			if (rs->rs_free && !ret)
2286 				goto out;
2287 			/* We used up our block reservation, so we should
2288 			   reserve more blocks next time. */
2289 			atomic_add(RGRP_RSRV_ADDBLKS, &ip->i_sizehint);
2290 		}
2291 		__rs_deltree(rs);
2292 	}
2293 out:
2294 	spin_unlock(&rgd->rd_rsspin);
2295 }
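
In effect, gfs2_adjust_reservation() trims the consumed blocks off the front of the reservation, and removes the reservation when the allocation did not line up with it or nothing is left. A simplified userspace sketch of that arithmetic, with invented names:

#include <stdint.h>

/* Returns 0 if the (shrunken) reservation should be kept, -1 if the
 * caller should remove it (mismatch or emptied). */
static int shrink_rsv_sketch(uint64_t *rs_start, uint32_t *rs_free,
			     uint64_t alloc_start, uint32_t len)
{
	uint32_t rlen;

	if (alloc_start != *rs_start)
		return -1;		/* allocation did not use the rsv */
	rlen = len < *rs_free ? len : *rs_free;
	*rs_start += len;		/* advance past the allocated run */
	*rs_free -= rlen;
	return *rs_free ? 0 : -1;	/* emptied: remove it */
}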
2296 
2297 /**
2298  * gfs2_set_alloc_start - Set starting point for block allocation
2299  * @rbm: The rbm which will be set to the required location
2300  * @ip: The gfs2 inode
2301  * @dinode: Flag to say if allocation includes a new inode
2302  *
2303  * This sets the starting point from the reservation if one is active
2304  * otherwise it falls back to guessing a start point based on the
2305  * inode's goal block or the last allocation point in the rgrp.
2306  */
2307 
2308 static void gfs2_set_alloc_start(struct gfs2_rbm *rbm,
2309 				 const struct gfs2_inode *ip, bool dinode)
2310 {
2311 	u64 goal;
2312 
2313 	if (gfs2_rs_active(&ip->i_res)) {
2314 		*rbm = ip->i_res.rs_rbm;
2315 		return;
2316 	}
2317 
2318 	if (!dinode && rgrp_contains_block(rbm->rgd, ip->i_goal))
2319 		goal = ip->i_goal;
2320 	else
2321 		goal = rbm->rgd->rd_last_alloc + rbm->rgd->rd_data0;
2322 
2323 	if (WARN_ON_ONCE(gfs2_rbm_from_block(rbm, goal))) {
2324 		rbm->bii = 0;
2325 		rbm->offset = 0;
2326 	}
2327 }
2328 
2329 /**
2330  * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode
2331  * @ip: the inode to allocate the block for
2332  * @bn: Used to return the starting block number
2333  * @nblocks: requested number of blocks/extent length (value/result)
2334  * @dinode: 1 if we're allocating a dinode block, else 0
2335  * @generation: the generation number of the inode
2336  *
2337  * Returns: 0 or error
2338  */
2339 
2340 int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
2341 		      bool dinode, u64 *generation)
2342 {
2343 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2344 	struct buffer_head *dibh;
2345 	struct gfs2_rbm rbm = { .rgd = ip->i_res.rs_rbm.rgd, };
2346 	unsigned int ndata;
2347 	u64 block; /* block, within the file system scope */
2348 	int error;
2349 
2350 	gfs2_set_alloc_start(&rbm, ip, dinode);
2351 	error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false);
2352 
2353 	if (error == -ENOSPC) {
2354 		gfs2_set_alloc_start(&rbm, ip, dinode);
2355 		error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false);
2356 	}
2357 
2358 	/* Since all blocks are reserved in advance, this shouldn't happen */
2359 	if (error) {
2360 		fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d fail_pt=%d\n",
2361 			(unsigned long long)ip->i_no_addr, error, *nblocks,
2362 			test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags),
2363 			rbm.rgd->rd_extfail_pt);
2364 		goto rgrp_error;
2365 	}
2366 
2367 	gfs2_alloc_extent(&rbm, dinode, nblocks);
2368 	block = gfs2_rbm_to_block(&rbm);
2369 	rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0;
2370 	if (gfs2_rs_active(&ip->i_res))
2371 		gfs2_adjust_reservation(ip, &rbm, *nblocks);
2372 	ndata = *nblocks;
2373 	if (dinode)
2374 		ndata--;
2375 
2376 	if (!dinode) {
2377 		ip->i_goal = block + ndata - 1;
2378 		error = gfs2_meta_inode_buffer(ip, &dibh);
2379 		if (error == 0) {
2380 			struct gfs2_dinode *di =
2381 				(struct gfs2_dinode *)dibh->b_data;
2382 			gfs2_trans_add_meta(ip->i_gl, dibh);
2383 			di->di_goal_meta = di->di_goal_data =
2384 				cpu_to_be64(ip->i_goal);
2385 			brelse(dibh);
2386 		}
2387 	}
2388 	if (rbm.rgd->rd_free < *nblocks) {
2389 		fs_warn(sdp, "nblocks=%u\n", *nblocks);
2390 		goto rgrp_error;
2391 	}
2392 
2393 	rbm.rgd->rd_free -= *nblocks;
2394 	if (dinode) {
2395 		rbm.rgd->rd_dinodes++;
2396 		*generation = rbm.rgd->rd_igeneration++;
2397 		if (*generation == 0)
2398 			*generation = rbm.rgd->rd_igeneration++;
2399 	}
2400 
2401 	gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh);
2402 	gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data);
2403 
2404 	gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0);
2405 	if (dinode)
2406 		gfs2_trans_remove_revoke(sdp, block, *nblocks);
2407 
2408 	gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid);
2409 
2410 	rbm.rgd->rd_free_clone -= *nblocks;
2411 	trace_gfs2_block_alloc(ip, rbm.rgd, block, *nblocks,
2412 			       dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
2413 	*bn = block;
2414 	return 0;
2415 
2416 rgrp_error:
2417 	gfs2_rgrp_error(rbm.rgd);
2418 	return -EIO;
2419 }
2420 
2421 /**
2422  * __gfs2_free_blocks - free a contiguous run of block(s)
2423  * @ip: the inode these blocks are being freed from
2424  * @rgd: the resource group the blocks are in
2425  * @bstart: first block of a run of contiguous blocks
2426  * @blen: the length of the block run
2427  * @meta: 1 if the blocks represent metadata
2428  *
2429  */
2430 
2431 void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
2432 			u64 bstart, u32 blen, int meta)
2433 {
2434 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2435 
2436 	rgblk_free(sdp, rgd, bstart, blen, GFS2_BLKST_FREE);
2437 	trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE);
2438 	rgd->rd_free += blen;
2439 	rgd->rd_flags &= ~GFS2_RGF_TRIMMED;
2440 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2441 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2442 
2443 	/* Directories keep their data in the metadata address space */
2444 	if (meta || ip->i_depth || gfs2_is_jdata(ip))
2445 		gfs2_journal_wipe(ip, bstart, blen);
2446 }
2447 
2448 /**
2449  * gfs2_free_meta - free a contiguous run of metadata block(s)
2450  * @ip: the inode these blocks are being freed from
2451  * @rgd: the resource group the blocks are in
2452  * @bstart: first block of a run of contiguous blocks
2453  * @blen: the length of the block run
2454  *
2455  */
2456 
2457 void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
2458 		    u64 bstart, u32 blen)
2459 {
2460 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2461 
2462 	__gfs2_free_blocks(ip, rgd, bstart, blen, 1);
2463 	gfs2_statfs_change(sdp, 0, +blen, 0);
2464 	gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
2465 }
2466 
2467 void gfs2_unlink_di(struct inode *inode)
2468 {
2469 	struct gfs2_inode *ip = GFS2_I(inode);
2470 	struct gfs2_sbd *sdp = GFS2_SB(inode);
2471 	struct gfs2_rgrpd *rgd;
2472 	u64 blkno = ip->i_no_addr;
2473 
2474 	rgd = gfs2_blk2rgrpd(sdp, blkno, true);
2475 	if (!rgd)
2476 		return;
2477 	rgblk_free(sdp, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
2478 	trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
2479 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2480 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2481 	be32_add_cpu(&rgd->rd_rgl->rl_unlinked, 1);
2482 }
2483 
2484 void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
2485 {
2486 	struct gfs2_sbd *sdp = rgd->rd_sbd;
2487 
2488 	rgblk_free(sdp, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
2489 	if (!rgd->rd_dinodes)
2490 		gfs2_consist_rgrpd(rgd);
2491 	rgd->rd_dinodes--;
2492 	rgd->rd_free++;
2493 
2494 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2495 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2496 	be32_add_cpu(&rgd->rd_rgl->rl_unlinked, -1);
2497 
2498 	gfs2_statfs_change(sdp, 0, +1, -1);
2499 	trace_gfs2_block_alloc(ip, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
2500 	gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid);
2501 	gfs2_journal_wipe(ip, ip->i_no_addr, 1);
2502 }
2503 
2504 /**
2505  * gfs2_check_blk_type - Check the type of a block
2506  * @sdp: The superblock
2507  * @no_addr: The block number to check
2508  * @type: The block type we are looking for
2509  *
2510  * Returns: 0 if the block type matches the expected type
2511  *          -ESTALE if it doesn't match
2512  *          or -ve errno if something went wrong while checking
2513  */
2514 
2515 int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
2516 {
2517 	struct gfs2_rgrpd *rgd;
2518 	struct gfs2_holder rgd_gh;
2519 	struct gfs2_rbm rbm;
2520 	int error = -EINVAL;
2521 
2522 	rgd = gfs2_blk2rgrpd(sdp, no_addr, 1);
2523 	if (!rgd)
2524 		goto fail;
2525 
2526 	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
2527 	if (error)
2528 		goto fail;
2529 
2530 	rbm.rgd = rgd;
2531 	error = gfs2_rbm_from_block(&rbm, no_addr);
2532 	if (WARN_ON_ONCE(error))
2533 		goto fail;
2534 
2535 	if (gfs2_testbit(&rbm, false) != type)
2536 		error = -ESTALE;
2537 
2538 	gfs2_glock_dq_uninit(&rgd_gh);
2539 fail:
2540 	return error;
2541 }
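
A caller might use gfs2_check_blk_type() like the following hypothetical helper, e.g. to confirm that a suspected dinode address still holds a dinode; the helper name and calling site are illustrative only, not from the kernel tree:

/* Sketch only: true if @no_addr is currently marked as a dinode. */
static bool block_is_dinode_sketch(struct gfs2_sbd *sdp, u64 no_addr)
{
	return gfs2_check_blk_type(sdp, no_addr, GFS2_BLKST_DINODE) == 0;
}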
2542 
2543 /**
2544  * gfs2_rlist_add - add a RG to a list of RGs
2545  * @ip: the inode
2546  * @rlist: the list of resource groups
2547  * @block: the block
2548  *
2549  * Figure out what RG a block belongs to and add that RG to the list
2550  *
2551  * FIXME: Don't use NOFAIL
2552  *
2553  */
2554 
2555 void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
2556 		    u64 block)
2557 {
2558 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2559 	struct gfs2_rgrpd *rgd;
2560 	struct gfs2_rgrpd **tmp;
2561 	unsigned int new_space;
2562 	unsigned int x;
2563 
2564 	if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
2565 		return;
2566 
2567 	/*
2568 	 * The resource group last accessed is kept in the last position.
2569 	 */
2570 
2571 	if (rlist->rl_rgrps) {
2572 		rgd = rlist->rl_rgd[rlist->rl_rgrps - 1];
2573 		if (rgrp_contains_block(rgd, block))
2574 			return;
2575 		rgd = gfs2_blk2rgrpd(sdp, block, 1);
2576 	} else {
2577 		rgd = ip->i_res.rs_rbm.rgd;
2578 		if (!rgd || !rgrp_contains_block(rgd, block))
2579 			rgd = gfs2_blk2rgrpd(sdp, block, 1);
2580 	}
2581 
2582 	if (!rgd) {
2583 		fs_err(sdp, "rlist_add: no rgrp for block %llu\n",
2584 		       (unsigned long long)block);
2585 		return;
2586 	}
2587 
2588 	for (x = 0; x < rlist->rl_rgrps; x++) {
2589 		if (rlist->rl_rgd[x] == rgd) {
2590 			swap(rlist->rl_rgd[x],
2591 			     rlist->rl_rgd[rlist->rl_rgrps - 1]);
2592 			return;
2593 		}
2594 	}
2595 
2596 	if (rlist->rl_rgrps == rlist->rl_space) {
2597 		new_space = rlist->rl_space + 10;
2598 
2599 		tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
2600 			      GFP_NOFS | __GFP_NOFAIL);
2601 
2602 		if (rlist->rl_rgd) {
2603 			memcpy(tmp, rlist->rl_rgd,
2604 			       rlist->rl_space * sizeof(struct gfs2_rgrpd *));
2605 			kfree(rlist->rl_rgd);
2606 		}
2607 
2608 		rlist->rl_space = new_space;
2609 		rlist->rl_rgd = tmp;
2610 	}
2611 
2612 	rlist->rl_rgd[rlist->rl_rgrps++] = rgd;
2613 }
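
The swap in the lookup loop above keeps the most recently used rgrp at the tail of the array, so the common case of many consecutive blocks falling in one rgrp is answered by a single comparison. A generic userspace sketch of this move-to-tail lookup, with invented names:

#include <stdbool.h>

/* Find @key in @arr of @n entries, moving a hit to the tail so that a
 * repeat lookup for the same entry succeeds without scanning. */
static bool mru_lookup_sketch(void **arr, unsigned int n, const void *key)
{
	unsigned int i;

	if (n && arr[n - 1] == key)
		return true;			/* fast path: same as last time */
	for (i = 0; i + 1 < n; i++) {
		if (arr[i] == key) {
			void *tmp = arr[i];	/* swap hit to the tail */
			arr[i] = arr[n - 1];
			arr[n - 1] = tmp;
			return true;
		}
	}
	return false;
}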
2614 
2615 /**
2616  * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate
2617  *      and initialize an array of glock holders for them
2618  * @rlist: the list of resource groups
2619  *
2620  * FIXME: Don't use NOFAIL
2621  *
2622  */
2623 
2624 void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist)
2625 {
2626 	unsigned int x;
2627 
2628 	rlist->rl_ghs = kmalloc_array(rlist->rl_rgrps,
2629 				      sizeof(struct gfs2_holder),
2630 				      GFP_NOFS | __GFP_NOFAIL);
2631 	for (x = 0; x < rlist->rl_rgrps; x++)
2632 		gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
2633 				LM_ST_EXCLUSIVE, 0,
2634 				&rlist->rl_ghs[x]);
2635 }
2636 
2637 /**
2638  * gfs2_rlist_free - free a resource group list
2639  * @rlist: the list of resource groups
2640  *
2641  */
2642 
2643 void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
2644 {
2645 	unsigned int x;
2646 
2647 	kfree(rlist->rl_rgd);
2648 
2649 	if (rlist->rl_ghs) {
2650 		for (x = 0; x < rlist->rl_rgrps; x++)
2651 			gfs2_holder_uninit(&rlist->rl_ghs[x]);
2652 		kfree(rlist->rl_ghs);
2653 		rlist->rl_ghs = NULL;
2654 	}
2655 }
2656 
2657