xref: /openbmc/linux/fs/gfs2/rgrp.c (revision 0c874100)
1 /*
2  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
3  * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
4  *
5  * This copyrighted material is made available to anyone wishing to use,
6  * modify, copy, or redistribute it subject to the terms and conditions
7  * of the GNU General Public License version 2.
8  */
9 
10 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11 
12 #include <linux/slab.h>
13 #include <linux/spinlock.h>
14 #include <linux/completion.h>
15 #include <linux/buffer_head.h>
16 #include <linux/fs.h>
17 #include <linux/gfs2_ondisk.h>
18 #include <linux/prefetch.h>
19 #include <linux/blkdev.h>
20 #include <linux/rbtree.h>
21 #include <linux/random.h>
22 
23 #include "gfs2.h"
24 #include "incore.h"
25 #include "glock.h"
26 #include "glops.h"
27 #include "lops.h"
28 #include "meta_io.h"
29 #include "quota.h"
30 #include "rgrp.h"
31 #include "super.h"
32 #include "trans.h"
33 #include "util.h"
34 #include "log.h"
35 #include "inode.h"
36 #include "trace_gfs2.h"
37 #include "dir.h"
38 
39 #define BFITNOENT ((u32)~0)
40 #define NO_BLOCK ((u64)~0)
41 
42 #if BITS_PER_LONG == 32
43 #define LBITMASK   (0x55555555UL)
44 #define LBITSKIP55 (0x55555555UL)
45 #define LBITSKIP00 (0x00000000UL)
46 #else
47 #define LBITMASK   (0x5555555555555555UL)
48 #define LBITSKIP55 (0x5555555555555555UL)
49 #define LBITSKIP00 (0x0000000000000000UL)
50 #endif
51 
52 /*
53  * These routines are used by the resource group routines (rgrp.c)
54  * to keep track of block allocation.  Each block is represented by two
55  * bits.  So, each byte represents GFS2_NBBY (i.e. 4) blocks.
56  *
57  * 0 = Free
58  * 1 = Used (not metadata)
59  * 2 = Unlinked (still in use) inode
60  * 3 = Used (metadata)
61  */
62 
63 struct gfs2_extent {
64 	struct gfs2_rbm rbm;
65 	u32 len;
66 };
67 
68 static const char valid_change[16] = {
69 	        /* current */
70 	/* n */ 0, 1, 1, 1,
71 	/* e */ 1, 0, 0, 0,
72 	/* w */ 0, 0, 0, 1,
73 	        1, 0, 0, 0
74 };
75 
76 static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
77 			 const struct gfs2_inode *ip, bool nowrap);
78 
79 
80 /**
81  * gfs2_setbit - Set a bit in the bitmaps
82  * @rbm: The position of the bit to set
83  * @do_clone: Also set the clone bitmap, if it exists
84  * @new_state: the new state of the block
85  *
86  */
87 
88 static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
89 			       unsigned char new_state)
90 {
91 	unsigned char *byte1, *byte2, *end, cur_state;
92 	struct gfs2_bitmap *bi = rbm_bi(rbm);
93 	unsigned int buflen = bi->bi_bytes;
94 	const unsigned int bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
95 
96 	byte1 = bi->bi_bh->b_data + bi->bi_offset + (rbm->offset / GFS2_NBBY);
97 	end = bi->bi_bh->b_data + bi->bi_offset + buflen;
98 
99 	BUG_ON(byte1 >= end);
100 
101 	cur_state = (*byte1 >> bit) & GFS2_BIT_MASK;
102 
103 	if (unlikely(!valid_change[new_state * 4 + cur_state])) {
104 		struct gfs2_sbd *sdp = rbm->rgd->rd_sbd;
105 
106 		fs_warn(sdp, "buf_blk = 0x%x old_state=%d, new_state=%d\n",
107 			rbm->offset, cur_state, new_state);
108 		fs_warn(sdp, "rgrp=0x%llx bi_start=0x%x biblk: 0x%llx\n",
109 			(unsigned long long)rbm->rgd->rd_addr, bi->bi_start,
110 			(unsigned long long)bi->bi_bh->b_blocknr);
111 		fs_warn(sdp, "bi_offset=0x%x bi_bytes=0x%x block=0x%llx\n",
112 			bi->bi_offset, bi->bi_bytes,
113 			(unsigned long long)gfs2_rbm_to_block(rbm));
114 		dump_stack();
115 		gfs2_consist_rgrpd(rbm->rgd);
116 		return;
117 	}
118 	*byte1 ^= (cur_state ^ new_state) << bit;
119 
120 	if (do_clone && bi->bi_clone) {
121 		byte2 = bi->bi_clone + bi->bi_offset + (rbm->offset / GFS2_NBBY);
122 		cur_state = (*byte2 >> bit) & GFS2_BIT_MASK;
123 		*byte2 ^= (cur_state ^ new_state) << bit;
124 	}
125 }
126 
127 /**
128  * gfs2_testbit - test a bit in the bitmaps
129  * @rbm: The bit to test
130  * @use_clone: If true, test the clone bitmap, not the official bitmap.
131  *
132  * Some callers like gfs2_unaligned_extlen need to test the clone bitmaps,
133  * not the "real" bitmaps, to avoid allocating recently freed blocks.
134  *
135  * Returns: The two bit block state of the requested bit
136  */
137 
138 static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm, bool use_clone)
139 {
140 	struct gfs2_bitmap *bi = rbm_bi(rbm);
141 	const u8 *buffer;
142 	const u8 *byte;
143 	unsigned int bit;
144 
145 	if (use_clone && bi->bi_clone)
146 		buffer = bi->bi_clone;
147 	else
148 		buffer = bi->bi_bh->b_data;
149 	buffer += bi->bi_offset;
150 	byte = buffer + (rbm->offset / GFS2_NBBY);
151 	bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
152 
153 	return (*byte >> bit) & GFS2_BIT_MASK;
154 }
155 
156 /**
157  * gfs2_bit_search
158  * @ptr: Pointer to bitmap data
159  * @mask: Mask to use (normally 0x55555.... but adjusted for search start)
160  * @state: The state we are searching for
161  *
162  * We xor the bitmap data with a patter which is the bitwise opposite
163  * of what we are looking for, this gives rise to a pattern of ones
164  * wherever there is a match. Since we have two bits per entry, we
165  * take this pattern, shift it down by one place and then and it with
166  * the original. All the even bit positions (0,2,4, etc) then represent
167  * successful matches, so we mask with 0x55555..... to remove the unwanted
168  * odd bit positions.
169  *
170  * This allows searching of a whole u64 at once (32 blocks) with a
171  * single test (on 64 bit arches).
172  */
173 
174 static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
175 {
176 	u64 tmp;
177 	static const u64 search[] = {
178 		[0] = 0xffffffffffffffffULL,
179 		[1] = 0xaaaaaaaaaaaaaaaaULL,
180 		[2] = 0x5555555555555555ULL,
181 		[3] = 0x0000000000000000ULL,
182 	};
183 	tmp = le64_to_cpu(*ptr) ^ search[state];
184 	tmp &= (tmp >> 1);
185 	tmp &= mask;
186 	return tmp;
187 }
188 
189 /**
190  * rs_cmp - multi-block reservation range compare
191  * @blk: absolute file system block number of the new reservation
192  * @len: number of blocks in the new reservation
193  * @rs: existing reservation to compare against
194  *
195  * returns: 1 if the block range is beyond the reach of the reservation
196  *         -1 if the block range is before the start of the reservation
197  *          0 if the block range overlaps with the reservation
198  */
199 static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs)
200 {
201 	u64 startblk = gfs2_rbm_to_block(&rs->rs_rbm);
202 
203 	if (blk >= startblk + rs->rs_free)
204 		return 1;
205 	if (blk + len - 1 < startblk)
206 		return -1;
207 	return 0;
208 }
209 
210 /**
211  * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
212  *       a block in a given allocation state.
213  * @buf: the buffer that holds the bitmaps
214  * @len: the length (in bytes) of the buffer
215  * @goal: start search at this block's bit-pair (within @buffer)
216  * @state: GFS2_BLKST_XXX the state of the block we're looking for.
217  *
218  * Scope of @goal and returned block number is only within this bitmap buffer,
219  * not entire rgrp or filesystem.  @buffer will be offset from the actual
220  * beginning of a bitmap block buffer, skipping any header structures, but
221  * headers are always a multiple of 64 bits long so that the buffer is
222  * always aligned to a 64 bit boundary.
223  *
224  * The size of the buffer is in bytes, but is it assumed that it is
225  * always ok to read a complete multiple of 64 bits at the end
226  * of the block in case the end is no aligned to a natural boundary.
227  *
228  * Return: the block number (bitmap buffer scope) that was found
229  */
230 
231 static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
232 		       u32 goal, u8 state)
233 {
234 	u32 spoint = (goal << 1) & ((8*sizeof(u64)) - 1);
235 	const __le64 *ptr = ((__le64 *)buf) + (goal >> 5);
236 	const __le64 *end = (__le64 *)(buf + ALIGN(len, sizeof(u64)));
237 	u64 tmp;
238 	u64 mask = 0x5555555555555555ULL;
239 	u32 bit;
240 
241 	/* Mask off bits we don't care about at the start of the search */
242 	mask <<= spoint;
243 	tmp = gfs2_bit_search(ptr, mask, state);
244 	ptr++;
245 	while(tmp == 0 && ptr < end) {
246 		tmp = gfs2_bit_search(ptr, 0x5555555555555555ULL, state);
247 		ptr++;
248 	}
249 	/* Mask off any bits which are more than len bytes from the start */
250 	if (ptr == end && (len & (sizeof(u64) - 1)))
251 		tmp &= (((u64)~0) >> (64 - 8*(len & (sizeof(u64) - 1))));
252 	/* Didn't find anything, so return */
253 	if (tmp == 0)
254 		return BFITNOENT;
255 	ptr--;
256 	bit = __ffs64(tmp);
257 	bit /= 2;	/* two bits per entry in the bitmap */
258 	return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit;
259 }
260 
261 /**
262  * gfs2_rbm_from_block - Set the rbm based upon rgd and block number
263  * @rbm: The rbm with rgd already set correctly
264  * @block: The block number (filesystem relative)
265  *
266  * This sets the bi and offset members of an rbm based on a
267  * resource group and a filesystem relative block number. The
268  * resource group must be set in the rbm on entry, the bi and
269  * offset members will be set by this function.
270  *
271  * Returns: 0 on success, or an error code
272  */
273 
274 static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
275 {
276 	if (!rgrp_contains_block(rbm->rgd, block))
277 		return -E2BIG;
278 	rbm->bii = 0;
279 	rbm->offset = block - rbm->rgd->rd_data0;
280 	/* Check if the block is within the first block */
281 	if (rbm->offset < rbm_bi(rbm)->bi_blocks)
282 		return 0;
283 
284 	/* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */
285 	rbm->offset += (sizeof(struct gfs2_rgrp) -
286 			sizeof(struct gfs2_meta_header)) * GFS2_NBBY;
287 	rbm->bii = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
288 	rbm->offset -= rbm->bii * rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
289 	return 0;
290 }
291 
292 /**
293  * gfs2_rbm_incr - increment an rbm structure
294  * @rbm: The rbm with rgd already set correctly
295  *
296  * This function takes an existing rbm structure and increments it to the next
297  * viable block offset.
298  *
299  * Returns: If incrementing the offset would cause the rbm to go past the
300  *          end of the rgrp, true is returned, otherwise false.
301  *
302  */
303 
304 static bool gfs2_rbm_incr(struct gfs2_rbm *rbm)
305 {
306 	if (rbm->offset + 1 < rbm_bi(rbm)->bi_blocks) { /* in the same bitmap */
307 		rbm->offset++;
308 		return false;
309 	}
310 	if (rbm->bii == rbm->rgd->rd_length - 1) /* at the last bitmap */
311 		return true;
312 
313 	rbm->offset = 0;
314 	rbm->bii++;
315 	return false;
316 }
317 
318 /**
319  * gfs2_unaligned_extlen - Look for free blocks which are not byte aligned
320  * @rbm: Position to search (value/result)
321  * @n_unaligned: Number of unaligned blocks to check
322  * @len: Decremented for each block found (terminate on zero)
323  *
324  * Returns: true if a non-free block is encountered
325  */
326 
327 static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *len)
328 {
329 	u32 n;
330 	u8 res;
331 
332 	for (n = 0; n < n_unaligned; n++) {
333 		res = gfs2_testbit(rbm, true);
334 		if (res != GFS2_BLKST_FREE)
335 			return true;
336 		(*len)--;
337 		if (*len == 0)
338 			return true;
339 		if (gfs2_rbm_incr(rbm))
340 			return true;
341 	}
342 
343 	return false;
344 }
345 
346 /**
347  * gfs2_free_extlen - Return extent length of free blocks
348  * @rrbm: Starting position
349  * @len: Max length to check
350  *
351  * Starting at the block specified by the rbm, see how many free blocks
352  * there are, not reading more than len blocks ahead. This can be done
353  * using memchr_inv when the blocks are byte aligned, but has to be done
354  * on a block by block basis in case of unaligned blocks. Also this
355  * function can cope with bitmap boundaries (although it must stop on
356  * a resource group boundary)
357  *
358  * Returns: Number of free blocks in the extent
359  */
360 
361 static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
362 {
363 	struct gfs2_rbm rbm = *rrbm;
364 	u32 n_unaligned = rbm.offset & 3;
365 	u32 size = len;
366 	u32 bytes;
367 	u32 chunk_size;
368 	u8 *ptr, *start, *end;
369 	u64 block;
370 	struct gfs2_bitmap *bi;
371 
372 	if (n_unaligned &&
373 	    gfs2_unaligned_extlen(&rbm, 4 - n_unaligned, &len))
374 		goto out;
375 
376 	n_unaligned = len & 3;
377 	/* Start is now byte aligned */
378 	while (len > 3) {
379 		bi = rbm_bi(&rbm);
380 		start = bi->bi_bh->b_data;
381 		if (bi->bi_clone)
382 			start = bi->bi_clone;
383 		start += bi->bi_offset;
384 		end = start + bi->bi_bytes;
385 		BUG_ON(rbm.offset & 3);
386 		start += (rbm.offset / GFS2_NBBY);
387 		bytes = min_t(u32, len / GFS2_NBBY, (end - start));
388 		ptr = memchr_inv(start, 0, bytes);
389 		chunk_size = ((ptr == NULL) ? bytes : (ptr - start));
390 		chunk_size *= GFS2_NBBY;
391 		BUG_ON(len < chunk_size);
392 		len -= chunk_size;
393 		block = gfs2_rbm_to_block(&rbm);
394 		if (gfs2_rbm_from_block(&rbm, block + chunk_size)) {
395 			n_unaligned = 0;
396 			break;
397 		}
398 		if (ptr) {
399 			n_unaligned = 3;
400 			break;
401 		}
402 		n_unaligned = len & 3;
403 	}
404 
405 	/* Deal with any bits left over at the end */
406 	if (n_unaligned)
407 		gfs2_unaligned_extlen(&rbm, n_unaligned, &len);
408 out:
409 	return size - len;
410 }
411 
412 /**
413  * gfs2_bitcount - count the number of bits in a certain state
414  * @rgd: the resource group descriptor
415  * @buffer: the buffer that holds the bitmaps
416  * @buflen: the length (in bytes) of the buffer
417  * @state: the state of the block we're looking for
418  *
419  * Returns: The number of bits
420  */
421 
422 static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, const u8 *buffer,
423 			 unsigned int buflen, u8 state)
424 {
425 	const u8 *byte = buffer;
426 	const u8 *end = buffer + buflen;
427 	const u8 state1 = state << 2;
428 	const u8 state2 = state << 4;
429 	const u8 state3 = state << 6;
430 	u32 count = 0;
431 
432 	for (; byte < end; byte++) {
433 		if (((*byte) & 0x03) == state)
434 			count++;
435 		if (((*byte) & 0x0C) == state1)
436 			count++;
437 		if (((*byte) & 0x30) == state2)
438 			count++;
439 		if (((*byte) & 0xC0) == state3)
440 			count++;
441 	}
442 
443 	return count;
444 }
445 
446 /**
447  * gfs2_rgrp_verify - Verify that a resource group is consistent
448  * @rgd: the rgrp
449  *
450  */
451 
452 void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
453 {
454 	struct gfs2_sbd *sdp = rgd->rd_sbd;
455 	struct gfs2_bitmap *bi = NULL;
456 	u32 length = rgd->rd_length;
457 	u32 count[4], tmp;
458 	int buf, x;
459 
460 	memset(count, 0, 4 * sizeof(u32));
461 
462 	/* Count # blocks in each of 4 possible allocation states */
463 	for (buf = 0; buf < length; buf++) {
464 		bi = rgd->rd_bits + buf;
465 		for (x = 0; x < 4; x++)
466 			count[x] += gfs2_bitcount(rgd,
467 						  bi->bi_bh->b_data +
468 						  bi->bi_offset,
469 						  bi->bi_bytes, x);
470 	}
471 
472 	if (count[0] != rgd->rd_free) {
473 		if (gfs2_consist_rgrpd(rgd))
474 			fs_err(sdp, "free data mismatch:  %u != %u\n",
475 			       count[0], rgd->rd_free);
476 		return;
477 	}
478 
479 	tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
480 	if (count[1] != tmp) {
481 		if (gfs2_consist_rgrpd(rgd))
482 			fs_err(sdp, "used data mismatch:  %u != %u\n",
483 			       count[1], tmp);
484 		return;
485 	}
486 
487 	if (count[2] + count[3] != rgd->rd_dinodes) {
488 		if (gfs2_consist_rgrpd(rgd))
489 			fs_err(sdp, "used metadata mismatch:  %u != %u\n",
490 			       count[2] + count[3], rgd->rd_dinodes);
491 		return;
492 	}
493 }
494 
495 /**
496  * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
497  * @sdp: The GFS2 superblock
498  * @blk: The data block number
499  * @exact: True if this needs to be an exact match
500  *
501  * The @exact argument should be set to true by most callers. The exception
502  * is when we need to match blocks which are not represented by the rgrp
503  * bitmap, but which are part of the rgrp (i.e. padding blocks) which are
504  * there for alignment purposes. Another way of looking at it is that @exact
505  * matches only valid data/metadata blocks, but with @exact false, it will
506  * match any block within the extent of the rgrp.
507  *
508  * Returns: The resource group, or NULL if not found
509  */
510 
511 struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact)
512 {
513 	struct rb_node *n, *next;
514 	struct gfs2_rgrpd *cur;
515 
516 	spin_lock(&sdp->sd_rindex_spin);
517 	n = sdp->sd_rindex_tree.rb_node;
518 	while (n) {
519 		cur = rb_entry(n, struct gfs2_rgrpd, rd_node);
520 		next = NULL;
521 		if (blk < cur->rd_addr)
522 			next = n->rb_left;
523 		else if (blk >= cur->rd_data0 + cur->rd_data)
524 			next = n->rb_right;
525 		if (next == NULL) {
526 			spin_unlock(&sdp->sd_rindex_spin);
527 			if (exact) {
528 				if (blk < cur->rd_addr)
529 					return NULL;
530 				if (blk >= cur->rd_data0 + cur->rd_data)
531 					return NULL;
532 			}
533 			return cur;
534 		}
535 		n = next;
536 	}
537 	spin_unlock(&sdp->sd_rindex_spin);
538 
539 	return NULL;
540 }
541 
542 /**
543  * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem
544  * @sdp: The GFS2 superblock
545  *
546  * Returns: The first rgrp in the filesystem
547  */
548 
549 struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
550 {
551 	const struct rb_node *n;
552 	struct gfs2_rgrpd *rgd;
553 
554 	spin_lock(&sdp->sd_rindex_spin);
555 	n = rb_first(&sdp->sd_rindex_tree);
556 	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
557 	spin_unlock(&sdp->sd_rindex_spin);
558 
559 	return rgd;
560 }
561 
562 /**
563  * gfs2_rgrpd_get_next - get the next RG
564  * @rgd: the resource group descriptor
565  *
566  * Returns: The next rgrp
567  */
568 
569 struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
570 {
571 	struct gfs2_sbd *sdp = rgd->rd_sbd;
572 	const struct rb_node *n;
573 
574 	spin_lock(&sdp->sd_rindex_spin);
575 	n = rb_next(&rgd->rd_node);
576 	if (n == NULL)
577 		n = rb_first(&sdp->sd_rindex_tree);
578 
579 	if (unlikely(&rgd->rd_node == n)) {
580 		spin_unlock(&sdp->sd_rindex_spin);
581 		return NULL;
582 	}
583 	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
584 	spin_unlock(&sdp->sd_rindex_spin);
585 	return rgd;
586 }
587 
588 void check_and_update_goal(struct gfs2_inode *ip)
589 {
590 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
591 	if (!ip->i_goal || gfs2_blk2rgrpd(sdp, ip->i_goal, 1) == NULL)
592 		ip->i_goal = ip->i_no_addr;
593 }
594 
595 void gfs2_free_clones(struct gfs2_rgrpd *rgd)
596 {
597 	int x;
598 
599 	for (x = 0; x < rgd->rd_length; x++) {
600 		struct gfs2_bitmap *bi = rgd->rd_bits + x;
601 		kfree(bi->bi_clone);
602 		bi->bi_clone = NULL;
603 	}
604 }
605 
606 /**
607  * gfs2_rsqa_alloc - make sure we have a reservation assigned to the inode
608  *                 plus a quota allocations data structure, if necessary
609  * @ip: the inode for this reservation
610  */
611 int gfs2_rsqa_alloc(struct gfs2_inode *ip)
612 {
613 	return gfs2_qa_alloc(ip);
614 }
615 
616 static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs)
617 {
618 	struct gfs2_inode *ip = container_of(rs, struct gfs2_inode, i_res);
619 
620 	gfs2_print_dbg(seq, "  B: n:%llu s:%llu b:%u f:%u\n",
621 		       (unsigned long long)ip->i_no_addr,
622 		       (unsigned long long)gfs2_rbm_to_block(&rs->rs_rbm),
623 		       rs->rs_rbm.offset, rs->rs_free);
624 }
625 
626 /**
627  * __rs_deltree - remove a multi-block reservation from the rgd tree
628  * @rs: The reservation to remove
629  *
630  */
631 static void __rs_deltree(struct gfs2_blkreserv *rs)
632 {
633 	struct gfs2_rgrpd *rgd;
634 
635 	if (!gfs2_rs_active(rs))
636 		return;
637 
638 	rgd = rs->rs_rbm.rgd;
639 	trace_gfs2_rs(rs, TRACE_RS_TREEDEL);
640 	rb_erase(&rs->rs_node, &rgd->rd_rstree);
641 	RB_CLEAR_NODE(&rs->rs_node);
642 
643 	if (rs->rs_free) {
644 		u64 last_block = gfs2_rbm_to_block(&rs->rs_rbm) +
645 				 rs->rs_free - 1;
646 		struct gfs2_rbm last_rbm = { .rgd = rs->rs_rbm.rgd, };
647 		struct gfs2_bitmap *start, *last;
648 
649 		/* return reserved blocks to the rgrp */
650 		BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free);
651 		rs->rs_rbm.rgd->rd_reserved -= rs->rs_free;
652 		/* The rgrp extent failure point is likely not to increase;
653 		   it will only do so if the freed blocks are somehow
654 		   contiguous with a span of free blocks that follows. Still,
655 		   it will force the number to be recalculated later. */
656 		rgd->rd_extfail_pt += rs->rs_free;
657 		rs->rs_free = 0;
658 		if (gfs2_rbm_from_block(&last_rbm, last_block))
659 			return;
660 		start = rbm_bi(&rs->rs_rbm);
661 		last = rbm_bi(&last_rbm);
662 		do
663 			clear_bit(GBF_FULL, &start->bi_flags);
664 		while (start++ != last);
665 	}
666 }
667 
668 /**
669  * gfs2_rs_deltree - remove a multi-block reservation from the rgd tree
670  * @rs: The reservation to remove
671  *
672  */
673 void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
674 {
675 	struct gfs2_rgrpd *rgd;
676 
677 	rgd = rs->rs_rbm.rgd;
678 	if (rgd) {
679 		spin_lock(&rgd->rd_rsspin);
680 		__rs_deltree(rs);
681 		BUG_ON(rs->rs_free);
682 		spin_unlock(&rgd->rd_rsspin);
683 	}
684 }
685 
686 /**
687  * gfs2_rsqa_delete - delete a multi-block reservation and quota allocation
688  * @ip: The inode for this reservation
689  * @wcount: The inode's write count, or NULL
690  *
691  */
692 void gfs2_rsqa_delete(struct gfs2_inode *ip, atomic_t *wcount)
693 {
694 	down_write(&ip->i_rw_mutex);
695 	if ((wcount == NULL) || (atomic_read(wcount) <= 1))
696 		gfs2_rs_deltree(&ip->i_res);
697 	up_write(&ip->i_rw_mutex);
698 	gfs2_qa_delete(ip, wcount);
699 }
700 
701 /**
702  * return_all_reservations - return all reserved blocks back to the rgrp.
703  * @rgd: the rgrp that needs its space back
704  *
705  * We previously reserved a bunch of blocks for allocation. Now we need to
706  * give them back. This leave the reservation structures in tact, but removes
707  * all of their corresponding "no-fly zones".
708  */
709 static void return_all_reservations(struct gfs2_rgrpd *rgd)
710 {
711 	struct rb_node *n;
712 	struct gfs2_blkreserv *rs;
713 
714 	spin_lock(&rgd->rd_rsspin);
715 	while ((n = rb_first(&rgd->rd_rstree))) {
716 		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
717 		__rs_deltree(rs);
718 	}
719 	spin_unlock(&rgd->rd_rsspin);
720 }
721 
722 void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
723 {
724 	struct rb_node *n;
725 	struct gfs2_rgrpd *rgd;
726 	struct gfs2_glock *gl;
727 
728 	while ((n = rb_first(&sdp->sd_rindex_tree))) {
729 		rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
730 		gl = rgd->rd_gl;
731 
732 		rb_erase(n, &sdp->sd_rindex_tree);
733 
734 		if (gl) {
735 			glock_clear_object(gl, rgd);
736 			gfs2_glock_put(gl);
737 		}
738 
739 		gfs2_free_clones(rgd);
740 		kfree(rgd->rd_bits);
741 		rgd->rd_bits = NULL;
742 		return_all_reservations(rgd);
743 		kmem_cache_free(gfs2_rgrpd_cachep, rgd);
744 	}
745 }
746 
747 static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd)
748 {
749 	struct gfs2_sbd *sdp = rgd->rd_sbd;
750 
751 	fs_info(sdp, "ri_addr = %llu\n", (unsigned long long)rgd->rd_addr);
752 	fs_info(sdp, "ri_length = %u\n", rgd->rd_length);
753 	fs_info(sdp, "ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0);
754 	fs_info(sdp, "ri_data = %u\n", rgd->rd_data);
755 	fs_info(sdp, "ri_bitbytes = %u\n", rgd->rd_bitbytes);
756 }
757 
758 /**
759  * gfs2_compute_bitstructs - Compute the bitmap sizes
760  * @rgd: The resource group descriptor
761  *
762  * Calculates bitmap descriptors, one for each block that contains bitmap data
763  *
764  * Returns: errno
765  */
766 
767 static int compute_bitstructs(struct gfs2_rgrpd *rgd)
768 {
769 	struct gfs2_sbd *sdp = rgd->rd_sbd;
770 	struct gfs2_bitmap *bi;
771 	u32 length = rgd->rd_length; /* # blocks in hdr & bitmap */
772 	u32 bytes_left, bytes;
773 	int x;
774 
775 	if (!length)
776 		return -EINVAL;
777 
778 	rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_NOFS);
779 	if (!rgd->rd_bits)
780 		return -ENOMEM;
781 
782 	bytes_left = rgd->rd_bitbytes;
783 
784 	for (x = 0; x < length; x++) {
785 		bi = rgd->rd_bits + x;
786 
787 		bi->bi_flags = 0;
788 		/* small rgrp; bitmap stored completely in header block */
789 		if (length == 1) {
790 			bytes = bytes_left;
791 			bi->bi_offset = sizeof(struct gfs2_rgrp);
792 			bi->bi_start = 0;
793 			bi->bi_bytes = bytes;
794 			bi->bi_blocks = bytes * GFS2_NBBY;
795 		/* header block */
796 		} else if (x == 0) {
797 			bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
798 			bi->bi_offset = sizeof(struct gfs2_rgrp);
799 			bi->bi_start = 0;
800 			bi->bi_bytes = bytes;
801 			bi->bi_blocks = bytes * GFS2_NBBY;
802 		/* last block */
803 		} else if (x + 1 == length) {
804 			bytes = bytes_left;
805 			bi->bi_offset = sizeof(struct gfs2_meta_header);
806 			bi->bi_start = rgd->rd_bitbytes - bytes_left;
807 			bi->bi_bytes = bytes;
808 			bi->bi_blocks = bytes * GFS2_NBBY;
809 		/* other blocks */
810 		} else {
811 			bytes = sdp->sd_sb.sb_bsize -
812 				sizeof(struct gfs2_meta_header);
813 			bi->bi_offset = sizeof(struct gfs2_meta_header);
814 			bi->bi_start = rgd->rd_bitbytes - bytes_left;
815 			bi->bi_bytes = bytes;
816 			bi->bi_blocks = bytes * GFS2_NBBY;
817 		}
818 
819 		bytes_left -= bytes;
820 	}
821 
822 	if (bytes_left) {
823 		gfs2_consist_rgrpd(rgd);
824 		return -EIO;
825 	}
826 	bi = rgd->rd_bits + (length - 1);
827 	if ((bi->bi_start + bi->bi_bytes) * GFS2_NBBY != rgd->rd_data) {
828 		if (gfs2_consist_rgrpd(rgd)) {
829 			gfs2_rindex_print(rgd);
830 			fs_err(sdp, "start=%u len=%u offset=%u\n",
831 			       bi->bi_start, bi->bi_bytes, bi->bi_offset);
832 		}
833 		return -EIO;
834 	}
835 
836 	return 0;
837 }
838 
839 /**
840  * gfs2_ri_total - Total up the file system space, according to the rindex.
841  * @sdp: the filesystem
842  *
843  */
844 u64 gfs2_ri_total(struct gfs2_sbd *sdp)
845 {
846 	u64 total_data = 0;
847 	struct inode *inode = sdp->sd_rindex;
848 	struct gfs2_inode *ip = GFS2_I(inode);
849 	char buf[sizeof(struct gfs2_rindex)];
850 	int error, rgrps;
851 
852 	for (rgrps = 0;; rgrps++) {
853 		loff_t pos = rgrps * sizeof(struct gfs2_rindex);
854 
855 		if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode))
856 			break;
857 		error = gfs2_internal_read(ip, buf, &pos,
858 					   sizeof(struct gfs2_rindex));
859 		if (error != sizeof(struct gfs2_rindex))
860 			break;
861 		total_data += be32_to_cpu(((struct gfs2_rindex *)buf)->ri_data);
862 	}
863 	return total_data;
864 }
865 
866 static int rgd_insert(struct gfs2_rgrpd *rgd)
867 {
868 	struct gfs2_sbd *sdp = rgd->rd_sbd;
869 	struct rb_node **newn = &sdp->sd_rindex_tree.rb_node, *parent = NULL;
870 
871 	/* Figure out where to put new node */
872 	while (*newn) {
873 		struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd,
874 						  rd_node);
875 
876 		parent = *newn;
877 		if (rgd->rd_addr < cur->rd_addr)
878 			newn = &((*newn)->rb_left);
879 		else if (rgd->rd_addr > cur->rd_addr)
880 			newn = &((*newn)->rb_right);
881 		else
882 			return -EEXIST;
883 	}
884 
885 	rb_link_node(&rgd->rd_node, parent, newn);
886 	rb_insert_color(&rgd->rd_node, &sdp->sd_rindex_tree);
887 	sdp->sd_rgrps++;
888 	return 0;
889 }
890 
891 /**
892  * read_rindex_entry - Pull in a new resource index entry from the disk
893  * @ip: Pointer to the rindex inode
894  *
895  * Returns: 0 on success, > 0 on EOF, error code otherwise
896  */
897 
898 static int read_rindex_entry(struct gfs2_inode *ip)
899 {
900 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
901 	const unsigned bsize = sdp->sd_sb.sb_bsize;
902 	loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
903 	struct gfs2_rindex buf;
904 	int error;
905 	struct gfs2_rgrpd *rgd;
906 
907 	if (pos >= i_size_read(&ip->i_inode))
908 		return 1;
909 
910 	error = gfs2_internal_read(ip, (char *)&buf, &pos,
911 				   sizeof(struct gfs2_rindex));
912 
913 	if (error != sizeof(struct gfs2_rindex))
914 		return (error == 0) ? 1 : error;
915 
916 	rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS);
917 	error = -ENOMEM;
918 	if (!rgd)
919 		return error;
920 
921 	rgd->rd_sbd = sdp;
922 	rgd->rd_addr = be64_to_cpu(buf.ri_addr);
923 	rgd->rd_length = be32_to_cpu(buf.ri_length);
924 	rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
925 	rgd->rd_data = be32_to_cpu(buf.ri_data);
926 	rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
927 	spin_lock_init(&rgd->rd_rsspin);
928 
929 	error = compute_bitstructs(rgd);
930 	if (error)
931 		goto fail;
932 
933 	error = gfs2_glock_get(sdp, rgd->rd_addr,
934 			       &gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
935 	if (error)
936 		goto fail;
937 
938 	rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
939 	rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED);
940 	if (rgd->rd_data > sdp->sd_max_rg_data)
941 		sdp->sd_max_rg_data = rgd->rd_data;
942 	spin_lock(&sdp->sd_rindex_spin);
943 	error = rgd_insert(rgd);
944 	spin_unlock(&sdp->sd_rindex_spin);
945 	if (!error) {
946 		glock_set_object(rgd->rd_gl, rgd);
947 		rgd->rd_gl->gl_vm.start = (rgd->rd_addr * bsize) & PAGE_MASK;
948 		rgd->rd_gl->gl_vm.end = PAGE_ALIGN((rgd->rd_addr +
949 						    rgd->rd_length) * bsize) - 1;
950 		return 0;
951 	}
952 
953 	error = 0; /* someone else read in the rgrp; free it and ignore it */
954 	gfs2_glock_put(rgd->rd_gl);
955 
956 fail:
957 	kfree(rgd->rd_bits);
958 	rgd->rd_bits = NULL;
959 	kmem_cache_free(gfs2_rgrpd_cachep, rgd);
960 	return error;
961 }
962 
963 /**
964  * set_rgrp_preferences - Run all the rgrps, selecting some we prefer to use
965  * @sdp: the GFS2 superblock
966  *
967  * The purpose of this function is to select a subset of the resource groups
968  * and mark them as PREFERRED. We do it in such a way that each node prefers
969  * to use a unique set of rgrps to minimize glock contention.
970  */
971 static void set_rgrp_preferences(struct gfs2_sbd *sdp)
972 {
973 	struct gfs2_rgrpd *rgd, *first;
974 	int i;
975 
976 	/* Skip an initial number of rgrps, based on this node's journal ID.
977 	   That should start each node out on its own set. */
978 	rgd = gfs2_rgrpd_get_first(sdp);
979 	for (i = 0; i < sdp->sd_lockstruct.ls_jid; i++)
980 		rgd = gfs2_rgrpd_get_next(rgd);
981 	first = rgd;
982 
983 	do {
984 		rgd->rd_flags |= GFS2_RDF_PREFERRED;
985 		for (i = 0; i < sdp->sd_journals; i++) {
986 			rgd = gfs2_rgrpd_get_next(rgd);
987 			if (!rgd || rgd == first)
988 				break;
989 		}
990 	} while (rgd && rgd != first);
991 }
992 
993 /**
994  * gfs2_ri_update - Pull in a new resource index from the disk
995  * @ip: pointer to the rindex inode
996  *
997  * Returns: 0 on successful update, error code otherwise
998  */
999 
1000 static int gfs2_ri_update(struct gfs2_inode *ip)
1001 {
1002 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1003 	int error;
1004 
1005 	do {
1006 		error = read_rindex_entry(ip);
1007 	} while (error == 0);
1008 
1009 	if (error < 0)
1010 		return error;
1011 
1012 	set_rgrp_preferences(sdp);
1013 
1014 	sdp->sd_rindex_uptodate = 1;
1015 	return 0;
1016 }
1017 
1018 /**
1019  * gfs2_rindex_update - Update the rindex if required
1020  * @sdp: The GFS2 superblock
1021  *
1022  * We grab a lock on the rindex inode to make sure that it doesn't
1023  * change whilst we are performing an operation. We keep this lock
1024  * for quite long periods of time compared to other locks. This
1025  * doesn't matter, since it is shared and it is very, very rarely
1026  * accessed in the exclusive mode (i.e. only when expanding the filesystem).
1027  *
1028  * This makes sure that we're using the latest copy of the resource index
1029  * special file, which might have been updated if someone expanded the
1030  * filesystem (via gfs2_grow utility), which adds new resource groups.
1031  *
1032  * Returns: 0 on succeess, error code otherwise
1033  */
1034 
1035 int gfs2_rindex_update(struct gfs2_sbd *sdp)
1036 {
1037 	struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
1038 	struct gfs2_glock *gl = ip->i_gl;
1039 	struct gfs2_holder ri_gh;
1040 	int error = 0;
1041 	int unlock_required = 0;
1042 
1043 	/* Read new copy from disk if we don't have the latest */
1044 	if (!sdp->sd_rindex_uptodate) {
1045 		if (!gfs2_glock_is_locked_by_me(gl)) {
1046 			error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh);
1047 			if (error)
1048 				return error;
1049 			unlock_required = 1;
1050 		}
1051 		if (!sdp->sd_rindex_uptodate)
1052 			error = gfs2_ri_update(ip);
1053 		if (unlock_required)
1054 			gfs2_glock_dq_uninit(&ri_gh);
1055 	}
1056 
1057 	return error;
1058 }
1059 
1060 static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
1061 {
1062 	const struct gfs2_rgrp *str = buf;
1063 	u32 rg_flags;
1064 
1065 	rg_flags = be32_to_cpu(str->rg_flags);
1066 	rg_flags &= ~GFS2_RDF_MASK;
1067 	rgd->rd_flags &= GFS2_RDF_MASK;
1068 	rgd->rd_flags |= rg_flags;
1069 	rgd->rd_free = be32_to_cpu(str->rg_free);
1070 	rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
1071 	rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
1072 	/* rd_data0, rd_data and rd_bitbytes already set from rindex */
1073 }
1074 
1075 static void gfs2_rgrp_ondisk2lvb(struct gfs2_rgrp_lvb *rgl, const void *buf)
1076 {
1077 	const struct gfs2_rgrp *str = buf;
1078 
1079 	rgl->rl_magic = cpu_to_be32(GFS2_MAGIC);
1080 	rgl->rl_flags = str->rg_flags;
1081 	rgl->rl_free = str->rg_free;
1082 	rgl->rl_dinodes = str->rg_dinodes;
1083 	rgl->rl_igeneration = str->rg_igeneration;
1084 	rgl->__pad = 0UL;
1085 }
1086 
1087 static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
1088 {
1089 	struct gfs2_rgrpd *next = gfs2_rgrpd_get_next(rgd);
1090 	struct gfs2_rgrp *str = buf;
1091 	u32 crc;
1092 
1093 	str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK);
1094 	str->rg_free = cpu_to_be32(rgd->rd_free);
1095 	str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
1096 	if (next == NULL)
1097 		str->rg_skip = 0;
1098 	else if (next->rd_addr > rgd->rd_addr)
1099 		str->rg_skip = cpu_to_be32(next->rd_addr - rgd->rd_addr);
1100 	str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
1101 	str->rg_data0 = cpu_to_be64(rgd->rd_data0);
1102 	str->rg_data = cpu_to_be32(rgd->rd_data);
1103 	str->rg_bitbytes = cpu_to_be32(rgd->rd_bitbytes);
1104 	str->rg_crc = 0;
1105 	crc = gfs2_disk_hash(buf, sizeof(struct gfs2_rgrp));
1106 	str->rg_crc = cpu_to_be32(crc);
1107 
1108 	memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
1109 	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, buf);
1110 }
1111 
1112 static int gfs2_rgrp_lvb_valid(struct gfs2_rgrpd *rgd)
1113 {
1114 	struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
1115 	struct gfs2_rgrp *str = (struct gfs2_rgrp *)rgd->rd_bits[0].bi_bh->b_data;
1116 	int valid = 1;
1117 
1118 	if (rgl->rl_flags != str->rg_flags) {
1119 		printk(KERN_WARNING "GFS2: rgd: %llu lvb flag mismatch %u/%u",
1120 		       (unsigned long long)rgd->rd_addr,
1121 		       be32_to_cpu(rgl->rl_flags), be32_to_cpu(str->rg_flags));
1122 		valid = 0;
1123 	}
1124 	if (rgl->rl_free != str->rg_free) {
1125 		printk(KERN_WARNING "GFS2: rgd: %llu lvb free mismatch %u/%u",
1126 		       (unsigned long long)rgd->rd_addr,
1127 		       be32_to_cpu(rgl->rl_free), be32_to_cpu(str->rg_free));
1128 		valid = 0;
1129 	}
1130 	if (rgl->rl_dinodes != str->rg_dinodes) {
1131 		printk(KERN_WARNING "GFS2: rgd: %llu lvb dinode mismatch %u/%u",
1132 		       (unsigned long long)rgd->rd_addr,
1133 		       be32_to_cpu(rgl->rl_dinodes),
1134 		       be32_to_cpu(str->rg_dinodes));
1135 		valid = 0;
1136 	}
1137 	if (rgl->rl_igeneration != str->rg_igeneration) {
1138 		printk(KERN_WARNING "GFS2: rgd: %llu lvb igen mismatch "
1139 		       "%llu/%llu", (unsigned long long)rgd->rd_addr,
1140 		       (unsigned long long)be64_to_cpu(rgl->rl_igeneration),
1141 		       (unsigned long long)be64_to_cpu(str->rg_igeneration));
1142 		valid = 0;
1143 	}
1144 	return valid;
1145 }
1146 
1147 static u32 count_unlinked(struct gfs2_rgrpd *rgd)
1148 {
1149 	struct gfs2_bitmap *bi;
1150 	const u32 length = rgd->rd_length;
1151 	const u8 *buffer = NULL;
1152 	u32 i, goal, count = 0;
1153 
1154 	for (i = 0, bi = rgd->rd_bits; i < length; i++, bi++) {
1155 		goal = 0;
1156 		buffer = bi->bi_bh->b_data + bi->bi_offset;
1157 		WARN_ON(!buffer_uptodate(bi->bi_bh));
1158 		while (goal < bi->bi_blocks) {
1159 			goal = gfs2_bitfit(buffer, bi->bi_bytes, goal,
1160 					   GFS2_BLKST_UNLINKED);
1161 			if (goal == BFITNOENT)
1162 				break;
1163 			count++;
1164 			goal++;
1165 		}
1166 	}
1167 
1168 	return count;
1169 }
1170 
1171 
1172 /**
1173  * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
1174  * @rgd: the struct gfs2_rgrpd describing the RG to read in
1175  *
1176  * Read in all of a Resource Group's header and bitmap blocks.
1177  * Caller must eventually call gfs2_rgrp_relse() to free the bitmaps.
1178  *
1179  * Returns: errno
1180  */
1181 
1182 static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
1183 {
1184 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1185 	struct gfs2_glock *gl = rgd->rd_gl;
1186 	unsigned int length = rgd->rd_length;
1187 	struct gfs2_bitmap *bi;
1188 	unsigned int x, y;
1189 	int error;
1190 
1191 	if (rgd->rd_bits[0].bi_bh != NULL)
1192 		return 0;
1193 
1194 	for (x = 0; x < length; x++) {
1195 		bi = rgd->rd_bits + x;
1196 		error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, 0, &bi->bi_bh);
1197 		if (error)
1198 			goto fail;
1199 	}
1200 
1201 	for (y = length; y--;) {
1202 		bi = rgd->rd_bits + y;
1203 		error = gfs2_meta_wait(sdp, bi->bi_bh);
1204 		if (error)
1205 			goto fail;
1206 		if (gfs2_metatype_check(sdp, bi->bi_bh, y ? GFS2_METATYPE_RB :
1207 					      GFS2_METATYPE_RG)) {
1208 			error = -EIO;
1209 			goto fail;
1210 		}
1211 	}
1212 
1213 	if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) {
1214 		for (x = 0; x < length; x++)
1215 			clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags);
1216 		gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
1217 		rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
1218 		rgd->rd_free_clone = rgd->rd_free;
1219 		/* max out the rgrp allocation failure point */
1220 		rgd->rd_extfail_pt = rgd->rd_free;
1221 	}
1222 	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
1223 		rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd));
1224 		gfs2_rgrp_ondisk2lvb(rgd->rd_rgl,
1225 				     rgd->rd_bits[0].bi_bh->b_data);
1226 	}
1227 	else if (sdp->sd_args.ar_rgrplvb) {
1228 		if (!gfs2_rgrp_lvb_valid(rgd)){
1229 			gfs2_consist_rgrpd(rgd);
1230 			error = -EIO;
1231 			goto fail;
1232 		}
1233 		if (rgd->rd_rgl->rl_unlinked == 0)
1234 			rgd->rd_flags &= ~GFS2_RDF_CHECK;
1235 	}
1236 	return 0;
1237 
1238 fail:
1239 	while (x--) {
1240 		bi = rgd->rd_bits + x;
1241 		brelse(bi->bi_bh);
1242 		bi->bi_bh = NULL;
1243 		gfs2_assert_warn(sdp, !bi->bi_clone);
1244 	}
1245 
1246 	return error;
1247 }
1248 
1249 static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
1250 {
1251 	u32 rl_flags;
1252 
1253 	if (rgd->rd_flags & GFS2_RDF_UPTODATE)
1254 		return 0;
1255 
1256 	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic)
1257 		return gfs2_rgrp_bh_get(rgd);
1258 
1259 	rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags);
1260 	rl_flags &= ~GFS2_RDF_MASK;
1261 	rgd->rd_flags &= GFS2_RDF_MASK;
1262 	rgd->rd_flags |= (rl_flags | GFS2_RDF_CHECK);
1263 	if (rgd->rd_rgl->rl_unlinked == 0)
1264 		rgd->rd_flags &= ~GFS2_RDF_CHECK;
1265 	rgd->rd_free = be32_to_cpu(rgd->rd_rgl->rl_free);
1266 	rgd->rd_free_clone = rgd->rd_free;
1267 	rgd->rd_dinodes = be32_to_cpu(rgd->rd_rgl->rl_dinodes);
1268 	rgd->rd_igeneration = be64_to_cpu(rgd->rd_rgl->rl_igeneration);
1269 	return 0;
1270 }
1271 
1272 int gfs2_rgrp_go_lock(struct gfs2_holder *gh)
1273 {
1274 	struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
1275 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1276 
1277 	if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb)
1278 		return 0;
1279 	return gfs2_rgrp_bh_get(rgd);
1280 }
1281 
1282 /**
1283  * gfs2_rgrp_brelse - Release RG bitmaps read in with gfs2_rgrp_bh_get()
1284  * @rgd: The resource group
1285  *
1286  */
1287 
1288 void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd)
1289 {
1290 	int x, length = rgd->rd_length;
1291 
1292 	for (x = 0; x < length; x++) {
1293 		struct gfs2_bitmap *bi = rgd->rd_bits + x;
1294 		if (bi->bi_bh) {
1295 			brelse(bi->bi_bh);
1296 			bi->bi_bh = NULL;
1297 		}
1298 	}
1299 
1300 }
1301 
1302 /**
1303  * gfs2_rgrp_go_unlock - Unlock a rgrp glock
1304  * @gh: The glock holder for the resource group
1305  *
1306  */
1307 
1308 void gfs2_rgrp_go_unlock(struct gfs2_holder *gh)
1309 {
1310 	struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
1311 	int demote_requested = test_bit(GLF_DEMOTE, &gh->gh_gl->gl_flags) |
1312 		test_bit(GLF_PENDING_DEMOTE, &gh->gh_gl->gl_flags);
1313 
1314 	if (rgd && demote_requested)
1315 		gfs2_rgrp_brelse(rgd);
1316 }
1317 
1318 int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
1319 			     struct buffer_head *bh,
1320 			     const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed)
1321 {
1322 	struct super_block *sb = sdp->sd_vfs;
1323 	u64 blk;
1324 	sector_t start = 0;
1325 	sector_t nr_blks = 0;
1326 	int rv;
1327 	unsigned int x;
1328 	u32 trimmed = 0;
1329 	u8 diff;
1330 
1331 	for (x = 0; x < bi->bi_bytes; x++) {
1332 		const u8 *clone = bi->bi_clone ? bi->bi_clone : bi->bi_bh->b_data;
1333 		clone += bi->bi_offset;
1334 		clone += x;
1335 		if (bh) {
1336 			const u8 *orig = bh->b_data + bi->bi_offset + x;
1337 			diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1));
1338 		} else {
1339 			diff = ~(*clone | (*clone >> 1));
1340 		}
1341 		diff &= 0x55;
1342 		if (diff == 0)
1343 			continue;
1344 		blk = offset + ((bi->bi_start + x) * GFS2_NBBY);
1345 		while(diff) {
1346 			if (diff & 1) {
1347 				if (nr_blks == 0)
1348 					goto start_new_extent;
1349 				if ((start + nr_blks) != blk) {
1350 					if (nr_blks >= minlen) {
1351 						rv = sb_issue_discard(sb,
1352 							start, nr_blks,
1353 							GFP_NOFS, 0);
1354 						if (rv)
1355 							goto fail;
1356 						trimmed += nr_blks;
1357 					}
1358 					nr_blks = 0;
1359 start_new_extent:
1360 					start = blk;
1361 				}
1362 				nr_blks++;
1363 			}
1364 			diff >>= 2;
1365 			blk++;
1366 		}
1367 	}
1368 	if (nr_blks >= minlen) {
1369 		rv = sb_issue_discard(sb, start, nr_blks, GFP_NOFS, 0);
1370 		if (rv)
1371 			goto fail;
1372 		trimmed += nr_blks;
1373 	}
1374 	if (ptrimmed)
1375 		*ptrimmed = trimmed;
1376 	return 0;
1377 
1378 fail:
1379 	if (sdp->sd_args.ar_discard)
1380 		fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem\n", rv);
1381 	sdp->sd_args.ar_discard = 0;
1382 	return -EIO;
1383 }
1384 
1385 /**
1386  * gfs2_fitrim - Generate discard requests for unused bits of the filesystem
1387  * @filp: Any file on the filesystem
1388  * @argp: Pointer to the arguments (also used to pass result)
1389  *
1390  * Returns: 0 on success, otherwise error code
1391  */
1392 
1393 int gfs2_fitrim(struct file *filp, void __user *argp)
1394 {
1395 	struct inode *inode = file_inode(filp);
1396 	struct gfs2_sbd *sdp = GFS2_SB(inode);
1397 	struct request_queue *q = bdev_get_queue(sdp->sd_vfs->s_bdev);
1398 	struct buffer_head *bh;
1399 	struct gfs2_rgrpd *rgd;
1400 	struct gfs2_rgrpd *rgd_end;
1401 	struct gfs2_holder gh;
1402 	struct fstrim_range r;
1403 	int ret = 0;
1404 	u64 amt;
1405 	u64 trimmed = 0;
1406 	u64 start, end, minlen;
1407 	unsigned int x;
1408 	unsigned bs_shift = sdp->sd_sb.sb_bsize_shift;
1409 
1410 	if (!capable(CAP_SYS_ADMIN))
1411 		return -EPERM;
1412 
1413 	if (!blk_queue_discard(q))
1414 		return -EOPNOTSUPP;
1415 
1416 	if (copy_from_user(&r, argp, sizeof(r)))
1417 		return -EFAULT;
1418 
1419 	ret = gfs2_rindex_update(sdp);
1420 	if (ret)
1421 		return ret;
1422 
1423 	start = r.start >> bs_shift;
1424 	end = start + (r.len >> bs_shift);
1425 	minlen = max_t(u64, r.minlen,
1426 		       q->limits.discard_granularity) >> bs_shift;
1427 
1428 	if (end <= start || minlen > sdp->sd_max_rg_data)
1429 		return -EINVAL;
1430 
1431 	rgd = gfs2_blk2rgrpd(sdp, start, 0);
1432 	rgd_end = gfs2_blk2rgrpd(sdp, end, 0);
1433 
1434 	if ((gfs2_rgrpd_get_first(sdp) == gfs2_rgrpd_get_next(rgd_end))
1435 	    && (start > rgd_end->rd_data0 + rgd_end->rd_data))
1436 		return -EINVAL; /* start is beyond the end of the fs */
1437 
1438 	while (1) {
1439 
1440 		ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh);
1441 		if (ret)
1442 			goto out;
1443 
1444 		if (!(rgd->rd_flags & GFS2_RGF_TRIMMED)) {
1445 			/* Trim each bitmap in the rgrp */
1446 			for (x = 0; x < rgd->rd_length; x++) {
1447 				struct gfs2_bitmap *bi = rgd->rd_bits + x;
1448 				ret = gfs2_rgrp_send_discards(sdp,
1449 						rgd->rd_data0, NULL, bi, minlen,
1450 						&amt);
1451 				if (ret) {
1452 					gfs2_glock_dq_uninit(&gh);
1453 					goto out;
1454 				}
1455 				trimmed += amt;
1456 			}
1457 
1458 			/* Mark rgrp as having been trimmed */
1459 			ret = gfs2_trans_begin(sdp, RES_RG_HDR, 0);
1460 			if (ret == 0) {
1461 				bh = rgd->rd_bits[0].bi_bh;
1462 				rgd->rd_flags |= GFS2_RGF_TRIMMED;
1463 				gfs2_trans_add_meta(rgd->rd_gl, bh);
1464 				gfs2_rgrp_out(rgd, bh->b_data);
1465 				gfs2_trans_end(sdp);
1466 			}
1467 		}
1468 		gfs2_glock_dq_uninit(&gh);
1469 
1470 		if (rgd == rgd_end)
1471 			break;
1472 
1473 		rgd = gfs2_rgrpd_get_next(rgd);
1474 	}
1475 
1476 out:
1477 	r.len = trimmed << bs_shift;
1478 	if (copy_to_user(argp, &r, sizeof(r)))
1479 		return -EFAULT;
1480 
1481 	return ret;
1482 }
1483 
1484 /**
1485  * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree
1486  * @ip: the inode structure
1487  *
1488  */
1489 static void rs_insert(struct gfs2_inode *ip)
1490 {
1491 	struct rb_node **newn, *parent = NULL;
1492 	int rc;
1493 	struct gfs2_blkreserv *rs = &ip->i_res;
1494 	struct gfs2_rgrpd *rgd = rs->rs_rbm.rgd;
1495 	u64 fsblock = gfs2_rbm_to_block(&rs->rs_rbm);
1496 
1497 	BUG_ON(gfs2_rs_active(rs));
1498 
1499 	spin_lock(&rgd->rd_rsspin);
1500 	newn = &rgd->rd_rstree.rb_node;
1501 	while (*newn) {
1502 		struct gfs2_blkreserv *cur =
1503 			rb_entry(*newn, struct gfs2_blkreserv, rs_node);
1504 
1505 		parent = *newn;
1506 		rc = rs_cmp(fsblock, rs->rs_free, cur);
1507 		if (rc > 0)
1508 			newn = &((*newn)->rb_right);
1509 		else if (rc < 0)
1510 			newn = &((*newn)->rb_left);
1511 		else {
1512 			spin_unlock(&rgd->rd_rsspin);
1513 			WARN_ON(1);
1514 			return;
1515 		}
1516 	}
1517 
1518 	rb_link_node(&rs->rs_node, parent, newn);
1519 	rb_insert_color(&rs->rs_node, &rgd->rd_rstree);
1520 
1521 	/* Do our rgrp accounting for the reservation */
1522 	rgd->rd_reserved += rs->rs_free; /* blocks reserved */
1523 	spin_unlock(&rgd->rd_rsspin);
1524 	trace_gfs2_rs(rs, TRACE_RS_INSERT);
1525 }
1526 
1527 /**
1528  * rgd_free - return the number of free blocks we can allocate.
1529  * @rgd: the resource group
1530  *
1531  * This function returns the number of free blocks for an rgrp.
1532  * That's the clone-free blocks (blocks that are free, not including those
1533  * still being used for unlinked files that haven't been deleted.)
1534  *
1535  * It also subtracts any blocks reserved by someone else, but does not
1536  * include free blocks that are still part of our current reservation,
1537  * because obviously we can (and will) allocate them.
1538  */
1539 static inline u32 rgd_free(struct gfs2_rgrpd *rgd, struct gfs2_blkreserv *rs)
1540 {
1541 	u32 tot_reserved, tot_free;
1542 
1543 	if (WARN_ON_ONCE(rgd->rd_reserved < rs->rs_free))
1544 		return 0;
1545 	tot_reserved = rgd->rd_reserved - rs->rs_free;
1546 
1547 	if (rgd->rd_free_clone < tot_reserved)
1548 		tot_reserved = 0;
1549 
1550 	tot_free = rgd->rd_free_clone - tot_reserved;
1551 
1552 	return tot_free;
1553 }
1554 
1555 /**
1556  * rg_mblk_search - find a group of multiple free blocks to form a reservation
1557  * @rgd: the resource group descriptor
1558  * @ip: pointer to the inode for which we're reserving blocks
1559  * @ap: the allocation parameters
1560  *
1561  */
1562 
1563 static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
1564 			   const struct gfs2_alloc_parms *ap)
1565 {
1566 	struct gfs2_rbm rbm = { .rgd = rgd, };
1567 	u64 goal;
1568 	struct gfs2_blkreserv *rs = &ip->i_res;
1569 	u32 extlen;
1570 	u32 free_blocks = rgd_free(rgd, rs);
1571 	int ret;
1572 	struct inode *inode = &ip->i_inode;
1573 
1574 	if (S_ISDIR(inode->i_mode))
1575 		extlen = 1;
1576 	else {
1577 		extlen = max_t(u32, atomic_read(&ip->i_sizehint), ap->target);
1578 		extlen = clamp(extlen, (u32)RGRP_RSRV_MINBLKS, free_blocks);
1579 	}
1580 	if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen))
1581 		return;
1582 
1583 	/* Find bitmap block that contains bits for goal block */
1584 	if (rgrp_contains_block(rgd, ip->i_goal))
1585 		goal = ip->i_goal;
1586 	else
1587 		goal = rgd->rd_last_alloc + rgd->rd_data0;
1588 
1589 	if (WARN_ON(gfs2_rbm_from_block(&rbm, goal)))
1590 		return;
1591 
1592 	ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true);
1593 	if (ret == 0) {
1594 		rs->rs_rbm = rbm;
1595 		rs->rs_free = extlen;
1596 		rs_insert(ip);
1597 	} else {
1598 		if (goal == rgd->rd_last_alloc + rgd->rd_data0)
1599 			rgd->rd_last_alloc = 0;
1600 	}
1601 }
1602 
1603 /**
1604  * gfs2_next_unreserved_block - Return next block that is not reserved
1605  * @rgd: The resource group
1606  * @block: The starting block
1607  * @length: The required length
1608  * @ip: Ignore any reservations for this inode
1609  *
1610  * If the block does not appear in any reservation, then return the
1611  * block number unchanged. If it does appear in the reservation, then
1612  * keep looking through the tree of reservations in order to find the
1613  * first block number which is not reserved.
1614  */
1615 
1616 static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
1617 				      u32 length,
1618 				      const struct gfs2_inode *ip)
1619 {
1620 	struct gfs2_blkreserv *rs;
1621 	struct rb_node *n;
1622 	int rc;
1623 
1624 	spin_lock(&rgd->rd_rsspin);
1625 	n = rgd->rd_rstree.rb_node;
1626 	while (n) {
1627 		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
1628 		rc = rs_cmp(block, length, rs);
1629 		if (rc < 0)
1630 			n = n->rb_left;
1631 		else if (rc > 0)
1632 			n = n->rb_right;
1633 		else
1634 			break;
1635 	}
1636 
1637 	if (n) {
1638 		while ((rs_cmp(block, length, rs) == 0) && (&ip->i_res != rs)) {
1639 			block = gfs2_rbm_to_block(&rs->rs_rbm) + rs->rs_free;
1640 			n = n->rb_right;
1641 			if (n == NULL)
1642 				break;
1643 			rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
1644 		}
1645 	}
1646 
1647 	spin_unlock(&rgd->rd_rsspin);
1648 	return block;
1649 }
1650 
1651 /**
1652  * gfs2_reservation_check_and_update - Check for reservations during block alloc
1653  * @rbm: The current position in the resource group
1654  * @ip: The inode for which we are searching for blocks
1655  * @minext: The minimum extent length
1656  * @maxext: A pointer to the maximum extent structure
1657  *
1658  * This checks the current position in the rgrp to see whether there is
1659  * a reservation covering this block. If not then this function is a
1660  * no-op. If there is, then the position is moved to the end of the
1661  * contiguous reservation(s) so that we are pointing at the first
1662  * non-reserved block.
1663  *
1664  * Returns: 0 if no reservation, 1 if @rbm has changed, otherwise an error
1665  */
1666 
1667 static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
1668 					     const struct gfs2_inode *ip,
1669 					     u32 minext,
1670 					     struct gfs2_extent *maxext)
1671 {
1672 	u64 block = gfs2_rbm_to_block(rbm);
1673 	u32 extlen = 1;
1674 	u64 nblock;
1675 	int ret;
1676 
1677 	/*
1678 	 * If we have a minimum extent length, then skip over any extent
1679 	 * which is less than the min extent length in size.
1680 	 */
1681 	if (minext) {
1682 		extlen = gfs2_free_extlen(rbm, minext);
1683 		if (extlen <= maxext->len)
1684 			goto fail;
1685 	}
1686 
1687 	/*
1688 	 * Check the extent which has been found against the reservations
1689 	 * and skip if parts of it are already reserved
1690 	 */
1691 	nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip);
1692 	if (nblock == block) {
1693 		if (!minext || extlen >= minext)
1694 			return 0;
1695 
1696 		if (extlen > maxext->len) {
1697 			maxext->len = extlen;
1698 			maxext->rbm = *rbm;
1699 		}
1700 fail:
1701 		nblock = block + extlen;
1702 	}
1703 	ret = gfs2_rbm_from_block(rbm, nblock);
1704 	if (ret < 0)
1705 		return ret;
1706 	return 1;
1707 }
1708 
1709 /**
1710  * gfs2_rbm_find - Look for blocks of a particular state
1711  * @rbm: Value/result starting position and final position
1712  * @state: The state which we want to find
1713  * @minext: Pointer to the requested extent length (NULL for a single block)
1714  *          This is updated to be the actual reservation size.
1715  * @ip: If set, check for reservations
1716  * @nowrap: Stop looking at the end of the rgrp, rather than wrapping
1717  *          around until we've reached the starting point.
1718  *
1719  * Side effects:
1720  * - If looking for free blocks, we set GBF_FULL on each bitmap which
1721  *   has no free blocks in it.
1722  * - If looking for free blocks, we set rd_extfail_pt on each rgrp which
1723  *   has come up short on a free block search.
1724  *
1725  * Returns: 0 on success, -ENOSPC if there is no block of the requested state
1726  */
1727 
1728 static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
1729 			 const struct gfs2_inode *ip, bool nowrap)
1730 {
1731 	struct buffer_head *bh;
1732 	int initial_bii;
1733 	u32 initial_offset;
1734 	int first_bii = rbm->bii;
1735 	u32 first_offset = rbm->offset;
1736 	u32 offset;
1737 	u8 *buffer;
1738 	int n = 0;
1739 	int iters = rbm->rgd->rd_length;
1740 	int ret;
1741 	struct gfs2_bitmap *bi;
1742 	struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, };
1743 
1744 	/* If we are not starting at the beginning of a bitmap, then we
1745 	 * need to add one to the bitmap count to ensure that we search
1746 	 * the starting bitmap twice.
1747 	 */
1748 	if (rbm->offset != 0)
1749 		iters++;
1750 
1751 	while(1) {
1752 		bi = rbm_bi(rbm);
1753 		if ((ip == NULL || !gfs2_rs_active(&ip->i_res)) &&
1754 		    test_bit(GBF_FULL, &bi->bi_flags) &&
1755 		    (state == GFS2_BLKST_FREE))
1756 			goto next_bitmap;
1757 
1758 		bh = bi->bi_bh;
1759 		buffer = bh->b_data + bi->bi_offset;
1760 		WARN_ON(!buffer_uptodate(bh));
1761 		if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
1762 			buffer = bi->bi_clone + bi->bi_offset;
1763 		initial_offset = rbm->offset;
1764 		offset = gfs2_bitfit(buffer, bi->bi_bytes, rbm->offset, state);
1765 		if (offset == BFITNOENT)
1766 			goto bitmap_full;
1767 		rbm->offset = offset;
1768 		if (ip == NULL)
1769 			return 0;
1770 
1771 		initial_bii = rbm->bii;
1772 		ret = gfs2_reservation_check_and_update(rbm, ip,
1773 							minext ? *minext : 0,
1774 							&maxext);
1775 		if (ret == 0)
1776 			return 0;
1777 		if (ret > 0) {
1778 			n += (rbm->bii - initial_bii);
1779 			goto next_iter;
1780 		}
1781 		if (ret == -E2BIG) {
1782 			rbm->bii = 0;
1783 			rbm->offset = 0;
1784 			n += (rbm->bii - initial_bii);
1785 			goto res_covered_end_of_rgrp;
1786 		}
1787 		return ret;
1788 
1789 bitmap_full:	/* Mark bitmap as full and fall through */
1790 		if ((state == GFS2_BLKST_FREE) && initial_offset == 0)
1791 			set_bit(GBF_FULL, &bi->bi_flags);
1792 
1793 next_bitmap:	/* Find next bitmap in the rgrp */
1794 		rbm->offset = 0;
1795 		rbm->bii++;
1796 		if (rbm->bii == rbm->rgd->rd_length)
1797 			rbm->bii = 0;
1798 res_covered_end_of_rgrp:
1799 		if ((rbm->bii == 0) && nowrap)
1800 			break;
1801 		n++;
1802 next_iter:
1803 		if (n >= iters)
1804 			break;
1805 	}
1806 
1807 	if (minext == NULL || state != GFS2_BLKST_FREE)
1808 		return -ENOSPC;
1809 
1810 	/* If the extent was too small, and it's smaller than the smallest
1811 	   to have failed before, remember for future reference that it's
1812 	   useless to search this rgrp again for this amount or more. */
1813 	if ((first_offset == 0) && (first_bii == 0) &&
1814 	    (*minext < rbm->rgd->rd_extfail_pt))
1815 		rbm->rgd->rd_extfail_pt = *minext;
1816 
1817 	/* If the maximum extent we found is big enough to fulfill the
1818 	   minimum requirements, use it anyway. */
1819 	if (maxext.len) {
1820 		*rbm = maxext.rbm;
1821 		*minext = maxext.len;
1822 		return 0;
1823 	}
1824 
1825 	return -ENOSPC;
1826 }
1827 
1828 /**
1829  * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes
1830  * @rgd: The rgrp
1831  * @last_unlinked: block address of the last dinode we unlinked
1832  * @skip: block address we should explicitly not unlink
1833  *
1834  * Returns: 0 if no error
1835  *          The inode, if one has been found, in inode.
1836  */
1837 
1838 static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
1839 {
1840 	u64 block;
1841 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1842 	struct gfs2_glock *gl;
1843 	struct gfs2_inode *ip;
1844 	int error;
1845 	int found = 0;
1846 	struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 };
1847 
1848 	while (1) {
1849 		down_write(&sdp->sd_log_flush_lock);
1850 		error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL,
1851 				      true);
1852 		up_write(&sdp->sd_log_flush_lock);
1853 		if (error == -ENOSPC)
1854 			break;
1855 		if (WARN_ON_ONCE(error))
1856 			break;
1857 
1858 		block = gfs2_rbm_to_block(&rbm);
1859 		if (gfs2_rbm_from_block(&rbm, block + 1))
1860 			break;
1861 		if (*last_unlinked != NO_BLOCK && block <= *last_unlinked)
1862 			continue;
1863 		if (block == skip)
1864 			continue;
1865 		*last_unlinked = block;
1866 
1867 		error = gfs2_glock_get(sdp, block, &gfs2_iopen_glops, CREATE, &gl);
1868 		if (error)
1869 			continue;
1870 
1871 		/* If the inode is already in cache, we can ignore it here
1872 		 * because the existing inode disposal code will deal with
1873 		 * it when all refs have gone away. Accessing gl_object like
1874 		 * this is not safe in general. Here it is ok because we do
1875 		 * not dereference the pointer, and we only need an approx
1876 		 * answer to whether it is NULL or not.
1877 		 */
1878 		ip = gl->gl_object;
1879 
1880 		if (ip || queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
1881 			gfs2_glock_put(gl);
1882 		else
1883 			found++;
1884 
1885 		/* Limit reclaim to sensible number of tasks */
1886 		if (found > NR_CPUS)
1887 			return;
1888 	}
1889 
1890 	rgd->rd_flags &= ~GFS2_RDF_CHECK;
1891 	return;
1892 }
1893 
1894 /**
1895  * gfs2_rgrp_congested - Use stats to figure out whether an rgrp is congested
1896  * @rgd: The rgrp in question
1897  * @loops: An indication of how picky we can be (0=very, 1=less so)
1898  *
1899  * This function uses the recently added glock statistics in order to
1900  * figure out whether a parciular resource group is suffering from
1901  * contention from multiple nodes. This is done purely on the basis
1902  * of timings, since this is the only data we have to work with and
1903  * our aim here is to reject a resource group which is highly contended
1904  * but (very important) not to do this too often in order to ensure that
1905  * we do not land up introducing fragmentation by changing resource
1906  * groups when not actually required.
1907  *
1908  * The calculation is fairly simple, we want to know whether the SRTTB
1909  * (i.e. smoothed round trip time for blocking operations) to acquire
1910  * the lock for this rgrp's glock is significantly greater than the
1911  * time taken for resource groups on average. We introduce a margin in
1912  * the form of the variable @var which is computed as the sum of the two
1913  * respective variences, and multiplied by a factor depending on @loops
1914  * and whether we have a lot of data to base the decision on. This is
1915  * then tested against the square difference of the means in order to
1916  * decide whether the result is statistically significant or not.
1917  *
1918  * Returns: A boolean verdict on the congestion status
1919  */
1920 
1921 static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
1922 {
1923 	const struct gfs2_glock *gl = rgd->rd_gl;
1924 	const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
1925 	struct gfs2_lkstats *st;
1926 	u64 r_dcount, l_dcount;
1927 	u64 l_srttb, a_srttb = 0;
1928 	s64 srttb_diff;
1929 	u64 sqr_diff;
1930 	u64 var;
1931 	int cpu, nonzero = 0;
1932 
1933 	preempt_disable();
1934 	for_each_present_cpu(cpu) {
1935 		st = &per_cpu_ptr(sdp->sd_lkstats, cpu)->lkstats[LM_TYPE_RGRP];
1936 		if (st->stats[GFS2_LKS_SRTTB]) {
1937 			a_srttb += st->stats[GFS2_LKS_SRTTB];
1938 			nonzero++;
1939 		}
1940 	}
1941 	st = &this_cpu_ptr(sdp->sd_lkstats)->lkstats[LM_TYPE_RGRP];
1942 	if (nonzero)
1943 		do_div(a_srttb, nonzero);
1944 	r_dcount = st->stats[GFS2_LKS_DCOUNT];
1945 	var = st->stats[GFS2_LKS_SRTTVARB] +
1946 	      gl->gl_stats.stats[GFS2_LKS_SRTTVARB];
1947 	preempt_enable();
1948 
1949 	l_srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB];
1950 	l_dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT];
1951 
1952 	if ((l_dcount < 1) || (r_dcount < 1) || (a_srttb == 0))
1953 		return false;
1954 
1955 	srttb_diff = a_srttb - l_srttb;
1956 	sqr_diff = srttb_diff * srttb_diff;
1957 
1958 	var *= 2;
1959 	if (l_dcount < 8 || r_dcount < 8)
1960 		var *= 2;
1961 	if (loops == 1)
1962 		var *= 2;
1963 
1964 	return ((srttb_diff < 0) && (sqr_diff > var));
1965 }
1966 
1967 /**
1968  * gfs2_rgrp_used_recently
1969  * @rs: The block reservation with the rgrp to test
1970  * @msecs: The time limit in milliseconds
1971  *
1972  * Returns: True if the rgrp glock has been used within the time limit
1973  */
1974 static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs,
1975 				    u64 msecs)
1976 {
1977 	u64 tdiff;
1978 
1979 	tdiff = ktime_to_ns(ktime_sub(ktime_get_real(),
1980                             rs->rs_rbm.rgd->rd_gl->gl_dstamp));
1981 
1982 	return tdiff > (msecs * 1000 * 1000);
1983 }
1984 
1985 static u32 gfs2_orlov_skip(const struct gfs2_inode *ip)
1986 {
1987 	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1988 	u32 skip;
1989 
1990 	get_random_bytes(&skip, sizeof(skip));
1991 	return skip % sdp->sd_rgrps;
1992 }
1993 
1994 static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin)
1995 {
1996 	struct gfs2_rgrpd *rgd = *pos;
1997 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1998 
1999 	rgd = gfs2_rgrpd_get_next(rgd);
2000 	if (rgd == NULL)
2001 		rgd = gfs2_rgrpd_get_first(sdp);
2002 	*pos = rgd;
2003 	if (rgd != begin) /* If we didn't wrap */
2004 		return true;
2005 	return false;
2006 }
2007 
2008 /**
2009  * fast_to_acquire - determine if a resource group will be fast to acquire
2010  *
2011  * If this is one of our preferred rgrps, it should be quicker to acquire,
2012  * because we tried to set ourselves up as dlm lock master.
2013  */
2014 static inline int fast_to_acquire(struct gfs2_rgrpd *rgd)
2015 {
2016 	struct gfs2_glock *gl = rgd->rd_gl;
2017 
2018 	if (gl->gl_state != LM_ST_UNLOCKED && list_empty(&gl->gl_holders) &&
2019 	    !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
2020 	    !test_bit(GLF_DEMOTE, &gl->gl_flags))
2021 		return 1;
2022 	if (rgd->rd_flags & GFS2_RDF_PREFERRED)
2023 		return 1;
2024 	return 0;
2025 }
2026 
2027 /**
2028  * gfs2_inplace_reserve - Reserve space in the filesystem
2029  * @ip: the inode to reserve space for
2030  * @ap: the allocation parameters
2031  *
2032  * We try our best to find an rgrp that has at least ap->target blocks
2033  * available. After a couple of passes (loops == 2), the prospects of finding
2034  * such an rgrp diminish. At this stage, we return the first rgrp that has
2035  * at least ap->min_target blocks available. Either way, we set ap->allowed to
2036  * the number of blocks available in the chosen rgrp.
2037  *
2038  * Returns: 0 on success,
2039  *          -ENOMEM if a suitable rgrp can't be found
2040  *          errno otherwise
2041  */
2042 
2043 int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
2044 {
2045 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2046 	struct gfs2_rgrpd *begin = NULL;
2047 	struct gfs2_blkreserv *rs = &ip->i_res;
2048 	int error = 0, rg_locked, flags = 0;
2049 	u64 last_unlinked = NO_BLOCK;
2050 	int loops = 0;
2051 	u32 free_blocks, skip = 0;
2052 
2053 	if (sdp->sd_args.ar_rgrplvb)
2054 		flags |= GL_SKIP;
2055 	if (gfs2_assert_warn(sdp, ap->target))
2056 		return -EINVAL;
2057 	if (gfs2_rs_active(rs)) {
2058 		begin = rs->rs_rbm.rgd;
2059 	} else if (rs->rs_rbm.rgd &&
2060 		   rgrp_contains_block(rs->rs_rbm.rgd, ip->i_goal)) {
2061 		begin = rs->rs_rbm.rgd;
2062 	} else {
2063 		check_and_update_goal(ip);
2064 		rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
2065 	}
2066 	if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV))
2067 		skip = gfs2_orlov_skip(ip);
2068 	if (rs->rs_rbm.rgd == NULL)
2069 		return -EBADSLT;
2070 
2071 	while (loops < 3) {
2072 		rg_locked = 1;
2073 
2074 		if (!gfs2_glock_is_locked_by_me(rs->rs_rbm.rgd->rd_gl)) {
2075 			rg_locked = 0;
2076 			if (skip && skip--)
2077 				goto next_rgrp;
2078 			if (!gfs2_rs_active(rs)) {
2079 				if (loops == 0 &&
2080 				    !fast_to_acquire(rs->rs_rbm.rgd))
2081 					goto next_rgrp;
2082 				if ((loops < 2) &&
2083 				    gfs2_rgrp_used_recently(rs, 1000) &&
2084 				    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
2085 					goto next_rgrp;
2086 			}
2087 			error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl,
2088 						   LM_ST_EXCLUSIVE, flags,
2089 						   &ip->i_rgd_gh);
2090 			if (unlikely(error))
2091 				return error;
2092 			if (!gfs2_rs_active(rs) && (loops < 2) &&
2093 			    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
2094 				goto skip_rgrp;
2095 			if (sdp->sd_args.ar_rgrplvb) {
2096 				error = update_rgrp_lvb(rs->rs_rbm.rgd);
2097 				if (unlikely(error)) {
2098 					gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2099 					return error;
2100 				}
2101 			}
2102 		}
2103 
2104 		/* Skip unusable resource groups */
2105 		if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC |
2106 						 GFS2_RDF_ERROR)) ||
2107 		    (loops == 0 && ap->target > rs->rs_rbm.rgd->rd_extfail_pt))
2108 			goto skip_rgrp;
2109 
2110 		if (sdp->sd_args.ar_rgrplvb)
2111 			gfs2_rgrp_bh_get(rs->rs_rbm.rgd);
2112 
2113 		/* Get a reservation if we don't already have one */
2114 		if (!gfs2_rs_active(rs))
2115 			rg_mblk_search(rs->rs_rbm.rgd, ip, ap);
2116 
2117 		/* Skip rgrps when we can't get a reservation on first pass */
2118 		if (!gfs2_rs_active(rs) && (loops < 1))
2119 			goto check_rgrp;
2120 
2121 		/* If rgrp has enough free space, use it */
2122 		free_blocks = rgd_free(rs->rs_rbm.rgd, rs);
2123 		if (free_blocks >= ap->target ||
2124 		    (loops == 2 && ap->min_target &&
2125 		     free_blocks >= ap->min_target)) {
2126 			ap->allowed = free_blocks;
2127 			return 0;
2128 		}
2129 check_rgrp:
2130 		/* Check for unlinked inodes which can be reclaimed */
2131 		if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK)
2132 			try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked,
2133 					ip->i_no_addr);
2134 skip_rgrp:
2135 		/* Drop reservation, if we couldn't use reserved rgrp */
2136 		if (gfs2_rs_active(rs))
2137 			gfs2_rs_deltree(rs);
2138 
2139 		/* Unlock rgrp if required */
2140 		if (!rg_locked)
2141 			gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2142 next_rgrp:
2143 		/* Find the next rgrp, and continue looking */
2144 		if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin))
2145 			continue;
2146 		if (skip)
2147 			continue;
2148 
2149 		/* If we've scanned all the rgrps, but found no free blocks
2150 		 * then this checks for some less likely conditions before
2151 		 * trying again.
2152 		 */
2153 		loops++;
2154 		/* Check that fs hasn't grown if writing to rindex */
2155 		if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) {
2156 			error = gfs2_ri_update(ip);
2157 			if (error)
2158 				return error;
2159 		}
2160 		/* Flushing the log may release space */
2161 		if (loops == 2)
2162 			gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
2163 				       GFS2_LFC_INPLACE_RESERVE);
2164 	}
2165 
2166 	return -ENOSPC;
2167 }
2168 
2169 /**
2170  * gfs2_inplace_release - release an inplace reservation
2171  * @ip: the inode the reservation was taken out on
2172  *
2173  * Release a reservation made by gfs2_inplace_reserve().
2174  */
2175 
2176 void gfs2_inplace_release(struct gfs2_inode *ip)
2177 {
2178 	if (gfs2_holder_initialized(&ip->i_rgd_gh))
2179 		gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2180 }
2181 
2182 /**
2183  * gfs2_alloc_extent - allocate an extent from a given bitmap
2184  * @rbm: the resource group information
2185  * @dinode: TRUE if the first block we allocate is for a dinode
2186  * @n: The extent length (value/result)
2187  *
2188  * Add the bitmap buffer to the transaction.
2189  * Set the found bits to @new_state to change block's allocation state.
2190  */
2191 static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode,
2192 			     unsigned int *n)
2193 {
2194 	struct gfs2_rbm pos = { .rgd = rbm->rgd, };
2195 	const unsigned int elen = *n;
2196 	u64 block;
2197 	int ret;
2198 
2199 	*n = 1;
2200 	block = gfs2_rbm_to_block(rbm);
2201 	gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm_bi(rbm)->bi_bh);
2202 	gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
2203 	block++;
2204 	while (*n < elen) {
2205 		ret = gfs2_rbm_from_block(&pos, block);
2206 		if (ret || gfs2_testbit(&pos, true) != GFS2_BLKST_FREE)
2207 			break;
2208 		gfs2_trans_add_meta(pos.rgd->rd_gl, rbm_bi(&pos)->bi_bh);
2209 		gfs2_setbit(&pos, true, GFS2_BLKST_USED);
2210 		(*n)++;
2211 		block++;
2212 	}
2213 }
2214 
2215 /**
2216  * rgblk_free - Change alloc state of given block(s)
2217  * @sdp: the filesystem
2218  * @rgd: the resource group the blocks are in
2219  * @bstart: the start of a run of blocks to free
2220  * @blen: the length of the block run (all must lie within ONE RG!)
2221  * @new_state: GFS2_BLKST_XXX the after-allocation block state
2222  */
2223 
2224 static void rgblk_free(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd,
2225 		       u64 bstart, u32 blen, unsigned char new_state)
2226 {
2227 	struct gfs2_rbm rbm;
2228 	struct gfs2_bitmap *bi, *bi_prev = NULL;
2229 
2230 	rbm.rgd = rgd;
2231 	if (WARN_ON_ONCE(gfs2_rbm_from_block(&rbm, bstart)))
2232 		return;
2233 	while (blen--) {
2234 		bi = rbm_bi(&rbm);
2235 		if (bi != bi_prev) {
2236 			if (!bi->bi_clone) {
2237 				bi->bi_clone = kmalloc(bi->bi_bh->b_size,
2238 						      GFP_NOFS | __GFP_NOFAIL);
2239 				memcpy(bi->bi_clone + bi->bi_offset,
2240 				       bi->bi_bh->b_data + bi->bi_offset,
2241 				       bi->bi_bytes);
2242 			}
2243 			gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
2244 			bi_prev = bi;
2245 		}
2246 		gfs2_setbit(&rbm, false, new_state);
2247 		gfs2_rbm_incr(&rbm);
2248 	}
2249 }
2250 
2251 /**
2252  * gfs2_rgrp_dump - print out an rgrp
2253  * @seq: The iterator
2254  * @gl: The glock in question
2255  *
2256  */
2257 
2258 void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
2259 {
2260 	struct gfs2_rgrpd *rgd = gl->gl_object;
2261 	struct gfs2_blkreserv *trs;
2262 	const struct rb_node *n;
2263 
2264 	if (rgd == NULL)
2265 		return;
2266 	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n",
2267 		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
2268 		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
2269 		       rgd->rd_reserved, rgd->rd_extfail_pt);
2270 	if (rgd->rd_sbd->sd_args.ar_rgrplvb) {
2271 		struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
2272 
2273 		gfs2_print_dbg(seq, "  L: f:%02x b:%u i:%u\n",
2274 			       be32_to_cpu(rgl->rl_flags),
2275 			       be32_to_cpu(rgl->rl_free),
2276 			       be32_to_cpu(rgl->rl_dinodes));
2277 	}
2278 	spin_lock(&rgd->rd_rsspin);
2279 	for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
2280 		trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
2281 		dump_rs(seq, trs);
2282 	}
2283 	spin_unlock(&rgd->rd_rsspin);
2284 }
2285 
2286 static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
2287 {
2288 	struct gfs2_sbd *sdp = rgd->rd_sbd;
2289 	fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
2290 		(unsigned long long)rgd->rd_addr);
2291 	fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
2292 	gfs2_rgrp_dump(NULL, rgd->rd_gl);
2293 	rgd->rd_flags |= GFS2_RDF_ERROR;
2294 }
2295 
2296 /**
2297  * gfs2_adjust_reservation - Adjust (or remove) a reservation after allocation
2298  * @ip: The inode we have just allocated blocks for
2299  * @rbm: The start of the allocated blocks
2300  * @len: The extent length
2301  *
2302  * Adjusts a reservation after an allocation has taken place. If the
2303  * reservation does not match the allocation, or if it is now empty
2304  * then it is removed.
2305  */
2306 
2307 static void gfs2_adjust_reservation(struct gfs2_inode *ip,
2308 				    const struct gfs2_rbm *rbm, unsigned len)
2309 {
2310 	struct gfs2_blkreserv *rs = &ip->i_res;
2311 	struct gfs2_rgrpd *rgd = rbm->rgd;
2312 	unsigned rlen;
2313 	u64 block;
2314 	int ret;
2315 
2316 	spin_lock(&rgd->rd_rsspin);
2317 	if (gfs2_rs_active(rs)) {
2318 		if (gfs2_rbm_eq(&rs->rs_rbm, rbm)) {
2319 			block = gfs2_rbm_to_block(rbm);
2320 			ret = gfs2_rbm_from_block(&rs->rs_rbm, block + len);
2321 			rlen = min(rs->rs_free, len);
2322 			rs->rs_free -= rlen;
2323 			rgd->rd_reserved -= rlen;
2324 			trace_gfs2_rs(rs, TRACE_RS_CLAIM);
2325 			if (rs->rs_free && !ret)
2326 				goto out;
2327 			/* We used up our block reservation, so we should
2328 			   reserve more blocks next time. */
2329 			atomic_add(RGRP_RSRV_ADDBLKS, &ip->i_sizehint);
2330 		}
2331 		__rs_deltree(rs);
2332 	}
2333 out:
2334 	spin_unlock(&rgd->rd_rsspin);
2335 }
2336 
2337 /**
2338  * gfs2_set_alloc_start - Set starting point for block allocation
2339  * @rbm: The rbm which will be set to the required location
2340  * @ip: The gfs2 inode
2341  * @dinode: Flag to say if allocation includes a new inode
2342  *
2343  * This sets the starting point from the reservation if one is active
2344  * otherwise it falls back to guessing a start point based on the
2345  * inode's goal block or the last allocation point in the rgrp.
2346  */
2347 
2348 static void gfs2_set_alloc_start(struct gfs2_rbm *rbm,
2349 				 const struct gfs2_inode *ip, bool dinode)
2350 {
2351 	u64 goal;
2352 
2353 	if (gfs2_rs_active(&ip->i_res)) {
2354 		*rbm = ip->i_res.rs_rbm;
2355 		return;
2356 	}
2357 
2358 	if (!dinode && rgrp_contains_block(rbm->rgd, ip->i_goal))
2359 		goal = ip->i_goal;
2360 	else
2361 		goal = rbm->rgd->rd_last_alloc + rbm->rgd->rd_data0;
2362 
2363 	if (WARN_ON_ONCE(gfs2_rbm_from_block(rbm, goal))) {
2364 		rbm->bii = 0;
2365 		rbm->offset = 0;
2366 	}
2367 }
2368 
2369 /**
2370  * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode
2371  * @ip: the inode to allocate the block for
2372  * @bn: Used to return the starting block number
2373  * @nblocks: requested number of blocks/extent length (value/result)
2374  * @dinode: 1 if we're allocating a dinode block, else 0
2375  * @generation: the generation number of the inode
2376  *
2377  * Returns: 0 or error
2378  */
2379 
2380 int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
2381 		      bool dinode, u64 *generation)
2382 {
2383 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2384 	struct buffer_head *dibh;
2385 	struct gfs2_rbm rbm = { .rgd = ip->i_res.rs_rbm.rgd, };
2386 	unsigned int ndata;
2387 	u64 block; /* block, within the file system scope */
2388 	int error;
2389 
2390 	gfs2_set_alloc_start(&rbm, ip, dinode);
2391 	error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false);
2392 
2393 	if (error == -ENOSPC) {
2394 		gfs2_set_alloc_start(&rbm, ip, dinode);
2395 		error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false);
2396 	}
2397 
2398 	/* Since all blocks are reserved in advance, this shouldn't happen */
2399 	if (error) {
2400 		fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d fail_pt=%d\n",
2401 			(unsigned long long)ip->i_no_addr, error, *nblocks,
2402 			test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags),
2403 			rbm.rgd->rd_extfail_pt);
2404 		goto rgrp_error;
2405 	}
2406 
2407 	gfs2_alloc_extent(&rbm, dinode, nblocks);
2408 	block = gfs2_rbm_to_block(&rbm);
2409 	rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0;
2410 	if (gfs2_rs_active(&ip->i_res))
2411 		gfs2_adjust_reservation(ip, &rbm, *nblocks);
2412 	ndata = *nblocks;
2413 	if (dinode)
2414 		ndata--;
2415 
2416 	if (!dinode) {
2417 		ip->i_goal = block + ndata - 1;
2418 		error = gfs2_meta_inode_buffer(ip, &dibh);
2419 		if (error == 0) {
2420 			struct gfs2_dinode *di =
2421 				(struct gfs2_dinode *)dibh->b_data;
2422 			gfs2_trans_add_meta(ip->i_gl, dibh);
2423 			di->di_goal_meta = di->di_goal_data =
2424 				cpu_to_be64(ip->i_goal);
2425 			brelse(dibh);
2426 		}
2427 	}
2428 	if (rbm.rgd->rd_free < *nblocks) {
2429 		fs_warn(sdp, "nblocks=%u\n", *nblocks);
2430 		goto rgrp_error;
2431 	}
2432 
2433 	rbm.rgd->rd_free -= *nblocks;
2434 	if (dinode) {
2435 		rbm.rgd->rd_dinodes++;
2436 		*generation = rbm.rgd->rd_igeneration++;
2437 		if (*generation == 0)
2438 			*generation = rbm.rgd->rd_igeneration++;
2439 	}
2440 
2441 	gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh);
2442 	gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data);
2443 
2444 	gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0);
2445 	if (dinode)
2446 		gfs2_trans_add_unrevoke(sdp, block, *nblocks);
2447 
2448 	gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid);
2449 
2450 	rbm.rgd->rd_free_clone -= *nblocks;
2451 	trace_gfs2_block_alloc(ip, rbm.rgd, block, *nblocks,
2452 			       dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
2453 	*bn = block;
2454 	return 0;
2455 
2456 rgrp_error:
2457 	gfs2_rgrp_error(rbm.rgd);
2458 	return -EIO;
2459 }
2460 
2461 /**
2462  * __gfs2_free_blocks - free a contiguous run of block(s)
2463  * @ip: the inode these blocks are being freed from
2464  * @rgd: the resource group the blocks are in
2465  * @bstart: first block of a run of contiguous blocks
2466  * @blen: the length of the block run
2467  * @meta: 1 if the blocks represent metadata
2468  *
2469  */
2470 
2471 void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
2472 			u64 bstart, u32 blen, int meta)
2473 {
2474 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2475 
2476 	rgblk_free(sdp, rgd, bstart, blen, GFS2_BLKST_FREE);
2477 	trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE);
2478 	rgd->rd_free += blen;
2479 	rgd->rd_flags &= ~GFS2_RGF_TRIMMED;
2480 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2481 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2482 
2483 	/* Directories keep their data in the metadata address space */
2484 	if (meta || ip->i_depth)
2485 		gfs2_meta_wipe(ip, bstart, blen);
2486 }
2487 
2488 /**
2489  * gfs2_free_meta - free a contiguous run of data block(s)
2490  * @ip: the inode these blocks are being freed from
2491  * @rgd: the resource group the blocks are in
2492  * @bstart: first block of a run of contiguous blocks
2493  * @blen: the length of the block run
2494  *
2495  */
2496 
2497 void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
2498 		    u64 bstart, u32 blen)
2499 {
2500 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2501 
2502 	__gfs2_free_blocks(ip, rgd, bstart, blen, 1);
2503 	gfs2_statfs_change(sdp, 0, +blen, 0);
2504 	gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
2505 }
2506 
2507 void gfs2_unlink_di(struct inode *inode)
2508 {
2509 	struct gfs2_inode *ip = GFS2_I(inode);
2510 	struct gfs2_sbd *sdp = GFS2_SB(inode);
2511 	struct gfs2_rgrpd *rgd;
2512 	u64 blkno = ip->i_no_addr;
2513 
2514 	rgd = gfs2_blk2rgrpd(sdp, blkno, true);
2515 	if (!rgd)
2516 		return;
2517 	rgblk_free(sdp, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
2518 	trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
2519 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2520 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2521 	be32_add_cpu(&rgd->rd_rgl->rl_unlinked, 1);
2522 }
2523 
2524 void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
2525 {
2526 	struct gfs2_sbd *sdp = rgd->rd_sbd;
2527 
2528 	rgblk_free(sdp, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
2529 	if (!rgd->rd_dinodes)
2530 		gfs2_consist_rgrpd(rgd);
2531 	rgd->rd_dinodes--;
2532 	rgd->rd_free++;
2533 
2534 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2535 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2536 	be32_add_cpu(&rgd->rd_rgl->rl_unlinked, -1);
2537 
2538 	gfs2_statfs_change(sdp, 0, +1, -1);
2539 	trace_gfs2_block_alloc(ip, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
2540 	gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid);
2541 	gfs2_meta_wipe(ip, ip->i_no_addr, 1);
2542 }
2543 
2544 /**
2545  * gfs2_check_blk_type - Check the type of a block
2546  * @sdp: The superblock
2547  * @no_addr: The block number to check
2548  * @type: The block type we are looking for
2549  *
2550  * Returns: 0 if the block type matches the expected type
2551  *          -ESTALE if it doesn't match
2552  *          or -ve errno if something went wrong while checking
2553  */
2554 
2555 int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
2556 {
2557 	struct gfs2_rgrpd *rgd;
2558 	struct gfs2_holder rgd_gh;
2559 	struct gfs2_rbm rbm;
2560 	int error = -EINVAL;
2561 
2562 	rgd = gfs2_blk2rgrpd(sdp, no_addr, 1);
2563 	if (!rgd)
2564 		goto fail;
2565 
2566 	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
2567 	if (error)
2568 		goto fail;
2569 
2570 	rbm.rgd = rgd;
2571 	error = gfs2_rbm_from_block(&rbm, no_addr);
2572 	if (WARN_ON_ONCE(error))
2573 		goto fail;
2574 
2575 	if (gfs2_testbit(&rbm, false) != type)
2576 		error = -ESTALE;
2577 
2578 	gfs2_glock_dq_uninit(&rgd_gh);
2579 fail:
2580 	return error;
2581 }
2582 
2583 /**
2584  * gfs2_rlist_add - add a RG to a list of RGs
2585  * @ip: the inode
2586  * @rlist: the list of resource groups
2587  * @block: the block
2588  *
2589  * Figure out what RG a block belongs to and add that RG to the list
2590  *
2591  * FIXME: Don't use NOFAIL
2592  *
2593  */
2594 
2595 void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
2596 		    u64 block)
2597 {
2598 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2599 	struct gfs2_rgrpd *rgd;
2600 	struct gfs2_rgrpd **tmp;
2601 	unsigned int new_space;
2602 	unsigned int x;
2603 
2604 	if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
2605 		return;
2606 
2607 	/*
2608 	 * The resource group last accessed is kept in the last position.
2609 	 */
2610 
2611 	if (rlist->rl_rgrps) {
2612 		rgd = rlist->rl_rgd[rlist->rl_rgrps - 1];
2613 		if (rgrp_contains_block(rgd, block))
2614 			return;
2615 		rgd = gfs2_blk2rgrpd(sdp, block, 1);
2616 	} else {
2617 		rgd = ip->i_res.rs_rbm.rgd;
2618 		if (!rgd || !rgrp_contains_block(rgd, block))
2619 			rgd = gfs2_blk2rgrpd(sdp, block, 1);
2620 	}
2621 
2622 	if (!rgd) {
2623 		fs_err(sdp, "rlist_add: no rgrp for block %llu\n",
2624 		       (unsigned long long)block);
2625 		return;
2626 	}
2627 
2628 	for (x = 0; x < rlist->rl_rgrps; x++) {
2629 		if (rlist->rl_rgd[x] == rgd) {
2630 			swap(rlist->rl_rgd[x],
2631 			     rlist->rl_rgd[rlist->rl_rgrps - 1]);
2632 			return;
2633 		}
2634 	}
2635 
2636 	if (rlist->rl_rgrps == rlist->rl_space) {
2637 		new_space = rlist->rl_space + 10;
2638 
2639 		tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
2640 			      GFP_NOFS | __GFP_NOFAIL);
2641 
2642 		if (rlist->rl_rgd) {
2643 			memcpy(tmp, rlist->rl_rgd,
2644 			       rlist->rl_space * sizeof(struct gfs2_rgrpd *));
2645 			kfree(rlist->rl_rgd);
2646 		}
2647 
2648 		rlist->rl_space = new_space;
2649 		rlist->rl_rgd = tmp;
2650 	}
2651 
2652 	rlist->rl_rgd[rlist->rl_rgrps++] = rgd;
2653 }
2654 
2655 /**
2656  * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate
2657  *      and initialize an array of glock holders for them
2658  * @rlist: the list of resource groups
2659  *
2660  * FIXME: Don't use NOFAIL
2661  *
2662  */
2663 
2664 void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist)
2665 {
2666 	unsigned int x;
2667 
2668 	rlist->rl_ghs = kmalloc_array(rlist->rl_rgrps,
2669 				      sizeof(struct gfs2_holder),
2670 				      GFP_NOFS | __GFP_NOFAIL);
2671 	for (x = 0; x < rlist->rl_rgrps; x++)
2672 		gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
2673 				LM_ST_EXCLUSIVE, 0,
2674 				&rlist->rl_ghs[x]);
2675 }
2676 
2677 /**
2678  * gfs2_rlist_free - free a resource group list
2679  * @rlist: the list of resource groups
2680  *
2681  */
2682 
2683 void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
2684 {
2685 	unsigned int x;
2686 
2687 	kfree(rlist->rl_rgd);
2688 
2689 	if (rlist->rl_ghs) {
2690 		for (x = 0; x < rlist->rl_rgrps; x++)
2691 			gfs2_holder_uninit(&rlist->rl_ghs[x]);
2692 		kfree(rlist->rl_ghs);
2693 		rlist->rl_ghs = NULL;
2694 	}
2695 }
2696 
2697