xref: /openbmc/linux/fs/gfs2/rgrp.c (revision 015d239a)
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
 * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
#include <linux/fs.h>
#include <linux/gfs2_ondisk.h>
#include <linux/prefetch.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/random.h>

#include "gfs2.h"
#include "incore.h"
#include "glock.h"
#include "glops.h"
#include "lops.h"
#include "meta_io.h"
#include "quota.h"
#include "rgrp.h"
#include "super.h"
#include "trans.h"
#include "util.h"
#include "log.h"
#include "inode.h"
#include "trace_gfs2.h"
#include "dir.h"

#define BFITNOENT ((u32)~0)
#define NO_BLOCK ((u64)~0)

/*
 * These routines are used by the resource group routines (rgrp.c)
 * to keep track of block allocation.  Each block is represented by two
 * bits.  So, each byte represents GFS2_NBBY (i.e. 4) blocks.
 *
 * 0 = Free
 * 1 = Used (not metadata)
 * 2 = Unlinked (still in use) inode
 * 3 = Used (metadata)
 */
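
/*
 * For example, the bitmap byte 0xE4 (binary 11 10 01 00) describes four
 * blocks, reading bit pairs from least to most significant:
 *   pair 0 = 00 -> GFS2_BLKST_FREE
 *   pair 1 = 01 -> GFS2_BLKST_USED
 *   pair 2 = 10 -> GFS2_BLKST_UNLINKED
 *   pair 3 = 11 -> GFS2_BLKST_DINODE
 */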

struct gfs2_extent {
	struct gfs2_rbm rbm;
	u32 len;
};

static const char valid_change[16] = {
	        /* current */
	/* n */ 0, 1, 1, 1,
	/* e */ 1, 0, 0, 0,
	/* w */ 0, 0, 0, 1,
	        1, 0, 0, 0
};
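
/*
 * valid_change is indexed as valid_change[new_state * 4 + cur_state] in
 * gfs2_setbit() below. For example, freeing a used block (new_state =
 * GFS2_BLKST_FREE = 0, cur_state = GFS2_BLKST_USED = 1) looks up entry
 * 0 * 4 + 1 = 1, which is 1 (allowed), whereas "freeing" an already
 * free block looks up entry 0, which is 0 (flagged as an inconsistency).
 */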

static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
			 const struct gfs2_inode *ip, bool nowrap);


/**
 * gfs2_setbit - Set a bit in the bitmaps
 * @rbm: The position of the bit to set
 * @do_clone: Also set the clone bitmap, if it exists
 * @new_state: the new state of the block
 *
 */

static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
			       unsigned char new_state)
{
	unsigned char *byte1, *byte2, *end, cur_state;
	struct gfs2_bitmap *bi = rbm_bi(rbm);
	unsigned int buflen = bi->bi_bytes;
	const unsigned int bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;

	byte1 = bi->bi_bh->b_data + bi->bi_offset + (rbm->offset / GFS2_NBBY);
	end = bi->bi_bh->b_data + bi->bi_offset + buflen;

	BUG_ON(byte1 >= end);

	cur_state = (*byte1 >> bit) & GFS2_BIT_MASK;

	if (unlikely(!valid_change[new_state * 4 + cur_state])) {
		struct gfs2_sbd *sdp = rbm->rgd->rd_sbd;

		fs_warn(sdp, "buf_blk = 0x%x old_state=%d, new_state=%d\n",
			rbm->offset, cur_state, new_state);
		fs_warn(sdp, "rgrp=0x%llx bi_start=0x%x biblk: 0x%llx\n",
			(unsigned long long)rbm->rgd->rd_addr, bi->bi_start,
			(unsigned long long)bi->bi_bh->b_blocknr);
		fs_warn(sdp, "bi_offset=0x%x bi_bytes=0x%x block=0x%llx\n",
			bi->bi_offset, bi->bi_bytes,
			(unsigned long long)gfs2_rbm_to_block(rbm));
		dump_stack();
		gfs2_consist_rgrpd(rbm->rgd);
		return;
	}
	*byte1 ^= (cur_state ^ new_state) << bit;

	if (do_clone && bi->bi_clone) {
		byte2 = bi->bi_clone + bi->bi_offset + (rbm->offset / GFS2_NBBY);
		cur_state = (*byte2 >> bit) & GFS2_BIT_MASK;
		*byte2 ^= (cur_state ^ new_state) << bit;
	}
}

/**
 * gfs2_testbit - test a bit in the bitmaps
 * @rbm: The bit to test
 * @use_clone: If true, test the clone bitmap, not the official bitmap.
 *
 * Some callers like gfs2_unaligned_extlen need to test the clone bitmaps,
 * not the "real" bitmaps, to avoid allocating recently freed blocks.
 *
 * Returns: The two bit block state of the requested bit
 */

static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm, bool use_clone)
{
	struct gfs2_bitmap *bi = rbm_bi(rbm);
	const u8 *buffer;
	const u8 *byte;
	unsigned int bit;

	if (use_clone && bi->bi_clone)
		buffer = bi->bi_clone;
	else
		buffer = bi->bi_bh->b_data;
	buffer += bi->bi_offset;
	byte = buffer + (rbm->offset / GFS2_NBBY);
	bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;

	return (*byte >> bit) & GFS2_BIT_MASK;
}
/**
 * gfs2_bit_search
 * @ptr: Pointer to bitmap data
 * @mask: Mask to use (normally 0x55555.... but adjusted for search start)
 * @state: The state we are searching for
 *
 * We xor the bitmap data with a pattern which is the bitwise opposite
 * of what we are looking for; this gives rise to a pattern of ones
 * wherever there is a match. Since we have two bits per entry, we
 * take this pattern, shift it down by one place and then AND it with
 * the original. All the even bit positions (0,2,4, etc) then represent
 * successful matches, so we mask with 0x55555..... to remove the unwanted
 * odd bit positions.
 *
 * This allows searching of a whole u64 at once (32 blocks) with a
 * single test (on 64 bit arches).
 */

static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
{
	u64 tmp;
	static const u64 search[] = {
		[0] = 0xffffffffffffffffULL,
		[1] = 0xaaaaaaaaaaaaaaaaULL,
		[2] = 0x5555555555555555ULL,
		[3] = 0x0000000000000000ULL,
	};
	tmp = le64_to_cpu(*ptr) ^ search[state];
	tmp &= (tmp >> 1);
	tmp &= mask;
	return tmp;
}
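
/*
 * Worked example, considering just the low nibble of one u64: searching
 * for GFS2_BLKST_USED (01) in the data 0110 (entry 0 unlinked, entry 1
 * used), with search[1] = ...1010:
 *   tmp = 0110 ^ 1010       = 1100
 *   tmp &= tmp >> 1         = 0100
 *   tmp &= 0101 (the mask)  = 0100
 * The surviving bit at even position 2 marks entry 1 (bit / 2) as the
 * match, which is how gfs2_bitfit() below recovers the block number.
 */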

/**
 * rs_cmp - multi-block reservation range compare
 * @blk: absolute file system block number of the new reservation
 * @len: number of blocks in the new reservation
 * @rs: existing reservation to compare against
 *
 * returns: 1 if the block range is beyond the reach of the reservation
 *         -1 if the block range is before the start of the reservation
 *          0 if the block range overlaps with the reservation
 */
static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs)
{
	u64 startblk = gfs2_rbm_to_block(&rs->rs_rbm);

	if (blk >= startblk + rs->rs_free)
		return 1;
	if (blk + len - 1 < startblk)
		return -1;
	return 0;
}
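
/*
 * For example, given a reservation starting at block 1000 with
 * rs_free = 10 (covering blocks 1000-1009): rs_cmp(1010, 5, rs)
 * returns 1 (entirely after it), rs_cmp(990, 10, rs) returns -1
 * (it ends at block 999, before the start), and rs_cmp(995, 10, rs)
 * returns 0 (blocks 1000-1004 overlap).
 */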

/**
 * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
 *       a block in a given allocation state.
 * @buf: the buffer that holds the bitmaps
 * @len: the length (in bytes) of the buffer
 * @goal: start search at this block's bit-pair (within @buf)
 * @state: GFS2_BLKST_XXX the state of the block we're looking for.
 *
 * Scope of @goal and returned block number is only within this bitmap buffer,
 * not entire rgrp or filesystem.  @buf will be offset from the actual
 * beginning of a bitmap block buffer, skipping any header structures, but
 * headers are always a multiple of 64 bits long so that the buffer is
 * always aligned to a 64 bit boundary.
 *
 * The size of the buffer is in bytes, but it is assumed that it is
 * always ok to read a complete multiple of 64 bits at the end
 * of the block in case the end is not aligned to a natural boundary.
 *
 * Return: the block number (bitmap buffer scope) that was found, or
 * BFITNOENT if no block in the requested state was found
 */

static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
		       u32 goal, u8 state)
{
	u32 spoint = (goal << 1) & ((8*sizeof(u64)) - 1);
	const __le64 *ptr = ((__le64 *)buf) + (goal >> 5);
	const __le64 *end = (__le64 *)(buf + ALIGN(len, sizeof(u64)));
	u64 tmp;
	u64 mask = 0x5555555555555555ULL;
	u32 bit;

	/* Mask off bits we don't care about at the start of the search */
	mask <<= spoint;
	tmp = gfs2_bit_search(ptr, mask, state);
	ptr++;
	while(tmp == 0 && ptr < end) {
		tmp = gfs2_bit_search(ptr, 0x5555555555555555ULL, state);
		ptr++;
	}
	/* Mask off any bits which are more than len bytes from the start */
	if (ptr == end && (len & (sizeof(u64) - 1)))
		tmp &= (((u64)~0) >> (64 - 8*(len & (sizeof(u64) - 1))));
	/* Didn't find anything, so return */
	if (tmp == 0)
		return BFITNOENT;
	ptr--;
	bit = __ffs64(tmp);
	bit /= 2;	/* two bits per entry in the bitmap */
	return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit;
}
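
/*
 * To illustrate the arithmetic above for goal = 100: spoint =
 * (100 << 1) & 63 = 8, so the initial mask clears the four bit pairs
 * below the goal within its word (blocks 96-99), and ptr =
 * buf + (100 >> 5) * 8 points at the u64 covering blocks 96-127.
 */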

/**
 * gfs2_rbm_from_block - Set the rbm based upon rgd and block number
 * @rbm: The rbm with rgd already set correctly
 * @block: The block number (filesystem relative)
 *
 * This sets the bi and offset members of an rbm based on a
 * resource group and a filesystem relative block number. The
 * resource group must be set in the rbm on entry, the bi and
 * offset members will be set by this function.
 *
 * Returns: 0 on success, or an error code
 */

static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
{
	if (!rgrp_contains_block(rbm->rgd, block))
		return -E2BIG;
	rbm->bii = 0;
	rbm->offset = block - rbm->rgd->rd_data0;
	/* Check if the block is within the first bitmap block */
	if (rbm->offset < rbm_bi(rbm)->bi_blocks)
		return 0;

	/* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */
	rbm->offset += (sizeof(struct gfs2_rgrp) -
			sizeof(struct gfs2_meta_header)) * GFS2_NBBY;
	rbm->bii = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
	rbm->offset -= rbm->bii * rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
	return 0;
}
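
/*
 * The adjustment above accounts for the first bitmap holding fewer
 * blocks than the rest: its header is the larger struct gfs2_rgrp
 * rather than struct gfs2_meta_header. Inflating the offset by the
 * size difference (converted to blocks via GFS2_NBBY) lets the
 * divide/modulo by sd_blocks_per_bitmap treat every bitmap as if it
 * were full sized.
 */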

/**
 * gfs2_rbm_incr - increment an rbm structure
 * @rbm: The rbm with rgd already set correctly
 *
 * This function takes an existing rbm structure and increments it to the next
 * viable block offset.
 *
 * Returns: If incrementing the offset would cause the rbm to go past the
 *          end of the rgrp, true is returned, otherwise false.
 *
 */

static bool gfs2_rbm_incr(struct gfs2_rbm *rbm)
{
	if (rbm->offset + 1 < rbm_bi(rbm)->bi_blocks) { /* in the same bitmap */
		rbm->offset++;
		return false;
	}
	if (rbm->bii == rbm->rgd->rd_length - 1) /* at the last bitmap */
		return true;

	rbm->offset = 0;
	rbm->bii++;
	return false;
}

/**
 * gfs2_unaligned_extlen - Look for free blocks which are not byte aligned
 * @rbm: Position to search (value/result)
 * @n_unaligned: Number of unaligned blocks to check
 * @len: Decremented for each block found (terminate on zero)
 *
 * Returns: true if a non-free block is encountered
 */

static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *len)
{
	u32 n;
	u8 res;

	for (n = 0; n < n_unaligned; n++) {
		res = gfs2_testbit(rbm, true);
		if (res != GFS2_BLKST_FREE)
			return true;
		(*len)--;
		if (*len == 0)
			return true;
		if (gfs2_rbm_incr(rbm))
			return true;
	}

	return false;
}

/**
 * gfs2_free_extlen - Return extent length of free blocks
 * @rrbm: Starting position
 * @len: Max length to check
 *
 * Starting at the block specified by the rbm, see how many free blocks
 * there are, not reading more than len blocks ahead. This can be done
 * using memchr_inv when the blocks are byte aligned, but has to be done
 * on a block by block basis in case of unaligned blocks. Also this
 * function can cope with bitmap boundaries (although it must stop on
 * a resource group boundary)
 *
 * Returns: Number of free blocks in the extent
 */

static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
{
	struct gfs2_rbm rbm = *rrbm;
	u32 n_unaligned = rbm.offset & 3;
	u32 size = len;
	u32 bytes;
	u32 chunk_size;
	u8 *ptr, *start, *end;
	u64 block;
	struct gfs2_bitmap *bi;

	if (n_unaligned &&
	    gfs2_unaligned_extlen(&rbm, 4 - n_unaligned, &len))
		goto out;

	n_unaligned = len & 3;
	/* Start is now byte aligned */
	while (len > 3) {
		bi = rbm_bi(&rbm);
		start = bi->bi_bh->b_data;
		if (bi->bi_clone)
			start = bi->bi_clone;
		start += bi->bi_offset;
		end = start + bi->bi_bytes;
		BUG_ON(rbm.offset & 3);
		start += (rbm.offset / GFS2_NBBY);
		bytes = min_t(u32, len / GFS2_NBBY, (end - start));
		ptr = memchr_inv(start, 0, bytes);
		chunk_size = ((ptr == NULL) ? bytes : (ptr - start));
		chunk_size *= GFS2_NBBY;
		BUG_ON(len < chunk_size);
		len -= chunk_size;
		block = gfs2_rbm_to_block(&rbm);
		if (gfs2_rbm_from_block(&rbm, block + chunk_size)) {
			n_unaligned = 0;
			break;
		}
		if (ptr) {
			n_unaligned = 3;
			break;
		}
		n_unaligned = len & 3;
	}

	/* Deal with any bits left over at the end */
	if (n_unaligned)
		gfs2_unaligned_extlen(&rbm, n_unaligned, &len);
out:
	return size - len;
}
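
/*
 * Note on the fast path above: memchr_inv(start, 0, bytes) returns
 * the first byte that is not 0x00, i.e. the first byte containing a
 * non-free block state. A NULL return means all 'bytes' bytes (and
 * hence bytes * GFS2_NBBY blocks) are free; otherwise the extent ends
 * within the returned byte and any remaining blocks are re-examined
 * one by one through gfs2_unaligned_extlen().
 */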

/**
 * gfs2_bitcount - count the number of bits in a certain state
 * @rgd: the resource group descriptor
 * @buffer: the buffer that holds the bitmaps
 * @buflen: the length (in bytes) of the buffer
 * @state: the state of the block we're looking for
 *
 * Returns: The number of bits
 */

static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, const u8 *buffer,
			 unsigned int buflen, u8 state)
{
	const u8 *byte = buffer;
	const u8 *end = buffer + buflen;
	const u8 state1 = state << 2;
	const u8 state2 = state << 4;
	const u8 state3 = state << 6;
	u32 count = 0;

	for (; byte < end; byte++) {
		if (((*byte) & 0x03) == state)
			count++;
		if (((*byte) & 0x0C) == state1)
			count++;
		if (((*byte) & 0x30) == state2)
			count++;
		if (((*byte) & 0xC0) == state3)
			count++;
	}

	return count;
}
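
/*
 * For example, with state = GFS2_BLKST_USED (1), the four comparisons
 * test the bit pairs against 0x01, 0x04, 0x10 and 0x40. A byte of
 * 0x45 (binary 01 00 01 01) therefore counts three used blocks
 * (pairs 0, 1 and 3) and skips the free pair 2.
 */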

/**
 * gfs2_rgrp_verify - Verify that a resource group is consistent
 * @rgd: the rgrp
 *
 */

void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
{
	struct gfs2_sbd *sdp = rgd->rd_sbd;
	struct gfs2_bitmap *bi = NULL;
	u32 length = rgd->rd_length;
	u32 count[4], tmp;
	int buf, x;

	memset(count, 0, 4 * sizeof(u32));

	/* Count # blocks in each of 4 possible allocation states */
	for (buf = 0; buf < length; buf++) {
		bi = rgd->rd_bits + buf;
		for (x = 0; x < 4; x++)
			count[x] += gfs2_bitcount(rgd,
						  bi->bi_bh->b_data +
						  bi->bi_offset,
						  bi->bi_bytes, x);
	}

	if (count[0] != rgd->rd_free) {
		if (gfs2_consist_rgrpd(rgd))
			fs_err(sdp, "free data mismatch:  %u != %u\n",
			       count[0], rgd->rd_free);
		return;
	}

	tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
	if (count[1] != tmp) {
		if (gfs2_consist_rgrpd(rgd))
			fs_err(sdp, "used data mismatch:  %u != %u\n",
			       count[1], tmp);
		return;
	}

	if (count[2] + count[3] != rgd->rd_dinodes) {
		if (gfs2_consist_rgrpd(rgd))
			fs_err(sdp, "used metadata mismatch:  %u != %u\n",
			       count[2] + count[3], rgd->rd_dinodes);
		return;
	}
}

/**
 * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
 * @sdp: The GFS2 superblock
 * @blk: The data block number
 * @exact: True if this needs to be an exact match
 *
 * The @exact argument should be set to true by most callers. The exception
 * is when we need to match blocks which are not represented by the rgrp
 * bitmap, but which are part of the rgrp (i.e. padding blocks) which are
 * there for alignment purposes. Another way of looking at it is that @exact
 * matches only valid data/metadata blocks, but with @exact false, it will
 * match any block within the extent of the rgrp.
 *
 * Returns: The resource group, or NULL if not found
 */

struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact)
{
	struct rb_node *n, *next;
	struct gfs2_rgrpd *cur;

	spin_lock(&sdp->sd_rindex_spin);
	n = sdp->sd_rindex_tree.rb_node;
	while (n) {
		cur = rb_entry(n, struct gfs2_rgrpd, rd_node);
		next = NULL;
		if (blk < cur->rd_addr)
			next = n->rb_left;
		else if (blk >= cur->rd_data0 + cur->rd_data)
			next = n->rb_right;
		if (next == NULL) {
			spin_unlock(&sdp->sd_rindex_spin);
			if (exact) {
				if (blk < cur->rd_addr)
					return NULL;
				if (blk >= cur->rd_data0 + cur->rd_data)
					return NULL;
			}
			return cur;
		}
		n = next;
	}
	spin_unlock(&sdp->sd_rindex_spin);

	return NULL;
}

/**
 * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem
 * @sdp: The GFS2 superblock
 *
 * Returns: The first rgrp in the filesystem
 */

struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
{
	const struct rb_node *n;
	struct gfs2_rgrpd *rgd;

	spin_lock(&sdp->sd_rindex_spin);
	n = rb_first(&sdp->sd_rindex_tree);
	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
	spin_unlock(&sdp->sd_rindex_spin);

	return rgd;
}

/**
 * gfs2_rgrpd_get_next - get the next RG
 * @rgd: the resource group descriptor
 *
 * Returns: The next rgrp
 */

struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
{
	struct gfs2_sbd *sdp = rgd->rd_sbd;
	const struct rb_node *n;

	spin_lock(&sdp->sd_rindex_spin);
	n = rb_next(&rgd->rd_node);
	if (n == NULL)
		n = rb_first(&sdp->sd_rindex_tree);

	if (unlikely(&rgd->rd_node == n)) {
		spin_unlock(&sdp->sd_rindex_spin);
		return NULL;
	}
	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
	spin_unlock(&sdp->sd_rindex_spin);
	return rgd;
}

void check_and_update_goal(struct gfs2_inode *ip)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	if (!ip->i_goal || gfs2_blk2rgrpd(sdp, ip->i_goal, 1) == NULL)
		ip->i_goal = ip->i_no_addr;
}

void gfs2_free_clones(struct gfs2_rgrpd *rgd)
{
	int x;

	for (x = 0; x < rgd->rd_length; x++) {
		struct gfs2_bitmap *bi = rgd->rd_bits + x;
		kfree(bi->bi_clone);
		bi->bi_clone = NULL;
	}
}

/**
 * gfs2_rsqa_alloc - make sure we have a reservation assigned to the inode
 *                 plus a quota allocation data structure, if necessary
 * @ip: the inode for this reservation
 */
int gfs2_rsqa_alloc(struct gfs2_inode *ip)
{
	return gfs2_qa_alloc(ip);
}

static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs,
		    const char *fs_id_buf)
{
	struct gfs2_inode *ip = container_of(rs, struct gfs2_inode, i_res);

	gfs2_print_dbg(seq, "%s  B: n:%llu s:%llu b:%u f:%u\n", fs_id_buf,
		       (unsigned long long)ip->i_no_addr,
		       (unsigned long long)gfs2_rbm_to_block(&rs->rs_rbm),
		       rs->rs_rbm.offset, rs->rs_free);
}

/**
 * __rs_deltree - remove a multi-block reservation from the rgd tree
 * @rs: The reservation to remove
 *
 */
static void __rs_deltree(struct gfs2_blkreserv *rs)
{
	struct gfs2_rgrpd *rgd;

	if (!gfs2_rs_active(rs))
		return;

	rgd = rs->rs_rbm.rgd;
	trace_gfs2_rs(rs, TRACE_RS_TREEDEL);
	rb_erase(&rs->rs_node, &rgd->rd_rstree);
	RB_CLEAR_NODE(&rs->rs_node);

	if (rs->rs_free) {
		u64 last_block = gfs2_rbm_to_block(&rs->rs_rbm) +
				 rs->rs_free - 1;
		struct gfs2_rbm last_rbm = { .rgd = rs->rs_rbm.rgd, };
		struct gfs2_bitmap *start, *last;

		/* return reserved blocks to the rgrp */
		BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free);
		rs->rs_rbm.rgd->rd_reserved -= rs->rs_free;
		/* The rgrp extent failure point is likely not to increase;
		   it will only do so if the freed blocks are somehow
		   contiguous with a span of free blocks that follows. Still,
		   it will force the number to be recalculated later. */
		rgd->rd_extfail_pt += rs->rs_free;
		rs->rs_free = 0;
		if (gfs2_rbm_from_block(&last_rbm, last_block))
			return;
		start = rbm_bi(&rs->rs_rbm);
		last = rbm_bi(&last_rbm);
		do
			clear_bit(GBF_FULL, &start->bi_flags);
		while (start++ != last);
	}
}

/**
 * gfs2_rs_deltree - remove a multi-block reservation from the rgd tree
 * @rs: The reservation to remove
 *
 */
void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
{
	struct gfs2_rgrpd *rgd;

	rgd = rs->rs_rbm.rgd;
	if (rgd) {
		spin_lock(&rgd->rd_rsspin);
		__rs_deltree(rs);
		BUG_ON(rs->rs_free);
		spin_unlock(&rgd->rd_rsspin);
	}
}

/**
 * gfs2_rsqa_delete - delete a multi-block reservation and quota allocation
 * @ip: The inode for this reservation
 * @wcount: The inode's write count, or NULL
 *
 */
void gfs2_rsqa_delete(struct gfs2_inode *ip, atomic_t *wcount)
{
	down_write(&ip->i_rw_mutex);
	if ((wcount == NULL) || (atomic_read(wcount) <= 1))
		gfs2_rs_deltree(&ip->i_res);
	up_write(&ip->i_rw_mutex);
	gfs2_qa_delete(ip, wcount);
}

/**
 * return_all_reservations - return all reserved blocks back to the rgrp.
 * @rgd: the rgrp that needs its space back
 *
 * We previously reserved a bunch of blocks for allocation. Now we need to
 * give them back. This leaves the reservation structures intact, but removes
 * all of their corresponding "no-fly zones".
 */
static void return_all_reservations(struct gfs2_rgrpd *rgd)
{
	struct rb_node *n;
	struct gfs2_blkreserv *rs;

	spin_lock(&rgd->rd_rsspin);
	while ((n = rb_first(&rgd->rd_rstree))) {
		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
		__rs_deltree(rs);
	}
	spin_unlock(&rgd->rd_rsspin);
}

void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
{
	struct rb_node *n;
	struct gfs2_rgrpd *rgd;
	struct gfs2_glock *gl;

	while ((n = rb_first(&sdp->sd_rindex_tree))) {
		rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
		gl = rgd->rd_gl;

		rb_erase(n, &sdp->sd_rindex_tree);

		if (gl) {
			glock_clear_object(gl, rgd);
			gfs2_rgrp_brelse(rgd);
			gfs2_glock_put(gl);
		}

		gfs2_free_clones(rgd);
		kfree(rgd->rd_bits);
		rgd->rd_bits = NULL;
		return_all_reservations(rgd);
		kmem_cache_free(gfs2_rgrpd_cachep, rgd);
	}
}

static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd)
{
	struct gfs2_sbd *sdp = rgd->rd_sbd;

	fs_info(sdp, "ri_addr = %llu\n", (unsigned long long)rgd->rd_addr);
	fs_info(sdp, "ri_length = %u\n", rgd->rd_length);
	fs_info(sdp, "ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0);
	fs_info(sdp, "ri_data = %u\n", rgd->rd_data);
	fs_info(sdp, "ri_bitbytes = %u\n", rgd->rd_bitbytes);
}
/**
 * compute_bitstructs - Compute the bitmap sizes
 * @rgd: The resource group descriptor
 *
 * Calculates bitmap descriptors, one for each block that contains bitmap data
 *
 * Returns: errno
 */

static int compute_bitstructs(struct gfs2_rgrpd *rgd)
{
	struct gfs2_sbd *sdp = rgd->rd_sbd;
	struct gfs2_bitmap *bi;
	u32 length = rgd->rd_length; /* # blocks in hdr & bitmap */
	u32 bytes_left, bytes;
	int x;

	if (!length)
		return -EINVAL;

	rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_NOFS);
	if (!rgd->rd_bits)
		return -ENOMEM;

	bytes_left = rgd->rd_bitbytes;

	for (x = 0; x < length; x++) {
		bi = rgd->rd_bits + x;

		bi->bi_flags = 0;
		/* small rgrp; bitmap stored completely in header block */
		if (length == 1) {
			bytes = bytes_left;
			bi->bi_offset = sizeof(struct gfs2_rgrp);
			bi->bi_start = 0;
			bi->bi_bytes = bytes;
			bi->bi_blocks = bytes * GFS2_NBBY;
		/* header block */
		} else if (x == 0) {
			bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
			bi->bi_offset = sizeof(struct gfs2_rgrp);
			bi->bi_start = 0;
			bi->bi_bytes = bytes;
			bi->bi_blocks = bytes * GFS2_NBBY;
		/* last block */
		} else if (x + 1 == length) {
			bytes = bytes_left;
			bi->bi_offset = sizeof(struct gfs2_meta_header);
			bi->bi_start = rgd->rd_bitbytes - bytes_left;
			bi->bi_bytes = bytes;
			bi->bi_blocks = bytes * GFS2_NBBY;
		/* other blocks */
		} else {
			bytes = sdp->sd_sb.sb_bsize -
				sizeof(struct gfs2_meta_header);
			bi->bi_offset = sizeof(struct gfs2_meta_header);
			bi->bi_start = rgd->rd_bitbytes - bytes_left;
			bi->bi_bytes = bytes;
			bi->bi_blocks = bytes * GFS2_NBBY;
		}

		bytes_left -= bytes;
	}

	if (bytes_left) {
		gfs2_consist_rgrpd(rgd);
		return -EIO;
	}
	bi = rgd->rd_bits + (length - 1);
	if ((bi->bi_start + bi->bi_bytes) * GFS2_NBBY != rgd->rd_data) {
		if (gfs2_consist_rgrpd(rgd)) {
			gfs2_rindex_print(rgd);
			fs_err(sdp, "start=%u len=%u offset=%u\n",
			       bi->bi_start, bi->bi_bytes, bi->bi_offset);
		}
		return -EIO;
	}

	return 0;
}

/**
 * gfs2_ri_total - Total up the file system space, according to the rindex.
 * @sdp: the filesystem
 *
 */
u64 gfs2_ri_total(struct gfs2_sbd *sdp)
{
	u64 total_data = 0;
	struct inode *inode = sdp->sd_rindex;
	struct gfs2_inode *ip = GFS2_I(inode);
	char buf[sizeof(struct gfs2_rindex)];
	int error, rgrps;

	for (rgrps = 0;; rgrps++) {
		loff_t pos = rgrps * sizeof(struct gfs2_rindex);

		if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode))
			break;
		error = gfs2_internal_read(ip, buf, &pos,
					   sizeof(struct gfs2_rindex));
		if (error != sizeof(struct gfs2_rindex))
			break;
		total_data += be32_to_cpu(((struct gfs2_rindex *)buf)->ri_data);
	}
	return total_data;
}

static int rgd_insert(struct gfs2_rgrpd *rgd)
{
	struct gfs2_sbd *sdp = rgd->rd_sbd;
	struct rb_node **newn = &sdp->sd_rindex_tree.rb_node, *parent = NULL;

	/* Figure out where to put new node */
	while (*newn) {
		struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd,
						  rd_node);

		parent = *newn;
		if (rgd->rd_addr < cur->rd_addr)
			newn = &((*newn)->rb_left);
		else if (rgd->rd_addr > cur->rd_addr)
			newn = &((*newn)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&rgd->rd_node, parent, newn);
	rb_insert_color(&rgd->rd_node, &sdp->sd_rindex_tree);
	sdp->sd_rgrps++;
	return 0;
}

/**
 * read_rindex_entry - Pull in a new resource index entry from the disk
 * @ip: Pointer to the rindex inode
 *
 * Returns: 0 on success, > 0 on EOF, error code otherwise
 */

static int read_rindex_entry(struct gfs2_inode *ip)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	const unsigned bsize = sdp->sd_sb.sb_bsize;
	loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
	struct gfs2_rindex buf;
	int error;
	struct gfs2_rgrpd *rgd;

	if (pos >= i_size_read(&ip->i_inode))
		return 1;

	error = gfs2_internal_read(ip, (char *)&buf, &pos,
				   sizeof(struct gfs2_rindex));

	if (error != sizeof(struct gfs2_rindex))
		return (error == 0) ? 1 : error;

	rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS);
	error = -ENOMEM;
	if (!rgd)
		return error;

	rgd->rd_sbd = sdp;
	rgd->rd_addr = be64_to_cpu(buf.ri_addr);
	rgd->rd_length = be32_to_cpu(buf.ri_length);
	rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
	rgd->rd_data = be32_to_cpu(buf.ri_data);
	rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
	spin_lock_init(&rgd->rd_rsspin);

	error = compute_bitstructs(rgd);
	if (error)
		goto fail;

	error = gfs2_glock_get(sdp, rgd->rd_addr,
			       &gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
	if (error)
		goto fail;

	rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
	rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED);
	if (rgd->rd_data > sdp->sd_max_rg_data)
		sdp->sd_max_rg_data = rgd->rd_data;
	spin_lock(&sdp->sd_rindex_spin);
	error = rgd_insert(rgd);
	spin_unlock(&sdp->sd_rindex_spin);
	if (!error) {
		glock_set_object(rgd->rd_gl, rgd);
		rgd->rd_gl->gl_vm.start = (rgd->rd_addr * bsize) & PAGE_MASK;
		rgd->rd_gl->gl_vm.end = PAGE_ALIGN((rgd->rd_addr +
						    rgd->rd_length) * bsize) - 1;
		return 0;
	}

	error = 0; /* someone else read in the rgrp; free it and ignore it */
	gfs2_glock_put(rgd->rd_gl);

fail:
	kfree(rgd->rd_bits);
	rgd->rd_bits = NULL;
	kmem_cache_free(gfs2_rgrpd_cachep, rgd);
	return error;
}

/**
 * set_rgrp_preferences - Run all the rgrps, selecting some we prefer to use
 * @sdp: the GFS2 superblock
 *
 * The purpose of this function is to select a subset of the resource groups
 * and mark them as PREFERRED. We do it in such a way that each node prefers
 * to use a unique set of rgrps to minimize glock contention.
 */
static void set_rgrp_preferences(struct gfs2_sbd *sdp)
{
	struct gfs2_rgrpd *rgd, *first;
	int i;

	/* Skip an initial number of rgrps, based on this node's journal ID.
	   That should start each node out on its own set. */
	rgd = gfs2_rgrpd_get_first(sdp);
	for (i = 0; i < sdp->sd_lockstruct.ls_jid; i++)
		rgd = gfs2_rgrpd_get_next(rgd);
	first = rgd;

	do {
		rgd->rd_flags |= GFS2_RDF_PREFERRED;
		for (i = 0; i < sdp->sd_journals; i++) {
			rgd = gfs2_rgrpd_get_next(rgd);
			if (!rgd || rgd == first)
				break;
		}
	} while (rgd && rgd != first);
}

/**
 * gfs2_ri_update - Pull in a new resource index from the disk
 * @ip: pointer to the rindex inode
 *
 * Returns: 0 on successful update, error code otherwise
 */

static int gfs2_ri_update(struct gfs2_inode *ip)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	int error;

	do {
		error = read_rindex_entry(ip);
	} while (error == 0);

	if (error < 0)
		return error;

	set_rgrp_preferences(sdp);

	sdp->sd_rindex_uptodate = 1;
	return 0;
}

/**
 * gfs2_rindex_update - Update the rindex if required
 * @sdp: The GFS2 superblock
 *
 * We grab a lock on the rindex inode to make sure that it doesn't
 * change whilst we are performing an operation. We keep this lock
 * for quite long periods of time compared to other locks. This
 * doesn't matter, since it is shared and it is very, very rarely
 * accessed in the exclusive mode (i.e. only when expanding the filesystem).
 *
 * This makes sure that we're using the latest copy of the resource index
 * special file, which might have been updated if someone expanded the
 * filesystem (via gfs2_grow utility), which adds new resource groups.
 *
 * Returns: 0 on success, error code otherwise
 */

int gfs2_rindex_update(struct gfs2_sbd *sdp)
{
	struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
	struct gfs2_glock *gl = ip->i_gl;
	struct gfs2_holder ri_gh;
	int error = 0;
	int unlock_required = 0;

	/* Read new copy from disk if we don't have the latest */
	if (!sdp->sd_rindex_uptodate) {
		if (!gfs2_glock_is_locked_by_me(gl)) {
			error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh);
			if (error)
				return error;
			unlock_required = 1;
		}
		if (!sdp->sd_rindex_uptodate)
			error = gfs2_ri_update(ip);
		if (unlock_required)
			gfs2_glock_dq_uninit(&ri_gh);
	}

	return error;
}

static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
{
	const struct gfs2_rgrp *str = buf;
	u32 rg_flags;

	rg_flags = be32_to_cpu(str->rg_flags);
	rg_flags &= ~GFS2_RDF_MASK;
	rgd->rd_flags &= GFS2_RDF_MASK;
	rgd->rd_flags |= rg_flags;
	rgd->rd_free = be32_to_cpu(str->rg_free);
	rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
	rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
	/* rd_data0, rd_data and rd_bitbytes already set from rindex */
}

static void gfs2_rgrp_ondisk2lvb(struct gfs2_rgrp_lvb *rgl, const void *buf)
{
	const struct gfs2_rgrp *str = buf;

	rgl->rl_magic = cpu_to_be32(GFS2_MAGIC);
	rgl->rl_flags = str->rg_flags;
	rgl->rl_free = str->rg_free;
	rgl->rl_dinodes = str->rg_dinodes;
	rgl->rl_igeneration = str->rg_igeneration;
	rgl->__pad = 0UL;
}

static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
{
	struct gfs2_rgrpd *next = gfs2_rgrpd_get_next(rgd);
	struct gfs2_rgrp *str = buf;
	u32 crc;

	str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK);
	str->rg_free = cpu_to_be32(rgd->rd_free);
	str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
	if (next == NULL)
		str->rg_skip = 0;
	else if (next->rd_addr > rgd->rd_addr)
		str->rg_skip = cpu_to_be32(next->rd_addr - rgd->rd_addr);
	str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
	str->rg_data0 = cpu_to_be64(rgd->rd_data0);
	str->rg_data = cpu_to_be32(rgd->rd_data);
	str->rg_bitbytes = cpu_to_be32(rgd->rd_bitbytes);
	str->rg_crc = 0;
	crc = gfs2_disk_hash(buf, sizeof(struct gfs2_rgrp));
	str->rg_crc = cpu_to_be32(crc);

	memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, buf);
}

static int gfs2_rgrp_lvb_valid(struct gfs2_rgrpd *rgd)
{
	struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
	struct gfs2_rgrp *str = (struct gfs2_rgrp *)rgd->rd_bits[0].bi_bh->b_data;
	struct gfs2_sbd *sdp = rgd->rd_sbd;
	int valid = 1;

	if (rgl->rl_flags != str->rg_flags) {
		fs_warn(sdp, "GFS2: rgd: %llu lvb flag mismatch %u/%u",
			(unsigned long long)rgd->rd_addr,
			be32_to_cpu(rgl->rl_flags), be32_to_cpu(str->rg_flags));
		valid = 0;
	}
	if (rgl->rl_free != str->rg_free) {
		fs_warn(sdp, "GFS2: rgd: %llu lvb free mismatch %u/%u",
			(unsigned long long)rgd->rd_addr,
			be32_to_cpu(rgl->rl_free), be32_to_cpu(str->rg_free));
		valid = 0;
	}
	if (rgl->rl_dinodes != str->rg_dinodes) {
		fs_warn(sdp, "GFS2: rgd: %llu lvb dinode mismatch %u/%u",
			(unsigned long long)rgd->rd_addr,
			be32_to_cpu(rgl->rl_dinodes),
			be32_to_cpu(str->rg_dinodes));
		valid = 0;
	}
	if (rgl->rl_igeneration != str->rg_igeneration) {
		fs_warn(sdp, "GFS2: rgd: %llu lvb igen mismatch %llu/%llu",
			(unsigned long long)rgd->rd_addr,
			(unsigned long long)be64_to_cpu(rgl->rl_igeneration),
			(unsigned long long)be64_to_cpu(str->rg_igeneration));
		valid = 0;
	}
	return valid;
}

static u32 count_unlinked(struct gfs2_rgrpd *rgd)
{
	struct gfs2_bitmap *bi;
	const u32 length = rgd->rd_length;
	const u8 *buffer = NULL;
	u32 i, goal, count = 0;

	for (i = 0, bi = rgd->rd_bits; i < length; i++, bi++) {
		goal = 0;
		buffer = bi->bi_bh->b_data + bi->bi_offset;
		WARN_ON(!buffer_uptodate(bi->bi_bh));
		while (goal < bi->bi_blocks) {
			goal = gfs2_bitfit(buffer, bi->bi_bytes, goal,
					   GFS2_BLKST_UNLINKED);
			if (goal == BFITNOENT)
				break;
			count++;
			goal++;
		}
	}

	return count;
}


/**
 * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
 * @rgd: the struct gfs2_rgrpd describing the RG to read in
 *
 * Read in all of a Resource Group's header and bitmap blocks.
 * Caller must eventually call gfs2_rgrp_brelse() to free the bitmaps.
 *
 * Returns: errno
 */

static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
{
	struct gfs2_sbd *sdp = rgd->rd_sbd;
	struct gfs2_glock *gl = rgd->rd_gl;
	unsigned int length = rgd->rd_length;
	struct gfs2_bitmap *bi;
	unsigned int x, y;
	int error;

	if (rgd->rd_bits[0].bi_bh != NULL)
		return 0;

	for (x = 0; x < length; x++) {
		bi = rgd->rd_bits + x;
		error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, 0, &bi->bi_bh);
		if (error)
			goto fail;
	}

	for (y = length; y--;) {
		bi = rgd->rd_bits + y;
		error = gfs2_meta_wait(sdp, bi->bi_bh);
		if (error)
			goto fail;
		if (gfs2_metatype_check(sdp, bi->bi_bh, y ? GFS2_METATYPE_RB :
					      GFS2_METATYPE_RG)) {
			error = -EIO;
			goto fail;
		}
	}

	if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) {
		for (x = 0; x < length; x++)
			clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags);
		gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
		rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
		rgd->rd_free_clone = rgd->rd_free;
		/* max out the rgrp allocation failure point */
		rgd->rd_extfail_pt = rgd->rd_free;
	}
	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
		rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd));
		gfs2_rgrp_ondisk2lvb(rgd->rd_rgl,
				     rgd->rd_bits[0].bi_bh->b_data);
	} else if (sdp->sd_args.ar_rgrplvb) {
		if (!gfs2_rgrp_lvb_valid(rgd)) {
			gfs2_consist_rgrpd(rgd);
			error = -EIO;
			goto fail;
		}
		if (rgd->rd_rgl->rl_unlinked == 0)
			rgd->rd_flags &= ~GFS2_RDF_CHECK;
	}
	return 0;

fail:
	while (x--) {
		bi = rgd->rd_bits + x;
		brelse(bi->bi_bh);
		bi->bi_bh = NULL;
		gfs2_assert_warn(sdp, !bi->bi_clone);
	}

	return error;
}

static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
{
	u32 rl_flags;

	if (rgd->rd_flags & GFS2_RDF_UPTODATE)
		return 0;

	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic)
		return gfs2_rgrp_bh_get(rgd);

	rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags);
	rl_flags &= ~GFS2_RDF_MASK;
	rgd->rd_flags &= GFS2_RDF_MASK;
	rgd->rd_flags |= (rl_flags | GFS2_RDF_CHECK);
	if (rgd->rd_rgl->rl_unlinked == 0)
		rgd->rd_flags &= ~GFS2_RDF_CHECK;
	rgd->rd_free = be32_to_cpu(rgd->rd_rgl->rl_free);
	rgd->rd_free_clone = rgd->rd_free;
	rgd->rd_dinodes = be32_to_cpu(rgd->rd_rgl->rl_dinodes);
	rgd->rd_igeneration = be64_to_cpu(rgd->rd_rgl->rl_igeneration);
	return 0;
}

int gfs2_rgrp_go_lock(struct gfs2_holder *gh)
{
	struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
	struct gfs2_sbd *sdp = rgd->rd_sbd;

	if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb)
		return 0;
	return gfs2_rgrp_bh_get(rgd);
}

/**
 * gfs2_rgrp_brelse - Release RG bitmaps read in with gfs2_rgrp_bh_get()
 * @rgd: The resource group
 *
 */

void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd)
{
	int x, length = rgd->rd_length;

	for (x = 0; x < length; x++) {
		struct gfs2_bitmap *bi = rgd->rd_bits + x;
		if (bi->bi_bh) {
			brelse(bi->bi_bh);
			bi->bi_bh = NULL;
		}
	}
}

/**
 * gfs2_rgrp_go_unlock - Unlock a rgrp glock
 * @gh: The glock holder for the resource group
 *
 */

void gfs2_rgrp_go_unlock(struct gfs2_holder *gh)
{
	struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
	int demote_requested = test_bit(GLF_DEMOTE, &gh->gh_gl->gl_flags) |
		test_bit(GLF_PENDING_DEMOTE, &gh->gh_gl->gl_flags);

	if (rgd && demote_requested)
		gfs2_rgrp_brelse(rgd);
}

int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
			     struct buffer_head *bh,
			     const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed)
{
	struct super_block *sb = sdp->sd_vfs;
	u64 blk;
	sector_t start = 0;
	sector_t nr_blks = 0;
	int rv;
	unsigned int x;
	u32 trimmed = 0;
	u8 diff;

	for (x = 0; x < bi->bi_bytes; x++) {
		const u8 *clone = bi->bi_clone ? bi->bi_clone : bi->bi_bh->b_data;
		clone += bi->bi_offset;
		clone += x;
		if (bh) {
			const u8 *orig = bh->b_data + bi->bi_offset + x;
			diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1));
		} else {
			diff = ~(*clone | (*clone >> 1));
		}
		diff &= 0x55;
		if (diff == 0)
			continue;
		blk = offset + ((bi->bi_start + x) * GFS2_NBBY);
		while(diff) {
			if (diff & 1) {
				if (nr_blks == 0)
					goto start_new_extent;
				if ((start + nr_blks) != blk) {
					if (nr_blks >= minlen) {
						rv = sb_issue_discard(sb,
							start, nr_blks,
							GFP_NOFS, 0);
						if (rv)
							goto fail;
						trimmed += nr_blks;
					}
					nr_blks = 0;
start_new_extent:
					start = blk;
				}
				nr_blks++;
			}
			diff >>= 2;
			blk++;
		}
	}
	if (nr_blks >= minlen) {
		rv = sb_issue_discard(sb, start, nr_blks, GFP_NOFS, 0);
		if (rv)
			goto fail;
		trimmed += nr_blks;
	}
	if (ptrimmed)
		*ptrimmed = trimmed;
	return 0;

fail:
	if (sdp->sd_args.ar_discard)
		fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem\n", rv);
	sdp->sd_args.ar_discard = 0;
	return -EIO;
}
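
/*
 * Note on the diff computation above: (b | (b >> 1)) & 0x55 has the
 * low bit of a pair set iff that block is not free, so
 * ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1)) selects blocks
 * that are free in the buffer being written back but still allocated
 * in the clone, i.e. blocks freed since the clone was taken. For
 * example, orig = 0x00 and clone = 0x05 gives diff = 0x05: blocks 0
 * and 1 of that byte become discard candidates.
 */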

/**
 * gfs2_fitrim - Generate discard requests for unused bits of the filesystem
 * @filp: Any file on the filesystem
 * @argp: Pointer to the arguments (also used to pass result)
 *
 * Returns: 0 on success, otherwise error code
 */

int gfs2_fitrim(struct file *filp, void __user *argp)
{
	struct inode *inode = file_inode(filp);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	struct request_queue *q = bdev_get_queue(sdp->sd_vfs->s_bdev);
	struct buffer_head *bh;
	struct gfs2_rgrpd *rgd;
	struct gfs2_rgrpd *rgd_end;
	struct gfs2_holder gh;
	struct fstrim_range r;
	int ret = 0;
	u64 amt;
	u64 trimmed = 0;
	u64 start, end, minlen;
	unsigned int x;
	unsigned bs_shift = sdp->sd_sb.sb_bsize_shift;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (!blk_queue_discard(q))
		return -EOPNOTSUPP;

	if (copy_from_user(&r, argp, sizeof(r)))
		return -EFAULT;

	ret = gfs2_rindex_update(sdp);
	if (ret)
		return ret;

	start = r.start >> bs_shift;
	end = start + (r.len >> bs_shift);
	minlen = max_t(u64, r.minlen,
		       q->limits.discard_granularity) >> bs_shift;

	if (end <= start || minlen > sdp->sd_max_rg_data)
		return -EINVAL;

	rgd = gfs2_blk2rgrpd(sdp, start, 0);
	rgd_end = gfs2_blk2rgrpd(sdp, end, 0);

	if ((gfs2_rgrpd_get_first(sdp) == gfs2_rgrpd_get_next(rgd_end))
	    && (start > rgd_end->rd_data0 + rgd_end->rd_data))
		return -EINVAL; /* start is beyond the end of the fs */

	while (1) {
		ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh);
		if (ret)
			goto out;

		if (!(rgd->rd_flags & GFS2_RGF_TRIMMED)) {
			/* Trim each bitmap in the rgrp */
			for (x = 0; x < rgd->rd_length; x++) {
				struct gfs2_bitmap *bi = rgd->rd_bits + x;
				ret = gfs2_rgrp_send_discards(sdp,
						rgd->rd_data0, NULL, bi, minlen,
						&amt);
				if (ret) {
					gfs2_glock_dq_uninit(&gh);
					goto out;
				}
				trimmed += amt;
			}

			/* Mark rgrp as having been trimmed */
			ret = gfs2_trans_begin(sdp, RES_RG_HDR, 0);
			if (ret == 0) {
				bh = rgd->rd_bits[0].bi_bh;
				rgd->rd_flags |= GFS2_RGF_TRIMMED;
				gfs2_trans_add_meta(rgd->rd_gl, bh);
				gfs2_rgrp_out(rgd, bh->b_data);
				gfs2_trans_end(sdp);
			}
		}
		gfs2_glock_dq_uninit(&gh);

		if (rgd == rgd_end)
			break;

		rgd = gfs2_rgrpd_get_next(rgd);
	}

out:
	r.len = trimmed << bs_shift;
	if (copy_to_user(argp, &r, sizeof(r)))
		return -EFAULT;

	return ret;
}

/**
 * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree
 * @ip: the inode structure
 *
 */
static void rs_insert(struct gfs2_inode *ip)
{
	struct rb_node **newn, *parent = NULL;
	int rc;
	struct gfs2_blkreserv *rs = &ip->i_res;
	struct gfs2_rgrpd *rgd = rs->rs_rbm.rgd;
	u64 fsblock = gfs2_rbm_to_block(&rs->rs_rbm);

	BUG_ON(gfs2_rs_active(rs));

	spin_lock(&rgd->rd_rsspin);
	newn = &rgd->rd_rstree.rb_node;
	while (*newn) {
		struct gfs2_blkreserv *cur =
			rb_entry(*newn, struct gfs2_blkreserv, rs_node);

		parent = *newn;
		rc = rs_cmp(fsblock, rs->rs_free, cur);
		if (rc > 0)
			newn = &((*newn)->rb_right);
		else if (rc < 0)
			newn = &((*newn)->rb_left);
		else {
			spin_unlock(&rgd->rd_rsspin);
			WARN_ON(1);
			return;
		}
	}

	rb_link_node(&rs->rs_node, parent, newn);
	rb_insert_color(&rs->rs_node, &rgd->rd_rstree);

	/* Do our rgrp accounting for the reservation */
	rgd->rd_reserved += rs->rs_free; /* blocks reserved */
	spin_unlock(&rgd->rd_rsspin);
	trace_gfs2_rs(rs, TRACE_RS_INSERT);
}

/**
 * rgd_free - return the number of free blocks we can allocate
 * @rgd: the resource group
 * @rs: our own block reservation
 *
 * This function returns the number of free blocks for an rgrp.
 * That's the clone-free blocks (blocks that are free, not including those
 * still being used for unlinked files that haven't been deleted).
 *
 * It also subtracts any blocks reserved by someone else, but does not
 * subtract free blocks that are part of our own reservation, because
 * obviously we can (and will) allocate them.
 */
static inline u32 rgd_free(struct gfs2_rgrpd *rgd, struct gfs2_blkreserv *rs)
{
	u32 tot_reserved, tot_free;

	if (WARN_ON_ONCE(rgd->rd_reserved < rs->rs_free))
		return 0;
	tot_reserved = rgd->rd_reserved - rs->rs_free;

	if (rgd->rd_free_clone < tot_reserved)
		tot_reserved = 0;

	tot_free = rgd->rd_free_clone - tot_reserved;

	return tot_free;
}
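
/*
 * For example (numbers assumed): with rd_free_clone = 100,
 * rd_reserved = 30 and our own rs_free = 10, tot_reserved is 20
 * (other inodes' reservations only) and rgd_free() returns
 * 100 - 20 = 80, i.e. the 10 blocks of our own reservation remain
 * allocatable by us.
 */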

/**
 * rg_mblk_search - find a group of multiple free blocks to form a reservation
 * @rgd: the resource group descriptor
 * @ip: pointer to the inode for which we're reserving blocks
 * @ap: the allocation parameters
 *
 */

static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
			   const struct gfs2_alloc_parms *ap)
{
	struct gfs2_rbm rbm = { .rgd = rgd, };
	u64 goal;
	struct gfs2_blkreserv *rs = &ip->i_res;
	u32 extlen;
	u32 free_blocks = rgd_free(rgd, rs);
	int ret;
	struct inode *inode = &ip->i_inode;

	if (S_ISDIR(inode->i_mode))
		extlen = 1;
	else {
		extlen = max_t(u32, atomic_read(&ip->i_sizehint), ap->target);
		extlen = clamp(extlen, (u32)RGRP_RSRV_MINBLKS, free_blocks);
	}
	if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen))
		return;

	/* Find bitmap block that contains bits for goal block */
	if (rgrp_contains_block(rgd, ip->i_goal))
		goal = ip->i_goal;
	else
		goal = rgd->rd_last_alloc + rgd->rd_data0;

	if (WARN_ON(gfs2_rbm_from_block(&rbm, goal)))
		return;

	ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true);
	if (ret == 0) {
		rs->rs_rbm = rbm;
		rs->rs_free = extlen;
		rs_insert(ip);
	} else {
		if (goal == rgd->rd_last_alloc + rgd->rd_data0)
			rgd->rd_last_alloc = 0;
	}
}

/**
 * gfs2_next_unreserved_block - Return next block that is not reserved
 * @rgd: The resource group
 * @block: The starting block
 * @length: The required length
 * @ip: Ignore any reservations for this inode
 *
 * If the block does not appear in any reservation, then return the
 * block number unchanged. If it does appear in the reservation, then
 * keep looking through the tree of reservations in order to find the
 * first block number which is not reserved.
 */

static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
				      u32 length,
				      const struct gfs2_inode *ip)
{
	struct gfs2_blkreserv *rs;
	struct rb_node *n;
	int rc;

	spin_lock(&rgd->rd_rsspin);
	n = rgd->rd_rstree.rb_node;
	while (n) {
		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
		rc = rs_cmp(block, length, rs);
		if (rc < 0)
			n = n->rb_left;
		else if (rc > 0)
			n = n->rb_right;
		else
			break;
	}

	if (n) {
		while ((rs_cmp(block, length, rs) == 0) && (&ip->i_res != rs)) {
			block = gfs2_rbm_to_block(&rs->rs_rbm) + rs->rs_free;
			n = n->rb_right;
			if (n == NULL)
				break;
			rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
		}
	}

	spin_unlock(&rgd->rd_rsspin);
	return block;
}

/**
 * gfs2_reservation_check_and_update - Check for reservations during block alloc
 * @rbm: The current position in the resource group
 * @ip: The inode for which we are searching for blocks
 * @minext: The minimum extent length
 * @maxext: A pointer to the maximum extent structure
 *
 * This checks the current position in the rgrp to see whether there is
 * a reservation covering this block. If not then this function is a
 * no-op. If there is, then the position is moved to the end of the
 * contiguous reservation(s) so that we are pointing at the first
 * non-reserved block.
 *
 * Returns: 0 if no reservation, 1 if @rbm has changed, otherwise an error
 */

static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
					     const struct gfs2_inode *ip,
					     u32 minext,
					     struct gfs2_extent *maxext)
{
	u64 block = gfs2_rbm_to_block(rbm);
	u32 extlen = 1;
	u64 nblock;
	int ret;

	/*
	 * If we have a minimum extent length, then skip over any extent
	 * which is less than the min extent length in size.
	 */
	if (minext) {
		extlen = gfs2_free_extlen(rbm, minext);
		if (extlen <= maxext->len)
			goto fail;
	}

	/*
	 * Check the extent which has been found against the reservations
	 * and skip if parts of it are already reserved
	 */
	nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip);
	if (nblock == block) {
		if (!minext || extlen >= minext)
			return 0;

		if (extlen > maxext->len) {
			maxext->len = extlen;
			maxext->rbm = *rbm;
		}
fail:
		nblock = block + extlen;
	}
	ret = gfs2_rbm_from_block(rbm, nblock);
	if (ret < 0)
		return ret;
	return 1;
}

/**
 * gfs2_rbm_find - Look for blocks of a particular state
 * @rbm: Value/result starting position and final position
 * @state: The state which we want to find
 * @minext: Pointer to the requested extent length (NULL for a single block)
 *          This is updated to be the actual reservation size.
 * @ip: If set, check for reservations
 * @nowrap: Stop looking at the end of the rgrp, rather than wrapping
 *          around until we've reached the starting point.
 *
 * Side effects:
 * - If looking for free blocks, we set GBF_FULL on each bitmap which
 *   has no free blocks in it.
 * - If looking for free blocks, we set rd_extfail_pt on each rgrp which
 *   has come up short on a free block search.
 *
 * Returns: 0 on success, -ENOSPC if there is no block of the requested state
 */

static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
			 const struct gfs2_inode *ip, bool nowrap)
{
	bool scan_from_start = rbm->bii == 0 && rbm->offset == 0;
	struct buffer_head *bh;
	int last_bii;
	u32 offset;
	u8 *buffer;
	bool wrapped = false;
	int ret;
	struct gfs2_bitmap *bi;
	struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, };

	/*
	 * Determine the last bitmap to search.  If we're not starting at the
	 * beginning of a bitmap, we need to search that bitmap twice to scan
	 * the entire resource group.
	 */
	last_bii = rbm->bii - (rbm->offset == 0);

	while(1) {
		bi = rbm_bi(rbm);
		if ((ip == NULL || !gfs2_rs_active(&ip->i_res)) &&
		    test_bit(GBF_FULL, &bi->bi_flags) &&
		    (state == GFS2_BLKST_FREE))
			goto next_bitmap;

		bh = bi->bi_bh;
		buffer = bh->b_data + bi->bi_offset;
		WARN_ON(!buffer_uptodate(bh));
		if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
			buffer = bi->bi_clone + bi->bi_offset;
		offset = gfs2_bitfit(buffer, bi->bi_bytes, rbm->offset, state);
		if (offset == BFITNOENT) {
			if (state == GFS2_BLKST_FREE && rbm->offset == 0)
				set_bit(GBF_FULL, &bi->bi_flags);
			goto next_bitmap;
		}
		rbm->offset = offset;
		if (ip == NULL)
			return 0;

		ret = gfs2_reservation_check_and_update(rbm, ip,
							minext ? *minext : 0,
							&maxext);
		if (ret == 0)
			return 0;
		if (ret > 0)
			goto next_iter;
		if (ret == -E2BIG) {
			rbm->bii = 0;
			rbm->offset = 0;
			goto res_covered_end_of_rgrp;
		}
		return ret;

next_bitmap:	/* Find next bitmap in the rgrp */
		rbm->offset = 0;
		rbm->bii++;
		if (rbm->bii == rbm->rgd->rd_length)
			rbm->bii = 0;
res_covered_end_of_rgrp:
		if (rbm->bii == 0) {
			if (wrapped)
				break;
			wrapped = true;
			if (nowrap)
				break;
		}
next_iter:
		/* Have we scanned the entire resource group? */
		if (wrapped && rbm->bii > last_bii)
			break;
	}

	if (minext == NULL || state != GFS2_BLKST_FREE)
		return -ENOSPC;

	/* If the extent was too small, and it's smaller than the smallest
	   to have failed before, remember for future reference that it's
	   useless to search this rgrp again for this amount or more. */
	if (wrapped && (scan_from_start || rbm->bii > last_bii) &&
	    *minext < rbm->rgd->rd_extfail_pt)
		rbm->rgd->rd_extfail_pt = *minext;

	/* If the maximum extent we found is big enough to fulfill the
	   minimum requirements, use it anyway. */
	if (maxext.len) {
		*rbm = maxext.rbm;
		*minext = maxext.len;
		return 0;
	}

	return -ENOSPC;
}
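
/*
 * Note on the wrap logic above: last_bii = rbm->bii - (rbm->offset == 0)
 * makes a search that starts mid-bitmap rescan its starting bitmap
 * once more after wrapping. Starting at bii = 2, offset = 5, for
 * instance, gives last_bii = 2, so bitmap 2 is scanned a second time
 * from offset 0 before "wrapped && rbm->bii > last_bii" ends the loop.
 */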

/**
 * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes
 * @rgd: The rgrp
 * @last_unlinked: block address of the last dinode we unlinked
 * @skip: block address we should explicitly not unlink
 *
 * Any unlinked inodes found are handed to the delete workqueue for
 * disposal; the function itself returns nothing.
 */
1823 
1824 static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
1825 {
1826 	u64 block;
1827 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1828 	struct gfs2_glock *gl;
1829 	struct gfs2_inode *ip;
1830 	int error;
1831 	int found = 0;
1832 	struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 };
1833 
1834 	while (1) {
1835 		down_write(&sdp->sd_log_flush_lock);
1836 		error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL,
1837 				      true);
1838 		up_write(&sdp->sd_log_flush_lock);
1839 		if (error == -ENOSPC)
1840 			break;
1841 		if (WARN_ON_ONCE(error))
1842 			break;
1843 
1844 		block = gfs2_rbm_to_block(&rbm);
1845 		if (gfs2_rbm_from_block(&rbm, block + 1))
1846 			break;
1847 		if (*last_unlinked != NO_BLOCK && block <= *last_unlinked)
1848 			continue;
1849 		if (block == skip)
1850 			continue;
1851 		*last_unlinked = block;
1852 
1853 		error = gfs2_glock_get(sdp, block, &gfs2_iopen_glops, CREATE, &gl);
1854 		if (error)
1855 			continue;
1856 
1857 		/* If the inode is already in cache, we can ignore it here
1858 		 * because the existing inode disposal code will deal with
1859 		 * it when all refs have gone away. Accessing gl_object like
1860 		 * this is not safe in general. Here it is ok because we do
1861 		 * not dereference the pointer, and we only need an approx
1862 		 * answer to whether it is NULL or not.
1863 		 */
1864 		ip = gl->gl_object;
1865 
1866 		if (ip || queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
1867 			gfs2_glock_put(gl);
1868 		else
1869 			found++;
1870 
1871 		/* Limit reclaim to a sensible number of tasks */
1872 		if (found > NR_CPUS)
1873 			return;
1874 	}
1875 
1876 	rgd->rd_flags &= ~GFS2_RDF_CHECK;
1877 	return;
1878 }
1879 
1880 /**
1881  * gfs2_rgrp_congested - Use stats to figure out whether an rgrp is congested
1882  * @rgd: The rgrp in question
1883  * @loops: An indication of how picky we can be (0=very, 1=less so)
1884  *
1885  * This function uses the recently added glock statistics in order to
1886  * figure out whether a particular resource group is suffering from
1887  * contention from multiple nodes. This is done purely on the basis
1888  * of timings, since this is the only data we have to work with and
1889  * our aim here is to reject a resource group which is highly contended
1890  * but (very importantly) not to do this too often, in order to ensure
1891  * that we do not end up introducing fragmentation by changing resource
1892  * groups when not actually required.
1893  *
1894  * The calculation is fairly simple: we want to know whether the SRTTB
1895  * (i.e. smoothed round trip time for blocking operations) to acquire
1896  * the lock for this rgrp's glock is significantly greater than the
1897  * time taken for resource groups on average. We introduce a margin in
1898  * the form of the variable @var, computed as the sum of the two
1899  * respective variances, multiplied by a factor depending on @loops and
1900  * whether we have a lot of data to base the decision on. This is then
1901  * tested against the squared difference of the means to decide whether
1902  * the result is statistically significant (worked example below).
1903  *
1904  * Returns: A boolean verdict on the congestion status
1905  */
1906 
1907 static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
1908 {
1909 	const struct gfs2_glock *gl = rgd->rd_gl;
1910 	const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
1911 	struct gfs2_lkstats *st;
1912 	u64 r_dcount, l_dcount;
1913 	u64 l_srttb, a_srttb = 0;
1914 	s64 srttb_diff;
1915 	u64 sqr_diff;
1916 	u64 var;
1917 	int cpu, nonzero = 0;
1918 
1919 	preempt_disable();
1920 	for_each_present_cpu(cpu) {
1921 		st = &per_cpu_ptr(sdp->sd_lkstats, cpu)->lkstats[LM_TYPE_RGRP];
1922 		if (st->stats[GFS2_LKS_SRTTB]) {
1923 			a_srttb += st->stats[GFS2_LKS_SRTTB];
1924 			nonzero++;
1925 		}
1926 	}
1927 	st = &this_cpu_ptr(sdp->sd_lkstats)->lkstats[LM_TYPE_RGRP];
1928 	if (nonzero)
1929 		do_div(a_srttb, nonzero);
1930 	r_dcount = st->stats[GFS2_LKS_DCOUNT];
1931 	var = st->stats[GFS2_LKS_SRTTVARB] +
1932 	      gl->gl_stats.stats[GFS2_LKS_SRTTVARB];
1933 	preempt_enable();
1934 
1935 	l_srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB];
1936 	l_dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT];
1937 
1938 	if ((l_dcount < 1) || (r_dcount < 1) || (a_srttb == 0))
1939 		return false;
1940 
1941 	srttb_diff = a_srttb - l_srttb;
1942 	sqr_diff = srttb_diff * srttb_diff;
1943 
1944 	var *= 2;
1945 	if (l_dcount < 8 || r_dcount < 8)
1946 		var *= 2;
1947 	if (loops == 1)
1948 		var *= 2;
1949 
1950 	return ((srttb_diff < 0) && (sqr_diff > var));
1951 }
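
/*
 * Worked example for gfs2_rgrp_congested() (illustrative numbers, not
 * from a real trace): suppose the all-cpu average a_srttb is 100, this
 * rgrp's l_srttb is 150, and the two variances sum to 1000.  Then
 * srttb_diff = -50, sqr_diff = 2500, and var becomes 2000 after the
 * unconditional doubling (assuming both dcounts are >= 8).  On the first
 * pass (loops == 0), 2500 > 2000, so the rgrp is reported congested; on
 * the second pass (loops == 1), var doubles again to 4000 and the same
 * rgrp is accepted.
 */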
1952 
1953 /**
1954  * gfs2_rgrp_used_recently - check if an rgrp glock was used recently
1955  * @rs: The block reservation with the rgrp to test
1956  * @msecs: The time limit in milliseconds
1957  *
1958  * Returns: True if the rgrp glock has been used within the time limit
1959  */
1960 static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs,
1961 				    u64 msecs)
1962 {
1963 	u64 tdiff;
1964 
1965 	tdiff = ktime_to_ns(ktime_sub(ktime_get_real(),
1966 				      rs->rs_rbm.rgd->rd_gl->gl_dstamp));
1967 
1968 	return tdiff > (msecs * 1000 * 1000);
1969 }
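
/*
 * For example, gfs2_inplace_reserve() below calls this with msecs == 1000,
 * treating a resource group as recently used if its glock activity
 * timestamp (gl_dstamp) is within the last second.
 */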
1970 
1971 static u32 gfs2_orlov_skip(const struct gfs2_inode *ip)
1972 {
1973 	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1974 	u32 skip;
1975 
1976 	get_random_bytes(&skip, sizeof(skip));
1977 	return skip % sdp->sd_rgrps;
1978 }
1979 
1980 static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin)
1981 {
1982 	struct gfs2_rgrpd *rgd = *pos;
1983 	struct gfs2_sbd *sdp = rgd->rd_sbd;
1984 
1985 	rgd = gfs2_rgrpd_get_next(rgd);
1986 	if (rgd == NULL)
1987 		rgd = gfs2_rgrpd_get_first(sdp);
1988 	*pos = rgd;
1989 	if (rgd != begin) /* If we didn't wrap */
1990 		return true;
1991 	return false;
1992 }
1993 
1994 /**
1995  * fast_to_acquire - determine if a resource group will be fast to acquire
1996  *
1997  * If this is one of our preferred rgrps, it should be quicker to acquire,
1998  * because we tried to set ourselves up as dlm lock master.
1999  */
2000 static inline int fast_to_acquire(struct gfs2_rgrpd *rgd)
2001 {
2002 	struct gfs2_glock *gl = rgd->rd_gl;
2003 
2004 	if (gl->gl_state != LM_ST_UNLOCKED && list_empty(&gl->gl_holders) &&
2005 	    !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
2006 	    !test_bit(GLF_DEMOTE, &gl->gl_flags))
2007 		return 1;
2008 	if (rgd->rd_flags & GFS2_RDF_PREFERRED)
2009 		return 1;
2010 	return 0;
2011 }
2012 
2013 /**
2014  * gfs2_inplace_reserve - Reserve space in the filesystem
2015  * @ip: the inode to reserve space for
2016  * @ap: the allocation parameters
2017  *
2018  * We try our best to find an rgrp that has at least ap->target blocks
2019  * available. After a couple of passes (loops == 2), the prospects of finding
2020  * such an rgrp diminish. At this stage, we return the first rgrp that has
2021  * at least ap->min_target blocks available. Either way, we set ap->allowed to
2022  * the number of blocks available in the chosen rgrp.
2023  *
2024  * Returns: 0 on success,
2025  *          -ENOSPC if a suitable rgrp can't be found
2026  *          errno otherwise
2027  */
2028 
2029 int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
2030 {
2031 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2032 	struct gfs2_rgrpd *begin = NULL;
2033 	struct gfs2_blkreserv *rs = &ip->i_res;
2034 	int error = 0, rg_locked, flags = 0;
2035 	u64 last_unlinked = NO_BLOCK;
2036 	int loops = 0;
2037 	u32 free_blocks, skip = 0;
2038 
2039 	if (sdp->sd_args.ar_rgrplvb)
2040 		flags |= GL_SKIP;
2041 	if (gfs2_assert_warn(sdp, ap->target))
2042 		return -EINVAL;
2043 	if (gfs2_rs_active(rs)) {
2044 		begin = rs->rs_rbm.rgd;
2045 	} else if (rs->rs_rbm.rgd &&
2046 		   rgrp_contains_block(rs->rs_rbm.rgd, ip->i_goal)) {
2047 		begin = rs->rs_rbm.rgd;
2048 	} else {
2049 		check_and_update_goal(ip);
2050 		rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
2051 	}
2052 	if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV))
2053 		skip = gfs2_orlov_skip(ip);
2054 	if (rs->rs_rbm.rgd == NULL)
2055 		return -EBADSLT;
2056 
2057 	while (loops < 3) {
2058 		rg_locked = 1;
2059 
2060 		if (!gfs2_glock_is_locked_by_me(rs->rs_rbm.rgd->rd_gl)) {
2061 			rg_locked = 0;
2062 			if (skip && skip--)
2063 				goto next_rgrp;
2064 			if (!gfs2_rs_active(rs)) {
2065 				if (loops == 0 &&
2066 				    !fast_to_acquire(rs->rs_rbm.rgd))
2067 					goto next_rgrp;
2068 				if ((loops < 2) &&
2069 				    gfs2_rgrp_used_recently(rs, 1000) &&
2070 				    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
2071 					goto next_rgrp;
2072 			}
2073 			error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl,
2074 						   LM_ST_EXCLUSIVE, flags,
2075 						   &ip->i_rgd_gh);
2076 			if (unlikely(error))
2077 				return error;
2078 			if (!gfs2_rs_active(rs) && (loops < 2) &&
2079 			    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
2080 				goto skip_rgrp;
2081 			if (sdp->sd_args.ar_rgrplvb) {
2082 				error = update_rgrp_lvb(rs->rs_rbm.rgd);
2083 				if (unlikely(error)) {
2084 					gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2085 					return error;
2086 				}
2087 			}
2088 		}
2089 
2090 		/* Skip unusable resource groups */
2091 		if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC |
2092 						 GFS2_RDF_ERROR)) ||
2093 		    (loops == 0 && ap->target > rs->rs_rbm.rgd->rd_extfail_pt))
2094 			goto skip_rgrp;
2095 
2096 		if (sdp->sd_args.ar_rgrplvb)
2097 			gfs2_rgrp_bh_get(rs->rs_rbm.rgd);
2098 
2099 		/* Get a reservation if we don't already have one */
2100 		if (!gfs2_rs_active(rs))
2101 			rg_mblk_search(rs->rs_rbm.rgd, ip, ap);
2102 
2103 		/* Skip rgrps when we can't get a reservation on first pass */
2104 		if (!gfs2_rs_active(rs) && (loops < 1))
2105 			goto check_rgrp;
2106 
2107 		/* If rgrp has enough free space, use it */
2108 		free_blocks = rgd_free(rs->rs_rbm.rgd, rs);
2109 		if (free_blocks >= ap->target ||
2110 		    (loops == 2 && ap->min_target &&
2111 		     free_blocks >= ap->min_target)) {
2112 			ap->allowed = free_blocks;
2113 			return 0;
2114 		}
2115 check_rgrp:
2116 		/* Check for unlinked inodes which can be reclaimed */
2117 		if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK)
2118 			try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked,
2119 					ip->i_no_addr);
2120 skip_rgrp:
2121 		/* Drop reservation, if we couldn't use reserved rgrp */
2122 		if (gfs2_rs_active(rs))
2123 			gfs2_rs_deltree(rs);
2124 
2125 		/* Unlock rgrp if required */
2126 		if (!rg_locked)
2127 			gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2128 next_rgrp:
2129 		/* Find the next rgrp, and continue looking */
2130 		if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin))
2131 			continue;
2132 		if (skip)
2133 			continue;
2134 
2135 		/* If we've scanned all the rgrps but found no free blocks,
2136 		 * check for some less likely conditions before trying
2137 		 * again.
2138 		 */
2139 		loops++;
2140 		/* Check that fs hasn't grown if writing to rindex */
2141 		if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) {
2142 			error = gfs2_ri_update(ip);
2143 			if (error)
2144 				return error;
2145 		}
2146 		/* Flushing the log may release space */
2147 		if (loops == 2)
2148 			gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
2149 				       GFS2_LFC_INPLACE_RESERVE);
2150 	}
2151 
2152 	return -ENOSPC;
2153 }
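
/*
 * A minimal sketch (not part of the original file) of how a caller might
 * pair gfs2_inplace_reserve() with gfs2_alloc_blocks() and
 * gfs2_inplace_release().  The function name example_alloc_one() is
 * hypothetical, and real callers must also have quota checks and an
 * active transaction in place around the allocation.
 */
#if 0
static int example_alloc_one(struct gfs2_inode *ip, u64 *blkno)
{
	struct gfs2_alloc_parms ap = { .target = 1, };
	unsigned int nblocks = 1;
	int error;

	error = gfs2_inplace_reserve(ip, &ap);	/* pick and lock an rgrp */
	if (error)
		return error;
	/* Allocate one data block; dinode == false, so no generation. */
	error = gfs2_alloc_blocks(ip, blkno, &nblocks, false, NULL);
	gfs2_inplace_release(ip);	/* drop the rgrp glock holder */
	return error;
}
#endif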
2154 
2155 /**
2156  * gfs2_inplace_release - release an inplace reservation
2157  * @ip: the inode the reservation was taken out on
2158  *
2159  * Release a reservation made by gfs2_inplace_reserve().
2160  */
2161 
2162 void gfs2_inplace_release(struct gfs2_inode *ip)
2163 {
2164 	if (gfs2_holder_initialized(&ip->i_rgd_gh))
2165 		gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2166 }
2167 
2168 /**
2169  * gfs2_alloc_extent - allocate an extent from a given bitmap
2170  * @rbm: The position of the first block of the extent
2171  * @dinode: TRUE if the first block we allocate is for a dinode
2172  * @n: The extent length (value/result)
2173  *
2174  * Add the bitmap buffer to the transaction.
2175  * Set the found bits to GFS2_BLKST_DINODE or GFS2_BLKST_USED, per @dinode.
2176  */
2177 static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode,
2178 			     unsigned int *n)
2179 {
2180 	struct gfs2_rbm pos = { .rgd = rbm->rgd, };
2181 	const unsigned int elen = *n;
2182 	u64 block;
2183 	int ret;
2184 
2185 	*n = 1;
2186 	block = gfs2_rbm_to_block(rbm);
2187 	gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm_bi(rbm)->bi_bh);
2188 	gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
2189 	block++;
2190 	while (*n < elen) {
2191 		ret = gfs2_rbm_from_block(&pos, block);
2192 		if (ret || gfs2_testbit(&pos, true) != GFS2_BLKST_FREE)
2193 			break;
2194 		gfs2_trans_add_meta(pos.rgd->rd_gl, rbm_bi(&pos)->bi_bh);
2195 		gfs2_setbit(&pos, true, GFS2_BLKST_USED);
2196 		(*n)++;
2197 		block++;
2198 	}
2199 }
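
/*
 * Note that @n above is value/result: a caller asking for *n == 4 when
 * only the first three blocks starting at @rbm are free gets *n == 3
 * back, with exactly that many bits set.
 */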
2200 
2201 /**
2202  * rgblk_free - Change alloc state of given block(s)
2203  * @sdp: the filesystem
2204  * @rgd: the resource group the blocks are in
2205  * @bstart: the start of a run of blocks to free
2206  * @blen: the length of the block run (all must lie within ONE RG!)
2207  * @new_state: GFS2_BLKST_XXX the after-allocation block state
2208  */
2209 
2210 static void rgblk_free(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd,
2211 		       u64 bstart, u32 blen, unsigned char new_state)
2212 {
2213 	struct gfs2_rbm rbm;
2214 	struct gfs2_bitmap *bi, *bi_prev = NULL;
2215 
2216 	rbm.rgd = rgd;
2217 	if (WARN_ON_ONCE(gfs2_rbm_from_block(&rbm, bstart)))
2218 		return;
2219 	while (blen--) {
2220 		bi = rbm_bi(&rbm);
2221 		if (bi != bi_prev) {
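			/*
			 * Lazily copy the bitmap the first time a block in
			 * it is freed.  Allocators search the clone when it
			 * exists (see gfs2_rbm_find), so blocks freed in
			 * this transaction cannot be handed out again until
			 * the log has been flushed and the clone discarded.
			 */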
2222 			if (!bi->bi_clone) {
2223 				bi->bi_clone = kmalloc(bi->bi_bh->b_size,
2224 						      GFP_NOFS | __GFP_NOFAIL);
2225 				memcpy(bi->bi_clone + bi->bi_offset,
2226 				       bi->bi_bh->b_data + bi->bi_offset,
2227 				       bi->bi_bytes);
2228 			}
2229 			gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
2230 			bi_prev = bi;
2231 		}
2232 		gfs2_setbit(&rbm, false, new_state);
2233 		gfs2_rbm_incr(&rbm);
2234 	}
2235 }
2236 
2237 /**
2238  * gfs2_rgrp_dump - print out an rgrp
2239  * @seq: The iterator
2240  * @gl: The glock in question
2241  * @fs_id_buf: pointer to file system id (if requested)
2242  *
2243  */
2244 
2245 void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_glock *gl,
2246 		    const char *fs_id_buf)
2247 {
2248 	struct gfs2_rgrpd *rgd = gl->gl_object;
2249 	struct gfs2_blkreserv *trs;
2250 	const struct rb_node *n;
2251 
2252 	if (rgd == NULL)
2253 		return;
2254 	gfs2_print_dbg(seq, "%s R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n",
2255 		       fs_id_buf,
2256 		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
2257 		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
2258 		       rgd->rd_reserved, rgd->rd_extfail_pt);
2259 	if (rgd->rd_sbd->sd_args.ar_rgrplvb) {
2260 		struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
2261 
2262 		gfs2_print_dbg(seq, "%s  L: f:%02x b:%u i:%u\n", fs_id_buf,
2263 			       be32_to_cpu(rgl->rl_flags),
2264 			       be32_to_cpu(rgl->rl_free),
2265 			       be32_to_cpu(rgl->rl_dinodes));
2266 	}
2267 	spin_lock(&rgd->rd_rsspin);
2268 	for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
2269 		trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
2270 		dump_rs(seq, trs, fs_id_buf);
2271 	}
2272 	spin_unlock(&rgd->rd_rsspin);
2273 }
2274 
2275 static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
2276 {
2277 	struct gfs2_sbd *sdp = rgd->rd_sbd;
2278 	char fs_id_buf[sizeof(sdp->sd_fsname) + 7];
2279 
2280 	fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
2281 		(unsigned long long)rgd->rd_addr);
2282 	fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
2283 	sprintf(fs_id_buf, "fsid=%s: ", sdp->sd_fsname);
2284 	gfs2_rgrp_dump(NULL, rgd->rd_gl, fs_id_buf);
2285 	rgd->rd_flags |= GFS2_RDF_ERROR;
2286 }
2287 
2288 /**
2289  * gfs2_adjust_reservation - Adjust (or remove) a reservation after allocation
2290  * @ip: The inode we have just allocated blocks for
2291  * @rbm: The start of the allocated blocks
2292  * @len: The extent length
2293  *
2294  * Adjusts a reservation after an allocation has taken place. If the
2295  * reservation does not match the allocation, or if it is now empty,
2296  * then it is removed.
2297  */
2298 
2299 static void gfs2_adjust_reservation(struct gfs2_inode *ip,
2300 				    const struct gfs2_rbm *rbm, unsigned len)
2301 {
2302 	struct gfs2_blkreserv *rs = &ip->i_res;
2303 	struct gfs2_rgrpd *rgd = rbm->rgd;
2304 	unsigned rlen;
2305 	u64 block;
2306 	int ret;
2307 
2308 	spin_lock(&rgd->rd_rsspin);
2309 	if (gfs2_rs_active(rs)) {
2310 		if (gfs2_rbm_eq(&rs->rs_rbm, rbm)) {
2311 			block = gfs2_rbm_to_block(rbm);
2312 			ret = gfs2_rbm_from_block(&rs->rs_rbm, block + len);
2313 			rlen = min(rs->rs_free, len);
2314 			rs->rs_free -= rlen;
2315 			rgd->rd_reserved -= rlen;
2316 			trace_gfs2_rs(rs, TRACE_RS_CLAIM);
2317 			if (rs->rs_free && !ret)
2318 				goto out;
2319 			/* We used up our block reservation, so we should
2320 			   reserve more blocks next time. */
2321 			atomic_add(RGRP_RSRV_ADDBLKS, &ip->i_sizehint);
2322 		}
2323 		__rs_deltree(rs);
2324 	}
2325 out:
2326 	spin_unlock(&rgd->rd_rsspin);
2327 }
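
/*
 * Worked example (hypothetical numbers): if a reservation of
 * rs_free == 10 blocks starts exactly where a 4-block allocation just
 * took place, the reservation is advanced past those 4 blocks, rs_free
 * drops to 6, and the rgrp's rd_reserved count falls by the same amount.
 * Had the allocation consumed the whole reservation, it would have been
 * removed via __rs_deltree() and i_sizehint bumped so that the next
 * reservation is made larger.
 */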
2328 
2329 /**
2330  * gfs2_set_alloc_start - Set starting point for block allocation
2331  * @rbm: The rbm which will be set to the required location
2332  * @ip: The gfs2 inode
2333  * @dinode: Flag to say if allocation includes a new inode
2334  *
2335  * This sets the starting point from the reservation if one is active
2336  * otherwise it falls back to guessing a start point based on the
2337  * inode's goal block or the last allocation point in the rgrp.
2338  */
2339 
2340 static void gfs2_set_alloc_start(struct gfs2_rbm *rbm,
2341 				 const struct gfs2_inode *ip, bool dinode)
2342 {
2343 	u64 goal;
2344 
2345 	if (gfs2_rs_active(&ip->i_res)) {
2346 		*rbm = ip->i_res.rs_rbm;
2347 		return;
2348 	}
2349 
2350 	if (!dinode && rgrp_contains_block(rbm->rgd, ip->i_goal))
2351 		goal = ip->i_goal;
2352 	else
2353 		goal = rbm->rgd->rd_last_alloc + rbm->rgd->rd_data0;
2354 
2355 	if (WARN_ON_ONCE(gfs2_rbm_from_block(rbm, goal))) {
2356 		rbm->bii = 0;
2357 		rbm->offset = 0;
2358 	}
2359 }
2360 
2361 /**
2362  * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode
2363  * @ip: the inode to allocate the block for
2364  * @bn: Used to return the starting block number
2365  * @nblocks: requested number of blocks/extent length (value/result)
2366  * @dinode: 1 if we're allocating a dinode block, else 0
2367  * @generation: the generation number of the inode
2368  *
2369  * Returns: 0 or error
2370  */
2371 
2372 int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
2373 		      bool dinode, u64 *generation)
2374 {
2375 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2376 	struct buffer_head *dibh;
2377 	struct gfs2_rbm rbm = { .rgd = ip->i_res.rs_rbm.rgd, };
2378 	unsigned int ndata;
2379 	u64 block; /* block, within the file system scope */
2380 	int error;
2381 
2382 	gfs2_set_alloc_start(&rbm, ip, dinode);
2383 	error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false);
2384 
2385 	if (error == -ENOSPC) {
2386 		gfs2_set_alloc_start(&rbm, ip, dinode);
2387 		error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false);
2388 	}
2389 
2390 	/* Since all blocks are reserved in advance, this shouldn't happen */
2391 	if (error) {
2392 		fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d fail_pt=%d\n",
2393 			(unsigned long long)ip->i_no_addr, error, *nblocks,
2394 			test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags),
2395 			rbm.rgd->rd_extfail_pt);
2396 		goto rgrp_error;
2397 	}
2398 
2399 	gfs2_alloc_extent(&rbm, dinode, nblocks);
2400 	block = gfs2_rbm_to_block(&rbm);
2401 	rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0;
2402 	if (gfs2_rs_active(&ip->i_res))
2403 		gfs2_adjust_reservation(ip, &rbm, *nblocks);
2404 	ndata = *nblocks;
2405 	if (dinode)
2406 		ndata--;
2407 
2408 	if (!dinode) {
2409 		ip->i_goal = block + ndata - 1;
2410 		error = gfs2_meta_inode_buffer(ip, &dibh);
2411 		if (error == 0) {
2412 			struct gfs2_dinode *di =
2413 				(struct gfs2_dinode *)dibh->b_data;
2414 			gfs2_trans_add_meta(ip->i_gl, dibh);
2415 			di->di_goal_meta = di->di_goal_data =
2416 				cpu_to_be64(ip->i_goal);
2417 			brelse(dibh);
2418 		}
2419 	}
2420 	if (rbm.rgd->rd_free < *nblocks) {
2421 		fs_warn(sdp, "nblocks=%u\n", *nblocks);
2422 		goto rgrp_error;
2423 	}
2424 
2425 	rbm.rgd->rd_free -= *nblocks;
2426 	if (dinode) {
2427 		rbm.rgd->rd_dinodes++;
2428 		*generation = rbm.rgd->rd_igeneration++;
2429 		if (*generation == 0)
2430 			*generation = rbm.rgd->rd_igeneration++;
2431 	}
2432 
2433 	gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh);
2434 	gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data);
2435 
2436 	gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0);
2437 	if (dinode)
2438 		gfs2_trans_remove_revoke(sdp, block, *nblocks);
2439 
2440 	gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid);
2441 
2442 	rbm.rgd->rd_free_clone -= *nblocks;
2443 	trace_gfs2_block_alloc(ip, rbm.rgd, block, *nblocks,
2444 			       dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
2445 	*bn = block;
2446 	return 0;
2447 
2448 rgrp_error:
2449 	gfs2_rgrp_error(rbm.rgd);
2450 	return -EIO;
2451 }
2452 
2453 /**
2454  * __gfs2_free_blocks - free a contiguous run of block(s)
2455  * @ip: the inode these blocks are being freed from
2456  * @rgd: the resource group the blocks are in
2457  * @bstart: first block of a run of contiguous blocks
2458  * @blen: the length of the block run
2459  * @meta: 1 if the blocks represent metadata
2460  *
2461  */
2462 
2463 void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
2464 			u64 bstart, u32 blen, int meta)
2465 {
2466 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2467 
2468 	rgblk_free(sdp, rgd, bstart, blen, GFS2_BLKST_FREE);
2469 	trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE);
2470 	rgd->rd_free += blen;
2471 	rgd->rd_flags &= ~GFS2_RGF_TRIMMED;
2472 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2473 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2474 
2475 	/* Directories keep their data in the metadata address space */
2476 	if (meta || ip->i_depth)
2477 		gfs2_meta_wipe(ip, bstart, blen);
2478 }
2479 
2480 /**
2481  * gfs2_free_meta - free a contiguous run of metadata block(s)
2482  * @ip: the inode these blocks are being freed from
2483  * @rgd: the resource group the blocks are in
2484  * @bstart: first block of a run of contiguous blocks
2485  * @blen: the length of the block run
2486  *
2487  */
2488 
2489 void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
2490 		    u64 bstart, u32 blen)
2491 {
2492 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2493 
2494 	__gfs2_free_blocks(ip, rgd, bstart, blen, 1);
2495 	gfs2_statfs_change(sdp, 0, +blen, 0);
2496 	gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
2497 }
2498 
2499 void gfs2_unlink_di(struct inode *inode)
2500 {
2501 	struct gfs2_inode *ip = GFS2_I(inode);
2502 	struct gfs2_sbd *sdp = GFS2_SB(inode);
2503 	struct gfs2_rgrpd *rgd;
2504 	u64 blkno = ip->i_no_addr;
2505 
2506 	rgd = gfs2_blk2rgrpd(sdp, blkno, true);
2507 	if (!rgd)
2508 		return;
2509 	rgblk_free(sdp, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
2510 	trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
2511 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2512 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2513 	be32_add_cpu(&rgd->rd_rgl->rl_unlinked, 1);
2514 }
2515 
2516 void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
2517 {
2518 	struct gfs2_sbd *sdp = rgd->rd_sbd;
2519 
2520 	rgblk_free(sdp, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
2521 	if (!rgd->rd_dinodes)
2522 		gfs2_consist_rgrpd(rgd);
2523 	rgd->rd_dinodes--;
2524 	rgd->rd_free++;
2525 
2526 	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2527 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2528 	be32_add_cpu(&rgd->rd_rgl->rl_unlinked, -1);
2529 
2530 	gfs2_statfs_change(sdp, 0, +1, -1);
2531 	trace_gfs2_block_alloc(ip, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
2532 	gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid);
2533 	gfs2_meta_wipe(ip, ip->i_no_addr, 1);
2534 }
2535 
2536 /**
2537  * gfs2_check_blk_type - Check the type of a block
2538  * @sdp: The superblock
2539  * @no_addr: The block number to check
2540  * @type: The block type we are looking for
2541  *
2542  * Returns: 0 if the block type matches the expected type
2543  *          -ESTALE if it doesn't match
2544  *          or -ve errno if something went wrong while checking
2545  */
2546 
2547 int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
2548 {
2549 	struct gfs2_rgrpd *rgd;
2550 	struct gfs2_holder rgd_gh;
2551 	struct gfs2_rbm rbm;
2552 	int error = -EINVAL;
2553 
2554 	rgd = gfs2_blk2rgrpd(sdp, no_addr, 1);
2555 	if (!rgd)
2556 		goto fail;
2557 
2558 	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
2559 	if (error)
2560 		goto fail;
2561 
2562 	rbm.rgd = rgd;
2563 	error = gfs2_rbm_from_block(&rbm, no_addr);
2564 	if (WARN_ON_ONCE(error))
2565 		goto fail;
2566 
2567 	if (gfs2_testbit(&rbm, false) != type)
2568 		error = -ESTALE;
2569 
2570 	gfs2_glock_dq_uninit(&rgd_gh);
2571 fail:
2572 	return error;
2573 }
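
/*
 * A minimal usage sketch (illustrative fragment, not from the original
 * file; sdp, no_addr and error are assumed to be in scope): before
 * trusting an on-disk inode number, a caller can confirm that the block
 * still holds a dinode:
 */
#if 0
	error = gfs2_check_blk_type(sdp, no_addr, GFS2_BLKST_DINODE);
	if (error == -ESTALE)	/* block was deallocated behind our back */
		return error;
#endif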
2574 
2575 /**
2576  * gfs2_rlist_add - add a RG to a list of RGs
2577  * @ip: the inode
2578  * @rlist: the list of resource groups
2579  * @block: the block
2580  *
2581  * Figure out what RG a block belongs to and add that RG to the list
2582  *
2583  * FIXME: Don't use NOFAIL
2584  *
2585  */
2586 
2587 void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
2588 		    u64 block)
2589 {
2590 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2591 	struct gfs2_rgrpd *rgd;
2592 	struct gfs2_rgrpd **tmp;
2593 	unsigned int new_space;
2594 	unsigned int x;
2595 
2596 	if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
2597 		return;
2598 
2599 	/*
2600 	 * The resource group last accessed is kept in the last position.
2601 	 */
2602 
2603 	if (rlist->rl_rgrps) {
2604 		rgd = rlist->rl_rgd[rlist->rl_rgrps - 1];
2605 		if (rgrp_contains_block(rgd, block))
2606 			return;
2607 		rgd = gfs2_blk2rgrpd(sdp, block, 1);
2608 	} else {
2609 		rgd = ip->i_res.rs_rbm.rgd;
2610 		if (!rgd || !rgrp_contains_block(rgd, block))
2611 			rgd = gfs2_blk2rgrpd(sdp, block, 1);
2612 	}
2613 
2614 	if (!rgd) {
2615 		fs_err(sdp, "rlist_add: no rgrp for block %llu\n",
2616 		       (unsigned long long)block);
2617 		return;
2618 	}
2619 
2620 	for (x = 0; x < rlist->rl_rgrps; x++) {
2621 		if (rlist->rl_rgd[x] == rgd) {
2622 			swap(rlist->rl_rgd[x],
2623 			     rlist->rl_rgd[rlist->rl_rgrps - 1]);
2624 			return;
2625 		}
2626 	}
2627 
2628 	if (rlist->rl_rgrps == rlist->rl_space) {
2629 		new_space = rlist->rl_space + 10;
2630 
2631 		tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
2632 			      GFP_NOFS | __GFP_NOFAIL);
2633 
2634 		if (rlist->rl_rgd) {
2635 			memcpy(tmp, rlist->rl_rgd,
2636 			       rlist->rl_space * sizeof(struct gfs2_rgrpd *));
2637 			kfree(rlist->rl_rgd);
2638 		}
2639 
2640 		rlist->rl_space = new_space;
2641 		rlist->rl_rgd = tmp;
2642 	}
2643 
2644 	rlist->rl_rgd[rlist->rl_rgrps++] = rgd;
2645 }
2646 
2647 /**
2648  * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate
2649  *      and initialize an array of glock holders for them
2650  * @rlist: the list of resource groups
2651  *
2652  * FIXME: Don't use NOFAIL
2653  *
2654  */
2655 
2656 void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist)
2657 {
2658 	unsigned int x;
2659 
2660 	rlist->rl_ghs = kmalloc_array(rlist->rl_rgrps,
2661 				      sizeof(struct gfs2_holder),
2662 				      GFP_NOFS | __GFP_NOFAIL);
2663 	for (x = 0; x < rlist->rl_rgrps; x++)
2664 		gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
2665 				LM_ST_EXCLUSIVE, 0,
2666 				&rlist->rl_ghs[x]);
2667 }
2668 
2669 /**
2670  * gfs2_rlist_free - free a resource group list
2671  * @rlist: the list of resource groups
2672  *
2673  */
2674 
2675 void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
2676 {
2677 	unsigned int x;
2678 
2679 	kfree(rlist->rl_rgd);
2680 
2681 	if (rlist->rl_ghs) {
2682 		for (x = 0; x < rlist->rl_rgrps; x++)
2683 			gfs2_holder_uninit(&rlist->rl_ghs[x]);
2684 		kfree(rlist->rl_ghs);
2685 		rlist->rl_ghs = NULL;
2686 	}
2687 }
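
/*
 * A minimal sketch (illustrative, not part of the original file; ip and
 * block are assumed to be in scope) of the rlist life cycle used by the
 * deallocation paths: collect the rgrps covering a set of blocks, take
 * all of their glocks, do the work, then tear everything down.
 */
#if 0
	struct gfs2_rgrp_list rlist = { .rl_rgrps = 0, };
	int error;

	gfs2_rlist_add(ip, &rlist, block);	/* once per block to free */
	gfs2_rlist_alloc(&rlist);		/* build the holder array */
	error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
	if (!error) {
		/* ... free blocks while holding the rgrp glocks ... */
		gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
	}
	gfs2_rlist_free(&rlist);
#endif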
2688 
2689