xref: /openbmc/linux/fs/gfs2/rgrp.c (revision 9b5a7f4a2a8dcda461f9c7a6671150f4a8a902e8)
1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
4   * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
5   */
6  
7  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
8  
9  #include <linux/slab.h>
10  #include <linux/spinlock.h>
11  #include <linux/completion.h>
12  #include <linux/buffer_head.h>
13  #include <linux/fs.h>
14  #include <linux/gfs2_ondisk.h>
15  #include <linux/prefetch.h>
16  #include <linux/blkdev.h>
17  #include <linux/rbtree.h>
18  #include <linux/random.h>
19  
20  #include "gfs2.h"
21  #include "incore.h"
22  #include "glock.h"
23  #include "glops.h"
24  #include "lops.h"
25  #include "meta_io.h"
26  #include "quota.h"
27  #include "rgrp.h"
28  #include "super.h"
29  #include "trans.h"
30  #include "util.h"
31  #include "log.h"
32  #include "inode.h"
33  #include "trace_gfs2.h"
34  #include "dir.h"
35  
36  #define BFITNOENT ((u32)~0)
37  #define NO_BLOCK ((u64)~0)
38  
39  struct gfs2_rbm {
40  	struct gfs2_rgrpd *rgd;
41  	u32 offset;		/* The offset is bitmap relative */
42  	int bii;		/* Bitmap index */
43  };
44  
45  static inline struct gfs2_bitmap *rbm_bi(const struct gfs2_rbm *rbm)
46  {
47  	return rbm->rgd->rd_bits + rbm->bii;
48  }
49  
50  static inline u64 gfs2_rbm_to_block(const struct gfs2_rbm *rbm)
51  {
52  	BUG_ON(rbm->offset >= rbm->rgd->rd_data);
53  	return rbm->rgd->rd_data0 + (rbm_bi(rbm)->bi_start * GFS2_NBBY) +
54  		rbm->offset;
55  }
56  
57  /*
58   * These routines keep track of block allocation in the resource
59   * group bitmaps.  Each block is represented by two
60   * bits.  So, each byte represents GFS2_NBBY (i.e. 4) blocks.
61   *
62   * 0 = Free
63   * 1 = Used (not metadata)
64   * 2 = Unlinked (still in use) inode
65   * 3 = Used (metadata)
66   */
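
/*
 * Worked example (illustrative only): a bitmap byte of 0xe4 (binary
 * 11 10 01 00) describes four consecutive blocks, lowest bit-pair first:
 *
 *   bits 0-1 = 00 -> GFS2_BLKST_FREE
 *   bits 2-3 = 01 -> GFS2_BLKST_USED
 *   bits 4-5 = 10 -> GFS2_BLKST_UNLINKED
 *   bits 6-7 = 11 -> GFS2_BLKST_DINODE
 */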
67  
68  struct gfs2_extent {
69  	struct gfs2_rbm rbm;
70  	u32 len;
71  };
72  
73  static const char valid_change[16] = {
74  	        /* current */
75  	/* n */ 0, 1, 1, 1,
76  	/* e */ 1, 0, 0, 0,
77  	/* w */ 0, 0, 0, 1,
78  	        1, 0, 0, 0
79  };
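
/*
 * Note (illustrative): valid_change is indexed as
 * valid_change[new_state * 4 + cur_state]; see gfs2_setbit() below.
 * For example, freeing a used block (cur_state GFS2_BLKST_USED,
 * new_state GFS2_BLKST_FREE) looks up valid_change[0 * 4 + 1] == 1 and
 * is allowed, whereas a free-to-free "change" looks up
 * valid_change[0] == 0 and is rejected as filesystem inconsistency.
 */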
80  
81  static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
82  			 struct gfs2_blkreserv *rs, bool nowrap);
83  
84  
85  /**
86   * gfs2_setbit - Set a bit in the bitmaps
87   * @rbm: The position of the bit to set
88   * @do_clone: Also set the clone bitmap, if it exists
89   * @new_state: the new state of the block
90   *
91   */
92  
93  static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
94  			       unsigned char new_state)
95  {
96  	unsigned char *byte1, *byte2, *end, cur_state;
97  	struct gfs2_bitmap *bi = rbm_bi(rbm);
98  	unsigned int buflen = bi->bi_bytes;
99  	const unsigned int bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
100  
101  	byte1 = bi->bi_bh->b_data + bi->bi_offset + (rbm->offset / GFS2_NBBY);
102  	end = bi->bi_bh->b_data + bi->bi_offset + buflen;
103  
104  	BUG_ON(byte1 >= end);
105  
106  	cur_state = (*byte1 >> bit) & GFS2_BIT_MASK;
107  
108  	if (unlikely(!valid_change[new_state * 4 + cur_state])) {
109  		struct gfs2_sbd *sdp = rbm->rgd->rd_sbd;
110  
111  		fs_warn(sdp, "buf_blk = 0x%x old_state=%d, new_state=%d\n",
112  			rbm->offset, cur_state, new_state);
113  		fs_warn(sdp, "rgrp=0x%llx bi_start=0x%x biblk: 0x%llx\n",
114  			(unsigned long long)rbm->rgd->rd_addr, bi->bi_start,
115  			(unsigned long long)bi->bi_bh->b_blocknr);
116  		fs_warn(sdp, "bi_offset=0x%x bi_bytes=0x%x block=0x%llx\n",
117  			bi->bi_offset, bi->bi_bytes,
118  			(unsigned long long)gfs2_rbm_to_block(rbm));
119  		dump_stack();
120  		gfs2_consist_rgrpd(rbm->rgd);
121  		return;
122  	}
123  	*byte1 ^= (cur_state ^ new_state) << bit;
124  
125  	if (do_clone && bi->bi_clone) {
126  		byte2 = bi->bi_clone + bi->bi_offset + (rbm->offset / GFS2_NBBY);
127  		cur_state = (*byte2 >> bit) & GFS2_BIT_MASK;
128  		*byte2 ^= (cur_state ^ new_state) << bit;
129  	}
130  }
131  
132  /**
133   * gfs2_testbit - test a bit in the bitmaps
134   * @rbm: The bit to test
135   * @use_clone: If true, test the clone bitmap, not the official bitmap.
136   *
137   * Some callers like gfs2_unaligned_extlen need to test the clone bitmaps,
138   * not the "real" bitmaps, to avoid allocating recently freed blocks.
139   *
140   * Returns: The two bit block state of the requested bit
141   */
142  
143  static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm, bool use_clone)
144  {
145  	struct gfs2_bitmap *bi = rbm_bi(rbm);
146  	const u8 *buffer;
147  	const u8 *byte;
148  	unsigned int bit;
149  
150  	if (use_clone && bi->bi_clone)
151  		buffer = bi->bi_clone;
152  	else
153  		buffer = bi->bi_bh->b_data;
154  	buffer += bi->bi_offset;
155  	byte = buffer + (rbm->offset / GFS2_NBBY);
156  	bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
157  
158  	return (*byte >> bit) & GFS2_BIT_MASK;
159  }
160  
161  /**
162   * gfs2_bit_search
163   * @ptr: Pointer to bitmap data
164   * @mask: Mask to use (normally 0x55555.... but adjusted for search start)
165   * @state: The state we are searching for
166   *
167   * We xor the bitmap data with a pattern which is the bitwise opposite
168   * of what we are looking for; this gives rise to a pattern of ones
169   * wherever there is a match. Since we have two bits per entry, we
170   * take this pattern, shift it down by one place and then AND it with
171   * the original. All the even bit positions (0,2,4, etc) then represent
172   * successful matches, so we mask with 0x55555..... to remove the unwanted
173   * odd bit positions.
174   *
175   * This allows searching of a whole u64 at once (32 blocks) with a
176   * single test (on 64 bit arches).
177   */
178  
179  static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
180  {
181  	u64 tmp;
182  	static const u64 search[] = {
183  		[0] = 0xffffffffffffffffULL,
184  		[1] = 0xaaaaaaaaaaaaaaaaULL,
185  		[2] = 0x5555555555555555ULL,
186  		[3] = 0x0000000000000000ULL,
187  	};
188  	tmp = le64_to_cpu(*ptr) ^ search[state];
189  	tmp &= (tmp >> 1);
190  	tmp &= mask;
191  	return tmp;
192  }
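
/*
 * Worked example for gfs2_bit_search() (illustrative): searching for
 * GFS2_BLKST_UNLINKED (state 2, search pattern 0x55...) in a word whose
 * low byte is 0xe4 (blocks 3..0 = dinode, unlinked, used, free), and
 * considering that byte in isolation:
 *
 *   0xe4 ^ 0x55        = 0xb1   (matching bit-pairs become 11)
 *   0xb1 & (0xb1 >> 1) = 0x10   (only pairs that were 11 survive)
 *   0x10 & 0x55        = 0x10   (keep even bit positions only)
 *
 * gfs2_bitfit() then uses __ffs64() to find bit 4, and 4 / 2 = 2,
 * i.e. block 2 is in the unlinked state.
 */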
193  
194  /**
195   * rs_cmp - multi-block reservation range compare
196   * @start: start of the new reservation
197   * @len: number of blocks in the new reservation
198   * @rs: existing reservation to compare against
199   *
200   * returns: 1 if the block range is beyond the reach of the reservation
201   *         -1 if the block range is before the start of the reservation
202   *          0 if the block range overlaps with the reservation
203   */
204  static inline int rs_cmp(u64 start, u32 len, struct gfs2_blkreserv *rs)
205  {
206  	if (start >= rs->rs_start + rs->rs_requested)
207  		return 1;
208  	if (rs->rs_start >= start + len)
209  		return -1;
210  	return 0;
211  }
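
/*
 * Example (illustrative): a reservation with rs_start = 100 and
 * rs_requested = 10 covers blocks 100..109, so rs_cmp(110, 5, rs)
 * returns 1 (entirely after), rs_cmp(90, 10, rs) returns -1 (entirely
 * before), and rs_cmp(95, 10, rs) returns 0 (overlap).
 */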
212  
213  /**
214   * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
215   *       a block in a given allocation state.
216   * @buf: the buffer that holds the bitmaps
217   * @len: the length (in bytes) of the buffer
218   * @goal: start search at this block's bit-pair (within @buf)
219   * @state: GFS2_BLKST_XXX the state of the block we're looking for.
220   *
221   * Scope of @goal and returned block number is only within this bitmap buffer,
222   * not the entire rgrp or filesystem.  @buf will be offset from the actual
223   * beginning of a bitmap block buffer, skipping any header structures, but
224   * headers are always a multiple of 64 bits long so that the buffer is
225   * always aligned to a 64 bit boundary.
226   *
227   * The size of the buffer is in bytes, but it is assumed that it is
228   * always ok to read a complete multiple of 64 bits at the end
229   * of the block in case the end is not aligned to a natural boundary.
230   *
231   * Return: the block number (bitmap buffer scope) that was found, or BFITNOENT
232   */
233  
234  static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
235  		       u32 goal, u8 state)
236  {
237  	u32 spoint = (goal << 1) & ((8*sizeof(u64)) - 1);
238  	const __le64 *ptr = ((__le64 *)buf) + (goal >> 5);
239  	const __le64 *end = (__le64 *)(buf + ALIGN(len, sizeof(u64)));
240  	u64 tmp;
241  	u64 mask = 0x5555555555555555ULL;
242  	u32 bit;
243  
244  	/* Mask off bits we don't care about at the start of the search */
245  	mask <<= spoint;
246  	tmp = gfs2_bit_search(ptr, mask, state);
247  	ptr++;
248  	while(tmp == 0 && ptr < end) {
249  		tmp = gfs2_bit_search(ptr, 0x5555555555555555ULL, state);
250  		ptr++;
251  	}
252  	/* Mask off any bits which are more than len bytes from the start */
253  	if (ptr == end && (len & (sizeof(u64) - 1)))
254  		tmp &= (((u64)~0) >> (64 - 8*(len & (sizeof(u64) - 1))));
255  	/* Didn't find anything, so return */
256  	if (tmp == 0)
257  		return BFITNOENT;
258  	ptr--;
259  	bit = __ffs64(tmp);
260  	bit /= 2;	/* two bits per entry in the bitmap */
261  	return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit;
262  }
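
/*
 * Illustrative call: with a two-byte bitmap buf[] = { 0xff, 0xe4 } and
 * len = 2, gfs2_bitfit(buf, 2, 0, GFS2_BLKST_FREE) reads one whole u64,
 * masks off the six bytes beyond @len, skips the four dinode blocks in
 * byte 0, and returns 4: the first free block, encoded in the low
 * bit-pair of byte 1.
 */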
263  
264  /**
265   * gfs2_rbm_from_block - Set the rbm based upon rgd and block number
266   * @rbm: The rbm with rgd already set correctly
267   * @block: The block number (filesystem relative)
268   *
269   * This sets the bi and offset members of an rbm based on a
270   * resource group and a filesystem relative block number. The
271   * resource group must be set in the rbm on entry, the bi and
272   * offset members will be set by this function.
273   *
274   * Returns: 0 on success, or an error code
275   */
276  
277  static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
278  {
279  	if (!rgrp_contains_block(rbm->rgd, block))
280  		return -E2BIG;
281  	rbm->bii = 0;
282  	rbm->offset = block - rbm->rgd->rd_data0;
283  	/* Check if the block is within the first bitmap block */
284  	if (rbm->offset < rbm_bi(rbm)->bi_blocks)
285  		return 0;
286  
287  	/* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */
288  	rbm->offset += (sizeof(struct gfs2_rgrp) -
289  			sizeof(struct gfs2_meta_header)) * GFS2_NBBY;
290  	rbm->bii = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
291  	rbm->offset -= rbm->bii * rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
292  	return 0;
293  }
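
/*
 * Note (illustrative): every bitmap block except the first holds
 * sd_blocks_per_bitmap entries; the first holds fewer because struct
 * gfs2_rgrp is larger than struct gfs2_meta_header.  Padding the offset
 * by that size difference (converted to block entries) makes all
 * bitmaps look the same size, so one division by sd_blocks_per_bitmap
 * yields bii, and the remainder yields the offset.
 */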
294  
295  /**
296   * gfs2_rbm_add - add a number of blocks to an rbm
297   * @rbm: The rbm with rgd already set correctly
298   * @blocks: The number of blocks to add to rbm
299   *
300   * This function takes an existing rbm structure and adds a number of blocks to
301   * it.
302   *
303   * Returns: True if the new rbm would point past the end of the rgrp.
304   */
305  
306  static bool gfs2_rbm_add(struct gfs2_rbm *rbm, u32 blocks)
307  {
308  	struct gfs2_rgrpd *rgd = rbm->rgd;
309  	struct gfs2_bitmap *bi = rgd->rd_bits + rbm->bii;
310  
311  	if (rbm->offset + blocks < bi->bi_blocks) {
312  		rbm->offset += blocks;
313  		return false;
314  	}
315  	blocks -= bi->bi_blocks - rbm->offset;
316  
317  	for(;;) {
318  		bi++;
319  		if (bi == rgd->rd_bits + rgd->rd_length)
320  			return true;
321  		if (blocks < bi->bi_blocks) {
322  			rbm->offset = blocks;
323  			rbm->bii = bi - rgd->rd_bits;
324  			return false;
325  		}
326  		blocks -= bi->bi_blocks;
327  	}
328  }
329  
330  /**
331   * gfs2_unaligned_extlen - Look for free blocks which are not byte aligned
332   * @rbm: Position to search (value/result)
333   * @n_unaligned: Number of unaligned blocks to check
334   * @len: Decremented for each block found (terminate on zero)
335   *
336   * Returns: true if a non-free block is encountered or the end of the resource
337   *	    group is reached.
338   */
339  
340  static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *len)
341  {
342  	u32 n;
343  	u8 res;
344  
345  	for (n = 0; n < n_unaligned; n++) {
346  		res = gfs2_testbit(rbm, true);
347  		if (res != GFS2_BLKST_FREE)
348  			return true;
349  		(*len)--;
350  		if (*len == 0)
351  			return true;
352  		if (gfs2_rbm_add(rbm, 1))
353  			return true;
354  	}
355  
356  	return false;
357  }
358  
359  /**
360   * gfs2_free_extlen - Return extent length of free blocks
361   * @rrbm: Starting position
362   * @len: Max length to check
363   *
364   * Starting at the block specified by the rbm, see how many free blocks
365   * there are, not reading more than len blocks ahead. This can be done
366   * using memchr_inv when the blocks are byte aligned, but has to be done
367   * on a block by block basis in case of unaligned blocks. Also this
368   * function can cope with bitmap boundaries (although it must stop on
369   * a resource group boundary)
370   *
371   * Returns: Number of free blocks in the extent
372   */
373  
374  static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
375  {
376  	struct gfs2_rbm rbm = *rrbm;
377  	u32 n_unaligned = rbm.offset & 3;
378  	u32 size = len;
379  	u32 bytes;
380  	u32 chunk_size;
381  	u8 *ptr, *start, *end;
382  	u64 block;
383  	struct gfs2_bitmap *bi;
384  
385  	if (n_unaligned &&
386  	    gfs2_unaligned_extlen(&rbm, 4 - n_unaligned, &len))
387  		goto out;
388  
389  	n_unaligned = len & 3;
390  	/* Start is now byte aligned */
391  	while (len > 3) {
392  		bi = rbm_bi(&rbm);
393  		start = bi->bi_bh->b_data;
394  		if (bi->bi_clone)
395  			start = bi->bi_clone;
396  		start += bi->bi_offset;
397  		end = start + bi->bi_bytes;
398  		BUG_ON(rbm.offset & 3);
399  		start += (rbm.offset / GFS2_NBBY);
400  		bytes = min_t(u32, len / GFS2_NBBY, (end - start));
401  		ptr = memchr_inv(start, 0, bytes);
402  		chunk_size = ((ptr == NULL) ? bytes : (ptr - start));
403  		chunk_size *= GFS2_NBBY;
404  		BUG_ON(len < chunk_size);
405  		len -= chunk_size;
406  		block = gfs2_rbm_to_block(&rbm);
407  		if (gfs2_rbm_from_block(&rbm, block + chunk_size)) {
408  			n_unaligned = 0;
409  			break;
410  		}
411  		if (ptr) {
412  			n_unaligned = 3;
413  			break;
414  		}
415  		n_unaligned = len & 3;
416  	}
417  
418  	/* Deal with any bits left over at the end */
419  	if (n_unaligned)
420  		gfs2_unaligned_extlen(&rbm, n_unaligned, &len);
421  out:
422  	return size - len;
423  }
424  
425  /**
426   * gfs2_bitcount - count the number of bits in a certain state
427   * @rgd: the resource group descriptor
428   * @buffer: the buffer that holds the bitmaps
429   * @buflen: the length (in bytes) of the buffer
430   * @state: the state of the block we're looking for
431   *
432   * Returns: The number of bits
433   */
434  
435  static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, const u8 *buffer,
436  			 unsigned int buflen, u8 state)
437  {
438  	const u8 *byte = buffer;
439  	const u8 *end = buffer + buflen;
440  	const u8 state1 = state << 2;
441  	const u8 state2 = state << 4;
442  	const u8 state3 = state << 6;
443  	u32 count = 0;
444  
445  	for (; byte < end; byte++) {
446  		if (((*byte) & 0x03) == state)
447  			count++;
448  		if (((*byte) & 0x0C) == state1)
449  			count++;
450  		if (((*byte) & 0x30) == state2)
451  			count++;
452  		if (((*byte) & 0xC0) == state3)
453  			count++;
454  	}
455  
456  	return count;
457  }
458  
459  /**
460   * gfs2_rgrp_verify - Verify that a resource group is consistent
461   * @rgd: the rgrp
462   *
463   */
464  
465  void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
466  {
467  	struct gfs2_sbd *sdp = rgd->rd_sbd;
468  	struct gfs2_bitmap *bi = NULL;
469  	u32 length = rgd->rd_length;
470  	u32 count[4], tmp;
471  	int buf, x;
472  
473  	memset(count, 0, 4 * sizeof(u32));
474  
475  	/* Count # blocks in each of 4 possible allocation states */
476  	for (buf = 0; buf < length; buf++) {
477  		bi = rgd->rd_bits + buf;
478  		for (x = 0; x < 4; x++)
479  			count[x] += gfs2_bitcount(rgd,
480  						  bi->bi_bh->b_data +
481  						  bi->bi_offset,
482  						  bi->bi_bytes, x);
483  	}
484  
485  	if (count[0] != rgd->rd_free) {
486  		gfs2_lm(sdp, "free data mismatch:  %u != %u\n",
487  			count[0], rgd->rd_free);
488  		gfs2_consist_rgrpd(rgd);
489  		return;
490  	}
491  
492  	tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
493  	if (count[1] != tmp) {
494  		gfs2_lm(sdp, "used data mismatch:  %u != %u\n",
495  			count[1], tmp);
496  		gfs2_consist_rgrpd(rgd);
497  		return;
498  	}
499  
500  	if (count[2] + count[3] != rgd->rd_dinodes) {
501  		gfs2_lm(sdp, "used metadata mismatch:  %u != %u\n",
502  			count[2] + count[3], rgd->rd_dinodes);
503  		gfs2_consist_rgrpd(rgd);
504  		return;
505  	}
506  }
507  
508  /**
509   * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
510   * @sdp: The GFS2 superblock
511   * @blk: The data block number
512   * @exact: True if this needs to be an exact match
513   *
514   * The @exact argument should be set to true by most callers. The exception
515   * is when we need to match blocks which are not represented by the rgrp
516   * bitmap, but which are part of the rgrp (i.e. padding blocks that exist
517   * for alignment purposes). Another way of looking at it is that @exact
518   * matches only valid data/metadata blocks, but with @exact false, it will
519   * match any block within the extent of the rgrp.
520   *
521   * Returns: The resource group, or NULL if not found
522   */
523  
524  struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact)
525  {
526  	struct rb_node *n, *next;
527  	struct gfs2_rgrpd *cur;
528  
529  	spin_lock(&sdp->sd_rindex_spin);
530  	n = sdp->sd_rindex_tree.rb_node;
531  	while (n) {
532  		cur = rb_entry(n, struct gfs2_rgrpd, rd_node);
533  		next = NULL;
534  		if (blk < cur->rd_addr)
535  			next = n->rb_left;
536  		else if (blk >= cur->rd_data0 + cur->rd_data)
537  			next = n->rb_right;
538  		if (next == NULL) {
539  			spin_unlock(&sdp->sd_rindex_spin);
540  			if (exact) {
541  				if (blk < cur->rd_addr)
542  					return NULL;
543  				if (blk >= cur->rd_data0 + cur->rd_data)
544  					return NULL;
545  			}
546  			return cur;
547  		}
548  		n = next;
549  	}
550  	spin_unlock(&sdp->sd_rindex_spin);
551  
552  	return NULL;
553  }
554  
555  /**
556   * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem
557   * @sdp: The GFS2 superblock
558   *
559   * Returns: The first rgrp in the filesystem
560   */
561  
562  struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
563  {
564  	const struct rb_node *n;
565  	struct gfs2_rgrpd *rgd;
566  
567  	spin_lock(&sdp->sd_rindex_spin);
568  	n = rb_first(&sdp->sd_rindex_tree);
569  	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
570  	spin_unlock(&sdp->sd_rindex_spin);
571  
572  	return rgd;
573  }
574  
575  /**
576   * gfs2_rgrpd_get_next - get the next RG
577   * @rgd: the resource group descriptor
578   *
579   * Returns: The next rgrp
580   */
581  
582  struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
583  {
584  	struct gfs2_sbd *sdp = rgd->rd_sbd;
585  	const struct rb_node *n;
586  
587  	spin_lock(&sdp->sd_rindex_spin);
588  	n = rb_next(&rgd->rd_node);
589  	if (n == NULL)
590  		n = rb_first(&sdp->sd_rindex_tree);
591  
592  	if (unlikely(&rgd->rd_node == n)) {
593  		spin_unlock(&sdp->sd_rindex_spin);
594  		return NULL;
595  	}
596  	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
597  	spin_unlock(&sdp->sd_rindex_spin);
598  	return rgd;
599  }
600  
601  void check_and_update_goal(struct gfs2_inode *ip)
602  {
603  	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
604  	if (!ip->i_goal || gfs2_blk2rgrpd(sdp, ip->i_goal, 1) == NULL)
605  		ip->i_goal = ip->i_no_addr;
606  }
607  
608  void gfs2_free_clones(struct gfs2_rgrpd *rgd)
609  {
610  	int x;
611  
612  	for (x = 0; x < rgd->rd_length; x++) {
613  		struct gfs2_bitmap *bi = rgd->rd_bits + x;
614  		kfree(bi->bi_clone);
615  		bi->bi_clone = NULL;
616  	}
617  }
618  
619  static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs,
620  		    const char *fs_id_buf)
621  {
622  	struct gfs2_inode *ip = container_of(rs, struct gfs2_inode, i_res);
623  
624  	gfs2_print_dbg(seq, "%s  B: n:%llu s:%llu f:%u\n",
625  		       fs_id_buf,
626  		       (unsigned long long)ip->i_no_addr,
627  		       (unsigned long long)rs->rs_start,
628  		       rs->rs_requested);
629  }
630  
631  /**
632   * __rs_deltree - remove a multi-block reservation from the rgd tree
633   * @rs: The reservation to remove
634   *
635   */
636  static void __rs_deltree(struct gfs2_blkreserv *rs)
637  {
638  	struct gfs2_rgrpd *rgd;
639  
640  	if (!gfs2_rs_active(rs))
641  		return;
642  
643  	rgd = rs->rs_rgd;
644  	trace_gfs2_rs(rs, TRACE_RS_TREEDEL);
645  	rb_erase(&rs->rs_node, &rgd->rd_rstree);
646  	RB_CLEAR_NODE(&rs->rs_node);
647  
648  	if (rs->rs_requested) {
649  		/* return requested blocks to the rgrp */
650  		BUG_ON(rs->rs_rgd->rd_requested < rs->rs_requested);
651  		rs->rs_rgd->rd_requested -= rs->rs_requested;
652  
653  		/* The rgrp extent failure point is likely not to increase;
654  		   it will only do so if the freed blocks are somehow
655  		   contiguous with a span of free blocks that follows. Still,
656  		   it will force the number to be recalculated later. */
657  		rgd->rd_extfail_pt += rs->rs_requested;
658  		rs->rs_requested = 0;
659  	}
660  }
661  
662  /**
663   * gfs2_rs_deltree - remove a multi-block reservation from the rgd tree
664   * @rs: The reservation to remove
665   *
666   */
667  void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
668  {
669  	struct gfs2_rgrpd *rgd;
670  
671  	rgd = rs->rs_rgd;
672  	if (rgd) {
673  		spin_lock(&rgd->rd_rsspin);
674  		__rs_deltree(rs);
675  		BUG_ON(rs->rs_requested);
676  		spin_unlock(&rgd->rd_rsspin);
677  	}
678  }
679  
680  /**
681   * gfs2_rs_delete - delete a multi-block reservation
682   * @ip: The inode for this reservation
683   *
684   */
685  void gfs2_rs_delete(struct gfs2_inode *ip)
686  {
687  	struct inode *inode = &ip->i_inode;
688  
689  	down_write(&ip->i_rw_mutex);
690  	if (atomic_read(&inode->i_writecount) <= 1)
691  		gfs2_rs_deltree(&ip->i_res);
692  	up_write(&ip->i_rw_mutex);
693  }
694  
695  /**
696   * return_all_reservations - return all reserved blocks back to the rgrp.
697   * @rgd: the rgrp that needs its space back
698   *
699   * We previously reserved a bunch of blocks for allocation. Now we need to
700   * give them back. This leaves the reservation structures intact, but removes
701   * all of their corresponding "no-fly zones".
702   */
703  static void return_all_reservations(struct gfs2_rgrpd *rgd)
704  {
705  	struct rb_node *n;
706  	struct gfs2_blkreserv *rs;
707  
708  	spin_lock(&rgd->rd_rsspin);
709  	while ((n = rb_first(&rgd->rd_rstree))) {
710  		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
711  		__rs_deltree(rs);
712  	}
713  	spin_unlock(&rgd->rd_rsspin);
714  }
715  
716  void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
717  {
718  	struct rb_node *n;
719  	struct gfs2_rgrpd *rgd;
720  	struct gfs2_glock *gl;
721  
722  	while ((n = rb_first(&sdp->sd_rindex_tree))) {
723  		rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
724  		gl = rgd->rd_gl;
725  
726  		rb_erase(n, &sdp->sd_rindex_tree);
727  
728  		if (gl) {
729  			if (gl->gl_state != LM_ST_UNLOCKED) {
730  				gfs2_glock_cb(gl, LM_ST_UNLOCKED);
731  				flush_delayed_work(&gl->gl_work);
732  			}
733  			gfs2_rgrp_brelse(rgd);
734  			glock_clear_object(gl, rgd);
735  			gfs2_glock_put(gl);
736  		}
737  
738  		gfs2_free_clones(rgd);
739  		return_all_reservations(rgd);
740  		kfree(rgd->rd_bits);
741  		rgd->rd_bits = NULL;
742  		kmem_cache_free(gfs2_rgrpd_cachep, rgd);
743  	}
744  }
745  
746  /**
747   * compute_bitstructs - Compute the bitmap sizes
748   * @rgd: The resource group descriptor
749   *
750   * Calculates bitmap descriptors, one for each block that contains bitmap data
751   *
752   * Returns: errno
753   */
754  
755  static int compute_bitstructs(struct gfs2_rgrpd *rgd)
756  {
757  	struct gfs2_sbd *sdp = rgd->rd_sbd;
758  	struct gfs2_bitmap *bi;
759  	u32 length = rgd->rd_length; /* # blocks in hdr & bitmap */
760  	u32 bytes_left, bytes;
761  	int x;
762  
763  	if (!length)
764  		return -EINVAL;
765  
766  	rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_NOFS);
767  	if (!rgd->rd_bits)
768  		return -ENOMEM;
769  
770  	bytes_left = rgd->rd_bitbytes;
771  
772  	for (x = 0; x < length; x++) {
773  		bi = rgd->rd_bits + x;
774  
775  		bi->bi_flags = 0;
776  		/* small rgrp; bitmap stored completely in header block */
777  		if (length == 1) {
778  			bytes = bytes_left;
779  			bi->bi_offset = sizeof(struct gfs2_rgrp);
780  			bi->bi_start = 0;
781  			bi->bi_bytes = bytes;
782  			bi->bi_blocks = bytes * GFS2_NBBY;
783  		/* header block */
784  		} else if (x == 0) {
785  			bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
786  			bi->bi_offset = sizeof(struct gfs2_rgrp);
787  			bi->bi_start = 0;
788  			bi->bi_bytes = bytes;
789  			bi->bi_blocks = bytes * GFS2_NBBY;
790  		/* last block */
791  		} else if (x + 1 == length) {
792  			bytes = bytes_left;
793  			bi->bi_offset = sizeof(struct gfs2_meta_header);
794  			bi->bi_start = rgd->rd_bitbytes - bytes_left;
795  			bi->bi_bytes = bytes;
796  			bi->bi_blocks = bytes * GFS2_NBBY;
797  		/* other blocks */
798  		} else {
799  			bytes = sdp->sd_sb.sb_bsize -
800  				sizeof(struct gfs2_meta_header);
801  			bi->bi_offset = sizeof(struct gfs2_meta_header);
802  			bi->bi_start = rgd->rd_bitbytes - bytes_left;
803  			bi->bi_bytes = bytes;
804  			bi->bi_blocks = bytes * GFS2_NBBY;
805  		}
806  
807  		bytes_left -= bytes;
808  	}
809  
810  	if (bytes_left) {
811  		gfs2_consist_rgrpd(rgd);
812  		return -EIO;
813  	}
814  	bi = rgd->rd_bits + (length - 1);
815  	if ((bi->bi_start + bi->bi_bytes) * GFS2_NBBY != rgd->rd_data) {
816  		gfs2_lm(sdp,
817  			"ri_addr = %llu\n"
818  			"ri_length = %u\n"
819  			"ri_data0 = %llu\n"
820  			"ri_data = %u\n"
821  			"ri_bitbytes = %u\n"
822  			"start=%u len=%u offset=%u\n",
823  			(unsigned long long)rgd->rd_addr,
824  			rgd->rd_length,
825  			(unsigned long long)rgd->rd_data0,
826  			rgd->rd_data,
827  			rgd->rd_bitbytes,
828  			bi->bi_start, bi->bi_bytes, bi->bi_offset);
829  		gfs2_consist_rgrpd(rgd);
830  		return -EIO;
831  	}
832  
833  	return 0;
834  }
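
/*
 * Layout sketch (illustrative, assuming 4KiB blocks): an rgrp with
 * rd_bitbytes = 9000 bytes of bitmap data spans rd_length = 3 blocks:
 *
 *   x = 0 (header): bi_offset = sizeof(struct gfs2_rgrp),
 *                   bi_bytes  = 4096 - sizeof(struct gfs2_rgrp)
 *   x = 1 (middle): bi_offset = sizeof(struct gfs2_meta_header),
 *                   bi_bytes  = 4096 - sizeof(struct gfs2_meta_header)
 *   x = 2 (last):   bi_offset = sizeof(struct gfs2_meta_header),
 *                   bi_bytes  = whatever of the 9000 bytes remains
 *
 * bi_start is the number of bitmap bytes held in earlier blocks, and
 * bi_blocks = bi_bytes * GFS2_NBBY.
 */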
835  
836  /**
837   * gfs2_ri_total - Total up the file system space, according to the rindex.
838   * @sdp: the filesystem
839   *
840   */
841  u64 gfs2_ri_total(struct gfs2_sbd *sdp)
842  {
843  	u64 total_data = 0;
844  	struct inode *inode = sdp->sd_rindex;
845  	struct gfs2_inode *ip = GFS2_I(inode);
846  	char buf[sizeof(struct gfs2_rindex)];
847  	int error, rgrps;
848  
849  	for (rgrps = 0;; rgrps++) {
850  		loff_t pos = rgrps * sizeof(struct gfs2_rindex);
851  
852  		if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode))
853  			break;
854  		error = gfs2_internal_read(ip, buf, &pos,
855  					   sizeof(struct gfs2_rindex));
856  		if (error != sizeof(struct gfs2_rindex))
857  			break;
858  		total_data += be32_to_cpu(((struct gfs2_rindex *)buf)->ri_data);
859  	}
860  	return total_data;
861  }
862  
863  static int rgd_insert(struct gfs2_rgrpd *rgd)
864  {
865  	struct gfs2_sbd *sdp = rgd->rd_sbd;
866  	struct rb_node **newn = &sdp->sd_rindex_tree.rb_node, *parent = NULL;
867  
868  	/* Figure out where to put new node */
869  	while (*newn) {
870  		struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd,
871  						  rd_node);
872  
873  		parent = *newn;
874  		if (rgd->rd_addr < cur->rd_addr)
875  			newn = &((*newn)->rb_left);
876  		else if (rgd->rd_addr > cur->rd_addr)
877  			newn = &((*newn)->rb_right);
878  		else
879  			return -EEXIST;
880  	}
881  
882  	rb_link_node(&rgd->rd_node, parent, newn);
883  	rb_insert_color(&rgd->rd_node, &sdp->sd_rindex_tree);
884  	sdp->sd_rgrps++;
885  	return 0;
886  }
887  
888  /**
889   * read_rindex_entry - Pull in a new resource index entry from the disk
890   * @ip: Pointer to the rindex inode
891   *
892   * Returns: 0 on success, > 0 on EOF, error code otherwise
893   */
894  
895  static int read_rindex_entry(struct gfs2_inode *ip)
896  {
897  	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
898  	loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
899  	struct gfs2_rindex buf;
900  	int error;
901  	struct gfs2_rgrpd *rgd;
902  
903  	if (pos >= i_size_read(&ip->i_inode))
904  		return 1;
905  
906  	error = gfs2_internal_read(ip, (char *)&buf, &pos,
907  				   sizeof(struct gfs2_rindex));
908  
909  	if (error != sizeof(struct gfs2_rindex))
910  		return (error == 0) ? 1 : error;
911  
912  	rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS);
913  	error = -ENOMEM;
914  	if (!rgd)
915  		return error;
916  
917  	rgd->rd_sbd = sdp;
918  	rgd->rd_addr = be64_to_cpu(buf.ri_addr);
919  	rgd->rd_length = be32_to_cpu(buf.ri_length);
920  	rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
921  	rgd->rd_data = be32_to_cpu(buf.ri_data);
922  	rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
923  	spin_lock_init(&rgd->rd_rsspin);
924  	mutex_init(&rgd->rd_mutex);
925  
926  	error = gfs2_glock_get(sdp, rgd->rd_addr,
927  			       &gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
928  	if (error)
929  		goto fail;
930  
931  	error = compute_bitstructs(rgd);
932  	if (error)
933  		goto fail_glock;
934  
935  	rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
936  	rgd->rd_flags &= ~GFS2_RDF_PREFERRED;
937  	if (rgd->rd_data > sdp->sd_max_rg_data)
938  		sdp->sd_max_rg_data = rgd->rd_data;
939  	spin_lock(&sdp->sd_rindex_spin);
940  	error = rgd_insert(rgd);
941  	spin_unlock(&sdp->sd_rindex_spin);
942  	if (!error) {
943  		glock_set_object(rgd->rd_gl, rgd);
944  		return 0;
945  	}
946  
947  	error = 0; /* someone else read in the rgrp; free it and ignore it */
948  fail_glock:
949  	gfs2_glock_put(rgd->rd_gl);
950  
951  fail:
952  	kfree(rgd->rd_bits);
953  	rgd->rd_bits = NULL;
954  	kmem_cache_free(gfs2_rgrpd_cachep, rgd);
955  	return error;
956  }
957  
958  /**
959   * set_rgrp_preferences - Run through the rgrps, selecting some we prefer to use
960   * @sdp: the GFS2 superblock
961   *
962   * The purpose of this function is to select a subset of the resource groups
963   * and mark them as PREFERRED. We do it in such a way that each node prefers
964   * to use a unique set of rgrps to minimize glock contention.
965   */
966  static void set_rgrp_preferences(struct gfs2_sbd *sdp)
967  {
968  	struct gfs2_rgrpd *rgd, *first;
969  	int i;
970  
971  	/* Skip an initial number of rgrps, based on this node's journal ID.
972  	   That should start each node out on its own set. */
973  	rgd = gfs2_rgrpd_get_first(sdp);
974  	for (i = 0; i < sdp->sd_lockstruct.ls_jid; i++)
975  		rgd = gfs2_rgrpd_get_next(rgd);
976  	first = rgd;
977  
978  	do {
979  		rgd->rd_flags |= GFS2_RDF_PREFERRED;
980  		for (i = 0; i < sdp->sd_journals; i++) {
981  			rgd = gfs2_rgrpd_get_next(rgd);
982  			if (!rgd || rgd == first)
983  				break;
984  		}
985  	} while (rgd && rgd != first);
986  }
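
/*
 * Illustrative example: with sd_journals = 3 and rgrps A B C D E F, the
 * node with journal ID 0 marks A and D as PREFERRED, jid 1 skips one
 * rgrp and marks B and E, and jid 2 marks C and F, so each node prefers
 * a disjoint set.
 */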
987  
988  /**
989   * gfs2_ri_update - Pull in a new resource index from the disk
990   * @ip: pointer to the rindex inode
991   *
992   * Returns: 0 on successful update, error code otherwise
993   */
994  
995  static int gfs2_ri_update(struct gfs2_inode *ip)
996  {
997  	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
998  	int error;
999  
1000  	do {
1001  		error = read_rindex_entry(ip);
1002  	} while (error == 0);
1003  
1004  	if (error < 0)
1005  		return error;
1006  
1007  	if (RB_EMPTY_ROOT(&sdp->sd_rindex_tree)) {
1008  		fs_err(sdp, "no resource groups found in the file system.\n");
1009  		return -ENOENT;
1010  	}
1011  	set_rgrp_preferences(sdp);
1012  
1013  	sdp->sd_rindex_uptodate = 1;
1014  	return 0;
1015  }
1016  
1017  /**
1018   * gfs2_rindex_update - Update the rindex if required
1019   * @sdp: The GFS2 superblock
1020   *
1021   * We grab a lock on the rindex inode to make sure that it doesn't
1022   * change whilst we are performing an operation. We keep this lock
1023   * for quite long periods of time compared to other locks. This
1024   * doesn't matter, since it is shared and it is very, very rarely
1025   * accessed in the exclusive mode (i.e. only when expanding the filesystem).
1026   *
1027   * This makes sure that we're using the latest copy of the resource index
1028   * special file, which might have been updated if someone expanded the
1029   * filesystem (via gfs2_grow utility), which adds new resource groups.
1030   *
1031   * Returns: 0 on success, error code otherwise
1032   */
1033  
1034  int gfs2_rindex_update(struct gfs2_sbd *sdp)
1035  {
1036  	struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
1037  	struct gfs2_glock *gl = ip->i_gl;
1038  	struct gfs2_holder ri_gh;
1039  	int error = 0;
1040  	int unlock_required = 0;
1041  
1042  	/* Read new copy from disk if we don't have the latest */
1043  	if (!sdp->sd_rindex_uptodate) {
1044  		if (!gfs2_glock_is_locked_by_me(gl)) {
1045  			error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh);
1046  			if (error)
1047  				return error;
1048  			unlock_required = 1;
1049  		}
1050  		if (!sdp->sd_rindex_uptodate)
1051  			error = gfs2_ri_update(ip);
1052  		if (unlock_required)
1053  			gfs2_glock_dq_uninit(&ri_gh);
1054  	}
1055  
1056  	return error;
1057  }
1058  
1059  static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
1060  {
1061  	const struct gfs2_rgrp *str = buf;
1062  	u32 rg_flags;
1063  
1064  	rg_flags = be32_to_cpu(str->rg_flags);
1065  	rg_flags &= ~GFS2_RDF_MASK;
1066  	rgd->rd_flags &= GFS2_RDF_MASK;
1067  	rgd->rd_flags |= rg_flags;
1068  	rgd->rd_free = be32_to_cpu(str->rg_free);
1069  	rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
1070  	rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
1071  	/* rd_data0, rd_data and rd_bitbytes already set from rindex */
1072  }
1073  
1074  static void gfs2_rgrp_ondisk2lvb(struct gfs2_rgrp_lvb *rgl, const void *buf)
1075  {
1076  	const struct gfs2_rgrp *str = buf;
1077  
1078  	rgl->rl_magic = cpu_to_be32(GFS2_MAGIC);
1079  	rgl->rl_flags = str->rg_flags;
1080  	rgl->rl_free = str->rg_free;
1081  	rgl->rl_dinodes = str->rg_dinodes;
1082  	rgl->rl_igeneration = str->rg_igeneration;
1083  	rgl->__pad = 0UL;
1084  }
1085  
1086  static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
1087  {
1088  	struct gfs2_rgrpd *next = gfs2_rgrpd_get_next(rgd);
1089  	struct gfs2_rgrp *str = buf;
1090  	u32 crc;
1091  
1092  	str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK);
1093  	str->rg_free = cpu_to_be32(rgd->rd_free);
1094  	str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
1095  	if (next == NULL)
1096  		str->rg_skip = 0;
1097  	else if (next->rd_addr > rgd->rd_addr)
1098  		str->rg_skip = cpu_to_be32(next->rd_addr - rgd->rd_addr);
1099  	str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
1100  	str->rg_data0 = cpu_to_be64(rgd->rd_data0);
1101  	str->rg_data = cpu_to_be32(rgd->rd_data);
1102  	str->rg_bitbytes = cpu_to_be32(rgd->rd_bitbytes);
1103  	str->rg_crc = 0;
1104  	crc = gfs2_disk_hash(buf, sizeof(struct gfs2_rgrp));
1105  	str->rg_crc = cpu_to_be32(crc);
1106  
1107  	memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
1108  	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, buf);
1109  }
1110  
1111  static int gfs2_rgrp_lvb_valid(struct gfs2_rgrpd *rgd)
1112  {
1113  	struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
1114  	struct gfs2_rgrp *str = (struct gfs2_rgrp *)rgd->rd_bits[0].bi_bh->b_data;
1115  	struct gfs2_sbd *sdp = rgd->rd_sbd;
1116  	int valid = 1;
1117  
1118  	if (rgl->rl_flags != str->rg_flags) {
1119  		fs_warn(sdp, "GFS2: rgd: %llu lvb flag mismatch %u/%u",
1120  			(unsigned long long)rgd->rd_addr,
1121  		       be32_to_cpu(rgl->rl_flags), be32_to_cpu(str->rg_flags));
1122  		valid = 0;
1123  	}
1124  	if (rgl->rl_free != str->rg_free) {
1125  		fs_warn(sdp, "GFS2: rgd: %llu lvb free mismatch %u/%u",
1126  			(unsigned long long)rgd->rd_addr,
1127  			be32_to_cpu(rgl->rl_free), be32_to_cpu(str->rg_free));
1128  		valid = 0;
1129  	}
1130  	if (rgl->rl_dinodes != str->rg_dinodes) {
1131  		fs_warn(sdp, "GFS2: rgd: %llu lvb dinode mismatch %u/%u",
1132  			(unsigned long long)rgd->rd_addr,
1133  			be32_to_cpu(rgl->rl_dinodes),
1134  			be32_to_cpu(str->rg_dinodes));
1135  		valid = 0;
1136  	}
1137  	if (rgl->rl_igeneration != str->rg_igeneration) {
1138  		fs_warn(sdp, "GFS2: rgd: %llu lvb igen mismatch %llu/%llu",
1139  			(unsigned long long)rgd->rd_addr,
1140  			(unsigned long long)be64_to_cpu(rgl->rl_igeneration),
1141  			(unsigned long long)be64_to_cpu(str->rg_igeneration));
1142  		valid = 0;
1143  	}
1144  	return valid;
1145  }
1146  
1147  static u32 count_unlinked(struct gfs2_rgrpd *rgd)
1148  {
1149  	struct gfs2_bitmap *bi;
1150  	const u32 length = rgd->rd_length;
1151  	const u8 *buffer = NULL;
1152  	u32 i, goal, count = 0;
1153  
1154  	for (i = 0, bi = rgd->rd_bits; i < length; i++, bi++) {
1155  		goal = 0;
1156  		buffer = bi->bi_bh->b_data + bi->bi_offset;
1157  		WARN_ON(!buffer_uptodate(bi->bi_bh));
1158  		while (goal < bi->bi_blocks) {
1159  			goal = gfs2_bitfit(buffer, bi->bi_bytes, goal,
1160  					   GFS2_BLKST_UNLINKED);
1161  			if (goal == BFITNOENT)
1162  				break;
1163  			count++;
1164  			goal++;
1165  		}
1166  	}
1167  
1168  	return count;
1169  }
1170  
1171  static void rgrp_set_bitmap_flags(struct gfs2_rgrpd *rgd)
1172  {
1173  	struct gfs2_bitmap *bi;
1174  	int x;
1175  
1176  	if (rgd->rd_free) {
1177  		for (x = 0; x < rgd->rd_length; x++) {
1178  			bi = rgd->rd_bits + x;
1179  			clear_bit(GBF_FULL, &bi->bi_flags);
1180  		}
1181  	} else {
1182  		for (x = 0; x < rgd->rd_length; x++) {
1183  			bi = rgd->rd_bits + x;
1184  			set_bit(GBF_FULL, &bi->bi_flags);
1185  		}
1186  	}
1187  }
1188  
1189  /**
1190   * gfs2_rgrp_go_instantiate - Read in a RG's header and bitmaps
1191   * @gh: the glock holder representing the rgrpd to read in
1192   *
1193   * Read in all of a Resource Group's header and bitmap blocks.
1194   * Caller must eventually call gfs2_rgrp_brelse() to free the bitmaps.
1195   *
1196   * Returns: errno
1197   */
1198  
1199  int gfs2_rgrp_go_instantiate(struct gfs2_holder *gh)
1200  {
1201  	struct gfs2_glock *gl = gh->gh_gl;
1202  	struct gfs2_rgrpd *rgd = gl->gl_object;
1203  	struct gfs2_sbd *sdp = rgd->rd_sbd;
1204  	unsigned int length = rgd->rd_length;
1205  	struct gfs2_bitmap *bi;
1206  	unsigned int x, y;
1207  	int error;
1208  
1209  	if (rgd->rd_bits[0].bi_bh != NULL)
1210  		return 0;
1211  
1212  	for (x = 0; x < length; x++) {
1213  		bi = rgd->rd_bits + x;
1214  		error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, 0, &bi->bi_bh);
1215  		if (error)
1216  			goto fail;
1217  	}
1218  
1219  	for (y = length; y--;) {
1220  		bi = rgd->rd_bits + y;
1221  		error = gfs2_meta_wait(sdp, bi->bi_bh);
1222  		if (error)
1223  			goto fail;
1224  		if (gfs2_metatype_check(sdp, bi->bi_bh, y ? GFS2_METATYPE_RB :
1225  					      GFS2_METATYPE_RG)) {
1226  			error = -EIO;
1227  			goto fail;
1228  		}
1229  	}
1230  
1231  	gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
1232  	rgrp_set_bitmap_flags(rgd);
1233  	rgd->rd_flags |= GFS2_RDF_CHECK;
1234  	rgd->rd_free_clone = rgd->rd_free;
1235  	GLOCK_BUG_ON(rgd->rd_gl, rgd->rd_reserved);
1236  	/* max out the rgrp allocation failure point */
1237  	rgd->rd_extfail_pt = rgd->rd_free;
1238  	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
1239  		rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd));
1240  		gfs2_rgrp_ondisk2lvb(rgd->rd_rgl,
1241  				     rgd->rd_bits[0].bi_bh->b_data);
1242  	} else if (sdp->sd_args.ar_rgrplvb) {
1243  		if (!gfs2_rgrp_lvb_valid(rgd)){
1244  			gfs2_consist_rgrpd(rgd);
1245  			error = -EIO;
1246  			goto fail;
1247  		}
1248  		if (rgd->rd_rgl->rl_unlinked == 0)
1249  			rgd->rd_flags &= ~GFS2_RDF_CHECK;
1250  	}
1251  	return 0;
1252  
1253  fail:
1254  	while (x--) {
1255  		bi = rgd->rd_bits + x;
1256  		brelse(bi->bi_bh);
1257  		bi->bi_bh = NULL;
1258  		gfs2_assert_warn(sdp, !bi->bi_clone);
1259  	}
1260  	return error;
1261  }
1262  
1263  static int update_rgrp_lvb(struct gfs2_rgrpd *rgd, struct gfs2_holder *gh)
1264  {
1265  	u32 rl_flags;
1266  
1267  	if (!test_bit(GLF_INSTANTIATE_NEEDED, &gh->gh_gl->gl_flags))
1268  		return 0;
1269  
1270  	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic)
1271  		return gfs2_instantiate(gh);
1272  
1273  	rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags);
1274  	rl_flags &= ~GFS2_RDF_MASK;
1275  	rgd->rd_flags &= GFS2_RDF_MASK;
1276  	rgd->rd_flags |= (rl_flags | GFS2_RDF_CHECK);
1277  	if (rgd->rd_rgl->rl_unlinked == 0)
1278  		rgd->rd_flags &= ~GFS2_RDF_CHECK;
1279  	rgd->rd_free = be32_to_cpu(rgd->rd_rgl->rl_free);
1280  	rgrp_set_bitmap_flags(rgd);
1281  	rgd->rd_free_clone = rgd->rd_free;
1282  	GLOCK_BUG_ON(rgd->rd_gl, rgd->rd_reserved);
1283  	/* max out the rgrp allocation failure point */
1284  	rgd->rd_extfail_pt = rgd->rd_free;
1285  	rgd->rd_dinodes = be32_to_cpu(rgd->rd_rgl->rl_dinodes);
1286  	rgd->rd_igeneration = be64_to_cpu(rgd->rd_rgl->rl_igeneration);
1287  	return 0;
1288  }
1289  
1290  /**
1291   * gfs2_rgrp_brelse - Release RG bitmaps read in with gfs2_rgrp_go_instantiate()
1292   * @rgd: The resource group
1293   *
1294   */
1295  
1296  void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd)
1297  {
1298  	int x, length = rgd->rd_length;
1299  
1300  	for (x = 0; x < length; x++) {
1301  		struct gfs2_bitmap *bi = rgd->rd_bits + x;
1302  		if (bi->bi_bh) {
1303  			brelse(bi->bi_bh);
1304  			bi->bi_bh = NULL;
1305  		}
1306  	}
1307  	set_bit(GLF_INSTANTIATE_NEEDED, &rgd->rd_gl->gl_flags);
1308  }
1309  
1310  int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
1311  			     struct buffer_head *bh,
1312  			     const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed)
1313  {
1314  	struct super_block *sb = sdp->sd_vfs;
1315  	u64 blk;
1316  	sector_t start = 0;
1317  	sector_t nr_blks = 0;
1318  	int rv;
1319  	unsigned int x;
1320  	u32 trimmed = 0;
1321  	u8 diff;
1322  
1323  	for (x = 0; x < bi->bi_bytes; x++) {
1324  		const u8 *clone = bi->bi_clone ? bi->bi_clone : bi->bi_bh->b_data;
1325  		clone += bi->bi_offset;
1326  		clone += x;
1327  		if (bh) {
1328  			const u8 *orig = bh->b_data + bi->bi_offset + x;
1329  			diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1));
1330  		} else {
1331  			diff = ~(*clone | (*clone >> 1));
1332  		}
1333  		diff &= 0x55;
1334  		if (diff == 0)
1335  			continue;
1336  		blk = offset + ((bi->bi_start + x) * GFS2_NBBY);
1337  		while(diff) {
1338  			if (diff & 1) {
1339  				if (nr_blks == 0)
1340  					goto start_new_extent;
1341  				if ((start + nr_blks) != blk) {
1342  					if (nr_blks >= minlen) {
1343  						rv = sb_issue_discard(sb,
1344  							start, nr_blks,
1345  							GFP_NOFS, 0);
1346  						if (rv)
1347  							goto fail;
1348  						trimmed += nr_blks;
1349  					}
1350  					nr_blks = 0;
1351  start_new_extent:
1352  					start = blk;
1353  				}
1354  				nr_blks++;
1355  			}
1356  			diff >>= 2;
1357  			blk++;
1358  		}
1359  	}
1360  	if (nr_blks >= minlen) {
1361  		rv = sb_issue_discard(sb, start, nr_blks, GFP_NOFS, 0);
1362  		if (rv)
1363  			goto fail;
1364  		trimmed += nr_blks;
1365  	}
1366  	if (ptrimmed)
1367  		*ptrimmed = trimmed;
1368  	return 0;
1369  
1370  fail:
1371  	if (sdp->sd_args.ar_discard)
1372  		fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem\n", rv);
1373  	sdp->sd_args.ar_discard = 0;
1374  	return -EIO;
1375  }
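
/*
 * Worked example of the diff logic above (illustrative): (x | (x >> 1))
 * & 0x55 sets the low bit of each bit-pair iff that pair is non-zero
 * (block in use).  With committed bitmap byte *orig = 0x00 (all four
 * blocks now free) and *clone = 0x05 (blocks 0 and 1 still held as
 * in-use until the log flush), diff = ~0x00 & 0x07 & 0x55 = 0x05, so
 * blocks 0 and 1 are the newly freed blocks eligible for discard.
 */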
1376  
1377  /**
1378   * gfs2_fitrim - Generate discard requests for unused bits of the filesystem
1379   * @filp: Any file on the filesystem
1380   * @argp: Pointer to the arguments (also used to pass result)
1381   *
1382   * Returns: 0 on success, otherwise error code
1383   */
1384  
1385  int gfs2_fitrim(struct file *filp, void __user *argp)
1386  {
1387  	struct inode *inode = file_inode(filp);
1388  	struct gfs2_sbd *sdp = GFS2_SB(inode);
1389  	struct request_queue *q = bdev_get_queue(sdp->sd_vfs->s_bdev);
1390  	struct buffer_head *bh;
1391  	struct gfs2_rgrpd *rgd;
1392  	struct gfs2_rgrpd *rgd_end;
1393  	struct gfs2_holder gh;
1394  	struct fstrim_range r;
1395  	int ret = 0;
1396  	u64 amt;
1397  	u64 trimmed = 0;
1398  	u64 start, end, minlen;
1399  	unsigned int x;
1400  	unsigned bs_shift = sdp->sd_sb.sb_bsize_shift;
1401  
1402  	if (!capable(CAP_SYS_ADMIN))
1403  		return -EPERM;
1404  
1405  	if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
1406  		return -EROFS;
1407  
1408  	if (!blk_queue_discard(q))
1409  		return -EOPNOTSUPP;
1410  
1411  	if (copy_from_user(&r, argp, sizeof(r)))
1412  		return -EFAULT;
1413  
1414  	ret = gfs2_rindex_update(sdp);
1415  	if (ret)
1416  		return ret;
1417  
1418  	start = r.start >> bs_shift;
1419  	end = start + (r.len >> bs_shift);
1420  	minlen = max_t(u64, r.minlen, sdp->sd_sb.sb_bsize);
1421  	minlen = max_t(u64, minlen,
1422  		       q->limits.discard_granularity) >> bs_shift;
1423  
1424  	if (end <= start || minlen > sdp->sd_max_rg_data)
1425  		return -EINVAL;
1426  
1427  	rgd = gfs2_blk2rgrpd(sdp, start, 0);
1428  	rgd_end = gfs2_blk2rgrpd(sdp, end, 0);
1429  
1430  	if ((gfs2_rgrpd_get_first(sdp) == gfs2_rgrpd_get_next(rgd_end))
1431  	    && (start > rgd_end->rd_data0 + rgd_end->rd_data))
1432  		return -EINVAL; /* start is beyond the end of the fs */
1433  
1434  	while (1) {
1435  
1436  		ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
1437  					 LM_FLAG_NODE_SCOPE, &gh);
1438  		if (ret)
1439  			goto out;
1440  
1441  		if (!(rgd->rd_flags & GFS2_RGF_TRIMMED)) {
1442  			/* Trim each bitmap in the rgrp */
1443  			for (x = 0; x < rgd->rd_length; x++) {
1444  				struct gfs2_bitmap *bi = rgd->rd_bits + x;
1445  				rgrp_lock_local(rgd);
1446  				ret = gfs2_rgrp_send_discards(sdp,
1447  						rgd->rd_data0, NULL, bi, minlen,
1448  						&amt);
1449  				rgrp_unlock_local(rgd);
1450  				if (ret) {
1451  					gfs2_glock_dq_uninit(&gh);
1452  					goto out;
1453  				}
1454  				trimmed += amt;
1455  			}
1456  
1457  			/* Mark rgrp as having been trimmed */
1458  			ret = gfs2_trans_begin(sdp, RES_RG_HDR, 0);
1459  			if (ret == 0) {
1460  				bh = rgd->rd_bits[0].bi_bh;
1461  				rgrp_lock_local(rgd);
1462  				rgd->rd_flags |= GFS2_RGF_TRIMMED;
1463  				gfs2_trans_add_meta(rgd->rd_gl, bh);
1464  				gfs2_rgrp_out(rgd, bh->b_data);
1465  				rgrp_unlock_local(rgd);
1466  				gfs2_trans_end(sdp);
1467  			}
1468  		}
1469  		gfs2_glock_dq_uninit(&gh);
1470  
1471  		if (rgd == rgd_end)
1472  			break;
1473  
1474  		rgd = gfs2_rgrpd_get_next(rgd);
1475  	}
1476  
1477  out:
1478  	r.len = trimmed << bs_shift;
1479  	if (copy_to_user(argp, &r, sizeof(r)))
1480  		return -EFAULT;
1481  
1482  	return ret;
1483  }
1484  
1485  /**
1486   * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree
1487   * @ip: the inode structure
1488   *
1489   */
1490  static void rs_insert(struct gfs2_inode *ip)
1491  {
1492  	struct rb_node **newn, *parent = NULL;
1493  	int rc;
1494  	struct gfs2_blkreserv *rs = &ip->i_res;
1495  	struct gfs2_rgrpd *rgd = rs->rs_rgd;
1496  
1497  	BUG_ON(gfs2_rs_active(rs));
1498  
1499  	spin_lock(&rgd->rd_rsspin);
1500  	newn = &rgd->rd_rstree.rb_node;
1501  	while (*newn) {
1502  		struct gfs2_blkreserv *cur =
1503  			rb_entry(*newn, struct gfs2_blkreserv, rs_node);
1504  
1505  		parent = *newn;
1506  		rc = rs_cmp(rs->rs_start, rs->rs_requested, cur);
1507  		if (rc > 0)
1508  			newn = &((*newn)->rb_right);
1509  		else if (rc < 0)
1510  			newn = &((*newn)->rb_left);
1511  		else {
1512  			spin_unlock(&rgd->rd_rsspin);
1513  			WARN_ON(1);
1514  			return;
1515  		}
1516  	}
1517  
1518  	rb_link_node(&rs->rs_node, parent, newn);
1519  	rb_insert_color(&rs->rs_node, &rgd->rd_rstree);
1520  
1521  	/* Do our rgrp accounting for the reservation */
1522  	rgd->rd_requested += rs->rs_requested; /* blocks requested */
1523  	spin_unlock(&rgd->rd_rsspin);
1524  	trace_gfs2_rs(rs, TRACE_RS_INSERT);
1525  }
1526  
1527  /**
1528   * rgd_free - return the number of free blocks we can allocate
1529   * @rgd: the resource group
1530   * @rs: The reservation to free
1531   *
1532   * This function returns the number of free blocks for an rgrp.
1533   * That's the clone-free blocks (blocks that are free, not including those
1534   * still being used for unlinked files that haven't been deleted).
1535   *
1536   * It also subtracts any blocks reserved by someone else, but does not
1537   * include free blocks that are still part of our current reservation,
1538   * because obviously we can (and will) allocate them.
1539   */
1540  static inline u32 rgd_free(struct gfs2_rgrpd *rgd, struct gfs2_blkreserv *rs)
1541  {
1542  	u32 tot_reserved, tot_free;
1543  
1544  	if (WARN_ON_ONCE(rgd->rd_requested < rs->rs_requested))
1545  		return 0;
1546  	tot_reserved = rgd->rd_requested - rs->rs_requested;
1547  
1548  	if (rgd->rd_free_clone < tot_reserved)
1549  		tot_reserved = 0;
1550  
1551  	tot_free = rgd->rd_free_clone - tot_reserved;
1552  
1553  	return tot_free;
1554  }
1555  
1556  /**
1557   * rg_mblk_search - find a group of multiple free blocks to form a reservation
1558   * @rgd: the resource group descriptor
1559   * @ip: pointer to the inode for which we're reserving blocks
1560   * @ap: the allocation parameters
1561   *
1562   */
1563  
1564  static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
1565  			   const struct gfs2_alloc_parms *ap)
1566  {
1567  	struct gfs2_rbm rbm = { .rgd = rgd, };
1568  	u64 goal;
1569  	struct gfs2_blkreserv *rs = &ip->i_res;
1570  	u32 extlen;
1571  	u32 free_blocks, blocks_available;
1572  	int ret;
1573  	struct inode *inode = &ip->i_inode;
1574  
1575  	spin_lock(&rgd->rd_rsspin);
1576  	free_blocks = rgd_free(rgd, rs);
1577  	if (rgd->rd_free_clone < rgd->rd_requested)
1578  		free_blocks = 0;
1579  	blocks_available = rgd->rd_free_clone - rgd->rd_reserved;
1580  	if (rgd == rs->rs_rgd)
1581  		blocks_available += rs->rs_reserved;
1582  	spin_unlock(&rgd->rd_rsspin);
1583  
1584  	if (S_ISDIR(inode->i_mode))
1585  		extlen = 1;
1586  	else {
1587  		extlen = max_t(u32, atomic_read(&ip->i_sizehint), ap->target);
1588  		extlen = clamp(extlen, (u32)RGRP_RSRV_MINBLKS, free_blocks);
1589  	}
1590  	if (free_blocks < extlen || blocks_available < extlen)
1591  		return;
1592  
1593  	/* Find bitmap block that contains bits for goal block */
1594  	if (rgrp_contains_block(rgd, ip->i_goal))
1595  		goal = ip->i_goal;
1596  	else
1597  		goal = rgd->rd_last_alloc + rgd->rd_data0;
1598  
1599  	if (WARN_ON(gfs2_rbm_from_block(&rbm, goal)))
1600  		return;
1601  
1602  	ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, &ip->i_res, true);
1603  	if (ret == 0) {
1604  		rs->rs_start = gfs2_rbm_to_block(&rbm);
1605  		rs->rs_requested = extlen;
1606  		rs_insert(ip);
1607  	} else {
1608  		if (goal == rgd->rd_last_alloc + rgd->rd_data0)
1609  			rgd->rd_last_alloc = 0;
1610  	}
1611  }
1612  
1613  /**
1614   * gfs2_next_unreserved_block - Return next block that is not reserved
1615   * @rgd: The resource group
1616   * @block: The starting block
1617   * @length: The required length
1618   * @ignore_rs: Reservation to ignore
1619   *
1620   * If the block does not appear in any reservation, then return the
1621   * block number unchanged. If it does appear in the reservation, then
1622   * keep looking through the tree of reservations in order to find the
1623   * first block number which is not reserved.
1624   */
1625  
1626  static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
1627  				      u32 length,
1628  				      struct gfs2_blkreserv *ignore_rs)
1629  {
1630  	struct gfs2_blkreserv *rs;
1631  	struct rb_node *n;
1632  	int rc;
1633  
1634  	spin_lock(&rgd->rd_rsspin);
1635  	n = rgd->rd_rstree.rb_node;
1636  	while (n) {
1637  		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
1638  		rc = rs_cmp(block, length, rs);
1639  		if (rc < 0)
1640  			n = n->rb_left;
1641  		else if (rc > 0)
1642  			n = n->rb_right;
1643  		else
1644  			break;
1645  	}
1646  
1647  	if (n) {
1648  		while (rs_cmp(block, length, rs) == 0 && rs != ignore_rs) {
1649  			block = rs->rs_start + rs->rs_requested;
1650  			n = n->rb_right;
1651  			if (n == NULL)
1652  				break;
1653  			rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
1654  		}
1655  	}
1656  
1657  	spin_unlock(&rgd->rd_rsspin);
1658  	return block;
1659  }
1660  
1661  /**
1662   * gfs2_reservation_check_and_update - Check for reservations during block alloc
1663   * @rbm: The current position in the resource group
1664   * @rs: Our own reservation
1665   * @minext: The minimum extent length
1666   * @maxext: A pointer to the maximum extent structure
1667   *
1668   * This checks the current position in the rgrp to see whether there is
1669   * a reservation covering this block. If not then this function is a
1670   * no-op. If there is, then the position is moved to the end of the
1671   * contiguous reservation(s) so that we are pointing at the first
1672   * non-reserved block.
1673   *
1674   * Returns: 0 if no reservation, 1 if @rbm has changed, otherwise an error
1675   */
1676  
1677  static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
1678  					     struct gfs2_blkreserv *rs,
1679  					     u32 minext,
1680  					     struct gfs2_extent *maxext)
1681  {
1682  	u64 block = gfs2_rbm_to_block(rbm);
1683  	u32 extlen = 1;
1684  	u64 nblock;
1685  
1686  	/*
1687  	 * If we have a minimum extent length, then skip over any extent
1688  	 * which is less than the min extent length in size.
1689  	 */
1690  	if (minext > 1) {
1691  		extlen = gfs2_free_extlen(rbm, minext);
1692  		if (extlen <= maxext->len)
1693  			goto fail;
1694  	}
1695  
1696  	/*
1697  	 * Check the extent which has been found against the reservations
1698  	 * and skip if parts of it are already reserved
1699  	 */
1700  	nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, rs);
1701  	if (nblock == block) {
1702  		if (!minext || extlen >= minext)
1703  			return 0;
1704  
1705  		if (extlen > maxext->len) {
1706  			maxext->len = extlen;
1707  			maxext->rbm = *rbm;
1708  		}
1709  	} else {
1710  		u64 len = nblock - block;
1711  		if (len >= (u64)1 << 32)
1712  			return -E2BIG;
1713  		extlen = len;
1714  	}
1715  fail:
1716  	if (gfs2_rbm_add(rbm, extlen))
1717  		return -E2BIG;
1718  	return 1;
1719  }
1720  
1721  /**
1722   * gfs2_rbm_find - Look for blocks of a particular state
1723   * @rbm: Value/result starting position and final position
1724   * @state: The state which we want to find
1725   * @minext: Pointer to the requested extent length
1726   *          This is updated to be the actual reservation size.
1727   * @rs: Our own reservation (NULL to skip checking for reservations)
1728   * @nowrap: Stop looking at the end of the rgrp, rather than wrapping
1729   *          around until we've reached the starting point.
1730   *
1731   * Side effects:
1732   * - If looking for free blocks, we set GBF_FULL on each bitmap which
1733   *   has no free blocks in it.
1734   * - If looking for free blocks, we set rd_extfail_pt on each rgrp which
1735   *   has come up short on a free block search.
1736   *
1737   * Returns: 0 on success, -ENOSPC if there is no block of the requested state
1738   */
1739  
1740  static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
1741  			 struct gfs2_blkreserv *rs, bool nowrap)
1742  {
1743  	bool scan_from_start = rbm->bii == 0 && rbm->offset == 0;
1744  	struct buffer_head *bh;
1745  	int last_bii;
1746  	u32 offset;
1747  	u8 *buffer;
1748  	bool wrapped = false;
1749  	int ret;
1750  	struct gfs2_bitmap *bi;
1751  	struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, };
1752  
1753  	/*
1754  	 * Determine the last bitmap to search.  If we're not starting at the
1755  	 * beginning of a bitmap, we need to search that bitmap twice to scan
1756  	 * the entire resource group.
1757  	 */
1758  	last_bii = rbm->bii - (rbm->offset == 0);
1759  
1760  	while(1) {
1761  		bi = rbm_bi(rbm);
1762  		if (test_bit(GBF_FULL, &bi->bi_flags) &&
1763  		    (state == GFS2_BLKST_FREE))
1764  			goto next_bitmap;
1765  
1766  		bh = bi->bi_bh;
1767  		buffer = bh->b_data + bi->bi_offset;
1768  		WARN_ON(!buffer_uptodate(bh));
1769  		if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
1770  			buffer = bi->bi_clone + bi->bi_offset;
1771  		offset = gfs2_bitfit(buffer, bi->bi_bytes, rbm->offset, state);
1772  		if (offset == BFITNOENT) {
1773  			if (state == GFS2_BLKST_FREE && rbm->offset == 0)
1774  				set_bit(GBF_FULL, &bi->bi_flags);
1775  			goto next_bitmap;
1776  		}
1777  		rbm->offset = offset;
1778  		if (!rs || !minext)
1779  			return 0;
1780  
1781  		ret = gfs2_reservation_check_and_update(rbm, rs, *minext,
1782  							&maxext);
1783  		if (ret == 0)
1784  			return 0;
1785  		if (ret > 0)
1786  			goto next_iter;
1787  		if (ret == -E2BIG) {
1788  			rbm->bii = 0;
1789  			rbm->offset = 0;
1790  			goto res_covered_end_of_rgrp;
1791  		}
1792  		return ret;
1793  
1794  next_bitmap:	/* Find next bitmap in the rgrp */
1795  		rbm->offset = 0;
1796  		rbm->bii++;
1797  		if (rbm->bii == rbm->rgd->rd_length)
1798  			rbm->bii = 0;
1799  res_covered_end_of_rgrp:
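      		/* Also reached via -E2BIG when a reservation extends to the
      		   end of the rgrp, which counts as hitting the end. */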
1800  		if (rbm->bii == 0) {
1801  			if (wrapped)
1802  				break;
1803  			wrapped = true;
1804  			if (nowrap)
1805  				break;
1806  		}
1807  next_iter:
1808  		/* Have we scanned the entire resource group? */
1809  		if (wrapped && rbm->bii > last_bii)
1810  			break;
1811  	}
1812  
1813  	if (state != GFS2_BLKST_FREE)
1814  		return -ENOSPC;
1815  
1816  	/* If the extent was too small, and it's smaller than the smallest
1817  	   to have failed before, remember for future reference that it's
1818  	   useless to search this rgrp again for this amount or more. */
1819  	if (wrapped && (scan_from_start || rbm->bii > last_bii) &&
1820  	    *minext < rbm->rgd->rd_extfail_pt)
1821  		rbm->rgd->rd_extfail_pt = *minext - 1;
1822  
1823  	/* If we did find an extent, return the largest one even though it
1824  	   is smaller than requested; *minext reports its actual length. */
1825  	if (maxext.len) {
1826  		*rbm = maxext.rbm;
1827  		*minext = maxext.len;
1828  		return 0;
1829  	}
1830  
1831  	return -ENOSPC;
1832  }
1833  
1834  /**
1835   * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes
1836   * @rgd: The rgrp
1837   * @last_unlinked: block address of the last dinode we unlinked
1838   * @skip: block address we should explicitly not unlink
1839   *
1840   * Does not return a value.  Any unlinked dinodes found are passed to
1841   * the delete workqueue for disposal.
1842   */
1843  
1844  static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
1845  {
1846  	u64 block;
1847  	struct gfs2_sbd *sdp = rgd->rd_sbd;
1848  	struct gfs2_glock *gl;
1849  	struct gfs2_inode *ip;
1850  	int error;
1851  	int found = 0;
1852  	struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 };
1853  
1854  	while (1) {
1855  		error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL,
1856  				      true);
1857  		if (error == -ENOSPC)
1858  			break;
1859  		if (WARN_ON_ONCE(error))
1860  			break;
1861  
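      		/* Move the search position past this block; stop if that
      		   would run off the end of the rgrp. */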
1862  		block = gfs2_rbm_to_block(&rbm);
1863  		if (gfs2_rbm_from_block(&rbm, block + 1))
1864  			break;
1865  		if (*last_unlinked != NO_BLOCK && block <= *last_unlinked)
1866  			continue;
1867  		if (block == skip)
1868  			continue;
1869  		*last_unlinked = block;
1870  
1871  		error = gfs2_glock_get(sdp, block, &gfs2_iopen_glops, CREATE, &gl);
1872  		if (error)
1873  			continue;
1874  
1875  		/* If the inode is already in cache, we can ignore it here
1876  		 * because the existing inode disposal code will deal with
1877  		 * it when all refs have gone away. Accessing gl_object like
1878  		 * this is not safe in general. Here it is ok because we do
1879  		 * not dereference the pointer, and we only need an approx
1880  		 * answer to whether it is NULL or not.
1881  		 */
1882  		ip = gl->gl_object;
1883  
1884  		if (ip || !gfs2_queue_delete_work(gl, 0))
1885  			gfs2_glock_put(gl);
1886  		else
1887  			found++;
1888  
1889  		/* Limit reclaim to sensible number of tasks */
1890  		if (found > NR_CPUS)
1891  			return;
1892  	}
1893  
1894  	rgd->rd_flags &= ~GFS2_RDF_CHECK;
1895  	return;
1896  }
1897  
1898  /**
1899   * gfs2_rgrp_congested - Use stats to figure out whether an rgrp is congested
1900   * @rgd: The rgrp in question
1901   * @loops: An indication of how picky we can be (0=very, 1=less so)
1902   *
1903   * This function uses the recently added glock statistics in order to
1904   * figure out whether a particular resource group is suffering from
1905   * contention from multiple nodes. This is done purely on the basis
1906   * of timings, since this is the only data we have to work with and
1907   * our aim here is to reject a resource group which is highly contended
1908   * but (very importantly) not to do this too often, in order to ensure
1909   * that we do not end up introducing fragmentation by changing resource
1910   * groups when not actually required.
1911   *
1912   * The calculation is fairly simple: we want to know whether the SRTTB
1913   * (i.e. smoothed round trip time for blocking operations) to acquire
1914   * the lock for this rgrp's glock is significantly greater than the
1915   * time taken for resource groups on average. We introduce a margin in
1916   * the form of the variable @var, which is computed as the sum of the two
1917   * respective variances, and multiplied by a factor depending on @loops
1918   * and whether we have a lot of data to base the decision on. This is
1919   * then tested against the square difference of the means in order to
1920   * decide whether the result is statistically significant or not.
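       *
       * Roughly speaking, ignoring the fixed-point representation of the
       * stats, the test below amounts to:
       *
       *   congested  <=>  l_srttb > a_srttb  &&
       *                   (a_srttb - l_srttb)^2 > k * var
       *
       * where l_srttb is this glock's SRTTB, a_srttb is the average SRTTB
       * over all CPUs, var is the sum of the two variances, and k is 2,
       * doubled when fewer than 8 samples are available and doubled again
       * on the second pass (@loops == 1).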
1921   *
1922   * Returns: A boolean verdict on the congestion status
1923   */
1924  
1925  static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
1926  {
1927  	const struct gfs2_glock *gl = rgd->rd_gl;
1928  	const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
1929  	struct gfs2_lkstats *st;
1930  	u64 r_dcount, l_dcount;
1931  	u64 l_srttb, a_srttb = 0;
1932  	s64 srttb_diff;
1933  	u64 sqr_diff;
1934  	u64 var;
1935  	int cpu, nonzero = 0;
1936  
1937  	preempt_disable();
1938  	for_each_present_cpu(cpu) {
1939  		st = &per_cpu_ptr(sdp->sd_lkstats, cpu)->lkstats[LM_TYPE_RGRP];
1940  		if (st->stats[GFS2_LKS_SRTTB]) {
1941  			a_srttb += st->stats[GFS2_LKS_SRTTB];
1942  			nonzero++;
1943  		}
1944  	}
1945  	st = &this_cpu_ptr(sdp->sd_lkstats)->lkstats[LM_TYPE_RGRP];
1946  	if (nonzero)
1947  		do_div(a_srttb, nonzero);
1948  	r_dcount = st->stats[GFS2_LKS_DCOUNT];
1949  	var = st->stats[GFS2_LKS_SRTTVARB] +
1950  	      gl->gl_stats.stats[GFS2_LKS_SRTTVARB];
1951  	preempt_enable();
1952  
1953  	l_srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB];
1954  	l_dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT];
1955  
1956  	if ((l_dcount < 1) || (r_dcount < 1) || (a_srttb == 0))
1957  		return false;
1958  
1959  	srttb_diff = a_srttb - l_srttb;
1960  	sqr_diff = srttb_diff * srttb_diff;
1961  
1962  	var *= 2;
1963  	if (l_dcount < 8 || r_dcount < 8)
1964  		var *= 2;
1965  	if (loops == 1)
1966  		var *= 2;
1967  
1968  	return ((srttb_diff < 0) && (sqr_diff > var));
1969  }
1970  
1971  /**
1972   * gfs2_rgrp_used_recently - check when the rgrp glock was last used
1973   * @rs: The block reservation with the rgrp to test
1974   * @msecs: The time limit in milliseconds
1975   *
1976   * Returns: true if more than @msecs milliseconds have elapsed since
       *          the last dlm activity on the rgrp's glock
1977   */
1978  static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs,
1979  				    u64 msecs)
1980  {
1981  	u64 tdiff;
1982  
1983  	tdiff = ktime_to_ns(ktime_sub(ktime_get_real(),
1984  				      rs->rs_rgd->rd_gl->gl_dstamp));
1985  
1986  	return tdiff > (msecs * 1000 * 1000);
1987  }
1988  
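      /*
       * gfs2_orlov_skip - pick a random number of rgrps to skip over
       *
       * Used when allocating directories so that they are spread across
       * the filesystem, in the style of the Orlov allocator.
       */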
1989  static u32 gfs2_orlov_skip(const struct gfs2_inode *ip)
1990  {
1991  	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1992  	u32 skip;
1993  
1994  	get_random_bytes(&skip, sizeof(skip));
1995  	return skip % sdp->sd_rgrps;
1996  }
1997  
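      /*
       * gfs2_select_rgrp - advance to the next rgrp, wrapping to the first
       *
       * Returns: true until we have wrapped all the way around to @begin
       */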
1998  static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin)
1999  {
2000  	struct gfs2_rgrpd *rgd = *pos;
2001  	struct gfs2_sbd *sdp = rgd->rd_sbd;
2002  
2003  	rgd = gfs2_rgrpd_get_next(rgd);
2004  	if (rgd == NULL)
2005  		rgd = gfs2_rgrpd_get_first(sdp);
2006  	*pos = rgd;
2007  	if (rgd != begin) /* If we didn't wrap */
2008  		return true;
2009  	return false;
2010  }
2011  
2012  /**
2013   * fast_to_acquire - determine if a resource group will be fast to acquire
2014   * @rgd: The rgrp
2015   *
2016   * If this is one of our preferred rgrps, it should be quicker to acquire,
2017   * because we tried to set ourselves up as dlm lock master.
2018   */
2019  static inline int fast_to_acquire(struct gfs2_rgrpd *rgd)
2020  {
2021  	struct gfs2_glock *gl = rgd->rd_gl;
2022  
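      	/* A glock still cached on this node, with no holders and no demote
      	   pending, should be acquirable without another dlm round trip. */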
2023  	if (gl->gl_state != LM_ST_UNLOCKED && list_empty(&gl->gl_holders) &&
2024  	    !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
2025  	    !test_bit(GLF_DEMOTE, &gl->gl_flags))
2026  		return 1;
2027  	if (rgd->rd_flags & GFS2_RDF_PREFERRED)
2028  		return 1;
2029  	return 0;
2030  }
2031  
2032  /**
2033   * gfs2_inplace_reserve - Reserve space in the filesystem
2034   * @ip: the inode to reserve space for
2035   * @ap: the allocation parameters
2036   *
2037   * We try our best to find an rgrp that has at least ap->target blocks
2038   * available. After a couple of passes (loops == 2), the prospects of finding
2039   * such an rgrp diminish. At this stage, we return the first rgrp that has
2040   * at least ap->min_target blocks available.
2041   *
2042   * Returns: 0 on success,
2043   *          -ENOSPC if a suitable rgrp can't be found
2044   *          errno otherwise
2045   */
2046  
2047  int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
2048  {
2049  	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2050  	struct gfs2_rgrpd *begin = NULL;
2051  	struct gfs2_blkreserv *rs = &ip->i_res;
2052  	int error = 0, flags = LM_FLAG_NODE_SCOPE;
2053  	bool rg_locked;
2054  	u64 last_unlinked = NO_BLOCK;
2055  	u32 target = ap->target;
2056  	int loops = 0;
2057  	u32 free_blocks, blocks_available, skip = 0;
2058  
2059  	BUG_ON(rs->rs_reserved);
2060  
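      	/* With rgrplvb, trust the lock value block contents and skip
      	   reading the rgrp in from disk when acquiring the glock. */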
2061  	if (sdp->sd_args.ar_rgrplvb)
2062  		flags |= GL_SKIP;
2063  	if (gfs2_assert_warn(sdp, target))
2064  		return -EINVAL;
2065  	if (gfs2_rs_active(rs)) {
2066  		begin = rs->rs_rgd;
2067  	} else if (rs->rs_rgd &&
2068  		   rgrp_contains_block(rs->rs_rgd, ip->i_goal)) {
2069  		begin = rs->rs_rgd;
2070  	} else {
2071  		check_and_update_goal(ip);
2072  		rs->rs_rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
2073  	}
2074  	if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV))
2075  		skip = gfs2_orlov_skip(ip);
2076  	if (rs->rs_rgd == NULL)
2077  		return -EBADSLT;
2078  
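      	/* Pass 0 is the pickiest (fast-to-acquire, uncongested rgrps only),
      	   pass 1 relaxes the congestion test, and pass 2 runs after a log
      	   flush and falls back to ap->min_target (if set). */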
2079  	while (loops < 3) {
2080  		struct gfs2_rgrpd *rgd;
2081  
2082  		rg_locked = gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl);
2083  		if (rg_locked) {
2084  			rgrp_lock_local(rs->rs_rgd);
2085  		} else {
2086  			if (skip && skip--)
2087  				goto next_rgrp;
2088  			if (!gfs2_rs_active(rs)) {
2089  				if (loops == 0 &&
2090  				    !fast_to_acquire(rs->rs_rgd))
2091  					goto next_rgrp;
2092  				if ((loops < 2) &&
2093  				    gfs2_rgrp_used_recently(rs, 1000) &&
2094  				    gfs2_rgrp_congested(rs->rs_rgd, loops))
2095  					goto next_rgrp;
2096  			}
2097  			error = gfs2_glock_nq_init(rs->rs_rgd->rd_gl,
2098  						   LM_ST_EXCLUSIVE, flags,
2099  						   &ip->i_rgd_gh);
2100  			if (unlikely(error))
2101  				return error;
2102  			rgrp_lock_local(rs->rs_rgd);
2103  			if (!gfs2_rs_active(rs) && (loops < 2) &&
2104  			    gfs2_rgrp_congested(rs->rs_rgd, loops))
2105  				goto skip_rgrp;
2106  			if (sdp->sd_args.ar_rgrplvb) {
2107  				error = update_rgrp_lvb(rs->rs_rgd,
2108  							&ip->i_rgd_gh);
2109  				if (unlikely(error)) {
2110  					rgrp_unlock_local(rs->rs_rgd);
2111  					gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2112  					return error;
2113  				}
2114  			}
2115  		}
2116  
2117  		/* Skip unusable resource groups */
2118  		if ((rs->rs_rgd->rd_flags & (GFS2_RGF_NOALLOC |
2119  						 GFS2_RDF_ERROR)) ||
2120  		    (loops == 0 && target > rs->rs_rgd->rd_extfail_pt))
2121  			goto skip_rgrp;
2122  
2123  		if (sdp->sd_args.ar_rgrplvb) {
2124  			error = gfs2_instantiate(&ip->i_rgd_gh);
2125  			if (error)
2126  				goto skip_rgrp;
2127  		}
2128  
2129  		/* Get a reservation if we don't already have one */
2130  		if (!gfs2_rs_active(rs))
2131  			rg_mblk_search(rs->rs_rgd, ip, ap);
2132  
2133  		/* Skip rgrps when we can't get a reservation on first pass */
2134  		if (!gfs2_rs_active(rs) && (loops < 1))
2135  			goto check_rgrp;
2136  
2137  		/* If rgrp has enough free space, use it */
2138  		rgd = rs->rs_rgd;
2139  		spin_lock(&rgd->rd_rsspin);
2140  		free_blocks = rgd_free(rgd, rs);
2141  		blocks_available = rgd->rd_free_clone - rgd->rd_reserved;
2142  		if (free_blocks < target || blocks_available < target) {
2143  			spin_unlock(&rgd->rd_rsspin);
2144  			goto check_rgrp;
2145  		}
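      		/* Success: reserve the requested number of blocks, clamped
      		   to what is actually still available in this rgrp. */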
2146  		rs->rs_reserved = ap->target;
2147  		if (rs->rs_reserved > blocks_available)
2148  			rs->rs_reserved = blocks_available;
2149  		rgd->rd_reserved += rs->rs_reserved;
2150  		spin_unlock(&rgd->rd_rsspin);
2151  		rgrp_unlock_local(rs->rs_rgd);
2152  		return 0;
2153  check_rgrp:
2154  		/* Check for unlinked inodes which can be reclaimed */
2155  		if (rs->rs_rgd->rd_flags & GFS2_RDF_CHECK)
2156  			try_rgrp_unlink(rs->rs_rgd, &last_unlinked,
2157  					ip->i_no_addr);
2158  skip_rgrp:
2159  		rgrp_unlock_local(rs->rs_rgd);
2160  
2161  		/* Drop reservation, if we couldn't use reserved rgrp */
2162  		if (gfs2_rs_active(rs))
2163  			gfs2_rs_deltree(rs);
2164  
2165  		/* Unlock rgrp if required */
2166  		if (!rg_locked)
2167  			gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2168  next_rgrp:
2169  		/* Find the next rgrp, and continue looking */
2170  		if (gfs2_select_rgrp(&rs->rs_rgd, begin))
2171  			continue;
2172  		if (skip)
2173  			continue;
2174  
2175  		/* If we've scanned all the rgrps, but found no free blocks
2176  		 * then this checks for some less likely conditions before
2177  		 * trying again.
2178  		 */
2179  		loops++;
2180  		/* Check that fs hasn't grown if writing to rindex */
2181  		if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) {
2182  			error = gfs2_ri_update(ip);
2183  			if (error)
2184  				return error;
2185  		}
2186  		/* Flushing the log may release space */
2187  		if (loops == 2) {
2188  			if (ap->min_target)
2189  				target = ap->min_target;
2190  			gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
2191  				       GFS2_LFC_INPLACE_RESERVE);
2192  		}
2193  	}
2194  
2195  	return -ENOSPC;
2196  }
2197  
2198  /**
2199   * gfs2_inplace_release - release an inplace reservation
2200   * @ip: the inode the reservation was taken out on
2201   *
2202   * Release a reservation made by gfs2_inplace_reserve().
2203   */
2204  
2205  void gfs2_inplace_release(struct gfs2_inode *ip)
2206  {
2207  	struct gfs2_blkreserv *rs = &ip->i_res;
2208  
2209  	if (rs->rs_reserved) {
2210  		struct gfs2_rgrpd *rgd = rs->rs_rgd;
2211  
2212  		spin_lock(&rgd->rd_rsspin);
2213  		GLOCK_BUG_ON(rgd->rd_gl, rgd->rd_reserved < rs->rs_reserved);
2214  		rgd->rd_reserved -= rs->rs_reserved;
2215  		spin_unlock(&rgd->rd_rsspin);
2216  		rs->rs_reserved = 0;
2217  	}
2218  	if (gfs2_holder_initialized(&ip->i_rgd_gh))
2219  		gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2220  }
2221  
2222  /**
2223   * gfs2_alloc_extent - allocate an extent from a given bitmap
2224   * @rbm: the resource group information
2225   * @dinode: TRUE if the first block we allocate is for a dinode
2226   * @n: The extent length (value/result)
2227   *
2228   * Add the bitmap buffer to the transaction.
2229   * Set the first block to the dinode or used state, and any further
       * blocks in the extent to the used state.
2230   */
2231  static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode,
2232  			     unsigned int *n)
2233  {
2234  	struct gfs2_rbm pos = { .rgd = rbm->rgd, };
2235  	const unsigned int elen = *n;
2236  	u64 block;
2237  	int ret;
2238  
2239  	*n = 1;
2240  	block = gfs2_rbm_to_block(rbm);
2241  	gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm_bi(rbm)->bi_bh);
2242  	gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
2243  	block++;
2244  	while (*n < elen) {
2245  		ret = gfs2_rbm_from_block(&pos, block);
2246  		if (ret || gfs2_testbit(&pos, true) != GFS2_BLKST_FREE)
2247  			break;
2248  		gfs2_trans_add_meta(pos.rgd->rd_gl, rbm_bi(&pos)->bi_bh);
2249  		gfs2_setbit(&pos, true, GFS2_BLKST_USED);
2250  		(*n)++;
2251  		block++;
2252  	}
2253  }
2254  
2255  /**
2256   * rgblk_free - Change alloc state of given block(s)
2257   * @sdp: the filesystem
2258   * @rgd: the resource group the blocks are in
2259   * @bstart: the start of a run of blocks to free
2260   * @blen: the length of the block run (all must lie within ONE RG!)
2261   * @new_state: GFS2_BLKST_XXX the after-allocation block state
2262   */
2263  
2264  static void rgblk_free(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd,
2265  		       u64 bstart, u32 blen, unsigned char new_state)
2266  {
2267  	struct gfs2_rbm rbm;
2268  	struct gfs2_bitmap *bi, *bi_prev = NULL;
2269  
2270  	rbm.rgd = rgd;
2271  	if (WARN_ON_ONCE(gfs2_rbm_from_block(&rbm, bstart)))
2272  		return;
2273  	while (blen--) {
2274  		bi = rbm_bi(&rbm);
2275  		if (bi != bi_prev) {
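      			/* Lazily clone the bitmap: the clone keeps the
      			   pre-free state visible to allocators until the
      			   free has been committed to the log. */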
2276  			if (!bi->bi_clone) {
2277  				bi->bi_clone = kmalloc(bi->bi_bh->b_size,
2278  						      GFP_NOFS | __GFP_NOFAIL);
2279  				memcpy(bi->bi_clone + bi->bi_offset,
2280  				       bi->bi_bh->b_data + bi->bi_offset,
2281  				       bi->bi_bytes);
2282  			}
2283  			gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
2284  			bi_prev = bi;
2285  		}
2286  		gfs2_setbit(&rbm, false, new_state);
2287  		gfs2_rbm_add(&rbm, 1);
2288  	}
2289  }
2290  
2291  /**
2292   * gfs2_rgrp_dump - print out an rgrp
2293   * @seq: The iterator
2294   * @rgd: The rgrp in question
2295   * @fs_id_buf: pointer to file system id (if requested)
2296   *
2297   */
2298  
2299  void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd,
2300  		    const char *fs_id_buf)
2301  {
2302  	struct gfs2_blkreserv *trs;
2303  	const struct rb_node *n;
2304  
2305  	spin_lock(&rgd->rd_rsspin);
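      	/* Key: n:addr f:flags b:free/free_clone i:dinodes q:requested
      	   r:reserved e:extfail_pt */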
2306  	gfs2_print_dbg(seq, "%s R: n:%llu f:%02x b:%u/%u i:%u q:%u r:%u e:%u\n",
2307  		       fs_id_buf,
2308  		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
2309  		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
2310  		       rgd->rd_requested, rgd->rd_reserved, rgd->rd_extfail_pt);
2311  	if (rgd->rd_sbd->sd_args.ar_rgrplvb) {
2312  		struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
2313  
2314  		gfs2_print_dbg(seq, "%s  L: f:%02x b:%u i:%u\n", fs_id_buf,
2315  			       be32_to_cpu(rgl->rl_flags),
2316  			       be32_to_cpu(rgl->rl_free),
2317  			       be32_to_cpu(rgl->rl_dinodes));
2318  	}
2319  	for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
2320  		trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
2321  		dump_rs(seq, trs, fs_id_buf);
2322  	}
2323  	spin_unlock(&rgd->rd_rsspin);
2324  }
2325  
2326  static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
2327  {
2328  	struct gfs2_sbd *sdp = rgd->rd_sbd;
2329  	char fs_id_buf[sizeof(sdp->sd_fsname) + 7];
2330  
2331  	fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
2332  		(unsigned long long)rgd->rd_addr);
2333  	fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
2334  	sprintf(fs_id_buf, "fsid=%s: ", sdp->sd_fsname);
2335  	gfs2_rgrp_dump(NULL, rgd, fs_id_buf);
2336  	rgd->rd_flags |= GFS2_RDF_ERROR;
2337  }
2338  
2339  /**
2340   * gfs2_adjust_reservation - Adjust (or remove) a reservation after allocation
2341   * @ip: The inode we have just allocated blocks for
2342   * @rbm: The start of the allocated blocks
2343   * @len: The extent length
2344   *
2345   * Adjusts a reservation after an allocation has taken place. If the
2346   * reservation does not match the allocation, or if it is now empty
2347   * then it is removed.
2348   */
2349  
2350  static void gfs2_adjust_reservation(struct gfs2_inode *ip,
2351  				    const struct gfs2_rbm *rbm, unsigned len)
2352  {
2353  	struct gfs2_blkreserv *rs = &ip->i_res;
2354  	struct gfs2_rgrpd *rgd = rbm->rgd;
2355  
2356  	BUG_ON(rs->rs_reserved < len);
2357  	rs->rs_reserved -= len;
2358  	if (gfs2_rs_active(rs)) {
2359  		u64 start = gfs2_rbm_to_block(rbm);
2360  
2361  		if (rs->rs_start == start) {
2362  			unsigned int rlen;
2363  
2364  			rs->rs_start += len;
2365  			rlen = min(rs->rs_requested, len);
2366  			rs->rs_requested -= rlen;
2367  			rgd->rd_requested -= rlen;
2368  			trace_gfs2_rs(rs, TRACE_RS_CLAIM);
2369  			if (rs->rs_start < rgd->rd_data0 + rgd->rd_data &&
2370  			    rs->rs_requested)
2371  				return;
2372  			/* We used up our block reservation, so we should
2373  			   reserve more blocks next time. */
2374  			atomic_add(RGRP_RSRV_ADDBLKS, &ip->i_sizehint);
2375  		}
2376  		__rs_deltree(rs);
2377  	}
2378  }
2379  
2380  /**
2381   * gfs2_set_alloc_start - Set starting point for block allocation
2382   * @rbm: The rbm which will be set to the required location
2383   * @ip: The gfs2 inode
2384   * @dinode: Flag to say if allocation includes a new inode
2385   *
2386   * This sets the starting point from the reservation if one is active
2387   * otherwise it falls back to guessing a start point based on the
2388   * inode's goal block or the last allocation point in the rgrp.
2389   */
2390  
2391  static void gfs2_set_alloc_start(struct gfs2_rbm *rbm,
2392  				 const struct gfs2_inode *ip, bool dinode)
2393  {
2394  	u64 goal;
2395  
2396  	if (gfs2_rs_active(&ip->i_res)) {
2397  		goal = ip->i_res.rs_start;
2398  	} else {
2399  		if (!dinode && rgrp_contains_block(rbm->rgd, ip->i_goal))
2400  			goal = ip->i_goal;
2401  		else
2402  			goal = rbm->rgd->rd_last_alloc + rbm->rgd->rd_data0;
2403  	}
2404  	if (WARN_ON_ONCE(gfs2_rbm_from_block(rbm, goal))) {
2405  		rbm->bii = 0;
2406  		rbm->offset = 0;
2407  	}
2408  }
2409  
2410  /**
2411   * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode
2412   * @ip: the inode to allocate the block for
2413   * @bn: Used to return the starting block number
2414   * @nblocks: requested number of blocks/extent length (value/result)
2415   * @dinode: 1 if we're allocating a dinode block, else 0
2416   * @generation: Used to return the generation number of a new dinode
2417   *
2418   * Returns: 0 or error
2419   */
2420  
2421  int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
2422  		      bool dinode, u64 *generation)
2423  {
2424  	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2425  	struct buffer_head *dibh;
2426  	struct gfs2_rbm rbm = { .rgd = ip->i_res.rs_rgd, };
2427  	u64 block; /* block, within the file system scope */
2428  	u32 minext = 1;
2429  	int error = -ENOSPC;
2430  
2431  	BUG_ON(ip->i_res.rs_reserved < *nblocks);
2432  
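      	/* If we have an active reservation, start the search there, steering
      	   clear of other inodes' reservations; failing that, search the
      	   whole rgrp with reservations ignored. */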
2433  	rgrp_lock_local(rbm.rgd);
2434  	if (gfs2_rs_active(&ip->i_res)) {
2435  		gfs2_set_alloc_start(&rbm, ip, dinode);
2436  		error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &minext, &ip->i_res, false);
2437  	}
2438  	if (error == -ENOSPC) {
2439  		gfs2_set_alloc_start(&rbm, ip, dinode);
2440  		error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &minext, NULL, false);
2441  	}
2442  
2443  	/* Since all blocks are reserved in advance, this shouldn't happen */
2444  	if (error) {
2445  		fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d fail_pt=%d\n",
2446  			(unsigned long long)ip->i_no_addr, error, *nblocks,
2447  			test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags),
2448  			rbm.rgd->rd_extfail_pt);
2449  		goto rgrp_error;
2450  	}
2451  
2452  	gfs2_alloc_extent(&rbm, dinode, nblocks);
2453  	block = gfs2_rbm_to_block(&rbm);
2454  	rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0;
2455  	if (!dinode) {
2456  		ip->i_goal = block + *nblocks - 1;
2457  		error = gfs2_meta_inode_buffer(ip, &dibh);
2458  		if (error == 0) {
2459  			struct gfs2_dinode *di =
2460  				(struct gfs2_dinode *)dibh->b_data;
2461  			gfs2_trans_add_meta(ip->i_gl, dibh);
2462  			di->di_goal_meta = di->di_goal_data =
2463  				cpu_to_be64(ip->i_goal);
2464  			brelse(dibh);
2465  		}
2466  	}
2467  	spin_lock(&rbm.rgd->rd_rsspin);
2468  	gfs2_adjust_reservation(ip, &rbm, *nblocks);
2469  	if (rbm.rgd->rd_free < *nblocks || rbm.rgd->rd_reserved < *nblocks) {
2470  		fs_warn(sdp, "nblocks=%u\n", *nblocks);
2471  		spin_unlock(&rbm.rgd->rd_rsspin);
2472  		goto rgrp_error;
2473  	}
2474  	GLOCK_BUG_ON(rbm.rgd->rd_gl, rbm.rgd->rd_reserved < *nblocks);
2475  	GLOCK_BUG_ON(rbm.rgd->rd_gl, rbm.rgd->rd_free_clone < *nblocks);
2476  	GLOCK_BUG_ON(rbm.rgd->rd_gl, rbm.rgd->rd_free < *nblocks);
2477  	rbm.rgd->rd_reserved -= *nblocks;
2478  	rbm.rgd->rd_free_clone -= *nblocks;
2479  	rbm.rgd->rd_free -= *nblocks;
2480  	spin_unlock(&rbm.rgd->rd_rsspin);
2481  	if (dinode) {
2482  		rbm.rgd->rd_dinodes++;
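      		/* Hand out the next dinode generation number, skipping
      		   zero. */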
2483  		*generation = rbm.rgd->rd_igeneration++;
2484  		if (*generation == 0)
2485  			*generation = rbm.rgd->rd_igeneration++;
2486  	}
2487  
2488  	gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh);
2489  	gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data);
2490  	rgrp_unlock_local(rbm.rgd);
2491  
2492  	gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0);
2493  	if (dinode)
2494  		gfs2_trans_remove_revoke(sdp, block, *nblocks);
2495  
2496  	gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid);
2497  
2498  	trace_gfs2_block_alloc(ip, rbm.rgd, block, *nblocks,
2499  			       dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
2500  	*bn = block;
2501  	return 0;
2502  
2503  rgrp_error:
2504  	rgrp_unlock_local(rbm.rgd);
2505  	gfs2_rgrp_error(rbm.rgd);
2506  	return -EIO;
2507  }
2508  
2509  /**
2510   * __gfs2_free_blocks - free a contiguous run of block(s)
2511   * @ip: the inode these blocks are being freed from
2512   * @rgd: the resource group the blocks are in
2513   * @bstart: first block of a run of contiguous blocks
2514   * @blen: the length of the block run
2515   * @meta: 1 if the blocks represent metadata
2516   *
2517   */
2518  
2519  void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
2520  			u64 bstart, u32 blen, int meta)
2521  {
2522  	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2523  
2524  	rgrp_lock_local(rgd);
2525  	rgblk_free(sdp, rgd, bstart, blen, GFS2_BLKST_FREE);
2526  	trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE);
2527  	rgd->rd_free += blen;
2528  	rgd->rd_flags &= ~GFS2_RGF_TRIMMED;
2529  	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2530  	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2531  	rgrp_unlock_local(rgd);
2532  
2533  	/* Directories keep their data in the metadata address space */
2534  	if (meta || ip->i_depth || gfs2_is_jdata(ip))
2535  		gfs2_journal_wipe(ip, bstart, blen);
2536  }
2537  
2538  /**
2539   * gfs2_free_meta - free a contiguous run of metadata block(s)
2540   * @ip: the inode these blocks are being freed from
2541   * @rgd: the resource group the blocks are in
2542   * @bstart: first block of a run of contiguous blocks
2543   * @blen: the length of the block run
2544   *
2545   */
2546  
2547  void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
2548  		    u64 bstart, u32 blen)
2549  {
2550  	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2551  
2552  	__gfs2_free_blocks(ip, rgd, bstart, blen, 1);
2553  	gfs2_statfs_change(sdp, 0, +blen, 0);
2554  	gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
2555  }
2556  
2557  void gfs2_unlink_di(struct inode *inode)
2558  {
2559  	struct gfs2_inode *ip = GFS2_I(inode);
2560  	struct gfs2_sbd *sdp = GFS2_SB(inode);
2561  	struct gfs2_rgrpd *rgd;
2562  	u64 blkno = ip->i_no_addr;
2563  
2564  	rgd = gfs2_blk2rgrpd(sdp, blkno, true);
2565  	if (!rgd)
2566  		return;
2567  	rgrp_lock_local(rgd);
2568  	rgblk_free(sdp, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
2569  	trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
2570  	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2571  	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2572  	be32_add_cpu(&rgd->rd_rgl->rl_unlinked, 1);
2573  	rgrp_unlock_local(rgd);
2574  }
2575  
2576  void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
2577  {
2578  	struct gfs2_sbd *sdp = rgd->rd_sbd;
2579  
2580  	rgrp_lock_local(rgd);
2581  	rgblk_free(sdp, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
2582  	if (!rgd->rd_dinodes)
2583  		gfs2_consist_rgrpd(rgd);
2584  	rgd->rd_dinodes--;
2585  	rgd->rd_free++;
2586  
2587  	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2588  	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2589  	rgrp_unlock_local(rgd);
2590  	be32_add_cpu(&rgd->rd_rgl->rl_unlinked, -1);
2591  
2592  	gfs2_statfs_change(sdp, 0, +1, -1);
2593  	trace_gfs2_block_alloc(ip, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
2594  	gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid);
2595  	gfs2_journal_wipe(ip, ip->i_no_addr, 1);
2596  }
2597  
2598  /**
2599   * gfs2_check_blk_type - Check the type of a block
2600   * @sdp: The superblock
2601   * @no_addr: The block number to check
2602   * @type: The block type we are looking for
2603   *
2604   * The inode glock of @no_addr must be held.  The @type to check for is either
2605   * GFS2_BLKST_DINODE or GFS2_BLKST_UNLINKED; checking for type GFS2_BLKST_FREE
2606   * or GFS2_BLKST_USED would make no sense.
2607   *
2608   * Returns: 0 if the block type matches the expected type
2609   *          -ESTALE if it doesn't match
2610   *          or -ve errno if something went wrong while checking
2611   */
2612  
2613  int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
2614  {
2615  	struct gfs2_rgrpd *rgd;
2616  	struct gfs2_holder rgd_gh;
2617  	struct gfs2_rbm rbm;
2618  	int error = -EINVAL;
2619  
2620  	rgd = gfs2_blk2rgrpd(sdp, no_addr, 1);
2621  	if (!rgd)
2622  		goto fail;
2623  
2624  	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
2625  	if (error)
2626  		goto fail;
2627  
2628  	rbm.rgd = rgd;
2629  	error = gfs2_rbm_from_block(&rbm, no_addr);
2630  	if (!WARN_ON_ONCE(error)) {
2631  		/*
2632  		 * No need to take the local resource group lock here; the
2633  		 * inode glock of @no_addr provides the necessary
2634  		 * synchronization in case the block is an inode.  (In case
2635  		 * the block is not an inode, the block type will not match
2636  		 * the @type we are looking for.)
2637  		 */
2638  		if (gfs2_testbit(&rbm, false) != type)
2639  			error = -ESTALE;
2640  	}
2641  
2642  	gfs2_glock_dq_uninit(&rgd_gh);
2643  
2644  fail:
2645  	return error;
2646  }
2647  
2648  /**
2649   * gfs2_rlist_add - add a RG to a list of RGs
2650   * @ip: the inode
2651   * @rlist: the list of resource groups
2652   * @block: the block
2653   *
2654   * Figure out what RG a block belongs to and add that RG to the list
2655   *
2656   * FIXME: Don't use NOFAIL
2657   *
2658   */
2659  
2660  void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
2661  		    u64 block)
2662  {
2663  	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2664  	struct gfs2_rgrpd *rgd;
2665  	struct gfs2_rgrpd **tmp;
2666  	unsigned int new_space;
2667  	unsigned int x;
2668  
2669  	if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
2670  		return;
2671  
2672  	/*
2673  	 * The resource group last accessed is kept in the last position.
2674  	 */
2675  
2676  	if (rlist->rl_rgrps) {
2677  		rgd = rlist->rl_rgd[rlist->rl_rgrps - 1];
2678  		if (rgrp_contains_block(rgd, block))
2679  			return;
2680  		rgd = gfs2_blk2rgrpd(sdp, block, 1);
2681  	} else {
2682  		rgd = ip->i_res.rs_rgd;
2683  		if (!rgd || !rgrp_contains_block(rgd, block))
2684  			rgd = gfs2_blk2rgrpd(sdp, block, 1);
2685  	}
2686  
2687  	if (!rgd) {
2688  		fs_err(sdp, "rlist_add: no rgrp for block %llu\n",
2689  		       (unsigned long long)block);
2690  		return;
2691  	}
2692  
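      	/* If the rgrp is already listed, move it to the most-recently-used
      	   (last) position. */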
2693  	for (x = 0; x < rlist->rl_rgrps; x++) {
2694  		if (rlist->rl_rgd[x] == rgd) {
2695  			swap(rlist->rl_rgd[x],
2696  			     rlist->rl_rgd[rlist->rl_rgrps - 1]);
2697  			return;
2698  		}
2699  	}
2700  
2701  	if (rlist->rl_rgrps == rlist->rl_space) {
2702  		new_space = rlist->rl_space + 10;
2703  
2704  		tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
2705  			      GFP_NOFS | __GFP_NOFAIL);
2706  
2707  		if (rlist->rl_rgd) {
2708  			memcpy(tmp, rlist->rl_rgd,
2709  			       rlist->rl_space * sizeof(struct gfs2_rgrpd *));
2710  			kfree(rlist->rl_rgd);
2711  		}
2712  
2713  		rlist->rl_space = new_space;
2714  		rlist->rl_rgd = tmp;
2715  	}
2716  
2717  	rlist->rl_rgd[rlist->rl_rgrps++] = rgd;
2718  }
2719  
2720  /**
2721   * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate
2722   *      and initialize an array of glock holders for them
2723   * @rlist: the list of resource groups
2724   *
2725   * FIXME: Don't use NOFAIL
2726   *
2727   */
2728  
2729  void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist)
2730  {
2731  	unsigned int x;
2732  
2733  	rlist->rl_ghs = kmalloc_array(rlist->rl_rgrps,
2734  				      sizeof(struct gfs2_holder),
2735  				      GFP_NOFS | __GFP_NOFAIL);
2736  	for (x = 0; x < rlist->rl_rgrps; x++)
2737  		gfs2_holder_init(rlist->rl_rgd[x]->rd_gl, LM_ST_EXCLUSIVE,
2738  				 LM_FLAG_NODE_SCOPE, &rlist->rl_ghs[x]);
2739  }
2740  
2741  /**
2742   * gfs2_rlist_free - free a resource group list
2743   * @rlist: the list of resource groups
2744   *
2745   */
2746  
2747  void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
2748  {
2749  	unsigned int x;
2750  
2751  	kfree(rlist->rl_rgd);
2752  
2753  	if (rlist->rl_ghs) {
2754  		for (x = 0; x < rlist->rl_rgrps; x++)
2755  			gfs2_holder_uninit(&rlist->rl_ghs[x]);
2756  		kfree(rlist->rl_ghs);
2757  		rlist->rl_ghs = NULL;
2758  	}
2759  }
2760  
2761  void rgrp_lock_local(struct gfs2_rgrpd *rgd)
2762  {
2763  	mutex_lock(&rgd->rd_mutex);
2764  }
2765  
2766  void rgrp_unlock_local(struct gfs2_rgrpd *rgd)
2767  {
2768  	mutex_unlock(&rgd->rd_mutex);
2769  }
2770