xref: /openbmc/linux/fs/xfs/libxfs/xfs_alloc.c (revision 2a954832)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
4  * All Rights Reserved.
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_format.h"
9 #include "xfs_log_format.h"
10 #include "xfs_shared.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_bit.h"
13 #include "xfs_mount.h"
14 #include "xfs_defer.h"
15 #include "xfs_btree.h"
16 #include "xfs_rmap.h"
17 #include "xfs_alloc_btree.h"
18 #include "xfs_alloc.h"
19 #include "xfs_extent_busy.h"
20 #include "xfs_errortag.h"
21 #include "xfs_error.h"
22 #include "xfs_trace.h"
23 #include "xfs_trans.h"
24 #include "xfs_buf_item.h"
25 #include "xfs_log.h"
26 #include "xfs_ag.h"
27 #include "xfs_ag_resv.h"
28 #include "xfs_bmap.h"
29 
30 struct kmem_cache	*xfs_extfree_item_cache;
31 
32 struct workqueue_struct *xfs_alloc_wq;
33 
34 #define XFS_ABSDIFF(a,b)	(((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
35 
36 #define	XFSA_FIXUP_BNO_OK	1
37 #define	XFSA_FIXUP_CNT_OK	2
38 
39 /*
40  * Size of the AGFL.  For CRC-enabled filesystes we steal a couple of slots in
41  * the beginning of the block for a proper header with the location information
42  * and CRC.
43  */
44 unsigned int
45 xfs_agfl_size(
46 	struct xfs_mount	*mp)
47 {
48 	unsigned int		size = mp->m_sb.sb_sectsize;
49 
50 	if (xfs_has_crc(mp))
51 		size -= sizeof(struct xfs_agfl);
52 
53 	return size / sizeof(xfs_agblock_t);
54 }
55 
56 unsigned int
57 xfs_refc_block(
58 	struct xfs_mount	*mp)
59 {
60 	if (xfs_has_rmapbt(mp))
61 		return XFS_RMAP_BLOCK(mp) + 1;
62 	if (xfs_has_finobt(mp))
63 		return XFS_FIBT_BLOCK(mp) + 1;
64 	return XFS_IBT_BLOCK(mp) + 1;
65 }
66 
67 xfs_extlen_t
68 xfs_prealloc_blocks(
69 	struct xfs_mount	*mp)
70 {
71 	if (xfs_has_reflink(mp))
72 		return xfs_refc_block(mp) + 1;
73 	if (xfs_has_rmapbt(mp))
74 		return XFS_RMAP_BLOCK(mp) + 1;
75 	if (xfs_has_finobt(mp))
76 		return XFS_FIBT_BLOCK(mp) + 1;
77 	return XFS_IBT_BLOCK(mp) + 1;
78 }
79 
80 /*
81  * The number of blocks per AG that we withhold from xfs_mod_fdblocks to
82  * guarantee that we can refill the AGFL prior to allocating space in a nearly
83  * full AG.  Although the space described by the free space btrees, the
84  * blocks used by the freesp btrees themselves, and the blocks owned by the
85  * AGFL are counted in the ondisk fdblocks, it's a mistake to let the ondisk
86  * free space in the AG drop so low that the free space btrees cannot refill an
87  * empty AGFL up to the minimum level.  Rather than grind through empty AGs
88  * until the fs goes down, we subtract this many AG blocks from the incore
89  * fdblocks to ensure user allocation does not overcommit the space the
90  * filesystem needs for the AGFLs.  The rmap btree uses a per-AG reservation to
91  * withhold space from xfs_mod_fdblocks, so we do not account for that here.
92  */
93 #define XFS_ALLOCBT_AGFL_RESERVE	4
94 
95 /*
96  * Compute the number of blocks that we set aside to guarantee the ability to
97  * refill the AGFL and handle a full bmap btree split.
98  *
99  * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of
100  * AGF buffer (PV 947395), we place constraints on the relationship among
101  * actual allocations for data blocks, freelist blocks, and potential file data
102  * bmap btree blocks. However, these restrictions may result in no actual space
103  * allocated for a delayed extent, for example, a data block in a certain AG is
104  * allocated but there is no additional block for the additional bmap btree
105  * block due to a split of the bmap btree of the file. The result of this may
106  * lead to an infinite loop when the file gets flushed to disk and all delayed
107  * extents need to be actually allocated. To get around this, we explicitly set
108  * aside a few blocks which will not be reserved in delayed allocation.
109  *
110  * For each AG, we need to reserve enough blocks to replenish a totally empty
111  * AGFL and 4 more to handle a potential split of the file's bmap btree.
112  */
113 unsigned int
114 xfs_alloc_set_aside(
115 	struct xfs_mount	*mp)
116 {
117 	return mp->m_sb.sb_agcount * (XFS_ALLOCBT_AGFL_RESERVE + 4);
118 }
119 
120 /*
121  * When deciding how much space to allocate out of an AG, we limit the
122  * allocation maximum size to the size the AG. However, we cannot use all the
123  * blocks in the AG - some are permanently used by metadata. These
124  * blocks are generally:
125  *	- the AG superblock, AGF, AGI and AGFL
126  *	- the AGF (bno and cnt) and AGI btree root blocks, and optionally
127  *	  the AGI free inode and rmap btree root blocks.
128  *	- blocks on the AGFL according to xfs_alloc_set_aside() limits
129  *	- the rmapbt root block
130  *
131  * The AG headers are sector sized, so the amount of space they take up is
132  * dependent on filesystem geometry. The others are all single blocks.
133  */
134 unsigned int
135 xfs_alloc_ag_max_usable(
136 	struct xfs_mount	*mp)
137 {
138 	unsigned int		blocks;
139 
140 	blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */
141 	blocks += XFS_ALLOCBT_AGFL_RESERVE;
142 	blocks += 3;			/* AGF, AGI btree root blocks */
143 	if (xfs_has_finobt(mp))
144 		blocks++;		/* finobt root block */
145 	if (xfs_has_rmapbt(mp))
146 		blocks++;		/* rmap root block */
147 	if (xfs_has_reflink(mp))
148 		blocks++;		/* refcount root block */
149 
150 	return mp->m_sb.sb_agblocks - blocks;
151 }
152 
153 /*
154  * Lookup the record equal to [bno, len] in the btree given by cur.
155  */
156 STATIC int				/* error */
157 xfs_alloc_lookup_eq(
158 	struct xfs_btree_cur	*cur,	/* btree cursor */
159 	xfs_agblock_t		bno,	/* starting block of extent */
160 	xfs_extlen_t		len,	/* length of extent */
161 	int			*stat)	/* success/failure */
162 {
163 	int			error;
164 
165 	cur->bc_rec.a.ar_startblock = bno;
166 	cur->bc_rec.a.ar_blockcount = len;
167 	error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
168 	cur->bc_ag.abt.active = (*stat == 1);
169 	return error;
170 }
171 
172 /*
173  * Lookup the first record greater than or equal to [bno, len]
174  * in the btree given by cur.
175  */
176 int				/* error */
177 xfs_alloc_lookup_ge(
178 	struct xfs_btree_cur	*cur,	/* btree cursor */
179 	xfs_agblock_t		bno,	/* starting block of extent */
180 	xfs_extlen_t		len,	/* length of extent */
181 	int			*stat)	/* success/failure */
182 {
183 	int			error;
184 
185 	cur->bc_rec.a.ar_startblock = bno;
186 	cur->bc_rec.a.ar_blockcount = len;
187 	error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
188 	cur->bc_ag.abt.active = (*stat == 1);
189 	return error;
190 }
191 
192 /*
193  * Lookup the first record less than or equal to [bno, len]
194  * in the btree given by cur.
195  */
196 int					/* error */
197 xfs_alloc_lookup_le(
198 	struct xfs_btree_cur	*cur,	/* btree cursor */
199 	xfs_agblock_t		bno,	/* starting block of extent */
200 	xfs_extlen_t		len,	/* length of extent */
201 	int			*stat)	/* success/failure */
202 {
203 	int			error;
204 	cur->bc_rec.a.ar_startblock = bno;
205 	cur->bc_rec.a.ar_blockcount = len;
206 	error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
207 	cur->bc_ag.abt.active = (*stat == 1);
208 	return error;
209 }
210 
211 static inline bool
212 xfs_alloc_cur_active(
213 	struct xfs_btree_cur	*cur)
214 {
215 	return cur && cur->bc_ag.abt.active;
216 }
217 
218 /*
219  * Update the record referred to by cur to the value given
220  * by [bno, len].
221  * This either works (return 0) or gets an EFSCORRUPTED error.
222  */
223 STATIC int				/* error */
224 xfs_alloc_update(
225 	struct xfs_btree_cur	*cur,	/* btree cursor */
226 	xfs_agblock_t		bno,	/* starting block of extent */
227 	xfs_extlen_t		len)	/* length of extent */
228 {
229 	union xfs_btree_rec	rec;
230 
231 	rec.alloc.ar_startblock = cpu_to_be32(bno);
232 	rec.alloc.ar_blockcount = cpu_to_be32(len);
233 	return xfs_btree_update(cur, &rec);
234 }
235 
236 /* Convert the ondisk btree record to its incore representation. */
237 void
238 xfs_alloc_btrec_to_irec(
239 	const union xfs_btree_rec	*rec,
240 	struct xfs_alloc_rec_incore	*irec)
241 {
242 	irec->ar_startblock = be32_to_cpu(rec->alloc.ar_startblock);
243 	irec->ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
244 }
245 
246 /* Simple checks for free space records. */
247 xfs_failaddr_t
248 xfs_alloc_check_irec(
249 	struct xfs_btree_cur		*cur,
250 	const struct xfs_alloc_rec_incore *irec)
251 {
252 	struct xfs_perag		*pag = cur->bc_ag.pag;
253 
254 	if (irec->ar_blockcount == 0)
255 		return __this_address;
256 
257 	/* check for valid extent range, including overflow */
258 	if (!xfs_verify_agbext(pag, irec->ar_startblock, irec->ar_blockcount))
259 		return __this_address;
260 
261 	return NULL;
262 }
263 
264 static inline int
265 xfs_alloc_complain_bad_rec(
266 	struct xfs_btree_cur		*cur,
267 	xfs_failaddr_t			fa,
268 	const struct xfs_alloc_rec_incore *irec)
269 {
270 	struct xfs_mount		*mp = cur->bc_mp;
271 
272 	xfs_warn(mp,
273 		"%s Freespace BTree record corruption in AG %d detected at %pS!",
274 		cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size",
275 		cur->bc_ag.pag->pag_agno, fa);
276 	xfs_warn(mp,
277 		"start block 0x%x block count 0x%x", irec->ar_startblock,
278 		irec->ar_blockcount);
279 	return -EFSCORRUPTED;
280 }
281 
282 /*
283  * Get the data from the pointed-to record.
284  */
285 int					/* error */
286 xfs_alloc_get_rec(
287 	struct xfs_btree_cur	*cur,	/* btree cursor */
288 	xfs_agblock_t		*bno,	/* output: starting block of extent */
289 	xfs_extlen_t		*len,	/* output: length of extent */
290 	int			*stat)	/* output: success/failure */
291 {
292 	struct xfs_alloc_rec_incore irec;
293 	union xfs_btree_rec	*rec;
294 	xfs_failaddr_t		fa;
295 	int			error;
296 
297 	error = xfs_btree_get_rec(cur, &rec, stat);
298 	if (error || !(*stat))
299 		return error;
300 
301 	xfs_alloc_btrec_to_irec(rec, &irec);
302 	fa = xfs_alloc_check_irec(cur, &irec);
303 	if (fa)
304 		return xfs_alloc_complain_bad_rec(cur, fa, &irec);
305 
306 	*bno = irec.ar_startblock;
307 	*len = irec.ar_blockcount;
308 	return 0;
309 }
310 
311 /*
312  * Compute aligned version of the found extent.
313  * Takes alignment and min length into account.
314  */
315 STATIC bool
316 xfs_alloc_compute_aligned(
317 	xfs_alloc_arg_t	*args,		/* allocation argument structure */
318 	xfs_agblock_t	foundbno,	/* starting block in found extent */
319 	xfs_extlen_t	foundlen,	/* length in found extent */
320 	xfs_agblock_t	*resbno,	/* result block number */
321 	xfs_extlen_t	*reslen,	/* result length */
322 	unsigned	*busy_gen)
323 {
324 	xfs_agblock_t	bno = foundbno;
325 	xfs_extlen_t	len = foundlen;
326 	xfs_extlen_t	diff;
327 	bool		busy;
328 
329 	/* Trim busy sections out of found extent */
330 	busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen);
331 
332 	/*
333 	 * If we have a largish extent that happens to start before min_agbno,
334 	 * see if we can shift it into range...
335 	 */
336 	if (bno < args->min_agbno && bno + len > args->min_agbno) {
337 		diff = args->min_agbno - bno;
338 		if (len > diff) {
339 			bno += diff;
340 			len -= diff;
341 		}
342 	}
343 
344 	if (args->alignment > 1 && len >= args->minlen) {
345 		xfs_agblock_t	aligned_bno = roundup(bno, args->alignment);
346 
347 		diff = aligned_bno - bno;
348 
349 		*resbno = aligned_bno;
350 		*reslen = diff >= len ? 0 : len - diff;
351 	} else {
352 		*resbno = bno;
353 		*reslen = len;
354 	}
355 
356 	return busy;
357 }
358 
359 /*
360  * Compute best start block and diff for "near" allocations.
361  * freelen >= wantlen already checked by caller.
362  */
363 STATIC xfs_extlen_t			/* difference value (absolute) */
364 xfs_alloc_compute_diff(
365 	xfs_agblock_t	wantbno,	/* target starting block */
366 	xfs_extlen_t	wantlen,	/* target length */
367 	xfs_extlen_t	alignment,	/* target alignment */
368 	int		datatype,	/* are we allocating data? */
369 	xfs_agblock_t	freebno,	/* freespace's starting block */
370 	xfs_extlen_t	freelen,	/* freespace's length */
371 	xfs_agblock_t	*newbnop)	/* result: best start block from free */
372 {
373 	xfs_agblock_t	freeend;	/* end of freespace extent */
374 	xfs_agblock_t	newbno1;	/* return block number */
375 	xfs_agblock_t	newbno2;	/* other new block number */
376 	xfs_extlen_t	newlen1=0;	/* length with newbno1 */
377 	xfs_extlen_t	newlen2=0;	/* length with newbno2 */
378 	xfs_agblock_t	wantend;	/* end of target extent */
379 	bool		userdata = datatype & XFS_ALLOC_USERDATA;
380 
381 	ASSERT(freelen >= wantlen);
382 	freeend = freebno + freelen;
383 	wantend = wantbno + wantlen;
384 	/*
385 	 * We want to allocate from the start of a free extent if it is past
386 	 * the desired block or if we are allocating user data and the free
387 	 * extent is before desired block. The second case is there to allow
388 	 * for contiguous allocation from the remaining free space if the file
389 	 * grows in the short term.
390 	 */
391 	if (freebno >= wantbno || (userdata && freeend < wantend)) {
392 		if ((newbno1 = roundup(freebno, alignment)) >= freeend)
393 			newbno1 = NULLAGBLOCK;
394 	} else if (freeend >= wantend && alignment > 1) {
395 		newbno1 = roundup(wantbno, alignment);
396 		newbno2 = newbno1 - alignment;
397 		if (newbno1 >= freeend)
398 			newbno1 = NULLAGBLOCK;
399 		else
400 			newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
401 		if (newbno2 < freebno)
402 			newbno2 = NULLAGBLOCK;
403 		else
404 			newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
405 		if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
406 			if (newlen1 < newlen2 ||
407 			    (newlen1 == newlen2 &&
408 			     XFS_ABSDIFF(newbno1, wantbno) >
409 			     XFS_ABSDIFF(newbno2, wantbno)))
410 				newbno1 = newbno2;
411 		} else if (newbno2 != NULLAGBLOCK)
412 			newbno1 = newbno2;
413 	} else if (freeend >= wantend) {
414 		newbno1 = wantbno;
415 	} else if (alignment > 1) {
416 		newbno1 = roundup(freeend - wantlen, alignment);
417 		if (newbno1 > freeend - wantlen &&
418 		    newbno1 - alignment >= freebno)
419 			newbno1 -= alignment;
420 		else if (newbno1 >= freeend)
421 			newbno1 = NULLAGBLOCK;
422 	} else
423 		newbno1 = freeend - wantlen;
424 	*newbnop = newbno1;
425 	return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
426 }
427 
428 /*
429  * Fix up the length, based on mod and prod.
430  * len should be k * prod + mod for some k.
431  * If len is too small it is returned unchanged.
432  * If len hits maxlen it is left alone.
433  */
434 STATIC void
435 xfs_alloc_fix_len(
436 	xfs_alloc_arg_t	*args)		/* allocation argument structure */
437 {
438 	xfs_extlen_t	k;
439 	xfs_extlen_t	rlen;
440 
441 	ASSERT(args->mod < args->prod);
442 	rlen = args->len;
443 	ASSERT(rlen >= args->minlen);
444 	ASSERT(rlen <= args->maxlen);
445 	if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
446 	    (args->mod == 0 && rlen < args->prod))
447 		return;
448 	k = rlen % args->prod;
449 	if (k == args->mod)
450 		return;
451 	if (k > args->mod)
452 		rlen = rlen - (k - args->mod);
453 	else
454 		rlen = rlen - args->prod + (args->mod - k);
455 	/* casts to (int) catch length underflows */
456 	if ((int)rlen < (int)args->minlen)
457 		return;
458 	ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
459 	ASSERT(rlen % args->prod == args->mod);
460 	ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >=
461 		rlen + args->minleft);
462 	args->len = rlen;
463 }
464 
465 /*
466  * Update the two btrees, logically removing from freespace the extent
467  * starting at rbno, rlen blocks.  The extent is contained within the
468  * actual (current) free extent fbno for flen blocks.
469  * Flags are passed in indicating whether the cursors are set to the
470  * relevant records.
471  */
472 STATIC int				/* error code */
473 xfs_alloc_fixup_trees(
474 	struct xfs_btree_cur *cnt_cur,	/* cursor for by-size btree */
475 	struct xfs_btree_cur *bno_cur,	/* cursor for by-block btree */
476 	xfs_agblock_t	fbno,		/* starting block of free extent */
477 	xfs_extlen_t	flen,		/* length of free extent */
478 	xfs_agblock_t	rbno,		/* starting block of returned extent */
479 	xfs_extlen_t	rlen,		/* length of returned extent */
480 	int		flags)		/* flags, XFSA_FIXUP_... */
481 {
482 	int		error;		/* error code */
483 	int		i;		/* operation results */
484 	xfs_agblock_t	nfbno1;		/* first new free startblock */
485 	xfs_agblock_t	nfbno2;		/* second new free startblock */
486 	xfs_extlen_t	nflen1=0;	/* first new free length */
487 	xfs_extlen_t	nflen2=0;	/* second new free length */
488 	struct xfs_mount *mp;
489 
490 	mp = cnt_cur->bc_mp;
491 
492 	/*
493 	 * Look up the record in the by-size tree if necessary.
494 	 */
495 	if (flags & XFSA_FIXUP_CNT_OK) {
496 #ifdef DEBUG
497 		if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
498 			return error;
499 		if (XFS_IS_CORRUPT(mp,
500 				   i != 1 ||
501 				   nfbno1 != fbno ||
502 				   nflen1 != flen))
503 			return -EFSCORRUPTED;
504 #endif
505 	} else {
506 		if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
507 			return error;
508 		if (XFS_IS_CORRUPT(mp, i != 1))
509 			return -EFSCORRUPTED;
510 	}
511 	/*
512 	 * Look up the record in the by-block tree if necessary.
513 	 */
514 	if (flags & XFSA_FIXUP_BNO_OK) {
515 #ifdef DEBUG
516 		if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
517 			return error;
518 		if (XFS_IS_CORRUPT(mp,
519 				   i != 1 ||
520 				   nfbno1 != fbno ||
521 				   nflen1 != flen))
522 			return -EFSCORRUPTED;
523 #endif
524 	} else {
525 		if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
526 			return error;
527 		if (XFS_IS_CORRUPT(mp, i != 1))
528 			return -EFSCORRUPTED;
529 	}
530 
531 #ifdef DEBUG
532 	if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
533 		struct xfs_btree_block	*bnoblock;
534 		struct xfs_btree_block	*cntblock;
535 
536 		bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_levels[0].bp);
537 		cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_levels[0].bp);
538 
539 		if (XFS_IS_CORRUPT(mp,
540 				   bnoblock->bb_numrecs !=
541 				   cntblock->bb_numrecs))
542 			return -EFSCORRUPTED;
543 	}
544 #endif
545 
546 	/*
547 	 * Deal with all four cases: the allocated record is contained
548 	 * within the freespace record, so we can have new freespace
549 	 * at either (or both) end, or no freespace remaining.
550 	 */
551 	if (rbno == fbno && rlen == flen)
552 		nfbno1 = nfbno2 = NULLAGBLOCK;
553 	else if (rbno == fbno) {
554 		nfbno1 = rbno + rlen;
555 		nflen1 = flen - rlen;
556 		nfbno2 = NULLAGBLOCK;
557 	} else if (rbno + rlen == fbno + flen) {
558 		nfbno1 = fbno;
559 		nflen1 = flen - rlen;
560 		nfbno2 = NULLAGBLOCK;
561 	} else {
562 		nfbno1 = fbno;
563 		nflen1 = rbno - fbno;
564 		nfbno2 = rbno + rlen;
565 		nflen2 = (fbno + flen) - nfbno2;
566 	}
567 	/*
568 	 * Delete the entry from the by-size btree.
569 	 */
570 	if ((error = xfs_btree_delete(cnt_cur, &i)))
571 		return error;
572 	if (XFS_IS_CORRUPT(mp, i != 1))
573 		return -EFSCORRUPTED;
574 	/*
575 	 * Add new by-size btree entry(s).
576 	 */
577 	if (nfbno1 != NULLAGBLOCK) {
578 		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
579 			return error;
580 		if (XFS_IS_CORRUPT(mp, i != 0))
581 			return -EFSCORRUPTED;
582 		if ((error = xfs_btree_insert(cnt_cur, &i)))
583 			return error;
584 		if (XFS_IS_CORRUPT(mp, i != 1))
585 			return -EFSCORRUPTED;
586 	}
587 	if (nfbno2 != NULLAGBLOCK) {
588 		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
589 			return error;
590 		if (XFS_IS_CORRUPT(mp, i != 0))
591 			return -EFSCORRUPTED;
592 		if ((error = xfs_btree_insert(cnt_cur, &i)))
593 			return error;
594 		if (XFS_IS_CORRUPT(mp, i != 1))
595 			return -EFSCORRUPTED;
596 	}
597 	/*
598 	 * Fix up the by-block btree entry(s).
599 	 */
600 	if (nfbno1 == NULLAGBLOCK) {
601 		/*
602 		 * No remaining freespace, just delete the by-block tree entry.
603 		 */
604 		if ((error = xfs_btree_delete(bno_cur, &i)))
605 			return error;
606 		if (XFS_IS_CORRUPT(mp, i != 1))
607 			return -EFSCORRUPTED;
608 	} else {
609 		/*
610 		 * Update the by-block entry to start later|be shorter.
611 		 */
612 		if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
613 			return error;
614 	}
615 	if (nfbno2 != NULLAGBLOCK) {
616 		/*
617 		 * 2 resulting free entries, need to add one.
618 		 */
619 		if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
620 			return error;
621 		if (XFS_IS_CORRUPT(mp, i != 0))
622 			return -EFSCORRUPTED;
623 		if ((error = xfs_btree_insert(bno_cur, &i)))
624 			return error;
625 		if (XFS_IS_CORRUPT(mp, i != 1))
626 			return -EFSCORRUPTED;
627 	}
628 	return 0;
629 }
630 
631 static xfs_failaddr_t
632 xfs_agfl_verify(
633 	struct xfs_buf	*bp)
634 {
635 	struct xfs_mount *mp = bp->b_mount;
636 	struct xfs_agfl	*agfl = XFS_BUF_TO_AGFL(bp);
637 	__be32		*agfl_bno = xfs_buf_to_agfl_bno(bp);
638 	int		i;
639 
640 	/*
641 	 * There is no verification of non-crc AGFLs because mkfs does not
642 	 * initialise the AGFL to zero or NULL. Hence the only valid part of the
643 	 * AGFL is what the AGF says is active. We can't get to the AGF, so we
644 	 * can't verify just those entries are valid.
645 	 */
646 	if (!xfs_has_crc(mp))
647 		return NULL;
648 
649 	if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
650 		return __this_address;
651 	if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
652 		return __this_address;
653 	/*
654 	 * during growfs operations, the perag is not fully initialised,
655 	 * so we can't use it for any useful checking. growfs ensures we can't
656 	 * use it by using uncached buffers that don't have the perag attached
657 	 * so we can detect and avoid this problem.
658 	 */
659 	if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
660 		return __this_address;
661 
662 	for (i = 0; i < xfs_agfl_size(mp); i++) {
663 		if (be32_to_cpu(agfl_bno[i]) != NULLAGBLOCK &&
664 		    be32_to_cpu(agfl_bno[i]) >= mp->m_sb.sb_agblocks)
665 			return __this_address;
666 	}
667 
668 	if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn)))
669 		return __this_address;
670 	return NULL;
671 }
672 
673 static void
674 xfs_agfl_read_verify(
675 	struct xfs_buf	*bp)
676 {
677 	struct xfs_mount *mp = bp->b_mount;
678 	xfs_failaddr_t	fa;
679 
680 	/*
681 	 * There is no verification of non-crc AGFLs because mkfs does not
682 	 * initialise the AGFL to zero or NULL. Hence the only valid part of the
683 	 * AGFL is what the AGF says is active. We can't get to the AGF, so we
684 	 * can't verify just those entries are valid.
685 	 */
686 	if (!xfs_has_crc(mp))
687 		return;
688 
689 	if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
690 		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
691 	else {
692 		fa = xfs_agfl_verify(bp);
693 		if (fa)
694 			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
695 	}
696 }
697 
698 static void
699 xfs_agfl_write_verify(
700 	struct xfs_buf	*bp)
701 {
702 	struct xfs_mount	*mp = bp->b_mount;
703 	struct xfs_buf_log_item	*bip = bp->b_log_item;
704 	xfs_failaddr_t		fa;
705 
706 	/* no verification of non-crc AGFLs */
707 	if (!xfs_has_crc(mp))
708 		return;
709 
710 	fa = xfs_agfl_verify(bp);
711 	if (fa) {
712 		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
713 		return;
714 	}
715 
716 	if (bip)
717 		XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
718 
719 	xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
720 }
721 
722 const struct xfs_buf_ops xfs_agfl_buf_ops = {
723 	.name = "xfs_agfl",
724 	.magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) },
725 	.verify_read = xfs_agfl_read_verify,
726 	.verify_write = xfs_agfl_write_verify,
727 	.verify_struct = xfs_agfl_verify,
728 };
729 
730 /*
731  * Read in the allocation group free block array.
732  */
733 int
734 xfs_alloc_read_agfl(
735 	struct xfs_perag	*pag,
736 	struct xfs_trans	*tp,
737 	struct xfs_buf		**bpp)
738 {
739 	struct xfs_mount	*mp = pag->pag_mount;
740 	struct xfs_buf		*bp;
741 	int			error;
742 
743 	error = xfs_trans_read_buf(
744 			mp, tp, mp->m_ddev_targp,
745 			XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)),
746 			XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
747 	if (error)
748 		return error;
749 	xfs_buf_set_ref(bp, XFS_AGFL_REF);
750 	*bpp = bp;
751 	return 0;
752 }
753 
754 STATIC int
755 xfs_alloc_update_counters(
756 	struct xfs_trans	*tp,
757 	struct xfs_buf		*agbp,
758 	long			len)
759 {
760 	struct xfs_agf		*agf = agbp->b_addr;
761 
762 	agbp->b_pag->pagf_freeblks += len;
763 	be32_add_cpu(&agf->agf_freeblks, len);
764 
765 	if (unlikely(be32_to_cpu(agf->agf_freeblks) >
766 		     be32_to_cpu(agf->agf_length))) {
767 		xfs_buf_mark_corrupt(agbp);
768 		return -EFSCORRUPTED;
769 	}
770 
771 	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
772 	return 0;
773 }
774 
775 /*
776  * Block allocation algorithm and data structures.
777  */
778 struct xfs_alloc_cur {
779 	struct xfs_btree_cur		*cnt;	/* btree cursors */
780 	struct xfs_btree_cur		*bnolt;
781 	struct xfs_btree_cur		*bnogt;
782 	xfs_extlen_t			cur_len;/* current search length */
783 	xfs_agblock_t			rec_bno;/* extent startblock */
784 	xfs_extlen_t			rec_len;/* extent length */
785 	xfs_agblock_t			bno;	/* alloc bno */
786 	xfs_extlen_t			len;	/* alloc len */
787 	xfs_extlen_t			diff;	/* diff from search bno */
788 	unsigned int			busy_gen;/* busy state */
789 	bool				busy;
790 };
791 
792 /*
793  * Set up cursors, etc. in the extent allocation cursor. This function can be
794  * called multiple times to reset an initialized structure without having to
795  * reallocate cursors.
796  */
797 static int
798 xfs_alloc_cur_setup(
799 	struct xfs_alloc_arg	*args,
800 	struct xfs_alloc_cur	*acur)
801 {
802 	int			error;
803 	int			i;
804 
805 	acur->cur_len = args->maxlen;
806 	acur->rec_bno = 0;
807 	acur->rec_len = 0;
808 	acur->bno = 0;
809 	acur->len = 0;
810 	acur->diff = -1;
811 	acur->busy = false;
812 	acur->busy_gen = 0;
813 
814 	/*
815 	 * Perform an initial cntbt lookup to check for availability of maxlen
816 	 * extents. If this fails, we'll return -ENOSPC to signal the caller to
817 	 * attempt a small allocation.
818 	 */
819 	if (!acur->cnt)
820 		acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp,
821 					args->agbp, args->pag, XFS_BTNUM_CNT);
822 	error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i);
823 	if (error)
824 		return error;
825 
826 	/*
827 	 * Allocate the bnobt left and right search cursors.
828 	 */
829 	if (!acur->bnolt)
830 		acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp,
831 					args->agbp, args->pag, XFS_BTNUM_BNO);
832 	if (!acur->bnogt)
833 		acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp,
834 					args->agbp, args->pag, XFS_BTNUM_BNO);
835 	return i == 1 ? 0 : -ENOSPC;
836 }
837 
838 static void
839 xfs_alloc_cur_close(
840 	struct xfs_alloc_cur	*acur,
841 	bool			error)
842 {
843 	int			cur_error = XFS_BTREE_NOERROR;
844 
845 	if (error)
846 		cur_error = XFS_BTREE_ERROR;
847 
848 	if (acur->cnt)
849 		xfs_btree_del_cursor(acur->cnt, cur_error);
850 	if (acur->bnolt)
851 		xfs_btree_del_cursor(acur->bnolt, cur_error);
852 	if (acur->bnogt)
853 		xfs_btree_del_cursor(acur->bnogt, cur_error);
854 	acur->cnt = acur->bnolt = acur->bnogt = NULL;
855 }
856 
857 /*
858  * Check an extent for allocation and track the best available candidate in the
859  * allocation structure. The cursor is deactivated if it has entered an out of
860  * range state based on allocation arguments. Optionally return the extent
861  * extent geometry and allocation status if requested by the caller.
862  */
863 static int
864 xfs_alloc_cur_check(
865 	struct xfs_alloc_arg	*args,
866 	struct xfs_alloc_cur	*acur,
867 	struct xfs_btree_cur	*cur,
868 	int			*new)
869 {
870 	int			error, i;
871 	xfs_agblock_t		bno, bnoa, bnew;
872 	xfs_extlen_t		len, lena, diff = -1;
873 	bool			busy;
874 	unsigned		busy_gen = 0;
875 	bool			deactivate = false;
876 	bool			isbnobt = cur->bc_btnum == XFS_BTNUM_BNO;
877 
878 	*new = 0;
879 
880 	error = xfs_alloc_get_rec(cur, &bno, &len, &i);
881 	if (error)
882 		return error;
883 	if (XFS_IS_CORRUPT(args->mp, i != 1))
884 		return -EFSCORRUPTED;
885 
886 	/*
887 	 * Check minlen and deactivate a cntbt cursor if out of acceptable size
888 	 * range (i.e., walking backwards looking for a minlen extent).
889 	 */
890 	if (len < args->minlen) {
891 		deactivate = !isbnobt;
892 		goto out;
893 	}
894 
895 	busy = xfs_alloc_compute_aligned(args, bno, len, &bnoa, &lena,
896 					 &busy_gen);
897 	acur->busy |= busy;
898 	if (busy)
899 		acur->busy_gen = busy_gen;
900 	/* deactivate a bnobt cursor outside of locality range */
901 	if (bnoa < args->min_agbno || bnoa > args->max_agbno) {
902 		deactivate = isbnobt;
903 		goto out;
904 	}
905 	if (lena < args->minlen)
906 		goto out;
907 
908 	args->len = XFS_EXTLEN_MIN(lena, args->maxlen);
909 	xfs_alloc_fix_len(args);
910 	ASSERT(args->len >= args->minlen);
911 	if (args->len < acur->len)
912 		goto out;
913 
914 	/*
915 	 * We have an aligned record that satisfies minlen and beats or matches
916 	 * the candidate extent size. Compare locality for near allocation mode.
917 	 */
918 	diff = xfs_alloc_compute_diff(args->agbno, args->len,
919 				      args->alignment, args->datatype,
920 				      bnoa, lena, &bnew);
921 	if (bnew == NULLAGBLOCK)
922 		goto out;
923 
924 	/*
925 	 * Deactivate a bnobt cursor with worse locality than the current best.
926 	 */
927 	if (diff > acur->diff) {
928 		deactivate = isbnobt;
929 		goto out;
930 	}
931 
932 	ASSERT(args->len > acur->len ||
933 	       (args->len == acur->len && diff <= acur->diff));
934 	acur->rec_bno = bno;
935 	acur->rec_len = len;
936 	acur->bno = bnew;
937 	acur->len = args->len;
938 	acur->diff = diff;
939 	*new = 1;
940 
941 	/*
942 	 * We're done if we found a perfect allocation. This only deactivates
943 	 * the current cursor, but this is just an optimization to terminate a
944 	 * cntbt search that otherwise runs to the edge of the tree.
945 	 */
946 	if (acur->diff == 0 && acur->len == args->maxlen)
947 		deactivate = true;
948 out:
949 	if (deactivate)
950 		cur->bc_ag.abt.active = false;
951 	trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff,
952 				  *new);
953 	return 0;
954 }
955 
956 /*
957  * Complete an allocation of a candidate extent. Remove the extent from both
958  * trees and update the args structure.
959  */
960 STATIC int
961 xfs_alloc_cur_finish(
962 	struct xfs_alloc_arg	*args,
963 	struct xfs_alloc_cur	*acur)
964 {
965 	struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
966 	int			error;
967 
968 	ASSERT(acur->cnt && acur->bnolt);
969 	ASSERT(acur->bno >= acur->rec_bno);
970 	ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len);
971 	ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length));
972 
973 	error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno,
974 				      acur->rec_len, acur->bno, acur->len, 0);
975 	if (error)
976 		return error;
977 
978 	args->agbno = acur->bno;
979 	args->len = acur->len;
980 	args->wasfromfl = 0;
981 
982 	trace_xfs_alloc_cur(args);
983 	return 0;
984 }
985 
986 /*
987  * Locality allocation lookup algorithm. This expects a cntbt cursor and uses
988  * bno optimized lookup to search for extents with ideal size and locality.
989  */
990 STATIC int
991 xfs_alloc_cntbt_iter(
992 	struct xfs_alloc_arg		*args,
993 	struct xfs_alloc_cur		*acur)
994 {
995 	struct xfs_btree_cur	*cur = acur->cnt;
996 	xfs_agblock_t		bno;
997 	xfs_extlen_t		len, cur_len;
998 	int			error;
999 	int			i;
1000 
1001 	if (!xfs_alloc_cur_active(cur))
1002 		return 0;
1003 
1004 	/* locality optimized lookup */
1005 	cur_len = acur->cur_len;
1006 	error = xfs_alloc_lookup_ge(cur, args->agbno, cur_len, &i);
1007 	if (error)
1008 		return error;
1009 	if (i == 0)
1010 		return 0;
1011 	error = xfs_alloc_get_rec(cur, &bno, &len, &i);
1012 	if (error)
1013 		return error;
1014 
1015 	/* check the current record and update search length from it */
1016 	error = xfs_alloc_cur_check(args, acur, cur, &i);
1017 	if (error)
1018 		return error;
1019 	ASSERT(len >= acur->cur_len);
1020 	acur->cur_len = len;
1021 
1022 	/*
1023 	 * We looked up the first record >= [agbno, len] above. The agbno is a
1024 	 * secondary key and so the current record may lie just before or after
1025 	 * agbno. If it is past agbno, check the previous record too so long as
1026 	 * the length matches as it may be closer. Don't check a smaller record
1027 	 * because that could deactivate our cursor.
1028 	 */
1029 	if (bno > args->agbno) {
1030 		error = xfs_btree_decrement(cur, 0, &i);
1031 		if (!error && i) {
1032 			error = xfs_alloc_get_rec(cur, &bno, &len, &i);
1033 			if (!error && i && len == acur->cur_len)
1034 				error = xfs_alloc_cur_check(args, acur, cur,
1035 							    &i);
1036 		}
1037 		if (error)
1038 			return error;
1039 	}
1040 
1041 	/*
1042 	 * Increment the search key until we find at least one allocation
1043 	 * candidate or if the extent we found was larger. Otherwise, double the
1044 	 * search key to optimize the search. Efficiency is more important here
1045 	 * than absolute best locality.
1046 	 */
1047 	cur_len <<= 1;
1048 	if (!acur->len || acur->cur_len >= cur_len)
1049 		acur->cur_len++;
1050 	else
1051 		acur->cur_len = cur_len;
1052 
1053 	return error;
1054 }
1055 
1056 /*
1057  * Deal with the case where only small freespaces remain. Either return the
1058  * contents of the last freespace record, or allocate space from the freelist if
1059  * there is nothing in the tree.
1060  */
1061 STATIC int			/* error */
1062 xfs_alloc_ag_vextent_small(
1063 	struct xfs_alloc_arg	*args,	/* allocation argument structure */
1064 	struct xfs_btree_cur	*ccur,	/* optional by-size cursor */
1065 	xfs_agblock_t		*fbnop,	/* result block number */
1066 	xfs_extlen_t		*flenp,	/* result length */
1067 	int			*stat)	/* status: 0-freelist, 1-normal/none */
1068 {
1069 	struct xfs_agf		*agf = args->agbp->b_addr;
1070 	int			error = 0;
1071 	xfs_agblock_t		fbno = NULLAGBLOCK;
1072 	xfs_extlen_t		flen = 0;
1073 	int			i = 0;
1074 
1075 	/*
1076 	 * If a cntbt cursor is provided, try to allocate the largest record in
1077 	 * the tree. Try the AGFL if the cntbt is empty, otherwise fail the
1078 	 * allocation. Make sure to respect minleft even when pulling from the
1079 	 * freelist.
1080 	 */
1081 	if (ccur)
1082 		error = xfs_btree_decrement(ccur, 0, &i);
1083 	if (error)
1084 		goto error;
1085 	if (i) {
1086 		error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
1087 		if (error)
1088 			goto error;
1089 		if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1090 			error = -EFSCORRUPTED;
1091 			goto error;
1092 		}
1093 		goto out;
1094 	}
1095 
1096 	if (args->minlen != 1 || args->alignment != 1 ||
1097 	    args->resv == XFS_AG_RESV_AGFL ||
1098 	    be32_to_cpu(agf->agf_flcount) <= args->minleft)
1099 		goto out;
1100 
1101 	error = xfs_alloc_get_freelist(args->pag, args->tp, args->agbp,
1102 			&fbno, 0);
1103 	if (error)
1104 		goto error;
1105 	if (fbno == NULLAGBLOCK)
1106 		goto out;
1107 
1108 	xfs_extent_busy_reuse(args->mp, args->pag, fbno, 1,
1109 			      (args->datatype & XFS_ALLOC_NOBUSY));
1110 
1111 	if (args->datatype & XFS_ALLOC_USERDATA) {
1112 		struct xfs_buf	*bp;
1113 
1114 		error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp,
1115 				XFS_AGB_TO_DADDR(args->mp, args->agno, fbno),
1116 				args->mp->m_bsize, 0, &bp);
1117 		if (error)
1118 			goto error;
1119 		xfs_trans_binval(args->tp, bp);
1120 	}
1121 	*fbnop = args->agbno = fbno;
1122 	*flenp = args->len = 1;
1123 	if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) {
1124 		error = -EFSCORRUPTED;
1125 		goto error;
1126 	}
1127 	args->wasfromfl = 1;
1128 	trace_xfs_alloc_small_freelist(args);
1129 
1130 	/*
1131 	 * If we're feeding an AGFL block to something that doesn't live in the
1132 	 * free space, we need to clear out the OWN_AG rmap.
1133 	 */
1134 	error = xfs_rmap_free(args->tp, args->agbp, args->pag, fbno, 1,
1135 			      &XFS_RMAP_OINFO_AG);
1136 	if (error)
1137 		goto error;
1138 
1139 	*stat = 0;
1140 	return 0;
1141 
1142 out:
1143 	/*
1144 	 * Can't do the allocation, give up.
1145 	 */
1146 	if (flen < args->minlen) {
1147 		args->agbno = NULLAGBLOCK;
1148 		trace_xfs_alloc_small_notenough(args);
1149 		flen = 0;
1150 	}
1151 	*fbnop = fbno;
1152 	*flenp = flen;
1153 	*stat = 1;
1154 	trace_xfs_alloc_small_done(args);
1155 	return 0;
1156 
1157 error:
1158 	trace_xfs_alloc_small_error(args);
1159 	return error;
1160 }
1161 
1162 /*
1163  * Allocate a variable extent at exactly agno/bno.
1164  * Extent's length (returned in *len) will be between minlen and maxlen,
1165  * and of the form k * prod + mod unless there's nothing that large.
1166  * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
1167  */
1168 STATIC int			/* error */
1169 xfs_alloc_ag_vextent_exact(
1170 	xfs_alloc_arg_t	*args)	/* allocation argument structure */
1171 {
1172 	struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
1173 	struct xfs_btree_cur *bno_cur;/* by block-number btree cursor */
1174 	struct xfs_btree_cur *cnt_cur;/* by count btree cursor */
1175 	int		error;
1176 	xfs_agblock_t	fbno;	/* start block of found extent */
1177 	xfs_extlen_t	flen;	/* length of found extent */
1178 	xfs_agblock_t	tbno;	/* start block of busy extent */
1179 	xfs_extlen_t	tlen;	/* length of busy extent */
1180 	xfs_agblock_t	tend;	/* end block of busy extent */
1181 	int		i;	/* success/failure of operation */
1182 	unsigned	busy_gen;
1183 
1184 	ASSERT(args->alignment == 1);
1185 
1186 	/*
1187 	 * Allocate/initialize a cursor for the by-number freespace btree.
1188 	 */
1189 	bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1190 					  args->pag, XFS_BTNUM_BNO);
1191 
1192 	/*
1193 	 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
1194 	 * Look for the closest free block <= bno, it must contain bno
1195 	 * if any free block does.
1196 	 */
1197 	error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
1198 	if (error)
1199 		goto error0;
1200 	if (!i)
1201 		goto not_found;
1202 
1203 	/*
1204 	 * Grab the freespace record.
1205 	 */
1206 	error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
1207 	if (error)
1208 		goto error0;
1209 	if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1210 		error = -EFSCORRUPTED;
1211 		goto error0;
1212 	}
1213 	ASSERT(fbno <= args->agbno);
1214 
1215 	/*
1216 	 * Check for overlapping busy extents.
1217 	 */
1218 	tbno = fbno;
1219 	tlen = flen;
1220 	xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen);
1221 
1222 	/*
1223 	 * Give up if the start of the extent is busy, or the freespace isn't
1224 	 * long enough for the minimum request.
1225 	 */
1226 	if (tbno > args->agbno)
1227 		goto not_found;
1228 	if (tlen < args->minlen)
1229 		goto not_found;
1230 	tend = tbno + tlen;
1231 	if (tend < args->agbno + args->minlen)
1232 		goto not_found;
1233 
1234 	/*
1235 	 * End of extent will be smaller of the freespace end and the
1236 	 * maximal requested end.
1237 	 *
1238 	 * Fix the length according to mod and prod if given.
1239 	 */
1240 	args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
1241 						- args->agbno;
1242 	xfs_alloc_fix_len(args);
1243 	ASSERT(args->agbno + args->len <= tend);
1244 
1245 	/*
1246 	 * We are allocating agbno for args->len
1247 	 * Allocate/initialize a cursor for the by-size btree.
1248 	 */
1249 	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1250 					args->pag, XFS_BTNUM_CNT);
1251 	ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length));
1252 	error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
1253 				      args->len, XFSA_FIXUP_BNO_OK);
1254 	if (error) {
1255 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1256 		goto error0;
1257 	}
1258 
1259 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1260 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1261 
1262 	args->wasfromfl = 0;
1263 	trace_xfs_alloc_exact_done(args);
1264 	return 0;
1265 
1266 not_found:
1267 	/* Didn't find it, return null. */
1268 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1269 	args->agbno = NULLAGBLOCK;
1270 	trace_xfs_alloc_exact_notfound(args);
1271 	return 0;
1272 
1273 error0:
1274 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1275 	trace_xfs_alloc_exact_error(args);
1276 	return error;
1277 }
1278 
1279 /*
1280  * Search a given number of btree records in a given direction. Check each
1281  * record against the good extent we've already found.
1282  */
1283 STATIC int
1284 xfs_alloc_walk_iter(
1285 	struct xfs_alloc_arg	*args,
1286 	struct xfs_alloc_cur	*acur,
1287 	struct xfs_btree_cur	*cur,
1288 	bool			increment,
1289 	bool			find_one, /* quit on first candidate */
1290 	int			count,    /* rec count (-1 for infinite) */
1291 	int			*stat)
1292 {
1293 	int			error;
1294 	int			i;
1295 
1296 	*stat = 0;
1297 
1298 	/*
1299 	 * Search so long as the cursor is active or we find a better extent.
1300 	 * The cursor is deactivated if it extends beyond the range of the
1301 	 * current allocation candidate.
1302 	 */
1303 	while (xfs_alloc_cur_active(cur) && count) {
1304 		error = xfs_alloc_cur_check(args, acur, cur, &i);
1305 		if (error)
1306 			return error;
1307 		if (i == 1) {
1308 			*stat = 1;
1309 			if (find_one)
1310 				break;
1311 		}
1312 		if (!xfs_alloc_cur_active(cur))
1313 			break;
1314 
1315 		if (increment)
1316 			error = xfs_btree_increment(cur, 0, &i);
1317 		else
1318 			error = xfs_btree_decrement(cur, 0, &i);
1319 		if (error)
1320 			return error;
1321 		if (i == 0)
1322 			cur->bc_ag.abt.active = false;
1323 
1324 		if (count > 0)
1325 			count--;
1326 	}
1327 
1328 	return 0;
1329 }
1330 
1331 /*
1332  * Search the by-bno and by-size btrees in parallel in search of an extent with
1333  * ideal locality based on the NEAR mode ->agbno locality hint.
1334  */
1335 STATIC int
1336 xfs_alloc_ag_vextent_locality(
1337 	struct xfs_alloc_arg	*args,
1338 	struct xfs_alloc_cur	*acur,
1339 	int			*stat)
1340 {
1341 	struct xfs_btree_cur	*fbcur = NULL;
1342 	int			error;
1343 	int			i;
1344 	bool			fbinc;
1345 
1346 	ASSERT(acur->len == 0);
1347 
1348 	*stat = 0;
1349 
1350 	error = xfs_alloc_lookup_ge(acur->cnt, args->agbno, acur->cur_len, &i);
1351 	if (error)
1352 		return error;
1353 	error = xfs_alloc_lookup_le(acur->bnolt, args->agbno, 0, &i);
1354 	if (error)
1355 		return error;
1356 	error = xfs_alloc_lookup_ge(acur->bnogt, args->agbno, 0, &i);
1357 	if (error)
1358 		return error;
1359 
1360 	/*
1361 	 * Search the bnobt and cntbt in parallel. Search the bnobt left and
1362 	 * right and lookup the closest extent to the locality hint for each
1363 	 * extent size key in the cntbt. The entire search terminates
1364 	 * immediately on a bnobt hit because that means we've found best case
1365 	 * locality. Otherwise the search continues until the cntbt cursor runs
1366 	 * off the end of the tree. If no allocation candidate is found at this
1367 	 * point, give up on locality, walk backwards from the end of the cntbt
1368 	 * and take the first available extent.
1369 	 *
1370 	 * The parallel tree searches balance each other out to provide fairly
1371 	 * consistent performance for various situations. The bnobt search can
1372 	 * have pathological behavior in the worst case scenario of larger
1373 	 * allocation requests and fragmented free space. On the other hand, the
1374 	 * bnobt is able to satisfy most smaller allocation requests much more
1375 	 * quickly than the cntbt. The cntbt search can sift through fragmented
1376 	 * free space and sets of free extents for larger allocation requests
1377 	 * more quickly than the bnobt. Since the locality hint is just a hint
1378 	 * and we don't want to scan the entire bnobt for perfect locality, the
1379 	 * cntbt search essentially bounds the bnobt search such that we can
1380 	 * find good enough locality at reasonable performance in most cases.
1381 	 */
1382 	while (xfs_alloc_cur_active(acur->bnolt) ||
1383 	       xfs_alloc_cur_active(acur->bnogt) ||
1384 	       xfs_alloc_cur_active(acur->cnt)) {
1385 
1386 		trace_xfs_alloc_cur_lookup(args);
1387 
1388 		/*
1389 		 * Search the bnobt left and right. In the case of a hit, finish
1390 		 * the search in the opposite direction and we're done.
1391 		 */
1392 		error = xfs_alloc_walk_iter(args, acur, acur->bnolt, false,
1393 					    true, 1, &i);
1394 		if (error)
1395 			return error;
1396 		if (i == 1) {
1397 			trace_xfs_alloc_cur_left(args);
1398 			fbcur = acur->bnogt;
1399 			fbinc = true;
1400 			break;
1401 		}
1402 		error = xfs_alloc_walk_iter(args, acur, acur->bnogt, true, true,
1403 					    1, &i);
1404 		if (error)
1405 			return error;
1406 		if (i == 1) {
1407 			trace_xfs_alloc_cur_right(args);
1408 			fbcur = acur->bnolt;
1409 			fbinc = false;
1410 			break;
1411 		}
1412 
1413 		/*
1414 		 * Check the extent with best locality based on the current
1415 		 * extent size search key and keep track of the best candidate.
1416 		 */
1417 		error = xfs_alloc_cntbt_iter(args, acur);
1418 		if (error)
1419 			return error;
1420 		if (!xfs_alloc_cur_active(acur->cnt)) {
1421 			trace_xfs_alloc_cur_lookup_done(args);
1422 			break;
1423 		}
1424 	}
1425 
1426 	/*
1427 	 * If we failed to find anything due to busy extents, return empty
1428 	 * handed so the caller can flush and retry. If no busy extents were
1429 	 * found, walk backwards from the end of the cntbt as a last resort.
1430 	 */
1431 	if (!xfs_alloc_cur_active(acur->cnt) && !acur->len && !acur->busy) {
1432 		error = xfs_btree_decrement(acur->cnt, 0, &i);
1433 		if (error)
1434 			return error;
1435 		if (i) {
1436 			acur->cnt->bc_ag.abt.active = true;
1437 			fbcur = acur->cnt;
1438 			fbinc = false;
1439 		}
1440 	}
1441 
1442 	/*
1443 	 * Search in the opposite direction for a better entry in the case of
1444 	 * a bnobt hit or walk backwards from the end of the cntbt.
1445 	 */
1446 	if (fbcur) {
1447 		error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc, true, -1,
1448 					    &i);
1449 		if (error)
1450 			return error;
1451 	}
1452 
1453 	if (acur->len)
1454 		*stat = 1;
1455 
1456 	return 0;
1457 }
1458 
1459 /* Check the last block of the cnt btree for allocations. */
1460 static int
1461 xfs_alloc_ag_vextent_lastblock(
1462 	struct xfs_alloc_arg	*args,
1463 	struct xfs_alloc_cur	*acur,
1464 	xfs_agblock_t		*bno,
1465 	xfs_extlen_t		*len,
1466 	bool			*allocated)
1467 {
1468 	int			error;
1469 	int			i;
1470 
1471 #ifdef DEBUG
1472 	/* Randomly don't execute the first algorithm. */
1473 	if (get_random_u32_below(2))
1474 		return 0;
1475 #endif
1476 
1477 	/*
1478 	 * Start from the entry that lookup found, sequence through all larger
1479 	 * free blocks.  If we're actually pointing at a record smaller than
1480 	 * maxlen, go to the start of this block, and skip all those smaller
1481 	 * than minlen.
1482 	 */
1483 	if (*len || args->alignment > 1) {
1484 		acur->cnt->bc_levels[0].ptr = 1;
1485 		do {
1486 			error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
1487 			if (error)
1488 				return error;
1489 			if (XFS_IS_CORRUPT(args->mp, i != 1))
1490 				return -EFSCORRUPTED;
1491 			if (*len >= args->minlen)
1492 				break;
1493 			error = xfs_btree_increment(acur->cnt, 0, &i);
1494 			if (error)
1495 				return error;
1496 		} while (i);
1497 		ASSERT(*len >= args->minlen);
1498 		if (!i)
1499 			return 0;
1500 	}
1501 
1502 	error = xfs_alloc_walk_iter(args, acur, acur->cnt, true, false, -1, &i);
1503 	if (error)
1504 		return error;
1505 
1506 	/*
1507 	 * It didn't work.  We COULD be in a case where there's a good record
1508 	 * somewhere, so try again.
1509 	 */
1510 	if (acur->len == 0)
1511 		return 0;
1512 
1513 	trace_xfs_alloc_near_first(args);
1514 	*allocated = true;
1515 	return 0;
1516 }
1517 
1518 /*
1519  * Allocate a variable extent near bno in the allocation group agno.
1520  * Extent's length (returned in len) will be between minlen and maxlen,
1521  * and of the form k * prod + mod unless there's nothing that large.
1522  * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1523  */
1524 STATIC int
1525 xfs_alloc_ag_vextent_near(
1526 	struct xfs_alloc_arg	*args)
1527 {
1528 	struct xfs_alloc_cur	acur = {};
1529 	int			error;		/* error code */
1530 	int			i;		/* result code, temporary */
1531 	xfs_agblock_t		bno;
1532 	xfs_extlen_t		len;
1533 
1534 	/* handle uninitialized agbno range so caller doesn't have to */
1535 	if (!args->min_agbno && !args->max_agbno)
1536 		args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
1537 	ASSERT(args->min_agbno <= args->max_agbno);
1538 
1539 	/* clamp agbno to the range if it's outside */
1540 	if (args->agbno < args->min_agbno)
1541 		args->agbno = args->min_agbno;
1542 	if (args->agbno > args->max_agbno)
1543 		args->agbno = args->max_agbno;
1544 
1545 restart:
1546 	len = 0;
1547 
1548 	/*
1549 	 * Set up cursors and see if there are any free extents as big as
1550 	 * maxlen. If not, pick the last entry in the tree unless the tree is
1551 	 * empty.
1552 	 */
1553 	error = xfs_alloc_cur_setup(args, &acur);
1554 	if (error == -ENOSPC) {
1555 		error = xfs_alloc_ag_vextent_small(args, acur.cnt, &bno,
1556 				&len, &i);
1557 		if (error)
1558 			goto out;
1559 		if (i == 0 || len == 0) {
1560 			trace_xfs_alloc_near_noentry(args);
1561 			goto out;
1562 		}
1563 		ASSERT(i == 1);
1564 	} else if (error) {
1565 		goto out;
1566 	}
1567 
1568 	/*
1569 	 * First algorithm.
1570 	 * If the requested extent is large wrt the freespaces available
1571 	 * in this a.g., then the cursor will be pointing to a btree entry
1572 	 * near the right edge of the tree.  If it's in the last btree leaf
1573 	 * block, then we just examine all the entries in that block
1574 	 * that are big enough, and pick the best one.
1575 	 */
1576 	if (xfs_btree_islastblock(acur.cnt, 0)) {
1577 		bool		allocated = false;
1578 
1579 		error = xfs_alloc_ag_vextent_lastblock(args, &acur, &bno, &len,
1580 				&allocated);
1581 		if (error)
1582 			goto out;
1583 		if (allocated)
1584 			goto alloc_finish;
1585 	}
1586 
1587 	/*
1588 	 * Second algorithm. Combined cntbt and bnobt search to find ideal
1589 	 * locality.
1590 	 */
1591 	error = xfs_alloc_ag_vextent_locality(args, &acur, &i);
1592 	if (error)
1593 		goto out;
1594 
1595 	/*
1596 	 * If we couldn't get anything, give up.
1597 	 */
1598 	if (!acur.len) {
1599 		if (acur.busy) {
1600 			trace_xfs_alloc_near_busy(args);
1601 			xfs_extent_busy_flush(args->mp, args->pag,
1602 					      acur.busy_gen);
1603 			goto restart;
1604 		}
1605 		trace_xfs_alloc_size_neither(args);
1606 		args->agbno = NULLAGBLOCK;
1607 		goto out;
1608 	}
1609 
1610 alloc_finish:
1611 	/* fix up btrees on a successful allocation */
1612 	error = xfs_alloc_cur_finish(args, &acur);
1613 
1614 out:
1615 	xfs_alloc_cur_close(&acur, error);
1616 	return error;
1617 }
1618 
1619 /*
1620  * Allocate a variable extent anywhere in the allocation group agno.
1621  * Extent's length (returned in len) will be between minlen and maxlen,
1622  * and of the form k * prod + mod unless there's nothing that large.
1623  * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1624  */
1625 STATIC int				/* error */
1626 xfs_alloc_ag_vextent_size(
1627 	xfs_alloc_arg_t	*args)		/* allocation argument structure */
1628 {
1629 	struct xfs_agf	*agf = args->agbp->b_addr;
1630 	struct xfs_btree_cur *bno_cur;	/* cursor for bno btree */
1631 	struct xfs_btree_cur *cnt_cur;	/* cursor for cnt btree */
1632 	int		error;		/* error result */
1633 	xfs_agblock_t	fbno;		/* start of found freespace */
1634 	xfs_extlen_t	flen;		/* length of found freespace */
1635 	int		i;		/* temp status variable */
1636 	xfs_agblock_t	rbno;		/* returned block number */
1637 	xfs_extlen_t	rlen;		/* length of returned extent */
1638 	bool		busy;
1639 	unsigned	busy_gen;
1640 
1641 restart:
1642 	/*
1643 	 * Allocate and initialize a cursor for the by-size btree.
1644 	 */
1645 	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1646 					args->pag, XFS_BTNUM_CNT);
1647 	bno_cur = NULL;
1648 
1649 	/*
1650 	 * Look for an entry >= maxlen+alignment-1 blocks.
1651 	 */
1652 	if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
1653 			args->maxlen + args->alignment - 1, &i)))
1654 		goto error0;
1655 
1656 	/*
1657 	 * If none then we have to settle for a smaller extent. In the case that
1658 	 * there are no large extents, this will return the last entry in the
1659 	 * tree unless the tree is empty. In the case that there are only busy
1660 	 * large extents, this will return the largest small extent unless there
1661 	 * are no smaller extents available.
1662 	 */
1663 	if (!i) {
1664 		error = xfs_alloc_ag_vextent_small(args, cnt_cur,
1665 						   &fbno, &flen, &i);
1666 		if (error)
1667 			goto error0;
1668 		if (i == 0 || flen == 0) {
1669 			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1670 			trace_xfs_alloc_size_noentry(args);
1671 			return 0;
1672 		}
1673 		ASSERT(i == 1);
1674 		busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno,
1675 				&rlen, &busy_gen);
1676 	} else {
1677 		/*
1678 		 * Search for a non-busy extent that is large enough.
1679 		 */
1680 		for (;;) {
1681 			error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
1682 			if (error)
1683 				goto error0;
1684 			if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1685 				error = -EFSCORRUPTED;
1686 				goto error0;
1687 			}
1688 
1689 			busy = xfs_alloc_compute_aligned(args, fbno, flen,
1690 					&rbno, &rlen, &busy_gen);
1691 
1692 			if (rlen >= args->maxlen)
1693 				break;
1694 
1695 			error = xfs_btree_increment(cnt_cur, 0, &i);
1696 			if (error)
1697 				goto error0;
1698 			if (i == 0) {
1699 				/*
1700 				 * Our only valid extents must have been busy.
1701 				 * Make it unbusy by forcing the log out and
1702 				 * retrying.
1703 				 */
1704 				xfs_btree_del_cursor(cnt_cur,
1705 						     XFS_BTREE_NOERROR);
1706 				trace_xfs_alloc_size_busy(args);
1707 				xfs_extent_busy_flush(args->mp,
1708 							args->pag, busy_gen);
1709 				goto restart;
1710 			}
1711 		}
1712 	}
1713 
1714 	/*
1715 	 * In the first case above, we got the last entry in the
1716 	 * by-size btree.  Now we check to see if the space hits maxlen
1717 	 * once aligned; if not, we search left for something better.
1718 	 * This can't happen in the second case above.
1719 	 */
1720 	rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1721 	if (XFS_IS_CORRUPT(args->mp,
1722 			   rlen != 0 &&
1723 			   (rlen > flen ||
1724 			    rbno + rlen > fbno + flen))) {
1725 		error = -EFSCORRUPTED;
1726 		goto error0;
1727 	}
1728 	if (rlen < args->maxlen) {
1729 		xfs_agblock_t	bestfbno;
1730 		xfs_extlen_t	bestflen;
1731 		xfs_agblock_t	bestrbno;
1732 		xfs_extlen_t	bestrlen;
1733 
1734 		bestrlen = rlen;
1735 		bestrbno = rbno;
1736 		bestflen = flen;
1737 		bestfbno = fbno;
1738 		for (;;) {
1739 			if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
1740 				goto error0;
1741 			if (i == 0)
1742 				break;
1743 			if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
1744 					&i)))
1745 				goto error0;
1746 			if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1747 				error = -EFSCORRUPTED;
1748 				goto error0;
1749 			}
1750 			if (flen < bestrlen)
1751 				break;
1752 			busy = xfs_alloc_compute_aligned(args, fbno, flen,
1753 					&rbno, &rlen, &busy_gen);
1754 			rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1755 			if (XFS_IS_CORRUPT(args->mp,
1756 					   rlen != 0 &&
1757 					   (rlen > flen ||
1758 					    rbno + rlen > fbno + flen))) {
1759 				error = -EFSCORRUPTED;
1760 				goto error0;
1761 			}
1762 			if (rlen > bestrlen) {
1763 				bestrlen = rlen;
1764 				bestrbno = rbno;
1765 				bestflen = flen;
1766 				bestfbno = fbno;
1767 				if (rlen == args->maxlen)
1768 					break;
1769 			}
1770 		}
1771 		if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
1772 				&i)))
1773 			goto error0;
1774 		if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1775 			error = -EFSCORRUPTED;
1776 			goto error0;
1777 		}
1778 		rlen = bestrlen;
1779 		rbno = bestrbno;
1780 		flen = bestflen;
1781 		fbno = bestfbno;
1782 	}
1783 	args->wasfromfl = 0;
1784 	/*
1785 	 * Fix up the length.
1786 	 */
1787 	args->len = rlen;
1788 	if (rlen < args->minlen) {
1789 		if (busy) {
1790 			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1791 			trace_xfs_alloc_size_busy(args);
1792 			xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
1793 			goto restart;
1794 		}
1795 		goto out_nominleft;
1796 	}
1797 	xfs_alloc_fix_len(args);
1798 
1799 	rlen = args->len;
1800 	if (XFS_IS_CORRUPT(args->mp, rlen > flen)) {
1801 		error = -EFSCORRUPTED;
1802 		goto error0;
1803 	}
1804 	/*
1805 	 * Allocate and initialize a cursor for the by-block tree.
1806 	 */
1807 	bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1808 					args->pag, XFS_BTNUM_BNO);
1809 	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
1810 			rbno, rlen, XFSA_FIXUP_CNT_OK)))
1811 		goto error0;
1812 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1813 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1814 	cnt_cur = bno_cur = NULL;
1815 	args->len = rlen;
1816 	args->agbno = rbno;
1817 	if (XFS_IS_CORRUPT(args->mp,
1818 			   args->agbno + args->len >
1819 			   be32_to_cpu(agf->agf_length))) {
1820 		error = -EFSCORRUPTED;
1821 		goto error0;
1822 	}
1823 	trace_xfs_alloc_size_done(args);
1824 	return 0;
1825 
1826 error0:
1827 	trace_xfs_alloc_size_error(args);
1828 	if (cnt_cur)
1829 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1830 	if (bno_cur)
1831 		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1832 	return error;
1833 
1834 out_nominleft:
1835 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1836 	trace_xfs_alloc_size_nominleft(args);
1837 	args->agbno = NULLAGBLOCK;
1838 	return 0;
1839 }
1840 
1841 /*
1842  * Free the extent starting at agno/bno for length.
1843  */
1844 STATIC int
1845 xfs_free_ag_extent(
1846 	struct xfs_trans		*tp,
1847 	struct xfs_buf			*agbp,
1848 	xfs_agnumber_t			agno,
1849 	xfs_agblock_t			bno,
1850 	xfs_extlen_t			len,
1851 	const struct xfs_owner_info	*oinfo,
1852 	enum xfs_ag_resv_type		type)
1853 {
1854 	struct xfs_mount		*mp;
1855 	struct xfs_btree_cur		*bno_cur;
1856 	struct xfs_btree_cur		*cnt_cur;
1857 	xfs_agblock_t			gtbno; /* start of right neighbor */
1858 	xfs_extlen_t			gtlen; /* length of right neighbor */
1859 	xfs_agblock_t			ltbno; /* start of left neighbor */
1860 	xfs_extlen_t			ltlen; /* length of left neighbor */
1861 	xfs_agblock_t			nbno; /* new starting block of freesp */
1862 	xfs_extlen_t			nlen; /* new length of freespace */
1863 	int				haveleft; /* have a left neighbor */
1864 	int				haveright; /* have a right neighbor */
1865 	int				i;
1866 	int				error;
1867 	struct xfs_perag		*pag = agbp->b_pag;
1868 
1869 	bno_cur = cnt_cur = NULL;
1870 	mp = tp->t_mountp;
1871 
1872 	if (!xfs_rmap_should_skip_owner_update(oinfo)) {
1873 		error = xfs_rmap_free(tp, agbp, pag, bno, len, oinfo);
1874 		if (error)
1875 			goto error0;
1876 	}
1877 
1878 	/*
1879 	 * Allocate and initialize a cursor for the by-block btree.
1880 	 */
1881 	bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_BNO);
1882 	/*
1883 	 * Look for a neighboring block on the left (lower block numbers)
1884 	 * that is contiguous with this space.
1885 	 */
1886 	if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft)))
1887 		goto error0;
1888 	if (haveleft) {
1889 		/*
1890 		 * There is a block to our left.
1891 		 */
1892 		if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
1893 			goto error0;
1894 		if (XFS_IS_CORRUPT(mp, i != 1)) {
1895 			error = -EFSCORRUPTED;
1896 			goto error0;
1897 		}
1898 		/*
1899 		 * It's not contiguous, though.
1900 		 */
1901 		if (ltbno + ltlen < bno)
1902 			haveleft = 0;
1903 		else {
1904 			/*
1905 			 * If this failure happens the request to free this
1906 			 * space was invalid, it's (partly) already free.
1907 			 * Very bad.
1908 			 */
1909 			if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) {
1910 				error = -EFSCORRUPTED;
1911 				goto error0;
1912 			}
1913 		}
1914 	}
1915 	/*
1916 	 * Look for a neighboring block on the right (higher block numbers)
1917 	 * that is contiguous with this space.
1918 	 */
1919 	if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
1920 		goto error0;
1921 	if (haveright) {
1922 		/*
1923 		 * There is a block to our right.
1924 		 */
1925 		if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
1926 			goto error0;
1927 		if (XFS_IS_CORRUPT(mp, i != 1)) {
1928 			error = -EFSCORRUPTED;
1929 			goto error0;
1930 		}
1931 		/*
1932 		 * It's not contiguous, though.
1933 		 */
1934 		if (bno + len < gtbno)
1935 			haveright = 0;
1936 		else {
1937 			/*
1938 			 * If this failure happens the request to free this
1939 			 * space was invalid, it's (partly) already free.
1940 			 * Very bad.
1941 			 */
1942 			if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) {
1943 				error = -EFSCORRUPTED;
1944 				goto error0;
1945 			}
1946 		}
1947 	}
1948 	/*
1949 	 * Now allocate and initialize a cursor for the by-size tree.
1950 	 */
1951 	cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_CNT);
1952 	/*
1953 	 * Have both left and right contiguous neighbors.
1954 	 * Merge all three into a single free block.
1955 	 */
1956 	if (haveleft && haveright) {
1957 		/*
1958 		 * Delete the old by-size entry on the left.
1959 		 */
1960 		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
1961 			goto error0;
1962 		if (XFS_IS_CORRUPT(mp, i != 1)) {
1963 			error = -EFSCORRUPTED;
1964 			goto error0;
1965 		}
1966 		if ((error = xfs_btree_delete(cnt_cur, &i)))
1967 			goto error0;
1968 		if (XFS_IS_CORRUPT(mp, i != 1)) {
1969 			error = -EFSCORRUPTED;
1970 			goto error0;
1971 		}
1972 		/*
1973 		 * Delete the old by-size entry on the right.
1974 		 */
1975 		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
1976 			goto error0;
1977 		if (XFS_IS_CORRUPT(mp, i != 1)) {
1978 			error = -EFSCORRUPTED;
1979 			goto error0;
1980 		}
1981 		if ((error = xfs_btree_delete(cnt_cur, &i)))
1982 			goto error0;
1983 		if (XFS_IS_CORRUPT(mp, i != 1)) {
1984 			error = -EFSCORRUPTED;
1985 			goto error0;
1986 		}
1987 		/*
1988 		 * Delete the old by-block entry for the right block.
1989 		 */
1990 		if ((error = xfs_btree_delete(bno_cur, &i)))
1991 			goto error0;
1992 		if (XFS_IS_CORRUPT(mp, i != 1)) {
1993 			error = -EFSCORRUPTED;
1994 			goto error0;
1995 		}
1996 		/*
1997 		 * Move the by-block cursor back to the left neighbor.
1998 		 */
1999 		if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
2000 			goto error0;
2001 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2002 			error = -EFSCORRUPTED;
2003 			goto error0;
2004 		}
2005 #ifdef DEBUG
2006 		/*
2007 		 * Check that this is the right record: delete didn't
2008 		 * mangle the cursor.
2009 		 */
2010 		{
2011 			xfs_agblock_t	xxbno;
2012 			xfs_extlen_t	xxlen;
2013 
2014 			if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
2015 					&i)))
2016 				goto error0;
2017 			if (XFS_IS_CORRUPT(mp,
2018 					   i != 1 ||
2019 					   xxbno != ltbno ||
2020 					   xxlen != ltlen)) {
2021 				error = -EFSCORRUPTED;
2022 				goto error0;
2023 			}
2024 		}
2025 #endif
2026 		/*
2027 		 * Update remaining by-block entry to the new, joined block.
2028 		 */
2029 		nbno = ltbno;
2030 		nlen = len + ltlen + gtlen;
2031 		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2032 			goto error0;
2033 	}
2034 	/*
2035 	 * Have only a left contiguous neighbor.
2036 	 * Merge it together with the new freespace.
2037 	 */
2038 	else if (haveleft) {
2039 		/*
2040 		 * Delete the old by-size entry on the left.
2041 		 */
2042 		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
2043 			goto error0;
2044 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2045 			error = -EFSCORRUPTED;
2046 			goto error0;
2047 		}
2048 		if ((error = xfs_btree_delete(cnt_cur, &i)))
2049 			goto error0;
2050 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2051 			error = -EFSCORRUPTED;
2052 			goto error0;
2053 		}
2054 		/*
2055 		 * Back up the by-block cursor to the left neighbor, and
2056 		 * update its length.
2057 		 */
2058 		if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
2059 			goto error0;
2060 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2061 			error = -EFSCORRUPTED;
2062 			goto error0;
2063 		}
2064 		nbno = ltbno;
2065 		nlen = len + ltlen;
2066 		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2067 			goto error0;
2068 	}
2069 	/*
2070 	 * Have only a right contiguous neighbor.
2071 	 * Merge it together with the new freespace.
2072 	 */
2073 	else if (haveright) {
2074 		/*
2075 		 * Delete the old by-size entry on the right.
2076 		 */
2077 		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
2078 			goto error0;
2079 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2080 			error = -EFSCORRUPTED;
2081 			goto error0;
2082 		}
2083 		if ((error = xfs_btree_delete(cnt_cur, &i)))
2084 			goto error0;
2085 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2086 			error = -EFSCORRUPTED;
2087 			goto error0;
2088 		}
2089 		/*
2090 		 * Update the starting block and length of the right
2091 		 * neighbor in the by-block tree.
2092 		 */
2093 		nbno = bno;
2094 		nlen = len + gtlen;
2095 		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2096 			goto error0;
2097 	}
2098 	/*
2099 	 * No contiguous neighbors.
2100 	 * Insert the new freespace into the by-block tree.
2101 	 */
2102 	else {
2103 		nbno = bno;
2104 		nlen = len;
2105 		if ((error = xfs_btree_insert(bno_cur, &i)))
2106 			goto error0;
2107 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2108 			error = -EFSCORRUPTED;
2109 			goto error0;
2110 		}
2111 	}
2112 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
2113 	bno_cur = NULL;
2114 	/*
2115 	 * In all cases we need to insert the new freespace in the by-size tree.
2116 	 */
2117 	if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
2118 		goto error0;
2119 	if (XFS_IS_CORRUPT(mp, i != 0)) {
2120 		error = -EFSCORRUPTED;
2121 		goto error0;
2122 	}
2123 	if ((error = xfs_btree_insert(cnt_cur, &i)))
2124 		goto error0;
2125 	if (XFS_IS_CORRUPT(mp, i != 1)) {
2126 		error = -EFSCORRUPTED;
2127 		goto error0;
2128 	}
2129 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
2130 	cnt_cur = NULL;
2131 
2132 	/*
2133 	 * Update the freespace totals in the ag and superblock.
2134 	 */
2135 	error = xfs_alloc_update_counters(tp, agbp, len);
2136 	xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len);
2137 	if (error)
2138 		goto error0;
2139 
2140 	XFS_STATS_INC(mp, xs_freex);
2141 	XFS_STATS_ADD(mp, xs_freeb, len);
2142 
2143 	trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright);
2144 
2145 	return 0;
2146 
2147  error0:
2148 	trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1);
2149 	if (bno_cur)
2150 		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
2151 	if (cnt_cur)
2152 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
2153 	return error;
2154 }
2155 
2156 /*
2157  * Visible (exported) allocation/free functions.
2158  * Some of these are used just by xfs_alloc_btree.c and this file.
2159  */
2160 
2161 /*
2162  * Compute and fill in value of m_alloc_maxlevels.
2163  */
2164 void
2165 xfs_alloc_compute_maxlevels(
2166 	xfs_mount_t	*mp)	/* file system mount structure */
2167 {
2168 	mp->m_alloc_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr,
2169 			(mp->m_sb.sb_agblocks + 1) / 2);
2170 	ASSERT(mp->m_alloc_maxlevels <= xfs_allocbt_maxlevels_ondisk());
2171 }
2172 
2173 /*
2174  * Find the length of the longest extent in an AG.  The 'need' parameter
2175  * specifies how much space we're going to need for the AGFL and the
2176  * 'reserved' parameter tells us how many blocks in this AG are reserved for
2177  * other callers.
2178  */
2179 xfs_extlen_t
2180 xfs_alloc_longest_free_extent(
2181 	struct xfs_perag	*pag,
2182 	xfs_extlen_t		need,
2183 	xfs_extlen_t		reserved)
2184 {
2185 	xfs_extlen_t		delta = 0;
2186 
2187 	/*
2188 	 * If the AGFL needs a recharge, we'll have to subtract that from the
2189 	 * longest extent.
2190 	 */
2191 	if (need > pag->pagf_flcount)
2192 		delta = need - pag->pagf_flcount;
2193 
2194 	/*
2195 	 * If we cannot maintain others' reservations with space from the
2196 	 * not-longest freesp extents, we'll have to subtract /that/ from
2197 	 * the longest extent too.
2198 	 */
2199 	if (pag->pagf_freeblks - pag->pagf_longest < reserved)
2200 		delta += reserved - (pag->pagf_freeblks - pag->pagf_longest);
2201 
2202 	/*
2203 	 * If the longest extent is long enough to satisfy all the
2204 	 * reservations and AGFL rules in place, we can return this extent.
2205 	 */
2206 	if (pag->pagf_longest > delta)
2207 		return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable,
2208 				pag->pagf_longest - delta);
2209 
2210 	/* Otherwise, let the caller try for 1 block if there's space. */
2211 	return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
2212 }
2213 
2214 /*
2215  * Compute the minimum length of the AGFL in the given AG.  If @pag is NULL,
2216  * return the largest possible minimum length.
2217  */
2218 unsigned int
2219 xfs_alloc_min_freelist(
2220 	struct xfs_mount	*mp,
2221 	struct xfs_perag	*pag)
2222 {
2223 	/* AG btrees have at least 1 level. */
2224 	static const uint8_t	fake_levels[XFS_BTNUM_AGF] = {1, 1, 1};
2225 	const uint8_t		*levels = pag ? pag->pagf_levels : fake_levels;
2226 	unsigned int		min_free;
2227 
2228 	ASSERT(mp->m_alloc_maxlevels > 0);
2229 
2230 	/* space needed by-bno freespace btree */
2231 	min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1,
2232 				       mp->m_alloc_maxlevels);
2233 	/* space needed by-size freespace btree */
2234 	min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1,
2235 				       mp->m_alloc_maxlevels);
2236 	/* space needed reverse mapping used space btree */
2237 	if (xfs_has_rmapbt(mp))
2238 		min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1,
2239 						mp->m_rmap_maxlevels);
2240 
2241 	return min_free;
2242 }
2243 
2244 /*
2245  * Check if the operation we are fixing up the freelist for should go ahead or
2246  * not. If we are freeing blocks, we always allow it, otherwise the allocation
2247  * is dependent on whether the size and shape of free space available will
2248  * permit the requested allocation to take place.
2249  */
2250 static bool
2251 xfs_alloc_space_available(
2252 	struct xfs_alloc_arg	*args,
2253 	xfs_extlen_t		min_free,
2254 	int			flags)
2255 {
2256 	struct xfs_perag	*pag = args->pag;
2257 	xfs_extlen_t		alloc_len, longest;
2258 	xfs_extlen_t		reservation; /* blocks that are still reserved */
2259 	int			available;
2260 	xfs_extlen_t		agflcount;
2261 
2262 	if (flags & XFS_ALLOC_FLAG_FREEING)
2263 		return true;
2264 
2265 	reservation = xfs_ag_resv_needed(pag, args->resv);
2266 
2267 	/* do we have enough contiguous free space for the allocation? */
2268 	alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop;
2269 	longest = xfs_alloc_longest_free_extent(pag, min_free, reservation);
2270 	if (longest < alloc_len)
2271 		return false;
2272 
2273 	/*
2274 	 * Do we have enough free space remaining for the allocation? Don't
2275 	 * account extra agfl blocks because we are about to defer free them,
2276 	 * making them unavailable until the current transaction commits.
2277 	 */
2278 	agflcount = min_t(xfs_extlen_t, pag->pagf_flcount, min_free);
2279 	available = (int)(pag->pagf_freeblks + agflcount -
2280 			  reservation - min_free - args->minleft);
2281 	if (available < (int)max(args->total, alloc_len))
2282 		return false;
2283 
2284 	/*
2285 	 * Clamp maxlen to the amount of free space available for the actual
2286 	 * extent allocation.
2287 	 */
2288 	if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) {
2289 		args->maxlen = available;
2290 		ASSERT(args->maxlen > 0);
2291 		ASSERT(args->maxlen >= args->minlen);
2292 	}
2293 
2294 	return true;
2295 }
2296 
2297 int
2298 xfs_free_agfl_block(
2299 	struct xfs_trans	*tp,
2300 	xfs_agnumber_t		agno,
2301 	xfs_agblock_t		agbno,
2302 	struct xfs_buf		*agbp,
2303 	struct xfs_owner_info	*oinfo)
2304 {
2305 	int			error;
2306 	struct xfs_buf		*bp;
2307 
2308 	error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo,
2309 				   XFS_AG_RESV_AGFL);
2310 	if (error)
2311 		return error;
2312 
2313 	error = xfs_trans_get_buf(tp, tp->t_mountp->m_ddev_targp,
2314 			XFS_AGB_TO_DADDR(tp->t_mountp, agno, agbno),
2315 			tp->t_mountp->m_bsize, 0, &bp);
2316 	if (error)
2317 		return error;
2318 	xfs_trans_binval(tp, bp);
2319 
2320 	return 0;
2321 }
2322 
2323 /*
2324  * Check the agfl fields of the agf for inconsistency or corruption. The purpose
2325  * is to detect an agfl header padding mismatch between current and early v5
2326  * kernels. This problem manifests as a 1-slot size difference between the
2327  * on-disk flcount and the active [first, last] range of a wrapped agfl. This
2328  * may also catch variants of agfl count corruption unrelated to padding. Either
2329  * way, we'll reset the agfl and warn the user.
2330  *
2331  * Return true if a reset is required before the agfl can be used, false
2332  * otherwise.
2333  */
2334 static bool
2335 xfs_agfl_needs_reset(
2336 	struct xfs_mount	*mp,
2337 	struct xfs_agf		*agf)
2338 {
2339 	uint32_t		f = be32_to_cpu(agf->agf_flfirst);
2340 	uint32_t		l = be32_to_cpu(agf->agf_fllast);
2341 	uint32_t		c = be32_to_cpu(agf->agf_flcount);
2342 	int			agfl_size = xfs_agfl_size(mp);
2343 	int			active;
2344 
2345 	/* no agfl header on v4 supers */
2346 	if (!xfs_has_crc(mp))
2347 		return false;
2348 
2349 	/*
2350 	 * The agf read verifier catches severe corruption of these fields.
2351 	 * Repeat some sanity checks to cover a packed -> unpacked mismatch if
2352 	 * the verifier allows it.
2353 	 */
2354 	if (f >= agfl_size || l >= agfl_size)
2355 		return true;
2356 	if (c > agfl_size)
2357 		return true;
2358 
2359 	/*
2360 	 * Check consistency between the on-disk count and the active range. An
2361 	 * agfl padding mismatch manifests as an inconsistent flcount.
2362 	 */
2363 	if (c && l >= f)
2364 		active = l - f + 1;
2365 	else if (c)
2366 		active = agfl_size - f + l + 1;
2367 	else
2368 		active = 0;
2369 
2370 	return active != c;
2371 }
2372 
2373 /*
2374  * Reset the agfl to an empty state. Ignore/drop any existing blocks since the
2375  * agfl content cannot be trusted. Warn the user that a repair is required to
2376  * recover leaked blocks.
2377  *
2378  * The purpose of this mechanism is to handle filesystems affected by the agfl
2379  * header padding mismatch problem. A reset keeps the filesystem online with a
2380  * relatively minor free space accounting inconsistency rather than suffer the
2381  * inevitable crash from use of an invalid agfl block.
2382  */
2383 static void
2384 xfs_agfl_reset(
2385 	struct xfs_trans	*tp,
2386 	struct xfs_buf		*agbp,
2387 	struct xfs_perag	*pag)
2388 {
2389 	struct xfs_mount	*mp = tp->t_mountp;
2390 	struct xfs_agf		*agf = agbp->b_addr;
2391 
2392 	ASSERT(xfs_perag_agfl_needs_reset(pag));
2393 	trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_);
2394 
2395 	xfs_warn(mp,
2396 	       "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. "
2397 	       "Please unmount and run xfs_repair.",
2398 	         pag->pag_agno, pag->pagf_flcount);
2399 
2400 	agf->agf_flfirst = 0;
2401 	agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1);
2402 	agf->agf_flcount = 0;
2403 	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST |
2404 				    XFS_AGF_FLCOUNT);
2405 
2406 	pag->pagf_flcount = 0;
2407 	clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
2408 }
2409 
2410 /*
2411  * Defer an AGFL block free. This is effectively equivalent to
2412  * xfs_free_extent_later() with some special handling particular to AGFL blocks.
2413  *
2414  * Deferring AGFL frees helps prevent log reservation overruns due to too many
2415  * allocation operations in a transaction. AGFL frees are prone to this problem
2416  * because for one they are always freed one at a time. Further, an immediate
2417  * AGFL block free can cause a btree join and require another block free before
2418  * the real allocation can proceed. Deferring the free disconnects freeing up
2419  * the AGFL slot from freeing the block.
2420  */
2421 STATIC void
2422 xfs_defer_agfl_block(
2423 	struct xfs_trans		*tp,
2424 	xfs_agnumber_t			agno,
2425 	xfs_fsblock_t			agbno,
2426 	struct xfs_owner_info		*oinfo)
2427 {
2428 	struct xfs_mount		*mp = tp->t_mountp;
2429 	struct xfs_extent_free_item	*xefi;
2430 
2431 	ASSERT(xfs_extfree_item_cache != NULL);
2432 	ASSERT(oinfo != NULL);
2433 
2434 	xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
2435 			       GFP_KERNEL | __GFP_NOFAIL);
2436 	xefi->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
2437 	xefi->xefi_blockcount = 1;
2438 	xefi->xefi_owner = oinfo->oi_owner;
2439 
2440 	trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
2441 
2442 	xfs_extent_free_get_group(mp, xefi);
2443 	xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list);
2444 }
2445 
2446 /*
2447  * Add the extent to the list of extents to be free at transaction end.
2448  * The list is maintained sorted (by block number).
2449  */
2450 void
2451 __xfs_free_extent_later(
2452 	struct xfs_trans		*tp,
2453 	xfs_fsblock_t			bno,
2454 	xfs_filblks_t			len,
2455 	const struct xfs_owner_info	*oinfo,
2456 	bool				skip_discard)
2457 {
2458 	struct xfs_extent_free_item	*xefi;
2459 	struct xfs_mount		*mp = tp->t_mountp;
2460 #ifdef DEBUG
2461 	xfs_agnumber_t			agno;
2462 	xfs_agblock_t			agbno;
2463 
2464 	ASSERT(bno != NULLFSBLOCK);
2465 	ASSERT(len > 0);
2466 	ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
2467 	ASSERT(!isnullstartblock(bno));
2468 	agno = XFS_FSB_TO_AGNO(mp, bno);
2469 	agbno = XFS_FSB_TO_AGBNO(mp, bno);
2470 	ASSERT(agno < mp->m_sb.sb_agcount);
2471 	ASSERT(agbno < mp->m_sb.sb_agblocks);
2472 	ASSERT(len < mp->m_sb.sb_agblocks);
2473 	ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
2474 #endif
2475 	ASSERT(xfs_extfree_item_cache != NULL);
2476 
2477 	xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
2478 			       GFP_KERNEL | __GFP_NOFAIL);
2479 	xefi->xefi_startblock = bno;
2480 	xefi->xefi_blockcount = (xfs_extlen_t)len;
2481 	if (skip_discard)
2482 		xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD;
2483 	if (oinfo) {
2484 		ASSERT(oinfo->oi_offset == 0);
2485 
2486 		if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
2487 			xefi->xefi_flags |= XFS_EFI_ATTR_FORK;
2488 		if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
2489 			xefi->xefi_flags |= XFS_EFI_BMBT_BLOCK;
2490 		xefi->xefi_owner = oinfo->oi_owner;
2491 	} else {
2492 		xefi->xefi_owner = XFS_RMAP_OWN_NULL;
2493 	}
2494 	trace_xfs_bmap_free_defer(mp,
2495 			XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0,
2496 			XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len);
2497 
2498 	xfs_extent_free_get_group(mp, xefi);
2499 	xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list);
2500 }
2501 
2502 #ifdef DEBUG
2503 /*
2504  * Check if an AGF has a free extent record whose length is equal to
2505  * args->minlen.
2506  */
2507 STATIC int
2508 xfs_exact_minlen_extent_available(
2509 	struct xfs_alloc_arg	*args,
2510 	struct xfs_buf		*agbp,
2511 	int			*stat)
2512 {
2513 	struct xfs_btree_cur	*cnt_cur;
2514 	xfs_agblock_t		fbno;
2515 	xfs_extlen_t		flen;
2516 	int			error = 0;
2517 
2518 	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, agbp,
2519 					args->pag, XFS_BTNUM_CNT);
2520 	error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat);
2521 	if (error)
2522 		goto out;
2523 
2524 	if (*stat == 0) {
2525 		error = -EFSCORRUPTED;
2526 		goto out;
2527 	}
2528 
2529 	error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, stat);
2530 	if (error)
2531 		goto out;
2532 
2533 	if (*stat == 1 && flen != args->minlen)
2534 		*stat = 0;
2535 
2536 out:
2537 	xfs_btree_del_cursor(cnt_cur, error);
2538 
2539 	return error;
2540 }
2541 #endif
2542 
2543 /*
2544  * Decide whether to use this allocation group for this allocation.
2545  * If so, fix up the btree freelist's size.
2546  */
2547 int			/* error */
2548 xfs_alloc_fix_freelist(
2549 	struct xfs_alloc_arg	*args,	/* allocation argument structure */
2550 	int			flags)	/* XFS_ALLOC_FLAG_... */
2551 {
2552 	struct xfs_mount	*mp = args->mp;
2553 	struct xfs_perag	*pag = args->pag;
2554 	struct xfs_trans	*tp = args->tp;
2555 	struct xfs_buf		*agbp = NULL;
2556 	struct xfs_buf		*agflbp = NULL;
2557 	struct xfs_alloc_arg	targs;	/* local allocation arguments */
2558 	xfs_agblock_t		bno;	/* freelist block */
2559 	xfs_extlen_t		need;	/* total blocks needed in freelist */
2560 	int			error = 0;
2561 
2562 	/* deferred ops (AGFL block frees) require permanent transactions */
2563 	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
2564 
2565 	if (!xfs_perag_initialised_agf(pag)) {
2566 		error = xfs_alloc_read_agf(pag, tp, flags, &agbp);
2567 		if (error) {
2568 			/* Couldn't lock the AGF so skip this AG. */
2569 			if (error == -EAGAIN)
2570 				error = 0;
2571 			goto out_no_agbp;
2572 		}
2573 	}
2574 
2575 	/*
2576 	 * If this is a metadata preferred pag and we are user data then try
2577 	 * somewhere else if we are not being asked to try harder at this
2578 	 * point
2579 	 */
2580 	if (xfs_perag_prefers_metadata(pag) &&
2581 	    (args->datatype & XFS_ALLOC_USERDATA) &&
2582 	    (flags & XFS_ALLOC_FLAG_TRYLOCK)) {
2583 		ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
2584 		goto out_agbp_relse;
2585 	}
2586 
2587 	need = xfs_alloc_min_freelist(mp, pag);
2588 	if (!xfs_alloc_space_available(args, need, flags |
2589 			XFS_ALLOC_FLAG_CHECK))
2590 		goto out_agbp_relse;
2591 
2592 	/*
2593 	 * Get the a.g. freespace buffer.
2594 	 * Can fail if we're not blocking on locks, and it's held.
2595 	 */
2596 	if (!agbp) {
2597 		error = xfs_alloc_read_agf(pag, tp, flags, &agbp);
2598 		if (error) {
2599 			/* Couldn't lock the AGF so skip this AG. */
2600 			if (error == -EAGAIN)
2601 				error = 0;
2602 			goto out_no_agbp;
2603 		}
2604 	}
2605 
2606 	/* reset a padding mismatched agfl before final free space check */
2607 	if (xfs_perag_agfl_needs_reset(pag))
2608 		xfs_agfl_reset(tp, agbp, pag);
2609 
2610 	/* If there isn't enough total space or single-extent, reject it. */
2611 	need = xfs_alloc_min_freelist(mp, pag);
2612 	if (!xfs_alloc_space_available(args, need, flags))
2613 		goto out_agbp_relse;
2614 
2615 #ifdef DEBUG
2616 	if (args->alloc_minlen_only) {
2617 		int stat;
2618 
2619 		error = xfs_exact_minlen_extent_available(args, agbp, &stat);
2620 		if (error || !stat)
2621 			goto out_agbp_relse;
2622 	}
2623 #endif
2624 	/*
2625 	 * Make the freelist shorter if it's too long.
2626 	 *
2627 	 * Note that from this point onwards, we will always release the agf and
2628 	 * agfl buffers on error. This handles the case where we error out and
2629 	 * the buffers are clean or may not have been joined to the transaction
2630 	 * and hence need to be released manually. If they have been joined to
2631 	 * the transaction, then xfs_trans_brelse() will handle them
2632 	 * appropriately based on the recursion count and dirty state of the
2633 	 * buffer.
2634 	 *
2635 	 * XXX (dgc): When we have lots of free space, does this buy us
2636 	 * anything other than extra overhead when we need to put more blocks
2637 	 * back on the free list? Maybe we should only do this when space is
2638 	 * getting low or the AGFL is more than half full?
2639 	 *
2640 	 * The NOSHRINK flag prevents the AGFL from being shrunk if it's too
2641 	 * big; the NORMAP flag prevents AGFL expand/shrink operations from
2642 	 * updating the rmapbt.  Both flags are used in xfs_repair while we're
2643 	 * rebuilding the rmapbt, and neither are used by the kernel.  They're
2644 	 * both required to ensure that rmaps are correctly recorded for the
2645 	 * regenerated AGFL, bnobt, and cntbt.  See repair/phase5.c and
2646 	 * repair/rmap.c in xfsprogs for details.
2647 	 */
2648 	memset(&targs, 0, sizeof(targs));
2649 	/* struct copy below */
2650 	if (flags & XFS_ALLOC_FLAG_NORMAP)
2651 		targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
2652 	else
2653 		targs.oinfo = XFS_RMAP_OINFO_AG;
2654 	while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) {
2655 		error = xfs_alloc_get_freelist(pag, tp, agbp, &bno, 0);
2656 		if (error)
2657 			goto out_agbp_relse;
2658 
2659 		/* defer agfl frees */
2660 		xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo);
2661 	}
2662 
2663 	targs.tp = tp;
2664 	targs.mp = mp;
2665 	targs.agbp = agbp;
2666 	targs.agno = args->agno;
2667 	targs.alignment = targs.minlen = targs.prod = 1;
2668 	targs.pag = pag;
2669 	error = xfs_alloc_read_agfl(pag, tp, &agflbp);
2670 	if (error)
2671 		goto out_agbp_relse;
2672 
2673 	/* Make the freelist longer if it's too short. */
2674 	while (pag->pagf_flcount < need) {
2675 		targs.agbno = 0;
2676 		targs.maxlen = need - pag->pagf_flcount;
2677 		targs.resv = XFS_AG_RESV_AGFL;
2678 
2679 		/* Allocate as many blocks as possible at once. */
2680 		error = xfs_alloc_ag_vextent_size(&targs);
2681 		if (error)
2682 			goto out_agflbp_relse;
2683 
2684 		/*
2685 		 * Stop if we run out.  Won't happen if callers are obeying
2686 		 * the restrictions correctly.  Can happen for free calls
2687 		 * on a completely full ag.
2688 		 */
2689 		if (targs.agbno == NULLAGBLOCK) {
2690 			if (flags & XFS_ALLOC_FLAG_FREEING)
2691 				break;
2692 			goto out_agflbp_relse;
2693 		}
2694 
2695 		if (!xfs_rmap_should_skip_owner_update(&targs.oinfo)) {
2696 			error = xfs_rmap_alloc(tp, agbp, pag,
2697 				       targs.agbno, targs.len, &targs.oinfo);
2698 			if (error)
2699 				goto out_agflbp_relse;
2700 		}
2701 		error = xfs_alloc_update_counters(tp, agbp,
2702 						  -((long)(targs.len)));
2703 		if (error)
2704 			goto out_agflbp_relse;
2705 
2706 		/*
2707 		 * Put each allocated block on the list.
2708 		 */
2709 		for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
2710 			error = xfs_alloc_put_freelist(pag, tp, agbp,
2711 							agflbp, bno, 0);
2712 			if (error)
2713 				goto out_agflbp_relse;
2714 		}
2715 	}
2716 	xfs_trans_brelse(tp, agflbp);
2717 	args->agbp = agbp;
2718 	return 0;
2719 
2720 out_agflbp_relse:
2721 	xfs_trans_brelse(tp, agflbp);
2722 out_agbp_relse:
2723 	if (agbp)
2724 		xfs_trans_brelse(tp, agbp);
2725 out_no_agbp:
2726 	args->agbp = NULL;
2727 	return error;
2728 }
2729 
2730 /*
2731  * Get a block from the freelist.
2732  * Returns with the buffer for the block gotten.
2733  */
2734 int
2735 xfs_alloc_get_freelist(
2736 	struct xfs_perag	*pag,
2737 	struct xfs_trans	*tp,
2738 	struct xfs_buf		*agbp,
2739 	xfs_agblock_t		*bnop,
2740 	int			btreeblk)
2741 {
2742 	struct xfs_agf		*agf = agbp->b_addr;
2743 	struct xfs_buf		*agflbp;
2744 	xfs_agblock_t		bno;
2745 	__be32			*agfl_bno;
2746 	int			error;
2747 	uint32_t		logflags;
2748 	struct xfs_mount	*mp = tp->t_mountp;
2749 
2750 	/*
2751 	 * Freelist is empty, give up.
2752 	 */
2753 	if (!agf->agf_flcount) {
2754 		*bnop = NULLAGBLOCK;
2755 		return 0;
2756 	}
2757 	/*
2758 	 * Read the array of free blocks.
2759 	 */
2760 	error = xfs_alloc_read_agfl(pag, tp, &agflbp);
2761 	if (error)
2762 		return error;
2763 
2764 
2765 	/*
2766 	 * Get the block number and update the data structures.
2767 	 */
2768 	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
2769 	bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
2770 	be32_add_cpu(&agf->agf_flfirst, 1);
2771 	xfs_trans_brelse(tp, agflbp);
2772 	if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
2773 		agf->agf_flfirst = 0;
2774 
2775 	ASSERT(!xfs_perag_agfl_needs_reset(pag));
2776 	be32_add_cpu(&agf->agf_flcount, -1);
2777 	pag->pagf_flcount--;
2778 
2779 	logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
2780 	if (btreeblk) {
2781 		be32_add_cpu(&agf->agf_btreeblks, 1);
2782 		pag->pagf_btreeblks++;
2783 		logflags |= XFS_AGF_BTREEBLKS;
2784 	}
2785 
2786 	xfs_alloc_log_agf(tp, agbp, logflags);
2787 	*bnop = bno;
2788 
2789 	return 0;
2790 }
2791 
2792 /*
2793  * Log the given fields from the agf structure.
2794  */
2795 void
2796 xfs_alloc_log_agf(
2797 	struct xfs_trans	*tp,
2798 	struct xfs_buf		*bp,
2799 	uint32_t		fields)
2800 {
2801 	int	first;		/* first byte offset */
2802 	int	last;		/* last byte offset */
2803 	static const short	offsets[] = {
2804 		offsetof(xfs_agf_t, agf_magicnum),
2805 		offsetof(xfs_agf_t, agf_versionnum),
2806 		offsetof(xfs_agf_t, agf_seqno),
2807 		offsetof(xfs_agf_t, agf_length),
2808 		offsetof(xfs_agf_t, agf_roots[0]),
2809 		offsetof(xfs_agf_t, agf_levels[0]),
2810 		offsetof(xfs_agf_t, agf_flfirst),
2811 		offsetof(xfs_agf_t, agf_fllast),
2812 		offsetof(xfs_agf_t, agf_flcount),
2813 		offsetof(xfs_agf_t, agf_freeblks),
2814 		offsetof(xfs_agf_t, agf_longest),
2815 		offsetof(xfs_agf_t, agf_btreeblks),
2816 		offsetof(xfs_agf_t, agf_uuid),
2817 		offsetof(xfs_agf_t, agf_rmap_blocks),
2818 		offsetof(xfs_agf_t, agf_refcount_blocks),
2819 		offsetof(xfs_agf_t, agf_refcount_root),
2820 		offsetof(xfs_agf_t, agf_refcount_level),
2821 		/* needed so that we don't log the whole rest of the structure: */
2822 		offsetof(xfs_agf_t, agf_spare64),
2823 		sizeof(xfs_agf_t)
2824 	};
2825 
2826 	trace_xfs_agf(tp->t_mountp, bp->b_addr, fields, _RET_IP_);
2827 
2828 	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);
2829 
2830 	xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
2831 	xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
2832 }
2833 
2834 /*
2835  * Put the block on the freelist for the allocation group.
2836  */
2837 int
2838 xfs_alloc_put_freelist(
2839 	struct xfs_perag	*pag,
2840 	struct xfs_trans	*tp,
2841 	struct xfs_buf		*agbp,
2842 	struct xfs_buf		*agflbp,
2843 	xfs_agblock_t		bno,
2844 	int			btreeblk)
2845 {
2846 	struct xfs_mount	*mp = tp->t_mountp;
2847 	struct xfs_agf		*agf = agbp->b_addr;
2848 	__be32			*blockp;
2849 	int			error;
2850 	uint32_t		logflags;
2851 	__be32			*agfl_bno;
2852 	int			startoff;
2853 
2854 	if (!agflbp) {
2855 		error = xfs_alloc_read_agfl(pag, tp, &agflbp);
2856 		if (error)
2857 			return error;
2858 	}
2859 
2860 	be32_add_cpu(&agf->agf_fllast, 1);
2861 	if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp))
2862 		agf->agf_fllast = 0;
2863 
2864 	ASSERT(!xfs_perag_agfl_needs_reset(pag));
2865 	be32_add_cpu(&agf->agf_flcount, 1);
2866 	pag->pagf_flcount++;
2867 
2868 	logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
2869 	if (btreeblk) {
2870 		be32_add_cpu(&agf->agf_btreeblks, -1);
2871 		pag->pagf_btreeblks--;
2872 		logflags |= XFS_AGF_BTREEBLKS;
2873 	}
2874 
2875 	xfs_alloc_log_agf(tp, agbp, logflags);
2876 
2877 	ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp));
2878 
2879 	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
2880 	blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
2881 	*blockp = cpu_to_be32(bno);
2882 	startoff = (char *)blockp - (char *)agflbp->b_addr;
2883 
2884 	xfs_alloc_log_agf(tp, agbp, logflags);
2885 
2886 	xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF);
2887 	xfs_trans_log_buf(tp, agflbp, startoff,
2888 			  startoff + sizeof(xfs_agblock_t) - 1);
2889 	return 0;
2890 }
2891 
2892 static xfs_failaddr_t
2893 xfs_agf_verify(
2894 	struct xfs_buf		*bp)
2895 {
2896 	struct xfs_mount	*mp = bp->b_mount;
2897 	struct xfs_agf		*agf = bp->b_addr;
2898 
2899 	if (xfs_has_crc(mp)) {
2900 		if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
2901 			return __this_address;
2902 		if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn)))
2903 			return __this_address;
2904 	}
2905 
2906 	if (!xfs_verify_magic(bp, agf->agf_magicnum))
2907 		return __this_address;
2908 
2909 	if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
2910 	      be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
2911 	      be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) &&
2912 	      be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) &&
2913 	      be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)))
2914 		return __this_address;
2915 
2916 	if (be32_to_cpu(agf->agf_length) > mp->m_sb.sb_dblocks)
2917 		return __this_address;
2918 
2919 	if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) ||
2920 	    be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length))
2921 		return __this_address;
2922 
2923 	if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
2924 	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 ||
2925 	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) >
2926 						mp->m_alloc_maxlevels ||
2927 	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) >
2928 						mp->m_alloc_maxlevels)
2929 		return __this_address;
2930 
2931 	if (xfs_has_rmapbt(mp) &&
2932 	    (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 ||
2933 	     be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) >
2934 						mp->m_rmap_maxlevels))
2935 		return __this_address;
2936 
2937 	if (xfs_has_rmapbt(mp) &&
2938 	    be32_to_cpu(agf->agf_rmap_blocks) > be32_to_cpu(agf->agf_length))
2939 		return __this_address;
2940 
2941 	/*
2942 	 * during growfs operations, the perag is not fully initialised,
2943 	 * so we can't use it for any useful checking. growfs ensures we can't
2944 	 * use it by using uncached buffers that don't have the perag attached
2945 	 * so we can detect and avoid this problem.
2946 	 */
2947 	if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno)
2948 		return __this_address;
2949 
2950 	if (xfs_has_lazysbcount(mp) &&
2951 	    be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))
2952 		return __this_address;
2953 
2954 	if (xfs_has_reflink(mp) &&
2955 	    be32_to_cpu(agf->agf_refcount_blocks) >
2956 	    be32_to_cpu(agf->agf_length))
2957 		return __this_address;
2958 
2959 	if (xfs_has_reflink(mp) &&
2960 	    (be32_to_cpu(agf->agf_refcount_level) < 1 ||
2961 	     be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels))
2962 		return __this_address;
2963 
2964 	return NULL;
2965 
2966 }
2967 
2968 static void
2969 xfs_agf_read_verify(
2970 	struct xfs_buf	*bp)
2971 {
2972 	struct xfs_mount *mp = bp->b_mount;
2973 	xfs_failaddr_t	fa;
2974 
2975 	if (xfs_has_crc(mp) &&
2976 	    !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
2977 		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
2978 	else {
2979 		fa = xfs_agf_verify(bp);
2980 		if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF))
2981 			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
2982 	}
2983 }
2984 
2985 static void
2986 xfs_agf_write_verify(
2987 	struct xfs_buf	*bp)
2988 {
2989 	struct xfs_mount	*mp = bp->b_mount;
2990 	struct xfs_buf_log_item	*bip = bp->b_log_item;
2991 	struct xfs_agf		*agf = bp->b_addr;
2992 	xfs_failaddr_t		fa;
2993 
2994 	fa = xfs_agf_verify(bp);
2995 	if (fa) {
2996 		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
2997 		return;
2998 	}
2999 
3000 	if (!xfs_has_crc(mp))
3001 		return;
3002 
3003 	if (bip)
3004 		agf->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
3005 
3006 	xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
3007 }
3008 
3009 const struct xfs_buf_ops xfs_agf_buf_ops = {
3010 	.name = "xfs_agf",
3011 	.magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) },
3012 	.verify_read = xfs_agf_read_verify,
3013 	.verify_write = xfs_agf_write_verify,
3014 	.verify_struct = xfs_agf_verify,
3015 };
3016 
3017 /*
3018  * Read in the allocation group header (free/alloc section).
3019  */
3020 int
3021 xfs_read_agf(
3022 	struct xfs_perag	*pag,
3023 	struct xfs_trans	*tp,
3024 	int			flags,
3025 	struct xfs_buf		**agfbpp)
3026 {
3027 	struct xfs_mount	*mp = pag->pag_mount;
3028 	int			error;
3029 
3030 	trace_xfs_read_agf(pag->pag_mount, pag->pag_agno);
3031 
3032 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
3033 			XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)),
3034 			XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops);
3035 	if (error)
3036 		return error;
3037 
3038 	xfs_buf_set_ref(*agfbpp, XFS_AGF_REF);
3039 	return 0;
3040 }
3041 
3042 /*
3043  * Read in the allocation group header (free/alloc section) and initialise the
3044  * perag structure if necessary. If the caller provides @agfbpp, then return the
3045  * locked buffer to the caller, otherwise free it.
3046  */
3047 int
3048 xfs_alloc_read_agf(
3049 	struct xfs_perag	*pag,
3050 	struct xfs_trans	*tp,
3051 	int			flags,
3052 	struct xfs_buf		**agfbpp)
3053 {
3054 	struct xfs_buf		*agfbp;
3055 	struct xfs_agf		*agf;
3056 	int			error;
3057 	int			allocbt_blks;
3058 
3059 	trace_xfs_alloc_read_agf(pag->pag_mount, pag->pag_agno);
3060 
3061 	/* We don't support trylock when freeing. */
3062 	ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) !=
3063 			(XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK));
3064 	error = xfs_read_agf(pag, tp,
3065 			(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
3066 			&agfbp);
3067 	if (error)
3068 		return error;
3069 
3070 	agf = agfbp->b_addr;
3071 	if (!xfs_perag_initialised_agf(pag)) {
3072 		pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
3073 		pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
3074 		pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
3075 		pag->pagf_longest = be32_to_cpu(agf->agf_longest);
3076 		pag->pagf_levels[XFS_BTNUM_BNOi] =
3077 			be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
3078 		pag->pagf_levels[XFS_BTNUM_CNTi] =
3079 			be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
3080 		pag->pagf_levels[XFS_BTNUM_RMAPi] =
3081 			be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
3082 		pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
3083 		if (xfs_agfl_needs_reset(pag->pag_mount, agf))
3084 			set_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
3085 		else
3086 			clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
3087 
3088 		/*
3089 		 * Update the in-core allocbt counter. Filter out the rmapbt
3090 		 * subset of the btreeblks counter because the rmapbt is managed
3091 		 * by perag reservation. Subtract one for the rmapbt root block
3092 		 * because the rmap counter includes it while the btreeblks
3093 		 * counter only tracks non-root blocks.
3094 		 */
3095 		allocbt_blks = pag->pagf_btreeblks;
3096 		if (xfs_has_rmapbt(pag->pag_mount))
3097 			allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1;
3098 		if (allocbt_blks > 0)
3099 			atomic64_add(allocbt_blks,
3100 					&pag->pag_mount->m_allocbt_blks);
3101 
3102 		set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
3103 	}
3104 #ifdef DEBUG
3105 	else if (!xfs_is_shutdown(pag->pag_mount)) {
3106 		ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
3107 		ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
3108 		ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
3109 		ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
3110 		ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
3111 		       be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]));
3112 		ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] ==
3113 		       be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
3114 	}
3115 #endif
3116 	if (agfbpp)
3117 		*agfbpp = agfbp;
3118 	else
3119 		xfs_trans_brelse(tp, agfbp);
3120 	return 0;
3121 }
3122 
3123 /*
3124  * Pre-proces allocation arguments to set initial state that we don't require
3125  * callers to set up correctly, as well as bounds check the allocation args
3126  * that are set up.
3127  */
3128 static int
3129 xfs_alloc_vextent_check_args(
3130 	struct xfs_alloc_arg	*args,
3131 	xfs_fsblock_t		target,
3132 	xfs_agnumber_t		*minimum_agno)
3133 {
3134 	struct xfs_mount	*mp = args->mp;
3135 	xfs_agblock_t		agsize;
3136 
3137 	args->fsbno = NULLFSBLOCK;
3138 
3139 	*minimum_agno = 0;
3140 	if (args->tp->t_highest_agno != NULLAGNUMBER)
3141 		*minimum_agno = args->tp->t_highest_agno;
3142 
3143 	/*
3144 	 * Just fix this up, for the case where the last a.g. is shorter
3145 	 * (or there's only one a.g.) and the caller couldn't easily figure
3146 	 * that out (xfs_bmap_alloc).
3147 	 */
3148 	agsize = mp->m_sb.sb_agblocks;
3149 	if (args->maxlen > agsize)
3150 		args->maxlen = agsize;
3151 	if (args->alignment == 0)
3152 		args->alignment = 1;
3153 
3154 	ASSERT(args->minlen > 0);
3155 	ASSERT(args->maxlen > 0);
3156 	ASSERT(args->alignment > 0);
3157 	ASSERT(args->resv != XFS_AG_RESV_AGFL);
3158 
3159 	ASSERT(XFS_FSB_TO_AGNO(mp, target) < mp->m_sb.sb_agcount);
3160 	ASSERT(XFS_FSB_TO_AGBNO(mp, target) < agsize);
3161 	ASSERT(args->minlen <= args->maxlen);
3162 	ASSERT(args->minlen <= agsize);
3163 	ASSERT(args->mod < args->prod);
3164 
3165 	if (XFS_FSB_TO_AGNO(mp, target) >= mp->m_sb.sb_agcount ||
3166 	    XFS_FSB_TO_AGBNO(mp, target) >= agsize ||
3167 	    args->minlen > args->maxlen || args->minlen > agsize ||
3168 	    args->mod >= args->prod) {
3169 		trace_xfs_alloc_vextent_badargs(args);
3170 		return -ENOSPC;
3171 	}
3172 
3173 	if (args->agno != NULLAGNUMBER && *minimum_agno > args->agno) {
3174 		trace_xfs_alloc_vextent_skip_deadlock(args);
3175 		return -ENOSPC;
3176 	}
3177 	return 0;
3178 
3179 }
3180 
3181 /*
3182  * Prepare an AG for allocation. If the AG is not prepared to accept the
3183  * allocation, return failure.
3184  *
3185  * XXX(dgc): The complexity of "need_pag" will go away as all caller paths are
3186  * modified to hold their own perag references.
3187  */
3188 static int
3189 xfs_alloc_vextent_prepare_ag(
3190 	struct xfs_alloc_arg	*args)
3191 {
3192 	bool			need_pag = !args->pag;
3193 	int			error;
3194 
3195 	if (need_pag)
3196 		args->pag = xfs_perag_get(args->mp, args->agno);
3197 
3198 	args->agbp = NULL;
3199 	error = xfs_alloc_fix_freelist(args, 0);
3200 	if (error) {
3201 		trace_xfs_alloc_vextent_nofix(args);
3202 		if (need_pag)
3203 			xfs_perag_put(args->pag);
3204 		args->agbno = NULLAGBLOCK;
3205 		return error;
3206 	}
3207 	if (!args->agbp) {
3208 		/* cannot allocate in this AG at all */
3209 		trace_xfs_alloc_vextent_noagbp(args);
3210 		args->agbno = NULLAGBLOCK;
3211 		return 0;
3212 	}
3213 	args->wasfromfl = 0;
3214 	return 0;
3215 }
3216 
3217 /*
3218  * Post-process allocation results to account for the allocation if it succeed
3219  * and set the allocated block number correctly for the caller.
3220  *
3221  * XXX: we should really be returning ENOSPC for ENOSPC, not
3222  * hiding it behind a "successful" NULLFSBLOCK allocation.
3223  */
3224 static int
3225 xfs_alloc_vextent_finish(
3226 	struct xfs_alloc_arg	*args,
3227 	xfs_agnumber_t		minimum_agno,
3228 	int			alloc_error,
3229 	bool			drop_perag)
3230 {
3231 	struct xfs_mount	*mp = args->mp;
3232 	int			error = 0;
3233 
3234 	/*
3235 	 * We can end up here with a locked AGF. If we failed, the caller is
3236 	 * likely going to try to allocate again with different parameters, and
3237 	 * that can widen the AGs that are searched for free space. If we have
3238 	 * to do BMBT block allocation, we have to do a new allocation.
3239 	 *
3240 	 * Hence leaving this function with the AGF locked opens up potential
3241 	 * ABBA AGF deadlocks because a future allocation attempt in this
3242 	 * transaction may attempt to lock a lower number AGF.
3243 	 *
3244 	 * We can't release the AGF until the transaction is commited, so at
3245 	 * this point we must update the "first allocation" tracker to point at
3246 	 * this AG if the tracker is empty or points to a lower AG. This allows
3247 	 * the next allocation attempt to be modified appropriately to avoid
3248 	 * deadlocks.
3249 	 */
3250 	if (args->agbp &&
3251 	    (args->tp->t_highest_agno == NULLAGNUMBER ||
3252 	     args->agno > minimum_agno))
3253 		args->tp->t_highest_agno = args->agno;
3254 
3255 	/*
3256 	 * If the allocation failed with an error or we had an ENOSPC result,
3257 	 * preserve the returned error whilst also marking the allocation result
3258 	 * as "no extent allocated". This ensures that callers that fail to
3259 	 * capture the error will still treat it as a failed allocation.
3260 	 */
3261 	if (alloc_error || args->agbno == NULLAGBLOCK) {
3262 		args->fsbno = NULLFSBLOCK;
3263 		error = alloc_error;
3264 		goto out_drop_perag;
3265 	}
3266 
3267 	args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
3268 
3269 	ASSERT(args->len >= args->minlen);
3270 	ASSERT(args->len <= args->maxlen);
3271 	ASSERT(args->agbno % args->alignment == 0);
3272 	XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno), args->len);
3273 
3274 	/* if not file data, insert new block into the reverse map btree */
3275 	if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) {
3276 		error = xfs_rmap_alloc(args->tp, args->agbp, args->pag,
3277 				       args->agbno, args->len, &args->oinfo);
3278 		if (error)
3279 			goto out_drop_perag;
3280 	}
3281 
3282 	if (!args->wasfromfl) {
3283 		error = xfs_alloc_update_counters(args->tp, args->agbp,
3284 						  -((long)(args->len)));
3285 		if (error)
3286 			goto out_drop_perag;
3287 
3288 		ASSERT(!xfs_extent_busy_search(mp, args->pag, args->agbno,
3289 				args->len));
3290 	}
3291 
3292 	xfs_ag_resv_alloc_extent(args->pag, args->resv, args);
3293 
3294 	XFS_STATS_INC(mp, xs_allocx);
3295 	XFS_STATS_ADD(mp, xs_allocb, args->len);
3296 
3297 	trace_xfs_alloc_vextent_finish(args);
3298 
3299 out_drop_perag:
3300 	if (drop_perag && args->pag) {
3301 		xfs_perag_rele(args->pag);
3302 		args->pag = NULL;
3303 	}
3304 	return error;
3305 }
3306 
3307 /*
3308  * Allocate within a single AG only. This uses a best-fit length algorithm so if
3309  * you need an exact sized allocation without locality constraints, this is the
3310  * fastest way to do it.
3311  *
3312  * Caller is expected to hold a perag reference in args->pag.
3313  */
3314 int
3315 xfs_alloc_vextent_this_ag(
3316 	struct xfs_alloc_arg	*args,
3317 	xfs_agnumber_t		agno)
3318 {
3319 	struct xfs_mount	*mp = args->mp;
3320 	xfs_agnumber_t		minimum_agno;
3321 	int			error;
3322 
3323 	ASSERT(args->pag != NULL);
3324 	ASSERT(args->pag->pag_agno == agno);
3325 
3326 	args->agno = agno;
3327 	args->agbno = 0;
3328 
3329 	trace_xfs_alloc_vextent_this_ag(args);
3330 
3331 	error = xfs_alloc_vextent_check_args(args, XFS_AGB_TO_FSB(mp, agno, 0),
3332 			&minimum_agno);
3333 	if (error) {
3334 		if (error == -ENOSPC)
3335 			return 0;
3336 		return error;
3337 	}
3338 
3339 	error = xfs_alloc_vextent_prepare_ag(args);
3340 	if (!error && args->agbp)
3341 		error = xfs_alloc_ag_vextent_size(args);
3342 
3343 	return xfs_alloc_vextent_finish(args, minimum_agno, error, false);
3344 }
3345 
3346 /*
3347  * Iterate all AGs trying to allocate an extent starting from @start_ag.
3348  *
3349  * If the incoming allocation type is XFS_ALLOCTYPE_NEAR_BNO, it means the
3350  * allocation attempts in @start_agno have locality information. If we fail to
3351  * allocate in that AG, then we revert to anywhere-in-AG for all the other AGs
3352  * we attempt to allocation in as there is no locality optimisation possible for
3353  * those allocations.
3354  *
3355  * On return, args->pag may be left referenced if we finish before the "all
3356  * failed" return point. The allocation finish still needs the perag, and
3357  * so the caller will release it once they've finished the allocation.
3358  *
3359  * When we wrap the AG iteration at the end of the filesystem, we have to be
3360  * careful not to wrap into AGs below ones we already have locked in the
3361  * transaction if we are doing a blocking iteration. This will result in an
3362  * out-of-order locking of AGFs and hence can cause deadlocks.
3363  */
3364 static int
3365 xfs_alloc_vextent_iterate_ags(
3366 	struct xfs_alloc_arg	*args,
3367 	xfs_agnumber_t		minimum_agno,
3368 	xfs_agnumber_t		start_agno,
3369 	xfs_agblock_t		target_agbno,
3370 	uint32_t		flags)
3371 {
3372 	struct xfs_mount	*mp = args->mp;
3373 	xfs_agnumber_t		restart_agno = minimum_agno;
3374 	xfs_agnumber_t		agno;
3375 	int			error = 0;
3376 
3377 	if (flags & XFS_ALLOC_FLAG_TRYLOCK)
3378 		restart_agno = 0;
3379 restart:
3380 	for_each_perag_wrap_range(mp, start_agno, restart_agno,
3381 			mp->m_sb.sb_agcount, agno, args->pag) {
3382 		args->agno = agno;
3383 		error = xfs_alloc_vextent_prepare_ag(args);
3384 		if (error)
3385 			break;
3386 		if (!args->agbp) {
3387 			trace_xfs_alloc_vextent_loopfailed(args);
3388 			continue;
3389 		}
3390 
3391 		/*
3392 		 * Allocation is supposed to succeed now, so break out of the
3393 		 * loop regardless of whether we succeed or not.
3394 		 */
3395 		if (args->agno == start_agno && target_agbno) {
3396 			args->agbno = target_agbno;
3397 			error = xfs_alloc_ag_vextent_near(args);
3398 		} else {
3399 			args->agbno = 0;
3400 			error = xfs_alloc_ag_vextent_size(args);
3401 		}
3402 		break;
3403 	}
3404 	if (error) {
3405 		xfs_perag_rele(args->pag);
3406 		args->pag = NULL;
3407 		return error;
3408 	}
3409 	if (args->agbp)
3410 		return 0;
3411 
3412 	/*
3413 	 * We didn't find an AG we can alloation from. If we were given
3414 	 * constraining flags by the caller, drop them and retry the allocation
3415 	 * without any constraints being set.
3416 	 */
3417 	if (flags) {
3418 		flags = 0;
3419 		restart_agno = minimum_agno;
3420 		goto restart;
3421 	}
3422 
3423 	ASSERT(args->pag == NULL);
3424 	trace_xfs_alloc_vextent_allfailed(args);
3425 	return 0;
3426 }
3427 
3428 /*
3429  * Iterate from the AGs from the start AG to the end of the filesystem, trying
3430  * to allocate blocks. It starts with a near allocation attempt in the initial
3431  * AG, then falls back to anywhere-in-ag after the first AG fails. It will wrap
3432  * back to zero if allowed by previous allocations in this transaction,
3433  * otherwise will wrap back to the start AG and run a second blocking pass to
3434  * the end of the filesystem.
3435  */
3436 int
3437 xfs_alloc_vextent_start_ag(
3438 	struct xfs_alloc_arg	*args,
3439 	xfs_fsblock_t		target)
3440 {
3441 	struct xfs_mount	*mp = args->mp;
3442 	xfs_agnumber_t		minimum_agno;
3443 	xfs_agnumber_t		start_agno;
3444 	xfs_agnumber_t		rotorstep = xfs_rotorstep;
3445 	bool			bump_rotor = false;
3446 	int			error;
3447 
3448 	ASSERT(args->pag == NULL);
3449 
3450 	args->agno = NULLAGNUMBER;
3451 	args->agbno = NULLAGBLOCK;
3452 
3453 	trace_xfs_alloc_vextent_start_ag(args);
3454 
3455 	error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
3456 	if (error) {
3457 		if (error == -ENOSPC)
3458 			return 0;
3459 		return error;
3460 	}
3461 
3462 	if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
3463 	    xfs_is_inode32(mp)) {
3464 		target = XFS_AGB_TO_FSB(mp,
3465 				((mp->m_agfrotor / rotorstep) %
3466 				mp->m_sb.sb_agcount), 0);
3467 		bump_rotor = 1;
3468 	}
3469 
3470 	start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target));
3471 	error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno,
3472 			XFS_FSB_TO_AGBNO(mp, target), XFS_ALLOC_FLAG_TRYLOCK);
3473 
3474 	if (bump_rotor) {
3475 		if (args->agno == start_agno)
3476 			mp->m_agfrotor = (mp->m_agfrotor + 1) %
3477 				(mp->m_sb.sb_agcount * rotorstep);
3478 		else
3479 			mp->m_agfrotor = (args->agno * rotorstep + 1) %
3480 				(mp->m_sb.sb_agcount * rotorstep);
3481 	}
3482 
3483 	return xfs_alloc_vextent_finish(args, minimum_agno, error, true);
3484 }
3485 
3486 /*
3487  * Iterate from the agno indicated via @target through to the end of the
3488  * filesystem attempting blocking allocation. This does not wrap or try a second
3489  * pass, so will not recurse into AGs lower than indicated by the target.
3490  */
3491 int
3492 xfs_alloc_vextent_first_ag(
3493 	struct xfs_alloc_arg	*args,
3494 	xfs_fsblock_t		target)
3495  {
3496 	struct xfs_mount	*mp = args->mp;
3497 	xfs_agnumber_t		minimum_agno;
3498 	xfs_agnumber_t		start_agno;
3499 	int			error;
3500 
3501 	ASSERT(args->pag == NULL);
3502 
3503 	args->agno = NULLAGNUMBER;
3504 	args->agbno = NULLAGBLOCK;
3505 
3506 	trace_xfs_alloc_vextent_first_ag(args);
3507 
3508 	error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
3509 	if (error) {
3510 		if (error == -ENOSPC)
3511 			return 0;
3512 		return error;
3513 	}
3514 
3515 	start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target));
3516 	error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno,
3517 			XFS_FSB_TO_AGBNO(mp, target), 0);
3518 	return xfs_alloc_vextent_finish(args, minimum_agno, error, true);
3519 }
3520 
3521 /*
3522  * Allocate at the exact block target or fail. Caller is expected to hold a
3523  * perag reference in args->pag.
3524  */
3525 int
3526 xfs_alloc_vextent_exact_bno(
3527 	struct xfs_alloc_arg	*args,
3528 	xfs_fsblock_t		target)
3529 {
3530 	struct xfs_mount	*mp = args->mp;
3531 	xfs_agnumber_t		minimum_agno;
3532 	int			error;
3533 
3534 	ASSERT(args->pag != NULL);
3535 	ASSERT(args->pag->pag_agno == XFS_FSB_TO_AGNO(mp, target));
3536 
3537 	args->agno = XFS_FSB_TO_AGNO(mp, target);
3538 	args->agbno = XFS_FSB_TO_AGBNO(mp, target);
3539 
3540 	trace_xfs_alloc_vextent_exact_bno(args);
3541 
3542 	error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
3543 	if (error) {
3544 		if (error == -ENOSPC)
3545 			return 0;
3546 		return error;
3547 	}
3548 
3549 	error = xfs_alloc_vextent_prepare_ag(args);
3550 	if (!error && args->agbp)
3551 		error = xfs_alloc_ag_vextent_exact(args);
3552 
3553 	return xfs_alloc_vextent_finish(args, minimum_agno, error, false);
3554 }
3555 
3556 /*
3557  * Allocate an extent as close to the target as possible. If there are not
3558  * viable candidates in the AG, then fail the allocation.
3559  *
3560  * Caller may or may not have a per-ag reference in args->pag.
3561  */
3562 int
3563 xfs_alloc_vextent_near_bno(
3564 	struct xfs_alloc_arg	*args,
3565 	xfs_fsblock_t		target)
3566 {
3567 	struct xfs_mount	*mp = args->mp;
3568 	xfs_agnumber_t		minimum_agno;
3569 	bool			needs_perag = args->pag == NULL;
3570 	int			error;
3571 
3572 	if (!needs_perag)
3573 		ASSERT(args->pag->pag_agno == XFS_FSB_TO_AGNO(mp, target));
3574 
3575 	args->agno = XFS_FSB_TO_AGNO(mp, target);
3576 	args->agbno = XFS_FSB_TO_AGBNO(mp, target);
3577 
3578 	trace_xfs_alloc_vextent_near_bno(args);
3579 
3580 	error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
3581 	if (error) {
3582 		if (error == -ENOSPC)
3583 			return 0;
3584 		return error;
3585 	}
3586 
3587 	if (needs_perag)
3588 		args->pag = xfs_perag_grab(mp, args->agno);
3589 
3590 	error = xfs_alloc_vextent_prepare_ag(args);
3591 	if (!error && args->agbp)
3592 		error = xfs_alloc_ag_vextent_near(args);
3593 
3594 	return xfs_alloc_vextent_finish(args, minimum_agno, error, needs_perag);
3595 }
3596 
3597 /* Ensure that the freelist is at full capacity. */
3598 int
3599 xfs_free_extent_fix_freelist(
3600 	struct xfs_trans	*tp,
3601 	struct xfs_perag	*pag,
3602 	struct xfs_buf		**agbp)
3603 {
3604 	struct xfs_alloc_arg	args;
3605 	int			error;
3606 
3607 	memset(&args, 0, sizeof(struct xfs_alloc_arg));
3608 	args.tp = tp;
3609 	args.mp = tp->t_mountp;
3610 	args.agno = pag->pag_agno;
3611 	args.pag = pag;
3612 
3613 	/*
3614 	 * validate that the block number is legal - the enables us to detect
3615 	 * and handle a silent filesystem corruption rather than crashing.
3616 	 */
3617 	if (args.agno >= args.mp->m_sb.sb_agcount)
3618 		return -EFSCORRUPTED;
3619 
3620 	error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
3621 	if (error)
3622 		return error;
3623 
3624 	*agbp = args.agbp;
3625 	return 0;
3626 }
3627 
3628 /*
3629  * Free an extent.
3630  * Just break up the extent address and hand off to xfs_free_ag_extent
3631  * after fixing up the freelist.
3632  */
3633 int
3634 __xfs_free_extent(
3635 	struct xfs_trans		*tp,
3636 	struct xfs_perag		*pag,
3637 	xfs_agblock_t			agbno,
3638 	xfs_extlen_t			len,
3639 	const struct xfs_owner_info	*oinfo,
3640 	enum xfs_ag_resv_type		type,
3641 	bool				skip_discard)
3642 {
3643 	struct xfs_mount		*mp = tp->t_mountp;
3644 	struct xfs_buf			*agbp;
3645 	struct xfs_agf			*agf;
3646 	int				error;
3647 	unsigned int			busy_flags = 0;
3648 
3649 	ASSERT(len != 0);
3650 	ASSERT(type != XFS_AG_RESV_AGFL);
3651 
3652 	if (XFS_TEST_ERROR(false, mp,
3653 			XFS_ERRTAG_FREE_EXTENT))
3654 		return -EIO;
3655 
3656 	error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
3657 	if (error)
3658 		return error;
3659 	agf = agbp->b_addr;
3660 
3661 	if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
3662 		error = -EFSCORRUPTED;
3663 		goto err_release;
3664 	}
3665 
3666 	/* validate the extent size is legal now we have the agf locked */
3667 	if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) {
3668 		error = -EFSCORRUPTED;
3669 		goto err_release;
3670 	}
3671 
3672 	error = xfs_free_ag_extent(tp, agbp, pag->pag_agno, agbno, len, oinfo,
3673 			type);
3674 	if (error)
3675 		goto err_release;
3676 
3677 	if (skip_discard)
3678 		busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
3679 	xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags);
3680 	return 0;
3681 
3682 err_release:
3683 	xfs_trans_brelse(tp, agbp);
3684 	return error;
3685 }
3686 
3687 struct xfs_alloc_query_range_info {
3688 	xfs_alloc_query_range_fn	fn;
3689 	void				*priv;
3690 };
3691 
3692 /* Format btree record and pass to our callback. */
3693 STATIC int
3694 xfs_alloc_query_range_helper(
3695 	struct xfs_btree_cur		*cur,
3696 	const union xfs_btree_rec	*rec,
3697 	void				*priv)
3698 {
3699 	struct xfs_alloc_query_range_info	*query = priv;
3700 	struct xfs_alloc_rec_incore		irec;
3701 	xfs_failaddr_t				fa;
3702 
3703 	xfs_alloc_btrec_to_irec(rec, &irec);
3704 	fa = xfs_alloc_check_irec(cur, &irec);
3705 	if (fa)
3706 		return xfs_alloc_complain_bad_rec(cur, fa, &irec);
3707 
3708 	return query->fn(cur, &irec, query->priv);
3709 }
3710 
3711 /* Find all free space within a given range of blocks. */
3712 int
3713 xfs_alloc_query_range(
3714 	struct xfs_btree_cur			*cur,
3715 	const struct xfs_alloc_rec_incore	*low_rec,
3716 	const struct xfs_alloc_rec_incore	*high_rec,
3717 	xfs_alloc_query_range_fn		fn,
3718 	void					*priv)
3719 {
3720 	union xfs_btree_irec			low_brec;
3721 	union xfs_btree_irec			high_brec;
3722 	struct xfs_alloc_query_range_info	query;
3723 
3724 	ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
3725 	low_brec.a = *low_rec;
3726 	high_brec.a = *high_rec;
3727 	query.priv = priv;
3728 	query.fn = fn;
3729 	return xfs_btree_query_range(cur, &low_brec, &high_brec,
3730 			xfs_alloc_query_range_helper, &query);
3731 }
3732 
3733 /* Find all free space records. */
3734 int
3735 xfs_alloc_query_all(
3736 	struct xfs_btree_cur			*cur,
3737 	xfs_alloc_query_range_fn		fn,
3738 	void					*priv)
3739 {
3740 	struct xfs_alloc_query_range_info	query;
3741 
3742 	ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
3743 	query.priv = priv;
3744 	query.fn = fn;
3745 	return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
3746 }
3747 
3748 /*
3749  * Scan part of the keyspace of the free space and tell us if the area has no
3750  * records, is fully mapped by records, or is partially filled.
3751  */
3752 int
3753 xfs_alloc_has_records(
3754 	struct xfs_btree_cur	*cur,
3755 	xfs_agblock_t		bno,
3756 	xfs_extlen_t		len,
3757 	enum xbtree_recpacking	*outcome)
3758 {
3759 	union xfs_btree_irec	low;
3760 	union xfs_btree_irec	high;
3761 
3762 	memset(&low, 0, sizeof(low));
3763 	low.a.ar_startblock = bno;
3764 	memset(&high, 0xFF, sizeof(high));
3765 	high.a.ar_startblock = bno + len - 1;
3766 
3767 	return xfs_btree_has_records(cur, &low, &high, NULL, outcome);
3768 }
3769 
3770 /*
3771  * Walk all the blocks in the AGFL.  The @walk_fn can return any negative
3772  * error code or XFS_ITER_*.
3773  */
3774 int
3775 xfs_agfl_walk(
3776 	struct xfs_mount	*mp,
3777 	struct xfs_agf		*agf,
3778 	struct xfs_buf		*agflbp,
3779 	xfs_agfl_walk_fn	walk_fn,
3780 	void			*priv)
3781 {
3782 	__be32			*agfl_bno;
3783 	unsigned int		i;
3784 	int			error;
3785 
3786 	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
3787 	i = be32_to_cpu(agf->agf_flfirst);
3788 
3789 	/* Nothing to walk in an empty AGFL. */
3790 	if (agf->agf_flcount == cpu_to_be32(0))
3791 		return 0;
3792 
3793 	/* Otherwise, walk from first to last, wrapping as needed. */
3794 	for (;;) {
3795 		error = walk_fn(mp, be32_to_cpu(agfl_bno[i]), priv);
3796 		if (error)
3797 			return error;
3798 		if (i == be32_to_cpu(agf->agf_fllast))
3799 			break;
3800 		if (++i == xfs_agfl_size(mp))
3801 			i = 0;
3802 	}
3803 
3804 	return 0;
3805 }
3806 
3807 int __init
3808 xfs_extfree_intent_init_cache(void)
3809 {
3810 	xfs_extfree_item_cache = kmem_cache_create("xfs_extfree_intent",
3811 			sizeof(struct xfs_extent_free_item),
3812 			0, 0, NULL);
3813 
3814 	return xfs_extfree_item_cache != NULL ? 0 : -ENOMEM;
3815 }
3816 
3817 void
3818 xfs_extfree_intent_destroy_cache(void)
3819 {
3820 	kmem_cache_destroy(xfs_extfree_item_cache);
3821 	xfs_extfree_item_cache = NULL;
3822 }
3823