xref: /openbmc/linux/fs/xfs/libxfs/xfs_alloc.c (revision fadbafc1)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
4  * All Rights Reserved.
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_format.h"
9 #include "xfs_log_format.h"
10 #include "xfs_shared.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_bit.h"
13 #include "xfs_mount.h"
14 #include "xfs_defer.h"
15 #include "xfs_btree.h"
16 #include "xfs_rmap.h"
17 #include "xfs_alloc_btree.h"
18 #include "xfs_alloc.h"
19 #include "xfs_extent_busy.h"
20 #include "xfs_errortag.h"
21 #include "xfs_error.h"
22 #include "xfs_trace.h"
23 #include "xfs_trans.h"
24 #include "xfs_buf_item.h"
25 #include "xfs_log.h"
26 #include "xfs_ag.h"
27 #include "xfs_ag_resv.h"
28 #include "xfs_bmap.h"
29 
30 struct kmem_cache	*xfs_extfree_item_cache;
31 
32 struct workqueue_struct *xfs_alloc_wq;
33 
34 #define XFS_ABSDIFF(a,b)	(((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
35 
36 #define	XFSA_FIXUP_BNO_OK	1
37 #define	XFSA_FIXUP_CNT_OK	2
38 
39 STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
40 STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
41 STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
42 
43 /*
44  * Size of the AGFL.  For CRC-enabled filesystes we steal a couple of slots in
45  * the beginning of the block for a proper header with the location information
46  * and CRC.
47  */
48 unsigned int
49 xfs_agfl_size(
50 	struct xfs_mount	*mp)
51 {
52 	unsigned int		size = mp->m_sb.sb_sectsize;
53 
54 	if (xfs_has_crc(mp))
55 		size -= sizeof(struct xfs_agfl);
56 
57 	return size / sizeof(xfs_agblock_t);
58 }
59 
60 unsigned int
61 xfs_refc_block(
62 	struct xfs_mount	*mp)
63 {
64 	if (xfs_has_rmapbt(mp))
65 		return XFS_RMAP_BLOCK(mp) + 1;
66 	if (xfs_has_finobt(mp))
67 		return XFS_FIBT_BLOCK(mp) + 1;
68 	return XFS_IBT_BLOCK(mp) + 1;
69 }
70 
71 xfs_extlen_t
72 xfs_prealloc_blocks(
73 	struct xfs_mount	*mp)
74 {
75 	if (xfs_has_reflink(mp))
76 		return xfs_refc_block(mp) + 1;
77 	if (xfs_has_rmapbt(mp))
78 		return XFS_RMAP_BLOCK(mp) + 1;
79 	if (xfs_has_finobt(mp))
80 		return XFS_FIBT_BLOCK(mp) + 1;
81 	return XFS_IBT_BLOCK(mp) + 1;
82 }
83 
84 /*
85  * The number of blocks per AG that we withhold from xfs_mod_fdblocks to
86  * guarantee that we can refill the AGFL prior to allocating space in a nearly
87  * full AG.  Although the space described by the free space btrees, the
88  * blocks used by the freesp btrees themselves, and the blocks owned by the
89  * AGFL are counted in the ondisk fdblocks, it's a mistake to let the ondisk
90  * free space in the AG drop so low that the free space btrees cannot refill an
91  * empty AGFL up to the minimum level.  Rather than grind through empty AGs
92  * until the fs goes down, we subtract this many AG blocks from the incore
93  * fdblocks to ensure user allocation does not overcommit the space the
94  * filesystem needs for the AGFLs.  The rmap btree uses a per-AG reservation to
95  * withhold space from xfs_mod_fdblocks, so we do not account for that here.
96  */
97 #define XFS_ALLOCBT_AGFL_RESERVE	4
98 
99 /*
100  * Compute the number of blocks that we set aside to guarantee the ability to
101  * refill the AGFL and handle a full bmap btree split.
102  *
103  * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of
104  * AGF buffer (PV 947395), we place constraints on the relationship among
105  * actual allocations for data blocks, freelist blocks, and potential file data
106  * bmap btree blocks. However, these restrictions may result in no actual space
107  * allocated for a delayed extent, for example, a data block in a certain AG is
108  * allocated but there is no additional block for the additional bmap btree
109  * block due to a split of the bmap btree of the file. The result of this may
110  * lead to an infinite loop when the file gets flushed to disk and all delayed
111  * extents need to be actually allocated. To get around this, we explicitly set
112  * aside a few blocks which will not be reserved in delayed allocation.
113  *
114  * For each AG, we need to reserve enough blocks to replenish a totally empty
115  * AGFL and 4 more to handle a potential split of the file's bmap btree.
116  */
117 unsigned int
118 xfs_alloc_set_aside(
119 	struct xfs_mount	*mp)
120 {
121 	return mp->m_sb.sb_agcount * (XFS_ALLOCBT_AGFL_RESERVE + 4);
122 }
123 
124 /*
125  * When deciding how much space to allocate out of an AG, we limit the
126  * allocation maximum size to the size the AG. However, we cannot use all the
127  * blocks in the AG - some are permanently used by metadata. These
128  * blocks are generally:
129  *	- the AG superblock, AGF, AGI and AGFL
130  *	- the AGF (bno and cnt) and AGI btree root blocks, and optionally
131  *	  the AGI free inode and rmap btree root blocks.
132  *	- blocks on the AGFL according to xfs_alloc_set_aside() limits
133  *	- the rmapbt root block
134  *
135  * The AG headers are sector sized, so the amount of space they take up is
136  * dependent on filesystem geometry. The others are all single blocks.
137  */
138 unsigned int
139 xfs_alloc_ag_max_usable(
140 	struct xfs_mount	*mp)
141 {
142 	unsigned int		blocks;
143 
144 	blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */
145 	blocks += XFS_ALLOCBT_AGFL_RESERVE;
146 	blocks += 3;			/* AGF, AGI btree root blocks */
147 	if (xfs_has_finobt(mp))
148 		blocks++;		/* finobt root block */
149 	if (xfs_has_rmapbt(mp))
150 		blocks++;		/* rmap root block */
151 	if (xfs_has_reflink(mp))
152 		blocks++;		/* refcount root block */
153 
154 	return mp->m_sb.sb_agblocks - blocks;
155 }
156 
157 /*
158  * Lookup the record equal to [bno, len] in the btree given by cur.
159  */
160 STATIC int				/* error */
161 xfs_alloc_lookup_eq(
162 	struct xfs_btree_cur	*cur,	/* btree cursor */
163 	xfs_agblock_t		bno,	/* starting block of extent */
164 	xfs_extlen_t		len,	/* length of extent */
165 	int			*stat)	/* success/failure */
166 {
167 	int			error;
168 
169 	cur->bc_rec.a.ar_startblock = bno;
170 	cur->bc_rec.a.ar_blockcount = len;
171 	error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
172 	cur->bc_ag.abt.active = (*stat == 1);
173 	return error;
174 }
175 
176 /*
177  * Lookup the first record greater than or equal to [bno, len]
178  * in the btree given by cur.
179  */
180 int				/* error */
181 xfs_alloc_lookup_ge(
182 	struct xfs_btree_cur	*cur,	/* btree cursor */
183 	xfs_agblock_t		bno,	/* starting block of extent */
184 	xfs_extlen_t		len,	/* length of extent */
185 	int			*stat)	/* success/failure */
186 {
187 	int			error;
188 
189 	cur->bc_rec.a.ar_startblock = bno;
190 	cur->bc_rec.a.ar_blockcount = len;
191 	error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
192 	cur->bc_ag.abt.active = (*stat == 1);
193 	return error;
194 }
195 
196 /*
197  * Lookup the first record less than or equal to [bno, len]
198  * in the btree given by cur.
199  */
200 int					/* error */
201 xfs_alloc_lookup_le(
202 	struct xfs_btree_cur	*cur,	/* btree cursor */
203 	xfs_agblock_t		bno,	/* starting block of extent */
204 	xfs_extlen_t		len,	/* length of extent */
205 	int			*stat)	/* success/failure */
206 {
207 	int			error;
208 	cur->bc_rec.a.ar_startblock = bno;
209 	cur->bc_rec.a.ar_blockcount = len;
210 	error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
211 	cur->bc_ag.abt.active = (*stat == 1);
212 	return error;
213 }
214 
215 static inline bool
216 xfs_alloc_cur_active(
217 	struct xfs_btree_cur	*cur)
218 {
219 	return cur && cur->bc_ag.abt.active;
220 }
221 
222 /*
223  * Update the record referred to by cur to the value given
224  * by [bno, len].
225  * This either works (return 0) or gets an EFSCORRUPTED error.
226  */
227 STATIC int				/* error */
228 xfs_alloc_update(
229 	struct xfs_btree_cur	*cur,	/* btree cursor */
230 	xfs_agblock_t		bno,	/* starting block of extent */
231 	xfs_extlen_t		len)	/* length of extent */
232 {
233 	union xfs_btree_rec	rec;
234 
235 	rec.alloc.ar_startblock = cpu_to_be32(bno);
236 	rec.alloc.ar_blockcount = cpu_to_be32(len);
237 	return xfs_btree_update(cur, &rec);
238 }
239 
240 /*
241  * Get the data from the pointed-to record.
242  */
243 int					/* error */
244 xfs_alloc_get_rec(
245 	struct xfs_btree_cur	*cur,	/* btree cursor */
246 	xfs_agblock_t		*bno,	/* output: starting block of extent */
247 	xfs_extlen_t		*len,	/* output: length of extent */
248 	int			*stat)	/* output: success/failure */
249 {
250 	struct xfs_mount	*mp = cur->bc_mp;
251 	struct xfs_perag	*pag = cur->bc_ag.pag;
252 	union xfs_btree_rec	*rec;
253 	int			error;
254 
255 	error = xfs_btree_get_rec(cur, &rec, stat);
256 	if (error || !(*stat))
257 		return error;
258 
259 	*bno = be32_to_cpu(rec->alloc.ar_startblock);
260 	*len = be32_to_cpu(rec->alloc.ar_blockcount);
261 
262 	if (*len == 0)
263 		goto out_bad_rec;
264 
265 	/* check for valid extent range, including overflow */
266 	if (!xfs_verify_agbno(pag, *bno))
267 		goto out_bad_rec;
268 	if (*bno > *bno + *len)
269 		goto out_bad_rec;
270 	if (!xfs_verify_agbno(pag, *bno + *len - 1))
271 		goto out_bad_rec;
272 
273 	return 0;
274 
275 out_bad_rec:
276 	xfs_warn(mp,
277 		"%s Freespace BTree record corruption in AG %d detected!",
278 		cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size",
279 		pag->pag_agno);
280 	xfs_warn(mp,
281 		"start block 0x%x block count 0x%x", *bno, *len);
282 	return -EFSCORRUPTED;
283 }
284 
285 /*
286  * Compute aligned version of the found extent.
287  * Takes alignment and min length into account.
288  */
289 STATIC bool
290 xfs_alloc_compute_aligned(
291 	xfs_alloc_arg_t	*args,		/* allocation argument structure */
292 	xfs_agblock_t	foundbno,	/* starting block in found extent */
293 	xfs_extlen_t	foundlen,	/* length in found extent */
294 	xfs_agblock_t	*resbno,	/* result block number */
295 	xfs_extlen_t	*reslen,	/* result length */
296 	unsigned	*busy_gen)
297 {
298 	xfs_agblock_t	bno = foundbno;
299 	xfs_extlen_t	len = foundlen;
300 	xfs_extlen_t	diff;
301 	bool		busy;
302 
303 	/* Trim busy sections out of found extent */
304 	busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen);
305 
306 	/*
307 	 * If we have a largish extent that happens to start before min_agbno,
308 	 * see if we can shift it into range...
309 	 */
310 	if (bno < args->min_agbno && bno + len > args->min_agbno) {
311 		diff = args->min_agbno - bno;
312 		if (len > diff) {
313 			bno += diff;
314 			len -= diff;
315 		}
316 	}
317 
318 	if (args->alignment > 1 && len >= args->minlen) {
319 		xfs_agblock_t	aligned_bno = roundup(bno, args->alignment);
320 
321 		diff = aligned_bno - bno;
322 
323 		*resbno = aligned_bno;
324 		*reslen = diff >= len ? 0 : len - diff;
325 	} else {
326 		*resbno = bno;
327 		*reslen = len;
328 	}
329 
330 	return busy;
331 }
332 
333 /*
334  * Compute best start block and diff for "near" allocations.
335  * freelen >= wantlen already checked by caller.
336  */
337 STATIC xfs_extlen_t			/* difference value (absolute) */
338 xfs_alloc_compute_diff(
339 	xfs_agblock_t	wantbno,	/* target starting block */
340 	xfs_extlen_t	wantlen,	/* target length */
341 	xfs_extlen_t	alignment,	/* target alignment */
342 	int		datatype,	/* are we allocating data? */
343 	xfs_agblock_t	freebno,	/* freespace's starting block */
344 	xfs_extlen_t	freelen,	/* freespace's length */
345 	xfs_agblock_t	*newbnop)	/* result: best start block from free */
346 {
347 	xfs_agblock_t	freeend;	/* end of freespace extent */
348 	xfs_agblock_t	newbno1;	/* return block number */
349 	xfs_agblock_t	newbno2;	/* other new block number */
350 	xfs_extlen_t	newlen1=0;	/* length with newbno1 */
351 	xfs_extlen_t	newlen2=0;	/* length with newbno2 */
352 	xfs_agblock_t	wantend;	/* end of target extent */
353 	bool		userdata = datatype & XFS_ALLOC_USERDATA;
354 
355 	ASSERT(freelen >= wantlen);
356 	freeend = freebno + freelen;
357 	wantend = wantbno + wantlen;
358 	/*
359 	 * We want to allocate from the start of a free extent if it is past
360 	 * the desired block or if we are allocating user data and the free
361 	 * extent is before desired block. The second case is there to allow
362 	 * for contiguous allocation from the remaining free space if the file
363 	 * grows in the short term.
364 	 */
365 	if (freebno >= wantbno || (userdata && freeend < wantend)) {
366 		if ((newbno1 = roundup(freebno, alignment)) >= freeend)
367 			newbno1 = NULLAGBLOCK;
368 	} else if (freeend >= wantend && alignment > 1) {
369 		newbno1 = roundup(wantbno, alignment);
370 		newbno2 = newbno1 - alignment;
371 		if (newbno1 >= freeend)
372 			newbno1 = NULLAGBLOCK;
373 		else
374 			newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
375 		if (newbno2 < freebno)
376 			newbno2 = NULLAGBLOCK;
377 		else
378 			newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
379 		if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
380 			if (newlen1 < newlen2 ||
381 			    (newlen1 == newlen2 &&
382 			     XFS_ABSDIFF(newbno1, wantbno) >
383 			     XFS_ABSDIFF(newbno2, wantbno)))
384 				newbno1 = newbno2;
385 		} else if (newbno2 != NULLAGBLOCK)
386 			newbno1 = newbno2;
387 	} else if (freeend >= wantend) {
388 		newbno1 = wantbno;
389 	} else if (alignment > 1) {
390 		newbno1 = roundup(freeend - wantlen, alignment);
391 		if (newbno1 > freeend - wantlen &&
392 		    newbno1 - alignment >= freebno)
393 			newbno1 -= alignment;
394 		else if (newbno1 >= freeend)
395 			newbno1 = NULLAGBLOCK;
396 	} else
397 		newbno1 = freeend - wantlen;
398 	*newbnop = newbno1;
399 	return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
400 }
401 
402 /*
403  * Fix up the length, based on mod and prod.
404  * len should be k * prod + mod for some k.
405  * If len is too small it is returned unchanged.
406  * If len hits maxlen it is left alone.
407  */
408 STATIC void
409 xfs_alloc_fix_len(
410 	xfs_alloc_arg_t	*args)		/* allocation argument structure */
411 {
412 	xfs_extlen_t	k;
413 	xfs_extlen_t	rlen;
414 
415 	ASSERT(args->mod < args->prod);
416 	rlen = args->len;
417 	ASSERT(rlen >= args->minlen);
418 	ASSERT(rlen <= args->maxlen);
419 	if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
420 	    (args->mod == 0 && rlen < args->prod))
421 		return;
422 	k = rlen % args->prod;
423 	if (k == args->mod)
424 		return;
425 	if (k > args->mod)
426 		rlen = rlen - (k - args->mod);
427 	else
428 		rlen = rlen - args->prod + (args->mod - k);
429 	/* casts to (int) catch length underflows */
430 	if ((int)rlen < (int)args->minlen)
431 		return;
432 	ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
433 	ASSERT(rlen % args->prod == args->mod);
434 	ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >=
435 		rlen + args->minleft);
436 	args->len = rlen;
437 }
438 
439 /*
440  * Update the two btrees, logically removing from freespace the extent
441  * starting at rbno, rlen blocks.  The extent is contained within the
442  * actual (current) free extent fbno for flen blocks.
443  * Flags are passed in indicating whether the cursors are set to the
444  * relevant records.
445  */
446 STATIC int				/* error code */
447 xfs_alloc_fixup_trees(
448 	struct xfs_btree_cur *cnt_cur,	/* cursor for by-size btree */
449 	struct xfs_btree_cur *bno_cur,	/* cursor for by-block btree */
450 	xfs_agblock_t	fbno,		/* starting block of free extent */
451 	xfs_extlen_t	flen,		/* length of free extent */
452 	xfs_agblock_t	rbno,		/* starting block of returned extent */
453 	xfs_extlen_t	rlen,		/* length of returned extent */
454 	int		flags)		/* flags, XFSA_FIXUP_... */
455 {
456 	int		error;		/* error code */
457 	int		i;		/* operation results */
458 	xfs_agblock_t	nfbno1;		/* first new free startblock */
459 	xfs_agblock_t	nfbno2;		/* second new free startblock */
460 	xfs_extlen_t	nflen1=0;	/* first new free length */
461 	xfs_extlen_t	nflen2=0;	/* second new free length */
462 	struct xfs_mount *mp;
463 
464 	mp = cnt_cur->bc_mp;
465 
466 	/*
467 	 * Look up the record in the by-size tree if necessary.
468 	 */
469 	if (flags & XFSA_FIXUP_CNT_OK) {
470 #ifdef DEBUG
471 		if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
472 			return error;
473 		if (XFS_IS_CORRUPT(mp,
474 				   i != 1 ||
475 				   nfbno1 != fbno ||
476 				   nflen1 != flen))
477 			return -EFSCORRUPTED;
478 #endif
479 	} else {
480 		if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
481 			return error;
482 		if (XFS_IS_CORRUPT(mp, i != 1))
483 			return -EFSCORRUPTED;
484 	}
485 	/*
486 	 * Look up the record in the by-block tree if necessary.
487 	 */
488 	if (flags & XFSA_FIXUP_BNO_OK) {
489 #ifdef DEBUG
490 		if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
491 			return error;
492 		if (XFS_IS_CORRUPT(mp,
493 				   i != 1 ||
494 				   nfbno1 != fbno ||
495 				   nflen1 != flen))
496 			return -EFSCORRUPTED;
497 #endif
498 	} else {
499 		if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
500 			return error;
501 		if (XFS_IS_CORRUPT(mp, i != 1))
502 			return -EFSCORRUPTED;
503 	}
504 
505 #ifdef DEBUG
506 	if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
507 		struct xfs_btree_block	*bnoblock;
508 		struct xfs_btree_block	*cntblock;
509 
510 		bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_levels[0].bp);
511 		cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_levels[0].bp);
512 
513 		if (XFS_IS_CORRUPT(mp,
514 				   bnoblock->bb_numrecs !=
515 				   cntblock->bb_numrecs))
516 			return -EFSCORRUPTED;
517 	}
518 #endif
519 
520 	/*
521 	 * Deal with all four cases: the allocated record is contained
522 	 * within the freespace record, so we can have new freespace
523 	 * at either (or both) end, or no freespace remaining.
524 	 */
525 	if (rbno == fbno && rlen == flen)
526 		nfbno1 = nfbno2 = NULLAGBLOCK;
527 	else if (rbno == fbno) {
528 		nfbno1 = rbno + rlen;
529 		nflen1 = flen - rlen;
530 		nfbno2 = NULLAGBLOCK;
531 	} else if (rbno + rlen == fbno + flen) {
532 		nfbno1 = fbno;
533 		nflen1 = flen - rlen;
534 		nfbno2 = NULLAGBLOCK;
535 	} else {
536 		nfbno1 = fbno;
537 		nflen1 = rbno - fbno;
538 		nfbno2 = rbno + rlen;
539 		nflen2 = (fbno + flen) - nfbno2;
540 	}
541 	/*
542 	 * Delete the entry from the by-size btree.
543 	 */
544 	if ((error = xfs_btree_delete(cnt_cur, &i)))
545 		return error;
546 	if (XFS_IS_CORRUPT(mp, i != 1))
547 		return -EFSCORRUPTED;
548 	/*
549 	 * Add new by-size btree entry(s).
550 	 */
551 	if (nfbno1 != NULLAGBLOCK) {
552 		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
553 			return error;
554 		if (XFS_IS_CORRUPT(mp, i != 0))
555 			return -EFSCORRUPTED;
556 		if ((error = xfs_btree_insert(cnt_cur, &i)))
557 			return error;
558 		if (XFS_IS_CORRUPT(mp, i != 1))
559 			return -EFSCORRUPTED;
560 	}
561 	if (nfbno2 != NULLAGBLOCK) {
562 		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
563 			return error;
564 		if (XFS_IS_CORRUPT(mp, i != 0))
565 			return -EFSCORRUPTED;
566 		if ((error = xfs_btree_insert(cnt_cur, &i)))
567 			return error;
568 		if (XFS_IS_CORRUPT(mp, i != 1))
569 			return -EFSCORRUPTED;
570 	}
571 	/*
572 	 * Fix up the by-block btree entry(s).
573 	 */
574 	if (nfbno1 == NULLAGBLOCK) {
575 		/*
576 		 * No remaining freespace, just delete the by-block tree entry.
577 		 */
578 		if ((error = xfs_btree_delete(bno_cur, &i)))
579 			return error;
580 		if (XFS_IS_CORRUPT(mp, i != 1))
581 			return -EFSCORRUPTED;
582 	} else {
583 		/*
584 		 * Update the by-block entry to start later|be shorter.
585 		 */
586 		if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
587 			return error;
588 	}
589 	if (nfbno2 != NULLAGBLOCK) {
590 		/*
591 		 * 2 resulting free entries, need to add one.
592 		 */
593 		if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
594 			return error;
595 		if (XFS_IS_CORRUPT(mp, i != 0))
596 			return -EFSCORRUPTED;
597 		if ((error = xfs_btree_insert(bno_cur, &i)))
598 			return error;
599 		if (XFS_IS_CORRUPT(mp, i != 1))
600 			return -EFSCORRUPTED;
601 	}
602 	return 0;
603 }
604 
605 static xfs_failaddr_t
606 xfs_agfl_verify(
607 	struct xfs_buf	*bp)
608 {
609 	struct xfs_mount *mp = bp->b_mount;
610 	struct xfs_agfl	*agfl = XFS_BUF_TO_AGFL(bp);
611 	__be32		*agfl_bno = xfs_buf_to_agfl_bno(bp);
612 	int		i;
613 
614 	/*
615 	 * There is no verification of non-crc AGFLs because mkfs does not
616 	 * initialise the AGFL to zero or NULL. Hence the only valid part of the
617 	 * AGFL is what the AGF says is active. We can't get to the AGF, so we
618 	 * can't verify just those entries are valid.
619 	 */
620 	if (!xfs_has_crc(mp))
621 		return NULL;
622 
623 	if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
624 		return __this_address;
625 	if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
626 		return __this_address;
627 	/*
628 	 * during growfs operations, the perag is not fully initialised,
629 	 * so we can't use it for any useful checking. growfs ensures we can't
630 	 * use it by using uncached buffers that don't have the perag attached
631 	 * so we can detect and avoid this problem.
632 	 */
633 	if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
634 		return __this_address;
635 
636 	for (i = 0; i < xfs_agfl_size(mp); i++) {
637 		if (be32_to_cpu(agfl_bno[i]) != NULLAGBLOCK &&
638 		    be32_to_cpu(agfl_bno[i]) >= mp->m_sb.sb_agblocks)
639 			return __this_address;
640 	}
641 
642 	if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn)))
643 		return __this_address;
644 	return NULL;
645 }
646 
647 static void
648 xfs_agfl_read_verify(
649 	struct xfs_buf	*bp)
650 {
651 	struct xfs_mount *mp = bp->b_mount;
652 	xfs_failaddr_t	fa;
653 
654 	/*
655 	 * There is no verification of non-crc AGFLs because mkfs does not
656 	 * initialise the AGFL to zero or NULL. Hence the only valid part of the
657 	 * AGFL is what the AGF says is active. We can't get to the AGF, so we
658 	 * can't verify just those entries are valid.
659 	 */
660 	if (!xfs_has_crc(mp))
661 		return;
662 
663 	if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
664 		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
665 	else {
666 		fa = xfs_agfl_verify(bp);
667 		if (fa)
668 			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
669 	}
670 }
671 
672 static void
673 xfs_agfl_write_verify(
674 	struct xfs_buf	*bp)
675 {
676 	struct xfs_mount	*mp = bp->b_mount;
677 	struct xfs_buf_log_item	*bip = bp->b_log_item;
678 	xfs_failaddr_t		fa;
679 
680 	/* no verification of non-crc AGFLs */
681 	if (!xfs_has_crc(mp))
682 		return;
683 
684 	fa = xfs_agfl_verify(bp);
685 	if (fa) {
686 		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
687 		return;
688 	}
689 
690 	if (bip)
691 		XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
692 
693 	xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
694 }
695 
696 const struct xfs_buf_ops xfs_agfl_buf_ops = {
697 	.name = "xfs_agfl",
698 	.magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) },
699 	.verify_read = xfs_agfl_read_verify,
700 	.verify_write = xfs_agfl_write_verify,
701 	.verify_struct = xfs_agfl_verify,
702 };
703 
704 /*
705  * Read in the allocation group free block array.
706  */
707 int
708 xfs_alloc_read_agfl(
709 	struct xfs_perag	*pag,
710 	struct xfs_trans	*tp,
711 	struct xfs_buf		**bpp)
712 {
713 	struct xfs_mount	*mp = pag->pag_mount;
714 	struct xfs_buf		*bp;
715 	int			error;
716 
717 	error = xfs_trans_read_buf(
718 			mp, tp, mp->m_ddev_targp,
719 			XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)),
720 			XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
721 	if (error)
722 		return error;
723 	xfs_buf_set_ref(bp, XFS_AGFL_REF);
724 	*bpp = bp;
725 	return 0;
726 }
727 
728 STATIC int
729 xfs_alloc_update_counters(
730 	struct xfs_trans	*tp,
731 	struct xfs_buf		*agbp,
732 	long			len)
733 {
734 	struct xfs_agf		*agf = agbp->b_addr;
735 
736 	agbp->b_pag->pagf_freeblks += len;
737 	be32_add_cpu(&agf->agf_freeblks, len);
738 
739 	if (unlikely(be32_to_cpu(agf->agf_freeblks) >
740 		     be32_to_cpu(agf->agf_length))) {
741 		xfs_buf_mark_corrupt(agbp);
742 		return -EFSCORRUPTED;
743 	}
744 
745 	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
746 	return 0;
747 }
748 
749 /*
750  * Block allocation algorithm and data structures.
751  */
752 struct xfs_alloc_cur {
753 	struct xfs_btree_cur		*cnt;	/* btree cursors */
754 	struct xfs_btree_cur		*bnolt;
755 	struct xfs_btree_cur		*bnogt;
756 	xfs_extlen_t			cur_len;/* current search length */
757 	xfs_agblock_t			rec_bno;/* extent startblock */
758 	xfs_extlen_t			rec_len;/* extent length */
759 	xfs_agblock_t			bno;	/* alloc bno */
760 	xfs_extlen_t			len;	/* alloc len */
761 	xfs_extlen_t			diff;	/* diff from search bno */
762 	unsigned int			busy_gen;/* busy state */
763 	bool				busy;
764 };
765 
766 /*
767  * Set up cursors, etc. in the extent allocation cursor. This function can be
768  * called multiple times to reset an initialized structure without having to
769  * reallocate cursors.
770  */
771 static int
772 xfs_alloc_cur_setup(
773 	struct xfs_alloc_arg	*args,
774 	struct xfs_alloc_cur	*acur)
775 {
776 	int			error;
777 	int			i;
778 
779 	ASSERT(args->alignment == 1 || args->type != XFS_ALLOCTYPE_THIS_BNO);
780 
781 	acur->cur_len = args->maxlen;
782 	acur->rec_bno = 0;
783 	acur->rec_len = 0;
784 	acur->bno = 0;
785 	acur->len = 0;
786 	acur->diff = -1;
787 	acur->busy = false;
788 	acur->busy_gen = 0;
789 
790 	/*
791 	 * Perform an initial cntbt lookup to check for availability of maxlen
792 	 * extents. If this fails, we'll return -ENOSPC to signal the caller to
793 	 * attempt a small allocation.
794 	 */
795 	if (!acur->cnt)
796 		acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp,
797 					args->agbp, args->pag, XFS_BTNUM_CNT);
798 	error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i);
799 	if (error)
800 		return error;
801 
802 	/*
803 	 * Allocate the bnobt left and right search cursors.
804 	 */
805 	if (!acur->bnolt)
806 		acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp,
807 					args->agbp, args->pag, XFS_BTNUM_BNO);
808 	if (!acur->bnogt)
809 		acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp,
810 					args->agbp, args->pag, XFS_BTNUM_BNO);
811 	return i == 1 ? 0 : -ENOSPC;
812 }
813 
814 static void
815 xfs_alloc_cur_close(
816 	struct xfs_alloc_cur	*acur,
817 	bool			error)
818 {
819 	int			cur_error = XFS_BTREE_NOERROR;
820 
821 	if (error)
822 		cur_error = XFS_BTREE_ERROR;
823 
824 	if (acur->cnt)
825 		xfs_btree_del_cursor(acur->cnt, cur_error);
826 	if (acur->bnolt)
827 		xfs_btree_del_cursor(acur->bnolt, cur_error);
828 	if (acur->bnogt)
829 		xfs_btree_del_cursor(acur->bnogt, cur_error);
830 	acur->cnt = acur->bnolt = acur->bnogt = NULL;
831 }
832 
833 /*
834  * Check an extent for allocation and track the best available candidate in the
835  * allocation structure. The cursor is deactivated if it has entered an out of
836  * range state based on allocation arguments. Optionally return the extent
837  * extent geometry and allocation status if requested by the caller.
838  */
839 static int
840 xfs_alloc_cur_check(
841 	struct xfs_alloc_arg	*args,
842 	struct xfs_alloc_cur	*acur,
843 	struct xfs_btree_cur	*cur,
844 	int			*new)
845 {
846 	int			error, i;
847 	xfs_agblock_t		bno, bnoa, bnew;
848 	xfs_extlen_t		len, lena, diff = -1;
849 	bool			busy;
850 	unsigned		busy_gen = 0;
851 	bool			deactivate = false;
852 	bool			isbnobt = cur->bc_btnum == XFS_BTNUM_BNO;
853 
854 	*new = 0;
855 
856 	error = xfs_alloc_get_rec(cur, &bno, &len, &i);
857 	if (error)
858 		return error;
859 	if (XFS_IS_CORRUPT(args->mp, i != 1))
860 		return -EFSCORRUPTED;
861 
862 	/*
863 	 * Check minlen and deactivate a cntbt cursor if out of acceptable size
864 	 * range (i.e., walking backwards looking for a minlen extent).
865 	 */
866 	if (len < args->minlen) {
867 		deactivate = !isbnobt;
868 		goto out;
869 	}
870 
871 	busy = xfs_alloc_compute_aligned(args, bno, len, &bnoa, &lena,
872 					 &busy_gen);
873 	acur->busy |= busy;
874 	if (busy)
875 		acur->busy_gen = busy_gen;
876 	/* deactivate a bnobt cursor outside of locality range */
877 	if (bnoa < args->min_agbno || bnoa > args->max_agbno) {
878 		deactivate = isbnobt;
879 		goto out;
880 	}
881 	if (lena < args->minlen)
882 		goto out;
883 
884 	args->len = XFS_EXTLEN_MIN(lena, args->maxlen);
885 	xfs_alloc_fix_len(args);
886 	ASSERT(args->len >= args->minlen);
887 	if (args->len < acur->len)
888 		goto out;
889 
890 	/*
891 	 * We have an aligned record that satisfies minlen and beats or matches
892 	 * the candidate extent size. Compare locality for near allocation mode.
893 	 */
894 	ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);
895 	diff = xfs_alloc_compute_diff(args->agbno, args->len,
896 				      args->alignment, args->datatype,
897 				      bnoa, lena, &bnew);
898 	if (bnew == NULLAGBLOCK)
899 		goto out;
900 
901 	/*
902 	 * Deactivate a bnobt cursor with worse locality than the current best.
903 	 */
904 	if (diff > acur->diff) {
905 		deactivate = isbnobt;
906 		goto out;
907 	}
908 
909 	ASSERT(args->len > acur->len ||
910 	       (args->len == acur->len && diff <= acur->diff));
911 	acur->rec_bno = bno;
912 	acur->rec_len = len;
913 	acur->bno = bnew;
914 	acur->len = args->len;
915 	acur->diff = diff;
916 	*new = 1;
917 
918 	/*
919 	 * We're done if we found a perfect allocation. This only deactivates
920 	 * the current cursor, but this is just an optimization to terminate a
921 	 * cntbt search that otherwise runs to the edge of the tree.
922 	 */
923 	if (acur->diff == 0 && acur->len == args->maxlen)
924 		deactivate = true;
925 out:
926 	if (deactivate)
927 		cur->bc_ag.abt.active = false;
928 	trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff,
929 				  *new);
930 	return 0;
931 }
932 
933 /*
934  * Complete an allocation of a candidate extent. Remove the extent from both
935  * trees and update the args structure.
936  */
937 STATIC int
938 xfs_alloc_cur_finish(
939 	struct xfs_alloc_arg	*args,
940 	struct xfs_alloc_cur	*acur)
941 {
942 	struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
943 	int			error;
944 
945 	ASSERT(acur->cnt && acur->bnolt);
946 	ASSERT(acur->bno >= acur->rec_bno);
947 	ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len);
948 	ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length));
949 
950 	error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno,
951 				      acur->rec_len, acur->bno, acur->len, 0);
952 	if (error)
953 		return error;
954 
955 	args->agbno = acur->bno;
956 	args->len = acur->len;
957 	args->wasfromfl = 0;
958 
959 	trace_xfs_alloc_cur(args);
960 	return 0;
961 }
962 
963 /*
964  * Locality allocation lookup algorithm. This expects a cntbt cursor and uses
965  * bno optimized lookup to search for extents with ideal size and locality.
966  */
967 STATIC int
968 xfs_alloc_cntbt_iter(
969 	struct xfs_alloc_arg		*args,
970 	struct xfs_alloc_cur		*acur)
971 {
972 	struct xfs_btree_cur	*cur = acur->cnt;
973 	xfs_agblock_t		bno;
974 	xfs_extlen_t		len, cur_len;
975 	int			error;
976 	int			i;
977 
978 	if (!xfs_alloc_cur_active(cur))
979 		return 0;
980 
981 	/* locality optimized lookup */
982 	cur_len = acur->cur_len;
983 	error = xfs_alloc_lookup_ge(cur, args->agbno, cur_len, &i);
984 	if (error)
985 		return error;
986 	if (i == 0)
987 		return 0;
988 	error = xfs_alloc_get_rec(cur, &bno, &len, &i);
989 	if (error)
990 		return error;
991 
992 	/* check the current record and update search length from it */
993 	error = xfs_alloc_cur_check(args, acur, cur, &i);
994 	if (error)
995 		return error;
996 	ASSERT(len >= acur->cur_len);
997 	acur->cur_len = len;
998 
999 	/*
1000 	 * We looked up the first record >= [agbno, len] above. The agbno is a
1001 	 * secondary key and so the current record may lie just before or after
1002 	 * agbno. If it is past agbno, check the previous record too so long as
1003 	 * the length matches as it may be closer. Don't check a smaller record
1004 	 * because that could deactivate our cursor.
1005 	 */
1006 	if (bno > args->agbno) {
1007 		error = xfs_btree_decrement(cur, 0, &i);
1008 		if (!error && i) {
1009 			error = xfs_alloc_get_rec(cur, &bno, &len, &i);
1010 			if (!error && i && len == acur->cur_len)
1011 				error = xfs_alloc_cur_check(args, acur, cur,
1012 							    &i);
1013 		}
1014 		if (error)
1015 			return error;
1016 	}
1017 
1018 	/*
1019 	 * Increment the search key until we find at least one allocation
1020 	 * candidate or if the extent we found was larger. Otherwise, double the
1021 	 * search key to optimize the search. Efficiency is more important here
1022 	 * than absolute best locality.
1023 	 */
1024 	cur_len <<= 1;
1025 	if (!acur->len || acur->cur_len >= cur_len)
1026 		acur->cur_len++;
1027 	else
1028 		acur->cur_len = cur_len;
1029 
1030 	return error;
1031 }
1032 
1033 /*
1034  * Deal with the case where only small freespaces remain. Either return the
1035  * contents of the last freespace record, or allocate space from the freelist if
1036  * there is nothing in the tree.
1037  */
1038 STATIC int			/* error */
1039 xfs_alloc_ag_vextent_small(
1040 	struct xfs_alloc_arg	*args,	/* allocation argument structure */
1041 	struct xfs_btree_cur	*ccur,	/* optional by-size cursor */
1042 	xfs_agblock_t		*fbnop,	/* result block number */
1043 	xfs_extlen_t		*flenp,	/* result length */
1044 	int			*stat)	/* status: 0-freelist, 1-normal/none */
1045 {
1046 	struct xfs_agf		*agf = args->agbp->b_addr;
1047 	int			error = 0;
1048 	xfs_agblock_t		fbno = NULLAGBLOCK;
1049 	xfs_extlen_t		flen = 0;
1050 	int			i = 0;
1051 
1052 	/*
1053 	 * If a cntbt cursor is provided, try to allocate the largest record in
1054 	 * the tree. Try the AGFL if the cntbt is empty, otherwise fail the
1055 	 * allocation. Make sure to respect minleft even when pulling from the
1056 	 * freelist.
1057 	 */
1058 	if (ccur)
1059 		error = xfs_btree_decrement(ccur, 0, &i);
1060 	if (error)
1061 		goto error;
1062 	if (i) {
1063 		error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
1064 		if (error)
1065 			goto error;
1066 		if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1067 			error = -EFSCORRUPTED;
1068 			goto error;
1069 		}
1070 		goto out;
1071 	}
1072 
1073 	if (args->minlen != 1 || args->alignment != 1 ||
1074 	    args->resv == XFS_AG_RESV_AGFL ||
1075 	    be32_to_cpu(agf->agf_flcount) <= args->minleft)
1076 		goto out;
1077 
1078 	error = xfs_alloc_get_freelist(args->pag, args->tp, args->agbp,
1079 			&fbno, 0);
1080 	if (error)
1081 		goto error;
1082 	if (fbno == NULLAGBLOCK)
1083 		goto out;
1084 
1085 	xfs_extent_busy_reuse(args->mp, args->pag, fbno, 1,
1086 			      (args->datatype & XFS_ALLOC_NOBUSY));
1087 
1088 	if (args->datatype & XFS_ALLOC_USERDATA) {
1089 		struct xfs_buf	*bp;
1090 
1091 		error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp,
1092 				XFS_AGB_TO_DADDR(args->mp, args->agno, fbno),
1093 				args->mp->m_bsize, 0, &bp);
1094 		if (error)
1095 			goto error;
1096 		xfs_trans_binval(args->tp, bp);
1097 	}
1098 	*fbnop = args->agbno = fbno;
1099 	*flenp = args->len = 1;
1100 	if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) {
1101 		error = -EFSCORRUPTED;
1102 		goto error;
1103 	}
1104 	args->wasfromfl = 1;
1105 	trace_xfs_alloc_small_freelist(args);
1106 
1107 	/*
1108 	 * If we're feeding an AGFL block to something that doesn't live in the
1109 	 * free space, we need to clear out the OWN_AG rmap.
1110 	 */
1111 	error = xfs_rmap_free(args->tp, args->agbp, args->pag, fbno, 1,
1112 			      &XFS_RMAP_OINFO_AG);
1113 	if (error)
1114 		goto error;
1115 
1116 	*stat = 0;
1117 	return 0;
1118 
1119 out:
1120 	/*
1121 	 * Can't do the allocation, give up.
1122 	 */
1123 	if (flen < args->minlen) {
1124 		args->agbno = NULLAGBLOCK;
1125 		trace_xfs_alloc_small_notenough(args);
1126 		flen = 0;
1127 	}
1128 	*fbnop = fbno;
1129 	*flenp = flen;
1130 	*stat = 1;
1131 	trace_xfs_alloc_small_done(args);
1132 	return 0;
1133 
1134 error:
1135 	trace_xfs_alloc_small_error(args);
1136 	return error;
1137 }
1138 
1139 /*
1140  * Allocate a variable extent in the allocation group agno.
1141  * Type and bno are used to determine where in the allocation group the
1142  * extent will start.
1143  * Extent's length (returned in *len) will be between minlen and maxlen,
1144  * and of the form k * prod + mod unless there's nothing that large.
1145  * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1146  */
1147 STATIC int			/* error */
1148 xfs_alloc_ag_vextent(
1149 	xfs_alloc_arg_t	*args)	/* argument structure for allocation */
1150 {
1151 	int		error=0;
1152 
1153 	ASSERT(args->minlen > 0);
1154 	ASSERT(args->maxlen > 0);
1155 	ASSERT(args->minlen <= args->maxlen);
1156 	ASSERT(args->mod < args->prod);
1157 	ASSERT(args->alignment > 0);
1158 
1159 	/*
1160 	 * Branch to correct routine based on the type.
1161 	 */
1162 	args->wasfromfl = 0;
1163 	switch (args->type) {
1164 	case XFS_ALLOCTYPE_THIS_AG:
1165 		error = xfs_alloc_ag_vextent_size(args);
1166 		break;
1167 	case XFS_ALLOCTYPE_NEAR_BNO:
1168 		error = xfs_alloc_ag_vextent_near(args);
1169 		break;
1170 	case XFS_ALLOCTYPE_THIS_BNO:
1171 		error = xfs_alloc_ag_vextent_exact(args);
1172 		break;
1173 	default:
1174 		ASSERT(0);
1175 		/* NOTREACHED */
1176 	}
1177 
1178 	if (error || args->agbno == NULLAGBLOCK)
1179 		return error;
1180 
1181 	ASSERT(args->len >= args->minlen);
1182 	ASSERT(args->len <= args->maxlen);
1183 	ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL);
1184 	ASSERT(args->agbno % args->alignment == 0);
1185 
1186 	/* if not file data, insert new block into the reverse map btree */
1187 	if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) {
1188 		error = xfs_rmap_alloc(args->tp, args->agbp, args->pag,
1189 				       args->agbno, args->len, &args->oinfo);
1190 		if (error)
1191 			return error;
1192 	}
1193 
1194 	if (!args->wasfromfl) {
1195 		error = xfs_alloc_update_counters(args->tp, args->agbp,
1196 						  -((long)(args->len)));
1197 		if (error)
1198 			return error;
1199 
1200 		ASSERT(!xfs_extent_busy_search(args->mp, args->pag,
1201 					      args->agbno, args->len));
1202 	}
1203 
1204 	xfs_ag_resv_alloc_extent(args->pag, args->resv, args);
1205 
1206 	XFS_STATS_INC(args->mp, xs_allocx);
1207 	XFS_STATS_ADD(args->mp, xs_allocb, args->len);
1208 	return error;
1209 }
1210 
1211 /*
1212  * Allocate a variable extent at exactly agno/bno.
1213  * Extent's length (returned in *len) will be between minlen and maxlen,
1214  * and of the form k * prod + mod unless there's nothing that large.
1215  * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
1216  */
1217 STATIC int			/* error */
1218 xfs_alloc_ag_vextent_exact(
1219 	xfs_alloc_arg_t	*args)	/* allocation argument structure */
1220 {
1221 	struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
1222 	struct xfs_btree_cur *bno_cur;/* by block-number btree cursor */
1223 	struct xfs_btree_cur *cnt_cur;/* by count btree cursor */
1224 	int		error;
1225 	xfs_agblock_t	fbno;	/* start block of found extent */
1226 	xfs_extlen_t	flen;	/* length of found extent */
1227 	xfs_agblock_t	tbno;	/* start block of busy extent */
1228 	xfs_extlen_t	tlen;	/* length of busy extent */
1229 	xfs_agblock_t	tend;	/* end block of busy extent */
1230 	int		i;	/* success/failure of operation */
1231 	unsigned	busy_gen;
1232 
1233 	ASSERT(args->alignment == 1);
1234 
1235 	/*
1236 	 * Allocate/initialize a cursor for the by-number freespace btree.
1237 	 */
1238 	bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1239 					  args->pag, XFS_BTNUM_BNO);
1240 
1241 	/*
1242 	 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
1243 	 * Look for the closest free block <= bno, it must contain bno
1244 	 * if any free block does.
1245 	 */
1246 	error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
1247 	if (error)
1248 		goto error0;
1249 	if (!i)
1250 		goto not_found;
1251 
1252 	/*
1253 	 * Grab the freespace record.
1254 	 */
1255 	error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
1256 	if (error)
1257 		goto error0;
1258 	if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1259 		error = -EFSCORRUPTED;
1260 		goto error0;
1261 	}
1262 	ASSERT(fbno <= args->agbno);
1263 
1264 	/*
1265 	 * Check for overlapping busy extents.
1266 	 */
1267 	tbno = fbno;
1268 	tlen = flen;
1269 	xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen);
1270 
1271 	/*
1272 	 * Give up if the start of the extent is busy, or the freespace isn't
1273 	 * long enough for the minimum request.
1274 	 */
1275 	if (tbno > args->agbno)
1276 		goto not_found;
1277 	if (tlen < args->minlen)
1278 		goto not_found;
1279 	tend = tbno + tlen;
1280 	if (tend < args->agbno + args->minlen)
1281 		goto not_found;
1282 
1283 	/*
1284 	 * End of extent will be smaller of the freespace end and the
1285 	 * maximal requested end.
1286 	 *
1287 	 * Fix the length according to mod and prod if given.
1288 	 */
1289 	args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
1290 						- args->agbno;
1291 	xfs_alloc_fix_len(args);
1292 	ASSERT(args->agbno + args->len <= tend);
1293 
1294 	/*
1295 	 * We are allocating agbno for args->len
1296 	 * Allocate/initialize a cursor for the by-size btree.
1297 	 */
1298 	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1299 					args->pag, XFS_BTNUM_CNT);
1300 	ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length));
1301 	error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
1302 				      args->len, XFSA_FIXUP_BNO_OK);
1303 	if (error) {
1304 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1305 		goto error0;
1306 	}
1307 
1308 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1309 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1310 
1311 	args->wasfromfl = 0;
1312 	trace_xfs_alloc_exact_done(args);
1313 	return 0;
1314 
1315 not_found:
1316 	/* Didn't find it, return null. */
1317 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1318 	args->agbno = NULLAGBLOCK;
1319 	trace_xfs_alloc_exact_notfound(args);
1320 	return 0;
1321 
1322 error0:
1323 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1324 	trace_xfs_alloc_exact_error(args);
1325 	return error;
1326 }
1327 
1328 /*
1329  * Search a given number of btree records in a given direction. Check each
1330  * record against the good extent we've already found.
1331  */
1332 STATIC int
1333 xfs_alloc_walk_iter(
1334 	struct xfs_alloc_arg	*args,
1335 	struct xfs_alloc_cur	*acur,
1336 	struct xfs_btree_cur	*cur,
1337 	bool			increment,
1338 	bool			find_one, /* quit on first candidate */
1339 	int			count,    /* rec count (-1 for infinite) */
1340 	int			*stat)
1341 {
1342 	int			error;
1343 	int			i;
1344 
1345 	*stat = 0;
1346 
1347 	/*
1348 	 * Search so long as the cursor is active or we find a better extent.
1349 	 * The cursor is deactivated if it extends beyond the range of the
1350 	 * current allocation candidate.
1351 	 */
1352 	while (xfs_alloc_cur_active(cur) && count) {
1353 		error = xfs_alloc_cur_check(args, acur, cur, &i);
1354 		if (error)
1355 			return error;
1356 		if (i == 1) {
1357 			*stat = 1;
1358 			if (find_one)
1359 				break;
1360 		}
1361 		if (!xfs_alloc_cur_active(cur))
1362 			break;
1363 
1364 		if (increment)
1365 			error = xfs_btree_increment(cur, 0, &i);
1366 		else
1367 			error = xfs_btree_decrement(cur, 0, &i);
1368 		if (error)
1369 			return error;
1370 		if (i == 0)
1371 			cur->bc_ag.abt.active = false;
1372 
1373 		if (count > 0)
1374 			count--;
1375 	}
1376 
1377 	return 0;
1378 }
1379 
1380 /*
1381  * Search the by-bno and by-size btrees in parallel in search of an extent with
1382  * ideal locality based on the NEAR mode ->agbno locality hint.
1383  */
1384 STATIC int
1385 xfs_alloc_ag_vextent_locality(
1386 	struct xfs_alloc_arg	*args,
1387 	struct xfs_alloc_cur	*acur,
1388 	int			*stat)
1389 {
1390 	struct xfs_btree_cur	*fbcur = NULL;
1391 	int			error;
1392 	int			i;
1393 	bool			fbinc;
1394 
1395 	ASSERT(acur->len == 0);
1396 	ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);
1397 
1398 	*stat = 0;
1399 
1400 	error = xfs_alloc_lookup_ge(acur->cnt, args->agbno, acur->cur_len, &i);
1401 	if (error)
1402 		return error;
1403 	error = xfs_alloc_lookup_le(acur->bnolt, args->agbno, 0, &i);
1404 	if (error)
1405 		return error;
1406 	error = xfs_alloc_lookup_ge(acur->bnogt, args->agbno, 0, &i);
1407 	if (error)
1408 		return error;
1409 
1410 	/*
1411 	 * Search the bnobt and cntbt in parallel. Search the bnobt left and
1412 	 * right and lookup the closest extent to the locality hint for each
1413 	 * extent size key in the cntbt. The entire search terminates
1414 	 * immediately on a bnobt hit because that means we've found best case
1415 	 * locality. Otherwise the search continues until the cntbt cursor runs
1416 	 * off the end of the tree. If no allocation candidate is found at this
1417 	 * point, give up on locality, walk backwards from the end of the cntbt
1418 	 * and take the first available extent.
1419 	 *
1420 	 * The parallel tree searches balance each other out to provide fairly
1421 	 * consistent performance for various situations. The bnobt search can
1422 	 * have pathological behavior in the worst case scenario of larger
1423 	 * allocation requests and fragmented free space. On the other hand, the
1424 	 * bnobt is able to satisfy most smaller allocation requests much more
1425 	 * quickly than the cntbt. The cntbt search can sift through fragmented
1426 	 * free space and sets of free extents for larger allocation requests
1427 	 * more quickly than the bnobt. Since the locality hint is just a hint
1428 	 * and we don't want to scan the entire bnobt for perfect locality, the
1429 	 * cntbt search essentially bounds the bnobt search such that we can
1430 	 * find good enough locality at reasonable performance in most cases.
1431 	 */
1432 	while (xfs_alloc_cur_active(acur->bnolt) ||
1433 	       xfs_alloc_cur_active(acur->bnogt) ||
1434 	       xfs_alloc_cur_active(acur->cnt)) {
1435 
1436 		trace_xfs_alloc_cur_lookup(args);
1437 
1438 		/*
1439 		 * Search the bnobt left and right. In the case of a hit, finish
1440 		 * the search in the opposite direction and we're done.
1441 		 */
1442 		error = xfs_alloc_walk_iter(args, acur, acur->bnolt, false,
1443 					    true, 1, &i);
1444 		if (error)
1445 			return error;
1446 		if (i == 1) {
1447 			trace_xfs_alloc_cur_left(args);
1448 			fbcur = acur->bnogt;
1449 			fbinc = true;
1450 			break;
1451 		}
1452 		error = xfs_alloc_walk_iter(args, acur, acur->bnogt, true, true,
1453 					    1, &i);
1454 		if (error)
1455 			return error;
1456 		if (i == 1) {
1457 			trace_xfs_alloc_cur_right(args);
1458 			fbcur = acur->bnolt;
1459 			fbinc = false;
1460 			break;
1461 		}
1462 
1463 		/*
1464 		 * Check the extent with best locality based on the current
1465 		 * extent size search key and keep track of the best candidate.
1466 		 */
1467 		error = xfs_alloc_cntbt_iter(args, acur);
1468 		if (error)
1469 			return error;
1470 		if (!xfs_alloc_cur_active(acur->cnt)) {
1471 			trace_xfs_alloc_cur_lookup_done(args);
1472 			break;
1473 		}
1474 	}
1475 
1476 	/*
1477 	 * If we failed to find anything due to busy extents, return empty
1478 	 * handed so the caller can flush and retry. If no busy extents were
1479 	 * found, walk backwards from the end of the cntbt as a last resort.
1480 	 */
1481 	if (!xfs_alloc_cur_active(acur->cnt) && !acur->len && !acur->busy) {
1482 		error = xfs_btree_decrement(acur->cnt, 0, &i);
1483 		if (error)
1484 			return error;
1485 		if (i) {
1486 			acur->cnt->bc_ag.abt.active = true;
1487 			fbcur = acur->cnt;
1488 			fbinc = false;
1489 		}
1490 	}
1491 
1492 	/*
1493 	 * Search in the opposite direction for a better entry in the case of
1494 	 * a bnobt hit or walk backwards from the end of the cntbt.
1495 	 */
1496 	if (fbcur) {
1497 		error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc, true, -1,
1498 					    &i);
1499 		if (error)
1500 			return error;
1501 	}
1502 
1503 	if (acur->len)
1504 		*stat = 1;
1505 
1506 	return 0;
1507 }
1508 
1509 /* Check the last block of the cnt btree for allocations. */
1510 static int
1511 xfs_alloc_ag_vextent_lastblock(
1512 	struct xfs_alloc_arg	*args,
1513 	struct xfs_alloc_cur	*acur,
1514 	xfs_agblock_t		*bno,
1515 	xfs_extlen_t		*len,
1516 	bool			*allocated)
1517 {
1518 	int			error;
1519 	int			i;
1520 
1521 #ifdef DEBUG
1522 	/* Randomly don't execute the first algorithm. */
1523 	if (prandom_u32_max(2))
1524 		return 0;
1525 #endif
1526 
1527 	/*
1528 	 * Start from the entry that lookup found, sequence through all larger
1529 	 * free blocks.  If we're actually pointing at a record smaller than
1530 	 * maxlen, go to the start of this block, and skip all those smaller
1531 	 * than minlen.
1532 	 */
1533 	if (*len || args->alignment > 1) {
1534 		acur->cnt->bc_levels[0].ptr = 1;
1535 		do {
1536 			error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
1537 			if (error)
1538 				return error;
1539 			if (XFS_IS_CORRUPT(args->mp, i != 1))
1540 				return -EFSCORRUPTED;
1541 			if (*len >= args->minlen)
1542 				break;
1543 			error = xfs_btree_increment(acur->cnt, 0, &i);
1544 			if (error)
1545 				return error;
1546 		} while (i);
1547 		ASSERT(*len >= args->minlen);
1548 		if (!i)
1549 			return 0;
1550 	}
1551 
1552 	error = xfs_alloc_walk_iter(args, acur, acur->cnt, true, false, -1, &i);
1553 	if (error)
1554 		return error;
1555 
1556 	/*
1557 	 * It didn't work.  We COULD be in a case where there's a good record
1558 	 * somewhere, so try again.
1559 	 */
1560 	if (acur->len == 0)
1561 		return 0;
1562 
1563 	trace_xfs_alloc_near_first(args);
1564 	*allocated = true;
1565 	return 0;
1566 }
1567 
1568 /*
1569  * Allocate a variable extent near bno in the allocation group agno.
1570  * Extent's length (returned in len) will be between minlen and maxlen,
1571  * and of the form k * prod + mod unless there's nothing that large.
1572  * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1573  */
1574 STATIC int
1575 xfs_alloc_ag_vextent_near(
1576 	struct xfs_alloc_arg	*args)
1577 {
1578 	struct xfs_alloc_cur	acur = {};
1579 	int			error;		/* error code */
1580 	int			i;		/* result code, temporary */
1581 	xfs_agblock_t		bno;
1582 	xfs_extlen_t		len;
1583 
1584 	/* handle uninitialized agbno range so caller doesn't have to */
1585 	if (!args->min_agbno && !args->max_agbno)
1586 		args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
1587 	ASSERT(args->min_agbno <= args->max_agbno);
1588 
1589 	/* clamp agbno to the range if it's outside */
1590 	if (args->agbno < args->min_agbno)
1591 		args->agbno = args->min_agbno;
1592 	if (args->agbno > args->max_agbno)
1593 		args->agbno = args->max_agbno;
1594 
1595 restart:
1596 	len = 0;
1597 
1598 	/*
1599 	 * Set up cursors and see if there are any free extents as big as
1600 	 * maxlen. If not, pick the last entry in the tree unless the tree is
1601 	 * empty.
1602 	 */
1603 	error = xfs_alloc_cur_setup(args, &acur);
1604 	if (error == -ENOSPC) {
1605 		error = xfs_alloc_ag_vextent_small(args, acur.cnt, &bno,
1606 				&len, &i);
1607 		if (error)
1608 			goto out;
1609 		if (i == 0 || len == 0) {
1610 			trace_xfs_alloc_near_noentry(args);
1611 			goto out;
1612 		}
1613 		ASSERT(i == 1);
1614 	} else if (error) {
1615 		goto out;
1616 	}
1617 
1618 	/*
1619 	 * First algorithm.
1620 	 * If the requested extent is large wrt the freespaces available
1621 	 * in this a.g., then the cursor will be pointing to a btree entry
1622 	 * near the right edge of the tree.  If it's in the last btree leaf
1623 	 * block, then we just examine all the entries in that block
1624 	 * that are big enough, and pick the best one.
1625 	 */
1626 	if (xfs_btree_islastblock(acur.cnt, 0)) {
1627 		bool		allocated = false;
1628 
1629 		error = xfs_alloc_ag_vextent_lastblock(args, &acur, &bno, &len,
1630 				&allocated);
1631 		if (error)
1632 			goto out;
1633 		if (allocated)
1634 			goto alloc_finish;
1635 	}
1636 
1637 	/*
1638 	 * Second algorithm. Combined cntbt and bnobt search to find ideal
1639 	 * locality.
1640 	 */
1641 	error = xfs_alloc_ag_vextent_locality(args, &acur, &i);
1642 	if (error)
1643 		goto out;
1644 
1645 	/*
1646 	 * If we couldn't get anything, give up.
1647 	 */
1648 	if (!acur.len) {
1649 		if (acur.busy) {
1650 			trace_xfs_alloc_near_busy(args);
1651 			xfs_extent_busy_flush(args->mp, args->pag,
1652 					      acur.busy_gen);
1653 			goto restart;
1654 		}
1655 		trace_xfs_alloc_size_neither(args);
1656 		args->agbno = NULLAGBLOCK;
1657 		goto out;
1658 	}
1659 
1660 alloc_finish:
1661 	/* fix up btrees on a successful allocation */
1662 	error = xfs_alloc_cur_finish(args, &acur);
1663 
1664 out:
1665 	xfs_alloc_cur_close(&acur, error);
1666 	return error;
1667 }
1668 
1669 /*
1670  * Allocate a variable extent anywhere in the allocation group agno.
1671  * Extent's length (returned in len) will be between minlen and maxlen,
1672  * and of the form k * prod + mod unless there's nothing that large.
1673  * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1674  */
1675 STATIC int				/* error */
1676 xfs_alloc_ag_vextent_size(
1677 	xfs_alloc_arg_t	*args)		/* allocation argument structure */
1678 {
1679 	struct xfs_agf	*agf = args->agbp->b_addr;
1680 	struct xfs_btree_cur *bno_cur;	/* cursor for bno btree */
1681 	struct xfs_btree_cur *cnt_cur;	/* cursor for cnt btree */
1682 	int		error;		/* error result */
1683 	xfs_agblock_t	fbno;		/* start of found freespace */
1684 	xfs_extlen_t	flen;		/* length of found freespace */
1685 	int		i;		/* temp status variable */
1686 	xfs_agblock_t	rbno;		/* returned block number */
1687 	xfs_extlen_t	rlen;		/* length of returned extent */
1688 	bool		busy;
1689 	unsigned	busy_gen;
1690 
1691 restart:
1692 	/*
1693 	 * Allocate and initialize a cursor for the by-size btree.
1694 	 */
1695 	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1696 					args->pag, XFS_BTNUM_CNT);
1697 	bno_cur = NULL;
1698 
1699 	/*
1700 	 * Look for an entry >= maxlen+alignment-1 blocks.
1701 	 */
1702 	if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
1703 			args->maxlen + args->alignment - 1, &i)))
1704 		goto error0;
1705 
1706 	/*
1707 	 * If none then we have to settle for a smaller extent. In the case that
1708 	 * there are no large extents, this will return the last entry in the
1709 	 * tree unless the tree is empty. In the case that there are only busy
1710 	 * large extents, this will return the largest small extent unless there
1711 	 * are no smaller extents available.
1712 	 */
1713 	if (!i) {
1714 		error = xfs_alloc_ag_vextent_small(args, cnt_cur,
1715 						   &fbno, &flen, &i);
1716 		if (error)
1717 			goto error0;
1718 		if (i == 0 || flen == 0) {
1719 			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1720 			trace_xfs_alloc_size_noentry(args);
1721 			return 0;
1722 		}
1723 		ASSERT(i == 1);
1724 		busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno,
1725 				&rlen, &busy_gen);
1726 	} else {
1727 		/*
1728 		 * Search for a non-busy extent that is large enough.
1729 		 */
1730 		for (;;) {
1731 			error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
1732 			if (error)
1733 				goto error0;
1734 			if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1735 				error = -EFSCORRUPTED;
1736 				goto error0;
1737 			}
1738 
1739 			busy = xfs_alloc_compute_aligned(args, fbno, flen,
1740 					&rbno, &rlen, &busy_gen);
1741 
1742 			if (rlen >= args->maxlen)
1743 				break;
1744 
1745 			error = xfs_btree_increment(cnt_cur, 0, &i);
1746 			if (error)
1747 				goto error0;
1748 			if (i == 0) {
1749 				/*
1750 				 * Our only valid extents must have been busy.
1751 				 * Make it unbusy by forcing the log out and
1752 				 * retrying.
1753 				 */
1754 				xfs_btree_del_cursor(cnt_cur,
1755 						     XFS_BTREE_NOERROR);
1756 				trace_xfs_alloc_size_busy(args);
1757 				xfs_extent_busy_flush(args->mp,
1758 							args->pag, busy_gen);
1759 				goto restart;
1760 			}
1761 		}
1762 	}
1763 
1764 	/*
1765 	 * In the first case above, we got the last entry in the
1766 	 * by-size btree.  Now we check to see if the space hits maxlen
1767 	 * once aligned; if not, we search left for something better.
1768 	 * This can't happen in the second case above.
1769 	 */
1770 	rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1771 	if (XFS_IS_CORRUPT(args->mp,
1772 			   rlen != 0 &&
1773 			   (rlen > flen ||
1774 			    rbno + rlen > fbno + flen))) {
1775 		error = -EFSCORRUPTED;
1776 		goto error0;
1777 	}
1778 	if (rlen < args->maxlen) {
1779 		xfs_agblock_t	bestfbno;
1780 		xfs_extlen_t	bestflen;
1781 		xfs_agblock_t	bestrbno;
1782 		xfs_extlen_t	bestrlen;
1783 
1784 		bestrlen = rlen;
1785 		bestrbno = rbno;
1786 		bestflen = flen;
1787 		bestfbno = fbno;
1788 		for (;;) {
1789 			if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
1790 				goto error0;
1791 			if (i == 0)
1792 				break;
1793 			if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
1794 					&i)))
1795 				goto error0;
1796 			if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1797 				error = -EFSCORRUPTED;
1798 				goto error0;
1799 			}
1800 			if (flen < bestrlen)
1801 				break;
1802 			busy = xfs_alloc_compute_aligned(args, fbno, flen,
1803 					&rbno, &rlen, &busy_gen);
1804 			rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1805 			if (XFS_IS_CORRUPT(args->mp,
1806 					   rlen != 0 &&
1807 					   (rlen > flen ||
1808 					    rbno + rlen > fbno + flen))) {
1809 				error = -EFSCORRUPTED;
1810 				goto error0;
1811 			}
1812 			if (rlen > bestrlen) {
1813 				bestrlen = rlen;
1814 				bestrbno = rbno;
1815 				bestflen = flen;
1816 				bestfbno = fbno;
1817 				if (rlen == args->maxlen)
1818 					break;
1819 			}
1820 		}
1821 		if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
1822 				&i)))
1823 			goto error0;
1824 		if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1825 			error = -EFSCORRUPTED;
1826 			goto error0;
1827 		}
1828 		rlen = bestrlen;
1829 		rbno = bestrbno;
1830 		flen = bestflen;
1831 		fbno = bestfbno;
1832 	}
1833 	args->wasfromfl = 0;
1834 	/*
1835 	 * Fix up the length.
1836 	 */
1837 	args->len = rlen;
1838 	if (rlen < args->minlen) {
1839 		if (busy) {
1840 			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1841 			trace_xfs_alloc_size_busy(args);
1842 			xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
1843 			goto restart;
1844 		}
1845 		goto out_nominleft;
1846 	}
1847 	xfs_alloc_fix_len(args);
1848 
1849 	rlen = args->len;
1850 	if (XFS_IS_CORRUPT(args->mp, rlen > flen)) {
1851 		error = -EFSCORRUPTED;
1852 		goto error0;
1853 	}
1854 	/*
1855 	 * Allocate and initialize a cursor for the by-block tree.
1856 	 */
1857 	bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1858 					args->pag, XFS_BTNUM_BNO);
1859 	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
1860 			rbno, rlen, XFSA_FIXUP_CNT_OK)))
1861 		goto error0;
1862 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1863 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1864 	cnt_cur = bno_cur = NULL;
1865 	args->len = rlen;
1866 	args->agbno = rbno;
1867 	if (XFS_IS_CORRUPT(args->mp,
1868 			   args->agbno + args->len >
1869 			   be32_to_cpu(agf->agf_length))) {
1870 		error = -EFSCORRUPTED;
1871 		goto error0;
1872 	}
1873 	trace_xfs_alloc_size_done(args);
1874 	return 0;
1875 
1876 error0:
1877 	trace_xfs_alloc_size_error(args);
1878 	if (cnt_cur)
1879 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1880 	if (bno_cur)
1881 		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1882 	return error;
1883 
1884 out_nominleft:
1885 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1886 	trace_xfs_alloc_size_nominleft(args);
1887 	args->agbno = NULLAGBLOCK;
1888 	return 0;
1889 }
1890 
1891 /*
1892  * Free the extent starting at agno/bno for length.
1893  */
1894 STATIC int
1895 xfs_free_ag_extent(
1896 	struct xfs_trans		*tp,
1897 	struct xfs_buf			*agbp,
1898 	xfs_agnumber_t			agno,
1899 	xfs_agblock_t			bno,
1900 	xfs_extlen_t			len,
1901 	const struct xfs_owner_info	*oinfo,
1902 	enum xfs_ag_resv_type		type)
1903 {
1904 	struct xfs_mount		*mp;
1905 	struct xfs_btree_cur		*bno_cur;
1906 	struct xfs_btree_cur		*cnt_cur;
1907 	xfs_agblock_t			gtbno; /* start of right neighbor */
1908 	xfs_extlen_t			gtlen; /* length of right neighbor */
1909 	xfs_agblock_t			ltbno; /* start of left neighbor */
1910 	xfs_extlen_t			ltlen; /* length of left neighbor */
1911 	xfs_agblock_t			nbno; /* new starting block of freesp */
1912 	xfs_extlen_t			nlen; /* new length of freespace */
1913 	int				haveleft; /* have a left neighbor */
1914 	int				haveright; /* have a right neighbor */
1915 	int				i;
1916 	int				error;
1917 	struct xfs_perag		*pag = agbp->b_pag;
1918 
1919 	bno_cur = cnt_cur = NULL;
1920 	mp = tp->t_mountp;
1921 
1922 	if (!xfs_rmap_should_skip_owner_update(oinfo)) {
1923 		error = xfs_rmap_free(tp, agbp, pag, bno, len, oinfo);
1924 		if (error)
1925 			goto error0;
1926 	}
1927 
1928 	/*
1929 	 * Allocate and initialize a cursor for the by-block btree.
1930 	 */
1931 	bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_BNO);
1932 	/*
1933 	 * Look for a neighboring block on the left (lower block numbers)
1934 	 * that is contiguous with this space.
1935 	 */
1936 	if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft)))
1937 		goto error0;
1938 	if (haveleft) {
1939 		/*
1940 		 * There is a block to our left.
1941 		 */
1942 		if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
1943 			goto error0;
1944 		if (XFS_IS_CORRUPT(mp, i != 1)) {
1945 			error = -EFSCORRUPTED;
1946 			goto error0;
1947 		}
1948 		/*
1949 		 * It's not contiguous, though.
1950 		 */
1951 		if (ltbno + ltlen < bno)
1952 			haveleft = 0;
1953 		else {
1954 			/*
1955 			 * If this failure happens the request to free this
1956 			 * space was invalid, it's (partly) already free.
1957 			 * Very bad.
1958 			 */
1959 			if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) {
1960 				error = -EFSCORRUPTED;
1961 				goto error0;
1962 			}
1963 		}
1964 	}
1965 	/*
1966 	 * Look for a neighboring block on the right (higher block numbers)
1967 	 * that is contiguous with this space.
1968 	 */
1969 	if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
1970 		goto error0;
1971 	if (haveright) {
1972 		/*
1973 		 * There is a block to our right.
1974 		 */
1975 		if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
1976 			goto error0;
1977 		if (XFS_IS_CORRUPT(mp, i != 1)) {
1978 			error = -EFSCORRUPTED;
1979 			goto error0;
1980 		}
1981 		/*
1982 		 * It's not contiguous, though.
1983 		 */
1984 		if (bno + len < gtbno)
1985 			haveright = 0;
1986 		else {
1987 			/*
1988 			 * If this failure happens the request to free this
1989 			 * space was invalid, it's (partly) already free.
1990 			 * Very bad.
1991 			 */
1992 			if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) {
1993 				error = -EFSCORRUPTED;
1994 				goto error0;
1995 			}
1996 		}
1997 	}
1998 	/*
1999 	 * Now allocate and initialize a cursor for the by-size tree.
2000 	 */
2001 	cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_CNT);
2002 	/*
2003 	 * Have both left and right contiguous neighbors.
2004 	 * Merge all three into a single free block.
2005 	 */
2006 	if (haveleft && haveright) {
2007 		/*
2008 		 * Delete the old by-size entry on the left.
2009 		 */
2010 		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
2011 			goto error0;
2012 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2013 			error = -EFSCORRUPTED;
2014 			goto error0;
2015 		}
2016 		if ((error = xfs_btree_delete(cnt_cur, &i)))
2017 			goto error0;
2018 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2019 			error = -EFSCORRUPTED;
2020 			goto error0;
2021 		}
2022 		/*
2023 		 * Delete the old by-size entry on the right.
2024 		 */
2025 		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
2026 			goto error0;
2027 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2028 			error = -EFSCORRUPTED;
2029 			goto error0;
2030 		}
2031 		if ((error = xfs_btree_delete(cnt_cur, &i)))
2032 			goto error0;
2033 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2034 			error = -EFSCORRUPTED;
2035 			goto error0;
2036 		}
2037 		/*
2038 		 * Delete the old by-block entry for the right block.
2039 		 */
2040 		if ((error = xfs_btree_delete(bno_cur, &i)))
2041 			goto error0;
2042 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2043 			error = -EFSCORRUPTED;
2044 			goto error0;
2045 		}
2046 		/*
2047 		 * Move the by-block cursor back to the left neighbor.
2048 		 */
2049 		if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
2050 			goto error0;
2051 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2052 			error = -EFSCORRUPTED;
2053 			goto error0;
2054 		}
2055 #ifdef DEBUG
2056 		/*
2057 		 * Check that this is the right record: delete didn't
2058 		 * mangle the cursor.
2059 		 */
2060 		{
2061 			xfs_agblock_t	xxbno;
2062 			xfs_extlen_t	xxlen;
2063 
2064 			if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
2065 					&i)))
2066 				goto error0;
2067 			if (XFS_IS_CORRUPT(mp,
2068 					   i != 1 ||
2069 					   xxbno != ltbno ||
2070 					   xxlen != ltlen)) {
2071 				error = -EFSCORRUPTED;
2072 				goto error0;
2073 			}
2074 		}
2075 #endif
2076 		/*
2077 		 * Update remaining by-block entry to the new, joined block.
2078 		 */
2079 		nbno = ltbno;
2080 		nlen = len + ltlen + gtlen;
2081 		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2082 			goto error0;
2083 	}
2084 	/*
2085 	 * Have only a left contiguous neighbor.
2086 	 * Merge it together with the new freespace.
2087 	 */
2088 	else if (haveleft) {
2089 		/*
2090 		 * Delete the old by-size entry on the left.
2091 		 */
2092 		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
2093 			goto error0;
2094 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2095 			error = -EFSCORRUPTED;
2096 			goto error0;
2097 		}
2098 		if ((error = xfs_btree_delete(cnt_cur, &i)))
2099 			goto error0;
2100 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2101 			error = -EFSCORRUPTED;
2102 			goto error0;
2103 		}
2104 		/*
2105 		 * Back up the by-block cursor to the left neighbor, and
2106 		 * update its length.
2107 		 */
2108 		if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
2109 			goto error0;
2110 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2111 			error = -EFSCORRUPTED;
2112 			goto error0;
2113 		}
2114 		nbno = ltbno;
2115 		nlen = len + ltlen;
2116 		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2117 			goto error0;
2118 	}
2119 	/*
2120 	 * Have only a right contiguous neighbor.
2121 	 * Merge it together with the new freespace.
2122 	 */
2123 	else if (haveright) {
2124 		/*
2125 		 * Delete the old by-size entry on the right.
2126 		 */
2127 		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
2128 			goto error0;
2129 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2130 			error = -EFSCORRUPTED;
2131 			goto error0;
2132 		}
2133 		if ((error = xfs_btree_delete(cnt_cur, &i)))
2134 			goto error0;
2135 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2136 			error = -EFSCORRUPTED;
2137 			goto error0;
2138 		}
2139 		/*
2140 		 * Update the starting block and length of the right
2141 		 * neighbor in the by-block tree.
2142 		 */
2143 		nbno = bno;
2144 		nlen = len + gtlen;
2145 		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2146 			goto error0;
2147 	}
2148 	/*
2149 	 * No contiguous neighbors.
2150 	 * Insert the new freespace into the by-block tree.
2151 	 */
2152 	else {
2153 		nbno = bno;
2154 		nlen = len;
2155 		if ((error = xfs_btree_insert(bno_cur, &i)))
2156 			goto error0;
2157 		if (XFS_IS_CORRUPT(mp, i != 1)) {
2158 			error = -EFSCORRUPTED;
2159 			goto error0;
2160 		}
2161 	}
2162 	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
2163 	bno_cur = NULL;
2164 	/*
2165 	 * In all cases we need to insert the new freespace in the by-size tree.
2166 	 */
2167 	if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
2168 		goto error0;
2169 	if (XFS_IS_CORRUPT(mp, i != 0)) {
2170 		error = -EFSCORRUPTED;
2171 		goto error0;
2172 	}
2173 	if ((error = xfs_btree_insert(cnt_cur, &i)))
2174 		goto error0;
2175 	if (XFS_IS_CORRUPT(mp, i != 1)) {
2176 		error = -EFSCORRUPTED;
2177 		goto error0;
2178 	}
2179 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
2180 	cnt_cur = NULL;
2181 
2182 	/*
2183 	 * Update the freespace totals in the ag and superblock.
2184 	 */
2185 	error = xfs_alloc_update_counters(tp, agbp, len);
2186 	xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len);
2187 	if (error)
2188 		goto error0;
2189 
2190 	XFS_STATS_INC(mp, xs_freex);
2191 	XFS_STATS_ADD(mp, xs_freeb, len);
2192 
2193 	trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright);
2194 
2195 	return 0;
2196 
2197  error0:
2198 	trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1);
2199 	if (bno_cur)
2200 		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
2201 	if (cnt_cur)
2202 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
2203 	return error;
2204 }
2205 
2206 /*
2207  * Visible (exported) allocation/free functions.
2208  * Some of these are used just by xfs_alloc_btree.c and this file.
2209  */
2210 
2211 /*
2212  * Compute and fill in value of m_alloc_maxlevels.
2213  */
2214 void
2215 xfs_alloc_compute_maxlevels(
2216 	xfs_mount_t	*mp)	/* file system mount structure */
2217 {
2218 	mp->m_alloc_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr,
2219 			(mp->m_sb.sb_agblocks + 1) / 2);
2220 	ASSERT(mp->m_alloc_maxlevels <= xfs_allocbt_maxlevels_ondisk());
2221 }
2222 
2223 /*
2224  * Find the length of the longest extent in an AG.  The 'need' parameter
2225  * specifies how much space we're going to need for the AGFL and the
2226  * 'reserved' parameter tells us how many blocks in this AG are reserved for
2227  * other callers.
2228  */
2229 xfs_extlen_t
2230 xfs_alloc_longest_free_extent(
2231 	struct xfs_perag	*pag,
2232 	xfs_extlen_t		need,
2233 	xfs_extlen_t		reserved)
2234 {
2235 	xfs_extlen_t		delta = 0;
2236 
2237 	/*
2238 	 * If the AGFL needs a recharge, we'll have to subtract that from the
2239 	 * longest extent.
2240 	 */
2241 	if (need > pag->pagf_flcount)
2242 		delta = need - pag->pagf_flcount;
2243 
2244 	/*
2245 	 * If we cannot maintain others' reservations with space from the
2246 	 * not-longest freesp extents, we'll have to subtract /that/ from
2247 	 * the longest extent too.
2248 	 */
2249 	if (pag->pagf_freeblks - pag->pagf_longest < reserved)
2250 		delta += reserved - (pag->pagf_freeblks - pag->pagf_longest);
2251 
2252 	/*
2253 	 * If the longest extent is long enough to satisfy all the
2254 	 * reservations and AGFL rules in place, we can return this extent.
2255 	 */
2256 	if (pag->pagf_longest > delta)
2257 		return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable,
2258 				pag->pagf_longest - delta);
2259 
2260 	/* Otherwise, let the caller try for 1 block if there's space. */
2261 	return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
2262 }
2263 
2264 /*
2265  * Compute the minimum length of the AGFL in the given AG.  If @pag is NULL,
2266  * return the largest possible minimum length.
2267  */
2268 unsigned int
2269 xfs_alloc_min_freelist(
2270 	struct xfs_mount	*mp,
2271 	struct xfs_perag	*pag)
2272 {
2273 	/* AG btrees have at least 1 level. */
2274 	static const uint8_t	fake_levels[XFS_BTNUM_AGF] = {1, 1, 1};
2275 	const uint8_t		*levels = pag ? pag->pagf_levels : fake_levels;
2276 	unsigned int		min_free;
2277 
2278 	ASSERT(mp->m_alloc_maxlevels > 0);
2279 
2280 	/* space needed by-bno freespace btree */
2281 	min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1,
2282 				       mp->m_alloc_maxlevels);
2283 	/* space needed by-size freespace btree */
2284 	min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1,
2285 				       mp->m_alloc_maxlevels);
2286 	/* space needed reverse mapping used space btree */
2287 	if (xfs_has_rmapbt(mp))
2288 		min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1,
2289 						mp->m_rmap_maxlevels);
2290 
2291 	return min_free;
2292 }
2293 
2294 /*
2295  * Check if the operation we are fixing up the freelist for should go ahead or
2296  * not. If we are freeing blocks, we always allow it, otherwise the allocation
2297  * is dependent on whether the size and shape of free space available will
2298  * permit the requested allocation to take place.
2299  */
2300 static bool
2301 xfs_alloc_space_available(
2302 	struct xfs_alloc_arg	*args,
2303 	xfs_extlen_t		min_free,
2304 	int			flags)
2305 {
2306 	struct xfs_perag	*pag = args->pag;
2307 	xfs_extlen_t		alloc_len, longest;
2308 	xfs_extlen_t		reservation; /* blocks that are still reserved */
2309 	int			available;
2310 	xfs_extlen_t		agflcount;
2311 
2312 	if (flags & XFS_ALLOC_FLAG_FREEING)
2313 		return true;
2314 
2315 	reservation = xfs_ag_resv_needed(pag, args->resv);
2316 
2317 	/* do we have enough contiguous free space for the allocation? */
2318 	alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop;
2319 	longest = xfs_alloc_longest_free_extent(pag, min_free, reservation);
2320 	if (longest < alloc_len)
2321 		return false;
2322 
2323 	/*
2324 	 * Do we have enough free space remaining for the allocation? Don't
2325 	 * account extra agfl blocks because we are about to defer free them,
2326 	 * making them unavailable until the current transaction commits.
2327 	 */
2328 	agflcount = min_t(xfs_extlen_t, pag->pagf_flcount, min_free);
2329 	available = (int)(pag->pagf_freeblks + agflcount -
2330 			  reservation - min_free - args->minleft);
2331 	if (available < (int)max(args->total, alloc_len))
2332 		return false;
2333 
2334 	/*
2335 	 * Clamp maxlen to the amount of free space available for the actual
2336 	 * extent allocation.
2337 	 */
2338 	if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) {
2339 		args->maxlen = available;
2340 		ASSERT(args->maxlen > 0);
2341 		ASSERT(args->maxlen >= args->minlen);
2342 	}
2343 
2344 	return true;
2345 }
2346 
2347 int
2348 xfs_free_agfl_block(
2349 	struct xfs_trans	*tp,
2350 	xfs_agnumber_t		agno,
2351 	xfs_agblock_t		agbno,
2352 	struct xfs_buf		*agbp,
2353 	struct xfs_owner_info	*oinfo)
2354 {
2355 	int			error;
2356 	struct xfs_buf		*bp;
2357 
2358 	error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo,
2359 				   XFS_AG_RESV_AGFL);
2360 	if (error)
2361 		return error;
2362 
2363 	error = xfs_trans_get_buf(tp, tp->t_mountp->m_ddev_targp,
2364 			XFS_AGB_TO_DADDR(tp->t_mountp, agno, agbno),
2365 			tp->t_mountp->m_bsize, 0, &bp);
2366 	if (error)
2367 		return error;
2368 	xfs_trans_binval(tp, bp);
2369 
2370 	return 0;
2371 }
2372 
2373 /*
2374  * Check the agfl fields of the agf for inconsistency or corruption. The purpose
2375  * is to detect an agfl header padding mismatch between current and early v5
2376  * kernels. This problem manifests as a 1-slot size difference between the
2377  * on-disk flcount and the active [first, last] range of a wrapped agfl. This
2378  * may also catch variants of agfl count corruption unrelated to padding. Either
2379  * way, we'll reset the agfl and warn the user.
2380  *
2381  * Return true if a reset is required before the agfl can be used, false
2382  * otherwise.
2383  */
2384 static bool
2385 xfs_agfl_needs_reset(
2386 	struct xfs_mount	*mp,
2387 	struct xfs_agf		*agf)
2388 {
2389 	uint32_t		f = be32_to_cpu(agf->agf_flfirst);
2390 	uint32_t		l = be32_to_cpu(agf->agf_fllast);
2391 	uint32_t		c = be32_to_cpu(agf->agf_flcount);
2392 	int			agfl_size = xfs_agfl_size(mp);
2393 	int			active;
2394 
2395 	/* no agfl header on v4 supers */
2396 	if (!xfs_has_crc(mp))
2397 		return false;
2398 
2399 	/*
2400 	 * The agf read verifier catches severe corruption of these fields.
2401 	 * Repeat some sanity checks to cover a packed -> unpacked mismatch if
2402 	 * the verifier allows it.
2403 	 */
2404 	if (f >= agfl_size || l >= agfl_size)
2405 		return true;
2406 	if (c > agfl_size)
2407 		return true;
2408 
2409 	/*
2410 	 * Check consistency between the on-disk count and the active range. An
2411 	 * agfl padding mismatch manifests as an inconsistent flcount.
2412 	 */
2413 	if (c && l >= f)
2414 		active = l - f + 1;
2415 	else if (c)
2416 		active = agfl_size - f + l + 1;
2417 	else
2418 		active = 0;
2419 
2420 	return active != c;
2421 }
2422 
2423 /*
2424  * Reset the agfl to an empty state. Ignore/drop any existing blocks since the
2425  * agfl content cannot be trusted. Warn the user that a repair is required to
2426  * recover leaked blocks.
2427  *
2428  * The purpose of this mechanism is to handle filesystems affected by the agfl
2429  * header padding mismatch problem. A reset keeps the filesystem online with a
2430  * relatively minor free space accounting inconsistency rather than suffer the
2431  * inevitable crash from use of an invalid agfl block.
2432  */
2433 static void
2434 xfs_agfl_reset(
2435 	struct xfs_trans	*tp,
2436 	struct xfs_buf		*agbp,
2437 	struct xfs_perag	*pag)
2438 {
2439 	struct xfs_mount	*mp = tp->t_mountp;
2440 	struct xfs_agf		*agf = agbp->b_addr;
2441 
2442 	ASSERT(pag->pagf_agflreset);
2443 	trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_);
2444 
2445 	xfs_warn(mp,
2446 	       "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. "
2447 	       "Please unmount and run xfs_repair.",
2448 	         pag->pag_agno, pag->pagf_flcount);
2449 
2450 	agf->agf_flfirst = 0;
2451 	agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1);
2452 	agf->agf_flcount = 0;
2453 	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST |
2454 				    XFS_AGF_FLCOUNT);
2455 
2456 	pag->pagf_flcount = 0;
2457 	pag->pagf_agflreset = false;
2458 }
2459 
2460 /*
2461  * Defer an AGFL block free. This is effectively equivalent to
2462  * xfs_free_extent_later() with some special handling particular to AGFL blocks.
2463  *
2464  * Deferring AGFL frees helps prevent log reservation overruns due to too many
2465  * allocation operations in a transaction. AGFL frees are prone to this problem
2466  * because for one they are always freed one at a time. Further, an immediate
2467  * AGFL block free can cause a btree join and require another block free before
2468  * the real allocation can proceed. Deferring the free disconnects freeing up
2469  * the AGFL slot from freeing the block.
2470  */
2471 STATIC void
2472 xfs_defer_agfl_block(
2473 	struct xfs_trans		*tp,
2474 	xfs_agnumber_t			agno,
2475 	xfs_fsblock_t			agbno,
2476 	struct xfs_owner_info		*oinfo)
2477 {
2478 	struct xfs_mount		*mp = tp->t_mountp;
2479 	struct xfs_extent_free_item	*new;		/* new element */
2480 
2481 	ASSERT(xfs_extfree_item_cache != NULL);
2482 	ASSERT(oinfo != NULL);
2483 
2484 	new = kmem_cache_zalloc(xfs_extfree_item_cache,
2485 			       GFP_KERNEL | __GFP_NOFAIL);
2486 	new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
2487 	new->xefi_blockcount = 1;
2488 	new->xefi_owner = oinfo->oi_owner;
2489 
2490 	trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
2491 
2492 	xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list);
2493 }
2494 
2495 /*
2496  * Add the extent to the list of extents to be free at transaction end.
2497  * The list is maintained sorted (by block number).
2498  */
2499 void
2500 __xfs_free_extent_later(
2501 	struct xfs_trans		*tp,
2502 	xfs_fsblock_t			bno,
2503 	xfs_filblks_t			len,
2504 	const struct xfs_owner_info	*oinfo,
2505 	bool				skip_discard)
2506 {
2507 	struct xfs_extent_free_item	*new;		/* new element */
2508 #ifdef DEBUG
2509 	struct xfs_mount		*mp = tp->t_mountp;
2510 	xfs_agnumber_t			agno;
2511 	xfs_agblock_t			agbno;
2512 
2513 	ASSERT(bno != NULLFSBLOCK);
2514 	ASSERT(len > 0);
2515 	ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
2516 	ASSERT(!isnullstartblock(bno));
2517 	agno = XFS_FSB_TO_AGNO(mp, bno);
2518 	agbno = XFS_FSB_TO_AGBNO(mp, bno);
2519 	ASSERT(agno < mp->m_sb.sb_agcount);
2520 	ASSERT(agbno < mp->m_sb.sb_agblocks);
2521 	ASSERT(len < mp->m_sb.sb_agblocks);
2522 	ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
2523 #endif
2524 	ASSERT(xfs_extfree_item_cache != NULL);
2525 
2526 	new = kmem_cache_zalloc(xfs_extfree_item_cache,
2527 			       GFP_KERNEL | __GFP_NOFAIL);
2528 	new->xefi_startblock = bno;
2529 	new->xefi_blockcount = (xfs_extlen_t)len;
2530 	if (skip_discard)
2531 		new->xefi_flags |= XFS_EFI_SKIP_DISCARD;
2532 	if (oinfo) {
2533 		ASSERT(oinfo->oi_offset == 0);
2534 
2535 		if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
2536 			new->xefi_flags |= XFS_EFI_ATTR_FORK;
2537 		if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
2538 			new->xefi_flags |= XFS_EFI_BMBT_BLOCK;
2539 		new->xefi_owner = oinfo->oi_owner;
2540 	} else {
2541 		new->xefi_owner = XFS_RMAP_OWN_NULL;
2542 	}
2543 	trace_xfs_bmap_free_defer(tp->t_mountp,
2544 			XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0,
2545 			XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len);
2546 	xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list);
2547 }
2548 
2549 #ifdef DEBUG
2550 /*
2551  * Check if an AGF has a free extent record whose length is equal to
2552  * args->minlen.
2553  */
2554 STATIC int
2555 xfs_exact_minlen_extent_available(
2556 	struct xfs_alloc_arg	*args,
2557 	struct xfs_buf		*agbp,
2558 	int			*stat)
2559 {
2560 	struct xfs_btree_cur	*cnt_cur;
2561 	xfs_agblock_t		fbno;
2562 	xfs_extlen_t		flen;
2563 	int			error = 0;
2564 
2565 	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, agbp,
2566 					args->pag, XFS_BTNUM_CNT);
2567 	error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat);
2568 	if (error)
2569 		goto out;
2570 
2571 	if (*stat == 0) {
2572 		error = -EFSCORRUPTED;
2573 		goto out;
2574 	}
2575 
2576 	error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, stat);
2577 	if (error)
2578 		goto out;
2579 
2580 	if (*stat == 1 && flen != args->minlen)
2581 		*stat = 0;
2582 
2583 out:
2584 	xfs_btree_del_cursor(cnt_cur, error);
2585 
2586 	return error;
2587 }
2588 #endif
2589 
2590 /*
2591  * Decide whether to use this allocation group for this allocation.
2592  * If so, fix up the btree freelist's size.
2593  */
2594 int			/* error */
2595 xfs_alloc_fix_freelist(
2596 	struct xfs_alloc_arg	*args,	/* allocation argument structure */
2597 	int			flags)	/* XFS_ALLOC_FLAG_... */
2598 {
2599 	struct xfs_mount	*mp = args->mp;
2600 	struct xfs_perag	*pag = args->pag;
2601 	struct xfs_trans	*tp = args->tp;
2602 	struct xfs_buf		*agbp = NULL;
2603 	struct xfs_buf		*agflbp = NULL;
2604 	struct xfs_alloc_arg	targs;	/* local allocation arguments */
2605 	xfs_agblock_t		bno;	/* freelist block */
2606 	xfs_extlen_t		need;	/* total blocks needed in freelist */
2607 	int			error = 0;
2608 
2609 	/* deferred ops (AGFL block frees) require permanent transactions */
2610 	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
2611 
2612 	if (!pag->pagf_init) {
2613 		error = xfs_alloc_read_agf(pag, tp, flags, &agbp);
2614 		if (error) {
2615 			/* Couldn't lock the AGF so skip this AG. */
2616 			if (error == -EAGAIN)
2617 				error = 0;
2618 			goto out_no_agbp;
2619 		}
2620 	}
2621 
2622 	/*
2623 	 * If this is a metadata preferred pag and we are user data then try
2624 	 * somewhere else if we are not being asked to try harder at this
2625 	 * point
2626 	 */
2627 	if (pag->pagf_metadata && (args->datatype & XFS_ALLOC_USERDATA) &&
2628 	    (flags & XFS_ALLOC_FLAG_TRYLOCK)) {
2629 		ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
2630 		goto out_agbp_relse;
2631 	}
2632 
2633 	need = xfs_alloc_min_freelist(mp, pag);
2634 	if (!xfs_alloc_space_available(args, need, flags |
2635 			XFS_ALLOC_FLAG_CHECK))
2636 		goto out_agbp_relse;
2637 
2638 	/*
2639 	 * Get the a.g. freespace buffer.
2640 	 * Can fail if we're not blocking on locks, and it's held.
2641 	 */
2642 	if (!agbp) {
2643 		error = xfs_alloc_read_agf(pag, tp, flags, &agbp);
2644 		if (error) {
2645 			/* Couldn't lock the AGF so skip this AG. */
2646 			if (error == -EAGAIN)
2647 				error = 0;
2648 			goto out_no_agbp;
2649 		}
2650 	}
2651 
2652 	/* reset a padding mismatched agfl before final free space check */
2653 	if (pag->pagf_agflreset)
2654 		xfs_agfl_reset(tp, agbp, pag);
2655 
2656 	/* If there isn't enough total space or single-extent, reject it. */
2657 	need = xfs_alloc_min_freelist(mp, pag);
2658 	if (!xfs_alloc_space_available(args, need, flags))
2659 		goto out_agbp_relse;
2660 
2661 #ifdef DEBUG
2662 	if (args->alloc_minlen_only) {
2663 		int stat;
2664 
2665 		error = xfs_exact_minlen_extent_available(args, agbp, &stat);
2666 		if (error || !stat)
2667 			goto out_agbp_relse;
2668 	}
2669 #endif
2670 	/*
2671 	 * Make the freelist shorter if it's too long.
2672 	 *
2673 	 * Note that from this point onwards, we will always release the agf and
2674 	 * agfl buffers on error. This handles the case where we error out and
2675 	 * the buffers are clean or may not have been joined to the transaction
2676 	 * and hence need to be released manually. If they have been joined to
2677 	 * the transaction, then xfs_trans_brelse() will handle them
2678 	 * appropriately based on the recursion count and dirty state of the
2679 	 * buffer.
2680 	 *
2681 	 * XXX (dgc): When we have lots of free space, does this buy us
2682 	 * anything other than extra overhead when we need to put more blocks
2683 	 * back on the free list? Maybe we should only do this when space is
2684 	 * getting low or the AGFL is more than half full?
2685 	 *
2686 	 * The NOSHRINK flag prevents the AGFL from being shrunk if it's too
2687 	 * big; the NORMAP flag prevents AGFL expand/shrink operations from
2688 	 * updating the rmapbt.  Both flags are used in xfs_repair while we're
2689 	 * rebuilding the rmapbt, and neither are used by the kernel.  They're
2690 	 * both required to ensure that rmaps are correctly recorded for the
2691 	 * regenerated AGFL, bnobt, and cntbt.  See repair/phase5.c and
2692 	 * repair/rmap.c in xfsprogs for details.
2693 	 */
2694 	memset(&targs, 0, sizeof(targs));
2695 	/* struct copy below */
2696 	if (flags & XFS_ALLOC_FLAG_NORMAP)
2697 		targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
2698 	else
2699 		targs.oinfo = XFS_RMAP_OINFO_AG;
2700 	while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) {
2701 		error = xfs_alloc_get_freelist(pag, tp, agbp, &bno, 0);
2702 		if (error)
2703 			goto out_agbp_relse;
2704 
2705 		/* defer agfl frees */
2706 		xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo);
2707 	}
2708 
2709 	targs.tp = tp;
2710 	targs.mp = mp;
2711 	targs.agbp = agbp;
2712 	targs.agno = args->agno;
2713 	targs.alignment = targs.minlen = targs.prod = 1;
2714 	targs.type = XFS_ALLOCTYPE_THIS_AG;
2715 	targs.pag = pag;
2716 	error = xfs_alloc_read_agfl(pag, tp, &agflbp);
2717 	if (error)
2718 		goto out_agbp_relse;
2719 
2720 	/* Make the freelist longer if it's too short. */
2721 	while (pag->pagf_flcount < need) {
2722 		targs.agbno = 0;
2723 		targs.maxlen = need - pag->pagf_flcount;
2724 		targs.resv = XFS_AG_RESV_AGFL;
2725 
2726 		/* Allocate as many blocks as possible at once. */
2727 		error = xfs_alloc_ag_vextent(&targs);
2728 		if (error)
2729 			goto out_agflbp_relse;
2730 
2731 		/*
2732 		 * Stop if we run out.  Won't happen if callers are obeying
2733 		 * the restrictions correctly.  Can happen for free calls
2734 		 * on a completely full ag.
2735 		 */
2736 		if (targs.agbno == NULLAGBLOCK) {
2737 			if (flags & XFS_ALLOC_FLAG_FREEING)
2738 				break;
2739 			goto out_agflbp_relse;
2740 		}
2741 		/*
2742 		 * Put each allocated block on the list.
2743 		 */
2744 		for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
2745 			error = xfs_alloc_put_freelist(pag, tp, agbp,
2746 							agflbp, bno, 0);
2747 			if (error)
2748 				goto out_agflbp_relse;
2749 		}
2750 	}
2751 	xfs_trans_brelse(tp, agflbp);
2752 	args->agbp = agbp;
2753 	return 0;
2754 
2755 out_agflbp_relse:
2756 	xfs_trans_brelse(tp, agflbp);
2757 out_agbp_relse:
2758 	if (agbp)
2759 		xfs_trans_brelse(tp, agbp);
2760 out_no_agbp:
2761 	args->agbp = NULL;
2762 	return error;
2763 }
2764 
2765 /*
2766  * Get a block from the freelist.
2767  * Returns with the buffer for the block gotten.
2768  */
2769 int
2770 xfs_alloc_get_freelist(
2771 	struct xfs_perag	*pag,
2772 	struct xfs_trans	*tp,
2773 	struct xfs_buf		*agbp,
2774 	xfs_agblock_t		*bnop,
2775 	int			btreeblk)
2776 {
2777 	struct xfs_agf		*agf = agbp->b_addr;
2778 	struct xfs_buf		*agflbp;
2779 	xfs_agblock_t		bno;
2780 	__be32			*agfl_bno;
2781 	int			error;
2782 	uint32_t		logflags;
2783 	struct xfs_mount	*mp = tp->t_mountp;
2784 
2785 	/*
2786 	 * Freelist is empty, give up.
2787 	 */
2788 	if (!agf->agf_flcount) {
2789 		*bnop = NULLAGBLOCK;
2790 		return 0;
2791 	}
2792 	/*
2793 	 * Read the array of free blocks.
2794 	 */
2795 	error = xfs_alloc_read_agfl(pag, tp, &agflbp);
2796 	if (error)
2797 		return error;
2798 
2799 
2800 	/*
2801 	 * Get the block number and update the data structures.
2802 	 */
2803 	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
2804 	bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
2805 	be32_add_cpu(&agf->agf_flfirst, 1);
2806 	xfs_trans_brelse(tp, agflbp);
2807 	if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
2808 		agf->agf_flfirst = 0;
2809 
2810 	ASSERT(!pag->pagf_agflreset);
2811 	be32_add_cpu(&agf->agf_flcount, -1);
2812 	pag->pagf_flcount--;
2813 
2814 	logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
2815 	if (btreeblk) {
2816 		be32_add_cpu(&agf->agf_btreeblks, 1);
2817 		pag->pagf_btreeblks++;
2818 		logflags |= XFS_AGF_BTREEBLKS;
2819 	}
2820 
2821 	xfs_alloc_log_agf(tp, agbp, logflags);
2822 	*bnop = bno;
2823 
2824 	return 0;
2825 }
2826 
2827 /*
2828  * Log the given fields from the agf structure.
2829  */
2830 void
2831 xfs_alloc_log_agf(
2832 	struct xfs_trans	*tp,
2833 	struct xfs_buf		*bp,
2834 	uint32_t		fields)
2835 {
2836 	int	first;		/* first byte offset */
2837 	int	last;		/* last byte offset */
2838 	static const short	offsets[] = {
2839 		offsetof(xfs_agf_t, agf_magicnum),
2840 		offsetof(xfs_agf_t, agf_versionnum),
2841 		offsetof(xfs_agf_t, agf_seqno),
2842 		offsetof(xfs_agf_t, agf_length),
2843 		offsetof(xfs_agf_t, agf_roots[0]),
2844 		offsetof(xfs_agf_t, agf_levels[0]),
2845 		offsetof(xfs_agf_t, agf_flfirst),
2846 		offsetof(xfs_agf_t, agf_fllast),
2847 		offsetof(xfs_agf_t, agf_flcount),
2848 		offsetof(xfs_agf_t, agf_freeblks),
2849 		offsetof(xfs_agf_t, agf_longest),
2850 		offsetof(xfs_agf_t, agf_btreeblks),
2851 		offsetof(xfs_agf_t, agf_uuid),
2852 		offsetof(xfs_agf_t, agf_rmap_blocks),
2853 		offsetof(xfs_agf_t, agf_refcount_blocks),
2854 		offsetof(xfs_agf_t, agf_refcount_root),
2855 		offsetof(xfs_agf_t, agf_refcount_level),
2856 		/* needed so that we don't log the whole rest of the structure: */
2857 		offsetof(xfs_agf_t, agf_spare64),
2858 		sizeof(xfs_agf_t)
2859 	};
2860 
2861 	trace_xfs_agf(tp->t_mountp, bp->b_addr, fields, _RET_IP_);
2862 
2863 	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);
2864 
2865 	xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
2866 	xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
2867 }
2868 
2869 /*
2870  * Put the block on the freelist for the allocation group.
2871  */
2872 int
2873 xfs_alloc_put_freelist(
2874 	struct xfs_perag	*pag,
2875 	struct xfs_trans	*tp,
2876 	struct xfs_buf		*agbp,
2877 	struct xfs_buf		*agflbp,
2878 	xfs_agblock_t		bno,
2879 	int			btreeblk)
2880 {
2881 	struct xfs_mount	*mp = tp->t_mountp;
2882 	struct xfs_agf		*agf = agbp->b_addr;
2883 	__be32			*blockp;
2884 	int			error;
2885 	uint32_t		logflags;
2886 	__be32			*agfl_bno;
2887 	int			startoff;
2888 
2889 	if (!agflbp) {
2890 		error = xfs_alloc_read_agfl(pag, tp, &agflbp);
2891 		if (error)
2892 			return error;
2893 	}
2894 
2895 	be32_add_cpu(&agf->agf_fllast, 1);
2896 	if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp))
2897 		agf->agf_fllast = 0;
2898 
2899 	ASSERT(!pag->pagf_agflreset);
2900 	be32_add_cpu(&agf->agf_flcount, 1);
2901 	pag->pagf_flcount++;
2902 
2903 	logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
2904 	if (btreeblk) {
2905 		be32_add_cpu(&agf->agf_btreeblks, -1);
2906 		pag->pagf_btreeblks--;
2907 		logflags |= XFS_AGF_BTREEBLKS;
2908 	}
2909 
2910 	xfs_alloc_log_agf(tp, agbp, logflags);
2911 
2912 	ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp));
2913 
2914 	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
2915 	blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
2916 	*blockp = cpu_to_be32(bno);
2917 	startoff = (char *)blockp - (char *)agflbp->b_addr;
2918 
2919 	xfs_alloc_log_agf(tp, agbp, logflags);
2920 
2921 	xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF);
2922 	xfs_trans_log_buf(tp, agflbp, startoff,
2923 			  startoff + sizeof(xfs_agblock_t) - 1);
2924 	return 0;
2925 }
2926 
2927 static xfs_failaddr_t
2928 xfs_agf_verify(
2929 	struct xfs_buf		*bp)
2930 {
2931 	struct xfs_mount	*mp = bp->b_mount;
2932 	struct xfs_agf		*agf = bp->b_addr;
2933 
2934 	if (xfs_has_crc(mp)) {
2935 		if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
2936 			return __this_address;
2937 		if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn)))
2938 			return __this_address;
2939 	}
2940 
2941 	if (!xfs_verify_magic(bp, agf->agf_magicnum))
2942 		return __this_address;
2943 
2944 	if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
2945 	      be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
2946 	      be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) &&
2947 	      be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) &&
2948 	      be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)))
2949 		return __this_address;
2950 
2951 	if (be32_to_cpu(agf->agf_length) > mp->m_sb.sb_dblocks)
2952 		return __this_address;
2953 
2954 	if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) ||
2955 	    be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length))
2956 		return __this_address;
2957 
2958 	if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
2959 	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 ||
2960 	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) >
2961 						mp->m_alloc_maxlevels ||
2962 	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) >
2963 						mp->m_alloc_maxlevels)
2964 		return __this_address;
2965 
2966 	if (xfs_has_rmapbt(mp) &&
2967 	    (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 ||
2968 	     be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) >
2969 						mp->m_rmap_maxlevels))
2970 		return __this_address;
2971 
2972 	if (xfs_has_rmapbt(mp) &&
2973 	    be32_to_cpu(agf->agf_rmap_blocks) > be32_to_cpu(agf->agf_length))
2974 		return __this_address;
2975 
2976 	/*
2977 	 * during growfs operations, the perag is not fully initialised,
2978 	 * so we can't use it for any useful checking. growfs ensures we can't
2979 	 * use it by using uncached buffers that don't have the perag attached
2980 	 * so we can detect and avoid this problem.
2981 	 */
2982 	if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno)
2983 		return __this_address;
2984 
2985 	if (xfs_has_lazysbcount(mp) &&
2986 	    be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))
2987 		return __this_address;
2988 
2989 	if (xfs_has_reflink(mp) &&
2990 	    be32_to_cpu(agf->agf_refcount_blocks) >
2991 	    be32_to_cpu(agf->agf_length))
2992 		return __this_address;
2993 
2994 	if (xfs_has_reflink(mp) &&
2995 	    (be32_to_cpu(agf->agf_refcount_level) < 1 ||
2996 	     be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels))
2997 		return __this_address;
2998 
2999 	return NULL;
3000 
3001 }
3002 
3003 static void
3004 xfs_agf_read_verify(
3005 	struct xfs_buf	*bp)
3006 {
3007 	struct xfs_mount *mp = bp->b_mount;
3008 	xfs_failaddr_t	fa;
3009 
3010 	if (xfs_has_crc(mp) &&
3011 	    !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
3012 		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
3013 	else {
3014 		fa = xfs_agf_verify(bp);
3015 		if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF))
3016 			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
3017 	}
3018 }
3019 
3020 static void
3021 xfs_agf_write_verify(
3022 	struct xfs_buf	*bp)
3023 {
3024 	struct xfs_mount	*mp = bp->b_mount;
3025 	struct xfs_buf_log_item	*bip = bp->b_log_item;
3026 	struct xfs_agf		*agf = bp->b_addr;
3027 	xfs_failaddr_t		fa;
3028 
3029 	fa = xfs_agf_verify(bp);
3030 	if (fa) {
3031 		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
3032 		return;
3033 	}
3034 
3035 	if (!xfs_has_crc(mp))
3036 		return;
3037 
3038 	if (bip)
3039 		agf->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
3040 
3041 	xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
3042 }
3043 
3044 const struct xfs_buf_ops xfs_agf_buf_ops = {
3045 	.name = "xfs_agf",
3046 	.magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) },
3047 	.verify_read = xfs_agf_read_verify,
3048 	.verify_write = xfs_agf_write_verify,
3049 	.verify_struct = xfs_agf_verify,
3050 };
3051 
3052 /*
3053  * Read in the allocation group header (free/alloc section).
3054  */
3055 int
3056 xfs_read_agf(
3057 	struct xfs_perag	*pag,
3058 	struct xfs_trans	*tp,
3059 	int			flags,
3060 	struct xfs_buf		**agfbpp)
3061 {
3062 	struct xfs_mount	*mp = pag->pag_mount;
3063 	int			error;
3064 
3065 	trace_xfs_read_agf(pag->pag_mount, pag->pag_agno);
3066 
3067 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
3068 			XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)),
3069 			XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops);
3070 	if (error)
3071 		return error;
3072 
3073 	xfs_buf_set_ref(*agfbpp, XFS_AGF_REF);
3074 	return 0;
3075 }
3076 
3077 /*
3078  * Read in the allocation group header (free/alloc section) and initialise the
3079  * perag structure if necessary. If the caller provides @agfbpp, then return the
3080  * locked buffer to the caller, otherwise free it.
3081  */
3082 int
3083 xfs_alloc_read_agf(
3084 	struct xfs_perag	*pag,
3085 	struct xfs_trans	*tp,
3086 	int			flags,
3087 	struct xfs_buf		**agfbpp)
3088 {
3089 	struct xfs_buf		*agfbp;
3090 	struct xfs_agf		*agf;
3091 	int			error;
3092 	int			allocbt_blks;
3093 
3094 	trace_xfs_alloc_read_agf(pag->pag_mount, pag->pag_agno);
3095 
3096 	/* We don't support trylock when freeing. */
3097 	ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) !=
3098 			(XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK));
3099 	error = xfs_read_agf(pag, tp,
3100 			(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
3101 			&agfbp);
3102 	if (error)
3103 		return error;
3104 
3105 	agf = agfbp->b_addr;
3106 	if (!pag->pagf_init) {
3107 		pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
3108 		pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
3109 		pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
3110 		pag->pagf_longest = be32_to_cpu(agf->agf_longest);
3111 		pag->pagf_levels[XFS_BTNUM_BNOi] =
3112 			be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
3113 		pag->pagf_levels[XFS_BTNUM_CNTi] =
3114 			be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
3115 		pag->pagf_levels[XFS_BTNUM_RMAPi] =
3116 			be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
3117 		pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
3118 		pag->pagf_init = 1;
3119 		pag->pagf_agflreset = xfs_agfl_needs_reset(pag->pag_mount, agf);
3120 
3121 		/*
3122 		 * Update the in-core allocbt counter. Filter out the rmapbt
3123 		 * subset of the btreeblks counter because the rmapbt is managed
3124 		 * by perag reservation. Subtract one for the rmapbt root block
3125 		 * because the rmap counter includes it while the btreeblks
3126 		 * counter only tracks non-root blocks.
3127 		 */
3128 		allocbt_blks = pag->pagf_btreeblks;
3129 		if (xfs_has_rmapbt(pag->pag_mount))
3130 			allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1;
3131 		if (allocbt_blks > 0)
3132 			atomic64_add(allocbt_blks,
3133 					&pag->pag_mount->m_allocbt_blks);
3134 	}
3135 #ifdef DEBUG
3136 	else if (!xfs_is_shutdown(pag->pag_mount)) {
3137 		ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
3138 		ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
3139 		ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
3140 		ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
3141 		ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
3142 		       be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]));
3143 		ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] ==
3144 		       be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
3145 	}
3146 #endif
3147 	if (agfbpp)
3148 		*agfbpp = agfbp;
3149 	else
3150 		xfs_trans_brelse(tp, agfbp);
3151 	return 0;
3152 }
3153 
3154 /*
3155  * Allocate an extent (variable-size).
3156  * Depending on the allocation type, we either look in a single allocation
3157  * group or loop over the allocation groups to find the result.
3158  */
3159 int				/* error */
3160 xfs_alloc_vextent(
3161 	struct xfs_alloc_arg	*args)	/* allocation argument structure */
3162 {
3163 	xfs_agblock_t		agsize;	/* allocation group size */
3164 	int			error;
3165 	int			flags;	/* XFS_ALLOC_FLAG_... locking flags */
3166 	struct xfs_mount	*mp;	/* mount structure pointer */
3167 	xfs_agnumber_t		sagno;	/* starting allocation group number */
3168 	xfs_alloctype_t		type;	/* input allocation type */
3169 	int			bump_rotor = 0;
3170 	xfs_agnumber_t		rotorstep = xfs_rotorstep; /* inode32 agf stepper */
3171 
3172 	mp = args->mp;
3173 	type = args->otype = args->type;
3174 	args->agbno = NULLAGBLOCK;
3175 	/*
3176 	 * Just fix this up, for the case where the last a.g. is shorter
3177 	 * (or there's only one a.g.) and the caller couldn't easily figure
3178 	 * that out (xfs_bmap_alloc).
3179 	 */
3180 	agsize = mp->m_sb.sb_agblocks;
3181 	if (args->maxlen > agsize)
3182 		args->maxlen = agsize;
3183 	if (args->alignment == 0)
3184 		args->alignment = 1;
3185 	ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount);
3186 	ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize);
3187 	ASSERT(args->minlen <= args->maxlen);
3188 	ASSERT(args->minlen <= agsize);
3189 	ASSERT(args->mod < args->prod);
3190 	if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount ||
3191 	    XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize ||
3192 	    args->minlen > args->maxlen || args->minlen > agsize ||
3193 	    args->mod >= args->prod) {
3194 		args->fsbno = NULLFSBLOCK;
3195 		trace_xfs_alloc_vextent_badargs(args);
3196 		return 0;
3197 	}
3198 
3199 	switch (type) {
3200 	case XFS_ALLOCTYPE_THIS_AG:
3201 	case XFS_ALLOCTYPE_NEAR_BNO:
3202 	case XFS_ALLOCTYPE_THIS_BNO:
3203 		/*
3204 		 * These three force us into a single a.g.
3205 		 */
3206 		args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
3207 		args->pag = xfs_perag_get(mp, args->agno);
3208 		error = xfs_alloc_fix_freelist(args, 0);
3209 		if (error) {
3210 			trace_xfs_alloc_vextent_nofix(args);
3211 			goto error0;
3212 		}
3213 		if (!args->agbp) {
3214 			trace_xfs_alloc_vextent_noagbp(args);
3215 			break;
3216 		}
3217 		args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
3218 		if ((error = xfs_alloc_ag_vextent(args)))
3219 			goto error0;
3220 		break;
3221 	case XFS_ALLOCTYPE_START_BNO:
3222 		/*
3223 		 * Try near allocation first, then anywhere-in-ag after
3224 		 * the first a.g. fails.
3225 		 */
3226 		if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
3227 		    xfs_is_inode32(mp)) {
3228 			args->fsbno = XFS_AGB_TO_FSB(mp,
3229 					((mp->m_agfrotor / rotorstep) %
3230 					mp->m_sb.sb_agcount), 0);
3231 			bump_rotor = 1;
3232 		}
3233 		args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
3234 		args->type = XFS_ALLOCTYPE_NEAR_BNO;
3235 		fallthrough;
3236 	case XFS_ALLOCTYPE_FIRST_AG:
3237 		/*
3238 		 * Rotate through the allocation groups looking for a winner.
3239 		 */
3240 		if (type == XFS_ALLOCTYPE_FIRST_AG) {
3241 			/*
3242 			 * Start with allocation group given by bno.
3243 			 */
3244 			args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
3245 			args->type = XFS_ALLOCTYPE_THIS_AG;
3246 			sagno = 0;
3247 			flags = 0;
3248 		} else {
3249 			/*
3250 			 * Start with the given allocation group.
3251 			 */
3252 			args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
3253 			flags = XFS_ALLOC_FLAG_TRYLOCK;
3254 		}
3255 		/*
3256 		 * Loop over allocation groups twice; first time with
3257 		 * trylock set, second time without.
3258 		 */
3259 		for (;;) {
3260 			args->pag = xfs_perag_get(mp, args->agno);
3261 			error = xfs_alloc_fix_freelist(args, flags);
3262 			if (error) {
3263 				trace_xfs_alloc_vextent_nofix(args);
3264 				goto error0;
3265 			}
3266 			/*
3267 			 * If we get a buffer back then the allocation will fly.
3268 			 */
3269 			if (args->agbp) {
3270 				if ((error = xfs_alloc_ag_vextent(args)))
3271 					goto error0;
3272 				break;
3273 			}
3274 
3275 			trace_xfs_alloc_vextent_loopfailed(args);
3276 
3277 			/*
3278 			 * Didn't work, figure out the next iteration.
3279 			 */
3280 			if (args->agno == sagno &&
3281 			    type == XFS_ALLOCTYPE_START_BNO)
3282 				args->type = XFS_ALLOCTYPE_THIS_AG;
3283 			/*
3284 			* For the first allocation, we can try any AG to get
3285 			* space.  However, if we already have allocated a
3286 			* block, we don't want to try AGs whose number is below
3287 			* sagno. Otherwise, we may end up with out-of-order
3288 			* locking of AGF, which might cause deadlock.
3289 			*/
3290 			if (++(args->agno) == mp->m_sb.sb_agcount) {
3291 				if (args->tp->t_firstblock != NULLFSBLOCK)
3292 					args->agno = sagno;
3293 				else
3294 					args->agno = 0;
3295 			}
3296 			/*
3297 			 * Reached the starting a.g., must either be done
3298 			 * or switch to non-trylock mode.
3299 			 */
3300 			if (args->agno == sagno) {
3301 				if (flags == 0) {
3302 					args->agbno = NULLAGBLOCK;
3303 					trace_xfs_alloc_vextent_allfailed(args);
3304 					break;
3305 				}
3306 
3307 				flags = 0;
3308 				if (type == XFS_ALLOCTYPE_START_BNO) {
3309 					args->agbno = XFS_FSB_TO_AGBNO(mp,
3310 						args->fsbno);
3311 					args->type = XFS_ALLOCTYPE_NEAR_BNO;
3312 				}
3313 			}
3314 			xfs_perag_put(args->pag);
3315 		}
3316 		if (bump_rotor) {
3317 			if (args->agno == sagno)
3318 				mp->m_agfrotor = (mp->m_agfrotor + 1) %
3319 					(mp->m_sb.sb_agcount * rotorstep);
3320 			else
3321 				mp->m_agfrotor = (args->agno * rotorstep + 1) %
3322 					(mp->m_sb.sb_agcount * rotorstep);
3323 		}
3324 		break;
3325 	default:
3326 		ASSERT(0);
3327 		/* NOTREACHED */
3328 	}
3329 	if (args->agbno == NULLAGBLOCK)
3330 		args->fsbno = NULLFSBLOCK;
3331 	else {
3332 		args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
3333 #ifdef DEBUG
3334 		ASSERT(args->len >= args->minlen);
3335 		ASSERT(args->len <= args->maxlen);
3336 		ASSERT(args->agbno % args->alignment == 0);
3337 		XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
3338 			args->len);
3339 #endif
3340 
3341 	}
3342 	xfs_perag_put(args->pag);
3343 	return 0;
3344 error0:
3345 	xfs_perag_put(args->pag);
3346 	return error;
3347 }
3348 
3349 /* Ensure that the freelist is at full capacity. */
3350 int
3351 xfs_free_extent_fix_freelist(
3352 	struct xfs_trans	*tp,
3353 	struct xfs_perag	*pag,
3354 	struct xfs_buf		**agbp)
3355 {
3356 	struct xfs_alloc_arg	args;
3357 	int			error;
3358 
3359 	memset(&args, 0, sizeof(struct xfs_alloc_arg));
3360 	args.tp = tp;
3361 	args.mp = tp->t_mountp;
3362 	args.agno = pag->pag_agno;
3363 	args.pag = pag;
3364 
3365 	/*
3366 	 * validate that the block number is legal - the enables us to detect
3367 	 * and handle a silent filesystem corruption rather than crashing.
3368 	 */
3369 	if (args.agno >= args.mp->m_sb.sb_agcount)
3370 		return -EFSCORRUPTED;
3371 
3372 	error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
3373 	if (error)
3374 		return error;
3375 
3376 	*agbp = args.agbp;
3377 	return 0;
3378 }
3379 
3380 /*
3381  * Free an extent.
3382  * Just break up the extent address and hand off to xfs_free_ag_extent
3383  * after fixing up the freelist.
3384  */
3385 int
3386 __xfs_free_extent(
3387 	struct xfs_trans		*tp,
3388 	xfs_fsblock_t			bno,
3389 	xfs_extlen_t			len,
3390 	const struct xfs_owner_info	*oinfo,
3391 	enum xfs_ag_resv_type		type,
3392 	bool				skip_discard)
3393 {
3394 	struct xfs_mount		*mp = tp->t_mountp;
3395 	struct xfs_buf			*agbp;
3396 	xfs_agnumber_t			agno = XFS_FSB_TO_AGNO(mp, bno);
3397 	xfs_agblock_t			agbno = XFS_FSB_TO_AGBNO(mp, bno);
3398 	struct xfs_agf			*agf;
3399 	int				error;
3400 	unsigned int			busy_flags = 0;
3401 	struct xfs_perag		*pag;
3402 
3403 	ASSERT(len != 0);
3404 	ASSERT(type != XFS_AG_RESV_AGFL);
3405 
3406 	if (XFS_TEST_ERROR(false, mp,
3407 			XFS_ERRTAG_FREE_EXTENT))
3408 		return -EIO;
3409 
3410 	pag = xfs_perag_get(mp, agno);
3411 	error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
3412 	if (error)
3413 		goto err;
3414 	agf = agbp->b_addr;
3415 
3416 	if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
3417 		error = -EFSCORRUPTED;
3418 		goto err_release;
3419 	}
3420 
3421 	/* validate the extent size is legal now we have the agf locked */
3422 	if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) {
3423 		error = -EFSCORRUPTED;
3424 		goto err_release;
3425 	}
3426 
3427 	error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type);
3428 	if (error)
3429 		goto err_release;
3430 
3431 	if (skip_discard)
3432 		busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
3433 	xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags);
3434 	xfs_perag_put(pag);
3435 	return 0;
3436 
3437 err_release:
3438 	xfs_trans_brelse(tp, agbp);
3439 err:
3440 	xfs_perag_put(pag);
3441 	return error;
3442 }
3443 
3444 struct xfs_alloc_query_range_info {
3445 	xfs_alloc_query_range_fn	fn;
3446 	void				*priv;
3447 };
3448 
3449 /* Format btree record and pass to our callback. */
3450 STATIC int
3451 xfs_alloc_query_range_helper(
3452 	struct xfs_btree_cur		*cur,
3453 	const union xfs_btree_rec	*rec,
3454 	void				*priv)
3455 {
3456 	struct xfs_alloc_query_range_info	*query = priv;
3457 	struct xfs_alloc_rec_incore		irec;
3458 
3459 	irec.ar_startblock = be32_to_cpu(rec->alloc.ar_startblock);
3460 	irec.ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
3461 	return query->fn(cur, &irec, query->priv);
3462 }
3463 
3464 /* Find all free space within a given range of blocks. */
3465 int
3466 xfs_alloc_query_range(
3467 	struct xfs_btree_cur			*cur,
3468 	const struct xfs_alloc_rec_incore	*low_rec,
3469 	const struct xfs_alloc_rec_incore	*high_rec,
3470 	xfs_alloc_query_range_fn		fn,
3471 	void					*priv)
3472 {
3473 	union xfs_btree_irec			low_brec;
3474 	union xfs_btree_irec			high_brec;
3475 	struct xfs_alloc_query_range_info	query;
3476 
3477 	ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
3478 	low_brec.a = *low_rec;
3479 	high_brec.a = *high_rec;
3480 	query.priv = priv;
3481 	query.fn = fn;
3482 	return xfs_btree_query_range(cur, &low_brec, &high_brec,
3483 			xfs_alloc_query_range_helper, &query);
3484 }
3485 
3486 /* Find all free space records. */
3487 int
3488 xfs_alloc_query_all(
3489 	struct xfs_btree_cur			*cur,
3490 	xfs_alloc_query_range_fn		fn,
3491 	void					*priv)
3492 {
3493 	struct xfs_alloc_query_range_info	query;
3494 
3495 	ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
3496 	query.priv = priv;
3497 	query.fn = fn;
3498 	return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
3499 }
3500 
3501 /* Is there a record covering a given extent? */
3502 int
3503 xfs_alloc_has_record(
3504 	struct xfs_btree_cur	*cur,
3505 	xfs_agblock_t		bno,
3506 	xfs_extlen_t		len,
3507 	bool			*exists)
3508 {
3509 	union xfs_btree_irec	low;
3510 	union xfs_btree_irec	high;
3511 
3512 	memset(&low, 0, sizeof(low));
3513 	low.a.ar_startblock = bno;
3514 	memset(&high, 0xFF, sizeof(high));
3515 	high.a.ar_startblock = bno + len - 1;
3516 
3517 	return xfs_btree_has_record(cur, &low, &high, exists);
3518 }
3519 
3520 /*
3521  * Walk all the blocks in the AGFL.  The @walk_fn can return any negative
3522  * error code or XFS_ITER_*.
3523  */
3524 int
3525 xfs_agfl_walk(
3526 	struct xfs_mount	*mp,
3527 	struct xfs_agf		*agf,
3528 	struct xfs_buf		*agflbp,
3529 	xfs_agfl_walk_fn	walk_fn,
3530 	void			*priv)
3531 {
3532 	__be32			*agfl_bno;
3533 	unsigned int		i;
3534 	int			error;
3535 
3536 	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
3537 	i = be32_to_cpu(agf->agf_flfirst);
3538 
3539 	/* Nothing to walk in an empty AGFL. */
3540 	if (agf->agf_flcount == cpu_to_be32(0))
3541 		return 0;
3542 
3543 	/* Otherwise, walk from first to last, wrapping as needed. */
3544 	for (;;) {
3545 		error = walk_fn(mp, be32_to_cpu(agfl_bno[i]), priv);
3546 		if (error)
3547 			return error;
3548 		if (i == be32_to_cpu(agf->agf_fllast))
3549 			break;
3550 		if (++i == xfs_agfl_size(mp))
3551 			i = 0;
3552 	}
3553 
3554 	return 0;
3555 }
3556 
3557 int __init
3558 xfs_extfree_intent_init_cache(void)
3559 {
3560 	xfs_extfree_item_cache = kmem_cache_create("xfs_extfree_intent",
3561 			sizeof(struct xfs_extent_free_item),
3562 			0, 0, NULL);
3563 
3564 	return xfs_extfree_item_cache != NULL ? 0 : -ENOMEM;
3565 }
3566 
3567 void
3568 xfs_extfree_intent_destroy_cache(void)
3569 {
3570 	kmem_cache_destroy(xfs_extfree_item_cache);
3571 	xfs_extfree_item_cache = NULL;
3572 }
3573