xref: /openbmc/linux/fs/xfs/libxfs/xfs_ag.c (revision 496cc140279b4517a23f4534e468ec9c66283f4b)
1 /* SPDX-License-Identifier: GPL-2.0 */
2 /*
3  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
4  * Copyright (c) 2018 Red Hat, Inc.
5  * All rights reserved.
6  */
7 
8 #include "xfs.h"
9 #include "xfs_fs.h"
10 #include "xfs_shared.h"
11 #include "xfs_format.h"
12 #include "xfs_trans_resv.h"
13 #include "xfs_bit.h"
14 #include "xfs_sb.h"
15 #include "xfs_mount.h"
16 #include "xfs_btree.h"
17 #include "xfs_alloc_btree.h"
18 #include "xfs_rmap_btree.h"
19 #include "xfs_alloc.h"
20 #include "xfs_ialloc.h"
21 #include "xfs_rmap.h"
22 #include "xfs_ag.h"
23 #include "xfs_ag_resv.h"
24 #include "xfs_health.h"
25 #include "xfs_error.h"
26 #include "xfs_bmap.h"
27 #include "xfs_defer.h"
28 #include "xfs_log_format.h"
29 #include "xfs_trans.h"
30 #include "xfs_trace.h"
31 #include "xfs_inode.h"
32 #include "xfs_icache.h"
33 
34 
35 /*
36  * Passive reference counting access wrappers to the perag structures.  If the
37  * per-ag structure is to be freed, the freeing code is responsible for cleaning
38  * up objects with passive references before freeing the structure. This is
39  * things like cached buffers.
40  */
41 struct xfs_perag *
42 xfs_perag_get(
43 	struct xfs_mount	*mp,
44 	xfs_agnumber_t		agno)
45 {
46 	struct xfs_perag	*pag;
47 	int			ref = 0;
48 
49 	rcu_read_lock();
50 	pag = radix_tree_lookup(&mp->m_perag_tree, agno);
51 	if (pag) {
52 		ASSERT(atomic_read(&pag->pag_ref) >= 0);
53 		ref = atomic_inc_return(&pag->pag_ref);
54 	}
55 	rcu_read_unlock();
56 	trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
57 	return pag;
58 }
59 
60 /*
61  * search from @first to find the next perag with the given tag set.
62  */
63 struct xfs_perag *
64 xfs_perag_get_tag(
65 	struct xfs_mount	*mp,
66 	xfs_agnumber_t		first,
67 	unsigned int		tag)
68 {
69 	struct xfs_perag	*pag;
70 	int			found;
71 	int			ref;
72 
73 	rcu_read_lock();
74 	found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
75 					(void **)&pag, first, 1, tag);
76 	if (found <= 0) {
77 		rcu_read_unlock();
78 		return NULL;
79 	}
80 	ref = atomic_inc_return(&pag->pag_ref);
81 	rcu_read_unlock();
82 	trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_);
83 	return pag;
84 }
85 
86 void
87 xfs_perag_put(
88 	struct xfs_perag	*pag)
89 {
90 	int	ref;
91 
92 	ASSERT(atomic_read(&pag->pag_ref) > 0);
93 	ref = atomic_dec_return(&pag->pag_ref);
94 	trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
95 }
96 
97 /*
98  * xfs_initialize_perag_data
99  *
100  * Read in each per-ag structure so we can count up the number of
101  * allocated inodes, free inodes and used filesystem blocks as this
102  * information is no longer persistent in the superblock. Once we have
103  * this information, write it into the in-core superblock structure.
104  */
105 int
106 xfs_initialize_perag_data(
107 	struct xfs_mount	*mp,
108 	xfs_agnumber_t		agcount)
109 {
110 	xfs_agnumber_t		index;
111 	struct xfs_perag	*pag;
112 	struct xfs_sb		*sbp = &mp->m_sb;
113 	uint64_t		ifree = 0;
114 	uint64_t		ialloc = 0;
115 	uint64_t		bfree = 0;
116 	uint64_t		bfreelst = 0;
117 	uint64_t		btree = 0;
118 	uint64_t		fdblocks;
119 	int			error = 0;
120 
121 	for (index = 0; index < agcount; index++) {
122 		/*
123 		 * read the agf, then the agi. This gets us
124 		 * all the information we need and populates the
125 		 * per-ag structures for us.
126 		 */
127 		error = xfs_alloc_pagf_init(mp, NULL, index, 0);
128 		if (error)
129 			return error;
130 
131 		error = xfs_ialloc_pagi_init(mp, NULL, index);
132 		if (error)
133 			return error;
134 		pag = xfs_perag_get(mp, index);
135 		ifree += pag->pagi_freecount;
136 		ialloc += pag->pagi_count;
137 		bfree += pag->pagf_freeblks;
138 		bfreelst += pag->pagf_flcount;
139 		btree += pag->pagf_btreeblks;
140 		xfs_perag_put(pag);
141 	}
142 	fdblocks = bfree + bfreelst + btree;
143 
144 	/*
145 	 * If the new summary counts are obviously incorrect, fail the
146 	 * mount operation because that implies the AGFs are also corrupt.
147 	 * Clear FS_COUNTERS so that we don't unmount with a dirty log, which
148 	 * will prevent xfs_repair from fixing anything.
149 	 */
150 	if (fdblocks > sbp->sb_dblocks || ifree > ialloc) {
151 		xfs_alert(mp, "AGF corruption. Please run xfs_repair.");
152 		error = -EFSCORRUPTED;
153 		goto out;
154 	}
155 
156 	/* Overwrite incore superblock counters with just-read data */
157 	spin_lock(&mp->m_sb_lock);
158 	sbp->sb_ifree = ifree;
159 	sbp->sb_icount = ialloc;
160 	sbp->sb_fdblocks = fdblocks;
161 	spin_unlock(&mp->m_sb_lock);
162 
163 	xfs_reinit_percpu_counters(mp);
164 out:
165 	xfs_fs_mark_healthy(mp, XFS_SICK_FS_COUNTERS);
166 	return error;
167 }
168 
169 STATIC void
170 __xfs_free_perag(
171 	struct rcu_head	*head)
172 {
173 	struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);
174 
175 	ASSERT(!delayed_work_pending(&pag->pag_blockgc_work));
176 	ASSERT(atomic_read(&pag->pag_ref) == 0);
177 	kmem_free(pag);
178 }
179 
180 /*
181  * Free up the per-ag resources associated with the mount structure.
182  */
183 void
184 xfs_free_perag(
185 	struct xfs_mount	*mp)
186 {
187 	struct xfs_perag	*pag;
188 	xfs_agnumber_t		agno;
189 
190 	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
191 		spin_lock(&mp->m_perag_lock);
192 		pag = radix_tree_delete(&mp->m_perag_tree, agno);
193 		spin_unlock(&mp->m_perag_lock);
194 		ASSERT(pag);
195 		ASSERT(atomic_read(&pag->pag_ref) == 0);
196 
197 		cancel_delayed_work_sync(&pag->pag_blockgc_work);
198 		xfs_iunlink_destroy(pag);
199 		xfs_buf_hash_destroy(pag);
200 
201 		call_rcu(&pag->rcu_head, __xfs_free_perag);
202 	}
203 }
204 
205 int
206 xfs_initialize_perag(
207 	struct xfs_mount	*mp,
208 	xfs_agnumber_t		agcount,
209 	xfs_agnumber_t		*maxagi)
210 {
211 	struct xfs_perag	*pag;
212 	xfs_agnumber_t		index;
213 	xfs_agnumber_t		first_initialised = NULLAGNUMBER;
214 	int			error;
215 
216 	/*
217 	 * Walk the current per-ag tree so we don't try to initialise AGs
218 	 * that already exist (growfs case). Allocate and insert all the
219 	 * AGs we don't find ready for initialisation.
220 	 */
221 	for (index = 0; index < agcount; index++) {
222 		pag = xfs_perag_get(mp, index);
223 		if (pag) {
224 			xfs_perag_put(pag);
225 			continue;
226 		}
227 
228 		pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
229 		if (!pag) {
230 			error = -ENOMEM;
231 			goto out_unwind_new_pags;
232 		}
233 		pag->pag_agno = index;
234 		pag->pag_mount = mp;
235 
236 		error = radix_tree_preload(GFP_NOFS);
237 		if (error)
238 			goto out_free_pag;
239 
240 		spin_lock(&mp->m_perag_lock);
241 		if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
242 			WARN_ON_ONCE(1);
243 			spin_unlock(&mp->m_perag_lock);
244 			radix_tree_preload_end();
245 			error = -EEXIST;
246 			goto out_free_pag;
247 		}
248 		spin_unlock(&mp->m_perag_lock);
249 		radix_tree_preload_end();
250 
251 #ifdef __KERNEL__
252 		/* Place kernel structure only init below this point. */
253 		spin_lock_init(&pag->pag_ici_lock);
254 		spin_lock_init(&pag->pagb_lock);
255 		spin_lock_init(&pag->pag_state_lock);
256 		INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker);
257 		INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
258 		init_waitqueue_head(&pag->pagb_wait);
259 		pag->pagb_count = 0;
260 		pag->pagb_tree = RB_ROOT;
261 #endif /* __KERNEL__ */
262 
263 		error = xfs_buf_hash_init(pag);
264 		if (error)
265 			goto out_remove_pag;
266 
267 		error = xfs_iunlink_init(pag);
268 		if (error)
269 			goto out_hash_destroy;
270 
271 		/* first new pag is fully initialized */
272 		if (first_initialised == NULLAGNUMBER)
273 			first_initialised = index;
274 	}
275 
276 	index = xfs_set_inode_alloc(mp, agcount);
277 
278 	if (maxagi)
279 		*maxagi = index;
280 
281 	mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp);
282 	return 0;
283 
284 out_hash_destroy:
285 	xfs_buf_hash_destroy(pag);
286 out_remove_pag:
287 	radix_tree_delete(&mp->m_perag_tree, index);
288 out_free_pag:
289 	kmem_free(pag);
290 out_unwind_new_pags:
291 	/* unwind any prior newly initialized pags */
292 	for (index = first_initialised; index < agcount; index++) {
293 		pag = radix_tree_delete(&mp->m_perag_tree, index);
294 		if (!pag)
295 			break;
296 		xfs_buf_hash_destroy(pag);
297 		xfs_iunlink_destroy(pag);
298 		kmem_free(pag);
299 	}
300 	return error;
301 }
302 
303 static int
304 xfs_get_aghdr_buf(
305 	struct xfs_mount	*mp,
306 	xfs_daddr_t		blkno,
307 	size_t			numblks,
308 	struct xfs_buf		**bpp,
309 	const struct xfs_buf_ops *ops)
310 {
311 	struct xfs_buf		*bp;
312 	int			error;
313 
314 	error = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, 0, &bp);
315 	if (error)
316 		return error;
317 
318 	bp->b_maps[0].bm_bn = blkno;
319 	bp->b_ops = ops;
320 
321 	*bpp = bp;
322 	return 0;
323 }
324 
325 static inline bool is_log_ag(struct xfs_mount *mp, struct aghdr_init_data *id)
326 {
327 	return mp->m_sb.sb_logstart > 0 &&
328 	       id->agno == XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart);
329 }
330 
331 /*
332  * Generic btree root block init function
333  */
334 static void
335 xfs_btroot_init(
336 	struct xfs_mount	*mp,
337 	struct xfs_buf		*bp,
338 	struct aghdr_init_data	*id)
339 {
340 	xfs_btree_init_block(mp, bp, id->type, 0, 0, id->agno);
341 }
342 
343 /* Finish initializing a free space btree. */
344 static void
345 xfs_freesp_init_recs(
346 	struct xfs_mount	*mp,
347 	struct xfs_buf		*bp,
348 	struct aghdr_init_data	*id)
349 {
350 	struct xfs_alloc_rec	*arec;
351 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
352 
353 	arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
354 	arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
355 
356 	if (is_log_ag(mp, id)) {
357 		struct xfs_alloc_rec	*nrec;
358 		xfs_agblock_t		start = XFS_FSB_TO_AGBNO(mp,
359 							mp->m_sb.sb_logstart);
360 
361 		ASSERT(start >= mp->m_ag_prealloc_blocks);
362 		if (start != mp->m_ag_prealloc_blocks) {
363 			/*
364 			 * Modify first record to pad stripe align of log
365 			 */
366 			arec->ar_blockcount = cpu_to_be32(start -
367 						mp->m_ag_prealloc_blocks);
368 			nrec = arec + 1;
369 
370 			/*
371 			 * Insert second record at start of internal log
372 			 * which then gets trimmed.
373 			 */
374 			nrec->ar_startblock = cpu_to_be32(
375 					be32_to_cpu(arec->ar_startblock) +
376 					be32_to_cpu(arec->ar_blockcount));
377 			arec = nrec;
378 			be16_add_cpu(&block->bb_numrecs, 1);
379 		}
380 		/*
381 		 * Change record start to after the internal log
382 		 */
383 		be32_add_cpu(&arec->ar_startblock, mp->m_sb.sb_logblocks);
384 	}
385 
386 	/*
387 	 * Calculate the record block count and check for the case where
388 	 * the log might have consumed all available space in the AG. If
389 	 * so, reset the record count to 0 to avoid exposure of an invalid
390 	 * record start block.
391 	 */
392 	arec->ar_blockcount = cpu_to_be32(id->agsize -
393 					  be32_to_cpu(arec->ar_startblock));
394 	if (!arec->ar_blockcount)
395 		block->bb_numrecs = 0;
396 }
397 
398 /*
399  * Alloc btree root block init functions
400  */
401 static void
402 xfs_bnoroot_init(
403 	struct xfs_mount	*mp,
404 	struct xfs_buf		*bp,
405 	struct aghdr_init_data	*id)
406 {
407 	xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, id->agno);
408 	xfs_freesp_init_recs(mp, bp, id);
409 }
410 
411 static void
412 xfs_cntroot_init(
413 	struct xfs_mount	*mp,
414 	struct xfs_buf		*bp,
415 	struct aghdr_init_data	*id)
416 {
417 	xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, id->agno);
418 	xfs_freesp_init_recs(mp, bp, id);
419 }
420 
421 /*
422  * Reverse map root block init
423  */
424 static void
425 xfs_rmaproot_init(
426 	struct xfs_mount	*mp,
427 	struct xfs_buf		*bp,
428 	struct aghdr_init_data	*id)
429 {
430 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
431 	struct xfs_rmap_rec	*rrec;
432 
433 	xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 4, id->agno);
434 
435 	/*
436 	 * mark the AG header regions as static metadata The BNO
437 	 * btree block is the first block after the headers, so
438 	 * it's location defines the size of region the static
439 	 * metadata consumes.
440 	 *
441 	 * Note: unlike mkfs, we never have to account for log
442 	 * space when growing the data regions
443 	 */
444 	rrec = XFS_RMAP_REC_ADDR(block, 1);
445 	rrec->rm_startblock = 0;
446 	rrec->rm_blockcount = cpu_to_be32(XFS_BNO_BLOCK(mp));
447 	rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_FS);
448 	rrec->rm_offset = 0;
449 
450 	/* account freespace btree root blocks */
451 	rrec = XFS_RMAP_REC_ADDR(block, 2);
452 	rrec->rm_startblock = cpu_to_be32(XFS_BNO_BLOCK(mp));
453 	rrec->rm_blockcount = cpu_to_be32(2);
454 	rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG);
455 	rrec->rm_offset = 0;
456 
457 	/* account inode btree root blocks */
458 	rrec = XFS_RMAP_REC_ADDR(block, 3);
459 	rrec->rm_startblock = cpu_to_be32(XFS_IBT_BLOCK(mp));
460 	rrec->rm_blockcount = cpu_to_be32(XFS_RMAP_BLOCK(mp) -
461 					  XFS_IBT_BLOCK(mp));
462 	rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_INOBT);
463 	rrec->rm_offset = 0;
464 
465 	/* account for rmap btree root */
466 	rrec = XFS_RMAP_REC_ADDR(block, 4);
467 	rrec->rm_startblock = cpu_to_be32(XFS_RMAP_BLOCK(mp));
468 	rrec->rm_blockcount = cpu_to_be32(1);
469 	rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG);
470 	rrec->rm_offset = 0;
471 
472 	/* account for refc btree root */
473 	if (xfs_has_reflink(mp)) {
474 		rrec = XFS_RMAP_REC_ADDR(block, 5);
475 		rrec->rm_startblock = cpu_to_be32(xfs_refc_block(mp));
476 		rrec->rm_blockcount = cpu_to_be32(1);
477 		rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_REFC);
478 		rrec->rm_offset = 0;
479 		be16_add_cpu(&block->bb_numrecs, 1);
480 	}
481 
482 	/* account for the log space */
483 	if (is_log_ag(mp, id)) {
484 		rrec = XFS_RMAP_REC_ADDR(block,
485 				be16_to_cpu(block->bb_numrecs) + 1);
486 		rrec->rm_startblock = cpu_to_be32(
487 				XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart));
488 		rrec->rm_blockcount = cpu_to_be32(mp->m_sb.sb_logblocks);
489 		rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_LOG);
490 		rrec->rm_offset = 0;
491 		be16_add_cpu(&block->bb_numrecs, 1);
492 	}
493 }
494 
495 /*
496  * Initialise new secondary superblocks with the pre-grow geometry, but mark
497  * them as "in progress" so we know they haven't yet been activated. This will
498  * get cleared when the update with the new geometry information is done after
499  * changes to the primary are committed. This isn't strictly necessary, but we
500  * get it for free with the delayed buffer write lists and it means we can tell
501  * if a grow operation didn't complete properly after the fact.
502  */
503 static void
504 xfs_sbblock_init(
505 	struct xfs_mount	*mp,
506 	struct xfs_buf		*bp,
507 	struct aghdr_init_data	*id)
508 {
509 	struct xfs_dsb		*dsb = bp->b_addr;
510 
511 	xfs_sb_to_disk(dsb, &mp->m_sb);
512 	dsb->sb_inprogress = 1;
513 }
514 
515 static void
516 xfs_agfblock_init(
517 	struct xfs_mount	*mp,
518 	struct xfs_buf		*bp,
519 	struct aghdr_init_data	*id)
520 {
521 	struct xfs_agf		*agf = bp->b_addr;
522 	xfs_extlen_t		tmpsize;
523 
524 	agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
525 	agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
526 	agf->agf_seqno = cpu_to_be32(id->agno);
527 	agf->agf_length = cpu_to_be32(id->agsize);
528 	agf->agf_roots[XFS_BTNUM_BNOi] = cpu_to_be32(XFS_BNO_BLOCK(mp));
529 	agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp));
530 	agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1);
531 	agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1);
532 	if (xfs_has_rmapbt(mp)) {
533 		agf->agf_roots[XFS_BTNUM_RMAPi] =
534 					cpu_to_be32(XFS_RMAP_BLOCK(mp));
535 		agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1);
536 		agf->agf_rmap_blocks = cpu_to_be32(1);
537 	}
538 
539 	agf->agf_flfirst = cpu_to_be32(1);
540 	agf->agf_fllast = 0;
541 	agf->agf_flcount = 0;
542 	tmpsize = id->agsize - mp->m_ag_prealloc_blocks;
543 	agf->agf_freeblks = cpu_to_be32(tmpsize);
544 	agf->agf_longest = cpu_to_be32(tmpsize);
545 	if (xfs_has_crc(mp))
546 		uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid);
547 	if (xfs_has_reflink(mp)) {
548 		agf->agf_refcount_root = cpu_to_be32(
549 				xfs_refc_block(mp));
550 		agf->agf_refcount_level = cpu_to_be32(1);
551 		agf->agf_refcount_blocks = cpu_to_be32(1);
552 	}
553 
554 	if (is_log_ag(mp, id)) {
555 		int64_t	logblocks = mp->m_sb.sb_logblocks;
556 
557 		be32_add_cpu(&agf->agf_freeblks, -logblocks);
558 		agf->agf_longest = cpu_to_be32(id->agsize -
559 			XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart) - logblocks);
560 	}
561 }
562 
563 static void
564 xfs_agflblock_init(
565 	struct xfs_mount	*mp,
566 	struct xfs_buf		*bp,
567 	struct aghdr_init_data	*id)
568 {
569 	struct xfs_agfl		*agfl = XFS_BUF_TO_AGFL(bp);
570 	__be32			*agfl_bno;
571 	int			bucket;
572 
573 	if (xfs_has_crc(mp)) {
574 		agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC);
575 		agfl->agfl_seqno = cpu_to_be32(id->agno);
576 		uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid);
577 	}
578 
579 	agfl_bno = xfs_buf_to_agfl_bno(bp);
580 	for (bucket = 0; bucket < xfs_agfl_size(mp); bucket++)
581 		agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
582 }
583 
584 static void
585 xfs_agiblock_init(
586 	struct xfs_mount	*mp,
587 	struct xfs_buf		*bp,
588 	struct aghdr_init_data	*id)
589 {
590 	struct xfs_agi		*agi = bp->b_addr;
591 	int			bucket;
592 
593 	agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
594 	agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
595 	agi->agi_seqno = cpu_to_be32(id->agno);
596 	agi->agi_length = cpu_to_be32(id->agsize);
597 	agi->agi_count = 0;
598 	agi->agi_root = cpu_to_be32(XFS_IBT_BLOCK(mp));
599 	agi->agi_level = cpu_to_be32(1);
600 	agi->agi_freecount = 0;
601 	agi->agi_newino = cpu_to_be32(NULLAGINO);
602 	agi->agi_dirino = cpu_to_be32(NULLAGINO);
603 	if (xfs_has_crc(mp))
604 		uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid);
605 	if (xfs_has_finobt(mp)) {
606 		agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp));
607 		agi->agi_free_level = cpu_to_be32(1);
608 	}
609 	for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
610 		agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
611 	if (xfs_has_inobtcounts(mp)) {
612 		agi->agi_iblocks = cpu_to_be32(1);
613 		if (xfs_has_finobt(mp))
614 			agi->agi_fblocks = cpu_to_be32(1);
615 	}
616 }
617 
618 typedef void (*aghdr_init_work_f)(struct xfs_mount *mp, struct xfs_buf *bp,
619 				  struct aghdr_init_data *id);
620 static int
621 xfs_ag_init_hdr(
622 	struct xfs_mount	*mp,
623 	struct aghdr_init_data	*id,
624 	aghdr_init_work_f	work,
625 	const struct xfs_buf_ops *ops)
626 {
627 	struct xfs_buf		*bp;
628 	int			error;
629 
630 	error = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, &bp, ops);
631 	if (error)
632 		return error;
633 
634 	(*work)(mp, bp, id);
635 
636 	xfs_buf_delwri_queue(bp, &id->buffer_list);
637 	xfs_buf_relse(bp);
638 	return 0;
639 }
640 
641 struct xfs_aghdr_grow_data {
642 	xfs_daddr_t		daddr;
643 	size_t			numblks;
644 	const struct xfs_buf_ops *ops;
645 	aghdr_init_work_f	work;
646 	xfs_btnum_t		type;
647 	bool			need_init;
648 };
649 
650 /*
651  * Prepare new AG headers to be written to disk. We use uncached buffers here,
652  * as it is assumed these new AG headers are currently beyond the currently
653  * valid filesystem address space. Using cached buffers would trip over EOFS
654  * corruption detection alogrithms in the buffer cache lookup routines.
655  *
656  * This is a non-transactional function, but the prepared buffers are added to a
657  * delayed write buffer list supplied by the caller so they can submit them to
658  * disk and wait on them as required.
659  */
660 int
661 xfs_ag_init_headers(
662 	struct xfs_mount	*mp,
663 	struct aghdr_init_data	*id)
664 
665 {
666 	struct xfs_aghdr_grow_data aghdr_data[] = {
667 	{ /* SB */
668 		.daddr = XFS_AG_DADDR(mp, id->agno, XFS_SB_DADDR),
669 		.numblks = XFS_FSS_TO_BB(mp, 1),
670 		.ops = &xfs_sb_buf_ops,
671 		.work = &xfs_sbblock_init,
672 		.need_init = true
673 	},
674 	{ /* AGF */
675 		.daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGF_DADDR(mp)),
676 		.numblks = XFS_FSS_TO_BB(mp, 1),
677 		.ops = &xfs_agf_buf_ops,
678 		.work = &xfs_agfblock_init,
679 		.need_init = true
680 	},
681 	{ /* AGFL */
682 		.daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGFL_DADDR(mp)),
683 		.numblks = XFS_FSS_TO_BB(mp, 1),
684 		.ops = &xfs_agfl_buf_ops,
685 		.work = &xfs_agflblock_init,
686 		.need_init = true
687 	},
688 	{ /* AGI */
689 		.daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGI_DADDR(mp)),
690 		.numblks = XFS_FSS_TO_BB(mp, 1),
691 		.ops = &xfs_agi_buf_ops,
692 		.work = &xfs_agiblock_init,
693 		.need_init = true
694 	},
695 	{ /* BNO root block */
696 		.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_BNO_BLOCK(mp)),
697 		.numblks = BTOBB(mp->m_sb.sb_blocksize),
698 		.ops = &xfs_bnobt_buf_ops,
699 		.work = &xfs_bnoroot_init,
700 		.need_init = true
701 	},
702 	{ /* CNT root block */
703 		.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)),
704 		.numblks = BTOBB(mp->m_sb.sb_blocksize),
705 		.ops = &xfs_cntbt_buf_ops,
706 		.work = &xfs_cntroot_init,
707 		.need_init = true
708 	},
709 	{ /* INO root block */
710 		.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_IBT_BLOCK(mp)),
711 		.numblks = BTOBB(mp->m_sb.sb_blocksize),
712 		.ops = &xfs_inobt_buf_ops,
713 		.work = &xfs_btroot_init,
714 		.type = XFS_BTNUM_INO,
715 		.need_init = true
716 	},
717 	{ /* FINO root block */
718 		.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_FIBT_BLOCK(mp)),
719 		.numblks = BTOBB(mp->m_sb.sb_blocksize),
720 		.ops = &xfs_finobt_buf_ops,
721 		.work = &xfs_btroot_init,
722 		.type = XFS_BTNUM_FINO,
723 		.need_init =  xfs_has_finobt(mp)
724 	},
725 	{ /* RMAP root block */
726 		.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_RMAP_BLOCK(mp)),
727 		.numblks = BTOBB(mp->m_sb.sb_blocksize),
728 		.ops = &xfs_rmapbt_buf_ops,
729 		.work = &xfs_rmaproot_init,
730 		.need_init = xfs_has_rmapbt(mp)
731 	},
732 	{ /* REFC root block */
733 		.daddr = XFS_AGB_TO_DADDR(mp, id->agno, xfs_refc_block(mp)),
734 		.numblks = BTOBB(mp->m_sb.sb_blocksize),
735 		.ops = &xfs_refcountbt_buf_ops,
736 		.work = &xfs_btroot_init,
737 		.type = XFS_BTNUM_REFC,
738 		.need_init = xfs_has_reflink(mp)
739 	},
740 	{ /* NULL terminating block */
741 		.daddr = XFS_BUF_DADDR_NULL,
742 	}
743 	};
744 	struct  xfs_aghdr_grow_data *dp;
745 	int			error = 0;
746 
747 	/* Account for AG free space in new AG */
748 	id->nfree += id->agsize - mp->m_ag_prealloc_blocks;
749 	for (dp = &aghdr_data[0]; dp->daddr != XFS_BUF_DADDR_NULL; dp++) {
750 		if (!dp->need_init)
751 			continue;
752 
753 		id->daddr = dp->daddr;
754 		id->numblks = dp->numblks;
755 		id->type = dp->type;
756 		error = xfs_ag_init_hdr(mp, id, dp->work, dp->ops);
757 		if (error)
758 			break;
759 	}
760 	return error;
761 }
762 
763 int
764 xfs_ag_shrink_space(
765 	struct xfs_mount	*mp,
766 	struct xfs_trans	**tpp,
767 	xfs_agnumber_t		agno,
768 	xfs_extlen_t		delta)
769 {
770 	struct xfs_alloc_arg	args = {
771 		.tp	= *tpp,
772 		.mp	= mp,
773 		.type	= XFS_ALLOCTYPE_THIS_BNO,
774 		.minlen = delta,
775 		.maxlen = delta,
776 		.oinfo	= XFS_RMAP_OINFO_SKIP_UPDATE,
777 		.resv	= XFS_AG_RESV_NONE,
778 		.prod	= 1
779 	};
780 	struct xfs_buf		*agibp, *agfbp;
781 	struct xfs_agi		*agi;
782 	struct xfs_agf		*agf;
783 	xfs_agblock_t		aglen;
784 	int			error, err2;
785 
786 	ASSERT(agno == mp->m_sb.sb_agcount - 1);
787 	error = xfs_ialloc_read_agi(mp, *tpp, agno, &agibp);
788 	if (error)
789 		return error;
790 
791 	agi = agibp->b_addr;
792 
793 	error = xfs_alloc_read_agf(mp, *tpp, agno, 0, &agfbp);
794 	if (error)
795 		return error;
796 
797 	agf = agfbp->b_addr;
798 	aglen = be32_to_cpu(agi->agi_length);
799 	/* some extra paranoid checks before we shrink the ag */
800 	if (XFS_IS_CORRUPT(mp, agf->agf_length != agi->agi_length))
801 		return -EFSCORRUPTED;
802 	if (delta >= aglen)
803 		return -EINVAL;
804 
805 	args.fsbno = XFS_AGB_TO_FSB(mp, agno, aglen - delta);
806 
807 	/*
808 	 * Make sure that the last inode cluster cannot overlap with the new
809 	 * end of the AG, even if it's sparse.
810 	 */
811 	error = xfs_ialloc_check_shrink(*tpp, agno, agibp, aglen - delta);
812 	if (error)
813 		return error;
814 
815 	/*
816 	 * Disable perag reservations so it doesn't cause the allocation request
817 	 * to fail. We'll reestablish reservation before we return.
818 	 */
819 	error = xfs_ag_resv_free(agibp->b_pag);
820 	if (error)
821 		return error;
822 
823 	/* internal log shouldn't also show up in the free space btrees */
824 	error = xfs_alloc_vextent(&args);
825 	if (!error && args.agbno == NULLAGBLOCK)
826 		error = -ENOSPC;
827 
828 	if (error) {
829 		/*
830 		 * if extent allocation fails, need to roll the transaction to
831 		 * ensure that the AGFL fixup has been committed anyway.
832 		 */
833 		xfs_trans_bhold(*tpp, agfbp);
834 		err2 = xfs_trans_roll(tpp);
835 		if (err2)
836 			return err2;
837 		xfs_trans_bjoin(*tpp, agfbp);
838 		goto resv_init_out;
839 	}
840 
841 	/*
842 	 * if successfully deleted from freespace btrees, need to confirm
843 	 * per-AG reservation works as expected.
844 	 */
845 	be32_add_cpu(&agi->agi_length, -delta);
846 	be32_add_cpu(&agf->agf_length, -delta);
847 
848 	err2 = xfs_ag_resv_init(agibp->b_pag, *tpp);
849 	if (err2) {
850 		be32_add_cpu(&agi->agi_length, delta);
851 		be32_add_cpu(&agf->agf_length, delta);
852 		if (err2 != -ENOSPC)
853 			goto resv_err;
854 
855 		__xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, true);
856 
857 		/*
858 		 * Roll the transaction before trying to re-init the per-ag
859 		 * reservation. The new transaction is clean so it will cancel
860 		 * without any side effects.
861 		 */
862 		error = xfs_defer_finish(tpp);
863 		if (error)
864 			return error;
865 
866 		error = -ENOSPC;
867 		goto resv_init_out;
868 	}
869 	xfs_ialloc_log_agi(*tpp, agibp, XFS_AGI_LENGTH);
870 	xfs_alloc_log_agf(*tpp, agfbp, XFS_AGF_LENGTH);
871 	return 0;
872 resv_init_out:
873 	err2 = xfs_ag_resv_init(agibp->b_pag, *tpp);
874 	if (!err2)
875 		return error;
876 resv_err:
877 	xfs_warn(mp, "Error %d reserving per-AG metadata reserve pool.", err2);
878 	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
879 	return err2;
880 }
881 
882 /*
883  * Extent the AG indicated by the @id by the length passed in
884  */
885 int
886 xfs_ag_extend_space(
887 	struct xfs_mount	*mp,
888 	struct xfs_trans	*tp,
889 	struct aghdr_init_data	*id,
890 	xfs_extlen_t		len)
891 {
892 	struct xfs_buf		*bp;
893 	struct xfs_agi		*agi;
894 	struct xfs_agf		*agf;
895 	int			error;
896 
897 	/*
898 	 * Change the agi length.
899 	 */
900 	error = xfs_ialloc_read_agi(mp, tp, id->agno, &bp);
901 	if (error)
902 		return error;
903 
904 	agi = bp->b_addr;
905 	be32_add_cpu(&agi->agi_length, len);
906 	ASSERT(id->agno == mp->m_sb.sb_agcount - 1 ||
907 	       be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks);
908 	xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH);
909 
910 	/*
911 	 * Change agf length.
912 	 */
913 	error = xfs_alloc_read_agf(mp, tp, id->agno, 0, &bp);
914 	if (error)
915 		return error;
916 
917 	agf = bp->b_addr;
918 	be32_add_cpu(&agf->agf_length, len);
919 	ASSERT(agf->agf_length == agi->agi_length);
920 	xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH);
921 
922 	/*
923 	 * Free the new space.
924 	 *
925 	 * XFS_RMAP_OINFO_SKIP_UPDATE is used here to tell the rmap btree that
926 	 * this doesn't actually exist in the rmap btree.
927 	 */
928 	error = xfs_rmap_free(tp, bp, bp->b_pag,
929 				be32_to_cpu(agf->agf_length) - len,
930 				len, &XFS_RMAP_OINFO_SKIP_UPDATE);
931 	if (error)
932 		return error;
933 
934 	return  xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, id->agno,
935 					be32_to_cpu(agf->agf_length) - len),
936 				len, &XFS_RMAP_OINFO_SKIP_UPDATE,
937 				XFS_AG_RESV_NONE);
938 }
939 
940 /* Retrieve AG geometry. */
941 int
942 xfs_ag_get_geometry(
943 	struct xfs_mount	*mp,
944 	xfs_agnumber_t		agno,
945 	struct xfs_ag_geometry	*ageo)
946 {
947 	struct xfs_buf		*agi_bp;
948 	struct xfs_buf		*agf_bp;
949 	struct xfs_agi		*agi;
950 	struct xfs_agf		*agf;
951 	struct xfs_perag	*pag;
952 	unsigned int		freeblks;
953 	int			error;
954 
955 	if (agno >= mp->m_sb.sb_agcount)
956 		return -EINVAL;
957 
958 	/* Lock the AG headers. */
959 	error = xfs_ialloc_read_agi(mp, NULL, agno, &agi_bp);
960 	if (error)
961 		return error;
962 	error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agf_bp);
963 	if (error)
964 		goto out_agi;
965 
966 	pag = agi_bp->b_pag;
967 
968 	/* Fill out form. */
969 	memset(ageo, 0, sizeof(*ageo));
970 	ageo->ag_number = agno;
971 
972 	agi = agi_bp->b_addr;
973 	ageo->ag_icount = be32_to_cpu(agi->agi_count);
974 	ageo->ag_ifree = be32_to_cpu(agi->agi_freecount);
975 
976 	agf = agf_bp->b_addr;
977 	ageo->ag_length = be32_to_cpu(agf->agf_length);
978 	freeblks = pag->pagf_freeblks +
979 		   pag->pagf_flcount +
980 		   pag->pagf_btreeblks -
981 		   xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE);
982 	ageo->ag_freeblks = freeblks;
983 	xfs_ag_geom_health(pag, ageo);
984 
985 	/* Release resources. */
986 	xfs_buf_relse(agf_bp);
987 out_agi:
988 	xfs_buf_relse(agi_bp);
989 	return error;
990 }
991