xref: /openbmc/linux/fs/xfs/scrub/btree.c (revision 88accf17226733088923635b580779a3c86b6f23)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_inode.h"
13 #include "xfs_btree.h"
14 #include "scrub/scrub.h"
15 #include "scrub/common.h"
16 #include "scrub/btree.h"
17 #include "scrub/trace.h"
18 
19 /* btree scrubbing */
20 
/*
 * Check for btree operation errors.  See the section about handling
 * operational errors in common.c.
 *
 * Returns true if *error was zero (nothing to do); returns false after
 * recording/tracing a nonzero *error.  For EFSBADCRC/EFSCORRUPTED the
 * error is converted into a scrub flag and *error is cleared so the
 * caller can keep going.
 */
static bool
__xchk_btree_process_error(
	struct xfs_scrub	*sc,
	struct xfs_btree_cur	*cur,
	int			level,		/* btree level where the error occurred */
	int			*error,		/* in/out errno; may be zeroed here */
	__u32			errflag,	/* XFS_SCRUB_OFLAG_* to set on corruption */
	void			*ret_ip)	/* caller address, recorded in the trace */
{
	if (*error == 0)
		return true;

	switch (*error) {
	case -EDEADLOCK:
	case -ECHRNG:
		/* Used to restart an op with deadlock avoidance. */
		trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
		break;
	case -EFSBADCRC:
	case -EFSCORRUPTED:
		/* Note the badness but don't abort. */
		sc->sm->sm_flags |= errflag;
		*error = 0;
		/*
		 * Deliberate fallthrough so the trace point still fires;
		 * *error is zero by now, so the trace records error == 0.
		 */
		fallthrough;
	default:
		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
			trace_xchk_ifork_btree_op_error(sc, cur, level,
					*error, ret_ip);
		else
			trace_xchk_btree_op_error(sc, cur, level,
					*error, ret_ip);
		break;
	}
	return false;
}
60 
61 bool
62 xchk_btree_process_error(
63 	struct xfs_scrub	*sc,
64 	struct xfs_btree_cur	*cur,
65 	int			level,
66 	int			*error)
67 {
68 	return __xchk_btree_process_error(sc, cur, level, error,
69 			XFS_SCRUB_OFLAG_CORRUPT, __return_address);
70 }
71 
72 bool
73 xchk_btree_xref_process_error(
74 	struct xfs_scrub	*sc,
75 	struct xfs_btree_cur	*cur,
76 	int			level,
77 	int			*error)
78 {
79 	return __xchk_btree_process_error(sc, cur, level, error,
80 			XFS_SCRUB_OFLAG_XFAIL, __return_address);
81 }
82 
83 /* Record btree block corruption. */
84 static void
85 __xchk_btree_set_corrupt(
86 	struct xfs_scrub	*sc,
87 	struct xfs_btree_cur	*cur,
88 	int			level,
89 	__u32			errflag,
90 	void			*ret_ip)
91 {
92 	sc->sm->sm_flags |= errflag;
93 
94 	if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
95 		trace_xchk_ifork_btree_error(sc, cur, level,
96 				ret_ip);
97 	else
98 		trace_xchk_btree_error(sc, cur, level,
99 				ret_ip);
100 }
101 
102 void
103 xchk_btree_set_corrupt(
104 	struct xfs_scrub	*sc,
105 	struct xfs_btree_cur	*cur,
106 	int			level)
107 {
108 	__xchk_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_CORRUPT,
109 			__return_address);
110 }
111 
112 void
113 xchk_btree_xref_set_corrupt(
114 	struct xfs_scrub	*sc,
115 	struct xfs_btree_cur	*cur,
116 	int			level)
117 {
118 	__xchk_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_XCORRUPT,
119 			__return_address);
120 }
121 
/*
 * Make sure this record is in order and doesn't stray outside of the parent
 * keys.
 */
STATIC void
xchk_btree_rec(
	struct xchk_btree	*bs)
{
	struct xfs_btree_cur	*cur = bs->cur;
	union xfs_btree_rec	*rec;
	union xfs_btree_key	key;
	union xfs_btree_key	hkey;
	union xfs_btree_key	*keyp;
	struct xfs_btree_block	*block;
	struct xfs_btree_block	*keyblock;
	struct xfs_buf		*bp;

	/* Grab the leaf record the cursor currently points at. */
	block = xfs_btree_get_block(cur, 0, &bp);
	rec = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr, block);

	trace_xchk_btree_rec(bs->sc, cur, 0);

	/* If this isn't the first record, are they in order? */
	if (cur->bc_levels[0].ptr > 1 &&
	    !cur->bc_ops->recs_inorder(cur, &bs->lastrec, rec))
		xchk_btree_set_corrupt(bs->sc, cur, 0);
	/* Remember this record for the ordering check on the next one. */
	memcpy(&bs->lastrec, rec, cur->bc_ops->rec_len);

	/* A single-level btree has no parent keys to compare against. */
	if (cur->bc_nlevels == 1)
		return;

	/* Is this at least as large as the parent low key? */
	cur->bc_ops->init_key_from_rec(&key, rec);
	keyblock = xfs_btree_get_block(cur, 1, &bp);
	keyp = xfs_btree_key_addr(cur, cur->bc_levels[1].ptr, keyblock);
	if (cur->bc_ops->diff_two_keys(cur, &key, keyp) < 0)
		xchk_btree_set_corrupt(bs->sc, cur, 1);

	/* Only overlapping btrees carry high keys. */
	if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
		return;

	/* Is this no larger than the parent high key? */
	cur->bc_ops->init_high_key_from_rec(&hkey, rec);
	keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[1].ptr, keyblock);
	if (cur->bc_ops->diff_two_keys(cur, keyp, &hkey) < 0)
		xchk_btree_set_corrupt(bs->sc, cur, 1);
}
169 
/*
 * Make sure this key is in order and doesn't stray outside of the parent
 * keys.
 */
STATIC void
xchk_btree_key(
	struct xchk_btree	*bs,
	int			level)
{
	struct xfs_btree_cur	*cur = bs->cur;
	union xfs_btree_key	*key;
	union xfs_btree_key	*keyp;
	struct xfs_btree_block	*block;
	struct xfs_btree_block	*keyblock;
	struct xfs_buf		*bp;

	/* Grab the node key the cursor currently points at. */
	block = xfs_btree_get_block(cur, level, &bp);
	key = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block);

	trace_xchk_btree_key(bs->sc, cur, level);

	/* If this isn't the first key, are they in order? */
	if (cur->bc_levels[level].ptr > 1 &&
	    !cur->bc_ops->keys_inorder(cur, &bs->lastkey[level - 1], key))
		xchk_btree_set_corrupt(bs->sc, cur, level);
	/* Remember this key for the ordering check on the next one. */
	memcpy(&bs->lastkey[level - 1], key, cur->bc_ops->key_len);

	/* The root level has no parent keys to compare against. */
	if (level + 1 >= cur->bc_nlevels)
		return;

	/* Is this at least as large as the parent low key? */
	keyblock = xfs_btree_get_block(cur, level + 1, &bp);
	keyp = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr, keyblock);
	if (cur->bc_ops->diff_two_keys(cur, key, keyp) < 0)
		xchk_btree_set_corrupt(bs->sc, cur, level);

	/* Only overlapping btrees carry high keys. */
	if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
		return;

	/* Is this no larger than the parent high key? */
	key = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr, block);
	keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr,
			keyblock);
	if (cur->bc_ops->diff_two_keys(cur, keyp, key) < 0)
		xchk_btree_set_corrupt(bs->sc, cur, level);
}
216 
217 /*
218  * Check a btree pointer.  Returns true if it's ok to use this pointer.
219  * Callers do not need to set the corrupt flag.
220  */
221 static bool
222 xchk_btree_ptr_ok(
223 	struct xchk_btree	*bs,
224 	int			level,
225 	union xfs_btree_ptr	*ptr)
226 {
227 	bool			res;
228 
229 	/* A btree rooted in an inode has no block pointer to the root. */
230 	if ((bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
231 	    level == bs->cur->bc_nlevels)
232 		return true;
233 
234 	/* Otherwise, check the pointers. */
235 	if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS)
236 		res = xfs_btree_check_lptr(bs->cur, be64_to_cpu(ptr->l), level);
237 	else
238 		res = xfs_btree_check_sptr(bs->cur, be32_to_cpu(ptr->s), level);
239 	if (!res)
240 		xchk_btree_set_corrupt(bs->sc, bs->cur, level);
241 
242 	return res;
243 }
244 
/* Check that a btree block's sibling matches what we expect of it. */
STATIC int
xchk_btree_block_check_sibling(
	struct xchk_btree	*bs,
	int			level,
	int			direction,	/* >0 checks the right sibling, else left */
	union xfs_btree_ptr	*sibling)
{
	struct xfs_btree_cur	*cur = bs->cur;
	struct xfs_btree_block	*pblock;
	struct xfs_buf		*pbp;
	struct xfs_btree_cur	*ncur = NULL;
	union xfs_btree_ptr	*pp;
	int			success;
	int			error;

	/* Duplicate the cursor so we can move it without disturbing @cur. */
	error = xfs_btree_dup_cursor(cur, &ncur);
	if (!xchk_btree_process_error(bs->sc, cur, level + 1, &error) ||
	    !ncur)
		return error;

	/*
	 * If the pointer is null, we shouldn't be able to move the upper
	 * level pointer anywhere.
	 */
	if (xfs_btree_ptr_is_null(cur, sibling)) {
		if (direction > 0)
			error = xfs_btree_increment(ncur, level + 1, &success);
		else
			error = xfs_btree_decrement(ncur, level + 1, &success);
		if (error == 0 && success)
			xchk_btree_set_corrupt(bs->sc, cur, level);
		/* Any movement error here was only diagnostic; discard it. */
		error = 0;
		goto out;
	}

	/* Increment upper level pointer. */
	if (direction > 0)
		error = xfs_btree_increment(ncur, level + 1, &success);
	else
		error = xfs_btree_decrement(ncur, level + 1, &success);
	if (!xchk_btree_process_error(bs->sc, cur, level + 1, &error))
		goto out;
	if (!success) {
		/* Sibling exists but the parent could not step to it. */
		xchk_btree_set_corrupt(bs->sc, cur, level + 1);
		goto out;
	}

	/* Compare upper level pointer to sibling pointer. */
	pblock = xfs_btree_get_block(ncur, level + 1, &pbp);
	pp = xfs_btree_ptr_addr(ncur, ncur->bc_levels[level + 1].ptr, pblock);
	if (!xchk_btree_ptr_ok(bs, level + 1, pp))
		goto out;
	if (pbp)
		xchk_buffer_recheck(bs->sc, pbp);

	if (xfs_btree_diff_two_ptrs(cur, pp, sibling))
		xchk_btree_set_corrupt(bs->sc, cur, level);
out:
	/* Always tear down the duplicated cursor. */
	xfs_btree_del_cursor(ncur, XFS_BTREE_ERROR);
	return error;
}
307 
/* Check the siblings of a btree block. */
STATIC int
xchk_btree_block_check_siblings(
	struct xchk_btree	*bs,
	struct xfs_btree_block	*block)
{
	struct xfs_btree_cur	*cur = bs->cur;
	union xfs_btree_ptr	leftsib;
	union xfs_btree_ptr	rightsib;
	int			level;
	int			error = 0;

	xfs_btree_get_sibling(cur, block, &leftsib, XFS_BB_LEFTSIB);
	xfs_btree_get_sibling(cur, block, &rightsib, XFS_BB_RIGHTSIB);
	level = xfs_btree_get_level(block);

	/* Root block should never have siblings. */
	if (level == cur->bc_nlevels - 1) {
		if (!xfs_btree_ptr_is_null(cur, &leftsib) ||
		    !xfs_btree_ptr_is_null(cur, &rightsib))
			xchk_btree_set_corrupt(bs->sc, cur, level);
		goto out;
	}

	/*
	 * Do the left & right sibling pointers match the adjacent
	 * parent level pointers?
	 * (These functions absorb error codes for us.)
	 */
	error = xchk_btree_block_check_sibling(bs, level, -1, &leftsib);
	if (error)
		return error;
	error = xchk_btree_block_check_sibling(bs, level, 1, &rightsib);
	if (error)
		return error;
out:
	return error;
}
346 
/*
 * A deferred block-owner check, queued on xchk_btree::to_check by
 * xchk_btree_check_owner() and processed at the end of xchk_btree().
 */
struct check_owner {
	struct list_head	list;	/* link in xchk_btree::to_check */
	xfs_daddr_t		daddr;	/* disk address of the btree block */
	int			level;	/* btree level the block came from */
};
352 
/*
 * Make sure this btree block isn't in the free list and that there's
 * an rmap record for it.
 */
STATIC int
xchk_btree_check_block_owner(
	struct xchk_btree	*bs,
	int			level,
	xfs_daddr_t		daddr)
{
	xfs_agnumber_t		agno;
	xfs_agblock_t		agbno;
	xfs_btnum_t		btnum;
	bool			init_sa;
	int			error = 0;

	/* A prior xref failure may have nulled out the cursor; see below. */
	if (!bs->cur)
		return 0;

	btnum = bs->cur->bc_btnum;
	agno = xfs_daddr_to_agno(bs->cur->bc_mp, daddr);
	agbno = xfs_daddr_to_agbno(bs->cur->bc_mp, daddr);

	/*
	 * For long-pointer btrees, set up the per-AG scrub context for the
	 * AG containing this block; it is released again at out_free.
	 */
	init_sa = bs->cur->bc_flags & XFS_BTREE_LONG_PTRS;
	if (init_sa) {
		error = xchk_ag_init_existing(bs->sc, agno, &bs->sc->sa);
		if (!xchk_btree_xref_process_error(bs->sc, bs->cur,
				level, &error))
			goto out_free;
	}

	xchk_xref_is_used_space(bs->sc, agbno, 1);
	/*
	 * The bnobt scrubber aliases bs->cur to bs->sc->sa.bno_cur, so we
	 * have to nullify it (to shut down further block owner checks) if
	 * self-xref encounters problems.
	 */
	if (!bs->sc->sa.bno_cur && btnum == XFS_BTNUM_BNO)
		bs->cur = NULL;

	xchk_xref_is_owned_by(bs->sc, agbno, 1, bs->oinfo);
	/* Same self-aliasing hazard as above, but for the rmapbt scrub. */
	if (!bs->sc->sa.rmap_cur && btnum == XFS_BTNUM_RMAP)
		bs->cur = NULL;

out_free:
	if (init_sa)
		xchk_ag_free(bs->sc, &bs->sc->sa);

	return error;
}
403 
/*
 * Check the owner of a btree block.  Returns 0 or a negative errno;
 * bnobt/rmapbt blocks are queued on bs->to_check instead of being
 * checked immediately.
 */
STATIC int
xchk_btree_check_owner(
	struct xchk_btree	*bs,
	int			level,
	struct xfs_buf		*bp)
{
	struct xfs_btree_cur	*cur = bs->cur;

	/*
	 * In theory, xfs_btree_get_block should only give us a null buffer
	 * pointer for the root of a root-in-inode btree type, but we need
	 * to check defensively here in case the cursor state is also screwed
	 * up.
	 */
	if (bp == NULL) {
		if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE))
			xchk_btree_set_corrupt(bs->sc, bs->cur, level);
		return 0;
	}

	/*
	 * We want to cross-reference each btree block with the bnobt
	 * and the rmapbt.  We cannot cross-reference the bnobt or
	 * rmapbt while scanning the bnobt or rmapbt, respectively,
	 * because we cannot alter the cursor and we'd prefer not to
	 * duplicate cursors.  Therefore, save the buffer daddr for
	 * later scanning.
	 */
	if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) {
		struct check_owner	*co;

		/* Freed by xchk_btree() once the deferred check runs. */
		co = kmalloc(sizeof(struct check_owner), XCHK_GFP_FLAGS);
		if (!co)
			return -ENOMEM;

		INIT_LIST_HEAD(&co->list);
		co->level = level;
		co->daddr = xfs_buf_daddr(bp);
		list_add_tail(&co->list, &bs->to_check);
		return 0;
	}

	return xchk_btree_check_block_owner(bs, level, xfs_buf_daddr(bp));
}
449 
450 /* Decide if we want to check minrecs of a btree block in the inode root. */
451 static inline bool
452 xchk_btree_check_iroot_minrecs(
453 	struct xchk_btree	*bs)
454 {
455 	/*
456 	 * xfs_bmap_add_attrfork_btree had an implementation bug wherein it
457 	 * would miscalculate the space required for the data fork bmbt root
458 	 * when adding an attr fork, and promote the iroot contents to an
459 	 * external block unnecessarily.  This went unnoticed for many years
460 	 * until scrub found filesystems in this state.  Inode rooted btrees are
461 	 * not supposed to have immediate child blocks that are small enough
462 	 * that the contents could fit in the inode root, but we can't fail
463 	 * existing filesystems, so instead we disable the check for data fork
464 	 * bmap btrees when there's an attr fork.
465 	 */
466 	if (bs->cur->bc_btnum == XFS_BTNUM_BMAP &&
467 	    bs->cur->bc_ino.whichfork == XFS_DATA_FORK &&
468 	    xfs_inode_has_attr_fork(bs->sc->ip))
469 		return false;
470 
471 	return true;
472 }
473 
/*
 * Check that this btree block has at least minrecs records or is one of the
 * special blocks that don't require that.
 */
STATIC void
xchk_btree_check_minrecs(
	struct xchk_btree	*bs,
	int			level,
	struct xfs_btree_block	*block)
{
	struct xfs_btree_cur	*cur = bs->cur;
	unsigned int		root_level = cur->bc_nlevels - 1;
	unsigned int		numrecs = be16_to_cpu(block->bb_numrecs);

	/* More records than minrecs means the block is ok. */
	if (numrecs >= cur->bc_ops->get_minrecs(cur, level))
		return;

	/*
	 * For btrees rooted in the inode, it's possible that the root block
	 * contents spilled into a regular ondisk block because there wasn't
	 * enough space in the inode root.  The number of records in that
	 * child block might be less than the standard minrecs, but that's ok
	 * provided that there's only one direct child of the root.
	 */
	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
	    level == cur->bc_nlevels - 2) {
		struct xfs_btree_block	*root_block;
		struct xfs_buf		*root_bp;
		int			root_maxrecs;

		root_block = xfs_btree_get_block(cur, root_level, &root_bp);
		root_maxrecs = cur->bc_ops->get_dmaxrecs(cur, root_level);
		/*
		 * Corrupt if the root has more than one child, or if this
		 * undersized child would actually have fit in the iroot
		 * (subject to the legacy exception above).
		 */
		if (xchk_btree_check_iroot_minrecs(bs) &&
		    (be16_to_cpu(root_block->bb_numrecs) != 1 ||
		     numrecs <= root_maxrecs))
			xchk_btree_set_corrupt(bs->sc, cur, level);
		return;
	}

	/*
	 * Otherwise, only the root level is allowed to have fewer than minrecs
	 * records or keyptrs.
	 */
	if (level < root_level)
		xchk_btree_set_corrupt(bs->sc, cur, level);
}
521 
/*
 * Grab and scrub a btree block given a btree pointer.  Returns block
 * and buffer pointers (if applicable) if they're ok to use.
 *
 * On return, a zero error with *pblock still NULL (or corruption flagged)
 * means the block should not be walked; callers must check both.
 */
STATIC int
xchk_btree_get_block(
	struct xchk_btree	*bs,
	int			level,
	union xfs_btree_ptr	*pp,
	struct xfs_btree_block	**pblock,
	struct xfs_buf		**pbp)
{
	xfs_failaddr_t		failed_at;
	int			error;

	*pblock = NULL;
	*pbp = NULL;

	error = xfs_btree_lookup_get_block(bs->cur, level, pp, pblock);
	if (!xchk_btree_process_error(bs->sc, bs->cur, level, &error) ||
	    !*pblock)
		return error;

	xfs_btree_get_block(bs->cur, level, pbp);
	/* Run the generic block verifier matching the pointer flavor. */
	if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS)
		failed_at = __xfs_btree_check_lblock(bs->cur, *pblock,
				level, *pbp);
	else
		failed_at = __xfs_btree_check_sblock(bs->cur, *pblock,
				 level, *pbp);
	if (failed_at) {
		/* Bad block: flag it but report no error to the caller. */
		xchk_btree_set_corrupt(bs->sc, bs->cur, level);
		return 0;
	}
	if (*pbp)
		xchk_buffer_recheck(bs->sc, *pbp);

	xchk_btree_check_minrecs(bs, level, *pblock);

	/*
	 * Check the block's owner; this function absorbs error codes
	 * for us.
	 */
	error = xchk_btree_check_owner(bs, level, *pbp);
	if (error)
		return error;

	/*
	 * Check the block's siblings; this function absorbs error codes
	 * for us.
	 */
	return xchk_btree_block_check_siblings(bs, *pblock);
}
575 
/*
 * Check that the low and high keys of this block match the keys stored
 * in the parent block.
 */
STATIC void
xchk_btree_block_keys(
	struct xchk_btree	*bs,
	int			level,
	struct xfs_btree_block	*block)
{
	union xfs_btree_key	block_keys;
	struct xfs_btree_cur	*cur = bs->cur;
	union xfs_btree_key	*high_bk;
	union xfs_btree_key	*parent_keys;
	union xfs_btree_key	*high_pk;
	struct xfs_btree_block	*parent_block;
	struct xfs_buf		*bp;

	/* The root block has no parent to compare against. */
	if (level >= cur->bc_nlevels - 1)
		return;

	/* Calculate the keys for this block. */
	xfs_btree_get_keys(cur, block, &block_keys);

	/* Obtain the parent's copy of the keys for this block. */
	parent_block = xfs_btree_get_block(cur, level + 1, &bp);
	parent_keys = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr,
			parent_block);

	/* NOTE(review): mismatches are flagged at level 1, not level + 1. */
	if (cur->bc_ops->diff_two_keys(cur, &block_keys, parent_keys) != 0)
		xchk_btree_set_corrupt(bs->sc, cur, 1);

	/* Only overlapping btrees carry high keys. */
	if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
		return;

	/* Get high keys */
	high_bk = xfs_btree_high_key_from_key(cur, &block_keys);
	high_pk = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr,
			parent_block);

	if (cur->bc_ops->diff_two_keys(cur, high_bk, high_pk) != 0)
		xchk_btree_set_corrupt(bs->sc, cur, 1);
}
619 
/*
 * Visit all nodes and leaves of a btree.  Check that all pointers and
 * records are in order, that the keys reflect the records, and use a callback
 * so that the caller can verify individual records.
 *
 * Returns 0 or a negative errno; corruption findings are reported via
 * sc->sm->sm_flags rather than the return value.
 */
int
xchk_btree(
	struct xfs_scrub		*sc,
	struct xfs_btree_cur		*cur,
	xchk_btree_rec_fn		scrub_fn,	/* called once per leaf record */
	const struct xfs_owner_info	*oinfo,		/* expected rmap owner of blocks */
	void				*private)	/* opaque state for scrub_fn */
{
	union xfs_btree_ptr		ptr;
	struct xchk_btree		*bs;
	union xfs_btree_ptr		*pp;
	union xfs_btree_rec		*recp;
	struct xfs_btree_block		*block;
	struct xfs_buf			*bp;
	struct check_owner		*co;
	struct check_owner		*n;
	size_t				cur_sz;
	int				level;
	int				error = 0;

	/*
	 * Allocate the btree scrub context from the heap, because this
	 * structure can get rather large.  Don't let a caller feed us a
	 * totally absurd size.
	 */
	cur_sz = xchk_btree_sizeof(cur->bc_nlevels);
	if (cur_sz > PAGE_SIZE) {
		xchk_btree_set_corrupt(sc, cur, 0);
		return 0;
	}
	bs = kzalloc(cur_sz, XCHK_GFP_FLAGS);
	if (!bs)
		return -ENOMEM;
	bs->cur = cur;
	bs->scrub_rec = scrub_fn;
	bs->oinfo = oinfo;
	bs->private = private;
	bs->sc = sc;

	/* Initialize scrub state */
	INIT_LIST_HEAD(&bs->to_check);

	/*
	 * Load the root of the btree.  The helper function absorbs
	 * error codes for us.
	 */
	level = cur->bc_nlevels - 1;
	cur->bc_ops->init_ptr_from_cur(cur, &ptr);
	if (!xchk_btree_ptr_ok(bs, cur->bc_nlevels, &ptr))
		goto out;
	error = xchk_btree_get_block(bs, level, &ptr, &block, &bp);
	if (error || !block)
		goto out;

	/* Start the depth-first walk at the root's first keyptr. */
	cur->bc_levels[level].ptr = 1;

	/*
	 * Iterative traversal: level counts down as we descend toward
	 * the leaves and back up as each block is exhausted.
	 */
	while (level < cur->bc_nlevels) {
		block = xfs_btree_get_block(cur, level, &bp);

		if (level == 0) {
			/* End of leaf, pop back towards the root. */
			if (cur->bc_levels[level].ptr >
			    be16_to_cpu(block->bb_numrecs)) {
				xchk_btree_block_keys(bs, level, block);
				if (level < cur->bc_nlevels - 1)
					cur->bc_levels[level + 1].ptr++;
				level++;
				continue;
			}

			/* Records in order for scrub? */
			xchk_btree_rec(bs);

			/* Call out to the record checker. */
			recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr,
					block);
			error = bs->scrub_rec(bs, recp);
			if (error)
				break;
			/* Stop early on fatal signal or known corruption. */
			if (xchk_should_terminate(sc, &error) ||
			    (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
				break;

			cur->bc_levels[level].ptr++;
			continue;
		}

		/* End of node, pop back towards the root. */
		if (cur->bc_levels[level].ptr >
					be16_to_cpu(block->bb_numrecs)) {
			xchk_btree_block_keys(bs, level, block);
			if (level < cur->bc_nlevels - 1)
				cur->bc_levels[level + 1].ptr++;
			level++;
			continue;
		}

		/* Keys in order for scrub? */
		xchk_btree_key(bs, level);

		/* Drill another level deeper. */
		pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block);
		if (!xchk_btree_ptr_ok(bs, level, pp)) {
			/* Bad pointer; skip this subtree entirely. */
			cur->bc_levels[level].ptr++;
			continue;
		}
		level--;
		error = xchk_btree_get_block(bs, level, pp, &block, &bp);
		if (error || !block)
			goto out;

		cur->bc_levels[level].ptr = 1;
	}

out:
	/* Process deferred owner checks on btree blocks. */
	list_for_each_entry_safe(co, n, &bs->to_check, list) {
		if (!error && bs->cur)
			error = xchk_btree_check_block_owner(bs, co->level,
					co->daddr);
		list_del(&co->list);
		kfree(co);
	}
	kfree(bs);

	return error;
}
752