xref: /openbmc/linux/fs/xfs/scrub/btree.c (revision 27e45f2e)
1 // SPDX-License-Identifier: GPL-2.0+
2 /*
3  * Copyright (C) 2017 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <darrick.wong@oracle.com>
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_inode.h"
13 #include "xfs_btree.h"
14 #include "scrub/scrub.h"
15 #include "scrub/common.h"
16 #include "scrub/btree.h"
17 #include "scrub/trace.h"
18 
19 /* btree scrubbing */
20 
21 /*
22  * Check for btree operation errors.  See the section about handling
23  * operational errors in common.c.
24  */
25 static bool
26 __xchk_btree_process_error(
27 	struct xfs_scrub	*sc,
28 	struct xfs_btree_cur	*cur,
29 	int			level,
30 	int			*error,
31 	__u32			errflag,
32 	void			*ret_ip)
33 {
34 	if (*error == 0)
35 		return true;
36 
37 	switch (*error) {
38 	case -EDEADLOCK:
39 		/* Used to restart an op with deadlock avoidance. */
40 		trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
41 		break;
42 	case -EFSBADCRC:
43 	case -EFSCORRUPTED:
44 		/* Note the badness but don't abort. */
45 		sc->sm->sm_flags |= errflag;
46 		*error = 0;
47 		fallthrough;
48 	default:
49 		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
50 			trace_xchk_ifork_btree_op_error(sc, cur, level,
51 					*error, ret_ip);
52 		else
53 			trace_xchk_btree_op_error(sc, cur, level,
54 					*error, ret_ip);
55 		break;
56 	}
57 	return false;
58 }
59 
60 bool
61 xchk_btree_process_error(
62 	struct xfs_scrub	*sc,
63 	struct xfs_btree_cur	*cur,
64 	int			level,
65 	int			*error)
66 {
67 	return __xchk_btree_process_error(sc, cur, level, error,
68 			XFS_SCRUB_OFLAG_CORRUPT, __return_address);
69 }
70 
71 bool
72 xchk_btree_xref_process_error(
73 	struct xfs_scrub	*sc,
74 	struct xfs_btree_cur	*cur,
75 	int			level,
76 	int			*error)
77 {
78 	return __xchk_btree_process_error(sc, cur, level, error,
79 			XFS_SCRUB_OFLAG_XFAIL, __return_address);
80 }
81 
82 /* Record btree block corruption. */
83 static void
84 __xchk_btree_set_corrupt(
85 	struct xfs_scrub	*sc,
86 	struct xfs_btree_cur	*cur,
87 	int			level,
88 	__u32			errflag,
89 	void			*ret_ip)
90 {
91 	sc->sm->sm_flags |= errflag;
92 
93 	if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
94 		trace_xchk_ifork_btree_error(sc, cur, level,
95 				ret_ip);
96 	else
97 		trace_xchk_btree_error(sc, cur, level,
98 				ret_ip);
99 }
100 
101 void
102 xchk_btree_set_corrupt(
103 	struct xfs_scrub	*sc,
104 	struct xfs_btree_cur	*cur,
105 	int			level)
106 {
107 	__xchk_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_CORRUPT,
108 			__return_address);
109 }
110 
111 void
112 xchk_btree_xref_set_corrupt(
113 	struct xfs_scrub	*sc,
114 	struct xfs_btree_cur	*cur,
115 	int			level)
116 {
117 	__xchk_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_XCORRUPT,
118 			__return_address);
119 }
120 
121 /*
122  * Make sure this record is in order and doesn't stray outside of the parent
123  * keys.
124  */
125 STATIC void
126 xchk_btree_rec(
127 	struct xchk_btree	*bs)
128 {
129 	struct xfs_btree_cur	*cur = bs->cur;
130 	union xfs_btree_rec	*rec;
131 	union xfs_btree_key	key;
132 	union xfs_btree_key	hkey;
133 	union xfs_btree_key	*keyp;
134 	struct xfs_btree_block	*block;
135 	struct xfs_btree_block	*keyblock;
136 	struct xfs_buf		*bp;
137 
138 	block = xfs_btree_get_block(cur, 0, &bp);
139 	rec = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr, block);
140 
141 	trace_xchk_btree_rec(bs->sc, cur, 0);
142 
143 	/* If this isn't the first record, are they in order? */
144 	if (cur->bc_levels[0].ptr > 1 &&
145 	    !cur->bc_ops->recs_inorder(cur, &bs->lastrec, rec))
146 		xchk_btree_set_corrupt(bs->sc, cur, 0);
147 	memcpy(&bs->lastrec, rec, cur->bc_ops->rec_len);
148 
149 	if (cur->bc_nlevels == 1)
150 		return;
151 
152 	/* Is this at least as large as the parent low key? */
153 	cur->bc_ops->init_key_from_rec(&key, rec);
154 	keyblock = xfs_btree_get_block(cur, 1, &bp);
155 	keyp = xfs_btree_key_addr(cur, cur->bc_levels[1].ptr, keyblock);
156 	if (cur->bc_ops->diff_two_keys(cur, &key, keyp) < 0)
157 		xchk_btree_set_corrupt(bs->sc, cur, 1);
158 
159 	if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
160 		return;
161 
162 	/* Is this no larger than the parent high key? */
163 	cur->bc_ops->init_high_key_from_rec(&hkey, rec);
164 	keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[1].ptr, keyblock);
165 	if (cur->bc_ops->diff_two_keys(cur, keyp, &hkey) < 0)
166 		xchk_btree_set_corrupt(bs->sc, cur, 1);
167 }
168 
169 /*
170  * Make sure this key is in order and doesn't stray outside of the parent
171  * keys.
172  */
173 STATIC void
174 xchk_btree_key(
175 	struct xchk_btree	*bs,
176 	int			level)
177 {
178 	struct xfs_btree_cur	*cur = bs->cur;
179 	union xfs_btree_key	*key;
180 	union xfs_btree_key	*keyp;
181 	struct xfs_btree_block	*block;
182 	struct xfs_btree_block	*keyblock;
183 	struct xfs_buf		*bp;
184 
185 	block = xfs_btree_get_block(cur, level, &bp);
186 	key = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block);
187 
188 	trace_xchk_btree_key(bs->sc, cur, level);
189 
190 	/* If this isn't the first key, are they in order? */
191 	if (cur->bc_levels[level].ptr > 1 &&
192 	    !cur->bc_ops->keys_inorder(cur, &bs->lastkey[level - 1], key))
193 		xchk_btree_set_corrupt(bs->sc, cur, level);
194 	memcpy(&bs->lastkey[level - 1], key, cur->bc_ops->key_len);
195 
196 	if (level + 1 >= cur->bc_nlevels)
197 		return;
198 
199 	/* Is this at least as large as the parent low key? */
200 	keyblock = xfs_btree_get_block(cur, level + 1, &bp);
201 	keyp = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr, keyblock);
202 	if (cur->bc_ops->diff_two_keys(cur, key, keyp) < 0)
203 		xchk_btree_set_corrupt(bs->sc, cur, level);
204 
205 	if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
206 		return;
207 
208 	/* Is this no larger than the parent high key? */
209 	key = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr, block);
210 	keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr,
211 			keyblock);
212 	if (cur->bc_ops->diff_two_keys(cur, keyp, key) < 0)
213 		xchk_btree_set_corrupt(bs->sc, cur, level);
214 }
215 
216 /*
217  * Check a btree pointer.  Returns true if it's ok to use this pointer.
218  * Callers do not need to set the corrupt flag.
219  */
220 static bool
221 xchk_btree_ptr_ok(
222 	struct xchk_btree	*bs,
223 	int			level,
224 	union xfs_btree_ptr	*ptr)
225 {
226 	bool			res;
227 
228 	/* A btree rooted in an inode has no block pointer to the root. */
229 	if ((bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
230 	    level == bs->cur->bc_nlevels)
231 		return true;
232 
233 	/* Otherwise, check the pointers. */
234 	if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS)
235 		res = xfs_btree_check_lptr(bs->cur, be64_to_cpu(ptr->l), level);
236 	else
237 		res = xfs_btree_check_sptr(bs->cur, be32_to_cpu(ptr->s), level);
238 	if (!res)
239 		xchk_btree_set_corrupt(bs->sc, bs->cur, level);
240 
241 	return res;
242 }
243 
244 /* Check that a btree block's sibling matches what we expect it. */
245 STATIC int
246 xchk_btree_block_check_sibling(
247 	struct xchk_btree	*bs,
248 	int			level,
249 	int			direction,
250 	union xfs_btree_ptr	*sibling)
251 {
252 	struct xfs_btree_cur	*cur = bs->cur;
253 	struct xfs_btree_block	*pblock;
254 	struct xfs_buf		*pbp;
255 	struct xfs_btree_cur	*ncur = NULL;
256 	union xfs_btree_ptr	*pp;
257 	int			success;
258 	int			error;
259 
260 	error = xfs_btree_dup_cursor(cur, &ncur);
261 	if (!xchk_btree_process_error(bs->sc, cur, level + 1, &error) ||
262 	    !ncur)
263 		return error;
264 
265 	/*
266 	 * If the pointer is null, we shouldn't be able to move the upper
267 	 * level pointer anywhere.
268 	 */
269 	if (xfs_btree_ptr_is_null(cur, sibling)) {
270 		if (direction > 0)
271 			error = xfs_btree_increment(ncur, level + 1, &success);
272 		else
273 			error = xfs_btree_decrement(ncur, level + 1, &success);
274 		if (error == 0 && success)
275 			xchk_btree_set_corrupt(bs->sc, cur, level);
276 		error = 0;
277 		goto out;
278 	}
279 
280 	/* Increment upper level pointer. */
281 	if (direction > 0)
282 		error = xfs_btree_increment(ncur, level + 1, &success);
283 	else
284 		error = xfs_btree_decrement(ncur, level + 1, &success);
285 	if (!xchk_btree_process_error(bs->sc, cur, level + 1, &error))
286 		goto out;
287 	if (!success) {
288 		xchk_btree_set_corrupt(bs->sc, cur, level + 1);
289 		goto out;
290 	}
291 
292 	/* Compare upper level pointer to sibling pointer. */
293 	pblock = xfs_btree_get_block(ncur, level + 1, &pbp);
294 	pp = xfs_btree_ptr_addr(ncur, ncur->bc_levels[level + 1].ptr, pblock);
295 	if (!xchk_btree_ptr_ok(bs, level + 1, pp))
296 		goto out;
297 	if (pbp)
298 		xchk_buffer_recheck(bs->sc, pbp);
299 
300 	if (xfs_btree_diff_two_ptrs(cur, pp, sibling))
301 		xchk_btree_set_corrupt(bs->sc, cur, level);
302 out:
303 	xfs_btree_del_cursor(ncur, XFS_BTREE_ERROR);
304 	return error;
305 }
306 
307 /* Check the siblings of a btree block. */
308 STATIC int
309 xchk_btree_block_check_siblings(
310 	struct xchk_btree	*bs,
311 	struct xfs_btree_block	*block)
312 {
313 	struct xfs_btree_cur	*cur = bs->cur;
314 	union xfs_btree_ptr	leftsib;
315 	union xfs_btree_ptr	rightsib;
316 	int			level;
317 	int			error = 0;
318 
319 	xfs_btree_get_sibling(cur, block, &leftsib, XFS_BB_LEFTSIB);
320 	xfs_btree_get_sibling(cur, block, &rightsib, XFS_BB_RIGHTSIB);
321 	level = xfs_btree_get_level(block);
322 
323 	/* Root block should never have siblings. */
324 	if (level == cur->bc_nlevels - 1) {
325 		if (!xfs_btree_ptr_is_null(cur, &leftsib) ||
326 		    !xfs_btree_ptr_is_null(cur, &rightsib))
327 			xchk_btree_set_corrupt(bs->sc, cur, level);
328 		goto out;
329 	}
330 
331 	/*
332 	 * Does the left & right sibling pointers match the adjacent
333 	 * parent level pointers?
334 	 * (These function absorbs error codes for us.)
335 	 */
336 	error = xchk_btree_block_check_sibling(bs, level, -1, &leftsib);
337 	if (error)
338 		return error;
339 	error = xchk_btree_block_check_sibling(bs, level, 1, &rightsib);
340 	if (error)
341 		return error;
342 out:
343 	return error;
344 }
345 
346 struct check_owner {
347 	struct list_head	list;
348 	xfs_daddr_t		daddr;
349 	int			level;
350 };
351 
352 /*
353  * Make sure this btree block isn't in the free list and that there's
354  * an rmap record for it.
355  */
356 STATIC int
357 xchk_btree_check_block_owner(
358 	struct xchk_btree	*bs,
359 	int			level,
360 	xfs_daddr_t		daddr)
361 {
362 	xfs_agnumber_t		agno;
363 	xfs_agblock_t		agbno;
364 	xfs_btnum_t		btnum;
365 	bool			init_sa;
366 	int			error = 0;
367 
368 	if (!bs->cur)
369 		return 0;
370 
371 	btnum = bs->cur->bc_btnum;
372 	agno = xfs_daddr_to_agno(bs->cur->bc_mp, daddr);
373 	agbno = xfs_daddr_to_agbno(bs->cur->bc_mp, daddr);
374 
375 	init_sa = bs->cur->bc_flags & XFS_BTREE_LONG_PTRS;
376 	if (init_sa) {
377 		error = xchk_ag_init_existing(bs->sc, agno, &bs->sc->sa);
378 		if (!xchk_btree_xref_process_error(bs->sc, bs->cur,
379 				level, &error))
380 			goto out_free;
381 	}
382 
383 	xchk_xref_is_used_space(bs->sc, agbno, 1);
384 	/*
385 	 * The bnobt scrubber aliases bs->cur to bs->sc->sa.bno_cur, so we
386 	 * have to nullify it (to shut down further block owner checks) if
387 	 * self-xref encounters problems.
388 	 */
389 	if (!bs->sc->sa.bno_cur && btnum == XFS_BTNUM_BNO)
390 		bs->cur = NULL;
391 
392 	xchk_xref_is_owned_by(bs->sc, agbno, 1, bs->oinfo);
393 	if (!bs->sc->sa.rmap_cur && btnum == XFS_BTNUM_RMAP)
394 		bs->cur = NULL;
395 
396 out_free:
397 	if (init_sa)
398 		xchk_ag_free(bs->sc, &bs->sc->sa);
399 
400 	return error;
401 }
402 
403 /* Check the owner of a btree block. */
404 STATIC int
405 xchk_btree_check_owner(
406 	struct xchk_btree	*bs,
407 	int			level,
408 	struct xfs_buf		*bp)
409 {
410 	struct xfs_btree_cur	*cur = bs->cur;
411 
412 	/*
413 	 * In theory, xfs_btree_get_block should only give us a null buffer
414 	 * pointer for the root of a root-in-inode btree type, but we need
415 	 * to check defensively here in case the cursor state is also screwed
416 	 * up.
417 	 */
418 	if (bp == NULL) {
419 		if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE))
420 			xchk_btree_set_corrupt(bs->sc, bs->cur, level);
421 		return 0;
422 	}
423 
424 	/*
425 	 * We want to cross-reference each btree block with the bnobt
426 	 * and the rmapbt.  We cannot cross-reference the bnobt or
427 	 * rmapbt while scanning the bnobt or rmapbt, respectively,
428 	 * because we cannot alter the cursor and we'd prefer not to
429 	 * duplicate cursors.  Therefore, save the buffer daddr for
430 	 * later scanning.
431 	 */
432 	if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) {
433 		struct check_owner	*co;
434 
435 		co = kmalloc(sizeof(struct check_owner), XCHK_GFP_FLAGS);
436 		if (!co)
437 			return -ENOMEM;
438 
439 		INIT_LIST_HEAD(&co->list);
440 		co->level = level;
441 		co->daddr = xfs_buf_daddr(bp);
442 		list_add_tail(&co->list, &bs->to_check);
443 		return 0;
444 	}
445 
446 	return xchk_btree_check_block_owner(bs, level, xfs_buf_daddr(bp));
447 }
448 
449 /* Decide if we want to check minrecs of a btree block in the inode root. */
450 static inline bool
451 xchk_btree_check_iroot_minrecs(
452 	struct xchk_btree	*bs)
453 {
454 	/*
455 	 * xfs_bmap_add_attrfork_btree had an implementation bug wherein it
456 	 * would miscalculate the space required for the data fork bmbt root
457 	 * when adding an attr fork, and promote the iroot contents to an
458 	 * external block unnecessarily.  This went unnoticed for many years
459 	 * until scrub found filesystems in this state.  Inode rooted btrees are
460 	 * not supposed to have immediate child blocks that are small enough
461 	 * that the contents could fit in the inode root, but we can't fail
462 	 * existing filesystems, so instead we disable the check for data fork
463 	 * bmap btrees when there's an attr fork.
464 	 */
465 	if (bs->cur->bc_btnum == XFS_BTNUM_BMAP &&
466 	    bs->cur->bc_ino.whichfork == XFS_DATA_FORK &&
467 	    xfs_inode_has_attr_fork(bs->sc->ip))
468 		return false;
469 
470 	return true;
471 }
472 
473 /*
474  * Check that this btree block has at least minrecs records or is one of the
475  * special blocks that don't require that.
476  */
477 STATIC void
478 xchk_btree_check_minrecs(
479 	struct xchk_btree	*bs,
480 	int			level,
481 	struct xfs_btree_block	*block)
482 {
483 	struct xfs_btree_cur	*cur = bs->cur;
484 	unsigned int		root_level = cur->bc_nlevels - 1;
485 	unsigned int		numrecs = be16_to_cpu(block->bb_numrecs);
486 
487 	/* More records than minrecs means the block is ok. */
488 	if (numrecs >= cur->bc_ops->get_minrecs(cur, level))
489 		return;
490 
491 	/*
492 	 * For btrees rooted in the inode, it's possible that the root block
493 	 * contents spilled into a regular ondisk block because there wasn't
494 	 * enough space in the inode root.  The number of records in that
495 	 * child block might be less than the standard minrecs, but that's ok
496 	 * provided that there's only one direct child of the root.
497 	 */
498 	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
499 	    level == cur->bc_nlevels - 2) {
500 		struct xfs_btree_block	*root_block;
501 		struct xfs_buf		*root_bp;
502 		int			root_maxrecs;
503 
504 		root_block = xfs_btree_get_block(cur, root_level, &root_bp);
505 		root_maxrecs = cur->bc_ops->get_dmaxrecs(cur, root_level);
506 		if (xchk_btree_check_iroot_minrecs(bs) &&
507 		    (be16_to_cpu(root_block->bb_numrecs) != 1 ||
508 		     numrecs <= root_maxrecs))
509 			xchk_btree_set_corrupt(bs->sc, cur, level);
510 		return;
511 	}
512 
513 	/*
514 	 * Otherwise, only the root level is allowed to have fewer than minrecs
515 	 * records or keyptrs.
516 	 */
517 	if (level < root_level)
518 		xchk_btree_set_corrupt(bs->sc, cur, level);
519 }
520 
521 /*
522  * Grab and scrub a btree block given a btree pointer.  Returns block
523  * and buffer pointers (if applicable) if they're ok to use.
524  */
525 STATIC int
526 xchk_btree_get_block(
527 	struct xchk_btree	*bs,
528 	int			level,
529 	union xfs_btree_ptr	*pp,
530 	struct xfs_btree_block	**pblock,
531 	struct xfs_buf		**pbp)
532 {
533 	xfs_failaddr_t		failed_at;
534 	int			error;
535 
536 	*pblock = NULL;
537 	*pbp = NULL;
538 
539 	error = xfs_btree_lookup_get_block(bs->cur, level, pp, pblock);
540 	if (!xchk_btree_process_error(bs->sc, bs->cur, level, &error) ||
541 	    !*pblock)
542 		return error;
543 
544 	xfs_btree_get_block(bs->cur, level, pbp);
545 	if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS)
546 		failed_at = __xfs_btree_check_lblock(bs->cur, *pblock,
547 				level, *pbp);
548 	else
549 		failed_at = __xfs_btree_check_sblock(bs->cur, *pblock,
550 				 level, *pbp);
551 	if (failed_at) {
552 		xchk_btree_set_corrupt(bs->sc, bs->cur, level);
553 		return 0;
554 	}
555 	if (*pbp)
556 		xchk_buffer_recheck(bs->sc, *pbp);
557 
558 	xchk_btree_check_minrecs(bs, level, *pblock);
559 
560 	/*
561 	 * Check the block's owner; this function absorbs error codes
562 	 * for us.
563 	 */
564 	error = xchk_btree_check_owner(bs, level, *pbp);
565 	if (error)
566 		return error;
567 
568 	/*
569 	 * Check the block's siblings; this function absorbs error codes
570 	 * for us.
571 	 */
572 	return xchk_btree_block_check_siblings(bs, *pblock);
573 }
574 
575 /*
576  * Check that the low and high keys of this block match the keys stored
577  * in the parent block.
578  */
579 STATIC void
580 xchk_btree_block_keys(
581 	struct xchk_btree	*bs,
582 	int			level,
583 	struct xfs_btree_block	*block)
584 {
585 	union xfs_btree_key	block_keys;
586 	struct xfs_btree_cur	*cur = bs->cur;
587 	union xfs_btree_key	*high_bk;
588 	union xfs_btree_key	*parent_keys;
589 	union xfs_btree_key	*high_pk;
590 	struct xfs_btree_block	*parent_block;
591 	struct xfs_buf		*bp;
592 
593 	if (level >= cur->bc_nlevels - 1)
594 		return;
595 
596 	/* Calculate the keys for this block. */
597 	xfs_btree_get_keys(cur, block, &block_keys);
598 
599 	/* Obtain the parent's copy of the keys for this block. */
600 	parent_block = xfs_btree_get_block(cur, level + 1, &bp);
601 	parent_keys = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr,
602 			parent_block);
603 
604 	if (cur->bc_ops->diff_two_keys(cur, &block_keys, parent_keys) != 0)
605 		xchk_btree_set_corrupt(bs->sc, cur, 1);
606 
607 	if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
608 		return;
609 
610 	/* Get high keys */
611 	high_bk = xfs_btree_high_key_from_key(cur, &block_keys);
612 	high_pk = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr,
613 			parent_block);
614 
615 	if (cur->bc_ops->diff_two_keys(cur, high_bk, high_pk) != 0)
616 		xchk_btree_set_corrupt(bs->sc, cur, 1);
617 }
618 
619 /*
620  * Visit all nodes and leaves of a btree.  Check that all pointers and
621  * records are in order, that the keys reflect the records, and use a callback
622  * so that the caller can verify individual records.
623  */
624 int
625 xchk_btree(
626 	struct xfs_scrub		*sc,
627 	struct xfs_btree_cur		*cur,
628 	xchk_btree_rec_fn		scrub_fn,
629 	const struct xfs_owner_info	*oinfo,
630 	void				*private)
631 {
632 	union xfs_btree_ptr		ptr;
633 	struct xchk_btree		*bs;
634 	union xfs_btree_ptr		*pp;
635 	union xfs_btree_rec		*recp;
636 	struct xfs_btree_block		*block;
637 	struct xfs_buf			*bp;
638 	struct check_owner		*co;
639 	struct check_owner		*n;
640 	size_t				cur_sz;
641 	int				level;
642 	int				error = 0;
643 
644 	/*
645 	 * Allocate the btree scrub context from the heap, because this
646 	 * structure can get rather large.  Don't let a caller feed us a
647 	 * totally absurd size.
648 	 */
649 	cur_sz = xchk_btree_sizeof(cur->bc_nlevels);
650 	if (cur_sz > PAGE_SIZE) {
651 		xchk_btree_set_corrupt(sc, cur, 0);
652 		return 0;
653 	}
654 	bs = kzalloc(cur_sz, XCHK_GFP_FLAGS);
655 	if (!bs)
656 		return -ENOMEM;
657 	bs->cur = cur;
658 	bs->scrub_rec = scrub_fn;
659 	bs->oinfo = oinfo;
660 	bs->private = private;
661 	bs->sc = sc;
662 
663 	/* Initialize scrub state */
664 	INIT_LIST_HEAD(&bs->to_check);
665 
666 	/*
667 	 * Load the root of the btree.  The helper function absorbs
668 	 * error codes for us.
669 	 */
670 	level = cur->bc_nlevels - 1;
671 	cur->bc_ops->init_ptr_from_cur(cur, &ptr);
672 	if (!xchk_btree_ptr_ok(bs, cur->bc_nlevels, &ptr))
673 		goto out;
674 	error = xchk_btree_get_block(bs, level, &ptr, &block, &bp);
675 	if (error || !block)
676 		goto out;
677 
678 	cur->bc_levels[level].ptr = 1;
679 
680 	while (level < cur->bc_nlevels) {
681 		block = xfs_btree_get_block(cur, level, &bp);
682 
683 		if (level == 0) {
684 			/* End of leaf, pop back towards the root. */
685 			if (cur->bc_levels[level].ptr >
686 			    be16_to_cpu(block->bb_numrecs)) {
687 				xchk_btree_block_keys(bs, level, block);
688 				if (level < cur->bc_nlevels - 1)
689 					cur->bc_levels[level + 1].ptr++;
690 				level++;
691 				continue;
692 			}
693 
694 			/* Records in order for scrub? */
695 			xchk_btree_rec(bs);
696 
697 			/* Call out to the record checker. */
698 			recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr,
699 					block);
700 			error = bs->scrub_rec(bs, recp);
701 			if (error)
702 				break;
703 			if (xchk_should_terminate(sc, &error) ||
704 			    (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
705 				break;
706 
707 			cur->bc_levels[level].ptr++;
708 			continue;
709 		}
710 
711 		/* End of node, pop back towards the root. */
712 		if (cur->bc_levels[level].ptr >
713 					be16_to_cpu(block->bb_numrecs)) {
714 			xchk_btree_block_keys(bs, level, block);
715 			if (level < cur->bc_nlevels - 1)
716 				cur->bc_levels[level + 1].ptr++;
717 			level++;
718 			continue;
719 		}
720 
721 		/* Keys in order for scrub? */
722 		xchk_btree_key(bs, level);
723 
724 		/* Drill another level deeper. */
725 		pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block);
726 		if (!xchk_btree_ptr_ok(bs, level, pp)) {
727 			cur->bc_levels[level].ptr++;
728 			continue;
729 		}
730 		level--;
731 		error = xchk_btree_get_block(bs, level, pp, &block, &bp);
732 		if (error || !block)
733 			goto out;
734 
735 		cur->bc_levels[level].ptr = 1;
736 	}
737 
738 out:
739 	/* Process deferred owner checks on btree blocks. */
740 	list_for_each_entry_safe(co, n, &bs->to_check, list) {
741 		if (!error && bs->cur)
742 			error = xchk_btree_check_block_owner(bs, co->level,
743 					co->daddr);
744 		list_del(&co->list);
745 		kfree(co);
746 	}
747 	kfree(bs);
748 
749 	return error;
750 }
751