xref: /openbmc/linux/fs/xfs/scrub/btree.c (revision b868a02e)
1 // SPDX-License-Identifier: GPL-2.0+
2 /*
3  * Copyright (C) 2017 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <darrick.wong@oracle.com>
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_inode.h"
13 #include "xfs_btree.h"
14 #include "scrub/scrub.h"
15 #include "scrub/common.h"
16 #include "scrub/btree.h"
17 #include "scrub/trace.h"
18 
19 /* btree scrubbing */
20 
21 /*
22  * Check for btree operation errors.  See the section about handling
23  * operational errors in common.c.
24  */
25 static bool
26 __xchk_btree_process_error(
27 	struct xfs_scrub	*sc,
28 	struct xfs_btree_cur	*cur,
29 	int			level,
30 	int			*error,
31 	__u32			errflag,
32 	void			*ret_ip)
33 {
34 	if (*error == 0)
35 		return true;
36 
37 	switch (*error) {
38 	case -EDEADLOCK:
39 		/* Used to restart an op with deadlock avoidance. */
40 		trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
41 		break;
42 	case -EFSBADCRC:
43 	case -EFSCORRUPTED:
44 		/* Note the badness but don't abort. */
45 		sc->sm->sm_flags |= errflag;
46 		*error = 0;
47 		fallthrough;
48 	default:
49 		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
50 			trace_xchk_ifork_btree_op_error(sc, cur, level,
51 					*error, ret_ip);
52 		else
53 			trace_xchk_btree_op_error(sc, cur, level,
54 					*error, ret_ip);
55 		break;
56 	}
57 	return false;
58 }
59 
60 bool
61 xchk_btree_process_error(
62 	struct xfs_scrub	*sc,
63 	struct xfs_btree_cur	*cur,
64 	int			level,
65 	int			*error)
66 {
67 	return __xchk_btree_process_error(sc, cur, level, error,
68 			XFS_SCRUB_OFLAG_CORRUPT, __return_address);
69 }
70 
71 bool
72 xchk_btree_xref_process_error(
73 	struct xfs_scrub	*sc,
74 	struct xfs_btree_cur	*cur,
75 	int			level,
76 	int			*error)
77 {
78 	return __xchk_btree_process_error(sc, cur, level, error,
79 			XFS_SCRUB_OFLAG_XFAIL, __return_address);
80 }
81 
82 /* Record btree block corruption. */
83 static void
84 __xchk_btree_set_corrupt(
85 	struct xfs_scrub	*sc,
86 	struct xfs_btree_cur	*cur,
87 	int			level,
88 	__u32			errflag,
89 	void			*ret_ip)
90 {
91 	sc->sm->sm_flags |= errflag;
92 
93 	if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
94 		trace_xchk_ifork_btree_error(sc, cur, level,
95 				ret_ip);
96 	else
97 		trace_xchk_btree_error(sc, cur, level,
98 				ret_ip);
99 }
100 
101 void
102 xchk_btree_set_corrupt(
103 	struct xfs_scrub	*sc,
104 	struct xfs_btree_cur	*cur,
105 	int			level)
106 {
107 	__xchk_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_CORRUPT,
108 			__return_address);
109 }
110 
111 void
112 xchk_btree_xref_set_corrupt(
113 	struct xfs_scrub	*sc,
114 	struct xfs_btree_cur	*cur,
115 	int			level)
116 {
117 	__xchk_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_XCORRUPT,
118 			__return_address);
119 }
120 
121 /*
122  * Make sure this record is in order and doesn't stray outside of the parent
123  * keys.
124  */
125 STATIC void
126 xchk_btree_rec(
127 	struct xchk_btree	*bs)
128 {
129 	struct xfs_btree_cur	*cur = bs->cur;
130 	union xfs_btree_rec	*rec;
131 	union xfs_btree_key	key;
132 	union xfs_btree_key	hkey;
133 	union xfs_btree_key	*keyp;
134 	struct xfs_btree_block	*block;
135 	struct xfs_btree_block	*keyblock;
136 	struct xfs_buf		*bp;
137 
138 	block = xfs_btree_get_block(cur, 0, &bp);
139 	rec = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr, block);
140 
141 	trace_xchk_btree_rec(bs->sc, cur, 0);
142 
143 	/* If this isn't the first record, are they in order? */
144 	if (cur->bc_levels[0].ptr > 1 &&
145 	    !cur->bc_ops->recs_inorder(cur, &bs->lastrec, rec))
146 		xchk_btree_set_corrupt(bs->sc, cur, 0);
147 	memcpy(&bs->lastrec, rec, cur->bc_ops->rec_len);
148 
149 	if (cur->bc_nlevels == 1)
150 		return;
151 
152 	/* Is this at least as large as the parent low key? */
153 	cur->bc_ops->init_key_from_rec(&key, rec);
154 	keyblock = xfs_btree_get_block(cur, 1, &bp);
155 	keyp = xfs_btree_key_addr(cur, cur->bc_levels[1].ptr, keyblock);
156 	if (cur->bc_ops->diff_two_keys(cur, &key, keyp) < 0)
157 		xchk_btree_set_corrupt(bs->sc, cur, 1);
158 
159 	if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
160 		return;
161 
162 	/* Is this no larger than the parent high key? */
163 	cur->bc_ops->init_high_key_from_rec(&hkey, rec);
164 	keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[1].ptr, keyblock);
165 	if (cur->bc_ops->diff_two_keys(cur, keyp, &hkey) < 0)
166 		xchk_btree_set_corrupt(bs->sc, cur, 1);
167 }
168 
169 /*
170  * Make sure this key is in order and doesn't stray outside of the parent
171  * keys.
172  */
173 STATIC void
174 xchk_btree_key(
175 	struct xchk_btree	*bs,
176 	int			level)
177 {
178 	struct xfs_btree_cur	*cur = bs->cur;
179 	union xfs_btree_key	*key;
180 	union xfs_btree_key	*keyp;
181 	struct xfs_btree_block	*block;
182 	struct xfs_btree_block	*keyblock;
183 	struct xfs_buf		*bp;
184 
185 	block = xfs_btree_get_block(cur, level, &bp);
186 	key = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block);
187 
188 	trace_xchk_btree_key(bs->sc, cur, level);
189 
190 	/* If this isn't the first key, are they in order? */
191 	if (cur->bc_levels[level].ptr > 1 &&
192 	    !cur->bc_ops->keys_inorder(cur, &bs->lastkey[level - 1], key))
193 		xchk_btree_set_corrupt(bs->sc, cur, level);
194 	memcpy(&bs->lastkey[level - 1], key, cur->bc_ops->key_len);
195 
196 	if (level + 1 >= cur->bc_nlevels)
197 		return;
198 
199 	/* Is this at least as large as the parent low key? */
200 	keyblock = xfs_btree_get_block(cur, level + 1, &bp);
201 	keyp = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr, keyblock);
202 	if (cur->bc_ops->diff_two_keys(cur, key, keyp) < 0)
203 		xchk_btree_set_corrupt(bs->sc, cur, level);
204 
205 	if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
206 		return;
207 
208 	/* Is this no larger than the parent high key? */
209 	key = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr, block);
210 	keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr,
211 			keyblock);
212 	if (cur->bc_ops->diff_two_keys(cur, keyp, key) < 0)
213 		xchk_btree_set_corrupt(bs->sc, cur, level);
214 }
215 
216 /*
217  * Check a btree pointer.  Returns true if it's ok to use this pointer.
218  * Callers do not need to set the corrupt flag.
219  */
220 static bool
221 xchk_btree_ptr_ok(
222 	struct xchk_btree	*bs,
223 	int			level,
224 	union xfs_btree_ptr	*ptr)
225 {
226 	bool			res;
227 
228 	/* A btree rooted in an inode has no block pointer to the root. */
229 	if ((bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
230 	    level == bs->cur->bc_nlevels)
231 		return true;
232 
233 	/* Otherwise, check the pointers. */
234 	if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS)
235 		res = xfs_btree_check_lptr(bs->cur, be64_to_cpu(ptr->l), level);
236 	else
237 		res = xfs_btree_check_sptr(bs->cur, be32_to_cpu(ptr->s), level);
238 	if (!res)
239 		xchk_btree_set_corrupt(bs->sc, bs->cur, level);
240 
241 	return res;
242 }
243 
244 /* Check that a btree block's sibling matches what we expect it. */
245 STATIC int
246 xchk_btree_block_check_sibling(
247 	struct xchk_btree	*bs,
248 	int			level,
249 	int			direction,
250 	union xfs_btree_ptr	*sibling)
251 {
252 	struct xfs_btree_cur	*cur = bs->cur;
253 	struct xfs_btree_block	*pblock;
254 	struct xfs_buf		*pbp;
255 	struct xfs_btree_cur	*ncur = NULL;
256 	union xfs_btree_ptr	*pp;
257 	int			success;
258 	int			error;
259 
260 	error = xfs_btree_dup_cursor(cur, &ncur);
261 	if (!xchk_btree_process_error(bs->sc, cur, level + 1, &error) ||
262 	    !ncur)
263 		return error;
264 
265 	/*
266 	 * If the pointer is null, we shouldn't be able to move the upper
267 	 * level pointer anywhere.
268 	 */
269 	if (xfs_btree_ptr_is_null(cur, sibling)) {
270 		if (direction > 0)
271 			error = xfs_btree_increment(ncur, level + 1, &success);
272 		else
273 			error = xfs_btree_decrement(ncur, level + 1, &success);
274 		if (error == 0 && success)
275 			xchk_btree_set_corrupt(bs->sc, cur, level);
276 		error = 0;
277 		goto out;
278 	}
279 
280 	/* Increment upper level pointer. */
281 	if (direction > 0)
282 		error = xfs_btree_increment(ncur, level + 1, &success);
283 	else
284 		error = xfs_btree_decrement(ncur, level + 1, &success);
285 	if (!xchk_btree_process_error(bs->sc, cur, level + 1, &error))
286 		goto out;
287 	if (!success) {
288 		xchk_btree_set_corrupt(bs->sc, cur, level + 1);
289 		goto out;
290 	}
291 
292 	/* Compare upper level pointer to sibling pointer. */
293 	pblock = xfs_btree_get_block(ncur, level + 1, &pbp);
294 	pp = xfs_btree_ptr_addr(ncur, ncur->bc_levels[level + 1].ptr, pblock);
295 	if (!xchk_btree_ptr_ok(bs, level + 1, pp))
296 		goto out;
297 	if (pbp)
298 		xchk_buffer_recheck(bs->sc, pbp);
299 
300 	if (xfs_btree_diff_two_ptrs(cur, pp, sibling))
301 		xchk_btree_set_corrupt(bs->sc, cur, level);
302 out:
303 	xfs_btree_del_cursor(ncur, XFS_BTREE_ERROR);
304 	return error;
305 }
306 
307 /* Check the siblings of a btree block. */
308 STATIC int
309 xchk_btree_block_check_siblings(
310 	struct xchk_btree	*bs,
311 	struct xfs_btree_block	*block)
312 {
313 	struct xfs_btree_cur	*cur = bs->cur;
314 	union xfs_btree_ptr	leftsib;
315 	union xfs_btree_ptr	rightsib;
316 	int			level;
317 	int			error = 0;
318 
319 	xfs_btree_get_sibling(cur, block, &leftsib, XFS_BB_LEFTSIB);
320 	xfs_btree_get_sibling(cur, block, &rightsib, XFS_BB_RIGHTSIB);
321 	level = xfs_btree_get_level(block);
322 
323 	/* Root block should never have siblings. */
324 	if (level == cur->bc_nlevels - 1) {
325 		if (!xfs_btree_ptr_is_null(cur, &leftsib) ||
326 		    !xfs_btree_ptr_is_null(cur, &rightsib))
327 			xchk_btree_set_corrupt(bs->sc, cur, level);
328 		goto out;
329 	}
330 
331 	/*
332 	 * Does the left & right sibling pointers match the adjacent
333 	 * parent level pointers?
334 	 * (These function absorbs error codes for us.)
335 	 */
336 	error = xchk_btree_block_check_sibling(bs, level, -1, &leftsib);
337 	if (error)
338 		return error;
339 	error = xchk_btree_block_check_sibling(bs, level, 1, &rightsib);
340 	if (error)
341 		return error;
342 out:
343 	return error;
344 }
345 
/*
 * One deferred btree block owner check.  xchk_btree_check_owner queues
 * these on the scrub context's to_check list while a bnobt or rmapbt is
 * being walked; xchk_btree processes and frees them once the walk is over.
 */
struct check_owner {
	/* Linkage into the to_check list. */
	struct list_head	list;
	/* Disk address of the btree block's buffer (xfs_buf_daddr). */
	xfs_daddr_t		daddr;
	/* Btree level at which the block was encountered. */
	int			level;
};
351 
352 /*
353  * Make sure this btree block isn't in the free list and that there's
354  * an rmap record for it.
355  */
356 STATIC int
357 xchk_btree_check_block_owner(
358 	struct xchk_btree	*bs,
359 	int			level,
360 	xfs_daddr_t		daddr)
361 {
362 	xfs_agnumber_t		agno;
363 	xfs_agblock_t		agbno;
364 	xfs_btnum_t		btnum;
365 	bool			init_sa;
366 	int			error = 0;
367 
368 	if (!bs->cur)
369 		return 0;
370 
371 	btnum = bs->cur->bc_btnum;
372 	agno = xfs_daddr_to_agno(bs->cur->bc_mp, daddr);
373 	agbno = xfs_daddr_to_agbno(bs->cur->bc_mp, daddr);
374 
375 	init_sa = bs->cur->bc_flags & XFS_BTREE_LONG_PTRS;
376 	if (init_sa) {
377 		error = xchk_ag_init_existing(bs->sc, agno, &bs->sc->sa);
378 		if (!xchk_btree_xref_process_error(bs->sc, bs->cur,
379 				level, &error))
380 			goto out_free;
381 	}
382 
383 	xchk_xref_is_used_space(bs->sc, agbno, 1);
384 	/*
385 	 * The bnobt scrubber aliases bs->cur to bs->sc->sa.bno_cur, so we
386 	 * have to nullify it (to shut down further block owner checks) if
387 	 * self-xref encounters problems.
388 	 */
389 	if (!bs->sc->sa.bno_cur && btnum == XFS_BTNUM_BNO)
390 		bs->cur = NULL;
391 
392 	xchk_xref_is_owned_by(bs->sc, agbno, 1, bs->oinfo);
393 	if (!bs->sc->sa.rmap_cur && btnum == XFS_BTNUM_RMAP)
394 		bs->cur = NULL;
395 
396 out_free:
397 	if (init_sa)
398 		xchk_ag_free(bs->sc, &bs->sc->sa);
399 
400 	return error;
401 }
402 
403 /* Check the owner of a btree block. */
404 STATIC int
405 xchk_btree_check_owner(
406 	struct xchk_btree	*bs,
407 	int			level,
408 	struct xfs_buf		*bp)
409 {
410 	struct xfs_btree_cur	*cur = bs->cur;
411 	struct check_owner	*co;
412 
413 	/*
414 	 * In theory, xfs_btree_get_block should only give us a null buffer
415 	 * pointer for the root of a root-in-inode btree type, but we need
416 	 * to check defensively here in case the cursor state is also screwed
417 	 * up.
418 	 */
419 	if (bp == NULL) {
420 		if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE))
421 			xchk_btree_set_corrupt(bs->sc, bs->cur, level);
422 		return 0;
423 	}
424 
425 	/*
426 	 * We want to cross-reference each btree block with the bnobt
427 	 * and the rmapbt.  We cannot cross-reference the bnobt or
428 	 * rmapbt while scanning the bnobt or rmapbt, respectively,
429 	 * because we cannot alter the cursor and we'd prefer not to
430 	 * duplicate cursors.  Therefore, save the buffer daddr for
431 	 * later scanning.
432 	 */
433 	if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) {
434 		co = kmem_alloc(sizeof(struct check_owner),
435 				KM_MAYFAIL);
436 		if (!co)
437 			return -ENOMEM;
438 		co->level = level;
439 		co->daddr = xfs_buf_daddr(bp);
440 		list_add_tail(&co->list, &bs->to_check);
441 		return 0;
442 	}
443 
444 	return xchk_btree_check_block_owner(bs, level, xfs_buf_daddr(bp));
445 }
446 
447 /* Decide if we want to check minrecs of a btree block in the inode root. */
448 static inline bool
449 xchk_btree_check_iroot_minrecs(
450 	struct xchk_btree	*bs)
451 {
452 	/*
453 	 * xfs_bmap_add_attrfork_btree had an implementation bug wherein it
454 	 * would miscalculate the space required for the data fork bmbt root
455 	 * when adding an attr fork, and promote the iroot contents to an
456 	 * external block unnecessarily.  This went unnoticed for many years
457 	 * until scrub found filesystems in this state.  Inode rooted btrees are
458 	 * not supposed to have immediate child blocks that are small enough
459 	 * that the contents could fit in the inode root, but we can't fail
460 	 * existing filesystems, so instead we disable the check for data fork
461 	 * bmap btrees when there's an attr fork.
462 	 */
463 	if (bs->cur->bc_btnum == XFS_BTNUM_BMAP &&
464 	    bs->cur->bc_ino.whichfork == XFS_DATA_FORK &&
465 	    xfs_inode_has_attr_fork(bs->sc->ip))
466 		return false;
467 
468 	return true;
469 }
470 
471 /*
472  * Check that this btree block has at least minrecs records or is one of the
473  * special blocks that don't require that.
474  */
475 STATIC void
476 xchk_btree_check_minrecs(
477 	struct xchk_btree	*bs,
478 	int			level,
479 	struct xfs_btree_block	*block)
480 {
481 	struct xfs_btree_cur	*cur = bs->cur;
482 	unsigned int		root_level = cur->bc_nlevels - 1;
483 	unsigned int		numrecs = be16_to_cpu(block->bb_numrecs);
484 
485 	/* More records than minrecs means the block is ok. */
486 	if (numrecs >= cur->bc_ops->get_minrecs(cur, level))
487 		return;
488 
489 	/*
490 	 * For btrees rooted in the inode, it's possible that the root block
491 	 * contents spilled into a regular ondisk block because there wasn't
492 	 * enough space in the inode root.  The number of records in that
493 	 * child block might be less than the standard minrecs, but that's ok
494 	 * provided that there's only one direct child of the root.
495 	 */
496 	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
497 	    level == cur->bc_nlevels - 2) {
498 		struct xfs_btree_block	*root_block;
499 		struct xfs_buf		*root_bp;
500 		int			root_maxrecs;
501 
502 		root_block = xfs_btree_get_block(cur, root_level, &root_bp);
503 		root_maxrecs = cur->bc_ops->get_dmaxrecs(cur, root_level);
504 		if (xchk_btree_check_iroot_minrecs(bs) &&
505 		    (be16_to_cpu(root_block->bb_numrecs) != 1 ||
506 		     numrecs <= root_maxrecs))
507 			xchk_btree_set_corrupt(bs->sc, cur, level);
508 		return;
509 	}
510 
511 	/*
512 	 * Otherwise, only the root level is allowed to have fewer than minrecs
513 	 * records or keyptrs.
514 	 */
515 	if (level < root_level)
516 		xchk_btree_set_corrupt(bs->sc, cur, level);
517 }
518 
519 /*
520  * Grab and scrub a btree block given a btree pointer.  Returns block
521  * and buffer pointers (if applicable) if they're ok to use.
522  */
523 STATIC int
524 xchk_btree_get_block(
525 	struct xchk_btree	*bs,
526 	int			level,
527 	union xfs_btree_ptr	*pp,
528 	struct xfs_btree_block	**pblock,
529 	struct xfs_buf		**pbp)
530 {
531 	xfs_failaddr_t		failed_at;
532 	int			error;
533 
534 	*pblock = NULL;
535 	*pbp = NULL;
536 
537 	error = xfs_btree_lookup_get_block(bs->cur, level, pp, pblock);
538 	if (!xchk_btree_process_error(bs->sc, bs->cur, level, &error) ||
539 	    !*pblock)
540 		return error;
541 
542 	xfs_btree_get_block(bs->cur, level, pbp);
543 	if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS)
544 		failed_at = __xfs_btree_check_lblock(bs->cur, *pblock,
545 				level, *pbp);
546 	else
547 		failed_at = __xfs_btree_check_sblock(bs->cur, *pblock,
548 				 level, *pbp);
549 	if (failed_at) {
550 		xchk_btree_set_corrupt(bs->sc, bs->cur, level);
551 		return 0;
552 	}
553 	if (*pbp)
554 		xchk_buffer_recheck(bs->sc, *pbp);
555 
556 	xchk_btree_check_minrecs(bs, level, *pblock);
557 
558 	/*
559 	 * Check the block's owner; this function absorbs error codes
560 	 * for us.
561 	 */
562 	error = xchk_btree_check_owner(bs, level, *pbp);
563 	if (error)
564 		return error;
565 
566 	/*
567 	 * Check the block's siblings; this function absorbs error codes
568 	 * for us.
569 	 */
570 	return xchk_btree_block_check_siblings(bs, *pblock);
571 }
572 
573 /*
574  * Check that the low and high keys of this block match the keys stored
575  * in the parent block.
576  */
577 STATIC void
578 xchk_btree_block_keys(
579 	struct xchk_btree	*bs,
580 	int			level,
581 	struct xfs_btree_block	*block)
582 {
583 	union xfs_btree_key	block_keys;
584 	struct xfs_btree_cur	*cur = bs->cur;
585 	union xfs_btree_key	*high_bk;
586 	union xfs_btree_key	*parent_keys;
587 	union xfs_btree_key	*high_pk;
588 	struct xfs_btree_block	*parent_block;
589 	struct xfs_buf		*bp;
590 
591 	if (level >= cur->bc_nlevels - 1)
592 		return;
593 
594 	/* Calculate the keys for this block. */
595 	xfs_btree_get_keys(cur, block, &block_keys);
596 
597 	/* Obtain the parent's copy of the keys for this block. */
598 	parent_block = xfs_btree_get_block(cur, level + 1, &bp);
599 	parent_keys = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr,
600 			parent_block);
601 
602 	if (cur->bc_ops->diff_two_keys(cur, &block_keys, parent_keys) != 0)
603 		xchk_btree_set_corrupt(bs->sc, cur, 1);
604 
605 	if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
606 		return;
607 
608 	/* Get high keys */
609 	high_bk = xfs_btree_high_key_from_key(cur, &block_keys);
610 	high_pk = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr,
611 			parent_block);
612 
613 	if (cur->bc_ops->diff_two_keys(cur, high_bk, high_pk) != 0)
614 		xchk_btree_set_corrupt(bs->sc, cur, 1);
615 }
616 
617 /*
618  * Visit all nodes and leaves of a btree.  Check that all pointers and
619  * records are in order, that the keys reflect the records, and use a callback
620  * so that the caller can verify individual records.
621  */
622 int
623 xchk_btree(
624 	struct xfs_scrub		*sc,
625 	struct xfs_btree_cur		*cur,
626 	xchk_btree_rec_fn		scrub_fn,
627 	const struct xfs_owner_info	*oinfo,
628 	void				*private)
629 {
630 	union xfs_btree_ptr		ptr;
631 	struct xchk_btree		*bs;
632 	union xfs_btree_ptr		*pp;
633 	union xfs_btree_rec		*recp;
634 	struct xfs_btree_block		*block;
635 	struct xfs_buf			*bp;
636 	struct check_owner		*co;
637 	struct check_owner		*n;
638 	size_t				cur_sz;
639 	int				level;
640 	int				error = 0;
641 
642 	/*
643 	 * Allocate the btree scrub context from the heap, because this
644 	 * structure can get rather large.  Don't let a caller feed us a
645 	 * totally absurd size.
646 	 */
647 	cur_sz = xchk_btree_sizeof(cur->bc_nlevels);
648 	if (cur_sz > PAGE_SIZE) {
649 		xchk_btree_set_corrupt(sc, cur, 0);
650 		return 0;
651 	}
652 	bs = kmem_zalloc(cur_sz, KM_NOFS | KM_MAYFAIL);
653 	if (!bs)
654 		return -ENOMEM;
655 	bs->cur = cur;
656 	bs->scrub_rec = scrub_fn;
657 	bs->oinfo = oinfo;
658 	bs->private = private;
659 	bs->sc = sc;
660 
661 	/* Initialize scrub state */
662 	INIT_LIST_HEAD(&bs->to_check);
663 
664 	/*
665 	 * Load the root of the btree.  The helper function absorbs
666 	 * error codes for us.
667 	 */
668 	level = cur->bc_nlevels - 1;
669 	cur->bc_ops->init_ptr_from_cur(cur, &ptr);
670 	if (!xchk_btree_ptr_ok(bs, cur->bc_nlevels, &ptr))
671 		goto out;
672 	error = xchk_btree_get_block(bs, level, &ptr, &block, &bp);
673 	if (error || !block)
674 		goto out;
675 
676 	cur->bc_levels[level].ptr = 1;
677 
678 	while (level < cur->bc_nlevels) {
679 		block = xfs_btree_get_block(cur, level, &bp);
680 
681 		if (level == 0) {
682 			/* End of leaf, pop back towards the root. */
683 			if (cur->bc_levels[level].ptr >
684 			    be16_to_cpu(block->bb_numrecs)) {
685 				xchk_btree_block_keys(bs, level, block);
686 				if (level < cur->bc_nlevels - 1)
687 					cur->bc_levels[level + 1].ptr++;
688 				level++;
689 				continue;
690 			}
691 
692 			/* Records in order for scrub? */
693 			xchk_btree_rec(bs);
694 
695 			/* Call out to the record checker. */
696 			recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr,
697 					block);
698 			error = bs->scrub_rec(bs, recp);
699 			if (error)
700 				break;
701 			if (xchk_should_terminate(sc, &error) ||
702 			    (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
703 				break;
704 
705 			cur->bc_levels[level].ptr++;
706 			continue;
707 		}
708 
709 		/* End of node, pop back towards the root. */
710 		if (cur->bc_levels[level].ptr >
711 					be16_to_cpu(block->bb_numrecs)) {
712 			xchk_btree_block_keys(bs, level, block);
713 			if (level < cur->bc_nlevels - 1)
714 				cur->bc_levels[level + 1].ptr++;
715 			level++;
716 			continue;
717 		}
718 
719 		/* Keys in order for scrub? */
720 		xchk_btree_key(bs, level);
721 
722 		/* Drill another level deeper. */
723 		pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block);
724 		if (!xchk_btree_ptr_ok(bs, level, pp)) {
725 			cur->bc_levels[level].ptr++;
726 			continue;
727 		}
728 		level--;
729 		error = xchk_btree_get_block(bs, level, pp, &block, &bp);
730 		if (error || !block)
731 			goto out;
732 
733 		cur->bc_levels[level].ptr = 1;
734 	}
735 
736 out:
737 	/* Process deferred owner checks on btree blocks. */
738 	list_for_each_entry_safe(co, n, &bs->to_check, list) {
739 		if (!error && bs->cur)
740 			error = xchk_btree_check_block_owner(bs, co->level,
741 					co->daddr);
742 		list_del(&co->list);
743 		kmem_free(co);
744 	}
745 	kmem_free(bs);
746 
747 	return error;
748 }
749