xref: /openbmc/linux/fs/xfs/scrub/btree.c (revision 2ae1beb3)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_inode.h"
13 #include "xfs_btree.h"
14 #include "scrub/scrub.h"
15 #include "scrub/common.h"
16 #include "scrub/btree.h"
17 #include "scrub/trace.h"
18 
19 /* btree scrubbing */
20 
21 /*
22  * Check for btree operation errors.  See the section about handling
23  * operational errors in common.c.
24  */
25 static bool
26 __xchk_btree_process_error(
27 	struct xfs_scrub	*sc,
28 	struct xfs_btree_cur	*cur,
29 	int			level,
30 	int			*error,
31 	__u32			errflag,
32 	void			*ret_ip)
33 {
34 	if (*error == 0)
35 		return true;
36 
37 	switch (*error) {
38 	case -EDEADLOCK:
39 	case -ECHRNG:
40 		/* Used to restart an op with deadlock avoidance. */
41 		trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
42 		break;
43 	case -EFSBADCRC:
44 	case -EFSCORRUPTED:
45 		/* Note the badness but don't abort. */
46 		sc->sm->sm_flags |= errflag;
47 		*error = 0;
48 		fallthrough;
49 	default:
50 		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
51 			trace_xchk_ifork_btree_op_error(sc, cur, level,
52 					*error, ret_ip);
53 		else
54 			trace_xchk_btree_op_error(sc, cur, level,
55 					*error, ret_ip);
56 		break;
57 	}
58 	return false;
59 }
60 
61 bool
62 xchk_btree_process_error(
63 	struct xfs_scrub	*sc,
64 	struct xfs_btree_cur	*cur,
65 	int			level,
66 	int			*error)
67 {
68 	return __xchk_btree_process_error(sc, cur, level, error,
69 			XFS_SCRUB_OFLAG_CORRUPT, __return_address);
70 }
71 
72 bool
73 xchk_btree_xref_process_error(
74 	struct xfs_scrub	*sc,
75 	struct xfs_btree_cur	*cur,
76 	int			level,
77 	int			*error)
78 {
79 	return __xchk_btree_process_error(sc, cur, level, error,
80 			XFS_SCRUB_OFLAG_XFAIL, __return_address);
81 }
82 
83 /* Record btree block corruption. */
84 static void
85 __xchk_btree_set_corrupt(
86 	struct xfs_scrub	*sc,
87 	struct xfs_btree_cur	*cur,
88 	int			level,
89 	__u32			errflag,
90 	void			*ret_ip)
91 {
92 	sc->sm->sm_flags |= errflag;
93 
94 	if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
95 		trace_xchk_ifork_btree_error(sc, cur, level,
96 				ret_ip);
97 	else
98 		trace_xchk_btree_error(sc, cur, level,
99 				ret_ip);
100 }
101 
102 void
103 xchk_btree_set_corrupt(
104 	struct xfs_scrub	*sc,
105 	struct xfs_btree_cur	*cur,
106 	int			level)
107 {
108 	__xchk_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_CORRUPT,
109 			__return_address);
110 }
111 
112 void
113 xchk_btree_xref_set_corrupt(
114 	struct xfs_scrub	*sc,
115 	struct xfs_btree_cur	*cur,
116 	int			level)
117 {
118 	__xchk_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_XCORRUPT,
119 			__return_address);
120 }
121 
122 void
123 xchk_btree_set_preen(
124 	struct xfs_scrub	*sc,
125 	struct xfs_btree_cur	*cur,
126 	int			level)
127 {
128 	__xchk_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_PREEN,
129 			__return_address);
130 }
131 
132 /*
133  * Make sure this record is in order and doesn't stray outside of the parent
134  * keys.
135  */
136 STATIC void
137 xchk_btree_rec(
138 	struct xchk_btree	*bs)
139 {
140 	struct xfs_btree_cur	*cur = bs->cur;
141 	union xfs_btree_rec	*rec;
142 	union xfs_btree_key	key;
143 	union xfs_btree_key	hkey;
144 	union xfs_btree_key	*keyp;
145 	struct xfs_btree_block	*block;
146 	struct xfs_btree_block	*keyblock;
147 	struct xfs_buf		*bp;
148 
149 	block = xfs_btree_get_block(cur, 0, &bp);
150 	rec = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr, block);
151 
152 	trace_xchk_btree_rec(bs->sc, cur, 0);
153 
154 	/* Are all records across all record blocks in order? */
155 	if (bs->lastrec_valid &&
156 	    !cur->bc_ops->recs_inorder(cur, &bs->lastrec, rec))
157 		xchk_btree_set_corrupt(bs->sc, cur, 0);
158 	memcpy(&bs->lastrec, rec, cur->bc_ops->rec_len);
159 	bs->lastrec_valid = true;
160 
161 	if (cur->bc_nlevels == 1)
162 		return;
163 
164 	/* Is low_key(rec) at least as large as the parent low key? */
165 	cur->bc_ops->init_key_from_rec(&key, rec);
166 	keyblock = xfs_btree_get_block(cur, 1, &bp);
167 	keyp = xfs_btree_key_addr(cur, cur->bc_levels[1].ptr, keyblock);
168 	if (xfs_btree_keycmp_lt(cur, &key, keyp))
169 		xchk_btree_set_corrupt(bs->sc, cur, 1);
170 
171 	if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
172 		return;
173 
174 	/* Is high_key(rec) no larger than the parent high key? */
175 	cur->bc_ops->init_high_key_from_rec(&hkey, rec);
176 	keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[1].ptr, keyblock);
177 	if (xfs_btree_keycmp_lt(cur, keyp, &hkey))
178 		xchk_btree_set_corrupt(bs->sc, cur, 1);
179 }
180 
181 /*
182  * Make sure this key is in order and doesn't stray outside of the parent
183  * keys.
184  */
185 STATIC void
186 xchk_btree_key(
187 	struct xchk_btree	*bs,
188 	int			level)
189 {
190 	struct xfs_btree_cur	*cur = bs->cur;
191 	union xfs_btree_key	*key;
192 	union xfs_btree_key	*keyp;
193 	struct xfs_btree_block	*block;
194 	struct xfs_btree_block	*keyblock;
195 	struct xfs_buf		*bp;
196 
197 	block = xfs_btree_get_block(cur, level, &bp);
198 	key = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block);
199 
200 	trace_xchk_btree_key(bs->sc, cur, level);
201 
202 	/* Are all low keys across all node blocks in order? */
203 	if (bs->lastkey[level - 1].valid &&
204 	    !cur->bc_ops->keys_inorder(cur, &bs->lastkey[level - 1].key, key))
205 		xchk_btree_set_corrupt(bs->sc, cur, level);
206 	memcpy(&bs->lastkey[level - 1].key, key, cur->bc_ops->key_len);
207 	bs->lastkey[level - 1].valid = true;
208 
209 	if (level + 1 >= cur->bc_nlevels)
210 		return;
211 
212 	/* Is this block's low key at least as large as the parent low key? */
213 	keyblock = xfs_btree_get_block(cur, level + 1, &bp);
214 	keyp = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr, keyblock);
215 	if (xfs_btree_keycmp_lt(cur, key, keyp))
216 		xchk_btree_set_corrupt(bs->sc, cur, level);
217 
218 	if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
219 		return;
220 
221 	/* Is this block's high key no larger than the parent high key? */
222 	key = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr, block);
223 	keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr,
224 			keyblock);
225 	if (xfs_btree_keycmp_lt(cur, keyp, key))
226 		xchk_btree_set_corrupt(bs->sc, cur, level);
227 }
228 
229 /*
230  * Check a btree pointer.  Returns true if it's ok to use this pointer.
231  * Callers do not need to set the corrupt flag.
232  */
233 static bool
234 xchk_btree_ptr_ok(
235 	struct xchk_btree	*bs,
236 	int			level,
237 	union xfs_btree_ptr	*ptr)
238 {
239 	bool			res;
240 
241 	/* A btree rooted in an inode has no block pointer to the root. */
242 	if ((bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
243 	    level == bs->cur->bc_nlevels)
244 		return true;
245 
246 	/* Otherwise, check the pointers. */
247 	if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS)
248 		res = xfs_btree_check_lptr(bs->cur, be64_to_cpu(ptr->l), level);
249 	else
250 		res = xfs_btree_check_sptr(bs->cur, be32_to_cpu(ptr->s), level);
251 	if (!res)
252 		xchk_btree_set_corrupt(bs->sc, bs->cur, level);
253 
254 	return res;
255 }
256 
257 /* Check that a btree block's sibling matches what we expect it. */
258 STATIC int
259 xchk_btree_block_check_sibling(
260 	struct xchk_btree	*bs,
261 	int			level,
262 	int			direction,
263 	union xfs_btree_ptr	*sibling)
264 {
265 	struct xfs_btree_cur	*cur = bs->cur;
266 	struct xfs_btree_block	*pblock;
267 	struct xfs_buf		*pbp;
268 	struct xfs_btree_cur	*ncur = NULL;
269 	union xfs_btree_ptr	*pp;
270 	int			success;
271 	int			error;
272 
273 	error = xfs_btree_dup_cursor(cur, &ncur);
274 	if (!xchk_btree_process_error(bs->sc, cur, level + 1, &error) ||
275 	    !ncur)
276 		return error;
277 
278 	/*
279 	 * If the pointer is null, we shouldn't be able to move the upper
280 	 * level pointer anywhere.
281 	 */
282 	if (xfs_btree_ptr_is_null(cur, sibling)) {
283 		if (direction > 0)
284 			error = xfs_btree_increment(ncur, level + 1, &success);
285 		else
286 			error = xfs_btree_decrement(ncur, level + 1, &success);
287 		if (error == 0 && success)
288 			xchk_btree_set_corrupt(bs->sc, cur, level);
289 		error = 0;
290 		goto out;
291 	}
292 
293 	/* Increment upper level pointer. */
294 	if (direction > 0)
295 		error = xfs_btree_increment(ncur, level + 1, &success);
296 	else
297 		error = xfs_btree_decrement(ncur, level + 1, &success);
298 	if (!xchk_btree_process_error(bs->sc, cur, level + 1, &error))
299 		goto out;
300 	if (!success) {
301 		xchk_btree_set_corrupt(bs->sc, cur, level + 1);
302 		goto out;
303 	}
304 
305 	/* Compare upper level pointer to sibling pointer. */
306 	pblock = xfs_btree_get_block(ncur, level + 1, &pbp);
307 	pp = xfs_btree_ptr_addr(ncur, ncur->bc_levels[level + 1].ptr, pblock);
308 	if (!xchk_btree_ptr_ok(bs, level + 1, pp))
309 		goto out;
310 	if (pbp)
311 		xchk_buffer_recheck(bs->sc, pbp);
312 
313 	if (xfs_btree_diff_two_ptrs(cur, pp, sibling))
314 		xchk_btree_set_corrupt(bs->sc, cur, level);
315 out:
316 	xfs_btree_del_cursor(ncur, XFS_BTREE_ERROR);
317 	return error;
318 }
319 
320 /* Check the siblings of a btree block. */
321 STATIC int
322 xchk_btree_block_check_siblings(
323 	struct xchk_btree	*bs,
324 	struct xfs_btree_block	*block)
325 {
326 	struct xfs_btree_cur	*cur = bs->cur;
327 	union xfs_btree_ptr	leftsib;
328 	union xfs_btree_ptr	rightsib;
329 	int			level;
330 	int			error = 0;
331 
332 	xfs_btree_get_sibling(cur, block, &leftsib, XFS_BB_LEFTSIB);
333 	xfs_btree_get_sibling(cur, block, &rightsib, XFS_BB_RIGHTSIB);
334 	level = xfs_btree_get_level(block);
335 
336 	/* Root block should never have siblings. */
337 	if (level == cur->bc_nlevels - 1) {
338 		if (!xfs_btree_ptr_is_null(cur, &leftsib) ||
339 		    !xfs_btree_ptr_is_null(cur, &rightsib))
340 			xchk_btree_set_corrupt(bs->sc, cur, level);
341 		goto out;
342 	}
343 
344 	/*
345 	 * Does the left & right sibling pointers match the adjacent
346 	 * parent level pointers?
347 	 * (These function absorbs error codes for us.)
348 	 */
349 	error = xchk_btree_block_check_sibling(bs, level, -1, &leftsib);
350 	if (error)
351 		return error;
352 	error = xchk_btree_block_check_sibling(bs, level, 1, &rightsib);
353 	if (error)
354 		return error;
355 out:
356 	return error;
357 }
358 
359 struct check_owner {
360 	struct list_head	list;
361 	xfs_daddr_t		daddr;
362 	int			level;
363 };
364 
365 /*
366  * Make sure this btree block isn't in the free list and that there's
367  * an rmap record for it.
368  */
369 STATIC int
370 xchk_btree_check_block_owner(
371 	struct xchk_btree	*bs,
372 	int			level,
373 	xfs_daddr_t		daddr)
374 {
375 	xfs_agnumber_t		agno;
376 	xfs_agblock_t		agbno;
377 	xfs_btnum_t		btnum;
378 	bool			init_sa;
379 	int			error = 0;
380 
381 	if (!bs->cur)
382 		return 0;
383 
384 	btnum = bs->cur->bc_btnum;
385 	agno = xfs_daddr_to_agno(bs->cur->bc_mp, daddr);
386 	agbno = xfs_daddr_to_agbno(bs->cur->bc_mp, daddr);
387 
388 	/*
389 	 * If the btree being examined is not itself a per-AG btree, initialize
390 	 * sc->sa so that we can check for the presence of an ownership record
391 	 * in the rmap btree for the AG containing the block.
392 	 */
393 	init_sa = bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE;
394 	if (init_sa) {
395 		error = xchk_ag_init_existing(bs->sc, agno, &bs->sc->sa);
396 		if (!xchk_btree_xref_process_error(bs->sc, bs->cur,
397 				level, &error))
398 			goto out_free;
399 	}
400 
401 	xchk_xref_is_used_space(bs->sc, agbno, 1);
402 	/*
403 	 * The bnobt scrubber aliases bs->cur to bs->sc->sa.bno_cur, so we
404 	 * have to nullify it (to shut down further block owner checks) if
405 	 * self-xref encounters problems.
406 	 */
407 	if (!bs->sc->sa.bno_cur && btnum == XFS_BTNUM_BNO)
408 		bs->cur = NULL;
409 
410 	xchk_xref_is_only_owned_by(bs->sc, agbno, 1, bs->oinfo);
411 	if (!bs->sc->sa.rmap_cur && btnum == XFS_BTNUM_RMAP)
412 		bs->cur = NULL;
413 
414 out_free:
415 	if (init_sa)
416 		xchk_ag_free(bs->sc, &bs->sc->sa);
417 
418 	return error;
419 }
420 
421 /* Check the owner of a btree block. */
422 STATIC int
423 xchk_btree_check_owner(
424 	struct xchk_btree	*bs,
425 	int			level,
426 	struct xfs_buf		*bp)
427 {
428 	struct xfs_btree_cur	*cur = bs->cur;
429 
430 	/*
431 	 * In theory, xfs_btree_get_block should only give us a null buffer
432 	 * pointer for the root of a root-in-inode btree type, but we need
433 	 * to check defensively here in case the cursor state is also screwed
434 	 * up.
435 	 */
436 	if (bp == NULL) {
437 		if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE))
438 			xchk_btree_set_corrupt(bs->sc, bs->cur, level);
439 		return 0;
440 	}
441 
442 	/*
443 	 * We want to cross-reference each btree block with the bnobt
444 	 * and the rmapbt.  We cannot cross-reference the bnobt or
445 	 * rmapbt while scanning the bnobt or rmapbt, respectively,
446 	 * because we cannot alter the cursor and we'd prefer not to
447 	 * duplicate cursors.  Therefore, save the buffer daddr for
448 	 * later scanning.
449 	 */
450 	if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) {
451 		struct check_owner	*co;
452 
453 		co = kmalloc(sizeof(struct check_owner), XCHK_GFP_FLAGS);
454 		if (!co)
455 			return -ENOMEM;
456 
457 		INIT_LIST_HEAD(&co->list);
458 		co->level = level;
459 		co->daddr = xfs_buf_daddr(bp);
460 		list_add_tail(&co->list, &bs->to_check);
461 		return 0;
462 	}
463 
464 	return xchk_btree_check_block_owner(bs, level, xfs_buf_daddr(bp));
465 }
466 
467 /* Decide if we want to check minrecs of a btree block in the inode root. */
468 static inline bool
469 xchk_btree_check_iroot_minrecs(
470 	struct xchk_btree	*bs)
471 {
472 	/*
473 	 * xfs_bmap_add_attrfork_btree had an implementation bug wherein it
474 	 * would miscalculate the space required for the data fork bmbt root
475 	 * when adding an attr fork, and promote the iroot contents to an
476 	 * external block unnecessarily.  This went unnoticed for many years
477 	 * until scrub found filesystems in this state.  Inode rooted btrees are
478 	 * not supposed to have immediate child blocks that are small enough
479 	 * that the contents could fit in the inode root, but we can't fail
480 	 * existing filesystems, so instead we disable the check for data fork
481 	 * bmap btrees when there's an attr fork.
482 	 */
483 	if (bs->cur->bc_btnum == XFS_BTNUM_BMAP &&
484 	    bs->cur->bc_ino.whichfork == XFS_DATA_FORK &&
485 	    xfs_inode_has_attr_fork(bs->sc->ip))
486 		return false;
487 
488 	return true;
489 }
490 
491 /*
492  * Check that this btree block has at least minrecs records or is one of the
493  * special blocks that don't require that.
494  */
495 STATIC void
496 xchk_btree_check_minrecs(
497 	struct xchk_btree	*bs,
498 	int			level,
499 	struct xfs_btree_block	*block)
500 {
501 	struct xfs_btree_cur	*cur = bs->cur;
502 	unsigned int		root_level = cur->bc_nlevels - 1;
503 	unsigned int		numrecs = be16_to_cpu(block->bb_numrecs);
504 
505 	/* More records than minrecs means the block is ok. */
506 	if (numrecs >= cur->bc_ops->get_minrecs(cur, level))
507 		return;
508 
509 	/*
510 	 * For btrees rooted in the inode, it's possible that the root block
511 	 * contents spilled into a regular ondisk block because there wasn't
512 	 * enough space in the inode root.  The number of records in that
513 	 * child block might be less than the standard minrecs, but that's ok
514 	 * provided that there's only one direct child of the root.
515 	 */
516 	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
517 	    level == cur->bc_nlevels - 2) {
518 		struct xfs_btree_block	*root_block;
519 		struct xfs_buf		*root_bp;
520 		int			root_maxrecs;
521 
522 		root_block = xfs_btree_get_block(cur, root_level, &root_bp);
523 		root_maxrecs = cur->bc_ops->get_dmaxrecs(cur, root_level);
524 		if (xchk_btree_check_iroot_minrecs(bs) &&
525 		    (be16_to_cpu(root_block->bb_numrecs) != 1 ||
526 		     numrecs <= root_maxrecs))
527 			xchk_btree_set_corrupt(bs->sc, cur, level);
528 		return;
529 	}
530 
531 	/*
532 	 * Otherwise, only the root level is allowed to have fewer than minrecs
533 	 * records or keyptrs.
534 	 */
535 	if (level < root_level)
536 		xchk_btree_set_corrupt(bs->sc, cur, level);
537 }
538 
539 /*
540  * If this btree block has a parent, make sure that the parent's keys capture
541  * the keyspace contained in this block.
542  */
543 STATIC void
544 xchk_btree_block_check_keys(
545 	struct xchk_btree	*bs,
546 	int			level,
547 	struct xfs_btree_block	*block)
548 {
549 	union xfs_btree_key	block_key;
550 	union xfs_btree_key	*block_high_key;
551 	union xfs_btree_key	*parent_low_key, *parent_high_key;
552 	struct xfs_btree_cur	*cur = bs->cur;
553 	struct xfs_btree_block	*parent_block;
554 	struct xfs_buf		*bp;
555 
556 	if (level == cur->bc_nlevels - 1)
557 		return;
558 
559 	xfs_btree_get_keys(cur, block, &block_key);
560 
561 	/* Make sure the low key of this block matches the parent. */
562 	parent_block = xfs_btree_get_block(cur, level + 1, &bp);
563 	parent_low_key = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr,
564 			parent_block);
565 	if (xfs_btree_keycmp_ne(cur, &block_key, parent_low_key)) {
566 		xchk_btree_set_corrupt(bs->sc, bs->cur, level);
567 		return;
568 	}
569 
570 	if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
571 		return;
572 
573 	/* Make sure the high key of this block matches the parent. */
574 	parent_high_key = xfs_btree_high_key_addr(cur,
575 			cur->bc_levels[level + 1].ptr, parent_block);
576 	block_high_key = xfs_btree_high_key_from_key(cur, &block_key);
577 	if (xfs_btree_keycmp_ne(cur, block_high_key, parent_high_key))
578 		xchk_btree_set_corrupt(bs->sc, bs->cur, level);
579 }
580 
581 /*
582  * Grab and scrub a btree block given a btree pointer.  Returns block
583  * and buffer pointers (if applicable) if they're ok to use.
584  */
585 STATIC int
586 xchk_btree_get_block(
587 	struct xchk_btree	*bs,
588 	int			level,
589 	union xfs_btree_ptr	*pp,
590 	struct xfs_btree_block	**pblock,
591 	struct xfs_buf		**pbp)
592 {
593 	xfs_failaddr_t		failed_at;
594 	int			error;
595 
596 	*pblock = NULL;
597 	*pbp = NULL;
598 
599 	error = xfs_btree_lookup_get_block(bs->cur, level, pp, pblock);
600 	if (!xchk_btree_process_error(bs->sc, bs->cur, level, &error) ||
601 	    !*pblock)
602 		return error;
603 
604 	xfs_btree_get_block(bs->cur, level, pbp);
605 	if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS)
606 		failed_at = __xfs_btree_check_lblock(bs->cur, *pblock,
607 				level, *pbp);
608 	else
609 		failed_at = __xfs_btree_check_sblock(bs->cur, *pblock,
610 				 level, *pbp);
611 	if (failed_at) {
612 		xchk_btree_set_corrupt(bs->sc, bs->cur, level);
613 		return 0;
614 	}
615 	if (*pbp)
616 		xchk_buffer_recheck(bs->sc, *pbp);
617 
618 	xchk_btree_check_minrecs(bs, level, *pblock);
619 
620 	/*
621 	 * Check the block's owner; this function absorbs error codes
622 	 * for us.
623 	 */
624 	error = xchk_btree_check_owner(bs, level, *pbp);
625 	if (error)
626 		return error;
627 
628 	/*
629 	 * Check the block's siblings; this function absorbs error codes
630 	 * for us.
631 	 */
632 	error = xchk_btree_block_check_siblings(bs, *pblock);
633 	if (error)
634 		return error;
635 
636 	xchk_btree_block_check_keys(bs, level, *pblock);
637 	return 0;
638 }
639 
640 /*
641  * Check that the low and high keys of this block match the keys stored
642  * in the parent block.
643  */
644 STATIC void
645 xchk_btree_block_keys(
646 	struct xchk_btree	*bs,
647 	int			level,
648 	struct xfs_btree_block	*block)
649 {
650 	union xfs_btree_key	block_keys;
651 	struct xfs_btree_cur	*cur = bs->cur;
652 	union xfs_btree_key	*high_bk;
653 	union xfs_btree_key	*parent_keys;
654 	union xfs_btree_key	*high_pk;
655 	struct xfs_btree_block	*parent_block;
656 	struct xfs_buf		*bp;
657 
658 	if (level >= cur->bc_nlevels - 1)
659 		return;
660 
661 	/* Calculate the keys for this block. */
662 	xfs_btree_get_keys(cur, block, &block_keys);
663 
664 	/* Obtain the parent's copy of the keys for this block. */
665 	parent_block = xfs_btree_get_block(cur, level + 1, &bp);
666 	parent_keys = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr,
667 			parent_block);
668 
669 	if (xfs_btree_keycmp_ne(cur, &block_keys, parent_keys))
670 		xchk_btree_set_corrupt(bs->sc, cur, 1);
671 
672 	if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
673 		return;
674 
675 	/* Get high keys */
676 	high_bk = xfs_btree_high_key_from_key(cur, &block_keys);
677 	high_pk = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr,
678 			parent_block);
679 
680 	if (xfs_btree_keycmp_ne(cur, high_bk, high_pk))
681 		xchk_btree_set_corrupt(bs->sc, cur, 1);
682 }
683 
684 /*
685  * Visit all nodes and leaves of a btree.  Check that all pointers and
686  * records are in order, that the keys reflect the records, and use a callback
687  * so that the caller can verify individual records.
688  */
689 int
690 xchk_btree(
691 	struct xfs_scrub		*sc,
692 	struct xfs_btree_cur		*cur,
693 	xchk_btree_rec_fn		scrub_fn,
694 	const struct xfs_owner_info	*oinfo,
695 	void				*private)
696 {
697 	union xfs_btree_ptr		ptr;
698 	struct xchk_btree		*bs;
699 	union xfs_btree_ptr		*pp;
700 	union xfs_btree_rec		*recp;
701 	struct xfs_btree_block		*block;
702 	struct xfs_buf			*bp;
703 	struct check_owner		*co;
704 	struct check_owner		*n;
705 	size_t				cur_sz;
706 	int				level;
707 	int				error = 0;
708 
709 	/*
710 	 * Allocate the btree scrub context from the heap, because this
711 	 * structure can get rather large.  Don't let a caller feed us a
712 	 * totally absurd size.
713 	 */
714 	cur_sz = xchk_btree_sizeof(cur->bc_nlevels);
715 	if (cur_sz > PAGE_SIZE) {
716 		xchk_btree_set_corrupt(sc, cur, 0);
717 		return 0;
718 	}
719 	bs = kzalloc(cur_sz, XCHK_GFP_FLAGS);
720 	if (!bs)
721 		return -ENOMEM;
722 	bs->cur = cur;
723 	bs->scrub_rec = scrub_fn;
724 	bs->oinfo = oinfo;
725 	bs->private = private;
726 	bs->sc = sc;
727 
728 	/* Initialize scrub state */
729 	INIT_LIST_HEAD(&bs->to_check);
730 
731 	/*
732 	 * Load the root of the btree.  The helper function absorbs
733 	 * error codes for us.
734 	 */
735 	level = cur->bc_nlevels - 1;
736 	cur->bc_ops->init_ptr_from_cur(cur, &ptr);
737 	if (!xchk_btree_ptr_ok(bs, cur->bc_nlevels, &ptr))
738 		goto out;
739 	error = xchk_btree_get_block(bs, level, &ptr, &block, &bp);
740 	if (error || !block)
741 		goto out;
742 
743 	cur->bc_levels[level].ptr = 1;
744 
745 	while (level < cur->bc_nlevels) {
746 		block = xfs_btree_get_block(cur, level, &bp);
747 
748 		if (level == 0) {
749 			/* End of leaf, pop back towards the root. */
750 			if (cur->bc_levels[level].ptr >
751 			    be16_to_cpu(block->bb_numrecs)) {
752 				xchk_btree_block_keys(bs, level, block);
753 				if (level < cur->bc_nlevels - 1)
754 					cur->bc_levels[level + 1].ptr++;
755 				level++;
756 				continue;
757 			}
758 
759 			/* Records in order for scrub? */
760 			xchk_btree_rec(bs);
761 
762 			/* Call out to the record checker. */
763 			recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr,
764 					block);
765 			error = bs->scrub_rec(bs, recp);
766 			if (error)
767 				break;
768 			if (xchk_should_terminate(sc, &error) ||
769 			    (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
770 				break;
771 
772 			cur->bc_levels[level].ptr++;
773 			continue;
774 		}
775 
776 		/* End of node, pop back towards the root. */
777 		if (cur->bc_levels[level].ptr >
778 					be16_to_cpu(block->bb_numrecs)) {
779 			xchk_btree_block_keys(bs, level, block);
780 			if (level < cur->bc_nlevels - 1)
781 				cur->bc_levels[level + 1].ptr++;
782 			level++;
783 			continue;
784 		}
785 
786 		/* Keys in order for scrub? */
787 		xchk_btree_key(bs, level);
788 
789 		/* Drill another level deeper. */
790 		pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block);
791 		if (!xchk_btree_ptr_ok(bs, level, pp)) {
792 			cur->bc_levels[level].ptr++;
793 			continue;
794 		}
795 		level--;
796 		error = xchk_btree_get_block(bs, level, pp, &block, &bp);
797 		if (error || !block)
798 			goto out;
799 
800 		cur->bc_levels[level].ptr = 1;
801 	}
802 
803 out:
804 	/* Process deferred owner checks on btree blocks. */
805 	list_for_each_entry_safe(co, n, &bs->to_check, list) {
806 		if (!error && bs->cur)
807 			error = xchk_btree_check_block_owner(bs, co->level,
808 					co->daddr);
809 		list_del(&co->list);
810 		kfree(co);
811 	}
812 	kfree(bs);
813 
814 	return error;
815 }
816