xref: /openbmc/linux/fs/xfs/scrub/btree.c (revision c99f99fa)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_inode.h"
13 #include "xfs_btree.h"
14 #include "scrub/scrub.h"
15 #include "scrub/common.h"
16 #include "scrub/btree.h"
17 #include "scrub/trace.h"
18 
19 /* btree scrubbing */
20 
21 /*
22  * Check for btree operation errors.  See the section about handling
23  * operational errors in common.c.
24  */
static bool
__xchk_btree_process_error(
	struct xfs_scrub	*sc,
	struct xfs_btree_cur	*cur,
	int			level,
	int			*error,
	__u32			errflag,
	void			*ret_ip)
{
	/* No error means the operation is fine to continue. */
	if (*error == 0)
		return true;

	switch (*error) {
	case -EDEADLOCK:
	case -ECHRNG:
		/* Used to restart an op with deadlock avoidance. */
		trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
		break;
	case -EFSBADCRC:
	case -EFSCORRUPTED:
		/*
		 * Note the badness but don't abort.  *error is cleared so
		 * the caller keeps going; the fallthrough into the default
		 * case emits the op-error tracepoint (with *error == 0).
		 */
		sc->sm->sm_flags |= errflag;
		*error = 0;
		fallthrough;
	default:
		/* Pick the tracepoint matching how the btree is rooted. */
		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
			trace_xchk_ifork_btree_op_error(sc, cur, level,
					*error, ret_ip);
		else
			trace_xchk_btree_op_error(sc, cur, level,
					*error, ret_ip);
		break;
	}
	/* Tell the caller that an error occurred. */
	return false;
}
60 
61 bool
62 xchk_btree_process_error(
63 	struct xfs_scrub	*sc,
64 	struct xfs_btree_cur	*cur,
65 	int			level,
66 	int			*error)
67 {
68 	return __xchk_btree_process_error(sc, cur, level, error,
69 			XFS_SCRUB_OFLAG_CORRUPT, __return_address);
70 }
71 
72 bool
73 xchk_btree_xref_process_error(
74 	struct xfs_scrub	*sc,
75 	struct xfs_btree_cur	*cur,
76 	int			level,
77 	int			*error)
78 {
79 	return __xchk_btree_process_error(sc, cur, level, error,
80 			XFS_SCRUB_OFLAG_XFAIL, __return_address);
81 }
82 
83 /* Record btree block corruption. */
84 static void
85 __xchk_btree_set_corrupt(
86 	struct xfs_scrub	*sc,
87 	struct xfs_btree_cur	*cur,
88 	int			level,
89 	__u32			errflag,
90 	void			*ret_ip)
91 {
92 	sc->sm->sm_flags |= errflag;
93 
94 	if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
95 		trace_xchk_ifork_btree_error(sc, cur, level,
96 				ret_ip);
97 	else
98 		trace_xchk_btree_error(sc, cur, level,
99 				ret_ip);
100 }
101 
102 void
103 xchk_btree_set_corrupt(
104 	struct xfs_scrub	*sc,
105 	struct xfs_btree_cur	*cur,
106 	int			level)
107 {
108 	__xchk_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_CORRUPT,
109 			__return_address);
110 }
111 
112 void
113 xchk_btree_xref_set_corrupt(
114 	struct xfs_scrub	*sc,
115 	struct xfs_btree_cur	*cur,
116 	int			level)
117 {
118 	__xchk_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_XCORRUPT,
119 			__return_address);
120 }
121 
122 void
123 xchk_btree_set_preen(
124 	struct xfs_scrub	*sc,
125 	struct xfs_btree_cur	*cur,
126 	int			level)
127 {
128 	__xchk_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_PREEN,
129 			__return_address);
130 }
131 
132 /*
133  * Make sure this record is in order and doesn't stray outside of the parent
134  * keys.
135  */
STATIC void
xchk_btree_rec(
	struct xchk_btree	*bs)
{
	struct xfs_btree_cur	*cur = bs->cur;
	union xfs_btree_rec	*rec;
	union xfs_btree_key	key;
	union xfs_btree_key	hkey;
	union xfs_btree_key	*keyp;
	struct xfs_btree_block	*block;
	struct xfs_btree_block	*keyblock;
	struct xfs_buf		*bp;

	/* Grab the record under the level-0 (leaf) cursor position. */
	block = xfs_btree_get_block(cur, 0, &bp);
	rec = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr, block);

	trace_xchk_btree_rec(bs->sc, cur, 0);

	/* If this isn't the first record, are they in order? */
	if (cur->bc_levels[0].ptr > 1 &&
	    !cur->bc_ops->recs_inorder(cur, &bs->lastrec, rec))
		xchk_btree_set_corrupt(bs->sc, cur, 0);
	/* Remember this record for the next ordering comparison. */
	memcpy(&bs->lastrec, rec, cur->bc_ops->rec_len);

	/* A single-level btree has no parent keys to validate against. */
	if (cur->bc_nlevels == 1)
		return;

	/* Is this at least as large as the parent low key? */
	cur->bc_ops->init_key_from_rec(&key, rec);
	keyblock = xfs_btree_get_block(cur, 1, &bp);
	keyp = xfs_btree_key_addr(cur, cur->bc_levels[1].ptr, keyblock);
	if (cur->bc_ops->diff_two_keys(cur, &key, keyp) < 0)
		xchk_btree_set_corrupt(bs->sc, cur, 1);

	/* Only overlapping btrees store high keys in parent blocks. */
	if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
		return;

	/* Is this no larger than the parent high key? */
	cur->bc_ops->init_high_key_from_rec(&hkey, rec);
	keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[1].ptr, keyblock);
	if (cur->bc_ops->diff_two_keys(cur, keyp, &hkey) < 0)
		xchk_btree_set_corrupt(bs->sc, cur, 1);
}
179 
180 /*
181  * Make sure this key is in order and doesn't stray outside of the parent
182  * keys.
183  */
STATIC void
xchk_btree_key(
	struct xchk_btree	*bs,
	int			level)
{
	struct xfs_btree_cur	*cur = bs->cur;
	union xfs_btree_key	*key;
	union xfs_btree_key	*keyp;
	struct xfs_btree_block	*block;
	struct xfs_btree_block	*keyblock;
	struct xfs_buf		*bp;

	/* Grab the key under this level's cursor position. */
	block = xfs_btree_get_block(cur, level, &bp);
	key = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block);

	trace_xchk_btree_key(bs->sc, cur, level);

	/*
	 * If this isn't the first key, are they in order?
	 * lastkey[] is indexed by level - 1 because level 0 holds records,
	 * not keys, so only levels >= 1 reach this function.
	 */
	if (cur->bc_levels[level].ptr > 1 &&
	    !cur->bc_ops->keys_inorder(cur, &bs->lastkey[level - 1], key))
		xchk_btree_set_corrupt(bs->sc, cur, level);
	memcpy(&bs->lastkey[level - 1], key, cur->bc_ops->key_len);

	/* The root level has no parent block to compare against. */
	if (level + 1 >= cur->bc_nlevels)
		return;

	/* Is this at least as large as the parent low key? */
	keyblock = xfs_btree_get_block(cur, level + 1, &bp);
	keyp = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr, keyblock);
	if (cur->bc_ops->diff_two_keys(cur, key, keyp) < 0)
		xchk_btree_set_corrupt(bs->sc, cur, level);

	/* Only overlapping btrees store high keys in parent blocks. */
	if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
		return;

	/* Is this no larger than the parent high key? */
	key = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr, block);
	keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr,
			keyblock);
	if (cur->bc_ops->diff_two_keys(cur, keyp, key) < 0)
		xchk_btree_set_corrupt(bs->sc, cur, level);
}
226 
227 /*
228  * Check a btree pointer.  Returns true if it's ok to use this pointer.
229  * Callers do not need to set the corrupt flag.
230  */
231 static bool
232 xchk_btree_ptr_ok(
233 	struct xchk_btree	*bs,
234 	int			level,
235 	union xfs_btree_ptr	*ptr)
236 {
237 	bool			res;
238 
239 	/* A btree rooted in an inode has no block pointer to the root. */
240 	if ((bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
241 	    level == bs->cur->bc_nlevels)
242 		return true;
243 
244 	/* Otherwise, check the pointers. */
245 	if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS)
246 		res = xfs_btree_check_lptr(bs->cur, be64_to_cpu(ptr->l), level);
247 	else
248 		res = xfs_btree_check_sptr(bs->cur, be32_to_cpu(ptr->s), level);
249 	if (!res)
250 		xchk_btree_set_corrupt(bs->sc, bs->cur, level);
251 
252 	return res;
253 }
254 
/* Check that a btree block's sibling matches what we expect. */
STATIC int
xchk_btree_block_check_sibling(
	struct xchk_btree	*bs,
	int			level,
	int			direction,
	union xfs_btree_ptr	*sibling)
{
	struct xfs_btree_cur	*cur = bs->cur;
	struct xfs_btree_block	*pblock;
	struct xfs_buf		*pbp;
	struct xfs_btree_cur	*ncur = NULL;
	union xfs_btree_ptr	*pp;
	int			success;
	int			error;

	/* Duplicate the cursor so we can move it without touching bs->cur. */
	error = xfs_btree_dup_cursor(cur, &ncur);
	if (!xchk_btree_process_error(bs->sc, cur, level + 1, &error) ||
	    !ncur)
		return error;

	/*
	 * If the pointer is null, we shouldn't be able to move the upper
	 * level pointer anywhere.
	 */
	if (xfs_btree_ptr_is_null(cur, sibling)) {
		if (direction > 0)
			error = xfs_btree_increment(ncur, level + 1, &success);
		else
			error = xfs_btree_decrement(ncur, level + 1, &success);
		/*
		 * Successfully moving the parent cursor means a sibling
		 * exists even though this block claims it has none.
		 */
		if (error == 0 && success)
			xchk_btree_set_corrupt(bs->sc, cur, level);
		error = 0;
		goto out;
	}

	/* Increment upper level pointer. */
	if (direction > 0)
		error = xfs_btree_increment(ncur, level + 1, &success);
	else
		error = xfs_btree_decrement(ncur, level + 1, &success);
	if (!xchk_btree_process_error(bs->sc, cur, level + 1, &error))
		goto out;
	if (!success) {
		/* Sibling pointer set, but the parent cursor can't move. */
		xchk_btree_set_corrupt(bs->sc, cur, level + 1);
		goto out;
	}

	/* Compare upper level pointer to sibling pointer. */
	pblock = xfs_btree_get_block(ncur, level + 1, &pbp);
	pp = xfs_btree_ptr_addr(ncur, ncur->bc_levels[level + 1].ptr, pblock);
	if (!xchk_btree_ptr_ok(bs, level + 1, pp))
		goto out;
	if (pbp)
		xchk_buffer_recheck(bs->sc, pbp);

	/* The parent's adjacent keyptr must equal this block's sibling. */
	if (xfs_btree_diff_two_ptrs(cur, pp, sibling))
		xchk_btree_set_corrupt(bs->sc, cur, level);
out:
	xfs_btree_del_cursor(ncur, XFS_BTREE_ERROR);
	return error;
}
317 
318 /* Check the siblings of a btree block. */
319 STATIC int
320 xchk_btree_block_check_siblings(
321 	struct xchk_btree	*bs,
322 	struct xfs_btree_block	*block)
323 {
324 	struct xfs_btree_cur	*cur = bs->cur;
325 	union xfs_btree_ptr	leftsib;
326 	union xfs_btree_ptr	rightsib;
327 	int			level;
328 	int			error = 0;
329 
330 	xfs_btree_get_sibling(cur, block, &leftsib, XFS_BB_LEFTSIB);
331 	xfs_btree_get_sibling(cur, block, &rightsib, XFS_BB_RIGHTSIB);
332 	level = xfs_btree_get_level(block);
333 
334 	/* Root block should never have siblings. */
335 	if (level == cur->bc_nlevels - 1) {
336 		if (!xfs_btree_ptr_is_null(cur, &leftsib) ||
337 		    !xfs_btree_ptr_is_null(cur, &rightsib))
338 			xchk_btree_set_corrupt(bs->sc, cur, level);
339 		goto out;
340 	}
341 
342 	/*
343 	 * Does the left & right sibling pointers match the adjacent
344 	 * parent level pointers?
345 	 * (These function absorbs error codes for us.)
346 	 */
347 	error = xchk_btree_block_check_sibling(bs, level, -1, &leftsib);
348 	if (error)
349 		return error;
350 	error = xchk_btree_block_check_sibling(bs, level, 1, &rightsib);
351 	if (error)
352 		return error;
353 out:
354 	return error;
355 }
356 
/*
 * Deferred block owner check: remembers the daddr and level of a btree
 * block so the ownership cross-reference can run after the main scan.
 */
struct check_owner {
	struct list_head	list;	/* link in xchk_btree to_check list */
	xfs_daddr_t		daddr;	/* disk address of the btree block */
	int			level;	/* btree level of the block */
};
362 
363 /*
364  * Make sure this btree block isn't in the free list and that there's
365  * an rmap record for it.
366  */
STATIC int
xchk_btree_check_block_owner(
	struct xchk_btree	*bs,
	int			level,
	xfs_daddr_t		daddr)
{
	xfs_agnumber_t		agno;
	xfs_agblock_t		agbno;
	xfs_btnum_t		btnum;
	bool			init_sa;
	int			error = 0;

	/* A previous xref problem may have nullified the cursor; stop. */
	if (!bs->cur)
		return 0;

	btnum = bs->cur->bc_btnum;
	agno = xfs_daddr_to_agno(bs->cur->bc_mp, daddr);
	agbno = xfs_daddr_to_agbno(bs->cur->bc_mp, daddr);

	/*
	 * Long-pointer btrees can reference blocks in any AG, so set up
	 * the per-AG scrub context for the AG owning this block here.
	 * Short-pointer btrees presumably have it set up already by the
	 * caller -- confirm against the scrub setup paths.
	 */
	init_sa = bs->cur->bc_flags & XFS_BTREE_LONG_PTRS;
	if (init_sa) {
		error = xchk_ag_init_existing(bs->sc, agno, &bs->sc->sa);
		if (!xchk_btree_xref_process_error(bs->sc, bs->cur,
				level, &error))
			goto out_free;
	}

	xchk_xref_is_used_space(bs->sc, agbno, 1);
	/*
	 * The bnobt scrubber aliases bs->cur to bs->sc->sa.bno_cur, so we
	 * have to nullify it (to shut down further block owner checks) if
	 * self-xref encounters problems.
	 */
	if (!bs->sc->sa.bno_cur && btnum == XFS_BTNUM_BNO)
		bs->cur = NULL;

	xchk_xref_is_owned_by(bs->sc, agbno, 1, bs->oinfo);
	/* Same aliasing hazard exists for the rmapbt scrubber. */
	if (!bs->sc->sa.rmap_cur && btnum == XFS_BTNUM_RMAP)
		bs->cur = NULL;

out_free:
	if (init_sa)
		xchk_ag_free(bs->sc, &bs->sc->sa);

	return error;
}
413 
414 /* Check the owner of a btree block. */
415 STATIC int
416 xchk_btree_check_owner(
417 	struct xchk_btree	*bs,
418 	int			level,
419 	struct xfs_buf		*bp)
420 {
421 	struct xfs_btree_cur	*cur = bs->cur;
422 
423 	/*
424 	 * In theory, xfs_btree_get_block should only give us a null buffer
425 	 * pointer for the root of a root-in-inode btree type, but we need
426 	 * to check defensively here in case the cursor state is also screwed
427 	 * up.
428 	 */
429 	if (bp == NULL) {
430 		if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE))
431 			xchk_btree_set_corrupt(bs->sc, bs->cur, level);
432 		return 0;
433 	}
434 
435 	/*
436 	 * We want to cross-reference each btree block with the bnobt
437 	 * and the rmapbt.  We cannot cross-reference the bnobt or
438 	 * rmapbt while scanning the bnobt or rmapbt, respectively,
439 	 * because we cannot alter the cursor and we'd prefer not to
440 	 * duplicate cursors.  Therefore, save the buffer daddr for
441 	 * later scanning.
442 	 */
443 	if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) {
444 		struct check_owner	*co;
445 
446 		co = kmalloc(sizeof(struct check_owner), XCHK_GFP_FLAGS);
447 		if (!co)
448 			return -ENOMEM;
449 
450 		INIT_LIST_HEAD(&co->list);
451 		co->level = level;
452 		co->daddr = xfs_buf_daddr(bp);
453 		list_add_tail(&co->list, &bs->to_check);
454 		return 0;
455 	}
456 
457 	return xchk_btree_check_block_owner(bs, level, xfs_buf_daddr(bp));
458 }
459 
460 /* Decide if we want to check minrecs of a btree block in the inode root. */
461 static inline bool
462 xchk_btree_check_iroot_minrecs(
463 	struct xchk_btree	*bs)
464 {
465 	/*
466 	 * xfs_bmap_add_attrfork_btree had an implementation bug wherein it
467 	 * would miscalculate the space required for the data fork bmbt root
468 	 * when adding an attr fork, and promote the iroot contents to an
469 	 * external block unnecessarily.  This went unnoticed for many years
470 	 * until scrub found filesystems in this state.  Inode rooted btrees are
471 	 * not supposed to have immediate child blocks that are small enough
472 	 * that the contents could fit in the inode root, but we can't fail
473 	 * existing filesystems, so instead we disable the check for data fork
474 	 * bmap btrees when there's an attr fork.
475 	 */
476 	if (bs->cur->bc_btnum == XFS_BTNUM_BMAP &&
477 	    bs->cur->bc_ino.whichfork == XFS_DATA_FORK &&
478 	    xfs_inode_has_attr_fork(bs->sc->ip))
479 		return false;
480 
481 	return true;
482 }
483 
484 /*
485  * Check that this btree block has at least minrecs records or is one of the
486  * special blocks that don't require that.
487  */
STATIC void
xchk_btree_check_minrecs(
	struct xchk_btree	*bs,
	int			level,
	struct xfs_btree_block	*block)
{
	struct xfs_btree_cur	*cur = bs->cur;
	unsigned int		root_level = cur->bc_nlevels - 1;
	unsigned int		numrecs = be16_to_cpu(block->bb_numrecs);

	/* More records than minrecs means the block is ok. */
	if (numrecs >= cur->bc_ops->get_minrecs(cur, level))
		return;

	/*
	 * For btrees rooted in the inode, it's possible that the root block
	 * contents spilled into a regular ondisk block because there wasn't
	 * enough space in the inode root.  The number of records in that
	 * child block might be less than the standard minrecs, but that's ok
	 * provided that there's only one direct child of the root.
	 */
	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
	    level == cur->bc_nlevels - 2) {
		struct xfs_btree_block	*root_block;
		struct xfs_buf		*root_bp;
		int			root_maxrecs;

		root_block = xfs_btree_get_block(cur, root_level, &root_bp);
		root_maxrecs = cur->bc_ops->get_dmaxrecs(cur, root_level);
		/*
		 * The undersized child is only acceptable if it is the
		 * root's sole child and its contents would not have fit
		 * back into the inode root (numrecs > root_maxrecs);
		 * otherwise this block is corrupt.  The iroot_minrecs
		 * helper disables this check for a known historical bug.
		 */
		if (xchk_btree_check_iroot_minrecs(bs) &&
		    (be16_to_cpu(root_block->bb_numrecs) != 1 ||
		     numrecs <= root_maxrecs))
			xchk_btree_set_corrupt(bs->sc, cur, level);
		return;
	}

	/*
	 * Otherwise, only the root level is allowed to have fewer than minrecs
	 * records or keyptrs.
	 */
	if (level < root_level)
		xchk_btree_set_corrupt(bs->sc, cur, level);
}
531 
532 /*
533  * If this btree block has a parent, make sure that the parent's keys capture
534  * the keyspace contained in this block.
535  */
STATIC void
xchk_btree_block_check_keys(
	struct xchk_btree	*bs,
	int			level,
	struct xfs_btree_block	*block)
{
	union xfs_btree_key	block_key;
	union xfs_btree_key	*block_high_key;
	union xfs_btree_key	*parent_low_key, *parent_high_key;
	struct xfs_btree_cur	*cur = bs->cur;
	struct xfs_btree_block	*parent_block;
	struct xfs_buf		*bp;

	/* The root block has no parent to compare against. */
	if (level == cur->bc_nlevels - 1)
		return;

	/* Compute this block's keys from its own contents. */
	xfs_btree_get_keys(cur, block, &block_key);

	/* Make sure the low key of this block matches the parent. */
	parent_block = xfs_btree_get_block(cur, level + 1, &bp);
	parent_low_key = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr,
			parent_block);
	if (cur->bc_ops->diff_two_keys(cur, &block_key, parent_low_key)) {
		xchk_btree_set_corrupt(bs->sc, bs->cur, level);
		return;
	}

	/* Only overlapping btrees store high keys in parent blocks. */
	if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
		return;

	/* Make sure the high key of this block matches the parent. */
	parent_high_key = xfs_btree_high_key_addr(cur,
			cur->bc_levels[level + 1].ptr, parent_block);
	block_high_key = xfs_btree_high_key_from_key(cur, &block_key);
	if (cur->bc_ops->diff_two_keys(cur, block_high_key, parent_high_key))
		xchk_btree_set_corrupt(bs->sc, bs->cur, level);
}
573 
574 /*
575  * Grab and scrub a btree block given a btree pointer.  Returns block
576  * and buffer pointers (if applicable) if they're ok to use.
577  */
STATIC int
xchk_btree_get_block(
	struct xchk_btree	*bs,
	int			level,
	union xfs_btree_ptr	*pp,
	struct xfs_btree_block	**pblock,
	struct xfs_buf		**pbp)
{
	xfs_failaddr_t		failed_at;
	int			error;

	*pblock = NULL;
	*pbp = NULL;

	/* Read the block; xchk_btree_process_error absorbs error codes. */
	error = xfs_btree_lookup_get_block(bs->cur, level, pp, pblock);
	if (!xchk_btree_process_error(bs->sc, bs->cur, level, &error) ||
	    !*pblock)
		return error;

	/* Run the structural block verifier matching the pointer width. */
	xfs_btree_get_block(bs->cur, level, pbp);
	if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS)
		failed_at = __xfs_btree_check_lblock(bs->cur, *pblock,
				level, *pbp);
	else
		failed_at = __xfs_btree_check_sblock(bs->cur, *pblock,
				 level, *pbp);
	if (failed_at) {
		/* Structural failure: note corruption, but return success. */
		xchk_btree_set_corrupt(bs->sc, bs->cur, level);
		return 0;
	}
	if (*pbp)
		xchk_buffer_recheck(bs->sc, *pbp);

	xchk_btree_check_minrecs(bs, level, *pblock);

	/*
	 * Check the block's owner; this function absorbs error codes
	 * for us.
	 */
	error = xchk_btree_check_owner(bs, level, *pbp);
	if (error)
		return error;

	/*
	 * Check the block's siblings; this function absorbs error codes
	 * for us.
	 */
	error = xchk_btree_block_check_siblings(bs, *pblock);
	if (error)
		return error;

	xchk_btree_block_check_keys(bs, level, *pblock);
	return 0;
}
632 
633 /*
634  * Check that the low and high keys of this block match the keys stored
635  * in the parent block.
636  */
STATIC void
xchk_btree_block_keys(
	struct xchk_btree	*bs,
	int			level,
	struct xfs_btree_block	*block)
{
	union xfs_btree_key	block_keys;
	struct xfs_btree_cur	*cur = bs->cur;
	union xfs_btree_key	*high_bk;
	union xfs_btree_key	*parent_keys;
	union xfs_btree_key	*high_pk;
	struct xfs_btree_block	*parent_block;
	struct xfs_buf		*bp;

	/* The root block has no parent to compare against. */
	if (level >= cur->bc_nlevels - 1)
		return;

	/* Calculate the keys for this block. */
	xfs_btree_get_keys(cur, block, &block_keys);

	/* Obtain the parent's copy of the keys for this block. */
	parent_block = xfs_btree_get_block(cur, level + 1, &bp);
	parent_keys = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr,
			parent_block);

	/*
	 * NOTE(review): corruption here is reported at level 1 regardless
	 * of 'level', whereas xchk_btree_block_check_keys reports at
	 * 'level' for the analogous comparison -- confirm which level is
	 * intended before changing.
	 */
	if (cur->bc_ops->diff_two_keys(cur, &block_keys, parent_keys) != 0)
		xchk_btree_set_corrupt(bs->sc, cur, 1);

	/* Only overlapping btrees store high keys in parent blocks. */
	if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
		return;

	/* Get high keys */
	high_bk = xfs_btree_high_key_from_key(cur, &block_keys);
	high_pk = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr,
			parent_block);

	if (cur->bc_ops->diff_two_keys(cur, high_bk, high_pk) != 0)
		xchk_btree_set_corrupt(bs->sc, cur, 1);
}
676 
677 /*
678  * Visit all nodes and leaves of a btree.  Check that all pointers and
679  * records are in order, that the keys reflect the records, and use a callback
680  * so that the caller can verify individual records.
681  */
int
xchk_btree(
	struct xfs_scrub		*sc,
	struct xfs_btree_cur		*cur,
	xchk_btree_rec_fn		scrub_fn,
	const struct xfs_owner_info	*oinfo,
	void				*private)
{
	union xfs_btree_ptr		ptr;
	struct xchk_btree		*bs;
	union xfs_btree_ptr		*pp;
	union xfs_btree_rec		*recp;
	struct xfs_btree_block		*block;
	struct xfs_buf			*bp;
	struct check_owner		*co;
	struct check_owner		*n;
	size_t				cur_sz;
	int				level;
	int				error = 0;

	/*
	 * Allocate the btree scrub context from the heap, because this
	 * structure can get rather large.  Don't let a caller feed us a
	 * totally absurd size.
	 */
	cur_sz = xchk_btree_sizeof(cur->bc_nlevels);
	if (cur_sz > PAGE_SIZE) {
		xchk_btree_set_corrupt(sc, cur, 0);
		return 0;
	}
	bs = kzalloc(cur_sz, XCHK_GFP_FLAGS);
	if (!bs)
		return -ENOMEM;
	bs->cur = cur;
	bs->scrub_rec = scrub_fn;
	bs->oinfo = oinfo;
	bs->private = private;
	bs->sc = sc;

	/* Initialize scrub state */
	INIT_LIST_HEAD(&bs->to_check);

	/*
	 * Load the root of the btree.  The helper function absorbs
	 * error codes for us.
	 */
	level = cur->bc_nlevels - 1;
	cur->bc_ops->init_ptr_from_cur(cur, &ptr);
	if (!xchk_btree_ptr_ok(bs, cur->bc_nlevels, &ptr))
		goto out;
	error = xchk_btree_get_block(bs, level, &ptr, &block, &bp);
	if (error || !block)
		goto out;

	cur->bc_levels[level].ptr = 1;

	/*
	 * Depth-first traversal: the per-level ptr fields in the cursor
	 * track our position at each level; exhausting a block pops us
	 * back up a level, and a valid keyptr drills down a level.
	 */
	while (level < cur->bc_nlevels) {
		block = xfs_btree_get_block(cur, level, &bp);

		if (level == 0) {
			/* End of leaf, pop back towards the root. */
			if (cur->bc_levels[level].ptr >
			    be16_to_cpu(block->bb_numrecs)) {
				xchk_btree_block_keys(bs, level, block);
				if (level < cur->bc_nlevels - 1)
					cur->bc_levels[level + 1].ptr++;
				level++;
				continue;
			}

			/* Records in order for scrub? */
			xchk_btree_rec(bs);

			/* Call out to the record checker. */
			recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr,
					block);
			error = bs->scrub_rec(bs, recp);
			if (error)
				break;
			/* Stop early on fatal signal or found corruption. */
			if (xchk_should_terminate(sc, &error) ||
			    (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
				break;

			cur->bc_levels[level].ptr++;
			continue;
		}

		/* End of node, pop back towards the root. */
		if (cur->bc_levels[level].ptr >
					be16_to_cpu(block->bb_numrecs)) {
			xchk_btree_block_keys(bs, level, block);
			if (level < cur->bc_nlevels - 1)
				cur->bc_levels[level + 1].ptr++;
			level++;
			continue;
		}

		/* Keys in order for scrub? */
		xchk_btree_key(bs, level);

		/* Drill another level deeper. */
		pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block);
		if (!xchk_btree_ptr_ok(bs, level, pp)) {
			/* Bad keyptr (already flagged); skip this child. */
			cur->bc_levels[level].ptr++;
			continue;
		}
		level--;
		error = xchk_btree_get_block(bs, level, pp, &block, &bp);
		if (error || !block)
			goto out;

		cur->bc_levels[level].ptr = 1;
	}

out:
	/* Process deferred owner checks on btree blocks. */
	list_for_each_entry_safe(co, n, &bs->to_check, list) {
		if (!error && bs->cur)
			error = xchk_btree_check_block_owner(bs, co->level,
					co->daddr);
		list_del(&co->list);
		kfree(co);
	}
	kfree(bs);

	return error;
}
809