// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_ag.h"
#include "xfs_btree.h"
#include "xfs_rmap.h"
#include "xfs_refcount.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
#include "scrub/trace.h"

/*
 * Set us up to scrub reference count btrees.
 */
int
xchk_setup_ag_refcountbt(
	struct xfs_scrub	*sc)
{
	if (xchk_need_intent_drain(sc))
		xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
	return xchk_setup_ag_btree(sc, false);
}

/* Reference count btree scrubber. */

/*
 * Confirming Reference Counts via Reverse Mappings
 *
 * We want to count the reverse mappings overlapping a refcount record
 * (bno, len, refcount), allowing for the possibility that some of the
 * overlap may come from smaller adjoining reverse mappings, while some
 * comes from single extents which overlap the range entirely.  The
 * outer loop is as follows:
 *
 * 1. For all reverse mappings overlapping the refcount extent,
 *    a. If a given rmap completely overlaps, mark it as seen.
 *    b. Otherwise, record the fragment (in agbno order) for later
 *       processing.
 *
 * Once we've seen all the rmaps, we know that for all blocks in the
 * refcount record we want to find $refcount owners and we've already
 * visited $seen extents that overlap all the blocks.  Therefore, we
 * need to find ($refcount - $seen) owners for every block in the
 * extent; call that quantity $target_nr.  Proceed as follows:
 *
 * 2. Pull the first $target_nr fragments from the list; all of them
 *    should start at or before the start of the extent.
 *    Call this subset of fragments the working set.
 * 3. Until there are no more unprocessed fragments,
 *    a. Find the shortest fragments in the set and remove them.
 *    b. Note the block number of the end of these fragments.
 *    c. Pull the same number of fragments from the list.  All of these
 *       fragments should start at the block number recorded in the
 *       previous step.
 *    d. Put those fragments in the set.
 * 4. Check that there are $target_nr fragments remaining in the list,
 *    and that they all end at or beyond the end of the refcount extent.
 *
 * If the refcount is correct, all the check conditions in the algorithm
 * should always hold true.  If not, the refcount is incorrect.
 */
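
/*
 * Worked example, with hypothetical numbers: for a refcount record
 * (bno 100, len 8, refcount 3), an rmap (96, 16) covers the whole
 * record and counts as one owner seen, so $target_nr is 2.  Fragments
 * (98, 6), (100, 4), (104, 4) and (104, 6) justify the remaining two
 * owners: the first two start at or before block 100 and both stop
 * just before block 104, and exactly two more fragments pick up at
 * block 104 and cover the rest of the record, so every block in the
 * record has three owners.
 */

/* An rmap fragment that only partially covers a refcount record. */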
struct xchk_refcnt_frag {
	struct list_head	list;
	struct xfs_rmap_irec	rm;
};

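/*
 * In-memory state for cross-referencing a single refcount record
 * against the reverse mapping btree.
 */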
struct xchk_refcnt_check {
	struct xfs_scrub	*sc;
	struct list_head	fragments;

	/* refcount extent we're examining */
	xfs_agblock_t		bno;
	xfs_extlen_t		len;
	xfs_nlink_t		refcount;

	/* number of owners seen */
	xfs_nlink_t		seen;
};

/*
 * Decide if the given rmap is large enough that we can redeem it
 * towards refcount verification now, or if it's a fragment, in
 * which case we'll hang onto it in the hopes that we'll later
 * discover that we've collected exactly the correct number of
 * fragments as the refcountbt says we should have.
 */
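/*
 * In the worked example above, the rmap (96, 16) is redeemed
 * immediately because it spans the entire refcount record, while
 * (100, 4) only covers part of it and is saved as a fragment.
 */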
STATIC int
xchk_refcountbt_rmap_check(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*priv)
{
	struct xchk_refcnt_check	*refchk = priv;
	struct xchk_refcnt_frag		*frag;
	xfs_agblock_t			rm_last;
	xfs_agblock_t			rc_last;
	int				error = 0;

	if (xchk_should_terminate(refchk->sc, &error))
		return error;

	rm_last = rec->rm_startblock + rec->rm_blockcount - 1;
	rc_last = refchk->bno + refchk->len - 1;

	/* Confirm that a single-owner refc extent is a CoW stage. */
	if (refchk->refcount == 1 && rec->rm_owner != XFS_RMAP_OWN_COW) {
		xchk_btree_xref_set_corrupt(refchk->sc, cur, 0);
		return 0;
	}

	if (rec->rm_startblock <= refchk->bno && rm_last >= rc_last) {
		/*
		 * The rmap overlaps the refcount record, so we can confirm
		 * one refcount owner seen.
		 */
		refchk->seen++;
	} else {
		/*
		 * This rmap covers only part of the refcount record, so
		 * save the fragment for later processing.  If the rmapbt
		 * is healthy each rmap_irec we see will be in agbno order
		 * so we don't need insertion sort here.
		 */
		frag = kmalloc(sizeof(struct xchk_refcnt_frag),
				XCHK_GFP_FLAGS);
		if (!frag)
			return -ENOMEM;
		memcpy(&frag->rm, rec, sizeof(frag->rm));
		list_add_tail(&frag->list, &refchk->fragments);
	}

	return 0;
}

/*
 * Given a bunch of rmap fragments, iterate through them, keeping
 * a running tally of the refcount.  If this ever deviates from
 * what we expect (which is the refcountbt's refcount minus the
 * number of extents that totally covered the refcountbt extent),
 * we have a refcountbt error.
 */
STATIC void
xchk_refcountbt_process_rmap_fragments(
	struct xchk_refcnt_check	*refchk)
{
	struct list_head		worklist;
	struct xchk_refcnt_frag		*frag;
	struct xchk_refcnt_frag		*n;
	xfs_agblock_t			bno;
	xfs_agblock_t			rbno;
	xfs_agblock_t			next_rbno;
	xfs_nlink_t			nr;
	xfs_nlink_t			target_nr;

	target_nr = refchk->refcount - refchk->seen;
	if (target_nr == 0)
		return;

	/*
	 * There are (refchk->refcount - refchk->seen) references we haven't
	 * found yet.  Pull that many off the fragment list and figure out
	 * where the smallest rmap ends (and therefore the next rmap should
	 * start).  All the rmaps we pull off should start at or before the
	 * beginning of the refcount record's range.
	 */
	INIT_LIST_HEAD(&worklist);
	rbno = NULLAGBLOCK;

	/* Make sure the fragments actually /are/ in agbno order. */
	bno = 0;
	list_for_each_entry(frag, &refchk->fragments, list) {
		if (frag->rm.rm_startblock < bno)
			goto done;
		bno = frag->rm.rm_startblock;
	}

	/*
	 * Find all the rmaps that start at or before the refc extent,
	 * and put them on the worklist.
	 */
	nr = 0;
	list_for_each_entry_safe(frag, n, &refchk->fragments, list) {
		if (frag->rm.rm_startblock > refchk->bno || nr > target_nr)
			break;
		bno = frag->rm.rm_startblock + frag->rm.rm_blockcount;
		if (bno < rbno)
			rbno = bno;
		list_move_tail(&frag->list, &worklist);
		nr++;
	}

	/*
	 * We should have found exactly $target_nr rmap fragments starting
	 * at or before the refcount extent.
	 */
	if (nr != target_nr)
		goto done;

	while (!list_empty(&refchk->fragments)) {
		/* Discard any fragments ending at rbno from the worklist. */
		nr = 0;
		next_rbno = NULLAGBLOCK;
		list_for_each_entry_safe(frag, n, &worklist, list) {
			bno = frag->rm.rm_startblock + frag->rm.rm_blockcount;
			if (bno != rbno) {
				if (bno < next_rbno)
					next_rbno = bno;
				continue;
			}
			list_del(&frag->list);
			kfree(frag);
			nr++;
		}

		/* Try to add nr rmaps starting at rbno to the worklist. */
		list_for_each_entry_safe(frag, n, &refchk->fragments, list) {
			bno = frag->rm.rm_startblock + frag->rm.rm_blockcount;
			if (frag->rm.rm_startblock != rbno)
				goto done;
			list_move_tail(&frag->list, &worklist);
			if (next_rbno > bno)
				next_rbno = bno;
			nr--;
			if (nr == 0)
				break;
		}

		/*
		 * If we get here and nr > 0, this means that we added fewer
		 * items to the worklist than we discarded because the fragment
		 * list ran out of items.  Therefore, we cannot maintain the
		 * required refcount.  Something is wrong, so we're done.
		 */
		if (nr)
			goto done;

		rbno = next_rbno;
	}

	/*
	 * Make sure the last extent we processed ends at or beyond
	 * the end of the refcount extent.
	 */
	if (rbno < refchk->bno + refchk->len)
		goto done;

	/* Actually record us having seen the remaining refcount. */
	refchk->seen = refchk->refcount;
done:
	/* Delete fragments and work list. */
	list_for_each_entry_safe(frag, n, &worklist, list) {
		list_del(&frag->list);
		kfree(frag);
	}
	list_for_each_entry_safe(frag, n, &refchk->fragments, list) {
		list_del(&frag->list);
		kfree(frag);
	}
}

/* Use the rmap entries covering this extent to verify the refcount. */
STATIC void
xchk_refcountbt_xref_rmap(
	struct xfs_scrub		*sc,
	const struct xfs_refcount_irec	*irec)
{
	struct xchk_refcnt_check	refchk = {
		.sc			= sc,
		.bno			= irec->rc_startblock,
		.len			= irec->rc_blockcount,
		.refcount		= irec->rc_refcount,
		.seen			= 0,
	};
	struct xfs_rmap_irec		low;
	struct xfs_rmap_irec		high;
	struct xchk_refcnt_frag		*frag;
	struct xchk_refcnt_frag		*n;
	int				error;

	if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm))
		return;

	/* Cross-reference with the rmapbt to confirm the refcount. */
	memset(&low, 0, sizeof(low));
	low.rm_startblock = irec->rc_startblock;
	memset(&high, 0xFF, sizeof(high));
	high.rm_startblock = irec->rc_startblock + irec->rc_blockcount - 1;

	INIT_LIST_HEAD(&refchk.fragments);
	error = xfs_rmap_query_range(sc->sa.rmap_cur, &low, &high,
			&xchk_refcountbt_rmap_check, &refchk);
	if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
		goto out_free;

	xchk_refcountbt_process_rmap_fragments(&refchk);
	if (irec->rc_refcount != refchk.seen) {
		trace_xchk_refcount_incorrect(sc->sa.pag, irec, refchk.seen);
		xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
	}

out_free:
	list_for_each_entry_safe(frag, n, &refchk.fragments, list) {
		list_del(&frag->list);
		kfree(frag);
	}
}

/* Cross-reference with the other btrees. */
STATIC void
xchk_refcountbt_xref(
	struct xfs_scrub		*sc,
	const struct xfs_refcount_irec	*irec)
{
	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
		return;

	xchk_xref_is_used_space(sc, irec->rc_startblock, irec->rc_blockcount);
	xchk_xref_is_not_inode_chunk(sc, irec->rc_startblock,
			irec->rc_blockcount);
	xchk_refcountbt_xref_rmap(sc, irec);
}

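/* State tracked across all the records of one refcount btree scrub. */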
struct xchk_refcbt_records {
	/* Previous refcount record. */
	struct xfs_refcount_irec prev_rec;

	/* The next AG block where we aren't expecting shared extents. */
	xfs_agblock_t		next_unshared_agbno;

	/* Number of CoW blocks we expect. */
	xfs_agblock_t		cow_blocks;

	/* Was the last record a shared or CoW staging extent? */
	enum xfs_refc_domain	prev_domain;
};

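/*
 * rmap iteration callback for xchk_refcountbt_xref_gaps.  Returns
 * -ECANCELED if this rmap starts before the previous one ended, which
 * means the two rmaps overlap and the blocks they cover are shared.
 */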
STATIC int
xchk_refcountbt_rmap_check_gap(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*priv)
{
	xfs_agblock_t			*next_bno = priv;

	if (*next_bno != NULLAGBLOCK && rec->rm_startblock < *next_bno)
		return -ECANCELED;

	*next_bno = rec->rm_startblock + rec->rm_blockcount;
	return 0;
}

/*
 * Make sure that a gap in the reference count records does not correspond to
 * overlapping records (i.e. shared extents) in the reverse mappings.
 */
static inline void
xchk_refcountbt_xref_gaps(
	struct xfs_scrub	*sc,
	struct xchk_refcbt_records *rrc,
	xfs_agblock_t		bno)
{
	struct xfs_rmap_irec	low;
	struct xfs_rmap_irec	high;
	xfs_agblock_t		next_bno = NULLAGBLOCK;
	int			error;

	if (bno <= rrc->next_unshared_agbno || !sc->sa.rmap_cur ||
	    xchk_skip_xref(sc->sm))
		return;

	memset(&low, 0, sizeof(low));
	low.rm_startblock = rrc->next_unshared_agbno;
	memset(&high, 0xFF, sizeof(high));
	high.rm_startblock = bno - 1;

	error = xfs_rmap_query_range(sc->sa.rmap_cur, &low, &high,
			xchk_refcountbt_rmap_check_gap, &next_bno);
	if (error == -ECANCELED)
		xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
	else
		xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur);
}

static inline bool
xchk_refcount_mergeable(
	struct xchk_refcbt_records	*rrc,
	const struct xfs_refcount_irec	*r2)
{
	const struct xfs_refcount_irec	*r1 = &rrc->prev_rec;

	/* Ignore if prev_rec is not yet initialized. */
	if (r1->rc_blockcount == 0)
		return false;

	if (r1->rc_domain != r2->rc_domain)
		return false;
	if (r1->rc_startblock + r1->rc_blockcount != r2->rc_startblock)
		return false;
	if (r1->rc_refcount != r2->rc_refcount)
		return false;
	if ((unsigned long long)r1->rc_blockcount + r2->rc_blockcount >
			MAXREFCEXTLEN)
		return false;

	return true;
}
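
/*
 * For example (hypothetical values): adjacent records (startblock 10,
 * blockcount 5, refcount 2) and (startblock 15, blockcount 3,
 * refcount 2) in the same domain should have been stored as a single
 * record (10, 8, 2), so finding both in the btree is flagged as
 * corruption.
 */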

/* Flag failures for records that could be merged. */
STATIC void
xchk_refcountbt_check_mergeable(
	struct xchk_btree		*bs,
	struct xchk_refcbt_records	*rrc,
	const struct xfs_refcount_irec	*irec)
{
	if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
		return;

	if (xchk_refcount_mergeable(rrc, irec))
		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);

	memcpy(&rrc->prev_rec, irec, sizeof(struct xfs_refcount_irec));
}

/* Scrub a refcountbt record. */
STATIC int
xchk_refcountbt_rec(
	struct xchk_btree	*bs,
	const union xfs_btree_rec *rec)
{
	struct xfs_refcount_irec irec;
	struct xchk_refcbt_records *rrc = bs->private;

	xfs_refcount_btrec_to_irec(rec, &irec);
	if (xfs_refcount_check_irec(bs->cur, &irec) != NULL) {
		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
		return 0;
	}

	if (irec.rc_domain == XFS_REFC_DOMAIN_COW)
		rrc->cow_blocks += irec.rc_blockcount;

	/* Shared records always come before CoW records. */
	if (irec.rc_domain == XFS_REFC_DOMAIN_SHARED &&
	    rrc->prev_domain == XFS_REFC_DOMAIN_COW)
		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
	rrc->prev_domain = irec.rc_domain;

	xchk_refcountbt_check_mergeable(bs, rrc, &irec);
	xchk_refcountbt_xref(bs->sc, &irec);

	/*
	 * If this is a record for a shared extent, check that all blocks
	 * between the previous record and this one have at most one reverse
	 * mapping.
	 */
	if (irec.rc_domain == XFS_REFC_DOMAIN_SHARED) {
		xchk_refcountbt_xref_gaps(bs->sc, rrc, irec.rc_startblock);
		rrc->next_unshared_agbno = irec.rc_startblock +
				irec.rc_blockcount;
	}

	return 0;
}

/* Make sure we have as many refc blocks as the rmap says. */
STATIC void
xchk_refcount_xref_rmap(
	struct xfs_scrub	*sc,
	xfs_filblks_t		cow_blocks)
{
	xfs_extlen_t		refcbt_blocks = 0;
	xfs_filblks_t		blocks;
	int			error;

	if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm))
		return;

	/* Check that we saw as many refcbt blocks as the rmap knows about. */
	error = xfs_btree_count_blocks(sc->sa.refc_cur, &refcbt_blocks);
	if (!xchk_btree_process_error(sc, sc->sa.refc_cur, 0, &error))
		return;
	error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur,
			&XFS_RMAP_OINFO_REFC, &blocks);
	if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
		return;
	if (blocks != refcbt_blocks)
		xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);

	/* Check that we saw as many cow blocks as the rmap knows about. */
	error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur,
			&XFS_RMAP_OINFO_COW, &blocks);
	if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
		return;
	if (blocks != cow_blocks)
		xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
}

/* Scrub the refcount btree for some AG. */
int
xchk_refcountbt(
	struct xfs_scrub	*sc)
{
	struct xchk_refcbt_records rrc = {
		.cow_blocks		= 0,
		.next_unshared_agbno	= 0,
		.prev_domain		= XFS_REFC_DOMAIN_SHARED,
	};
	int			error;

	error = xchk_btree(sc, sc->sa.refc_cur, xchk_refcountbt_rec,
			&XFS_RMAP_OINFO_REFC, &rrc);
	if (error)
		return error;

	/*
	 * Check that all blocks between the last refcount > 1 record and the
	 * end of the AG have at most one reverse mapping.
	 */
	xchk_refcountbt_xref_gaps(sc, &rrc, sc->mp->m_sb.sb_agblocks);

	xchk_refcount_xref_rmap(sc, rrc.cow_blocks);

	return 0;
}

/* xref check that a cow staging extent is marked in the refcountbt. */
void
xchk_xref_is_cow_staging(
	struct xfs_scrub		*sc,
	xfs_agblock_t			agbno,
	xfs_extlen_t			len)
{
	struct xfs_refcount_irec	rc;
	int				has_refcount;
	int				error;

	if (!sc->sa.refc_cur || xchk_skip_xref(sc->sm))
		return;

	/* Find the CoW staging extent. */
	error = xfs_refcount_lookup_le(sc->sa.refc_cur, XFS_REFC_DOMAIN_COW,
			agbno, &has_refcount);
	if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur))
		return;
	if (!has_refcount) {
		xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0);
		return;
	}

	error = xfs_refcount_get_rec(sc->sa.refc_cur, &rc, &has_refcount);
	if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur))
		return;
	if (!has_refcount) {
		xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0);
		return;
	}

	/* CoW lookup returned a shared extent record? */
	if (rc.rc_domain != XFS_REFC_DOMAIN_COW)
		xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0);

	/* Must be at least as long as what was passed in */
	if (rc.rc_blockcount < len)
		xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0);
}

/*
 * xref check that the extent is not shared.  Only file data blocks
 * can have multiple owners.
 */
void
xchk_xref_is_not_shared(
	struct xfs_scrub	*sc,
	xfs_agblock_t		agbno,
	xfs_extlen_t		len)
{
	enum xbtree_recpacking	outcome;
	int			error;

	if (!sc->sa.refc_cur || xchk_skip_xref(sc->sm))
		return;

	error = xfs_refcount_has_records(sc->sa.refc_cur,
			XFS_REFC_DOMAIN_SHARED, agbno, len, &outcome);
	if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur))
		return;
	if (outcome != XBTREE_RECPACKING_EMPTY)
		xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0);
}

/* xref check that the extent is not being used for CoW staging. */
void
xchk_xref_is_not_cow_staging(
	struct xfs_scrub	*sc,
	xfs_agblock_t		agbno,
	xfs_extlen_t		len)
{
	enum xbtree_recpacking	outcome;
	int			error;

	if (!sc->sa.refc_cur || xchk_skip_xref(sc->sm))
		return;

	error = xfs_refcount_has_records(sc->sa.refc_cur, XFS_REFC_DOMAIN_COW,
			agbno, len, &outcome);
	if (!xchk_should_check_xref(sc, &error, &sc->sa.refc_cur))
		return;
	if (outcome != XBTREE_RECPACKING_EMPTY)
		xchk_btree_xref_set_corrupt(sc, sc->sa.refc_cur, 0);
}