1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Copyright (C) 2019-2023 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <djwong@kernel.org>
5 */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_log_format.h"
12 #include "xfs_trans.h"
13 #include "xfs_mount.h"
14 #include "xfs_alloc.h"
15 #include "xfs_ialloc.h"
16 #include "xfs_health.h"
17 #include "xfs_btree.h"
18 #include "xfs_ag.h"
19 #include "xfs_rtbitmap.h"
20 #include "xfs_inode.h"
21 #include "xfs_icache.h"
22 #include "scrub/scrub.h"
23 #include "scrub/common.h"
24 #include "scrub/trace.h"
25
26 /*
27 * FS Summary Counters
28 * ===================
29 *
30 * The basics of filesystem summary counter checking are that we iterate the
31 * AGs counting the number of free blocks, free space btree blocks, per-AG
32 * reservations, inodes, delayed allocation reservations, and free inodes.
33 * Then we compare what we computed against the in-core counters.
34 *
35 * However, the reality is that summary counters are a tricky beast to check.
36 * While we /could/ freeze the filesystem and scramble around the AGs counting
 * the free blocks, in practice we prefer not to do that for a scan because
38 * freezing is costly. To get around this, we added a per-cpu counter of the
39 * delalloc reservations so that we can rotor around the AGs relatively
40 * quickly, and we allow the counts to be slightly off because we're not taking
41 * any locks while we do this.
42 *
43 * So the first thing we do is warm up the buffer cache in the setup routine by
44 * walking all the AGs to make sure the incore per-AG structure has been
45 * initialized. The expected value calculation then iterates the incore per-AG
46 * structures as quickly as it can. We snapshot the percpu counters before and
47 * after this operation and use the difference in counter values to guess at
48 * our tolerance for mismatch between expected and actual counter values.
49 */
50
51 struct xchk_fscounters {
52 struct xfs_scrub *sc;
53 uint64_t icount;
54 uint64_t ifree;
55 uint64_t fdblocks;
56 uint64_t frextents;
57 unsigned long long icount_min;
58 unsigned long long icount_max;
59 bool frozen;
60 };
61
/*
 * Since the expected value computation is lockless but only browses incore
 * values, the percpu counters should be fairly close to each other.  However,
 * we'll allow ourselves to be off by at least this (arbitrary) amount.
 *
 * xchk_fscounters() uses this as the slack permitted when the summed free
 * inode counter exceeds the summed inode counter.
 */
#define XCHK_FSCOUNT_MIN_VARIANCE	(512)
68
69 /*
70 * Make sure the per-AG structure has been initialized from the on-disk header
71 * contents and trust that the incore counters match the ondisk counters. (The
72 * AGF and AGI scrubbers check them, and a normal xfs_scrub run checks the
73 * summary counters after checking all AG headers). Do this from the setup
74 * function so that the inner AG aggregation loop runs as quickly as possible.
75 *
76 * This function runs during the setup phase /before/ we start checking any
77 * metadata.
78 */
79 STATIC int
xchk_fscount_warmup(struct xfs_scrub * sc)80 xchk_fscount_warmup(
81 struct xfs_scrub *sc)
82 {
83 struct xfs_mount *mp = sc->mp;
84 struct xfs_buf *agi_bp = NULL;
85 struct xfs_buf *agf_bp = NULL;
86 struct xfs_perag *pag = NULL;
87 xfs_agnumber_t agno;
88 int error = 0;
89
90 for_each_perag(mp, agno, pag) {
91 if (xchk_should_terminate(sc, &error))
92 break;
93 if (xfs_perag_initialised_agi(pag) &&
94 xfs_perag_initialised_agf(pag))
95 continue;
96
97 /* Lock both AG headers. */
98 error = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp);
99 if (error)
100 break;
101 error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf_bp);
102 if (error)
103 break;
104
105 /*
106 * These are supposed to be initialized by the header read
107 * function.
108 */
109 if (!xfs_perag_initialised_agi(pag) ||
110 !xfs_perag_initialised_agf(pag)) {
111 error = -EFSCORRUPTED;
112 break;
113 }
114
115 xfs_buf_relse(agf_bp);
116 agf_bp = NULL;
117 xfs_buf_relse(agi_bp);
118 agi_bp = NULL;
119 }
120
121 if (agf_bp)
122 xfs_buf_relse(agf_bp);
123 if (agi_bp)
124 xfs_buf_relse(agi_bp);
125 if (pag)
126 xfs_perag_rele(pag);
127 return error;
128 }
129
130 static inline int
xchk_fsfreeze(struct xfs_scrub * sc)131 xchk_fsfreeze(
132 struct xfs_scrub *sc)
133 {
134 int error;
135
136 error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL);
137 trace_xchk_fsfreeze(sc, error);
138 return error;
139 }
140
141 static inline int
xchk_fsthaw(struct xfs_scrub * sc)142 xchk_fsthaw(
143 struct xfs_scrub *sc)
144 {
145 int error;
146
147 /* This should always succeed, we have a kernel freeze */
148 error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL);
149 trace_xchk_fsthaw(sc, error);
150 return error;
151 }
152
153 /*
154 * We couldn't stabilize the filesystem long enough to sample all the variables
155 * that comprise the summary counters and compare them to the percpu counters.
156 * We need to disable all writer threads, which means taking the first two
157 * freeze levels to put userspace to sleep, and the third freeze level to
158 * prevent background threads from starting new transactions. Take one level
159 * more to prevent other callers from unfreezing the filesystem while we run.
160 */
161 STATIC int
xchk_fscounters_freeze(struct xfs_scrub * sc)162 xchk_fscounters_freeze(
163 struct xfs_scrub *sc)
164 {
165 struct xchk_fscounters *fsc = sc->buf;
166 int error = 0;
167
168 if (sc->flags & XCHK_HAVE_FREEZE_PROT) {
169 sc->flags &= ~XCHK_HAVE_FREEZE_PROT;
170 mnt_drop_write_file(sc->file);
171 }
172
173 /* Try to grab a kernel freeze. */
174 while ((error = xchk_fsfreeze(sc)) == -EBUSY) {
175 if (xchk_should_terminate(sc, &error))
176 return error;
177
178 delay(HZ / 10);
179 }
180 if (error)
181 return error;
182
183 fsc->frozen = true;
184 return 0;
185 }
186
187 /* Thaw the filesystem after checking or repairing fscounters. */
188 STATIC void
xchk_fscounters_cleanup(void * buf)189 xchk_fscounters_cleanup(
190 void *buf)
191 {
192 struct xchk_fscounters *fsc = buf;
193 struct xfs_scrub *sc = fsc->sc;
194 int error;
195
196 if (!fsc->frozen)
197 return;
198
199 error = xchk_fsthaw(sc);
200 if (error)
201 xfs_emerg(sc->mp, "still frozen after scrub, err=%d", error);
202 else
203 fsc->frozen = false;
204 }
205
206 int
xchk_setup_fscounters(struct xfs_scrub * sc)207 xchk_setup_fscounters(
208 struct xfs_scrub *sc)
209 {
210 struct xchk_fscounters *fsc;
211 int error;
212
213 /*
214 * If the AGF doesn't track btreeblks, we have to lock the AGF to count
215 * btree block usage by walking the actual btrees.
216 */
217 if (!xfs_has_lazysbcount(sc->mp))
218 xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
219
220 sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS);
221 if (!sc->buf)
222 return -ENOMEM;
223 sc->buf_cleanup = xchk_fscounters_cleanup;
224 fsc = sc->buf;
225 fsc->sc = sc;
226
227 xfs_icount_range(sc->mp, &fsc->icount_min, &fsc->icount_max);
228
229 /* We must get the incore counters set up before we can proceed. */
230 error = xchk_fscount_warmup(sc);
231 if (error)
232 return error;
233
234 /*
235 * Pause all writer activity in the filesystem while we're scrubbing to
236 * reduce the likelihood of background perturbations to the counters
237 * throwing off our calculations.
238 */
239 if (sc->flags & XCHK_TRY_HARDER) {
240 error = xchk_fscounters_freeze(sc);
241 if (error)
242 return error;
243 }
244
245 return xfs_trans_alloc_empty(sc->mp, &sc->tp);
246 }
247
248 /*
249 * Part 1: Collecting filesystem summary counts. For each AG, we add its
250 * summary counts (total inodes, free inodes, free data blocks) to an incore
251 * copy of the overall filesystem summary counts.
252 *
253 * To avoid false corruption reports in part 2, any failure in this part must
254 * set the INCOMPLETE flag even when a negative errno is returned. This care
255 * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED,
256 * ECANCELED) that are absorbed into a scrub state flag update by
257 * xchk_*_process_error.
258 */
259
260 /* Count free space btree blocks manually for pre-lazysbcount filesystems. */
261 static int
xchk_fscount_btreeblks(struct xfs_scrub * sc,struct xchk_fscounters * fsc,xfs_agnumber_t agno)262 xchk_fscount_btreeblks(
263 struct xfs_scrub *sc,
264 struct xchk_fscounters *fsc,
265 xfs_agnumber_t agno)
266 {
267 xfs_extlen_t blocks;
268 int error;
269
270 error = xchk_ag_init_existing(sc, agno, &sc->sa);
271 if (error)
272 goto out_free;
273
274 error = xfs_btree_count_blocks(sc->sa.bno_cur, &blocks);
275 if (error)
276 goto out_free;
277 fsc->fdblocks += blocks - 1;
278
279 error = xfs_btree_count_blocks(sc->sa.cnt_cur, &blocks);
280 if (error)
281 goto out_free;
282 fsc->fdblocks += blocks - 1;
283
284 out_free:
285 xchk_ag_free(sc, &sc->sa);
286 return error;
287 }
288
289 /*
290 * Calculate what the global in-core counters ought to be from the incore
291 * per-AG structure. Callers can compare this to the actual in-core counters
292 * to estimate by how much both in-core and on-disk counters need to be
293 * adjusted.
294 */
295 STATIC int
xchk_fscount_aggregate_agcounts(struct xfs_scrub * sc,struct xchk_fscounters * fsc)296 xchk_fscount_aggregate_agcounts(
297 struct xfs_scrub *sc,
298 struct xchk_fscounters *fsc)
299 {
300 struct xfs_mount *mp = sc->mp;
301 struct xfs_perag *pag;
302 uint64_t delayed;
303 xfs_agnumber_t agno;
304 int tries = 8;
305 int error = 0;
306
307 retry:
308 fsc->icount = 0;
309 fsc->ifree = 0;
310 fsc->fdblocks = 0;
311
312 for_each_perag(mp, agno, pag) {
313 if (xchk_should_terminate(sc, &error))
314 break;
315
316 /* This somehow got unset since the warmup? */
317 if (!xfs_perag_initialised_agi(pag) ||
318 !xfs_perag_initialised_agf(pag)) {
319 error = -EFSCORRUPTED;
320 break;
321 }
322
323 /* Count all the inodes */
324 fsc->icount += pag->pagi_count;
325 fsc->ifree += pag->pagi_freecount;
326
327 /* Add up the free/freelist/bnobt/cntbt blocks */
328 fsc->fdblocks += pag->pagf_freeblks;
329 fsc->fdblocks += pag->pagf_flcount;
330 if (xfs_has_lazysbcount(sc->mp)) {
331 fsc->fdblocks += pag->pagf_btreeblks;
332 } else {
333 error = xchk_fscount_btreeblks(sc, fsc, agno);
334 if (error)
335 break;
336 }
337
338 /*
339 * Per-AG reservations are taken out of the incore counters,
340 * so they must be left out of the free blocks computation.
341 */
342 fsc->fdblocks -= pag->pag_meta_resv.ar_reserved;
343 fsc->fdblocks -= pag->pag_rmapbt_resv.ar_orig_reserved;
344
345 }
346 if (pag)
347 xfs_perag_rele(pag);
348 if (error) {
349 xchk_set_incomplete(sc);
350 return error;
351 }
352
353 /*
354 * The global incore space reservation is taken from the incore
355 * counters, so leave that out of the computation.
356 */
357 fsc->fdblocks -= mp->m_resblks_avail;
358
359 /*
360 * Delayed allocation reservations are taken out of the incore counters
361 * but not recorded on disk, so leave them and their indlen blocks out
362 * of the computation.
363 */
364 delayed = percpu_counter_sum(&mp->m_delalloc_blks);
365 fsc->fdblocks -= delayed;
366
367 trace_xchk_fscounters_calc(mp, fsc->icount, fsc->ifree, fsc->fdblocks,
368 delayed);
369
370
371 /* Bail out if the values we compute are totally nonsense. */
372 if (fsc->icount < fsc->icount_min || fsc->icount > fsc->icount_max ||
373 fsc->fdblocks > mp->m_sb.sb_dblocks ||
374 fsc->ifree > fsc->icount_max)
375 return -EFSCORRUPTED;
376
377 /*
378 * If ifree > icount then we probably had some perturbation in the
379 * counters while we were calculating things. We'll try a few times
380 * to maintain ifree <= icount before giving up.
381 */
382 if (fsc->ifree > fsc->icount) {
383 if (tries--)
384 goto retry;
385 return -EDEADLOCK;
386 }
387
388 return 0;
389 }
390
391 #ifdef CONFIG_XFS_RT
392 STATIC int
xchk_fscount_add_frextent(struct xfs_mount * mp,struct xfs_trans * tp,const struct xfs_rtalloc_rec * rec,void * priv)393 xchk_fscount_add_frextent(
394 struct xfs_mount *mp,
395 struct xfs_trans *tp,
396 const struct xfs_rtalloc_rec *rec,
397 void *priv)
398 {
399 struct xchk_fscounters *fsc = priv;
400 int error = 0;
401
402 fsc->frextents += rec->ar_extcount;
403
404 xchk_should_terminate(fsc->sc, &error);
405 return error;
406 }
407
408 /* Calculate the number of free realtime extents from the realtime bitmap. */
409 STATIC int
xchk_fscount_count_frextents(struct xfs_scrub * sc,struct xchk_fscounters * fsc)410 xchk_fscount_count_frextents(
411 struct xfs_scrub *sc,
412 struct xchk_fscounters *fsc)
413 {
414 struct xfs_mount *mp = sc->mp;
415 int error;
416
417 fsc->frextents = 0;
418 if (!xfs_has_realtime(mp))
419 return 0;
420
421 xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
422 error = xfs_rtalloc_query_all(sc->mp, sc->tp,
423 xchk_fscount_add_frextent, fsc);
424 if (error) {
425 xchk_set_incomplete(sc);
426 goto out_unlock;
427 }
428
429 out_unlock:
430 xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
431 return error;
432 }
433 #else
434 STATIC int
xchk_fscount_count_frextents(struct xfs_scrub * sc,struct xchk_fscounters * fsc)435 xchk_fscount_count_frextents(
436 struct xfs_scrub *sc,
437 struct xchk_fscounters *fsc)
438 {
439 fsc->frextents = 0;
440 return 0;
441 }
442 #endif /* CONFIG_XFS_RT */
443
444 /*
445 * Part 2: Comparing filesystem summary counters. All we have to do here is
446 * sum the percpu counters and compare them to what we've observed.
447 */
448
449 /*
450 * Is the @counter reasonably close to the @expected value?
451 *
452 * We neither locked nor froze anything in the filesystem while aggregating the
453 * per-AG data to compute the @expected value, which means that the counter
454 * could have changed. We know the @old_value of the summation of the counter
455 * before the aggregation, and we re-sum the counter now. If the expected
456 * value falls between the two summations, we're ok.
457 *
458 * Otherwise, we /might/ have a problem. If the change in the summations is
459 * more than we want to tolerate, the filesystem is probably busy and we should
460 * just send back INCOMPLETE and see if userspace will try again.
461 *
462 * If we're repairing then we require an exact match.
463 */
464 static inline bool
xchk_fscount_within_range(struct xfs_scrub * sc,const int64_t old_value,struct percpu_counter * counter,uint64_t expected)465 xchk_fscount_within_range(
466 struct xfs_scrub *sc,
467 const int64_t old_value,
468 struct percpu_counter *counter,
469 uint64_t expected)
470 {
471 int64_t min_value, max_value;
472 int64_t curr_value = percpu_counter_sum(counter);
473
474 trace_xchk_fscounters_within_range(sc->mp, expected, curr_value,
475 old_value);
476
477 /* Negative values are always wrong. */
478 if (curr_value < 0)
479 return false;
480
481 /* Exact matches are always ok. */
482 if (curr_value == expected)
483 return true;
484
485 min_value = min(old_value, curr_value);
486 max_value = max(old_value, curr_value);
487
488 /* Within the before-and-after range is ok. */
489 if (expected >= min_value && expected <= max_value)
490 return true;
491
492 /* Everything else is bad. */
493 return false;
494 }
495
496 /* Check the superblock counters. */
497 int
xchk_fscounters(struct xfs_scrub * sc)498 xchk_fscounters(
499 struct xfs_scrub *sc)
500 {
501 struct xfs_mount *mp = sc->mp;
502 struct xchk_fscounters *fsc = sc->buf;
503 int64_t icount, ifree, fdblocks, frextents;
504 bool try_again = false;
505 int error;
506
507 /* Snapshot the percpu counters. */
508 icount = percpu_counter_sum(&mp->m_icount);
509 ifree = percpu_counter_sum(&mp->m_ifree);
510 fdblocks = percpu_counter_sum(&mp->m_fdblocks);
511 frextents = percpu_counter_sum(&mp->m_frextents);
512
513 /* No negative values, please! */
514 if (icount < 0 || ifree < 0)
515 xchk_set_corrupt(sc);
516
517 /*
518 * If the filesystem is not frozen, the counter summation calls above
519 * can race with xfs_mod_freecounter, which subtracts a requested space
520 * reservation from the counter and undoes the subtraction if that made
521 * the counter go negative. Therefore, it's possible to see negative
522 * values here, and we should only flag that as a corruption if we
523 * froze the fs. This is much more likely to happen with frextents
524 * since there are no reserved pools.
525 */
526 if (fdblocks < 0 || frextents < 0) {
527 if (!fsc->frozen)
528 return -EDEADLOCK;
529
530 xchk_set_corrupt(sc);
531 return 0;
532 }
533
534 /* See if icount is obviously wrong. */
535 if (icount < fsc->icount_min || icount > fsc->icount_max)
536 xchk_set_corrupt(sc);
537
538 /* See if fdblocks is obviously wrong. */
539 if (fdblocks > mp->m_sb.sb_dblocks)
540 xchk_set_corrupt(sc);
541
542 /* See if frextents is obviously wrong. */
543 if (frextents > mp->m_sb.sb_rextents)
544 xchk_set_corrupt(sc);
545
546 /*
547 * If ifree exceeds icount by more than the minimum variance then
548 * something's probably wrong with the counters.
549 */
550 if (ifree > icount && ifree - icount > XCHK_FSCOUNT_MIN_VARIANCE)
551 xchk_set_corrupt(sc);
552
553 /* Walk the incore AG headers to calculate the expected counters. */
554 error = xchk_fscount_aggregate_agcounts(sc, fsc);
555 if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error))
556 return error;
557
558 /* Count the free extents counter for rt volumes. */
559 error = xchk_fscount_count_frextents(sc, fsc);
560 if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error))
561 return error;
562 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
563 return 0;
564
565 /*
566 * Compare the in-core counters with whatever we counted. If the fs is
567 * frozen, we treat the discrepancy as a corruption because the freeze
568 * should have stabilized the counter values. Otherwise, we need
569 * userspace to call us back having granted us freeze permission.
570 */
571 if (!xchk_fscount_within_range(sc, icount, &mp->m_icount,
572 fsc->icount)) {
573 if (fsc->frozen)
574 xchk_set_corrupt(sc);
575 else
576 try_again = true;
577 }
578
579 if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree)) {
580 if (fsc->frozen)
581 xchk_set_corrupt(sc);
582 else
583 try_again = true;
584 }
585
586 if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks,
587 fsc->fdblocks)) {
588 if (fsc->frozen)
589 xchk_set_corrupt(sc);
590 else
591 try_again = true;
592 }
593
594 if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents,
595 fsc->frextents)) {
596 if (fsc->frozen)
597 xchk_set_corrupt(sc);
598 else
599 try_again = true;
600 }
601
602 if (try_again)
603 return -EDEADLOCK;
604
605 return 0;
606 }
607