1 // SPDX-License-Identifier: GPL-2.0+ 2 /* 3 * Copyright (C) 2019 Oracle. All Rights Reserved. 4 * Author: Darrick J. Wong <darrick.wong@oracle.com> 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_trans_resv.h" 11 #include "xfs_mount.h" 12 #include "xfs_alloc.h" 13 #include "xfs_ialloc.h" 14 #include "xfs_health.h" 15 #include "xfs_btree.h" 16 #include "xfs_ag.h" 17 #include "scrub/scrub.h" 18 #include "scrub/common.h" 19 #include "scrub/trace.h" 20 21 /* 22 * FS Summary Counters 23 * =================== 24 * 25 * The basics of filesystem summary counter checking are that we iterate the 26 * AGs counting the number of free blocks, free space btree blocks, per-AG 27 * reservations, inodes, delayed allocation reservations, and free inodes. 28 * Then we compare what we computed against the in-core counters. 29 * 30 * However, the reality is that summary counters are a tricky beast to check. 31 * While we /could/ freeze the filesystem and scramble around the AGs counting 32 * the free blocks, in practice we prefer not do that for a scan because 33 * freezing is costly. To get around this, we added a per-cpu counter of the 34 * delalloc reservations so that we can rotor around the AGs relatively 35 * quickly, and we allow the counts to be slightly off because we're not taking 36 * any locks while we do this. 37 * 38 * So the first thing we do is warm up the buffer cache in the setup routine by 39 * walking all the AGs to make sure the incore per-AG structure has been 40 * initialized. The expected value calculation then iterates the incore per-AG 41 * structures as quickly as it can. We snapshot the percpu counters before and 42 * after this operation and use the difference in counter values to guess at 43 * our tolerance for mismatch between expected and actual counter values. 44 */ 45 46 /* 47 * Since the expected value computation is lockless but only browses incore 48 * values, the percpu counters should be fairly close to each other. However, 49 * we'll allow ourselves to be off by at least this (arbitrary) amount. 50 */ 51 #define XCHK_FSCOUNT_MIN_VARIANCE (512) 52 53 /* 54 * Make sure the per-AG structure has been initialized from the on-disk header 55 * contents and trust that the incore counters match the ondisk counters. (The 56 * AGF and AGI scrubbers check them, and a normal xfs_scrub run checks the 57 * summary counters after checking all AG headers). Do this from the setup 58 * function so that the inner AG aggregation loop runs as quickly as possible. 59 * 60 * This function runs during the setup phase /before/ we start checking any 61 * metadata. 62 */ 63 STATIC int 64 xchk_fscount_warmup( 65 struct xfs_scrub *sc) 66 { 67 struct xfs_mount *mp = sc->mp; 68 struct xfs_buf *agi_bp = NULL; 69 struct xfs_buf *agf_bp = NULL; 70 struct xfs_perag *pag = NULL; 71 xfs_agnumber_t agno; 72 int error = 0; 73 74 for_each_perag(mp, agno, pag) { 75 if (xchk_should_terminate(sc, &error)) 76 break; 77 if (pag->pagi_init && pag->pagf_init) 78 continue; 79 80 /* Lock both AG headers. */ 81 error = xfs_ialloc_read_agi(mp, sc->tp, agno, &agi_bp); 82 if (error) 83 break; 84 error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &agf_bp); 85 if (error) 86 break; 87 88 /* 89 * These are supposed to be initialized by the header read 90 * function. 91 */ 92 if (!pag->pagi_init || !pag->pagf_init) { 93 error = -EFSCORRUPTED; 94 break; 95 } 96 97 xfs_buf_relse(agf_bp); 98 agf_bp = NULL; 99 xfs_buf_relse(agi_bp); 100 agi_bp = NULL; 101 } 102 103 if (agf_bp) 104 xfs_buf_relse(agf_bp); 105 if (agi_bp) 106 xfs_buf_relse(agi_bp); 107 if (pag) 108 xfs_perag_put(pag); 109 return error; 110 } 111 112 int 113 xchk_setup_fscounters( 114 struct xfs_scrub *sc) 115 { 116 struct xchk_fscounters *fsc; 117 int error; 118 119 sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), 0); 120 if (!sc->buf) 121 return -ENOMEM; 122 fsc = sc->buf; 123 124 xfs_icount_range(sc->mp, &fsc->icount_min, &fsc->icount_max); 125 126 /* We must get the incore counters set up before we can proceed. */ 127 error = xchk_fscount_warmup(sc); 128 if (error) 129 return error; 130 131 /* 132 * Pause background reclaim while we're scrubbing to reduce the 133 * likelihood of background perturbations to the counters throwing off 134 * our calculations. 135 */ 136 xchk_stop_reaping(sc); 137 138 return xchk_trans_alloc(sc, 0); 139 } 140 141 /* Count free space btree blocks manually for pre-lazysbcount filesystems. */ 142 static int 143 xchk_fscount_btreeblks( 144 struct xfs_scrub *sc, 145 struct xchk_fscounters *fsc, 146 xfs_agnumber_t agno) 147 { 148 xfs_extlen_t blocks; 149 int error; 150 151 error = xchk_ag_init(sc, agno, &sc->sa); 152 if (error) 153 return error; 154 155 error = xfs_btree_count_blocks(sc->sa.bno_cur, &blocks); 156 if (error) 157 goto out_free; 158 fsc->fdblocks += blocks - 1; 159 160 error = xfs_btree_count_blocks(sc->sa.cnt_cur, &blocks); 161 if (error) 162 goto out_free; 163 fsc->fdblocks += blocks - 1; 164 165 out_free: 166 xchk_ag_free(sc, &sc->sa); 167 return error; 168 } 169 170 /* 171 * Calculate what the global in-core counters ought to be from the incore 172 * per-AG structure. Callers can compare this to the actual in-core counters 173 * to estimate by how much both in-core and on-disk counters need to be 174 * adjusted. 175 */ 176 STATIC int 177 xchk_fscount_aggregate_agcounts( 178 struct xfs_scrub *sc, 179 struct xchk_fscounters *fsc) 180 { 181 struct xfs_mount *mp = sc->mp; 182 struct xfs_perag *pag; 183 uint64_t delayed; 184 xfs_agnumber_t agno; 185 int tries = 8; 186 int error = 0; 187 188 retry: 189 fsc->icount = 0; 190 fsc->ifree = 0; 191 fsc->fdblocks = 0; 192 193 for_each_perag(mp, agno, pag) { 194 if (xchk_should_terminate(sc, &error)) 195 break; 196 197 /* This somehow got unset since the warmup? */ 198 if (!pag->pagi_init || !pag->pagf_init) { 199 error = -EFSCORRUPTED; 200 break; 201 } 202 203 /* Count all the inodes */ 204 fsc->icount += pag->pagi_count; 205 fsc->ifree += pag->pagi_freecount; 206 207 /* Add up the free/freelist/bnobt/cntbt blocks */ 208 fsc->fdblocks += pag->pagf_freeblks; 209 fsc->fdblocks += pag->pagf_flcount; 210 if (xfs_sb_version_haslazysbcount(&sc->mp->m_sb)) { 211 fsc->fdblocks += pag->pagf_btreeblks; 212 } else { 213 error = xchk_fscount_btreeblks(sc, fsc, agno); 214 if (error) 215 break; 216 } 217 218 /* 219 * Per-AG reservations are taken out of the incore counters, 220 * so they must be left out of the free blocks computation. 221 */ 222 fsc->fdblocks -= pag->pag_meta_resv.ar_reserved; 223 fsc->fdblocks -= pag->pag_rmapbt_resv.ar_orig_reserved; 224 225 } 226 if (pag) 227 xfs_perag_put(pag); 228 if (error) 229 return error; 230 231 /* 232 * The global incore space reservation is taken from the incore 233 * counters, so leave that out of the computation. 234 */ 235 fsc->fdblocks -= mp->m_resblks_avail; 236 237 /* 238 * Delayed allocation reservations are taken out of the incore counters 239 * but not recorded on disk, so leave them and their indlen blocks out 240 * of the computation. 241 */ 242 delayed = percpu_counter_sum(&mp->m_delalloc_blks); 243 fsc->fdblocks -= delayed; 244 245 trace_xchk_fscounters_calc(mp, fsc->icount, fsc->ifree, fsc->fdblocks, 246 delayed); 247 248 249 /* Bail out if the values we compute are totally nonsense. */ 250 if (fsc->icount < fsc->icount_min || fsc->icount > fsc->icount_max || 251 fsc->fdblocks > mp->m_sb.sb_dblocks || 252 fsc->ifree > fsc->icount_max) 253 return -EFSCORRUPTED; 254 255 /* 256 * If ifree > icount then we probably had some perturbation in the 257 * counters while we were calculating things. We'll try a few times 258 * to maintain ifree <= icount before giving up. 259 */ 260 if (fsc->ifree > fsc->icount) { 261 if (tries--) 262 goto retry; 263 xchk_set_incomplete(sc); 264 return 0; 265 } 266 267 return 0; 268 } 269 270 /* 271 * Is the @counter reasonably close to the @expected value? 272 * 273 * We neither locked nor froze anything in the filesystem while aggregating the 274 * per-AG data to compute the @expected value, which means that the counter 275 * could have changed. We know the @old_value of the summation of the counter 276 * before the aggregation, and we re-sum the counter now. If the expected 277 * value falls between the two summations, we're ok. 278 * 279 * Otherwise, we /might/ have a problem. If the change in the summations is 280 * more than we want to tolerate, the filesystem is probably busy and we should 281 * just send back INCOMPLETE and see if userspace will try again. 282 */ 283 static inline bool 284 xchk_fscount_within_range( 285 struct xfs_scrub *sc, 286 const int64_t old_value, 287 struct percpu_counter *counter, 288 uint64_t expected) 289 { 290 int64_t min_value, max_value; 291 int64_t curr_value = percpu_counter_sum(counter); 292 293 trace_xchk_fscounters_within_range(sc->mp, expected, curr_value, 294 old_value); 295 296 /* Negative values are always wrong. */ 297 if (curr_value < 0) 298 return false; 299 300 /* Exact matches are always ok. */ 301 if (curr_value == expected) 302 return true; 303 304 min_value = min(old_value, curr_value); 305 max_value = max(old_value, curr_value); 306 307 /* Within the before-and-after range is ok. */ 308 if (expected >= min_value && expected <= max_value) 309 return true; 310 311 /* 312 * If the difference between the two summations is too large, the fs 313 * might just be busy and so we'll mark the scrub incomplete. Return 314 * true here so that we don't mark the counter corrupt. 315 * 316 * XXX: In the future when userspace can grant scrub permission to 317 * quiesce the filesystem to solve the outsized variance problem, this 318 * check should be moved up and the return code changed to signal to 319 * userspace that we need quiesce permission. 320 */ 321 if (max_value - min_value >= XCHK_FSCOUNT_MIN_VARIANCE) { 322 xchk_set_incomplete(sc); 323 return true; 324 } 325 326 return false; 327 } 328 329 /* Check the superblock counters. */ 330 int 331 xchk_fscounters( 332 struct xfs_scrub *sc) 333 { 334 struct xfs_mount *mp = sc->mp; 335 struct xchk_fscounters *fsc = sc->buf; 336 int64_t icount, ifree, fdblocks; 337 int error; 338 339 /* Snapshot the percpu counters. */ 340 icount = percpu_counter_sum(&mp->m_icount); 341 ifree = percpu_counter_sum(&mp->m_ifree); 342 fdblocks = percpu_counter_sum(&mp->m_fdblocks); 343 344 /* No negative values, please! */ 345 if (icount < 0 || ifree < 0 || fdblocks < 0) 346 xchk_set_corrupt(sc); 347 348 /* See if icount is obviously wrong. */ 349 if (icount < fsc->icount_min || icount > fsc->icount_max) 350 xchk_set_corrupt(sc); 351 352 /* See if fdblocks is obviously wrong. */ 353 if (fdblocks > mp->m_sb.sb_dblocks) 354 xchk_set_corrupt(sc); 355 356 /* 357 * If ifree exceeds icount by more than the minimum variance then 358 * something's probably wrong with the counters. 359 */ 360 if (ifree > icount && ifree - icount > XCHK_FSCOUNT_MIN_VARIANCE) 361 xchk_set_corrupt(sc); 362 363 /* Walk the incore AG headers to calculate the expected counters. */ 364 error = xchk_fscount_aggregate_agcounts(sc, fsc); 365 if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error)) 366 return error; 367 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) 368 return 0; 369 370 /* Compare the in-core counters with whatever we counted. */ 371 if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, fsc->icount)) 372 xchk_set_corrupt(sc); 373 374 if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree)) 375 xchk_set_corrupt(sc); 376 377 if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks, 378 fsc->fdblocks)) 379 xchk_set_corrupt(sc); 380 381 return 0; 382 } 383