1 // SPDX-License-Identifier: GPL-2.0+ 2 /* 3 * Copyright (C) 2019 Oracle. All Rights Reserved. 4 * Author: Darrick J. Wong <darrick.wong@oracle.com> 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_trans_resv.h" 11 #include "xfs_mount.h" 12 #include "xfs_defer.h" 13 #include "xfs_btree.h" 14 #include "xfs_bit.h" 15 #include "xfs_log_format.h" 16 #include "xfs_trans.h" 17 #include "xfs_sb.h" 18 #include "xfs_inode.h" 19 #include "xfs_alloc.h" 20 #include "xfs_ialloc.h" 21 #include "xfs_rmap.h" 22 #include "xfs_error.h" 23 #include "xfs_errortag.h" 24 #include "xfs_icache.h" 25 #include "xfs_health.h" 26 #include "xfs_bmap.h" 27 #include "scrub/xfs_scrub.h" 28 #include "scrub/scrub.h" 29 #include "scrub/common.h" 30 #include "scrub/trace.h" 31 32 /* 33 * FS Summary Counters 34 * =================== 35 * 36 * The basics of filesystem summary counter checking are that we iterate the 37 * AGs counting the number of free blocks, free space btree blocks, per-AG 38 * reservations, inodes, delayed allocation reservations, and free inodes. 39 * Then we compare what we computed against the in-core counters. 40 * 41 * However, the reality is that summary counters are a tricky beast to check. 42 * While we /could/ freeze the filesystem and scramble around the AGs counting 43 * the free blocks, in practice we prefer not do that for a scan because 44 * freezing is costly. To get around this, we added a per-cpu counter of the 45 * delalloc reservations so that we can rotor around the AGs relatively 46 * quickly, and we allow the counts to be slightly off because we're not taking 47 * any locks while we do this. 48 * 49 * So the first thing we do is warm up the buffer cache in the setup routine by 50 * walking all the AGs to make sure the incore per-AG structure has been 51 * initialized. The expected value calculation then iterates the incore per-AG 52 * structures as quickly as it can. We snapshot the percpu counters before and 53 * after this operation and use the difference in counter values to guess at 54 * our tolerance for mismatch between expected and actual counter values. 55 */ 56 57 /* 58 * Since the expected value computation is lockless but only browses incore 59 * values, the percpu counters should be fairly close to each other. However, 60 * we'll allow ourselves to be off by at least this (arbitrary) amount. 61 */ 62 #define XCHK_FSCOUNT_MIN_VARIANCE (512) 63 64 /* 65 * Make sure the per-AG structure has been initialized from the on-disk header 66 * contents and trust that the incore counters match the ondisk counters. (The 67 * AGF and AGI scrubbers check them, and a normal xfs_scrub run checks the 68 * summary counters after checking all AG headers). Do this from the setup 69 * function so that the inner AG aggregation loop runs as quickly as possible. 70 * 71 * This function runs during the setup phase /before/ we start checking any 72 * metadata. 73 */ 74 STATIC int 75 xchk_fscount_warmup( 76 struct xfs_scrub *sc) 77 { 78 struct xfs_mount *mp = sc->mp; 79 struct xfs_buf *agi_bp = NULL; 80 struct xfs_buf *agf_bp = NULL; 81 struct xfs_perag *pag = NULL; 82 xfs_agnumber_t agno; 83 int error = 0; 84 85 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { 86 pag = xfs_perag_get(mp, agno); 87 88 if (pag->pagi_init && pag->pagf_init) 89 goto next_loop_perag; 90 91 /* Lock both AG headers. */ 92 error = xfs_ialloc_read_agi(mp, sc->tp, agno, &agi_bp); 93 if (error) 94 break; 95 error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &agf_bp); 96 if (error) 97 break; 98 error = -ENOMEM; 99 if (!agf_bp || !agi_bp) 100 break; 101 102 /* 103 * These are supposed to be initialized by the header read 104 * function. 105 */ 106 error = -EFSCORRUPTED; 107 if (!pag->pagi_init || !pag->pagf_init) 108 break; 109 110 xfs_buf_relse(agf_bp); 111 agf_bp = NULL; 112 xfs_buf_relse(agi_bp); 113 agi_bp = NULL; 114 next_loop_perag: 115 xfs_perag_put(pag); 116 pag = NULL; 117 error = 0; 118 119 if (fatal_signal_pending(current)) 120 break; 121 } 122 123 if (agf_bp) 124 xfs_buf_relse(agf_bp); 125 if (agi_bp) 126 xfs_buf_relse(agi_bp); 127 if (pag) 128 xfs_perag_put(pag); 129 return error; 130 } 131 132 int 133 xchk_setup_fscounters( 134 struct xfs_scrub *sc, 135 struct xfs_inode *ip) 136 { 137 struct xchk_fscounters *fsc; 138 int error; 139 140 sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), KM_SLEEP); 141 if (!sc->buf) 142 return -ENOMEM; 143 fsc = sc->buf; 144 145 xfs_icount_range(sc->mp, &fsc->icount_min, &fsc->icount_max); 146 147 /* We must get the incore counters set up before we can proceed. */ 148 error = xchk_fscount_warmup(sc); 149 if (error) 150 return error; 151 152 /* 153 * Pause background reclaim while we're scrubbing to reduce the 154 * likelihood of background perturbations to the counters throwing off 155 * our calculations. 156 */ 157 xchk_stop_reaping(sc); 158 159 return xchk_trans_alloc(sc, 0); 160 } 161 162 /* 163 * Calculate what the global in-core counters ought to be from the incore 164 * per-AG structure. Callers can compare this to the actual in-core counters 165 * to estimate by how much both in-core and on-disk counters need to be 166 * adjusted. 167 */ 168 STATIC int 169 xchk_fscount_aggregate_agcounts( 170 struct xfs_scrub *sc, 171 struct xchk_fscounters *fsc) 172 { 173 struct xfs_mount *mp = sc->mp; 174 struct xfs_perag *pag; 175 uint64_t delayed; 176 xfs_agnumber_t agno; 177 int tries = 8; 178 179 retry: 180 fsc->icount = 0; 181 fsc->ifree = 0; 182 fsc->fdblocks = 0; 183 184 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { 185 pag = xfs_perag_get(mp, agno); 186 187 /* This somehow got unset since the warmup? */ 188 if (!pag->pagi_init || !pag->pagf_init) { 189 xfs_perag_put(pag); 190 return -EFSCORRUPTED; 191 } 192 193 /* Count all the inodes */ 194 fsc->icount += pag->pagi_count; 195 fsc->ifree += pag->pagi_freecount; 196 197 /* Add up the free/freelist/bnobt/cntbt blocks */ 198 fsc->fdblocks += pag->pagf_freeblks; 199 fsc->fdblocks += pag->pagf_flcount; 200 fsc->fdblocks += pag->pagf_btreeblks; 201 202 /* 203 * Per-AG reservations are taken out of the incore counters, 204 * so they must be left out of the free blocks computation. 205 */ 206 fsc->fdblocks -= pag->pag_meta_resv.ar_reserved; 207 fsc->fdblocks -= pag->pag_rmapbt_resv.ar_orig_reserved; 208 209 xfs_perag_put(pag); 210 211 if (fatal_signal_pending(current)) 212 break; 213 } 214 215 /* 216 * The global incore space reservation is taken from the incore 217 * counters, so leave that out of the computation. 218 */ 219 fsc->fdblocks -= mp->m_resblks_avail; 220 221 /* 222 * Delayed allocation reservations are taken out of the incore counters 223 * but not recorded on disk, so leave them and their indlen blocks out 224 * of the computation. 225 */ 226 delayed = percpu_counter_sum(&mp->m_delalloc_blks); 227 fsc->fdblocks -= delayed; 228 229 trace_xchk_fscounters_calc(mp, fsc->icount, fsc->ifree, fsc->fdblocks, 230 delayed); 231 232 233 /* Bail out if the values we compute are totally nonsense. */ 234 if (fsc->icount < fsc->icount_min || fsc->icount > fsc->icount_max || 235 fsc->fdblocks > mp->m_sb.sb_dblocks || 236 fsc->ifree > fsc->icount_max) 237 return -EFSCORRUPTED; 238 239 /* 240 * If ifree > icount then we probably had some perturbation in the 241 * counters while we were calculating things. We'll try a few times 242 * to maintain ifree <= icount before giving up. 243 */ 244 if (fsc->ifree > fsc->icount) { 245 if (tries--) 246 goto retry; 247 xchk_set_incomplete(sc); 248 return 0; 249 } 250 251 return 0; 252 } 253 254 /* 255 * Is the @counter reasonably close to the @expected value? 256 * 257 * We neither locked nor froze anything in the filesystem while aggregating the 258 * per-AG data to compute the @expected value, which means that the counter 259 * could have changed. We know the @old_value of the summation of the counter 260 * before the aggregation, and we re-sum the counter now. If the expected 261 * value falls between the two summations, we're ok. 262 * 263 * Otherwise, we /might/ have a problem. If the change in the summations is 264 * more than we want to tolerate, the filesystem is probably busy and we should 265 * just send back INCOMPLETE and see if userspace will try again. 266 */ 267 static inline bool 268 xchk_fscount_within_range( 269 struct xfs_scrub *sc, 270 const int64_t old_value, 271 struct percpu_counter *counter, 272 uint64_t expected) 273 { 274 int64_t min_value, max_value; 275 int64_t curr_value = percpu_counter_sum(counter); 276 277 trace_xchk_fscounters_within_range(sc->mp, expected, curr_value, 278 old_value); 279 280 /* Negative values are always wrong. */ 281 if (curr_value < 0) 282 return false; 283 284 /* Exact matches are always ok. */ 285 if (curr_value == expected) 286 return true; 287 288 min_value = min(old_value, curr_value); 289 max_value = max(old_value, curr_value); 290 291 /* Within the before-and-after range is ok. */ 292 if (expected >= min_value && expected <= max_value) 293 return true; 294 295 /* 296 * If the difference between the two summations is too large, the fs 297 * might just be busy and so we'll mark the scrub incomplete. Return 298 * true here so that we don't mark the counter corrupt. 299 * 300 * XXX: In the future when userspace can grant scrub permission to 301 * quiesce the filesystem to solve the outsized variance problem, this 302 * check should be moved up and the return code changed to signal to 303 * userspace that we need quiesce permission. 304 */ 305 if (max_value - min_value >= XCHK_FSCOUNT_MIN_VARIANCE) { 306 xchk_set_incomplete(sc); 307 return true; 308 } 309 310 return false; 311 } 312 313 /* Check the superblock counters. */ 314 int 315 xchk_fscounters( 316 struct xfs_scrub *sc) 317 { 318 struct xfs_mount *mp = sc->mp; 319 struct xchk_fscounters *fsc = sc->buf; 320 int64_t icount, ifree, fdblocks; 321 int error; 322 323 /* Snapshot the percpu counters. */ 324 icount = percpu_counter_sum(&mp->m_icount); 325 ifree = percpu_counter_sum(&mp->m_ifree); 326 fdblocks = percpu_counter_sum(&mp->m_fdblocks); 327 328 /* No negative values, please! */ 329 if (icount < 0 || ifree < 0 || fdblocks < 0) 330 xchk_set_corrupt(sc); 331 332 /* See if icount is obviously wrong. */ 333 if (icount < fsc->icount_min || icount > fsc->icount_max) 334 xchk_set_corrupt(sc); 335 336 /* See if fdblocks is obviously wrong. */ 337 if (fdblocks > mp->m_sb.sb_dblocks) 338 xchk_set_corrupt(sc); 339 340 /* 341 * If ifree exceeds icount by more than the minimum variance then 342 * something's probably wrong with the counters. 343 */ 344 if (ifree > icount && ifree - icount > XCHK_FSCOUNT_MIN_VARIANCE) 345 xchk_set_corrupt(sc); 346 347 /* Walk the incore AG headers to calculate the expected counters. */ 348 error = xchk_fscount_aggregate_agcounts(sc, fsc); 349 if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error)) 350 return error; 351 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) 352 return 0; 353 354 /* Compare the in-core counters with whatever we counted. */ 355 if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, fsc->icount)) 356 xchk_set_corrupt(sc); 357 358 if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree)) 359 xchk_set_corrupt(sc); 360 361 if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks, 362 fsc->fdblocks)) 363 xchk_set_corrupt(sc); 364 365 return 0; 366 } 367