// SPDX-License-Identifier: GPL-2.0+
/*
 * Copyright (C) 2016 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_alloc.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_cksum.h"
#include "xfs_trans.h"
#include "xfs_bit.h"
#include "xfs_bmap.h"
#include "xfs_bmap_btree.h"
#include "xfs_ag_resv.h"
#include "xfs_trans_space.h"
#include "xfs_rmap_btree.h"
#include "xfs_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_ialloc_btree.h"

/*
 * Per-AG Block Reservations
 *
 * For some kinds of allocation group metadata structures, it is advantageous
 * to reserve a small number of blocks in each AG so that future expansions of
 * that data structure do not encounter ENOSPC, because errors during a btree
 * split cause the filesystem to go offline.
 *
 * Prior to the introduction of reflink, this wasn't an issue because the free
 * space btrees maintain a reserve of space (the AGFL) to handle any expansion
 * that may be necessary; and allocations of other metadata (inodes, BMBT,
 * dir/attr) aren't restricted to a single AG. However, with reflink it is
 * possible to allocate all the space in an AG, have subsequent reflink/CoW
 * activity expand the refcount btree, and discover that there's no space left
 * to handle that expansion. Since we can calculate the maximum size of the
 * refcount btree, we can reserve space for it and avoid ENOSPC.
 *
 * Handling per-AG reservations consists of four changes to the allocator's
 * behavior: First, because these reservations are always needed, we decrease
 * the ag_max_usable counter to reflect the size of the AG after the reserved
 * blocks are taken. Second, the reservations must be reflected in the
 * fdblocks count to maintain proper accounting. Third, each AG must maintain
 * its own reserved block counter so that we can calculate the amount of space
 * that must remain free to maintain the reservations. Fourth, the "remaining
 * reserved blocks" count must be used when calculating the length of the
 * longest free extent in an AG and to clamp maxlen in the per-AG allocation
 * functions. In other words, we maintain a virtual allocation via in-core
 * accounting tricks so that we don't have to clean up after a crash. :)
 *
 * Reserved blocks can be managed by passing one of the enum xfs_ag_resv_type
 * values via struct xfs_alloc_arg or directly to the xfs_free_extent
 * function. It might seem a little funny to maintain a reservoir of blocks
 * to feed another reservoir, but the AGFL only holds enough blocks to get
 * through the next transaction. The per-AG reservation is to ensure (we
 * hope) that each AG never runs out of blocks. Each data structure wanting
 * to use the reservation system should update ask/used in xfs_ag_resv_init.
 */
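
/*
 * Minimal usage sketch (illustrative only; the authoritative allocator
 * interface lives in xfs_alloc.h): a caller whose btree blocks should be
 * charged against the per-AG metadata reservation selects the reservation
 * type in its allocation arguments before calling the allocator, e.g.
 *
 *	args.resv = XFS_AG_RESV_METADATA;
 *	error = xfs_alloc_vextent(&args);
 *
 * and passes the same type back when freeing, so the blocks refill the
 * reservation instead of returning to the general free space pool.
 */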

/*
 * Are we critically low on blocks? For now we'll define that as the number
 * of blocks we can get our hands on being less than 10% of what we reserved
 * or less than some arbitrary number (maximum btree height).
 */
bool
xfs_ag_resv_critical(
	struct xfs_perag		*pag,
	enum xfs_ag_resv_type		type)
{
	xfs_extlen_t			avail;
	xfs_extlen_t			orig;

	switch (type) {
	case XFS_AG_RESV_METADATA:
		avail = pag->pagf_freeblks - pag->pag_rmapbt_resv.ar_reserved;
		orig = pag->pag_meta_resv.ar_asked;
		break;
	case XFS_AG_RESV_RMAPBT:
		avail = pag->pagf_freeblks + pag->pagf_flcount -
			pag->pag_meta_resv.ar_reserved;
		orig = pag->pag_rmapbt_resv.ar_asked;
		break;
	default:
		ASSERT(0);
		return false;
	}

	trace_xfs_ag_resv_critical(pag, type, avail);

	/* Critically low if less than 10% or max btree height remains. */
	return XFS_TEST_ERROR(avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS,
			pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL);
}

/*
 * How many blocks are reserved but not used, and therefore must not be
 * allocated away?
 */
xfs_extlen_t
xfs_ag_resv_needed(
	struct xfs_perag		*pag,
	enum xfs_ag_resv_type		type)
{
	xfs_extlen_t			len;

	len = pag->pag_meta_resv.ar_reserved + pag->pag_rmapbt_resv.ar_reserved;
	switch (type) {
	case XFS_AG_RESV_METADATA:
	case XFS_AG_RESV_RMAPBT:
		len -= xfs_perag_resv(pag, type)->ar_reserved;
		break;
	case XFS_AG_RESV_NONE:
		/* empty */
		break;
	default:
		ASSERT(0);
	}

	trace_xfs_ag_resv_needed(pag, type, len);

	return len;
}

/* Clean out a reservation */
static int
__xfs_ag_resv_free(
	struct xfs_perag		*pag,
	enum xfs_ag_resv_type		type)
{
	struct xfs_ag_resv		*resv;
	xfs_extlen_t			oldresv;
	int				error;

	trace_xfs_ag_resv_free(pag, type, 0);

	resv = xfs_perag_resv(pag, type);
	if (pag->pag_agno == 0)
		pag->pag_mount->m_ag_max_usable += resv->ar_asked;
	/*
	 * RMAPBT blocks come from the AGFL and AGFL blocks are always
	 * considered "free", so whatever was reserved at mount time must be
	 * given back at umount.
	 */
	if (type == XFS_AG_RESV_RMAPBT)
		oldresv = resv->ar_orig_reserved;
	else
		oldresv = resv->ar_reserved;
	error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true);
	resv->ar_reserved = 0;
	resv->ar_asked = 0;
	resv->ar_orig_reserved = 0;

	if (error)
		trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno,
				error, _RET_IP_);
	return error;
}

/* Free a per-AG reservation. */
int
xfs_ag_resv_free(
	struct xfs_perag		*pag)
{
	int				error;
	int				err2;

	error = __xfs_ag_resv_free(pag, XFS_AG_RESV_RMAPBT);
	err2 = __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA);
	if (err2 && !error)
		error = err2;
	return error;
}
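
/*
 * Worked example of the fdblocks adjustment made below (illustrative numbers
 * only): if a refcount btree reservation asks for 100 blocks of which 40 are
 * already in use, the METADATA case hides ask - used = 60 blocks from
 * fdblocks, since the 40 in-use blocks are already accounted on disk as used
 * space.  An RMAPBT reservation with the same ask/used hides all 100 blocks,
 * because rmapbt blocks live in (and are counted as) free space.
 */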

static int
__xfs_ag_resv_init(
	struct xfs_perag		*pag,
	enum xfs_ag_resv_type		type,
	xfs_extlen_t			ask,
	xfs_extlen_t			used)
{
	struct xfs_mount		*mp = pag->pag_mount;
	struct xfs_ag_resv		*resv;
	int				error;
	xfs_extlen_t			hidden_space;

	if (used > ask)
		ask = used;

	switch (type) {
	case XFS_AG_RESV_RMAPBT:
		/*
		 * Space taken by the rmapbt is not subtracted from fdblocks
		 * because the rmapbt lives in the free space.  Here we must
		 * subtract the entire reservation from fdblocks so that we
		 * always have blocks available for rmapbt expansion.
		 */
		hidden_space = ask;
		break;
	case XFS_AG_RESV_METADATA:
		/*
		 * Space taken by all other metadata btrees is accounted
		 * on-disk as used space.  We therefore only hide the space
		 * that is reserved but not used by the trees.
		 */
		hidden_space = ask - used;
		break;
	default:
		ASSERT(0);
		return -EINVAL;
	}
	error = xfs_mod_fdblocks(mp, -(int64_t)hidden_space, true);
	if (error) {
		trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
				error, _RET_IP_);
		xfs_warn(mp,
"Per-AG reservation for AG %u failed.  Filesystem may run out of space.",
				pag->pag_agno);
		return error;
	}

	/*
	 * Reduce the maximum per-AG allocation length by however much we're
	 * trying to reserve for an AG.  Since this is a filesystem-wide
	 * counter, we only make the adjustment for AG 0.  This assumes that
	 * there aren't any AGs hungrier for per-AG reservation than AG 0.
	 */
	if (pag->pag_agno == 0)
		mp->m_ag_max_usable -= ask;

	resv = xfs_perag_resv(pag, type);
	resv->ar_asked = ask;
	resv->ar_orig_reserved = hidden_space;
	resv->ar_reserved = ask - used;

	trace_xfs_ag_resv_init(pag, type, ask);
	return 0;
}

/* Create a per-AG block reservation. */
int
xfs_ag_resv_init(
	struct xfs_perag		*pag,
	struct xfs_trans		*tp)
{
	struct xfs_mount		*mp = pag->pag_mount;
	xfs_agnumber_t			agno = pag->pag_agno;
	xfs_extlen_t			ask;
	xfs_extlen_t			used;
	int				error = 0;

	/* Create the metadata reservation. */
	if (pag->pag_meta_resv.ar_asked == 0) {
		ask = used = 0;

		error = xfs_refcountbt_calc_reserves(mp, tp, agno, &ask, &used);
		if (error)
			goto out;

		error = xfs_finobt_calc_reserves(mp, tp, agno, &ask, &used);
		if (error)
			goto out;

		error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
				ask, used);
		if (error) {
			/*
			 * Because we didn't have per-AG reservations when the
			 * finobt feature was added we might not be able to
			 * reserve all needed blocks.  Warn and fall back to the
			 * old and potentially buggy code in that case, but
			 * ensure we do have the reservation for the refcountbt.
			 */
			ask = used = 0;

			mp->m_inotbt_nores = true;

			error = xfs_refcountbt_calc_reserves(mp, tp, agno, &ask,
					&used);
			if (error)
				goto out;

			error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
					ask, used);
			if (error)
				goto out;
		}
	}

	/* Create the RMAPBT metadata reservation */
	if (pag->pag_rmapbt_resv.ar_asked == 0) {
		ask = used = 0;

		error = xfs_rmapbt_calc_reserves(mp, tp, agno, &ask, &used);
		if (error)
			goto out;

		error = __xfs_ag_resv_init(pag, XFS_AG_RESV_RMAPBT, ask, used);
		if (error)
			goto out;
	}

#ifdef DEBUG
	/* need to read in the AGF for the ASSERT below to work */
	error = xfs_alloc_pagf_init(pag->pag_mount, tp, pag->pag_agno, 0);
	if (error)
		return error;

	ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved +
	       xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved <=
	       pag->pagf_freeblks + pag->pagf_flcount);
#endif
out:
	return error;
}
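
/*
 * Worked example of the extent accounting done below (illustrative numbers
 * only): allocating args->len = 5 blocks against a METADATA reservation with
 * ar_reserved = 3 takes 3 blocks from the reservation (on-disk superblock
 * update only, since those blocks were already removed from the in-core
 * fdblocks when the reservation was created) and the remaining 2 from the
 * general pool (both in-core and on-disk updates).  Freeing runs the same
 * split in reverse: blocks refill the reservation up to ar_asked before any
 * surplus is returned to the general free space pool.
 */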

/* Allocate a block from the reservation. */
void
xfs_ag_resv_alloc_extent(
	struct xfs_perag		*pag,
	enum xfs_ag_resv_type		type,
	struct xfs_alloc_arg		*args)
{
	struct xfs_ag_resv		*resv;
	xfs_extlen_t			len;
	uint				field;

	trace_xfs_ag_resv_alloc_extent(pag, type, args->len);

	switch (type) {
	case XFS_AG_RESV_AGFL:
		return;
	case XFS_AG_RESV_METADATA:
	case XFS_AG_RESV_RMAPBT:
		resv = xfs_perag_resv(pag, type);
		break;
	default:
		ASSERT(0);
		/* fall through */
	case XFS_AG_RESV_NONE:
		field = args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
				       XFS_TRANS_SB_FDBLOCKS;
		xfs_trans_mod_sb(args->tp, field, -(int64_t)args->len);
		return;
	}

	len = min_t(xfs_extlen_t, args->len, resv->ar_reserved);
	resv->ar_reserved -= len;
	if (type == XFS_AG_RESV_RMAPBT)
		return;
	/* Allocations of reserved blocks only need on-disk sb updates... */
	xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, -(int64_t)len);
	/* ...but non-reserved blocks need in-core and on-disk updates. */
	if (args->len > len)
		xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_FDBLOCKS,
				-((int64_t)args->len - len));
}

/* Free a block to the reservation. */
void
xfs_ag_resv_free_extent(
	struct xfs_perag		*pag,
	enum xfs_ag_resv_type		type,
	struct xfs_trans		*tp,
	xfs_extlen_t			len)
{
	xfs_extlen_t			leftover;
	struct xfs_ag_resv		*resv;

	trace_xfs_ag_resv_free_extent(pag, type, len);

	switch (type) {
	case XFS_AG_RESV_AGFL:
		return;
	case XFS_AG_RESV_METADATA:
	case XFS_AG_RESV_RMAPBT:
		resv = xfs_perag_resv(pag, type);
		break;
	default:
		ASSERT(0);
		/* fall through */
	case XFS_AG_RESV_NONE:
		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len);
		return;
	}

	leftover = min_t(xfs_extlen_t, len, resv->ar_asked - resv->ar_reserved);
	resv->ar_reserved += leftover;
	if (type == XFS_AG_RESV_RMAPBT)
		return;
	/* Freeing into the reserved pool only requires on-disk update... */
	xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, len);
	/* ...but freeing beyond that requires in-core and on-disk update. */
	if (len > leftover)
		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len - leftover);
}