1 // SPDX-License-Identifier: GPL-2.0+ 2 /* 3 * Copyright (C) 2016 Oracle. All Rights Reserved. 4 * Author: Darrick J. Wong <darrick.wong@oracle.com> 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_log_format.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_mount.h" 13 #include "xfs_alloc.h" 14 #include "xfs_errortag.h" 15 #include "xfs_error.h" 16 #include "xfs_trace.h" 17 #include "xfs_trans.h" 18 #include "xfs_rmap_btree.h" 19 #include "xfs_btree.h" 20 #include "xfs_refcount_btree.h" 21 #include "xfs_ialloc_btree.h" 22 23 /* 24 * Per-AG Block Reservations 25 * 26 * For some kinds of allocation group metadata structures, it is advantageous 27 * to reserve a small number of blocks in each AG so that future expansions of 28 * that data structure do not encounter ENOSPC because errors during a btree 29 * split cause the filesystem to go offline. 30 * 31 * Prior to the introduction of reflink, this wasn't an issue because the free 32 * space btrees maintain a reserve of space (the AGFL) to handle any expansion 33 * that may be necessary; and allocations of other metadata (inodes, BMBT, 34 * dir/attr) aren't restricted to a single AG. However, with reflink it is 35 * possible to allocate all the space in an AG, have subsequent reflink/CoW 36 * activity expand the refcount btree, and discover that there's no space left 37 * to handle that expansion. Since we can calculate the maximum size of the 38 * refcount btree, we can reserve space for it and avoid ENOSPC. 39 * 40 * Handling per-AG reservations consists of three changes to the allocator's 41 * behavior: First, because these reservations are always needed, we decrease 42 * the ag_max_usable counter to reflect the size of the AG after the reserved 43 * blocks are taken. Second, the reservations must be reflected in the 44 * fdblocks count to maintain proper accounting. Third, each AG must maintain 45 * its own reserved block counter so that we can calculate the amount of space 46 * that must remain free to maintain the reservations. Fourth, the "remaining 47 * reserved blocks" count must be used when calculating the length of the 48 * longest free extent in an AG and to clamp maxlen in the per-AG allocation 49 * functions. In other words, we maintain a virtual allocation via in-core 50 * accounting tricks so that we don't have to clean up after a crash. :) 51 * 52 * Reserved blocks can be managed by passing one of the enum xfs_ag_resv_type 53 * values via struct xfs_alloc_arg or directly to the xfs_free_extent 54 * function. It might seem a little funny to maintain a reservoir of blocks 55 * to feed another reservoir, but the AGFL only holds enough blocks to get 56 * through the next transaction. The per-AG reservation is to ensure (we 57 * hope) that each AG never runs out of blocks. Each data structure wanting 58 * to use the reservation system should update ask/used in xfs_ag_resv_init. 59 */ 60 61 /* 62 * Are we critically low on blocks? For now we'll define that as the number 63 * of blocks we can get our hands on being less than 10% of what we reserved 64 * or less than some arbitrary number (maximum btree height). 65 */ 66 bool 67 xfs_ag_resv_critical( 68 struct xfs_perag *pag, 69 enum xfs_ag_resv_type type) 70 { 71 xfs_extlen_t avail; 72 xfs_extlen_t orig; 73 74 switch (type) { 75 case XFS_AG_RESV_METADATA: 76 avail = pag->pagf_freeblks - pag->pag_rmapbt_resv.ar_reserved; 77 orig = pag->pag_meta_resv.ar_asked; 78 break; 79 case XFS_AG_RESV_RMAPBT: 80 avail = pag->pagf_freeblks + pag->pagf_flcount - 81 pag->pag_meta_resv.ar_reserved; 82 orig = pag->pag_rmapbt_resv.ar_asked; 83 break; 84 default: 85 ASSERT(0); 86 return false; 87 } 88 89 trace_xfs_ag_resv_critical(pag, type, avail); 90 91 /* Critically low if less than 10% or max btree height remains. */ 92 return XFS_TEST_ERROR(avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS, 93 pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL); 94 } 95 96 /* 97 * How many blocks are reserved but not used, and therefore must not be 98 * allocated away? 99 */ 100 xfs_extlen_t 101 xfs_ag_resv_needed( 102 struct xfs_perag *pag, 103 enum xfs_ag_resv_type type) 104 { 105 xfs_extlen_t len; 106 107 len = pag->pag_meta_resv.ar_reserved + pag->pag_rmapbt_resv.ar_reserved; 108 switch (type) { 109 case XFS_AG_RESV_METADATA: 110 case XFS_AG_RESV_RMAPBT: 111 len -= xfs_perag_resv(pag, type)->ar_reserved; 112 break; 113 case XFS_AG_RESV_NONE: 114 /* empty */ 115 break; 116 default: 117 ASSERT(0); 118 } 119 120 trace_xfs_ag_resv_needed(pag, type, len); 121 122 return len; 123 } 124 125 /* Clean out a reservation */ 126 static int 127 __xfs_ag_resv_free( 128 struct xfs_perag *pag, 129 enum xfs_ag_resv_type type) 130 { 131 struct xfs_ag_resv *resv; 132 xfs_extlen_t oldresv; 133 int error; 134 135 trace_xfs_ag_resv_free(pag, type, 0); 136 137 resv = xfs_perag_resv(pag, type); 138 if (pag->pag_agno == 0) 139 pag->pag_mount->m_ag_max_usable += resv->ar_asked; 140 /* 141 * RMAPBT blocks come from the AGFL and AGFL blocks are always 142 * considered "free", so whatever was reserved at mount time must be 143 * given back at umount. 144 */ 145 if (type == XFS_AG_RESV_RMAPBT) 146 oldresv = resv->ar_orig_reserved; 147 else 148 oldresv = resv->ar_reserved; 149 error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true); 150 resv->ar_reserved = 0; 151 resv->ar_asked = 0; 152 resv->ar_orig_reserved = 0; 153 154 if (error) 155 trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno, 156 error, _RET_IP_); 157 return error; 158 } 159 160 /* Free a per-AG reservation. */ 161 int 162 xfs_ag_resv_free( 163 struct xfs_perag *pag) 164 { 165 int error; 166 int err2; 167 168 error = __xfs_ag_resv_free(pag, XFS_AG_RESV_RMAPBT); 169 err2 = __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA); 170 if (err2 && !error) 171 error = err2; 172 return error; 173 } 174 175 static int 176 __xfs_ag_resv_init( 177 struct xfs_perag *pag, 178 enum xfs_ag_resv_type type, 179 xfs_extlen_t ask, 180 xfs_extlen_t used) 181 { 182 struct xfs_mount *mp = pag->pag_mount; 183 struct xfs_ag_resv *resv; 184 int error; 185 xfs_extlen_t hidden_space; 186 187 if (used > ask) 188 ask = used; 189 190 switch (type) { 191 case XFS_AG_RESV_RMAPBT: 192 /* 193 * Space taken by the rmapbt is not subtracted from fdblocks 194 * because the rmapbt lives in the free space. Here we must 195 * subtract the entire reservation from fdblocks so that we 196 * always have blocks available for rmapbt expansion. 197 */ 198 hidden_space = ask; 199 break; 200 case XFS_AG_RESV_METADATA: 201 /* 202 * Space taken by all other metadata btrees are accounted 203 * on-disk as used space. We therefore only hide the space 204 * that is reserved but not used by the trees. 205 */ 206 hidden_space = ask - used; 207 break; 208 default: 209 ASSERT(0); 210 return -EINVAL; 211 } 212 error = xfs_mod_fdblocks(mp, -(int64_t)hidden_space, true); 213 if (error) { 214 trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno, 215 error, _RET_IP_); 216 xfs_warn(mp, 217 "Per-AG reservation for AG %u failed. Filesystem may run out of space.", 218 pag->pag_agno); 219 return error; 220 } 221 222 /* 223 * Reduce the maximum per-AG allocation length by however much we're 224 * trying to reserve for an AG. Since this is a filesystem-wide 225 * counter, we only make the adjustment for AG 0. This assumes that 226 * there aren't any AGs hungrier for per-AG reservation than AG 0. 227 */ 228 if (pag->pag_agno == 0) 229 mp->m_ag_max_usable -= ask; 230 231 resv = xfs_perag_resv(pag, type); 232 resv->ar_asked = ask; 233 resv->ar_orig_reserved = hidden_space; 234 resv->ar_reserved = ask - used; 235 236 trace_xfs_ag_resv_init(pag, type, ask); 237 return 0; 238 } 239 240 /* Create a per-AG block reservation. */ 241 int 242 xfs_ag_resv_init( 243 struct xfs_perag *pag, 244 struct xfs_trans *tp) 245 { 246 struct xfs_mount *mp = pag->pag_mount; 247 xfs_agnumber_t agno = pag->pag_agno; 248 xfs_extlen_t ask; 249 xfs_extlen_t used; 250 int error = 0; 251 252 /* Create the metadata reservation. */ 253 if (pag->pag_meta_resv.ar_asked == 0) { 254 ask = used = 0; 255 256 error = xfs_refcountbt_calc_reserves(mp, tp, agno, &ask, &used); 257 if (error) 258 goto out; 259 260 error = xfs_finobt_calc_reserves(mp, tp, agno, &ask, &used); 261 if (error) 262 goto out; 263 264 error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, 265 ask, used); 266 if (error) { 267 /* 268 * Because we didn't have per-AG reservations when the 269 * finobt feature was added we might not be able to 270 * reserve all needed blocks. Warn and fall back to the 271 * old and potentially buggy code in that case, but 272 * ensure we do have the reservation for the refcountbt. 273 */ 274 ask = used = 0; 275 276 mp->m_finobt_nores = true; 277 278 error = xfs_refcountbt_calc_reserves(mp, tp, agno, &ask, 279 &used); 280 if (error) 281 goto out; 282 283 error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, 284 ask, used); 285 if (error) 286 goto out; 287 } 288 } 289 290 /* Create the RMAPBT metadata reservation */ 291 if (pag->pag_rmapbt_resv.ar_asked == 0) { 292 ask = used = 0; 293 294 error = xfs_rmapbt_calc_reserves(mp, tp, agno, &ask, &used); 295 if (error) 296 goto out; 297 298 error = __xfs_ag_resv_init(pag, XFS_AG_RESV_RMAPBT, ask, used); 299 if (error) 300 goto out; 301 } 302 303 #ifdef DEBUG 304 /* need to read in the AGF for the ASSERT below to work */ 305 error = xfs_alloc_pagf_init(pag->pag_mount, tp, pag->pag_agno, 0); 306 if (error) 307 return error; 308 309 ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved + 310 xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved <= 311 pag->pagf_freeblks + pag->pagf_flcount); 312 #endif 313 out: 314 return error; 315 } 316 317 /* Allocate a block from the reservation. */ 318 void 319 xfs_ag_resv_alloc_extent( 320 struct xfs_perag *pag, 321 enum xfs_ag_resv_type type, 322 struct xfs_alloc_arg *args) 323 { 324 struct xfs_ag_resv *resv; 325 xfs_extlen_t len; 326 uint field; 327 328 trace_xfs_ag_resv_alloc_extent(pag, type, args->len); 329 330 switch (type) { 331 case XFS_AG_RESV_AGFL: 332 return; 333 case XFS_AG_RESV_METADATA: 334 case XFS_AG_RESV_RMAPBT: 335 resv = xfs_perag_resv(pag, type); 336 break; 337 default: 338 ASSERT(0); 339 /* fall through */ 340 case XFS_AG_RESV_NONE: 341 field = args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS : 342 XFS_TRANS_SB_FDBLOCKS; 343 xfs_trans_mod_sb(args->tp, field, -(int64_t)args->len); 344 return; 345 } 346 347 len = min_t(xfs_extlen_t, args->len, resv->ar_reserved); 348 resv->ar_reserved -= len; 349 if (type == XFS_AG_RESV_RMAPBT) 350 return; 351 /* Allocations of reserved blocks only need on-disk sb updates... */ 352 xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, -(int64_t)len); 353 /* ...but non-reserved blocks need in-core and on-disk updates. */ 354 if (args->len > len) 355 xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_FDBLOCKS, 356 -((int64_t)args->len - len)); 357 } 358 359 /* Free a block to the reservation. */ 360 void 361 xfs_ag_resv_free_extent( 362 struct xfs_perag *pag, 363 enum xfs_ag_resv_type type, 364 struct xfs_trans *tp, 365 xfs_extlen_t len) 366 { 367 xfs_extlen_t leftover; 368 struct xfs_ag_resv *resv; 369 370 trace_xfs_ag_resv_free_extent(pag, type, len); 371 372 switch (type) { 373 case XFS_AG_RESV_AGFL: 374 return; 375 case XFS_AG_RESV_METADATA: 376 case XFS_AG_RESV_RMAPBT: 377 resv = xfs_perag_resv(pag, type); 378 break; 379 default: 380 ASSERT(0); 381 /* fall through */ 382 case XFS_AG_RESV_NONE: 383 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len); 384 return; 385 } 386 387 leftover = min_t(xfs_extlen_t, len, resv->ar_asked - resv->ar_reserved); 388 resv->ar_reserved += leftover; 389 if (type == XFS_AG_RESV_RMAPBT) 390 return; 391 /* Freeing into the reserved pool only requires on-disk update... */ 392 xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, len); 393 /* ...but freeing beyond that requires in-core and on-disk update. */ 394 if (len > leftover) 395 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len - leftover); 396 } 397