// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_btree.h"
#include "xfs_rmap.h"
#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_bmap.h"

struct kmem_cache	*xfs_extfree_item_cache;

struct workqueue_struct *xfs_alloc_wq;

#define XFS_ABSDIFF(a,b)	(((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))

#define	XFSA_FIXUP_BNO_OK	1
#define	XFSA_FIXUP_CNT_OK	2

/*
 * Size of the AGFL.  For CRC-enabled filesystems we steal a couple of slots in
 * the beginning of the block for a proper header with the location information
 * and CRC.
 */
unsigned int
xfs_agfl_size(
	struct xfs_mount	*mp)
{
	unsigned int		size = mp->m_sb.sb_sectsize;

	if (xfs_has_crc(mp))
		size -= sizeof(struct xfs_agfl);

	return size / sizeof(xfs_agblock_t);
}

unsigned int
xfs_refc_block(
	struct xfs_mount	*mp)
{
	if (xfs_has_rmapbt(mp))
		return XFS_RMAP_BLOCK(mp) + 1;
	if (xfs_has_finobt(mp))
		return XFS_FIBT_BLOCK(mp) + 1;
	return XFS_IBT_BLOCK(mp) + 1;
}

xfs_extlen_t
xfs_prealloc_blocks(
	struct xfs_mount	*mp)
{
	if (xfs_has_reflink(mp))
		return xfs_refc_block(mp) + 1;
	if (xfs_has_rmapbt(mp))
		return XFS_RMAP_BLOCK(mp) + 1;
	if (xfs_has_finobt(mp))
		return XFS_FIBT_BLOCK(mp) + 1;
	return XFS_IBT_BLOCK(mp) + 1;
}

/*
 * The number of blocks per AG that we withhold from xfs_mod_fdblocks to
 * guarantee that we can refill the AGFL prior to allocating space in a nearly
 * full AG.  Although the space described by the free space btrees, the
 * blocks used by the freesp btrees themselves, and the blocks owned by the
 * AGFL are counted in the ondisk fdblocks, it's a mistake to let the ondisk
 * free space in the AG drop so low that the free space btrees cannot refill an
 * empty AGFL up to the minimum level.  Rather than grind through empty AGs
 * until the fs goes down, we subtract this many AG blocks from the incore
 * fdblocks to ensure user allocation does not overcommit the space the
 * filesystem needs for the AGFLs.  The rmap btree uses a per-AG reservation to
 * withhold space from xfs_mod_fdblocks, so we do not account for that here.
 */
#define XFS_ALLOCBT_AGFL_RESERVE	4
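/*
 * Rough worked example (illustrative only, assuming the usual 36 byte V5 AGFL
 * header): with 512 byte sectors on a CRC-enabled filesystem, xfs_agfl_size()
 * returns (512 - 36) / sizeof(xfs_agblock_t) = 119 free list slots, while
 * XFS_ALLOCBT_AGFL_RESERVE withholds 4 blocks per AG from the incore free
 * block count so that an empty AGFL can always be refilled.
 */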
/*
 * Compute the number of blocks that we set aside to guarantee the ability to
 * refill the AGFL and handle a full bmap btree split.
 *
 * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of
 * AGF buffer (PV 947395), we place constraints on the relationship among
 * actual allocations for data blocks, freelist blocks, and potential file data
 * bmap btree blocks.  However, these restrictions may result in no actual space
 * allocated for a delayed extent, for example, a data block in a certain AG is
 * allocated but there is no additional block for the additional bmap btree
 * block due to a split of the bmap btree of the file.  The result of this may
 * lead to an infinite loop when the file gets flushed to disk and all delayed
 * extents need to be actually allocated.  To get around this, we explicitly set
 * aside a few blocks which will not be reserved in delayed allocation.
 *
 * For each AG, we need to reserve enough blocks to replenish a totally empty
 * AGFL and 4 more to handle a potential split of the file's bmap btree.
 */
unsigned int
xfs_alloc_set_aside(
	struct xfs_mount	*mp)
{
	return mp->m_sb.sb_agcount * (XFS_ALLOCBT_AGFL_RESERVE + 4);
}

/*
 * When deciding how much space to allocate out of an AG, we limit the
 * allocation maximum size to the size of the AG.  However, we cannot use all
 * the blocks in the AG - some are permanently used by metadata.  These
 * blocks are generally:
 *	- the AG superblock, AGF, AGI and AGFL
 *	- the AGF (bno and cnt) and AGI btree root blocks, and optionally
 *	  the AGI free inode and rmap btree root blocks.
 *	- blocks on the AGFL according to xfs_alloc_set_aside() limits
 *	- the rmapbt root block
 *
 * The AG headers are sector sized, so the amount of space they take up is
 * dependent on filesystem geometry.  The others are all single blocks.
 */
unsigned int
xfs_alloc_ag_max_usable(
	struct xfs_mount	*mp)
{
	unsigned int		blocks;

	blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */
	blocks += XFS_ALLOCBT_AGFL_RESERVE;
	blocks += 3;			/* AGF, AGI btree root blocks */
	if (xfs_has_finobt(mp))
		blocks++;		/* finobt root block */
	if (xfs_has_rmapbt(mp))
		blocks++;		/* rmap root block */
	if (xfs_has_reflink(mp))
		blocks++;		/* refcount root block */

	return mp->m_sb.sb_agblocks - blocks;
}
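/*
 * Worked example (a sketch, not normative): a filesystem with 4 AGs sets
 * aside 4 * (XFS_ALLOCBT_AGFL_RESERVE + 4) = 32 blocks in total.  For
 * xfs_alloc_ag_max_usable() on a 4k block / 512 byte sector filesystem with
 * finobt, rmapbt and reflink enabled, the four sector-sized headers round up
 * to one block, so 1 + 4 + 3 + 1 + 1 + 1 = 11 blocks are subtracted from
 * sb_agblocks.
 */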
/*
 * Lookup the record equal to [bno, len] in the btree given by cur.
 */
STATIC int				/* error */
xfs_alloc_lookup_eq(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		bno,	/* starting block of extent */
	xfs_extlen_t		len,	/* length of extent */
	int			*stat)	/* success/failure */
{
	int			error;

	cur->bc_rec.a.ar_startblock = bno;
	cur->bc_rec.a.ar_blockcount = len;
	error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
	cur->bc_ag.abt.active = (*stat == 1);
	return error;
}

/*
 * Lookup the first record greater than or equal to [bno, len]
 * in the btree given by cur.
 */
int					/* error */
xfs_alloc_lookup_ge(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		bno,	/* starting block of extent */
	xfs_extlen_t		len,	/* length of extent */
	int			*stat)	/* success/failure */
{
	int			error;

	cur->bc_rec.a.ar_startblock = bno;
	cur->bc_rec.a.ar_blockcount = len;
	error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
	cur->bc_ag.abt.active = (*stat == 1);
	return error;
}

/*
 * Lookup the first record less than or equal to [bno, len]
 * in the btree given by cur.
 */
int					/* error */
xfs_alloc_lookup_le(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		bno,	/* starting block of extent */
	xfs_extlen_t		len,	/* length of extent */
	int			*stat)	/* success/failure */
{
	int			error;

	cur->bc_rec.a.ar_startblock = bno;
	cur->bc_rec.a.ar_blockcount = len;
	error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
	cur->bc_ag.abt.active = (*stat == 1);
	return error;
}

static inline bool
xfs_alloc_cur_active(
	struct xfs_btree_cur	*cur)
{
	return cur && cur->bc_ag.abt.active;
}

/*
 * Update the record referred to by cur to the value given
 * by [bno, len].
 * This either works (return 0) or gets an EFSCORRUPTED error.
 */
STATIC int				/* error */
xfs_alloc_update(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		bno,	/* starting block of extent */
	xfs_extlen_t		len)	/* length of extent */
{
	union xfs_btree_rec	rec;

	rec.alloc.ar_startblock = cpu_to_be32(bno);
	rec.alloc.ar_blockcount = cpu_to_be32(len);
	return xfs_btree_update(cur, &rec);
}

/* Convert the ondisk btree record to its incore representation. */
void
xfs_alloc_btrec_to_irec(
	const union xfs_btree_rec	*rec,
	struct xfs_alloc_rec_incore	*irec)
{
	irec->ar_startblock = be32_to_cpu(rec->alloc.ar_startblock);
	irec->ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
}

/* Simple checks for free space records. */
xfs_failaddr_t
xfs_alloc_check_irec(
	struct xfs_btree_cur		*cur,
	const struct xfs_alloc_rec_incore *irec)
{
	struct xfs_perag		*pag = cur->bc_ag.pag;

	if (irec->ar_blockcount == 0)
		return __this_address;

	/* check for valid extent range, including overflow */
	if (!xfs_verify_agbext(pag, irec->ar_startblock, irec->ar_blockcount))
		return __this_address;

	return NULL;
}

static inline int
xfs_alloc_complain_bad_rec(
	struct xfs_btree_cur		*cur,
	xfs_failaddr_t			fa,
	const struct xfs_alloc_rec_incore *irec)
{
	struct xfs_mount		*mp = cur->bc_mp;

	xfs_warn(mp,
		"%s Freespace BTree record corruption in AG %d detected at %pS!",
		cur->bc_btnum == XFS_BTNUM_BNO ?
"Block" : "Size", 275 cur->bc_ag.pag->pag_agno, fa); 276 xfs_warn(mp, 277 "start block 0x%x block count 0x%x", irec->ar_startblock, 278 irec->ar_blockcount); 279 return -EFSCORRUPTED; 280 } 281 282 /* 283 * Get the data from the pointed-to record. 284 */ 285 int /* error */ xfs_alloc_get_rec(struct xfs_btree_cur * cur,xfs_agblock_t * bno,xfs_extlen_t * len,int * stat)286 xfs_alloc_get_rec( 287 struct xfs_btree_cur *cur, /* btree cursor */ 288 xfs_agblock_t *bno, /* output: starting block of extent */ 289 xfs_extlen_t *len, /* output: length of extent */ 290 int *stat) /* output: success/failure */ 291 { 292 struct xfs_alloc_rec_incore irec; 293 union xfs_btree_rec *rec; 294 xfs_failaddr_t fa; 295 int error; 296 297 error = xfs_btree_get_rec(cur, &rec, stat); 298 if (error || !(*stat)) 299 return error; 300 301 xfs_alloc_btrec_to_irec(rec, &irec); 302 fa = xfs_alloc_check_irec(cur, &irec); 303 if (fa) 304 return xfs_alloc_complain_bad_rec(cur, fa, &irec); 305 306 *bno = irec.ar_startblock; 307 *len = irec.ar_blockcount; 308 return 0; 309 } 310 311 /* 312 * Compute aligned version of the found extent. 313 * Takes alignment and min length into account. 314 */ 315 STATIC bool xfs_alloc_compute_aligned(xfs_alloc_arg_t * args,xfs_agblock_t foundbno,xfs_extlen_t foundlen,xfs_agblock_t * resbno,xfs_extlen_t * reslen,unsigned * busy_gen)316 xfs_alloc_compute_aligned( 317 xfs_alloc_arg_t *args, /* allocation argument structure */ 318 xfs_agblock_t foundbno, /* starting block in found extent */ 319 xfs_extlen_t foundlen, /* length in found extent */ 320 xfs_agblock_t *resbno, /* result block number */ 321 xfs_extlen_t *reslen, /* result length */ 322 unsigned *busy_gen) 323 { 324 xfs_agblock_t bno = foundbno; 325 xfs_extlen_t len = foundlen; 326 xfs_extlen_t diff; 327 bool busy; 328 329 /* Trim busy sections out of found extent */ 330 busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen); 331 332 /* 333 * If we have a largish extent that happens to start before min_agbno, 334 * see if we can shift it into range... 335 */ 336 if (bno < args->min_agbno && bno + len > args->min_agbno) { 337 diff = args->min_agbno - bno; 338 if (len > diff) { 339 bno += diff; 340 len -= diff; 341 } 342 } 343 344 if (args->alignment > 1 && len >= args->minlen) { 345 xfs_agblock_t aligned_bno = roundup(bno, args->alignment); 346 347 diff = aligned_bno - bno; 348 349 *resbno = aligned_bno; 350 *reslen = diff >= len ? 0 : len - diff; 351 } else { 352 *resbno = bno; 353 *reslen = len; 354 } 355 356 return busy; 357 } 358 359 /* 360 * Compute best start block and diff for "near" allocations. 361 * freelen >= wantlen already checked by caller. 362 */ 363 STATIC xfs_extlen_t /* difference value (absolute) */ xfs_alloc_compute_diff(xfs_agblock_t wantbno,xfs_extlen_t wantlen,xfs_extlen_t alignment,int datatype,xfs_agblock_t freebno,xfs_extlen_t freelen,xfs_agblock_t * newbnop)364 xfs_alloc_compute_diff( 365 xfs_agblock_t wantbno, /* target starting block */ 366 xfs_extlen_t wantlen, /* target length */ 367 xfs_extlen_t alignment, /* target alignment */ 368 int datatype, /* are we allocating data? 
/*
 * Compute best start block and diff for "near" allocations.
 * freelen >= wantlen already checked by caller.
 */
STATIC xfs_extlen_t			/* difference value (absolute) */
xfs_alloc_compute_diff(
	xfs_agblock_t	wantbno,	/* target starting block */
	xfs_extlen_t	wantlen,	/* target length */
	xfs_extlen_t	alignment,	/* target alignment */
	int		datatype,	/* are we allocating data? */
	xfs_agblock_t	freebno,	/* freespace's starting block */
	xfs_extlen_t	freelen,	/* freespace's length */
	xfs_agblock_t	*newbnop)	/* result: best start block from free */
{
	xfs_agblock_t	freeend;	/* end of freespace extent */
	xfs_agblock_t	newbno1;	/* return block number */
	xfs_agblock_t	newbno2;	/* other new block number */
	xfs_extlen_t	newlen1 = 0;	/* length with newbno1 */
	xfs_extlen_t	newlen2 = 0;	/* length with newbno2 */
	xfs_agblock_t	wantend;	/* end of target extent */
	bool		userdata = datatype & XFS_ALLOC_USERDATA;

	ASSERT(freelen >= wantlen);
	freeend = freebno + freelen;
	wantend = wantbno + wantlen;
	/*
	 * We want to allocate from the start of a free extent if it is past
	 * the desired block or if we are allocating user data and the free
	 * extent is before desired block.  The second case is there to allow
	 * for contiguous allocation from the remaining free space if the file
	 * grows in the short term.
	 */
	if (freebno >= wantbno || (userdata && freeend < wantend)) {
		if ((newbno1 = roundup(freebno, alignment)) >= freeend)
			newbno1 = NULLAGBLOCK;
	} else if (freeend >= wantend && alignment > 1) {
		newbno1 = roundup(wantbno, alignment);
		newbno2 = newbno1 - alignment;
		if (newbno1 >= freeend)
			newbno1 = NULLAGBLOCK;
		else
			newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
		if (newbno2 < freebno)
			newbno2 = NULLAGBLOCK;
		else
			newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
		if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
			if (newlen1 < newlen2 ||
			    (newlen1 == newlen2 &&
			     XFS_ABSDIFF(newbno1, wantbno) >
			     XFS_ABSDIFF(newbno2, wantbno)))
				newbno1 = newbno2;
		} else if (newbno2 != NULLAGBLOCK)
			newbno1 = newbno2;
	} else if (freeend >= wantend) {
		newbno1 = wantbno;
	} else if (alignment > 1) {
		newbno1 = roundup(freeend - wantlen, alignment);
		if (newbno1 > freeend - wantlen &&
		    newbno1 - alignment >= freebno)
			newbno1 -= alignment;
		else if (newbno1 >= freeend)
			newbno1 = NULLAGBLOCK;
	} else
		newbno1 = freeend - wantlen;
	*newbnop = newbno1;
	return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
}

/*
 * Fix up the length, based on mod and prod.
 * len should be k * prod + mod for some k.
 * If len is too small it is returned unchanged.
 * If len hits maxlen it is left alone.
 */
STATIC void
xfs_alloc_fix_len(
	xfs_alloc_arg_t	*args)		/* allocation argument structure */
{
	xfs_extlen_t	k;
	xfs_extlen_t	rlen;

	ASSERT(args->mod < args->prod);
	rlen = args->len;
	ASSERT(rlen >= args->minlen);
	ASSERT(rlen <= args->maxlen);
	if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
	    (args->mod == 0 && rlen < args->prod))
		return;
	k = rlen % args->prod;
	if (k == args->mod)
		return;
	if (k > args->mod)
		rlen = rlen - (k - args->mod);
	else
		rlen = rlen - args->prod + (args->mod - k);
	/* casts to (int) catch length underflows */
	if ((int)rlen < (int)args->minlen)
		return;
	ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
	ASSERT(rlen % args->prod == args->mod);
	ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >=
		rlen + args->minleft);
	args->len = rlen;
}
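/*
 * Worked example of the mod/prod adjustment above (illustrative): with
 * prod = 4, mod = 1 and len = 11, k = 11 % 4 = 3 > mod, so the length is
 * trimmed to 11 - (3 - 1) = 9, which satisfies 9 = 2 * 4 + 1.
 */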
/*
 * Update the two btrees, logically removing from freespace the extent
 * starting at rbno, rlen blocks.  The extent is contained within the
 * actual (current) free extent fbno for flen blocks.
 * Flags are passed in indicating whether the cursors are set to the
 * relevant records.
 */
STATIC int				/* error code */
xfs_alloc_fixup_trees(
	struct xfs_btree_cur *cnt_cur,	/* cursor for by-size btree */
	struct xfs_btree_cur *bno_cur,	/* cursor for by-block btree */
	xfs_agblock_t	fbno,		/* starting block of free extent */
	xfs_extlen_t	flen,		/* length of free extent */
	xfs_agblock_t	rbno,		/* starting block of returned extent */
	xfs_extlen_t	rlen,		/* length of returned extent */
	int		flags)		/* flags, XFSA_FIXUP_... */
{
	int		error;		/* error code */
	int		i;		/* operation results */
	xfs_agblock_t	nfbno1;		/* first new free startblock */
	xfs_agblock_t	nfbno2;		/* second new free startblock */
	xfs_extlen_t	nflen1 = 0;	/* first new free length */
	xfs_extlen_t	nflen2 = 0;	/* second new free length */
	struct xfs_mount *mp;

	mp = cnt_cur->bc_mp;

	/*
	 * Look up the record in the by-size tree if necessary.
	 */
	if (flags & XFSA_FIXUP_CNT_OK) {
#ifdef DEBUG
		if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp,
				   i != 1 ||
				   nfbno1 != fbno ||
				   nflen1 != flen))
			return -EFSCORRUPTED;
#endif
	} else {
		if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1))
			return -EFSCORRUPTED;
	}
	/*
	 * Look up the record in the by-block tree if necessary.
	 */
	if (flags & XFSA_FIXUP_BNO_OK) {
#ifdef DEBUG
		if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp,
				   i != 1 ||
				   nfbno1 != fbno ||
				   nflen1 != flen))
			return -EFSCORRUPTED;
#endif
	} else {
		if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1))
			return -EFSCORRUPTED;
	}

#ifdef DEBUG
	if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
		struct xfs_btree_block	*bnoblock;
		struct xfs_btree_block	*cntblock;

		bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_levels[0].bp);
		cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_levels[0].bp);

		if (XFS_IS_CORRUPT(mp,
				   bnoblock->bb_numrecs !=
				   cntblock->bb_numrecs))
			return -EFSCORRUPTED;
	}
#endif

	/*
	 * Deal with all four cases: the allocated record is contained
	 * within the freespace record, so we can have new freespace
	 * at either (or both) end, or no freespace remaining.
	 */
	if (rbno == fbno && rlen == flen)
		nfbno1 = nfbno2 = NULLAGBLOCK;
	else if (rbno == fbno) {
		nfbno1 = rbno + rlen;
		nflen1 = flen - rlen;
		nfbno2 = NULLAGBLOCK;
	} else if (rbno + rlen == fbno + flen) {
		nfbno1 = fbno;
		nflen1 = flen - rlen;
		nfbno2 = NULLAGBLOCK;
	} else {
		nfbno1 = fbno;
		nflen1 = rbno - fbno;
		nfbno2 = rbno + rlen;
		nflen2 = (fbno + flen) - nfbno2;
	}
	/*
	 * Delete the entry from the by-size btree.
	 */
	if ((error = xfs_btree_delete(cnt_cur, &i)))
		return error;
	if (XFS_IS_CORRUPT(mp, i != 1))
		return -EFSCORRUPTED;
	/*
	 * Add new by-size btree entry(s).
	 */
	if (nfbno1 != NULLAGBLOCK) {
		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 0))
			return -EFSCORRUPTED;
		if ((error = xfs_btree_insert(cnt_cur, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1))
			return -EFSCORRUPTED;
	}
	if (nfbno2 != NULLAGBLOCK) {
		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 0))
			return -EFSCORRUPTED;
		if ((error = xfs_btree_insert(cnt_cur, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1))
			return -EFSCORRUPTED;
	}
	/*
	 * Fix up the by-block btree entry(s).
	 */
	if (nfbno1 == NULLAGBLOCK) {
		/*
		 * No remaining freespace, just delete the by-block tree entry.
		 */
		if ((error = xfs_btree_delete(bno_cur, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1))
			return -EFSCORRUPTED;
	} else {
		/*
		 * Update the by-block entry to start later|be shorter.
		 */
		if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
			return error;
	}
	if (nfbno2 != NULLAGBLOCK) {
		/*
		 * 2 resulting free entries, need to add one.
		 */
		if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 0))
			return -EFSCORRUPTED;
		if ((error = xfs_btree_insert(bno_cur, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1))
			return -EFSCORRUPTED;
	}
	return 0;
}

/*
 * We do not verify the AGFL contents against AGF-based index counters here,
 * even though we may have access to the perag that contains shadow copies.  We
 * don't know if the AGF based counters have been checked, and if they have
 * they still may be inconsistent because they haven't yet been reset on the
 * first allocation after the AGF has been read in.
 *
 * This means we can only check that all agfl entries contain valid or null
 * values because we can't reliably determine the active range to exclude
 * NULLAGBNO as a valid value.
 *
 * However, we can't even do that for v4 format filesystems because there are
 * old versions of mkfs out there that do not initialise the AGFL to known,
 * verifiable values.  Hence we can't tell the difference between an AGFL block
 * allocated by mkfs and a corrupted AGFL block here on v4 filesystems.
 *
 * As a result, we can only fully validate AGFL block numbers when we pull them
 * from the freelist in xfs_alloc_get_freelist().
 */
static xfs_failaddr_t
xfs_agfl_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount *mp = bp->b_mount;
	struct xfs_agfl	*agfl = XFS_BUF_TO_AGFL(bp);
	__be32		*agfl_bno = xfs_buf_to_agfl_bno(bp);
	int		i;

	if (!xfs_has_crc(mp))
		return NULL;

	if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
		return __this_address;
	if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
		return __this_address;
	/*
	 * during growfs operations, the perag is not fully initialised,
	 * so we can't use it for any useful checking. growfs ensures we can't
	 * use it by using uncached buffers that don't have the perag attached
	 * so we can detect and avoid this problem.
	 */
	if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
		return __this_address;

	for (i = 0; i < xfs_agfl_size(mp); i++) {
		if (be32_to_cpu(agfl_bno[i]) != NULLAGBLOCK &&
		    be32_to_cpu(agfl_bno[i]) >= mp->m_sb.sb_agblocks)
			return __this_address;
	}

	if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn)))
		return __this_address;
	return NULL;
}

static void
xfs_agfl_read_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount *mp = bp->b_mount;
	xfs_failaddr_t	fa;

	/*
	 * There is no verification of non-crc AGFLs because mkfs does not
	 * initialise the AGFL to zero or NULL. Hence the only valid part of
	 * the AGFL is what the AGF says is active. We can't get to the AGF,
	 * so we can't verify just those entries are valid.
	 */
	if (!xfs_has_crc(mp))
		return;

	if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
	else {
		fa = xfs_agfl_verify(bp);
		if (fa)
			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
	}
}

static void
xfs_agfl_write_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_buf_log_item	*bip = bp->b_log_item;
	xfs_failaddr_t		fa;

	/* no verification of non-crc AGFLs */
	if (!xfs_has_crc(mp))
		return;

	fa = xfs_agfl_verify(bp);
	if (fa) {
		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
		return;
	}

	if (bip)
		XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);

	xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
}

const struct xfs_buf_ops xfs_agfl_buf_ops = {
	.name = "xfs_agfl",
	.magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) },
	.verify_read = xfs_agfl_read_verify,
	.verify_write = xfs_agfl_write_verify,
	.verify_struct = xfs_agfl_verify,
};

/*
 * Read in the allocation group free block array.
 */
int
xfs_alloc_read_agfl(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	struct xfs_buf		**bpp)
{
	struct xfs_mount	*mp = pag->pag_mount;
	struct xfs_buf		*bp;
	int			error;

	error = xfs_trans_read_buf(
			mp, tp, mp->m_ddev_targp,
			XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)),
			XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
	if (error)
		return error;
	xfs_buf_set_ref(bp, XFS_AGFL_REF);
	*bpp = bp;
	return 0;
}

STATIC int
xfs_alloc_update_counters(
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp,
	long			len)
{
	struct xfs_agf		*agf = agbp->b_addr;

	agbp->b_pag->pagf_freeblks += len;
	be32_add_cpu(&agf->agf_freeblks, len);

	if (unlikely(be32_to_cpu(agf->agf_freeblks) >
		     be32_to_cpu(agf->agf_length))) {
		xfs_buf_mark_corrupt(agbp);
		return -EFSCORRUPTED;
	}

	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
	return 0;
}

/*
 * Block allocation algorithm and data structures.
 */
struct xfs_alloc_cur {
	struct xfs_btree_cur		*cnt;	/* btree cursors */
	struct xfs_btree_cur		*bnolt;
	struct xfs_btree_cur		*bnogt;
	xfs_extlen_t			cur_len;/* current search length */
	xfs_agblock_t			rec_bno;/* extent startblock */
	xfs_extlen_t			rec_len;/* extent length */
	xfs_agblock_t			bno;	/* alloc bno */
	xfs_extlen_t			len;	/* alloc len */
	xfs_extlen_t			diff;	/* diff from search bno */
	unsigned int			busy_gen;/* busy state */
	bool				busy;
};

/*
 * Set up cursors, etc. in the extent allocation cursor. This function can be
 * called multiple times to reset an initialized structure without having to
 * reallocate cursors.
 */
static int
xfs_alloc_cur_setup(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur)
{
	int			error;
	int			i;

	acur->cur_len = args->maxlen;
	acur->rec_bno = 0;
	acur->rec_len = 0;
	acur->bno = 0;
	acur->len = 0;
	acur->diff = -1;
	acur->busy = false;
	acur->busy_gen = 0;

	/*
	 * Perform an initial cntbt lookup to check for availability of maxlen
	 * extents. If this fails, we'll return -ENOSPC to signal the caller to
	 * attempt a small allocation.
	 */
	if (!acur->cnt)
		acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp,
					args->agbp, args->pag, XFS_BTNUM_CNT);
	error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i);
	if (error)
		return error;

	/*
	 * Allocate the bnobt left and right search cursors.
	 */
	if (!acur->bnolt)
		acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp,
					args->agbp, args->pag, XFS_BTNUM_BNO);
	if (!acur->bnogt)
		acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp,
					args->agbp, args->pag, XFS_BTNUM_BNO);
	return i == 1 ? 0 : -ENOSPC;
}

static void
xfs_alloc_cur_close(
	struct xfs_alloc_cur	*acur,
	bool			error)
{
	int			cur_error = XFS_BTREE_NOERROR;

	if (error)
		cur_error = XFS_BTREE_ERROR;

	if (acur->cnt)
		xfs_btree_del_cursor(acur->cnt, cur_error);
	if (acur->bnolt)
		xfs_btree_del_cursor(acur->bnolt, cur_error);
	if (acur->bnogt)
		xfs_btree_del_cursor(acur->bnogt, cur_error);
	acur->cnt = acur->bnolt = acur->bnogt = NULL;
}

/*
 * Check an extent for allocation and track the best available candidate in
 * the allocation structure. The cursor is deactivated if it has entered an
 * out of range state based on allocation arguments. Optionally return the
 * extent geometry and allocation status if requested by the caller.
 */
static int
xfs_alloc_cur_check(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur,
	struct xfs_btree_cur	*cur,
	int			*new)
{
	int			error, i;
	xfs_agblock_t		bno, bnoa, bnew;
	xfs_extlen_t		len, lena, diff = -1;
	bool			busy;
	unsigned		busy_gen = 0;
	bool			deactivate = false;
	bool			isbnobt = cur->bc_btnum == XFS_BTNUM_BNO;

	*new = 0;

	error = xfs_alloc_get_rec(cur, &bno, &len, &i);
	if (error)
		return error;
	if (XFS_IS_CORRUPT(args->mp, i != 1))
		return -EFSCORRUPTED;

	/*
	 * Check minlen and deactivate a cntbt cursor if out of acceptable size
	 * range (i.e., walking backwards looking for a minlen extent).
	 */
	if (len < args->minlen) {
		deactivate = !isbnobt;
		goto out;
	}

	busy = xfs_alloc_compute_aligned(args, bno, len, &bnoa, &lena,
					 &busy_gen);
	acur->busy |= busy;
	if (busy)
		acur->busy_gen = busy_gen;
	/* deactivate a bnobt cursor outside of locality range */
	if (bnoa < args->min_agbno || bnoa > args->max_agbno) {
		deactivate = isbnobt;
		goto out;
	}
	if (lena < args->minlen)
		goto out;

	args->len = XFS_EXTLEN_MIN(lena, args->maxlen);
	xfs_alloc_fix_len(args);
	ASSERT(args->len >= args->minlen);
	if (args->len < acur->len)
		goto out;

	/*
	 * We have an aligned record that satisfies minlen and beats or matches
	 * the candidate extent size. Compare locality for near allocation
	 * mode.
	 */
	diff = xfs_alloc_compute_diff(args->agbno, args->len,
				      args->alignment, args->datatype,
				      bnoa, lena, &bnew);
	if (bnew == NULLAGBLOCK)
		goto out;

	/*
	 * Deactivate a bnobt cursor with worse locality than the current best.
	 */
	if (diff > acur->diff) {
		deactivate = isbnobt;
		goto out;
	}

	ASSERT(args->len > acur->len ||
	       (args->len == acur->len && diff <= acur->diff));
	acur->rec_bno = bno;
	acur->rec_len = len;
	acur->bno = bnew;
	acur->len = args->len;
	acur->diff = diff;
	*new = 1;

	/*
	 * We're done if we found a perfect allocation. This only deactivates
	 * the current cursor, but this is just an optimization to terminate a
	 * cntbt search that otherwise runs to the edge of the tree.
	 */
	if (acur->diff == 0 && acur->len == args->maxlen)
		deactivate = true;
out:
	if (deactivate)
		cur->bc_ag.abt.active = false;
	trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff,
				  *new);
	return 0;
}

/*
 * Complete an allocation of a candidate extent. Remove the extent from both
 * trees and update the args structure.
 */
STATIC int
xfs_alloc_cur_finish(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur)
{
	struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
	int			error;

	ASSERT(acur->cnt && acur->bnolt);
	ASSERT(acur->bno >= acur->rec_bno);
	ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len);
	ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length));

	error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno,
				      acur->rec_len, acur->bno, acur->len, 0);
	if (error)
		return error;

	args->agbno = acur->bno;
	args->len = acur->len;
	args->wasfromfl = 0;

	trace_xfs_alloc_cur(args);
	return 0;
}
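/*
 * Illustrative note on the candidate tracking above (a sketch of the rule,
 * not an additional constraint): a record only replaces the current best if
 * its usable length is at least as large and its locality diff is no worse.
 * For example, with a hint of agbno 1000, a candidate placed at 1024
 * (diff 24) is later displaced by an equally long candidate at 1008
 * (diff 8), while a shorter extent at 1002 would be ignored.
 */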
/*
 * Locality allocation lookup algorithm. This expects a cntbt cursor and uses
 * bno optimized lookup to search for extents with ideal size and locality.
 */
STATIC int
xfs_alloc_cntbt_iter(
	struct xfs_alloc_arg		*args,
	struct xfs_alloc_cur		*acur)
{
	struct xfs_btree_cur	*cur = acur->cnt;
	xfs_agblock_t		bno;
	xfs_extlen_t		len, cur_len;
	int			error;
	int			i;

	if (!xfs_alloc_cur_active(cur))
		return 0;

	/* locality optimized lookup */
	cur_len = acur->cur_len;
	error = xfs_alloc_lookup_ge(cur, args->agbno, cur_len, &i);
	if (error)
		return error;
	if (i == 0)
		return 0;
	error = xfs_alloc_get_rec(cur, &bno, &len, &i);
	if (error)
		return error;

	/* check the current record and update search length from it */
	error = xfs_alloc_cur_check(args, acur, cur, &i);
	if (error)
		return error;
	ASSERT(len >= acur->cur_len);
	acur->cur_len = len;

	/*
	 * We looked up the first record >= [agbno, len] above. The agbno is a
	 * secondary key and so the current record may lie just before or after
	 * agbno. If it is past agbno, check the previous record too so long as
	 * the length matches as it may be closer. Don't check a smaller record
	 * because that could deactivate our cursor.
	 */
	if (bno > args->agbno) {
		error = xfs_btree_decrement(cur, 0, &i);
		if (!error && i) {
			error = xfs_alloc_get_rec(cur, &bno, &len, &i);
			if (!error && i && len == acur->cur_len)
				error = xfs_alloc_cur_check(args, acur, cur,
							    &i);
		}
		if (error)
			return error;
	}

	/*
	 * Increment the search key until we find at least one allocation
	 * candidate or if the extent we found was larger. Otherwise, double
	 * the search key to optimize the search. Efficiency is more important
	 * here than absolute best locality.
	 */
	cur_len <<= 1;
	if (!acur->len || acur->cur_len >= cur_len)
		acur->cur_len++;
	else
		acur->cur_len = cur_len;

	return error;
}

/*
 * Deal with the case where only small freespaces remain. Either return the
 * contents of the last freespace record, or allocate space from the freelist
 * if there is nothing in the tree.
 */
STATIC int			/* error */
xfs_alloc_ag_vextent_small(
	struct xfs_alloc_arg	*args,	/* allocation argument structure */
	struct xfs_btree_cur	*ccur,	/* optional by-size cursor */
	xfs_agblock_t		*fbnop,	/* result block number */
	xfs_extlen_t		*flenp,	/* result length */
	int			*stat)	/* status: 0-freelist, 1-normal/none */
{
	struct xfs_agf		*agf = args->agbp->b_addr;
	int			error = 0;
	xfs_agblock_t		fbno = NULLAGBLOCK;
	xfs_extlen_t		flen = 0;
	int			i = 0;

	/*
	 * If a cntbt cursor is provided, try to allocate the largest record in
	 * the tree. Try the AGFL if the cntbt is empty, otherwise fail the
	 * allocation. Make sure to respect minleft even when pulling from the
	 * freelist.
	 */
	if (ccur)
		error = xfs_btree_decrement(ccur, 0, &i);
	if (error)
		goto error;
	if (i) {
		error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
		if (error)
			goto error;
		if (XFS_IS_CORRUPT(args->mp, i != 1)) {
			error = -EFSCORRUPTED;
			goto error;
		}
		goto out;
	}

	if (args->minlen != 1 || args->alignment != 1 ||
	    args->resv == XFS_AG_RESV_AGFL ||
	    be32_to_cpu(agf->agf_flcount) <= args->minleft)
		goto out;

	error = xfs_alloc_get_freelist(args->pag, args->tp, args->agbp,
				       &fbno, 0);
	if (error)
		goto error;
	if (fbno == NULLAGBLOCK)
		goto out;

	xfs_extent_busy_reuse(args->mp, args->pag, fbno, 1,
			      (args->datatype & XFS_ALLOC_NOBUSY));

	if (args->datatype & XFS_ALLOC_USERDATA) {
		struct xfs_buf	*bp;

		error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp,
				XFS_AGB_TO_DADDR(args->mp, args->agno, fbno),
				args->mp->m_bsize, 0, &bp);
		if (error)
			goto error;
		xfs_trans_binval(args->tp, bp);
	}
	*fbnop = args->agbno = fbno;
	*flenp = args->len = 1;
	if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) {
		error = -EFSCORRUPTED;
		goto error;
	}
	args->wasfromfl = 1;
	trace_xfs_alloc_small_freelist(args);

	/*
	 * If we're feeding an AGFL block to something that doesn't live in the
	 * free space, we need to clear out the OWN_AG rmap.
	 */
	error = xfs_rmap_free(args->tp, args->agbp, args->pag, fbno, 1,
			      &XFS_RMAP_OINFO_AG);
	if (error)
		goto error;

	*stat = 0;
	return 0;

out:
	/*
	 * Can't do the allocation, give up.
	 */
	if (flen < args->minlen) {
		args->agbno = NULLAGBLOCK;
		trace_xfs_alloc_small_notenough(args);
		flen = 0;
	}
	*fbnop = fbno;
	*flenp = flen;
	*stat = 1;
	trace_xfs_alloc_small_done(args);
	return 0;

error:
	trace_xfs_alloc_small_error(args);
	return error;
}

/*
 * Allocate a variable extent at exactly agno/bno.
 * Extent's length (returned in *len) will be between minlen and maxlen,
 * and of the form k * prod + mod unless there's nothing that large.
 * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
 */
STATIC int			/* error */
xfs_alloc_ag_vextent_exact(
	xfs_alloc_arg_t	*args)	/* allocation argument structure */
{
	struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
	struct xfs_btree_cur *bno_cur;	/* by block-number btree cursor */
	struct xfs_btree_cur *cnt_cur;	/* by count btree cursor */
	int		error;
	xfs_agblock_t	fbno;	/* start block of found extent */
	xfs_extlen_t	flen;	/* length of found extent */
	xfs_agblock_t	tbno;	/* start block of busy extent */
	xfs_extlen_t	tlen;	/* length of busy extent */
	xfs_agblock_t	tend;	/* end block of busy extent */
	int		i;	/* success/failure of operation */
	unsigned	busy_gen;

	ASSERT(args->alignment == 1);

	/*
	 * Allocate/initialize a cursor for the by-number freespace btree.
	 */
	bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
					  args->pag, XFS_BTNUM_BNO);

	/*
	 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
	 * Look for the closest free block <= bno, it must contain bno
	 * if any free block does.
	 */
	error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
	if (error)
		goto error0;
	if (!i)
		goto not_found;

	/*
	 * Grab the freespace record.
	 */
	error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
	if (error)
		goto error0;
	if (XFS_IS_CORRUPT(args->mp, i != 1)) {
		error = -EFSCORRUPTED;
		goto error0;
	}
	ASSERT(fbno <= args->agbno);

	/*
	 * Check for overlapping busy extents.
	 */
	tbno = fbno;
	tlen = flen;
	xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen);

	/*
	 * Give up if the start of the extent is busy, or the freespace isn't
	 * long enough for the minimum request.
	 */
	if (tbno > args->agbno)
		goto not_found;
	if (tlen < args->minlen)
		goto not_found;
	tend = tbno + tlen;
	if (tend < args->agbno + args->minlen)
		goto not_found;

	/*
	 * End of extent will be smaller of the freespace end and the
	 * maximal requested end.
	 *
	 * Fix the length according to mod and prod if given.
	 */
	args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
						- args->agbno;
	xfs_alloc_fix_len(args);
	ASSERT(args->agbno + args->len <= tend);

	/*
	 * We are allocating agbno for args->len
	 * Allocate/initialize a cursor for the by-size btree.
	 */
	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
					  args->pag, XFS_BTNUM_CNT);
	ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length));
	error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
				      args->len, XFSA_FIXUP_BNO_OK);
	if (error) {
		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
		goto error0;
	}

	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);

	args->wasfromfl = 0;
	trace_xfs_alloc_exact_done(args);
	return 0;

not_found:
	/* Didn't find it, return null. */
	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
	args->agbno = NULLAGBLOCK;
	trace_xfs_alloc_exact_notfound(args);
	return 0;

error0:
	xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
	trace_xfs_alloc_exact_error(args);
	return error;
}
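/*
 * Illustrative scenario for the exact allocator above (a sketch, not an
 * additional rule): a request at agbno 100 with minlen 4 against a free
 * record [96, 32] normally succeeds, but if busy trimming shrinks the usable
 * range to [104, 24] the trimmed start no longer covers agbno 100 and the
 * allocation returns NULLAGBLOCK instead of moving the start block.
 */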
/*
 * Search a given number of btree records in a given direction. Check each
 * record against the good extent we've already found.
 */
STATIC int
xfs_alloc_walk_iter(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur,
	struct xfs_btree_cur	*cur,
	bool			increment,
	bool			find_one, /* quit on first candidate */
	int			count,    /* rec count (-1 for infinite) */
	int			*stat)
{
	int			error;
	int			i;

	*stat = 0;

	/*
	 * Search so long as the cursor is active or we find a better extent.
	 * The cursor is deactivated if it extends beyond the range of the
	 * current allocation candidate.
	 */
	while (xfs_alloc_cur_active(cur) && count) {
		error = xfs_alloc_cur_check(args, acur, cur, &i);
		if (error)
			return error;
		if (i == 1) {
			*stat = 1;
			if (find_one)
				break;
		}
		if (!xfs_alloc_cur_active(cur))
			break;

		if (increment)
			error = xfs_btree_increment(cur, 0, &i);
		else
			error = xfs_btree_decrement(cur, 0, &i);
		if (error)
			return error;
		if (i == 0)
			cur->bc_ag.abt.active = false;

		if (count > 0)
			count--;
	}

	return 0;
}

/*
 * Search the by-bno and by-size btrees in parallel in search of an extent with
 * ideal locality based on the NEAR mode ->agbno locality hint.
 */
STATIC int
xfs_alloc_ag_vextent_locality(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur,
	int			*stat)
{
	struct xfs_btree_cur	*fbcur = NULL;
	int			error;
	int			i;
	bool			fbinc;

	ASSERT(acur->len == 0);

	*stat = 0;

	error = xfs_alloc_lookup_ge(acur->cnt, args->agbno, acur->cur_len, &i);
	if (error)
		return error;
	error = xfs_alloc_lookup_le(acur->bnolt, args->agbno, 0, &i);
	if (error)
		return error;
	error = xfs_alloc_lookup_ge(acur->bnogt, args->agbno, 0, &i);
	if (error)
		return error;

	/*
	 * Search the bnobt and cntbt in parallel. Search the bnobt left and
	 * right and lookup the closest extent to the locality hint for each
	 * extent size key in the cntbt. The entire search terminates
	 * immediately on a bnobt hit because that means we've found best case
	 * locality. Otherwise the search continues until the cntbt cursor runs
	 * off the end of the tree. If no allocation candidate is found at this
	 * point, give up on locality, walk backwards from the end of the cntbt
	 * and take the first available extent.
	 *
	 * The parallel tree searches balance each other out to provide fairly
	 * consistent performance for various situations. The bnobt search can
	 * have pathological behavior in the worst case scenario of larger
	 * allocation requests and fragmented free space. On the other hand,
	 * the bnobt is able to satisfy most smaller allocation requests much
	 * more quickly than the cntbt. The cntbt search can sift through
	 * fragmented free space and sets of free extents for larger allocation
	 * requests more quickly than the bnobt. Since the locality hint is
	 * just a hint and we don't want to scan the entire bnobt for perfect
	 * locality, the cntbt search essentially bounds the bnobt search such
	 * that we can find good enough locality at reasonable performance in
	 * most cases.
	 */
	while (xfs_alloc_cur_active(acur->bnolt) ||
	       xfs_alloc_cur_active(acur->bnogt) ||
	       xfs_alloc_cur_active(acur->cnt)) {

		trace_xfs_alloc_cur_lookup(args);

		/*
		 * Search the bnobt left and right. In the case of a hit, finish
		 * the search in the opposite direction and we're done.
		 */
		error = xfs_alloc_walk_iter(args, acur, acur->bnolt, false,
					    true, 1, &i);
		if (error)
			return error;
		if (i == 1) {
			trace_xfs_alloc_cur_left(args);
			fbcur = acur->bnogt;
			fbinc = true;
			break;
		}
		error = xfs_alloc_walk_iter(args, acur, acur->bnogt, true, true,
					    1, &i);
		if (error)
			return error;
		if (i == 1) {
			trace_xfs_alloc_cur_right(args);
			fbcur = acur->bnolt;
			fbinc = false;
			break;
		}

		/*
		 * Check the extent with best locality based on the current
		 * extent size search key and keep track of the best candidate.
		 */
		error = xfs_alloc_cntbt_iter(args, acur);
		if (error)
			return error;
		if (!xfs_alloc_cur_active(acur->cnt)) {
			trace_xfs_alloc_cur_lookup_done(args);
			break;
		}
	}

	/*
	 * If we failed to find anything due to busy extents, return empty
	 * handed so the caller can flush and retry. If no busy extents were
	 * found, walk backwards from the end of the cntbt as a last resort.
	 */
	if (!xfs_alloc_cur_active(acur->cnt) && !acur->len && !acur->busy) {
		error = xfs_btree_decrement(acur->cnt, 0, &i);
		if (error)
			return error;
		if (i) {
			acur->cnt->bc_ag.abt.active = true;
			fbcur = acur->cnt;
			fbinc = false;
		}
	}

	/*
	 * Search in the opposite direction for a better entry in the case of
	 * a bnobt hit or walk backwards from the end of the cntbt.
	 */
	if (fbcur) {
		error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc, true, -1,
					    &i);
		if (error)
			return error;
	}

	if (acur->len)
		*stat = 1;

	return 0;
}

/* Check the last block of the cnt btree for allocations. */
static int
xfs_alloc_ag_vextent_lastblock(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur,
	xfs_agblock_t		*bno,
	xfs_extlen_t		*len,
	bool			*allocated)
{
	int			error;
	int			i;

#ifdef DEBUG
	/* Randomly don't execute the first algorithm. */
	if (get_random_u32_below(2))
		return 0;
#endif

	/*
	 * Start from the entry that lookup found, sequence through all larger
	 * free blocks. If we're actually pointing at a record smaller than
	 * maxlen, go to the start of this block, and skip all those smaller
	 * than minlen.
	 */
	if (*len || args->alignment > 1) {
		acur->cnt->bc_levels[0].ptr = 1;
		do {
			error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
			if (error)
				return error;
			if (XFS_IS_CORRUPT(args->mp, i != 1))
				return -EFSCORRUPTED;
			if (*len >= args->minlen)
				break;
			error = xfs_btree_increment(acur->cnt, 0, &i);
			if (error)
				return error;
		} while (i);
		ASSERT(*len >= args->minlen);
		if (!i)
			return 0;
	}

	error = xfs_alloc_walk_iter(args, acur, acur->cnt, true, false, -1, &i);
	if (error)
		return error;

	/*
	 * It didn't work. We COULD be in a case where there's a good record
	 * somewhere, so try again.
	 */
	if (acur->len == 0)
		return 0;

	trace_xfs_alloc_near_first(args);
	*allocated = true;
	return 0;
}
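/*
 * Illustrative walk of the last-block path above (a sketch): with maxlen 64
 * and a last cntbt leaf holding records of sizes 8, 16, 96 and 128, records
 * below minlen are skipped, both 96 and 128 cap at the 64 block request, and
 * the locality diff then decides between the two candidates.
 */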
/*
 * Allocate a variable extent near bno in the allocation group agno.
 * Extent's length (returned in len) will be between minlen and maxlen,
 * and of the form k * prod + mod unless there's nothing that large.
 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
 */
STATIC int
xfs_alloc_ag_vextent_near(
	struct xfs_alloc_arg	*args,
	uint32_t		alloc_flags)
{
	struct xfs_alloc_cur	acur = {};
	int			error;		/* error code */
	int			i;		/* result code, temporary */
	xfs_agblock_t		bno;
	xfs_extlen_t		len;

	/* handle uninitialized agbno range so caller doesn't have to */
	if (!args->min_agbno && !args->max_agbno)
		args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
	ASSERT(args->min_agbno <= args->max_agbno);

	/* clamp agbno to the range if it's outside */
	if (args->agbno < args->min_agbno)
		args->agbno = args->min_agbno;
	if (args->agbno > args->max_agbno)
		args->agbno = args->max_agbno;

	/* Retry once quickly if we find busy extents before blocking. */
	alloc_flags |= XFS_ALLOC_FLAG_TRYFLUSH;
restart:
	len = 0;

	/*
	 * Set up cursors and see if there are any free extents as big as
	 * maxlen. If not, pick the last entry in the tree unless the tree is
	 * empty.
	 */
	error = xfs_alloc_cur_setup(args, &acur);
	if (error == -ENOSPC) {
		error = xfs_alloc_ag_vextent_small(args, acur.cnt, &bno,
						   &len, &i);
		if (error)
			goto out;
		if (i == 0 || len == 0) {
			trace_xfs_alloc_near_noentry(args);
			goto out;
		}
		ASSERT(i == 1);
	} else if (error) {
		goto out;
	}

	/*
	 * First algorithm.
	 * If the requested extent is large wrt the freespaces available
	 * in this a.g., then the cursor will be pointing to a btree entry
	 * near the right edge of the tree. If it's in the last btree leaf
	 * block, then we just examine all the entries in that block
	 * that are big enough, and pick the best one.
	 */
	if (xfs_btree_islastblock(acur.cnt, 0)) {
		bool		allocated = false;

		error = xfs_alloc_ag_vextent_lastblock(args, &acur, &bno, &len,
						       &allocated);
		if (error)
			goto out;
		if (allocated)
			goto alloc_finish;
	}

	/*
	 * Second algorithm. Combined cntbt and bnobt search to find ideal
	 * locality.
	 */
	error = xfs_alloc_ag_vextent_locality(args, &acur, &i);
	if (error)
		goto out;

	/*
	 * If we couldn't get anything, give up.
	 */
	if (!acur.len) {
		if (acur.busy) {
			/*
			 * Our only valid extents must have been busy. Flush
			 * and retry the allocation again. If we get an -EAGAIN
			 * error, we're being told that a deadlock was avoided
			 * and the current transaction needs committing before
			 * the allocation can be retried.
			 */
			trace_xfs_alloc_near_busy(args);
			error = xfs_extent_busy_flush(args->tp, args->pag,
					acur.busy_gen, alloc_flags);
			if (error)
				goto out;

			alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH;
			goto restart;
		}
		trace_xfs_alloc_size_neither(args);
		args->agbno = NULLAGBLOCK;
		goto out;
	}

alloc_finish:
	/* fix up btrees on a successful allocation */
	error = xfs_alloc_cur_finish(args, &acur);

out:
	xfs_alloc_cur_close(&acur, error);
	return error;
}

/*
 * Allocate a variable extent anywhere in the allocation group agno.
 * Extent's length (returned in len) will be between minlen and maxlen,
 * and of the form k * prod + mod unless there's nothing that large.
 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
 */
static int
xfs_alloc_ag_vextent_size(
	struct xfs_alloc_arg	*args,
	uint32_t		alloc_flags)
{
	struct xfs_agf		*agf = args->agbp->b_addr;
	struct xfs_btree_cur	*bno_cur;
	struct xfs_btree_cur	*cnt_cur;
	xfs_agblock_t		fbno;		/* start of found freespace */
	xfs_extlen_t		flen;		/* length of found freespace */
	xfs_agblock_t		rbno;		/* returned block number */
	xfs_extlen_t		rlen;		/* length of returned extent */
	bool			busy;
	unsigned		busy_gen;
	int			error;
	int			i;

	/* Retry once quickly if we find busy extents before blocking. */
	alloc_flags |= XFS_ALLOC_FLAG_TRYFLUSH;
restart:
	/*
	 * Allocate and initialize a cursor for the by-size btree.
	 */
	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
					  args->pag, XFS_BTNUM_CNT);
	bno_cur = NULL;

	/*
	 * Look for an entry >= maxlen+alignment-1 blocks.
	 */
	if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
			args->maxlen + args->alignment - 1, &i)))
		goto error0;

	/*
	 * If none then we have to settle for a smaller extent. In the case
	 * that there are no large extents, this will return the last entry in
	 * the tree unless the tree is empty. In the case that there are only
	 * busy large extents, this will return the largest small extent unless
	 * there are no smaller extents available.
	 */
	if (!i) {
		error = xfs_alloc_ag_vextent_small(args, cnt_cur,
						   &fbno, &flen, &i);
		if (error)
			goto error0;
		if (i == 0 || flen == 0) {
			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
			trace_xfs_alloc_size_noentry(args);
			return 0;
		}
		ASSERT(i == 1);
		busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno,
						 &rlen, &busy_gen);
	} else {
		/*
		 * Search for a non-busy extent that is large enough.
		 */
		for (;;) {
			error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
			if (error)
				goto error0;
			if (XFS_IS_CORRUPT(args->mp, i != 1)) {
				error = -EFSCORRUPTED;
				goto error0;
			}

			busy = xfs_alloc_compute_aligned(args, fbno, flen,
					&rbno, &rlen, &busy_gen);

			if (rlen >= args->maxlen)
				break;

			error = xfs_btree_increment(cnt_cur, 0, &i);
			if (error)
				goto error0;
			if (i)
				continue;

			/*
			 * Our only valid extents must have been busy. Flush
			 * and retry the allocation again. If we get an -EAGAIN
			 * error, we're being told that a deadlock was avoided
			 * and the current transaction needs committing before
			 * the allocation can be retried.
			 */
			trace_xfs_alloc_size_busy(args);
			error = xfs_extent_busy_flush(args->tp, args->pag,
					busy_gen, alloc_flags);
			if (error)
				goto error0;

			alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH;
			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
			goto restart;
		}
	}

	/*
	 * In the first case above, we got the last entry in the
	 * by-size btree. Now we check to see if the space hits maxlen
	 * once aligned; if not, we search left for something better.
	 * This can't happen in the second case above.
	 */
	rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
	if (XFS_IS_CORRUPT(args->mp,
			   rlen != 0 &&
			   (rlen > flen ||
			    rbno + rlen > fbno + flen))) {
		error = -EFSCORRUPTED;
		goto error0;
	}
	if (rlen < args->maxlen) {
		xfs_agblock_t	bestfbno;
		xfs_extlen_t	bestflen;
		xfs_agblock_t	bestrbno;
		xfs_extlen_t	bestrlen;

		bestrlen = rlen;
		bestrbno = rbno;
		bestflen = flen;
		bestfbno = fbno;
		for (;;) {
			if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
				goto error0;
			if (i == 0)
				break;
			if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
					&i)))
				goto error0;
			if (XFS_IS_CORRUPT(args->mp, i != 1)) {
				error = -EFSCORRUPTED;
				goto error0;
			}
			if (flen <= bestrlen)
				break;
			busy = xfs_alloc_compute_aligned(args, fbno, flen,
					&rbno, &rlen, &busy_gen);
			rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
			if (XFS_IS_CORRUPT(args->mp,
					   rlen != 0 &&
					   (rlen > flen ||
					    rbno + rlen > fbno + flen))) {
				error = -EFSCORRUPTED;
				goto error0;
			}
			if (rlen > bestrlen) {
				bestrlen = rlen;
				bestrbno = rbno;
				bestflen = flen;
				bestfbno = fbno;
				if (rlen == args->maxlen)
					break;
			}
		}
		if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
				&i)))
			goto error0;
		if (XFS_IS_CORRUPT(args->mp, i != 1)) {
			error = -EFSCORRUPTED;
			goto error0;
		}
		rlen = bestrlen;
		rbno = bestrbno;
		flen = bestflen;
		fbno = bestfbno;
	}
	args->wasfromfl = 0;
	/*
	 * Fix up the length.
	 */
	args->len = rlen;
	if (rlen < args->minlen) {
		if (busy) {
			/*
			 * Our only valid extents must have been busy. Flush
			 * and retry the allocation again. If we get an -EAGAIN
			 * error, we're being told that a deadlock was avoided
			 * and the current transaction needs committing before
			 * the allocation can be retried.
			 */
			trace_xfs_alloc_size_busy(args);
			error = xfs_extent_busy_flush(args->tp, args->pag,
					busy_gen, alloc_flags);
			if (error)
				goto error0;

			alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH;
			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
			goto restart;
		}
		goto out_nominleft;
	}
	xfs_alloc_fix_len(args);

	rlen = args->len;
	if (XFS_IS_CORRUPT(args->mp, rlen > flen)) {
		error = -EFSCORRUPTED;
		goto error0;
	}
	/*
	 * Allocate and initialize a cursor for the by-block tree.
1854 */ 1855 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, 1856 args->pag, XFS_BTNUM_BNO); 1857 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, 1858 rbno, rlen, XFSA_FIXUP_CNT_OK))) 1859 goto error0; 1860 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1861 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 1862 cnt_cur = bno_cur = NULL; 1863 args->len = rlen; 1864 args->agbno = rbno; 1865 if (XFS_IS_CORRUPT(args->mp, 1866 args->agbno + args->len > 1867 be32_to_cpu(agf->agf_length))) { 1868 error = -EFSCORRUPTED; 1869 goto error0; 1870 } 1871 trace_xfs_alloc_size_done(args); 1872 return 0; 1873 1874 error0: 1875 trace_xfs_alloc_size_error(args); 1876 if (cnt_cur) 1877 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); 1878 if (bno_cur) 1879 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); 1880 return error; 1881 1882 out_nominleft: 1883 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1884 trace_xfs_alloc_size_nominleft(args); 1885 args->agbno = NULLAGBLOCK; 1886 return 0; 1887 } 1888 1889 /* 1890 * Free the extent starting at agno/bno for length. 1891 */ 1892 STATIC int xfs_free_ag_extent(struct xfs_trans * tp,struct xfs_buf * agbp,xfs_agnumber_t agno,xfs_agblock_t bno,xfs_extlen_t len,const struct xfs_owner_info * oinfo,enum xfs_ag_resv_type type)1893 xfs_free_ag_extent( 1894 struct xfs_trans *tp, 1895 struct xfs_buf *agbp, 1896 xfs_agnumber_t agno, 1897 xfs_agblock_t bno, 1898 xfs_extlen_t len, 1899 const struct xfs_owner_info *oinfo, 1900 enum xfs_ag_resv_type type) 1901 { 1902 struct xfs_mount *mp; 1903 struct xfs_btree_cur *bno_cur; 1904 struct xfs_btree_cur *cnt_cur; 1905 xfs_agblock_t gtbno; /* start of right neighbor */ 1906 xfs_extlen_t gtlen; /* length of right neighbor */ 1907 xfs_agblock_t ltbno; /* start of left neighbor */ 1908 xfs_extlen_t ltlen; /* length of left neighbor */ 1909 xfs_agblock_t nbno; /* new starting block of freesp */ 1910 xfs_extlen_t nlen; /* new length of freespace */ 1911 int haveleft; /* have a left neighbor */ 1912 int haveright; /* have a right neighbor */ 1913 int i; 1914 int error; 1915 struct xfs_perag *pag = agbp->b_pag; 1916 1917 bno_cur = cnt_cur = NULL; 1918 mp = tp->t_mountp; 1919 1920 if (!xfs_rmap_should_skip_owner_update(oinfo)) { 1921 error = xfs_rmap_free(tp, agbp, pag, bno, len, oinfo); 1922 if (error) 1923 goto error0; 1924 } 1925 1926 /* 1927 * Allocate and initialize a cursor for the by-block btree. 1928 */ 1929 bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_BNO); 1930 /* 1931 * Look for a neighboring block on the left (lower block numbers) 1932 * that is contiguous with this space. 1933 */ 1934 if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft))) 1935 goto error0; 1936 if (haveleft) { 1937 /* 1938 * There is a block to our left. 1939 */ 1940 if ((error = xfs_alloc_get_rec(bno_cur, <bno, <len, &i))) 1941 goto error0; 1942 if (XFS_IS_CORRUPT(mp, i != 1)) { 1943 error = -EFSCORRUPTED; 1944 goto error0; 1945 } 1946 /* 1947 * It's not contiguous, though. 1948 */ 1949 if (ltbno + ltlen < bno) 1950 haveleft = 0; 1951 else { 1952 /* 1953 * If this failure happens the request to free this 1954 * space was invalid, it's (partly) already free. 1955 * Very bad. 1956 */ 1957 if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) { 1958 error = -EFSCORRUPTED; 1959 goto error0; 1960 } 1961 } 1962 } 1963 /* 1964 * Look for a neighboring block on the right (higher block numbers) 1965 * that is contiguous with this space. 
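 *
 * Illustrative example (hypothetical block numbers): freeing 10 blocks
 * at agbno 100 while blocks 90-99 and 110-114 are already free ends
 * up, via the merge cases below, as a single 25 block by-block record
 * starting at block 90; the neighbours' stale by-size entries are
 * deleted and one new by-size entry is inserted for the merged extent.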
1966 */ 1967 if ((error = xfs_btree_increment(bno_cur, 0, &haveright))) 1968 goto error0; 1969 if (haveright) { 1970 /* 1971 * There is a block to our right. 1972 */ 1973 if ((error = xfs_alloc_get_rec(bno_cur, >bno, >len, &i))) 1974 goto error0; 1975 if (XFS_IS_CORRUPT(mp, i != 1)) { 1976 error = -EFSCORRUPTED; 1977 goto error0; 1978 } 1979 /* 1980 * It's not contiguous, though. 1981 */ 1982 if (bno + len < gtbno) 1983 haveright = 0; 1984 else { 1985 /* 1986 * If this failure happens the request to free this 1987 * space was invalid, it's (partly) already free. 1988 * Very bad. 1989 */ 1990 if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) { 1991 error = -EFSCORRUPTED; 1992 goto error0; 1993 } 1994 } 1995 } 1996 /* 1997 * Now allocate and initialize a cursor for the by-size tree. 1998 */ 1999 cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_CNT); 2000 /* 2001 * Have both left and right contiguous neighbors. 2002 * Merge all three into a single free block. 2003 */ 2004 if (haveleft && haveright) { 2005 /* 2006 * Delete the old by-size entry on the left. 2007 */ 2008 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) 2009 goto error0; 2010 if (XFS_IS_CORRUPT(mp, i != 1)) { 2011 error = -EFSCORRUPTED; 2012 goto error0; 2013 } 2014 if ((error = xfs_btree_delete(cnt_cur, &i))) 2015 goto error0; 2016 if (XFS_IS_CORRUPT(mp, i != 1)) { 2017 error = -EFSCORRUPTED; 2018 goto error0; 2019 } 2020 /* 2021 * Delete the old by-size entry on the right. 2022 */ 2023 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) 2024 goto error0; 2025 if (XFS_IS_CORRUPT(mp, i != 1)) { 2026 error = -EFSCORRUPTED; 2027 goto error0; 2028 } 2029 if ((error = xfs_btree_delete(cnt_cur, &i))) 2030 goto error0; 2031 if (XFS_IS_CORRUPT(mp, i != 1)) { 2032 error = -EFSCORRUPTED; 2033 goto error0; 2034 } 2035 /* 2036 * Delete the old by-block entry for the right block. 2037 */ 2038 if ((error = xfs_btree_delete(bno_cur, &i))) 2039 goto error0; 2040 if (XFS_IS_CORRUPT(mp, i != 1)) { 2041 error = -EFSCORRUPTED; 2042 goto error0; 2043 } 2044 /* 2045 * Move the by-block cursor back to the left neighbor. 2046 */ 2047 if ((error = xfs_btree_decrement(bno_cur, 0, &i))) 2048 goto error0; 2049 if (XFS_IS_CORRUPT(mp, i != 1)) { 2050 error = -EFSCORRUPTED; 2051 goto error0; 2052 } 2053 #ifdef DEBUG 2054 /* 2055 * Check that this is the right record: delete didn't 2056 * mangle the cursor. 2057 */ 2058 { 2059 xfs_agblock_t xxbno; 2060 xfs_extlen_t xxlen; 2061 2062 if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen, 2063 &i))) 2064 goto error0; 2065 if (XFS_IS_CORRUPT(mp, 2066 i != 1 || 2067 xxbno != ltbno || 2068 xxlen != ltlen)) { 2069 error = -EFSCORRUPTED; 2070 goto error0; 2071 } 2072 } 2073 #endif 2074 /* 2075 * Update remaining by-block entry to the new, joined block. 2076 */ 2077 nbno = ltbno; 2078 nlen = len + ltlen + gtlen; 2079 if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) 2080 goto error0; 2081 } 2082 /* 2083 * Have only a left contiguous neighbor. 2084 * Merge it together with the new freespace. 2085 */ 2086 else if (haveleft) { 2087 /* 2088 * Delete the old by-size entry on the left. 
2089 */ 2090 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) 2091 goto error0; 2092 if (XFS_IS_CORRUPT(mp, i != 1)) { 2093 error = -EFSCORRUPTED; 2094 goto error0; 2095 } 2096 if ((error = xfs_btree_delete(cnt_cur, &i))) 2097 goto error0; 2098 if (XFS_IS_CORRUPT(mp, i != 1)) { 2099 error = -EFSCORRUPTED; 2100 goto error0; 2101 } 2102 /* 2103 * Back up the by-block cursor to the left neighbor, and 2104 * update its length. 2105 */ 2106 if ((error = xfs_btree_decrement(bno_cur, 0, &i))) 2107 goto error0; 2108 if (XFS_IS_CORRUPT(mp, i != 1)) { 2109 error = -EFSCORRUPTED; 2110 goto error0; 2111 } 2112 nbno = ltbno; 2113 nlen = len + ltlen; 2114 if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) 2115 goto error0; 2116 } 2117 /* 2118 * Have only a right contiguous neighbor. 2119 * Merge it together with the new freespace. 2120 */ 2121 else if (haveright) { 2122 /* 2123 * Delete the old by-size entry on the right. 2124 */ 2125 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) 2126 goto error0; 2127 if (XFS_IS_CORRUPT(mp, i != 1)) { 2128 error = -EFSCORRUPTED; 2129 goto error0; 2130 } 2131 if ((error = xfs_btree_delete(cnt_cur, &i))) 2132 goto error0; 2133 if (XFS_IS_CORRUPT(mp, i != 1)) { 2134 error = -EFSCORRUPTED; 2135 goto error0; 2136 } 2137 /* 2138 * Update the starting block and length of the right 2139 * neighbor in the by-block tree. 2140 */ 2141 nbno = bno; 2142 nlen = len + gtlen; 2143 if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) 2144 goto error0; 2145 } 2146 /* 2147 * No contiguous neighbors. 2148 * Insert the new freespace into the by-block tree. 2149 */ 2150 else { 2151 nbno = bno; 2152 nlen = len; 2153 if ((error = xfs_btree_insert(bno_cur, &i))) 2154 goto error0; 2155 if (XFS_IS_CORRUPT(mp, i != 1)) { 2156 error = -EFSCORRUPTED; 2157 goto error0; 2158 } 2159 } 2160 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 2161 bno_cur = NULL; 2162 /* 2163 * In all cases we need to insert the new freespace in the by-size tree. 2164 */ 2165 if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i))) 2166 goto error0; 2167 if (XFS_IS_CORRUPT(mp, i != 0)) { 2168 error = -EFSCORRUPTED; 2169 goto error0; 2170 } 2171 if ((error = xfs_btree_insert(cnt_cur, &i))) 2172 goto error0; 2173 if (XFS_IS_CORRUPT(mp, i != 1)) { 2174 error = -EFSCORRUPTED; 2175 goto error0; 2176 } 2177 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 2178 cnt_cur = NULL; 2179 2180 /* 2181 * Update the freespace totals in the ag and superblock. 2182 */ 2183 error = xfs_alloc_update_counters(tp, agbp, len); 2184 xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len); 2185 if (error) 2186 goto error0; 2187 2188 XFS_STATS_INC(mp, xs_freex); 2189 XFS_STATS_ADD(mp, xs_freeb, len); 2190 2191 trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright); 2192 2193 return 0; 2194 2195 error0: 2196 trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1); 2197 if (bno_cur) 2198 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); 2199 if (cnt_cur) 2200 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); 2201 return error; 2202 } 2203 2204 /* 2205 * Visible (exported) allocation/free functions. 2206 * Some of these are used just by xfs_alloc_btree.c and this file. 2207 */ 2208 2209 /* 2210 * Compute and fill in value of m_alloc_maxlevels. 
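 *
 * Sketch of the reasoning (restated, not from the original comment):
 * the worst case record count for a free space btree is one free
 * extent for every other block in the AG, i.e. free and allocated
 * blocks alternating, hence the (sb_agblocks + 1) / 2 record estimate
 * below; the maximum height then follows from the minimum per-level
 * fanout in m_alloc_mnr.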
2211 */ 2212 void xfs_alloc_compute_maxlevels(xfs_mount_t * mp)2213 xfs_alloc_compute_maxlevels( 2214 xfs_mount_t *mp) /* file system mount structure */ 2215 { 2216 mp->m_alloc_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr, 2217 (mp->m_sb.sb_agblocks + 1) / 2); 2218 ASSERT(mp->m_alloc_maxlevels <= xfs_allocbt_maxlevels_ondisk()); 2219 } 2220 2221 /* 2222 * Find the length of the longest extent in an AG. The 'need' parameter 2223 * specifies how much space we're going to need for the AGFL and the 2224 * 'reserved' parameter tells us how many blocks in this AG are reserved for 2225 * other callers. 2226 */ 2227 xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_perag * pag,xfs_extlen_t need,xfs_extlen_t reserved)2228 xfs_alloc_longest_free_extent( 2229 struct xfs_perag *pag, 2230 xfs_extlen_t need, 2231 xfs_extlen_t reserved) 2232 { 2233 xfs_extlen_t delta = 0; 2234 2235 /* 2236 * If the AGFL needs a recharge, we'll have to subtract that from the 2237 * longest extent. 2238 */ 2239 if (need > pag->pagf_flcount) 2240 delta = need - pag->pagf_flcount; 2241 2242 /* 2243 * If we cannot maintain others' reservations with space from the 2244 * not-longest freesp extents, we'll have to subtract /that/ from 2245 * the longest extent too. 2246 */ 2247 if (pag->pagf_freeblks - pag->pagf_longest < reserved) 2248 delta += reserved - (pag->pagf_freeblks - pag->pagf_longest); 2249 2250 /* 2251 * If the longest extent is long enough to satisfy all the 2252 * reservations and AGFL rules in place, we can return this extent. 2253 */ 2254 if (pag->pagf_longest > delta) 2255 return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable, 2256 pag->pagf_longest - delta); 2257 2258 /* Otherwise, let the caller try for 1 block if there's space. */ 2259 return pag->pagf_flcount > 0 || pag->pagf_longest > 0; 2260 } 2261 2262 /* 2263 * Compute the minimum length of the AGFL in the given AG. If @pag is NULL, 2264 * return the largest possible minimum length. 2265 */ 2266 unsigned int xfs_alloc_min_freelist(struct xfs_mount * mp,struct xfs_perag * pag)2267 xfs_alloc_min_freelist( 2268 struct xfs_mount *mp, 2269 struct xfs_perag *pag) 2270 { 2271 /* AG btrees have at least 1 level. */ 2272 static const uint8_t fake_levels[XFS_BTNUM_AGF] = {1, 1, 1}; 2273 const uint8_t *levels = pag ? pag->pagf_levels : fake_levels; 2274 unsigned int min_free; 2275 2276 ASSERT(mp->m_alloc_maxlevels > 0); 2277 2278 /* 2279 * For a btree shorter than the maximum height, the worst case is that 2280 * every level gets split and a new level is added, then while inserting 2281 * another entry to refill the AGFL, every level under the old root gets 2282 * split again. This is: 2283 * 2284 * (full height split reservation) + (AGFL refill split height) 2285 * = (current height + 1) + (current height - 1) 2286 * = (new height) + (new height - 2) 2287 * = 2 * new height - 2 2288 * 2289 * For a btree of maximum height, the worst case is that every level 2290 * under the root gets split, then while inserting another entry to 2291 * refill the AGFL, every level under the root gets split again. 
This is 2292 * also: 2293 * 2294 * 2 * (current height - 1) 2295 * = 2 * (new height - 1) 2296 * = 2 * new height - 2 2297 */ 2298 2299 /* space needed by-bno freespace btree */ 2300 min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1, 2301 mp->m_alloc_maxlevels) * 2 - 2; 2302 /* space needed by-size freespace btree */ 2303 min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1, 2304 mp->m_alloc_maxlevels) * 2 - 2; 2305 /* space needed reverse mapping used space btree */ 2306 if (xfs_has_rmapbt(mp)) 2307 min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1, 2308 mp->m_rmap_maxlevels) * 2 - 2; 2309 2310 return min_free; 2311 } 2312 2313 /* 2314 * Check if the operation we are fixing up the freelist for should go ahead or 2315 * not. If we are freeing blocks, we always allow it, otherwise the allocation 2316 * is dependent on whether the size and shape of free space available will 2317 * permit the requested allocation to take place. 2318 */ 2319 static bool xfs_alloc_space_available(struct xfs_alloc_arg * args,xfs_extlen_t min_free,int flags)2320 xfs_alloc_space_available( 2321 struct xfs_alloc_arg *args, 2322 xfs_extlen_t min_free, 2323 int flags) 2324 { 2325 struct xfs_perag *pag = args->pag; 2326 xfs_extlen_t alloc_len, longest; 2327 xfs_extlen_t reservation; /* blocks that are still reserved */ 2328 int available; 2329 xfs_extlen_t agflcount; 2330 2331 if (flags & XFS_ALLOC_FLAG_FREEING) 2332 return true; 2333 2334 reservation = xfs_ag_resv_needed(pag, args->resv); 2335 2336 /* do we have enough contiguous free space for the allocation? */ 2337 alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop; 2338 longest = xfs_alloc_longest_free_extent(pag, min_free, reservation); 2339 if (longest < alloc_len) 2340 return false; 2341 2342 /* 2343 * Do we have enough free space remaining for the allocation? Don't 2344 * account extra agfl blocks because we are about to defer free them, 2345 * making them unavailable until the current transaction commits. 2346 */ 2347 agflcount = min_t(xfs_extlen_t, pag->pagf_flcount, min_free); 2348 available = (int)(pag->pagf_freeblks + agflcount - 2349 reservation - min_free - args->minleft); 2350 if (available < (int)max(args->total, alloc_len)) 2351 return false; 2352 2353 /* 2354 * Clamp maxlen to the amount of free space available for the actual 2355 * extent allocation. 2356 */ 2357 if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) { 2358 args->maxlen = available; 2359 ASSERT(args->maxlen > 0); 2360 ASSERT(args->maxlen >= args->minlen); 2361 } 2362 2363 return true; 2364 } 2365 2366 int xfs_free_agfl_block(struct xfs_trans * tp,xfs_agnumber_t agno,xfs_agblock_t agbno,struct xfs_buf * agbp,struct xfs_owner_info * oinfo)2367 xfs_free_agfl_block( 2368 struct xfs_trans *tp, 2369 xfs_agnumber_t agno, 2370 xfs_agblock_t agbno, 2371 struct xfs_buf *agbp, 2372 struct xfs_owner_info *oinfo) 2373 { 2374 int error; 2375 struct xfs_buf *bp; 2376 2377 error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo, 2378 XFS_AG_RESV_AGFL); 2379 if (error) 2380 return error; 2381 2382 error = xfs_trans_get_buf(tp, tp->t_mountp->m_ddev_targp, 2383 XFS_AGB_TO_DADDR(tp->t_mountp, agno, agbno), 2384 tp->t_mountp->m_bsize, 0, &bp); 2385 if (error) 2386 return error; 2387 xfs_trans_binval(tp, bp); 2388 2389 return 0; 2390 } 2391 2392 /* 2393 * Check the agfl fields of the agf for inconsistency or corruption. 
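 *
 * For example (illustrative numbers only): on a wrapped AGFL with
 * xfs_agfl_size() = 118, agf_flfirst = 116 and agf_fllast = 1, the
 * active range spans four slots (116, 117, 0, 1), so an on-disk
 * agf_flcount of anything other than 4 is inconsistent and forces a
 * reset.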
2394 * 2395 * The original purpose was to detect an agfl header padding mismatch between 2396 * current and early v5 kernels. This problem manifests as a 1-slot size 2397 * difference between the on-disk flcount and the active [first, last] range of 2398 * a wrapped agfl. 2399 * 2400 * However, we need to use these same checks to catch agfl count corruptions 2401 * unrelated to padding. This could occur on any v4 or v5 filesystem, so either 2402 * way, we need to reset the agfl and warn the user. 2403 * 2404 * Return true if a reset is required before the agfl can be used, false 2405 * otherwise. 2406 */ 2407 static bool xfs_agfl_needs_reset(struct xfs_mount * mp,struct xfs_agf * agf)2408 xfs_agfl_needs_reset( 2409 struct xfs_mount *mp, 2410 struct xfs_agf *agf) 2411 { 2412 uint32_t f = be32_to_cpu(agf->agf_flfirst); 2413 uint32_t l = be32_to_cpu(agf->agf_fllast); 2414 uint32_t c = be32_to_cpu(agf->agf_flcount); 2415 int agfl_size = xfs_agfl_size(mp); 2416 int active; 2417 2418 /* 2419 * The agf read verifier catches severe corruption of these fields. 2420 * Repeat some sanity checks to cover a packed -> unpacked mismatch if 2421 * the verifier allows it. 2422 */ 2423 if (f >= agfl_size || l >= agfl_size) 2424 return true; 2425 if (c > agfl_size) 2426 return true; 2427 2428 /* 2429 * Check consistency between the on-disk count and the active range. An 2430 * agfl padding mismatch manifests as an inconsistent flcount. 2431 */ 2432 if (c && l >= f) 2433 active = l - f + 1; 2434 else if (c) 2435 active = agfl_size - f + l + 1; 2436 else 2437 active = 0; 2438 2439 return active != c; 2440 } 2441 2442 /* 2443 * Reset the agfl to an empty state. Ignore/drop any existing blocks since the 2444 * agfl content cannot be trusted. Warn the user that a repair is required to 2445 * recover leaked blocks. 2446 * 2447 * The purpose of this mechanism is to handle filesystems affected by the agfl 2448 * header padding mismatch problem. A reset keeps the filesystem online with a 2449 * relatively minor free space accounting inconsistency rather than suffer the 2450 * inevitable crash from use of an invalid agfl block. 2451 */ 2452 static void xfs_agfl_reset(struct xfs_trans * tp,struct xfs_buf * agbp,struct xfs_perag * pag)2453 xfs_agfl_reset( 2454 struct xfs_trans *tp, 2455 struct xfs_buf *agbp, 2456 struct xfs_perag *pag) 2457 { 2458 struct xfs_mount *mp = tp->t_mountp; 2459 struct xfs_agf *agf = agbp->b_addr; 2460 2461 ASSERT(xfs_perag_agfl_needs_reset(pag)); 2462 trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_); 2463 2464 xfs_warn(mp, 2465 "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. " 2466 "Please unmount and run xfs_repair.", 2467 pag->pag_agno, pag->pagf_flcount); 2468 2469 agf->agf_flfirst = 0; 2470 agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1); 2471 agf->agf_flcount = 0; 2472 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST | 2473 XFS_AGF_FLCOUNT); 2474 2475 pag->pagf_flcount = 0; 2476 clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate); 2477 } 2478 2479 /* 2480 * Defer an AGFL block free. This is effectively equivalent to 2481 * xfs_free_extent_later() with some special handling particular to AGFL blocks. 2482 * 2483 * Deferring AGFL frees helps prevent log reservation overruns due to too many 2484 * allocation operations in a transaction. AGFL frees are prone to this problem 2485 * because for one they are always freed one at a time. 
Further, an immediate 2486 * AGFL block free can cause a btree join and require another block free before 2487 * the real allocation can proceed. Deferring the free disconnects freeing up 2488 * the AGFL slot from freeing the block. 2489 */ 2490 static int xfs_defer_agfl_block(struct xfs_trans * tp,xfs_agnumber_t agno,xfs_agblock_t agbno,struct xfs_owner_info * oinfo)2491 xfs_defer_agfl_block( 2492 struct xfs_trans *tp, 2493 xfs_agnumber_t agno, 2494 xfs_agblock_t agbno, 2495 struct xfs_owner_info *oinfo) 2496 { 2497 struct xfs_mount *mp = tp->t_mountp; 2498 struct xfs_extent_free_item *xefi; 2499 xfs_fsblock_t fsbno = XFS_AGB_TO_FSB(mp, agno, agbno); 2500 2501 ASSERT(xfs_extfree_item_cache != NULL); 2502 ASSERT(oinfo != NULL); 2503 2504 if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, fsbno))) 2505 return -EFSCORRUPTED; 2506 2507 xefi = kmem_cache_zalloc(xfs_extfree_item_cache, 2508 GFP_KERNEL | __GFP_NOFAIL); 2509 xefi->xefi_startblock = fsbno; 2510 xefi->xefi_blockcount = 1; 2511 xefi->xefi_owner = oinfo->oi_owner; 2512 xefi->xefi_agresv = XFS_AG_RESV_AGFL; 2513 2514 trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); 2515 2516 xfs_extent_free_get_group(mp, xefi); 2517 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list); 2518 return 0; 2519 } 2520 2521 /* 2522 * Add the extent to the list of extents to be free at transaction end. 2523 * The list is maintained sorted (by block number). 2524 */ 2525 int __xfs_free_extent_later(struct xfs_trans * tp,xfs_fsblock_t bno,xfs_filblks_t len,const struct xfs_owner_info * oinfo,enum xfs_ag_resv_type type,bool skip_discard)2526 __xfs_free_extent_later( 2527 struct xfs_trans *tp, 2528 xfs_fsblock_t bno, 2529 xfs_filblks_t len, 2530 const struct xfs_owner_info *oinfo, 2531 enum xfs_ag_resv_type type, 2532 bool skip_discard) 2533 { 2534 struct xfs_extent_free_item *xefi; 2535 struct xfs_mount *mp = tp->t_mountp; 2536 #ifdef DEBUG 2537 xfs_agnumber_t agno; 2538 xfs_agblock_t agbno; 2539 2540 ASSERT(bno != NULLFSBLOCK); 2541 ASSERT(len > 0); 2542 ASSERT(len <= XFS_MAX_BMBT_EXTLEN); 2543 ASSERT(!isnullstartblock(bno)); 2544 agno = XFS_FSB_TO_AGNO(mp, bno); 2545 agbno = XFS_FSB_TO_AGBNO(mp, bno); 2546 ASSERT(agno < mp->m_sb.sb_agcount); 2547 ASSERT(agbno < mp->m_sb.sb_agblocks); 2548 ASSERT(len < mp->m_sb.sb_agblocks); 2549 ASSERT(agbno + len <= mp->m_sb.sb_agblocks); 2550 #endif 2551 ASSERT(xfs_extfree_item_cache != NULL); 2552 ASSERT(type != XFS_AG_RESV_AGFL); 2553 2554 if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len))) 2555 return -EFSCORRUPTED; 2556 2557 xefi = kmem_cache_zalloc(xfs_extfree_item_cache, 2558 GFP_KERNEL | __GFP_NOFAIL); 2559 xefi->xefi_startblock = bno; 2560 xefi->xefi_blockcount = (xfs_extlen_t)len; 2561 xefi->xefi_agresv = type; 2562 if (skip_discard) 2563 xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD; 2564 if (oinfo) { 2565 ASSERT(oinfo->oi_offset == 0); 2566 2567 if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK) 2568 xefi->xefi_flags |= XFS_EFI_ATTR_FORK; 2569 if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK) 2570 xefi->xefi_flags |= XFS_EFI_BMBT_BLOCK; 2571 xefi->xefi_owner = oinfo->oi_owner; 2572 } else { 2573 xefi->xefi_owner = XFS_RMAP_OWN_NULL; 2574 } 2575 trace_xfs_bmap_free_defer(mp, 2576 XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0, 2577 XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len); 2578 2579 xfs_extent_free_get_group(mp, xefi); 2580 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list); 2581 return 0; 2582 } 2583 2584 /* 2585 * Check if an AGF has a free extent record whose length is equal to 2586 * args->minlen. 
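 *
 * Sketch of the check (restating the code below): a lookup-ge for
 * length args->minlen in the by-size btree finds the shortest record
 * that is at least minlen blocks long; because the cntbt is ordered by
 * length, an exact-length record can exist only if that first
 * candidate is exactly minlen blocks.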
2587 */ 2588 STATIC int xfs_exact_minlen_extent_available(struct xfs_alloc_arg * args,struct xfs_buf * agbp,int * stat)2589 xfs_exact_minlen_extent_available( 2590 struct xfs_alloc_arg *args, 2591 struct xfs_buf *agbp, 2592 int *stat) 2593 { 2594 struct xfs_btree_cur *cnt_cur; 2595 xfs_agblock_t fbno; 2596 xfs_extlen_t flen; 2597 int error = 0; 2598 2599 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, agbp, 2600 args->pag, XFS_BTNUM_CNT); 2601 error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat); 2602 if (error) 2603 goto out; 2604 2605 if (*stat == 0) { 2606 error = -EFSCORRUPTED; 2607 goto out; 2608 } 2609 2610 error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, stat); 2611 if (error) 2612 goto out; 2613 2614 if (*stat == 1 && flen != args->minlen) 2615 *stat = 0; 2616 2617 out: 2618 xfs_btree_del_cursor(cnt_cur, error); 2619 2620 return error; 2621 } 2622 2623 /* 2624 * Decide whether to use this allocation group for this allocation. 2625 * If so, fix up the btree freelist's size. 2626 */ 2627 int /* error */ xfs_alloc_fix_freelist(struct xfs_alloc_arg * args,uint32_t alloc_flags)2628 xfs_alloc_fix_freelist( 2629 struct xfs_alloc_arg *args, /* allocation argument structure */ 2630 uint32_t alloc_flags) 2631 { 2632 struct xfs_mount *mp = args->mp; 2633 struct xfs_perag *pag = args->pag; 2634 struct xfs_trans *tp = args->tp; 2635 struct xfs_buf *agbp = NULL; 2636 struct xfs_buf *agflbp = NULL; 2637 struct xfs_alloc_arg targs; /* local allocation arguments */ 2638 xfs_agblock_t bno; /* freelist block */ 2639 xfs_extlen_t need; /* total blocks needed in freelist */ 2640 int error = 0; 2641 2642 /* deferred ops (AGFL block frees) require permanent transactions */ 2643 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 2644 2645 if (!xfs_perag_initialised_agf(pag)) { 2646 error = xfs_alloc_read_agf(pag, tp, alloc_flags, &agbp); 2647 if (error) { 2648 /* Couldn't lock the AGF so skip this AG. */ 2649 if (error == -EAGAIN) 2650 error = 0; 2651 goto out_no_agbp; 2652 } 2653 } 2654 2655 /* 2656 * If this is a metadata preferred pag and we are user data then try 2657 * somewhere else if we are not being asked to try harder at this 2658 * point 2659 */ 2660 if (xfs_perag_prefers_metadata(pag) && 2661 (args->datatype & XFS_ALLOC_USERDATA) && 2662 (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK)) { 2663 ASSERT(!(alloc_flags & XFS_ALLOC_FLAG_FREEING)); 2664 goto out_agbp_relse; 2665 } 2666 2667 need = xfs_alloc_min_freelist(mp, pag); 2668 if (!xfs_alloc_space_available(args, need, alloc_flags | 2669 XFS_ALLOC_FLAG_CHECK)) 2670 goto out_agbp_relse; 2671 2672 /* 2673 * Get the a.g. freespace buffer. 2674 * Can fail if we're not blocking on locks, and it's held. 2675 */ 2676 if (!agbp) { 2677 error = xfs_alloc_read_agf(pag, tp, alloc_flags, &agbp); 2678 if (error) { 2679 /* Couldn't lock the AGF so skip this AG. */ 2680 if (error == -EAGAIN) 2681 error = 0; 2682 goto out_no_agbp; 2683 } 2684 } 2685 2686 /* reset a padding mismatched agfl before final free space check */ 2687 if (xfs_perag_agfl_needs_reset(pag)) 2688 xfs_agfl_reset(tp, agbp, pag); 2689 2690 /* If there isn't enough total space or single-extent, reject it. 
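 * (This repeats the xfs_alloc_space_available() check without the
 * CHECK flag now that the AGF is locked and any pending AGFL reset has
 * been applied, so args->maxlen may be clamped this time around.)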
*/ 2691 need = xfs_alloc_min_freelist(mp, pag); 2692 if (!xfs_alloc_space_available(args, need, alloc_flags)) 2693 goto out_agbp_relse; 2694 2695 if (IS_ENABLED(CONFIG_XFS_DEBUG) && args->alloc_minlen_only) { 2696 int stat; 2697 2698 error = xfs_exact_minlen_extent_available(args, agbp, &stat); 2699 if (error || !stat) 2700 goto out_agbp_relse; 2701 } 2702 2703 /* 2704 * Make the freelist shorter if it's too long. 2705 * 2706 * Note that from this point onwards, we will always release the agf and 2707 * agfl buffers on error. This handles the case where we error out and 2708 * the buffers are clean or may not have been joined to the transaction 2709 * and hence need to be released manually. If they have been joined to 2710 * the transaction, then xfs_trans_brelse() will handle them 2711 * appropriately based on the recursion count and dirty state of the 2712 * buffer. 2713 * 2714 * XXX (dgc): When we have lots of free space, does this buy us 2715 * anything other than extra overhead when we need to put more blocks 2716 * back on the free list? Maybe we should only do this when space is 2717 * getting low or the AGFL is more than half full? 2718 * 2719 * The NOSHRINK flag prevents the AGFL from being shrunk if it's too 2720 * big; the NORMAP flag prevents AGFL expand/shrink operations from 2721 * updating the rmapbt. Both flags are used in xfs_repair while we're 2722 * rebuilding the rmapbt, and neither are used by the kernel. They're 2723 * both required to ensure that rmaps are correctly recorded for the 2724 * regenerated AGFL, bnobt, and cntbt. See repair/phase5.c and 2725 * repair/rmap.c in xfsprogs for details. 2726 */ 2727 memset(&targs, 0, sizeof(targs)); 2728 /* struct copy below */ 2729 if (alloc_flags & XFS_ALLOC_FLAG_NORMAP) 2730 targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; 2731 else 2732 targs.oinfo = XFS_RMAP_OINFO_AG; 2733 while (!(alloc_flags & XFS_ALLOC_FLAG_NOSHRINK) && 2734 pag->pagf_flcount > need) { 2735 error = xfs_alloc_get_freelist(pag, tp, agbp, &bno, 0); 2736 if (error) 2737 goto out_agbp_relse; 2738 2739 /* defer agfl frees */ 2740 error = xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo); 2741 if (error) 2742 goto out_agbp_relse; 2743 } 2744 2745 targs.tp = tp; 2746 targs.mp = mp; 2747 targs.agbp = agbp; 2748 targs.agno = args->agno; 2749 targs.alignment = targs.minlen = targs.prod = 1; 2750 targs.pag = pag; 2751 error = xfs_alloc_read_agfl(pag, tp, &agflbp); 2752 if (error) 2753 goto out_agbp_relse; 2754 2755 /* Make the freelist longer if it's too short. */ 2756 while (pag->pagf_flcount < need) { 2757 targs.agbno = 0; 2758 targs.maxlen = need - pag->pagf_flcount; 2759 targs.resv = XFS_AG_RESV_AGFL; 2760 2761 /* Allocate as many blocks as possible at once. */ 2762 error = xfs_alloc_ag_vextent_size(&targs, alloc_flags); 2763 if (error) 2764 goto out_agflbp_relse; 2765 2766 /* 2767 * Stop if we run out. Won't happen if callers are obeying 2768 * the restrictions correctly. Can happen for free calls 2769 * on a completely full ag. 2770 */ 2771 if (targs.agbno == NULLAGBLOCK) { 2772 if (alloc_flags & XFS_ALLOC_FLAG_FREEING) 2773 break; 2774 goto out_agflbp_relse; 2775 } 2776 2777 if (!xfs_rmap_should_skip_owner_update(&targs.oinfo)) { 2778 error = xfs_rmap_alloc(tp, agbp, pag, 2779 targs.agbno, targs.len, &targs.oinfo); 2780 if (error) 2781 goto out_agflbp_relse; 2782 } 2783 error = xfs_alloc_update_counters(tp, agbp, 2784 -((long)(targs.len))); 2785 if (error) 2786 goto out_agflbp_relse; 2787 2788 /* 2789 * Put each allocated block on the list. 
2790 */ 2791 for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) { 2792 error = xfs_alloc_put_freelist(pag, tp, agbp, 2793 agflbp, bno, 0); 2794 if (error) 2795 goto out_agflbp_relse; 2796 } 2797 } 2798 xfs_trans_brelse(tp, agflbp); 2799 args->agbp = agbp; 2800 return 0; 2801 2802 out_agflbp_relse: 2803 xfs_trans_brelse(tp, agflbp); 2804 out_agbp_relse: 2805 if (agbp) 2806 xfs_trans_brelse(tp, agbp); 2807 out_no_agbp: 2808 args->agbp = NULL; 2809 return error; 2810 } 2811 2812 /* 2813 * Get a block from the freelist. 2814 * Returns with the buffer for the block gotten. 2815 */ 2816 int xfs_alloc_get_freelist(struct xfs_perag * pag,struct xfs_trans * tp,struct xfs_buf * agbp,xfs_agblock_t * bnop,int btreeblk)2817 xfs_alloc_get_freelist( 2818 struct xfs_perag *pag, 2819 struct xfs_trans *tp, 2820 struct xfs_buf *agbp, 2821 xfs_agblock_t *bnop, 2822 int btreeblk) 2823 { 2824 struct xfs_agf *agf = agbp->b_addr; 2825 struct xfs_buf *agflbp; 2826 xfs_agblock_t bno; 2827 __be32 *agfl_bno; 2828 int error; 2829 uint32_t logflags; 2830 struct xfs_mount *mp = tp->t_mountp; 2831 2832 /* 2833 * Freelist is empty, give up. 2834 */ 2835 if (!agf->agf_flcount) { 2836 *bnop = NULLAGBLOCK; 2837 return 0; 2838 } 2839 /* 2840 * Read the array of free blocks. 2841 */ 2842 error = xfs_alloc_read_agfl(pag, tp, &agflbp); 2843 if (error) 2844 return error; 2845 2846 2847 /* 2848 * Get the block number and update the data structures. 2849 */ 2850 agfl_bno = xfs_buf_to_agfl_bno(agflbp); 2851 bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]); 2852 if (XFS_IS_CORRUPT(tp->t_mountp, !xfs_verify_agbno(pag, bno))) 2853 return -EFSCORRUPTED; 2854 2855 be32_add_cpu(&agf->agf_flfirst, 1); 2856 xfs_trans_brelse(tp, agflbp); 2857 if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp)) 2858 agf->agf_flfirst = 0; 2859 2860 ASSERT(!xfs_perag_agfl_needs_reset(pag)); 2861 be32_add_cpu(&agf->agf_flcount, -1); 2862 pag->pagf_flcount--; 2863 2864 logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT; 2865 if (btreeblk) { 2866 be32_add_cpu(&agf->agf_btreeblks, 1); 2867 pag->pagf_btreeblks++; 2868 logflags |= XFS_AGF_BTREEBLKS; 2869 } 2870 2871 xfs_alloc_log_agf(tp, agbp, logflags); 2872 *bnop = bno; 2873 2874 return 0; 2875 } 2876 2877 /* 2878 * Log the given fields from the agf structure. 
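 *
 * Typical usage, as seen elsewhere in this file: callers pass a mask
 * of XFS_AGF_* flags describing the fields they dirtied, e.g.
 *
 *	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT);
 *
 * and only the byte ranges covered by those fields (looked up in the
 * offsets table below) are logged with the transaction.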
2879 */ 2880 void xfs_alloc_log_agf(struct xfs_trans * tp,struct xfs_buf * bp,uint32_t fields)2881 xfs_alloc_log_agf( 2882 struct xfs_trans *tp, 2883 struct xfs_buf *bp, 2884 uint32_t fields) 2885 { 2886 int first; /* first byte offset */ 2887 int last; /* last byte offset */ 2888 static const short offsets[] = { 2889 offsetof(xfs_agf_t, agf_magicnum), 2890 offsetof(xfs_agf_t, agf_versionnum), 2891 offsetof(xfs_agf_t, agf_seqno), 2892 offsetof(xfs_agf_t, agf_length), 2893 offsetof(xfs_agf_t, agf_roots[0]), 2894 offsetof(xfs_agf_t, agf_levels[0]), 2895 offsetof(xfs_agf_t, agf_flfirst), 2896 offsetof(xfs_agf_t, agf_fllast), 2897 offsetof(xfs_agf_t, agf_flcount), 2898 offsetof(xfs_agf_t, agf_freeblks), 2899 offsetof(xfs_agf_t, agf_longest), 2900 offsetof(xfs_agf_t, agf_btreeblks), 2901 offsetof(xfs_agf_t, agf_uuid), 2902 offsetof(xfs_agf_t, agf_rmap_blocks), 2903 offsetof(xfs_agf_t, agf_refcount_blocks), 2904 offsetof(xfs_agf_t, agf_refcount_root), 2905 offsetof(xfs_agf_t, agf_refcount_level), 2906 /* needed so that we don't log the whole rest of the structure: */ 2907 offsetof(xfs_agf_t, agf_spare64), 2908 sizeof(xfs_agf_t) 2909 }; 2910 2911 trace_xfs_agf(tp->t_mountp, bp->b_addr, fields, _RET_IP_); 2912 2913 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF); 2914 2915 xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last); 2916 xfs_trans_log_buf(tp, bp, (uint)first, (uint)last); 2917 } 2918 2919 /* 2920 * Put the block on the freelist for the allocation group. 2921 */ 2922 int xfs_alloc_put_freelist(struct xfs_perag * pag,struct xfs_trans * tp,struct xfs_buf * agbp,struct xfs_buf * agflbp,xfs_agblock_t bno,int btreeblk)2923 xfs_alloc_put_freelist( 2924 struct xfs_perag *pag, 2925 struct xfs_trans *tp, 2926 struct xfs_buf *agbp, 2927 struct xfs_buf *agflbp, 2928 xfs_agblock_t bno, 2929 int btreeblk) 2930 { 2931 struct xfs_mount *mp = tp->t_mountp; 2932 struct xfs_agf *agf = agbp->b_addr; 2933 __be32 *blockp; 2934 int error; 2935 uint32_t logflags; 2936 __be32 *agfl_bno; 2937 int startoff; 2938 2939 if (!agflbp) { 2940 error = xfs_alloc_read_agfl(pag, tp, &agflbp); 2941 if (error) 2942 return error; 2943 } 2944 2945 be32_add_cpu(&agf->agf_fllast, 1); 2946 if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp)) 2947 agf->agf_fllast = 0; 2948 2949 ASSERT(!xfs_perag_agfl_needs_reset(pag)); 2950 be32_add_cpu(&agf->agf_flcount, 1); 2951 pag->pagf_flcount++; 2952 2953 logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT; 2954 if (btreeblk) { 2955 be32_add_cpu(&agf->agf_btreeblks, -1); 2956 pag->pagf_btreeblks--; 2957 logflags |= XFS_AGF_BTREEBLKS; 2958 } 2959 2960 xfs_alloc_log_agf(tp, agbp, logflags); 2961 2962 ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)); 2963 2964 agfl_bno = xfs_buf_to_agfl_bno(agflbp); 2965 blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)]; 2966 *blockp = cpu_to_be32(bno); 2967 startoff = (char *)blockp - (char *)agflbp->b_addr; 2968 2969 xfs_alloc_log_agf(tp, agbp, logflags); 2970 2971 xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF); 2972 xfs_trans_log_buf(tp, agflbp, startoff, 2973 startoff + sizeof(xfs_agblock_t) - 1); 2974 return 0; 2975 } 2976 2977 /* 2978 * Check that this AGF/AGI header's sequence number and length matches the AG 2979 * number and size in fsblocks. 
2980 */ 2981 xfs_failaddr_t xfs_validate_ag_length(struct xfs_buf * bp,uint32_t seqno,uint32_t length)2982 xfs_validate_ag_length( 2983 struct xfs_buf *bp, 2984 uint32_t seqno, 2985 uint32_t length) 2986 { 2987 struct xfs_mount *mp = bp->b_mount; 2988 /* 2989 * During growfs operations, the perag is not fully initialised, 2990 * so we can't use it for any useful checking. growfs ensures we can't 2991 * use it by using uncached buffers that don't have the perag attached 2992 * so we can detect and avoid this problem. 2993 */ 2994 if (bp->b_pag && seqno != bp->b_pag->pag_agno) 2995 return __this_address; 2996 2997 /* 2998 * Only the last AG in the filesystem is allowed to be shorter 2999 * than the AG size recorded in the superblock. 3000 */ 3001 if (length != mp->m_sb.sb_agblocks) { 3002 /* 3003 * During growfs, the new last AG can get here before we 3004 * have updated the superblock. Give it a pass on the seqno 3005 * check. 3006 */ 3007 if (bp->b_pag && seqno != mp->m_sb.sb_agcount - 1) 3008 return __this_address; 3009 if (length < XFS_MIN_AG_BLOCKS) 3010 return __this_address; 3011 if (length > mp->m_sb.sb_agblocks) 3012 return __this_address; 3013 } 3014 3015 return NULL; 3016 } 3017 3018 /* 3019 * Verify the AGF is consistent. 3020 * 3021 * We do not verify the AGFL indexes in the AGF are fully consistent here 3022 * because of issues with variable on-disk structure sizes. Instead, we check 3023 * the agfl indexes for consistency when we initialise the perag from the AGF 3024 * information after a read completes. 3025 * 3026 * If the index is inconsistent, then we mark the perag as needing an AGFL 3027 * reset. The first AGFL update performed then resets the AGFL indexes and 3028 * refills the AGFL with known good free blocks, allowing the filesystem to 3029 * continue operating normally at the cost of a few leaked free space blocks. 3030 */ 3031 static xfs_failaddr_t xfs_agf_verify(struct xfs_buf * bp)3032 xfs_agf_verify( 3033 struct xfs_buf *bp) 3034 { 3035 struct xfs_mount *mp = bp->b_mount; 3036 struct xfs_agf *agf = bp->b_addr; 3037 xfs_failaddr_t fa; 3038 uint32_t agf_seqno = be32_to_cpu(agf->agf_seqno); 3039 uint32_t agf_length = be32_to_cpu(agf->agf_length); 3040 3041 if (xfs_has_crc(mp)) { 3042 if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid)) 3043 return __this_address; 3044 if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn))) 3045 return __this_address; 3046 } 3047 3048 if (!xfs_verify_magic(bp, agf->agf_magicnum)) 3049 return __this_address; 3050 3051 if (!XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum))) 3052 return __this_address; 3053 3054 /* 3055 * Both agf_seqno and agf_length need to validated before anything else 3056 * block number related in the AGF or AGFL can be checked. 
3057 */ 3058 fa = xfs_validate_ag_length(bp, agf_seqno, agf_length); 3059 if (fa) 3060 return fa; 3061 3062 if (be32_to_cpu(agf->agf_flfirst) >= xfs_agfl_size(mp)) 3063 return __this_address; 3064 if (be32_to_cpu(agf->agf_fllast) >= xfs_agfl_size(mp)) 3065 return __this_address; 3066 if (be32_to_cpu(agf->agf_flcount) > xfs_agfl_size(mp)) 3067 return __this_address; 3068 3069 if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) || 3070 be32_to_cpu(agf->agf_freeblks) > agf_length) 3071 return __this_address; 3072 3073 if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 || 3074 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 || 3075 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > 3076 mp->m_alloc_maxlevels || 3077 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > 3078 mp->m_alloc_maxlevels) 3079 return __this_address; 3080 3081 if (xfs_has_lazysbcount(mp) && 3082 be32_to_cpu(agf->agf_btreeblks) > agf_length) 3083 return __this_address; 3084 3085 if (xfs_has_rmapbt(mp)) { 3086 if (be32_to_cpu(agf->agf_rmap_blocks) > agf_length) 3087 return __this_address; 3088 3089 if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 || 3090 be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > 3091 mp->m_rmap_maxlevels) 3092 return __this_address; 3093 } 3094 3095 if (xfs_has_reflink(mp)) { 3096 if (be32_to_cpu(agf->agf_refcount_blocks) > agf_length) 3097 return __this_address; 3098 3099 if (be32_to_cpu(agf->agf_refcount_level) < 1 || 3100 be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels) 3101 return __this_address; 3102 } 3103 3104 return NULL; 3105 } 3106 3107 static void xfs_agf_read_verify(struct xfs_buf * bp)3108 xfs_agf_read_verify( 3109 struct xfs_buf *bp) 3110 { 3111 struct xfs_mount *mp = bp->b_mount; 3112 xfs_failaddr_t fa; 3113 3114 if (xfs_has_crc(mp) && 3115 !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF)) 3116 xfs_verifier_error(bp, -EFSBADCRC, __this_address); 3117 else { 3118 fa = xfs_agf_verify(bp); 3119 if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF)) 3120 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 3121 } 3122 } 3123 3124 static void xfs_agf_write_verify(struct xfs_buf * bp)3125 xfs_agf_write_verify( 3126 struct xfs_buf *bp) 3127 { 3128 struct xfs_mount *mp = bp->b_mount; 3129 struct xfs_buf_log_item *bip = bp->b_log_item; 3130 struct xfs_agf *agf = bp->b_addr; 3131 xfs_failaddr_t fa; 3132 3133 fa = xfs_agf_verify(bp); 3134 if (fa) { 3135 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 3136 return; 3137 } 3138 3139 if (!xfs_has_crc(mp)) 3140 return; 3141 3142 if (bip) 3143 agf->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn); 3144 3145 xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF); 3146 } 3147 3148 const struct xfs_buf_ops xfs_agf_buf_ops = { 3149 .name = "xfs_agf", 3150 .magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) }, 3151 .verify_read = xfs_agf_read_verify, 3152 .verify_write = xfs_agf_write_verify, 3153 .verify_struct = xfs_agf_verify, 3154 }; 3155 3156 /* 3157 * Read in the allocation group header (free/alloc section). 
3158 */ 3159 int xfs_read_agf(struct xfs_perag * pag,struct xfs_trans * tp,int flags,struct xfs_buf ** agfbpp)3160 xfs_read_agf( 3161 struct xfs_perag *pag, 3162 struct xfs_trans *tp, 3163 int flags, 3164 struct xfs_buf **agfbpp) 3165 { 3166 struct xfs_mount *mp = pag->pag_mount; 3167 int error; 3168 3169 trace_xfs_read_agf(pag->pag_mount, pag->pag_agno); 3170 3171 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 3172 XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)), 3173 XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops); 3174 if (error) 3175 return error; 3176 3177 xfs_buf_set_ref(*agfbpp, XFS_AGF_REF); 3178 return 0; 3179 } 3180 3181 /* 3182 * Read in the allocation group header (free/alloc section) and initialise the 3183 * perag structure if necessary. If the caller provides @agfbpp, then return the 3184 * locked buffer to the caller, otherwise free it. 3185 */ 3186 int xfs_alloc_read_agf(struct xfs_perag * pag,struct xfs_trans * tp,int flags,struct xfs_buf ** agfbpp)3187 xfs_alloc_read_agf( 3188 struct xfs_perag *pag, 3189 struct xfs_trans *tp, 3190 int flags, 3191 struct xfs_buf **agfbpp) 3192 { 3193 struct xfs_buf *agfbp; 3194 struct xfs_agf *agf; 3195 int error; 3196 int allocbt_blks; 3197 3198 trace_xfs_alloc_read_agf(pag->pag_mount, pag->pag_agno); 3199 3200 /* We don't support trylock when freeing. */ 3201 ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) != 3202 (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)); 3203 error = xfs_read_agf(pag, tp, 3204 (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0, 3205 &agfbp); 3206 if (error) 3207 return error; 3208 3209 agf = agfbp->b_addr; 3210 if (!xfs_perag_initialised_agf(pag)) { 3211 pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); 3212 pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); 3213 pag->pagf_flcount = be32_to_cpu(agf->agf_flcount); 3214 pag->pagf_longest = be32_to_cpu(agf->agf_longest); 3215 pag->pagf_levels[XFS_BTNUM_BNOi] = 3216 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]); 3217 pag->pagf_levels[XFS_BTNUM_CNTi] = 3218 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); 3219 pag->pagf_levels[XFS_BTNUM_RMAPi] = 3220 be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]); 3221 pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); 3222 if (xfs_agfl_needs_reset(pag->pag_mount, agf)) 3223 set_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate); 3224 else 3225 clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate); 3226 3227 /* 3228 * Update the in-core allocbt counter. Filter out the rmapbt 3229 * subset of the btreeblks counter because the rmapbt is managed 3230 * by perag reservation. Subtract one for the rmapbt root block 3231 * because the rmap counter includes it while the btreeblks 3232 * counter only tracks non-root blocks. 
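 *
 * Worked example (hypothetical values): with pagf_btreeblks = 10 and
 * agf_rmap_blocks = 4 (one root plus three non-root rmapbt blocks),
 * allocbt_blks becomes 10 - (4 - 1) = 7, so only the bnobt/cntbt
 * blocks are added to m_allocbt_blks.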
3233 */ 3234 allocbt_blks = pag->pagf_btreeblks; 3235 if (xfs_has_rmapbt(pag->pag_mount)) 3236 allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1; 3237 if (allocbt_blks > 0) 3238 atomic64_add(allocbt_blks, 3239 &pag->pag_mount->m_allocbt_blks); 3240 3241 set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); 3242 } 3243 #ifdef DEBUG 3244 else if (!xfs_is_shutdown(pag->pag_mount)) { 3245 ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks)); 3246 ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks)); 3247 ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount)); 3248 ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest)); 3249 ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] == 3250 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi])); 3251 ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] == 3252 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi])); 3253 } 3254 #endif 3255 if (agfbpp) 3256 *agfbpp = agfbp; 3257 else 3258 xfs_trans_brelse(tp, agfbp); 3259 return 0; 3260 } 3261 3262 /* 3263 * Pre-proces allocation arguments to set initial state that we don't require 3264 * callers to set up correctly, as well as bounds check the allocation args 3265 * that are set up. 3266 */ 3267 static int xfs_alloc_vextent_check_args(struct xfs_alloc_arg * args,xfs_fsblock_t target,xfs_agnumber_t * minimum_agno)3268 xfs_alloc_vextent_check_args( 3269 struct xfs_alloc_arg *args, 3270 xfs_fsblock_t target, 3271 xfs_agnumber_t *minimum_agno) 3272 { 3273 struct xfs_mount *mp = args->mp; 3274 xfs_agblock_t agsize; 3275 3276 args->fsbno = NULLFSBLOCK; 3277 3278 *minimum_agno = 0; 3279 if (args->tp->t_highest_agno != NULLAGNUMBER) 3280 *minimum_agno = args->tp->t_highest_agno; 3281 3282 /* 3283 * Just fix this up, for the case where the last a.g. is shorter 3284 * (or there's only one a.g.) and the caller couldn't easily figure 3285 * that out (xfs_bmap_alloc). 3286 */ 3287 agsize = mp->m_sb.sb_agblocks; 3288 if (args->maxlen > agsize) 3289 args->maxlen = agsize; 3290 if (args->alignment == 0) 3291 args->alignment = 1; 3292 3293 ASSERT(args->minlen > 0); 3294 ASSERT(args->maxlen > 0); 3295 ASSERT(args->alignment > 0); 3296 ASSERT(args->resv != XFS_AG_RESV_AGFL); 3297 3298 ASSERT(XFS_FSB_TO_AGNO(mp, target) < mp->m_sb.sb_agcount); 3299 ASSERT(XFS_FSB_TO_AGBNO(mp, target) < agsize); 3300 ASSERT(args->minlen <= args->maxlen); 3301 ASSERT(args->minlen <= agsize); 3302 ASSERT(args->mod < args->prod); 3303 3304 if (XFS_FSB_TO_AGNO(mp, target) >= mp->m_sb.sb_agcount || 3305 XFS_FSB_TO_AGBNO(mp, target) >= agsize || 3306 args->minlen > args->maxlen || args->minlen > agsize || 3307 args->mod >= args->prod) { 3308 trace_xfs_alloc_vextent_badargs(args); 3309 return -ENOSPC; 3310 } 3311 3312 if (args->agno != NULLAGNUMBER && *minimum_agno > args->agno) { 3313 trace_xfs_alloc_vextent_skip_deadlock(args); 3314 return -ENOSPC; 3315 } 3316 return 0; 3317 3318 } 3319 3320 /* 3321 * Prepare an AG for allocation. If the AG is not prepared to accept the 3322 * allocation, return failure. 3323 * 3324 * XXX(dgc): The complexity of "need_pag" will go away as all caller paths are 3325 * modified to hold their own perag references. 
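 *
 * "Prepared" here means the free list has been fixed up via
 * xfs_alloc_fix_freelist() with the AGF locked: on return args->agbp
 * is set if the AG can be used, or NULL if this AG cannot satisfy the
 * allocation at all.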
3326 */ 3327 static int xfs_alloc_vextent_prepare_ag(struct xfs_alloc_arg * args,uint32_t alloc_flags)3328 xfs_alloc_vextent_prepare_ag( 3329 struct xfs_alloc_arg *args, 3330 uint32_t alloc_flags) 3331 { 3332 bool need_pag = !args->pag; 3333 int error; 3334 3335 if (need_pag) 3336 args->pag = xfs_perag_get(args->mp, args->agno); 3337 3338 args->agbp = NULL; 3339 error = xfs_alloc_fix_freelist(args, alloc_flags); 3340 if (error) { 3341 trace_xfs_alloc_vextent_nofix(args); 3342 if (need_pag) 3343 xfs_perag_put(args->pag); 3344 args->agbno = NULLAGBLOCK; 3345 return error; 3346 } 3347 if (!args->agbp) { 3348 /* cannot allocate in this AG at all */ 3349 trace_xfs_alloc_vextent_noagbp(args); 3350 args->agbno = NULLAGBLOCK; 3351 return 0; 3352 } 3353 args->wasfromfl = 0; 3354 return 0; 3355 } 3356 3357 /* 3358 * Post-process allocation results to account for the allocation if it succeed 3359 * and set the allocated block number correctly for the caller. 3360 * 3361 * XXX: we should really be returning ENOSPC for ENOSPC, not 3362 * hiding it behind a "successful" NULLFSBLOCK allocation. 3363 */ 3364 static int xfs_alloc_vextent_finish(struct xfs_alloc_arg * args,xfs_agnumber_t minimum_agno,int alloc_error,bool drop_perag)3365 xfs_alloc_vextent_finish( 3366 struct xfs_alloc_arg *args, 3367 xfs_agnumber_t minimum_agno, 3368 int alloc_error, 3369 bool drop_perag) 3370 { 3371 struct xfs_mount *mp = args->mp; 3372 int error = 0; 3373 3374 /* 3375 * We can end up here with a locked AGF. If we failed, the caller is 3376 * likely going to try to allocate again with different parameters, and 3377 * that can widen the AGs that are searched for free space. If we have 3378 * to do BMBT block allocation, we have to do a new allocation. 3379 * 3380 * Hence leaving this function with the AGF locked opens up potential 3381 * ABBA AGF deadlocks because a future allocation attempt in this 3382 * transaction may attempt to lock a lower number AGF. 3383 * 3384 * We can't release the AGF until the transaction is commited, so at 3385 * this point we must update the "first allocation" tracker to point at 3386 * this AG if the tracker is empty or points to a lower AG. This allows 3387 * the next allocation attempt to be modified appropriately to avoid 3388 * deadlocks. 3389 */ 3390 if (args->agbp && 3391 (args->tp->t_highest_agno == NULLAGNUMBER || 3392 args->agno > minimum_agno)) 3393 args->tp->t_highest_agno = args->agno; 3394 3395 /* 3396 * If the allocation failed with an error or we had an ENOSPC result, 3397 * preserve the returned error whilst also marking the allocation result 3398 * as "no extent allocated". This ensures that callers that fail to 3399 * capture the error will still treat it as a failed allocation. 
3400 */ 3401 if (alloc_error || args->agbno == NULLAGBLOCK) { 3402 args->fsbno = NULLFSBLOCK; 3403 error = alloc_error; 3404 goto out_drop_perag; 3405 } 3406 3407 args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno); 3408 3409 ASSERT(args->len >= args->minlen); 3410 ASSERT(args->len <= args->maxlen); 3411 ASSERT(args->agbno % args->alignment == 0); 3412 XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno), args->len); 3413 3414 /* if not file data, insert new block into the reverse map btree */ 3415 if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) { 3416 error = xfs_rmap_alloc(args->tp, args->agbp, args->pag, 3417 args->agbno, args->len, &args->oinfo); 3418 if (error) 3419 goto out_drop_perag; 3420 } 3421 3422 if (!args->wasfromfl) { 3423 error = xfs_alloc_update_counters(args->tp, args->agbp, 3424 -((long)(args->len))); 3425 if (error) 3426 goto out_drop_perag; 3427 3428 ASSERT(!xfs_extent_busy_search(mp, args->pag, args->agbno, 3429 args->len)); 3430 } 3431 3432 xfs_ag_resv_alloc_extent(args->pag, args->resv, args); 3433 3434 XFS_STATS_INC(mp, xs_allocx); 3435 XFS_STATS_ADD(mp, xs_allocb, args->len); 3436 3437 trace_xfs_alloc_vextent_finish(args); 3438 3439 out_drop_perag: 3440 if (drop_perag && args->pag) { 3441 xfs_perag_rele(args->pag); 3442 args->pag = NULL; 3443 } 3444 return error; 3445 } 3446 3447 /* 3448 * Allocate within a single AG only. This uses a best-fit length algorithm so if 3449 * you need an exact sized allocation without locality constraints, this is the 3450 * fastest way to do it. 3451 * 3452 * Caller is expected to hold a perag reference in args->pag. 3453 */ 3454 int xfs_alloc_vextent_this_ag(struct xfs_alloc_arg * args,xfs_agnumber_t agno)3455 xfs_alloc_vextent_this_ag( 3456 struct xfs_alloc_arg *args, 3457 xfs_agnumber_t agno) 3458 { 3459 struct xfs_mount *mp = args->mp; 3460 xfs_agnumber_t minimum_agno; 3461 uint32_t alloc_flags = 0; 3462 int error; 3463 3464 ASSERT(args->pag != NULL); 3465 ASSERT(args->pag->pag_agno == agno); 3466 3467 args->agno = agno; 3468 args->agbno = 0; 3469 3470 trace_xfs_alloc_vextent_this_ag(args); 3471 3472 error = xfs_alloc_vextent_check_args(args, XFS_AGB_TO_FSB(mp, agno, 0), 3473 &minimum_agno); 3474 if (error) { 3475 if (error == -ENOSPC) 3476 return 0; 3477 return error; 3478 } 3479 3480 error = xfs_alloc_vextent_prepare_ag(args, alloc_flags); 3481 if (!error && args->agbp) 3482 error = xfs_alloc_ag_vextent_size(args, alloc_flags); 3483 3484 return xfs_alloc_vextent_finish(args, minimum_agno, error, false); 3485 } 3486 3487 /* 3488 * Iterate all AGs trying to allocate an extent starting from @start_ag. 3489 * 3490 * If the incoming allocation type is XFS_ALLOCTYPE_NEAR_BNO, it means the 3491 * allocation attempts in @start_agno have locality information. If we fail to 3492 * allocate in that AG, then we revert to anywhere-in-AG for all the other AGs 3493 * we attempt to allocation in as there is no locality optimisation possible for 3494 * those allocations. 3495 * 3496 * On return, args->pag may be left referenced if we finish before the "all 3497 * failed" return point. The allocation finish still needs the perag, and 3498 * so the caller will release it once they've finished the allocation. 3499 * 3500 * When we wrap the AG iteration at the end of the filesystem, we have to be 3501 * careful not to wrap into AGs below ones we already have locked in the 3502 * transaction if we are doing a blocking iteration. This will result in an 3503 * out-of-order locking of AGFs and hence can cause deadlocks. 
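 *
 * Illustrative iteration order (hypothetical AG numbers; depends on
 * the for_each_perag_wrap_range() semantics): with 8 AGs, start_agno 5
 * and minimum_agno 2, an initial pass with XFS_ALLOC_FLAG_TRYLOCK set
 * may visit AGs 5,6,7,0,1,2,3,4; if nothing is found, the blocking
 * retry restarts at AG 5 but only wraps back down to AG 2, so AGFs
 * below an AG already locked in this transaction are never taken out
 * of order.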
3504 */ 3505 static int xfs_alloc_vextent_iterate_ags(struct xfs_alloc_arg * args,xfs_agnumber_t minimum_agno,xfs_agnumber_t start_agno,xfs_agblock_t target_agbno,uint32_t alloc_flags)3506 xfs_alloc_vextent_iterate_ags( 3507 struct xfs_alloc_arg *args, 3508 xfs_agnumber_t minimum_agno, 3509 xfs_agnumber_t start_agno, 3510 xfs_agblock_t target_agbno, 3511 uint32_t alloc_flags) 3512 { 3513 struct xfs_mount *mp = args->mp; 3514 xfs_agnumber_t restart_agno = minimum_agno; 3515 xfs_agnumber_t agno; 3516 int error = 0; 3517 3518 if (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK) 3519 restart_agno = 0; 3520 restart: 3521 for_each_perag_wrap_range(mp, start_agno, restart_agno, 3522 mp->m_sb.sb_agcount, agno, args->pag) { 3523 args->agno = agno; 3524 error = xfs_alloc_vextent_prepare_ag(args, alloc_flags); 3525 if (error) 3526 break; 3527 if (!args->agbp) { 3528 trace_xfs_alloc_vextent_loopfailed(args); 3529 continue; 3530 } 3531 3532 /* 3533 * Allocation is supposed to succeed now, so break out of the 3534 * loop regardless of whether we succeed or not. 3535 */ 3536 if (args->agno == start_agno && target_agbno) { 3537 args->agbno = target_agbno; 3538 error = xfs_alloc_ag_vextent_near(args, alloc_flags); 3539 } else { 3540 args->agbno = 0; 3541 error = xfs_alloc_ag_vextent_size(args, alloc_flags); 3542 } 3543 break; 3544 } 3545 if (error) { 3546 xfs_perag_rele(args->pag); 3547 args->pag = NULL; 3548 return error; 3549 } 3550 if (args->agbp) 3551 return 0; 3552 3553 /* 3554 * We didn't find an AG we can alloation from. If we were given 3555 * constraining flags by the caller, drop them and retry the allocation 3556 * without any constraints being set. 3557 */ 3558 if (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK) { 3559 alloc_flags &= ~XFS_ALLOC_FLAG_TRYLOCK; 3560 restart_agno = minimum_agno; 3561 goto restart; 3562 } 3563 3564 ASSERT(args->pag == NULL); 3565 trace_xfs_alloc_vextent_allfailed(args); 3566 return 0; 3567 } 3568 3569 /* 3570 * Iterate from the AGs from the start AG to the end of the filesystem, trying 3571 * to allocate blocks. It starts with a near allocation attempt in the initial 3572 * AG, then falls back to anywhere-in-ag after the first AG fails. It will wrap 3573 * back to zero if allowed by previous allocations in this transaction, 3574 * otherwise will wrap back to the start AG and run a second blocking pass to 3575 * the end of the filesystem. 
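 *
 * Minimal caller sketch (illustrative only; "want" and "target_fsbno"
 * are hypothetical and error handling is elided):
 *
 *	struct xfs_alloc_arg	args = {
 *		.tp		= tp,
 *		.mp		= tp->t_mountp,
 *		.oinfo		= XFS_RMAP_OINFO_AG,
 *		.minlen		= 1,
 *		.maxlen		= want,
 *		.prod		= 1,
 *		.alignment	= 1,
 *		.resv		= XFS_AG_RESV_NONE,
 *	};
 *	error = xfs_alloc_vextent_start_ag(&args, target_fsbno);
 *	if (!error && args.fsbno != NULLFSBLOCK)
 *		the allocated extent is args.len blocks at args.fsbno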
3576 */ 3577 int xfs_alloc_vextent_start_ag(struct xfs_alloc_arg * args,xfs_fsblock_t target)3578 xfs_alloc_vextent_start_ag( 3579 struct xfs_alloc_arg *args, 3580 xfs_fsblock_t target) 3581 { 3582 struct xfs_mount *mp = args->mp; 3583 xfs_agnumber_t minimum_agno; 3584 xfs_agnumber_t start_agno; 3585 xfs_agnumber_t rotorstep = xfs_rotorstep; 3586 bool bump_rotor = false; 3587 uint32_t alloc_flags = XFS_ALLOC_FLAG_TRYLOCK; 3588 int error; 3589 3590 ASSERT(args->pag == NULL); 3591 3592 args->agno = NULLAGNUMBER; 3593 args->agbno = NULLAGBLOCK; 3594 3595 trace_xfs_alloc_vextent_start_ag(args); 3596 3597 error = xfs_alloc_vextent_check_args(args, target, &minimum_agno); 3598 if (error) { 3599 if (error == -ENOSPC) 3600 return 0; 3601 return error; 3602 } 3603 3604 if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) && 3605 xfs_is_inode32(mp)) { 3606 target = XFS_AGB_TO_FSB(mp, 3607 ((mp->m_agfrotor / rotorstep) % 3608 mp->m_sb.sb_agcount), 0); 3609 bump_rotor = 1; 3610 } 3611 3612 start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target)); 3613 error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno, 3614 XFS_FSB_TO_AGBNO(mp, target), alloc_flags); 3615 3616 if (bump_rotor) { 3617 if (args->agno == start_agno) 3618 mp->m_agfrotor = (mp->m_agfrotor + 1) % 3619 (mp->m_sb.sb_agcount * rotorstep); 3620 else 3621 mp->m_agfrotor = (args->agno * rotorstep + 1) % 3622 (mp->m_sb.sb_agcount * rotorstep); 3623 } 3624 3625 return xfs_alloc_vextent_finish(args, minimum_agno, error, true); 3626 } 3627 3628 /* 3629 * Iterate from the agno indicated via @target through to the end of the 3630 * filesystem attempting blocking allocation. This does not wrap or try a second 3631 * pass, so will not recurse into AGs lower than indicated by the target. 3632 */ 3633 int xfs_alloc_vextent_first_ag(struct xfs_alloc_arg * args,xfs_fsblock_t target)3634 xfs_alloc_vextent_first_ag( 3635 struct xfs_alloc_arg *args, 3636 xfs_fsblock_t target) 3637 { 3638 struct xfs_mount *mp = args->mp; 3639 xfs_agnumber_t minimum_agno; 3640 xfs_agnumber_t start_agno; 3641 uint32_t alloc_flags = XFS_ALLOC_FLAG_TRYLOCK; 3642 int error; 3643 3644 ASSERT(args->pag == NULL); 3645 3646 args->agno = NULLAGNUMBER; 3647 args->agbno = NULLAGBLOCK; 3648 3649 trace_xfs_alloc_vextent_first_ag(args); 3650 3651 error = xfs_alloc_vextent_check_args(args, target, &minimum_agno); 3652 if (error) { 3653 if (error == -ENOSPC) 3654 return 0; 3655 return error; 3656 } 3657 3658 start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target)); 3659 error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno, 3660 XFS_FSB_TO_AGBNO(mp, target), alloc_flags); 3661 return xfs_alloc_vextent_finish(args, minimum_agno, error, true); 3662 } 3663 3664 /* 3665 * Allocate at the exact block target or fail. Caller is expected to hold a 3666 * perag reference in args->pag. 
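 *
 * If the exact target cannot be allocated, args->fsbno is set to
 * NULLFSBLOCK and zero is returned, matching the ENOSPC-hiding
 * behaviour noted above in xfs_alloc_vextent_finish().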
/*
 * Allocate at the exact block target or fail. Caller is expected to hold a
 * perag reference in args->pag.
 */
int
xfs_alloc_vextent_exact_bno(
	struct xfs_alloc_arg	*args,
	xfs_fsblock_t		target)
{
	struct xfs_mount	*mp = args->mp;
	xfs_agnumber_t		minimum_agno;
	int			error;

	ASSERT(args->pag != NULL);
	ASSERT(args->pag->pag_agno == XFS_FSB_TO_AGNO(mp, target));

	args->agno = XFS_FSB_TO_AGNO(mp, target);
	args->agbno = XFS_FSB_TO_AGBNO(mp, target);

	trace_xfs_alloc_vextent_exact_bno(args);

	error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
	if (error) {
		if (error == -ENOSPC)
			return 0;
		return error;
	}

	error = xfs_alloc_vextent_prepare_ag(args, 0);
	if (!error && args->agbp)
		error = xfs_alloc_ag_vextent_exact(args);

	return xfs_alloc_vextent_finish(args, minimum_agno, error, false);
}

/*
 * Allocate an extent as close to the target as possible. If there are no
 * viable candidates in the AG, then fail the allocation.
 *
 * Caller may or may not have a per-ag reference in args->pag.
 */
int
xfs_alloc_vextent_near_bno(
	struct xfs_alloc_arg	*args,
	xfs_fsblock_t		target)
{
	struct xfs_mount	*mp = args->mp;
	xfs_agnumber_t		minimum_agno;
	bool			needs_perag = args->pag == NULL;
	uint32_t		alloc_flags = 0;
	int			error;

	if (!needs_perag)
		ASSERT(args->pag->pag_agno == XFS_FSB_TO_AGNO(mp, target));

	args->agno = XFS_FSB_TO_AGNO(mp, target);
	args->agbno = XFS_FSB_TO_AGBNO(mp, target);

	trace_xfs_alloc_vextent_near_bno(args);

	error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
	if (error) {
		if (error == -ENOSPC)
			return 0;
		return error;
	}

	if (needs_perag)
		args->pag = xfs_perag_grab(mp, args->agno);

	error = xfs_alloc_vextent_prepare_ag(args, alloc_flags);
	if (!error && args->agbp)
		error = xfs_alloc_ag_vextent_near(args, alloc_flags);

	return xfs_alloc_vextent_finish(args, minimum_agno, error, needs_perag);
}

/* Ensure that the freelist is at full capacity. */
int
xfs_free_extent_fix_freelist(
	struct xfs_trans	*tp,
	struct xfs_perag	*pag,
	struct xfs_buf		**agbp)
{
	struct xfs_alloc_arg	args;
	int			error;

	memset(&args, 0, sizeof(struct xfs_alloc_arg));
	args.tp = tp;
	args.mp = tp->t_mountp;
	args.agno = pag->pag_agno;
	args.pag = pag;

	/*
	 * Validate that the block number is legal - this enables us to detect
	 * and handle a silent filesystem corruption rather than crashing.
	 */
	if (args.agno >= args.mp->m_sb.sb_agcount)
		return -EFSCORRUPTED;

	error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
	if (error)
		return error;

	*agbp = args.agbp;
	return 0;
}
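/*
 * The free-extent path below drives the helper above in four steps: refill
 * the AGFL (which also reads and locks the AGF), check that the extent lies
 * within the AG now that the AGF is locked, update the free space btrees via
 * xfs_free_ag_extent(), and finally insert the range into the busy extent
 * tree so it is not reused before the free is committed (and so that online
 * discard can be skipped when the caller asks for it).
 */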
/*
 * Free an extent.
 * Just break up the extent address and hand off to xfs_free_ag_extent
 * after fixing up the freelist.
 */
int
__xfs_free_extent(
	struct xfs_trans		*tp,
	struct xfs_perag		*pag,
	xfs_agblock_t			agbno,
	xfs_extlen_t			len,
	const struct xfs_owner_info	*oinfo,
	enum xfs_ag_resv_type		type,
	bool				skip_discard)
{
	struct xfs_mount		*mp = tp->t_mountp;
	struct xfs_buf			*agbp;
	struct xfs_agf			*agf;
	int				error;
	unsigned int			busy_flags = 0;

	ASSERT(len != 0);
	ASSERT(type != XFS_AG_RESV_AGFL);

	if (XFS_TEST_ERROR(false, mp,
			XFS_ERRTAG_FREE_EXTENT))
		return -EIO;

	error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
	if (error)
		return error;
	agf = agbp->b_addr;

	if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
		error = -EFSCORRUPTED;
		goto err_release;
	}

	/* Validate the extent size is legal now that we have the AGF locked. */
	if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) {
		error = -EFSCORRUPTED;
		goto err_release;
	}

	error = xfs_free_ag_extent(tp, agbp, pag->pag_agno, agbno, len, oinfo,
			type);
	if (error)
		goto err_release;

	if (skip_discard)
		busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
	xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags);
	return 0;

err_release:
	xfs_trans_brelse(tp, agbp);
	return error;
}

struct xfs_alloc_query_range_info {
	xfs_alloc_query_range_fn	fn;
	void				*priv;
};

/* Format btree record and pass to our callback. */
STATIC int
xfs_alloc_query_range_helper(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_rec	*rec,
	void				*priv)
{
	struct xfs_alloc_query_range_info	*query = priv;
	struct xfs_alloc_rec_incore		irec;
	xfs_failaddr_t				fa;

	xfs_alloc_btrec_to_irec(rec, &irec);
	fa = xfs_alloc_check_irec(cur, &irec);
	if (fa)
		return xfs_alloc_complain_bad_rec(cur, fa, &irec);

	return query->fn(cur, &irec, query->priv);
}

/* Find all free space within a given range of blocks. */
int
xfs_alloc_query_range(
	struct xfs_btree_cur			*cur,
	const struct xfs_alloc_rec_incore	*low_rec,
	const struct xfs_alloc_rec_incore	*high_rec,
	xfs_alloc_query_range_fn		fn,
	void					*priv)
{
	union xfs_btree_irec		low_brec = { .a = *low_rec };
	union xfs_btree_irec		high_brec = { .a = *high_rec };
	struct xfs_alloc_query_range_info query = { .priv = priv, .fn = fn };

	ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
	return xfs_btree_query_range(cur, &low_brec, &high_brec,
			xfs_alloc_query_range_helper, &query);
}
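/*
 * Example of a query callback for the range/all queries here (an editor's
 * illustrative sketch only; "sum" is a hypothetical cookie passed via priv):
 *
 *	static int
 *	xfs_count_free_helper(
 *		struct xfs_btree_cur			*cur,
 *		const struct xfs_alloc_rec_incore	*rec,
 *		void					*priv)
 *	{
 *		xfs_extlen_t	*sum = priv;
 *
 *		*sum += rec->ar_blockcount;
 *		return 0;
 *	}
 *
 * The callback is invoked once per free space record in the query range; a
 * nonzero return value is expected to stop the walk and be passed back to
 * the caller.
 */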
/* Find all free space records. */
int
xfs_alloc_query_all(
	struct xfs_btree_cur		*cur,
	xfs_alloc_query_range_fn	fn,
	void				*priv)
{
	struct xfs_alloc_query_range_info	query;

	ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
	query.priv = priv;
	query.fn = fn;
	return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
}

/*
 * Scan part of the keyspace of the free space and tell us if the area has no
 * records, is fully mapped by records, or is partially filled.
 */
int
xfs_alloc_has_records(
	struct xfs_btree_cur	*cur,
	xfs_agblock_t		bno,
	xfs_extlen_t		len,
	enum xbtree_recpacking	*outcome)
{
	union xfs_btree_irec	low;
	union xfs_btree_irec	high;

	memset(&low, 0, sizeof(low));
	low.a.ar_startblock = bno;
	memset(&high, 0xFF, sizeof(high));
	high.a.ar_startblock = bno + len - 1;

	return xfs_btree_has_records(cur, &low, &high, NULL, outcome);
}

/*
 * Walk all the blocks in the AGFL. The @walk_fn can return any negative
 * error code or XFS_ITER_*.
 */
int
xfs_agfl_walk(
	struct xfs_mount	*mp,
	struct xfs_agf		*agf,
	struct xfs_buf		*agflbp,
	xfs_agfl_walk_fn	walk_fn,
	void			*priv)
{
	__be32			*agfl_bno;
	unsigned int		i;
	int			error;

	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
	i = be32_to_cpu(agf->agf_flfirst);

	/* Nothing to walk in an empty AGFL. */
	if (agf->agf_flcount == cpu_to_be32(0))
		return 0;

	/* Otherwise, walk from first to last, wrapping as needed. */
	for (;;) {
		error = walk_fn(mp, be32_to_cpu(agfl_bno[i]), priv);
		if (error)
			return error;
		if (i == be32_to_cpu(agf->agf_fllast))
			break;
		if (++i == xfs_agfl_size(mp))
			i = 0;
	}

	return 0;
}

int __init
xfs_extfree_intent_init_cache(void)
{
	xfs_extfree_item_cache = kmem_cache_create("xfs_extfree_intent",
			sizeof(struct xfs_extent_free_item),
			0, 0, NULL);

	return xfs_extfree_item_cache != NULL ? 0 : -ENOMEM;
}

void
xfs_extfree_intent_destroy_cache(void)
{
	kmem_cache_destroy(xfs_extfree_item_cache);
	xfs_extfree_item_cache = NULL;
}
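/*
 * Example xfs_agfl_walk_fn callback for xfs_agfl_walk() above (an editor's
 * illustrative sketch only; "seen" is a hypothetical cookie passed via priv):
 *
 *	static int
 *	xfs_agfl_count_walker(
 *		struct xfs_mount	*mp,
 *		xfs_agblock_t		agbno,
 *		void			*priv)
 *	{
 *		unsigned int	*seen = priv;
 *
 *		(*seen)++;
 *		return 0;
 *	}
 *
 * As the comment on xfs_agfl_walk() notes, returning a negative errno or an
 * XFS_ITER_* value from the callback ends the walk early.
 */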