1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. 4 * All Rights Reserved. 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_format.h" 9 #include "xfs_log_format.h" 10 #include "xfs_shared.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_bit.h" 13 #include "xfs_mount.h" 14 #include "xfs_defer.h" 15 #include "xfs_btree.h" 16 #include "xfs_rmap.h" 17 #include "xfs_alloc_btree.h" 18 #include "xfs_alloc.h" 19 #include "xfs_extent_busy.h" 20 #include "xfs_errortag.h" 21 #include "xfs_error.h" 22 #include "xfs_trace.h" 23 #include "xfs_trans.h" 24 #include "xfs_buf_item.h" 25 #include "xfs_log.h" 26 #include "xfs_ag.h" 27 #include "xfs_ag_resv.h" 28 #include "xfs_bmap.h" 29 30 struct kmem_cache *xfs_extfree_item_cache; 31 32 struct workqueue_struct *xfs_alloc_wq; 33 34 #define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b))) 35 36 #define XFSA_FIXUP_BNO_OK 1 37 #define XFSA_FIXUP_CNT_OK 2 38 39 /* 40 * Size of the AGFL. For CRC-enabled filesystes we steal a couple of slots in 41 * the beginning of the block for a proper header with the location information 42 * and CRC. 43 */ 44 unsigned int 45 xfs_agfl_size( 46 struct xfs_mount *mp) 47 { 48 unsigned int size = mp->m_sb.sb_sectsize; 49 50 if (xfs_has_crc(mp)) 51 size -= sizeof(struct xfs_agfl); 52 53 return size / sizeof(xfs_agblock_t); 54 } 55 56 unsigned int 57 xfs_refc_block( 58 struct xfs_mount *mp) 59 { 60 if (xfs_has_rmapbt(mp)) 61 return XFS_RMAP_BLOCK(mp) + 1; 62 if (xfs_has_finobt(mp)) 63 return XFS_FIBT_BLOCK(mp) + 1; 64 return XFS_IBT_BLOCK(mp) + 1; 65 } 66 67 xfs_extlen_t 68 xfs_prealloc_blocks( 69 struct xfs_mount *mp) 70 { 71 if (xfs_has_reflink(mp)) 72 return xfs_refc_block(mp) + 1; 73 if (xfs_has_rmapbt(mp)) 74 return XFS_RMAP_BLOCK(mp) + 1; 75 if (xfs_has_finobt(mp)) 76 return XFS_FIBT_BLOCK(mp) + 1; 77 return XFS_IBT_BLOCK(mp) + 1; 78 } 79 80 /* 81 * The number of blocks per AG that we withhold from xfs_mod_fdblocks to 82 * guarantee that we can refill the AGFL prior to allocating space in a nearly 83 * full AG. Although the space described by the free space btrees, the 84 * blocks used by the freesp btrees themselves, and the blocks owned by the 85 * AGFL are counted in the ondisk fdblocks, it's a mistake to let the ondisk 86 * free space in the AG drop so low that the free space btrees cannot refill an 87 * empty AGFL up to the minimum level. Rather than grind through empty AGs 88 * until the fs goes down, we subtract this many AG blocks from the incore 89 * fdblocks to ensure user allocation does not overcommit the space the 90 * filesystem needs for the AGFLs. The rmap btree uses a per-AG reservation to 91 * withhold space from xfs_mod_fdblocks, so we do not account for that here. 92 */ 93 #define XFS_ALLOCBT_AGFL_RESERVE 4 94 95 /* 96 * Compute the number of blocks that we set aside to guarantee the ability to 97 * refill the AGFL and handle a full bmap btree split. 98 * 99 * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of 100 * AGF buffer (PV 947395), we place constraints on the relationship among 101 * actual allocations for data blocks, freelist blocks, and potential file data 102 * bmap btree blocks. However, these restrictions may result in no actual space 103 * allocated for a delayed extent, for example, a data block in a certain AG is 104 * allocated but there is no additional block for the additional bmap btree 105 * block due to a split of the bmap btree of the file. The result of this may 106 * lead to an infinite loop when the file gets flushed to disk and all delayed 107 * extents need to be actually allocated. To get around this, we explicitly set 108 * aside a few blocks which will not be reserved in delayed allocation. 109 * 110 * For each AG, we need to reserve enough blocks to replenish a totally empty 111 * AGFL and 4 more to handle a potential split of the file's bmap btree. 112 */ 113 unsigned int 114 xfs_alloc_set_aside( 115 struct xfs_mount *mp) 116 { 117 return mp->m_sb.sb_agcount * (XFS_ALLOCBT_AGFL_RESERVE + 4); 118 } 119 120 /* 121 * When deciding how much space to allocate out of an AG, we limit the 122 * allocation maximum size to the size the AG. However, we cannot use all the 123 * blocks in the AG - some are permanently used by metadata. These 124 * blocks are generally: 125 * - the AG superblock, AGF, AGI and AGFL 126 * - the AGF (bno and cnt) and AGI btree root blocks, and optionally 127 * the AGI free inode and rmap btree root blocks. 128 * - blocks on the AGFL according to xfs_alloc_set_aside() limits 129 * - the rmapbt root block 130 * 131 * The AG headers are sector sized, so the amount of space they take up is 132 * dependent on filesystem geometry. The others are all single blocks. 133 */ 134 unsigned int 135 xfs_alloc_ag_max_usable( 136 struct xfs_mount *mp) 137 { 138 unsigned int blocks; 139 140 blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */ 141 blocks += XFS_ALLOCBT_AGFL_RESERVE; 142 blocks += 3; /* AGF, AGI btree root blocks */ 143 if (xfs_has_finobt(mp)) 144 blocks++; /* finobt root block */ 145 if (xfs_has_rmapbt(mp)) 146 blocks++; /* rmap root block */ 147 if (xfs_has_reflink(mp)) 148 blocks++; /* refcount root block */ 149 150 return mp->m_sb.sb_agblocks - blocks; 151 } 152 153 /* 154 * Lookup the record equal to [bno, len] in the btree given by cur. 155 */ 156 STATIC int /* error */ 157 xfs_alloc_lookup_eq( 158 struct xfs_btree_cur *cur, /* btree cursor */ 159 xfs_agblock_t bno, /* starting block of extent */ 160 xfs_extlen_t len, /* length of extent */ 161 int *stat) /* success/failure */ 162 { 163 int error; 164 165 cur->bc_rec.a.ar_startblock = bno; 166 cur->bc_rec.a.ar_blockcount = len; 167 error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); 168 cur->bc_ag.abt.active = (*stat == 1); 169 return error; 170 } 171 172 /* 173 * Lookup the first record greater than or equal to [bno, len] 174 * in the btree given by cur. 175 */ 176 int /* error */ 177 xfs_alloc_lookup_ge( 178 struct xfs_btree_cur *cur, /* btree cursor */ 179 xfs_agblock_t bno, /* starting block of extent */ 180 xfs_extlen_t len, /* length of extent */ 181 int *stat) /* success/failure */ 182 { 183 int error; 184 185 cur->bc_rec.a.ar_startblock = bno; 186 cur->bc_rec.a.ar_blockcount = len; 187 error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); 188 cur->bc_ag.abt.active = (*stat == 1); 189 return error; 190 } 191 192 /* 193 * Lookup the first record less than or equal to [bno, len] 194 * in the btree given by cur. 195 */ 196 int /* error */ 197 xfs_alloc_lookup_le( 198 struct xfs_btree_cur *cur, /* btree cursor */ 199 xfs_agblock_t bno, /* starting block of extent */ 200 xfs_extlen_t len, /* length of extent */ 201 int *stat) /* success/failure */ 202 { 203 int error; 204 cur->bc_rec.a.ar_startblock = bno; 205 cur->bc_rec.a.ar_blockcount = len; 206 error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); 207 cur->bc_ag.abt.active = (*stat == 1); 208 return error; 209 } 210 211 static inline bool 212 xfs_alloc_cur_active( 213 struct xfs_btree_cur *cur) 214 { 215 return cur && cur->bc_ag.abt.active; 216 } 217 218 /* 219 * Update the record referred to by cur to the value given 220 * by [bno, len]. 221 * This either works (return 0) or gets an EFSCORRUPTED error. 222 */ 223 STATIC int /* error */ 224 xfs_alloc_update( 225 struct xfs_btree_cur *cur, /* btree cursor */ 226 xfs_agblock_t bno, /* starting block of extent */ 227 xfs_extlen_t len) /* length of extent */ 228 { 229 union xfs_btree_rec rec; 230 231 rec.alloc.ar_startblock = cpu_to_be32(bno); 232 rec.alloc.ar_blockcount = cpu_to_be32(len); 233 return xfs_btree_update(cur, &rec); 234 } 235 236 /* 237 * Get the data from the pointed-to record. 238 */ 239 int /* error */ 240 xfs_alloc_get_rec( 241 struct xfs_btree_cur *cur, /* btree cursor */ 242 xfs_agblock_t *bno, /* output: starting block of extent */ 243 xfs_extlen_t *len, /* output: length of extent */ 244 int *stat) /* output: success/failure */ 245 { 246 struct xfs_mount *mp = cur->bc_mp; 247 struct xfs_perag *pag = cur->bc_ag.pag; 248 union xfs_btree_rec *rec; 249 int error; 250 251 error = xfs_btree_get_rec(cur, &rec, stat); 252 if (error || !(*stat)) 253 return error; 254 255 *bno = be32_to_cpu(rec->alloc.ar_startblock); 256 *len = be32_to_cpu(rec->alloc.ar_blockcount); 257 258 if (*len == 0) 259 goto out_bad_rec; 260 261 /* check for valid extent range, including overflow */ 262 if (!xfs_verify_agbext(pag, *bno, *len)) 263 goto out_bad_rec; 264 265 return 0; 266 267 out_bad_rec: 268 xfs_warn(mp, 269 "%s Freespace BTree record corruption in AG %d detected!", 270 cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size", 271 pag->pag_agno); 272 xfs_warn(mp, 273 "start block 0x%x block count 0x%x", *bno, *len); 274 return -EFSCORRUPTED; 275 } 276 277 /* 278 * Compute aligned version of the found extent. 279 * Takes alignment and min length into account. 280 */ 281 STATIC bool 282 xfs_alloc_compute_aligned( 283 xfs_alloc_arg_t *args, /* allocation argument structure */ 284 xfs_agblock_t foundbno, /* starting block in found extent */ 285 xfs_extlen_t foundlen, /* length in found extent */ 286 xfs_agblock_t *resbno, /* result block number */ 287 xfs_extlen_t *reslen, /* result length */ 288 unsigned *busy_gen) 289 { 290 xfs_agblock_t bno = foundbno; 291 xfs_extlen_t len = foundlen; 292 xfs_extlen_t diff; 293 bool busy; 294 295 /* Trim busy sections out of found extent */ 296 busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen); 297 298 /* 299 * If we have a largish extent that happens to start before min_agbno, 300 * see if we can shift it into range... 301 */ 302 if (bno < args->min_agbno && bno + len > args->min_agbno) { 303 diff = args->min_agbno - bno; 304 if (len > diff) { 305 bno += diff; 306 len -= diff; 307 } 308 } 309 310 if (args->alignment > 1 && len >= args->minlen) { 311 xfs_agblock_t aligned_bno = roundup(bno, args->alignment); 312 313 diff = aligned_bno - bno; 314 315 *resbno = aligned_bno; 316 *reslen = diff >= len ? 0 : len - diff; 317 } else { 318 *resbno = bno; 319 *reslen = len; 320 } 321 322 return busy; 323 } 324 325 /* 326 * Compute best start block and diff for "near" allocations. 327 * freelen >= wantlen already checked by caller. 328 */ 329 STATIC xfs_extlen_t /* difference value (absolute) */ 330 xfs_alloc_compute_diff( 331 xfs_agblock_t wantbno, /* target starting block */ 332 xfs_extlen_t wantlen, /* target length */ 333 xfs_extlen_t alignment, /* target alignment */ 334 int datatype, /* are we allocating data? */ 335 xfs_agblock_t freebno, /* freespace's starting block */ 336 xfs_extlen_t freelen, /* freespace's length */ 337 xfs_agblock_t *newbnop) /* result: best start block from free */ 338 { 339 xfs_agblock_t freeend; /* end of freespace extent */ 340 xfs_agblock_t newbno1; /* return block number */ 341 xfs_agblock_t newbno2; /* other new block number */ 342 xfs_extlen_t newlen1=0; /* length with newbno1 */ 343 xfs_extlen_t newlen2=0; /* length with newbno2 */ 344 xfs_agblock_t wantend; /* end of target extent */ 345 bool userdata = datatype & XFS_ALLOC_USERDATA; 346 347 ASSERT(freelen >= wantlen); 348 freeend = freebno + freelen; 349 wantend = wantbno + wantlen; 350 /* 351 * We want to allocate from the start of a free extent if it is past 352 * the desired block or if we are allocating user data and the free 353 * extent is before desired block. The second case is there to allow 354 * for contiguous allocation from the remaining free space if the file 355 * grows in the short term. 356 */ 357 if (freebno >= wantbno || (userdata && freeend < wantend)) { 358 if ((newbno1 = roundup(freebno, alignment)) >= freeend) 359 newbno1 = NULLAGBLOCK; 360 } else if (freeend >= wantend && alignment > 1) { 361 newbno1 = roundup(wantbno, alignment); 362 newbno2 = newbno1 - alignment; 363 if (newbno1 >= freeend) 364 newbno1 = NULLAGBLOCK; 365 else 366 newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1); 367 if (newbno2 < freebno) 368 newbno2 = NULLAGBLOCK; 369 else 370 newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2); 371 if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) { 372 if (newlen1 < newlen2 || 373 (newlen1 == newlen2 && 374 XFS_ABSDIFF(newbno1, wantbno) > 375 XFS_ABSDIFF(newbno2, wantbno))) 376 newbno1 = newbno2; 377 } else if (newbno2 != NULLAGBLOCK) 378 newbno1 = newbno2; 379 } else if (freeend >= wantend) { 380 newbno1 = wantbno; 381 } else if (alignment > 1) { 382 newbno1 = roundup(freeend - wantlen, alignment); 383 if (newbno1 > freeend - wantlen && 384 newbno1 - alignment >= freebno) 385 newbno1 -= alignment; 386 else if (newbno1 >= freeend) 387 newbno1 = NULLAGBLOCK; 388 } else 389 newbno1 = freeend - wantlen; 390 *newbnop = newbno1; 391 return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno); 392 } 393 394 /* 395 * Fix up the length, based on mod and prod. 396 * len should be k * prod + mod for some k. 397 * If len is too small it is returned unchanged. 398 * If len hits maxlen it is left alone. 399 */ 400 STATIC void 401 xfs_alloc_fix_len( 402 xfs_alloc_arg_t *args) /* allocation argument structure */ 403 { 404 xfs_extlen_t k; 405 xfs_extlen_t rlen; 406 407 ASSERT(args->mod < args->prod); 408 rlen = args->len; 409 ASSERT(rlen >= args->minlen); 410 ASSERT(rlen <= args->maxlen); 411 if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen || 412 (args->mod == 0 && rlen < args->prod)) 413 return; 414 k = rlen % args->prod; 415 if (k == args->mod) 416 return; 417 if (k > args->mod) 418 rlen = rlen - (k - args->mod); 419 else 420 rlen = rlen - args->prod + (args->mod - k); 421 /* casts to (int) catch length underflows */ 422 if ((int)rlen < (int)args->minlen) 423 return; 424 ASSERT(rlen >= args->minlen && rlen <= args->maxlen); 425 ASSERT(rlen % args->prod == args->mod); 426 ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >= 427 rlen + args->minleft); 428 args->len = rlen; 429 } 430 431 /* 432 * Update the two btrees, logically removing from freespace the extent 433 * starting at rbno, rlen blocks. The extent is contained within the 434 * actual (current) free extent fbno for flen blocks. 435 * Flags are passed in indicating whether the cursors are set to the 436 * relevant records. 437 */ 438 STATIC int /* error code */ 439 xfs_alloc_fixup_trees( 440 struct xfs_btree_cur *cnt_cur, /* cursor for by-size btree */ 441 struct xfs_btree_cur *bno_cur, /* cursor for by-block btree */ 442 xfs_agblock_t fbno, /* starting block of free extent */ 443 xfs_extlen_t flen, /* length of free extent */ 444 xfs_agblock_t rbno, /* starting block of returned extent */ 445 xfs_extlen_t rlen, /* length of returned extent */ 446 int flags) /* flags, XFSA_FIXUP_... */ 447 { 448 int error; /* error code */ 449 int i; /* operation results */ 450 xfs_agblock_t nfbno1; /* first new free startblock */ 451 xfs_agblock_t nfbno2; /* second new free startblock */ 452 xfs_extlen_t nflen1=0; /* first new free length */ 453 xfs_extlen_t nflen2=0; /* second new free length */ 454 struct xfs_mount *mp; 455 456 mp = cnt_cur->bc_mp; 457 458 /* 459 * Look up the record in the by-size tree if necessary. 460 */ 461 if (flags & XFSA_FIXUP_CNT_OK) { 462 #ifdef DEBUG 463 if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i))) 464 return error; 465 if (XFS_IS_CORRUPT(mp, 466 i != 1 || 467 nfbno1 != fbno || 468 nflen1 != flen)) 469 return -EFSCORRUPTED; 470 #endif 471 } else { 472 if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i))) 473 return error; 474 if (XFS_IS_CORRUPT(mp, i != 1)) 475 return -EFSCORRUPTED; 476 } 477 /* 478 * Look up the record in the by-block tree if necessary. 479 */ 480 if (flags & XFSA_FIXUP_BNO_OK) { 481 #ifdef DEBUG 482 if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i))) 483 return error; 484 if (XFS_IS_CORRUPT(mp, 485 i != 1 || 486 nfbno1 != fbno || 487 nflen1 != flen)) 488 return -EFSCORRUPTED; 489 #endif 490 } else { 491 if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i))) 492 return error; 493 if (XFS_IS_CORRUPT(mp, i != 1)) 494 return -EFSCORRUPTED; 495 } 496 497 #ifdef DEBUG 498 if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) { 499 struct xfs_btree_block *bnoblock; 500 struct xfs_btree_block *cntblock; 501 502 bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_levels[0].bp); 503 cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_levels[0].bp); 504 505 if (XFS_IS_CORRUPT(mp, 506 bnoblock->bb_numrecs != 507 cntblock->bb_numrecs)) 508 return -EFSCORRUPTED; 509 } 510 #endif 511 512 /* 513 * Deal with all four cases: the allocated record is contained 514 * within the freespace record, so we can have new freespace 515 * at either (or both) end, or no freespace remaining. 516 */ 517 if (rbno == fbno && rlen == flen) 518 nfbno1 = nfbno2 = NULLAGBLOCK; 519 else if (rbno == fbno) { 520 nfbno1 = rbno + rlen; 521 nflen1 = flen - rlen; 522 nfbno2 = NULLAGBLOCK; 523 } else if (rbno + rlen == fbno + flen) { 524 nfbno1 = fbno; 525 nflen1 = flen - rlen; 526 nfbno2 = NULLAGBLOCK; 527 } else { 528 nfbno1 = fbno; 529 nflen1 = rbno - fbno; 530 nfbno2 = rbno + rlen; 531 nflen2 = (fbno + flen) - nfbno2; 532 } 533 /* 534 * Delete the entry from the by-size btree. 535 */ 536 if ((error = xfs_btree_delete(cnt_cur, &i))) 537 return error; 538 if (XFS_IS_CORRUPT(mp, i != 1)) 539 return -EFSCORRUPTED; 540 /* 541 * Add new by-size btree entry(s). 542 */ 543 if (nfbno1 != NULLAGBLOCK) { 544 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i))) 545 return error; 546 if (XFS_IS_CORRUPT(mp, i != 0)) 547 return -EFSCORRUPTED; 548 if ((error = xfs_btree_insert(cnt_cur, &i))) 549 return error; 550 if (XFS_IS_CORRUPT(mp, i != 1)) 551 return -EFSCORRUPTED; 552 } 553 if (nfbno2 != NULLAGBLOCK) { 554 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i))) 555 return error; 556 if (XFS_IS_CORRUPT(mp, i != 0)) 557 return -EFSCORRUPTED; 558 if ((error = xfs_btree_insert(cnt_cur, &i))) 559 return error; 560 if (XFS_IS_CORRUPT(mp, i != 1)) 561 return -EFSCORRUPTED; 562 } 563 /* 564 * Fix up the by-block btree entry(s). 565 */ 566 if (nfbno1 == NULLAGBLOCK) { 567 /* 568 * No remaining freespace, just delete the by-block tree entry. 569 */ 570 if ((error = xfs_btree_delete(bno_cur, &i))) 571 return error; 572 if (XFS_IS_CORRUPT(mp, i != 1)) 573 return -EFSCORRUPTED; 574 } else { 575 /* 576 * Update the by-block entry to start later|be shorter. 577 */ 578 if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1))) 579 return error; 580 } 581 if (nfbno2 != NULLAGBLOCK) { 582 /* 583 * 2 resulting free entries, need to add one. 584 */ 585 if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i))) 586 return error; 587 if (XFS_IS_CORRUPT(mp, i != 0)) 588 return -EFSCORRUPTED; 589 if ((error = xfs_btree_insert(bno_cur, &i))) 590 return error; 591 if (XFS_IS_CORRUPT(mp, i != 1)) 592 return -EFSCORRUPTED; 593 } 594 return 0; 595 } 596 597 static xfs_failaddr_t 598 xfs_agfl_verify( 599 struct xfs_buf *bp) 600 { 601 struct xfs_mount *mp = bp->b_mount; 602 struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp); 603 __be32 *agfl_bno = xfs_buf_to_agfl_bno(bp); 604 int i; 605 606 /* 607 * There is no verification of non-crc AGFLs because mkfs does not 608 * initialise the AGFL to zero or NULL. Hence the only valid part of the 609 * AGFL is what the AGF says is active. We can't get to the AGF, so we 610 * can't verify just those entries are valid. 611 */ 612 if (!xfs_has_crc(mp)) 613 return NULL; 614 615 if (!xfs_verify_magic(bp, agfl->agfl_magicnum)) 616 return __this_address; 617 if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid)) 618 return __this_address; 619 /* 620 * during growfs operations, the perag is not fully initialised, 621 * so we can't use it for any useful checking. growfs ensures we can't 622 * use it by using uncached buffers that don't have the perag attached 623 * so we can detect and avoid this problem. 624 */ 625 if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno) 626 return __this_address; 627 628 for (i = 0; i < xfs_agfl_size(mp); i++) { 629 if (be32_to_cpu(agfl_bno[i]) != NULLAGBLOCK && 630 be32_to_cpu(agfl_bno[i]) >= mp->m_sb.sb_agblocks) 631 return __this_address; 632 } 633 634 if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn))) 635 return __this_address; 636 return NULL; 637 } 638 639 static void 640 xfs_agfl_read_verify( 641 struct xfs_buf *bp) 642 { 643 struct xfs_mount *mp = bp->b_mount; 644 xfs_failaddr_t fa; 645 646 /* 647 * There is no verification of non-crc AGFLs because mkfs does not 648 * initialise the AGFL to zero or NULL. Hence the only valid part of the 649 * AGFL is what the AGF says is active. We can't get to the AGF, so we 650 * can't verify just those entries are valid. 651 */ 652 if (!xfs_has_crc(mp)) 653 return; 654 655 if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF)) 656 xfs_verifier_error(bp, -EFSBADCRC, __this_address); 657 else { 658 fa = xfs_agfl_verify(bp); 659 if (fa) 660 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 661 } 662 } 663 664 static void 665 xfs_agfl_write_verify( 666 struct xfs_buf *bp) 667 { 668 struct xfs_mount *mp = bp->b_mount; 669 struct xfs_buf_log_item *bip = bp->b_log_item; 670 xfs_failaddr_t fa; 671 672 /* no verification of non-crc AGFLs */ 673 if (!xfs_has_crc(mp)) 674 return; 675 676 fa = xfs_agfl_verify(bp); 677 if (fa) { 678 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 679 return; 680 } 681 682 if (bip) 683 XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn); 684 685 xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF); 686 } 687 688 const struct xfs_buf_ops xfs_agfl_buf_ops = { 689 .name = "xfs_agfl", 690 .magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) }, 691 .verify_read = xfs_agfl_read_verify, 692 .verify_write = xfs_agfl_write_verify, 693 .verify_struct = xfs_agfl_verify, 694 }; 695 696 /* 697 * Read in the allocation group free block array. 698 */ 699 int 700 xfs_alloc_read_agfl( 701 struct xfs_perag *pag, 702 struct xfs_trans *tp, 703 struct xfs_buf **bpp) 704 { 705 struct xfs_mount *mp = pag->pag_mount; 706 struct xfs_buf *bp; 707 int error; 708 709 error = xfs_trans_read_buf( 710 mp, tp, mp->m_ddev_targp, 711 XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)), 712 XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops); 713 if (error) 714 return error; 715 xfs_buf_set_ref(bp, XFS_AGFL_REF); 716 *bpp = bp; 717 return 0; 718 } 719 720 STATIC int 721 xfs_alloc_update_counters( 722 struct xfs_trans *tp, 723 struct xfs_buf *agbp, 724 long len) 725 { 726 struct xfs_agf *agf = agbp->b_addr; 727 728 agbp->b_pag->pagf_freeblks += len; 729 be32_add_cpu(&agf->agf_freeblks, len); 730 731 if (unlikely(be32_to_cpu(agf->agf_freeblks) > 732 be32_to_cpu(agf->agf_length))) { 733 xfs_buf_mark_corrupt(agbp); 734 return -EFSCORRUPTED; 735 } 736 737 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS); 738 return 0; 739 } 740 741 /* 742 * Block allocation algorithm and data structures. 743 */ 744 struct xfs_alloc_cur { 745 struct xfs_btree_cur *cnt; /* btree cursors */ 746 struct xfs_btree_cur *bnolt; 747 struct xfs_btree_cur *bnogt; 748 xfs_extlen_t cur_len;/* current search length */ 749 xfs_agblock_t rec_bno;/* extent startblock */ 750 xfs_extlen_t rec_len;/* extent length */ 751 xfs_agblock_t bno; /* alloc bno */ 752 xfs_extlen_t len; /* alloc len */ 753 xfs_extlen_t diff; /* diff from search bno */ 754 unsigned int busy_gen;/* busy state */ 755 bool busy; 756 }; 757 758 /* 759 * Set up cursors, etc. in the extent allocation cursor. This function can be 760 * called multiple times to reset an initialized structure without having to 761 * reallocate cursors. 762 */ 763 static int 764 xfs_alloc_cur_setup( 765 struct xfs_alloc_arg *args, 766 struct xfs_alloc_cur *acur) 767 { 768 int error; 769 int i; 770 771 acur->cur_len = args->maxlen; 772 acur->rec_bno = 0; 773 acur->rec_len = 0; 774 acur->bno = 0; 775 acur->len = 0; 776 acur->diff = -1; 777 acur->busy = false; 778 acur->busy_gen = 0; 779 780 /* 781 * Perform an initial cntbt lookup to check for availability of maxlen 782 * extents. If this fails, we'll return -ENOSPC to signal the caller to 783 * attempt a small allocation. 784 */ 785 if (!acur->cnt) 786 acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp, 787 args->agbp, args->pag, XFS_BTNUM_CNT); 788 error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i); 789 if (error) 790 return error; 791 792 /* 793 * Allocate the bnobt left and right search cursors. 794 */ 795 if (!acur->bnolt) 796 acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp, 797 args->agbp, args->pag, XFS_BTNUM_BNO); 798 if (!acur->bnogt) 799 acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp, 800 args->agbp, args->pag, XFS_BTNUM_BNO); 801 return i == 1 ? 0 : -ENOSPC; 802 } 803 804 static void 805 xfs_alloc_cur_close( 806 struct xfs_alloc_cur *acur, 807 bool error) 808 { 809 int cur_error = XFS_BTREE_NOERROR; 810 811 if (error) 812 cur_error = XFS_BTREE_ERROR; 813 814 if (acur->cnt) 815 xfs_btree_del_cursor(acur->cnt, cur_error); 816 if (acur->bnolt) 817 xfs_btree_del_cursor(acur->bnolt, cur_error); 818 if (acur->bnogt) 819 xfs_btree_del_cursor(acur->bnogt, cur_error); 820 acur->cnt = acur->bnolt = acur->bnogt = NULL; 821 } 822 823 /* 824 * Check an extent for allocation and track the best available candidate in the 825 * allocation structure. The cursor is deactivated if it has entered an out of 826 * range state based on allocation arguments. Optionally return the extent 827 * extent geometry and allocation status if requested by the caller. 828 */ 829 static int 830 xfs_alloc_cur_check( 831 struct xfs_alloc_arg *args, 832 struct xfs_alloc_cur *acur, 833 struct xfs_btree_cur *cur, 834 int *new) 835 { 836 int error, i; 837 xfs_agblock_t bno, bnoa, bnew; 838 xfs_extlen_t len, lena, diff = -1; 839 bool busy; 840 unsigned busy_gen = 0; 841 bool deactivate = false; 842 bool isbnobt = cur->bc_btnum == XFS_BTNUM_BNO; 843 844 *new = 0; 845 846 error = xfs_alloc_get_rec(cur, &bno, &len, &i); 847 if (error) 848 return error; 849 if (XFS_IS_CORRUPT(args->mp, i != 1)) 850 return -EFSCORRUPTED; 851 852 /* 853 * Check minlen and deactivate a cntbt cursor if out of acceptable size 854 * range (i.e., walking backwards looking for a minlen extent). 855 */ 856 if (len < args->minlen) { 857 deactivate = !isbnobt; 858 goto out; 859 } 860 861 busy = xfs_alloc_compute_aligned(args, bno, len, &bnoa, &lena, 862 &busy_gen); 863 acur->busy |= busy; 864 if (busy) 865 acur->busy_gen = busy_gen; 866 /* deactivate a bnobt cursor outside of locality range */ 867 if (bnoa < args->min_agbno || bnoa > args->max_agbno) { 868 deactivate = isbnobt; 869 goto out; 870 } 871 if (lena < args->minlen) 872 goto out; 873 874 args->len = XFS_EXTLEN_MIN(lena, args->maxlen); 875 xfs_alloc_fix_len(args); 876 ASSERT(args->len >= args->minlen); 877 if (args->len < acur->len) 878 goto out; 879 880 /* 881 * We have an aligned record that satisfies minlen and beats or matches 882 * the candidate extent size. Compare locality for near allocation mode. 883 */ 884 diff = xfs_alloc_compute_diff(args->agbno, args->len, 885 args->alignment, args->datatype, 886 bnoa, lena, &bnew); 887 if (bnew == NULLAGBLOCK) 888 goto out; 889 890 /* 891 * Deactivate a bnobt cursor with worse locality than the current best. 892 */ 893 if (diff > acur->diff) { 894 deactivate = isbnobt; 895 goto out; 896 } 897 898 ASSERT(args->len > acur->len || 899 (args->len == acur->len && diff <= acur->diff)); 900 acur->rec_bno = bno; 901 acur->rec_len = len; 902 acur->bno = bnew; 903 acur->len = args->len; 904 acur->diff = diff; 905 *new = 1; 906 907 /* 908 * We're done if we found a perfect allocation. This only deactivates 909 * the current cursor, but this is just an optimization to terminate a 910 * cntbt search that otherwise runs to the edge of the tree. 911 */ 912 if (acur->diff == 0 && acur->len == args->maxlen) 913 deactivate = true; 914 out: 915 if (deactivate) 916 cur->bc_ag.abt.active = false; 917 trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff, 918 *new); 919 return 0; 920 } 921 922 /* 923 * Complete an allocation of a candidate extent. Remove the extent from both 924 * trees and update the args structure. 925 */ 926 STATIC int 927 xfs_alloc_cur_finish( 928 struct xfs_alloc_arg *args, 929 struct xfs_alloc_cur *acur) 930 { 931 struct xfs_agf __maybe_unused *agf = args->agbp->b_addr; 932 int error; 933 934 ASSERT(acur->cnt && acur->bnolt); 935 ASSERT(acur->bno >= acur->rec_bno); 936 ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len); 937 ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length)); 938 939 error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno, 940 acur->rec_len, acur->bno, acur->len, 0); 941 if (error) 942 return error; 943 944 args->agbno = acur->bno; 945 args->len = acur->len; 946 args->wasfromfl = 0; 947 948 trace_xfs_alloc_cur(args); 949 return 0; 950 } 951 952 /* 953 * Locality allocation lookup algorithm. This expects a cntbt cursor and uses 954 * bno optimized lookup to search for extents with ideal size and locality. 955 */ 956 STATIC int 957 xfs_alloc_cntbt_iter( 958 struct xfs_alloc_arg *args, 959 struct xfs_alloc_cur *acur) 960 { 961 struct xfs_btree_cur *cur = acur->cnt; 962 xfs_agblock_t bno; 963 xfs_extlen_t len, cur_len; 964 int error; 965 int i; 966 967 if (!xfs_alloc_cur_active(cur)) 968 return 0; 969 970 /* locality optimized lookup */ 971 cur_len = acur->cur_len; 972 error = xfs_alloc_lookup_ge(cur, args->agbno, cur_len, &i); 973 if (error) 974 return error; 975 if (i == 0) 976 return 0; 977 error = xfs_alloc_get_rec(cur, &bno, &len, &i); 978 if (error) 979 return error; 980 981 /* check the current record and update search length from it */ 982 error = xfs_alloc_cur_check(args, acur, cur, &i); 983 if (error) 984 return error; 985 ASSERT(len >= acur->cur_len); 986 acur->cur_len = len; 987 988 /* 989 * We looked up the first record >= [agbno, len] above. The agbno is a 990 * secondary key and so the current record may lie just before or after 991 * agbno. If it is past agbno, check the previous record too so long as 992 * the length matches as it may be closer. Don't check a smaller record 993 * because that could deactivate our cursor. 994 */ 995 if (bno > args->agbno) { 996 error = xfs_btree_decrement(cur, 0, &i); 997 if (!error && i) { 998 error = xfs_alloc_get_rec(cur, &bno, &len, &i); 999 if (!error && i && len == acur->cur_len) 1000 error = xfs_alloc_cur_check(args, acur, cur, 1001 &i); 1002 } 1003 if (error) 1004 return error; 1005 } 1006 1007 /* 1008 * Increment the search key until we find at least one allocation 1009 * candidate or if the extent we found was larger. Otherwise, double the 1010 * search key to optimize the search. Efficiency is more important here 1011 * than absolute best locality. 1012 */ 1013 cur_len <<= 1; 1014 if (!acur->len || acur->cur_len >= cur_len) 1015 acur->cur_len++; 1016 else 1017 acur->cur_len = cur_len; 1018 1019 return error; 1020 } 1021 1022 /* 1023 * Deal with the case where only small freespaces remain. Either return the 1024 * contents of the last freespace record, or allocate space from the freelist if 1025 * there is nothing in the tree. 1026 */ 1027 STATIC int /* error */ 1028 xfs_alloc_ag_vextent_small( 1029 struct xfs_alloc_arg *args, /* allocation argument structure */ 1030 struct xfs_btree_cur *ccur, /* optional by-size cursor */ 1031 xfs_agblock_t *fbnop, /* result block number */ 1032 xfs_extlen_t *flenp, /* result length */ 1033 int *stat) /* status: 0-freelist, 1-normal/none */ 1034 { 1035 struct xfs_agf *agf = args->agbp->b_addr; 1036 int error = 0; 1037 xfs_agblock_t fbno = NULLAGBLOCK; 1038 xfs_extlen_t flen = 0; 1039 int i = 0; 1040 1041 /* 1042 * If a cntbt cursor is provided, try to allocate the largest record in 1043 * the tree. Try the AGFL if the cntbt is empty, otherwise fail the 1044 * allocation. Make sure to respect minleft even when pulling from the 1045 * freelist. 1046 */ 1047 if (ccur) 1048 error = xfs_btree_decrement(ccur, 0, &i); 1049 if (error) 1050 goto error; 1051 if (i) { 1052 error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i); 1053 if (error) 1054 goto error; 1055 if (XFS_IS_CORRUPT(args->mp, i != 1)) { 1056 error = -EFSCORRUPTED; 1057 goto error; 1058 } 1059 goto out; 1060 } 1061 1062 if (args->minlen != 1 || args->alignment != 1 || 1063 args->resv == XFS_AG_RESV_AGFL || 1064 be32_to_cpu(agf->agf_flcount) <= args->minleft) 1065 goto out; 1066 1067 error = xfs_alloc_get_freelist(args->pag, args->tp, args->agbp, 1068 &fbno, 0); 1069 if (error) 1070 goto error; 1071 if (fbno == NULLAGBLOCK) 1072 goto out; 1073 1074 xfs_extent_busy_reuse(args->mp, args->pag, fbno, 1, 1075 (args->datatype & XFS_ALLOC_NOBUSY)); 1076 1077 if (args->datatype & XFS_ALLOC_USERDATA) { 1078 struct xfs_buf *bp; 1079 1080 error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp, 1081 XFS_AGB_TO_DADDR(args->mp, args->agno, fbno), 1082 args->mp->m_bsize, 0, &bp); 1083 if (error) 1084 goto error; 1085 xfs_trans_binval(args->tp, bp); 1086 } 1087 *fbnop = args->agbno = fbno; 1088 *flenp = args->len = 1; 1089 if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) { 1090 error = -EFSCORRUPTED; 1091 goto error; 1092 } 1093 args->wasfromfl = 1; 1094 trace_xfs_alloc_small_freelist(args); 1095 1096 /* 1097 * If we're feeding an AGFL block to something that doesn't live in the 1098 * free space, we need to clear out the OWN_AG rmap. 1099 */ 1100 error = xfs_rmap_free(args->tp, args->agbp, args->pag, fbno, 1, 1101 &XFS_RMAP_OINFO_AG); 1102 if (error) 1103 goto error; 1104 1105 *stat = 0; 1106 return 0; 1107 1108 out: 1109 /* 1110 * Can't do the allocation, give up. 1111 */ 1112 if (flen < args->minlen) { 1113 args->agbno = NULLAGBLOCK; 1114 trace_xfs_alloc_small_notenough(args); 1115 flen = 0; 1116 } 1117 *fbnop = fbno; 1118 *flenp = flen; 1119 *stat = 1; 1120 trace_xfs_alloc_small_done(args); 1121 return 0; 1122 1123 error: 1124 trace_xfs_alloc_small_error(args); 1125 return error; 1126 } 1127 1128 /* 1129 * Allocate a variable extent at exactly agno/bno. 1130 * Extent's length (returned in *len) will be between minlen and maxlen, 1131 * and of the form k * prod + mod unless there's nothing that large. 1132 * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it. 1133 */ 1134 STATIC int /* error */ 1135 xfs_alloc_ag_vextent_exact( 1136 xfs_alloc_arg_t *args) /* allocation argument structure */ 1137 { 1138 struct xfs_agf __maybe_unused *agf = args->agbp->b_addr; 1139 struct xfs_btree_cur *bno_cur;/* by block-number btree cursor */ 1140 struct xfs_btree_cur *cnt_cur;/* by count btree cursor */ 1141 int error; 1142 xfs_agblock_t fbno; /* start block of found extent */ 1143 xfs_extlen_t flen; /* length of found extent */ 1144 xfs_agblock_t tbno; /* start block of busy extent */ 1145 xfs_extlen_t tlen; /* length of busy extent */ 1146 xfs_agblock_t tend; /* end block of busy extent */ 1147 int i; /* success/failure of operation */ 1148 unsigned busy_gen; 1149 1150 ASSERT(args->alignment == 1); 1151 1152 /* 1153 * Allocate/initialize a cursor for the by-number freespace btree. 1154 */ 1155 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, 1156 args->pag, XFS_BTNUM_BNO); 1157 1158 /* 1159 * Lookup bno and minlen in the btree (minlen is irrelevant, really). 1160 * Look for the closest free block <= bno, it must contain bno 1161 * if any free block does. 1162 */ 1163 error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i); 1164 if (error) 1165 goto error0; 1166 if (!i) 1167 goto not_found; 1168 1169 /* 1170 * Grab the freespace record. 1171 */ 1172 error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i); 1173 if (error) 1174 goto error0; 1175 if (XFS_IS_CORRUPT(args->mp, i != 1)) { 1176 error = -EFSCORRUPTED; 1177 goto error0; 1178 } 1179 ASSERT(fbno <= args->agbno); 1180 1181 /* 1182 * Check for overlapping busy extents. 1183 */ 1184 tbno = fbno; 1185 tlen = flen; 1186 xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen); 1187 1188 /* 1189 * Give up if the start of the extent is busy, or the freespace isn't 1190 * long enough for the minimum request. 1191 */ 1192 if (tbno > args->agbno) 1193 goto not_found; 1194 if (tlen < args->minlen) 1195 goto not_found; 1196 tend = tbno + tlen; 1197 if (tend < args->agbno + args->minlen) 1198 goto not_found; 1199 1200 /* 1201 * End of extent will be smaller of the freespace end and the 1202 * maximal requested end. 1203 * 1204 * Fix the length according to mod and prod if given. 1205 */ 1206 args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen) 1207 - args->agbno; 1208 xfs_alloc_fix_len(args); 1209 ASSERT(args->agbno + args->len <= tend); 1210 1211 /* 1212 * We are allocating agbno for args->len 1213 * Allocate/initialize a cursor for the by-size btree. 1214 */ 1215 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, 1216 args->pag, XFS_BTNUM_CNT); 1217 ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length)); 1218 error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno, 1219 args->len, XFSA_FIXUP_BNO_OK); 1220 if (error) { 1221 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); 1222 goto error0; 1223 } 1224 1225 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 1226 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1227 1228 args->wasfromfl = 0; 1229 trace_xfs_alloc_exact_done(args); 1230 return 0; 1231 1232 not_found: 1233 /* Didn't find it, return null. */ 1234 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 1235 args->agbno = NULLAGBLOCK; 1236 trace_xfs_alloc_exact_notfound(args); 1237 return 0; 1238 1239 error0: 1240 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); 1241 trace_xfs_alloc_exact_error(args); 1242 return error; 1243 } 1244 1245 /* 1246 * Search a given number of btree records in a given direction. Check each 1247 * record against the good extent we've already found. 1248 */ 1249 STATIC int 1250 xfs_alloc_walk_iter( 1251 struct xfs_alloc_arg *args, 1252 struct xfs_alloc_cur *acur, 1253 struct xfs_btree_cur *cur, 1254 bool increment, 1255 bool find_one, /* quit on first candidate */ 1256 int count, /* rec count (-1 for infinite) */ 1257 int *stat) 1258 { 1259 int error; 1260 int i; 1261 1262 *stat = 0; 1263 1264 /* 1265 * Search so long as the cursor is active or we find a better extent. 1266 * The cursor is deactivated if it extends beyond the range of the 1267 * current allocation candidate. 1268 */ 1269 while (xfs_alloc_cur_active(cur) && count) { 1270 error = xfs_alloc_cur_check(args, acur, cur, &i); 1271 if (error) 1272 return error; 1273 if (i == 1) { 1274 *stat = 1; 1275 if (find_one) 1276 break; 1277 } 1278 if (!xfs_alloc_cur_active(cur)) 1279 break; 1280 1281 if (increment) 1282 error = xfs_btree_increment(cur, 0, &i); 1283 else 1284 error = xfs_btree_decrement(cur, 0, &i); 1285 if (error) 1286 return error; 1287 if (i == 0) 1288 cur->bc_ag.abt.active = false; 1289 1290 if (count > 0) 1291 count--; 1292 } 1293 1294 return 0; 1295 } 1296 1297 /* 1298 * Search the by-bno and by-size btrees in parallel in search of an extent with 1299 * ideal locality based on the NEAR mode ->agbno locality hint. 1300 */ 1301 STATIC int 1302 xfs_alloc_ag_vextent_locality( 1303 struct xfs_alloc_arg *args, 1304 struct xfs_alloc_cur *acur, 1305 int *stat) 1306 { 1307 struct xfs_btree_cur *fbcur = NULL; 1308 int error; 1309 int i; 1310 bool fbinc; 1311 1312 ASSERT(acur->len == 0); 1313 1314 *stat = 0; 1315 1316 error = xfs_alloc_lookup_ge(acur->cnt, args->agbno, acur->cur_len, &i); 1317 if (error) 1318 return error; 1319 error = xfs_alloc_lookup_le(acur->bnolt, args->agbno, 0, &i); 1320 if (error) 1321 return error; 1322 error = xfs_alloc_lookup_ge(acur->bnogt, args->agbno, 0, &i); 1323 if (error) 1324 return error; 1325 1326 /* 1327 * Search the bnobt and cntbt in parallel. Search the bnobt left and 1328 * right and lookup the closest extent to the locality hint for each 1329 * extent size key in the cntbt. The entire search terminates 1330 * immediately on a bnobt hit because that means we've found best case 1331 * locality. Otherwise the search continues until the cntbt cursor runs 1332 * off the end of the tree. If no allocation candidate is found at this 1333 * point, give up on locality, walk backwards from the end of the cntbt 1334 * and take the first available extent. 1335 * 1336 * The parallel tree searches balance each other out to provide fairly 1337 * consistent performance for various situations. The bnobt search can 1338 * have pathological behavior in the worst case scenario of larger 1339 * allocation requests and fragmented free space. On the other hand, the 1340 * bnobt is able to satisfy most smaller allocation requests much more 1341 * quickly than the cntbt. The cntbt search can sift through fragmented 1342 * free space and sets of free extents for larger allocation requests 1343 * more quickly than the bnobt. Since the locality hint is just a hint 1344 * and we don't want to scan the entire bnobt for perfect locality, the 1345 * cntbt search essentially bounds the bnobt search such that we can 1346 * find good enough locality at reasonable performance in most cases. 1347 */ 1348 while (xfs_alloc_cur_active(acur->bnolt) || 1349 xfs_alloc_cur_active(acur->bnogt) || 1350 xfs_alloc_cur_active(acur->cnt)) { 1351 1352 trace_xfs_alloc_cur_lookup(args); 1353 1354 /* 1355 * Search the bnobt left and right. In the case of a hit, finish 1356 * the search in the opposite direction and we're done. 1357 */ 1358 error = xfs_alloc_walk_iter(args, acur, acur->bnolt, false, 1359 true, 1, &i); 1360 if (error) 1361 return error; 1362 if (i == 1) { 1363 trace_xfs_alloc_cur_left(args); 1364 fbcur = acur->bnogt; 1365 fbinc = true; 1366 break; 1367 } 1368 error = xfs_alloc_walk_iter(args, acur, acur->bnogt, true, true, 1369 1, &i); 1370 if (error) 1371 return error; 1372 if (i == 1) { 1373 trace_xfs_alloc_cur_right(args); 1374 fbcur = acur->bnolt; 1375 fbinc = false; 1376 break; 1377 } 1378 1379 /* 1380 * Check the extent with best locality based on the current 1381 * extent size search key and keep track of the best candidate. 1382 */ 1383 error = xfs_alloc_cntbt_iter(args, acur); 1384 if (error) 1385 return error; 1386 if (!xfs_alloc_cur_active(acur->cnt)) { 1387 trace_xfs_alloc_cur_lookup_done(args); 1388 break; 1389 } 1390 } 1391 1392 /* 1393 * If we failed to find anything due to busy extents, return empty 1394 * handed so the caller can flush and retry. If no busy extents were 1395 * found, walk backwards from the end of the cntbt as a last resort. 1396 */ 1397 if (!xfs_alloc_cur_active(acur->cnt) && !acur->len && !acur->busy) { 1398 error = xfs_btree_decrement(acur->cnt, 0, &i); 1399 if (error) 1400 return error; 1401 if (i) { 1402 acur->cnt->bc_ag.abt.active = true; 1403 fbcur = acur->cnt; 1404 fbinc = false; 1405 } 1406 } 1407 1408 /* 1409 * Search in the opposite direction for a better entry in the case of 1410 * a bnobt hit or walk backwards from the end of the cntbt. 1411 */ 1412 if (fbcur) { 1413 error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc, true, -1, 1414 &i); 1415 if (error) 1416 return error; 1417 } 1418 1419 if (acur->len) 1420 *stat = 1; 1421 1422 return 0; 1423 } 1424 1425 /* Check the last block of the cnt btree for allocations. */ 1426 static int 1427 xfs_alloc_ag_vextent_lastblock( 1428 struct xfs_alloc_arg *args, 1429 struct xfs_alloc_cur *acur, 1430 xfs_agblock_t *bno, 1431 xfs_extlen_t *len, 1432 bool *allocated) 1433 { 1434 int error; 1435 int i; 1436 1437 #ifdef DEBUG 1438 /* Randomly don't execute the first algorithm. */ 1439 if (get_random_u32_below(2)) 1440 return 0; 1441 #endif 1442 1443 /* 1444 * Start from the entry that lookup found, sequence through all larger 1445 * free blocks. If we're actually pointing at a record smaller than 1446 * maxlen, go to the start of this block, and skip all those smaller 1447 * than minlen. 1448 */ 1449 if (*len || args->alignment > 1) { 1450 acur->cnt->bc_levels[0].ptr = 1; 1451 do { 1452 error = xfs_alloc_get_rec(acur->cnt, bno, len, &i); 1453 if (error) 1454 return error; 1455 if (XFS_IS_CORRUPT(args->mp, i != 1)) 1456 return -EFSCORRUPTED; 1457 if (*len >= args->minlen) 1458 break; 1459 error = xfs_btree_increment(acur->cnt, 0, &i); 1460 if (error) 1461 return error; 1462 } while (i); 1463 ASSERT(*len >= args->minlen); 1464 if (!i) 1465 return 0; 1466 } 1467 1468 error = xfs_alloc_walk_iter(args, acur, acur->cnt, true, false, -1, &i); 1469 if (error) 1470 return error; 1471 1472 /* 1473 * It didn't work. We COULD be in a case where there's a good record 1474 * somewhere, so try again. 1475 */ 1476 if (acur->len == 0) 1477 return 0; 1478 1479 trace_xfs_alloc_near_first(args); 1480 *allocated = true; 1481 return 0; 1482 } 1483 1484 /* 1485 * Allocate a variable extent near bno in the allocation group agno. 1486 * Extent's length (returned in len) will be between minlen and maxlen, 1487 * and of the form k * prod + mod unless there's nothing that large. 1488 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it. 1489 */ 1490 STATIC int 1491 xfs_alloc_ag_vextent_near( 1492 struct xfs_alloc_arg *args) 1493 { 1494 struct xfs_alloc_cur acur = {}; 1495 int error; /* error code */ 1496 int i; /* result code, temporary */ 1497 xfs_agblock_t bno; 1498 xfs_extlen_t len; 1499 1500 /* handle uninitialized agbno range so caller doesn't have to */ 1501 if (!args->min_agbno && !args->max_agbno) 1502 args->max_agbno = args->mp->m_sb.sb_agblocks - 1; 1503 ASSERT(args->min_agbno <= args->max_agbno); 1504 1505 /* clamp agbno to the range if it's outside */ 1506 if (args->agbno < args->min_agbno) 1507 args->agbno = args->min_agbno; 1508 if (args->agbno > args->max_agbno) 1509 args->agbno = args->max_agbno; 1510 1511 restart: 1512 len = 0; 1513 1514 /* 1515 * Set up cursors and see if there are any free extents as big as 1516 * maxlen. If not, pick the last entry in the tree unless the tree is 1517 * empty. 1518 */ 1519 error = xfs_alloc_cur_setup(args, &acur); 1520 if (error == -ENOSPC) { 1521 error = xfs_alloc_ag_vextent_small(args, acur.cnt, &bno, 1522 &len, &i); 1523 if (error) 1524 goto out; 1525 if (i == 0 || len == 0) { 1526 trace_xfs_alloc_near_noentry(args); 1527 goto out; 1528 } 1529 ASSERT(i == 1); 1530 } else if (error) { 1531 goto out; 1532 } 1533 1534 /* 1535 * First algorithm. 1536 * If the requested extent is large wrt the freespaces available 1537 * in this a.g., then the cursor will be pointing to a btree entry 1538 * near the right edge of the tree. If it's in the last btree leaf 1539 * block, then we just examine all the entries in that block 1540 * that are big enough, and pick the best one. 1541 */ 1542 if (xfs_btree_islastblock(acur.cnt, 0)) { 1543 bool allocated = false; 1544 1545 error = xfs_alloc_ag_vextent_lastblock(args, &acur, &bno, &len, 1546 &allocated); 1547 if (error) 1548 goto out; 1549 if (allocated) 1550 goto alloc_finish; 1551 } 1552 1553 /* 1554 * Second algorithm. Combined cntbt and bnobt search to find ideal 1555 * locality. 1556 */ 1557 error = xfs_alloc_ag_vextent_locality(args, &acur, &i); 1558 if (error) 1559 goto out; 1560 1561 /* 1562 * If we couldn't get anything, give up. 1563 */ 1564 if (!acur.len) { 1565 if (acur.busy) { 1566 trace_xfs_alloc_near_busy(args); 1567 xfs_extent_busy_flush(args->mp, args->pag, 1568 acur.busy_gen); 1569 goto restart; 1570 } 1571 trace_xfs_alloc_size_neither(args); 1572 args->agbno = NULLAGBLOCK; 1573 goto out; 1574 } 1575 1576 alloc_finish: 1577 /* fix up btrees on a successful allocation */ 1578 error = xfs_alloc_cur_finish(args, &acur); 1579 1580 out: 1581 xfs_alloc_cur_close(&acur, error); 1582 return error; 1583 } 1584 1585 /* 1586 * Allocate a variable extent anywhere in the allocation group agno. 1587 * Extent's length (returned in len) will be between minlen and maxlen, 1588 * and of the form k * prod + mod unless there's nothing that large. 1589 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it. 1590 */ 1591 STATIC int /* error */ 1592 xfs_alloc_ag_vextent_size( 1593 xfs_alloc_arg_t *args) /* allocation argument structure */ 1594 { 1595 struct xfs_agf *agf = args->agbp->b_addr; 1596 struct xfs_btree_cur *bno_cur; /* cursor for bno btree */ 1597 struct xfs_btree_cur *cnt_cur; /* cursor for cnt btree */ 1598 int error; /* error result */ 1599 xfs_agblock_t fbno; /* start of found freespace */ 1600 xfs_extlen_t flen; /* length of found freespace */ 1601 int i; /* temp status variable */ 1602 xfs_agblock_t rbno; /* returned block number */ 1603 xfs_extlen_t rlen; /* length of returned extent */ 1604 bool busy; 1605 unsigned busy_gen; 1606 1607 restart: 1608 /* 1609 * Allocate and initialize a cursor for the by-size btree. 1610 */ 1611 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, 1612 args->pag, XFS_BTNUM_CNT); 1613 bno_cur = NULL; 1614 1615 /* 1616 * Look for an entry >= maxlen+alignment-1 blocks. 1617 */ 1618 if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, 1619 args->maxlen + args->alignment - 1, &i))) 1620 goto error0; 1621 1622 /* 1623 * If none then we have to settle for a smaller extent. In the case that 1624 * there are no large extents, this will return the last entry in the 1625 * tree unless the tree is empty. In the case that there are only busy 1626 * large extents, this will return the largest small extent unless there 1627 * are no smaller extents available. 1628 */ 1629 if (!i) { 1630 error = xfs_alloc_ag_vextent_small(args, cnt_cur, 1631 &fbno, &flen, &i); 1632 if (error) 1633 goto error0; 1634 if (i == 0 || flen == 0) { 1635 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1636 trace_xfs_alloc_size_noentry(args); 1637 return 0; 1638 } 1639 ASSERT(i == 1); 1640 busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno, 1641 &rlen, &busy_gen); 1642 } else { 1643 /* 1644 * Search for a non-busy extent that is large enough. 1645 */ 1646 for (;;) { 1647 error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i); 1648 if (error) 1649 goto error0; 1650 if (XFS_IS_CORRUPT(args->mp, i != 1)) { 1651 error = -EFSCORRUPTED; 1652 goto error0; 1653 } 1654 1655 busy = xfs_alloc_compute_aligned(args, fbno, flen, 1656 &rbno, &rlen, &busy_gen); 1657 1658 if (rlen >= args->maxlen) 1659 break; 1660 1661 error = xfs_btree_increment(cnt_cur, 0, &i); 1662 if (error) 1663 goto error0; 1664 if (i == 0) { 1665 /* 1666 * Our only valid extents must have been busy. 1667 * Make it unbusy by forcing the log out and 1668 * retrying. 1669 */ 1670 xfs_btree_del_cursor(cnt_cur, 1671 XFS_BTREE_NOERROR); 1672 trace_xfs_alloc_size_busy(args); 1673 xfs_extent_busy_flush(args->mp, 1674 args->pag, busy_gen); 1675 goto restart; 1676 } 1677 } 1678 } 1679 1680 /* 1681 * In the first case above, we got the last entry in the 1682 * by-size btree. Now we check to see if the space hits maxlen 1683 * once aligned; if not, we search left for something better. 1684 * This can't happen in the second case above. 1685 */ 1686 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); 1687 if (XFS_IS_CORRUPT(args->mp, 1688 rlen != 0 && 1689 (rlen > flen || 1690 rbno + rlen > fbno + flen))) { 1691 error = -EFSCORRUPTED; 1692 goto error0; 1693 } 1694 if (rlen < args->maxlen) { 1695 xfs_agblock_t bestfbno; 1696 xfs_extlen_t bestflen; 1697 xfs_agblock_t bestrbno; 1698 xfs_extlen_t bestrlen; 1699 1700 bestrlen = rlen; 1701 bestrbno = rbno; 1702 bestflen = flen; 1703 bestfbno = fbno; 1704 for (;;) { 1705 if ((error = xfs_btree_decrement(cnt_cur, 0, &i))) 1706 goto error0; 1707 if (i == 0) 1708 break; 1709 if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, 1710 &i))) 1711 goto error0; 1712 if (XFS_IS_CORRUPT(args->mp, i != 1)) { 1713 error = -EFSCORRUPTED; 1714 goto error0; 1715 } 1716 if (flen < bestrlen) 1717 break; 1718 busy = xfs_alloc_compute_aligned(args, fbno, flen, 1719 &rbno, &rlen, &busy_gen); 1720 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); 1721 if (XFS_IS_CORRUPT(args->mp, 1722 rlen != 0 && 1723 (rlen > flen || 1724 rbno + rlen > fbno + flen))) { 1725 error = -EFSCORRUPTED; 1726 goto error0; 1727 } 1728 if (rlen > bestrlen) { 1729 bestrlen = rlen; 1730 bestrbno = rbno; 1731 bestflen = flen; 1732 bestfbno = fbno; 1733 if (rlen == args->maxlen) 1734 break; 1735 } 1736 } 1737 if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen, 1738 &i))) 1739 goto error0; 1740 if (XFS_IS_CORRUPT(args->mp, i != 1)) { 1741 error = -EFSCORRUPTED; 1742 goto error0; 1743 } 1744 rlen = bestrlen; 1745 rbno = bestrbno; 1746 flen = bestflen; 1747 fbno = bestfbno; 1748 } 1749 args->wasfromfl = 0; 1750 /* 1751 * Fix up the length. 1752 */ 1753 args->len = rlen; 1754 if (rlen < args->minlen) { 1755 if (busy) { 1756 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1757 trace_xfs_alloc_size_busy(args); 1758 xfs_extent_busy_flush(args->mp, args->pag, busy_gen); 1759 goto restart; 1760 } 1761 goto out_nominleft; 1762 } 1763 xfs_alloc_fix_len(args); 1764 1765 rlen = args->len; 1766 if (XFS_IS_CORRUPT(args->mp, rlen > flen)) { 1767 error = -EFSCORRUPTED; 1768 goto error0; 1769 } 1770 /* 1771 * Allocate and initialize a cursor for the by-block tree. 1772 */ 1773 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, 1774 args->pag, XFS_BTNUM_BNO); 1775 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, 1776 rbno, rlen, XFSA_FIXUP_CNT_OK))) 1777 goto error0; 1778 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1779 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 1780 cnt_cur = bno_cur = NULL; 1781 args->len = rlen; 1782 args->agbno = rbno; 1783 if (XFS_IS_CORRUPT(args->mp, 1784 args->agbno + args->len > 1785 be32_to_cpu(agf->agf_length))) { 1786 error = -EFSCORRUPTED; 1787 goto error0; 1788 } 1789 trace_xfs_alloc_size_done(args); 1790 return 0; 1791 1792 error0: 1793 trace_xfs_alloc_size_error(args); 1794 if (cnt_cur) 1795 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); 1796 if (bno_cur) 1797 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); 1798 return error; 1799 1800 out_nominleft: 1801 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1802 trace_xfs_alloc_size_nominleft(args); 1803 args->agbno = NULLAGBLOCK; 1804 return 0; 1805 } 1806 1807 /* 1808 * Free the extent starting at agno/bno for length. 1809 */ 1810 STATIC int 1811 xfs_free_ag_extent( 1812 struct xfs_trans *tp, 1813 struct xfs_buf *agbp, 1814 xfs_agnumber_t agno, 1815 xfs_agblock_t bno, 1816 xfs_extlen_t len, 1817 const struct xfs_owner_info *oinfo, 1818 enum xfs_ag_resv_type type) 1819 { 1820 struct xfs_mount *mp; 1821 struct xfs_btree_cur *bno_cur; 1822 struct xfs_btree_cur *cnt_cur; 1823 xfs_agblock_t gtbno; /* start of right neighbor */ 1824 xfs_extlen_t gtlen; /* length of right neighbor */ 1825 xfs_agblock_t ltbno; /* start of left neighbor */ 1826 xfs_extlen_t ltlen; /* length of left neighbor */ 1827 xfs_agblock_t nbno; /* new starting block of freesp */ 1828 xfs_extlen_t nlen; /* new length of freespace */ 1829 int haveleft; /* have a left neighbor */ 1830 int haveright; /* have a right neighbor */ 1831 int i; 1832 int error; 1833 struct xfs_perag *pag = agbp->b_pag; 1834 1835 bno_cur = cnt_cur = NULL; 1836 mp = tp->t_mountp; 1837 1838 if (!xfs_rmap_should_skip_owner_update(oinfo)) { 1839 error = xfs_rmap_free(tp, agbp, pag, bno, len, oinfo); 1840 if (error) 1841 goto error0; 1842 } 1843 1844 /* 1845 * Allocate and initialize a cursor for the by-block btree. 1846 */ 1847 bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_BNO); 1848 /* 1849 * Look for a neighboring block on the left (lower block numbers) 1850 * that is contiguous with this space. 1851 */ 1852 if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft))) 1853 goto error0; 1854 if (haveleft) { 1855 /* 1856 * There is a block to our left. 1857 */ 1858 if ((error = xfs_alloc_get_rec(bno_cur, <bno, <len, &i))) 1859 goto error0; 1860 if (XFS_IS_CORRUPT(mp, i != 1)) { 1861 error = -EFSCORRUPTED; 1862 goto error0; 1863 } 1864 /* 1865 * It's not contiguous, though. 1866 */ 1867 if (ltbno + ltlen < bno) 1868 haveleft = 0; 1869 else { 1870 /* 1871 * If this failure happens the request to free this 1872 * space was invalid, it's (partly) already free. 1873 * Very bad. 1874 */ 1875 if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) { 1876 error = -EFSCORRUPTED; 1877 goto error0; 1878 } 1879 } 1880 } 1881 /* 1882 * Look for a neighboring block on the right (higher block numbers) 1883 * that is contiguous with this space. 1884 */ 1885 if ((error = xfs_btree_increment(bno_cur, 0, &haveright))) 1886 goto error0; 1887 if (haveright) { 1888 /* 1889 * There is a block to our right. 1890 */ 1891 if ((error = xfs_alloc_get_rec(bno_cur, >bno, >len, &i))) 1892 goto error0; 1893 if (XFS_IS_CORRUPT(mp, i != 1)) { 1894 error = -EFSCORRUPTED; 1895 goto error0; 1896 } 1897 /* 1898 * It's not contiguous, though. 1899 */ 1900 if (bno + len < gtbno) 1901 haveright = 0; 1902 else { 1903 /* 1904 * If this failure happens the request to free this 1905 * space was invalid, it's (partly) already free. 1906 * Very bad. 1907 */ 1908 if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) { 1909 error = -EFSCORRUPTED; 1910 goto error0; 1911 } 1912 } 1913 } 1914 /* 1915 * Now allocate and initialize a cursor for the by-size tree. 1916 */ 1917 cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_CNT); 1918 /* 1919 * Have both left and right contiguous neighbors. 1920 * Merge all three into a single free block. 1921 */ 1922 if (haveleft && haveright) { 1923 /* 1924 * Delete the old by-size entry on the left. 1925 */ 1926 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) 1927 goto error0; 1928 if (XFS_IS_CORRUPT(mp, i != 1)) { 1929 error = -EFSCORRUPTED; 1930 goto error0; 1931 } 1932 if ((error = xfs_btree_delete(cnt_cur, &i))) 1933 goto error0; 1934 if (XFS_IS_CORRUPT(mp, i != 1)) { 1935 error = -EFSCORRUPTED; 1936 goto error0; 1937 } 1938 /* 1939 * Delete the old by-size entry on the right. 1940 */ 1941 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) 1942 goto error0; 1943 if (XFS_IS_CORRUPT(mp, i != 1)) { 1944 error = -EFSCORRUPTED; 1945 goto error0; 1946 } 1947 if ((error = xfs_btree_delete(cnt_cur, &i))) 1948 goto error0; 1949 if (XFS_IS_CORRUPT(mp, i != 1)) { 1950 error = -EFSCORRUPTED; 1951 goto error0; 1952 } 1953 /* 1954 * Delete the old by-block entry for the right block. 1955 */ 1956 if ((error = xfs_btree_delete(bno_cur, &i))) 1957 goto error0; 1958 if (XFS_IS_CORRUPT(mp, i != 1)) { 1959 error = -EFSCORRUPTED; 1960 goto error0; 1961 } 1962 /* 1963 * Move the by-block cursor back to the left neighbor. 1964 */ 1965 if ((error = xfs_btree_decrement(bno_cur, 0, &i))) 1966 goto error0; 1967 if (XFS_IS_CORRUPT(mp, i != 1)) { 1968 error = -EFSCORRUPTED; 1969 goto error0; 1970 } 1971 #ifdef DEBUG 1972 /* 1973 * Check that this is the right record: delete didn't 1974 * mangle the cursor. 1975 */ 1976 { 1977 xfs_agblock_t xxbno; 1978 xfs_extlen_t xxlen; 1979 1980 if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen, 1981 &i))) 1982 goto error0; 1983 if (XFS_IS_CORRUPT(mp, 1984 i != 1 || 1985 xxbno != ltbno || 1986 xxlen != ltlen)) { 1987 error = -EFSCORRUPTED; 1988 goto error0; 1989 } 1990 } 1991 #endif 1992 /* 1993 * Update remaining by-block entry to the new, joined block. 1994 */ 1995 nbno = ltbno; 1996 nlen = len + ltlen + gtlen; 1997 if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) 1998 goto error0; 1999 } 2000 /* 2001 * Have only a left contiguous neighbor. 2002 * Merge it together with the new freespace. 2003 */ 2004 else if (haveleft) { 2005 /* 2006 * Delete the old by-size entry on the left. 2007 */ 2008 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) 2009 goto error0; 2010 if (XFS_IS_CORRUPT(mp, i != 1)) { 2011 error = -EFSCORRUPTED; 2012 goto error0; 2013 } 2014 if ((error = xfs_btree_delete(cnt_cur, &i))) 2015 goto error0; 2016 if (XFS_IS_CORRUPT(mp, i != 1)) { 2017 error = -EFSCORRUPTED; 2018 goto error0; 2019 } 2020 /* 2021 * Back up the by-block cursor to the left neighbor, and 2022 * update its length. 2023 */ 2024 if ((error = xfs_btree_decrement(bno_cur, 0, &i))) 2025 goto error0; 2026 if (XFS_IS_CORRUPT(mp, i != 1)) { 2027 error = -EFSCORRUPTED; 2028 goto error0; 2029 } 2030 nbno = ltbno; 2031 nlen = len + ltlen; 2032 if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) 2033 goto error0; 2034 } 2035 /* 2036 * Have only a right contiguous neighbor. 2037 * Merge it together with the new freespace. 2038 */ 2039 else if (haveright) { 2040 /* 2041 * Delete the old by-size entry on the right. 2042 */ 2043 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) 2044 goto error0; 2045 if (XFS_IS_CORRUPT(mp, i != 1)) { 2046 error = -EFSCORRUPTED; 2047 goto error0; 2048 } 2049 if ((error = xfs_btree_delete(cnt_cur, &i))) 2050 goto error0; 2051 if (XFS_IS_CORRUPT(mp, i != 1)) { 2052 error = -EFSCORRUPTED; 2053 goto error0; 2054 } 2055 /* 2056 * Update the starting block and length of the right 2057 * neighbor in the by-block tree. 2058 */ 2059 nbno = bno; 2060 nlen = len + gtlen; 2061 if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) 2062 goto error0; 2063 } 2064 /* 2065 * No contiguous neighbors. 2066 * Insert the new freespace into the by-block tree. 2067 */ 2068 else { 2069 nbno = bno; 2070 nlen = len; 2071 if ((error = xfs_btree_insert(bno_cur, &i))) 2072 goto error0; 2073 if (XFS_IS_CORRUPT(mp, i != 1)) { 2074 error = -EFSCORRUPTED; 2075 goto error0; 2076 } 2077 } 2078 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 2079 bno_cur = NULL; 2080 /* 2081 * In all cases we need to insert the new freespace in the by-size tree. 2082 */ 2083 if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i))) 2084 goto error0; 2085 if (XFS_IS_CORRUPT(mp, i != 0)) { 2086 error = -EFSCORRUPTED; 2087 goto error0; 2088 } 2089 if ((error = xfs_btree_insert(cnt_cur, &i))) 2090 goto error0; 2091 if (XFS_IS_CORRUPT(mp, i != 1)) { 2092 error = -EFSCORRUPTED; 2093 goto error0; 2094 } 2095 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 2096 cnt_cur = NULL; 2097 2098 /* 2099 * Update the freespace totals in the ag and superblock. 2100 */ 2101 error = xfs_alloc_update_counters(tp, agbp, len); 2102 xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len); 2103 if (error) 2104 goto error0; 2105 2106 XFS_STATS_INC(mp, xs_freex); 2107 XFS_STATS_ADD(mp, xs_freeb, len); 2108 2109 trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright); 2110 2111 return 0; 2112 2113 error0: 2114 trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1); 2115 if (bno_cur) 2116 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); 2117 if (cnt_cur) 2118 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); 2119 return error; 2120 } 2121 2122 /* 2123 * Visible (exported) allocation/free functions. 2124 * Some of these are used just by xfs_alloc_btree.c and this file. 2125 */ 2126 2127 /* 2128 * Compute and fill in value of m_alloc_maxlevels. 2129 */ 2130 void 2131 xfs_alloc_compute_maxlevels( 2132 xfs_mount_t *mp) /* file system mount structure */ 2133 { 2134 mp->m_alloc_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr, 2135 (mp->m_sb.sb_agblocks + 1) / 2); 2136 ASSERT(mp->m_alloc_maxlevels <= xfs_allocbt_maxlevels_ondisk()); 2137 } 2138 2139 /* 2140 * Find the length of the longest extent in an AG. The 'need' parameter 2141 * specifies how much space we're going to need for the AGFL and the 2142 * 'reserved' parameter tells us how many blocks in this AG are reserved for 2143 * other callers. 2144 */ 2145 xfs_extlen_t 2146 xfs_alloc_longest_free_extent( 2147 struct xfs_perag *pag, 2148 xfs_extlen_t need, 2149 xfs_extlen_t reserved) 2150 { 2151 xfs_extlen_t delta = 0; 2152 2153 /* 2154 * If the AGFL needs a recharge, we'll have to subtract that from the 2155 * longest extent. 2156 */ 2157 if (need > pag->pagf_flcount) 2158 delta = need - pag->pagf_flcount; 2159 2160 /* 2161 * If we cannot maintain others' reservations with space from the 2162 * not-longest freesp extents, we'll have to subtract /that/ from 2163 * the longest extent too. 2164 */ 2165 if (pag->pagf_freeblks - pag->pagf_longest < reserved) 2166 delta += reserved - (pag->pagf_freeblks - pag->pagf_longest); 2167 2168 /* 2169 * If the longest extent is long enough to satisfy all the 2170 * reservations and AGFL rules in place, we can return this extent. 2171 */ 2172 if (pag->pagf_longest > delta) 2173 return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable, 2174 pag->pagf_longest - delta); 2175 2176 /* Otherwise, let the caller try for 1 block if there's space. */ 2177 return pag->pagf_flcount > 0 || pag->pagf_longest > 0; 2178 } 2179 2180 /* 2181 * Compute the minimum length of the AGFL in the given AG. If @pag is NULL, 2182 * return the largest possible minimum length. 2183 */ 2184 unsigned int 2185 xfs_alloc_min_freelist( 2186 struct xfs_mount *mp, 2187 struct xfs_perag *pag) 2188 { 2189 /* AG btrees have at least 1 level. */ 2190 static const uint8_t fake_levels[XFS_BTNUM_AGF] = {1, 1, 1}; 2191 const uint8_t *levels = pag ? pag->pagf_levels : fake_levels; 2192 unsigned int min_free; 2193 2194 ASSERT(mp->m_alloc_maxlevels > 0); 2195 2196 /* space needed by-bno freespace btree */ 2197 min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1, 2198 mp->m_alloc_maxlevels); 2199 /* space needed by-size freespace btree */ 2200 min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1, 2201 mp->m_alloc_maxlevels); 2202 /* space needed reverse mapping used space btree */ 2203 if (xfs_has_rmapbt(mp)) 2204 min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1, 2205 mp->m_rmap_maxlevels); 2206 2207 return min_free; 2208 } 2209 2210 /* 2211 * Check if the operation we are fixing up the freelist for should go ahead or 2212 * not. If we are freeing blocks, we always allow it, otherwise the allocation 2213 * is dependent on whether the size and shape of free space available will 2214 * permit the requested allocation to take place. 2215 */ 2216 static bool 2217 xfs_alloc_space_available( 2218 struct xfs_alloc_arg *args, 2219 xfs_extlen_t min_free, 2220 int flags) 2221 { 2222 struct xfs_perag *pag = args->pag; 2223 xfs_extlen_t alloc_len, longest; 2224 xfs_extlen_t reservation; /* blocks that are still reserved */ 2225 int available; 2226 xfs_extlen_t agflcount; 2227 2228 if (flags & XFS_ALLOC_FLAG_FREEING) 2229 return true; 2230 2231 reservation = xfs_ag_resv_needed(pag, args->resv); 2232 2233 /* do we have enough contiguous free space for the allocation? */ 2234 alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop; 2235 longest = xfs_alloc_longest_free_extent(pag, min_free, reservation); 2236 if (longest < alloc_len) 2237 return false; 2238 2239 /* 2240 * Do we have enough free space remaining for the allocation? Don't 2241 * account extra agfl blocks because we are about to defer free them, 2242 * making them unavailable until the current transaction commits. 2243 */ 2244 agflcount = min_t(xfs_extlen_t, pag->pagf_flcount, min_free); 2245 available = (int)(pag->pagf_freeblks + agflcount - 2246 reservation - min_free - args->minleft); 2247 if (available < (int)max(args->total, alloc_len)) 2248 return false; 2249 2250 /* 2251 * Clamp maxlen to the amount of free space available for the actual 2252 * extent allocation. 2253 */ 2254 if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) { 2255 args->maxlen = available; 2256 ASSERT(args->maxlen > 0); 2257 ASSERT(args->maxlen >= args->minlen); 2258 } 2259 2260 return true; 2261 } 2262 2263 int 2264 xfs_free_agfl_block( 2265 struct xfs_trans *tp, 2266 xfs_agnumber_t agno, 2267 xfs_agblock_t agbno, 2268 struct xfs_buf *agbp, 2269 struct xfs_owner_info *oinfo) 2270 { 2271 int error; 2272 struct xfs_buf *bp; 2273 2274 error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo, 2275 XFS_AG_RESV_AGFL); 2276 if (error) 2277 return error; 2278 2279 error = xfs_trans_get_buf(tp, tp->t_mountp->m_ddev_targp, 2280 XFS_AGB_TO_DADDR(tp->t_mountp, agno, agbno), 2281 tp->t_mountp->m_bsize, 0, &bp); 2282 if (error) 2283 return error; 2284 xfs_trans_binval(tp, bp); 2285 2286 return 0; 2287 } 2288 2289 /* 2290 * Check the agfl fields of the agf for inconsistency or corruption. The purpose 2291 * is to detect an agfl header padding mismatch between current and early v5 2292 * kernels. This problem manifests as a 1-slot size difference between the 2293 * on-disk flcount and the active [first, last] range of a wrapped agfl. This 2294 * may also catch variants of agfl count corruption unrelated to padding. Either 2295 * way, we'll reset the agfl and warn the user. 2296 * 2297 * Return true if a reset is required before the agfl can be used, false 2298 * otherwise. 2299 */ 2300 static bool 2301 xfs_agfl_needs_reset( 2302 struct xfs_mount *mp, 2303 struct xfs_agf *agf) 2304 { 2305 uint32_t f = be32_to_cpu(agf->agf_flfirst); 2306 uint32_t l = be32_to_cpu(agf->agf_fllast); 2307 uint32_t c = be32_to_cpu(agf->agf_flcount); 2308 int agfl_size = xfs_agfl_size(mp); 2309 int active; 2310 2311 /* no agfl header on v4 supers */ 2312 if (!xfs_has_crc(mp)) 2313 return false; 2314 2315 /* 2316 * The agf read verifier catches severe corruption of these fields. 2317 * Repeat some sanity checks to cover a packed -> unpacked mismatch if 2318 * the verifier allows it. 2319 */ 2320 if (f >= agfl_size || l >= agfl_size) 2321 return true; 2322 if (c > agfl_size) 2323 return true; 2324 2325 /* 2326 * Check consistency between the on-disk count and the active range. An 2327 * agfl padding mismatch manifests as an inconsistent flcount. 2328 */ 2329 if (c && l >= f) 2330 active = l - f + 1; 2331 else if (c) 2332 active = agfl_size - f + l + 1; 2333 else 2334 active = 0; 2335 2336 return active != c; 2337 } 2338 2339 /* 2340 * Reset the agfl to an empty state. Ignore/drop any existing blocks since the 2341 * agfl content cannot be trusted. Warn the user that a repair is required to 2342 * recover leaked blocks. 2343 * 2344 * The purpose of this mechanism is to handle filesystems affected by the agfl 2345 * header padding mismatch problem. A reset keeps the filesystem online with a 2346 * relatively minor free space accounting inconsistency rather than suffer the 2347 * inevitable crash from use of an invalid agfl block. 2348 */ 2349 static void 2350 xfs_agfl_reset( 2351 struct xfs_trans *tp, 2352 struct xfs_buf *agbp, 2353 struct xfs_perag *pag) 2354 { 2355 struct xfs_mount *mp = tp->t_mountp; 2356 struct xfs_agf *agf = agbp->b_addr; 2357 2358 ASSERT(xfs_perag_agfl_needs_reset(pag)); 2359 trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_); 2360 2361 xfs_warn(mp, 2362 "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. " 2363 "Please unmount and run xfs_repair.", 2364 pag->pag_agno, pag->pagf_flcount); 2365 2366 agf->agf_flfirst = 0; 2367 agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1); 2368 agf->agf_flcount = 0; 2369 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST | 2370 XFS_AGF_FLCOUNT); 2371 2372 pag->pagf_flcount = 0; 2373 clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate); 2374 } 2375 2376 /* 2377 * Defer an AGFL block free. This is effectively equivalent to 2378 * xfs_free_extent_later() with some special handling particular to AGFL blocks. 2379 * 2380 * Deferring AGFL frees helps prevent log reservation overruns due to too many 2381 * allocation operations in a transaction. AGFL frees are prone to this problem 2382 * because for one they are always freed one at a time. Further, an immediate 2383 * AGFL block free can cause a btree join and require another block free before 2384 * the real allocation can proceed. Deferring the free disconnects freeing up 2385 * the AGFL slot from freeing the block. 2386 */ 2387 STATIC void 2388 xfs_defer_agfl_block( 2389 struct xfs_trans *tp, 2390 xfs_agnumber_t agno, 2391 xfs_fsblock_t agbno, 2392 struct xfs_owner_info *oinfo) 2393 { 2394 struct xfs_mount *mp = tp->t_mountp; 2395 struct xfs_extent_free_item *xefi; 2396 2397 ASSERT(xfs_extfree_item_cache != NULL); 2398 ASSERT(oinfo != NULL); 2399 2400 xefi = kmem_cache_zalloc(xfs_extfree_item_cache, 2401 GFP_KERNEL | __GFP_NOFAIL); 2402 xefi->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno); 2403 xefi->xefi_blockcount = 1; 2404 xefi->xefi_owner = oinfo->oi_owner; 2405 2406 trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); 2407 2408 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list); 2409 } 2410 2411 /* 2412 * Add the extent to the list of extents to be free at transaction end. 2413 * The list is maintained sorted (by block number). 2414 */ 2415 void 2416 __xfs_free_extent_later( 2417 struct xfs_trans *tp, 2418 xfs_fsblock_t bno, 2419 xfs_filblks_t len, 2420 const struct xfs_owner_info *oinfo, 2421 bool skip_discard) 2422 { 2423 struct xfs_extent_free_item *xefi; 2424 #ifdef DEBUG 2425 struct xfs_mount *mp = tp->t_mountp; 2426 xfs_agnumber_t agno; 2427 xfs_agblock_t agbno; 2428 2429 ASSERT(bno != NULLFSBLOCK); 2430 ASSERT(len > 0); 2431 ASSERT(len <= XFS_MAX_BMBT_EXTLEN); 2432 ASSERT(!isnullstartblock(bno)); 2433 agno = XFS_FSB_TO_AGNO(mp, bno); 2434 agbno = XFS_FSB_TO_AGBNO(mp, bno); 2435 ASSERT(agno < mp->m_sb.sb_agcount); 2436 ASSERT(agbno < mp->m_sb.sb_agblocks); 2437 ASSERT(len < mp->m_sb.sb_agblocks); 2438 ASSERT(agbno + len <= mp->m_sb.sb_agblocks); 2439 #endif 2440 ASSERT(xfs_extfree_item_cache != NULL); 2441 2442 xefi = kmem_cache_zalloc(xfs_extfree_item_cache, 2443 GFP_KERNEL | __GFP_NOFAIL); 2444 xefi->xefi_startblock = bno; 2445 xefi->xefi_blockcount = (xfs_extlen_t)len; 2446 if (skip_discard) 2447 xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD; 2448 if (oinfo) { 2449 ASSERT(oinfo->oi_offset == 0); 2450 2451 if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK) 2452 xefi->xefi_flags |= XFS_EFI_ATTR_FORK; 2453 if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK) 2454 xefi->xefi_flags |= XFS_EFI_BMBT_BLOCK; 2455 xefi->xefi_owner = oinfo->oi_owner; 2456 } else { 2457 xefi->xefi_owner = XFS_RMAP_OWN_NULL; 2458 } 2459 trace_xfs_bmap_free_defer(tp->t_mountp, 2460 XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0, 2461 XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len); 2462 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list); 2463 } 2464 2465 #ifdef DEBUG 2466 /* 2467 * Check if an AGF has a free extent record whose length is equal to 2468 * args->minlen. 2469 */ 2470 STATIC int 2471 xfs_exact_minlen_extent_available( 2472 struct xfs_alloc_arg *args, 2473 struct xfs_buf *agbp, 2474 int *stat) 2475 { 2476 struct xfs_btree_cur *cnt_cur; 2477 xfs_agblock_t fbno; 2478 xfs_extlen_t flen; 2479 int error = 0; 2480 2481 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, agbp, 2482 args->pag, XFS_BTNUM_CNT); 2483 error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat); 2484 if (error) 2485 goto out; 2486 2487 if (*stat == 0) { 2488 error = -EFSCORRUPTED; 2489 goto out; 2490 } 2491 2492 error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, stat); 2493 if (error) 2494 goto out; 2495 2496 if (*stat == 1 && flen != args->minlen) 2497 *stat = 0; 2498 2499 out: 2500 xfs_btree_del_cursor(cnt_cur, error); 2501 2502 return error; 2503 } 2504 #endif 2505 2506 /* 2507 * Decide whether to use this allocation group for this allocation. 2508 * If so, fix up the btree freelist's size. 2509 */ 2510 int /* error */ 2511 xfs_alloc_fix_freelist( 2512 struct xfs_alloc_arg *args, /* allocation argument structure */ 2513 int flags) /* XFS_ALLOC_FLAG_... */ 2514 { 2515 struct xfs_mount *mp = args->mp; 2516 struct xfs_perag *pag = args->pag; 2517 struct xfs_trans *tp = args->tp; 2518 struct xfs_buf *agbp = NULL; 2519 struct xfs_buf *agflbp = NULL; 2520 struct xfs_alloc_arg targs; /* local allocation arguments */ 2521 xfs_agblock_t bno; /* freelist block */ 2522 xfs_extlen_t need; /* total blocks needed in freelist */ 2523 int error = 0; 2524 2525 /* deferred ops (AGFL block frees) require permanent transactions */ 2526 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 2527 2528 if (!xfs_perag_initialised_agf(pag)) { 2529 error = xfs_alloc_read_agf(pag, tp, flags, &agbp); 2530 if (error) { 2531 /* Couldn't lock the AGF so skip this AG. */ 2532 if (error == -EAGAIN) 2533 error = 0; 2534 goto out_no_agbp; 2535 } 2536 } 2537 2538 /* 2539 * If this is a metadata preferred pag and we are user data then try 2540 * somewhere else if we are not being asked to try harder at this 2541 * point 2542 */ 2543 if (xfs_perag_prefers_metadata(pag) && 2544 (args->datatype & XFS_ALLOC_USERDATA) && 2545 (flags & XFS_ALLOC_FLAG_TRYLOCK)) { 2546 ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); 2547 goto out_agbp_relse; 2548 } 2549 2550 need = xfs_alloc_min_freelist(mp, pag); 2551 if (!xfs_alloc_space_available(args, need, flags | 2552 XFS_ALLOC_FLAG_CHECK)) 2553 goto out_agbp_relse; 2554 2555 /* 2556 * Get the a.g. freespace buffer. 2557 * Can fail if we're not blocking on locks, and it's held. 2558 */ 2559 if (!agbp) { 2560 error = xfs_alloc_read_agf(pag, tp, flags, &agbp); 2561 if (error) { 2562 /* Couldn't lock the AGF so skip this AG. */ 2563 if (error == -EAGAIN) 2564 error = 0; 2565 goto out_no_agbp; 2566 } 2567 } 2568 2569 /* reset a padding mismatched agfl before final free space check */ 2570 if (xfs_perag_agfl_needs_reset(pag)) 2571 xfs_agfl_reset(tp, agbp, pag); 2572 2573 /* If there isn't enough total space or single-extent, reject it. */ 2574 need = xfs_alloc_min_freelist(mp, pag); 2575 if (!xfs_alloc_space_available(args, need, flags)) 2576 goto out_agbp_relse; 2577 2578 #ifdef DEBUG 2579 if (args->alloc_minlen_only) { 2580 int stat; 2581 2582 error = xfs_exact_minlen_extent_available(args, agbp, &stat); 2583 if (error || !stat) 2584 goto out_agbp_relse; 2585 } 2586 #endif 2587 /* 2588 * Make the freelist shorter if it's too long. 2589 * 2590 * Note that from this point onwards, we will always release the agf and 2591 * agfl buffers on error. This handles the case where we error out and 2592 * the buffers are clean or may not have been joined to the transaction 2593 * and hence need to be released manually. If they have been joined to 2594 * the transaction, then xfs_trans_brelse() will handle them 2595 * appropriately based on the recursion count and dirty state of the 2596 * buffer. 2597 * 2598 * XXX (dgc): When we have lots of free space, does this buy us 2599 * anything other than extra overhead when we need to put more blocks 2600 * back on the free list? Maybe we should only do this when space is 2601 * getting low or the AGFL is more than half full? 2602 * 2603 * The NOSHRINK flag prevents the AGFL from being shrunk if it's too 2604 * big; the NORMAP flag prevents AGFL expand/shrink operations from 2605 * updating the rmapbt. Both flags are used in xfs_repair while we're 2606 * rebuilding the rmapbt, and neither are used by the kernel. They're 2607 * both required to ensure that rmaps are correctly recorded for the 2608 * regenerated AGFL, bnobt, and cntbt. See repair/phase5.c and 2609 * repair/rmap.c in xfsprogs for details. 2610 */ 2611 memset(&targs, 0, sizeof(targs)); 2612 /* struct copy below */ 2613 if (flags & XFS_ALLOC_FLAG_NORMAP) 2614 targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; 2615 else 2616 targs.oinfo = XFS_RMAP_OINFO_AG; 2617 while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) { 2618 error = xfs_alloc_get_freelist(pag, tp, agbp, &bno, 0); 2619 if (error) 2620 goto out_agbp_relse; 2621 2622 /* defer agfl frees */ 2623 xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo); 2624 } 2625 2626 targs.tp = tp; 2627 targs.mp = mp; 2628 targs.agbp = agbp; 2629 targs.agno = args->agno; 2630 targs.alignment = targs.minlen = targs.prod = 1; 2631 targs.pag = pag; 2632 error = xfs_alloc_read_agfl(pag, tp, &agflbp); 2633 if (error) 2634 goto out_agbp_relse; 2635 2636 /* Make the freelist longer if it's too short. */ 2637 while (pag->pagf_flcount < need) { 2638 targs.agbno = 0; 2639 targs.maxlen = need - pag->pagf_flcount; 2640 targs.resv = XFS_AG_RESV_AGFL; 2641 2642 /* Allocate as many blocks as possible at once. */ 2643 error = xfs_alloc_ag_vextent_size(&targs); 2644 if (error) 2645 goto out_agflbp_relse; 2646 2647 /* 2648 * Stop if we run out. Won't happen if callers are obeying 2649 * the restrictions correctly. Can happen for free calls 2650 * on a completely full ag. 2651 */ 2652 if (targs.agbno == NULLAGBLOCK) { 2653 if (flags & XFS_ALLOC_FLAG_FREEING) 2654 break; 2655 goto out_agflbp_relse; 2656 } 2657 2658 if (!xfs_rmap_should_skip_owner_update(&targs.oinfo)) { 2659 error = xfs_rmap_alloc(tp, agbp, pag, 2660 targs.agbno, targs.len, &targs.oinfo); 2661 if (error) 2662 goto out_agflbp_relse; 2663 } 2664 error = xfs_alloc_update_counters(tp, agbp, 2665 -((long)(targs.len))); 2666 if (error) 2667 goto out_agflbp_relse; 2668 2669 /* 2670 * Put each allocated block on the list. 2671 */ 2672 for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) { 2673 error = xfs_alloc_put_freelist(pag, tp, agbp, 2674 agflbp, bno, 0); 2675 if (error) 2676 goto out_agflbp_relse; 2677 } 2678 } 2679 xfs_trans_brelse(tp, agflbp); 2680 args->agbp = agbp; 2681 return 0; 2682 2683 out_agflbp_relse: 2684 xfs_trans_brelse(tp, agflbp); 2685 out_agbp_relse: 2686 if (agbp) 2687 xfs_trans_brelse(tp, agbp); 2688 out_no_agbp: 2689 args->agbp = NULL; 2690 return error; 2691 } 2692 2693 /* 2694 * Get a block from the freelist. 2695 * Returns with the buffer for the block gotten. 2696 */ 2697 int 2698 xfs_alloc_get_freelist( 2699 struct xfs_perag *pag, 2700 struct xfs_trans *tp, 2701 struct xfs_buf *agbp, 2702 xfs_agblock_t *bnop, 2703 int btreeblk) 2704 { 2705 struct xfs_agf *agf = agbp->b_addr; 2706 struct xfs_buf *agflbp; 2707 xfs_agblock_t bno; 2708 __be32 *agfl_bno; 2709 int error; 2710 uint32_t logflags; 2711 struct xfs_mount *mp = tp->t_mountp; 2712 2713 /* 2714 * Freelist is empty, give up. 2715 */ 2716 if (!agf->agf_flcount) { 2717 *bnop = NULLAGBLOCK; 2718 return 0; 2719 } 2720 /* 2721 * Read the array of free blocks. 2722 */ 2723 error = xfs_alloc_read_agfl(pag, tp, &agflbp); 2724 if (error) 2725 return error; 2726 2727 2728 /* 2729 * Get the block number and update the data structures. 2730 */ 2731 agfl_bno = xfs_buf_to_agfl_bno(agflbp); 2732 bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]); 2733 be32_add_cpu(&agf->agf_flfirst, 1); 2734 xfs_trans_brelse(tp, agflbp); 2735 if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp)) 2736 agf->agf_flfirst = 0; 2737 2738 ASSERT(!xfs_perag_agfl_needs_reset(pag)); 2739 be32_add_cpu(&agf->agf_flcount, -1); 2740 pag->pagf_flcount--; 2741 2742 logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT; 2743 if (btreeblk) { 2744 be32_add_cpu(&agf->agf_btreeblks, 1); 2745 pag->pagf_btreeblks++; 2746 logflags |= XFS_AGF_BTREEBLKS; 2747 } 2748 2749 xfs_alloc_log_agf(tp, agbp, logflags); 2750 *bnop = bno; 2751 2752 return 0; 2753 } 2754 2755 /* 2756 * Log the given fields from the agf structure. 2757 */ 2758 void 2759 xfs_alloc_log_agf( 2760 struct xfs_trans *tp, 2761 struct xfs_buf *bp, 2762 uint32_t fields) 2763 { 2764 int first; /* first byte offset */ 2765 int last; /* last byte offset */ 2766 static const short offsets[] = { 2767 offsetof(xfs_agf_t, agf_magicnum), 2768 offsetof(xfs_agf_t, agf_versionnum), 2769 offsetof(xfs_agf_t, agf_seqno), 2770 offsetof(xfs_agf_t, agf_length), 2771 offsetof(xfs_agf_t, agf_roots[0]), 2772 offsetof(xfs_agf_t, agf_levels[0]), 2773 offsetof(xfs_agf_t, agf_flfirst), 2774 offsetof(xfs_agf_t, agf_fllast), 2775 offsetof(xfs_agf_t, agf_flcount), 2776 offsetof(xfs_agf_t, agf_freeblks), 2777 offsetof(xfs_agf_t, agf_longest), 2778 offsetof(xfs_agf_t, agf_btreeblks), 2779 offsetof(xfs_agf_t, agf_uuid), 2780 offsetof(xfs_agf_t, agf_rmap_blocks), 2781 offsetof(xfs_agf_t, agf_refcount_blocks), 2782 offsetof(xfs_agf_t, agf_refcount_root), 2783 offsetof(xfs_agf_t, agf_refcount_level), 2784 /* needed so that we don't log the whole rest of the structure: */ 2785 offsetof(xfs_agf_t, agf_spare64), 2786 sizeof(xfs_agf_t) 2787 }; 2788 2789 trace_xfs_agf(tp->t_mountp, bp->b_addr, fields, _RET_IP_); 2790 2791 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF); 2792 2793 xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last); 2794 xfs_trans_log_buf(tp, bp, (uint)first, (uint)last); 2795 } 2796 2797 /* 2798 * Put the block on the freelist for the allocation group. 2799 */ 2800 int 2801 xfs_alloc_put_freelist( 2802 struct xfs_perag *pag, 2803 struct xfs_trans *tp, 2804 struct xfs_buf *agbp, 2805 struct xfs_buf *agflbp, 2806 xfs_agblock_t bno, 2807 int btreeblk) 2808 { 2809 struct xfs_mount *mp = tp->t_mountp; 2810 struct xfs_agf *agf = agbp->b_addr; 2811 __be32 *blockp; 2812 int error; 2813 uint32_t logflags; 2814 __be32 *agfl_bno; 2815 int startoff; 2816 2817 if (!agflbp) { 2818 error = xfs_alloc_read_agfl(pag, tp, &agflbp); 2819 if (error) 2820 return error; 2821 } 2822 2823 be32_add_cpu(&agf->agf_fllast, 1); 2824 if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp)) 2825 agf->agf_fllast = 0; 2826 2827 ASSERT(!xfs_perag_agfl_needs_reset(pag)); 2828 be32_add_cpu(&agf->agf_flcount, 1); 2829 pag->pagf_flcount++; 2830 2831 logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT; 2832 if (btreeblk) { 2833 be32_add_cpu(&agf->agf_btreeblks, -1); 2834 pag->pagf_btreeblks--; 2835 logflags |= XFS_AGF_BTREEBLKS; 2836 } 2837 2838 xfs_alloc_log_agf(tp, agbp, logflags); 2839 2840 ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)); 2841 2842 agfl_bno = xfs_buf_to_agfl_bno(agflbp); 2843 blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)]; 2844 *blockp = cpu_to_be32(bno); 2845 startoff = (char *)blockp - (char *)agflbp->b_addr; 2846 2847 xfs_alloc_log_agf(tp, agbp, logflags); 2848 2849 xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF); 2850 xfs_trans_log_buf(tp, agflbp, startoff, 2851 startoff + sizeof(xfs_agblock_t) - 1); 2852 return 0; 2853 } 2854 2855 static xfs_failaddr_t 2856 xfs_agf_verify( 2857 struct xfs_buf *bp) 2858 { 2859 struct xfs_mount *mp = bp->b_mount; 2860 struct xfs_agf *agf = bp->b_addr; 2861 2862 if (xfs_has_crc(mp)) { 2863 if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid)) 2864 return __this_address; 2865 if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn))) 2866 return __this_address; 2867 } 2868 2869 if (!xfs_verify_magic(bp, agf->agf_magicnum)) 2870 return __this_address; 2871 2872 if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && 2873 be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && 2874 be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) && 2875 be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) && 2876 be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp))) 2877 return __this_address; 2878 2879 if (be32_to_cpu(agf->agf_length) > mp->m_sb.sb_dblocks) 2880 return __this_address; 2881 2882 if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) || 2883 be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length)) 2884 return __this_address; 2885 2886 if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 || 2887 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 || 2888 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > 2889 mp->m_alloc_maxlevels || 2890 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > 2891 mp->m_alloc_maxlevels) 2892 return __this_address; 2893 2894 if (xfs_has_rmapbt(mp) && 2895 (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 || 2896 be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > 2897 mp->m_rmap_maxlevels)) 2898 return __this_address; 2899 2900 if (xfs_has_rmapbt(mp) && 2901 be32_to_cpu(agf->agf_rmap_blocks) > be32_to_cpu(agf->agf_length)) 2902 return __this_address; 2903 2904 /* 2905 * during growfs operations, the perag is not fully initialised, 2906 * so we can't use it for any useful checking. growfs ensures we can't 2907 * use it by using uncached buffers that don't have the perag attached 2908 * so we can detect and avoid this problem. 2909 */ 2910 if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno) 2911 return __this_address; 2912 2913 if (xfs_has_lazysbcount(mp) && 2914 be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length)) 2915 return __this_address; 2916 2917 if (xfs_has_reflink(mp) && 2918 be32_to_cpu(agf->agf_refcount_blocks) > 2919 be32_to_cpu(agf->agf_length)) 2920 return __this_address; 2921 2922 if (xfs_has_reflink(mp) && 2923 (be32_to_cpu(agf->agf_refcount_level) < 1 || 2924 be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels)) 2925 return __this_address; 2926 2927 return NULL; 2928 2929 } 2930 2931 static void 2932 xfs_agf_read_verify( 2933 struct xfs_buf *bp) 2934 { 2935 struct xfs_mount *mp = bp->b_mount; 2936 xfs_failaddr_t fa; 2937 2938 if (xfs_has_crc(mp) && 2939 !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF)) 2940 xfs_verifier_error(bp, -EFSBADCRC, __this_address); 2941 else { 2942 fa = xfs_agf_verify(bp); 2943 if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF)) 2944 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 2945 } 2946 } 2947 2948 static void 2949 xfs_agf_write_verify( 2950 struct xfs_buf *bp) 2951 { 2952 struct xfs_mount *mp = bp->b_mount; 2953 struct xfs_buf_log_item *bip = bp->b_log_item; 2954 struct xfs_agf *agf = bp->b_addr; 2955 xfs_failaddr_t fa; 2956 2957 fa = xfs_agf_verify(bp); 2958 if (fa) { 2959 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 2960 return; 2961 } 2962 2963 if (!xfs_has_crc(mp)) 2964 return; 2965 2966 if (bip) 2967 agf->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn); 2968 2969 xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF); 2970 } 2971 2972 const struct xfs_buf_ops xfs_agf_buf_ops = { 2973 .name = "xfs_agf", 2974 .magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) }, 2975 .verify_read = xfs_agf_read_verify, 2976 .verify_write = xfs_agf_write_verify, 2977 .verify_struct = xfs_agf_verify, 2978 }; 2979 2980 /* 2981 * Read in the allocation group header (free/alloc section). 2982 */ 2983 int 2984 xfs_read_agf( 2985 struct xfs_perag *pag, 2986 struct xfs_trans *tp, 2987 int flags, 2988 struct xfs_buf **agfbpp) 2989 { 2990 struct xfs_mount *mp = pag->pag_mount; 2991 int error; 2992 2993 trace_xfs_read_agf(pag->pag_mount, pag->pag_agno); 2994 2995 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 2996 XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)), 2997 XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops); 2998 if (error) 2999 return error; 3000 3001 xfs_buf_set_ref(*agfbpp, XFS_AGF_REF); 3002 return 0; 3003 } 3004 3005 /* 3006 * Read in the allocation group header (free/alloc section) and initialise the 3007 * perag structure if necessary. If the caller provides @agfbpp, then return the 3008 * locked buffer to the caller, otherwise free it. 3009 */ 3010 int 3011 xfs_alloc_read_agf( 3012 struct xfs_perag *pag, 3013 struct xfs_trans *tp, 3014 int flags, 3015 struct xfs_buf **agfbpp) 3016 { 3017 struct xfs_buf *agfbp; 3018 struct xfs_agf *agf; 3019 int error; 3020 int allocbt_blks; 3021 3022 trace_xfs_alloc_read_agf(pag->pag_mount, pag->pag_agno); 3023 3024 /* We don't support trylock when freeing. */ 3025 ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) != 3026 (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)); 3027 error = xfs_read_agf(pag, tp, 3028 (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0, 3029 &agfbp); 3030 if (error) 3031 return error; 3032 3033 agf = agfbp->b_addr; 3034 if (!xfs_perag_initialised_agf(pag)) { 3035 pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); 3036 pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); 3037 pag->pagf_flcount = be32_to_cpu(agf->agf_flcount); 3038 pag->pagf_longest = be32_to_cpu(agf->agf_longest); 3039 pag->pagf_levels[XFS_BTNUM_BNOi] = 3040 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]); 3041 pag->pagf_levels[XFS_BTNUM_CNTi] = 3042 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); 3043 pag->pagf_levels[XFS_BTNUM_RMAPi] = 3044 be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]); 3045 pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); 3046 if (xfs_agfl_needs_reset(pag->pag_mount, agf)) 3047 set_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate); 3048 3049 /* 3050 * Update the in-core allocbt counter. Filter out the rmapbt 3051 * subset of the btreeblks counter because the rmapbt is managed 3052 * by perag reservation. Subtract one for the rmapbt root block 3053 * because the rmap counter includes it while the btreeblks 3054 * counter only tracks non-root blocks. 3055 */ 3056 allocbt_blks = pag->pagf_btreeblks; 3057 if (xfs_has_rmapbt(pag->pag_mount)) 3058 allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1; 3059 if (allocbt_blks > 0) 3060 atomic64_add(allocbt_blks, 3061 &pag->pag_mount->m_allocbt_blks); 3062 3063 set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); 3064 } 3065 #ifdef DEBUG 3066 else if (!xfs_is_shutdown(pag->pag_mount)) { 3067 ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks)); 3068 ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks)); 3069 ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount)); 3070 ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest)); 3071 ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] == 3072 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi])); 3073 ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] == 3074 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi])); 3075 } 3076 #endif 3077 if (agfbpp) 3078 *agfbpp = agfbp; 3079 else 3080 xfs_trans_brelse(tp, agfbp); 3081 return 0; 3082 } 3083 3084 /* 3085 * Pre-proces allocation arguments to set initial state that we don't require 3086 * callers to set up correctly, as well as bounds check the allocation args 3087 * that are set up. 3088 */ 3089 static int 3090 xfs_alloc_vextent_check_args( 3091 struct xfs_alloc_arg *args, 3092 xfs_fsblock_t target, 3093 xfs_agnumber_t *minimum_agno) 3094 { 3095 struct xfs_mount *mp = args->mp; 3096 xfs_agblock_t agsize; 3097 3098 args->fsbno = NULLFSBLOCK; 3099 3100 *minimum_agno = 0; 3101 if (args->tp->t_highest_agno != NULLAGNUMBER) 3102 *minimum_agno = args->tp->t_highest_agno; 3103 3104 /* 3105 * Just fix this up, for the case where the last a.g. is shorter 3106 * (or there's only one a.g.) and the caller couldn't easily figure 3107 * that out (xfs_bmap_alloc). 3108 */ 3109 agsize = mp->m_sb.sb_agblocks; 3110 if (args->maxlen > agsize) 3111 args->maxlen = agsize; 3112 if (args->alignment == 0) 3113 args->alignment = 1; 3114 3115 ASSERT(args->minlen > 0); 3116 ASSERT(args->maxlen > 0); 3117 ASSERT(args->alignment > 0); 3118 ASSERT(args->resv != XFS_AG_RESV_AGFL); 3119 3120 ASSERT(XFS_FSB_TO_AGNO(mp, target) < mp->m_sb.sb_agcount); 3121 ASSERT(XFS_FSB_TO_AGBNO(mp, target) < agsize); 3122 ASSERT(args->minlen <= args->maxlen); 3123 ASSERT(args->minlen <= agsize); 3124 ASSERT(args->mod < args->prod); 3125 3126 if (XFS_FSB_TO_AGNO(mp, target) >= mp->m_sb.sb_agcount || 3127 XFS_FSB_TO_AGBNO(mp, target) >= agsize || 3128 args->minlen > args->maxlen || args->minlen > agsize || 3129 args->mod >= args->prod) { 3130 trace_xfs_alloc_vextent_badargs(args); 3131 return -ENOSPC; 3132 } 3133 3134 if (args->agno != NULLAGNUMBER && *minimum_agno > args->agno) { 3135 trace_xfs_alloc_vextent_skip_deadlock(args); 3136 return -ENOSPC; 3137 } 3138 return 0; 3139 3140 } 3141 3142 /* 3143 * Prepare an AG for allocation. If the AG is not prepared to accept the 3144 * allocation, return failure. 3145 * 3146 * XXX(dgc): The complexity of "need_pag" will go away as all caller paths are 3147 * modified to hold their own perag references. 3148 */ 3149 static int 3150 xfs_alloc_vextent_prepare_ag( 3151 struct xfs_alloc_arg *args) 3152 { 3153 bool need_pag = !args->pag; 3154 int error; 3155 3156 if (need_pag) 3157 args->pag = xfs_perag_get(args->mp, args->agno); 3158 3159 args->agbp = NULL; 3160 error = xfs_alloc_fix_freelist(args, 0); 3161 if (error) { 3162 trace_xfs_alloc_vextent_nofix(args); 3163 if (need_pag) 3164 xfs_perag_put(args->pag); 3165 args->agbno = NULLAGBLOCK; 3166 return error; 3167 } 3168 if (!args->agbp) { 3169 /* cannot allocate in this AG at all */ 3170 trace_xfs_alloc_vextent_noagbp(args); 3171 args->agbno = NULLAGBLOCK; 3172 return 0; 3173 } 3174 args->wasfromfl = 0; 3175 return 0; 3176 } 3177 3178 /* 3179 * Post-process allocation results to account for the allocation if it succeed 3180 * and set the allocated block number correctly for the caller. 3181 * 3182 * XXX: we should really be returning ENOSPC for ENOSPC, not 3183 * hiding it behind a "successful" NULLFSBLOCK allocation. 3184 */ 3185 static int 3186 xfs_alloc_vextent_finish( 3187 struct xfs_alloc_arg *args, 3188 xfs_agnumber_t minimum_agno, 3189 int alloc_error, 3190 bool drop_perag) 3191 { 3192 struct xfs_mount *mp = args->mp; 3193 int error = 0; 3194 3195 /* 3196 * We can end up here with a locked AGF. If we failed, the caller is 3197 * likely going to try to allocate again with different parameters, and 3198 * that can widen the AGs that are searched for free space. If we have 3199 * to do BMBT block allocation, we have to do a new allocation. 3200 * 3201 * Hence leaving this function with the AGF locked opens up potential 3202 * ABBA AGF deadlocks because a future allocation attempt in this 3203 * transaction may attempt to lock a lower number AGF. 3204 * 3205 * We can't release the AGF until the transaction is commited, so at 3206 * this point we must update the "first allocation" tracker to point at 3207 * this AG if the tracker is empty or points to a lower AG. This allows 3208 * the next allocation attempt to be modified appropriately to avoid 3209 * deadlocks. 3210 */ 3211 if (args->agbp && 3212 (args->tp->t_highest_agno == NULLAGNUMBER || 3213 args->agno > minimum_agno)) 3214 args->tp->t_highest_agno = args->agno; 3215 3216 /* 3217 * If the allocation failed with an error or we had an ENOSPC result, 3218 * preserve the returned error whilst also marking the allocation result 3219 * as "no extent allocated". This ensures that callers that fail to 3220 * capture the error will still treat it as a failed allocation. 3221 */ 3222 if (alloc_error || args->agbno == NULLAGBLOCK) { 3223 args->fsbno = NULLFSBLOCK; 3224 error = alloc_error; 3225 goto out_drop_perag; 3226 } 3227 3228 args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno); 3229 3230 ASSERT(args->len >= args->minlen); 3231 ASSERT(args->len <= args->maxlen); 3232 ASSERT(args->agbno % args->alignment == 0); 3233 XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno), args->len); 3234 3235 /* if not file data, insert new block into the reverse map btree */ 3236 if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) { 3237 error = xfs_rmap_alloc(args->tp, args->agbp, args->pag, 3238 args->agbno, args->len, &args->oinfo); 3239 if (error) 3240 goto out_drop_perag; 3241 } 3242 3243 if (!args->wasfromfl) { 3244 error = xfs_alloc_update_counters(args->tp, args->agbp, 3245 -((long)(args->len))); 3246 if (error) 3247 goto out_drop_perag; 3248 3249 ASSERT(!xfs_extent_busy_search(mp, args->pag, args->agbno, 3250 args->len)); 3251 } 3252 3253 xfs_ag_resv_alloc_extent(args->pag, args->resv, args); 3254 3255 XFS_STATS_INC(mp, xs_allocx); 3256 XFS_STATS_ADD(mp, xs_allocb, args->len); 3257 3258 out_drop_perag: 3259 if (drop_perag && args->pag) { 3260 xfs_perag_rele(args->pag); 3261 args->pag = NULL; 3262 } 3263 return error; 3264 } 3265 3266 /* 3267 * Allocate within a single AG only. This uses a best-fit length algorithm so if 3268 * you need an exact sized allocation without locality constraints, this is the 3269 * fastest way to do it. 3270 * 3271 * Caller is expected to hold a perag reference in args->pag. 3272 */ 3273 int 3274 xfs_alloc_vextent_this_ag( 3275 struct xfs_alloc_arg *args, 3276 xfs_agnumber_t agno) 3277 { 3278 struct xfs_mount *mp = args->mp; 3279 xfs_agnumber_t minimum_agno; 3280 int error; 3281 3282 args->agno = agno; 3283 args->agbno = 0; 3284 error = xfs_alloc_vextent_check_args(args, XFS_AGB_TO_FSB(mp, agno, 0), 3285 &minimum_agno); 3286 if (error) { 3287 if (error == -ENOSPC) 3288 return 0; 3289 return error; 3290 } 3291 3292 error = xfs_alloc_vextent_prepare_ag(args); 3293 if (!error && args->agbp) 3294 error = xfs_alloc_ag_vextent_size(args); 3295 3296 return xfs_alloc_vextent_finish(args, minimum_agno, error, false); 3297 } 3298 3299 /* 3300 * Iterate all AGs trying to allocate an extent starting from @start_ag. 3301 * 3302 * If the incoming allocation type is XFS_ALLOCTYPE_NEAR_BNO, it means the 3303 * allocation attempts in @start_agno have locality information. If we fail to 3304 * allocate in that AG, then we revert to anywhere-in-AG for all the other AGs 3305 * we attempt to allocation in as there is no locality optimisation possible for 3306 * those allocations. 3307 * 3308 * On return, args->pag may be left referenced if we finish before the "all 3309 * failed" return point. The allocation finish still needs the perag, and 3310 * so the caller will release it once they've finished the allocation. 3311 * 3312 * When we wrap the AG iteration at the end of the filesystem, we have to be 3313 * careful not to wrap into AGs below ones we already have locked in the 3314 * transaction if we are doing a blocking iteration. This will result in an 3315 * out-of-order locking of AGFs and hence can cause deadlocks. 3316 */ 3317 static int 3318 xfs_alloc_vextent_iterate_ags( 3319 struct xfs_alloc_arg *args, 3320 xfs_agnumber_t minimum_agno, 3321 xfs_agnumber_t start_agno, 3322 xfs_agblock_t target_agbno, 3323 uint32_t flags) 3324 { 3325 struct xfs_mount *mp = args->mp; 3326 xfs_agnumber_t agno; 3327 int error = 0; 3328 3329 restart: 3330 for_each_perag_wrap_range(mp, start_agno, minimum_agno, 3331 mp->m_sb.sb_agcount, agno, args->pag) { 3332 args->agno = agno; 3333 error = xfs_alloc_vextent_prepare_ag(args); 3334 if (error) 3335 break; 3336 if (!args->agbp) { 3337 trace_xfs_alloc_vextent_loopfailed(args); 3338 continue; 3339 } 3340 3341 /* 3342 * Allocation is supposed to succeed now, so break out of the 3343 * loop regardless of whether we succeed or not. 3344 */ 3345 if (args->agno == start_agno && target_agbno) { 3346 args->agbno = target_agbno; 3347 error = xfs_alloc_ag_vextent_near(args); 3348 } else { 3349 args->agbno = 0; 3350 error = xfs_alloc_ag_vextent_size(args); 3351 } 3352 break; 3353 } 3354 if (error) { 3355 xfs_perag_rele(args->pag); 3356 args->pag = NULL; 3357 return error; 3358 } 3359 if (args->agbp) 3360 return 0; 3361 3362 /* 3363 * We didn't find an AG we can alloation from. If we were given 3364 * constraining flags by the caller, drop them and retry the allocation 3365 * without any constraints being set. 3366 */ 3367 if (flags) { 3368 flags = 0; 3369 goto restart; 3370 } 3371 3372 ASSERT(args->pag == NULL); 3373 trace_xfs_alloc_vextent_allfailed(args); 3374 return 0; 3375 } 3376 3377 /* 3378 * Iterate from the AGs from the start AG to the end of the filesystem, trying 3379 * to allocate blocks. It starts with a near allocation attempt in the initial 3380 * AG, then falls back to anywhere-in-ag after the first AG fails. It will wrap 3381 * back to zero if allowed by previous allocations in this transaction, 3382 * otherwise will wrap back to the start AG and run a second blocking pass to 3383 * the end of the filesystem. 3384 */ 3385 int 3386 xfs_alloc_vextent_start_ag( 3387 struct xfs_alloc_arg *args, 3388 xfs_fsblock_t target) 3389 { 3390 struct xfs_mount *mp = args->mp; 3391 xfs_agnumber_t minimum_agno; 3392 xfs_agnumber_t start_agno; 3393 xfs_agnumber_t rotorstep = xfs_rotorstep; 3394 bool bump_rotor = false; 3395 int error; 3396 3397 args->agno = NULLAGNUMBER; 3398 args->agbno = NULLAGBLOCK; 3399 error = xfs_alloc_vextent_check_args(args, target, &minimum_agno); 3400 if (error) { 3401 if (error == -ENOSPC) 3402 return 0; 3403 return error; 3404 } 3405 3406 if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) && 3407 xfs_is_inode32(mp)) { 3408 target = XFS_AGB_TO_FSB(mp, 3409 ((mp->m_agfrotor / rotorstep) % 3410 mp->m_sb.sb_agcount), 0); 3411 bump_rotor = 1; 3412 } 3413 3414 start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target)); 3415 error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno, 3416 XFS_FSB_TO_AGBNO(mp, target), XFS_ALLOC_FLAG_TRYLOCK); 3417 3418 if (bump_rotor) { 3419 if (args->agno == start_agno) 3420 mp->m_agfrotor = (mp->m_agfrotor + 1) % 3421 (mp->m_sb.sb_agcount * rotorstep); 3422 else 3423 mp->m_agfrotor = (args->agno * rotorstep + 1) % 3424 (mp->m_sb.sb_agcount * rotorstep); 3425 } 3426 3427 return xfs_alloc_vextent_finish(args, minimum_agno, error, true); 3428 } 3429 3430 /* 3431 * Iterate from the agno indicated via @target through to the end of the 3432 * filesystem attempting blocking allocation. This does not wrap or try a second 3433 * pass, so will not recurse into AGs lower than indicated by the target. 3434 */ 3435 int 3436 xfs_alloc_vextent_first_ag( 3437 struct xfs_alloc_arg *args, 3438 xfs_fsblock_t target) 3439 { 3440 struct xfs_mount *mp = args->mp; 3441 xfs_agnumber_t minimum_agno; 3442 xfs_agnumber_t start_agno; 3443 int error; 3444 3445 args->agno = NULLAGNUMBER; 3446 args->agbno = NULLAGBLOCK; 3447 error = xfs_alloc_vextent_check_args(args, target, &minimum_agno); 3448 if (error) { 3449 if (error == -ENOSPC) 3450 return 0; 3451 return error; 3452 } 3453 3454 start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target)); 3455 error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno, 3456 XFS_FSB_TO_AGBNO(mp, target), 0); 3457 return xfs_alloc_vextent_finish(args, minimum_agno, error, true); 3458 } 3459 3460 /* 3461 * Allocate at the exact block target or fail. Caller is expected to hold a 3462 * perag reference in args->pag. 3463 */ 3464 int 3465 xfs_alloc_vextent_exact_bno( 3466 struct xfs_alloc_arg *args, 3467 xfs_fsblock_t target) 3468 { 3469 struct xfs_mount *mp = args->mp; 3470 xfs_agnumber_t minimum_agno; 3471 int error; 3472 3473 args->agno = XFS_FSB_TO_AGNO(mp, target); 3474 args->agbno = XFS_FSB_TO_AGBNO(mp, target); 3475 error = xfs_alloc_vextent_check_args(args, target, &minimum_agno); 3476 if (error) { 3477 if (error == -ENOSPC) 3478 return 0; 3479 return error; 3480 } 3481 3482 error = xfs_alloc_vextent_prepare_ag(args); 3483 if (!error && args->agbp) 3484 error = xfs_alloc_ag_vextent_exact(args); 3485 3486 return xfs_alloc_vextent_finish(args, minimum_agno, error, false); 3487 } 3488 3489 /* 3490 * Allocate an extent as close to the target as possible. If there are not 3491 * viable candidates in the AG, then fail the allocation. 3492 * 3493 * Caller may or may not have a per-ag reference in args->pag. 3494 */ 3495 int 3496 xfs_alloc_vextent_near_bno( 3497 struct xfs_alloc_arg *args, 3498 xfs_fsblock_t target) 3499 { 3500 struct xfs_mount *mp = args->mp; 3501 xfs_agnumber_t minimum_agno; 3502 bool needs_perag = args->pag == NULL; 3503 int error; 3504 3505 args->agno = XFS_FSB_TO_AGNO(mp, target); 3506 args->agbno = XFS_FSB_TO_AGBNO(mp, target); 3507 error = xfs_alloc_vextent_check_args(args, target, &minimum_agno); 3508 if (error) { 3509 if (error == -ENOSPC) 3510 return 0; 3511 return error; 3512 } 3513 3514 if (needs_perag) 3515 args->pag = xfs_perag_grab(mp, args->agno); 3516 3517 error = xfs_alloc_vextent_prepare_ag(args); 3518 if (!error && args->agbp) 3519 error = xfs_alloc_ag_vextent_near(args); 3520 3521 return xfs_alloc_vextent_finish(args, minimum_agno, error, needs_perag); 3522 } 3523 3524 /* Ensure that the freelist is at full capacity. */ 3525 int 3526 xfs_free_extent_fix_freelist( 3527 struct xfs_trans *tp, 3528 struct xfs_perag *pag, 3529 struct xfs_buf **agbp) 3530 { 3531 struct xfs_alloc_arg args; 3532 int error; 3533 3534 memset(&args, 0, sizeof(struct xfs_alloc_arg)); 3535 args.tp = tp; 3536 args.mp = tp->t_mountp; 3537 args.agno = pag->pag_agno; 3538 args.pag = pag; 3539 3540 /* 3541 * validate that the block number is legal - the enables us to detect 3542 * and handle a silent filesystem corruption rather than crashing. 3543 */ 3544 if (args.agno >= args.mp->m_sb.sb_agcount) 3545 return -EFSCORRUPTED; 3546 3547 error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING); 3548 if (error) 3549 return error; 3550 3551 *agbp = args.agbp; 3552 return 0; 3553 } 3554 3555 /* 3556 * Free an extent. 3557 * Just break up the extent address and hand off to xfs_free_ag_extent 3558 * after fixing up the freelist. 3559 */ 3560 int 3561 __xfs_free_extent( 3562 struct xfs_trans *tp, 3563 xfs_fsblock_t bno, 3564 xfs_extlen_t len, 3565 const struct xfs_owner_info *oinfo, 3566 enum xfs_ag_resv_type type, 3567 bool skip_discard) 3568 { 3569 struct xfs_mount *mp = tp->t_mountp; 3570 struct xfs_buf *agbp; 3571 xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno); 3572 xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno); 3573 struct xfs_agf *agf; 3574 int error; 3575 unsigned int busy_flags = 0; 3576 struct xfs_perag *pag; 3577 3578 ASSERT(len != 0); 3579 ASSERT(type != XFS_AG_RESV_AGFL); 3580 3581 if (XFS_TEST_ERROR(false, mp, 3582 XFS_ERRTAG_FREE_EXTENT)) 3583 return -EIO; 3584 3585 pag = xfs_perag_get(mp, agno); 3586 error = xfs_free_extent_fix_freelist(tp, pag, &agbp); 3587 if (error) 3588 goto err; 3589 agf = agbp->b_addr; 3590 3591 if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) { 3592 error = -EFSCORRUPTED; 3593 goto err_release; 3594 } 3595 3596 /* validate the extent size is legal now we have the agf locked */ 3597 if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) { 3598 error = -EFSCORRUPTED; 3599 goto err_release; 3600 } 3601 3602 error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type); 3603 if (error) 3604 goto err_release; 3605 3606 if (skip_discard) 3607 busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD; 3608 xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags); 3609 xfs_perag_put(pag); 3610 return 0; 3611 3612 err_release: 3613 xfs_trans_brelse(tp, agbp); 3614 err: 3615 xfs_perag_put(pag); 3616 return error; 3617 } 3618 3619 struct xfs_alloc_query_range_info { 3620 xfs_alloc_query_range_fn fn; 3621 void *priv; 3622 }; 3623 3624 /* Format btree record and pass to our callback. */ 3625 STATIC int 3626 xfs_alloc_query_range_helper( 3627 struct xfs_btree_cur *cur, 3628 const union xfs_btree_rec *rec, 3629 void *priv) 3630 { 3631 struct xfs_alloc_query_range_info *query = priv; 3632 struct xfs_alloc_rec_incore irec; 3633 3634 irec.ar_startblock = be32_to_cpu(rec->alloc.ar_startblock); 3635 irec.ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount); 3636 return query->fn(cur, &irec, query->priv); 3637 } 3638 3639 /* Find all free space within a given range of blocks. */ 3640 int 3641 xfs_alloc_query_range( 3642 struct xfs_btree_cur *cur, 3643 const struct xfs_alloc_rec_incore *low_rec, 3644 const struct xfs_alloc_rec_incore *high_rec, 3645 xfs_alloc_query_range_fn fn, 3646 void *priv) 3647 { 3648 union xfs_btree_irec low_brec; 3649 union xfs_btree_irec high_brec; 3650 struct xfs_alloc_query_range_info query; 3651 3652 ASSERT(cur->bc_btnum == XFS_BTNUM_BNO); 3653 low_brec.a = *low_rec; 3654 high_brec.a = *high_rec; 3655 query.priv = priv; 3656 query.fn = fn; 3657 return xfs_btree_query_range(cur, &low_brec, &high_brec, 3658 xfs_alloc_query_range_helper, &query); 3659 } 3660 3661 /* Find all free space records. */ 3662 int 3663 xfs_alloc_query_all( 3664 struct xfs_btree_cur *cur, 3665 xfs_alloc_query_range_fn fn, 3666 void *priv) 3667 { 3668 struct xfs_alloc_query_range_info query; 3669 3670 ASSERT(cur->bc_btnum == XFS_BTNUM_BNO); 3671 query.priv = priv; 3672 query.fn = fn; 3673 return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query); 3674 } 3675 3676 /* Is there a record covering a given extent? */ 3677 int 3678 xfs_alloc_has_record( 3679 struct xfs_btree_cur *cur, 3680 xfs_agblock_t bno, 3681 xfs_extlen_t len, 3682 bool *exists) 3683 { 3684 union xfs_btree_irec low; 3685 union xfs_btree_irec high; 3686 3687 memset(&low, 0, sizeof(low)); 3688 low.a.ar_startblock = bno; 3689 memset(&high, 0xFF, sizeof(high)); 3690 high.a.ar_startblock = bno + len - 1; 3691 3692 return xfs_btree_has_record(cur, &low, &high, exists); 3693 } 3694 3695 /* 3696 * Walk all the blocks in the AGFL. The @walk_fn can return any negative 3697 * error code or XFS_ITER_*. 3698 */ 3699 int 3700 xfs_agfl_walk( 3701 struct xfs_mount *mp, 3702 struct xfs_agf *agf, 3703 struct xfs_buf *agflbp, 3704 xfs_agfl_walk_fn walk_fn, 3705 void *priv) 3706 { 3707 __be32 *agfl_bno; 3708 unsigned int i; 3709 int error; 3710 3711 agfl_bno = xfs_buf_to_agfl_bno(agflbp); 3712 i = be32_to_cpu(agf->agf_flfirst); 3713 3714 /* Nothing to walk in an empty AGFL. */ 3715 if (agf->agf_flcount == cpu_to_be32(0)) 3716 return 0; 3717 3718 /* Otherwise, walk from first to last, wrapping as needed. */ 3719 for (;;) { 3720 error = walk_fn(mp, be32_to_cpu(agfl_bno[i]), priv); 3721 if (error) 3722 return error; 3723 if (i == be32_to_cpu(agf->agf_fllast)) 3724 break; 3725 if (++i == xfs_agfl_size(mp)) 3726 i = 0; 3727 } 3728 3729 return 0; 3730 } 3731 3732 int __init 3733 xfs_extfree_intent_init_cache(void) 3734 { 3735 xfs_extfree_item_cache = kmem_cache_create("xfs_extfree_intent", 3736 sizeof(struct xfs_extent_free_item), 3737 0, 0, NULL); 3738 3739 return xfs_extfree_item_cache != NULL ? 0 : -ENOMEM; 3740 } 3741 3742 void 3743 xfs_extfree_intent_destroy_cache(void) 3744 { 3745 kmem_cache_destroy(xfs_extfree_item_cache); 3746 xfs_extfree_item_cache = NULL; 3747 } 3748