1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. 4 * All Rights Reserved. 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_format.h" 9 #include "xfs_log_format.h" 10 #include "xfs_shared.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_bit.h" 13 #include "xfs_sb.h" 14 #include "xfs_mount.h" 15 #include "xfs_defer.h" 16 #include "xfs_btree.h" 17 #include "xfs_rmap.h" 18 #include "xfs_alloc_btree.h" 19 #include "xfs_alloc.h" 20 #include "xfs_extent_busy.h" 21 #include "xfs_errortag.h" 22 #include "xfs_error.h" 23 #include "xfs_trace.h" 24 #include "xfs_trans.h" 25 #include "xfs_buf_item.h" 26 #include "xfs_log.h" 27 #include "xfs_ag_resv.h" 28 #include "xfs_bmap.h" 29 30 extern kmem_zone_t *xfs_bmap_free_item_zone; 31 32 struct workqueue_struct *xfs_alloc_wq; 33 34 #define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b))) 35 36 #define XFSA_FIXUP_BNO_OK 1 37 #define XFSA_FIXUP_CNT_OK 2 38 39 STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *); 40 STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *); 41 STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); 42 43 /* 44 * Size of the AGFL. For CRC-enabled filesystes we steal a couple of slots in 45 * the beginning of the block for a proper header with the location information 46 * and CRC. 47 */ 48 unsigned int 49 xfs_agfl_size( 50 struct xfs_mount *mp) 51 { 52 unsigned int size = mp->m_sb.sb_sectsize; 53 54 if (xfs_sb_version_hascrc(&mp->m_sb)) 55 size -= sizeof(struct xfs_agfl); 56 57 return size / sizeof(xfs_agblock_t); 58 } 59 60 unsigned int 61 xfs_refc_block( 62 struct xfs_mount *mp) 63 { 64 if (xfs_sb_version_hasrmapbt(&mp->m_sb)) 65 return XFS_RMAP_BLOCK(mp) + 1; 66 if (xfs_sb_version_hasfinobt(&mp->m_sb)) 67 return XFS_FIBT_BLOCK(mp) + 1; 68 return XFS_IBT_BLOCK(mp) + 1; 69 } 70 71 xfs_extlen_t 72 xfs_prealloc_blocks( 73 struct xfs_mount *mp) 74 { 75 if (xfs_sb_version_hasreflink(&mp->m_sb)) 76 return xfs_refc_block(mp) + 1; 77 if (xfs_sb_version_hasrmapbt(&mp->m_sb)) 78 return XFS_RMAP_BLOCK(mp) + 1; 79 if (xfs_sb_version_hasfinobt(&mp->m_sb)) 80 return XFS_FIBT_BLOCK(mp) + 1; 81 return XFS_IBT_BLOCK(mp) + 1; 82 } 83 84 /* 85 * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of 86 * AGF buffer (PV 947395), we place constraints on the relationship among 87 * actual allocations for data blocks, freelist blocks, and potential file data 88 * bmap btree blocks. However, these restrictions may result in no actual space 89 * allocated for a delayed extent, for example, a data block in a certain AG is 90 * allocated but there is no additional block for the additional bmap btree 91 * block due to a split of the bmap btree of the file. The result of this may 92 * lead to an infinite loop when the file gets flushed to disk and all delayed 93 * extents need to be actually allocated. To get around this, we explicitly set 94 * aside a few blocks which will not be reserved in delayed allocation. 95 * 96 * We need to reserve 4 fsbs _per AG_ for the freelist and 4 more to handle a 97 * potential split of the file's bmap btree. 98 */ 99 unsigned int 100 xfs_alloc_set_aside( 101 struct xfs_mount *mp) 102 { 103 return mp->m_sb.sb_agcount * (XFS_ALLOC_AGFL_RESERVE + 4); 104 } 105 106 /* 107 * When deciding how much space to allocate out of an AG, we limit the 108 * allocation maximum size to the size the AG. However, we cannot use all the 109 * blocks in the AG - some are permanently used by metadata. These 110 * blocks are generally: 111 * - the AG superblock, AGF, AGI and AGFL 112 * - the AGF (bno and cnt) and AGI btree root blocks, and optionally 113 * the AGI free inode and rmap btree root blocks. 114 * - blocks on the AGFL according to xfs_alloc_set_aside() limits 115 * - the rmapbt root block 116 * 117 * The AG headers are sector sized, so the amount of space they take up is 118 * dependent on filesystem geometry. The others are all single blocks. 119 */ 120 unsigned int 121 xfs_alloc_ag_max_usable( 122 struct xfs_mount *mp) 123 { 124 unsigned int blocks; 125 126 blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */ 127 blocks += XFS_ALLOC_AGFL_RESERVE; 128 blocks += 3; /* AGF, AGI btree root blocks */ 129 if (xfs_sb_version_hasfinobt(&mp->m_sb)) 130 blocks++; /* finobt root block */ 131 if (xfs_sb_version_hasrmapbt(&mp->m_sb)) 132 blocks++; /* rmap root block */ 133 if (xfs_sb_version_hasreflink(&mp->m_sb)) 134 blocks++; /* refcount root block */ 135 136 return mp->m_sb.sb_agblocks - blocks; 137 } 138 139 /* 140 * Lookup the record equal to [bno, len] in the btree given by cur. 141 */ 142 STATIC int /* error */ 143 xfs_alloc_lookup_eq( 144 struct xfs_btree_cur *cur, /* btree cursor */ 145 xfs_agblock_t bno, /* starting block of extent */ 146 xfs_extlen_t len, /* length of extent */ 147 int *stat) /* success/failure */ 148 { 149 int error; 150 151 cur->bc_rec.a.ar_startblock = bno; 152 cur->bc_rec.a.ar_blockcount = len; 153 error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); 154 cur->bc_private.a.priv.abt.active = (*stat == 1); 155 return error; 156 } 157 158 /* 159 * Lookup the first record greater than or equal to [bno, len] 160 * in the btree given by cur. 161 */ 162 int /* error */ 163 xfs_alloc_lookup_ge( 164 struct xfs_btree_cur *cur, /* btree cursor */ 165 xfs_agblock_t bno, /* starting block of extent */ 166 xfs_extlen_t len, /* length of extent */ 167 int *stat) /* success/failure */ 168 { 169 int error; 170 171 cur->bc_rec.a.ar_startblock = bno; 172 cur->bc_rec.a.ar_blockcount = len; 173 error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); 174 cur->bc_private.a.priv.abt.active = (*stat == 1); 175 return error; 176 } 177 178 /* 179 * Lookup the first record less than or equal to [bno, len] 180 * in the btree given by cur. 181 */ 182 int /* error */ 183 xfs_alloc_lookup_le( 184 struct xfs_btree_cur *cur, /* btree cursor */ 185 xfs_agblock_t bno, /* starting block of extent */ 186 xfs_extlen_t len, /* length of extent */ 187 int *stat) /* success/failure */ 188 { 189 int error; 190 cur->bc_rec.a.ar_startblock = bno; 191 cur->bc_rec.a.ar_blockcount = len; 192 error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); 193 cur->bc_private.a.priv.abt.active = (*stat == 1); 194 return error; 195 } 196 197 static inline bool 198 xfs_alloc_cur_active( 199 struct xfs_btree_cur *cur) 200 { 201 return cur && cur->bc_private.a.priv.abt.active; 202 } 203 204 /* 205 * Update the record referred to by cur to the value given 206 * by [bno, len]. 207 * This either works (return 0) or gets an EFSCORRUPTED error. 208 */ 209 STATIC int /* error */ 210 xfs_alloc_update( 211 struct xfs_btree_cur *cur, /* btree cursor */ 212 xfs_agblock_t bno, /* starting block of extent */ 213 xfs_extlen_t len) /* length of extent */ 214 { 215 union xfs_btree_rec rec; 216 217 rec.alloc.ar_startblock = cpu_to_be32(bno); 218 rec.alloc.ar_blockcount = cpu_to_be32(len); 219 return xfs_btree_update(cur, &rec); 220 } 221 222 /* 223 * Get the data from the pointed-to record. 224 */ 225 int /* error */ 226 xfs_alloc_get_rec( 227 struct xfs_btree_cur *cur, /* btree cursor */ 228 xfs_agblock_t *bno, /* output: starting block of extent */ 229 xfs_extlen_t *len, /* output: length of extent */ 230 int *stat) /* output: success/failure */ 231 { 232 struct xfs_mount *mp = cur->bc_mp; 233 xfs_agnumber_t agno = cur->bc_private.a.agno; 234 union xfs_btree_rec *rec; 235 int error; 236 237 error = xfs_btree_get_rec(cur, &rec, stat); 238 if (error || !(*stat)) 239 return error; 240 241 *bno = be32_to_cpu(rec->alloc.ar_startblock); 242 *len = be32_to_cpu(rec->alloc.ar_blockcount); 243 244 if (*len == 0) 245 goto out_bad_rec; 246 247 /* check for valid extent range, including overflow */ 248 if (!xfs_verify_agbno(mp, agno, *bno)) 249 goto out_bad_rec; 250 if (*bno > *bno + *len) 251 goto out_bad_rec; 252 if (!xfs_verify_agbno(mp, agno, *bno + *len - 1)) 253 goto out_bad_rec; 254 255 return 0; 256 257 out_bad_rec: 258 xfs_warn(mp, 259 "%s Freespace BTree record corruption in AG %d detected!", 260 cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size", agno); 261 xfs_warn(mp, 262 "start block 0x%x block count 0x%x", *bno, *len); 263 return -EFSCORRUPTED; 264 } 265 266 /* 267 * Compute aligned version of the found extent. 268 * Takes alignment and min length into account. 269 */ 270 STATIC bool 271 xfs_alloc_compute_aligned( 272 xfs_alloc_arg_t *args, /* allocation argument structure */ 273 xfs_agblock_t foundbno, /* starting block in found extent */ 274 xfs_extlen_t foundlen, /* length in found extent */ 275 xfs_agblock_t *resbno, /* result block number */ 276 xfs_extlen_t *reslen, /* result length */ 277 unsigned *busy_gen) 278 { 279 xfs_agblock_t bno = foundbno; 280 xfs_extlen_t len = foundlen; 281 xfs_extlen_t diff; 282 bool busy; 283 284 /* Trim busy sections out of found extent */ 285 busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen); 286 287 /* 288 * If we have a largish extent that happens to start before min_agbno, 289 * see if we can shift it into range... 290 */ 291 if (bno < args->min_agbno && bno + len > args->min_agbno) { 292 diff = args->min_agbno - bno; 293 if (len > diff) { 294 bno += diff; 295 len -= diff; 296 } 297 } 298 299 if (args->alignment > 1 && len >= args->minlen) { 300 xfs_agblock_t aligned_bno = roundup(bno, args->alignment); 301 302 diff = aligned_bno - bno; 303 304 *resbno = aligned_bno; 305 *reslen = diff >= len ? 0 : len - diff; 306 } else { 307 *resbno = bno; 308 *reslen = len; 309 } 310 311 return busy; 312 } 313 314 /* 315 * Compute best start block and diff for "near" allocations. 316 * freelen >= wantlen already checked by caller. 317 */ 318 STATIC xfs_extlen_t /* difference value (absolute) */ 319 xfs_alloc_compute_diff( 320 xfs_agblock_t wantbno, /* target starting block */ 321 xfs_extlen_t wantlen, /* target length */ 322 xfs_extlen_t alignment, /* target alignment */ 323 int datatype, /* are we allocating data? */ 324 xfs_agblock_t freebno, /* freespace's starting block */ 325 xfs_extlen_t freelen, /* freespace's length */ 326 xfs_agblock_t *newbnop) /* result: best start block from free */ 327 { 328 xfs_agblock_t freeend; /* end of freespace extent */ 329 xfs_agblock_t newbno1; /* return block number */ 330 xfs_agblock_t newbno2; /* other new block number */ 331 xfs_extlen_t newlen1=0; /* length with newbno1 */ 332 xfs_extlen_t newlen2=0; /* length with newbno2 */ 333 xfs_agblock_t wantend; /* end of target extent */ 334 bool userdata = datatype & XFS_ALLOC_USERDATA; 335 336 ASSERT(freelen >= wantlen); 337 freeend = freebno + freelen; 338 wantend = wantbno + wantlen; 339 /* 340 * We want to allocate from the start of a free extent if it is past 341 * the desired block or if we are allocating user data and the free 342 * extent is before desired block. The second case is there to allow 343 * for contiguous allocation from the remaining free space if the file 344 * grows in the short term. 345 */ 346 if (freebno >= wantbno || (userdata && freeend < wantend)) { 347 if ((newbno1 = roundup(freebno, alignment)) >= freeend) 348 newbno1 = NULLAGBLOCK; 349 } else if (freeend >= wantend && alignment > 1) { 350 newbno1 = roundup(wantbno, alignment); 351 newbno2 = newbno1 - alignment; 352 if (newbno1 >= freeend) 353 newbno1 = NULLAGBLOCK; 354 else 355 newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1); 356 if (newbno2 < freebno) 357 newbno2 = NULLAGBLOCK; 358 else 359 newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2); 360 if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) { 361 if (newlen1 < newlen2 || 362 (newlen1 == newlen2 && 363 XFS_ABSDIFF(newbno1, wantbno) > 364 XFS_ABSDIFF(newbno2, wantbno))) 365 newbno1 = newbno2; 366 } else if (newbno2 != NULLAGBLOCK) 367 newbno1 = newbno2; 368 } else if (freeend >= wantend) { 369 newbno1 = wantbno; 370 } else if (alignment > 1) { 371 newbno1 = roundup(freeend - wantlen, alignment); 372 if (newbno1 > freeend - wantlen && 373 newbno1 - alignment >= freebno) 374 newbno1 -= alignment; 375 else if (newbno1 >= freeend) 376 newbno1 = NULLAGBLOCK; 377 } else 378 newbno1 = freeend - wantlen; 379 *newbnop = newbno1; 380 return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno); 381 } 382 383 /* 384 * Fix up the length, based on mod and prod. 385 * len should be k * prod + mod for some k. 386 * If len is too small it is returned unchanged. 387 * If len hits maxlen it is left alone. 388 */ 389 STATIC void 390 xfs_alloc_fix_len( 391 xfs_alloc_arg_t *args) /* allocation argument structure */ 392 { 393 xfs_extlen_t k; 394 xfs_extlen_t rlen; 395 396 ASSERT(args->mod < args->prod); 397 rlen = args->len; 398 ASSERT(rlen >= args->minlen); 399 ASSERT(rlen <= args->maxlen); 400 if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen || 401 (args->mod == 0 && rlen < args->prod)) 402 return; 403 k = rlen % args->prod; 404 if (k == args->mod) 405 return; 406 if (k > args->mod) 407 rlen = rlen - (k - args->mod); 408 else 409 rlen = rlen - args->prod + (args->mod - k); 410 /* casts to (int) catch length underflows */ 411 if ((int)rlen < (int)args->minlen) 412 return; 413 ASSERT(rlen >= args->minlen && rlen <= args->maxlen); 414 ASSERT(rlen % args->prod == args->mod); 415 ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >= 416 rlen + args->minleft); 417 args->len = rlen; 418 } 419 420 /* 421 * Update the two btrees, logically removing from freespace the extent 422 * starting at rbno, rlen blocks. The extent is contained within the 423 * actual (current) free extent fbno for flen blocks. 424 * Flags are passed in indicating whether the cursors are set to the 425 * relevant records. 426 */ 427 STATIC int /* error code */ 428 xfs_alloc_fixup_trees( 429 xfs_btree_cur_t *cnt_cur, /* cursor for by-size btree */ 430 xfs_btree_cur_t *bno_cur, /* cursor for by-block btree */ 431 xfs_agblock_t fbno, /* starting block of free extent */ 432 xfs_extlen_t flen, /* length of free extent */ 433 xfs_agblock_t rbno, /* starting block of returned extent */ 434 xfs_extlen_t rlen, /* length of returned extent */ 435 int flags) /* flags, XFSA_FIXUP_... */ 436 { 437 int error; /* error code */ 438 int i; /* operation results */ 439 xfs_agblock_t nfbno1; /* first new free startblock */ 440 xfs_agblock_t nfbno2; /* second new free startblock */ 441 xfs_extlen_t nflen1=0; /* first new free length */ 442 xfs_extlen_t nflen2=0; /* second new free length */ 443 struct xfs_mount *mp; 444 445 mp = cnt_cur->bc_mp; 446 447 /* 448 * Look up the record in the by-size tree if necessary. 449 */ 450 if (flags & XFSA_FIXUP_CNT_OK) { 451 #ifdef DEBUG 452 if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i))) 453 return error; 454 if (XFS_IS_CORRUPT(mp, 455 i != 1 || 456 nfbno1 != fbno || 457 nflen1 != flen)) 458 return -EFSCORRUPTED; 459 #endif 460 } else { 461 if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i))) 462 return error; 463 if (XFS_IS_CORRUPT(mp, i != 1)) 464 return -EFSCORRUPTED; 465 } 466 /* 467 * Look up the record in the by-block tree if necessary. 468 */ 469 if (flags & XFSA_FIXUP_BNO_OK) { 470 #ifdef DEBUG 471 if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i))) 472 return error; 473 if (XFS_IS_CORRUPT(mp, 474 i != 1 || 475 nfbno1 != fbno || 476 nflen1 != flen)) 477 return -EFSCORRUPTED; 478 #endif 479 } else { 480 if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i))) 481 return error; 482 if (XFS_IS_CORRUPT(mp, i != 1)) 483 return -EFSCORRUPTED; 484 } 485 486 #ifdef DEBUG 487 if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) { 488 struct xfs_btree_block *bnoblock; 489 struct xfs_btree_block *cntblock; 490 491 bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]); 492 cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]); 493 494 if (XFS_IS_CORRUPT(mp, 495 bnoblock->bb_numrecs != 496 cntblock->bb_numrecs)) 497 return -EFSCORRUPTED; 498 } 499 #endif 500 501 /* 502 * Deal with all four cases: the allocated record is contained 503 * within the freespace record, so we can have new freespace 504 * at either (or both) end, or no freespace remaining. 505 */ 506 if (rbno == fbno && rlen == flen) 507 nfbno1 = nfbno2 = NULLAGBLOCK; 508 else if (rbno == fbno) { 509 nfbno1 = rbno + rlen; 510 nflen1 = flen - rlen; 511 nfbno2 = NULLAGBLOCK; 512 } else if (rbno + rlen == fbno + flen) { 513 nfbno1 = fbno; 514 nflen1 = flen - rlen; 515 nfbno2 = NULLAGBLOCK; 516 } else { 517 nfbno1 = fbno; 518 nflen1 = rbno - fbno; 519 nfbno2 = rbno + rlen; 520 nflen2 = (fbno + flen) - nfbno2; 521 } 522 /* 523 * Delete the entry from the by-size btree. 524 */ 525 if ((error = xfs_btree_delete(cnt_cur, &i))) 526 return error; 527 if (XFS_IS_CORRUPT(mp, i != 1)) 528 return -EFSCORRUPTED; 529 /* 530 * Add new by-size btree entry(s). 531 */ 532 if (nfbno1 != NULLAGBLOCK) { 533 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i))) 534 return error; 535 if (XFS_IS_CORRUPT(mp, i != 0)) 536 return -EFSCORRUPTED; 537 if ((error = xfs_btree_insert(cnt_cur, &i))) 538 return error; 539 if (XFS_IS_CORRUPT(mp, i != 1)) 540 return -EFSCORRUPTED; 541 } 542 if (nfbno2 != NULLAGBLOCK) { 543 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i))) 544 return error; 545 if (XFS_IS_CORRUPT(mp, i != 0)) 546 return -EFSCORRUPTED; 547 if ((error = xfs_btree_insert(cnt_cur, &i))) 548 return error; 549 if (XFS_IS_CORRUPT(mp, i != 1)) 550 return -EFSCORRUPTED; 551 } 552 /* 553 * Fix up the by-block btree entry(s). 554 */ 555 if (nfbno1 == NULLAGBLOCK) { 556 /* 557 * No remaining freespace, just delete the by-block tree entry. 558 */ 559 if ((error = xfs_btree_delete(bno_cur, &i))) 560 return error; 561 if (XFS_IS_CORRUPT(mp, i != 1)) 562 return -EFSCORRUPTED; 563 } else { 564 /* 565 * Update the by-block entry to start later|be shorter. 566 */ 567 if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1))) 568 return error; 569 } 570 if (nfbno2 != NULLAGBLOCK) { 571 /* 572 * 2 resulting free entries, need to add one. 573 */ 574 if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i))) 575 return error; 576 if (XFS_IS_CORRUPT(mp, i != 0)) 577 return -EFSCORRUPTED; 578 if ((error = xfs_btree_insert(bno_cur, &i))) 579 return error; 580 if (XFS_IS_CORRUPT(mp, i != 1)) 581 return -EFSCORRUPTED; 582 } 583 return 0; 584 } 585 586 static xfs_failaddr_t 587 xfs_agfl_verify( 588 struct xfs_buf *bp) 589 { 590 struct xfs_mount *mp = bp->b_mount; 591 struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp); 592 int i; 593 594 /* 595 * There is no verification of non-crc AGFLs because mkfs does not 596 * initialise the AGFL to zero or NULL. Hence the only valid part of the 597 * AGFL is what the AGF says is active. We can't get to the AGF, so we 598 * can't verify just those entries are valid. 599 */ 600 if (!xfs_sb_version_hascrc(&mp->m_sb)) 601 return NULL; 602 603 if (!xfs_verify_magic(bp, agfl->agfl_magicnum)) 604 return __this_address; 605 if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid)) 606 return __this_address; 607 /* 608 * during growfs operations, the perag is not fully initialised, 609 * so we can't use it for any useful checking. growfs ensures we can't 610 * use it by using uncached buffers that don't have the perag attached 611 * so we can detect and avoid this problem. 612 */ 613 if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno) 614 return __this_address; 615 616 for (i = 0; i < xfs_agfl_size(mp); i++) { 617 if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK && 618 be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks) 619 return __this_address; 620 } 621 622 if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn))) 623 return __this_address; 624 return NULL; 625 } 626 627 static void 628 xfs_agfl_read_verify( 629 struct xfs_buf *bp) 630 { 631 struct xfs_mount *mp = bp->b_mount; 632 xfs_failaddr_t fa; 633 634 /* 635 * There is no verification of non-crc AGFLs because mkfs does not 636 * initialise the AGFL to zero or NULL. Hence the only valid part of the 637 * AGFL is what the AGF says is active. We can't get to the AGF, so we 638 * can't verify just those entries are valid. 639 */ 640 if (!xfs_sb_version_hascrc(&mp->m_sb)) 641 return; 642 643 if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF)) 644 xfs_verifier_error(bp, -EFSBADCRC, __this_address); 645 else { 646 fa = xfs_agfl_verify(bp); 647 if (fa) 648 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 649 } 650 } 651 652 static void 653 xfs_agfl_write_verify( 654 struct xfs_buf *bp) 655 { 656 struct xfs_mount *mp = bp->b_mount; 657 struct xfs_buf_log_item *bip = bp->b_log_item; 658 xfs_failaddr_t fa; 659 660 /* no verification of non-crc AGFLs */ 661 if (!xfs_sb_version_hascrc(&mp->m_sb)) 662 return; 663 664 fa = xfs_agfl_verify(bp); 665 if (fa) { 666 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 667 return; 668 } 669 670 if (bip) 671 XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn); 672 673 xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF); 674 } 675 676 const struct xfs_buf_ops xfs_agfl_buf_ops = { 677 .name = "xfs_agfl", 678 .magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) }, 679 .verify_read = xfs_agfl_read_verify, 680 .verify_write = xfs_agfl_write_verify, 681 .verify_struct = xfs_agfl_verify, 682 }; 683 684 /* 685 * Read in the allocation group free block array. 686 */ 687 int /* error */ 688 xfs_alloc_read_agfl( 689 xfs_mount_t *mp, /* mount point structure */ 690 xfs_trans_t *tp, /* transaction pointer */ 691 xfs_agnumber_t agno, /* allocation group number */ 692 xfs_buf_t **bpp) /* buffer for the ag free block array */ 693 { 694 xfs_buf_t *bp; /* return value */ 695 int error; 696 697 ASSERT(agno != NULLAGNUMBER); 698 error = xfs_trans_read_buf( 699 mp, tp, mp->m_ddev_targp, 700 XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), 701 XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops); 702 if (error) 703 return error; 704 xfs_buf_set_ref(bp, XFS_AGFL_REF); 705 *bpp = bp; 706 return 0; 707 } 708 709 STATIC int 710 xfs_alloc_update_counters( 711 struct xfs_trans *tp, 712 struct xfs_perag *pag, 713 struct xfs_buf *agbp, 714 long len) 715 { 716 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); 717 718 pag->pagf_freeblks += len; 719 be32_add_cpu(&agf->agf_freeblks, len); 720 721 xfs_trans_agblocks_delta(tp, len); 722 if (unlikely(be32_to_cpu(agf->agf_freeblks) > 723 be32_to_cpu(agf->agf_length))) { 724 xfs_buf_corruption_error(agbp); 725 return -EFSCORRUPTED; 726 } 727 728 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS); 729 return 0; 730 } 731 732 /* 733 * Block allocation algorithm and data structures. 734 */ 735 struct xfs_alloc_cur { 736 struct xfs_btree_cur *cnt; /* btree cursors */ 737 struct xfs_btree_cur *bnolt; 738 struct xfs_btree_cur *bnogt; 739 xfs_extlen_t cur_len;/* current search length */ 740 xfs_agblock_t rec_bno;/* extent startblock */ 741 xfs_extlen_t rec_len;/* extent length */ 742 xfs_agblock_t bno; /* alloc bno */ 743 xfs_extlen_t len; /* alloc len */ 744 xfs_extlen_t diff; /* diff from search bno */ 745 unsigned int busy_gen;/* busy state */ 746 bool busy; 747 }; 748 749 /* 750 * Set up cursors, etc. in the extent allocation cursor. This function can be 751 * called multiple times to reset an initialized structure without having to 752 * reallocate cursors. 753 */ 754 static int 755 xfs_alloc_cur_setup( 756 struct xfs_alloc_arg *args, 757 struct xfs_alloc_cur *acur) 758 { 759 int error; 760 int i; 761 762 ASSERT(args->alignment == 1 || args->type != XFS_ALLOCTYPE_THIS_BNO); 763 764 acur->cur_len = args->maxlen; 765 acur->rec_bno = 0; 766 acur->rec_len = 0; 767 acur->bno = 0; 768 acur->len = 0; 769 acur->diff = -1; 770 acur->busy = false; 771 acur->busy_gen = 0; 772 773 /* 774 * Perform an initial cntbt lookup to check for availability of maxlen 775 * extents. If this fails, we'll return -ENOSPC to signal the caller to 776 * attempt a small allocation. 777 */ 778 if (!acur->cnt) 779 acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp, 780 args->agbp, args->agno, XFS_BTNUM_CNT); 781 error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i); 782 if (error) 783 return error; 784 785 /* 786 * Allocate the bnobt left and right search cursors. 787 */ 788 if (!acur->bnolt) 789 acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp, 790 args->agbp, args->agno, XFS_BTNUM_BNO); 791 if (!acur->bnogt) 792 acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp, 793 args->agbp, args->agno, XFS_BTNUM_BNO); 794 return i == 1 ? 0 : -ENOSPC; 795 } 796 797 static void 798 xfs_alloc_cur_close( 799 struct xfs_alloc_cur *acur, 800 bool error) 801 { 802 int cur_error = XFS_BTREE_NOERROR; 803 804 if (error) 805 cur_error = XFS_BTREE_ERROR; 806 807 if (acur->cnt) 808 xfs_btree_del_cursor(acur->cnt, cur_error); 809 if (acur->bnolt) 810 xfs_btree_del_cursor(acur->bnolt, cur_error); 811 if (acur->bnogt) 812 xfs_btree_del_cursor(acur->bnogt, cur_error); 813 acur->cnt = acur->bnolt = acur->bnogt = NULL; 814 } 815 816 /* 817 * Check an extent for allocation and track the best available candidate in the 818 * allocation structure. The cursor is deactivated if it has entered an out of 819 * range state based on allocation arguments. Optionally return the extent 820 * extent geometry and allocation status if requested by the caller. 821 */ 822 static int 823 xfs_alloc_cur_check( 824 struct xfs_alloc_arg *args, 825 struct xfs_alloc_cur *acur, 826 struct xfs_btree_cur *cur, 827 int *new) 828 { 829 int error, i; 830 xfs_agblock_t bno, bnoa, bnew; 831 xfs_extlen_t len, lena, diff = -1; 832 bool busy; 833 unsigned busy_gen = 0; 834 bool deactivate = false; 835 bool isbnobt = cur->bc_btnum == XFS_BTNUM_BNO; 836 837 *new = 0; 838 839 error = xfs_alloc_get_rec(cur, &bno, &len, &i); 840 if (error) 841 return error; 842 if (XFS_IS_CORRUPT(args->mp, i != 1)) 843 return -EFSCORRUPTED; 844 845 /* 846 * Check minlen and deactivate a cntbt cursor if out of acceptable size 847 * range (i.e., walking backwards looking for a minlen extent). 848 */ 849 if (len < args->minlen) { 850 deactivate = !isbnobt; 851 goto out; 852 } 853 854 busy = xfs_alloc_compute_aligned(args, bno, len, &bnoa, &lena, 855 &busy_gen); 856 acur->busy |= busy; 857 if (busy) 858 acur->busy_gen = busy_gen; 859 /* deactivate a bnobt cursor outside of locality range */ 860 if (bnoa < args->min_agbno || bnoa > args->max_agbno) { 861 deactivate = isbnobt; 862 goto out; 863 } 864 if (lena < args->minlen) 865 goto out; 866 867 args->len = XFS_EXTLEN_MIN(lena, args->maxlen); 868 xfs_alloc_fix_len(args); 869 ASSERT(args->len >= args->minlen); 870 if (args->len < acur->len) 871 goto out; 872 873 /* 874 * We have an aligned record that satisfies minlen and beats or matches 875 * the candidate extent size. Compare locality for near allocation mode. 876 */ 877 ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO); 878 diff = xfs_alloc_compute_diff(args->agbno, args->len, 879 args->alignment, args->datatype, 880 bnoa, lena, &bnew); 881 if (bnew == NULLAGBLOCK) 882 goto out; 883 884 /* 885 * Deactivate a bnobt cursor with worse locality than the current best. 886 */ 887 if (diff > acur->diff) { 888 deactivate = isbnobt; 889 goto out; 890 } 891 892 ASSERT(args->len > acur->len || 893 (args->len == acur->len && diff <= acur->diff)); 894 acur->rec_bno = bno; 895 acur->rec_len = len; 896 acur->bno = bnew; 897 acur->len = args->len; 898 acur->diff = diff; 899 *new = 1; 900 901 /* 902 * We're done if we found a perfect allocation. This only deactivates 903 * the current cursor, but this is just an optimization to terminate a 904 * cntbt search that otherwise runs to the edge of the tree. 905 */ 906 if (acur->diff == 0 && acur->len == args->maxlen) 907 deactivate = true; 908 out: 909 if (deactivate) 910 cur->bc_private.a.priv.abt.active = false; 911 trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff, 912 *new); 913 return 0; 914 } 915 916 /* 917 * Complete an allocation of a candidate extent. Remove the extent from both 918 * trees and update the args structure. 919 */ 920 STATIC int 921 xfs_alloc_cur_finish( 922 struct xfs_alloc_arg *args, 923 struct xfs_alloc_cur *acur) 924 { 925 int error; 926 927 ASSERT(acur->cnt && acur->bnolt); 928 ASSERT(acur->bno >= acur->rec_bno); 929 ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len); 930 ASSERT(acur->rec_bno + acur->rec_len <= 931 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); 932 933 error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno, 934 acur->rec_len, acur->bno, acur->len, 0); 935 if (error) 936 return error; 937 938 args->agbno = acur->bno; 939 args->len = acur->len; 940 args->wasfromfl = 0; 941 942 trace_xfs_alloc_cur(args); 943 return 0; 944 } 945 946 /* 947 * Locality allocation lookup algorithm. This expects a cntbt cursor and uses 948 * bno optimized lookup to search for extents with ideal size and locality. 949 */ 950 STATIC int 951 xfs_alloc_cntbt_iter( 952 struct xfs_alloc_arg *args, 953 struct xfs_alloc_cur *acur) 954 { 955 struct xfs_btree_cur *cur = acur->cnt; 956 xfs_agblock_t bno; 957 xfs_extlen_t len, cur_len; 958 int error; 959 int i; 960 961 if (!xfs_alloc_cur_active(cur)) 962 return 0; 963 964 /* locality optimized lookup */ 965 cur_len = acur->cur_len; 966 error = xfs_alloc_lookup_ge(cur, args->agbno, cur_len, &i); 967 if (error) 968 return error; 969 if (i == 0) 970 return 0; 971 error = xfs_alloc_get_rec(cur, &bno, &len, &i); 972 if (error) 973 return error; 974 975 /* check the current record and update search length from it */ 976 error = xfs_alloc_cur_check(args, acur, cur, &i); 977 if (error) 978 return error; 979 ASSERT(len >= acur->cur_len); 980 acur->cur_len = len; 981 982 /* 983 * We looked up the first record >= [agbno, len] above. The agbno is a 984 * secondary key and so the current record may lie just before or after 985 * agbno. If it is past agbno, check the previous record too so long as 986 * the length matches as it may be closer. Don't check a smaller record 987 * because that could deactivate our cursor. 988 */ 989 if (bno > args->agbno) { 990 error = xfs_btree_decrement(cur, 0, &i); 991 if (!error && i) { 992 error = xfs_alloc_get_rec(cur, &bno, &len, &i); 993 if (!error && i && len == acur->cur_len) 994 error = xfs_alloc_cur_check(args, acur, cur, 995 &i); 996 } 997 if (error) 998 return error; 999 } 1000 1001 /* 1002 * Increment the search key until we find at least one allocation 1003 * candidate or if the extent we found was larger. Otherwise, double the 1004 * search key to optimize the search. Efficiency is more important here 1005 * than absolute best locality. 1006 */ 1007 cur_len <<= 1; 1008 if (!acur->len || acur->cur_len >= cur_len) 1009 acur->cur_len++; 1010 else 1011 acur->cur_len = cur_len; 1012 1013 return error; 1014 } 1015 1016 /* 1017 * Deal with the case where only small freespaces remain. Either return the 1018 * contents of the last freespace record, or allocate space from the freelist if 1019 * there is nothing in the tree. 1020 */ 1021 STATIC int /* error */ 1022 xfs_alloc_ag_vextent_small( 1023 struct xfs_alloc_arg *args, /* allocation argument structure */ 1024 struct xfs_btree_cur *ccur, /* optional by-size cursor */ 1025 xfs_agblock_t *fbnop, /* result block number */ 1026 xfs_extlen_t *flenp, /* result length */ 1027 int *stat) /* status: 0-freelist, 1-normal/none */ 1028 { 1029 int error = 0; 1030 xfs_agblock_t fbno = NULLAGBLOCK; 1031 xfs_extlen_t flen = 0; 1032 int i = 0; 1033 1034 /* 1035 * If a cntbt cursor is provided, try to allocate the largest record in 1036 * the tree. Try the AGFL if the cntbt is empty, otherwise fail the 1037 * allocation. Make sure to respect minleft even when pulling from the 1038 * freelist. 1039 */ 1040 if (ccur) 1041 error = xfs_btree_decrement(ccur, 0, &i); 1042 if (error) 1043 goto error; 1044 if (i) { 1045 error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i); 1046 if (error) 1047 goto error; 1048 if (XFS_IS_CORRUPT(args->mp, i != 1)) { 1049 error = -EFSCORRUPTED; 1050 goto error; 1051 } 1052 goto out; 1053 } 1054 1055 if (args->minlen != 1 || args->alignment != 1 || 1056 args->resv == XFS_AG_RESV_AGFL || 1057 (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) <= 1058 args->minleft)) 1059 goto out; 1060 1061 error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0); 1062 if (error) 1063 goto error; 1064 if (fbno == NULLAGBLOCK) 1065 goto out; 1066 1067 xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1, 1068 (args->datatype & XFS_ALLOC_NOBUSY)); 1069 1070 if (args->datatype & XFS_ALLOC_USERDATA) { 1071 struct xfs_buf *bp; 1072 1073 error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp, 1074 XFS_AGB_TO_DADDR(args->mp, args->agno, fbno), 1075 args->mp->m_bsize, 0, &bp); 1076 if (error) 1077 goto error; 1078 xfs_trans_binval(args->tp, bp); 1079 } 1080 *fbnop = args->agbno = fbno; 1081 *flenp = args->len = 1; 1082 if (XFS_IS_CORRUPT(args->mp, 1083 fbno >= be32_to_cpu( 1084 XFS_BUF_TO_AGF(args->agbp)->agf_length))) { 1085 error = -EFSCORRUPTED; 1086 goto error; 1087 } 1088 args->wasfromfl = 1; 1089 trace_xfs_alloc_small_freelist(args); 1090 1091 /* 1092 * If we're feeding an AGFL block to something that doesn't live in the 1093 * free space, we need to clear out the OWN_AG rmap. 1094 */ 1095 error = xfs_rmap_free(args->tp, args->agbp, args->agno, fbno, 1, 1096 &XFS_RMAP_OINFO_AG); 1097 if (error) 1098 goto error; 1099 1100 *stat = 0; 1101 return 0; 1102 1103 out: 1104 /* 1105 * Can't do the allocation, give up. 1106 */ 1107 if (flen < args->minlen) { 1108 args->agbno = NULLAGBLOCK; 1109 trace_xfs_alloc_small_notenough(args); 1110 flen = 0; 1111 } 1112 *fbnop = fbno; 1113 *flenp = flen; 1114 *stat = 1; 1115 trace_xfs_alloc_small_done(args); 1116 return 0; 1117 1118 error: 1119 trace_xfs_alloc_small_error(args); 1120 return error; 1121 } 1122 1123 /* 1124 * Allocate a variable extent in the allocation group agno. 1125 * Type and bno are used to determine where in the allocation group the 1126 * extent will start. 1127 * Extent's length (returned in *len) will be between minlen and maxlen, 1128 * and of the form k * prod + mod unless there's nothing that large. 1129 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it. 1130 */ 1131 STATIC int /* error */ 1132 xfs_alloc_ag_vextent( 1133 xfs_alloc_arg_t *args) /* argument structure for allocation */ 1134 { 1135 int error=0; 1136 1137 ASSERT(args->minlen > 0); 1138 ASSERT(args->maxlen > 0); 1139 ASSERT(args->minlen <= args->maxlen); 1140 ASSERT(args->mod < args->prod); 1141 ASSERT(args->alignment > 0); 1142 1143 /* 1144 * Branch to correct routine based on the type. 1145 */ 1146 args->wasfromfl = 0; 1147 switch (args->type) { 1148 case XFS_ALLOCTYPE_THIS_AG: 1149 error = xfs_alloc_ag_vextent_size(args); 1150 break; 1151 case XFS_ALLOCTYPE_NEAR_BNO: 1152 error = xfs_alloc_ag_vextent_near(args); 1153 break; 1154 case XFS_ALLOCTYPE_THIS_BNO: 1155 error = xfs_alloc_ag_vextent_exact(args); 1156 break; 1157 default: 1158 ASSERT(0); 1159 /* NOTREACHED */ 1160 } 1161 1162 if (error || args->agbno == NULLAGBLOCK) 1163 return error; 1164 1165 ASSERT(args->len >= args->minlen); 1166 ASSERT(args->len <= args->maxlen); 1167 ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL); 1168 ASSERT(args->agbno % args->alignment == 0); 1169 1170 /* if not file data, insert new block into the reverse map btree */ 1171 if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) { 1172 error = xfs_rmap_alloc(args->tp, args->agbp, args->agno, 1173 args->agbno, args->len, &args->oinfo); 1174 if (error) 1175 return error; 1176 } 1177 1178 if (!args->wasfromfl) { 1179 error = xfs_alloc_update_counters(args->tp, args->pag, 1180 args->agbp, 1181 -((long)(args->len))); 1182 if (error) 1183 return error; 1184 1185 ASSERT(!xfs_extent_busy_search(args->mp, args->agno, 1186 args->agbno, args->len)); 1187 } 1188 1189 xfs_ag_resv_alloc_extent(args->pag, args->resv, args); 1190 1191 XFS_STATS_INC(args->mp, xs_allocx); 1192 XFS_STATS_ADD(args->mp, xs_allocb, args->len); 1193 return error; 1194 } 1195 1196 /* 1197 * Allocate a variable extent at exactly agno/bno. 1198 * Extent's length (returned in *len) will be between minlen and maxlen, 1199 * and of the form k * prod + mod unless there's nothing that large. 1200 * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it. 1201 */ 1202 STATIC int /* error */ 1203 xfs_alloc_ag_vextent_exact( 1204 xfs_alloc_arg_t *args) /* allocation argument structure */ 1205 { 1206 xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */ 1207 xfs_btree_cur_t *cnt_cur;/* by count btree cursor */ 1208 int error; 1209 xfs_agblock_t fbno; /* start block of found extent */ 1210 xfs_extlen_t flen; /* length of found extent */ 1211 xfs_agblock_t tbno; /* start block of busy extent */ 1212 xfs_extlen_t tlen; /* length of busy extent */ 1213 xfs_agblock_t tend; /* end block of busy extent */ 1214 int i; /* success/failure of operation */ 1215 unsigned busy_gen; 1216 1217 ASSERT(args->alignment == 1); 1218 1219 /* 1220 * Allocate/initialize a cursor for the by-number freespace btree. 1221 */ 1222 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, 1223 args->agno, XFS_BTNUM_BNO); 1224 1225 /* 1226 * Lookup bno and minlen in the btree (minlen is irrelevant, really). 1227 * Look for the closest free block <= bno, it must contain bno 1228 * if any free block does. 1229 */ 1230 error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i); 1231 if (error) 1232 goto error0; 1233 if (!i) 1234 goto not_found; 1235 1236 /* 1237 * Grab the freespace record. 1238 */ 1239 error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i); 1240 if (error) 1241 goto error0; 1242 if (XFS_IS_CORRUPT(args->mp, i != 1)) { 1243 error = -EFSCORRUPTED; 1244 goto error0; 1245 } 1246 ASSERT(fbno <= args->agbno); 1247 1248 /* 1249 * Check for overlapping busy extents. 1250 */ 1251 tbno = fbno; 1252 tlen = flen; 1253 xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen); 1254 1255 /* 1256 * Give up if the start of the extent is busy, or the freespace isn't 1257 * long enough for the minimum request. 1258 */ 1259 if (tbno > args->agbno) 1260 goto not_found; 1261 if (tlen < args->minlen) 1262 goto not_found; 1263 tend = tbno + tlen; 1264 if (tend < args->agbno + args->minlen) 1265 goto not_found; 1266 1267 /* 1268 * End of extent will be smaller of the freespace end and the 1269 * maximal requested end. 1270 * 1271 * Fix the length according to mod and prod if given. 1272 */ 1273 args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen) 1274 - args->agbno; 1275 xfs_alloc_fix_len(args); 1276 ASSERT(args->agbno + args->len <= tend); 1277 1278 /* 1279 * We are allocating agbno for args->len 1280 * Allocate/initialize a cursor for the by-size btree. 1281 */ 1282 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, 1283 args->agno, XFS_BTNUM_CNT); 1284 ASSERT(args->agbno + args->len <= 1285 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); 1286 error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno, 1287 args->len, XFSA_FIXUP_BNO_OK); 1288 if (error) { 1289 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); 1290 goto error0; 1291 } 1292 1293 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 1294 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1295 1296 args->wasfromfl = 0; 1297 trace_xfs_alloc_exact_done(args); 1298 return 0; 1299 1300 not_found: 1301 /* Didn't find it, return null. */ 1302 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 1303 args->agbno = NULLAGBLOCK; 1304 trace_xfs_alloc_exact_notfound(args); 1305 return 0; 1306 1307 error0: 1308 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); 1309 trace_xfs_alloc_exact_error(args); 1310 return error; 1311 } 1312 1313 /* 1314 * Search a given number of btree records in a given direction. Check each 1315 * record against the good extent we've already found. 1316 */ 1317 STATIC int 1318 xfs_alloc_walk_iter( 1319 struct xfs_alloc_arg *args, 1320 struct xfs_alloc_cur *acur, 1321 struct xfs_btree_cur *cur, 1322 bool increment, 1323 bool find_one, /* quit on first candidate */ 1324 int count, /* rec count (-1 for infinite) */ 1325 int *stat) 1326 { 1327 int error; 1328 int i; 1329 1330 *stat = 0; 1331 1332 /* 1333 * Search so long as the cursor is active or we find a better extent. 1334 * The cursor is deactivated if it extends beyond the range of the 1335 * current allocation candidate. 1336 */ 1337 while (xfs_alloc_cur_active(cur) && count) { 1338 error = xfs_alloc_cur_check(args, acur, cur, &i); 1339 if (error) 1340 return error; 1341 if (i == 1) { 1342 *stat = 1; 1343 if (find_one) 1344 break; 1345 } 1346 if (!xfs_alloc_cur_active(cur)) 1347 break; 1348 1349 if (increment) 1350 error = xfs_btree_increment(cur, 0, &i); 1351 else 1352 error = xfs_btree_decrement(cur, 0, &i); 1353 if (error) 1354 return error; 1355 if (i == 0) 1356 cur->bc_private.a.priv.abt.active = false; 1357 1358 if (count > 0) 1359 count--; 1360 } 1361 1362 return 0; 1363 } 1364 1365 /* 1366 * Search the by-bno and by-size btrees in parallel in search of an extent with 1367 * ideal locality based on the NEAR mode ->agbno locality hint. 1368 */ 1369 STATIC int 1370 xfs_alloc_ag_vextent_locality( 1371 struct xfs_alloc_arg *args, 1372 struct xfs_alloc_cur *acur, 1373 int *stat) 1374 { 1375 struct xfs_btree_cur *fbcur = NULL; 1376 int error; 1377 int i; 1378 bool fbinc; 1379 1380 ASSERT(acur->len == 0); 1381 ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO); 1382 1383 *stat = 0; 1384 1385 error = xfs_alloc_lookup_ge(acur->cnt, args->agbno, acur->cur_len, &i); 1386 if (error) 1387 return error; 1388 error = xfs_alloc_lookup_le(acur->bnolt, args->agbno, 0, &i); 1389 if (error) 1390 return error; 1391 error = xfs_alloc_lookup_ge(acur->bnogt, args->agbno, 0, &i); 1392 if (error) 1393 return error; 1394 1395 /* 1396 * Search the bnobt and cntbt in parallel. Search the bnobt left and 1397 * right and lookup the closest extent to the locality hint for each 1398 * extent size key in the cntbt. The entire search terminates 1399 * immediately on a bnobt hit because that means we've found best case 1400 * locality. Otherwise the search continues until the cntbt cursor runs 1401 * off the end of the tree. If no allocation candidate is found at this 1402 * point, give up on locality, walk backwards from the end of the cntbt 1403 * and take the first available extent. 1404 * 1405 * The parallel tree searches balance each other out to provide fairly 1406 * consistent performance for various situations. The bnobt search can 1407 * have pathological behavior in the worst case scenario of larger 1408 * allocation requests and fragmented free space. On the other hand, the 1409 * bnobt is able to satisfy most smaller allocation requests much more 1410 * quickly than the cntbt. The cntbt search can sift through fragmented 1411 * free space and sets of free extents for larger allocation requests 1412 * more quickly than the bnobt. Since the locality hint is just a hint 1413 * and we don't want to scan the entire bnobt for perfect locality, the 1414 * cntbt search essentially bounds the bnobt search such that we can 1415 * find good enough locality at reasonable performance in most cases. 1416 */ 1417 while (xfs_alloc_cur_active(acur->bnolt) || 1418 xfs_alloc_cur_active(acur->bnogt) || 1419 xfs_alloc_cur_active(acur->cnt)) { 1420 1421 trace_xfs_alloc_cur_lookup(args); 1422 1423 /* 1424 * Search the bnobt left and right. In the case of a hit, finish 1425 * the search in the opposite direction and we're done. 1426 */ 1427 error = xfs_alloc_walk_iter(args, acur, acur->bnolt, false, 1428 true, 1, &i); 1429 if (error) 1430 return error; 1431 if (i == 1) { 1432 trace_xfs_alloc_cur_left(args); 1433 fbcur = acur->bnogt; 1434 fbinc = true; 1435 break; 1436 } 1437 error = xfs_alloc_walk_iter(args, acur, acur->bnogt, true, true, 1438 1, &i); 1439 if (error) 1440 return error; 1441 if (i == 1) { 1442 trace_xfs_alloc_cur_right(args); 1443 fbcur = acur->bnolt; 1444 fbinc = false; 1445 break; 1446 } 1447 1448 /* 1449 * Check the extent with best locality based on the current 1450 * extent size search key and keep track of the best candidate. 1451 */ 1452 error = xfs_alloc_cntbt_iter(args, acur); 1453 if (error) 1454 return error; 1455 if (!xfs_alloc_cur_active(acur->cnt)) { 1456 trace_xfs_alloc_cur_lookup_done(args); 1457 break; 1458 } 1459 } 1460 1461 /* 1462 * If we failed to find anything due to busy extents, return empty 1463 * handed so the caller can flush and retry. If no busy extents were 1464 * found, walk backwards from the end of the cntbt as a last resort. 1465 */ 1466 if (!xfs_alloc_cur_active(acur->cnt) && !acur->len && !acur->busy) { 1467 error = xfs_btree_decrement(acur->cnt, 0, &i); 1468 if (error) 1469 return error; 1470 if (i) { 1471 acur->cnt->bc_private.a.priv.abt.active = true; 1472 fbcur = acur->cnt; 1473 fbinc = false; 1474 } 1475 } 1476 1477 /* 1478 * Search in the opposite direction for a better entry in the case of 1479 * a bnobt hit or walk backwards from the end of the cntbt. 1480 */ 1481 if (fbcur) { 1482 error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc, true, -1, 1483 &i); 1484 if (error) 1485 return error; 1486 } 1487 1488 if (acur->len) 1489 *stat = 1; 1490 1491 return 0; 1492 } 1493 1494 /* Check the last block of the cnt btree for allocations. */ 1495 static int 1496 xfs_alloc_ag_vextent_lastblock( 1497 struct xfs_alloc_arg *args, 1498 struct xfs_alloc_cur *acur, 1499 xfs_agblock_t *bno, 1500 xfs_extlen_t *len, 1501 bool *allocated) 1502 { 1503 int error; 1504 int i; 1505 1506 #ifdef DEBUG 1507 /* Randomly don't execute the first algorithm. */ 1508 if (prandom_u32() & 1) 1509 return 0; 1510 #endif 1511 1512 /* 1513 * Start from the entry that lookup found, sequence through all larger 1514 * free blocks. If we're actually pointing at a record smaller than 1515 * maxlen, go to the start of this block, and skip all those smaller 1516 * than minlen. 1517 */ 1518 if (len || args->alignment > 1) { 1519 acur->cnt->bc_ptrs[0] = 1; 1520 do { 1521 error = xfs_alloc_get_rec(acur->cnt, bno, len, &i); 1522 if (error) 1523 return error; 1524 if (XFS_IS_CORRUPT(args->mp, i != 1)) 1525 return -EFSCORRUPTED; 1526 if (*len >= args->minlen) 1527 break; 1528 error = xfs_btree_increment(acur->cnt, 0, &i); 1529 if (error) 1530 return error; 1531 } while (i); 1532 ASSERT(*len >= args->minlen); 1533 if (!i) 1534 return 0; 1535 } 1536 1537 error = xfs_alloc_walk_iter(args, acur, acur->cnt, true, false, -1, &i); 1538 if (error) 1539 return error; 1540 1541 /* 1542 * It didn't work. We COULD be in a case where there's a good record 1543 * somewhere, so try again. 1544 */ 1545 if (acur->len == 0) 1546 return 0; 1547 1548 trace_xfs_alloc_near_first(args); 1549 *allocated = true; 1550 return 0; 1551 } 1552 1553 /* 1554 * Allocate a variable extent near bno in the allocation group agno. 1555 * Extent's length (returned in len) will be between minlen and maxlen, 1556 * and of the form k * prod + mod unless there's nothing that large. 1557 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it. 1558 */ 1559 STATIC int 1560 xfs_alloc_ag_vextent_near( 1561 struct xfs_alloc_arg *args) 1562 { 1563 struct xfs_alloc_cur acur = {}; 1564 int error; /* error code */ 1565 int i; /* result code, temporary */ 1566 xfs_agblock_t bno; 1567 xfs_extlen_t len; 1568 1569 /* handle uninitialized agbno range so caller doesn't have to */ 1570 if (!args->min_agbno && !args->max_agbno) 1571 args->max_agbno = args->mp->m_sb.sb_agblocks - 1; 1572 ASSERT(args->min_agbno <= args->max_agbno); 1573 1574 /* clamp agbno to the range if it's outside */ 1575 if (args->agbno < args->min_agbno) 1576 args->agbno = args->min_agbno; 1577 if (args->agbno > args->max_agbno) 1578 args->agbno = args->max_agbno; 1579 1580 restart: 1581 len = 0; 1582 1583 /* 1584 * Set up cursors and see if there are any free extents as big as 1585 * maxlen. If not, pick the last entry in the tree unless the tree is 1586 * empty. 1587 */ 1588 error = xfs_alloc_cur_setup(args, &acur); 1589 if (error == -ENOSPC) { 1590 error = xfs_alloc_ag_vextent_small(args, acur.cnt, &bno, 1591 &len, &i); 1592 if (error) 1593 goto out; 1594 if (i == 0 || len == 0) { 1595 trace_xfs_alloc_near_noentry(args); 1596 goto out; 1597 } 1598 ASSERT(i == 1); 1599 } else if (error) { 1600 goto out; 1601 } 1602 1603 /* 1604 * First algorithm. 1605 * If the requested extent is large wrt the freespaces available 1606 * in this a.g., then the cursor will be pointing to a btree entry 1607 * near the right edge of the tree. If it's in the last btree leaf 1608 * block, then we just examine all the entries in that block 1609 * that are big enough, and pick the best one. 1610 */ 1611 if (xfs_btree_islastblock(acur.cnt, 0)) { 1612 bool allocated = false; 1613 1614 error = xfs_alloc_ag_vextent_lastblock(args, &acur, &bno, &len, 1615 &allocated); 1616 if (error) 1617 goto out; 1618 if (allocated) 1619 goto alloc_finish; 1620 } 1621 1622 /* 1623 * Second algorithm. Combined cntbt and bnobt search to find ideal 1624 * locality. 1625 */ 1626 error = xfs_alloc_ag_vextent_locality(args, &acur, &i); 1627 if (error) 1628 goto out; 1629 1630 /* 1631 * If we couldn't get anything, give up. 1632 */ 1633 if (!acur.len) { 1634 if (acur.busy) { 1635 trace_xfs_alloc_near_busy(args); 1636 xfs_extent_busy_flush(args->mp, args->pag, 1637 acur.busy_gen); 1638 goto restart; 1639 } 1640 trace_xfs_alloc_size_neither(args); 1641 args->agbno = NULLAGBLOCK; 1642 goto out; 1643 } 1644 1645 alloc_finish: 1646 /* fix up btrees on a successful allocation */ 1647 error = xfs_alloc_cur_finish(args, &acur); 1648 1649 out: 1650 xfs_alloc_cur_close(&acur, error); 1651 return error; 1652 } 1653 1654 /* 1655 * Allocate a variable extent anywhere in the allocation group agno. 1656 * Extent's length (returned in len) will be between minlen and maxlen, 1657 * and of the form k * prod + mod unless there's nothing that large. 1658 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it. 1659 */ 1660 STATIC int /* error */ 1661 xfs_alloc_ag_vextent_size( 1662 xfs_alloc_arg_t *args) /* allocation argument structure */ 1663 { 1664 xfs_btree_cur_t *bno_cur; /* cursor for bno btree */ 1665 xfs_btree_cur_t *cnt_cur; /* cursor for cnt btree */ 1666 int error; /* error result */ 1667 xfs_agblock_t fbno; /* start of found freespace */ 1668 xfs_extlen_t flen; /* length of found freespace */ 1669 int i; /* temp status variable */ 1670 xfs_agblock_t rbno; /* returned block number */ 1671 xfs_extlen_t rlen; /* length of returned extent */ 1672 bool busy; 1673 unsigned busy_gen; 1674 1675 restart: 1676 /* 1677 * Allocate and initialize a cursor for the by-size btree. 1678 */ 1679 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, 1680 args->agno, XFS_BTNUM_CNT); 1681 bno_cur = NULL; 1682 busy = false; 1683 1684 /* 1685 * Look for an entry >= maxlen+alignment-1 blocks. 1686 */ 1687 if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, 1688 args->maxlen + args->alignment - 1, &i))) 1689 goto error0; 1690 1691 /* 1692 * If none then we have to settle for a smaller extent. In the case that 1693 * there are no large extents, this will return the last entry in the 1694 * tree unless the tree is empty. In the case that there are only busy 1695 * large extents, this will return the largest small extent unless there 1696 * are no smaller extents available. 1697 */ 1698 if (!i) { 1699 error = xfs_alloc_ag_vextent_small(args, cnt_cur, 1700 &fbno, &flen, &i); 1701 if (error) 1702 goto error0; 1703 if (i == 0 || flen == 0) { 1704 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1705 trace_xfs_alloc_size_noentry(args); 1706 return 0; 1707 } 1708 ASSERT(i == 1); 1709 busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno, 1710 &rlen, &busy_gen); 1711 } else { 1712 /* 1713 * Search for a non-busy extent that is large enough. 1714 */ 1715 for (;;) { 1716 error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i); 1717 if (error) 1718 goto error0; 1719 if (XFS_IS_CORRUPT(args->mp, i != 1)) { 1720 error = -EFSCORRUPTED; 1721 goto error0; 1722 } 1723 1724 busy = xfs_alloc_compute_aligned(args, fbno, flen, 1725 &rbno, &rlen, &busy_gen); 1726 1727 if (rlen >= args->maxlen) 1728 break; 1729 1730 error = xfs_btree_increment(cnt_cur, 0, &i); 1731 if (error) 1732 goto error0; 1733 if (i == 0) { 1734 /* 1735 * Our only valid extents must have been busy. 1736 * Make it unbusy by forcing the log out and 1737 * retrying. 1738 */ 1739 xfs_btree_del_cursor(cnt_cur, 1740 XFS_BTREE_NOERROR); 1741 trace_xfs_alloc_size_busy(args); 1742 xfs_extent_busy_flush(args->mp, 1743 args->pag, busy_gen); 1744 goto restart; 1745 } 1746 } 1747 } 1748 1749 /* 1750 * In the first case above, we got the last entry in the 1751 * by-size btree. Now we check to see if the space hits maxlen 1752 * once aligned; if not, we search left for something better. 1753 * This can't happen in the second case above. 1754 */ 1755 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); 1756 if (XFS_IS_CORRUPT(args->mp, 1757 rlen != 0 && 1758 (rlen > flen || 1759 rbno + rlen > fbno + flen))) { 1760 error = -EFSCORRUPTED; 1761 goto error0; 1762 } 1763 if (rlen < args->maxlen) { 1764 xfs_agblock_t bestfbno; 1765 xfs_extlen_t bestflen; 1766 xfs_agblock_t bestrbno; 1767 xfs_extlen_t bestrlen; 1768 1769 bestrlen = rlen; 1770 bestrbno = rbno; 1771 bestflen = flen; 1772 bestfbno = fbno; 1773 for (;;) { 1774 if ((error = xfs_btree_decrement(cnt_cur, 0, &i))) 1775 goto error0; 1776 if (i == 0) 1777 break; 1778 if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, 1779 &i))) 1780 goto error0; 1781 if (XFS_IS_CORRUPT(args->mp, i != 1)) { 1782 error = -EFSCORRUPTED; 1783 goto error0; 1784 } 1785 if (flen < bestrlen) 1786 break; 1787 busy = xfs_alloc_compute_aligned(args, fbno, flen, 1788 &rbno, &rlen, &busy_gen); 1789 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); 1790 if (XFS_IS_CORRUPT(args->mp, 1791 rlen != 0 && 1792 (rlen > flen || 1793 rbno + rlen > fbno + flen))) { 1794 error = -EFSCORRUPTED; 1795 goto error0; 1796 } 1797 if (rlen > bestrlen) { 1798 bestrlen = rlen; 1799 bestrbno = rbno; 1800 bestflen = flen; 1801 bestfbno = fbno; 1802 if (rlen == args->maxlen) 1803 break; 1804 } 1805 } 1806 if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen, 1807 &i))) 1808 goto error0; 1809 if (XFS_IS_CORRUPT(args->mp, i != 1)) { 1810 error = -EFSCORRUPTED; 1811 goto error0; 1812 } 1813 rlen = bestrlen; 1814 rbno = bestrbno; 1815 flen = bestflen; 1816 fbno = bestfbno; 1817 } 1818 args->wasfromfl = 0; 1819 /* 1820 * Fix up the length. 1821 */ 1822 args->len = rlen; 1823 if (rlen < args->minlen) { 1824 if (busy) { 1825 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1826 trace_xfs_alloc_size_busy(args); 1827 xfs_extent_busy_flush(args->mp, args->pag, busy_gen); 1828 goto restart; 1829 } 1830 goto out_nominleft; 1831 } 1832 xfs_alloc_fix_len(args); 1833 1834 rlen = args->len; 1835 if (XFS_IS_CORRUPT(args->mp, rlen > flen)) { 1836 error = -EFSCORRUPTED; 1837 goto error0; 1838 } 1839 /* 1840 * Allocate and initialize a cursor for the by-block tree. 1841 */ 1842 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, 1843 args->agno, XFS_BTNUM_BNO); 1844 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, 1845 rbno, rlen, XFSA_FIXUP_CNT_OK))) 1846 goto error0; 1847 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1848 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 1849 cnt_cur = bno_cur = NULL; 1850 args->len = rlen; 1851 args->agbno = rbno; 1852 if (XFS_IS_CORRUPT(args->mp, 1853 args->agbno + args->len > 1854 be32_to_cpu( 1855 XFS_BUF_TO_AGF(args->agbp)->agf_length))) { 1856 error = -EFSCORRUPTED; 1857 goto error0; 1858 } 1859 trace_xfs_alloc_size_done(args); 1860 return 0; 1861 1862 error0: 1863 trace_xfs_alloc_size_error(args); 1864 if (cnt_cur) 1865 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); 1866 if (bno_cur) 1867 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); 1868 return error; 1869 1870 out_nominleft: 1871 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1872 trace_xfs_alloc_size_nominleft(args); 1873 args->agbno = NULLAGBLOCK; 1874 return 0; 1875 } 1876 1877 /* 1878 * Free the extent starting at agno/bno for length. 1879 */ 1880 STATIC int 1881 xfs_free_ag_extent( 1882 struct xfs_trans *tp, 1883 struct xfs_buf *agbp, 1884 xfs_agnumber_t agno, 1885 xfs_agblock_t bno, 1886 xfs_extlen_t len, 1887 const struct xfs_owner_info *oinfo, 1888 enum xfs_ag_resv_type type) 1889 { 1890 struct xfs_mount *mp; 1891 struct xfs_perag *pag; 1892 struct xfs_btree_cur *bno_cur; 1893 struct xfs_btree_cur *cnt_cur; 1894 xfs_agblock_t gtbno; /* start of right neighbor */ 1895 xfs_extlen_t gtlen; /* length of right neighbor */ 1896 xfs_agblock_t ltbno; /* start of left neighbor */ 1897 xfs_extlen_t ltlen; /* length of left neighbor */ 1898 xfs_agblock_t nbno; /* new starting block of freesp */ 1899 xfs_extlen_t nlen; /* new length of freespace */ 1900 int haveleft; /* have a left neighbor */ 1901 int haveright; /* have a right neighbor */ 1902 int i; 1903 int error; 1904 1905 bno_cur = cnt_cur = NULL; 1906 mp = tp->t_mountp; 1907 1908 if (!xfs_rmap_should_skip_owner_update(oinfo)) { 1909 error = xfs_rmap_free(tp, agbp, agno, bno, len, oinfo); 1910 if (error) 1911 goto error0; 1912 } 1913 1914 /* 1915 * Allocate and initialize a cursor for the by-block btree. 1916 */ 1917 bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO); 1918 /* 1919 * Look for a neighboring block on the left (lower block numbers) 1920 * that is contiguous with this space. 1921 */ 1922 if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft))) 1923 goto error0; 1924 if (haveleft) { 1925 /* 1926 * There is a block to our left. 1927 */ 1928 if ((error = xfs_alloc_get_rec(bno_cur, <bno, <len, &i))) 1929 goto error0; 1930 if (XFS_IS_CORRUPT(mp, i != 1)) { 1931 error = -EFSCORRUPTED; 1932 goto error0; 1933 } 1934 /* 1935 * It's not contiguous, though. 1936 */ 1937 if (ltbno + ltlen < bno) 1938 haveleft = 0; 1939 else { 1940 /* 1941 * If this failure happens the request to free this 1942 * space was invalid, it's (partly) already free. 1943 * Very bad. 1944 */ 1945 if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) { 1946 error = -EFSCORRUPTED; 1947 goto error0; 1948 } 1949 } 1950 } 1951 /* 1952 * Look for a neighboring block on the right (higher block numbers) 1953 * that is contiguous with this space. 1954 */ 1955 if ((error = xfs_btree_increment(bno_cur, 0, &haveright))) 1956 goto error0; 1957 if (haveright) { 1958 /* 1959 * There is a block to our right. 1960 */ 1961 if ((error = xfs_alloc_get_rec(bno_cur, >bno, >len, &i))) 1962 goto error0; 1963 if (XFS_IS_CORRUPT(mp, i != 1)) { 1964 error = -EFSCORRUPTED; 1965 goto error0; 1966 } 1967 /* 1968 * It's not contiguous, though. 1969 */ 1970 if (bno + len < gtbno) 1971 haveright = 0; 1972 else { 1973 /* 1974 * If this failure happens the request to free this 1975 * space was invalid, it's (partly) already free. 1976 * Very bad. 1977 */ 1978 if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) { 1979 error = -EFSCORRUPTED; 1980 goto error0; 1981 } 1982 } 1983 } 1984 /* 1985 * Now allocate and initialize a cursor for the by-size tree. 1986 */ 1987 cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT); 1988 /* 1989 * Have both left and right contiguous neighbors. 1990 * Merge all three into a single free block. 1991 */ 1992 if (haveleft && haveright) { 1993 /* 1994 * Delete the old by-size entry on the left. 1995 */ 1996 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) 1997 goto error0; 1998 if (XFS_IS_CORRUPT(mp, i != 1)) { 1999 error = -EFSCORRUPTED; 2000 goto error0; 2001 } 2002 if ((error = xfs_btree_delete(cnt_cur, &i))) 2003 goto error0; 2004 if (XFS_IS_CORRUPT(mp, i != 1)) { 2005 error = -EFSCORRUPTED; 2006 goto error0; 2007 } 2008 /* 2009 * Delete the old by-size entry on the right. 2010 */ 2011 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) 2012 goto error0; 2013 if (XFS_IS_CORRUPT(mp, i != 1)) { 2014 error = -EFSCORRUPTED; 2015 goto error0; 2016 } 2017 if ((error = xfs_btree_delete(cnt_cur, &i))) 2018 goto error0; 2019 if (XFS_IS_CORRUPT(mp, i != 1)) { 2020 error = -EFSCORRUPTED; 2021 goto error0; 2022 } 2023 /* 2024 * Delete the old by-block entry for the right block. 2025 */ 2026 if ((error = xfs_btree_delete(bno_cur, &i))) 2027 goto error0; 2028 if (XFS_IS_CORRUPT(mp, i != 1)) { 2029 error = -EFSCORRUPTED; 2030 goto error0; 2031 } 2032 /* 2033 * Move the by-block cursor back to the left neighbor. 2034 */ 2035 if ((error = xfs_btree_decrement(bno_cur, 0, &i))) 2036 goto error0; 2037 if (XFS_IS_CORRUPT(mp, i != 1)) { 2038 error = -EFSCORRUPTED; 2039 goto error0; 2040 } 2041 #ifdef DEBUG 2042 /* 2043 * Check that this is the right record: delete didn't 2044 * mangle the cursor. 2045 */ 2046 { 2047 xfs_agblock_t xxbno; 2048 xfs_extlen_t xxlen; 2049 2050 if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen, 2051 &i))) 2052 goto error0; 2053 if (XFS_IS_CORRUPT(mp, 2054 i != 1 || 2055 xxbno != ltbno || 2056 xxlen != ltlen)) { 2057 error = -EFSCORRUPTED; 2058 goto error0; 2059 } 2060 } 2061 #endif 2062 /* 2063 * Update remaining by-block entry to the new, joined block. 2064 */ 2065 nbno = ltbno; 2066 nlen = len + ltlen + gtlen; 2067 if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) 2068 goto error0; 2069 } 2070 /* 2071 * Have only a left contiguous neighbor. 2072 * Merge it together with the new freespace. 2073 */ 2074 else if (haveleft) { 2075 /* 2076 * Delete the old by-size entry on the left. 2077 */ 2078 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) 2079 goto error0; 2080 if (XFS_IS_CORRUPT(mp, i != 1)) { 2081 error = -EFSCORRUPTED; 2082 goto error0; 2083 } 2084 if ((error = xfs_btree_delete(cnt_cur, &i))) 2085 goto error0; 2086 if (XFS_IS_CORRUPT(mp, i != 1)) { 2087 error = -EFSCORRUPTED; 2088 goto error0; 2089 } 2090 /* 2091 * Back up the by-block cursor to the left neighbor, and 2092 * update its length. 2093 */ 2094 if ((error = xfs_btree_decrement(bno_cur, 0, &i))) 2095 goto error0; 2096 if (XFS_IS_CORRUPT(mp, i != 1)) { 2097 error = -EFSCORRUPTED; 2098 goto error0; 2099 } 2100 nbno = ltbno; 2101 nlen = len + ltlen; 2102 if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) 2103 goto error0; 2104 } 2105 /* 2106 * Have only a right contiguous neighbor. 2107 * Merge it together with the new freespace. 2108 */ 2109 else if (haveright) { 2110 /* 2111 * Delete the old by-size entry on the right. 2112 */ 2113 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) 2114 goto error0; 2115 if (XFS_IS_CORRUPT(mp, i != 1)) { 2116 error = -EFSCORRUPTED; 2117 goto error0; 2118 } 2119 if ((error = xfs_btree_delete(cnt_cur, &i))) 2120 goto error0; 2121 if (XFS_IS_CORRUPT(mp, i != 1)) { 2122 error = -EFSCORRUPTED; 2123 goto error0; 2124 } 2125 /* 2126 * Update the starting block and length of the right 2127 * neighbor in the by-block tree. 2128 */ 2129 nbno = bno; 2130 nlen = len + gtlen; 2131 if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) 2132 goto error0; 2133 } 2134 /* 2135 * No contiguous neighbors. 2136 * Insert the new freespace into the by-block tree. 2137 */ 2138 else { 2139 nbno = bno; 2140 nlen = len; 2141 if ((error = xfs_btree_insert(bno_cur, &i))) 2142 goto error0; 2143 if (XFS_IS_CORRUPT(mp, i != 1)) { 2144 error = -EFSCORRUPTED; 2145 goto error0; 2146 } 2147 } 2148 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 2149 bno_cur = NULL; 2150 /* 2151 * In all cases we need to insert the new freespace in the by-size tree. 2152 */ 2153 if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i))) 2154 goto error0; 2155 if (XFS_IS_CORRUPT(mp, i != 0)) { 2156 error = -EFSCORRUPTED; 2157 goto error0; 2158 } 2159 if ((error = xfs_btree_insert(cnt_cur, &i))) 2160 goto error0; 2161 if (XFS_IS_CORRUPT(mp, i != 1)) { 2162 error = -EFSCORRUPTED; 2163 goto error0; 2164 } 2165 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 2166 cnt_cur = NULL; 2167 2168 /* 2169 * Update the freespace totals in the ag and superblock. 2170 */ 2171 pag = xfs_perag_get(mp, agno); 2172 error = xfs_alloc_update_counters(tp, pag, agbp, len); 2173 xfs_ag_resv_free_extent(pag, type, tp, len); 2174 xfs_perag_put(pag); 2175 if (error) 2176 goto error0; 2177 2178 XFS_STATS_INC(mp, xs_freex); 2179 XFS_STATS_ADD(mp, xs_freeb, len); 2180 2181 trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright); 2182 2183 return 0; 2184 2185 error0: 2186 trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1); 2187 if (bno_cur) 2188 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); 2189 if (cnt_cur) 2190 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); 2191 return error; 2192 } 2193 2194 /* 2195 * Visible (exported) allocation/free functions. 2196 * Some of these are used just by xfs_alloc_btree.c and this file. 2197 */ 2198 2199 /* 2200 * Compute and fill in value of m_ag_maxlevels. 2201 */ 2202 void 2203 xfs_alloc_compute_maxlevels( 2204 xfs_mount_t *mp) /* file system mount structure */ 2205 { 2206 mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr, 2207 (mp->m_sb.sb_agblocks + 1) / 2); 2208 } 2209 2210 /* 2211 * Find the length of the longest extent in an AG. The 'need' parameter 2212 * specifies how much space we're going to need for the AGFL and the 2213 * 'reserved' parameter tells us how many blocks in this AG are reserved for 2214 * other callers. 2215 */ 2216 xfs_extlen_t 2217 xfs_alloc_longest_free_extent( 2218 struct xfs_perag *pag, 2219 xfs_extlen_t need, 2220 xfs_extlen_t reserved) 2221 { 2222 xfs_extlen_t delta = 0; 2223 2224 /* 2225 * If the AGFL needs a recharge, we'll have to subtract that from the 2226 * longest extent. 2227 */ 2228 if (need > pag->pagf_flcount) 2229 delta = need - pag->pagf_flcount; 2230 2231 /* 2232 * If we cannot maintain others' reservations with space from the 2233 * not-longest freesp extents, we'll have to subtract /that/ from 2234 * the longest extent too. 2235 */ 2236 if (pag->pagf_freeblks - pag->pagf_longest < reserved) 2237 delta += reserved - (pag->pagf_freeblks - pag->pagf_longest); 2238 2239 /* 2240 * If the longest extent is long enough to satisfy all the 2241 * reservations and AGFL rules in place, we can return this extent. 2242 */ 2243 if (pag->pagf_longest > delta) 2244 return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable, 2245 pag->pagf_longest - delta); 2246 2247 /* Otherwise, let the caller try for 1 block if there's space. */ 2248 return pag->pagf_flcount > 0 || pag->pagf_longest > 0; 2249 } 2250 2251 /* 2252 * Compute the minimum length of the AGFL in the given AG. If @pag is NULL, 2253 * return the largest possible minimum length. 2254 */ 2255 unsigned int 2256 xfs_alloc_min_freelist( 2257 struct xfs_mount *mp, 2258 struct xfs_perag *pag) 2259 { 2260 /* AG btrees have at least 1 level. */ 2261 static const uint8_t fake_levels[XFS_BTNUM_AGF] = {1, 1, 1}; 2262 const uint8_t *levels = pag ? pag->pagf_levels : fake_levels; 2263 unsigned int min_free; 2264 2265 ASSERT(mp->m_ag_maxlevels > 0); 2266 2267 /* space needed by-bno freespace btree */ 2268 min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1, 2269 mp->m_ag_maxlevels); 2270 /* space needed by-size freespace btree */ 2271 min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1, 2272 mp->m_ag_maxlevels); 2273 /* space needed reverse mapping used space btree */ 2274 if (xfs_sb_version_hasrmapbt(&mp->m_sb)) 2275 min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1, 2276 mp->m_rmap_maxlevels); 2277 2278 return min_free; 2279 } 2280 2281 /* 2282 * Check if the operation we are fixing up the freelist for should go ahead or 2283 * not. If we are freeing blocks, we always allow it, otherwise the allocation 2284 * is dependent on whether the size and shape of free space available will 2285 * permit the requested allocation to take place. 2286 */ 2287 static bool 2288 xfs_alloc_space_available( 2289 struct xfs_alloc_arg *args, 2290 xfs_extlen_t min_free, 2291 int flags) 2292 { 2293 struct xfs_perag *pag = args->pag; 2294 xfs_extlen_t alloc_len, longest; 2295 xfs_extlen_t reservation; /* blocks that are still reserved */ 2296 int available; 2297 xfs_extlen_t agflcount; 2298 2299 if (flags & XFS_ALLOC_FLAG_FREEING) 2300 return true; 2301 2302 reservation = xfs_ag_resv_needed(pag, args->resv); 2303 2304 /* do we have enough contiguous free space for the allocation? */ 2305 alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop; 2306 longest = xfs_alloc_longest_free_extent(pag, min_free, reservation); 2307 if (longest < alloc_len) 2308 return false; 2309 2310 /* 2311 * Do we have enough free space remaining for the allocation? Don't 2312 * account extra agfl blocks because we are about to defer free them, 2313 * making them unavailable until the current transaction commits. 2314 */ 2315 agflcount = min_t(xfs_extlen_t, pag->pagf_flcount, min_free); 2316 available = (int)(pag->pagf_freeblks + agflcount - 2317 reservation - min_free - args->minleft); 2318 if (available < (int)max(args->total, alloc_len)) 2319 return false; 2320 2321 /* 2322 * Clamp maxlen to the amount of free space available for the actual 2323 * extent allocation. 2324 */ 2325 if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) { 2326 args->maxlen = available; 2327 ASSERT(args->maxlen > 0); 2328 ASSERT(args->maxlen >= args->minlen); 2329 } 2330 2331 return true; 2332 } 2333 2334 int 2335 xfs_free_agfl_block( 2336 struct xfs_trans *tp, 2337 xfs_agnumber_t agno, 2338 xfs_agblock_t agbno, 2339 struct xfs_buf *agbp, 2340 struct xfs_owner_info *oinfo) 2341 { 2342 int error; 2343 struct xfs_buf *bp; 2344 2345 error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo, 2346 XFS_AG_RESV_AGFL); 2347 if (error) 2348 return error; 2349 2350 error = xfs_trans_get_buf(tp, tp->t_mountp->m_ddev_targp, 2351 XFS_AGB_TO_DADDR(tp->t_mountp, agno, agbno), 2352 tp->t_mountp->m_bsize, 0, &bp); 2353 if (error) 2354 return error; 2355 xfs_trans_binval(tp, bp); 2356 2357 return 0; 2358 } 2359 2360 /* 2361 * Check the agfl fields of the agf for inconsistency or corruption. The purpose 2362 * is to detect an agfl header padding mismatch between current and early v5 2363 * kernels. This problem manifests as a 1-slot size difference between the 2364 * on-disk flcount and the active [first, last] range of a wrapped agfl. This 2365 * may also catch variants of agfl count corruption unrelated to padding. Either 2366 * way, we'll reset the agfl and warn the user. 2367 * 2368 * Return true if a reset is required before the agfl can be used, false 2369 * otherwise. 2370 */ 2371 static bool 2372 xfs_agfl_needs_reset( 2373 struct xfs_mount *mp, 2374 struct xfs_agf *agf) 2375 { 2376 uint32_t f = be32_to_cpu(agf->agf_flfirst); 2377 uint32_t l = be32_to_cpu(agf->agf_fllast); 2378 uint32_t c = be32_to_cpu(agf->agf_flcount); 2379 int agfl_size = xfs_agfl_size(mp); 2380 int active; 2381 2382 /* no agfl header on v4 supers */ 2383 if (!xfs_sb_version_hascrc(&mp->m_sb)) 2384 return false; 2385 2386 /* 2387 * The agf read verifier catches severe corruption of these fields. 2388 * Repeat some sanity checks to cover a packed -> unpacked mismatch if 2389 * the verifier allows it. 2390 */ 2391 if (f >= agfl_size || l >= agfl_size) 2392 return true; 2393 if (c > agfl_size) 2394 return true; 2395 2396 /* 2397 * Check consistency between the on-disk count and the active range. An 2398 * agfl padding mismatch manifests as an inconsistent flcount. 2399 */ 2400 if (c && l >= f) 2401 active = l - f + 1; 2402 else if (c) 2403 active = agfl_size - f + l + 1; 2404 else 2405 active = 0; 2406 2407 return active != c; 2408 } 2409 2410 /* 2411 * Reset the agfl to an empty state. Ignore/drop any existing blocks since the 2412 * agfl content cannot be trusted. Warn the user that a repair is required to 2413 * recover leaked blocks. 2414 * 2415 * The purpose of this mechanism is to handle filesystems affected by the agfl 2416 * header padding mismatch problem. A reset keeps the filesystem online with a 2417 * relatively minor free space accounting inconsistency rather than suffer the 2418 * inevitable crash from use of an invalid agfl block. 2419 */ 2420 static void 2421 xfs_agfl_reset( 2422 struct xfs_trans *tp, 2423 struct xfs_buf *agbp, 2424 struct xfs_perag *pag) 2425 { 2426 struct xfs_mount *mp = tp->t_mountp; 2427 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); 2428 2429 ASSERT(pag->pagf_agflreset); 2430 trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_); 2431 2432 xfs_warn(mp, 2433 "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. " 2434 "Please unmount and run xfs_repair.", 2435 pag->pag_agno, pag->pagf_flcount); 2436 2437 agf->agf_flfirst = 0; 2438 agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1); 2439 agf->agf_flcount = 0; 2440 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST | 2441 XFS_AGF_FLCOUNT); 2442 2443 pag->pagf_flcount = 0; 2444 pag->pagf_agflreset = false; 2445 } 2446 2447 /* 2448 * Defer an AGFL block free. This is effectively equivalent to 2449 * xfs_bmap_add_free() with some special handling particular to AGFL blocks. 2450 * 2451 * Deferring AGFL frees helps prevent log reservation overruns due to too many 2452 * allocation operations in a transaction. AGFL frees are prone to this problem 2453 * because for one they are always freed one at a time. Further, an immediate 2454 * AGFL block free can cause a btree join and require another block free before 2455 * the real allocation can proceed. Deferring the free disconnects freeing up 2456 * the AGFL slot from freeing the block. 2457 */ 2458 STATIC void 2459 xfs_defer_agfl_block( 2460 struct xfs_trans *tp, 2461 xfs_agnumber_t agno, 2462 xfs_fsblock_t agbno, 2463 struct xfs_owner_info *oinfo) 2464 { 2465 struct xfs_mount *mp = tp->t_mountp; 2466 struct xfs_extent_free_item *new; /* new element */ 2467 2468 ASSERT(xfs_bmap_free_item_zone != NULL); 2469 ASSERT(oinfo != NULL); 2470 2471 new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0); 2472 new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno); 2473 new->xefi_blockcount = 1; 2474 new->xefi_oinfo = *oinfo; 2475 2476 trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); 2477 2478 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list); 2479 } 2480 2481 /* 2482 * Decide whether to use this allocation group for this allocation. 2483 * If so, fix up the btree freelist's size. 2484 */ 2485 int /* error */ 2486 xfs_alloc_fix_freelist( 2487 struct xfs_alloc_arg *args, /* allocation argument structure */ 2488 int flags) /* XFS_ALLOC_FLAG_... */ 2489 { 2490 struct xfs_mount *mp = args->mp; 2491 struct xfs_perag *pag = args->pag; 2492 struct xfs_trans *tp = args->tp; 2493 struct xfs_buf *agbp = NULL; 2494 struct xfs_buf *agflbp = NULL; 2495 struct xfs_alloc_arg targs; /* local allocation arguments */ 2496 xfs_agblock_t bno; /* freelist block */ 2497 xfs_extlen_t need; /* total blocks needed in freelist */ 2498 int error = 0; 2499 2500 /* deferred ops (AGFL block frees) require permanent transactions */ 2501 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 2502 2503 if (!pag->pagf_init) { 2504 error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp); 2505 if (error) { 2506 /* Couldn't lock the AGF so skip this AG. */ 2507 if (error == -EAGAIN) 2508 error = 0; 2509 goto out_no_agbp; 2510 } 2511 } 2512 2513 /* 2514 * If this is a metadata preferred pag and we are user data then try 2515 * somewhere else if we are not being asked to try harder at this 2516 * point 2517 */ 2518 if (pag->pagf_metadata && (args->datatype & XFS_ALLOC_USERDATA) && 2519 (flags & XFS_ALLOC_FLAG_TRYLOCK)) { 2520 ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); 2521 goto out_agbp_relse; 2522 } 2523 2524 need = xfs_alloc_min_freelist(mp, pag); 2525 if (!xfs_alloc_space_available(args, need, flags | 2526 XFS_ALLOC_FLAG_CHECK)) 2527 goto out_agbp_relse; 2528 2529 /* 2530 * Get the a.g. freespace buffer. 2531 * Can fail if we're not blocking on locks, and it's held. 2532 */ 2533 if (!agbp) { 2534 error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp); 2535 if (error) { 2536 /* Couldn't lock the AGF so skip this AG. */ 2537 if (error == -EAGAIN) 2538 error = 0; 2539 goto out_no_agbp; 2540 } 2541 } 2542 2543 /* reset a padding mismatched agfl before final free space check */ 2544 if (pag->pagf_agflreset) 2545 xfs_agfl_reset(tp, agbp, pag); 2546 2547 /* If there isn't enough total space or single-extent, reject it. */ 2548 need = xfs_alloc_min_freelist(mp, pag); 2549 if (!xfs_alloc_space_available(args, need, flags)) 2550 goto out_agbp_relse; 2551 2552 /* 2553 * Make the freelist shorter if it's too long. 2554 * 2555 * Note that from this point onwards, we will always release the agf and 2556 * agfl buffers on error. This handles the case where we error out and 2557 * the buffers are clean or may not have been joined to the transaction 2558 * and hence need to be released manually. If they have been joined to 2559 * the transaction, then xfs_trans_brelse() will handle them 2560 * appropriately based on the recursion count and dirty state of the 2561 * buffer. 2562 * 2563 * XXX (dgc): When we have lots of free space, does this buy us 2564 * anything other than extra overhead when we need to put more blocks 2565 * back on the free list? Maybe we should only do this when space is 2566 * getting low or the AGFL is more than half full? 2567 * 2568 * The NOSHRINK flag prevents the AGFL from being shrunk if it's too 2569 * big; the NORMAP flag prevents AGFL expand/shrink operations from 2570 * updating the rmapbt. Both flags are used in xfs_repair while we're 2571 * rebuilding the rmapbt, and neither are used by the kernel. They're 2572 * both required to ensure that rmaps are correctly recorded for the 2573 * regenerated AGFL, bnobt, and cntbt. See repair/phase5.c and 2574 * repair/rmap.c in xfsprogs for details. 2575 */ 2576 memset(&targs, 0, sizeof(targs)); 2577 /* struct copy below */ 2578 if (flags & XFS_ALLOC_FLAG_NORMAP) 2579 targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; 2580 else 2581 targs.oinfo = XFS_RMAP_OINFO_AG; 2582 while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) { 2583 error = xfs_alloc_get_freelist(tp, agbp, &bno, 0); 2584 if (error) 2585 goto out_agbp_relse; 2586 2587 /* defer agfl frees */ 2588 xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo); 2589 } 2590 2591 targs.tp = tp; 2592 targs.mp = mp; 2593 targs.agbp = agbp; 2594 targs.agno = args->agno; 2595 targs.alignment = targs.minlen = targs.prod = 1; 2596 targs.type = XFS_ALLOCTYPE_THIS_AG; 2597 targs.pag = pag; 2598 error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp); 2599 if (error) 2600 goto out_agbp_relse; 2601 2602 /* Make the freelist longer if it's too short. */ 2603 while (pag->pagf_flcount < need) { 2604 targs.agbno = 0; 2605 targs.maxlen = need - pag->pagf_flcount; 2606 targs.resv = XFS_AG_RESV_AGFL; 2607 2608 /* Allocate as many blocks as possible at once. */ 2609 error = xfs_alloc_ag_vextent(&targs); 2610 if (error) 2611 goto out_agflbp_relse; 2612 2613 /* 2614 * Stop if we run out. Won't happen if callers are obeying 2615 * the restrictions correctly. Can happen for free calls 2616 * on a completely full ag. 2617 */ 2618 if (targs.agbno == NULLAGBLOCK) { 2619 if (flags & XFS_ALLOC_FLAG_FREEING) 2620 break; 2621 goto out_agflbp_relse; 2622 } 2623 /* 2624 * Put each allocated block on the list. 2625 */ 2626 for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) { 2627 error = xfs_alloc_put_freelist(tp, agbp, 2628 agflbp, bno, 0); 2629 if (error) 2630 goto out_agflbp_relse; 2631 } 2632 } 2633 xfs_trans_brelse(tp, agflbp); 2634 args->agbp = agbp; 2635 return 0; 2636 2637 out_agflbp_relse: 2638 xfs_trans_brelse(tp, agflbp); 2639 out_agbp_relse: 2640 if (agbp) 2641 xfs_trans_brelse(tp, agbp); 2642 out_no_agbp: 2643 args->agbp = NULL; 2644 return error; 2645 } 2646 2647 /* 2648 * Get a block from the freelist. 2649 * Returns with the buffer for the block gotten. 2650 */ 2651 int /* error */ 2652 xfs_alloc_get_freelist( 2653 xfs_trans_t *tp, /* transaction pointer */ 2654 xfs_buf_t *agbp, /* buffer containing the agf structure */ 2655 xfs_agblock_t *bnop, /* block address retrieved from freelist */ 2656 int btreeblk) /* destination is a AGF btree */ 2657 { 2658 xfs_agf_t *agf; /* a.g. freespace structure */ 2659 xfs_buf_t *agflbp;/* buffer for a.g. freelist structure */ 2660 xfs_agblock_t bno; /* block number returned */ 2661 __be32 *agfl_bno; 2662 int error; 2663 int logflags; 2664 xfs_mount_t *mp = tp->t_mountp; 2665 xfs_perag_t *pag; /* per allocation group data */ 2666 2667 /* 2668 * Freelist is empty, give up. 2669 */ 2670 agf = XFS_BUF_TO_AGF(agbp); 2671 if (!agf->agf_flcount) { 2672 *bnop = NULLAGBLOCK; 2673 return 0; 2674 } 2675 /* 2676 * Read the array of free blocks. 2677 */ 2678 error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno), 2679 &agflbp); 2680 if (error) 2681 return error; 2682 2683 2684 /* 2685 * Get the block number and update the data structures. 2686 */ 2687 agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp); 2688 bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]); 2689 be32_add_cpu(&agf->agf_flfirst, 1); 2690 xfs_trans_brelse(tp, agflbp); 2691 if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp)) 2692 agf->agf_flfirst = 0; 2693 2694 pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno)); 2695 ASSERT(!pag->pagf_agflreset); 2696 be32_add_cpu(&agf->agf_flcount, -1); 2697 xfs_trans_agflist_delta(tp, -1); 2698 pag->pagf_flcount--; 2699 2700 logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT; 2701 if (btreeblk) { 2702 be32_add_cpu(&agf->agf_btreeblks, 1); 2703 pag->pagf_btreeblks++; 2704 logflags |= XFS_AGF_BTREEBLKS; 2705 } 2706 xfs_perag_put(pag); 2707 2708 xfs_alloc_log_agf(tp, agbp, logflags); 2709 *bnop = bno; 2710 2711 return 0; 2712 } 2713 2714 /* 2715 * Log the given fields from the agf structure. 2716 */ 2717 void 2718 xfs_alloc_log_agf( 2719 xfs_trans_t *tp, /* transaction pointer */ 2720 xfs_buf_t *bp, /* buffer for a.g. freelist header */ 2721 int fields) /* mask of fields to be logged (XFS_AGF_...) */ 2722 { 2723 int first; /* first byte offset */ 2724 int last; /* last byte offset */ 2725 static const short offsets[] = { 2726 offsetof(xfs_agf_t, agf_magicnum), 2727 offsetof(xfs_agf_t, agf_versionnum), 2728 offsetof(xfs_agf_t, agf_seqno), 2729 offsetof(xfs_agf_t, agf_length), 2730 offsetof(xfs_agf_t, agf_roots[0]), 2731 offsetof(xfs_agf_t, agf_levels[0]), 2732 offsetof(xfs_agf_t, agf_flfirst), 2733 offsetof(xfs_agf_t, agf_fllast), 2734 offsetof(xfs_agf_t, agf_flcount), 2735 offsetof(xfs_agf_t, agf_freeblks), 2736 offsetof(xfs_agf_t, agf_longest), 2737 offsetof(xfs_agf_t, agf_btreeblks), 2738 offsetof(xfs_agf_t, agf_uuid), 2739 offsetof(xfs_agf_t, agf_rmap_blocks), 2740 offsetof(xfs_agf_t, agf_refcount_blocks), 2741 offsetof(xfs_agf_t, agf_refcount_root), 2742 offsetof(xfs_agf_t, agf_refcount_level), 2743 /* needed so that we don't log the whole rest of the structure: */ 2744 offsetof(xfs_agf_t, agf_spare64), 2745 sizeof(xfs_agf_t) 2746 }; 2747 2748 trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_); 2749 2750 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF); 2751 2752 xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last); 2753 xfs_trans_log_buf(tp, bp, (uint)first, (uint)last); 2754 } 2755 2756 /* 2757 * Interface for inode allocation to force the pag data to be initialized. 2758 */ 2759 int /* error */ 2760 xfs_alloc_pagf_init( 2761 xfs_mount_t *mp, /* file system mount structure */ 2762 xfs_trans_t *tp, /* transaction pointer */ 2763 xfs_agnumber_t agno, /* allocation group number */ 2764 int flags) /* XFS_ALLOC_FLAGS_... */ 2765 { 2766 xfs_buf_t *bp; 2767 int error; 2768 2769 error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp); 2770 if (!error) 2771 xfs_trans_brelse(tp, bp); 2772 return error; 2773 } 2774 2775 /* 2776 * Put the block on the freelist for the allocation group. 2777 */ 2778 int /* error */ 2779 xfs_alloc_put_freelist( 2780 xfs_trans_t *tp, /* transaction pointer */ 2781 xfs_buf_t *agbp, /* buffer for a.g. freelist header */ 2782 xfs_buf_t *agflbp,/* buffer for a.g. free block array */ 2783 xfs_agblock_t bno, /* block being freed */ 2784 int btreeblk) /* block came from a AGF btree */ 2785 { 2786 xfs_agf_t *agf; /* a.g. freespace structure */ 2787 __be32 *blockp;/* pointer to array entry */ 2788 int error; 2789 int logflags; 2790 xfs_mount_t *mp; /* mount structure */ 2791 xfs_perag_t *pag; /* per allocation group data */ 2792 __be32 *agfl_bno; 2793 int startoff; 2794 2795 agf = XFS_BUF_TO_AGF(agbp); 2796 mp = tp->t_mountp; 2797 2798 if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp, 2799 be32_to_cpu(agf->agf_seqno), &agflbp))) 2800 return error; 2801 be32_add_cpu(&agf->agf_fllast, 1); 2802 if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp)) 2803 agf->agf_fllast = 0; 2804 2805 pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno)); 2806 ASSERT(!pag->pagf_agflreset); 2807 be32_add_cpu(&agf->agf_flcount, 1); 2808 xfs_trans_agflist_delta(tp, 1); 2809 pag->pagf_flcount++; 2810 2811 logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT; 2812 if (btreeblk) { 2813 be32_add_cpu(&agf->agf_btreeblks, -1); 2814 pag->pagf_btreeblks--; 2815 logflags |= XFS_AGF_BTREEBLKS; 2816 } 2817 xfs_perag_put(pag); 2818 2819 xfs_alloc_log_agf(tp, agbp, logflags); 2820 2821 ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)); 2822 2823 agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp); 2824 blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)]; 2825 *blockp = cpu_to_be32(bno); 2826 startoff = (char *)blockp - (char *)agflbp->b_addr; 2827 2828 xfs_alloc_log_agf(tp, agbp, logflags); 2829 2830 xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF); 2831 xfs_trans_log_buf(tp, agflbp, startoff, 2832 startoff + sizeof(xfs_agblock_t) - 1); 2833 return 0; 2834 } 2835 2836 static xfs_failaddr_t 2837 xfs_agf_verify( 2838 struct xfs_buf *bp) 2839 { 2840 struct xfs_mount *mp = bp->b_mount; 2841 struct xfs_agf *agf = XFS_BUF_TO_AGF(bp); 2842 2843 if (xfs_sb_version_hascrc(&mp->m_sb)) { 2844 if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid)) 2845 return __this_address; 2846 if (!xfs_log_check_lsn(mp, 2847 be64_to_cpu(XFS_BUF_TO_AGF(bp)->agf_lsn))) 2848 return __this_address; 2849 } 2850 2851 if (!xfs_verify_magic(bp, agf->agf_magicnum)) 2852 return __this_address; 2853 2854 if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && 2855 be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && 2856 be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) && 2857 be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) && 2858 be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp))) 2859 return __this_address; 2860 2861 if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 || 2862 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 || 2863 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS || 2864 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS) 2865 return __this_address; 2866 2867 if (xfs_sb_version_hasrmapbt(&mp->m_sb) && 2868 (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 || 2869 be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS)) 2870 return __this_address; 2871 2872 /* 2873 * during growfs operations, the perag is not fully initialised, 2874 * so we can't use it for any useful checking. growfs ensures we can't 2875 * use it by using uncached buffers that don't have the perag attached 2876 * so we can detect and avoid this problem. 2877 */ 2878 if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno) 2879 return __this_address; 2880 2881 if (xfs_sb_version_haslazysbcount(&mp->m_sb) && 2882 be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length)) 2883 return __this_address; 2884 2885 if (xfs_sb_version_hasreflink(&mp->m_sb) && 2886 (be32_to_cpu(agf->agf_refcount_level) < 1 || 2887 be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS)) 2888 return __this_address; 2889 2890 return NULL; 2891 2892 } 2893 2894 static void 2895 xfs_agf_read_verify( 2896 struct xfs_buf *bp) 2897 { 2898 struct xfs_mount *mp = bp->b_mount; 2899 xfs_failaddr_t fa; 2900 2901 if (xfs_sb_version_hascrc(&mp->m_sb) && 2902 !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF)) 2903 xfs_verifier_error(bp, -EFSBADCRC, __this_address); 2904 else { 2905 fa = xfs_agf_verify(bp); 2906 if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF)) 2907 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 2908 } 2909 } 2910 2911 static void 2912 xfs_agf_write_verify( 2913 struct xfs_buf *bp) 2914 { 2915 struct xfs_mount *mp = bp->b_mount; 2916 struct xfs_buf_log_item *bip = bp->b_log_item; 2917 xfs_failaddr_t fa; 2918 2919 fa = xfs_agf_verify(bp); 2920 if (fa) { 2921 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 2922 return; 2923 } 2924 2925 if (!xfs_sb_version_hascrc(&mp->m_sb)) 2926 return; 2927 2928 if (bip) 2929 XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn); 2930 2931 xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF); 2932 } 2933 2934 const struct xfs_buf_ops xfs_agf_buf_ops = { 2935 .name = "xfs_agf", 2936 .magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) }, 2937 .verify_read = xfs_agf_read_verify, 2938 .verify_write = xfs_agf_write_verify, 2939 .verify_struct = xfs_agf_verify, 2940 }; 2941 2942 /* 2943 * Read in the allocation group header (free/alloc section). 2944 */ 2945 int /* error */ 2946 xfs_read_agf( 2947 struct xfs_mount *mp, /* mount point structure */ 2948 struct xfs_trans *tp, /* transaction pointer */ 2949 xfs_agnumber_t agno, /* allocation group number */ 2950 int flags, /* XFS_BUF_ */ 2951 struct xfs_buf **bpp) /* buffer for the ag freelist header */ 2952 { 2953 int error; 2954 2955 trace_xfs_read_agf(mp, agno); 2956 2957 ASSERT(agno != NULLAGNUMBER); 2958 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 2959 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), 2960 XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops); 2961 if (error) 2962 return error; 2963 2964 ASSERT(!(*bpp)->b_error); 2965 xfs_buf_set_ref(*bpp, XFS_AGF_REF); 2966 return 0; 2967 } 2968 2969 /* 2970 * Read in the allocation group header (free/alloc section). 2971 */ 2972 int /* error */ 2973 xfs_alloc_read_agf( 2974 struct xfs_mount *mp, /* mount point structure */ 2975 struct xfs_trans *tp, /* transaction pointer */ 2976 xfs_agnumber_t agno, /* allocation group number */ 2977 int flags, /* XFS_ALLOC_FLAG_... */ 2978 struct xfs_buf **bpp) /* buffer for the ag freelist header */ 2979 { 2980 struct xfs_agf *agf; /* ag freelist header */ 2981 struct xfs_perag *pag; /* per allocation group data */ 2982 int error; 2983 2984 trace_xfs_alloc_read_agf(mp, agno); 2985 2986 /* We don't support trylock when freeing. */ 2987 ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) != 2988 (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)); 2989 ASSERT(agno != NULLAGNUMBER); 2990 error = xfs_read_agf(mp, tp, agno, 2991 (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0, 2992 bpp); 2993 if (error) 2994 return error; 2995 ASSERT(!(*bpp)->b_error); 2996 2997 agf = XFS_BUF_TO_AGF(*bpp); 2998 pag = xfs_perag_get(mp, agno); 2999 if (!pag->pagf_init) { 3000 pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); 3001 pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); 3002 pag->pagf_flcount = be32_to_cpu(agf->agf_flcount); 3003 pag->pagf_longest = be32_to_cpu(agf->agf_longest); 3004 pag->pagf_levels[XFS_BTNUM_BNOi] = 3005 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]); 3006 pag->pagf_levels[XFS_BTNUM_CNTi] = 3007 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); 3008 pag->pagf_levels[XFS_BTNUM_RMAPi] = 3009 be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]); 3010 pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); 3011 pag->pagf_init = 1; 3012 pag->pagf_agflreset = xfs_agfl_needs_reset(mp, agf); 3013 } 3014 #ifdef DEBUG 3015 else if (!XFS_FORCED_SHUTDOWN(mp)) { 3016 ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks)); 3017 ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks)); 3018 ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount)); 3019 ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest)); 3020 ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] == 3021 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi])); 3022 ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] == 3023 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi])); 3024 } 3025 #endif 3026 xfs_perag_put(pag); 3027 return 0; 3028 } 3029 3030 /* 3031 * Allocate an extent (variable-size). 3032 * Depending on the allocation type, we either look in a single allocation 3033 * group or loop over the allocation groups to find the result. 3034 */ 3035 int /* error */ 3036 xfs_alloc_vextent( 3037 struct xfs_alloc_arg *args) /* allocation argument structure */ 3038 { 3039 xfs_agblock_t agsize; /* allocation group size */ 3040 int error; 3041 int flags; /* XFS_ALLOC_FLAG_... locking flags */ 3042 struct xfs_mount *mp; /* mount structure pointer */ 3043 xfs_agnumber_t sagno; /* starting allocation group number */ 3044 xfs_alloctype_t type; /* input allocation type */ 3045 int bump_rotor = 0; 3046 xfs_agnumber_t rotorstep = xfs_rotorstep; /* inode32 agf stepper */ 3047 3048 mp = args->mp; 3049 type = args->otype = args->type; 3050 args->agbno = NULLAGBLOCK; 3051 /* 3052 * Just fix this up, for the case where the last a.g. is shorter 3053 * (or there's only one a.g.) and the caller couldn't easily figure 3054 * that out (xfs_bmap_alloc). 3055 */ 3056 agsize = mp->m_sb.sb_agblocks; 3057 if (args->maxlen > agsize) 3058 args->maxlen = agsize; 3059 if (args->alignment == 0) 3060 args->alignment = 1; 3061 ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount); 3062 ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize); 3063 ASSERT(args->minlen <= args->maxlen); 3064 ASSERT(args->minlen <= agsize); 3065 ASSERT(args->mod < args->prod); 3066 if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount || 3067 XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize || 3068 args->minlen > args->maxlen || args->minlen > agsize || 3069 args->mod >= args->prod) { 3070 args->fsbno = NULLFSBLOCK; 3071 trace_xfs_alloc_vextent_badargs(args); 3072 return 0; 3073 } 3074 3075 switch (type) { 3076 case XFS_ALLOCTYPE_THIS_AG: 3077 case XFS_ALLOCTYPE_NEAR_BNO: 3078 case XFS_ALLOCTYPE_THIS_BNO: 3079 /* 3080 * These three force us into a single a.g. 3081 */ 3082 args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno); 3083 args->pag = xfs_perag_get(mp, args->agno); 3084 error = xfs_alloc_fix_freelist(args, 0); 3085 if (error) { 3086 trace_xfs_alloc_vextent_nofix(args); 3087 goto error0; 3088 } 3089 if (!args->agbp) { 3090 trace_xfs_alloc_vextent_noagbp(args); 3091 break; 3092 } 3093 args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); 3094 if ((error = xfs_alloc_ag_vextent(args))) 3095 goto error0; 3096 break; 3097 case XFS_ALLOCTYPE_START_BNO: 3098 /* 3099 * Try near allocation first, then anywhere-in-ag after 3100 * the first a.g. fails. 3101 */ 3102 if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) && 3103 (mp->m_flags & XFS_MOUNT_32BITINODES)) { 3104 args->fsbno = XFS_AGB_TO_FSB(mp, 3105 ((mp->m_agfrotor / rotorstep) % 3106 mp->m_sb.sb_agcount), 0); 3107 bump_rotor = 1; 3108 } 3109 args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); 3110 args->type = XFS_ALLOCTYPE_NEAR_BNO; 3111 /* FALLTHROUGH */ 3112 case XFS_ALLOCTYPE_FIRST_AG: 3113 /* 3114 * Rotate through the allocation groups looking for a winner. 3115 */ 3116 if (type == XFS_ALLOCTYPE_FIRST_AG) { 3117 /* 3118 * Start with allocation group given by bno. 3119 */ 3120 args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno); 3121 args->type = XFS_ALLOCTYPE_THIS_AG; 3122 sagno = 0; 3123 flags = 0; 3124 } else { 3125 /* 3126 * Start with the given allocation group. 3127 */ 3128 args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno); 3129 flags = XFS_ALLOC_FLAG_TRYLOCK; 3130 } 3131 /* 3132 * Loop over allocation groups twice; first time with 3133 * trylock set, second time without. 3134 */ 3135 for (;;) { 3136 args->pag = xfs_perag_get(mp, args->agno); 3137 error = xfs_alloc_fix_freelist(args, flags); 3138 if (error) { 3139 trace_xfs_alloc_vextent_nofix(args); 3140 goto error0; 3141 } 3142 /* 3143 * If we get a buffer back then the allocation will fly. 3144 */ 3145 if (args->agbp) { 3146 if ((error = xfs_alloc_ag_vextent(args))) 3147 goto error0; 3148 break; 3149 } 3150 3151 trace_xfs_alloc_vextent_loopfailed(args); 3152 3153 /* 3154 * Didn't work, figure out the next iteration. 3155 */ 3156 if (args->agno == sagno && 3157 type == XFS_ALLOCTYPE_START_BNO) 3158 args->type = XFS_ALLOCTYPE_THIS_AG; 3159 /* 3160 * For the first allocation, we can try any AG to get 3161 * space. However, if we already have allocated a 3162 * block, we don't want to try AGs whose number is below 3163 * sagno. Otherwise, we may end up with out-of-order 3164 * locking of AGF, which might cause deadlock. 3165 */ 3166 if (++(args->agno) == mp->m_sb.sb_agcount) { 3167 if (args->tp->t_firstblock != NULLFSBLOCK) 3168 args->agno = sagno; 3169 else 3170 args->agno = 0; 3171 } 3172 /* 3173 * Reached the starting a.g., must either be done 3174 * or switch to non-trylock mode. 3175 */ 3176 if (args->agno == sagno) { 3177 if (flags == 0) { 3178 args->agbno = NULLAGBLOCK; 3179 trace_xfs_alloc_vextent_allfailed(args); 3180 break; 3181 } 3182 3183 flags = 0; 3184 if (type == XFS_ALLOCTYPE_START_BNO) { 3185 args->agbno = XFS_FSB_TO_AGBNO(mp, 3186 args->fsbno); 3187 args->type = XFS_ALLOCTYPE_NEAR_BNO; 3188 } 3189 } 3190 xfs_perag_put(args->pag); 3191 } 3192 if (bump_rotor) { 3193 if (args->agno == sagno) 3194 mp->m_agfrotor = (mp->m_agfrotor + 1) % 3195 (mp->m_sb.sb_agcount * rotorstep); 3196 else 3197 mp->m_agfrotor = (args->agno * rotorstep + 1) % 3198 (mp->m_sb.sb_agcount * rotorstep); 3199 } 3200 break; 3201 default: 3202 ASSERT(0); 3203 /* NOTREACHED */ 3204 } 3205 if (args->agbno == NULLAGBLOCK) 3206 args->fsbno = NULLFSBLOCK; 3207 else { 3208 args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno); 3209 #ifdef DEBUG 3210 ASSERT(args->len >= args->minlen); 3211 ASSERT(args->len <= args->maxlen); 3212 ASSERT(args->agbno % args->alignment == 0); 3213 XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno), 3214 args->len); 3215 #endif 3216 3217 } 3218 xfs_perag_put(args->pag); 3219 return 0; 3220 error0: 3221 xfs_perag_put(args->pag); 3222 return error; 3223 } 3224 3225 /* Ensure that the freelist is at full capacity. */ 3226 int 3227 xfs_free_extent_fix_freelist( 3228 struct xfs_trans *tp, 3229 xfs_agnumber_t agno, 3230 struct xfs_buf **agbp) 3231 { 3232 struct xfs_alloc_arg args; 3233 int error; 3234 3235 memset(&args, 0, sizeof(struct xfs_alloc_arg)); 3236 args.tp = tp; 3237 args.mp = tp->t_mountp; 3238 args.agno = agno; 3239 3240 /* 3241 * validate that the block number is legal - the enables us to detect 3242 * and handle a silent filesystem corruption rather than crashing. 3243 */ 3244 if (args.agno >= args.mp->m_sb.sb_agcount) 3245 return -EFSCORRUPTED; 3246 3247 args.pag = xfs_perag_get(args.mp, args.agno); 3248 ASSERT(args.pag); 3249 3250 error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING); 3251 if (error) 3252 goto out; 3253 3254 *agbp = args.agbp; 3255 out: 3256 xfs_perag_put(args.pag); 3257 return error; 3258 } 3259 3260 /* 3261 * Free an extent. 3262 * Just break up the extent address and hand off to xfs_free_ag_extent 3263 * after fixing up the freelist. 3264 */ 3265 int 3266 __xfs_free_extent( 3267 struct xfs_trans *tp, 3268 xfs_fsblock_t bno, 3269 xfs_extlen_t len, 3270 const struct xfs_owner_info *oinfo, 3271 enum xfs_ag_resv_type type, 3272 bool skip_discard) 3273 { 3274 struct xfs_mount *mp = tp->t_mountp; 3275 struct xfs_buf *agbp; 3276 xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno); 3277 xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno); 3278 int error; 3279 unsigned int busy_flags = 0; 3280 3281 ASSERT(len != 0); 3282 ASSERT(type != XFS_AG_RESV_AGFL); 3283 3284 if (XFS_TEST_ERROR(false, mp, 3285 XFS_ERRTAG_FREE_EXTENT)) 3286 return -EIO; 3287 3288 error = xfs_free_extent_fix_freelist(tp, agno, &agbp); 3289 if (error) 3290 return error; 3291 3292 if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) { 3293 error = -EFSCORRUPTED; 3294 goto err; 3295 } 3296 3297 /* validate the extent size is legal now we have the agf locked */ 3298 if (XFS_IS_CORRUPT(mp, 3299 agbno + len > 3300 be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length))) { 3301 error = -EFSCORRUPTED; 3302 goto err; 3303 } 3304 3305 error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type); 3306 if (error) 3307 goto err; 3308 3309 if (skip_discard) 3310 busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD; 3311 xfs_extent_busy_insert(tp, agno, agbno, len, busy_flags); 3312 return 0; 3313 3314 err: 3315 xfs_trans_brelse(tp, agbp); 3316 return error; 3317 } 3318 3319 struct xfs_alloc_query_range_info { 3320 xfs_alloc_query_range_fn fn; 3321 void *priv; 3322 }; 3323 3324 /* Format btree record and pass to our callback. */ 3325 STATIC int 3326 xfs_alloc_query_range_helper( 3327 struct xfs_btree_cur *cur, 3328 union xfs_btree_rec *rec, 3329 void *priv) 3330 { 3331 struct xfs_alloc_query_range_info *query = priv; 3332 struct xfs_alloc_rec_incore irec; 3333 3334 irec.ar_startblock = be32_to_cpu(rec->alloc.ar_startblock); 3335 irec.ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount); 3336 return query->fn(cur, &irec, query->priv); 3337 } 3338 3339 /* Find all free space within a given range of blocks. */ 3340 int 3341 xfs_alloc_query_range( 3342 struct xfs_btree_cur *cur, 3343 struct xfs_alloc_rec_incore *low_rec, 3344 struct xfs_alloc_rec_incore *high_rec, 3345 xfs_alloc_query_range_fn fn, 3346 void *priv) 3347 { 3348 union xfs_btree_irec low_brec; 3349 union xfs_btree_irec high_brec; 3350 struct xfs_alloc_query_range_info query; 3351 3352 ASSERT(cur->bc_btnum == XFS_BTNUM_BNO); 3353 low_brec.a = *low_rec; 3354 high_brec.a = *high_rec; 3355 query.priv = priv; 3356 query.fn = fn; 3357 return xfs_btree_query_range(cur, &low_brec, &high_brec, 3358 xfs_alloc_query_range_helper, &query); 3359 } 3360 3361 /* Find all free space records. */ 3362 int 3363 xfs_alloc_query_all( 3364 struct xfs_btree_cur *cur, 3365 xfs_alloc_query_range_fn fn, 3366 void *priv) 3367 { 3368 struct xfs_alloc_query_range_info query; 3369 3370 ASSERT(cur->bc_btnum == XFS_BTNUM_BNO); 3371 query.priv = priv; 3372 query.fn = fn; 3373 return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query); 3374 } 3375 3376 /* Is there a record covering a given extent? */ 3377 int 3378 xfs_alloc_has_record( 3379 struct xfs_btree_cur *cur, 3380 xfs_agblock_t bno, 3381 xfs_extlen_t len, 3382 bool *exists) 3383 { 3384 union xfs_btree_irec low; 3385 union xfs_btree_irec high; 3386 3387 memset(&low, 0, sizeof(low)); 3388 low.a.ar_startblock = bno; 3389 memset(&high, 0xFF, sizeof(high)); 3390 high.a.ar_startblock = bno + len - 1; 3391 3392 return xfs_btree_has_record(cur, &low, &high, exists); 3393 } 3394 3395 /* 3396 * Walk all the blocks in the AGFL. The @walk_fn can return any negative 3397 * error code or XFS_ITER_*. 3398 */ 3399 int 3400 xfs_agfl_walk( 3401 struct xfs_mount *mp, 3402 struct xfs_agf *agf, 3403 struct xfs_buf *agflbp, 3404 xfs_agfl_walk_fn walk_fn, 3405 void *priv) 3406 { 3407 __be32 *agfl_bno; 3408 unsigned int i; 3409 int error; 3410 3411 agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp); 3412 i = be32_to_cpu(agf->agf_flfirst); 3413 3414 /* Nothing to walk in an empty AGFL. */ 3415 if (agf->agf_flcount == cpu_to_be32(0)) 3416 return 0; 3417 3418 /* Otherwise, walk from first to last, wrapping as needed. */ 3419 for (;;) { 3420 error = walk_fn(mp, be32_to_cpu(agfl_bno[i]), priv); 3421 if (error) 3422 return error; 3423 if (i == be32_to_cpu(agf->agf_fllast)) 3424 break; 3425 if (++i == xfs_agfl_size(mp)) 3426 i = 0; 3427 } 3428 3429 return 0; 3430 } 3431