// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
 * Copyright (c) 2010 David Chinner.
 * Copyright (c) 2011 Christoph Hellwig.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_log.h"
#include "xfs_ag.h"

static void
xfs_extent_busy_insert_list(
	struct xfs_perag	*pag,
	xfs_agblock_t		bno,
	xfs_extlen_t		len,
	unsigned int		flags,
	struct list_head	*busy_list)
{
	struct xfs_extent_busy	*new;
	struct xfs_extent_busy	*busyp;
	struct rb_node		**rbp;
	struct rb_node		*parent = NULL;

	new = kmem_zalloc(sizeof(struct xfs_extent_busy), 0);
	new->agno = pag->pag_agno;
	new->bno = bno;
	new->length = len;
	INIT_LIST_HEAD(&new->list);
	new->flags = flags;

	/* trace before insert to be able to see failed inserts */
	trace_xfs_extent_busy(pag->pag_mount, pag->pag_agno, bno, len);

	spin_lock(&pag->pagb_lock);
	rbp = &pag->pagb_tree.rb_node;
	while (*rbp) {
		parent = *rbp;
		busyp = rb_entry(parent, struct xfs_extent_busy, rb_node);

		if (new->bno < busyp->bno) {
			rbp = &(*rbp)->rb_left;
			ASSERT(new->bno + new->length <= busyp->bno);
		} else if (new->bno > busyp->bno) {
			rbp = &(*rbp)->rb_right;
			ASSERT(bno >= busyp->bno + busyp->length);
		} else {
			ASSERT(0);
		}
	}

	rb_link_node(&new->rb_node, parent, rbp);
	rb_insert_color(&new->rb_node, &pag->pagb_tree);

	list_add(&new->list, busy_list);
	spin_unlock(&pag->pagb_lock);
}

void
xfs_extent_busy_insert(
	struct xfs_trans	*tp,
	struct xfs_perag	*pag,
	xfs_agblock_t		bno,
	xfs_extlen_t		len,
	unsigned int		flags)
{
	xfs_extent_busy_insert_list(pag, bno, len, flags, &tp->t_busy);
}

void
xfs_extent_busy_insert_discard(
	struct xfs_perag	*pag,
	xfs_agblock_t		bno,
	xfs_extlen_t		len,
	struct list_head	*busy_list)
{
	xfs_extent_busy_insert_list(pag, bno, len, XFS_EXTENT_BUSY_DISCARDED,
			busy_list);
}
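
/*
 * Illustrative sketch only (the call sites below are simplified assumptions,
 * not code from this file): a transaction that frees an extent marks it busy
 * so the blocks cannot be reallocated and overwritten before the free reaches
 * the log, and the log completion path later drops the entry again:
 *
 *	// while logging the free of [bno, bno + len) in AG 'pag':
 *	xfs_extent_busy_insert(tp, pag, bno, len, 0);
 *
 *	// once the checkpoint containing that transaction has committed
 *	// (the entries having been spliced from tp->t_busy onto the CIL
 *	// context's busy list):
 *	xfs_extent_busy_clear(mp, &busy_list, false);
 */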

/*
 * Search for a busy extent within the range of the extent we are about to
 * allocate.  xfs_extent_busy_search() takes and drops the busy extent tree
 * lock internally.  This function returns 0 for no overlapping busy extent,
 * -1 for an overlapping but not exact busy extent, and 1 for an exact match.
 * This is done so that a non-zero return indicates an overlap that will
 * require a synchronous transaction, while still distinguishing between a
 * partial and an exact match.
 */
int
xfs_extent_busy_search(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	xfs_agblock_t		bno,
	xfs_extlen_t		len)
{
	struct rb_node		*rbp;
	struct xfs_extent_busy	*busyp;
	int			match = 0;

	/* find closest start bno overlap */
	spin_lock(&pag->pagb_lock);
	rbp = pag->pagb_tree.rb_node;
	while (rbp) {
		busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node);
		if (bno < busyp->bno) {
			/* may overlap, but exact start block is lower */
			if (bno + len > busyp->bno)
				match = -1;
			rbp = rbp->rb_left;
		} else if (bno > busyp->bno) {
			/* may overlap, but exact start block is higher */
			if (bno < busyp->bno + busyp->length)
				match = -1;
			rbp = rbp->rb_right;
		} else {
			/* bno matches busyp, length determines exact match */
			match = (busyp->length == len) ? 1 : -1;
			break;
		}
	}
	spin_unlock(&pag->pagb_lock);
	return match;
}
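
/*
 * Illustrative sketch only (a hypothetical caller, not taken from this file):
 * how the tristate above might be consumed when an extent is freed again,
 * forcing the freeing transaction synchronous if any part of the range is
 * still busy:
 *
 *	switch (xfs_extent_busy_search(mp, pag, bno, len)) {
 *	case 1:		// busy extent matches exactly
 *	case -1:	// range partially overlaps a busy extent
 *		xfs_trans_set_sync(tp);
 *		break;
 *	case 0:		// no overlap, nothing special to do
 *		break;
 *	}
 */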

/*
 * The found free extent [fbno, fend] overlaps part or all of the given busy
 * extent.  If the overlap covers the beginning, the end, or all of the busy
 * extent, the overlapping portion can be made unbusy and used for the
 * allocation.  We can't split a busy extent because we can't modify a
 * transaction/CIL context busy list, but we can update an entry's block
 * number or length.
 *
 * Returns true if the extent can safely be reused, or false if the search
 * needs to be restarted.
 */
STATIC bool
xfs_extent_busy_update_extent(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	struct xfs_extent_busy	*busyp,
	xfs_agblock_t		fbno,
	xfs_extlen_t		flen,
	bool			userdata) __releases(&pag->pagb_lock)
					  __acquires(&pag->pagb_lock)
{
	xfs_agblock_t		fend = fbno + flen;
	xfs_agblock_t		bbno = busyp->bno;
	xfs_agblock_t		bend = bbno + busyp->length;

	/*
	 * This extent is currently being discarded.  Give the thread
	 * performing the discard a chance to mark the extent unbusy
	 * and retry.
	 */
	if (busyp->flags & XFS_EXTENT_BUSY_DISCARDED) {
		spin_unlock(&pag->pagb_lock);
		delay(1);
		spin_lock(&pag->pagb_lock);
		return false;
	}

	/*
	 * If there is a busy extent overlapping a user allocation, we have
	 * no choice but to force the log and retry the search.
	 *
	 * Fortunately this does not happen during normal operation, but
	 * only if the filesystem is very low on space and has to dip into
	 * the AGFL for normal allocations.
	 */
	if (userdata)
		goto out_force_log;

	if (bbno < fbno && bend > fend) {
		/*
		 * Case 1:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *        +---------+
		 *        fbno   fend
		 */

		/*
		 * We would have to split the busy extent to be able to track
		 * it correctly, which we cannot do because we would have to
		 * modify the list of busy extents attached to the transaction
		 * or CIL context, which is immutable.
		 *
		 * Force out the log to clear the busy extent and retry the
		 * search.
		 */
		goto out_force_log;
	} else if (bbno >= fbno && bend <= fend) {
		/*
		 * Case 2:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *    +-----------------+
		 *    fbno           fend
		 *
		 * Case 3:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *    +--------------------------+
		 *    fbno                    fend
		 *
		 * Case 4:
		 *        bbno           bend
		 *        +BBBBBBBBBBBBBBBBB+
		 *    +--------------------------+
		 *    fbno                    fend
		 *
		 * Case 5:
		 *        bbno           bend
		 *        +BBBBBBBBBBBBBBBBB+
		 *    +-----------------------------------+
		 *    fbno                             fend
		 *
		 */

		/*
		 * The busy extent is fully covered by the extent we are
		 * allocating, and can simply be removed from the rbtree.
		 * However we cannot remove it from the immutable list
		 * tracking busy extents in the transaction or CIL context,
		 * so set the length to zero to mark it invalid.
		 *
		 * We also need to restart the busy extent search from the
		 * tree root, because erasing the node can rearrange the
		 * tree topology.
		 */
		rb_erase(&busyp->rb_node, &pag->pagb_tree);
		busyp->length = 0;
		return false;
	} else if (fend < bend) {
		/*
		 * Case 6:
		 *              bbno           bend
		 *              +BBBBBBBBBBBBBBBBB+
		 *              +---------+
		 *              fbno   fend
		 *
		 * Case 7:
		 *              bbno           bend
		 *              +BBBBBBBBBBBBBBBBB+
		 *    +------------------+
		 *    fbno          fend
		 *
		 */
		busyp->bno = fend;
		busyp->length = bend - fend;
	} else if (bbno < fbno) {
		/*
		 * Case 8:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *        +-------------+
		 *        fbno       fend
		 *
		 * Case 9:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *        +----------------------+
		 *        fbno                fend
		 */
		busyp->length = fbno - busyp->bno;
	} else {
		ASSERT(0);
	}

	trace_xfs_extent_busy_reuse(mp, pag->pag_agno, fbno, flen);
	return true;

out_force_log:
	spin_unlock(&pag->pagb_lock);
	xfs_log_force(mp, XFS_LOG_SYNC);
	trace_xfs_extent_busy_force(mp, pag->pag_agno, fbno, flen);
	spin_lock(&pag->pagb_lock);
	return false;
}


/*
 * For a given extent [fbno, flen], make sure we can reuse it safely.
 */
void
xfs_extent_busy_reuse(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	xfs_agblock_t		fbno,
	xfs_extlen_t		flen,
	bool			userdata)
{
	struct rb_node		*rbp;

	ASSERT(flen > 0);
	spin_lock(&pag->pagb_lock);
restart:
	rbp = pag->pagb_tree.rb_node;
	while (rbp) {
		struct xfs_extent_busy *busyp =
			rb_entry(rbp, struct xfs_extent_busy, rb_node);
		xfs_agblock_t	bbno = busyp->bno;
		xfs_agblock_t	bend = bbno + busyp->length;

		if (fbno + flen <= bbno) {
			rbp = rbp->rb_left;
			continue;
		} else if (fbno >= bend) {
			rbp = rbp->rb_right;
			continue;
		}

		if (!xfs_extent_busy_update_extent(mp, pag, busyp, fbno, flen,
						   userdata))
			goto restart;
	}
	spin_unlock(&pag->pagb_lock);
}
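
/*
 * Illustrative sketch only (hypothetical block numbers, not from this file):
 * the interaction above in terms of a concrete overlap.  If blocks [100, 120)
 * are busy and a metadata allocation wants to reuse [110, 130), the busy
 * entry is trimmed in place and the reuse is allowed:
 *
 *	// metadata reuse of 20 blocks starting at agbno 110
 *	xfs_extent_busy_reuse(mp, pag, 110, 20, false);
 *	// busyp->bno == 100, busyp->length == 10 afterwards (case 9 above)
 *
 * A userdata allocation over the same range would instead force the log and
 * retry until the busy extent has been cleared.
 */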

/*
 * For a given extent [fbno, flen], search the busy extent list to find a
 * subset of the extent that is not busy.  If *len is smaller than
 * args->minlen no suitable extent could be found, and the higher level
 * code needs to force out the log and retry the allocation.
 *
 * Return the current busy generation for the AG if the extent is busy. This
 * value can be used to wait for at least one of the currently busy extents
 * to be cleared. Note that the busy list is not guaranteed to be empty after
 * a wakeup for this generation. The state of a specific extent must always
 * be confirmed with another call to xfs_extent_busy_trim() before it can be
 * used.
 */
bool
xfs_extent_busy_trim(
	struct xfs_alloc_arg	*args,
	xfs_agblock_t		*bno,
	xfs_extlen_t		*len,
	unsigned		*busy_gen)
{
	xfs_agblock_t		fbno;
	xfs_extlen_t		flen;
	struct rb_node		*rbp;
	bool			ret = false;

	ASSERT(*len > 0);

	spin_lock(&args->pag->pagb_lock);
	fbno = *bno;
	flen = *len;
	rbp = args->pag->pagb_tree.rb_node;
	while (rbp && flen >= args->minlen) {
		struct xfs_extent_busy *busyp =
			rb_entry(rbp, struct xfs_extent_busy, rb_node);
		xfs_agblock_t	fend = fbno + flen;
		xfs_agblock_t	bbno = busyp->bno;
		xfs_agblock_t	bend = bbno + busyp->length;

		if (fend <= bbno) {
			rbp = rbp->rb_left;
			continue;
		} else if (fbno >= bend) {
			rbp = rbp->rb_right;
			continue;
		}

		if (bbno <= fbno) {
			/* start overlap */

			/*
			 * Case 1:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *        +---------+
			 *        fbno   fend
			 *
			 * Case 2:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *    +-------------+
			 *    fbno       fend
			 *
			 * Case 3:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *        +-------------+
			 *        fbno       fend
			 *
			 * Case 4:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *    +-----------------+
			 *    fbno           fend
			 *
			 * No unbusy region in extent, return failure.
			 */
			if (fend <= bend)
				goto fail;

			/*
			 * Case 5:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *    +----------------------+
			 *    fbno                fend
			 *
			 * Case 6:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *        +--------------------------+
			 *        fbno                    fend
			 *
			 * Needs to be trimmed to:
			 *                       +-------+
			 *                       fbno fend
			 */
			fbno = bend;
		} else if (bend >= fend) {
			/* end overlap */

			/*
			 * Case 7:
			 *             bbno           bend
			 *             +BBBBBBBBBBBBBBBBB+
			 *    +------------------+
			 *    fbno          fend
			 *
			 * Case 8:
			 *             bbno           bend
			 *             +BBBBBBBBBBBBBBBBB+
			 *    +--------------------------+
			 *    fbno                    fend
			 *
			 * Needs to be trimmed to:
			 *    +-------+
			 *    fbno fend
			 */
			fend = bbno;
		} else {
			/* middle overlap */

			/*
			 * Case 9:
			 *             bbno           bend
			 *             +BBBBBBBBBBBBBBBBB+
			 *    +-----------------------------------+
			 *    fbno                             fend
			 *
			 * Can be trimmed to:
			 *    +-------+        OR         +-------+
			 *    fbno fend                   fbno fend
			 *
			 * Backward allocation leads to significant
			 * fragmentation of directories, which degrades
			 * directory performance, therefore we always want to
			 * choose the option that produces forward allocation
			 * patterns.
			 * Preferring the lower bno extent will make the next
			 * request use "fend" as the start of the next
			 * allocation; if the segment is no longer busy at
			 * that point, we'll get a contiguous allocation, but
			 * even if it is still busy, we will get a forward
			 * allocation.
			 * We try to avoid choosing the segment at "bend",
			 * because that can lead to the next allocation
			 * taking the segment at "fbno", which would be a
			 * backward allocation.  We only use the segment at
			 * "bend" if it is much larger than the current
			 * requested size, because in that case there's a
			 * good chance subsequent allocations will be
			 * contiguous.
			 */
			if (bbno - fbno >= args->maxlen) {
				/* left candidate fits perfectly */
				fend = bbno;
			} else if (fend - bend >= args->maxlen * 4) {
				/* right candidate has enough free space */
				fbno = bend;
			} else if (bbno - fbno >= args->minlen) {
				/* left candidate fits minimum requirement */
				fend = bbno;
			} else {
				goto fail;
			}
		}

		flen = fend - fbno;
	}
out:

	if (fbno != *bno || flen != *len) {
		trace_xfs_extent_busy_trim(args->mp, args->agno, *bno, *len,
					   fbno, flen);
		*bno = fbno;
		*len = flen;
		*busy_gen = args->pag->pagb_gen;
		ret = true;
	}
	spin_unlock(&args->pag->pagb_lock);
	return ret;
fail:
	/*
	 * Return a zero extent length as a failure indication.  All callers
	 * re-check if the trimmed extent satisfies the minlen requirement.
	 */
	flen = 0;
	goto out;
}
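
/*
 * Illustrative sketch only (a simplified, hypothetical caller; the real
 * callers live in the allocator): trimming a candidate free extent and, if
 * nothing usable is left, waiting on the busy generation recorded above:
 *
 *	xfs_agblock_t	fbno = bno;
 *	xfs_extlen_t	flen = len;
 *	unsigned	busy_gen = 0;
 *	bool		busy;
 *
 *	busy = xfs_extent_busy_trim(args, &fbno, &flen, &busy_gen);
 *	if (busy && flen < args->minlen) {
 *		// nothing usable right now; force the log and/or wait for
 *		// busy_gen to advance, then retry the search
 *		error = xfs_extent_busy_flush(args->tp, args->pag, busy_gen,
 *				alloc_flags);
 *	}
 */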

STATIC void
xfs_extent_busy_clear_one(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	struct xfs_extent_busy	*busyp)
{
	if (busyp->length) {
		trace_xfs_extent_busy_clear(mp, busyp->agno, busyp->bno,
					    busyp->length);
		rb_erase(&busyp->rb_node, &pag->pagb_tree);
	}

	list_del_init(&busyp->list);
	kmem_free(busyp);
}

static void
xfs_extent_busy_put_pag(
	struct xfs_perag	*pag,
	bool			wakeup)
		__releases(pag->pagb_lock)
{
	if (wakeup) {
		pag->pagb_gen++;
		wake_up_all(&pag->pagb_wait);
	}

	spin_unlock(&pag->pagb_lock);
	xfs_perag_put(pag);
}

/*
 * Remove all extents on the passed in list from the busy extents tree.
 * If do_discard is set, skip extents that need to be discarded, and mark
 * these as undergoing a discard operation instead.
 */
void
xfs_extent_busy_clear(
	struct xfs_mount	*mp,
	struct list_head	*list,
	bool			do_discard)
{
	struct xfs_extent_busy	*busyp, *n;
	struct xfs_perag	*pag = NULL;
	xfs_agnumber_t		agno = NULLAGNUMBER;
	bool			wakeup = false;

	list_for_each_entry_safe(busyp, n, list, list) {
		if (busyp->agno != agno) {
			if (pag)
				xfs_extent_busy_put_pag(pag, wakeup);
			agno = busyp->agno;
			pag = xfs_perag_get(mp, agno);
			spin_lock(&pag->pagb_lock);
			wakeup = false;
		}

		if (do_discard && busyp->length &&
		    !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) {
			busyp->flags = XFS_EXTENT_BUSY_DISCARDED;
		} else {
			xfs_extent_busy_clear_one(mp, pag, busyp);
			wakeup = true;
		}
	}

	if (pag)
		xfs_extent_busy_put_pag(pag, wakeup);
}
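
/*
 * Illustrative sketch only (the caller and the 'want_discard' flag below are
 * assumptions about the log completion path, not code from this file): once
 * the checkpoint holding the busy extents has committed, the list collected
 * via xfs_extent_busy_insert() is cleared in one go.  With do_discard false
 * the entries are removed from the tree and freed immediately; with
 * do_discard true, undiscarded entries are instead marked
 * XFS_EXTENT_BUSY_DISCARDED and cleared again later, once the discard has
 * completed:
 *
 *	xfs_extent_busy_clear(mp, &busy_list, want_discard);
 */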

/*
 * Flush out all busy extents for this AG.
 *
 * If the current transaction is holding busy extents, the caller may not want
 * to wait for committed busy extents to resolve. If we are being told just to
 * try a flush or progress has been made since we last skipped a busy extent,
 * return immediately to allow the caller to try again.
 *
 * If we are freeing extents, we might actually be holding the only free
 * extents in the transaction busy list and the log force won't resolve that
 * situation. In this case, we must return -EAGAIN to avoid a deadlock by
 * informing the caller it needs to commit the busy extents it holds before
 * retrying the extent free operation.
 */
int
xfs_extent_busy_flush(
	struct xfs_trans	*tp,
	struct xfs_perag	*pag,
	unsigned		busy_gen,
	uint32_t		alloc_flags)
{
	DEFINE_WAIT		(wait);
	int			error;

	error = xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
	if (error)
		return error;

	/* Avoid deadlocks on uncommitted busy extents. */
	if (!list_empty(&tp->t_busy)) {
		if (alloc_flags & XFS_ALLOC_FLAG_TRYFLUSH)
			return 0;

		if (busy_gen != READ_ONCE(pag->pagb_gen))
			return 0;

		if (alloc_flags & XFS_ALLOC_FLAG_FREEING)
			return -EAGAIN;
	}

	/* Wait for committed busy extents to resolve. */
	do {
		prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
		if (busy_gen != READ_ONCE(pag->pagb_gen))
			break;
		schedule();
	} while (1);

	finish_wait(&pag->pagb_wait, &wait);
	return 0;
}

void
xfs_extent_busy_wait_all(
	struct xfs_mount	*mp)
{
	struct xfs_perag	*pag;
	DEFINE_WAIT		(wait);
	xfs_agnumber_t		agno;

	for_each_perag(mp, agno, pag) {
		do {
			prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
			if (RB_EMPTY_ROOT(&pag->pagb_tree))
				break;
			schedule();
		} while (1);
		finish_wait(&pag->pagb_wait, &wait);
	}
}

/*
 * Callback for list_sort to sort busy extents by the AG they reside in.
 */
int
xfs_extent_busy_ag_cmp(
	void			*priv,
	const struct list_head	*l1,
	const struct list_head	*l2)
{
	struct xfs_extent_busy	*b1 =
		container_of(l1, struct xfs_extent_busy, list);
	struct xfs_extent_busy	*b2 =
		container_of(l2, struct xfs_extent_busy, list);
	s32 diff;

	diff = b1->agno - b2->agno;
	if (!diff)
		diff = b1->bno - b2->bno;
	return diff;
}
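
/*
 * Illustrative sketch only: the comparator above is meant to be passed to
 * list_sort(), e.g.
 *
 *	list_sort(NULL, &busy_list, xfs_extent_busy_ag_cmp);
 *
 * so that the busy extents end up grouped by AG, and ordered by block number
 * within each AG, before the list is walked (for example by a discard
 * submission loop that batches per-AG work).
 */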