/*
 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
 * Copyright (c) 2010 David Chinner.
 * Copyright (c) 2011 Christoph Hellwig.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_log.h"

/*
 * Add a busy extent to the transaction's busy list and to the per-AG busy
 * extent rbtree.  The new extent must not overlap any extent that is
 * already tracked as busy.
 */
void
xfs_extent_busy_insert(
	struct xfs_trans	*tp,
	xfs_agnumber_t		agno,
	xfs_agblock_t		bno,
	xfs_extlen_t		len,
	unsigned int		flags)
{
	struct xfs_extent_busy	*new;
	struct xfs_extent_busy	*busyp;
	struct xfs_perag	*pag;
	struct rb_node		**rbp;
	struct rb_node		*parent = NULL;

	new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_SLEEP);
	new->agno = agno;
	new->bno = bno;
	new->length = len;
	INIT_LIST_HEAD(&new->list);
	new->flags = flags;

	/* trace before insert to be able to see failed inserts */
	trace_xfs_extent_busy(tp->t_mountp, agno, bno, len);

	pag = xfs_perag_get(tp->t_mountp, new->agno);
	spin_lock(&pag->pagb_lock);
	rbp = &pag->pagb_tree.rb_node;
	while (*rbp) {
		parent = *rbp;
		busyp = rb_entry(parent, struct xfs_extent_busy, rb_node);

		if (new->bno < busyp->bno) {
			rbp = &(*rbp)->rb_left;
			ASSERT(new->bno + new->length <= busyp->bno);
		} else if (new->bno > busyp->bno) {
			rbp = &(*rbp)->rb_right;
			ASSERT(bno >= busyp->bno + busyp->length);
		} else {
			/* overlapping busy extents must never be inserted */
			ASSERT(0);
		}
	}

	rb_link_node(&new->rb_node, parent, rbp);
	rb_insert_color(&new->rb_node, &pag->pagb_tree);

	list_add(&new->list, &tp->t_busy);
	spin_unlock(&pag->pagb_lock);
	xfs_perag_put(pag);
}
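
/*
 * Illustrative lifecycle sketch (hypothetical caller; the real call sites
 * are elsewhere in XFS, not in this file): a transaction that frees an
 * extent marks it busy, and the records are cleared again once the freeing
 * transaction has been committed to the log, passing whatever list the
 * extents were collected on (tp->t_busy above; "busy_list" below is just a
 * placeholder name):
 *
 *	xfs_extent_busy_insert(tp, agno, bno, len, 0);
 *	...
 *	xfs_extent_busy_clear(mp, &busy_list, false);
 */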

/*
 * Search for a busy extent within the range of the extent we are about to
 * allocate.  The per-AG busy extent tree lock is taken and dropped
 * internally, so callers do not need to hold it.  This function returns 0
 * for no overlapping busy extent, -1 for an overlapping but not exact busy
 * extent, and 1 for an exact match.  This is done so that a non-zero return
 * indicates an overlap that will require a synchronous transaction, but it
 * can still be used to distinguish between a partial and an exact match.
 */
int
xfs_extent_busy_search(
	struct xfs_mount	*mp,
	xfs_agnumber_t		agno,
	xfs_agblock_t		bno,
	xfs_extlen_t		len)
{
	struct xfs_perag	*pag;
	struct rb_node		*rbp;
	struct xfs_extent_busy	*busyp;
	int			match = 0;

	pag = xfs_perag_get(mp, agno);
	spin_lock(&pag->pagb_lock);

	rbp = pag->pagb_tree.rb_node;

	/* find closest start bno overlap */
	while (rbp) {
		busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node);
		if (bno < busyp->bno) {
			/* may overlap, but exact start block is lower */
			if (bno + len > busyp->bno)
				match = -1;
			rbp = rbp->rb_left;
		} else if (bno > busyp->bno) {
			/* may overlap, but exact start block is higher */
			if (bno < busyp->bno + busyp->length)
				match = -1;
			rbp = rbp->rb_right;
		} else {
			/* bno matches busyp, length determines exact match */
			match = (busyp->length == len) ? 1 : -1;
			break;
		}
	}
	spin_unlock(&pag->pagb_lock);
	xfs_perag_put(pag);
	return match;
}
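
/*
 * Illustrative interpretation of the return value (hypothetical caller, not
 * a call site in this file):
 *
 *	int	busy = xfs_extent_busy_search(mp, agno, bno, len);
 *
 *	busy == 0:  no busy extent overlaps [bno, bno + len)
 *	busy == 1:  the range exactly matches a single busy extent
 *	busy == -1: the range overlaps busy extents, but is not an exact match
 *
 * Any non-zero value means the range cannot be reused without a synchronous
 * transaction.
 */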

/*
 * The found free extent [fbno, fend] overlaps part or all of the given busy
 * extent.  If the overlap covers the beginning, the end, or all of the busy
 * extent, the overlapping portion can be made unbusy and used for the
 * allocation.  We can't split a busy extent because we can't modify a
 * transaction/CIL context busy list, but we can update an entry's block
 * number or length.
 *
 * Returns true if the extent can safely be reused, or false if the search
 * needs to be restarted.
 */
STATIC bool
xfs_extent_busy_update_extent(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	struct xfs_extent_busy	*busyp,
	xfs_agblock_t		fbno,
	xfs_extlen_t		flen,
	bool			userdata) __releases(&pag->pagb_lock)
					  __acquires(&pag->pagb_lock)
{
	xfs_agblock_t		fend = fbno + flen;
	xfs_agblock_t		bbno = busyp->bno;
	xfs_agblock_t		bend = bbno + busyp->length;

	/*
	 * This extent is currently being discarded.  Give the thread
	 * performing the discard a chance to mark the extent unbusy
	 * and retry.
	 */
	if (busyp->flags & XFS_EXTENT_BUSY_DISCARDED) {
		spin_unlock(&pag->pagb_lock);
		delay(1);
		spin_lock(&pag->pagb_lock);
		return false;
	}

	/*
	 * If there is a busy extent overlapping a user allocation, we have
	 * no choice but to force the log and retry the search.
	 *
	 * Fortunately this does not happen during normal operation, but
	 * only if the filesystem is very low on space and has to dip into
	 * the AGFL for normal allocations.
	 */
	if (userdata)
		goto out_force_log;

	if (bbno < fbno && bend > fend) {
		/*
		 * Case 1:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *        +---------+
		 *        fbno   fend
		 */

		/*
		 * We would have to split the busy extent to be able to track
		 * it correctly, which we cannot do because we would have to
		 * modify the list of busy extents attached to the transaction
		 * or CIL context, which is immutable.
		 *
		 * Force out the log to clear the busy extent and retry the
		 * search.
		 */
		goto out_force_log;
	} else if (bbno >= fbno && bend <= fend) {
		/*
		 * Case 2:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *    +-----------------+
		 *    fbno           fend
		 *
		 * Case 3:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *    +--------------------------+
		 *    fbno                    fend
		 *
		 * Case 4:
		 *             bbno           bend
		 *             +BBBBBBBBBBBBBBBBB+
		 *    +--------------------------+
		 *    fbno                    fend
		 *
		 * Case 5:
		 *             bbno           bend
		 *             +BBBBBBBBBBBBBBBBB+
		 *    +-----------------------------------+
		 *    fbno                             fend
		 *
		 */

		/*
		 * The busy extent is fully covered by the extent we are
		 * allocating, and can simply be removed from the rbtree.
		 * However we cannot remove it from the immutable list
		 * tracking busy extents in the transaction or CIL context,
		 * so set the length to zero to mark it invalid.
		 *
		 * We also need to restart the busy extent search from the
		 * tree root, because erasing the node can rearrange the
		 * tree topology.
		 */
		rb_erase(&busyp->rb_node, &pag->pagb_tree);
		busyp->length = 0;
		return false;
	} else if (fend < bend) {
		/*
		 * Case 6:
		 *              bbno           bend
		 *              +BBBBBBBBBBBBBBBBB+
		 *             +---------+
		 *             fbno   fend
		 *
		 * Case 7:
		 *             bbno           bend
		 *             +BBBBBBBBBBBBBBBBB+
		 *    +------------------+
		 *    fbno            fend
		 *
		 */
		/* unbusy the overlap at the start, keep [fend, bend) busy */
		busyp->bno = fend;
		busyp->length = bend - fend;
	} else if (bbno < fbno) {
		/*
		 * Case 8:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *        +-------------+
		 *        fbno       fend
		 *
		 * Case 9:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *        +----------------------+
		 *        fbno                fend
		 */
		/* unbusy the overlap at the end, keep [bbno, fbno) busy */
		busyp->length = fbno - busyp->bno;
	} else {
		ASSERT(0);
	}

	trace_xfs_extent_busy_reuse(mp, pag->pag_agno, fbno, flen);
	return true;

out_force_log:
	spin_unlock(&pag->pagb_lock);
	xfs_log_force(mp, XFS_LOG_SYNC);
	trace_xfs_extent_busy_force(mp, pag->pag_agno, fbno, flen);
	spin_lock(&pag->pagb_lock);
	return false;
}
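
/*
 * Worked example for xfs_extent_busy_update_extent() (illustrative numbers):
 * a metadata allocation reusing agbno 100-109 while agbno 100-119 is busy
 * trims the busy record to agbno 110-119 (cases 6/7) and the reuse proceeds.
 * Reusing agbno 105-114 instead would require splitting the busy record
 * (case 1), so the log is forced and the search is restarted.
 */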

/*
 * For a given extent [fbno, flen], make sure we can reuse it safely.
 */
void
xfs_extent_busy_reuse(
	struct xfs_mount	*mp,
	xfs_agnumber_t		agno,
	xfs_agblock_t		fbno,
	xfs_extlen_t		flen,
	bool			userdata)
{
	struct xfs_perag	*pag;
	struct rb_node		*rbp;

	ASSERT(flen > 0);

	pag = xfs_perag_get(mp, agno);
	spin_lock(&pag->pagb_lock);
restart:
	rbp = pag->pagb_tree.rb_node;
	while (rbp) {
		struct xfs_extent_busy *busyp =
			rb_entry(rbp, struct xfs_extent_busy, rb_node);
		xfs_agblock_t	bbno = busyp->bno;
		xfs_agblock_t	bend = bbno + busyp->length;

		if (fbno + flen <= bbno) {
			rbp = rbp->rb_left;
			continue;
		} else if (fbno >= bend) {
			rbp = rbp->rb_right;
			continue;
		}

		if (!xfs_extent_busy_update_extent(mp, pag, busyp, fbno, flen,
						   userdata))
			goto restart;
	}
	spin_unlock(&pag->pagb_lock);
	xfs_perag_put(pag);
}

/*
 * For a given extent [fbno, flen], search the busy extent list to find a
 * subset of the extent that is not busy.  If *len is smaller than
 * args->minlen no suitable extent could be found, and the higher level
 * code needs to force out the log and retry the allocation.
 *
 * Return the current busy generation for the AG if the extent is busy. This
 * value can be used to wait for at least one of the currently busy extents
 * to be cleared. Note that the busy list is not guaranteed to be empty after
 * the gen is woken. The state of a specific extent must always be confirmed
 * with another call to xfs_extent_busy_trim() before it can be used.
 */
bool
xfs_extent_busy_trim(
	struct xfs_alloc_arg	*args,
	xfs_agblock_t		*bno,
	xfs_extlen_t		*len,
	unsigned		*busy_gen)
{
	xfs_agblock_t		fbno;
	xfs_extlen_t		flen;
	struct rb_node		*rbp;
	bool			ret = false;

	ASSERT(*len > 0);

	spin_lock(&args->pag->pagb_lock);
restart:
	fbno = *bno;
	flen = *len;
	rbp = args->pag->pagb_tree.rb_node;
	while (rbp && flen >= args->minlen) {
		struct xfs_extent_busy *busyp =
			rb_entry(rbp, struct xfs_extent_busy, rb_node);
		xfs_agblock_t	fend = fbno + flen;
		xfs_agblock_t	bbno = busyp->bno;
		xfs_agblock_t	bend = bbno + busyp->length;

		if (fend <= bbno) {
			rbp = rbp->rb_left;
			continue;
		} else if (fbno >= bend) {
			rbp = rbp->rb_right;
			continue;
		}

		/*
		 * If this is a metadata allocation, try to reuse the busy
		 * extent instead of trimming the allocation.
		 */
		if (!xfs_alloc_is_userdata(args->datatype) &&
		    !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) {
			if (!xfs_extent_busy_update_extent(args->mp, args->pag,
							   busyp, fbno, flen,
							   false))
				goto restart;
			continue;
		}

		if (bbno <= fbno) {
			/* start overlap */

			/*
			 * Case 1:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *        +---------+
			 *        fbno   fend
			 *
			 * Case 2:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *    +-------------+
			 *    fbno       fend
			 *
			 * Case 3:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *        +-------------+
			 *        fbno       fend
			 *
			 * Case 4:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *    +-----------------+
			 *    fbno           fend
			 *
			 * No unbusy region in extent, return failure.
			 */
			if (fend <= bend)
				goto fail;

			/*
			 * Case 5:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *        +----------------------+
			 *        fbno                fend
			 *
			 * Case 6:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *    +--------------------------+
			 *    fbno                    fend
			 *
			 * Needs to be trimmed to:
			 *                       +-------+
			 *                       fbno fend
			 */
			fbno = bend;
		} else if (bend >= fend) {
			/* end overlap */

			/*
			 * Case 7:
			 *             bbno           bend
			 *             +BBBBBBBBBBBBBBBBB+
			 *    +------------------+
			 *    fbno            fend
			 *
			 * Case 8:
			 *             bbno           bend
			 *             +BBBBBBBBBBBBBBBBB+
			 *    +--------------------------+
			 *    fbno                    fend
			 *
			 * Needs to be trimmed to:
			 *    +-------+
			 *    fbno fend
			 */
			fend = bbno;
		} else {
			/* middle overlap */

			/*
			 * Case 9:
			 *             bbno           bend
			 *             +BBBBBBBBBBBBBBBBB+
			 *    +-----------------------------------+
			 *    fbno                             fend
			 *
			 * Can be trimmed to:
			 *    +-------+        OR         +-------+
			 *    fbno fend                   fbno fend
			 *
			 * Backward allocation leads to significant
			 * fragmentation of directories, which degrades
			 * directory performance, therefore we always want to
			 * choose the option that produces forward allocation
			 * patterns.
			 * Preferring the lower bno extent will make the next
			 * request use "fend" as the start of the next
			 * allocation; if the segment is no longer busy at
			 * that point, we'll get a contiguous allocation, but
			 * even if it is still busy, we will get a forward
			 * allocation.
			 * We try to avoid choosing the segment at "bend",
			 * because that can lead to the next allocation
			 * taking the segment at "fbno", which would be a
			 * backward allocation.  We only use the segment at
			 * "bend" if it is much larger than the current
			 * requested size, because in that case there's a
			 * good chance subsequent allocations will be
			 * contiguous.
			 */
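			/*
			 * Worked example (illustrative numbers): with
			 * maxlen = 16, minlen = 4, free range fbno = 100,
			 * fend = 160 and busy range bbno = 130, bend = 140,
			 * the left candidate [100, 130) is 30 blocks, which
			 * is >= maxlen, so we trim to fend = 130.  Had the
			 * left candidate been smaller than maxlen, the right
			 * candidate [140, 160) would only be used if it were
			 * at least 4 * maxlen = 64 blocks long.
			 */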
			if (bbno - fbno >= args->maxlen) {
				/* left candidate fits perfectly */
				fend = bbno;
			} else if (fend - bend >= args->maxlen * 4) {
				/* right candidate has enough free space */
				fbno = bend;
			} else if (bbno - fbno >= args->minlen) {
				/* left candidate fits minimum requirement */
				fend = bbno;
			} else {
				goto fail;
			}
		}

		flen = fend - fbno;
	}
out:

	if (fbno != *bno || flen != *len) {
		trace_xfs_extent_busy_trim(args->mp, args->agno, *bno, *len,
					   fbno, flen);
		*bno = fbno;
		*len = flen;
		*busy_gen = args->pag->pagb_gen;
		ret = true;
	}
	spin_unlock(&args->pag->pagb_lock);
	return ret;
fail:
	/*
	 * Return a zero extent length as failure indication.  All callers
	 * re-check if the trimmed extent satisfies the minlen requirement.
	 */
	flen = 0;
	goto out;
}

/*
 * Remove one busy extent from the rbtree (if it is still linked there),
 * unlink it from the transaction/CIL busy list and free it.
 */
STATIC void
xfs_extent_busy_clear_one(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	struct xfs_extent_busy	*busyp)
{
	if (busyp->length) {
		trace_xfs_extent_busy_clear(mp, busyp->agno, busyp->bno,
					    busyp->length);
		rb_erase(&busyp->rb_node, &pag->pagb_tree);
	}

	list_del_init(&busyp->list);
	kmem_free(busyp);
}

/*
 * Bump the busy generation and wake up waiters if any extents were cleared,
 * then drop the lock and the perag reference.
 */
static void
xfs_extent_busy_put_pag(
	struct xfs_perag	*pag,
	bool			wakeup)
		__releases(pag->pagb_lock)
{
	if (wakeup) {
		pag->pagb_gen++;
		wake_up_all(&pag->pagb_wait);
	}

	spin_unlock(&pag->pagb_lock);
	xfs_perag_put(pag);
}

/*
 * Remove all extents on the passed in list from the busy extents tree.
 * If do_discard is set skip extents that need to be discarded, and mark
 * these as undergoing a discard operation instead.
 */
void
xfs_extent_busy_clear(
	struct xfs_mount	*mp,
	struct list_head	*list,
	bool			do_discard)
{
	struct xfs_extent_busy	*busyp, *n;
	struct xfs_perag	*pag = NULL;
	xfs_agnumber_t		agno = NULLAGNUMBER;
	bool			wakeup = false;

	list_for_each_entry_safe(busyp, n, list, list) {
		if (busyp->agno != agno) {
			if (pag)
				xfs_extent_busy_put_pag(pag, wakeup);
			agno = busyp->agno;
			pag = xfs_perag_get(mp, agno);
			spin_lock(&pag->pagb_lock);
			wakeup = false;
		}

		if (do_discard && busyp->length &&
		    !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) {
			busyp->flags = XFS_EXTENT_BUSY_DISCARDED;
		} else {
			xfs_extent_busy_clear_one(mp, pag, busyp);
			wakeup = true;
		}
	}

	if (pag)
		xfs_extent_busy_put_pag(pag, wakeup);
}
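
/*
 * Illustrative retry pattern (hypothetical caller; the real logic lives in
 * the allocator): when xfs_extent_busy_trim() cannot leave at least
 * args->minlen blocks, the caller can wait for busy extents to be cleared
 * and then retry:
 *
 *	unsigned	busy_gen;
 *
 *	if (xfs_extent_busy_trim(args, &bno, &len, &busy_gen) &&
 *	    len < args->minlen) {
 *		xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
 *		... retry the allocation ...
 *	}
 */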

/*
 * Flush out all busy extents for this AG.  Force the log, then wait until
 * the AG's busy extent generation changes, which indicates that busy
 * extents have been cleared since busy_gen was sampled.
 */
void
xfs_extent_busy_flush(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	unsigned		busy_gen)
{
	DEFINE_WAIT		(wait);
	int			error;
	int			log_flushed = 0;

	trace_xfs_log_force(mp, 0, _THIS_IP_);
	error = _xfs_log_force(mp, XFS_LOG_SYNC, &log_flushed);
	if (error)
		return;

	do {
		prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
		if (busy_gen != READ_ONCE(pag->pagb_gen))
			break;
		schedule();
	} while (1);

	finish_wait(&pag->pagb_wait, &wait);
}

/*
 * Wait until the busy extent tree of every AG in the filesystem is empty.
 */
void
xfs_extent_busy_wait_all(
	struct xfs_mount	*mp)
{
	DEFINE_WAIT		(wait);
	xfs_agnumber_t		agno;

	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
		struct xfs_perag *pag = xfs_perag_get(mp, agno);

		do {
			prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
			if (RB_EMPTY_ROOT(&pag->pagb_tree))
				break;
			schedule();
		} while (1);
		finish_wait(&pag->pagb_wait, &wait);

		xfs_perag_put(pag);
	}
}

/*
 * Callback for list_sort to sort busy extents by the AG they reside in.
 */
int
xfs_extent_busy_ag_cmp(
	void			*priv,
	struct list_head	*l1,
	struct list_head	*l2)
{
	struct xfs_extent_busy	*b1 =
		container_of(l1, struct xfs_extent_busy, list);
	struct xfs_extent_busy	*b2 =
		container_of(l2, struct xfs_extent_busy, list);
	s32			diff;

	diff = b1->agno - b2->agno;
	if (!diff)
		diff = b1->bno - b2->bno;
	return diff;
}