/*
 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
 * Copyright (c) 2010 David Chinner.
 * Copyright (c) 2011 Christoph Hellwig.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_log.h"

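/*
 * Mark the extent [bno, bno + len) in AG agno as busy: allocate a busy
 * extent entry, link it into the per-AG busy extent rbtree and add it to
 * the transaction's busy list so it can be cleared once the transaction
 * commits.  If the entry cannot be allocated, fall back to making the
 * transaction synchronous so the extent cannot be reused before the
 * transaction reaches the log.
 */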
void
xfs_extent_busy_insert(
	struct xfs_trans	*tp,
	xfs_agnumber_t		agno,
	xfs_agblock_t		bno,
	xfs_extlen_t		len,
	unsigned int		flags)
{
	struct xfs_extent_busy	*new;
	struct xfs_extent_busy	*busyp;
	struct xfs_perag	*pag;
	struct rb_node		**rbp;
	struct rb_node		*parent = NULL;

	new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_MAYFAIL);
	if (!new) {
		/*
		 * No Memory!  Since it is now not possible to track the free
		 * block, make this a synchronous transaction to ensure that
		 * the block is not reused before this transaction commits.
		 */
		trace_xfs_extent_busy_enomem(tp->t_mountp, agno, bno, len);
		xfs_trans_set_sync(tp);
		return;
	}

	new->agno = agno;
	new->bno = bno;
	new->length = len;
	INIT_LIST_HEAD(&new->list);
	new->flags = flags;

	/* trace before insert to be able to see failed inserts */
	trace_xfs_extent_busy(tp->t_mountp, agno, bno, len);

	pag = xfs_perag_get(tp->t_mountp, new->agno);
	spin_lock(&pag->pagb_lock);
	rbp = &pag->pagb_tree.rb_node;
	while (*rbp) {
		parent = *rbp;
		busyp = rb_entry(parent, struct xfs_extent_busy, rb_node);

		if (new->bno < busyp->bno) {
			rbp = &(*rbp)->rb_left;
			ASSERT(new->bno + new->length <= busyp->bno);
		} else if (new->bno > busyp->bno) {
			rbp = &(*rbp)->rb_right;
			ASSERT(bno >= busyp->bno + busyp->length);
		} else {
			ASSERT(0);
		}
	}

	rb_link_node(&new->rb_node, parent, rbp);
	rb_insert_color(&new->rb_node, &pag->pagb_tree);

	list_add(&new->list, &tp->t_busy);
	spin_unlock(&pag->pagb_lock);
	xfs_perag_put(pag);
}

/*
 * Search for a busy extent within the range of the extent we are about to
 * allocate.  You need to be holding the busy extent tree lock when calling
 * xfs_extent_busy_search().  This function returns 0 for no overlapping busy
 * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact
 * match.  This is done so that a non-zero return indicates an overlap that
 * will require a synchronous transaction, but it can still be
 * used to distinguish between a partial and an exact match.
 */
int
xfs_extent_busy_search(
	struct xfs_mount	*mp,
	xfs_agnumber_t		agno,
	xfs_agblock_t		bno,
	xfs_extlen_t		len)
{
	struct xfs_perag	*pag;
	struct rb_node		*rbp;
	struct xfs_extent_busy	*busyp;
	int			match = 0;

	pag = xfs_perag_get(mp, agno);
	spin_lock(&pag->pagb_lock);

	rbp = pag->pagb_tree.rb_node;

	/* find closest start bno overlap */
	while (rbp) {
		busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node);
		if (bno < busyp->bno) {
			/* may overlap, but exact start block is lower */
			if (bno + len > busyp->bno)
				match = -1;
			rbp = rbp->rb_left;
		} else if (bno > busyp->bno) {
			/* may overlap, but exact start block is higher */
			if (bno < busyp->bno + busyp->length)
				match = -1;
			rbp = rbp->rb_right;
		} else {
			/* bno matches busyp, length determines exact match */
			match = (busyp->length == len) ? 1 : -1;
			break;
		}
	}
	spin_unlock(&pag->pagb_lock);
	xfs_perag_put(pag);
	return match;
}

/*
 * The found free extent [fbno, fend] overlaps part or all of the given busy
 * extent.  If the overlap covers the beginning, the end, or all of the busy
 * extent, the overlapping portion can be made unbusy and used for the
 * allocation.  We can't split a busy extent because we can't modify a
 * transaction/CIL context busy list, but we can update an entry's block
 * number or length.
 *
 * Returns true if the extent can safely be reused, or false if the search
 * needs to be restarted.
 */
STATIC bool
xfs_extent_busy_update_extent(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	struct xfs_extent_busy	*busyp,
	xfs_agblock_t		fbno,
	xfs_extlen_t		flen,
	bool			userdata) __releases(&pag->pagb_lock)
					  __acquires(&pag->pagb_lock)
{
	xfs_agblock_t		fend = fbno + flen;
	xfs_agblock_t		bbno = busyp->bno;
	xfs_agblock_t		bend = bbno + busyp->length;

	/*
	 * This extent is currently being discarded.  Give the thread
	 * performing the discard a chance to mark the extent unbusy
	 * and retry.
	 */
	if (busyp->flags & XFS_EXTENT_BUSY_DISCARDED) {
		spin_unlock(&pag->pagb_lock);
		delay(1);
		spin_lock(&pag->pagb_lock);
		return false;
	}

	/*
	 * If there is a busy extent overlapping a user allocation, we have
	 * no choice but to force the log and retry the search.
	 *
	 * Fortunately this does not happen during normal operation, but
	 * only if the filesystem is very low on space and has to dip into
	 * the AGFL for normal allocations.
	 */
	if (userdata)
		goto out_force_log;

	if (bbno < fbno && bend > fend) {
		/*
		 * Case 1:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *        +---------+
		 *        fbno   fend
		 */

		/*
		 * We would have to split the busy extent to be able to track
		 * it correctly, which we cannot do because we would have to
		 * modify the list of busy extents attached to the transaction
		 * or CIL context, which is immutable.
		 *
		 * Force out the log to clear the busy extent and retry the
		 * search.
		 */
		goto out_force_log;
	} else if (bbno >= fbno && bend <= fend) {
		/*
		 * Case 2:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *    +-----------------+
		 *    fbno           fend
		 *
		 * Case 3:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *    +--------------------------+
		 *    fbno                    fend
		 *
		 * Case 4:
		 *             bbno           bend
		 *             +BBBBBBBBBBBBBBBBB+
		 *    +--------------------------+
		 *    fbno                    fend
		 *
		 * Case 5:
		 *             bbno           bend
		 *             +BBBBBBBBBBBBBBBBB+
		 *    +-----------------------------------+
		 *    fbno                             fend
		 *
		 */

		/*
		 * The busy extent is fully covered by the extent we are
		 * allocating, and can simply be removed from the rbtree.
		 * However we cannot remove it from the immutable list
		 * tracking busy extents in the transaction or CIL context,
		 * so set the length to zero to mark it invalid.
		 *
		 * We also need to restart the busy extent search from the
		 * tree root, because erasing the node can rearrange the
		 * tree topology.
		 */
		rb_erase(&busyp->rb_node, &pag->pagb_tree);
		busyp->length = 0;
		return false;
	} else if (fend < bend) {
		/*
		 * Case 6:
		 *              bbno           bend
		 *              +BBBBBBBBBBBBBBBBB+
		 *              +---------+
		 *              fbno   fend
		 *
		 * Case 7:
		 *              bbno           bend
		 *              +BBBBBBBBBBBBBBBBB+
		 *    +------------------+
		 *    fbno            fend
		 *
		 */
		busyp->bno = fend;
	} else if (bbno < fbno) {
		/*
		 * Case 8:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *        +-------------+
		 *        fbno       fend
		 *
		 * Case 9:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *        +----------------------+
		 *        fbno                fend
		 */
		busyp->length = fbno - busyp->bno;
	} else {
		ASSERT(0);
	}

	trace_xfs_extent_busy_reuse(mp, pag->pag_agno, fbno, flen);
	return true;

out_force_log:
	spin_unlock(&pag->pagb_lock);
	xfs_log_force(mp, XFS_LOG_SYNC);
	trace_xfs_extent_busy_force(mp, pag->pag_agno, fbno, flen);
	spin_lock(&pag->pagb_lock);
	return false;
}


/*
 * For a given extent [fbno, flen], make sure we can reuse it safely.
 */
void
xfs_extent_busy_reuse(
	struct xfs_mount	*mp,
	xfs_agnumber_t		agno,
	xfs_agblock_t		fbno,
	xfs_extlen_t		flen,
	bool			userdata)
{
	struct xfs_perag	*pag;
	struct rb_node		*rbp;

	ASSERT(flen > 0);

	pag = xfs_perag_get(mp, agno);
	spin_lock(&pag->pagb_lock);
restart:
	rbp = pag->pagb_tree.rb_node;
	while (rbp) {
		struct xfs_extent_busy *busyp =
			rb_entry(rbp, struct xfs_extent_busy, rb_node);
		xfs_agblock_t	bbno = busyp->bno;
		xfs_agblock_t	bend = bbno + busyp->length;

		if (fbno + flen <= bbno) {
			rbp = rbp->rb_left;
			continue;
		} else if (fbno >= bend) {
			rbp = rbp->rb_right;
			continue;
		}

		if (!xfs_extent_busy_update_extent(mp, pag, busyp, fbno, flen,
						   userdata))
			goto restart;
	}
	spin_unlock(&pag->pagb_lock);
	xfs_perag_put(pag);
}

/*
 * For a given extent [fbno, flen], search the busy extent list to find a
 * subset of the extent that is not busy.  If *rlen is smaller than
 * args->minlen no suitable extent could be found, and the higher level
 * code needs to force out the log and retry the allocation.
 */
void
xfs_extent_busy_trim(
	struct xfs_alloc_arg	*args,
	xfs_agblock_t		bno,
	xfs_extlen_t		len,
	xfs_agblock_t		*rbno,
	xfs_extlen_t		*rlen)
{
	xfs_agblock_t		fbno;
	xfs_extlen_t		flen;
	struct rb_node		*rbp;

	ASSERT(len > 0);

	spin_lock(&args->pag->pagb_lock);
restart:
	fbno = bno;
	flen = len;
	rbp = args->pag->pagb_tree.rb_node;
	while (rbp && flen >= args->minlen) {
		struct xfs_extent_busy *busyp =
			rb_entry(rbp, struct xfs_extent_busy, rb_node);
		xfs_agblock_t	fend = fbno + flen;
		xfs_agblock_t	bbno = busyp->bno;
		xfs_agblock_t	bend = bbno + busyp->length;

		if (fend <= bbno) {
			rbp = rbp->rb_left;
			continue;
		} else if (fbno >= bend) {
			rbp = rbp->rb_right;
			continue;
		}

		/*
		 * If this is a metadata allocation, try to reuse the busy
		 * extent instead of trimming the allocation.
		 */
		if (!args->userdata &&
		    !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) {
			if (!xfs_extent_busy_update_extent(args->mp, args->pag,
							   busyp, fbno, flen,
							   false))
				goto restart;
			continue;
		}

		if (bbno <= fbno) {
			/* start overlap */

			/*
			 * Case 1:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *        +---------+
			 *        fbno   fend
			 *
			 * Case 2:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *    +-------------+
			 *    fbno       fend
			 *
			 * Case 3:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *        +-------------+
			 *        fbno       fend
			 *
			 * Case 4:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *    +-----------------+
			 *    fbno           fend
			 *
			 * No unbusy region in extent, return failure.
			 */
			if (fend <= bend)
				goto fail;

			/*
			 * Case 5:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *        +----------------------+
			 *        fbno                fend
			 *
			 * Case 6:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *    +--------------------------+
			 *    fbno                    fend
			 *
			 * Needs to be trimmed to:
			 *                       +-------+
			 *                       fbno fend
			 */
			fbno = bend;
		} else if (bend >= fend) {
			/* end overlap */

			/*
			 * Case 7:
			 *             bbno           bend
			 *             +BBBBBBBBBBBBBBBBB+
			 *    +------------------+
			 *    fbno            fend
			 *
			 * Case 8:
			 *             bbno           bend
			 *             +BBBBBBBBBBBBBBBBB+
			 *    +--------------------------+
			 *    fbno                    fend
			 *
			 * Needs to be trimmed to:
			 *    +-------+
			 *    fbno fend
			 */
			fend = bbno;
		} else {
			/* middle overlap */

			/*
			 * Case 9:
			 *             bbno           bend
			 *             +BBBBBBBBBBBBBBBBB+
			 *    +-----------------------------------+
			 *    fbno                             fend
			 *
			 * Can be trimmed to:
			 *    +-------+        OR         +-------+
			 *    fbno fend                   fbno fend
			 *
			 * Backward allocation leads to significant
			 * fragmentation of directories, which degrades
			 * directory performance; therefore we always want to
			 * choose the option that produces forward allocation
			 * patterns.
			 * Preferring the lower bno extent will make the next
			 * request use "fend" as the start of the next
			 * allocation; if the segment is no longer busy at
			 * that point, we'll get a contiguous allocation, but
			 * even if it is still busy, we will get a forward
			 * allocation.
			 * We try to avoid choosing the segment at "bend",
			 * because that can lead to the next allocation
			 * taking the segment at "fbno", which would be a
			 * backward allocation.  We only use the segment at
			 * "fbno" if it is much larger than the current
			 * requested size, because in that case there's a
			 * good chance subsequent allocations will be
			 * contiguous.
			 */
			if (bbno - fbno >= args->maxlen) {
				/* left candidate fits perfect */
				fend = bbno;
			} else if (fend - bend >= args->maxlen * 4) {
				/* right candidate has enough free space */
				fbno = bend;
			} else if (bbno - fbno >= args->minlen) {
				/* left candidate fits minimum requirement */
				fend = bbno;
			} else {
				goto fail;
			}
		}

		flen = fend - fbno;
	}
	spin_unlock(&args->pag->pagb_lock);

	if (fbno != bno || flen != len) {
		trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len,
					   fbno, flen);
	}
	*rbno = fbno;
	*rlen = flen;
	return;
fail:
	/*
	 * Return a zero extent length as the failure indication.  All callers
	 * re-check if the trimmed extent satisfies the minlen requirement.
	 */
	spin_unlock(&args->pag->pagb_lock);
	trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, fbno, 0);
	*rbno = fbno;
	*rlen = 0;
}

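/*
 * Drop a single busy extent: remove it from the per-AG rbtree if it is
 * still tracked there (a zero length marks an entry that has already been
 * erased), unlink it from the busy list and free it.  The caller must hold
 * the per-AG pagb_lock.
 */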
STATIC void
xfs_extent_busy_clear_one(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	struct xfs_extent_busy	*busyp)
{
	if (busyp->length) {
		trace_xfs_extent_busy_clear(mp, busyp->agno, busyp->bno,
						busyp->length);
		rb_erase(&busyp->rb_node, &pag->pagb_tree);
	}

	list_del_init(&busyp->list);
	kmem_free(busyp);
}

/*
 * Remove all extents on the passed in list from the busy extents tree.
 * If do_discard is set, skip extents that need to be discarded, and mark
 * these as undergoing a discard operation instead.
 */
void
xfs_extent_busy_clear(
	struct xfs_mount	*mp,
	struct list_head	*list,
	bool			do_discard)
{
	struct xfs_extent_busy	*busyp, *n;
	struct xfs_perag	*pag = NULL;
	xfs_agnumber_t		agno = NULLAGNUMBER;

	list_for_each_entry_safe(busyp, n, list, list) {
		if (busyp->agno != agno) {
			if (pag) {
				spin_unlock(&pag->pagb_lock);
				xfs_perag_put(pag);
			}
			pag = xfs_perag_get(mp, busyp->agno);
			spin_lock(&pag->pagb_lock);
			agno = busyp->agno;
		}

		if (do_discard && busyp->length &&
		    !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD))
			busyp->flags = XFS_EXTENT_BUSY_DISCARDED;
		else
			xfs_extent_busy_clear_one(mp, pag, busyp);
	}

	if (pag) {
		spin_unlock(&pag->pagb_lock);
		xfs_perag_put(pag);
	}
}

/*
 * Callback for list_sort to sort busy extents by the AG they reside in.
 */
int
xfs_extent_busy_ag_cmp(
	void *priv,
	struct list_head *a,
	struct list_head *b)
{
	return container_of(a, struct xfs_extent_busy, list)->agno -
		container_of(b, struct xfs_extent_busy, list)->agno;
}