/*
 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
 * Copyright (c) 2010 David Chinner.
 * Copyright (c) 2011 Christoph Hellwig.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_log.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc.h"
#include "xfs_inode.h"
#include "xfs_extent_busy.h"
#include "xfs_trace.h"

void
xfs_extent_busy_insert(
	struct xfs_trans	*tp,
	xfs_agnumber_t		agno,
	xfs_agblock_t		bno,
	xfs_extlen_t		len,
	unsigned int		flags)
{
	struct xfs_extent_busy	*new;
	struct xfs_extent_busy	*busyp;
	struct xfs_perag	*pag;
	struct rb_node		**rbp;
	struct rb_node		*parent = NULL;

	new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_MAYFAIL);
	if (!new) {
		/*
		 * No memory!  Since it is now not possible to track the free
		 * block, make this a synchronous transaction to ensure that
		 * the block is not reused before this transaction commits.
		 */
		trace_xfs_extent_busy_enomem(tp->t_mountp, agno, bno, len);
		xfs_trans_set_sync(tp);
		return;
	}

	new->agno = agno;
	new->bno = bno;
	new->length = len;
	INIT_LIST_HEAD(&new->list);
	new->flags = flags;

	/* trace before insert to be able to see failed inserts */
	trace_xfs_extent_busy(tp->t_mountp, agno, bno, len);

	pag = xfs_perag_get(tp->t_mountp, new->agno);
	spin_lock(&pag->pagb_lock);
	rbp = &pag->pagb_tree.rb_node;
	while (*rbp) {
		parent = *rbp;
		busyp = rb_entry(parent, struct xfs_extent_busy, rb_node);

		if (new->bno < busyp->bno) {
			rbp = &(*rbp)->rb_left;
			ASSERT(new->bno + new->length <= busyp->bno);
		} else if (new->bno > busyp->bno) {
			rbp = &(*rbp)->rb_right;
			ASSERT(bno >= busyp->bno + busyp->length);
		} else {
			ASSERT(0);
		}
	}

	rb_link_node(&new->rb_node, parent, rbp);
	rb_insert_color(&new->rb_node, &pag->pagb_tree);

	list_add(&new->list, &tp->t_busy);
	spin_unlock(&pag->pagb_lock);
	xfs_perag_put(pag);
}
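/*
 * Illustrative usage sketch, not part of the original file: code that has
 * just freed an extent in transaction @tp would typically record it as
 * busy so the blocks cannot be reallocated and overwritten before the
 * free commits to the log, e.g.:
 *
 *	xfs_extent_busy_insert(tp, agno, bno, len, 0);
 *
 * Per the logic in xfs_extent_busy_clear() below, passing
 * XFS_EXTENT_BUSY_SKIP_DISCARD in @flags marks an extent that should be
 * dropped immediately at clear time rather than handed to the discard code.
 */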
/*
 * Search for a busy extent within the range of the extent we are about to
 * allocate.  You need to be holding the busy extent tree lock when calling
 * xfs_extent_busy_search(). This function returns 0 for no overlapping busy
 * extent, -1 for an overlapping but not exact busy extent, and 1 for an
 * exact match. This is done so that a non-zero return indicates an overlap
 * that will require a synchronous transaction, but it can still be used to
 * distinguish between a partial and an exact match.
 */
int
xfs_extent_busy_search(
	struct xfs_mount	*mp,
	xfs_agnumber_t		agno,
	xfs_agblock_t		bno,
	xfs_extlen_t		len)
{
	struct xfs_perag	*pag;
	struct rb_node		*rbp;
	struct xfs_extent_busy	*busyp;
	int			match = 0;

	pag = xfs_perag_get(mp, agno);
	spin_lock(&pag->pagb_lock);

	rbp = pag->pagb_tree.rb_node;

	/* find closest start bno overlap */
	while (rbp) {
		busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node);
		if (bno < busyp->bno) {
			/* may overlap, but exact start block is lower */
			if (bno + len > busyp->bno)
				match = -1;
			rbp = rbp->rb_left;
		} else if (bno > busyp->bno) {
			/* may overlap, but exact start block is higher */
			if (bno < busyp->bno + busyp->length)
				match = -1;
			rbp = rbp->rb_right;
		} else {
			/* bno matches busyp, length determines exact match */
			match = (busyp->length == len) ? 1 : -1;
			break;
		}
	}
	spin_unlock(&pag->pagb_lock);
	xfs_perag_put(pag);
	return match;
}

/*
 * The found free extent [fbno, fend] overlaps part or all of the given busy
 * extent.  If the overlap covers the beginning, the end, or all of the busy
 * extent, the overlapping portion can be made unbusy and used for the
 * allocation.  We can't split a busy extent because we can't modify a
 * transaction/CIL context busy list, but we can update an entry's block
 * number or length.
 *
 * Returns true if the extent can safely be reused, or false if the search
 * needs to be restarted.
 */
STATIC bool
xfs_extent_busy_update_extent(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	struct xfs_extent_busy	*busyp,
	xfs_agblock_t		fbno,
	xfs_extlen_t		flen,
	bool			userdata)
{
	xfs_agblock_t		fend = fbno + flen;
	xfs_agblock_t		bbno = busyp->bno;
	xfs_agblock_t		bend = bbno + busyp->length;

	/*
	 * This extent is currently being discarded.  Give the thread
	 * performing the discard a chance to mark the extent unbusy
	 * and retry.
	 */
	if (busyp->flags & XFS_EXTENT_BUSY_DISCARDED) {
		spin_unlock(&pag->pagb_lock);
		delay(1);
		spin_lock(&pag->pagb_lock);
		return false;
	}

	/*
	 * If there is a busy extent overlapping a user allocation, we have
	 * no choice but to force the log and retry the search.
	 *
	 * Fortunately this does not happen during normal operation, but
	 * only if the filesystem is very low on space and has to dip into
	 * the AGFL for normal allocations.
	 */
	if (userdata)
		goto out_force_log;

	if (bbno < fbno && bend > fend) {
		/*
		 * Case 1:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *        +---------+
		 *        fbno   fend
		 */

		/*
		 * We would have to split the busy extent to be able to track
		 * it correctly, which we cannot do because we would have to
		 * modify the list of busy extents attached to the
		 * transaction or CIL context, which is immutable.
		 *
		 * Force out the log to clear the busy extent and retry the
		 * search.
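		 *
		 * Once the forced log commit completes, the busy list of the
		 * transaction that freed this extent is processed by
		 * xfs_extent_busy_clear() and the extent drops out of the
		 * tree, so the restarted search can make progress.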
		 */
		goto out_force_log;
	} else if (bbno >= fbno && bend <= fend) {
		/*
		 * Case 2:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *    +-----------------+
		 *    fbno           fend
		 *
		 * Case 3:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *    +--------------------------+
		 *    fbno                    fend
		 *
		 * Case 4:
		 *             bbno           bend
		 *             +BBBBBBBBBBBBBBBBB+
		 *    +--------------------------+
		 *    fbno                    fend
		 *
		 * Case 5:
		 *             bbno           bend
		 *             +BBBBBBBBBBBBBBBBB+
		 *    +-----------------------------------+
		 *    fbno                             fend
		 *
		 */

		/*
		 * The busy extent is fully covered by the extent we are
		 * allocating, and can simply be removed from the rbtree.
		 * However we cannot remove it from the immutable list
		 * tracking busy extents in the transaction or CIL context,
		 * so set the length to zero to mark it invalid.
		 *
		 * We also need to restart the busy extent search from the
		 * tree root, because erasing the node can rearrange the
		 * tree topology.
		 */
		rb_erase(&busyp->rb_node, &pag->pagb_tree);
		busyp->length = 0;
		return false;
	} else if (fend < bend) {
		/*
		 * Case 6:
		 *              bbno           bend
		 *              +BBBBBBBBBBBBBBBBB+
		 *    +---------+
		 *    fbno   fend
		 *
		 * Case 7:
		 *              bbno           bend
		 *              +BBBBBBBBBBBBBBBBB+
		 *    +------------------+
		 *    fbno           fend
		 *
		 */
		busyp->bno = fend;
	} else if (bbno < fbno) {
		/*
		 * Case 8:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *        +-------------+
		 *        fbno       fend
		 *
		 * Case 9:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *        +----------------------+
		 *        fbno                fend
		 */
		busyp->length = fbno - busyp->bno;
	} else {
		ASSERT(0);
	}

	trace_xfs_extent_busy_reuse(mp, pag->pag_agno, fbno, flen);
	return true;

out_force_log:
	spin_unlock(&pag->pagb_lock);
	xfs_log_force(mp, XFS_LOG_SYNC);
	trace_xfs_extent_busy_force(mp, pag->pag_agno, fbno, flen);
	spin_lock(&pag->pagb_lock);
	return false;
}


/*
 * For a given extent [fbno, flen], make sure we can reuse it safely.
 */
void
xfs_extent_busy_reuse(
	struct xfs_mount	*mp,
	xfs_agnumber_t		agno,
	xfs_agblock_t		fbno,
	xfs_extlen_t		flen,
	bool			userdata)
{
	struct xfs_perag	*pag;
	struct rb_node		*rbp;

	ASSERT(flen > 0);

	pag = xfs_perag_get(mp, agno);
	spin_lock(&pag->pagb_lock);
restart:
	rbp = pag->pagb_tree.rb_node;
	while (rbp) {
		struct xfs_extent_busy *busyp =
			rb_entry(rbp, struct xfs_extent_busy, rb_node);
		xfs_agblock_t	bbno = busyp->bno;
		xfs_agblock_t	bend = bbno + busyp->length;

		if (fbno + flen <= bbno) {
			rbp = rbp->rb_left;
			continue;
		} else if (fbno >= bend) {
			rbp = rbp->rb_right;
			continue;
		}

		if (!xfs_extent_busy_update_extent(mp, pag, busyp, fbno, flen,
						   userdata))
			goto restart;
	}
	spin_unlock(&pag->pagb_lock);
	xfs_perag_put(pag);
}
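/*
 * Illustrative usage sketch, not part of the original file: allocation code
 * that has already chosen a specific extent to hand out (for example blocks
 * taken from the AGFL) can use the call below to make sure no uncommitted
 * free of those blocks is still pending:
 *
 *	xfs_extent_busy_reuse(mp, agno, fbno, flen, false);
 *
 * On return nothing in the busy tree overlaps [fbno, fbno + flen); getting
 * there may have required forcing the log.
 */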
/*
 * For a given extent [fbno, flen], search the busy extent list to find a
 * subset of the extent that is not busy.  If *rlen is smaller than
 * args->minlen no suitable extent could be found, and the higher level
 * code needs to force out the log and retry the allocation.
 */
void
xfs_extent_busy_trim(
	struct xfs_alloc_arg	*args,
	xfs_agblock_t		bno,
	xfs_extlen_t		len,
	xfs_agblock_t		*rbno,
	xfs_extlen_t		*rlen)
{
	xfs_agblock_t		fbno;
	xfs_extlen_t		flen;
	struct rb_node		*rbp;

	ASSERT(len > 0);

	spin_lock(&args->pag->pagb_lock);
restart:
	fbno = bno;
	flen = len;
	rbp = args->pag->pagb_tree.rb_node;
	while (rbp && flen >= args->minlen) {
		struct xfs_extent_busy *busyp =
			rb_entry(rbp, struct xfs_extent_busy, rb_node);
		xfs_agblock_t	fend = fbno + flen;
		xfs_agblock_t	bbno = busyp->bno;
		xfs_agblock_t	bend = bbno + busyp->length;

		if (fend <= bbno) {
			rbp = rbp->rb_left;
			continue;
		} else if (fbno >= bend) {
			rbp = rbp->rb_right;
			continue;
		}

		/*
		 * If this is a metadata allocation, try to reuse the busy
		 * extent instead of trimming the allocation.
		 */
		if (!args->userdata &&
		    !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) {
			if (!xfs_extent_busy_update_extent(args->mp, args->pag,
							   busyp, fbno, flen,
							   false))
				goto restart;
			continue;
		}

		if (bbno <= fbno) {
			/* start overlap */

			/*
			 * Case 1:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *        +---------+
			 *        fbno   fend
			 *
			 * Case 2:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *    +-------------+
			 *    fbno       fend
			 *
			 * Case 3:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *        +-------------+
			 *        fbno       fend
			 *
			 * Case 4:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *    +-----------------+
			 *    fbno           fend
			 *
			 * No unbusy region in extent, return failure.
			 */
			if (fend <= bend)
				goto fail;

			/*
			 * Case 5:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *        +----------------------+
			 *        fbno                fend
			 *
			 * Case 6:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *    +--------------------------+
			 *    fbno                    fend
			 *
			 * Needs to be trimmed to:
			 *                       +-------+
			 *                       fbno fend
			 */
			fbno = bend;
		} else if (bend >= fend) {
			/* end overlap */

			/*
			 * Case 7:
			 *             bbno           bend
			 *             +BBBBBBBBBBBBBBBBB+
			 *    +------------------+
			 *    fbno            fend
			 *
			 * Case 8:
			 *             bbno           bend
			 *             +BBBBBBBBBBBBBBBBB+
			 *    +--------------------------+
			 *    fbno                    fend
			 *
			 * Needs to be trimmed to:
			 *    +-------+
			 *    fbno fend
			 */
			fend = bbno;
		} else {
			/* middle overlap */

			/*
			 * Case 9:
			 *             bbno           bend
			 *             +BBBBBBBBBBBBBBBBB+
			 *    +-----------------------------------+
			 *    fbno                             fend
			 *
			 * Can be trimmed to:
			 *    +-------+        OR         +-------+
			 *    fbno fend                   fbno fend
			 *
			 * Backward allocation leads to significant
			 * fragmentation of directories, which degrades
			 * directory performance, therefore we always want to
			 * choose the option that produces forward allocation
			 * patterns.
			 * Preferring the lower bno extent will make the next
			 * request use "fend" as the start of the next
			 * allocation; if the segment is no longer busy at
			 * that point, we'll get a contiguous allocation, but
			 * even if it is still busy, we will get a forward
			 * allocation.
			 * We try to avoid choosing the segment at "bend",
			 * because that can lead to the next allocation
			 * taking the segment at "fbno", which would be a
			 * backward allocation.  We only use the segment at
			 * "bend" if it is much larger than the current
			 * requested size, because in that case there's a
			 * good chance subsequent allocations will be
			 * contiguous.
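			 *
			 * As a purely illustrative example (numbers invented
			 * here): with fbno = 10, fend = 110, bbno = 40,
			 * bend = 60 and args->maxlen = 30, the lower segment
			 * is bbno - fbno = 30 >= maxlen blocks long, so fend
			 * is trimmed to 40 and the allocation stays in front
			 * of the busy range.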
			 */
			if (bbno - fbno >= args->maxlen) {
				/* left candidate fits perfectly */
				fend = bbno;
			} else if (fend - bend >= args->maxlen * 4) {
				/* right candidate has enough free space */
				fbno = bend;
			} else if (bbno - fbno >= args->minlen) {
				/* left candidate fits minimum requirement */
				fend = bbno;
			} else {
				goto fail;
			}
		}

		flen = fend - fbno;
	}
	spin_unlock(&args->pag->pagb_lock);

	if (fbno != bno || flen != len) {
		trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len,
					   fbno, flen);
	}
	*rbno = fbno;
	*rlen = flen;
	return;
fail:
	/*
	 * Return a zero extent length as a failure indication.  All callers
	 * re-check if the trimmed extent satisfies the minlen requirement.
	 */
	spin_unlock(&args->pag->pagb_lock);
	trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, fbno, 0);
	*rbno = fbno;
	*rlen = 0;
}

STATIC void
xfs_extent_busy_clear_one(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	struct xfs_extent_busy	*busyp)
{
	if (busyp->length) {
		trace_xfs_extent_busy_clear(mp, busyp->agno, busyp->bno,
					    busyp->length);
		rb_erase(&busyp->rb_node, &pag->pagb_tree);
	}

	list_del_init(&busyp->list);
	kmem_free(busyp);
}

/*
 * Remove all extents on the passed in list from the busy extents tree.
 * If do_discard is set, skip extents that need to be discarded, and mark
 * these as undergoing a discard operation instead.
 */
void
xfs_extent_busy_clear(
	struct xfs_mount	*mp,
	struct list_head	*list,
	bool			do_discard)
{
	struct xfs_extent_busy	*busyp, *n;
	struct xfs_perag	*pag = NULL;
	xfs_agnumber_t		agno = NULLAGNUMBER;

	list_for_each_entry_safe(busyp, n, list, list) {
		if (busyp->agno != agno) {
			if (pag) {
				spin_unlock(&pag->pagb_lock);
				xfs_perag_put(pag);
			}
			pag = xfs_perag_get(mp, busyp->agno);
			spin_lock(&pag->pagb_lock);
			agno = busyp->agno;
		}

		if (do_discard && busyp->length &&
		    !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD))
			busyp->flags = XFS_EXTENT_BUSY_DISCARDED;
		else
			xfs_extent_busy_clear_one(mp, pag, busyp);
	}

	if (pag) {
		spin_unlock(&pag->pagb_lock);
		xfs_perag_put(pag);
	}
}

/*
 * Callback for list_sort to sort busy extents by the AG they reside in.
 */
int
xfs_extent_busy_ag_cmp(
	void			*priv,
	struct list_head	*a,
	struct list_head	*b)
{
	return container_of(a, struct xfs_extent_busy, list)->agno -
	       container_of(b, struct xfs_extent_busy, list)->agno;
}
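/*
 * Illustrative usage sketch, not part of the original file: a caller that
 * wants to walk a busy list in AG order, so that xfs_extent_busy_clear()
 * above only has to switch per-AG structures once per AG, can sort the
 * list first (the comparison callback matches the list_sort() prototype):
 *
 *	list_sort(NULL, busy_list, xfs_extent_busy_ag_cmp);
 *	xfs_extent_busy_clear(mp, busy_list, false);
 *
 * Here "busy_list" stands for any struct list_head of xfs_extent_busy
 * entries, such as a transaction's t_busy list.
 */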