/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "xfs.h"
#include <linux/stddef.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/bio.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <linux/workqueue.h>
#include <linux/percpu.h>
#include <linux/blkdev.h>
#include <linux/hash.h>
#include <linux/kthread.h>
#include <linux/migrate.h>
#include <linux/backing-dev.h>
#include <linux/freezer.h>
#include <linux/sched/mm.h>

#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_errortag.h"
#include "xfs_error.h"

static kmem_zone_t *xfs_buf_zone;

#ifdef XFS_BUF_LOCK_TRACKING
# define XB_SET_OWNER(bp)	((bp)->b_last_holder = current->pid)
# define XB_CLEAR_OWNER(bp)	((bp)->b_last_holder = -1)
# define XB_GET_OWNER(bp)	((bp)->b_last_holder)
#else
# define XB_SET_OWNER(bp)	do { } while (0)
# define XB_CLEAR_OWNER(bp)	do { } while (0)
# define XB_GET_OWNER(bp)	do { } while (0)
#endif

#define xb_to_gfp(flags) \
	((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN)


static inline int
xfs_buf_is_vmapped(
	struct xfs_buf	*bp)
{
	/*
	 * Return true if the buffer is vmapped.
	 *
	 * b_addr is null if the buffer is not mapped, but the code is clever
	 * enough to know it doesn't have to map a single page, so the check has
	 * to be both for b_addr and bp->b_page_count > 1.
	 */
	return bp->b_addr && bp->b_page_count > 1;
}

static inline int
xfs_buf_vmap_len(
	struct xfs_buf	*bp)
{
	return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
}

/*
 * Bump the I/O in flight count on the buftarg if we haven't yet done so for
 * this buffer. The count is incremented once per buffer (per hold cycle)
 * because the corresponding decrement is deferred to buffer release. Buffers
 * can undergo I/O multiple times in a hold-release cycle and per buffer I/O
 * tracking adds unnecessary overhead. This is used for synchronization purposes
 * with unmount (see xfs_wait_buftarg()), so all we really need is a count of
 * in-flight buffers.
 *
 * Buffers that are never released (e.g., superblock, iclog buffers) must set
 * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count
 * never reaches zero and unmount hangs indefinitely.
 */
static inline void
xfs_buf_ioacct_inc(
	struct xfs_buf	*bp)
{
	if (bp->b_flags & XBF_NO_IOACCT)
		return;

	ASSERT(bp->b_flags & XBF_ASYNC);
	spin_lock(&bp->b_lock);
	if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) {
		bp->b_state |= XFS_BSTATE_IN_FLIGHT;
		percpu_counter_inc(&bp->b_target->bt_io_count);
	}
	spin_unlock(&bp->b_lock);
}

/*
 * Clear the in-flight state on a buffer about to be released to the LRU or
 * freed and unaccount from the buftarg.
 */
static inline void
__xfs_buf_ioacct_dec(
	struct xfs_buf	*bp)
{
	lockdep_assert_held(&bp->b_lock);

	if (bp->b_state & XFS_BSTATE_IN_FLIGHT) {
		bp->b_state &= ~XFS_BSTATE_IN_FLIGHT;
		percpu_counter_dec(&bp->b_target->bt_io_count);
	}
}

static inline void
xfs_buf_ioacct_dec(
	struct xfs_buf	*bp)
{
	spin_lock(&bp->b_lock);
	__xfs_buf_ioacct_dec(bp);
	spin_unlock(&bp->b_lock);
}

/*
 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
 * b_lru_ref count so that the buffer is freed immediately when the buffer
 * reference count falls to zero. If the buffer is already on the LRU, we need
 * to remove the reference that LRU holds on the buffer.
 *
 * This prevents build-up of stale buffers on the LRU.
 */
void
xfs_buf_stale(
	struct xfs_buf	*bp)
{
	ASSERT(xfs_buf_islocked(bp));

	bp->b_flags |= XBF_STALE;

	/*
	 * Clear the delwri status so that a delwri queue walker will not
	 * flush this buffer to disk now that it is stale. The delwri queue has
	 * a reference to the buffer, so this is safe to do.
	 */
	bp->b_flags &= ~_XBF_DELWRI_Q;

	/*
	 * Once the buffer is marked stale and unlocked, a subsequent lookup
	 * could reset b_flags. There is no guarantee that the buffer is
	 * unaccounted (released to LRU) before that occurs. Drop in-flight
	 * status now to preserve accounting consistency.
	 */
	spin_lock(&bp->b_lock);
	__xfs_buf_ioacct_dec(bp);

	atomic_set(&bp->b_lru_ref, 0);
	if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
	    (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
		atomic_dec(&bp->b_hold);

	ASSERT(atomic_read(&bp->b_hold) >= 1);
	spin_unlock(&bp->b_lock);
}

static int
xfs_buf_get_maps(
	struct xfs_buf		*bp,
	int			map_count)
{
	ASSERT(bp->b_maps == NULL);
	bp->b_map_count = map_count;

	if (map_count == 1) {
		bp->b_maps = &bp->__b_map;
		return 0;
	}

	bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
				 KM_NOFS);
	if (!bp->b_maps)
		return -ENOMEM;
	return 0;
}

/*
 * Frees b_maps if it was allocated.
 */
static void
xfs_buf_free_maps(
	struct xfs_buf	*bp)
{
	if (bp->b_maps != &bp->__b_map) {
		kmem_free(bp->b_maps);
		bp->b_maps = NULL;
	}
}

struct xfs_buf *
_xfs_buf_alloc(
	struct xfs_buftarg	*target,
	struct xfs_buf_map	*map,
	int			nmaps,
	xfs_buf_flags_t		flags)
{
	struct xfs_buf		*bp;
	int			error;
	int			i;

	bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS);
	if (unlikely(!bp))
		return NULL;

	/*
	 * We don't want certain flags to appear in b_flags unless they are
	 * specifically set by later operations on the buffer.
	 */
	flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);

	atomic_set(&bp->b_hold, 1);
	atomic_set(&bp->b_lru_ref, 1);
	init_completion(&bp->b_iowait);
	INIT_LIST_HEAD(&bp->b_lru);
	INIT_LIST_HEAD(&bp->b_list);
	sema_init(&bp->b_sema, 0); /* held, no waiters */
	spin_lock_init(&bp->b_lock);
	XB_SET_OWNER(bp);
	bp->b_target = target;
	bp->b_flags = flags;

	/*
	 * Set length and io_length to the same value initially.
	 * I/O routines should use io_length, which will be the same in
	 * most cases but may be reset (e.g. XFS recovery).
	 */
	error = xfs_buf_get_maps(bp, nmaps);
	if (error) {
		kmem_zone_free(xfs_buf_zone, bp);
		return NULL;
	}

	bp->b_bn = map[0].bm_bn;
	bp->b_length = 0;
	for (i = 0; i < nmaps; i++) {
		bp->b_maps[i].bm_bn = map[i].bm_bn;
		bp->b_maps[i].bm_len = map[i].bm_len;
		bp->b_length += map[i].bm_len;
	}
	bp->b_io_length = bp->b_length;

	atomic_set(&bp->b_pin_count, 0);
	init_waitqueue_head(&bp->b_waiters);

	XFS_STATS_INC(target->bt_mount, xb_create);
	trace_xfs_buf_init(bp, _RET_IP_);

	return bp;
}

/*
 * Allocate a page array capable of holding a specified number
 * of pages, and point the page buf at it.
 */
STATIC int
_xfs_buf_get_pages(
	xfs_buf_t		*bp,
	int			page_count)
{
	/* Make sure that we have a page list */
	if (bp->b_pages == NULL) {
		bp->b_page_count = page_count;
		if (page_count <= XB_PAGES) {
			bp->b_pages = bp->b_page_array;
		} else {
			bp->b_pages = kmem_alloc(sizeof(struct page *) *
						 page_count, KM_NOFS);
			if (bp->b_pages == NULL)
				return -ENOMEM;
		}
		memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
	}
	return 0;
}

/*
 * Frees b_pages if it was allocated.
 */
STATIC void
_xfs_buf_free_pages(
	xfs_buf_t	*bp)
{
	if (bp->b_pages != bp->b_page_array) {
		kmem_free(bp->b_pages);
		bp->b_pages = NULL;
	}
}

/*
 * Releases the specified buffer.
 *
 * The modification state of any associated pages is left unchanged.
 * The buffer must not be on any hash - use xfs_buf_rele instead for
 * hashed and refcounted buffers.
 */
void
xfs_buf_free(
	xfs_buf_t		*bp)
{
	trace_xfs_buf_free(bp, _RET_IP_);

	ASSERT(list_empty(&bp->b_lru));

	if (bp->b_flags & _XBF_PAGES) {
		uint		i;

		if (xfs_buf_is_vmapped(bp))
			vm_unmap_ram(bp->b_addr - bp->b_offset,
					bp->b_page_count);

		for (i = 0; i < bp->b_page_count; i++) {
			struct page	*page = bp->b_pages[i];

			__free_page(page);
		}
	} else if (bp->b_flags & _XBF_KMEM)
		kmem_free(bp->b_addr);
	_xfs_buf_free_pages(bp);
	xfs_buf_free_maps(bp);
	kmem_zone_free(xfs_buf_zone, bp);
}

/*
 * Allocates all the pages for the buffer in question and builds its page list.
 */
STATIC int
xfs_buf_allocate_memory(
	xfs_buf_t		*bp,
	uint			flags)
{
	size_t			size;
	size_t			nbytes, offset;
	gfp_t			gfp_mask = xb_to_gfp(flags);
	unsigned short		page_count, i;
	xfs_off_t		start, end;
	int			error;

	/*
	 * for buffers that are contained within a single page, just allocate
	 * the memory from the heap - there's no need for the complexity of
	 * page arrays to keep allocation down to order 0.
	 */
	size = BBTOB(bp->b_length);
	if (size < PAGE_SIZE) {
		bp->b_addr = kmem_alloc(size, KM_NOFS);
		if (!bp->b_addr) {
			/* low memory - use alloc_page loop instead */
			goto use_alloc_page;
		}

		if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
		    ((unsigned long)bp->b_addr & PAGE_MASK)) {
			/* b_addr spans two pages - use alloc_page instead */
			kmem_free(bp->b_addr);
			bp->b_addr = NULL;
			goto use_alloc_page;
		}
		bp->b_offset = offset_in_page(bp->b_addr);
		bp->b_pages = bp->b_page_array;
		bp->b_pages[0] = virt_to_page(bp->b_addr);
		bp->b_page_count = 1;
		bp->b_flags |= _XBF_KMEM;
		return 0;
	}

use_alloc_page:
	start = BBTOB(bp->b_maps[0].bm_bn) >> PAGE_SHIFT;
	end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1)
								>> PAGE_SHIFT;
	page_count = end - start;
	error = _xfs_buf_get_pages(bp, page_count);
	if (unlikely(error))
		return error;

	offset = bp->b_offset;
	bp->b_flags |= _XBF_PAGES;

	for (i = 0; i < bp->b_page_count; i++) {
		struct page	*page;
		uint		retries = 0;
retry:
		page = alloc_page(gfp_mask);
		if (unlikely(page == NULL)) {
			if (flags & XBF_READ_AHEAD) {
				bp->b_page_count = i;
				error = -ENOMEM;
				goto out_free_pages;
			}

			/*
			 * This could deadlock.
			 *
			 * But until all the XFS lowlevel code is revamped to
			 * handle buffer allocation failures we can't do much.
			 */
			if (!(++retries % 100))
				xfs_err(NULL,
		"%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
					current->comm, current->pid,
					__func__, gfp_mask);

			XFS_STATS_INC(bp->b_target->bt_mount, xb_page_retries);
			congestion_wait(BLK_RW_ASYNC, HZ/50);
			goto retry;
		}

		XFS_STATS_INC(bp->b_target->bt_mount, xb_page_found);

		nbytes = min_t(size_t, size, PAGE_SIZE - offset);
		size -= nbytes;
		bp->b_pages[i] = page;
		offset = 0;
	}
	return 0;

out_free_pages:
	for (i = 0; i < bp->b_page_count; i++)
		__free_page(bp->b_pages[i]);
	bp->b_flags &= ~_XBF_PAGES;
	return error;
}

/*
 * Map buffer into kernel address-space if necessary.
 */
STATIC int
_xfs_buf_map_pages(
	xfs_buf_t		*bp,
	uint			flags)
{
	ASSERT(bp->b_flags & _XBF_PAGES);
	if (bp->b_page_count == 1) {
		/* A single page buffer is always mappable */
		bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
	} else if (flags & XBF_UNMAPPED) {
		bp->b_addr = NULL;
	} else {
		int retried = 0;
		unsigned nofs_flag;

		/*
		 * vm_map_ram() will allocate auxiliary structures (e.g.
		 * pagetables) with GFP_KERNEL, yet we are likely to be under
		 * GFP_NOFS context here. Hence we need to tell memory reclaim
		 * that we are in such a context via PF_MEMALLOC_NOFS to prevent
		 * memory reclaim re-entering the filesystem here and
		 * potentially deadlocking.
		 */
		nofs_flag = memalloc_nofs_save();
		do {
			bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
						-1, PAGE_KERNEL);
			if (bp->b_addr)
				break;
			vm_unmap_aliases();
		} while (retried++ <= 1);
		memalloc_nofs_restore(nofs_flag);

		if (!bp->b_addr)
			return -ENOMEM;
		bp->b_addr += bp->b_offset;
	}

	return 0;
}

/*
 * Finding and Reading Buffers
 */
static int
_xfs_buf_obj_cmp(
	struct rhashtable_compare_arg	*arg,
	const void			*obj)
{
	const struct xfs_buf_map	*map = arg->key;
	const struct xfs_buf		*bp = obj;

	/*
	 * The key hashing in the lookup path depends on the key being the
	 * first element of the compare_arg, make sure to assert this.
	 */
	BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0);

	if (bp->b_bn != map->bm_bn)
		return 1;

	if (unlikely(bp->b_length != map->bm_len)) {
		/*
		 * found a block number match. If the range doesn't
		 * match, the only way this is allowed is if the buffer
		 * in the cache is stale and the transaction that made
		 * it stale has not yet committed. i.e. we are
		 * reallocating a busy extent. Skip this buffer and
		 * continue searching for an exact match.
		 */
		ASSERT(bp->b_flags & XBF_STALE);
		return 1;
	}
	return 0;
}

static const struct rhashtable_params xfs_buf_hash_params = {
	.min_size		= 32,	/* empty AGs have minimal footprint */
	.nelem_hint		= 16,
	.key_len		= sizeof(xfs_daddr_t),
	.key_offset		= offsetof(struct xfs_buf, b_bn),
	.head_offset		= offsetof(struct xfs_buf, b_rhash_head),
	.automatic_shrinking	= true,
	.obj_cmpfn		= _xfs_buf_obj_cmp,
};

int
xfs_buf_hash_init(
	struct xfs_perag	*pag)
{
	spin_lock_init(&pag->pag_buf_lock);
	return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params);
}

void
xfs_buf_hash_destroy(
	struct xfs_perag	*pag)
{
	rhashtable_destroy(&pag->pag_buf_hash);
}

/*
 * Look up, and create if absent, a lockable buffer for
 * a given range of an inode. The buffer is returned
 * locked. No I/O is implied by this call.
 */
xfs_buf_t *
_xfs_buf_find(
	struct xfs_buftarg	*btp,
	struct xfs_buf_map	*map,
	int			nmaps,
	xfs_buf_flags_t		flags,
	xfs_buf_t		*new_bp)
{
	struct xfs_perag	*pag;
	xfs_buf_t		*bp;
	struct xfs_buf_map	cmap = { .bm_bn = map[0].bm_bn };
	xfs_daddr_t		eofs;
	int			i;

	for (i = 0; i < nmaps; i++)
		cmap.bm_len += map[i].bm_len;

	/* Check for IOs smaller than the sector size / not sector aligned */
	ASSERT(!(BBTOB(cmap.bm_len) < btp->bt_meta_sectorsize));
	ASSERT(!(BBTOB(cmap.bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));

	/*
	 * Corrupted block numbers can get through to here, unfortunately, so we
	 * have to check that the buffer falls within the filesystem bounds.
	 */
	eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
	if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) {
		/*
		 * XXX (dgc): we should really be returning -EFSCORRUPTED here,
		 * but none of the higher level infrastructure supports
		 * returning a specific error on buffer lookup failures.
		 */
		xfs_alert(btp->bt_mount,
			  "%s: Block out of range: block 0x%llx, EOFS 0x%llx ",
			  __func__, cmap.bm_bn, eofs);
		WARN_ON(1);
		return NULL;
	}

	pag = xfs_perag_get(btp->bt_mount,
			    xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));

	spin_lock(&pag->pag_buf_lock);
	bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap,
				    xfs_buf_hash_params);
	if (bp) {
		atomic_inc(&bp->b_hold);
		goto found;
	}

	/* No match found */
	if (new_bp) {
		/* the buffer keeps the perag reference until it is freed */
		new_bp->b_pag = pag;
		rhashtable_insert_fast(&pag->pag_buf_hash,
				       &new_bp->b_rhash_head,
				       xfs_buf_hash_params);
		spin_unlock(&pag->pag_buf_lock);
	} else {
		XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
		spin_unlock(&pag->pag_buf_lock);
		xfs_perag_put(pag);
	}
	return new_bp;

found:
	spin_unlock(&pag->pag_buf_lock);
	xfs_perag_put(pag);

	if (!xfs_buf_trylock(bp)) {
		if (flags & XBF_TRYLOCK) {
			xfs_buf_rele(bp);
			XFS_STATS_INC(btp->bt_mount, xb_busy_locked);
			return NULL;
		}
		xfs_buf_lock(bp);
		XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited);
	}

	/*
	 * if the buffer is stale, clear all the external state associated with
	 * it. We need to keep flags such as how we allocated the buffer memory
	 * intact here.
	 */
	if (bp->b_flags & XBF_STALE) {
		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
		ASSERT(bp->b_iodone == NULL);
		bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
		bp->b_ops = NULL;
	}

	trace_xfs_buf_find(bp, flags, _RET_IP_);
	XFS_STATS_INC(btp->bt_mount, xb_get_locked);
	return bp;
}

/*
 * Assembles a buffer covering the specified range. The code is optimised for
 * cache hits, as metadata intensive workloads will see 3 orders of magnitude
 * more hits than misses.
 */
struct xfs_buf *
xfs_buf_get_map(
	struct xfs_buftarg	*target,
	struct xfs_buf_map	*map,
	int			nmaps,
	xfs_buf_flags_t		flags)
{
	struct xfs_buf		*bp;
	struct xfs_buf		*new_bp;
	int			error = 0;

	bp = _xfs_buf_find(target, map, nmaps, flags, NULL);
	if (likely(bp))
		goto found;

	new_bp = _xfs_buf_alloc(target, map, nmaps, flags);
	if (unlikely(!new_bp))
		return NULL;

	error = xfs_buf_allocate_memory(new_bp, flags);
	if (error) {
		xfs_buf_free(new_bp);
		return NULL;
	}

	bp = _xfs_buf_find(target, map, nmaps, flags, new_bp);
	if (!bp) {
		xfs_buf_free(new_bp);
		return NULL;
	}

	if (bp != new_bp)
		xfs_buf_free(new_bp);

found:
	if (!bp->b_addr) {
		error = _xfs_buf_map_pages(bp, flags);
		if (unlikely(error)) {
			xfs_warn(target->bt_mount,
				 "%s: failed to map pages\n", __func__);
			xfs_buf_relse(bp);
			return NULL;
		}
	}

	/*
	 * Clear b_error if this is a lookup from a caller that doesn't expect
	 * valid data to be found in the buffer.
	 */
	if (!(flags & XBF_READ))
		xfs_buf_ioerror(bp, 0);

	XFS_STATS_INC(target->bt_mount, xb_get);
	trace_xfs_buf_get(bp, flags, _RET_IP_);
	return bp;
}

STATIC int
_xfs_buf_read(
	xfs_buf_t		*bp,
	xfs_buf_flags_t		flags)
{
	ASSERT(!(flags & XBF_WRITE));
	ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);

	bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
	bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);

	if (flags & XBF_ASYNC) {
		xfs_buf_submit(bp);
		return 0;
	}
	return xfs_buf_submit_wait(bp);
}

xfs_buf_t *
xfs_buf_read_map(
	struct xfs_buftarg	*target,
	struct xfs_buf_map	*map,
	int			nmaps,
	xfs_buf_flags_t		flags,
	const struct xfs_buf_ops *ops)
{
	struct xfs_buf		*bp;

	flags |= XBF_READ;

	bp = xfs_buf_get_map(target, map, nmaps, flags);
	if (bp) {
		trace_xfs_buf_read(bp, flags, _RET_IP_);

		if (!(bp->b_flags & XBF_DONE)) {
			XFS_STATS_INC(target->bt_mount, xb_get_read);
			bp->b_ops = ops;
			_xfs_buf_read(bp, flags);
		} else if (flags & XBF_ASYNC) {
			/*
			 * Read ahead call which is already satisfied,
			 * drop the buffer
			 */
			xfs_buf_relse(bp);
			return NULL;
		} else {
			/* We do not want read in the flags */
			bp->b_flags &= ~XBF_READ;
		}
	}

	return bp;
}

/*
 * If we are not low on memory then do the readahead in a deadlock
 * safe manner.
 */
void
xfs_buf_readahead_map(
	struct xfs_buftarg	*target,
	struct xfs_buf_map	*map,
	int			nmaps,
	const struct xfs_buf_ops *ops)
{
	if (bdi_read_congested(target->bt_bdev->bd_bdi))
		return;

	xfs_buf_read_map(target, map, nmaps,
			 XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops);
}

/*
 * Read an uncached buffer from disk. Allocates and returns a locked
 * buffer containing the disk contents or nothing.
 */
int
xfs_buf_read_uncached(
	struct xfs_buftarg	*target,
	xfs_daddr_t		daddr,
	size_t			numblks,
	int			flags,
	struct xfs_buf		**bpp,
	const struct xfs_buf_ops *ops)
{
	struct xfs_buf		*bp;

	*bpp = NULL;

	bp = xfs_buf_get_uncached(target, numblks, flags);
	if (!bp)
		return -ENOMEM;

	/* set up the buffer for a read IO */
	ASSERT(bp->b_map_count == 1);
	bp->b_bn = XFS_BUF_DADDR_NULL;	/* always null for uncached buffers */
	bp->b_maps[0].bm_bn = daddr;
	bp->b_flags |= XBF_READ;
	bp->b_ops = ops;

	xfs_buf_submit_wait(bp);
	if (bp->b_error) {
		int	error = bp->b_error;
		xfs_buf_relse(bp);
		return error;
	}

	*bpp = bp;
	return 0;
}

/*
 * Return a buffer allocated as an empty buffer and associated to external
 * memory via xfs_buf_associate_memory() back to its empty state.
 */
void
xfs_buf_set_empty(
	struct xfs_buf		*bp,
	size_t			numblks)
{
	if (bp->b_pages)
		_xfs_buf_free_pages(bp);

	bp->b_pages = NULL;
	bp->b_page_count = 0;
	bp->b_addr = NULL;
	bp->b_length = numblks;
	bp->b_io_length = numblks;

	ASSERT(bp->b_map_count == 1);
	bp->b_bn = XFS_BUF_DADDR_NULL;
	bp->b_maps[0].bm_bn = XFS_BUF_DADDR_NULL;
	bp->b_maps[0].bm_len = bp->b_length;
}

static inline struct page *
mem_to_page(
	void			*addr)
{
	if (!is_vmalloc_addr(addr)) {
		return virt_to_page(addr);
	} else {
		return vmalloc_to_page(addr);
	}
}

int
xfs_buf_associate_memory(
	xfs_buf_t		*bp,
	void			*mem,
	size_t			len)
{
	int			rval;
	int			i = 0;
	unsigned long		pageaddr;
	unsigned long		offset;
	size_t			buflen;
	int			page_count;

	pageaddr = (unsigned long)mem & PAGE_MASK;
	offset = (unsigned long)mem - pageaddr;
	buflen = PAGE_ALIGN(len + offset);
	page_count = buflen >> PAGE_SHIFT;

	/* Free any previous set of page pointers */
	if (bp->b_pages)
		_xfs_buf_free_pages(bp);

	bp->b_pages = NULL;
	bp->b_addr = mem;

	rval = _xfs_buf_get_pages(bp, page_count);
	if (rval)
		return rval;

	bp->b_offset = offset;

	for (i = 0; i < bp->b_page_count; i++) {
		bp->b_pages[i] = mem_to_page((void *)pageaddr);
		pageaddr += PAGE_SIZE;
	}

	bp->b_io_length = BTOBB(len);
	bp->b_length = BTOBB(buflen);

	return 0;
}

xfs_buf_t *
xfs_buf_get_uncached(
	struct xfs_buftarg	*target,
	size_t			numblks,
	int			flags)
{
	unsigned long		page_count;
	int			error, i;
	struct xfs_buf		*bp;
	DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);

	/* flags might contain irrelevant bits, pass only what we care about */
	bp = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT);
	if (unlikely(bp == NULL))
		goto fail;

	page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT;
	error = _xfs_buf_get_pages(bp, page_count);
	if (error)
		goto fail_free_buf;

	for (i = 0; i < page_count; i++) {
		bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
		if (!bp->b_pages[i])
			goto fail_free_mem;
	}
	bp->b_flags |= _XBF_PAGES;

	error = _xfs_buf_map_pages(bp, 0);
	if (unlikely(error)) {
		xfs_warn(target->bt_mount,
			 "%s: failed to map pages", __func__);
		goto fail_free_mem;
	}

	trace_xfs_buf_get_uncached(bp, _RET_IP_);
	return bp;

 fail_free_mem:
	while (--i >= 0)
		__free_page(bp->b_pages[i]);
	_xfs_buf_free_pages(bp);
 fail_free_buf:
	xfs_buf_free_maps(bp);
	kmem_zone_free(xfs_buf_zone, bp);
 fail:
	return NULL;
}

/*
 * Increment reference count on buffer, to hold the buffer concurrently
 * with another thread which may release (free) the buffer asynchronously.
 * Must hold the buffer already to call this function.
 */
void
xfs_buf_hold(
	xfs_buf_t		*bp)
{
	trace_xfs_buf_hold(bp, _RET_IP_);
	atomic_inc(&bp->b_hold);
}

/*
 * Release a hold on the specified buffer. If the hold count is 1, the buffer is
 * placed on LRU or freed (depending on b_lru_ref).
 */
void
xfs_buf_rele(
	xfs_buf_t		*bp)
{
	struct xfs_perag	*pag = bp->b_pag;
	bool			release;
	bool			freebuf = false;

	trace_xfs_buf_rele(bp, _RET_IP_);

	if (!pag) {
		ASSERT(list_empty(&bp->b_lru));
		if (atomic_dec_and_test(&bp->b_hold)) {
			xfs_buf_ioacct_dec(bp);
			xfs_buf_free(bp);
		}
		return;
	}

	ASSERT(atomic_read(&bp->b_hold) > 0);

	release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
	spin_lock(&bp->b_lock);
	if (!release) {
		/*
		 * Drop the in-flight state if the buffer is already on the LRU
		 * and it holds the only reference. This is racy because we
		 * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT
		 * ensures the decrement occurs only once per-buf.
		 */
		if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
			__xfs_buf_ioacct_dec(bp);
		goto out_unlock;
	}

	/* the last reference has been dropped ... */
	__xfs_buf_ioacct_dec(bp);
	if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
		/*
		 * If the buffer is added to the LRU take a new reference to the
		 * buffer for the LRU and clear the (now stale) dispose list
		 * state flag
		 */
		if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
			bp->b_state &= ~XFS_BSTATE_DISPOSE;
			atomic_inc(&bp->b_hold);
		}
		spin_unlock(&pag->pag_buf_lock);
	} else {
		/*
		 * most of the time buffers will already be removed from the
		 * LRU, so optimise that case by checking for the
		 * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
		 * was on was the disposal list
		 */
		if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
			list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
		} else {
			ASSERT(list_empty(&bp->b_lru));
		}

		ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
		rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head,
				       xfs_buf_hash_params);
		spin_unlock(&pag->pag_buf_lock);
		xfs_perag_put(pag);
		freebuf = true;
	}

out_unlock:
	spin_unlock(&bp->b_lock);

	if (freebuf)
		xfs_buf_free(bp);
}


/*
 * Lock a buffer object, if it is not already locked.
 *
 * If we come across a stale, pinned, locked buffer, we know that we are
 * being asked to lock a buffer that has been reallocated. Because it is
 * pinned, we know that the log has not been pushed to disk and hence it
 * will still be locked. Rather than continuing to have trylock attempts
 * fail until someone else pushes the log, push it ourselves before
 * returning. This means that the xfsaild will not get stuck trying
 * to push on stale inode buffers.
 */
int
xfs_buf_trylock(
	struct xfs_buf		*bp)
{
	int			locked;

	locked = down_trylock(&bp->b_sema) == 0;
	if (locked) {
		XB_SET_OWNER(bp);
		trace_xfs_buf_trylock(bp, _RET_IP_);
	} else {
		trace_xfs_buf_trylock_fail(bp, _RET_IP_);
	}
	return locked;
}

/*
 * Lock a buffer object.
 *
 * If we come across a stale, pinned, locked buffer, we know that we
 * are being asked to lock a buffer that has been reallocated. Because
 * it is pinned, we know that the log has not been pushed to disk and
 * hence it will still be locked. Rather than sleeping until someone
 * else pushes the log, push it ourselves before trying to get the lock.
 */
void
xfs_buf_lock(
	struct xfs_buf		*bp)
{
	trace_xfs_buf_lock(bp, _RET_IP_);

	if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
		xfs_log_force(bp->b_target->bt_mount, 0);
	down(&bp->b_sema);
	XB_SET_OWNER(bp);

	trace_xfs_buf_lock_done(bp, _RET_IP_);
}

void
xfs_buf_unlock(
	struct xfs_buf		*bp)
{
	ASSERT(xfs_buf_islocked(bp));

	XB_CLEAR_OWNER(bp);
	up(&bp->b_sema);

	trace_xfs_buf_unlock(bp, _RET_IP_);
}

STATIC void
xfs_buf_wait_unpin(
	xfs_buf_t		*bp)
{
	DECLARE_WAITQUEUE	(wait, current);

	if (atomic_read(&bp->b_pin_count) == 0)
		return;

	add_wait_queue(&bp->b_waiters, &wait);
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (atomic_read(&bp->b_pin_count) == 0)
			break;
		io_schedule();
	}
	remove_wait_queue(&bp->b_waiters, &wait);
	set_current_state(TASK_RUNNING);
}

/*
 * Buffer Utility Routines
 */

void
xfs_buf_ioend(
	struct xfs_buf	*bp)
{
	bool		read = bp->b_flags & XBF_READ;

	trace_xfs_buf_iodone(bp, _RET_IP_);

	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);

	/*
	 * Pull in IO completion errors now. We are guaranteed to be running
	 * single threaded, so we don't need the lock to read b_io_error.
	 */
	if (!bp->b_error && bp->b_io_error)
		xfs_buf_ioerror(bp, bp->b_io_error);

	/* Only validate buffers that were read without errors */
	if (read && !bp->b_error && bp->b_ops) {
		ASSERT(!bp->b_iodone);
		bp->b_ops->verify_read(bp);
	}

	if (!bp->b_error)
		bp->b_flags |= XBF_DONE;

	if (bp->b_iodone)
		(*(bp->b_iodone))(bp);
	else if (bp->b_flags & XBF_ASYNC)
		xfs_buf_relse(bp);
	else
		complete(&bp->b_iowait);
}

static void
xfs_buf_ioend_work(
	struct work_struct	*work)
{
	struct xfs_buf		*bp =
		container_of(work, xfs_buf_t, b_ioend_work);

	xfs_buf_ioend(bp);
}

static void
xfs_buf_ioend_async(
	struct xfs_buf	*bp)
{
	INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
	queue_work(bp->b_ioend_wq, &bp->b_ioend_work);
}

void
xfs_buf_ioerror(
	xfs_buf_t		*bp,
	int			error)
{
	ASSERT(error <= 0 && error >= -1000);
	bp->b_error = error;
	trace_xfs_buf_ioerror(bp, error, _RET_IP_);
}

void
xfs_buf_ioerror_alert(
	struct xfs_buf		*bp,
	const char		*func)
{
	xfs_alert(bp->b_target->bt_mount,
"metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d",
		  (uint64_t)XFS_BUF_ADDR(bp), func, -bp->b_error, bp->b_length);
}

int
xfs_bwrite(
	struct xfs_buf		*bp)
{
	int			error;

	ASSERT(xfs_buf_islocked(bp));

	bp->b_flags |= XBF_WRITE;
	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q |
			 XBF_WRITE_FAIL | XBF_DONE);

	error = xfs_buf_submit_wait(bp);
	if (error) {
		xfs_force_shutdown(bp->b_target->bt_mount,
				   SHUTDOWN_META_IO_ERROR);
	}
	return error;
}

static void
xfs_buf_bio_end_io(
	struct bio		*bio)
{
	struct xfs_buf		*bp = (struct xfs_buf *)bio->bi_private;

	/*
	 * don't overwrite existing errors - otherwise we can lose errors on
	 * buffers that require multiple bios to complete.
	 */
	if (bio->bi_status) {
		int error = blk_status_to_errno(bio->bi_status);

		cmpxchg(&bp->b_io_error, 0, error);
	}

	if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));

	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
		xfs_buf_ioend_async(bp);
	bio_put(bio);
}

static void
xfs_buf_ioapply_map(
	struct xfs_buf	*bp,
	int		map,
	int		*buf_offset,
	int		*count,
	int		op,
	int		op_flags)
{
	int		page_index;
	int		total_nr_pages = bp->b_page_count;
	int		nr_pages;
	struct bio	*bio;
	sector_t	sector = bp->b_maps[map].bm_bn;
	int		size;
	int		offset;

	/* skip the pages in the buffer before the start offset */
	page_index = 0;
	offset = *buf_offset;
	while (offset >= PAGE_SIZE) {
		page_index++;
		offset -= PAGE_SIZE;
	}

	/*
	 * Limit the IO size to the length of the current vector, and update the
	 * remaining IO count for the next time around.
	 */
	size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count);
	*count -= size;
	*buf_offset += size;

next_chunk:
	atomic_inc(&bp->b_io_remaining);
	nr_pages = min(total_nr_pages, BIO_MAX_PAGES);

	bio = bio_alloc(GFP_NOIO, nr_pages);
	bio_set_dev(bio, bp->b_target->bt_bdev);
	bio->bi_iter.bi_sector = sector;
	bio->bi_end_io = xfs_buf_bio_end_io;
	bio->bi_private = bp;
	bio_set_op_attrs(bio, op, op_flags);

	for (; size && nr_pages; nr_pages--, page_index++) {
		int	rbytes, nbytes = PAGE_SIZE - offset;

		if (nbytes > size)
			nbytes = size;

		rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes,
				      offset);
		if (rbytes < nbytes)
			break;

		offset = 0;
		sector += BTOBB(nbytes);
		size -= nbytes;
		total_nr_pages--;
	}

	if (likely(bio->bi_iter.bi_size)) {
		if (xfs_buf_is_vmapped(bp)) {
			flush_kernel_vmap_range(bp->b_addr,
						xfs_buf_vmap_len(bp));
		}
		submit_bio(bio);
		if (size)
			goto next_chunk;
	} else {
		/*
		 * This is guaranteed not to be the last io reference count
		 * because the caller (xfs_buf_submit) holds a count itself.
		 */
		atomic_dec(&bp->b_io_remaining);
		xfs_buf_ioerror(bp, -EIO);
		bio_put(bio);
	}

}

STATIC void
_xfs_buf_ioapply(
	struct xfs_buf	*bp)
{
	struct blk_plug	plug;
	int		op;
	int		op_flags = 0;
	int		offset;
	int		size;
	int		i;

	/*
	 * Make sure we capture only current IO errors rather than stale errors
	 * left over from previous use of the buffer (e.g. failed readahead).
	 */
	bp->b_error = 0;

	/*
	 * Initialize the I/O completion workqueue if we haven't yet or the
	 * submitter has not opted to specify a custom one.
	 */
	if (!bp->b_ioend_wq)
		bp->b_ioend_wq = bp->b_target->bt_mount->m_buf_workqueue;

	if (bp->b_flags & XBF_WRITE) {
		op = REQ_OP_WRITE;
		if (bp->b_flags & XBF_SYNCIO)
			op_flags = REQ_SYNC;
		if (bp->b_flags & XBF_FUA)
			op_flags |= REQ_FUA;
		if (bp->b_flags & XBF_FLUSH)
			op_flags |= REQ_PREFLUSH;

		/*
		 * Run the write verifier callback function if it exists. If
		 * this function fails it will mark the buffer with an error and
		 * the IO should not be dispatched.
		 */
		if (bp->b_ops) {
			bp->b_ops->verify_write(bp);
			if (bp->b_error) {
				xfs_force_shutdown(bp->b_target->bt_mount,
						   SHUTDOWN_CORRUPT_INCORE);
				return;
			}
		} else if (bp->b_bn != XFS_BUF_DADDR_NULL) {
			struct xfs_mount *mp = bp->b_target->bt_mount;

			/*
			 * non-crc filesystems don't attach verifiers during
			 * log recovery, so don't warn for such filesystems.
			 */
			if (xfs_sb_version_hascrc(&mp->m_sb)) {
				xfs_warn(mp,
					"%s: no ops on block 0x%llx/0x%x",
					__func__, bp->b_bn, bp->b_length);
				xfs_hex_dump(bp->b_addr, 64);
				dump_stack();
			}
		}
	} else if (bp->b_flags & XBF_READ_AHEAD) {
		op = REQ_OP_READ;
		op_flags = REQ_RAHEAD;
	} else {
		op = REQ_OP_READ;
	}

	/* we only use the buffer cache for meta-data */
	op_flags |= REQ_META;

	/*
	 * Walk all the vectors issuing IO on them. Set up the initial offset
	 * into the buffer and the desired IO size before we start -
	 * xfs_buf_ioapply_map() will modify them appropriately for each
	 * subsequent call.
	 */
	offset = bp->b_offset;
	size = BBTOB(bp->b_io_length);
	blk_start_plug(&plug);
	for (i = 0; i < bp->b_map_count; i++) {
		xfs_buf_ioapply_map(bp, i, &offset, &size, op, op_flags);
		if (bp->b_error)
			break;
		if (size <= 0)
			break;	/* all done */
	}
	blk_finish_plug(&plug);
}

/*
 * Asynchronous IO submission path. This transfers the buffer lock ownership and
 * the current reference to the IO. It is not safe to reference the buffer after
 * a call to this function unless the caller holds an additional reference
 * itself.
 */
void
xfs_buf_submit(
	struct xfs_buf	*bp)
{
	trace_xfs_buf_submit(bp, _RET_IP_);

	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
	ASSERT(bp->b_flags & XBF_ASYNC);

	/* on shutdown we stale and complete the buffer immediately */
	if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
		xfs_buf_ioerror(bp, -EIO);
		bp->b_flags &= ~XBF_DONE;
		xfs_buf_stale(bp);
		xfs_buf_ioend(bp);
		return;
	}

	if (bp->b_flags & XBF_WRITE)
		xfs_buf_wait_unpin(bp);

	/* clear the internal error state to avoid spurious errors */
	bp->b_io_error = 0;

	/*
	 * The caller's reference is released during I/O completion.
	 * This occurs some time after the last b_io_remaining reference is
	 * released, so after we drop our IO reference we have to have some
	 * other reference to ensure the buffer doesn't go away from underneath
	 * us. Take a direct reference to ensure we have safe access to the
	 * buffer until we are finished with it.
	 */
	xfs_buf_hold(bp);

	/*
	 * Set the count to 1 initially, this will stop an I/O completion
	 * callout which happens before we have started all the I/O from calling
	 * xfs_buf_ioend too early.
	 */
	atomic_set(&bp->b_io_remaining, 1);
	xfs_buf_ioacct_inc(bp);
	_xfs_buf_ioapply(bp);

	/*
	 * If _xfs_buf_ioapply failed, we can get back here with only the IO
	 * reference we took above. If we drop it to zero, run completion so
	 * that we don't return to the caller with completion still pending.
	 */
	if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
		if (bp->b_error)
			xfs_buf_ioend(bp);
		else
			xfs_buf_ioend_async(bp);
	}

	xfs_buf_rele(bp);
	/* Note: it is not safe to reference bp now we've dropped our ref */
}

/*
 * Synchronous buffer IO submission path, read or write.
 */
int
xfs_buf_submit_wait(
	struct xfs_buf	*bp)
{
	int		error;

	trace_xfs_buf_submit_wait(bp, _RET_IP_);

	ASSERT(!(bp->b_flags & (_XBF_DELWRI_Q | XBF_ASYNC)));

	if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
		xfs_buf_ioerror(bp, -EIO);
		xfs_buf_stale(bp);
		bp->b_flags &= ~XBF_DONE;
		return -EIO;
	}

	if (bp->b_flags & XBF_WRITE)
		xfs_buf_wait_unpin(bp);

	/* clear the internal error state to avoid spurious errors */
	bp->b_io_error = 0;

	/*
	 * For synchronous IO, the IO does not inherit the submitter's reference
	 * count, nor the buffer lock. Hence we cannot release the reference we
	 * are about to take until we've waited for all IO completion to occur,
	 * including any xfs_buf_ioend_async() work that may be pending.
	 */
	xfs_buf_hold(bp);

	/*
	 * Set the count to 1 initially, this will stop an I/O completion
	 * callout which happens before we have started all the I/O from calling
	 * xfs_buf_ioend too early.
	 */
	atomic_set(&bp->b_io_remaining, 1);
	_xfs_buf_ioapply(bp);

	/*
	 * make sure we run completion synchronously if it raced with us and is
	 * already complete.
	 */
	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
		xfs_buf_ioend(bp);

	/* wait for completion before gathering the error from the buffer */
	trace_xfs_buf_iowait(bp, _RET_IP_);
	wait_for_completion(&bp->b_iowait);
	trace_xfs_buf_iowait_done(bp, _RET_IP_);
	error = bp->b_error;

	/*
	 * all done now, we can release the hold that keeps the buffer
	 * referenced for the entire IO.
	 */
	xfs_buf_rele(bp);
	return error;
}

void *
xfs_buf_offset(
	struct xfs_buf		*bp,
	size_t			offset)
{
	struct page		*page;

	if (bp->b_addr)
		return bp->b_addr + offset;

	offset += bp->b_offset;
	page = bp->b_pages[offset >> PAGE_SHIFT];
	return page_address(page) + (offset & (PAGE_SIZE-1));
}

/*
 * Move data into or out of a buffer.
 */
void
xfs_buf_iomove(
	xfs_buf_t		*bp,	/* buffer to process		*/
	size_t			boff,	/* starting buffer offset	*/
	size_t			bsize,	/* length to copy		*/
	void			*data,	/* data address			*/
	xfs_buf_rw_t		mode)	/* read/write/zero flag		*/
{
	size_t			bend;

	bend = boff + bsize;
	while (boff < bend) {
		struct page	*page;
		int		page_index, page_offset, csize;

		page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
		page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
		page = bp->b_pages[page_index];
		csize = min_t(size_t, PAGE_SIZE - page_offset,
				      BBTOB(bp->b_io_length) - boff);

		ASSERT((csize + page_offset) <= PAGE_SIZE);

		switch (mode) {
		case XBRW_ZERO:
			memset(page_address(page) + page_offset, 0, csize);
			break;
		case XBRW_READ:
			memcpy(data, page_address(page) + page_offset, csize);
			break;
		case XBRW_WRITE:
			memcpy(page_address(page) + page_offset, data, csize);
		}

		boff += csize;
		data += csize;
	}
}

/*
 * Handling of buffer targets (buftargs).
 */

/*
 * Wait for any bufs with callbacks that have been submitted but have not yet
 * returned. These buffers will have an elevated hold count, so wait on those
 * while freeing all the buffers only held by the LRU.
 */
static enum lru_status
xfs_buftarg_wait_rele(
	struct list_head	*item,
	struct list_lru_one	*lru,
	spinlock_t		*lru_lock,
	void			*arg)
{
	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
	struct list_head	*dispose = arg;

	if (atomic_read(&bp->b_hold) > 1) {
		/* need to wait, so skip it this pass */
		trace_xfs_buf_wait_buftarg(bp, _RET_IP_);
		return LRU_SKIP;
	}
	if (!spin_trylock(&bp->b_lock))
		return LRU_SKIP;

	/*
	 * clear the LRU reference count so the buffer doesn't get
	 * ignored in xfs_buf_rele().
	 */
	atomic_set(&bp->b_lru_ref, 0);
	bp->b_state |= XFS_BSTATE_DISPOSE;
	list_lru_isolate_move(lru, item, dispose);
	spin_unlock(&bp->b_lock);
	return LRU_REMOVED;
}

void
xfs_wait_buftarg(
	struct xfs_buftarg	*btp)
{
	LIST_HEAD(dispose);
	int			loop = 0;

	/*
	 * First wait on the buftarg I/O count for all in-flight buffers to be
	 * released. This is critical as new buffers do not make the LRU until
	 * they are released.
	 *
	 * Next, flush the buffer workqueue to ensure all completion processing
	 * has finished. Just waiting on buffer locks is not sufficient for
	 * async IO as the reference count held over IO is not released until
	 * after the buffer lock is dropped. Hence we need to ensure here that
	 * all reference counts have been dropped before we start walking the
	 * LRU list.
	 */
	while (percpu_counter_sum(&btp->bt_io_count))
		delay(100);
	flush_workqueue(btp->bt_mount->m_buf_workqueue);

	/* loop until there is nothing left on the lru list. */
	while (list_lru_count(&btp->bt_lru)) {
		list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele,
			      &dispose, LONG_MAX);

		while (!list_empty(&dispose)) {
			struct xfs_buf *bp;
			bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
			list_del_init(&bp->b_lru);
			if (bp->b_flags & XBF_WRITE_FAIL) {
				xfs_alert(btp->bt_mount,
"Corruption Alert: Buffer at block 0x%llx had permanent write failures!",
					(long long)bp->b_bn);
				xfs_alert(btp->bt_mount,
"Please run xfs_repair to determine the extent of the problem.");
			}
			xfs_buf_rele(bp);
		}
		if (loop++ != 0)
			delay(100);
	}
}

static enum lru_status
xfs_buftarg_isolate(
	struct list_head	*item,
	struct list_lru_one	*lru,
	spinlock_t		*lru_lock,
	void			*arg)
{
	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
	struct list_head	*dispose = arg;

	/*
	 * we are inverting the lru lock/bp->b_lock here, so use a trylock.
	 * If we fail to get the lock, just skip it.
	 */
	if (!spin_trylock(&bp->b_lock))
		return LRU_SKIP;
	/*
	 * Decrement the b_lru_ref count unless the value is already
	 * zero. If the value is already zero, we need to reclaim the
	 * buffer, otherwise it gets another trip through the LRU.
	 */
	if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
		spin_unlock(&bp->b_lock);
		return LRU_ROTATE;
	}

	bp->b_state |= XFS_BSTATE_DISPOSE;
	list_lru_isolate_move(lru, item, dispose);
	spin_unlock(&bp->b_lock);
	return LRU_REMOVED;
}

static unsigned long
xfs_buftarg_shrink_scan(
	struct shrinker		*shrink,
	struct shrink_control	*sc)
{
	struct xfs_buftarg	*btp = container_of(shrink,
					struct xfs_buftarg, bt_shrinker);
	LIST_HEAD(dispose);
	unsigned long		freed;

	freed = list_lru_shrink_walk(&btp->bt_lru, sc,
				     xfs_buftarg_isolate, &dispose);

	while (!list_empty(&dispose)) {
		struct xfs_buf *bp;
		bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
		list_del_init(&bp->b_lru);
		xfs_buf_rele(bp);
	}

	return freed;
}

static unsigned long
xfs_buftarg_shrink_count(
	struct shrinker		*shrink,
	struct shrink_control	*sc)
{
	struct xfs_buftarg	*btp = container_of(shrink,
					struct xfs_buftarg, bt_shrinker);
	return list_lru_shrink_count(&btp->bt_lru, sc);
}

void
xfs_free_buftarg(
	struct xfs_mount	*mp,
	struct xfs_buftarg	*btp)
{
	unregister_shrinker(&btp->bt_shrinker);
	ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
	percpu_counter_destroy(&btp->bt_io_count);
	list_lru_destroy(&btp->bt_lru);

	xfs_blkdev_issue_flush(btp);

	kmem_free(btp);
}

int
xfs_setsize_buftarg(
	xfs_buftarg_t		*btp,
	unsigned int		sectorsize)
{
	/* Set up metadata sector size info */
	btp->bt_meta_sectorsize = sectorsize;
	btp->bt_meta_sectormask = sectorsize - 1;

	if (set_blocksize(btp->bt_bdev, sectorsize)) {
		xfs_warn(btp->bt_mount,
			"Cannot set_blocksize to %u on device %pg",
			sectorsize, btp->bt_bdev);
		return -EINVAL;
	}

	/* Set up device logical sector size mask */
	btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev);
	btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1;

	return 0;
}

/*
 * When allocating the initial buffer target we have not yet
 * read in the superblock, so don't know what sized sectors
 * are being used at this early stage. Play safe.
 */
STATIC int
xfs_setsize_buftarg_early(
	xfs_buftarg_t		*btp,
	struct block_device	*bdev)
{
	return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev));
}

xfs_buftarg_t *
xfs_alloc_buftarg(
	struct xfs_mount	*mp,
	struct block_device	*bdev,
	struct dax_device	*dax_dev)
{
	xfs_buftarg_t		*btp;

	btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS);

	btp->bt_mount = mp;
	btp->bt_dev = bdev->bd_dev;
	btp->bt_bdev = bdev;
	btp->bt_daxdev = dax_dev;

	if (xfs_setsize_buftarg_early(btp, bdev))
		goto error_free;

	if (list_lru_init(&btp->bt_lru))
		goto error_free;

	if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
		goto error_lru;

	btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
	btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
	btp->bt_shrinker.seeks = DEFAULT_SEEKS;
	btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
	if (register_shrinker(&btp->bt_shrinker))
		goto error_pcpu;
	return btp;

error_pcpu:
	percpu_counter_destroy(&btp->bt_io_count);
error_lru:
	list_lru_destroy(&btp->bt_lru);
error_free:
	kmem_free(btp);
	return NULL;
}

/*
 * Cancel a delayed write list.
 *
 * Remove each buffer from the list, clear the delwri queue flag and drop the
 * associated buffer reference.
 */
void
xfs_buf_delwri_cancel(
	struct list_head	*list)
{
	struct xfs_buf		*bp;

	while (!list_empty(list)) {
		bp = list_first_entry(list, struct xfs_buf, b_list);

		xfs_buf_lock(bp);
		bp->b_flags &= ~_XBF_DELWRI_Q;
		list_del_init(&bp->b_list);
		xfs_buf_relse(bp);
	}
}

/*
 * Add a buffer to the delayed write list.
 *
 * This queues a buffer for writeout if it hasn't already been. Note that
 * neither this routine nor the buffer list submission functions perform
 * any internal synchronization. It is expected that the lists are thread-local
 * to the callers.
 *
 * Returns true if we queued up the buffer, or false if it already had
 * been on the buffer list.
 */
bool
xfs_buf_delwri_queue(
	struct xfs_buf		*bp,
	struct list_head	*list)
{
	ASSERT(xfs_buf_islocked(bp));
	ASSERT(!(bp->b_flags & XBF_READ));

	/*
	 * If the buffer is already marked delwri it already is queued up
	 * by someone else for immediate writeout. Just ignore it in that
	 * case.
	 */
	if (bp->b_flags & _XBF_DELWRI_Q) {
		trace_xfs_buf_delwri_queued(bp, _RET_IP_);
		return false;
	}

	trace_xfs_buf_delwri_queue(bp, _RET_IP_);

	/*
	 * If a buffer gets written out synchronously or marked stale while it
	 * is on a delwri list we lazily remove it. To do this, the other party
	 * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone.
	 * It remains referenced and on the list. In a rare corner case it
	 * might get re-added to a delwri list after the synchronous writeout,
	 * in which case we just need to re-add the flag here.
	 */
	bp->b_flags |= _XBF_DELWRI_Q;
	if (list_empty(&bp->b_list)) {
		atomic_inc(&bp->b_hold);
		list_add_tail(&bp->b_list, list);
	}

	return true;
}

/*
 * Compare function is more complex than it needs to be because
 * the return value is only 32 bits and we are doing comparisons
 * on 64 bit values
 */
static int
xfs_buf_cmp(
	void		*priv,
	struct list_head *a,
	struct list_head *b)
{
	struct xfs_buf	*ap = container_of(a, struct xfs_buf, b_list);
	struct xfs_buf	*bp = container_of(b, struct xfs_buf, b_list);
	xfs_daddr_t		diff;

	diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn;
	if (diff < 0)
		return -1;
	if (diff > 0)
		return 1;
	return 0;
}

/*
 * Submit buffers for write.
 *
 * When we have a large buffer list, we do not want to hold all the buffers
 * locked while we block on the request queue waiting for IO dispatch. To avoid
 * this problem, we lock and submit buffers in groups of 50, thereby minimising
 * the lock hold times for lists which may contain thousands of objects.
 *
 * To do this, we sort the buffer list before we walk the list to lock and
 * submit buffers, and we plug and unplug around each group of buffers we
 * submit.
 */
static int
xfs_buf_delwri_submit_buffers(
	struct list_head	*buffer_list,
	struct list_head	*wait_list)
{
	struct xfs_buf		*bp, *n;
	LIST_HEAD		(submit_list);
	int			pinned = 0;
	struct blk_plug		plug;

	list_sort(NULL, buffer_list, xfs_buf_cmp);

	blk_start_plug(&plug);
	list_for_each_entry_safe(bp, n, buffer_list, b_list) {
		if (!wait_list) {
			if (xfs_buf_ispinned(bp)) {
				pinned++;
				continue;
			}
			if (!xfs_buf_trylock(bp))
				continue;
		} else {
			xfs_buf_lock(bp);
		}

		/*
		 * Someone else might have written the buffer synchronously or
		 * marked it stale in the meantime. In that case only the
		 * _XBF_DELWRI_Q flag got cleared, and we have to drop the
		 * reference and remove it from the list here.
		 */
		if (!(bp->b_flags & _XBF_DELWRI_Q)) {
			list_del_init(&bp->b_list);
			xfs_buf_relse(bp);
			continue;
		}

		trace_xfs_buf_delwri_split(bp, _RET_IP_);

		/*
		 * We do all IO submission async. This means if we need
		 * to wait for IO completion we need to take an extra
		 * reference so the buffer is still valid on the other
		 * side. We need to move the buffer onto the io_list
		 * at this point so the caller can still access it.
		 */
		bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_WRITE_FAIL);
		bp->b_flags |= XBF_WRITE | XBF_ASYNC;
		if (wait_list) {
			xfs_buf_hold(bp);
			list_move_tail(&bp->b_list, wait_list);
		} else
			list_del_init(&bp->b_list);

		xfs_buf_submit(bp);
	}
	blk_finish_plug(&plug);

	return pinned;
}

/*
 * Write out a buffer list asynchronously.
 *
 * This will take the @buffer_list, write all non-locked and non-pinned buffers
 * out and not wait for I/O completion on any of the buffers. This interface
 * is only safely usable for callers that can track I/O completion by higher
 * level means, e.g. AIL pushing as the @buffer_list is consumed in this
 * function.
 */
int
xfs_buf_delwri_submit_nowait(
	struct list_head	*buffer_list)
{
	return xfs_buf_delwri_submit_buffers(buffer_list, NULL);
}

/*
 * Write out a buffer list synchronously.
 *
 * This will take the @buffer_list, write all buffers out and wait for I/O
 * completion on all of the buffers. @buffer_list is consumed by the function,
 * so callers must have some other way of tracking buffers if they require such
 * functionality.
 */
int
xfs_buf_delwri_submit(
	struct list_head	*buffer_list)
{
	LIST_HEAD		(wait_list);
	int			error = 0, error2;
	struct xfs_buf		*bp;

	xfs_buf_delwri_submit_buffers(buffer_list, &wait_list);

	/* Wait for IO to complete. */
	while (!list_empty(&wait_list)) {
		bp = list_first_entry(&wait_list, struct xfs_buf, b_list);

		list_del_init(&bp->b_list);

		/* locking the buffer will wait for async IO completion. */
		xfs_buf_lock(bp);
		error2 = bp->b_error;
		xfs_buf_relse(bp);
		if (!error)
			error = error2;
	}

	return error;
}

/*
 * Push a single buffer on a delwri queue.
 *
 * The purpose of this function is to submit a single buffer of a delwri queue
 * and return with the buffer still on the original queue. The waiting delwri
 * buffer submission infrastructure guarantees transfer of the delwri queue
 * buffer reference to a temporary wait list. We reuse this infrastructure to
 * transfer the buffer back to the original queue.
 *
 * Note the buffer transitions from the queued state, to the submitted and wait
 * listed state and back to the queued state during this call. The buffer
 * locking and queue management logic between _delwri_pushbuf() and
 * _delwri_queue() guarantee that the buffer cannot be queued to another list
 * before returning.
 */
int
xfs_buf_delwri_pushbuf(
	struct xfs_buf		*bp,
	struct list_head	*buffer_list)
{
	LIST_HEAD		(submit_list);
	int			error;

	ASSERT(bp->b_flags & _XBF_DELWRI_Q);

	trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_);

	/*
	 * Isolate the buffer to a new local list so we can submit it for I/O
	 * independently from the rest of the original list.
	 */
	xfs_buf_lock(bp);
	list_move(&bp->b_list, &submit_list);
	xfs_buf_unlock(bp);

	/*
	 * Delwri submission clears the DELWRI_Q buffer flag and returns with
	 * the buffer on the wait list with an associated reference. Rather than
	 * bounce the buffer from a local wait list back to the original list
	 * after I/O completion, reuse the original list as the wait list.
	 */
	xfs_buf_delwri_submit_buffers(&submit_list, buffer_list);

	/*
	 * The buffer is now under I/O and wait listed as during typical delwri
	 * submission. Lock the buffer to wait for I/O completion. Rather than
	 * remove the buffer from the wait list and release the reference, we
	 * want to return with the buffer queued to the original list. The
	 * buffer already sits on the original list with a wait list reference,
	 * however. If we let the queue inherit that wait list reference, all we
	 * need to do is reset the DELWRI_Q flag.
	 */
	xfs_buf_lock(bp);
	error = bp->b_error;
	bp->b_flags |= _XBF_DELWRI_Q;
	xfs_buf_unlock(bp);

	return error;
}

int __init
xfs_buf_init(void)
{
	xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
						KM_ZONE_HWALIGN, NULL);
	if (!xfs_buf_zone)
		goto out;

	return 0;

 out:
	return -ENOMEM;
}

void
xfs_buf_terminate(void)
{
	kmem_zone_destroy(xfs_buf_zone);
}

void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
{
	/*
	 * Set the lru reference count to 0 based on the error injection tag.
	 * This allows userspace to disrupt buffer caching for debug/testing
	 * purposes.
	 */
	if (XFS_TEST_ERROR(false, bp->b_target->bt_mount,
			   XFS_ERRTAG_BUF_LRU_REF))
		lru_ref = 0;

	atomic_set(&bp->b_lru_ref, lru_ref);
}