// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include <linux/stddef.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/bio.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <linux/workqueue.h>
#include <linux/percpu.h>
#include <linux/blkdev.h>
#include <linux/hash.h>
#include <linux/kthread.h>
#include <linux/migrate.h>
#include <linux/backing-dev.h>
#include <linux/freezer.h>

#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_errortag.h"
#include "xfs_error.h"

static kmem_zone_t *xfs_buf_zone;

#define xb_to_gfp(flags) \
	((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN)

/*
 * Locking orders
 *
 * xfs_buf_ioacct_inc:
 * xfs_buf_ioacct_dec:
 *	b_sema (caller holds)
 *	  b_lock
 *
 * xfs_buf_stale:
 *	b_sema (caller holds)
 *	  b_lock
 *	    lru_lock
 *
 * xfs_buf_rele:
 *	b_lock
 *	  pag_buf_lock
 *	    lru_lock
 *
 * xfs_buftarg_wait_rele
 *	lru_lock
 *	  b_lock (trylock due to inversion)
 *
 * xfs_buftarg_isolate
 *	lru_lock
 *	  b_lock (trylock due to inversion)
 */

static inline int
xfs_buf_is_vmapped(
	struct xfs_buf	*bp)
{
	/*
	 * Return true if the buffer is vmapped.
	 *
	 * b_addr is null if the buffer is not mapped, but the code is clever
	 * enough to know it doesn't have to map a single page, so the check
	 * has to be both for b_addr and bp->b_page_count > 1.
	 */
	return bp->b_addr && bp->b_page_count > 1;
}

static inline int
xfs_buf_vmap_len(
	struct xfs_buf	*bp)
{
	return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
}

/*
 * Bump the I/O in flight count on the buftarg if we haven't yet done so for
 * this buffer. The count is incremented once per buffer (per hold cycle)
 * because the corresponding decrement is deferred to buffer release. Buffers
 * can undergo I/O multiple times in a hold-release cycle and per buffer I/O
 * tracking adds unnecessary overhead. This is used for synchronization
 * purposes with unmount (see xfs_wait_buftarg()), so all we really need is a
 * count of in-flight buffers.
 *
 * Buffers that are never released (e.g., superblock, iclog buffers) must set
 * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count
 * never reaches zero and unmount hangs indefinitely.
 */
static inline void
xfs_buf_ioacct_inc(
	struct xfs_buf	*bp)
{
	if (bp->b_flags & XBF_NO_IOACCT)
		return;

	ASSERT(bp->b_flags & XBF_ASYNC);
	spin_lock(&bp->b_lock);
	if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) {
		bp->b_state |= XFS_BSTATE_IN_FLIGHT;
		percpu_counter_inc(&bp->b_target->bt_io_count);
	}
	spin_unlock(&bp->b_lock);
}
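
/*
 * Illustrative sketch only (hypothetical caller, not part of this file): a
 * buffer that is held for the lifetime of the mount, such as the superblock
 * buffer, opts out of the accounting above by setting XBF_NO_IOACCT before
 * its first submission:
 *
 *	bp->b_flags |= XBF_NO_IOACCT;
 *	xfs_buf_submit(bp);
 */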
/*
 * Clear the in-flight state on a buffer about to be released to the LRU or
 * freed and unaccount from the buftarg.
 */
static inline void
__xfs_buf_ioacct_dec(
	struct xfs_buf	*bp)
{
	lockdep_assert_held(&bp->b_lock);

	if (bp->b_state & XFS_BSTATE_IN_FLIGHT) {
		bp->b_state &= ~XFS_BSTATE_IN_FLIGHT;
		percpu_counter_dec(&bp->b_target->bt_io_count);
	}
}

static inline void
xfs_buf_ioacct_dec(
	struct xfs_buf	*bp)
{
	spin_lock(&bp->b_lock);
	__xfs_buf_ioacct_dec(bp);
	spin_unlock(&bp->b_lock);
}

/*
 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
 * b_lru_ref count so that the buffer is freed immediately when the buffer
 * reference count falls to zero. If the buffer is already on the LRU, we need
 * to remove the reference that LRU holds on the buffer.
 *
 * This prevents build-up of stale buffers on the LRU.
 */
void
xfs_buf_stale(
	struct xfs_buf	*bp)
{
	ASSERT(xfs_buf_islocked(bp));

	bp->b_flags |= XBF_STALE;

	/*
	 * Clear the delwri status so that a delwri queue walker will not
	 * flush this buffer to disk now that it is stale. The delwri queue has
	 * a reference to the buffer, so this is safe to do.
	 */
	bp->b_flags &= ~_XBF_DELWRI_Q;

	/*
	 * Once the buffer is marked stale and unlocked, a subsequent lookup
	 * could reset b_flags. There is no guarantee that the buffer is
	 * unaccounted (released to LRU) before that occurs. Drop in-flight
	 * status now to preserve accounting consistency.
	 */
	spin_lock(&bp->b_lock);
	__xfs_buf_ioacct_dec(bp);

	atomic_set(&bp->b_lru_ref, 0);
	if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
	    (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
		atomic_dec(&bp->b_hold);

	ASSERT(atomic_read(&bp->b_hold) >= 1);
	spin_unlock(&bp->b_lock);
}

static int
xfs_buf_get_maps(
	struct xfs_buf		*bp,
	int			map_count)
{
	ASSERT(bp->b_maps == NULL);
	bp->b_map_count = map_count;

	if (map_count == 1) {
		bp->b_maps = &bp->__b_map;
		return 0;
	}

	bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
				 KM_NOFS);
	if (!bp->b_maps)
		return -ENOMEM;
	return 0;
}

/*
 * Frees b_maps if it was allocated.
 */
static void
xfs_buf_free_maps(
	struct xfs_buf	*bp)
{
	if (bp->b_maps != &bp->__b_map) {
		kmem_free(bp->b_maps);
		bp->b_maps = NULL;
	}
}

struct xfs_buf *
_xfs_buf_alloc(
	struct xfs_buftarg	*target,
	struct xfs_buf_map	*map,
	int			nmaps,
	xfs_buf_flags_t		flags)
{
	struct xfs_buf		*bp;
	int			error;
	int			i;

	bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS);
	if (unlikely(!bp))
		return NULL;

	/*
	 * We don't want certain flags to appear in b_flags unless they are
	 * specifically set by later operations on the buffer.
	 */
	flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);

	atomic_set(&bp->b_hold, 1);
	atomic_set(&bp->b_lru_ref, 1);
	init_completion(&bp->b_iowait);
	INIT_LIST_HEAD(&bp->b_lru);
	INIT_LIST_HEAD(&bp->b_list);
	INIT_LIST_HEAD(&bp->b_li_list);
	sema_init(&bp->b_sema, 0); /* held, no waiters */
	spin_lock_init(&bp->b_lock);
	bp->b_target = target;
	bp->b_flags = flags;

	/*
	 * Set length and io_length to the same value initially.
	 * I/O routines should use io_length, which will be the same in
	 * most cases but may be reset (e.g. XFS recovery).
	 */
	error = xfs_buf_get_maps(bp, nmaps);
	if (error) {
		kmem_zone_free(xfs_buf_zone, bp);
		return NULL;
	}

	bp->b_bn = map[0].bm_bn;
	bp->b_length = 0;
	for (i = 0; i < nmaps; i++) {
		bp->b_maps[i].bm_bn = map[i].bm_bn;
		bp->b_maps[i].bm_len = map[i].bm_len;
		bp->b_length += map[i].bm_len;
	}
	bp->b_io_length = bp->b_length;

	atomic_set(&bp->b_pin_count, 0);
	init_waitqueue_head(&bp->b_waiters);

	XFS_STATS_INC(target->bt_mount, xb_create);
	trace_xfs_buf_init(bp, _RET_IP_);

	return bp;
}

/*
 * Allocate a page array capable of holding a specified number
 * of pages, and point the page buf at it.
 */
STATIC int
_xfs_buf_get_pages(
	xfs_buf_t		*bp,
	int			page_count)
{
	/* Make sure that we have a page list */
	if (bp->b_pages == NULL) {
		bp->b_page_count = page_count;
		if (page_count <= XB_PAGES) {
			bp->b_pages = bp->b_page_array;
		} else {
			bp->b_pages = kmem_alloc(sizeof(struct page *) *
						 page_count, KM_NOFS);
			if (bp->b_pages == NULL)
				return -ENOMEM;
		}
		memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
	}
	return 0;
}

/*
 * Frees b_pages if it was allocated.
 */
STATIC void
_xfs_buf_free_pages(
	xfs_buf_t	*bp)
{
	if (bp->b_pages != bp->b_page_array) {
		kmem_free(bp->b_pages);
		bp->b_pages = NULL;
	}
}

/*
 * Releases the specified buffer.
 *
 * The modification state of any associated pages is left unchanged.
 * The buffer must not be on any hash - use xfs_buf_rele instead for
 * hashed and refcounted buffers.
 */
void
xfs_buf_free(
	xfs_buf_t		*bp)
{
	trace_xfs_buf_free(bp, _RET_IP_);

	ASSERT(list_empty(&bp->b_lru));

	if (bp->b_flags & _XBF_PAGES) {
		uint		i;

		if (xfs_buf_is_vmapped(bp))
			vm_unmap_ram(bp->b_addr - bp->b_offset,
					bp->b_page_count);

		for (i = 0; i < bp->b_page_count; i++) {
			struct page	*page = bp->b_pages[i];

			__free_page(page);
		}
	} else if (bp->b_flags & _XBF_KMEM)
		kmem_free(bp->b_addr);
	_xfs_buf_free_pages(bp);
	xfs_buf_free_maps(bp);
	kmem_zone_free(xfs_buf_zone, bp);
}

/*
 * Allocates all the pages for the buffer in question and builds its page list.
 */
STATIC int
xfs_buf_allocate_memory(
	xfs_buf_t		*bp,
	uint			flags)
{
	size_t			size;
	size_t			nbytes, offset;
	gfp_t			gfp_mask = xb_to_gfp(flags);
	unsigned short		page_count, i;
	xfs_off_t		start, end;
	int			error;

	/*
	 * for buffers that are contained within a single page, just allocate
	 * the memory from the heap - there's no need for the complexity of
	 * page arrays to keep allocation down to order 0.
	 */
	size = BBTOB(bp->b_length);
	if (size < PAGE_SIZE) {
		bp->b_addr = kmem_alloc(size, KM_NOFS);
		if (!bp->b_addr) {
			/* low memory - use alloc_page loop instead */
			goto use_alloc_page;
		}

		if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
		    ((unsigned long)bp->b_addr & PAGE_MASK)) {
			/* b_addr spans two pages - use alloc_page instead */
			kmem_free(bp->b_addr);
			bp->b_addr = NULL;
			goto use_alloc_page;
		}
		bp->b_offset = offset_in_page(bp->b_addr);
		bp->b_pages = bp->b_page_array;
		bp->b_pages[0] = virt_to_page(bp->b_addr);
		bp->b_page_count = 1;
		bp->b_flags |= _XBF_KMEM;
		return 0;
	}

use_alloc_page:
	start = BBTOB(bp->b_maps[0].bm_bn) >> PAGE_SHIFT;
	end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1)
								>> PAGE_SHIFT;
	page_count = end - start;
	error = _xfs_buf_get_pages(bp, page_count);
	if (unlikely(error))
		return error;

	offset = bp->b_offset;
	bp->b_flags |= _XBF_PAGES;

	for (i = 0; i < bp->b_page_count; i++) {
		struct page	*page;
		uint		retries = 0;
retry:
		page = alloc_page(gfp_mask);
		if (unlikely(page == NULL)) {
			if (flags & XBF_READ_AHEAD) {
				bp->b_page_count = i;
				error = -ENOMEM;
				goto out_free_pages;
			}

			/*
			 * This could deadlock.
			 *
			 * But until all the XFS lowlevel code is revamped to
			 * handle buffer allocation failures we can't do much.
			 */
			if (!(++retries % 100))
				xfs_err(NULL,
		"%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
					current->comm, current->pid,
					__func__, gfp_mask);

			XFS_STATS_INC(bp->b_target->bt_mount, xb_page_retries);
			congestion_wait(BLK_RW_ASYNC, HZ/50);
			goto retry;
		}

		XFS_STATS_INC(bp->b_target->bt_mount, xb_page_found);

		nbytes = min_t(size_t, size, PAGE_SIZE - offset);
		size -= nbytes;
		bp->b_pages[i] = page;
		offset = 0;
	}
	return 0;

out_free_pages:
	for (i = 0; i < bp->b_page_count; i++)
		__free_page(bp->b_pages[i]);
	bp->b_flags &= ~_XBF_PAGES;
	return error;
}

/*
 * Map buffer into kernel address-space if necessary.
 */
STATIC int
_xfs_buf_map_pages(
	xfs_buf_t		*bp,
	uint			flags)
{
	ASSERT(bp->b_flags & _XBF_PAGES);
	if (bp->b_page_count == 1) {
		/* A single page buffer is always mappable */
		bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
	} else if (flags & XBF_UNMAPPED) {
		bp->b_addr = NULL;
	} else {
		int retried = 0;
		unsigned nofs_flag;

		/*
		 * vm_map_ram() will allocate auxiliary structures (e.g.
		 * pagetables) with GFP_KERNEL, yet we are likely to be under
		 * GFP_NOFS context here. Hence we need to tell memory reclaim
		 * that we are in such a context via PF_MEMALLOC_NOFS to
		 * prevent memory reclaim re-entering the filesystem here and
		 * potentially deadlocking.
		 */
		nofs_flag = memalloc_nofs_save();
		do {
			bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
						-1, PAGE_KERNEL);
			if (bp->b_addr)
				break;
			vm_unmap_aliases();
		} while (retried++ <= 1);
		memalloc_nofs_restore(nofs_flag);

		if (!bp->b_addr)
			return -ENOMEM;
		bp->b_addr += bp->b_offset;
	}

	return 0;
}

/*
 *	Finding and Reading Buffers
 */
static int
_xfs_buf_obj_cmp(
	struct rhashtable_compare_arg	*arg,
	const void			*obj)
{
	const struct xfs_buf_map	*map = arg->key;
	const struct xfs_buf		*bp = obj;

	/*
	 * The key hashing in the lookup path depends on the key being the
	 * first element of the compare_arg, make sure to assert this.
	 */
	BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0);

	if (bp->b_bn != map->bm_bn)
		return 1;

	if (unlikely(bp->b_length != map->bm_len)) {
		/*
		 * found a block number match. If the range doesn't
		 * match, the only way this is allowed is if the buffer
		 * in the cache is stale and the transaction that made
		 * it stale has not yet committed. i.e. we are
		 * reallocating a busy extent. Skip this buffer and
		 * continue searching for an exact match.
		 */
		ASSERT(bp->b_flags & XBF_STALE);
		return 1;
	}
	return 0;
}

static const struct rhashtable_params xfs_buf_hash_params = {
	.min_size		= 32,	/* empty AGs have minimal footprint */
	.nelem_hint		= 16,
	.key_len		= sizeof(xfs_daddr_t),
	.key_offset		= offsetof(struct xfs_buf, b_bn),
	.head_offset		= offsetof(struct xfs_buf, b_rhash_head),
	.automatic_shrinking	= true,
	.obj_cmpfn		= _xfs_buf_obj_cmp,
};

int
xfs_buf_hash_init(
	struct xfs_perag	*pag)
{
	spin_lock_init(&pag->pag_buf_lock);
	return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params);
}

void
xfs_buf_hash_destroy(
	struct xfs_perag	*pag)
{
	rhashtable_destroy(&pag->pag_buf_hash);
}

/*
 * Look up a buffer in the buffer cache and return it referenced and locked
 * in @found_bp.
 *
 * If @new_bp is supplied and we have a lookup miss, insert @new_bp into the
 * cache.
 *
 * If XBF_TRYLOCK is set in @flags, only try to lock the buffer and return
 * -EAGAIN if we fail to lock it.
 *
 * Return values are:
 *	-EFSCORRUPTED if we have been supplied with an invalid address
 *	-EAGAIN on trylock failure
 *	-ENOENT if we fail to find a match and @new_bp was NULL
 *	0, with @found_bp:
 *		- @new_bp if we inserted it into the cache
 *		- the buffer we found and locked.
 */
static int
xfs_buf_find(
	struct xfs_buftarg	*btp,
	struct xfs_buf_map	*map,
	int			nmaps,
	xfs_buf_flags_t		flags,
	struct xfs_buf		*new_bp,
	struct xfs_buf		**found_bp)
{
	struct xfs_perag	*pag;
	xfs_buf_t		*bp;
	struct xfs_buf_map	cmap = { .bm_bn = map[0].bm_bn };
	xfs_daddr_t		eofs;
	int			i;

	*found_bp = NULL;

	for (i = 0; i < nmaps; i++)
		cmap.bm_len += map[i].bm_len;

	/* Check for IOs smaller than the sector size / not sector aligned */
	ASSERT(!(BBTOB(cmap.bm_len) < btp->bt_meta_sectorsize));
	ASSERT(!(BBTOB(cmap.bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));

	/*
	 * Corrupted block numbers can get through to here, unfortunately, so
	 * we have to check that the buffer falls within the filesystem bounds.
	 */
	eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
	if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) {
		xfs_alert(btp->bt_mount,
			  "%s: daddr 0x%llx out of range, EOFS 0x%llx",
			  __func__, cmap.bm_bn, eofs);
		WARN_ON(1);
		return -EFSCORRUPTED;
	}

	pag = xfs_perag_get(btp->bt_mount,
			    xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));

	spin_lock(&pag->pag_buf_lock);
	bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap,
				    xfs_buf_hash_params);
	if (bp) {
		atomic_inc(&bp->b_hold);
		goto found;
	}

	/* No match found */
	if (!new_bp) {
		XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
		spin_unlock(&pag->pag_buf_lock);
		xfs_perag_put(pag);
		return -ENOENT;
	}

	/* the buffer keeps the perag reference until it is freed */
	new_bp->b_pag = pag;
	rhashtable_insert_fast(&pag->pag_buf_hash, &new_bp->b_rhash_head,
			       xfs_buf_hash_params);
	spin_unlock(&pag->pag_buf_lock);
	*found_bp = new_bp;
	return 0;

found:
	spin_unlock(&pag->pag_buf_lock);
	xfs_perag_put(pag);

	if (!xfs_buf_trylock(bp)) {
		if (flags & XBF_TRYLOCK) {
			xfs_buf_rele(bp);
			XFS_STATS_INC(btp->bt_mount, xb_busy_locked);
			return -EAGAIN;
		}
		xfs_buf_lock(bp);
		XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited);
	}

	/*
	 * if the buffer is stale, clear all the external state associated with
	 * it. We need to keep flags such as how we allocated the buffer memory
	 * intact here.
	 */
	if (bp->b_flags & XBF_STALE) {
		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
		ASSERT(bp->b_iodone == NULL);
		bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
		bp->b_ops = NULL;
	}

	trace_xfs_buf_find(bp, flags, _RET_IP_);
	XFS_STATS_INC(btp->bt_mount, xb_get_locked);
	*found_bp = bp;
	return 0;
}

struct xfs_buf *
xfs_buf_incore(
	struct xfs_buftarg	*target,
	xfs_daddr_t		blkno,
	size_t			numblks,
	xfs_buf_flags_t		flags)
{
	struct xfs_buf		*bp;
	int			error;
	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);

	error = xfs_buf_find(target, &map, 1, flags, NULL, &bp);
	if (error)
		return NULL;
	return bp;
}

/*
 * Assembles a buffer covering the specified range. The code is optimised for
 * cache hits, as metadata intensive workloads will see 3 orders of magnitude
 * more hits than misses.
 */
struct xfs_buf *
xfs_buf_get_map(
	struct xfs_buftarg	*target,
	struct xfs_buf_map	*map,
	int			nmaps,
	xfs_buf_flags_t		flags)
{
	struct xfs_buf		*bp;
	struct xfs_buf		*new_bp;
	int			error = 0;

	error = xfs_buf_find(target, map, nmaps, flags, NULL, &bp);

	switch (error) {
	case 0:
		/* cache hit */
		goto found;
	case -EAGAIN:
		/* cache hit, trylock failure, caller handles failure */
		ASSERT(flags & XBF_TRYLOCK);
		return NULL;
	case -ENOENT:
		/* cache miss, go for insert */
		break;
	case -EFSCORRUPTED:
	default:
		/*
		 * None of the higher layers understand failure types
		 * yet, so return NULL to signal a fatal lookup error.
		 */
		return NULL;
	}

	new_bp = _xfs_buf_alloc(target, map, nmaps, flags);
	if (unlikely(!new_bp))
		return NULL;

	error = xfs_buf_allocate_memory(new_bp, flags);
	if (error) {
		xfs_buf_free(new_bp);
		return NULL;
	}

	error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp);
	if (error) {
		xfs_buf_free(new_bp);
		return NULL;
	}

	if (bp != new_bp)
		xfs_buf_free(new_bp);

found:
	if (!bp->b_addr) {
		error = _xfs_buf_map_pages(bp, flags);
		if (unlikely(error)) {
			xfs_warn(target->bt_mount,
				"%s: failed to map pages\n", __func__);
			xfs_buf_relse(bp);
			return NULL;
		}
	}

	/*
	 * Clear b_error if this is a lookup from a caller that doesn't expect
	 * valid data to be found in the buffer.
	 */
	if (!(flags & XBF_READ))
		xfs_buf_ioerror(bp, 0);

	XFS_STATS_INC(target->bt_mount, xb_get);
	trace_xfs_buf_get(bp, flags, _RET_IP_);
	return bp;
}

STATIC int
_xfs_buf_read(
	xfs_buf_t		*bp,
	xfs_buf_flags_t		flags)
{
	ASSERT(!(flags & XBF_WRITE));
	ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);

	bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
	bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);

	return xfs_buf_submit(bp);
}

/*
 * If the caller passed in an ops structure and the buffer doesn't have ops
 * assigned, set the ops and use them to verify the contents. If the contents
 * cannot be verified, we'll clear XBF_DONE. We assume the buffer has no
 * recorded errors and is already in XBF_DONE state.
 */
int
xfs_buf_ensure_ops(
	struct xfs_buf		*bp,
	const struct xfs_buf_ops *ops)
{
	ASSERT(bp->b_flags & XBF_DONE);
	ASSERT(bp->b_error == 0);

	if (!ops || bp->b_ops)
		return 0;

	bp->b_ops = ops;
	bp->b_ops->verify_read(bp);
	if (bp->b_error)
		bp->b_flags &= ~XBF_DONE;
	return bp->b_error;
}

xfs_buf_t *
xfs_buf_read_map(
	struct xfs_buftarg	*target,
	struct xfs_buf_map	*map,
	int			nmaps,
	xfs_buf_flags_t		flags,
	const struct xfs_buf_ops *ops)
{
	struct xfs_buf		*bp;

	flags |= XBF_READ;

	bp = xfs_buf_get_map(target, map, nmaps, flags);
	if (!bp)
		return NULL;

	trace_xfs_buf_read(bp, flags, _RET_IP_);

	if (!(bp->b_flags & XBF_DONE)) {
		XFS_STATS_INC(target->bt_mount, xb_get_read);
		bp->b_ops = ops;
		_xfs_buf_read(bp, flags);
		return bp;
	}

	xfs_buf_ensure_ops(bp, ops);

	if (flags & XBF_ASYNC) {
		/*
		 * Read ahead call which is already satisfied,
		 * drop the buffer
		 */
		xfs_buf_relse(bp);
		return NULL;
	}

	/* We do not want read in the flags */
	bp->b_flags &= ~XBF_READ;
	ASSERT(bp->b_ops != NULL || ops == NULL);
	return bp;
}

/*
 * If we are not low on memory then do the readahead in a deadlock
 * safe manner.
 */
void
xfs_buf_readahead_map(
	struct xfs_buftarg	*target,
	struct xfs_buf_map	*map,
	int			nmaps,
	const struct xfs_buf_ops *ops)
{
	if (bdi_read_congested(target->bt_bdev->bd_bdi))
		return;

	xfs_buf_read_map(target, map, nmaps,
			 XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops);
}
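
/*
 * Illustrative sketch only (hypothetical caller, not part of this file): a
 * typical synchronous, cached metadata read checks both for a failed lookup
 * and for a verifier/IO error before using the data, and drops the lock and
 * reference with xfs_buf_relse() when done:
 *
 *	bp = xfs_buf_read_map(target, &map, 1, 0, ops);
 *	if (!bp)
 *		return -ENOMEM;
 *	if (bp->b_error) {
 *		error = bp->b_error;
 *		xfs_buf_relse(bp);
 *		return error;
 *	}
 *	... use bp->b_addr ...
 *	xfs_buf_relse(bp);
 */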
/*
 * Read an uncached buffer from disk. Allocates and returns a locked
 * buffer containing the disk contents or nothing.
 */
int
xfs_buf_read_uncached(
	struct xfs_buftarg	*target,
	xfs_daddr_t		daddr,
	size_t			numblks,
	int			flags,
	struct xfs_buf		**bpp,
	const struct xfs_buf_ops *ops)
{
	struct xfs_buf		*bp;

	*bpp = NULL;

	bp = xfs_buf_get_uncached(target, numblks, flags);
	if (!bp)
		return -ENOMEM;

	/* set up the buffer for a read IO */
	ASSERT(bp->b_map_count == 1);
	bp->b_bn = XFS_BUF_DADDR_NULL;	/* always null for uncached buffers */
	bp->b_maps[0].bm_bn = daddr;
	bp->b_flags |= XBF_READ;
	bp->b_ops = ops;

	xfs_buf_submit(bp);
	if (bp->b_error) {
		int	error = bp->b_error;
		xfs_buf_relse(bp);
		return error;
	}

	*bpp = bp;
	return 0;
}

/*
 * Return a buffer allocated as an empty buffer and associated to external
 * memory via xfs_buf_associate_memory() back to its empty state.
 */
void
xfs_buf_set_empty(
	struct xfs_buf		*bp,
	size_t			numblks)
{
	if (bp->b_pages)
		_xfs_buf_free_pages(bp);

	bp->b_pages = NULL;
	bp->b_page_count = 0;
	bp->b_addr = NULL;
	bp->b_length = numblks;
	bp->b_io_length = numblks;

	ASSERT(bp->b_map_count == 1);
	bp->b_bn = XFS_BUF_DADDR_NULL;
	bp->b_maps[0].bm_bn = XFS_BUF_DADDR_NULL;
	bp->b_maps[0].bm_len = bp->b_length;
}

static inline struct page *
mem_to_page(
	void			*addr)
{
	if ((!is_vmalloc_addr(addr))) {
		return virt_to_page(addr);
	} else {
		return vmalloc_to_page(addr);
	}
}

int
xfs_buf_associate_memory(
	xfs_buf_t		*bp,
	void			*mem,
	size_t			len)
{
	int			rval;
	int			i = 0;
	unsigned long		pageaddr;
	unsigned long		offset;
	size_t			buflen;
	int			page_count;

	pageaddr = (unsigned long)mem & PAGE_MASK;
	offset = (unsigned long)mem - pageaddr;
	buflen = PAGE_ALIGN(len + offset);
	page_count = buflen >> PAGE_SHIFT;

	/* Free any previous set of page pointers */
	if (bp->b_pages)
		_xfs_buf_free_pages(bp);

	bp->b_pages = NULL;
	bp->b_addr = mem;

	rval = _xfs_buf_get_pages(bp, page_count);
	if (rval)
		return rval;

	bp->b_offset = offset;

	for (i = 0; i < bp->b_page_count; i++) {
		bp->b_pages[i] = mem_to_page((void *)pageaddr);
		pageaddr += PAGE_SIZE;
	}

	bp->b_io_length = BTOBB(len);
	bp->b_length = BTOBB(buflen);

	return 0;
}

xfs_buf_t *
xfs_buf_get_uncached(
	struct xfs_buftarg	*target,
	size_t			numblks,
	int			flags)
{
	unsigned long		page_count;
	int			error, i;
	struct xfs_buf		*bp;
	DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);

	/* flags might contain irrelevant bits, pass only what we care about */
	bp = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT);
	if (unlikely(bp == NULL))
		goto fail;

	page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT;
	error = _xfs_buf_get_pages(bp, page_count);
	if (error)
		goto fail_free_buf;

	for (i = 0; i < page_count; i++) {
		bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
		if (!bp->b_pages[i])
			goto fail_free_mem;
	}
	bp->b_flags |= _XBF_PAGES;

	error = _xfs_buf_map_pages(bp, 0);
	if (unlikely(error)) {
		xfs_warn(target->bt_mount,
			"%s: failed to map pages", __func__);
		goto fail_free_mem;
	}

	trace_xfs_buf_get_uncached(bp, _RET_IP_);
	return bp;

 fail_free_mem:
	while (--i >= 0)
		__free_page(bp->b_pages[i]);
	_xfs_buf_free_pages(bp);
 fail_free_buf:
	xfs_buf_free_maps(bp);
	kmem_zone_free(xfs_buf_zone, bp);
 fail:
	return NULL;
}

/*
 * Increment reference count on buffer, to hold the buffer concurrently
 * with another thread which may release (free) the buffer asynchronously.
 * Must hold the buffer already to call this function.
 */
void
xfs_buf_hold(
	xfs_buf_t		*bp)
{
	trace_xfs_buf_hold(bp, _RET_IP_);
	atomic_inc(&bp->b_hold);
}

/*
 * Release a hold on the specified buffer. If the hold count is 1, the buffer
 * is placed on the LRU or freed (depending on b_lru_ref).
 */
void
xfs_buf_rele(
	xfs_buf_t		*bp)
{
	struct xfs_perag	*pag = bp->b_pag;
	bool			release;
	bool			freebuf = false;

	trace_xfs_buf_rele(bp, _RET_IP_);

	if (!pag) {
		ASSERT(list_empty(&bp->b_lru));
		if (atomic_dec_and_test(&bp->b_hold)) {
			xfs_buf_ioacct_dec(bp);
			xfs_buf_free(bp);
		}
		return;
	}

	ASSERT(atomic_read(&bp->b_hold) > 0);

	/*
	 * We grab the b_lock here first to serialise racing xfs_buf_rele()
	 * calls. The pag_buf_lock being taken on the last reference only
	 * serialises against racing lookups in xfs_buf_find(). IOWs, the
	 * second to last reference we drop here is not serialised against the
	 * last reference until we take bp->b_lock. Hence if we don't grab
	 * b_lock first, the last "release" reference can win the race to the
	 * lock and free the buffer before the second-to-last reference is
	 * processed, leading to a use-after-free scenario.
	 */
	spin_lock(&bp->b_lock);
	release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
	if (!release) {
		/*
		 * Drop the in-flight state if the buffer is already on the LRU
		 * and it holds the only reference. This is racy because we
		 * haven't acquired the pag lock, but the use of
		 * XFS_BSTATE_IN_FLIGHT ensures the decrement occurs only once
		 * per-buf.
		 */
		if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
			__xfs_buf_ioacct_dec(bp);
		goto out_unlock;
	}

	/* the last reference has been dropped ... */
	__xfs_buf_ioacct_dec(bp);
	if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
		/*
		 * If the buffer is added to the LRU take a new reference to the
		 * buffer for the LRU and clear the (now stale) dispose list
		 * state flag
		 */
		if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
			bp->b_state &= ~XFS_BSTATE_DISPOSE;
			atomic_inc(&bp->b_hold);
		}
		spin_unlock(&pag->pag_buf_lock);
	} else {
		/*
		 * most of the time buffers will already be removed from the
		 * LRU, so optimise that case by checking for the
		 * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
		 * was on was the disposal list
		 */
		if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
			list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
		} else {
			ASSERT(list_empty(&bp->b_lru));
		}

		ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
		rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head,
				       xfs_buf_hash_params);
		spin_unlock(&pag->pag_buf_lock);
		xfs_perag_put(pag);
		freebuf = true;
	}

out_unlock:
	spin_unlock(&bp->b_lock);

	if (freebuf)
		xfs_buf_free(bp);
}
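
/*
 * Illustrative sketch only (hypothetical caller): any extra reference taken
 * with xfs_buf_hold() must be balanced by xfs_buf_rele() once the holder is
 * finished with the buffer (xfs_buf_relse() unlocks and releases in one call):
 *
 *	xfs_buf_hold(bp);
 *	... hand the buffer to another context ...
 *	xfs_buf_rele(bp);
 */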
/*
 * Lock a buffer object, if it is not already locked.
 *
 * If we come across a stale, pinned, locked buffer, we know that we are
 * being asked to lock a buffer that has been reallocated. Because it is
 * pinned, we know that the log has not been pushed to disk and hence it
 * will still be locked. Rather than continuing to have trylock attempts
 * fail until someone else pushes the log, push it ourselves before
 * returning. This means that the xfsaild will not get stuck trying
 * to push on stale inode buffers.
 */
int
xfs_buf_trylock(
	struct xfs_buf		*bp)
{
	int			locked;

	locked = down_trylock(&bp->b_sema) == 0;
	if (locked)
		trace_xfs_buf_trylock(bp, _RET_IP_);
	else
		trace_xfs_buf_trylock_fail(bp, _RET_IP_);
	return locked;
}

/*
 * Lock a buffer object.
 *
 * If we come across a stale, pinned, locked buffer, we know that we
 * are being asked to lock a buffer that has been reallocated. Because
 * it is pinned, we know that the log has not been pushed to disk and
 * hence it will still be locked. Rather than sleeping until someone
 * else pushes the log, push it ourselves before trying to get the lock.
 */
void
xfs_buf_lock(
	struct xfs_buf		*bp)
{
	trace_xfs_buf_lock(bp, _RET_IP_);

	if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
		xfs_log_force(bp->b_target->bt_mount, 0);
	down(&bp->b_sema);

	trace_xfs_buf_lock_done(bp, _RET_IP_);
}

void
xfs_buf_unlock(
	struct xfs_buf		*bp)
{
	ASSERT(xfs_buf_islocked(bp));

	up(&bp->b_sema);
	trace_xfs_buf_unlock(bp, _RET_IP_);
}

STATIC void
xfs_buf_wait_unpin(
	xfs_buf_t		*bp)
{
	DECLARE_WAITQUEUE	(wait, current);

	if (atomic_read(&bp->b_pin_count) == 0)
		return;

	add_wait_queue(&bp->b_waiters, &wait);
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (atomic_read(&bp->b_pin_count) == 0)
			break;
		io_schedule();
	}
	remove_wait_queue(&bp->b_waiters, &wait);
	set_current_state(TASK_RUNNING);
}

/*
 *	Buffer Utility Routines
 */

void
xfs_buf_ioend(
	struct xfs_buf	*bp)
{
	bool		read = bp->b_flags & XBF_READ;

	trace_xfs_buf_iodone(bp, _RET_IP_);

	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);

	/*
	 * Pull in IO completion errors now. We are guaranteed to be running
	 * single threaded, so we don't need the lock to read b_io_error.
	 */
	if (!bp->b_error && bp->b_io_error)
		xfs_buf_ioerror(bp, bp->b_io_error);

	/* Only validate buffers that were read without errors */
	if (read && !bp->b_error && bp->b_ops) {
		ASSERT(!bp->b_iodone);
		bp->b_ops->verify_read(bp);
	}

	if (!bp->b_error)
		bp->b_flags |= XBF_DONE;

	if (bp->b_iodone)
		(*(bp->b_iodone))(bp);
	else if (bp->b_flags & XBF_ASYNC)
		xfs_buf_relse(bp);
	else
		complete(&bp->b_iowait);
}

static void
xfs_buf_ioend_work(
	struct work_struct	*work)
{
	struct xfs_buf		*bp =
		container_of(work, xfs_buf_t, b_ioend_work);

	xfs_buf_ioend(bp);
}

static void
xfs_buf_ioend_async(
	struct xfs_buf	*bp)
{
	INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
	queue_work(bp->b_ioend_wq, &bp->b_ioend_work);
}

void
__xfs_buf_ioerror(
	xfs_buf_t		*bp,
	int			error,
	xfs_failaddr_t		failaddr)
{
	ASSERT(error <= 0 && error >= -1000);
	bp->b_error = error;
	trace_xfs_buf_ioerror(bp, error, failaddr);
}

void
xfs_buf_ioerror_alert(
	struct xfs_buf		*bp,
	const char		*func)
{
	xfs_alert(bp->b_target->bt_mount,
"metadata I/O error in \"%s\" at daddr 0x%llx len %d error %d",
			func, (uint64_t)XFS_BUF_ADDR(bp), bp->b_length,
			-bp->b_error);
}

int
xfs_bwrite(
	struct xfs_buf		*bp)
{
	int			error;

	ASSERT(xfs_buf_islocked(bp));

	bp->b_flags |= XBF_WRITE;
	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q |
			 XBF_WRITE_FAIL | XBF_DONE);

	error = xfs_buf_submit(bp);
	if (error) {
		xfs_force_shutdown(bp->b_target->bt_mount,
				   SHUTDOWN_META_IO_ERROR);
	}
	return error;
}

static void
xfs_buf_bio_end_io(
	struct bio		*bio)
{
	struct xfs_buf		*bp = (struct xfs_buf *)bio->bi_private;

	/*
	 * don't overwrite existing errors - otherwise we can lose errors on
	 * buffers that require multiple bios to complete.
	 */
	if (bio->bi_status) {
		int error = blk_status_to_errno(bio->bi_status);

		cmpxchg(&bp->b_io_error, 0, error);
	}

	if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));

	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
		xfs_buf_ioend_async(bp);
	bio_put(bio);
}

static void
xfs_buf_ioapply_map(
	struct xfs_buf	*bp,
	int		map,
	int		*buf_offset,
	int		*count,
	int		op,
	int		op_flags)
{
	int		page_index;
	int		total_nr_pages = bp->b_page_count;
	int		nr_pages;
	struct bio	*bio;
	sector_t	sector = bp->b_maps[map].bm_bn;
	int		size;
	int		offset;

	/* skip the pages in the buffer before the start offset */
	page_index = 0;
	offset = *buf_offset;
	while (offset >= PAGE_SIZE) {
		page_index++;
		offset -= PAGE_SIZE;
	}

	/*
	 * Limit the IO size to the length of the current vector, and update
	 * the remaining IO count for the next time around.
	 */
	size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count);
	*count -= size;
	*buf_offset += size;

next_chunk:
	atomic_inc(&bp->b_io_remaining);
	nr_pages = min(total_nr_pages, BIO_MAX_PAGES);

	bio = bio_alloc(GFP_NOIO, nr_pages);
	bio_set_dev(bio, bp->b_target->bt_bdev);
	bio->bi_iter.bi_sector = sector;
	bio->bi_end_io = xfs_buf_bio_end_io;
	bio->bi_private = bp;
	bio_set_op_attrs(bio, op, op_flags);

	for (; size && nr_pages; nr_pages--, page_index++) {
		int	rbytes, nbytes = PAGE_SIZE - offset;

		if (nbytes > size)
			nbytes = size;

		rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes,
				      offset);
		if (rbytes < nbytes)
			break;

		offset = 0;
		sector += BTOBB(nbytes);
		size -= nbytes;
		total_nr_pages--;
	}

	if (likely(bio->bi_iter.bi_size)) {
		if (xfs_buf_is_vmapped(bp)) {
			flush_kernel_vmap_range(bp->b_addr,
						xfs_buf_vmap_len(bp));
		}
		submit_bio(bio);
		if (size)
			goto next_chunk;
	} else {
		/*
		 * This is guaranteed not to be the last io reference count
		 * because the caller (xfs_buf_submit) holds a count itself.
		 */
		atomic_dec(&bp->b_io_remaining);
		xfs_buf_ioerror(bp, -EIO);
		bio_put(bio);
	}

}

STATIC void
_xfs_buf_ioapply(
	struct xfs_buf	*bp)
{
	struct blk_plug	plug;
	int		op;
	int		op_flags = 0;
	int		offset;
	int		size;
	int		i;

	/*
	 * Make sure we capture only current IO errors rather than stale errors
	 * left over from previous use of the buffer (e.g. failed readahead).
	 */
	bp->b_error = 0;

	/*
	 * Initialize the I/O completion workqueue if we haven't yet or the
	 * submitter has not opted to specify a custom one.
	 */
	if (!bp->b_ioend_wq)
		bp->b_ioend_wq = bp->b_target->bt_mount->m_buf_workqueue;

	if (bp->b_flags & XBF_WRITE) {
		op = REQ_OP_WRITE;
		if (bp->b_flags & XBF_SYNCIO)
			op_flags = REQ_SYNC;
		if (bp->b_flags & XBF_FUA)
			op_flags |= REQ_FUA;
		if (bp->b_flags & XBF_FLUSH)
			op_flags |= REQ_PREFLUSH;

		/*
		 * Run the write verifier callback function if it exists. If
		 * this function fails it will mark the buffer with an error
		 * and the IO should not be dispatched.
		 */
		if (bp->b_ops) {
			bp->b_ops->verify_write(bp);
			if (bp->b_error) {
				xfs_force_shutdown(bp->b_target->bt_mount,
						   SHUTDOWN_CORRUPT_INCORE);
				return;
			}
		} else if (bp->b_bn != XFS_BUF_DADDR_NULL) {
			struct xfs_mount *mp = bp->b_target->bt_mount;

			/*
			 * non-crc filesystems don't attach verifiers during
			 * log recovery, so don't warn for such filesystems.
			 */
			if (xfs_sb_version_hascrc(&mp->m_sb)) {
				xfs_warn(mp,
					"%s: no buf ops on daddr 0x%llx len %d",
					__func__, bp->b_bn, bp->b_length);
				xfs_hex_dump(bp->b_addr,
						XFS_CORRUPTION_DUMP_LEN);
				dump_stack();
			}
		}
	} else if (bp->b_flags & XBF_READ_AHEAD) {
		op = REQ_OP_READ;
		op_flags = REQ_RAHEAD;
	} else {
		op = REQ_OP_READ;
	}

	/* we only use the buffer cache for meta-data */
	op_flags |= REQ_META;

	/*
	 * Walk all the vectors issuing IO on them. Set up the initial offset
	 * into the buffer and the desired IO size before we start -
	 * _xfs_buf_ioapply_vec() will modify them appropriately for each
	 * subsequent call.
	 */
	offset = bp->b_offset;
	size = BBTOB(bp->b_io_length);
	blk_start_plug(&plug);
	for (i = 0; i < bp->b_map_count; i++) {
		xfs_buf_ioapply_map(bp, i, &offset, &size, op, op_flags);
		if (bp->b_error)
			break;
		if (size <= 0)
			break;	/* all done */
	}
	blk_finish_plug(&plug);
}

/*
 * Wait for I/O completion of a sync buffer and return the I/O error code.
 */
static int
xfs_buf_iowait(
	struct xfs_buf	*bp)
{
	ASSERT(!(bp->b_flags & XBF_ASYNC));

	trace_xfs_buf_iowait(bp, _RET_IP_);
	wait_for_completion(&bp->b_iowait);
	trace_xfs_buf_iowait_done(bp, _RET_IP_);

	return bp->b_error;
}

/*
 * Buffer I/O submission path, read or write. Asynchronous submission transfers
 * the buffer lock ownership and the current reference to the IO. It is not
 * safe to reference the buffer after a call to this function unless the caller
 * holds an additional reference itself.
 */
int
__xfs_buf_submit(
	struct xfs_buf	*bp,
	bool		wait)
{
	int		error = 0;

	trace_xfs_buf_submit(bp, _RET_IP_);

	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));

	/* on shutdown we stale and complete the buffer immediately */
	if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
		xfs_buf_ioerror(bp, -EIO);
		bp->b_flags &= ~XBF_DONE;
		xfs_buf_stale(bp);
		if (bp->b_flags & XBF_ASYNC)
			xfs_buf_ioend(bp);
		return -EIO;
	}

	/*
	 * Grab a reference so the buffer does not go away underneath us. For
	 * async buffers, I/O completion drops the callers reference, which
	 * could occur before submission returns.
	 */
	xfs_buf_hold(bp);

	if (bp->b_flags & XBF_WRITE)
		xfs_buf_wait_unpin(bp);

	/* clear the internal error state to avoid spurious errors */
	bp->b_io_error = 0;

	/*
	 * Set the count to 1 initially, this will stop an I/O completion
	 * callout which happens before we have started all the I/O from
	 * calling xfs_buf_ioend too early.
	 */
	atomic_set(&bp->b_io_remaining, 1);
	if (bp->b_flags & XBF_ASYNC)
		xfs_buf_ioacct_inc(bp);
	_xfs_buf_ioapply(bp);

	/*
	 * If _xfs_buf_ioapply failed, we can get back here with only the IO
	 * reference we took above. If we drop it to zero, run completion so
	 * that we don't return to the caller with completion still pending.
	 */
	if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
		if (bp->b_error || !(bp->b_flags & XBF_ASYNC))
			xfs_buf_ioend(bp);
		else
			xfs_buf_ioend_async(bp);
	}

	if (wait)
		error = xfs_buf_iowait(bp);

	/*
	 * Release the hold that keeps the buffer referenced for the entire
	 * I/O. Note that if the buffer is async, it is not safe to reference
	 * after this release.
	 */
	xfs_buf_rele(bp);
	return error;
}

void *
xfs_buf_offset(
	struct xfs_buf		*bp,
	size_t			offset)
{
	struct page		*page;

	if (bp->b_addr)
		return bp->b_addr + offset;

	offset += bp->b_offset;
	page = bp->b_pages[offset >> PAGE_SHIFT];
	return page_address(page) + (offset & (PAGE_SIZE-1));
}

/*
 * Move data into or out of a buffer.
 */
void
xfs_buf_iomove(
	xfs_buf_t		*bp,	/* buffer to process		*/
	size_t			boff,	/* starting buffer offset	*/
	size_t			bsize,	/* length to copy		*/
	void			*data,	/* data address			*/
	xfs_buf_rw_t		mode)	/* read/write/zero flag		*/
{
	size_t			bend;

	bend = boff + bsize;
	while (boff < bend) {
		struct page	*page;
		int		page_index, page_offset, csize;

		page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
		page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
		page = bp->b_pages[page_index];
		csize = min_t(size_t, PAGE_SIZE - page_offset,
				      BBTOB(bp->b_io_length) - boff);

		ASSERT((csize + page_offset) <= PAGE_SIZE);

		switch (mode) {
		case XBRW_ZERO:
			memset(page_address(page) + page_offset, 0, csize);
			break;
		case XBRW_READ:
			memcpy(data, page_address(page) + page_offset, csize);
			break;
		case XBRW_WRITE:
			memcpy(page_address(page) + page_offset, data, csize);
		}

		boff += csize;
		data += csize;
	}
}

/*
 * Handling of buffer targets (buftargs).
 */

/*
 * Wait for any bufs with callbacks that have been submitted but have not yet
 * returned. These buffers will have an elevated hold count, so wait on those
 * while freeing all the buffers only held by the LRU.
 */
static enum lru_status
xfs_buftarg_wait_rele(
	struct list_head	*item,
	struct list_lru_one	*lru,
	spinlock_t		*lru_lock,
	void			*arg)

{
	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
	struct list_head	*dispose = arg;

	if (atomic_read(&bp->b_hold) > 1) {
		/* need to wait, so skip it this pass */
		trace_xfs_buf_wait_buftarg(bp, _RET_IP_);
		return LRU_SKIP;
	}
	if (!spin_trylock(&bp->b_lock))
		return LRU_SKIP;

	/*
	 * clear the LRU reference count so the buffer doesn't get
	 * ignored in xfs_buf_rele().
	 */
	atomic_set(&bp->b_lru_ref, 0);
	bp->b_state |= XFS_BSTATE_DISPOSE;
	list_lru_isolate_move(lru, item, dispose);
	spin_unlock(&bp->b_lock);
	return LRU_REMOVED;
}

void
xfs_wait_buftarg(
	struct xfs_buftarg	*btp)
{
	LIST_HEAD(dispose);
	int			loop = 0;

	/*
	 * First wait on the buftarg I/O count for all in-flight buffers to be
	 * released. This is critical as new buffers do not make the LRU until
	 * they are released.
	 *
	 * Next, flush the buffer workqueue to ensure all completion processing
	 * has finished. Just waiting on buffer locks is not sufficient for
	 * async IO as the reference count held over IO is not released until
	 * after the buffer lock is dropped. Hence we need to ensure here that
	 * all reference counts have been dropped before we start walking the
	 * LRU list.
	 */
	while (percpu_counter_sum(&btp->bt_io_count))
		delay(100);
	flush_workqueue(btp->bt_mount->m_buf_workqueue);

	/* loop until there is nothing left on the lru list. */
	while (list_lru_count(&btp->bt_lru)) {
		list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele,
			      &dispose, LONG_MAX);

		while (!list_empty(&dispose)) {
			struct xfs_buf *bp;
			bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
			list_del_init(&bp->b_lru);
			if (bp->b_flags & XBF_WRITE_FAIL) {
				xfs_alert(btp->bt_mount,
"Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
					(long long)bp->b_bn);
				xfs_alert(btp->bt_mount,
"Please run xfs_repair to determine the extent of the problem.");
			}
			xfs_buf_rele(bp);
		}
		if (loop++ != 0)
			delay(100);
	}
}

static enum lru_status
xfs_buftarg_isolate(
	struct list_head	*item,
	struct list_lru_one	*lru,
	spinlock_t		*lru_lock,
	void			*arg)
{
	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
	struct list_head	*dispose = arg;

	/*
	 * we are inverting the lru lock/bp->b_lock here, so use a trylock.
	 * If we fail to get the lock, just skip it.
	 */
	if (!spin_trylock(&bp->b_lock))
		return LRU_SKIP;
	/*
	 * Decrement the b_lru_ref count unless the value is already
	 * zero. If the value is already zero, we need to reclaim the
	 * buffer, otherwise it gets another trip through the LRU.
	 */
	if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
		spin_unlock(&bp->b_lock);
		return LRU_ROTATE;
	}

	bp->b_state |= XFS_BSTATE_DISPOSE;
	list_lru_isolate_move(lru, item, dispose);
	spin_unlock(&bp->b_lock);
	return LRU_REMOVED;
}

static unsigned long
xfs_buftarg_shrink_scan(
	struct shrinker		*shrink,
	struct shrink_control	*sc)
{
	struct xfs_buftarg	*btp = container_of(shrink,
					struct xfs_buftarg, bt_shrinker);
	LIST_HEAD(dispose);
	unsigned long		freed;

	freed = list_lru_shrink_walk(&btp->bt_lru, sc,
				     xfs_buftarg_isolate, &dispose);

	while (!list_empty(&dispose)) {
		struct xfs_buf *bp;
		bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
		list_del_init(&bp->b_lru);
		xfs_buf_rele(bp);
	}

	return freed;
}

static unsigned long
xfs_buftarg_shrink_count(
	struct shrinker		*shrink,
	struct shrink_control	*sc)
{
	struct xfs_buftarg	*btp = container_of(shrink,
					struct xfs_buftarg, bt_shrinker);
	return list_lru_shrink_count(&btp->bt_lru, sc);
}

void
xfs_free_buftarg(
	struct xfs_buftarg	*btp)
{
	unregister_shrinker(&btp->bt_shrinker);
	ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
	percpu_counter_destroy(&btp->bt_io_count);
	list_lru_destroy(&btp->bt_lru);

	xfs_blkdev_issue_flush(btp);

	kmem_free(btp);
}

int
xfs_setsize_buftarg(
	xfs_buftarg_t		*btp,
	unsigned int		sectorsize)
{
	/* Set up metadata sector size info */
	btp->bt_meta_sectorsize = sectorsize;
	btp->bt_meta_sectormask = sectorsize - 1;

	if (set_blocksize(btp->bt_bdev, sectorsize)) {
		xfs_warn(btp->bt_mount,
			"Cannot set_blocksize to %u on device %pg",
			sectorsize, btp->bt_bdev);
		return -EINVAL;
	}

	/* Set up device logical sector size mask */
	btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev);
	btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1;

	return 0;
}

/*
 * When allocating the initial buffer target we have not yet
 * read in the superblock, so don't know what sized sectors
 * are being used at this early stage.  Play safe.
 */
STATIC int
xfs_setsize_buftarg_early(
	xfs_buftarg_t		*btp,
	struct block_device	*bdev)
{
	return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev));
}

xfs_buftarg_t *
xfs_alloc_buftarg(
	struct xfs_mount	*mp,
	struct block_device	*bdev,
	struct dax_device	*dax_dev)
{
	xfs_buftarg_t		*btp;

	btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS);

	btp->bt_mount = mp;
	btp->bt_dev = bdev->bd_dev;
	btp->bt_bdev = bdev;
	btp->bt_daxdev = dax_dev;

	if (xfs_setsize_buftarg_early(btp, bdev))
		goto error_free;

	if (list_lru_init(&btp->bt_lru))
		goto error_free;

	if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
		goto error_lru;

	btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
	btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
	btp->bt_shrinker.seeks = DEFAULT_SEEKS;
	btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
	if (register_shrinker(&btp->bt_shrinker))
		goto error_pcpu;
	return btp;

error_pcpu:
	percpu_counter_destroy(&btp->bt_io_count);
error_lru:
	list_lru_destroy(&btp->bt_lru);
error_free:
	kmem_free(btp);
	return NULL;
}

/*
 * Cancel a delayed write list.
 *
 * Remove each buffer from the list, clear the delwri queue flag and drop the
 * associated buffer reference.
 */
void
xfs_buf_delwri_cancel(
	struct list_head	*list)
{
	struct xfs_buf		*bp;

	while (!list_empty(list)) {
		bp = list_first_entry(list, struct xfs_buf, b_list);

		xfs_buf_lock(bp);
		bp->b_flags &= ~_XBF_DELWRI_Q;
		list_del_init(&bp->b_list);
		xfs_buf_relse(bp);
	}
}

/*
 * Add a buffer to the delayed write list.
 *
 * This queues a buffer for writeout if it hasn't already been. Note that
 * neither this routine nor the buffer list submission functions perform
 * any internal synchronization. It is expected that the lists are thread-local
 * to the callers.
 *
 * Returns true if we queued up the buffer, or false if it already had
 * been on the buffer list.
 */
bool
xfs_buf_delwri_queue(
	struct xfs_buf		*bp,
	struct list_head	*list)
{
	ASSERT(xfs_buf_islocked(bp));
	ASSERT(!(bp->b_flags & XBF_READ));

	/*
	 * If the buffer is already marked delwri it already is queued up
	 * by someone else for immediate writeout. Just ignore it in that
	 * case.
	 */
	if (bp->b_flags & _XBF_DELWRI_Q) {
		trace_xfs_buf_delwri_queued(bp, _RET_IP_);
		return false;
	}

	trace_xfs_buf_delwri_queue(bp, _RET_IP_);

	/*
	 * If a buffer gets written out synchronously or marked stale while it
	 * is on a delwri list we lazily remove it. To do this, the other party
	 * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone.
	 * It remains referenced and on the list. In a rare corner case it
	 * might get readded to a delwri list after the synchronous writeout,
	 * in which case we just need to re-add the flag here.
	 */
	bp->b_flags |= _XBF_DELWRI_Q;
	if (list_empty(&bp->b_list)) {
		atomic_inc(&bp->b_hold);
		list_add_tail(&bp->b_list, list);
	}

	return true;
}

/*
 * Compare function is more complex than it needs to be because
 * the return value is only 32 bits and we are doing comparisons
 * on 64 bit values
 */
static int
xfs_buf_cmp(
	void		*priv,
	struct list_head *a,
	struct list_head *b)
{
	struct xfs_buf	*ap = container_of(a, struct xfs_buf, b_list);
	struct xfs_buf	*bp = container_of(b, struct xfs_buf, b_list);
	xfs_daddr_t		diff;

	diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn;
	if (diff < 0)
		return -1;
	if (diff > 0)
		return 1;
	return 0;
}

/*
 * Submit buffers for write. If wait_list is specified, the buffers are
 * submitted using sync I/O and placed on the wait list such that the caller
 * can iowait each buffer. Otherwise async I/O is used and the buffers are
 * released at I/O completion time. In either case, buffers remain locked
 * until I/O completes and the buffer is released from the queue.
 */
static int
xfs_buf_delwri_submit_buffers(
	struct list_head	*buffer_list,
	struct list_head	*wait_list)
{
	struct xfs_buf		*bp, *n;
	LIST_HEAD		(submit_list);
	int			pinned = 0;
	struct blk_plug		plug;

	list_sort(NULL, buffer_list, xfs_buf_cmp);

	blk_start_plug(&plug);
	list_for_each_entry_safe(bp, n, buffer_list, b_list) {
		if (!wait_list) {
			if (xfs_buf_ispinned(bp)) {
				pinned++;
				continue;
			}
			if (!xfs_buf_trylock(bp))
				continue;
		} else {
			xfs_buf_lock(bp);
		}

		/*
		 * Someone else might have written the buffer synchronously or
		 * marked it stale in the meantime. In that case only the
		 * _XBF_DELWRI_Q flag got cleared, and we have to drop the
		 * reference and remove it from the list here.
		 */
		if (!(bp->b_flags & _XBF_DELWRI_Q)) {
			list_del_init(&bp->b_list);
			xfs_buf_relse(bp);
			continue;
		}

		trace_xfs_buf_delwri_split(bp, _RET_IP_);

		/*
		 * If we have a wait list, each buffer (and associated delwri
		 * queue reference) transfers to it and is submitted
		 * synchronously. Otherwise, drop the buffer from the delwri
		 * queue and submit async.
		 */
		bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_WRITE_FAIL);
		bp->b_flags |= XBF_WRITE;
		if (wait_list) {
			bp->b_flags &= ~XBF_ASYNC;
			list_move_tail(&bp->b_list, wait_list);
		} else {
			bp->b_flags |= XBF_ASYNC;
			list_del_init(&bp->b_list);
		}
		__xfs_buf_submit(bp, false);
	}
	blk_finish_plug(&plug);

	return pinned;
}

/*
 * Write out a buffer list asynchronously.
 *
 * This will take the @buffer_list, write all non-locked and non-pinned buffers
 * out and not wait for I/O completion on any of the buffers. This interface
 * is only safely usable for callers that can track I/O completion by higher
 * level means, e.g. AIL pushing as the @buffer_list is consumed in this
 * function.
 *
 * Note: this function will skip buffers it would block on, and in doing so
 * leaves them on @buffer_list so they can be retried on a later pass. As such,
 * it is up to the caller to ensure that the buffer list is fully submitted or
 * cancelled appropriately when they are finished with the list. Failure to
 * cancel or resubmit the list until it is empty will result in leaked buffers
 * at unmount time.
 */
int
xfs_buf_delwri_submit_nowait(
	struct list_head	*buffer_list)
{
	return xfs_buf_delwri_submit_buffers(buffer_list, NULL);
}

/*
 * Write out a buffer list synchronously.
 *
 * This will take the @buffer_list, write all buffers out and wait for I/O
 * completion on all of the buffers. @buffer_list is consumed by the function,
 * so callers must have some other way of tracking buffers if they require such
 * functionality.
 */
int
xfs_buf_delwri_submit(
	struct list_head	*buffer_list)
{
	LIST_HEAD		(wait_list);
	int			error = 0, error2;
	struct xfs_buf		*bp;

	xfs_buf_delwri_submit_buffers(buffer_list, &wait_list);

	/* Wait for IO to complete. */
	while (!list_empty(&wait_list)) {
		bp = list_first_entry(&wait_list, struct xfs_buf, b_list);

		list_del_init(&bp->b_list);

		/*
		 * Wait on the locked buffer, check for errors and unlock and
		 * release the delwri queue reference.
		 */
		error2 = xfs_buf_iowait(bp);
		xfs_buf_relse(bp);
		if (!error)
			error = error2;
	}

	return error;
}

/*
 * Push a single buffer on a delwri queue.
 *
 * The purpose of this function is to submit a single buffer of a delwri queue
 * and return with the buffer still on the original queue. The waiting delwri
 * buffer submission infrastructure guarantees transfer of the delwri queue
 * buffer reference to a temporary wait list. We reuse this infrastructure to
 * transfer the buffer back to the original queue.
 *
 * Note the buffer transitions from the queued state, to the submitted and wait
 * listed state and back to the queued state during this call. The buffer
 * locking and queue management logic between _delwri_pushbuf() and
 * _delwri_queue() guarantee that the buffer cannot be queued to another list
 * before returning.
 */
int
xfs_buf_delwri_pushbuf(
	struct xfs_buf		*bp,
	struct list_head	*buffer_list)
{
	LIST_HEAD		(submit_list);
	int			error;

	ASSERT(bp->b_flags & _XBF_DELWRI_Q);

	trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_);

	/*
	 * Isolate the buffer to a new local list so we can submit it for I/O
	 * independently from the rest of the original list.
	 */
	xfs_buf_lock(bp);
	list_move(&bp->b_list, &submit_list);
	xfs_buf_unlock(bp);

	/*
	 * Delwri submission clears the DELWRI_Q buffer flag and returns with
	 * the buffer on the wait list with the original reference. Rather than
	 * bounce the buffer from a local wait list back to the original list
	 * after I/O completion, reuse the original list as the wait list.
	 */
	xfs_buf_delwri_submit_buffers(&submit_list, buffer_list);

	/*
	 * The buffer is now locked, under I/O and wait listed on the original
	 * delwri queue. Wait for I/O completion, restore the DELWRI_Q flag and
	 * return with the buffer unlocked and on the original queue.
	 */
	error = xfs_buf_iowait(bp);
	bp->b_flags |= _XBF_DELWRI_Q;
	xfs_buf_unlock(bp);

	return error;
}

int __init
xfs_buf_init(void)
{
	xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
						KM_ZONE_HWALIGN, NULL);
	if (!xfs_buf_zone)
		goto out;

	return 0;

 out:
	return -ENOMEM;
}

void
xfs_buf_terminate(void)
{
	kmem_zone_destroy(xfs_buf_zone);
}

void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
{
	/*
	 * Set the lru reference count to 0 based on the error injection tag.
	 * This allows userspace to disrupt buffer caching for debug/testing
	 * purposes.
	 */
	if (XFS_TEST_ERROR(false, bp->b_target->bt_mount,
			   XFS_ERRTAG_BUF_LRU_REF))
		lru_ref = 0;

	atomic_set(&bp->b_lru_ref, lru_ref);
}
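
/*
 * Illustrative sketch only (hypothetical caller, not part of this file): the
 * delwri queue interface above is used by building a caller-local list,
 * queueing locked buffers onto it, and submitting the whole list later:
 *
 *	LIST_HEAD(buffer_list);
 *
 *	xfs_buf_lock(bp);
 *	xfs_buf_delwri_queue(bp, &buffer_list);
 *	xfs_buf_unlock(bp);
 *	...
 *	error = xfs_buf_delwri_submit(&buffer_list);
 */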