// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include <linux/stddef.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/bio.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <linux/workqueue.h>
#include <linux/percpu.h>
#include <linux/blkdev.h>
#include <linux/hash.h>
#include <linux/kthread.h>
#include <linux/migrate.h>
#include <linux/backing-dev.h>
#include <linux/freezer.h>

#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_errortag.h"
#include "xfs_error.h"

/* Zone that struct xfs_buf objects in this file are allocated from. */
static kmem_zone_t *xfs_buf_zone;

/*
 * Translate buffer flags into a gfp mask: readahead requests get
 * __GFP_NORETRY, everything else GFP_NOFS; __GFP_NOWARN is always added.
 */
#define xb_to_gfp(flags) \
	((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN)

/*
 * Locking orders
 *
 * xfs_buf_ioacct_inc:
 * xfs_buf_ioacct_dec:
 *	b_sema (caller holds)
 *	  b_lock
 *
 * xfs_buf_stale:
 *	b_sema (caller holds)
 *	  b_lock
 *	    lru_lock
 *
 * xfs_buf_rele:
 *	b_lock
 *	  pag_buf_lock
 *	    lru_lock
 *
 * xfs_buftarg_wait_rele
 *	lru_lock
 *	  b_lock (trylock due to inversion)
 *
 * xfs_buftarg_isolate
 *	lru_lock
 *	  b_lock (trylock due to inversion)
 */

static inline int
xfs_buf_is_vmapped(
	struct xfs_buf	*bp)
{
	/*
	 * Return true if the buffer is vmapped.
	 *
	 * b_addr is null if the buffer is not mapped, but the code is clever
	 * enough to know it doesn't have to map a single page, so the check has
	 * to be both for b_addr and bp->b_page_count > 1.
	 */
	return bp->b_addr && bp->b_page_count > 1;
}

/*
 * Byte length covered by the buffer's pages, less the offset of the data
 * within the first page.
 */
static inline int
xfs_buf_vmap_len(
	struct xfs_buf	*bp)
{
	return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
}

/*
 * Bump the I/O in flight count on the buftarg if we haven't yet done so for
 * this buffer. The count is incremented once per buffer (per hold cycle)
 * because the corresponding decrement is deferred to buffer release. Buffers
 * can undergo I/O multiple times in a hold-release cycle and per buffer I/O
 * tracking adds unnecessary overhead. This is used for synchronization purposes
 * with unmount (see xfs_wait_buftarg()), so all we really need is a count of
 * in-flight buffers.
 *
 * Buffers that are never released (e.g., superblock, iclog buffers) must set
 * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count
 * never reaches zero and unmount hangs indefinitely.
 */
static inline void
xfs_buf_ioacct_inc(
	struct xfs_buf	*bp)
{
	/* Buffers opted out of accounting are never counted. */
	if (bp->b_flags & XBF_NO_IOACCT)
		return;

	ASSERT(bp->b_flags & XBF_ASYNC);
	/* b_lock covers the test-and-set of the in-flight state bit. */
	spin_lock(&bp->b_lock);
	if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) {
		bp->b_state |= XFS_BSTATE_IN_FLIGHT;
		percpu_counter_inc(&bp->b_target->bt_io_count);
	}
	spin_unlock(&bp->b_lock);
}

/*
 * Clear the in-flight state on a buffer about to be released to the LRU or
 * freed and unaccount from the buftarg.
 */
static inline void
__xfs_buf_ioacct_dec(
	struct xfs_buf	*bp)
{
	/* The caller must already hold b_lock; see xfs_buf_ioacct_dec(). */
	lockdep_assert_held(&bp->b_lock);

	if (bp->b_state & XFS_BSTATE_IN_FLIGHT) {
		bp->b_state &= ~XFS_BSTATE_IN_FLIGHT;
		percpu_counter_dec(&bp->b_target->bt_io_count);
	}
}

/*
 * Locked wrapper: take b_lock around __xfs_buf_ioacct_dec().
 */
static inline void
xfs_buf_ioacct_dec(
	struct xfs_buf	*bp)
{
	spin_lock(&bp->b_lock);
	__xfs_buf_ioacct_dec(bp);
	spin_unlock(&bp->b_lock);
}

/*
 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
 * b_lru_ref count so that the buffer is freed immediately when the buffer
 * reference count falls to zero. If the buffer is already on the LRU, we need
 * to remove the reference that LRU holds on the buffer.
 *
 * This prevents build-up of stale buffers on the LRU.
 */
void
xfs_buf_stale(
	struct xfs_buf	*bp)
{
	ASSERT(xfs_buf_islocked(bp));

	bp->b_flags |= XBF_STALE;

	/*
	 * Clear the delwri status so that a delwri queue walker will not
	 * flush this buffer to disk now that it is stale. The delwri queue has
	 * a reference to the buffer, so this is safe to do.
	 */
	bp->b_flags &= ~_XBF_DELWRI_Q;

	/*
	 * Once the buffer is marked stale and unlocked, a subsequent lookup
	 * could reset b_flags. There is no guarantee that the buffer is
	 * unaccounted (released to LRU) before that occurs. Drop in-flight
	 * status now to preserve accounting consistency.
	 */
	spin_lock(&bp->b_lock);
	__xfs_buf_ioacct_dec(bp);

	atomic_set(&bp->b_lru_ref, 0);
	/* Only drop the LRU's hold if we actually removed it from the LRU. */
	if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
	    (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
		atomic_dec(&bp->b_hold);

	ASSERT(atomic_read(&bp->b_hold) >= 1);
	spin_unlock(&bp->b_lock);
}

/*
 * Set up bp->b_maps for @map_count entries. A single map uses the embedded
 * __b_map; anything larger gets a zeroed array from kmem_zalloc().
 *
 * Returns 0 on success or -ENOMEM if the array allocation fails.
 */
static int
xfs_buf_get_maps(
	struct xfs_buf		*bp,
	int			map_count)
{
	ASSERT(bp->b_maps == NULL);
	bp->b_map_count = map_count;

	if (map_count == 1) {
		bp->b_maps = &bp->__b_map;
		return 0;
	}

	bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
				KM_NOFS);
	if (!bp->b_maps)
		return -ENOMEM;
	return 0;
}

/*
 *	Frees b_maps if it was allocated, i.e. if it does not point at the
 *	embedded __b_map.
 */
static void
xfs_buf_free_maps(
	struct xfs_buf	*bp)
{
	if (bp->b_maps != &bp->__b_map) {
		kmem_free(bp->b_maps);
		bp->b_maps = NULL;
	}
}

/*
 * Allocate and initialise a buffer describing the @nmaps extents in @map.
 *
 * The buffer is returned with a hold count of one and its semaphore
 * initialised to zero ("held, no waiters"). b_length is the sum of the map
 * lengths and b_io_length starts out equal to it. No data memory is attached
 * here.
 *
 * Returns the new buffer, or NULL if either the buffer or its map array could
 * not be allocated.
 */
struct xfs_buf *
_xfs_buf_alloc(
	struct xfs_buftarg	*target,
	struct xfs_buf_map	*map,
	int			nmaps,
	xfs_buf_flags_t		flags)
{
	struct xfs_buf		*bp;
	int			error;
	int			i;

	bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS);
	if (unlikely(!bp))
		return NULL;

	/*
	 * We don't want certain flags to appear in b_flags unless they are
	 * specifically set by later operations on the buffer.
	 */
	flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);

	atomic_set(&bp->b_hold, 1);
	atomic_set(&bp->b_lru_ref, 1);
	init_completion(&bp->b_iowait);
	INIT_LIST_HEAD(&bp->b_lru);
	INIT_LIST_HEAD(&bp->b_list);
	INIT_LIST_HEAD(&bp->b_li_list);
	sema_init(&bp->b_sema, 0); /* held, no waiters */
	spin_lock_init(&bp->b_lock);
	bp->b_target = target;
	bp->b_flags = flags;

	/*
	 * Set length and io_length to the same value initially.
	 * I/O routines should use io_length, which will be the same in
	 * most cases but may be reset (e.g. XFS recovery).
	 */
	error = xfs_buf_get_maps(bp, nmaps);
	if (error) {
		kmem_zone_free(xfs_buf_zone, bp);
		return NULL;
	}

	bp->b_bn = map[0].bm_bn;
	bp->b_length = 0;
	for (i = 0; i < nmaps; i++) {
		bp->b_maps[i].bm_bn = map[i].bm_bn;
		bp->b_maps[i].bm_len = map[i].bm_len;
		bp->b_length += map[i].bm_len;
	}
	bp->b_io_length = bp->b_length;

	atomic_set(&bp->b_pin_count, 0);
	init_waitqueue_head(&bp->b_waiters);

	XFS_STATS_INC(target->bt_mount, xb_create);
	trace_xfs_buf_init(bp, _RET_IP_);

	return bp;
}

/*
 *	Allocate a page array capable of holding a specified number
 *	of pages, and point the page buf at it.
 *
 *	Does nothing if b_pages is already set. Arrays of up to XB_PAGES
 *	entries use the embedded b_page_array. Returns 0 or -ENOMEM.
 */
STATIC int
_xfs_buf_get_pages(
	xfs_buf_t		*bp,
	int			page_count)
{
	/* Make sure that we have a page list */
	if (bp->b_pages == NULL) {
		bp->b_page_count = page_count;
		if (page_count <= XB_PAGES) {
			bp->b_pages = bp->b_page_array;
		} else {
			bp->b_pages = kmem_alloc(sizeof(struct page *) *
						 page_count, KM_NOFS);
			if (bp->b_pages == NULL)
				return -ENOMEM;
		}
		memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
	}
	return 0;
}

/*
 *	Frees b_pages if it was allocated.
 */
STATIC void
_xfs_buf_free_pages(
	xfs_buf_t	*bp)
{
	if (bp->b_pages != bp->b_page_array) {
		kmem_free(bp->b_pages);
		bp->b_pages = NULL;
	}
}

/*
 *	Releases the specified buffer.
 *
 * 	The modification state of any associated pages is left unchanged.
 * 	The buffer must not be on any hash - use xfs_buf_rele instead for
 * 	hashed and refcounted buffers
 */
void
xfs_buf_free(
	xfs_buf_t		*bp)
{
	trace_xfs_buf_free(bp, _RET_IP_);

	ASSERT(list_empty(&bp->b_lru));

	if (bp->b_flags & _XBF_PAGES) {
		uint		i;

		/* b_addr was advanced by b_offset when mapped; undo that. */
		if (xfs_buf_is_vmapped(bp))
			vm_unmap_ram(bp->b_addr - bp->b_offset,
					bp->b_page_count);

		for (i = 0; i < bp->b_page_count; i++) {
			struct page	*page = bp->b_pages[i];

			__free_page(page);
		}
	} else if (bp->b_flags & _XBF_KMEM)
		kmem_free(bp->b_addr);
	_xfs_buf_free_pages(bp);
	xfs_buf_free_maps(bp);
	kmem_zone_free(xfs_buf_zone, bp);
}

/*
 * Allocates all the pages for buffer in question and builds its page list.
 *
 * Buffers smaller than a page are first tried from the heap (_XBF_KMEM);
 * otherwise, or if that fails or straddles a page boundary, individual pages
 * are allocated (_XBF_PAGES). Page allocation is retried indefinitely unless
 * XBF_READ_AHEAD is set in @flags, in which case -ENOMEM is returned.
 */
STATIC int
xfs_buf_allocate_memory(
	xfs_buf_t		*bp,
	uint			flags)
{
	size_t			size;
	size_t			nbytes, offset;
	gfp_t			gfp_mask = xb_to_gfp(flags);
	unsigned short		page_count, i;
	xfs_off_t		start, end;
	int			error;

	/*
	 * for buffers that are contained within a single page, just allocate
	 * the memory from the heap - there's no need for the complexity of
	 * page arrays to keep allocation down to order 0.
	 */
	size = BBTOB(bp->b_length);
	if (size < PAGE_SIZE) {
		bp->b_addr = kmem_alloc(size, KM_NOFS);
		if (!bp->b_addr) {
			/* low memory - use alloc_page loop instead */
			goto use_alloc_page;
		}

		if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
		    ((unsigned long)bp->b_addr & PAGE_MASK)) {
			/* b_addr spans two pages - use alloc_page instead */
			kmem_free(bp->b_addr);
			bp->b_addr = NULL;
			goto use_alloc_page;
		}
		bp->b_offset = offset_in_page(bp->b_addr);
		bp->b_pages = bp->b_page_array;
		bp->b_pages[0] = virt_to_page(bp->b_addr);
		bp->b_page_count = 1;
		bp->b_flags |= _XBF_KMEM;
		return 0;
	}

use_alloc_page:
	/* Page count is derived from the first map's start and b_length. */
	start = BBTOB(bp->b_maps[0].bm_bn) >> PAGE_SHIFT;
	end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1)
								>> PAGE_SHIFT;
	page_count = end - start;
	error = _xfs_buf_get_pages(bp, page_count);
	if (unlikely(error))
		return error;

	offset = bp->b_offset;
	bp->b_flags |= _XBF_PAGES;

	for (i = 0; i < bp->b_page_count; i++) {
		struct page	*page;
		uint		retries = 0;
retry:
		page = alloc_page(gfp_mask);
		if (unlikely(page == NULL)) {
			if (flags & XBF_READ_AHEAD) {
				/* Limit the cleanup loop to pages we got. */
				bp->b_page_count = i;
				error = -ENOMEM;
				goto out_free_pages;
			}

			/*
			 * This could deadlock.
			 *
			 * But until all the XFS lowlevel code is revamped to
			 * handle buffer allocation failures we can't do much.
			 */
			if (!(++retries % 100))
				xfs_err(NULL,
		"%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
					current->comm, current->pid,
					__func__, gfp_mask);

			XFS_STATS_INC(bp->b_target->bt_mount, xb_page_retries);
			congestion_wait(BLK_RW_ASYNC, HZ/50);
			goto retry;
		}

		XFS_STATS_INC(bp->b_target->bt_mount, xb_page_found);

		/*
		 * NOTE(review): nbytes/size/offset are updated here but not
		 * read again in this function - looks like leftover
		 * bookkeeping; confirm before removing.
		 */
		nbytes = min_t(size_t, size, PAGE_SIZE - offset);
		size -= nbytes;
		bp->b_pages[i] = page;
		offset = 0;
	}
	return 0;

out_free_pages:
	for (i = 0; i < bp->b_page_count; i++)
		__free_page(bp->b_pages[i]);
	bp->b_flags &= ~_XBF_PAGES;
	return error;
}

/*
 *	Map buffer into kernel address-space if necessary.
 *
 *	A single page is addressed directly; multi-page buffers are left
 *	unmapped when XBF_UNMAPPED is set in @flags, otherwise vm_map_ram() is
 *	attempted up to three times. Returns 0 or -ENOMEM.
 */
STATIC int
_xfs_buf_map_pages(
	xfs_buf_t		*bp,
	uint			flags)
{
	ASSERT(bp->b_flags & _XBF_PAGES);
	if (bp->b_page_count == 1) {
		/* A single page buffer is always mappable */
		bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
	} else if (flags & XBF_UNMAPPED) {
		bp->b_addr = NULL;
	} else {
		int retried = 0;
		unsigned nofs_flag;

		/*
		 * vm_map_ram() will allocate auxiliary structures (e.g.
		 * pagetables) with GFP_KERNEL, yet we are likely to be under
		 * GFP_NOFS context here. Hence we need to tell memory reclaim
		 * that we are in such a context via PF_MEMALLOC_NOFS to prevent
		 * memory reclaim re-entering the filesystem here and
		 * potentially deadlocking.
		 */
		nofs_flag = memalloc_nofs_save();
		do {
			bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
						-1, PAGE_KERNEL);
			if (bp->b_addr)
				break;
			vm_unmap_aliases();
		} while (retried++ <= 1);
		memalloc_nofs_restore(nofs_flag);

		if (!bp->b_addr)
			return -ENOMEM;
		bp->b_addr += bp->b_offset;
	}

	return 0;
}

/*
 *	Finding and Reading Buffers
 */

/*
 * rhashtable compare callback: returns 0 when @obj matches the lookup map
 * in both block number and length, non-zero otherwise.
 */
static int
_xfs_buf_obj_cmp(
	struct rhashtable_compare_arg	*arg,
	const void			*obj)
{
	const struct xfs_buf_map	*map = arg->key;
	const struct xfs_buf		*bp = obj;

	/*
	 * The key hashing in the lookup path depends on the key being the
	 * first element of the compare_arg, make sure to assert this.
	 */
	BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0);

	if (bp->b_bn != map->bm_bn)
		return 1;

	if (unlikely(bp->b_length != map->bm_len)) {
		/*
		 * found a block number match. If the range doesn't
		 * match, the only way this is allowed is if the buffer
		 * in the cache is stale and the transaction that made
		 * it stale has not yet committed. i.e. we are
		 * reallocating a busy extent. Skip this buffer and
		 * continue searching for an exact match.
		 */
		ASSERT(bp->b_flags & XBF_STALE);
		return 1;
	}
	return 0;
}

/* Hash table parameters: buffers are keyed on b_bn (an xfs_daddr_t). */
static const struct rhashtable_params xfs_buf_hash_params = {
	.min_size		= 32,	/* empty AGs have minimal footprint */
	.nelem_hint		= 16,
	.key_len		= sizeof(xfs_daddr_t),
	.key_offset		= offsetof(struct xfs_buf, b_bn),
	.head_offset		= offsetof(struct xfs_buf, b_rhash_head),
	.automatic_shrinking	= true,
	.obj_cmpfn		= _xfs_buf_obj_cmp,
};

/*
 * Initialise the per-AG buffer hash and its lock. Returns the result of
 * rhashtable_init().
 */
int
xfs_buf_hash_init(
	struct xfs_perag	*pag)
{
	spin_lock_init(&pag->pag_buf_lock);
	return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params);
}

/* Tear down the per-AG buffer hash. */
void
xfs_buf_hash_destroy(
	struct xfs_perag	*pag)
{
	rhashtable_destroy(&pag->pag_buf_hash);
}

/*
 * Look up a buffer in the buffer cache and return it referenced and locked
 * in @found_bp.
 *
 * If @new_bp is supplied and we have a lookup miss, insert @new_bp into the
 * cache.
 *
 * If XBF_TRYLOCK is set in @flags, only try to lock the buffer and return
 * -EAGAIN if we fail to lock it.
 *
 * Return values are:
 *	-EFSCORRUPTED if have been supplied with an invalid address
 *	-EAGAIN on trylock failure
 *	-ENOENT if we fail to find a match and @new_bp was NULL
 *	0, with @found_bp:
 *		- @new_bp if we inserted it into the cache
 *		- the buffer we found and locked.
570 */ 571 static int 572 xfs_buf_find( 573 struct xfs_buftarg *btp, 574 struct xfs_buf_map *map, 575 int nmaps, 576 xfs_buf_flags_t flags, 577 struct xfs_buf *new_bp, 578 struct xfs_buf **found_bp) 579 { 580 struct xfs_perag *pag; 581 xfs_buf_t *bp; 582 struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn }; 583 xfs_daddr_t eofs; 584 int i; 585 586 *found_bp = NULL; 587 588 for (i = 0; i < nmaps; i++) 589 cmap.bm_len += map[i].bm_len; 590 591 /* Check for IOs smaller than the sector size / not sector aligned */ 592 ASSERT(!(BBTOB(cmap.bm_len) < btp->bt_meta_sectorsize)); 593 ASSERT(!(BBTOB(cmap.bm_bn) & (xfs_off_t)btp->bt_meta_sectormask)); 594 595 /* 596 * Corrupted block numbers can get through to here, unfortunately, so we 597 * have to check that the buffer falls within the filesystem bounds. 598 */ 599 eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); 600 if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) { 601 xfs_alert(btp->bt_mount, 602 "%s: daddr 0x%llx out of range, EOFS 0x%llx", 603 __func__, cmap.bm_bn, eofs); 604 WARN_ON(1); 605 return -EFSCORRUPTED; 606 } 607 608 pag = xfs_perag_get(btp->bt_mount, 609 xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn)); 610 611 spin_lock(&pag->pag_buf_lock); 612 bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap, 613 xfs_buf_hash_params); 614 if (bp) { 615 atomic_inc(&bp->b_hold); 616 goto found; 617 } 618 619 /* No match found */ 620 if (!new_bp) { 621 XFS_STATS_INC(btp->bt_mount, xb_miss_locked); 622 spin_unlock(&pag->pag_buf_lock); 623 xfs_perag_put(pag); 624 return -ENOENT; 625 } 626 627 /* the buffer keeps the perag reference until it is freed */ 628 new_bp->b_pag = pag; 629 rhashtable_insert_fast(&pag->pag_buf_hash, &new_bp->b_rhash_head, 630 xfs_buf_hash_params); 631 spin_unlock(&pag->pag_buf_lock); 632 *found_bp = new_bp; 633 return 0; 634 635 found: 636 spin_unlock(&pag->pag_buf_lock); 637 xfs_perag_put(pag); 638 639 if (!xfs_buf_trylock(bp)) { 640 if (flags & XBF_TRYLOCK) { 641 xfs_buf_rele(bp); 642 
XFS_STATS_INC(btp->bt_mount, xb_busy_locked); 643 return -EAGAIN; 644 } 645 xfs_buf_lock(bp); 646 XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited); 647 } 648 649 /* 650 * if the buffer is stale, clear all the external state associated with 651 * it. We need to keep flags such as how we allocated the buffer memory 652 * intact here. 653 */ 654 if (bp->b_flags & XBF_STALE) { 655 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); 656 ASSERT(bp->b_iodone == NULL); 657 bp->b_flags &= _XBF_KMEM | _XBF_PAGES; 658 bp->b_ops = NULL; 659 } 660 661 trace_xfs_buf_find(bp, flags, _RET_IP_); 662 XFS_STATS_INC(btp->bt_mount, xb_get_locked); 663 *found_bp = bp; 664 return 0; 665 } 666 667 struct xfs_buf * 668 xfs_buf_incore( 669 struct xfs_buftarg *target, 670 xfs_daddr_t blkno, 671 size_t numblks, 672 xfs_buf_flags_t flags) 673 { 674 struct xfs_buf *bp; 675 int error; 676 DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); 677 678 error = xfs_buf_find(target, &map, 1, flags, NULL, &bp); 679 if (error) 680 return NULL; 681 return bp; 682 } 683 684 /* 685 * Assembles a buffer covering the specified range. The code is optimised for 686 * cache hits, as metadata intensive workloads will see 3 orders of magnitude 687 * more hits than misses. 688 */ 689 struct xfs_buf * 690 xfs_buf_get_map( 691 struct xfs_buftarg *target, 692 struct xfs_buf_map *map, 693 int nmaps, 694 xfs_buf_flags_t flags) 695 { 696 struct xfs_buf *bp; 697 struct xfs_buf *new_bp; 698 int error = 0; 699 700 error = xfs_buf_find(target, map, nmaps, flags, NULL, &bp); 701 702 switch (error) { 703 case 0: 704 /* cache hit */ 705 goto found; 706 case -EAGAIN: 707 /* cache hit, trylock failure, caller handles failure */ 708 ASSERT(flags & XBF_TRYLOCK); 709 return NULL; 710 case -ENOENT: 711 /* cache miss, go for insert */ 712 break; 713 case -EFSCORRUPTED: 714 default: 715 /* 716 * None of the higher layers understand failure types 717 * yet, so return NULL to signal a fatal lookup error. 
718 */ 719 return NULL; 720 } 721 722 new_bp = _xfs_buf_alloc(target, map, nmaps, flags); 723 if (unlikely(!new_bp)) 724 return NULL; 725 726 error = xfs_buf_allocate_memory(new_bp, flags); 727 if (error) { 728 xfs_buf_free(new_bp); 729 return NULL; 730 } 731 732 error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp); 733 if (error) { 734 xfs_buf_free(new_bp); 735 return NULL; 736 } 737 738 if (bp != new_bp) 739 xfs_buf_free(new_bp); 740 741 found: 742 if (!bp->b_addr) { 743 error = _xfs_buf_map_pages(bp, flags); 744 if (unlikely(error)) { 745 xfs_warn(target->bt_mount, 746 "%s: failed to map pagesn", __func__); 747 xfs_buf_relse(bp); 748 return NULL; 749 } 750 } 751 752 /* 753 * Clear b_error if this is a lookup from a caller that doesn't expect 754 * valid data to be found in the buffer. 755 */ 756 if (!(flags & XBF_READ)) 757 xfs_buf_ioerror(bp, 0); 758 759 XFS_STATS_INC(target->bt_mount, xb_get); 760 trace_xfs_buf_get(bp, flags, _RET_IP_); 761 return bp; 762 } 763 764 STATIC int 765 _xfs_buf_read( 766 xfs_buf_t *bp, 767 xfs_buf_flags_t flags) 768 { 769 ASSERT(!(flags & XBF_WRITE)); 770 ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL); 771 772 bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD); 773 bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); 774 775 return xfs_buf_submit(bp); 776 } 777 778 /* 779 * Set buffer ops on an unchecked buffer and validate it, if possible. 780 * 781 * If the caller passed in an ops structure and the buffer doesn't have ops 782 * assigned, set the ops and use them to verify the contents. If the contents 783 * cannot be verified, we'll clear XBF_DONE. We assume the buffer has no 784 * recorded errors and is already in XBF_DONE state. 785 * 786 * Under normal operations, every in-core buffer must have buffer ops assigned 787 * to them when the buffer is read in from disk so that we can validate the 788 * metadata. 
789 * 790 * However, there are two scenarios where one can encounter in-core buffers 791 * that don't have buffer ops. The first is during log recovery of buffers on 792 * a V4 filesystem, though these buffers are purged at the end of recovery. 793 * 794 * The other is online repair, which tries to match arbitrary metadata blocks 795 * with btree types in order to find the root. If online repair doesn't match 796 * the buffer with /any/ btree type, the buffer remains in memory in DONE state 797 * with no ops, and a subsequent read_buf call from elsewhere will not set the 798 * ops. This function helps us fix this situation. 799 */ 800 int 801 xfs_buf_ensure_ops( 802 struct xfs_buf *bp, 803 const struct xfs_buf_ops *ops) 804 { 805 ASSERT(bp->b_flags & XBF_DONE); 806 ASSERT(bp->b_error == 0); 807 808 if (!ops || bp->b_ops) 809 return 0; 810 811 bp->b_ops = ops; 812 bp->b_ops->verify_read(bp); 813 if (bp->b_error) 814 bp->b_flags &= ~XBF_DONE; 815 return bp->b_error; 816 } 817 818 xfs_buf_t * 819 xfs_buf_read_map( 820 struct xfs_buftarg *target, 821 struct xfs_buf_map *map, 822 int nmaps, 823 xfs_buf_flags_t flags, 824 const struct xfs_buf_ops *ops) 825 { 826 struct xfs_buf *bp; 827 828 flags |= XBF_READ; 829 830 bp = xfs_buf_get_map(target, map, nmaps, flags); 831 if (!bp) 832 return NULL; 833 834 trace_xfs_buf_read(bp, flags, _RET_IP_); 835 836 if (!(bp->b_flags & XBF_DONE)) { 837 XFS_STATS_INC(target->bt_mount, xb_get_read); 838 bp->b_ops = ops; 839 _xfs_buf_read(bp, flags); 840 return bp; 841 } 842 843 xfs_buf_ensure_ops(bp, ops); 844 845 if (flags & XBF_ASYNC) { 846 /* 847 * Read ahead call which is already satisfied, 848 * drop the buffer 849 */ 850 xfs_buf_relse(bp); 851 return NULL; 852 } 853 854 /* We do not want read in the flags */ 855 bp->b_flags &= ~XBF_READ; 856 ASSERT(bp->b_ops != NULL || ops == NULL); 857 return bp; 858 } 859 860 /* 861 * If we are not low on memory then do the readahead in a deadlock 862 * safe manner. 
863 */ 864 void 865 xfs_buf_readahead_map( 866 struct xfs_buftarg *target, 867 struct xfs_buf_map *map, 868 int nmaps, 869 const struct xfs_buf_ops *ops) 870 { 871 if (bdi_read_congested(target->bt_bdev->bd_bdi)) 872 return; 873 874 xfs_buf_read_map(target, map, nmaps, 875 XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops); 876 } 877 878 /* 879 * Read an uncached buffer from disk. Allocates and returns a locked 880 * buffer containing the disk contents or nothing. 881 */ 882 int 883 xfs_buf_read_uncached( 884 struct xfs_buftarg *target, 885 xfs_daddr_t daddr, 886 size_t numblks, 887 int flags, 888 struct xfs_buf **bpp, 889 const struct xfs_buf_ops *ops) 890 { 891 struct xfs_buf *bp; 892 893 *bpp = NULL; 894 895 bp = xfs_buf_get_uncached(target, numblks, flags); 896 if (!bp) 897 return -ENOMEM; 898 899 /* set up the buffer for a read IO */ 900 ASSERT(bp->b_map_count == 1); 901 bp->b_bn = XFS_BUF_DADDR_NULL; /* always null for uncached buffers */ 902 bp->b_maps[0].bm_bn = daddr; 903 bp->b_flags |= XBF_READ; 904 bp->b_ops = ops; 905 906 xfs_buf_submit(bp); 907 if (bp->b_error) { 908 int error = bp->b_error; 909 xfs_buf_relse(bp); 910 return error; 911 } 912 913 *bpp = bp; 914 return 0; 915 } 916 917 /* 918 * Return a buffer allocated as an empty buffer and associated to external 919 * memory via xfs_buf_associate_memory() back to it's empty state. 
 */
void
xfs_buf_set_empty(
	struct xfs_buf		*bp,
	size_t			numblks)
{
	if (bp->b_pages)
		_xfs_buf_free_pages(bp);

	bp->b_pages = NULL;
	bp->b_page_count = 0;
	bp->b_addr = NULL;
	bp->b_length = numblks;
	bp->b_io_length = numblks;

	ASSERT(bp->b_map_count == 1);
	bp->b_bn = XFS_BUF_DADDR_NULL;
	bp->b_maps[0].bm_bn = XFS_BUF_DADDR_NULL;
	bp->b_maps[0].bm_len = bp->b_length;
}

/*
 * Return the struct page behind @addr, using vmalloc_to_page() for vmalloc
 * addresses and virt_to_page() for everything else.
 */
static inline struct page *
mem_to_page(
	void			*addr)
{
	if ((!is_vmalloc_addr(addr))) {
		return virt_to_page(addr);
	} else {
		return vmalloc_to_page(addr);
	}
}

/*
 * Point @bp at caller-supplied memory: @mem becomes b_addr, the page array
 * is rebuilt from the pages covering @mem .. @mem + @len, and b_io_length /
 * b_length are set from @len and the page-aligned length respectively.
 *
 * Returns 0, or the error from _xfs_buf_get_pages().
 */
int
xfs_buf_associate_memory(
	xfs_buf_t		*bp,
	void			*mem,
	size_t			len)
{
	int			rval;
	int			i = 0;
	unsigned long		pageaddr;
	unsigned long		offset;
	size_t			buflen;
	int			page_count;

	pageaddr = (unsigned long)mem & PAGE_MASK;
	offset = (unsigned long)mem - pageaddr;
	buflen = PAGE_ALIGN(len + offset);
	page_count = buflen >> PAGE_SHIFT;

	/* Free any previous set of page pointers */
	if (bp->b_pages)
		_xfs_buf_free_pages(bp);

	bp->b_pages = NULL;
	bp->b_addr = mem;

	rval = _xfs_buf_get_pages(bp, page_count);
	if (rval)
		return rval;

	bp->b_offset = offset;

	for (i = 0; i < bp->b_page_count; i++) {
		bp->b_pages[i] = mem_to_page((void *)pageaddr);
		pageaddr += PAGE_SIZE;
	}

	bp->b_io_length = BTOBB(len);
	bp->b_length = BTOBB(buflen);

	return 0;
}

/*
 * Allocate a buffer of @numblks basic blocks that is not inserted into the
 * buffer cache (its block address is XFS_BUF_DADDR_NULL), back it with
 * freshly allocated pages and map them.
 *
 * Only XBF_NO_IOACCT from @flags is passed on to the buffer. Returns the
 * buffer, or NULL if any allocation or the mapping fails.
 */
xfs_buf_t *
xfs_buf_get_uncached(
	struct xfs_buftarg	*target,
	size_t			numblks,
	int			flags)
{
	unsigned long		page_count;
	int			error, i;
	struct xfs_buf		*bp;
	DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);

	/* flags might contain irrelevant bits, pass only what we care about */
	bp = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT);
	if (unlikely(bp == NULL))
		goto fail;

	page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT;
	error = _xfs_buf_get_pages(bp, page_count);
	if (error)
		goto fail_free_buf;

	for (i = 0; i < page_count; i++) {
		bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
		if (!bp->b_pages[i])
			goto fail_free_mem;
	}
	bp->b_flags |= _XBF_PAGES;

	error = _xfs_buf_map_pages(bp, 0);
	if (unlikely(error)) {
		xfs_warn(target->bt_mount,
			"%s: failed to map pages", __func__);
		goto fail_free_mem;
	}

	trace_xfs_buf_get_uncached(bp, _RET_IP_);
	return bp;

 fail_free_mem:
	/* i is the number of pages successfully allocated so far. */
	while (--i >= 0)
		__free_page(bp->b_pages[i]);
	_xfs_buf_free_pages(bp);
 fail_free_buf:
	xfs_buf_free_maps(bp);
	kmem_zone_free(xfs_buf_zone, bp);
 fail:
	return NULL;
}

/*
 *	Increment reference count on buffer, to hold the buffer concurrently
 *	with another thread which may release (free) the buffer asynchronously.
 *	Must hold the buffer already to call this function.
 */
void
xfs_buf_hold(
	xfs_buf_t		*bp)
{
	trace_xfs_buf_hold(bp, _RET_IP_);
	atomic_inc(&bp->b_hold);
}

/*
 * Release a hold on the specified buffer. If the hold count is 1, the buffer is
 * placed on LRU or freed (depending on b_lru_ref).
 */
void
xfs_buf_rele(
	xfs_buf_t		*bp)
{
	struct xfs_perag	*pag = bp->b_pag;
	bool			release;
	bool			freebuf = false;

	trace_xfs_buf_rele(bp, _RET_IP_);

	/* No perag attached: free directly once the last hold is dropped. */
	if (!pag) {
		ASSERT(list_empty(&bp->b_lru));
		if (atomic_dec_and_test(&bp->b_hold)) {
			xfs_buf_ioacct_dec(bp);
			xfs_buf_free(bp);
		}
		return;
	}

	ASSERT(atomic_read(&bp->b_hold) > 0);

	/*
	 * We grab the b_lock here first to serialise racing xfs_buf_rele()
	 * calls. The pag_buf_lock being taken on the last reference only
	 * serialises against racing lookups in xfs_buf_find(). IOWs, the second
	 * to last reference we drop here is not serialised against the last
	 * reference until we take bp->b_lock. Hence if we don't grab b_lock
	 * first, the last "release" reference can win the race to the lock and
	 * free the buffer before the second-to-last reference is processed,
	 * leading to a use-after-free scenario.
	 */
	spin_lock(&bp->b_lock);
	release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
	if (!release) {
		/*
		 * Drop the in-flight state if the buffer is already on the LRU
		 * and it holds the only reference. This is racy because we
		 * haven't acquired the pag lock, but the use of
		 * XFS_BSTATE_IN_FLIGHT ensures the decrement occurs only once
		 * per-buf.
		 */
		if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
			__xfs_buf_ioacct_dec(bp);
		goto out_unlock;
	}

	/* the last reference has been dropped ... */
	__xfs_buf_ioacct_dec(bp);
	if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
		/*
		 * If the buffer is added to the LRU take a new reference to the
		 * buffer for the LRU and clear the (now stale) dispose list
		 * state flag
		 */
		if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
			bp->b_state &= ~XFS_BSTATE_DISPOSE;
			atomic_inc(&bp->b_hold);
		}
		spin_unlock(&pag->pag_buf_lock);
	} else {
		/*
		 * most of the time buffers will already be removed from the
		 * LRU, so optimise that case by checking for the
		 * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
		 * was on was the disposal list
		 */
		if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
			list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
		} else {
			ASSERT(list_empty(&bp->b_lru));
		}

		ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
		rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head,
				       xfs_buf_hash_params);
		spin_unlock(&pag->pag_buf_lock);
		xfs_perag_put(pag);
		freebuf = true;
	}

out_unlock:
	spin_unlock(&bp->b_lock);

	/* The actual free happens only after b_lock has been dropped. */
	if (freebuf)
		xfs_buf_free(bp);
}


/*
 *	Lock a buffer object, if it is not already locked.
 *
 *	If we come across a stale, pinned, locked buffer, we know that we are
 *	being asked to lock a buffer that has been reallocated. Because it is
 *	pinned, we know that the log has not been pushed to disk and hence it
 *	will still be locked.  Rather than continuing to have trylock attempts
 *	fail until someone else pushes the log, push it ourselves before
 *	returning.  This means that the xfsaild will not get stuck trying
 *	to push on stale inode buffers.
 *
 *	NOTE(review): the body below only attempts down_trylock() and does not
 *	force the log, so the paragraph above does not describe what this
 *	function currently does - confirm and update.
 *
 *	Returns non-zero if the lock was obtained, zero otherwise.
 */
int
xfs_buf_trylock(
	struct xfs_buf		*bp)
{
	int			locked;

	locked = down_trylock(&bp->b_sema) == 0;
	if (locked)
		trace_xfs_buf_trylock(bp, _RET_IP_);
	else
		trace_xfs_buf_trylock_fail(bp, _RET_IP_);
	return locked;
}

/*
 *	Lock a buffer object.
 *
 *	If we come across a stale, pinned, locked buffer, we know that we
 *	are being asked to lock a buffer that has been reallocated. Because
 *	it is pinned, we know that the log has not been pushed to disk and
 *	hence it will still be locked. Rather than sleeping until someone
 *	else pushes the log, push it ourselves before trying to get the lock.
 */
void
xfs_buf_lock(
	struct xfs_buf		*bp)
{
	trace_xfs_buf_lock(bp, _RET_IP_);

	/* Stale and pinned: force the log before sleeping on the lock. */
	if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
		xfs_log_force(bp->b_target->bt_mount, 0);
	down(&bp->b_sema);

	trace_xfs_buf_lock_done(bp, _RET_IP_);
}

/*
 * Release the buffer lock. The buffer must currently be locked.
 */
void
xfs_buf_unlock(
	struct xfs_buf		*bp)
{
	ASSERT(xfs_buf_islocked(bp));

	up(&bp->b_sema);
	trace_xfs_buf_unlock(bp, _RET_IP_);
}

/*
 * Sleep uninterruptibly on b_waiters until the buffer's pin count is zero.
 * Returns immediately if the buffer is not pinned.
 */
STATIC void
xfs_buf_wait_unpin(
	xfs_buf_t		*bp)
{
	DECLARE_WAITQUEUE	(wait, current);

	if (atomic_read(&bp->b_pin_count) == 0)
		return;

	add_wait_queue(&bp->b_waiters, &wait);
	for (;;) {
		/* Set the task state before re-checking the pin count. */
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (atomic_read(&bp->b_pin_count) == 0)
			break;
		io_schedule();
	}
	remove_wait_queue(&bp->b_waiters, &wait);
	set_current_state(TASK_RUNNING);
}

/*
 *	Buffer Utility Routines
 */

/*
 * I/O completion processing for a buffer: clear the read/write/readahead
 * flags, pull in any recorded I/O error, run the read verifier on error-free
 * reads, set XBF_DONE when there is no error, and then hand off to b_iodone,
 * release the buffer (async I/O) or complete b_iowait.
 */
void
xfs_buf_ioend(
	struct xfs_buf	*bp)
{
	/* Sample the direction before the flags are cleared below. */
	bool		read = bp->b_flags & XBF_READ;

	trace_xfs_buf_iodone(bp, _RET_IP_);

	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);

	/*
	 * Pull in IO completion errors now. We are guaranteed to be running
	 * single threaded, so we don't need the lock to read b_io_error.
	 */
	if (!bp->b_error && bp->b_io_error)
		xfs_buf_ioerror(bp, bp->b_io_error);

	/* Only validate buffers that were read without errors */
	if (read && !bp->b_error && bp->b_ops) {
		ASSERT(!bp->b_iodone);
		bp->b_ops->verify_read(bp);
	}

	if (!bp->b_error)
		bp->b_flags |= XBF_DONE;

	if (bp->b_iodone)
		(*(bp->b_iodone))(bp);
	else if (bp->b_flags & XBF_ASYNC)
		xfs_buf_relse(bp);
	else
		complete(&bp->b_iowait);
}

/* Work item handler: run xfs_buf_ioend() on the buffer owning @work. */
static void
xfs_buf_ioend_work(
	struct work_struct	*work)
{
	struct xfs_buf		*bp =
		container_of(work, xfs_buf_t, b_ioend_work);

	xfs_buf_ioend(bp);
}

/* Queue xfs_buf_ioend() for @bp on the buffer's b_ioend_wq workqueue. */
static void
xfs_buf_ioend_async(
	struct xfs_buf	*bp)
{
	INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
	queue_work(bp->b_ioend_wq, &bp->b_ioend_work);
}

/*
 * Record @error in bp->b_error and emit the ioerror tracepoint with
 * @failaddr. @error is asserted to lie in the range [-1000, 0].
 */
void
__xfs_buf_ioerror(
	xfs_buf_t		*bp,
	int			error,
	xfs_failaddr_t		failaddr)
{
	ASSERT(error <= 0 && error >= -1000);
	bp->b_error = error;
	trace_xfs_buf_ioerror(bp, error, failaddr);
}

/*
 * Log an alert describing a metadata I/O error on @bp; @func names the
 * caller reporting it. The error is printed as a positive number.
 */
void
xfs_buf_ioerror_alert(
	struct xfs_buf		*bp,
	const char		*func)
{
	xfs_alert(bp->b_target->bt_mount,
"metadata I/O error in \"%s\" at daddr 0x%llx len %d error %d",
			func, (uint64_t)XFS_BUF_ADDR(bp), bp->b_length,
			-bp->b_error);
}

/*
 * Submit a locked buffer for writing with XBF_ASYNC cleared. If submission
 * returns an error the filesystem is shut down with SHUTDOWN_META_IO_ERROR.
 * Returns the result of xfs_buf_submit().
 */
int
xfs_bwrite(
	struct xfs_buf		*bp)
{
	int			error;

	ASSERT(xfs_buf_islocked(bp));

	bp->b_flags |= XBF_WRITE;
	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q |
			 XBF_WRITE_FAIL | XBF_DONE);

	error = xfs_buf_submit(bp);
	if (error) {
		xfs_force_shutdown(bp->b_target->bt_mount,
				   SHUTDOWN_META_IO_ERROR);
	}
	return error;
}

static void
xfs_buf_bio_end_io(
	struct bio		*bio)
{
	struct xfs_buf		*bp = (struct xfs_buf *)bio->bi_private;

	/*
	 * don't overwrite
existing errors - otherwise we can lose errors on 1330 * buffers that require multiple bios to complete. 1331 */ 1332 if (bio->bi_status) { 1333 int error = blk_status_to_errno(bio->bi_status); 1334 1335 cmpxchg(&bp->b_io_error, 0, error); 1336 } 1337 1338 if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) 1339 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); 1340 1341 if (atomic_dec_and_test(&bp->b_io_remaining) == 1) 1342 xfs_buf_ioend_async(bp); 1343 bio_put(bio); 1344 } 1345 1346 static void 1347 xfs_buf_ioapply_map( 1348 struct xfs_buf *bp, 1349 int map, 1350 int *buf_offset, 1351 int *count, 1352 int op, 1353 int op_flags) 1354 { 1355 int page_index; 1356 int total_nr_pages = bp->b_page_count; 1357 int nr_pages; 1358 struct bio *bio; 1359 sector_t sector = bp->b_maps[map].bm_bn; 1360 int size; 1361 int offset; 1362 1363 /* skip the pages in the buffer before the start offset */ 1364 page_index = 0; 1365 offset = *buf_offset; 1366 while (offset >= PAGE_SIZE) { 1367 page_index++; 1368 offset -= PAGE_SIZE; 1369 } 1370 1371 /* 1372 * Limit the IO size to the length of the current vector, and update the 1373 * remaining IO count for the next time around. 
1374 */ 1375 size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count); 1376 *count -= size; 1377 *buf_offset += size; 1378 1379 next_chunk: 1380 atomic_inc(&bp->b_io_remaining); 1381 nr_pages = min(total_nr_pages, BIO_MAX_PAGES); 1382 1383 bio = bio_alloc(GFP_NOIO, nr_pages); 1384 bio_set_dev(bio, bp->b_target->bt_bdev); 1385 bio->bi_iter.bi_sector = sector; 1386 bio->bi_end_io = xfs_buf_bio_end_io; 1387 bio->bi_private = bp; 1388 bio_set_op_attrs(bio, op, op_flags); 1389 1390 for (; size && nr_pages; nr_pages--, page_index++) { 1391 int rbytes, nbytes = PAGE_SIZE - offset; 1392 1393 if (nbytes > size) 1394 nbytes = size; 1395 1396 rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes, 1397 offset); 1398 if (rbytes < nbytes) 1399 break; 1400 1401 offset = 0; 1402 sector += BTOBB(nbytes); 1403 size -= nbytes; 1404 total_nr_pages--; 1405 } 1406 1407 if (likely(bio->bi_iter.bi_size)) { 1408 if (xfs_buf_is_vmapped(bp)) { 1409 flush_kernel_vmap_range(bp->b_addr, 1410 xfs_buf_vmap_len(bp)); 1411 } 1412 submit_bio(bio); 1413 if (size) 1414 goto next_chunk; 1415 } else { 1416 /* 1417 * This is guaranteed not to be the last io reference count 1418 * because the caller (xfs_buf_submit) holds a count itself. 1419 */ 1420 atomic_dec(&bp->b_io_remaining); 1421 xfs_buf_ioerror(bp, -EIO); 1422 bio_put(bio); 1423 } 1424 1425 } 1426 1427 STATIC void 1428 _xfs_buf_ioapply( 1429 struct xfs_buf *bp) 1430 { 1431 struct blk_plug plug; 1432 int op; 1433 int op_flags = 0; 1434 int offset; 1435 int size; 1436 int i; 1437 1438 /* 1439 * Make sure we capture only current IO errors rather than stale errors 1440 * left over from previous use of the buffer (e.g. failed readahead). 1441 */ 1442 bp->b_error = 0; 1443 1444 /* 1445 * Initialize the I/O completion workqueue if we haven't yet or the 1446 * submitter has not opted to specify a custom one. 
1447 */ 1448 if (!bp->b_ioend_wq) 1449 bp->b_ioend_wq = bp->b_target->bt_mount->m_buf_workqueue; 1450 1451 if (bp->b_flags & XBF_WRITE) { 1452 op = REQ_OP_WRITE; 1453 if (bp->b_flags & XBF_SYNCIO) 1454 op_flags = REQ_SYNC; 1455 if (bp->b_flags & XBF_FUA) 1456 op_flags |= REQ_FUA; 1457 if (bp->b_flags & XBF_FLUSH) 1458 op_flags |= REQ_PREFLUSH; 1459 1460 /* 1461 * Run the write verifier callback function if it exists. If 1462 * this function fails it will mark the buffer with an error and 1463 * the IO should not be dispatched. 1464 */ 1465 if (bp->b_ops) { 1466 bp->b_ops->verify_write(bp); 1467 if (bp->b_error) { 1468 xfs_force_shutdown(bp->b_target->bt_mount, 1469 SHUTDOWN_CORRUPT_INCORE); 1470 return; 1471 } 1472 } else if (bp->b_bn != XFS_BUF_DADDR_NULL) { 1473 struct xfs_mount *mp = bp->b_target->bt_mount; 1474 1475 /* 1476 * non-crc filesystems don't attach verifiers during 1477 * log recovery, so don't warn for such filesystems. 1478 */ 1479 if (xfs_sb_version_hascrc(&mp->m_sb)) { 1480 xfs_warn(mp, 1481 "%s: no buf ops on daddr 0x%llx len %d", 1482 __func__, bp->b_bn, bp->b_length); 1483 xfs_hex_dump(bp->b_addr, 1484 XFS_CORRUPTION_DUMP_LEN); 1485 dump_stack(); 1486 } 1487 } 1488 } else if (bp->b_flags & XBF_READ_AHEAD) { 1489 op = REQ_OP_READ; 1490 op_flags = REQ_RAHEAD; 1491 } else { 1492 op = REQ_OP_READ; 1493 } 1494 1495 /* we only use the buffer cache for meta-data */ 1496 op_flags |= REQ_META; 1497 1498 /* 1499 * Walk all the vectors issuing IO on them. Set up the initial offset 1500 * into the buffer and the desired IO size before we start - 1501 * _xfs_buf_ioapply_vec() will modify them appropriately for each 1502 * subsequent call. 
1503 */ 1504 offset = bp->b_offset; 1505 size = BBTOB(bp->b_io_length); 1506 blk_start_plug(&plug); 1507 for (i = 0; i < bp->b_map_count; i++) { 1508 xfs_buf_ioapply_map(bp, i, &offset, &size, op, op_flags); 1509 if (bp->b_error) 1510 break; 1511 if (size <= 0) 1512 break; /* all done */ 1513 } 1514 blk_finish_plug(&plug); 1515 } 1516 1517 /* 1518 * Wait for I/O completion of a sync buffer and return the I/O error code. 1519 */ 1520 static int 1521 xfs_buf_iowait( 1522 struct xfs_buf *bp) 1523 { 1524 ASSERT(!(bp->b_flags & XBF_ASYNC)); 1525 1526 trace_xfs_buf_iowait(bp, _RET_IP_); 1527 wait_for_completion(&bp->b_iowait); 1528 trace_xfs_buf_iowait_done(bp, _RET_IP_); 1529 1530 return bp->b_error; 1531 } 1532 1533 /* 1534 * Buffer I/O submission path, read or write. Asynchronous submission transfers 1535 * the buffer lock ownership and the current reference to the IO. It is not 1536 * safe to reference the buffer after a call to this function unless the caller 1537 * holds an additional reference itself. 1538 */ 1539 int 1540 __xfs_buf_submit( 1541 struct xfs_buf *bp, 1542 bool wait) 1543 { 1544 int error = 0; 1545 1546 trace_xfs_buf_submit(bp, _RET_IP_); 1547 1548 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); 1549 1550 /* on shutdown we stale and complete the buffer immediately */ 1551 if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { 1552 xfs_buf_ioerror(bp, -EIO); 1553 bp->b_flags &= ~XBF_DONE; 1554 xfs_buf_stale(bp); 1555 xfs_buf_ioend(bp); 1556 return -EIO; 1557 } 1558 1559 /* 1560 * Grab a reference so the buffer does not go away underneath us. For 1561 * async buffers, I/O completion drops the callers reference, which 1562 * could occur before submission returns. 
1563 */ 1564 xfs_buf_hold(bp); 1565 1566 if (bp->b_flags & XBF_WRITE) 1567 xfs_buf_wait_unpin(bp); 1568 1569 /* clear the internal error state to avoid spurious errors */ 1570 bp->b_io_error = 0; 1571 1572 /* 1573 * Set the count to 1 initially, this will stop an I/O completion 1574 * callout which happens before we have started all the I/O from calling 1575 * xfs_buf_ioend too early. 1576 */ 1577 atomic_set(&bp->b_io_remaining, 1); 1578 if (bp->b_flags & XBF_ASYNC) 1579 xfs_buf_ioacct_inc(bp); 1580 _xfs_buf_ioapply(bp); 1581 1582 /* 1583 * If _xfs_buf_ioapply failed, we can get back here with only the IO 1584 * reference we took above. If we drop it to zero, run completion so 1585 * that we don't return to the caller with completion still pending. 1586 */ 1587 if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { 1588 if (bp->b_error || !(bp->b_flags & XBF_ASYNC)) 1589 xfs_buf_ioend(bp); 1590 else 1591 xfs_buf_ioend_async(bp); 1592 } 1593 1594 if (wait) 1595 error = xfs_buf_iowait(bp); 1596 1597 /* 1598 * Release the hold that keeps the buffer referenced for the entire 1599 * I/O. Note that if the buffer is async, it is not safe to reference 1600 * after this release. 1601 */ 1602 xfs_buf_rele(bp); 1603 return error; 1604 } 1605 1606 void * 1607 xfs_buf_offset( 1608 struct xfs_buf *bp, 1609 size_t offset) 1610 { 1611 struct page *page; 1612 1613 if (bp->b_addr) 1614 return bp->b_addr + offset; 1615 1616 offset += bp->b_offset; 1617 page = bp->b_pages[offset >> PAGE_SHIFT]; 1618 return page_address(page) + (offset & (PAGE_SIZE-1)); 1619 } 1620 1621 /* 1622 * Move data into or out of a buffer. 
1623 */ 1624 void 1625 xfs_buf_iomove( 1626 xfs_buf_t *bp, /* buffer to process */ 1627 size_t boff, /* starting buffer offset */ 1628 size_t bsize, /* length to copy */ 1629 void *data, /* data address */ 1630 xfs_buf_rw_t mode) /* read/write/zero flag */ 1631 { 1632 size_t bend; 1633 1634 bend = boff + bsize; 1635 while (boff < bend) { 1636 struct page *page; 1637 int page_index, page_offset, csize; 1638 1639 page_index = (boff + bp->b_offset) >> PAGE_SHIFT; 1640 page_offset = (boff + bp->b_offset) & ~PAGE_MASK; 1641 page = bp->b_pages[page_index]; 1642 csize = min_t(size_t, PAGE_SIZE - page_offset, 1643 BBTOB(bp->b_io_length) - boff); 1644 1645 ASSERT((csize + page_offset) <= PAGE_SIZE); 1646 1647 switch (mode) { 1648 case XBRW_ZERO: 1649 memset(page_address(page) + page_offset, 0, csize); 1650 break; 1651 case XBRW_READ: 1652 memcpy(data, page_address(page) + page_offset, csize); 1653 break; 1654 case XBRW_WRITE: 1655 memcpy(page_address(page) + page_offset, data, csize); 1656 } 1657 1658 boff += csize; 1659 data += csize; 1660 } 1661 } 1662 1663 /* 1664 * Handling of buffer targets (buftargs). 1665 */ 1666 1667 /* 1668 * Wait for any bufs with callbacks that have been submitted but have not yet 1669 * returned. These buffers will have an elevated hold count, so wait on those 1670 * while freeing all the buffers only held by the LRU. 1671 */ 1672 static enum lru_status 1673 xfs_buftarg_wait_rele( 1674 struct list_head *item, 1675 struct list_lru_one *lru, 1676 spinlock_t *lru_lock, 1677 void *arg) 1678 1679 { 1680 struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); 1681 struct list_head *dispose = arg; 1682 1683 if (atomic_read(&bp->b_hold) > 1) { 1684 /* need to wait, so skip it this pass */ 1685 trace_xfs_buf_wait_buftarg(bp, _RET_IP_); 1686 return LRU_SKIP; 1687 } 1688 if (!spin_trylock(&bp->b_lock)) 1689 return LRU_SKIP; 1690 1691 /* 1692 * clear the LRU reference count so the buffer doesn't get 1693 * ignored in xfs_buf_rele(). 
1694 */ 1695 atomic_set(&bp->b_lru_ref, 0); 1696 bp->b_state |= XFS_BSTATE_DISPOSE; 1697 list_lru_isolate_move(lru, item, dispose); 1698 spin_unlock(&bp->b_lock); 1699 return LRU_REMOVED; 1700 } 1701 1702 void 1703 xfs_wait_buftarg( 1704 struct xfs_buftarg *btp) 1705 { 1706 LIST_HEAD(dispose); 1707 int loop = 0; 1708 1709 /* 1710 * First wait on the buftarg I/O count for all in-flight buffers to be 1711 * released. This is critical as new buffers do not make the LRU until 1712 * they are released. 1713 * 1714 * Next, flush the buffer workqueue to ensure all completion processing 1715 * has finished. Just waiting on buffer locks is not sufficient for 1716 * async IO as the reference count held over IO is not released until 1717 * after the buffer lock is dropped. Hence we need to ensure here that 1718 * all reference counts have been dropped before we start walking the 1719 * LRU list. 1720 */ 1721 while (percpu_counter_sum(&btp->bt_io_count)) 1722 delay(100); 1723 flush_workqueue(btp->bt_mount->m_buf_workqueue); 1724 1725 /* loop until there is nothing left on the lru list. 
*/ 1726 while (list_lru_count(&btp->bt_lru)) { 1727 list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele, 1728 &dispose, LONG_MAX); 1729 1730 while (!list_empty(&dispose)) { 1731 struct xfs_buf *bp; 1732 bp = list_first_entry(&dispose, struct xfs_buf, b_lru); 1733 list_del_init(&bp->b_lru); 1734 if (bp->b_flags & XBF_WRITE_FAIL) { 1735 xfs_alert(btp->bt_mount, 1736 "Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!", 1737 (long long)bp->b_bn); 1738 xfs_alert(btp->bt_mount, 1739 "Please run xfs_repair to determine the extent of the problem."); 1740 } 1741 xfs_buf_rele(bp); 1742 } 1743 if (loop++ != 0) 1744 delay(100); 1745 } 1746 } 1747 1748 static enum lru_status 1749 xfs_buftarg_isolate( 1750 struct list_head *item, 1751 struct list_lru_one *lru, 1752 spinlock_t *lru_lock, 1753 void *arg) 1754 { 1755 struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); 1756 struct list_head *dispose = arg; 1757 1758 /* 1759 * we are inverting the lru lock/bp->b_lock here, so use a trylock. 1760 * If we fail to get the lock, just skip it. 1761 */ 1762 if (!spin_trylock(&bp->b_lock)) 1763 return LRU_SKIP; 1764 /* 1765 * Decrement the b_lru_ref count unless the value is already 1766 * zero. If the value is already zero, we need to reclaim the 1767 * buffer, otherwise it gets another trip through the LRU. 
1768 */ 1769 if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) { 1770 spin_unlock(&bp->b_lock); 1771 return LRU_ROTATE; 1772 } 1773 1774 bp->b_state |= XFS_BSTATE_DISPOSE; 1775 list_lru_isolate_move(lru, item, dispose); 1776 spin_unlock(&bp->b_lock); 1777 return LRU_REMOVED; 1778 } 1779 1780 static unsigned long 1781 xfs_buftarg_shrink_scan( 1782 struct shrinker *shrink, 1783 struct shrink_control *sc) 1784 { 1785 struct xfs_buftarg *btp = container_of(shrink, 1786 struct xfs_buftarg, bt_shrinker); 1787 LIST_HEAD(dispose); 1788 unsigned long freed; 1789 1790 freed = list_lru_shrink_walk(&btp->bt_lru, sc, 1791 xfs_buftarg_isolate, &dispose); 1792 1793 while (!list_empty(&dispose)) { 1794 struct xfs_buf *bp; 1795 bp = list_first_entry(&dispose, struct xfs_buf, b_lru); 1796 list_del_init(&bp->b_lru); 1797 xfs_buf_rele(bp); 1798 } 1799 1800 return freed; 1801 } 1802 1803 static unsigned long 1804 xfs_buftarg_shrink_count( 1805 struct shrinker *shrink, 1806 struct shrink_control *sc) 1807 { 1808 struct xfs_buftarg *btp = container_of(shrink, 1809 struct xfs_buftarg, bt_shrinker); 1810 return list_lru_shrink_count(&btp->bt_lru, sc); 1811 } 1812 1813 void 1814 xfs_free_buftarg( 1815 struct xfs_buftarg *btp) 1816 { 1817 unregister_shrinker(&btp->bt_shrinker); 1818 ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); 1819 percpu_counter_destroy(&btp->bt_io_count); 1820 list_lru_destroy(&btp->bt_lru); 1821 1822 xfs_blkdev_issue_flush(btp); 1823 1824 kmem_free(btp); 1825 } 1826 1827 int 1828 xfs_setsize_buftarg( 1829 xfs_buftarg_t *btp, 1830 unsigned int sectorsize) 1831 { 1832 /* Set up metadata sector size info */ 1833 btp->bt_meta_sectorsize = sectorsize; 1834 btp->bt_meta_sectormask = sectorsize - 1; 1835 1836 if (set_blocksize(btp->bt_bdev, sectorsize)) { 1837 xfs_warn(btp->bt_mount, 1838 "Cannot set_blocksize to %u on device %pg", 1839 sectorsize, btp->bt_bdev); 1840 return -EINVAL; 1841 } 1842 1843 /* Set up device logical sector size mask */ 1844 btp->bt_logical_sectorsize 
= bdev_logical_block_size(btp->bt_bdev); 1845 btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1; 1846 1847 return 0; 1848 } 1849 1850 /* 1851 * When allocating the initial buffer target we have not yet 1852 * read in the superblock, so don't know what sized sectors 1853 * are being used at this early stage. Play safe. 1854 */ 1855 STATIC int 1856 xfs_setsize_buftarg_early( 1857 xfs_buftarg_t *btp, 1858 struct block_device *bdev) 1859 { 1860 return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev)); 1861 } 1862 1863 xfs_buftarg_t * 1864 xfs_alloc_buftarg( 1865 struct xfs_mount *mp, 1866 struct block_device *bdev, 1867 struct dax_device *dax_dev) 1868 { 1869 xfs_buftarg_t *btp; 1870 1871 btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS); 1872 1873 btp->bt_mount = mp; 1874 btp->bt_dev = bdev->bd_dev; 1875 btp->bt_bdev = bdev; 1876 btp->bt_daxdev = dax_dev; 1877 1878 if (xfs_setsize_buftarg_early(btp, bdev)) 1879 goto error_free; 1880 1881 if (list_lru_init(&btp->bt_lru)) 1882 goto error_free; 1883 1884 if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL)) 1885 goto error_lru; 1886 1887 btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count; 1888 btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan; 1889 btp->bt_shrinker.seeks = DEFAULT_SEEKS; 1890 btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE; 1891 if (register_shrinker(&btp->bt_shrinker)) 1892 goto error_pcpu; 1893 return btp; 1894 1895 error_pcpu: 1896 percpu_counter_destroy(&btp->bt_io_count); 1897 error_lru: 1898 list_lru_destroy(&btp->bt_lru); 1899 error_free: 1900 kmem_free(btp); 1901 return NULL; 1902 } 1903 1904 /* 1905 * Cancel a delayed write list. 1906 * 1907 * Remove each buffer from the list, clear the delwri queue flag and drop the 1908 * associated buffer reference. 
1909 */ 1910 void 1911 xfs_buf_delwri_cancel( 1912 struct list_head *list) 1913 { 1914 struct xfs_buf *bp; 1915 1916 while (!list_empty(list)) { 1917 bp = list_first_entry(list, struct xfs_buf, b_list); 1918 1919 xfs_buf_lock(bp); 1920 bp->b_flags &= ~_XBF_DELWRI_Q; 1921 list_del_init(&bp->b_list); 1922 xfs_buf_relse(bp); 1923 } 1924 } 1925 1926 /* 1927 * Add a buffer to the delayed write list. 1928 * 1929 * This queues a buffer for writeout if it hasn't already been. Note that 1930 * neither this routine nor the buffer list submission functions perform 1931 * any internal synchronization. It is expected that the lists are thread-local 1932 * to the callers. 1933 * 1934 * Returns true if we queued up the buffer, or false if it already had 1935 * been on the buffer list. 1936 */ 1937 bool 1938 xfs_buf_delwri_queue( 1939 struct xfs_buf *bp, 1940 struct list_head *list) 1941 { 1942 ASSERT(xfs_buf_islocked(bp)); 1943 ASSERT(!(bp->b_flags & XBF_READ)); 1944 1945 /* 1946 * If the buffer is already marked delwri it already is queued up 1947 * by someone else for imediate writeout. Just ignore it in that 1948 * case. 1949 */ 1950 if (bp->b_flags & _XBF_DELWRI_Q) { 1951 trace_xfs_buf_delwri_queued(bp, _RET_IP_); 1952 return false; 1953 } 1954 1955 trace_xfs_buf_delwri_queue(bp, _RET_IP_); 1956 1957 /* 1958 * If a buffer gets written out synchronously or marked stale while it 1959 * is on a delwri list we lazily remove it. To do this, the other party 1960 * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone. 1961 * It remains referenced and on the list. In a rare corner case it 1962 * might get readded to a delwri list after the synchronous writeout, in 1963 * which case we need just need to re-add the flag here. 
1964 */ 1965 bp->b_flags |= _XBF_DELWRI_Q; 1966 if (list_empty(&bp->b_list)) { 1967 atomic_inc(&bp->b_hold); 1968 list_add_tail(&bp->b_list, list); 1969 } 1970 1971 return true; 1972 } 1973 1974 /* 1975 * Compare function is more complex than it needs to be because 1976 * the return value is only 32 bits and we are doing comparisons 1977 * on 64 bit values 1978 */ 1979 static int 1980 xfs_buf_cmp( 1981 void *priv, 1982 struct list_head *a, 1983 struct list_head *b) 1984 { 1985 struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list); 1986 struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list); 1987 xfs_daddr_t diff; 1988 1989 diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn; 1990 if (diff < 0) 1991 return -1; 1992 if (diff > 0) 1993 return 1; 1994 return 0; 1995 } 1996 1997 /* 1998 * Submit buffers for write. If wait_list is specified, the buffers are 1999 * submitted using sync I/O and placed on the wait list such that the caller can 2000 * iowait each buffer. Otherwise async I/O is used and the buffers are released 2001 * at I/O completion time. In either case, buffers remain locked until I/O 2002 * completes and the buffer is released from the queue. 2003 */ 2004 static int 2005 xfs_buf_delwri_submit_buffers( 2006 struct list_head *buffer_list, 2007 struct list_head *wait_list) 2008 { 2009 struct xfs_buf *bp, *n; 2010 int pinned = 0; 2011 struct blk_plug plug; 2012 2013 list_sort(NULL, buffer_list, xfs_buf_cmp); 2014 2015 blk_start_plug(&plug); 2016 list_for_each_entry_safe(bp, n, buffer_list, b_list) { 2017 if (!wait_list) { 2018 if (xfs_buf_ispinned(bp)) { 2019 pinned++; 2020 continue; 2021 } 2022 if (!xfs_buf_trylock(bp)) 2023 continue; 2024 } else { 2025 xfs_buf_lock(bp); 2026 } 2027 2028 /* 2029 * Someone else might have written the buffer synchronously or 2030 * marked it stale in the meantime. In that case only the 2031 * _XBF_DELWRI_Q flag got cleared, and we have to drop the 2032 * reference and remove it from the list here. 
2033 */ 2034 if (!(bp->b_flags & _XBF_DELWRI_Q)) { 2035 list_del_init(&bp->b_list); 2036 xfs_buf_relse(bp); 2037 continue; 2038 } 2039 2040 trace_xfs_buf_delwri_split(bp, _RET_IP_); 2041 2042 /* 2043 * If we have a wait list, each buffer (and associated delwri 2044 * queue reference) transfers to it and is submitted 2045 * synchronously. Otherwise, drop the buffer from the delwri 2046 * queue and submit async. 2047 */ 2048 bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_WRITE_FAIL); 2049 bp->b_flags |= XBF_WRITE; 2050 if (wait_list) { 2051 bp->b_flags &= ~XBF_ASYNC; 2052 list_move_tail(&bp->b_list, wait_list); 2053 } else { 2054 bp->b_flags |= XBF_ASYNC; 2055 list_del_init(&bp->b_list); 2056 } 2057 __xfs_buf_submit(bp, false); 2058 } 2059 blk_finish_plug(&plug); 2060 2061 return pinned; 2062 } 2063 2064 /* 2065 * Write out a buffer list asynchronously. 2066 * 2067 * This will take the @buffer_list, write all non-locked and non-pinned buffers 2068 * out and not wait for I/O completion on any of the buffers. This interface 2069 * is only safely useable for callers that can track I/O completion by higher 2070 * level means, e.g. AIL pushing as the @buffer_list is consumed in this 2071 * function. 2072 * 2073 * Note: this function will skip buffers it would block on, and in doing so 2074 * leaves them on @buffer_list so they can be retried on a later pass. As such, 2075 * it is up to the caller to ensure that the buffer list is fully submitted or 2076 * cancelled appropriately when they are finished with the list. Failure to 2077 * cancel or resubmit the list until it is empty will result in leaked buffers 2078 * at unmount time. 2079 */ 2080 int 2081 xfs_buf_delwri_submit_nowait( 2082 struct list_head *buffer_list) 2083 { 2084 return xfs_buf_delwri_submit_buffers(buffer_list, NULL); 2085 } 2086 2087 /* 2088 * Write out a buffer list synchronously. 2089 * 2090 * This will take the @buffer_list, write all buffers out and wait for I/O 2091 * completion on all of the buffers. 
@buffer_list is consumed by the function, 2092 * so callers must have some other way of tracking buffers if they require such 2093 * functionality. 2094 */ 2095 int 2096 xfs_buf_delwri_submit( 2097 struct list_head *buffer_list) 2098 { 2099 LIST_HEAD (wait_list); 2100 int error = 0, error2; 2101 struct xfs_buf *bp; 2102 2103 xfs_buf_delwri_submit_buffers(buffer_list, &wait_list); 2104 2105 /* Wait for IO to complete. */ 2106 while (!list_empty(&wait_list)) { 2107 bp = list_first_entry(&wait_list, struct xfs_buf, b_list); 2108 2109 list_del_init(&bp->b_list); 2110 2111 /* 2112 * Wait on the locked buffer, check for errors and unlock and 2113 * release the delwri queue reference. 2114 */ 2115 error2 = xfs_buf_iowait(bp); 2116 xfs_buf_relse(bp); 2117 if (!error) 2118 error = error2; 2119 } 2120 2121 return error; 2122 } 2123 2124 /* 2125 * Push a single buffer on a delwri queue. 2126 * 2127 * The purpose of this function is to submit a single buffer of a delwri queue 2128 * and return with the buffer still on the original queue. The waiting delwri 2129 * buffer submission infrastructure guarantees transfer of the delwri queue 2130 * buffer reference to a temporary wait list. We reuse this infrastructure to 2131 * transfer the buffer back to the original queue. 2132 * 2133 * Note the buffer transitions from the queued state, to the submitted and wait 2134 * listed state and back to the queued state during this call. The buffer 2135 * locking and queue management logic between _delwri_pushbuf() and 2136 * _delwri_queue() guarantee that the buffer cannot be queued to another list 2137 * before returning. 
2138 */ 2139 int 2140 xfs_buf_delwri_pushbuf( 2141 struct xfs_buf *bp, 2142 struct list_head *buffer_list) 2143 { 2144 LIST_HEAD (submit_list); 2145 int error; 2146 2147 ASSERT(bp->b_flags & _XBF_DELWRI_Q); 2148 2149 trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_); 2150 2151 /* 2152 * Isolate the buffer to a new local list so we can submit it for I/O 2153 * independently from the rest of the original list. 2154 */ 2155 xfs_buf_lock(bp); 2156 list_move(&bp->b_list, &submit_list); 2157 xfs_buf_unlock(bp); 2158 2159 /* 2160 * Delwri submission clears the DELWRI_Q buffer flag and returns with 2161 * the buffer on the wait list with the original reference. Rather than 2162 * bounce the buffer from a local wait list back to the original list 2163 * after I/O completion, reuse the original list as the wait list. 2164 */ 2165 xfs_buf_delwri_submit_buffers(&submit_list, buffer_list); 2166 2167 /* 2168 * The buffer is now locked, under I/O and wait listed on the original 2169 * delwri queue. Wait for I/O completion, restore the DELWRI_Q flag and 2170 * return with the buffer unlocked and on the original queue. 2171 */ 2172 error = xfs_buf_iowait(bp); 2173 bp->b_flags |= _XBF_DELWRI_Q; 2174 xfs_buf_unlock(bp); 2175 2176 return error; 2177 } 2178 2179 int __init 2180 xfs_buf_init(void) 2181 { 2182 xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf", 2183 KM_ZONE_HWALIGN, NULL); 2184 if (!xfs_buf_zone) 2185 goto out; 2186 2187 return 0; 2188 2189 out: 2190 return -ENOMEM; 2191 } 2192 2193 void 2194 xfs_buf_terminate(void) 2195 { 2196 kmem_zone_destroy(xfs_buf_zone); 2197 } 2198 2199 void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) 2200 { 2201 /* 2202 * Set the lru reference count to 0 based on the error injection tag. 2203 * This allows userspace to disrupt buffer caching for debug/testing 2204 * purposes. 
2205 */ 2206 if (XFS_TEST_ERROR(false, bp->b_target->bt_mount, 2207 XFS_ERRTAG_BUF_LRU_REF)) 2208 lru_ref = 0; 2209 2210 atomic_set(&bp->b_lru_ref, lru_ref); 2211 } 2212