/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "xfs.h"
#include <linux/stddef.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/bio.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <linux/workqueue.h>
#include <linux/percpu.h>
#include <linux/blkdev.h>
#include <linux/hash.h>
#include <linux/kthread.h>
#include <linux/migrate.h>
#include <linux/backing-dev.h>
#include <linux/freezer.h>

#include "xfs_sb.h"
#include "xfs_trans_resv.h"
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_trace.h"

/* Zone that struct xfs_buf objects are allocated from and freed back to. */
static kmem_zone_t *xfs_buf_zone;

/* Workqueue that scheduled buffer I/O completion work is queued on. */
static struct workqueue_struct *xfslogd_workqueue;

/*
 * Lock-owner tracking helpers.  With XFS_BUF_LOCK_TRACKING they record the pid
 * of the task taking the buffer lock in b_last_holder (-1 when cleared);
 * without it they expand to empty statements.
 *
 * NOTE(review): the non-tracking XB_GET_OWNER() expands to a statement, not an
 * expression, so it cannot be used as a value in that configuration.
 */
#ifdef XFS_BUF_LOCK_TRACKING
# define XB_SET_OWNER(bp)	((bp)->b_last_holder = current->pid)
# define XB_CLEAR_OWNER(bp)	((bp)->b_last_holder = -1)
# define XB_GET_OWNER(bp)	((bp)->b_last_holder)
#else
# define XB_SET_OWNER(bp)	do { } while (0)
# define XB_CLEAR_OWNER(bp)	do { } while (0)
# define XB_GET_OWNER(bp)	do { } while (0)
#endif

/*
 * Translate buffer flags into a page allocation mask: readahead allocations
 * use __GFP_NORETRY, everything else GFP_NOFS; __GFP_NOWARN is always added.
 */
#define xb_to_gfp(flags) \
	((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN)


static inline int
xfs_buf_is_vmapped(
	struct xfs_buf	*bp)
{
	/*
	 * Return true if the buffer is vmapped.
	 *
	 * b_addr is null if the buffer is not mapped, but the code is clever
	 * enough to know it doesn't have to map a single page, so the check has
	 * to be both for b_addr and bp->b_page_count > 1.
	 */
	return bp->b_addr && bp->b_page_count > 1;
}

/*
 * Number of bytes from b_offset in the first page to the end of the last page
 * of the buffer.  Used in this file as the length argument to the
 * flush/invalidate_kernel_vmap_range() calls.
 */
static inline int
xfs_buf_vmap_len(
	struct xfs_buf	*bp)
{
	return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
}

/*
 * xfs_buf_lru_add - add a buffer to the LRU.
 *
 * The LRU takes a new reference to the buffer so that it will only be freed
 * once the shrinker takes the buffer off the LRU.
 */
STATIC void
xfs_buf_lru_add(
	struct xfs_buf	*bp)
{
	struct xfs_buftarg *btp = bp->b_target;

	spin_lock(&btp->bt_lru_lock);
	/* A buffer that is already on the LRU keeps its existing reference. */
	if (list_empty(&bp->b_lru)) {
		atomic_inc(&bp->b_hold);
		list_add_tail(&bp->b_lru, &btp->bt_lru);
		btp->bt_lru_nr++;
		bp->b_lru_flags &= ~_XBF_LRU_DISPOSE;
	}
	spin_unlock(&btp->bt_lru_lock);
}

/*
 * xfs_buf_lru_del - remove a buffer from the LRU
 *
 * The unlocked check is safe here because it only occurs when there are not
 * b_lru_ref counts left on the buffer under the pag->pag_buf_lock. It is there
 * to optimise the shrinker removing the buffer from the LRU and calling
 * xfs_buf_free(). i.e. it removes an unnecessary round trip on the
 * bt_lru_lock.
 */
STATIC void
xfs_buf_lru_del(
	struct xfs_buf	*bp)
{
	struct xfs_buftarg *btp = bp->b_target;

	/* Unlocked fast path: not on the LRU, nothing to remove. */
	if (list_empty(&bp->b_lru))
		return;

	spin_lock(&btp->bt_lru_lock);
	/*
	 * Re-check under the lock before unlinking.  Unlike xfs_buf_stale(),
	 * b_hold is not touched here.
	 */
	if (!list_empty(&bp->b_lru)) {
		list_del_init(&bp->b_lru);
		btp->bt_lru_nr--;
	}
	spin_unlock(&btp->bt_lru_lock);
}

/*
 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
 * b_lru_ref count so that the buffer is freed immediately when the buffer
 * reference count falls to zero. If the buffer is already on the LRU, we need
 * to remove the reference that LRU holds on the buffer.
 *
 * This prevents build-up of stale buffers on the LRU.
 *
 * The caller must hold the buffer lock (asserted below).
 */
void
xfs_buf_stale(
	struct xfs_buf	*bp)
{
	ASSERT(xfs_buf_islocked(bp));

	bp->b_flags |= XBF_STALE;

	/*
	 * Clear the delwri status so that a delwri queue walker will not
	 * flush this buffer to disk now that it is stale. The delwri queue has
	 * a reference to the buffer, so this is safe to do.
	 */
	bp->b_flags &= ~_XBF_DELWRI_Q;

	atomic_set(&(bp)->b_lru_ref, 0);
	if (!list_empty(&bp->b_lru)) {
		struct xfs_buftarg *btp = bp->b_target;

		spin_lock(&btp->bt_lru_lock);
		/*
		 * Buffers flagged _XBF_LRU_DISPOSE are left on the list with
		 * their LRU reference intact.
		 */
		if (!list_empty(&bp->b_lru) &&
		    !(bp->b_lru_flags & _XBF_LRU_DISPOSE)) {
			list_del_init(&bp->b_lru);
			btp->bt_lru_nr--;
			atomic_dec(&bp->b_hold);
		}
		spin_unlock(&btp->bt_lru_lock);
	}
	/* The caller's own reference must still be outstanding. */
	ASSERT(atomic_read(&bp->b_hold) >= 1);
}

/*
 * Point bp->b_maps at storage for map_count map entries.
 *
 * A single map uses the __b_map embedded in the buffer; anything larger gets
 * a zeroed, separately allocated array.
 *
 * Returns 0 on success, or a positive ENOMEM if the array allocation fails.
 */
static int
xfs_buf_get_maps(
	struct xfs_buf		*bp,
	int			map_count)
{
	ASSERT(bp->b_maps == NULL);
	bp->b_map_count = map_count;

	if (map_count == 1) {
		bp->b_maps = &bp->__b_map;
		return 0;
	}

	bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
				KM_NOFS);
	if (!bp->b_maps)
		return ENOMEM;
	return 0;
}

/*
 *	Frees b_maps if it was allocated separately from the embedded __b_map.
 */
static void
xfs_buf_free_maps(
	struct xfs_buf	*bp)
{
	/* The embedded single map is part of the buffer and is never freed. */
	if (bp->b_maps != &bp->__b_map) {
		kmem_free(bp->b_maps);
		bp->b_maps = NULL;
	}
}

/*
 * Allocate and initialise a buffer header describing the given maps.
 *
 * The new buffer starts with one hold reference, an LRU reference count of
 * one, and its semaphore initialised to 0 (i.e. held).  No data memory is
 * attached here.
 *
 * Returns the new buffer, or NULL if the header or its map array cannot be
 * allocated.
 */
struct xfs_buf *
_xfs_buf_alloc(
	struct xfs_buftarg	*target,
	struct xfs_buf_map	*map,
	int			nmaps,
	xfs_buf_flags_t		flags)
{
	struct xfs_buf		*bp;
	int			error;
	int			i;

	bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS);
	if (unlikely(!bp))
		return NULL;

	/*
	 * We don't want certain flags to appear in b_flags unless they are
	 * specifically set by later operations on the buffer.
	 */
	flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);

	atomic_set(&bp->b_hold, 1);
	atomic_set(&bp->b_lru_ref, 1);
	init_completion(&bp->b_iowait);
	INIT_LIST_HEAD(&bp->b_lru);
	INIT_LIST_HEAD(&bp->b_list);
	RB_CLEAR_NODE(&bp->b_rbnode);
	sema_init(&bp->b_sema, 0); /* held, no waiters */
	XB_SET_OWNER(bp);
	bp->b_target = target;
	bp->b_flags = flags;

	/*
	 * Set length and io_length to the same value initially.
	 * I/O routines should use io_length, which will be the same in
	 * most cases but may be reset (e.g. XFS recovery).
	 */
	error = xfs_buf_get_maps(bp, nmaps);
	if (error)  {
		kmem_zone_free(xfs_buf_zone, bp);
		return NULL;
	}

	/* b_bn is the first map's block number; b_length sums all the maps. */
	bp->b_bn = map[0].bm_bn;
	bp->b_length = 0;
	for (i = 0; i < nmaps; i++) {
		bp->b_maps[i].bm_bn = map[i].bm_bn;
		bp->b_maps[i].bm_len = map[i].bm_len;
		bp->b_length += map[i].bm_len;
	}
	bp->b_io_length = bp->b_length;

	atomic_set(&bp->b_pin_count, 0);
	init_waitqueue_head(&bp->b_waiters);

	XFS_STATS_INC(xb_create);
	trace_xfs_buf_init(bp, _RET_IP_);

	return bp;
}

/*
 *	Allocate a page array capable of holding a specified number
 *	of pages, and point the page buf at it.
 *
 *	If the buffer already has a page array this is a no-op.  Otherwise
 *	arrays of up to XB_PAGES entries use the array embedded in the buffer
 *	and larger ones are allocated.  The array is zeroed.
 *
 *	Returns 0 on success or -ENOMEM if the array allocation fails.
 */
STATIC int
_xfs_buf_get_pages(
	xfs_buf_t		*bp,
	int			page_count,
	xfs_buf_flags_t		flags)
{
	/* Make sure that we have a page list */
	if (bp->b_pages == NULL) {
		bp->b_page_count = page_count;
		if (page_count <= XB_PAGES) {
			bp->b_pages = bp->b_page_array;
		} else {
			bp->b_pages = kmem_alloc(sizeof(struct page *) *
						 page_count, KM_NOFS);
			if (bp->b_pages == NULL)
				return -ENOMEM;
		}
		memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
	}
	return 0;
}

/*
 *	Frees b_pages if it was allocated.
 *
 *	Only the page pointer array is released; the pages it points to are
 *	not touched here.
 */
STATIC void
_xfs_buf_free_pages(
	xfs_buf_t	*bp)
{
	/* The embedded b_page_array is part of the buffer and is never freed. */
	if (bp->b_pages != bp->b_page_array) {
		kmem_free(bp->b_pages);
		bp->b_pages = NULL;
	}
}

/*
 *	Releases the specified buffer.
 *
 * 	The modification state of any associated pages is left unchanged.
 * 	The buffer must not be on any hash - use xfs_buf_rele instead for
 * 	hashed and refcounted buffers
 */
void
xfs_buf_free(
	xfs_buf_t		*bp)
{
	trace_xfs_buf_free(bp, _RET_IP_);

	ASSERT(list_empty(&bp->b_lru));

	if (bp->b_flags & _XBF_PAGES) {
		uint		i;

		/* Tear down the vmap, if any, before the pages go away. */
		if (xfs_buf_is_vmapped(bp))
			vm_unmap_ram(bp->b_addr - bp->b_offset,
					bp->b_page_count);

		for (i = 0; i < bp->b_page_count; i++) {
			struct page	*page = bp->b_pages[i];

			__free_page(page);
		}
	} else if (bp->b_flags & _XBF_KMEM)
		kmem_free(bp->b_addr);
	/* Buffers with neither flag set do not own their data memory. */
	_xfs_buf_free_pages(bp);
	xfs_buf_free_maps(bp);
	kmem_zone_free(xfs_buf_zone, bp);
}

/*
 * Allocates all the pages for buffer in question and builds its page list.
339 */ 340 STATIC int 341 xfs_buf_allocate_memory( 342 xfs_buf_t *bp, 343 uint flags) 344 { 345 size_t size; 346 size_t nbytes, offset; 347 gfp_t gfp_mask = xb_to_gfp(flags); 348 unsigned short page_count, i; 349 xfs_off_t start, end; 350 int error; 351 352 /* 353 * for buffers that are contained within a single page, just allocate 354 * the memory from the heap - there's no need for the complexity of 355 * page arrays to keep allocation down to order 0. 356 */ 357 size = BBTOB(bp->b_length); 358 if (size < PAGE_SIZE) { 359 bp->b_addr = kmem_alloc(size, KM_NOFS); 360 if (!bp->b_addr) { 361 /* low memory - use alloc_page loop instead */ 362 goto use_alloc_page; 363 } 364 365 if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) != 366 ((unsigned long)bp->b_addr & PAGE_MASK)) { 367 /* b_addr spans two pages - use alloc_page instead */ 368 kmem_free(bp->b_addr); 369 bp->b_addr = NULL; 370 goto use_alloc_page; 371 } 372 bp->b_offset = offset_in_page(bp->b_addr); 373 bp->b_pages = bp->b_page_array; 374 bp->b_pages[0] = virt_to_page(bp->b_addr); 375 bp->b_page_count = 1; 376 bp->b_flags |= _XBF_KMEM; 377 return 0; 378 } 379 380 use_alloc_page: 381 start = BBTOB(bp->b_maps[0].bm_bn) >> PAGE_SHIFT; 382 end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1) 383 >> PAGE_SHIFT; 384 page_count = end - start; 385 error = _xfs_buf_get_pages(bp, page_count, flags); 386 if (unlikely(error)) 387 return error; 388 389 offset = bp->b_offset; 390 bp->b_flags |= _XBF_PAGES; 391 392 for (i = 0; i < bp->b_page_count; i++) { 393 struct page *page; 394 uint retries = 0; 395 retry: 396 page = alloc_page(gfp_mask); 397 if (unlikely(page == NULL)) { 398 if (flags & XBF_READ_AHEAD) { 399 bp->b_page_count = i; 400 error = ENOMEM; 401 goto out_free_pages; 402 } 403 404 /* 405 * This could deadlock. 406 * 407 * But until all the XFS lowlevel code is revamped to 408 * handle buffer allocation failures we can't do much. 
409 */ 410 if (!(++retries % 100)) 411 xfs_err(NULL, 412 "possible memory allocation deadlock in %s (mode:0x%x)", 413 __func__, gfp_mask); 414 415 XFS_STATS_INC(xb_page_retries); 416 congestion_wait(BLK_RW_ASYNC, HZ/50); 417 goto retry; 418 } 419 420 XFS_STATS_INC(xb_page_found); 421 422 nbytes = min_t(size_t, size, PAGE_SIZE - offset); 423 size -= nbytes; 424 bp->b_pages[i] = page; 425 offset = 0; 426 } 427 return 0; 428 429 out_free_pages: 430 for (i = 0; i < bp->b_page_count; i++) 431 __free_page(bp->b_pages[i]); 432 return error; 433 } 434 435 /* 436 * Map buffer into kernel address-space if necessary. 437 */ 438 STATIC int 439 _xfs_buf_map_pages( 440 xfs_buf_t *bp, 441 uint flags) 442 { 443 ASSERT(bp->b_flags & _XBF_PAGES); 444 if (bp->b_page_count == 1) { 445 /* A single page buffer is always mappable */ 446 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; 447 } else if (flags & XBF_UNMAPPED) { 448 bp->b_addr = NULL; 449 } else { 450 int retried = 0; 451 452 do { 453 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, 454 -1, PAGE_KERNEL); 455 if (bp->b_addr) 456 break; 457 vm_unmap_aliases(); 458 } while (retried++ <= 1); 459 460 if (!bp->b_addr) 461 return -ENOMEM; 462 bp->b_addr += bp->b_offset; 463 } 464 465 return 0; 466 } 467 468 /* 469 * Finding and Reading Buffers 470 */ 471 472 /* 473 * Look up, and creates if absent, a lockable buffer for 474 * a given range of an inode. The buffer is returned 475 * locked. No I/O is implied by this call. 
 *
 *	The lookup key is the first map's block number plus the total length
 *	of all maps.  Results:
 *	 - block number at or beyond the end of the filesystem: NULL;
 *	 - no match and new_bp == NULL: NULL;
 *	 - no match and new_bp != NULL: new_bp is inserted into the per-AG
 *	   tree, takes over the perag reference, and is returned (its lock
 *	   state is not changed here);
 *	 - match: a hold reference is taken and the buffer is locked.  If it
 *	   is already locked and XBF_TRYLOCK is set, the reference is dropped
 *	   and NULL is returned instead of sleeping.
 */
xfs_buf_t *
_xfs_buf_find(
	struct xfs_buftarg	*btp,
	struct xfs_buf_map	*map,
	int			nmaps,
	xfs_buf_flags_t		flags,
	xfs_buf_t		*new_bp)
{
	size_t			numbytes;
	struct xfs_perag	*pag;
	struct rb_node		**rbp;
	struct rb_node		*parent;
	xfs_buf_t		*bp;
	xfs_daddr_t		blkno = map[0].bm_bn;
	xfs_daddr_t		eofs;
	int			numblks = 0;
	int			i;

	for (i = 0; i < nmaps; i++)
		numblks += map[i].bm_len;
	numbytes = BBTOB(numblks);

	/* Check for IOs smaller than the sector size / not sector aligned */
	ASSERT(!(numbytes < (1 << btp->bt_sshift)));
	ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask));

	/*
	 * Corrupted block numbers can get through to here, unfortunately, so we
	 * have to check that the buffer falls within the filesystem bounds.
	 */
	eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
	if (blkno >= eofs) {
		/*
		 * XXX (dgc): we should really be returning EFSCORRUPTED here,
		 * but none of the higher level infrastructure supports
		 * returning a specific error on buffer lookup failures.
		 */
		xfs_alert(btp->bt_mount,
			  "%s: Block out of range: block 0x%llx, EOFS 0x%llx ",
			  __func__, blkno, eofs);
		WARN_ON(1);
		return NULL;
	}

	/* get tree root */
	pag = xfs_perag_get(btp->bt_mount,
				xfs_daddr_to_agno(btp->bt_mount, blkno));

	/* walk tree */
	spin_lock(&pag->pag_buf_lock);
	rbp = &pag->pag_buf_tree.rb_node;
	parent = NULL;
	bp = NULL;
	while (*rbp) {
		parent = *rbp;
		bp = rb_entry(parent, struct xfs_buf, b_rbnode);

		if (blkno < bp->b_bn)
			rbp = &(*rbp)->rb_left;
		else if (blkno > bp->b_bn)
			rbp = &(*rbp)->rb_right;
		else {
			/*
			 * found a block number match. If the range doesn't
			 * match, the only way this is allowed is if the buffer
			 * in the cache is stale and the transaction that made
			 * it stale has not yet committed. i.e. we are
			 * reallocating a busy extent. Skip this buffer and
			 * continue searching to the right for an exact match.
			 */
			if (bp->b_length != numblks) {
				ASSERT(bp->b_flags & XBF_STALE);
				rbp = &(*rbp)->rb_right;
				continue;
			}
			/* Take the reference while still under pag_buf_lock. */
			atomic_inc(&bp->b_hold);
			goto found;
		}
	}

	/* No match found */
	if (new_bp) {
		/* parent/rbp still describe the empty slot the walk ended on. */
		rb_link_node(&new_bp->b_rbnode, parent, rbp);
		rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
		/* the buffer keeps the perag reference until it is freed */
		new_bp->b_pag = pag;
		spin_unlock(&pag->pag_buf_lock);
	} else {
		XFS_STATS_INC(xb_miss_locked);
		spin_unlock(&pag->pag_buf_lock);
		xfs_perag_put(pag);
	}
	return new_bp;

found:
	spin_unlock(&pag->pag_buf_lock);
	xfs_perag_put(pag);

	if (!xfs_buf_trylock(bp)) {
		if (flags & XBF_TRYLOCK) {
			xfs_buf_rele(bp);
			XFS_STATS_INC(xb_busy_locked);
			return NULL;
		}
		xfs_buf_lock(bp);
		XFS_STATS_INC(xb_get_locked_waited);
	}

	/*
	 * if the buffer is stale, clear all the external state associated with
	 * it. We need to keep flags such as how we allocated the buffer memory
	 * intact here.
	 */
	if (bp->b_flags & XBF_STALE) {
		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
		ASSERT(bp->b_iodone == NULL);
		bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
		bp->b_ops = NULL;
	}

	trace_xfs_buf_find(bp, flags, _RET_IP_);
	XFS_STATS_INC(xb_get_locked);
	return bp;
}

/*
 * Assembles a buffer covering the specified range. The code is optimised for
 * cache hits, as metadata intensive workloads will see 3 orders of magnitude
 * more hits than misses.
 *
 * On a miss a new buffer is allocated and populated with memory, and the
 * lookup is repeated with it as the insertion candidate.  If that second
 * lookup returns a different buffer, the new one is freed and the returned
 * one is used.
 *
 * Returns the buffer from _xfs_buf_find(), mapped if it had no address yet,
 * or NULL on lookup, allocation or mapping failure.
 */
struct xfs_buf *
xfs_buf_get_map(
	struct xfs_buftarg	*target,
	struct xfs_buf_map	*map,
	int			nmaps,
	xfs_buf_flags_t		flags)
{
	struct xfs_buf		*bp;
	struct xfs_buf		*new_bp;
	int			error = 0;

	bp = _xfs_buf_find(target, map, nmaps, flags, NULL);
	if (likely(bp))
		goto found;

	new_bp = _xfs_buf_alloc(target, map, nmaps, flags);
	if (unlikely(!new_bp))
		return NULL;

	error = xfs_buf_allocate_memory(new_bp, flags);
	if (error) {
		xfs_buf_free(new_bp);
		return NULL;
	}

	/* Second lookup: inserts new_bp unless a match has appeared. */
	bp = _xfs_buf_find(target, map, nmaps, flags, new_bp);
	if (!bp) {
		xfs_buf_free(new_bp);
		return NULL;
	}

	/* An existing buffer was found instead; ours is not needed. */
	if (bp != new_bp)
		xfs_buf_free(new_bp);

found:
	if (!bp->b_addr) {
		error = _xfs_buf_map_pages(bp, flags);
		if (unlikely(error)) {
			xfs_warn(target->bt_mount,
				"%s: failed to map pages\n", __func__);
			xfs_buf_relse(bp);
			return NULL;
		}
	}

	XFS_STATS_INC(xb_get);
	trace_xfs_buf_get(bp, flags, _RET_IP_);
	return bp;
}

/*
 * Issue a read on a buffer.
 *
 * The buffer's I/O direction flags are replaced by the READ/ASYNC/READ_AHEAD
 * bits taken from @flags.  Returns 0 immediately for XBF_ASYNC requests,
 * otherwise the result of xfs_buf_iowait().
 */
STATIC int
_xfs_buf_read(
	xfs_buf_t		*bp,
	xfs_buf_flags_t		flags)
{
	ASSERT(!(flags & XBF_WRITE));
	ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);

	bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
	bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);

	xfs_buf_iorequest(bp);
	if (flags & XBF_ASYNC)
		return 0;
	return xfs_buf_iowait(bp);
}

/*
 * Get a buffer for the given maps and read it from disk unless
 * XFS_BUF_ISDONE() already reports it done.
 *
 * @ops is attached to the buffer only when a read is actually issued.  The
 * return value of _xfs_buf_read() is not propagated.  An XBF_ASYNC request
 * that finds the buffer already done releases it and returns NULL.
 */
xfs_buf_t *
xfs_buf_read_map(
	struct xfs_buftarg	*target,
	struct xfs_buf_map	*map,
	int			nmaps,
	xfs_buf_flags_t		flags,
	const struct xfs_buf_ops *ops)
{
	struct xfs_buf		*bp;

	flags |= XBF_READ;

	bp = xfs_buf_get_map(target, map, nmaps, flags);
	if (bp) {
		trace_xfs_buf_read(bp, flags, _RET_IP_);

		if (!XFS_BUF_ISDONE(bp)) {
			XFS_STATS_INC(xb_get_read);
			bp->b_ops = ops;
			_xfs_buf_read(bp, flags);
		} else if (flags & XBF_ASYNC) {
			/*
			 * Read ahead call which is already satisfied,
			 * drop the buffer
			 */
			xfs_buf_relse(bp);
			return NULL;
		} else {
			/* We do not want read in the flags */
			bp->b_flags &= ~XBF_READ;
		}
	}

	return bp;
}

/*
 *	If we are not low on memory then do the readahead in a deadlock
 *	safe manner.
 *
 *	Nothing is issued when the backing device reports read congestion.
 *	The buffer returned by xfs_buf_read_map() is not used here.
 */
void
xfs_buf_readahead_map(
	struct xfs_buftarg	*target,
	struct xfs_buf_map	*map,
	int			nmaps,
	const struct xfs_buf_ops *ops)
{
	if (bdi_read_congested(target->bt_bdi))
		return;

	xfs_buf_read_map(target, map, nmaps,
		     XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops);
}

/*
 * Read an uncached buffer from disk. Allocates and returns a locked
 * buffer containing the disk contents or nothing.
 *
 * The result of xfs_buf_iowait() is not checked here; a non-NULL buffer is
 * returned whether or not the read succeeded.
 */
struct xfs_buf *
xfs_buf_read_uncached(
	struct xfs_buftarg	*target,
	xfs_daddr_t		daddr,
	size_t			numblks,
	int			flags,
	const struct xfs_buf_ops *ops)
{
	struct xfs_buf		*bp;

	bp = xfs_buf_get_uncached(target, numblks, flags);
	if (!bp)
		return NULL;

	/* set up the buffer for a read IO */
	ASSERT(bp->b_map_count == 1);
	bp->b_bn = daddr;
	bp->b_maps[0].bm_bn = daddr;
	bp->b_flags |= XBF_READ;
	bp->b_ops = ops;

	xfsbdstrat(target->bt_mount, bp);
	xfs_buf_iowait(bp);
	return bp;
}

/*
 * Return a buffer allocated as an empty buffer and associated to external
 * memory via xfs_buf_associate_memory() back to its empty state.
 *
 * Only the page pointer array is released; the memory the buffer pointed at
 * is not freed.  The length is reset to @numblks and the disk address to
 * XFS_BUF_DADDR_NULL.
 */
void
xfs_buf_set_empty(
	struct xfs_buf		*bp,
	size_t			numblks)
{
	if (bp->b_pages)
		_xfs_buf_free_pages(bp);

	bp->b_pages = NULL;
	bp->b_page_count = 0;
	bp->b_addr = NULL;
	bp->b_length = numblks;
	bp->b_io_length = numblks;

	ASSERT(bp->b_map_count == 1);
	bp->b_bn = XFS_BUF_DADDR_NULL;
	bp->b_maps[0].bm_bn = XFS_BUF_DADDR_NULL;
	bp->b_maps[0].bm_len = bp->b_length;
}

/*
 * Translate a kernel virtual address to its struct page, using
 * vmalloc_to_page() for vmalloc addresses and virt_to_page() otherwise.
 */
static inline struct page *
mem_to_page(
	void			*addr)
{
	if ((!is_vmalloc_addr(addr))) {
		return virt_to_page(addr);
	} else {
		return vmalloc_to_page(addr);
	}
}

/*
 * Point a buffer at caller-supplied memory.
 *
 * Any existing page pointer array is released, b_addr is set to @mem, and a
 * page array covering [mem, mem + len) is built.  b_io_length is set from
 * @len and b_length from the page-aligned span.
 *
 * Returns 0 on success or the error from _xfs_buf_get_pages().  b_addr has
 * already been set to @mem when that error is returned.
 */
int
xfs_buf_associate_memory(
	xfs_buf_t		*bp,
	void			*mem,
	size_t			len)
{
	int			rval;
	int			i = 0;
	unsigned long		pageaddr;
	unsigned long		offset;
	size_t			buflen;
	int			page_count;

	/* Split @mem into its page base and the offset within that page. */
	pageaddr = (unsigned long)mem & PAGE_MASK;
	offset = (unsigned long)mem - pageaddr;
	buflen = PAGE_ALIGN(len + offset);
	page_count = buflen >> PAGE_SHIFT;

	/* Free any previous set of page pointers */
	if (bp->b_pages)
		_xfs_buf_free_pages(bp);

	bp->b_pages = NULL;
	bp->b_addr = mem;

	rval = _xfs_buf_get_pages(bp, page_count, 0);
	if (rval)
		return rval;

	bp->b_offset = offset;

	for (i = 0; i < bp->b_page_count; i++) {
		bp->b_pages[i] = mem_to_page((void *)pageaddr);
		pageaddr += PAGE_SIZE;
	}

	bp->b_io_length = BTOBB(len);
	bp->b_length = BTOBB(buflen);

	return 0;
}

/*
 * Allocate a buffer of @numblks basic blocks that is not inserted into the
 * per-AG buffer tree.
 *
 * The buffer is created with disk address XFS_BUF_DADDR_NULL, backed by
 * freshly allocated pages and mapped.  @flags only selects the page
 * allocation mask via xb_to_gfp().
 *
 * Returns the buffer, or NULL on any allocation or mapping failure.
 */
xfs_buf_t *
xfs_buf_get_uncached(
	struct xfs_buftarg	*target,
	size_t			numblks,
	int			flags)
{
	unsigned long		page_count;
	int			error, i;
	struct xfs_buf		*bp;
	DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);

	bp = _xfs_buf_alloc(target, &map, 1, 0);
	if (unlikely(bp == NULL))
		goto fail;

	page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT;
	error = _xfs_buf_get_pages(bp, page_count, 0);
	if (error)
		goto fail_free_buf;

	for (i = 0; i < page_count; i++) {
		bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
		if (!bp->b_pages[i])
			goto fail_free_mem;
	}
	bp->b_flags |= _XBF_PAGES;

	error = _xfs_buf_map_pages(bp, 0);
	if (unlikely(error)) {
		xfs_warn(target->bt_mount,
			"%s: failed to map pages\n", __func__);
		goto fail_free_mem;
	}

	trace_xfs_buf_get_uncached(bp, _RET_IP_);
	return bp;

 fail_free_mem:
	/* i is the number of pages successfully allocated so far. */
	while (--i >= 0)
		__free_page(bp->b_pages[i]);
	_xfs_buf_free_pages(bp);
 fail_free_buf:
	xfs_buf_free_maps(bp);
	kmem_zone_free(xfs_buf_zone, bp);
 fail:
	return NULL;
}

/*
 *	Increment reference count on buffer, to hold the buffer concurrently
 *	with another thread which may release (free) the buffer asynchronously.
 *	Must hold the buffer already to call this function.
 */
void
xfs_buf_hold(
	xfs_buf_t		*bp)
{
	trace_xfs_buf_hold(bp, _RET_IP_);
	atomic_inc(&bp->b_hold);
}

/*
 *	Releases a hold on the specified buffer.  If the
 *	hold count is 1, calls xfs_buf_free.
 *
 *	For a buffer with a perag (b_pag set) the last release does not
 *	necessarily free it: a buffer that is not stale and still has
 *	b_lru_ref counts is added to the LRU instead, which takes its own
 *	hold.  Otherwise it is removed from the LRU and the per-AG tree, the
 *	perag reference is dropped, and the buffer is freed.
 */
void
xfs_buf_rele(
	xfs_buf_t		*bp)
{
	struct xfs_perag	*pag = bp->b_pag;

	trace_xfs_buf_rele(bp, _RET_IP_);

	/* No perag: the buffer is in no tree and on no LRU. */
	if (!pag) {
		ASSERT(list_empty(&bp->b_lru));
		ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
		if (atomic_dec_and_test(&bp->b_hold))
			xfs_buf_free(bp);
		return;
	}

	ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));

	ASSERT(atomic_read(&bp->b_hold) > 0);
	/* pag_buf_lock is only taken when the count drops to zero. */
	if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
		if (!(bp->b_flags & XBF_STALE) &&
			   atomic_read(&bp->b_lru_ref)) {
			xfs_buf_lru_add(bp);
			spin_unlock(&pag->pag_buf_lock);
		} else {
			xfs_buf_lru_del(bp);
			ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
			rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
			spin_unlock(&pag->pag_buf_lock);
			xfs_perag_put(pag);
			xfs_buf_free(bp);
		}
	}
}


/*
 *	Try to lock a buffer object without sleeping.
 *
 *	Returns non-zero if the lock was obtained and zero if it is held by
 *	someone else.
 *
 *	NOTE(review): this comment used to describe forcing the log when a
 *	stale, pinned buffer is encountered.  This function does not do that;
 *	in this file only xfs_buf_lock() forces the log.
 */
int
xfs_buf_trylock(
	struct xfs_buf		*bp)
{
	int			locked;

	locked = down_trylock(&bp->b_sema) == 0;
	if (locked)
		XB_SET_OWNER(bp);

	trace_xfs_buf_trylock(bp, _RET_IP_);
	return locked;
}

/*
 *	Lock a buffer object.
 *
 *	If we come across a stale, pinned, locked buffer, we know that we
 *	are being asked to lock a buffer that has been reallocated. Because
 *	it is pinned, we know that the log has not been pushed to disk and
 *	hence it will still be locked. Rather than sleeping until someone
 *	else pushes the log, push it ourselves before trying to get the lock.
 */
void
xfs_buf_lock(
	struct xfs_buf		*bp)
{
	trace_xfs_buf_lock(bp, _RET_IP_);

	if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
		xfs_log_force(bp->b_target->bt_mount, 0);
	down(&bp->b_sema);
	XB_SET_OWNER(bp);

	trace_xfs_buf_lock_done(bp, _RET_IP_);
}

/*
 *	Unlock a buffer object: clear the recorded owner and release the
 *	semaphore.
 */
void
xfs_buf_unlock(
	struct xfs_buf		*bp)
{
	XB_CLEAR_OWNER(bp);
	up(&bp->b_sema);

	trace_xfs_buf_unlock(bp, _RET_IP_);
}

/*
 *	Sleep uninterruptibly on b_waiters until the buffer's pin count is
 *	zero.  Returns immediately if it is not pinned.
 */
STATIC void
xfs_buf_wait_unpin(
	xfs_buf_t		*bp)
{
	DECLARE_WAITQUEUE	(wait, current);

	if (atomic_read(&bp->b_pin_count) == 0)
		return;

	add_wait_queue(&bp->b_waiters, &wait);
	for (;;) {
		/* Set the task state before re-checking the pin count. */
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (atomic_read(&bp->b_pin_count) == 0)
			break;
		io_schedule();
	}
	remove_wait_queue(&bp->b_waiters, &wait);
	set_current_state(TASK_RUNNING);
}

/*
 *	Buffer Utility Routines
 */

/*
 *	I/O completion processing, run either from the workqueue or directly
 *	from xfs_buf_ioend().
 *
 *	Clears the I/O direction flags, runs the read verifier on buffers that
 *	were read without error, then either calls the b_iodone callback,
 *	releases an async buffer, or wakes the waiter on b_iowait.
 */
STATIC void
xfs_buf_iodone_work(
	struct work_struct	*work)
{
	struct xfs_buf		*bp =
		container_of(work, xfs_buf_t, b_iodone_work);
	/* Sample XBF_READ before the flags are cleared below. */
	bool			read = !!(bp->b_flags & XBF_READ);

	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);

	/* only validate buffers that were read without errors */
	if (read && bp->b_ops && !bp->b_error && (bp->b_flags & XBF_DONE))
		bp->b_ops->verify_read(bp);

	if (bp->b_iodone)
		(*(bp->b_iodone))(bp);
	else if (bp->b_flags & XBF_ASYNC)
		xfs_buf_relse(bp);
	else {
		ASSERT(read && bp->b_ops);
		complete(&bp->b_iowait);
	}
}

/*
 *	Finish I/O on a buffer.
 *
 *	XBF_DONE is set if no error is recorded.  Buffers that have an iodone
 *	callback, need read verification or are async go through
 *	xfs_buf_iodone_work(), queued on xfslogd_workqueue when @schedule is
 *	non-zero and called directly otherwise.  All other buffers have their
 *	I/O flags cleared and their waiter woken here.
 */
void
xfs_buf_ioend(
	struct xfs_buf	*bp,
	int		schedule)
{
	bool		read = !!(bp->b_flags & XBF_READ);

	trace_xfs_buf_iodone(bp, _RET_IP_);

	if (bp->b_error == 0)
		bp->b_flags |= XBF_DONE;

	if (bp->b_iodone || (read && bp->b_ops) || (bp->b_flags & XBF_ASYNC)) {
		if (schedule) {
			INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
			queue_work(xfslogd_workqueue, &bp->b_iodone_work);
		} else {
			xfs_buf_iodone_work(&bp->b_iodone_work);
		}
	} else {
		bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
		complete(&bp->b_iowait);
	}
}

/*
 *	Record an error on the buffer.  @error is expected to be a
 *	non-negative errno value no larger than 0xffff.
 */
void
xfs_buf_ioerror(
	xfs_buf_t		*bp,
	int			error)
{
	ASSERT(error >= 0 && error <= 0xffff);
	bp->b_error = (unsigned short)error;
	trace_xfs_buf_ioerror(bp, error, _RET_IP_);
}

/*
 *	Log an alert describing a metadata I/O error on @bp; @func names the
 *	reporting function.
 */
void
xfs_buf_ioerror_alert(
	struct xfs_buf		*bp,
	const char		*func)
{
	xfs_alert(bp->b_target->bt_mount,
"metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d",
		(__uint64_t)XFS_BUF_ADDR(bp), func, bp->b_error, bp->b_length);
}

/*
 * Called when we want to stop a buffer from getting written or read.
 * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
 * so that the proper iodone callbacks get called.
 *
 * Always returns EIO (positive).
 */
STATIC int
xfs_bioerror(
	xfs_buf_t *bp)
{
#ifdef XFSERRORDEBUG
	ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
#endif

	/*
	 * No need to wait until the buffer is unpinned, we aren't flushing it.
	 */
	xfs_buf_ioerror(bp, EIO);

	/*
	 * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
	 */
	XFS_BUF_UNREAD(bp);
	XFS_BUF_UNDONE(bp);
	xfs_buf_stale(bp);

	xfs_buf_ioend(bp, 0);

	return EIO;
}

/*
 * Same as xfs_bioerror, except that we are releasing the buffer
 * here ourselves, and avoiding the xfs_buf_ioend call.
 * This is meant for userdata errors; metadata bufs come with
 * iodone functions attached, so that we can track down errors.
 *
 * Always returns EIO (positive).
 */
STATIC int
xfs_bioerror_relse(
	struct xfs_buf	*bp)
{
	/* Snapshot the flags before the buffer state is changed below. */
	int64_t		fl = bp->b_flags;
	/*
	 * No need to wait until the buffer is unpinned.
	 * We aren't flushing it.
	 *
	 * chunkhold expects B_DONE to be set, whether
	 * we actually finish the I/O or not. We don't want to
	 * change that interface.
	 */
	XFS_BUF_UNREAD(bp);
	XFS_BUF_DONE(bp);
	xfs_buf_stale(bp);
	bp->b_iodone = NULL;
	if (!(fl & XBF_ASYNC)) {
		/*
		 * Mark b_error and B_ERROR _both_.
		 * Lots of chunkcache code assumes that.
		 * There's no reason to mark error for
		 * ASYNC buffers.
		 */
		xfs_buf_ioerror(bp, EIO);
		complete(&bp->b_iowait);
	} else {
		xfs_buf_relse(bp);
	}

	return EIO;
}

/*
 * Submit a buffer for I/O unless the filesystem has been forcibly shut down.
 *
 * On shutdown the buffer is failed with EIO instead: through
 * xfs_bioerror_relse() for writes that have no iodone callback, and through
 * xfs_bioerror() for everything else.
 *
 * Returns 0 if the I/O was submitted, otherwise EIO.
 */
STATIC int
xfs_bdstrat_cb(
	struct xfs_buf	*bp)
{
	if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
		trace_xfs_bdstrat_shut(bp, _RET_IP_);
		/*
		 * Metadata write that didn't get logged but
		 * written delayed anyway. These aren't associated
		 * with a transaction, and can be ignored.
		 */
		if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
			return xfs_bioerror_relse(bp);
		else
			return xfs_bioerror(bp);
	}

	xfs_buf_iorequest(bp);
	return 0;
}

/*
 * Synchronously write a locked buffer.
 *
 * The buffer is marked for write with the async, read and delwri-queue flags
 * cleared.  The return value of xfs_bdstrat_cb() is ignored; the result comes
 * from xfs_buf_iowait().  A non-zero result forces a filesystem shutdown with
 * SHUTDOWN_META_IO_ERROR and is returned to the caller.
 */
int
xfs_bwrite(
	struct xfs_buf		*bp)
{
	int			error;

	ASSERT(xfs_buf_islocked(bp));

	bp->b_flags |= XBF_WRITE;
	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q);

	xfs_bdstrat_cb(bp);

	error = xfs_buf_iowait(bp);
	if (error) {
		xfs_force_shutdown(bp->b_target->bt_mount,
				   SHUTDOWN_META_IO_ERROR);
	}
	return error;
}

/*
 * Wrapper around bdstrat so that we can stop data from going to disk in case
 * we are shutting down the filesystem.  Typically user data goes thru this
 * path; one of the exceptions is the superblock.
*/
void
xfsbdstrat(
	struct xfs_mount	*mp,
	struct xfs_buf		*bp)
{
	/* On a forced shutdown fail the buffer instead of issuing the I/O. */
	if (XFS_FORCED_SHUTDOWN(mp)) {
		trace_xfs_bdstrat_shut(bp, _RET_IP_);
		xfs_bioerror_relse(bp);
		return;
	}

	xfs_buf_iorequest(bp);
}

/*
 * Drop one count from bp->b_io_remaining and, if that was the last one, run
 * xfs_buf_ioend() on the buffer, passing @schedule through unchanged.
 */
STATIC void
_xfs_buf_ioend(
	xfs_buf_t		*bp,
	int			schedule)
{
	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
		xfs_buf_ioend(bp, schedule);
}

/*
 * Completion callback for the bios built in xfs_buf_ioapply_map(); the owning
 * buffer is recovered from bio->bi_private.
 *
 * Records the first error seen on the buffer, invalidates the vmap range
 * after an error-free read of a vmapped buffer, drops this bio's count on
 * b_io_remaining and finally releases the bio.
 */
STATIC void
xfs_buf_bio_end_io(
	struct bio		*bio,
	int			error)
{
	xfs_buf_t		*bp = (xfs_buf_t *)bio->bi_private;

	/*
	 * don't overwrite existing errors - otherwise we can lose errors on
	 * buffers that require multiple bios to complete.
	 *
	 * NOTE(review): @error is negated before being stored; presumably the
	 * block layer reports negative errnos while b_error holds positive
	 * ones - confirm.
	 */
	if (!bp->b_error)
		xfs_buf_ioerror(bp, -error);

	if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));

	_xfs_buf_ioend(bp, 1);
	bio_put(bio);
}

/*
 * Issue the I/O for one entry of bp->b_maps.
 *
 * @map selects the entry. @buf_offset is the byte offset into the buffer's
 * pages at which this entry's data starts, and @count is the number of bytes
 * still to be issued for the whole buffer; both are advanced here by the
 * amount this entry covers. @rw is passed unchanged to submit_bio().
 *
 * The range is split into as many bios as needed. Each bio takes a count on
 * b_io_remaining before it is built; that count is dropped again here if the
 * bio ends up empty, in which case the buffer is marked with EIO.
 */
static void
xfs_buf_ioapply_map(
	struct xfs_buf	*bp,
	int		map,
	int		*buf_offset,
	int		*count,
	int		rw)
{
	int		page_index;
	int		total_nr_pages = bp->b_page_count;
	int		nr_pages;
	struct bio	*bio;
	sector_t	sector = bp->b_maps[map].bm_bn;
	int		size;
	int		offset;

	/* NOTE(review): redundant, already initialised in the declaration. */
	total_nr_pages = bp->b_page_count;

	/* skip the pages in the buffer before the start offset */
	page_index = 0;
	offset = *buf_offset;
	while (offset >= PAGE_SIZE) {
		page_index++;
		offset -= PAGE_SIZE;
	}

	/*
	 * Limit the IO size to the length of the current vector, and update the
	 * remaining IO count for the next time around.
	 */
	size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count);
	*count -= size;
	*buf_offset += size;

next_chunk:
	/* Account for the bio about to be built before it can complete. */
	atomic_inc(&bp->b_io_remaining);
	nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
	if (nr_pages > total_nr_pages)
		nr_pages = total_nr_pages;

	bio = bio_alloc(GFP_NOIO, nr_pages);
	bio->bi_bdev = bp->b_target->bt_bdev;
	bio->bi_sector = sector;
	bio->bi_end_io = xfs_buf_bio_end_io;
	bio->bi_private = bp;


	/*
	 * Add pages until this map's bytes are used up, the bio's page budget
	 * is exhausted, or bio_add_page() takes less than was asked for.
	 */
	for (; size && nr_pages; nr_pages--, page_index++) {
		int	rbytes, nbytes = PAGE_SIZE - offset;

		if (nbytes > size)
			nbytes = size;

		rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes,
				      offset);
		if (rbytes < nbytes)
			break;

		/* only the first page of the range starts mid-page */
		offset = 0;
		sector += BTOBB(nbytes);
		size -= nbytes;
		total_nr_pages--;
	}

	if (likely(bio->bi_size)) {
		if (xfs_buf_is_vmapped(bp)) {
			flush_kernel_vmap_range(bp->b_addr,
						xfs_buf_vmap_len(bp));
		}
		submit_bio(rw, bio);
		/* bytes left over for this map go into another bio */
		if (size)
			goto next_chunk;
	} else {
		/*
		 * This is guaranteed not to be the last io reference count
		 * because the caller (xfs_buf_iorequest) holds a count itself.
		 */
		atomic_dec(&bp->b_io_remaining);
		xfs_buf_ioerror(bp, EIO);
		bio_put(bio);
	}

}

/*
 * Translate the buffer flags into block layer request flags, run the write
 * verifier for writes, and then issue I/O for the entries of bp->b_maps
 * inside a single block plug. Failures are reported through bp->b_error.
 */
STATIC void
_xfs_buf_ioapply(
	struct xfs_buf	*bp)
{
	struct blk_plug	plug;
	int		rw;
	int		offset;
	int		size;
	int		i;

	/*
	 * Make sure we capture only current IO errors rather than stale errors
	 * left over from previous use of the buffer (e.g. failed readahead).
	 */
	bp->b_error = 0;

	if (bp->b_flags & XBF_WRITE) {
		if (bp->b_flags & XBF_SYNCIO)
			rw = WRITE_SYNC;
		else
			rw = WRITE;
		if (bp->b_flags & XBF_FUA)
			rw |= REQ_FUA;
		if (bp->b_flags & XBF_FLUSH)
			rw |= REQ_FLUSH;

		/*
		 * Run the write verifier callback function if it exists. If
		 * this function fails it will mark the buffer with an error and
		 * the IO should not be dispatched.
		 */
		if (bp->b_ops) {
			bp->b_ops->verify_write(bp);
			if (bp->b_error) {
				xfs_force_shutdown(bp->b_target->bt_mount,
						   SHUTDOWN_CORRUPT_INCORE);
				return;
			}
		}
	} else if (bp->b_flags & XBF_READ_AHEAD) {
		rw = READA;
	} else {
		rw = READ;
	}

	/* we only use the buffer cache for meta-data */
	rw |= REQ_META;

	/*
	 * Walk all the vectors issuing IO on them. Set up the initial offset
	 * into the buffer and the desired IO size before we start -
	 * xfs_buf_ioapply_map() will modify them appropriately for each
	 * subsequent call.
	 */
	offset = bp->b_offset;
	size = BBTOB(bp->b_io_length);
	blk_start_plug(&plug);
	for (i = 0; i < bp->b_map_count; i++) {
		xfs_buf_ioapply_map(bp, i, &offset, &size, rw);
		if (bp->b_error)
			break;
		if (size <= 0)
			break;	/* all done */
	}
	blk_finish_plug(&plug);
}

/*
 * Start I/O on a buffer.
 *
 * The buffer must not be flagged as sitting on a delwri queue (asserted).
 * Writes first go through xfs_buf_wait_unpin(). A hold is taken on the buffer
 * before submission and dropped again once submission has finished.
 */
void
xfs_buf_iorequest(
	xfs_buf_t		*bp)
{
	trace_xfs_buf_iorequest(bp, _RET_IP_);

	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));

	if (bp->b_flags & XBF_WRITE)
		xfs_buf_wait_unpin(bp);
	xfs_buf_hold(bp);

	/* Set the count to 1 initially, this will stop an I/O
	 * completion callout which happens before we have started
	 * all the I/O from calling xfs_buf_ioend too early.
	 */
	atomic_set(&bp->b_io_remaining, 1);
	_xfs_buf_ioapply(bp);
	/* drop the submission count set above */
	_xfs_buf_ioend(bp, 1);

	xfs_buf_rele(bp);
}

/*
 * Waits for I/O to complete on the buffer supplied. It returns immediately if
 * no I/O is pending or there is already a pending error on the buffer. It
 * returns the I/O error code, if any, or 0 if there was no error.
 */
int
xfs_buf_iowait(
	xfs_buf_t		*bp)
{
	trace_xfs_buf_iowait(bp, _RET_IP_);

	/* an error that is already recorded short-circuits the wait */
	if (!bp->b_error)
		wait_for_completion(&bp->b_iowait);

	trace_xfs_buf_iowait_done(bp, _RET_IP_);
	return bp->b_error;
}

/*
 * Return the address of the byte at @offset within the buffer.
 *
 * When b_addr is set the offset is applied to it directly. Otherwise b_offset
 * is added, the page holding that byte is looked up in b_pages and the
 * address is formed from page_address() plus the offset within that page.
 */
xfs_caddr_t
xfs_buf_offset(
	xfs_buf_t		*bp,
	size_t			offset)
{
	struct page		*page;

	if (bp->b_addr)
		return bp->b_addr + offset;

	offset += bp->b_offset;
	page = bp->b_pages[offset >> PAGE_SHIFT];
	return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
}

/*
 * Move data into or out of a buffer.
1456 */ 1457 void 1458 xfs_buf_iomove( 1459 xfs_buf_t *bp, /* buffer to process */ 1460 size_t boff, /* starting buffer offset */ 1461 size_t bsize, /* length to copy */ 1462 void *data, /* data address */ 1463 xfs_buf_rw_t mode) /* read/write/zero flag */ 1464 { 1465 size_t bend; 1466 1467 bend = boff + bsize; 1468 while (boff < bend) { 1469 struct page *page; 1470 int page_index, page_offset, csize; 1471 1472 page_index = (boff + bp->b_offset) >> PAGE_SHIFT; 1473 page_offset = (boff + bp->b_offset) & ~PAGE_MASK; 1474 page = bp->b_pages[page_index]; 1475 csize = min_t(size_t, PAGE_SIZE - page_offset, 1476 BBTOB(bp->b_io_length) - boff); 1477 1478 ASSERT((csize + page_offset) <= PAGE_SIZE); 1479 1480 switch (mode) { 1481 case XBRW_ZERO: 1482 memset(page_address(page) + page_offset, 0, csize); 1483 break; 1484 case XBRW_READ: 1485 memcpy(data, page_address(page) + page_offset, csize); 1486 break; 1487 case XBRW_WRITE: 1488 memcpy(page_address(page) + page_offset, data, csize); 1489 } 1490 1491 boff += csize; 1492 data += csize; 1493 } 1494 } 1495 1496 /* 1497 * Handling of buffer targets (buftargs). 1498 */ 1499 1500 /* 1501 * Wait for any bufs with callbacks that have been submitted but have not yet 1502 * returned. These buffers will have an elevated hold count, so wait on those 1503 * while freeing all the buffers only held by the LRU. 1504 */ 1505 void 1506 xfs_wait_buftarg( 1507 struct xfs_buftarg *btp) 1508 { 1509 struct xfs_buf *bp; 1510 1511 restart: 1512 spin_lock(&btp->bt_lru_lock); 1513 while (!list_empty(&btp->bt_lru)) { 1514 bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru); 1515 if (atomic_read(&bp->b_hold) > 1) { 1516 trace_xfs_buf_wait_buftarg(bp, _RET_IP_); 1517 list_move_tail(&bp->b_lru, &btp->bt_lru); 1518 spin_unlock(&btp->bt_lru_lock); 1519 delay(100); 1520 goto restart; 1521 } 1522 /* 1523 * clear the LRU reference count so the buffer doesn't get 1524 * ignored in xfs_buf_rele(). 
1525 */ 1526 atomic_set(&bp->b_lru_ref, 0); 1527 spin_unlock(&btp->bt_lru_lock); 1528 xfs_buf_rele(bp); 1529 spin_lock(&btp->bt_lru_lock); 1530 } 1531 spin_unlock(&btp->bt_lru_lock); 1532 } 1533 1534 int 1535 xfs_buftarg_shrink( 1536 struct shrinker *shrink, 1537 struct shrink_control *sc) 1538 { 1539 struct xfs_buftarg *btp = container_of(shrink, 1540 struct xfs_buftarg, bt_shrinker); 1541 struct xfs_buf *bp; 1542 int nr_to_scan = sc->nr_to_scan; 1543 LIST_HEAD(dispose); 1544 1545 if (!nr_to_scan) 1546 return btp->bt_lru_nr; 1547 1548 spin_lock(&btp->bt_lru_lock); 1549 while (!list_empty(&btp->bt_lru)) { 1550 if (nr_to_scan-- <= 0) 1551 break; 1552 1553 bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru); 1554 1555 /* 1556 * Decrement the b_lru_ref count unless the value is already 1557 * zero. If the value is already zero, we need to reclaim the 1558 * buffer, otherwise it gets another trip through the LRU. 1559 */ 1560 if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) { 1561 list_move_tail(&bp->b_lru, &btp->bt_lru); 1562 continue; 1563 } 1564 1565 /* 1566 * remove the buffer from the LRU now to avoid needing another 1567 * lock round trip inside xfs_buf_rele(). 
1568 */ 1569 list_move(&bp->b_lru, &dispose); 1570 btp->bt_lru_nr--; 1571 bp->b_lru_flags |= _XBF_LRU_DISPOSE; 1572 } 1573 spin_unlock(&btp->bt_lru_lock); 1574 1575 while (!list_empty(&dispose)) { 1576 bp = list_first_entry(&dispose, struct xfs_buf, b_lru); 1577 list_del_init(&bp->b_lru); 1578 xfs_buf_rele(bp); 1579 } 1580 1581 return btp->bt_lru_nr; 1582 } 1583 1584 void 1585 xfs_free_buftarg( 1586 struct xfs_mount *mp, 1587 struct xfs_buftarg *btp) 1588 { 1589 unregister_shrinker(&btp->bt_shrinker); 1590 1591 if (mp->m_flags & XFS_MOUNT_BARRIER) 1592 xfs_blkdev_issue_flush(btp); 1593 1594 kmem_free(btp); 1595 } 1596 1597 STATIC int 1598 xfs_setsize_buftarg_flags( 1599 xfs_buftarg_t *btp, 1600 unsigned int blocksize, 1601 unsigned int sectorsize, 1602 int verbose) 1603 { 1604 btp->bt_bsize = blocksize; 1605 btp->bt_sshift = ffs(sectorsize) - 1; 1606 btp->bt_smask = sectorsize - 1; 1607 1608 if (set_blocksize(btp->bt_bdev, sectorsize)) { 1609 char name[BDEVNAME_SIZE]; 1610 1611 bdevname(btp->bt_bdev, name); 1612 1613 xfs_warn(btp->bt_mount, 1614 "Cannot set_blocksize to %u on device %s\n", 1615 sectorsize, name); 1616 return EINVAL; 1617 } 1618 1619 return 0; 1620 } 1621 1622 /* 1623 * When allocating the initial buffer target we have not yet 1624 * read in the superblock, so don't know what sized sectors 1625 * are being used at this early stage. Play safe. 
1626 */ 1627 STATIC int 1628 xfs_setsize_buftarg_early( 1629 xfs_buftarg_t *btp, 1630 struct block_device *bdev) 1631 { 1632 return xfs_setsize_buftarg_flags(btp, 1633 PAGE_SIZE, bdev_logical_block_size(bdev), 0); 1634 } 1635 1636 int 1637 xfs_setsize_buftarg( 1638 xfs_buftarg_t *btp, 1639 unsigned int blocksize, 1640 unsigned int sectorsize) 1641 { 1642 return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1); 1643 } 1644 1645 xfs_buftarg_t * 1646 xfs_alloc_buftarg( 1647 struct xfs_mount *mp, 1648 struct block_device *bdev, 1649 int external, 1650 const char *fsname) 1651 { 1652 xfs_buftarg_t *btp; 1653 1654 btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS); 1655 1656 btp->bt_mount = mp; 1657 btp->bt_dev = bdev->bd_dev; 1658 btp->bt_bdev = bdev; 1659 btp->bt_bdi = blk_get_backing_dev_info(bdev); 1660 if (!btp->bt_bdi) 1661 goto error; 1662 1663 INIT_LIST_HEAD(&btp->bt_lru); 1664 spin_lock_init(&btp->bt_lru_lock); 1665 if (xfs_setsize_buftarg_early(btp, bdev)) 1666 goto error; 1667 btp->bt_shrinker.shrink = xfs_buftarg_shrink; 1668 btp->bt_shrinker.seeks = DEFAULT_SEEKS; 1669 register_shrinker(&btp->bt_shrinker); 1670 return btp; 1671 1672 error: 1673 kmem_free(btp); 1674 return NULL; 1675 } 1676 1677 /* 1678 * Add a buffer to the delayed write list. 1679 * 1680 * This queues a buffer for writeout if it hasn't already been. Note that 1681 * neither this routine nor the buffer list submission functions perform 1682 * any internal synchronization. It is expected that the lists are thread-local 1683 * to the callers. 1684 * 1685 * Returns true if we queued up the buffer, or false if it already had 1686 * been on the buffer list. 1687 */ 1688 bool 1689 xfs_buf_delwri_queue( 1690 struct xfs_buf *bp, 1691 struct list_head *list) 1692 { 1693 ASSERT(xfs_buf_islocked(bp)); 1694 ASSERT(!(bp->b_flags & XBF_READ)); 1695 1696 /* 1697 * If the buffer is already marked delwri it already is queued up 1698 * by someone else for imediate writeout. 
Just ignore it in that 1699 * case. 1700 */ 1701 if (bp->b_flags & _XBF_DELWRI_Q) { 1702 trace_xfs_buf_delwri_queued(bp, _RET_IP_); 1703 return false; 1704 } 1705 1706 trace_xfs_buf_delwri_queue(bp, _RET_IP_); 1707 1708 /* 1709 * If a buffer gets written out synchronously or marked stale while it 1710 * is on a delwri list we lazily remove it. To do this, the other party 1711 * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone. 1712 * It remains referenced and on the list. In a rare corner case it 1713 * might get readded to a delwri list after the synchronous writeout, in 1714 * which case we need just need to re-add the flag here. 1715 */ 1716 bp->b_flags |= _XBF_DELWRI_Q; 1717 if (list_empty(&bp->b_list)) { 1718 atomic_inc(&bp->b_hold); 1719 list_add_tail(&bp->b_list, list); 1720 } 1721 1722 return true; 1723 } 1724 1725 /* 1726 * Compare function is more complex than it needs to be because 1727 * the return value is only 32 bits and we are doing comparisons 1728 * on 64 bit values 1729 */ 1730 static int 1731 xfs_buf_cmp( 1732 void *priv, 1733 struct list_head *a, 1734 struct list_head *b) 1735 { 1736 struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list); 1737 struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list); 1738 xfs_daddr_t diff; 1739 1740 diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn; 1741 if (diff < 0) 1742 return -1; 1743 if (diff > 0) 1744 return 1; 1745 return 0; 1746 } 1747 1748 static int 1749 __xfs_buf_delwri_submit( 1750 struct list_head *buffer_list, 1751 struct list_head *io_list, 1752 bool wait) 1753 { 1754 struct blk_plug plug; 1755 struct xfs_buf *bp, *n; 1756 int pinned = 0; 1757 1758 list_for_each_entry_safe(bp, n, buffer_list, b_list) { 1759 if (!wait) { 1760 if (xfs_buf_ispinned(bp)) { 1761 pinned++; 1762 continue; 1763 } 1764 if (!xfs_buf_trylock(bp)) 1765 continue; 1766 } else { 1767 xfs_buf_lock(bp); 1768 } 1769 1770 /* 1771 * Someone else might have written the buffer synchronously or 1772 * marked 
it stale in the meantime. In that case only the 1773 * _XBF_DELWRI_Q flag got cleared, and we have to drop the 1774 * reference and remove it from the list here. 1775 */ 1776 if (!(bp->b_flags & _XBF_DELWRI_Q)) { 1777 list_del_init(&bp->b_list); 1778 xfs_buf_relse(bp); 1779 continue; 1780 } 1781 1782 list_move_tail(&bp->b_list, io_list); 1783 trace_xfs_buf_delwri_split(bp, _RET_IP_); 1784 } 1785 1786 list_sort(NULL, io_list, xfs_buf_cmp); 1787 1788 blk_start_plug(&plug); 1789 list_for_each_entry_safe(bp, n, io_list, b_list) { 1790 bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC); 1791 bp->b_flags |= XBF_WRITE; 1792 1793 if (!wait) { 1794 bp->b_flags |= XBF_ASYNC; 1795 list_del_init(&bp->b_list); 1796 } 1797 xfs_bdstrat_cb(bp); 1798 } 1799 blk_finish_plug(&plug); 1800 1801 return pinned; 1802 } 1803 1804 /* 1805 * Write out a buffer list asynchronously. 1806 * 1807 * This will take the @buffer_list, write all non-locked and non-pinned buffers 1808 * out and not wait for I/O completion on any of the buffers. This interface 1809 * is only safely useable for callers that can track I/O completion by higher 1810 * level means, e.g. AIL pushing as the @buffer_list is consumed in this 1811 * function. 1812 */ 1813 int 1814 xfs_buf_delwri_submit_nowait( 1815 struct list_head *buffer_list) 1816 { 1817 LIST_HEAD (io_list); 1818 return __xfs_buf_delwri_submit(buffer_list, &io_list, false); 1819 } 1820 1821 /* 1822 * Write out a buffer list synchronously. 1823 * 1824 * This will take the @buffer_list, write all buffers out and wait for I/O 1825 * completion on all of the buffers. @buffer_list is consumed by the function, 1826 * so callers must have some other way of tracking buffers if they require such 1827 * functionality. 
1828 */ 1829 int 1830 xfs_buf_delwri_submit( 1831 struct list_head *buffer_list) 1832 { 1833 LIST_HEAD (io_list); 1834 int error = 0, error2; 1835 struct xfs_buf *bp; 1836 1837 __xfs_buf_delwri_submit(buffer_list, &io_list, true); 1838 1839 /* Wait for IO to complete. */ 1840 while (!list_empty(&io_list)) { 1841 bp = list_first_entry(&io_list, struct xfs_buf, b_list); 1842 1843 list_del_init(&bp->b_list); 1844 error2 = xfs_buf_iowait(bp); 1845 xfs_buf_relse(bp); 1846 if (!error) 1847 error = error2; 1848 } 1849 1850 return error; 1851 } 1852 1853 int __init 1854 xfs_buf_init(void) 1855 { 1856 xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf", 1857 KM_ZONE_HWALIGN, NULL); 1858 if (!xfs_buf_zone) 1859 goto out; 1860 1861 xfslogd_workqueue = alloc_workqueue("xfslogd", 1862 WQ_MEM_RECLAIM | WQ_HIGHPRI, 1); 1863 if (!xfslogd_workqueue) 1864 goto out_free_buf_zone; 1865 1866 return 0; 1867 1868 out_free_buf_zone: 1869 kmem_zone_destroy(xfs_buf_zone); 1870 out: 1871 return -ENOMEM; 1872 } 1873 1874 void 1875 xfs_buf_terminate(void) 1876 { 1877 destroy_workqueue(xfslogd_workqueue); 1878 kmem_zone_destroy(xfs_buf_zone); 1879 } 1880