/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "xfs.h"
#include <linux/stddef.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/bio.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <linux/workqueue.h>
#include <linux/percpu.h>
#include <linux/blkdev.h>
#include <linux/hash.h>
#include <linux/kthread.h>
#include <linux/migrate.h>
#include <linux/backing-dev.h>
#include <linux/freezer.h>

#include "xfs_sb.h"
#include "xfs_inum.h"
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_trace.h"

static kmem_zone_t *xfs_buf_zone;
STATIC int xfsbufd(void *);

static struct workqueue_struct *xfslogd_workqueue;
struct workqueue_struct *xfsdatad_workqueue;
struct workqueue_struct *xfsconvertd_workqueue;

#ifdef XFS_BUF_LOCK_TRACKING
# define XB_SET_OWNER(bp)	((bp)->b_last_holder = current->pid)
# define XB_CLEAR_OWNER(bp)	((bp)->b_last_holder = -1)
# define XB_GET_OWNER(bp)	((bp)->b_last_holder)
#else
# define XB_SET_OWNER(bp)	do { } while (0)
# define XB_CLEAR_OWNER(bp)	do { } while (0)
# define XB_GET_OWNER(bp)	do { } while (0)
#endif

#define xb_to_gfp(flags) \
	((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : \
	  ((flags) & XBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN)

#define xb_to_km(flags) \
	(((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)


static inline int
xfs_buf_is_vmapped(
	struct xfs_buf	*bp)
{
	/*
	 * Return true if the buffer is vmapped.
	 *
	 * The XBF_MAPPED flag is set if the buffer should be mapped, but the
	 * code is clever enough to know it doesn't have to map a single page,
	 * so the check has to be both for XBF_MAPPED and bp->b_page_count > 1.
	 */
	return (bp->b_flags & XBF_MAPPED) && bp->b_page_count > 1;
}

static inline int
xfs_buf_vmap_len(
	struct xfs_buf	*bp)
{
	return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
}

/*
 * xfs_buf_lru_add - add a buffer to the LRU.
 *
 * The LRU takes a new reference to the buffer so that it will only be freed
 * once the shrinker takes the buffer off the LRU.
 */
STATIC void
xfs_buf_lru_add(
	struct xfs_buf	*bp)
{
	struct xfs_buftarg *btp = bp->b_target;

	spin_lock(&btp->bt_lru_lock);
	if (list_empty(&bp->b_lru)) {
		atomic_inc(&bp->b_hold);
		list_add_tail(&bp->b_lru, &btp->bt_lru);
		btp->bt_lru_nr++;
	}
	spin_unlock(&btp->bt_lru_lock);
}

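/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * the LRU reference keeps a buffer alive after its last user reference is
 * dropped. With hypothetical counts:
 *
 *	xfs_buf_lru_add(bp);	b_hold: 1 -> 2, the LRU now owns a reference
 *	xfs_buf_rele(bp);	b_hold: 2 -> 1, only the LRU holds it
 *
 * The buffer is finally freed when the shrinker strips the LRU reference
 * and the hold count reaches zero.
 */
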
/*
 * xfs_buf_lru_del - remove a buffer from the LRU
 *
 * The unlocked check is safe here because it only occurs when there are no
 * b_lru_ref counts left on the buffer under the pag->pag_buf_lock. It is
 * there to optimise the shrinker removing the buffer from the LRU and
 * calling xfs_buf_free(), i.e. it removes an unnecessary round trip on the
 * bt_lru_lock.
 */
STATIC void
xfs_buf_lru_del(
	struct xfs_buf	*bp)
{
	struct xfs_buftarg *btp = bp->b_target;

	if (list_empty(&bp->b_lru))
		return;

	spin_lock(&btp->bt_lru_lock);
	if (!list_empty(&bp->b_lru)) {
		list_del_init(&bp->b_lru);
		btp->bt_lru_nr--;
	}
	spin_unlock(&btp->bt_lru_lock);
}

/*
 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
 * b_lru_ref count so that the buffer is freed immediately when the buffer
 * reference count falls to zero. If the buffer is already on the LRU, we need
 * to remove the reference that the LRU holds on the buffer.
 *
 * This prevents build-up of stale buffers on the LRU.
 */
void
xfs_buf_stale(
	struct xfs_buf	*bp)
{
	bp->b_flags |= XBF_STALE;
	xfs_buf_delwri_dequeue(bp);
	atomic_set(&(bp)->b_lru_ref, 0);
	if (!list_empty(&bp->b_lru)) {
		struct xfs_buftarg *btp = bp->b_target;

		spin_lock(&btp->bt_lru_lock);
		if (!list_empty(&bp->b_lru)) {
			list_del_init(&bp->b_lru);
			btp->bt_lru_nr--;
			atomic_dec(&bp->b_hold);
		}
		spin_unlock(&btp->bt_lru_lock);
	}
	ASSERT(atomic_read(&bp->b_hold) >= 1);
}

struct xfs_buf *
xfs_buf_alloc(
	struct xfs_buftarg	*target,
	xfs_off_t		range_base,
	size_t			range_length,
	xfs_buf_flags_t		flags)
{
	struct xfs_buf		*bp;

	bp = kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags));
	if (unlikely(!bp))
		return NULL;

	/*
	 * We don't want certain flags to appear in b_flags.
	 */
	flags &= ~(XBF_LOCK|XBF_MAPPED|XBF_DONT_BLOCK|XBF_READ_AHEAD);

	memset(bp, 0, sizeof(xfs_buf_t));
	atomic_set(&bp->b_hold, 1);
	atomic_set(&bp->b_lru_ref, 1);
	init_completion(&bp->b_iowait);
	INIT_LIST_HEAD(&bp->b_lru);
	INIT_LIST_HEAD(&bp->b_list);
	RB_CLEAR_NODE(&bp->b_rbnode);
	sema_init(&bp->b_sema, 0); /* held, no waiters */
	XB_SET_OWNER(bp);
	bp->b_target = target;
	bp->b_file_offset = range_base;
	/*
	 * Set buffer_length and count_desired to the same value initially.
	 * I/O routines should use count_desired, which will be the same in
	 * most cases but may be reset (e.g. XFS recovery).
	 */
	bp->b_buffer_length = bp->b_count_desired = range_length;
	bp->b_flags = flags;
	bp->b_bn = XFS_BUF_DADDR_NULL;
	atomic_set(&bp->b_pin_count, 0);
	init_waitqueue_head(&bp->b_waiters);

	XFS_STATS_INC(xb_create);
	trace_xfs_buf_init(bp, _RET_IP_);

	return bp;
}

/*
 * Allocate a page array capable of holding a specified number
 * of pages, and point the page buf at it.
 */
STATIC int
_xfs_buf_get_pages(
	xfs_buf_t		*bp,
	int			page_count,
	xfs_buf_flags_t		flags)
{
	/* Make sure that we have a page list */
	if (bp->b_pages == NULL) {
		bp->b_offset = xfs_buf_poff(bp->b_file_offset);
		bp->b_page_count = page_count;
		if (page_count <= XB_PAGES) {
			bp->b_pages = bp->b_page_array;
		} else {
			bp->b_pages = kmem_alloc(sizeof(struct page *) *
					page_count, xb_to_km(flags));
			if (bp->b_pages == NULL)
				return -ENOMEM;
		}
		memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
	}
	return 0;
}

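/*
 * Worked example (editorial, hypothetical sizes): on a system with 4k
 * pages, a 16k buffer needs page_count = 4 page pointers. As long as
 * page_count <= XB_PAGES those pointers live in the embedded
 * b_page_array, so only buffers spanning more than XB_PAGES pages pay
 * for a separate kmem_alloc() of the pointer array.
 */
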
/*
 * Frees b_pages if it was allocated.
 */
STATIC void
_xfs_buf_free_pages(
	xfs_buf_t	*bp)
{
	if (bp->b_pages != bp->b_page_array) {
		kmem_free(bp->b_pages);
		bp->b_pages = NULL;
	}
}

/*
 * Releases the specified buffer.
 *
 * The modification state of any associated pages is left unchanged.
 * The buffer must not be on any hash - use xfs_buf_rele instead for
 * hashed and refcounted buffers.
 */
void
xfs_buf_free(
	xfs_buf_t		*bp)
{
	trace_xfs_buf_free(bp, _RET_IP_);

	ASSERT(list_empty(&bp->b_lru));

	if (bp->b_flags & _XBF_PAGES) {
		uint		i;

		if (xfs_buf_is_vmapped(bp))
			vm_unmap_ram(bp->b_addr - bp->b_offset,
					bp->b_page_count);

		for (i = 0; i < bp->b_page_count; i++) {
			struct page	*page = bp->b_pages[i];

			__free_page(page);
		}
	} else if (bp->b_flags & _XBF_KMEM)
		kmem_free(bp->b_addr);
	_xfs_buf_free_pages(bp);
	kmem_zone_free(xfs_buf_zone, bp);
}

/*
 * Allocates all the pages for the buffer in question and builds its page
 * list.
 */
STATIC int
xfs_buf_allocate_memory(
	xfs_buf_t		*bp,
	uint			flags)
{
	size_t			size = bp->b_count_desired;
	size_t			nbytes, offset;
	gfp_t			gfp_mask = xb_to_gfp(flags);
	unsigned short		page_count, i;
	xfs_off_t		end;
	int			error;

	/*
	 * for buffers that are contained within a single page, just allocate
	 * the memory from the heap - there's no need for the complexity of
	 * page arrays to keep allocation down to order 0.
	 */
	if (bp->b_buffer_length < PAGE_SIZE) {
		bp->b_addr = kmem_alloc(bp->b_buffer_length, xb_to_km(flags));
		if (!bp->b_addr) {
			/* low memory - use alloc_page loop instead */
			goto use_alloc_page;
		}

		if (((unsigned long)(bp->b_addr + bp->b_buffer_length - 1) &
								PAGE_MASK) !=
		    ((unsigned long)bp->b_addr & PAGE_MASK)) {
			/* b_addr spans two pages - use alloc_page instead */
			kmem_free(bp->b_addr);
			bp->b_addr = NULL;
			goto use_alloc_page;
		}
		bp->b_offset = offset_in_page(bp->b_addr);
		bp->b_pages = bp->b_page_array;
		bp->b_pages[0] = virt_to_page(bp->b_addr);
		bp->b_page_count = 1;
		bp->b_flags |= XBF_MAPPED | _XBF_KMEM;
		return 0;
	}

use_alloc_page:
	end = bp->b_file_offset + bp->b_buffer_length;
	page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset);
	error = _xfs_buf_get_pages(bp, page_count, flags);
	if (unlikely(error))
		return error;

	offset = bp->b_offset;
	bp->b_flags |= _XBF_PAGES;

	for (i = 0; i < bp->b_page_count; i++) {
		struct page	*page;
		uint		retries = 0;
retry:
		page = alloc_page(gfp_mask);
		if (unlikely(page == NULL)) {
			if (flags & XBF_READ_AHEAD) {
				bp->b_page_count = i;
				error = ENOMEM;
				goto out_free_pages;
			}

			/*
			 * This could deadlock.
			 *
			 * But until all the XFS low-level code is revamped to
			 * handle buffer allocation failures we can't do much.
			 */
			if (!(++retries % 100))
				xfs_err(NULL,
		"possible memory allocation deadlock in %s (mode:0x%x)",
					__func__, gfp_mask);

			XFS_STATS_INC(xb_page_retries);
			congestion_wait(BLK_RW_ASYNC, HZ/50);
			goto retry;
		}

		XFS_STATS_INC(xb_page_found);

		nbytes = min_t(size_t, size, PAGE_SIZE - offset);
		size -= nbytes;
		bp->b_pages[i] = page;
		offset = 0;
	}
	return 0;

out_free_pages:
	for (i = 0; i < bp->b_page_count; i++)
		__free_page(bp->b_pages[i]);
	return error;
}

/*
 * Map buffer into kernel address-space if necessary.
 */
STATIC int
_xfs_buf_map_pages(
	xfs_buf_t		*bp,
	uint			flags)
{
	ASSERT(bp->b_flags & _XBF_PAGES);
	if (bp->b_page_count == 1) {
		/* A single page buffer is always mappable */
		bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
		bp->b_flags |= XBF_MAPPED;
	} else if (flags & XBF_MAPPED) {
		int retried = 0;

		do {
			bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
						-1, PAGE_KERNEL);
			if (bp->b_addr)
				break;
			vm_unmap_aliases();
		} while (retried++ <= 1);

		if (!bp->b_addr)
			return -ENOMEM;
		bp->b_addr += bp->b_offset;
		bp->b_flags |= XBF_MAPPED;
	}

	return 0;
}

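/*
 * Editorial note on the retry loop above: vm_map_ram() can fail
 * transiently while lazily-freed vmap areas still hold virtual address
 * space, so _xfs_buf_map_pages() calls vm_unmap_aliases() to force a
 * purge of those aliases before retrying, and only then gives up with
 * -ENOMEM.
 */
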
/*
 * Finding and Reading Buffers
 */

/*
 * Look up (and insert if absent) a lockable buffer for a given range of the
 * block device target. The buffer is returned locked. No I/O is implied by
 * this call.
 */
xfs_buf_t *
_xfs_buf_find(
	xfs_buftarg_t		*btp,	/* block device target		*/
	xfs_off_t		ioff,	/* starting offset of range	*/
	size_t			isize,	/* length of range		*/
	xfs_buf_flags_t		flags,
	xfs_buf_t		*new_bp)
{
	xfs_off_t		range_base;
	size_t			range_length;
	struct xfs_perag	*pag;
	struct rb_node		**rbp;
	struct rb_node		*parent;
	xfs_buf_t		*bp;

	range_base = (ioff << BBSHIFT);
	range_length = (isize << BBSHIFT);

	/* Check for IOs smaller than the sector size / not sector aligned */
	ASSERT(!(range_length < (1 << btp->bt_sshift)));
	ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));

	/* get tree root */
	pag = xfs_perag_get(btp->bt_mount,
				xfs_daddr_to_agno(btp->bt_mount, ioff));

	/* walk tree */
	spin_lock(&pag->pag_buf_lock);
	rbp = &pag->pag_buf_tree.rb_node;
	parent = NULL;
	bp = NULL;
	while (*rbp) {
		parent = *rbp;
		bp = rb_entry(parent, struct xfs_buf, b_rbnode);

		if (range_base < bp->b_file_offset)
			rbp = &(*rbp)->rb_left;
		else if (range_base > bp->b_file_offset)
			rbp = &(*rbp)->rb_right;
		else {
			/*
			 * found a block offset match. If the range doesn't
			 * match, the only way this is allowed is if the buffer
			 * in the cache is stale and the transaction that made
			 * it stale has not yet committed. i.e. we are
			 * reallocating a busy extent. Skip this buffer and
			 * continue searching to the right for an exact match.
			 */
			if (bp->b_buffer_length != range_length) {
				ASSERT(bp->b_flags & XBF_STALE);
				rbp = &(*rbp)->rb_right;
				continue;
			}
			atomic_inc(&bp->b_hold);
			goto found;
		}
	}

	/* No match found */
	if (new_bp) {
		rb_link_node(&new_bp->b_rbnode, parent, rbp);
		rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
		/* the buffer keeps the perag reference until it is freed */
		new_bp->b_pag = pag;
		spin_unlock(&pag->pag_buf_lock);
	} else {
		XFS_STATS_INC(xb_miss_locked);
		spin_unlock(&pag->pag_buf_lock);
		xfs_perag_put(pag);
	}
	return new_bp;

found:
	spin_unlock(&pag->pag_buf_lock);
	xfs_perag_put(pag);

	if (!xfs_buf_trylock(bp)) {
		if (flags & XBF_TRYLOCK) {
			xfs_buf_rele(bp);
			XFS_STATS_INC(xb_busy_locked);
			return NULL;
		}
		xfs_buf_lock(bp);
		XFS_STATS_INC(xb_get_locked_waited);
	}

	/*
	 * if the buffer is stale, clear all the external state associated with
	 * it. We need to keep flags such as how we allocated the buffer memory
	 * intact here.
	 */
	if (bp->b_flags & XBF_STALE) {
		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
		bp->b_flags &= XBF_MAPPED | _XBF_KMEM | _XBF_PAGES;
	}

	trace_xfs_buf_find(bp, flags, _RET_IP_);
	XFS_STATS_INC(xb_get_locked);
	return bp;
}

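/*
 * Usage sketch (editorial; mirrors xfs_buf_get() below): the lookup is
 * performed twice so that buffer allocation happens outside the
 * pag_buf_lock:
 *
 *	bp = _xfs_buf_find(btp, ioff, isize, flags, NULL);
 *	if (!bp) {
 *		new_bp = xfs_buf_alloc(btp, ioff << BBSHIFT,
 *				       isize << BBSHIFT, flags);
 *		bp = _xfs_buf_find(btp, ioff, isize, flags, new_bp);
 *		// bp == new_bp means we won the insert race and still
 *		// need to allocate the buffer's memory
 *	}
 */
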
/*
 * Assembles a buffer covering the specified range. The code is optimised for
 * cache hits, as metadata intensive workloads will see 3 orders of magnitude
 * more hits than misses.
 */
struct xfs_buf *
xfs_buf_get(
	xfs_buftarg_t		*target,/* target for buffer		*/
	xfs_off_t		ioff,	/* starting offset of range	*/
	size_t			isize,	/* length of range		*/
	xfs_buf_flags_t		flags)
{
	struct xfs_buf		*bp;
	struct xfs_buf		*new_bp;
	int			error = 0;

	bp = _xfs_buf_find(target, ioff, isize, flags, NULL);
	if (likely(bp))
		goto found;

	new_bp = xfs_buf_alloc(target, ioff << BBSHIFT, isize << BBSHIFT,
			       flags);
	if (unlikely(!new_bp))
		return NULL;

	bp = _xfs_buf_find(target, ioff, isize, flags, new_bp);
	if (!bp) {
		kmem_zone_free(xfs_buf_zone, new_bp);
		return NULL;
	}

	if (bp == new_bp) {
		error = xfs_buf_allocate_memory(bp, flags);
		if (error)
			goto no_buffer;
	} else
		kmem_zone_free(xfs_buf_zone, new_bp);

	/*
	 * Now that we have a workable buffer, fill in the block number so
	 * that we can do I/O on it.
	 */
	bp->b_bn = ioff;
	bp->b_count_desired = bp->b_buffer_length;

found:
	if (!(bp->b_flags & XBF_MAPPED)) {
		error = _xfs_buf_map_pages(bp, flags);
		if (unlikely(error)) {
			xfs_warn(target->bt_mount,
				"%s: failed to map pages", __func__);
			goto no_buffer;
		}
	}

	XFS_STATS_INC(xb_get);
	trace_xfs_buf_get(bp, flags, _RET_IP_);
	return bp;

no_buffer:
	if (flags & (XBF_LOCK | XBF_TRYLOCK))
		xfs_buf_unlock(bp);
	xfs_buf_rele(bp);
	return NULL;
}

STATIC int
_xfs_buf_read(
	xfs_buf_t		*bp,
	xfs_buf_flags_t		flags)
{
	int			status;

	ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE)));
	ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);

	bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_DELWRI | XBF_READ_AHEAD);
	bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);

	status = xfs_buf_iorequest(bp);
	if (status || bp->b_error || (flags & XBF_ASYNC))
		return status;
	return xfs_buf_iowait(bp);
}

xfs_buf_t *
xfs_buf_read(
	xfs_buftarg_t		*target,
	xfs_off_t		ioff,
	size_t			isize,
	xfs_buf_flags_t		flags)
{
	xfs_buf_t		*bp;

	flags |= XBF_READ;

	bp = xfs_buf_get(target, ioff, isize, flags);
	if (bp) {
		trace_xfs_buf_read(bp, flags, _RET_IP_);

		if (!XFS_BUF_ISDONE(bp)) {
			XFS_STATS_INC(xb_get_read);
			_xfs_buf_read(bp, flags);
		} else if (flags & XBF_ASYNC) {
			/*
			 * Read ahead call which is already satisfied,
			 * so drop the buffer.
			 */
			goto no_buffer;
		} else {
			/* We do not want read in the flags */
			bp->b_flags &= ~XBF_READ;
		}
	}

	return bp;

no_buffer:
	if (flags & (XBF_LOCK | XBF_TRYLOCK))
		xfs_buf_unlock(bp);
	xfs_buf_rele(bp);
	return NULL;
}

/*
 * If we are not low on memory then do the readahead in a
 * deadlock-safe manner.
 */
void
xfs_buf_readahead(
	xfs_buftarg_t		*target,
	xfs_off_t		ioff,
	size_t			isize)
{
	if (bdi_read_congested(target->bt_bdi))
		return;

	xfs_buf_read(target, ioff, isize,
		     XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK);
}

/*
 * Read an uncached buffer from disk. Allocates and returns a locked
 * buffer containing the disk contents or nothing.
 */
struct xfs_buf *
xfs_buf_read_uncached(
	struct xfs_mount	*mp,
	struct xfs_buftarg	*target,
	xfs_daddr_t		daddr,
	size_t			length,
	int			flags)
{
	xfs_buf_t		*bp;
	int			error;

	bp = xfs_buf_get_uncached(target, length, flags);
	if (!bp)
		return NULL;

	/* set up the buffer for a read IO */
	XFS_BUF_SET_ADDR(bp, daddr);
	XFS_BUF_READ(bp);

	xfsbdstrat(mp, bp);
	error = xfs_buf_iowait(bp);
	if (error || bp->b_error) {
		xfs_buf_relse(bp);
		return NULL;
	}
	return bp;
}

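/*
 * Usage sketch (editorial; the disk address and length are hypothetical):
 * read a single uncached region and release it when done.
 *
 *	bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, daddr,
 *				   BBTOB(1), 0);
 *	if (bp) {
 *		...inspect bp->b_addr...
 *		xfs_buf_relse(bp);
 *	}
 */
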
/*
 * Return a buffer allocated as an empty buffer and associated with external
 * memory via xfs_buf_associate_memory() back to its empty state.
 */
void
xfs_buf_set_empty(
	struct xfs_buf		*bp,
	size_t			len)
{
	if (bp->b_pages)
		_xfs_buf_free_pages(bp);

	bp->b_pages = NULL;
	bp->b_page_count = 0;
	bp->b_addr = NULL;
	bp->b_file_offset = 0;
	bp->b_buffer_length = bp->b_count_desired = len;
	bp->b_bn = XFS_BUF_DADDR_NULL;
	bp->b_flags &= ~XBF_MAPPED;
}

static inline struct page *
mem_to_page(
	void			*addr)
{
	if (!is_vmalloc_addr(addr))
		return virt_to_page(addr);
	else
		return vmalloc_to_page(addr);
}

int
xfs_buf_associate_memory(
	xfs_buf_t		*bp,
	void			*mem,
	size_t			len)
{
	int			rval;
	int			i = 0;
	unsigned long		pageaddr;
	unsigned long		offset;
	size_t			buflen;
	int			page_count;

	pageaddr = (unsigned long)mem & PAGE_MASK;
	offset = (unsigned long)mem - pageaddr;
	buflen = PAGE_ALIGN(len + offset);
	page_count = buflen >> PAGE_SHIFT;

	/* Free any previous set of page pointers */
	if (bp->b_pages)
		_xfs_buf_free_pages(bp);

	bp->b_pages = NULL;
	bp->b_addr = mem;

	rval = _xfs_buf_get_pages(bp, page_count, XBF_DONT_BLOCK);
	if (rval)
		return rval;

	bp->b_offset = offset;

	for (i = 0; i < bp->b_page_count; i++) {
		bp->b_pages[i] = mem_to_page((void *)pageaddr);
		pageaddr += PAGE_SIZE;
	}

	bp->b_count_desired = len;
	bp->b_buffer_length = buflen;
	bp->b_flags |= XBF_MAPPED;

	return 0;
}

xfs_buf_t *
xfs_buf_get_uncached(
	struct xfs_buftarg	*target,
	size_t			len,
	int			flags)
{
	unsigned long		page_count = PAGE_ALIGN(len) >> PAGE_SHIFT;
	int			error, i;
	xfs_buf_t		*bp;

	bp = xfs_buf_alloc(target, 0, len, 0);
	if (unlikely(bp == NULL))
		goto fail;

	error = _xfs_buf_get_pages(bp, page_count, 0);
	if (error)
		goto fail_free_buf;

	for (i = 0; i < page_count; i++) {
		bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
		if (!bp->b_pages[i])
			goto fail_free_mem;
	}
	bp->b_flags |= _XBF_PAGES;

	error = _xfs_buf_map_pages(bp, XBF_MAPPED);
	if (unlikely(error)) {
		xfs_warn(target->bt_mount,
			"%s: failed to map pages", __func__);
		goto fail_free_mem;
	}

	trace_xfs_buf_get_uncached(bp, _RET_IP_);
	return bp;

fail_free_mem:
	while (--i >= 0)
		__free_page(bp->b_pages[i]);
	_xfs_buf_free_pages(bp);
fail_free_buf:
	kmem_zone_free(xfs_buf_zone, bp);
fail:
	return NULL;
}

/*
 * Increment reference count on buffer, to hold the buffer concurrently
 * with another thread which may release (free) the buffer asynchronously.
 * Must hold the buffer already to call this function.
 */
void
xfs_buf_hold(
	xfs_buf_t		*bp)
{
	trace_xfs_buf_hold(bp, _RET_IP_);
	atomic_inc(&bp->b_hold);
}

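/*
 * Usage sketch (editorial; mem and size are caller-owned and hypothetical):
 * wrap existing memory in a buffer instead of allocating pages for it.
 *
 *	xfs_buf_set_empty(bp, size);
 *	error = xfs_buf_associate_memory(bp, mem, size);
 */
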
/*
 * Releases a hold on the specified buffer. If the hold count is 1, this
 * calls xfs_buf_free.
 */
void
xfs_buf_rele(
	xfs_buf_t		*bp)
{
	struct xfs_perag	*pag = bp->b_pag;

	trace_xfs_buf_rele(bp, _RET_IP_);

	if (!pag) {
		ASSERT(list_empty(&bp->b_lru));
		ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
		if (atomic_dec_and_test(&bp->b_hold))
			xfs_buf_free(bp);
		return;
	}

	ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));

	ASSERT(atomic_read(&bp->b_hold) > 0);
	if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
		if (!(bp->b_flags & XBF_STALE) &&
		    atomic_read(&bp->b_lru_ref)) {
			xfs_buf_lru_add(bp);
			spin_unlock(&pag->pag_buf_lock);
		} else {
			xfs_buf_lru_del(bp);
			ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
			rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
			spin_unlock(&pag->pag_buf_lock);
			xfs_perag_put(pag);
			xfs_buf_free(bp);
		}
	}
}

/*
 * Lock a buffer object, if it is not already locked.
 *
 * If we come across a stale, pinned, locked buffer, we know that we are
 * being asked to lock a buffer that has been reallocated. Because it is
 * pinned, we know that the log has not been pushed to disk and hence it
 * will still be locked. Rather than continuing to have trylock attempts
 * fail until someone else pushes the log, push it ourselves before
 * returning. This means that the xfsaild will not get stuck trying
 * to push on stale inode buffers.
 */
int
xfs_buf_trylock(
	struct xfs_buf		*bp)
{
	int			locked;

	locked = down_trylock(&bp->b_sema) == 0;
	if (locked)
		XB_SET_OWNER(bp);
	else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
		xfs_log_force(bp->b_target->bt_mount, 0);

	trace_xfs_buf_trylock(bp, _RET_IP_);
	return locked;
}

/*
 * Lock a buffer object.
 *
 * If we come across a stale, pinned, locked buffer, we know that we
 * are being asked to lock a buffer that has been reallocated. Because
 * it is pinned, we know that the log has not been pushed to disk and
 * hence it will still be locked. Rather than sleeping until someone
 * else pushes the log, push it ourselves before trying to get the lock.
 */
void
xfs_buf_lock(
	struct xfs_buf		*bp)
{
	trace_xfs_buf_lock(bp, _RET_IP_);

	if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
		xfs_log_force(bp->b_target->bt_mount, 0);
	down(&bp->b_sema);
	XB_SET_OWNER(bp);

	trace_xfs_buf_lock_done(bp, _RET_IP_);
}

/*
 * Releases the lock on the buffer object.
 * If the buffer is marked delwri but is not queued, do so before we
 * unlock the buffer as we need to set flags correctly. We also need to
 * take a reference for the delwri queue because the unlocker is going to
 * drop theirs and they don't know we just queued it.
 */
void
xfs_buf_unlock(
	struct xfs_buf		*bp)
{
	XB_CLEAR_OWNER(bp);
	up(&bp->b_sema);

	trace_xfs_buf_unlock(bp, _RET_IP_);
}

STATIC void
xfs_buf_wait_unpin(
	xfs_buf_t		*bp)
{
	DECLARE_WAITQUEUE(wait, current);

	if (atomic_read(&bp->b_pin_count) == 0)
		return;

	add_wait_queue(&bp->b_waiters, &wait);
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (atomic_read(&bp->b_pin_count) == 0)
			break;
		io_schedule();
	}
	remove_wait_queue(&bp->b_waiters, &wait);
	set_current_state(TASK_RUNNING);
}

/*
 * Buffer Utility Routines
 */

STATIC void
xfs_buf_iodone_work(
	struct work_struct	*work)
{
	xfs_buf_t		*bp =
		container_of(work, xfs_buf_t, b_iodone_work);

	if (bp->b_iodone)
		(*(bp->b_iodone))(bp);
	else if (bp->b_flags & XBF_ASYNC)
		xfs_buf_relse(bp);
}

void
xfs_buf_ioend(
	xfs_buf_t		*bp,
	int			schedule)
{
	trace_xfs_buf_iodone(bp, _RET_IP_);

	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
	if (bp->b_error == 0)
		bp->b_flags |= XBF_DONE;

	if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) {
		if (schedule) {
			INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
			queue_work(xfslogd_workqueue, &bp->b_iodone_work);
		} else {
			xfs_buf_iodone_work(&bp->b_iodone_work);
		}
	} else {
		complete(&bp->b_iowait);
	}
}

void
xfs_buf_ioerror(
	xfs_buf_t		*bp,
	int			error)
{
	ASSERT(error >= 0 && error <= 0xffff);
	bp->b_error = (unsigned short)error;
	trace_xfs_buf_ioerror(bp, error, _RET_IP_);
}

void
xfs_buf_ioerror_alert(
	struct xfs_buf		*bp,
	const char		*func)
{
	xfs_alert(bp->b_target->bt_mount,
"metadata I/O error: block 0x%llx (\"%s\") error %d buf count %zd",
		(__uint64_t)XFS_BUF_ADDR(bp), func,
		bp->b_error, XFS_BUF_COUNT(bp));
}

int
xfs_bwrite(
	struct xfs_buf		*bp)
{
	int			error;

	bp->b_flags |= XBF_WRITE;
	bp->b_flags &= ~(XBF_ASYNC | XBF_READ);

	xfs_buf_delwri_dequeue(bp);
	xfs_bdstrat_cb(bp);

	error = xfs_buf_iowait(bp);
	if (error) {
		xfs_force_shutdown(bp->b_target->bt_mount,
				   SHUTDOWN_META_IO_ERROR);
	}
	return error;
}

/*
 * Called when we want to stop a buffer from getting written or read.
 * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
 * so that the proper iodone callbacks get called.
 */
STATIC int
xfs_bioerror(
	xfs_buf_t		*bp)
{
#ifdef XFSERRORDEBUG
	ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
#endif

	/*
	 * No need to wait until the buffer is unpinned, we aren't flushing it.
	 */
	xfs_buf_ioerror(bp, EIO);

	/*
	 * We're calling xfs_buf_ioend, so clear the XBF_DONE flag.
	 */
	XFS_BUF_UNREAD(bp);
	XFS_BUF_UNDONE(bp);
	xfs_buf_stale(bp);

	xfs_buf_ioend(bp, 0);

	return EIO;
}

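/*
 * Editorial note: error values in this file follow the historical XFS
 * convention of positive errno values (e.g. the EIO returned above),
 * which are negated at the boundary to the rest of the kernel where
 * required.
 */
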
/*
 * Same as xfs_bioerror, except that we are releasing the buffer
 * here ourselves, and avoiding the xfs_buf_ioend call.
 * This is meant for userdata errors; metadata bufs come with
 * iodone functions attached, so that we can track down errors.
 */
STATIC int
xfs_bioerror_relse(
	struct xfs_buf	*bp)
{
	int64_t		fl = bp->b_flags;
	/*
	 * No need to wait until the buffer is unpinned.
	 * We aren't flushing it.
	 *
	 * chunkhold expects B_DONE to be set, whether
	 * we actually finish the I/O or not. We don't want to
	 * change that interface.
	 */
	XFS_BUF_UNREAD(bp);
	XFS_BUF_DONE(bp);
	xfs_buf_stale(bp);
	bp->b_iodone = NULL;
	if (!(fl & XBF_ASYNC)) {
		/*
		 * Mark b_error and B_ERROR _both_.
		 * Lots of chunkcache code assumes that.
		 * There's no reason to mark error for
		 * ASYNC buffers.
		 */
		xfs_buf_ioerror(bp, EIO);
		complete(&bp->b_iowait);
	} else {
		xfs_buf_relse(bp);
	}

	return EIO;
}

/*
 * All xfs metadata buffers except log state machine buffers
 * get this attached as their b_bdstrat callback function.
 * This is so that we can catch a buffer
 * after prematurely unpinning it to forcibly shut down the filesystem.
 */
int
xfs_bdstrat_cb(
	struct xfs_buf	*bp)
{
	if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
		trace_xfs_bdstrat_shut(bp, _RET_IP_);
		/*
		 * Metadata write that didn't get logged but
		 * written delayed anyway. These aren't associated
		 * with a transaction, and can be ignored.
		 */
		if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
			return xfs_bioerror_relse(bp);
		else
			return xfs_bioerror(bp);
	}

	xfs_buf_iorequest(bp);
	return 0;
}

/*
 * Wrapper around bdstrat so that we can stop data from going to disk in case
 * we are shutting down the filesystem. Typically user data goes through this
 * path; one of the exceptions is the superblock.
 */
void
xfsbdstrat(
	struct xfs_mount	*mp,
	struct xfs_buf		*bp)
{
	if (XFS_FORCED_SHUTDOWN(mp)) {
		trace_xfs_bdstrat_shut(bp, _RET_IP_);
		xfs_bioerror_relse(bp);
		return;
	}

	xfs_buf_iorequest(bp);
}

STATIC void
_xfs_buf_ioend(
	xfs_buf_t		*bp,
	int			schedule)
{
	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
		xfs_buf_ioend(bp, schedule);
}

STATIC void
xfs_buf_bio_end_io(
	struct bio		*bio,
	int			error)
{
	xfs_buf_t		*bp = (xfs_buf_t *)bio->bi_private;

	xfs_buf_ioerror(bp, -error);

	if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));

	_xfs_buf_ioend(bp, 1);
	bio_put(bio);
}

STATIC void
_xfs_buf_ioapply(
	xfs_buf_t		*bp)
{
	int			rw, map_i, total_nr_pages, nr_pages;
	struct bio		*bio;
	int			offset = bp->b_offset;
	int			size = bp->b_count_desired;
	sector_t		sector = bp->b_bn;

	total_nr_pages = bp->b_page_count;
	map_i = 0;

	if (bp->b_flags & XBF_WRITE) {
		if (bp->b_flags & XBF_SYNCIO)
			rw = WRITE_SYNC;
		else
			rw = WRITE;
		if (bp->b_flags & XBF_FUA)
			rw |= REQ_FUA;
		if (bp->b_flags & XBF_FLUSH)
			rw |= REQ_FLUSH;
	} else if (bp->b_flags & XBF_READ_AHEAD) {
		rw = READA;
	} else {
		rw = READ;
	}

	/* we only use the buffer cache for meta-data */
	rw |= REQ_META;

next_chunk:
	atomic_inc(&bp->b_io_remaining);
	nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
	if (nr_pages > total_nr_pages)
		nr_pages = total_nr_pages;

	bio = bio_alloc(GFP_NOIO, nr_pages);
	bio->bi_bdev = bp->b_target->bt_bdev;
	bio->bi_sector = sector;
	bio->bi_end_io = xfs_buf_bio_end_io;
	bio->bi_private = bp;

	for (; size && nr_pages; nr_pages--, map_i++) {
		int	rbytes, nbytes = PAGE_SIZE - offset;

		if (nbytes > size)
			nbytes = size;

		rbytes = bio_add_page(bio, bp->b_pages[map_i], nbytes, offset);
		if (rbytes < nbytes)
			break;

		offset = 0;
		sector += nbytes >> BBSHIFT;
		size -= nbytes;
		total_nr_pages--;
	}

	if (likely(bio->bi_size)) {
		if (xfs_buf_is_vmapped(bp)) {
			flush_kernel_vmap_range(bp->b_addr,
						xfs_buf_vmap_len(bp));
		}
		submit_bio(rw, bio);
		if (size)
			goto next_chunk;
	} else {
		xfs_buf_ioerror(bp, EIO);
		bio_put(bio);
	}
}

int
xfs_buf_iorequest(
	xfs_buf_t		*bp)
{
	trace_xfs_buf_iorequest(bp, _RET_IP_);

	ASSERT(!(bp->b_flags & XBF_DELWRI));

	if (bp->b_flags & XBF_WRITE)
		xfs_buf_wait_unpin(bp);
	xfs_buf_hold(bp);

	/*
	 * Set the count to 1 initially; this will stop an I/O completion
	 * callout which happens before we have started all the I/O from
	 * calling xfs_buf_ioend too early.
	 */
	atomic_set(&bp->b_io_remaining, 1);
	_xfs_buf_ioapply(bp);
	_xfs_buf_ioend(bp, 0);

	xfs_buf_rele(bp);
	return 0;
}

/*
 * Waits for I/O to complete on the buffer supplied.
 * It returns immediately if no I/O is pending.
 * It returns the I/O error code, if any, or 0 if there was no error.
 */
int
xfs_buf_iowait(
	xfs_buf_t		*bp)
{
	trace_xfs_buf_iowait(bp, _RET_IP_);

	wait_for_completion(&bp->b_iowait);

	trace_xfs_buf_iowait_done(bp, _RET_IP_);
	return bp->b_error;
}

xfs_caddr_t
xfs_buf_offset(
	xfs_buf_t		*bp,
	size_t			offset)
{
	struct page		*page;

	if (bp->b_flags & XBF_MAPPED)
		return bp->b_addr + offset;

	offset += bp->b_offset;
	page = bp->b_pages[offset >> PAGE_SHIFT];
	return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
}

/*
 * Move data into or out of a buffer.
 */
void
xfs_buf_iomove(
	xfs_buf_t		*bp,	/* buffer to process		*/
	size_t			boff,	/* starting buffer offset	*/
	size_t			bsize,	/* length to copy		*/
	void			*data,	/* data address			*/
	xfs_buf_rw_t		mode)	/* read/write/zero flag		*/
{
	size_t			bend, cpoff, csize;
	struct page		*page;

	bend = boff + bsize;
	while (boff < bend) {
		page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)];
		cpoff = xfs_buf_poff(boff + bp->b_offset);
		csize = min_t(size_t,
			      PAGE_SIZE-cpoff, bp->b_count_desired-boff);

		ASSERT(((csize + cpoff) <= PAGE_SIZE));

		switch (mode) {
		case XBRW_ZERO:
			memset(page_address(page) + cpoff, 0, csize);
			break;
		case XBRW_READ:
			memcpy(data, page_address(page) + cpoff, csize);
			break;
		case XBRW_WRITE:
			memcpy(page_address(page) + cpoff, data, csize);
		}

		boff += csize;
		data += csize;
	}
}

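/*
 * Usage sketch (editorial): xfs_buf_iomove() with XBRW_ZERO never reads
 * the data pointer, so callers can zero a sub-range of an unmapped
 * buffer with:
 *
 *	xfs_buf_iomove(bp, boff, bsize, NULL, XBRW_ZERO);
 */
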
/*
 * Handling of buffer targets (buftargs).
 */

/*
 * Wait for any bufs with callbacks that have been submitted but have not yet
 * returned. These buffers will have an elevated hold count, so wait on those
 * while freeing all the buffers only held by the LRU.
 */
void
xfs_wait_buftarg(
	struct xfs_buftarg	*btp)
{
	struct xfs_buf		*bp;

restart:
	spin_lock(&btp->bt_lru_lock);
	while (!list_empty(&btp->bt_lru)) {
		bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
		if (atomic_read(&bp->b_hold) > 1) {
			spin_unlock(&btp->bt_lru_lock);
			delay(100);
			goto restart;
		}
		/*
		 * clear the LRU reference count so the buffer doesn't get
		 * ignored in xfs_buf_rele().
		 */
		atomic_set(&bp->b_lru_ref, 0);
		spin_unlock(&btp->bt_lru_lock);
		xfs_buf_rele(bp);
		spin_lock(&btp->bt_lru_lock);
	}
	spin_unlock(&btp->bt_lru_lock);
}

int
xfs_buftarg_shrink(
	struct shrinker		*shrink,
	struct shrink_control	*sc)
{
	struct xfs_buftarg	*btp = container_of(shrink,
					struct xfs_buftarg, bt_shrinker);
	struct xfs_buf		*bp;
	int			nr_to_scan = sc->nr_to_scan;
	LIST_HEAD(dispose);

	if (!nr_to_scan)
		return btp->bt_lru_nr;

	spin_lock(&btp->bt_lru_lock);
	while (!list_empty(&btp->bt_lru)) {
		if (nr_to_scan-- <= 0)
			break;

		bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);

		/*
		 * Decrement the b_lru_ref count unless the value is already
		 * zero. If the value is already zero, we need to reclaim the
		 * buffer, otherwise it gets another trip through the LRU.
		 */
		if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
			list_move_tail(&bp->b_lru, &btp->bt_lru);
			continue;
		}

		/*
		 * remove the buffer from the LRU now to avoid needing another
		 * lock round trip inside xfs_buf_rele().
		 */
		list_move(&bp->b_lru, &dispose);
		btp->bt_lru_nr--;
	}
	spin_unlock(&btp->bt_lru_lock);

	while (!list_empty(&dispose)) {
		bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
		list_del_init(&bp->b_lru);
		xfs_buf_rele(bp);
	}

	return btp->bt_lru_nr;
}

void
xfs_free_buftarg(
	struct xfs_mount	*mp,
	struct xfs_buftarg	*btp)
{
	unregister_shrinker(&btp->bt_shrinker);

	xfs_flush_buftarg(btp, 1);
	if (mp->m_flags & XFS_MOUNT_BARRIER)
		xfs_blkdev_issue_flush(btp);

	kthread_stop(btp->bt_task);
	kmem_free(btp);
}

STATIC int
xfs_setsize_buftarg_flags(
	xfs_buftarg_t		*btp,
	unsigned int		blocksize,
	unsigned int		sectorsize,
	int			verbose)
{
	btp->bt_bsize = blocksize;
	btp->bt_sshift = ffs(sectorsize) - 1;
	btp->bt_smask = sectorsize - 1;

	if (set_blocksize(btp->bt_bdev, sectorsize)) {
		char name[BDEVNAME_SIZE];

		bdevname(btp->bt_bdev, name);

		xfs_warn(btp->bt_mount,
			"Cannot set_blocksize to %u on device %s",
			sectorsize, name);
		return EINVAL;
	}

	return 0;
}

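/*
 * Worked example (editorial, hypothetical device): for 512-byte sectors,
 * bt_sshift = ffs(512) - 1 = 9 and bt_smask = 0x1ff, so the alignment
 * asserts in _xfs_buf_find() reduce to checking the low nine bits of the
 * byte range.
 */
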
/*
 * When allocating the initial buffer target we have not yet
 * read in the superblock, so we don't know what size sectors
 * are being used at this early stage. Play it safe.
 */
STATIC int
xfs_setsize_buftarg_early(
	xfs_buftarg_t		*btp,
	struct block_device	*bdev)
{
	return xfs_setsize_buftarg_flags(btp,
			PAGE_SIZE, bdev_logical_block_size(bdev), 0);
}

int
xfs_setsize_buftarg(
	xfs_buftarg_t		*btp,
	unsigned int		blocksize,
	unsigned int		sectorsize)
{
	return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
}

STATIC int
xfs_alloc_delwri_queue(
	xfs_buftarg_t		*btp,
	const char		*fsname)
{
	INIT_LIST_HEAD(&btp->bt_delwri_queue);
	spin_lock_init(&btp->bt_delwri_lock);
	btp->bt_flags = 0;
	btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
	if (IS_ERR(btp->bt_task))
		return PTR_ERR(btp->bt_task);
	return 0;
}

xfs_buftarg_t *
xfs_alloc_buftarg(
	struct xfs_mount	*mp,
	struct block_device	*bdev,
	int			external,
	const char		*fsname)
{
	xfs_buftarg_t		*btp;

	btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);

	btp->bt_mount = mp;
	btp->bt_dev = bdev->bd_dev;
	btp->bt_bdev = bdev;
	btp->bt_bdi = blk_get_backing_dev_info(bdev);
	if (!btp->bt_bdi)
		goto error;

	INIT_LIST_HEAD(&btp->bt_lru);
	spin_lock_init(&btp->bt_lru_lock);
	if (xfs_setsize_buftarg_early(btp, bdev))
		goto error;
	if (xfs_alloc_delwri_queue(btp, fsname))
		goto error;
	btp->bt_shrinker.shrink = xfs_buftarg_shrink;
	btp->bt_shrinker.seeks = DEFAULT_SEEKS;
	register_shrinker(&btp->bt_shrinker);
	return btp;

error:
	kmem_free(btp);
	return NULL;
}

/*
 * Delayed write buffer handling
 */
void
xfs_buf_delwri_queue(
	xfs_buf_t		*bp)
{
	struct xfs_buftarg	*btp = bp->b_target;

	trace_xfs_buf_delwri_queue(bp, _RET_IP_);

	ASSERT(!(bp->b_flags & XBF_READ));

	spin_lock(&btp->bt_delwri_lock);
	if (!list_empty(&bp->b_list)) {
		/* if already in the queue, move it to the tail */
		ASSERT(bp->b_flags & _XBF_DELWRI_Q);
		list_move_tail(&bp->b_list, &btp->bt_delwri_queue);
	} else {
		/* start xfsbufd as it is about to have something to do */
		if (list_empty(&btp->bt_delwri_queue))
			wake_up_process(bp->b_target->bt_task);

		atomic_inc(&bp->b_hold);
		bp->b_flags |= XBF_DELWRI | _XBF_DELWRI_Q | XBF_ASYNC;
		list_add_tail(&bp->b_list, &btp->bt_delwri_queue);
	}
	bp->b_queuetime = jiffies;
	spin_unlock(&btp->bt_delwri_lock);
}

void
xfs_buf_delwri_dequeue(
	xfs_buf_t		*bp)
{
	int			dequeued = 0;

	spin_lock(&bp->b_target->bt_delwri_lock);
	if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) {
		ASSERT(bp->b_flags & _XBF_DELWRI_Q);
		list_del_init(&bp->b_list);
		dequeued = 1;
	}
	bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
	spin_unlock(&bp->b_target->bt_delwri_lock);

	if (dequeued)
		xfs_buf_rele(bp);

	trace_xfs_buf_delwri_dequeue(bp, _RET_IP_);
}

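/*
 * Lifecycle sketch (editorial): the normal path of a delayed-write
 * buffer through the queue above:
 *
 *	xfs_buf_delwri_queue(bp);	queued, b_hold elevated
 *	...buffer ages for xfs_buf_age_centisecs...
 *	xfsbufd -> xfs_buf_delwri_split() -> xfs_bdstrat_cb(bp)
 *
 * xfs_buf_delwri_promote() below short-circuits the ageing step by
 * backdating b_queuetime.
 */
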
/*
 * If a delwri buffer needs to be pushed before it has aged out, then promote
 * it to the head of the delwri queue so that it will be flushed on the next
 * xfsbufd run. We do this by resetting the queuetime of the buffer to be older
 * than the age currently needed to flush the buffer. Hence the next time the
 * xfsbufd sees it is guaranteed to be considered old enough to flush.
 */
void
xfs_buf_delwri_promote(
	struct xfs_buf	*bp)
{
	struct xfs_buftarg *btp = bp->b_target;
	long		age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1;

	ASSERT(bp->b_flags & XBF_DELWRI);
	ASSERT(bp->b_flags & _XBF_DELWRI_Q);

	/*
	 * Check the buffer age before locking the delayed write queue as we
	 * don't need to promote buffers that are already past the flush age.
	 */
	if (bp->b_queuetime < jiffies - age)
		return;
	bp->b_queuetime = jiffies - age;
	spin_lock(&btp->bt_delwri_lock);
	list_move(&bp->b_list, &btp->bt_delwri_queue);
	spin_unlock(&btp->bt_delwri_lock);
}

/*
 * Move as many buffers as specified to the supplied list, indicating if we
 * skipped any buffers to prevent deadlocks.
 */
STATIC int
xfs_buf_delwri_split(
	xfs_buftarg_t	*target,
	struct list_head *list,
	unsigned long	age)
{
	xfs_buf_t	*bp, *n;
	int		skipped = 0;
	int		force;

	force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags);
	INIT_LIST_HEAD(list);
	spin_lock(&target->bt_delwri_lock);
	list_for_each_entry_safe(bp, n, &target->bt_delwri_queue, b_list) {
		ASSERT(bp->b_flags & XBF_DELWRI);

		if (!xfs_buf_ispinned(bp) && xfs_buf_trylock(bp)) {
			if (!force &&
			    time_before(jiffies, bp->b_queuetime + age)) {
				xfs_buf_unlock(bp);
				break;
			}

			bp->b_flags &= ~(XBF_DELWRI | _XBF_DELWRI_Q);
			bp->b_flags |= XBF_WRITE;
			list_move_tail(&bp->b_list, list);
			trace_xfs_buf_delwri_split(bp, _RET_IP_);
		} else
			skipped++;
	}

	spin_unlock(&target->bt_delwri_lock);
	return skipped;
}

/*
 * The compare function is more complex than it needs to be because
 * the return value is only 32 bits and we are doing comparisons
 * on 64-bit values.
 */
static int
xfs_buf_cmp(
	void		*priv,
	struct list_head *a,
	struct list_head *b)
{
	struct xfs_buf	*ap = container_of(a, struct xfs_buf, b_list);
	struct xfs_buf	*bp = container_of(b, struct xfs_buf, b_list);
	xfs_daddr_t	diff;

	diff = ap->b_bn - bp->b_bn;
	if (diff < 0)
		return -1;
	if (diff > 0)
		return 1;
	return 0;
}

STATIC int
xfsbufd(
	void		*data)
{
	xfs_buftarg_t	*target = (xfs_buftarg_t *)data;

	current->flags |= PF_MEMALLOC;

	set_freezable();

	do {
		long	age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
		long	tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10);
		struct list_head tmp;
		struct blk_plug plug;

		if (unlikely(freezing(current)))
			try_to_freeze();

		/* sleep for a long time if there is nothing to do. */
		if (list_empty(&target->bt_delwri_queue))
			tout = MAX_SCHEDULE_TIMEOUT;
		schedule_timeout_interruptible(tout);

		xfs_buf_delwri_split(target, &tmp, age);
		list_sort(NULL, &tmp, xfs_buf_cmp);

		blk_start_plug(&plug);
		while (!list_empty(&tmp)) {
			struct xfs_buf *bp;
			bp = list_first_entry(&tmp, struct xfs_buf, b_list);
			list_del_init(&bp->b_list);
			xfs_bdstrat_cb(bp);
		}
		blk_finish_plug(&plug);
	} while (!kthread_should_stop());

	return 0;
}

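/*
 * Worked example (editorial; assumes HZ=100 and the default
 * xfs_buf_timer_centisecs of 100): tout = 100 * msecs_to_jiffies(10)
 * = 100 jiffies, so xfsbufd wakes roughly once per second while the
 * delwri queue is non-empty.
 */
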
/*
 * Go through all incore buffers, and release buffers if they belong to
 * the given device. This is used in filesystem error handling to
 * preserve the consistency of its metadata.
 */
int
xfs_flush_buftarg(
	xfs_buftarg_t	*target,
	int		wait)
{
	xfs_buf_t	*bp;
	int		pincount = 0;
	LIST_HEAD(tmp_list);
	LIST_HEAD(wait_list);
	struct blk_plug plug;

	flush_workqueue(xfslogd_workqueue);

	set_bit(XBT_FORCE_FLUSH, &target->bt_flags);
	pincount = xfs_buf_delwri_split(target, &tmp_list, 0);

	/*
	 * Dropped the delayed write list lock; now walk the temporary list.
	 * All I/O is issued async, and if we need to wait for completion we
	 * do that after issuing all the I/O.
	 */
	list_sort(NULL, &tmp_list, xfs_buf_cmp);

	blk_start_plug(&plug);
	while (!list_empty(&tmp_list)) {
		bp = list_first_entry(&tmp_list, struct xfs_buf, b_list);
		ASSERT(target == bp->b_target);
		list_del_init(&bp->b_list);
		if (wait) {
			bp->b_flags &= ~XBF_ASYNC;
			list_add(&bp->b_list, &wait_list);
		}
		xfs_bdstrat_cb(bp);
	}
	blk_finish_plug(&plug);

	if (wait) {
		/* Wait for IO to complete. */
		while (!list_empty(&wait_list)) {
			bp = list_first_entry(&wait_list, struct xfs_buf, b_list);

			list_del_init(&bp->b_list);
			xfs_buf_iowait(bp);
			xfs_buf_relse(bp);
		}
	}

	return pincount;
}

int __init
xfs_buf_init(void)
{
	xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
						KM_ZONE_HWALIGN, NULL);
	if (!xfs_buf_zone)
		goto out;

	xfslogd_workqueue = alloc_workqueue("xfslogd",
					WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
	if (!xfslogd_workqueue)
		goto out_free_buf_zone;

	xfsdatad_workqueue = alloc_workqueue("xfsdatad", WQ_MEM_RECLAIM, 1);
	if (!xfsdatad_workqueue)
		goto out_destroy_xfslogd_workqueue;

	xfsconvertd_workqueue = alloc_workqueue("xfsconvertd",
						WQ_MEM_RECLAIM, 1);
	if (!xfsconvertd_workqueue)
		goto out_destroy_xfsdatad_workqueue;

	return 0;

out_destroy_xfsdatad_workqueue:
	destroy_workqueue(xfsdatad_workqueue);
out_destroy_xfslogd_workqueue:
	destroy_workqueue(xfslogd_workqueue);
out_free_buf_zone:
	kmem_zone_destroy(xfs_buf_zone);
out:
	return -ENOMEM;
}

void
xfs_buf_terminate(void)
{
	destroy_workqueue(xfsconvertd_workqueue);
	destroy_workqueue(xfsdatad_workqueue);
	destroy_workqueue(xfslogd_workqueue);
	kmem_zone_destroy(xfs_buf_zone);
}