#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/cleancache.h>
#include "extent_io.h"
#include "extent_map.h"
#include "ctree.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "check-integrity.h"
#include "locking.h"
#include "rcu-string.h"
#include "backref.h"

static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
static struct bio_set *btrfs_bioset;

#ifdef CONFIG_BTRFS_DEBUG
static LIST_HEAD(buffers);
static LIST_HEAD(states);

static DEFINE_SPINLOCK(leak_lock);

static inline
void btrfs_leak_debug_add(struct list_head *new, struct list_head *head)
{
	unsigned long flags;

	spin_lock_irqsave(&leak_lock, flags);
	list_add(new, head);
	spin_unlock_irqrestore(&leak_lock, flags);
}

static inline
void btrfs_leak_debug_del(struct list_head *entry)
{
	unsigned long flags;

	spin_lock_irqsave(&leak_lock, flags);
	list_del(entry);
	spin_unlock_irqrestore(&leak_lock, flags);
}

static inline
void btrfs_leak_debug_check(void)
{
	struct extent_state *state;
	struct extent_buffer *eb;

	while (!list_empty(&states)) {
		state = list_entry(states.next, struct extent_state, leak_list);
		printk(KERN_ERR "BTRFS: state leak: start %llu end %llu "
		       "state %lu in tree %p refs %d\n",
		       state->start, state->end, state->state, state->tree,
		       atomic_read(&state->refs));
		list_del(&state->leak_list);
		kmem_cache_free(extent_state_cache, state);
	}

	while (!list_empty(&buffers)) {
		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
		printk(KERN_ERR "BTRFS: buffer leak start %llu len %lu "
		       "refs %d\n",
		       eb->start, eb->len, atomic_read(&eb->refs));
		list_del(&eb->leak_list);
		kmem_cache_free(extent_buffer_cache, eb);
	}
}

#define btrfs_debug_check_extent_io_range(tree, start, end)		\
	__btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
static inline void __btrfs_debug_check_extent_io_range(const char *caller,
		struct extent_io_tree *tree, u64 start, u64 end)
{
	struct inode *inode;
	u64 isize;

	if (!tree->mapping)
		return;

	inode = tree->mapping->host;
	isize = i_size_read(inode);
	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
		printk_ratelimited(KERN_DEBUG
		    "BTRFS: %s: ino %llu isize %llu odd range [%llu,%llu]\n",
				caller, btrfs_ino(inode), isize, start, end);
	}
}
#else
#define btrfs_leak_debug_add(new, head)	do {} while (0)
#define btrfs_leak_debug_del(entry)	do {} while (0)
#define btrfs_leak_debug_check()	do {} while (0)
#define btrfs_debug_check_extent_io_range(c, s, e)	do {} while (0)
#endif

#define BUFFER_LRU_MAX 64

struct tree_entry {
	u64 start;
	u64 end;
	struct rb_node rb_node;
};

struct extent_page_data {
	struct bio *bio;
	struct extent_io_tree *tree;
	get_extent_t *get_extent;
	unsigned long bio_flags;

	/* tells writepage not to lock the state bits for this range
	 * it still does the unlocking
	 */
	unsigned int extent_locked:1;

	/* tells the submit_bio code to use a WRITE_SYNC */
	unsigned int sync_io:1;
};

static noinline void flush_write_bio(void *data);
static inline struct btrfs_fs_info *
tree_fs_info(struct extent_io_tree *tree)
{
	if (!tree->mapping)
		return NULL;
	return btrfs_sb(tree->mapping->host->i_sb);
}

int __init extent_io_init(void)
{
	extent_state_cache = kmem_cache_create("btrfs_extent_state",
			sizeof(struct extent_state), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
	if (!extent_state_cache)
		return -ENOMEM;

	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
			sizeof(struct extent_buffer), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
	if (!extent_buffer_cache)
		goto free_state_cache;

	btrfs_bioset = bioset_create(BIO_POOL_SIZE,
				     offsetof(struct btrfs_io_bio, bio));
	if (!btrfs_bioset)
		goto free_buffer_cache;

	if (bioset_integrity_create(btrfs_bioset, BIO_POOL_SIZE))
		goto free_bioset;

	return 0;

free_bioset:
	bioset_free(btrfs_bioset);
	btrfs_bioset = NULL;

free_buffer_cache:
	kmem_cache_destroy(extent_buffer_cache);
	extent_buffer_cache = NULL;

free_state_cache:
	kmem_cache_destroy(extent_state_cache);
	extent_state_cache = NULL;
	return -ENOMEM;
}

void extent_io_exit(void)
{
	btrfs_leak_debug_check();

	/*
	 * Make sure all delayed rcu free are flushed before we
	 * destroy caches.
	 */
	rcu_barrier();
	if (extent_state_cache)
		kmem_cache_destroy(extent_state_cache);
	if (extent_buffer_cache)
		kmem_cache_destroy(extent_buffer_cache);
	if (btrfs_bioset)
		bioset_free(btrfs_bioset);
}

void extent_io_tree_init(struct extent_io_tree *tree,
			 struct address_space *mapping)
{
	tree->state = RB_ROOT;
	tree->ops = NULL;
	tree->dirty_bytes = 0;
	spin_lock_init(&tree->lock);
	tree->mapping = mapping;
}

static struct extent_state *alloc_extent_state(gfp_t mask)
{
	struct extent_state *state;

	state = kmem_cache_alloc(extent_state_cache, mask);
	if (!state)
		return state;
	state->state = 0;
	state->private = 0;
	state->tree = NULL;
	btrfs_leak_debug_add(&state->leak_list, &states);
	atomic_set(&state->refs, 1);
	init_waitqueue_head(&state->wq);
	trace_alloc_extent_state(state, mask, _RET_IP_);
	return state;
}

void free_extent_state(struct extent_state *state)
{
	if (!state)
		return;
	if (atomic_dec_and_test(&state->refs)) {
		WARN_ON(state->tree);
		btrfs_leak_debug_del(&state->leak_list);
		trace_free_extent_state(state, _RET_IP_);
		kmem_cache_free(extent_state_cache, state);
	}
}

static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
				   struct rb_node *node,
				   struct rb_node ***p_in,
				   struct rb_node **parent_in)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct tree_entry *entry;

	if (p_in && parent_in) {
		p = *p_in;
		parent = *parent_in;
		goto do_insert;
	}

	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct tree_entry, rb_node);

		if (offset < entry->start)
			p = &(*p)->rb_left;
		else if (offset > entry->end)
			p = &(*p)->rb_right;
		else
			return parent;
	}

do_insert:
	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}
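/*
 * Illustrative sketch (not part of the original file): tree_insert()
 * returns NULL when 'node' was linked into the rbtree, or the existing
 * node whose [start, end] range already covers 'offset'.  Callers such
 * as insert_state() below treat a non-NULL return as -EEXIST:
 *
 *	node = tree_insert(&tree->state, end, &state->rb_node, p, parent);
 *	if (node)
 *		return -EEXIST;	// overlapping extent_state already present
 */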
static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
				      struct rb_node **prev_ret,
				      struct rb_node **next_ret,
				      struct rb_node ***p_ret,
				      struct rb_node **parent_ret)
{
	struct rb_root *root = &tree->state;
	struct rb_node **n = &root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *orig_prev = NULL;
	struct tree_entry *entry;
	struct tree_entry *prev_entry = NULL;

	while (*n) {
		prev = *n;
		entry = rb_entry(prev, struct tree_entry, rb_node);
		prev_entry = entry;

		if (offset < entry->start)
			n = &(*n)->rb_left;
		else if (offset > entry->end)
			n = &(*n)->rb_right;
		else
			return *n;
	}

	if (p_ret)
		*p_ret = n;
	if (parent_ret)
		*parent_ret = prev;

	if (prev_ret) {
		orig_prev = prev;
		while (prev && offset > prev_entry->end) {
			prev = rb_next(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*prev_ret = prev;
		prev = orig_prev;
	}

	if (next_ret) {
		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		while (prev && offset < prev_entry->start) {
			prev = rb_prev(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*next_ret = prev;
	}
	return NULL;
}

static inline struct rb_node *
tree_search_for_insert(struct extent_io_tree *tree,
		       u64 offset,
		       struct rb_node ***p_ret,
		       struct rb_node **parent_ret)
{
	struct rb_node *prev = NULL;
	struct rb_node *ret;

	ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret);
	if (!ret)
		return prev;
	return ret;
}

static inline struct rb_node *tree_search(struct extent_io_tree *tree,
					  u64 offset)
{
	return tree_search_for_insert(tree, offset, NULL, NULL);
}

static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
		     struct extent_state *other)
{
	if (tree->ops && tree->ops->merge_extent_hook)
		tree->ops->merge_extent_hook(tree->mapping->host, new,
					     other);
}

/*
 * utility function to look for merge candidates inside a given range.
 * Any extents with matching state are merged together into a single
 * extent in the tree.  Extents with EXTENT_IOBITS set in their state field
 * are not merged because the end_io handlers need to be able to do
 * operations on them without sleeping (or doing allocations/splits).
 *
 * This should be called with the tree lock held.
 */
static void merge_state(struct extent_io_tree *tree,
			struct extent_state *state)
{
	struct extent_state *other;
	struct rb_node *other_node;

	if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
		return;

	other_node = rb_prev(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->end == state->start - 1 &&
		    other->state == state->state) {
			merge_cb(tree, state, other);
			state->start = other->start;
			other->tree = NULL;
			rb_erase(&other->rb_node, &tree->state);
			free_extent_state(other);
		}
	}
	other_node = rb_next(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->start == state->end + 1 &&
		    other->state == state->state) {
			merge_cb(tree, state, other);
			state->end = other->end;
			other->tree = NULL;
			rb_erase(&other->rb_node, &tree->state);
			free_extent_state(other);
		}
	}
}

static void set_state_cb(struct extent_io_tree *tree,
			 struct extent_state *state, unsigned long *bits)
{
	if (tree->ops && tree->ops->set_bit_hook)
		tree->ops->set_bit_hook(tree->mapping->host, state, bits);
}

static void clear_state_cb(struct extent_io_tree *tree,
			   struct extent_state *state, unsigned long *bits)
{
	if (tree->ops && tree->ops->clear_bit_hook)
		tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
}

static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state, unsigned long *bits);

/*
 * insert an extent_state struct into the tree.  'bits' are set on the
 * struct before it is inserted.
 *
 * This may return -EEXIST if the extent is already there, in which case the
 * state struct is freed.
 *
 * The tree lock is not taken internally.  This is a utility function and
 * probably isn't what you want to call (see set/clear_extent_bit).
 */
static int insert_state(struct extent_io_tree *tree,
			struct extent_state *state, u64 start, u64 end,
			struct rb_node ***p,
			struct rb_node **parent,
			unsigned long *bits)
{
	struct rb_node *node;

	if (end < start)
		WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n",
		       end, start);
	state->start = start;
	state->end = end;

	set_state_bits(tree, state, bits);

	node = tree_insert(&tree->state, end, &state->rb_node, p, parent);
	if (node) {
		struct extent_state *found;
		found = rb_entry(node, struct extent_state, rb_node);
		printk(KERN_ERR "BTRFS: found node %llu %llu on insert of "
		       "%llu %llu\n",
		       found->start, found->end, start, end);
		return -EEXIST;
	}
	state->tree = tree;
	merge_state(tree, state);
	return 0;
}

static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
		     u64 split)
{
	if (tree->ops && tree->ops->split_extent_hook)
		tree->ops->split_extent_hook(tree->mapping->host, orig, split);
}

/*
 * split a given extent state struct in two, inserting the preallocated
 * struct 'prealloc' as the newly created second half.  'split' indicates an
 * offset inside 'orig' where it should be split.
 *
 * Before calling,
 * the tree has 'orig' at [orig->start, orig->end].
 * After calling, there are two extent state structs in the tree:
 * prealloc: [orig->start, split - 1]
 * orig: [ split, orig->end ]
 *
 * The tree locks are not taken by this function. They need to be held
 * by the caller.
 */
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
		       struct extent_state *prealloc, u64 split)
{
	struct rb_node *node;

	split_cb(tree, orig, split);

	prealloc->start = orig->start;
	prealloc->end = split - 1;
	prealloc->state = orig->state;
	orig->start = split;

	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node,
			   NULL, NULL);
	if (node) {
		free_extent_state(prealloc);
		return -EEXIST;
	}
	prealloc->tree = tree;
	return 0;
}

static struct extent_state *next_state(struct extent_state *state)
{
	struct rb_node *next = rb_next(&state->rb_node);
	if (next)
		return rb_entry(next, struct extent_state, rb_node);
	else
		return NULL;
}

/*
 * utility function to clear some bits in an extent state struct.
 * it will optionally wake up anyone waiting on this state (wake == 1).
 *
 * If no bits are set on the state struct after clearing things, the
 * struct is freed and removed from the tree
 */
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
					    struct extent_state *state,
					    unsigned long *bits, int wake)
{
	struct extent_state *next;
	unsigned long bits_to_clear = *bits & ~EXTENT_CTLBITS;

	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		WARN_ON(range > tree->dirty_bytes);
		tree->dirty_bytes -= range;
	}
	clear_state_cb(tree, state, bits);
	state->state &= ~bits_to_clear;
	if (wake)
		wake_up(&state->wq);
	if (state->state == 0) {
		next = next_state(state);
		if (state->tree) {
			rb_erase(&state->rb_node, &tree->state);
			state->tree = NULL;
			free_extent_state(state);
		} else {
			WARN_ON(1);
		}
	} else {
		merge_state(tree, state);
		next = next_state(state);
	}
	return next;
}

static struct extent_state *
alloc_extent_state_atomic(struct extent_state *prealloc)
{
	if (!prealloc)
		prealloc = alloc_extent_state(GFP_ATOMIC);

	return prealloc;
}

static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
{
	btrfs_panic(tree_fs_info(tree), err, "Locking error: "
		    "Extent tree was modified by another "
		    "thread while locked.");
}

/*
 * clear some bits on a range in the tree.  This may require splitting
 * or inserting elements in the tree, so the gfp mask is used to
 * indicate which allocations or sleeping are allowed.
 *
 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 * the given range from the tree regardless of state (ie for truncate).
 *
 * the range [start, end] is inclusive.
 *
 * This takes the tree lock, and returns 0 on success and < 0 on error.
 */
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		     unsigned long bits, int wake, int delete,
		     struct extent_state **cached_state,
		     gfp_t mask)
{
	struct extent_state *state;
	struct extent_state *cached;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	u64 last_end;
	int err;
	int clear = 0;

	btrfs_debug_check_extent_io_range(tree, start, end);

	if (bits & EXTENT_DELALLOC)
		bits |= EXTENT_NORESERVE;

	if (delete)
		bits |= ~EXTENT_CTLBITS;
	bits |= EXTENT_FIRST_DELALLOC;

	if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
		clear = 1;
again:
	if (!prealloc && (mask & __GFP_WAIT)) {
		prealloc = alloc_extent_state(mask);
		if (!prealloc)
			return -ENOMEM;
	}

	spin_lock(&tree->lock);
	if (cached_state) {
		cached = *cached_state;

		if (clear) {
			*cached_state = NULL;
			cached_state = NULL;
		}

		if (cached && cached->tree && cached->start <= start &&
		    cached->end > start) {
			if (clear)
				atomic_dec(&cached->refs);
			state = cached;
			goto hit_next;
		}
		if (clear)
			free_extent_state(cached);
	}
	/*
	 * this search will find the extents that end after
	 * our range starts
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	if (state->start > end)
		goto out;
	WARN_ON(state->end < start);
	last_end = state->end;

	/* the state doesn't have the wanted bits, go ahead */
	if (!(state->state & bits)) {
		state = next_state(state);
		goto next;
	}

	/*
	 *     | ---- desired range ---- |
	 *  | state | or
	 *  | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip
	 * bits on second half.
	 *
	 * If the extent we found extends past our range, we
	 * just split and search again.  It'll get split again
	 * the next time though.
	 *
	 * If the extent we found is inside our range, we clear
	 * the desired bit on it.
	 */

	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			state = clear_state_bit(tree, state, &bits, wake);
			goto next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and clear the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		if (wake)
			wake_up(&state->wq);

		clear_state_bit(tree, prealloc, &bits, wake);

		prealloc = NULL;
		goto out;
	}

	state = clear_state_bit(tree, state, &bits, wake);
next:
	if (last_end == (u64)-1)
		goto out;
	start = last_end + 1;
	if (start <= end && state && !need_resched())
		goto hit_next;
	goto search_again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return 0;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (mask & __GFP_WAIT)
		cond_resched();
	goto again;
}

static void wait_on_state(struct extent_io_tree *tree,
			  struct extent_state *state)
		__releases(tree->lock)
		__acquires(tree->lock)
{
	DEFINE_WAIT(wait);
	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
	spin_unlock(&tree->lock);
	schedule();
	spin_lock(&tree->lock);
	finish_wait(&state->wq, &wait);
}
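/*
 * Illustrative note (not part of the original file): clear_state_bit()'s
 * wake_up() above pairs with wait_on_state()/wait_extent_bit() below.
 * For example, unlock_extent() further down in this file clears
 * EXTENT_LOCKED with wake == 1:
 *
 *	clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
 *			 GFP_NOFS);
 *
 * so any task sleeping in lock_extent_bits() on that range is woken and
 * can retry taking the lock.
 */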
/*
 * waits for one or more bits to clear on a range in the state tree.
 * The range [start, end] is inclusive.
 * The tree lock is taken by this function
 */
static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
			    unsigned long bits)
{
	struct extent_state *state;
	struct rb_node *node;

	btrfs_debug_check_extent_io_range(tree, start, end);

	spin_lock(&tree->lock);
again:
	while (1) {
		/*
		 * this search will find all the extents that end after
		 * our range starts
		 */
		node = tree_search(tree, start);
		if (!node)
			break;

		state = rb_entry(node, struct extent_state, rb_node);

		if (state->start > end)
			goto out;

		if (state->state & bits) {
			start = state->start;
			atomic_inc(&state->refs);
			wait_on_state(tree, state);
			free_extent_state(state);
			goto again;
		}
		start = state->end + 1;

		if (start > end)
			break;

		cond_resched_lock(&tree->lock);
	}
out:
	spin_unlock(&tree->lock);
}

static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state,
			   unsigned long *bits)
{
	unsigned long bits_to_set = *bits & ~EXTENT_CTLBITS;

	set_state_cb(tree, state, bits);
	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		tree->dirty_bytes += range;
	}
	state->state |= bits_to_set;
}

static void cache_state(struct extent_state *state,
			struct extent_state **cached_ptr)
{
	if (cached_ptr && !(*cached_ptr)) {
		if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
			*cached_ptr = state;
			atomic_inc(&state->refs);
		}
	}
}

/*
 * set some bits on a range in the tree.  This may require allocations or
 * sleeping, so the gfp mask is used to indicate what is allowed.
 *
 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 * part of the range already has the desired bits set.  The start of the
 * existing range is returned in failed_start in this case.
 *
 * [start, end] is inclusive.  This takes the tree lock.
 */

static int __must_check
__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		 unsigned long bits, unsigned long exclusive_bits,
		 u64 *failed_start, struct extent_state **cached_state,
		 gfp_t mask)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	struct rb_node **p;
	struct rb_node *parent;
	int err = 0;
	u64 last_start;
	u64 last_end;

	btrfs_debug_check_extent_io_range(tree, start, end);

	bits |= EXTENT_FIRST_DELALLOC;
again:
	if (!prealloc && (mask & __GFP_WAIT)) {
		prealloc = alloc_extent_state(mask);
		BUG_ON(!prealloc);
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start <= start && state->end > start &&
		    state->tree) {
			node = &state->rb_node;
			goto hit_next;
		}
	}
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search_for_insert(tree, start, &p, &parent);
	if (!node) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = insert_state(tree, prealloc, start, end,
				   &p, &parent, &bits);
		if (err)
			extent_io_tree_panic(tree, err);

		cache_state(prealloc, cached_state);
		prealloc = NULL;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		if (state->state & exclusive_bits) {
			*failed_start = state->start;
			err = -EEXIST;
			goto out;
		}

		set_state_bits(tree, state, &bits);
		cache_state(state, cached_state);
		merge_state(tree, state);
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		state = next_state(state);
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again.  It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, &bits);
			cache_state(state, cached_state);
			merge_state(tree, state);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			state = next_state(state);
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);

		/*
		 * Avoid freeing 'prealloc' if it can be merged with
		 * the later extent.
		 */
		err = insert_state(tree, prealloc, start, this_end,
				   NULL, NULL, &bits);
		if (err)
			extent_io_tree_panic(tree, err);

		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		set_state_bits(tree, prealloc, &bits);
		cache_state(prealloc, cached_state);
		merge_state(tree, prealloc);
		prealloc = NULL;
		goto out;
	}

	goto search_again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (mask & __GFP_WAIT)
		cond_resched();
	goto again;
}

int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		   unsigned long bits, u64 *failed_start,
		   struct extent_state **cached_state, gfp_t mask)
{
	return __set_extent_bit(tree, start, end, bits, 0, failed_start,
				cached_state, mask);
}


/**
 * convert_extent_bit - convert all bits in a given range from one bit to
 *			another
 * @tree:	the io tree to search
 * @start:	the start offset in bytes
 * @end:	the end offset in bytes (inclusive)
 * @bits:	the bits to set in this range
 * @clear_bits:	the bits to clear in this range
 * @cached_state:	state that we're going to cache
 * @mask:	the allocation mask
 *
 * This will go through and set bits for the given range.  If any states exist
 * already in this range they are set with the given bit and cleared of the
 * clear_bits.  This is only meant to be used by things that are mergeable, ie
 * converting from say DELALLOC to DIRTY.  This is not meant to be used with
 * boundary bits like LOCK.
 */
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		       unsigned long bits, unsigned long clear_bits,
		       struct extent_state **cached_state, gfp_t mask)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	struct rb_node **p;
	struct rb_node *parent;
	int err = 0;
	u64 last_start;
	u64 last_end;

	btrfs_debug_check_extent_io_range(tree, start, end);

again:
	if (!prealloc && (mask & __GFP_WAIT)) {
		prealloc = alloc_extent_state(mask);
		if (!prealloc)
			return -ENOMEM;
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start <= start && state->end > start &&
		    state->tree) {
			node = &state->rb_node;
			goto hit_next;
		}
	}

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search_for_insert(tree, start, &p, &parent);
	if (!node) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		err = insert_state(tree, prealloc, start, end,
				   &p, &parent, &bits);
		if (err)
			extent_io_tree_panic(tree, err);
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		set_state_bits(tree, state, &bits);
		cache_state(state, cached_state);
		state = clear_state_bit(tree, state, &clear_bits, 0);
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again.  It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, &bits);
			cache_state(state, cached_state);
			state = clear_state_bit(tree, state, &clear_bits, 0);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		/*
		 * Avoid freeing 'prealloc' if it can be merged with
		 * the later extent.
		 */
		err = insert_state(tree, prealloc, start, this_end,
				   NULL, NULL, &bits);
		if (err)
			extent_io_tree_panic(tree, err);
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		set_state_bits(tree, prealloc, &bits);
		cache_state(prealloc, cached_state);
		clear_state_bit(tree, prealloc, &clear_bits, 0);
		prealloc = NULL;
		goto out;
	}

	goto search_again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (mask & __GFP_WAIT)
		cond_resched();
	goto again;
}

/* wrappers around set/clear extent bit */
int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
		     gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
			      NULL, mask);
}

int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		    unsigned long bits, gfp_t mask)
{
	return set_extent_bit(tree, start, end, bits, NULL,
			      NULL, mask);
}

int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		      unsigned long bits, gfp_t mask)
{
	return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
}

int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
			struct extent_state **cached_state, gfp_t mask)
{
	return set_extent_bit(tree, start, end,
			      EXTENT_DELALLOC | EXTENT_UPTODATE,
			      NULL, cached_state, mask);
}

int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
		      struct extent_state **cached_state, gfp_t mask)
{
	return set_extent_bit(tree, start, end,
			      EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
			      NULL, cached_state, mask);
}

int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
		       gfp_t mask)
{
	return clear_extent_bit(tree, start, end,
				EXTENT_DIRTY | EXTENT_DELALLOC |
				EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
}

int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
		   gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_NEW, NULL,
			      NULL, mask);
}

int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
			struct extent_state **cached_state, gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL,
			      cached_state, mask);
}

int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
			  struct extent_state **cached_state, gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
				cached_state, mask);
}
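/*
 * Illustrative sketch (not part of the original file): the wrappers above
 * are typically combined with the locking helpers that follow, passing a
 * cached extent_state so the unlock path can skip the rbtree search:
 *
 *	struct extent_state *cached = NULL;
 *
 *	lock_extent_bits(tree, start, end, 0, &cached);
 *	set_extent_delalloc(tree, start, end, &cached, GFP_NOFS);
 *	unlock_extent_cached(tree, start, end, &cached, GFP_NOFS);
 */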
/*
 * either insert or lock state struct between start and end.  Use mask to
 * tell us if waiting is desired.
 */
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		     unsigned long bits, struct extent_state **cached_state)
{
	int err;
	u64 failed_start;
	while (1) {
		err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
				       EXTENT_LOCKED, &failed_start,
				       cached_state, GFP_NOFS);
		if (err == -EEXIST) {
			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
			start = failed_start;
		} else
			break;
		WARN_ON(start > end);
	}
	return err;
}

int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
	return lock_extent_bits(tree, start, end, 0, NULL);
}

int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
	int err;
	u64 failed_start;

	err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
			       &failed_start, NULL, GFP_NOFS);
	if (err == -EEXIST) {
		if (failed_start > start)
			clear_extent_bit(tree, start, failed_start - 1,
					 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
		return 0;
	}
	return 1;
}

int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
			 struct extent_state **cached, gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
				mask);
}

int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
				GFP_NOFS);
}

int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(inode->i_mapping, index);
		BUG_ON(!page); /* Pages should be in the extent_io_tree */
		clear_page_dirty_for_io(page);
		page_cache_release(page);
		index++;
	}
	return 0;
}

int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(inode->i_mapping, index);
		BUG_ON(!page); /* Pages should be in the extent_io_tree */
		account_page_redirty(page);
		__set_page_dirty_nobuffers(page);
		page_cache_release(page);
		index++;
	}
	return 0;
}

/*
 * helper function to set both pages and extents in the tree writeback
 */
static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(tree->mapping, index);
		BUG_ON(!page); /* Pages should be in the extent_io_tree */
		set_page_writeback(page);
		page_cache_release(page);
		index++;
	}
	return 0;
}
/*
 * find the first state struct with 'bits' set after 'start', and
 * return it.  tree->lock must be held.  NULL will be returned if
 * nothing was found after 'start'
 */
static struct extent_state *
find_first_extent_bit_state(struct extent_io_tree *tree,
			    u64 start, unsigned long bits)
{
	struct rb_node *node;
	struct extent_state *state;

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->end >= start && (state->state & bits))
			return state;

		node = rb_next(node);
		if (!node)
			break;
	}
out:
	return NULL;
}

/*
 * find the first offset in the io tree with 'bits' set. zero is
 * returned if we find something, and *start_ret and *end_ret are
 * set to reflect the state struct that was found.
 *
 * If nothing was found, 1 is returned. If found something, return 0.
 */
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
			  u64 *start_ret, u64 *end_ret, unsigned long bits,
			  struct extent_state **cached_state)
{
	struct extent_state *state;
	struct rb_node *n;
	int ret = 1;

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->end == start - 1 && state->tree) {
			n = rb_next(&state->rb_node);
			while (n) {
				state = rb_entry(n, struct extent_state,
						 rb_node);
				if (state->state & bits)
					goto got_it;
				n = rb_next(n);
			}
			free_extent_state(*cached_state);
			*cached_state = NULL;
			goto out;
		}
		free_extent_state(*cached_state);
		*cached_state = NULL;
	}

	state = find_first_extent_bit_state(tree, start, bits);
got_it:
	if (state) {
		cache_state(state, cached_state);
		*start_ret = state->start;
		*end_ret = state->end;
		ret = 0;
	}
out:
	spin_unlock(&tree->lock);
	return ret;
}

/*
 * find a contiguous range of bytes in the file marked as delalloc, not
 * more than 'max_bytes'.  start and end are used to return the range.
 *
 * 1 is returned if we find something, 0 if nothing was in the tree
 */
static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
					u64 *start, u64 *end, u64 max_bytes,
					struct extent_state **cached_state)
{
	struct rb_node *node;
	struct extent_state *state;
	u64 cur_start = *start;
	u64 found = 0;
	u64 total_bytes = 0;

	spin_lock(&tree->lock);

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, cur_start);
	if (!node) {
		if (!found)
			*end = (u64)-1;
		goto out;
	}

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (found && (state->start != cur_start ||
			      (state->state & EXTENT_BOUNDARY))) {
			goto out;
		}
		if (!(state->state & EXTENT_DELALLOC)) {
			if (!found)
				*end = state->end;
			goto out;
		}
		if (!found) {
			*start = state->start;
			*cached_state = state;
			atomic_inc(&state->refs);
		}
		found++;
		*end = state->end;
		cur_start = state->end + 1;
		node = rb_next(node);
		total_bytes += state->end - state->start + 1;
		if (total_bytes >= max_bytes)
			break;
		if (!node)
			break;
	}
out:
	spin_unlock(&tree->lock);
	return found;
}

static noinline void __unlock_for_delalloc(struct inode *inode,
					   struct page *locked_page,
					   u64 start, u64 end)
{
	int ret;
	struct page *pages[16];
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	unsigned long nr_pages = end_index - index + 1;
	int i;

	if (index == locked_page->index && end_index == index)
		return;

	while (nr_pages > 0) {
		ret = find_get_pages_contig(inode->i_mapping, index,
				     min_t(unsigned long, nr_pages,
				     ARRAY_SIZE(pages)), pages);
		for (i = 0; i < ret; i++) {
			if (pages[i] != locked_page)
				unlock_page(pages[i]);
			page_cache_release(pages[i]);
		}
		nr_pages -= ret;
		index += ret;
		cond_resched();
	}
}

static noinline int lock_delalloc_pages(struct inode *inode,
					struct page *locked_page,
					u64 delalloc_start,
					u64 delalloc_end)
{
	unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
	unsigned long start_index = index;
	unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
	unsigned long pages_locked = 0;
	struct page *pages[16];
	unsigned long nrpages;
	int ret;
	int i;

	/* the caller is responsible for locking the start index */
	if (index == locked_page->index && index == end_index)
		return 0;

	/* skip the page at the start index */
	nrpages = end_index - index + 1;
	while (nrpages > 0) {
		ret = find_get_pages_contig(inode->i_mapping, index,
				     min_t(unsigned long,
				     nrpages, ARRAY_SIZE(pages)), pages);
		if (ret == 0) {
			ret = -EAGAIN;
			goto done;
		}
		/* now we have an array of pages, lock them all */
		for (i = 0; i < ret; i++) {
			/*
			 * the caller is taking responsibility for
			 * locked_page
			 */
			if (pages[i] != locked_page) {
				lock_page(pages[i]);
				if (!PageDirty(pages[i]) ||
				    pages[i]->mapping != inode->i_mapping) {
					ret = -EAGAIN;
					unlock_page(pages[i]);
					page_cache_release(pages[i]);
					goto done;
				}
			}
			page_cache_release(pages[i]);
			pages_locked++;
		}
		nrpages -= ret;
		index += ret;
		cond_resched();
	}
	ret = 0;
done:
	if (ret && pages_locked) {
		__unlock_for_delalloc(inode, locked_page,
			      delalloc_start,
			      ((u64)(start_index + pages_locked - 1)) <<
			      PAGE_CACHE_SHIFT);
	}
	return ret;
}
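/*
 * Illustrative note (not part of the original file): lock_delalloc_pages()
 * and __unlock_for_delalloc() are used as a pair by
 * find_lock_delalloc_range() below.  On -EAGAIN the caller shrinks the
 * range (down to a single page) and retries rather than failing outright:
 *
 *	ret = lock_delalloc_pages(inode, locked_page, start, end);
 *	if (ret == -EAGAIN) {
 *		max_bytes = PAGE_CACHE_SIZE;
 *		goto again;
 *	}
 */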
/*
 * find a contiguous range of bytes in the file marked as delalloc, not
 * more than 'max_bytes'.  start and end are used to return the range.
 *
 * 1 is returned if we find something, 0 if nothing was in the tree
 */
STATIC u64 find_lock_delalloc_range(struct inode *inode,
				    struct extent_io_tree *tree,
				    struct page *locked_page, u64 *start,
				    u64 *end, u64 max_bytes)
{
	u64 delalloc_start;
	u64 delalloc_end;
	u64 found;
	struct extent_state *cached_state = NULL;
	int ret;
	int loops = 0;

again:
	/* step one, find a bunch of delalloc bytes starting at start */
	delalloc_start = *start;
	delalloc_end = 0;
	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
				    max_bytes, &cached_state);
	if (!found || delalloc_end <= *start) {
		*start = delalloc_start;
		*end = delalloc_end;
		free_extent_state(cached_state);
		return 0;
	}

	/*
	 * start comes from the offset of locked_page.  We have to lock
	 * pages in order, so we can't process delalloc bytes before
	 * locked_page
	 */
	if (delalloc_start < *start)
		delalloc_start = *start;

	/*
	 * make sure to limit the number of pages we try to lock down
	 */
	if (delalloc_end + 1 - delalloc_start > max_bytes)
		delalloc_end = delalloc_start + max_bytes - 1;

	/* step two, lock all the pages after the page that has start */
	ret = lock_delalloc_pages(inode, locked_page,
				  delalloc_start, delalloc_end);
	if (ret == -EAGAIN) {
		/* some of the pages are gone, lets avoid looping by
		 * shortening the size of the delalloc range we're searching
		 */
		free_extent_state(cached_state);
		if (!loops) {
			max_bytes = PAGE_CACHE_SIZE;
			loops = 1;
			goto again;
		} else {
			found = 0;
			goto out_failed;
		}
	}
	BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */

	/* step three, lock the state bits for the whole range */
	lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state);

	/* then test to make sure it is all still delalloc */
	ret = test_range_bit(tree, delalloc_start, delalloc_end,
			     EXTENT_DELALLOC, 1, cached_state);
	if (!ret) {
		unlock_extent_cached(tree, delalloc_start, delalloc_end,
				     &cached_state, GFP_NOFS);
		__unlock_for_delalloc(inode, locked_page,
			      delalloc_start, delalloc_end);
		cond_resched();
		goto again;
	}
	free_extent_state(cached_state);
	*start = delalloc_start;
	*end = delalloc_end;
out_failed:
	return found;
}

int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
				 struct page *locked_page,
				 unsigned long clear_bits,
				 unsigned long page_ops)
{
	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
	int ret;
	struct page *pages[16];
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	unsigned long nr_pages = end_index - index + 1;
	int i;

	clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
	if (page_ops == 0)
		return 0;

	while (nr_pages > 0) {
		ret = find_get_pages_contig(inode->i_mapping, index,
				     min_t(unsigned long,
				     nr_pages, ARRAY_SIZE(pages)), pages);
		for (i = 0; i < ret; i++) {

			if (page_ops & PAGE_SET_PRIVATE2)
				SetPagePrivate2(pages[i]);

			if (pages[i] == locked_page) {
				page_cache_release(pages[i]);
				continue;
			}
			if (page_ops & PAGE_CLEAR_DIRTY)
				clear_page_dirty_for_io(pages[i]);
			if (page_ops & PAGE_SET_WRITEBACK)
				set_page_writeback(pages[i]);
			if (page_ops & PAGE_END_WRITEBACK)
				end_page_writeback(pages[i]);
			if (page_ops & PAGE_UNLOCK)
				unlock_page(pages[i]);
			page_cache_release(pages[i]);
		}
		nr_pages -= ret;
		index += ret;
		cond_resched();
	}
	return 0;
}

/*
 * count the number of bytes in the tree that have a given bit(s)
 * set.  This can be fairly slow, except for EXTENT_DIRTY which is
 * cached.  The total number found is returned.
 */
u64 count_range_bits(struct extent_io_tree *tree,
		     u64 *start, u64 search_end, u64 max_bytes,
		     unsigned long bits, int contig)
{
	struct rb_node *node;
	struct extent_state *state;
	u64 cur_start = *start;
	u64 total_bytes = 0;
	u64 last = 0;
	int found = 0;

	if (WARN_ON(search_end <= cur_start))
		return 0;

	spin_lock(&tree->lock);
	if (cur_start == 0 && bits == EXTENT_DIRTY) {
		total_bytes = tree->dirty_bytes;
		goto out;
	}
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, cur_start);
	if (!node)
		goto out;

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->start > search_end)
			break;
		if (contig && found && state->start > last + 1)
			break;
		if (state->end >= cur_start && (state->state & bits) == bits) {
			total_bytes += min(search_end, state->end) + 1 -
				       max(cur_start, state->start);
			if (total_bytes >= max_bytes)
				break;
			if (!found) {
				*start = max(cur_start, state->start);
				found = 1;
			}
			last = state->end;
		} else if (contig && found) {
			break;
		}
		node = rb_next(node);
		if (!node)
			break;
	}
out:
	spin_unlock(&tree->lock);
	return total_bytes;
}

/*
 * set the private field for a given byte offset in the tree.  If there isn't
 * an extent_state there already, this does nothing.
 */
static int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
{
	struct rb_node *node;
	struct extent_state *state;
	int ret = 0;

	spin_lock(&tree->lock);
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node) {
		ret = -ENOENT;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
	if (state->start != start) {
		ret = -ENOENT;
		goto out;
	}
	state->private = private;
out:
	spin_unlock(&tree->lock);
	return ret;
}

int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
{
	struct rb_node *node;
	struct extent_state *state;
	int ret = 0;

	spin_lock(&tree->lock);
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node) {
		ret = -ENOENT;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
	if (state->start != start) {
		ret = -ENOENT;
		goto out;
	}
	*private = state->private;
out:
	spin_unlock(&tree->lock);
	return ret;
}
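/*
 * Illustrative sketch (not part of the original file): the private field
 * set above is how this file stashes an io_failure_record pointer keyed
 * by the start offset of the failed range (see bio_readpage_error() and
 * clean_io_failure() below):
 *
 *	set_state_private(failure_tree, start, (u64)(unsigned long)failrec);
 *	...
 *	ret = get_state_private(failure_tree, start, &private);
 *	if (!ret)
 *		failrec = (struct io_failure_record *)(unsigned long)private;
 */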
/*
 * searches a range in the state tree for a given mask.
 * If 'filled' == 1, this returns 1 only if every extent in the range
 * has the bits set.  Otherwise, 1 is returned if any bit in the
 * range is found set.
 */
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
		   unsigned long bits, int filled, struct extent_state *cached)
{
	struct extent_state *state = NULL;
	struct rb_node *node;
	int bitset = 0;

	spin_lock(&tree->lock);
	if (cached && cached->tree && cached->start <= start &&
	    cached->end > start)
		node = &cached->rb_node;
	else
		node = tree_search(tree, start);
	while (node && start <= end) {
		state = rb_entry(node, struct extent_state, rb_node);

		if (filled && state->start > start) {
			bitset = 0;
			break;
		}

		if (state->start > end)
			break;

		if (state->state & bits) {
			bitset = 1;
			if (!filled)
				break;
		} else if (filled) {
			bitset = 0;
			break;
		}

		if (state->end == (u64)-1)
			break;

		start = state->end + 1;
		if (start > end)
			break;
		node = rb_next(node);
		if (!node) {
			if (filled)
				bitset = 0;
			break;
		}
	}
	spin_unlock(&tree->lock);
	return bitset;
}

/*
 * helper function to set a given page up to date if all the
 * extents in the tree for that page are up to date
 */
static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
{
	u64 start = page_offset(page);
	u64 end = start + PAGE_CACHE_SIZE - 1;
	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
		SetPageUptodate(page);
}

/*
 * When IO fails, either with EIO or csum verification fails, we
 * try other mirrors that might have a good copy of the data.  This
 * io_failure_record is used to record state as we go through all the
 * mirrors.  If another mirror has good data, the page is set up to date
 * and things continue.  If a good mirror can't be found, the original
 * bio end_io callback is called to indicate things have failed.
 */
struct io_failure_record {
	struct page *page;
	u64 start;
	u64 len;
	u64 logical;
	unsigned long bio_flags;
	int this_mirror;
	int failed_mirror;
	int in_validation;
};

static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
			   int did_repair)
{
	int ret;
	int err = 0;
	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;

	set_state_private(failure_tree, rec->start, 0);
	ret = clear_extent_bits(failure_tree, rec->start,
				rec->start + rec->len - 1,
				EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
	if (ret)
		err = ret;

	ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
				rec->start + rec->len - 1,
				EXTENT_DAMAGED, GFP_NOFS);
	if (ret && !err)
		err = ret;

	kfree(rec);
	return err;
}
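/*
 * Illustrative note (not part of the original file): the lifecycle of an
 * io_failure_record in this file is roughly:
 *
 *	bio_readpage_error()	allocates the record, marks the range
 *				EXTENT_LOCKED | EXTENT_DIRTY in the
 *				io_failure_tree and EXTENT_DAMAGED in the
 *				io_tree, and stores the pointer with
 *				set_state_private();
 *	clean_io_failure()	looks the record up on a later successful
 *				read and, if other copies exist, rewrites
 *				the bad copy via repair_io_failure();
 *	free_io_failure()	clears those bits and frees the record.
 */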
/*
 * this bypasses the standard btrfs submit functions deliberately, as
 * the standard behavior is to write all copies in a raid setup. here we only
 * want to write the one bad copy. so we do the mapping for ourselves and issue
 * submit_bio directly.
 * to avoid any synchronization issues, wait for the data after writing, which
 * actually prevents the read that triggered the error from finishing.
 * currently, there can be no more than two copies of every data bit. thus,
 * exactly one rewrite is required.
 */
int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
		      u64 length, u64 logical, struct page *page,
		      int mirror_num)
{
	struct bio *bio;
	struct btrfs_device *dev;
	u64 map_length = 0;
	u64 sector;
	struct btrfs_bio *bbio = NULL;
	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
	int ret;

	ASSERT(!(fs_info->sb->s_flags & MS_RDONLY));
	BUG_ON(!mirror_num);

	/* we can't repair anything in raid56 yet */
	if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num))
		return 0;

	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
	if (!bio)
		return -EIO;
	bio->bi_iter.bi_size = 0;
	map_length = length;

	ret = btrfs_map_block(fs_info, WRITE, logical,
			      &map_length, &bbio, mirror_num);
	if (ret) {
		bio_put(bio);
		return -EIO;
	}
	BUG_ON(mirror_num != bbio->mirror_num);
	sector = bbio->stripes[mirror_num-1].physical >> 9;
	bio->bi_iter.bi_sector = sector;
	dev = bbio->stripes[mirror_num-1].dev;
	kfree(bbio);
	if (!dev || !dev->bdev || !dev->writeable) {
		bio_put(bio);
		return -EIO;
	}
	bio->bi_bdev = dev->bdev;
	bio_add_page(bio, page, length, start - page_offset(page));

	if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) {
		/* try to remap that extent elsewhere? */
		bio_put(bio);
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
		return -EIO;
	}

	printk_ratelimited_in_rcu(KERN_INFO
			"BTRFS: read error corrected: ino %lu off %llu "
			"(dev %s sector %llu)\n", page->mapping->host->i_ino,
			start, rcu_str_deref(dev->name), sector);

	bio_put(bio);
	return 0;
}

int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
			 int mirror_num)
{
	u64 start = eb->start;
	unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
	int ret = 0;

	if (root->fs_info->sb->s_flags & MS_RDONLY)
		return -EROFS;

	for (i = 0; i < num_pages; i++) {
		struct page *p = extent_buffer_page(eb, i);
		ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE,
					start, p, mirror_num);
		if (ret)
			break;
		start += PAGE_CACHE_SIZE;
	}

	return ret;
}

/*
 * each time an IO finishes, we do a fast check in the IO failure tree
 * to see if we need to process or clean up an io_failure_record
 */
static int clean_io_failure(u64 start, struct page *page)
{
	u64 private;
	u64 private_failure;
	struct io_failure_record *failrec;
	struct inode *inode = page->mapping->host;
	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
	struct extent_state *state;
	int num_copies;
	int did_repair = 0;
	int ret;

	private = 0;
	ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
				(u64)-1, 1, EXTENT_DIRTY, 0);
	if (!ret)
		return 0;

	ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
				&private_failure);
	if (ret)
		return 0;

	failrec = (struct io_failure_record *)(unsigned long) private_failure;
	BUG_ON(!failrec->this_mirror);

	if (failrec->in_validation) {
		/* there was no real error, just free the record */
		pr_debug("clean_io_failure: freeing dummy error at %llu\n",
			 failrec->start);
		did_repair = 1;
		goto out;
	}
(fs_info->sb->s_flags & MS_RDONLY) 2126 goto out; 2127 2128 spin_lock(&BTRFS_I(inode)->io_tree.lock); 2129 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, 2130 failrec->start, 2131 EXTENT_LOCKED); 2132 spin_unlock(&BTRFS_I(inode)->io_tree.lock); 2133 2134 if (state && state->start <= failrec->start && 2135 state->end >= failrec->start + failrec->len - 1) { 2136 num_copies = btrfs_num_copies(fs_info, failrec->logical, 2137 failrec->len); 2138 if (num_copies > 1) { 2139 ret = repair_io_failure(fs_info, start, failrec->len, 2140 failrec->logical, page, 2141 failrec->failed_mirror); 2142 did_repair = !ret; 2143 } 2144 ret = 0; 2145 } 2146 2147 out: 2148 if (!ret) 2149 ret = free_io_failure(inode, failrec, did_repair); 2150 2151 return ret; 2152 } 2153 2154 /* 2155 * this is a generic handler for readpage errors (default 2156 * readpage_io_failed_hook). if other copies exist, read those and write back 2157 * good data to the failed position. does not investigate in remapping the 2158 * failed extent elsewhere, hoping the device will be smart enough to do this as 2159 * needed 2160 */ 2161 2162 static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, 2163 struct page *page, u64 start, u64 end, 2164 int failed_mirror) 2165 { 2166 struct io_failure_record *failrec = NULL; 2167 u64 private; 2168 struct extent_map *em; 2169 struct inode *inode = page->mapping->host; 2170 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 2171 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 2172 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 2173 struct bio *bio; 2174 struct btrfs_io_bio *btrfs_failed_bio; 2175 struct btrfs_io_bio *btrfs_bio; 2176 int num_copies; 2177 int ret; 2178 int read_mode; 2179 u64 logical; 2180 2181 BUG_ON(failed_bio->bi_rw & REQ_WRITE); 2182 2183 ret = get_state_private(failure_tree, start, &private); 2184 if (ret) { 2185 failrec = kzalloc(sizeof(*failrec), GFP_NOFS); 2186 if (!failrec) 2187 return -ENOMEM; 2188 failrec->start = start; 2189 failrec->len = end - start + 1; 2190 failrec->this_mirror = 0; 2191 failrec->bio_flags = 0; 2192 failrec->in_validation = 0; 2193 2194 read_lock(&em_tree->lock); 2195 em = lookup_extent_mapping(em_tree, start, failrec->len); 2196 if (!em) { 2197 read_unlock(&em_tree->lock); 2198 kfree(failrec); 2199 return -EIO; 2200 } 2201 2202 if (em->start > start || em->start + em->len <= start) { 2203 free_extent_map(em); 2204 em = NULL; 2205 } 2206 read_unlock(&em_tree->lock); 2207 2208 if (!em) { 2209 kfree(failrec); 2210 return -EIO; 2211 } 2212 logical = start - em->start; 2213 logical = em->block_start + logical; 2214 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 2215 logical = em->block_start; 2216 failrec->bio_flags = EXTENT_BIO_COMPRESSED; 2217 extent_set_compress_type(&failrec->bio_flags, 2218 em->compress_type); 2219 } 2220 pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, " 2221 "len=%llu\n", logical, start, failrec->len); 2222 failrec->logical = logical; 2223 free_extent_map(em); 2224 2225 /* set the bits in the private failure tree */ 2226 ret = set_extent_bits(failure_tree, start, end, 2227 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); 2228 if (ret >= 0) 2229 ret = set_state_private(failure_tree, start, 2230 (u64)(unsigned long)failrec); 2231 /* set the bits in the inode's tree */ 2232 if (ret >= 0) 2233 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED, 2234 GFP_NOFS); 2235 if (ret < 0) { 2236 kfree(failrec); 2237 return ret; 2238 } 2239 } else { 2240 failrec = 
(struct io_failure_record *)(unsigned long)private; 2241 pr_debug("bio_readpage_error: (found) logical=%llu, " 2242 "start=%llu, len=%llu, validation=%d\n", 2243 failrec->logical, failrec->start, failrec->len, 2244 failrec->in_validation); 2245 /* 2246 * when data can be on disk more than twice, add to failrec here 2247 * (e.g. with a list for failed_mirror) to make 2248 * clean_io_failure() clean all those errors at once. 2249 */ 2250 } 2251 num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info, 2252 failrec->logical, failrec->len); 2253 if (num_copies == 1) { 2254 /* 2255 * we only have a single copy of the data, so don't bother with 2256 * all the retry and error correction code that follows. no 2257 * matter what the error is, it is very likely to persist. 2258 */ 2259 pr_debug("bio_readpage_error: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n", 2260 num_copies, failrec->this_mirror, failed_mirror); 2261 free_io_failure(inode, failrec, 0); 2262 return -EIO; 2263 } 2264 2265 /* 2266 * there are two premises: 2267 * a) deliver good data to the caller 2268 * b) correct the bad sectors on disk 2269 */ 2270 if (failed_bio->bi_vcnt > 1) { 2271 /* 2272 * to fulfill b), we need to know the exact failing sectors, as 2273 * we don't want to rewrite any more than the failed ones. thus, 2274 * we need separate read requests for the failed bio 2275 * 2276 * if the following BUG_ON triggers, our validation request got 2277 * merged. we need separate requests for our algorithm to work. 2278 */ 2279 BUG_ON(failrec->in_validation); 2280 failrec->in_validation = 1; 2281 failrec->this_mirror = failed_mirror; 2282 read_mode = READ_SYNC | REQ_FAILFAST_DEV; 2283 } else { 2284 /* 2285 * we're ready to fulfill a) and b) alongside. get a good copy 2286 * of the failed sector and if we succeed, we have setup 2287 * everything for repair_io_failure to do the rest for us. 
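* The retry target is simply the previous this_mirror plus one, skipping over failed_mirror: e.g. with num_copies == 2 and failed_mirror == 1 we retry mirror 2, and if that read fails as well the this_mirror > num_copies check below gives up with -EIO.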
2288 */ 2289 if (failrec->in_validation) { 2290 BUG_ON(failrec->this_mirror != failed_mirror); 2291 failrec->in_validation = 0; 2292 failrec->this_mirror = 0; 2293 } 2294 failrec->failed_mirror = failed_mirror; 2295 failrec->this_mirror++; 2296 if (failrec->this_mirror == failed_mirror) 2297 failrec->this_mirror++; 2298 read_mode = READ_SYNC; 2299 } 2300 2301 if (failrec->this_mirror > num_copies) { 2302 pr_debug("bio_readpage_error: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n", 2303 num_copies, failrec->this_mirror, failed_mirror); 2304 free_io_failure(inode, failrec, 0); 2305 return -EIO; 2306 } 2307 2308 bio = btrfs_io_bio_alloc(GFP_NOFS, 1); 2309 if (!bio) { 2310 free_io_failure(inode, failrec, 0); 2311 return -EIO; 2312 } 2313 bio->bi_end_io = failed_bio->bi_end_io; 2314 bio->bi_iter.bi_sector = failrec->logical >> 9; 2315 bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; 2316 bio->bi_iter.bi_size = 0; 2317 2318 btrfs_failed_bio = btrfs_io_bio(failed_bio); 2319 if (btrfs_failed_bio->csum) { 2320 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 2321 u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); 2322 2323 btrfs_bio = btrfs_io_bio(bio); 2324 btrfs_bio->csum = btrfs_bio->csum_inline; 2325 phy_offset >>= inode->i_sb->s_blocksize_bits; 2326 phy_offset *= csum_size; 2327 memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + phy_offset, 2328 csum_size); 2329 } 2330 2331 bio_add_page(bio, page, failrec->len, start - page_offset(page)); 2332 2333 pr_debug("bio_readpage_error: submitting new read[%#x] to " 2334 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode, 2335 failrec->this_mirror, num_copies, failrec->in_validation); 2336 2337 ret = tree->ops->submit_bio_hook(inode, read_mode, bio, 2338 failrec->this_mirror, 2339 failrec->bio_flags, 0); 2340 return ret; 2341 } 2342 2343 /* lots and lots of room for performance fixes in the end_bio funcs */ 2344 2345 int end_extent_writepage(struct page *page, int err, u64 start, u64 end) 2346 { 2347 int uptodate = (err == 0); 2348 struct extent_io_tree *tree; 2349 int ret; 2350 2351 tree = &BTRFS_I(page->mapping->host)->io_tree; 2352 2353 if (tree->ops && tree->ops->writepage_end_io_hook) { 2354 ret = tree->ops->writepage_end_io_hook(page, start, 2355 end, NULL, uptodate); 2356 if (ret) 2357 uptodate = 0; 2358 } 2359 2360 if (!uptodate) { 2361 ClearPageUptodate(page); 2362 SetPageError(page); 2363 } 2364 return 0; 2365 } 2366 2367 /* 2368 * after a writepage IO is done, we need to: 2369 * clear the uptodate bits on error 2370 * clear the writeback bits in the extent tree for this IO 2371 * end_page_writeback if the page has no more pending IO 2372 * 2373 * Scheduling is not allowed, so the extent state tree is expected 2374 * to have one and only one object corresponding to this IO. 2375 */ 2376 static void end_bio_extent_writepage(struct bio *bio, int err) 2377 { 2378 struct bio_vec *bvec; 2379 u64 start; 2380 u64 end; 2381 int i; 2382 2383 bio_for_each_segment_all(bvec, bio, i) { 2384 struct page *page = bvec->bv_page; 2385 2386 /* We always issue full-page reads, but if some block 2387 * in a page fails to read, blk_update_request() will 2388 * advance bv_offset and adjust bv_len to compensate. 2389 * Print a warning for nonzero offsets, and an error 2390 * if they don't add up to a full page. 
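* end_extent_writepage() below runs the writepage_end_io_hook for the byte range of this bvec and marks the page with ClearPageUptodate/SetPageError when the write (or the hook) failed; end_page_writeback() then drops the writeback state.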
*/ 2391 if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) { 2392 if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE) 2393 btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info, 2394 "partial page write in btrfs with offset %u and length %u", 2395 bvec->bv_offset, bvec->bv_len); 2396 else 2397 btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info, 2398 "incomplete page write in btrfs with offset %u and " 2399 "length %u", 2400 bvec->bv_offset, bvec->bv_len); 2401 } 2402 2403 start = page_offset(page); 2404 end = start + bvec->bv_offset + bvec->bv_len - 1; 2405 2406 if (end_extent_writepage(page, err, start, end)) 2407 continue; 2408 2409 end_page_writeback(page); 2410 } 2411 2412 bio_put(bio); 2413 } 2414 2415 static void 2416 endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len, 2417 int uptodate) 2418 { 2419 struct extent_state *cached = NULL; 2420 u64 end = start + len - 1; 2421 2422 if (uptodate && tree->track_uptodate) 2423 set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC); 2424 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); 2425 } 2426 2427 /* 2428 * after a readpage IO is done, we need to: 2429 * clear the uptodate bits on error 2430 * set the uptodate bits if things worked 2431 * set the page up to date if all extents in the tree are uptodate 2432 * clear the lock bit in the extent tree 2433 * unlock the page if there are no other extents locked for it 2434 * 2435 * Scheduling is not allowed, so the extent state tree is expected 2436 * to have one and only one object corresponding to this IO. 2437 */ 2438 static void end_bio_extent_readpage(struct bio *bio, int err) 2439 { 2440 struct bio_vec *bvec; 2441 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 2442 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 2443 struct extent_io_tree *tree; 2444 u64 offset = 0; 2445 u64 start; 2446 u64 end; 2447 u64 len; 2448 u64 extent_start = 0; 2449 u64 extent_len = 0; 2450 int mirror; 2451 int ret; 2452 int i; 2453 2454 if (err) 2455 uptodate = 0; 2456 2457 bio_for_each_segment_all(bvec, bio, i) { 2458 struct page *page = bvec->bv_page; 2459 struct inode *inode = page->mapping->host; 2460 2461 pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " 2462 "mirror=%lu\n", (u64)bio->bi_iter.bi_sector, err, 2463 io_bio->mirror_num); 2464 tree = &BTRFS_I(inode)->io_tree; 2465 2466 /* We always issue full-page reads, but if some block 2467 * in a page fails to read, blk_update_request() will 2468 * advance bv_offset and adjust bv_len to compensate. 2469 * Print a warning for nonzero offsets, and an error 2470 * if they don't add up to a full page. 
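* The start/end computed below from page_offset() and the bvec are the exact byte range handed to readpage_end_io_hook and, on failure, to bio_readpage_error().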
*/ 2471 if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) { 2472 if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE) 2473 btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info, 2474 "partial page read in btrfs with offset %u and length %u", 2475 bvec->bv_offset, bvec->bv_len); 2476 else 2477 btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info, 2478 "incomplete page read in btrfs with offset %u and " 2479 "length %u", 2480 bvec->bv_offset, bvec->bv_len); 2481 } 2482 2483 start = page_offset(page); 2484 end = start + bvec->bv_offset + bvec->bv_len - 1; 2485 len = bvec->bv_len; 2486 2487 mirror = io_bio->mirror_num; 2488 if (likely(uptodate && tree->ops && 2489 tree->ops->readpage_end_io_hook)) { 2490 ret = tree->ops->readpage_end_io_hook(io_bio, offset, 2491 page, start, end, 2492 mirror); 2493 if (ret) 2494 uptodate = 0; 2495 else 2496 clean_io_failure(start, page); 2497 } 2498 2499 if (likely(uptodate)) 2500 goto readpage_ok; 2501 2502 if (tree->ops && tree->ops->readpage_io_failed_hook) { 2503 ret = tree->ops->readpage_io_failed_hook(page, mirror); 2504 if (!ret && !err && 2505 test_bit(BIO_UPTODATE, &bio->bi_flags)) 2506 uptodate = 1; 2507 } else { 2508 /* 2509 * The generic bio_readpage_error handles errors the 2510 * following way: If possible, new read requests are 2511 * created and submitted and will end up in 2512 * end_bio_extent_readpage as well (if we're lucky, not 2513 * in the !uptodate case). In that case it returns 0 and 2514 * we just go on with the next page in our bio. If it 2515 * can't handle the error it will return -EIO and we 2516 * remain responsible for that page. 2517 */ 2518 ret = bio_readpage_error(bio, offset, page, start, end, 2519 mirror); 2520 if (ret == 0) { 2521 uptodate = 2522 test_bit(BIO_UPTODATE, &bio->bi_flags); 2523 if (err) 2524 uptodate = 0; 2525 continue; 2526 } 2527 } 2528 readpage_ok: 2529 if (likely(uptodate)) { 2530 loff_t i_size = i_size_read(inode); 2531 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; 2532 unsigned offset; 2533 2534 /* Zero out the end if this page straddles i_size */ 2535 offset = i_size & (PAGE_CACHE_SIZE-1); 2536 if (page->index == end_index && offset) 2537 zero_user_segment(page, offset, PAGE_CACHE_SIZE); 2538 SetPageUptodate(page); 2539 } else { 2540 ClearPageUptodate(page); 2541 SetPageError(page); 2542 } 2543 unlock_page(page); 2544 offset += len; 2545 2546 if (unlikely(!uptodate)) { 2547 if (extent_len) { 2548 endio_readpage_release_extent(tree, 2549 extent_start, 2550 extent_len, 1); 2551 extent_start = 0; 2552 extent_len = 0; 2553 } 2554 endio_readpage_release_extent(tree, start, 2555 end - start + 1, 0); 2556 } else if (!extent_len) { 2557 extent_start = start; 2558 extent_len = end + 1 - start; 2559 } else if (extent_start + extent_len == start) { 2560 extent_len += end + 1 - start; 2561 } else { 2562 endio_readpage_release_extent(tree, extent_start, 2563 extent_len, uptodate); 2564 extent_start = start; 2565 extent_len = end + 1 - start; 2566 } 2567 } 2568 2569 if (extent_len) 2570 endio_readpage_release_extent(tree, extent_start, extent_len, 2571 uptodate); 2572 if (io_bio->end_io) 2573 io_bio->end_io(io_bio, err); 2574 bio_put(bio); 2575 } 2576 2577 /* 2578 * this allocates from the btrfs_bioset. 
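* The btrfs_io_bio container carries the csum, csum_allocated, end_io and mirror_num fields used by the read/write completion paths.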
We're returning a bio right now 2579 * but you can call btrfs_io_bio for the appropriate container_of magic 2580 */ 2581 struct bio * 2582 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 2583 gfp_t gfp_flags) 2584 { 2585 struct btrfs_io_bio *btrfs_bio; 2586 struct bio *bio; 2587 2588 bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset); 2589 2590 if (bio == NULL && (current->flags & PF_MEMALLOC)) { 2591 while (!bio && (nr_vecs /= 2)) { 2592 bio = bio_alloc_bioset(gfp_flags, 2593 nr_vecs, btrfs_bioset); 2594 } 2595 } 2596 2597 if (bio) { 2598 bio->bi_bdev = bdev; 2599 bio->bi_iter.bi_sector = first_sector; 2600 btrfs_bio = btrfs_io_bio(bio); 2601 btrfs_bio->csum = NULL; 2602 btrfs_bio->csum_allocated = NULL; 2603 btrfs_bio->end_io = NULL; 2604 } 2605 return bio; 2606 } 2607 2608 struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) 2609 { 2610 return bio_clone_bioset(bio, gfp_mask, btrfs_bioset); 2611 } 2612 2613 2614 /* this also allocates from the btrfs_bioset */ 2615 struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) 2616 { 2617 struct btrfs_io_bio *btrfs_bio; 2618 struct bio *bio; 2619 2620 bio = bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset); 2621 if (bio) { 2622 btrfs_bio = btrfs_io_bio(bio); 2623 btrfs_bio->csum = NULL; 2624 btrfs_bio->csum_allocated = NULL; 2625 btrfs_bio->end_io = NULL; 2626 } 2627 return bio; 2628 } 2629 2630 2631 static int __must_check submit_one_bio(int rw, struct bio *bio, 2632 int mirror_num, unsigned long bio_flags) 2633 { 2634 int ret = 0; 2635 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 2636 struct page *page = bvec->bv_page; 2637 struct extent_io_tree *tree = bio->bi_private; 2638 u64 start; 2639 2640 start = page_offset(page) + bvec->bv_offset; 2641 2642 bio->bi_private = NULL; 2643 2644 bio_get(bio); 2645 2646 if (tree->ops && tree->ops->submit_bio_hook) 2647 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 2648 mirror_num, bio_flags, start); 2649 else 2650 btrfsic_submit_bio(rw, bio); 2651 2652 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 2653 ret = -EOPNOTSUPP; 2654 bio_put(bio); 2655 return ret; 2656 } 2657 2658 static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page, 2659 unsigned long offset, size_t size, struct bio *bio, 2660 unsigned long bio_flags) 2661 { 2662 int ret = 0; 2663 if (tree->ops && tree->ops->merge_bio_hook) 2664 ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio, 2665 bio_flags); 2666 BUG_ON(ret < 0); 2667 return ret; 2668 2669 } 2670 2671 static int submit_extent_page(int rw, struct extent_io_tree *tree, 2672 struct page *page, sector_t sector, 2673 size_t size, unsigned long offset, 2674 struct block_device *bdev, 2675 struct bio **bio_ret, 2676 unsigned long max_pages, 2677 bio_end_io_t end_io_func, 2678 int mirror_num, 2679 unsigned long prev_bio_flags, 2680 unsigned long bio_flags) 2681 { 2682 int ret = 0; 2683 struct bio *bio; 2684 int nr; 2685 int contig = 0; 2686 int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED; 2687 int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; 2688 size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE); 2689 2690 if (bio_ret && *bio_ret) { 2691 bio = *bio_ret; 2692 if (old_compressed) 2693 contig = bio->bi_iter.bi_sector == sector; 2694 else 2695 contig = bio_end_sector(bio) == sector; 2696 2697 if (prev_bio_flags != bio_flags || !contig || 2698 merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) || 2699 bio_add_page(bio, page, page_size, offset) < page_size) { 
2700 ret = submit_one_bio(rw, bio, mirror_num, 2701 prev_bio_flags); 2702 if (ret < 0) 2703 return ret; 2704 bio = NULL; 2705 } else { 2706 return 0; 2707 } 2708 } 2709 if (this_compressed) 2710 nr = BIO_MAX_PAGES; 2711 else 2712 nr = bio_get_nr_vecs(bdev); 2713 2714 bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); 2715 if (!bio) 2716 return -ENOMEM; 2717 2718 bio_add_page(bio, page, page_size, offset); 2719 bio->bi_end_io = end_io_func; 2720 bio->bi_private = tree; 2721 2722 if (bio_ret) 2723 *bio_ret = bio; 2724 else 2725 ret = submit_one_bio(rw, bio, mirror_num, bio_flags); 2726 2727 return ret; 2728 } 2729 2730 static void attach_extent_buffer_page(struct extent_buffer *eb, 2731 struct page *page) 2732 { 2733 if (!PagePrivate(page)) { 2734 SetPagePrivate(page); 2735 page_cache_get(page); 2736 set_page_private(page, (unsigned long)eb); 2737 } else { 2738 WARN_ON(page->private != (unsigned long)eb); 2739 } 2740 } 2741 2742 void set_page_extent_mapped(struct page *page) 2743 { 2744 if (!PagePrivate(page)) { 2745 SetPagePrivate(page); 2746 page_cache_get(page); 2747 set_page_private(page, EXTENT_PAGE_PRIVATE); 2748 } 2749 } 2750 2751 static struct extent_map * 2752 __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, 2753 u64 start, u64 len, get_extent_t *get_extent, 2754 struct extent_map **em_cached) 2755 { 2756 struct extent_map *em; 2757 2758 if (em_cached && *em_cached) { 2759 em = *em_cached; 2760 if (em->in_tree && start >= em->start && 2761 start < extent_map_end(em)) { 2762 atomic_inc(&em->refs); 2763 return em; 2764 } 2765 2766 free_extent_map(em); 2767 *em_cached = NULL; 2768 } 2769 2770 em = get_extent(inode, page, pg_offset, start, len, 0); 2771 if (em_cached && !IS_ERR_OR_NULL(em)) { 2772 BUG_ON(*em_cached); 2773 atomic_inc(&em->refs); 2774 *em_cached = em; 2775 } 2776 return em; 2777 } 2778 /* 2779 * basic readpage implementation. 
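* When a batch of contiguous pages is read (see __do_contiguous_readpages) an em_cached slot is passed in so a single extent_map lookup can be reused for every page that falls inside it.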
Locked extent state structs are inserted 2780 * into the tree that are removed when the IO is done (by the end_io 2781 * handlers) 2782 * XXX JDM: This needs looking at to ensure proper page locking 2783 */ 2784 static int __do_readpage(struct extent_io_tree *tree, 2785 struct page *page, 2786 get_extent_t *get_extent, 2787 struct extent_map **em_cached, 2788 struct bio **bio, int mirror_num, 2789 unsigned long *bio_flags, int rw) 2790 { 2791 struct inode *inode = page->mapping->host; 2792 u64 start = page_offset(page); 2793 u64 page_end = start + PAGE_CACHE_SIZE - 1; 2794 u64 end; 2795 u64 cur = start; 2796 u64 extent_offset; 2797 u64 last_byte = i_size_read(inode); 2798 u64 block_start; 2799 u64 cur_end; 2800 sector_t sector; 2801 struct extent_map *em; 2802 struct block_device *bdev; 2803 int ret; 2804 int nr = 0; 2805 int parent_locked = *bio_flags & EXTENT_BIO_PARENT_LOCKED; 2806 size_t pg_offset = 0; 2807 size_t iosize; 2808 size_t disk_io_size; 2809 size_t blocksize = inode->i_sb->s_blocksize; 2810 unsigned long this_bio_flag = *bio_flags & EXTENT_BIO_PARENT_LOCKED; 2811 2812 set_page_extent_mapped(page); 2813 2814 end = page_end; 2815 if (!PageUptodate(page)) { 2816 if (cleancache_get_page(page) == 0) { 2817 BUG_ON(blocksize != PAGE_SIZE); 2818 unlock_extent(tree, start, end); 2819 goto out; 2820 } 2821 } 2822 2823 if (page->index == last_byte >> PAGE_CACHE_SHIFT) { 2824 char *userpage; 2825 size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1); 2826 2827 if (zero_offset) { 2828 iosize = PAGE_CACHE_SIZE - zero_offset; 2829 userpage = kmap_atomic(page); 2830 memset(userpage + zero_offset, 0, iosize); 2831 flush_dcache_page(page); 2832 kunmap_atomic(userpage); 2833 } 2834 } 2835 while (cur <= end) { 2836 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; 2837 2838 if (cur >= last_byte) { 2839 char *userpage; 2840 struct extent_state *cached = NULL; 2841 2842 iosize = PAGE_CACHE_SIZE - pg_offset; 2843 userpage = kmap_atomic(page); 2844 memset(userpage + pg_offset, 0, iosize); 2845 flush_dcache_page(page); 2846 kunmap_atomic(userpage); 2847 set_extent_uptodate(tree, cur, cur + iosize - 1, 2848 &cached, GFP_NOFS); 2849 if (!parent_locked) 2850 unlock_extent_cached(tree, cur, 2851 cur + iosize - 1, 2852 &cached, GFP_NOFS); 2853 break; 2854 } 2855 em = __get_extent_map(inode, page, pg_offset, cur, 2856 end - cur + 1, get_extent, em_cached); 2857 if (IS_ERR_OR_NULL(em)) { 2858 SetPageError(page); 2859 if (!parent_locked) 2860 unlock_extent(tree, cur, end); 2861 break; 2862 } 2863 extent_offset = cur - em->start; 2864 BUG_ON(extent_map_end(em) <= cur); 2865 BUG_ON(end < cur); 2866 2867 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 2868 this_bio_flag |= EXTENT_BIO_COMPRESSED; 2869 extent_set_compress_type(&this_bio_flag, 2870 em->compress_type); 2871 } 2872 2873 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2874 cur_end = min(extent_map_end(em) - 1, end); 2875 iosize = ALIGN(iosize, blocksize); 2876 if (this_bio_flag & EXTENT_BIO_COMPRESSED) { 2877 disk_io_size = em->block_len; 2878 sector = em->block_start >> 9; 2879 } else { 2880 sector = (em->block_start + extent_offset) >> 9; 2881 disk_io_size = iosize; 2882 } 2883 bdev = em->bdev; 2884 block_start = em->block_start; 2885 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 2886 block_start = EXTENT_MAP_HOLE; 2887 free_extent_map(em); 2888 em = NULL; 2889 2890 /* we've found a hole, just zero and go on */ 2891 if (block_start == EXTENT_MAP_HOLE) { 2892 char *userpage; 2893 struct extent_state *cached = NULL; 2894 2895 
userpage = kmap_atomic(page); 2896 memset(userpage + pg_offset, 0, iosize); 2897 flush_dcache_page(page); 2898 kunmap_atomic(userpage); 2899 2900 set_extent_uptodate(tree, cur, cur + iosize - 1, 2901 &cached, GFP_NOFS); 2902 unlock_extent_cached(tree, cur, cur + iosize - 1, 2903 &cached, GFP_NOFS); 2904 cur = cur + iosize; 2905 pg_offset += iosize; 2906 continue; 2907 } 2908 /* the get_extent function already copied into the page */ 2909 if (test_range_bit(tree, cur, cur_end, 2910 EXTENT_UPTODATE, 1, NULL)) { 2911 check_page_uptodate(tree, page); 2912 if (!parent_locked) 2913 unlock_extent(tree, cur, cur + iosize - 1); 2914 cur = cur + iosize; 2915 pg_offset += iosize; 2916 continue; 2917 } 2918 /* we have an inline extent but it didn't get marked up 2919 * to date. Error out 2920 */ 2921 if (block_start == EXTENT_MAP_INLINE) { 2922 SetPageError(page); 2923 if (!parent_locked) 2924 unlock_extent(tree, cur, cur + iosize - 1); 2925 cur = cur + iosize; 2926 pg_offset += iosize; 2927 continue; 2928 } 2929 2930 pnr -= page->index; 2931 ret = submit_extent_page(rw, tree, page, 2932 sector, disk_io_size, pg_offset, 2933 bdev, bio, pnr, 2934 end_bio_extent_readpage, mirror_num, 2935 *bio_flags, 2936 this_bio_flag); 2937 if (!ret) { 2938 nr++; 2939 *bio_flags = this_bio_flag; 2940 } else { 2941 SetPageError(page); 2942 if (!parent_locked) 2943 unlock_extent(tree, cur, cur + iosize - 1); 2944 } 2945 cur = cur + iosize; 2946 pg_offset += iosize; 2947 } 2948 out: 2949 if (!nr) { 2950 if (!PageError(page)) 2951 SetPageUptodate(page); 2952 unlock_page(page); 2953 } 2954 return 0; 2955 } 2956 2957 static inline void __do_contiguous_readpages(struct extent_io_tree *tree, 2958 struct page *pages[], int nr_pages, 2959 u64 start, u64 end, 2960 get_extent_t *get_extent, 2961 struct extent_map **em_cached, 2962 struct bio **bio, int mirror_num, 2963 unsigned long *bio_flags, int rw) 2964 { 2965 struct inode *inode; 2966 struct btrfs_ordered_extent *ordered; 2967 int index; 2968 2969 inode = pages[0]->mapping->host; 2970 while (1) { 2971 lock_extent(tree, start, end); 2972 ordered = btrfs_lookup_ordered_range(inode, start, 2973 end - start + 1); 2974 if (!ordered) 2975 break; 2976 unlock_extent(tree, start, end); 2977 btrfs_start_ordered_extent(inode, ordered, 1); 2978 btrfs_put_ordered_extent(ordered); 2979 } 2980 2981 for (index = 0; index < nr_pages; index++) { 2982 __do_readpage(tree, pages[index], get_extent, em_cached, bio, 2983 mirror_num, bio_flags, rw); 2984 page_cache_release(pages[index]); 2985 } 2986 } 2987 2988 static void __extent_readpages(struct extent_io_tree *tree, 2989 struct page *pages[], 2990 int nr_pages, get_extent_t *get_extent, 2991 struct extent_map **em_cached, 2992 struct bio **bio, int mirror_num, 2993 unsigned long *bio_flags, int rw) 2994 { 2995 u64 start = 0; 2996 u64 end = 0; 2997 u64 page_start; 2998 int index; 2999 int first_index = 0; 3000 3001 for (index = 0; index < nr_pages; index++) { 3002 page_start = page_offset(pages[index]); 3003 if (!end) { 3004 start = page_start; 3005 end = start + PAGE_CACHE_SIZE - 1; 3006 first_index = index; 3007 } else if (end + 1 == page_start) { 3008 end += PAGE_CACHE_SIZE; 3009 } else { 3010 __do_contiguous_readpages(tree, &pages[first_index], 3011 index - first_index, start, 3012 end, get_extent, em_cached, 3013 bio, mirror_num, bio_flags, 3014 rw); 3015 start = page_start; 3016 end = start + PAGE_CACHE_SIZE - 1; 3017 first_index = index; 3018 } 3019 } 3020 3021 if (end) 3022 __do_contiguous_readpages(tree, &pages[first_index], 3023 index 
- first_index, start, 3024 end, get_extent, em_cached, bio, 3025 mirror_num, bio_flags, rw); 3026 } 3027 3028 static int __extent_read_full_page(struct extent_io_tree *tree, 3029 struct page *page, 3030 get_extent_t *get_extent, 3031 struct bio **bio, int mirror_num, 3032 unsigned long *bio_flags, int rw) 3033 { 3034 struct inode *inode = page->mapping->host; 3035 struct btrfs_ordered_extent *ordered; 3036 u64 start = page_offset(page); 3037 u64 end = start + PAGE_CACHE_SIZE - 1; 3038 int ret; 3039 3040 while (1) { 3041 lock_extent(tree, start, end); 3042 ordered = btrfs_lookup_ordered_extent(inode, start); 3043 if (!ordered) 3044 break; 3045 unlock_extent(tree, start, end); 3046 btrfs_start_ordered_extent(inode, ordered, 1); 3047 btrfs_put_ordered_extent(ordered); 3048 } 3049 3050 ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, 3051 bio_flags, rw); 3052 return ret; 3053 } 3054 3055 int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 3056 get_extent_t *get_extent, int mirror_num) 3057 { 3058 struct bio *bio = NULL; 3059 unsigned long bio_flags = 0; 3060 int ret; 3061 3062 ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, 3063 &bio_flags, READ); 3064 if (bio) 3065 ret = submit_one_bio(READ, bio, mirror_num, bio_flags); 3066 return ret; 3067 } 3068 3069 int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page, 3070 get_extent_t *get_extent, int mirror_num) 3071 { 3072 struct bio *bio = NULL; 3073 unsigned long bio_flags = EXTENT_BIO_PARENT_LOCKED; 3074 int ret; 3075 3076 ret = __do_readpage(tree, page, get_extent, NULL, &bio, mirror_num, 3077 &bio_flags, READ); 3078 if (bio) 3079 ret = submit_one_bio(READ, bio, mirror_num, bio_flags); 3080 return ret; 3081 } 3082 3083 static noinline void update_nr_written(struct page *page, 3084 struct writeback_control *wbc, 3085 unsigned long nr_written) 3086 { 3087 wbc->nr_to_write -= nr_written; 3088 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && 3089 wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) 3090 page->mapping->writeback_index = page->index + nr_written; 3091 } 3092 3093 /* 3094 * the writepage semantics are similar to regular writepage. extent 3095 * records are inserted to lock ranges in the tree, and as dirty areas 3096 * are found, they are marked writeback. 
Then the lock bits are removed 3097 * and the end_io handler clears the writeback ranges 3098 */ 3099 static int __extent_writepage(struct page *page, struct writeback_control *wbc, 3100 void *data) 3101 { 3102 struct inode *inode = page->mapping->host; 3103 struct extent_page_data *epd = data; 3104 struct extent_io_tree *tree = epd->tree; 3105 u64 start = page_offset(page); 3106 u64 delalloc_start; 3107 u64 page_end = start + PAGE_CACHE_SIZE - 1; 3108 u64 end; 3109 u64 cur = start; 3110 u64 extent_offset; 3111 u64 last_byte = i_size_read(inode); 3112 u64 block_start; 3113 u64 iosize; 3114 sector_t sector; 3115 struct extent_state *cached_state = NULL; 3116 struct extent_map *em; 3117 struct block_device *bdev; 3118 int ret; 3119 int nr = 0; 3120 size_t pg_offset = 0; 3121 size_t blocksize; 3122 loff_t i_size = i_size_read(inode); 3123 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; 3124 u64 nr_delalloc; 3125 u64 delalloc_end; 3126 int page_started; 3127 int compressed; 3128 int write_flags; 3129 unsigned long nr_written = 0; 3130 bool fill_delalloc = true; 3131 3132 if (wbc->sync_mode == WB_SYNC_ALL) 3133 write_flags = WRITE_SYNC; 3134 else 3135 write_flags = WRITE; 3136 3137 trace___extent_writepage(page, inode, wbc); 3138 3139 WARN_ON(!PageLocked(page)); 3140 3141 ClearPageError(page); 3142 3143 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 3144 if (page->index > end_index || 3145 (page->index == end_index && !pg_offset)) { 3146 page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); 3147 unlock_page(page); 3148 return 0; 3149 } 3150 3151 if (page->index == end_index) { 3152 char *userpage; 3153 3154 userpage = kmap_atomic(page); 3155 memset(userpage + pg_offset, 0, 3156 PAGE_CACHE_SIZE - pg_offset); 3157 kunmap_atomic(userpage); 3158 flush_dcache_page(page); 3159 } 3160 pg_offset = 0; 3161 3162 set_page_extent_mapped(page); 3163 3164 if (!tree->ops || !tree->ops->fill_delalloc) 3165 fill_delalloc = false; 3166 3167 delalloc_start = start; 3168 delalloc_end = 0; 3169 page_started = 0; 3170 if (!epd->extent_locked && fill_delalloc) { 3171 u64 delalloc_to_write = 0; 3172 /* 3173 * make sure the wbc mapping index is at least updated 3174 * to this page. 3175 */ 3176 update_nr_written(page, wbc, 0); 3177 3178 while (delalloc_end < page_end) { 3179 nr_delalloc = find_lock_delalloc_range(inode, tree, 3180 page, 3181 &delalloc_start, 3182 &delalloc_end, 3183 128 * 1024 * 1024); 3184 if (nr_delalloc == 0) { 3185 delalloc_start = delalloc_end + 1; 3186 continue; 3187 } 3188 ret = tree->ops->fill_delalloc(inode, page, 3189 delalloc_start, 3190 delalloc_end, 3191 &page_started, 3192 &nr_written); 3193 /* File system has been set read-only */ 3194 if (ret) { 3195 SetPageError(page); 3196 goto done; 3197 } 3198 /* 3199 * delalloc_end is already one less than the total 3200 * length, so we don't subtract one from 3201 * PAGE_CACHE_SIZE 3202 */ 3203 delalloc_to_write += (delalloc_end - delalloc_start + 3204 PAGE_CACHE_SIZE) >> 3205 PAGE_CACHE_SHIFT; 3206 delalloc_start = delalloc_end + 1; 3207 } 3208 if (wbc->nr_to_write < delalloc_to_write) { 3209 int thresh = 8192; 3210 3211 if (delalloc_to_write < thresh * 2) 3212 thresh = delalloc_to_write; 3213 wbc->nr_to_write = min_t(u64, delalloc_to_write, 3214 thresh); 3215 } 3216 3217 /* did the fill delalloc function already unlock and start 3218 * the IO? 3219 */ 3220 if (page_started) { 3221 ret = 0; 3222 /* 3223 * we've unlocked the page, so we can't update 3224 * the mapping's writeback index, just update 3225 * nr_to_write. 
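* (i.e. skip update_nr_written() and adjust wbc->nr_to_write by hand right below)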
3226 */ 3227 wbc->nr_to_write -= nr_written; 3228 goto done_unlocked; 3229 } 3230 } 3231 if (tree->ops && tree->ops->writepage_start_hook) { 3232 ret = tree->ops->writepage_start_hook(page, start, 3233 page_end); 3234 if (ret) { 3235 /* Fixup worker will requeue */ 3236 if (ret == -EBUSY) 3237 wbc->pages_skipped++; 3238 else 3239 redirty_page_for_writepage(wbc, page); 3240 update_nr_written(page, wbc, nr_written); 3241 unlock_page(page); 3242 ret = 0; 3243 goto done_unlocked; 3244 } 3245 } 3246 3247 /* 3248 * we don't want to touch the inode after unlocking the page, 3249 * so we update the mapping writeback index now 3250 */ 3251 update_nr_written(page, wbc, nr_written + 1); 3252 3253 end = page_end; 3254 if (last_byte <= start) { 3255 if (tree->ops && tree->ops->writepage_end_io_hook) 3256 tree->ops->writepage_end_io_hook(page, start, 3257 page_end, NULL, 1); 3258 goto done; 3259 } 3260 3261 blocksize = inode->i_sb->s_blocksize; 3262 3263 while (cur <= end) { 3264 if (cur >= last_byte) { 3265 if (tree->ops && tree->ops->writepage_end_io_hook) 3266 tree->ops->writepage_end_io_hook(page, cur, 3267 page_end, NULL, 1); 3268 break; 3269 } 3270 em = epd->get_extent(inode, page, pg_offset, cur, 3271 end - cur + 1, 1); 3272 if (IS_ERR_OR_NULL(em)) { 3273 SetPageError(page); 3274 break; 3275 } 3276 3277 extent_offset = cur - em->start; 3278 BUG_ON(extent_map_end(em) <= cur); 3279 BUG_ON(end < cur); 3280 iosize = min(extent_map_end(em) - cur, end - cur + 1); 3281 iosize = ALIGN(iosize, blocksize); 3282 sector = (em->block_start + extent_offset) >> 9; 3283 bdev = em->bdev; 3284 block_start = em->block_start; 3285 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 3286 free_extent_map(em); 3287 em = NULL; 3288 3289 /* 3290 * compressed and inline extents are written through other 3291 * paths in the FS 3292 */ 3293 if (compressed || block_start == EXTENT_MAP_HOLE || 3294 block_start == EXTENT_MAP_INLINE) { 3295 /* 3296 * end_io notification does not happen here for 3297 * compressed extents 3298 */ 3299 if (!compressed && tree->ops && 3300 tree->ops->writepage_end_io_hook) 3301 tree->ops->writepage_end_io_hook(page, cur, 3302 cur + iosize - 1, 3303 NULL, 1); 3304 else if (compressed) { 3305 /* we don't want to end_page_writeback on 3306 * a compressed extent. 
this happens 3307 * elsewhere 3308 */ 3309 nr++; 3310 } 3311 3312 cur += iosize; 3313 pg_offset += iosize; 3314 continue; 3315 } 3316 /* leave this out until we have a page_mkwrite call */ 3317 if (0 && !test_range_bit(tree, cur, cur + iosize - 1, 3318 EXTENT_DIRTY, 0, NULL)) { 3319 cur = cur + iosize; 3320 pg_offset += iosize; 3321 continue; 3322 } 3323 3324 if (tree->ops && tree->ops->writepage_io_hook) { 3325 ret = tree->ops->writepage_io_hook(page, cur, 3326 cur + iosize - 1); 3327 } else { 3328 ret = 0; 3329 } 3330 if (ret) { 3331 SetPageError(page); 3332 } else { 3333 unsigned long max_nr = end_index + 1; 3334 3335 set_range_writeback(tree, cur, cur + iosize - 1); 3336 if (!PageWriteback(page)) { 3337 btrfs_err(BTRFS_I(inode)->root->fs_info, 3338 "page %lu not writeback, cur %llu end %llu", 3339 page->index, cur, end); 3340 } 3341 3342 ret = submit_extent_page(write_flags, tree, page, 3343 sector, iosize, pg_offset, 3344 bdev, &epd->bio, max_nr, 3345 end_bio_extent_writepage, 3346 0, 0, 0); 3347 if (ret) 3348 SetPageError(page); 3349 } 3350 cur = cur + iosize; 3351 pg_offset += iosize; 3352 nr++; 3353 } 3354 done: 3355 if (nr == 0) { 3356 /* make sure the mapping tag for page dirty gets cleared */ 3357 set_page_writeback(page); 3358 end_page_writeback(page); 3359 } 3360 unlock_page(page); 3361 3362 done_unlocked: 3363 3364 /* drop our reference on any cached states */ 3365 free_extent_state(cached_state); 3366 return 0; 3367 } 3368 3369 static int eb_wait(void *word) 3370 { 3371 io_schedule(); 3372 return 0; 3373 } 3374 3375 void wait_on_extent_buffer_writeback(struct extent_buffer *eb) 3376 { 3377 wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait, 3378 TASK_UNINTERRUPTIBLE); 3379 } 3380 3381 static int lock_extent_buffer_for_io(struct extent_buffer *eb, 3382 struct btrfs_fs_info *fs_info, 3383 struct extent_page_data *epd) 3384 { 3385 unsigned long i, num_pages; 3386 int flush = 0; 3387 int ret = 0; 3388 3389 if (!btrfs_try_tree_write_lock(eb)) { 3390 flush = 1; 3391 flush_write_bio(epd); 3392 btrfs_tree_lock(eb); 3393 } 3394 3395 if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { 3396 btrfs_tree_unlock(eb); 3397 if (!epd->sync_io) 3398 return 0; 3399 if (!flush) { 3400 flush_write_bio(epd); 3401 flush = 1; 3402 } 3403 while (1) { 3404 wait_on_extent_buffer_writeback(eb); 3405 btrfs_tree_lock(eb); 3406 if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) 3407 break; 3408 btrfs_tree_unlock(eb); 3409 } 3410 } 3411 3412 /* 3413 * We need to do this to prevent races in people who check if the eb is 3414 * under IO since we can end up having no IO bits set for a short period 3415 * of time. 
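* Doing the DIRTY -> WRITEBACK handoff under eb->refs_lock keeps the transition atomic for anyone who tests extent_buffer_under_io() while holding that lock.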
3416 */ 3417 spin_lock(&eb->refs_lock); 3418 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3419 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3420 spin_unlock(&eb->refs_lock); 3421 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 3422 __percpu_counter_add(&fs_info->dirty_metadata_bytes, 3423 -eb->len, 3424 fs_info->dirty_metadata_batch); 3425 ret = 1; 3426 } else { 3427 spin_unlock(&eb->refs_lock); 3428 } 3429 3430 btrfs_tree_unlock(eb); 3431 3432 if (!ret) 3433 return ret; 3434 3435 num_pages = num_extent_pages(eb->start, eb->len); 3436 for (i = 0; i < num_pages; i++) { 3437 struct page *p = extent_buffer_page(eb, i); 3438 3439 if (!trylock_page(p)) { 3440 if (!flush) { 3441 flush_write_bio(epd); 3442 flush = 1; 3443 } 3444 lock_page(p); 3445 } 3446 } 3447 3448 return ret; 3449 } 3450 3451 static void end_extent_buffer_writeback(struct extent_buffer *eb) 3452 { 3453 clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3454 smp_mb__after_clear_bit(); 3455 wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); 3456 } 3457 3458 static void end_bio_extent_buffer_writepage(struct bio *bio, int err) 3459 { 3460 struct bio_vec *bvec; 3461 struct extent_buffer *eb; 3462 int i, done; 3463 3464 bio_for_each_segment_all(bvec, bio, i) { 3465 struct page *page = bvec->bv_page; 3466 3467 eb = (struct extent_buffer *)page->private; 3468 BUG_ON(!eb); 3469 done = atomic_dec_and_test(&eb->io_pages); 3470 3471 if (err || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { 3472 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3473 ClearPageUptodate(page); 3474 SetPageError(page); 3475 } 3476 3477 end_page_writeback(page); 3478 3479 if (!done) 3480 continue; 3481 3482 end_extent_buffer_writeback(eb); 3483 } 3484 3485 bio_put(bio); 3486 } 3487 3488 static int write_one_eb(struct extent_buffer *eb, 3489 struct btrfs_fs_info *fs_info, 3490 struct writeback_control *wbc, 3491 struct extent_page_data *epd) 3492 { 3493 struct block_device *bdev = fs_info->fs_devices->latest_bdev; 3494 struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree; 3495 u64 offset = eb->start; 3496 unsigned long i, num_pages; 3497 unsigned long bio_flags = 0; 3498 int rw = (epd->sync_io ? 
WRITE_SYNC : WRITE) | REQ_META; 3499 int ret = 0; 3500 3501 clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3502 num_pages = num_extent_pages(eb->start, eb->len); 3503 atomic_set(&eb->io_pages, num_pages); 3504 if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID) 3505 bio_flags = EXTENT_BIO_TREE_LOG; 3506 3507 for (i = 0; i < num_pages; i++) { 3508 struct page *p = extent_buffer_page(eb, i); 3509 3510 clear_page_dirty_for_io(p); 3511 set_page_writeback(p); 3512 ret = submit_extent_page(rw, tree, p, offset >> 9, 3513 PAGE_CACHE_SIZE, 0, bdev, &epd->bio, 3514 -1, end_bio_extent_buffer_writepage, 3515 0, epd->bio_flags, bio_flags); 3516 epd->bio_flags = bio_flags; 3517 if (ret) { 3518 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3519 SetPageError(p); 3520 if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) 3521 end_extent_buffer_writeback(eb); 3522 ret = -EIO; 3523 break; 3524 } 3525 offset += PAGE_CACHE_SIZE; 3526 update_nr_written(p, wbc, 1); 3527 unlock_page(p); 3528 } 3529 3530 if (unlikely(ret)) { 3531 for (; i < num_pages; i++) { 3532 struct page *p = extent_buffer_page(eb, i); 3533 unlock_page(p); 3534 } 3535 } 3536 3537 return ret; 3538 } 3539 3540 int btree_write_cache_pages(struct address_space *mapping, 3541 struct writeback_control *wbc) 3542 { 3543 struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; 3544 struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; 3545 struct extent_buffer *eb, *prev_eb = NULL; 3546 struct extent_page_data epd = { 3547 .bio = NULL, 3548 .tree = tree, 3549 .extent_locked = 0, 3550 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3551 .bio_flags = 0, 3552 }; 3553 int ret = 0; 3554 int done = 0; 3555 int nr_to_write_done = 0; 3556 struct pagevec pvec; 3557 int nr_pages; 3558 pgoff_t index; 3559 pgoff_t end; /* Inclusive */ 3560 int scanned = 0; 3561 int tag; 3562 3563 pagevec_init(&pvec, 0); 3564 if (wbc->range_cyclic) { 3565 index = mapping->writeback_index; /* Start from prev offset */ 3566 end = -1; 3567 } else { 3568 index = wbc->range_start >> PAGE_CACHE_SHIFT; 3569 end = wbc->range_end >> PAGE_CACHE_SHIFT; 3570 scanned = 1; 3571 } 3572 if (wbc->sync_mode == WB_SYNC_ALL) 3573 tag = PAGECACHE_TAG_TOWRITE; 3574 else 3575 tag = PAGECACHE_TAG_DIRTY; 3576 retry: 3577 if (wbc->sync_mode == WB_SYNC_ALL) 3578 tag_pages_for_writeback(mapping, index, end); 3579 while (!done && !nr_to_write_done && (index <= end) && 3580 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 3581 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 3582 unsigned i; 3583 3584 scanned = 1; 3585 for (i = 0; i < nr_pages; i++) { 3586 struct page *page = pvec.pages[i]; 3587 3588 if (!PagePrivate(page)) 3589 continue; 3590 3591 if (!wbc->range_cyclic && page->index > end) { 3592 done = 1; 3593 break; 3594 } 3595 3596 spin_lock(&mapping->private_lock); 3597 if (!PagePrivate(page)) { 3598 spin_unlock(&mapping->private_lock); 3599 continue; 3600 } 3601 3602 eb = (struct extent_buffer *)page->private; 3603 3604 /* 3605 * Shouldn't happen and normally this would be a BUG_ON 3606 * but no sense in crashing the users box for something 3607 * we can survive anyway. 
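* A WARN_ON plus skipping the page is enough; the private_lock is dropped before moving on.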
3608 */ 3609 if (WARN_ON(!eb)) { 3610 spin_unlock(&mapping->private_lock); 3611 continue; 3612 } 3613 3614 if (eb == prev_eb) { 3615 spin_unlock(&mapping->private_lock); 3616 continue; 3617 } 3618 3619 ret = atomic_inc_not_zero(&eb->refs); 3620 spin_unlock(&mapping->private_lock); 3621 if (!ret) 3622 continue; 3623 3624 prev_eb = eb; 3625 ret = lock_extent_buffer_for_io(eb, fs_info, &epd); 3626 if (!ret) { 3627 free_extent_buffer(eb); 3628 continue; 3629 } 3630 3631 ret = write_one_eb(eb, fs_info, wbc, &epd); 3632 if (ret) { 3633 done = 1; 3634 free_extent_buffer(eb); 3635 break; 3636 } 3637 free_extent_buffer(eb); 3638 3639 /* 3640 * the filesystem may choose to bump up nr_to_write. 3641 * We have to make sure to honor the new nr_to_write 3642 * at any time 3643 */ 3644 nr_to_write_done = wbc->nr_to_write <= 0; 3645 } 3646 pagevec_release(&pvec); 3647 cond_resched(); 3648 } 3649 if (!scanned && !done) { 3650 /* 3651 * We hit the last page and there is more work to be done: wrap 3652 * back to the start of the file 3653 */ 3654 scanned = 1; 3655 index = 0; 3656 goto retry; 3657 } 3658 flush_write_bio(&epd); 3659 return ret; 3660 } 3661 3662 /** 3663 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. 3664 * @mapping: address space structure to write 3665 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 3666 * @writepage: function called for each page 3667 * @data: data passed to writepage function 3668 * 3669 * If a page is already under I/O, write_cache_pages() skips it, even 3670 * if it's dirty. This is desirable behaviour for memory-cleaning writeback, 3671 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() 3672 * and msync() need to guarantee that all the data which was dirty at the time 3673 * the call was made get new I/O started against them. If wbc->sync_mode is 3674 * WB_SYNC_ALL then we were called for data integrity and we must wait for 3675 * existing IO to complete. 3676 */ 3677 static int extent_write_cache_pages(struct extent_io_tree *tree, 3678 struct address_space *mapping, 3679 struct writeback_control *wbc, 3680 writepage_t writepage, void *data, 3681 void (*flush_fn)(void *)) 3682 { 3683 struct inode *inode = mapping->host; 3684 int ret = 0; 3685 int done = 0; 3686 int nr_to_write_done = 0; 3687 struct pagevec pvec; 3688 int nr_pages; 3689 pgoff_t index; 3690 pgoff_t end; /* Inclusive */ 3691 int scanned = 0; 3692 int tag; 3693 3694 /* 3695 * We have to hold onto the inode so that ordered extents can do their 3696 * work when the IO finishes. The alternative to this is failing to add 3697 * an ordered extent if the igrab() fails there and that is a huge pain 3698 * to deal with, so instead just hold onto the inode throughout the 3699 * writepages operation. If it fails here we are freeing up the inode 3700 * anyway and we'd rather not waste our time writing out stuff that is 3701 * going to be truncated anyway. 
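* The reference taken here is paired with the btrfs_add_delayed_iput() at the end of this function.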
3702 */ 3703 if (!igrab(inode)) 3704 return 0; 3705 3706 pagevec_init(&pvec, 0); 3707 if (wbc->range_cyclic) { 3708 index = mapping->writeback_index; /* Start from prev offset */ 3709 end = -1; 3710 } else { 3711 index = wbc->range_start >> PAGE_CACHE_SHIFT; 3712 end = wbc->range_end >> PAGE_CACHE_SHIFT; 3713 scanned = 1; 3714 } 3715 if (wbc->sync_mode == WB_SYNC_ALL) 3716 tag = PAGECACHE_TAG_TOWRITE; 3717 else 3718 tag = PAGECACHE_TAG_DIRTY; 3719 retry: 3720 if (wbc->sync_mode == WB_SYNC_ALL) 3721 tag_pages_for_writeback(mapping, index, end); 3722 while (!done && !nr_to_write_done && (index <= end) && 3723 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 3724 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 3725 unsigned i; 3726 3727 scanned = 1; 3728 for (i = 0; i < nr_pages; i++) { 3729 struct page *page = pvec.pages[i]; 3730 3731 /* 3732 * At this point we hold neither mapping->tree_lock nor 3733 * lock on the page itself: the page may be truncated or 3734 * invalidated (changing page->mapping to NULL), or even 3735 * swizzled back from swapper_space to tmpfs file 3736 * mapping 3737 */ 3738 if (!trylock_page(page)) { 3739 flush_fn(data); 3740 lock_page(page); 3741 } 3742 3743 if (unlikely(page->mapping != mapping)) { 3744 unlock_page(page); 3745 continue; 3746 } 3747 3748 if (!wbc->range_cyclic && page->index > end) { 3749 done = 1; 3750 unlock_page(page); 3751 continue; 3752 } 3753 3754 if (wbc->sync_mode != WB_SYNC_NONE) { 3755 if (PageWriteback(page)) 3756 flush_fn(data); 3757 wait_on_page_writeback(page); 3758 } 3759 3760 if (PageWriteback(page) || 3761 !clear_page_dirty_for_io(page)) { 3762 unlock_page(page); 3763 continue; 3764 } 3765 3766 ret = (*writepage)(page, wbc, data); 3767 3768 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { 3769 unlock_page(page); 3770 ret = 0; 3771 } 3772 if (ret) 3773 done = 1; 3774 3775 /* 3776 * the filesystem may choose to bump up nr_to_write. 
3777 * We have to make sure to honor the new nr_to_write 3778 * at any time 3779 */ 3780 nr_to_write_done = wbc->nr_to_write <= 0; 3781 } 3782 pagevec_release(&pvec); 3783 cond_resched(); 3784 } 3785 if (!scanned && !done) { 3786 /* 3787 * We hit the last page and there is more work to be done: wrap 3788 * back to the start of the file 3789 */ 3790 scanned = 1; 3791 index = 0; 3792 goto retry; 3793 } 3794 btrfs_add_delayed_iput(inode); 3795 return ret; 3796 } 3797 3798 static void flush_epd_write_bio(struct extent_page_data *epd) 3799 { 3800 if (epd->bio) { 3801 int rw = WRITE; 3802 int ret; 3803 3804 if (epd->sync_io) 3805 rw = WRITE_SYNC; 3806 3807 ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags); 3808 BUG_ON(ret < 0); /* -ENOMEM */ 3809 epd->bio = NULL; 3810 } 3811 } 3812 3813 static noinline void flush_write_bio(void *data) 3814 { 3815 struct extent_page_data *epd = data; 3816 flush_epd_write_bio(epd); 3817 } 3818 3819 int extent_write_full_page(struct extent_io_tree *tree, struct page *page, 3820 get_extent_t *get_extent, 3821 struct writeback_control *wbc) 3822 { 3823 int ret; 3824 struct extent_page_data epd = { 3825 .bio = NULL, 3826 .tree = tree, 3827 .get_extent = get_extent, 3828 .extent_locked = 0, 3829 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3830 .bio_flags = 0, 3831 }; 3832 3833 ret = __extent_writepage(page, wbc, &epd); 3834 3835 flush_epd_write_bio(&epd); 3836 return ret; 3837 } 3838 3839 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, 3840 u64 start, u64 end, get_extent_t *get_extent, 3841 int mode) 3842 { 3843 int ret = 0; 3844 struct address_space *mapping = inode->i_mapping; 3845 struct page *page; 3846 unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >> 3847 PAGE_CACHE_SHIFT; 3848 3849 struct extent_page_data epd = { 3850 .bio = NULL, 3851 .tree = tree, 3852 .get_extent = get_extent, 3853 .extent_locked = 1, 3854 .sync_io = mode == WB_SYNC_ALL, 3855 .bio_flags = 0, 3856 }; 3857 struct writeback_control wbc_writepages = { 3858 .sync_mode = mode, 3859 .nr_to_write = nr_pages * 2, 3860 .range_start = start, 3861 .range_end = end + 1, 3862 }; 3863 3864 while (start <= end) { 3865 page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); 3866 if (clear_page_dirty_for_io(page)) 3867 ret = __extent_writepage(page, &wbc_writepages, &epd); 3868 else { 3869 if (tree->ops && tree->ops->writepage_end_io_hook) 3870 tree->ops->writepage_end_io_hook(page, start, 3871 start + PAGE_CACHE_SIZE - 1, 3872 NULL, 1); 3873 unlock_page(page); 3874 } 3875 page_cache_release(page); 3876 start += PAGE_CACHE_SIZE; 3877 } 3878 3879 flush_epd_write_bio(&epd); 3880 return ret; 3881 } 3882 3883 int extent_writepages(struct extent_io_tree *tree, 3884 struct address_space *mapping, 3885 get_extent_t *get_extent, 3886 struct writeback_control *wbc) 3887 { 3888 int ret = 0; 3889 struct extent_page_data epd = { 3890 .bio = NULL, 3891 .tree = tree, 3892 .get_extent = get_extent, 3893 .extent_locked = 0, 3894 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3895 .bio_flags = 0, 3896 }; 3897 3898 ret = extent_write_cache_pages(tree, mapping, wbc, 3899 __extent_writepage, &epd, 3900 flush_write_bio); 3901 flush_epd_write_bio(&epd); 3902 return ret; 3903 } 3904 3905 int extent_readpages(struct extent_io_tree *tree, 3906 struct address_space *mapping, 3907 struct list_head *pages, unsigned nr_pages, 3908 get_extent_t get_extent) 3909 { 3910 struct bio *bio = NULL; 3911 unsigned page_idx; 3912 unsigned long bio_flags = 0; 3913 struct page *pagepool[16]; 3914 struct page 
*page; 3915 struct extent_map *em_cached = NULL; 3916 int nr = 0; 3917 3918 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 3919 page = list_entry(pages->prev, struct page, lru); 3920 3921 prefetchw(&page->flags); 3922 list_del(&page->lru); 3923 if (add_to_page_cache_lru(page, mapping, 3924 page->index, GFP_NOFS)) { 3925 page_cache_release(page); 3926 continue; 3927 } 3928 3929 pagepool[nr++] = page; 3930 if (nr < ARRAY_SIZE(pagepool)) 3931 continue; 3932 __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, 3933 &bio, 0, &bio_flags, READ); 3934 nr = 0; 3935 } 3936 if (nr) 3937 __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, 3938 &bio, 0, &bio_flags, READ); 3939 3940 if (em_cached) 3941 free_extent_map(em_cached); 3942 3943 BUG_ON(!list_empty(pages)); 3944 if (bio) 3945 return submit_one_bio(READ, bio, 0, bio_flags); 3946 return 0; 3947 } 3948 3949 /* 3950 * basic invalidatepage code, this waits on any locked or writeback 3951 * ranges corresponding to the page, and then deletes any extent state 3952 * records from the tree 3953 */ 3954 int extent_invalidatepage(struct extent_io_tree *tree, 3955 struct page *page, unsigned long offset) 3956 { 3957 struct extent_state *cached_state = NULL; 3958 u64 start = page_offset(page); 3959 u64 end = start + PAGE_CACHE_SIZE - 1; 3960 size_t blocksize = page->mapping->host->i_sb->s_blocksize; 3961 3962 start += ALIGN(offset, blocksize); 3963 if (start > end) 3964 return 0; 3965 3966 lock_extent_bits(tree, start, end, 0, &cached_state); 3967 wait_on_page_writeback(page); 3968 clear_extent_bit(tree, start, end, 3969 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 3970 EXTENT_DO_ACCOUNTING, 3971 1, 1, &cached_state, GFP_NOFS); 3972 return 0; 3973 } 3974 3975 /* 3976 * a helper for releasepage, this tests for areas of the page that 3977 * are locked or under IO and drops the related state bits if it is safe 3978 * to drop the page. 3979 */ 3980 static int try_release_extent_state(struct extent_map_tree *map, 3981 struct extent_io_tree *tree, 3982 struct page *page, gfp_t mask) 3983 { 3984 u64 start = page_offset(page); 3985 u64 end = start + PAGE_CACHE_SIZE - 1; 3986 int ret = 1; 3987 3988 if (test_range_bit(tree, start, end, 3989 EXTENT_IOBITS, 0, NULL)) 3990 ret = 0; 3991 else { 3992 if ((mask & GFP_NOFS) == GFP_NOFS) 3993 mask = GFP_NOFS; 3994 /* 3995 * at this point we can safely clear everything except the 3996 * locked bit and the nodatasum bit 3997 */ 3998 ret = clear_extent_bit(tree, start, end, 3999 ~(EXTENT_LOCKED | EXTENT_NODATASUM), 4000 0, 0, NULL, mask); 4001 4002 /* if clear_extent_bit failed for enomem reasons, 4003 * we can't allow the release to continue. 4004 */ 4005 if (ret < 0) 4006 ret = 0; 4007 else 4008 ret = 1; 4009 } 4010 return ret; 4011 } 4012 4013 /* 4014 * a helper for releasepage. 
As long as there are no locked extents 4015 * in the range corresponding to the page, both state records and extent 4016 * map records are removed 4017 */ 4018 int try_release_extent_mapping(struct extent_map_tree *map, 4019 struct extent_io_tree *tree, struct page *page, 4020 gfp_t mask) 4021 { 4022 struct extent_map *em; 4023 u64 start = page_offset(page); 4024 u64 end = start + PAGE_CACHE_SIZE - 1; 4025 4026 if ((mask & __GFP_WAIT) && 4027 page->mapping->host->i_size > 16 * 1024 * 1024) { 4028 u64 len; 4029 while (start <= end) { 4030 len = end - start + 1; 4031 write_lock(&map->lock); 4032 em = lookup_extent_mapping(map, start, len); 4033 if (!em) { 4034 write_unlock(&map->lock); 4035 break; 4036 } 4037 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || 4038 em->start != start) { 4039 write_unlock(&map->lock); 4040 free_extent_map(em); 4041 break; 4042 } 4043 if (!test_range_bit(tree, em->start, 4044 extent_map_end(em) - 1, 4045 EXTENT_LOCKED | EXTENT_WRITEBACK, 4046 0, NULL)) { 4047 remove_extent_mapping(map, em); 4048 /* once for the rb tree */ 4049 free_extent_map(em); 4050 } 4051 start = extent_map_end(em); 4052 write_unlock(&map->lock); 4053 4054 /* once for us */ 4055 free_extent_map(em); 4056 } 4057 } 4058 return try_release_extent_state(map, tree, page, mask); 4059 } 4060 4061 /* 4062 * helper function for fiemap, which doesn't want to see any holes. 4063 * This maps until we find something past 'last' 4064 */ 4065 static struct extent_map *get_extent_skip_holes(struct inode *inode, 4066 u64 offset, 4067 u64 last, 4068 get_extent_t *get_extent) 4069 { 4070 u64 sectorsize = BTRFS_I(inode)->root->sectorsize; 4071 struct extent_map *em; 4072 u64 len; 4073 4074 if (offset >= last) 4075 return NULL; 4076 4077 while (1) { 4078 len = last - offset; 4079 if (len == 0) 4080 break; 4081 len = ALIGN(len, sectorsize); 4082 em = get_extent(inode, NULL, 0, offset, len, 0); 4083 if (IS_ERR_OR_NULL(em)) 4084 return em; 4085 4086 /* if this isn't a hole return it */ 4087 if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) && 4088 em->block_start != EXTENT_MAP_HOLE) { 4089 return em; 4090 } 4091 4092 /* this is a hole, advance to the next extent */ 4093 offset = extent_map_end(em); 4094 free_extent_map(em); 4095 if (offset >= last) 4096 break; 4097 } 4098 return NULL; 4099 } 4100 4101 static noinline int count_ext_ref(u64 inum, u64 offset, u64 root_id, void *ctx) 4102 { 4103 unsigned long cnt = *((unsigned long *)ctx); 4104 4105 cnt++; 4106 *((unsigned long *)ctx) = cnt; 4107 4108 /* Now we're sure that the extent is shared. */ 4109 if (cnt > 1) 4110 return 1; 4111 return 0; 4112 } 4113 4114 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 4115 __u64 start, __u64 len, get_extent_t *get_extent) 4116 { 4117 int ret = 0; 4118 u64 off = start; 4119 u64 max = start + len; 4120 u32 flags = 0; 4121 u32 found_type; 4122 u64 last; 4123 u64 last_for_get_extent = 0; 4124 u64 disko = 0; 4125 u64 isize = i_size_read(inode); 4126 struct btrfs_key found_key; 4127 struct extent_map *em = NULL; 4128 struct extent_state *cached_state = NULL; 4129 struct btrfs_path *path; 4130 int end = 0; 4131 u64 em_start = 0; 4132 u64 em_len = 0; 4133 u64 em_end = 0; 4134 4135 if (len == 0) 4136 return -EINVAL; 4137 4138 path = btrfs_alloc_path(); 4139 if (!path) 4140 return -ENOMEM; 4141 path->leave_spinning = 1; 4142 4143 start = ALIGN(start, BTRFS_I(inode)->root->sectorsize); 4144 len = ALIGN(len, BTRFS_I(inode)->root->sectorsize); 4145 4146 /* 4147 * lookup the last file extent. 
We're not using i_size here 4148 * because there might be preallocation past i_size 4149 */ 4150 ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, 4151 path, btrfs_ino(inode), -1, 0); 4152 if (ret < 0) { 4153 btrfs_free_path(path); 4154 return ret; 4155 } 4156 WARN_ON(!ret); 4157 path->slots[0]--; 4158 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 4159 found_type = btrfs_key_type(&found_key); 4160 4161 /* No extents, but there might be delalloc bits */ 4162 if (found_key.objectid != btrfs_ino(inode) || 4163 found_type != BTRFS_EXTENT_DATA_KEY) { 4164 /* have to trust i_size as the end */ 4165 last = (u64)-1; 4166 last_for_get_extent = isize; 4167 } else { 4168 /* 4169 * remember the start of the last extent. There are a 4170 * bunch of different factors that go into the length of the 4171 * extent, so its much less complex to remember where it started 4172 */ 4173 last = found_key.offset; 4174 last_for_get_extent = last + 1; 4175 } 4176 btrfs_release_path(path); 4177 4178 /* 4179 * we might have some extents allocated but more delalloc past those 4180 * extents. so, we trust isize unless the start of the last extent is 4181 * beyond isize 4182 */ 4183 if (last < isize) { 4184 last = (u64)-1; 4185 last_for_get_extent = isize; 4186 } 4187 4188 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 0, 4189 &cached_state); 4190 4191 em = get_extent_skip_holes(inode, start, last_for_get_extent, 4192 get_extent); 4193 if (!em) 4194 goto out; 4195 if (IS_ERR(em)) { 4196 ret = PTR_ERR(em); 4197 goto out; 4198 } 4199 4200 while (!end) { 4201 u64 offset_in_extent = 0; 4202 4203 /* break if the extent we found is outside the range */ 4204 if (em->start >= max || extent_map_end(em) < off) 4205 break; 4206 4207 /* 4208 * get_extent may return an extent that starts before our 4209 * requested range. We have to make sure the ranges 4210 * we return to fiemap always move forward and don't 4211 * overlap, so adjust the offsets here 4212 */ 4213 em_start = max(em->start, off); 4214 4215 /* 4216 * record the offset from the start of the extent 4217 * for adjusting the disk offset below. Only do this if the 4218 * extent isn't compressed since our in ram offset may be past 4219 * what we have actually allocated on disk. 4220 */ 4221 if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 4222 offset_in_extent = em_start - em->start; 4223 em_end = extent_map_end(em); 4224 em_len = em_end - em_start; 4225 disko = 0; 4226 flags = 0; 4227 4228 /* 4229 * bump off for our next call to get_extent 4230 */ 4231 off = extent_map_end(em); 4232 if (off >= max) 4233 end = 1; 4234 4235 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 4236 end = 1; 4237 flags |= FIEMAP_EXTENT_LAST; 4238 } else if (em->block_start == EXTENT_MAP_INLINE) { 4239 flags |= (FIEMAP_EXTENT_DATA_INLINE | 4240 FIEMAP_EXTENT_NOT_ALIGNED); 4241 } else if (em->block_start == EXTENT_MAP_DELALLOC) { 4242 flags |= (FIEMAP_EXTENT_DELALLOC | 4243 FIEMAP_EXTENT_UNKNOWN); 4244 } else { 4245 unsigned long ref_cnt = 0; 4246 4247 disko = em->block_start + offset_in_extent; 4248 4249 /* 4250 * As btrfs supports shared space, this information 4251 * can be exported to userspace tools via 4252 * flag FIEMAP_EXTENT_SHARED. 
4253 */ 4254 ret = iterate_inodes_from_logical( 4255 em->block_start, 4256 BTRFS_I(inode)->root->fs_info, 4257 path, count_ext_ref, &ref_cnt); 4258 if (ret < 0 && ret != -ENOENT) 4259 goto out_free; 4260 4261 if (ref_cnt > 1) 4262 flags |= FIEMAP_EXTENT_SHARED; 4263 } 4264 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 4265 flags |= FIEMAP_EXTENT_ENCODED; 4266 4267 free_extent_map(em); 4268 em = NULL; 4269 if ((em_start >= last) || em_len == (u64)-1 || 4270 (last == (u64)-1 && isize <= em_end)) { 4271 flags |= FIEMAP_EXTENT_LAST; 4272 end = 1; 4273 } 4274 4275 /* now scan forward to see if this is really the last extent. */ 4276 em = get_extent_skip_holes(inode, off, last_for_get_extent, 4277 get_extent); 4278 if (IS_ERR(em)) { 4279 ret = PTR_ERR(em); 4280 goto out; 4281 } 4282 if (!em) { 4283 flags |= FIEMAP_EXTENT_LAST; 4284 end = 1; 4285 } 4286 ret = fiemap_fill_next_extent(fieinfo, em_start, disko, 4287 em_len, flags); 4288 if (ret) 4289 goto out_free; 4290 } 4291 out_free: 4292 free_extent_map(em); 4293 out: 4294 btrfs_free_path(path); 4295 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, 4296 &cached_state, GFP_NOFS); 4297 return ret; 4298 } 4299 4300 static void __free_extent_buffer(struct extent_buffer *eb) 4301 { 4302 btrfs_leak_debug_del(&eb->leak_list); 4303 kmem_cache_free(extent_buffer_cache, eb); 4304 } 4305 4306 static int extent_buffer_under_io(struct extent_buffer *eb) 4307 { 4308 return (atomic_read(&eb->io_pages) || 4309 test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || 4310 test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 4311 } 4312 4313 /* 4314 * Helper for releasing extent buffer page. 4315 */ 4316 static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, 4317 unsigned long start_idx) 4318 { 4319 unsigned long index; 4320 unsigned long num_pages; 4321 struct page *page; 4322 int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); 4323 4324 BUG_ON(extent_buffer_under_io(eb)); 4325 4326 num_pages = num_extent_pages(eb->start, eb->len); 4327 index = start_idx + num_pages; 4328 if (start_idx >= index) 4329 return; 4330 4331 do { 4332 index--; 4333 page = extent_buffer_page(eb, index); 4334 if (page && mapped) { 4335 spin_lock(&page->mapping->private_lock); 4336 /* 4337 * We do this since we'll remove the pages after we've 4338 * removed the eb from the radix tree, so we could race 4339 * and have this page now attached to the new eb. So 4340 * only clear page_private if it's still connected to 4341 * this eb. 4342 */ 4343 if (PagePrivate(page) && 4344 page->private == (unsigned long)eb) { 4345 BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 4346 BUG_ON(PageDirty(page)); 4347 BUG_ON(PageWriteback(page)); 4348 /* 4349 * We need to make sure we haven't be attached 4350 * to a new eb. 4351 */ 4352 ClearPagePrivate(page); 4353 set_page_private(page, 0); 4354 /* One for the page private */ 4355 page_cache_release(page); 4356 } 4357 spin_unlock(&page->mapping->private_lock); 4358 4359 } 4360 if (page) { 4361 /* One for when we alloced the page */ 4362 page_cache_release(page); 4363 } 4364 } while (index != start_idx); 4365 } 4366 4367 /* 4368 * Helper for releasing the extent buffer. 
4369 */ 4370 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) 4371 { 4372 btrfs_release_extent_buffer_page(eb, 0); 4373 __free_extent_buffer(eb); 4374 } 4375 4376 static struct extent_buffer * 4377 __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, 4378 unsigned long len, gfp_t mask) 4379 { 4380 struct extent_buffer *eb = NULL; 4381 4382 eb = kmem_cache_zalloc(extent_buffer_cache, mask); 4383 if (eb == NULL) 4384 return NULL; 4385 eb->start = start; 4386 eb->len = len; 4387 eb->fs_info = fs_info; 4388 eb->bflags = 0; 4389 rwlock_init(&eb->lock); 4390 atomic_set(&eb->write_locks, 0); 4391 atomic_set(&eb->read_locks, 0); 4392 atomic_set(&eb->blocking_readers, 0); 4393 atomic_set(&eb->blocking_writers, 0); 4394 atomic_set(&eb->spinning_readers, 0); 4395 atomic_set(&eb->spinning_writers, 0); 4396 eb->lock_nested = 0; 4397 init_waitqueue_head(&eb->write_lock_wq); 4398 init_waitqueue_head(&eb->read_lock_wq); 4399 4400 btrfs_leak_debug_add(&eb->leak_list, &buffers); 4401 4402 spin_lock_init(&eb->refs_lock); 4403 atomic_set(&eb->refs, 1); 4404 atomic_set(&eb->io_pages, 0); 4405 4406 /* 4407 * Sanity checks, currently the maximum is 64k covered by 16x 4k pages 4408 */ 4409 BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE 4410 > MAX_INLINE_EXTENT_BUFFER_SIZE); 4411 BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE); 4412 4413 return eb; 4414 } 4415 4416 struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) 4417 { 4418 unsigned long i; 4419 struct page *p; 4420 struct extent_buffer *new; 4421 unsigned long num_pages = num_extent_pages(src->start, src->len); 4422 4423 new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_NOFS); 4424 if (new == NULL) 4425 return NULL; 4426 4427 for (i = 0; i < num_pages; i++) { 4428 p = alloc_page(GFP_NOFS); 4429 if (!p) { 4430 btrfs_release_extent_buffer(new); 4431 return NULL; 4432 } 4433 attach_extent_buffer_page(new, p); 4434 WARN_ON(PageDirty(p)); 4435 SetPageUptodate(p); 4436 new->pages[i] = p; 4437 } 4438 4439 copy_extent_buffer(new, src, 0, 0, src->len); 4440 set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags); 4441 set_bit(EXTENT_BUFFER_DUMMY, &new->bflags); 4442 4443 return new; 4444 } 4445 4446 struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len) 4447 { 4448 struct extent_buffer *eb; 4449 unsigned long num_pages = num_extent_pages(0, len); 4450 unsigned long i; 4451 4452 eb = __alloc_extent_buffer(NULL, start, len, GFP_NOFS); 4453 if (!eb) 4454 return NULL; 4455 4456 for (i = 0; i < num_pages; i++) { 4457 eb->pages[i] = alloc_page(GFP_NOFS); 4458 if (!eb->pages[i]) 4459 goto err; 4460 } 4461 set_extent_buffer_uptodate(eb); 4462 btrfs_set_header_nritems(eb, 0); 4463 set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); 4464 4465 return eb; 4466 err: 4467 for (; i > 0; i--) 4468 __free_page(eb->pages[i - 1]); 4469 __free_extent_buffer(eb); 4470 return NULL; 4471 } 4472 4473 static void check_buffer_tree_ref(struct extent_buffer *eb) 4474 { 4475 int refs; 4476 /* the ref bit is tricky. We have to make sure it is set 4477 * if we have the buffer dirty. Otherwise the 4478 * code to free a buffer can end up dropping a dirty 4479 * page 4480 * 4481 * Once the ref bit is set, it won't go away while the 4482 * buffer is dirty or in writeback, and it also won't 4483 * go away while we have the reference count on the 4484 * eb bumped. 4485 * 4486 * We can't just set the ref bit without bumping the 4487 * ref on the eb because free_extent_buffer might 4488 * see the ref bit and try to clear it. 
If this happens 4489 * free_extent_buffer might end up dropping our original 4490 * ref by mistake and freeing the page before we are able 4491 * to add one more ref. 4492 * 4493 * So bump the ref count first, then set the bit. If someone 4494 * beat us to it, drop the ref we added. 4495 */ 4496 refs = atomic_read(&eb->refs); 4497 if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 4498 return; 4499 4500 spin_lock(&eb->refs_lock); 4501 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 4502 atomic_inc(&eb->refs); 4503 spin_unlock(&eb->refs_lock); 4504 } 4505 4506 static void mark_extent_buffer_accessed(struct extent_buffer *eb) 4507 { 4508 unsigned long num_pages, i; 4509 4510 check_buffer_tree_ref(eb); 4511 4512 num_pages = num_extent_pages(eb->start, eb->len); 4513 for (i = 0; i < num_pages; i++) { 4514 struct page *p = extent_buffer_page(eb, i); 4515 mark_page_accessed(p); 4516 } 4517 } 4518 4519 struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, 4520 u64 start) 4521 { 4522 struct extent_buffer *eb; 4523 4524 rcu_read_lock(); 4525 eb = radix_tree_lookup(&fs_info->buffer_radix, 4526 start >> PAGE_CACHE_SHIFT); 4527 if (eb && atomic_inc_not_zero(&eb->refs)) { 4528 rcu_read_unlock(); 4529 mark_extent_buffer_accessed(eb); 4530 return eb; 4531 } 4532 rcu_read_unlock(); 4533 4534 return NULL; 4535 }
4536 4537 struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, 4538 u64 start, unsigned long len) 4539 { 4540 unsigned long num_pages = num_extent_pages(start, len); 4541 unsigned long i; 4542 unsigned long index = start >> PAGE_CACHE_SHIFT; 4543 struct extent_buffer *eb; 4544 struct extent_buffer *exists = NULL; 4545 struct page *p; 4546 struct address_space *mapping = fs_info->btree_inode->i_mapping; 4547 int uptodate = 1; 4548 int ret; 4549 4550 eb = find_extent_buffer(fs_info, start); 4551 if (eb) 4552 return eb; 4553 4554 eb = __alloc_extent_buffer(fs_info, start, len, GFP_NOFS); 4555 if (!eb) 4556 return NULL; 4557 4558 for (i = 0; i < num_pages; i++, index++) { 4559 p = find_or_create_page(mapping, index, GFP_NOFS); 4560 if (!p) 4561 goto free_eb; 4562 4563 spin_lock(&mapping->private_lock); 4564 if (PagePrivate(p)) { 4565 /* 4566 * We could have already allocated an eb for this page 4567 * and attached one, so let's see if we can get a ref on 4568 * the existing eb. If we can, we know it's good and 4569 * we can just return that one; otherwise we know we can 4570 * safely overwrite page->private. 4571 */ 4572 exists = (struct extent_buffer *)p->private; 4573 if (atomic_inc_not_zero(&exists->refs)) { 4574 spin_unlock(&mapping->private_lock); 4575 unlock_page(p); 4576 page_cache_release(p); 4577 mark_extent_buffer_accessed(exists); 4578 goto free_eb; 4579 } 4580 4581 /* 4582 * Do this so attach doesn't complain; we also need to 4583 * drop the page ref the old eb was holding. 4584 */ 4585 ClearPagePrivate(p); 4586 WARN_ON(PageDirty(p)); 4587 page_cache_release(p); 4588 } 4589 attach_extent_buffer_page(eb, p); 4590 spin_unlock(&mapping->private_lock); 4591 WARN_ON(PageDirty(p)); 4592 mark_page_accessed(p); 4593 eb->pages[i] = p; 4594 if (!PageUptodate(p)) 4595 uptodate = 0; 4596 4597 /* 4598 * see below about how we avoid a nasty race with release page 4599 * and why we unlock later 4600 */ 4601 } 4602 if (uptodate) 4603 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 4604 again: 4605 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); 4606 if (ret) 4607 goto free_eb; 4608 4609 spin_lock(&fs_info->buffer_lock); 4610 ret = radix_tree_insert(&fs_info->buffer_radix, 4611 start >> PAGE_CACHE_SHIFT, eb); 4612 spin_unlock(&fs_info->buffer_lock); 4613 radix_tree_preload_end(); 4614 if (ret == -EEXIST) { 4615 exists = find_extent_buffer(fs_info, start); 4616 if (exists) 4617 goto free_eb; 4618 else 4619 goto again; 4620 } 4621 /* add one reference for the tree */ 4622 check_buffer_tree_ref(eb); 4623 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); 4624 4625 /* 4626 * there is a race where release page may have 4627 * tried to find this extent buffer in the radix 4628 * but failed. It will tell the VM it is safe to 4629 * reclaim the page, and it will clear the page private bit. 4630 * We must make sure to set the page private bit properly 4631 * after the extent buffer is in the radix tree so 4632 * it doesn't get lost 4633 */ 4634 SetPageChecked(eb->pages[0]); 4635 for (i = 1; i < num_pages; i++) { 4636 p = extent_buffer_page(eb, i); 4637 ClearPageChecked(p); 4638 unlock_page(p); 4639 } 4640 unlock_page(eb->pages[0]); 4641 return eb; 4642 4643 free_eb: 4644 for (i = 0; i < num_pages; i++) { 4645 if (eb->pages[i]) 4646 unlock_page(eb->pages[i]); 4647 } 4648 4649 WARN_ON(!atomic_dec_and_test(&eb->refs)); 4650 btrfs_release_extent_buffer(eb); 4651 return exists; 4652 }
4653 4654 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) 4655 { 4656 struct extent_buffer *eb = 4657 container_of(head, struct extent_buffer, rcu_head); 4658 4659 __free_extent_buffer(eb); 4660 } 4661 4662 /* Expects to have eb->refs_lock already held */ 4663 static int release_extent_buffer(struct extent_buffer *eb) 4664 { 4665 WARN_ON(atomic_read(&eb->refs) == 0); 4666 if (atomic_dec_and_test(&eb->refs)) { 4667 if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) { 4668 struct btrfs_fs_info *fs_info = eb->fs_info; 4669 4670 spin_unlock(&eb->refs_lock); 4671 4672 spin_lock(&fs_info->buffer_lock); 4673 radix_tree_delete(&fs_info->buffer_radix, 4674 eb->start >> PAGE_CACHE_SHIFT); 4675 spin_unlock(&fs_info->buffer_lock); 4676 } else { 4677 spin_unlock(&eb->refs_lock); 4678 } 4679 4680 /* Should be safe to release our pages at this point */ 4681 btrfs_release_extent_buffer_page(eb, 0); 4682 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); 4683 return 1; 4684 } 4685 spin_unlock(&eb->refs_lock); 4686 4687 return 0; 4688 } 4689 4690 void free_extent_buffer(struct extent_buffer *eb) 4691 { 4692 int refs; 4693 int old; 4694 if (!eb) 4695 return; 4696 4697 while (1) { 4698 refs = atomic_read(&eb->refs); 4699 if (refs <= 3) 4700 break; 4701 old = atomic_cmpxchg(&eb->refs, refs, refs - 1); 4702 if (old == refs) 4703 return; 4704 } 4705 4706 spin_lock(&eb->refs_lock); 4707 if (atomic_read(&eb->refs) == 2 && 4708 test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) 4709 atomic_dec(&eb->refs); 4710 4711 if (atomic_read(&eb->refs) == 2 && 4712 test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && 4713
!extent_buffer_under_io(eb) && 4714 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 4715 atomic_dec(&eb->refs); 4716 4717 /* 4718 * I know this is terrible, but it's temporary until we stop tracking 4719 * the uptodate bits and such for the extent buffers. 4720 */ 4721 release_extent_buffer(eb); 4722 } 4723 4724 void free_extent_buffer_stale(struct extent_buffer *eb) 4725 { 4726 if (!eb) 4727 return; 4728 4729 spin_lock(&eb->refs_lock); 4730 set_bit(EXTENT_BUFFER_STALE, &eb->bflags); 4731 4732 if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) && 4733 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 4734 atomic_dec(&eb->refs); 4735 release_extent_buffer(eb); 4736 } 4737 4738 void clear_extent_buffer_dirty(struct extent_buffer *eb) 4739 { 4740 unsigned long i; 4741 unsigned long num_pages; 4742 struct page *page; 4743 4744 num_pages = num_extent_pages(eb->start, eb->len); 4745 4746 for (i = 0; i < num_pages; i++) { 4747 page = extent_buffer_page(eb, i); 4748 if (!PageDirty(page)) 4749 continue; 4750 4751 lock_page(page); 4752 WARN_ON(!PagePrivate(page)); 4753 4754 clear_page_dirty_for_io(page); 4755 spin_lock_irq(&page->mapping->tree_lock); 4756 if (!PageDirty(page)) { 4757 radix_tree_tag_clear(&page->mapping->page_tree, 4758 page_index(page), 4759 PAGECACHE_TAG_DIRTY); 4760 } 4761 spin_unlock_irq(&page->mapping->tree_lock); 4762 ClearPageError(page); 4763 unlock_page(page); 4764 } 4765 WARN_ON(atomic_read(&eb->refs) == 0); 4766 } 4767 4768 int set_extent_buffer_dirty(struct extent_buffer *eb) 4769 { 4770 unsigned long i; 4771 unsigned long num_pages; 4772 int was_dirty = 0; 4773 4774 check_buffer_tree_ref(eb); 4775 4776 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); 4777 4778 num_pages = num_extent_pages(eb->start, eb->len); 4779 WARN_ON(atomic_read(&eb->refs) == 0); 4780 WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); 4781 4782 for (i = 0; i < num_pages; i++) 4783 set_page_dirty(extent_buffer_page(eb, i)); 4784 return was_dirty; 4785 } 4786 4787 int clear_extent_buffer_uptodate(struct extent_buffer *eb) 4788 { 4789 unsigned long i; 4790 struct page *page; 4791 unsigned long num_pages; 4792 4793 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 4794 num_pages = num_extent_pages(eb->start, eb->len); 4795 for (i = 0; i < num_pages; i++) { 4796 page = extent_buffer_page(eb, i); 4797 if (page) 4798 ClearPageUptodate(page); 4799 } 4800 return 0; 4801 } 4802 4803 int set_extent_buffer_uptodate(struct extent_buffer *eb) 4804 { 4805 unsigned long i; 4806 struct page *page; 4807 unsigned long num_pages; 4808 4809 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 4810 num_pages = num_extent_pages(eb->start, eb->len); 4811 for (i = 0; i < num_pages; i++) { 4812 page = extent_buffer_page(eb, i); 4813 SetPageUptodate(page); 4814 } 4815 return 0; 4816 } 4817 4818 int extent_buffer_uptodate(struct extent_buffer *eb) 4819 { 4820 return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 4821 } 4822 4823 int read_extent_buffer_pages(struct extent_io_tree *tree, 4824 struct extent_buffer *eb, u64 start, int wait, 4825 get_extent_t *get_extent, int mirror_num) 4826 { 4827 unsigned long i; 4828 unsigned long start_i; 4829 struct page *page; 4830 int err; 4831 int ret = 0; 4832 int locked_pages = 0; 4833 int all_uptodate = 1; 4834 unsigned long num_pages; 4835 unsigned long num_reads = 0; 4836 struct bio *bio = NULL; 4837 unsigned long bio_flags = 0; 4838 4839 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 4840 return 0; 4841 4842 if (start) { 4843 WARN_ON(start 
< eb->start); 4844 start_i = (start >> PAGE_CACHE_SHIFT) - 4845 (eb->start >> PAGE_CACHE_SHIFT); 4846 } else { 4847 start_i = 0; 4848 } 4849 4850 num_pages = num_extent_pages(eb->start, eb->len); 4851 for (i = start_i; i < num_pages; i++) { 4852 page = extent_buffer_page(eb, i); 4853 if (wait == WAIT_NONE) { 4854 if (!trylock_page(page)) 4855 goto unlock_exit; 4856 } else { 4857 lock_page(page); 4858 } 4859 locked_pages++; 4860 if (!PageUptodate(page)) { 4861 num_reads++; 4862 all_uptodate = 0; 4863 } 4864 } 4865 if (all_uptodate) { 4866 if (start_i == 0) 4867 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 4868 goto unlock_exit; 4869 } 4870 4871 clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 4872 eb->read_mirror = 0; 4873 atomic_set(&eb->io_pages, num_reads); 4874 for (i = start_i; i < num_pages; i++) { 4875 page = extent_buffer_page(eb, i); 4876 if (!PageUptodate(page)) { 4877 ClearPageError(page); 4878 err = __extent_read_full_page(tree, page, 4879 get_extent, &bio, 4880 mirror_num, &bio_flags, 4881 READ | REQ_META); 4882 if (err) 4883 ret = err; 4884 } else { 4885 unlock_page(page); 4886 } 4887 } 4888 4889 if (bio) { 4890 err = submit_one_bio(READ | REQ_META, bio, mirror_num, 4891 bio_flags); 4892 if (err) 4893 return err; 4894 } 4895 4896 if (ret || wait != WAIT_COMPLETE) 4897 return ret; 4898 4899 for (i = start_i; i < num_pages; i++) { 4900 page = extent_buffer_page(eb, i); 4901 wait_on_page_locked(page); 4902 if (!PageUptodate(page)) 4903 ret = -EIO; 4904 } 4905 4906 return ret; 4907 4908 unlock_exit: 4909 i = start_i; 4910 while (locked_pages > 0) { 4911 page = extent_buffer_page(eb, i); 4912 i++; 4913 unlock_page(page); 4914 locked_pages--; 4915 } 4916 return ret; 4917 } 4918 4919 void read_extent_buffer(struct extent_buffer *eb, void *dstv, 4920 unsigned long start, 4921 unsigned long len) 4922 { 4923 size_t cur; 4924 size_t offset; 4925 struct page *page; 4926 char *kaddr; 4927 char *dst = (char *)dstv; 4928 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 4929 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 4930 4931 WARN_ON(start > eb->len); 4932 WARN_ON(start + len > eb->start + eb->len); 4933 4934 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); 4935 4936 while (len > 0) { 4937 page = extent_buffer_page(eb, i); 4938 4939 cur = min(len, (PAGE_CACHE_SIZE - offset)); 4940 kaddr = page_address(page); 4941 memcpy(dst, kaddr + offset, cur); 4942 4943 dst += cur; 4944 len -= cur; 4945 offset = 0; 4946 i++; 4947 } 4948 } 4949 4950 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, 4951 unsigned long min_len, char **map, 4952 unsigned long *map_start, 4953 unsigned long *map_len) 4954 { 4955 size_t offset = start & (PAGE_CACHE_SIZE - 1); 4956 char *kaddr; 4957 struct page *p; 4958 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 4959 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 4960 unsigned long end_i = (start_offset + start + min_len - 1) >> 4961 PAGE_CACHE_SHIFT; 4962 4963 if (i != end_i) 4964 return -EINVAL; 4965 4966 if (i == 0) { 4967 offset = start_offset; 4968 *map_start = 0; 4969 } else { 4970 offset = 0; 4971 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset; 4972 } 4973 4974 if (start + min_len > eb->len) { 4975 WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, " 4976 "wanted %lu %lu\n", 4977 eb->start, eb->len, start, min_len); 4978 return -EINVAL; 4979 } 4980 4981 p = extent_buffer_page(eb, i); 4982 kaddr = page_address(p); 4983 *map = kaddr + offset; 4984 *map_len = 
PAGE_CACHE_SIZE - offset; 4985 return 0; 4986 } 4987 4988 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, 4989 unsigned long start, 4990 unsigned long len) 4991 { 4992 size_t cur; 4993 size_t offset; 4994 struct page *page; 4995 char *kaddr; 4996 char *ptr = (char *)ptrv; 4997 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 4998 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 4999 int ret = 0; 5000 5001 WARN_ON(start > eb->len); 5002 WARN_ON(start + len > eb->start + eb->len); 5003 5004 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); 5005 5006 while (len > 0) { 5007 page = extent_buffer_page(eb, i); 5008 5009 cur = min(len, (PAGE_CACHE_SIZE - offset)); 5010 5011 kaddr = page_address(page); 5012 ret = memcmp(ptr, kaddr + offset, cur); 5013 if (ret) 5014 break; 5015 5016 ptr += cur; 5017 len -= cur; 5018 offset = 0; 5019 i++; 5020 } 5021 return ret; 5022 } 5023 5024 void write_extent_buffer(struct extent_buffer *eb, const void *srcv, 5025 unsigned long start, unsigned long len) 5026 { 5027 size_t cur; 5028 size_t offset; 5029 struct page *page; 5030 char *kaddr; 5031 char *src = (char *)srcv; 5032 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 5033 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 5034 5035 WARN_ON(start > eb->len); 5036 WARN_ON(start + len > eb->start + eb->len); 5037 5038 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); 5039 5040 while (len > 0) { 5041 page = extent_buffer_page(eb, i); 5042 WARN_ON(!PageUptodate(page)); 5043 5044 cur = min(len, PAGE_CACHE_SIZE - offset); 5045 kaddr = page_address(page); 5046 memcpy(kaddr + offset, src, cur); 5047 5048 src += cur; 5049 len -= cur; 5050 offset = 0; 5051 i++; 5052 } 5053 } 5054 5055 void memset_extent_buffer(struct extent_buffer *eb, char c, 5056 unsigned long start, unsigned long len) 5057 { 5058 size_t cur; 5059 size_t offset; 5060 struct page *page; 5061 char *kaddr; 5062 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 5063 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 5064 5065 WARN_ON(start > eb->len); 5066 WARN_ON(start + len > eb->start + eb->len); 5067 5068 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); 5069 5070 while (len > 0) { 5071 page = extent_buffer_page(eb, i); 5072 WARN_ON(!PageUptodate(page)); 5073 5074 cur = min(len, PAGE_CACHE_SIZE - offset); 5075 kaddr = page_address(page); 5076 memset(kaddr + offset, c, cur); 5077 5078 len -= cur; 5079 offset = 0; 5080 i++; 5081 } 5082 } 5083 5084 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, 5085 unsigned long dst_offset, unsigned long src_offset, 5086 unsigned long len) 5087 { 5088 u64 dst_len = dst->len; 5089 size_t cur; 5090 size_t offset; 5091 struct page *page; 5092 char *kaddr; 5093 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); 5094 unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; 5095 5096 WARN_ON(src->len != dst_len); 5097 5098 offset = (start_offset + dst_offset) & 5099 (PAGE_CACHE_SIZE - 1); 5100 5101 while (len > 0) { 5102 page = extent_buffer_page(dst, i); 5103 WARN_ON(!PageUptodate(page)); 5104 5105 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); 5106 5107 kaddr = page_address(page); 5108 read_extent_buffer(src, kaddr + offset, src_offset, cur); 5109 5110 src_offset += cur; 5111 len -= cur; 5112 offset = 0; 5113 i++; 5114 } 5115 } 5116 5117 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) 5118 { 5119 unsigned 
long distance = (src > dst) ? src - dst : dst - src; 5120 return distance < len; 5121 } 5122 5123 static void copy_pages(struct page *dst_page, struct page *src_page, 5124 unsigned long dst_off, unsigned long src_off, 5125 unsigned long len) 5126 { 5127 char *dst_kaddr = page_address(dst_page); 5128 char *src_kaddr; 5129 int must_memmove = 0; 5130 5131 if (dst_page != src_page) { 5132 src_kaddr = page_address(src_page); 5133 } else { 5134 src_kaddr = dst_kaddr; 5135 if (areas_overlap(src_off, dst_off, len)) 5136 must_memmove = 1; 5137 } 5138 5139 if (must_memmove) 5140 memmove(dst_kaddr + dst_off, src_kaddr + src_off, len); 5141 else 5142 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); 5143 } 5144 5145 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 5146 unsigned long src_offset, unsigned long len) 5147 { 5148 size_t cur; 5149 size_t dst_off_in_page; 5150 size_t src_off_in_page; 5151 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); 5152 unsigned long dst_i; 5153 unsigned long src_i; 5154 5155 if (src_offset + len > dst->len) { 5156 printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move " 5157 "len %lu dst len %lu\n", src_offset, len, dst->len); 5158 BUG_ON(1); 5159 } 5160 if (dst_offset + len > dst->len) { 5161 printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move " 5162 "len %lu dst len %lu\n", dst_offset, len, dst->len); 5163 BUG_ON(1); 5164 } 5165 5166 while (len > 0) { 5167 dst_off_in_page = (start_offset + dst_offset) & 5168 (PAGE_CACHE_SIZE - 1); 5169 src_off_in_page = (start_offset + src_offset) & 5170 (PAGE_CACHE_SIZE - 1); 5171 5172 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; 5173 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT; 5174 5175 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - 5176 src_off_in_page)); 5177 cur = min_t(unsigned long, cur, 5178 (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page)); 5179 5180 copy_pages(extent_buffer_page(dst, dst_i), 5181 extent_buffer_page(dst, src_i), 5182 dst_off_in_page, src_off_in_page, cur); 5183 5184 src_offset += cur; 5185 dst_offset += cur; 5186 len -= cur; 5187 } 5188 } 5189 5190 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 5191 unsigned long src_offset, unsigned long len) 5192 { 5193 size_t cur; 5194 size_t dst_off_in_page; 5195 size_t src_off_in_page; 5196 unsigned long dst_end = dst_offset + len - 1; 5197 unsigned long src_end = src_offset + len - 1; 5198 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); 5199 unsigned long dst_i; 5200 unsigned long src_i; 5201 5202 if (src_offset + len > dst->len) { 5203 printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move " 5204 "len %lu len %lu\n", src_offset, len, dst->len); 5205 BUG_ON(1); 5206 } 5207 if (dst_offset + len > dst->len) { 5208 printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move " 5209 "len %lu len %lu\n", dst_offset, len, dst->len); 5210 BUG_ON(1); 5211 } 5212 if (dst_offset < src_offset) { 5213 memcpy_extent_buffer(dst, dst_offset, src_offset, len); 5214 return; 5215 } 5216 while (len > 0) { 5217 dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT; 5218 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT; 5219 5220 dst_off_in_page = (start_offset + dst_end) & 5221 (PAGE_CACHE_SIZE - 1); 5222 src_off_in_page = (start_offset + src_end) & 5223 (PAGE_CACHE_SIZE - 1); 5224 5225 cur = min_t(unsigned long, len, src_off_in_page + 1); 5226 cur = min(cur, dst_off_in_page + 1); 5227 copy_pages(extent_buffer_page(dst, dst_i), 5228 
extent_buffer_page(dst, src_i), 5229 dst_off_in_page - cur + 1, 5230 src_off_in_page - cur + 1, cur); 5231 5232 dst_end -= cur; 5233 src_end -= cur; 5234 len -= cur; 5235 } 5236 } 5237 5238 int try_release_extent_buffer(struct page *page) 5239 { 5240 struct extent_buffer *eb; 5241 5242 /* 5243 * We need to make sure nobody is attaching this page to an eb right 5244 * now. 5245 */ 5246 spin_lock(&page->mapping->private_lock); 5247 if (!PagePrivate(page)) { 5248 spin_unlock(&page->mapping->private_lock); 5249 return 1; 5250 } 5251 5252 eb = (struct extent_buffer *)page->private; 5253 BUG_ON(!eb); 5254 5255 /* 5256 * This is a little awful but should be ok; we need to make sure that 5257 * the eb doesn't disappear out from under us while we're looking at 5258 * this page. 5259 */ 5260 spin_lock(&eb->refs_lock); 5261 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { 5262 spin_unlock(&eb->refs_lock); 5263 spin_unlock(&page->mapping->private_lock); 5264 return 0; 5265 } 5266 spin_unlock(&page->mapping->private_lock); 5267 5268 /* 5269 * If the tree ref isn't set then we know the ref on this eb is a real 5270 * ref, so just return; this page will likely be freed soon anyway. 5271 */ 5272 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { 5273 spin_unlock(&eb->refs_lock); 5274 return 0; 5275 } 5276 5277 return release_extent_buffer(eb); 5278 } 5279
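/*
 * Editor's illustrative sketch, not part of the original file: a minimal
 * example of how the byte-level extent buffer helpers defined above
 * (alloc_dummy_extent_buffer, memset_extent_buffer, write_extent_buffer,
 * read_extent_buffer, memcmp_extent_buffer, free_extent_buffer) are meant
 * to be combined, in the spirit of the btrfs sanity tests.  The function
 * name, the buffer start/len and the test payload are hypothetical, and
 * the block is kept under #if 0 so it is never built.
 */
#if 0
static int example_eb_byte_roundtrip(void)
{
	struct extent_buffer *eb;
	char in[16] = "btrfs example";
	char out[16];

	/* dummy ebs are backed by freshly allocated, uptodate pages */
	eb = alloc_dummy_extent_buffer(0, 4096);
	if (!eb)
		return -ENOMEM;

	/* zero the whole buffer, then write a small blob and read it back */
	memset_extent_buffer(eb, 0, 0, 4096);
	write_extent_buffer(eb, in, 128, sizeof(in));
	read_extent_buffer(eb, out, 128, sizeof(out));

	/* memcmp_extent_buffer compares in-buffer bytes against a pointer */
	WARN_ON(memcmp_extent_buffer(eb, in, 128, sizeof(in)));
	WARN_ON(memcmp(in, out, sizeof(out)));

	/* drops the last reference and releases the backing pages */
	free_extent_buffer(eb);
	return 0;
}
#endif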