1 #include <linux/bitops.h> 2 #include <linux/slab.h> 3 #include <linux/bio.h> 4 #include <linux/mm.h> 5 #include <linux/pagemap.h> 6 #include <linux/page-flags.h> 7 #include <linux/spinlock.h> 8 #include <linux/blkdev.h> 9 #include <linux/swap.h> 10 #include <linux/writeback.h> 11 #include <linux/pagevec.h> 12 #include <linux/prefetch.h> 13 #include <linux/cleancache.h> 14 #include "extent_io.h" 15 #include "extent_map.h" 16 #include "compat.h" 17 #include "ctree.h" 18 #include "btrfs_inode.h" 19 #include "volumes.h" 20 #include "check-integrity.h" 21 #include "locking.h" 22 #include "rcu-string.h" 23 24 static struct kmem_cache *extent_state_cache; 25 static struct kmem_cache *extent_buffer_cache; 26 static struct bio_set *btrfs_bioset; 27 28 #ifdef CONFIG_BTRFS_DEBUG 29 static LIST_HEAD(buffers); 30 static LIST_HEAD(states); 31 32 static DEFINE_SPINLOCK(leak_lock); 33 34 static inline 35 void btrfs_leak_debug_add(struct list_head *new, struct list_head *head) 36 { 37 unsigned long flags; 38 39 spin_lock_irqsave(&leak_lock, flags); 40 list_add(new, head); 41 spin_unlock_irqrestore(&leak_lock, flags); 42 } 43 44 static inline 45 void btrfs_leak_debug_del(struct list_head *entry) 46 { 47 unsigned long flags; 48 49 spin_lock_irqsave(&leak_lock, flags); 50 list_del(entry); 51 spin_unlock_irqrestore(&leak_lock, flags); 52 } 53 54 static inline 55 void btrfs_leak_debug_check(void) 56 { 57 struct extent_state *state; 58 struct extent_buffer *eb; 59 60 while (!list_empty(&states)) { 61 state = list_entry(states.next, struct extent_state, leak_list); 62 printk(KERN_ERR "btrfs state leak: start %llu end %llu " 63 "state %lu in tree %p refs %d\n", 64 (unsigned long long)state->start, 65 (unsigned long long)state->end, 66 state->state, state->tree, atomic_read(&state->refs)); 67 list_del(&state->leak_list); 68 kmem_cache_free(extent_state_cache, state); 69 } 70 71 while (!list_empty(&buffers)) { 72 eb = list_entry(buffers.next, struct extent_buffer, leak_list); 73 printk(KERN_ERR "btrfs buffer leak start %llu len %lu " 74 "refs %d\n", (unsigned long long)eb->start, 75 eb->len, atomic_read(&eb->refs)); 76 list_del(&eb->leak_list); 77 kmem_cache_free(extent_buffer_cache, eb); 78 } 79 } 80 81 #define btrfs_debug_check_extent_io_range(inode, start, end) \ 82 __btrfs_debug_check_extent_io_range(__func__, (inode), (start), (end)) 83 static inline void __btrfs_debug_check_extent_io_range(const char *caller, 84 struct inode *inode, u64 start, u64 end) 85 { 86 u64 isize = i_size_read(inode); 87 88 if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { 89 printk_ratelimited(KERN_DEBUG 90 "btrfs: %s: ino %llu isize %llu odd range [%llu,%llu]\n", 91 caller, 92 (unsigned long long)btrfs_ino(inode), 93 (unsigned long long)isize, 94 (unsigned long long)start, 95 (unsigned long long)end); 96 } 97 } 98 #else 99 #define btrfs_leak_debug_add(new, head) do {} while (0) 100 #define btrfs_leak_debug_del(entry) do {} while (0) 101 #define btrfs_leak_debug_check() do {} while (0) 102 #define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) 103 #endif 104 105 #define BUFFER_LRU_MAX 64 106 107 struct tree_entry { 108 u64 start; 109 u64 end; 110 struct rb_node rb_node; 111 }; 112 113 struct extent_page_data { 114 struct bio *bio; 115 struct extent_io_tree *tree; 116 get_extent_t *get_extent; 117 unsigned long bio_flags; 118 119 /* tells writepage not to lock the state bits for this range 120 * it still does the unlocking 121 */ 122 unsigned int extent_locked:1; 123 124 /* tells the submit_bio code to use a 
WRITE_SYNC */ 125 unsigned int sync_io:1; 126 }; 127 128 static noinline void flush_write_bio(void *data); 129 static inline struct btrfs_fs_info * 130 tree_fs_info(struct extent_io_tree *tree) 131 { 132 return btrfs_sb(tree->mapping->host->i_sb); 133 } 134 135 int __init extent_io_init(void) 136 { 137 extent_state_cache = kmem_cache_create("btrfs_extent_state", 138 sizeof(struct extent_state), 0, 139 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 140 if (!extent_state_cache) 141 return -ENOMEM; 142 143 extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", 144 sizeof(struct extent_buffer), 0, 145 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 146 if (!extent_buffer_cache) 147 goto free_state_cache; 148 149 btrfs_bioset = bioset_create(BIO_POOL_SIZE, 150 offsetof(struct btrfs_io_bio, bio)); 151 if (!btrfs_bioset) 152 goto free_buffer_cache; 153 return 0; 154 155 free_buffer_cache: 156 kmem_cache_destroy(extent_buffer_cache); 157 extent_buffer_cache = NULL; 158 159 free_state_cache: 160 kmem_cache_destroy(extent_state_cache); 161 extent_state_cache = NULL; 162 return -ENOMEM; 163 } 164 165 void extent_io_exit(void) 166 { 167 btrfs_leak_debug_check(); 168 169 /* 170 * Make sure all delayed rcu free are flushed before we 171 * destroy caches. 172 */ 173 rcu_barrier(); 174 if (extent_state_cache) 175 kmem_cache_destroy(extent_state_cache); 176 if (extent_buffer_cache) 177 kmem_cache_destroy(extent_buffer_cache); 178 if (btrfs_bioset) 179 bioset_free(btrfs_bioset); 180 } 181 182 void extent_io_tree_init(struct extent_io_tree *tree, 183 struct address_space *mapping) 184 { 185 tree->state = RB_ROOT; 186 INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC); 187 tree->ops = NULL; 188 tree->dirty_bytes = 0; 189 spin_lock_init(&tree->lock); 190 spin_lock_init(&tree->buffer_lock); 191 tree->mapping = mapping; 192 } 193 194 static struct extent_state *alloc_extent_state(gfp_t mask) 195 { 196 struct extent_state *state; 197 198 state = kmem_cache_alloc(extent_state_cache, mask); 199 if (!state) 200 return state; 201 state->state = 0; 202 state->private = 0; 203 state->tree = NULL; 204 btrfs_leak_debug_add(&state->leak_list, &states); 205 atomic_set(&state->refs, 1); 206 init_waitqueue_head(&state->wq); 207 trace_alloc_extent_state(state, mask, _RET_IP_); 208 return state; 209 } 210 211 void free_extent_state(struct extent_state *state) 212 { 213 if (!state) 214 return; 215 if (atomic_dec_and_test(&state->refs)) { 216 WARN_ON(state->tree); 217 btrfs_leak_debug_del(&state->leak_list); 218 trace_free_extent_state(state, _RET_IP_); 219 kmem_cache_free(extent_state_cache, state); 220 } 221 } 222 223 static struct rb_node *tree_insert(struct rb_root *root, u64 offset, 224 struct rb_node *node) 225 { 226 struct rb_node **p = &root->rb_node; 227 struct rb_node *parent = NULL; 228 struct tree_entry *entry; 229 230 while (*p) { 231 parent = *p; 232 entry = rb_entry(parent, struct tree_entry, rb_node); 233 234 if (offset < entry->start) 235 p = &(*p)->rb_left; 236 else if (offset > entry->end) 237 p = &(*p)->rb_right; 238 else 239 return parent; 240 } 241 242 rb_link_node(node, parent, p); 243 rb_insert_color(node, root); 244 return NULL; 245 } 246 247 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, 248 struct rb_node **prev_ret, 249 struct rb_node **next_ret) 250 { 251 struct rb_root *root = &tree->state; 252 struct rb_node *n = root->rb_node; 253 struct rb_node *prev = NULL; 254 struct rb_node *orig_prev = NULL; 255 struct tree_entry *entry; 256 struct tree_entry *prev_entry = NULL; 
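	/*
	 * Walk down the tree looking for a state that contains 'offset',
	 * remembering the last node visited so that, on a miss, the
	 * nearest neighbours can be handed back below.
	 */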
257 258 while (n) { 259 entry = rb_entry(n, struct tree_entry, rb_node); 260 prev = n; 261 prev_entry = entry; 262 263 if (offset < entry->start) 264 n = n->rb_left; 265 else if (offset > entry->end) 266 n = n->rb_right; 267 else 268 return n; 269 } 270 271 if (prev_ret) { 272 orig_prev = prev; 273 while (prev && offset > prev_entry->end) { 274 prev = rb_next(prev); 275 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 276 } 277 *prev_ret = prev; 278 prev = orig_prev; 279 } 280 281 if (next_ret) { 282 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 283 while (prev && offset < prev_entry->start) { 284 prev = rb_prev(prev); 285 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 286 } 287 *next_ret = prev; 288 } 289 return NULL; 290 } 291 292 static inline struct rb_node *tree_search(struct extent_io_tree *tree, 293 u64 offset) 294 { 295 struct rb_node *prev = NULL; 296 struct rb_node *ret; 297 298 ret = __etree_search(tree, offset, &prev, NULL); 299 if (!ret) 300 return prev; 301 return ret; 302 } 303 304 static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, 305 struct extent_state *other) 306 { 307 if (tree->ops && tree->ops->merge_extent_hook) 308 tree->ops->merge_extent_hook(tree->mapping->host, new, 309 other); 310 } 311 312 /* 313 * utility function to look for merge candidates inside a given range. 314 * Any extents with matching state are merged together into a single 315 * extent in the tree. Extents with EXTENT_IO in their state field 316 * are not merged because the end_io handlers need to be able to do 317 * operations on them without sleeping (or doing allocations/splits). 318 * 319 * This should be called with the tree lock held. 320 */ 321 static void merge_state(struct extent_io_tree *tree, 322 struct extent_state *state) 323 { 324 struct extent_state *other; 325 struct rb_node *other_node; 326 327 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 328 return; 329 330 other_node = rb_prev(&state->rb_node); 331 if (other_node) { 332 other = rb_entry(other_node, struct extent_state, rb_node); 333 if (other->end == state->start - 1 && 334 other->state == state->state) { 335 merge_cb(tree, state, other); 336 state->start = other->start; 337 other->tree = NULL; 338 rb_erase(&other->rb_node, &tree->state); 339 free_extent_state(other); 340 } 341 } 342 other_node = rb_next(&state->rb_node); 343 if (other_node) { 344 other = rb_entry(other_node, struct extent_state, rb_node); 345 if (other->start == state->end + 1 && 346 other->state == state->state) { 347 merge_cb(tree, state, other); 348 state->end = other->end; 349 other->tree = NULL; 350 rb_erase(&other->rb_node, &tree->state); 351 free_extent_state(other); 352 } 353 } 354 } 355 356 static void set_state_cb(struct extent_io_tree *tree, 357 struct extent_state *state, unsigned long *bits) 358 { 359 if (tree->ops && tree->ops->set_bit_hook) 360 tree->ops->set_bit_hook(tree->mapping->host, state, bits); 361 } 362 363 static void clear_state_cb(struct extent_io_tree *tree, 364 struct extent_state *state, unsigned long *bits) 365 { 366 if (tree->ops && tree->ops->clear_bit_hook) 367 tree->ops->clear_bit_hook(tree->mapping->host, state, bits); 368 } 369 370 static void set_state_bits(struct extent_io_tree *tree, 371 struct extent_state *state, unsigned long *bits); 372 373 /* 374 * insert an extent_state struct into the tree. 'bits' are set on the 375 * struct before it is inserted. 
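 * The tree holds non-overlapping [start, end] byte ranges; tree_insert()
 * above is keyed on the end offset.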
376 * 377 * This may return -EEXIST if the extent is already there, in which case the 378 * state struct is freed. 379 * 380 * The tree lock is not taken internally. This is a utility function and 381 * probably isn't what you want to call (see set/clear_extent_bit). 382 */ 383 static int insert_state(struct extent_io_tree *tree, 384 struct extent_state *state, u64 start, u64 end, 385 unsigned long *bits) 386 { 387 struct rb_node *node; 388 389 if (end < start) 390 WARN(1, KERN_ERR "btrfs end < start %llu %llu\n", 391 (unsigned long long)end, 392 (unsigned long long)start); 393 state->start = start; 394 state->end = end; 395 396 set_state_bits(tree, state, bits); 397 398 node = tree_insert(&tree->state, end, &state->rb_node); 399 if (node) { 400 struct extent_state *found; 401 found = rb_entry(node, struct extent_state, rb_node); 402 printk(KERN_ERR "btrfs found node %llu %llu on insert of " 403 "%llu %llu\n", (unsigned long long)found->start, 404 (unsigned long long)found->end, 405 (unsigned long long)start, (unsigned long long)end); 406 return -EEXIST; 407 } 408 state->tree = tree; 409 merge_state(tree, state); 410 return 0; 411 } 412 413 static void split_cb(struct extent_io_tree *tree, struct extent_state *orig, 414 u64 split) 415 { 416 if (tree->ops && tree->ops->split_extent_hook) 417 tree->ops->split_extent_hook(tree->mapping->host, orig, split); 418 } 419 420 /* 421 * split a given extent state struct in two, inserting the preallocated 422 * struct 'prealloc' as the newly created second half. 'split' indicates an 423 * offset inside 'orig' where it should be split. 424 * 425 * Before calling, 426 * the tree has 'orig' at [orig->start, orig->end]. After calling, there 427 * are two extent state structs in the tree: 428 * prealloc: [orig->start, split - 1] 429 * orig: [ split, orig->end ] 430 * 431 * The tree locks are not taken by this function. They need to be held 432 * by the caller. 433 */ 434 static int split_state(struct extent_io_tree *tree, struct extent_state *orig, 435 struct extent_state *prealloc, u64 split) 436 { 437 struct rb_node *node; 438 439 split_cb(tree, orig, split); 440 441 prealloc->start = orig->start; 442 prealloc->end = split - 1; 443 prealloc->state = orig->state; 444 orig->start = split; 445 446 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); 447 if (node) { 448 free_extent_state(prealloc); 449 return -EEXIST; 450 } 451 prealloc->tree = tree; 452 return 0; 453 } 454 455 static struct extent_state *next_state(struct extent_state *state) 456 { 457 struct rb_node *next = rb_next(&state->rb_node); 458 if (next) 459 return rb_entry(next, struct extent_state, rb_node); 460 else 461 return NULL; 462 } 463 464 /* 465 * utility function to clear some bits in an extent state struct. 466 * it will optionally wake up any one waiting on this state (wake == 1). 
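 * The state following this one in the tree is returned so callers can
 * keep iterating without another search.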
467 * 468 * If no bits are set on the state struct after clearing things, the 469 * struct is freed and removed from the tree 470 */ 471 static struct extent_state *clear_state_bit(struct extent_io_tree *tree, 472 struct extent_state *state, 473 unsigned long *bits, int wake) 474 { 475 struct extent_state *next; 476 unsigned long bits_to_clear = *bits & ~EXTENT_CTLBITS; 477 478 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 479 u64 range = state->end - state->start + 1; 480 WARN_ON(range > tree->dirty_bytes); 481 tree->dirty_bytes -= range; 482 } 483 clear_state_cb(tree, state, bits); 484 state->state &= ~bits_to_clear; 485 if (wake) 486 wake_up(&state->wq); 487 if (state->state == 0) { 488 next = next_state(state); 489 if (state->tree) { 490 rb_erase(&state->rb_node, &tree->state); 491 state->tree = NULL; 492 free_extent_state(state); 493 } else { 494 WARN_ON(1); 495 } 496 } else { 497 merge_state(tree, state); 498 next = next_state(state); 499 } 500 return next; 501 } 502 503 static struct extent_state * 504 alloc_extent_state_atomic(struct extent_state *prealloc) 505 { 506 if (!prealloc) 507 prealloc = alloc_extent_state(GFP_ATOMIC); 508 509 return prealloc; 510 } 511 512 static void extent_io_tree_panic(struct extent_io_tree *tree, int err) 513 { 514 btrfs_panic(tree_fs_info(tree), err, "Locking error: " 515 "Extent tree was modified by another " 516 "thread while locked."); 517 } 518 519 /* 520 * clear some bits on a range in the tree. This may require splitting 521 * or inserting elements in the tree, so the gfp mask is used to 522 * indicate which allocations or sleeping are allowed. 523 * 524 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove 525 * the given range from the tree regardless of state (ie for truncate). 526 * 527 * the range [start, end] is inclusive. 528 * 529 * This takes the tree lock, and returns 0 on success and < 0 on error. 
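 *
 * A minimal usage sketch (illustrative only; 'tree', 'start', 'end' and
 * 'cached' are hypothetical locals, not defined here): unlocking a
 * previously locked range looks roughly like
 *
 *	struct extent_state *cached = NULL;
 *	lock_extent_bits(tree, start, end, 0, &cached);
 *	...
 *	clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0,
 *			 &cached, GFP_NOFS);
 *
 * which mirrors what lock_extent() and unlock_extent_cached() below do.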
530 */ 531 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 532 unsigned long bits, int wake, int delete, 533 struct extent_state **cached_state, 534 gfp_t mask) 535 { 536 struct extent_state *state; 537 struct extent_state *cached; 538 struct extent_state *prealloc = NULL; 539 struct rb_node *node; 540 u64 last_end; 541 int err; 542 int clear = 0; 543 544 btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); 545 546 if (bits & EXTENT_DELALLOC) 547 bits |= EXTENT_NORESERVE; 548 549 if (delete) 550 bits |= ~EXTENT_CTLBITS; 551 bits |= EXTENT_FIRST_DELALLOC; 552 553 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 554 clear = 1; 555 again: 556 if (!prealloc && (mask & __GFP_WAIT)) { 557 prealloc = alloc_extent_state(mask); 558 if (!prealloc) 559 return -ENOMEM; 560 } 561 562 spin_lock(&tree->lock); 563 if (cached_state) { 564 cached = *cached_state; 565 566 if (clear) { 567 *cached_state = NULL; 568 cached_state = NULL; 569 } 570 571 if (cached && cached->tree && cached->start <= start && 572 cached->end > start) { 573 if (clear) 574 atomic_dec(&cached->refs); 575 state = cached; 576 goto hit_next; 577 } 578 if (clear) 579 free_extent_state(cached); 580 } 581 /* 582 * this search will find the extents that end after 583 * our range starts 584 */ 585 node = tree_search(tree, start); 586 if (!node) 587 goto out; 588 state = rb_entry(node, struct extent_state, rb_node); 589 hit_next: 590 if (state->start > end) 591 goto out; 592 WARN_ON(state->end < start); 593 last_end = state->end; 594 595 /* the state doesn't have the wanted bits, go ahead */ 596 if (!(state->state & bits)) { 597 state = next_state(state); 598 goto next; 599 } 600 601 /* 602 * | ---- desired range ---- | 603 * | state | or 604 * | ------------- state -------------- | 605 * 606 * We need to split the extent we found, and may flip 607 * bits on second half. 608 * 609 * If the extent we found extends past our range, we 610 * just split and search again. It'll get split again 611 * the next time though. 612 * 613 * If the extent we found is inside our range, we clear 614 * the desired bit on it. 
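 *
 * Splits use the preallocated state; if the tree turns out to have been
 * modified underneath us the split fails and we panic via
 * extent_io_tree_panic(), since the locking rules make that a bug.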
615 */ 616 617 if (state->start < start) { 618 prealloc = alloc_extent_state_atomic(prealloc); 619 BUG_ON(!prealloc); 620 err = split_state(tree, state, prealloc, start); 621 if (err) 622 extent_io_tree_panic(tree, err); 623 624 prealloc = NULL; 625 if (err) 626 goto out; 627 if (state->end <= end) { 628 state = clear_state_bit(tree, state, &bits, wake); 629 goto next; 630 } 631 goto search_again; 632 } 633 /* 634 * | ---- desired range ---- | 635 * | state | 636 * We need to split the extent, and clear the bit 637 * on the first half 638 */ 639 if (state->start <= end && state->end > end) { 640 prealloc = alloc_extent_state_atomic(prealloc); 641 BUG_ON(!prealloc); 642 err = split_state(tree, state, prealloc, end + 1); 643 if (err) 644 extent_io_tree_panic(tree, err); 645 646 if (wake) 647 wake_up(&state->wq); 648 649 clear_state_bit(tree, prealloc, &bits, wake); 650 651 prealloc = NULL; 652 goto out; 653 } 654 655 state = clear_state_bit(tree, state, &bits, wake); 656 next: 657 if (last_end == (u64)-1) 658 goto out; 659 start = last_end + 1; 660 if (start <= end && state && !need_resched()) 661 goto hit_next; 662 goto search_again; 663 664 out: 665 spin_unlock(&tree->lock); 666 if (prealloc) 667 free_extent_state(prealloc); 668 669 return 0; 670 671 search_again: 672 if (start > end) 673 goto out; 674 spin_unlock(&tree->lock); 675 if (mask & __GFP_WAIT) 676 cond_resched(); 677 goto again; 678 } 679 680 static void wait_on_state(struct extent_io_tree *tree, 681 struct extent_state *state) 682 __releases(tree->lock) 683 __acquires(tree->lock) 684 { 685 DEFINE_WAIT(wait); 686 prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); 687 spin_unlock(&tree->lock); 688 schedule(); 689 spin_lock(&tree->lock); 690 finish_wait(&state->wq, &wait); 691 } 692 693 /* 694 * waits for one or more bits to clear on a range in the state tree. 695 * The range [start, end] is inclusive. 
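 * Waiting drops the tree lock (see wait_on_state()) and the search is
 * restarted afterwards, so a state pointer is never reused across a sleep.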
696 * The tree lock is taken by this function 697 */ 698 static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 699 unsigned long bits) 700 { 701 struct extent_state *state; 702 struct rb_node *node; 703 704 btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); 705 706 spin_lock(&tree->lock); 707 again: 708 while (1) { 709 /* 710 * this search will find all the extents that end after 711 * our range starts 712 */ 713 node = tree_search(tree, start); 714 if (!node) 715 break; 716 717 state = rb_entry(node, struct extent_state, rb_node); 718 719 if (state->start > end) 720 goto out; 721 722 if (state->state & bits) { 723 start = state->start; 724 atomic_inc(&state->refs); 725 wait_on_state(tree, state); 726 free_extent_state(state); 727 goto again; 728 } 729 start = state->end + 1; 730 731 if (start > end) 732 break; 733 734 cond_resched_lock(&tree->lock); 735 } 736 out: 737 spin_unlock(&tree->lock); 738 } 739 740 static void set_state_bits(struct extent_io_tree *tree, 741 struct extent_state *state, 742 unsigned long *bits) 743 { 744 unsigned long bits_to_set = *bits & ~EXTENT_CTLBITS; 745 746 set_state_cb(tree, state, bits); 747 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { 748 u64 range = state->end - state->start + 1; 749 tree->dirty_bytes += range; 750 } 751 state->state |= bits_to_set; 752 } 753 754 static void cache_state(struct extent_state *state, 755 struct extent_state **cached_ptr) 756 { 757 if (cached_ptr && !(*cached_ptr)) { 758 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) { 759 *cached_ptr = state; 760 atomic_inc(&state->refs); 761 } 762 } 763 } 764 765 /* 766 * set some bits on a range in the tree. This may require allocations or 767 * sleeping, so the gfp mask is used to indicate what is allowed. 768 * 769 * If any of the exclusive bits are set, this will fail with -EEXIST if some 770 * part of the range already has the desired bits set. The start of the 771 * existing range is returned in failed_start in this case. 772 * 773 * [start, end] is inclusive This takes the tree lock. 774 */ 775 776 static int __must_check 777 __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 778 unsigned long bits, unsigned long exclusive_bits, 779 u64 *failed_start, struct extent_state **cached_state, 780 gfp_t mask) 781 { 782 struct extent_state *state; 783 struct extent_state *prealloc = NULL; 784 struct rb_node *node; 785 int err = 0; 786 u64 last_start; 787 u64 last_end; 788 789 btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); 790 791 bits |= EXTENT_FIRST_DELALLOC; 792 again: 793 if (!prealloc && (mask & __GFP_WAIT)) { 794 prealloc = alloc_extent_state(mask); 795 BUG_ON(!prealloc); 796 } 797 798 spin_lock(&tree->lock); 799 if (cached_state && *cached_state) { 800 state = *cached_state; 801 if (state->start <= start && state->end > start && 802 state->tree) { 803 node = &state->rb_node; 804 goto hit_next; 805 } 806 } 807 /* 808 * this search will find all the extents that end after 809 * our range starts. 
810 */ 811 node = tree_search(tree, start); 812 if (!node) { 813 prealloc = alloc_extent_state_atomic(prealloc); 814 BUG_ON(!prealloc); 815 err = insert_state(tree, prealloc, start, end, &bits); 816 if (err) 817 extent_io_tree_panic(tree, err); 818 819 prealloc = NULL; 820 goto out; 821 } 822 state = rb_entry(node, struct extent_state, rb_node); 823 hit_next: 824 last_start = state->start; 825 last_end = state->end; 826 827 /* 828 * | ---- desired range ---- | 829 * | state | 830 * 831 * Just lock what we found and keep going 832 */ 833 if (state->start == start && state->end <= end) { 834 if (state->state & exclusive_bits) { 835 *failed_start = state->start; 836 err = -EEXIST; 837 goto out; 838 } 839 840 set_state_bits(tree, state, &bits); 841 cache_state(state, cached_state); 842 merge_state(tree, state); 843 if (last_end == (u64)-1) 844 goto out; 845 start = last_end + 1; 846 state = next_state(state); 847 if (start < end && state && state->start == start && 848 !need_resched()) 849 goto hit_next; 850 goto search_again; 851 } 852 853 /* 854 * | ---- desired range ---- | 855 * | state | 856 * or 857 * | ------------- state -------------- | 858 * 859 * We need to split the extent we found, and may flip bits on 860 * second half. 861 * 862 * If the extent we found extends past our 863 * range, we just split and search again. It'll get split 864 * again the next time though. 865 * 866 * If the extent we found is inside our range, we set the 867 * desired bit on it. 868 */ 869 if (state->start < start) { 870 if (state->state & exclusive_bits) { 871 *failed_start = start; 872 err = -EEXIST; 873 goto out; 874 } 875 876 prealloc = alloc_extent_state_atomic(prealloc); 877 BUG_ON(!prealloc); 878 err = split_state(tree, state, prealloc, start); 879 if (err) 880 extent_io_tree_panic(tree, err); 881 882 prealloc = NULL; 883 if (err) 884 goto out; 885 if (state->end <= end) { 886 set_state_bits(tree, state, &bits); 887 cache_state(state, cached_state); 888 merge_state(tree, state); 889 if (last_end == (u64)-1) 890 goto out; 891 start = last_end + 1; 892 state = next_state(state); 893 if (start < end && state && state->start == start && 894 !need_resched()) 895 goto hit_next; 896 } 897 goto search_again; 898 } 899 /* 900 * | ---- desired range ---- | 901 * | state | or | state | 902 * 903 * There's a hole, we need to insert something in it and 904 * ignore the extent we found. 905 */ 906 if (state->start > start) { 907 u64 this_end; 908 if (end < last_start) 909 this_end = end; 910 else 911 this_end = last_start - 1; 912 913 prealloc = alloc_extent_state_atomic(prealloc); 914 BUG_ON(!prealloc); 915 916 /* 917 * Avoid to free 'prealloc' if it can be merged with 918 * the later extent. 
919 */ 920 err = insert_state(tree, prealloc, start, this_end, 921 &bits); 922 if (err) 923 extent_io_tree_panic(tree, err); 924 925 cache_state(prealloc, cached_state); 926 prealloc = NULL; 927 start = this_end + 1; 928 goto search_again; 929 } 930 /* 931 * | ---- desired range ---- | 932 * | state | 933 * We need to split the extent, and set the bit 934 * on the first half 935 */ 936 if (state->start <= end && state->end > end) { 937 if (state->state & exclusive_bits) { 938 *failed_start = start; 939 err = -EEXIST; 940 goto out; 941 } 942 943 prealloc = alloc_extent_state_atomic(prealloc); 944 BUG_ON(!prealloc); 945 err = split_state(tree, state, prealloc, end + 1); 946 if (err) 947 extent_io_tree_panic(tree, err); 948 949 set_state_bits(tree, prealloc, &bits); 950 cache_state(prealloc, cached_state); 951 merge_state(tree, prealloc); 952 prealloc = NULL; 953 goto out; 954 } 955 956 goto search_again; 957 958 out: 959 spin_unlock(&tree->lock); 960 if (prealloc) 961 free_extent_state(prealloc); 962 963 return err; 964 965 search_again: 966 if (start > end) 967 goto out; 968 spin_unlock(&tree->lock); 969 if (mask & __GFP_WAIT) 970 cond_resched(); 971 goto again; 972 } 973 974 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 975 unsigned long bits, u64 * failed_start, 976 struct extent_state **cached_state, gfp_t mask) 977 { 978 return __set_extent_bit(tree, start, end, bits, 0, failed_start, 979 cached_state, mask); 980 } 981 982 983 /** 984 * convert_extent_bit - convert all bits in a given range from one bit to 985 * another 986 * @tree: the io tree to search 987 * @start: the start offset in bytes 988 * @end: the end offset in bytes (inclusive) 989 * @bits: the bits to set in this range 990 * @clear_bits: the bits to clear in this range 991 * @cached_state: state that we're going to cache 992 * @mask: the allocation mask 993 * 994 * This will go through and set bits for the given range. If any states exist 995 * already in this range they are set with the given bit and cleared of the 996 * clear_bits. This is only meant to be used by things that are mergeable, ie 997 * converting from say DELALLOC to DIRTY. This is not meant to be used with 998 * boundary bits like LOCK. 999 */ 1000 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 1001 unsigned long bits, unsigned long clear_bits, 1002 struct extent_state **cached_state, gfp_t mask) 1003 { 1004 struct extent_state *state; 1005 struct extent_state *prealloc = NULL; 1006 struct rb_node *node; 1007 int err = 0; 1008 u64 last_start; 1009 u64 last_end; 1010 1011 btrfs_debug_check_extent_io_range(tree->mapping->host, start, end); 1012 1013 again: 1014 if (!prealloc && (mask & __GFP_WAIT)) { 1015 prealloc = alloc_extent_state(mask); 1016 if (!prealloc) 1017 return -ENOMEM; 1018 } 1019 1020 spin_lock(&tree->lock); 1021 if (cached_state && *cached_state) { 1022 state = *cached_state; 1023 if (state->start <= start && state->end > start && 1024 state->tree) { 1025 node = &state->rb_node; 1026 goto hit_next; 1027 } 1028 } 1029 1030 /* 1031 * this search will find all the extents that end after 1032 * our range starts. 
1033 */ 1034 node = tree_search(tree, start); 1035 if (!node) { 1036 prealloc = alloc_extent_state_atomic(prealloc); 1037 if (!prealloc) { 1038 err = -ENOMEM; 1039 goto out; 1040 } 1041 err = insert_state(tree, prealloc, start, end, &bits); 1042 prealloc = NULL; 1043 if (err) 1044 extent_io_tree_panic(tree, err); 1045 goto out; 1046 } 1047 state = rb_entry(node, struct extent_state, rb_node); 1048 hit_next: 1049 last_start = state->start; 1050 last_end = state->end; 1051 1052 /* 1053 * | ---- desired range ---- | 1054 * | state | 1055 * 1056 * Just lock what we found and keep going 1057 */ 1058 if (state->start == start && state->end <= end) { 1059 set_state_bits(tree, state, &bits); 1060 cache_state(state, cached_state); 1061 state = clear_state_bit(tree, state, &clear_bits, 0); 1062 if (last_end == (u64)-1) 1063 goto out; 1064 start = last_end + 1; 1065 if (start < end && state && state->start == start && 1066 !need_resched()) 1067 goto hit_next; 1068 goto search_again; 1069 } 1070 1071 /* 1072 * | ---- desired range ---- | 1073 * | state | 1074 * or 1075 * | ------------- state -------------- | 1076 * 1077 * We need to split the extent we found, and may flip bits on 1078 * second half. 1079 * 1080 * If the extent we found extends past our 1081 * range, we just split and search again. It'll get split 1082 * again the next time though. 1083 * 1084 * If the extent we found is inside our range, we set the 1085 * desired bit on it. 1086 */ 1087 if (state->start < start) { 1088 prealloc = alloc_extent_state_atomic(prealloc); 1089 if (!prealloc) { 1090 err = -ENOMEM; 1091 goto out; 1092 } 1093 err = split_state(tree, state, prealloc, start); 1094 if (err) 1095 extent_io_tree_panic(tree, err); 1096 prealloc = NULL; 1097 if (err) 1098 goto out; 1099 if (state->end <= end) { 1100 set_state_bits(tree, state, &bits); 1101 cache_state(state, cached_state); 1102 state = clear_state_bit(tree, state, &clear_bits, 0); 1103 if (last_end == (u64)-1) 1104 goto out; 1105 start = last_end + 1; 1106 if (start < end && state && state->start == start && 1107 !need_resched()) 1108 goto hit_next; 1109 } 1110 goto search_again; 1111 } 1112 /* 1113 * | ---- desired range ---- | 1114 * | state | or | state | 1115 * 1116 * There's a hole, we need to insert something in it and 1117 * ignore the extent we found. 1118 */ 1119 if (state->start > start) { 1120 u64 this_end; 1121 if (end < last_start) 1122 this_end = end; 1123 else 1124 this_end = last_start - 1; 1125 1126 prealloc = alloc_extent_state_atomic(prealloc); 1127 if (!prealloc) { 1128 err = -ENOMEM; 1129 goto out; 1130 } 1131 1132 /* 1133 * Avoid to free 'prealloc' if it can be merged with 1134 * the later extent. 
1135 */ 1136 err = insert_state(tree, prealloc, start, this_end, 1137 &bits); 1138 if (err) 1139 extent_io_tree_panic(tree, err); 1140 cache_state(prealloc, cached_state); 1141 prealloc = NULL; 1142 start = this_end + 1; 1143 goto search_again; 1144 } 1145 /* 1146 * | ---- desired range ---- | 1147 * | state | 1148 * We need to split the extent, and set the bit 1149 * on the first half 1150 */ 1151 if (state->start <= end && state->end > end) { 1152 prealloc = alloc_extent_state_atomic(prealloc); 1153 if (!prealloc) { 1154 err = -ENOMEM; 1155 goto out; 1156 } 1157 1158 err = split_state(tree, state, prealloc, end + 1); 1159 if (err) 1160 extent_io_tree_panic(tree, err); 1161 1162 set_state_bits(tree, prealloc, &bits); 1163 cache_state(prealloc, cached_state); 1164 clear_state_bit(tree, prealloc, &clear_bits, 0); 1165 prealloc = NULL; 1166 goto out; 1167 } 1168 1169 goto search_again; 1170 1171 out: 1172 spin_unlock(&tree->lock); 1173 if (prealloc) 1174 free_extent_state(prealloc); 1175 1176 return err; 1177 1178 search_again: 1179 if (start > end) 1180 goto out; 1181 spin_unlock(&tree->lock); 1182 if (mask & __GFP_WAIT) 1183 cond_resched(); 1184 goto again; 1185 } 1186 1187 /* wrappers around set/clear extent bit */ 1188 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 1189 gfp_t mask) 1190 { 1191 return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, 1192 NULL, mask); 1193 } 1194 1195 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1196 unsigned long bits, gfp_t mask) 1197 { 1198 return set_extent_bit(tree, start, end, bits, NULL, 1199 NULL, mask); 1200 } 1201 1202 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1203 unsigned long bits, gfp_t mask) 1204 { 1205 return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); 1206 } 1207 1208 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 1209 struct extent_state **cached_state, gfp_t mask) 1210 { 1211 return set_extent_bit(tree, start, end, 1212 EXTENT_DELALLOC | EXTENT_UPTODATE, 1213 NULL, cached_state, mask); 1214 } 1215 1216 int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, 1217 struct extent_state **cached_state, gfp_t mask) 1218 { 1219 return set_extent_bit(tree, start, end, 1220 EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG, 1221 NULL, cached_state, mask); 1222 } 1223 1224 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 1225 gfp_t mask) 1226 { 1227 return clear_extent_bit(tree, start, end, 1228 EXTENT_DIRTY | EXTENT_DELALLOC | 1229 EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); 1230 } 1231 1232 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 1233 gfp_t mask) 1234 { 1235 return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, 1236 NULL, mask); 1237 } 1238 1239 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 1240 struct extent_state **cached_state, gfp_t mask) 1241 { 1242 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL, 1243 cached_state, mask); 1244 } 1245 1246 int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 1247 struct extent_state **cached_state, gfp_t mask) 1248 { 1249 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, 1250 cached_state, mask); 1251 } 1252 1253 /* 1254 * either insert or lock state struct between start and end use mask to tell 1255 * us if waiting is desired. 
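 * (No gfp mask is actually passed in here; GFP_NOFS is used internally.
 * If part of the range is already locked, the -EEXIST result makes us
 * wait for EXTENT_LOCKED to clear and retry from the offset that failed.)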
1256 */ 1257 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1258 unsigned long bits, struct extent_state **cached_state) 1259 { 1260 int err; 1261 u64 failed_start; 1262 while (1) { 1263 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, 1264 EXTENT_LOCKED, &failed_start, 1265 cached_state, GFP_NOFS); 1266 if (err == -EEXIST) { 1267 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); 1268 start = failed_start; 1269 } else 1270 break; 1271 WARN_ON(start > end); 1272 } 1273 return err; 1274 } 1275 1276 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end) 1277 { 1278 return lock_extent_bits(tree, start, end, 0, NULL); 1279 } 1280 1281 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) 1282 { 1283 int err; 1284 u64 failed_start; 1285 1286 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, 1287 &failed_start, NULL, GFP_NOFS); 1288 if (err == -EEXIST) { 1289 if (failed_start > start) 1290 clear_extent_bit(tree, start, failed_start - 1, 1291 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); 1292 return 0; 1293 } 1294 return 1; 1295 } 1296 1297 int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, 1298 struct extent_state **cached, gfp_t mask) 1299 { 1300 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, 1301 mask); 1302 } 1303 1304 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) 1305 { 1306 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, 1307 GFP_NOFS); 1308 } 1309 1310 int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) 1311 { 1312 unsigned long index = start >> PAGE_CACHE_SHIFT; 1313 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1314 struct page *page; 1315 1316 while (index <= end_index) { 1317 page = find_get_page(inode->i_mapping, index); 1318 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1319 clear_page_dirty_for_io(page); 1320 page_cache_release(page); 1321 index++; 1322 } 1323 return 0; 1324 } 1325 1326 int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) 1327 { 1328 unsigned long index = start >> PAGE_CACHE_SHIFT; 1329 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1330 struct page *page; 1331 1332 while (index <= end_index) { 1333 page = find_get_page(inode->i_mapping, index); 1334 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1335 account_page_redirty(page); 1336 __set_page_dirty_nobuffers(page); 1337 page_cache_release(page); 1338 index++; 1339 } 1340 return 0; 1341 } 1342 1343 /* 1344 * helper function to set both pages and extents in the tree writeback 1345 */ 1346 static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) 1347 { 1348 unsigned long index = start >> PAGE_CACHE_SHIFT; 1349 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1350 struct page *page; 1351 1352 while (index <= end_index) { 1353 page = find_get_page(tree->mapping, index); 1354 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1355 set_page_writeback(page); 1356 page_cache_release(page); 1357 index++; 1358 } 1359 return 0; 1360 } 1361 1362 /* find the first state struct with 'bits' set after 'start', and 1363 * return it. tree->lock must be held. 
NULL will returned if 1364 * nothing was found after 'start' 1365 */ 1366 static struct extent_state * 1367 find_first_extent_bit_state(struct extent_io_tree *tree, 1368 u64 start, unsigned long bits) 1369 { 1370 struct rb_node *node; 1371 struct extent_state *state; 1372 1373 /* 1374 * this search will find all the extents that end after 1375 * our range starts. 1376 */ 1377 node = tree_search(tree, start); 1378 if (!node) 1379 goto out; 1380 1381 while (1) { 1382 state = rb_entry(node, struct extent_state, rb_node); 1383 if (state->end >= start && (state->state & bits)) 1384 return state; 1385 1386 node = rb_next(node); 1387 if (!node) 1388 break; 1389 } 1390 out: 1391 return NULL; 1392 } 1393 1394 /* 1395 * find the first offset in the io tree with 'bits' set. zero is 1396 * returned if we find something, and *start_ret and *end_ret are 1397 * set to reflect the state struct that was found. 1398 * 1399 * If nothing was found, 1 is returned. If found something, return 0. 1400 */ 1401 int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 1402 u64 *start_ret, u64 *end_ret, unsigned long bits, 1403 struct extent_state **cached_state) 1404 { 1405 struct extent_state *state; 1406 struct rb_node *n; 1407 int ret = 1; 1408 1409 spin_lock(&tree->lock); 1410 if (cached_state && *cached_state) { 1411 state = *cached_state; 1412 if (state->end == start - 1 && state->tree) { 1413 n = rb_next(&state->rb_node); 1414 while (n) { 1415 state = rb_entry(n, struct extent_state, 1416 rb_node); 1417 if (state->state & bits) 1418 goto got_it; 1419 n = rb_next(n); 1420 } 1421 free_extent_state(*cached_state); 1422 *cached_state = NULL; 1423 goto out; 1424 } 1425 free_extent_state(*cached_state); 1426 *cached_state = NULL; 1427 } 1428 1429 state = find_first_extent_bit_state(tree, start, bits); 1430 got_it: 1431 if (state) { 1432 cache_state(state, cached_state); 1433 *start_ret = state->start; 1434 *end_ret = state->end; 1435 ret = 0; 1436 } 1437 out: 1438 spin_unlock(&tree->lock); 1439 return ret; 1440 } 1441 1442 /* 1443 * find a contiguous range of bytes in the file marked as delalloc, not 1444 * more than 'max_bytes'. start and end are used to return the range, 1445 * 1446 * 1 is returned if we find something, 0 if nothing was in the tree 1447 */ 1448 static noinline u64 find_delalloc_range(struct extent_io_tree *tree, 1449 u64 *start, u64 *end, u64 max_bytes, 1450 struct extent_state **cached_state) 1451 { 1452 struct rb_node *node; 1453 struct extent_state *state; 1454 u64 cur_start = *start; 1455 u64 found = 0; 1456 u64 total_bytes = 0; 1457 1458 spin_lock(&tree->lock); 1459 1460 /* 1461 * this search will find all the extents that end after 1462 * our range starts. 
1463 */ 1464 node = tree_search(tree, cur_start); 1465 if (!node) { 1466 if (!found) 1467 *end = (u64)-1; 1468 goto out; 1469 } 1470 1471 while (1) { 1472 state = rb_entry(node, struct extent_state, rb_node); 1473 if (found && (state->start != cur_start || 1474 (state->state & EXTENT_BOUNDARY))) { 1475 goto out; 1476 } 1477 if (!(state->state & EXTENT_DELALLOC)) { 1478 if (!found) 1479 *end = state->end; 1480 goto out; 1481 } 1482 if (!found) { 1483 *start = state->start; 1484 *cached_state = state; 1485 atomic_inc(&state->refs); 1486 } 1487 found++; 1488 *end = state->end; 1489 cur_start = state->end + 1; 1490 node = rb_next(node); 1491 if (!node) 1492 break; 1493 total_bytes += state->end - state->start + 1; 1494 if (total_bytes >= max_bytes) 1495 break; 1496 } 1497 out: 1498 spin_unlock(&tree->lock); 1499 return found; 1500 } 1501 1502 static noinline void __unlock_for_delalloc(struct inode *inode, 1503 struct page *locked_page, 1504 u64 start, u64 end) 1505 { 1506 int ret; 1507 struct page *pages[16]; 1508 unsigned long index = start >> PAGE_CACHE_SHIFT; 1509 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1510 unsigned long nr_pages = end_index - index + 1; 1511 int i; 1512 1513 if (index == locked_page->index && end_index == index) 1514 return; 1515 1516 while (nr_pages > 0) { 1517 ret = find_get_pages_contig(inode->i_mapping, index, 1518 min_t(unsigned long, nr_pages, 1519 ARRAY_SIZE(pages)), pages); 1520 for (i = 0; i < ret; i++) { 1521 if (pages[i] != locked_page) 1522 unlock_page(pages[i]); 1523 page_cache_release(pages[i]); 1524 } 1525 nr_pages -= ret; 1526 index += ret; 1527 cond_resched(); 1528 } 1529 } 1530 1531 static noinline int lock_delalloc_pages(struct inode *inode, 1532 struct page *locked_page, 1533 u64 delalloc_start, 1534 u64 delalloc_end) 1535 { 1536 unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT; 1537 unsigned long start_index = index; 1538 unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT; 1539 unsigned long pages_locked = 0; 1540 struct page *pages[16]; 1541 unsigned long nrpages; 1542 int ret; 1543 int i; 1544 1545 /* the caller is responsible for locking the start index */ 1546 if (index == locked_page->index && index == end_index) 1547 return 0; 1548 1549 /* skip the page at the start index */ 1550 nrpages = end_index - index + 1; 1551 while (nrpages > 0) { 1552 ret = find_get_pages_contig(inode->i_mapping, index, 1553 min_t(unsigned long, 1554 nrpages, ARRAY_SIZE(pages)), pages); 1555 if (ret == 0) { 1556 ret = -EAGAIN; 1557 goto done; 1558 } 1559 /* now we have an array of pages, lock them all */ 1560 for (i = 0; i < ret; i++) { 1561 /* 1562 * the caller is taking responsibility for 1563 * locked_page 1564 */ 1565 if (pages[i] != locked_page) { 1566 lock_page(pages[i]); 1567 if (!PageDirty(pages[i]) || 1568 pages[i]->mapping != inode->i_mapping) { 1569 ret = -EAGAIN; 1570 unlock_page(pages[i]); 1571 page_cache_release(pages[i]); 1572 goto done; 1573 } 1574 } 1575 page_cache_release(pages[i]); 1576 pages_locked++; 1577 } 1578 nrpages -= ret; 1579 index += ret; 1580 cond_resched(); 1581 } 1582 ret = 0; 1583 done: 1584 if (ret && pages_locked) { 1585 __unlock_for_delalloc(inode, locked_page, 1586 delalloc_start, 1587 ((u64)(start_index + pages_locked - 1)) << 1588 PAGE_CACHE_SHIFT); 1589 } 1590 return ret; 1591 } 1592 1593 /* 1594 * find a contiguous range of bytes in the file marked as delalloc, not 1595 * more than 'max_bytes'. 
start and end are used to return the range, 1596 * 1597 * 1 is returned if we find something, 0 if nothing was in the tree 1598 */ 1599 static noinline u64 find_lock_delalloc_range(struct inode *inode, 1600 struct extent_io_tree *tree, 1601 struct page *locked_page, 1602 u64 *start, u64 *end, 1603 u64 max_bytes) 1604 { 1605 u64 delalloc_start; 1606 u64 delalloc_end; 1607 u64 found; 1608 struct extent_state *cached_state = NULL; 1609 int ret; 1610 int loops = 0; 1611 1612 again: 1613 /* step one, find a bunch of delalloc bytes starting at start */ 1614 delalloc_start = *start; 1615 delalloc_end = 0; 1616 found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, 1617 max_bytes, &cached_state); 1618 if (!found || delalloc_end <= *start) { 1619 *start = delalloc_start; 1620 *end = delalloc_end; 1621 free_extent_state(cached_state); 1622 return found; 1623 } 1624 1625 /* 1626 * start comes from the offset of locked_page. We have to lock 1627 * pages in order, so we can't process delalloc bytes before 1628 * locked_page 1629 */ 1630 if (delalloc_start < *start) 1631 delalloc_start = *start; 1632 1633 /* 1634 * make sure to limit the number of pages we try to lock down 1635 * if we're looping. 1636 */ 1637 if (delalloc_end + 1 - delalloc_start > max_bytes && loops) 1638 delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1; 1639 1640 /* step two, lock all the pages after the page that has start */ 1641 ret = lock_delalloc_pages(inode, locked_page, 1642 delalloc_start, delalloc_end); 1643 if (ret == -EAGAIN) { 1644 /* some of the pages are gone, lets avoid looping by 1645 * shortening the size of the delalloc range we're searching 1646 */ 1647 free_extent_state(cached_state); 1648 if (!loops) { 1649 unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); 1650 max_bytes = PAGE_CACHE_SIZE - offset; 1651 loops = 1; 1652 goto again; 1653 } else { 1654 found = 0; 1655 goto out_failed; 1656 } 1657 } 1658 BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */ 1659 1660 /* step three, lock the state bits for the whole range */ 1661 lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state); 1662 1663 /* then test to make sure it is all still delalloc */ 1664 ret = test_range_bit(tree, delalloc_start, delalloc_end, 1665 EXTENT_DELALLOC, 1, cached_state); 1666 if (!ret) { 1667 unlock_extent_cached(tree, delalloc_start, delalloc_end, 1668 &cached_state, GFP_NOFS); 1669 __unlock_for_delalloc(inode, locked_page, 1670 delalloc_start, delalloc_end); 1671 cond_resched(); 1672 goto again; 1673 } 1674 free_extent_state(cached_state); 1675 *start = delalloc_start; 1676 *end = delalloc_end; 1677 out_failed: 1678 return found; 1679 } 1680 1681 int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, 1682 struct page *locked_page, 1683 unsigned long clear_bits, 1684 unsigned long page_ops) 1685 { 1686 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 1687 int ret; 1688 struct page *pages[16]; 1689 unsigned long index = start >> PAGE_CACHE_SHIFT; 1690 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1691 unsigned long nr_pages = end_index - index + 1; 1692 int i; 1693 1694 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); 1695 if (page_ops == 0) 1696 return 0; 1697 1698 while (nr_pages > 0) { 1699 ret = find_get_pages_contig(inode->i_mapping, index, 1700 min_t(unsigned long, 1701 nr_pages, ARRAY_SIZE(pages)), pages); 1702 for (i = 0; i < ret; i++) { 1703 1704 if (page_ops & PAGE_SET_PRIVATE2) 1705 SetPagePrivate2(pages[i]); 1706 1707 if (pages[i] == 
locked_page) { 1708 page_cache_release(pages[i]); 1709 continue; 1710 } 1711 if (page_ops & PAGE_CLEAR_DIRTY) 1712 clear_page_dirty_for_io(pages[i]); 1713 if (page_ops & PAGE_SET_WRITEBACK) 1714 set_page_writeback(pages[i]); 1715 if (page_ops & PAGE_END_WRITEBACK) 1716 end_page_writeback(pages[i]); 1717 if (page_ops & PAGE_UNLOCK) 1718 unlock_page(pages[i]); 1719 page_cache_release(pages[i]); 1720 } 1721 nr_pages -= ret; 1722 index += ret; 1723 cond_resched(); 1724 } 1725 return 0; 1726 } 1727 1728 /* 1729 * count the number of bytes in the tree that have a given bit(s) 1730 * set. This can be fairly slow, except for EXTENT_DIRTY which is 1731 * cached. The total number found is returned. 1732 */ 1733 u64 count_range_bits(struct extent_io_tree *tree, 1734 u64 *start, u64 search_end, u64 max_bytes, 1735 unsigned long bits, int contig) 1736 { 1737 struct rb_node *node; 1738 struct extent_state *state; 1739 u64 cur_start = *start; 1740 u64 total_bytes = 0; 1741 u64 last = 0; 1742 int found = 0; 1743 1744 if (search_end <= cur_start) { 1745 WARN_ON(1); 1746 return 0; 1747 } 1748 1749 spin_lock(&tree->lock); 1750 if (cur_start == 0 && bits == EXTENT_DIRTY) { 1751 total_bytes = tree->dirty_bytes; 1752 goto out; 1753 } 1754 /* 1755 * this search will find all the extents that end after 1756 * our range starts. 1757 */ 1758 node = tree_search(tree, cur_start); 1759 if (!node) 1760 goto out; 1761 1762 while (1) { 1763 state = rb_entry(node, struct extent_state, rb_node); 1764 if (state->start > search_end) 1765 break; 1766 if (contig && found && state->start > last + 1) 1767 break; 1768 if (state->end >= cur_start && (state->state & bits) == bits) { 1769 total_bytes += min(search_end, state->end) + 1 - 1770 max(cur_start, state->start); 1771 if (total_bytes >= max_bytes) 1772 break; 1773 if (!found) { 1774 *start = max(cur_start, state->start); 1775 found = 1; 1776 } 1777 last = state->end; 1778 } else if (contig && found) { 1779 break; 1780 } 1781 node = rb_next(node); 1782 if (!node) 1783 break; 1784 } 1785 out: 1786 spin_unlock(&tree->lock); 1787 return total_bytes; 1788 } 1789 1790 /* 1791 * set the private field for a given byte offset in the tree. If there isn't 1792 * an extent_state there already, this does nothing. 1793 */ 1794 static int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) 1795 { 1796 struct rb_node *node; 1797 struct extent_state *state; 1798 int ret = 0; 1799 1800 spin_lock(&tree->lock); 1801 /* 1802 * this search will find all the extents that end after 1803 * our range starts. 1804 */ 1805 node = tree_search(tree, start); 1806 if (!node) { 1807 ret = -ENOENT; 1808 goto out; 1809 } 1810 state = rb_entry(node, struct extent_state, rb_node); 1811 if (state->start != start) { 1812 ret = -ENOENT; 1813 goto out; 1814 } 1815 state->private = private; 1816 out: 1817 spin_unlock(&tree->lock); 1818 return ret; 1819 } 1820 1821 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) 1822 { 1823 struct rb_node *node; 1824 struct extent_state *state; 1825 int ret = 0; 1826 1827 spin_lock(&tree->lock); 1828 /* 1829 * this search will find all the extents that end after 1830 * our range starts. 
1831 */ 1832 node = tree_search(tree, start); 1833 if (!node) { 1834 ret = -ENOENT; 1835 goto out; 1836 } 1837 state = rb_entry(node, struct extent_state, rb_node); 1838 if (state->start != start) { 1839 ret = -ENOENT; 1840 goto out; 1841 } 1842 *private = state->private; 1843 out: 1844 spin_unlock(&tree->lock); 1845 return ret; 1846 } 1847 1848 /* 1849 * searches a range in the state tree for a given mask. 1850 * If 'filled' == 1, this returns 1 only if every extent in the tree 1851 * has the bits set. Otherwise, 1 is returned if any bit in the 1852 * range is found set. 1853 */ 1854 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 1855 unsigned long bits, int filled, struct extent_state *cached) 1856 { 1857 struct extent_state *state = NULL; 1858 struct rb_node *node; 1859 int bitset = 0; 1860 1861 spin_lock(&tree->lock); 1862 if (cached && cached->tree && cached->start <= start && 1863 cached->end > start) 1864 node = &cached->rb_node; 1865 else 1866 node = tree_search(tree, start); 1867 while (node && start <= end) { 1868 state = rb_entry(node, struct extent_state, rb_node); 1869 1870 if (filled && state->start > start) { 1871 bitset = 0; 1872 break; 1873 } 1874 1875 if (state->start > end) 1876 break; 1877 1878 if (state->state & bits) { 1879 bitset = 1; 1880 if (!filled) 1881 break; 1882 } else if (filled) { 1883 bitset = 0; 1884 break; 1885 } 1886 1887 if (state->end == (u64)-1) 1888 break; 1889 1890 start = state->end + 1; 1891 if (start > end) 1892 break; 1893 node = rb_next(node); 1894 if (!node) { 1895 if (filled) 1896 bitset = 0; 1897 break; 1898 } 1899 } 1900 spin_unlock(&tree->lock); 1901 return bitset; 1902 } 1903 1904 /* 1905 * helper function to set a given page up to date if all the 1906 * extents in the tree for that page are up to date 1907 */ 1908 static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) 1909 { 1910 u64 start = page_offset(page); 1911 u64 end = start + PAGE_CACHE_SIZE - 1; 1912 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) 1913 SetPageUptodate(page); 1914 } 1915 1916 /* 1917 * When IO fails, either with EIO or csum verification fails, we 1918 * try other mirrors that might have a good copy of the data. This 1919 * io_failure_record is used to record state as we go through all the 1920 * mirrors. If another mirror has good data, the page is set up to date 1921 * and things continue. If a good mirror can't be found, the original 1922 * bio end_io callback is called to indicate things have failed. 
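 *
 * The record is stashed in the inode's io_failure_tree as a 'private'
 * value, roughly
 *
 *	set_state_private(failure_tree, failrec->start,
 *			  (u64)(unsigned long)failrec);
 *
 * (see bio_readpage_error() below), and clean_io_failure() looks it up
 * again when a later read of the range completes successfully.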
1923 */ 1924 struct io_failure_record { 1925 struct page *page; 1926 u64 start; 1927 u64 len; 1928 u64 logical; 1929 unsigned long bio_flags; 1930 int this_mirror; 1931 int failed_mirror; 1932 int in_validation; 1933 }; 1934 1935 static int free_io_failure(struct inode *inode, struct io_failure_record *rec, 1936 int did_repair) 1937 { 1938 int ret; 1939 int err = 0; 1940 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 1941 1942 set_state_private(failure_tree, rec->start, 0); 1943 ret = clear_extent_bits(failure_tree, rec->start, 1944 rec->start + rec->len - 1, 1945 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); 1946 if (ret) 1947 err = ret; 1948 1949 ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, 1950 rec->start + rec->len - 1, 1951 EXTENT_DAMAGED, GFP_NOFS); 1952 if (ret && !err) 1953 err = ret; 1954 1955 kfree(rec); 1956 return err; 1957 } 1958 1959 static void repair_io_failure_callback(struct bio *bio, int err) 1960 { 1961 complete(bio->bi_private); 1962 } 1963 1964 /* 1965 * this bypasses the standard btrfs submit functions deliberately, as 1966 * the standard behavior is to write all copies in a raid setup. here we only 1967 * want to write the one bad copy. so we do the mapping for ourselves and issue 1968 * submit_bio directly. 1969 * to avoid any synchronization issues, wait for the data after writing, which 1970 * actually prevents the read that triggered the error from finishing. 1971 * currently, there can be no more than two copies of every data bit. thus, 1972 * exactly one rewrite is required. 1973 */ 1974 int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, 1975 u64 length, u64 logical, struct page *page, 1976 int mirror_num) 1977 { 1978 struct bio *bio; 1979 struct btrfs_device *dev; 1980 DECLARE_COMPLETION_ONSTACK(compl); 1981 u64 map_length = 0; 1982 u64 sector; 1983 struct btrfs_bio *bbio = NULL; 1984 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 1985 int ret; 1986 1987 BUG_ON(!mirror_num); 1988 1989 /* we can't repair anything in raid56 yet */ 1990 if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num)) 1991 return 0; 1992 1993 bio = btrfs_io_bio_alloc(GFP_NOFS, 1); 1994 if (!bio) 1995 return -EIO; 1996 bio->bi_private = &compl; 1997 bio->bi_end_io = repair_io_failure_callback; 1998 bio->bi_size = 0; 1999 map_length = length; 2000 2001 ret = btrfs_map_block(fs_info, WRITE, logical, 2002 &map_length, &bbio, mirror_num); 2003 if (ret) { 2004 bio_put(bio); 2005 return -EIO; 2006 } 2007 BUG_ON(mirror_num != bbio->mirror_num); 2008 sector = bbio->stripes[mirror_num-1].physical >> 9; 2009 bio->bi_sector = sector; 2010 dev = bbio->stripes[mirror_num-1].dev; 2011 kfree(bbio); 2012 if (!dev || !dev->bdev || !dev->writeable) { 2013 bio_put(bio); 2014 return -EIO; 2015 } 2016 bio->bi_bdev = dev->bdev; 2017 bio_add_page(bio, page, length, start - page_offset(page)); 2018 btrfsic_submit_bio(WRITE_SYNC, bio); 2019 wait_for_completion(&compl); 2020 2021 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { 2022 /* try to remap that extent elsewhere? 
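 * (not implemented: we only bump the device's write-error counter
 * and return -EIO)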
*/ 2023 bio_put(bio); 2024 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); 2025 return -EIO; 2026 } 2027 2028 printk_ratelimited_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu " 2029 "(dev %s sector %llu)\n", page->mapping->host->i_ino, 2030 start, rcu_str_deref(dev->name), sector); 2031 2032 bio_put(bio); 2033 return 0; 2034 } 2035 2036 int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, 2037 int mirror_num) 2038 { 2039 u64 start = eb->start; 2040 unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); 2041 int ret = 0; 2042 2043 for (i = 0; i < num_pages; i++) { 2044 struct page *p = extent_buffer_page(eb, i); 2045 ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE, 2046 start, p, mirror_num); 2047 if (ret) 2048 break; 2049 start += PAGE_CACHE_SIZE; 2050 } 2051 2052 return ret; 2053 } 2054 2055 /* 2056 * each time an IO finishes, we do a fast check in the IO failure tree 2057 * to see if we need to process or clean up an io_failure_record 2058 */ 2059 static int clean_io_failure(u64 start, struct page *page) 2060 { 2061 u64 private; 2062 u64 private_failure; 2063 struct io_failure_record *failrec; 2064 struct btrfs_fs_info *fs_info; 2065 struct extent_state *state; 2066 int num_copies; 2067 int did_repair = 0; 2068 int ret; 2069 struct inode *inode = page->mapping->host; 2070 2071 private = 0; 2072 ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, 2073 (u64)-1, 1, EXTENT_DIRTY, 0); 2074 if (!ret) 2075 return 0; 2076 2077 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start, 2078 &private_failure); 2079 if (ret) 2080 return 0; 2081 2082 failrec = (struct io_failure_record *)(unsigned long) private_failure; 2083 BUG_ON(!failrec->this_mirror); 2084 2085 if (failrec->in_validation) { 2086 /* there was no real error, just free the record */ 2087 pr_debug("clean_io_failure: freeing dummy error at %llu\n", 2088 failrec->start); 2089 did_repair = 1; 2090 goto out; 2091 } 2092 2093 spin_lock(&BTRFS_I(inode)->io_tree.lock); 2094 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, 2095 failrec->start, 2096 EXTENT_LOCKED); 2097 spin_unlock(&BTRFS_I(inode)->io_tree.lock); 2098 2099 if (state && state->start <= failrec->start && 2100 state->end >= failrec->start + failrec->len - 1) { 2101 fs_info = BTRFS_I(inode)->root->fs_info; 2102 num_copies = btrfs_num_copies(fs_info, failrec->logical, 2103 failrec->len); 2104 if (num_copies > 1) { 2105 ret = repair_io_failure(fs_info, start, failrec->len, 2106 failrec->logical, page, 2107 failrec->failed_mirror); 2108 did_repair = !ret; 2109 } 2110 ret = 0; 2111 } 2112 2113 out: 2114 if (!ret) 2115 ret = free_io_failure(inode, failrec, did_repair); 2116 2117 return ret; 2118 } 2119 2120 /* 2121 * this is a generic handler for readpage errors (default 2122 * readpage_io_failed_hook). if other copies exist, read those and write back 2123 * good data to the failed position. 
does not investigate in remapping the 2124 * failed extent elsewhere, hoping the device will be smart enough to do this as 2125 * needed 2126 */ 2127 2128 static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, 2129 struct page *page, u64 start, u64 end, 2130 int failed_mirror) 2131 { 2132 struct io_failure_record *failrec = NULL; 2133 u64 private; 2134 struct extent_map *em; 2135 struct inode *inode = page->mapping->host; 2136 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 2137 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 2138 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 2139 struct bio *bio; 2140 struct btrfs_io_bio *btrfs_failed_bio; 2141 struct btrfs_io_bio *btrfs_bio; 2142 int num_copies; 2143 int ret; 2144 int read_mode; 2145 u64 logical; 2146 2147 BUG_ON(failed_bio->bi_rw & REQ_WRITE); 2148 2149 ret = get_state_private(failure_tree, start, &private); 2150 if (ret) { 2151 failrec = kzalloc(sizeof(*failrec), GFP_NOFS); 2152 if (!failrec) 2153 return -ENOMEM; 2154 failrec->start = start; 2155 failrec->len = end - start + 1; 2156 failrec->this_mirror = 0; 2157 failrec->bio_flags = 0; 2158 failrec->in_validation = 0; 2159 2160 read_lock(&em_tree->lock); 2161 em = lookup_extent_mapping(em_tree, start, failrec->len); 2162 if (!em) { 2163 read_unlock(&em_tree->lock); 2164 kfree(failrec); 2165 return -EIO; 2166 } 2167 2168 if (em->start > start || em->start + em->len < start) { 2169 free_extent_map(em); 2170 em = NULL; 2171 } 2172 read_unlock(&em_tree->lock); 2173 2174 if (!em) { 2175 kfree(failrec); 2176 return -EIO; 2177 } 2178 logical = start - em->start; 2179 logical = em->block_start + logical; 2180 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 2181 logical = em->block_start; 2182 failrec->bio_flags = EXTENT_BIO_COMPRESSED; 2183 extent_set_compress_type(&failrec->bio_flags, 2184 em->compress_type); 2185 } 2186 pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, " 2187 "len=%llu\n", logical, start, failrec->len); 2188 failrec->logical = logical; 2189 free_extent_map(em); 2190 2191 /* set the bits in the private failure tree */ 2192 ret = set_extent_bits(failure_tree, start, end, 2193 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); 2194 if (ret >= 0) 2195 ret = set_state_private(failure_tree, start, 2196 (u64)(unsigned long)failrec); 2197 /* set the bits in the inode's tree */ 2198 if (ret >= 0) 2199 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED, 2200 GFP_NOFS); 2201 if (ret < 0) { 2202 kfree(failrec); 2203 return ret; 2204 } 2205 } else { 2206 failrec = (struct io_failure_record *)(unsigned long)private; 2207 pr_debug("bio_readpage_error: (found) logical=%llu, " 2208 "start=%llu, len=%llu, validation=%d\n", 2209 failrec->logical, failrec->start, failrec->len, 2210 failrec->in_validation); 2211 /* 2212 * when data can be on disk more than twice, add to failrec here 2213 * (e.g. with a list for failed_mirror) to make 2214 * clean_io_failure() clean all those errors at once. 2215 */ 2216 } 2217 num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info, 2218 failrec->logical, failrec->len); 2219 if (num_copies == 1) { 2220 /* 2221 * we only have a single copy of the data, so don't bother with 2222 * all the retry and error correction code that follows. no 2223 * matter what the error is, it is very likely to persist. 
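* just free the failure record and hand -EIO back to the caller.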
2224 */ 2225 pr_debug("bio_readpage_error: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n", 2226 num_copies, failrec->this_mirror, failed_mirror); 2227 free_io_failure(inode, failrec, 0); 2228 return -EIO; 2229 } 2230 2231 /* 2232 * there are two premises: 2233 * a) deliver good data to the caller 2234 * b) correct the bad sectors on disk 2235 */ 2236 if (failed_bio->bi_vcnt > 1) { 2237 /* 2238 * to fulfill b), we need to know the exact failing sectors, as 2239 * we don't want to rewrite any more than the failed ones. thus, 2240 * we need separate read requests for the failed bio 2241 * 2242 * if the following BUG_ON triggers, our validation request got 2243 * merged. we need separate requests for our algorithm to work. 2244 */ 2245 BUG_ON(failrec->in_validation); 2246 failrec->in_validation = 1; 2247 failrec->this_mirror = failed_mirror; 2248 read_mode = READ_SYNC | REQ_FAILFAST_DEV; 2249 } else { 2250 /* 2251 * we're ready to fulfill a) and b) alongside. get a good copy 2252 * of the failed sector and if we succeed, we have setup 2253 * everything for repair_io_failure to do the rest for us. 2254 */ 2255 if (failrec->in_validation) { 2256 BUG_ON(failrec->this_mirror != failed_mirror); 2257 failrec->in_validation = 0; 2258 failrec->this_mirror = 0; 2259 } 2260 failrec->failed_mirror = failed_mirror; 2261 failrec->this_mirror++; 2262 if (failrec->this_mirror == failed_mirror) 2263 failrec->this_mirror++; 2264 read_mode = READ_SYNC; 2265 } 2266 2267 if (failrec->this_mirror > num_copies) { 2268 pr_debug("bio_readpage_error: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n", 2269 num_copies, failrec->this_mirror, failed_mirror); 2270 free_io_failure(inode, failrec, 0); 2271 return -EIO; 2272 } 2273 2274 bio = btrfs_io_bio_alloc(GFP_NOFS, 1); 2275 if (!bio) { 2276 free_io_failure(inode, failrec, 0); 2277 return -EIO; 2278 } 2279 bio->bi_end_io = failed_bio->bi_end_io; 2280 bio->bi_sector = failrec->logical >> 9; 2281 bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; 2282 bio->bi_size = 0; 2283 2284 btrfs_failed_bio = btrfs_io_bio(failed_bio); 2285 if (btrfs_failed_bio->csum) { 2286 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 2287 u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); 2288 2289 btrfs_bio = btrfs_io_bio(bio); 2290 btrfs_bio->csum = btrfs_bio->csum_inline; 2291 phy_offset >>= inode->i_sb->s_blocksize_bits; 2292 phy_offset *= csum_size; 2293 memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + phy_offset, 2294 csum_size); 2295 } 2296 2297 bio_add_page(bio, page, failrec->len, start - page_offset(page)); 2298 2299 pr_debug("bio_readpage_error: submitting new read[%#x] to " 2300 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode, 2301 failrec->this_mirror, num_copies, failrec->in_validation); 2302 2303 ret = tree->ops->submit_bio_hook(inode, read_mode, bio, 2304 failrec->this_mirror, 2305 failrec->bio_flags, 0); 2306 return ret; 2307 } 2308 2309 /* lots and lots of room for performance fixes in the end_bio funcs */ 2310 2311 int end_extent_writepage(struct page *page, int err, u64 start, u64 end) 2312 { 2313 int uptodate = (err == 0); 2314 struct extent_io_tree *tree; 2315 int ret; 2316 2317 tree = &BTRFS_I(page->mapping->host)->io_tree; 2318 2319 if (tree->ops && tree->ops->writepage_end_io_hook) { 2320 ret = tree->ops->writepage_end_io_hook(page, start, 2321 end, NULL, uptodate); 2322 if (ret) 2323 uptodate = 0; 2324 } 2325 2326 if (!uptodate) { 2327 ClearPageUptodate(page); 2328 SetPageError(page); 
2329 } 2330 return 0; 2331 } 2332 2333 /* 2334 * after a writepage IO is done, we need to: 2335 * clear the uptodate bits on error 2336 * clear the writeback bits in the extent tree for this IO 2337 * end_page_writeback if the page has no more pending IO 2338 * 2339 * Scheduling is not allowed, so the extent state tree is expected 2340 * to have one and only one object corresponding to this IO. 2341 */ 2342 static void end_bio_extent_writepage(struct bio *bio, int err) 2343 { 2344 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 2345 struct extent_io_tree *tree; 2346 u64 start; 2347 u64 end; 2348 2349 do { 2350 struct page *page = bvec->bv_page; 2351 tree = &BTRFS_I(page->mapping->host)->io_tree; 2352 2353 /* We always issue full-page reads, but if some block 2354 * in a page fails to read, blk_update_request() will 2355 * advance bv_offset and adjust bv_len to compensate. 2356 * Print a warning for nonzero offsets, and an error 2357 * if they don't add up to a full page. */ 2358 if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) 2359 printk("%s page write in btrfs with offset %u and length %u\n", 2360 bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE 2361 ? KERN_ERR "partial" : KERN_INFO "incomplete", 2362 bvec->bv_offset, bvec->bv_len); 2363 2364 start = page_offset(page); 2365 end = start + bvec->bv_offset + bvec->bv_len - 1; 2366 2367 if (--bvec >= bio->bi_io_vec) 2368 prefetchw(&bvec->bv_page->flags); 2369 2370 if (end_extent_writepage(page, err, start, end)) 2371 continue; 2372 2373 end_page_writeback(page); 2374 } while (bvec >= bio->bi_io_vec); 2375 2376 bio_put(bio); 2377 } 2378 2379 static void 2380 endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len, 2381 int uptodate) 2382 { 2383 struct extent_state *cached = NULL; 2384 u64 end = start + len - 1; 2385 2386 if (uptodate && tree->track_uptodate) 2387 set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC); 2388 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); 2389 } 2390 2391 /* 2392 * after a readpage IO is done, we need to: 2393 * clear the uptodate bits on error 2394 * set the uptodate bits if things worked 2395 * set the page up to date if all extents in the tree are uptodate 2396 * clear the lock bit in the extent tree 2397 * unlock the page if there are no other extents locked for it 2398 * 2399 * Scheduling is not allowed, so the extent state tree is expected 2400 * to have one and only one object corresponding to this IO. 2401 */ 2402 static void end_bio_extent_readpage(struct bio *bio, int err) 2403 { 2404 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 2405 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; 2406 struct bio_vec *bvec = bio->bi_io_vec; 2407 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 2408 struct extent_io_tree *tree; 2409 u64 offset = 0; 2410 u64 start; 2411 u64 end; 2412 u64 len; 2413 u64 extent_start = 0; 2414 u64 extent_len = 0; 2415 int mirror; 2416 int ret; 2417 2418 if (err) 2419 uptodate = 0; 2420 2421 do { 2422 struct page *page = bvec->bv_page; 2423 struct inode *inode = page->mapping->host; 2424 2425 pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " 2426 "mirror=%lu\n", (u64)bio->bi_sector, err, 2427 io_bio->mirror_num); 2428 tree = &BTRFS_I(inode)->io_tree; 2429 2430 /* We always issue full-page reads, but if some block 2431 * in a page fails to read, blk_update_request() will 2432 * advance bv_offset and adjust bv_len to compensate. 
2433 * Print a warning for nonzero offsets, and an error 2434 * if they don't add up to a full page. */ 2435 if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) 2436 printk("%s page read in btrfs with offset %u and length %u\n", 2437 bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE 2438 ? KERN_ERR "partial" : KERN_INFO "incomplete", 2439 bvec->bv_offset, bvec->bv_len); 2440 2441 start = page_offset(page); 2442 end = start + bvec->bv_offset + bvec->bv_len - 1; 2443 len = bvec->bv_len; 2444 2445 if (++bvec <= bvec_end) 2446 prefetchw(&bvec->bv_page->flags); 2447 2448 mirror = io_bio->mirror_num; 2449 if (likely(uptodate && tree->ops && 2450 tree->ops->readpage_end_io_hook)) { 2451 ret = tree->ops->readpage_end_io_hook(io_bio, offset, 2452 page, start, end, 2453 mirror); 2454 if (ret) 2455 uptodate = 0; 2456 else 2457 clean_io_failure(start, page); 2458 } 2459 2460 if (likely(uptodate)) 2461 goto readpage_ok; 2462 2463 if (tree->ops && tree->ops->readpage_io_failed_hook) { 2464 ret = tree->ops->readpage_io_failed_hook(page, mirror); 2465 if (!ret && !err && 2466 test_bit(BIO_UPTODATE, &bio->bi_flags)) 2467 uptodate = 1; 2468 } else { 2469 /* 2470 * The generic bio_readpage_error handles errors the 2471 * following way: If possible, new read requests are 2472 * created and submitted and will end up in 2473 * end_bio_extent_readpage as well (if we're lucky, not 2474 * in the !uptodate case). In that case it returns 0 and 2475 * we just go on with the next page in our bio. If it 2476 * can't handle the error it will return -EIO and we 2477 * remain responsible for that page. 2478 */ 2479 ret = bio_readpage_error(bio, offset, page, start, end, 2480 mirror); 2481 if (ret == 0) { 2482 uptodate = 2483 test_bit(BIO_UPTODATE, &bio->bi_flags); 2484 if (err) 2485 uptodate = 0; 2486 continue; 2487 } 2488 } 2489 readpage_ok: 2490 if (likely(uptodate)) { 2491 loff_t i_size = i_size_read(inode); 2492 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; 2493 unsigned offset; 2494 2495 /* Zero out the end if this page straddles i_size */ 2496 offset = i_size & (PAGE_CACHE_SIZE-1); 2497 if (page->index == end_index && offset) 2498 zero_user_segment(page, offset, PAGE_CACHE_SIZE); 2499 SetPageUptodate(page); 2500 } else { 2501 ClearPageUptodate(page); 2502 SetPageError(page); 2503 } 2504 unlock_page(page); 2505 offset += len; 2506 2507 if (unlikely(!uptodate)) { 2508 if (extent_len) { 2509 endio_readpage_release_extent(tree, 2510 extent_start, 2511 extent_len, 1); 2512 extent_start = 0; 2513 extent_len = 0; 2514 } 2515 endio_readpage_release_extent(tree, start, 2516 end - start + 1, 0); 2517 } else if (!extent_len) { 2518 extent_start = start; 2519 extent_len = end + 1 - start; 2520 } else if (extent_start + extent_len == start) { 2521 extent_len += end + 1 - start; 2522 } else { 2523 endio_readpage_release_extent(tree, extent_start, 2524 extent_len, uptodate); 2525 extent_start = start; 2526 extent_len = end + 1 - start; 2527 } 2528 } while (bvec <= bvec_end); 2529 2530 if (extent_len) 2531 endio_readpage_release_extent(tree, extent_start, extent_len, 2532 uptodate); 2533 if (io_bio->end_io) 2534 io_bio->end_io(io_bio, err); 2535 bio_put(bio); 2536 } 2537 2538 /* 2539 * this allocates from the btrfs_bioset. 
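* a minimal usage sketch (btrfs_io_bio() is assumed to be the
* container_of() accessor from extent_io.h; my_end_io is a made-up
* callback, not something defined in this file):
*
*	struct bio *bio = btrfs_bio_alloc(bdev, sector, nr_vecs, GFP_NOFS);
*	if (bio) {
*		struct btrfs_io_bio *bbio = btrfs_io_bio(bio);
*		bbio->end_io = my_end_io;
*	}
*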
We're returning a bio right now 2540 * but you can call btrfs_io_bio for the appropriate container_of magic 2541 */ 2542 struct bio * 2543 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 2544 gfp_t gfp_flags) 2545 { 2546 struct btrfs_io_bio *btrfs_bio; 2547 struct bio *bio; 2548 2549 bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset); 2550 2551 if (bio == NULL && (current->flags & PF_MEMALLOC)) { 2552 while (!bio && (nr_vecs /= 2)) { 2553 bio = bio_alloc_bioset(gfp_flags, 2554 nr_vecs, btrfs_bioset); 2555 } 2556 } 2557 2558 if (bio) { 2559 bio->bi_size = 0; 2560 bio->bi_bdev = bdev; 2561 bio->bi_sector = first_sector; 2562 btrfs_bio = btrfs_io_bio(bio); 2563 btrfs_bio->csum = NULL; 2564 btrfs_bio->csum_allocated = NULL; 2565 btrfs_bio->end_io = NULL; 2566 } 2567 return bio; 2568 } 2569 2570 struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) 2571 { 2572 return bio_clone_bioset(bio, gfp_mask, btrfs_bioset); 2573 } 2574 2575 2576 /* this also allocates from the btrfs_bioset */ 2577 struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) 2578 { 2579 struct btrfs_io_bio *btrfs_bio; 2580 struct bio *bio; 2581 2582 bio = bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset); 2583 if (bio) { 2584 btrfs_bio = btrfs_io_bio(bio); 2585 btrfs_bio->csum = NULL; 2586 btrfs_bio->csum_allocated = NULL; 2587 btrfs_bio->end_io = NULL; 2588 } 2589 return bio; 2590 } 2591 2592 2593 static int __must_check submit_one_bio(int rw, struct bio *bio, 2594 int mirror_num, unsigned long bio_flags) 2595 { 2596 int ret = 0; 2597 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 2598 struct page *page = bvec->bv_page; 2599 struct extent_io_tree *tree = bio->bi_private; 2600 u64 start; 2601 2602 start = page_offset(page) + bvec->bv_offset; 2603 2604 bio->bi_private = NULL; 2605 2606 bio_get(bio); 2607 2608 if (tree->ops && tree->ops->submit_bio_hook) 2609 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 2610 mirror_num, bio_flags, start); 2611 else 2612 btrfsic_submit_bio(rw, bio); 2613 2614 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 2615 ret = -EOPNOTSUPP; 2616 bio_put(bio); 2617 return ret; 2618 } 2619 2620 static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page, 2621 unsigned long offset, size_t size, struct bio *bio, 2622 unsigned long bio_flags) 2623 { 2624 int ret = 0; 2625 if (tree->ops && tree->ops->merge_bio_hook) 2626 ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio, 2627 bio_flags); 2628 BUG_ON(ret < 0); 2629 return ret; 2630 2631 } 2632 2633 static int submit_extent_page(int rw, struct extent_io_tree *tree, 2634 struct page *page, sector_t sector, 2635 size_t size, unsigned long offset, 2636 struct block_device *bdev, 2637 struct bio **bio_ret, 2638 unsigned long max_pages, 2639 bio_end_io_t end_io_func, 2640 int mirror_num, 2641 unsigned long prev_bio_flags, 2642 unsigned long bio_flags) 2643 { 2644 int ret = 0; 2645 struct bio *bio; 2646 int nr; 2647 int contig = 0; 2648 int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED; 2649 int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; 2650 size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE); 2651 2652 if (bio_ret && *bio_ret) { 2653 bio = *bio_ret; 2654 if (old_compressed) 2655 contig = bio->bi_sector == sector; 2656 else 2657 contig = bio_end_sector(bio) == sector; 2658 2659 if (prev_bio_flags != bio_flags || !contig || 2660 merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) || 2661 bio_add_page(bio, page, page_size, offset) < 
page_size) { 2662 ret = submit_one_bio(rw, bio, mirror_num, 2663 prev_bio_flags); 2664 if (ret < 0) 2665 return ret; 2666 bio = NULL; 2667 } else { 2668 return 0; 2669 } 2670 } 2671 if (this_compressed) 2672 nr = BIO_MAX_PAGES; 2673 else 2674 nr = bio_get_nr_vecs(bdev); 2675 2676 bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); 2677 if (!bio) 2678 return -ENOMEM; 2679 2680 bio_add_page(bio, page, page_size, offset); 2681 bio->bi_end_io = end_io_func; 2682 bio->bi_private = tree; 2683 2684 if (bio_ret) 2685 *bio_ret = bio; 2686 else 2687 ret = submit_one_bio(rw, bio, mirror_num, bio_flags); 2688 2689 return ret; 2690 } 2691 2692 static void attach_extent_buffer_page(struct extent_buffer *eb, 2693 struct page *page) 2694 { 2695 if (!PagePrivate(page)) { 2696 SetPagePrivate(page); 2697 page_cache_get(page); 2698 set_page_private(page, (unsigned long)eb); 2699 } else { 2700 WARN_ON(page->private != (unsigned long)eb); 2701 } 2702 } 2703 2704 void set_page_extent_mapped(struct page *page) 2705 { 2706 if (!PagePrivate(page)) { 2707 SetPagePrivate(page); 2708 page_cache_get(page); 2709 set_page_private(page, EXTENT_PAGE_PRIVATE); 2710 } 2711 } 2712 2713 static struct extent_map * 2714 __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, 2715 u64 start, u64 len, get_extent_t *get_extent, 2716 struct extent_map **em_cached) 2717 { 2718 struct extent_map *em; 2719 2720 if (em_cached && *em_cached) { 2721 em = *em_cached; 2722 if (em->in_tree && start >= em->start && 2723 start < extent_map_end(em)) { 2724 atomic_inc(&em->refs); 2725 return em; 2726 } 2727 2728 free_extent_map(em); 2729 *em_cached = NULL; 2730 } 2731 2732 em = get_extent(inode, page, pg_offset, start, len, 0); 2733 if (em_cached && !IS_ERR_OR_NULL(em)) { 2734 BUG_ON(*em_cached); 2735 atomic_inc(&em->refs); 2736 *em_cached = em; 2737 } 2738 return em; 2739 } 2740 /* 2741 * basic readpage implementation. 
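* the page is walked in blocksize steps: holes are zeroed in place,
* ranges that are already EXTENT_UPTODATE are skipped, and everything
* else is submitted via submit_extent_page() with
* end_bio_extent_readpage() as the completion handler.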
Locked extent state structs are inserted 2742 * into the tree that are removed when the IO is done (by the end_io 2743 * handlers) 2744 * XXX JDM: This needs looking at to ensure proper page locking 2745 */ 2746 static int __do_readpage(struct extent_io_tree *tree, 2747 struct page *page, 2748 get_extent_t *get_extent, 2749 struct extent_map **em_cached, 2750 struct bio **bio, int mirror_num, 2751 unsigned long *bio_flags, int rw) 2752 { 2753 struct inode *inode = page->mapping->host; 2754 u64 start = page_offset(page); 2755 u64 page_end = start + PAGE_CACHE_SIZE - 1; 2756 u64 end; 2757 u64 cur = start; 2758 u64 extent_offset; 2759 u64 last_byte = i_size_read(inode); 2760 u64 block_start; 2761 u64 cur_end; 2762 sector_t sector; 2763 struct extent_map *em; 2764 struct block_device *bdev; 2765 int ret; 2766 int nr = 0; 2767 int parent_locked = *bio_flags & EXTENT_BIO_PARENT_LOCKED; 2768 size_t pg_offset = 0; 2769 size_t iosize; 2770 size_t disk_io_size; 2771 size_t blocksize = inode->i_sb->s_blocksize; 2772 unsigned long this_bio_flag = *bio_flags & EXTENT_BIO_PARENT_LOCKED; 2773 2774 set_page_extent_mapped(page); 2775 2776 end = page_end; 2777 if (!PageUptodate(page)) { 2778 if (cleancache_get_page(page) == 0) { 2779 BUG_ON(blocksize != PAGE_SIZE); 2780 unlock_extent(tree, start, end); 2781 goto out; 2782 } 2783 } 2784 2785 if (page->index == last_byte >> PAGE_CACHE_SHIFT) { 2786 char *userpage; 2787 size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1); 2788 2789 if (zero_offset) { 2790 iosize = PAGE_CACHE_SIZE - zero_offset; 2791 userpage = kmap_atomic(page); 2792 memset(userpage + zero_offset, 0, iosize); 2793 flush_dcache_page(page); 2794 kunmap_atomic(userpage); 2795 } 2796 } 2797 while (cur <= end) { 2798 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; 2799 2800 if (cur >= last_byte) { 2801 char *userpage; 2802 struct extent_state *cached = NULL; 2803 2804 iosize = PAGE_CACHE_SIZE - pg_offset; 2805 userpage = kmap_atomic(page); 2806 memset(userpage + pg_offset, 0, iosize); 2807 flush_dcache_page(page); 2808 kunmap_atomic(userpage); 2809 set_extent_uptodate(tree, cur, cur + iosize - 1, 2810 &cached, GFP_NOFS); 2811 if (!parent_locked) 2812 unlock_extent_cached(tree, cur, 2813 cur + iosize - 1, 2814 &cached, GFP_NOFS); 2815 break; 2816 } 2817 em = __get_extent_map(inode, page, pg_offset, cur, 2818 end - cur + 1, get_extent, em_cached); 2819 if (IS_ERR_OR_NULL(em)) { 2820 SetPageError(page); 2821 if (!parent_locked) 2822 unlock_extent(tree, cur, end); 2823 break; 2824 } 2825 extent_offset = cur - em->start; 2826 BUG_ON(extent_map_end(em) <= cur); 2827 BUG_ON(end < cur); 2828 2829 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 2830 this_bio_flag |= EXTENT_BIO_COMPRESSED; 2831 extent_set_compress_type(&this_bio_flag, 2832 em->compress_type); 2833 } 2834 2835 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2836 cur_end = min(extent_map_end(em) - 1, end); 2837 iosize = ALIGN(iosize, blocksize); 2838 if (this_bio_flag & EXTENT_BIO_COMPRESSED) { 2839 disk_io_size = em->block_len; 2840 sector = em->block_start >> 9; 2841 } else { 2842 sector = (em->block_start + extent_offset) >> 9; 2843 disk_io_size = iosize; 2844 } 2845 bdev = em->bdev; 2846 block_start = em->block_start; 2847 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 2848 block_start = EXTENT_MAP_HOLE; 2849 free_extent_map(em); 2850 em = NULL; 2851 2852 /* we've found a hole, just zero and go on */ 2853 if (block_start == EXTENT_MAP_HOLE) { 2854 char *userpage; 2855 struct extent_state *cached = NULL; 2856 2857 
userpage = kmap_atomic(page); 2858 memset(userpage + pg_offset, 0, iosize); 2859 flush_dcache_page(page); 2860 kunmap_atomic(userpage); 2861 2862 set_extent_uptodate(tree, cur, cur + iosize - 1, 2863 &cached, GFP_NOFS); 2864 unlock_extent_cached(tree, cur, cur + iosize - 1, 2865 &cached, GFP_NOFS); 2866 cur = cur + iosize; 2867 pg_offset += iosize; 2868 continue; 2869 } 2870 /* the get_extent function already copied into the page */ 2871 if (test_range_bit(tree, cur, cur_end, 2872 EXTENT_UPTODATE, 1, NULL)) { 2873 check_page_uptodate(tree, page); 2874 if (!parent_locked) 2875 unlock_extent(tree, cur, cur + iosize - 1); 2876 cur = cur + iosize; 2877 pg_offset += iosize; 2878 continue; 2879 } 2880 /* we have an inline extent but it didn't get marked up 2881 * to date. Error out 2882 */ 2883 if (block_start == EXTENT_MAP_INLINE) { 2884 SetPageError(page); 2885 if (!parent_locked) 2886 unlock_extent(tree, cur, cur + iosize - 1); 2887 cur = cur + iosize; 2888 pg_offset += iosize; 2889 continue; 2890 } 2891 2892 pnr -= page->index; 2893 ret = submit_extent_page(rw, tree, page, 2894 sector, disk_io_size, pg_offset, 2895 bdev, bio, pnr, 2896 end_bio_extent_readpage, mirror_num, 2897 *bio_flags, 2898 this_bio_flag); 2899 if (!ret) { 2900 nr++; 2901 *bio_flags = this_bio_flag; 2902 } else { 2903 SetPageError(page); 2904 if (!parent_locked) 2905 unlock_extent(tree, cur, cur + iosize - 1); 2906 } 2907 cur = cur + iosize; 2908 pg_offset += iosize; 2909 } 2910 out: 2911 if (!nr) { 2912 if (!PageError(page)) 2913 SetPageUptodate(page); 2914 unlock_page(page); 2915 } 2916 return 0; 2917 } 2918 2919 static inline void __do_contiguous_readpages(struct extent_io_tree *tree, 2920 struct page *pages[], int nr_pages, 2921 u64 start, u64 end, 2922 get_extent_t *get_extent, 2923 struct extent_map **em_cached, 2924 struct bio **bio, int mirror_num, 2925 unsigned long *bio_flags, int rw) 2926 { 2927 struct inode *inode; 2928 struct btrfs_ordered_extent *ordered; 2929 int index; 2930 2931 inode = pages[0]->mapping->host; 2932 while (1) { 2933 lock_extent(tree, start, end); 2934 ordered = btrfs_lookup_ordered_range(inode, start, 2935 end - start + 1); 2936 if (!ordered) 2937 break; 2938 unlock_extent(tree, start, end); 2939 btrfs_start_ordered_extent(inode, ordered, 1); 2940 btrfs_put_ordered_extent(ordered); 2941 } 2942 2943 for (index = 0; index < nr_pages; index++) { 2944 __do_readpage(tree, pages[index], get_extent, em_cached, bio, 2945 mirror_num, bio_flags, rw); 2946 page_cache_release(pages[index]); 2947 } 2948 } 2949 2950 static void __extent_readpages(struct extent_io_tree *tree, 2951 struct page *pages[], 2952 int nr_pages, get_extent_t *get_extent, 2953 struct extent_map **em_cached, 2954 struct bio **bio, int mirror_num, 2955 unsigned long *bio_flags, int rw) 2956 { 2957 u64 start = 0; 2958 u64 end = 0; 2959 u64 page_start; 2960 int index; 2961 int first_index = 0; 2962 2963 for (index = 0; index < nr_pages; index++) { 2964 page_start = page_offset(pages[index]); 2965 if (!end) { 2966 start = page_start; 2967 end = start + PAGE_CACHE_SIZE - 1; 2968 first_index = index; 2969 } else if (end + 1 == page_start) { 2970 end += PAGE_CACHE_SIZE; 2971 } else { 2972 __do_contiguous_readpages(tree, &pages[first_index], 2973 index - first_index, start, 2974 end, get_extent, em_cached, 2975 bio, mirror_num, bio_flags, 2976 rw); 2977 start = page_start; 2978 end = start + PAGE_CACHE_SIZE - 1; 2979 first_index = index; 2980 } 2981 } 2982 2983 if (end) 2984 __do_contiguous_readpages(tree, &pages[first_index], 2985 index 
- first_index, start, 2986 end, get_extent, em_cached, bio, 2987 mirror_num, bio_flags, rw); 2988 } 2989 2990 static int __extent_read_full_page(struct extent_io_tree *tree, 2991 struct page *page, 2992 get_extent_t *get_extent, 2993 struct bio **bio, int mirror_num, 2994 unsigned long *bio_flags, int rw) 2995 { 2996 struct inode *inode = page->mapping->host; 2997 struct btrfs_ordered_extent *ordered; 2998 u64 start = page_offset(page); 2999 u64 end = start + PAGE_CACHE_SIZE - 1; 3000 int ret; 3001 3002 while (1) { 3003 lock_extent(tree, start, end); 3004 ordered = btrfs_lookup_ordered_extent(inode, start); 3005 if (!ordered) 3006 break; 3007 unlock_extent(tree, start, end); 3008 btrfs_start_ordered_extent(inode, ordered, 1); 3009 btrfs_put_ordered_extent(ordered); 3010 } 3011 3012 ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, 3013 bio_flags, rw); 3014 return ret; 3015 } 3016 3017 int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 3018 get_extent_t *get_extent, int mirror_num) 3019 { 3020 struct bio *bio = NULL; 3021 unsigned long bio_flags = 0; 3022 int ret; 3023 3024 ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, 3025 &bio_flags, READ); 3026 if (bio) 3027 ret = submit_one_bio(READ, bio, mirror_num, bio_flags); 3028 return ret; 3029 } 3030 3031 int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page, 3032 get_extent_t *get_extent, int mirror_num) 3033 { 3034 struct bio *bio = NULL; 3035 unsigned long bio_flags = EXTENT_BIO_PARENT_LOCKED; 3036 int ret; 3037 3038 ret = __do_readpage(tree, page, get_extent, NULL, &bio, mirror_num, 3039 &bio_flags, READ); 3040 if (bio) 3041 ret = submit_one_bio(READ, bio, mirror_num, bio_flags); 3042 return ret; 3043 } 3044 3045 static noinline void update_nr_written(struct page *page, 3046 struct writeback_control *wbc, 3047 unsigned long nr_written) 3048 { 3049 wbc->nr_to_write -= nr_written; 3050 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && 3051 wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) 3052 page->mapping->writeback_index = page->index + nr_written; 3053 } 3054 3055 /* 3056 * the writepage semantics are similar to regular writepage. extent 3057 * records are inserted to lock ranges in the tree, and as dirty areas 3058 * are found, they are marked writeback. 
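* (before that walk, delalloc ranges on the page are handed to the
* fill_delalloc hook so that real extents exist to write against.)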
Then the lock bits are removed 3059 * and the end_io handler clears the writeback ranges 3060 */ 3061 static int __extent_writepage(struct page *page, struct writeback_control *wbc, 3062 void *data) 3063 { 3064 struct inode *inode = page->mapping->host; 3065 struct extent_page_data *epd = data; 3066 struct extent_io_tree *tree = epd->tree; 3067 u64 start = page_offset(page); 3068 u64 delalloc_start; 3069 u64 page_end = start + PAGE_CACHE_SIZE - 1; 3070 u64 end; 3071 u64 cur = start; 3072 u64 extent_offset; 3073 u64 last_byte = i_size_read(inode); 3074 u64 block_start; 3075 u64 iosize; 3076 sector_t sector; 3077 struct extent_state *cached_state = NULL; 3078 struct extent_map *em; 3079 struct block_device *bdev; 3080 int ret; 3081 int nr = 0; 3082 size_t pg_offset = 0; 3083 size_t blocksize; 3084 loff_t i_size = i_size_read(inode); 3085 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; 3086 u64 nr_delalloc; 3087 u64 delalloc_end; 3088 int page_started; 3089 int compressed; 3090 int write_flags; 3091 unsigned long nr_written = 0; 3092 bool fill_delalloc = true; 3093 3094 if (wbc->sync_mode == WB_SYNC_ALL) 3095 write_flags = WRITE_SYNC; 3096 else 3097 write_flags = WRITE; 3098 3099 trace___extent_writepage(page, inode, wbc); 3100 3101 WARN_ON(!PageLocked(page)); 3102 3103 ClearPageError(page); 3104 3105 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 3106 if (page->index > end_index || 3107 (page->index == end_index && !pg_offset)) { 3108 page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE); 3109 unlock_page(page); 3110 return 0; 3111 } 3112 3113 if (page->index == end_index) { 3114 char *userpage; 3115 3116 userpage = kmap_atomic(page); 3117 memset(userpage + pg_offset, 0, 3118 PAGE_CACHE_SIZE - pg_offset); 3119 kunmap_atomic(userpage); 3120 flush_dcache_page(page); 3121 } 3122 pg_offset = 0; 3123 3124 set_page_extent_mapped(page); 3125 3126 if (!tree->ops || !tree->ops->fill_delalloc) 3127 fill_delalloc = false; 3128 3129 delalloc_start = start; 3130 delalloc_end = 0; 3131 page_started = 0; 3132 if (!epd->extent_locked && fill_delalloc) { 3133 u64 delalloc_to_write = 0; 3134 /* 3135 * make sure the wbc mapping index is at least updated 3136 * to this page. 3137 */ 3138 update_nr_written(page, wbc, 0); 3139 3140 while (delalloc_end < page_end) { 3141 nr_delalloc = find_lock_delalloc_range(inode, tree, 3142 page, 3143 &delalloc_start, 3144 &delalloc_end, 3145 128 * 1024 * 1024); 3146 if (nr_delalloc == 0) { 3147 delalloc_start = delalloc_end + 1; 3148 continue; 3149 } 3150 ret = tree->ops->fill_delalloc(inode, page, 3151 delalloc_start, 3152 delalloc_end, 3153 &page_started, 3154 &nr_written); 3155 /* File system has been set read-only */ 3156 if (ret) { 3157 SetPageError(page); 3158 goto done; 3159 } 3160 /* 3161 * delalloc_end is already one less than the total 3162 * length, so we don't subtract one from 3163 * PAGE_CACHE_SIZE 3164 */ 3165 delalloc_to_write += (delalloc_end - delalloc_start + 3166 PAGE_CACHE_SIZE) >> 3167 PAGE_CACHE_SHIFT; 3168 delalloc_start = delalloc_end + 1; 3169 } 3170 if (wbc->nr_to_write < delalloc_to_write) { 3171 int thresh = 8192; 3172 3173 if (delalloc_to_write < thresh * 2) 3174 thresh = delalloc_to_write; 3175 wbc->nr_to_write = min_t(u64, delalloc_to_write, 3176 thresh); 3177 } 3178 3179 /* did the fill delalloc function already unlock and start 3180 * the IO? 3181 */ 3182 if (page_started) { 3183 ret = 0; 3184 /* 3185 * we've unlocked the page, so we can't update 3186 * the mapping's writeback index, just update 3187 * nr_to_write. 
3188 */ 3189 wbc->nr_to_write -= nr_written; 3190 goto done_unlocked; 3191 } 3192 } 3193 if (tree->ops && tree->ops->writepage_start_hook) { 3194 ret = tree->ops->writepage_start_hook(page, start, 3195 page_end); 3196 if (ret) { 3197 /* Fixup worker will requeue */ 3198 if (ret == -EBUSY) 3199 wbc->pages_skipped++; 3200 else 3201 redirty_page_for_writepage(wbc, page); 3202 update_nr_written(page, wbc, nr_written); 3203 unlock_page(page); 3204 ret = 0; 3205 goto done_unlocked; 3206 } 3207 } 3208 3209 /* 3210 * we don't want to touch the inode after unlocking the page, 3211 * so we update the mapping writeback index now 3212 */ 3213 update_nr_written(page, wbc, nr_written + 1); 3214 3215 end = page_end; 3216 if (last_byte <= start) { 3217 if (tree->ops && tree->ops->writepage_end_io_hook) 3218 tree->ops->writepage_end_io_hook(page, start, 3219 page_end, NULL, 1); 3220 goto done; 3221 } 3222 3223 blocksize = inode->i_sb->s_blocksize; 3224 3225 while (cur <= end) { 3226 if (cur >= last_byte) { 3227 if (tree->ops && tree->ops->writepage_end_io_hook) 3228 tree->ops->writepage_end_io_hook(page, cur, 3229 page_end, NULL, 1); 3230 break; 3231 } 3232 em = epd->get_extent(inode, page, pg_offset, cur, 3233 end - cur + 1, 1); 3234 if (IS_ERR_OR_NULL(em)) { 3235 SetPageError(page); 3236 break; 3237 } 3238 3239 extent_offset = cur - em->start; 3240 BUG_ON(extent_map_end(em) <= cur); 3241 BUG_ON(end < cur); 3242 iosize = min(extent_map_end(em) - cur, end - cur + 1); 3243 iosize = ALIGN(iosize, blocksize); 3244 sector = (em->block_start + extent_offset) >> 9; 3245 bdev = em->bdev; 3246 block_start = em->block_start; 3247 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 3248 free_extent_map(em); 3249 em = NULL; 3250 3251 /* 3252 * compressed and inline extents are written through other 3253 * paths in the FS 3254 */ 3255 if (compressed || block_start == EXTENT_MAP_HOLE || 3256 block_start == EXTENT_MAP_INLINE) { 3257 /* 3258 * end_io notification does not happen here for 3259 * compressed extents 3260 */ 3261 if (!compressed && tree->ops && 3262 tree->ops->writepage_end_io_hook) 3263 tree->ops->writepage_end_io_hook(page, cur, 3264 cur + iosize - 1, 3265 NULL, 1); 3266 else if (compressed) { 3267 /* we don't want to end_page_writeback on 3268 * a compressed extent. 
this happens 3269 * elsewhere 3270 */ 3271 nr++; 3272 } 3273 3274 cur += iosize; 3275 pg_offset += iosize; 3276 continue; 3277 } 3278 /* leave this out until we have a page_mkwrite call */ 3279 if (0 && !test_range_bit(tree, cur, cur + iosize - 1, 3280 EXTENT_DIRTY, 0, NULL)) { 3281 cur = cur + iosize; 3282 pg_offset += iosize; 3283 continue; 3284 } 3285 3286 if (tree->ops && tree->ops->writepage_io_hook) { 3287 ret = tree->ops->writepage_io_hook(page, cur, 3288 cur + iosize - 1); 3289 } else { 3290 ret = 0; 3291 } 3292 if (ret) { 3293 SetPageError(page); 3294 } else { 3295 unsigned long max_nr = end_index + 1; 3296 3297 set_range_writeback(tree, cur, cur + iosize - 1); 3298 if (!PageWriteback(page)) { 3299 printk(KERN_ERR "btrfs warning page %lu not " 3300 "writeback, cur %llu end %llu\n", 3301 page->index, (unsigned long long)cur, 3302 (unsigned long long)end); 3303 } 3304 3305 ret = submit_extent_page(write_flags, tree, page, 3306 sector, iosize, pg_offset, 3307 bdev, &epd->bio, max_nr, 3308 end_bio_extent_writepage, 3309 0, 0, 0); 3310 if (ret) 3311 SetPageError(page); 3312 } 3313 cur = cur + iosize; 3314 pg_offset += iosize; 3315 nr++; 3316 } 3317 done: 3318 if (nr == 0) { 3319 /* make sure the mapping tag for page dirty gets cleared */ 3320 set_page_writeback(page); 3321 end_page_writeback(page); 3322 } 3323 unlock_page(page); 3324 3325 done_unlocked: 3326 3327 /* drop our reference on any cached states */ 3328 free_extent_state(cached_state); 3329 return 0; 3330 } 3331 3332 static int eb_wait(void *word) 3333 { 3334 io_schedule(); 3335 return 0; 3336 } 3337 3338 void wait_on_extent_buffer_writeback(struct extent_buffer *eb) 3339 { 3340 wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait, 3341 TASK_UNINTERRUPTIBLE); 3342 } 3343 3344 static int lock_extent_buffer_for_io(struct extent_buffer *eb, 3345 struct btrfs_fs_info *fs_info, 3346 struct extent_page_data *epd) 3347 { 3348 unsigned long i, num_pages; 3349 int flush = 0; 3350 int ret = 0; 3351 3352 if (!btrfs_try_tree_write_lock(eb)) { 3353 flush = 1; 3354 flush_write_bio(epd); 3355 btrfs_tree_lock(eb); 3356 } 3357 3358 if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { 3359 btrfs_tree_unlock(eb); 3360 if (!epd->sync_io) 3361 return 0; 3362 if (!flush) { 3363 flush_write_bio(epd); 3364 flush = 1; 3365 } 3366 while (1) { 3367 wait_on_extent_buffer_writeback(eb); 3368 btrfs_tree_lock(eb); 3369 if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) 3370 break; 3371 btrfs_tree_unlock(eb); 3372 } 3373 } 3374 3375 /* 3376 * We need to do this to prevent races in people who check if the eb is 3377 * under IO since we can end up having no IO bits set for a short period 3378 * of time. 
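* (the DIRTY bit is cleared and WRITEBACK set back to back while
* holding eb->refs_lock below.)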
3379 */ 3380 spin_lock(&eb->refs_lock); 3381 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3382 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3383 spin_unlock(&eb->refs_lock); 3384 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 3385 __percpu_counter_add(&fs_info->dirty_metadata_bytes, 3386 -eb->len, 3387 fs_info->dirty_metadata_batch); 3388 ret = 1; 3389 } else { 3390 spin_unlock(&eb->refs_lock); 3391 } 3392 3393 btrfs_tree_unlock(eb); 3394 3395 if (!ret) 3396 return ret; 3397 3398 num_pages = num_extent_pages(eb->start, eb->len); 3399 for (i = 0; i < num_pages; i++) { 3400 struct page *p = extent_buffer_page(eb, i); 3401 3402 if (!trylock_page(p)) { 3403 if (!flush) { 3404 flush_write_bio(epd); 3405 flush = 1; 3406 } 3407 lock_page(p); 3408 } 3409 } 3410 3411 return ret; 3412 } 3413 3414 static void end_extent_buffer_writeback(struct extent_buffer *eb) 3415 { 3416 clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3417 smp_mb__after_clear_bit(); 3418 wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); 3419 } 3420 3421 static void end_bio_extent_buffer_writepage(struct bio *bio, int err) 3422 { 3423 int uptodate = err == 0; 3424 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 3425 struct extent_buffer *eb; 3426 int done; 3427 3428 do { 3429 struct page *page = bvec->bv_page; 3430 3431 bvec--; 3432 eb = (struct extent_buffer *)page->private; 3433 BUG_ON(!eb); 3434 done = atomic_dec_and_test(&eb->io_pages); 3435 3436 if (!uptodate || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { 3437 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3438 ClearPageUptodate(page); 3439 SetPageError(page); 3440 } 3441 3442 end_page_writeback(page); 3443 3444 if (!done) 3445 continue; 3446 3447 end_extent_buffer_writeback(eb); 3448 } while (bvec >= bio->bi_io_vec); 3449 3450 bio_put(bio); 3451 3452 } 3453 3454 static int write_one_eb(struct extent_buffer *eb, 3455 struct btrfs_fs_info *fs_info, 3456 struct writeback_control *wbc, 3457 struct extent_page_data *epd) 3458 { 3459 struct block_device *bdev = fs_info->fs_devices->latest_bdev; 3460 u64 offset = eb->start; 3461 unsigned long i, num_pages; 3462 unsigned long bio_flags = 0; 3463 int rw = (epd->sync_io ? 
WRITE_SYNC : WRITE) | REQ_META; 3464 int ret = 0; 3465 3466 clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3467 num_pages = num_extent_pages(eb->start, eb->len); 3468 atomic_set(&eb->io_pages, num_pages); 3469 if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID) 3470 bio_flags = EXTENT_BIO_TREE_LOG; 3471 3472 for (i = 0; i < num_pages; i++) { 3473 struct page *p = extent_buffer_page(eb, i); 3474 3475 clear_page_dirty_for_io(p); 3476 set_page_writeback(p); 3477 ret = submit_extent_page(rw, eb->tree, p, offset >> 9, 3478 PAGE_CACHE_SIZE, 0, bdev, &epd->bio, 3479 -1, end_bio_extent_buffer_writepage, 3480 0, epd->bio_flags, bio_flags); 3481 epd->bio_flags = bio_flags; 3482 if (ret) { 3483 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3484 SetPageError(p); 3485 if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) 3486 end_extent_buffer_writeback(eb); 3487 ret = -EIO; 3488 break; 3489 } 3490 offset += PAGE_CACHE_SIZE; 3491 update_nr_written(p, wbc, 1); 3492 unlock_page(p); 3493 } 3494 3495 if (unlikely(ret)) { 3496 for (; i < num_pages; i++) { 3497 struct page *p = extent_buffer_page(eb, i); 3498 unlock_page(p); 3499 } 3500 } 3501 3502 return ret; 3503 } 3504 3505 int btree_write_cache_pages(struct address_space *mapping, 3506 struct writeback_control *wbc) 3507 { 3508 struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; 3509 struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; 3510 struct extent_buffer *eb, *prev_eb = NULL; 3511 struct extent_page_data epd = { 3512 .bio = NULL, 3513 .tree = tree, 3514 .extent_locked = 0, 3515 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3516 .bio_flags = 0, 3517 }; 3518 int ret = 0; 3519 int done = 0; 3520 int nr_to_write_done = 0; 3521 struct pagevec pvec; 3522 int nr_pages; 3523 pgoff_t index; 3524 pgoff_t end; /* Inclusive */ 3525 int scanned = 0; 3526 int tag; 3527 3528 pagevec_init(&pvec, 0); 3529 if (wbc->range_cyclic) { 3530 index = mapping->writeback_index; /* Start from prev offset */ 3531 end = -1; 3532 } else { 3533 index = wbc->range_start >> PAGE_CACHE_SHIFT; 3534 end = wbc->range_end >> PAGE_CACHE_SHIFT; 3535 scanned = 1; 3536 } 3537 if (wbc->sync_mode == WB_SYNC_ALL) 3538 tag = PAGECACHE_TAG_TOWRITE; 3539 else 3540 tag = PAGECACHE_TAG_DIRTY; 3541 retry: 3542 if (wbc->sync_mode == WB_SYNC_ALL) 3543 tag_pages_for_writeback(mapping, index, end); 3544 while (!done && !nr_to_write_done && (index <= end) && 3545 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 3546 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 3547 unsigned i; 3548 3549 scanned = 1; 3550 for (i = 0; i < nr_pages; i++) { 3551 struct page *page = pvec.pages[i]; 3552 3553 if (!PagePrivate(page)) 3554 continue; 3555 3556 if (!wbc->range_cyclic && page->index > end) { 3557 done = 1; 3558 break; 3559 } 3560 3561 spin_lock(&mapping->private_lock); 3562 if (!PagePrivate(page)) { 3563 spin_unlock(&mapping->private_lock); 3564 continue; 3565 } 3566 3567 eb = (struct extent_buffer *)page->private; 3568 3569 /* 3570 * Shouldn't happen and normally this would be a BUG_ON 3571 * but no sense in crashing the users box for something 3572 * we can survive anyway. 
3573 */ 3574 if (!eb) { 3575 spin_unlock(&mapping->private_lock); 3576 WARN_ON(1); 3577 continue; 3578 } 3579 3580 if (eb == prev_eb) { 3581 spin_unlock(&mapping->private_lock); 3582 continue; 3583 } 3584 3585 ret = atomic_inc_not_zero(&eb->refs); 3586 spin_unlock(&mapping->private_lock); 3587 if (!ret) 3588 continue; 3589 3590 prev_eb = eb; 3591 ret = lock_extent_buffer_for_io(eb, fs_info, &epd); 3592 if (!ret) { 3593 free_extent_buffer(eb); 3594 continue; 3595 } 3596 3597 ret = write_one_eb(eb, fs_info, wbc, &epd); 3598 if (ret) { 3599 done = 1; 3600 free_extent_buffer(eb); 3601 break; 3602 } 3603 free_extent_buffer(eb); 3604 3605 /* 3606 * the filesystem may choose to bump up nr_to_write. 3607 * We have to make sure to honor the new nr_to_write 3608 * at any time 3609 */ 3610 nr_to_write_done = wbc->nr_to_write <= 0; 3611 } 3612 pagevec_release(&pvec); 3613 cond_resched(); 3614 } 3615 if (!scanned && !done) { 3616 /* 3617 * We hit the last page and there is more work to be done: wrap 3618 * back to the start of the file 3619 */ 3620 scanned = 1; 3621 index = 0; 3622 goto retry; 3623 } 3624 flush_write_bio(&epd); 3625 return ret; 3626 } 3627 3628 /** 3629 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. 3630 * @mapping: address space structure to write 3631 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 3632 * @writepage: function called for each page 3633 * @data: data passed to writepage function 3634 * 3635 * If a page is already under I/O, write_cache_pages() skips it, even 3636 * if it's dirty. This is desirable behaviour for memory-cleaning writeback, 3637 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() 3638 * and msync() need to guarantee that all the data which was dirty at the time 3639 * the call was made get new I/O started against them. If wbc->sync_mode is 3640 * WB_SYNC_ALL then we were called for data integrity and we must wait for 3641 * existing IO to complete. 3642 */ 3643 static int extent_write_cache_pages(struct extent_io_tree *tree, 3644 struct address_space *mapping, 3645 struct writeback_control *wbc, 3646 writepage_t writepage, void *data, 3647 void (*flush_fn)(void *)) 3648 { 3649 struct inode *inode = mapping->host; 3650 int ret = 0; 3651 int done = 0; 3652 int nr_to_write_done = 0; 3653 struct pagevec pvec; 3654 int nr_pages; 3655 pgoff_t index; 3656 pgoff_t end; /* Inclusive */ 3657 int scanned = 0; 3658 int tag; 3659 3660 /* 3661 * We have to hold onto the inode so that ordered extents can do their 3662 * work when the IO finishes. The alternative to this is failing to add 3663 * an ordered extent if the igrab() fails there and that is a huge pain 3664 * to deal with, so instead just hold onto the inode throughout the 3665 * writepages operation. If it fails here we are freeing up the inode 3666 * anyway and we'd rather not waste our time writing out stuff that is 3667 * going to be truncated anyway. 
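* the reference taken here is dropped with btrfs_add_delayed_iput()
* once the page walk below has finished.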
3668 */ 3669 if (!igrab(inode)) 3670 return 0; 3671 3672 pagevec_init(&pvec, 0); 3673 if (wbc->range_cyclic) { 3674 index = mapping->writeback_index; /* Start from prev offset */ 3675 end = -1; 3676 } else { 3677 index = wbc->range_start >> PAGE_CACHE_SHIFT; 3678 end = wbc->range_end >> PAGE_CACHE_SHIFT; 3679 scanned = 1; 3680 } 3681 if (wbc->sync_mode == WB_SYNC_ALL) 3682 tag = PAGECACHE_TAG_TOWRITE; 3683 else 3684 tag = PAGECACHE_TAG_DIRTY; 3685 retry: 3686 if (wbc->sync_mode == WB_SYNC_ALL) 3687 tag_pages_for_writeback(mapping, index, end); 3688 while (!done && !nr_to_write_done && (index <= end) && 3689 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 3690 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 3691 unsigned i; 3692 3693 scanned = 1; 3694 for (i = 0; i < nr_pages; i++) { 3695 struct page *page = pvec.pages[i]; 3696 3697 /* 3698 * At this point we hold neither mapping->tree_lock nor 3699 * lock on the page itself: the page may be truncated or 3700 * invalidated (changing page->mapping to NULL), or even 3701 * swizzled back from swapper_space to tmpfs file 3702 * mapping 3703 */ 3704 if (!trylock_page(page)) { 3705 flush_fn(data); 3706 lock_page(page); 3707 } 3708 3709 if (unlikely(page->mapping != mapping)) { 3710 unlock_page(page); 3711 continue; 3712 } 3713 3714 if (!wbc->range_cyclic && page->index > end) { 3715 done = 1; 3716 unlock_page(page); 3717 continue; 3718 } 3719 3720 if (wbc->sync_mode != WB_SYNC_NONE) { 3721 if (PageWriteback(page)) 3722 flush_fn(data); 3723 wait_on_page_writeback(page); 3724 } 3725 3726 if (PageWriteback(page) || 3727 !clear_page_dirty_for_io(page)) { 3728 unlock_page(page); 3729 continue; 3730 } 3731 3732 ret = (*writepage)(page, wbc, data); 3733 3734 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { 3735 unlock_page(page); 3736 ret = 0; 3737 } 3738 if (ret) 3739 done = 1; 3740 3741 /* 3742 * the filesystem may choose to bump up nr_to_write. 
3743 * We have to make sure to honor the new nr_to_write 3744 * at any time 3745 */ 3746 nr_to_write_done = wbc->nr_to_write <= 0; 3747 } 3748 pagevec_release(&pvec); 3749 cond_resched(); 3750 } 3751 if (!scanned && !done) { 3752 /* 3753 * We hit the last page and there is more work to be done: wrap 3754 * back to the start of the file 3755 */ 3756 scanned = 1; 3757 index = 0; 3758 goto retry; 3759 } 3760 btrfs_add_delayed_iput(inode); 3761 return ret; 3762 } 3763 3764 static void flush_epd_write_bio(struct extent_page_data *epd) 3765 { 3766 if (epd->bio) { 3767 int rw = WRITE; 3768 int ret; 3769 3770 if (epd->sync_io) 3771 rw = WRITE_SYNC; 3772 3773 ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags); 3774 BUG_ON(ret < 0); /* -ENOMEM */ 3775 epd->bio = NULL; 3776 } 3777 } 3778 3779 static noinline void flush_write_bio(void *data) 3780 { 3781 struct extent_page_data *epd = data; 3782 flush_epd_write_bio(epd); 3783 } 3784 3785 int extent_write_full_page(struct extent_io_tree *tree, struct page *page, 3786 get_extent_t *get_extent, 3787 struct writeback_control *wbc) 3788 { 3789 int ret; 3790 struct extent_page_data epd = { 3791 .bio = NULL, 3792 .tree = tree, 3793 .get_extent = get_extent, 3794 .extent_locked = 0, 3795 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3796 .bio_flags = 0, 3797 }; 3798 3799 ret = __extent_writepage(page, wbc, &epd); 3800 3801 flush_epd_write_bio(&epd); 3802 return ret; 3803 } 3804 3805 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, 3806 u64 start, u64 end, get_extent_t *get_extent, 3807 int mode) 3808 { 3809 int ret = 0; 3810 struct address_space *mapping = inode->i_mapping; 3811 struct page *page; 3812 unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >> 3813 PAGE_CACHE_SHIFT; 3814 3815 struct extent_page_data epd = { 3816 .bio = NULL, 3817 .tree = tree, 3818 .get_extent = get_extent, 3819 .extent_locked = 1, 3820 .sync_io = mode == WB_SYNC_ALL, 3821 .bio_flags = 0, 3822 }; 3823 struct writeback_control wbc_writepages = { 3824 .sync_mode = mode, 3825 .nr_to_write = nr_pages * 2, 3826 .range_start = start, 3827 .range_end = end + 1, 3828 }; 3829 3830 while (start <= end) { 3831 page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); 3832 if (clear_page_dirty_for_io(page)) 3833 ret = __extent_writepage(page, &wbc_writepages, &epd); 3834 else { 3835 if (tree->ops && tree->ops->writepage_end_io_hook) 3836 tree->ops->writepage_end_io_hook(page, start, 3837 start + PAGE_CACHE_SIZE - 1, 3838 NULL, 1); 3839 unlock_page(page); 3840 } 3841 page_cache_release(page); 3842 start += PAGE_CACHE_SIZE; 3843 } 3844 3845 flush_epd_write_bio(&epd); 3846 return ret; 3847 } 3848 3849 int extent_writepages(struct extent_io_tree *tree, 3850 struct address_space *mapping, 3851 get_extent_t *get_extent, 3852 struct writeback_control *wbc) 3853 { 3854 int ret = 0; 3855 struct extent_page_data epd = { 3856 .bio = NULL, 3857 .tree = tree, 3858 .get_extent = get_extent, 3859 .extent_locked = 0, 3860 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3861 .bio_flags = 0, 3862 }; 3863 3864 ret = extent_write_cache_pages(tree, mapping, wbc, 3865 __extent_writepage, &epd, 3866 flush_write_bio); 3867 flush_epd_write_bio(&epd); 3868 return ret; 3869 } 3870 3871 int extent_readpages(struct extent_io_tree *tree, 3872 struct address_space *mapping, 3873 struct list_head *pages, unsigned nr_pages, 3874 get_extent_t get_extent) 3875 { 3876 struct bio *bio = NULL; 3877 unsigned page_idx; 3878 unsigned long bio_flags = 0; 3879 struct page *pagepool[16]; 3880 struct page 
*page; 3881 struct extent_map *em_cached = NULL; 3882 int nr = 0; 3883 3884 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 3885 page = list_entry(pages->prev, struct page, lru); 3886 3887 prefetchw(&page->flags); 3888 list_del(&page->lru); 3889 if (add_to_page_cache_lru(page, mapping, 3890 page->index, GFP_NOFS)) { 3891 page_cache_release(page); 3892 continue; 3893 } 3894 3895 pagepool[nr++] = page; 3896 if (nr < ARRAY_SIZE(pagepool)) 3897 continue; 3898 __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, 3899 &bio, 0, &bio_flags, READ); 3900 nr = 0; 3901 } 3902 if (nr) 3903 __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, 3904 &bio, 0, &bio_flags, READ); 3905 3906 if (em_cached) 3907 free_extent_map(em_cached); 3908 3909 BUG_ON(!list_empty(pages)); 3910 if (bio) 3911 return submit_one_bio(READ, bio, 0, bio_flags); 3912 return 0; 3913 } 3914 3915 /* 3916 * basic invalidatepage code, this waits on any locked or writeback 3917 * ranges corresponding to the page, and then deletes any extent state 3918 * records from the tree 3919 */ 3920 int extent_invalidatepage(struct extent_io_tree *tree, 3921 struct page *page, unsigned long offset) 3922 { 3923 struct extent_state *cached_state = NULL; 3924 u64 start = page_offset(page); 3925 u64 end = start + PAGE_CACHE_SIZE - 1; 3926 size_t blocksize = page->mapping->host->i_sb->s_blocksize; 3927 3928 start += ALIGN(offset, blocksize); 3929 if (start > end) 3930 return 0; 3931 3932 lock_extent_bits(tree, start, end, 0, &cached_state); 3933 wait_on_page_writeback(page); 3934 clear_extent_bit(tree, start, end, 3935 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 3936 EXTENT_DO_ACCOUNTING, 3937 1, 1, &cached_state, GFP_NOFS); 3938 return 0; 3939 } 3940 3941 /* 3942 * a helper for releasepage, this tests for areas of the page that 3943 * are locked or under IO and drops the related state bits if it is safe 3944 * to drop the page. 3945 */ 3946 static int try_release_extent_state(struct extent_map_tree *map, 3947 struct extent_io_tree *tree, 3948 struct page *page, gfp_t mask) 3949 { 3950 u64 start = page_offset(page); 3951 u64 end = start + PAGE_CACHE_SIZE - 1; 3952 int ret = 1; 3953 3954 if (test_range_bit(tree, start, end, 3955 EXTENT_IOBITS, 0, NULL)) 3956 ret = 0; 3957 else { 3958 if ((mask & GFP_NOFS) == GFP_NOFS) 3959 mask = GFP_NOFS; 3960 /* 3961 * at this point we can safely clear everything except the 3962 * locked bit and the nodatasum bit 3963 */ 3964 ret = clear_extent_bit(tree, start, end, 3965 ~(EXTENT_LOCKED | EXTENT_NODATASUM), 3966 0, 0, NULL, mask); 3967 3968 /* if clear_extent_bit failed for enomem reasons, 3969 * we can't allow the release to continue. 3970 */ 3971 if (ret < 0) 3972 ret = 0; 3973 else 3974 ret = 1; 3975 } 3976 return ret; 3977 } 3978 3979 /* 3980 * a helper for releasepage. 
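* extent maps are only dropped when the caller may block (__GFP_WAIT)
* and the file is larger than 16MB; the final decision on the page
* itself is made by try_release_extent_state() above.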
As long as there are no locked extents 3981 * in the range corresponding to the page, both state records and extent 3982 * map records are removed 3983 */ 3984 int try_release_extent_mapping(struct extent_map_tree *map, 3985 struct extent_io_tree *tree, struct page *page, 3986 gfp_t mask) 3987 { 3988 struct extent_map *em; 3989 u64 start = page_offset(page); 3990 u64 end = start + PAGE_CACHE_SIZE - 1; 3991 3992 if ((mask & __GFP_WAIT) && 3993 page->mapping->host->i_size > 16 * 1024 * 1024) { 3994 u64 len; 3995 while (start <= end) { 3996 len = end - start + 1; 3997 write_lock(&map->lock); 3998 em = lookup_extent_mapping(map, start, len); 3999 if (!em) { 4000 write_unlock(&map->lock); 4001 break; 4002 } 4003 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || 4004 em->start != start) { 4005 write_unlock(&map->lock); 4006 free_extent_map(em); 4007 break; 4008 } 4009 if (!test_range_bit(tree, em->start, 4010 extent_map_end(em) - 1, 4011 EXTENT_LOCKED | EXTENT_WRITEBACK, 4012 0, NULL)) { 4013 remove_extent_mapping(map, em); 4014 /* once for the rb tree */ 4015 free_extent_map(em); 4016 } 4017 start = extent_map_end(em); 4018 write_unlock(&map->lock); 4019 4020 /* once for us */ 4021 free_extent_map(em); 4022 } 4023 } 4024 return try_release_extent_state(map, tree, page, mask); 4025 } 4026 4027 /* 4028 * helper function for fiemap, which doesn't want to see any holes. 4029 * This maps until we find something past 'last' 4030 */ 4031 static struct extent_map *get_extent_skip_holes(struct inode *inode, 4032 u64 offset, 4033 u64 last, 4034 get_extent_t *get_extent) 4035 { 4036 u64 sectorsize = BTRFS_I(inode)->root->sectorsize; 4037 struct extent_map *em; 4038 u64 len; 4039 4040 if (offset >= last) 4041 return NULL; 4042 4043 while(1) { 4044 len = last - offset; 4045 if (len == 0) 4046 break; 4047 len = ALIGN(len, sectorsize); 4048 em = get_extent(inode, NULL, 0, offset, len, 0); 4049 if (IS_ERR_OR_NULL(em)) 4050 return em; 4051 4052 /* if this isn't a hole return it */ 4053 if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) && 4054 em->block_start != EXTENT_MAP_HOLE) { 4055 return em; 4056 } 4057 4058 /* this is a hole, advance to the next extent */ 4059 offset = extent_map_end(em); 4060 free_extent_map(em); 4061 if (offset >= last) 4062 break; 4063 } 4064 return NULL; 4065 } 4066 4067 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 4068 __u64 start, __u64 len, get_extent_t *get_extent) 4069 { 4070 int ret = 0; 4071 u64 off = start; 4072 u64 max = start + len; 4073 u32 flags = 0; 4074 u32 found_type; 4075 u64 last; 4076 u64 last_for_get_extent = 0; 4077 u64 disko = 0; 4078 u64 isize = i_size_read(inode); 4079 struct btrfs_key found_key; 4080 struct extent_map *em = NULL; 4081 struct extent_state *cached_state = NULL; 4082 struct btrfs_path *path; 4083 struct btrfs_file_extent_item *item; 4084 int end = 0; 4085 u64 em_start = 0; 4086 u64 em_len = 0; 4087 u64 em_end = 0; 4088 unsigned long emflags; 4089 4090 if (len == 0) 4091 return -EINVAL; 4092 4093 path = btrfs_alloc_path(); 4094 if (!path) 4095 return -ENOMEM; 4096 path->leave_spinning = 1; 4097 4098 start = ALIGN(start, BTRFS_I(inode)->root->sectorsize); 4099 len = ALIGN(len, BTRFS_I(inode)->root->sectorsize); 4100 4101 /* 4102 * lookup the last file extent. 
int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		  __u64 start, __u64 len, get_extent_t *get_extent)
{
	int ret = 0;
	u64 off = start;
	u64 max = start + len;
	u32 flags = 0;
	u32 found_type;
	u64 last;
	u64 last_for_get_extent = 0;
	u64 disko = 0;
	u64 isize = i_size_read(inode);
	struct btrfs_key found_key;
	struct extent_map *em = NULL;
	struct extent_state *cached_state = NULL;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *item;
	int end = 0;
	u64 em_start = 0;
	u64 em_len = 0;
	u64 em_end = 0;
	unsigned long emflags;

	if (len == 0)
		return -EINVAL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->leave_spinning = 1;

	start = ALIGN(start, BTRFS_I(inode)->root->sectorsize);
	len = ALIGN(len, BTRFS_I(inode)->root->sectorsize);

	/*
	 * lookup the last file extent.  We're not using i_size here
	 * because there might be preallocation past i_size
	 */
	ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
				       path, btrfs_ino(inode), -1, 0);
	if (ret < 0) {
		btrfs_free_path(path);
		return ret;
	}
	WARN_ON(!ret);
	path->slots[0]--;
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_file_extent_item);
	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
	found_type = btrfs_key_type(&found_key);

	/* No extents, but there might be delalloc bits */
	if (found_key.objectid != btrfs_ino(inode) ||
	    found_type != BTRFS_EXTENT_DATA_KEY) {
		/* have to trust i_size as the end */
		last = (u64)-1;
		last_for_get_extent = isize;
	} else {
		/*
		 * remember the start of the last extent.  There are a
		 * bunch of different factors that go into the length of the
		 * extent, so it's much less complex to remember where it
		 * started
		 */
		last = found_key.offset;
		last_for_get_extent = last + 1;
	}
	btrfs_free_path(path);

	/*
	 * we might have some extents allocated but more delalloc past those
	 * extents.  so, we trust isize unless the start of the last extent is
	 * beyond isize
	 */
	if (last < isize) {
		last = (u64)-1;
		last_for_get_extent = isize;
	}

	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 0,
			 &cached_state);

	em = get_extent_skip_holes(inode, start, last_for_get_extent,
				   get_extent);
	if (!em)
		goto out;
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto out;
	}

	while (!end) {
		u64 offset_in_extent = 0;

		/* break if the extent we found is outside the range */
		if (em->start >= max || extent_map_end(em) < off)
			break;

		/*
		 * get_extent may return an extent that starts before our
		 * requested range.  We have to make sure the ranges
		 * we return to fiemap always move forward and don't
		 * overlap, so adjust the offsets here
		 */
		em_start = max(em->start, off);

		/*
		 * record the offset from the start of the extent
		 * for adjusting the disk offset below.  Only do this if the
		 * extent isn't compressed since our in ram offset may be past
		 * what we have actually allocated on disk.
		 */
		if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
			offset_in_extent = em_start - em->start;
		em_end = extent_map_end(em);
		em_len = em_end - em_start;
		emflags = em->flags;
		disko = 0;
		flags = 0;

		/*
		 * bump off for our next call to get_extent
		 */
		off = extent_map_end(em);
		if (off >= max)
			end = 1;

		if (em->block_start == EXTENT_MAP_LAST_BYTE) {
			end = 1;
			flags |= FIEMAP_EXTENT_LAST;
		} else if (em->block_start == EXTENT_MAP_INLINE) {
			flags |= (FIEMAP_EXTENT_DATA_INLINE |
				  FIEMAP_EXTENT_NOT_ALIGNED);
		} else if (em->block_start == EXTENT_MAP_DELALLOC) {
			flags |= (FIEMAP_EXTENT_DELALLOC |
				  FIEMAP_EXTENT_UNKNOWN);
		} else {
			disko = em->block_start + offset_in_extent;
		}
		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
			flags |= FIEMAP_EXTENT_ENCODED;

		free_extent_map(em);
		em = NULL;
		if ((em_start >= last) || em_len == (u64)-1 ||
		    (last == (u64)-1 && isize <= em_end)) {
			flags |= FIEMAP_EXTENT_LAST;
			end = 1;
		}

		/* now scan forward to see if this is really the last extent. */
		em = get_extent_skip_holes(inode, off, last_for_get_extent,
					   get_extent);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out;
		}
		if (!em) {
			flags |= FIEMAP_EXTENT_LAST;
			end = 1;
		}
		ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
					      em_len, flags);
		if (ret)
			goto out_free;
	}
out_free:
	free_extent_map(em);
out:
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
			     &cached_state, GFP_NOFS);
	return ret;
}

static void __free_extent_buffer(struct extent_buffer *eb)
{
	btrfs_leak_debug_del(&eb->leak_list);
	kmem_cache_free(extent_buffer_cache, eb);
}

static int extent_buffer_under_io(struct extent_buffer *eb)
{
	return (atomic_read(&eb->io_pages) ||
		test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
		test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
}

/*
 * Helper for releasing extent buffer page.
 */
static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
					     unsigned long start_idx)
{
	unsigned long index;
	unsigned long num_pages;
	struct page *page;
	int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);

	BUG_ON(extent_buffer_under_io(eb));

	num_pages = num_extent_pages(eb->start, eb->len);
	index = start_idx + num_pages;
	if (start_idx >= index)
		return;

	do {
		index--;
		page = extent_buffer_page(eb, index);
		if (page && mapped) {
			spin_lock(&page->mapping->private_lock);
			/*
			 * We do this since we'll remove the pages after we've
			 * removed the eb from the radix tree, so we could race
			 * and have this page now attached to the new eb.  So
			 * only clear page_private if it's still connected to
			 * this eb.
			 */
			if (PagePrivate(page) &&
			    page->private == (unsigned long)eb) {
				BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
				BUG_ON(PageDirty(page));
				BUG_ON(PageWriteback(page));
				/*
				 * We need to make sure we haven't been attached
				 * to a new eb.
				 */
				ClearPagePrivate(page);
				set_page_private(page, 0);
				/* One for the page private */
				page_cache_release(page);
			}
			spin_unlock(&page->mapping->private_lock);

		}
		if (page) {
			/* One for when we allocated the page */
			page_cache_release(page);
		}
	} while (index != start_idx);
}

/*
 * Helper for releasing the extent buffer.
 */
static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
{
	btrfs_release_extent_buffer_page(eb, 0);
	__free_extent_buffer(eb);
}

static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
						   u64 start,
						   unsigned long len,
						   gfp_t mask)
{
	struct extent_buffer *eb = NULL;

	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
	if (eb == NULL)
		return NULL;
	eb->start = start;
	eb->len = len;
	eb->tree = tree;
	eb->bflags = 0;
	rwlock_init(&eb->lock);
	atomic_set(&eb->write_locks, 0);
	atomic_set(&eb->read_locks, 0);
	atomic_set(&eb->blocking_readers, 0);
	atomic_set(&eb->blocking_writers, 0);
	atomic_set(&eb->spinning_readers, 0);
	atomic_set(&eb->spinning_writers, 0);
	eb->lock_nested = 0;
	init_waitqueue_head(&eb->write_lock_wq);
	init_waitqueue_head(&eb->read_lock_wq);

	btrfs_leak_debug_add(&eb->leak_list, &buffers);

	spin_lock_init(&eb->refs_lock);
	atomic_set(&eb->refs, 1);
	atomic_set(&eb->io_pages, 0);

	/*
	 * Sanity checks, currently the maximum is 64k covered by 16x 4k pages
	 */
	BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE
		     > MAX_INLINE_EXTENT_BUFFER_SIZE);
	BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);

	return eb;
}

struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
{
	unsigned long i;
	struct page *p;
	struct extent_buffer *new;
	unsigned long num_pages = num_extent_pages(src->start, src->len);

	new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_NOFS);
	if (new == NULL)
		return NULL;

	for (i = 0; i < num_pages; i++) {
		p = alloc_page(GFP_NOFS);
		if (!p) {
			btrfs_release_extent_buffer(new);
			return NULL;
		}
		attach_extent_buffer_page(new, p);
		WARN_ON(PageDirty(p));
		SetPageUptodate(p);
		new->pages[i] = p;
	}

	copy_extent_buffer(new, src, 0, 0, src->len);
	set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
	set_bit(EXTENT_BUFFER_DUMMY, &new->bflags);

	return new;
}

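/*
 * Allocate a dummy extent buffer that is not attached to an io tree and is
 * backed by freshly allocated pages instead of the page cache.  The buffer
 * is marked uptodate and EXTENT_BUFFER_DUMMY so the release paths skip the
 * page->private handling done for regular buffers.
 */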
struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
{
	struct extent_buffer *eb;
	unsigned long num_pages = num_extent_pages(0, len);
	unsigned long i;

	eb = __alloc_extent_buffer(NULL, start, len, GFP_NOFS);
	if (!eb)
		return NULL;

	for (i = 0; i < num_pages; i++) {
		eb->pages[i] = alloc_page(GFP_NOFS);
		if (!eb->pages[i])
			goto err;
	}
	set_extent_buffer_uptodate(eb);
	btrfs_set_header_nritems(eb, 0);
	set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);

	return eb;
err:
	for (; i > 0; i--)
		__free_page(eb->pages[i - 1]);
	__free_extent_buffer(eb);
	return NULL;
}

static void check_buffer_tree_ref(struct extent_buffer *eb)
{
	int refs;
	/* the ref bit is tricky.  We have to make sure it is set
	 * if we have the buffer dirty.  Otherwise the
	 * code to free a buffer can end up dropping a dirty
	 * page
	 *
	 * Once the ref bit is set, it won't go away while the
	 * buffer is dirty or in writeback, and it also won't
	 * go away while we have the reference count on the
	 * eb bumped.
	 *
	 * We can't just set the ref bit without bumping the
	 * ref on the eb because free_extent_buffer might
	 * see the ref bit and try to clear it.  If this happens
	 * free_extent_buffer might end up dropping our original
	 * ref by mistake and freeing the page before we are able
	 * to add one more ref.
	 *
	 * So bump the ref count first, then set the bit.  If someone
	 * beat us to it, drop the ref we added.
	 */
	refs = atomic_read(&eb->refs);
	if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
		return;

	spin_lock(&eb->refs_lock);
	if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
		atomic_inc(&eb->refs);
	spin_unlock(&eb->refs_lock);
}

static void mark_extent_buffer_accessed(struct extent_buffer *eb)
{
	unsigned long num_pages, i;

	check_buffer_tree_ref(eb);

	num_pages = num_extent_pages(eb->start, eb->len);
	for (i = 0; i < num_pages; i++) {
		struct page *p = extent_buffer_page(eb, i);
		mark_page_accessed(p);
	}
}

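/*
 * Find or create the extent buffer for the range [start, start + len).  A
 * hit in the radix tree returns the existing buffer with an extra ref;
 * otherwise the pages are pulled from the page cache, attached to a new
 * buffer and the buffer is inserted into the radix tree, backing off to
 * whichever buffer won a racing insertion.
 */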
struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
					  u64 start, unsigned long len)
{
	unsigned long num_pages = num_extent_pages(start, len);
	unsigned long i;
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	struct extent_buffer *eb;
	struct extent_buffer *exists = NULL;
	struct page *p;
	struct address_space *mapping = tree->mapping;
	int uptodate = 1;
	int ret;

	rcu_read_lock();
	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
	if (eb && atomic_inc_not_zero(&eb->refs)) {
		rcu_read_unlock();
		mark_extent_buffer_accessed(eb);
		return eb;
	}
	rcu_read_unlock();

	eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS);
	if (!eb)
		return NULL;

	for (i = 0; i < num_pages; i++, index++) {
		p = find_or_create_page(mapping, index, GFP_NOFS);
		if (!p)
			goto free_eb;

		spin_lock(&mapping->private_lock);
		if (PagePrivate(p)) {
			/*
			 * We could have already allocated an eb for this page
			 * and attached one so let's see if we can get a ref on
			 * the existing eb, and if we can we know it's good and
			 * we can just return that one, else we know we can just
			 * overwrite page->private.
			 */
			exists = (struct extent_buffer *)p->private;
			if (atomic_inc_not_zero(&exists->refs)) {
				spin_unlock(&mapping->private_lock);
				unlock_page(p);
				page_cache_release(p);
				mark_extent_buffer_accessed(exists);
				goto free_eb;
			}

			/*
			 * Do this so attach doesn't complain and we need to
			 * drop the ref the old guy had.
			 */
			ClearPagePrivate(p);
			WARN_ON(PageDirty(p));
			page_cache_release(p);
		}
		attach_extent_buffer_page(eb, p);
		spin_unlock(&mapping->private_lock);
		WARN_ON(PageDirty(p));
		mark_page_accessed(p);
		eb->pages[i] = p;
		if (!PageUptodate(p))
			uptodate = 0;

		/*
		 * see below about how we avoid a nasty race with release page
		 * and why we unlock later
		 */
	}
	if (uptodate)
		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
again:
	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
	if (ret)
		goto free_eb;

	spin_lock(&tree->buffer_lock);
	ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
	if (ret == -EEXIST) {
		exists = radix_tree_lookup(&tree->buffer,
					   start >> PAGE_CACHE_SHIFT);
		if (!atomic_inc_not_zero(&exists->refs)) {
			spin_unlock(&tree->buffer_lock);
			radix_tree_preload_end();
			exists = NULL;
			goto again;
		}
		spin_unlock(&tree->buffer_lock);
		radix_tree_preload_end();
		mark_extent_buffer_accessed(exists);
		goto free_eb;
	}
	/* add one reference for the tree */
	check_buffer_tree_ref(eb);
	spin_unlock(&tree->buffer_lock);
	radix_tree_preload_end();

	/*
	 * there is a race where release page may have
	 * tried to find this extent buffer in the radix tree
	 * but failed.  It will tell the VM it is safe to
	 * reclaim the page, and it will clear the page private bit.
	 * We must make sure to set the page private bit properly
	 * after the extent buffer is in the radix tree so
	 * it doesn't get lost
	 */
	SetPageChecked(eb->pages[0]);
	for (i = 1; i < num_pages; i++) {
		p = extent_buffer_page(eb, i);
		ClearPageChecked(p);
		unlock_page(p);
	}
	unlock_page(eb->pages[0]);
	return eb;

free_eb:
	for (i = 0; i < num_pages; i++) {
		if (eb->pages[i])
			unlock_page(eb->pages[i]);
	}

	WARN_ON(!atomic_dec_and_test(&eb->refs));
	btrfs_release_extent_buffer(eb);
	return exists;
}

struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
					 u64 start, unsigned long len)
{
	struct extent_buffer *eb;

	rcu_read_lock();
	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
	if (eb && atomic_inc_not_zero(&eb->refs)) {
		rcu_read_unlock();
		mark_extent_buffer_accessed(eb);
		return eb;
	}
	rcu_read_unlock();

	return NULL;
}

static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
{
	struct extent_buffer *eb =
			container_of(head, struct extent_buffer, rcu_head);

	__free_extent_buffer(eb);
}

/* Expects to have eb->refs_lock already held */
static int release_extent_buffer(struct extent_buffer *eb)
{
	WARN_ON(atomic_read(&eb->refs) == 0);
	if (atomic_dec_and_test(&eb->refs)) {
		if (test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) {
			spin_unlock(&eb->refs_lock);
		} else {
			struct extent_io_tree *tree = eb->tree;

			spin_unlock(&eb->refs_lock);

			spin_lock(&tree->buffer_lock);
			radix_tree_delete(&tree->buffer,
					  eb->start >> PAGE_CACHE_SHIFT);
			spin_unlock(&tree->buffer_lock);
		}

		/* Should be safe to release our pages at this point */
		btrfs_release_extent_buffer_page(eb, 0);
		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
		return 1;
	}
	spin_unlock(&eb->refs_lock);

	return 0;
}

void free_extent_buffer(struct extent_buffer *eb)
{
	int refs;
	int old;
	if (!eb)
		return;

	while (1) {
		refs = atomic_read(&eb->refs);
		if (refs <= 3)
			break;
		old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
		if (old == refs)
			return;
	}

	spin_lock(&eb->refs_lock);
	if (atomic_read(&eb->refs) == 2 &&
	    test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
		atomic_dec(&eb->refs);

	if (atomic_read(&eb->refs) == 2 &&
	    test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
	    !extent_buffer_under_io(eb) &&
	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
		atomic_dec(&eb->refs);

	/*
	 * I know this is terrible, but it's temporary until we stop tracking
	 * the uptodate bits and such for the extent buffers.
	 */
	release_extent_buffer(eb);
}

void free_extent_buffer_stale(struct extent_buffer *eb)
{
	if (!eb)
		return;

	spin_lock(&eb->refs_lock);
	set_bit(EXTENT_BUFFER_STALE, &eb->bflags);

	if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
		atomic_dec(&eb->refs);
	release_extent_buffer(eb);
}

void clear_extent_buffer_dirty(struct extent_buffer *eb)
{
	unsigned long i;
	unsigned long num_pages;
	struct page *page;

	num_pages = num_extent_pages(eb->start, eb->len);

	for (i = 0; i < num_pages; i++) {
		page = extent_buffer_page(eb, i);
		if (!PageDirty(page))
			continue;

		lock_page(page);
		WARN_ON(!PagePrivate(page));

		clear_page_dirty_for_io(page);
		spin_lock_irq(&page->mapping->tree_lock);
		if (!PageDirty(page)) {
			radix_tree_tag_clear(&page->mapping->page_tree,
					     page_index(page),
					     PAGECACHE_TAG_DIRTY);
		}
		spin_unlock_irq(&page->mapping->tree_lock);
		ClearPageError(page);
		unlock_page(page);
	}
	WARN_ON(atomic_read(&eb->refs) == 0);
}

int set_extent_buffer_dirty(struct extent_buffer *eb)
{
	unsigned long i;
	unsigned long num_pages;
	int was_dirty = 0;

	check_buffer_tree_ref(eb);

	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);

	num_pages = num_extent_pages(eb->start, eb->len);
	WARN_ON(atomic_read(&eb->refs) == 0);
	WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));

	for (i = 0; i < num_pages; i++)
		set_page_dirty(extent_buffer_page(eb, i));
	return was_dirty;
}

int clear_extent_buffer_uptodate(struct extent_buffer *eb)
{
	unsigned long i;
	struct page *page;
	unsigned long num_pages;

	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
	num_pages = num_extent_pages(eb->start, eb->len);
	for (i = 0; i < num_pages; i++) {
		page = extent_buffer_page(eb, i);
		if (page)
			ClearPageUptodate(page);
	}
	return 0;
}

int set_extent_buffer_uptodate(struct extent_buffer *eb)
{
	unsigned long i;
	struct page *page;
	unsigned long num_pages;

	set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
	num_pages = num_extent_pages(eb->start, eb->len);
	for (i = 0; i < num_pages; i++) {
		page = extent_buffer_page(eb, i);
		SetPageUptodate(page);
	}
	return 0;
}

int extent_buffer_uptodate(struct extent_buffer *eb)
{
	return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
}

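/*
 * Read the pages of an extent buffer that are not yet uptodate.  With
 * WAIT_NONE the pages are only tried with trylock_page() and the reads are
 * submitted without waiting; with WAIT_COMPLETE we wait on every page and
 * return -EIO if any of them failed to become uptodate.
 */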
int read_extent_buffer_pages(struct extent_io_tree *tree,
			     struct extent_buffer *eb, u64 start, int wait,
			     get_extent_t *get_extent, int mirror_num)
{
	unsigned long i;
	unsigned long start_i;
	struct page *page;
	int err;
	int ret = 0;
	int locked_pages = 0;
	int all_uptodate = 1;
	unsigned long num_pages;
	unsigned long num_reads = 0;
	struct bio *bio = NULL;
	unsigned long bio_flags = 0;

	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
		return 0;

	if (start) {
		WARN_ON(start < eb->start);
		start_i = (start >> PAGE_CACHE_SHIFT) -
			(eb->start >> PAGE_CACHE_SHIFT);
	} else {
		start_i = 0;
	}

	num_pages = num_extent_pages(eb->start, eb->len);
	for (i = start_i; i < num_pages; i++) {
		page = extent_buffer_page(eb, i);
		if (wait == WAIT_NONE) {
			if (!trylock_page(page))
				goto unlock_exit;
		} else {
			lock_page(page);
		}
		locked_pages++;
		if (!PageUptodate(page)) {
			num_reads++;
			all_uptodate = 0;
		}
	}
	if (all_uptodate) {
		if (start_i == 0)
			set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
		goto unlock_exit;
	}

	clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
	eb->read_mirror = 0;
	atomic_set(&eb->io_pages, num_reads);
	for (i = start_i; i < num_pages; i++) {
		page = extent_buffer_page(eb, i);
		if (!PageUptodate(page)) {
			ClearPageError(page);
			err = __extent_read_full_page(tree, page,
						      get_extent, &bio,
						      mirror_num, &bio_flags,
						      READ | REQ_META);
			if (err)
				ret = err;
		} else {
			unlock_page(page);
		}
	}

	if (bio) {
		err = submit_one_bio(READ | REQ_META, bio, mirror_num,
				     bio_flags);
		if (err)
			return err;
	}

	if (ret || wait != WAIT_COMPLETE)
		return ret;

	for (i = start_i; i < num_pages; i++) {
		page = extent_buffer_page(eb, i);
		wait_on_page_locked(page);
		if (!PageUptodate(page))
			ret = -EIO;
	}

	return ret;

unlock_exit:
	i = start_i;
	while (locked_pages > 0) {
		page = extent_buffer_page(eb, i);
		i++;
		unlock_page(page);
		locked_pages--;
	}
	return ret;
}

void read_extent_buffer(struct extent_buffer *eb, void *dstv,
			unsigned long start,
			unsigned long len)
{
	size_t cur;
	size_t offset;
	struct page *page;
	char *kaddr;
	char *dst = (char *)dstv;
	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;

	WARN_ON(start > eb->len);
	WARN_ON(start + len > eb->start + eb->len);

	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);

	while (len > 0) {
		page = extent_buffer_page(eb, i);

		cur = min(len, (PAGE_CACHE_SIZE - offset));
		kaddr = page_address(page);
		memcpy(dst, kaddr + offset, cur);

		dst += cur;
		len -= cur;
		offset = 0;
		i++;
	}
}

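/*
 * Map a byte range of the extent buffer directly.  The range must not cross
 * a page boundary or run past the end of the buffer; otherwise -EINVAL is
 * returned.
 */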
int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
			      unsigned long min_len, char **map,
			      unsigned long *map_start,
			      unsigned long *map_len)
{
	size_t offset = start & (PAGE_CACHE_SIZE - 1);
	char *kaddr;
	struct page *p;
	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
	unsigned long end_i = (start_offset + start + min_len - 1) >>
		PAGE_CACHE_SHIFT;

	if (i != end_i)
		return -EINVAL;

	if (i == 0) {
		offset = start_offset;
		*map_start = 0;
	} else {
		offset = 0;
		*map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
	}

	if (start + min_len > eb->len) {
		WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
		     "wanted %lu %lu\n", (unsigned long long)eb->start,
		     eb->len, start, min_len);
		return -EINVAL;
	}

	p = extent_buffer_page(eb, i);
	kaddr = page_address(p);
	*map = kaddr + offset;
	*map_len = PAGE_CACHE_SIZE - offset;
	return 0;
}

int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
			 unsigned long start,
			 unsigned long len)
{
	size_t cur;
	size_t offset;
	struct page *page;
	char *kaddr;
	char *ptr = (char *)ptrv;
	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
	int ret = 0;

	WARN_ON(start > eb->len);
	WARN_ON(start + len > eb->start + eb->len);

	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);

	while (len > 0) {
		page = extent_buffer_page(eb, i);

		cur = min(len, (PAGE_CACHE_SIZE - offset));

		kaddr = page_address(page);
		ret = memcmp(ptr, kaddr + offset, cur);
		if (ret)
			break;

		ptr += cur;
		len -= cur;
		offset = 0;
		i++;
	}
	return ret;
}

void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
			 unsigned long start, unsigned long len)
{
	size_t cur;
	size_t offset;
	struct page *page;
	char *kaddr;
	char *src = (char *)srcv;
	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;

	WARN_ON(start > eb->len);
	WARN_ON(start + len > eb->start + eb->len);

	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);

	while (len > 0) {
		page = extent_buffer_page(eb, i);
		WARN_ON(!PageUptodate(page));

		cur = min(len, PAGE_CACHE_SIZE - offset);
		kaddr = page_address(page);
		memcpy(kaddr + offset, src, cur);

		src += cur;
		len -= cur;
		offset = 0;
		i++;
	}
}

void memset_extent_buffer(struct extent_buffer *eb, char c,
			  unsigned long start, unsigned long len)
{
	size_t cur;
	size_t offset;
	struct page *page;
	char *kaddr;
	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;

	WARN_ON(start > eb->len);
	WARN_ON(start + len > eb->start + eb->len);

	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);

	while (len > 0) {
		page = extent_buffer_page(eb, i);
		WARN_ON(!PageUptodate(page));

		cur = min(len, PAGE_CACHE_SIZE - offset);
		kaddr = page_address(page);
		memset(kaddr + offset, c, cur);

		len -= cur;
		offset = 0;
		i++;
	}
}

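/*
 * Copy len bytes starting at src_offset in the src extent buffer into the
 * dst extent buffer at dst_offset, one page of dst at a time.
 */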
void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
			unsigned long dst_offset, unsigned long src_offset,
			unsigned long len)
{
	u64 dst_len = dst->len;
	size_t cur;
	size_t offset;
	struct page *page;
	char *kaddr;
	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
	unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;

	WARN_ON(src->len != dst_len);

	offset = (start_offset + dst_offset) &
		((unsigned long)PAGE_CACHE_SIZE - 1);

	while (len > 0) {
		page = extent_buffer_page(dst, i);
		WARN_ON(!PageUptodate(page));

		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));

		kaddr = page_address(page);
		read_extent_buffer(src, kaddr + offset, src_offset, cur);

		src_offset += cur;
		len -= cur;
		offset = 0;
		i++;
	}
}

static void move_pages(struct page *dst_page, struct page *src_page,
		       unsigned long dst_off, unsigned long src_off,
		       unsigned long len)
{
	char *dst_kaddr = page_address(dst_page);
	if (dst_page == src_page) {
		memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
	} else {
		char *src_kaddr = page_address(src_page);
		char *p = dst_kaddr + dst_off + len;
		char *s = src_kaddr + src_off + len;

		while (len--)
			*--p = *--s;
	}
}

static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
{
	unsigned long distance = (src > dst) ? src - dst : dst - src;
	return distance < len;
}

static void copy_pages(struct page *dst_page, struct page *src_page,
		       unsigned long dst_off, unsigned long src_off,
		       unsigned long len)
{
	char *dst_kaddr = page_address(dst_page);
	char *src_kaddr;
	int must_memmove = 0;

	if (dst_page != src_page) {
		src_kaddr = page_address(src_page);
	} else {
		src_kaddr = dst_kaddr;
		if (areas_overlap(src_off, dst_off, len))
			must_memmove = 1;
	}

	if (must_memmove)
		memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
	else
		memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
}

void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
			  unsigned long src_offset, unsigned long len)
{
	size_t cur;
	size_t dst_off_in_page;
	size_t src_off_in_page;
	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
	unsigned long dst_i;
	unsigned long src_i;

	if (src_offset + len > dst->len) {
		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
		       "len %lu dst len %lu\n", src_offset, len, dst->len);
		BUG_ON(1);
	}
	if (dst_offset + len > dst->len) {
		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
		       "len %lu dst len %lu\n", dst_offset, len, dst->len);
		BUG_ON(1);
	}

	while (len > 0) {
		dst_off_in_page = (start_offset + dst_offset) &
			((unsigned long)PAGE_CACHE_SIZE - 1);
		src_off_in_page = (start_offset + src_offset) &
			((unsigned long)PAGE_CACHE_SIZE - 1);

		dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
		src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;

		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
					       src_off_in_page));
		cur = min_t(unsigned long, cur,
			    (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));

		copy_pages(extent_buffer_page(dst, dst_i),
			   extent_buffer_page(dst, src_i),
			   dst_off_in_page, src_off_in_page, cur);

		src_offset += cur;
		dst_offset += cur;
		len -= cur;
	}
}

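/*
 * memmove-style copy within a single extent buffer: when the destination is
 * above the source the overlapping range is copied from the end backwards,
 * otherwise this falls through to memcpy_extent_buffer().
 */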
void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
			   unsigned long src_offset, unsigned long len)
{
	size_t cur;
	size_t dst_off_in_page;
	size_t src_off_in_page;
	unsigned long dst_end = dst_offset + len - 1;
	unsigned long src_end = src_offset + len - 1;
	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
	unsigned long dst_i;
	unsigned long src_i;

	if (src_offset + len > dst->len) {
		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
		       "len %lu len %lu\n", src_offset, len, dst->len);
		BUG_ON(1);
	}
	if (dst_offset + len > dst->len) {
		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
		       "len %lu len %lu\n", dst_offset, len, dst->len);
		BUG_ON(1);
	}
	if (dst_offset < src_offset) {
		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
		return;
	}
	while (len > 0) {
		dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
		src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;

		dst_off_in_page = (start_offset + dst_end) &
			((unsigned long)PAGE_CACHE_SIZE - 1);
		src_off_in_page = (start_offset + src_end) &
			((unsigned long)PAGE_CACHE_SIZE - 1);

		cur = min_t(unsigned long, len, src_off_in_page + 1);
		cur = min(cur, dst_off_in_page + 1);
		move_pages(extent_buffer_page(dst, dst_i),
			   extent_buffer_page(dst, src_i),
			   dst_off_in_page - cur + 1,
			   src_off_in_page - cur + 1, cur);

		dst_end -= cur;
		src_end -= cur;
		len -= cur;
	}
}

int try_release_extent_buffer(struct page *page)
{
	struct extent_buffer *eb;

	/*
	 * We need to make sure nobody is attaching this page to an eb right
	 * now.
	 */
	spin_lock(&page->mapping->private_lock);
	if (!PagePrivate(page)) {
		spin_unlock(&page->mapping->private_lock);
		return 1;
	}

	eb = (struct extent_buffer *)page->private;
	BUG_ON(!eb);

	/*
	 * This is a little awful but should be ok, we need to make sure that
	 * the eb doesn't disappear out from under us while we're looking at
	 * this page.
	 */
	spin_lock(&eb->refs_lock);
	if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
		spin_unlock(&eb->refs_lock);
		spin_unlock(&page->mapping->private_lock);
		return 0;
	}
	spin_unlock(&page->mapping->private_lock);

	/*
	 * If tree ref isn't set then we know the ref on this eb is a real ref,
	 * so just return, this page will likely be freed soon anyway.
	 */
	if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
		spin_unlock(&eb->refs_lock);
		return 0;
	}

	return release_extent_buffer(eb);
}