#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/cleancache.h>
#include "extent_io.h"
#include "extent_map.h"
#include "compat.h"
#include "ctree.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "check-integrity.h"
#include "locking.h"
#include "rcu-string.h"

static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;

#ifdef CONFIG_BTRFS_DEBUG
static LIST_HEAD(buffers);
static LIST_HEAD(states);

static DEFINE_SPINLOCK(leak_lock);

static inline
void btrfs_leak_debug_add(struct list_head *new, struct list_head *head)
{
	unsigned long flags;

	spin_lock_irqsave(&leak_lock, flags);
	list_add(new, head);
	spin_unlock_irqrestore(&leak_lock, flags);
}

static inline
void btrfs_leak_debug_del(struct list_head *entry)
{
	unsigned long flags;

	spin_lock_irqsave(&leak_lock, flags);
	list_del(entry);
	spin_unlock_irqrestore(&leak_lock, flags);
}

static inline
void btrfs_leak_debug_check(void)
{
	struct extent_state *state;
	struct extent_buffer *eb;

	while (!list_empty(&states)) {
		state = list_entry(states.next, struct extent_state, leak_list);
		printk(KERN_ERR "btrfs state leak: start %llu end %llu "
		       "state %lu in tree %p refs %d\n",
		       (unsigned long long)state->start,
		       (unsigned long long)state->end,
		       state->state, state->tree, atomic_read(&state->refs));
		list_del(&state->leak_list);
		kmem_cache_free(extent_state_cache, state);
	}

	while (!list_empty(&buffers)) {
		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
		printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
		       "refs %d\n", (unsigned long long)eb->start,
		       eb->len, atomic_read(&eb->refs));
		list_del(&eb->leak_list);
		kmem_cache_free(extent_buffer_cache, eb);
	}
}
#else
#define btrfs_leak_debug_add(new, head)	do {} while (0)
#define btrfs_leak_debug_del(entry)	do {} while (0)
#define btrfs_leak_debug_check()	do {} while (0)
#endif

#define BUFFER_LRU_MAX 64

struct tree_entry {
	u64 start;
	u64 end;
	struct rb_node rb_node;
};

struct extent_page_data {
	struct bio *bio;
	struct extent_io_tree *tree;
	get_extent_t *get_extent;
	unsigned long bio_flags;

	/* tells writepage not to lock the state bits for this range
	 * it still does the unlocking
	 */
	unsigned int extent_locked:1;

	/* tells the submit_bio code to use a WRITE_SYNC */
	unsigned int sync_io:1;
};

static noinline void flush_write_bio(void *data);
static inline struct btrfs_fs_info *
tree_fs_info(struct extent_io_tree *tree)
{
	return btrfs_sb(tree->mapping->host->i_sb);
}

int __init extent_io_init(void)
{
	extent_state_cache = kmem_cache_create("btrfs_extent_state",
			sizeof(struct extent_state), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
	if (!extent_state_cache)
		return -ENOMEM;

	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
			sizeof(struct extent_buffer), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
	if (!extent_buffer_cache)
		goto free_state_cache;
	return 0;

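	/*
	 * We only reach the label below when the extent_buffer cache could
	 * not be created; tear down the extent_state cache before
	 * reporting -ENOMEM.
	 */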
free_state_cache:
	kmem_cache_destroy(extent_state_cache);
	return -ENOMEM;
}

void extent_io_exit(void)
{
	btrfs_leak_debug_check();

	/*
	 * Make sure all delayed rcu free are flushed before we
	 * destroy caches.
	 */
	rcu_barrier();
	if (extent_state_cache)
		kmem_cache_destroy(extent_state_cache);
	if (extent_buffer_cache)
		kmem_cache_destroy(extent_buffer_cache);
}

void extent_io_tree_init(struct extent_io_tree *tree,
			 struct address_space *mapping)
{
	tree->state = RB_ROOT;
	INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
	tree->ops = NULL;
	tree->dirty_bytes = 0;
	spin_lock_init(&tree->lock);
	spin_lock_init(&tree->buffer_lock);
	tree->mapping = mapping;
}

static struct extent_state *alloc_extent_state(gfp_t mask)
{
	struct extent_state *state;

	state = kmem_cache_alloc(extent_state_cache, mask);
	if (!state)
		return state;
	state->state = 0;
	state->private = 0;
	state->tree = NULL;
	btrfs_leak_debug_add(&state->leak_list, &states);
	atomic_set(&state->refs, 1);
	init_waitqueue_head(&state->wq);
	trace_alloc_extent_state(state, mask, _RET_IP_);
	return state;
}

void free_extent_state(struct extent_state *state)
{
	if (!state)
		return;
	if (atomic_dec_and_test(&state->refs)) {
		WARN_ON(state->tree);
		btrfs_leak_debug_del(&state->leak_list);
		trace_free_extent_state(state, _RET_IP_);
		kmem_cache_free(extent_state_cache, state);
	}
}

static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
				   struct rb_node *node)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct tree_entry *entry;

	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct tree_entry, rb_node);

		if (offset < entry->start)
			p = &(*p)->rb_left;
		else if (offset > entry->end)
			p = &(*p)->rb_right;
		else
			return parent;
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
				      struct rb_node **prev_ret,
				      struct rb_node **next_ret)
{
	struct rb_root *root = &tree->state;
	struct rb_node *n = root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *orig_prev = NULL;
	struct tree_entry *entry;
	struct tree_entry *prev_entry = NULL;

	while (n) {
		entry = rb_entry(n, struct tree_entry, rb_node);
		prev = n;
		prev_entry = entry;

		if (offset < entry->start)
			n = n->rb_left;
		else if (offset > entry->end)
			n = n->rb_right;
		else
			return n;
	}

	if (prev_ret) {
		orig_prev = prev;
		while (prev && offset > prev_entry->end) {
			prev = rb_next(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*prev_ret = prev;
		prev = orig_prev;
	}

	if (next_ret) {
		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		while (prev && offset < prev_entry->start) {
			prev = rb_prev(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*next_ret = prev;
	}
	return NULL;
}

static inline struct rb_node *tree_search(struct extent_io_tree *tree,
					  u64 offset)
{
	struct rb_node *prev = NULL;
	struct rb_node *ret;

	ret = __etree_search(tree, offset, &prev, NULL);
	if (!ret)
		return prev;
	return ret;
}

static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
		     struct extent_state *other)
{
	if (tree->ops && tree->ops->merge_extent_hook)
		tree->ops->merge_extent_hook(tree->mapping->host, new,
					     other);
}

/*
 * utility function to look for merge candidates inside a given range.
 * Any extents with matching state are merged together into a single
 * extent in the tree.  Extents with EXTENT_IOBITS in their state field
 * are not merged because the end_io handlers need to be able to do
 * operations on them without sleeping (or doing allocations/splits).
 *
 * This should be called with the tree lock held.
 */
static void merge_state(struct extent_io_tree *tree,
			struct extent_state *state)
{
	struct extent_state *other;
	struct rb_node *other_node;

	if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
		return;

	other_node = rb_prev(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->end == state->start - 1 &&
		    other->state == state->state) {
			merge_cb(tree, state, other);
			state->start = other->start;
			other->tree = NULL;
			rb_erase(&other->rb_node, &tree->state);
			free_extent_state(other);
		}
	}
	other_node = rb_next(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->start == state->end + 1 &&
		    other->state == state->state) {
			merge_cb(tree, state, other);
			state->end = other->end;
			other->tree = NULL;
			rb_erase(&other->rb_node, &tree->state);
			free_extent_state(other);
		}
	}
}

static void set_state_cb(struct extent_io_tree *tree,
			 struct extent_state *state, unsigned long *bits)
{
	if (tree->ops && tree->ops->set_bit_hook)
		tree->ops->set_bit_hook(tree->mapping->host, state, bits);
}

static void clear_state_cb(struct extent_io_tree *tree,
			   struct extent_state *state, unsigned long *bits)
{
	if (tree->ops && tree->ops->clear_bit_hook)
		tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
}

static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state, unsigned long *bits);

/*
 * insert an extent_state struct into the tree.  'bits' are set on the
 * struct before it is inserted.
 *
 * This may return -EEXIST if the extent is already there, in which case the
 * state struct is freed.
 *
 * The tree lock is not taken internally.  This is a utility function and
 * probably isn't what you want to call (see set/clear_extent_bit).
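 *
 * Note that tree_insert() keys the rb-tree by the state's end offset, so a
 * colliding insert means 'end' already falls inside an existing extent.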
350 */ 351 static int insert_state(struct extent_io_tree *tree, 352 struct extent_state *state, u64 start, u64 end, 353 unsigned long *bits) 354 { 355 struct rb_node *node; 356 357 if (end < start) 358 WARN(1, KERN_ERR "btrfs end < start %llu %llu\n", 359 (unsigned long long)end, 360 (unsigned long long)start); 361 state->start = start; 362 state->end = end; 363 364 set_state_bits(tree, state, bits); 365 366 node = tree_insert(&tree->state, end, &state->rb_node); 367 if (node) { 368 struct extent_state *found; 369 found = rb_entry(node, struct extent_state, rb_node); 370 printk(KERN_ERR "btrfs found node %llu %llu on insert of " 371 "%llu %llu\n", (unsigned long long)found->start, 372 (unsigned long long)found->end, 373 (unsigned long long)start, (unsigned long long)end); 374 return -EEXIST; 375 } 376 state->tree = tree; 377 merge_state(tree, state); 378 return 0; 379 } 380 381 static void split_cb(struct extent_io_tree *tree, struct extent_state *orig, 382 u64 split) 383 { 384 if (tree->ops && tree->ops->split_extent_hook) 385 tree->ops->split_extent_hook(tree->mapping->host, orig, split); 386 } 387 388 /* 389 * split a given extent state struct in two, inserting the preallocated 390 * struct 'prealloc' as the newly created second half. 'split' indicates an 391 * offset inside 'orig' where it should be split. 392 * 393 * Before calling, 394 * the tree has 'orig' at [orig->start, orig->end]. After calling, there 395 * are two extent state structs in the tree: 396 * prealloc: [orig->start, split - 1] 397 * orig: [ split, orig->end ] 398 * 399 * The tree locks are not taken by this function. They need to be held 400 * by the caller. 401 */ 402 static int split_state(struct extent_io_tree *tree, struct extent_state *orig, 403 struct extent_state *prealloc, u64 split) 404 { 405 struct rb_node *node; 406 407 split_cb(tree, orig, split); 408 409 prealloc->start = orig->start; 410 prealloc->end = split - 1; 411 prealloc->state = orig->state; 412 orig->start = split; 413 414 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); 415 if (node) { 416 free_extent_state(prealloc); 417 return -EEXIST; 418 } 419 prealloc->tree = tree; 420 return 0; 421 } 422 423 static struct extent_state *next_state(struct extent_state *state) 424 { 425 struct rb_node *next = rb_next(&state->rb_node); 426 if (next) 427 return rb_entry(next, struct extent_state, rb_node); 428 else 429 return NULL; 430 } 431 432 /* 433 * utility function to clear some bits in an extent state struct. 434 * it will optionally wake up any one waiting on this state (wake == 1). 
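 * It returns the next extent_state in the tree (or NULL), so callers can
 * keep walking the range they are clearing.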
435 * 436 * If no bits are set on the state struct after clearing things, the 437 * struct is freed and removed from the tree 438 */ 439 static struct extent_state *clear_state_bit(struct extent_io_tree *tree, 440 struct extent_state *state, 441 unsigned long *bits, int wake) 442 { 443 struct extent_state *next; 444 unsigned long bits_to_clear = *bits & ~EXTENT_CTLBITS; 445 446 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 447 u64 range = state->end - state->start + 1; 448 WARN_ON(range > tree->dirty_bytes); 449 tree->dirty_bytes -= range; 450 } 451 clear_state_cb(tree, state, bits); 452 state->state &= ~bits_to_clear; 453 if (wake) 454 wake_up(&state->wq); 455 if (state->state == 0) { 456 next = next_state(state); 457 if (state->tree) { 458 rb_erase(&state->rb_node, &tree->state); 459 state->tree = NULL; 460 free_extent_state(state); 461 } else { 462 WARN_ON(1); 463 } 464 } else { 465 merge_state(tree, state); 466 next = next_state(state); 467 } 468 return next; 469 } 470 471 static struct extent_state * 472 alloc_extent_state_atomic(struct extent_state *prealloc) 473 { 474 if (!prealloc) 475 prealloc = alloc_extent_state(GFP_ATOMIC); 476 477 return prealloc; 478 } 479 480 static void extent_io_tree_panic(struct extent_io_tree *tree, int err) 481 { 482 btrfs_panic(tree_fs_info(tree), err, "Locking error: " 483 "Extent tree was modified by another " 484 "thread while locked."); 485 } 486 487 /* 488 * clear some bits on a range in the tree. This may require splitting 489 * or inserting elements in the tree, so the gfp mask is used to 490 * indicate which allocations or sleeping are allowed. 491 * 492 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove 493 * the given range from the tree regardless of state (ie for truncate). 494 * 495 * the range [start, end] is inclusive. 496 * 497 * This takes the tree lock, and returns 0 on success and < 0 on error. 
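 *
 * A minimal usage sketch (this is what the clear_extent_bits() wrapper
 * further down does):
 *
 *	clear_extent_bit(tree, start, end, EXTENT_DELALLOC, 0, 0, NULL,
 *			 GFP_NOFS);
 *
 * i.e. clear EXTENT_DELALLOC over [start, end] without waking waiters and
 * without deleting unrelated state bits.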
498 */ 499 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 500 unsigned long bits, int wake, int delete, 501 struct extent_state **cached_state, 502 gfp_t mask) 503 { 504 struct extent_state *state; 505 struct extent_state *cached; 506 struct extent_state *prealloc = NULL; 507 struct rb_node *node; 508 u64 last_end; 509 int err; 510 int clear = 0; 511 512 if (delete) 513 bits |= ~EXTENT_CTLBITS; 514 bits |= EXTENT_FIRST_DELALLOC; 515 516 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 517 clear = 1; 518 again: 519 if (!prealloc && (mask & __GFP_WAIT)) { 520 prealloc = alloc_extent_state(mask); 521 if (!prealloc) 522 return -ENOMEM; 523 } 524 525 spin_lock(&tree->lock); 526 if (cached_state) { 527 cached = *cached_state; 528 529 if (clear) { 530 *cached_state = NULL; 531 cached_state = NULL; 532 } 533 534 if (cached && cached->tree && cached->start <= start && 535 cached->end > start) { 536 if (clear) 537 atomic_dec(&cached->refs); 538 state = cached; 539 goto hit_next; 540 } 541 if (clear) 542 free_extent_state(cached); 543 } 544 /* 545 * this search will find the extents that end after 546 * our range starts 547 */ 548 node = tree_search(tree, start); 549 if (!node) 550 goto out; 551 state = rb_entry(node, struct extent_state, rb_node); 552 hit_next: 553 if (state->start > end) 554 goto out; 555 WARN_ON(state->end < start); 556 last_end = state->end; 557 558 /* the state doesn't have the wanted bits, go ahead */ 559 if (!(state->state & bits)) { 560 state = next_state(state); 561 goto next; 562 } 563 564 /* 565 * | ---- desired range ---- | 566 * | state | or 567 * | ------------- state -------------- | 568 * 569 * We need to split the extent we found, and may flip 570 * bits on second half. 571 * 572 * If the extent we found extends past our range, we 573 * just split and search again. It'll get split again 574 * the next time though. 575 * 576 * If the extent we found is inside our range, we clear 577 * the desired bit on it. 
578 */ 579 580 if (state->start < start) { 581 prealloc = alloc_extent_state_atomic(prealloc); 582 BUG_ON(!prealloc); 583 err = split_state(tree, state, prealloc, start); 584 if (err) 585 extent_io_tree_panic(tree, err); 586 587 prealloc = NULL; 588 if (err) 589 goto out; 590 if (state->end <= end) { 591 state = clear_state_bit(tree, state, &bits, wake); 592 goto next; 593 } 594 goto search_again; 595 } 596 /* 597 * | ---- desired range ---- | 598 * | state | 599 * We need to split the extent, and clear the bit 600 * on the first half 601 */ 602 if (state->start <= end && state->end > end) { 603 prealloc = alloc_extent_state_atomic(prealloc); 604 BUG_ON(!prealloc); 605 err = split_state(tree, state, prealloc, end + 1); 606 if (err) 607 extent_io_tree_panic(tree, err); 608 609 if (wake) 610 wake_up(&state->wq); 611 612 clear_state_bit(tree, prealloc, &bits, wake); 613 614 prealloc = NULL; 615 goto out; 616 } 617 618 state = clear_state_bit(tree, state, &bits, wake); 619 next: 620 if (last_end == (u64)-1) 621 goto out; 622 start = last_end + 1; 623 if (start <= end && state && !need_resched()) 624 goto hit_next; 625 goto search_again; 626 627 out: 628 spin_unlock(&tree->lock); 629 if (prealloc) 630 free_extent_state(prealloc); 631 632 return 0; 633 634 search_again: 635 if (start > end) 636 goto out; 637 spin_unlock(&tree->lock); 638 if (mask & __GFP_WAIT) 639 cond_resched(); 640 goto again; 641 } 642 643 static void wait_on_state(struct extent_io_tree *tree, 644 struct extent_state *state) 645 __releases(tree->lock) 646 __acquires(tree->lock) 647 { 648 DEFINE_WAIT(wait); 649 prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); 650 spin_unlock(&tree->lock); 651 schedule(); 652 spin_lock(&tree->lock); 653 finish_wait(&state->wq, &wait); 654 } 655 656 /* 657 * waits for one or more bits to clear on a range in the state tree. 658 * The range [start, end] is inclusive. 
659 * The tree lock is taken by this function 660 */ 661 static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 662 unsigned long bits) 663 { 664 struct extent_state *state; 665 struct rb_node *node; 666 667 spin_lock(&tree->lock); 668 again: 669 while (1) { 670 /* 671 * this search will find all the extents that end after 672 * our range starts 673 */ 674 node = tree_search(tree, start); 675 if (!node) 676 break; 677 678 state = rb_entry(node, struct extent_state, rb_node); 679 680 if (state->start > end) 681 goto out; 682 683 if (state->state & bits) { 684 start = state->start; 685 atomic_inc(&state->refs); 686 wait_on_state(tree, state); 687 free_extent_state(state); 688 goto again; 689 } 690 start = state->end + 1; 691 692 if (start > end) 693 break; 694 695 cond_resched_lock(&tree->lock); 696 } 697 out: 698 spin_unlock(&tree->lock); 699 } 700 701 static void set_state_bits(struct extent_io_tree *tree, 702 struct extent_state *state, 703 unsigned long *bits) 704 { 705 unsigned long bits_to_set = *bits & ~EXTENT_CTLBITS; 706 707 set_state_cb(tree, state, bits); 708 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { 709 u64 range = state->end - state->start + 1; 710 tree->dirty_bytes += range; 711 } 712 state->state |= bits_to_set; 713 } 714 715 static void cache_state(struct extent_state *state, 716 struct extent_state **cached_ptr) 717 { 718 if (cached_ptr && !(*cached_ptr)) { 719 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) { 720 *cached_ptr = state; 721 atomic_inc(&state->refs); 722 } 723 } 724 } 725 726 static void uncache_state(struct extent_state **cached_ptr) 727 { 728 if (cached_ptr && (*cached_ptr)) { 729 struct extent_state *state = *cached_ptr; 730 *cached_ptr = NULL; 731 free_extent_state(state); 732 } 733 } 734 735 /* 736 * set some bits on a range in the tree. This may require allocations or 737 * sleeping, so the gfp mask is used to indicate what is allowed. 738 * 739 * If any of the exclusive bits are set, this will fail with -EEXIST if some 740 * part of the range already has the desired bits set. The start of the 741 * existing range is returned in failed_start in this case. 742 * 743 * [start, end] is inclusive This takes the tree lock. 744 */ 745 746 static int __must_check 747 __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 748 unsigned long bits, unsigned long exclusive_bits, 749 u64 *failed_start, struct extent_state **cached_state, 750 gfp_t mask) 751 { 752 struct extent_state *state; 753 struct extent_state *prealloc = NULL; 754 struct rb_node *node; 755 int err = 0; 756 u64 last_start; 757 u64 last_end; 758 759 bits |= EXTENT_FIRST_DELALLOC; 760 again: 761 if (!prealloc && (mask & __GFP_WAIT)) { 762 prealloc = alloc_extent_state(mask); 763 BUG_ON(!prealloc); 764 } 765 766 spin_lock(&tree->lock); 767 if (cached_state && *cached_state) { 768 state = *cached_state; 769 if (state->start <= start && state->end > start && 770 state->tree) { 771 node = &state->rb_node; 772 goto hit_next; 773 } 774 } 775 /* 776 * this search will find all the extents that end after 777 * our range starts. 
778 */ 779 node = tree_search(tree, start); 780 if (!node) { 781 prealloc = alloc_extent_state_atomic(prealloc); 782 BUG_ON(!prealloc); 783 err = insert_state(tree, prealloc, start, end, &bits); 784 if (err) 785 extent_io_tree_panic(tree, err); 786 787 prealloc = NULL; 788 goto out; 789 } 790 state = rb_entry(node, struct extent_state, rb_node); 791 hit_next: 792 last_start = state->start; 793 last_end = state->end; 794 795 /* 796 * | ---- desired range ---- | 797 * | state | 798 * 799 * Just lock what we found and keep going 800 */ 801 if (state->start == start && state->end <= end) { 802 if (state->state & exclusive_bits) { 803 *failed_start = state->start; 804 err = -EEXIST; 805 goto out; 806 } 807 808 set_state_bits(tree, state, &bits); 809 cache_state(state, cached_state); 810 merge_state(tree, state); 811 if (last_end == (u64)-1) 812 goto out; 813 start = last_end + 1; 814 state = next_state(state); 815 if (start < end && state && state->start == start && 816 !need_resched()) 817 goto hit_next; 818 goto search_again; 819 } 820 821 /* 822 * | ---- desired range ---- | 823 * | state | 824 * or 825 * | ------------- state -------------- | 826 * 827 * We need to split the extent we found, and may flip bits on 828 * second half. 829 * 830 * If the extent we found extends past our 831 * range, we just split and search again. It'll get split 832 * again the next time though. 833 * 834 * If the extent we found is inside our range, we set the 835 * desired bit on it. 836 */ 837 if (state->start < start) { 838 if (state->state & exclusive_bits) { 839 *failed_start = start; 840 err = -EEXIST; 841 goto out; 842 } 843 844 prealloc = alloc_extent_state_atomic(prealloc); 845 BUG_ON(!prealloc); 846 err = split_state(tree, state, prealloc, start); 847 if (err) 848 extent_io_tree_panic(tree, err); 849 850 prealloc = NULL; 851 if (err) 852 goto out; 853 if (state->end <= end) { 854 set_state_bits(tree, state, &bits); 855 cache_state(state, cached_state); 856 merge_state(tree, state); 857 if (last_end == (u64)-1) 858 goto out; 859 start = last_end + 1; 860 state = next_state(state); 861 if (start < end && state && state->start == start && 862 !need_resched()) 863 goto hit_next; 864 } 865 goto search_again; 866 } 867 /* 868 * | ---- desired range ---- | 869 * | state | or | state | 870 * 871 * There's a hole, we need to insert something in it and 872 * ignore the extent we found. 873 */ 874 if (state->start > start) { 875 u64 this_end; 876 if (end < last_start) 877 this_end = end; 878 else 879 this_end = last_start - 1; 880 881 prealloc = alloc_extent_state_atomic(prealloc); 882 BUG_ON(!prealloc); 883 884 /* 885 * Avoid to free 'prealloc' if it can be merged with 886 * the later extent. 
887 */ 888 err = insert_state(tree, prealloc, start, this_end, 889 &bits); 890 if (err) 891 extent_io_tree_panic(tree, err); 892 893 cache_state(prealloc, cached_state); 894 prealloc = NULL; 895 start = this_end + 1; 896 goto search_again; 897 } 898 /* 899 * | ---- desired range ---- | 900 * | state | 901 * We need to split the extent, and set the bit 902 * on the first half 903 */ 904 if (state->start <= end && state->end > end) { 905 if (state->state & exclusive_bits) { 906 *failed_start = start; 907 err = -EEXIST; 908 goto out; 909 } 910 911 prealloc = alloc_extent_state_atomic(prealloc); 912 BUG_ON(!prealloc); 913 err = split_state(tree, state, prealloc, end + 1); 914 if (err) 915 extent_io_tree_panic(tree, err); 916 917 set_state_bits(tree, prealloc, &bits); 918 cache_state(prealloc, cached_state); 919 merge_state(tree, prealloc); 920 prealloc = NULL; 921 goto out; 922 } 923 924 goto search_again; 925 926 out: 927 spin_unlock(&tree->lock); 928 if (prealloc) 929 free_extent_state(prealloc); 930 931 return err; 932 933 search_again: 934 if (start > end) 935 goto out; 936 spin_unlock(&tree->lock); 937 if (mask & __GFP_WAIT) 938 cond_resched(); 939 goto again; 940 } 941 942 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 943 unsigned long bits, u64 * failed_start, 944 struct extent_state **cached_state, gfp_t mask) 945 { 946 return __set_extent_bit(tree, start, end, bits, 0, failed_start, 947 cached_state, mask); 948 } 949 950 951 /** 952 * convert_extent_bit - convert all bits in a given range from one bit to 953 * another 954 * @tree: the io tree to search 955 * @start: the start offset in bytes 956 * @end: the end offset in bytes (inclusive) 957 * @bits: the bits to set in this range 958 * @clear_bits: the bits to clear in this range 959 * @cached_state: state that we're going to cache 960 * @mask: the allocation mask 961 * 962 * This will go through and set bits for the given range. If any states exist 963 * already in this range they are set with the given bit and cleared of the 964 * clear_bits. This is only meant to be used by things that are mergeable, ie 965 * converting from say DELALLOC to DIRTY. This is not meant to be used with 966 * boundary bits like LOCK. 967 */ 968 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 969 unsigned long bits, unsigned long clear_bits, 970 struct extent_state **cached_state, gfp_t mask) 971 { 972 struct extent_state *state; 973 struct extent_state *prealloc = NULL; 974 struct rb_node *node; 975 int err = 0; 976 u64 last_start; 977 u64 last_end; 978 979 again: 980 if (!prealloc && (mask & __GFP_WAIT)) { 981 prealloc = alloc_extent_state(mask); 982 if (!prealloc) 983 return -ENOMEM; 984 } 985 986 spin_lock(&tree->lock); 987 if (cached_state && *cached_state) { 988 state = *cached_state; 989 if (state->start <= start && state->end > start && 990 state->tree) { 991 node = &state->rb_node; 992 goto hit_next; 993 } 994 } 995 996 /* 997 * this search will find all the extents that end after 998 * our range starts. 
999 */ 1000 node = tree_search(tree, start); 1001 if (!node) { 1002 prealloc = alloc_extent_state_atomic(prealloc); 1003 if (!prealloc) { 1004 err = -ENOMEM; 1005 goto out; 1006 } 1007 err = insert_state(tree, prealloc, start, end, &bits); 1008 prealloc = NULL; 1009 if (err) 1010 extent_io_tree_panic(tree, err); 1011 goto out; 1012 } 1013 state = rb_entry(node, struct extent_state, rb_node); 1014 hit_next: 1015 last_start = state->start; 1016 last_end = state->end; 1017 1018 /* 1019 * | ---- desired range ---- | 1020 * | state | 1021 * 1022 * Just lock what we found and keep going 1023 */ 1024 if (state->start == start && state->end <= end) { 1025 set_state_bits(tree, state, &bits); 1026 cache_state(state, cached_state); 1027 state = clear_state_bit(tree, state, &clear_bits, 0); 1028 if (last_end == (u64)-1) 1029 goto out; 1030 start = last_end + 1; 1031 if (start < end && state && state->start == start && 1032 !need_resched()) 1033 goto hit_next; 1034 goto search_again; 1035 } 1036 1037 /* 1038 * | ---- desired range ---- | 1039 * | state | 1040 * or 1041 * | ------------- state -------------- | 1042 * 1043 * We need to split the extent we found, and may flip bits on 1044 * second half. 1045 * 1046 * If the extent we found extends past our 1047 * range, we just split and search again. It'll get split 1048 * again the next time though. 1049 * 1050 * If the extent we found is inside our range, we set the 1051 * desired bit on it. 1052 */ 1053 if (state->start < start) { 1054 prealloc = alloc_extent_state_atomic(prealloc); 1055 if (!prealloc) { 1056 err = -ENOMEM; 1057 goto out; 1058 } 1059 err = split_state(tree, state, prealloc, start); 1060 if (err) 1061 extent_io_tree_panic(tree, err); 1062 prealloc = NULL; 1063 if (err) 1064 goto out; 1065 if (state->end <= end) { 1066 set_state_bits(tree, state, &bits); 1067 cache_state(state, cached_state); 1068 state = clear_state_bit(tree, state, &clear_bits, 0); 1069 if (last_end == (u64)-1) 1070 goto out; 1071 start = last_end + 1; 1072 if (start < end && state && state->start == start && 1073 !need_resched()) 1074 goto hit_next; 1075 } 1076 goto search_again; 1077 } 1078 /* 1079 * | ---- desired range ---- | 1080 * | state | or | state | 1081 * 1082 * There's a hole, we need to insert something in it and 1083 * ignore the extent we found. 1084 */ 1085 if (state->start > start) { 1086 u64 this_end; 1087 if (end < last_start) 1088 this_end = end; 1089 else 1090 this_end = last_start - 1; 1091 1092 prealloc = alloc_extent_state_atomic(prealloc); 1093 if (!prealloc) { 1094 err = -ENOMEM; 1095 goto out; 1096 } 1097 1098 /* 1099 * Avoid to free 'prealloc' if it can be merged with 1100 * the later extent. 
1101 */ 1102 err = insert_state(tree, prealloc, start, this_end, 1103 &bits); 1104 if (err) 1105 extent_io_tree_panic(tree, err); 1106 cache_state(prealloc, cached_state); 1107 prealloc = NULL; 1108 start = this_end + 1; 1109 goto search_again; 1110 } 1111 /* 1112 * | ---- desired range ---- | 1113 * | state | 1114 * We need to split the extent, and set the bit 1115 * on the first half 1116 */ 1117 if (state->start <= end && state->end > end) { 1118 prealloc = alloc_extent_state_atomic(prealloc); 1119 if (!prealloc) { 1120 err = -ENOMEM; 1121 goto out; 1122 } 1123 1124 err = split_state(tree, state, prealloc, end + 1); 1125 if (err) 1126 extent_io_tree_panic(tree, err); 1127 1128 set_state_bits(tree, prealloc, &bits); 1129 cache_state(prealloc, cached_state); 1130 clear_state_bit(tree, prealloc, &clear_bits, 0); 1131 prealloc = NULL; 1132 goto out; 1133 } 1134 1135 goto search_again; 1136 1137 out: 1138 spin_unlock(&tree->lock); 1139 if (prealloc) 1140 free_extent_state(prealloc); 1141 1142 return err; 1143 1144 search_again: 1145 if (start > end) 1146 goto out; 1147 spin_unlock(&tree->lock); 1148 if (mask & __GFP_WAIT) 1149 cond_resched(); 1150 goto again; 1151 } 1152 1153 /* wrappers around set/clear extent bit */ 1154 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 1155 gfp_t mask) 1156 { 1157 return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, 1158 NULL, mask); 1159 } 1160 1161 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1162 unsigned long bits, gfp_t mask) 1163 { 1164 return set_extent_bit(tree, start, end, bits, NULL, 1165 NULL, mask); 1166 } 1167 1168 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1169 unsigned long bits, gfp_t mask) 1170 { 1171 return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); 1172 } 1173 1174 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 1175 struct extent_state **cached_state, gfp_t mask) 1176 { 1177 return set_extent_bit(tree, start, end, 1178 EXTENT_DELALLOC | EXTENT_UPTODATE, 1179 NULL, cached_state, mask); 1180 } 1181 1182 int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, 1183 struct extent_state **cached_state, gfp_t mask) 1184 { 1185 return set_extent_bit(tree, start, end, 1186 EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG, 1187 NULL, cached_state, mask); 1188 } 1189 1190 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 1191 gfp_t mask) 1192 { 1193 return clear_extent_bit(tree, start, end, 1194 EXTENT_DIRTY | EXTENT_DELALLOC | 1195 EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); 1196 } 1197 1198 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 1199 gfp_t mask) 1200 { 1201 return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, 1202 NULL, mask); 1203 } 1204 1205 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 1206 struct extent_state **cached_state, gfp_t mask) 1207 { 1208 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL, 1209 cached_state, mask); 1210 } 1211 1212 int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 1213 struct extent_state **cached_state, gfp_t mask) 1214 { 1215 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, 1216 cached_state, mask); 1217 } 1218 1219 /* 1220 * either insert or lock state struct between start and end use mask to tell 1221 * us if waiting is desired. 
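 *
 * A minimal usage sketch with the wrappers defined below:
 *
 *	lock_extent(tree, start, end);
 *	...operate on the byte range [start, end]...
 *	unlock_extent(tree, start, end);
 *
 * lock_extent_bits() retries on -EEXIST: it waits for EXTENT_LOCKED to
 * clear starting at failed_start and then tries again.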
1222 */ 1223 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1224 unsigned long bits, struct extent_state **cached_state) 1225 { 1226 int err; 1227 u64 failed_start; 1228 while (1) { 1229 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, 1230 EXTENT_LOCKED, &failed_start, 1231 cached_state, GFP_NOFS); 1232 if (err == -EEXIST) { 1233 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); 1234 start = failed_start; 1235 } else 1236 break; 1237 WARN_ON(start > end); 1238 } 1239 return err; 1240 } 1241 1242 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end) 1243 { 1244 return lock_extent_bits(tree, start, end, 0, NULL); 1245 } 1246 1247 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) 1248 { 1249 int err; 1250 u64 failed_start; 1251 1252 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, 1253 &failed_start, NULL, GFP_NOFS); 1254 if (err == -EEXIST) { 1255 if (failed_start > start) 1256 clear_extent_bit(tree, start, failed_start - 1, 1257 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); 1258 return 0; 1259 } 1260 return 1; 1261 } 1262 1263 int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, 1264 struct extent_state **cached, gfp_t mask) 1265 { 1266 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, 1267 mask); 1268 } 1269 1270 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) 1271 { 1272 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, 1273 GFP_NOFS); 1274 } 1275 1276 int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) 1277 { 1278 unsigned long index = start >> PAGE_CACHE_SHIFT; 1279 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1280 struct page *page; 1281 1282 while (index <= end_index) { 1283 page = find_get_page(inode->i_mapping, index); 1284 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1285 clear_page_dirty_for_io(page); 1286 page_cache_release(page); 1287 index++; 1288 } 1289 return 0; 1290 } 1291 1292 int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) 1293 { 1294 unsigned long index = start >> PAGE_CACHE_SHIFT; 1295 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1296 struct page *page; 1297 1298 while (index <= end_index) { 1299 page = find_get_page(inode->i_mapping, index); 1300 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1301 account_page_redirty(page); 1302 __set_page_dirty_nobuffers(page); 1303 page_cache_release(page); 1304 index++; 1305 } 1306 return 0; 1307 } 1308 1309 /* 1310 * helper function to set both pages and extents in the tree writeback 1311 */ 1312 static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) 1313 { 1314 unsigned long index = start >> PAGE_CACHE_SHIFT; 1315 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1316 struct page *page; 1317 1318 while (index <= end_index) { 1319 page = find_get_page(tree->mapping, index); 1320 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1321 set_page_writeback(page); 1322 page_cache_release(page); 1323 index++; 1324 } 1325 return 0; 1326 } 1327 1328 /* find the first state struct with 'bits' set after 'start', and 1329 * return it. tree->lock must be held. 
NULL will be returned if
 * nothing was found after 'start'.
 */
static struct extent_state *
find_first_extent_bit_state(struct extent_io_tree *tree,
			    u64 start, unsigned long bits)
{
	struct rb_node *node;
	struct extent_state *state;

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->end >= start && (state->state & bits))
			return state;

		node = rb_next(node);
		if (!node)
			break;
	}
out:
	return NULL;
}

/*
 * find the first offset in the io tree with 'bits' set. zero is
 * returned if we find something, and *start_ret and *end_ret are
 * set to reflect the state struct that was found.
 *
 * If nothing was found, 1 is returned. If found something, return 0.
 */
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
			  u64 *start_ret, u64 *end_ret, unsigned long bits,
			  struct extent_state **cached_state)
{
	struct extent_state *state;
	struct rb_node *n;
	int ret = 1;

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->end == start - 1 && state->tree) {
			n = rb_next(&state->rb_node);
			while (n) {
				state = rb_entry(n, struct extent_state,
						 rb_node);
				if (state->state & bits)
					goto got_it;
				n = rb_next(n);
			}
			free_extent_state(*cached_state);
			*cached_state = NULL;
			goto out;
		}
		free_extent_state(*cached_state);
		*cached_state = NULL;
	}

	state = find_first_extent_bit_state(tree, start, bits);
got_it:
	if (state) {
		cache_state(state, cached_state);
		*start_ret = state->start;
		*end_ret = state->end;
		ret = 0;
	}
out:
	spin_unlock(&tree->lock);
	return ret;
}

/*
 * find a contiguous range of bytes in the file marked as delalloc, not
 * more than 'max_bytes'.  start and end are used to return the range.
 *
 * 1 is returned if we find something, 0 if nothing was in the tree
 */
static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
					u64 *start, u64 *end, u64 max_bytes,
					struct extent_state **cached_state)
{
	struct rb_node *node;
	struct extent_state *state;
	u64 cur_start = *start;
	u64 found = 0;
	u64 total_bytes = 0;

	spin_lock(&tree->lock);

	/*
	 * this search will find all the extents that end after
	 * our range starts.
1429 */ 1430 node = tree_search(tree, cur_start); 1431 if (!node) { 1432 if (!found) 1433 *end = (u64)-1; 1434 goto out; 1435 } 1436 1437 while (1) { 1438 state = rb_entry(node, struct extent_state, rb_node); 1439 if (found && (state->start != cur_start || 1440 (state->state & EXTENT_BOUNDARY))) { 1441 goto out; 1442 } 1443 if (!(state->state & EXTENT_DELALLOC)) { 1444 if (!found) 1445 *end = state->end; 1446 goto out; 1447 } 1448 if (!found) { 1449 *start = state->start; 1450 *cached_state = state; 1451 atomic_inc(&state->refs); 1452 } 1453 found++; 1454 *end = state->end; 1455 cur_start = state->end + 1; 1456 node = rb_next(node); 1457 if (!node) 1458 break; 1459 total_bytes += state->end - state->start + 1; 1460 if (total_bytes >= max_bytes) 1461 break; 1462 } 1463 out: 1464 spin_unlock(&tree->lock); 1465 return found; 1466 } 1467 1468 static noinline void __unlock_for_delalloc(struct inode *inode, 1469 struct page *locked_page, 1470 u64 start, u64 end) 1471 { 1472 int ret; 1473 struct page *pages[16]; 1474 unsigned long index = start >> PAGE_CACHE_SHIFT; 1475 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1476 unsigned long nr_pages = end_index - index + 1; 1477 int i; 1478 1479 if (index == locked_page->index && end_index == index) 1480 return; 1481 1482 while (nr_pages > 0) { 1483 ret = find_get_pages_contig(inode->i_mapping, index, 1484 min_t(unsigned long, nr_pages, 1485 ARRAY_SIZE(pages)), pages); 1486 for (i = 0; i < ret; i++) { 1487 if (pages[i] != locked_page) 1488 unlock_page(pages[i]); 1489 page_cache_release(pages[i]); 1490 } 1491 nr_pages -= ret; 1492 index += ret; 1493 cond_resched(); 1494 } 1495 } 1496 1497 static noinline int lock_delalloc_pages(struct inode *inode, 1498 struct page *locked_page, 1499 u64 delalloc_start, 1500 u64 delalloc_end) 1501 { 1502 unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT; 1503 unsigned long start_index = index; 1504 unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT; 1505 unsigned long pages_locked = 0; 1506 struct page *pages[16]; 1507 unsigned long nrpages; 1508 int ret; 1509 int i; 1510 1511 /* the caller is responsible for locking the start index */ 1512 if (index == locked_page->index && index == end_index) 1513 return 0; 1514 1515 /* skip the page at the start index */ 1516 nrpages = end_index - index + 1; 1517 while (nrpages > 0) { 1518 ret = find_get_pages_contig(inode->i_mapping, index, 1519 min_t(unsigned long, 1520 nrpages, ARRAY_SIZE(pages)), pages); 1521 if (ret == 0) { 1522 ret = -EAGAIN; 1523 goto done; 1524 } 1525 /* now we have an array of pages, lock them all */ 1526 for (i = 0; i < ret; i++) { 1527 /* 1528 * the caller is taking responsibility for 1529 * locked_page 1530 */ 1531 if (pages[i] != locked_page) { 1532 lock_page(pages[i]); 1533 if (!PageDirty(pages[i]) || 1534 pages[i]->mapping != inode->i_mapping) { 1535 ret = -EAGAIN; 1536 unlock_page(pages[i]); 1537 page_cache_release(pages[i]); 1538 goto done; 1539 } 1540 } 1541 page_cache_release(pages[i]); 1542 pages_locked++; 1543 } 1544 nrpages -= ret; 1545 index += ret; 1546 cond_resched(); 1547 } 1548 ret = 0; 1549 done: 1550 if (ret && pages_locked) { 1551 __unlock_for_delalloc(inode, locked_page, 1552 delalloc_start, 1553 ((u64)(start_index + pages_locked - 1)) << 1554 PAGE_CACHE_SHIFT); 1555 } 1556 return ret; 1557 } 1558 1559 /* 1560 * find a contiguous range of bytes in the file marked as delalloc, not 1561 * more than 'max_bytes'. 
start and end are used to return the range, 1562 * 1563 * 1 is returned if we find something, 0 if nothing was in the tree 1564 */ 1565 static noinline u64 find_lock_delalloc_range(struct inode *inode, 1566 struct extent_io_tree *tree, 1567 struct page *locked_page, 1568 u64 *start, u64 *end, 1569 u64 max_bytes) 1570 { 1571 u64 delalloc_start; 1572 u64 delalloc_end; 1573 u64 found; 1574 struct extent_state *cached_state = NULL; 1575 int ret; 1576 int loops = 0; 1577 1578 again: 1579 /* step one, find a bunch of delalloc bytes starting at start */ 1580 delalloc_start = *start; 1581 delalloc_end = 0; 1582 found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, 1583 max_bytes, &cached_state); 1584 if (!found || delalloc_end <= *start) { 1585 *start = delalloc_start; 1586 *end = delalloc_end; 1587 free_extent_state(cached_state); 1588 return found; 1589 } 1590 1591 /* 1592 * start comes from the offset of locked_page. We have to lock 1593 * pages in order, so we can't process delalloc bytes before 1594 * locked_page 1595 */ 1596 if (delalloc_start < *start) 1597 delalloc_start = *start; 1598 1599 /* 1600 * make sure to limit the number of pages we try to lock down 1601 * if we're looping. 1602 */ 1603 if (delalloc_end + 1 - delalloc_start > max_bytes && loops) 1604 delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1; 1605 1606 /* step two, lock all the pages after the page that has start */ 1607 ret = lock_delalloc_pages(inode, locked_page, 1608 delalloc_start, delalloc_end); 1609 if (ret == -EAGAIN) { 1610 /* some of the pages are gone, lets avoid looping by 1611 * shortening the size of the delalloc range we're searching 1612 */ 1613 free_extent_state(cached_state); 1614 if (!loops) { 1615 unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); 1616 max_bytes = PAGE_CACHE_SIZE - offset; 1617 loops = 1; 1618 goto again; 1619 } else { 1620 found = 0; 1621 goto out_failed; 1622 } 1623 } 1624 BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */ 1625 1626 /* step three, lock the state bits for the whole range */ 1627 lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state); 1628 1629 /* then test to make sure it is all still delalloc */ 1630 ret = test_range_bit(tree, delalloc_start, delalloc_end, 1631 EXTENT_DELALLOC, 1, cached_state); 1632 if (!ret) { 1633 unlock_extent_cached(tree, delalloc_start, delalloc_end, 1634 &cached_state, GFP_NOFS); 1635 __unlock_for_delalloc(inode, locked_page, 1636 delalloc_start, delalloc_end); 1637 cond_resched(); 1638 goto again; 1639 } 1640 free_extent_state(cached_state); 1641 *start = delalloc_start; 1642 *end = delalloc_end; 1643 out_failed: 1644 return found; 1645 } 1646 1647 int extent_clear_unlock_delalloc(struct inode *inode, 1648 struct extent_io_tree *tree, 1649 u64 start, u64 end, struct page *locked_page, 1650 unsigned long op) 1651 { 1652 int ret; 1653 struct page *pages[16]; 1654 unsigned long index = start >> PAGE_CACHE_SHIFT; 1655 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1656 unsigned long nr_pages = end_index - index + 1; 1657 int i; 1658 unsigned long clear_bits = 0; 1659 1660 if (op & EXTENT_CLEAR_UNLOCK) 1661 clear_bits |= EXTENT_LOCKED; 1662 if (op & EXTENT_CLEAR_DIRTY) 1663 clear_bits |= EXTENT_DIRTY; 1664 1665 if (op & EXTENT_CLEAR_DELALLOC) 1666 clear_bits |= EXTENT_DELALLOC; 1667 1668 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); 1669 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | 1670 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | 1671 EXTENT_SET_PRIVATE2))) 1672 
return 0; 1673 1674 while (nr_pages > 0) { 1675 ret = find_get_pages_contig(inode->i_mapping, index, 1676 min_t(unsigned long, 1677 nr_pages, ARRAY_SIZE(pages)), pages); 1678 for (i = 0; i < ret; i++) { 1679 1680 if (op & EXTENT_SET_PRIVATE2) 1681 SetPagePrivate2(pages[i]); 1682 1683 if (pages[i] == locked_page) { 1684 page_cache_release(pages[i]); 1685 continue; 1686 } 1687 if (op & EXTENT_CLEAR_DIRTY) 1688 clear_page_dirty_for_io(pages[i]); 1689 if (op & EXTENT_SET_WRITEBACK) 1690 set_page_writeback(pages[i]); 1691 if (op & EXTENT_END_WRITEBACK) 1692 end_page_writeback(pages[i]); 1693 if (op & EXTENT_CLEAR_UNLOCK_PAGE) 1694 unlock_page(pages[i]); 1695 page_cache_release(pages[i]); 1696 } 1697 nr_pages -= ret; 1698 index += ret; 1699 cond_resched(); 1700 } 1701 return 0; 1702 } 1703 1704 /* 1705 * count the number of bytes in the tree that have a given bit(s) 1706 * set. This can be fairly slow, except for EXTENT_DIRTY which is 1707 * cached. The total number found is returned. 1708 */ 1709 u64 count_range_bits(struct extent_io_tree *tree, 1710 u64 *start, u64 search_end, u64 max_bytes, 1711 unsigned long bits, int contig) 1712 { 1713 struct rb_node *node; 1714 struct extent_state *state; 1715 u64 cur_start = *start; 1716 u64 total_bytes = 0; 1717 u64 last = 0; 1718 int found = 0; 1719 1720 if (search_end <= cur_start) { 1721 WARN_ON(1); 1722 return 0; 1723 } 1724 1725 spin_lock(&tree->lock); 1726 if (cur_start == 0 && bits == EXTENT_DIRTY) { 1727 total_bytes = tree->dirty_bytes; 1728 goto out; 1729 } 1730 /* 1731 * this search will find all the extents that end after 1732 * our range starts. 1733 */ 1734 node = tree_search(tree, cur_start); 1735 if (!node) 1736 goto out; 1737 1738 while (1) { 1739 state = rb_entry(node, struct extent_state, rb_node); 1740 if (state->start > search_end) 1741 break; 1742 if (contig && found && state->start > last + 1) 1743 break; 1744 if (state->end >= cur_start && (state->state & bits) == bits) { 1745 total_bytes += min(search_end, state->end) + 1 - 1746 max(cur_start, state->start); 1747 if (total_bytes >= max_bytes) 1748 break; 1749 if (!found) { 1750 *start = max(cur_start, state->start); 1751 found = 1; 1752 } 1753 last = state->end; 1754 } else if (contig && found) { 1755 break; 1756 } 1757 node = rb_next(node); 1758 if (!node) 1759 break; 1760 } 1761 out: 1762 spin_unlock(&tree->lock); 1763 return total_bytes; 1764 } 1765 1766 /* 1767 * set the private field for a given byte offset in the tree. If there isn't 1768 * an extent_state there already, this does nothing. 1769 */ 1770 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) 1771 { 1772 struct rb_node *node; 1773 struct extent_state *state; 1774 int ret = 0; 1775 1776 spin_lock(&tree->lock); 1777 /* 1778 * this search will find all the extents that end after 1779 * our range starts. 1780 */ 1781 node = tree_search(tree, start); 1782 if (!node) { 1783 ret = -ENOENT; 1784 goto out; 1785 } 1786 state = rb_entry(node, struct extent_state, rb_node); 1787 if (state->start != start) { 1788 ret = -ENOENT; 1789 goto out; 1790 } 1791 state->private = private; 1792 out: 1793 spin_unlock(&tree->lock); 1794 return ret; 1795 } 1796 1797 void extent_cache_csums_dio(struct extent_io_tree *tree, u64 start, u32 csums[], 1798 int count) 1799 { 1800 struct rb_node *node; 1801 struct extent_state *state; 1802 1803 spin_lock(&tree->lock); 1804 /* 1805 * this search will find all the extents that end after 1806 * our range starts. 
1807 */ 1808 node = tree_search(tree, start); 1809 BUG_ON(!node); 1810 1811 state = rb_entry(node, struct extent_state, rb_node); 1812 BUG_ON(state->start != start); 1813 1814 while (count) { 1815 state->private = *csums++; 1816 count--; 1817 state = next_state(state); 1818 } 1819 spin_unlock(&tree->lock); 1820 } 1821 1822 static inline u64 __btrfs_get_bio_offset(struct bio *bio, int bio_index) 1823 { 1824 struct bio_vec *bvec = bio->bi_io_vec + bio_index; 1825 1826 return page_offset(bvec->bv_page) + bvec->bv_offset; 1827 } 1828 1829 void extent_cache_csums(struct extent_io_tree *tree, struct bio *bio, int bio_index, 1830 u32 csums[], int count) 1831 { 1832 struct rb_node *node; 1833 struct extent_state *state = NULL; 1834 u64 start; 1835 1836 spin_lock(&tree->lock); 1837 do { 1838 start = __btrfs_get_bio_offset(bio, bio_index); 1839 if (state == NULL || state->start != start) { 1840 node = tree_search(tree, start); 1841 BUG_ON(!node); 1842 1843 state = rb_entry(node, struct extent_state, rb_node); 1844 BUG_ON(state->start != start); 1845 } 1846 state->private = *csums++; 1847 count--; 1848 bio_index++; 1849 1850 state = next_state(state); 1851 } while (count); 1852 spin_unlock(&tree->lock); 1853 } 1854 1855 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) 1856 { 1857 struct rb_node *node; 1858 struct extent_state *state; 1859 int ret = 0; 1860 1861 spin_lock(&tree->lock); 1862 /* 1863 * this search will find all the extents that end after 1864 * our range starts. 1865 */ 1866 node = tree_search(tree, start); 1867 if (!node) { 1868 ret = -ENOENT; 1869 goto out; 1870 } 1871 state = rb_entry(node, struct extent_state, rb_node); 1872 if (state->start != start) { 1873 ret = -ENOENT; 1874 goto out; 1875 } 1876 *private = state->private; 1877 out: 1878 spin_unlock(&tree->lock); 1879 return ret; 1880 } 1881 1882 /* 1883 * searches a range in the state tree for a given mask. 1884 * If 'filled' == 1, this returns 1 only if every extent in the tree 1885 * has the bits set. Otherwise, 1 is returned if any bit in the 1886 * range is found set. 
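 *
 * For example, check_page_uptodate() below requires full coverage:
 *
 *	test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL);
 *
 * while check_page_locked() passes filled == 0 to ask whether any
 * EXTENT_LOCKED bit is still set in the page's range.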
1887 */ 1888 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 1889 unsigned long bits, int filled, struct extent_state *cached) 1890 { 1891 struct extent_state *state = NULL; 1892 struct rb_node *node; 1893 int bitset = 0; 1894 1895 spin_lock(&tree->lock); 1896 if (cached && cached->tree && cached->start <= start && 1897 cached->end > start) 1898 node = &cached->rb_node; 1899 else 1900 node = tree_search(tree, start); 1901 while (node && start <= end) { 1902 state = rb_entry(node, struct extent_state, rb_node); 1903 1904 if (filled && state->start > start) { 1905 bitset = 0; 1906 break; 1907 } 1908 1909 if (state->start > end) 1910 break; 1911 1912 if (state->state & bits) { 1913 bitset = 1; 1914 if (!filled) 1915 break; 1916 } else if (filled) { 1917 bitset = 0; 1918 break; 1919 } 1920 1921 if (state->end == (u64)-1) 1922 break; 1923 1924 start = state->end + 1; 1925 if (start > end) 1926 break; 1927 node = rb_next(node); 1928 if (!node) { 1929 if (filled) 1930 bitset = 0; 1931 break; 1932 } 1933 } 1934 spin_unlock(&tree->lock); 1935 return bitset; 1936 } 1937 1938 /* 1939 * helper function to set a given page up to date if all the 1940 * extents in the tree for that page are up to date 1941 */ 1942 static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) 1943 { 1944 u64 start = page_offset(page); 1945 u64 end = start + PAGE_CACHE_SIZE - 1; 1946 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) 1947 SetPageUptodate(page); 1948 } 1949 1950 /* 1951 * helper function to unlock a page if all the extents in the tree 1952 * for that page are unlocked 1953 */ 1954 static void check_page_locked(struct extent_io_tree *tree, struct page *page) 1955 { 1956 u64 start = page_offset(page); 1957 u64 end = start + PAGE_CACHE_SIZE - 1; 1958 if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) 1959 unlock_page(page); 1960 } 1961 1962 /* 1963 * helper function to end page writeback if all the extents 1964 * in the tree for that page are done with writeback 1965 */ 1966 static void check_page_writeback(struct extent_io_tree *tree, 1967 struct page *page) 1968 { 1969 end_page_writeback(page); 1970 } 1971 1972 /* 1973 * When IO fails, either with EIO or csum verification fails, we 1974 * try other mirrors that might have a good copy of the data. This 1975 * io_failure_record is used to record state as we go through all the 1976 * mirrors. If another mirror has good data, the page is set up to date 1977 * and things continue. If a good mirror can't be found, the original 1978 * bio end_io callback is called to indicate things have failed. 
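 *
 * The record is stored as the 'private' value of the failed offset in the
 * inode's io_failure_tree (see bio_readpage_error() below), and
 * clean_io_failure() looks it up when a later IO to the range completes to
 * decide whether the record can be repaired and freed.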
1979 */ 1980 struct io_failure_record { 1981 struct page *page; 1982 u64 start; 1983 u64 len; 1984 u64 logical; 1985 unsigned long bio_flags; 1986 int this_mirror; 1987 int failed_mirror; 1988 int in_validation; 1989 }; 1990 1991 static int free_io_failure(struct inode *inode, struct io_failure_record *rec, 1992 int did_repair) 1993 { 1994 int ret; 1995 int err = 0; 1996 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 1997 1998 set_state_private(failure_tree, rec->start, 0); 1999 ret = clear_extent_bits(failure_tree, rec->start, 2000 rec->start + rec->len - 1, 2001 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); 2002 if (ret) 2003 err = ret; 2004 2005 ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, 2006 rec->start + rec->len - 1, 2007 EXTENT_DAMAGED, GFP_NOFS); 2008 if (ret && !err) 2009 err = ret; 2010 2011 kfree(rec); 2012 return err; 2013 } 2014 2015 static void repair_io_failure_callback(struct bio *bio, int err) 2016 { 2017 complete(bio->bi_private); 2018 } 2019 2020 /* 2021 * this bypasses the standard btrfs submit functions deliberately, as 2022 * the standard behavior is to write all copies in a raid setup. here we only 2023 * want to write the one bad copy. so we do the mapping for ourselves and issue 2024 * submit_bio directly. 2025 * to avoid any synchronization issues, wait for the data after writing, which 2026 * actually prevents the read that triggered the error from finishing. 2027 * currently, there can be no more than two copies of every data bit. thus, 2028 * exactly one rewrite is required. 2029 */ 2030 int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, 2031 u64 length, u64 logical, struct page *page, 2032 int mirror_num) 2033 { 2034 struct bio *bio; 2035 struct btrfs_device *dev; 2036 DECLARE_COMPLETION_ONSTACK(compl); 2037 u64 map_length = 0; 2038 u64 sector; 2039 struct btrfs_bio *bbio = NULL; 2040 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 2041 int ret; 2042 2043 BUG_ON(!mirror_num); 2044 2045 /* we can't repair anything in raid56 yet */ 2046 if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num)) 2047 return 0; 2048 2049 bio = bio_alloc(GFP_NOFS, 1); 2050 if (!bio) 2051 return -EIO; 2052 bio->bi_private = &compl; 2053 bio->bi_end_io = repair_io_failure_callback; 2054 bio->bi_size = 0; 2055 map_length = length; 2056 2057 ret = btrfs_map_block(fs_info, WRITE, logical, 2058 &map_length, &bbio, mirror_num); 2059 if (ret) { 2060 bio_put(bio); 2061 return -EIO; 2062 } 2063 BUG_ON(mirror_num != bbio->mirror_num); 2064 sector = bbio->stripes[mirror_num-1].physical >> 9; 2065 bio->bi_sector = sector; 2066 dev = bbio->stripes[mirror_num-1].dev; 2067 kfree(bbio); 2068 if (!dev || !dev->bdev || !dev->writeable) { 2069 bio_put(bio); 2070 return -EIO; 2071 } 2072 bio->bi_bdev = dev->bdev; 2073 bio_add_page(bio, page, length, start - page_offset(page)); 2074 btrfsic_submit_bio(WRITE_SYNC, bio); 2075 wait_for_completion(&compl); 2076 2077 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { 2078 /* try to remap that extent elsewhere? 
*/ 2079 bio_put(bio); 2080 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); 2081 return -EIO; 2082 } 2083 2084 printk_ratelimited_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu " 2085 "(dev %s sector %llu)\n", page->mapping->host->i_ino, 2086 start, rcu_str_deref(dev->name), sector); 2087 2088 bio_put(bio); 2089 return 0; 2090 } 2091 2092 int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, 2093 int mirror_num) 2094 { 2095 u64 start = eb->start; 2096 unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); 2097 int ret = 0; 2098 2099 for (i = 0; i < num_pages; i++) { 2100 struct page *p = extent_buffer_page(eb, i); 2101 ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE, 2102 start, p, mirror_num); 2103 if (ret) 2104 break; 2105 start += PAGE_CACHE_SIZE; 2106 } 2107 2108 return ret; 2109 } 2110 2111 /* 2112 * each time an IO finishes, we do a fast check in the IO failure tree 2113 * to see if we need to process or clean up an io_failure_record 2114 */ 2115 static int clean_io_failure(u64 start, struct page *page) 2116 { 2117 u64 private; 2118 u64 private_failure; 2119 struct io_failure_record *failrec; 2120 struct btrfs_fs_info *fs_info; 2121 struct extent_state *state; 2122 int num_copies; 2123 int did_repair = 0; 2124 int ret; 2125 struct inode *inode = page->mapping->host; 2126 2127 private = 0; 2128 ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, 2129 (u64)-1, 1, EXTENT_DIRTY, 0); 2130 if (!ret) 2131 return 0; 2132 2133 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start, 2134 &private_failure); 2135 if (ret) 2136 return 0; 2137 2138 failrec = (struct io_failure_record *)(unsigned long) private_failure; 2139 BUG_ON(!failrec->this_mirror); 2140 2141 if (failrec->in_validation) { 2142 /* there was no real error, just free the record */ 2143 pr_debug("clean_io_failure: freeing dummy error at %llu\n", 2144 failrec->start); 2145 did_repair = 1; 2146 goto out; 2147 } 2148 2149 spin_lock(&BTRFS_I(inode)->io_tree.lock); 2150 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, 2151 failrec->start, 2152 EXTENT_LOCKED); 2153 spin_unlock(&BTRFS_I(inode)->io_tree.lock); 2154 2155 if (state && state->start == failrec->start) { 2156 fs_info = BTRFS_I(inode)->root->fs_info; 2157 num_copies = btrfs_num_copies(fs_info, failrec->logical, 2158 failrec->len); 2159 if (num_copies > 1) { 2160 ret = repair_io_failure(fs_info, start, failrec->len, 2161 failrec->logical, page, 2162 failrec->failed_mirror); 2163 did_repair = !ret; 2164 } 2165 ret = 0; 2166 } 2167 2168 out: 2169 if (!ret) 2170 ret = free_io_failure(inode, failrec, did_repair); 2171 2172 return ret; 2173 } 2174 2175 /* 2176 * this is a generic handler for readpage errors (default 2177 * readpage_io_failed_hook). if other copies exist, read those and write back 2178 * good data to the failed position. 
 * does not try to remap the failed extent elsewhere, hoping the device
 * will be smart enough to do this as needed
 */

static int bio_readpage_error(struct bio *failed_bio, struct page *page,
			      u64 start, u64 end, int failed_mirror,
			      struct extent_state *state)
{
	struct io_failure_record *failrec = NULL;
	u64 private;
	struct extent_map *em;
	struct inode *inode = page->mapping->host;
	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct bio *bio;
	int num_copies;
	int ret;
	int read_mode;
	u64 logical;

	BUG_ON(failed_bio->bi_rw & REQ_WRITE);

	ret = get_state_private(failure_tree, start, &private);
	if (ret) {
		failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
		if (!failrec)
			return -ENOMEM;
		failrec->start = start;
		failrec->len = end - start + 1;
		failrec->this_mirror = 0;
		failrec->bio_flags = 0;
		failrec->in_validation = 0;

		read_lock(&em_tree->lock);
		em = lookup_extent_mapping(em_tree, start, failrec->len);
		if (!em) {
			read_unlock(&em_tree->lock);
			kfree(failrec);
			return -EIO;
		}

		if (em->start > start || em->start + em->len < start) {
			free_extent_map(em);
			em = NULL;
		}
		read_unlock(&em_tree->lock);

		if (!em) {
			kfree(failrec);
			return -EIO;
		}
		logical = start - em->start;
		logical = em->block_start + logical;
		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
			logical = em->block_start;
			failrec->bio_flags = EXTENT_BIO_COMPRESSED;
			extent_set_compress_type(&failrec->bio_flags,
						 em->compress_type);
		}
		pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
			 "len=%llu\n", logical, start, failrec->len);
		failrec->logical = logical;
		free_extent_map(em);

		/* set the bits in the private failure tree */
		ret = set_extent_bits(failure_tree, start, end,
				      EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
		if (ret >= 0)
			ret = set_state_private(failure_tree, start,
						(u64)(unsigned long)failrec);
		/* set the bits in the inode's tree */
		if (ret >= 0)
			ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
					      GFP_NOFS);
		if (ret < 0) {
			kfree(failrec);
			return ret;
		}
	} else {
		failrec = (struct io_failure_record *)(unsigned long)private;
		pr_debug("bio_readpage_error: (found) logical=%llu, "
			 "start=%llu, len=%llu, validation=%d\n",
			 failrec->logical, failrec->start, failrec->len,
			 failrec->in_validation);
		/*
		 * when data can be on disk more than twice, add to failrec here
		 * (e.g. with a list for failed_mirror) to make
		 * clean_io_failure() clean all those errors at once.
		 */
	}
	num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
				      failrec->logical, failrec->len);
	if (num_copies == 1) {
		/*
		 * we only have a single copy of the data, so don't bother with
		 * all the retry and error correction code that follows. no
		 * matter what the error is, it is very likely to persist.
		 */
		pr_debug("bio_readpage_error: cannot repair, num_copies == 1. 
" 2279 "state=%p, num_copies=%d, next_mirror %d, " 2280 "failed_mirror %d\n", state, num_copies, 2281 failrec->this_mirror, failed_mirror); 2282 free_io_failure(inode, failrec, 0); 2283 return -EIO; 2284 } 2285 2286 if (!state) { 2287 spin_lock(&tree->lock); 2288 state = find_first_extent_bit_state(tree, failrec->start, 2289 EXTENT_LOCKED); 2290 if (state && state->start != failrec->start) 2291 state = NULL; 2292 spin_unlock(&tree->lock); 2293 } 2294 2295 /* 2296 * there are two premises: 2297 * a) deliver good data to the caller 2298 * b) correct the bad sectors on disk 2299 */ 2300 if (failed_bio->bi_vcnt > 1) { 2301 /* 2302 * to fulfill b), we need to know the exact failing sectors, as 2303 * we don't want to rewrite any more than the failed ones. thus, 2304 * we need separate read requests for the failed bio 2305 * 2306 * if the following BUG_ON triggers, our validation request got 2307 * merged. we need separate requests for our algorithm to work. 2308 */ 2309 BUG_ON(failrec->in_validation); 2310 failrec->in_validation = 1; 2311 failrec->this_mirror = failed_mirror; 2312 read_mode = READ_SYNC | REQ_FAILFAST_DEV; 2313 } else { 2314 /* 2315 * we're ready to fulfill a) and b) alongside. get a good copy 2316 * of the failed sector and if we succeed, we have setup 2317 * everything for repair_io_failure to do the rest for us. 2318 */ 2319 if (failrec->in_validation) { 2320 BUG_ON(failrec->this_mirror != failed_mirror); 2321 failrec->in_validation = 0; 2322 failrec->this_mirror = 0; 2323 } 2324 failrec->failed_mirror = failed_mirror; 2325 failrec->this_mirror++; 2326 if (failrec->this_mirror == failed_mirror) 2327 failrec->this_mirror++; 2328 read_mode = READ_SYNC; 2329 } 2330 2331 if (!state || failrec->this_mirror > num_copies) { 2332 pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, " 2333 "next_mirror %d, failed_mirror %d\n", state, 2334 num_copies, failrec->this_mirror, failed_mirror); 2335 free_io_failure(inode, failrec, 0); 2336 return -EIO; 2337 } 2338 2339 bio = bio_alloc(GFP_NOFS, 1); 2340 if (!bio) { 2341 free_io_failure(inode, failrec, 0); 2342 return -EIO; 2343 } 2344 bio->bi_private = state; 2345 bio->bi_end_io = failed_bio->bi_end_io; 2346 bio->bi_sector = failrec->logical >> 9; 2347 bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; 2348 bio->bi_size = 0; 2349 2350 bio_add_page(bio, page, failrec->len, start - page_offset(page)); 2351 2352 pr_debug("bio_readpage_error: submitting new read[%#x] to " 2353 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode, 2354 failrec->this_mirror, num_copies, failrec->in_validation); 2355 2356 ret = tree->ops->submit_bio_hook(inode, read_mode, bio, 2357 failrec->this_mirror, 2358 failrec->bio_flags, 0); 2359 return ret; 2360 } 2361 2362 /* lots and lots of room for performance fixes in the end_bio funcs */ 2363 2364 int end_extent_writepage(struct page *page, int err, u64 start, u64 end) 2365 { 2366 int uptodate = (err == 0); 2367 struct extent_io_tree *tree; 2368 int ret; 2369 2370 tree = &BTRFS_I(page->mapping->host)->io_tree; 2371 2372 if (tree->ops && tree->ops->writepage_end_io_hook) { 2373 ret = tree->ops->writepage_end_io_hook(page, start, 2374 end, NULL, uptodate); 2375 if (ret) 2376 uptodate = 0; 2377 } 2378 2379 if (!uptodate) { 2380 ClearPageUptodate(page); 2381 SetPageError(page); 2382 } 2383 return 0; 2384 } 2385 2386 /* 2387 * after a writepage IO is done, we need to: 2388 * clear the uptodate bits on error 2389 * clear the writeback bits in the extent tree for this IO 2390 * 
end_page_writeback if the page has no more pending IO 2391 * 2392 * Scheduling is not allowed, so the extent state tree is expected 2393 * to have one and only one object corresponding to this IO. 2394 */ 2395 static void end_bio_extent_writepage(struct bio *bio, int err) 2396 { 2397 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 2398 struct extent_io_tree *tree; 2399 u64 start; 2400 u64 end; 2401 int whole_page; 2402 2403 do { 2404 struct page *page = bvec->bv_page; 2405 tree = &BTRFS_I(page->mapping->host)->io_tree; 2406 2407 start = page_offset(page) + bvec->bv_offset; 2408 end = start + bvec->bv_len - 1; 2409 2410 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) 2411 whole_page = 1; 2412 else 2413 whole_page = 0; 2414 2415 if (--bvec >= bio->bi_io_vec) 2416 prefetchw(&bvec->bv_page->flags); 2417 2418 if (end_extent_writepage(page, err, start, end)) 2419 continue; 2420 2421 if (whole_page) 2422 end_page_writeback(page); 2423 else 2424 check_page_writeback(tree, page); 2425 } while (bvec >= bio->bi_io_vec); 2426 2427 bio_put(bio); 2428 } 2429 2430 /* 2431 * after a readpage IO is done, we need to: 2432 * clear the uptodate bits on error 2433 * set the uptodate bits if things worked 2434 * set the page up to date if all extents in the tree are uptodate 2435 * clear the lock bit in the extent tree 2436 * unlock the page if there are no other extents locked for it 2437 * 2438 * Scheduling is not allowed, so the extent state tree is expected 2439 * to have one and only one object corresponding to this IO. 2440 */ 2441 static void end_bio_extent_readpage(struct bio *bio, int err) 2442 { 2443 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 2444 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; 2445 struct bio_vec *bvec = bio->bi_io_vec; 2446 struct extent_io_tree *tree; 2447 u64 start; 2448 u64 end; 2449 int whole_page; 2450 int mirror; 2451 int ret; 2452 2453 if (err) 2454 uptodate = 0; 2455 2456 do { 2457 struct page *page = bvec->bv_page; 2458 struct extent_state *cached = NULL; 2459 struct extent_state *state; 2460 2461 pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " 2462 "mirror=%ld\n", (u64)bio->bi_sector, err, 2463 (long int)bio->bi_bdev); 2464 tree = &BTRFS_I(page->mapping->host)->io_tree; 2465 2466 start = page_offset(page) + bvec->bv_offset; 2467 end = start + bvec->bv_len - 1; 2468 2469 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) 2470 whole_page = 1; 2471 else 2472 whole_page = 0; 2473 2474 if (++bvec <= bvec_end) 2475 prefetchw(&bvec->bv_page->flags); 2476 2477 spin_lock(&tree->lock); 2478 state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED); 2479 if (state && state->start == start) { 2480 /* 2481 * take a reference on the state, unlock will drop 2482 * the ref 2483 */ 2484 cache_state(state, &cached); 2485 } 2486 spin_unlock(&tree->lock); 2487 2488 mirror = (int)(unsigned long)bio->bi_bdev; 2489 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { 2490 ret = tree->ops->readpage_end_io_hook(page, start, end, 2491 state, mirror); 2492 if (ret) 2493 uptodate = 0; 2494 else 2495 clean_io_failure(start, page); 2496 } 2497 2498 if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) { 2499 ret = tree->ops->readpage_io_failed_hook(page, mirror); 2500 if (!ret && !err && 2501 test_bit(BIO_UPTODATE, &bio->bi_flags)) 2502 uptodate = 1; 2503 } else if (!uptodate) { 2504 /* 2505 * The generic bio_readpage_error handles errors the 2506 * following way: If possible, new read requests are 2507 
* created and submitted and will end up in 2508 * end_bio_extent_readpage as well (if we're lucky, not 2509 * in the !uptodate case). In that case it returns 0 and 2510 * we just go on with the next page in our bio. If it 2511 * can't handle the error it will return -EIO and we 2512 * remain responsible for that page. 2513 */ 2514 ret = bio_readpage_error(bio, page, start, end, mirror, NULL); 2515 if (ret == 0) { 2516 uptodate = 2517 test_bit(BIO_UPTODATE, &bio->bi_flags); 2518 if (err) 2519 uptodate = 0; 2520 uncache_state(&cached); 2521 continue; 2522 } 2523 } 2524 2525 if (uptodate && tree->track_uptodate) { 2526 set_extent_uptodate(tree, start, end, &cached, 2527 GFP_ATOMIC); 2528 } 2529 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); 2530 2531 if (whole_page) { 2532 if (uptodate) { 2533 SetPageUptodate(page); 2534 } else { 2535 ClearPageUptodate(page); 2536 SetPageError(page); 2537 } 2538 unlock_page(page); 2539 } else { 2540 if (uptodate) { 2541 check_page_uptodate(tree, page); 2542 } else { 2543 ClearPageUptodate(page); 2544 SetPageError(page); 2545 } 2546 check_page_locked(tree, page); 2547 } 2548 } while (bvec <= bvec_end); 2549 2550 bio_put(bio); 2551 } 2552 2553 struct bio * 2554 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 2555 gfp_t gfp_flags) 2556 { 2557 struct bio *bio; 2558 2559 bio = bio_alloc(gfp_flags, nr_vecs); 2560 2561 if (bio == NULL && (current->flags & PF_MEMALLOC)) { 2562 while (!bio && (nr_vecs /= 2)) 2563 bio = bio_alloc(gfp_flags, nr_vecs); 2564 } 2565 2566 if (bio) { 2567 bio->bi_size = 0; 2568 bio->bi_bdev = bdev; 2569 bio->bi_sector = first_sector; 2570 } 2571 return bio; 2572 } 2573 2574 static int __must_check submit_one_bio(int rw, struct bio *bio, 2575 int mirror_num, unsigned long bio_flags) 2576 { 2577 int ret = 0; 2578 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 2579 struct page *page = bvec->bv_page; 2580 struct extent_io_tree *tree = bio->bi_private; 2581 u64 start; 2582 2583 start = page_offset(page) + bvec->bv_offset; 2584 2585 bio->bi_private = NULL; 2586 2587 bio_get(bio); 2588 2589 if (tree->ops && tree->ops->submit_bio_hook) 2590 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 2591 mirror_num, bio_flags, start); 2592 else 2593 btrfsic_submit_bio(rw, bio); 2594 2595 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 2596 ret = -EOPNOTSUPP; 2597 bio_put(bio); 2598 return ret; 2599 } 2600 2601 static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page, 2602 unsigned long offset, size_t size, struct bio *bio, 2603 unsigned long bio_flags) 2604 { 2605 int ret = 0; 2606 if (tree->ops && tree->ops->merge_bio_hook) 2607 ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio, 2608 bio_flags); 2609 BUG_ON(ret < 0); 2610 return ret; 2611 2612 } 2613 2614 static int submit_extent_page(int rw, struct extent_io_tree *tree, 2615 struct page *page, sector_t sector, 2616 size_t size, unsigned long offset, 2617 struct block_device *bdev, 2618 struct bio **bio_ret, 2619 unsigned long max_pages, 2620 bio_end_io_t end_io_func, 2621 int mirror_num, 2622 unsigned long prev_bio_flags, 2623 unsigned long bio_flags) 2624 { 2625 int ret = 0; 2626 struct bio *bio; 2627 int nr; 2628 int contig = 0; 2629 int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED; 2630 int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; 2631 size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE); 2632 2633 if (bio_ret && *bio_ret) { 2634 bio = *bio_ret; 2635 if (old_compressed) 2636 contig = 
bio->bi_sector == sector; 2637 else 2638 contig = bio_end_sector(bio) == sector; 2639 2640 if (prev_bio_flags != bio_flags || !contig || 2641 merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) || 2642 bio_add_page(bio, page, page_size, offset) < page_size) { 2643 ret = submit_one_bio(rw, bio, mirror_num, 2644 prev_bio_flags); 2645 if (ret < 0) 2646 return ret; 2647 bio = NULL; 2648 } else { 2649 return 0; 2650 } 2651 } 2652 if (this_compressed) 2653 nr = BIO_MAX_PAGES; 2654 else 2655 nr = bio_get_nr_vecs(bdev); 2656 2657 bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); 2658 if (!bio) 2659 return -ENOMEM; 2660 2661 bio_add_page(bio, page, page_size, offset); 2662 bio->bi_end_io = end_io_func; 2663 bio->bi_private = tree; 2664 2665 if (bio_ret) 2666 *bio_ret = bio; 2667 else 2668 ret = submit_one_bio(rw, bio, mirror_num, bio_flags); 2669 2670 return ret; 2671 } 2672 2673 static void attach_extent_buffer_page(struct extent_buffer *eb, 2674 struct page *page) 2675 { 2676 if (!PagePrivate(page)) { 2677 SetPagePrivate(page); 2678 page_cache_get(page); 2679 set_page_private(page, (unsigned long)eb); 2680 } else { 2681 WARN_ON(page->private != (unsigned long)eb); 2682 } 2683 } 2684 2685 void set_page_extent_mapped(struct page *page) 2686 { 2687 if (!PagePrivate(page)) { 2688 SetPagePrivate(page); 2689 page_cache_get(page); 2690 set_page_private(page, EXTENT_PAGE_PRIVATE); 2691 } 2692 } 2693 2694 /* 2695 * basic readpage implementation. Locked extent state structs are inserted 2696 * into the tree that are removed when the IO is done (by the end_io 2697 * handlers) 2698 * XXX JDM: This needs looking at to ensure proper page locking 2699 */ 2700 static int __extent_read_full_page(struct extent_io_tree *tree, 2701 struct page *page, 2702 get_extent_t *get_extent, 2703 struct bio **bio, int mirror_num, 2704 unsigned long *bio_flags, int rw) 2705 { 2706 struct inode *inode = page->mapping->host; 2707 u64 start = page_offset(page); 2708 u64 page_end = start + PAGE_CACHE_SIZE - 1; 2709 u64 end; 2710 u64 cur = start; 2711 u64 extent_offset; 2712 u64 last_byte = i_size_read(inode); 2713 u64 block_start; 2714 u64 cur_end; 2715 sector_t sector; 2716 struct extent_map *em; 2717 struct block_device *bdev; 2718 struct btrfs_ordered_extent *ordered; 2719 int ret; 2720 int nr = 0; 2721 size_t pg_offset = 0; 2722 size_t iosize; 2723 size_t disk_io_size; 2724 size_t blocksize = inode->i_sb->s_blocksize; 2725 unsigned long this_bio_flag = 0; 2726 2727 set_page_extent_mapped(page); 2728 2729 if (!PageUptodate(page)) { 2730 if (cleancache_get_page(page) == 0) { 2731 BUG_ON(blocksize != PAGE_SIZE); 2732 goto out; 2733 } 2734 } 2735 2736 end = page_end; 2737 while (1) { 2738 lock_extent(tree, start, end); 2739 ordered = btrfs_lookup_ordered_extent(inode, start); 2740 if (!ordered) 2741 break; 2742 unlock_extent(tree, start, end); 2743 btrfs_start_ordered_extent(inode, ordered, 1); 2744 btrfs_put_ordered_extent(ordered); 2745 } 2746 2747 if (page->index == last_byte >> PAGE_CACHE_SHIFT) { 2748 char *userpage; 2749 size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1); 2750 2751 if (zero_offset) { 2752 iosize = PAGE_CACHE_SIZE - zero_offset; 2753 userpage = kmap_atomic(page); 2754 memset(userpage + zero_offset, 0, iosize); 2755 flush_dcache_page(page); 2756 kunmap_atomic(userpage); 2757 } 2758 } 2759 while (cur <= end) { 2760 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; 2761 2762 if (cur >= last_byte) { 2763 char *userpage; 2764 struct extent_state *cached = NULL; 2765 2766 iosize = 
PAGE_CACHE_SIZE - pg_offset; 2767 userpage = kmap_atomic(page); 2768 memset(userpage + pg_offset, 0, iosize); 2769 flush_dcache_page(page); 2770 kunmap_atomic(userpage); 2771 set_extent_uptodate(tree, cur, cur + iosize - 1, 2772 &cached, GFP_NOFS); 2773 unlock_extent_cached(tree, cur, cur + iosize - 1, 2774 &cached, GFP_NOFS); 2775 break; 2776 } 2777 em = get_extent(inode, page, pg_offset, cur, 2778 end - cur + 1, 0); 2779 if (IS_ERR_OR_NULL(em)) { 2780 SetPageError(page); 2781 unlock_extent(tree, cur, end); 2782 break; 2783 } 2784 extent_offset = cur - em->start; 2785 BUG_ON(extent_map_end(em) <= cur); 2786 BUG_ON(end < cur); 2787 2788 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 2789 this_bio_flag = EXTENT_BIO_COMPRESSED; 2790 extent_set_compress_type(&this_bio_flag, 2791 em->compress_type); 2792 } 2793 2794 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2795 cur_end = min(extent_map_end(em) - 1, end); 2796 iosize = ALIGN(iosize, blocksize); 2797 if (this_bio_flag & EXTENT_BIO_COMPRESSED) { 2798 disk_io_size = em->block_len; 2799 sector = em->block_start >> 9; 2800 } else { 2801 sector = (em->block_start + extent_offset) >> 9; 2802 disk_io_size = iosize; 2803 } 2804 bdev = em->bdev; 2805 block_start = em->block_start; 2806 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 2807 block_start = EXTENT_MAP_HOLE; 2808 free_extent_map(em); 2809 em = NULL; 2810 2811 /* we've found a hole, just zero and go on */ 2812 if (block_start == EXTENT_MAP_HOLE) { 2813 char *userpage; 2814 struct extent_state *cached = NULL; 2815 2816 userpage = kmap_atomic(page); 2817 memset(userpage + pg_offset, 0, iosize); 2818 flush_dcache_page(page); 2819 kunmap_atomic(userpage); 2820 2821 set_extent_uptodate(tree, cur, cur + iosize - 1, 2822 &cached, GFP_NOFS); 2823 unlock_extent_cached(tree, cur, cur + iosize - 1, 2824 &cached, GFP_NOFS); 2825 cur = cur + iosize; 2826 pg_offset += iosize; 2827 continue; 2828 } 2829 /* the get_extent function already copied into the page */ 2830 if (test_range_bit(tree, cur, cur_end, 2831 EXTENT_UPTODATE, 1, NULL)) { 2832 check_page_uptodate(tree, page); 2833 unlock_extent(tree, cur, cur + iosize - 1); 2834 cur = cur + iosize; 2835 pg_offset += iosize; 2836 continue; 2837 } 2838 /* we have an inline extent but it didn't get marked up 2839 * to date. 
Error out 2840 */ 2841 if (block_start == EXTENT_MAP_INLINE) { 2842 SetPageError(page); 2843 unlock_extent(tree, cur, cur + iosize - 1); 2844 cur = cur + iosize; 2845 pg_offset += iosize; 2846 continue; 2847 } 2848 2849 pnr -= page->index; 2850 ret = submit_extent_page(rw, tree, page, 2851 sector, disk_io_size, pg_offset, 2852 bdev, bio, pnr, 2853 end_bio_extent_readpage, mirror_num, 2854 *bio_flags, 2855 this_bio_flag); 2856 if (!ret) { 2857 nr++; 2858 *bio_flags = this_bio_flag; 2859 } else { 2860 SetPageError(page); 2861 unlock_extent(tree, cur, cur + iosize - 1); 2862 } 2863 cur = cur + iosize; 2864 pg_offset += iosize; 2865 } 2866 out: 2867 if (!nr) { 2868 if (!PageError(page)) 2869 SetPageUptodate(page); 2870 unlock_page(page); 2871 } 2872 return 0; 2873 } 2874 2875 int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 2876 get_extent_t *get_extent, int mirror_num) 2877 { 2878 struct bio *bio = NULL; 2879 unsigned long bio_flags = 0; 2880 int ret; 2881 2882 ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, 2883 &bio_flags, READ); 2884 if (bio) 2885 ret = submit_one_bio(READ, bio, mirror_num, bio_flags); 2886 return ret; 2887 } 2888 2889 static noinline void update_nr_written(struct page *page, 2890 struct writeback_control *wbc, 2891 unsigned long nr_written) 2892 { 2893 wbc->nr_to_write -= nr_written; 2894 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && 2895 wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) 2896 page->mapping->writeback_index = page->index + nr_written; 2897 } 2898 2899 /* 2900 * the writepage semantics are similar to regular writepage. extent 2901 * records are inserted to lock ranges in the tree, and as dirty areas 2902 * are found, they are marked writeback. Then the lock bits are removed 2903 * and the end_io handler clears the writeback ranges 2904 */ 2905 static int __extent_writepage(struct page *page, struct writeback_control *wbc, 2906 void *data) 2907 { 2908 struct inode *inode = page->mapping->host; 2909 struct extent_page_data *epd = data; 2910 struct extent_io_tree *tree = epd->tree; 2911 u64 start = page_offset(page); 2912 u64 delalloc_start; 2913 u64 page_end = start + PAGE_CACHE_SIZE - 1; 2914 u64 end; 2915 u64 cur = start; 2916 u64 extent_offset; 2917 u64 last_byte = i_size_read(inode); 2918 u64 block_start; 2919 u64 iosize; 2920 sector_t sector; 2921 struct extent_state *cached_state = NULL; 2922 struct extent_map *em; 2923 struct block_device *bdev; 2924 int ret; 2925 int nr = 0; 2926 size_t pg_offset = 0; 2927 size_t blocksize; 2928 loff_t i_size = i_size_read(inode); 2929 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; 2930 u64 nr_delalloc; 2931 u64 delalloc_end; 2932 int page_started; 2933 int compressed; 2934 int write_flags; 2935 unsigned long nr_written = 0; 2936 bool fill_delalloc = true; 2937 2938 if (wbc->sync_mode == WB_SYNC_ALL) 2939 write_flags = WRITE_SYNC; 2940 else 2941 write_flags = WRITE; 2942 2943 trace___extent_writepage(page, inode, wbc); 2944 2945 WARN_ON(!PageLocked(page)); 2946 2947 ClearPageError(page); 2948 2949 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2950 if (page->index > end_index || 2951 (page->index == end_index && !pg_offset)) { 2952 page->mapping->a_ops->invalidatepage(page, 0); 2953 unlock_page(page); 2954 return 0; 2955 } 2956 2957 if (page->index == end_index) { 2958 char *userpage; 2959 2960 userpage = kmap_atomic(page); 2961 memset(userpage + pg_offset, 0, 2962 PAGE_CACHE_SIZE - pg_offset); 2963 kunmap_atomic(userpage); 2964 flush_dcache_page(page); 
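		/*
		 * note: the memset above zeroes the tail of the last page,
		 * i.e. everything from i_size to the end of the page, so
		 * stale page contents beyond EOF are never written to disk
		 */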
2965 } 2966 pg_offset = 0; 2967 2968 set_page_extent_mapped(page); 2969 2970 if (!tree->ops || !tree->ops->fill_delalloc) 2971 fill_delalloc = false; 2972 2973 delalloc_start = start; 2974 delalloc_end = 0; 2975 page_started = 0; 2976 if (!epd->extent_locked && fill_delalloc) { 2977 u64 delalloc_to_write = 0; 2978 /* 2979 * make sure the wbc mapping index is at least updated 2980 * to this page. 2981 */ 2982 update_nr_written(page, wbc, 0); 2983 2984 while (delalloc_end < page_end) { 2985 nr_delalloc = find_lock_delalloc_range(inode, tree, 2986 page, 2987 &delalloc_start, 2988 &delalloc_end, 2989 128 * 1024 * 1024); 2990 if (nr_delalloc == 0) { 2991 delalloc_start = delalloc_end + 1; 2992 continue; 2993 } 2994 ret = tree->ops->fill_delalloc(inode, page, 2995 delalloc_start, 2996 delalloc_end, 2997 &page_started, 2998 &nr_written); 2999 /* File system has been set read-only */ 3000 if (ret) { 3001 SetPageError(page); 3002 goto done; 3003 } 3004 /* 3005 * delalloc_end is already one less than the total 3006 * length, so we don't subtract one from 3007 * PAGE_CACHE_SIZE 3008 */ 3009 delalloc_to_write += (delalloc_end - delalloc_start + 3010 PAGE_CACHE_SIZE) >> 3011 PAGE_CACHE_SHIFT; 3012 delalloc_start = delalloc_end + 1; 3013 } 3014 if (wbc->nr_to_write < delalloc_to_write) { 3015 int thresh = 8192; 3016 3017 if (delalloc_to_write < thresh * 2) 3018 thresh = delalloc_to_write; 3019 wbc->nr_to_write = min_t(u64, delalloc_to_write, 3020 thresh); 3021 } 3022 3023 /* did the fill delalloc function already unlock and start 3024 * the IO? 3025 */ 3026 if (page_started) { 3027 ret = 0; 3028 /* 3029 * we've unlocked the page, so we can't update 3030 * the mapping's writeback index, just update 3031 * nr_to_write. 3032 */ 3033 wbc->nr_to_write -= nr_written; 3034 goto done_unlocked; 3035 } 3036 } 3037 if (tree->ops && tree->ops->writepage_start_hook) { 3038 ret = tree->ops->writepage_start_hook(page, start, 3039 page_end); 3040 if (ret) { 3041 /* Fixup worker will requeue */ 3042 if (ret == -EBUSY) 3043 wbc->pages_skipped++; 3044 else 3045 redirty_page_for_writepage(wbc, page); 3046 update_nr_written(page, wbc, nr_written); 3047 unlock_page(page); 3048 ret = 0; 3049 goto done_unlocked; 3050 } 3051 } 3052 3053 /* 3054 * we don't want to touch the inode after unlocking the page, 3055 * so we update the mapping writeback index now 3056 */ 3057 update_nr_written(page, wbc, nr_written + 1); 3058 3059 end = page_end; 3060 if (last_byte <= start) { 3061 if (tree->ops && tree->ops->writepage_end_io_hook) 3062 tree->ops->writepage_end_io_hook(page, start, 3063 page_end, NULL, 1); 3064 goto done; 3065 } 3066 3067 blocksize = inode->i_sb->s_blocksize; 3068 3069 while (cur <= end) { 3070 if (cur >= last_byte) { 3071 if (tree->ops && tree->ops->writepage_end_io_hook) 3072 tree->ops->writepage_end_io_hook(page, cur, 3073 page_end, NULL, 1); 3074 break; 3075 } 3076 em = epd->get_extent(inode, page, pg_offset, cur, 3077 end - cur + 1, 1); 3078 if (IS_ERR_OR_NULL(em)) { 3079 SetPageError(page); 3080 break; 3081 } 3082 3083 extent_offset = cur - em->start; 3084 BUG_ON(extent_map_end(em) <= cur); 3085 BUG_ON(end < cur); 3086 iosize = min(extent_map_end(em) - cur, end - cur + 1); 3087 iosize = ALIGN(iosize, blocksize); 3088 sector = (em->block_start + extent_offset) >> 9; 3089 bdev = em->bdev; 3090 block_start = em->block_start; 3091 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 3092 free_extent_map(em); 3093 em = NULL; 3094 3095 /* 3096 * compressed and inline extents are written through other 3097 * paths 
in the FS 3098 */ 3099 if (compressed || block_start == EXTENT_MAP_HOLE || 3100 block_start == EXTENT_MAP_INLINE) { 3101 /* 3102 * end_io notification does not happen here for 3103 * compressed extents 3104 */ 3105 if (!compressed && tree->ops && 3106 tree->ops->writepage_end_io_hook) 3107 tree->ops->writepage_end_io_hook(page, cur, 3108 cur + iosize - 1, 3109 NULL, 1); 3110 else if (compressed) { 3111 /* we don't want to end_page_writeback on 3112 * a compressed extent. this happens 3113 * elsewhere 3114 */ 3115 nr++; 3116 } 3117 3118 cur += iosize; 3119 pg_offset += iosize; 3120 continue; 3121 } 3122 /* leave this out until we have a page_mkwrite call */ 3123 if (0 && !test_range_bit(tree, cur, cur + iosize - 1, 3124 EXTENT_DIRTY, 0, NULL)) { 3125 cur = cur + iosize; 3126 pg_offset += iosize; 3127 continue; 3128 } 3129 3130 if (tree->ops && tree->ops->writepage_io_hook) { 3131 ret = tree->ops->writepage_io_hook(page, cur, 3132 cur + iosize - 1); 3133 } else { 3134 ret = 0; 3135 } 3136 if (ret) { 3137 SetPageError(page); 3138 } else { 3139 unsigned long max_nr = end_index + 1; 3140 3141 set_range_writeback(tree, cur, cur + iosize - 1); 3142 if (!PageWriteback(page)) { 3143 printk(KERN_ERR "btrfs warning page %lu not " 3144 "writeback, cur %llu end %llu\n", 3145 page->index, (unsigned long long)cur, 3146 (unsigned long long)end); 3147 } 3148 3149 ret = submit_extent_page(write_flags, tree, page, 3150 sector, iosize, pg_offset, 3151 bdev, &epd->bio, max_nr, 3152 end_bio_extent_writepage, 3153 0, 0, 0); 3154 if (ret) 3155 SetPageError(page); 3156 } 3157 cur = cur + iosize; 3158 pg_offset += iosize; 3159 nr++; 3160 } 3161 done: 3162 if (nr == 0) { 3163 /* make sure the mapping tag for page dirty gets cleared */ 3164 set_page_writeback(page); 3165 end_page_writeback(page); 3166 } 3167 unlock_page(page); 3168 3169 done_unlocked: 3170 3171 /* drop our reference on any cached states */ 3172 free_extent_state(cached_state); 3173 return 0; 3174 } 3175 3176 static int eb_wait(void *word) 3177 { 3178 io_schedule(); 3179 return 0; 3180 } 3181 3182 void wait_on_extent_buffer_writeback(struct extent_buffer *eb) 3183 { 3184 wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait, 3185 TASK_UNINTERRUPTIBLE); 3186 } 3187 3188 static int lock_extent_buffer_for_io(struct extent_buffer *eb, 3189 struct btrfs_fs_info *fs_info, 3190 struct extent_page_data *epd) 3191 { 3192 unsigned long i, num_pages; 3193 int flush = 0; 3194 int ret = 0; 3195 3196 if (!btrfs_try_tree_write_lock(eb)) { 3197 flush = 1; 3198 flush_write_bio(epd); 3199 btrfs_tree_lock(eb); 3200 } 3201 3202 if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { 3203 btrfs_tree_unlock(eb); 3204 if (!epd->sync_io) 3205 return 0; 3206 if (!flush) { 3207 flush_write_bio(epd); 3208 flush = 1; 3209 } 3210 while (1) { 3211 wait_on_extent_buffer_writeback(eb); 3212 btrfs_tree_lock(eb); 3213 if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) 3214 break; 3215 btrfs_tree_unlock(eb); 3216 } 3217 } 3218 3219 /* 3220 * We need to do this to prevent races in people who check if the eb is 3221 * under IO since we can end up having no IO bits set for a short period 3222 * of time. 
3223 */ 3224 spin_lock(&eb->refs_lock); 3225 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3226 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3227 spin_unlock(&eb->refs_lock); 3228 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 3229 __percpu_counter_add(&fs_info->dirty_metadata_bytes, 3230 -eb->len, 3231 fs_info->dirty_metadata_batch); 3232 ret = 1; 3233 } else { 3234 spin_unlock(&eb->refs_lock); 3235 } 3236 3237 btrfs_tree_unlock(eb); 3238 3239 if (!ret) 3240 return ret; 3241 3242 num_pages = num_extent_pages(eb->start, eb->len); 3243 for (i = 0; i < num_pages; i++) { 3244 struct page *p = extent_buffer_page(eb, i); 3245 3246 if (!trylock_page(p)) { 3247 if (!flush) { 3248 flush_write_bio(epd); 3249 flush = 1; 3250 } 3251 lock_page(p); 3252 } 3253 } 3254 3255 return ret; 3256 } 3257 3258 static void end_extent_buffer_writeback(struct extent_buffer *eb) 3259 { 3260 clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3261 smp_mb__after_clear_bit(); 3262 wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); 3263 } 3264 3265 static void end_bio_extent_buffer_writepage(struct bio *bio, int err) 3266 { 3267 int uptodate = err == 0; 3268 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 3269 struct extent_buffer *eb; 3270 int done; 3271 3272 do { 3273 struct page *page = bvec->bv_page; 3274 3275 bvec--; 3276 eb = (struct extent_buffer *)page->private; 3277 BUG_ON(!eb); 3278 done = atomic_dec_and_test(&eb->io_pages); 3279 3280 if (!uptodate || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { 3281 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3282 ClearPageUptodate(page); 3283 SetPageError(page); 3284 } 3285 3286 end_page_writeback(page); 3287 3288 if (!done) 3289 continue; 3290 3291 end_extent_buffer_writeback(eb); 3292 } while (bvec >= bio->bi_io_vec); 3293 3294 bio_put(bio); 3295 3296 } 3297 3298 static int write_one_eb(struct extent_buffer *eb, 3299 struct btrfs_fs_info *fs_info, 3300 struct writeback_control *wbc, 3301 struct extent_page_data *epd) 3302 { 3303 struct block_device *bdev = fs_info->fs_devices->latest_bdev; 3304 u64 offset = eb->start; 3305 unsigned long i, num_pages; 3306 unsigned long bio_flags = 0; 3307 int rw = (epd->sync_io ? 
WRITE_SYNC : WRITE) | REQ_META; 3308 int ret = 0; 3309 3310 clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3311 num_pages = num_extent_pages(eb->start, eb->len); 3312 atomic_set(&eb->io_pages, num_pages); 3313 if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID) 3314 bio_flags = EXTENT_BIO_TREE_LOG; 3315 3316 for (i = 0; i < num_pages; i++) { 3317 struct page *p = extent_buffer_page(eb, i); 3318 3319 clear_page_dirty_for_io(p); 3320 set_page_writeback(p); 3321 ret = submit_extent_page(rw, eb->tree, p, offset >> 9, 3322 PAGE_CACHE_SIZE, 0, bdev, &epd->bio, 3323 -1, end_bio_extent_buffer_writepage, 3324 0, epd->bio_flags, bio_flags); 3325 epd->bio_flags = bio_flags; 3326 if (ret) { 3327 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3328 SetPageError(p); 3329 if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) 3330 end_extent_buffer_writeback(eb); 3331 ret = -EIO; 3332 break; 3333 } 3334 offset += PAGE_CACHE_SIZE; 3335 update_nr_written(p, wbc, 1); 3336 unlock_page(p); 3337 } 3338 3339 if (unlikely(ret)) { 3340 for (; i < num_pages; i++) { 3341 struct page *p = extent_buffer_page(eb, i); 3342 unlock_page(p); 3343 } 3344 } 3345 3346 return ret; 3347 } 3348 3349 int btree_write_cache_pages(struct address_space *mapping, 3350 struct writeback_control *wbc) 3351 { 3352 struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; 3353 struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; 3354 struct extent_buffer *eb, *prev_eb = NULL; 3355 struct extent_page_data epd = { 3356 .bio = NULL, 3357 .tree = tree, 3358 .extent_locked = 0, 3359 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3360 .bio_flags = 0, 3361 }; 3362 int ret = 0; 3363 int done = 0; 3364 int nr_to_write_done = 0; 3365 struct pagevec pvec; 3366 int nr_pages; 3367 pgoff_t index; 3368 pgoff_t end; /* Inclusive */ 3369 int scanned = 0; 3370 int tag; 3371 3372 pagevec_init(&pvec, 0); 3373 if (wbc->range_cyclic) { 3374 index = mapping->writeback_index; /* Start from prev offset */ 3375 end = -1; 3376 } else { 3377 index = wbc->range_start >> PAGE_CACHE_SHIFT; 3378 end = wbc->range_end >> PAGE_CACHE_SHIFT; 3379 scanned = 1; 3380 } 3381 if (wbc->sync_mode == WB_SYNC_ALL) 3382 tag = PAGECACHE_TAG_TOWRITE; 3383 else 3384 tag = PAGECACHE_TAG_DIRTY; 3385 retry: 3386 if (wbc->sync_mode == WB_SYNC_ALL) 3387 tag_pages_for_writeback(mapping, index, end); 3388 while (!done && !nr_to_write_done && (index <= end) && 3389 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 3390 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 3391 unsigned i; 3392 3393 scanned = 1; 3394 for (i = 0; i < nr_pages; i++) { 3395 struct page *page = pvec.pages[i]; 3396 3397 if (!PagePrivate(page)) 3398 continue; 3399 3400 if (!wbc->range_cyclic && page->index > end) { 3401 done = 1; 3402 break; 3403 } 3404 3405 spin_lock(&mapping->private_lock); 3406 if (!PagePrivate(page)) { 3407 spin_unlock(&mapping->private_lock); 3408 continue; 3409 } 3410 3411 eb = (struct extent_buffer *)page->private; 3412 3413 /* 3414 * Shouldn't happen and normally this would be a BUG_ON 3415 * but no sense in crashing the users box for something 3416 * we can survive anyway. 
 */
			if (!eb) {
				spin_unlock(&mapping->private_lock);
				WARN_ON(1);
				continue;
			}

			if (eb == prev_eb) {
				spin_unlock(&mapping->private_lock);
				continue;
			}

			ret = atomic_inc_not_zero(&eb->refs);
			spin_unlock(&mapping->private_lock);
			if (!ret)
				continue;

			prev_eb = eb;
			ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
			if (!ret) {
				free_extent_buffer(eb);
				continue;
			}

			ret = write_one_eb(eb, fs_info, wbc, &epd);
			if (ret) {
				done = 1;
				free_extent_buffer(eb);
				break;
			}
			free_extent_buffer(eb);

			/*
			 * the filesystem may choose to bump up nr_to_write.
			 * We have to make sure to honor the new nr_to_write
			 * at any time
			 */
			nr_to_write_done = wbc->nr_to_write <= 0;
		}
		pagevec_release(&pvec);
		cond_resched();
	}
	if (!scanned && !done) {
		/*
		 * We hit the last page and there is more work to be done: wrap
		 * back to the start of the file
		 */
		scanned = 1;
		index = 0;
		goto retry;
	}
	flush_write_bio(&epd);
	return ret;
}

/**
 * extent_write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
 * @mapping: address space structure to write
 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
 * @writepage: function called for each page
 * @data: data passed to writepage function
 * @flush_fn: called to submit any bio built up so far before we block on a page
 *
 * If a page is already under I/O, extent_write_cache_pages() skips it, even
 * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
 * and msync() need to guarantee that all the data which was dirty at the time
 * the call was made get new I/O started against them. If wbc->sync_mode is
 * WB_SYNC_ALL then we were called for data integrity and we must wait for
 * existing IO to complete.
 */
static int extent_write_cache_pages(struct extent_io_tree *tree,
				    struct address_space *mapping,
				    struct writeback_control *wbc,
				    writepage_t writepage, void *data,
				    void (*flush_fn)(void *))
{
	struct inode *inode = mapping->host;
	int ret = 0;
	int done = 0;
	int nr_to_write_done = 0;
	struct pagevec pvec;
	int nr_pages;
	pgoff_t index;
	pgoff_t end;	/* Inclusive */
	int scanned = 0;
	int tag;

	/*
	 * We have to hold onto the inode so that ordered extents can do their
	 * work when the IO finishes. The alternative to this is failing to add
	 * an ordered extent if the igrab() fails there and that is a huge pain
	 * to deal with, so instead just hold onto the inode throughout the
	 * writepages operation. If it fails here we are freeing up the inode
	 * anyway and we'd rather not waste our time writing out stuff that is
	 * going to be truncated anyway.
3512 */ 3513 if (!igrab(inode)) 3514 return 0; 3515 3516 pagevec_init(&pvec, 0); 3517 if (wbc->range_cyclic) { 3518 index = mapping->writeback_index; /* Start from prev offset */ 3519 end = -1; 3520 } else { 3521 index = wbc->range_start >> PAGE_CACHE_SHIFT; 3522 end = wbc->range_end >> PAGE_CACHE_SHIFT; 3523 scanned = 1; 3524 } 3525 if (wbc->sync_mode == WB_SYNC_ALL) 3526 tag = PAGECACHE_TAG_TOWRITE; 3527 else 3528 tag = PAGECACHE_TAG_DIRTY; 3529 retry: 3530 if (wbc->sync_mode == WB_SYNC_ALL) 3531 tag_pages_for_writeback(mapping, index, end); 3532 while (!done && !nr_to_write_done && (index <= end) && 3533 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 3534 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 3535 unsigned i; 3536 3537 scanned = 1; 3538 for (i = 0; i < nr_pages; i++) { 3539 struct page *page = pvec.pages[i]; 3540 3541 /* 3542 * At this point we hold neither mapping->tree_lock nor 3543 * lock on the page itself: the page may be truncated or 3544 * invalidated (changing page->mapping to NULL), or even 3545 * swizzled back from swapper_space to tmpfs file 3546 * mapping 3547 */ 3548 if (!trylock_page(page)) { 3549 flush_fn(data); 3550 lock_page(page); 3551 } 3552 3553 if (unlikely(page->mapping != mapping)) { 3554 unlock_page(page); 3555 continue; 3556 } 3557 3558 if (!wbc->range_cyclic && page->index > end) { 3559 done = 1; 3560 unlock_page(page); 3561 continue; 3562 } 3563 3564 if (wbc->sync_mode != WB_SYNC_NONE) { 3565 if (PageWriteback(page)) 3566 flush_fn(data); 3567 wait_on_page_writeback(page); 3568 } 3569 3570 if (PageWriteback(page) || 3571 !clear_page_dirty_for_io(page)) { 3572 unlock_page(page); 3573 continue; 3574 } 3575 3576 ret = (*writepage)(page, wbc, data); 3577 3578 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { 3579 unlock_page(page); 3580 ret = 0; 3581 } 3582 if (ret) 3583 done = 1; 3584 3585 /* 3586 * the filesystem may choose to bump up nr_to_write. 
3587 * We have to make sure to honor the new nr_to_write 3588 * at any time 3589 */ 3590 nr_to_write_done = wbc->nr_to_write <= 0; 3591 } 3592 pagevec_release(&pvec); 3593 cond_resched(); 3594 } 3595 if (!scanned && !done) { 3596 /* 3597 * We hit the last page and there is more work to be done: wrap 3598 * back to the start of the file 3599 */ 3600 scanned = 1; 3601 index = 0; 3602 goto retry; 3603 } 3604 btrfs_add_delayed_iput(inode); 3605 return ret; 3606 } 3607 3608 static void flush_epd_write_bio(struct extent_page_data *epd) 3609 { 3610 if (epd->bio) { 3611 int rw = WRITE; 3612 int ret; 3613 3614 if (epd->sync_io) 3615 rw = WRITE_SYNC; 3616 3617 ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags); 3618 BUG_ON(ret < 0); /* -ENOMEM */ 3619 epd->bio = NULL; 3620 } 3621 } 3622 3623 static noinline void flush_write_bio(void *data) 3624 { 3625 struct extent_page_data *epd = data; 3626 flush_epd_write_bio(epd); 3627 } 3628 3629 int extent_write_full_page(struct extent_io_tree *tree, struct page *page, 3630 get_extent_t *get_extent, 3631 struct writeback_control *wbc) 3632 { 3633 int ret; 3634 struct extent_page_data epd = { 3635 .bio = NULL, 3636 .tree = tree, 3637 .get_extent = get_extent, 3638 .extent_locked = 0, 3639 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3640 .bio_flags = 0, 3641 }; 3642 3643 ret = __extent_writepage(page, wbc, &epd); 3644 3645 flush_epd_write_bio(&epd); 3646 return ret; 3647 } 3648 3649 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, 3650 u64 start, u64 end, get_extent_t *get_extent, 3651 int mode) 3652 { 3653 int ret = 0; 3654 struct address_space *mapping = inode->i_mapping; 3655 struct page *page; 3656 unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >> 3657 PAGE_CACHE_SHIFT; 3658 3659 struct extent_page_data epd = { 3660 .bio = NULL, 3661 .tree = tree, 3662 .get_extent = get_extent, 3663 .extent_locked = 1, 3664 .sync_io = mode == WB_SYNC_ALL, 3665 .bio_flags = 0, 3666 }; 3667 struct writeback_control wbc_writepages = { 3668 .sync_mode = mode, 3669 .nr_to_write = nr_pages * 2, 3670 .range_start = start, 3671 .range_end = end + 1, 3672 }; 3673 3674 while (start <= end) { 3675 page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); 3676 if (clear_page_dirty_for_io(page)) 3677 ret = __extent_writepage(page, &wbc_writepages, &epd); 3678 else { 3679 if (tree->ops && tree->ops->writepage_end_io_hook) 3680 tree->ops->writepage_end_io_hook(page, start, 3681 start + PAGE_CACHE_SIZE - 1, 3682 NULL, 1); 3683 unlock_page(page); 3684 } 3685 page_cache_release(page); 3686 start += PAGE_CACHE_SIZE; 3687 } 3688 3689 flush_epd_write_bio(&epd); 3690 return ret; 3691 } 3692 3693 int extent_writepages(struct extent_io_tree *tree, 3694 struct address_space *mapping, 3695 get_extent_t *get_extent, 3696 struct writeback_control *wbc) 3697 { 3698 int ret = 0; 3699 struct extent_page_data epd = { 3700 .bio = NULL, 3701 .tree = tree, 3702 .get_extent = get_extent, 3703 .extent_locked = 0, 3704 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3705 .bio_flags = 0, 3706 }; 3707 3708 ret = extent_write_cache_pages(tree, mapping, wbc, 3709 __extent_writepage, &epd, 3710 flush_write_bio); 3711 flush_epd_write_bio(&epd); 3712 return ret; 3713 } 3714 3715 int extent_readpages(struct extent_io_tree *tree, 3716 struct address_space *mapping, 3717 struct list_head *pages, unsigned nr_pages, 3718 get_extent_t get_extent) 3719 { 3720 struct bio *bio = NULL; 3721 unsigned page_idx; 3722 unsigned long bio_flags = 0; 3723 struct page *pagepool[16]; 3724 struct page 
*page; 3725 int i = 0; 3726 int nr = 0; 3727 3728 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 3729 page = list_entry(pages->prev, struct page, lru); 3730 3731 prefetchw(&page->flags); 3732 list_del(&page->lru); 3733 if (add_to_page_cache_lru(page, mapping, 3734 page->index, GFP_NOFS)) { 3735 page_cache_release(page); 3736 continue; 3737 } 3738 3739 pagepool[nr++] = page; 3740 if (nr < ARRAY_SIZE(pagepool)) 3741 continue; 3742 for (i = 0; i < nr; i++) { 3743 __extent_read_full_page(tree, pagepool[i], get_extent, 3744 &bio, 0, &bio_flags, READ); 3745 page_cache_release(pagepool[i]); 3746 } 3747 nr = 0; 3748 } 3749 for (i = 0; i < nr; i++) { 3750 __extent_read_full_page(tree, pagepool[i], get_extent, 3751 &bio, 0, &bio_flags, READ); 3752 page_cache_release(pagepool[i]); 3753 } 3754 3755 BUG_ON(!list_empty(pages)); 3756 if (bio) 3757 return submit_one_bio(READ, bio, 0, bio_flags); 3758 return 0; 3759 } 3760 3761 /* 3762 * basic invalidatepage code, this waits on any locked or writeback 3763 * ranges corresponding to the page, and then deletes any extent state 3764 * records from the tree 3765 */ 3766 int extent_invalidatepage(struct extent_io_tree *tree, 3767 struct page *page, unsigned long offset) 3768 { 3769 struct extent_state *cached_state = NULL; 3770 u64 start = page_offset(page); 3771 u64 end = start + PAGE_CACHE_SIZE - 1; 3772 size_t blocksize = page->mapping->host->i_sb->s_blocksize; 3773 3774 start += ALIGN(offset, blocksize); 3775 if (start > end) 3776 return 0; 3777 3778 lock_extent_bits(tree, start, end, 0, &cached_state); 3779 wait_on_page_writeback(page); 3780 clear_extent_bit(tree, start, end, 3781 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 3782 EXTENT_DO_ACCOUNTING, 3783 1, 1, &cached_state, GFP_NOFS); 3784 return 0; 3785 } 3786 3787 /* 3788 * a helper for releasepage, this tests for areas of the page that 3789 * are locked or under IO and drops the related state bits if it is safe 3790 * to drop the page. 3791 */ 3792 static int try_release_extent_state(struct extent_map_tree *map, 3793 struct extent_io_tree *tree, 3794 struct page *page, gfp_t mask) 3795 { 3796 u64 start = page_offset(page); 3797 u64 end = start + PAGE_CACHE_SIZE - 1; 3798 int ret = 1; 3799 3800 if (test_range_bit(tree, start, end, 3801 EXTENT_IOBITS, 0, NULL)) 3802 ret = 0; 3803 else { 3804 if ((mask & GFP_NOFS) == GFP_NOFS) 3805 mask = GFP_NOFS; 3806 /* 3807 * at this point we can safely clear everything except the 3808 * locked bit and the nodatasum bit 3809 */ 3810 ret = clear_extent_bit(tree, start, end, 3811 ~(EXTENT_LOCKED | EXTENT_NODATASUM), 3812 0, 0, NULL, mask); 3813 3814 /* if clear_extent_bit failed for enomem reasons, 3815 * we can't allow the release to continue. 3816 */ 3817 if (ret < 0) 3818 ret = 0; 3819 else 3820 ret = 1; 3821 } 3822 return ret; 3823 } 3824 3825 /* 3826 * a helper for releasepage. 
As long as there are no locked extents 3827 * in the range corresponding to the page, both state records and extent 3828 * map records are removed 3829 */ 3830 int try_release_extent_mapping(struct extent_map_tree *map, 3831 struct extent_io_tree *tree, struct page *page, 3832 gfp_t mask) 3833 { 3834 struct extent_map *em; 3835 u64 start = page_offset(page); 3836 u64 end = start + PAGE_CACHE_SIZE - 1; 3837 3838 if ((mask & __GFP_WAIT) && 3839 page->mapping->host->i_size > 16 * 1024 * 1024) { 3840 u64 len; 3841 while (start <= end) { 3842 len = end - start + 1; 3843 write_lock(&map->lock); 3844 em = lookup_extent_mapping(map, start, len); 3845 if (!em) { 3846 write_unlock(&map->lock); 3847 break; 3848 } 3849 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || 3850 em->start != start) { 3851 write_unlock(&map->lock); 3852 free_extent_map(em); 3853 break; 3854 } 3855 if (!test_range_bit(tree, em->start, 3856 extent_map_end(em) - 1, 3857 EXTENT_LOCKED | EXTENT_WRITEBACK, 3858 0, NULL)) { 3859 remove_extent_mapping(map, em); 3860 /* once for the rb tree */ 3861 free_extent_map(em); 3862 } 3863 start = extent_map_end(em); 3864 write_unlock(&map->lock); 3865 3866 /* once for us */ 3867 free_extent_map(em); 3868 } 3869 } 3870 return try_release_extent_state(map, tree, page, mask); 3871 } 3872 3873 /* 3874 * helper function for fiemap, which doesn't want to see any holes. 3875 * This maps until we find something past 'last' 3876 */ 3877 static struct extent_map *get_extent_skip_holes(struct inode *inode, 3878 u64 offset, 3879 u64 last, 3880 get_extent_t *get_extent) 3881 { 3882 u64 sectorsize = BTRFS_I(inode)->root->sectorsize; 3883 struct extent_map *em; 3884 u64 len; 3885 3886 if (offset >= last) 3887 return NULL; 3888 3889 while(1) { 3890 len = last - offset; 3891 if (len == 0) 3892 break; 3893 len = ALIGN(len, sectorsize); 3894 em = get_extent(inode, NULL, 0, offset, len, 0); 3895 if (IS_ERR_OR_NULL(em)) 3896 return em; 3897 3898 /* if this isn't a hole return it */ 3899 if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) && 3900 em->block_start != EXTENT_MAP_HOLE) { 3901 return em; 3902 } 3903 3904 /* this is a hole, advance to the next extent */ 3905 offset = extent_map_end(em); 3906 free_extent_map(em); 3907 if (offset >= last) 3908 break; 3909 } 3910 return NULL; 3911 } 3912 3913 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 3914 __u64 start, __u64 len, get_extent_t *get_extent) 3915 { 3916 int ret = 0; 3917 u64 off = start; 3918 u64 max = start + len; 3919 u32 flags = 0; 3920 u32 found_type; 3921 u64 last; 3922 u64 last_for_get_extent = 0; 3923 u64 disko = 0; 3924 u64 isize = i_size_read(inode); 3925 struct btrfs_key found_key; 3926 struct extent_map *em = NULL; 3927 struct extent_state *cached_state = NULL; 3928 struct btrfs_path *path; 3929 struct btrfs_file_extent_item *item; 3930 int end = 0; 3931 u64 em_start = 0; 3932 u64 em_len = 0; 3933 u64 em_end = 0; 3934 unsigned long emflags; 3935 3936 if (len == 0) 3937 return -EINVAL; 3938 3939 path = btrfs_alloc_path(); 3940 if (!path) 3941 return -ENOMEM; 3942 path->leave_spinning = 1; 3943 3944 start = ALIGN(start, BTRFS_I(inode)->root->sectorsize); 3945 len = ALIGN(len, BTRFS_I(inode)->root->sectorsize); 3946 3947 /* 3948 * lookup the last file extent. 
We're not using i_size here 3949 * because there might be preallocation past i_size 3950 */ 3951 ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, 3952 path, btrfs_ino(inode), -1, 0); 3953 if (ret < 0) { 3954 btrfs_free_path(path); 3955 return ret; 3956 } 3957 WARN_ON(!ret); 3958 path->slots[0]--; 3959 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3960 struct btrfs_file_extent_item); 3961 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 3962 found_type = btrfs_key_type(&found_key); 3963 3964 /* No extents, but there might be delalloc bits */ 3965 if (found_key.objectid != btrfs_ino(inode) || 3966 found_type != BTRFS_EXTENT_DATA_KEY) { 3967 /* have to trust i_size as the end */ 3968 last = (u64)-1; 3969 last_for_get_extent = isize; 3970 } else { 3971 /* 3972 * remember the start of the last extent. There are a 3973 * bunch of different factors that go into the length of the 3974 * extent, so its much less complex to remember where it started 3975 */ 3976 last = found_key.offset; 3977 last_for_get_extent = last + 1; 3978 } 3979 btrfs_free_path(path); 3980 3981 /* 3982 * we might have some extents allocated but more delalloc past those 3983 * extents. so, we trust isize unless the start of the last extent is 3984 * beyond isize 3985 */ 3986 if (last < isize) { 3987 last = (u64)-1; 3988 last_for_get_extent = isize; 3989 } 3990 3991 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, 3992 &cached_state); 3993 3994 em = get_extent_skip_holes(inode, start, last_for_get_extent, 3995 get_extent); 3996 if (!em) 3997 goto out; 3998 if (IS_ERR(em)) { 3999 ret = PTR_ERR(em); 4000 goto out; 4001 } 4002 4003 while (!end) { 4004 u64 offset_in_extent; 4005 4006 /* break if the extent we found is outside the range */ 4007 if (em->start >= max || extent_map_end(em) < off) 4008 break; 4009 4010 /* 4011 * get_extent may return an extent that starts before our 4012 * requested range. We have to make sure the ranges 4013 * we return to fiemap always move forward and don't 4014 * overlap, so adjust the offsets here 4015 */ 4016 em_start = max(em->start, off); 4017 4018 /* 4019 * record the offset from the start of the extent 4020 * for adjusting the disk offset below 4021 */ 4022 offset_in_extent = em_start - em->start; 4023 em_end = extent_map_end(em); 4024 em_len = em_end - em_start; 4025 emflags = em->flags; 4026 disko = 0; 4027 flags = 0; 4028 4029 /* 4030 * bump off for our next call to get_extent 4031 */ 4032 off = extent_map_end(em); 4033 if (off >= max) 4034 end = 1; 4035 4036 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 4037 end = 1; 4038 flags |= FIEMAP_EXTENT_LAST; 4039 } else if (em->block_start == EXTENT_MAP_INLINE) { 4040 flags |= (FIEMAP_EXTENT_DATA_INLINE | 4041 FIEMAP_EXTENT_NOT_ALIGNED); 4042 } else if (em->block_start == EXTENT_MAP_DELALLOC) { 4043 flags |= (FIEMAP_EXTENT_DELALLOC | 4044 FIEMAP_EXTENT_UNKNOWN); 4045 } else { 4046 disko = em->block_start + offset_in_extent; 4047 } 4048 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 4049 flags |= FIEMAP_EXTENT_ENCODED; 4050 4051 free_extent_map(em); 4052 em = NULL; 4053 if ((em_start >= last) || em_len == (u64)-1 || 4054 (last == (u64)-1 && isize <= em_end)) { 4055 flags |= FIEMAP_EXTENT_LAST; 4056 end = 1; 4057 } 4058 4059 /* now scan forward to see if this is really the last extent. 
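		 * (we look one extent ahead here so FIEMAP_EXTENT_LAST can be
		 * set on the entry we are about to report via
		 * fiemap_fill_next_extent)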
*/ 4060 em = get_extent_skip_holes(inode, off, last_for_get_extent, 4061 get_extent); 4062 if (IS_ERR(em)) { 4063 ret = PTR_ERR(em); 4064 goto out; 4065 } 4066 if (!em) { 4067 flags |= FIEMAP_EXTENT_LAST; 4068 end = 1; 4069 } 4070 ret = fiemap_fill_next_extent(fieinfo, em_start, disko, 4071 em_len, flags); 4072 if (ret) 4073 goto out_free; 4074 } 4075 out_free: 4076 free_extent_map(em); 4077 out: 4078 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len, 4079 &cached_state, GFP_NOFS); 4080 return ret; 4081 } 4082 4083 static void __free_extent_buffer(struct extent_buffer *eb) 4084 { 4085 btrfs_leak_debug_del(&eb->leak_list); 4086 kmem_cache_free(extent_buffer_cache, eb); 4087 } 4088 4089 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, 4090 u64 start, 4091 unsigned long len, 4092 gfp_t mask) 4093 { 4094 struct extent_buffer *eb = NULL; 4095 4096 eb = kmem_cache_zalloc(extent_buffer_cache, mask); 4097 if (eb == NULL) 4098 return NULL; 4099 eb->start = start; 4100 eb->len = len; 4101 eb->tree = tree; 4102 eb->bflags = 0; 4103 rwlock_init(&eb->lock); 4104 atomic_set(&eb->write_locks, 0); 4105 atomic_set(&eb->read_locks, 0); 4106 atomic_set(&eb->blocking_readers, 0); 4107 atomic_set(&eb->blocking_writers, 0); 4108 atomic_set(&eb->spinning_readers, 0); 4109 atomic_set(&eb->spinning_writers, 0); 4110 eb->lock_nested = 0; 4111 init_waitqueue_head(&eb->write_lock_wq); 4112 init_waitqueue_head(&eb->read_lock_wq); 4113 4114 btrfs_leak_debug_add(&eb->leak_list, &buffers); 4115 4116 spin_lock_init(&eb->refs_lock); 4117 atomic_set(&eb->refs, 1); 4118 atomic_set(&eb->io_pages, 0); 4119 4120 /* 4121 * Sanity checks, currently the maximum is 64k covered by 16x 4k pages 4122 */ 4123 BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE 4124 > MAX_INLINE_EXTENT_BUFFER_SIZE); 4125 BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE); 4126 4127 return eb; 4128 } 4129 4130 struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) 4131 { 4132 unsigned long i; 4133 struct page *p; 4134 struct extent_buffer *new; 4135 unsigned long num_pages = num_extent_pages(src->start, src->len); 4136 4137 new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_ATOMIC); 4138 if (new == NULL) 4139 return NULL; 4140 4141 for (i = 0; i < num_pages; i++) { 4142 p = alloc_page(GFP_ATOMIC); 4143 BUG_ON(!p); 4144 attach_extent_buffer_page(new, p); 4145 WARN_ON(PageDirty(p)); 4146 SetPageUptodate(p); 4147 new->pages[i] = p; 4148 } 4149 4150 copy_extent_buffer(new, src, 0, 0, src->len); 4151 set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags); 4152 set_bit(EXTENT_BUFFER_DUMMY, &new->bflags); 4153 4154 return new; 4155 } 4156 4157 struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len) 4158 { 4159 struct extent_buffer *eb; 4160 unsigned long num_pages = num_extent_pages(0, len); 4161 unsigned long i; 4162 4163 eb = __alloc_extent_buffer(NULL, start, len, GFP_ATOMIC); 4164 if (!eb) 4165 return NULL; 4166 4167 for (i = 0; i < num_pages; i++) { 4168 eb->pages[i] = alloc_page(GFP_ATOMIC); 4169 if (!eb->pages[i]) 4170 goto err; 4171 } 4172 set_extent_buffer_uptodate(eb); 4173 btrfs_set_header_nritems(eb, 0); 4174 set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); 4175 4176 return eb; 4177 err: 4178 for (; i > 0; i--) 4179 __free_page(eb->pages[i - 1]); 4180 __free_extent_buffer(eb); 4181 return NULL; 4182 } 4183 4184 static int extent_buffer_under_io(struct extent_buffer *eb) 4185 { 4186 return (atomic_read(&eb->io_pages) || 4187 test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || 4188 
		test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
}

/*
 * Helper for releasing extent buffer page.
 */
static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
					     unsigned long start_idx)
{
	unsigned long index;
	unsigned long num_pages;
	struct page *page;
	int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);

	BUG_ON(extent_buffer_under_io(eb));

	num_pages = num_extent_pages(eb->start, eb->len);
	index = start_idx + num_pages;
	if (start_idx >= index)
		return;

	do {
		index--;
		page = extent_buffer_page(eb, index);
		if (page && mapped) {
			spin_lock(&page->mapping->private_lock);
			/*
			 * We do this since we'll remove the pages after we've
			 * removed the eb from the radix tree, so we could race
			 * and have this page now attached to the new eb.  So
			 * only clear page_private if it's still connected to
			 * this eb.
			 */
			if (PagePrivate(page) &&
			    page->private == (unsigned long)eb) {
				BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
				BUG_ON(PageDirty(page));
				BUG_ON(PageWriteback(page));
				/*
				 * We need to make sure we haven't been attached
				 * to a new eb.
				 */
				ClearPagePrivate(page);
				set_page_private(page, 0);
				/* One for the page private */
				page_cache_release(page);
			}
			spin_unlock(&page->mapping->private_lock);

		}
		if (page) {
			/* One for when we allocated the page */
			page_cache_release(page);
		}
	} while (index != start_idx);
}

/*
 * Helper for releasing the extent buffer.
 */
static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
{
	btrfs_release_extent_buffer_page(eb, 0);
	__free_extent_buffer(eb);
}

static void check_buffer_tree_ref(struct extent_buffer *eb)
{
	int refs;
	/* the ref bit is tricky.  We have to make sure it is set
	 * if we have the buffer dirty.  Otherwise the
	 * code to free a buffer can end up dropping a dirty
	 * page
	 *
	 * Once the ref bit is set, it won't go away while the
	 * buffer is dirty or in writeback, and it also won't
	 * go away while we have the reference count on the
	 * eb bumped.
	 *
	 * We can't just set the ref bit without bumping the
	 * ref on the eb because free_extent_buffer might
	 * see the ref bit and try to clear it.  If this happens
	 * free_extent_buffer might end up dropping our original
	 * ref by mistake and freeing the page before we are able
	 * to add one more ref.
	 *
	 * So bump the ref count first, then set the bit.  If someone
	 * beat us to it, drop the ref we added.
	 */
	refs = atomic_read(&eb->refs);
	if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
		return;

	spin_lock(&eb->refs_lock);
	if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
		atomic_inc(&eb->refs);
	spin_unlock(&eb->refs_lock);
}

static void mark_extent_buffer_accessed(struct extent_buffer *eb)
{
	unsigned long num_pages, i;

	check_buffer_tree_ref(eb);

	num_pages = num_extent_pages(eb->start, eb->len);
	for (i = 0; i < num_pages; i++) {
		struct page *p = extent_buffer_page(eb, i);
		mark_page_accessed(p);
	}
}

struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
					  u64 start, unsigned long len)
{
	unsigned long num_pages = num_extent_pages(start, len);
	unsigned long i;
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	struct extent_buffer *eb;
	struct extent_buffer *exists = NULL;
	struct page *p;
	struct address_space *mapping = tree->mapping;
	int uptodate = 1;
	int ret;

	rcu_read_lock();
	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
	if (eb && atomic_inc_not_zero(&eb->refs)) {
		rcu_read_unlock();
		mark_extent_buffer_accessed(eb);
		return eb;
	}
	rcu_read_unlock();

	eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS);
	if (!eb)
		return NULL;

	for (i = 0; i < num_pages; i++, index++) {
		p = find_or_create_page(mapping, index, GFP_NOFS);
		if (!p)
			goto free_eb;

		spin_lock(&mapping->private_lock);
		if (PagePrivate(p)) {
			/*
			 * We could have already allocated an eb for this page
			 * and attached one so let's see if we can get a ref on
			 * the existing eb, and if we can we know it's good and
			 * we can just return that one, else we know we can just
			 * overwrite page->private.
			 */
			exists = (struct extent_buffer *)p->private;
			if (atomic_inc_not_zero(&exists->refs)) {
				spin_unlock(&mapping->private_lock);
				unlock_page(p);
				page_cache_release(p);
				mark_extent_buffer_accessed(exists);
				goto free_eb;
			}

			/*
			 * Do this so attach doesn't complain and we need to
			 * drop the ref the old guy had.
			 */
			ClearPagePrivate(p);
			WARN_ON(PageDirty(p));
			page_cache_release(p);
		}
		attach_extent_buffer_page(eb, p);
		spin_unlock(&mapping->private_lock);
		WARN_ON(PageDirty(p));
		mark_page_accessed(p);
		eb->pages[i] = p;
		if (!PageUptodate(p))
			uptodate = 0;

		/*
		 * see below about how we avoid a nasty race with release page
		 * and why we unlock later
		 */
	}
	if (uptodate)
		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
again:
	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
	if (ret)
		goto free_eb;

	spin_lock(&tree->buffer_lock);
	ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
	if (ret == -EEXIST) {
		exists = radix_tree_lookup(&tree->buffer,
					   start >> PAGE_CACHE_SHIFT);
		if (!atomic_inc_not_zero(&exists->refs)) {
			spin_unlock(&tree->buffer_lock);
			radix_tree_preload_end();
			exists = NULL;
			goto again;
		}
		spin_unlock(&tree->buffer_lock);
		radix_tree_preload_end();
		mark_extent_buffer_accessed(exists);
		goto free_eb;
	}
	/* add one reference for the tree */
	check_buffer_tree_ref(eb);
	spin_unlock(&tree->buffer_lock);
	radix_tree_preload_end();

	/*
	 * there is a race where release page may have
	 * tried to find this extent buffer in the radix
	 * but failed.  It will tell the VM it is safe to
	 * reclaim the page, and it will clear the page private bit.
	 * We must make sure to set the page private bit properly
	 * after the extent buffer is in the radix tree so
	 * it doesn't get lost
	 */
	SetPageChecked(eb->pages[0]);
	for (i = 1; i < num_pages; i++) {
		p = extent_buffer_page(eb, i);
		ClearPageChecked(p);
		unlock_page(p);
	}
	unlock_page(eb->pages[0]);
	return eb;

free_eb:
	for (i = 0; i < num_pages; i++) {
		if (eb->pages[i])
			unlock_page(eb->pages[i]);
	}

	WARN_ON(!atomic_dec_and_test(&eb->refs));
	btrfs_release_extent_buffer(eb);
	return exists;
}

struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
					 u64 start, unsigned long len)
{
	struct extent_buffer *eb;

	rcu_read_lock();
	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
	if (eb && atomic_inc_not_zero(&eb->refs)) {
		rcu_read_unlock();
		mark_extent_buffer_accessed(eb);
		return eb;
	}
	rcu_read_unlock();

	return NULL;
}

static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
{
	struct extent_buffer *eb =
			container_of(head, struct extent_buffer, rcu_head);

	__free_extent_buffer(eb);
}

/* Expects to have eb->eb_lock already held */
static int release_extent_buffer(struct extent_buffer *eb)
{
	WARN_ON(atomic_read(&eb->refs) == 0);
	if (atomic_dec_and_test(&eb->refs)) {
		if (test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) {
			spin_unlock(&eb->refs_lock);
		} else {
			struct extent_io_tree *tree = eb->tree;

			spin_unlock(&eb->refs_lock);

			spin_lock(&tree->buffer_lock);
			radix_tree_delete(&tree->buffer,
					  eb->start >> PAGE_CACHE_SHIFT);
			spin_unlock(&tree->buffer_lock);
		}

		/* Should be safe to release our pages at this point */
		btrfs_release_extent_buffer_page(eb, 0);
		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
		return 1;
	}
	spin_unlock(&eb->refs_lock);
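	/* the refs did not hit zero, the buffer stays around */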

	return 0;
}

void free_extent_buffer(struct extent_buffer *eb)
{
	int refs;
	int old;
	if (!eb)
		return;

	while (1) {
		refs = atomic_read(&eb->refs);
		if (refs <= 3)
			break;
		old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
		if (old == refs)
			return;
	}

	spin_lock(&eb->refs_lock);
	if (atomic_read(&eb->refs) == 2 &&
	    test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
		atomic_dec(&eb->refs);

	if (atomic_read(&eb->refs) == 2 &&
	    test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
	    !extent_buffer_under_io(eb) &&
	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
		atomic_dec(&eb->refs);

	/*
	 * I know this is terrible, but it's temporary until we stop tracking
	 * the uptodate bits and such for the extent buffers.
	 */
	release_extent_buffer(eb);
}

void free_extent_buffer_stale(struct extent_buffer *eb)
{
	if (!eb)
		return;

	spin_lock(&eb->refs_lock);
	set_bit(EXTENT_BUFFER_STALE, &eb->bflags);

	if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
		atomic_dec(&eb->refs);
	release_extent_buffer(eb);
}

void clear_extent_buffer_dirty(struct extent_buffer *eb)
{
	unsigned long i;
	unsigned long num_pages;
	struct page *page;

	num_pages = num_extent_pages(eb->start, eb->len);

	for (i = 0; i < num_pages; i++) {
		page = extent_buffer_page(eb, i);
		if (!PageDirty(page))
			continue;

		lock_page(page);
		WARN_ON(!PagePrivate(page));

		clear_page_dirty_for_io(page);
		spin_lock_irq(&page->mapping->tree_lock);
		if (!PageDirty(page)) {
			radix_tree_tag_clear(&page->mapping->page_tree,
					     page_index(page),
					     PAGECACHE_TAG_DIRTY);
		}
		spin_unlock_irq(&page->mapping->tree_lock);
		ClearPageError(page);
		unlock_page(page);
	}
	WARN_ON(atomic_read(&eb->refs) == 0);
}

int set_extent_buffer_dirty(struct extent_buffer *eb)
{
	unsigned long i;
	unsigned long num_pages;
	int was_dirty = 0;

	check_buffer_tree_ref(eb);

	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);

	num_pages = num_extent_pages(eb->start, eb->len);
	WARN_ON(atomic_read(&eb->refs) == 0);
	WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));

	for (i = 0; i < num_pages; i++)
		set_page_dirty(extent_buffer_page(eb, i));
	return was_dirty;
}

int clear_extent_buffer_uptodate(struct extent_buffer *eb)
{
	unsigned long i;
	struct page *page;
	unsigned long num_pages;

	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
	num_pages = num_extent_pages(eb->start, eb->len);
	for (i = 0; i < num_pages; i++) {
		page = extent_buffer_page(eb, i);
		if (page)
			ClearPageUptodate(page);
	}
	return 0;
}

int set_extent_buffer_uptodate(struct extent_buffer *eb)
{
	unsigned long i;
	struct page *page;
	unsigned long num_pages;

	set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
	num_pages = num_extent_pages(eb->start, eb->len);
	for (i = 0; i < num_pages; i++) {
		page = extent_buffer_page(eb, i);
		SetPageUptodate(page);
	}
	return 0;
}

int extent_buffer_uptodate(struct extent_buffer *eb)
{
	return test_bit(EXTENT_BUFFER_UPTODATE,
			&eb->bflags);
}

int read_extent_buffer_pages(struct extent_io_tree *tree,
			     struct extent_buffer *eb, u64 start, int wait,
			     get_extent_t *get_extent, int mirror_num)
{
	unsigned long i;
	unsigned long start_i;
	struct page *page;
	int err;
	int ret = 0;
	int locked_pages = 0;
	int all_uptodate = 1;
	unsigned long num_pages;
	unsigned long num_reads = 0;
	struct bio *bio = NULL;
	unsigned long bio_flags = 0;

	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
		return 0;

	if (start) {
		WARN_ON(start < eb->start);
		start_i = (start >> PAGE_CACHE_SHIFT) -
			(eb->start >> PAGE_CACHE_SHIFT);
	} else {
		start_i = 0;
	}

	num_pages = num_extent_pages(eb->start, eb->len);
	for (i = start_i; i < num_pages; i++) {
		page = extent_buffer_page(eb, i);
		if (wait == WAIT_NONE) {
			if (!trylock_page(page))
				goto unlock_exit;
		} else {
			lock_page(page);
		}
		locked_pages++;
		if (!PageUptodate(page)) {
			num_reads++;
			all_uptodate = 0;
		}
	}
	if (all_uptodate) {
		if (start_i == 0)
			set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
		goto unlock_exit;
	}

	clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
	eb->read_mirror = 0;
	atomic_set(&eb->io_pages, num_reads);
	for (i = start_i; i < num_pages; i++) {
		page = extent_buffer_page(eb, i);
		if (!PageUptodate(page)) {
			ClearPageError(page);
			err = __extent_read_full_page(tree, page,
						      get_extent, &bio,
						      mirror_num, &bio_flags,
						      READ | REQ_META);
			if (err)
				ret = err;
		} else {
			unlock_page(page);
		}
	}

	if (bio) {
		err = submit_one_bio(READ | REQ_META, bio, mirror_num,
				     bio_flags);
		if (err)
			return err;
	}

	if (ret || wait != WAIT_COMPLETE)
		return ret;

	for (i = start_i; i < num_pages; i++) {
		page = extent_buffer_page(eb, i);
		wait_on_page_locked(page);
		if (!PageUptodate(page))
			ret = -EIO;
	}

	return ret;

unlock_exit:
	i = start_i;
	while (locked_pages > 0) {
		page = extent_buffer_page(eb, i);
		i++;
		unlock_page(page);
		locked_pages--;
	}
	return ret;
}

void read_extent_buffer(struct extent_buffer *eb, void *dstv,
			unsigned long start,
			unsigned long len)
{
	size_t cur;
	size_t offset;
	struct page *page;
	char *kaddr;
	char *dst = (char *)dstv;
	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;

	WARN_ON(start > eb->len);
	WARN_ON(start + len > eb->start + eb->len);

	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);

	while (len > 0) {
		page = extent_buffer_page(eb, i);

		cur = min(len, (PAGE_CACHE_SIZE - offset));
		kaddr = page_address(page);
		memcpy(dst, kaddr + offset, cur);

		dst += cur;
		len -= cur;
		offset = 0;
		i++;
	}
}

int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
			      unsigned long min_len, char **map,
			      unsigned long *map_start,
			      unsigned long *map_len)
{
	size_t offset = start & (PAGE_CACHE_SIZE - 1);
	char *kaddr;
	struct page *p;
	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
	unsigned long end_i = (start_offset + start + min_len - 1) >>
		PAGE_CACHE_SHIFT;

	if (i != end_i)
		return -EINVAL;

	if (i == 0) {
		offset = start_offset;
		*map_start = 0;
	} else {
		offset = 0;
		*map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
	}

	if (start + min_len > eb->len) {
		WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
		     "wanted %lu %lu\n", (unsigned long long)eb->start,
		     eb->len, start, min_len);
		return -EINVAL;
	}

	p = extent_buffer_page(eb, i);
	kaddr = page_address(p);
	*map = kaddr + offset;
	*map_len = PAGE_CACHE_SIZE - offset;
	return 0;
}

int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
			 unsigned long start,
			 unsigned long len)
{
	size_t cur;
	size_t offset;
	struct page *page;
	char *kaddr;
	char *ptr = (char *)ptrv;
	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
	int ret = 0;

	WARN_ON(start > eb->len);
	WARN_ON(start + len > eb->start + eb->len);

	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);

	while (len > 0) {
		page = extent_buffer_page(eb, i);

		cur = min(len, (PAGE_CACHE_SIZE - offset));

		kaddr = page_address(page);
		ret = memcmp(ptr, kaddr + offset, cur);
		if (ret)
			break;

		ptr += cur;
		len -= cur;
		offset = 0;
		i++;
	}
	return ret;
}

void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
			 unsigned long start, unsigned long len)
{
	size_t cur;
	size_t offset;
	struct page *page;
	char *kaddr;
	char *src = (char *)srcv;
	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;

	WARN_ON(start > eb->len);
	WARN_ON(start + len > eb->start + eb->len);

	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);

	while (len > 0) {
		page = extent_buffer_page(eb, i);
		WARN_ON(!PageUptodate(page));

		cur = min(len, PAGE_CACHE_SIZE - offset);
		kaddr = page_address(page);
		memcpy(kaddr + offset, src, cur);

		src += cur;
		len -= cur;
		offset = 0;
		i++;
	}
}

void memset_extent_buffer(struct extent_buffer *eb, char c,
			  unsigned long start, unsigned long len)
{
	size_t cur;
	size_t offset;
	struct page *page;
	char *kaddr;
	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;

	WARN_ON(start > eb->len);
	WARN_ON(start + len > eb->start + eb->len);

	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);

	while (len > 0) {
		page = extent_buffer_page(eb, i);
		WARN_ON(!PageUptodate(page));

		cur = min(len, PAGE_CACHE_SIZE - offset);
		kaddr = page_address(page);
		memset(kaddr + offset, c, cur);

		len -= cur;
		offset = 0;
		i++;
	}
}

void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
			unsigned long dst_offset, unsigned long src_offset,
			unsigned long len)
{
	u64 dst_len = dst->len;
	size_t cur;
	size_t offset;
	struct page *page;
	char *kaddr;
	size_t start_offset = dst->start &
		((u64)PAGE_CACHE_SIZE - 1);
	unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;

	WARN_ON(src->len != dst_len);

	offset = (start_offset + dst_offset) &
		((unsigned long)PAGE_CACHE_SIZE - 1);

	while (len > 0) {
		page = extent_buffer_page(dst, i);
		WARN_ON(!PageUptodate(page));

		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));

		kaddr = page_address(page);
		read_extent_buffer(src, kaddr + offset, src_offset, cur);

		src_offset += cur;
		len -= cur;
		offset = 0;
		i++;
	}
}

static void move_pages(struct page *dst_page, struct page *src_page,
		       unsigned long dst_off, unsigned long src_off,
		       unsigned long len)
{
	char *dst_kaddr = page_address(dst_page);
	if (dst_page == src_page) {
		memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
	} else {
		char *src_kaddr = page_address(src_page);
		char *p = dst_kaddr + dst_off + len;
		char *s = src_kaddr + src_off + len;

		while (len--)
			*--p = *--s;
	}
}

static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
{
	unsigned long distance = (src > dst) ? src - dst : dst - src;
	return distance < len;
}

static void copy_pages(struct page *dst_page, struct page *src_page,
		       unsigned long dst_off, unsigned long src_off,
		       unsigned long len)
{
	char *dst_kaddr = page_address(dst_page);
	char *src_kaddr;
	int must_memmove = 0;

	if (dst_page != src_page) {
		src_kaddr = page_address(src_page);
	} else {
		src_kaddr = dst_kaddr;
		if (areas_overlap(src_off, dst_off, len))
			must_memmove = 1;
	}

	if (must_memmove)
		memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
	else
		memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
}

void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
			  unsigned long src_offset, unsigned long len)
{
	size_t cur;
	size_t dst_off_in_page;
	size_t src_off_in_page;
	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
	unsigned long dst_i;
	unsigned long src_i;

	if (src_offset + len > dst->len) {
		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
		       "len %lu dst len %lu\n", src_offset, len, dst->len);
		BUG_ON(1);
	}
	if (dst_offset + len > dst->len) {
		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
		       "len %lu dst len %lu\n", dst_offset, len, dst->len);
		BUG_ON(1);
	}

	while (len > 0) {
		dst_off_in_page = (start_offset + dst_offset) &
			((unsigned long)PAGE_CACHE_SIZE - 1);
		src_off_in_page = (start_offset + src_offset) &
			((unsigned long)PAGE_CACHE_SIZE - 1);

		dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
		src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;

		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
					       src_off_in_page));
		cur = min_t(unsigned long, cur,
			    (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));

		copy_pages(extent_buffer_page(dst, dst_i),
			   extent_buffer_page(dst, src_i),
			   dst_off_in_page, src_off_in_page, cur);

		src_offset += cur;
		dst_offset += cur;
		len -= cur;
	}
}

void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
			   unsigned long src_offset, unsigned long len)
{
	size_t cur;
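	/* the overlapping case below walks backwards, so track the end of both ranges */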
	size_t dst_off_in_page;
	size_t src_off_in_page;
	unsigned long dst_end = dst_offset + len - 1;
	unsigned long src_end = src_offset + len - 1;
	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
	unsigned long dst_i;
	unsigned long src_i;

	if (src_offset + len > dst->len) {
		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
		       "len %lu len %lu\n", src_offset, len, dst->len);
		BUG_ON(1);
	}
	if (dst_offset + len > dst->len) {
		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
		       "len %lu len %lu\n", dst_offset, len, dst->len);
		BUG_ON(1);
	}
	if (dst_offset < src_offset) {
		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
		return;
	}
	while (len > 0) {
		dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
		src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;

		dst_off_in_page = (start_offset + dst_end) &
			((unsigned long)PAGE_CACHE_SIZE - 1);
		src_off_in_page = (start_offset + src_end) &
			((unsigned long)PAGE_CACHE_SIZE - 1);

		cur = min_t(unsigned long, len, src_off_in_page + 1);
		cur = min(cur, dst_off_in_page + 1);
		move_pages(extent_buffer_page(dst, dst_i),
			   extent_buffer_page(dst, src_i),
			   dst_off_in_page - cur + 1,
			   src_off_in_page - cur + 1, cur);

		dst_end -= cur;
		src_end -= cur;
		len -= cur;
	}
}

int try_release_extent_buffer(struct page *page)
{
	struct extent_buffer *eb;

	/*
	 * We need to make sure nobody is attaching this page to an eb right
	 * now.
	 */
	spin_lock(&page->mapping->private_lock);
	if (!PagePrivate(page)) {
		spin_unlock(&page->mapping->private_lock);
		return 1;
	}

	eb = (struct extent_buffer *)page->private;
	BUG_ON(!eb);

	/*
	 * This is a little awful but should be ok, we need to make sure that
	 * the eb doesn't disappear out from under us while we're looking at
	 * this page.
	 */
	spin_lock(&eb->refs_lock);
	if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
		spin_unlock(&eb->refs_lock);
		spin_unlock(&page->mapping->private_lock);
		return 0;
	}
	spin_unlock(&page->mapping->private_lock);

	/*
	 * If tree ref isn't set then we know the ref on this eb is a real ref,
	 * so just return, this page will likely be freed soon anyway.
	 */
	if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
		spin_unlock(&eb->refs_lock);
		return 0;
	}

	return release_extent_buffer(eb);
}
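
/*
 * Illustrative usage sketch: callers are expected to go through the
 * accessor helpers above rather than touching eb->pages directly.  The
 * 'offset' below is a hypothetical byte offset inside the buffer, used
 * only for illustration:
 *
 *	struct btrfs_disk_key key;
 *
 *	read_extent_buffer(eb, &key, offset, sizeof(key));
 *	write_extent_buffer(eb, &key, offset, sizeof(key));
 *
 * Both helpers split the copy at page boundaries internally, so callers
 * do not need to know how the tree block is laid out across pages.
 */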