1 #include <linux/bitops.h> 2 #include <linux/slab.h> 3 #include <linux/bio.h> 4 #include <linux/mm.h> 5 #include <linux/pagemap.h> 6 #include <linux/page-flags.h> 7 #include <linux/spinlock.h> 8 #include <linux/blkdev.h> 9 #include <linux/swap.h> 10 #include <linux/writeback.h> 11 #include <linux/pagevec.h> 12 #include <linux/prefetch.h> 13 #include <linux/cleancache.h> 14 #include "extent_io.h" 15 #include "extent_map.h" 16 #include "compat.h" 17 #include "ctree.h" 18 #include "btrfs_inode.h" 19 #include "volumes.h" 20 #include "check-integrity.h" 21 #include "locking.h" 22 #include "rcu-string.h" 23 24 static struct kmem_cache *extent_state_cache; 25 static struct kmem_cache *extent_buffer_cache; 26 static struct bio_set *btrfs_bioset; 27 28 #ifdef CONFIG_BTRFS_DEBUG 29 static LIST_HEAD(buffers); 30 static LIST_HEAD(states); 31 32 static DEFINE_SPINLOCK(leak_lock); 33 34 static inline 35 void btrfs_leak_debug_add(struct list_head *new, struct list_head *head) 36 { 37 unsigned long flags; 38 39 spin_lock_irqsave(&leak_lock, flags); 40 list_add(new, head); 41 spin_unlock_irqrestore(&leak_lock, flags); 42 } 43 44 static inline 45 void btrfs_leak_debug_del(struct list_head *entry) 46 { 47 unsigned long flags; 48 49 spin_lock_irqsave(&leak_lock, flags); 50 list_del(entry); 51 spin_unlock_irqrestore(&leak_lock, flags); 52 } 53 54 static inline 55 void btrfs_leak_debug_check(void) 56 { 57 struct extent_state *state; 58 struct extent_buffer *eb; 59 60 while (!list_empty(&states)) { 61 state = list_entry(states.next, struct extent_state, leak_list); 62 printk(KERN_ERR "btrfs state leak: start %llu end %llu " 63 "state %lu in tree %p refs %d\n", 64 (unsigned long long)state->start, 65 (unsigned long long)state->end, 66 state->state, state->tree, atomic_read(&state->refs)); 67 list_del(&state->leak_list); 68 kmem_cache_free(extent_state_cache, state); 69 } 70 71 while (!list_empty(&buffers)) { 72 eb = list_entry(buffers.next, struct extent_buffer, leak_list); 73 printk(KERN_ERR "btrfs buffer leak start %llu len %lu " 74 "refs %d\n", (unsigned long long)eb->start, 75 eb->len, atomic_read(&eb->refs)); 76 list_del(&eb->leak_list); 77 kmem_cache_free(extent_buffer_cache, eb); 78 } 79 } 80 #else 81 #define btrfs_leak_debug_add(new, head) do {} while (0) 82 #define btrfs_leak_debug_del(entry) do {} while (0) 83 #define btrfs_leak_debug_check() do {} while (0) 84 #endif 85 86 #define BUFFER_LRU_MAX 64 87 88 struct tree_entry { 89 u64 start; 90 u64 end; 91 struct rb_node rb_node; 92 }; 93 94 struct extent_page_data { 95 struct bio *bio; 96 struct extent_io_tree *tree; 97 get_extent_t *get_extent; 98 unsigned long bio_flags; 99 100 /* tells writepage not to lock the state bits for this range 101 * it still does the unlocking 102 */ 103 unsigned int extent_locked:1; 104 105 /* tells the submit_bio code to use a WRITE_SYNC */ 106 unsigned int sync_io:1; 107 }; 108 109 static noinline void flush_write_bio(void *data); 110 static inline struct btrfs_fs_info * 111 tree_fs_info(struct extent_io_tree *tree) 112 { 113 return btrfs_sb(tree->mapping->host->i_sb); 114 } 115 116 int __init extent_io_init(void) 117 { 118 extent_state_cache = kmem_cache_create("btrfs_extent_state", 119 sizeof(struct extent_state), 0, 120 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 121 if (!extent_state_cache) 122 return -ENOMEM; 123 124 extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", 125 sizeof(struct extent_buffer), 0, 126 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 127 if (!extent_buffer_cache) 128 goto 
free_state_cache; 129 130 btrfs_bioset = bioset_create(BIO_POOL_SIZE, 131 offsetof(struct btrfs_io_bio, bio)); 132 if (!btrfs_bioset) 133 goto free_buffer_cache; 134 return 0; 135 136 free_buffer_cache: 137 kmem_cache_destroy(extent_buffer_cache); 138 extent_buffer_cache = NULL; 139 140 free_state_cache: 141 kmem_cache_destroy(extent_state_cache); 142 extent_state_cache = NULL; 143 return -ENOMEM; 144 } 145 146 void extent_io_exit(void) 147 { 148 btrfs_leak_debug_check(); 149 150 /* 151 * Make sure all delayed rcu free are flushed before we 152 * destroy caches. 153 */ 154 rcu_barrier(); 155 if (extent_state_cache) 156 kmem_cache_destroy(extent_state_cache); 157 if (extent_buffer_cache) 158 kmem_cache_destroy(extent_buffer_cache); 159 if (btrfs_bioset) 160 bioset_free(btrfs_bioset); 161 } 162 163 void extent_io_tree_init(struct extent_io_tree *tree, 164 struct address_space *mapping) 165 { 166 tree->state = RB_ROOT; 167 INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC); 168 tree->ops = NULL; 169 tree->dirty_bytes = 0; 170 spin_lock_init(&tree->lock); 171 spin_lock_init(&tree->buffer_lock); 172 tree->mapping = mapping; 173 } 174 175 static struct extent_state *alloc_extent_state(gfp_t mask) 176 { 177 struct extent_state *state; 178 179 state = kmem_cache_alloc(extent_state_cache, mask); 180 if (!state) 181 return state; 182 state->state = 0; 183 state->private = 0; 184 state->tree = NULL; 185 btrfs_leak_debug_add(&state->leak_list, &states); 186 atomic_set(&state->refs, 1); 187 init_waitqueue_head(&state->wq); 188 trace_alloc_extent_state(state, mask, _RET_IP_); 189 return state; 190 } 191 192 void free_extent_state(struct extent_state *state) 193 { 194 if (!state) 195 return; 196 if (atomic_dec_and_test(&state->refs)) { 197 WARN_ON(state->tree); 198 btrfs_leak_debug_del(&state->leak_list); 199 trace_free_extent_state(state, _RET_IP_); 200 kmem_cache_free(extent_state_cache, state); 201 } 202 } 203 204 static struct rb_node *tree_insert(struct rb_root *root, u64 offset, 205 struct rb_node *node) 206 { 207 struct rb_node **p = &root->rb_node; 208 struct rb_node *parent = NULL; 209 struct tree_entry *entry; 210 211 while (*p) { 212 parent = *p; 213 entry = rb_entry(parent, struct tree_entry, rb_node); 214 215 if (offset < entry->start) 216 p = &(*p)->rb_left; 217 else if (offset > entry->end) 218 p = &(*p)->rb_right; 219 else 220 return parent; 221 } 222 223 rb_link_node(node, parent, p); 224 rb_insert_color(node, root); 225 return NULL; 226 } 227 228 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, 229 struct rb_node **prev_ret, 230 struct rb_node **next_ret) 231 { 232 struct rb_root *root = &tree->state; 233 struct rb_node *n = root->rb_node; 234 struct rb_node *prev = NULL; 235 struct rb_node *orig_prev = NULL; 236 struct tree_entry *entry; 237 struct tree_entry *prev_entry = NULL; 238 239 while (n) { 240 entry = rb_entry(n, struct tree_entry, rb_node); 241 prev = n; 242 prev_entry = entry; 243 244 if (offset < entry->start) 245 n = n->rb_left; 246 else if (offset > entry->end) 247 n = n->rb_right; 248 else 249 return n; 250 } 251 252 if (prev_ret) { 253 orig_prev = prev; 254 while (prev && offset > prev_entry->end) { 255 prev = rb_next(prev); 256 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 257 } 258 *prev_ret = prev; 259 prev = orig_prev; 260 } 261 262 if (next_ret) { 263 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 264 while (prev && offset < prev_entry->start) { 265 prev = rb_prev(prev); 266 prev_entry = rb_entry(prev, struct tree_entry, 
rb_node); 267 } 268 *next_ret = prev; 269 } 270 return NULL; 271 } 272 273 static inline struct rb_node *tree_search(struct extent_io_tree *tree, 274 u64 offset) 275 { 276 struct rb_node *prev = NULL; 277 struct rb_node *ret; 278 279 ret = __etree_search(tree, offset, &prev, NULL); 280 if (!ret) 281 return prev; 282 return ret; 283 } 284 285 static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, 286 struct extent_state *other) 287 { 288 if (tree->ops && tree->ops->merge_extent_hook) 289 tree->ops->merge_extent_hook(tree->mapping->host, new, 290 other); 291 } 292 293 /* 294 * utility function to look for merge candidates inside a given range. 295 * Any extents with matching state are merged together into a single 296 * extent in the tree. Extents with EXTENT_IO in their state field 297 * are not merged because the end_io handlers need to be able to do 298 * operations on them without sleeping (or doing allocations/splits). 299 * 300 * This should be called with the tree lock held. 301 */ 302 static void merge_state(struct extent_io_tree *tree, 303 struct extent_state *state) 304 { 305 struct extent_state *other; 306 struct rb_node *other_node; 307 308 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 309 return; 310 311 other_node = rb_prev(&state->rb_node); 312 if (other_node) { 313 other = rb_entry(other_node, struct extent_state, rb_node); 314 if (other->end == state->start - 1 && 315 other->state == state->state) { 316 merge_cb(tree, state, other); 317 state->start = other->start; 318 other->tree = NULL; 319 rb_erase(&other->rb_node, &tree->state); 320 free_extent_state(other); 321 } 322 } 323 other_node = rb_next(&state->rb_node); 324 if (other_node) { 325 other = rb_entry(other_node, struct extent_state, rb_node); 326 if (other->start == state->end + 1 && 327 other->state == state->state) { 328 merge_cb(tree, state, other); 329 state->end = other->end; 330 other->tree = NULL; 331 rb_erase(&other->rb_node, &tree->state); 332 free_extent_state(other); 333 } 334 } 335 } 336 337 static void set_state_cb(struct extent_io_tree *tree, 338 struct extent_state *state, unsigned long *bits) 339 { 340 if (tree->ops && tree->ops->set_bit_hook) 341 tree->ops->set_bit_hook(tree->mapping->host, state, bits); 342 } 343 344 static void clear_state_cb(struct extent_io_tree *tree, 345 struct extent_state *state, unsigned long *bits) 346 { 347 if (tree->ops && tree->ops->clear_bit_hook) 348 tree->ops->clear_bit_hook(tree->mapping->host, state, bits); 349 } 350 351 static void set_state_bits(struct extent_io_tree *tree, 352 struct extent_state *state, unsigned long *bits); 353 354 /* 355 * insert an extent_state struct into the tree. 'bits' are set on the 356 * struct before it is inserted. 357 * 358 * This may return -EEXIST if the extent is already there, in which case the 359 * state struct is freed. 360 * 361 * The tree lock is not taken internally. This is a utility function and 362 * probably isn't what you want to call (see set/clear_extent_bit). 
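 *
 * A simplified caller sketch (this mirrors what __set_extent_bit() below
 * does under tree->lock; 'bits' is an unsigned long holding the bits to set):
 *
 *      prealloc = alloc_extent_state(GFP_NOFS);
 *      spin_lock(&tree->lock);
 *      err = insert_state(tree, prealloc, start, end, &bits);
 *      if (err)
 *              extent_io_tree_panic(tree, err);
 *      spin_unlock(&tree->lock);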
363 */ 364 static int insert_state(struct extent_io_tree *tree, 365 struct extent_state *state, u64 start, u64 end, 366 unsigned long *bits) 367 { 368 struct rb_node *node; 369 370 if (end < start) 371 WARN(1, KERN_ERR "btrfs end < start %llu %llu\n", 372 (unsigned long long)end, 373 (unsigned long long)start); 374 state->start = start; 375 state->end = end; 376 377 set_state_bits(tree, state, bits); 378 379 node = tree_insert(&tree->state, end, &state->rb_node); 380 if (node) { 381 struct extent_state *found; 382 found = rb_entry(node, struct extent_state, rb_node); 383 printk(KERN_ERR "btrfs found node %llu %llu on insert of " 384 "%llu %llu\n", (unsigned long long)found->start, 385 (unsigned long long)found->end, 386 (unsigned long long)start, (unsigned long long)end); 387 return -EEXIST; 388 } 389 state->tree = tree; 390 merge_state(tree, state); 391 return 0; 392 } 393 394 static void split_cb(struct extent_io_tree *tree, struct extent_state *orig, 395 u64 split) 396 { 397 if (tree->ops && tree->ops->split_extent_hook) 398 tree->ops->split_extent_hook(tree->mapping->host, orig, split); 399 } 400 401 /* 402 * split a given extent state struct in two, inserting the preallocated 403 * struct 'prealloc' as the newly created second half. 'split' indicates an 404 * offset inside 'orig' where it should be split. 405 * 406 * Before calling, 407 * the tree has 'orig' at [orig->start, orig->end]. After calling, there 408 * are two extent state structs in the tree: 409 * prealloc: [orig->start, split - 1] 410 * orig: [ split, orig->end ] 411 * 412 * The tree locks are not taken by this function. They need to be held 413 * by the caller. 414 */ 415 static int split_state(struct extent_io_tree *tree, struct extent_state *orig, 416 struct extent_state *prealloc, u64 split) 417 { 418 struct rb_node *node; 419 420 split_cb(tree, orig, split); 421 422 prealloc->start = orig->start; 423 prealloc->end = split - 1; 424 prealloc->state = orig->state; 425 orig->start = split; 426 427 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); 428 if (node) { 429 free_extent_state(prealloc); 430 return -EEXIST; 431 } 432 prealloc->tree = tree; 433 return 0; 434 } 435 436 static struct extent_state *next_state(struct extent_state *state) 437 { 438 struct rb_node *next = rb_next(&state->rb_node); 439 if (next) 440 return rb_entry(next, struct extent_state, rb_node); 441 else 442 return NULL; 443 } 444 445 /* 446 * utility function to clear some bits in an extent state struct. 447 * it will optionally wake up any one waiting on this state (wake == 1). 
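 * The next extent_state in the tree (or NULL) is returned, so callers
 * such as clear_extent_bit() below can keep walking without repeating
 * the tree search.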
448 * 449 * If no bits are set on the state struct after clearing things, the 450 * struct is freed and removed from the tree 451 */ 452 static struct extent_state *clear_state_bit(struct extent_io_tree *tree, 453 struct extent_state *state, 454 unsigned long *bits, int wake) 455 { 456 struct extent_state *next; 457 unsigned long bits_to_clear = *bits & ~EXTENT_CTLBITS; 458 459 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 460 u64 range = state->end - state->start + 1; 461 WARN_ON(range > tree->dirty_bytes); 462 tree->dirty_bytes -= range; 463 } 464 clear_state_cb(tree, state, bits); 465 state->state &= ~bits_to_clear; 466 if (wake) 467 wake_up(&state->wq); 468 if (state->state == 0) { 469 next = next_state(state); 470 if (state->tree) { 471 rb_erase(&state->rb_node, &tree->state); 472 state->tree = NULL; 473 free_extent_state(state); 474 } else { 475 WARN_ON(1); 476 } 477 } else { 478 merge_state(tree, state); 479 next = next_state(state); 480 } 481 return next; 482 } 483 484 static struct extent_state * 485 alloc_extent_state_atomic(struct extent_state *prealloc) 486 { 487 if (!prealloc) 488 prealloc = alloc_extent_state(GFP_ATOMIC); 489 490 return prealloc; 491 } 492 493 static void extent_io_tree_panic(struct extent_io_tree *tree, int err) 494 { 495 btrfs_panic(tree_fs_info(tree), err, "Locking error: " 496 "Extent tree was modified by another " 497 "thread while locked."); 498 } 499 500 /* 501 * clear some bits on a range in the tree. This may require splitting 502 * or inserting elements in the tree, so the gfp mask is used to 503 * indicate which allocations or sleeping are allowed. 504 * 505 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove 506 * the given range from the tree regardless of state (ie for truncate). 507 * 508 * the range [start, end] is inclusive. 509 * 510 * This takes the tree lock, and returns 0 on success and < 0 on error. 
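 *
 * A typical call, mirroring the clear_extent_dirty() wrapper further down
 * (sketch only):
 *
 *      clear_extent_bit(tree, start, end,
 *                       EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
 *                       0, 0, NULL, GFP_NOFS);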
511 */ 512 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 513 unsigned long bits, int wake, int delete, 514 struct extent_state **cached_state, 515 gfp_t mask) 516 { 517 struct extent_state *state; 518 struct extent_state *cached; 519 struct extent_state *prealloc = NULL; 520 struct rb_node *node; 521 u64 last_end; 522 int err; 523 int clear = 0; 524 525 if (delete) 526 bits |= ~EXTENT_CTLBITS; 527 bits |= EXTENT_FIRST_DELALLOC; 528 529 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 530 clear = 1; 531 again: 532 if (!prealloc && (mask & __GFP_WAIT)) { 533 prealloc = alloc_extent_state(mask); 534 if (!prealloc) 535 return -ENOMEM; 536 } 537 538 spin_lock(&tree->lock); 539 if (cached_state) { 540 cached = *cached_state; 541 542 if (clear) { 543 *cached_state = NULL; 544 cached_state = NULL; 545 } 546 547 if (cached && cached->tree && cached->start <= start && 548 cached->end > start) { 549 if (clear) 550 atomic_dec(&cached->refs); 551 state = cached; 552 goto hit_next; 553 } 554 if (clear) 555 free_extent_state(cached); 556 } 557 /* 558 * this search will find the extents that end after 559 * our range starts 560 */ 561 node = tree_search(tree, start); 562 if (!node) 563 goto out; 564 state = rb_entry(node, struct extent_state, rb_node); 565 hit_next: 566 if (state->start > end) 567 goto out; 568 WARN_ON(state->end < start); 569 last_end = state->end; 570 571 /* the state doesn't have the wanted bits, go ahead */ 572 if (!(state->state & bits)) { 573 state = next_state(state); 574 goto next; 575 } 576 577 /* 578 * | ---- desired range ---- | 579 * | state | or 580 * | ------------- state -------------- | 581 * 582 * We need to split the extent we found, and may flip 583 * bits on second half. 584 * 585 * If the extent we found extends past our range, we 586 * just split and search again. It'll get split again 587 * the next time though. 588 * 589 * If the extent we found is inside our range, we clear 590 * the desired bit on it. 
591 */ 592 593 if (state->start < start) { 594 prealloc = alloc_extent_state_atomic(prealloc); 595 BUG_ON(!prealloc); 596 err = split_state(tree, state, prealloc, start); 597 if (err) 598 extent_io_tree_panic(tree, err); 599 600 prealloc = NULL; 601 if (err) 602 goto out; 603 if (state->end <= end) { 604 state = clear_state_bit(tree, state, &bits, wake); 605 goto next; 606 } 607 goto search_again; 608 } 609 /* 610 * | ---- desired range ---- | 611 * | state | 612 * We need to split the extent, and clear the bit 613 * on the first half 614 */ 615 if (state->start <= end && state->end > end) { 616 prealloc = alloc_extent_state_atomic(prealloc); 617 BUG_ON(!prealloc); 618 err = split_state(tree, state, prealloc, end + 1); 619 if (err) 620 extent_io_tree_panic(tree, err); 621 622 if (wake) 623 wake_up(&state->wq); 624 625 clear_state_bit(tree, prealloc, &bits, wake); 626 627 prealloc = NULL; 628 goto out; 629 } 630 631 state = clear_state_bit(tree, state, &bits, wake); 632 next: 633 if (last_end == (u64)-1) 634 goto out; 635 start = last_end + 1; 636 if (start <= end && state && !need_resched()) 637 goto hit_next; 638 goto search_again; 639 640 out: 641 spin_unlock(&tree->lock); 642 if (prealloc) 643 free_extent_state(prealloc); 644 645 return 0; 646 647 search_again: 648 if (start > end) 649 goto out; 650 spin_unlock(&tree->lock); 651 if (mask & __GFP_WAIT) 652 cond_resched(); 653 goto again; 654 } 655 656 static void wait_on_state(struct extent_io_tree *tree, 657 struct extent_state *state) 658 __releases(tree->lock) 659 __acquires(tree->lock) 660 { 661 DEFINE_WAIT(wait); 662 prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); 663 spin_unlock(&tree->lock); 664 schedule(); 665 spin_lock(&tree->lock); 666 finish_wait(&state->wq, &wait); 667 } 668 669 /* 670 * waits for one or more bits to clear on a range in the state tree. 671 * The range [start, end] is inclusive. 
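 * The lock is dropped around schedule() in wait_on_state() and re-taken
 * afterwards, so the tree may change while we sleep and the search below
 * is simply restarted.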
672 * The tree lock is taken by this function 673 */ 674 static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 675 unsigned long bits) 676 { 677 struct extent_state *state; 678 struct rb_node *node; 679 680 spin_lock(&tree->lock); 681 again: 682 while (1) { 683 /* 684 * this search will find all the extents that end after 685 * our range starts 686 */ 687 node = tree_search(tree, start); 688 if (!node) 689 break; 690 691 state = rb_entry(node, struct extent_state, rb_node); 692 693 if (state->start > end) 694 goto out; 695 696 if (state->state & bits) { 697 start = state->start; 698 atomic_inc(&state->refs); 699 wait_on_state(tree, state); 700 free_extent_state(state); 701 goto again; 702 } 703 start = state->end + 1; 704 705 if (start > end) 706 break; 707 708 cond_resched_lock(&tree->lock); 709 } 710 out: 711 spin_unlock(&tree->lock); 712 } 713 714 static void set_state_bits(struct extent_io_tree *tree, 715 struct extent_state *state, 716 unsigned long *bits) 717 { 718 unsigned long bits_to_set = *bits & ~EXTENT_CTLBITS; 719 720 set_state_cb(tree, state, bits); 721 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { 722 u64 range = state->end - state->start + 1; 723 tree->dirty_bytes += range; 724 } 725 state->state |= bits_to_set; 726 } 727 728 static void cache_state(struct extent_state *state, 729 struct extent_state **cached_ptr) 730 { 731 if (cached_ptr && !(*cached_ptr)) { 732 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) { 733 *cached_ptr = state; 734 atomic_inc(&state->refs); 735 } 736 } 737 } 738 739 static void uncache_state(struct extent_state **cached_ptr) 740 { 741 if (cached_ptr && (*cached_ptr)) { 742 struct extent_state *state = *cached_ptr; 743 *cached_ptr = NULL; 744 free_extent_state(state); 745 } 746 } 747 748 /* 749 * set some bits on a range in the tree. This may require allocations or 750 * sleeping, so the gfp mask is used to indicate what is allowed. 751 * 752 * If any of the exclusive bits are set, this will fail with -EEXIST if some 753 * part of the range already has the desired bits set. The start of the 754 * existing range is returned in failed_start in this case. 755 * 756 * [start, end] is inclusive This takes the tree lock. 757 */ 758 759 static int __must_check 760 __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 761 unsigned long bits, unsigned long exclusive_bits, 762 u64 *failed_start, struct extent_state **cached_state, 763 gfp_t mask) 764 { 765 struct extent_state *state; 766 struct extent_state *prealloc = NULL; 767 struct rb_node *node; 768 int err = 0; 769 u64 last_start; 770 u64 last_end; 771 772 bits |= EXTENT_FIRST_DELALLOC; 773 again: 774 if (!prealloc && (mask & __GFP_WAIT)) { 775 prealloc = alloc_extent_state(mask); 776 BUG_ON(!prealloc); 777 } 778 779 spin_lock(&tree->lock); 780 if (cached_state && *cached_state) { 781 state = *cached_state; 782 if (state->start <= start && state->end > start && 783 state->tree) { 784 node = &state->rb_node; 785 goto hit_next; 786 } 787 } 788 /* 789 * this search will find all the extents that end after 790 * our range starts. 
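 * If nothing is found, the whole [start, end] range is inserted below in
 * one go using the preallocated state.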
791 */ 792 node = tree_search(tree, start); 793 if (!node) { 794 prealloc = alloc_extent_state_atomic(prealloc); 795 BUG_ON(!prealloc); 796 err = insert_state(tree, prealloc, start, end, &bits); 797 if (err) 798 extent_io_tree_panic(tree, err); 799 800 prealloc = NULL; 801 goto out; 802 } 803 state = rb_entry(node, struct extent_state, rb_node); 804 hit_next: 805 last_start = state->start; 806 last_end = state->end; 807 808 /* 809 * | ---- desired range ---- | 810 * | state | 811 * 812 * Just lock what we found and keep going 813 */ 814 if (state->start == start && state->end <= end) { 815 if (state->state & exclusive_bits) { 816 *failed_start = state->start; 817 err = -EEXIST; 818 goto out; 819 } 820 821 set_state_bits(tree, state, &bits); 822 cache_state(state, cached_state); 823 merge_state(tree, state); 824 if (last_end == (u64)-1) 825 goto out; 826 start = last_end + 1; 827 state = next_state(state); 828 if (start < end && state && state->start == start && 829 !need_resched()) 830 goto hit_next; 831 goto search_again; 832 } 833 834 /* 835 * | ---- desired range ---- | 836 * | state | 837 * or 838 * | ------------- state -------------- | 839 * 840 * We need to split the extent we found, and may flip bits on 841 * second half. 842 * 843 * If the extent we found extends past our 844 * range, we just split and search again. It'll get split 845 * again the next time though. 846 * 847 * If the extent we found is inside our range, we set the 848 * desired bit on it. 849 */ 850 if (state->start < start) { 851 if (state->state & exclusive_bits) { 852 *failed_start = start; 853 err = -EEXIST; 854 goto out; 855 } 856 857 prealloc = alloc_extent_state_atomic(prealloc); 858 BUG_ON(!prealloc); 859 err = split_state(tree, state, prealloc, start); 860 if (err) 861 extent_io_tree_panic(tree, err); 862 863 prealloc = NULL; 864 if (err) 865 goto out; 866 if (state->end <= end) { 867 set_state_bits(tree, state, &bits); 868 cache_state(state, cached_state); 869 merge_state(tree, state); 870 if (last_end == (u64)-1) 871 goto out; 872 start = last_end + 1; 873 state = next_state(state); 874 if (start < end && state && state->start == start && 875 !need_resched()) 876 goto hit_next; 877 } 878 goto search_again; 879 } 880 /* 881 * | ---- desired range ---- | 882 * | state | or | state | 883 * 884 * There's a hole, we need to insert something in it and 885 * ignore the extent we found. 886 */ 887 if (state->start > start) { 888 u64 this_end; 889 if (end < last_start) 890 this_end = end; 891 else 892 this_end = last_start - 1; 893 894 prealloc = alloc_extent_state_atomic(prealloc); 895 BUG_ON(!prealloc); 896 897 /* 898 * Avoid to free 'prealloc' if it can be merged with 899 * the later extent. 
900 */ 901 err = insert_state(tree, prealloc, start, this_end, 902 &bits); 903 if (err) 904 extent_io_tree_panic(tree, err); 905 906 cache_state(prealloc, cached_state); 907 prealloc = NULL; 908 start = this_end + 1; 909 goto search_again; 910 } 911 /* 912 * | ---- desired range ---- | 913 * | state | 914 * We need to split the extent, and set the bit 915 * on the first half 916 */ 917 if (state->start <= end && state->end > end) { 918 if (state->state & exclusive_bits) { 919 *failed_start = start; 920 err = -EEXIST; 921 goto out; 922 } 923 924 prealloc = alloc_extent_state_atomic(prealloc); 925 BUG_ON(!prealloc); 926 err = split_state(tree, state, prealloc, end + 1); 927 if (err) 928 extent_io_tree_panic(tree, err); 929 930 set_state_bits(tree, prealloc, &bits); 931 cache_state(prealloc, cached_state); 932 merge_state(tree, prealloc); 933 prealloc = NULL; 934 goto out; 935 } 936 937 goto search_again; 938 939 out: 940 spin_unlock(&tree->lock); 941 if (prealloc) 942 free_extent_state(prealloc); 943 944 return err; 945 946 search_again: 947 if (start > end) 948 goto out; 949 spin_unlock(&tree->lock); 950 if (mask & __GFP_WAIT) 951 cond_resched(); 952 goto again; 953 } 954 955 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 956 unsigned long bits, u64 * failed_start, 957 struct extent_state **cached_state, gfp_t mask) 958 { 959 return __set_extent_bit(tree, start, end, bits, 0, failed_start, 960 cached_state, mask); 961 } 962 963 964 /** 965 * convert_extent_bit - convert all bits in a given range from one bit to 966 * another 967 * @tree: the io tree to search 968 * @start: the start offset in bytes 969 * @end: the end offset in bytes (inclusive) 970 * @bits: the bits to set in this range 971 * @clear_bits: the bits to clear in this range 972 * @cached_state: state that we're going to cache 973 * @mask: the allocation mask 974 * 975 * This will go through and set bits for the given range. If any states exist 976 * already in this range they are set with the given bit and cleared of the 977 * clear_bits. This is only meant to be used by things that are mergeable, ie 978 * converting from say DELALLOC to DIRTY. This is not meant to be used with 979 * boundary bits like LOCK. 980 */ 981 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 982 unsigned long bits, unsigned long clear_bits, 983 struct extent_state **cached_state, gfp_t mask) 984 { 985 struct extent_state *state; 986 struct extent_state *prealloc = NULL; 987 struct rb_node *node; 988 int err = 0; 989 u64 last_start; 990 u64 last_end; 991 992 again: 993 if (!prealloc && (mask & __GFP_WAIT)) { 994 prealloc = alloc_extent_state(mask); 995 if (!prealloc) 996 return -ENOMEM; 997 } 998 999 spin_lock(&tree->lock); 1000 if (cached_state && *cached_state) { 1001 state = *cached_state; 1002 if (state->start <= start && state->end > start && 1003 state->tree) { 1004 node = &state->rb_node; 1005 goto hit_next; 1006 } 1007 } 1008 1009 /* 1010 * this search will find all the extents that end after 1011 * our range starts. 
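 * Note that, unlike __set_extent_bit() above, allocation failures in this
 * function are returned to the caller as -ENOMEM instead of being treated
 * as fatal.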
1012 */ 1013 node = tree_search(tree, start); 1014 if (!node) { 1015 prealloc = alloc_extent_state_atomic(prealloc); 1016 if (!prealloc) { 1017 err = -ENOMEM; 1018 goto out; 1019 } 1020 err = insert_state(tree, prealloc, start, end, &bits); 1021 prealloc = NULL; 1022 if (err) 1023 extent_io_tree_panic(tree, err); 1024 goto out; 1025 } 1026 state = rb_entry(node, struct extent_state, rb_node); 1027 hit_next: 1028 last_start = state->start; 1029 last_end = state->end; 1030 1031 /* 1032 * | ---- desired range ---- | 1033 * | state | 1034 * 1035 * Just lock what we found and keep going 1036 */ 1037 if (state->start == start && state->end <= end) { 1038 set_state_bits(tree, state, &bits); 1039 cache_state(state, cached_state); 1040 state = clear_state_bit(tree, state, &clear_bits, 0); 1041 if (last_end == (u64)-1) 1042 goto out; 1043 start = last_end + 1; 1044 if (start < end && state && state->start == start && 1045 !need_resched()) 1046 goto hit_next; 1047 goto search_again; 1048 } 1049 1050 /* 1051 * | ---- desired range ---- | 1052 * | state | 1053 * or 1054 * | ------------- state -------------- | 1055 * 1056 * We need to split the extent we found, and may flip bits on 1057 * second half. 1058 * 1059 * If the extent we found extends past our 1060 * range, we just split and search again. It'll get split 1061 * again the next time though. 1062 * 1063 * If the extent we found is inside our range, we set the 1064 * desired bit on it. 1065 */ 1066 if (state->start < start) { 1067 prealloc = alloc_extent_state_atomic(prealloc); 1068 if (!prealloc) { 1069 err = -ENOMEM; 1070 goto out; 1071 } 1072 err = split_state(tree, state, prealloc, start); 1073 if (err) 1074 extent_io_tree_panic(tree, err); 1075 prealloc = NULL; 1076 if (err) 1077 goto out; 1078 if (state->end <= end) { 1079 set_state_bits(tree, state, &bits); 1080 cache_state(state, cached_state); 1081 state = clear_state_bit(tree, state, &clear_bits, 0); 1082 if (last_end == (u64)-1) 1083 goto out; 1084 start = last_end + 1; 1085 if (start < end && state && state->start == start && 1086 !need_resched()) 1087 goto hit_next; 1088 } 1089 goto search_again; 1090 } 1091 /* 1092 * | ---- desired range ---- | 1093 * | state | or | state | 1094 * 1095 * There's a hole, we need to insert something in it and 1096 * ignore the extent we found. 1097 */ 1098 if (state->start > start) { 1099 u64 this_end; 1100 if (end < last_start) 1101 this_end = end; 1102 else 1103 this_end = last_start - 1; 1104 1105 prealloc = alloc_extent_state_atomic(prealloc); 1106 if (!prealloc) { 1107 err = -ENOMEM; 1108 goto out; 1109 } 1110 1111 /* 1112 * Avoid to free 'prealloc' if it can be merged with 1113 * the later extent. 
1114 */ 1115 err = insert_state(tree, prealloc, start, this_end, 1116 &bits); 1117 if (err) 1118 extent_io_tree_panic(tree, err); 1119 cache_state(prealloc, cached_state); 1120 prealloc = NULL; 1121 start = this_end + 1; 1122 goto search_again; 1123 } 1124 /* 1125 * | ---- desired range ---- | 1126 * | state | 1127 * We need to split the extent, and set the bit 1128 * on the first half 1129 */ 1130 if (state->start <= end && state->end > end) { 1131 prealloc = alloc_extent_state_atomic(prealloc); 1132 if (!prealloc) { 1133 err = -ENOMEM; 1134 goto out; 1135 } 1136 1137 err = split_state(tree, state, prealloc, end + 1); 1138 if (err) 1139 extent_io_tree_panic(tree, err); 1140 1141 set_state_bits(tree, prealloc, &bits); 1142 cache_state(prealloc, cached_state); 1143 clear_state_bit(tree, prealloc, &clear_bits, 0); 1144 prealloc = NULL; 1145 goto out; 1146 } 1147 1148 goto search_again; 1149 1150 out: 1151 spin_unlock(&tree->lock); 1152 if (prealloc) 1153 free_extent_state(prealloc); 1154 1155 return err; 1156 1157 search_again: 1158 if (start > end) 1159 goto out; 1160 spin_unlock(&tree->lock); 1161 if (mask & __GFP_WAIT) 1162 cond_resched(); 1163 goto again; 1164 } 1165 1166 /* wrappers around set/clear extent bit */ 1167 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 1168 gfp_t mask) 1169 { 1170 return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, 1171 NULL, mask); 1172 } 1173 1174 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1175 unsigned long bits, gfp_t mask) 1176 { 1177 return set_extent_bit(tree, start, end, bits, NULL, 1178 NULL, mask); 1179 } 1180 1181 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1182 unsigned long bits, gfp_t mask) 1183 { 1184 return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); 1185 } 1186 1187 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 1188 struct extent_state **cached_state, gfp_t mask) 1189 { 1190 return set_extent_bit(tree, start, end, 1191 EXTENT_DELALLOC | EXTENT_UPTODATE, 1192 NULL, cached_state, mask); 1193 } 1194 1195 int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, 1196 struct extent_state **cached_state, gfp_t mask) 1197 { 1198 return set_extent_bit(tree, start, end, 1199 EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG, 1200 NULL, cached_state, mask); 1201 } 1202 1203 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 1204 gfp_t mask) 1205 { 1206 return clear_extent_bit(tree, start, end, 1207 EXTENT_DIRTY | EXTENT_DELALLOC | 1208 EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); 1209 } 1210 1211 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 1212 gfp_t mask) 1213 { 1214 return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, 1215 NULL, mask); 1216 } 1217 1218 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 1219 struct extent_state **cached_state, gfp_t mask) 1220 { 1221 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL, 1222 cached_state, mask); 1223 } 1224 1225 int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 1226 struct extent_state **cached_state, gfp_t mask) 1227 { 1228 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, 1229 cached_state, mask); 1230 } 1231 1232 /* 1233 * either insert or lock state struct between start and end use mask to tell 1234 * us if waiting is desired. 
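 *
 * Typical usage in this file (see find_lock_delalloc_range() below):
 *
 *      struct extent_state *cached_state = NULL;
 *
 *      lock_extent_bits(tree, start, end, 0, &cached_state);
 *      ...
 *      unlock_extent_cached(tree, start, end, &cached_state, GFP_NOFS);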
1235 */ 1236 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1237 unsigned long bits, struct extent_state **cached_state) 1238 { 1239 int err; 1240 u64 failed_start; 1241 while (1) { 1242 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, 1243 EXTENT_LOCKED, &failed_start, 1244 cached_state, GFP_NOFS); 1245 if (err == -EEXIST) { 1246 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); 1247 start = failed_start; 1248 } else 1249 break; 1250 WARN_ON(start > end); 1251 } 1252 return err; 1253 } 1254 1255 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end) 1256 { 1257 return lock_extent_bits(tree, start, end, 0, NULL); 1258 } 1259 1260 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) 1261 { 1262 int err; 1263 u64 failed_start; 1264 1265 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, 1266 &failed_start, NULL, GFP_NOFS); 1267 if (err == -EEXIST) { 1268 if (failed_start > start) 1269 clear_extent_bit(tree, start, failed_start - 1, 1270 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); 1271 return 0; 1272 } 1273 return 1; 1274 } 1275 1276 int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, 1277 struct extent_state **cached, gfp_t mask) 1278 { 1279 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, 1280 mask); 1281 } 1282 1283 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) 1284 { 1285 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, 1286 GFP_NOFS); 1287 } 1288 1289 int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) 1290 { 1291 unsigned long index = start >> PAGE_CACHE_SHIFT; 1292 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1293 struct page *page; 1294 1295 while (index <= end_index) { 1296 page = find_get_page(inode->i_mapping, index); 1297 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1298 clear_page_dirty_for_io(page); 1299 page_cache_release(page); 1300 index++; 1301 } 1302 return 0; 1303 } 1304 1305 int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) 1306 { 1307 unsigned long index = start >> PAGE_CACHE_SHIFT; 1308 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1309 struct page *page; 1310 1311 while (index <= end_index) { 1312 page = find_get_page(inode->i_mapping, index); 1313 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1314 account_page_redirty(page); 1315 __set_page_dirty_nobuffers(page); 1316 page_cache_release(page); 1317 index++; 1318 } 1319 return 0; 1320 } 1321 1322 /* 1323 * helper function to set both pages and extents in the tree writeback 1324 */ 1325 static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) 1326 { 1327 unsigned long index = start >> PAGE_CACHE_SHIFT; 1328 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1329 struct page *page; 1330 1331 while (index <= end_index) { 1332 page = find_get_page(tree->mapping, index); 1333 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1334 set_page_writeback(page); 1335 page_cache_release(page); 1336 index++; 1337 } 1338 return 0; 1339 } 1340 1341 /* find the first state struct with 'bits' set after 'start', and 1342 * return it. tree->lock must be held. 
 * NULL will be returned if
 * nothing was found after 'start'
 */
static struct extent_state *
find_first_extent_bit_state(struct extent_io_tree *tree,
                            u64 start, unsigned long bits)
{
        struct rb_node *node;
        struct extent_state *state;

        /*
         * this search will find all the extents that end after
         * our range starts.
         */
        node = tree_search(tree, start);
        if (!node)
                goto out;

        while (1) {
                state = rb_entry(node, struct extent_state, rb_node);
                if (state->end >= start && (state->state & bits))
                        return state;

                node = rb_next(node);
                if (!node)
                        break;
        }
out:
        return NULL;
}

/*
 * find the first offset in the io tree with 'bits' set. zero is
 * returned if we find something, and *start_ret and *end_ret are
 * set to reflect the state struct that was found.
 *
 * If nothing was found, 1 is returned. If something was found, 0 is returned.
 */
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
                          u64 *start_ret, u64 *end_ret, unsigned long bits,
                          struct extent_state **cached_state)
{
        struct extent_state *state;
        struct rb_node *n;
        int ret = 1;

        spin_lock(&tree->lock);
        if (cached_state && *cached_state) {
                state = *cached_state;
                if (state->end == start - 1 && state->tree) {
                        n = rb_next(&state->rb_node);
                        while (n) {
                                state = rb_entry(n, struct extent_state,
                                                 rb_node);
                                if (state->state & bits)
                                        goto got_it;
                                n = rb_next(n);
                        }
                        free_extent_state(*cached_state);
                        *cached_state = NULL;
                        goto out;
                }
                free_extent_state(*cached_state);
                *cached_state = NULL;
        }

        state = find_first_extent_bit_state(tree, start, bits);
got_it:
        if (state) {
                cache_state(state, cached_state);
                *start_ret = state->start;
                *end_ret = state->end;
                ret = 0;
        }
out:
        spin_unlock(&tree->lock);
        return ret;
}

/*
 * find a contiguous range of bytes in the file marked as delalloc, not
 * more than 'max_bytes'.  start and end are used to return the range.
 *
 * 1 is returned if we find something, 0 if nothing was in the tree
 */
static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
                                        u64 *start, u64 *end, u64 max_bytes,
                                        struct extent_state **cached_state)
{
        struct rb_node *node;
        struct extent_state *state;
        u64 cur_start = *start;
        u64 found = 0;
        u64 total_bytes = 0;

        spin_lock(&tree->lock);

        /*
         * this search will find all the extents that end after
         * our range starts.
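 * Only a contiguous run of EXTENT_DELALLOC states is accepted: the walk
 * below stops at the first gap, at an EXTENT_BOUNDARY state, or at a
 * state without EXTENT_DELALLOC set.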
1442 */ 1443 node = tree_search(tree, cur_start); 1444 if (!node) { 1445 if (!found) 1446 *end = (u64)-1; 1447 goto out; 1448 } 1449 1450 while (1) { 1451 state = rb_entry(node, struct extent_state, rb_node); 1452 if (found && (state->start != cur_start || 1453 (state->state & EXTENT_BOUNDARY))) { 1454 goto out; 1455 } 1456 if (!(state->state & EXTENT_DELALLOC)) { 1457 if (!found) 1458 *end = state->end; 1459 goto out; 1460 } 1461 if (!found) { 1462 *start = state->start; 1463 *cached_state = state; 1464 atomic_inc(&state->refs); 1465 } 1466 found++; 1467 *end = state->end; 1468 cur_start = state->end + 1; 1469 node = rb_next(node); 1470 if (!node) 1471 break; 1472 total_bytes += state->end - state->start + 1; 1473 if (total_bytes >= max_bytes) 1474 break; 1475 } 1476 out: 1477 spin_unlock(&tree->lock); 1478 return found; 1479 } 1480 1481 static noinline void __unlock_for_delalloc(struct inode *inode, 1482 struct page *locked_page, 1483 u64 start, u64 end) 1484 { 1485 int ret; 1486 struct page *pages[16]; 1487 unsigned long index = start >> PAGE_CACHE_SHIFT; 1488 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1489 unsigned long nr_pages = end_index - index + 1; 1490 int i; 1491 1492 if (index == locked_page->index && end_index == index) 1493 return; 1494 1495 while (nr_pages > 0) { 1496 ret = find_get_pages_contig(inode->i_mapping, index, 1497 min_t(unsigned long, nr_pages, 1498 ARRAY_SIZE(pages)), pages); 1499 for (i = 0; i < ret; i++) { 1500 if (pages[i] != locked_page) 1501 unlock_page(pages[i]); 1502 page_cache_release(pages[i]); 1503 } 1504 nr_pages -= ret; 1505 index += ret; 1506 cond_resched(); 1507 } 1508 } 1509 1510 static noinline int lock_delalloc_pages(struct inode *inode, 1511 struct page *locked_page, 1512 u64 delalloc_start, 1513 u64 delalloc_end) 1514 { 1515 unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT; 1516 unsigned long start_index = index; 1517 unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT; 1518 unsigned long pages_locked = 0; 1519 struct page *pages[16]; 1520 unsigned long nrpages; 1521 int ret; 1522 int i; 1523 1524 /* the caller is responsible for locking the start index */ 1525 if (index == locked_page->index && index == end_index) 1526 return 0; 1527 1528 /* skip the page at the start index */ 1529 nrpages = end_index - index + 1; 1530 while (nrpages > 0) { 1531 ret = find_get_pages_contig(inode->i_mapping, index, 1532 min_t(unsigned long, 1533 nrpages, ARRAY_SIZE(pages)), pages); 1534 if (ret == 0) { 1535 ret = -EAGAIN; 1536 goto done; 1537 } 1538 /* now we have an array of pages, lock them all */ 1539 for (i = 0; i < ret; i++) { 1540 /* 1541 * the caller is taking responsibility for 1542 * locked_page 1543 */ 1544 if (pages[i] != locked_page) { 1545 lock_page(pages[i]); 1546 if (!PageDirty(pages[i]) || 1547 pages[i]->mapping != inode->i_mapping) { 1548 ret = -EAGAIN; 1549 unlock_page(pages[i]); 1550 page_cache_release(pages[i]); 1551 goto done; 1552 } 1553 } 1554 page_cache_release(pages[i]); 1555 pages_locked++; 1556 } 1557 nrpages -= ret; 1558 index += ret; 1559 cond_resched(); 1560 } 1561 ret = 0; 1562 done: 1563 if (ret && pages_locked) { 1564 __unlock_for_delalloc(inode, locked_page, 1565 delalloc_start, 1566 ((u64)(start_index + pages_locked - 1)) << 1567 PAGE_CACHE_SHIFT); 1568 } 1569 return ret; 1570 } 1571 1572 /* 1573 * find a contiguous range of bytes in the file marked as delalloc, not 1574 * more than 'max_bytes'. 
start and end are used to return the range, 1575 * 1576 * 1 is returned if we find something, 0 if nothing was in the tree 1577 */ 1578 static noinline u64 find_lock_delalloc_range(struct inode *inode, 1579 struct extent_io_tree *tree, 1580 struct page *locked_page, 1581 u64 *start, u64 *end, 1582 u64 max_bytes) 1583 { 1584 u64 delalloc_start; 1585 u64 delalloc_end; 1586 u64 found; 1587 struct extent_state *cached_state = NULL; 1588 int ret; 1589 int loops = 0; 1590 1591 again: 1592 /* step one, find a bunch of delalloc bytes starting at start */ 1593 delalloc_start = *start; 1594 delalloc_end = 0; 1595 found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, 1596 max_bytes, &cached_state); 1597 if (!found || delalloc_end <= *start) { 1598 *start = delalloc_start; 1599 *end = delalloc_end; 1600 free_extent_state(cached_state); 1601 return found; 1602 } 1603 1604 /* 1605 * start comes from the offset of locked_page. We have to lock 1606 * pages in order, so we can't process delalloc bytes before 1607 * locked_page 1608 */ 1609 if (delalloc_start < *start) 1610 delalloc_start = *start; 1611 1612 /* 1613 * make sure to limit the number of pages we try to lock down 1614 * if we're looping. 1615 */ 1616 if (delalloc_end + 1 - delalloc_start > max_bytes && loops) 1617 delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1; 1618 1619 /* step two, lock all the pages after the page that has start */ 1620 ret = lock_delalloc_pages(inode, locked_page, 1621 delalloc_start, delalloc_end); 1622 if (ret == -EAGAIN) { 1623 /* some of the pages are gone, lets avoid looping by 1624 * shortening the size of the delalloc range we're searching 1625 */ 1626 free_extent_state(cached_state); 1627 if (!loops) { 1628 unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); 1629 max_bytes = PAGE_CACHE_SIZE - offset; 1630 loops = 1; 1631 goto again; 1632 } else { 1633 found = 0; 1634 goto out_failed; 1635 } 1636 } 1637 BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */ 1638 1639 /* step three, lock the state bits for the whole range */ 1640 lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state); 1641 1642 /* then test to make sure it is all still delalloc */ 1643 ret = test_range_bit(tree, delalloc_start, delalloc_end, 1644 EXTENT_DELALLOC, 1, cached_state); 1645 if (!ret) { 1646 unlock_extent_cached(tree, delalloc_start, delalloc_end, 1647 &cached_state, GFP_NOFS); 1648 __unlock_for_delalloc(inode, locked_page, 1649 delalloc_start, delalloc_end); 1650 cond_resched(); 1651 goto again; 1652 } 1653 free_extent_state(cached_state); 1654 *start = delalloc_start; 1655 *end = delalloc_end; 1656 out_failed: 1657 return found; 1658 } 1659 1660 int extent_clear_unlock_delalloc(struct inode *inode, 1661 struct extent_io_tree *tree, 1662 u64 start, u64 end, struct page *locked_page, 1663 unsigned long op) 1664 { 1665 int ret; 1666 struct page *pages[16]; 1667 unsigned long index = start >> PAGE_CACHE_SHIFT; 1668 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1669 unsigned long nr_pages = end_index - index + 1; 1670 int i; 1671 unsigned long clear_bits = 0; 1672 1673 if (op & EXTENT_CLEAR_UNLOCK) 1674 clear_bits |= EXTENT_LOCKED; 1675 if (op & EXTENT_CLEAR_DIRTY) 1676 clear_bits |= EXTENT_DIRTY; 1677 1678 if (op & EXTENT_CLEAR_DELALLOC) 1679 clear_bits |= EXTENT_DELALLOC; 1680 1681 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); 1682 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | 1683 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | 1684 EXTENT_SET_PRIVATE2))) 1685 
return 0; 1686 1687 while (nr_pages > 0) { 1688 ret = find_get_pages_contig(inode->i_mapping, index, 1689 min_t(unsigned long, 1690 nr_pages, ARRAY_SIZE(pages)), pages); 1691 for (i = 0; i < ret; i++) { 1692 1693 if (op & EXTENT_SET_PRIVATE2) 1694 SetPagePrivate2(pages[i]); 1695 1696 if (pages[i] == locked_page) { 1697 page_cache_release(pages[i]); 1698 continue; 1699 } 1700 if (op & EXTENT_CLEAR_DIRTY) 1701 clear_page_dirty_for_io(pages[i]); 1702 if (op & EXTENT_SET_WRITEBACK) 1703 set_page_writeback(pages[i]); 1704 if (op & EXTENT_END_WRITEBACK) 1705 end_page_writeback(pages[i]); 1706 if (op & EXTENT_CLEAR_UNLOCK_PAGE) 1707 unlock_page(pages[i]); 1708 page_cache_release(pages[i]); 1709 } 1710 nr_pages -= ret; 1711 index += ret; 1712 cond_resched(); 1713 } 1714 return 0; 1715 } 1716 1717 /* 1718 * count the number of bytes in the tree that have a given bit(s) 1719 * set. This can be fairly slow, except for EXTENT_DIRTY which is 1720 * cached. The total number found is returned. 1721 */ 1722 u64 count_range_bits(struct extent_io_tree *tree, 1723 u64 *start, u64 search_end, u64 max_bytes, 1724 unsigned long bits, int contig) 1725 { 1726 struct rb_node *node; 1727 struct extent_state *state; 1728 u64 cur_start = *start; 1729 u64 total_bytes = 0; 1730 u64 last = 0; 1731 int found = 0; 1732 1733 if (search_end <= cur_start) { 1734 WARN_ON(1); 1735 return 0; 1736 } 1737 1738 spin_lock(&tree->lock); 1739 if (cur_start == 0 && bits == EXTENT_DIRTY) { 1740 total_bytes = tree->dirty_bytes; 1741 goto out; 1742 } 1743 /* 1744 * this search will find all the extents that end after 1745 * our range starts. 1746 */ 1747 node = tree_search(tree, cur_start); 1748 if (!node) 1749 goto out; 1750 1751 while (1) { 1752 state = rb_entry(node, struct extent_state, rb_node); 1753 if (state->start > search_end) 1754 break; 1755 if (contig && found && state->start > last + 1) 1756 break; 1757 if (state->end >= cur_start && (state->state & bits) == bits) { 1758 total_bytes += min(search_end, state->end) + 1 - 1759 max(cur_start, state->start); 1760 if (total_bytes >= max_bytes) 1761 break; 1762 if (!found) { 1763 *start = max(cur_start, state->start); 1764 found = 1; 1765 } 1766 last = state->end; 1767 } else if (contig && found) { 1768 break; 1769 } 1770 node = rb_next(node); 1771 if (!node) 1772 break; 1773 } 1774 out: 1775 spin_unlock(&tree->lock); 1776 return total_bytes; 1777 } 1778 1779 /* 1780 * set the private field for a given byte offset in the tree. If there isn't 1781 * an extent_state there already, this does nothing. 1782 */ 1783 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) 1784 { 1785 struct rb_node *node; 1786 struct extent_state *state; 1787 int ret = 0; 1788 1789 spin_lock(&tree->lock); 1790 /* 1791 * this search will find all the extents that end after 1792 * our range starts. 1793 */ 1794 node = tree_search(tree, start); 1795 if (!node) { 1796 ret = -ENOENT; 1797 goto out; 1798 } 1799 state = rb_entry(node, struct extent_state, rb_node); 1800 if (state->start != start) { 1801 ret = -ENOENT; 1802 goto out; 1803 } 1804 state->private = private; 1805 out: 1806 spin_unlock(&tree->lock); 1807 return ret; 1808 } 1809 1810 void extent_cache_csums_dio(struct extent_io_tree *tree, u64 start, u32 csums[], 1811 int count) 1812 { 1813 struct rb_node *node; 1814 struct extent_state *state; 1815 1816 spin_lock(&tree->lock); 1817 /* 1818 * this search will find all the extents that end after 1819 * our range starts. 
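 * The caller must already have extent states covering the csum range;
 * the BUG_ON()s below insist that the first state starts exactly at
 * 'start'.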
1820 */ 1821 node = tree_search(tree, start); 1822 BUG_ON(!node); 1823 1824 state = rb_entry(node, struct extent_state, rb_node); 1825 BUG_ON(state->start != start); 1826 1827 while (count) { 1828 state->private = *csums++; 1829 count--; 1830 state = next_state(state); 1831 } 1832 spin_unlock(&tree->lock); 1833 } 1834 1835 static inline u64 __btrfs_get_bio_offset(struct bio *bio, int bio_index) 1836 { 1837 struct bio_vec *bvec = bio->bi_io_vec + bio_index; 1838 1839 return page_offset(bvec->bv_page) + bvec->bv_offset; 1840 } 1841 1842 void extent_cache_csums(struct extent_io_tree *tree, struct bio *bio, int bio_index, 1843 u32 csums[], int count) 1844 { 1845 struct rb_node *node; 1846 struct extent_state *state = NULL; 1847 u64 start; 1848 1849 spin_lock(&tree->lock); 1850 do { 1851 start = __btrfs_get_bio_offset(bio, bio_index); 1852 if (state == NULL || state->start != start) { 1853 node = tree_search(tree, start); 1854 BUG_ON(!node); 1855 1856 state = rb_entry(node, struct extent_state, rb_node); 1857 BUG_ON(state->start != start); 1858 } 1859 state->private = *csums++; 1860 count--; 1861 bio_index++; 1862 1863 state = next_state(state); 1864 } while (count); 1865 spin_unlock(&tree->lock); 1866 } 1867 1868 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) 1869 { 1870 struct rb_node *node; 1871 struct extent_state *state; 1872 int ret = 0; 1873 1874 spin_lock(&tree->lock); 1875 /* 1876 * this search will find all the extents that end after 1877 * our range starts. 1878 */ 1879 node = tree_search(tree, start); 1880 if (!node) { 1881 ret = -ENOENT; 1882 goto out; 1883 } 1884 state = rb_entry(node, struct extent_state, rb_node); 1885 if (state->start != start) { 1886 ret = -ENOENT; 1887 goto out; 1888 } 1889 *private = state->private; 1890 out: 1891 spin_unlock(&tree->lock); 1892 return ret; 1893 } 1894 1895 /* 1896 * searches a range in the state tree for a given mask. 1897 * If 'filled' == 1, this returns 1 only if every extent in the tree 1898 * has the bits set. Otherwise, 1 is returned if any bit in the 1899 * range is found set. 
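 *
 * Example from find_lock_delalloc_range() above, checking that a locked
 * range is still entirely delalloc:
 *
 *      ret = test_range_bit(tree, delalloc_start, delalloc_end,
 *                           EXTENT_DELALLOC, 1, cached_state);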
1900 */ 1901 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 1902 unsigned long bits, int filled, struct extent_state *cached) 1903 { 1904 struct extent_state *state = NULL; 1905 struct rb_node *node; 1906 int bitset = 0; 1907 1908 spin_lock(&tree->lock); 1909 if (cached && cached->tree && cached->start <= start && 1910 cached->end > start) 1911 node = &cached->rb_node; 1912 else 1913 node = tree_search(tree, start); 1914 while (node && start <= end) { 1915 state = rb_entry(node, struct extent_state, rb_node); 1916 1917 if (filled && state->start > start) { 1918 bitset = 0; 1919 break; 1920 } 1921 1922 if (state->start > end) 1923 break; 1924 1925 if (state->state & bits) { 1926 bitset = 1; 1927 if (!filled) 1928 break; 1929 } else if (filled) { 1930 bitset = 0; 1931 break; 1932 } 1933 1934 if (state->end == (u64)-1) 1935 break; 1936 1937 start = state->end + 1; 1938 if (start > end) 1939 break; 1940 node = rb_next(node); 1941 if (!node) { 1942 if (filled) 1943 bitset = 0; 1944 break; 1945 } 1946 } 1947 spin_unlock(&tree->lock); 1948 return bitset; 1949 } 1950 1951 /* 1952 * helper function to set a given page up to date if all the 1953 * extents in the tree for that page are up to date 1954 */ 1955 static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) 1956 { 1957 u64 start = page_offset(page); 1958 u64 end = start + PAGE_CACHE_SIZE - 1; 1959 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) 1960 SetPageUptodate(page); 1961 } 1962 1963 /* 1964 * When IO fails, either with EIO or csum verification fails, we 1965 * try other mirrors that might have a good copy of the data. This 1966 * io_failure_record is used to record state as we go through all the 1967 * mirrors. If another mirror has good data, the page is set up to date 1968 * and things continue. If a good mirror can't be found, the original 1969 * bio end_io callback is called to indicate things have failed. 1970 */ 1971 struct io_failure_record { 1972 struct page *page; 1973 u64 start; 1974 u64 len; 1975 u64 logical; 1976 unsigned long bio_flags; 1977 int this_mirror; 1978 int failed_mirror; 1979 int in_validation; 1980 }; 1981 1982 static int free_io_failure(struct inode *inode, struct io_failure_record *rec, 1983 int did_repair) 1984 { 1985 int ret; 1986 int err = 0; 1987 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 1988 1989 set_state_private(failure_tree, rec->start, 0); 1990 ret = clear_extent_bits(failure_tree, rec->start, 1991 rec->start + rec->len - 1, 1992 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); 1993 if (ret) 1994 err = ret; 1995 1996 ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, 1997 rec->start + rec->len - 1, 1998 EXTENT_DAMAGED, GFP_NOFS); 1999 if (ret && !err) 2000 err = ret; 2001 2002 kfree(rec); 2003 return err; 2004 } 2005 2006 static void repair_io_failure_callback(struct bio *bio, int err) 2007 { 2008 complete(bio->bi_private); 2009 } 2010 2011 /* 2012 * this bypasses the standard btrfs submit functions deliberately, as 2013 * the standard behavior is to write all copies in a raid setup. here we only 2014 * want to write the one bad copy. so we do the mapping for ourselves and issue 2015 * submit_bio directly. 2016 * to avoid any synchronization issues, wait for the data after writing, which 2017 * actually prevents the read that triggered the error from finishing. 2018 * currently, there can be no more than two copies of every data bit. thus, 2019 * exactly one rewrite is required. 
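 * returns 0 on success (or when the rewrite is silently skipped, as for
 * raid56), and -EIO if the good copy could not be written out.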
2020 */ 2021 int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, 2022 u64 length, u64 logical, struct page *page, 2023 int mirror_num) 2024 { 2025 struct bio *bio; 2026 struct btrfs_device *dev; 2027 DECLARE_COMPLETION_ONSTACK(compl); 2028 u64 map_length = 0; 2029 u64 sector; 2030 struct btrfs_bio *bbio = NULL; 2031 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 2032 int ret; 2033 2034 BUG_ON(!mirror_num); 2035 2036 /* we can't repair anything in raid56 yet */ 2037 if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num)) 2038 return 0; 2039 2040 bio = btrfs_io_bio_alloc(GFP_NOFS, 1); 2041 if (!bio) 2042 return -EIO; 2043 bio->bi_private = &compl; 2044 bio->bi_end_io = repair_io_failure_callback; 2045 bio->bi_size = 0; 2046 map_length = length; 2047 2048 ret = btrfs_map_block(fs_info, WRITE, logical, 2049 &map_length, &bbio, mirror_num); 2050 if (ret) { 2051 bio_put(bio); 2052 return -EIO; 2053 } 2054 BUG_ON(mirror_num != bbio->mirror_num); 2055 sector = bbio->stripes[mirror_num-1].physical >> 9; 2056 bio->bi_sector = sector; 2057 dev = bbio->stripes[mirror_num-1].dev; 2058 kfree(bbio); 2059 if (!dev || !dev->bdev || !dev->writeable) { 2060 bio_put(bio); 2061 return -EIO; 2062 } 2063 bio->bi_bdev = dev->bdev; 2064 bio_add_page(bio, page, length, start - page_offset(page)); 2065 btrfsic_submit_bio(WRITE_SYNC, bio); 2066 wait_for_completion(&compl); 2067 2068 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { 2069 /* try to remap that extent elsewhere? */ 2070 bio_put(bio); 2071 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); 2072 return -EIO; 2073 } 2074 2075 printk_ratelimited_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu " 2076 "(dev %s sector %llu)\n", page->mapping->host->i_ino, 2077 start, rcu_str_deref(dev->name), sector); 2078 2079 bio_put(bio); 2080 return 0; 2081 } 2082 2083 int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, 2084 int mirror_num) 2085 { 2086 u64 start = eb->start; 2087 unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); 2088 int ret = 0; 2089 2090 for (i = 0; i < num_pages; i++) { 2091 struct page *p = extent_buffer_page(eb, i); 2092 ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE, 2093 start, p, mirror_num); 2094 if (ret) 2095 break; 2096 start += PAGE_CACHE_SIZE; 2097 } 2098 2099 return ret; 2100 } 2101 2102 /* 2103 * each time an IO finishes, we do a fast check in the IO failure tree 2104 * to see if we need to process or clean up an io_failure_record 2105 */ 2106 static int clean_io_failure(u64 start, struct page *page) 2107 { 2108 u64 private; 2109 u64 private_failure; 2110 struct io_failure_record *failrec; 2111 struct btrfs_fs_info *fs_info; 2112 struct extent_state *state; 2113 int num_copies; 2114 int did_repair = 0; 2115 int ret; 2116 struct inode *inode = page->mapping->host; 2117 2118 private = 0; 2119 ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, 2120 (u64)-1, 1, EXTENT_DIRTY, 0); 2121 if (!ret) 2122 return 0; 2123 2124 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start, 2125 &private_failure); 2126 if (ret) 2127 return 0; 2128 2129 failrec = (struct io_failure_record *)(unsigned long) private_failure; 2130 BUG_ON(!failrec->this_mirror); 2131 2132 if (failrec->in_validation) { 2133 /* there was no real error, just free the record */ 2134 pr_debug("clean_io_failure: freeing dummy error at %llu\n", 2135 failrec->start); 2136 did_repair = 1; 2137 goto out; 2138 } 2139 2140 
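        /*
         * Only attempt the rewrite while the failed range is still locked
         * in the inode's io_tree and more than one copy exists; either way
         * the failure record is freed via free_io_failure() below.
         */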
spin_lock(&BTRFS_I(inode)->io_tree.lock); 2141 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, 2142 failrec->start, 2143 EXTENT_LOCKED); 2144 spin_unlock(&BTRFS_I(inode)->io_tree.lock); 2145 2146 if (state && state->start == failrec->start) { 2147 fs_info = BTRFS_I(inode)->root->fs_info; 2148 num_copies = btrfs_num_copies(fs_info, failrec->logical, 2149 failrec->len); 2150 if (num_copies > 1) { 2151 ret = repair_io_failure(fs_info, start, failrec->len, 2152 failrec->logical, page, 2153 failrec->failed_mirror); 2154 did_repair = !ret; 2155 } 2156 ret = 0; 2157 } 2158 2159 out: 2160 if (!ret) 2161 ret = free_io_failure(inode, failrec, did_repair); 2162 2163 return ret; 2164 } 2165 2166 /* 2167 * this is a generic handler for readpage errors (default 2168 * readpage_io_failed_hook). if other copies exist, read those and write back 2169 * good data to the failed position. does not investigate in remapping the 2170 * failed extent elsewhere, hoping the device will be smart enough to do this as 2171 * needed 2172 */ 2173 2174 static int bio_readpage_error(struct bio *failed_bio, struct page *page, 2175 u64 start, u64 end, int failed_mirror, 2176 struct extent_state *state) 2177 { 2178 struct io_failure_record *failrec = NULL; 2179 u64 private; 2180 struct extent_map *em; 2181 struct inode *inode = page->mapping->host; 2182 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 2183 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 2184 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 2185 struct bio *bio; 2186 int num_copies; 2187 int ret; 2188 int read_mode; 2189 u64 logical; 2190 2191 BUG_ON(failed_bio->bi_rw & REQ_WRITE); 2192 2193 ret = get_state_private(failure_tree, start, &private); 2194 if (ret) { 2195 failrec = kzalloc(sizeof(*failrec), GFP_NOFS); 2196 if (!failrec) 2197 return -ENOMEM; 2198 failrec->start = start; 2199 failrec->len = end - start + 1; 2200 failrec->this_mirror = 0; 2201 failrec->bio_flags = 0; 2202 failrec->in_validation = 0; 2203 2204 read_lock(&em_tree->lock); 2205 em = lookup_extent_mapping(em_tree, start, failrec->len); 2206 if (!em) { 2207 read_unlock(&em_tree->lock); 2208 kfree(failrec); 2209 return -EIO; 2210 } 2211 2212 if (em->start > start || em->start + em->len < start) { 2213 free_extent_map(em); 2214 em = NULL; 2215 } 2216 read_unlock(&em_tree->lock); 2217 2218 if (!em) { 2219 kfree(failrec); 2220 return -EIO; 2221 } 2222 logical = start - em->start; 2223 logical = em->block_start + logical; 2224 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 2225 logical = em->block_start; 2226 failrec->bio_flags = EXTENT_BIO_COMPRESSED; 2227 extent_set_compress_type(&failrec->bio_flags, 2228 em->compress_type); 2229 } 2230 pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, " 2231 "len=%llu\n", logical, start, failrec->len); 2232 failrec->logical = logical; 2233 free_extent_map(em); 2234 2235 /* set the bits in the private failure tree */ 2236 ret = set_extent_bits(failure_tree, start, end, 2237 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); 2238 if (ret >= 0) 2239 ret = set_state_private(failure_tree, start, 2240 (u64)(unsigned long)failrec); 2241 /* set the bits in the inode's tree */ 2242 if (ret >= 0) 2243 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED, 2244 GFP_NOFS); 2245 if (ret < 0) { 2246 kfree(failrec); 2247 return ret; 2248 } 2249 } else { 2250 failrec = (struct io_failure_record *)(unsigned long)private; 2251 pr_debug("bio_readpage_error: (found) logical=%llu, " 2252 "start=%llu, len=%llu, 
validation=%d\n", 2253 failrec->logical, failrec->start, failrec->len, 2254 failrec->in_validation); 2255 /* 2256 * when data can be on disk more than twice, add to failrec here 2257 * (e.g. with a list for failed_mirror) to make 2258 * clean_io_failure() clean all those errors at once. 2259 */ 2260 } 2261 num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info, 2262 failrec->logical, failrec->len); 2263 if (num_copies == 1) { 2264 /* 2265 * we only have a single copy of the data, so don't bother with 2266 * all the retry and error correction code that follows. no 2267 * matter what the error is, it is very likely to persist. 2268 */ 2269 pr_debug("bio_readpage_error: cannot repair, num_copies == 1. " 2270 "state=%p, num_copies=%d, next_mirror %d, " 2271 "failed_mirror %d\n", state, num_copies, 2272 failrec->this_mirror, failed_mirror); 2273 free_io_failure(inode, failrec, 0); 2274 return -EIO; 2275 } 2276 2277 if (!state) { 2278 spin_lock(&tree->lock); 2279 state = find_first_extent_bit_state(tree, failrec->start, 2280 EXTENT_LOCKED); 2281 if (state && state->start != failrec->start) 2282 state = NULL; 2283 spin_unlock(&tree->lock); 2284 } 2285 2286 /* 2287 * there are two premises: 2288 * a) deliver good data to the caller 2289 * b) correct the bad sectors on disk 2290 */ 2291 if (failed_bio->bi_vcnt > 1) { 2292 /* 2293 * to fulfill b), we need to know the exact failing sectors, as 2294 * we don't want to rewrite any more than the failed ones. thus, 2295 * we need separate read requests for the failed bio 2296 * 2297 * if the following BUG_ON triggers, our validation request got 2298 * merged. we need separate requests for our algorithm to work. 2299 */ 2300 BUG_ON(failrec->in_validation); 2301 failrec->in_validation = 1; 2302 failrec->this_mirror = failed_mirror; 2303 read_mode = READ_SYNC | REQ_FAILFAST_DEV; 2304 } else { 2305 /* 2306 * we're ready to fulfill a) and b) alongside. get a good copy 2307 * of the failed sector and if we succeed, we have setup 2308 * everything for repair_io_failure to do the rest for us. 
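		 * this_mirror is advanced below and bumped once more if it
		 * would land on failed_mirror, so the retry always goes to a
		 * different copy.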
2309 */ 2310 if (failrec->in_validation) { 2311 BUG_ON(failrec->this_mirror != failed_mirror); 2312 failrec->in_validation = 0; 2313 failrec->this_mirror = 0; 2314 } 2315 failrec->failed_mirror = failed_mirror; 2316 failrec->this_mirror++; 2317 if (failrec->this_mirror == failed_mirror) 2318 failrec->this_mirror++; 2319 read_mode = READ_SYNC; 2320 } 2321 2322 if (!state || failrec->this_mirror > num_copies) { 2323 pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, " 2324 "next_mirror %d, failed_mirror %d\n", state, 2325 num_copies, failrec->this_mirror, failed_mirror); 2326 free_io_failure(inode, failrec, 0); 2327 return -EIO; 2328 } 2329 2330 bio = btrfs_io_bio_alloc(GFP_NOFS, 1); 2331 if (!bio) { 2332 free_io_failure(inode, failrec, 0); 2333 return -EIO; 2334 } 2335 bio->bi_private = state; 2336 bio->bi_end_io = failed_bio->bi_end_io; 2337 bio->bi_sector = failrec->logical >> 9; 2338 bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; 2339 bio->bi_size = 0; 2340 2341 bio_add_page(bio, page, failrec->len, start - page_offset(page)); 2342 2343 pr_debug("bio_readpage_error: submitting new read[%#x] to " 2344 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode, 2345 failrec->this_mirror, num_copies, failrec->in_validation); 2346 2347 ret = tree->ops->submit_bio_hook(inode, read_mode, bio, 2348 failrec->this_mirror, 2349 failrec->bio_flags, 0); 2350 return ret; 2351 } 2352 2353 /* lots and lots of room for performance fixes in the end_bio funcs */ 2354 2355 int end_extent_writepage(struct page *page, int err, u64 start, u64 end) 2356 { 2357 int uptodate = (err == 0); 2358 struct extent_io_tree *tree; 2359 int ret; 2360 2361 tree = &BTRFS_I(page->mapping->host)->io_tree; 2362 2363 if (tree->ops && tree->ops->writepage_end_io_hook) { 2364 ret = tree->ops->writepage_end_io_hook(page, start, 2365 end, NULL, uptodate); 2366 if (ret) 2367 uptodate = 0; 2368 } 2369 2370 if (!uptodate) { 2371 ClearPageUptodate(page); 2372 SetPageError(page); 2373 } 2374 return 0; 2375 } 2376 2377 /* 2378 * after a writepage IO is done, we need to: 2379 * clear the uptodate bits on error 2380 * clear the writeback bits in the extent tree for this IO 2381 * end_page_writeback if the page has no more pending IO 2382 * 2383 * Scheduling is not allowed, so the extent state tree is expected 2384 * to have one and only one object corresponding to this IO. 2385 */ 2386 static void end_bio_extent_writepage(struct bio *bio, int err) 2387 { 2388 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 2389 struct extent_io_tree *tree; 2390 u64 start; 2391 u64 end; 2392 2393 do { 2394 struct page *page = bvec->bv_page; 2395 tree = &BTRFS_I(page->mapping->host)->io_tree; 2396 2397 /* We always issue full-page reads, but if some block 2398 * in a page fails to read, blk_update_request() will 2399 * advance bv_offset and adjust bv_len to compensate. 2400 * Print a warning for nonzero offsets, and an error 2401 * if they don't add up to a full page. */ 2402 if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) 2403 printk("%s page write in btrfs with offset %u and length %u\n", 2404 bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE 2405 ? 
KERN_ERR "partial" : KERN_INFO "incomplete", 2406 bvec->bv_offset, bvec->bv_len); 2407 2408 start = page_offset(page); 2409 end = start + bvec->bv_offset + bvec->bv_len - 1; 2410 2411 if (--bvec >= bio->bi_io_vec) 2412 prefetchw(&bvec->bv_page->flags); 2413 2414 if (end_extent_writepage(page, err, start, end)) 2415 continue; 2416 2417 end_page_writeback(page); 2418 } while (bvec >= bio->bi_io_vec); 2419 2420 bio_put(bio); 2421 } 2422 2423 /* 2424 * after a readpage IO is done, we need to: 2425 * clear the uptodate bits on error 2426 * set the uptodate bits if things worked 2427 * set the page up to date if all extents in the tree are uptodate 2428 * clear the lock bit in the extent tree 2429 * unlock the page if there are no other extents locked for it 2430 * 2431 * Scheduling is not allowed, so the extent state tree is expected 2432 * to have one and only one object corresponding to this IO. 2433 */ 2434 static void end_bio_extent_readpage(struct bio *bio, int err) 2435 { 2436 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 2437 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; 2438 struct bio_vec *bvec = bio->bi_io_vec; 2439 struct extent_io_tree *tree; 2440 u64 start; 2441 u64 end; 2442 int mirror; 2443 int ret; 2444 2445 if (err) 2446 uptodate = 0; 2447 2448 do { 2449 struct page *page = bvec->bv_page; 2450 struct extent_state *cached = NULL; 2451 struct extent_state *state; 2452 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 2453 2454 pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " 2455 "mirror=%lu\n", (u64)bio->bi_sector, err, 2456 io_bio->mirror_num); 2457 tree = &BTRFS_I(page->mapping->host)->io_tree; 2458 2459 /* We always issue full-page reads, but if some block 2460 * in a page fails to read, blk_update_request() will 2461 * advance bv_offset and adjust bv_len to compensate. 2462 * Print a warning for nonzero offsets, and an error 2463 * if they don't add up to a full page. */ 2464 if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) 2465 printk("%s page read in btrfs with offset %u and length %u\n", 2466 bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE 2467 ? KERN_ERR "partial" : KERN_INFO "incomplete", 2468 bvec->bv_offset, bvec->bv_len); 2469 2470 start = page_offset(page); 2471 end = start + bvec->bv_offset + bvec->bv_len - 1; 2472 2473 if (++bvec <= bvec_end) 2474 prefetchw(&bvec->bv_page->flags); 2475 2476 spin_lock(&tree->lock); 2477 state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED); 2478 if (state && state->start == start) { 2479 /* 2480 * take a reference on the state, unlock will drop 2481 * the ref 2482 */ 2483 cache_state(state, &cached); 2484 } 2485 spin_unlock(&tree->lock); 2486 2487 mirror = io_bio->mirror_num; 2488 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { 2489 ret = tree->ops->readpage_end_io_hook(page, start, end, 2490 state, mirror); 2491 if (ret) 2492 uptodate = 0; 2493 else 2494 clean_io_failure(start, page); 2495 } 2496 2497 if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) { 2498 ret = tree->ops->readpage_io_failed_hook(page, mirror); 2499 if (!ret && !err && 2500 test_bit(BIO_UPTODATE, &bio->bi_flags)) 2501 uptodate = 1; 2502 } else if (!uptodate) { 2503 /* 2504 * The generic bio_readpage_error handles errors the 2505 * following way: If possible, new read requests are 2506 * created and submitted and will end up in 2507 * end_bio_extent_readpage as well (if we're lucky, not 2508 * in the !uptodate case). 
In that case it returns 0 and 2509 * we just go on with the next page in our bio. If it 2510 * can't handle the error it will return -EIO and we 2511 * remain responsible for that page. 2512 */ 2513 ret = bio_readpage_error(bio, page, start, end, mirror, NULL); 2514 if (ret == 0) { 2515 uptodate = 2516 test_bit(BIO_UPTODATE, &bio->bi_flags); 2517 if (err) 2518 uptodate = 0; 2519 uncache_state(&cached); 2520 continue; 2521 } 2522 } 2523 2524 if (uptodate && tree->track_uptodate) { 2525 set_extent_uptodate(tree, start, end, &cached, 2526 GFP_ATOMIC); 2527 } 2528 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); 2529 2530 if (uptodate) { 2531 SetPageUptodate(page); 2532 } else { 2533 ClearPageUptodate(page); 2534 SetPageError(page); 2535 } 2536 unlock_page(page); 2537 } while (bvec <= bvec_end); 2538 2539 bio_put(bio); 2540 } 2541 2542 /* 2543 * this allocates from the btrfs_bioset. We're returning a bio right now 2544 * but you can call btrfs_io_bio for the appropriate container_of magic 2545 */ 2546 struct bio * 2547 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 2548 gfp_t gfp_flags) 2549 { 2550 struct bio *bio; 2551 2552 bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset); 2553 2554 if (bio == NULL && (current->flags & PF_MEMALLOC)) { 2555 while (!bio && (nr_vecs /= 2)) { 2556 bio = bio_alloc_bioset(gfp_flags, 2557 nr_vecs, btrfs_bioset); 2558 } 2559 } 2560 2561 if (bio) { 2562 bio->bi_size = 0; 2563 bio->bi_bdev = bdev; 2564 bio->bi_sector = first_sector; 2565 } 2566 return bio; 2567 } 2568 2569 struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) 2570 { 2571 return bio_clone_bioset(bio, gfp_mask, btrfs_bioset); 2572 } 2573 2574 2575 /* this also allocates from the btrfs_bioset */ 2576 struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) 2577 { 2578 return bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset); 2579 } 2580 2581 2582 static int __must_check submit_one_bio(int rw, struct bio *bio, 2583 int mirror_num, unsigned long bio_flags) 2584 { 2585 int ret = 0; 2586 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 2587 struct page *page = bvec->bv_page; 2588 struct extent_io_tree *tree = bio->bi_private; 2589 u64 start; 2590 2591 start = page_offset(page) + bvec->bv_offset; 2592 2593 bio->bi_private = NULL; 2594 2595 bio_get(bio); 2596 2597 if (tree->ops && tree->ops->submit_bio_hook) 2598 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 2599 mirror_num, bio_flags, start); 2600 else 2601 btrfsic_submit_bio(rw, bio); 2602 2603 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 2604 ret = -EOPNOTSUPP; 2605 bio_put(bio); 2606 return ret; 2607 } 2608 2609 static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page, 2610 unsigned long offset, size_t size, struct bio *bio, 2611 unsigned long bio_flags) 2612 { 2613 int ret = 0; 2614 if (tree->ops && tree->ops->merge_bio_hook) 2615 ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio, 2616 bio_flags); 2617 BUG_ON(ret < 0); 2618 return ret; 2619 2620 } 2621 2622 static int submit_extent_page(int rw, struct extent_io_tree *tree, 2623 struct page *page, sector_t sector, 2624 size_t size, unsigned long offset, 2625 struct block_device *bdev, 2626 struct bio **bio_ret, 2627 unsigned long max_pages, 2628 bio_end_io_t end_io_func, 2629 int mirror_num, 2630 unsigned long prev_bio_flags, 2631 unsigned long bio_flags) 2632 { 2633 int ret = 0; 2634 struct bio *bio; 2635 int nr; 2636 int contig = 0; 2637 int this_compressed = bio_flags & 
EXTENT_BIO_COMPRESSED; 2638 int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; 2639 size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE); 2640 2641 if (bio_ret && *bio_ret) { 2642 bio = *bio_ret; 2643 if (old_compressed) 2644 contig = bio->bi_sector == sector; 2645 else 2646 contig = bio_end_sector(bio) == sector; 2647 2648 if (prev_bio_flags != bio_flags || !contig || 2649 merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) || 2650 bio_add_page(bio, page, page_size, offset) < page_size) { 2651 ret = submit_one_bio(rw, bio, mirror_num, 2652 prev_bio_flags); 2653 if (ret < 0) 2654 return ret; 2655 bio = NULL; 2656 } else { 2657 return 0; 2658 } 2659 } 2660 if (this_compressed) 2661 nr = BIO_MAX_PAGES; 2662 else 2663 nr = bio_get_nr_vecs(bdev); 2664 2665 bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); 2666 if (!bio) 2667 return -ENOMEM; 2668 2669 bio_add_page(bio, page, page_size, offset); 2670 bio->bi_end_io = end_io_func; 2671 bio->bi_private = tree; 2672 2673 if (bio_ret) 2674 *bio_ret = bio; 2675 else 2676 ret = submit_one_bio(rw, bio, mirror_num, bio_flags); 2677 2678 return ret; 2679 } 2680 2681 static void attach_extent_buffer_page(struct extent_buffer *eb, 2682 struct page *page) 2683 { 2684 if (!PagePrivate(page)) { 2685 SetPagePrivate(page); 2686 page_cache_get(page); 2687 set_page_private(page, (unsigned long)eb); 2688 } else { 2689 WARN_ON(page->private != (unsigned long)eb); 2690 } 2691 } 2692 2693 void set_page_extent_mapped(struct page *page) 2694 { 2695 if (!PagePrivate(page)) { 2696 SetPagePrivate(page); 2697 page_cache_get(page); 2698 set_page_private(page, EXTENT_PAGE_PRIVATE); 2699 } 2700 } 2701 2702 /* 2703 * basic readpage implementation. Locked extent state structs are inserted 2704 * into the tree that are removed when the IO is done (by the end_io 2705 * handlers) 2706 * XXX JDM: This needs looking at to ensure proper page locking 2707 */ 2708 static int __extent_read_full_page(struct extent_io_tree *tree, 2709 struct page *page, 2710 get_extent_t *get_extent, 2711 struct bio **bio, int mirror_num, 2712 unsigned long *bio_flags, int rw) 2713 { 2714 struct inode *inode = page->mapping->host; 2715 u64 start = page_offset(page); 2716 u64 page_end = start + PAGE_CACHE_SIZE - 1; 2717 u64 end; 2718 u64 cur = start; 2719 u64 extent_offset; 2720 u64 last_byte = i_size_read(inode); 2721 u64 block_start; 2722 u64 cur_end; 2723 sector_t sector; 2724 struct extent_map *em; 2725 struct block_device *bdev; 2726 struct btrfs_ordered_extent *ordered; 2727 int ret; 2728 int nr = 0; 2729 size_t pg_offset = 0; 2730 size_t iosize; 2731 size_t disk_io_size; 2732 size_t blocksize = inode->i_sb->s_blocksize; 2733 unsigned long this_bio_flag = 0; 2734 2735 set_page_extent_mapped(page); 2736 2737 if (!PageUptodate(page)) { 2738 if (cleancache_get_page(page) == 0) { 2739 BUG_ON(blocksize != PAGE_SIZE); 2740 goto out; 2741 } 2742 } 2743 2744 end = page_end; 2745 while (1) { 2746 lock_extent(tree, start, end); 2747 ordered = btrfs_lookup_ordered_extent(inode, start); 2748 if (!ordered) 2749 break; 2750 unlock_extent(tree, start, end); 2751 btrfs_start_ordered_extent(inode, ordered, 1); 2752 btrfs_put_ordered_extent(ordered); 2753 } 2754 2755 if (page->index == last_byte >> PAGE_CACHE_SHIFT) { 2756 char *userpage; 2757 size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1); 2758 2759 if (zero_offset) { 2760 iosize = PAGE_CACHE_SIZE - zero_offset; 2761 userpage = kmap_atomic(page); 2762 memset(userpage + zero_offset, 0, iosize); 2763 flush_dcache_page(page); 
2764 kunmap_atomic(userpage); 2765 } 2766 } 2767 while (cur <= end) { 2768 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; 2769 2770 if (cur >= last_byte) { 2771 char *userpage; 2772 struct extent_state *cached = NULL; 2773 2774 iosize = PAGE_CACHE_SIZE - pg_offset; 2775 userpage = kmap_atomic(page); 2776 memset(userpage + pg_offset, 0, iosize); 2777 flush_dcache_page(page); 2778 kunmap_atomic(userpage); 2779 set_extent_uptodate(tree, cur, cur + iosize - 1, 2780 &cached, GFP_NOFS); 2781 unlock_extent_cached(tree, cur, cur + iosize - 1, 2782 &cached, GFP_NOFS); 2783 break; 2784 } 2785 em = get_extent(inode, page, pg_offset, cur, 2786 end - cur + 1, 0); 2787 if (IS_ERR_OR_NULL(em)) { 2788 SetPageError(page); 2789 unlock_extent(tree, cur, end); 2790 break; 2791 } 2792 extent_offset = cur - em->start; 2793 BUG_ON(extent_map_end(em) <= cur); 2794 BUG_ON(end < cur); 2795 2796 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 2797 this_bio_flag = EXTENT_BIO_COMPRESSED; 2798 extent_set_compress_type(&this_bio_flag, 2799 em->compress_type); 2800 } 2801 2802 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2803 cur_end = min(extent_map_end(em) - 1, end); 2804 iosize = ALIGN(iosize, blocksize); 2805 if (this_bio_flag & EXTENT_BIO_COMPRESSED) { 2806 disk_io_size = em->block_len; 2807 sector = em->block_start >> 9; 2808 } else { 2809 sector = (em->block_start + extent_offset) >> 9; 2810 disk_io_size = iosize; 2811 } 2812 bdev = em->bdev; 2813 block_start = em->block_start; 2814 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 2815 block_start = EXTENT_MAP_HOLE; 2816 free_extent_map(em); 2817 em = NULL; 2818 2819 /* we've found a hole, just zero and go on */ 2820 if (block_start == EXTENT_MAP_HOLE) { 2821 char *userpage; 2822 struct extent_state *cached = NULL; 2823 2824 userpage = kmap_atomic(page); 2825 memset(userpage + pg_offset, 0, iosize); 2826 flush_dcache_page(page); 2827 kunmap_atomic(userpage); 2828 2829 set_extent_uptodate(tree, cur, cur + iosize - 1, 2830 &cached, GFP_NOFS); 2831 unlock_extent_cached(tree, cur, cur + iosize - 1, 2832 &cached, GFP_NOFS); 2833 cur = cur + iosize; 2834 pg_offset += iosize; 2835 continue; 2836 } 2837 /* the get_extent function already copied into the page */ 2838 if (test_range_bit(tree, cur, cur_end, 2839 EXTENT_UPTODATE, 1, NULL)) { 2840 check_page_uptodate(tree, page); 2841 unlock_extent(tree, cur, cur + iosize - 1); 2842 cur = cur + iosize; 2843 pg_offset += iosize; 2844 continue; 2845 } 2846 /* we have an inline extent but it didn't get marked up 2847 * to date. 
Error out 2848 */ 2849 if (block_start == EXTENT_MAP_INLINE) { 2850 SetPageError(page); 2851 unlock_extent(tree, cur, cur + iosize - 1); 2852 cur = cur + iosize; 2853 pg_offset += iosize; 2854 continue; 2855 } 2856 2857 pnr -= page->index; 2858 ret = submit_extent_page(rw, tree, page, 2859 sector, disk_io_size, pg_offset, 2860 bdev, bio, pnr, 2861 end_bio_extent_readpage, mirror_num, 2862 *bio_flags, 2863 this_bio_flag); 2864 if (!ret) { 2865 nr++; 2866 *bio_flags = this_bio_flag; 2867 } else { 2868 SetPageError(page); 2869 unlock_extent(tree, cur, cur + iosize - 1); 2870 } 2871 cur = cur + iosize; 2872 pg_offset += iosize; 2873 } 2874 out: 2875 if (!nr) { 2876 if (!PageError(page)) 2877 SetPageUptodate(page); 2878 unlock_page(page); 2879 } 2880 return 0; 2881 } 2882 2883 int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 2884 get_extent_t *get_extent, int mirror_num) 2885 { 2886 struct bio *bio = NULL; 2887 unsigned long bio_flags = 0; 2888 int ret; 2889 2890 ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, 2891 &bio_flags, READ); 2892 if (bio) 2893 ret = submit_one_bio(READ, bio, mirror_num, bio_flags); 2894 return ret; 2895 } 2896 2897 static noinline void update_nr_written(struct page *page, 2898 struct writeback_control *wbc, 2899 unsigned long nr_written) 2900 { 2901 wbc->nr_to_write -= nr_written; 2902 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && 2903 wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) 2904 page->mapping->writeback_index = page->index + nr_written; 2905 } 2906 2907 /* 2908 * the writepage semantics are similar to regular writepage. extent 2909 * records are inserted to lock ranges in the tree, and as dirty areas 2910 * are found, they are marked writeback. Then the lock bits are removed 2911 * and the end_io handler clears the writeback ranges 2912 */ 2913 static int __extent_writepage(struct page *page, struct writeback_control *wbc, 2914 void *data) 2915 { 2916 struct inode *inode = page->mapping->host; 2917 struct extent_page_data *epd = data; 2918 struct extent_io_tree *tree = epd->tree; 2919 u64 start = page_offset(page); 2920 u64 delalloc_start; 2921 u64 page_end = start + PAGE_CACHE_SIZE - 1; 2922 u64 end; 2923 u64 cur = start; 2924 u64 extent_offset; 2925 u64 last_byte = i_size_read(inode); 2926 u64 block_start; 2927 u64 iosize; 2928 sector_t sector; 2929 struct extent_state *cached_state = NULL; 2930 struct extent_map *em; 2931 struct block_device *bdev; 2932 int ret; 2933 int nr = 0; 2934 size_t pg_offset = 0; 2935 size_t blocksize; 2936 loff_t i_size = i_size_read(inode); 2937 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; 2938 u64 nr_delalloc; 2939 u64 delalloc_end; 2940 int page_started; 2941 int compressed; 2942 int write_flags; 2943 unsigned long nr_written = 0; 2944 bool fill_delalloc = true; 2945 2946 if (wbc->sync_mode == WB_SYNC_ALL) 2947 write_flags = WRITE_SYNC; 2948 else 2949 write_flags = WRITE; 2950 2951 trace___extent_writepage(page, inode, wbc); 2952 2953 WARN_ON(!PageLocked(page)); 2954 2955 ClearPageError(page); 2956 2957 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2958 if (page->index > end_index || 2959 (page->index == end_index && !pg_offset)) { 2960 page->mapping->a_ops->invalidatepage(page, 0); 2961 unlock_page(page); 2962 return 0; 2963 } 2964 2965 if (page->index == end_index) { 2966 char *userpage; 2967 2968 userpage = kmap_atomic(page); 2969 memset(userpage + pg_offset, 0, 2970 PAGE_CACHE_SIZE - pg_offset); 2971 kunmap_atomic(userpage); 2972 flush_dcache_page(page); 
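		/*
		 * the tail of the final page past i_size has now been zeroed
		 * so stale data is never written to disk
		 */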
2973 } 2974 pg_offset = 0; 2975 2976 set_page_extent_mapped(page); 2977 2978 if (!tree->ops || !tree->ops->fill_delalloc) 2979 fill_delalloc = false; 2980 2981 delalloc_start = start; 2982 delalloc_end = 0; 2983 page_started = 0; 2984 if (!epd->extent_locked && fill_delalloc) { 2985 u64 delalloc_to_write = 0; 2986 /* 2987 * make sure the wbc mapping index is at least updated 2988 * to this page. 2989 */ 2990 update_nr_written(page, wbc, 0); 2991 2992 while (delalloc_end < page_end) { 2993 nr_delalloc = find_lock_delalloc_range(inode, tree, 2994 page, 2995 &delalloc_start, 2996 &delalloc_end, 2997 128 * 1024 * 1024); 2998 if (nr_delalloc == 0) { 2999 delalloc_start = delalloc_end + 1; 3000 continue; 3001 } 3002 ret = tree->ops->fill_delalloc(inode, page, 3003 delalloc_start, 3004 delalloc_end, 3005 &page_started, 3006 &nr_written); 3007 /* File system has been set read-only */ 3008 if (ret) { 3009 SetPageError(page); 3010 goto done; 3011 } 3012 /* 3013 * delalloc_end is already one less than the total 3014 * length, so we don't subtract one from 3015 * PAGE_CACHE_SIZE 3016 */ 3017 delalloc_to_write += (delalloc_end - delalloc_start + 3018 PAGE_CACHE_SIZE) >> 3019 PAGE_CACHE_SHIFT; 3020 delalloc_start = delalloc_end + 1; 3021 } 3022 if (wbc->nr_to_write < delalloc_to_write) { 3023 int thresh = 8192; 3024 3025 if (delalloc_to_write < thresh * 2) 3026 thresh = delalloc_to_write; 3027 wbc->nr_to_write = min_t(u64, delalloc_to_write, 3028 thresh); 3029 } 3030 3031 /* did the fill delalloc function already unlock and start 3032 * the IO? 3033 */ 3034 if (page_started) { 3035 ret = 0; 3036 /* 3037 * we've unlocked the page, so we can't update 3038 * the mapping's writeback index, just update 3039 * nr_to_write. 3040 */ 3041 wbc->nr_to_write -= nr_written; 3042 goto done_unlocked; 3043 } 3044 } 3045 if (tree->ops && tree->ops->writepage_start_hook) { 3046 ret = tree->ops->writepage_start_hook(page, start, 3047 page_end); 3048 if (ret) { 3049 /* Fixup worker will requeue */ 3050 if (ret == -EBUSY) 3051 wbc->pages_skipped++; 3052 else 3053 redirty_page_for_writepage(wbc, page); 3054 update_nr_written(page, wbc, nr_written); 3055 unlock_page(page); 3056 ret = 0; 3057 goto done_unlocked; 3058 } 3059 } 3060 3061 /* 3062 * we don't want to touch the inode after unlocking the page, 3063 * so we update the mapping writeback index now 3064 */ 3065 update_nr_written(page, wbc, nr_written + 1); 3066 3067 end = page_end; 3068 if (last_byte <= start) { 3069 if (tree->ops && tree->ops->writepage_end_io_hook) 3070 tree->ops->writepage_end_io_hook(page, start, 3071 page_end, NULL, 1); 3072 goto done; 3073 } 3074 3075 blocksize = inode->i_sb->s_blocksize; 3076 3077 while (cur <= end) { 3078 if (cur >= last_byte) { 3079 if (tree->ops && tree->ops->writepage_end_io_hook) 3080 tree->ops->writepage_end_io_hook(page, cur, 3081 page_end, NULL, 1); 3082 break; 3083 } 3084 em = epd->get_extent(inode, page, pg_offset, cur, 3085 end - cur + 1, 1); 3086 if (IS_ERR_OR_NULL(em)) { 3087 SetPageError(page); 3088 break; 3089 } 3090 3091 extent_offset = cur - em->start; 3092 BUG_ON(extent_map_end(em) <= cur); 3093 BUG_ON(end < cur); 3094 iosize = min(extent_map_end(em) - cur, end - cur + 1); 3095 iosize = ALIGN(iosize, blocksize); 3096 sector = (em->block_start + extent_offset) >> 9; 3097 bdev = em->bdev; 3098 block_start = em->block_start; 3099 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 3100 free_extent_map(em); 3101 em = NULL; 3102 3103 /* 3104 * compressed and inline extents are written through other 3105 * paths 
in the FS 3106 */ 3107 if (compressed || block_start == EXTENT_MAP_HOLE || 3108 block_start == EXTENT_MAP_INLINE) { 3109 /* 3110 * end_io notification does not happen here for 3111 * compressed extents 3112 */ 3113 if (!compressed && tree->ops && 3114 tree->ops->writepage_end_io_hook) 3115 tree->ops->writepage_end_io_hook(page, cur, 3116 cur + iosize - 1, 3117 NULL, 1); 3118 else if (compressed) { 3119 /* we don't want to end_page_writeback on 3120 * a compressed extent. this happens 3121 * elsewhere 3122 */ 3123 nr++; 3124 } 3125 3126 cur += iosize; 3127 pg_offset += iosize; 3128 continue; 3129 } 3130 /* leave this out until we have a page_mkwrite call */ 3131 if (0 && !test_range_bit(tree, cur, cur + iosize - 1, 3132 EXTENT_DIRTY, 0, NULL)) { 3133 cur = cur + iosize; 3134 pg_offset += iosize; 3135 continue; 3136 } 3137 3138 if (tree->ops && tree->ops->writepage_io_hook) { 3139 ret = tree->ops->writepage_io_hook(page, cur, 3140 cur + iosize - 1); 3141 } else { 3142 ret = 0; 3143 } 3144 if (ret) { 3145 SetPageError(page); 3146 } else { 3147 unsigned long max_nr = end_index + 1; 3148 3149 set_range_writeback(tree, cur, cur + iosize - 1); 3150 if (!PageWriteback(page)) { 3151 printk(KERN_ERR "btrfs warning page %lu not " 3152 "writeback, cur %llu end %llu\n", 3153 page->index, (unsigned long long)cur, 3154 (unsigned long long)end); 3155 } 3156 3157 ret = submit_extent_page(write_flags, tree, page, 3158 sector, iosize, pg_offset, 3159 bdev, &epd->bio, max_nr, 3160 end_bio_extent_writepage, 3161 0, 0, 0); 3162 if (ret) 3163 SetPageError(page); 3164 } 3165 cur = cur + iosize; 3166 pg_offset += iosize; 3167 nr++; 3168 } 3169 done: 3170 if (nr == 0) { 3171 /* make sure the mapping tag for page dirty gets cleared */ 3172 set_page_writeback(page); 3173 end_page_writeback(page); 3174 } 3175 unlock_page(page); 3176 3177 done_unlocked: 3178 3179 /* drop our reference on any cached states */ 3180 free_extent_state(cached_state); 3181 return 0; 3182 } 3183 3184 static int eb_wait(void *word) 3185 { 3186 io_schedule(); 3187 return 0; 3188 } 3189 3190 void wait_on_extent_buffer_writeback(struct extent_buffer *eb) 3191 { 3192 wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait, 3193 TASK_UNINTERRUPTIBLE); 3194 } 3195 3196 static int lock_extent_buffer_for_io(struct extent_buffer *eb, 3197 struct btrfs_fs_info *fs_info, 3198 struct extent_page_data *epd) 3199 { 3200 unsigned long i, num_pages; 3201 int flush = 0; 3202 int ret = 0; 3203 3204 if (!btrfs_try_tree_write_lock(eb)) { 3205 flush = 1; 3206 flush_write_bio(epd); 3207 btrfs_tree_lock(eb); 3208 } 3209 3210 if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { 3211 btrfs_tree_unlock(eb); 3212 if (!epd->sync_io) 3213 return 0; 3214 if (!flush) { 3215 flush_write_bio(epd); 3216 flush = 1; 3217 } 3218 while (1) { 3219 wait_on_extent_buffer_writeback(eb); 3220 btrfs_tree_lock(eb); 3221 if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) 3222 break; 3223 btrfs_tree_unlock(eb); 3224 } 3225 } 3226 3227 /* 3228 * We need to do this to prevent races in people who check if the eb is 3229 * under IO since we can end up having no IO bits set for a short period 3230 * of time. 
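	 * clearing DIRTY and setting WRITEBACK below both happen under
	 * eb->refs_lock, so a check done under that lock never observes the
	 * buffer with neither bit set.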
3231 */ 3232 spin_lock(&eb->refs_lock); 3233 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3234 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3235 spin_unlock(&eb->refs_lock); 3236 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 3237 __percpu_counter_add(&fs_info->dirty_metadata_bytes, 3238 -eb->len, 3239 fs_info->dirty_metadata_batch); 3240 ret = 1; 3241 } else { 3242 spin_unlock(&eb->refs_lock); 3243 } 3244 3245 btrfs_tree_unlock(eb); 3246 3247 if (!ret) 3248 return ret; 3249 3250 num_pages = num_extent_pages(eb->start, eb->len); 3251 for (i = 0; i < num_pages; i++) { 3252 struct page *p = extent_buffer_page(eb, i); 3253 3254 if (!trylock_page(p)) { 3255 if (!flush) { 3256 flush_write_bio(epd); 3257 flush = 1; 3258 } 3259 lock_page(p); 3260 } 3261 } 3262 3263 return ret; 3264 } 3265 3266 static void end_extent_buffer_writeback(struct extent_buffer *eb) 3267 { 3268 clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3269 smp_mb__after_clear_bit(); 3270 wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); 3271 } 3272 3273 static void end_bio_extent_buffer_writepage(struct bio *bio, int err) 3274 { 3275 int uptodate = err == 0; 3276 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 3277 struct extent_buffer *eb; 3278 int done; 3279 3280 do { 3281 struct page *page = bvec->bv_page; 3282 3283 bvec--; 3284 eb = (struct extent_buffer *)page->private; 3285 BUG_ON(!eb); 3286 done = atomic_dec_and_test(&eb->io_pages); 3287 3288 if (!uptodate || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { 3289 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3290 ClearPageUptodate(page); 3291 SetPageError(page); 3292 } 3293 3294 end_page_writeback(page); 3295 3296 if (!done) 3297 continue; 3298 3299 end_extent_buffer_writeback(eb); 3300 } while (bvec >= bio->bi_io_vec); 3301 3302 bio_put(bio); 3303 3304 } 3305 3306 static int write_one_eb(struct extent_buffer *eb, 3307 struct btrfs_fs_info *fs_info, 3308 struct writeback_control *wbc, 3309 struct extent_page_data *epd) 3310 { 3311 struct block_device *bdev = fs_info->fs_devices->latest_bdev; 3312 u64 offset = eb->start; 3313 unsigned long i, num_pages; 3314 unsigned long bio_flags = 0; 3315 int rw = (epd->sync_io ? 
WRITE_SYNC : WRITE) | REQ_META; 3316 int ret = 0; 3317 3318 clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3319 num_pages = num_extent_pages(eb->start, eb->len); 3320 atomic_set(&eb->io_pages, num_pages); 3321 if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID) 3322 bio_flags = EXTENT_BIO_TREE_LOG; 3323 3324 for (i = 0; i < num_pages; i++) { 3325 struct page *p = extent_buffer_page(eb, i); 3326 3327 clear_page_dirty_for_io(p); 3328 set_page_writeback(p); 3329 ret = submit_extent_page(rw, eb->tree, p, offset >> 9, 3330 PAGE_CACHE_SIZE, 0, bdev, &epd->bio, 3331 -1, end_bio_extent_buffer_writepage, 3332 0, epd->bio_flags, bio_flags); 3333 epd->bio_flags = bio_flags; 3334 if (ret) { 3335 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3336 SetPageError(p); 3337 if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) 3338 end_extent_buffer_writeback(eb); 3339 ret = -EIO; 3340 break; 3341 } 3342 offset += PAGE_CACHE_SIZE; 3343 update_nr_written(p, wbc, 1); 3344 unlock_page(p); 3345 } 3346 3347 if (unlikely(ret)) { 3348 for (; i < num_pages; i++) { 3349 struct page *p = extent_buffer_page(eb, i); 3350 unlock_page(p); 3351 } 3352 } 3353 3354 return ret; 3355 } 3356 3357 int btree_write_cache_pages(struct address_space *mapping, 3358 struct writeback_control *wbc) 3359 { 3360 struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; 3361 struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; 3362 struct extent_buffer *eb, *prev_eb = NULL; 3363 struct extent_page_data epd = { 3364 .bio = NULL, 3365 .tree = tree, 3366 .extent_locked = 0, 3367 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3368 .bio_flags = 0, 3369 }; 3370 int ret = 0; 3371 int done = 0; 3372 int nr_to_write_done = 0; 3373 struct pagevec pvec; 3374 int nr_pages; 3375 pgoff_t index; 3376 pgoff_t end; /* Inclusive */ 3377 int scanned = 0; 3378 int tag; 3379 3380 pagevec_init(&pvec, 0); 3381 if (wbc->range_cyclic) { 3382 index = mapping->writeback_index; /* Start from prev offset */ 3383 end = -1; 3384 } else { 3385 index = wbc->range_start >> PAGE_CACHE_SHIFT; 3386 end = wbc->range_end >> PAGE_CACHE_SHIFT; 3387 scanned = 1; 3388 } 3389 if (wbc->sync_mode == WB_SYNC_ALL) 3390 tag = PAGECACHE_TAG_TOWRITE; 3391 else 3392 tag = PAGECACHE_TAG_DIRTY; 3393 retry: 3394 if (wbc->sync_mode == WB_SYNC_ALL) 3395 tag_pages_for_writeback(mapping, index, end); 3396 while (!done && !nr_to_write_done && (index <= end) && 3397 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 3398 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 3399 unsigned i; 3400 3401 scanned = 1; 3402 for (i = 0; i < nr_pages; i++) { 3403 struct page *page = pvec.pages[i]; 3404 3405 if (!PagePrivate(page)) 3406 continue; 3407 3408 if (!wbc->range_cyclic && page->index > end) { 3409 done = 1; 3410 break; 3411 } 3412 3413 spin_lock(&mapping->private_lock); 3414 if (!PagePrivate(page)) { 3415 spin_unlock(&mapping->private_lock); 3416 continue; 3417 } 3418 3419 eb = (struct extent_buffer *)page->private; 3420 3421 /* 3422 * Shouldn't happen and normally this would be a BUG_ON 3423 * but no sense in crashing the users box for something 3424 * we can survive anyway. 
3425 */ 3426 if (!eb) { 3427 spin_unlock(&mapping->private_lock); 3428 WARN_ON(1); 3429 continue; 3430 } 3431 3432 if (eb == prev_eb) { 3433 spin_unlock(&mapping->private_lock); 3434 continue; 3435 } 3436 3437 ret = atomic_inc_not_zero(&eb->refs); 3438 spin_unlock(&mapping->private_lock); 3439 if (!ret) 3440 continue; 3441 3442 prev_eb = eb; 3443 ret = lock_extent_buffer_for_io(eb, fs_info, &epd); 3444 if (!ret) { 3445 free_extent_buffer(eb); 3446 continue; 3447 } 3448 3449 ret = write_one_eb(eb, fs_info, wbc, &epd); 3450 if (ret) { 3451 done = 1; 3452 free_extent_buffer(eb); 3453 break; 3454 } 3455 free_extent_buffer(eb); 3456 3457 /* 3458 * the filesystem may choose to bump up nr_to_write. 3459 * We have to make sure to honor the new nr_to_write 3460 * at any time 3461 */ 3462 nr_to_write_done = wbc->nr_to_write <= 0; 3463 } 3464 pagevec_release(&pvec); 3465 cond_resched(); 3466 } 3467 if (!scanned && !done) { 3468 /* 3469 * We hit the last page and there is more work to be done: wrap 3470 * back to the start of the file 3471 */ 3472 scanned = 1; 3473 index = 0; 3474 goto retry; 3475 } 3476 flush_write_bio(&epd); 3477 return ret; 3478 } 3479 3480 /** 3481 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. 3482 * @mapping: address space structure to write 3483 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 3484 * @writepage: function called for each page 3485 * @data: data passed to writepage function 3486 * 3487 * If a page is already under I/O, write_cache_pages() skips it, even 3488 * if it's dirty. This is desirable behaviour for memory-cleaning writeback, 3489 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() 3490 * and msync() need to guarantee that all the data which was dirty at the time 3491 * the call was made get new I/O started against them. If wbc->sync_mode is 3492 * WB_SYNC_ALL then we were called for data integrity and we must wait for 3493 * existing IO to complete. 3494 */ 3495 static int extent_write_cache_pages(struct extent_io_tree *tree, 3496 struct address_space *mapping, 3497 struct writeback_control *wbc, 3498 writepage_t writepage, void *data, 3499 void (*flush_fn)(void *)) 3500 { 3501 struct inode *inode = mapping->host; 3502 int ret = 0; 3503 int done = 0; 3504 int nr_to_write_done = 0; 3505 struct pagevec pvec; 3506 int nr_pages; 3507 pgoff_t index; 3508 pgoff_t end; /* Inclusive */ 3509 int scanned = 0; 3510 int tag; 3511 3512 /* 3513 * We have to hold onto the inode so that ordered extents can do their 3514 * work when the IO finishes. The alternative to this is failing to add 3515 * an ordered extent if the igrab() fails there and that is a huge pain 3516 * to deal with, so instead just hold onto the inode throughout the 3517 * writepages operation. If it fails here we are freeing up the inode 3518 * anyway and we'd rather not waste our time writing out stuff that is 3519 * going to be truncated anyway. 
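	 * the reference taken here is dropped via btrfs_add_delayed_iput()
	 * at the end of this function.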
3520 */ 3521 if (!igrab(inode)) 3522 return 0; 3523 3524 pagevec_init(&pvec, 0); 3525 if (wbc->range_cyclic) { 3526 index = mapping->writeback_index; /* Start from prev offset */ 3527 end = -1; 3528 } else { 3529 index = wbc->range_start >> PAGE_CACHE_SHIFT; 3530 end = wbc->range_end >> PAGE_CACHE_SHIFT; 3531 scanned = 1; 3532 } 3533 if (wbc->sync_mode == WB_SYNC_ALL) 3534 tag = PAGECACHE_TAG_TOWRITE; 3535 else 3536 tag = PAGECACHE_TAG_DIRTY; 3537 retry: 3538 if (wbc->sync_mode == WB_SYNC_ALL) 3539 tag_pages_for_writeback(mapping, index, end); 3540 while (!done && !nr_to_write_done && (index <= end) && 3541 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 3542 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 3543 unsigned i; 3544 3545 scanned = 1; 3546 for (i = 0; i < nr_pages; i++) { 3547 struct page *page = pvec.pages[i]; 3548 3549 /* 3550 * At this point we hold neither mapping->tree_lock nor 3551 * lock on the page itself: the page may be truncated or 3552 * invalidated (changing page->mapping to NULL), or even 3553 * swizzled back from swapper_space to tmpfs file 3554 * mapping 3555 */ 3556 if (!trylock_page(page)) { 3557 flush_fn(data); 3558 lock_page(page); 3559 } 3560 3561 if (unlikely(page->mapping != mapping)) { 3562 unlock_page(page); 3563 continue; 3564 } 3565 3566 if (!wbc->range_cyclic && page->index > end) { 3567 done = 1; 3568 unlock_page(page); 3569 continue; 3570 } 3571 3572 if (wbc->sync_mode != WB_SYNC_NONE) { 3573 if (PageWriteback(page)) 3574 flush_fn(data); 3575 wait_on_page_writeback(page); 3576 } 3577 3578 if (PageWriteback(page) || 3579 !clear_page_dirty_for_io(page)) { 3580 unlock_page(page); 3581 continue; 3582 } 3583 3584 ret = (*writepage)(page, wbc, data); 3585 3586 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { 3587 unlock_page(page); 3588 ret = 0; 3589 } 3590 if (ret) 3591 done = 1; 3592 3593 /* 3594 * the filesystem may choose to bump up nr_to_write. 
3595 * We have to make sure to honor the new nr_to_write 3596 * at any time 3597 */ 3598 nr_to_write_done = wbc->nr_to_write <= 0; 3599 } 3600 pagevec_release(&pvec); 3601 cond_resched(); 3602 } 3603 if (!scanned && !done) { 3604 /* 3605 * We hit the last page and there is more work to be done: wrap 3606 * back to the start of the file 3607 */ 3608 scanned = 1; 3609 index = 0; 3610 goto retry; 3611 } 3612 btrfs_add_delayed_iput(inode); 3613 return ret; 3614 } 3615 3616 static void flush_epd_write_bio(struct extent_page_data *epd) 3617 { 3618 if (epd->bio) { 3619 int rw = WRITE; 3620 int ret; 3621 3622 if (epd->sync_io) 3623 rw = WRITE_SYNC; 3624 3625 ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags); 3626 BUG_ON(ret < 0); /* -ENOMEM */ 3627 epd->bio = NULL; 3628 } 3629 } 3630 3631 static noinline void flush_write_bio(void *data) 3632 { 3633 struct extent_page_data *epd = data; 3634 flush_epd_write_bio(epd); 3635 } 3636 3637 int extent_write_full_page(struct extent_io_tree *tree, struct page *page, 3638 get_extent_t *get_extent, 3639 struct writeback_control *wbc) 3640 { 3641 int ret; 3642 struct extent_page_data epd = { 3643 .bio = NULL, 3644 .tree = tree, 3645 .get_extent = get_extent, 3646 .extent_locked = 0, 3647 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3648 .bio_flags = 0, 3649 }; 3650 3651 ret = __extent_writepage(page, wbc, &epd); 3652 3653 flush_epd_write_bio(&epd); 3654 return ret; 3655 } 3656 3657 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, 3658 u64 start, u64 end, get_extent_t *get_extent, 3659 int mode) 3660 { 3661 int ret = 0; 3662 struct address_space *mapping = inode->i_mapping; 3663 struct page *page; 3664 unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >> 3665 PAGE_CACHE_SHIFT; 3666 3667 struct extent_page_data epd = { 3668 .bio = NULL, 3669 .tree = tree, 3670 .get_extent = get_extent, 3671 .extent_locked = 1, 3672 .sync_io = mode == WB_SYNC_ALL, 3673 .bio_flags = 0, 3674 }; 3675 struct writeback_control wbc_writepages = { 3676 .sync_mode = mode, 3677 .nr_to_write = nr_pages * 2, 3678 .range_start = start, 3679 .range_end = end + 1, 3680 }; 3681 3682 while (start <= end) { 3683 page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); 3684 if (clear_page_dirty_for_io(page)) 3685 ret = __extent_writepage(page, &wbc_writepages, &epd); 3686 else { 3687 if (tree->ops && tree->ops->writepage_end_io_hook) 3688 tree->ops->writepage_end_io_hook(page, start, 3689 start + PAGE_CACHE_SIZE - 1, 3690 NULL, 1); 3691 unlock_page(page); 3692 } 3693 page_cache_release(page); 3694 start += PAGE_CACHE_SIZE; 3695 } 3696 3697 flush_epd_write_bio(&epd); 3698 return ret; 3699 } 3700 3701 int extent_writepages(struct extent_io_tree *tree, 3702 struct address_space *mapping, 3703 get_extent_t *get_extent, 3704 struct writeback_control *wbc) 3705 { 3706 int ret = 0; 3707 struct extent_page_data epd = { 3708 .bio = NULL, 3709 .tree = tree, 3710 .get_extent = get_extent, 3711 .extent_locked = 0, 3712 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3713 .bio_flags = 0, 3714 }; 3715 3716 ret = extent_write_cache_pages(tree, mapping, wbc, 3717 __extent_writepage, &epd, 3718 flush_write_bio); 3719 flush_epd_write_bio(&epd); 3720 return ret; 3721 } 3722 3723 int extent_readpages(struct extent_io_tree *tree, 3724 struct address_space *mapping, 3725 struct list_head *pages, unsigned nr_pages, 3726 get_extent_t get_extent) 3727 { 3728 struct bio *bio = NULL; 3729 unsigned page_idx; 3730 unsigned long bio_flags = 0; 3731 struct page *pagepool[16]; 3732 struct page 
*page; 3733 int i = 0; 3734 int nr = 0; 3735 3736 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 3737 page = list_entry(pages->prev, struct page, lru); 3738 3739 prefetchw(&page->flags); 3740 list_del(&page->lru); 3741 if (add_to_page_cache_lru(page, mapping, 3742 page->index, GFP_NOFS)) { 3743 page_cache_release(page); 3744 continue; 3745 } 3746 3747 pagepool[nr++] = page; 3748 if (nr < ARRAY_SIZE(pagepool)) 3749 continue; 3750 for (i = 0; i < nr; i++) { 3751 __extent_read_full_page(tree, pagepool[i], get_extent, 3752 &bio, 0, &bio_flags, READ); 3753 page_cache_release(pagepool[i]); 3754 } 3755 nr = 0; 3756 } 3757 for (i = 0; i < nr; i++) { 3758 __extent_read_full_page(tree, pagepool[i], get_extent, 3759 &bio, 0, &bio_flags, READ); 3760 page_cache_release(pagepool[i]); 3761 } 3762 3763 BUG_ON(!list_empty(pages)); 3764 if (bio) 3765 return submit_one_bio(READ, bio, 0, bio_flags); 3766 return 0; 3767 } 3768 3769 /* 3770 * basic invalidatepage code, this waits on any locked or writeback 3771 * ranges corresponding to the page, and then deletes any extent state 3772 * records from the tree 3773 */ 3774 int extent_invalidatepage(struct extent_io_tree *tree, 3775 struct page *page, unsigned long offset) 3776 { 3777 struct extent_state *cached_state = NULL; 3778 u64 start = page_offset(page); 3779 u64 end = start + PAGE_CACHE_SIZE - 1; 3780 size_t blocksize = page->mapping->host->i_sb->s_blocksize; 3781 3782 start += ALIGN(offset, blocksize); 3783 if (start > end) 3784 return 0; 3785 3786 lock_extent_bits(tree, start, end, 0, &cached_state); 3787 wait_on_page_writeback(page); 3788 clear_extent_bit(tree, start, end, 3789 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 3790 EXTENT_DO_ACCOUNTING, 3791 1, 1, &cached_state, GFP_NOFS); 3792 return 0; 3793 } 3794 3795 /* 3796 * a helper for releasepage, this tests for areas of the page that 3797 * are locked or under IO and drops the related state bits if it is safe 3798 * to drop the page. 3799 */ 3800 static int try_release_extent_state(struct extent_map_tree *map, 3801 struct extent_io_tree *tree, 3802 struct page *page, gfp_t mask) 3803 { 3804 u64 start = page_offset(page); 3805 u64 end = start + PAGE_CACHE_SIZE - 1; 3806 int ret = 1; 3807 3808 if (test_range_bit(tree, start, end, 3809 EXTENT_IOBITS, 0, NULL)) 3810 ret = 0; 3811 else { 3812 if ((mask & GFP_NOFS) == GFP_NOFS) 3813 mask = GFP_NOFS; 3814 /* 3815 * at this point we can safely clear everything except the 3816 * locked bit and the nodatasum bit 3817 */ 3818 ret = clear_extent_bit(tree, start, end, 3819 ~(EXTENT_LOCKED | EXTENT_NODATASUM), 3820 0, 0, NULL, mask); 3821 3822 /* if clear_extent_bit failed for enomem reasons, 3823 * we can't allow the release to continue. 3824 */ 3825 if (ret < 0) 3826 ret = 0; 3827 else 3828 ret = 1; 3829 } 3830 return ret; 3831 } 3832 3833 /* 3834 * a helper for releasepage. 
As long as there are no locked extents 3835 * in the range corresponding to the page, both state records and extent 3836 * map records are removed 3837 */ 3838 int try_release_extent_mapping(struct extent_map_tree *map, 3839 struct extent_io_tree *tree, struct page *page, 3840 gfp_t mask) 3841 { 3842 struct extent_map *em; 3843 u64 start = page_offset(page); 3844 u64 end = start + PAGE_CACHE_SIZE - 1; 3845 3846 if ((mask & __GFP_WAIT) && 3847 page->mapping->host->i_size > 16 * 1024 * 1024) { 3848 u64 len; 3849 while (start <= end) { 3850 len = end - start + 1; 3851 write_lock(&map->lock); 3852 em = lookup_extent_mapping(map, start, len); 3853 if (!em) { 3854 write_unlock(&map->lock); 3855 break; 3856 } 3857 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || 3858 em->start != start) { 3859 write_unlock(&map->lock); 3860 free_extent_map(em); 3861 break; 3862 } 3863 if (!test_range_bit(tree, em->start, 3864 extent_map_end(em) - 1, 3865 EXTENT_LOCKED | EXTENT_WRITEBACK, 3866 0, NULL)) { 3867 remove_extent_mapping(map, em); 3868 /* once for the rb tree */ 3869 free_extent_map(em); 3870 } 3871 start = extent_map_end(em); 3872 write_unlock(&map->lock); 3873 3874 /* once for us */ 3875 free_extent_map(em); 3876 } 3877 } 3878 return try_release_extent_state(map, tree, page, mask); 3879 } 3880 3881 /* 3882 * helper function for fiemap, which doesn't want to see any holes. 3883 * This maps until we find something past 'last' 3884 */ 3885 static struct extent_map *get_extent_skip_holes(struct inode *inode, 3886 u64 offset, 3887 u64 last, 3888 get_extent_t *get_extent) 3889 { 3890 u64 sectorsize = BTRFS_I(inode)->root->sectorsize; 3891 struct extent_map *em; 3892 u64 len; 3893 3894 if (offset >= last) 3895 return NULL; 3896 3897 while(1) { 3898 len = last - offset; 3899 if (len == 0) 3900 break; 3901 len = ALIGN(len, sectorsize); 3902 em = get_extent(inode, NULL, 0, offset, len, 0); 3903 if (IS_ERR_OR_NULL(em)) 3904 return em; 3905 3906 /* if this isn't a hole return it */ 3907 if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) && 3908 em->block_start != EXTENT_MAP_HOLE) { 3909 return em; 3910 } 3911 3912 /* this is a hole, advance to the next extent */ 3913 offset = extent_map_end(em); 3914 free_extent_map(em); 3915 if (offset >= last) 3916 break; 3917 } 3918 return NULL; 3919 } 3920 3921 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 3922 __u64 start, __u64 len, get_extent_t *get_extent) 3923 { 3924 int ret = 0; 3925 u64 off = start; 3926 u64 max = start + len; 3927 u32 flags = 0; 3928 u32 found_type; 3929 u64 last; 3930 u64 last_for_get_extent = 0; 3931 u64 disko = 0; 3932 u64 isize = i_size_read(inode); 3933 struct btrfs_key found_key; 3934 struct extent_map *em = NULL; 3935 struct extent_state *cached_state = NULL; 3936 struct btrfs_path *path; 3937 struct btrfs_file_extent_item *item; 3938 int end = 0; 3939 u64 em_start = 0; 3940 u64 em_len = 0; 3941 u64 em_end = 0; 3942 unsigned long emflags; 3943 3944 if (len == 0) 3945 return -EINVAL; 3946 3947 path = btrfs_alloc_path(); 3948 if (!path) 3949 return -ENOMEM; 3950 path->leave_spinning = 1; 3951 3952 start = ALIGN(start, BTRFS_I(inode)->root->sectorsize); 3953 len = ALIGN(len, BTRFS_I(inode)->root->sectorsize); 3954 3955 /* 3956 * lookup the last file extent. 
We're not using i_size here 3957 * because there might be preallocation past i_size 3958 */ 3959 ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, 3960 path, btrfs_ino(inode), -1, 0); 3961 if (ret < 0) { 3962 btrfs_free_path(path); 3963 return ret; 3964 } 3965 WARN_ON(!ret); 3966 path->slots[0]--; 3967 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3968 struct btrfs_file_extent_item); 3969 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 3970 found_type = btrfs_key_type(&found_key); 3971 3972 /* No extents, but there might be delalloc bits */ 3973 if (found_key.objectid != btrfs_ino(inode) || 3974 found_type != BTRFS_EXTENT_DATA_KEY) { 3975 /* have to trust i_size as the end */ 3976 last = (u64)-1; 3977 last_for_get_extent = isize; 3978 } else { 3979 /* 3980 * remember the start of the last extent. There are a 3981 * bunch of different factors that go into the length of the 3982 * extent, so its much less complex to remember where it started 3983 */ 3984 last = found_key.offset; 3985 last_for_get_extent = last + 1; 3986 } 3987 btrfs_free_path(path); 3988 3989 /* 3990 * we might have some extents allocated but more delalloc past those 3991 * extents. so, we trust isize unless the start of the last extent is 3992 * beyond isize 3993 */ 3994 if (last < isize) { 3995 last = (u64)-1; 3996 last_for_get_extent = isize; 3997 } 3998 3999 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 0, 4000 &cached_state); 4001 4002 em = get_extent_skip_holes(inode, start, last_for_get_extent, 4003 get_extent); 4004 if (!em) 4005 goto out; 4006 if (IS_ERR(em)) { 4007 ret = PTR_ERR(em); 4008 goto out; 4009 } 4010 4011 while (!end) { 4012 u64 offset_in_extent; 4013 4014 /* break if the extent we found is outside the range */ 4015 if (em->start >= max || extent_map_end(em) < off) 4016 break; 4017 4018 /* 4019 * get_extent may return an extent that starts before our 4020 * requested range. We have to make sure the ranges 4021 * we return to fiemap always move forward and don't 4022 * overlap, so adjust the offsets here 4023 */ 4024 em_start = max(em->start, off); 4025 4026 /* 4027 * record the offset from the start of the extent 4028 * for adjusting the disk offset below 4029 */ 4030 offset_in_extent = em_start - em->start; 4031 em_end = extent_map_end(em); 4032 em_len = em_end - em_start; 4033 emflags = em->flags; 4034 disko = 0; 4035 flags = 0; 4036 4037 /* 4038 * bump off for our next call to get_extent 4039 */ 4040 off = extent_map_end(em); 4041 if (off >= max) 4042 end = 1; 4043 4044 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 4045 end = 1; 4046 flags |= FIEMAP_EXTENT_LAST; 4047 } else if (em->block_start == EXTENT_MAP_INLINE) { 4048 flags |= (FIEMAP_EXTENT_DATA_INLINE | 4049 FIEMAP_EXTENT_NOT_ALIGNED); 4050 } else if (em->block_start == EXTENT_MAP_DELALLOC) { 4051 flags |= (FIEMAP_EXTENT_DELALLOC | 4052 FIEMAP_EXTENT_UNKNOWN); 4053 } else { 4054 disko = em->block_start + offset_in_extent; 4055 } 4056 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 4057 flags |= FIEMAP_EXTENT_ENCODED; 4058 4059 free_extent_map(em); 4060 em = NULL; 4061 if ((em_start >= last) || em_len == (u64)-1 || 4062 (last == (u64)-1 && isize <= em_end)) { 4063 flags |= FIEMAP_EXTENT_LAST; 4064 end = 1; 4065 } 4066 4067 /* now scan forward to see if this is really the last extent. 
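		 * a NULL return from get_extent_skip_holes() below means only
		 * holes remain, so FIEMAP_EXTENT_LAST gets set on the extent
		 * we are about to report.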
*/ 4068 em = get_extent_skip_holes(inode, off, last_for_get_extent, 4069 get_extent); 4070 if (IS_ERR(em)) { 4071 ret = PTR_ERR(em); 4072 goto out; 4073 } 4074 if (!em) { 4075 flags |= FIEMAP_EXTENT_LAST; 4076 end = 1; 4077 } 4078 ret = fiemap_fill_next_extent(fieinfo, em_start, disko, 4079 em_len, flags); 4080 if (ret) 4081 goto out_free; 4082 } 4083 out_free: 4084 free_extent_map(em); 4085 out: 4086 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, 4087 &cached_state, GFP_NOFS); 4088 return ret; 4089 } 4090 4091 static void __free_extent_buffer(struct extent_buffer *eb) 4092 { 4093 btrfs_leak_debug_del(&eb->leak_list); 4094 kmem_cache_free(extent_buffer_cache, eb); 4095 } 4096 4097 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, 4098 u64 start, 4099 unsigned long len, 4100 gfp_t mask) 4101 { 4102 struct extent_buffer *eb = NULL; 4103 4104 eb = kmem_cache_zalloc(extent_buffer_cache, mask); 4105 if (eb == NULL) 4106 return NULL; 4107 eb->start = start; 4108 eb->len = len; 4109 eb->tree = tree; 4110 eb->bflags = 0; 4111 rwlock_init(&eb->lock); 4112 atomic_set(&eb->write_locks, 0); 4113 atomic_set(&eb->read_locks, 0); 4114 atomic_set(&eb->blocking_readers, 0); 4115 atomic_set(&eb->blocking_writers, 0); 4116 atomic_set(&eb->spinning_readers, 0); 4117 atomic_set(&eb->spinning_writers, 0); 4118 eb->lock_nested = 0; 4119 init_waitqueue_head(&eb->write_lock_wq); 4120 init_waitqueue_head(&eb->read_lock_wq); 4121 4122 btrfs_leak_debug_add(&eb->leak_list, &buffers); 4123 4124 spin_lock_init(&eb->refs_lock); 4125 atomic_set(&eb->refs, 1); 4126 atomic_set(&eb->io_pages, 0); 4127 4128 /* 4129 * Sanity checks, currently the maximum is 64k covered by 16x 4k pages 4130 */ 4131 BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE 4132 > MAX_INLINE_EXTENT_BUFFER_SIZE); 4133 BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE); 4134 4135 return eb; 4136 } 4137 4138 struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) 4139 { 4140 unsigned long i; 4141 struct page *p; 4142 struct extent_buffer *new; 4143 unsigned long num_pages = num_extent_pages(src->start, src->len); 4144 4145 new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_ATOMIC); 4146 if (new == NULL) 4147 return NULL; 4148 4149 for (i = 0; i < num_pages; i++) { 4150 p = alloc_page(GFP_ATOMIC); 4151 BUG_ON(!p); 4152 attach_extent_buffer_page(new, p); 4153 WARN_ON(PageDirty(p)); 4154 SetPageUptodate(p); 4155 new->pages[i] = p; 4156 } 4157 4158 copy_extent_buffer(new, src, 0, 0, src->len); 4159 set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags); 4160 set_bit(EXTENT_BUFFER_DUMMY, &new->bflags); 4161 4162 return new; 4163 } 4164 4165 struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len) 4166 { 4167 struct extent_buffer *eb; 4168 unsigned long num_pages = num_extent_pages(0, len); 4169 unsigned long i; 4170 4171 eb = __alloc_extent_buffer(NULL, start, len, GFP_ATOMIC); 4172 if (!eb) 4173 return NULL; 4174 4175 for (i = 0; i < num_pages; i++) { 4176 eb->pages[i] = alloc_page(GFP_ATOMIC); 4177 if (!eb->pages[i]) 4178 goto err; 4179 } 4180 set_extent_buffer_uptodate(eb); 4181 btrfs_set_header_nritems(eb, 0); 4182 set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); 4183 4184 return eb; 4185 err: 4186 for (; i > 0; i--) 4187 __free_page(eb->pages[i - 1]); 4188 __free_extent_buffer(eb); 4189 return NULL; 4190 } 4191 4192 static int extent_buffer_under_io(struct extent_buffer *eb) 4193 { 4194 return (atomic_read(&eb->io_pages) || 4195 test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || 4196 
static int extent_buffer_under_io(struct extent_buffer *eb)
{
	return (atomic_read(&eb->io_pages) ||
		test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
		test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
}

/*
 * Helper for releasing extent buffer page.
 */
static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
					     unsigned long start_idx)
{
	unsigned long index;
	unsigned long num_pages;
	struct page *page;
	int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);

	BUG_ON(extent_buffer_under_io(eb));

	num_pages = num_extent_pages(eb->start, eb->len);
	index = start_idx + num_pages;
	if (start_idx >= index)
		return;

	do {
		index--;
		page = extent_buffer_page(eb, index);
		if (page && mapped) {
			spin_lock(&page->mapping->private_lock);
			/*
			 * We do this since we'll remove the pages after we've
			 * removed the eb from the radix tree, so we could race
			 * and have this page now attached to the new eb.  So
			 * only clear page_private if it's still connected to
			 * this eb.
			 */
			if (PagePrivate(page) &&
			    page->private == (unsigned long)eb) {
				BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
				BUG_ON(PageDirty(page));
				BUG_ON(PageWriteback(page));
				/*
				 * We need to make sure we haven't been
				 * attached to a new eb.
				 */
				ClearPagePrivate(page);
				set_page_private(page, 0);
				/* One for the page private */
				page_cache_release(page);
			}
			spin_unlock(&page->mapping->private_lock);

		}
		if (page) {
			/* One for when we alloced the page */
			page_cache_release(page);
		}
	} while (index != start_idx);
}

/*
 * Helper for releasing the extent buffer.
 */
static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
{
	btrfs_release_extent_buffer_page(eb, 0);
	__free_extent_buffer(eb);
}
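
/*
 * An eb that sits in the radix tree owns one reference of its own, flagged
 * by EXTENT_BUFFER_TREE_REF; release_extent_buffer() removes the eb from the
 * radix tree once the last reference is gone.  Callers such as
 * set_extent_buffer_dirty() and mark_extent_buffer_accessed() use the helper
 * below to make sure that tree reference exists.
 */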
static void check_buffer_tree_ref(struct extent_buffer *eb)
{
	int refs;
	/* the ref bit is tricky.  We have to make sure it is set
	 * if we have the buffer dirty.  Otherwise the
	 * code to free a buffer can end up dropping a dirty
	 * page
	 *
	 * Once the ref bit is set, it won't go away while the
	 * buffer is dirty or in writeback, and it also won't
	 * go away while we have the reference count on the
	 * eb bumped.
	 *
	 * We can't just set the ref bit without bumping the
	 * ref on the eb because free_extent_buffer might
	 * see the ref bit and try to clear it.  If this happens
	 * free_extent_buffer might end up dropping our original
	 * ref by mistake and freeing the page before we are able
	 * to add one more ref.
	 *
	 * So bump the ref count first, then set the bit.  If someone
	 * beat us to it, drop the ref we added.
	 */
	refs = atomic_read(&eb->refs);
	if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
		return;

	spin_lock(&eb->refs_lock);
	if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
		atomic_inc(&eb->refs);
	spin_unlock(&eb->refs_lock);
}

static void mark_extent_buffer_accessed(struct extent_buffer *eb)
{
	unsigned long num_pages, i;

	check_buffer_tree_ref(eb);

	num_pages = num_extent_pages(eb->start, eb->len);
	for (i = 0; i < num_pages; i++) {
		struct page *p = extent_buffer_page(eb, i);
		mark_page_accessed(p);
	}
}
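
/*
 * Find the extent buffer covering [start, start + len) in the tree, or
 * create and insert a new one.  The fast path is a lockless radix tree
 * lookup; on a miss we allocate an eb, attach it page by page, and insert it
 * into the radix tree, falling back to an existing eb if someone else raced
 * us in.  The returned eb carries a reference the caller must drop with
 * free_extent_buffer().  A rough usage sketch (caller names here are
 * illustrative, not taken from this file):
 *
 *	eb = alloc_extent_buffer(tree, start, blocksize);
 *	if (!eb)
 *		return NULL;
 *	...
 *	free_extent_buffer(eb);
 */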
struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
					  u64 start, unsigned long len)
{
	unsigned long num_pages = num_extent_pages(start, len);
	unsigned long i;
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	struct extent_buffer *eb;
	struct extent_buffer *exists = NULL;
	struct page *p;
	struct address_space *mapping = tree->mapping;
	int uptodate = 1;
	int ret;

	rcu_read_lock();
	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
	if (eb && atomic_inc_not_zero(&eb->refs)) {
		rcu_read_unlock();
		mark_extent_buffer_accessed(eb);
		return eb;
	}
	rcu_read_unlock();

	eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS);
	if (!eb)
		return NULL;

	for (i = 0; i < num_pages; i++, index++) {
		p = find_or_create_page(mapping, index, GFP_NOFS);
		if (!p)
			goto free_eb;

		spin_lock(&mapping->private_lock);
		if (PagePrivate(p)) {
			/*
			 * We could have already allocated an eb for this page
			 * and attached one so let's see if we can get a ref on
			 * the existing eb, and if we can we know it's good and
			 * we can just return that one, else we know we can
			 * just overwrite page->private.
			 */
			exists = (struct extent_buffer *)p->private;
			if (atomic_inc_not_zero(&exists->refs)) {
				spin_unlock(&mapping->private_lock);
				unlock_page(p);
				page_cache_release(p);
				mark_extent_buffer_accessed(exists);
				goto free_eb;
			}

			/*
			 * Do this so attach doesn't complain and we need to
			 * drop the ref the old guy had.
			 */
			ClearPagePrivate(p);
			WARN_ON(PageDirty(p));
			page_cache_release(p);
		}
		attach_extent_buffer_page(eb, p);
		spin_unlock(&mapping->private_lock);
		WARN_ON(PageDirty(p));
		mark_page_accessed(p);
		eb->pages[i] = p;
		if (!PageUptodate(p))
			uptodate = 0;

		/*
		 * see below about how we avoid a nasty race with release page
		 * and why we unlock later
		 */
	}
	if (uptodate)
		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
again:
	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
	if (ret)
		goto free_eb;

	spin_lock(&tree->buffer_lock);
	ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
	if (ret == -EEXIST) {
		exists = radix_tree_lookup(&tree->buffer,
					   start >> PAGE_CACHE_SHIFT);
		if (!atomic_inc_not_zero(&exists->refs)) {
			spin_unlock(&tree->buffer_lock);
			radix_tree_preload_end();
			exists = NULL;
			goto again;
		}
		spin_unlock(&tree->buffer_lock);
		radix_tree_preload_end();
		mark_extent_buffer_accessed(exists);
		goto free_eb;
	}
	/* add one reference for the tree */
	check_buffer_tree_ref(eb);
	spin_unlock(&tree->buffer_lock);
	radix_tree_preload_end();

	/*
	 * there is a race where release page may have
	 * tried to find this extent buffer in the radix
	 * but failed.  It will tell the VM it is safe to
	 * reclaim the page, and it will clear the page private bit.
	 * We must make sure to set the page private bit properly
	 * after the extent buffer is in the radix tree so
	 * it doesn't get lost
	 */
	SetPageChecked(eb->pages[0]);
	for (i = 1; i < num_pages; i++) {
		p = extent_buffer_page(eb, i);
		ClearPageChecked(p);
		unlock_page(p);
	}
	unlock_page(eb->pages[0]);
	return eb;

free_eb:
	for (i = 0; i < num_pages; i++) {
		if (eb->pages[i])
			unlock_page(eb->pages[i]);
	}

	WARN_ON(!atomic_dec_and_test(&eb->refs));
	btrfs_release_extent_buffer(eb);
	return exists;
}

struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
					 u64 start, unsigned long len)
{
	struct extent_buffer *eb;

	rcu_read_lock();
	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
	if (eb && atomic_inc_not_zero(&eb->refs)) {
		rcu_read_unlock();
		mark_extent_buffer_accessed(eb);
		return eb;
	}
	rcu_read_unlock();

	return NULL;
}

static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
{
	struct extent_buffer *eb =
			container_of(head, struct extent_buffer, rcu_head);

	__free_extent_buffer(eb);
}

/* Expects to have eb->refs_lock already held */
static int release_extent_buffer(struct extent_buffer *eb)
{
	WARN_ON(atomic_read(&eb->refs) == 0);
	if (atomic_dec_and_test(&eb->refs)) {
		if (test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) {
			spin_unlock(&eb->refs_lock);
		} else {
			struct extent_io_tree *tree = eb->tree;

			spin_unlock(&eb->refs_lock);

			spin_lock(&tree->buffer_lock);
			radix_tree_delete(&tree->buffer,
					  eb->start >> PAGE_CACHE_SHIFT);
			spin_unlock(&tree->buffer_lock);
		}

		/* Should be safe to release our pages at this point */
		btrfs_release_extent_buffer_page(eb, 0);
		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
		return 1;
	}
	spin_unlock(&eb->refs_lock);

	return 0;
}
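
/*
 * Drop one reference on the eb.  For refs > 3 the cmpxchg loop below can
 * decrement locklessly; otherwise we take refs_lock and, when only the
 * dummy/tree references remain on a dummy or stale eb that has no I/O in
 * flight, drop those too so release_extent_buffer() can actually free it.
 */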
void free_extent_buffer(struct extent_buffer *eb)
{
	int refs;
	int old;
	if (!eb)
		return;

	while (1) {
		refs = atomic_read(&eb->refs);
		if (refs <= 3)
			break;
		old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
		if (old == refs)
			return;
	}

	spin_lock(&eb->refs_lock);
	if (atomic_read(&eb->refs) == 2 &&
	    test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
		atomic_dec(&eb->refs);

	if (atomic_read(&eb->refs) == 2 &&
	    test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
	    !extent_buffer_under_io(eb) &&
	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
		atomic_dec(&eb->refs);

	/*
	 * I know this is terrible, but it's temporary until we stop tracking
	 * the uptodate bits and such for the extent buffers.
	 */
	release_extent_buffer(eb);
}

void free_extent_buffer_stale(struct extent_buffer *eb)
{
	if (!eb)
		return;

	spin_lock(&eb->refs_lock);
	set_bit(EXTENT_BUFFER_STALE, &eb->bflags);

	if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
		atomic_dec(&eb->refs);
	release_extent_buffer(eb);
}

void clear_extent_buffer_dirty(struct extent_buffer *eb)
{
	unsigned long i;
	unsigned long num_pages;
	struct page *page;

	num_pages = num_extent_pages(eb->start, eb->len);

	for (i = 0; i < num_pages; i++) {
		page = extent_buffer_page(eb, i);
		if (!PageDirty(page))
			continue;

		lock_page(page);
		WARN_ON(!PagePrivate(page));

		clear_page_dirty_for_io(page);
		spin_lock_irq(&page->mapping->tree_lock);
		if (!PageDirty(page)) {
			radix_tree_tag_clear(&page->mapping->page_tree,
					     page_index(page),
					     PAGECACHE_TAG_DIRTY);
		}
		spin_unlock_irq(&page->mapping->tree_lock);
		ClearPageError(page);
		unlock_page(page);
	}
	WARN_ON(atomic_read(&eb->refs) == 0);
}

int set_extent_buffer_dirty(struct extent_buffer *eb)
{
	unsigned long i;
	unsigned long num_pages;
	int was_dirty = 0;

	check_buffer_tree_ref(eb);

	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);

	num_pages = num_extent_pages(eb->start, eb->len);
	WARN_ON(atomic_read(&eb->refs) == 0);
	WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));

	for (i = 0; i < num_pages; i++)
		set_page_dirty(extent_buffer_page(eb, i));
	return was_dirty;
}

int clear_extent_buffer_uptodate(struct extent_buffer *eb)
{
	unsigned long i;
	struct page *page;
	unsigned long num_pages;

	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
	num_pages = num_extent_pages(eb->start, eb->len);
	for (i = 0; i < num_pages; i++) {
		page = extent_buffer_page(eb, i);
		if (page)
			ClearPageUptodate(page);
	}
	return 0;
}

int set_extent_buffer_uptodate(struct extent_buffer *eb)
{
	unsigned long i;
	struct page *page;
	unsigned long num_pages;

	set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
	num_pages = num_extent_pages(eb->start, eb->len);
	for (i = 0; i < num_pages; i++) {
		page = extent_buffer_page(eb, i);
		SetPageUptodate(page);
	}
	return 0;
}

int extent_buffer_uptodate(struct extent_buffer *eb)
{
	return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
}
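
/*
 * Read the eb's pages from disk if they are not already uptodate.  With
 * WAIT_NONE only trylocks are used and the reads are submitted without
 * waiting; with WAIT_COMPLETE we wait for every page and return -EIO if any
 * of them failed.  A typical call looks roughly like this (sketch only,
 * argument names are illustrative):
 *
 *	ret = read_extent_buffer_pages(tree, eb, 0, WAIT_COMPLETE,
 *				       get_extent, mirror_num);
 *	if (ret)
 *		return ret;
 */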
int read_extent_buffer_pages(struct extent_io_tree *tree,
			     struct extent_buffer *eb, u64 start, int wait,
			     get_extent_t *get_extent, int mirror_num)
{
	unsigned long i;
	unsigned long start_i;
	struct page *page;
	int err;
	int ret = 0;
	int locked_pages = 0;
	int all_uptodate = 1;
	unsigned long num_pages;
	unsigned long num_reads = 0;
	struct bio *bio = NULL;
	unsigned long bio_flags = 0;

	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
		return 0;

	if (start) {
		WARN_ON(start < eb->start);
		start_i = (start >> PAGE_CACHE_SHIFT) -
			(eb->start >> PAGE_CACHE_SHIFT);
	} else {
		start_i = 0;
	}

	num_pages = num_extent_pages(eb->start, eb->len);
	for (i = start_i; i < num_pages; i++) {
		page = extent_buffer_page(eb, i);
		if (wait == WAIT_NONE) {
			if (!trylock_page(page))
				goto unlock_exit;
		} else {
			lock_page(page);
		}
		locked_pages++;
		if (!PageUptodate(page)) {
			num_reads++;
			all_uptodate = 0;
		}
	}
	if (all_uptodate) {
		if (start_i == 0)
			set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
		goto unlock_exit;
	}

	clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
	eb->read_mirror = 0;
	atomic_set(&eb->io_pages, num_reads);
	for (i = start_i; i < num_pages; i++) {
		page = extent_buffer_page(eb, i);
		if (!PageUptodate(page)) {
			ClearPageError(page);
			err = __extent_read_full_page(tree, page,
						      get_extent, &bio,
						      mirror_num, &bio_flags,
						      READ | REQ_META);
			if (err)
				ret = err;
		} else {
			unlock_page(page);
		}
	}

	if (bio) {
		err = submit_one_bio(READ | REQ_META, bio, mirror_num,
				     bio_flags);
		if (err)
			return err;
	}

	if (ret || wait != WAIT_COMPLETE)
		return ret;

	for (i = start_i; i < num_pages; i++) {
		page = extent_buffer_page(eb, i);
		wait_on_page_locked(page);
		if (!PageUptodate(page))
			ret = -EIO;
	}

	return ret;

unlock_exit:
	i = start_i;
	while (locked_pages > 0) {
		page = extent_buffer_page(eb, i);
		i++;
		unlock_page(page);
		locked_pages--;
	}
	return ret;
}

void read_extent_buffer(struct extent_buffer *eb, void *dstv,
			unsigned long start,
			unsigned long len)
{
	size_t cur;
	size_t offset;
	struct page *page;
	char *kaddr;
	char *dst = (char *)dstv;
	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;

	WARN_ON(start > eb->len);
	WARN_ON(start + len > eb->start + eb->len);

	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);

	while (len > 0) {
		page = extent_buffer_page(eb, i);

		cur = min(len, (PAGE_CACHE_SIZE - offset));
		kaddr = page_address(page);
		memcpy(dst, kaddr + offset, cur);

		dst += cur;
		len -= cur;
		offset = 0;
		i++;
	}
}
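
/*
 * Map min_len bytes starting at 'start' in the eb to a directly addressable
 * kernel pointer.  The requested range must not cross a page boundary;
 * -EINVAL is returned if it does, and callers are then expected to fall back
 * to the copying helpers in this file (read_extent_buffer() and
 * write_extent_buffer()).
 */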
int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
			      unsigned long min_len, char **map,
			      unsigned long *map_start,
			      unsigned long *map_len)
{
	size_t offset = start & (PAGE_CACHE_SIZE - 1);
	char *kaddr;
	struct page *p;
	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
	unsigned long end_i = (start_offset + start + min_len - 1) >>
		PAGE_CACHE_SHIFT;

	if (i != end_i)
		return -EINVAL;

	if (i == 0) {
		offset = start_offset;
		*map_start = 0;
	} else {
		offset = 0;
		*map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
	}

	if (start + min_len > eb->len) {
		WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
		     "wanted %lu %lu\n", (unsigned long long)eb->start,
		     eb->len, start, min_len);
		return -EINVAL;
	}

	p = extent_buffer_page(eb, i);
	kaddr = page_address(p);
	*map = kaddr + offset;
	*map_len = PAGE_CACHE_SIZE - offset;
	return 0;
}

int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
			 unsigned long start,
			 unsigned long len)
{
	size_t cur;
	size_t offset;
	struct page *page;
	char *kaddr;
	char *ptr = (char *)ptrv;
	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
	int ret = 0;

	WARN_ON(start > eb->len);
	WARN_ON(start + len > eb->start + eb->len);

	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);

	while (len > 0) {
		page = extent_buffer_page(eb, i);

		cur = min(len, (PAGE_CACHE_SIZE - offset));

		kaddr = page_address(page);
		ret = memcmp(ptr, kaddr + offset, cur);
		if (ret)
			break;

		ptr += cur;
		len -= cur;
		offset = 0;
		i++;
	}
	return ret;
}

void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
			 unsigned long start, unsigned long len)
{
	size_t cur;
	size_t offset;
	struct page *page;
	char *kaddr;
	char *src = (char *)srcv;
	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;

	WARN_ON(start > eb->len);
	WARN_ON(start + len > eb->start + eb->len);

	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);

	while (len > 0) {
		page = extent_buffer_page(eb, i);
		WARN_ON(!PageUptodate(page));

		cur = min(len, PAGE_CACHE_SIZE - offset);
		kaddr = page_address(page);
		memcpy(kaddr + offset, src, cur);

		src += cur;
		len -= cur;
		offset = 0;
		i++;
	}
}

void memset_extent_buffer(struct extent_buffer *eb, char c,
			  unsigned long start, unsigned long len)
{
	size_t cur;
	size_t offset;
	struct page *page;
	char *kaddr;
	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;

	WARN_ON(start > eb->len);
	WARN_ON(start + len > eb->start + eb->len);

	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);

	while (len > 0) {
		page = extent_buffer_page(eb, i);
		WARN_ON(!PageUptodate(page));

		cur = min(len, PAGE_CACHE_SIZE - offset);
		kaddr = page_address(page);
		memset(kaddr + offset, c, cur);

		len -= cur;
		offset = 0;
		i++;
	}
}
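
/*
 * Copy len bytes from src (starting at src_offset) into dst (starting at
 * dst_offset).  The copy goes through read_extent_buffer(), so src and dst
 * do not need to share pages; dst's pages must already be uptodate.
 */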
void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
			unsigned long dst_offset, unsigned long src_offset,
			unsigned long len)
{
	u64 dst_len = dst->len;
	size_t cur;
	size_t offset;
	struct page *page;
	char *kaddr;
	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
	unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;

	WARN_ON(src->len != dst_len);

	offset = (start_offset + dst_offset) &
		((unsigned long)PAGE_CACHE_SIZE - 1);

	while (len > 0) {
		page = extent_buffer_page(dst, i);
		WARN_ON(!PageUptodate(page));

		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));

		kaddr = page_address(page);
		read_extent_buffer(src, kaddr + offset, src_offset, cur);

		src_offset += cur;
		len -= cur;
		offset = 0;
		i++;
	}
}

static void move_pages(struct page *dst_page, struct page *src_page,
		       unsigned long dst_off, unsigned long src_off,
		       unsigned long len)
{
	char *dst_kaddr = page_address(dst_page);
	if (dst_page == src_page) {
		memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
	} else {
		char *src_kaddr = page_address(src_page);
		char *p = dst_kaddr + dst_off + len;
		char *s = src_kaddr + src_off + len;

		while (len--)
			*--p = *--s;
	}
}

static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
{
	unsigned long distance = (src > dst) ? src - dst : dst - src;
	return distance < len;
}

static void copy_pages(struct page *dst_page, struct page *src_page,
		       unsigned long dst_off, unsigned long src_off,
		       unsigned long len)
{
	char *dst_kaddr = page_address(dst_page);
	char *src_kaddr;
	int must_memmove = 0;

	if (dst_page != src_page) {
		src_kaddr = page_address(src_page);
	} else {
		src_kaddr = dst_kaddr;
		if (areas_overlap(src_off, dst_off, len))
			must_memmove = 1;
	}

	if (must_memmove)
		memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
	else
		memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
}

void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
			  unsigned long src_offset, unsigned long len)
{
	size_t cur;
	size_t dst_off_in_page;
	size_t src_off_in_page;
	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
	unsigned long dst_i;
	unsigned long src_i;

	if (src_offset + len > dst->len) {
		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
		       "len %lu dst len %lu\n", src_offset, len, dst->len);
		BUG_ON(1);
	}
	if (dst_offset + len > dst->len) {
		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
		       "len %lu dst len %lu\n", dst_offset, len, dst->len);
		BUG_ON(1);
	}

	while (len > 0) {
		dst_off_in_page = (start_offset + dst_offset) &
			((unsigned long)PAGE_CACHE_SIZE - 1);
		src_off_in_page = (start_offset + src_offset) &
			((unsigned long)PAGE_CACHE_SIZE - 1);

		dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
		src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;

		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
					       src_off_in_page));
		cur = min_t(unsigned long, cur,
			    (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));

		copy_pages(extent_buffer_page(dst, dst_i),
			   extent_buffer_page(dst, src_i),
			   dst_off_in_page, src_off_in_page, cur);

		src_offset += cur;
		dst_offset += cur;
		len -= cur;
	}
}
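
/*
 * memmove() analogue within a single extent buffer: the source and
 * destination ranges may overlap.  A forward move (dst_offset < src_offset)
 * is just memcpy_extent_buffer(); otherwise we copy backwards, page chunk by
 * page chunk via move_pages(), so overlapping bytes are not clobbered.
 */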
void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
			   unsigned long src_offset, unsigned long len)
{
	size_t cur;
	size_t dst_off_in_page;
	size_t src_off_in_page;
	unsigned long dst_end = dst_offset + len - 1;
	unsigned long src_end = src_offset + len - 1;
	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
	unsigned long dst_i;
	unsigned long src_i;

	if (src_offset + len > dst->len) {
		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
		       "len %lu len %lu\n", src_offset, len, dst->len);
		BUG_ON(1);
	}
	if (dst_offset + len > dst->len) {
		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
		       "len %lu len %lu\n", dst_offset, len, dst->len);
		BUG_ON(1);
	}
	if (dst_offset < src_offset) {
		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
		return;
	}
	while (len > 0) {
		dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
		src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;

		dst_off_in_page = (start_offset + dst_end) &
			((unsigned long)PAGE_CACHE_SIZE - 1);
		src_off_in_page = (start_offset + src_end) &
			((unsigned long)PAGE_CACHE_SIZE - 1);

		cur = min_t(unsigned long, len, src_off_in_page + 1);
		cur = min(cur, dst_off_in_page + 1);
		move_pages(extent_buffer_page(dst, dst_i),
			   extent_buffer_page(dst, src_i),
			   dst_off_in_page - cur + 1,
			   src_off_in_page - cur + 1, cur);

		dst_end -= cur;
		src_end -= cur;
		len -= cur;
	}
}

int try_release_extent_buffer(struct page *page)
{
	struct extent_buffer *eb;

	/*
	 * We need to make sure nobody is attaching this page to an eb right
	 * now.
	 */
	spin_lock(&page->mapping->private_lock);
	if (!PagePrivate(page)) {
		spin_unlock(&page->mapping->private_lock);
		return 1;
	}

	eb = (struct extent_buffer *)page->private;
	BUG_ON(!eb);

	/*
	 * This is a little awful but should be ok, we need to make sure that
	 * the eb doesn't disappear out from under us while we're looking at
	 * this page.
	 */
	spin_lock(&eb->refs_lock);
	if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
		spin_unlock(&eb->refs_lock);
		spin_unlock(&page->mapping->private_lock);
		return 0;
	}
	spin_unlock(&page->mapping->private_lock);

	/*
	 * If tree ref isn't set then we know the ref on this eb is a real ref,
	 * so just return, this page will likely be freed soon anyway.
	 */
	if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
		spin_unlock(&eb->refs_lock);
		return 0;
	}

	return release_extent_buffer(eb);
}