1 #include <linux/bitops.h> 2 #include <linux/slab.h> 3 #include <linux/bio.h> 4 #include <linux/mm.h> 5 #include <linux/pagemap.h> 6 #include <linux/page-flags.h> 7 #include <linux/module.h> 8 #include <linux/spinlock.h> 9 #include <linux/blkdev.h> 10 #include <linux/swap.h> 11 #include <linux/writeback.h> 12 #include <linux/pagevec.h> 13 #include <linux/prefetch.h> 14 #include <linux/cleancache.h> 15 #include "extent_io.h" 16 #include "extent_map.h" 17 #include "compat.h" 18 #include "ctree.h" 19 #include "btrfs_inode.h" 20 #include "volumes.h" 21 #include "check-integrity.h" 22 #include "locking.h" 23 #include "rcu-string.h" 24 25 static struct kmem_cache *extent_state_cache; 26 static struct kmem_cache *extent_buffer_cache; 27 28 static LIST_HEAD(buffers); 29 static LIST_HEAD(states); 30 31 #define LEAK_DEBUG 0 32 #if LEAK_DEBUG 33 static DEFINE_SPINLOCK(leak_lock); 34 #endif 35 36 #define BUFFER_LRU_MAX 64 37 38 struct tree_entry { 39 u64 start; 40 u64 end; 41 struct rb_node rb_node; 42 }; 43 44 struct extent_page_data { 45 struct bio *bio; 46 struct extent_io_tree *tree; 47 get_extent_t *get_extent; 48 unsigned long bio_flags; 49 50 /* tells writepage not to lock the state bits for this range 51 * it still does the unlocking 52 */ 53 unsigned int extent_locked:1; 54 55 /* tells the submit_bio code to use a WRITE_SYNC */ 56 unsigned int sync_io:1; 57 }; 58 59 static noinline void flush_write_bio(void *data); 60 static inline struct btrfs_fs_info * 61 tree_fs_info(struct extent_io_tree *tree) 62 { 63 return btrfs_sb(tree->mapping->host->i_sb); 64 } 65 66 int __init extent_io_init(void) 67 { 68 extent_state_cache = kmem_cache_create("btrfs_extent_state", 69 sizeof(struct extent_state), 0, 70 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 71 if (!extent_state_cache) 72 return -ENOMEM; 73 74 extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", 75 sizeof(struct extent_buffer), 0, 76 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 77 if (!extent_buffer_cache) 78 goto free_state_cache; 79 return 0; 80 81 free_state_cache: 82 kmem_cache_destroy(extent_state_cache); 83 return -ENOMEM; 84 } 85 86 void extent_io_exit(void) 87 { 88 struct extent_state *state; 89 struct extent_buffer *eb; 90 91 while (!list_empty(&states)) { 92 state = list_entry(states.next, struct extent_state, leak_list); 93 printk(KERN_ERR "btrfs state leak: start %llu end %llu " 94 "state %lu in tree %p refs %d\n", 95 (unsigned long long)state->start, 96 (unsigned long long)state->end, 97 state->state, state->tree, atomic_read(&state->refs)); 98 list_del(&state->leak_list); 99 kmem_cache_free(extent_state_cache, state); 100 101 } 102 103 while (!list_empty(&buffers)) { 104 eb = list_entry(buffers.next, struct extent_buffer, leak_list); 105 printk(KERN_ERR "btrfs buffer leak start %llu len %lu " 106 "refs %d\n", (unsigned long long)eb->start, 107 eb->len, atomic_read(&eb->refs)); 108 list_del(&eb->leak_list); 109 kmem_cache_free(extent_buffer_cache, eb); 110 } 111 112 /* 113 * Make sure all delayed rcu free are flushed before we 114 * destroy caches. 
115 */ 116 rcu_barrier(); 117 if (extent_state_cache) 118 kmem_cache_destroy(extent_state_cache); 119 if (extent_buffer_cache) 120 kmem_cache_destroy(extent_buffer_cache); 121 } 122 123 void extent_io_tree_init(struct extent_io_tree *tree, 124 struct address_space *mapping) 125 { 126 tree->state = RB_ROOT; 127 INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC); 128 tree->ops = NULL; 129 tree->dirty_bytes = 0; 130 spin_lock_init(&tree->lock); 131 spin_lock_init(&tree->buffer_lock); 132 tree->mapping = mapping; 133 } 134 135 static struct extent_state *alloc_extent_state(gfp_t mask) 136 { 137 struct extent_state *state; 138 #if LEAK_DEBUG 139 unsigned long flags; 140 #endif 141 142 state = kmem_cache_alloc(extent_state_cache, mask); 143 if (!state) 144 return state; 145 state->state = 0; 146 state->private = 0; 147 state->tree = NULL; 148 #if LEAK_DEBUG 149 spin_lock_irqsave(&leak_lock, flags); 150 list_add(&state->leak_list, &states); 151 spin_unlock_irqrestore(&leak_lock, flags); 152 #endif 153 atomic_set(&state->refs, 1); 154 init_waitqueue_head(&state->wq); 155 trace_alloc_extent_state(state, mask, _RET_IP_); 156 return state; 157 } 158 159 void free_extent_state(struct extent_state *state) 160 { 161 if (!state) 162 return; 163 if (atomic_dec_and_test(&state->refs)) { 164 #if LEAK_DEBUG 165 unsigned long flags; 166 #endif 167 WARN_ON(state->tree); 168 #if LEAK_DEBUG 169 spin_lock_irqsave(&leak_lock, flags); 170 list_del(&state->leak_list); 171 spin_unlock_irqrestore(&leak_lock, flags); 172 #endif 173 trace_free_extent_state(state, _RET_IP_); 174 kmem_cache_free(extent_state_cache, state); 175 } 176 } 177 178 static struct rb_node *tree_insert(struct rb_root *root, u64 offset, 179 struct rb_node *node) 180 { 181 struct rb_node **p = &root->rb_node; 182 struct rb_node *parent = NULL; 183 struct tree_entry *entry; 184 185 while (*p) { 186 parent = *p; 187 entry = rb_entry(parent, struct tree_entry, rb_node); 188 189 if (offset < entry->start) 190 p = &(*p)->rb_left; 191 else if (offset > entry->end) 192 p = &(*p)->rb_right; 193 else 194 return parent; 195 } 196 197 rb_link_node(node, parent, p); 198 rb_insert_color(node, root); 199 return NULL; 200 } 201 202 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, 203 struct rb_node **prev_ret, 204 struct rb_node **next_ret) 205 { 206 struct rb_root *root = &tree->state; 207 struct rb_node *n = root->rb_node; 208 struct rb_node *prev = NULL; 209 struct rb_node *orig_prev = NULL; 210 struct tree_entry *entry; 211 struct tree_entry *prev_entry = NULL; 212 213 while (n) { 214 entry = rb_entry(n, struct tree_entry, rb_node); 215 prev = n; 216 prev_entry = entry; 217 218 if (offset < entry->start) 219 n = n->rb_left; 220 else if (offset > entry->end) 221 n = n->rb_right; 222 else 223 return n; 224 } 225 226 if (prev_ret) { 227 orig_prev = prev; 228 while (prev && offset > prev_entry->end) { 229 prev = rb_next(prev); 230 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 231 } 232 *prev_ret = prev; 233 prev = orig_prev; 234 } 235 236 if (next_ret) { 237 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 238 while (prev && offset < prev_entry->start) { 239 prev = rb_prev(prev); 240 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 241 } 242 *next_ret = prev; 243 } 244 return NULL; 245 } 246 247 static inline struct rb_node *tree_search(struct extent_io_tree *tree, 248 u64 offset) 249 { 250 struct rb_node *prev = NULL; 251 struct rb_node *ret; 252 253 ret = __etree_search(tree, offset, &prev, NULL); 254 if (!ret) 
255 return prev; 256 return ret; 257 } 258 259 static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, 260 struct extent_state *other) 261 { 262 if (tree->ops && tree->ops->merge_extent_hook) 263 tree->ops->merge_extent_hook(tree->mapping->host, new, 264 other); 265 } 266 267 /* 268 * utility function to look for merge candidates inside a given range. 269 * Any extents with matching state are merged together into a single 270 * extent in the tree. Extents with EXTENT_IO in their state field 271 * are not merged because the end_io handlers need to be able to do 272 * operations on them without sleeping (or doing allocations/splits). 273 * 274 * This should be called with the tree lock held. 275 */ 276 static void merge_state(struct extent_io_tree *tree, 277 struct extent_state *state) 278 { 279 struct extent_state *other; 280 struct rb_node *other_node; 281 282 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 283 return; 284 285 other_node = rb_prev(&state->rb_node); 286 if (other_node) { 287 other = rb_entry(other_node, struct extent_state, rb_node); 288 if (other->end == state->start - 1 && 289 other->state == state->state) { 290 merge_cb(tree, state, other); 291 state->start = other->start; 292 other->tree = NULL; 293 rb_erase(&other->rb_node, &tree->state); 294 free_extent_state(other); 295 } 296 } 297 other_node = rb_next(&state->rb_node); 298 if (other_node) { 299 other = rb_entry(other_node, struct extent_state, rb_node); 300 if (other->start == state->end + 1 && 301 other->state == state->state) { 302 merge_cb(tree, state, other); 303 state->end = other->end; 304 other->tree = NULL; 305 rb_erase(&other->rb_node, &tree->state); 306 free_extent_state(other); 307 } 308 } 309 } 310 311 static void set_state_cb(struct extent_io_tree *tree, 312 struct extent_state *state, int *bits) 313 { 314 if (tree->ops && tree->ops->set_bit_hook) 315 tree->ops->set_bit_hook(tree->mapping->host, state, bits); 316 } 317 318 static void clear_state_cb(struct extent_io_tree *tree, 319 struct extent_state *state, int *bits) 320 { 321 if (tree->ops && tree->ops->clear_bit_hook) 322 tree->ops->clear_bit_hook(tree->mapping->host, state, bits); 323 } 324 325 static void set_state_bits(struct extent_io_tree *tree, 326 struct extent_state *state, int *bits); 327 328 /* 329 * insert an extent_state struct into the tree. 'bits' are set on the 330 * struct before it is inserted. 331 * 332 * This may return -EEXIST if the extent is already there, in which case the 333 * state struct is freed. 334 * 335 * The tree lock is not taken internally. This is a utility function and 336 * probably isn't what you want to call (see set/clear_extent_bit). 
337 */ 338 static int insert_state(struct extent_io_tree *tree, 339 struct extent_state *state, u64 start, u64 end, 340 int *bits) 341 { 342 struct rb_node *node; 343 344 if (end < start) { 345 printk(KERN_ERR "btrfs end < start %llu %llu\n", 346 (unsigned long long)end, 347 (unsigned long long)start); 348 WARN_ON(1); 349 } 350 state->start = start; 351 state->end = end; 352 353 set_state_bits(tree, state, bits); 354 355 node = tree_insert(&tree->state, end, &state->rb_node); 356 if (node) { 357 struct extent_state *found; 358 found = rb_entry(node, struct extent_state, rb_node); 359 printk(KERN_ERR "btrfs found node %llu %llu on insert of " 360 "%llu %llu\n", (unsigned long long)found->start, 361 (unsigned long long)found->end, 362 (unsigned long long)start, (unsigned long long)end); 363 return -EEXIST; 364 } 365 state->tree = tree; 366 merge_state(tree, state); 367 return 0; 368 } 369 370 static void split_cb(struct extent_io_tree *tree, struct extent_state *orig, 371 u64 split) 372 { 373 if (tree->ops && tree->ops->split_extent_hook) 374 tree->ops->split_extent_hook(tree->mapping->host, orig, split); 375 } 376 377 /* 378 * split a given extent state struct in two, inserting the preallocated 379 * struct 'prealloc' as the newly created second half. 'split' indicates an 380 * offset inside 'orig' where it should be split. 381 * 382 * Before calling, 383 * the tree has 'orig' at [orig->start, orig->end]. After calling, there 384 * are two extent state structs in the tree: 385 * prealloc: [orig->start, split - 1] 386 * orig: [ split, orig->end ] 387 * 388 * The tree locks are not taken by this function. They need to be held 389 * by the caller. 390 */ 391 static int split_state(struct extent_io_tree *tree, struct extent_state *orig, 392 struct extent_state *prealloc, u64 split) 393 { 394 struct rb_node *node; 395 396 split_cb(tree, orig, split); 397 398 prealloc->start = orig->start; 399 prealloc->end = split - 1; 400 prealloc->state = orig->state; 401 orig->start = split; 402 403 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); 404 if (node) { 405 free_extent_state(prealloc); 406 return -EEXIST; 407 } 408 prealloc->tree = tree; 409 return 0; 410 } 411 412 static struct extent_state *next_state(struct extent_state *state) 413 { 414 struct rb_node *next = rb_next(&state->rb_node); 415 if (next) 416 return rb_entry(next, struct extent_state, rb_node); 417 else 418 return NULL; 419 } 420 421 /* 422 * utility function to clear some bits in an extent state struct. 423 * it will optionally wake up any one waiting on this state (wake == 1). 
424 * 425 * If no bits are set on the state struct after clearing things, the 426 * struct is freed and removed from the tree 427 */ 428 static struct extent_state *clear_state_bit(struct extent_io_tree *tree, 429 struct extent_state *state, 430 int *bits, int wake) 431 { 432 struct extent_state *next; 433 int bits_to_clear = *bits & ~EXTENT_CTLBITS; 434 435 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 436 u64 range = state->end - state->start + 1; 437 WARN_ON(range > tree->dirty_bytes); 438 tree->dirty_bytes -= range; 439 } 440 clear_state_cb(tree, state, bits); 441 state->state &= ~bits_to_clear; 442 if (wake) 443 wake_up(&state->wq); 444 if (state->state == 0) { 445 next = next_state(state); 446 if (state->tree) { 447 rb_erase(&state->rb_node, &tree->state); 448 state->tree = NULL; 449 free_extent_state(state); 450 } else { 451 WARN_ON(1); 452 } 453 } else { 454 merge_state(tree, state); 455 next = next_state(state); 456 } 457 return next; 458 } 459 460 static struct extent_state * 461 alloc_extent_state_atomic(struct extent_state *prealloc) 462 { 463 if (!prealloc) 464 prealloc = alloc_extent_state(GFP_ATOMIC); 465 466 return prealloc; 467 } 468 469 void extent_io_tree_panic(struct extent_io_tree *tree, int err) 470 { 471 btrfs_panic(tree_fs_info(tree), err, "Locking error: " 472 "Extent tree was modified by another " 473 "thread while locked."); 474 } 475 476 /* 477 * clear some bits on a range in the tree. This may require splitting 478 * or inserting elements in the tree, so the gfp mask is used to 479 * indicate which allocations or sleeping are allowed. 480 * 481 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove 482 * the given range from the tree regardless of state (ie for truncate). 483 * 484 * the range [start, end] is inclusive. 485 * 486 * This takes the tree lock, and returns 0 on success and < 0 on error. 
487 */ 488 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 489 int bits, int wake, int delete, 490 struct extent_state **cached_state, 491 gfp_t mask) 492 { 493 struct extent_state *state; 494 struct extent_state *cached; 495 struct extent_state *prealloc = NULL; 496 struct rb_node *node; 497 u64 last_end; 498 int err; 499 int clear = 0; 500 501 if (delete) 502 bits |= ~EXTENT_CTLBITS; 503 bits |= EXTENT_FIRST_DELALLOC; 504 505 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 506 clear = 1; 507 again: 508 if (!prealloc && (mask & __GFP_WAIT)) { 509 prealloc = alloc_extent_state(mask); 510 if (!prealloc) 511 return -ENOMEM; 512 } 513 514 spin_lock(&tree->lock); 515 if (cached_state) { 516 cached = *cached_state; 517 518 if (clear) { 519 *cached_state = NULL; 520 cached_state = NULL; 521 } 522 523 if (cached && cached->tree && cached->start <= start && 524 cached->end > start) { 525 if (clear) 526 atomic_dec(&cached->refs); 527 state = cached; 528 goto hit_next; 529 } 530 if (clear) 531 free_extent_state(cached); 532 } 533 /* 534 * this search will find the extents that end after 535 * our range starts 536 */ 537 node = tree_search(tree, start); 538 if (!node) 539 goto out; 540 state = rb_entry(node, struct extent_state, rb_node); 541 hit_next: 542 if (state->start > end) 543 goto out; 544 WARN_ON(state->end < start); 545 last_end = state->end; 546 547 /* the state doesn't have the wanted bits, go ahead */ 548 if (!(state->state & bits)) { 549 state = next_state(state); 550 goto next; 551 } 552 553 /* 554 * | ---- desired range ---- | 555 * | state | or 556 * | ------------- state -------------- | 557 * 558 * We need to split the extent we found, and may flip 559 * bits on second half. 560 * 561 * If the extent we found extends past our range, we 562 * just split and search again. It'll get split again 563 * the next time though. 564 * 565 * If the extent we found is inside our range, we clear 566 * the desired bit on it. 
567 */ 568 569 if (state->start < start) { 570 prealloc = alloc_extent_state_atomic(prealloc); 571 BUG_ON(!prealloc); 572 err = split_state(tree, state, prealloc, start); 573 if (err) 574 extent_io_tree_panic(tree, err); 575 576 prealloc = NULL; 577 if (err) 578 goto out; 579 if (state->end <= end) { 580 state = clear_state_bit(tree, state, &bits, wake); 581 goto next; 582 } 583 goto search_again; 584 } 585 /* 586 * | ---- desired range ---- | 587 * | state | 588 * We need to split the extent, and clear the bit 589 * on the first half 590 */ 591 if (state->start <= end && state->end > end) { 592 prealloc = alloc_extent_state_atomic(prealloc); 593 BUG_ON(!prealloc); 594 err = split_state(tree, state, prealloc, end + 1); 595 if (err) 596 extent_io_tree_panic(tree, err); 597 598 if (wake) 599 wake_up(&state->wq); 600 601 clear_state_bit(tree, prealloc, &bits, wake); 602 603 prealloc = NULL; 604 goto out; 605 } 606 607 state = clear_state_bit(tree, state, &bits, wake); 608 next: 609 if (last_end == (u64)-1) 610 goto out; 611 start = last_end + 1; 612 if (start <= end && state && !need_resched()) 613 goto hit_next; 614 goto search_again; 615 616 out: 617 spin_unlock(&tree->lock); 618 if (prealloc) 619 free_extent_state(prealloc); 620 621 return 0; 622 623 search_again: 624 if (start > end) 625 goto out; 626 spin_unlock(&tree->lock); 627 if (mask & __GFP_WAIT) 628 cond_resched(); 629 goto again; 630 } 631 632 static void wait_on_state(struct extent_io_tree *tree, 633 struct extent_state *state) 634 __releases(tree->lock) 635 __acquires(tree->lock) 636 { 637 DEFINE_WAIT(wait); 638 prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); 639 spin_unlock(&tree->lock); 640 schedule(); 641 spin_lock(&tree->lock); 642 finish_wait(&state->wq, &wait); 643 } 644 645 /* 646 * waits for one or more bits to clear on a range in the state tree. 647 * The range [start, end] is inclusive. 
648 * The tree lock is taken by this function 649 */ 650 void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits) 651 { 652 struct extent_state *state; 653 struct rb_node *node; 654 655 spin_lock(&tree->lock); 656 again: 657 while (1) { 658 /* 659 * this search will find all the extents that end after 660 * our range starts 661 */ 662 node = tree_search(tree, start); 663 if (!node) 664 break; 665 666 state = rb_entry(node, struct extent_state, rb_node); 667 668 if (state->start > end) 669 goto out; 670 671 if (state->state & bits) { 672 start = state->start; 673 atomic_inc(&state->refs); 674 wait_on_state(tree, state); 675 free_extent_state(state); 676 goto again; 677 } 678 start = state->end + 1; 679 680 if (start > end) 681 break; 682 683 cond_resched_lock(&tree->lock); 684 } 685 out: 686 spin_unlock(&tree->lock); 687 } 688 689 static void set_state_bits(struct extent_io_tree *tree, 690 struct extent_state *state, 691 int *bits) 692 { 693 int bits_to_set = *bits & ~EXTENT_CTLBITS; 694 695 set_state_cb(tree, state, bits); 696 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { 697 u64 range = state->end - state->start + 1; 698 tree->dirty_bytes += range; 699 } 700 state->state |= bits_to_set; 701 } 702 703 static void cache_state(struct extent_state *state, 704 struct extent_state **cached_ptr) 705 { 706 if (cached_ptr && !(*cached_ptr)) { 707 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) { 708 *cached_ptr = state; 709 atomic_inc(&state->refs); 710 } 711 } 712 } 713 714 static void uncache_state(struct extent_state **cached_ptr) 715 { 716 if (cached_ptr && (*cached_ptr)) { 717 struct extent_state *state = *cached_ptr; 718 *cached_ptr = NULL; 719 free_extent_state(state); 720 } 721 } 722 723 /* 724 * set some bits on a range in the tree. This may require allocations or 725 * sleeping, so the gfp mask is used to indicate what is allowed. 726 * 727 * If any of the exclusive bits are set, this will fail with -EEXIST if some 728 * part of the range already has the desired bits set. The start of the 729 * existing range is returned in failed_start in this case. 730 * 731 * [start, end] is inclusive This takes the tree lock. 732 */ 733 734 static int __must_check 735 __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 736 int bits, int exclusive_bits, u64 *failed_start, 737 struct extent_state **cached_state, gfp_t mask) 738 { 739 struct extent_state *state; 740 struct extent_state *prealloc = NULL; 741 struct rb_node *node; 742 int err = 0; 743 u64 last_start; 744 u64 last_end; 745 746 bits |= EXTENT_FIRST_DELALLOC; 747 again: 748 if (!prealloc && (mask & __GFP_WAIT)) { 749 prealloc = alloc_extent_state(mask); 750 BUG_ON(!prealloc); 751 } 752 753 spin_lock(&tree->lock); 754 if (cached_state && *cached_state) { 755 state = *cached_state; 756 if (state->start <= start && state->end > start && 757 state->tree) { 758 node = &state->rb_node; 759 goto hit_next; 760 } 761 } 762 /* 763 * this search will find all the extents that end after 764 * our range starts. 
765 */ 766 node = tree_search(tree, start); 767 if (!node) { 768 prealloc = alloc_extent_state_atomic(prealloc); 769 BUG_ON(!prealloc); 770 err = insert_state(tree, prealloc, start, end, &bits); 771 if (err) 772 extent_io_tree_panic(tree, err); 773 774 prealloc = NULL; 775 goto out; 776 } 777 state = rb_entry(node, struct extent_state, rb_node); 778 hit_next: 779 last_start = state->start; 780 last_end = state->end; 781 782 /* 783 * | ---- desired range ---- | 784 * | state | 785 * 786 * Just lock what we found and keep going 787 */ 788 if (state->start == start && state->end <= end) { 789 if (state->state & exclusive_bits) { 790 *failed_start = state->start; 791 err = -EEXIST; 792 goto out; 793 } 794 795 set_state_bits(tree, state, &bits); 796 cache_state(state, cached_state); 797 merge_state(tree, state); 798 if (last_end == (u64)-1) 799 goto out; 800 start = last_end + 1; 801 state = next_state(state); 802 if (start < end && state && state->start == start && 803 !need_resched()) 804 goto hit_next; 805 goto search_again; 806 } 807 808 /* 809 * | ---- desired range ---- | 810 * | state | 811 * or 812 * | ------------- state -------------- | 813 * 814 * We need to split the extent we found, and may flip bits on 815 * second half. 816 * 817 * If the extent we found extends past our 818 * range, we just split and search again. It'll get split 819 * again the next time though. 820 * 821 * If the extent we found is inside our range, we set the 822 * desired bit on it. 823 */ 824 if (state->start < start) { 825 if (state->state & exclusive_bits) { 826 *failed_start = start; 827 err = -EEXIST; 828 goto out; 829 } 830 831 prealloc = alloc_extent_state_atomic(prealloc); 832 BUG_ON(!prealloc); 833 err = split_state(tree, state, prealloc, start); 834 if (err) 835 extent_io_tree_panic(tree, err); 836 837 prealloc = NULL; 838 if (err) 839 goto out; 840 if (state->end <= end) { 841 set_state_bits(tree, state, &bits); 842 cache_state(state, cached_state); 843 merge_state(tree, state); 844 if (last_end == (u64)-1) 845 goto out; 846 start = last_end + 1; 847 state = next_state(state); 848 if (start < end && state && state->start == start && 849 !need_resched()) 850 goto hit_next; 851 } 852 goto search_again; 853 } 854 /* 855 * | ---- desired range ---- | 856 * | state | or | state | 857 * 858 * There's a hole, we need to insert something in it and 859 * ignore the extent we found. 860 */ 861 if (state->start > start) { 862 u64 this_end; 863 if (end < last_start) 864 this_end = end; 865 else 866 this_end = last_start - 1; 867 868 prealloc = alloc_extent_state_atomic(prealloc); 869 BUG_ON(!prealloc); 870 871 /* 872 * Avoid to free 'prealloc' if it can be merged with 873 * the later extent. 
874 */ 875 err = insert_state(tree, prealloc, start, this_end, 876 &bits); 877 if (err) 878 extent_io_tree_panic(tree, err); 879 880 cache_state(prealloc, cached_state); 881 prealloc = NULL; 882 start = this_end + 1; 883 goto search_again; 884 } 885 /* 886 * | ---- desired range ---- | 887 * | state | 888 * We need to split the extent, and set the bit 889 * on the first half 890 */ 891 if (state->start <= end && state->end > end) { 892 if (state->state & exclusive_bits) { 893 *failed_start = start; 894 err = -EEXIST; 895 goto out; 896 } 897 898 prealloc = alloc_extent_state_atomic(prealloc); 899 BUG_ON(!prealloc); 900 err = split_state(tree, state, prealloc, end + 1); 901 if (err) 902 extent_io_tree_panic(tree, err); 903 904 set_state_bits(tree, prealloc, &bits); 905 cache_state(prealloc, cached_state); 906 merge_state(tree, prealloc); 907 prealloc = NULL; 908 goto out; 909 } 910 911 goto search_again; 912 913 out: 914 spin_unlock(&tree->lock); 915 if (prealloc) 916 free_extent_state(prealloc); 917 918 return err; 919 920 search_again: 921 if (start > end) 922 goto out; 923 spin_unlock(&tree->lock); 924 if (mask & __GFP_WAIT) 925 cond_resched(); 926 goto again; 927 } 928 929 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits, 930 u64 *failed_start, struct extent_state **cached_state, 931 gfp_t mask) 932 { 933 return __set_extent_bit(tree, start, end, bits, 0, failed_start, 934 cached_state, mask); 935 } 936 937 938 /** 939 * convert_extent_bit - convert all bits in a given range from one bit to 940 * another 941 * @tree: the io tree to search 942 * @start: the start offset in bytes 943 * @end: the end offset in bytes (inclusive) 944 * @bits: the bits to set in this range 945 * @clear_bits: the bits to clear in this range 946 * @cached_state: state that we're going to cache 947 * @mask: the allocation mask 948 * 949 * This will go through and set bits for the given range. If any states exist 950 * already in this range they are set with the given bit and cleared of the 951 * clear_bits. This is only meant to be used by things that are mergeable, ie 952 * converting from say DELALLOC to DIRTY. This is not meant to be used with 953 * boundary bits like LOCK. 954 */ 955 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 956 int bits, int clear_bits, 957 struct extent_state **cached_state, gfp_t mask) 958 { 959 struct extent_state *state; 960 struct extent_state *prealloc = NULL; 961 struct rb_node *node; 962 int err = 0; 963 u64 last_start; 964 u64 last_end; 965 966 again: 967 if (!prealloc && (mask & __GFP_WAIT)) { 968 prealloc = alloc_extent_state(mask); 969 if (!prealloc) 970 return -ENOMEM; 971 } 972 973 spin_lock(&tree->lock); 974 if (cached_state && *cached_state) { 975 state = *cached_state; 976 if (state->start <= start && state->end > start && 977 state->tree) { 978 node = &state->rb_node; 979 goto hit_next; 980 } 981 } 982 983 /* 984 * this search will find all the extents that end after 985 * our range starts. 
986 */ 987 node = tree_search(tree, start); 988 if (!node) { 989 prealloc = alloc_extent_state_atomic(prealloc); 990 if (!prealloc) { 991 err = -ENOMEM; 992 goto out; 993 } 994 err = insert_state(tree, prealloc, start, end, &bits); 995 prealloc = NULL; 996 if (err) 997 extent_io_tree_panic(tree, err); 998 goto out; 999 } 1000 state = rb_entry(node, struct extent_state, rb_node); 1001 hit_next: 1002 last_start = state->start; 1003 last_end = state->end; 1004 1005 /* 1006 * | ---- desired range ---- | 1007 * | state | 1008 * 1009 * Just lock what we found and keep going 1010 */ 1011 if (state->start == start && state->end <= end) { 1012 set_state_bits(tree, state, &bits); 1013 cache_state(state, cached_state); 1014 state = clear_state_bit(tree, state, &clear_bits, 0); 1015 if (last_end == (u64)-1) 1016 goto out; 1017 start = last_end + 1; 1018 if (start < end && state && state->start == start && 1019 !need_resched()) 1020 goto hit_next; 1021 goto search_again; 1022 } 1023 1024 /* 1025 * | ---- desired range ---- | 1026 * | state | 1027 * or 1028 * | ------------- state -------------- | 1029 * 1030 * We need to split the extent we found, and may flip bits on 1031 * second half. 1032 * 1033 * If the extent we found extends past our 1034 * range, we just split and search again. It'll get split 1035 * again the next time though. 1036 * 1037 * If the extent we found is inside our range, we set the 1038 * desired bit on it. 1039 */ 1040 if (state->start < start) { 1041 prealloc = alloc_extent_state_atomic(prealloc); 1042 if (!prealloc) { 1043 err = -ENOMEM; 1044 goto out; 1045 } 1046 err = split_state(tree, state, prealloc, start); 1047 if (err) 1048 extent_io_tree_panic(tree, err); 1049 prealloc = NULL; 1050 if (err) 1051 goto out; 1052 if (state->end <= end) { 1053 set_state_bits(tree, state, &bits); 1054 cache_state(state, cached_state); 1055 state = clear_state_bit(tree, state, &clear_bits, 0); 1056 if (last_end == (u64)-1) 1057 goto out; 1058 start = last_end + 1; 1059 if (start < end && state && state->start == start && 1060 !need_resched()) 1061 goto hit_next; 1062 } 1063 goto search_again; 1064 } 1065 /* 1066 * | ---- desired range ---- | 1067 * | state | or | state | 1068 * 1069 * There's a hole, we need to insert something in it and 1070 * ignore the extent we found. 1071 */ 1072 if (state->start > start) { 1073 u64 this_end; 1074 if (end < last_start) 1075 this_end = end; 1076 else 1077 this_end = last_start - 1; 1078 1079 prealloc = alloc_extent_state_atomic(prealloc); 1080 if (!prealloc) { 1081 err = -ENOMEM; 1082 goto out; 1083 } 1084 1085 /* 1086 * Avoid to free 'prealloc' if it can be merged with 1087 * the later extent. 
1088 */ 1089 err = insert_state(tree, prealloc, start, this_end, 1090 &bits); 1091 if (err) 1092 extent_io_tree_panic(tree, err); 1093 cache_state(prealloc, cached_state); 1094 prealloc = NULL; 1095 start = this_end + 1; 1096 goto search_again; 1097 } 1098 /* 1099 * | ---- desired range ---- | 1100 * | state | 1101 * We need to split the extent, and set the bit 1102 * on the first half 1103 */ 1104 if (state->start <= end && state->end > end) { 1105 prealloc = alloc_extent_state_atomic(prealloc); 1106 if (!prealloc) { 1107 err = -ENOMEM; 1108 goto out; 1109 } 1110 1111 err = split_state(tree, state, prealloc, end + 1); 1112 if (err) 1113 extent_io_tree_panic(tree, err); 1114 1115 set_state_bits(tree, prealloc, &bits); 1116 cache_state(prealloc, cached_state); 1117 clear_state_bit(tree, prealloc, &clear_bits, 0); 1118 prealloc = NULL; 1119 goto out; 1120 } 1121 1122 goto search_again; 1123 1124 out: 1125 spin_unlock(&tree->lock); 1126 if (prealloc) 1127 free_extent_state(prealloc); 1128 1129 return err; 1130 1131 search_again: 1132 if (start > end) 1133 goto out; 1134 spin_unlock(&tree->lock); 1135 if (mask & __GFP_WAIT) 1136 cond_resched(); 1137 goto again; 1138 } 1139 1140 /* wrappers around set/clear extent bit */ 1141 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 1142 gfp_t mask) 1143 { 1144 return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, 1145 NULL, mask); 1146 } 1147 1148 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1149 int bits, gfp_t mask) 1150 { 1151 return set_extent_bit(tree, start, end, bits, NULL, 1152 NULL, mask); 1153 } 1154 1155 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1156 int bits, gfp_t mask) 1157 { 1158 return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); 1159 } 1160 1161 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 1162 struct extent_state **cached_state, gfp_t mask) 1163 { 1164 return set_extent_bit(tree, start, end, 1165 EXTENT_DELALLOC | EXTENT_UPTODATE, 1166 NULL, cached_state, mask); 1167 } 1168 1169 int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, 1170 struct extent_state **cached_state, gfp_t mask) 1171 { 1172 return set_extent_bit(tree, start, end, 1173 EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG, 1174 NULL, cached_state, mask); 1175 } 1176 1177 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 1178 gfp_t mask) 1179 { 1180 return clear_extent_bit(tree, start, end, 1181 EXTENT_DIRTY | EXTENT_DELALLOC | 1182 EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); 1183 } 1184 1185 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 1186 gfp_t mask) 1187 { 1188 return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, 1189 NULL, mask); 1190 } 1191 1192 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 1193 struct extent_state **cached_state, gfp_t mask) 1194 { 1195 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 1196 cached_state, mask); 1197 } 1198 1199 int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 1200 struct extent_state **cached_state, gfp_t mask) 1201 { 1202 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, 1203 cached_state, mask); 1204 } 1205 1206 /* 1207 * either insert or lock state struct between start and end use mask to tell 1208 * us if waiting is desired. 
1209 */ 1210 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1211 int bits, struct extent_state **cached_state) 1212 { 1213 int err; 1214 u64 failed_start; 1215 while (1) { 1216 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, 1217 EXTENT_LOCKED, &failed_start, 1218 cached_state, GFP_NOFS); 1219 if (err == -EEXIST) { 1220 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); 1221 start = failed_start; 1222 } else 1223 break; 1224 WARN_ON(start > end); 1225 } 1226 return err; 1227 } 1228 1229 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end) 1230 { 1231 return lock_extent_bits(tree, start, end, 0, NULL); 1232 } 1233 1234 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) 1235 { 1236 int err; 1237 u64 failed_start; 1238 1239 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, 1240 &failed_start, NULL, GFP_NOFS); 1241 if (err == -EEXIST) { 1242 if (failed_start > start) 1243 clear_extent_bit(tree, start, failed_start - 1, 1244 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); 1245 return 0; 1246 } 1247 return 1; 1248 } 1249 1250 int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, 1251 struct extent_state **cached, gfp_t mask) 1252 { 1253 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, 1254 mask); 1255 } 1256 1257 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) 1258 { 1259 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, 1260 GFP_NOFS); 1261 } 1262 1263 /* 1264 * helper function to set both pages and extents in the tree writeback 1265 */ 1266 static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) 1267 { 1268 unsigned long index = start >> PAGE_CACHE_SHIFT; 1269 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1270 struct page *page; 1271 1272 while (index <= end_index) { 1273 page = find_get_page(tree->mapping, index); 1274 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1275 set_page_writeback(page); 1276 page_cache_release(page); 1277 index++; 1278 } 1279 return 0; 1280 } 1281 1282 /* find the first state struct with 'bits' set after 'start', and 1283 * return it. tree->lock must be held. NULL will returned if 1284 * nothing was found after 'start' 1285 */ 1286 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, 1287 u64 start, int bits) 1288 { 1289 struct rb_node *node; 1290 struct extent_state *state; 1291 1292 /* 1293 * this search will find all the extents that end after 1294 * our range starts. 1295 */ 1296 node = tree_search(tree, start); 1297 if (!node) 1298 goto out; 1299 1300 while (1) { 1301 state = rb_entry(node, struct extent_state, rb_node); 1302 if (state->end >= start && (state->state & bits)) 1303 return state; 1304 1305 node = rb_next(node); 1306 if (!node) 1307 break; 1308 } 1309 out: 1310 return NULL; 1311 } 1312 1313 /* 1314 * find the first offset in the io tree with 'bits' set. zero is 1315 * returned if we find something, and *start_ret and *end_ret are 1316 * set to reflect the state struct that was found. 1317 * 1318 * If nothing was found, 1 is returned. If found something, return 0. 
1319 */ 1320 int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 1321 u64 *start_ret, u64 *end_ret, int bits, 1322 struct extent_state **cached_state) 1323 { 1324 struct extent_state *state; 1325 struct rb_node *n; 1326 int ret = 1; 1327 1328 spin_lock(&tree->lock); 1329 if (cached_state && *cached_state) { 1330 state = *cached_state; 1331 if (state->end == start - 1 && state->tree) { 1332 n = rb_next(&state->rb_node); 1333 while (n) { 1334 state = rb_entry(n, struct extent_state, 1335 rb_node); 1336 if (state->state & bits) 1337 goto got_it; 1338 n = rb_next(n); 1339 } 1340 free_extent_state(*cached_state); 1341 *cached_state = NULL; 1342 goto out; 1343 } 1344 free_extent_state(*cached_state); 1345 *cached_state = NULL; 1346 } 1347 1348 state = find_first_extent_bit_state(tree, start, bits); 1349 got_it: 1350 if (state) { 1351 cache_state(state, cached_state); 1352 *start_ret = state->start; 1353 *end_ret = state->end; 1354 ret = 0; 1355 } 1356 out: 1357 spin_unlock(&tree->lock); 1358 return ret; 1359 } 1360 1361 /* 1362 * find a contiguous range of bytes in the file marked as delalloc, not 1363 * more than 'max_bytes'. start and end are used to return the range, 1364 * 1365 * 1 is returned if we find something, 0 if nothing was in the tree 1366 */ 1367 static noinline u64 find_delalloc_range(struct extent_io_tree *tree, 1368 u64 *start, u64 *end, u64 max_bytes, 1369 struct extent_state **cached_state) 1370 { 1371 struct rb_node *node; 1372 struct extent_state *state; 1373 u64 cur_start = *start; 1374 u64 found = 0; 1375 u64 total_bytes = 0; 1376 1377 spin_lock(&tree->lock); 1378 1379 /* 1380 * this search will find all the extents that end after 1381 * our range starts. 1382 */ 1383 node = tree_search(tree, cur_start); 1384 if (!node) { 1385 if (!found) 1386 *end = (u64)-1; 1387 goto out; 1388 } 1389 1390 while (1) { 1391 state = rb_entry(node, struct extent_state, rb_node); 1392 if (found && (state->start != cur_start || 1393 (state->state & EXTENT_BOUNDARY))) { 1394 goto out; 1395 } 1396 if (!(state->state & EXTENT_DELALLOC)) { 1397 if (!found) 1398 *end = state->end; 1399 goto out; 1400 } 1401 if (!found) { 1402 *start = state->start; 1403 *cached_state = state; 1404 atomic_inc(&state->refs); 1405 } 1406 found++; 1407 *end = state->end; 1408 cur_start = state->end + 1; 1409 node = rb_next(node); 1410 if (!node) 1411 break; 1412 total_bytes += state->end - state->start + 1; 1413 if (total_bytes >= max_bytes) 1414 break; 1415 } 1416 out: 1417 spin_unlock(&tree->lock); 1418 return found; 1419 } 1420 1421 static noinline void __unlock_for_delalloc(struct inode *inode, 1422 struct page *locked_page, 1423 u64 start, u64 end) 1424 { 1425 int ret; 1426 struct page *pages[16]; 1427 unsigned long index = start >> PAGE_CACHE_SHIFT; 1428 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1429 unsigned long nr_pages = end_index - index + 1; 1430 int i; 1431 1432 if (index == locked_page->index && end_index == index) 1433 return; 1434 1435 while (nr_pages > 0) { 1436 ret = find_get_pages_contig(inode->i_mapping, index, 1437 min_t(unsigned long, nr_pages, 1438 ARRAY_SIZE(pages)), pages); 1439 for (i = 0; i < ret; i++) { 1440 if (pages[i] != locked_page) 1441 unlock_page(pages[i]); 1442 page_cache_release(pages[i]); 1443 } 1444 nr_pages -= ret; 1445 index += ret; 1446 cond_resched(); 1447 } 1448 } 1449 1450 static noinline int lock_delalloc_pages(struct inode *inode, 1451 struct page *locked_page, 1452 u64 delalloc_start, 1453 u64 delalloc_end) 1454 { 1455 unsigned long index = 
delalloc_start >> PAGE_CACHE_SHIFT; 1456 unsigned long start_index = index; 1457 unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT; 1458 unsigned long pages_locked = 0; 1459 struct page *pages[16]; 1460 unsigned long nrpages; 1461 int ret; 1462 int i; 1463 1464 /* the caller is responsible for locking the start index */ 1465 if (index == locked_page->index && index == end_index) 1466 return 0; 1467 1468 /* skip the page at the start index */ 1469 nrpages = end_index - index + 1; 1470 while (nrpages > 0) { 1471 ret = find_get_pages_contig(inode->i_mapping, index, 1472 min_t(unsigned long, 1473 nrpages, ARRAY_SIZE(pages)), pages); 1474 if (ret == 0) { 1475 ret = -EAGAIN; 1476 goto done; 1477 } 1478 /* now we have an array of pages, lock them all */ 1479 for (i = 0; i < ret; i++) { 1480 /* 1481 * the caller is taking responsibility for 1482 * locked_page 1483 */ 1484 if (pages[i] != locked_page) { 1485 lock_page(pages[i]); 1486 if (!PageDirty(pages[i]) || 1487 pages[i]->mapping != inode->i_mapping) { 1488 ret = -EAGAIN; 1489 unlock_page(pages[i]); 1490 page_cache_release(pages[i]); 1491 goto done; 1492 } 1493 } 1494 page_cache_release(pages[i]); 1495 pages_locked++; 1496 } 1497 nrpages -= ret; 1498 index += ret; 1499 cond_resched(); 1500 } 1501 ret = 0; 1502 done: 1503 if (ret && pages_locked) { 1504 __unlock_for_delalloc(inode, locked_page, 1505 delalloc_start, 1506 ((u64)(start_index + pages_locked - 1)) << 1507 PAGE_CACHE_SHIFT); 1508 } 1509 return ret; 1510 } 1511 1512 /* 1513 * find a contiguous range of bytes in the file marked as delalloc, not 1514 * more than 'max_bytes'. start and end are used to return the range, 1515 * 1516 * 1 is returned if we find something, 0 if nothing was in the tree 1517 */ 1518 static noinline u64 find_lock_delalloc_range(struct inode *inode, 1519 struct extent_io_tree *tree, 1520 struct page *locked_page, 1521 u64 *start, u64 *end, 1522 u64 max_bytes) 1523 { 1524 u64 delalloc_start; 1525 u64 delalloc_end; 1526 u64 found; 1527 struct extent_state *cached_state = NULL; 1528 int ret; 1529 int loops = 0; 1530 1531 again: 1532 /* step one, find a bunch of delalloc bytes starting at start */ 1533 delalloc_start = *start; 1534 delalloc_end = 0; 1535 found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, 1536 max_bytes, &cached_state); 1537 if (!found || delalloc_end <= *start) { 1538 *start = delalloc_start; 1539 *end = delalloc_end; 1540 free_extent_state(cached_state); 1541 return found; 1542 } 1543 1544 /* 1545 * start comes from the offset of locked_page. We have to lock 1546 * pages in order, so we can't process delalloc bytes before 1547 * locked_page 1548 */ 1549 if (delalloc_start < *start) 1550 delalloc_start = *start; 1551 1552 /* 1553 * make sure to limit the number of pages we try to lock down 1554 * if we're looping. 
1555 */ 1556 if (delalloc_end + 1 - delalloc_start > max_bytes && loops) 1557 delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1; 1558 1559 /* step two, lock all the pages after the page that has start */ 1560 ret = lock_delalloc_pages(inode, locked_page, 1561 delalloc_start, delalloc_end); 1562 if (ret == -EAGAIN) { 1563 /* some of the pages are gone, lets avoid looping by 1564 * shortening the size of the delalloc range we're searching 1565 */ 1566 free_extent_state(cached_state); 1567 if (!loops) { 1568 unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); 1569 max_bytes = PAGE_CACHE_SIZE - offset; 1570 loops = 1; 1571 goto again; 1572 } else { 1573 found = 0; 1574 goto out_failed; 1575 } 1576 } 1577 BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */ 1578 1579 /* step three, lock the state bits for the whole range */ 1580 lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state); 1581 1582 /* then test to make sure it is all still delalloc */ 1583 ret = test_range_bit(tree, delalloc_start, delalloc_end, 1584 EXTENT_DELALLOC, 1, cached_state); 1585 if (!ret) { 1586 unlock_extent_cached(tree, delalloc_start, delalloc_end, 1587 &cached_state, GFP_NOFS); 1588 __unlock_for_delalloc(inode, locked_page, 1589 delalloc_start, delalloc_end); 1590 cond_resched(); 1591 goto again; 1592 } 1593 free_extent_state(cached_state); 1594 *start = delalloc_start; 1595 *end = delalloc_end; 1596 out_failed: 1597 return found; 1598 } 1599 1600 int extent_clear_unlock_delalloc(struct inode *inode, 1601 struct extent_io_tree *tree, 1602 u64 start, u64 end, struct page *locked_page, 1603 unsigned long op) 1604 { 1605 int ret; 1606 struct page *pages[16]; 1607 unsigned long index = start >> PAGE_CACHE_SHIFT; 1608 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1609 unsigned long nr_pages = end_index - index + 1; 1610 int i; 1611 int clear_bits = 0; 1612 1613 if (op & EXTENT_CLEAR_UNLOCK) 1614 clear_bits |= EXTENT_LOCKED; 1615 if (op & EXTENT_CLEAR_DIRTY) 1616 clear_bits |= EXTENT_DIRTY; 1617 1618 if (op & EXTENT_CLEAR_DELALLOC) 1619 clear_bits |= EXTENT_DELALLOC; 1620 1621 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); 1622 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | 1623 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | 1624 EXTENT_SET_PRIVATE2))) 1625 return 0; 1626 1627 while (nr_pages > 0) { 1628 ret = find_get_pages_contig(inode->i_mapping, index, 1629 min_t(unsigned long, 1630 nr_pages, ARRAY_SIZE(pages)), pages); 1631 for (i = 0; i < ret; i++) { 1632 1633 if (op & EXTENT_SET_PRIVATE2) 1634 SetPagePrivate2(pages[i]); 1635 1636 if (pages[i] == locked_page) { 1637 page_cache_release(pages[i]); 1638 continue; 1639 } 1640 if (op & EXTENT_CLEAR_DIRTY) 1641 clear_page_dirty_for_io(pages[i]); 1642 if (op & EXTENT_SET_WRITEBACK) 1643 set_page_writeback(pages[i]); 1644 if (op & EXTENT_END_WRITEBACK) 1645 end_page_writeback(pages[i]); 1646 if (op & EXTENT_CLEAR_UNLOCK_PAGE) 1647 unlock_page(pages[i]); 1648 page_cache_release(pages[i]); 1649 } 1650 nr_pages -= ret; 1651 index += ret; 1652 cond_resched(); 1653 } 1654 return 0; 1655 } 1656 1657 /* 1658 * count the number of bytes in the tree that have a given bit(s) 1659 * set. This can be fairly slow, except for EXTENT_DIRTY which is 1660 * cached. The total number found is returned. 
1661 */ 1662 u64 count_range_bits(struct extent_io_tree *tree, 1663 u64 *start, u64 search_end, u64 max_bytes, 1664 unsigned long bits, int contig) 1665 { 1666 struct rb_node *node; 1667 struct extent_state *state; 1668 u64 cur_start = *start; 1669 u64 total_bytes = 0; 1670 u64 last = 0; 1671 int found = 0; 1672 1673 if (search_end <= cur_start) { 1674 WARN_ON(1); 1675 return 0; 1676 } 1677 1678 spin_lock(&tree->lock); 1679 if (cur_start == 0 && bits == EXTENT_DIRTY) { 1680 total_bytes = tree->dirty_bytes; 1681 goto out; 1682 } 1683 /* 1684 * this search will find all the extents that end after 1685 * our range starts. 1686 */ 1687 node = tree_search(tree, cur_start); 1688 if (!node) 1689 goto out; 1690 1691 while (1) { 1692 state = rb_entry(node, struct extent_state, rb_node); 1693 if (state->start > search_end) 1694 break; 1695 if (contig && found && state->start > last + 1) 1696 break; 1697 if (state->end >= cur_start && (state->state & bits) == bits) { 1698 total_bytes += min(search_end, state->end) + 1 - 1699 max(cur_start, state->start); 1700 if (total_bytes >= max_bytes) 1701 break; 1702 if (!found) { 1703 *start = max(cur_start, state->start); 1704 found = 1; 1705 } 1706 last = state->end; 1707 } else if (contig && found) { 1708 break; 1709 } 1710 node = rb_next(node); 1711 if (!node) 1712 break; 1713 } 1714 out: 1715 spin_unlock(&tree->lock); 1716 return total_bytes; 1717 } 1718 1719 /* 1720 * set the private field for a given byte offset in the tree. If there isn't 1721 * an extent_state there already, this does nothing. 1722 */ 1723 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) 1724 { 1725 struct rb_node *node; 1726 struct extent_state *state; 1727 int ret = 0; 1728 1729 spin_lock(&tree->lock); 1730 /* 1731 * this search will find all the extents that end after 1732 * our range starts. 1733 */ 1734 node = tree_search(tree, start); 1735 if (!node) { 1736 ret = -ENOENT; 1737 goto out; 1738 } 1739 state = rb_entry(node, struct extent_state, rb_node); 1740 if (state->start != start) { 1741 ret = -ENOENT; 1742 goto out; 1743 } 1744 state->private = private; 1745 out: 1746 spin_unlock(&tree->lock); 1747 return ret; 1748 } 1749 1750 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) 1751 { 1752 struct rb_node *node; 1753 struct extent_state *state; 1754 int ret = 0; 1755 1756 spin_lock(&tree->lock); 1757 /* 1758 * this search will find all the extents that end after 1759 * our range starts. 1760 */ 1761 node = tree_search(tree, start); 1762 if (!node) { 1763 ret = -ENOENT; 1764 goto out; 1765 } 1766 state = rb_entry(node, struct extent_state, rb_node); 1767 if (state->start != start) { 1768 ret = -ENOENT; 1769 goto out; 1770 } 1771 *private = state->private; 1772 out: 1773 spin_unlock(&tree->lock); 1774 return ret; 1775 } 1776 1777 /* 1778 * searches a range in the state tree for a given mask. 1779 * If 'filled' == 1, this returns 1 only if every extent in the tree 1780 * has the bits set. Otherwise, 1 is returned if any bit in the 1781 * range is found set. 
1782 */ 1783 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 1784 int bits, int filled, struct extent_state *cached) 1785 { 1786 struct extent_state *state = NULL; 1787 struct rb_node *node; 1788 int bitset = 0; 1789 1790 spin_lock(&tree->lock); 1791 if (cached && cached->tree && cached->start <= start && 1792 cached->end > start) 1793 node = &cached->rb_node; 1794 else 1795 node = tree_search(tree, start); 1796 while (node && start <= end) { 1797 state = rb_entry(node, struct extent_state, rb_node); 1798 1799 if (filled && state->start > start) { 1800 bitset = 0; 1801 break; 1802 } 1803 1804 if (state->start > end) 1805 break; 1806 1807 if (state->state & bits) { 1808 bitset = 1; 1809 if (!filled) 1810 break; 1811 } else if (filled) { 1812 bitset = 0; 1813 break; 1814 } 1815 1816 if (state->end == (u64)-1) 1817 break; 1818 1819 start = state->end + 1; 1820 if (start > end) 1821 break; 1822 node = rb_next(node); 1823 if (!node) { 1824 if (filled) 1825 bitset = 0; 1826 break; 1827 } 1828 } 1829 spin_unlock(&tree->lock); 1830 return bitset; 1831 } 1832 1833 /* 1834 * helper function to set a given page up to date if all the 1835 * extents in the tree for that page are up to date 1836 */ 1837 static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) 1838 { 1839 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1840 u64 end = start + PAGE_CACHE_SIZE - 1; 1841 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) 1842 SetPageUptodate(page); 1843 } 1844 1845 /* 1846 * helper function to unlock a page if all the extents in the tree 1847 * for that page are unlocked 1848 */ 1849 static void check_page_locked(struct extent_io_tree *tree, struct page *page) 1850 { 1851 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1852 u64 end = start + PAGE_CACHE_SIZE - 1; 1853 if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) 1854 unlock_page(page); 1855 } 1856 1857 /* 1858 * helper function to end page writeback if all the extents 1859 * in the tree for that page are done with writeback 1860 */ 1861 static void check_page_writeback(struct extent_io_tree *tree, 1862 struct page *page) 1863 { 1864 end_page_writeback(page); 1865 } 1866 1867 /* 1868 * When IO fails, either with EIO or csum verification fails, we 1869 * try other mirrors that might have a good copy of the data. This 1870 * io_failure_record is used to record state as we go through all the 1871 * mirrors. If another mirror has good data, the page is set up to date 1872 * and things continue. If a good mirror can't be found, the original 1873 * bio end_io callback is called to indicate things have failed. 
1874 */ 1875 struct io_failure_record { 1876 struct page *page; 1877 u64 start; 1878 u64 len; 1879 u64 logical; 1880 unsigned long bio_flags; 1881 int this_mirror; 1882 int failed_mirror; 1883 int in_validation; 1884 }; 1885 1886 static int free_io_failure(struct inode *inode, struct io_failure_record *rec, 1887 int did_repair) 1888 { 1889 int ret; 1890 int err = 0; 1891 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 1892 1893 set_state_private(failure_tree, rec->start, 0); 1894 ret = clear_extent_bits(failure_tree, rec->start, 1895 rec->start + rec->len - 1, 1896 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); 1897 if (ret) 1898 err = ret; 1899 1900 if (did_repair) { 1901 ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, 1902 rec->start + rec->len - 1, 1903 EXTENT_DAMAGED, GFP_NOFS); 1904 if (ret && !err) 1905 err = ret; 1906 } 1907 1908 kfree(rec); 1909 return err; 1910 } 1911 1912 static void repair_io_failure_callback(struct bio *bio, int err) 1913 { 1914 complete(bio->bi_private); 1915 } 1916 1917 /* 1918 * this bypasses the standard btrfs submit functions deliberately, as 1919 * the standard behavior is to write all copies in a raid setup. here we only 1920 * want to write the one bad copy. so we do the mapping for ourselves and issue 1921 * submit_bio directly. 1922 * to avoid any synchonization issues, wait for the data after writing, which 1923 * actually prevents the read that triggered the error from finishing. 1924 * currently, there can be no more than two copies of every data bit. thus, 1925 * exactly one rewrite is required. 1926 */ 1927 int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, 1928 u64 length, u64 logical, struct page *page, 1929 int mirror_num) 1930 { 1931 struct bio *bio; 1932 struct btrfs_device *dev; 1933 DECLARE_COMPLETION_ONSTACK(compl); 1934 u64 map_length = 0; 1935 u64 sector; 1936 struct btrfs_bio *bbio = NULL; 1937 int ret; 1938 1939 BUG_ON(!mirror_num); 1940 1941 bio = bio_alloc(GFP_NOFS, 1); 1942 if (!bio) 1943 return -EIO; 1944 bio->bi_private = &compl; 1945 bio->bi_end_io = repair_io_failure_callback; 1946 bio->bi_size = 0; 1947 map_length = length; 1948 1949 ret = btrfs_map_block(map_tree, WRITE, logical, 1950 &map_length, &bbio, mirror_num); 1951 if (ret) { 1952 bio_put(bio); 1953 return -EIO; 1954 } 1955 BUG_ON(mirror_num != bbio->mirror_num); 1956 sector = bbio->stripes[mirror_num-1].physical >> 9; 1957 bio->bi_sector = sector; 1958 dev = bbio->stripes[mirror_num-1].dev; 1959 kfree(bbio); 1960 if (!dev || !dev->bdev || !dev->writeable) { 1961 bio_put(bio); 1962 return -EIO; 1963 } 1964 bio->bi_bdev = dev->bdev; 1965 bio_add_page(bio, page, length, start-page_offset(page)); 1966 btrfsic_submit_bio(WRITE_SYNC, bio); 1967 wait_for_completion(&compl); 1968 1969 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { 1970 /* try to remap that extent elsewhere? 
*/ 1971 bio_put(bio); 1972 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); 1973 return -EIO; 1974 } 1975 1976 printk_ratelimited_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu " 1977 "(dev %s sector %llu)\n", page->mapping->host->i_ino, 1978 start, rcu_str_deref(dev->name), sector); 1979 1980 bio_put(bio); 1981 return 0; 1982 } 1983 1984 int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, 1985 int mirror_num) 1986 { 1987 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 1988 u64 start = eb->start; 1989 unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); 1990 int ret = 0; 1991 1992 for (i = 0; i < num_pages; i++) { 1993 struct page *p = extent_buffer_page(eb, i); 1994 ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE, 1995 start, p, mirror_num); 1996 if (ret) 1997 break; 1998 start += PAGE_CACHE_SIZE; 1999 } 2000 2001 return ret; 2002 } 2003 2004 /* 2005 * each time an IO finishes, we do a fast check in the IO failure tree 2006 * to see if we need to process or clean up an io_failure_record 2007 */ 2008 static int clean_io_failure(u64 start, struct page *page) 2009 { 2010 u64 private; 2011 u64 private_failure; 2012 struct io_failure_record *failrec; 2013 struct btrfs_mapping_tree *map_tree; 2014 struct extent_state *state; 2015 int num_copies; 2016 int did_repair = 0; 2017 int ret; 2018 struct inode *inode = page->mapping->host; 2019 2020 private = 0; 2021 ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, 2022 (u64)-1, 1, EXTENT_DIRTY, 0); 2023 if (!ret) 2024 return 0; 2025 2026 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start, 2027 &private_failure); 2028 if (ret) 2029 return 0; 2030 2031 failrec = (struct io_failure_record *)(unsigned long) private_failure; 2032 BUG_ON(!failrec->this_mirror); 2033 2034 if (failrec->in_validation) { 2035 /* there was no real error, just free the record */ 2036 pr_debug("clean_io_failure: freeing dummy error at %llu\n", 2037 failrec->start); 2038 did_repair = 1; 2039 goto out; 2040 } 2041 2042 spin_lock(&BTRFS_I(inode)->io_tree.lock); 2043 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, 2044 failrec->start, 2045 EXTENT_LOCKED); 2046 spin_unlock(&BTRFS_I(inode)->io_tree.lock); 2047 2048 if (state && state->start == failrec->start) { 2049 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; 2050 num_copies = btrfs_num_copies(map_tree, failrec->logical, 2051 failrec->len); 2052 if (num_copies > 1) { 2053 ret = repair_io_failure(map_tree, start, failrec->len, 2054 failrec->logical, page, 2055 failrec->failed_mirror); 2056 did_repair = !ret; 2057 } 2058 } 2059 2060 out: 2061 if (!ret) 2062 ret = free_io_failure(inode, failrec, did_repair); 2063 2064 return ret; 2065 } 2066 2067 /* 2068 * this is a generic handler for readpage errors (default 2069 * readpage_io_failed_hook). if other copies exist, read those and write back 2070 * good data to the failed position. 
does not investigate in remapping the 2071 * failed extent elsewhere, hoping the device will be smart enough to do this as 2072 * needed 2073 */ 2074 2075 static int bio_readpage_error(struct bio *failed_bio, struct page *page, 2076 u64 start, u64 end, int failed_mirror, 2077 struct extent_state *state) 2078 { 2079 struct io_failure_record *failrec = NULL; 2080 u64 private; 2081 struct extent_map *em; 2082 struct inode *inode = page->mapping->host; 2083 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 2084 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 2085 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 2086 struct bio *bio; 2087 int num_copies; 2088 int ret; 2089 int read_mode; 2090 u64 logical; 2091 2092 BUG_ON(failed_bio->bi_rw & REQ_WRITE); 2093 2094 ret = get_state_private(failure_tree, start, &private); 2095 if (ret) { 2096 failrec = kzalloc(sizeof(*failrec), GFP_NOFS); 2097 if (!failrec) 2098 return -ENOMEM; 2099 failrec->start = start; 2100 failrec->len = end - start + 1; 2101 failrec->this_mirror = 0; 2102 failrec->bio_flags = 0; 2103 failrec->in_validation = 0; 2104 2105 read_lock(&em_tree->lock); 2106 em = lookup_extent_mapping(em_tree, start, failrec->len); 2107 if (!em) { 2108 read_unlock(&em_tree->lock); 2109 kfree(failrec); 2110 return -EIO; 2111 } 2112 2113 if (em->start > start || em->start + em->len < start) { 2114 free_extent_map(em); 2115 em = NULL; 2116 } 2117 read_unlock(&em_tree->lock); 2118 2119 if (!em) { 2120 kfree(failrec); 2121 return -EIO; 2122 } 2123 logical = start - em->start; 2124 logical = em->block_start + logical; 2125 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 2126 logical = em->block_start; 2127 failrec->bio_flags = EXTENT_BIO_COMPRESSED; 2128 extent_set_compress_type(&failrec->bio_flags, 2129 em->compress_type); 2130 } 2131 pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, " 2132 "len=%llu\n", logical, start, failrec->len); 2133 failrec->logical = logical; 2134 free_extent_map(em); 2135 2136 /* set the bits in the private failure tree */ 2137 ret = set_extent_bits(failure_tree, start, end, 2138 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); 2139 if (ret >= 0) 2140 ret = set_state_private(failure_tree, start, 2141 (u64)(unsigned long)failrec); 2142 /* set the bits in the inode's tree */ 2143 if (ret >= 0) 2144 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED, 2145 GFP_NOFS); 2146 if (ret < 0) { 2147 kfree(failrec); 2148 return ret; 2149 } 2150 } else { 2151 failrec = (struct io_failure_record *)(unsigned long)private; 2152 pr_debug("bio_readpage_error: (found) logical=%llu, " 2153 "start=%llu, len=%llu, validation=%d\n", 2154 failrec->logical, failrec->start, failrec->len, 2155 failrec->in_validation); 2156 /* 2157 * when data can be on disk more than twice, add to failrec here 2158 * (e.g. with a list for failed_mirror) to make 2159 * clean_io_failure() clean all those errors at once. 2160 */ 2161 } 2162 num_copies = btrfs_num_copies( 2163 &BTRFS_I(inode)->root->fs_info->mapping_tree, 2164 failrec->logical, failrec->len); 2165 if (num_copies == 1) { 2166 /* 2167 * we only have a single copy of the data, so don't bother with 2168 * all the retry and error correction code that follows. no 2169 * matter what the error is, it is very likely to persist. 2170 */ 2171 pr_debug("bio_readpage_error: cannot repair, num_copies == 1. 
" 2172 "state=%p, num_copies=%d, next_mirror %d, " 2173 "failed_mirror %d\n", state, num_copies, 2174 failrec->this_mirror, failed_mirror); 2175 free_io_failure(inode, failrec, 0); 2176 return -EIO; 2177 } 2178 2179 if (!state) { 2180 spin_lock(&tree->lock); 2181 state = find_first_extent_bit_state(tree, failrec->start, 2182 EXTENT_LOCKED); 2183 if (state && state->start != failrec->start) 2184 state = NULL; 2185 spin_unlock(&tree->lock); 2186 } 2187 2188 /* 2189 * there are two premises: 2190 * a) deliver good data to the caller 2191 * b) correct the bad sectors on disk 2192 */ 2193 if (failed_bio->bi_vcnt > 1) { 2194 /* 2195 * to fulfill b), we need to know the exact failing sectors, as 2196 * we don't want to rewrite any more than the failed ones. thus, 2197 * we need separate read requests for the failed bio 2198 * 2199 * if the following BUG_ON triggers, our validation request got 2200 * merged. we need separate requests for our algorithm to work. 2201 */ 2202 BUG_ON(failrec->in_validation); 2203 failrec->in_validation = 1; 2204 failrec->this_mirror = failed_mirror; 2205 read_mode = READ_SYNC | REQ_FAILFAST_DEV; 2206 } else { 2207 /* 2208 * we're ready to fulfill a) and b) alongside. get a good copy 2209 * of the failed sector and if we succeed, we have setup 2210 * everything for repair_io_failure to do the rest for us. 2211 */ 2212 if (failrec->in_validation) { 2213 BUG_ON(failrec->this_mirror != failed_mirror); 2214 failrec->in_validation = 0; 2215 failrec->this_mirror = 0; 2216 } 2217 failrec->failed_mirror = failed_mirror; 2218 failrec->this_mirror++; 2219 if (failrec->this_mirror == failed_mirror) 2220 failrec->this_mirror++; 2221 read_mode = READ_SYNC; 2222 } 2223 2224 if (!state || failrec->this_mirror > num_copies) { 2225 pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, " 2226 "next_mirror %d, failed_mirror %d\n", state, 2227 num_copies, failrec->this_mirror, failed_mirror); 2228 free_io_failure(inode, failrec, 0); 2229 return -EIO; 2230 } 2231 2232 bio = bio_alloc(GFP_NOFS, 1); 2233 if (!bio) { 2234 free_io_failure(inode, failrec, 0); 2235 return -EIO; 2236 } 2237 bio->bi_private = state; 2238 bio->bi_end_io = failed_bio->bi_end_io; 2239 bio->bi_sector = failrec->logical >> 9; 2240 bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; 2241 bio->bi_size = 0; 2242 2243 bio_add_page(bio, page, failrec->len, start - page_offset(page)); 2244 2245 pr_debug("bio_readpage_error: submitting new read[%#x] to " 2246 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode, 2247 failrec->this_mirror, num_copies, failrec->in_validation); 2248 2249 ret = tree->ops->submit_bio_hook(inode, read_mode, bio, 2250 failrec->this_mirror, 2251 failrec->bio_flags, 0); 2252 return ret; 2253 } 2254 2255 /* lots and lots of room for performance fixes in the end_bio funcs */ 2256 2257 int end_extent_writepage(struct page *page, int err, u64 start, u64 end) 2258 { 2259 int uptodate = (err == 0); 2260 struct extent_io_tree *tree; 2261 int ret; 2262 2263 tree = &BTRFS_I(page->mapping->host)->io_tree; 2264 2265 if (tree->ops && tree->ops->writepage_end_io_hook) { 2266 ret = tree->ops->writepage_end_io_hook(page, start, 2267 end, NULL, uptodate); 2268 if (ret) 2269 uptodate = 0; 2270 } 2271 2272 if (!uptodate) { 2273 ClearPageUptodate(page); 2274 SetPageError(page); 2275 } 2276 return 0; 2277 } 2278 2279 /* 2280 * after a writepage IO is done, we need to: 2281 * clear the uptodate bits on error 2282 * clear the writeback bits in the extent tree for this IO 2283 * 
end_page_writeback if the page has no more pending IO 2284 * 2285 * Scheduling is not allowed, so the extent state tree is expected 2286 * to have one and only one object corresponding to this IO. 2287 */ 2288 static void end_bio_extent_writepage(struct bio *bio, int err) 2289 { 2290 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 2291 struct extent_io_tree *tree; 2292 u64 start; 2293 u64 end; 2294 int whole_page; 2295 2296 do { 2297 struct page *page = bvec->bv_page; 2298 tree = &BTRFS_I(page->mapping->host)->io_tree; 2299 2300 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 2301 bvec->bv_offset; 2302 end = start + bvec->bv_len - 1; 2303 2304 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) 2305 whole_page = 1; 2306 else 2307 whole_page = 0; 2308 2309 if (--bvec >= bio->bi_io_vec) 2310 prefetchw(&bvec->bv_page->flags); 2311 2312 if (end_extent_writepage(page, err, start, end)) 2313 continue; 2314 2315 if (whole_page) 2316 end_page_writeback(page); 2317 else 2318 check_page_writeback(tree, page); 2319 } while (bvec >= bio->bi_io_vec); 2320 2321 bio_put(bio); 2322 } 2323 2324 /* 2325 * after a readpage IO is done, we need to: 2326 * clear the uptodate bits on error 2327 * set the uptodate bits if things worked 2328 * set the page up to date if all extents in the tree are uptodate 2329 * clear the lock bit in the extent tree 2330 * unlock the page if there are no other extents locked for it 2331 * 2332 * Scheduling is not allowed, so the extent state tree is expected 2333 * to have one and only one object corresponding to this IO. 2334 */ 2335 static void end_bio_extent_readpage(struct bio *bio, int err) 2336 { 2337 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 2338 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; 2339 struct bio_vec *bvec = bio->bi_io_vec; 2340 struct extent_io_tree *tree; 2341 u64 start; 2342 u64 end; 2343 int whole_page; 2344 int mirror; 2345 int ret; 2346 2347 if (err) 2348 uptodate = 0; 2349 2350 do { 2351 struct page *page = bvec->bv_page; 2352 struct extent_state *cached = NULL; 2353 struct extent_state *state; 2354 2355 pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " 2356 "mirror=%ld\n", (u64)bio->bi_sector, err, 2357 (long int)bio->bi_bdev); 2358 tree = &BTRFS_I(page->mapping->host)->io_tree; 2359 2360 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 2361 bvec->bv_offset; 2362 end = start + bvec->bv_len - 1; 2363 2364 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) 2365 whole_page = 1; 2366 else 2367 whole_page = 0; 2368 2369 if (++bvec <= bvec_end) 2370 prefetchw(&bvec->bv_page->flags); 2371 2372 spin_lock(&tree->lock); 2373 state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED); 2374 if (state && state->start == start) { 2375 /* 2376 * take a reference on the state, unlock will drop 2377 * the ref 2378 */ 2379 cache_state(state, &cached); 2380 } 2381 spin_unlock(&tree->lock); 2382 2383 mirror = (int)(unsigned long)bio->bi_bdev; 2384 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { 2385 ret = tree->ops->readpage_end_io_hook(page, start, end, 2386 state, mirror); 2387 if (ret) 2388 uptodate = 0; 2389 else 2390 clean_io_failure(start, page); 2391 } 2392 2393 if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) { 2394 ret = tree->ops->readpage_io_failed_hook(page, mirror); 2395 if (!ret && !err && 2396 test_bit(BIO_UPTODATE, &bio->bi_flags)) 2397 uptodate = 1; 2398 } else if (!uptodate) { 2399 /* 2400 * The generic bio_readpage_error handles errors the 2401 * 
following way: If possible, new read requests are 2402 * created and submitted and will end up in 2403 * end_bio_extent_readpage as well (if we're lucky, not 2404 * in the !uptodate case). In that case it returns 0 and 2405 * we just go on with the next page in our bio. If it 2406 * can't handle the error it will return -EIO and we 2407 * remain responsible for that page. 2408 */ 2409 ret = bio_readpage_error(bio, page, start, end, mirror, NULL); 2410 if (ret == 0) { 2411 uptodate = 2412 test_bit(BIO_UPTODATE, &bio->bi_flags); 2413 if (err) 2414 uptodate = 0; 2415 uncache_state(&cached); 2416 continue; 2417 } 2418 } 2419 2420 if (uptodate && tree->track_uptodate) { 2421 set_extent_uptodate(tree, start, end, &cached, 2422 GFP_ATOMIC); 2423 } 2424 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); 2425 2426 if (whole_page) { 2427 if (uptodate) { 2428 SetPageUptodate(page); 2429 } else { 2430 ClearPageUptodate(page); 2431 SetPageError(page); 2432 } 2433 unlock_page(page); 2434 } else { 2435 if (uptodate) { 2436 check_page_uptodate(tree, page); 2437 } else { 2438 ClearPageUptodate(page); 2439 SetPageError(page); 2440 } 2441 check_page_locked(tree, page); 2442 } 2443 } while (bvec <= bvec_end); 2444 2445 bio_put(bio); 2446 } 2447 2448 struct bio * 2449 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 2450 gfp_t gfp_flags) 2451 { 2452 struct bio *bio; 2453 2454 bio = bio_alloc(gfp_flags, nr_vecs); 2455 2456 if (bio == NULL && (current->flags & PF_MEMALLOC)) { 2457 while (!bio && (nr_vecs /= 2)) 2458 bio = bio_alloc(gfp_flags, nr_vecs); 2459 } 2460 2461 if (bio) { 2462 bio->bi_size = 0; 2463 bio->bi_bdev = bdev; 2464 bio->bi_sector = first_sector; 2465 } 2466 return bio; 2467 } 2468 2469 /* 2470 * Since writes are async, they will only return -ENOMEM. 2471 * Reads can return the full range of I/O error conditions. 
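as a rough sketch of how this file drives it: bio = btrfs_bio_alloc(bdev, sector, nr_vecs, GFP_NOFS); bio_add_page(bio, page, size, offset); ret = submit_one_bio(rw, bio, mirror_num, bio_flags); with any error propagated so the caller can mark the page in error.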
2472 */ 2473 static int __must_check submit_one_bio(int rw, struct bio *bio, 2474 int mirror_num, unsigned long bio_flags) 2475 { 2476 int ret = 0; 2477 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 2478 struct page *page = bvec->bv_page; 2479 struct extent_io_tree *tree = bio->bi_private; 2480 u64 start; 2481 2482 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; 2483 2484 bio->bi_private = NULL; 2485 2486 bio_get(bio); 2487 2488 if (tree->ops && tree->ops->submit_bio_hook) 2489 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 2490 mirror_num, bio_flags, start); 2491 else 2492 btrfsic_submit_bio(rw, bio); 2493 2494 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 2495 ret = -EOPNOTSUPP; 2496 bio_put(bio); 2497 return ret; 2498 } 2499 2500 static int merge_bio(struct extent_io_tree *tree, struct page *page, 2501 unsigned long offset, size_t size, struct bio *bio, 2502 unsigned long bio_flags) 2503 { 2504 int ret = 0; 2505 if (tree->ops && tree->ops->merge_bio_hook) 2506 ret = tree->ops->merge_bio_hook(page, offset, size, bio, 2507 bio_flags); 2508 BUG_ON(ret < 0); 2509 return ret; 2510 2511 } 2512 2513 static int submit_extent_page(int rw, struct extent_io_tree *tree, 2514 struct page *page, sector_t sector, 2515 size_t size, unsigned long offset, 2516 struct block_device *bdev, 2517 struct bio **bio_ret, 2518 unsigned long max_pages, 2519 bio_end_io_t end_io_func, 2520 int mirror_num, 2521 unsigned long prev_bio_flags, 2522 unsigned long bio_flags) 2523 { 2524 int ret = 0; 2525 struct bio *bio; 2526 int nr; 2527 int contig = 0; 2528 int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED; 2529 int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; 2530 size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE); 2531 2532 if (bio_ret && *bio_ret) { 2533 bio = *bio_ret; 2534 if (old_compressed) 2535 contig = bio->bi_sector == sector; 2536 else 2537 contig = bio->bi_sector + (bio->bi_size >> 9) == 2538 sector; 2539 2540 if (prev_bio_flags != bio_flags || !contig || 2541 merge_bio(tree, page, offset, page_size, bio, bio_flags) || 2542 bio_add_page(bio, page, page_size, offset) < page_size) { 2543 ret = submit_one_bio(rw, bio, mirror_num, 2544 prev_bio_flags); 2545 if (ret < 0) 2546 return ret; 2547 bio = NULL; 2548 } else { 2549 return 0; 2550 } 2551 } 2552 if (this_compressed) 2553 nr = BIO_MAX_PAGES; 2554 else 2555 nr = bio_get_nr_vecs(bdev); 2556 2557 bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); 2558 if (!bio) 2559 return -ENOMEM; 2560 2561 bio_add_page(bio, page, page_size, offset); 2562 bio->bi_end_io = end_io_func; 2563 bio->bi_private = tree; 2564 2565 if (bio_ret) 2566 *bio_ret = bio; 2567 else 2568 ret = submit_one_bio(rw, bio, mirror_num, bio_flags); 2569 2570 return ret; 2571 } 2572 2573 void attach_extent_buffer_page(struct extent_buffer *eb, struct page *page) 2574 { 2575 if (!PagePrivate(page)) { 2576 SetPagePrivate(page); 2577 page_cache_get(page); 2578 set_page_private(page, (unsigned long)eb); 2579 } else { 2580 WARN_ON(page->private != (unsigned long)eb); 2581 } 2582 } 2583 2584 void set_page_extent_mapped(struct page *page) 2585 { 2586 if (!PagePrivate(page)) { 2587 SetPagePrivate(page); 2588 page_cache_get(page); 2589 set_page_private(page, EXTENT_PAGE_PRIVATE); 2590 } 2591 } 2592 2593 /* 2594 * basic readpage implementation. 
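the page is walked in block-sized steps: holes are zeroed, ranges the get_extent callback already filled are simply marked up to date, and everything else is read through submit_extent_page().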
Locked extent state structs are inserted 2595 * into the tree that are removed when the IO is done (by the end_io 2596 * handlers) 2597 * XXX JDM: This needs looking at to ensure proper page locking 2598 */ 2599 static int __extent_read_full_page(struct extent_io_tree *tree, 2600 struct page *page, 2601 get_extent_t *get_extent, 2602 struct bio **bio, int mirror_num, 2603 unsigned long *bio_flags) 2604 { 2605 struct inode *inode = page->mapping->host; 2606 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 2607 u64 page_end = start + PAGE_CACHE_SIZE - 1; 2608 u64 end; 2609 u64 cur = start; 2610 u64 extent_offset; 2611 u64 last_byte = i_size_read(inode); 2612 u64 block_start; 2613 u64 cur_end; 2614 sector_t sector; 2615 struct extent_map *em; 2616 struct block_device *bdev; 2617 struct btrfs_ordered_extent *ordered; 2618 int ret; 2619 int nr = 0; 2620 size_t pg_offset = 0; 2621 size_t iosize; 2622 size_t disk_io_size; 2623 size_t blocksize = inode->i_sb->s_blocksize; 2624 unsigned long this_bio_flag = 0; 2625 2626 set_page_extent_mapped(page); 2627 2628 if (!PageUptodate(page)) { 2629 if (cleancache_get_page(page) == 0) { 2630 BUG_ON(blocksize != PAGE_SIZE); 2631 goto out; 2632 } 2633 } 2634 2635 end = page_end; 2636 while (1) { 2637 lock_extent(tree, start, end); 2638 ordered = btrfs_lookup_ordered_extent(inode, start); 2639 if (!ordered) 2640 break; 2641 unlock_extent(tree, start, end); 2642 btrfs_start_ordered_extent(inode, ordered, 1); 2643 btrfs_put_ordered_extent(ordered); 2644 } 2645 2646 if (page->index == last_byte >> PAGE_CACHE_SHIFT) { 2647 char *userpage; 2648 size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1); 2649 2650 if (zero_offset) { 2651 iosize = PAGE_CACHE_SIZE - zero_offset; 2652 userpage = kmap_atomic(page); 2653 memset(userpage + zero_offset, 0, iosize); 2654 flush_dcache_page(page); 2655 kunmap_atomic(userpage); 2656 } 2657 } 2658 while (cur <= end) { 2659 if (cur >= last_byte) { 2660 char *userpage; 2661 struct extent_state *cached = NULL; 2662 2663 iosize = PAGE_CACHE_SIZE - pg_offset; 2664 userpage = kmap_atomic(page); 2665 memset(userpage + pg_offset, 0, iosize); 2666 flush_dcache_page(page); 2667 kunmap_atomic(userpage); 2668 set_extent_uptodate(tree, cur, cur + iosize - 1, 2669 &cached, GFP_NOFS); 2670 unlock_extent_cached(tree, cur, cur + iosize - 1, 2671 &cached, GFP_NOFS); 2672 break; 2673 } 2674 em = get_extent(inode, page, pg_offset, cur, 2675 end - cur + 1, 0); 2676 if (IS_ERR_OR_NULL(em)) { 2677 SetPageError(page); 2678 unlock_extent(tree, cur, end); 2679 break; 2680 } 2681 extent_offset = cur - em->start; 2682 BUG_ON(extent_map_end(em) <= cur); 2683 BUG_ON(end < cur); 2684 2685 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 2686 this_bio_flag = EXTENT_BIO_COMPRESSED; 2687 extent_set_compress_type(&this_bio_flag, 2688 em->compress_type); 2689 } 2690 2691 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2692 cur_end = min(extent_map_end(em) - 1, end); 2693 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); 2694 if (this_bio_flag & EXTENT_BIO_COMPRESSED) { 2695 disk_io_size = em->block_len; 2696 sector = em->block_start >> 9; 2697 } else { 2698 sector = (em->block_start + extent_offset) >> 9; 2699 disk_io_size = iosize; 2700 } 2701 bdev = em->bdev; 2702 block_start = em->block_start; 2703 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 2704 block_start = EXTENT_MAP_HOLE; 2705 free_extent_map(em); 2706 em = NULL; 2707 2708 /* we've found a hole, just zero and go on */ 2709 if (block_start == EXTENT_MAP_HOLE) { 2710 char *userpage; 2711 
struct extent_state *cached = NULL; 2712 2713 userpage = kmap_atomic(page); 2714 memset(userpage + pg_offset, 0, iosize); 2715 flush_dcache_page(page); 2716 kunmap_atomic(userpage); 2717 2718 set_extent_uptodate(tree, cur, cur + iosize - 1, 2719 &cached, GFP_NOFS); 2720 unlock_extent_cached(tree, cur, cur + iosize - 1, 2721 &cached, GFP_NOFS); 2722 cur = cur + iosize; 2723 pg_offset += iosize; 2724 continue; 2725 } 2726 /* the get_extent function already copied into the page */ 2727 if (test_range_bit(tree, cur, cur_end, 2728 EXTENT_UPTODATE, 1, NULL)) { 2729 check_page_uptodate(tree, page); 2730 unlock_extent(tree, cur, cur + iosize - 1); 2731 cur = cur + iosize; 2732 pg_offset += iosize; 2733 continue; 2734 } 2735 /* we have an inline extent but it didn't get marked up 2736 * to date. Error out 2737 */ 2738 if (block_start == EXTENT_MAP_INLINE) { 2739 SetPageError(page); 2740 unlock_extent(tree, cur, cur + iosize - 1); 2741 cur = cur + iosize; 2742 pg_offset += iosize; 2743 continue; 2744 } 2745 2746 ret = 0; 2747 if (tree->ops && tree->ops->readpage_io_hook) { 2748 ret = tree->ops->readpage_io_hook(page, cur, 2749 cur + iosize - 1); 2750 } 2751 if (!ret) { 2752 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; 2753 pnr -= page->index; 2754 ret = submit_extent_page(READ, tree, page, 2755 sector, disk_io_size, pg_offset, 2756 bdev, bio, pnr, 2757 end_bio_extent_readpage, mirror_num, 2758 *bio_flags, 2759 this_bio_flag); 2760 if (!ret) { 2761 nr++; 2762 *bio_flags = this_bio_flag; 2763 } 2764 } 2765 if (ret) { 2766 SetPageError(page); 2767 unlock_extent(tree, cur, cur + iosize - 1); 2768 } 2769 cur = cur + iosize; 2770 pg_offset += iosize; 2771 } 2772 out: 2773 if (!nr) { 2774 if (!PageError(page)) 2775 SetPageUptodate(page); 2776 unlock_page(page); 2777 } 2778 return 0; 2779 } 2780 2781 int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 2782 get_extent_t *get_extent, int mirror_num) 2783 { 2784 struct bio *bio = NULL; 2785 unsigned long bio_flags = 0; 2786 int ret; 2787 2788 ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, 2789 &bio_flags); 2790 if (bio) 2791 ret = submit_one_bio(READ, bio, mirror_num, bio_flags); 2792 return ret; 2793 } 2794 2795 static noinline void update_nr_written(struct page *page, 2796 struct writeback_control *wbc, 2797 unsigned long nr_written) 2798 { 2799 wbc->nr_to_write -= nr_written; 2800 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && 2801 wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) 2802 page->mapping->writeback_index = page->index + nr_written; 2803 } 2804 2805 /* 2806 * the writepage semantics are similar to regular writepage. extent 2807 * records are inserted to lock ranges in the tree, and as dirty areas 2808 * are found, they are marked writeback. 
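before that, any delalloc ranges covering the page are handed to the fill_delalloc hook so that allocated extents exist to write into.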
Then the lock bits are removed 2809 * and the end_io handler clears the writeback ranges 2810 */ 2811 static int __extent_writepage(struct page *page, struct writeback_control *wbc, 2812 void *data) 2813 { 2814 struct inode *inode = page->mapping->host; 2815 struct extent_page_data *epd = data; 2816 struct extent_io_tree *tree = epd->tree; 2817 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 2818 u64 delalloc_start; 2819 u64 page_end = start + PAGE_CACHE_SIZE - 1; 2820 u64 end; 2821 u64 cur = start; 2822 u64 extent_offset; 2823 u64 last_byte = i_size_read(inode); 2824 u64 block_start; 2825 u64 iosize; 2826 sector_t sector; 2827 struct extent_state *cached_state = NULL; 2828 struct extent_map *em; 2829 struct block_device *bdev; 2830 int ret; 2831 int nr = 0; 2832 size_t pg_offset = 0; 2833 size_t blocksize; 2834 loff_t i_size = i_size_read(inode); 2835 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; 2836 u64 nr_delalloc; 2837 u64 delalloc_end; 2838 int page_started; 2839 int compressed; 2840 int write_flags; 2841 unsigned long nr_written = 0; 2842 bool fill_delalloc = true; 2843 2844 if (wbc->sync_mode == WB_SYNC_ALL) 2845 write_flags = WRITE_SYNC; 2846 else 2847 write_flags = WRITE; 2848 2849 trace___extent_writepage(page, inode, wbc); 2850 2851 WARN_ON(!PageLocked(page)); 2852 2853 ClearPageError(page); 2854 2855 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2856 if (page->index > end_index || 2857 (page->index == end_index && !pg_offset)) { 2858 page->mapping->a_ops->invalidatepage(page, 0); 2859 unlock_page(page); 2860 return 0; 2861 } 2862 2863 if (page->index == end_index) { 2864 char *userpage; 2865 2866 userpage = kmap_atomic(page); 2867 memset(userpage + pg_offset, 0, 2868 PAGE_CACHE_SIZE - pg_offset); 2869 kunmap_atomic(userpage); 2870 flush_dcache_page(page); 2871 } 2872 pg_offset = 0; 2873 2874 set_page_extent_mapped(page); 2875 2876 if (!tree->ops || !tree->ops->fill_delalloc) 2877 fill_delalloc = false; 2878 2879 delalloc_start = start; 2880 delalloc_end = 0; 2881 page_started = 0; 2882 if (!epd->extent_locked && fill_delalloc) { 2883 u64 delalloc_to_write = 0; 2884 /* 2885 * make sure the wbc mapping index is at least updated 2886 * to this page. 2887 */ 2888 update_nr_written(page, wbc, 0); 2889 2890 while (delalloc_end < page_end) { 2891 nr_delalloc = find_lock_delalloc_range(inode, tree, 2892 page, 2893 &delalloc_start, 2894 &delalloc_end, 2895 128 * 1024 * 1024); 2896 if (nr_delalloc == 0) { 2897 delalloc_start = delalloc_end + 1; 2898 continue; 2899 } 2900 ret = tree->ops->fill_delalloc(inode, page, 2901 delalloc_start, 2902 delalloc_end, 2903 &page_started, 2904 &nr_written); 2905 /* File system has been set read-only */ 2906 if (ret) { 2907 SetPageError(page); 2908 goto done; 2909 } 2910 /* 2911 * delalloc_end is already one less than the total 2912 * length, so we don't subtract one from 2913 * PAGE_CACHE_SIZE 2914 */ 2915 delalloc_to_write += (delalloc_end - delalloc_start + 2916 PAGE_CACHE_SIZE) >> 2917 PAGE_CACHE_SHIFT; 2918 delalloc_start = delalloc_end + 1; 2919 } 2920 if (wbc->nr_to_write < delalloc_to_write) { 2921 int thresh = 8192; 2922 2923 if (delalloc_to_write < thresh * 2) 2924 thresh = delalloc_to_write; 2925 wbc->nr_to_write = min_t(u64, delalloc_to_write, 2926 thresh); 2927 } 2928 2929 /* did the fill delalloc function already unlock and start 2930 * the IO? 2931 */ 2932 if (page_started) { 2933 ret = 0; 2934 /* 2935 * we've unlocked the page, so we can't update 2936 * the mapping's writeback index, just update 2937 * nr_to_write. 
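(this is the same bookkeeping update_nr_written() does, minus the writeback_index update we can no longer make)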
2938 */ 2939 wbc->nr_to_write -= nr_written; 2940 goto done_unlocked; 2941 } 2942 } 2943 if (tree->ops && tree->ops->writepage_start_hook) { 2944 ret = tree->ops->writepage_start_hook(page, start, 2945 page_end); 2946 if (ret) { 2947 /* Fixup worker will requeue */ 2948 if (ret == -EBUSY) 2949 wbc->pages_skipped++; 2950 else 2951 redirty_page_for_writepage(wbc, page); 2952 update_nr_written(page, wbc, nr_written); 2953 unlock_page(page); 2954 ret = 0; 2955 goto done_unlocked; 2956 } 2957 } 2958 2959 /* 2960 * we don't want to touch the inode after unlocking the page, 2961 * so we update the mapping writeback index now 2962 */ 2963 update_nr_written(page, wbc, nr_written + 1); 2964 2965 end = page_end; 2966 if (last_byte <= start) { 2967 if (tree->ops && tree->ops->writepage_end_io_hook) 2968 tree->ops->writepage_end_io_hook(page, start, 2969 page_end, NULL, 1); 2970 goto done; 2971 } 2972 2973 blocksize = inode->i_sb->s_blocksize; 2974 2975 while (cur <= end) { 2976 if (cur >= last_byte) { 2977 if (tree->ops && tree->ops->writepage_end_io_hook) 2978 tree->ops->writepage_end_io_hook(page, cur, 2979 page_end, NULL, 1); 2980 break; 2981 } 2982 em = epd->get_extent(inode, page, pg_offset, cur, 2983 end - cur + 1, 1); 2984 if (IS_ERR_OR_NULL(em)) { 2985 SetPageError(page); 2986 break; 2987 } 2988 2989 extent_offset = cur - em->start; 2990 BUG_ON(extent_map_end(em) <= cur); 2991 BUG_ON(end < cur); 2992 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2993 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); 2994 sector = (em->block_start + extent_offset) >> 9; 2995 bdev = em->bdev; 2996 block_start = em->block_start; 2997 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 2998 free_extent_map(em); 2999 em = NULL; 3000 3001 /* 3002 * compressed and inline extents are written through other 3003 * paths in the FS 3004 */ 3005 if (compressed || block_start == EXTENT_MAP_HOLE || 3006 block_start == EXTENT_MAP_INLINE) { 3007 /* 3008 * end_io notification does not happen here for 3009 * compressed extents 3010 */ 3011 if (!compressed && tree->ops && 3012 tree->ops->writepage_end_io_hook) 3013 tree->ops->writepage_end_io_hook(page, cur, 3014 cur + iosize - 1, 3015 NULL, 1); 3016 else if (compressed) { 3017 /* we don't want to end_page_writeback on 3018 * a compressed extent. 
this happens 3019 * elsewhere 3020 */ 3021 nr++; 3022 } 3023 3024 cur += iosize; 3025 pg_offset += iosize; 3026 continue; 3027 } 3028 /* leave this out until we have a page_mkwrite call */ 3029 if (0 && !test_range_bit(tree, cur, cur + iosize - 1, 3030 EXTENT_DIRTY, 0, NULL)) { 3031 cur = cur + iosize; 3032 pg_offset += iosize; 3033 continue; 3034 } 3035 3036 if (tree->ops && tree->ops->writepage_io_hook) { 3037 ret = tree->ops->writepage_io_hook(page, cur, 3038 cur + iosize - 1); 3039 } else { 3040 ret = 0; 3041 } 3042 if (ret) { 3043 SetPageError(page); 3044 } else { 3045 unsigned long max_nr = end_index + 1; 3046 3047 set_range_writeback(tree, cur, cur + iosize - 1); 3048 if (!PageWriteback(page)) { 3049 printk(KERN_ERR "btrfs warning page %lu not " 3050 "writeback, cur %llu end %llu\n", 3051 page->index, (unsigned long long)cur, 3052 (unsigned long long)end); 3053 } 3054 3055 ret = submit_extent_page(write_flags, tree, page, 3056 sector, iosize, pg_offset, 3057 bdev, &epd->bio, max_nr, 3058 end_bio_extent_writepage, 3059 0, 0, 0); 3060 if (ret) 3061 SetPageError(page); 3062 } 3063 cur = cur + iosize; 3064 pg_offset += iosize; 3065 nr++; 3066 } 3067 done: 3068 if (nr == 0) { 3069 /* make sure the mapping tag for page dirty gets cleared */ 3070 set_page_writeback(page); 3071 end_page_writeback(page); 3072 } 3073 unlock_page(page); 3074 3075 done_unlocked: 3076 3077 /* drop our reference on any cached states */ 3078 free_extent_state(cached_state); 3079 return 0; 3080 } 3081 3082 static int eb_wait(void *word) 3083 { 3084 io_schedule(); 3085 return 0; 3086 } 3087 3088 static void wait_on_extent_buffer_writeback(struct extent_buffer *eb) 3089 { 3090 wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait, 3091 TASK_UNINTERRUPTIBLE); 3092 } 3093 3094 static int lock_extent_buffer_for_io(struct extent_buffer *eb, 3095 struct btrfs_fs_info *fs_info, 3096 struct extent_page_data *epd) 3097 { 3098 unsigned long i, num_pages; 3099 int flush = 0; 3100 int ret = 0; 3101 3102 if (!btrfs_try_tree_write_lock(eb)) { 3103 flush = 1; 3104 flush_write_bio(epd); 3105 btrfs_tree_lock(eb); 3106 } 3107 3108 if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { 3109 btrfs_tree_unlock(eb); 3110 if (!epd->sync_io) 3111 return 0; 3112 if (!flush) { 3113 flush_write_bio(epd); 3114 flush = 1; 3115 } 3116 while (1) { 3117 wait_on_extent_buffer_writeback(eb); 3118 btrfs_tree_lock(eb); 3119 if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) 3120 break; 3121 btrfs_tree_unlock(eb); 3122 } 3123 } 3124 3125 /* 3126 * We need to do this to prevent races in people who check if the eb is 3127 * under IO since we can end up having no IO bits set for a short period 3128 * of time. 
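both bit flips below are done under eb->refs_lock, so anyone who checks extent_buffer_under_io() under the same lock never sees the window where neither the DIRTY nor the WRITEBACK bit is set.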
3129 */ 3130 spin_lock(&eb->refs_lock); 3131 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3132 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3133 spin_unlock(&eb->refs_lock); 3134 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 3135 spin_lock(&fs_info->delalloc_lock); 3136 if (fs_info->dirty_metadata_bytes >= eb->len) 3137 fs_info->dirty_metadata_bytes -= eb->len; 3138 else 3139 WARN_ON(1); 3140 spin_unlock(&fs_info->delalloc_lock); 3141 ret = 1; 3142 } else { 3143 spin_unlock(&eb->refs_lock); 3144 } 3145 3146 btrfs_tree_unlock(eb); 3147 3148 if (!ret) 3149 return ret; 3150 3151 num_pages = num_extent_pages(eb->start, eb->len); 3152 for (i = 0; i < num_pages; i++) { 3153 struct page *p = extent_buffer_page(eb, i); 3154 3155 if (!trylock_page(p)) { 3156 if (!flush) { 3157 flush_write_bio(epd); 3158 flush = 1; 3159 } 3160 lock_page(p); 3161 } 3162 } 3163 3164 return ret; 3165 } 3166 3167 static void end_extent_buffer_writeback(struct extent_buffer *eb) 3168 { 3169 clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3170 smp_mb__after_clear_bit(); 3171 wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); 3172 } 3173 3174 static void end_bio_extent_buffer_writepage(struct bio *bio, int err) 3175 { 3176 int uptodate = err == 0; 3177 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 3178 struct extent_buffer *eb; 3179 int done; 3180 3181 do { 3182 struct page *page = bvec->bv_page; 3183 3184 bvec--; 3185 eb = (struct extent_buffer *)page->private; 3186 BUG_ON(!eb); 3187 done = atomic_dec_and_test(&eb->io_pages); 3188 3189 if (!uptodate || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { 3190 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3191 ClearPageUptodate(page); 3192 SetPageError(page); 3193 } 3194 3195 end_page_writeback(page); 3196 3197 if (!done) 3198 continue; 3199 3200 end_extent_buffer_writeback(eb); 3201 } while (bvec >= bio->bi_io_vec); 3202 3203 bio_put(bio); 3204 3205 } 3206 3207 static int write_one_eb(struct extent_buffer *eb, 3208 struct btrfs_fs_info *fs_info, 3209 struct writeback_control *wbc, 3210 struct extent_page_data *epd) 3211 { 3212 struct block_device *bdev = fs_info->fs_devices->latest_bdev; 3213 u64 offset = eb->start; 3214 unsigned long i, num_pages; 3215 unsigned long bio_flags = 0; 3216 int rw = (epd->sync_io ? 
WRITE_SYNC : WRITE); 3217 int ret = 0; 3218 3219 clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3220 num_pages = num_extent_pages(eb->start, eb->len); 3221 atomic_set(&eb->io_pages, num_pages); 3222 if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID) 3223 bio_flags = EXTENT_BIO_TREE_LOG; 3224 3225 for (i = 0; i < num_pages; i++) { 3226 struct page *p = extent_buffer_page(eb, i); 3227 3228 clear_page_dirty_for_io(p); 3229 set_page_writeback(p); 3230 ret = submit_extent_page(rw, eb->tree, p, offset >> 9, 3231 PAGE_CACHE_SIZE, 0, bdev, &epd->bio, 3232 -1, end_bio_extent_buffer_writepage, 3233 0, epd->bio_flags, bio_flags); 3234 epd->bio_flags = bio_flags; 3235 if (ret) { 3236 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3237 SetPageError(p); 3238 if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) 3239 end_extent_buffer_writeback(eb); 3240 ret = -EIO; 3241 break; 3242 } 3243 offset += PAGE_CACHE_SIZE; 3244 update_nr_written(p, wbc, 1); 3245 unlock_page(p); 3246 } 3247 3248 if (unlikely(ret)) { 3249 for (; i < num_pages; i++) { 3250 struct page *p = extent_buffer_page(eb, i); 3251 unlock_page(p); 3252 } 3253 } 3254 3255 return ret; 3256 } 3257 3258 int btree_write_cache_pages(struct address_space *mapping, 3259 struct writeback_control *wbc) 3260 { 3261 struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; 3262 struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; 3263 struct extent_buffer *eb, *prev_eb = NULL; 3264 struct extent_page_data epd = { 3265 .bio = NULL, 3266 .tree = tree, 3267 .extent_locked = 0, 3268 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3269 .bio_flags = 0, 3270 }; 3271 int ret = 0; 3272 int done = 0; 3273 int nr_to_write_done = 0; 3274 struct pagevec pvec; 3275 int nr_pages; 3276 pgoff_t index; 3277 pgoff_t end; /* Inclusive */ 3278 int scanned = 0; 3279 int tag; 3280 3281 pagevec_init(&pvec, 0); 3282 if (wbc->range_cyclic) { 3283 index = mapping->writeback_index; /* Start from prev offset */ 3284 end = -1; 3285 } else { 3286 index = wbc->range_start >> PAGE_CACHE_SHIFT; 3287 end = wbc->range_end >> PAGE_CACHE_SHIFT; 3288 scanned = 1; 3289 } 3290 if (wbc->sync_mode == WB_SYNC_ALL) 3291 tag = PAGECACHE_TAG_TOWRITE; 3292 else 3293 tag = PAGECACHE_TAG_DIRTY; 3294 retry: 3295 if (wbc->sync_mode == WB_SYNC_ALL) 3296 tag_pages_for_writeback(mapping, index, end); 3297 while (!done && !nr_to_write_done && (index <= end) && 3298 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 3299 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 3300 unsigned i; 3301 3302 scanned = 1; 3303 for (i = 0; i < nr_pages; i++) { 3304 struct page *page = pvec.pages[i]; 3305 3306 if (!PagePrivate(page)) 3307 continue; 3308 3309 if (!wbc->range_cyclic && page->index > end) { 3310 done = 1; 3311 break; 3312 } 3313 3314 spin_lock(&mapping->private_lock); 3315 if (!PagePrivate(page)) { 3316 spin_unlock(&mapping->private_lock); 3317 continue; 3318 } 3319 3320 eb = (struct extent_buffer *)page->private; 3321 3322 /* 3323 * Shouldn't happen and normally this would be a BUG_ON 3324 * but no sense in crashing the users box for something 3325 * we can survive anyway. 
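instead we WARN_ON() below and simply skip the page.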
3326 */ 3327 if (!eb) { 3328 spin_unlock(&mapping->private_lock); 3329 WARN_ON(1); 3330 continue; 3331 } 3332 3333 if (eb == prev_eb) { 3334 spin_unlock(&mapping->private_lock); 3335 continue; 3336 } 3337 3338 ret = atomic_inc_not_zero(&eb->refs); 3339 spin_unlock(&mapping->private_lock); 3340 if (!ret) 3341 continue; 3342 3343 prev_eb = eb; 3344 ret = lock_extent_buffer_for_io(eb, fs_info, &epd); 3345 if (!ret) { 3346 free_extent_buffer(eb); 3347 continue; 3348 } 3349 3350 ret = write_one_eb(eb, fs_info, wbc, &epd); 3351 if (ret) { 3352 done = 1; 3353 free_extent_buffer(eb); 3354 break; 3355 } 3356 free_extent_buffer(eb); 3357 3358 /* 3359 * the filesystem may choose to bump up nr_to_write. 3360 * We have to make sure to honor the new nr_to_write 3361 * at any time 3362 */ 3363 nr_to_write_done = wbc->nr_to_write <= 0; 3364 } 3365 pagevec_release(&pvec); 3366 cond_resched(); 3367 } 3368 if (!scanned && !done) { 3369 /* 3370 * We hit the last page and there is more work to be done: wrap 3371 * back to the start of the file 3372 */ 3373 scanned = 1; 3374 index = 0; 3375 goto retry; 3376 } 3377 flush_write_bio(&epd); 3378 return ret; 3379 } 3380 3381 /** 3382 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. 3383 * @mapping: address space structure to write 3384 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 3385 * @writepage: function called for each page 3386 * @data: data passed to writepage function 3387 * 3388 * If a page is already under I/O, write_cache_pages() skips it, even 3389 * if it's dirty. This is desirable behaviour for memory-cleaning writeback, 3390 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() 3391 * and msync() need to guarantee that all the data which was dirty at the time 3392 * the call was made get new I/O started against them. If wbc->sync_mode is 3393 * WB_SYNC_ALL then we were called for data integrity and we must wait for 3394 * existing IO to complete. 3395 */ 3396 static int extent_write_cache_pages(struct extent_io_tree *tree, 3397 struct address_space *mapping, 3398 struct writeback_control *wbc, 3399 writepage_t writepage, void *data, 3400 void (*flush_fn)(void *)) 3401 { 3402 struct inode *inode = mapping->host; 3403 int ret = 0; 3404 int done = 0; 3405 int nr_to_write_done = 0; 3406 struct pagevec pvec; 3407 int nr_pages; 3408 pgoff_t index; 3409 pgoff_t end; /* Inclusive */ 3410 int scanned = 0; 3411 int tag; 3412 3413 /* 3414 * We have to hold onto the inode so that ordered extents can do their 3415 * work when the IO finishes. The alternative to this is failing to add 3416 * an ordered extent if the igrab() fails there and that is a huge pain 3417 * to deal with, so instead just hold onto the inode throughout the 3418 * writepages operation. If it fails here we are freeing up the inode 3419 * anyway and we'd rather not waste our time writing out stuff that is 3420 * going to be truncated anyway. 
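the reference taken by igrab() below is dropped through btrfs_add_delayed_iput() once the loop is finished.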
3421 */ 3422 if (!igrab(inode)) 3423 return 0; 3424 3425 pagevec_init(&pvec, 0); 3426 if (wbc->range_cyclic) { 3427 index = mapping->writeback_index; /* Start from prev offset */ 3428 end = -1; 3429 } else { 3430 index = wbc->range_start >> PAGE_CACHE_SHIFT; 3431 end = wbc->range_end >> PAGE_CACHE_SHIFT; 3432 scanned = 1; 3433 } 3434 if (wbc->sync_mode == WB_SYNC_ALL) 3435 tag = PAGECACHE_TAG_TOWRITE; 3436 else 3437 tag = PAGECACHE_TAG_DIRTY; 3438 retry: 3439 if (wbc->sync_mode == WB_SYNC_ALL) 3440 tag_pages_for_writeback(mapping, index, end); 3441 while (!done && !nr_to_write_done && (index <= end) && 3442 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 3443 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 3444 unsigned i; 3445 3446 scanned = 1; 3447 for (i = 0; i < nr_pages; i++) { 3448 struct page *page = pvec.pages[i]; 3449 3450 /* 3451 * At this point we hold neither mapping->tree_lock nor 3452 * lock on the page itself: the page may be truncated or 3453 * invalidated (changing page->mapping to NULL), or even 3454 * swizzled back from swapper_space to tmpfs file 3455 * mapping 3456 */ 3457 if (tree->ops && 3458 tree->ops->write_cache_pages_lock_hook) { 3459 tree->ops->write_cache_pages_lock_hook(page, 3460 data, flush_fn); 3461 } else { 3462 if (!trylock_page(page)) { 3463 flush_fn(data); 3464 lock_page(page); 3465 } 3466 } 3467 3468 if (unlikely(page->mapping != mapping)) { 3469 unlock_page(page); 3470 continue; 3471 } 3472 3473 if (!wbc->range_cyclic && page->index > end) { 3474 done = 1; 3475 unlock_page(page); 3476 continue; 3477 } 3478 3479 if (wbc->sync_mode != WB_SYNC_NONE) { 3480 if (PageWriteback(page)) 3481 flush_fn(data); 3482 wait_on_page_writeback(page); 3483 } 3484 3485 if (PageWriteback(page) || 3486 !clear_page_dirty_for_io(page)) { 3487 unlock_page(page); 3488 continue; 3489 } 3490 3491 ret = (*writepage)(page, wbc, data); 3492 3493 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { 3494 unlock_page(page); 3495 ret = 0; 3496 } 3497 if (ret) 3498 done = 1; 3499 3500 /* 3501 * the filesystem may choose to bump up nr_to_write. 
3502 * We have to make sure to honor the new nr_to_write 3503 * at any time 3504 */ 3505 nr_to_write_done = wbc->nr_to_write <= 0; 3506 } 3507 pagevec_release(&pvec); 3508 cond_resched(); 3509 } 3510 if (!scanned && !done) { 3511 /* 3512 * We hit the last page and there is more work to be done: wrap 3513 * back to the start of the file 3514 */ 3515 scanned = 1; 3516 index = 0; 3517 goto retry; 3518 } 3519 btrfs_add_delayed_iput(inode); 3520 return ret; 3521 } 3522 3523 static void flush_epd_write_bio(struct extent_page_data *epd) 3524 { 3525 if (epd->bio) { 3526 int rw = WRITE; 3527 int ret; 3528 3529 if (epd->sync_io) 3530 rw = WRITE_SYNC; 3531 3532 ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags); 3533 BUG_ON(ret < 0); /* -ENOMEM */ 3534 epd->bio = NULL; 3535 } 3536 } 3537 3538 static noinline void flush_write_bio(void *data) 3539 { 3540 struct extent_page_data *epd = data; 3541 flush_epd_write_bio(epd); 3542 } 3543 3544 int extent_write_full_page(struct extent_io_tree *tree, struct page *page, 3545 get_extent_t *get_extent, 3546 struct writeback_control *wbc) 3547 { 3548 int ret; 3549 struct extent_page_data epd = { 3550 .bio = NULL, 3551 .tree = tree, 3552 .get_extent = get_extent, 3553 .extent_locked = 0, 3554 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3555 .bio_flags = 0, 3556 }; 3557 3558 ret = __extent_writepage(page, wbc, &epd); 3559 3560 flush_epd_write_bio(&epd); 3561 return ret; 3562 } 3563 3564 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, 3565 u64 start, u64 end, get_extent_t *get_extent, 3566 int mode) 3567 { 3568 int ret = 0; 3569 struct address_space *mapping = inode->i_mapping; 3570 struct page *page; 3571 unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >> 3572 PAGE_CACHE_SHIFT; 3573 3574 struct extent_page_data epd = { 3575 .bio = NULL, 3576 .tree = tree, 3577 .get_extent = get_extent, 3578 .extent_locked = 1, 3579 .sync_io = mode == WB_SYNC_ALL, 3580 .bio_flags = 0, 3581 }; 3582 struct writeback_control wbc_writepages = { 3583 .sync_mode = mode, 3584 .nr_to_write = nr_pages * 2, 3585 .range_start = start, 3586 .range_end = end + 1, 3587 }; 3588 3589 while (start <= end) { 3590 page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); 3591 if (clear_page_dirty_for_io(page)) 3592 ret = __extent_writepage(page, &wbc_writepages, &epd); 3593 else { 3594 if (tree->ops && tree->ops->writepage_end_io_hook) 3595 tree->ops->writepage_end_io_hook(page, start, 3596 start + PAGE_CACHE_SIZE - 1, 3597 NULL, 1); 3598 unlock_page(page); 3599 } 3600 page_cache_release(page); 3601 start += PAGE_CACHE_SIZE; 3602 } 3603 3604 flush_epd_write_bio(&epd); 3605 return ret; 3606 } 3607 3608 int extent_writepages(struct extent_io_tree *tree, 3609 struct address_space *mapping, 3610 get_extent_t *get_extent, 3611 struct writeback_control *wbc) 3612 { 3613 int ret = 0; 3614 struct extent_page_data epd = { 3615 .bio = NULL, 3616 .tree = tree, 3617 .get_extent = get_extent, 3618 .extent_locked = 0, 3619 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3620 .bio_flags = 0, 3621 }; 3622 3623 ret = extent_write_cache_pages(tree, mapping, wbc, 3624 __extent_writepage, &epd, 3625 flush_write_bio); 3626 flush_epd_write_bio(&epd); 3627 return ret; 3628 } 3629 3630 int extent_readpages(struct extent_io_tree *tree, 3631 struct address_space *mapping, 3632 struct list_head *pages, unsigned nr_pages, 3633 get_extent_t get_extent) 3634 { 3635 struct bio *bio = NULL; 3636 unsigned page_idx; 3637 unsigned long bio_flags = 0; 3638 struct page *pagepool[16]; 3639 struct page 
*page; 3640 int i = 0; 3641 int nr = 0; 3642 3643 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 3644 page = list_entry(pages->prev, struct page, lru); 3645 3646 prefetchw(&page->flags); 3647 list_del(&page->lru); 3648 if (add_to_page_cache_lru(page, mapping, 3649 page->index, GFP_NOFS)) { 3650 page_cache_release(page); 3651 continue; 3652 } 3653 3654 pagepool[nr++] = page; 3655 if (nr < ARRAY_SIZE(pagepool)) 3656 continue; 3657 for (i = 0; i < nr; i++) { 3658 __extent_read_full_page(tree, pagepool[i], get_extent, 3659 &bio, 0, &bio_flags); 3660 page_cache_release(pagepool[i]); 3661 } 3662 nr = 0; 3663 } 3664 for (i = 0; i < nr; i++) { 3665 __extent_read_full_page(tree, pagepool[i], get_extent, 3666 &bio, 0, &bio_flags); 3667 page_cache_release(pagepool[i]); 3668 } 3669 3670 BUG_ON(!list_empty(pages)); 3671 if (bio) 3672 return submit_one_bio(READ, bio, 0, bio_flags); 3673 return 0; 3674 } 3675 3676 /* 3677 * basic invalidatepage code, this waits on any locked or writeback 3678 * ranges corresponding to the page, and then deletes any extent state 3679 * records from the tree 3680 */ 3681 int extent_invalidatepage(struct extent_io_tree *tree, 3682 struct page *page, unsigned long offset) 3683 { 3684 struct extent_state *cached_state = NULL; 3685 u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); 3686 u64 end = start + PAGE_CACHE_SIZE - 1; 3687 size_t blocksize = page->mapping->host->i_sb->s_blocksize; 3688 3689 start += (offset + blocksize - 1) & ~(blocksize - 1); 3690 if (start > end) 3691 return 0; 3692 3693 lock_extent_bits(tree, start, end, 0, &cached_state); 3694 wait_on_page_writeback(page); 3695 clear_extent_bit(tree, start, end, 3696 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 3697 EXTENT_DO_ACCOUNTING, 3698 1, 1, &cached_state, GFP_NOFS); 3699 return 0; 3700 } 3701 3702 /* 3703 * a helper for releasepage, this tests for areas of the page that 3704 * are locked or under IO and drops the related state bits if it is safe 3705 * to drop the page. 3706 */ 3707 int try_release_extent_state(struct extent_map_tree *map, 3708 struct extent_io_tree *tree, struct page *page, 3709 gfp_t mask) 3710 { 3711 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 3712 u64 end = start + PAGE_CACHE_SIZE - 1; 3713 int ret = 1; 3714 3715 if (test_range_bit(tree, start, end, 3716 EXTENT_IOBITS, 0, NULL)) 3717 ret = 0; 3718 else { 3719 if ((mask & GFP_NOFS) == GFP_NOFS) 3720 mask = GFP_NOFS; 3721 /* 3722 * at this point we can safely clear everything except the 3723 * locked bit and the nodatasum bit 3724 */ 3725 ret = clear_extent_bit(tree, start, end, 3726 ~(EXTENT_LOCKED | EXTENT_NODATASUM), 3727 0, 0, NULL, mask); 3728 3729 /* if clear_extent_bit failed for enomem reasons, 3730 * we can't allow the release to continue. 3731 */ 3732 if (ret < 0) 3733 ret = 0; 3734 else 3735 ret = 1; 3736 } 3737 return ret; 3738 } 3739 3740 /* 3741 * a helper for releasepage. 
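it prunes extent_map records covering the page first and then falls through to try_release_extent_state() for the io tree.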
As long as there are no locked extents 3742 * in the range corresponding to the page, both state records and extent 3743 * map records are removed 3744 */ 3745 int try_release_extent_mapping(struct extent_map_tree *map, 3746 struct extent_io_tree *tree, struct page *page, 3747 gfp_t mask) 3748 { 3749 struct extent_map *em; 3750 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 3751 u64 end = start + PAGE_CACHE_SIZE - 1; 3752 3753 if ((mask & __GFP_WAIT) && 3754 page->mapping->host->i_size > 16 * 1024 * 1024) { 3755 u64 len; 3756 while (start <= end) { 3757 len = end - start + 1; 3758 write_lock(&map->lock); 3759 em = lookup_extent_mapping(map, start, len); 3760 if (!em) { 3761 write_unlock(&map->lock); 3762 break; 3763 } 3764 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || 3765 em->start != start) { 3766 write_unlock(&map->lock); 3767 free_extent_map(em); 3768 break; 3769 } 3770 if (!test_range_bit(tree, em->start, 3771 extent_map_end(em) - 1, 3772 EXTENT_LOCKED | EXTENT_WRITEBACK, 3773 0, NULL)) { 3774 remove_extent_mapping(map, em); 3775 /* once for the rb tree */ 3776 free_extent_map(em); 3777 } 3778 start = extent_map_end(em); 3779 write_unlock(&map->lock); 3780 3781 /* once for us */ 3782 free_extent_map(em); 3783 } 3784 } 3785 return try_release_extent_state(map, tree, page, mask); 3786 } 3787 3788 /* 3789 * helper function for fiemap, which doesn't want to see any holes. 3790 * This maps until we find something past 'last' 3791 */ 3792 static struct extent_map *get_extent_skip_holes(struct inode *inode, 3793 u64 offset, 3794 u64 last, 3795 get_extent_t *get_extent) 3796 { 3797 u64 sectorsize = BTRFS_I(inode)->root->sectorsize; 3798 struct extent_map *em; 3799 u64 len; 3800 3801 if (offset >= last) 3802 return NULL; 3803 3804 while(1) { 3805 len = last - offset; 3806 if (len == 0) 3807 break; 3808 len = (len + sectorsize - 1) & ~(sectorsize - 1); 3809 em = get_extent(inode, NULL, 0, offset, len, 0); 3810 if (IS_ERR_OR_NULL(em)) 3811 return em; 3812 3813 /* if this isn't a hole return it */ 3814 if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) && 3815 em->block_start != EXTENT_MAP_HOLE) { 3816 return em; 3817 } 3818 3819 /* this is a hole, advance to the next extent */ 3820 offset = extent_map_end(em); 3821 free_extent_map(em); 3822 if (offset >= last) 3823 break; 3824 } 3825 return NULL; 3826 } 3827 3828 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 3829 __u64 start, __u64 len, get_extent_t *get_extent) 3830 { 3831 int ret = 0; 3832 u64 off = start; 3833 u64 max = start + len; 3834 u32 flags = 0; 3835 u32 found_type; 3836 u64 last; 3837 u64 last_for_get_extent = 0; 3838 u64 disko = 0; 3839 u64 isize = i_size_read(inode); 3840 struct btrfs_key found_key; 3841 struct extent_map *em = NULL; 3842 struct extent_state *cached_state = NULL; 3843 struct btrfs_path *path; 3844 struct btrfs_file_extent_item *item; 3845 int end = 0; 3846 u64 em_start = 0; 3847 u64 em_len = 0; 3848 u64 em_end = 0; 3849 unsigned long emflags; 3850 3851 if (len == 0) 3852 return -EINVAL; 3853 3854 path = btrfs_alloc_path(); 3855 if (!path) 3856 return -ENOMEM; 3857 path->leave_spinning = 1; 3858 3859 start = ALIGN(start, BTRFS_I(inode)->root->sectorsize); 3860 len = ALIGN(len, BTRFS_I(inode)->root->sectorsize); 3861 3862 /* 3863 * lookup the last file extent. 
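(btrfs_lookup_file_extent() is passed offset -1 so the search should land just past the last EXTENT_DATA item; the path->slots[0]-- below then steps back onto it)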
We're not using i_size here 3864 * because there might be preallocation past i_size 3865 */ 3866 ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, 3867 path, btrfs_ino(inode), -1, 0); 3868 if (ret < 0) { 3869 btrfs_free_path(path); 3870 return ret; 3871 } 3872 WARN_ON(!ret); 3873 path->slots[0]--; 3874 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3875 struct btrfs_file_extent_item); 3876 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 3877 found_type = btrfs_key_type(&found_key); 3878 3879 /* No extents, but there might be delalloc bits */ 3880 if (found_key.objectid != btrfs_ino(inode) || 3881 found_type != BTRFS_EXTENT_DATA_KEY) { 3882 /* have to trust i_size as the end */ 3883 last = (u64)-1; 3884 last_for_get_extent = isize; 3885 } else { 3886 /* 3887 * remember the start of the last extent. There are a 3888 * bunch of different factors that go into the length of the 3889 * extent, so its much less complex to remember where it started 3890 */ 3891 last = found_key.offset; 3892 last_for_get_extent = last + 1; 3893 } 3894 btrfs_free_path(path); 3895 3896 /* 3897 * we might have some extents allocated but more delalloc past those 3898 * extents. so, we trust isize unless the start of the last extent is 3899 * beyond isize 3900 */ 3901 if (last < isize) { 3902 last = (u64)-1; 3903 last_for_get_extent = isize; 3904 } 3905 3906 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, 3907 &cached_state); 3908 3909 em = get_extent_skip_holes(inode, start, last_for_get_extent, 3910 get_extent); 3911 if (!em) 3912 goto out; 3913 if (IS_ERR(em)) { 3914 ret = PTR_ERR(em); 3915 goto out; 3916 } 3917 3918 while (!end) { 3919 u64 offset_in_extent; 3920 3921 /* break if the extent we found is outside the range */ 3922 if (em->start >= max || extent_map_end(em) < off) 3923 break; 3924 3925 /* 3926 * get_extent may return an extent that starts before our 3927 * requested range. We have to make sure the ranges 3928 * we return to fiemap always move forward and don't 3929 * overlap, so adjust the offsets here 3930 */ 3931 em_start = max(em->start, off); 3932 3933 /* 3934 * record the offset from the start of the extent 3935 * for adjusting the disk offset below 3936 */ 3937 offset_in_extent = em_start - em->start; 3938 em_end = extent_map_end(em); 3939 em_len = em_end - em_start; 3940 emflags = em->flags; 3941 disko = 0; 3942 flags = 0; 3943 3944 /* 3945 * bump off for our next call to get_extent 3946 */ 3947 off = extent_map_end(em); 3948 if (off >= max) 3949 end = 1; 3950 3951 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 3952 end = 1; 3953 flags |= FIEMAP_EXTENT_LAST; 3954 } else if (em->block_start == EXTENT_MAP_INLINE) { 3955 flags |= (FIEMAP_EXTENT_DATA_INLINE | 3956 FIEMAP_EXTENT_NOT_ALIGNED); 3957 } else if (em->block_start == EXTENT_MAP_DELALLOC) { 3958 flags |= (FIEMAP_EXTENT_DELALLOC | 3959 FIEMAP_EXTENT_UNKNOWN); 3960 } else { 3961 disko = em->block_start + offset_in_extent; 3962 } 3963 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 3964 flags |= FIEMAP_EXTENT_ENCODED; 3965 3966 free_extent_map(em); 3967 em = NULL; 3968 if ((em_start >= last) || em_len == (u64)-1 || 3969 (last == (u64)-1 && isize <= em_end)) { 3970 flags |= FIEMAP_EXTENT_LAST; 3971 end = 1; 3972 } 3973 3974 /* now scan forward to see if this is really the last extent. 
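if the lookup below turns up nothing but holes, FIEMAP_EXTENT_LAST is added to the entry we are about to emit.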
*/ 3975 em = get_extent_skip_holes(inode, off, last_for_get_extent, 3976 get_extent); 3977 if (IS_ERR(em)) { 3978 ret = PTR_ERR(em); 3979 goto out; 3980 } 3981 if (!em) { 3982 flags |= FIEMAP_EXTENT_LAST; 3983 end = 1; 3984 } 3985 ret = fiemap_fill_next_extent(fieinfo, em_start, disko, 3986 em_len, flags); 3987 if (ret) 3988 goto out_free; 3989 } 3990 out_free: 3991 free_extent_map(em); 3992 out: 3993 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len, 3994 &cached_state, GFP_NOFS); 3995 return ret; 3996 } 3997 3998 static void __free_extent_buffer(struct extent_buffer *eb) 3999 { 4000 #if LEAK_DEBUG 4001 unsigned long flags; 4002 spin_lock_irqsave(&leak_lock, flags); 4003 list_del(&eb->leak_list); 4004 spin_unlock_irqrestore(&leak_lock, flags); 4005 #endif 4006 if (eb->pages && eb->pages != eb->inline_pages) 4007 kfree(eb->pages); 4008 kmem_cache_free(extent_buffer_cache, eb); 4009 } 4010 4011 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, 4012 u64 start, 4013 unsigned long len, 4014 gfp_t mask) 4015 { 4016 struct extent_buffer *eb = NULL; 4017 #if LEAK_DEBUG 4018 unsigned long flags; 4019 #endif 4020 4021 eb = kmem_cache_zalloc(extent_buffer_cache, mask); 4022 if (eb == NULL) 4023 return NULL; 4024 eb->start = start; 4025 eb->len = len; 4026 eb->tree = tree; 4027 eb->bflags = 0; 4028 rwlock_init(&eb->lock); 4029 atomic_set(&eb->write_locks, 0); 4030 atomic_set(&eb->read_locks, 0); 4031 atomic_set(&eb->blocking_readers, 0); 4032 atomic_set(&eb->blocking_writers, 0); 4033 atomic_set(&eb->spinning_readers, 0); 4034 atomic_set(&eb->spinning_writers, 0); 4035 eb->lock_nested = 0; 4036 init_waitqueue_head(&eb->write_lock_wq); 4037 init_waitqueue_head(&eb->read_lock_wq); 4038 4039 #if LEAK_DEBUG 4040 spin_lock_irqsave(&leak_lock, flags); 4041 list_add(&eb->leak_list, &buffers); 4042 spin_unlock_irqrestore(&leak_lock, flags); 4043 #endif 4044 spin_lock_init(&eb->refs_lock); 4045 atomic_set(&eb->refs, 1); 4046 atomic_set(&eb->io_pages, 0); 4047 4048 if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) { 4049 struct page **pages; 4050 int num_pages = (len + PAGE_CACHE_SIZE - 1) >> 4051 PAGE_CACHE_SHIFT; 4052 pages = kzalloc(num_pages, mask); 4053 if (!pages) { 4054 __free_extent_buffer(eb); 4055 return NULL; 4056 } 4057 eb->pages = pages; 4058 } else { 4059 eb->pages = eb->inline_pages; 4060 } 4061 4062 return eb; 4063 } 4064 4065 struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) 4066 { 4067 unsigned long i; 4068 struct page *p; 4069 struct extent_buffer *new; 4070 unsigned long num_pages = num_extent_pages(src->start, src->len); 4071 4072 new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_ATOMIC); 4073 if (new == NULL) 4074 return NULL; 4075 4076 for (i = 0; i < num_pages; i++) { 4077 p = alloc_page(GFP_ATOMIC); 4078 BUG_ON(!p); 4079 attach_extent_buffer_page(new, p); 4080 WARN_ON(PageDirty(p)); 4081 SetPageUptodate(p); 4082 new->pages[i] = p; 4083 } 4084 4085 copy_extent_buffer(new, src, 0, 0, src->len); 4086 set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags); 4087 set_bit(EXTENT_BUFFER_DUMMY, &new->bflags); 4088 4089 return new; 4090 } 4091 4092 struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len) 4093 { 4094 struct extent_buffer *eb; 4095 unsigned long num_pages = num_extent_pages(0, len); 4096 unsigned long i; 4097 4098 eb = __alloc_extent_buffer(NULL, start, len, GFP_ATOMIC); 4099 if (!eb) 4100 return NULL; 4101 4102 for (i = 0; i < num_pages; i++) { 4103 eb->pages[i] = alloc_page(GFP_ATOMIC); 4104 if 
struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
{
        struct extent_buffer *eb;
        unsigned long num_pages = num_extent_pages(0, len);
        unsigned long i;

        eb = __alloc_extent_buffer(NULL, start, len, GFP_ATOMIC);
        if (!eb)
                return NULL;

        for (i = 0; i < num_pages; i++) {
                eb->pages[i] = alloc_page(GFP_ATOMIC);
                if (!eb->pages[i])
                        goto err;
        }
        set_extent_buffer_uptodate(eb);
        btrfs_set_header_nritems(eb, 0);
        set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);

        return eb;
err:
        for (; i > 0; i--)
                __free_page(eb->pages[i - 1]);
        __free_extent_buffer(eb);
        return NULL;
}

static int extent_buffer_under_io(struct extent_buffer *eb)
{
        return (atomic_read(&eb->io_pages) ||
                test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
                test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
}

/*
 * Helper for releasing extent buffer page.
 */
static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
                                             unsigned long start_idx)
{
        unsigned long index;
        unsigned long num_pages;
        struct page *page;
        int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);

        BUG_ON(extent_buffer_under_io(eb));

        num_pages = num_extent_pages(eb->start, eb->len);
        index = start_idx + num_pages;
        if (start_idx >= index)
                return;

        do {
                index--;
                page = extent_buffer_page(eb, index);
                if (page && mapped) {
                        spin_lock(&page->mapping->private_lock);
                        /*
                         * We do this since we'll remove the pages after we've
                         * removed the eb from the radix tree, so we could race
                         * and have this page now attached to the new eb.  So
                         * only clear page_private if it's still connected to
                         * this eb.
                         */
                        if (PagePrivate(page) &&
                            page->private == (unsigned long)eb) {
                                BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
                                BUG_ON(PageDirty(page));
                                BUG_ON(PageWriteback(page));
                                /*
                                 * We need to make sure we haven't been
                                 * attached to a new eb.
                                 */
                                ClearPagePrivate(page);
                                set_page_private(page, 0);
                                /* One for the page private */
                                page_cache_release(page);
                        }
                        spin_unlock(&page->mapping->private_lock);

                }
                if (page) {
                        /* One for when we alloced the page */
                        page_cache_release(page);
                }
        } while (index != start_idx);
}

/*
 * Helper for releasing the extent buffer.
 */
static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
{
        btrfs_release_extent_buffer_page(eb, 0);
        __free_extent_buffer(eb);
}

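/*
 * Reference counting overview, as implemented by the helpers below:
 *
 *   alloc_extent_buffer()          - one ref for the caller, plus the tree
 *                                    ref taken by check_buffer_tree_ref()
 *   free_extent_buffer()           - drops the caller's ref
 *   free_extent_buffer_stale() or
 *   try_release_extent_buffer()    - drops the tree ref once the eb is no
 *                                    longer dirty or under IO
 *
 * The eb itself is only freed when the count hits zero in
 * release_extent_buffer().
 */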
static void check_buffer_tree_ref(struct extent_buffer *eb)
{
        /* the ref bit is tricky.  We have to make sure it is set
         * if we have the buffer dirty.  Otherwise the
         * code to free a buffer can end up dropping a dirty
         * page
         *
         * Once the ref bit is set, it won't go away while the
         * buffer is dirty or in writeback, and it also won't
         * go away while we have the reference count on the
         * eb bumped.
         *
         * We can't just set the ref bit without bumping the
         * ref on the eb because free_extent_buffer might
         * see the ref bit and try to clear it.  If this happens
         * free_extent_buffer might end up dropping our original
         * ref by mistake and freeing the page before we are able
         * to add one more ref.
         *
         * So, while holding eb->refs_lock, set the tree ref bit and
         * only bump the ref count if we were the ones to set the bit;
         * if someone beat us to it, no extra ref is taken.
         */
        spin_lock(&eb->refs_lock);
        if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
                atomic_inc(&eb->refs);
        spin_unlock(&eb->refs_lock);
}

static void mark_extent_buffer_accessed(struct extent_buffer *eb)
{
        unsigned long num_pages, i;

        check_buffer_tree_ref(eb);

        num_pages = num_extent_pages(eb->start, eb->len);
        for (i = 0; i < num_pages; i++) {
                struct page *p = extent_buffer_page(eb, i);
                mark_page_accessed(p);
        }
}

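/*
 * Illustrative caller pattern (a sketch of typical use elsewhere in btrfs,
 * not code from this file): metadata readers usually do something like
 *
 *	eb = alloc_extent_buffer(tree, start, len);
 *	if (!eb)
 *		return -ENOMEM;
 *	ret = read_extent_buffer_pages(tree, eb, 0, WAIT_COMPLETE,
 *				       btree_get_extent, 0);
 *	...
 *	free_extent_buffer(eb);
 *
 * alloc_extent_buffer() either finds an existing buffer in the radix tree
 * or creates a new one and inserts it.
 */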
struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
                                          u64 start, unsigned long len)
{
        unsigned long num_pages = num_extent_pages(start, len);
        unsigned long i;
        unsigned long index = start >> PAGE_CACHE_SHIFT;
        struct extent_buffer *eb;
        struct extent_buffer *exists = NULL;
        struct page *p;
        struct address_space *mapping = tree->mapping;
        int uptodate = 1;
        int ret;

        rcu_read_lock();
        eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
        if (eb && atomic_inc_not_zero(&eb->refs)) {
                rcu_read_unlock();
                mark_extent_buffer_accessed(eb);
                return eb;
        }
        rcu_read_unlock();

        eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS);
        if (!eb)
                return NULL;

        for (i = 0; i < num_pages; i++, index++) {
                p = find_or_create_page(mapping, index, GFP_NOFS);
                if (!p)
                        goto free_eb;

                spin_lock(&mapping->private_lock);
                if (PagePrivate(p)) {
                        /*
                         * We could have already allocated an eb for this page
                         * and attached one so let's see if we can get a ref on
                         * the existing eb, and if we can we know it's good and
                         * we can just return that one, else we know we can
                         * just overwrite page->private.
                         */
                        exists = (struct extent_buffer *)p->private;
                        if (atomic_inc_not_zero(&exists->refs)) {
                                spin_unlock(&mapping->private_lock);
                                unlock_page(p);
                                page_cache_release(p);
                                mark_extent_buffer_accessed(exists);
                                goto free_eb;
                        }

                        /*
                         * Do this so attach doesn't complain and we need to
                         * drop the ref the old guy had.
                         */
                        ClearPagePrivate(p);
                        WARN_ON(PageDirty(p));
                        page_cache_release(p);
                }
                attach_extent_buffer_page(eb, p);
                spin_unlock(&mapping->private_lock);
                WARN_ON(PageDirty(p));
                mark_page_accessed(p);
                eb->pages[i] = p;
                if (!PageUptodate(p))
                        uptodate = 0;

                /*
                 * see below about how we avoid a nasty race with release page
                 * and why we unlock later
                 */
        }
        if (uptodate)
                set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
again:
        ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
        if (ret)
                goto free_eb;

        spin_lock(&tree->buffer_lock);
        ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
        if (ret == -EEXIST) {
                exists = radix_tree_lookup(&tree->buffer,
                                           start >> PAGE_CACHE_SHIFT);
                if (!atomic_inc_not_zero(&exists->refs)) {
                        spin_unlock(&tree->buffer_lock);
                        radix_tree_preload_end();
                        exists = NULL;
                        goto again;
                }
                spin_unlock(&tree->buffer_lock);
                radix_tree_preload_end();
                mark_extent_buffer_accessed(exists);
                goto free_eb;
        }
        /* add one reference for the tree */
        check_buffer_tree_ref(eb);
        spin_unlock(&tree->buffer_lock);
        radix_tree_preload_end();

        /*
         * there is a race where release page may have
         * tried to find this extent buffer in the radix
         * but failed.  It will tell the VM it is safe to
         * reclaim the page, and it will clear the page private bit.
         * We must make sure to set the page private bit properly
         * after the extent buffer is in the radix tree so
         * it doesn't get lost
         */
        SetPageChecked(eb->pages[0]);
        for (i = 1; i < num_pages; i++) {
                p = extent_buffer_page(eb, i);
                ClearPageChecked(p);
                unlock_page(p);
        }
        unlock_page(eb->pages[0]);
        return eb;

free_eb:
        for (i = 0; i < num_pages; i++) {
                if (eb->pages[i])
                        unlock_page(eb->pages[i]);
        }

        WARN_ON(!atomic_dec_and_test(&eb->refs));
        btrfs_release_extent_buffer(eb);
        return exists;
}

struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
                                         u64 start, unsigned long len)
{
        struct extent_buffer *eb;

        rcu_read_lock();
        eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
        if (eb && atomic_inc_not_zero(&eb->refs)) {
                rcu_read_unlock();
                mark_extent_buffer_accessed(eb);
                return eb;
        }
        rcu_read_unlock();

        return NULL;
}

static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
{
        struct extent_buffer *eb =
                        container_of(head, struct extent_buffer, rcu_head);

        __free_extent_buffer(eb);
}

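/*
 * find_extent_buffer() and alloc_extent_buffer() look the eb up under
 * rcu_read_lock() and take their reference with atomic_inc_not_zero(), so
 * the struct itself must not be freed until an RCU grace period has passed.
 * That is why release_extent_buffer() removes the eb from the radix tree
 * first and then frees it via call_rcu() using the callback above.
 */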
/* Expects to have eb->refs_lock already held */
static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
{
        WARN_ON(atomic_read(&eb->refs) == 0);
        if (atomic_dec_and_test(&eb->refs)) {
                if (test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) {
                        spin_unlock(&eb->refs_lock);
                } else {
                        struct extent_io_tree *tree = eb->tree;

                        spin_unlock(&eb->refs_lock);

                        spin_lock(&tree->buffer_lock);
                        radix_tree_delete(&tree->buffer,
                                          eb->start >> PAGE_CACHE_SHIFT);
                        spin_unlock(&tree->buffer_lock);
                }

                /* Should be safe to release our pages at this point */
                btrfs_release_extent_buffer_page(eb, 0);
                call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
                return 1;
        }
        spin_unlock(&eb->refs_lock);

        return 0;
}

void free_extent_buffer(struct extent_buffer *eb)
{
        if (!eb)
                return;

        spin_lock(&eb->refs_lock);
        if (atomic_read(&eb->refs) == 2 &&
            test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
                atomic_dec(&eb->refs);

        if (atomic_read(&eb->refs) == 2 &&
            test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
            !extent_buffer_under_io(eb) &&
            test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
                atomic_dec(&eb->refs);

        /*
         * I know this is terrible, but it's temporary until we stop tracking
         * the uptodate bits and such for the extent buffers.
         */
        release_extent_buffer(eb, GFP_ATOMIC);
}

void free_extent_buffer_stale(struct extent_buffer *eb)
{
        if (!eb)
                return;

        spin_lock(&eb->refs_lock);
        set_bit(EXTENT_BUFFER_STALE, &eb->bflags);

        if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
            test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
                atomic_dec(&eb->refs);
        release_extent_buffer(eb, GFP_NOFS);
}

void clear_extent_buffer_dirty(struct extent_buffer *eb)
{
        unsigned long i;
        unsigned long num_pages;
        struct page *page;

        num_pages = num_extent_pages(eb->start, eb->len);

        for (i = 0; i < num_pages; i++) {
                page = extent_buffer_page(eb, i);
                if (!PageDirty(page))
                        continue;

                lock_page(page);
                WARN_ON(!PagePrivate(page));

                clear_page_dirty_for_io(page);
                spin_lock_irq(&page->mapping->tree_lock);
                if (!PageDirty(page)) {
                        radix_tree_tag_clear(&page->mapping->page_tree,
                                             page_index(page),
                                             PAGECACHE_TAG_DIRTY);
                }
                spin_unlock_irq(&page->mapping->tree_lock);
                ClearPageError(page);
                unlock_page(page);
        }
        WARN_ON(atomic_read(&eb->refs) == 0);
}

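/*
 * Dirtying an eb goes through check_buffer_tree_ref() first, so a dirty
 * buffer always carries the tree ref and cannot be released behind our
 * back; the return value tells the caller whether it was already dirty.
 */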
int set_extent_buffer_dirty(struct extent_buffer *eb)
{
        unsigned long i;
        unsigned long num_pages;
        int was_dirty = 0;

        check_buffer_tree_ref(eb);

        was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);

        num_pages = num_extent_pages(eb->start, eb->len);
        WARN_ON(atomic_read(&eb->refs) == 0);
        WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));

        for (i = 0; i < num_pages; i++)
                set_page_dirty(extent_buffer_page(eb, i));
        return was_dirty;
}

static int range_straddles_pages(u64 start, u64 len)
{
        if (len < PAGE_CACHE_SIZE)
                return 1;
        if (start & (PAGE_CACHE_SIZE - 1))
                return 1;
        if ((start + len) & (PAGE_CACHE_SIZE - 1))
                return 1;
        return 0;
}

int clear_extent_buffer_uptodate(struct extent_buffer *eb)
{
        unsigned long i;
        struct page *page;
        unsigned long num_pages;

        clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
        num_pages = num_extent_pages(eb->start, eb->len);
        for (i = 0; i < num_pages; i++) {
                page = extent_buffer_page(eb, i);
                if (page)
                        ClearPageUptodate(page);
        }
        return 0;
}

int set_extent_buffer_uptodate(struct extent_buffer *eb)
{
        unsigned long i;
        struct page *page;
        unsigned long num_pages;

        set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
        num_pages = num_extent_pages(eb->start, eb->len);
        for (i = 0; i < num_pages; i++) {
                page = extent_buffer_page(eb, i);
                SetPageUptodate(page);
        }
        return 0;
}

int extent_range_uptodate(struct extent_io_tree *tree,
                          u64 start, u64 end)
{
        struct page *page;
        int ret;
        int pg_uptodate = 1;
        int uptodate;
        unsigned long index;

        if (range_straddles_pages(start, end - start + 1)) {
                ret = test_range_bit(tree, start, end,
                                     EXTENT_UPTODATE, 1, NULL);
                if (ret)
                        return 1;
        }
        while (start <= end) {
                index = start >> PAGE_CACHE_SHIFT;
                page = find_get_page(tree->mapping, index);
                if (!page)
                        return 1;
                uptodate = PageUptodate(page);
                page_cache_release(page);
                if (!uptodate) {
                        pg_uptodate = 0;
                        break;
                }
                start += PAGE_CACHE_SIZE;
        }
        return pg_uptodate;
}

int extent_buffer_uptodate(struct extent_buffer *eb)
{
        return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
}

int read_extent_buffer_pages(struct extent_io_tree *tree,
                             struct extent_buffer *eb, u64 start, int wait,
                             get_extent_t *get_extent, int mirror_num)
{
        unsigned long i;
        unsigned long start_i;
        struct page *page;
        int err;
        int ret = 0;
        int locked_pages = 0;
        int all_uptodate = 1;
        unsigned long num_pages;
        unsigned long num_reads = 0;
        struct bio *bio = NULL;
        unsigned long bio_flags = 0;

        if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
                return 0;

        if (start) {
                WARN_ON(start < eb->start);
                start_i = (start >> PAGE_CACHE_SHIFT) -
                        (eb->start >> PAGE_CACHE_SHIFT);
        } else {
                start_i = 0;
        }

        num_pages = num_extent_pages(eb->start, eb->len);
        for (i = start_i; i < num_pages; i++) {
                page = extent_buffer_page(eb, i);
                if (wait == WAIT_NONE) {
                        if (!trylock_page(page))
                                goto unlock_exit;
                } else {
                        lock_page(page);
                }
                locked_pages++;
                if (!PageUptodate(page)) {
                        num_reads++;
                        all_uptodate = 0;
                }
        }
        if (all_uptodate) {
                if (start_i == 0)
                        set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
                goto unlock_exit;
        }

        clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
        eb->read_mirror = 0;
        atomic_set(&eb->io_pages, num_reads);
        for (i = start_i; i < num_pages; i++) {
                page = extent_buffer_page(eb, i);
                if (!PageUptodate(page)) {
                        ClearPageError(page);
                        err = __extent_read_full_page(tree, page,
                                                      get_extent, &bio,
                                                      mirror_num, &bio_flags);
                        if (err)
                                ret = err;
                } else {
                        unlock_page(page);
                }
        }

        if (bio) {
                err = submit_one_bio(READ, bio, mirror_num, bio_flags);
                if (err)
                        return err;
        }

        if (ret || wait != WAIT_COMPLETE)
                return ret;

        for (i = start_i; i < num_pages; i++) {
                page = extent_buffer_page(eb, i);
                wait_on_page_locked(page);
                if (!PageUptodate(page))
                        ret = -EIO;
        }

        return ret;

unlock_exit:
        i = start_i;
        while (locked_pages > 0) {
                page = extent_buffer_page(eb, i);
                i++;
                unlock_page(page);
                locked_pages--;
        }
        return ret;
}

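/*
 * The accessors below all map a byte offset inside the eb onto a
 * (page index, offset in page) pair the same way:
 *
 *	start_offset = eb->start & (PAGE_CACHE_SIZE - 1);
 *	i      = (start_offset + start) >> PAGE_CACHE_SHIFT;
 *	offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
 *
 * Worked example (assuming 4K pages, for illustration only): for an eb at
 * disk offset 6144, start_offset is 2048, so byte 3000 of the eb lives in
 * page 1 of eb->pages at offset 952 (2048 + 3000 = 5048 = 1 * 4096 + 952).
 */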
void read_extent_buffer(struct extent_buffer *eb, void *dstv,
                        unsigned long start,
                        unsigned long len)
{
        size_t cur;
        size_t offset;
        struct page *page;
        char *kaddr;
        char *dst = (char *)dstv;
        size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
        unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;

        WARN_ON(start > eb->len);
        WARN_ON(start + len > eb->start + eb->len);

        offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);

        while (len > 0) {
                page = extent_buffer_page(eb, i);

                cur = min(len, (PAGE_CACHE_SIZE - offset));
                kaddr = page_address(page);
                memcpy(dst, kaddr + offset, cur);

                dst += cur;
                len -= cur;
                offset = 0;
                i++;
        }
}

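/*
 * map_private_extent_buffer() only succeeds when [start, start + min_len)
 * fits inside a single page; it returns -EINVAL when the range straddles a
 * page boundary.  A caller would typically try it first for small items and
 * fall back to read_extent_buffer()/write_extent_buffer() on -EINVAL
 * (a sketch of the intended use, not a rule enforced here).
 */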
int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
                              unsigned long min_len, char **map,
                              unsigned long *map_start,
                              unsigned long *map_len)
{
        size_t offset = start & (PAGE_CACHE_SIZE - 1);
        char *kaddr;
        struct page *p;
        size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
        unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
        unsigned long end_i = (start_offset + start + min_len - 1) >>
                PAGE_CACHE_SHIFT;

        if (i != end_i)
                return -EINVAL;

        if (i == 0) {
                offset = start_offset;
                *map_start = 0;
        } else {
                offset = 0;
                *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
        }

        if (start + min_len > eb->len) {
                printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
                       "wanted %lu %lu\n", (unsigned long long)eb->start,
                       eb->len, start, min_len);
                WARN_ON(1);
                return -EINVAL;
        }

        p = extent_buffer_page(eb, i);
        kaddr = page_address(p);
        *map = kaddr + offset;
        *map_len = PAGE_CACHE_SIZE - offset;
        return 0;
}

int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
                         unsigned long start,
                         unsigned long len)
{
        size_t cur;
        size_t offset;
        struct page *page;
        char *kaddr;
        char *ptr = (char *)ptrv;
        size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
        unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
        int ret = 0;

        WARN_ON(start > eb->len);
        WARN_ON(start + len > eb->start + eb->len);

        offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);

        while (len > 0) {
                page = extent_buffer_page(eb, i);

                cur = min(len, (PAGE_CACHE_SIZE - offset));

                kaddr = page_address(page);
                ret = memcmp(ptr, kaddr + offset, cur);
                if (ret)
                        break;

                ptr += cur;
                len -= cur;
                offset = 0;
                i++;
        }
        return ret;
}

void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
                         unsigned long start, unsigned long len)
{
        size_t cur;
        size_t offset;
        struct page *page;
        char *kaddr;
        char *src = (char *)srcv;
        size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
        unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;

        WARN_ON(start > eb->len);
        WARN_ON(start + len > eb->start + eb->len);

        offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);

        while (len > 0) {
                page = extent_buffer_page(eb, i);
                WARN_ON(!PageUptodate(page));

                cur = min(len, PAGE_CACHE_SIZE - offset);
                kaddr = page_address(page);
                memcpy(kaddr + offset, src, cur);

                src += cur;
                len -= cur;
                offset = 0;
                i++;
        }
}

void memset_extent_buffer(struct extent_buffer *eb, char c,
                          unsigned long start, unsigned long len)
{
        size_t cur;
        size_t offset;
        struct page *page;
        char *kaddr;
        size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
        unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;

        WARN_ON(start > eb->len);
        WARN_ON(start + len > eb->start + eb->len);

        offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);

        while (len > 0) {
                page = extent_buffer_page(eb, i);
                WARN_ON(!PageUptodate(page));

                cur = min(len, PAGE_CACHE_SIZE - offset);
                kaddr = page_address(page);
                memset(kaddr + offset, c, cur);

                len -= cur;
                offset = 0;
                i++;
        }
}

void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
                        unsigned long dst_offset, unsigned long src_offset,
                        unsigned long len)
{
        u64 dst_len = dst->len;
        size_t cur;
        size_t offset;
        struct page *page;
        char *kaddr;
        size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
        unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;

        WARN_ON(src->len != dst_len);

        offset = (start_offset + dst_offset) &
                ((unsigned long)PAGE_CACHE_SIZE - 1);

        while (len > 0) {
                page = extent_buffer_page(dst, i);
                WARN_ON(!PageUptodate(page));

                cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));

                kaddr = page_address(page);
                read_extent_buffer(src, kaddr + offset, src_offset, cur);

                src_offset += cur;
                len -= cur;
                offset = 0;
                i++;
        }
}

static void move_pages(struct page *dst_page, struct page *src_page,
                       unsigned long dst_off, unsigned long src_off,
                       unsigned long len)
{
        char *dst_kaddr = page_address(dst_page);
        if (dst_page == src_page) {
                memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
        } else {
                char *src_kaddr = page_address(src_page);
                char *p = dst_kaddr + dst_off + len;
                char *s = src_kaddr + src_off + len;

                while (len--)
                        *--p = *--s;
        }
}

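/*
 * Overlap handling for the extent buffer copy helpers:
 * areas_overlap() reports whether two ranges of 'len' bytes intersect,
 * e.g. areas_overlap(100, 150, 60) is true because the ranges are only
 * 50 bytes apart.  copy_pages() switches to memmove() for overlapping
 * ranges within one page, while move_pages() above copies backwards byte
 * by byte when an overlapping move spans two different pages.
 */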
static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
{
        unsigned long distance = (src > dst) ? src - dst : dst - src;
        return distance < len;
}

static void copy_pages(struct page *dst_page, struct page *src_page,
                       unsigned long dst_off, unsigned long src_off,
                       unsigned long len)
{
        char *dst_kaddr = page_address(dst_page);
        char *src_kaddr;
        int must_memmove = 0;

        if (dst_page != src_page) {
                src_kaddr = page_address(src_page);
        } else {
                src_kaddr = dst_kaddr;
                if (areas_overlap(src_off, dst_off, len))
                        must_memmove = 1;
        }

        if (must_memmove)
                memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
        else
                memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
}

void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
                          unsigned long src_offset, unsigned long len)
{
        size_t cur;
        size_t dst_off_in_page;
        size_t src_off_in_page;
        size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
        unsigned long dst_i;
        unsigned long src_i;

        if (src_offset + len > dst->len) {
                printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
                       "len %lu dst len %lu\n", src_offset, len, dst->len);
                BUG_ON(1);
        }
        if (dst_offset + len > dst->len) {
                printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
                       "len %lu dst len %lu\n", dst_offset, len, dst->len);
                BUG_ON(1);
        }

        while (len > 0) {
                dst_off_in_page = (start_offset + dst_offset) &
                        ((unsigned long)PAGE_CACHE_SIZE - 1);
                src_off_in_page = (start_offset + src_offset) &
                        ((unsigned long)PAGE_CACHE_SIZE - 1);

                dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
                src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;

                cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
                                               src_off_in_page));
                cur = min_t(unsigned long, cur,
                            (unsigned long)(PAGE_CACHE_SIZE -
                                            dst_off_in_page));

                copy_pages(extent_buffer_page(dst, dst_i),
                           extent_buffer_page(dst, src_i),
                           dst_off_in_page, src_off_in_page, cur);

                src_offset += cur;
                dst_offset += cur;
                len -= cur;
        }
}

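/*
 * memmove_extent_buffer() handles overlapping ranges within one eb: when
 * the destination is above the source it walks the range from the end
 * (dst_end/src_end) backwards so bytes are never overwritten before they
 * are copied; otherwise a forward memcpy_extent_buffer() is safe.
 */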
void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
                           unsigned long src_offset, unsigned long len)
{
        size_t cur;
        size_t dst_off_in_page;
        size_t src_off_in_page;
        unsigned long dst_end = dst_offset + len - 1;
        unsigned long src_end = src_offset + len - 1;
        size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
        unsigned long dst_i;
        unsigned long src_i;

        if (src_offset + len > dst->len) {
                printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
                       "len %lu len %lu\n", src_offset, len, dst->len);
                BUG_ON(1);
        }
        if (dst_offset + len > dst->len) {
                printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
                       "len %lu len %lu\n", dst_offset, len, dst->len);
                BUG_ON(1);
        }
        if (dst_offset < src_offset) {
                memcpy_extent_buffer(dst, dst_offset, src_offset, len);
                return;
        }
        while (len > 0) {
                dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
                src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;

                dst_off_in_page = (start_offset + dst_end) &
                        ((unsigned long)PAGE_CACHE_SIZE - 1);
                src_off_in_page = (start_offset + src_end) &
                        ((unsigned long)PAGE_CACHE_SIZE - 1);

                cur = min_t(unsigned long, len, src_off_in_page + 1);
                cur = min(cur, dst_off_in_page + 1);
                move_pages(extent_buffer_page(dst, dst_i),
                           extent_buffer_page(dst, src_i),
                           dst_off_in_page - cur + 1,
                           src_off_in_page - cur + 1, cur);

                dst_end -= cur;
                src_end -= cur;
                len -= cur;
        }
}

int try_release_extent_buffer(struct page *page, gfp_t mask)
{
        struct extent_buffer *eb;

        /*
         * We need to make sure nobody is attaching this page to an eb right
         * now.
         */
        spin_lock(&page->mapping->private_lock);
        if (!PagePrivate(page)) {
                spin_unlock(&page->mapping->private_lock);
                return 1;
        }

        eb = (struct extent_buffer *)page->private;
        BUG_ON(!eb);

        /*
         * This is a little awful but should be ok, we need to make sure that
         * the eb doesn't disappear out from under us while we're looking at
         * this page.
         */
        spin_lock(&eb->refs_lock);
        if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
                spin_unlock(&eb->refs_lock);
                spin_unlock(&page->mapping->private_lock);
                return 0;
        }
        spin_unlock(&page->mapping->private_lock);

        if ((mask & GFP_NOFS) == GFP_NOFS)
                mask = GFP_NOFS;

        /*
         * If tree ref isn't set then we know the ref on this eb is a real
         * ref, so just return, this page will likely be freed soon anyway.
         */
        if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
                spin_unlock(&eb->refs_lock);
                return 0;
        }

        return release_extent_buffer(eb, mask);
}