1 #include <linux/bitops.h> 2 #include <linux/slab.h> 3 #include <linux/bio.h> 4 #include <linux/mm.h> 5 #include <linux/pagemap.h> 6 #include <linux/page-flags.h> 7 #include <linux/module.h> 8 #include <linux/spinlock.h> 9 #include <linux/blkdev.h> 10 #include <linux/swap.h> 11 #include <linux/writeback.h> 12 #include <linux/pagevec.h> 13 #include <linux/prefetch.h> 14 #include <linux/cleancache.h> 15 #include "extent_io.h" 16 #include "extent_map.h" 17 #include "compat.h" 18 #include "ctree.h" 19 #include "btrfs_inode.h" 20 #include "volumes.h" 21 #include "check-integrity.h" 22 #include "locking.h" 23 #include "rcu-string.h" 24 25 static struct kmem_cache *extent_state_cache; 26 static struct kmem_cache *extent_buffer_cache; 27 28 static LIST_HEAD(buffers); 29 static LIST_HEAD(states); 30 31 #define LEAK_DEBUG 0 32 #if LEAK_DEBUG 33 static DEFINE_SPINLOCK(leak_lock); 34 #endif 35 36 #define BUFFER_LRU_MAX 64 37 38 struct tree_entry { 39 u64 start; 40 u64 end; 41 struct rb_node rb_node; 42 }; 43 44 struct extent_page_data { 45 struct bio *bio; 46 struct extent_io_tree *tree; 47 get_extent_t *get_extent; 48 unsigned long bio_flags; 49 50 /* tells writepage not to lock the state bits for this range 51 * it still does the unlocking 52 */ 53 unsigned int extent_locked:1; 54 55 /* tells the submit_bio code to use a WRITE_SYNC */ 56 unsigned int sync_io:1; 57 }; 58 59 static noinline void flush_write_bio(void *data); 60 static inline struct btrfs_fs_info * 61 tree_fs_info(struct extent_io_tree *tree) 62 { 63 return btrfs_sb(tree->mapping->host->i_sb); 64 } 65 66 int __init extent_io_init(void) 67 { 68 extent_state_cache = kmem_cache_create("btrfs_extent_state", 69 sizeof(struct extent_state), 0, 70 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 71 if (!extent_state_cache) 72 return -ENOMEM; 73 74 extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", 75 sizeof(struct extent_buffer), 0, 76 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 77 if (!extent_buffer_cache) 78 goto free_state_cache; 79 return 0; 80 81 free_state_cache: 82 kmem_cache_destroy(extent_state_cache); 83 return -ENOMEM; 84 } 85 86 void extent_io_exit(void) 87 { 88 struct extent_state *state; 89 struct extent_buffer *eb; 90 91 while (!list_empty(&states)) { 92 state = list_entry(states.next, struct extent_state, leak_list); 93 printk(KERN_ERR "btrfs state leak: start %llu end %llu " 94 "state %lu in tree %p refs %d\n", 95 (unsigned long long)state->start, 96 (unsigned long long)state->end, 97 state->state, state->tree, atomic_read(&state->refs)); 98 list_del(&state->leak_list); 99 kmem_cache_free(extent_state_cache, state); 100 101 } 102 103 while (!list_empty(&buffers)) { 104 eb = list_entry(buffers.next, struct extent_buffer, leak_list); 105 printk(KERN_ERR "btrfs buffer leak start %llu len %lu " 106 "refs %d\n", (unsigned long long)eb->start, 107 eb->len, atomic_read(&eb->refs)); 108 list_del(&eb->leak_list); 109 kmem_cache_free(extent_buffer_cache, eb); 110 } 111 112 /* 113 * Make sure all delayed rcu free are flushed before we 114 * destroy caches. 115 */ 116 rcu_barrier(); 117 if (extent_state_cache) 118 kmem_cache_destroy(extent_state_cache); 119 if (extent_buffer_cache) 120 kmem_cache_destroy(extent_buffer_cache); 121 } 122 123 void extent_io_tree_init(struct extent_io_tree *tree, 124 struct address_space *mapping) 125 { 126 tree->state = RB_ROOT; 127 INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC); 128 tree->ops = NULL; 129 tree->dirty_bytes = 0; 130 spin_lock_init(&tree->lock); 131 spin_lock_init(&tree->buffer_lock); 132 tree->mapping = mapping; 133 } 134 135 static struct extent_state *alloc_extent_state(gfp_t mask) 136 { 137 struct extent_state *state; 138 #if LEAK_DEBUG 139 unsigned long flags; 140 #endif 141 142 state = kmem_cache_alloc(extent_state_cache, mask); 143 if (!state) 144 return state; 145 state->state = 0; 146 state->private = 0; 147 state->tree = NULL; 148 #if LEAK_DEBUG 149 spin_lock_irqsave(&leak_lock, flags); 150 list_add(&state->leak_list, &states); 151 spin_unlock_irqrestore(&leak_lock, flags); 152 #endif 153 atomic_set(&state->refs, 1); 154 init_waitqueue_head(&state->wq); 155 trace_alloc_extent_state(state, mask, _RET_IP_); 156 return state; 157 } 158 159 void free_extent_state(struct extent_state *state) 160 { 161 if (!state) 162 return; 163 if (atomic_dec_and_test(&state->refs)) { 164 #if LEAK_DEBUG 165 unsigned long flags; 166 #endif 167 WARN_ON(state->tree); 168 #if LEAK_DEBUG 169 spin_lock_irqsave(&leak_lock, flags); 170 list_del(&state->leak_list); 171 spin_unlock_irqrestore(&leak_lock, flags); 172 #endif 173 trace_free_extent_state(state, _RET_IP_); 174 kmem_cache_free(extent_state_cache, state); 175 } 176 } 177 178 static struct rb_node *tree_insert(struct rb_root *root, u64 offset, 179 struct rb_node *node) 180 { 181 struct rb_node **p = &root->rb_node; 182 struct rb_node *parent = NULL; 183 struct tree_entry *entry; 184 185 while (*p) { 186 parent = *p; 187 entry = rb_entry(parent, struct tree_entry, rb_node); 188 189 if (offset < entry->start) 190 p = &(*p)->rb_left; 191 else if (offset > entry->end) 192 p = &(*p)->rb_right; 193 else 194 return parent; 195 } 196 197 rb_link_node(node, parent, p); 198 rb_insert_color(node, root); 199 return NULL; 200 } 201 202 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, 203 struct rb_node **prev_ret, 204 struct rb_node **next_ret) 205 { 206 struct rb_root *root = &tree->state; 207 struct rb_node *n = root->rb_node; 208 struct rb_node *prev = NULL; 209 struct rb_node *orig_prev = NULL; 210 struct tree_entry *entry; 211 struct tree_entry *prev_entry = NULL; 212 213 while (n) { 214 entry = rb_entry(n, struct tree_entry, rb_node); 215 prev = n; 216 prev_entry = entry; 217 218 if (offset < entry->start) 219 n = n->rb_left; 220 else if (offset > entry->end) 221 n = n->rb_right; 222 else 223 return n; 224 } 225 226 if (prev_ret) { 227 orig_prev = prev; 228 while (prev && offset > prev_entry->end) { 229 prev = rb_next(prev); 230 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 231 } 232 *prev_ret = prev; 233 prev = orig_prev; 234 } 235 236 if (next_ret) { 237 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 238 while (prev && offset < prev_entry->start) { 239 prev = rb_prev(prev); 240 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 241 } 242 *next_ret = prev; 243 } 244 return NULL; 245 } 246 247 static inline struct rb_node *tree_search(struct extent_io_tree *tree, 248 u64 offset) 249 { 250 struct rb_node *prev = NULL; 251 struct rb_node *ret; 252 253 ret = __etree_search(tree, offset, &prev, NULL); 254 if (!ret) 255 return prev; 256 return ret; 257 } 258 259 static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, 260 struct extent_state *other) 261 { 262 if (tree->ops && tree->ops->merge_extent_hook) 263 tree->ops->merge_extent_hook(tree->mapping->host, new, 264 other); 265 } 266 267 /* 268 * utility function to look for merge candidates inside a given range. 269 * Any extents with matching state are merged together into a single 270 * extent in the tree. Extents with EXTENT_IO in their state field 271 * are not merged because the end_io handlers need to be able to do 272 * operations on them without sleeping (or doing allocations/splits). 273 * 274 * This should be called with the tree lock held. 275 */ 276 static void merge_state(struct extent_io_tree *tree, 277 struct extent_state *state) 278 { 279 struct extent_state *other; 280 struct rb_node *other_node; 281 282 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 283 return; 284 285 other_node = rb_prev(&state->rb_node); 286 if (other_node) { 287 other = rb_entry(other_node, struct extent_state, rb_node); 288 if (other->end == state->start - 1 && 289 other->state == state->state) { 290 merge_cb(tree, state, other); 291 state->start = other->start; 292 other->tree = NULL; 293 rb_erase(&other->rb_node, &tree->state); 294 free_extent_state(other); 295 } 296 } 297 other_node = rb_next(&state->rb_node); 298 if (other_node) { 299 other = rb_entry(other_node, struct extent_state, rb_node); 300 if (other->start == state->end + 1 && 301 other->state == state->state) { 302 merge_cb(tree, state, other); 303 state->end = other->end; 304 other->tree = NULL; 305 rb_erase(&other->rb_node, &tree->state); 306 free_extent_state(other); 307 } 308 } 309 } 310 311 static void set_state_cb(struct extent_io_tree *tree, 312 struct extent_state *state, int *bits) 313 { 314 if (tree->ops && tree->ops->set_bit_hook) 315 tree->ops->set_bit_hook(tree->mapping->host, state, bits); 316 } 317 318 static void clear_state_cb(struct extent_io_tree *tree, 319 struct extent_state *state, int *bits) 320 { 321 if (tree->ops && tree->ops->clear_bit_hook) 322 tree->ops->clear_bit_hook(tree->mapping->host, state, bits); 323 } 324 325 static void set_state_bits(struct extent_io_tree *tree, 326 struct extent_state *state, int *bits); 327 328 /* 329 * insert an extent_state struct into the tree. 'bits' are set on the 330 * struct before it is inserted. 331 * 332 * This may return -EEXIST if the extent is already there, in which case the 333 * state struct is freed. 334 * 335 * The tree lock is not taken internally. This is a utility function and 336 * probably isn't what you want to call (see set/clear_extent_bit). 337 */ 338 static int insert_state(struct extent_io_tree *tree, 339 struct extent_state *state, u64 start, u64 end, 340 int *bits) 341 { 342 struct rb_node *node; 343 344 if (end < start) 345 WARN(1, KERN_ERR "btrfs end < start %llu %llu\n", 346 (unsigned long long)end, 347 (unsigned long long)start); 348 state->start = start; 349 state->end = end; 350 351 set_state_bits(tree, state, bits); 352 353 node = tree_insert(&tree->state, end, &state->rb_node); 354 if (node) { 355 struct extent_state *found; 356 found = rb_entry(node, struct extent_state, rb_node); 357 printk(KERN_ERR "btrfs found node %llu %llu on insert of " 358 "%llu %llu\n", (unsigned long long)found->start, 359 (unsigned long long)found->end, 360 (unsigned long long)start, (unsigned long long)end); 361 return -EEXIST; 362 } 363 state->tree = tree; 364 merge_state(tree, state); 365 return 0; 366 } 367 368 static void split_cb(struct extent_io_tree *tree, struct extent_state *orig, 369 u64 split) 370 { 371 if (tree->ops && tree->ops->split_extent_hook) 372 tree->ops->split_extent_hook(tree->mapping->host, orig, split); 373 } 374 375 /* 376 * split a given extent state struct in two, inserting the preallocated 377 * struct 'prealloc' as the newly created second half. 'split' indicates an 378 * offset inside 'orig' where it should be split. 379 * 380 * Before calling, 381 * the tree has 'orig' at [orig->start, orig->end]. After calling, there 382 * are two extent state structs in the tree: 383 * prealloc: [orig->start, split - 1] 384 * orig: [ split, orig->end ] 385 * 386 * The tree locks are not taken by this function. They need to be held 387 * by the caller. 388 */ 389 static int split_state(struct extent_io_tree *tree, struct extent_state *orig, 390 struct extent_state *prealloc, u64 split) 391 { 392 struct rb_node *node; 393 394 split_cb(tree, orig, split); 395 396 prealloc->start = orig->start; 397 prealloc->end = split - 1; 398 prealloc->state = orig->state; 399 orig->start = split; 400 401 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); 402 if (node) { 403 free_extent_state(prealloc); 404 return -EEXIST; 405 } 406 prealloc->tree = tree; 407 return 0; 408 } 409 410 static struct extent_state *next_state(struct extent_state *state) 411 { 412 struct rb_node *next = rb_next(&state->rb_node); 413 if (next) 414 return rb_entry(next, struct extent_state, rb_node); 415 else 416 return NULL; 417 } 418 419 /* 420 * utility function to clear some bits in an extent state struct. 421 * it will optionally wake up any one waiting on this state (wake == 1). 422 * 423 * If no bits are set on the state struct after clearing things, the 424 * struct is freed and removed from the tree 425 */ 426 static struct extent_state *clear_state_bit(struct extent_io_tree *tree, 427 struct extent_state *state, 428 int *bits, int wake) 429 { 430 struct extent_state *next; 431 int bits_to_clear = *bits & ~EXTENT_CTLBITS; 432 433 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 434 u64 range = state->end - state->start + 1; 435 WARN_ON(range > tree->dirty_bytes); 436 tree->dirty_bytes -= range; 437 } 438 clear_state_cb(tree, state, bits); 439 state->state &= ~bits_to_clear; 440 if (wake) 441 wake_up(&state->wq); 442 if (state->state == 0) { 443 next = next_state(state); 444 if (state->tree) { 445 rb_erase(&state->rb_node, &tree->state); 446 state->tree = NULL; 447 free_extent_state(state); 448 } else { 449 WARN_ON(1); 450 } 451 } else { 452 merge_state(tree, state); 453 next = next_state(state); 454 } 455 return next; 456 } 457 458 static struct extent_state * 459 alloc_extent_state_atomic(struct extent_state *prealloc) 460 { 461 if (!prealloc) 462 prealloc = alloc_extent_state(GFP_ATOMIC); 463 464 return prealloc; 465 } 466 467 void extent_io_tree_panic(struct extent_io_tree *tree, int err) 468 { 469 btrfs_panic(tree_fs_info(tree), err, "Locking error: " 470 "Extent tree was modified by another " 471 "thread while locked."); 472 } 473 474 /* 475 * clear some bits on a range in the tree. This may require splitting 476 * or inserting elements in the tree, so the gfp mask is used to 477 * indicate which allocations or sleeping are allowed. 478 * 479 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove 480 * the given range from the tree regardless of state (ie for truncate). 481 * 482 * the range [start, end] is inclusive. 483 * 484 * This takes the tree lock, and returns 0 on success and < 0 on error. 485 */ 486 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 487 int bits, int wake, int delete, 488 struct extent_state **cached_state, 489 gfp_t mask) 490 { 491 struct extent_state *state; 492 struct extent_state *cached; 493 struct extent_state *prealloc = NULL; 494 struct rb_node *node; 495 u64 last_end; 496 int err; 497 int clear = 0; 498 499 if (delete) 500 bits |= ~EXTENT_CTLBITS; 501 bits |= EXTENT_FIRST_DELALLOC; 502 503 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 504 clear = 1; 505 again: 506 if (!prealloc && (mask & __GFP_WAIT)) { 507 prealloc = alloc_extent_state(mask); 508 if (!prealloc) 509 return -ENOMEM; 510 } 511 512 spin_lock(&tree->lock); 513 if (cached_state) { 514 cached = *cached_state; 515 516 if (clear) { 517 *cached_state = NULL; 518 cached_state = NULL; 519 } 520 521 if (cached && cached->tree && cached->start <= start && 522 cached->end > start) { 523 if (clear) 524 atomic_dec(&cached->refs); 525 state = cached; 526 goto hit_next; 527 } 528 if (clear) 529 free_extent_state(cached); 530 } 531 /* 532 * this search will find the extents that end after 533 * our range starts 534 */ 535 node = tree_search(tree, start); 536 if (!node) 537 goto out; 538 state = rb_entry(node, struct extent_state, rb_node); 539 hit_next: 540 if (state->start > end) 541 goto out; 542 WARN_ON(state->end < start); 543 last_end = state->end; 544 545 /* the state doesn't have the wanted bits, go ahead */ 546 if (!(state->state & bits)) { 547 state = next_state(state); 548 goto next; 549 } 550 551 /* 552 * | ---- desired range ---- | 553 * | state | or 554 * | ------------- state -------------- | 555 * 556 * We need to split the extent we found, and may flip 557 * bits on second half. 558 * 559 * If the extent we found extends past our range, we 560 * just split and search again. It'll get split again 561 * the next time though. 562 * 563 * If the extent we found is inside our range, we clear 564 * the desired bit on it. 565 */ 566 567 if (state->start < start) { 568 prealloc = alloc_extent_state_atomic(prealloc); 569 BUG_ON(!prealloc); 570 err = split_state(tree, state, prealloc, start); 571 if (err) 572 extent_io_tree_panic(tree, err); 573 574 prealloc = NULL; 575 if (err) 576 goto out; 577 if (state->end <= end) { 578 state = clear_state_bit(tree, state, &bits, wake); 579 goto next; 580 } 581 goto search_again; 582 } 583 /* 584 * | ---- desired range ---- | 585 * | state | 586 * We need to split the extent, and clear the bit 587 * on the first half 588 */ 589 if (state->start <= end && state->end > end) { 590 prealloc = alloc_extent_state_atomic(prealloc); 591 BUG_ON(!prealloc); 592 err = split_state(tree, state, prealloc, end + 1); 593 if (err) 594 extent_io_tree_panic(tree, err); 595 596 if (wake) 597 wake_up(&state->wq); 598 599 clear_state_bit(tree, prealloc, &bits, wake); 600 601 prealloc = NULL; 602 goto out; 603 } 604 605 state = clear_state_bit(tree, state, &bits, wake); 606 next: 607 if (last_end == (u64)-1) 608 goto out; 609 start = last_end + 1; 610 if (start <= end && state && !need_resched()) 611 goto hit_next; 612 goto search_again; 613 614 out: 615 spin_unlock(&tree->lock); 616 if (prealloc) 617 free_extent_state(prealloc); 618 619 return 0; 620 621 search_again: 622 if (start > end) 623 goto out; 624 spin_unlock(&tree->lock); 625 if (mask & __GFP_WAIT) 626 cond_resched(); 627 goto again; 628 } 629 630 static void wait_on_state(struct extent_io_tree *tree, 631 struct extent_state *state) 632 __releases(tree->lock) 633 __acquires(tree->lock) 634 { 635 DEFINE_WAIT(wait); 636 prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); 637 spin_unlock(&tree->lock); 638 schedule(); 639 spin_lock(&tree->lock); 640 finish_wait(&state->wq, &wait); 641 } 642 643 /* 644 * waits for one or more bits to clear on a range in the state tree. 645 * The range [start, end] is inclusive. 646 * The tree lock is taken by this function 647 */ 648 void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits) 649 { 650 struct extent_state *state; 651 struct rb_node *node; 652 653 spin_lock(&tree->lock); 654 again: 655 while (1) { 656 /* 657 * this search will find all the extents that end after 658 * our range starts 659 */ 660 node = tree_search(tree, start); 661 if (!node) 662 break; 663 664 state = rb_entry(node, struct extent_state, rb_node); 665 666 if (state->start > end) 667 goto out; 668 669 if (state->state & bits) { 670 start = state->start; 671 atomic_inc(&state->refs); 672 wait_on_state(tree, state); 673 free_extent_state(state); 674 goto again; 675 } 676 start = state->end + 1; 677 678 if (start > end) 679 break; 680 681 cond_resched_lock(&tree->lock); 682 } 683 out: 684 spin_unlock(&tree->lock); 685 } 686 687 static void set_state_bits(struct extent_io_tree *tree, 688 struct extent_state *state, 689 int *bits) 690 { 691 int bits_to_set = *bits & ~EXTENT_CTLBITS; 692 693 set_state_cb(tree, state, bits); 694 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { 695 u64 range = state->end - state->start + 1; 696 tree->dirty_bytes += range; 697 } 698 state->state |= bits_to_set; 699 } 700 701 static void cache_state(struct extent_state *state, 702 struct extent_state **cached_ptr) 703 { 704 if (cached_ptr && !(*cached_ptr)) { 705 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) { 706 *cached_ptr = state; 707 atomic_inc(&state->refs); 708 } 709 } 710 } 711 712 static void uncache_state(struct extent_state **cached_ptr) 713 { 714 if (cached_ptr && (*cached_ptr)) { 715 struct extent_state *state = *cached_ptr; 716 *cached_ptr = NULL; 717 free_extent_state(state); 718 } 719 } 720 721 /* 722 * set some bits on a range in the tree. This may require allocations or 723 * sleeping, so the gfp mask is used to indicate what is allowed. 724 * 725 * If any of the exclusive bits are set, this will fail with -EEXIST if some 726 * part of the range already has the desired bits set. The start of the 727 * existing range is returned in failed_start in this case. 728 * 729 * [start, end] is inclusive This takes the tree lock. 730 */ 731 732 static int __must_check 733 __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 734 int bits, int exclusive_bits, u64 *failed_start, 735 struct extent_state **cached_state, gfp_t mask) 736 { 737 struct extent_state *state; 738 struct extent_state *prealloc = NULL; 739 struct rb_node *node; 740 int err = 0; 741 u64 last_start; 742 u64 last_end; 743 744 bits |= EXTENT_FIRST_DELALLOC; 745 again: 746 if (!prealloc && (mask & __GFP_WAIT)) { 747 prealloc = alloc_extent_state(mask); 748 BUG_ON(!prealloc); 749 } 750 751 spin_lock(&tree->lock); 752 if (cached_state && *cached_state) { 753 state = *cached_state; 754 if (state->start <= start && state->end > start && 755 state->tree) { 756 node = &state->rb_node; 757 goto hit_next; 758 } 759 } 760 /* 761 * this search will find all the extents that end after 762 * our range starts. 763 */ 764 node = tree_search(tree, start); 765 if (!node) { 766 prealloc = alloc_extent_state_atomic(prealloc); 767 BUG_ON(!prealloc); 768 err = insert_state(tree, prealloc, start, end, &bits); 769 if (err) 770 extent_io_tree_panic(tree, err); 771 772 prealloc = NULL; 773 goto out; 774 } 775 state = rb_entry(node, struct extent_state, rb_node); 776 hit_next: 777 last_start = state->start; 778 last_end = state->end; 779 780 /* 781 * | ---- desired range ---- | 782 * | state | 783 * 784 * Just lock what we found and keep going 785 */ 786 if (state->start == start && state->end <= end) { 787 if (state->state & exclusive_bits) { 788 *failed_start = state->start; 789 err = -EEXIST; 790 goto out; 791 } 792 793 set_state_bits(tree, state, &bits); 794 cache_state(state, cached_state); 795 merge_state(tree, state); 796 if (last_end == (u64)-1) 797 goto out; 798 start = last_end + 1; 799 state = next_state(state); 800 if (start < end && state && state->start == start && 801 !need_resched()) 802 goto hit_next; 803 goto search_again; 804 } 805 806 /* 807 * | ---- desired range ---- | 808 * | state | 809 * or 810 * | ------------- state -------------- | 811 * 812 * We need to split the extent we found, and may flip bits on 813 * second half. 814 * 815 * If the extent we found extends past our 816 * range, we just split and search again. It'll get split 817 * again the next time though. 818 * 819 * If the extent we found is inside our range, we set the 820 * desired bit on it. 821 */ 822 if (state->start < start) { 823 if (state->state & exclusive_bits) { 824 *failed_start = start; 825 err = -EEXIST; 826 goto out; 827 } 828 829 prealloc = alloc_extent_state_atomic(prealloc); 830 BUG_ON(!prealloc); 831 err = split_state(tree, state, prealloc, start); 832 if (err) 833 extent_io_tree_panic(tree, err); 834 835 prealloc = NULL; 836 if (err) 837 goto out; 838 if (state->end <= end) { 839 set_state_bits(tree, state, &bits); 840 cache_state(state, cached_state); 841 merge_state(tree, state); 842 if (last_end == (u64)-1) 843 goto out; 844 start = last_end + 1; 845 state = next_state(state); 846 if (start < end && state && state->start == start && 847 !need_resched()) 848 goto hit_next; 849 } 850 goto search_again; 851 } 852 /* 853 * | ---- desired range ---- | 854 * | state | or | state | 855 * 856 * There's a hole, we need to insert something in it and 857 * ignore the extent we found. 858 */ 859 if (state->start > start) { 860 u64 this_end; 861 if (end < last_start) 862 this_end = end; 863 else 864 this_end = last_start - 1; 865 866 prealloc = alloc_extent_state_atomic(prealloc); 867 BUG_ON(!prealloc); 868 869 /* 870 * Avoid to free 'prealloc' if it can be merged with 871 * the later extent. 872 */ 873 err = insert_state(tree, prealloc, start, this_end, 874 &bits); 875 if (err) 876 extent_io_tree_panic(tree, err); 877 878 cache_state(prealloc, cached_state); 879 prealloc = NULL; 880 start = this_end + 1; 881 goto search_again; 882 } 883 /* 884 * | ---- desired range ---- | 885 * | state | 886 * We need to split the extent, and set the bit 887 * on the first half 888 */ 889 if (state->start <= end && state->end > end) { 890 if (state->state & exclusive_bits) { 891 *failed_start = start; 892 err = -EEXIST; 893 goto out; 894 } 895 896 prealloc = alloc_extent_state_atomic(prealloc); 897 BUG_ON(!prealloc); 898 err = split_state(tree, state, prealloc, end + 1); 899 if (err) 900 extent_io_tree_panic(tree, err); 901 902 set_state_bits(tree, prealloc, &bits); 903 cache_state(prealloc, cached_state); 904 merge_state(tree, prealloc); 905 prealloc = NULL; 906 goto out; 907 } 908 909 goto search_again; 910 911 out: 912 spin_unlock(&tree->lock); 913 if (prealloc) 914 free_extent_state(prealloc); 915 916 return err; 917 918 search_again: 919 if (start > end) 920 goto out; 921 spin_unlock(&tree->lock); 922 if (mask & __GFP_WAIT) 923 cond_resched(); 924 goto again; 925 } 926 927 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits, 928 u64 *failed_start, struct extent_state **cached_state, 929 gfp_t mask) 930 { 931 return __set_extent_bit(tree, start, end, bits, 0, failed_start, 932 cached_state, mask); 933 } 934 935 936 /** 937 * convert_extent_bit - convert all bits in a given range from one bit to 938 * another 939 * @tree: the io tree to search 940 * @start: the start offset in bytes 941 * @end: the end offset in bytes (inclusive) 942 * @bits: the bits to set in this range 943 * @clear_bits: the bits to clear in this range 944 * @cached_state: state that we're going to cache 945 * @mask: the allocation mask 946 * 947 * This will go through and set bits for the given range. If any states exist 948 * already in this range they are set with the given bit and cleared of the 949 * clear_bits. This is only meant to be used by things that are mergeable, ie 950 * converting from say DELALLOC to DIRTY. This is not meant to be used with 951 * boundary bits like LOCK. 952 */ 953 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 954 int bits, int clear_bits, 955 struct extent_state **cached_state, gfp_t mask) 956 { 957 struct extent_state *state; 958 struct extent_state *prealloc = NULL; 959 struct rb_node *node; 960 int err = 0; 961 u64 last_start; 962 u64 last_end; 963 964 again: 965 if (!prealloc && (mask & __GFP_WAIT)) { 966 prealloc = alloc_extent_state(mask); 967 if (!prealloc) 968 return -ENOMEM; 969 } 970 971 spin_lock(&tree->lock); 972 if (cached_state && *cached_state) { 973 state = *cached_state; 974 if (state->start <= start && state->end > start && 975 state->tree) { 976 node = &state->rb_node; 977 goto hit_next; 978 } 979 } 980 981 /* 982 * this search will find all the extents that end after 983 * our range starts. 984 */ 985 node = tree_search(tree, start); 986 if (!node) { 987 prealloc = alloc_extent_state_atomic(prealloc); 988 if (!prealloc) { 989 err = -ENOMEM; 990 goto out; 991 } 992 err = insert_state(tree, prealloc, start, end, &bits); 993 prealloc = NULL; 994 if (err) 995 extent_io_tree_panic(tree, err); 996 goto out; 997 } 998 state = rb_entry(node, struct extent_state, rb_node); 999 hit_next: 1000 last_start = state->start; 1001 last_end = state->end; 1002 1003 /* 1004 * | ---- desired range ---- | 1005 * | state | 1006 * 1007 * Just lock what we found and keep going 1008 */ 1009 if (state->start == start && state->end <= end) { 1010 set_state_bits(tree, state, &bits); 1011 cache_state(state, cached_state); 1012 state = clear_state_bit(tree, state, &clear_bits, 0); 1013 if (last_end == (u64)-1) 1014 goto out; 1015 start = last_end + 1; 1016 if (start < end && state && state->start == start && 1017 !need_resched()) 1018 goto hit_next; 1019 goto search_again; 1020 } 1021 1022 /* 1023 * | ---- desired range ---- | 1024 * | state | 1025 * or 1026 * | ------------- state -------------- | 1027 * 1028 * We need to split the extent we found, and may flip bits on 1029 * second half. 1030 * 1031 * If the extent we found extends past our 1032 * range, we just split and search again. It'll get split 1033 * again the next time though. 1034 * 1035 * If the extent we found is inside our range, we set the 1036 * desired bit on it. 1037 */ 1038 if (state->start < start) { 1039 prealloc = alloc_extent_state_atomic(prealloc); 1040 if (!prealloc) { 1041 err = -ENOMEM; 1042 goto out; 1043 } 1044 err = split_state(tree, state, prealloc, start); 1045 if (err) 1046 extent_io_tree_panic(tree, err); 1047 prealloc = NULL; 1048 if (err) 1049 goto out; 1050 if (state->end <= end) { 1051 set_state_bits(tree, state, &bits); 1052 cache_state(state, cached_state); 1053 state = clear_state_bit(tree, state, &clear_bits, 0); 1054 if (last_end == (u64)-1) 1055 goto out; 1056 start = last_end + 1; 1057 if (start < end && state && state->start == start && 1058 !need_resched()) 1059 goto hit_next; 1060 } 1061 goto search_again; 1062 } 1063 /* 1064 * | ---- desired range ---- | 1065 * | state | or | state | 1066 * 1067 * There's a hole, we need to insert something in it and 1068 * ignore the extent we found. 1069 */ 1070 if (state->start > start) { 1071 u64 this_end; 1072 if (end < last_start) 1073 this_end = end; 1074 else 1075 this_end = last_start - 1; 1076 1077 prealloc = alloc_extent_state_atomic(prealloc); 1078 if (!prealloc) { 1079 err = -ENOMEM; 1080 goto out; 1081 } 1082 1083 /* 1084 * Avoid to free 'prealloc' if it can be merged with 1085 * the later extent. 1086 */ 1087 err = insert_state(tree, prealloc, start, this_end, 1088 &bits); 1089 if (err) 1090 extent_io_tree_panic(tree, err); 1091 cache_state(prealloc, cached_state); 1092 prealloc = NULL; 1093 start = this_end + 1; 1094 goto search_again; 1095 } 1096 /* 1097 * | ---- desired range ---- | 1098 * | state | 1099 * We need to split the extent, and set the bit 1100 * on the first half 1101 */ 1102 if (state->start <= end && state->end > end) { 1103 prealloc = alloc_extent_state_atomic(prealloc); 1104 if (!prealloc) { 1105 err = -ENOMEM; 1106 goto out; 1107 } 1108 1109 err = split_state(tree, state, prealloc, end + 1); 1110 if (err) 1111 extent_io_tree_panic(tree, err); 1112 1113 set_state_bits(tree, prealloc, &bits); 1114 cache_state(prealloc, cached_state); 1115 clear_state_bit(tree, prealloc, &clear_bits, 0); 1116 prealloc = NULL; 1117 goto out; 1118 } 1119 1120 goto search_again; 1121 1122 out: 1123 spin_unlock(&tree->lock); 1124 if (prealloc) 1125 free_extent_state(prealloc); 1126 1127 return err; 1128 1129 search_again: 1130 if (start > end) 1131 goto out; 1132 spin_unlock(&tree->lock); 1133 if (mask & __GFP_WAIT) 1134 cond_resched(); 1135 goto again; 1136 } 1137 1138 /* wrappers around set/clear extent bit */ 1139 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 1140 gfp_t mask) 1141 { 1142 return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, 1143 NULL, mask); 1144 } 1145 1146 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1147 int bits, gfp_t mask) 1148 { 1149 return set_extent_bit(tree, start, end, bits, NULL, 1150 NULL, mask); 1151 } 1152 1153 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1154 int bits, gfp_t mask) 1155 { 1156 return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); 1157 } 1158 1159 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 1160 struct extent_state **cached_state, gfp_t mask) 1161 { 1162 return set_extent_bit(tree, start, end, 1163 EXTENT_DELALLOC | EXTENT_UPTODATE, 1164 NULL, cached_state, mask); 1165 } 1166 1167 int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, 1168 struct extent_state **cached_state, gfp_t mask) 1169 { 1170 return set_extent_bit(tree, start, end, 1171 EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG, 1172 NULL, cached_state, mask); 1173 } 1174 1175 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 1176 gfp_t mask) 1177 { 1178 return clear_extent_bit(tree, start, end, 1179 EXTENT_DIRTY | EXTENT_DELALLOC | 1180 EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); 1181 } 1182 1183 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 1184 gfp_t mask) 1185 { 1186 return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, 1187 NULL, mask); 1188 } 1189 1190 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 1191 struct extent_state **cached_state, gfp_t mask) 1192 { 1193 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 1194 cached_state, mask); 1195 } 1196 1197 int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 1198 struct extent_state **cached_state, gfp_t mask) 1199 { 1200 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, 1201 cached_state, mask); 1202 } 1203 1204 /* 1205 * either insert or lock state struct between start and end use mask to tell 1206 * us if waiting is desired. 1207 */ 1208 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1209 int bits, struct extent_state **cached_state) 1210 { 1211 int err; 1212 u64 failed_start; 1213 while (1) { 1214 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, 1215 EXTENT_LOCKED, &failed_start, 1216 cached_state, GFP_NOFS); 1217 if (err == -EEXIST) { 1218 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); 1219 start = failed_start; 1220 } else 1221 break; 1222 WARN_ON(start > end); 1223 } 1224 return err; 1225 } 1226 1227 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end) 1228 { 1229 return lock_extent_bits(tree, start, end, 0, NULL); 1230 } 1231 1232 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) 1233 { 1234 int err; 1235 u64 failed_start; 1236 1237 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, 1238 &failed_start, NULL, GFP_NOFS); 1239 if (err == -EEXIST) { 1240 if (failed_start > start) 1241 clear_extent_bit(tree, start, failed_start - 1, 1242 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); 1243 return 0; 1244 } 1245 return 1; 1246 } 1247 1248 int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, 1249 struct extent_state **cached, gfp_t mask) 1250 { 1251 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, 1252 mask); 1253 } 1254 1255 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) 1256 { 1257 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, 1258 GFP_NOFS); 1259 } 1260 1261 /* 1262 * helper function to set both pages and extents in the tree writeback 1263 */ 1264 static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) 1265 { 1266 unsigned long index = start >> PAGE_CACHE_SHIFT; 1267 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1268 struct page *page; 1269 1270 while (index <= end_index) { 1271 page = find_get_page(tree->mapping, index); 1272 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1273 set_page_writeback(page); 1274 page_cache_release(page); 1275 index++; 1276 } 1277 return 0; 1278 } 1279 1280 /* find the first state struct with 'bits' set after 'start', and 1281 * return it. tree->lock must be held. NULL will returned if 1282 * nothing was found after 'start' 1283 */ 1284 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, 1285 u64 start, int bits) 1286 { 1287 struct rb_node *node; 1288 struct extent_state *state; 1289 1290 /* 1291 * this search will find all the extents that end after 1292 * our range starts. 1293 */ 1294 node = tree_search(tree, start); 1295 if (!node) 1296 goto out; 1297 1298 while (1) { 1299 state = rb_entry(node, struct extent_state, rb_node); 1300 if (state->end >= start && (state->state & bits)) 1301 return state; 1302 1303 node = rb_next(node); 1304 if (!node) 1305 break; 1306 } 1307 out: 1308 return NULL; 1309 } 1310 1311 /* 1312 * find the first offset in the io tree with 'bits' set. zero is 1313 * returned if we find something, and *start_ret and *end_ret are 1314 * set to reflect the state struct that was found. 1315 * 1316 * If nothing was found, 1 is returned. If found something, return 0. 1317 */ 1318 int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 1319 u64 *start_ret, u64 *end_ret, int bits, 1320 struct extent_state **cached_state) 1321 { 1322 struct extent_state *state; 1323 struct rb_node *n; 1324 int ret = 1; 1325 1326 spin_lock(&tree->lock); 1327 if (cached_state && *cached_state) { 1328 state = *cached_state; 1329 if (state->end == start - 1 && state->tree) { 1330 n = rb_next(&state->rb_node); 1331 while (n) { 1332 state = rb_entry(n, struct extent_state, 1333 rb_node); 1334 if (state->state & bits) 1335 goto got_it; 1336 n = rb_next(n); 1337 } 1338 free_extent_state(*cached_state); 1339 *cached_state = NULL; 1340 goto out; 1341 } 1342 free_extent_state(*cached_state); 1343 *cached_state = NULL; 1344 } 1345 1346 state = find_first_extent_bit_state(tree, start, bits); 1347 got_it: 1348 if (state) { 1349 cache_state(state, cached_state); 1350 *start_ret = state->start; 1351 *end_ret = state->end; 1352 ret = 0; 1353 } 1354 out: 1355 spin_unlock(&tree->lock); 1356 return ret; 1357 } 1358 1359 /* 1360 * find a contiguous range of bytes in the file marked as delalloc, not 1361 * more than 'max_bytes'. start and end are used to return the range, 1362 * 1363 * 1 is returned if we find something, 0 if nothing was in the tree 1364 */ 1365 static noinline u64 find_delalloc_range(struct extent_io_tree *tree, 1366 u64 *start, u64 *end, u64 max_bytes, 1367 struct extent_state **cached_state) 1368 { 1369 struct rb_node *node; 1370 struct extent_state *state; 1371 u64 cur_start = *start; 1372 u64 found = 0; 1373 u64 total_bytes = 0; 1374 1375 spin_lock(&tree->lock); 1376 1377 /* 1378 * this search will find all the extents that end after 1379 * our range starts. 1380 */ 1381 node = tree_search(tree, cur_start); 1382 if (!node) { 1383 if (!found) 1384 *end = (u64)-1; 1385 goto out; 1386 } 1387 1388 while (1) { 1389 state = rb_entry(node, struct extent_state, rb_node); 1390 if (found && (state->start != cur_start || 1391 (state->state & EXTENT_BOUNDARY))) { 1392 goto out; 1393 } 1394 if (!(state->state & EXTENT_DELALLOC)) { 1395 if (!found) 1396 *end = state->end; 1397 goto out; 1398 } 1399 if (!found) { 1400 *start = state->start; 1401 *cached_state = state; 1402 atomic_inc(&state->refs); 1403 } 1404 found++; 1405 *end = state->end; 1406 cur_start = state->end + 1; 1407 node = rb_next(node); 1408 if (!node) 1409 break; 1410 total_bytes += state->end - state->start + 1; 1411 if (total_bytes >= max_bytes) 1412 break; 1413 } 1414 out: 1415 spin_unlock(&tree->lock); 1416 return found; 1417 } 1418 1419 static noinline void __unlock_for_delalloc(struct inode *inode, 1420 struct page *locked_page, 1421 u64 start, u64 end) 1422 { 1423 int ret; 1424 struct page *pages[16]; 1425 unsigned long index = start >> PAGE_CACHE_SHIFT; 1426 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1427 unsigned long nr_pages = end_index - index + 1; 1428 int i; 1429 1430 if (index == locked_page->index && end_index == index) 1431 return; 1432 1433 while (nr_pages > 0) { 1434 ret = find_get_pages_contig(inode->i_mapping, index, 1435 min_t(unsigned long, nr_pages, 1436 ARRAY_SIZE(pages)), pages); 1437 for (i = 0; i < ret; i++) { 1438 if (pages[i] != locked_page) 1439 unlock_page(pages[i]); 1440 page_cache_release(pages[i]); 1441 } 1442 nr_pages -= ret; 1443 index += ret; 1444 cond_resched(); 1445 } 1446 } 1447 1448 static noinline int lock_delalloc_pages(struct inode *inode, 1449 struct page *locked_page, 1450 u64 delalloc_start, 1451 u64 delalloc_end) 1452 { 1453 unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT; 1454 unsigned long start_index = index; 1455 unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT; 1456 unsigned long pages_locked = 0; 1457 struct page *pages[16]; 1458 unsigned long nrpages; 1459 int ret; 1460 int i; 1461 1462 /* the caller is responsible for locking the start index */ 1463 if (index == locked_page->index && index == end_index) 1464 return 0; 1465 1466 /* skip the page at the start index */ 1467 nrpages = end_index - index + 1; 1468 while (nrpages > 0) { 1469 ret = find_get_pages_contig(inode->i_mapping, index, 1470 min_t(unsigned long, 1471 nrpages, ARRAY_SIZE(pages)), pages); 1472 if (ret == 0) { 1473 ret = -EAGAIN; 1474 goto done; 1475 } 1476 /* now we have an array of pages, lock them all */ 1477 for (i = 0; i < ret; i++) { 1478 /* 1479 * the caller is taking responsibility for 1480 * locked_page 1481 */ 1482 if (pages[i] != locked_page) { 1483 lock_page(pages[i]); 1484 if (!PageDirty(pages[i]) || 1485 pages[i]->mapping != inode->i_mapping) { 1486 ret = -EAGAIN; 1487 unlock_page(pages[i]); 1488 page_cache_release(pages[i]); 1489 goto done; 1490 } 1491 } 1492 page_cache_release(pages[i]); 1493 pages_locked++; 1494 } 1495 nrpages -= ret; 1496 index += ret; 1497 cond_resched(); 1498 } 1499 ret = 0; 1500 done: 1501 if (ret && pages_locked) { 1502 __unlock_for_delalloc(inode, locked_page, 1503 delalloc_start, 1504 ((u64)(start_index + pages_locked - 1)) << 1505 PAGE_CACHE_SHIFT); 1506 } 1507 return ret; 1508 } 1509 1510 /* 1511 * find a contiguous range of bytes in the file marked as delalloc, not 1512 * more than 'max_bytes'. start and end are used to return the range, 1513 * 1514 * 1 is returned if we find something, 0 if nothing was in the tree 1515 */ 1516 static noinline u64 find_lock_delalloc_range(struct inode *inode, 1517 struct extent_io_tree *tree, 1518 struct page *locked_page, 1519 u64 *start, u64 *end, 1520 u64 max_bytes) 1521 { 1522 u64 delalloc_start; 1523 u64 delalloc_end; 1524 u64 found; 1525 struct extent_state *cached_state = NULL; 1526 int ret; 1527 int loops = 0; 1528 1529 again: 1530 /* step one, find a bunch of delalloc bytes starting at start */ 1531 delalloc_start = *start; 1532 delalloc_end = 0; 1533 found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, 1534 max_bytes, &cached_state); 1535 if (!found || delalloc_end <= *start) { 1536 *start = delalloc_start; 1537 *end = delalloc_end; 1538 free_extent_state(cached_state); 1539 return found; 1540 } 1541 1542 /* 1543 * start comes from the offset of locked_page. We have to lock 1544 * pages in order, so we can't process delalloc bytes before 1545 * locked_page 1546 */ 1547 if (delalloc_start < *start) 1548 delalloc_start = *start; 1549 1550 /* 1551 * make sure to limit the number of pages we try to lock down 1552 * if we're looping. 1553 */ 1554 if (delalloc_end + 1 - delalloc_start > max_bytes && loops) 1555 delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1; 1556 1557 /* step two, lock all the pages after the page that has start */ 1558 ret = lock_delalloc_pages(inode, locked_page, 1559 delalloc_start, delalloc_end); 1560 if (ret == -EAGAIN) { 1561 /* some of the pages are gone, lets avoid looping by 1562 * shortening the size of the delalloc range we're searching 1563 */ 1564 free_extent_state(cached_state); 1565 if (!loops) { 1566 unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); 1567 max_bytes = PAGE_CACHE_SIZE - offset; 1568 loops = 1; 1569 goto again; 1570 } else { 1571 found = 0; 1572 goto out_failed; 1573 } 1574 } 1575 BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */ 1576 1577 /* step three, lock the state bits for the whole range */ 1578 lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state); 1579 1580 /* then test to make sure it is all still delalloc */ 1581 ret = test_range_bit(tree, delalloc_start, delalloc_end, 1582 EXTENT_DELALLOC, 1, cached_state); 1583 if (!ret) { 1584 unlock_extent_cached(tree, delalloc_start, delalloc_end, 1585 &cached_state, GFP_NOFS); 1586 __unlock_for_delalloc(inode, locked_page, 1587 delalloc_start, delalloc_end); 1588 cond_resched(); 1589 goto again; 1590 } 1591 free_extent_state(cached_state); 1592 *start = delalloc_start; 1593 *end = delalloc_end; 1594 out_failed: 1595 return found; 1596 } 1597 1598 int extent_clear_unlock_delalloc(struct inode *inode, 1599 struct extent_io_tree *tree, 1600 u64 start, u64 end, struct page *locked_page, 1601 unsigned long op) 1602 { 1603 int ret; 1604 struct page *pages[16]; 1605 unsigned long index = start >> PAGE_CACHE_SHIFT; 1606 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1607 unsigned long nr_pages = end_index - index + 1; 1608 int i; 1609 int clear_bits = 0; 1610 1611 if (op & EXTENT_CLEAR_UNLOCK) 1612 clear_bits |= EXTENT_LOCKED; 1613 if (op & EXTENT_CLEAR_DIRTY) 1614 clear_bits |= EXTENT_DIRTY; 1615 1616 if (op & EXTENT_CLEAR_DELALLOC) 1617 clear_bits |= EXTENT_DELALLOC; 1618 1619 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); 1620 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | 1621 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | 1622 EXTENT_SET_PRIVATE2))) 1623 return 0; 1624 1625 while (nr_pages > 0) { 1626 ret = find_get_pages_contig(inode->i_mapping, index, 1627 min_t(unsigned long, 1628 nr_pages, ARRAY_SIZE(pages)), pages); 1629 for (i = 0; i < ret; i++) { 1630 1631 if (op & EXTENT_SET_PRIVATE2) 1632 SetPagePrivate2(pages[i]); 1633 1634 if (pages[i] == locked_page) { 1635 page_cache_release(pages[i]); 1636 continue; 1637 } 1638 if (op & EXTENT_CLEAR_DIRTY) 1639 clear_page_dirty_for_io(pages[i]); 1640 if (op & EXTENT_SET_WRITEBACK) 1641 set_page_writeback(pages[i]); 1642 if (op & EXTENT_END_WRITEBACK) 1643 end_page_writeback(pages[i]); 1644 if (op & EXTENT_CLEAR_UNLOCK_PAGE) 1645 unlock_page(pages[i]); 1646 page_cache_release(pages[i]); 1647 } 1648 nr_pages -= ret; 1649 index += ret; 1650 cond_resched(); 1651 } 1652 return 0; 1653 } 1654 1655 /* 1656 * count the number of bytes in the tree that have a given bit(s) 1657 * set. This can be fairly slow, except for EXTENT_DIRTY which is 1658 * cached. The total number found is returned. 1659 */ 1660 u64 count_range_bits(struct extent_io_tree *tree, 1661 u64 *start, u64 search_end, u64 max_bytes, 1662 unsigned long bits, int contig) 1663 { 1664 struct rb_node *node; 1665 struct extent_state *state; 1666 u64 cur_start = *start; 1667 u64 total_bytes = 0; 1668 u64 last = 0; 1669 int found = 0; 1670 1671 if (search_end <= cur_start) { 1672 WARN_ON(1); 1673 return 0; 1674 } 1675 1676 spin_lock(&tree->lock); 1677 if (cur_start == 0 && bits == EXTENT_DIRTY) { 1678 total_bytes = tree->dirty_bytes; 1679 goto out; 1680 } 1681 /* 1682 * this search will find all the extents that end after 1683 * our range starts. 1684 */ 1685 node = tree_search(tree, cur_start); 1686 if (!node) 1687 goto out; 1688 1689 while (1) { 1690 state = rb_entry(node, struct extent_state, rb_node); 1691 if (state->start > search_end) 1692 break; 1693 if (contig && found && state->start > last + 1) 1694 break; 1695 if (state->end >= cur_start && (state->state & bits) == bits) { 1696 total_bytes += min(search_end, state->end) + 1 - 1697 max(cur_start, state->start); 1698 if (total_bytes >= max_bytes) 1699 break; 1700 if (!found) { 1701 *start = max(cur_start, state->start); 1702 found = 1; 1703 } 1704 last = state->end; 1705 } else if (contig && found) { 1706 break; 1707 } 1708 node = rb_next(node); 1709 if (!node) 1710 break; 1711 } 1712 out: 1713 spin_unlock(&tree->lock); 1714 return total_bytes; 1715 } 1716 1717 /* 1718 * set the private field for a given byte offset in the tree. If there isn't 1719 * an extent_state there already, this does nothing. 1720 */ 1721 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) 1722 { 1723 struct rb_node *node; 1724 struct extent_state *state; 1725 int ret = 0; 1726 1727 spin_lock(&tree->lock); 1728 /* 1729 * this search will find all the extents that end after 1730 * our range starts. 1731 */ 1732 node = tree_search(tree, start); 1733 if (!node) { 1734 ret = -ENOENT; 1735 goto out; 1736 } 1737 state = rb_entry(node, struct extent_state, rb_node); 1738 if (state->start != start) { 1739 ret = -ENOENT; 1740 goto out; 1741 } 1742 state->private = private; 1743 out: 1744 spin_unlock(&tree->lock); 1745 return ret; 1746 } 1747 1748 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) 1749 { 1750 struct rb_node *node; 1751 struct extent_state *state; 1752 int ret = 0; 1753 1754 spin_lock(&tree->lock); 1755 /* 1756 * this search will find all the extents that end after 1757 * our range starts. 1758 */ 1759 node = tree_search(tree, start); 1760 if (!node) { 1761 ret = -ENOENT; 1762 goto out; 1763 } 1764 state = rb_entry(node, struct extent_state, rb_node); 1765 if (state->start != start) { 1766 ret = -ENOENT; 1767 goto out; 1768 } 1769 *private = state->private; 1770 out: 1771 spin_unlock(&tree->lock); 1772 return ret; 1773 } 1774 1775 /* 1776 * searches a range in the state tree for a given mask. 1777 * If 'filled' == 1, this returns 1 only if every extent in the tree 1778 * has the bits set. Otherwise, 1 is returned if any bit in the 1779 * range is found set. 1780 */ 1781 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 1782 int bits, int filled, struct extent_state *cached) 1783 { 1784 struct extent_state *state = NULL; 1785 struct rb_node *node; 1786 int bitset = 0; 1787 1788 spin_lock(&tree->lock); 1789 if (cached && cached->tree && cached->start <= start && 1790 cached->end > start) 1791 node = &cached->rb_node; 1792 else 1793 node = tree_search(tree, start); 1794 while (node && start <= end) { 1795 state = rb_entry(node, struct extent_state, rb_node); 1796 1797 if (filled && state->start > start) { 1798 bitset = 0; 1799 break; 1800 } 1801 1802 if (state->start > end) 1803 break; 1804 1805 if (state->state & bits) { 1806 bitset = 1; 1807 if (!filled) 1808 break; 1809 } else if (filled) { 1810 bitset = 0; 1811 break; 1812 } 1813 1814 if (state->end == (u64)-1) 1815 break; 1816 1817 start = state->end + 1; 1818 if (start > end) 1819 break; 1820 node = rb_next(node); 1821 if (!node) { 1822 if (filled) 1823 bitset = 0; 1824 break; 1825 } 1826 } 1827 spin_unlock(&tree->lock); 1828 return bitset; 1829 } 1830 1831 /* 1832 * helper function to set a given page up to date if all the 1833 * extents in the tree for that page are up to date 1834 */ 1835 static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) 1836 { 1837 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1838 u64 end = start + PAGE_CACHE_SIZE - 1; 1839 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) 1840 SetPageUptodate(page); 1841 } 1842 1843 /* 1844 * helper function to unlock a page if all the extents in the tree 1845 * for that page are unlocked 1846 */ 1847 static void check_page_locked(struct extent_io_tree *tree, struct page *page) 1848 { 1849 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1850 u64 end = start + PAGE_CACHE_SIZE - 1; 1851 if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) 1852 unlock_page(page); 1853 } 1854 1855 /* 1856 * helper function to end page writeback if all the extents 1857 * in the tree for that page are done with writeback 1858 */ 1859 static void check_page_writeback(struct extent_io_tree *tree, 1860 struct page *page) 1861 { 1862 end_page_writeback(page); 1863 } 1864 1865 /* 1866 * When IO fails, either with EIO or csum verification fails, we 1867 * try other mirrors that might have a good copy of the data. This 1868 * io_failure_record is used to record state as we go through all the 1869 * mirrors. If another mirror has good data, the page is set up to date 1870 * and things continue. If a good mirror can't be found, the original 1871 * bio end_io callback is called to indicate things have failed. 1872 */ 1873 struct io_failure_record { 1874 struct page *page; 1875 u64 start; 1876 u64 len; 1877 u64 logical; 1878 unsigned long bio_flags; 1879 int this_mirror; 1880 int failed_mirror; 1881 int in_validation; 1882 }; 1883 1884 static int free_io_failure(struct inode *inode, struct io_failure_record *rec, 1885 int did_repair) 1886 { 1887 int ret; 1888 int err = 0; 1889 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 1890 1891 set_state_private(failure_tree, rec->start, 0); 1892 ret = clear_extent_bits(failure_tree, rec->start, 1893 rec->start + rec->len - 1, 1894 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); 1895 if (ret) 1896 err = ret; 1897 1898 if (did_repair) { 1899 ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, 1900 rec->start + rec->len - 1, 1901 EXTENT_DAMAGED, GFP_NOFS); 1902 if (ret && !err) 1903 err = ret; 1904 } 1905 1906 kfree(rec); 1907 return err; 1908 } 1909 1910 static void repair_io_failure_callback(struct bio *bio, int err) 1911 { 1912 complete(bio->bi_private); 1913 } 1914 1915 /* 1916 * this bypasses the standard btrfs submit functions deliberately, as 1917 * the standard behavior is to write all copies in a raid setup. here we only 1918 * want to write the one bad copy. so we do the mapping for ourselves and issue 1919 * submit_bio directly. 1920 * to avoid any synchronization issues, wait for the data after writing, which 1921 * actually prevents the read that triggered the error from finishing. 1922 * currently, there can be no more than two copies of every data bit. thus, 1923 * exactly one rewrite is required. 1924 */ 1925 int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, 1926 u64 length, u64 logical, struct page *page, 1927 int mirror_num) 1928 { 1929 struct bio *bio; 1930 struct btrfs_device *dev; 1931 DECLARE_COMPLETION_ONSTACK(compl); 1932 u64 map_length = 0; 1933 u64 sector; 1934 struct btrfs_bio *bbio = NULL; 1935 int ret; 1936 1937 BUG_ON(!mirror_num); 1938 1939 bio = bio_alloc(GFP_NOFS, 1); 1940 if (!bio) 1941 return -EIO; 1942 bio->bi_private = &compl; 1943 bio->bi_end_io = repair_io_failure_callback; 1944 bio->bi_size = 0; 1945 map_length = length; 1946 1947 ret = btrfs_map_block(fs_info, WRITE, logical, 1948 &map_length, &bbio, mirror_num); 1949 if (ret) { 1950 bio_put(bio); 1951 return -EIO; 1952 } 1953 BUG_ON(mirror_num != bbio->mirror_num); 1954 sector = bbio->stripes[mirror_num-1].physical >> 9; 1955 bio->bi_sector = sector; 1956 dev = bbio->stripes[mirror_num-1].dev; 1957 kfree(bbio); 1958 if (!dev || !dev->bdev || !dev->writeable) { 1959 bio_put(bio); 1960 return -EIO; 1961 } 1962 bio->bi_bdev = dev->bdev; 1963 bio_add_page(bio, page, length, start-page_offset(page)); 1964 btrfsic_submit_bio(WRITE_SYNC, bio); 1965 wait_for_completion(&compl); 1966 1967 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { 1968 /* try to remap that extent elsewhere? */ 1969 bio_put(bio); 1970 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); 1971 return -EIO; 1972 } 1973 1974 printk_ratelimited_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu " 1975 "(dev %s sector %llu)\n", page->mapping->host->i_ino, 1976 start, rcu_str_deref(dev->name), sector); 1977 1978 bio_put(bio); 1979 return 0; 1980 } 1981 1982 int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, 1983 int mirror_num) 1984 { 1985 u64 start = eb->start; 1986 unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); 1987 int ret = 0; 1988 1989 for (i = 0; i < num_pages; i++) { 1990 struct page *p = extent_buffer_page(eb, i); 1991 ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE, 1992 start, p, mirror_num); 1993 if (ret) 1994 break; 1995 start += PAGE_CACHE_SIZE; 1996 } 1997 1998 return ret; 1999 } 2000 2001 /* 2002 * each time an IO finishes, we do a fast check in the IO failure tree 2003 * to see if we need to process or clean up an io_failure_record 2004 */ 2005 static int clean_io_failure(u64 start, struct page *page) 2006 { 2007 u64 private; 2008 u64 private_failure; 2009 struct io_failure_record *failrec; 2010 struct btrfs_fs_info *fs_info; 2011 struct extent_state *state; 2012 int num_copies; 2013 int did_repair = 0; 2014 int ret; 2015 struct inode *inode = page->mapping->host; 2016 2017 private = 0; 2018 ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, 2019 (u64)-1, 1, EXTENT_DIRTY, 0); 2020 if (!ret) 2021 return 0; 2022 2023 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start, 2024 &private_failure); 2025 if (ret) 2026 return 0; 2027 2028 failrec = (struct io_failure_record *)(unsigned long) private_failure; 2029 BUG_ON(!failrec->this_mirror); 2030 2031 if (failrec->in_validation) { 2032 /* there was no real error, just free the record */ 2033 pr_debug("clean_io_failure: freeing dummy error at %llu\n", 2034 failrec->start); 2035 did_repair = 1; 2036 goto out; 2037 } 2038 2039 spin_lock(&BTRFS_I(inode)->io_tree.lock); 2040 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, 2041 failrec->start, 2042 EXTENT_LOCKED); 2043 spin_unlock(&BTRFS_I(inode)->io_tree.lock); 2044 2045 if (state && state->start == failrec->start) { 2046 fs_info = BTRFS_I(inode)->root->fs_info; 2047 num_copies = btrfs_num_copies(fs_info, failrec->logical, 2048 failrec->len); 2049 if (num_copies > 1) { 2050 ret = repair_io_failure(fs_info, start, failrec->len, 2051 failrec->logical, page, 2052 failrec->failed_mirror); 2053 did_repair = !ret; 2054 } 2055 } 2056 2057 out: 2058 if (!ret) 2059 ret = free_io_failure(inode, failrec, did_repair); 2060 2061 return ret; 2062 } 2063 2064 /* 2065 * this is a generic handler for readpage errors (default 2066 * readpage_io_failed_hook). if other copies exist, read those and write back 2067 * good data to the failed position. does not investigate in remapping the 2068 * failed extent elsewhere, hoping the device will be smart enough to do this as 2069 * needed 2070 */ 2071 2072 static int bio_readpage_error(struct bio *failed_bio, struct page *page, 2073 u64 start, u64 end, int failed_mirror, 2074 struct extent_state *state) 2075 { 2076 struct io_failure_record *failrec = NULL; 2077 u64 private; 2078 struct extent_map *em; 2079 struct inode *inode = page->mapping->host; 2080 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 2081 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 2082 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 2083 struct bio *bio; 2084 int num_copies; 2085 int ret; 2086 int read_mode; 2087 u64 logical; 2088 2089 BUG_ON(failed_bio->bi_rw & REQ_WRITE); 2090 2091 ret = get_state_private(failure_tree, start, &private); 2092 if (ret) { 2093 failrec = kzalloc(sizeof(*failrec), GFP_NOFS); 2094 if (!failrec) 2095 return -ENOMEM; 2096 failrec->start = start; 2097 failrec->len = end - start + 1; 2098 failrec->this_mirror = 0; 2099 failrec->bio_flags = 0; 2100 failrec->in_validation = 0; 2101 2102 read_lock(&em_tree->lock); 2103 em = lookup_extent_mapping(em_tree, start, failrec->len); 2104 if (!em) { 2105 read_unlock(&em_tree->lock); 2106 kfree(failrec); 2107 return -EIO; 2108 } 2109 2110 if (em->start > start || em->start + em->len < start) { 2111 free_extent_map(em); 2112 em = NULL; 2113 } 2114 read_unlock(&em_tree->lock); 2115 2116 if (!em) { 2117 kfree(failrec); 2118 return -EIO; 2119 } 2120 logical = start - em->start; 2121 logical = em->block_start + logical; 2122 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 2123 logical = em->block_start; 2124 failrec->bio_flags = EXTENT_BIO_COMPRESSED; 2125 extent_set_compress_type(&failrec->bio_flags, 2126 em->compress_type); 2127 } 2128 pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, " 2129 "len=%llu\n", logical, start, failrec->len); 2130 failrec->logical = logical; 2131 free_extent_map(em); 2132 2133 /* set the bits in the private failure tree */ 2134 ret = set_extent_bits(failure_tree, start, end, 2135 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); 2136 if (ret >= 0) 2137 ret = set_state_private(failure_tree, start, 2138 (u64)(unsigned long)failrec); 2139 /* set the bits in the inode's tree */ 2140 if (ret >= 0) 2141 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED, 2142 GFP_NOFS); 2143 if (ret < 0) { 2144 kfree(failrec); 2145 return ret; 2146 } 2147 } else { 2148 failrec = (struct io_failure_record *)(unsigned long)private; 2149 pr_debug("bio_readpage_error: (found) logical=%llu, " 2150 "start=%llu, len=%llu, validation=%d\n", 2151 failrec->logical, failrec->start, failrec->len, 2152 failrec->in_validation); 2153 /* 2154 * when data can be on disk more than twice, add to failrec here 2155 * (e.g. with a list for failed_mirror) to make 2156 * clean_io_failure() clean all those errors at once. 2157 */ 2158 } 2159 num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info, 2160 failrec->logical, failrec->len); 2161 if (num_copies == 1) { 2162 /* 2163 * we only have a single copy of the data, so don't bother with 2164 * all the retry and error correction code that follows. no 2165 * matter what the error is, it is very likely to persist. 2166 */ 2167 pr_debug("bio_readpage_error: cannot repair, num_copies == 1. " 2168 "state=%p, num_copies=%d, next_mirror %d, " 2169 "failed_mirror %d\n", state, num_copies, 2170 failrec->this_mirror, failed_mirror); 2171 free_io_failure(inode, failrec, 0); 2172 return -EIO; 2173 } 2174 2175 if (!state) { 2176 spin_lock(&tree->lock); 2177 state = find_first_extent_bit_state(tree, failrec->start, 2178 EXTENT_LOCKED); 2179 if (state && state->start != failrec->start) 2180 state = NULL; 2181 spin_unlock(&tree->lock); 2182 } 2183 2184 /* 2185 * there are two premises: 2186 * a) deliver good data to the caller 2187 * b) correct the bad sectors on disk 2188 */ 2189 if (failed_bio->bi_vcnt > 1) { 2190 /* 2191 * to fulfill b), we need to know the exact failing sectors, as 2192 * we don't want to rewrite any more than the failed ones. thus, 2193 * we need separate read requests for the failed bio 2194 * 2195 * if the following BUG_ON triggers, our validation request got 2196 * merged. we need separate requests for our algorithm to work. 2197 */ 2198 BUG_ON(failrec->in_validation); 2199 failrec->in_validation = 1; 2200 failrec->this_mirror = failed_mirror; 2201 read_mode = READ_SYNC | REQ_FAILFAST_DEV; 2202 } else { 2203 /* 2204 * we're ready to fulfill a) and b) alongside. get a good copy 2205 * of the failed sector and if we succeed, we have setup 2206 * everything for repair_io_failure to do the rest for us. 2207 */ 2208 if (failrec->in_validation) { 2209 BUG_ON(failrec->this_mirror != failed_mirror); 2210 failrec->in_validation = 0; 2211 failrec->this_mirror = 0; 2212 } 2213 failrec->failed_mirror = failed_mirror; 2214 failrec->this_mirror++; 2215 if (failrec->this_mirror == failed_mirror) 2216 failrec->this_mirror++; 2217 read_mode = READ_SYNC; 2218 } 2219 2220 if (!state || failrec->this_mirror > num_copies) { 2221 pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, " 2222 "next_mirror %d, failed_mirror %d\n", state, 2223 num_copies, failrec->this_mirror, failed_mirror); 2224 free_io_failure(inode, failrec, 0); 2225 return -EIO; 2226 } 2227 2228 bio = bio_alloc(GFP_NOFS, 1); 2229 if (!bio) { 2230 free_io_failure(inode, failrec, 0); 2231 return -EIO; 2232 } 2233 bio->bi_private = state; 2234 bio->bi_end_io = failed_bio->bi_end_io; 2235 bio->bi_sector = failrec->logical >> 9; 2236 bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; 2237 bio->bi_size = 0; 2238 2239 bio_add_page(bio, page, failrec->len, start - page_offset(page)); 2240 2241 pr_debug("bio_readpage_error: submitting new read[%#x] to " 2242 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode, 2243 failrec->this_mirror, num_copies, failrec->in_validation); 2244 2245 ret = tree->ops->submit_bio_hook(inode, read_mode, bio, 2246 failrec->this_mirror, 2247 failrec->bio_flags, 0); 2248 return ret; 2249 } 2250 2251 /* lots and lots of room for performance fixes in the end_bio funcs */ 2252 2253 int end_extent_writepage(struct page *page, int err, u64 start, u64 end) 2254 { 2255 int uptodate = (err == 0); 2256 struct extent_io_tree *tree; 2257 int ret; 2258 2259 tree = &BTRFS_I(page->mapping->host)->io_tree; 2260 2261 if (tree->ops && tree->ops->writepage_end_io_hook) { 2262 ret = tree->ops->writepage_end_io_hook(page, start, 2263 end, NULL, uptodate); 2264 if (ret) 2265 uptodate = 0; 2266 } 2267 2268 if (!uptodate) { 2269 ClearPageUptodate(page); 2270 SetPageError(page); 2271 } 2272 return 0; 2273 } 2274 2275 /* 2276 * after a writepage IO is done, we need to: 2277 * clear the uptodate bits on error 2278 * clear the writeback bits in the extent tree for this IO 2279 * end_page_writeback if the page has no more pending IO 2280 * 2281 * Scheduling is not allowed, so the extent state tree is expected 2282 * to have one and only one object corresponding to this IO. 2283 */ 2284 static void end_bio_extent_writepage(struct bio *bio, int err) 2285 { 2286 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 2287 struct extent_io_tree *tree; 2288 u64 start; 2289 u64 end; 2290 int whole_page; 2291 2292 do { 2293 struct page *page = bvec->bv_page; 2294 tree = &BTRFS_I(page->mapping->host)->io_tree; 2295 2296 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 2297 bvec->bv_offset; 2298 end = start + bvec->bv_len - 1; 2299 2300 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) 2301 whole_page = 1; 2302 else 2303 whole_page = 0; 2304 2305 if (--bvec >= bio->bi_io_vec) 2306 prefetchw(&bvec->bv_page->flags); 2307 2308 if (end_extent_writepage(page, err, start, end)) 2309 continue; 2310 2311 if (whole_page) 2312 end_page_writeback(page); 2313 else 2314 check_page_writeback(tree, page); 2315 } while (bvec >= bio->bi_io_vec); 2316 2317 bio_put(bio); 2318 } 2319 2320 /* 2321 * after a readpage IO is done, we need to: 2322 * clear the uptodate bits on error 2323 * set the uptodate bits if things worked 2324 * set the page up to date if all extents in the tree are uptodate 2325 * clear the lock bit in the extent tree 2326 * unlock the page if there are no other extents locked for it 2327 * 2328 * Scheduling is not allowed, so the extent state tree is expected 2329 * to have one and only one object corresponding to this IO. 2330 */ 2331 static void end_bio_extent_readpage(struct bio *bio, int err) 2332 { 2333 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 2334 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; 2335 struct bio_vec *bvec = bio->bi_io_vec; 2336 struct extent_io_tree *tree; 2337 u64 start; 2338 u64 end; 2339 int whole_page; 2340 int mirror; 2341 int ret; 2342 2343 if (err) 2344 uptodate = 0; 2345 2346 do { 2347 struct page *page = bvec->bv_page; 2348 struct extent_state *cached = NULL; 2349 struct extent_state *state; 2350 2351 pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " 2352 "mirror=%ld\n", (u64)bio->bi_sector, err, 2353 (long int)bio->bi_bdev); 2354 tree = &BTRFS_I(page->mapping->host)->io_tree; 2355 2356 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 2357 bvec->bv_offset; 2358 end = start + bvec->bv_len - 1; 2359 2360 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) 2361 whole_page = 1; 2362 else 2363 whole_page = 0; 2364 2365 if (++bvec <= bvec_end) 2366 prefetchw(&bvec->bv_page->flags); 2367 2368 spin_lock(&tree->lock); 2369 state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED); 2370 if (state && state->start == start) { 2371 /* 2372 * take a reference on the state, unlock will drop 2373 * the ref 2374 */ 2375 cache_state(state, &cached); 2376 } 2377 spin_unlock(&tree->lock); 2378 2379 mirror = (int)(unsigned long)bio->bi_bdev; 2380 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { 2381 ret = tree->ops->readpage_end_io_hook(page, start, end, 2382 state, mirror); 2383 if (ret) 2384 uptodate = 0; 2385 else 2386 clean_io_failure(start, page); 2387 } 2388 2389 if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) { 2390 ret = tree->ops->readpage_io_failed_hook(page, mirror); 2391 if (!ret && !err && 2392 test_bit(BIO_UPTODATE, &bio->bi_flags)) 2393 uptodate = 1; 2394 } else if (!uptodate) { 2395 /* 2396 * The generic bio_readpage_error handles errors the 2397 * following way: If possible, new read requests are 2398 * created and submitted and will end up in 2399 * end_bio_extent_readpage as well (if we're lucky, not 2400 * in the !uptodate case). In that case it returns 0 and 2401 * we just go on with the next page in our bio. If it 2402 * can't handle the error it will return -EIO and we 2403 * remain responsible for that page. 2404 */ 2405 ret = bio_readpage_error(bio, page, start, end, mirror, NULL); 2406 if (ret == 0) { 2407 uptodate = 2408 test_bit(BIO_UPTODATE, &bio->bi_flags); 2409 if (err) 2410 uptodate = 0; 2411 uncache_state(&cached); 2412 continue; 2413 } 2414 } 2415 2416 if (uptodate && tree->track_uptodate) { 2417 set_extent_uptodate(tree, start, end, &cached, 2418 GFP_ATOMIC); 2419 } 2420 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); 2421 2422 if (whole_page) { 2423 if (uptodate) { 2424 SetPageUptodate(page); 2425 } else { 2426 ClearPageUptodate(page); 2427 SetPageError(page); 2428 } 2429 unlock_page(page); 2430 } else { 2431 if (uptodate) { 2432 check_page_uptodate(tree, page); 2433 } else { 2434 ClearPageUptodate(page); 2435 SetPageError(page); 2436 } 2437 check_page_locked(tree, page); 2438 } 2439 } while (bvec <= bvec_end); 2440 2441 bio_put(bio); 2442 } 2443 2444 struct bio * 2445 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 2446 gfp_t gfp_flags) 2447 { 2448 struct bio *bio; 2449 2450 bio = bio_alloc(gfp_flags, nr_vecs); 2451 2452 if (bio == NULL && (current->flags & PF_MEMALLOC)) { 2453 while (!bio && (nr_vecs /= 2)) 2454 bio = bio_alloc(gfp_flags, nr_vecs); 2455 } 2456 2457 if (bio) { 2458 bio->bi_size = 0; 2459 bio->bi_bdev = bdev; 2460 bio->bi_sector = first_sector; 2461 } 2462 return bio; 2463 } 2464 2465 static int __must_check submit_one_bio(int rw, struct bio *bio, 2466 int mirror_num, unsigned long bio_flags) 2467 { 2468 int ret = 0; 2469 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 2470 struct page *page = bvec->bv_page; 2471 struct extent_io_tree *tree = bio->bi_private; 2472 u64 start; 2473 2474 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; 2475 2476 bio->bi_private = NULL; 2477 2478 bio_get(bio); 2479 2480 if (tree->ops && tree->ops->submit_bio_hook) 2481 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 2482 mirror_num, bio_flags, start); 2483 else 2484 btrfsic_submit_bio(rw, bio); 2485 2486 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 2487 ret = -EOPNOTSUPP; 2488 bio_put(bio); 2489 return ret; 2490 } 2491 2492 static int merge_bio(struct extent_io_tree *tree, struct page *page, 2493 unsigned long offset, size_t size, struct bio *bio, 2494 unsigned long bio_flags) 2495 { 2496 int ret = 0; 2497 if (tree->ops && tree->ops->merge_bio_hook) 2498 ret = tree->ops->merge_bio_hook(page, offset, size, bio, 2499 bio_flags); 2500 BUG_ON(ret < 0); 2501 return ret; 2502 2503 } 2504 2505 static int submit_extent_page(int rw, struct extent_io_tree *tree, 2506 struct page *page, sector_t sector, 2507 size_t size, unsigned long offset, 2508 struct block_device *bdev, 2509 struct bio **bio_ret, 2510 unsigned long max_pages, 2511 bio_end_io_t end_io_func, 2512 int mirror_num, 2513 unsigned long prev_bio_flags, 2514 unsigned long bio_flags) 2515 { 2516 int ret = 0; 2517 struct bio *bio; 2518 int nr; 2519 int contig = 0; 2520 int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED; 2521 int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; 2522 size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE); 2523 2524 if (bio_ret && *bio_ret) { 2525 bio = *bio_ret; 2526 if (old_compressed) 2527 contig = bio->bi_sector == sector; 2528 else 2529 contig = bio->bi_sector + (bio->bi_size >> 9) == 2530 sector; 2531 2532 if (prev_bio_flags != bio_flags || !contig || 2533 merge_bio(tree, page, offset, page_size, bio, bio_flags) || 2534 bio_add_page(bio, page, page_size, offset) < page_size) { 2535 ret = submit_one_bio(rw, bio, mirror_num, 2536 prev_bio_flags); 2537 if (ret < 0) 2538 return ret; 2539 bio = NULL; 2540 } else { 2541 return 0; 2542 } 2543 } 2544 if (this_compressed) 2545 nr = BIO_MAX_PAGES; 2546 else 2547 nr = bio_get_nr_vecs(bdev); 2548 2549 bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); 2550 if (!bio) 2551 return -ENOMEM; 2552 2553 bio_add_page(bio, page, page_size, offset); 2554 bio->bi_end_io = end_io_func; 2555 bio->bi_private = tree; 2556 2557 if (bio_ret) 2558 *bio_ret = bio; 2559 else 2560 ret = submit_one_bio(rw, bio, mirror_num, bio_flags); 2561 2562 return ret; 2563 } 2564 2565 void attach_extent_buffer_page(struct extent_buffer *eb, struct page *page) 2566 { 2567 if (!PagePrivate(page)) { 2568 SetPagePrivate(page); 2569 page_cache_get(page); 2570 set_page_private(page, (unsigned long)eb); 2571 } else { 2572 WARN_ON(page->private != (unsigned long)eb); 2573 } 2574 } 2575 2576 void set_page_extent_mapped(struct page *page) 2577 { 2578 if (!PagePrivate(page)) { 2579 SetPagePrivate(page); 2580 page_cache_get(page); 2581 set_page_private(page, EXTENT_PAGE_PRIVATE); 2582 } 2583 } 2584 2585 /* 2586 * basic readpage implementation. Locked extent state structs are inserted 2587 * into the tree that are removed when the IO is done (by the end_io 2588 * handlers) 2589 * XXX JDM: This needs looking at to ensure proper page locking 2590 */ 2591 static int __extent_read_full_page(struct extent_io_tree *tree, 2592 struct page *page, 2593 get_extent_t *get_extent, 2594 struct bio **bio, int mirror_num, 2595 unsigned long *bio_flags) 2596 { 2597 struct inode *inode = page->mapping->host; 2598 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 2599 u64 page_end = start + PAGE_CACHE_SIZE - 1; 2600 u64 end; 2601 u64 cur = start; 2602 u64 extent_offset; 2603 u64 last_byte = i_size_read(inode); 2604 u64 block_start; 2605 u64 cur_end; 2606 sector_t sector; 2607 struct extent_map *em; 2608 struct block_device *bdev; 2609 struct btrfs_ordered_extent *ordered; 2610 int ret; 2611 int nr = 0; 2612 size_t pg_offset = 0; 2613 size_t iosize; 2614 size_t disk_io_size; 2615 size_t blocksize = inode->i_sb->s_blocksize; 2616 unsigned long this_bio_flag = 0; 2617 2618 set_page_extent_mapped(page); 2619 2620 if (!PageUptodate(page)) { 2621 if (cleancache_get_page(page) == 0) { 2622 BUG_ON(blocksize != PAGE_SIZE); 2623 goto out; 2624 } 2625 } 2626 2627 end = page_end; 2628 while (1) { 2629 lock_extent(tree, start, end); 2630 ordered = btrfs_lookup_ordered_extent(inode, start); 2631 if (!ordered) 2632 break; 2633 unlock_extent(tree, start, end); 2634 btrfs_start_ordered_extent(inode, ordered, 1); 2635 btrfs_put_ordered_extent(ordered); 2636 } 2637 2638 if (page->index == last_byte >> PAGE_CACHE_SHIFT) { 2639 char *userpage; 2640 size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1); 2641 2642 if (zero_offset) { 2643 iosize = PAGE_CACHE_SIZE - zero_offset; 2644 userpage = kmap_atomic(page); 2645 memset(userpage + zero_offset, 0, iosize); 2646 flush_dcache_page(page); 2647 kunmap_atomic(userpage); 2648 } 2649 } 2650 while (cur <= end) { 2651 if (cur >= last_byte) { 2652 char *userpage; 2653 struct extent_state *cached = NULL; 2654 2655 iosize = PAGE_CACHE_SIZE - pg_offset; 2656 userpage = kmap_atomic(page); 2657 memset(userpage + pg_offset, 0, iosize); 2658 flush_dcache_page(page); 2659 kunmap_atomic(userpage); 2660 set_extent_uptodate(tree, cur, cur + iosize - 1, 2661 &cached, GFP_NOFS); 2662 unlock_extent_cached(tree, cur, cur + iosize - 1, 2663 &cached, GFP_NOFS); 2664 break; 2665 } 2666 em = get_extent(inode, page, pg_offset, cur, 2667 end - cur + 1, 0); 2668 if (IS_ERR_OR_NULL(em)) { 2669 SetPageError(page); 2670 unlock_extent(tree, cur, end); 2671 break; 2672 } 2673 extent_offset = cur - em->start; 2674 BUG_ON(extent_map_end(em) <= cur); 2675 BUG_ON(end < cur); 2676 2677 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 2678 this_bio_flag = EXTENT_BIO_COMPRESSED; 2679 extent_set_compress_type(&this_bio_flag, 2680 em->compress_type); 2681 } 2682 2683 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2684 cur_end = min(extent_map_end(em) - 1, end); 2685 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); 2686 if (this_bio_flag & EXTENT_BIO_COMPRESSED) { 2687 disk_io_size = em->block_len; 2688 sector = em->block_start >> 9; 2689 } else { 2690 sector = (em->block_start + extent_offset) >> 9; 2691 disk_io_size = iosize; 2692 } 2693 bdev = em->bdev; 2694 block_start = em->block_start; 2695 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 2696 block_start = EXTENT_MAP_HOLE; 2697 free_extent_map(em); 2698 em = NULL; 2699 2700 /* we've found a hole, just zero and go on */ 2701 if (block_start == EXTENT_MAP_HOLE) { 2702 char *userpage; 2703 struct extent_state *cached = NULL; 2704 2705 userpage = kmap_atomic(page); 2706 memset(userpage + pg_offset, 0, iosize); 2707 flush_dcache_page(page); 2708 kunmap_atomic(userpage); 2709 2710 set_extent_uptodate(tree, cur, cur + iosize - 1, 2711 &cached, GFP_NOFS); 2712 unlock_extent_cached(tree, cur, cur + iosize - 1, 2713 &cached, GFP_NOFS); 2714 cur = cur + iosize; 2715 pg_offset += iosize; 2716 continue; 2717 } 2718 /* the get_extent function already copied into the page */ 2719 if (test_range_bit(tree, cur, cur_end, 2720 EXTENT_UPTODATE, 1, NULL)) { 2721 check_page_uptodate(tree, page); 2722 unlock_extent(tree, cur, cur + iosize - 1); 2723 cur = cur + iosize; 2724 pg_offset += iosize; 2725 continue; 2726 } 2727 /* we have an inline extent but it didn't get marked up 2728 * to date. Error out 2729 */ 2730 if (block_start == EXTENT_MAP_INLINE) { 2731 SetPageError(page); 2732 unlock_extent(tree, cur, cur + iosize - 1); 2733 cur = cur + iosize; 2734 pg_offset += iosize; 2735 continue; 2736 } 2737 2738 ret = 0; 2739 if (tree->ops && tree->ops->readpage_io_hook) { 2740 ret = tree->ops->readpage_io_hook(page, cur, 2741 cur + iosize - 1); 2742 } 2743 if (!ret) { 2744 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; 2745 pnr -= page->index; 2746 ret = submit_extent_page(READ, tree, page, 2747 sector, disk_io_size, pg_offset, 2748 bdev, bio, pnr, 2749 end_bio_extent_readpage, mirror_num, 2750 *bio_flags, 2751 this_bio_flag); 2752 if (!ret) { 2753 nr++; 2754 *bio_flags = this_bio_flag; 2755 } 2756 } 2757 if (ret) { 2758 SetPageError(page); 2759 unlock_extent(tree, cur, cur + iosize - 1); 2760 } 2761 cur = cur + iosize; 2762 pg_offset += iosize; 2763 } 2764 out: 2765 if (!nr) { 2766 if (!PageError(page)) 2767 SetPageUptodate(page); 2768 unlock_page(page); 2769 } 2770 return 0; 2771 } 2772 2773 int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 2774 get_extent_t *get_extent, int mirror_num) 2775 { 2776 struct bio *bio = NULL; 2777 unsigned long bio_flags = 0; 2778 int ret; 2779 2780 ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, 2781 &bio_flags); 2782 if (bio) 2783 ret = submit_one_bio(READ, bio, mirror_num, bio_flags); 2784 return ret; 2785 } 2786 2787 static noinline void update_nr_written(struct page *page, 2788 struct writeback_control *wbc, 2789 unsigned long nr_written) 2790 { 2791 wbc->nr_to_write -= nr_written; 2792 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && 2793 wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) 2794 page->mapping->writeback_index = page->index + nr_written; 2795 } 2796 2797 /* 2798 * the writepage semantics are similar to regular writepage. extent 2799 * records are inserted to lock ranges in the tree, and as dirty areas 2800 * are found, they are marked writeback. Then the lock bits are removed 2801 * and the end_io handler clears the writeback ranges 2802 */ 2803 static int __extent_writepage(struct page *page, struct writeback_control *wbc, 2804 void *data) 2805 { 2806 struct inode *inode = page->mapping->host; 2807 struct extent_page_data *epd = data; 2808 struct extent_io_tree *tree = epd->tree; 2809 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 2810 u64 delalloc_start; 2811 u64 page_end = start + PAGE_CACHE_SIZE - 1; 2812 u64 end; 2813 u64 cur = start; 2814 u64 extent_offset; 2815 u64 last_byte = i_size_read(inode); 2816 u64 block_start; 2817 u64 iosize; 2818 sector_t sector; 2819 struct extent_state *cached_state = NULL; 2820 struct extent_map *em; 2821 struct block_device *bdev; 2822 int ret; 2823 int nr = 0; 2824 size_t pg_offset = 0; 2825 size_t blocksize; 2826 loff_t i_size = i_size_read(inode); 2827 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; 2828 u64 nr_delalloc; 2829 u64 delalloc_end; 2830 int page_started; 2831 int compressed; 2832 int write_flags; 2833 unsigned long nr_written = 0; 2834 bool fill_delalloc = true; 2835 2836 if (wbc->sync_mode == WB_SYNC_ALL) 2837 write_flags = WRITE_SYNC; 2838 else 2839 write_flags = WRITE; 2840 2841 trace___extent_writepage(page, inode, wbc); 2842 2843 WARN_ON(!PageLocked(page)); 2844 2845 ClearPageError(page); 2846 2847 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2848 if (page->index > end_index || 2849 (page->index == end_index && !pg_offset)) { 2850 page->mapping->a_ops->invalidatepage(page, 0); 2851 unlock_page(page); 2852 return 0; 2853 } 2854 2855 if (page->index == end_index) { 2856 char *userpage; 2857 2858 userpage = kmap_atomic(page); 2859 memset(userpage + pg_offset, 0, 2860 PAGE_CACHE_SIZE - pg_offset); 2861 kunmap_atomic(userpage); 2862 flush_dcache_page(page); 2863 } 2864 pg_offset = 0; 2865 2866 set_page_extent_mapped(page); 2867 2868 if (!tree->ops || !tree->ops->fill_delalloc) 2869 fill_delalloc = false; 2870 2871 delalloc_start = start; 2872 delalloc_end = 0; 2873 page_started = 0; 2874 if (!epd->extent_locked && fill_delalloc) { 2875 u64 delalloc_to_write = 0; 2876 /* 2877 * make sure the wbc mapping index is at least updated 2878 * to this page. 2879 */ 2880 update_nr_written(page, wbc, 0); 2881 2882 while (delalloc_end < page_end) { 2883 nr_delalloc = find_lock_delalloc_range(inode, tree, 2884 page, 2885 &delalloc_start, 2886 &delalloc_end, 2887 128 * 1024 * 1024); 2888 if (nr_delalloc == 0) { 2889 delalloc_start = delalloc_end + 1; 2890 continue; 2891 } 2892 ret = tree->ops->fill_delalloc(inode, page, 2893 delalloc_start, 2894 delalloc_end, 2895 &page_started, 2896 &nr_written); 2897 /* File system has been set read-only */ 2898 if (ret) { 2899 SetPageError(page); 2900 goto done; 2901 } 2902 /* 2903 * delalloc_end is already one less than the total 2904 * length, so we don't subtract one from 2905 * PAGE_CACHE_SIZE 2906 */ 2907 delalloc_to_write += (delalloc_end - delalloc_start + 2908 PAGE_CACHE_SIZE) >> 2909 PAGE_CACHE_SHIFT; 2910 delalloc_start = delalloc_end + 1; 2911 } 2912 if (wbc->nr_to_write < delalloc_to_write) { 2913 int thresh = 8192; 2914 2915 if (delalloc_to_write < thresh * 2) 2916 thresh = delalloc_to_write; 2917 wbc->nr_to_write = min_t(u64, delalloc_to_write, 2918 thresh); 2919 } 2920 2921 /* did the fill delalloc function already unlock and start 2922 * the IO? 2923 */ 2924 if (page_started) { 2925 ret = 0; 2926 /* 2927 * we've unlocked the page, so we can't update 2928 * the mapping's writeback index, just update 2929 * nr_to_write. 2930 */ 2931 wbc->nr_to_write -= nr_written; 2932 goto done_unlocked; 2933 } 2934 } 2935 if (tree->ops && tree->ops->writepage_start_hook) { 2936 ret = tree->ops->writepage_start_hook(page, start, 2937 page_end); 2938 if (ret) { 2939 /* Fixup worker will requeue */ 2940 if (ret == -EBUSY) 2941 wbc->pages_skipped++; 2942 else 2943 redirty_page_for_writepage(wbc, page); 2944 update_nr_written(page, wbc, nr_written); 2945 unlock_page(page); 2946 ret = 0; 2947 goto done_unlocked; 2948 } 2949 } 2950 2951 /* 2952 * we don't want to touch the inode after unlocking the page, 2953 * so we update the mapping writeback index now 2954 */ 2955 update_nr_written(page, wbc, nr_written + 1); 2956 2957 end = page_end; 2958 if (last_byte <= start) { 2959 if (tree->ops && tree->ops->writepage_end_io_hook) 2960 tree->ops->writepage_end_io_hook(page, start, 2961 page_end, NULL, 1); 2962 goto done; 2963 } 2964 2965 blocksize = inode->i_sb->s_blocksize; 2966 2967 while (cur <= end) { 2968 if (cur >= last_byte) { 2969 if (tree->ops && tree->ops->writepage_end_io_hook) 2970 tree->ops->writepage_end_io_hook(page, cur, 2971 page_end, NULL, 1); 2972 break; 2973 } 2974 em = epd->get_extent(inode, page, pg_offset, cur, 2975 end - cur + 1, 1); 2976 if (IS_ERR_OR_NULL(em)) { 2977 SetPageError(page); 2978 break; 2979 } 2980 2981 extent_offset = cur - em->start; 2982 BUG_ON(extent_map_end(em) <= cur); 2983 BUG_ON(end < cur); 2984 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2985 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); 2986 sector = (em->block_start + extent_offset) >> 9; 2987 bdev = em->bdev; 2988 block_start = em->block_start; 2989 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 2990 free_extent_map(em); 2991 em = NULL; 2992 2993 /* 2994 * compressed and inline extents are written through other 2995 * paths in the FS 2996 */ 2997 if (compressed || block_start == EXTENT_MAP_HOLE || 2998 block_start == EXTENT_MAP_INLINE) { 2999 /* 3000 * end_io notification does not happen here for 3001 * compressed extents 3002 */ 3003 if (!compressed && tree->ops && 3004 tree->ops->writepage_end_io_hook) 3005 tree->ops->writepage_end_io_hook(page, cur, 3006 cur + iosize - 1, 3007 NULL, 1); 3008 else if (compressed) { 3009 /* we don't want to end_page_writeback on 3010 * a compressed extent. this happens 3011 * elsewhere 3012 */ 3013 nr++; 3014 } 3015 3016 cur += iosize; 3017 pg_offset += iosize; 3018 continue; 3019 } 3020 /* leave this out until we have a page_mkwrite call */ 3021 if (0 && !test_range_bit(tree, cur, cur + iosize - 1, 3022 EXTENT_DIRTY, 0, NULL)) { 3023 cur = cur + iosize; 3024 pg_offset += iosize; 3025 continue; 3026 } 3027 3028 if (tree->ops && tree->ops->writepage_io_hook) { 3029 ret = tree->ops->writepage_io_hook(page, cur, 3030 cur + iosize - 1); 3031 } else { 3032 ret = 0; 3033 } 3034 if (ret) { 3035 SetPageError(page); 3036 } else { 3037 unsigned long max_nr = end_index + 1; 3038 3039 set_range_writeback(tree, cur, cur + iosize - 1); 3040 if (!PageWriteback(page)) { 3041 printk(KERN_ERR "btrfs warning page %lu not " 3042 "writeback, cur %llu end %llu\n", 3043 page->index, (unsigned long long)cur, 3044 (unsigned long long)end); 3045 } 3046 3047 ret = submit_extent_page(write_flags, tree, page, 3048 sector, iosize, pg_offset, 3049 bdev, &epd->bio, max_nr, 3050 end_bio_extent_writepage, 3051 0, 0, 0); 3052 if (ret) 3053 SetPageError(page); 3054 } 3055 cur = cur + iosize; 3056 pg_offset += iosize; 3057 nr++; 3058 } 3059 done: 3060 if (nr == 0) { 3061 /* make sure the mapping tag for page dirty gets cleared */ 3062 set_page_writeback(page); 3063 end_page_writeback(page); 3064 } 3065 unlock_page(page); 3066 3067 done_unlocked: 3068 3069 /* drop our reference on any cached states */ 3070 free_extent_state(cached_state); 3071 return 0; 3072 } 3073 3074 static int eb_wait(void *word) 3075 { 3076 io_schedule(); 3077 return 0; 3078 } 3079 3080 static void wait_on_extent_buffer_writeback(struct extent_buffer *eb) 3081 { 3082 wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait, 3083 TASK_UNINTERRUPTIBLE); 3084 } 3085 3086 static int lock_extent_buffer_for_io(struct extent_buffer *eb, 3087 struct btrfs_fs_info *fs_info, 3088 struct extent_page_data *epd) 3089 { 3090 unsigned long i, num_pages; 3091 int flush = 0; 3092 int ret = 0; 3093 3094 if (!btrfs_try_tree_write_lock(eb)) { 3095 flush = 1; 3096 flush_write_bio(epd); 3097 btrfs_tree_lock(eb); 3098 } 3099 3100 if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { 3101 btrfs_tree_unlock(eb); 3102 if (!epd->sync_io) 3103 return 0; 3104 if (!flush) { 3105 flush_write_bio(epd); 3106 flush = 1; 3107 } 3108 while (1) { 3109 wait_on_extent_buffer_writeback(eb); 3110 btrfs_tree_lock(eb); 3111 if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) 3112 break; 3113 btrfs_tree_unlock(eb); 3114 } 3115 } 3116 3117 /* 3118 * We need to do this to prevent races in people who check if the eb is 3119 * under IO since we can end up having no IO bits set for a short period 3120 * of time. 3121 */ 3122 spin_lock(&eb->refs_lock); 3123 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3124 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3125 spin_unlock(&eb->refs_lock); 3126 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 3127 spin_lock(&fs_info->delalloc_lock); 3128 if (fs_info->dirty_metadata_bytes >= eb->len) 3129 fs_info->dirty_metadata_bytes -= eb->len; 3130 else 3131 WARN_ON(1); 3132 spin_unlock(&fs_info->delalloc_lock); 3133 ret = 1; 3134 } else { 3135 spin_unlock(&eb->refs_lock); 3136 } 3137 3138 btrfs_tree_unlock(eb); 3139 3140 if (!ret) 3141 return ret; 3142 3143 num_pages = num_extent_pages(eb->start, eb->len); 3144 for (i = 0; i < num_pages; i++) { 3145 struct page *p = extent_buffer_page(eb, i); 3146 3147 if (!trylock_page(p)) { 3148 if (!flush) { 3149 flush_write_bio(epd); 3150 flush = 1; 3151 } 3152 lock_page(p); 3153 } 3154 } 3155 3156 return ret; 3157 } 3158 3159 static void end_extent_buffer_writeback(struct extent_buffer *eb) 3160 { 3161 clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3162 smp_mb__after_clear_bit(); 3163 wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); 3164 } 3165 3166 static void end_bio_extent_buffer_writepage(struct bio *bio, int err) 3167 { 3168 int uptodate = err == 0; 3169 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 3170 struct extent_buffer *eb; 3171 int done; 3172 3173 do { 3174 struct page *page = bvec->bv_page; 3175 3176 bvec--; 3177 eb = (struct extent_buffer *)page->private; 3178 BUG_ON(!eb); 3179 done = atomic_dec_and_test(&eb->io_pages); 3180 3181 if (!uptodate || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { 3182 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3183 ClearPageUptodate(page); 3184 SetPageError(page); 3185 } 3186 3187 end_page_writeback(page); 3188 3189 if (!done) 3190 continue; 3191 3192 end_extent_buffer_writeback(eb); 3193 } while (bvec >= bio->bi_io_vec); 3194 3195 bio_put(bio); 3196 3197 } 3198 3199 static int write_one_eb(struct extent_buffer *eb, 3200 struct btrfs_fs_info *fs_info, 3201 struct writeback_control *wbc, 3202 struct extent_page_data *epd) 3203 { 3204 struct block_device *bdev = fs_info->fs_devices->latest_bdev; 3205 u64 offset = eb->start; 3206 unsigned long i, num_pages; 3207 unsigned long bio_flags = 0; 3208 int rw = (epd->sync_io ? WRITE_SYNC : WRITE); 3209 int ret = 0; 3210 3211 clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3212 num_pages = num_extent_pages(eb->start, eb->len); 3213 atomic_set(&eb->io_pages, num_pages); 3214 if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID) 3215 bio_flags = EXTENT_BIO_TREE_LOG; 3216 3217 for (i = 0; i < num_pages; i++) { 3218 struct page *p = extent_buffer_page(eb, i); 3219 3220 clear_page_dirty_for_io(p); 3221 set_page_writeback(p); 3222 ret = submit_extent_page(rw, eb->tree, p, offset >> 9, 3223 PAGE_CACHE_SIZE, 0, bdev, &epd->bio, 3224 -1, end_bio_extent_buffer_writepage, 3225 0, epd->bio_flags, bio_flags); 3226 epd->bio_flags = bio_flags; 3227 if (ret) { 3228 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 3229 SetPageError(p); 3230 if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) 3231 end_extent_buffer_writeback(eb); 3232 ret = -EIO; 3233 break; 3234 } 3235 offset += PAGE_CACHE_SIZE; 3236 update_nr_written(p, wbc, 1); 3237 unlock_page(p); 3238 } 3239 3240 if (unlikely(ret)) { 3241 for (; i < num_pages; i++) { 3242 struct page *p = extent_buffer_page(eb, i); 3243 unlock_page(p); 3244 } 3245 } 3246 3247 return ret; 3248 } 3249 3250 int btree_write_cache_pages(struct address_space *mapping, 3251 struct writeback_control *wbc) 3252 { 3253 struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; 3254 struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; 3255 struct extent_buffer *eb, *prev_eb = NULL; 3256 struct extent_page_data epd = { 3257 .bio = NULL, 3258 .tree = tree, 3259 .extent_locked = 0, 3260 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3261 .bio_flags = 0, 3262 }; 3263 int ret = 0; 3264 int done = 0; 3265 int nr_to_write_done = 0; 3266 struct pagevec pvec; 3267 int nr_pages; 3268 pgoff_t index; 3269 pgoff_t end; /* Inclusive */ 3270 int scanned = 0; 3271 int tag; 3272 3273 pagevec_init(&pvec, 0); 3274 if (wbc->range_cyclic) { 3275 index = mapping->writeback_index; /* Start from prev offset */ 3276 end = -1; 3277 } else { 3278 index = wbc->range_start >> PAGE_CACHE_SHIFT; 3279 end = wbc->range_end >> PAGE_CACHE_SHIFT; 3280 scanned = 1; 3281 } 3282 if (wbc->sync_mode == WB_SYNC_ALL) 3283 tag = PAGECACHE_TAG_TOWRITE; 3284 else 3285 tag = PAGECACHE_TAG_DIRTY; 3286 retry: 3287 if (wbc->sync_mode == WB_SYNC_ALL) 3288 tag_pages_for_writeback(mapping, index, end); 3289 while (!done && !nr_to_write_done && (index <= end) && 3290 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 3291 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 3292 unsigned i; 3293 3294 scanned = 1; 3295 for (i = 0; i < nr_pages; i++) { 3296 struct page *page = pvec.pages[i]; 3297 3298 if (!PagePrivate(page)) 3299 continue; 3300 3301 if (!wbc->range_cyclic && page->index > end) { 3302 done = 1; 3303 break; 3304 } 3305 3306 spin_lock(&mapping->private_lock); 3307 if (!PagePrivate(page)) { 3308 spin_unlock(&mapping->private_lock); 3309 continue; 3310 } 3311 3312 eb = (struct extent_buffer *)page->private; 3313 3314 /* 3315 * Shouldn't happen and normally this would be a BUG_ON 3316 * but no sense in crashing the users box for something 3317 * we can survive anyway. 3318 */ 3319 if (!eb) { 3320 spin_unlock(&mapping->private_lock); 3321 WARN_ON(1); 3322 continue; 3323 } 3324 3325 if (eb == prev_eb) { 3326 spin_unlock(&mapping->private_lock); 3327 continue; 3328 } 3329 3330 ret = atomic_inc_not_zero(&eb->refs); 3331 spin_unlock(&mapping->private_lock); 3332 if (!ret) 3333 continue; 3334 3335 prev_eb = eb; 3336 ret = lock_extent_buffer_for_io(eb, fs_info, &epd); 3337 if (!ret) { 3338 free_extent_buffer(eb); 3339 continue; 3340 } 3341 3342 ret = write_one_eb(eb, fs_info, wbc, &epd); 3343 if (ret) { 3344 done = 1; 3345 free_extent_buffer(eb); 3346 break; 3347 } 3348 free_extent_buffer(eb); 3349 3350 /* 3351 * the filesystem may choose to bump up nr_to_write. 3352 * We have to make sure to honor the new nr_to_write 3353 * at any time 3354 */ 3355 nr_to_write_done = wbc->nr_to_write <= 0; 3356 } 3357 pagevec_release(&pvec); 3358 cond_resched(); 3359 } 3360 if (!scanned && !done) { 3361 /* 3362 * We hit the last page and there is more work to be done: wrap 3363 * back to the start of the file 3364 */ 3365 scanned = 1; 3366 index = 0; 3367 goto retry; 3368 } 3369 flush_write_bio(&epd); 3370 return ret; 3371 } 3372 3373 /** 3374 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. 3375 * @mapping: address space structure to write 3376 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 3377 * @writepage: function called for each page 3378 * @data: data passed to writepage function 3379 * 3380 * If a page is already under I/O, write_cache_pages() skips it, even 3381 * if it's dirty. This is desirable behaviour for memory-cleaning writeback, 3382 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() 3383 * and msync() need to guarantee that all the data which was dirty at the time 3384 * the call was made get new I/O started against them. If wbc->sync_mode is 3385 * WB_SYNC_ALL then we were called for data integrity and we must wait for 3386 * existing IO to complete. 3387 */ 3388 static int extent_write_cache_pages(struct extent_io_tree *tree, 3389 struct address_space *mapping, 3390 struct writeback_control *wbc, 3391 writepage_t writepage, void *data, 3392 void (*flush_fn)(void *)) 3393 { 3394 struct inode *inode = mapping->host; 3395 int ret = 0; 3396 int done = 0; 3397 int nr_to_write_done = 0; 3398 struct pagevec pvec; 3399 int nr_pages; 3400 pgoff_t index; 3401 pgoff_t end; /* Inclusive */ 3402 int scanned = 0; 3403 int tag; 3404 3405 /* 3406 * We have to hold onto the inode so that ordered extents can do their 3407 * work when the IO finishes. The alternative to this is failing to add 3408 * an ordered extent if the igrab() fails there and that is a huge pain 3409 * to deal with, so instead just hold onto the inode throughout the 3410 * writepages operation. If it fails here we are freeing up the inode 3411 * anyway and we'd rather not waste our time writing out stuff that is 3412 * going to be truncated anyway. 3413 */ 3414 if (!igrab(inode)) 3415 return 0; 3416 3417 pagevec_init(&pvec, 0); 3418 if (wbc->range_cyclic) { 3419 index = mapping->writeback_index; /* Start from prev offset */ 3420 end = -1; 3421 } else { 3422 index = wbc->range_start >> PAGE_CACHE_SHIFT; 3423 end = wbc->range_end >> PAGE_CACHE_SHIFT; 3424 scanned = 1; 3425 } 3426 if (wbc->sync_mode == WB_SYNC_ALL) 3427 tag = PAGECACHE_TAG_TOWRITE; 3428 else 3429 tag = PAGECACHE_TAG_DIRTY; 3430 retry: 3431 if (wbc->sync_mode == WB_SYNC_ALL) 3432 tag_pages_for_writeback(mapping, index, end); 3433 while (!done && !nr_to_write_done && (index <= end) && 3434 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 3435 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 3436 unsigned i; 3437 3438 scanned = 1; 3439 for (i = 0; i < nr_pages; i++) { 3440 struct page *page = pvec.pages[i]; 3441 3442 /* 3443 * At this point we hold neither mapping->tree_lock nor 3444 * lock on the page itself: the page may be truncated or 3445 * invalidated (changing page->mapping to NULL), or even 3446 * swizzled back from swapper_space to tmpfs file 3447 * mapping 3448 */ 3449 if (tree->ops && 3450 tree->ops->write_cache_pages_lock_hook) { 3451 tree->ops->write_cache_pages_lock_hook(page, 3452 data, flush_fn); 3453 } else { 3454 if (!trylock_page(page)) { 3455 flush_fn(data); 3456 lock_page(page); 3457 } 3458 } 3459 3460 if (unlikely(page->mapping != mapping)) { 3461 unlock_page(page); 3462 continue; 3463 } 3464 3465 if (!wbc->range_cyclic && page->index > end) { 3466 done = 1; 3467 unlock_page(page); 3468 continue; 3469 } 3470 3471 if (wbc->sync_mode != WB_SYNC_NONE) { 3472 if (PageWriteback(page)) 3473 flush_fn(data); 3474 wait_on_page_writeback(page); 3475 } 3476 3477 if (PageWriteback(page) || 3478 !clear_page_dirty_for_io(page)) { 3479 unlock_page(page); 3480 continue; 3481 } 3482 3483 ret = (*writepage)(page, wbc, data); 3484 3485 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { 3486 unlock_page(page); 3487 ret = 0; 3488 } 3489 if (ret) 3490 done = 1; 3491 3492 /* 3493 * the filesystem may choose to bump up nr_to_write. 3494 * We have to make sure to honor the new nr_to_write 3495 * at any time 3496 */ 3497 nr_to_write_done = wbc->nr_to_write <= 0; 3498 } 3499 pagevec_release(&pvec); 3500 cond_resched(); 3501 } 3502 if (!scanned && !done) { 3503 /* 3504 * We hit the last page and there is more work to be done: wrap 3505 * back to the start of the file 3506 */ 3507 scanned = 1; 3508 index = 0; 3509 goto retry; 3510 } 3511 btrfs_add_delayed_iput(inode); 3512 return ret; 3513 } 3514 3515 static void flush_epd_write_bio(struct extent_page_data *epd) 3516 { 3517 if (epd->bio) { 3518 int rw = WRITE; 3519 int ret; 3520 3521 if (epd->sync_io) 3522 rw = WRITE_SYNC; 3523 3524 ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags); 3525 BUG_ON(ret < 0); /* -ENOMEM */ 3526 epd->bio = NULL; 3527 } 3528 } 3529 3530 static noinline void flush_write_bio(void *data) 3531 { 3532 struct extent_page_data *epd = data; 3533 flush_epd_write_bio(epd); 3534 } 3535 3536 int extent_write_full_page(struct extent_io_tree *tree, struct page *page, 3537 get_extent_t *get_extent, 3538 struct writeback_control *wbc) 3539 { 3540 int ret; 3541 struct extent_page_data epd = { 3542 .bio = NULL, 3543 .tree = tree, 3544 .get_extent = get_extent, 3545 .extent_locked = 0, 3546 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3547 .bio_flags = 0, 3548 }; 3549 3550 ret = __extent_writepage(page, wbc, &epd); 3551 3552 flush_epd_write_bio(&epd); 3553 return ret; 3554 } 3555 3556 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, 3557 u64 start, u64 end, get_extent_t *get_extent, 3558 int mode) 3559 { 3560 int ret = 0; 3561 struct address_space *mapping = inode->i_mapping; 3562 struct page *page; 3563 unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >> 3564 PAGE_CACHE_SHIFT; 3565 3566 struct extent_page_data epd = { 3567 .bio = NULL, 3568 .tree = tree, 3569 .get_extent = get_extent, 3570 .extent_locked = 1, 3571 .sync_io = mode == WB_SYNC_ALL, 3572 .bio_flags = 0, 3573 }; 3574 struct writeback_control wbc_writepages = { 3575 .sync_mode = mode, 3576 .nr_to_write = nr_pages * 2, 3577 .range_start = start, 3578 .range_end = end + 1, 3579 }; 3580 3581 while (start <= end) { 3582 page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); 3583 if (clear_page_dirty_for_io(page)) 3584 ret = __extent_writepage(page, &wbc_writepages, &epd); 3585 else { 3586 if (tree->ops && tree->ops->writepage_end_io_hook) 3587 tree->ops->writepage_end_io_hook(page, start, 3588 start + PAGE_CACHE_SIZE - 1, 3589 NULL, 1); 3590 unlock_page(page); 3591 } 3592 page_cache_release(page); 3593 start += PAGE_CACHE_SIZE; 3594 } 3595 3596 flush_epd_write_bio(&epd); 3597 return ret; 3598 } 3599 3600 int extent_writepages(struct extent_io_tree *tree, 3601 struct address_space *mapping, 3602 get_extent_t *get_extent, 3603 struct writeback_control *wbc) 3604 { 3605 int ret = 0; 3606 struct extent_page_data epd = { 3607 .bio = NULL, 3608 .tree = tree, 3609 .get_extent = get_extent, 3610 .extent_locked = 0, 3611 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3612 .bio_flags = 0, 3613 }; 3614 3615 ret = extent_write_cache_pages(tree, mapping, wbc, 3616 __extent_writepage, &epd, 3617 flush_write_bio); 3618 flush_epd_write_bio(&epd); 3619 return ret; 3620 } 3621 3622 int extent_readpages(struct extent_io_tree *tree, 3623 struct address_space *mapping, 3624 struct list_head *pages, unsigned nr_pages, 3625 get_extent_t get_extent) 3626 { 3627 struct bio *bio = NULL; 3628 unsigned page_idx; 3629 unsigned long bio_flags = 0; 3630 struct page *pagepool[16]; 3631 struct page *page; 3632 int i = 0; 3633 int nr = 0; 3634 3635 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 3636 page = list_entry(pages->prev, struct page, lru); 3637 3638 prefetchw(&page->flags); 3639 list_del(&page->lru); 3640 if (add_to_page_cache_lru(page, mapping, 3641 page->index, GFP_NOFS)) { 3642 page_cache_release(page); 3643 continue; 3644 } 3645 3646 pagepool[nr++] = page; 3647 if (nr < ARRAY_SIZE(pagepool)) 3648 continue; 3649 for (i = 0; i < nr; i++) { 3650 __extent_read_full_page(tree, pagepool[i], get_extent, 3651 &bio, 0, &bio_flags); 3652 page_cache_release(pagepool[i]); 3653 } 3654 nr = 0; 3655 } 3656 for (i = 0; i < nr; i++) { 3657 __extent_read_full_page(tree, pagepool[i], get_extent, 3658 &bio, 0, &bio_flags); 3659 page_cache_release(pagepool[i]); 3660 } 3661 3662 BUG_ON(!list_empty(pages)); 3663 if (bio) 3664 return submit_one_bio(READ, bio, 0, bio_flags); 3665 return 0; 3666 } 3667 3668 /* 3669 * basic invalidatepage code, this waits on any locked or writeback 3670 * ranges corresponding to the page, and then deletes any extent state 3671 * records from the tree 3672 */ 3673 int extent_invalidatepage(struct extent_io_tree *tree, 3674 struct page *page, unsigned long offset) 3675 { 3676 struct extent_state *cached_state = NULL; 3677 u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); 3678 u64 end = start + PAGE_CACHE_SIZE - 1; 3679 size_t blocksize = page->mapping->host->i_sb->s_blocksize; 3680 3681 start += (offset + blocksize - 1) & ~(blocksize - 1); 3682 if (start > end) 3683 return 0; 3684 3685 lock_extent_bits(tree, start, end, 0, &cached_state); 3686 wait_on_page_writeback(page); 3687 clear_extent_bit(tree, start, end, 3688 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 3689 EXTENT_DO_ACCOUNTING, 3690 1, 1, &cached_state, GFP_NOFS); 3691 return 0; 3692 } 3693 3694 /* 3695 * a helper for releasepage, this tests for areas of the page that 3696 * are locked or under IO and drops the related state bits if it is safe 3697 * to drop the page. 3698 */ 3699 int try_release_extent_state(struct extent_map_tree *map, 3700 struct extent_io_tree *tree, struct page *page, 3701 gfp_t mask) 3702 { 3703 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 3704 u64 end = start + PAGE_CACHE_SIZE - 1; 3705 int ret = 1; 3706 3707 if (test_range_bit(tree, start, end, 3708 EXTENT_IOBITS, 0, NULL)) 3709 ret = 0; 3710 else { 3711 if ((mask & GFP_NOFS) == GFP_NOFS) 3712 mask = GFP_NOFS; 3713 /* 3714 * at this point we can safely clear everything except the 3715 * locked bit and the nodatasum bit 3716 */ 3717 ret = clear_extent_bit(tree, start, end, 3718 ~(EXTENT_LOCKED | EXTENT_NODATASUM), 3719 0, 0, NULL, mask); 3720 3721 /* if clear_extent_bit failed for enomem reasons, 3722 * we can't allow the release to continue. 3723 */ 3724 if (ret < 0) 3725 ret = 0; 3726 else 3727 ret = 1; 3728 } 3729 return ret; 3730 } 3731 3732 /* 3733 * a helper for releasepage. As long as there are no locked extents 3734 * in the range corresponding to the page, both state records and extent 3735 * map records are removed 3736 */ 3737 int try_release_extent_mapping(struct extent_map_tree *map, 3738 struct extent_io_tree *tree, struct page *page, 3739 gfp_t mask) 3740 { 3741 struct extent_map *em; 3742 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 3743 u64 end = start + PAGE_CACHE_SIZE - 1; 3744 3745 if ((mask & __GFP_WAIT) && 3746 page->mapping->host->i_size > 16 * 1024 * 1024) { 3747 u64 len; 3748 while (start <= end) { 3749 len = end - start + 1; 3750 write_lock(&map->lock); 3751 em = lookup_extent_mapping(map, start, len); 3752 if (!em) { 3753 write_unlock(&map->lock); 3754 break; 3755 } 3756 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || 3757 em->start != start) { 3758 write_unlock(&map->lock); 3759 free_extent_map(em); 3760 break; 3761 } 3762 if (!test_range_bit(tree, em->start, 3763 extent_map_end(em) - 1, 3764 EXTENT_LOCKED | EXTENT_WRITEBACK, 3765 0, NULL)) { 3766 remove_extent_mapping(map, em); 3767 /* once for the rb tree */ 3768 free_extent_map(em); 3769 } 3770 start = extent_map_end(em); 3771 write_unlock(&map->lock); 3772 3773 /* once for us */ 3774 free_extent_map(em); 3775 } 3776 } 3777 return try_release_extent_state(map, tree, page, mask); 3778 } 3779 3780 /* 3781 * helper function for fiemap, which doesn't want to see any holes. 3782 * This maps until we find something past 'last' 3783 */ 3784 static struct extent_map *get_extent_skip_holes(struct inode *inode, 3785 u64 offset, 3786 u64 last, 3787 get_extent_t *get_extent) 3788 { 3789 u64 sectorsize = BTRFS_I(inode)->root->sectorsize; 3790 struct extent_map *em; 3791 u64 len; 3792 3793 if (offset >= last) 3794 return NULL; 3795 3796 while(1) { 3797 len = last - offset; 3798 if (len == 0) 3799 break; 3800 len = (len + sectorsize - 1) & ~(sectorsize - 1); 3801 em = get_extent(inode, NULL, 0, offset, len, 0); 3802 if (IS_ERR_OR_NULL(em)) 3803 return em; 3804 3805 /* if this isn't a hole return it */ 3806 if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) && 3807 em->block_start != EXTENT_MAP_HOLE) { 3808 return em; 3809 } 3810 3811 /* this is a hole, advance to the next extent */ 3812 offset = extent_map_end(em); 3813 free_extent_map(em); 3814 if (offset >= last) 3815 break; 3816 } 3817 return NULL; 3818 } 3819 3820 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 3821 __u64 start, __u64 len, get_extent_t *get_extent) 3822 { 3823 int ret = 0; 3824 u64 off = start; 3825 u64 max = start + len; 3826 u32 flags = 0; 3827 u32 found_type; 3828 u64 last; 3829 u64 last_for_get_extent = 0; 3830 u64 disko = 0; 3831 u64 isize = i_size_read(inode); 3832 struct btrfs_key found_key; 3833 struct extent_map *em = NULL; 3834 struct extent_state *cached_state = NULL; 3835 struct btrfs_path *path; 3836 struct btrfs_file_extent_item *item; 3837 int end = 0; 3838 u64 em_start = 0; 3839 u64 em_len = 0; 3840 u64 em_end = 0; 3841 unsigned long emflags; 3842 3843 if (len == 0) 3844 return -EINVAL; 3845 3846 path = btrfs_alloc_path(); 3847 if (!path) 3848 return -ENOMEM; 3849 path->leave_spinning = 1; 3850 3851 start = ALIGN(start, BTRFS_I(inode)->root->sectorsize); 3852 len = ALIGN(len, BTRFS_I(inode)->root->sectorsize); 3853 3854 /* 3855 * lookup the last file extent. We're not using i_size here 3856 * because there might be preallocation past i_size 3857 */ 3858 ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, 3859 path, btrfs_ino(inode), -1, 0); 3860 if (ret < 0) { 3861 btrfs_free_path(path); 3862 return ret; 3863 } 3864 WARN_ON(!ret); 3865 path->slots[0]--; 3866 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3867 struct btrfs_file_extent_item); 3868 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 3869 found_type = btrfs_key_type(&found_key); 3870 3871 /* No extents, but there might be delalloc bits */ 3872 if (found_key.objectid != btrfs_ino(inode) || 3873 found_type != BTRFS_EXTENT_DATA_KEY) { 3874 /* have to trust i_size as the end */ 3875 last = (u64)-1; 3876 last_for_get_extent = isize; 3877 } else { 3878 /* 3879 * remember the start of the last extent. There are a 3880 * bunch of different factors that go into the length of the 3881 * extent, so its much less complex to remember where it started 3882 */ 3883 last = found_key.offset; 3884 last_for_get_extent = last + 1; 3885 } 3886 btrfs_free_path(path); 3887 3888 /* 3889 * we might have some extents allocated but more delalloc past those 3890 * extents. so, we trust isize unless the start of the last extent is 3891 * beyond isize 3892 */ 3893 if (last < isize) { 3894 last = (u64)-1; 3895 last_for_get_extent = isize; 3896 } 3897 3898 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, 3899 &cached_state); 3900 3901 em = get_extent_skip_holes(inode, start, last_for_get_extent, 3902 get_extent); 3903 if (!em) 3904 goto out; 3905 if (IS_ERR(em)) { 3906 ret = PTR_ERR(em); 3907 goto out; 3908 } 3909 3910 while (!end) { 3911 u64 offset_in_extent; 3912 3913 /* break if the extent we found is outside the range */ 3914 if (em->start >= max || extent_map_end(em) < off) 3915 break; 3916 3917 /* 3918 * get_extent may return an extent that starts before our 3919 * requested range. We have to make sure the ranges 3920 * we return to fiemap always move forward and don't 3921 * overlap, so adjust the offsets here 3922 */ 3923 em_start = max(em->start, off); 3924 3925 /* 3926 * record the offset from the start of the extent 3927 * for adjusting the disk offset below 3928 */ 3929 offset_in_extent = em_start - em->start; 3930 em_end = extent_map_end(em); 3931 em_len = em_end - em_start; 3932 emflags = em->flags; 3933 disko = 0; 3934 flags = 0; 3935 3936 /* 3937 * bump off for our next call to get_extent 3938 */ 3939 off = extent_map_end(em); 3940 if (off >= max) 3941 end = 1; 3942 3943 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 3944 end = 1; 3945 flags |= FIEMAP_EXTENT_LAST; 3946 } else if (em->block_start == EXTENT_MAP_INLINE) { 3947 flags |= (FIEMAP_EXTENT_DATA_INLINE | 3948 FIEMAP_EXTENT_NOT_ALIGNED); 3949 } else if (em->block_start == EXTENT_MAP_DELALLOC) { 3950 flags |= (FIEMAP_EXTENT_DELALLOC | 3951 FIEMAP_EXTENT_UNKNOWN); 3952 } else { 3953 disko = em->block_start + offset_in_extent; 3954 } 3955 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 3956 flags |= FIEMAP_EXTENT_ENCODED; 3957 3958 free_extent_map(em); 3959 em = NULL; 3960 if ((em_start >= last) || em_len == (u64)-1 || 3961 (last == (u64)-1 && isize <= em_end)) { 3962 flags |= FIEMAP_EXTENT_LAST; 3963 end = 1; 3964 } 3965 3966 /* now scan forward to see if this is really the last extent. */ 3967 em = get_extent_skip_holes(inode, off, last_for_get_extent, 3968 get_extent); 3969 if (IS_ERR(em)) { 3970 ret = PTR_ERR(em); 3971 goto out; 3972 } 3973 if (!em) { 3974 flags |= FIEMAP_EXTENT_LAST; 3975 end = 1; 3976 } 3977 ret = fiemap_fill_next_extent(fieinfo, em_start, disko, 3978 em_len, flags); 3979 if (ret) 3980 goto out_free; 3981 } 3982 out_free: 3983 free_extent_map(em); 3984 out: 3985 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len, 3986 &cached_state, GFP_NOFS); 3987 return ret; 3988 } 3989 3990 static void __free_extent_buffer(struct extent_buffer *eb) 3991 { 3992 #if LEAK_DEBUG 3993 unsigned long flags; 3994 spin_lock_irqsave(&leak_lock, flags); 3995 list_del(&eb->leak_list); 3996 spin_unlock_irqrestore(&leak_lock, flags); 3997 #endif 3998 if (eb->pages && eb->pages != eb->inline_pages) 3999 kfree(eb->pages); 4000 kmem_cache_free(extent_buffer_cache, eb); 4001 } 4002 4003 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, 4004 u64 start, 4005 unsigned long len, 4006 gfp_t mask) 4007 { 4008 struct extent_buffer *eb = NULL; 4009 #if LEAK_DEBUG 4010 unsigned long flags; 4011 #endif 4012 4013 eb = kmem_cache_zalloc(extent_buffer_cache, mask); 4014 if (eb == NULL) 4015 return NULL; 4016 eb->start = start; 4017 eb->len = len; 4018 eb->tree = tree; 4019 eb->bflags = 0; 4020 rwlock_init(&eb->lock); 4021 atomic_set(&eb->write_locks, 0); 4022 atomic_set(&eb->read_locks, 0); 4023 atomic_set(&eb->blocking_readers, 0); 4024 atomic_set(&eb->blocking_writers, 0); 4025 atomic_set(&eb->spinning_readers, 0); 4026 atomic_set(&eb->spinning_writers, 0); 4027 eb->lock_nested = 0; 4028 init_waitqueue_head(&eb->write_lock_wq); 4029 init_waitqueue_head(&eb->read_lock_wq); 4030 4031 #if LEAK_DEBUG 4032 spin_lock_irqsave(&leak_lock, flags); 4033 list_add(&eb->leak_list, &buffers); 4034 spin_unlock_irqrestore(&leak_lock, flags); 4035 #endif 4036 spin_lock_init(&eb->refs_lock); 4037 atomic_set(&eb->refs, 1); 4038 atomic_set(&eb->io_pages, 0); 4039 4040 if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) { 4041 struct page **pages; 4042 int num_pages = (len + PAGE_CACHE_SIZE - 1) >> 4043 PAGE_CACHE_SHIFT; 4044 pages = kzalloc(num_pages, mask); 4045 if (!pages) { 4046 __free_extent_buffer(eb); 4047 return NULL; 4048 } 4049 eb->pages = pages; 4050 } else { 4051 eb->pages = eb->inline_pages; 4052 } 4053 4054 return eb; 4055 } 4056 4057 struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) 4058 { 4059 unsigned long i; 4060 struct page *p; 4061 struct extent_buffer *new; 4062 unsigned long num_pages = num_extent_pages(src->start, src->len); 4063 4064 new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_ATOMIC); 4065 if (new == NULL) 4066 return NULL; 4067 4068 for (i = 0; i < num_pages; i++) { 4069 p = alloc_page(GFP_ATOMIC); 4070 BUG_ON(!p); 4071 attach_extent_buffer_page(new, p); 4072 WARN_ON(PageDirty(p)); 4073 SetPageUptodate(p); 4074 new->pages[i] = p; 4075 } 4076 4077 copy_extent_buffer(new, src, 0, 0, src->len); 4078 set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags); 4079 set_bit(EXTENT_BUFFER_DUMMY, &new->bflags); 4080 4081 return new; 4082 } 4083 4084 struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len) 4085 { 4086 struct extent_buffer *eb; 4087 unsigned long num_pages = num_extent_pages(0, len); 4088 unsigned long i; 4089 4090 eb = __alloc_extent_buffer(NULL, start, len, GFP_ATOMIC); 4091 if (!eb) 4092 return NULL; 4093 4094 for (i = 0; i < num_pages; i++) { 4095 eb->pages[i] = alloc_page(GFP_ATOMIC); 4096 if (!eb->pages[i]) 4097 goto err; 4098 } 4099 set_extent_buffer_uptodate(eb); 4100 btrfs_set_header_nritems(eb, 0); 4101 set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); 4102 4103 return eb; 4104 err: 4105 for (; i > 0; i--) 4106 __free_page(eb->pages[i - 1]); 4107 __free_extent_buffer(eb); 4108 return NULL; 4109 } 4110 4111 static int extent_buffer_under_io(struct extent_buffer *eb) 4112 { 4113 return (atomic_read(&eb->io_pages) || 4114 test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || 4115 test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 4116 } 4117 4118 /* 4119 * Helper for releasing extent buffer page. 4120 */ 4121 static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, 4122 unsigned long start_idx) 4123 { 4124 unsigned long index; 4125 unsigned long num_pages; 4126 struct page *page; 4127 int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); 4128 4129 BUG_ON(extent_buffer_under_io(eb)); 4130 4131 num_pages = num_extent_pages(eb->start, eb->len); 4132 index = start_idx + num_pages; 4133 if (start_idx >= index) 4134 return; 4135 4136 do { 4137 index--; 4138 page = extent_buffer_page(eb, index); 4139 if (page && mapped) { 4140 spin_lock(&page->mapping->private_lock); 4141 /* 4142 * We do this since we'll remove the pages after we've 4143 * removed the eb from the radix tree, so we could race 4144 * and have this page now attached to the new eb. So 4145 * only clear page_private if it's still connected to 4146 * this eb. 4147 */ 4148 if (PagePrivate(page) && 4149 page->private == (unsigned long)eb) { 4150 BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 4151 BUG_ON(PageDirty(page)); 4152 BUG_ON(PageWriteback(page)); 4153 /* 4154 * We need to make sure we haven't be attached 4155 * to a new eb. 4156 */ 4157 ClearPagePrivate(page); 4158 set_page_private(page, 0); 4159 /* One for the page private */ 4160 page_cache_release(page); 4161 } 4162 spin_unlock(&page->mapping->private_lock); 4163 4164 } 4165 if (page) { 4166 /* One for when we alloced the page */ 4167 page_cache_release(page); 4168 } 4169 } while (index != start_idx); 4170 } 4171 4172 /* 4173 * Helper for releasing the extent buffer. 4174 */ 4175 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) 4176 { 4177 btrfs_release_extent_buffer_page(eb, 0); 4178 __free_extent_buffer(eb); 4179 } 4180 4181 static void check_buffer_tree_ref(struct extent_buffer *eb) 4182 { 4183 /* the ref bit is tricky. We have to make sure it is set 4184 * if we have the buffer dirty. Otherwise the 4185 * code to free a buffer can end up dropping a dirty 4186 * page 4187 * 4188 * Once the ref bit is set, it won't go away while the 4189 * buffer is dirty or in writeback, and it also won't 4190 * go away while we have the reference count on the 4191 * eb bumped. 4192 * 4193 * We can't just set the ref bit without bumping the 4194 * ref on the eb because free_extent_buffer might 4195 * see the ref bit and try to clear it. If this happens 4196 * free_extent_buffer might end up dropping our original 4197 * ref by mistake and freeing the page before we are able 4198 * to add one more ref. 4199 * 4200 * So bump the ref count first, then set the bit. If someone 4201 * beat us to it, drop the ref we added. 4202 */ 4203 spin_lock(&eb->refs_lock); 4204 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 4205 atomic_inc(&eb->refs); 4206 spin_unlock(&eb->refs_lock); 4207 } 4208 4209 static void mark_extent_buffer_accessed(struct extent_buffer *eb) 4210 { 4211 unsigned long num_pages, i; 4212 4213 check_buffer_tree_ref(eb); 4214 4215 num_pages = num_extent_pages(eb->start, eb->len); 4216 for (i = 0; i < num_pages; i++) { 4217 struct page *p = extent_buffer_page(eb, i); 4218 mark_page_accessed(p); 4219 } 4220 } 4221 4222 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, 4223 u64 start, unsigned long len) 4224 { 4225 unsigned long num_pages = num_extent_pages(start, len); 4226 unsigned long i; 4227 unsigned long index = start >> PAGE_CACHE_SHIFT; 4228 struct extent_buffer *eb; 4229 struct extent_buffer *exists = NULL; 4230 struct page *p; 4231 struct address_space *mapping = tree->mapping; 4232 int uptodate = 1; 4233 int ret; 4234 4235 rcu_read_lock(); 4236 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); 4237 if (eb && atomic_inc_not_zero(&eb->refs)) { 4238 rcu_read_unlock(); 4239 mark_extent_buffer_accessed(eb); 4240 return eb; 4241 } 4242 rcu_read_unlock(); 4243 4244 eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS); 4245 if (!eb) 4246 return NULL; 4247 4248 for (i = 0; i < num_pages; i++, index++) { 4249 p = find_or_create_page(mapping, index, GFP_NOFS); 4250 if (!p) 4251 goto free_eb; 4252 4253 spin_lock(&mapping->private_lock); 4254 if (PagePrivate(p)) { 4255 /* 4256 * We could have already allocated an eb for this page 4257 * and attached one so lets see if we can get a ref on 4258 * the existing eb, and if we can we know it's good and 4259 * we can just return that one, else we know we can just 4260 * overwrite page->private. 4261 */ 4262 exists = (struct extent_buffer *)p->private; 4263 if (atomic_inc_not_zero(&exists->refs)) { 4264 spin_unlock(&mapping->private_lock); 4265 unlock_page(p); 4266 page_cache_release(p); 4267 mark_extent_buffer_accessed(exists); 4268 goto free_eb; 4269 } 4270 4271 /* 4272 * Do this so attach doesn't complain and we need to 4273 * drop the ref the old guy had. 4274 */ 4275 ClearPagePrivate(p); 4276 WARN_ON(PageDirty(p)); 4277 page_cache_release(p); 4278 } 4279 attach_extent_buffer_page(eb, p); 4280 spin_unlock(&mapping->private_lock); 4281 WARN_ON(PageDirty(p)); 4282 mark_page_accessed(p); 4283 eb->pages[i] = p; 4284 if (!PageUptodate(p)) 4285 uptodate = 0; 4286 4287 /* 4288 * see below about how we avoid a nasty race with release page 4289 * and why we unlock later 4290 */ 4291 } 4292 if (uptodate) 4293 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 4294 again: 4295 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); 4296 if (ret) 4297 goto free_eb; 4298 4299 spin_lock(&tree->buffer_lock); 4300 ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb); 4301 if (ret == -EEXIST) { 4302 exists = radix_tree_lookup(&tree->buffer, 4303 start >> PAGE_CACHE_SHIFT); 4304 if (!atomic_inc_not_zero(&exists->refs)) { 4305 spin_unlock(&tree->buffer_lock); 4306 radix_tree_preload_end(); 4307 exists = NULL; 4308 goto again; 4309 } 4310 spin_unlock(&tree->buffer_lock); 4311 radix_tree_preload_end(); 4312 mark_extent_buffer_accessed(exists); 4313 goto free_eb; 4314 } 4315 /* add one reference for the tree */ 4316 check_buffer_tree_ref(eb); 4317 spin_unlock(&tree->buffer_lock); 4318 radix_tree_preload_end(); 4319 4320 /* 4321 * there is a race where release page may have 4322 * tried to find this extent buffer in the radix 4323 * but failed. It will tell the VM it is safe to 4324 * reclaim the, and it will clear the page private bit. 4325 * We must make sure to set the page private bit properly 4326 * after the extent buffer is in the radix tree so 4327 * it doesn't get lost 4328 */ 4329 SetPageChecked(eb->pages[0]); 4330 for (i = 1; i < num_pages; i++) { 4331 p = extent_buffer_page(eb, i); 4332 ClearPageChecked(p); 4333 unlock_page(p); 4334 } 4335 unlock_page(eb->pages[0]); 4336 return eb; 4337 4338 free_eb: 4339 for (i = 0; i < num_pages; i++) { 4340 if (eb->pages[i]) 4341 unlock_page(eb->pages[i]); 4342 } 4343 4344 WARN_ON(!atomic_dec_and_test(&eb->refs)); 4345 btrfs_release_extent_buffer(eb); 4346 return exists; 4347 } 4348 4349 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, 4350 u64 start, unsigned long len) 4351 { 4352 struct extent_buffer *eb; 4353 4354 rcu_read_lock(); 4355 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); 4356 if (eb && atomic_inc_not_zero(&eb->refs)) { 4357 rcu_read_unlock(); 4358 mark_extent_buffer_accessed(eb); 4359 return eb; 4360 } 4361 rcu_read_unlock(); 4362 4363 return NULL; 4364 } 4365 4366 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) 4367 { 4368 struct extent_buffer *eb = 4369 container_of(head, struct extent_buffer, rcu_head); 4370 4371 __free_extent_buffer(eb); 4372 } 4373 4374 /* Expects to have eb->eb_lock already held */ 4375 static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask) 4376 { 4377 WARN_ON(atomic_read(&eb->refs) == 0); 4378 if (atomic_dec_and_test(&eb->refs)) { 4379 if (test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) { 4380 spin_unlock(&eb->refs_lock); 4381 } else { 4382 struct extent_io_tree *tree = eb->tree; 4383 4384 spin_unlock(&eb->refs_lock); 4385 4386 spin_lock(&tree->buffer_lock); 4387 radix_tree_delete(&tree->buffer, 4388 eb->start >> PAGE_CACHE_SHIFT); 4389 spin_unlock(&tree->buffer_lock); 4390 } 4391 4392 /* Should be safe to release our pages at this point */ 4393 btrfs_release_extent_buffer_page(eb, 0); 4394 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); 4395 return 1; 4396 } 4397 spin_unlock(&eb->refs_lock); 4398 4399 return 0; 4400 } 4401 4402 void free_extent_buffer(struct extent_buffer *eb) 4403 { 4404 if (!eb) 4405 return; 4406 4407 spin_lock(&eb->refs_lock); 4408 if (atomic_read(&eb->refs) == 2 && 4409 test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) 4410 atomic_dec(&eb->refs); 4411 4412 if (atomic_read(&eb->refs) == 2 && 4413 test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && 4414 !extent_buffer_under_io(eb) && 4415 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 4416 atomic_dec(&eb->refs); 4417 4418 /* 4419 * I know this is terrible, but it's temporary until we stop tracking 4420 * the uptodate bits and such for the extent buffers. 4421 */ 4422 release_extent_buffer(eb, GFP_ATOMIC); 4423 } 4424 4425 void free_extent_buffer_stale(struct extent_buffer *eb) 4426 { 4427 if (!eb) 4428 return; 4429 4430 spin_lock(&eb->refs_lock); 4431 set_bit(EXTENT_BUFFER_STALE, &eb->bflags); 4432 4433 if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) && 4434 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 4435 atomic_dec(&eb->refs); 4436 release_extent_buffer(eb, GFP_NOFS); 4437 } 4438 4439 void clear_extent_buffer_dirty(struct extent_buffer *eb) 4440 { 4441 unsigned long i; 4442 unsigned long num_pages; 4443 struct page *page; 4444 4445 num_pages = num_extent_pages(eb->start, eb->len); 4446 4447 for (i = 0; i < num_pages; i++) { 4448 page = extent_buffer_page(eb, i); 4449 if (!PageDirty(page)) 4450 continue; 4451 4452 lock_page(page); 4453 WARN_ON(!PagePrivate(page)); 4454 4455 clear_page_dirty_for_io(page); 4456 spin_lock_irq(&page->mapping->tree_lock); 4457 if (!PageDirty(page)) { 4458 radix_tree_tag_clear(&page->mapping->page_tree, 4459 page_index(page), 4460 PAGECACHE_TAG_DIRTY); 4461 } 4462 spin_unlock_irq(&page->mapping->tree_lock); 4463 ClearPageError(page); 4464 unlock_page(page); 4465 } 4466 WARN_ON(atomic_read(&eb->refs) == 0); 4467 } 4468 4469 int set_extent_buffer_dirty(struct extent_buffer *eb) 4470 { 4471 unsigned long i; 4472 unsigned long num_pages; 4473 int was_dirty = 0; 4474 4475 check_buffer_tree_ref(eb); 4476 4477 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); 4478 4479 num_pages = num_extent_pages(eb->start, eb->len); 4480 WARN_ON(atomic_read(&eb->refs) == 0); 4481 WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); 4482 4483 for (i = 0; i < num_pages; i++) 4484 set_page_dirty(extent_buffer_page(eb, i)); 4485 return was_dirty; 4486 } 4487 4488 static int range_straddles_pages(u64 start, u64 len) 4489 { 4490 if (len < PAGE_CACHE_SIZE) 4491 return 1; 4492 if (start & (PAGE_CACHE_SIZE - 1)) 4493 return 1; 4494 if ((start + len) & (PAGE_CACHE_SIZE - 1)) 4495 return 1; 4496 return 0; 4497 } 4498 4499 int clear_extent_buffer_uptodate(struct extent_buffer *eb) 4500 { 4501 unsigned long i; 4502 struct page *page; 4503 unsigned long num_pages; 4504 4505 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 4506 num_pages = num_extent_pages(eb->start, eb->len); 4507 for (i = 0; i < num_pages; i++) { 4508 page = extent_buffer_page(eb, i); 4509 if (page) 4510 ClearPageUptodate(page); 4511 } 4512 return 0; 4513 } 4514 4515 int set_extent_buffer_uptodate(struct extent_buffer *eb) 4516 { 4517 unsigned long i; 4518 struct page *page; 4519 unsigned long num_pages; 4520 4521 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 4522 num_pages = num_extent_pages(eb->start, eb->len); 4523 for (i = 0; i < num_pages; i++) { 4524 page = extent_buffer_page(eb, i); 4525 SetPageUptodate(page); 4526 } 4527 return 0; 4528 } 4529 4530 int extent_range_uptodate(struct extent_io_tree *tree, 4531 u64 start, u64 end) 4532 { 4533 struct page *page; 4534 int ret; 4535 int pg_uptodate = 1; 4536 int uptodate; 4537 unsigned long index; 4538 4539 if (range_straddles_pages(start, end - start + 1)) { 4540 ret = test_range_bit(tree, start, end, 4541 EXTENT_UPTODATE, 1, NULL); 4542 if (ret) 4543 return 1; 4544 } 4545 while (start <= end) { 4546 index = start >> PAGE_CACHE_SHIFT; 4547 page = find_get_page(tree->mapping, index); 4548 if (!page) 4549 return 1; 4550 uptodate = PageUptodate(page); 4551 page_cache_release(page); 4552 if (!uptodate) { 4553 pg_uptodate = 0; 4554 break; 4555 } 4556 start += PAGE_CACHE_SIZE; 4557 } 4558 return pg_uptodate; 4559 } 4560 4561 int extent_buffer_uptodate(struct extent_buffer *eb) 4562 { 4563 return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 4564 } 4565 4566 int read_extent_buffer_pages(struct extent_io_tree *tree, 4567 struct extent_buffer *eb, u64 start, int wait, 4568 get_extent_t *get_extent, int mirror_num) 4569 { 4570 unsigned long i; 4571 unsigned long start_i; 4572 struct page *page; 4573 int err; 4574 int ret = 0; 4575 int locked_pages = 0; 4576 int all_uptodate = 1; 4577 unsigned long num_pages; 4578 unsigned long num_reads = 0; 4579 struct bio *bio = NULL; 4580 unsigned long bio_flags = 0; 4581 4582 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 4583 return 0; 4584 4585 if (start) { 4586 WARN_ON(start < eb->start); 4587 start_i = (start >> PAGE_CACHE_SHIFT) - 4588 (eb->start >> PAGE_CACHE_SHIFT); 4589 } else { 4590 start_i = 0; 4591 } 4592 4593 num_pages = num_extent_pages(eb->start, eb->len); 4594 for (i = start_i; i < num_pages; i++) { 4595 page = extent_buffer_page(eb, i); 4596 if (wait == WAIT_NONE) { 4597 if (!trylock_page(page)) 4598 goto unlock_exit; 4599 } else { 4600 lock_page(page); 4601 } 4602 locked_pages++; 4603 if (!PageUptodate(page)) { 4604 num_reads++; 4605 all_uptodate = 0; 4606 } 4607 } 4608 if (all_uptodate) { 4609 if (start_i == 0) 4610 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 4611 goto unlock_exit; 4612 } 4613 4614 clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 4615 eb->read_mirror = 0; 4616 atomic_set(&eb->io_pages, num_reads); 4617 for (i = start_i; i < num_pages; i++) { 4618 page = extent_buffer_page(eb, i); 4619 if (!PageUptodate(page)) { 4620 ClearPageError(page); 4621 err = __extent_read_full_page(tree, page, 4622 get_extent, &bio, 4623 mirror_num, &bio_flags); 4624 if (err) 4625 ret = err; 4626 } else { 4627 unlock_page(page); 4628 } 4629 } 4630 4631 if (bio) { 4632 err = submit_one_bio(READ, bio, mirror_num, bio_flags); 4633 if (err) 4634 return err; 4635 } 4636 4637 if (ret || wait != WAIT_COMPLETE) 4638 return ret; 4639 4640 for (i = start_i; i < num_pages; i++) { 4641 page = extent_buffer_page(eb, i); 4642 wait_on_page_locked(page); 4643 if (!PageUptodate(page)) 4644 ret = -EIO; 4645 } 4646 4647 return ret; 4648 4649 unlock_exit: 4650 i = start_i; 4651 while (locked_pages > 0) { 4652 page = extent_buffer_page(eb, i); 4653 i++; 4654 unlock_page(page); 4655 locked_pages--; 4656 } 4657 return ret; 4658 } 4659 4660 void read_extent_buffer(struct extent_buffer *eb, void *dstv, 4661 unsigned long start, 4662 unsigned long len) 4663 { 4664 size_t cur; 4665 size_t offset; 4666 struct page *page; 4667 char *kaddr; 4668 char *dst = (char *)dstv; 4669 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 4670 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 4671 4672 WARN_ON(start > eb->len); 4673 WARN_ON(start + len > eb->start + eb->len); 4674 4675 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 4676 4677 while (len > 0) { 4678 page = extent_buffer_page(eb, i); 4679 4680 cur = min(len, (PAGE_CACHE_SIZE - offset)); 4681 kaddr = page_address(page); 4682 memcpy(dst, kaddr + offset, cur); 4683 4684 dst += cur; 4685 len -= cur; 4686 offset = 0; 4687 i++; 4688 } 4689 } 4690 4691 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, 4692 unsigned long min_len, char **map, 4693 unsigned long *map_start, 4694 unsigned long *map_len) 4695 { 4696 size_t offset = start & (PAGE_CACHE_SIZE - 1); 4697 char *kaddr; 4698 struct page *p; 4699 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 4700 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 4701 unsigned long end_i = (start_offset + start + min_len - 1) >> 4702 PAGE_CACHE_SHIFT; 4703 4704 if (i != end_i) 4705 return -EINVAL; 4706 4707 if (i == 0) { 4708 offset = start_offset; 4709 *map_start = 0; 4710 } else { 4711 offset = 0; 4712 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset; 4713 } 4714 4715 if (start + min_len > eb->len) { 4716 WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, " 4717 "wanted %lu %lu\n", (unsigned long long)eb->start, 4718 eb->len, start, min_len); 4719 return -EINVAL; 4720 } 4721 4722 p = extent_buffer_page(eb, i); 4723 kaddr = page_address(p); 4724 *map = kaddr + offset; 4725 *map_len = PAGE_CACHE_SIZE - offset; 4726 return 0; 4727 } 4728 4729 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, 4730 unsigned long start, 4731 unsigned long len) 4732 { 4733 size_t cur; 4734 size_t offset; 4735 struct page *page; 4736 char *kaddr; 4737 char *ptr = (char *)ptrv; 4738 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 4739 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 4740 int ret = 0; 4741 4742 WARN_ON(start > eb->len); 4743 WARN_ON(start + len > eb->start + eb->len); 4744 4745 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 4746 4747 while (len > 0) { 4748 page = extent_buffer_page(eb, i); 4749 4750 cur = min(len, (PAGE_CACHE_SIZE - offset)); 4751 4752 kaddr = page_address(page); 4753 ret = memcmp(ptr, kaddr + offset, cur); 4754 if (ret) 4755 break; 4756 4757 ptr += cur; 4758 len -= cur; 4759 offset = 0; 4760 i++; 4761 } 4762 return ret; 4763 } 4764 4765 void write_extent_buffer(struct extent_buffer *eb, const void *srcv, 4766 unsigned long start, unsigned long len) 4767 { 4768 size_t cur; 4769 size_t offset; 4770 struct page *page; 4771 char *kaddr; 4772 char *src = (char *)srcv; 4773 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 4774 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 4775 4776 WARN_ON(start > eb->len); 4777 WARN_ON(start + len > eb->start + eb->len); 4778 4779 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 4780 4781 while (len > 0) { 4782 page = extent_buffer_page(eb, i); 4783 WARN_ON(!PageUptodate(page)); 4784 4785 cur = min(len, PAGE_CACHE_SIZE - offset); 4786 kaddr = page_address(page); 4787 memcpy(kaddr + offset, src, cur); 4788 4789 src += cur; 4790 len -= cur; 4791 offset = 0; 4792 i++; 4793 } 4794 } 4795 4796 void memset_extent_buffer(struct extent_buffer *eb, char c, 4797 unsigned long start, unsigned long len) 4798 { 4799 size_t cur; 4800 size_t offset; 4801 struct page *page; 4802 char *kaddr; 4803 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 4804 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 4805 4806 WARN_ON(start > eb->len); 4807 WARN_ON(start + len > eb->start + eb->len); 4808 4809 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 4810 4811 while (len > 0) { 4812 page = extent_buffer_page(eb, i); 4813 WARN_ON(!PageUptodate(page)); 4814 4815 cur = min(len, PAGE_CACHE_SIZE - offset); 4816 kaddr = page_address(page); 4817 memset(kaddr + offset, c, cur); 4818 4819 len -= cur; 4820 offset = 0; 4821 i++; 4822 } 4823 } 4824 4825 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, 4826 unsigned long dst_offset, unsigned long src_offset, 4827 unsigned long len) 4828 { 4829 u64 dst_len = dst->len; 4830 size_t cur; 4831 size_t offset; 4832 struct page *page; 4833 char *kaddr; 4834 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); 4835 unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; 4836 4837 WARN_ON(src->len != dst_len); 4838 4839 offset = (start_offset + dst_offset) & 4840 ((unsigned long)PAGE_CACHE_SIZE - 1); 4841 4842 while (len > 0) { 4843 page = extent_buffer_page(dst, i); 4844 WARN_ON(!PageUptodate(page)); 4845 4846 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); 4847 4848 kaddr = page_address(page); 4849 read_extent_buffer(src, kaddr + offset, src_offset, cur); 4850 4851 src_offset += cur; 4852 len -= cur; 4853 offset = 0; 4854 i++; 4855 } 4856 } 4857 4858 static void move_pages(struct page *dst_page, struct page *src_page, 4859 unsigned long dst_off, unsigned long src_off, 4860 unsigned long len) 4861 { 4862 char *dst_kaddr = page_address(dst_page); 4863 if (dst_page == src_page) { 4864 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); 4865 } else { 4866 char *src_kaddr = page_address(src_page); 4867 char *p = dst_kaddr + dst_off + len; 4868 char *s = src_kaddr + src_off + len; 4869 4870 while (len--) 4871 *--p = *--s; 4872 } 4873 } 4874 4875 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) 4876 { 4877 unsigned long distance = (src > dst) ? src - dst : dst - src; 4878 return distance < len; 4879 } 4880 4881 static void copy_pages(struct page *dst_page, struct page *src_page, 4882 unsigned long dst_off, unsigned long src_off, 4883 unsigned long len) 4884 { 4885 char *dst_kaddr = page_address(dst_page); 4886 char *src_kaddr; 4887 int must_memmove = 0; 4888 4889 if (dst_page != src_page) { 4890 src_kaddr = page_address(src_page); 4891 } else { 4892 src_kaddr = dst_kaddr; 4893 if (areas_overlap(src_off, dst_off, len)) 4894 must_memmove = 1; 4895 } 4896 4897 if (must_memmove) 4898 memmove(dst_kaddr + dst_off, src_kaddr + src_off, len); 4899 else 4900 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); 4901 } 4902 4903 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 4904 unsigned long src_offset, unsigned long len) 4905 { 4906 size_t cur; 4907 size_t dst_off_in_page; 4908 size_t src_off_in_page; 4909 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); 4910 unsigned long dst_i; 4911 unsigned long src_i; 4912 4913 if (src_offset + len > dst->len) { 4914 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " 4915 "len %lu dst len %lu\n", src_offset, len, dst->len); 4916 BUG_ON(1); 4917 } 4918 if (dst_offset + len > dst->len) { 4919 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " 4920 "len %lu dst len %lu\n", dst_offset, len, dst->len); 4921 BUG_ON(1); 4922 } 4923 4924 while (len > 0) { 4925 dst_off_in_page = (start_offset + dst_offset) & 4926 ((unsigned long)PAGE_CACHE_SIZE - 1); 4927 src_off_in_page = (start_offset + src_offset) & 4928 ((unsigned long)PAGE_CACHE_SIZE - 1); 4929 4930 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; 4931 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT; 4932 4933 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - 4934 src_off_in_page)); 4935 cur = min_t(unsigned long, cur, 4936 (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page)); 4937 4938 copy_pages(extent_buffer_page(dst, dst_i), 4939 extent_buffer_page(dst, src_i), 4940 dst_off_in_page, src_off_in_page, cur); 4941 4942 src_offset += cur; 4943 dst_offset += cur; 4944 len -= cur; 4945 } 4946 } 4947 4948 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 4949 unsigned long src_offset, unsigned long len) 4950 { 4951 size_t cur; 4952 size_t dst_off_in_page; 4953 size_t src_off_in_page; 4954 unsigned long dst_end = dst_offset + len - 1; 4955 unsigned long src_end = src_offset + len - 1; 4956 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); 4957 unsigned long dst_i; 4958 unsigned long src_i; 4959 4960 if (src_offset + len > dst->len) { 4961 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " 4962 "len %lu len %lu\n", src_offset, len, dst->len); 4963 BUG_ON(1); 4964 } 4965 if (dst_offset + len > dst->len) { 4966 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " 4967 "len %lu len %lu\n", dst_offset, len, dst->len); 4968 BUG_ON(1); 4969 } 4970 if (dst_offset < src_offset) { 4971 memcpy_extent_buffer(dst, dst_offset, src_offset, len); 4972 return; 4973 } 4974 while (len > 0) { 4975 dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT; 4976 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT; 4977 4978 dst_off_in_page = (start_offset + dst_end) & 4979 ((unsigned long)PAGE_CACHE_SIZE - 1); 4980 src_off_in_page = (start_offset + src_end) & 4981 ((unsigned long)PAGE_CACHE_SIZE - 1); 4982 4983 cur = min_t(unsigned long, len, src_off_in_page + 1); 4984 cur = min(cur, dst_off_in_page + 1); 4985 move_pages(extent_buffer_page(dst, dst_i), 4986 extent_buffer_page(dst, src_i), 4987 dst_off_in_page - cur + 1, 4988 src_off_in_page - cur + 1, cur); 4989 4990 dst_end -= cur; 4991 src_end -= cur; 4992 len -= cur; 4993 } 4994 } 4995 4996 int try_release_extent_buffer(struct page *page, gfp_t mask) 4997 { 4998 struct extent_buffer *eb; 4999 5000 /* 5001 * We need to make sure noboody is attaching this page to an eb right 5002 * now. 5003 */ 5004 spin_lock(&page->mapping->private_lock); 5005 if (!PagePrivate(page)) { 5006 spin_unlock(&page->mapping->private_lock); 5007 return 1; 5008 } 5009 5010 eb = (struct extent_buffer *)page->private; 5011 BUG_ON(!eb); 5012 5013 /* 5014 * This is a little awful but should be ok, we need to make sure that 5015 * the eb doesn't disappear out from under us while we're looking at 5016 * this page. 5017 */ 5018 spin_lock(&eb->refs_lock); 5019 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { 5020 spin_unlock(&eb->refs_lock); 5021 spin_unlock(&page->mapping->private_lock); 5022 return 0; 5023 } 5024 spin_unlock(&page->mapping->private_lock); 5025 5026 if ((mask & GFP_NOFS) == GFP_NOFS) 5027 mask = GFP_NOFS; 5028 5029 /* 5030 * If tree ref isn't set then we know the ref on this eb is a real ref, 5031 * so just return, this page will likely be freed soon anyway. 5032 */ 5033 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { 5034 spin_unlock(&eb->refs_lock); 5035 return 0; 5036 } 5037 5038 return release_extent_buffer(eb, mask); 5039 } 5040