1 #include <linux/bitops.h> 2 #include <linux/slab.h> 3 #include <linux/bio.h> 4 #include <linux/mm.h> 5 #include <linux/pagemap.h> 6 #include <linux/page-flags.h> 7 #include <linux/module.h> 8 #include <linux/spinlock.h> 9 #include <linux/blkdev.h> 10 #include <linux/swap.h> 11 #include <linux/writeback.h> 12 #include <linux/pagevec.h> 13 #include <linux/prefetch.h> 14 #include <linux/cleancache.h> 15 #include "extent_io.h" 16 #include "extent_map.h" 17 #include "compat.h" 18 #include "ctree.h" 19 #include "btrfs_inode.h" 20 21 static struct kmem_cache *extent_state_cache; 22 static struct kmem_cache *extent_buffer_cache; 23 24 static LIST_HEAD(buffers); 25 static LIST_HEAD(states); 26 27 #define LEAK_DEBUG 0 28 #if LEAK_DEBUG 29 static DEFINE_SPINLOCK(leak_lock); 30 #endif 31 32 #define BUFFER_LRU_MAX 64 33 34 struct tree_entry { 35 u64 start; 36 u64 end; 37 struct rb_node rb_node; 38 }; 39 40 struct extent_page_data { 41 struct bio *bio; 42 struct extent_io_tree *tree; 43 get_extent_t *get_extent; 44 45 /* tells writepage not to lock the state bits for this range 46 * it still does the unlocking 47 */ 48 unsigned int extent_locked:1; 49 50 /* tells the submit_bio code to use a WRITE_SYNC */ 51 unsigned int sync_io:1; 52 }; 53 54 int __init extent_io_init(void) 55 { 56 extent_state_cache = kmem_cache_create("extent_state", 57 sizeof(struct extent_state), 0, 58 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 59 if (!extent_state_cache) 60 return -ENOMEM; 61 62 extent_buffer_cache = kmem_cache_create("extent_buffers", 63 sizeof(struct extent_buffer), 0, 64 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 65 if (!extent_buffer_cache) 66 goto free_state_cache; 67 return 0; 68 69 free_state_cache: 70 kmem_cache_destroy(extent_state_cache); 71 return -ENOMEM; 72 } 73 74 void extent_io_exit(void) 75 { 76 struct extent_state *state; 77 struct extent_buffer *eb; 78 79 while (!list_empty(&states)) { 80 state = list_entry(states.next, struct extent_state, leak_list); 81 printk(KERN_ERR "btrfs state leak: start %llu end %llu " 82 "state %lu in tree %p refs %d\n", 83 (unsigned long long)state->start, 84 (unsigned long long)state->end, 85 state->state, state->tree, atomic_read(&state->refs)); 86 list_del(&state->leak_list); 87 kmem_cache_free(extent_state_cache, state); 88 89 } 90 91 while (!list_empty(&buffers)) { 92 eb = list_entry(buffers.next, struct extent_buffer, leak_list); 93 printk(KERN_ERR "btrfs buffer leak start %llu len %lu " 94 "refs %d\n", (unsigned long long)eb->start, 95 eb->len, atomic_read(&eb->refs)); 96 list_del(&eb->leak_list); 97 kmem_cache_free(extent_buffer_cache, eb); 98 } 99 if (extent_state_cache) 100 kmem_cache_destroy(extent_state_cache); 101 if (extent_buffer_cache) 102 kmem_cache_destroy(extent_buffer_cache); 103 } 104 105 void extent_io_tree_init(struct extent_io_tree *tree, 106 struct address_space *mapping) 107 { 108 tree->state = RB_ROOT; 109 INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC); 110 tree->ops = NULL; 111 tree->dirty_bytes = 0; 112 spin_lock_init(&tree->lock); 113 spin_lock_init(&tree->buffer_lock); 114 tree->mapping = mapping; 115 } 116 117 static struct extent_state *alloc_extent_state(gfp_t mask) 118 { 119 struct extent_state *state; 120 #if LEAK_DEBUG 121 unsigned long flags; 122 #endif 123 124 state = kmem_cache_alloc(extent_state_cache, mask); 125 if (!state) 126 return state; 127 state->state = 0; 128 state->private = 0; 129 state->tree = NULL; 130 #if LEAK_DEBUG 131 spin_lock_irqsave(&leak_lock, flags); 132 list_add(&state->leak_list, &states); 133 
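	/*
	 * Leak tracking only: the new state sits on the global 'states'
	 * list, protected by leak_lock, until free_extent_state() drops
	 * the last reference and removes it again.
	 */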
spin_unlock_irqrestore(&leak_lock, flags); 134 #endif 135 atomic_set(&state->refs, 1); 136 init_waitqueue_head(&state->wq); 137 return state; 138 } 139 140 void free_extent_state(struct extent_state *state) 141 { 142 if (!state) 143 return; 144 if (atomic_dec_and_test(&state->refs)) { 145 #if LEAK_DEBUG 146 unsigned long flags; 147 #endif 148 WARN_ON(state->tree); 149 #if LEAK_DEBUG 150 spin_lock_irqsave(&leak_lock, flags); 151 list_del(&state->leak_list); 152 spin_unlock_irqrestore(&leak_lock, flags); 153 #endif 154 kmem_cache_free(extent_state_cache, state); 155 } 156 } 157 158 static struct rb_node *tree_insert(struct rb_root *root, u64 offset, 159 struct rb_node *node) 160 { 161 struct rb_node **p = &root->rb_node; 162 struct rb_node *parent = NULL; 163 struct tree_entry *entry; 164 165 while (*p) { 166 parent = *p; 167 entry = rb_entry(parent, struct tree_entry, rb_node); 168 169 if (offset < entry->start) 170 p = &(*p)->rb_left; 171 else if (offset > entry->end) 172 p = &(*p)->rb_right; 173 else 174 return parent; 175 } 176 177 entry = rb_entry(node, struct tree_entry, rb_node); 178 rb_link_node(node, parent, p); 179 rb_insert_color(node, root); 180 return NULL; 181 } 182 183 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, 184 struct rb_node **prev_ret, 185 struct rb_node **next_ret) 186 { 187 struct rb_root *root = &tree->state; 188 struct rb_node *n = root->rb_node; 189 struct rb_node *prev = NULL; 190 struct rb_node *orig_prev = NULL; 191 struct tree_entry *entry; 192 struct tree_entry *prev_entry = NULL; 193 194 while (n) { 195 entry = rb_entry(n, struct tree_entry, rb_node); 196 prev = n; 197 prev_entry = entry; 198 199 if (offset < entry->start) 200 n = n->rb_left; 201 else if (offset > entry->end) 202 n = n->rb_right; 203 else 204 return n; 205 } 206 207 if (prev_ret) { 208 orig_prev = prev; 209 while (prev && offset > prev_entry->end) { 210 prev = rb_next(prev); 211 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 212 } 213 *prev_ret = prev; 214 prev = orig_prev; 215 } 216 217 if (next_ret) { 218 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 219 while (prev && offset < prev_entry->start) { 220 prev = rb_prev(prev); 221 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 222 } 223 *next_ret = prev; 224 } 225 return NULL; 226 } 227 228 static inline struct rb_node *tree_search(struct extent_io_tree *tree, 229 u64 offset) 230 { 231 struct rb_node *prev = NULL; 232 struct rb_node *ret; 233 234 ret = __etree_search(tree, offset, &prev, NULL); 235 if (!ret) 236 return prev; 237 return ret; 238 } 239 240 static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, 241 struct extent_state *other) 242 { 243 if (tree->ops && tree->ops->merge_extent_hook) 244 tree->ops->merge_extent_hook(tree->mapping->host, new, 245 other); 246 } 247 248 /* 249 * utility function to look for merge candidates inside a given range. 250 * Any extents with matching state are merged together into a single 251 * extent in the tree. Extents with EXTENT_IO in their state field 252 * are not merged because the end_io handlers need to be able to do 253 * operations on them without sleeping (or doing allocations/splits). 254 * 255 * This should be called with the tree lock held. 
256 */ 257 static void merge_state(struct extent_io_tree *tree, 258 struct extent_state *state) 259 { 260 struct extent_state *other; 261 struct rb_node *other_node; 262 263 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 264 return; 265 266 other_node = rb_prev(&state->rb_node); 267 if (other_node) { 268 other = rb_entry(other_node, struct extent_state, rb_node); 269 if (other->end == state->start - 1 && 270 other->state == state->state) { 271 merge_cb(tree, state, other); 272 state->start = other->start; 273 other->tree = NULL; 274 rb_erase(&other->rb_node, &tree->state); 275 free_extent_state(other); 276 } 277 } 278 other_node = rb_next(&state->rb_node); 279 if (other_node) { 280 other = rb_entry(other_node, struct extent_state, rb_node); 281 if (other->start == state->end + 1 && 282 other->state == state->state) { 283 merge_cb(tree, state, other); 284 state->end = other->end; 285 other->tree = NULL; 286 rb_erase(&other->rb_node, &tree->state); 287 free_extent_state(other); 288 } 289 } 290 } 291 292 static void set_state_cb(struct extent_io_tree *tree, 293 struct extent_state *state, int *bits) 294 { 295 if (tree->ops && tree->ops->set_bit_hook) 296 tree->ops->set_bit_hook(tree->mapping->host, state, bits); 297 } 298 299 static void clear_state_cb(struct extent_io_tree *tree, 300 struct extent_state *state, int *bits) 301 { 302 if (tree->ops && tree->ops->clear_bit_hook) 303 tree->ops->clear_bit_hook(tree->mapping->host, state, bits); 304 } 305 306 static void set_state_bits(struct extent_io_tree *tree, 307 struct extent_state *state, int *bits); 308 309 /* 310 * insert an extent_state struct into the tree. 'bits' are set on the 311 * struct before it is inserted. 312 * 313 * This may return -EEXIST if the extent is already there, in which case the 314 * state struct is freed. 315 * 316 * The tree lock is not taken internally. This is a utility function and 317 * probably isn't what you want to call (see set/clear_extent_bit). 318 */ 319 static int insert_state(struct extent_io_tree *tree, 320 struct extent_state *state, u64 start, u64 end, 321 int *bits) 322 { 323 struct rb_node *node; 324 325 if (end < start) { 326 printk(KERN_ERR "btrfs end < start %llu %llu\n", 327 (unsigned long long)end, 328 (unsigned long long)start); 329 WARN_ON(1); 330 } 331 state->start = start; 332 state->end = end; 333 334 set_state_bits(tree, state, bits); 335 336 node = tree_insert(&tree->state, end, &state->rb_node); 337 if (node) { 338 struct extent_state *found; 339 found = rb_entry(node, struct extent_state, rb_node); 340 printk(KERN_ERR "btrfs found node %llu %llu on insert of " 341 "%llu %llu\n", (unsigned long long)found->start, 342 (unsigned long long)found->end, 343 (unsigned long long)start, (unsigned long long)end); 344 return -EEXIST; 345 } 346 state->tree = tree; 347 merge_state(tree, state); 348 return 0; 349 } 350 351 static void split_cb(struct extent_io_tree *tree, struct extent_state *orig, 352 u64 split) 353 { 354 if (tree->ops && tree->ops->split_extent_hook) 355 tree->ops->split_extent_hook(tree->mapping->host, orig, split); 356 } 357 358 /* 359 * split a given extent state struct in two, inserting the preallocated 360 * struct 'prealloc' as the newly created second half. 'split' indicates an 361 * offset inside 'orig' where it should be split. 362 * 363 * Before calling, 364 * the tree has 'orig' at [orig->start, orig->end]. 
After calling, there 365 * are two extent state structs in the tree: 366 * prealloc: [orig->start, split - 1] 367 * orig: [ split, orig->end ] 368 * 369 * The tree locks are not taken by this function. They need to be held 370 * by the caller. 371 */ 372 static int split_state(struct extent_io_tree *tree, struct extent_state *orig, 373 struct extent_state *prealloc, u64 split) 374 { 375 struct rb_node *node; 376 377 split_cb(tree, orig, split); 378 379 prealloc->start = orig->start; 380 prealloc->end = split - 1; 381 prealloc->state = orig->state; 382 orig->start = split; 383 384 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); 385 if (node) { 386 free_extent_state(prealloc); 387 return -EEXIST; 388 } 389 prealloc->tree = tree; 390 return 0; 391 } 392 393 /* 394 * utility function to clear some bits in an extent state struct. 395 * it will optionally wake up any one waiting on this state (wake == 1), or 396 * forcibly remove the state from the tree (delete == 1). 397 * 398 * If no bits are set on the state struct after clearing things, the 399 * struct is freed and removed from the tree 400 */ 401 static int clear_state_bit(struct extent_io_tree *tree, 402 struct extent_state *state, 403 int *bits, int wake) 404 { 405 int bits_to_clear = *bits & ~EXTENT_CTLBITS; 406 int ret = state->state & bits_to_clear; 407 408 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 409 u64 range = state->end - state->start + 1; 410 WARN_ON(range > tree->dirty_bytes); 411 tree->dirty_bytes -= range; 412 } 413 clear_state_cb(tree, state, bits); 414 state->state &= ~bits_to_clear; 415 if (wake) 416 wake_up(&state->wq); 417 if (state->state == 0) { 418 if (state->tree) { 419 rb_erase(&state->rb_node, &tree->state); 420 state->tree = NULL; 421 free_extent_state(state); 422 } else { 423 WARN_ON(1); 424 } 425 } else { 426 merge_state(tree, state); 427 } 428 return ret; 429 } 430 431 static struct extent_state * 432 alloc_extent_state_atomic(struct extent_state *prealloc) 433 { 434 if (!prealloc) 435 prealloc = alloc_extent_state(GFP_ATOMIC); 436 437 return prealloc; 438 } 439 440 /* 441 * clear some bits on a range in the tree. This may require splitting 442 * or inserting elements in the tree, so the gfp mask is used to 443 * indicate which allocations or sleeping are allowed. 444 * 445 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove 446 * the given range from the tree regardless of state (ie for truncate). 447 * 448 * the range [start, end] is inclusive. 449 * 450 * This takes the tree lock, and returns < 0 on error, > 0 if any of the 451 * bits were already set, or zero if none of the bits were already set. 
452 */ 453 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 454 int bits, int wake, int delete, 455 struct extent_state **cached_state, 456 gfp_t mask) 457 { 458 struct extent_state *state; 459 struct extent_state *cached; 460 struct extent_state *prealloc = NULL; 461 struct rb_node *next_node; 462 struct rb_node *node; 463 u64 last_end; 464 int err; 465 int set = 0; 466 int clear = 0; 467 468 if (delete) 469 bits |= ~EXTENT_CTLBITS; 470 bits |= EXTENT_FIRST_DELALLOC; 471 472 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 473 clear = 1; 474 again: 475 if (!prealloc && (mask & __GFP_WAIT)) { 476 prealloc = alloc_extent_state(mask); 477 if (!prealloc) 478 return -ENOMEM; 479 } 480 481 spin_lock(&tree->lock); 482 if (cached_state) { 483 cached = *cached_state; 484 485 if (clear) { 486 *cached_state = NULL; 487 cached_state = NULL; 488 } 489 490 if (cached && cached->tree && cached->start <= start && 491 cached->end > start) { 492 if (clear) 493 atomic_dec(&cached->refs); 494 state = cached; 495 goto hit_next; 496 } 497 if (clear) 498 free_extent_state(cached); 499 } 500 /* 501 * this search will find the extents that end after 502 * our range starts 503 */ 504 node = tree_search(tree, start); 505 if (!node) 506 goto out; 507 state = rb_entry(node, struct extent_state, rb_node); 508 hit_next: 509 if (state->start > end) 510 goto out; 511 WARN_ON(state->end < start); 512 last_end = state->end; 513 514 /* 515 * | ---- desired range ---- | 516 * | state | or 517 * | ------------- state -------------- | 518 * 519 * We need to split the extent we found, and may flip 520 * bits on second half. 521 * 522 * If the extent we found extends past our range, we 523 * just split and search again. It'll get split again 524 * the next time though. 525 * 526 * If the extent we found is inside our range, we clear 527 * the desired bit on it. 
528 */ 529 530 if (state->start < start) { 531 prealloc = alloc_extent_state_atomic(prealloc); 532 BUG_ON(!prealloc); 533 err = split_state(tree, state, prealloc, start); 534 BUG_ON(err == -EEXIST); 535 prealloc = NULL; 536 if (err) 537 goto out; 538 if (state->end <= end) { 539 set |= clear_state_bit(tree, state, &bits, wake); 540 if (last_end == (u64)-1) 541 goto out; 542 start = last_end + 1; 543 } 544 goto search_again; 545 } 546 /* 547 * | ---- desired range ---- | 548 * | state | 549 * We need to split the extent, and clear the bit 550 * on the first half 551 */ 552 if (state->start <= end && state->end > end) { 553 prealloc = alloc_extent_state_atomic(prealloc); 554 BUG_ON(!prealloc); 555 err = split_state(tree, state, prealloc, end + 1); 556 BUG_ON(err == -EEXIST); 557 if (wake) 558 wake_up(&state->wq); 559 560 set |= clear_state_bit(tree, prealloc, &bits, wake); 561 562 prealloc = NULL; 563 goto out; 564 } 565 566 if (state->end < end && prealloc && !need_resched()) 567 next_node = rb_next(&state->rb_node); 568 else 569 next_node = NULL; 570 571 set |= clear_state_bit(tree, state, &bits, wake); 572 if (last_end == (u64)-1) 573 goto out; 574 start = last_end + 1; 575 if (start <= end && next_node) { 576 state = rb_entry(next_node, struct extent_state, 577 rb_node); 578 if (state->start == start) 579 goto hit_next; 580 } 581 goto search_again; 582 583 out: 584 spin_unlock(&tree->lock); 585 if (prealloc) 586 free_extent_state(prealloc); 587 588 return set; 589 590 search_again: 591 if (start > end) 592 goto out; 593 spin_unlock(&tree->lock); 594 if (mask & __GFP_WAIT) 595 cond_resched(); 596 goto again; 597 } 598 599 static int wait_on_state(struct extent_io_tree *tree, 600 struct extent_state *state) 601 __releases(tree->lock) 602 __acquires(tree->lock) 603 { 604 DEFINE_WAIT(wait); 605 prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); 606 spin_unlock(&tree->lock); 607 schedule(); 608 spin_lock(&tree->lock); 609 finish_wait(&state->wq, &wait); 610 return 0; 611 } 612 613 /* 614 * waits for one or more bits to clear on a range in the state tree. 615 * The range [start, end] is inclusive. 
616 * The tree lock is taken by this function 617 */ 618 int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits) 619 { 620 struct extent_state *state; 621 struct rb_node *node; 622 623 spin_lock(&tree->lock); 624 again: 625 while (1) { 626 /* 627 * this search will find all the extents that end after 628 * our range starts 629 */ 630 node = tree_search(tree, start); 631 if (!node) 632 break; 633 634 state = rb_entry(node, struct extent_state, rb_node); 635 636 if (state->start > end) 637 goto out; 638 639 if (state->state & bits) { 640 start = state->start; 641 atomic_inc(&state->refs); 642 wait_on_state(tree, state); 643 free_extent_state(state); 644 goto again; 645 } 646 start = state->end + 1; 647 648 if (start > end) 649 break; 650 651 cond_resched_lock(&tree->lock); 652 } 653 out: 654 spin_unlock(&tree->lock); 655 return 0; 656 } 657 658 static void set_state_bits(struct extent_io_tree *tree, 659 struct extent_state *state, 660 int *bits) 661 { 662 int bits_to_set = *bits & ~EXTENT_CTLBITS; 663 664 set_state_cb(tree, state, bits); 665 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { 666 u64 range = state->end - state->start + 1; 667 tree->dirty_bytes += range; 668 } 669 state->state |= bits_to_set; 670 } 671 672 static void cache_state(struct extent_state *state, 673 struct extent_state **cached_ptr) 674 { 675 if (cached_ptr && !(*cached_ptr)) { 676 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) { 677 *cached_ptr = state; 678 atomic_inc(&state->refs); 679 } 680 } 681 } 682 683 static void uncache_state(struct extent_state **cached_ptr) 684 { 685 if (cached_ptr && (*cached_ptr)) { 686 struct extent_state *state = *cached_ptr; 687 *cached_ptr = NULL; 688 free_extent_state(state); 689 } 690 } 691 692 /* 693 * set some bits on a range in the tree. This may require allocations or 694 * sleeping, so the gfp mask is used to indicate what is allowed. 695 * 696 * If any of the exclusive bits are set, this will fail with -EEXIST if some 697 * part of the range already has the desired bits set. The start of the 698 * existing range is returned in failed_start in this case. 699 * 700 * [start, end] is inclusive This takes the tree lock. 701 */ 702 703 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 704 int bits, int exclusive_bits, u64 *failed_start, 705 struct extent_state **cached_state, gfp_t mask) 706 { 707 struct extent_state *state; 708 struct extent_state *prealloc = NULL; 709 struct rb_node *node; 710 int err = 0; 711 u64 last_start; 712 u64 last_end; 713 714 bits |= EXTENT_FIRST_DELALLOC; 715 again: 716 if (!prealloc && (mask & __GFP_WAIT)) { 717 prealloc = alloc_extent_state(mask); 718 BUG_ON(!prealloc); 719 } 720 721 spin_lock(&tree->lock); 722 if (cached_state && *cached_state) { 723 state = *cached_state; 724 if (state->start <= start && state->end > start && 725 state->tree) { 726 node = &state->rb_node; 727 goto hit_next; 728 } 729 } 730 /* 731 * this search will find all the extents that end after 732 * our range starts. 
733 */ 734 node = tree_search(tree, start); 735 if (!node) { 736 prealloc = alloc_extent_state_atomic(prealloc); 737 BUG_ON(!prealloc); 738 err = insert_state(tree, prealloc, start, end, &bits); 739 prealloc = NULL; 740 BUG_ON(err == -EEXIST); 741 goto out; 742 } 743 state = rb_entry(node, struct extent_state, rb_node); 744 hit_next: 745 last_start = state->start; 746 last_end = state->end; 747 748 /* 749 * | ---- desired range ---- | 750 * | state | 751 * 752 * Just lock what we found and keep going 753 */ 754 if (state->start == start && state->end <= end) { 755 struct rb_node *next_node; 756 if (state->state & exclusive_bits) { 757 *failed_start = state->start; 758 err = -EEXIST; 759 goto out; 760 } 761 762 set_state_bits(tree, state, &bits); 763 764 cache_state(state, cached_state); 765 merge_state(tree, state); 766 if (last_end == (u64)-1) 767 goto out; 768 769 start = last_end + 1; 770 next_node = rb_next(&state->rb_node); 771 if (next_node && start < end && prealloc && !need_resched()) { 772 state = rb_entry(next_node, struct extent_state, 773 rb_node); 774 if (state->start == start) 775 goto hit_next; 776 } 777 goto search_again; 778 } 779 780 /* 781 * | ---- desired range ---- | 782 * | state | 783 * or 784 * | ------------- state -------------- | 785 * 786 * We need to split the extent we found, and may flip bits on 787 * second half. 788 * 789 * If the extent we found extends past our 790 * range, we just split and search again. It'll get split 791 * again the next time though. 792 * 793 * If the extent we found is inside our range, we set the 794 * desired bit on it. 795 */ 796 if (state->start < start) { 797 if (state->state & exclusive_bits) { 798 *failed_start = start; 799 err = -EEXIST; 800 goto out; 801 } 802 803 prealloc = alloc_extent_state_atomic(prealloc); 804 BUG_ON(!prealloc); 805 err = split_state(tree, state, prealloc, start); 806 BUG_ON(err == -EEXIST); 807 prealloc = NULL; 808 if (err) 809 goto out; 810 if (state->end <= end) { 811 set_state_bits(tree, state, &bits); 812 cache_state(state, cached_state); 813 merge_state(tree, state); 814 if (last_end == (u64)-1) 815 goto out; 816 start = last_end + 1; 817 } 818 goto search_again; 819 } 820 /* 821 * | ---- desired range ---- | 822 * | state | or | state | 823 * 824 * There's a hole, we need to insert something in it and 825 * ignore the extent we found. 826 */ 827 if (state->start > start) { 828 u64 this_end; 829 if (end < last_start) 830 this_end = end; 831 else 832 this_end = last_start - 1; 833 834 prealloc = alloc_extent_state_atomic(prealloc); 835 BUG_ON(!prealloc); 836 837 /* 838 * Avoid to free 'prealloc' if it can be merged with 839 * the later extent. 
		 */
		err = insert_state(tree, prealloc, start, this_end,
				   &bits);
		BUG_ON(err == -EEXIST);
		if (err) {
			free_extent_state(prealloc);
			prealloc = NULL;
			goto out;
		}
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		BUG_ON(err == -EEXIST);

		set_state_bits(tree, prealloc, &bits);
		cache_state(prealloc, cached_state);
		merge_state(tree, prealloc);
		prealloc = NULL;
		goto out;
	}

	goto search_again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (mask & __GFP_WAIT)
		cond_resched();
	goto again;
}

/* wrappers around set/clear extent bit */
int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
		     gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
			      NULL, mask);
}

int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		    int bits, gfp_t mask)
{
	return set_extent_bit(tree, start, end, bits, 0, NULL,
			      NULL, mask);
}

int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		      int bits, gfp_t mask)
{
	return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
}

int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
			struct extent_state **cached_state, gfp_t mask)
{
	return set_extent_bit(tree, start, end,
			      EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
			      0, NULL, cached_state, mask);
}

int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
		       gfp_t mask)
{
	return clear_extent_bit(tree, start, end,
				EXTENT_DIRTY | EXTENT_DELALLOC |
				EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
}

int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
		   gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
			      NULL, mask);
}

int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
			struct extent_state **cached_state, gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
			      NULL, cached_state, mask);
}

static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
				 u64 end, struct extent_state **cached_state,
				 gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
				cached_state, mask);
}

/*
 * either insert or lock a state struct between start and end.  Use 'mask'
 * to tell us whether waiting is desired.
 */
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		     int bits, struct extent_state **cached_state, gfp_t mask)
{
	int err;
	u64 failed_start;
	while (1) {
		err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
				     EXTENT_LOCKED, &failed_start,
				     cached_state, mask);
		if (err == -EEXIST && (mask & __GFP_WAIT)) {
			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
			start = failed_start;
		} else {
			break;
		}
		WARN_ON(start > end);
	}
	return err;
}

int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
{
	return lock_extent_bits(tree, start, end, 0, NULL, mask);
}

int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
		    gfp_t mask)
{
	int err;
	u64 failed_start;

	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
			     &failed_start, NULL, mask);
	if (err == -EEXIST) {
		if (failed_start > start)
			clear_extent_bit(tree, start, failed_start - 1,
					 EXTENT_LOCKED, 1, 0, NULL, mask);
		return 0;
	}
	return 1;
}

int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
			 struct extent_state **cached, gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
				mask);
}

int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
				mask);
}

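/*
 * Illustrative sketch only (not lifted from a real caller; 'tree' and
 * 'start' are placeholders): the usual pattern for the locking helpers
 * above is to lock a byte range, work on it while it is EXTENT_LOCKED,
 * and then unlock with the cached state so the final clear can skip the
 * rbtree walk.
 *
 *	struct extent_state *cached = NULL;
 *	u64 end = start + PAGE_CACHE_SIZE - 1;
 *
 *	lock_extent_bits(tree, start, end, 0, &cached, GFP_NOFS);
 *	... operate on [start, end] ...
 *	unlock_extent_cached(tree, start, end, &cached, GFP_NOFS);
 */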
/*
 * helper function to set both pages and extents in the tree writeback
 */
static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(tree->mapping, index);
		BUG_ON(!page);
		set_page_writeback(page);
		page_cache_release(page);
		index++;
	}
	return 0;
}

/* find the first state struct with 'bits' set after 'start', and
 * return it.  tree->lock must be held.  NULL will be returned if
 * nothing was found after 'start'
 */
struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
						 u64 start, int bits)
{
	struct rb_node *node;
	struct extent_state *state;

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->end >= start && (state->state & bits))
			return state;

		node = rb_next(node);
		if (!node)
			break;
	}
out:
	return NULL;
}

/*
 * find the first offset in the io tree with 'bits' set. zero is
 * returned if we find something, and *start_ret and *end_ret are
 * set to reflect the state struct that was found.
1069 * 1070 * If nothing was found, 1 is returned, < 0 on error 1071 */ 1072 int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 1073 u64 *start_ret, u64 *end_ret, int bits) 1074 { 1075 struct extent_state *state; 1076 int ret = 1; 1077 1078 spin_lock(&tree->lock); 1079 state = find_first_extent_bit_state(tree, start, bits); 1080 if (state) { 1081 *start_ret = state->start; 1082 *end_ret = state->end; 1083 ret = 0; 1084 } 1085 spin_unlock(&tree->lock); 1086 return ret; 1087 } 1088 1089 /* 1090 * find a contiguous range of bytes in the file marked as delalloc, not 1091 * more than 'max_bytes'. start and end are used to return the range, 1092 * 1093 * 1 is returned if we find something, 0 if nothing was in the tree 1094 */ 1095 static noinline u64 find_delalloc_range(struct extent_io_tree *tree, 1096 u64 *start, u64 *end, u64 max_bytes, 1097 struct extent_state **cached_state) 1098 { 1099 struct rb_node *node; 1100 struct extent_state *state; 1101 u64 cur_start = *start; 1102 u64 found = 0; 1103 u64 total_bytes = 0; 1104 1105 spin_lock(&tree->lock); 1106 1107 /* 1108 * this search will find all the extents that end after 1109 * our range starts. 1110 */ 1111 node = tree_search(tree, cur_start); 1112 if (!node) { 1113 if (!found) 1114 *end = (u64)-1; 1115 goto out; 1116 } 1117 1118 while (1) { 1119 state = rb_entry(node, struct extent_state, rb_node); 1120 if (found && (state->start != cur_start || 1121 (state->state & EXTENT_BOUNDARY))) { 1122 goto out; 1123 } 1124 if (!(state->state & EXTENT_DELALLOC)) { 1125 if (!found) 1126 *end = state->end; 1127 goto out; 1128 } 1129 if (!found) { 1130 *start = state->start; 1131 *cached_state = state; 1132 atomic_inc(&state->refs); 1133 } 1134 found++; 1135 *end = state->end; 1136 cur_start = state->end + 1; 1137 node = rb_next(node); 1138 if (!node) 1139 break; 1140 total_bytes += state->end - state->start + 1; 1141 if (total_bytes >= max_bytes) 1142 break; 1143 } 1144 out: 1145 spin_unlock(&tree->lock); 1146 return found; 1147 } 1148 1149 static noinline int __unlock_for_delalloc(struct inode *inode, 1150 struct page *locked_page, 1151 u64 start, u64 end) 1152 { 1153 int ret; 1154 struct page *pages[16]; 1155 unsigned long index = start >> PAGE_CACHE_SHIFT; 1156 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1157 unsigned long nr_pages = end_index - index + 1; 1158 int i; 1159 1160 if (index == locked_page->index && end_index == index) 1161 return 0; 1162 1163 while (nr_pages > 0) { 1164 ret = find_get_pages_contig(inode->i_mapping, index, 1165 min_t(unsigned long, nr_pages, 1166 ARRAY_SIZE(pages)), pages); 1167 for (i = 0; i < ret; i++) { 1168 if (pages[i] != locked_page) 1169 unlock_page(pages[i]); 1170 page_cache_release(pages[i]); 1171 } 1172 nr_pages -= ret; 1173 index += ret; 1174 cond_resched(); 1175 } 1176 return 0; 1177 } 1178 1179 static noinline int lock_delalloc_pages(struct inode *inode, 1180 struct page *locked_page, 1181 u64 delalloc_start, 1182 u64 delalloc_end) 1183 { 1184 unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT; 1185 unsigned long start_index = index; 1186 unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT; 1187 unsigned long pages_locked = 0; 1188 struct page *pages[16]; 1189 unsigned long nrpages; 1190 int ret; 1191 int i; 1192 1193 /* the caller is responsible for locking the start index */ 1194 if (index == locked_page->index && index == end_index) 1195 return 0; 1196 1197 /* skip the page at the start index */ 1198 nrpages = end_index - index + 1; 1199 while (nrpages > 0) { 1200 ret = 
find_get_pages_contig(inode->i_mapping, index, 1201 min_t(unsigned long, 1202 nrpages, ARRAY_SIZE(pages)), pages); 1203 if (ret == 0) { 1204 ret = -EAGAIN; 1205 goto done; 1206 } 1207 /* now we have an array of pages, lock them all */ 1208 for (i = 0; i < ret; i++) { 1209 /* 1210 * the caller is taking responsibility for 1211 * locked_page 1212 */ 1213 if (pages[i] != locked_page) { 1214 lock_page(pages[i]); 1215 if (!PageDirty(pages[i]) || 1216 pages[i]->mapping != inode->i_mapping) { 1217 ret = -EAGAIN; 1218 unlock_page(pages[i]); 1219 page_cache_release(pages[i]); 1220 goto done; 1221 } 1222 } 1223 page_cache_release(pages[i]); 1224 pages_locked++; 1225 } 1226 nrpages -= ret; 1227 index += ret; 1228 cond_resched(); 1229 } 1230 ret = 0; 1231 done: 1232 if (ret && pages_locked) { 1233 __unlock_for_delalloc(inode, locked_page, 1234 delalloc_start, 1235 ((u64)(start_index + pages_locked - 1)) << 1236 PAGE_CACHE_SHIFT); 1237 } 1238 return ret; 1239 } 1240 1241 /* 1242 * find a contiguous range of bytes in the file marked as delalloc, not 1243 * more than 'max_bytes'. start and end are used to return the range, 1244 * 1245 * 1 is returned if we find something, 0 if nothing was in the tree 1246 */ 1247 static noinline u64 find_lock_delalloc_range(struct inode *inode, 1248 struct extent_io_tree *tree, 1249 struct page *locked_page, 1250 u64 *start, u64 *end, 1251 u64 max_bytes) 1252 { 1253 u64 delalloc_start; 1254 u64 delalloc_end; 1255 u64 found; 1256 struct extent_state *cached_state = NULL; 1257 int ret; 1258 int loops = 0; 1259 1260 again: 1261 /* step one, find a bunch of delalloc bytes starting at start */ 1262 delalloc_start = *start; 1263 delalloc_end = 0; 1264 found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, 1265 max_bytes, &cached_state); 1266 if (!found || delalloc_end <= *start) { 1267 *start = delalloc_start; 1268 *end = delalloc_end; 1269 free_extent_state(cached_state); 1270 return found; 1271 } 1272 1273 /* 1274 * start comes from the offset of locked_page. We have to lock 1275 * pages in order, so we can't process delalloc bytes before 1276 * locked_page 1277 */ 1278 if (delalloc_start < *start) 1279 delalloc_start = *start; 1280 1281 /* 1282 * make sure to limit the number of pages we try to lock down 1283 * if we're looping. 
1284 */ 1285 if (delalloc_end + 1 - delalloc_start > max_bytes && loops) 1286 delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1; 1287 1288 /* step two, lock all the pages after the page that has start */ 1289 ret = lock_delalloc_pages(inode, locked_page, 1290 delalloc_start, delalloc_end); 1291 if (ret == -EAGAIN) { 1292 /* some of the pages are gone, lets avoid looping by 1293 * shortening the size of the delalloc range we're searching 1294 */ 1295 free_extent_state(cached_state); 1296 if (!loops) { 1297 unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); 1298 max_bytes = PAGE_CACHE_SIZE - offset; 1299 loops = 1; 1300 goto again; 1301 } else { 1302 found = 0; 1303 goto out_failed; 1304 } 1305 } 1306 BUG_ON(ret); 1307 1308 /* step three, lock the state bits for the whole range */ 1309 lock_extent_bits(tree, delalloc_start, delalloc_end, 1310 0, &cached_state, GFP_NOFS); 1311 1312 /* then test to make sure it is all still delalloc */ 1313 ret = test_range_bit(tree, delalloc_start, delalloc_end, 1314 EXTENT_DELALLOC, 1, cached_state); 1315 if (!ret) { 1316 unlock_extent_cached(tree, delalloc_start, delalloc_end, 1317 &cached_state, GFP_NOFS); 1318 __unlock_for_delalloc(inode, locked_page, 1319 delalloc_start, delalloc_end); 1320 cond_resched(); 1321 goto again; 1322 } 1323 free_extent_state(cached_state); 1324 *start = delalloc_start; 1325 *end = delalloc_end; 1326 out_failed: 1327 return found; 1328 } 1329 1330 int extent_clear_unlock_delalloc(struct inode *inode, 1331 struct extent_io_tree *tree, 1332 u64 start, u64 end, struct page *locked_page, 1333 unsigned long op) 1334 { 1335 int ret; 1336 struct page *pages[16]; 1337 unsigned long index = start >> PAGE_CACHE_SHIFT; 1338 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1339 unsigned long nr_pages = end_index - index + 1; 1340 int i; 1341 int clear_bits = 0; 1342 1343 if (op & EXTENT_CLEAR_UNLOCK) 1344 clear_bits |= EXTENT_LOCKED; 1345 if (op & EXTENT_CLEAR_DIRTY) 1346 clear_bits |= EXTENT_DIRTY; 1347 1348 if (op & EXTENT_CLEAR_DELALLOC) 1349 clear_bits |= EXTENT_DELALLOC; 1350 1351 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); 1352 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | 1353 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | 1354 EXTENT_SET_PRIVATE2))) 1355 return 0; 1356 1357 while (nr_pages > 0) { 1358 ret = find_get_pages_contig(inode->i_mapping, index, 1359 min_t(unsigned long, 1360 nr_pages, ARRAY_SIZE(pages)), pages); 1361 for (i = 0; i < ret; i++) { 1362 1363 if (op & EXTENT_SET_PRIVATE2) 1364 SetPagePrivate2(pages[i]); 1365 1366 if (pages[i] == locked_page) { 1367 page_cache_release(pages[i]); 1368 continue; 1369 } 1370 if (op & EXTENT_CLEAR_DIRTY) 1371 clear_page_dirty_for_io(pages[i]); 1372 if (op & EXTENT_SET_WRITEBACK) 1373 set_page_writeback(pages[i]); 1374 if (op & EXTENT_END_WRITEBACK) 1375 end_page_writeback(pages[i]); 1376 if (op & EXTENT_CLEAR_UNLOCK_PAGE) 1377 unlock_page(pages[i]); 1378 page_cache_release(pages[i]); 1379 } 1380 nr_pages -= ret; 1381 index += ret; 1382 cond_resched(); 1383 } 1384 return 0; 1385 } 1386 1387 /* 1388 * count the number of bytes in the tree that have a given bit(s) 1389 * set. This can be fairly slow, except for EXTENT_DIRTY which is 1390 * cached. The total number found is returned. 
1391 */ 1392 u64 count_range_bits(struct extent_io_tree *tree, 1393 u64 *start, u64 search_end, u64 max_bytes, 1394 unsigned long bits, int contig) 1395 { 1396 struct rb_node *node; 1397 struct extent_state *state; 1398 u64 cur_start = *start; 1399 u64 total_bytes = 0; 1400 u64 last = 0; 1401 int found = 0; 1402 1403 if (search_end <= cur_start) { 1404 WARN_ON(1); 1405 return 0; 1406 } 1407 1408 spin_lock(&tree->lock); 1409 if (cur_start == 0 && bits == EXTENT_DIRTY) { 1410 total_bytes = tree->dirty_bytes; 1411 goto out; 1412 } 1413 /* 1414 * this search will find all the extents that end after 1415 * our range starts. 1416 */ 1417 node = tree_search(tree, cur_start); 1418 if (!node) 1419 goto out; 1420 1421 while (1) { 1422 state = rb_entry(node, struct extent_state, rb_node); 1423 if (state->start > search_end) 1424 break; 1425 if (contig && found && state->start > last + 1) 1426 break; 1427 if (state->end >= cur_start && (state->state & bits) == bits) { 1428 total_bytes += min(search_end, state->end) + 1 - 1429 max(cur_start, state->start); 1430 if (total_bytes >= max_bytes) 1431 break; 1432 if (!found) { 1433 *start = max(cur_start, state->start); 1434 found = 1; 1435 } 1436 last = state->end; 1437 } else if (contig && found) { 1438 break; 1439 } 1440 node = rb_next(node); 1441 if (!node) 1442 break; 1443 } 1444 out: 1445 spin_unlock(&tree->lock); 1446 return total_bytes; 1447 } 1448 1449 /* 1450 * set the private field for a given byte offset in the tree. If there isn't 1451 * an extent_state there already, this does nothing. 1452 */ 1453 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) 1454 { 1455 struct rb_node *node; 1456 struct extent_state *state; 1457 int ret = 0; 1458 1459 spin_lock(&tree->lock); 1460 /* 1461 * this search will find all the extents that end after 1462 * our range starts. 1463 */ 1464 node = tree_search(tree, start); 1465 if (!node) { 1466 ret = -ENOENT; 1467 goto out; 1468 } 1469 state = rb_entry(node, struct extent_state, rb_node); 1470 if (state->start != start) { 1471 ret = -ENOENT; 1472 goto out; 1473 } 1474 state->private = private; 1475 out: 1476 spin_unlock(&tree->lock); 1477 return ret; 1478 } 1479 1480 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) 1481 { 1482 struct rb_node *node; 1483 struct extent_state *state; 1484 int ret = 0; 1485 1486 spin_lock(&tree->lock); 1487 /* 1488 * this search will find all the extents that end after 1489 * our range starts. 1490 */ 1491 node = tree_search(tree, start); 1492 if (!node) { 1493 ret = -ENOENT; 1494 goto out; 1495 } 1496 state = rb_entry(node, struct extent_state, rb_node); 1497 if (state->start != start) { 1498 ret = -ENOENT; 1499 goto out; 1500 } 1501 *private = state->private; 1502 out: 1503 spin_unlock(&tree->lock); 1504 return ret; 1505 } 1506 1507 /* 1508 * searches a range in the state tree for a given mask. 1509 * If 'filled' == 1, this returns 1 only if every extent in the tree 1510 * has the bits set. Otherwise, 1 is returned if any bit in the 1511 * range is found set. 
1512 */ 1513 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 1514 int bits, int filled, struct extent_state *cached) 1515 { 1516 struct extent_state *state = NULL; 1517 struct rb_node *node; 1518 int bitset = 0; 1519 1520 spin_lock(&tree->lock); 1521 if (cached && cached->tree && cached->start <= start && 1522 cached->end > start) 1523 node = &cached->rb_node; 1524 else 1525 node = tree_search(tree, start); 1526 while (node && start <= end) { 1527 state = rb_entry(node, struct extent_state, rb_node); 1528 1529 if (filled && state->start > start) { 1530 bitset = 0; 1531 break; 1532 } 1533 1534 if (state->start > end) 1535 break; 1536 1537 if (state->state & bits) { 1538 bitset = 1; 1539 if (!filled) 1540 break; 1541 } else if (filled) { 1542 bitset = 0; 1543 break; 1544 } 1545 1546 if (state->end == (u64)-1) 1547 break; 1548 1549 start = state->end + 1; 1550 if (start > end) 1551 break; 1552 node = rb_next(node); 1553 if (!node) { 1554 if (filled) 1555 bitset = 0; 1556 break; 1557 } 1558 } 1559 spin_unlock(&tree->lock); 1560 return bitset; 1561 } 1562 1563 /* 1564 * helper function to set a given page up to date if all the 1565 * extents in the tree for that page are up to date 1566 */ 1567 static int check_page_uptodate(struct extent_io_tree *tree, 1568 struct page *page) 1569 { 1570 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1571 u64 end = start + PAGE_CACHE_SIZE - 1; 1572 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) 1573 SetPageUptodate(page); 1574 return 0; 1575 } 1576 1577 /* 1578 * helper function to unlock a page if all the extents in the tree 1579 * for that page are unlocked 1580 */ 1581 static int check_page_locked(struct extent_io_tree *tree, 1582 struct page *page) 1583 { 1584 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1585 u64 end = start + PAGE_CACHE_SIZE - 1; 1586 if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) 1587 unlock_page(page); 1588 return 0; 1589 } 1590 1591 /* 1592 * helper function to end page writeback if all the extents 1593 * in the tree for that page are done with writeback 1594 */ 1595 static int check_page_writeback(struct extent_io_tree *tree, 1596 struct page *page) 1597 { 1598 end_page_writeback(page); 1599 return 0; 1600 } 1601 1602 /* lots and lots of room for performance fixes in the end_bio funcs */ 1603 1604 /* 1605 * after a writepage IO is done, we need to: 1606 * clear the uptodate bits on error 1607 * clear the writeback bits in the extent tree for this IO 1608 * end_page_writeback if the page has no more pending IO 1609 * 1610 * Scheduling is not allowed, so the extent state tree is expected 1611 * to have one and only one object corresponding to this IO. 
1612 */ 1613 static void end_bio_extent_writepage(struct bio *bio, int err) 1614 { 1615 int uptodate = err == 0; 1616 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1617 struct extent_io_tree *tree; 1618 u64 start; 1619 u64 end; 1620 int whole_page; 1621 int ret; 1622 1623 do { 1624 struct page *page = bvec->bv_page; 1625 tree = &BTRFS_I(page->mapping->host)->io_tree; 1626 1627 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 1628 bvec->bv_offset; 1629 end = start + bvec->bv_len - 1; 1630 1631 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) 1632 whole_page = 1; 1633 else 1634 whole_page = 0; 1635 1636 if (--bvec >= bio->bi_io_vec) 1637 prefetchw(&bvec->bv_page->flags); 1638 if (tree->ops && tree->ops->writepage_end_io_hook) { 1639 ret = tree->ops->writepage_end_io_hook(page, start, 1640 end, NULL, uptodate); 1641 if (ret) 1642 uptodate = 0; 1643 } 1644 1645 if (!uptodate && tree->ops && 1646 tree->ops->writepage_io_failed_hook) { 1647 ret = tree->ops->writepage_io_failed_hook(bio, page, 1648 start, end, NULL); 1649 if (ret == 0) { 1650 uptodate = (err == 0); 1651 continue; 1652 } 1653 } 1654 1655 if (!uptodate) { 1656 clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS); 1657 ClearPageUptodate(page); 1658 SetPageError(page); 1659 } 1660 1661 if (whole_page) 1662 end_page_writeback(page); 1663 else 1664 check_page_writeback(tree, page); 1665 } while (bvec >= bio->bi_io_vec); 1666 1667 bio_put(bio); 1668 } 1669 1670 /* 1671 * after a readpage IO is done, we need to: 1672 * clear the uptodate bits on error 1673 * set the uptodate bits if things worked 1674 * set the page up to date if all extents in the tree are uptodate 1675 * clear the lock bit in the extent tree 1676 * unlock the page if there are no other extents locked for it 1677 * 1678 * Scheduling is not allowed, so the extent state tree is expected 1679 * to have one and only one object corresponding to this IO. 
1680 */ 1681 static void end_bio_extent_readpage(struct bio *bio, int err) 1682 { 1683 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1684 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; 1685 struct bio_vec *bvec = bio->bi_io_vec; 1686 struct extent_io_tree *tree; 1687 u64 start; 1688 u64 end; 1689 int whole_page; 1690 int ret; 1691 1692 if (err) 1693 uptodate = 0; 1694 1695 do { 1696 struct page *page = bvec->bv_page; 1697 struct extent_state *cached = NULL; 1698 struct extent_state *state; 1699 1700 tree = &BTRFS_I(page->mapping->host)->io_tree; 1701 1702 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 1703 bvec->bv_offset; 1704 end = start + bvec->bv_len - 1; 1705 1706 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) 1707 whole_page = 1; 1708 else 1709 whole_page = 0; 1710 1711 if (++bvec <= bvec_end) 1712 prefetchw(&bvec->bv_page->flags); 1713 1714 spin_lock(&tree->lock); 1715 state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED); 1716 if (state && state->start == start) { 1717 /* 1718 * take a reference on the state, unlock will drop 1719 * the ref 1720 */ 1721 cache_state(state, &cached); 1722 } 1723 spin_unlock(&tree->lock); 1724 1725 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { 1726 ret = tree->ops->readpage_end_io_hook(page, start, end, 1727 state); 1728 if (ret) 1729 uptodate = 0; 1730 } 1731 if (!uptodate && tree->ops && 1732 tree->ops->readpage_io_failed_hook) { 1733 ret = tree->ops->readpage_io_failed_hook(bio, page, 1734 start, end, NULL); 1735 if (ret == 0) { 1736 uptodate = 1737 test_bit(BIO_UPTODATE, &bio->bi_flags); 1738 if (err) 1739 uptodate = 0; 1740 uncache_state(&cached); 1741 continue; 1742 } 1743 } 1744 1745 if (uptodate) { 1746 set_extent_uptodate(tree, start, end, &cached, 1747 GFP_ATOMIC); 1748 } 1749 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); 1750 1751 if (whole_page) { 1752 if (uptodate) { 1753 SetPageUptodate(page); 1754 } else { 1755 ClearPageUptodate(page); 1756 SetPageError(page); 1757 } 1758 unlock_page(page); 1759 } else { 1760 if (uptodate) { 1761 check_page_uptodate(tree, page); 1762 } else { 1763 ClearPageUptodate(page); 1764 SetPageError(page); 1765 } 1766 check_page_locked(tree, page); 1767 } 1768 } while (bvec <= bvec_end); 1769 1770 bio_put(bio); 1771 } 1772 1773 struct bio * 1774 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 1775 gfp_t gfp_flags) 1776 { 1777 struct bio *bio; 1778 1779 bio = bio_alloc(gfp_flags, nr_vecs); 1780 1781 if (bio == NULL && (current->flags & PF_MEMALLOC)) { 1782 while (!bio && (nr_vecs /= 2)) 1783 bio = bio_alloc(gfp_flags, nr_vecs); 1784 } 1785 1786 if (bio) { 1787 bio->bi_size = 0; 1788 bio->bi_bdev = bdev; 1789 bio->bi_sector = first_sector; 1790 } 1791 return bio; 1792 } 1793 1794 static int submit_one_bio(int rw, struct bio *bio, int mirror_num, 1795 unsigned long bio_flags) 1796 { 1797 int ret = 0; 1798 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1799 struct page *page = bvec->bv_page; 1800 struct extent_io_tree *tree = bio->bi_private; 1801 u64 start; 1802 1803 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; 1804 1805 bio->bi_private = NULL; 1806 1807 bio_get(bio); 1808 1809 if (tree->ops && tree->ops->submit_bio_hook) 1810 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 1811 mirror_num, bio_flags, start); 1812 else 1813 submit_bio(rw, bio); 1814 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 1815 ret = -EOPNOTSUPP; 1816 bio_put(bio); 1817 return ret; 1818 } 1819 1820 
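/*
 * Illustrative sketch (mirrors what extent_read_full_page() below does):
 * submit_extent_page() lets a caller accumulate contiguous pages into a
 * single bio by passing the same 'struct bio **' across calls, and whoever
 * ends up holding the last bio is responsible for flushing it with
 * submit_one_bio(), roughly:
 *
 *	struct bio *bio = NULL;
 *	unsigned long bio_flags = 0;
 *
 *	__extent_read_full_page(tree, page, get_extent, &bio, 0, &bio_flags);
 *	if (bio)
 *		submit_one_bio(READ, bio, 0, bio_flags);
 */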
static int submit_extent_page(int rw, struct extent_io_tree *tree, 1821 struct page *page, sector_t sector, 1822 size_t size, unsigned long offset, 1823 struct block_device *bdev, 1824 struct bio **bio_ret, 1825 unsigned long max_pages, 1826 bio_end_io_t end_io_func, 1827 int mirror_num, 1828 unsigned long prev_bio_flags, 1829 unsigned long bio_flags) 1830 { 1831 int ret = 0; 1832 struct bio *bio; 1833 int nr; 1834 int contig = 0; 1835 int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED; 1836 int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; 1837 size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE); 1838 1839 if (bio_ret && *bio_ret) { 1840 bio = *bio_ret; 1841 if (old_compressed) 1842 contig = bio->bi_sector == sector; 1843 else 1844 contig = bio->bi_sector + (bio->bi_size >> 9) == 1845 sector; 1846 1847 if (prev_bio_flags != bio_flags || !contig || 1848 (tree->ops && tree->ops->merge_bio_hook && 1849 tree->ops->merge_bio_hook(page, offset, page_size, bio, 1850 bio_flags)) || 1851 bio_add_page(bio, page, page_size, offset) < page_size) { 1852 ret = submit_one_bio(rw, bio, mirror_num, 1853 prev_bio_flags); 1854 bio = NULL; 1855 } else { 1856 return 0; 1857 } 1858 } 1859 if (this_compressed) 1860 nr = BIO_MAX_PAGES; 1861 else 1862 nr = bio_get_nr_vecs(bdev); 1863 1864 bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); 1865 if (!bio) 1866 return -ENOMEM; 1867 1868 bio_add_page(bio, page, page_size, offset); 1869 bio->bi_end_io = end_io_func; 1870 bio->bi_private = tree; 1871 1872 if (bio_ret) 1873 *bio_ret = bio; 1874 else 1875 ret = submit_one_bio(rw, bio, mirror_num, bio_flags); 1876 1877 return ret; 1878 } 1879 1880 void set_page_extent_mapped(struct page *page) 1881 { 1882 if (!PagePrivate(page)) { 1883 SetPagePrivate(page); 1884 page_cache_get(page); 1885 set_page_private(page, EXTENT_PAGE_PRIVATE); 1886 } 1887 } 1888 1889 static void set_page_extent_head(struct page *page, unsigned long len) 1890 { 1891 WARN_ON(!PagePrivate(page)); 1892 set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); 1893 } 1894 1895 /* 1896 * basic readpage implementation. 
Locked extent state structs are inserted 1897 * into the tree that are removed when the IO is done (by the end_io 1898 * handlers) 1899 */ 1900 static int __extent_read_full_page(struct extent_io_tree *tree, 1901 struct page *page, 1902 get_extent_t *get_extent, 1903 struct bio **bio, int mirror_num, 1904 unsigned long *bio_flags) 1905 { 1906 struct inode *inode = page->mapping->host; 1907 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1908 u64 page_end = start + PAGE_CACHE_SIZE - 1; 1909 u64 end; 1910 u64 cur = start; 1911 u64 extent_offset; 1912 u64 last_byte = i_size_read(inode); 1913 u64 block_start; 1914 u64 cur_end; 1915 sector_t sector; 1916 struct extent_map *em; 1917 struct block_device *bdev; 1918 struct btrfs_ordered_extent *ordered; 1919 int ret; 1920 int nr = 0; 1921 size_t pg_offset = 0; 1922 size_t iosize; 1923 size_t disk_io_size; 1924 size_t blocksize = inode->i_sb->s_blocksize; 1925 unsigned long this_bio_flag = 0; 1926 1927 set_page_extent_mapped(page); 1928 1929 if (!PageUptodate(page)) { 1930 if (cleancache_get_page(page) == 0) { 1931 BUG_ON(blocksize != PAGE_SIZE); 1932 goto out; 1933 } 1934 } 1935 1936 end = page_end; 1937 while (1) { 1938 lock_extent(tree, start, end, GFP_NOFS); 1939 ordered = btrfs_lookup_ordered_extent(inode, start); 1940 if (!ordered) 1941 break; 1942 unlock_extent(tree, start, end, GFP_NOFS); 1943 btrfs_start_ordered_extent(inode, ordered, 1); 1944 btrfs_put_ordered_extent(ordered); 1945 } 1946 1947 if (page->index == last_byte >> PAGE_CACHE_SHIFT) { 1948 char *userpage; 1949 size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1); 1950 1951 if (zero_offset) { 1952 iosize = PAGE_CACHE_SIZE - zero_offset; 1953 userpage = kmap_atomic(page, KM_USER0); 1954 memset(userpage + zero_offset, 0, iosize); 1955 flush_dcache_page(page); 1956 kunmap_atomic(userpage, KM_USER0); 1957 } 1958 } 1959 while (cur <= end) { 1960 if (cur >= last_byte) { 1961 char *userpage; 1962 struct extent_state *cached = NULL; 1963 1964 iosize = PAGE_CACHE_SIZE - pg_offset; 1965 userpage = kmap_atomic(page, KM_USER0); 1966 memset(userpage + pg_offset, 0, iosize); 1967 flush_dcache_page(page); 1968 kunmap_atomic(userpage, KM_USER0); 1969 set_extent_uptodate(tree, cur, cur + iosize - 1, 1970 &cached, GFP_NOFS); 1971 unlock_extent_cached(tree, cur, cur + iosize - 1, 1972 &cached, GFP_NOFS); 1973 break; 1974 } 1975 em = get_extent(inode, page, pg_offset, cur, 1976 end - cur + 1, 0); 1977 if (IS_ERR_OR_NULL(em)) { 1978 SetPageError(page); 1979 unlock_extent(tree, cur, end, GFP_NOFS); 1980 break; 1981 } 1982 extent_offset = cur - em->start; 1983 BUG_ON(extent_map_end(em) <= cur); 1984 BUG_ON(end < cur); 1985 1986 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 1987 this_bio_flag = EXTENT_BIO_COMPRESSED; 1988 extent_set_compress_type(&this_bio_flag, 1989 em->compress_type); 1990 } 1991 1992 iosize = min(extent_map_end(em) - cur, end - cur + 1); 1993 cur_end = min(extent_map_end(em) - 1, end); 1994 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); 1995 if (this_bio_flag & EXTENT_BIO_COMPRESSED) { 1996 disk_io_size = em->block_len; 1997 sector = em->block_start >> 9; 1998 } else { 1999 sector = (em->block_start + extent_offset) >> 9; 2000 disk_io_size = iosize; 2001 } 2002 bdev = em->bdev; 2003 block_start = em->block_start; 2004 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 2005 block_start = EXTENT_MAP_HOLE; 2006 free_extent_map(em); 2007 em = NULL; 2008 2009 /* we've found a hole, just zero and go on */ 2010 if (block_start == EXTENT_MAP_HOLE) { 2011 char *userpage; 
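			/*
			 * Hole in the file: no disk read is issued.  The
			 * range is zeroed in the page cache below and then
			 * marked EXTENT_UPTODATE and unlocked.
			 */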
2012 struct extent_state *cached = NULL; 2013 2014 userpage = kmap_atomic(page, KM_USER0); 2015 memset(userpage + pg_offset, 0, iosize); 2016 flush_dcache_page(page); 2017 kunmap_atomic(userpage, KM_USER0); 2018 2019 set_extent_uptodate(tree, cur, cur + iosize - 1, 2020 &cached, GFP_NOFS); 2021 unlock_extent_cached(tree, cur, cur + iosize - 1, 2022 &cached, GFP_NOFS); 2023 cur = cur + iosize; 2024 pg_offset += iosize; 2025 continue; 2026 } 2027 /* the get_extent function already copied into the page */ 2028 if (test_range_bit(tree, cur, cur_end, 2029 EXTENT_UPTODATE, 1, NULL)) { 2030 check_page_uptodate(tree, page); 2031 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2032 cur = cur + iosize; 2033 pg_offset += iosize; 2034 continue; 2035 } 2036 /* we have an inline extent but it didn't get marked up 2037 * to date. Error out 2038 */ 2039 if (block_start == EXTENT_MAP_INLINE) { 2040 SetPageError(page); 2041 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2042 cur = cur + iosize; 2043 pg_offset += iosize; 2044 continue; 2045 } 2046 2047 ret = 0; 2048 if (tree->ops && tree->ops->readpage_io_hook) { 2049 ret = tree->ops->readpage_io_hook(page, cur, 2050 cur + iosize - 1); 2051 } 2052 if (!ret) { 2053 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; 2054 pnr -= page->index; 2055 ret = submit_extent_page(READ, tree, page, 2056 sector, disk_io_size, pg_offset, 2057 bdev, bio, pnr, 2058 end_bio_extent_readpage, mirror_num, 2059 *bio_flags, 2060 this_bio_flag); 2061 nr++; 2062 *bio_flags = this_bio_flag; 2063 } 2064 if (ret) 2065 SetPageError(page); 2066 cur = cur + iosize; 2067 pg_offset += iosize; 2068 } 2069 out: 2070 if (!nr) { 2071 if (!PageError(page)) 2072 SetPageUptodate(page); 2073 unlock_page(page); 2074 } 2075 return 0; 2076 } 2077 2078 int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 2079 get_extent_t *get_extent) 2080 { 2081 struct bio *bio = NULL; 2082 unsigned long bio_flags = 0; 2083 int ret; 2084 2085 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, 2086 &bio_flags); 2087 if (bio) 2088 ret = submit_one_bio(READ, bio, 0, bio_flags); 2089 return ret; 2090 } 2091 2092 static noinline void update_nr_written(struct page *page, 2093 struct writeback_control *wbc, 2094 unsigned long nr_written) 2095 { 2096 wbc->nr_to_write -= nr_written; 2097 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && 2098 wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) 2099 page->mapping->writeback_index = page->index + nr_written; 2100 } 2101 2102 /* 2103 * the writepage semantics are similar to regular writepage. extent 2104 * records are inserted to lock ranges in the tree, and as dirty areas 2105 * are found, they are marked writeback. 
Then the lock bits are removed 2106 * and the end_io handler clears the writeback ranges 2107 */ 2108 static int __extent_writepage(struct page *page, struct writeback_control *wbc, 2109 void *data) 2110 { 2111 struct inode *inode = page->mapping->host; 2112 struct extent_page_data *epd = data; 2113 struct extent_io_tree *tree = epd->tree; 2114 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 2115 u64 delalloc_start; 2116 u64 page_end = start + PAGE_CACHE_SIZE - 1; 2117 u64 end; 2118 u64 cur = start; 2119 u64 extent_offset; 2120 u64 last_byte = i_size_read(inode); 2121 u64 block_start; 2122 u64 iosize; 2123 sector_t sector; 2124 struct extent_state *cached_state = NULL; 2125 struct extent_map *em; 2126 struct block_device *bdev; 2127 int ret; 2128 int nr = 0; 2129 size_t pg_offset = 0; 2130 size_t blocksize; 2131 loff_t i_size = i_size_read(inode); 2132 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; 2133 u64 nr_delalloc; 2134 u64 delalloc_end; 2135 int page_started; 2136 int compressed; 2137 int write_flags; 2138 unsigned long nr_written = 0; 2139 2140 if (wbc->sync_mode == WB_SYNC_ALL) 2141 write_flags = WRITE_SYNC; 2142 else 2143 write_flags = WRITE; 2144 2145 trace___extent_writepage(page, inode, wbc); 2146 2147 WARN_ON(!PageLocked(page)); 2148 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2149 if (page->index > end_index || 2150 (page->index == end_index && !pg_offset)) { 2151 page->mapping->a_ops->invalidatepage(page, 0); 2152 unlock_page(page); 2153 return 0; 2154 } 2155 2156 if (page->index == end_index) { 2157 char *userpage; 2158 2159 userpage = kmap_atomic(page, KM_USER0); 2160 memset(userpage + pg_offset, 0, 2161 PAGE_CACHE_SIZE - pg_offset); 2162 kunmap_atomic(userpage, KM_USER0); 2163 flush_dcache_page(page); 2164 } 2165 pg_offset = 0; 2166 2167 set_page_extent_mapped(page); 2168 2169 delalloc_start = start; 2170 delalloc_end = 0; 2171 page_started = 0; 2172 if (!epd->extent_locked) { 2173 u64 delalloc_to_write = 0; 2174 /* 2175 * make sure the wbc mapping index is at least updated 2176 * to this page. 2177 */ 2178 update_nr_written(page, wbc, 0); 2179 2180 while (delalloc_end < page_end) { 2181 nr_delalloc = find_lock_delalloc_range(inode, tree, 2182 page, 2183 &delalloc_start, 2184 &delalloc_end, 2185 128 * 1024 * 1024); 2186 if (nr_delalloc == 0) { 2187 delalloc_start = delalloc_end + 1; 2188 continue; 2189 } 2190 tree->ops->fill_delalloc(inode, page, delalloc_start, 2191 delalloc_end, &page_started, 2192 &nr_written); 2193 /* 2194 * delalloc_end is already one less than the total 2195 * length, so we don't subtract one from 2196 * PAGE_CACHE_SIZE 2197 */ 2198 delalloc_to_write += (delalloc_end - delalloc_start + 2199 PAGE_CACHE_SIZE) >> 2200 PAGE_CACHE_SHIFT; 2201 delalloc_start = delalloc_end + 1; 2202 } 2203 if (wbc->nr_to_write < delalloc_to_write) { 2204 int thresh = 8192; 2205 2206 if (delalloc_to_write < thresh * 2) 2207 thresh = delalloc_to_write; 2208 wbc->nr_to_write = min_t(u64, delalloc_to_write, 2209 thresh); 2210 } 2211 2212 /* did the fill delalloc function already unlock and start 2213 * the IO? 2214 */ 2215 if (page_started) { 2216 ret = 0; 2217 /* 2218 * we've unlocked the page, so we can't update 2219 * the mapping's writeback index, just update 2220 * nr_to_write. 
2221 */ 2222 wbc->nr_to_write -= nr_written; 2223 goto done_unlocked; 2224 } 2225 } 2226 if (tree->ops && tree->ops->writepage_start_hook) { 2227 ret = tree->ops->writepage_start_hook(page, start, 2228 page_end); 2229 if (ret == -EAGAIN) { 2230 redirty_page_for_writepage(wbc, page); 2231 update_nr_written(page, wbc, nr_written); 2232 unlock_page(page); 2233 ret = 0; 2234 goto done_unlocked; 2235 } 2236 } 2237 2238 /* 2239 * we don't want to touch the inode after unlocking the page, 2240 * so we update the mapping writeback index now 2241 */ 2242 update_nr_written(page, wbc, nr_written + 1); 2243 2244 end = page_end; 2245 if (last_byte <= start) { 2246 if (tree->ops && tree->ops->writepage_end_io_hook) 2247 tree->ops->writepage_end_io_hook(page, start, 2248 page_end, NULL, 1); 2249 goto done; 2250 } 2251 2252 blocksize = inode->i_sb->s_blocksize; 2253 2254 while (cur <= end) { 2255 if (cur >= last_byte) { 2256 if (tree->ops && tree->ops->writepage_end_io_hook) 2257 tree->ops->writepage_end_io_hook(page, cur, 2258 page_end, NULL, 1); 2259 break; 2260 } 2261 em = epd->get_extent(inode, page, pg_offset, cur, 2262 end - cur + 1, 1); 2263 if (IS_ERR_OR_NULL(em)) { 2264 SetPageError(page); 2265 break; 2266 } 2267 2268 extent_offset = cur - em->start; 2269 BUG_ON(extent_map_end(em) <= cur); 2270 BUG_ON(end < cur); 2271 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2272 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); 2273 sector = (em->block_start + extent_offset) >> 9; 2274 bdev = em->bdev; 2275 block_start = em->block_start; 2276 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 2277 free_extent_map(em); 2278 em = NULL; 2279 2280 /* 2281 * compressed and inline extents are written through other 2282 * paths in the FS 2283 */ 2284 if (compressed || block_start == EXTENT_MAP_HOLE || 2285 block_start == EXTENT_MAP_INLINE) { 2286 /* 2287 * end_io notification does not happen here for 2288 * compressed extents 2289 */ 2290 if (!compressed && tree->ops && 2291 tree->ops->writepage_end_io_hook) 2292 tree->ops->writepage_end_io_hook(page, cur, 2293 cur + iosize - 1, 2294 NULL, 1); 2295 else if (compressed) { 2296 /* we don't want to end_page_writeback on 2297 * a compressed extent. 
this happens 2298 * elsewhere 2299 */ 2300 nr++; 2301 } 2302 2303 cur += iosize; 2304 pg_offset += iosize; 2305 continue; 2306 } 2307 /* leave this out until we have a page_mkwrite call */ 2308 if (0 && !test_range_bit(tree, cur, cur + iosize - 1, 2309 EXTENT_DIRTY, 0, NULL)) { 2310 cur = cur + iosize; 2311 pg_offset += iosize; 2312 continue; 2313 } 2314 2315 if (tree->ops && tree->ops->writepage_io_hook) { 2316 ret = tree->ops->writepage_io_hook(page, cur, 2317 cur + iosize - 1); 2318 } else { 2319 ret = 0; 2320 } 2321 if (ret) { 2322 SetPageError(page); 2323 } else { 2324 unsigned long max_nr = end_index + 1; 2325 2326 set_range_writeback(tree, cur, cur + iosize - 1); 2327 if (!PageWriteback(page)) { 2328 printk(KERN_ERR "btrfs warning page %lu not " 2329 "writeback, cur %llu end %llu\n", 2330 page->index, (unsigned long long)cur, 2331 (unsigned long long)end); 2332 } 2333 2334 ret = submit_extent_page(write_flags, tree, page, 2335 sector, iosize, pg_offset, 2336 bdev, &epd->bio, max_nr, 2337 end_bio_extent_writepage, 2338 0, 0, 0); 2339 if (ret) 2340 SetPageError(page); 2341 } 2342 cur = cur + iosize; 2343 pg_offset += iosize; 2344 nr++; 2345 } 2346 done: 2347 if (nr == 0) { 2348 /* make sure the mapping tag for page dirty gets cleared */ 2349 set_page_writeback(page); 2350 end_page_writeback(page); 2351 } 2352 unlock_page(page); 2353 2354 done_unlocked: 2355 2356 /* drop our reference on any cached states */ 2357 free_extent_state(cached_state); 2358 return 0; 2359 } 2360 2361 /** 2362 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. 2363 * @mapping: address space structure to write 2364 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 2365 * @writepage: function called for each page 2366 * @data: data passed to writepage function 2367 * 2368 * If a page is already under I/O, write_cache_pages() skips it, even 2369 * if it's dirty. This is desirable behaviour for memory-cleaning writeback, 2370 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() 2371 * and msync() need to guarantee that all the data which was dirty at the time 2372 * the call was made get new I/O started against them. If wbc->sync_mode is 2373 * WB_SYNC_ALL then we were called for data integrity and we must wait for 2374 * existing IO to complete. 
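 *
 * A minimal usage sketch (this is how extent_writepages() below drives the
 * helper; epd stands for a caller-owned struct extent_page_data, so the
 * lines are illustrative rather than additional API):
 *
 *	ret = extent_write_cache_pages(tree, mapping, wbc,
 *				       __extent_writepage, &epd,
 *				       flush_write_bio);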
2375 */ 2376 static int extent_write_cache_pages(struct extent_io_tree *tree, 2377 struct address_space *mapping, 2378 struct writeback_control *wbc, 2379 writepage_t writepage, void *data, 2380 void (*flush_fn)(void *)) 2381 { 2382 int ret = 0; 2383 int done = 0; 2384 int nr_to_write_done = 0; 2385 struct pagevec pvec; 2386 int nr_pages; 2387 pgoff_t index; 2388 pgoff_t end; /* Inclusive */ 2389 int scanned = 0; 2390 int tag; 2391 2392 pagevec_init(&pvec, 0); 2393 if (wbc->range_cyclic) { 2394 index = mapping->writeback_index; /* Start from prev offset */ 2395 end = -1; 2396 } else { 2397 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2398 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2399 scanned = 1; 2400 } 2401 if (wbc->sync_mode == WB_SYNC_ALL) 2402 tag = PAGECACHE_TAG_TOWRITE; 2403 else 2404 tag = PAGECACHE_TAG_DIRTY; 2405 retry: 2406 if (wbc->sync_mode == WB_SYNC_ALL) 2407 tag_pages_for_writeback(mapping, index, end); 2408 while (!done && !nr_to_write_done && (index <= end) && 2409 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 2410 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 2411 unsigned i; 2412 2413 scanned = 1; 2414 for (i = 0; i < nr_pages; i++) { 2415 struct page *page = pvec.pages[i]; 2416 2417 /* 2418 * At this point we hold neither mapping->tree_lock nor 2419 * lock on the page itself: the page may be truncated or 2420 * invalidated (changing page->mapping to NULL), or even 2421 * swizzled back from swapper_space to tmpfs file 2422 * mapping 2423 */ 2424 if (tree->ops && tree->ops->write_cache_pages_lock_hook) 2425 tree->ops->write_cache_pages_lock_hook(page); 2426 else 2427 lock_page(page); 2428 2429 if (unlikely(page->mapping != mapping)) { 2430 unlock_page(page); 2431 continue; 2432 } 2433 2434 if (!wbc->range_cyclic && page->index > end) { 2435 done = 1; 2436 unlock_page(page); 2437 continue; 2438 } 2439 2440 if (wbc->sync_mode != WB_SYNC_NONE) { 2441 if (PageWriteback(page)) 2442 flush_fn(data); 2443 wait_on_page_writeback(page); 2444 } 2445 2446 if (PageWriteback(page) || 2447 !clear_page_dirty_for_io(page)) { 2448 unlock_page(page); 2449 continue; 2450 } 2451 2452 ret = (*writepage)(page, wbc, data); 2453 2454 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { 2455 unlock_page(page); 2456 ret = 0; 2457 } 2458 if (ret) 2459 done = 1; 2460 2461 /* 2462 * the filesystem may choose to bump up nr_to_write. 
2463 * We have to make sure to honor the new nr_to_write 2464 * at any time 2465 */ 2466 nr_to_write_done = wbc->nr_to_write <= 0; 2467 } 2468 pagevec_release(&pvec); 2469 cond_resched(); 2470 } 2471 if (!scanned && !done) { 2472 /* 2473 * We hit the last page and there is more work to be done: wrap 2474 * back to the start of the file 2475 */ 2476 scanned = 1; 2477 index = 0; 2478 goto retry; 2479 } 2480 return ret; 2481 } 2482 2483 static void flush_epd_write_bio(struct extent_page_data *epd) 2484 { 2485 if (epd->bio) { 2486 if (epd->sync_io) 2487 submit_one_bio(WRITE_SYNC, epd->bio, 0, 0); 2488 else 2489 submit_one_bio(WRITE, epd->bio, 0, 0); 2490 epd->bio = NULL; 2491 } 2492 } 2493 2494 static noinline void flush_write_bio(void *data) 2495 { 2496 struct extent_page_data *epd = data; 2497 flush_epd_write_bio(epd); 2498 } 2499 2500 int extent_write_full_page(struct extent_io_tree *tree, struct page *page, 2501 get_extent_t *get_extent, 2502 struct writeback_control *wbc) 2503 { 2504 int ret; 2505 struct extent_page_data epd = { 2506 .bio = NULL, 2507 .tree = tree, 2508 .get_extent = get_extent, 2509 .extent_locked = 0, 2510 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 2511 }; 2512 2513 ret = __extent_writepage(page, wbc, &epd); 2514 2515 flush_epd_write_bio(&epd); 2516 return ret; 2517 } 2518 2519 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, 2520 u64 start, u64 end, get_extent_t *get_extent, 2521 int mode) 2522 { 2523 int ret = 0; 2524 struct address_space *mapping = inode->i_mapping; 2525 struct page *page; 2526 unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >> 2527 PAGE_CACHE_SHIFT; 2528 2529 struct extent_page_data epd = { 2530 .bio = NULL, 2531 .tree = tree, 2532 .get_extent = get_extent, 2533 .extent_locked = 1, 2534 .sync_io = mode == WB_SYNC_ALL, 2535 }; 2536 struct writeback_control wbc_writepages = { 2537 .sync_mode = mode, 2538 .nr_to_write = nr_pages * 2, 2539 .range_start = start, 2540 .range_end = end + 1, 2541 }; 2542 2543 while (start <= end) { 2544 page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); 2545 if (clear_page_dirty_for_io(page)) 2546 ret = __extent_writepage(page, &wbc_writepages, &epd); 2547 else { 2548 if (tree->ops && tree->ops->writepage_end_io_hook) 2549 tree->ops->writepage_end_io_hook(page, start, 2550 start + PAGE_CACHE_SIZE - 1, 2551 NULL, 1); 2552 unlock_page(page); 2553 } 2554 page_cache_release(page); 2555 start += PAGE_CACHE_SIZE; 2556 } 2557 2558 flush_epd_write_bio(&epd); 2559 return ret; 2560 } 2561 2562 int extent_writepages(struct extent_io_tree *tree, 2563 struct address_space *mapping, 2564 get_extent_t *get_extent, 2565 struct writeback_control *wbc) 2566 { 2567 int ret = 0; 2568 struct extent_page_data epd = { 2569 .bio = NULL, 2570 .tree = tree, 2571 .get_extent = get_extent, 2572 .extent_locked = 0, 2573 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 2574 }; 2575 2576 ret = extent_write_cache_pages(tree, mapping, wbc, 2577 __extent_writepage, &epd, 2578 flush_write_bio); 2579 flush_epd_write_bio(&epd); 2580 return ret; 2581 } 2582 2583 int extent_readpages(struct extent_io_tree *tree, 2584 struct address_space *mapping, 2585 struct list_head *pages, unsigned nr_pages, 2586 get_extent_t get_extent) 2587 { 2588 struct bio *bio = NULL; 2589 unsigned page_idx; 2590 unsigned long bio_flags = 0; 2591 2592 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 2593 struct page *page = list_entry(pages->prev, struct page, lru); 2594 2595 prefetchw(&page->flags); 2596 list_del(&page->lru); 2597 if 
(!add_to_page_cache_lru(page, mapping, 2598 page->index, GFP_NOFS)) { 2599 __extent_read_full_page(tree, page, get_extent, 2600 &bio, 0, &bio_flags); 2601 } 2602 page_cache_release(page); 2603 } 2604 BUG_ON(!list_empty(pages)); 2605 if (bio) 2606 submit_one_bio(READ, bio, 0, bio_flags); 2607 return 0; 2608 } 2609 2610 /* 2611 * basic invalidatepage code, this waits on any locked or writeback 2612 * ranges corresponding to the page, and then deletes any extent state 2613 * records from the tree 2614 */ 2615 int extent_invalidatepage(struct extent_io_tree *tree, 2616 struct page *page, unsigned long offset) 2617 { 2618 struct extent_state *cached_state = NULL; 2619 u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); 2620 u64 end = start + PAGE_CACHE_SIZE - 1; 2621 size_t blocksize = page->mapping->host->i_sb->s_blocksize; 2622 2623 start += (offset + blocksize - 1) & ~(blocksize - 1); 2624 if (start > end) 2625 return 0; 2626 2627 lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS); 2628 wait_on_page_writeback(page); 2629 clear_extent_bit(tree, start, end, 2630 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 2631 EXTENT_DO_ACCOUNTING, 2632 1, 1, &cached_state, GFP_NOFS); 2633 return 0; 2634 } 2635 2636 /* 2637 * a helper for releasepage, this tests for areas of the page that 2638 * are locked or under IO and drops the related state bits if it is safe 2639 * to drop the page. 2640 */ 2641 int try_release_extent_state(struct extent_map_tree *map, 2642 struct extent_io_tree *tree, struct page *page, 2643 gfp_t mask) 2644 { 2645 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 2646 u64 end = start + PAGE_CACHE_SIZE - 1; 2647 int ret = 1; 2648 2649 if (test_range_bit(tree, start, end, 2650 EXTENT_IOBITS, 0, NULL)) 2651 ret = 0; 2652 else { 2653 if ((mask & GFP_NOFS) == GFP_NOFS) 2654 mask = GFP_NOFS; 2655 /* 2656 * at this point we can safely clear everything except the 2657 * locked bit and the nodatasum bit 2658 */ 2659 ret = clear_extent_bit(tree, start, end, 2660 ~(EXTENT_LOCKED | EXTENT_NODATASUM), 2661 0, 0, NULL, mask); 2662 2663 /* if clear_extent_bit failed for enomem reasons, 2664 * we can't allow the release to continue. 2665 */ 2666 if (ret < 0) 2667 ret = 0; 2668 else 2669 ret = 1; 2670 } 2671 return ret; 2672 } 2673 2674 /* 2675 * a helper for releasepage. 
As long as there are no locked extents 2676 * in the range corresponding to the page, both state records and extent 2677 * map records are removed 2678 */ 2679 int try_release_extent_mapping(struct extent_map_tree *map, 2680 struct extent_io_tree *tree, struct page *page, 2681 gfp_t mask) 2682 { 2683 struct extent_map *em; 2684 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 2685 u64 end = start + PAGE_CACHE_SIZE - 1; 2686 2687 if ((mask & __GFP_WAIT) && 2688 page->mapping->host->i_size > 16 * 1024 * 1024) { 2689 u64 len; 2690 while (start <= end) { 2691 len = end - start + 1; 2692 write_lock(&map->lock); 2693 em = lookup_extent_mapping(map, start, len); 2694 if (IS_ERR_OR_NULL(em)) { 2695 write_unlock(&map->lock); 2696 break; 2697 } 2698 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || 2699 em->start != start) { 2700 write_unlock(&map->lock); 2701 free_extent_map(em); 2702 break; 2703 } 2704 if (!test_range_bit(tree, em->start, 2705 extent_map_end(em) - 1, 2706 EXTENT_LOCKED | EXTENT_WRITEBACK, 2707 0, NULL)) { 2708 remove_extent_mapping(map, em); 2709 /* once for the rb tree */ 2710 free_extent_map(em); 2711 } 2712 start = extent_map_end(em); 2713 write_unlock(&map->lock); 2714 2715 /* once for us */ 2716 free_extent_map(em); 2717 } 2718 } 2719 return try_release_extent_state(map, tree, page, mask); 2720 } 2721 2722 /* 2723 * helper function for fiemap, which doesn't want to see any holes. 2724 * This maps until we find something past 'last' 2725 */ 2726 static struct extent_map *get_extent_skip_holes(struct inode *inode, 2727 u64 offset, 2728 u64 last, 2729 get_extent_t *get_extent) 2730 { 2731 u64 sectorsize = BTRFS_I(inode)->root->sectorsize; 2732 struct extent_map *em; 2733 u64 len; 2734 2735 if (offset >= last) 2736 return NULL; 2737 2738 while(1) { 2739 len = last - offset; 2740 if (len == 0) 2741 break; 2742 len = (len + sectorsize - 1) & ~(sectorsize - 1); 2743 em = get_extent(inode, NULL, 0, offset, len, 0); 2744 if (IS_ERR_OR_NULL(em)) 2745 return em; 2746 2747 /* if this isn't a hole return it */ 2748 if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) && 2749 em->block_start != EXTENT_MAP_HOLE) { 2750 return em; 2751 } 2752 2753 /* this is a hole, advance to the next extent */ 2754 offset = extent_map_end(em); 2755 free_extent_map(em); 2756 if (offset >= last) 2757 break; 2758 } 2759 return NULL; 2760 } 2761 2762 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 2763 __u64 start, __u64 len, get_extent_t *get_extent) 2764 { 2765 int ret = 0; 2766 u64 off = start; 2767 u64 max = start + len; 2768 u32 flags = 0; 2769 u32 found_type; 2770 u64 last; 2771 u64 last_for_get_extent = 0; 2772 u64 disko = 0; 2773 u64 isize = i_size_read(inode); 2774 struct btrfs_key found_key; 2775 struct extent_map *em = NULL; 2776 struct extent_state *cached_state = NULL; 2777 struct btrfs_path *path; 2778 struct btrfs_file_extent_item *item; 2779 int end = 0; 2780 u64 em_start = 0; 2781 u64 em_len = 0; 2782 u64 em_end = 0; 2783 unsigned long emflags; 2784 2785 if (len == 0) 2786 return -EINVAL; 2787 2788 path = btrfs_alloc_path(); 2789 if (!path) 2790 return -ENOMEM; 2791 path->leave_spinning = 1; 2792 2793 /* 2794 * lookup the last file extent. 
We're not using i_size here
 * because there might be preallocation past i_size
 */
	ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
				       path, btrfs_ino(inode), -1, 0);
	if (ret < 0) {
		btrfs_free_path(path);
		return ret;
	}
	WARN_ON(!ret);
	path->slots[0]--;
	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
			      struct btrfs_file_extent_item);
	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
	found_type = btrfs_key_type(&found_key);

	/* No extents, but there might be delalloc bits */
	if (found_key.objectid != btrfs_ino(inode) ||
	    found_type != BTRFS_EXTENT_DATA_KEY) {
		/* have to trust i_size as the end */
		last = (u64)-1;
		last_for_get_extent = isize;
	} else {
		/*
		 * remember the start of the last extent.  There are a
		 * bunch of different factors that go into the length of
		 * the extent, so it's much less complex to remember where
		 * it started
		 */
		last = found_key.offset;
		last_for_get_extent = last + 1;
	}
	btrfs_free_path(path);

	/*
	 * we might have some extents allocated but more delalloc past those
	 * extents.  so, we trust isize unless the start of the last extent is
	 * beyond isize
	 */
	if (last < isize) {
		last = (u64)-1;
		last_for_get_extent = isize;
	}

	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
			 &cached_state, GFP_NOFS);

	em = get_extent_skip_holes(inode, off, last_for_get_extent,
				   get_extent);
	if (!em)
		goto out;
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto out;
	}

	while (!end) {
		u64 offset_in_extent;

		/* break if the extent we found is outside the range */
		if (em->start >= max || extent_map_end(em) < off)
			break;

		/*
		 * get_extent may return an extent that starts before our
		 * requested range.  We have to make sure the ranges
		 * we return to fiemap always move forward and don't
		 * overlap, so adjust the offsets here
		 */
		em_start = max(em->start, off);

		/*
		 * record the offset from the start of the extent
		 * for adjusting the disk offset below
		 */
		offset_in_extent = em_start - em->start;
		em_end = extent_map_end(em);
		em_len = em_end - em_start;
		emflags = em->flags;
		disko = 0;
		flags = 0;

		/*
		 * bump off for our next call to get_extent
		 */
		off = extent_map_end(em);
		if (off >= max)
			end = 1;

		if (em->block_start == EXTENT_MAP_LAST_BYTE) {
			end = 1;
			flags |= FIEMAP_EXTENT_LAST;
		} else if (em->block_start == EXTENT_MAP_INLINE) {
			flags |= (FIEMAP_EXTENT_DATA_INLINE |
				  FIEMAP_EXTENT_NOT_ALIGNED);
		} else if (em->block_start == EXTENT_MAP_DELALLOC) {
			flags |= (FIEMAP_EXTENT_DELALLOC |
				  FIEMAP_EXTENT_UNKNOWN);
		} else {
			disko = em->block_start + offset_in_extent;
		}
		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
			flags |= FIEMAP_EXTENT_ENCODED;

		free_extent_map(em);
		em = NULL;
		if ((em_start >= last) || em_len == (u64)-1 ||
		   (last == (u64)-1 && isize <= em_end)) {
			flags |= FIEMAP_EXTENT_LAST;
			end = 1;
		}

		/* now scan forward to see if this is really the last extent.
*/ 2906 em = get_extent_skip_holes(inode, off, last_for_get_extent, 2907 get_extent); 2908 if (IS_ERR(em)) { 2909 ret = PTR_ERR(em); 2910 goto out; 2911 } 2912 if (!em) { 2913 flags |= FIEMAP_EXTENT_LAST; 2914 end = 1; 2915 } 2916 ret = fiemap_fill_next_extent(fieinfo, em_start, disko, 2917 em_len, flags); 2918 if (ret) 2919 goto out_free; 2920 } 2921 out_free: 2922 free_extent_map(em); 2923 out: 2924 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len, 2925 &cached_state, GFP_NOFS); 2926 return ret; 2927 } 2928 2929 static inline struct page *extent_buffer_page(struct extent_buffer *eb, 2930 unsigned long i) 2931 { 2932 struct page *p; 2933 struct address_space *mapping; 2934 2935 if (i == 0) 2936 return eb->first_page; 2937 i += eb->start >> PAGE_CACHE_SHIFT; 2938 mapping = eb->first_page->mapping; 2939 if (!mapping) 2940 return NULL; 2941 2942 /* 2943 * extent_buffer_page is only called after pinning the page 2944 * by increasing the reference count. So we know the page must 2945 * be in the radix tree. 2946 */ 2947 rcu_read_lock(); 2948 p = radix_tree_lookup(&mapping->page_tree, i); 2949 rcu_read_unlock(); 2950 2951 return p; 2952 } 2953 2954 static inline unsigned long num_extent_pages(u64 start, u64 len) 2955 { 2956 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - 2957 (start >> PAGE_CACHE_SHIFT); 2958 } 2959 2960 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, 2961 u64 start, 2962 unsigned long len, 2963 gfp_t mask) 2964 { 2965 struct extent_buffer *eb = NULL; 2966 #if LEAK_DEBUG 2967 unsigned long flags; 2968 #endif 2969 2970 eb = kmem_cache_zalloc(extent_buffer_cache, mask); 2971 if (eb == NULL) 2972 return NULL; 2973 eb->start = start; 2974 eb->len = len; 2975 rwlock_init(&eb->lock); 2976 atomic_set(&eb->write_locks, 0); 2977 atomic_set(&eb->read_locks, 0); 2978 atomic_set(&eb->blocking_readers, 0); 2979 atomic_set(&eb->blocking_writers, 0); 2980 atomic_set(&eb->spinning_readers, 0); 2981 atomic_set(&eb->spinning_writers, 0); 2982 init_waitqueue_head(&eb->write_lock_wq); 2983 init_waitqueue_head(&eb->read_lock_wq); 2984 2985 #if LEAK_DEBUG 2986 spin_lock_irqsave(&leak_lock, flags); 2987 list_add(&eb->leak_list, &buffers); 2988 spin_unlock_irqrestore(&leak_lock, flags); 2989 #endif 2990 atomic_set(&eb->refs, 1); 2991 2992 return eb; 2993 } 2994 2995 static void __free_extent_buffer(struct extent_buffer *eb) 2996 { 2997 #if LEAK_DEBUG 2998 unsigned long flags; 2999 spin_lock_irqsave(&leak_lock, flags); 3000 list_del(&eb->leak_list); 3001 spin_unlock_irqrestore(&leak_lock, flags); 3002 #endif 3003 kmem_cache_free(extent_buffer_cache, eb); 3004 } 3005 3006 /* 3007 * Helper for releasing extent buffer page. 3008 */ 3009 static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, 3010 unsigned long start_idx) 3011 { 3012 unsigned long index; 3013 struct page *page; 3014 3015 if (!eb->first_page) 3016 return; 3017 3018 index = num_extent_pages(eb->start, eb->len); 3019 if (start_idx >= index) 3020 return; 3021 3022 do { 3023 index--; 3024 page = extent_buffer_page(eb, index); 3025 if (page) 3026 page_cache_release(page); 3027 } while (index != start_idx); 3028 } 3029 3030 /* 3031 * Helper for releasing the extent buffer. 
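 * Drops the page references taken in alloc_extent_buffer() and then
 * frees the struct extent_buffer itself.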
 */
static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
{
	btrfs_release_extent_buffer_page(eb, 0);
	__free_extent_buffer(eb);
}

struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
					  u64 start, unsigned long len,
					  struct page *page0)
{
	unsigned long num_pages = num_extent_pages(start, len);
	unsigned long i;
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	struct extent_buffer *eb;
	struct extent_buffer *exists = NULL;
	struct page *p;
	struct address_space *mapping = tree->mapping;
	int uptodate = 1;
	int ret;

	rcu_read_lock();
	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
	if (eb && atomic_inc_not_zero(&eb->refs)) {
		rcu_read_unlock();
		mark_page_accessed(eb->first_page);
		return eb;
	}
	rcu_read_unlock();

	eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS);
	if (!eb)
		return NULL;

	if (page0) {
		eb->first_page = page0;
		i = 1;
		index++;
		page_cache_get(page0);
		mark_page_accessed(page0);
		set_page_extent_mapped(page0);
		set_page_extent_head(page0, len);
		uptodate = PageUptodate(page0);
	} else {
		i = 0;
	}
	for (; i < num_pages; i++, index++) {
		p = find_or_create_page(mapping, index, GFP_NOFS);
		if (!p) {
			WARN_ON(1);
			goto free_eb;
		}
		set_page_extent_mapped(p);
		mark_page_accessed(p);
		if (i == 0) {
			eb->first_page = p;
			set_page_extent_head(p, len);
		} else {
			set_page_private(p, EXTENT_PAGE_PRIVATE);
		}
		if (!PageUptodate(p))
			uptodate = 0;

		/*
		 * see below about how we avoid a nasty race with release page
		 * and why we unlock later
		 */
		if (i != 0)
			unlock_page(p);
	}
	if (uptodate)
		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);

	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
	if (ret)
		goto free_eb;

	spin_lock(&tree->buffer_lock);
	ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
	if (ret == -EEXIST) {
		exists = radix_tree_lookup(&tree->buffer,
					   start >> PAGE_CACHE_SHIFT);
		/* add one reference for the caller */
		atomic_inc(&exists->refs);
		spin_unlock(&tree->buffer_lock);
		radix_tree_preload_end();
		goto free_eb;
	}
	/* add one reference for the tree */
	atomic_inc(&eb->refs);
	spin_unlock(&tree->buffer_lock);
	radix_tree_preload_end();

	/*
	 * there is a race where release page may have
	 * tried to find this extent buffer in the radix
	 * but failed.  It will tell the VM it is safe to
	 * reclaim the page, and it will clear the page private bit.
3130 * We must make sure to set the page private bit properly 3131 * after the extent buffer is in the radix tree so 3132 * it doesn't get lost 3133 */ 3134 set_page_extent_mapped(eb->first_page); 3135 set_page_extent_head(eb->first_page, eb->len); 3136 if (!page0) 3137 unlock_page(eb->first_page); 3138 return eb; 3139 3140 free_eb: 3141 if (eb->first_page && !page0) 3142 unlock_page(eb->first_page); 3143 3144 if (!atomic_dec_and_test(&eb->refs)) 3145 return exists; 3146 btrfs_release_extent_buffer(eb); 3147 return exists; 3148 } 3149 3150 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, 3151 u64 start, unsigned long len) 3152 { 3153 struct extent_buffer *eb; 3154 3155 rcu_read_lock(); 3156 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); 3157 if (eb && atomic_inc_not_zero(&eb->refs)) { 3158 rcu_read_unlock(); 3159 mark_page_accessed(eb->first_page); 3160 return eb; 3161 } 3162 rcu_read_unlock(); 3163 3164 return NULL; 3165 } 3166 3167 void free_extent_buffer(struct extent_buffer *eb) 3168 { 3169 if (!eb) 3170 return; 3171 3172 if (!atomic_dec_and_test(&eb->refs)) 3173 return; 3174 3175 WARN_ON(1); 3176 } 3177 3178 int clear_extent_buffer_dirty(struct extent_io_tree *tree, 3179 struct extent_buffer *eb) 3180 { 3181 unsigned long i; 3182 unsigned long num_pages; 3183 struct page *page; 3184 3185 num_pages = num_extent_pages(eb->start, eb->len); 3186 3187 for (i = 0; i < num_pages; i++) { 3188 page = extent_buffer_page(eb, i); 3189 if (!PageDirty(page)) 3190 continue; 3191 3192 lock_page(page); 3193 WARN_ON(!PagePrivate(page)); 3194 3195 set_page_extent_mapped(page); 3196 if (i == 0) 3197 set_page_extent_head(page, eb->len); 3198 3199 clear_page_dirty_for_io(page); 3200 spin_lock_irq(&page->mapping->tree_lock); 3201 if (!PageDirty(page)) { 3202 radix_tree_tag_clear(&page->mapping->page_tree, 3203 page_index(page), 3204 PAGECACHE_TAG_DIRTY); 3205 } 3206 spin_unlock_irq(&page->mapping->tree_lock); 3207 unlock_page(page); 3208 } 3209 return 0; 3210 } 3211 3212 int set_extent_buffer_dirty(struct extent_io_tree *tree, 3213 struct extent_buffer *eb) 3214 { 3215 unsigned long i; 3216 unsigned long num_pages; 3217 int was_dirty = 0; 3218 3219 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); 3220 num_pages = num_extent_pages(eb->start, eb->len); 3221 for (i = 0; i < num_pages; i++) 3222 __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); 3223 return was_dirty; 3224 } 3225 3226 static int __eb_straddles_pages(u64 start, u64 len) 3227 { 3228 if (len < PAGE_CACHE_SIZE) 3229 return 1; 3230 if (start & (PAGE_CACHE_SIZE - 1)) 3231 return 1; 3232 if ((start + len) & (PAGE_CACHE_SIZE - 1)) 3233 return 1; 3234 return 0; 3235 } 3236 3237 static int eb_straddles_pages(struct extent_buffer *eb) 3238 { 3239 return __eb_straddles_pages(eb->start, eb->len); 3240 } 3241 3242 int clear_extent_buffer_uptodate(struct extent_io_tree *tree, 3243 struct extent_buffer *eb, 3244 struct extent_state **cached_state) 3245 { 3246 unsigned long i; 3247 struct page *page; 3248 unsigned long num_pages; 3249 3250 num_pages = num_extent_pages(eb->start, eb->len); 3251 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3252 3253 if (eb_straddles_pages(eb)) { 3254 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3255 cached_state, GFP_NOFS); 3256 } 3257 for (i = 0; i < num_pages; i++) { 3258 page = extent_buffer_page(eb, i); 3259 if (page) 3260 ClearPageUptodate(page); 3261 } 3262 return 0; 3263 } 3264 3265 int set_extent_buffer_uptodate(struct extent_io_tree *tree, 
3266 struct extent_buffer *eb) 3267 { 3268 unsigned long i; 3269 struct page *page; 3270 unsigned long num_pages; 3271 3272 num_pages = num_extent_pages(eb->start, eb->len); 3273 3274 if (eb_straddles_pages(eb)) { 3275 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3276 NULL, GFP_NOFS); 3277 } 3278 for (i = 0; i < num_pages; i++) { 3279 page = extent_buffer_page(eb, i); 3280 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || 3281 ((i == num_pages - 1) && 3282 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { 3283 check_page_uptodate(tree, page); 3284 continue; 3285 } 3286 SetPageUptodate(page); 3287 } 3288 return 0; 3289 } 3290 3291 int extent_range_uptodate(struct extent_io_tree *tree, 3292 u64 start, u64 end) 3293 { 3294 struct page *page; 3295 int ret; 3296 int pg_uptodate = 1; 3297 int uptodate; 3298 unsigned long index; 3299 3300 if (__eb_straddles_pages(start, end - start + 1)) { 3301 ret = test_range_bit(tree, start, end, 3302 EXTENT_UPTODATE, 1, NULL); 3303 if (ret) 3304 return 1; 3305 } 3306 while (start <= end) { 3307 index = start >> PAGE_CACHE_SHIFT; 3308 page = find_get_page(tree->mapping, index); 3309 uptodate = PageUptodate(page); 3310 page_cache_release(page); 3311 if (!uptodate) { 3312 pg_uptodate = 0; 3313 break; 3314 } 3315 start += PAGE_CACHE_SIZE; 3316 } 3317 return pg_uptodate; 3318 } 3319 3320 int extent_buffer_uptodate(struct extent_io_tree *tree, 3321 struct extent_buffer *eb, 3322 struct extent_state *cached_state) 3323 { 3324 int ret = 0; 3325 unsigned long num_pages; 3326 unsigned long i; 3327 struct page *page; 3328 int pg_uptodate = 1; 3329 3330 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 3331 return 1; 3332 3333 if (eb_straddles_pages(eb)) { 3334 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3335 EXTENT_UPTODATE, 1, cached_state); 3336 if (ret) 3337 return ret; 3338 } 3339 3340 num_pages = num_extent_pages(eb->start, eb->len); 3341 for (i = 0; i < num_pages; i++) { 3342 page = extent_buffer_page(eb, i); 3343 if (!PageUptodate(page)) { 3344 pg_uptodate = 0; 3345 break; 3346 } 3347 } 3348 return pg_uptodate; 3349 } 3350 3351 int read_extent_buffer_pages(struct extent_io_tree *tree, 3352 struct extent_buffer *eb, 3353 u64 start, int wait, 3354 get_extent_t *get_extent, int mirror_num) 3355 { 3356 unsigned long i; 3357 unsigned long start_i; 3358 struct page *page; 3359 int err; 3360 int ret = 0; 3361 int locked_pages = 0; 3362 int all_uptodate = 1; 3363 int inc_all_pages = 0; 3364 unsigned long num_pages; 3365 struct bio *bio = NULL; 3366 unsigned long bio_flags = 0; 3367 3368 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 3369 return 0; 3370 3371 if (eb_straddles_pages(eb)) { 3372 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3373 EXTENT_UPTODATE, 1, NULL)) { 3374 return 0; 3375 } 3376 } 3377 3378 if (start) { 3379 WARN_ON(start < eb->start); 3380 start_i = (start >> PAGE_CACHE_SHIFT) - 3381 (eb->start >> PAGE_CACHE_SHIFT); 3382 } else { 3383 start_i = 0; 3384 } 3385 3386 num_pages = num_extent_pages(eb->start, eb->len); 3387 for (i = start_i; i < num_pages; i++) { 3388 page = extent_buffer_page(eb, i); 3389 if (!wait) { 3390 if (!trylock_page(page)) 3391 goto unlock_exit; 3392 } else { 3393 lock_page(page); 3394 } 3395 locked_pages++; 3396 if (!PageUptodate(page)) 3397 all_uptodate = 0; 3398 } 3399 if (all_uptodate) { 3400 if (start_i == 0) 3401 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3402 goto unlock_exit; 3403 } 3404 3405 for (i = start_i; i < num_pages; i++) { 3406 page = 
extent_buffer_page(eb, i); 3407 3408 WARN_ON(!PagePrivate(page)); 3409 3410 set_page_extent_mapped(page); 3411 if (i == 0) 3412 set_page_extent_head(page, eb->len); 3413 3414 if (inc_all_pages) 3415 page_cache_get(page); 3416 if (!PageUptodate(page)) { 3417 if (start_i == 0) 3418 inc_all_pages = 1; 3419 ClearPageError(page); 3420 err = __extent_read_full_page(tree, page, 3421 get_extent, &bio, 3422 mirror_num, &bio_flags); 3423 if (err) 3424 ret = err; 3425 } else { 3426 unlock_page(page); 3427 } 3428 } 3429 3430 if (bio) 3431 submit_one_bio(READ, bio, mirror_num, bio_flags); 3432 3433 if (ret || !wait) 3434 return ret; 3435 3436 for (i = start_i; i < num_pages; i++) { 3437 page = extent_buffer_page(eb, i); 3438 wait_on_page_locked(page); 3439 if (!PageUptodate(page)) 3440 ret = -EIO; 3441 } 3442 3443 if (!ret) 3444 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3445 return ret; 3446 3447 unlock_exit: 3448 i = start_i; 3449 while (locked_pages > 0) { 3450 page = extent_buffer_page(eb, i); 3451 i++; 3452 unlock_page(page); 3453 locked_pages--; 3454 } 3455 return ret; 3456 } 3457 3458 void read_extent_buffer(struct extent_buffer *eb, void *dstv, 3459 unsigned long start, 3460 unsigned long len) 3461 { 3462 size_t cur; 3463 size_t offset; 3464 struct page *page; 3465 char *kaddr; 3466 char *dst = (char *)dstv; 3467 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3468 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3469 3470 WARN_ON(start > eb->len); 3471 WARN_ON(start + len > eb->start + eb->len); 3472 3473 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 3474 3475 while (len > 0) { 3476 page = extent_buffer_page(eb, i); 3477 3478 cur = min(len, (PAGE_CACHE_SIZE - offset)); 3479 kaddr = page_address(page); 3480 memcpy(dst, kaddr + offset, cur); 3481 3482 dst += cur; 3483 len -= cur; 3484 offset = 0; 3485 i++; 3486 } 3487 } 3488 3489 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, 3490 unsigned long min_len, char **map, 3491 unsigned long *map_start, 3492 unsigned long *map_len) 3493 { 3494 size_t offset = start & (PAGE_CACHE_SIZE - 1); 3495 char *kaddr; 3496 struct page *p; 3497 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3498 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3499 unsigned long end_i = (start_offset + start + min_len - 1) >> 3500 PAGE_CACHE_SHIFT; 3501 3502 if (i != end_i) 3503 return -EINVAL; 3504 3505 if (i == 0) { 3506 offset = start_offset; 3507 *map_start = 0; 3508 } else { 3509 offset = 0; 3510 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset; 3511 } 3512 3513 if (start + min_len > eb->len) { 3514 printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " 3515 "wanted %lu %lu\n", (unsigned long long)eb->start, 3516 eb->len, start, min_len); 3517 WARN_ON(1); 3518 return -EINVAL; 3519 } 3520 3521 p = extent_buffer_page(eb, i); 3522 kaddr = page_address(p); 3523 *map = kaddr + offset; 3524 *map_len = PAGE_CACHE_SIZE - offset; 3525 return 0; 3526 } 3527 3528 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, 3529 unsigned long start, 3530 unsigned long len) 3531 { 3532 size_t cur; 3533 size_t offset; 3534 struct page *page; 3535 char *kaddr; 3536 char *ptr = (char *)ptrv; 3537 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3538 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3539 int ret = 0; 3540 3541 WARN_ON(start > eb->len); 3542 WARN_ON(start + len > eb->start + eb->len); 3543 3544 offset = (start_offset + 
start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 3545 3546 while (len > 0) { 3547 page = extent_buffer_page(eb, i); 3548 3549 cur = min(len, (PAGE_CACHE_SIZE - offset)); 3550 3551 kaddr = page_address(page); 3552 ret = memcmp(ptr, kaddr + offset, cur); 3553 if (ret) 3554 break; 3555 3556 ptr += cur; 3557 len -= cur; 3558 offset = 0; 3559 i++; 3560 } 3561 return ret; 3562 } 3563 3564 void write_extent_buffer(struct extent_buffer *eb, const void *srcv, 3565 unsigned long start, unsigned long len) 3566 { 3567 size_t cur; 3568 size_t offset; 3569 struct page *page; 3570 char *kaddr; 3571 char *src = (char *)srcv; 3572 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3573 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3574 3575 WARN_ON(start > eb->len); 3576 WARN_ON(start + len > eb->start + eb->len); 3577 3578 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 3579 3580 while (len > 0) { 3581 page = extent_buffer_page(eb, i); 3582 WARN_ON(!PageUptodate(page)); 3583 3584 cur = min(len, PAGE_CACHE_SIZE - offset); 3585 kaddr = page_address(page); 3586 memcpy(kaddr + offset, src, cur); 3587 3588 src += cur; 3589 len -= cur; 3590 offset = 0; 3591 i++; 3592 } 3593 } 3594 3595 void memset_extent_buffer(struct extent_buffer *eb, char c, 3596 unsigned long start, unsigned long len) 3597 { 3598 size_t cur; 3599 size_t offset; 3600 struct page *page; 3601 char *kaddr; 3602 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3603 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3604 3605 WARN_ON(start > eb->len); 3606 WARN_ON(start + len > eb->start + eb->len); 3607 3608 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 3609 3610 while (len > 0) { 3611 page = extent_buffer_page(eb, i); 3612 WARN_ON(!PageUptodate(page)); 3613 3614 cur = min(len, PAGE_CACHE_SIZE - offset); 3615 kaddr = page_address(page); 3616 memset(kaddr + offset, c, cur); 3617 3618 len -= cur; 3619 offset = 0; 3620 i++; 3621 } 3622 } 3623 3624 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, 3625 unsigned long dst_offset, unsigned long src_offset, 3626 unsigned long len) 3627 { 3628 u64 dst_len = dst->len; 3629 size_t cur; 3630 size_t offset; 3631 struct page *page; 3632 char *kaddr; 3633 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); 3634 unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; 3635 3636 WARN_ON(src->len != dst_len); 3637 3638 offset = (start_offset + dst_offset) & 3639 ((unsigned long)PAGE_CACHE_SIZE - 1); 3640 3641 while (len > 0) { 3642 page = extent_buffer_page(dst, i); 3643 WARN_ON(!PageUptodate(page)); 3644 3645 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); 3646 3647 kaddr = page_address(page); 3648 read_extent_buffer(src, kaddr + offset, src_offset, cur); 3649 3650 src_offset += cur; 3651 len -= cur; 3652 offset = 0; 3653 i++; 3654 } 3655 } 3656 3657 static void move_pages(struct page *dst_page, struct page *src_page, 3658 unsigned long dst_off, unsigned long src_off, 3659 unsigned long len) 3660 { 3661 char *dst_kaddr = page_address(dst_page); 3662 if (dst_page == src_page) { 3663 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); 3664 } else { 3665 char *src_kaddr = page_address(src_page); 3666 char *p = dst_kaddr + dst_off + len; 3667 char *s = src_kaddr + src_off + len; 3668 3669 while (len--) 3670 *--p = *--s; 3671 } 3672 } 3673 3674 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) 3675 { 3676 unsigned 
long distance = (src > dst) ? src - dst : dst - src; 3677 return distance < len; 3678 } 3679 3680 static void copy_pages(struct page *dst_page, struct page *src_page, 3681 unsigned long dst_off, unsigned long src_off, 3682 unsigned long len) 3683 { 3684 char *dst_kaddr = page_address(dst_page); 3685 char *src_kaddr; 3686 3687 if (dst_page != src_page) { 3688 src_kaddr = page_address(src_page); 3689 } else { 3690 src_kaddr = dst_kaddr; 3691 BUG_ON(areas_overlap(src_off, dst_off, len)); 3692 } 3693 3694 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); 3695 } 3696 3697 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 3698 unsigned long src_offset, unsigned long len) 3699 { 3700 size_t cur; 3701 size_t dst_off_in_page; 3702 size_t src_off_in_page; 3703 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); 3704 unsigned long dst_i; 3705 unsigned long src_i; 3706 3707 if (src_offset + len > dst->len) { 3708 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " 3709 "len %lu dst len %lu\n", src_offset, len, dst->len); 3710 BUG_ON(1); 3711 } 3712 if (dst_offset + len > dst->len) { 3713 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " 3714 "len %lu dst len %lu\n", dst_offset, len, dst->len); 3715 BUG_ON(1); 3716 } 3717 3718 while (len > 0) { 3719 dst_off_in_page = (start_offset + dst_offset) & 3720 ((unsigned long)PAGE_CACHE_SIZE - 1); 3721 src_off_in_page = (start_offset + src_offset) & 3722 ((unsigned long)PAGE_CACHE_SIZE - 1); 3723 3724 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; 3725 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT; 3726 3727 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - 3728 src_off_in_page)); 3729 cur = min_t(unsigned long, cur, 3730 (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page)); 3731 3732 copy_pages(extent_buffer_page(dst, dst_i), 3733 extent_buffer_page(dst, src_i), 3734 dst_off_in_page, src_off_in_page, cur); 3735 3736 src_offset += cur; 3737 dst_offset += cur; 3738 len -= cur; 3739 } 3740 } 3741 3742 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 3743 unsigned long src_offset, unsigned long len) 3744 { 3745 size_t cur; 3746 size_t dst_off_in_page; 3747 size_t src_off_in_page; 3748 unsigned long dst_end = dst_offset + len - 1; 3749 unsigned long src_end = src_offset + len - 1; 3750 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); 3751 unsigned long dst_i; 3752 unsigned long src_i; 3753 3754 if (src_offset + len > dst->len) { 3755 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " 3756 "len %lu len %lu\n", src_offset, len, dst->len); 3757 BUG_ON(1); 3758 } 3759 if (dst_offset + len > dst->len) { 3760 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " 3761 "len %lu len %lu\n", dst_offset, len, dst->len); 3762 BUG_ON(1); 3763 } 3764 if (!areas_overlap(src_offset, dst_offset, len)) { 3765 memcpy_extent_buffer(dst, dst_offset, src_offset, len); 3766 return; 3767 } 3768 while (len > 0) { 3769 dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT; 3770 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT; 3771 3772 dst_off_in_page = (start_offset + dst_end) & 3773 ((unsigned long)PAGE_CACHE_SIZE - 1); 3774 src_off_in_page = (start_offset + src_end) & 3775 ((unsigned long)PAGE_CACHE_SIZE - 1); 3776 3777 cur = min_t(unsigned long, len, src_off_in_page + 1); 3778 cur = min(cur, dst_off_in_page + 1); 3779 move_pages(extent_buffer_page(dst, dst_i), 3780 extent_buffer_page(dst, src_i), 3781 dst_off_in_page - cur + 1, 3782 src_off_in_page 
- cur + 1, cur);

		dst_end -= cur;
		src_end -= cur;
		len -= cur;
	}
}

static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
{
	struct extent_buffer *eb =
			container_of(head, struct extent_buffer, rcu_head);

	btrfs_release_extent_buffer(eb);
}

int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
{
	u64 start = page_offset(page);
	struct extent_buffer *eb;
	int ret = 1;

	spin_lock(&tree->buffer_lock);
	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
	if (!eb) {
		spin_unlock(&tree->buffer_lock);
		return ret;
	}

	if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
		ret = 0;
		goto out;
	}

	/*
	 * set @eb->refs to 0 if it is currently 1 (i.e. only the tree still
	 * holds a reference), and then release the @eb.  Otherwise someone
	 * else is still using it, so back off.
	 */
	if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) {
		ret = 0;
		goto out;
	}

	radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
out:
	spin_unlock(&tree->buffer_lock);

	/* at this point we can safely release the extent buffer */
	if (atomic_read(&eb->refs) == 0)
		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
	return ret;
}
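
/*
 * Usage sketch (not part of this file): callers wire these helpers into
 * their address_space_operations through thin wrappers.  The names below
 * are illustrative only; in btrfs the real wrappers live in inode.c and
 * pass btrfs_get_extent as the get_extent callback, roughly:
 *
 *	static int foo_readpage(struct file *file, struct page *page)
 *	{
 *		struct extent_io_tree *tree;
 *
 *		tree = &BTRFS_I(page->mapping->host)->io_tree;
 *		return extent_read_full_page(tree, page, btrfs_get_extent);
 *	}
 *
 *	static int foo_writepages(struct address_space *mapping,
 *				  struct writeback_control *wbc)
 *	{
 *		struct extent_io_tree *tree;
 *
 *		tree = &BTRFS_I(mapping->host)->io_tree;
 *		return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
 *	}
 */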