#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include "extent_io.h"
#include "extent_map.h"
#include "compat.h"
#include "ctree.h"
#include "btrfs_inode.h"

static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;

static LIST_HEAD(buffers);
static LIST_HEAD(states);

#define LEAK_DEBUG 0
#if LEAK_DEBUG
static DEFINE_SPINLOCK(leak_lock);
#endif

#define BUFFER_LRU_MAX 64

struct tree_entry {
	u64 start;
	u64 end;
	struct rb_node rb_node;
};

struct extent_page_data {
	struct bio *bio;
	struct extent_io_tree *tree;
	get_extent_t *get_extent;

	/* tells writepage not to lock the state bits for this range
	 * it still does the unlocking
	 */
	unsigned int extent_locked:1;

	/* tells the submit_bio code to use a WRITE_SYNC */
	unsigned int sync_io:1;
};

int __init extent_io_init(void)
{
	extent_state_cache = kmem_cache_create("extent_state",
			sizeof(struct extent_state), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
	if (!extent_state_cache)
		return -ENOMEM;

	extent_buffer_cache = kmem_cache_create("extent_buffers",
			sizeof(struct extent_buffer), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
	if (!extent_buffer_cache)
		goto free_state_cache;
	return 0;

free_state_cache:
	kmem_cache_destroy(extent_state_cache);
	return -ENOMEM;
}

void extent_io_exit(void)
{
	struct extent_state *state;
	struct extent_buffer *eb;

	while (!list_empty(&states)) {
		state = list_entry(states.next, struct extent_state, leak_list);
		printk(KERN_ERR "btrfs state leak: start %llu end %llu "
		       "state %lu in tree %p refs %d\n",
		       (unsigned long long)state->start,
		       (unsigned long long)state->end,
		       state->state, state->tree, atomic_read(&state->refs));
		list_del(&state->leak_list);
		kmem_cache_free(extent_state_cache, state);

	}

	while (!list_empty(&buffers)) {
		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
		printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
		       "refs %d\n", (unsigned long long)eb->start,
		       eb->len, atomic_read(&eb->refs));
		list_del(&eb->leak_list);
		kmem_cache_free(extent_buffer_cache, eb);
	}
	if (extent_state_cache)
		kmem_cache_destroy(extent_state_cache);
	if (extent_buffer_cache)
		kmem_cache_destroy(extent_buffer_cache);
}

void extent_io_tree_init(struct extent_io_tree *tree,
			 struct address_space *mapping, gfp_t mask)
{
	tree->state = RB_ROOT;
	tree->buffer = RB_ROOT;
	tree->ops = NULL;
	tree->dirty_bytes = 0;
	spin_lock_init(&tree->lock);
	spin_lock_init(&tree->buffer_lock);
	tree->mapping = mapping;
}

static struct extent_state *alloc_extent_state(gfp_t mask)
{
	struct extent_state *state;
#if LEAK_DEBUG
	unsigned long flags;
#endif

	state = kmem_cache_alloc(extent_state_cache, mask);
	if (!state)
		return state;
	state->state = 0;
	state->private = 0;
	state->tree = NULL;
#if LEAK_DEBUG
	spin_lock_irqsave(&leak_lock, flags);
	list_add(&state->leak_list, &states);
	spin_unlock_irqrestore(&leak_lock, flags);
#endif
	atomic_set(&state->refs, 1);
	init_waitqueue_head(&state->wq);
	return state;
}

static void free_extent_state(struct extent_state *state)
{
	if (!state)
		return;
	if (atomic_dec_and_test(&state->refs)) {
#if LEAK_DEBUG
		unsigned long flags;
#endif
		WARN_ON(state->tree);
#if LEAK_DEBUG
		spin_lock_irqsave(&leak_lock, flags);
		list_del(&state->leak_list);
		spin_unlock_irqrestore(&leak_lock, flags);
#endif
		kmem_cache_free(extent_state_cache, state);
	}
}

static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
				   struct rb_node *node)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct tree_entry *entry;

	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct tree_entry, rb_node);

		if (offset < entry->start)
			p = &(*p)->rb_left;
		else if (offset > entry->end)
			p = &(*p)->rb_right;
		else
			return parent;
	}

	entry = rb_entry(node, struct tree_entry, rb_node);
	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
				      struct rb_node **prev_ret,
				      struct rb_node **next_ret)
{
	struct rb_root *root = &tree->state;
	struct rb_node *n = root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *orig_prev = NULL;
	struct tree_entry *entry;
	struct tree_entry *prev_entry = NULL;

	while (n) {
		entry = rb_entry(n, struct tree_entry, rb_node);
		prev = n;
		prev_entry = entry;

		if (offset < entry->start)
			n = n->rb_left;
		else if (offset > entry->end)
			n = n->rb_right;
		else
			return n;
	}

	if (prev_ret) {
		orig_prev = prev;
		while (prev && offset > prev_entry->end) {
			prev = rb_next(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*prev_ret = prev;
		prev = orig_prev;
	}

	if (next_ret) {
		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		while (prev && offset < prev_entry->start) {
			prev = rb_prev(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*next_ret = prev;
	}
	return NULL;
}

static inline struct rb_node *tree_search(struct extent_io_tree *tree,
					  u64 offset)
{
	struct rb_node *prev = NULL;
	struct rb_node *ret;

	ret = __etree_search(tree, offset, &prev, NULL);
	if (!ret)
		return prev;
	return ret;
}

static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
						u64 offset, struct rb_node *node)
{
	struct rb_root *root = &tree->buffer;
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct extent_buffer *eb;

	while (*p) {
		parent = *p;
		eb = rb_entry(parent, struct extent_buffer, rb_node);

		if (offset < eb->start)
			p = &(*p)->rb_left;
		else if (offset > eb->start)
			p = &(*p)->rb_right;
		else
			return eb;
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
					   u64 offset)
{
	struct rb_root *root = &tree->buffer;
	struct rb_node *n = root->rb_node;
	struct extent_buffer *eb;

	while (n) {
		eb = rb_entry(n, struct extent_buffer, rb_node);
		if (offset < eb->start)
			n = n->rb_left;
		else if (offset > eb->start)
			n = n->rb_right;
		else
			return eb;
	}
	return NULL;
}

static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
		     struct extent_state *other)
{
	if (tree->ops && tree->ops->merge_extent_hook)
		tree->ops->merge_extent_hook(tree->mapping->host, new,
					     other);
}

/*
 * utility function to look for merge candidates inside a given range.
 * Any extents with matching state are merged together into a single
 * extent in the tree.  Extents with EXTENT_IO in their state field
 * are not merged because the end_io handlers need to be able to do
 * operations on them without sleeping (or doing allocations/splits).
 *
 * This should be called with the tree lock held.
 */
static int merge_state(struct extent_io_tree *tree,
		       struct extent_state *state)
{
	struct extent_state *other;
	struct rb_node *other_node;

	if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
		return 0;

	other_node = rb_prev(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->end == state->start - 1 &&
		    other->state == state->state) {
			merge_cb(tree, state, other);
			state->start = other->start;
			other->tree = NULL;
			rb_erase(&other->rb_node, &tree->state);
			free_extent_state(other);
		}
	}
	other_node = rb_next(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->start == state->end + 1 &&
		    other->state == state->state) {
			merge_cb(tree, state, other);
			other->start = state->start;
			state->tree = NULL;
			rb_erase(&state->rb_node, &tree->state);
			free_extent_state(state);
			state = NULL;
		}
	}

	return 0;
}

static int set_state_cb(struct extent_io_tree *tree,
			struct extent_state *state,
			unsigned long bits)
{
	if (tree->ops && tree->ops->set_bit_hook) {
		return tree->ops->set_bit_hook(tree->mapping->host,
					       state->start, state->end,
					       state->state, bits);
	}

	return 0;
}

static void clear_state_cb(struct extent_io_tree *tree,
			   struct extent_state *state,
			   unsigned long bits)
{
	if (tree->ops && tree->ops->clear_bit_hook)
		tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
}

/*
 * insert an extent_state struct into the tree.  'bits' are set on the
 * struct before it is inserted.
 *
 * This may return -EEXIST if the extent is already there, in which case the
 * state struct is freed.
 *
 * The tree lock is not taken internally.  This is a utility function and
 * probably isn't what you want to call (see set/clear_extent_bit).
 */
static int insert_state(struct extent_io_tree *tree,
			struct extent_state *state, u64 start, u64 end,
			int bits)
{
	struct rb_node *node;
	int ret;

	if (end < start) {
		printk(KERN_ERR "btrfs end < start %llu %llu\n",
		       (unsigned long long)end,
		       (unsigned long long)start);
		WARN_ON(1);
	}
	state->start = start;
	state->end = end;
	ret = set_state_cb(tree, state, bits);
	if (ret)
		return ret;

	if (bits & EXTENT_DIRTY)
		tree->dirty_bytes += end - start + 1;
	state->state |= bits;
	node = tree_insert(&tree->state, end, &state->rb_node);
	if (node) {
		struct extent_state *found;
		found = rb_entry(node, struct extent_state, rb_node);
		printk(KERN_ERR "btrfs found node %llu %llu on insert of "
		       "%llu %llu\n", (unsigned long long)found->start,
		       (unsigned long long)found->end,
		       (unsigned long long)start, (unsigned long long)end);
		free_extent_state(state);
		return -EEXIST;
	}
	state->tree = tree;
	merge_state(tree, state);
	return 0;
}

static int split_cb(struct extent_io_tree *tree, struct extent_state *orig,
		    u64 split)
{
	if (tree->ops && tree->ops->split_extent_hook)
		return tree->ops->split_extent_hook(tree->mapping->host,
						    orig, split);
	return 0;
}

/*
 * split a given extent state struct in two, inserting the preallocated
 * struct 'prealloc' as the newly created second half.  'split' indicates an
 * offset inside 'orig' where it should be split.
 *
 * Before calling,
 * the tree has 'orig' at [orig->start, orig->end].  After calling, there
 * are two extent state structs in the tree:
 * prealloc: [orig->start, split - 1]
 * orig: [split, orig->end]
 *
 * The tree locks are not taken by this function.  They need to be held
 * by the caller.
 */
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
		       struct extent_state *prealloc, u64 split)
{
	struct rb_node *node;

	split_cb(tree, orig, split);

	prealloc->start = orig->start;
	prealloc->end = split - 1;
	prealloc->state = orig->state;
	orig->start = split;

	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
	if (node) {
		free_extent_state(prealloc);
		return -EEXIST;
	}
	prealloc->tree = tree;
	return 0;
}

/*
 * utility function to clear some bits in an extent state struct.
 * it will optionally wake up any one waiting on this state (wake == 1), or
 * forcibly remove the state from the tree (delete == 1).
 *
 * If no bits are set on the state struct after clearing things, the
 * struct is freed and removed from the tree
 */
static int clear_state_bit(struct extent_io_tree *tree,
			   struct extent_state *state, int bits, int wake,
			   int delete)
{
	int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING;
	int ret = state->state & bits_to_clear;

	if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		WARN_ON(range > tree->dirty_bytes);
		tree->dirty_bytes -= range;
	}
	clear_state_cb(tree, state, bits);
	state->state &= ~bits_to_clear;
	if (wake)
		wake_up(&state->wq);
	if (delete || state->state == 0) {
		if (state->tree) {
			clear_state_cb(tree, state, state->state);
			rb_erase(&state->rb_node, &tree->state);
			state->tree = NULL;
			free_extent_state(state);
		} else {
			WARN_ON(1);
		}
	} else {
		merge_state(tree, state);
	}
	return ret;
}

/*
 * clear some bits on a range in the tree.  This may require splitting
 * or inserting elements in the tree, so the gfp mask is used to
 * indicate which allocations or sleeping are allowed.
 *
 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 * the given range from the tree regardless of state (ie for truncate).
 *
 * the range [start, end] is inclusive.
 *
 * This takes the tree lock, and returns < 0 on error, > 0 if any of the
 * bits were already set, or zero if none of the bits were already set.
 */
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		     int bits, int wake, int delete,
		     struct extent_state **cached_state,
		     gfp_t mask)
{
	struct extent_state *state;
	struct extent_state *cached;
	struct extent_state *prealloc = NULL;
	struct rb_node *next_node;
	struct rb_node *node;
	u64 last_end;
	int err;
	int set = 0;
	int clear = 0;

	if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
		clear = 1;
again:
	if (!prealloc && (mask & __GFP_WAIT)) {
		prealloc = alloc_extent_state(mask);
		if (!prealloc)
			return -ENOMEM;
	}

	spin_lock(&tree->lock);
	if (cached_state) {
		cached = *cached_state;

		if (clear) {
			*cached_state = NULL;
			cached_state = NULL;
		}

		if (cached && cached->tree && cached->start == start) {
			if (clear)
				atomic_dec(&cached->refs);
			state = cached;
			goto hit_next;
		}
		if (clear)
			free_extent_state(cached);
	}
	/*
	 * this search will find the extents that end after
	 * our range starts
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	if (state->start > end)
		goto out;
	WARN_ON(state->end < start);
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 *  | state | or
	 *   | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip
	 * bits on second half.
	 *
	 * If the extent we found extends past our range, we
	 * just split and search again.  It'll get split again
	 * the next time though.
	 *
	 * If the extent we found is inside our range, we clear
	 * the desired bit on it.
	 */

	if (state->start < start) {
		if (!prealloc)
			prealloc = alloc_extent_state(GFP_ATOMIC);
		err = split_state(tree, state, prealloc, start);
		BUG_ON(err == -EEXIST);
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set |= clear_state_bit(tree, state, bits, wake,
					       delete);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and clear the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		if (!prealloc)
			prealloc = alloc_extent_state(GFP_ATOMIC);
		err = split_state(tree, state, prealloc, end + 1);
		BUG_ON(err == -EEXIST);
		if (wake)
			wake_up(&state->wq);

		set |= clear_state_bit(tree, prealloc, bits, wake, delete);

		prealloc = NULL;
		goto out;
	}

	if (state->end < end && prealloc && !need_resched())
		next_node = rb_next(&state->rb_node);
	else
		next_node = NULL;

	set |= clear_state_bit(tree, state, bits, wake, delete);
	if (last_end == (u64)-1)
		goto out;
	start = last_end + 1;
	if (start <= end && next_node) {
		state = rb_entry(next_node, struct extent_state,
				 rb_node);
		if (state->start == start)
			goto hit_next;
	}
	goto search_again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return set;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (mask & __GFP_WAIT)
		cond_resched();
	goto again;
}

static int wait_on_state(struct extent_io_tree *tree,
			 struct extent_state *state)
		__releases(tree->lock)
		__acquires(tree->lock)
{
	DEFINE_WAIT(wait);
	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
	spin_unlock(&tree->lock);
	schedule();
	spin_lock(&tree->lock);
	finish_wait(&state->wq, &wait);
	return 0;
}

/*
 * waits for one or more bits to clear on a range in the state tree.
 * The range [start, end] is inclusive.
 * The tree lock is taken by this function
 */
int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
{
	struct extent_state *state;
	struct rb_node *node;

	spin_lock(&tree->lock);
again:
	while (1) {
		/*
		 * this search will find all the extents that end after
		 * our range starts
		 */
		node = tree_search(tree, start);
		if (!node)
			break;

		state = rb_entry(node, struct extent_state, rb_node);

		if (state->start > end)
			goto out;

		if (state->state & bits) {
			start = state->start;
			atomic_inc(&state->refs);
			wait_on_state(tree, state);
			free_extent_state(state);
			goto again;
		}
		start = state->end + 1;

		if (start > end)
			break;

		if (need_resched()) {
			spin_unlock(&tree->lock);
			cond_resched();
			spin_lock(&tree->lock);
		}
	}
out:
	spin_unlock(&tree->lock);
	return 0;
}

static int set_state_bits(struct extent_io_tree *tree,
			  struct extent_state *state,
			  int bits)
{
	int ret;

	ret = set_state_cb(tree, state, bits);
	if (ret)
		return ret;

	if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		tree->dirty_bytes += range;
	}
	state->state |= bits;

	return 0;
}

static void cache_state(struct extent_state *state,
			struct extent_state **cached_ptr)
{
	if (cached_ptr && !(*cached_ptr)) {
		if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
			*cached_ptr = state;
			atomic_inc(&state->refs);
		}
	}
}

/*
 * set some bits on a range in the tree.  This may require allocations or
 * sleeping, so the gfp mask is used to indicate what is allowed.
 *
 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 * part of the range already has the desired bits set.  The start of the
 * existing range is returned in failed_start in this case.
 *
 * [start, end] is inclusive.  This takes the tree lock.
 */

static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
			  int bits, int exclusive_bits, u64 *failed_start,
			  struct extent_state **cached_state,
			  gfp_t mask)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	int err = 0;
	u64 last_start;
	u64 last_end;

again:
	if (!prealloc && (mask & __GFP_WAIT)) {
		prealloc = alloc_extent_state(mask);
		if (!prealloc)
			return -ENOMEM;
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start == start && state->tree) {
			node = &state->rb_node;
			goto hit_next;
		}
	}
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node) {
		err = insert_state(tree, prealloc, start, end, bits);
		prealloc = NULL;
		BUG_ON(err == -EEXIST);
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		struct rb_node *next_node;
		if (state->state & exclusive_bits) {
			*failed_start = state->start;
			err = -EEXIST;
			goto out;
		}

		err = set_state_bits(tree, state, bits);
		if (err)
			goto out;

		cache_state(state, cached_state);
		merge_state(tree, state);
		if (last_end == (u64)-1)
			goto out;

		start = last_end + 1;
		if (start < end && prealloc && !need_resched()) {
			next_node = rb_next(node);
			if (next_node) {
				state = rb_entry(next_node, struct extent_state,
						 rb_node);
				if (state->start == start)
					goto hit_next;
			}
		}
		goto search_again;
	}

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again.  It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}
		err = split_state(tree, state, prealloc, start);
		BUG_ON(err == -EEXIST);
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			err = set_state_bits(tree, state, bits);
			if (err)
				goto out;
			cache_state(state, cached_state);
			merge_state(tree, state);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;
		err = insert_state(tree, prealloc, start, this_end,
				   bits);
		BUG_ON(err == -EEXIST);
		if (err) {
			prealloc = NULL;
			goto out;
		}
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}
		err = split_state(tree, state, prealloc, end + 1);
		BUG_ON(err == -EEXIST);

		err = set_state_bits(tree, prealloc, bits);
		if (err) {
			prealloc = NULL;
			goto out;
		}
		cache_state(prealloc, cached_state);
		merge_state(tree, prealloc);
		prealloc = NULL;
		goto out;
	}

	goto search_again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (mask & __GFP_WAIT)
		cond_resched();
	goto again;
}

/* wrappers around set/clear extent bit */
int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
		     gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
			      NULL, mask);
}

int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		    int bits, gfp_t mask)
{
	return set_extent_bit(tree, start, end, bits, 0, NULL,
			      NULL, mask);
}

int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		      int bits, gfp_t mask)
{
	return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
}

int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
			struct extent_state **cached_state, gfp_t mask)
{
	return set_extent_bit(tree, start, end,
			      EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
			      0, NULL, cached_state, mask);
}

int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
		       gfp_t mask)
{
	return clear_extent_bit(tree, start, end,
				EXTENT_DIRTY | EXTENT_DELALLOC |
				EXTENT_DO_ACCOUNTING, 0, 0,
				NULL, mask);
}

int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
		   gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
			      NULL, mask);
}

static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
			    gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0,
				NULL, mask);
}

int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
			gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
			      NULL, mask);
}

static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
				 u64 end, struct extent_state **cached_state,
				 gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
				cached_state, mask);
}

int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
{
	return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
}

/*
 * either insert or lock state struct between start and end use mask to tell
 * us if waiting is desired.
 */
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		     int bits, struct extent_state **cached_state, gfp_t mask)
{
	int err;
	u64 failed_start;
	while (1) {
		err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
				     EXTENT_LOCKED, &failed_start,
				     cached_state, mask);
		if (err == -EEXIST && (mask & __GFP_WAIT)) {
			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
			start = failed_start;
		} else {
			break;
		}
		WARN_ON(start > end);
	}
	return err;
}

int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
{
	return lock_extent_bits(tree, start, end, 0, NULL, mask);
}

int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
		    gfp_t mask)
{
	int err;
	u64 failed_start;

	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
			     &failed_start, NULL, mask);
	if (err == -EEXIST) {
		if (failed_start > start)
			clear_extent_bit(tree, start, failed_start - 1,
					 EXTENT_LOCKED, 1, 0, NULL, mask);
		return 0;
	}
	return 1;
}

int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
			 struct extent_state **cached, gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
				mask);
}

int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
		  gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
				mask);
}

/*
 * helper function to set pages and extents in the tree dirty
 */
int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(tree->mapping, index);
		BUG_ON(!page);
		__set_page_dirty_nobuffers(page);
		page_cache_release(page);
		index++;
	}
	return 0;
}

/*
 * helper function to set both pages and extents in the tree writeback
 */
static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(tree->mapping, index);
		BUG_ON(!page);
		set_page_writeback(page);
		page_cache_release(page);
		index++;
	}
	return 0;
}

/*
 * find the first offset in the io tree with 'bits' set.  zero is
 * returned if we find something, and *start_ret and *end_ret are
 * set to reflect the state struct that was found.
 *
 * If nothing was found, 1 is returned, < 0 on error
 */
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
			  u64 *start_ret, u64 *end_ret, int bits)
{
	struct rb_node *node;
	struct extent_state *state;
	int ret = 1;

	spin_lock(&tree->lock);
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->end >= start && (state->state & bits)) {
			*start_ret = state->start;
			*end_ret = state->end;
			ret = 0;
			break;
		}
		node = rb_next(node);
		if (!node)
			break;
	}
out:
	spin_unlock(&tree->lock);
	return ret;
}

/* find the first state struct with 'bits' set after 'start', and
 * return it.  tree->lock must be held.  NULL will be returned if
 * nothing was found after 'start'
 */
struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
						 u64 start, int bits)
{
	struct rb_node *node;
	struct extent_state *state;

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->end >= start && (state->state & bits))
			return state;

		node = rb_next(node);
		if (!node)
			break;
	}
out:
	return NULL;
}

/*
 * find a contiguous range of bytes in the file marked as delalloc, not
 * more than 'max_bytes'.  start and end are used to return the range,
 *
 * 1 is returned if we find something, 0 if nothing was in the tree
 */
static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
					u64 *start, u64 *end, u64 max_bytes,
					struct extent_state **cached_state)
{
	struct rb_node *node;
	struct extent_state *state;
	u64 cur_start = *start;
	u64 found = 0;
	u64 total_bytes = 0;

	spin_lock(&tree->lock);

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, cur_start);
	if (!node) {
		if (!found)
			*end = (u64)-1;
		goto out;
	}

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (found && (state->start != cur_start ||
			      (state->state & EXTENT_BOUNDARY))) {
			goto out;
		}
		if (!(state->state & EXTENT_DELALLOC)) {
			if (!found)
				*end = state->end;
			goto out;
		}
		if (!found) {
			*start = state->start;
			*cached_state = state;
			atomic_inc(&state->refs);
		}
		found++;
		*end = state->end;
		cur_start = state->end + 1;
		node = rb_next(node);
		if (!node)
			break;
		total_bytes += state->end - state->start + 1;
		if (total_bytes >= max_bytes)
			break;
	}
out:
	spin_unlock(&tree->lock);
	return found;
}

static noinline int __unlock_for_delalloc(struct inode *inode,
					  struct page *locked_page,
					  u64 start, u64 end)
{
	int ret;
	struct page *pages[16];
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	unsigned long nr_pages = end_index - index + 1;
	int i;

	if (index == locked_page->index && end_index == index)
		return 0;

	while (nr_pages > 0) {
		ret = find_get_pages_contig(inode->i_mapping, index,
					    min_t(unsigned long, nr_pages,
					    ARRAY_SIZE(pages)), pages);
		for (i = 0; i < ret; i++) {
			if (pages[i] != locked_page)
				unlock_page(pages[i]);
			page_cache_release(pages[i]);
		}
		nr_pages -= ret;
		index += ret;
		cond_resched();
	}
	return 0;
}

static noinline int lock_delalloc_pages(struct inode *inode,
					struct page *locked_page,
					u64 delalloc_start,
					u64 delalloc_end)
{
	unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
	unsigned long start_index = index;
	unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
	unsigned long pages_locked = 0;
	struct page *pages[16];
	unsigned long nrpages;
	int ret;
	int i;

	/* the caller is responsible for locking the start index */
	if (index == locked_page->index && index == end_index)
		return 0;

	/* skip the page at the start index */
	nrpages = end_index - index + 1;
	while (nrpages > 0) {
		ret = find_get_pages_contig(inode->i_mapping, index,
					    min_t(unsigned long,
					    nrpages, ARRAY_SIZE(pages)), pages);
		if (ret == 0) {
			ret = -EAGAIN;
			goto done;
		}
		/* now we have an array of pages, lock them all */
		for (i = 0; i < ret; i++) {
			/*
			 * the caller is taking responsibility for
			 * locked_page
			 */
			if (pages[i] != locked_page) {
				lock_page(pages[i]);
				if (!PageDirty(pages[i]) ||
				    pages[i]->mapping != inode->i_mapping) {
					ret = -EAGAIN;
					unlock_page(pages[i]);
					page_cache_release(pages[i]);
					goto done;
				}
			}
			page_cache_release(pages[i]);
			pages_locked++;
		}
		nrpages -= ret;
		index += ret;
		cond_resched();
	}
	ret = 0;
done:
	if (ret && pages_locked) {
		__unlock_for_delalloc(inode, locked_page,
			      delalloc_start,
			      ((u64)(start_index + pages_locked - 1)) <<
			       PAGE_CACHE_SHIFT);
	}
	return ret;
}

/*
 * find a contiguous range of bytes in the file marked as delalloc, not
 * more than 'max_bytes'.  start and end are used to return the range,
 *
 * 1 is returned if we find something, 0 if nothing was in the tree
 */
static noinline u64 find_lock_delalloc_range(struct inode *inode,
					     struct extent_io_tree *tree,
					     struct page *locked_page,
					     u64 *start, u64 *end,
					     u64 max_bytes)
{
	u64 delalloc_start;
	u64 delalloc_end;
	u64 found;
	struct extent_state *cached_state = NULL;
	int ret;
	int loops = 0;

again:
	/* step one, find a bunch of delalloc bytes starting at start */
	delalloc_start = *start;
	delalloc_end = 0;
	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
				    max_bytes, &cached_state);
	if (!found || delalloc_end <= *start) {
		*start = delalloc_start;
		*end = delalloc_end;
		free_extent_state(cached_state);
		return found;
	}

	/*
	 * start comes from the offset of locked_page.  We have to lock
	 * pages in order, so we can't process delalloc bytes before
	 * locked_page
	 */
	if (delalloc_start < *start)
		delalloc_start = *start;

	/*
	 * make sure to limit the number of pages we try to lock down
	 * if we're looping.
	 */
	if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
		delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;

	/* step two, lock all the pages after the page that has start */
	ret = lock_delalloc_pages(inode, locked_page,
				  delalloc_start, delalloc_end);
	if (ret == -EAGAIN) {
		/* some of the pages are gone, lets avoid looping by
		 * shortening the size of the delalloc range we're searching
		 */
		free_extent_state(cached_state);
		if (!loops) {
			unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
			max_bytes = PAGE_CACHE_SIZE - offset;
			loops = 1;
			goto again;
		} else {
			found = 0;
			goto out_failed;
		}
	}
	BUG_ON(ret);

	/* step three, lock the state bits for the whole range */
	lock_extent_bits(tree, delalloc_start, delalloc_end,
			 0, &cached_state, GFP_NOFS);

	/* then test to make sure it is all still delalloc */
	ret = test_range_bit(tree, delalloc_start, delalloc_end,
			     EXTENT_DELALLOC, 1, cached_state);
	if (!ret) {
		unlock_extent_cached(tree, delalloc_start, delalloc_end,
				     &cached_state, GFP_NOFS);
		__unlock_for_delalloc(inode, locked_page,
				      delalloc_start, delalloc_end);
		cond_resched();
		goto again;
	}
	free_extent_state(cached_state);
	*start = delalloc_start;
	*end = delalloc_end;
out_failed:
	return found;
}

int extent_clear_unlock_delalloc(struct inode *inode,
				 struct extent_io_tree *tree,
				 u64 start, u64 end, struct page *locked_page,
				 unsigned long op)
{
	int ret;
	struct page *pages[16];
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	unsigned long nr_pages = end_index - index + 1;
	int i;
	int clear_bits = 0;

	if (op & EXTENT_CLEAR_UNLOCK)
		clear_bits |= EXTENT_LOCKED;
	if (op & EXTENT_CLEAR_DIRTY)
		clear_bits |= EXTENT_DIRTY;

	if (op & EXTENT_CLEAR_DELALLOC)
		clear_bits |= EXTENT_DELALLOC;

	if (op & EXTENT_CLEAR_ACCOUNTING)
		clear_bits |= EXTENT_DO_ACCOUNTING;

	clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
	if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
		    EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
		    EXTENT_SET_PRIVATE2)))
		return 0;

	while (nr_pages > 0) {
		ret = find_get_pages_contig(inode->i_mapping, index,
				     min_t(unsigned long,
				     nr_pages, ARRAY_SIZE(pages)), pages);
		for (i = 0; i < ret; i++) {

			if (op & EXTENT_SET_PRIVATE2)
				SetPagePrivate2(pages[i]);

			if (pages[i] == locked_page) {
				page_cache_release(pages[i]);
				continue;
			}
			if (op & EXTENT_CLEAR_DIRTY)
				clear_page_dirty_for_io(pages[i]);
			if (op & EXTENT_SET_WRITEBACK)
				set_page_writeback(pages[i]);
			if (op & EXTENT_END_WRITEBACK)
				end_page_writeback(pages[i]);
			if (op & EXTENT_CLEAR_UNLOCK_PAGE)
				unlock_page(pages[i]);
			page_cache_release(pages[i]);
		}
		nr_pages -= ret;
		index += ret;
		cond_resched();
	}
	return 0;
}

/*
 * count the number of bytes in the tree that have a given bit(s)
 * set.  This can be fairly slow, except for EXTENT_DIRTY which is
 * cached.  The total number found is returned.
 */
u64 count_range_bits(struct extent_io_tree *tree,
		     u64 *start, u64 search_end, u64 max_bytes,
		     unsigned long bits)
{
	struct rb_node *node;
	struct extent_state *state;
	u64 cur_start = *start;
	u64 total_bytes = 0;
	int found = 0;

	if (search_end <= cur_start) {
		WARN_ON(1);
		return 0;
	}

	spin_lock(&tree->lock);
	if (cur_start == 0 && bits == EXTENT_DIRTY) {
		total_bytes = tree->dirty_bytes;
		goto out;
	}
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, cur_start);
	if (!node)
		goto out;

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->start > search_end)
			break;
		if (state->end >= cur_start && (state->state & bits)) {
			total_bytes += min(search_end, state->end) + 1 -
				       max(cur_start, state->start);
			if (total_bytes >= max_bytes)
				break;
			if (!found) {
				*start = state->start;
				found = 1;
			}
		}
		node = rb_next(node);
		if (!node)
			break;
	}
out:
	spin_unlock(&tree->lock);
	return total_bytes;
}

/*
 * set the private field for a given byte offset in the tree.  If there isn't
 * an extent_state there already, this does nothing.
 */
int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
{
	struct rb_node *node;
	struct extent_state *state;
	int ret = 0;

	spin_lock(&tree->lock);
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node) {
		ret = -ENOENT;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
	if (state->start != start) {
		ret = -ENOENT;
		goto out;
	}
	state->private = private;
out:
	spin_unlock(&tree->lock);
	return ret;
}

int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
{
	struct rb_node *node;
	struct extent_state *state;
	int ret = 0;

	spin_lock(&tree->lock);
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node) {
		ret = -ENOENT;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
	if (state->start != start) {
		ret = -ENOENT;
		goto out;
	}
	*private = state->private;
out:
	spin_unlock(&tree->lock);
	return ret;
}

/*
 * searches a range in the state tree for a given mask.
 * If 'filled' == 1, this returns 1 only if every extent in the tree
 * has the bits set.  Otherwise, 1 is returned if any bit in the
 * range is found set.
 */
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
		   int bits, int filled, struct extent_state *cached)
{
	struct extent_state *state = NULL;
	struct rb_node *node;
	int bitset = 0;

	spin_lock(&tree->lock);
	if (cached && cached->tree && cached->start == start)
		node = &cached->rb_node;
	else
		node = tree_search(tree, start);
	while (node && start <= end) {
		state = rb_entry(node, struct extent_state, rb_node);

		if (filled && state->start > start) {
			bitset = 0;
			break;
		}

		if (state->start > end)
			break;

		if (state->state & bits) {
			bitset = 1;
			if (!filled)
				break;
		} else if (filled) {
			bitset = 0;
			break;
		}

		if (state->end == (u64)-1)
			break;

		start = state->end + 1;
		if (start > end)
			break;
		node = rb_next(node);
		if (!node) {
			if (filled)
				bitset = 0;
			break;
		}
	}
	spin_unlock(&tree->lock);
	return bitset;
}

/*
 * helper function to set a given page up to date if all the
 * extents in the tree for that page are up to date
 */
static int check_page_uptodate(struct extent_io_tree *tree,
			       struct page *page)
{
	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
	u64 end = start + PAGE_CACHE_SIZE - 1;
	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
		SetPageUptodate(page);
	return 0;
}

/*
 * helper function to unlock a page if all the extents in the tree
 * for that page are unlocked
 */
static int check_page_locked(struct extent_io_tree *tree,
			     struct page *page)
{
	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
	u64 end = start + PAGE_CACHE_SIZE - 1;
	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
		unlock_page(page);
	return 0;
}

/*
 * helper function to end page writeback if all the extents
 * in the tree for that page are done with writeback
 */
static int check_page_writeback(struct extent_io_tree *tree,
				struct page *page)
{
	end_page_writeback(page);
	return 0;
}

/* lots and lots of room for performance fixes in the end_bio funcs */

/*
 * after a writepage IO is done, we need to:
 * clear the uptodate bits on error
 * clear the writeback bits in the extent tree for this IO
 * end_page_writeback if the page has no more pending IO
 *
 * Scheduling is not allowed, so the extent state tree is expected
 * to have one and only one object corresponding to this IO.
 */
static void end_bio_extent_writepage(struct bio *bio, int err)
{
	int uptodate = err == 0;
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct extent_io_tree *tree;
	u64 start;
	u64 end;
	int whole_page;
	int ret;

	do {
		struct page *page = bvec->bv_page;
		tree = &BTRFS_I(page->mapping->host)->io_tree;

		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
			 bvec->bv_offset;
		end = start + bvec->bv_len - 1;

		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
			whole_page = 1;
		else
			whole_page = 0;

		if (--bvec >= bio->bi_io_vec)
			prefetchw(&bvec->bv_page->flags);
		if (tree->ops && tree->ops->writepage_end_io_hook) {
			ret = tree->ops->writepage_end_io_hook(page, start,
						       end, NULL, uptodate);
			if (ret)
				uptodate = 0;
		}

		if (!uptodate && tree->ops &&
		    tree->ops->writepage_io_failed_hook) {
			ret = tree->ops->writepage_io_failed_hook(bio, page,
							 start, end, NULL);
			if (ret == 0) {
				uptodate = (err == 0);
				continue;
			}
		}

		if (!uptodate) {
			clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
			ClearPageUptodate(page);
			SetPageError(page);
		}

		if (whole_page)
			end_page_writeback(page);
		else
			check_page_writeback(tree, page);
	} while (bvec >= bio->bi_io_vec);

	bio_put(bio);
}

/*
 * after a readpage IO is done, we need to:
 * clear the uptodate bits on error
 * set the uptodate bits if things worked
 * set the page up to date if all extents in the tree are uptodate
 * clear the lock bit in the extent tree
 * unlock the page if there are no other extents locked for it
 *
 * Scheduling is not allowed, so the extent state tree is expected
 * to have one and only one object corresponding to this IO.
 */
static void end_bio_extent_readpage(struct bio *bio, int err)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct bio_vec *bvec = bio->bi_io_vec;
	struct extent_io_tree *tree;
	u64 start;
	u64 end;
	int whole_page;
	int ret;

	if (err)
		uptodate = 0;

	do {
		struct page *page = bvec->bv_page;
		tree = &BTRFS_I(page->mapping->host)->io_tree;

		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
			bvec->bv_offset;
		end = start + bvec->bv_len - 1;

		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
			whole_page = 1;
		else
			whole_page = 0;

		if (++bvec <= bvec_end)
			prefetchw(&bvec->bv_page->flags);

		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
			ret = tree->ops->readpage_end_io_hook(page, start, end,
							      NULL);
			if (ret)
				uptodate = 0;
		}
		if (!uptodate && tree->ops &&
		    tree->ops->readpage_io_failed_hook) {
			ret = tree->ops->readpage_io_failed_hook(bio, page,
							 start, end, NULL);
			if (ret == 0) {
				uptodate =
					test_bit(BIO_UPTODATE, &bio->bi_flags);
				if (err)
					uptodate = 0;
				continue;
			}
		}

		if (uptodate) {
			set_extent_uptodate(tree, start, end,
					    GFP_ATOMIC);
		}
		unlock_extent(tree, start, end, GFP_ATOMIC);

		if (whole_page) {
			if (uptodate) {
				SetPageUptodate(page);
			} else {
				ClearPageUptodate(page);
				SetPageError(page);
			}
			unlock_page(page);
		} else {
			if (uptodate) {
				check_page_uptodate(tree, page);
			} else {
				ClearPageUptodate(page);
				SetPageError(page);
			}
			check_page_locked(tree, page);
		}
	} while (bvec <= bvec_end);

	bio_put(bio);
}

/*
 * IO done from prepare_write is pretty simple, we just unlock
 * the structs in the extent tree when done, and set the uptodate bits
 * as appropriate.
 */
static void end_bio_extent_preparewrite(struct bio *bio, int err)
{
	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct extent_io_tree *tree;
	u64 start;
	u64 end;

	do {
		struct page *page = bvec->bv_page;
		tree = &BTRFS_I(page->mapping->host)->io_tree;

		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
			bvec->bv_offset;
		end = start + bvec->bv_len - 1;

		if (--bvec >= bio->bi_io_vec)
			prefetchw(&bvec->bv_page->flags);

		if (uptodate) {
			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
		} else {
			ClearPageUptodate(page);
			SetPageError(page);
		}

		unlock_extent(tree, start, end, GFP_ATOMIC);

	} while (bvec >= bio->bi_io_vec);

	bio_put(bio);
}

static struct bio *
extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
		 gfp_t gfp_flags)
{
	struct bio *bio;

	bio = bio_alloc(gfp_flags, nr_vecs);

	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
		while (!bio && (nr_vecs /= 2))
			bio = bio_alloc(gfp_flags, nr_vecs);
	}

	if (bio) {
		bio->bi_size = 0;
		bio->bi_bdev = bdev;
		bio->bi_sector = first_sector;
	}
	return bio;
}

static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
			  unsigned long bio_flags)
{
	int ret = 0;
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct page *page = bvec->bv_page;
	struct extent_io_tree *tree = bio->bi_private;
	u64 start;
	u64 end;

	start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
	end = start + bvec->bv_len - 1;

	bio->bi_private = NULL;

	bio_get(bio);

	if (tree->ops && tree->ops->submit_bio_hook)
		tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
					   mirror_num, bio_flags);
	else
		submit_bio(rw, bio);
	if (bio_flagged(bio, BIO_EOPNOTSUPP))
		ret = -EOPNOTSUPP;
	bio_put(bio);
	return ret;
}

static int submit_extent_page(int rw, struct extent_io_tree *tree,
			      struct page *page, sector_t sector,
			      size_t size, unsigned long offset,
			      struct block_device *bdev,
			      struct bio **bio_ret,
			      unsigned long max_pages,
			      bio_end_io_t end_io_func,
			      int mirror_num,
			      unsigned long prev_bio_flags,
			      unsigned long bio_flags)
{
	int ret = 0;
	struct bio *bio;
	int nr;
	int contig = 0;
	int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
	int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
	size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);

	if (bio_ret && *bio_ret) {
		bio = *bio_ret;
		if (old_compressed)
			contig = bio->bi_sector == sector;
		else
			contig = bio->bi_sector + (bio->bi_size >> 9) ==
				sector;

		if (prev_bio_flags != bio_flags || !contig ||
		    (tree->ops && tree->ops->merge_bio_hook &&
		     tree->ops->merge_bio_hook(page, offset, page_size, bio,
					       bio_flags)) ||
		    bio_add_page(bio, page, page_size, offset) < page_size) {
			ret = submit_one_bio(rw, bio, mirror_num,
					     prev_bio_flags);
			bio = NULL;
		} else {
			return 0;
		}
	}
	if (this_compressed)
		nr = BIO_MAX_PAGES;
	else
		nr = bio_get_nr_vecs(bdev);

	bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);

	bio_add_page(bio, page, page_size, offset);
	bio->bi_end_io = end_io_func;
	bio->bi_private = tree;

	if (bio_ret)
		*bio_ret = bio;
	else
		ret = submit_one_bio(rw, bio, mirror_num, bio_flags);

	return ret;
}

void set_page_extent_mapped(struct page *page)
{
	if (!PagePrivate(page)) {
		SetPagePrivate(page);
		page_cache_get(page);
		set_page_private(page, EXTENT_PAGE_PRIVATE);
	}
}

static void set_page_extent_head(struct page *page, unsigned long len)
{
	set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
}

/*
 * basic readpage implementation.  Locked extent state structs are inserted
 * into the tree that are removed when the IO is done (by the end_io
 * handlers)
 */
static int __extent_read_full_page(struct extent_io_tree *tree,
				   struct page *page,
				   get_extent_t *get_extent,
				   struct bio **bio, int mirror_num,
				   unsigned long *bio_flags)
{
	struct inode *inode = page->mapping->host;
	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
	u64 page_end = start + PAGE_CACHE_SIZE - 1;
	u64 end;
	u64 cur = start;
	u64 extent_offset;
	u64 last_byte = i_size_read(inode);
	u64 block_start;
	u64 cur_end;
	sector_t sector;
	struct extent_map *em;
	struct block_device *bdev;
	int ret;
	int nr = 0;
	size_t page_offset = 0;
	size_t iosize;
	size_t disk_io_size;
	size_t blocksize = inode->i_sb->s_blocksize;
	unsigned long this_bio_flag = 0;

	set_page_extent_mapped(page);

	end = page_end;
	lock_extent(tree, start, end, GFP_NOFS);

	if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
		char *userpage;
		size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);

		if (zero_offset) {
			iosize = PAGE_CACHE_SIZE - zero_offset;
			userpage = kmap_atomic(page, KM_USER0);
			memset(userpage + zero_offset, 0, iosize);
			flush_dcache_page(page);
			kunmap_atomic(userpage, KM_USER0);
		}
	}
	while (cur <= end) {
		if (cur >= last_byte) {
			char *userpage;
			iosize = PAGE_CACHE_SIZE - page_offset;
			userpage = kmap_atomic(page, KM_USER0);
			memset(userpage + page_offset, 0, iosize);
			flush_dcache_page(page);
			kunmap_atomic(userpage, KM_USER0);
			set_extent_uptodate(tree, cur, cur + iosize - 1,
					    GFP_NOFS);
			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
			break;
		}
		em = get_extent(inode, page, page_offset, cur,
				end - cur + 1, 0);
		if (IS_ERR(em) || !em) {
			SetPageError(page);
			unlock_extent(tree, cur, end, GFP_NOFS);
			break;
		}
		extent_offset = cur - em->start;
		BUG_ON(extent_map_end(em) <= cur);
		BUG_ON(end < cur);

		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
			this_bio_flag = EXTENT_BIO_COMPRESSED;

		iosize = min(extent_map_end(em) - cur, end - cur + 1);
		cur_end = min(extent_map_end(em) - 1, end);
		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
		if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
			disk_io_size = em->block_len;
			sector = em->block_start >> 9;
		} else {
			sector = (em->block_start + extent_offset) >> 9;
			disk_io_size = iosize;
		}
		bdev = em->bdev;
		block_start = em->block_start;
		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
			block_start = EXTENT_MAP_HOLE;
		free_extent_map(em);
		em = NULL;

		/* we've found a hole, just zero and go on */
		if (block_start == EXTENT_MAP_HOLE) {
			char *userpage;
			userpage = kmap_atomic(page, KM_USER0);
			memset(userpage + page_offset, 0, iosize);
			flush_dcache_page(page);
			kunmap_atomic(userpage, KM_USER0);

			set_extent_uptodate(tree, cur, cur + iosize - 1,
					    GFP_NOFS);
			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
			cur = cur + iosize;
			page_offset += iosize;
			continue;
		}
		/* the get_extent function already copied into the page */
		if (test_range_bit(tree, cur, cur_end,
				   EXTENT_UPTODATE, 1, NULL)) {
			check_page_uptodate(tree, page);
			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
			cur = cur + iosize;
			page_offset += iosize;
			continue;
		}
		/* we have an inline extent but it didn't get marked up
		 * to date.  Error out
		 */
		if (block_start == EXTENT_MAP_INLINE) {
			SetPageError(page);
			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
			cur = cur + iosize;
			page_offset += iosize;
			continue;
		}

		ret = 0;
		if (tree->ops && tree->ops->readpage_io_hook) {
			ret = tree->ops->readpage_io_hook(page, cur,
							  cur + iosize - 1);
		}
		if (!ret) {
			unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
			pnr -= page->index;
			ret = submit_extent_page(READ, tree, page,
					 sector, disk_io_size, page_offset,
					 bdev, bio, pnr,
					 end_bio_extent_readpage, mirror_num,
					 *bio_flags,
					 this_bio_flag);
			nr++;
			*bio_flags = this_bio_flag;
		}
		if (ret)
			SetPageError(page);
		cur = cur + iosize;
		page_offset += iosize;
	}
	if (!nr) {
		if (!PageError(page))
			SetPageUptodate(page);
		unlock_page(page);
	}
	return 0;
}

int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
			  get_extent_t *get_extent)
{
	struct bio *bio = NULL;
	unsigned long bio_flags = 0;
	int ret;

	ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
				      &bio_flags);
	if (bio)
		submit_one_bio(READ, bio, 0, bio_flags);
	return ret;
}

static noinline void update_nr_written(struct page *page,
				       struct writeback_control *wbc,
				       unsigned long nr_written)
{
	wbc->nr_to_write -= nr_written;
	if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
	    wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
		page->mapping->writeback_index = page->index + nr_written;
}

/*
 * the writepage semantics are similar to regular writepage.  extent
 * records are inserted to lock ranges in the tree, and as dirty areas
 * are found, they are marked writeback.  Then the lock bits are removed
Then the lock bits are removed 2186 * and the end_io handler clears the writeback ranges 2187 */ 2188 static int __extent_writepage(struct page *page, struct writeback_control *wbc, 2189 void *data) 2190 { 2191 struct inode *inode = page->mapping->host; 2192 struct extent_page_data *epd = data; 2193 struct extent_io_tree *tree = epd->tree; 2194 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 2195 u64 delalloc_start; 2196 u64 page_end = start + PAGE_CACHE_SIZE - 1; 2197 u64 end; 2198 u64 cur = start; 2199 u64 extent_offset; 2200 u64 last_byte = i_size_read(inode); 2201 u64 block_start; 2202 u64 iosize; 2203 u64 unlock_start; 2204 sector_t sector; 2205 struct extent_state *cached_state = NULL; 2206 struct extent_map *em; 2207 struct block_device *bdev; 2208 int ret; 2209 int nr = 0; 2210 size_t pg_offset = 0; 2211 size_t blocksize; 2212 loff_t i_size = i_size_read(inode); 2213 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; 2214 u64 nr_delalloc; 2215 u64 delalloc_end; 2216 int page_started; 2217 int compressed; 2218 int write_flags; 2219 unsigned long nr_written = 0; 2220 2221 if (wbc->sync_mode == WB_SYNC_ALL) 2222 write_flags = WRITE_SYNC_PLUG; 2223 else 2224 write_flags = WRITE; 2225 2226 WARN_ON(!PageLocked(page)); 2227 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2228 if (page->index > end_index || 2229 (page->index == end_index && !pg_offset)) { 2230 page->mapping->a_ops->invalidatepage(page, 0); 2231 unlock_page(page); 2232 return 0; 2233 } 2234 2235 if (page->index == end_index) { 2236 char *userpage; 2237 2238 userpage = kmap_atomic(page, KM_USER0); 2239 memset(userpage + pg_offset, 0, 2240 PAGE_CACHE_SIZE - pg_offset); 2241 kunmap_atomic(userpage, KM_USER0); 2242 flush_dcache_page(page); 2243 } 2244 pg_offset = 0; 2245 2246 set_page_extent_mapped(page); 2247 2248 delalloc_start = start; 2249 delalloc_end = 0; 2250 page_started = 0; 2251 if (!epd->extent_locked) { 2252 u64 delalloc_to_write = 0; 2253 /* 2254 * make sure the wbc mapping index is at least updated 2255 * to this page. 2256 */ 2257 update_nr_written(page, wbc, 0); 2258 2259 while (delalloc_end < page_end) { 2260 nr_delalloc = find_lock_delalloc_range(inode, tree, 2261 page, 2262 &delalloc_start, 2263 &delalloc_end, 2264 128 * 1024 * 1024); 2265 if (nr_delalloc == 0) { 2266 delalloc_start = delalloc_end + 1; 2267 continue; 2268 } 2269 tree->ops->fill_delalloc(inode, page, delalloc_start, 2270 delalloc_end, &page_started, 2271 &nr_written); 2272 /* 2273 * delalloc_end is already one less than the total 2274 * length, so we don't subtract one from 2275 * PAGE_CACHE_SIZE 2276 */ 2277 delalloc_to_write += (delalloc_end - delalloc_start + 2278 PAGE_CACHE_SIZE) >> 2279 PAGE_CACHE_SHIFT; 2280 delalloc_start = delalloc_end + 1; 2281 } 2282 if (wbc->nr_to_write < delalloc_to_write) { 2283 int thresh = 8192; 2284 2285 if (delalloc_to_write < thresh * 2) 2286 thresh = delalloc_to_write; 2287 wbc->nr_to_write = min_t(u64, delalloc_to_write, 2288 thresh); 2289 } 2290 2291 /* did the fill delalloc function already unlock and start 2292 * the IO? 2293 */ 2294 if (page_started) { 2295 ret = 0; 2296 /* 2297 * we've unlocked the page, so we can't update 2298 * the mapping's writeback index, just update 2299 * nr_to_write. 
2300 */ 2301 wbc->nr_to_write -= nr_written; 2302 goto done_unlocked; 2303 } 2304 } 2305 if (tree->ops && tree->ops->writepage_start_hook) { 2306 ret = tree->ops->writepage_start_hook(page, start, 2307 page_end); 2308 if (ret == -EAGAIN) { 2309 redirty_page_for_writepage(wbc, page); 2310 update_nr_written(page, wbc, nr_written); 2311 unlock_page(page); 2312 ret = 0; 2313 goto done_unlocked; 2314 } 2315 } 2316 2317 /* 2318 * we don't want to touch the inode after unlocking the page, 2319 * so we update the mapping writeback index now 2320 */ 2321 update_nr_written(page, wbc, nr_written + 1); 2322 2323 end = page_end; 2324 if (last_byte <= start) { 2325 if (tree->ops && tree->ops->writepage_end_io_hook) 2326 tree->ops->writepage_end_io_hook(page, start, 2327 page_end, NULL, 1); 2328 unlock_start = page_end + 1; 2329 goto done; 2330 } 2331 2332 blocksize = inode->i_sb->s_blocksize; 2333 2334 while (cur <= end) { 2335 if (cur >= last_byte) { 2336 if (tree->ops && tree->ops->writepage_end_io_hook) 2337 tree->ops->writepage_end_io_hook(page, cur, 2338 page_end, NULL, 1); 2339 unlock_start = page_end + 1; 2340 break; 2341 } 2342 em = epd->get_extent(inode, page, pg_offset, cur, 2343 end - cur + 1, 1); 2344 if (IS_ERR(em) || !em) { 2345 SetPageError(page); 2346 break; 2347 } 2348 2349 extent_offset = cur - em->start; 2350 BUG_ON(extent_map_end(em) <= cur); 2351 BUG_ON(end < cur); 2352 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2353 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); 2354 sector = (em->block_start + extent_offset) >> 9; 2355 bdev = em->bdev; 2356 block_start = em->block_start; 2357 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 2358 free_extent_map(em); 2359 em = NULL; 2360 2361 /* 2362 * compressed and inline extents are written through other 2363 * paths in the FS 2364 */ 2365 if (compressed || block_start == EXTENT_MAP_HOLE || 2366 block_start == EXTENT_MAP_INLINE) { 2367 /* 2368 * end_io notification does not happen here for 2369 * compressed extents 2370 */ 2371 if (!compressed && tree->ops && 2372 tree->ops->writepage_end_io_hook) 2373 tree->ops->writepage_end_io_hook(page, cur, 2374 cur + iosize - 1, 2375 NULL, 1); 2376 else if (compressed) { 2377 /* we don't want to end_page_writeback on 2378 * a compressed extent. 
this happens 2379 * elsewhere 2380 */ 2381 nr++; 2382 } 2383 2384 cur += iosize; 2385 pg_offset += iosize; 2386 unlock_start = cur; 2387 continue; 2388 } 2389 /* leave this out until we have a page_mkwrite call */ 2390 if (0 && !test_range_bit(tree, cur, cur + iosize - 1, 2391 EXTENT_DIRTY, 0, NULL)) { 2392 cur = cur + iosize; 2393 pg_offset += iosize; 2394 continue; 2395 } 2396 2397 if (tree->ops && tree->ops->writepage_io_hook) { 2398 ret = tree->ops->writepage_io_hook(page, cur, 2399 cur + iosize - 1); 2400 } else { 2401 ret = 0; 2402 } 2403 if (ret) { 2404 SetPageError(page); 2405 } else { 2406 unsigned long max_nr = end_index + 1; 2407 2408 set_range_writeback(tree, cur, cur + iosize - 1); 2409 if (!PageWriteback(page)) { 2410 printk(KERN_ERR "btrfs warning page %lu not " 2411 "writeback, cur %llu end %llu\n", 2412 page->index, (unsigned long long)cur, 2413 (unsigned long long)end); 2414 } 2415 2416 ret = submit_extent_page(write_flags, tree, page, 2417 sector, iosize, pg_offset, 2418 bdev, &epd->bio, max_nr, 2419 end_bio_extent_writepage, 2420 0, 0, 0); 2421 if (ret) 2422 SetPageError(page); 2423 } 2424 cur = cur + iosize; 2425 pg_offset += iosize; 2426 nr++; 2427 } 2428 done: 2429 if (nr == 0) { 2430 /* make sure the mapping tag for page dirty gets cleared */ 2431 set_page_writeback(page); 2432 end_page_writeback(page); 2433 } 2434 unlock_page(page); 2435 2436 done_unlocked: 2437 2438 /* drop our reference on any cached states */ 2439 free_extent_state(cached_state); 2440 return 0; 2441 } 2442 2443 /** 2444 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. 2445 * @mapping: address space structure to write 2446 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 2447 * @writepage: function called for each page 2448 * @data: data passed to writepage function 2449 * 2450 * If a page is already under I/O, write_cache_pages() skips it, even 2451 * if it's dirty. This is desirable behaviour for memory-cleaning writeback, 2452 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() 2453 * and msync() need to guarantee that all the data which was dirty at the time 2454 * the call was made get new I/O started against them. If wbc->sync_mode is 2455 * WB_SYNC_ALL then we were called for data integrity and we must wait for 2456 * existing IO to complete. 
2457 */ 2458 static int extent_write_cache_pages(struct extent_io_tree *tree, 2459 struct address_space *mapping, 2460 struct writeback_control *wbc, 2461 writepage_t writepage, void *data, 2462 void (*flush_fn)(void *)) 2463 { 2464 int ret = 0; 2465 int done = 0; 2466 int nr_to_write_done = 0; 2467 struct pagevec pvec; 2468 int nr_pages; 2469 pgoff_t index; 2470 pgoff_t end; /* Inclusive */ 2471 int scanned = 0; 2472 int range_whole = 0; 2473 2474 pagevec_init(&pvec, 0); 2475 if (wbc->range_cyclic) { 2476 index = mapping->writeback_index; /* Start from prev offset */ 2477 end = -1; 2478 } else { 2479 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2480 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2481 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2482 range_whole = 1; 2483 scanned = 1; 2484 } 2485 retry: 2486 while (!done && !nr_to_write_done && (index <= end) && 2487 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 2488 PAGECACHE_TAG_DIRTY, min(end - index, 2489 (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 2490 unsigned i; 2491 2492 scanned = 1; 2493 for (i = 0; i < nr_pages; i++) { 2494 struct page *page = pvec.pages[i]; 2495 2496 /* 2497 * At this point we hold neither mapping->tree_lock nor 2498 * lock on the page itself: the page may be truncated or 2499 * invalidated (changing page->mapping to NULL), or even 2500 * swizzled back from swapper_space to tmpfs file 2501 * mapping 2502 */ 2503 if (tree->ops && tree->ops->write_cache_pages_lock_hook) 2504 tree->ops->write_cache_pages_lock_hook(page); 2505 else 2506 lock_page(page); 2507 2508 if (unlikely(page->mapping != mapping)) { 2509 unlock_page(page); 2510 continue; 2511 } 2512 2513 if (!wbc->range_cyclic && page->index > end) { 2514 done = 1; 2515 unlock_page(page); 2516 continue; 2517 } 2518 2519 if (wbc->sync_mode != WB_SYNC_NONE) { 2520 if (PageWriteback(page)) 2521 flush_fn(data); 2522 wait_on_page_writeback(page); 2523 } 2524 2525 if (PageWriteback(page) || 2526 !clear_page_dirty_for_io(page)) { 2527 unlock_page(page); 2528 continue; 2529 } 2530 2531 ret = (*writepage)(page, wbc, data); 2532 2533 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { 2534 unlock_page(page); 2535 ret = 0; 2536 } 2537 if (ret) 2538 done = 1; 2539 2540 /* 2541 * the filesystem may choose to bump up nr_to_write. 
2542 * We have to make sure to honor the new nr_to_write 2543 * at any time 2544 */ 2545 nr_to_write_done = wbc->nr_to_write <= 0; 2546 } 2547 pagevec_release(&pvec); 2548 cond_resched(); 2549 } 2550 if (!scanned && !done) { 2551 /* 2552 * We hit the last page and there is more work to be done: wrap 2553 * back to the start of the file 2554 */ 2555 scanned = 1; 2556 index = 0; 2557 goto retry; 2558 } 2559 return ret; 2560 } 2561 2562 static void flush_epd_write_bio(struct extent_page_data *epd) 2563 { 2564 if (epd->bio) { 2565 if (epd->sync_io) 2566 submit_one_bio(WRITE_SYNC, epd->bio, 0, 0); 2567 else 2568 submit_one_bio(WRITE, epd->bio, 0, 0); 2569 epd->bio = NULL; 2570 } 2571 } 2572 2573 static noinline void flush_write_bio(void *data) 2574 { 2575 struct extent_page_data *epd = data; 2576 flush_epd_write_bio(epd); 2577 } 2578 2579 int extent_write_full_page(struct extent_io_tree *tree, struct page *page, 2580 get_extent_t *get_extent, 2581 struct writeback_control *wbc) 2582 { 2583 int ret; 2584 struct address_space *mapping = page->mapping; 2585 struct extent_page_data epd = { 2586 .bio = NULL, 2587 .tree = tree, 2588 .get_extent = get_extent, 2589 .extent_locked = 0, 2590 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 2591 }; 2592 struct writeback_control wbc_writepages = { 2593 .bdi = wbc->bdi, 2594 .sync_mode = wbc->sync_mode, 2595 .older_than_this = NULL, 2596 .nr_to_write = 64, 2597 .range_start = page_offset(page) + PAGE_CACHE_SIZE, 2598 .range_end = (loff_t)-1, 2599 }; 2600 2601 ret = __extent_writepage(page, wbc, &epd); 2602 2603 extent_write_cache_pages(tree, mapping, &wbc_writepages, 2604 __extent_writepage, &epd, flush_write_bio); 2605 flush_epd_write_bio(&epd); 2606 return ret; 2607 } 2608 2609 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, 2610 u64 start, u64 end, get_extent_t *get_extent, 2611 int mode) 2612 { 2613 int ret = 0; 2614 struct address_space *mapping = inode->i_mapping; 2615 struct page *page; 2616 unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >> 2617 PAGE_CACHE_SHIFT; 2618 2619 struct extent_page_data epd = { 2620 .bio = NULL, 2621 .tree = tree, 2622 .get_extent = get_extent, 2623 .extent_locked = 1, 2624 .sync_io = mode == WB_SYNC_ALL, 2625 }; 2626 struct writeback_control wbc_writepages = { 2627 .bdi = inode->i_mapping->backing_dev_info, 2628 .sync_mode = mode, 2629 .older_than_this = NULL, 2630 .nr_to_write = nr_pages * 2, 2631 .range_start = start, 2632 .range_end = end + 1, 2633 }; 2634 2635 while (start <= end) { 2636 page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); 2637 if (clear_page_dirty_for_io(page)) 2638 ret = __extent_writepage(page, &wbc_writepages, &epd); 2639 else { 2640 if (tree->ops && tree->ops->writepage_end_io_hook) 2641 tree->ops->writepage_end_io_hook(page, start, 2642 start + PAGE_CACHE_SIZE - 1, 2643 NULL, 1); 2644 unlock_page(page); 2645 } 2646 page_cache_release(page); 2647 start += PAGE_CACHE_SIZE; 2648 } 2649 2650 flush_epd_write_bio(&epd); 2651 return ret; 2652 } 2653 2654 int extent_writepages(struct extent_io_tree *tree, 2655 struct address_space *mapping, 2656 get_extent_t *get_extent, 2657 struct writeback_control *wbc) 2658 { 2659 int ret = 0; 2660 struct extent_page_data epd = { 2661 .bio = NULL, 2662 .tree = tree, 2663 .get_extent = get_extent, 2664 .extent_locked = 0, 2665 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 2666 }; 2667 2668 ret = extent_write_cache_pages(tree, mapping, wbc, 2669 __extent_writepage, &epd, 2670 flush_write_bio); 2671 flush_epd_write_bio(&epd); 2672 
return ret; 2673 } 2674 2675 int extent_readpages(struct extent_io_tree *tree, 2676 struct address_space *mapping, 2677 struct list_head *pages, unsigned nr_pages, 2678 get_extent_t get_extent) 2679 { 2680 struct bio *bio = NULL; 2681 unsigned page_idx; 2682 struct pagevec pvec; 2683 unsigned long bio_flags = 0; 2684 2685 pagevec_init(&pvec, 0); 2686 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 2687 struct page *page = list_entry(pages->prev, struct page, lru); 2688 2689 prefetchw(&page->flags); 2690 list_del(&page->lru); 2691 /* 2692 * what we want to do here is call add_to_page_cache_lru, 2693 * but that isn't exported, so we reproduce it here 2694 */ 2695 if (!add_to_page_cache(page, mapping, 2696 page->index, GFP_KERNEL)) { 2697 2698 /* open coding of lru_cache_add, also not exported */ 2699 page_cache_get(page); 2700 if (!pagevec_add(&pvec, page)) 2701 __pagevec_lru_add_file(&pvec); 2702 __extent_read_full_page(tree, page, get_extent, 2703 &bio, 0, &bio_flags); 2704 } 2705 page_cache_release(page); 2706 } 2707 if (pagevec_count(&pvec)) 2708 __pagevec_lru_add_file(&pvec); 2709 BUG_ON(!list_empty(pages)); 2710 if (bio) 2711 submit_one_bio(READ, bio, 0, bio_flags); 2712 return 0; 2713 } 2714 2715 /* 2716 * basic invalidatepage code, this waits on any locked or writeback 2717 * ranges corresponding to the page, and then deletes any extent state 2718 * records from the tree 2719 */ 2720 int extent_invalidatepage(struct extent_io_tree *tree, 2721 struct page *page, unsigned long offset) 2722 { 2723 struct extent_state *cached_state = NULL; 2724 u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); 2725 u64 end = start + PAGE_CACHE_SIZE - 1; 2726 size_t blocksize = page->mapping->host->i_sb->s_blocksize; 2727 2728 start += (offset + blocksize - 1) & ~(blocksize - 1); 2729 if (start > end) 2730 return 0; 2731 2732 lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS); 2733 wait_on_page_writeback(page); 2734 clear_extent_bit(tree, start, end, 2735 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 2736 EXTENT_DO_ACCOUNTING, 2737 1, 1, &cached_state, GFP_NOFS); 2738 return 0; 2739 } 2740 2741 /* 2742 * simple commit_write call, set_range_dirty is used to mark both 2743 * the pages and the extent records as dirty 2744 */ 2745 int extent_commit_write(struct extent_io_tree *tree, 2746 struct inode *inode, struct page *page, 2747 unsigned from, unsigned to) 2748 { 2749 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; 2750 2751 set_page_extent_mapped(page); 2752 set_page_dirty(page); 2753 2754 if (pos > inode->i_size) { 2755 i_size_write(inode, pos); 2756 mark_inode_dirty(inode); 2757 } 2758 return 0; 2759 } 2760 2761 int extent_prepare_write(struct extent_io_tree *tree, 2762 struct inode *inode, struct page *page, 2763 unsigned from, unsigned to, get_extent_t *get_extent) 2764 { 2765 u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT; 2766 u64 page_end = page_start + PAGE_CACHE_SIZE - 1; 2767 u64 block_start; 2768 u64 orig_block_start; 2769 u64 block_end; 2770 u64 cur_end; 2771 struct extent_map *em; 2772 unsigned blocksize = 1 << inode->i_blkbits; 2773 size_t page_offset = 0; 2774 size_t block_off_start; 2775 size_t block_off_end; 2776 int err = 0; 2777 int iocount = 0; 2778 int ret = 0; 2779 int isnew; 2780 2781 set_page_extent_mapped(page); 2782 2783 block_start = (page_start + from) & ~((u64)blocksize - 1); 2784 block_end = (page_start + to - 1) | (blocksize - 1); 2785 orig_block_start = block_start; 2786 2787 lock_extent(tree, page_start, page_end, GFP_NOFS); 2788 
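	/*
	 * Walk the locked range one block at a time: newly allocated
	 * blocks get the bytes outside [from, to) zeroed in the page,
	 * blocks that carry existing data but are not yet uptodate are
	 * read in (iocount counts the reads we wait on below), and
	 * everything else is simply marked uptodate and unlocked.
	 */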
while (block_start <= block_end) { 2789 em = get_extent(inode, page, page_offset, block_start, 2790 block_end - block_start + 1, 1); 2791 if (IS_ERR(em) || !em) 2792 goto err; 2793 2794 cur_end = min(block_end, extent_map_end(em) - 1); 2795 block_off_start = block_start & (PAGE_CACHE_SIZE - 1); 2796 block_off_end = block_off_start + blocksize; 2797 isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS); 2798 2799 if (!PageUptodate(page) && isnew && 2800 (block_off_end > to || block_off_start < from)) { 2801 void *kaddr; 2802 2803 kaddr = kmap_atomic(page, KM_USER0); 2804 if (block_off_end > to) 2805 memset(kaddr + to, 0, block_off_end - to); 2806 if (block_off_start < from) 2807 memset(kaddr + block_off_start, 0, 2808 from - block_off_start); 2809 flush_dcache_page(page); 2810 kunmap_atomic(kaddr, KM_USER0); 2811 } 2812 if ((em->block_start != EXTENT_MAP_HOLE && 2813 em->block_start != EXTENT_MAP_INLINE) && 2814 !isnew && !PageUptodate(page) && 2815 (block_off_end > to || block_off_start < from) && 2816 !test_range_bit(tree, block_start, cur_end, 2817 EXTENT_UPTODATE, 1, NULL)) { 2818 u64 sector; 2819 u64 extent_offset = block_start - em->start; 2820 size_t iosize; 2821 sector = (em->block_start + extent_offset) >> 9; 2822 iosize = (cur_end - block_start + blocksize) & 2823 ~((u64)blocksize - 1); 2824 /* 2825 * we've already got the extent locked, but we 2826 * need to split the state such that our end_bio 2827 * handler can clear the lock. 2828 */ 2829 set_extent_bit(tree, block_start, 2830 block_start + iosize - 1, 2831 EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS); 2832 ret = submit_extent_page(READ, tree, page, 2833 sector, iosize, page_offset, em->bdev, 2834 NULL, 1, 2835 end_bio_extent_preparewrite, 0, 2836 0, 0); 2837 iocount++; 2838 block_start = block_start + iosize; 2839 } else { 2840 set_extent_uptodate(tree, block_start, cur_end, 2841 GFP_NOFS); 2842 unlock_extent(tree, block_start, cur_end, GFP_NOFS); 2843 block_start = cur_end + 1; 2844 } 2845 page_offset = block_start & (PAGE_CACHE_SIZE - 1); 2846 free_extent_map(em); 2847 } 2848 if (iocount) { 2849 wait_extent_bit(tree, orig_block_start, 2850 block_end, EXTENT_LOCKED); 2851 } 2852 check_page_uptodate(tree, page); 2853 err: 2854 /* FIXME, zero out newly allocated blocks on error */ 2855 return err; 2856 } 2857 2858 /* 2859 * a helper for releasepage, this tests for areas of the page that 2860 * are locked or under IO and drops the related state bits if it is safe 2861 * to drop the page. 2862 */ 2863 int try_release_extent_state(struct extent_map_tree *map, 2864 struct extent_io_tree *tree, struct page *page, 2865 gfp_t mask) 2866 { 2867 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 2868 u64 end = start + PAGE_CACHE_SIZE - 1; 2869 int ret = 1; 2870 2871 if (test_range_bit(tree, start, end, 2872 EXTENT_IOBITS, 0, NULL)) 2873 ret = 0; 2874 else { 2875 if ((mask & GFP_NOFS) == GFP_NOFS) 2876 mask = GFP_NOFS; 2877 /* 2878 * at this point we can safely clear everything except the 2879 * locked bit and the nodatasum bit 2880 */ 2881 clear_extent_bit(tree, start, end, 2882 ~(EXTENT_LOCKED | EXTENT_NODATASUM), 2883 0, 0, NULL, mask); 2884 } 2885 return ret; 2886 } 2887 2888 /* 2889 * a helper for releasepage. 
As long as there are no locked extents 2890 * in the range corresponding to the page, both state records and extent 2891 * map records are removed 2892 */ 2893 int try_release_extent_mapping(struct extent_map_tree *map, 2894 struct extent_io_tree *tree, struct page *page, 2895 gfp_t mask) 2896 { 2897 struct extent_map *em; 2898 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 2899 u64 end = start + PAGE_CACHE_SIZE - 1; 2900 2901 if ((mask & __GFP_WAIT) && 2902 page->mapping->host->i_size > 16 * 1024 * 1024) { 2903 u64 len; 2904 while (start <= end) { 2905 len = end - start + 1; 2906 write_lock(&map->lock); 2907 em = lookup_extent_mapping(map, start, len); 2908 if (!em || IS_ERR(em)) { 2909 write_unlock(&map->lock); 2910 break; 2911 } 2912 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || 2913 em->start != start) { 2914 write_unlock(&map->lock); 2915 free_extent_map(em); 2916 break; 2917 } 2918 if (!test_range_bit(tree, em->start, 2919 extent_map_end(em) - 1, 2920 EXTENT_LOCKED | EXTENT_WRITEBACK, 2921 0, NULL)) { 2922 remove_extent_mapping(map, em); 2923 /* once for the rb tree */ 2924 free_extent_map(em); 2925 } 2926 start = extent_map_end(em); 2927 write_unlock(&map->lock); 2928 2929 /* once for us */ 2930 free_extent_map(em); 2931 } 2932 } 2933 return try_release_extent_state(map, tree, page, mask); 2934 } 2935 2936 sector_t extent_bmap(struct address_space *mapping, sector_t iblock, 2937 get_extent_t *get_extent) 2938 { 2939 struct inode *inode = mapping->host; 2940 struct extent_state *cached_state = NULL; 2941 u64 start = iblock << inode->i_blkbits; 2942 sector_t sector = 0; 2943 size_t blksize = (1 << inode->i_blkbits); 2944 struct extent_map *em; 2945 2946 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + blksize - 1, 2947 0, &cached_state, GFP_NOFS); 2948 em = get_extent(inode, NULL, 0, start, blksize, 0); 2949 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, 2950 start + blksize - 1, &cached_state, GFP_NOFS); 2951 if (!em || IS_ERR(em)) 2952 return 0; 2953 2954 if (em->block_start > EXTENT_MAP_LAST_BYTE) 2955 goto out; 2956 2957 sector = (em->block_start + start - em->start) >> inode->i_blkbits; 2958 out: 2959 free_extent_map(em); 2960 return sector; 2961 } 2962 2963 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 2964 __u64 start, __u64 len, get_extent_t *get_extent) 2965 { 2966 int ret; 2967 u64 off = start; 2968 u64 max = start + len; 2969 u32 flags = 0; 2970 u64 disko = 0; 2971 struct extent_map *em = NULL; 2972 struct extent_state *cached_state = NULL; 2973 int end = 0; 2974 u64 em_start = 0, em_len = 0; 2975 unsigned long emflags; 2976 ret = 0; 2977 2978 if (len == 0) 2979 return -EINVAL; 2980 2981 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, 2982 &cached_state, GFP_NOFS); 2983 em = get_extent(inode, NULL, 0, off, max - off, 0); 2984 if (!em) 2985 goto out; 2986 if (IS_ERR(em)) { 2987 ret = PTR_ERR(em); 2988 goto out; 2989 } 2990 while (!end) { 2991 off = em->start + em->len; 2992 if (off >= max) 2993 end = 1; 2994 2995 em_start = em->start; 2996 em_len = em->len; 2997 2998 disko = 0; 2999 flags = 0; 3000 3001 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 3002 end = 1; 3003 flags |= FIEMAP_EXTENT_LAST; 3004 } else if (em->block_start == EXTENT_MAP_HOLE) { 3005 flags |= FIEMAP_EXTENT_UNWRITTEN; 3006 } else if (em->block_start == EXTENT_MAP_INLINE) { 3007 flags |= (FIEMAP_EXTENT_DATA_INLINE | 3008 FIEMAP_EXTENT_NOT_ALIGNED); 3009 } else if (em->block_start == EXTENT_MAP_DELALLOC) { 3010 flags |= 
(FIEMAP_EXTENT_DELALLOC | 3011 FIEMAP_EXTENT_UNKNOWN); 3012 } else { 3013 disko = em->block_start; 3014 } 3015 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 3016 flags |= FIEMAP_EXTENT_ENCODED; 3017 3018 emflags = em->flags; 3019 free_extent_map(em); 3020 em = NULL; 3021 3022 if (!end) { 3023 em = get_extent(inode, NULL, 0, off, max - off, 0); 3024 if (!em) 3025 goto out; 3026 if (IS_ERR(em)) { 3027 ret = PTR_ERR(em); 3028 goto out; 3029 } 3030 emflags = em->flags; 3031 } 3032 if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) { 3033 flags |= FIEMAP_EXTENT_LAST; 3034 end = 1; 3035 } 3036 3037 ret = fiemap_fill_next_extent(fieinfo, em_start, disko, 3038 em_len, flags); 3039 if (ret) 3040 goto out_free; 3041 } 3042 out_free: 3043 free_extent_map(em); 3044 out: 3045 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len, 3046 &cached_state, GFP_NOFS); 3047 return ret; 3048 } 3049 3050 static inline struct page *extent_buffer_page(struct extent_buffer *eb, 3051 unsigned long i) 3052 { 3053 struct page *p; 3054 struct address_space *mapping; 3055 3056 if (i == 0) 3057 return eb->first_page; 3058 i += eb->start >> PAGE_CACHE_SHIFT; 3059 mapping = eb->first_page->mapping; 3060 if (!mapping) 3061 return NULL; 3062 3063 /* 3064 * extent_buffer_page is only called after pinning the page 3065 * by increasing the reference count. So we know the page must 3066 * be in the radix tree. 3067 */ 3068 rcu_read_lock(); 3069 p = radix_tree_lookup(&mapping->page_tree, i); 3070 rcu_read_unlock(); 3071 3072 return p; 3073 } 3074 3075 static inline unsigned long num_extent_pages(u64 start, u64 len) 3076 { 3077 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - 3078 (start >> PAGE_CACHE_SHIFT); 3079 } 3080 3081 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, 3082 u64 start, 3083 unsigned long len, 3084 gfp_t mask) 3085 { 3086 struct extent_buffer *eb = NULL; 3087 #if LEAK_DEBUG 3088 unsigned long flags; 3089 #endif 3090 3091 eb = kmem_cache_zalloc(extent_buffer_cache, mask); 3092 eb->start = start; 3093 eb->len = len; 3094 spin_lock_init(&eb->lock); 3095 init_waitqueue_head(&eb->lock_wq); 3096 3097 #if LEAK_DEBUG 3098 spin_lock_irqsave(&leak_lock, flags); 3099 list_add(&eb->leak_list, &buffers); 3100 spin_unlock_irqrestore(&leak_lock, flags); 3101 #endif 3102 atomic_set(&eb->refs, 1); 3103 3104 return eb; 3105 } 3106 3107 static void __free_extent_buffer(struct extent_buffer *eb) 3108 { 3109 #if LEAK_DEBUG 3110 unsigned long flags; 3111 spin_lock_irqsave(&leak_lock, flags); 3112 list_del(&eb->leak_list); 3113 spin_unlock_irqrestore(&leak_lock, flags); 3114 #endif 3115 kmem_cache_free(extent_buffer_cache, eb); 3116 } 3117 3118 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, 3119 u64 start, unsigned long len, 3120 struct page *page0, 3121 gfp_t mask) 3122 { 3123 unsigned long num_pages = num_extent_pages(start, len); 3124 unsigned long i; 3125 unsigned long index = start >> PAGE_CACHE_SHIFT; 3126 struct extent_buffer *eb; 3127 struct extent_buffer *exists = NULL; 3128 struct page *p; 3129 struct address_space *mapping = tree->mapping; 3130 int uptodate = 1; 3131 3132 spin_lock(&tree->buffer_lock); 3133 eb = buffer_search(tree, start); 3134 if (eb) { 3135 atomic_inc(&eb->refs); 3136 spin_unlock(&tree->buffer_lock); 3137 mark_page_accessed(eb->first_page); 3138 return eb; 3139 } 3140 spin_unlock(&tree->buffer_lock); 3141 3142 eb = __alloc_extent_buffer(tree, start, len, mask); 3143 if (!eb) 3144 return NULL; 3145 3146 if (page0) { 3147 
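		/*
		 * the caller handed us the first page of the buffer;
		 * take a reference on it, mark it as the head page and
		 * start the page lookup loop below at the second page
		 */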
eb->first_page = page0; 3148 i = 1; 3149 index++; 3150 page_cache_get(page0); 3151 mark_page_accessed(page0); 3152 set_page_extent_mapped(page0); 3153 set_page_extent_head(page0, len); 3154 uptodate = PageUptodate(page0); 3155 } else { 3156 i = 0; 3157 } 3158 for (; i < num_pages; i++, index++) { 3159 p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM); 3160 if (!p) { 3161 WARN_ON(1); 3162 goto free_eb; 3163 } 3164 set_page_extent_mapped(p); 3165 mark_page_accessed(p); 3166 if (i == 0) { 3167 eb->first_page = p; 3168 set_page_extent_head(p, len); 3169 } else { 3170 set_page_private(p, EXTENT_PAGE_PRIVATE); 3171 } 3172 if (!PageUptodate(p)) 3173 uptodate = 0; 3174 unlock_page(p); 3175 } 3176 if (uptodate) 3177 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3178 3179 spin_lock(&tree->buffer_lock); 3180 exists = buffer_tree_insert(tree, start, &eb->rb_node); 3181 if (exists) { 3182 /* add one reference for the caller */ 3183 atomic_inc(&exists->refs); 3184 spin_unlock(&tree->buffer_lock); 3185 goto free_eb; 3186 } 3187 /* add one reference for the tree */ 3188 atomic_inc(&eb->refs); 3189 spin_unlock(&tree->buffer_lock); 3190 return eb; 3191 3192 free_eb: 3193 if (!atomic_dec_and_test(&eb->refs)) 3194 return exists; 3195 for (index = 1; index < i; index++) 3196 page_cache_release(extent_buffer_page(eb, index)); 3197 page_cache_release(extent_buffer_page(eb, 0)); 3198 __free_extent_buffer(eb); 3199 return exists; 3200 } 3201 3202 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, 3203 u64 start, unsigned long len, 3204 gfp_t mask) 3205 { 3206 struct extent_buffer *eb; 3207 3208 spin_lock(&tree->buffer_lock); 3209 eb = buffer_search(tree, start); 3210 if (eb) 3211 atomic_inc(&eb->refs); 3212 spin_unlock(&tree->buffer_lock); 3213 3214 if (eb) 3215 mark_page_accessed(eb->first_page); 3216 3217 return eb; 3218 } 3219 3220 void free_extent_buffer(struct extent_buffer *eb) 3221 { 3222 if (!eb) 3223 return; 3224 3225 if (!atomic_dec_and_test(&eb->refs)) 3226 return; 3227 3228 WARN_ON(1); 3229 } 3230 3231 int clear_extent_buffer_dirty(struct extent_io_tree *tree, 3232 struct extent_buffer *eb) 3233 { 3234 unsigned long i; 3235 unsigned long num_pages; 3236 struct page *page; 3237 3238 num_pages = num_extent_pages(eb->start, eb->len); 3239 3240 for (i = 0; i < num_pages; i++) { 3241 page = extent_buffer_page(eb, i); 3242 if (!PageDirty(page)) 3243 continue; 3244 3245 lock_page(page); 3246 if (i == 0) 3247 set_page_extent_head(page, eb->len); 3248 else 3249 set_page_private(page, EXTENT_PAGE_PRIVATE); 3250 3251 clear_page_dirty_for_io(page); 3252 spin_lock_irq(&page->mapping->tree_lock); 3253 if (!PageDirty(page)) { 3254 radix_tree_tag_clear(&page->mapping->page_tree, 3255 page_index(page), 3256 PAGECACHE_TAG_DIRTY); 3257 } 3258 spin_unlock_irq(&page->mapping->tree_lock); 3259 unlock_page(page); 3260 } 3261 return 0; 3262 } 3263 3264 int wait_on_extent_buffer_writeback(struct extent_io_tree *tree, 3265 struct extent_buffer *eb) 3266 { 3267 return wait_on_extent_writeback(tree, eb->start, 3268 eb->start + eb->len - 1); 3269 } 3270 3271 int set_extent_buffer_dirty(struct extent_io_tree *tree, 3272 struct extent_buffer *eb) 3273 { 3274 unsigned long i; 3275 unsigned long num_pages; 3276 int was_dirty = 0; 3277 3278 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); 3279 num_pages = num_extent_pages(eb->start, eb->len); 3280 for (i = 0; i < num_pages; i++) 3281 __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); 3282 return was_dirty; 3283 } 3284 3285 int 
clear_extent_buffer_uptodate(struct extent_io_tree *tree, 3286 struct extent_buffer *eb, 3287 struct extent_state **cached_state) 3288 { 3289 unsigned long i; 3290 struct page *page; 3291 unsigned long num_pages; 3292 3293 num_pages = num_extent_pages(eb->start, eb->len); 3294 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3295 3296 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3297 cached_state, GFP_NOFS); 3298 for (i = 0; i < num_pages; i++) { 3299 page = extent_buffer_page(eb, i); 3300 if (page) 3301 ClearPageUptodate(page); 3302 } 3303 return 0; 3304 } 3305 3306 int set_extent_buffer_uptodate(struct extent_io_tree *tree, 3307 struct extent_buffer *eb) 3308 { 3309 unsigned long i; 3310 struct page *page; 3311 unsigned long num_pages; 3312 3313 num_pages = num_extent_pages(eb->start, eb->len); 3314 3315 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3316 GFP_NOFS); 3317 for (i = 0; i < num_pages; i++) { 3318 page = extent_buffer_page(eb, i); 3319 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || 3320 ((i == num_pages - 1) && 3321 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { 3322 check_page_uptodate(tree, page); 3323 continue; 3324 } 3325 SetPageUptodate(page); 3326 } 3327 return 0; 3328 } 3329 3330 int extent_range_uptodate(struct extent_io_tree *tree, 3331 u64 start, u64 end) 3332 { 3333 struct page *page; 3334 int ret; 3335 int pg_uptodate = 1; 3336 int uptodate; 3337 unsigned long index; 3338 3339 ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL); 3340 if (ret) 3341 return 1; 3342 while (start <= end) { 3343 index = start >> PAGE_CACHE_SHIFT; 3344 page = find_get_page(tree->mapping, index); 3345 uptodate = PageUptodate(page); 3346 page_cache_release(page); 3347 if (!uptodate) { 3348 pg_uptodate = 0; 3349 break; 3350 } 3351 start += PAGE_CACHE_SIZE; 3352 } 3353 return pg_uptodate; 3354 } 3355 3356 int extent_buffer_uptodate(struct extent_io_tree *tree, 3357 struct extent_buffer *eb, 3358 struct extent_state *cached_state) 3359 { 3360 int ret = 0; 3361 unsigned long num_pages; 3362 unsigned long i; 3363 struct page *page; 3364 int pg_uptodate = 1; 3365 3366 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 3367 return 1; 3368 3369 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3370 EXTENT_UPTODATE, 1, cached_state); 3371 if (ret) 3372 return ret; 3373 3374 num_pages = num_extent_pages(eb->start, eb->len); 3375 for (i = 0; i < num_pages; i++) { 3376 page = extent_buffer_page(eb, i); 3377 if (!PageUptodate(page)) { 3378 pg_uptodate = 0; 3379 break; 3380 } 3381 } 3382 return pg_uptodate; 3383 } 3384 3385 int read_extent_buffer_pages(struct extent_io_tree *tree, 3386 struct extent_buffer *eb, 3387 u64 start, int wait, 3388 get_extent_t *get_extent, int mirror_num) 3389 { 3390 unsigned long i; 3391 unsigned long start_i; 3392 struct page *page; 3393 int err; 3394 int ret = 0; 3395 int locked_pages = 0; 3396 int all_uptodate = 1; 3397 int inc_all_pages = 0; 3398 unsigned long num_pages; 3399 struct bio *bio = NULL; 3400 unsigned long bio_flags = 0; 3401 3402 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 3403 return 0; 3404 3405 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3406 EXTENT_UPTODATE, 1, NULL)) { 3407 return 0; 3408 } 3409 3410 if (start) { 3411 WARN_ON(start < eb->start); 3412 start_i = (start >> PAGE_CACHE_SHIFT) - 3413 (eb->start >> PAGE_CACHE_SHIFT); 3414 } else { 3415 start_i = 0; 3416 } 3417 3418 num_pages = num_extent_pages(eb->start, eb->len); 3419 for (i = start_i; i < 
num_pages; i++) { 3420 page = extent_buffer_page(eb, i); 3421 if (!wait) { 3422 if (!trylock_page(page)) 3423 goto unlock_exit; 3424 } else { 3425 lock_page(page); 3426 } 3427 locked_pages++; 3428 if (!PageUptodate(page)) 3429 all_uptodate = 0; 3430 } 3431 if (all_uptodate) { 3432 if (start_i == 0) 3433 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3434 goto unlock_exit; 3435 } 3436 3437 for (i = start_i; i < num_pages; i++) { 3438 page = extent_buffer_page(eb, i); 3439 if (inc_all_pages) 3440 page_cache_get(page); 3441 if (!PageUptodate(page)) { 3442 if (start_i == 0) 3443 inc_all_pages = 1; 3444 ClearPageError(page); 3445 err = __extent_read_full_page(tree, page, 3446 get_extent, &bio, 3447 mirror_num, &bio_flags); 3448 if (err) 3449 ret = err; 3450 } else { 3451 unlock_page(page); 3452 } 3453 } 3454 3455 if (bio) 3456 submit_one_bio(READ, bio, mirror_num, bio_flags); 3457 3458 if (ret || !wait) 3459 return ret; 3460 3461 for (i = start_i; i < num_pages; i++) { 3462 page = extent_buffer_page(eb, i); 3463 wait_on_page_locked(page); 3464 if (!PageUptodate(page)) 3465 ret = -EIO; 3466 } 3467 3468 if (!ret) 3469 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3470 return ret; 3471 3472 unlock_exit: 3473 i = start_i; 3474 while (locked_pages > 0) { 3475 page = extent_buffer_page(eb, i); 3476 i++; 3477 unlock_page(page); 3478 locked_pages--; 3479 } 3480 return ret; 3481 } 3482 3483 void read_extent_buffer(struct extent_buffer *eb, void *dstv, 3484 unsigned long start, 3485 unsigned long len) 3486 { 3487 size_t cur; 3488 size_t offset; 3489 struct page *page; 3490 char *kaddr; 3491 char *dst = (char *)dstv; 3492 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3493 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3494 3495 WARN_ON(start > eb->len); 3496 WARN_ON(start + len > eb->start + eb->len); 3497 3498 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 3499 3500 while (len > 0) { 3501 page = extent_buffer_page(eb, i); 3502 3503 cur = min(len, (PAGE_CACHE_SIZE - offset)); 3504 kaddr = kmap_atomic(page, KM_USER1); 3505 memcpy(dst, kaddr + offset, cur); 3506 kunmap_atomic(kaddr, KM_USER1); 3507 3508 dst += cur; 3509 len -= cur; 3510 offset = 0; 3511 i++; 3512 } 3513 } 3514 3515 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, 3516 unsigned long min_len, char **token, char **map, 3517 unsigned long *map_start, 3518 unsigned long *map_len, int km) 3519 { 3520 size_t offset = start & (PAGE_CACHE_SIZE - 1); 3521 char *kaddr; 3522 struct page *p; 3523 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3524 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3525 unsigned long end_i = (start_offset + start + min_len - 1) >> 3526 PAGE_CACHE_SHIFT; 3527 3528 if (i != end_i) 3529 return -EINVAL; 3530 3531 if (i == 0) { 3532 offset = start_offset; 3533 *map_start = 0; 3534 } else { 3535 offset = 0; 3536 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset; 3537 } 3538 3539 if (start + min_len > eb->len) { 3540 printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " 3541 "wanted %lu %lu\n", (unsigned long long)eb->start, 3542 eb->len, start, min_len); 3543 WARN_ON(1); 3544 } 3545 3546 p = extent_buffer_page(eb, i); 3547 kaddr = kmap_atomic(p, km); 3548 *token = kaddr; 3549 *map = kaddr + offset; 3550 *map_len = PAGE_CACHE_SIZE - offset; 3551 return 0; 3552 } 3553 3554 int map_extent_buffer(struct extent_buffer *eb, unsigned long start, 3555 unsigned long min_len, 3556 char **token, char **map, 3557 
unsigned long *map_start, 3558 unsigned long *map_len, int km) 3559 { 3560 int err; 3561 int save = 0; 3562 if (eb->map_token) { 3563 unmap_extent_buffer(eb, eb->map_token, km); 3564 eb->map_token = NULL; 3565 save = 1; 3566 } 3567 err = map_private_extent_buffer(eb, start, min_len, token, map, 3568 map_start, map_len, km); 3569 if (!err && save) { 3570 eb->map_token = *token; 3571 eb->kaddr = *map; 3572 eb->map_start = *map_start; 3573 eb->map_len = *map_len; 3574 } 3575 return err; 3576 } 3577 3578 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km) 3579 { 3580 kunmap_atomic(token, km); 3581 } 3582 3583 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, 3584 unsigned long start, 3585 unsigned long len) 3586 { 3587 size_t cur; 3588 size_t offset; 3589 struct page *page; 3590 char *kaddr; 3591 char *ptr = (char *)ptrv; 3592 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3593 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3594 int ret = 0; 3595 3596 WARN_ON(start > eb->len); 3597 WARN_ON(start + len > eb->start + eb->len); 3598 3599 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 3600 3601 while (len > 0) { 3602 page = extent_buffer_page(eb, i); 3603 3604 cur = min(len, (PAGE_CACHE_SIZE - offset)); 3605 3606 kaddr = kmap_atomic(page, KM_USER0); 3607 ret = memcmp(ptr, kaddr + offset, cur); 3608 kunmap_atomic(kaddr, KM_USER0); 3609 if (ret) 3610 break; 3611 3612 ptr += cur; 3613 len -= cur; 3614 offset = 0; 3615 i++; 3616 } 3617 return ret; 3618 } 3619 3620 void write_extent_buffer(struct extent_buffer *eb, const void *srcv, 3621 unsigned long start, unsigned long len) 3622 { 3623 size_t cur; 3624 size_t offset; 3625 struct page *page; 3626 char *kaddr; 3627 char *src = (char *)srcv; 3628 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3629 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3630 3631 WARN_ON(start > eb->len); 3632 WARN_ON(start + len > eb->start + eb->len); 3633 3634 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 3635 3636 while (len > 0) { 3637 page = extent_buffer_page(eb, i); 3638 WARN_ON(!PageUptodate(page)); 3639 3640 cur = min(len, PAGE_CACHE_SIZE - offset); 3641 kaddr = kmap_atomic(page, KM_USER1); 3642 memcpy(kaddr + offset, src, cur); 3643 kunmap_atomic(kaddr, KM_USER1); 3644 3645 src += cur; 3646 len -= cur; 3647 offset = 0; 3648 i++; 3649 } 3650 } 3651 3652 void memset_extent_buffer(struct extent_buffer *eb, char c, 3653 unsigned long start, unsigned long len) 3654 { 3655 size_t cur; 3656 size_t offset; 3657 struct page *page; 3658 char *kaddr; 3659 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3660 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3661 3662 WARN_ON(start > eb->len); 3663 WARN_ON(start + len > eb->start + eb->len); 3664 3665 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 3666 3667 while (len > 0) { 3668 page = extent_buffer_page(eb, i); 3669 WARN_ON(!PageUptodate(page)); 3670 3671 cur = min(len, PAGE_CACHE_SIZE - offset); 3672 kaddr = kmap_atomic(page, KM_USER0); 3673 memset(kaddr + offset, c, cur); 3674 kunmap_atomic(kaddr, KM_USER0); 3675 3676 len -= cur; 3677 offset = 0; 3678 i++; 3679 } 3680 } 3681 3682 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, 3683 unsigned long dst_offset, unsigned long src_offset, 3684 unsigned long len) 3685 { 3686 u64 dst_len = dst->len; 3687 size_t cur; 3688 size_t offset; 3689 struct page 
*page; 3690 char *kaddr; 3691 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); 3692 unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; 3693 3694 WARN_ON(src->len != dst_len); 3695 3696 offset = (start_offset + dst_offset) & 3697 ((unsigned long)PAGE_CACHE_SIZE - 1); 3698 3699 while (len > 0) { 3700 page = extent_buffer_page(dst, i); 3701 WARN_ON(!PageUptodate(page)); 3702 3703 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); 3704 3705 kaddr = kmap_atomic(page, KM_USER0); 3706 read_extent_buffer(src, kaddr + offset, src_offset, cur); 3707 kunmap_atomic(kaddr, KM_USER0); 3708 3709 src_offset += cur; 3710 len -= cur; 3711 offset = 0; 3712 i++; 3713 } 3714 } 3715 3716 static void move_pages(struct page *dst_page, struct page *src_page, 3717 unsigned long dst_off, unsigned long src_off, 3718 unsigned long len) 3719 { 3720 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); 3721 if (dst_page == src_page) { 3722 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); 3723 } else { 3724 char *src_kaddr = kmap_atomic(src_page, KM_USER1); 3725 char *p = dst_kaddr + dst_off + len; 3726 char *s = src_kaddr + src_off + len; 3727 3728 while (len--) 3729 *--p = *--s; 3730 3731 kunmap_atomic(src_kaddr, KM_USER1); 3732 } 3733 kunmap_atomic(dst_kaddr, KM_USER0); 3734 } 3735 3736 static void copy_pages(struct page *dst_page, struct page *src_page, 3737 unsigned long dst_off, unsigned long src_off, 3738 unsigned long len) 3739 { 3740 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); 3741 char *src_kaddr; 3742 3743 if (dst_page != src_page) 3744 src_kaddr = kmap_atomic(src_page, KM_USER1); 3745 else 3746 src_kaddr = dst_kaddr; 3747 3748 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); 3749 kunmap_atomic(dst_kaddr, KM_USER0); 3750 if (dst_page != src_page) 3751 kunmap_atomic(src_kaddr, KM_USER1); 3752 } 3753 3754 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 3755 unsigned long src_offset, unsigned long len) 3756 { 3757 size_t cur; 3758 size_t dst_off_in_page; 3759 size_t src_off_in_page; 3760 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); 3761 unsigned long dst_i; 3762 unsigned long src_i; 3763 3764 if (src_offset + len > dst->len) { 3765 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " 3766 "len %lu dst len %lu\n", src_offset, len, dst->len); 3767 BUG_ON(1); 3768 } 3769 if (dst_offset + len > dst->len) { 3770 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " 3771 "len %lu dst len %lu\n", dst_offset, len, dst->len); 3772 BUG_ON(1); 3773 } 3774 3775 while (len > 0) { 3776 dst_off_in_page = (start_offset + dst_offset) & 3777 ((unsigned long)PAGE_CACHE_SIZE - 1); 3778 src_off_in_page = (start_offset + src_offset) & 3779 ((unsigned long)PAGE_CACHE_SIZE - 1); 3780 3781 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; 3782 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT; 3783 3784 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - 3785 src_off_in_page)); 3786 cur = min_t(unsigned long, cur, 3787 (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page)); 3788 3789 copy_pages(extent_buffer_page(dst, dst_i), 3790 extent_buffer_page(dst, src_i), 3791 dst_off_in_page, src_off_in_page, cur); 3792 3793 src_offset += cur; 3794 dst_offset += cur; 3795 len -= cur; 3796 } 3797 } 3798 3799 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 3800 unsigned long src_offset, unsigned long len) 3801 { 3802 size_t cur; 3803 size_t dst_off_in_page; 3804 size_t src_off_in_page; 3805 
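	/*
	 * dst_end and src_end index the last byte of each range; when the
	 * destination sits above the source the copy below runs back to
	 * front so overlapping ranges behave like memmove()
	 */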
unsigned long dst_end = dst_offset + len - 1; 3806 unsigned long src_end = src_offset + len - 1; 3807 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); 3808 unsigned long dst_i; 3809 unsigned long src_i; 3810 3811 if (src_offset + len > dst->len) { 3812 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " 3813 "len %lu len %lu\n", src_offset, len, dst->len); 3814 BUG_ON(1); 3815 } 3816 if (dst_offset + len > dst->len) { 3817 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " 3818 "len %lu len %lu\n", dst_offset, len, dst->len); 3819 BUG_ON(1); 3820 } 3821 if (dst_offset < src_offset) { 3822 memcpy_extent_buffer(dst, dst_offset, src_offset, len); 3823 return; 3824 } 3825 while (len > 0) { 3826 dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT; 3827 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT; 3828 3829 dst_off_in_page = (start_offset + dst_end) & 3830 ((unsigned long)PAGE_CACHE_SIZE - 1); 3831 src_off_in_page = (start_offset + src_end) & 3832 ((unsigned long)PAGE_CACHE_SIZE - 1); 3833 3834 cur = min_t(unsigned long, len, src_off_in_page + 1); 3835 cur = min(cur, dst_off_in_page + 1); 3836 move_pages(extent_buffer_page(dst, dst_i), 3837 extent_buffer_page(dst, src_i), 3838 dst_off_in_page - cur + 1, 3839 src_off_in_page - cur + 1, cur); 3840 3841 dst_end -= cur; 3842 src_end -= cur; 3843 len -= cur; 3844 } 3845 } 3846 3847 int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) 3848 { 3849 u64 start = page_offset(page); 3850 struct extent_buffer *eb; 3851 int ret = 1; 3852 unsigned long i; 3853 unsigned long num_pages; 3854 3855 spin_lock(&tree->buffer_lock); 3856 eb = buffer_search(tree, start); 3857 if (!eb) 3858 goto out; 3859 3860 if (atomic_read(&eb->refs) > 1) { 3861 ret = 0; 3862 goto out; 3863 } 3864 if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3865 ret = 0; 3866 goto out; 3867 } 3868 /* at this point we can safely release the extent buffer */ 3869 num_pages = num_extent_pages(eb->start, eb->len); 3870 for (i = 0; i < num_pages; i++) 3871 page_cache_release(extent_buffer_page(eb, i)); 3872 rb_erase(&eb->rb_node, &tree->buffer); 3873 __free_extent_buffer(eb); 3874 out: 3875 spin_unlock(&tree->buffer_lock); 3876 return ret; 3877 } 3878
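/*
 * Illustrative sketch (not built): one common calling pattern for the
 * extent buffer helpers above.  A caller takes a reference on a buffer
 * covering a metadata block, reads its pages in, copies bytes out with
 * read_extent_buffer() and then drops its reference.  The function and
 * callback names ("example_read_block", "example_get_extent") are
 * placeholders for whatever the caller already has; this is a sketch of
 * the API, not a definitive in-tree user.
 */
#if 0
static int example_read_block(struct extent_io_tree *tree, u64 start,
			      unsigned long len,
			      get_extent_t *example_get_extent,
			      void *dst, unsigned long offset,
			      unsigned long bytes)
{
	struct extent_buffer *eb;
	int ret;

	/* find or create an extent buffer spanning [start, start + len) */
	eb = alloc_extent_buffer(tree, start, len, NULL, GFP_NOFS);
	if (!eb)
		return -ENOMEM;

	/* read every page of the buffer and wait for it to become uptodate */
	ret = read_extent_buffer_pages(tree, eb, 0, 1, example_get_extent, 0);
	if (ret)
		goto out;

	/* copy bytes out of the (possibly multi-page) buffer */
	read_extent_buffer(eb, dst, offset, bytes);
out:
	/* drop our reference; the tree still holds its own */
	free_extent_buffer(eb);
	return ret;
}
#endif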