#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include "extent_io.h"
#include "extent_map.h"
#include "compat.h"
#include "ctree.h"
#include "btrfs_inode.h"

static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;

static LIST_HEAD(buffers);
static LIST_HEAD(states);

#define LEAK_DEBUG 0
#if LEAK_DEBUG
static DEFINE_SPINLOCK(leak_lock);
#endif

#define BUFFER_LRU_MAX 64

struct tree_entry {
	u64 start;
	u64 end;
	struct rb_node rb_node;
};

struct extent_page_data {
	struct bio *bio;
	struct extent_io_tree *tree;
	get_extent_t *get_extent;

	/* tells writepage not to lock the state bits for this range
	 * it still does the unlocking
	 */
	unsigned int extent_locked:1;

	/* tells the submit_bio code to use a WRITE_SYNC */
	unsigned int sync_io:1;
};

int __init extent_io_init(void)
{
	extent_state_cache = kmem_cache_create("extent_state",
			sizeof(struct extent_state), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
	if (!extent_state_cache)
		return -ENOMEM;

	extent_buffer_cache = kmem_cache_create("extent_buffers",
			sizeof(struct extent_buffer), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
	if (!extent_buffer_cache)
		goto free_state_cache;
	return 0;

free_state_cache:
	kmem_cache_destroy(extent_state_cache);
	return -ENOMEM;
}

void extent_io_exit(void)
{
	struct extent_state *state;
	struct extent_buffer *eb;

	while (!list_empty(&states)) {
		state = list_entry(states.next, struct extent_state, leak_list);
		printk(KERN_ERR "btrfs state leak: start %llu end %llu "
		       "state %lu in tree %p refs %d\n",
		       (unsigned long long)state->start,
		       (unsigned long long)state->end,
		       state->state, state->tree, atomic_read(&state->refs));
		list_del(&state->leak_list);
		kmem_cache_free(extent_state_cache, state);

	}

	while (!list_empty(&buffers)) {
		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
		printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
		       "refs %d\n", (unsigned long long)eb->start,
		       eb->len, atomic_read(&eb->refs));
		list_del(&eb->leak_list);
		kmem_cache_free(extent_buffer_cache, eb);
	}
	if (extent_state_cache)
		kmem_cache_destroy(extent_state_cache);
	if (extent_buffer_cache)
		kmem_cache_destroy(extent_buffer_cache);
}

void extent_io_tree_init(struct extent_io_tree *tree,
			 struct address_space *mapping, gfp_t mask)
{
	tree->state = RB_ROOT;
	INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
	tree->ops = NULL;
	tree->dirty_bytes = 0;
	spin_lock_init(&tree->lock);
	spin_lock_init(&tree->buffer_lock);
	tree->mapping = mapping;
}

static struct extent_state *alloc_extent_state(gfp_t mask)
{
	struct extent_state *state;
#if LEAK_DEBUG
	unsigned long flags;
#endif

	state = kmem_cache_alloc(extent_state_cache, mask);
	if (!state)
		return state;
	state->state = 0;
	state->private = 0;
	state->tree = NULL;
#if LEAK_DEBUG
	spin_lock_irqsave(&leak_lock, flags);
	list_add(&state->leak_list, &states);
	spin_unlock_irqrestore(&leak_lock, flags);
#endif
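	/*
	 * each caller starts with one reference; free_extent_state()
	 * drops it and frees the struct once the count reaches zero
	 */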
	atomic_set(&state->refs, 1);
	init_waitqueue_head(&state->wq);
	return state;
}

void free_extent_state(struct extent_state *state)
{
	if (!state)
		return;
	if (atomic_dec_and_test(&state->refs)) {
#if LEAK_DEBUG
		unsigned long flags;
#endif
		WARN_ON(state->tree);
#if LEAK_DEBUG
		spin_lock_irqsave(&leak_lock, flags);
		list_del(&state->leak_list);
		spin_unlock_irqrestore(&leak_lock, flags);
#endif
		kmem_cache_free(extent_state_cache, state);
	}
}

static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
				   struct rb_node *node)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct tree_entry *entry;

	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct tree_entry, rb_node);

		if (offset < entry->start)
			p = &(*p)->rb_left;
		else if (offset > entry->end)
			p = &(*p)->rb_right;
		else
			return parent;
	}

	entry = rb_entry(node, struct tree_entry, rb_node);
	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
				      struct rb_node **prev_ret,
				      struct rb_node **next_ret)
{
	struct rb_root *root = &tree->state;
	struct rb_node *n = root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *orig_prev = NULL;
	struct tree_entry *entry;
	struct tree_entry *prev_entry = NULL;

	while (n) {
		entry = rb_entry(n, struct tree_entry, rb_node);
		prev = n;
		prev_entry = entry;

		if (offset < entry->start)
			n = n->rb_left;
		else if (offset > entry->end)
			n = n->rb_right;
		else
			return n;
	}

	if (prev_ret) {
		orig_prev = prev;
		while (prev && offset > prev_entry->end) {
			prev = rb_next(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*prev_ret = prev;
		prev = orig_prev;
	}

	if (next_ret) {
		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		while (prev && offset < prev_entry->start) {
			prev = rb_prev(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*next_ret = prev;
	}
	return NULL;
}

static inline struct rb_node *tree_search(struct extent_io_tree *tree,
					  u64 offset)
{
	struct rb_node *prev = NULL;
	struct rb_node *ret;

	ret = __etree_search(tree, offset, &prev, NULL);
	if (!ret)
		return prev;
	return ret;
}

static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
		     struct extent_state *other)
{
	if (tree->ops && tree->ops->merge_extent_hook)
		tree->ops->merge_extent_hook(tree->mapping->host, new,
					     other);
}

/*
 * utility function to look for merge candidates inside a given range.
 * Any extents with matching state are merged together into a single
 * extent in the tree.  Extents with EXTENT_IO in their state field
 * are not merged because the end_io handlers need to be able to do
 * operations on them without sleeping (or doing allocations/splits).
 *
 * This should be called with the tree lock held.
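 *
 * For example, two adjacent states [0, 4095] and [4096, 8191] carrying
 * identical state bits collapse into a single [0, 8191] state.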
 */
static int merge_state(struct extent_io_tree *tree,
		       struct extent_state *state)
{
	struct extent_state *other;
	struct rb_node *other_node;

	if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
		return 0;

	other_node = rb_prev(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->end == state->start - 1 &&
		    other->state == state->state) {
			merge_cb(tree, state, other);
			state->start = other->start;
			other->tree = NULL;
			rb_erase(&other->rb_node, &tree->state);
			free_extent_state(other);
		}
	}
	other_node = rb_next(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->start == state->end + 1 &&
		    other->state == state->state) {
			merge_cb(tree, state, other);
			other->start = state->start;
			state->tree = NULL;
			rb_erase(&state->rb_node, &tree->state);
			free_extent_state(state);
			state = NULL;
		}
	}

	return 0;
}

static int set_state_cb(struct extent_io_tree *tree,
			struct extent_state *state, int *bits)
{
	if (tree->ops && tree->ops->set_bit_hook) {
		return tree->ops->set_bit_hook(tree->mapping->host,
					       state, bits);
	}

	return 0;
}

static void clear_state_cb(struct extent_io_tree *tree,
			   struct extent_state *state, int *bits)
{
	if (tree->ops && tree->ops->clear_bit_hook)
		tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
}

/*
 * insert an extent_state struct into the tree.  'bits' are set on the
 * struct before it is inserted.
 *
 * This may return -EEXIST if the extent is already there, in which case the
 * state struct is freed.
 *
 * The tree lock is not taken internally.  This is a utility function and
 * probably isn't what you want to call (see set/clear_extent_bit).
 */
static int insert_state(struct extent_io_tree *tree,
			struct extent_state *state, u64 start, u64 end,
			int *bits)
{
	struct rb_node *node;
	int bits_to_set = *bits & ~EXTENT_CTLBITS;
	int ret;

	if (end < start) {
		printk(KERN_ERR "btrfs end < start %llu %llu\n",
		       (unsigned long long)end,
		       (unsigned long long)start);
		WARN_ON(1);
	}
	state->start = start;
	state->end = end;
	ret = set_state_cb(tree, state, bits);
	if (ret)
		return ret;

	if (bits_to_set & EXTENT_DIRTY)
		tree->dirty_bytes += end - start + 1;
	state->state |= bits_to_set;
	node = tree_insert(&tree->state, end, &state->rb_node);
	if (node) {
		struct extent_state *found;
		found = rb_entry(node, struct extent_state, rb_node);
		printk(KERN_ERR "btrfs found node %llu %llu on insert of "
		       "%llu %llu\n", (unsigned long long)found->start,
		       (unsigned long long)found->end,
		       (unsigned long long)start, (unsigned long long)end);
		free_extent_state(state);
		return -EEXIST;
	}
	state->tree = tree;
	merge_state(tree, state);
	return 0;
}

static int split_cb(struct extent_io_tree *tree, struct extent_state *orig,
		    u64 split)
{
	if (tree->ops && tree->ops->split_extent_hook)
		return tree->ops->split_extent_hook(tree->mapping->host,
						    orig, split);
	return 0;
}

/*
 * split a given extent state struct in two, inserting the preallocated
 * struct 'prealloc' as the newly created second half.  'split' indicates an
 * offset inside 'orig' where it should be split.
 *
 * Before calling,
 * the tree has 'orig' at [orig->start, orig->end].  After calling, there
 * are two extent state structs in the tree:
 * prealloc: [orig->start, split - 1]
 * orig: [ split, orig->end ]
 *
 * The tree locks are not taken by this function.  They need to be held
 * by the caller.
 */
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
		       struct extent_state *prealloc, u64 split)
{
	struct rb_node *node;

	split_cb(tree, orig, split);

	prealloc->start = orig->start;
	prealloc->end = split - 1;
	prealloc->state = orig->state;
	orig->start = split;

	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
	if (node) {
		free_extent_state(prealloc);
		return -EEXIST;
	}
	prealloc->tree = tree;
	return 0;
}

/*
 * utility function to clear some bits in an extent state struct.
 * it will optionally wake up any one waiting on this state (wake == 1), or
 * forcibly remove the state from the tree (delete == 1).
 *
 * If no bits are set on the state struct after clearing things, the
 * struct is freed and removed from the tree
 */
static int clear_state_bit(struct extent_io_tree *tree,
			   struct extent_state *state,
			   int *bits, int wake)
{
	int bits_to_clear = *bits & ~EXTENT_CTLBITS;
	int ret = state->state & bits_to_clear;

	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		WARN_ON(range > tree->dirty_bytes);
		tree->dirty_bytes -= range;
	}
	clear_state_cb(tree, state, bits);
	state->state &= ~bits_to_clear;
	if (wake)
		wake_up(&state->wq);
	if (state->state == 0) {
		if (state->tree) {
			rb_erase(&state->rb_node, &tree->state);
			state->tree = NULL;
			free_extent_state(state);
		} else {
			WARN_ON(1);
		}
	} else {
		merge_state(tree, state);
	}
	return ret;
}

/*
 * clear some bits on a range in the tree.  This may require splitting
 * or inserting elements in the tree, so the gfp mask is used to
 * indicate which allocations or sleeping are allowed.
 *
 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 * the given range from the tree regardless of state (ie for truncate).
 *
 * the range [start, end] is inclusive.
 *
 * This takes the tree lock, and returns < 0 on error, > 0 if any of the
 * bits were already set, or zero if none of the bits were already set.
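 *
 * A typical call mirrors the wrappers further down, for example:
 *
 *	clear_extent_bit(tree, start, end,
 *			 EXTENT_DIRTY | EXTENT_DELALLOC |
 *			 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);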
454 */ 455 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 456 int bits, int wake, int delete, 457 struct extent_state **cached_state, 458 gfp_t mask) 459 { 460 struct extent_state *state; 461 struct extent_state *cached; 462 struct extent_state *prealloc = NULL; 463 struct rb_node *next_node; 464 struct rb_node *node; 465 u64 last_end; 466 int err; 467 int set = 0; 468 int clear = 0; 469 470 if (delete) 471 bits |= ~EXTENT_CTLBITS; 472 bits |= EXTENT_FIRST_DELALLOC; 473 474 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 475 clear = 1; 476 again: 477 if (!prealloc && (mask & __GFP_WAIT)) { 478 prealloc = alloc_extent_state(mask); 479 if (!prealloc) 480 return -ENOMEM; 481 } 482 483 spin_lock(&tree->lock); 484 if (cached_state) { 485 cached = *cached_state; 486 487 if (clear) { 488 *cached_state = NULL; 489 cached_state = NULL; 490 } 491 492 if (cached && cached->tree && cached->start == start) { 493 if (clear) 494 atomic_dec(&cached->refs); 495 state = cached; 496 goto hit_next; 497 } 498 if (clear) 499 free_extent_state(cached); 500 } 501 /* 502 * this search will find the extents that end after 503 * our range starts 504 */ 505 node = tree_search(tree, start); 506 if (!node) 507 goto out; 508 state = rb_entry(node, struct extent_state, rb_node); 509 hit_next: 510 if (state->start > end) 511 goto out; 512 WARN_ON(state->end < start); 513 last_end = state->end; 514 515 /* 516 * | ---- desired range ---- | 517 * | state | or 518 * | ------------- state -------------- | 519 * 520 * We need to split the extent we found, and may flip 521 * bits on second half. 522 * 523 * If the extent we found extends past our range, we 524 * just split and search again. It'll get split again 525 * the next time though. 526 * 527 * If the extent we found is inside our range, we clear 528 * the desired bit on it. 
529 */ 530 531 if (state->start < start) { 532 if (!prealloc) 533 prealloc = alloc_extent_state(GFP_ATOMIC); 534 err = split_state(tree, state, prealloc, start); 535 BUG_ON(err == -EEXIST); 536 prealloc = NULL; 537 if (err) 538 goto out; 539 if (state->end <= end) { 540 set |= clear_state_bit(tree, state, &bits, wake); 541 if (last_end == (u64)-1) 542 goto out; 543 start = last_end + 1; 544 } 545 goto search_again; 546 } 547 /* 548 * | ---- desired range ---- | 549 * | state | 550 * We need to split the extent, and clear the bit 551 * on the first half 552 */ 553 if (state->start <= end && state->end > end) { 554 if (!prealloc) 555 prealloc = alloc_extent_state(GFP_ATOMIC); 556 err = split_state(tree, state, prealloc, end + 1); 557 BUG_ON(err == -EEXIST); 558 if (wake) 559 wake_up(&state->wq); 560 561 set |= clear_state_bit(tree, prealloc, &bits, wake); 562 563 prealloc = NULL; 564 goto out; 565 } 566 567 if (state->end < end && prealloc && !need_resched()) 568 next_node = rb_next(&state->rb_node); 569 else 570 next_node = NULL; 571 572 set |= clear_state_bit(tree, state, &bits, wake); 573 if (last_end == (u64)-1) 574 goto out; 575 start = last_end + 1; 576 if (start <= end && next_node) { 577 state = rb_entry(next_node, struct extent_state, 578 rb_node); 579 if (state->start == start) 580 goto hit_next; 581 } 582 goto search_again; 583 584 out: 585 spin_unlock(&tree->lock); 586 if (prealloc) 587 free_extent_state(prealloc); 588 589 return set; 590 591 search_again: 592 if (start > end) 593 goto out; 594 spin_unlock(&tree->lock); 595 if (mask & __GFP_WAIT) 596 cond_resched(); 597 goto again; 598 } 599 600 static int wait_on_state(struct extent_io_tree *tree, 601 struct extent_state *state) 602 __releases(tree->lock) 603 __acquires(tree->lock) 604 { 605 DEFINE_WAIT(wait); 606 prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); 607 spin_unlock(&tree->lock); 608 schedule(); 609 spin_lock(&tree->lock); 610 finish_wait(&state->wq, &wait); 611 return 0; 612 } 613 614 /* 615 * waits for one or more bits to clear on a range in the state tree. 616 * The range [start, end] is inclusive. 
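 * (wait_on_extent_writeback() below is a thin wrapper around this that
 * waits for EXTENT_WRITEBACK to clear.)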
617 * The tree lock is taken by this function 618 */ 619 int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits) 620 { 621 struct extent_state *state; 622 struct rb_node *node; 623 624 spin_lock(&tree->lock); 625 again: 626 while (1) { 627 /* 628 * this search will find all the extents that end after 629 * our range starts 630 */ 631 node = tree_search(tree, start); 632 if (!node) 633 break; 634 635 state = rb_entry(node, struct extent_state, rb_node); 636 637 if (state->start > end) 638 goto out; 639 640 if (state->state & bits) { 641 start = state->start; 642 atomic_inc(&state->refs); 643 wait_on_state(tree, state); 644 free_extent_state(state); 645 goto again; 646 } 647 start = state->end + 1; 648 649 if (start > end) 650 break; 651 652 if (need_resched()) { 653 spin_unlock(&tree->lock); 654 cond_resched(); 655 spin_lock(&tree->lock); 656 } 657 } 658 out: 659 spin_unlock(&tree->lock); 660 return 0; 661 } 662 663 static int set_state_bits(struct extent_io_tree *tree, 664 struct extent_state *state, 665 int *bits) 666 { 667 int ret; 668 int bits_to_set = *bits & ~EXTENT_CTLBITS; 669 670 ret = set_state_cb(tree, state, bits); 671 if (ret) 672 return ret; 673 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { 674 u64 range = state->end - state->start + 1; 675 tree->dirty_bytes += range; 676 } 677 state->state |= bits_to_set; 678 679 return 0; 680 } 681 682 static void cache_state(struct extent_state *state, 683 struct extent_state **cached_ptr) 684 { 685 if (cached_ptr && !(*cached_ptr)) { 686 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) { 687 *cached_ptr = state; 688 atomic_inc(&state->refs); 689 } 690 } 691 } 692 693 /* 694 * set some bits on a range in the tree. This may require allocations or 695 * sleeping, so the gfp mask is used to indicate what is allowed. 696 * 697 * If any of the exclusive bits are set, this will fail with -EEXIST if some 698 * part of the range already has the desired bits set. The start of the 699 * existing range is returned in failed_start in this case. 700 * 701 * [start, end] is inclusive This takes the tree lock. 702 */ 703 704 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 705 int bits, int exclusive_bits, u64 *failed_start, 706 struct extent_state **cached_state, gfp_t mask) 707 { 708 struct extent_state *state; 709 struct extent_state *prealloc = NULL; 710 struct rb_node *node; 711 int err = 0; 712 u64 last_start; 713 u64 last_end; 714 715 bits |= EXTENT_FIRST_DELALLOC; 716 again: 717 if (!prealloc && (mask & __GFP_WAIT)) { 718 prealloc = alloc_extent_state(mask); 719 if (!prealloc) 720 return -ENOMEM; 721 } 722 723 spin_lock(&tree->lock); 724 if (cached_state && *cached_state) { 725 state = *cached_state; 726 if (state->start == start && state->tree) { 727 node = &state->rb_node; 728 goto hit_next; 729 } 730 } 731 /* 732 * this search will find all the extents that end after 733 * our range starts. 
734 */ 735 node = tree_search(tree, start); 736 if (!node) { 737 err = insert_state(tree, prealloc, start, end, &bits); 738 prealloc = NULL; 739 BUG_ON(err == -EEXIST); 740 goto out; 741 } 742 state = rb_entry(node, struct extent_state, rb_node); 743 hit_next: 744 last_start = state->start; 745 last_end = state->end; 746 747 /* 748 * | ---- desired range ---- | 749 * | state | 750 * 751 * Just lock what we found and keep going 752 */ 753 if (state->start == start && state->end <= end) { 754 struct rb_node *next_node; 755 if (state->state & exclusive_bits) { 756 *failed_start = state->start; 757 err = -EEXIST; 758 goto out; 759 } 760 761 err = set_state_bits(tree, state, &bits); 762 if (err) 763 goto out; 764 765 cache_state(state, cached_state); 766 merge_state(tree, state); 767 if (last_end == (u64)-1) 768 goto out; 769 770 start = last_end + 1; 771 if (start < end && prealloc && !need_resched()) { 772 next_node = rb_next(node); 773 if (next_node) { 774 state = rb_entry(next_node, struct extent_state, 775 rb_node); 776 if (state->start == start) 777 goto hit_next; 778 } 779 } 780 goto search_again; 781 } 782 783 /* 784 * | ---- desired range ---- | 785 * | state | 786 * or 787 * | ------------- state -------------- | 788 * 789 * We need to split the extent we found, and may flip bits on 790 * second half. 791 * 792 * If the extent we found extends past our 793 * range, we just split and search again. It'll get split 794 * again the next time though. 795 * 796 * If the extent we found is inside our range, we set the 797 * desired bit on it. 798 */ 799 if (state->start < start) { 800 if (state->state & exclusive_bits) { 801 *failed_start = start; 802 err = -EEXIST; 803 goto out; 804 } 805 err = split_state(tree, state, prealloc, start); 806 BUG_ON(err == -EEXIST); 807 prealloc = NULL; 808 if (err) 809 goto out; 810 if (state->end <= end) { 811 err = set_state_bits(tree, state, &bits); 812 if (err) 813 goto out; 814 cache_state(state, cached_state); 815 merge_state(tree, state); 816 if (last_end == (u64)-1) 817 goto out; 818 start = last_end + 1; 819 } 820 goto search_again; 821 } 822 /* 823 * | ---- desired range ---- | 824 * | state | or | state | 825 * 826 * There's a hole, we need to insert something in it and 827 * ignore the extent we found. 
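 *
 * For example, with a desired range of [0, 8191] and a found state that
 * starts at 4096, a new state covering [0, 4095] is inserted first and
 * the search then continues from 4096.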
828 */ 829 if (state->start > start) { 830 u64 this_end; 831 if (end < last_start) 832 this_end = end; 833 else 834 this_end = last_start - 1; 835 err = insert_state(tree, prealloc, start, this_end, 836 &bits); 837 BUG_ON(err == -EEXIST); 838 if (err) { 839 prealloc = NULL; 840 goto out; 841 } 842 cache_state(prealloc, cached_state); 843 prealloc = NULL; 844 start = this_end + 1; 845 goto search_again; 846 } 847 /* 848 * | ---- desired range ---- | 849 * | state | 850 * We need to split the extent, and set the bit 851 * on the first half 852 */ 853 if (state->start <= end && state->end > end) { 854 if (state->state & exclusive_bits) { 855 *failed_start = start; 856 err = -EEXIST; 857 goto out; 858 } 859 err = split_state(tree, state, prealloc, end + 1); 860 BUG_ON(err == -EEXIST); 861 862 err = set_state_bits(tree, prealloc, &bits); 863 if (err) { 864 prealloc = NULL; 865 goto out; 866 } 867 cache_state(prealloc, cached_state); 868 merge_state(tree, prealloc); 869 prealloc = NULL; 870 goto out; 871 } 872 873 goto search_again; 874 875 out: 876 spin_unlock(&tree->lock); 877 if (prealloc) 878 free_extent_state(prealloc); 879 880 return err; 881 882 search_again: 883 if (start > end) 884 goto out; 885 spin_unlock(&tree->lock); 886 if (mask & __GFP_WAIT) 887 cond_resched(); 888 goto again; 889 } 890 891 /* wrappers around set/clear extent bit */ 892 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 893 gfp_t mask) 894 { 895 return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, 896 NULL, mask); 897 } 898 899 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 900 int bits, gfp_t mask) 901 { 902 return set_extent_bit(tree, start, end, bits, 0, NULL, 903 NULL, mask); 904 } 905 906 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 907 int bits, gfp_t mask) 908 { 909 return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); 910 } 911 912 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 913 struct extent_state **cached_state, gfp_t mask) 914 { 915 return set_extent_bit(tree, start, end, 916 EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, 917 0, NULL, cached_state, mask); 918 } 919 920 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 921 gfp_t mask) 922 { 923 return clear_extent_bit(tree, start, end, 924 EXTENT_DIRTY | EXTENT_DELALLOC | 925 EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); 926 } 927 928 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 929 gfp_t mask) 930 { 931 return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, 932 NULL, mask); 933 } 934 935 static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 936 gfp_t mask) 937 { 938 return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, 939 NULL, mask); 940 } 941 942 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 943 gfp_t mask) 944 { 945 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, 946 NULL, mask); 947 } 948 949 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, 950 u64 end, struct extent_state **cached_state, 951 gfp_t mask) 952 { 953 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, 954 cached_state, mask); 955 } 956 957 int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) 958 { 959 return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK); 960 } 961 962 /* 963 * either insert or lock state struct between start and end use mask to tell 964 * us if waiting is desired. 
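 *
 * A minimal sketch of the usual pairing (GFP_NOFS is the mask most
 * callers in this file pass):
 *
 *	lock_extent(tree, start, end, GFP_NOFS);
 *	... operate on [start, end] ...
 *	unlock_extent(tree, start, end, GFP_NOFS);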
965 */ 966 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 967 int bits, struct extent_state **cached_state, gfp_t mask) 968 { 969 int err; 970 u64 failed_start; 971 while (1) { 972 err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, 973 EXTENT_LOCKED, &failed_start, 974 cached_state, mask); 975 if (err == -EEXIST && (mask & __GFP_WAIT)) { 976 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); 977 start = failed_start; 978 } else { 979 break; 980 } 981 WARN_ON(start > end); 982 } 983 return err; 984 } 985 986 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) 987 { 988 return lock_extent_bits(tree, start, end, 0, NULL, mask); 989 } 990 991 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, 992 gfp_t mask) 993 { 994 int err; 995 u64 failed_start; 996 997 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, 998 &failed_start, NULL, mask); 999 if (err == -EEXIST) { 1000 if (failed_start > start) 1001 clear_extent_bit(tree, start, failed_start - 1, 1002 EXTENT_LOCKED, 1, 0, NULL, mask); 1003 return 0; 1004 } 1005 return 1; 1006 } 1007 1008 int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, 1009 struct extent_state **cached, gfp_t mask) 1010 { 1011 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, 1012 mask); 1013 } 1014 1015 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, 1016 gfp_t mask) 1017 { 1018 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, 1019 mask); 1020 } 1021 1022 /* 1023 * helper function to set pages and extents in the tree dirty 1024 */ 1025 int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end) 1026 { 1027 unsigned long index = start >> PAGE_CACHE_SHIFT; 1028 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1029 struct page *page; 1030 1031 while (index <= end_index) { 1032 page = find_get_page(tree->mapping, index); 1033 BUG_ON(!page); 1034 __set_page_dirty_nobuffers(page); 1035 page_cache_release(page); 1036 index++; 1037 } 1038 return 0; 1039 } 1040 1041 /* 1042 * helper function to set both pages and extents in the tree writeback 1043 */ 1044 static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) 1045 { 1046 unsigned long index = start >> PAGE_CACHE_SHIFT; 1047 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1048 struct page *page; 1049 1050 while (index <= end_index) { 1051 page = find_get_page(tree->mapping, index); 1052 BUG_ON(!page); 1053 set_page_writeback(page); 1054 page_cache_release(page); 1055 index++; 1056 } 1057 return 0; 1058 } 1059 1060 /* 1061 * find the first offset in the io tree with 'bits' set. zero is 1062 * returned if we find something, and *start_ret and *end_ret are 1063 * set to reflect the state struct that was found. 1064 * 1065 * If nothing was found, 1 is returned, < 0 on error 1066 */ 1067 int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 1068 u64 *start_ret, u64 *end_ret, int bits) 1069 { 1070 struct rb_node *node; 1071 struct extent_state *state; 1072 int ret = 1; 1073 1074 spin_lock(&tree->lock); 1075 /* 1076 * this search will find all the extents that end after 1077 * our range starts. 
1078 */ 1079 node = tree_search(tree, start); 1080 if (!node) 1081 goto out; 1082 1083 while (1) { 1084 state = rb_entry(node, struct extent_state, rb_node); 1085 if (state->end >= start && (state->state & bits)) { 1086 *start_ret = state->start; 1087 *end_ret = state->end; 1088 ret = 0; 1089 break; 1090 } 1091 node = rb_next(node); 1092 if (!node) 1093 break; 1094 } 1095 out: 1096 spin_unlock(&tree->lock); 1097 return ret; 1098 } 1099 1100 /* find the first state struct with 'bits' set after 'start', and 1101 * return it. tree->lock must be held. NULL will returned if 1102 * nothing was found after 'start' 1103 */ 1104 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, 1105 u64 start, int bits) 1106 { 1107 struct rb_node *node; 1108 struct extent_state *state; 1109 1110 /* 1111 * this search will find all the extents that end after 1112 * our range starts. 1113 */ 1114 node = tree_search(tree, start); 1115 if (!node) 1116 goto out; 1117 1118 while (1) { 1119 state = rb_entry(node, struct extent_state, rb_node); 1120 if (state->end >= start && (state->state & bits)) 1121 return state; 1122 1123 node = rb_next(node); 1124 if (!node) 1125 break; 1126 } 1127 out: 1128 return NULL; 1129 } 1130 1131 /* 1132 * find a contiguous range of bytes in the file marked as delalloc, not 1133 * more than 'max_bytes'. start and end are used to return the range, 1134 * 1135 * 1 is returned if we find something, 0 if nothing was in the tree 1136 */ 1137 static noinline u64 find_delalloc_range(struct extent_io_tree *tree, 1138 u64 *start, u64 *end, u64 max_bytes, 1139 struct extent_state **cached_state) 1140 { 1141 struct rb_node *node; 1142 struct extent_state *state; 1143 u64 cur_start = *start; 1144 u64 found = 0; 1145 u64 total_bytes = 0; 1146 1147 spin_lock(&tree->lock); 1148 1149 /* 1150 * this search will find all the extents that end after 1151 * our range starts. 
1152 */ 1153 node = tree_search(tree, cur_start); 1154 if (!node) { 1155 if (!found) 1156 *end = (u64)-1; 1157 goto out; 1158 } 1159 1160 while (1) { 1161 state = rb_entry(node, struct extent_state, rb_node); 1162 if (found && (state->start != cur_start || 1163 (state->state & EXTENT_BOUNDARY))) { 1164 goto out; 1165 } 1166 if (!(state->state & EXTENT_DELALLOC)) { 1167 if (!found) 1168 *end = state->end; 1169 goto out; 1170 } 1171 if (!found) { 1172 *start = state->start; 1173 *cached_state = state; 1174 atomic_inc(&state->refs); 1175 } 1176 found++; 1177 *end = state->end; 1178 cur_start = state->end + 1; 1179 node = rb_next(node); 1180 if (!node) 1181 break; 1182 total_bytes += state->end - state->start + 1; 1183 if (total_bytes >= max_bytes) 1184 break; 1185 } 1186 out: 1187 spin_unlock(&tree->lock); 1188 return found; 1189 } 1190 1191 static noinline int __unlock_for_delalloc(struct inode *inode, 1192 struct page *locked_page, 1193 u64 start, u64 end) 1194 { 1195 int ret; 1196 struct page *pages[16]; 1197 unsigned long index = start >> PAGE_CACHE_SHIFT; 1198 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1199 unsigned long nr_pages = end_index - index + 1; 1200 int i; 1201 1202 if (index == locked_page->index && end_index == index) 1203 return 0; 1204 1205 while (nr_pages > 0) { 1206 ret = find_get_pages_contig(inode->i_mapping, index, 1207 min_t(unsigned long, nr_pages, 1208 ARRAY_SIZE(pages)), pages); 1209 for (i = 0; i < ret; i++) { 1210 if (pages[i] != locked_page) 1211 unlock_page(pages[i]); 1212 page_cache_release(pages[i]); 1213 } 1214 nr_pages -= ret; 1215 index += ret; 1216 cond_resched(); 1217 } 1218 return 0; 1219 } 1220 1221 static noinline int lock_delalloc_pages(struct inode *inode, 1222 struct page *locked_page, 1223 u64 delalloc_start, 1224 u64 delalloc_end) 1225 { 1226 unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT; 1227 unsigned long start_index = index; 1228 unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT; 1229 unsigned long pages_locked = 0; 1230 struct page *pages[16]; 1231 unsigned long nrpages; 1232 int ret; 1233 int i; 1234 1235 /* the caller is responsible for locking the start index */ 1236 if (index == locked_page->index && index == end_index) 1237 return 0; 1238 1239 /* skip the page at the start index */ 1240 nrpages = end_index - index + 1; 1241 while (nrpages > 0) { 1242 ret = find_get_pages_contig(inode->i_mapping, index, 1243 min_t(unsigned long, 1244 nrpages, ARRAY_SIZE(pages)), pages); 1245 if (ret == 0) { 1246 ret = -EAGAIN; 1247 goto done; 1248 } 1249 /* now we have an array of pages, lock them all */ 1250 for (i = 0; i < ret; i++) { 1251 /* 1252 * the caller is taking responsibility for 1253 * locked_page 1254 */ 1255 if (pages[i] != locked_page) { 1256 lock_page(pages[i]); 1257 if (!PageDirty(pages[i]) || 1258 pages[i]->mapping != inode->i_mapping) { 1259 ret = -EAGAIN; 1260 unlock_page(pages[i]); 1261 page_cache_release(pages[i]); 1262 goto done; 1263 } 1264 } 1265 page_cache_release(pages[i]); 1266 pages_locked++; 1267 } 1268 nrpages -= ret; 1269 index += ret; 1270 cond_resched(); 1271 } 1272 ret = 0; 1273 done: 1274 if (ret && pages_locked) { 1275 __unlock_for_delalloc(inode, locked_page, 1276 delalloc_start, 1277 ((u64)(start_index + pages_locked - 1)) << 1278 PAGE_CACHE_SHIFT); 1279 } 1280 return ret; 1281 } 1282 1283 /* 1284 * find a contiguous range of bytes in the file marked as delalloc, not 1285 * more than 'max_bytes'. 
start and end are used to return the range, 1286 * 1287 * 1 is returned if we find something, 0 if nothing was in the tree 1288 */ 1289 static noinline u64 find_lock_delalloc_range(struct inode *inode, 1290 struct extent_io_tree *tree, 1291 struct page *locked_page, 1292 u64 *start, u64 *end, 1293 u64 max_bytes) 1294 { 1295 u64 delalloc_start; 1296 u64 delalloc_end; 1297 u64 found; 1298 struct extent_state *cached_state = NULL; 1299 int ret; 1300 int loops = 0; 1301 1302 again: 1303 /* step one, find a bunch of delalloc bytes starting at start */ 1304 delalloc_start = *start; 1305 delalloc_end = 0; 1306 found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, 1307 max_bytes, &cached_state); 1308 if (!found || delalloc_end <= *start) { 1309 *start = delalloc_start; 1310 *end = delalloc_end; 1311 free_extent_state(cached_state); 1312 return found; 1313 } 1314 1315 /* 1316 * start comes from the offset of locked_page. We have to lock 1317 * pages in order, so we can't process delalloc bytes before 1318 * locked_page 1319 */ 1320 if (delalloc_start < *start) 1321 delalloc_start = *start; 1322 1323 /* 1324 * make sure to limit the number of pages we try to lock down 1325 * if we're looping. 1326 */ 1327 if (delalloc_end + 1 - delalloc_start > max_bytes && loops) 1328 delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1; 1329 1330 /* step two, lock all the pages after the page that has start */ 1331 ret = lock_delalloc_pages(inode, locked_page, 1332 delalloc_start, delalloc_end); 1333 if (ret == -EAGAIN) { 1334 /* some of the pages are gone, lets avoid looping by 1335 * shortening the size of the delalloc range we're searching 1336 */ 1337 free_extent_state(cached_state); 1338 if (!loops) { 1339 unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); 1340 max_bytes = PAGE_CACHE_SIZE - offset; 1341 loops = 1; 1342 goto again; 1343 } else { 1344 found = 0; 1345 goto out_failed; 1346 } 1347 } 1348 BUG_ON(ret); 1349 1350 /* step three, lock the state bits for the whole range */ 1351 lock_extent_bits(tree, delalloc_start, delalloc_end, 1352 0, &cached_state, GFP_NOFS); 1353 1354 /* then test to make sure it is all still delalloc */ 1355 ret = test_range_bit(tree, delalloc_start, delalloc_end, 1356 EXTENT_DELALLOC, 1, cached_state); 1357 if (!ret) { 1358 unlock_extent_cached(tree, delalloc_start, delalloc_end, 1359 &cached_state, GFP_NOFS); 1360 __unlock_for_delalloc(inode, locked_page, 1361 delalloc_start, delalloc_end); 1362 cond_resched(); 1363 goto again; 1364 } 1365 free_extent_state(cached_state); 1366 *start = delalloc_start; 1367 *end = delalloc_end; 1368 out_failed: 1369 return found; 1370 } 1371 1372 int extent_clear_unlock_delalloc(struct inode *inode, 1373 struct extent_io_tree *tree, 1374 u64 start, u64 end, struct page *locked_page, 1375 unsigned long op) 1376 { 1377 int ret; 1378 struct page *pages[16]; 1379 unsigned long index = start >> PAGE_CACHE_SHIFT; 1380 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1381 unsigned long nr_pages = end_index - index + 1; 1382 int i; 1383 int clear_bits = 0; 1384 1385 if (op & EXTENT_CLEAR_UNLOCK) 1386 clear_bits |= EXTENT_LOCKED; 1387 if (op & EXTENT_CLEAR_DIRTY) 1388 clear_bits |= EXTENT_DIRTY; 1389 1390 if (op & EXTENT_CLEAR_DELALLOC) 1391 clear_bits |= EXTENT_DELALLOC; 1392 1393 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); 1394 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | 1395 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | 1396 EXTENT_SET_PRIVATE2))) 1397 return 0; 1398 1399 while (nr_pages > 
0) { 1400 ret = find_get_pages_contig(inode->i_mapping, index, 1401 min_t(unsigned long, 1402 nr_pages, ARRAY_SIZE(pages)), pages); 1403 for (i = 0; i < ret; i++) { 1404 1405 if (op & EXTENT_SET_PRIVATE2) 1406 SetPagePrivate2(pages[i]); 1407 1408 if (pages[i] == locked_page) { 1409 page_cache_release(pages[i]); 1410 continue; 1411 } 1412 if (op & EXTENT_CLEAR_DIRTY) 1413 clear_page_dirty_for_io(pages[i]); 1414 if (op & EXTENT_SET_WRITEBACK) 1415 set_page_writeback(pages[i]); 1416 if (op & EXTENT_END_WRITEBACK) 1417 end_page_writeback(pages[i]); 1418 if (op & EXTENT_CLEAR_UNLOCK_PAGE) 1419 unlock_page(pages[i]); 1420 page_cache_release(pages[i]); 1421 } 1422 nr_pages -= ret; 1423 index += ret; 1424 cond_resched(); 1425 } 1426 return 0; 1427 } 1428 1429 /* 1430 * count the number of bytes in the tree that have a given bit(s) 1431 * set. This can be fairly slow, except for EXTENT_DIRTY which is 1432 * cached. The total number found is returned. 1433 */ 1434 u64 count_range_bits(struct extent_io_tree *tree, 1435 u64 *start, u64 search_end, u64 max_bytes, 1436 unsigned long bits) 1437 { 1438 struct rb_node *node; 1439 struct extent_state *state; 1440 u64 cur_start = *start; 1441 u64 total_bytes = 0; 1442 int found = 0; 1443 1444 if (search_end <= cur_start) { 1445 WARN_ON(1); 1446 return 0; 1447 } 1448 1449 spin_lock(&tree->lock); 1450 if (cur_start == 0 && bits == EXTENT_DIRTY) { 1451 total_bytes = tree->dirty_bytes; 1452 goto out; 1453 } 1454 /* 1455 * this search will find all the extents that end after 1456 * our range starts. 1457 */ 1458 node = tree_search(tree, cur_start); 1459 if (!node) 1460 goto out; 1461 1462 while (1) { 1463 state = rb_entry(node, struct extent_state, rb_node); 1464 if (state->start > search_end) 1465 break; 1466 if (state->end >= cur_start && (state->state & bits)) { 1467 total_bytes += min(search_end, state->end) + 1 - 1468 max(cur_start, state->start); 1469 if (total_bytes >= max_bytes) 1470 break; 1471 if (!found) { 1472 *start = state->start; 1473 found = 1; 1474 } 1475 } 1476 node = rb_next(node); 1477 if (!node) 1478 break; 1479 } 1480 out: 1481 spin_unlock(&tree->lock); 1482 return total_bytes; 1483 } 1484 1485 /* 1486 * set the private field for a given byte offset in the tree. If there isn't 1487 * an extent_state there already, this does nothing. 1488 */ 1489 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) 1490 { 1491 struct rb_node *node; 1492 struct extent_state *state; 1493 int ret = 0; 1494 1495 spin_lock(&tree->lock); 1496 /* 1497 * this search will find all the extents that end after 1498 * our range starts. 1499 */ 1500 node = tree_search(tree, start); 1501 if (!node) { 1502 ret = -ENOENT; 1503 goto out; 1504 } 1505 state = rb_entry(node, struct extent_state, rb_node); 1506 if (state->start != start) { 1507 ret = -ENOENT; 1508 goto out; 1509 } 1510 state->private = private; 1511 out: 1512 spin_unlock(&tree->lock); 1513 return ret; 1514 } 1515 1516 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) 1517 { 1518 struct rb_node *node; 1519 struct extent_state *state; 1520 int ret = 0; 1521 1522 spin_lock(&tree->lock); 1523 /* 1524 * this search will find all the extents that end after 1525 * our range starts. 
1526 */ 1527 node = tree_search(tree, start); 1528 if (!node) { 1529 ret = -ENOENT; 1530 goto out; 1531 } 1532 state = rb_entry(node, struct extent_state, rb_node); 1533 if (state->start != start) { 1534 ret = -ENOENT; 1535 goto out; 1536 } 1537 *private = state->private; 1538 out: 1539 spin_unlock(&tree->lock); 1540 return ret; 1541 } 1542 1543 /* 1544 * searches a range in the state tree for a given mask. 1545 * If 'filled' == 1, this returns 1 only if every extent in the tree 1546 * has the bits set. Otherwise, 1 is returned if any bit in the 1547 * range is found set. 1548 */ 1549 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 1550 int bits, int filled, struct extent_state *cached) 1551 { 1552 struct extent_state *state = NULL; 1553 struct rb_node *node; 1554 int bitset = 0; 1555 1556 spin_lock(&tree->lock); 1557 if (cached && cached->tree && cached->start == start) 1558 node = &cached->rb_node; 1559 else 1560 node = tree_search(tree, start); 1561 while (node && start <= end) { 1562 state = rb_entry(node, struct extent_state, rb_node); 1563 1564 if (filled && state->start > start) { 1565 bitset = 0; 1566 break; 1567 } 1568 1569 if (state->start > end) 1570 break; 1571 1572 if (state->state & bits) { 1573 bitset = 1; 1574 if (!filled) 1575 break; 1576 } else if (filled) { 1577 bitset = 0; 1578 break; 1579 } 1580 1581 if (state->end == (u64)-1) 1582 break; 1583 1584 start = state->end + 1; 1585 if (start > end) 1586 break; 1587 node = rb_next(node); 1588 if (!node) { 1589 if (filled) 1590 bitset = 0; 1591 break; 1592 } 1593 } 1594 spin_unlock(&tree->lock); 1595 return bitset; 1596 } 1597 1598 /* 1599 * helper function to set a given page up to date if all the 1600 * extents in the tree for that page are up to date 1601 */ 1602 static int check_page_uptodate(struct extent_io_tree *tree, 1603 struct page *page) 1604 { 1605 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1606 u64 end = start + PAGE_CACHE_SIZE - 1; 1607 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) 1608 SetPageUptodate(page); 1609 return 0; 1610 } 1611 1612 /* 1613 * helper function to unlock a page if all the extents in the tree 1614 * for that page are unlocked 1615 */ 1616 static int check_page_locked(struct extent_io_tree *tree, 1617 struct page *page) 1618 { 1619 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1620 u64 end = start + PAGE_CACHE_SIZE - 1; 1621 if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) 1622 unlock_page(page); 1623 return 0; 1624 } 1625 1626 /* 1627 * helper function to end page writeback if all the extents 1628 * in the tree for that page are done with writeback 1629 */ 1630 static int check_page_writeback(struct extent_io_tree *tree, 1631 struct page *page) 1632 { 1633 end_page_writeback(page); 1634 return 0; 1635 } 1636 1637 /* lots and lots of room for performance fixes in the end_bio funcs */ 1638 1639 /* 1640 * after a writepage IO is done, we need to: 1641 * clear the uptodate bits on error 1642 * clear the writeback bits in the extent tree for this IO 1643 * end_page_writeback if the page has no more pending IO 1644 * 1645 * Scheduling is not allowed, so the extent state tree is expected 1646 * to have one and only one object corresponding to this IO. 
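 *
 * Note that the completion below walks bio->bi_io_vec backwards,
 * starting from the last bio_vec in the bio.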
1647 */ 1648 static void end_bio_extent_writepage(struct bio *bio, int err) 1649 { 1650 int uptodate = err == 0; 1651 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1652 struct extent_io_tree *tree; 1653 u64 start; 1654 u64 end; 1655 int whole_page; 1656 int ret; 1657 1658 do { 1659 struct page *page = bvec->bv_page; 1660 tree = &BTRFS_I(page->mapping->host)->io_tree; 1661 1662 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 1663 bvec->bv_offset; 1664 end = start + bvec->bv_len - 1; 1665 1666 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) 1667 whole_page = 1; 1668 else 1669 whole_page = 0; 1670 1671 if (--bvec >= bio->bi_io_vec) 1672 prefetchw(&bvec->bv_page->flags); 1673 if (tree->ops && tree->ops->writepage_end_io_hook) { 1674 ret = tree->ops->writepage_end_io_hook(page, start, 1675 end, NULL, uptodate); 1676 if (ret) 1677 uptodate = 0; 1678 } 1679 1680 if (!uptodate && tree->ops && 1681 tree->ops->writepage_io_failed_hook) { 1682 ret = tree->ops->writepage_io_failed_hook(bio, page, 1683 start, end, NULL); 1684 if (ret == 0) { 1685 uptodate = (err == 0); 1686 continue; 1687 } 1688 } 1689 1690 if (!uptodate) { 1691 clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS); 1692 ClearPageUptodate(page); 1693 SetPageError(page); 1694 } 1695 1696 if (whole_page) 1697 end_page_writeback(page); 1698 else 1699 check_page_writeback(tree, page); 1700 } while (bvec >= bio->bi_io_vec); 1701 1702 bio_put(bio); 1703 } 1704 1705 /* 1706 * after a readpage IO is done, we need to: 1707 * clear the uptodate bits on error 1708 * set the uptodate bits if things worked 1709 * set the page up to date if all extents in the tree are uptodate 1710 * clear the lock bit in the extent tree 1711 * unlock the page if there are no other extents locked for it 1712 * 1713 * Scheduling is not allowed, so the extent state tree is expected 1714 * to have one and only one object corresponding to this IO. 
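 *
 * Unlike the writepage completion above, this handler walks the
 * bio_vecs forward, from the first to the last.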
1715 */ 1716 static void end_bio_extent_readpage(struct bio *bio, int err) 1717 { 1718 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1719 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; 1720 struct bio_vec *bvec = bio->bi_io_vec; 1721 struct extent_io_tree *tree; 1722 u64 start; 1723 u64 end; 1724 int whole_page; 1725 int ret; 1726 1727 if (err) 1728 uptodate = 0; 1729 1730 do { 1731 struct page *page = bvec->bv_page; 1732 tree = &BTRFS_I(page->mapping->host)->io_tree; 1733 1734 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 1735 bvec->bv_offset; 1736 end = start + bvec->bv_len - 1; 1737 1738 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) 1739 whole_page = 1; 1740 else 1741 whole_page = 0; 1742 1743 if (++bvec <= bvec_end) 1744 prefetchw(&bvec->bv_page->flags); 1745 1746 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { 1747 ret = tree->ops->readpage_end_io_hook(page, start, end, 1748 NULL); 1749 if (ret) 1750 uptodate = 0; 1751 } 1752 if (!uptodate && tree->ops && 1753 tree->ops->readpage_io_failed_hook) { 1754 ret = tree->ops->readpage_io_failed_hook(bio, page, 1755 start, end, NULL); 1756 if (ret == 0) { 1757 uptodate = 1758 test_bit(BIO_UPTODATE, &bio->bi_flags); 1759 if (err) 1760 uptodate = 0; 1761 continue; 1762 } 1763 } 1764 1765 if (uptodate) { 1766 set_extent_uptodate(tree, start, end, 1767 GFP_ATOMIC); 1768 } 1769 unlock_extent(tree, start, end, GFP_ATOMIC); 1770 1771 if (whole_page) { 1772 if (uptodate) { 1773 SetPageUptodate(page); 1774 } else { 1775 ClearPageUptodate(page); 1776 SetPageError(page); 1777 } 1778 unlock_page(page); 1779 } else { 1780 if (uptodate) { 1781 check_page_uptodate(tree, page); 1782 } else { 1783 ClearPageUptodate(page); 1784 SetPageError(page); 1785 } 1786 check_page_locked(tree, page); 1787 } 1788 } while (bvec <= bvec_end); 1789 1790 bio_put(bio); 1791 } 1792 1793 /* 1794 * IO done from prepare_write is pretty simple, we just unlock 1795 * the structs in the extent tree when done, and set the uptodate bits 1796 * as appropriate. 
1797 */ 1798 static void end_bio_extent_preparewrite(struct bio *bio, int err) 1799 { 1800 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1801 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1802 struct extent_io_tree *tree; 1803 u64 start; 1804 u64 end; 1805 1806 do { 1807 struct page *page = bvec->bv_page; 1808 tree = &BTRFS_I(page->mapping->host)->io_tree; 1809 1810 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 1811 bvec->bv_offset; 1812 end = start + bvec->bv_len - 1; 1813 1814 if (--bvec >= bio->bi_io_vec) 1815 prefetchw(&bvec->bv_page->flags); 1816 1817 if (uptodate) { 1818 set_extent_uptodate(tree, start, end, GFP_ATOMIC); 1819 } else { 1820 ClearPageUptodate(page); 1821 SetPageError(page); 1822 } 1823 1824 unlock_extent(tree, start, end, GFP_ATOMIC); 1825 1826 } while (bvec >= bio->bi_io_vec); 1827 1828 bio_put(bio); 1829 } 1830 1831 struct bio * 1832 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 1833 gfp_t gfp_flags) 1834 { 1835 struct bio *bio; 1836 1837 bio = bio_alloc(gfp_flags, nr_vecs); 1838 1839 if (bio == NULL && (current->flags & PF_MEMALLOC)) { 1840 while (!bio && (nr_vecs /= 2)) 1841 bio = bio_alloc(gfp_flags, nr_vecs); 1842 } 1843 1844 if (bio) { 1845 bio->bi_size = 0; 1846 bio->bi_bdev = bdev; 1847 bio->bi_sector = first_sector; 1848 } 1849 return bio; 1850 } 1851 1852 static int submit_one_bio(int rw, struct bio *bio, int mirror_num, 1853 unsigned long bio_flags) 1854 { 1855 int ret = 0; 1856 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1857 struct page *page = bvec->bv_page; 1858 struct extent_io_tree *tree = bio->bi_private; 1859 u64 start; 1860 1861 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; 1862 1863 bio->bi_private = NULL; 1864 1865 bio_get(bio); 1866 1867 if (tree->ops && tree->ops->submit_bio_hook) 1868 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 1869 mirror_num, bio_flags, start); 1870 else 1871 submit_bio(rw, bio); 1872 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 1873 ret = -EOPNOTSUPP; 1874 bio_put(bio); 1875 return ret; 1876 } 1877 1878 static int submit_extent_page(int rw, struct extent_io_tree *tree, 1879 struct page *page, sector_t sector, 1880 size_t size, unsigned long offset, 1881 struct block_device *bdev, 1882 struct bio **bio_ret, 1883 unsigned long max_pages, 1884 bio_end_io_t end_io_func, 1885 int mirror_num, 1886 unsigned long prev_bio_flags, 1887 unsigned long bio_flags) 1888 { 1889 int ret = 0; 1890 struct bio *bio; 1891 int nr; 1892 int contig = 0; 1893 int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED; 1894 int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; 1895 size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE); 1896 1897 if (bio_ret && *bio_ret) { 1898 bio = *bio_ret; 1899 if (old_compressed) 1900 contig = bio->bi_sector == sector; 1901 else 1902 contig = bio->bi_sector + (bio->bi_size >> 9) == 1903 sector; 1904 1905 if (prev_bio_flags != bio_flags || !contig || 1906 (tree->ops && tree->ops->merge_bio_hook && 1907 tree->ops->merge_bio_hook(page, offset, page_size, bio, 1908 bio_flags)) || 1909 bio_add_page(bio, page, page_size, offset) < page_size) { 1910 ret = submit_one_bio(rw, bio, mirror_num, 1911 prev_bio_flags); 1912 bio = NULL; 1913 } else { 1914 return 0; 1915 } 1916 } 1917 if (this_compressed) 1918 nr = BIO_MAX_PAGES; 1919 else 1920 nr = bio_get_nr_vecs(bdev); 1921 1922 bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); 1923 if (!bio) 1924 return -ENOMEM; 1925 1926 bio_add_page(bio, page, 
page_size, offset); 1927 bio->bi_end_io = end_io_func; 1928 bio->bi_private = tree; 1929 1930 if (bio_ret) 1931 *bio_ret = bio; 1932 else 1933 ret = submit_one_bio(rw, bio, mirror_num, bio_flags); 1934 1935 return ret; 1936 } 1937 1938 void set_page_extent_mapped(struct page *page) 1939 { 1940 if (!PagePrivate(page)) { 1941 SetPagePrivate(page); 1942 page_cache_get(page); 1943 set_page_private(page, EXTENT_PAGE_PRIVATE); 1944 } 1945 } 1946 1947 static void set_page_extent_head(struct page *page, unsigned long len) 1948 { 1949 WARN_ON(!PagePrivate(page)); 1950 set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); 1951 } 1952 1953 /* 1954 * basic readpage implementation. Locked extent state structs are inserted 1955 * into the tree that are removed when the IO is done (by the end_io 1956 * handlers) 1957 */ 1958 static int __extent_read_full_page(struct extent_io_tree *tree, 1959 struct page *page, 1960 get_extent_t *get_extent, 1961 struct bio **bio, int mirror_num, 1962 unsigned long *bio_flags) 1963 { 1964 struct inode *inode = page->mapping->host; 1965 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1966 u64 page_end = start + PAGE_CACHE_SIZE - 1; 1967 u64 end; 1968 u64 cur = start; 1969 u64 extent_offset; 1970 u64 last_byte = i_size_read(inode); 1971 u64 block_start; 1972 u64 cur_end; 1973 sector_t sector; 1974 struct extent_map *em; 1975 struct block_device *bdev; 1976 struct btrfs_ordered_extent *ordered; 1977 int ret; 1978 int nr = 0; 1979 size_t page_offset = 0; 1980 size_t iosize; 1981 size_t disk_io_size; 1982 size_t blocksize = inode->i_sb->s_blocksize; 1983 unsigned long this_bio_flag = 0; 1984 1985 set_page_extent_mapped(page); 1986 1987 end = page_end; 1988 while (1) { 1989 lock_extent(tree, start, end, GFP_NOFS); 1990 ordered = btrfs_lookup_ordered_extent(inode, start); 1991 if (!ordered) 1992 break; 1993 unlock_extent(tree, start, end, GFP_NOFS); 1994 btrfs_start_ordered_extent(inode, ordered, 1); 1995 btrfs_put_ordered_extent(ordered); 1996 } 1997 1998 if (page->index == last_byte >> PAGE_CACHE_SHIFT) { 1999 char *userpage; 2000 size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1); 2001 2002 if (zero_offset) { 2003 iosize = PAGE_CACHE_SIZE - zero_offset; 2004 userpage = kmap_atomic(page, KM_USER0); 2005 memset(userpage + zero_offset, 0, iosize); 2006 flush_dcache_page(page); 2007 kunmap_atomic(userpage, KM_USER0); 2008 } 2009 } 2010 while (cur <= end) { 2011 if (cur >= last_byte) { 2012 char *userpage; 2013 iosize = PAGE_CACHE_SIZE - page_offset; 2014 userpage = kmap_atomic(page, KM_USER0); 2015 memset(userpage + page_offset, 0, iosize); 2016 flush_dcache_page(page); 2017 kunmap_atomic(userpage, KM_USER0); 2018 set_extent_uptodate(tree, cur, cur + iosize - 1, 2019 GFP_NOFS); 2020 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2021 break; 2022 } 2023 em = get_extent(inode, page, page_offset, cur, 2024 end - cur + 1, 0); 2025 if (IS_ERR(em) || !em) { 2026 SetPageError(page); 2027 unlock_extent(tree, cur, end, GFP_NOFS); 2028 break; 2029 } 2030 extent_offset = cur - em->start; 2031 BUG_ON(extent_map_end(em) <= cur); 2032 BUG_ON(end < cur); 2033 2034 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 2035 this_bio_flag = EXTENT_BIO_COMPRESSED; 2036 extent_set_compress_type(&this_bio_flag, 2037 em->compress_type); 2038 } 2039 2040 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2041 cur_end = min(extent_map_end(em) - 1, end); 2042 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); 2043 if (this_bio_flag & EXTENT_BIO_COMPRESSED) { 2044 
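			/*
			 * compressed extents are read as a whole unit from
			 * disk, so the IO size comes from em->block_len
			 * rather than from this page's byte range
			 */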
disk_io_size = em->block_len; 2045 sector = em->block_start >> 9; 2046 } else { 2047 sector = (em->block_start + extent_offset) >> 9; 2048 disk_io_size = iosize; 2049 } 2050 bdev = em->bdev; 2051 block_start = em->block_start; 2052 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 2053 block_start = EXTENT_MAP_HOLE; 2054 free_extent_map(em); 2055 em = NULL; 2056 2057 /* we've found a hole, just zero and go on */ 2058 if (block_start == EXTENT_MAP_HOLE) { 2059 char *userpage; 2060 userpage = kmap_atomic(page, KM_USER0); 2061 memset(userpage + page_offset, 0, iosize); 2062 flush_dcache_page(page); 2063 kunmap_atomic(userpage, KM_USER0); 2064 2065 set_extent_uptodate(tree, cur, cur + iosize - 1, 2066 GFP_NOFS); 2067 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2068 cur = cur + iosize; 2069 page_offset += iosize; 2070 continue; 2071 } 2072 /* the get_extent function already copied into the page */ 2073 if (test_range_bit(tree, cur, cur_end, 2074 EXTENT_UPTODATE, 1, NULL)) { 2075 check_page_uptodate(tree, page); 2076 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2077 cur = cur + iosize; 2078 page_offset += iosize; 2079 continue; 2080 } 2081 /* we have an inline extent but it didn't get marked up 2082 * to date. Error out 2083 */ 2084 if (block_start == EXTENT_MAP_INLINE) { 2085 SetPageError(page); 2086 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2087 cur = cur + iosize; 2088 page_offset += iosize; 2089 continue; 2090 } 2091 2092 ret = 0; 2093 if (tree->ops && tree->ops->readpage_io_hook) { 2094 ret = tree->ops->readpage_io_hook(page, cur, 2095 cur + iosize - 1); 2096 } 2097 if (!ret) { 2098 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; 2099 pnr -= page->index; 2100 ret = submit_extent_page(READ, tree, page, 2101 sector, disk_io_size, page_offset, 2102 bdev, bio, pnr, 2103 end_bio_extent_readpage, mirror_num, 2104 *bio_flags, 2105 this_bio_flag); 2106 nr++; 2107 *bio_flags = this_bio_flag; 2108 } 2109 if (ret) 2110 SetPageError(page); 2111 cur = cur + iosize; 2112 page_offset += iosize; 2113 } 2114 if (!nr) { 2115 if (!PageError(page)) 2116 SetPageUptodate(page); 2117 unlock_page(page); 2118 } 2119 return 0; 2120 } 2121 2122 int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 2123 get_extent_t *get_extent) 2124 { 2125 struct bio *bio = NULL; 2126 unsigned long bio_flags = 0; 2127 int ret; 2128 2129 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, 2130 &bio_flags); 2131 if (bio) 2132 ret = submit_one_bio(READ, bio, 0, bio_flags); 2133 return ret; 2134 } 2135 2136 static noinline void update_nr_written(struct page *page, 2137 struct writeback_control *wbc, 2138 unsigned long nr_written) 2139 { 2140 wbc->nr_to_write -= nr_written; 2141 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && 2142 wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) 2143 page->mapping->writeback_index = page->index + nr_written; 2144 } 2145 2146 /* 2147 * the writepage semantics are similar to regular writepage. extent 2148 * records are inserted to lock ranges in the tree, and as dirty areas 2149 * are found, they are marked writeback. 
Then the lock bits are removed 2150 * and the end_io handler clears the writeback ranges 2151 */ 2152 static int __extent_writepage(struct page *page, struct writeback_control *wbc, 2153 void *data) 2154 { 2155 struct inode *inode = page->mapping->host; 2156 struct extent_page_data *epd = data; 2157 struct extent_io_tree *tree = epd->tree; 2158 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 2159 u64 delalloc_start; 2160 u64 page_end = start + PAGE_CACHE_SIZE - 1; 2161 u64 end; 2162 u64 cur = start; 2163 u64 extent_offset; 2164 u64 last_byte = i_size_read(inode); 2165 u64 block_start; 2166 u64 iosize; 2167 sector_t sector; 2168 struct extent_state *cached_state = NULL; 2169 struct extent_map *em; 2170 struct block_device *bdev; 2171 int ret; 2172 int nr = 0; 2173 size_t pg_offset = 0; 2174 size_t blocksize; 2175 loff_t i_size = i_size_read(inode); 2176 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; 2177 u64 nr_delalloc; 2178 u64 delalloc_end; 2179 int page_started; 2180 int compressed; 2181 int write_flags; 2182 unsigned long nr_written = 0; 2183 2184 if (wbc->sync_mode == WB_SYNC_ALL) 2185 write_flags = WRITE_SYNC_PLUG; 2186 else 2187 write_flags = WRITE; 2188 2189 WARN_ON(!PageLocked(page)); 2190 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2191 if (page->index > end_index || 2192 (page->index == end_index && !pg_offset)) { 2193 page->mapping->a_ops->invalidatepage(page, 0); 2194 unlock_page(page); 2195 return 0; 2196 } 2197 2198 if (page->index == end_index) { 2199 char *userpage; 2200 2201 userpage = kmap_atomic(page, KM_USER0); 2202 memset(userpage + pg_offset, 0, 2203 PAGE_CACHE_SIZE - pg_offset); 2204 kunmap_atomic(userpage, KM_USER0); 2205 flush_dcache_page(page); 2206 } 2207 pg_offset = 0; 2208 2209 set_page_extent_mapped(page); 2210 2211 delalloc_start = start; 2212 delalloc_end = 0; 2213 page_started = 0; 2214 if (!epd->extent_locked) { 2215 u64 delalloc_to_write = 0; 2216 /* 2217 * make sure the wbc mapping index is at least updated 2218 * to this page. 2219 */ 2220 update_nr_written(page, wbc, 0); 2221 2222 while (delalloc_end < page_end) { 2223 nr_delalloc = find_lock_delalloc_range(inode, tree, 2224 page, 2225 &delalloc_start, 2226 &delalloc_end, 2227 128 * 1024 * 1024); 2228 if (nr_delalloc == 0) { 2229 delalloc_start = delalloc_end + 1; 2230 continue; 2231 } 2232 tree->ops->fill_delalloc(inode, page, delalloc_start, 2233 delalloc_end, &page_started, 2234 &nr_written); 2235 /* 2236 * delalloc_end is already one less than the total 2237 * length, so we don't subtract one from 2238 * PAGE_CACHE_SIZE 2239 */ 2240 delalloc_to_write += (delalloc_end - delalloc_start + 2241 PAGE_CACHE_SIZE) >> 2242 PAGE_CACHE_SHIFT; 2243 delalloc_start = delalloc_end + 1; 2244 } 2245 if (wbc->nr_to_write < delalloc_to_write) { 2246 int thresh = 8192; 2247 2248 if (delalloc_to_write < thresh * 2) 2249 thresh = delalloc_to_write; 2250 wbc->nr_to_write = min_t(u64, delalloc_to_write, 2251 thresh); 2252 } 2253 2254 /* did the fill delalloc function already unlock and start 2255 * the IO? 2256 */ 2257 if (page_started) { 2258 ret = 0; 2259 /* 2260 * we've unlocked the page, so we can't update 2261 * the mapping's writeback index, just update 2262 * nr_to_write. 
2263 */ 2264 wbc->nr_to_write -= nr_written; 2265 goto done_unlocked; 2266 } 2267 } 2268 if (tree->ops && tree->ops->writepage_start_hook) { 2269 ret = tree->ops->writepage_start_hook(page, start, 2270 page_end); 2271 if (ret == -EAGAIN) { 2272 redirty_page_for_writepage(wbc, page); 2273 update_nr_written(page, wbc, nr_written); 2274 unlock_page(page); 2275 ret = 0; 2276 goto done_unlocked; 2277 } 2278 } 2279 2280 /* 2281 * we don't want to touch the inode after unlocking the page, 2282 * so we update the mapping writeback index now 2283 */ 2284 update_nr_written(page, wbc, nr_written + 1); 2285 2286 end = page_end; 2287 if (last_byte <= start) { 2288 if (tree->ops && tree->ops->writepage_end_io_hook) 2289 tree->ops->writepage_end_io_hook(page, start, 2290 page_end, NULL, 1); 2291 goto done; 2292 } 2293 2294 blocksize = inode->i_sb->s_blocksize; 2295 2296 while (cur <= end) { 2297 if (cur >= last_byte) { 2298 if (tree->ops && tree->ops->writepage_end_io_hook) 2299 tree->ops->writepage_end_io_hook(page, cur, 2300 page_end, NULL, 1); 2301 break; 2302 } 2303 em = epd->get_extent(inode, page, pg_offset, cur, 2304 end - cur + 1, 1); 2305 if (IS_ERR(em) || !em) { 2306 SetPageError(page); 2307 break; 2308 } 2309 2310 extent_offset = cur - em->start; 2311 BUG_ON(extent_map_end(em) <= cur); 2312 BUG_ON(end < cur); 2313 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2314 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); 2315 sector = (em->block_start + extent_offset) >> 9; 2316 bdev = em->bdev; 2317 block_start = em->block_start; 2318 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 2319 free_extent_map(em); 2320 em = NULL; 2321 2322 /* 2323 * compressed and inline extents are written through other 2324 * paths in the FS 2325 */ 2326 if (compressed || block_start == EXTENT_MAP_HOLE || 2327 block_start == EXTENT_MAP_INLINE) { 2328 /* 2329 * end_io notification does not happen here for 2330 * compressed extents 2331 */ 2332 if (!compressed && tree->ops && 2333 tree->ops->writepage_end_io_hook) 2334 tree->ops->writepage_end_io_hook(page, cur, 2335 cur + iosize - 1, 2336 NULL, 1); 2337 else if (compressed) { 2338 /* we don't want to end_page_writeback on 2339 * a compressed extent. 
this happens 2340 * elsewhere 2341 */ 2342 nr++; 2343 } 2344 2345 cur += iosize; 2346 pg_offset += iosize; 2347 continue; 2348 } 2349 /* leave this out until we have a page_mkwrite call */ 2350 if (0 && !test_range_bit(tree, cur, cur + iosize - 1, 2351 EXTENT_DIRTY, 0, NULL)) { 2352 cur = cur + iosize; 2353 pg_offset += iosize; 2354 continue; 2355 } 2356 2357 if (tree->ops && tree->ops->writepage_io_hook) { 2358 ret = tree->ops->writepage_io_hook(page, cur, 2359 cur + iosize - 1); 2360 } else { 2361 ret = 0; 2362 } 2363 if (ret) { 2364 SetPageError(page); 2365 } else { 2366 unsigned long max_nr = end_index + 1; 2367 2368 set_range_writeback(tree, cur, cur + iosize - 1); 2369 if (!PageWriteback(page)) { 2370 printk(KERN_ERR "btrfs warning page %lu not " 2371 "writeback, cur %llu end %llu\n", 2372 page->index, (unsigned long long)cur, 2373 (unsigned long long)end); 2374 } 2375 2376 ret = submit_extent_page(write_flags, tree, page, 2377 sector, iosize, pg_offset, 2378 bdev, &epd->bio, max_nr, 2379 end_bio_extent_writepage, 2380 0, 0, 0); 2381 if (ret) 2382 SetPageError(page); 2383 } 2384 cur = cur + iosize; 2385 pg_offset += iosize; 2386 nr++; 2387 } 2388 done: 2389 if (nr == 0) { 2390 /* make sure the mapping tag for page dirty gets cleared */ 2391 set_page_writeback(page); 2392 end_page_writeback(page); 2393 } 2394 unlock_page(page); 2395 2396 done_unlocked: 2397 2398 /* drop our reference on any cached states */ 2399 free_extent_state(cached_state); 2400 return 0; 2401 } 2402 2403 /** 2404 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. 2405 * @mapping: address space structure to write 2406 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 2407 * @writepage: function called for each page 2408 * @data: data passed to writepage function 2409 * 2410 * If a page is already under I/O, write_cache_pages() skips it, even 2411 * if it's dirty. This is desirable behaviour for memory-cleaning writeback, 2412 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() 2413 * and msync() need to guarantee that all the data which was dirty at the time 2414 * the call was made get new I/O started against them. If wbc->sync_mode is 2415 * WB_SYNC_ALL then we were called for data integrity and we must wait for 2416 * existing IO to complete. 
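 *
 * This is btrfs' variant of write_cache_pages(): the caller also passes
 * @flush_fn so that any bio being built up can be submitted before we
 * wait on a page that is already under writeback.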
2417 */ 2418 static int extent_write_cache_pages(struct extent_io_tree *tree, 2419 struct address_space *mapping, 2420 struct writeback_control *wbc, 2421 writepage_t writepage, void *data, 2422 void (*flush_fn)(void *)) 2423 { 2424 int ret = 0; 2425 int done = 0; 2426 int nr_to_write_done = 0; 2427 struct pagevec pvec; 2428 int nr_pages; 2429 pgoff_t index; 2430 pgoff_t end; /* Inclusive */ 2431 int scanned = 0; 2432 2433 pagevec_init(&pvec, 0); 2434 if (wbc->range_cyclic) { 2435 index = mapping->writeback_index; /* Start from prev offset */ 2436 end = -1; 2437 } else { 2438 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2439 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2440 scanned = 1; 2441 } 2442 retry: 2443 while (!done && !nr_to_write_done && (index <= end) && 2444 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 2445 PAGECACHE_TAG_DIRTY, min(end - index, 2446 (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 2447 unsigned i; 2448 2449 scanned = 1; 2450 for (i = 0; i < nr_pages; i++) { 2451 struct page *page = pvec.pages[i]; 2452 2453 /* 2454 * At this point we hold neither mapping->tree_lock nor 2455 * lock on the page itself: the page may be truncated or 2456 * invalidated (changing page->mapping to NULL), or even 2457 * swizzled back from swapper_space to tmpfs file 2458 * mapping 2459 */ 2460 if (tree->ops && tree->ops->write_cache_pages_lock_hook) 2461 tree->ops->write_cache_pages_lock_hook(page); 2462 else 2463 lock_page(page); 2464 2465 if (unlikely(page->mapping != mapping)) { 2466 unlock_page(page); 2467 continue; 2468 } 2469 2470 if (!wbc->range_cyclic && page->index > end) { 2471 done = 1; 2472 unlock_page(page); 2473 continue; 2474 } 2475 2476 if (wbc->sync_mode != WB_SYNC_NONE) { 2477 if (PageWriteback(page)) 2478 flush_fn(data); 2479 wait_on_page_writeback(page); 2480 } 2481 2482 if (PageWriteback(page) || 2483 !clear_page_dirty_for_io(page)) { 2484 unlock_page(page); 2485 continue; 2486 } 2487 2488 ret = (*writepage)(page, wbc, data); 2489 2490 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { 2491 unlock_page(page); 2492 ret = 0; 2493 } 2494 if (ret) 2495 done = 1; 2496 2497 /* 2498 * the filesystem may choose to bump up nr_to_write. 
2499 * We have to make sure to honor the new nr_to_write 2500 * at any time 2501 */ 2502 nr_to_write_done = wbc->nr_to_write <= 0; 2503 } 2504 pagevec_release(&pvec); 2505 cond_resched(); 2506 } 2507 if (!scanned && !done) { 2508 /* 2509 * We hit the last page and there is more work to be done: wrap 2510 * back to the start of the file 2511 */ 2512 scanned = 1; 2513 index = 0; 2514 goto retry; 2515 } 2516 return ret; 2517 } 2518 2519 static void flush_epd_write_bio(struct extent_page_data *epd) 2520 { 2521 if (epd->bio) { 2522 if (epd->sync_io) 2523 submit_one_bio(WRITE_SYNC, epd->bio, 0, 0); 2524 else 2525 submit_one_bio(WRITE, epd->bio, 0, 0); 2526 epd->bio = NULL; 2527 } 2528 } 2529 2530 static noinline void flush_write_bio(void *data) 2531 { 2532 struct extent_page_data *epd = data; 2533 flush_epd_write_bio(epd); 2534 } 2535 2536 int extent_write_full_page(struct extent_io_tree *tree, struct page *page, 2537 get_extent_t *get_extent, 2538 struct writeback_control *wbc) 2539 { 2540 int ret; 2541 struct address_space *mapping = page->mapping; 2542 struct extent_page_data epd = { 2543 .bio = NULL, 2544 .tree = tree, 2545 .get_extent = get_extent, 2546 .extent_locked = 0, 2547 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 2548 }; 2549 struct writeback_control wbc_writepages = { 2550 .sync_mode = wbc->sync_mode, 2551 .older_than_this = NULL, 2552 .nr_to_write = 64, 2553 .range_start = page_offset(page) + PAGE_CACHE_SIZE, 2554 .range_end = (loff_t)-1, 2555 }; 2556 2557 ret = __extent_writepage(page, wbc, &epd); 2558 2559 extent_write_cache_pages(tree, mapping, &wbc_writepages, 2560 __extent_writepage, &epd, flush_write_bio); 2561 flush_epd_write_bio(&epd); 2562 return ret; 2563 } 2564 2565 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, 2566 u64 start, u64 end, get_extent_t *get_extent, 2567 int mode) 2568 { 2569 int ret = 0; 2570 struct address_space *mapping = inode->i_mapping; 2571 struct page *page; 2572 unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >> 2573 PAGE_CACHE_SHIFT; 2574 2575 struct extent_page_data epd = { 2576 .bio = NULL, 2577 .tree = tree, 2578 .get_extent = get_extent, 2579 .extent_locked = 1, 2580 .sync_io = mode == WB_SYNC_ALL, 2581 }; 2582 struct writeback_control wbc_writepages = { 2583 .sync_mode = mode, 2584 .older_than_this = NULL, 2585 .nr_to_write = nr_pages * 2, 2586 .range_start = start, 2587 .range_end = end + 1, 2588 }; 2589 2590 while (start <= end) { 2591 page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); 2592 if (clear_page_dirty_for_io(page)) 2593 ret = __extent_writepage(page, &wbc_writepages, &epd); 2594 else { 2595 if (tree->ops && tree->ops->writepage_end_io_hook) 2596 tree->ops->writepage_end_io_hook(page, start, 2597 start + PAGE_CACHE_SIZE - 1, 2598 NULL, 1); 2599 unlock_page(page); 2600 } 2601 page_cache_release(page); 2602 start += PAGE_CACHE_SIZE; 2603 } 2604 2605 flush_epd_write_bio(&epd); 2606 return ret; 2607 } 2608 2609 int extent_writepages(struct extent_io_tree *tree, 2610 struct address_space *mapping, 2611 get_extent_t *get_extent, 2612 struct writeback_control *wbc) 2613 { 2614 int ret = 0; 2615 struct extent_page_data epd = { 2616 .bio = NULL, 2617 .tree = tree, 2618 .get_extent = get_extent, 2619 .extent_locked = 0, 2620 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 2621 }; 2622 2623 ret = extent_write_cache_pages(tree, mapping, wbc, 2624 __extent_writepage, &epd, 2625 flush_write_bio); 2626 flush_epd_write_bio(&epd); 2627 return ret; 2628 } 2629 2630 int extent_readpages(struct extent_io_tree 
*tree, 2631 struct address_space *mapping, 2632 struct list_head *pages, unsigned nr_pages, 2633 get_extent_t get_extent) 2634 { 2635 struct bio *bio = NULL; 2636 unsigned page_idx; 2637 unsigned long bio_flags = 0; 2638 2639 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 2640 struct page *page = list_entry(pages->prev, struct page, lru); 2641 2642 prefetchw(&page->flags); 2643 list_del(&page->lru); 2644 if (!add_to_page_cache_lru(page, mapping, 2645 page->index, GFP_KERNEL)) { 2646 __extent_read_full_page(tree, page, get_extent, 2647 &bio, 0, &bio_flags); 2648 } 2649 page_cache_release(page); 2650 } 2651 BUG_ON(!list_empty(pages)); 2652 if (bio) 2653 submit_one_bio(READ, bio, 0, bio_flags); 2654 return 0; 2655 } 2656 2657 /* 2658 * basic invalidatepage code, this waits on any locked or writeback 2659 * ranges corresponding to the page, and then deletes any extent state 2660 * records from the tree 2661 */ 2662 int extent_invalidatepage(struct extent_io_tree *tree, 2663 struct page *page, unsigned long offset) 2664 { 2665 struct extent_state *cached_state = NULL; 2666 u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); 2667 u64 end = start + PAGE_CACHE_SIZE - 1; 2668 size_t blocksize = page->mapping->host->i_sb->s_blocksize; 2669 2670 start += (offset + blocksize - 1) & ~(blocksize - 1); 2671 if (start > end) 2672 return 0; 2673 2674 lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS); 2675 wait_on_page_writeback(page); 2676 clear_extent_bit(tree, start, end, 2677 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 2678 EXTENT_DO_ACCOUNTING, 2679 1, 1, &cached_state, GFP_NOFS); 2680 return 0; 2681 } 2682 2683 /* 2684 * simple commit_write call, set_range_dirty is used to mark both 2685 * the pages and the extent records as dirty 2686 */ 2687 int extent_commit_write(struct extent_io_tree *tree, 2688 struct inode *inode, struct page *page, 2689 unsigned from, unsigned to) 2690 { 2691 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; 2692 2693 set_page_extent_mapped(page); 2694 set_page_dirty(page); 2695 2696 if (pos > inode->i_size) { 2697 i_size_write(inode, pos); 2698 mark_inode_dirty(inode); 2699 } 2700 return 0; 2701 } 2702 2703 int extent_prepare_write(struct extent_io_tree *tree, 2704 struct inode *inode, struct page *page, 2705 unsigned from, unsigned to, get_extent_t *get_extent) 2706 { 2707 u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT; 2708 u64 page_end = page_start + PAGE_CACHE_SIZE - 1; 2709 u64 block_start; 2710 u64 orig_block_start; 2711 u64 block_end; 2712 u64 cur_end; 2713 struct extent_map *em; 2714 unsigned blocksize = 1 << inode->i_blkbits; 2715 size_t page_offset = 0; 2716 size_t block_off_start; 2717 size_t block_off_end; 2718 int err = 0; 2719 int iocount = 0; 2720 int ret = 0; 2721 int isnew; 2722 2723 set_page_extent_mapped(page); 2724 2725 block_start = (page_start + from) & ~((u64)blocksize - 1); 2726 block_end = (page_start + to - 1) | (blocksize - 1); 2727 orig_block_start = block_start; 2728 2729 lock_extent(tree, page_start, page_end, GFP_NOFS); 2730 while (block_start <= block_end) { 2731 em = get_extent(inode, page, page_offset, block_start, 2732 block_end - block_start + 1, 1); 2733 if (IS_ERR(em) || !em) 2734 goto err; 2735 2736 cur_end = min(block_end, extent_map_end(em) - 1); 2737 block_off_start = block_start & (PAGE_CACHE_SIZE - 1); 2738 block_off_end = block_off_start + blocksize; 2739 isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS); 2740 2741 if (!PageUptodate(page) && isnew && 2742 (block_off_end > to 
|| block_off_start < from)) { 2743 void *kaddr; 2744 2745 kaddr = kmap_atomic(page, KM_USER0); 2746 if (block_off_end > to) 2747 memset(kaddr + to, 0, block_off_end - to); 2748 if (block_off_start < from) 2749 memset(kaddr + block_off_start, 0, 2750 from - block_off_start); 2751 flush_dcache_page(page); 2752 kunmap_atomic(kaddr, KM_USER0); 2753 } 2754 if ((em->block_start != EXTENT_MAP_HOLE && 2755 em->block_start != EXTENT_MAP_INLINE) && 2756 !isnew && !PageUptodate(page) && 2757 (block_off_end > to || block_off_start < from) && 2758 !test_range_bit(tree, block_start, cur_end, 2759 EXTENT_UPTODATE, 1, NULL)) { 2760 u64 sector; 2761 u64 extent_offset = block_start - em->start; 2762 size_t iosize; 2763 sector = (em->block_start + extent_offset) >> 9; 2764 iosize = (cur_end - block_start + blocksize) & 2765 ~((u64)blocksize - 1); 2766 /* 2767 * we've already got the extent locked, but we 2768 * need to split the state such that our end_bio 2769 * handler can clear the lock. 2770 */ 2771 set_extent_bit(tree, block_start, 2772 block_start + iosize - 1, 2773 EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS); 2774 ret = submit_extent_page(READ, tree, page, 2775 sector, iosize, page_offset, em->bdev, 2776 NULL, 1, 2777 end_bio_extent_preparewrite, 0, 2778 0, 0); 2779 if (ret && !err) 2780 err = ret; 2781 iocount++; 2782 block_start = block_start + iosize; 2783 } else { 2784 set_extent_uptodate(tree, block_start, cur_end, 2785 GFP_NOFS); 2786 unlock_extent(tree, block_start, cur_end, GFP_NOFS); 2787 block_start = cur_end + 1; 2788 } 2789 page_offset = block_start & (PAGE_CACHE_SIZE - 1); 2790 free_extent_map(em); 2791 } 2792 if (iocount) { 2793 wait_extent_bit(tree, orig_block_start, 2794 block_end, EXTENT_LOCKED); 2795 } 2796 check_page_uptodate(tree, page); 2797 err: 2798 /* FIXME, zero out newly allocated blocks on error */ 2799 return err; 2800 } 2801 2802 /* 2803 * a helper for releasepage, this tests for areas of the page that 2804 * are locked or under IO and drops the related state bits if it is safe 2805 * to drop the page. 2806 */ 2807 int try_release_extent_state(struct extent_map_tree *map, 2808 struct extent_io_tree *tree, struct page *page, 2809 gfp_t mask) 2810 { 2811 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 2812 u64 end = start + PAGE_CACHE_SIZE - 1; 2813 int ret = 1; 2814 2815 if (test_range_bit(tree, start, end, 2816 EXTENT_IOBITS, 0, NULL)) 2817 ret = 0; 2818 else { 2819 if ((mask & GFP_NOFS) == GFP_NOFS) 2820 mask = GFP_NOFS; 2821 /* 2822 * at this point we can safely clear everything except the 2823 * locked bit and the nodatasum bit 2824 */ 2825 ret = clear_extent_bit(tree, start, end, 2826 ~(EXTENT_LOCKED | EXTENT_NODATASUM), 2827 0, 0, NULL, mask); 2828 2829 /* if clear_extent_bit failed for enomem reasons, 2830 * we can't allow the release to continue. 2831 */ 2832 if (ret < 0) 2833 ret = 0; 2834 else 2835 ret = 1; 2836 } 2837 return ret; 2838 } 2839 2840 /* 2841 * a helper for releasepage. 
As long as there are no locked extents 2842 * in the range corresponding to the page, both state records and extent 2843 * map records are removed 2844 */ 2845 int try_release_extent_mapping(struct extent_map_tree *map, 2846 struct extent_io_tree *tree, struct page *page, 2847 gfp_t mask) 2848 { 2849 struct extent_map *em; 2850 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 2851 u64 end = start + PAGE_CACHE_SIZE - 1; 2852 2853 if ((mask & __GFP_WAIT) && 2854 page->mapping->host->i_size > 16 * 1024 * 1024) { 2855 u64 len; 2856 while (start <= end) { 2857 len = end - start + 1; 2858 write_lock(&map->lock); 2859 em = lookup_extent_mapping(map, start, len); 2860 if (!em || IS_ERR(em)) { 2861 write_unlock(&map->lock); 2862 break; 2863 } 2864 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || 2865 em->start != start) { 2866 write_unlock(&map->lock); 2867 free_extent_map(em); 2868 break; 2869 } 2870 if (!test_range_bit(tree, em->start, 2871 extent_map_end(em) - 1, 2872 EXTENT_LOCKED | EXTENT_WRITEBACK, 2873 0, NULL)) { 2874 remove_extent_mapping(map, em); 2875 /* once for the rb tree */ 2876 free_extent_map(em); 2877 } 2878 start = extent_map_end(em); 2879 write_unlock(&map->lock); 2880 2881 /* once for us */ 2882 free_extent_map(em); 2883 } 2884 } 2885 return try_release_extent_state(map, tree, page, mask); 2886 } 2887 2888 sector_t extent_bmap(struct address_space *mapping, sector_t iblock, 2889 get_extent_t *get_extent) 2890 { 2891 struct inode *inode = mapping->host; 2892 struct extent_state *cached_state = NULL; 2893 u64 start = iblock << inode->i_blkbits; 2894 sector_t sector = 0; 2895 size_t blksize = (1 << inode->i_blkbits); 2896 struct extent_map *em; 2897 2898 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + blksize - 1, 2899 0, &cached_state, GFP_NOFS); 2900 em = get_extent(inode, NULL, 0, start, blksize, 0); 2901 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, 2902 start + blksize - 1, &cached_state, GFP_NOFS); 2903 if (!em || IS_ERR(em)) 2904 return 0; 2905 2906 if (em->block_start > EXTENT_MAP_LAST_BYTE) 2907 goto out; 2908 2909 sector = (em->block_start + start - em->start) >> inode->i_blkbits; 2910 out: 2911 free_extent_map(em); 2912 return sector; 2913 } 2914 2915 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 2916 __u64 start, __u64 len, get_extent_t *get_extent) 2917 { 2918 int ret = 0; 2919 u64 off = start; 2920 u64 max = start + len; 2921 u32 flags = 0; 2922 u32 found_type; 2923 u64 last; 2924 u64 disko = 0; 2925 struct btrfs_key found_key; 2926 struct extent_map *em = NULL; 2927 struct extent_state *cached_state = NULL; 2928 struct btrfs_path *path; 2929 struct btrfs_file_extent_item *item; 2930 int end = 0; 2931 u64 em_start = 0, em_len = 0; 2932 unsigned long emflags; 2933 int hole = 0; 2934 2935 if (len == 0) 2936 return -EINVAL; 2937 2938 path = btrfs_alloc_path(); 2939 if (!path) 2940 return -ENOMEM; 2941 path->leave_spinning = 1; 2942 2943 ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, 2944 path, inode->i_ino, -1, 0); 2945 if (ret < 0) { 2946 btrfs_free_path(path); 2947 return ret; 2948 } 2949 WARN_ON(!ret); 2950 path->slots[0]--; 2951 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2952 struct btrfs_file_extent_item); 2953 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 2954 found_type = btrfs_key_type(&found_key); 2955 2956 /* No extents, just return */ 2957 if (found_key.objectid != inode->i_ino || 2958 found_type != BTRFS_EXTENT_DATA_KEY) { 2959 btrfs_free_path(path); 2960 return 0; 
2961 } 2962 last = found_key.offset; 2963 btrfs_free_path(path); 2964 2965 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, 2966 &cached_state, GFP_NOFS); 2967 em = get_extent(inode, NULL, 0, off, max - off, 0); 2968 if (!em) 2969 goto out; 2970 if (IS_ERR(em)) { 2971 ret = PTR_ERR(em); 2972 goto out; 2973 } 2974 2975 while (!end) { 2976 hole = 0; 2977 off = em->start + em->len; 2978 if (off >= max) 2979 end = 1; 2980 2981 if (em->block_start == EXTENT_MAP_HOLE) { 2982 hole = 1; 2983 goto next; 2984 } 2985 2986 em_start = em->start; 2987 em_len = em->len; 2988 2989 disko = 0; 2990 flags = 0; 2991 2992 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 2993 end = 1; 2994 flags |= FIEMAP_EXTENT_LAST; 2995 } else if (em->block_start == EXTENT_MAP_INLINE) { 2996 flags |= (FIEMAP_EXTENT_DATA_INLINE | 2997 FIEMAP_EXTENT_NOT_ALIGNED); 2998 } else if (em->block_start == EXTENT_MAP_DELALLOC) { 2999 flags |= (FIEMAP_EXTENT_DELALLOC | 3000 FIEMAP_EXTENT_UNKNOWN); 3001 } else { 3002 disko = em->block_start; 3003 } 3004 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 3005 flags |= FIEMAP_EXTENT_ENCODED; 3006 3007 next: 3008 emflags = em->flags; 3009 free_extent_map(em); 3010 em = NULL; 3011 if (!end) { 3012 em = get_extent(inode, NULL, 0, off, max - off, 0); 3013 if (!em) 3014 goto out; 3015 if (IS_ERR(em)) { 3016 ret = PTR_ERR(em); 3017 goto out; 3018 } 3019 emflags = em->flags; 3020 } 3021 3022 if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) { 3023 flags |= FIEMAP_EXTENT_LAST; 3024 end = 1; 3025 } 3026 3027 if (em_start == last) { 3028 flags |= FIEMAP_EXTENT_LAST; 3029 end = 1; 3030 } 3031 3032 if (!hole) { 3033 ret = fiemap_fill_next_extent(fieinfo, em_start, disko, 3034 em_len, flags); 3035 if (ret) 3036 goto out_free; 3037 } 3038 } 3039 out_free: 3040 free_extent_map(em); 3041 out: 3042 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len, 3043 &cached_state, GFP_NOFS); 3044 return ret; 3045 } 3046 3047 static inline struct page *extent_buffer_page(struct extent_buffer *eb, 3048 unsigned long i) 3049 { 3050 struct page *p; 3051 struct address_space *mapping; 3052 3053 if (i == 0) 3054 return eb->first_page; 3055 i += eb->start >> PAGE_CACHE_SHIFT; 3056 mapping = eb->first_page->mapping; 3057 if (!mapping) 3058 return NULL; 3059 3060 /* 3061 * extent_buffer_page is only called after pinning the page 3062 * by increasing the reference count. So we know the page must 3063 * be in the radix tree. 
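	 * (Index 0 is special cased above and served straight from
	 * eb->first_page, so it never goes through this lookup.)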
	 */
	rcu_read_lock();
	p = radix_tree_lookup(&mapping->page_tree, i);
	rcu_read_unlock();

	return p;
}

static inline unsigned long num_extent_pages(u64 start, u64 len)
{
	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
		(start >> PAGE_CACHE_SHIFT);
}

static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
						   u64 start,
						   unsigned long len,
						   gfp_t mask)
{
	struct extent_buffer *eb = NULL;
#if LEAK_DEBUG
	unsigned long flags;
#endif

	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
	if (eb == NULL)
		return NULL;
	eb->start = start;
	eb->len = len;
	spin_lock_init(&eb->lock);
	init_waitqueue_head(&eb->lock_wq);

#if LEAK_DEBUG
	spin_lock_irqsave(&leak_lock, flags);
	list_add(&eb->leak_list, &buffers);
	spin_unlock_irqrestore(&leak_lock, flags);
#endif
	atomic_set(&eb->refs, 1);

	return eb;
}

static void __free_extent_buffer(struct extent_buffer *eb)
{
#if LEAK_DEBUG
	unsigned long flags;
	spin_lock_irqsave(&leak_lock, flags);
	list_del(&eb->leak_list);
	spin_unlock_irqrestore(&leak_lock, flags);
#endif
	kmem_cache_free(extent_buffer_cache, eb);
}

/*
 * Helper for releasing extent buffer page.
 */
static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
					     unsigned long start_idx)
{
	unsigned long index;
	struct page *page;

	if (!eb->first_page)
		return;

	index = num_extent_pages(eb->start, eb->len);
	if (start_idx >= index)
		return;

	do {
		index--;
		page = extent_buffer_page(eb, index);
		if (page)
			page_cache_release(page);
	} while (index != start_idx);
}

/*
 * Helper for releasing the extent buffer.
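 * It unpins the pages referenced by the buffer and then frees the
 * extent_buffer structure itself.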
 */
static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
{
	btrfs_release_extent_buffer_page(eb, 0);
	__free_extent_buffer(eb);
}

struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
					  u64 start, unsigned long len,
					  struct page *page0,
					  gfp_t mask)
{
	unsigned long num_pages = num_extent_pages(start, len);
	unsigned long i;
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	struct extent_buffer *eb;
	struct extent_buffer *exists = NULL;
	struct page *p;
	struct address_space *mapping = tree->mapping;
	int uptodate = 1;
	int ret;

	rcu_read_lock();
	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
	if (eb && atomic_inc_not_zero(&eb->refs)) {
		rcu_read_unlock();
		mark_page_accessed(eb->first_page);
		return eb;
	}
	rcu_read_unlock();

	eb = __alloc_extent_buffer(tree, start, len, mask);
	if (!eb)
		return NULL;

	if (page0) {
		eb->first_page = page0;
		i = 1;
		index++;
		page_cache_get(page0);
		mark_page_accessed(page0);
		set_page_extent_mapped(page0);
		set_page_extent_head(page0, len);
		uptodate = PageUptodate(page0);
	} else {
		i = 0;
	}
	for (; i < num_pages; i++, index++) {
		p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
		if (!p) {
			WARN_ON(1);
			goto free_eb;
		}
		set_page_extent_mapped(p);
		mark_page_accessed(p);
		if (i == 0) {
			eb->first_page = p;
			set_page_extent_head(p, len);
		} else {
			set_page_private(p, EXTENT_PAGE_PRIVATE);
		}
		if (!PageUptodate(p))
			uptodate = 0;

		/*
		 * see below about how we avoid a nasty race with release page
		 * and why we unlock later
		 */
		if (i != 0)
			unlock_page(p);
	}
	if (uptodate)
		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);

	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
	if (ret)
		goto free_eb;

	spin_lock(&tree->buffer_lock);
	ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
	if (ret == -EEXIST) {
		exists = radix_tree_lookup(&tree->buffer,
					   start >> PAGE_CACHE_SHIFT);
		/* add one reference for the caller */
		atomic_inc(&exists->refs);
		spin_unlock(&tree->buffer_lock);
		radix_tree_preload_end();
		goto free_eb;
	}
	/* add one reference for the tree */
	atomic_inc(&eb->refs);
	spin_unlock(&tree->buffer_lock);
	radix_tree_preload_end();

	/*
	 * there is a race where release page may have
	 * tried to find this extent buffer in the radix
	 * but failed. It will tell the VM it is safe to
	 * reclaim the page, and it will clear the page private bit.
3242 * We must make sure to set the page private bit properly 3243 * after the extent buffer is in the radix tree so 3244 * it doesn't get lost 3245 */ 3246 set_page_extent_mapped(eb->first_page); 3247 set_page_extent_head(eb->first_page, eb->len); 3248 if (!page0) 3249 unlock_page(eb->first_page); 3250 return eb; 3251 3252 free_eb: 3253 if (eb->first_page && !page0) 3254 unlock_page(eb->first_page); 3255 3256 if (!atomic_dec_and_test(&eb->refs)) 3257 return exists; 3258 btrfs_release_extent_buffer(eb); 3259 return exists; 3260 } 3261 3262 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, 3263 u64 start, unsigned long len, 3264 gfp_t mask) 3265 { 3266 struct extent_buffer *eb; 3267 3268 rcu_read_lock(); 3269 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); 3270 if (eb && atomic_inc_not_zero(&eb->refs)) { 3271 rcu_read_unlock(); 3272 mark_page_accessed(eb->first_page); 3273 return eb; 3274 } 3275 rcu_read_unlock(); 3276 3277 return NULL; 3278 } 3279 3280 void free_extent_buffer(struct extent_buffer *eb) 3281 { 3282 if (!eb) 3283 return; 3284 3285 if (!atomic_dec_and_test(&eb->refs)) 3286 return; 3287 3288 WARN_ON(1); 3289 } 3290 3291 int clear_extent_buffer_dirty(struct extent_io_tree *tree, 3292 struct extent_buffer *eb) 3293 { 3294 unsigned long i; 3295 unsigned long num_pages; 3296 struct page *page; 3297 3298 num_pages = num_extent_pages(eb->start, eb->len); 3299 3300 for (i = 0; i < num_pages; i++) { 3301 page = extent_buffer_page(eb, i); 3302 if (!PageDirty(page)) 3303 continue; 3304 3305 lock_page(page); 3306 WARN_ON(!PagePrivate(page)); 3307 3308 set_page_extent_mapped(page); 3309 if (i == 0) 3310 set_page_extent_head(page, eb->len); 3311 3312 clear_page_dirty_for_io(page); 3313 spin_lock_irq(&page->mapping->tree_lock); 3314 if (!PageDirty(page)) { 3315 radix_tree_tag_clear(&page->mapping->page_tree, 3316 page_index(page), 3317 PAGECACHE_TAG_DIRTY); 3318 } 3319 spin_unlock_irq(&page->mapping->tree_lock); 3320 unlock_page(page); 3321 } 3322 return 0; 3323 } 3324 3325 int wait_on_extent_buffer_writeback(struct extent_io_tree *tree, 3326 struct extent_buffer *eb) 3327 { 3328 return wait_on_extent_writeback(tree, eb->start, 3329 eb->start + eb->len - 1); 3330 } 3331 3332 int set_extent_buffer_dirty(struct extent_io_tree *tree, 3333 struct extent_buffer *eb) 3334 { 3335 unsigned long i; 3336 unsigned long num_pages; 3337 int was_dirty = 0; 3338 3339 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); 3340 num_pages = num_extent_pages(eb->start, eb->len); 3341 for (i = 0; i < num_pages; i++) 3342 __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); 3343 return was_dirty; 3344 } 3345 3346 int clear_extent_buffer_uptodate(struct extent_io_tree *tree, 3347 struct extent_buffer *eb, 3348 struct extent_state **cached_state) 3349 { 3350 unsigned long i; 3351 struct page *page; 3352 unsigned long num_pages; 3353 3354 num_pages = num_extent_pages(eb->start, eb->len); 3355 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3356 3357 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3358 cached_state, GFP_NOFS); 3359 for (i = 0; i < num_pages; i++) { 3360 page = extent_buffer_page(eb, i); 3361 if (page) 3362 ClearPageUptodate(page); 3363 } 3364 return 0; 3365 } 3366 3367 int set_extent_buffer_uptodate(struct extent_io_tree *tree, 3368 struct extent_buffer *eb) 3369 { 3370 unsigned long i; 3371 struct page *page; 3372 unsigned long num_pages; 3373 3374 num_pages = num_extent_pages(eb->start, eb->len); 3375 3376 set_extent_uptodate(tree, 
eb->start, eb->start + eb->len - 1, 3377 GFP_NOFS); 3378 for (i = 0; i < num_pages; i++) { 3379 page = extent_buffer_page(eb, i); 3380 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || 3381 ((i == num_pages - 1) && 3382 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { 3383 check_page_uptodate(tree, page); 3384 continue; 3385 } 3386 SetPageUptodate(page); 3387 } 3388 return 0; 3389 } 3390 3391 int extent_range_uptodate(struct extent_io_tree *tree, 3392 u64 start, u64 end) 3393 { 3394 struct page *page; 3395 int ret; 3396 int pg_uptodate = 1; 3397 int uptodate; 3398 unsigned long index; 3399 3400 ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL); 3401 if (ret) 3402 return 1; 3403 while (start <= end) { 3404 index = start >> PAGE_CACHE_SHIFT; 3405 page = find_get_page(tree->mapping, index); 3406 uptodate = PageUptodate(page); 3407 page_cache_release(page); 3408 if (!uptodate) { 3409 pg_uptodate = 0; 3410 break; 3411 } 3412 start += PAGE_CACHE_SIZE; 3413 } 3414 return pg_uptodate; 3415 } 3416 3417 int extent_buffer_uptodate(struct extent_io_tree *tree, 3418 struct extent_buffer *eb, 3419 struct extent_state *cached_state) 3420 { 3421 int ret = 0; 3422 unsigned long num_pages; 3423 unsigned long i; 3424 struct page *page; 3425 int pg_uptodate = 1; 3426 3427 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 3428 return 1; 3429 3430 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3431 EXTENT_UPTODATE, 1, cached_state); 3432 if (ret) 3433 return ret; 3434 3435 num_pages = num_extent_pages(eb->start, eb->len); 3436 for (i = 0; i < num_pages; i++) { 3437 page = extent_buffer_page(eb, i); 3438 if (!PageUptodate(page)) { 3439 pg_uptodate = 0; 3440 break; 3441 } 3442 } 3443 return pg_uptodate; 3444 } 3445 3446 int read_extent_buffer_pages(struct extent_io_tree *tree, 3447 struct extent_buffer *eb, 3448 u64 start, int wait, 3449 get_extent_t *get_extent, int mirror_num) 3450 { 3451 unsigned long i; 3452 unsigned long start_i; 3453 struct page *page; 3454 int err; 3455 int ret = 0; 3456 int locked_pages = 0; 3457 int all_uptodate = 1; 3458 int inc_all_pages = 0; 3459 unsigned long num_pages; 3460 struct bio *bio = NULL; 3461 unsigned long bio_flags = 0; 3462 3463 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 3464 return 0; 3465 3466 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3467 EXTENT_UPTODATE, 1, NULL)) { 3468 return 0; 3469 } 3470 3471 if (start) { 3472 WARN_ON(start < eb->start); 3473 start_i = (start >> PAGE_CACHE_SHIFT) - 3474 (eb->start >> PAGE_CACHE_SHIFT); 3475 } else { 3476 start_i = 0; 3477 } 3478 3479 num_pages = num_extent_pages(eb->start, eb->len); 3480 for (i = start_i; i < num_pages; i++) { 3481 page = extent_buffer_page(eb, i); 3482 if (!wait) { 3483 if (!trylock_page(page)) 3484 goto unlock_exit; 3485 } else { 3486 lock_page(page); 3487 } 3488 locked_pages++; 3489 if (!PageUptodate(page)) 3490 all_uptodate = 0; 3491 } 3492 if (all_uptodate) { 3493 if (start_i == 0) 3494 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3495 goto unlock_exit; 3496 } 3497 3498 for (i = start_i; i < num_pages; i++) { 3499 page = extent_buffer_page(eb, i); 3500 3501 WARN_ON(!PagePrivate(page)); 3502 3503 set_page_extent_mapped(page); 3504 if (i == 0) 3505 set_page_extent_head(page, eb->len); 3506 3507 if (inc_all_pages) 3508 page_cache_get(page); 3509 if (!PageUptodate(page)) { 3510 if (start_i == 0) 3511 inc_all_pages = 1; 3512 ClearPageError(page); 3513 err = __extent_read_full_page(tree, page, 3514 get_extent, &bio, 3515 mirror_num, &bio_flags); 
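			/*
			 * Record the failure but keep looping, so the bio
			 * built so far is still submitted below for the
			 * pages that did start IO.
			 */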
3516 if (err) 3517 ret = err; 3518 } else { 3519 unlock_page(page); 3520 } 3521 } 3522 3523 if (bio) 3524 submit_one_bio(READ, bio, mirror_num, bio_flags); 3525 3526 if (ret || !wait) 3527 return ret; 3528 3529 for (i = start_i; i < num_pages; i++) { 3530 page = extent_buffer_page(eb, i); 3531 wait_on_page_locked(page); 3532 if (!PageUptodate(page)) 3533 ret = -EIO; 3534 } 3535 3536 if (!ret) 3537 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3538 return ret; 3539 3540 unlock_exit: 3541 i = start_i; 3542 while (locked_pages > 0) { 3543 page = extent_buffer_page(eb, i); 3544 i++; 3545 unlock_page(page); 3546 locked_pages--; 3547 } 3548 return ret; 3549 } 3550 3551 void read_extent_buffer(struct extent_buffer *eb, void *dstv, 3552 unsigned long start, 3553 unsigned long len) 3554 { 3555 size_t cur; 3556 size_t offset; 3557 struct page *page; 3558 char *kaddr; 3559 char *dst = (char *)dstv; 3560 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3561 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3562 3563 WARN_ON(start > eb->len); 3564 WARN_ON(start + len > eb->start + eb->len); 3565 3566 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 3567 3568 while (len > 0) { 3569 page = extent_buffer_page(eb, i); 3570 3571 cur = min(len, (PAGE_CACHE_SIZE - offset)); 3572 kaddr = kmap_atomic(page, KM_USER1); 3573 memcpy(dst, kaddr + offset, cur); 3574 kunmap_atomic(kaddr, KM_USER1); 3575 3576 dst += cur; 3577 len -= cur; 3578 offset = 0; 3579 i++; 3580 } 3581 } 3582 3583 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, 3584 unsigned long min_len, char **token, char **map, 3585 unsigned long *map_start, 3586 unsigned long *map_len, int km) 3587 { 3588 size_t offset = start & (PAGE_CACHE_SIZE - 1); 3589 char *kaddr; 3590 struct page *p; 3591 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3592 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3593 unsigned long end_i = (start_offset + start + min_len - 1) >> 3594 PAGE_CACHE_SHIFT; 3595 3596 if (i != end_i) 3597 return -EINVAL; 3598 3599 if (i == 0) { 3600 offset = start_offset; 3601 *map_start = 0; 3602 } else { 3603 offset = 0; 3604 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset; 3605 } 3606 3607 if (start + min_len > eb->len) { 3608 printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " 3609 "wanted %lu %lu\n", (unsigned long long)eb->start, 3610 eb->len, start, min_len); 3611 WARN_ON(1); 3612 } 3613 3614 p = extent_buffer_page(eb, i); 3615 kaddr = kmap_atomic(p, km); 3616 *token = kaddr; 3617 *map = kaddr + offset; 3618 *map_len = PAGE_CACHE_SIZE - offset; 3619 return 0; 3620 } 3621 3622 int map_extent_buffer(struct extent_buffer *eb, unsigned long start, 3623 unsigned long min_len, 3624 char **token, char **map, 3625 unsigned long *map_start, 3626 unsigned long *map_len, int km) 3627 { 3628 int err; 3629 int save = 0; 3630 if (eb->map_token) { 3631 unmap_extent_buffer(eb, eb->map_token, km); 3632 eb->map_token = NULL; 3633 save = 1; 3634 } 3635 err = map_private_extent_buffer(eb, start, min_len, token, map, 3636 map_start, map_len, km); 3637 if (!err && save) { 3638 eb->map_token = *token; 3639 eb->kaddr = *map; 3640 eb->map_start = *map_start; 3641 eb->map_len = *map_len; 3642 } 3643 return err; 3644 } 3645 3646 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km) 3647 { 3648 kunmap_atomic(token, km); 3649 } 3650 3651 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, 3652 unsigned long start, 3653 
unsigned long len) 3654 { 3655 size_t cur; 3656 size_t offset; 3657 struct page *page; 3658 char *kaddr; 3659 char *ptr = (char *)ptrv; 3660 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3661 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3662 int ret = 0; 3663 3664 WARN_ON(start > eb->len); 3665 WARN_ON(start + len > eb->start + eb->len); 3666 3667 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 3668 3669 while (len > 0) { 3670 page = extent_buffer_page(eb, i); 3671 3672 cur = min(len, (PAGE_CACHE_SIZE - offset)); 3673 3674 kaddr = kmap_atomic(page, KM_USER0); 3675 ret = memcmp(ptr, kaddr + offset, cur); 3676 kunmap_atomic(kaddr, KM_USER0); 3677 if (ret) 3678 break; 3679 3680 ptr += cur; 3681 len -= cur; 3682 offset = 0; 3683 i++; 3684 } 3685 return ret; 3686 } 3687 3688 void write_extent_buffer(struct extent_buffer *eb, const void *srcv, 3689 unsigned long start, unsigned long len) 3690 { 3691 size_t cur; 3692 size_t offset; 3693 struct page *page; 3694 char *kaddr; 3695 char *src = (char *)srcv; 3696 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3697 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3698 3699 WARN_ON(start > eb->len); 3700 WARN_ON(start + len > eb->start + eb->len); 3701 3702 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 3703 3704 while (len > 0) { 3705 page = extent_buffer_page(eb, i); 3706 WARN_ON(!PageUptodate(page)); 3707 3708 cur = min(len, PAGE_CACHE_SIZE - offset); 3709 kaddr = kmap_atomic(page, KM_USER1); 3710 memcpy(kaddr + offset, src, cur); 3711 kunmap_atomic(kaddr, KM_USER1); 3712 3713 src += cur; 3714 len -= cur; 3715 offset = 0; 3716 i++; 3717 } 3718 } 3719 3720 void memset_extent_buffer(struct extent_buffer *eb, char c, 3721 unsigned long start, unsigned long len) 3722 { 3723 size_t cur; 3724 size_t offset; 3725 struct page *page; 3726 char *kaddr; 3727 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3728 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3729 3730 WARN_ON(start > eb->len); 3731 WARN_ON(start + len > eb->start + eb->len); 3732 3733 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 3734 3735 while (len > 0) { 3736 page = extent_buffer_page(eb, i); 3737 WARN_ON(!PageUptodate(page)); 3738 3739 cur = min(len, PAGE_CACHE_SIZE - offset); 3740 kaddr = kmap_atomic(page, KM_USER0); 3741 memset(kaddr + offset, c, cur); 3742 kunmap_atomic(kaddr, KM_USER0); 3743 3744 len -= cur; 3745 offset = 0; 3746 i++; 3747 } 3748 } 3749 3750 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, 3751 unsigned long dst_offset, unsigned long src_offset, 3752 unsigned long len) 3753 { 3754 u64 dst_len = dst->len; 3755 size_t cur; 3756 size_t offset; 3757 struct page *page; 3758 char *kaddr; 3759 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); 3760 unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; 3761 3762 WARN_ON(src->len != dst_len); 3763 3764 offset = (start_offset + dst_offset) & 3765 ((unsigned long)PAGE_CACHE_SIZE - 1); 3766 3767 while (len > 0) { 3768 page = extent_buffer_page(dst, i); 3769 WARN_ON(!PageUptodate(page)); 3770 3771 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); 3772 3773 kaddr = kmap_atomic(page, KM_USER0); 3774 read_extent_buffer(src, kaddr + offset, src_offset, cur); 3775 kunmap_atomic(kaddr, KM_USER0); 3776 3777 src_offset += cur; 3778 len -= cur; 3779 offset = 0; 3780 i++; 3781 } 3782 } 3783 3784 static void 
move_pages(struct page *dst_page, struct page *src_page, 3785 unsigned long dst_off, unsigned long src_off, 3786 unsigned long len) 3787 { 3788 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); 3789 if (dst_page == src_page) { 3790 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); 3791 } else { 3792 char *src_kaddr = kmap_atomic(src_page, KM_USER1); 3793 char *p = dst_kaddr + dst_off + len; 3794 char *s = src_kaddr + src_off + len; 3795 3796 while (len--) 3797 *--p = *--s; 3798 3799 kunmap_atomic(src_kaddr, KM_USER1); 3800 } 3801 kunmap_atomic(dst_kaddr, KM_USER0); 3802 } 3803 3804 static void copy_pages(struct page *dst_page, struct page *src_page, 3805 unsigned long dst_off, unsigned long src_off, 3806 unsigned long len) 3807 { 3808 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); 3809 char *src_kaddr; 3810 3811 if (dst_page != src_page) 3812 src_kaddr = kmap_atomic(src_page, KM_USER1); 3813 else 3814 src_kaddr = dst_kaddr; 3815 3816 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); 3817 kunmap_atomic(dst_kaddr, KM_USER0); 3818 if (dst_page != src_page) 3819 kunmap_atomic(src_kaddr, KM_USER1); 3820 } 3821 3822 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 3823 unsigned long src_offset, unsigned long len) 3824 { 3825 size_t cur; 3826 size_t dst_off_in_page; 3827 size_t src_off_in_page; 3828 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); 3829 unsigned long dst_i; 3830 unsigned long src_i; 3831 3832 if (src_offset + len > dst->len) { 3833 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " 3834 "len %lu dst len %lu\n", src_offset, len, dst->len); 3835 BUG_ON(1); 3836 } 3837 if (dst_offset + len > dst->len) { 3838 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " 3839 "len %lu dst len %lu\n", dst_offset, len, dst->len); 3840 BUG_ON(1); 3841 } 3842 3843 while (len > 0) { 3844 dst_off_in_page = (start_offset + dst_offset) & 3845 ((unsigned long)PAGE_CACHE_SIZE - 1); 3846 src_off_in_page = (start_offset + src_offset) & 3847 ((unsigned long)PAGE_CACHE_SIZE - 1); 3848 3849 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; 3850 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT; 3851 3852 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - 3853 src_off_in_page)); 3854 cur = min_t(unsigned long, cur, 3855 (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page)); 3856 3857 copy_pages(extent_buffer_page(dst, dst_i), 3858 extent_buffer_page(dst, src_i), 3859 dst_off_in_page, src_off_in_page, cur); 3860 3861 src_offset += cur; 3862 dst_offset += cur; 3863 len -= cur; 3864 } 3865 } 3866 3867 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 3868 unsigned long src_offset, unsigned long len) 3869 { 3870 size_t cur; 3871 size_t dst_off_in_page; 3872 size_t src_off_in_page; 3873 unsigned long dst_end = dst_offset + len - 1; 3874 unsigned long src_end = src_offset + len - 1; 3875 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); 3876 unsigned long dst_i; 3877 unsigned long src_i; 3878 3879 if (src_offset + len > dst->len) { 3880 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " 3881 "len %lu len %lu\n", src_offset, len, dst->len); 3882 BUG_ON(1); 3883 } 3884 if (dst_offset + len > dst->len) { 3885 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " 3886 "len %lu len %lu\n", dst_offset, len, dst->len); 3887 BUG_ON(1); 3888 } 3889 if (dst_offset < src_offset) { 3890 memcpy_extent_buffer(dst, dst_offset, src_offset, len); 3891 return; 3892 } 3893 while (len > 0) { 3894 
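		/*
		 * The destination starts at or after the source here (the
		 * pure copy case returned above), so walk from the end of
		 * the range backwards to avoid clobbering source bytes that
		 * have not been moved yet.
		 */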
		dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
		src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;

		dst_off_in_page = (start_offset + dst_end) &
			((unsigned long)PAGE_CACHE_SIZE - 1);
		src_off_in_page = (start_offset + src_end) &
			((unsigned long)PAGE_CACHE_SIZE - 1);

		cur = min_t(unsigned long, len, src_off_in_page + 1);
		cur = min(cur, dst_off_in_page + 1);
		move_pages(extent_buffer_page(dst, dst_i),
			   extent_buffer_page(dst, src_i),
			   dst_off_in_page - cur + 1,
			   src_off_in_page - cur + 1, cur);

		dst_end -= cur;
		src_end -= cur;
		len -= cur;
	}
}

static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
{
	struct extent_buffer *eb =
			container_of(head, struct extent_buffer, rcu_head);

	btrfs_release_extent_buffer(eb);
}

int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
{
	u64 start = page_offset(page);
	struct extent_buffer *eb;
	int ret = 1;

	spin_lock(&tree->buffer_lock);
	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
	if (!eb) {
		spin_unlock(&tree->buffer_lock);
		return ret;
	}

	if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
		ret = 0;
		goto out;
	}

	/*
	 * Set @eb->refs to 0 if it is currently 1, i.e. only the tree itself
	 * still holds a reference, and release the buffer below.  Otherwise
	 * someone else is still using it, so back off and leave it in the
	 * tree.
	 */
	if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) {
		ret = 0;
		goto out;
	}

	radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
out:
	spin_unlock(&tree->buffer_lock);

	/* at this point we can safely release the extent buffer */
	if (atomic_read(&eb->refs) == 0)
		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
	return ret;
}
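
/*
 * Illustrative sketch only (not part of the original file): a filesystem's
 * ->readpage address_space operation is expected to drive the read path
 * above roughly as below, supplying its own get_extent callback.  The
 * hypothetical example_readpage() mirrors what btrfs_readpage() in inode.c
 * does with btrfs_get_extent():
 *
 *	static int example_readpage(struct file *file, struct page *page)
 *	{
 *		struct extent_io_tree *tree;
 *
 *		tree = &BTRFS_I(page->mapping->host)->io_tree;
 *		return extent_read_full_page(tree, page, btrfs_get_extent);
 *	}
 */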