#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include "extent_io.h"
#include "extent_map.h"
#include "compat.h"
#include "ctree.h"
#include "btrfs_inode.h"

static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;

static LIST_HEAD(buffers);
static LIST_HEAD(states);

#define LEAK_DEBUG 0
#if LEAK_DEBUG
static DEFINE_SPINLOCK(leak_lock);
#endif

#define BUFFER_LRU_MAX 64

struct tree_entry {
	u64 start;
	u64 end;
	struct rb_node rb_node;
};

struct extent_page_data {
	struct bio *bio;
	struct extent_io_tree *tree;
	get_extent_t *get_extent;

	/* tells writepage not to lock the state bits for this range
	 * it still does the unlocking
	 */
	unsigned int extent_locked:1;

	/* tells the submit_bio code to use a WRITE_SYNC */
	unsigned int sync_io:1;
};

int __init extent_io_init(void)
{
	extent_state_cache = kmem_cache_create("extent_state",
			sizeof(struct extent_state), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
	if (!extent_state_cache)
		return -ENOMEM;

	extent_buffer_cache = kmem_cache_create("extent_buffers",
			sizeof(struct extent_buffer), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
	if (!extent_buffer_cache)
		goto free_state_cache;
	return 0;

free_state_cache:
	kmem_cache_destroy(extent_state_cache);
	return -ENOMEM;
}

void extent_io_exit(void)
{
	struct extent_state *state;
	struct extent_buffer *eb;

	while (!list_empty(&states)) {
		state = list_entry(states.next, struct extent_state, leak_list);
		printk(KERN_ERR "btrfs state leak: start %llu end %llu "
		       "state %lu in tree %p refs %d\n",
		       (unsigned long long)state->start,
		       (unsigned long long)state->end,
		       state->state, state->tree, atomic_read(&state->refs));
		list_del(&state->leak_list);
		kmem_cache_free(extent_state_cache, state);

	}

	while (!list_empty(&buffers)) {
		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
		printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
		       "refs %d\n", (unsigned long long)eb->start,
		       eb->len, atomic_read(&eb->refs));
		list_del(&eb->leak_list);
		kmem_cache_free(extent_buffer_cache, eb);
	}
	if (extent_state_cache)
		kmem_cache_destroy(extent_state_cache);
	if (extent_buffer_cache)
		kmem_cache_destroy(extent_buffer_cache);
}

void extent_io_tree_init(struct extent_io_tree *tree,
			 struct address_space *mapping, gfp_t mask)
{
	tree->state = RB_ROOT;
	INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
	tree->ops = NULL;
	tree->dirty_bytes = 0;
	spin_lock_init(&tree->lock);
	spin_lock_init(&tree->buffer_lock);
	tree->mapping = mapping;
}

static struct extent_state *alloc_extent_state(gfp_t mask)
{
	struct extent_state *state;
#if LEAK_DEBUG
	unsigned long flags;
#endif

	state = kmem_cache_alloc(extent_state_cache, mask);
	if (!state)
		return state;
	state->state = 0;
	state->private = 0;
	state->tree = NULL;
#if LEAK_DEBUG
	spin_lock_irqsave(&leak_lock, flags);
	list_add(&state->leak_list, &states);
	spin_unlock_irqrestore(&leak_lock, flags);
#endif
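	/*
	 * The new state starts with a single reference held by the caller;
	 * free_extent_state() drops it and frees the struct once the count
	 * reaches zero.
	 */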
	atomic_set(&state->refs, 1);
	init_waitqueue_head(&state->wq);
	return state;
}

void free_extent_state(struct extent_state *state)
{
	if (!state)
		return;
	if (atomic_dec_and_test(&state->refs)) {
#if LEAK_DEBUG
		unsigned long flags;
#endif
		WARN_ON(state->tree);
#if LEAK_DEBUG
		spin_lock_irqsave(&leak_lock, flags);
		list_del(&state->leak_list);
		spin_unlock_irqrestore(&leak_lock, flags);
#endif
		kmem_cache_free(extent_state_cache, state);
	}
}

static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
				   struct rb_node *node)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct tree_entry *entry;

	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct tree_entry, rb_node);

		if (offset < entry->start)
			p = &(*p)->rb_left;
		else if (offset > entry->end)
			p = &(*p)->rb_right;
		else
			return parent;
	}

	entry = rb_entry(node, struct tree_entry, rb_node);
	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
				      struct rb_node **prev_ret,
				      struct rb_node **next_ret)
{
	struct rb_root *root = &tree->state;
	struct rb_node *n = root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *orig_prev = NULL;
	struct tree_entry *entry;
	struct tree_entry *prev_entry = NULL;

	while (n) {
		entry = rb_entry(n, struct tree_entry, rb_node);
		prev = n;
		prev_entry = entry;

		if (offset < entry->start)
			n = n->rb_left;
		else if (offset > entry->end)
			n = n->rb_right;
		else
			return n;
	}

	if (prev_ret) {
		orig_prev = prev;
		while (prev && offset > prev_entry->end) {
			prev = rb_next(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*prev_ret = prev;
		prev = orig_prev;
	}

	if (next_ret) {
		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		while (prev && offset < prev_entry->start) {
			prev = rb_prev(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*next_ret = prev;
	}
	return NULL;
}

static inline struct rb_node *tree_search(struct extent_io_tree *tree,
					  u64 offset)
{
	struct rb_node *prev = NULL;
	struct rb_node *ret;

	ret = __etree_search(tree, offset, &prev, NULL);
	if (!ret)
		return prev;
	return ret;
}

static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
		     struct extent_state *other)
{
	if (tree->ops && tree->ops->merge_extent_hook)
		tree->ops->merge_extent_hook(tree->mapping->host, new,
					     other);
}

/*
 * utility function to look for merge candidates inside a given range.
 * Any extents with matching state are merged together into a single
 * extent in the tree. Extents with EXTENT_IO in their state field
 * are not merged because the end_io handlers need to be able to do
 * operations on them without sleeping (or doing allocations/splits).
 *
 * This should be called with the tree lock held.
 */
static int merge_state(struct extent_io_tree *tree,
		       struct extent_state *state)
{
	struct extent_state *other;
	struct rb_node *other_node;

	if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
		return 0;

	other_node = rb_prev(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->end == state->start - 1 &&
		    other->state == state->state) {
			merge_cb(tree, state, other);
			state->start = other->start;
			other->tree = NULL;
			rb_erase(&other->rb_node, &tree->state);
			free_extent_state(other);
		}
	}
	other_node = rb_next(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->start == state->end + 1 &&
		    other->state == state->state) {
			merge_cb(tree, state, other);
			other->start = state->start;
			state->tree = NULL;
			rb_erase(&state->rb_node, &tree->state);
			free_extent_state(state);
			state = NULL;
		}
	}

	return 0;
}

static int set_state_cb(struct extent_io_tree *tree,
			struct extent_state *state, int *bits)
{
	if (tree->ops && tree->ops->set_bit_hook) {
		return tree->ops->set_bit_hook(tree->mapping->host,
					       state, bits);
	}

	return 0;
}

static void clear_state_cb(struct extent_io_tree *tree,
			   struct extent_state *state, int *bits)
{
	if (tree->ops && tree->ops->clear_bit_hook)
		tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
}

/*
 * insert an extent_state struct into the tree. 'bits' are set on the
 * struct before it is inserted.
 *
 * This may return -EEXIST if the extent is already there, in which case the
 * state struct is freed.
 *
 * The tree lock is not taken internally. This is a utility function and
 * probably isn't what you want to call (see set/clear_extent_bit).
 */
static int insert_state(struct extent_io_tree *tree,
			struct extent_state *state, u64 start, u64 end,
			int *bits)
{
	struct rb_node *node;
	int bits_to_set = *bits & ~EXTENT_CTLBITS;
	int ret;

	if (end < start) {
		printk(KERN_ERR "btrfs end < start %llu %llu\n",
		       (unsigned long long)end,
		       (unsigned long long)start);
		WARN_ON(1);
	}
	state->start = start;
	state->end = end;
	ret = set_state_cb(tree, state, bits);
	if (ret)
		return ret;

	if (bits_to_set & EXTENT_DIRTY)
		tree->dirty_bytes += end - start + 1;
	state->state |= bits_to_set;
	node = tree_insert(&tree->state, end, &state->rb_node);
	if (node) {
		struct extent_state *found;
		found = rb_entry(node, struct extent_state, rb_node);
		printk(KERN_ERR "btrfs found node %llu %llu on insert of "
		       "%llu %llu\n", (unsigned long long)found->start,
		       (unsigned long long)found->end,
		       (unsigned long long)start, (unsigned long long)end);
		free_extent_state(state);
		return -EEXIST;
	}
	state->tree = tree;
	merge_state(tree, state);
	return 0;
}

static int split_cb(struct extent_io_tree *tree, struct extent_state *orig,
		    u64 split)
{
	if (tree->ops && tree->ops->split_extent_hook)
		return tree->ops->split_extent_hook(tree->mapping->host,
						    orig, split);
	return 0;
}

/*
 * split a given extent state struct in two, inserting the preallocated
 * struct 'prealloc' as the newly created second half. 'split' indicates an
 * offset inside 'orig' where it should be split.
 *
 * Before calling,
 * the tree has 'orig' at [orig->start, orig->end]. After calling, there
 * are two extent state structs in the tree:
 * prealloc: [orig->start, split - 1]
 * orig: [ split, orig->end ]
 *
 * The tree locks are not taken by this function. They need to be held
 * by the caller.
 */
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
		       struct extent_state *prealloc, u64 split)
{
	struct rb_node *node;

	split_cb(tree, orig, split);

	prealloc->start = orig->start;
	prealloc->end = split - 1;
	prealloc->state = orig->state;
	orig->start = split;

	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
	if (node) {
		free_extent_state(prealloc);
		return -EEXIST;
	}
	prealloc->tree = tree;
	return 0;
}

/*
 * utility function to clear some bits in an extent state struct.
 * it will optionally wake up any one waiting on this state (wake == 1), or
 * forcibly remove the state from the tree (delete == 1).
 *
 * If no bits are set on the state struct after clearing things, the
 * struct is freed and removed from the tree
 */
static int clear_state_bit(struct extent_io_tree *tree,
			   struct extent_state *state,
			   int *bits, int wake)
{
	int bits_to_clear = *bits & ~EXTENT_CTLBITS;
	int ret = state->state & bits_to_clear;

	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		WARN_ON(range > tree->dirty_bytes);
		tree->dirty_bytes -= range;
	}
	clear_state_cb(tree, state, bits);
	state->state &= ~bits_to_clear;
	if (wake)
		wake_up(&state->wq);
	if (state->state == 0) {
		if (state->tree) {
			rb_erase(&state->rb_node, &tree->state);
			state->tree = NULL;
			free_extent_state(state);
		} else {
			WARN_ON(1);
		}
	} else {
		merge_state(tree, state);
	}
	return ret;
}

/*
 * clear some bits on a range in the tree. This may require splitting
 * or inserting elements in the tree, so the gfp mask is used to
 * indicate which allocations or sleeping are allowed.
 *
 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 * the given range from the tree regardless of state (ie for truncate).
 *
 * the range [start, end] is inclusive.
 *
 * This takes the tree lock, and returns < 0 on error, > 0 if any of the
 * bits were already set, or zero if none of the bits were already set.
 */
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		     int bits, int wake, int delete,
		     struct extent_state **cached_state,
		     gfp_t mask)
{
	struct extent_state *state;
	struct extent_state *cached;
	struct extent_state *prealloc = NULL;
	struct rb_node *next_node;
	struct rb_node *node;
	u64 last_end;
	int err;
	int set = 0;
	int clear = 0;

	if (delete)
		bits |= ~EXTENT_CTLBITS;
	bits |= EXTENT_FIRST_DELALLOC;

	if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
		clear = 1;
again:
	if (!prealloc && (mask & __GFP_WAIT)) {
		prealloc = alloc_extent_state(mask);
		if (!prealloc)
			return -ENOMEM;
	}

	spin_lock(&tree->lock);
	if (cached_state) {
		cached = *cached_state;

		if (clear) {
			*cached_state = NULL;
			cached_state = NULL;
		}

		if (cached && cached->tree && cached->start == start) {
			if (clear)
				atomic_dec(&cached->refs);
			state = cached;
			goto hit_next;
		}
		if (clear)
			free_extent_state(cached);
	}
	/*
	 * this search will find the extents that end after
	 * our range starts
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	if (state->start > end)
		goto out;
	WARN_ON(state->end < start);
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 *  | state | or
	 *  | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip
	 * bits on second half.
	 *
	 * If the extent we found extends past our range, we
	 * just split and search again. It'll get split again
	 * the next time though.
	 *
	 * If the extent we found is inside our range, we clear
	 * the desired bit on it.
	 */

	if (state->start < start) {
		if (!prealloc)
			prealloc = alloc_extent_state(GFP_ATOMIC);
		err = split_state(tree, state, prealloc, start);
		BUG_ON(err == -EEXIST);
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set |= clear_state_bit(tree, state, &bits, wake);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and clear the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		if (!prealloc)
			prealloc = alloc_extent_state(GFP_ATOMIC);
		err = split_state(tree, state, prealloc, end + 1);
		BUG_ON(err == -EEXIST);
		if (wake)
			wake_up(&state->wq);

		set |= clear_state_bit(tree, prealloc, &bits, wake);

		prealloc = NULL;
		goto out;
	}

	if (state->end < end && prealloc && !need_resched())
		next_node = rb_next(&state->rb_node);
	else
		next_node = NULL;

	set |= clear_state_bit(tree, state, &bits, wake);
	if (last_end == (u64)-1)
		goto out;
	start = last_end + 1;
	if (start <= end && next_node) {
		state = rb_entry(next_node, struct extent_state,
				 rb_node);
		if (state->start == start)
			goto hit_next;
	}
	goto search_again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return set;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (mask & __GFP_WAIT)
		cond_resched();
	goto again;
}

static int wait_on_state(struct extent_io_tree *tree,
			 struct extent_state *state)
		__releases(tree->lock)
		__acquires(tree->lock)
{
	DEFINE_WAIT(wait);
	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
	spin_unlock(&tree->lock);
	schedule();
	spin_lock(&tree->lock);
	finish_wait(&state->wq, &wait);
	return 0;
}

/*
 * waits for one or more bits to clear on a range in the state tree.
 * The range [start, end] is inclusive.
 * The tree lock is taken by this function
 */
int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
{
	struct extent_state *state;
	struct rb_node *node;

	spin_lock(&tree->lock);
again:
	while (1) {
		/*
		 * this search will find all the extents that end after
		 * our range starts
		 */
		node = tree_search(tree, start);
		if (!node)
			break;

		state = rb_entry(node, struct extent_state, rb_node);

		if (state->start > end)
			goto out;

		if (state->state & bits) {
			start = state->start;
			atomic_inc(&state->refs);
			wait_on_state(tree, state);
			free_extent_state(state);
			goto again;
		}
		start = state->end + 1;

		if (start > end)
			break;

		if (need_resched()) {
			spin_unlock(&tree->lock);
			cond_resched();
			spin_lock(&tree->lock);
		}
	}
out:
	spin_unlock(&tree->lock);
	return 0;
}

static int set_state_bits(struct extent_io_tree *tree,
			  struct extent_state *state,
			  int *bits)
{
	int ret;
	int bits_to_set = *bits & ~EXTENT_CTLBITS;

	ret = set_state_cb(tree, state, bits);
	if (ret)
		return ret;
	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		tree->dirty_bytes += range;
	}
	state->state |= bits_to_set;

	return 0;
}

static void cache_state(struct extent_state *state,
			struct extent_state **cached_ptr)
{
	if (cached_ptr && !(*cached_ptr)) {
		if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
			*cached_ptr = state;
			atomic_inc(&state->refs);
		}
	}
}

/*
 * set some bits on a range in the tree. This may require allocations or
 * sleeping, so the gfp mask is used to indicate what is allowed.
 *
 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 * part of the range already has the desired bits set. The start of the
 * existing range is returned in failed_start in this case.
 *
 * [start, end] is inclusive. This takes the tree lock.
 */

int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		   int bits, int exclusive_bits, u64 *failed_start,
		   struct extent_state **cached_state, gfp_t mask)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	int err = 0;
	u64 last_start;
	u64 last_end;

	bits |= EXTENT_FIRST_DELALLOC;
again:
	if (!prealloc && (mask & __GFP_WAIT)) {
		prealloc = alloc_extent_state(mask);
		if (!prealloc)
			return -ENOMEM;
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start == start && state->tree) {
			node = &state->rb_node;
			goto hit_next;
		}
	}
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node) {
		err = insert_state(tree, prealloc, start, end, &bits);
		prealloc = NULL;
		BUG_ON(err == -EEXIST);
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		struct rb_node *next_node;
		if (state->state & exclusive_bits) {
			*failed_start = state->start;
			err = -EEXIST;
			goto out;
		}

		err = set_state_bits(tree, state, &bits);
		if (err)
			goto out;

		cache_state(state, cached_state);
		merge_state(tree, state);
		if (last_end == (u64)-1)
			goto out;

		start = last_end + 1;
		if (start < end && prealloc && !need_resched()) {
			next_node = rb_next(node);
			if (next_node) {
				state = rb_entry(next_node, struct extent_state,
						 rb_node);
				if (state->start == start)
					goto hit_next;
			}
		}
		goto search_again;
	}

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again. It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}
		err = split_state(tree, state, prealloc, start);
		BUG_ON(err == -EEXIST);
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			err = set_state_bits(tree, state, &bits);
			if (err)
				goto out;
			cache_state(state, cached_state);
			merge_state(tree, state);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;
		err = insert_state(tree, prealloc, start, this_end,
				   &bits);
		BUG_ON(err == -EEXIST);
		if (err) {
			prealloc = NULL;
			goto out;
		}
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}
		err = split_state(tree, state, prealloc, end + 1);
		BUG_ON(err == -EEXIST);

		err = set_state_bits(tree, prealloc, &bits);
		if (err) {
			prealloc = NULL;
			goto out;
		}
		cache_state(prealloc, cached_state);
		merge_state(tree, prealloc);
		prealloc = NULL;
		goto out;
	}

	goto search_again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (mask & __GFP_WAIT)
		cond_resched();
	goto again;
}

/* wrappers around set/clear extent bit */
int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
		     gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
			      NULL, mask);
}

int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		    int bits, gfp_t mask)
{
	return set_extent_bit(tree, start, end, bits, 0, NULL,
			      NULL, mask);
}

int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		      int bits, gfp_t mask)
{
	return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
}

int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
			struct extent_state **cached_state, gfp_t mask)
{
	return set_extent_bit(tree, start, end,
			      EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
			      0, NULL, cached_state, mask);
}

int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
		       gfp_t mask)
{
	return clear_extent_bit(tree, start, end,
				EXTENT_DIRTY | EXTENT_DELALLOC |
				EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
}

int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
		   gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
			      NULL, mask);
}

static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
			    gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0,
				NULL, mask);
}

int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
			gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
			      NULL, mask);
}

static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
				 u64 end, struct extent_state **cached_state,
				 gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
				cached_state, mask);
}

int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
{
	return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
}

/*
 * either insert or lock state struct between start and end; use mask to tell
 * us if waiting is desired.
 */
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		     int bits, struct extent_state **cached_state, gfp_t mask)
{
	int err;
	u64 failed_start;
	while (1) {
		err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
				     EXTENT_LOCKED, &failed_start,
				     cached_state, mask);
		if (err == -EEXIST && (mask & __GFP_WAIT)) {
			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
			start = failed_start;
		} else {
			break;
		}
		WARN_ON(start > end);
	}
	return err;
}

int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
{
	return lock_extent_bits(tree, start, end, 0, NULL, mask);
}

int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
		    gfp_t mask)
{
	int err;
	u64 failed_start;

	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
			     &failed_start, NULL, mask);
	if (err == -EEXIST) {
		if (failed_start > start)
			clear_extent_bit(tree, start, failed_start - 1,
					 EXTENT_LOCKED, 1, 0, NULL, mask);
		return 0;
	}
	return 1;
}

int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
			 struct extent_state **cached, gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
				mask);
}

int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
		  gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
				mask);
}

/*
 * helper function to set pages and extents in the tree dirty
 */
int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(tree->mapping, index);
		BUG_ON(!page);
		__set_page_dirty_nobuffers(page);
		page_cache_release(page);
		index++;
	}
	return 0;
}

/*
 * helper function to set both pages and extents in the tree writeback
 */
static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(tree->mapping, index);
		BUG_ON(!page);
		set_page_writeback(page);
		page_cache_release(page);
		index++;
	}
	return 0;
}

/*
 * find the first offset in the io tree with 'bits' set. zero is
 * returned if we find something, and *start_ret and *end_ret are
 * set to reflect the state struct that was found.
 *
 * If nothing was found, 1 is returned, < 0 on error
 */
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
			  u64 *start_ret, u64 *end_ret, int bits)
{
	struct rb_node *node;
	struct extent_state *state;
	int ret = 1;

	spin_lock(&tree->lock);
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->end >= start && (state->state & bits)) {
			*start_ret = state->start;
			*end_ret = state->end;
			ret = 0;
			break;
		}
		node = rb_next(node);
		if (!node)
			break;
	}
out:
	spin_unlock(&tree->lock);
	return ret;
}

/* find the first state struct with 'bits' set after 'start', and
 * return it. tree->lock must be held. NULL will be returned if
 * nothing was found after 'start'
 */
struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
						 u64 start, int bits)
{
	struct rb_node *node;
	struct extent_state *state;

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->end >= start && (state->state & bits))
			return state;

		node = rb_next(node);
		if (!node)
			break;
	}
out:
	return NULL;
}

/*
 * find a contiguous range of bytes in the file marked as delalloc, not
 * more than 'max_bytes'. start and end are used to return the range,
 *
 * 1 is returned if we find something, 0 if nothing was in the tree
 */
static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
					u64 *start, u64 *end, u64 max_bytes,
					struct extent_state **cached_state)
{
	struct rb_node *node;
	struct extent_state *state;
	u64 cur_start = *start;
	u64 found = 0;
	u64 total_bytes = 0;

	spin_lock(&tree->lock);

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, cur_start);
	if (!node) {
		if (!found)
			*end = (u64)-1;
		goto out;
	}

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (found && (state->start != cur_start ||
			      (state->state & EXTENT_BOUNDARY))) {
			goto out;
		}
		if (!(state->state & EXTENT_DELALLOC)) {
			if (!found)
				*end = state->end;
			goto out;
		}
		if (!found) {
			*start = state->start;
			*cached_state = state;
			atomic_inc(&state->refs);
		}
		found++;
		*end = state->end;
		cur_start = state->end + 1;
		node = rb_next(node);
		if (!node)
			break;
		total_bytes += state->end - state->start + 1;
		if (total_bytes >= max_bytes)
			break;
	}
out:
	spin_unlock(&tree->lock);
	return found;
}

static noinline int __unlock_for_delalloc(struct inode *inode,
					  struct page *locked_page,
					  u64 start, u64 end)
{
	int ret;
	struct page *pages[16];
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	unsigned long nr_pages = end_index - index + 1;
	int i;

	if (index == locked_page->index && end_index == index)
		return 0;

	while (nr_pages > 0) {
		ret = find_get_pages_contig(inode->i_mapping, index,
					    min_t(unsigned long, nr_pages,
					    ARRAY_SIZE(pages)), pages);
		for (i = 0; i < ret; i++) {
			if (pages[i] != locked_page)
				unlock_page(pages[i]);
			page_cache_release(pages[i]);
		}
		nr_pages -= ret;
		index += ret;
		cond_resched();
	}
	return 0;
}

static noinline int lock_delalloc_pages(struct inode *inode,
					struct page *locked_page,
					u64 delalloc_start,
					u64 delalloc_end)
{
	unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
	unsigned long start_index = index;
	unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
	unsigned long pages_locked = 0;
	struct page *pages[16];
	unsigned long nrpages;
	int ret;
	int i;

	/* the caller is responsible for locking the start index */
	if (index == locked_page->index && index == end_index)
		return 0;

	/* skip the page at the start index */
	nrpages = end_index - index + 1;
	while (nrpages > 0) {
		ret = find_get_pages_contig(inode->i_mapping, index,
					    min_t(unsigned long,
					    nrpages, ARRAY_SIZE(pages)), pages);
		if (ret == 0) {
			ret = -EAGAIN;
			goto done;
		}
		/* now we have an array of pages, lock them all */
		for (i = 0; i < ret; i++) {
			/*
			 * the caller is taking responsibility for
			 * locked_page
			 */
			if (pages[i] != locked_page) {
				lock_page(pages[i]);
				if (!PageDirty(pages[i]) ||
				    pages[i]->mapping != inode->i_mapping) {
					ret = -EAGAIN;
					unlock_page(pages[i]);
					page_cache_release(pages[i]);
					goto done;
				}
			}
			page_cache_release(pages[i]);
			pages_locked++;
		}
		nrpages -= ret;
		index += ret;
		cond_resched();
	}
	ret = 0;
done:
	if (ret && pages_locked) {
		__unlock_for_delalloc(inode, locked_page,
			      delalloc_start,
			      ((u64)(start_index + pages_locked - 1)) <<
			      PAGE_CACHE_SHIFT);
	}
	return ret;
}

/*
 * find a contiguous range of bytes in the file marked as delalloc, not
 * more than 'max_bytes'. start and end are used to return the range,
 *
 * 1 is returned if we find something, 0 if nothing was in the tree
 */
static noinline u64 find_lock_delalloc_range(struct inode *inode,
					     struct extent_io_tree *tree,
					     struct page *locked_page,
					     u64 *start, u64 *end,
					     u64 max_bytes)
{
	u64 delalloc_start;
	u64 delalloc_end;
	u64 found;
	struct extent_state *cached_state = NULL;
	int ret;
	int loops = 0;

again:
	/* step one, find a bunch of delalloc bytes starting at start */
	delalloc_start = *start;
	delalloc_end = 0;
	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
				    max_bytes, &cached_state);
	if (!found || delalloc_end <= *start) {
		*start = delalloc_start;
		*end = delalloc_end;
		free_extent_state(cached_state);
		return found;
	}

	/*
	 * start comes from the offset of locked_page. We have to lock
	 * pages in order, so we can't process delalloc bytes before
	 * locked_page
	 */
	if (delalloc_start < *start)
		delalloc_start = *start;

	/*
	 * make sure to limit the number of pages we try to lock down
	 * if we're looping.
	 */
	if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
		delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;

	/* step two, lock all the pages after the page that has start */
	ret = lock_delalloc_pages(inode, locked_page,
				  delalloc_start, delalloc_end);
	if (ret == -EAGAIN) {
		/* some of the pages are gone, lets avoid looping by
		 * shortening the size of the delalloc range we're searching
		 */
		free_extent_state(cached_state);
		if (!loops) {
			unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
			max_bytes = PAGE_CACHE_SIZE - offset;
			loops = 1;
			goto again;
		} else {
			found = 0;
			goto out_failed;
		}
	}
	BUG_ON(ret);

	/* step three, lock the state bits for the whole range */
	lock_extent_bits(tree, delalloc_start, delalloc_end,
			 0, &cached_state, GFP_NOFS);

	/* then test to make sure it is all still delalloc */
	ret = test_range_bit(tree, delalloc_start, delalloc_end,
			     EXTENT_DELALLOC, 1, cached_state);
	if (!ret) {
		unlock_extent_cached(tree, delalloc_start, delalloc_end,
				     &cached_state, GFP_NOFS);
		__unlock_for_delalloc(inode, locked_page,
			      delalloc_start, delalloc_end);
		cond_resched();
		goto again;
	}
	free_extent_state(cached_state);
	*start = delalloc_start;
	*end = delalloc_end;
out_failed:
	return found;
}

int extent_clear_unlock_delalloc(struct inode *inode,
				struct extent_io_tree *tree,
				u64 start, u64 end, struct page *locked_page,
				unsigned long op)
{
	int ret;
	struct page *pages[16];
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	unsigned long nr_pages = end_index - index + 1;
	int i;
	int clear_bits = 0;

	if (op & EXTENT_CLEAR_UNLOCK)
		clear_bits |= EXTENT_LOCKED;
	if (op & EXTENT_CLEAR_DIRTY)
		clear_bits |= EXTENT_DIRTY;

	if (op & EXTENT_CLEAR_DELALLOC)
		clear_bits |= EXTENT_DELALLOC;

	clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
	if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
		    EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
		    EXTENT_SET_PRIVATE2)))
		return 0;

	while (nr_pages > 0) {
		ret = find_get_pages_contig(inode->i_mapping, index,
				     min_t(unsigned long,
				     nr_pages, ARRAY_SIZE(pages)), pages);
		for (i = 0; i < ret; i++) {

			if (op & EXTENT_SET_PRIVATE2)
				SetPagePrivate2(pages[i]);

			if (pages[i] == locked_page) {
				page_cache_release(pages[i]);
				continue;
			}
			if (op & EXTENT_CLEAR_DIRTY)
				clear_page_dirty_for_io(pages[i]);
			if (op & EXTENT_SET_WRITEBACK)
				set_page_writeback(pages[i]);
			if (op & EXTENT_END_WRITEBACK)
				end_page_writeback(pages[i]);
			if (op & EXTENT_CLEAR_UNLOCK_PAGE)
				unlock_page(pages[i]);
			page_cache_release(pages[i]);
		}
		nr_pages -= ret;
		index += ret;
		cond_resched();
	}
	return 0;
}

/*
 * count the number of bytes in the tree that have a given bit(s)
 * set. This can be fairly slow, except for EXTENT_DIRTY which is
 * cached. The total number found is returned.
 */
u64 count_range_bits(struct extent_io_tree *tree,
		     u64 *start, u64 search_end, u64 max_bytes,
		     unsigned long bits)
{
	struct rb_node *node;
	struct extent_state *state;
	u64 cur_start = *start;
	u64 total_bytes = 0;
	int found = 0;

	if (search_end <= cur_start) {
		WARN_ON(1);
		return 0;
	}

	spin_lock(&tree->lock);
	if (cur_start == 0 && bits == EXTENT_DIRTY) {
		total_bytes = tree->dirty_bytes;
		goto out;
	}
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, cur_start);
	if (!node)
		goto out;

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->start > search_end)
			break;
		if (state->end >= cur_start && (state->state & bits)) {
			total_bytes += min(search_end, state->end) + 1 -
				       max(cur_start, state->start);
			if (total_bytes >= max_bytes)
				break;
			if (!found) {
				*start = state->start;
				found = 1;
			}
		}
		node = rb_next(node);
		if (!node)
			break;
	}
out:
	spin_unlock(&tree->lock);
	return total_bytes;
}

/*
 * set the private field for a given byte offset in the tree. If there isn't
 * an extent_state there already, this does nothing.
 */
int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
{
	struct rb_node *node;
	struct extent_state *state;
	int ret = 0;

	spin_lock(&tree->lock);
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node) {
		ret = -ENOENT;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
	if (state->start != start) {
		ret = -ENOENT;
		goto out;
	}
	state->private = private;
out:
	spin_unlock(&tree->lock);
	return ret;
}

int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
{
	struct rb_node *node;
	struct extent_state *state;
	int ret = 0;

	spin_lock(&tree->lock);
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node) {
		ret = -ENOENT;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
	if (state->start != start) {
		ret = -ENOENT;
		goto out;
	}
	*private = state->private;
out:
	spin_unlock(&tree->lock);
	return ret;
}

/*
 * searches a range in the state tree for a given mask.
 * If 'filled' == 1, this returns 1 only if every extent in the tree
 * has the bits set. Otherwise, 1 is returned if any bit in the
 * range is found set.
 */
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
		   int bits, int filled, struct extent_state *cached)
{
	struct extent_state *state = NULL;
	struct rb_node *node;
	int bitset = 0;

	spin_lock(&tree->lock);
	if (cached && cached->tree && cached->start == start)
		node = &cached->rb_node;
	else
		node = tree_search(tree, start);
	while (node && start <= end) {
		state = rb_entry(node, struct extent_state, rb_node);

		if (filled && state->start > start) {
			bitset = 0;
			break;
		}

		if (state->start > end)
			break;

		if (state->state & bits) {
			bitset = 1;
			if (!filled)
				break;
		} else if (filled) {
			bitset = 0;
			break;
		}

		if (state->end == (u64)-1)
			break;

		start = state->end + 1;
		if (start > end)
			break;
		node = rb_next(node);
		if (!node) {
			if (filled)
				bitset = 0;
			break;
		}
	}
	spin_unlock(&tree->lock);
	return bitset;
}

/*
 * helper function to set a given page up to date if all the
 * extents in the tree for that page are up to date
 */
static int check_page_uptodate(struct extent_io_tree *tree,
			       struct page *page)
{
	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
	u64 end = start + PAGE_CACHE_SIZE - 1;
	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
		SetPageUptodate(page);
	return 0;
}

/*
 * helper function to unlock a page if all the extents in the tree
 * for that page are unlocked
 */
static int check_page_locked(struct extent_io_tree *tree,
			     struct page *page)
{
	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
	u64 end = start + PAGE_CACHE_SIZE - 1;
	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
		unlock_page(page);
	return 0;
}

/*
 * helper function to end page writeback if all the extents
 * in the tree for that page are done with writeback
 */
static int check_page_writeback(struct extent_io_tree *tree,
				struct page *page)
{
	end_page_writeback(page);
	return 0;
}

/* lots and lots of room for performance fixes in the end_bio funcs */

/*
 * after a writepage IO is done, we need to:
 * clear the uptodate bits on error
 * clear the writeback bits in the extent tree for this IO
 * end_page_writeback if the page has no more pending IO
 *
 * Scheduling is not allowed, so the extent state tree is expected
 * to have one and only one object corresponding to this IO.
 */
static void end_bio_extent_writepage(struct bio *bio, int err)
{
	int uptodate = err == 0;
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct extent_io_tree *tree;
	u64 start;
	u64 end;
	int whole_page;
	int ret;

	do {
		struct page *page = bvec->bv_page;
		tree = &BTRFS_I(page->mapping->host)->io_tree;

		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
			 bvec->bv_offset;
		end = start + bvec->bv_len - 1;

		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
			whole_page = 1;
		else
			whole_page = 0;

		if (--bvec >= bio->bi_io_vec)
			prefetchw(&bvec->bv_page->flags);
		if (tree->ops && tree->ops->writepage_end_io_hook) {
			ret = tree->ops->writepage_end_io_hook(page, start,
						       end, NULL, uptodate);
			if (ret)
				uptodate = 0;
		}

		if (!uptodate && tree->ops &&
		    tree->ops->writepage_io_failed_hook) {
			ret = tree->ops->writepage_io_failed_hook(bio, page,
							 start, end, NULL);
			if (ret == 0) {
				uptodate = (err == 0);
				continue;
			}
		}

		if (!uptodate) {
			clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
			ClearPageUptodate(page);
			SetPageError(page);
		}

		if (whole_page)
			end_page_writeback(page);
		else
			check_page_writeback(tree, page);
	} while (bvec >= bio->bi_io_vec);

	bio_put(bio);
}

/*
 * after a readpage IO is done, we need to:
 * clear the uptodate bits on error
 * set the uptodate bits if things worked
 * set the page up to date if all extents in the tree are uptodate
 * clear the lock bit in the extent tree
 * unlock the page if there are no other extents locked for it
 *
 * Scheduling is not allowed, so the extent state tree is expected
 * to have one and only one object corresponding to this IO.
 */
static void end_bio_extent_readpage(struct bio *bio, int err)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct bio_vec *bvec = bio->bi_io_vec;
	struct extent_io_tree *tree;
	u64 start;
	u64 end;
	int whole_page;
	int ret;

	if (err)
		uptodate = 0;

	do {
		struct page *page = bvec->bv_page;
		tree = &BTRFS_I(page->mapping->host)->io_tree;

		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
			bvec->bv_offset;
		end = start + bvec->bv_len - 1;

		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
			whole_page = 1;
		else
			whole_page = 0;

		if (++bvec <= bvec_end)
			prefetchw(&bvec->bv_page->flags);

		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
			ret = tree->ops->readpage_end_io_hook(page, start, end,
							      NULL);
			if (ret)
				uptodate = 0;
		}
		if (!uptodate && tree->ops &&
		    tree->ops->readpage_io_failed_hook) {
			ret = tree->ops->readpage_io_failed_hook(bio, page,
							 start, end, NULL);
			if (ret == 0) {
				uptodate =
					test_bit(BIO_UPTODATE, &bio->bi_flags);
				if (err)
					uptodate = 0;
				continue;
			}
		}

		if (uptodate) {
			set_extent_uptodate(tree, start, end,
					    GFP_ATOMIC);
		}
		unlock_extent(tree, start, end, GFP_ATOMIC);

		if (whole_page) {
			if (uptodate) {
				SetPageUptodate(page);
			} else {
				ClearPageUptodate(page);
				SetPageError(page);
			}
			unlock_page(page);
		} else {
			if (uptodate) {
				check_page_uptodate(tree, page);
			} else {
				ClearPageUptodate(page);
				SetPageError(page);
			}
			check_page_locked(tree, page);
		}
	} while (bvec <= bvec_end);

	bio_put(bio);
}

/*
 * IO done from prepare_write is pretty simple, we just unlock
 * the structs in the extent tree when done, and set the uptodate bits
 * as appropriate.
 */
static void end_bio_extent_preparewrite(struct bio *bio, int err)
{
	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct extent_io_tree *tree;
	u64 start;
	u64 end;

	do {
		struct page *page = bvec->bv_page;
		tree = &BTRFS_I(page->mapping->host)->io_tree;

		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
			bvec->bv_offset;
		end = start + bvec->bv_len - 1;

		if (--bvec >= bio->bi_io_vec)
			prefetchw(&bvec->bv_page->flags);

		if (uptodate) {
			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
		} else {
			ClearPageUptodate(page);
			SetPageError(page);
		}

		unlock_extent(tree, start, end, GFP_ATOMIC);

	} while (bvec >= bio->bi_io_vec);

	bio_put(bio);
}

struct bio *
btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
		gfp_t gfp_flags)
{
	struct bio *bio;

	bio = bio_alloc(gfp_flags, nr_vecs);

	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
		while (!bio && (nr_vecs /= 2))
			bio = bio_alloc(gfp_flags, nr_vecs);
	}

	if (bio) {
		bio->bi_size = 0;
		bio->bi_bdev = bdev;
		bio->bi_sector = first_sector;
	}
	return bio;
}

static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
			  unsigned long bio_flags)
{
	int ret = 0;
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct page *page = bvec->bv_page;
	struct extent_io_tree *tree = bio->bi_private;
	u64 start;

	start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;

	bio->bi_private = NULL;

	bio_get(bio);

	if (tree->ops && tree->ops->submit_bio_hook)
		tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
					   mirror_num, bio_flags, start);
	else
		submit_bio(rw, bio);
	if (bio_flagged(bio, BIO_EOPNOTSUPP))
		ret = -EOPNOTSUPP;
	bio_put(bio);
	return ret;
}

static int submit_extent_page(int rw, struct extent_io_tree *tree,
			      struct page *page, sector_t sector,
			      size_t size, unsigned long offset,
			      struct block_device *bdev,
			      struct bio **bio_ret,
			      unsigned long max_pages,
			      bio_end_io_t end_io_func,
			      int mirror_num,
			      unsigned long prev_bio_flags,
			      unsigned long bio_flags)
{
	int ret = 0;
	struct bio *bio;
	int nr;
	int contig = 0;
	int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
	int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
	size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);

	if (bio_ret && *bio_ret) {
		bio = *bio_ret;
		if (old_compressed)
			contig = bio->bi_sector == sector;
		else
			contig = bio->bi_sector + (bio->bi_size >> 9) ==
				sector;

		if (prev_bio_flags != bio_flags || !contig ||
		    (tree->ops && tree->ops->merge_bio_hook &&
		     tree->ops->merge_bio_hook(page, offset, page_size, bio,
					       bio_flags)) ||
		    bio_add_page(bio, page, page_size, offset) < page_size) {
			ret = submit_one_bio(rw, bio, mirror_num,
					     prev_bio_flags);
			bio = NULL;
		} else {
			return 0;
		}
	}
	if (this_compressed)
		nr = BIO_MAX_PAGES;
	else
		nr = bio_get_nr_vecs(bdev);

	bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);

	bio_add_page(bio, page, page_size, offset);
	bio->bi_end_io = end_io_func;
	bio->bi_private = tree;

	if (bio_ret)
		*bio_ret = bio;
	else
		ret = submit_one_bio(rw, bio, mirror_num, bio_flags);

	return ret;
}

void set_page_extent_mapped(struct page *page)
{
	if (!PagePrivate(page)) {
		SetPagePrivate(page);
		page_cache_get(page);
		set_page_private(page, EXTENT_PAGE_PRIVATE);
	}
}

static void set_page_extent_head(struct page *page, unsigned long len)
{
	set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
}

/*
 * basic readpage implementation. Locked extent state structs are inserted
 * into the tree that are removed when the IO is done (by the end_io
 * handlers)
 */
static int __extent_read_full_page(struct extent_io_tree *tree,
				   struct page *page,
				   get_extent_t *get_extent,
				   struct bio **bio, int mirror_num,
				   unsigned long *bio_flags)
{
	struct inode *inode = page->mapping->host;
	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
	u64 page_end = start + PAGE_CACHE_SIZE - 1;
	u64 end;
	u64 cur = start;
	u64 extent_offset;
	u64 last_byte = i_size_read(inode);
	u64 block_start;
	u64 cur_end;
	sector_t sector;
	struct extent_map *em;
	struct block_device *bdev;
	struct btrfs_ordered_extent *ordered;
	int ret;
	int nr = 0;
	size_t page_offset = 0;
	size_t iosize;
	size_t disk_io_size;
	size_t blocksize = inode->i_sb->s_blocksize;
	unsigned long this_bio_flag = 0;

	set_page_extent_mapped(page);

	end = page_end;
	while (1) {
		lock_extent(tree, start, end, GFP_NOFS);
		ordered = btrfs_lookup_ordered_extent(inode, start);
		if (!ordered)
			break;
		unlock_extent(tree, start, end, GFP_NOFS);
		btrfs_start_ordered_extent(inode, ordered, 1);
		btrfs_put_ordered_extent(ordered);
	}

	if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
		char *userpage;
		size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);

		if (zero_offset) {
			iosize = PAGE_CACHE_SIZE - zero_offset;
			userpage = kmap_atomic(page, KM_USER0);
			memset(userpage + zero_offset, 0, iosize);
			flush_dcache_page(page);
			kunmap_atomic(userpage, KM_USER0);
		}
	}
	while (cur <= end) {
		if (cur >= last_byte) {
			char *userpage;
			iosize = PAGE_CACHE_SIZE - page_offset;
			userpage = kmap_atomic(page, KM_USER0);
			memset(userpage + page_offset, 0, iosize);
			flush_dcache_page(page);
			kunmap_atomic(userpage, KM_USER0);
			set_extent_uptodate(tree, cur, cur + iosize - 1,
					    GFP_NOFS);
			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
			break;
		}
		em = get_extent(inode, page, page_offset, cur,
				end - cur + 1, 0);
		if (IS_ERR(em) || !em) {
			SetPageError(page);
			unlock_extent(tree, cur, end, GFP_NOFS);
			break;
		}
		extent_offset = cur - em->start;
		BUG_ON(extent_map_end(em) <= cur);
		BUG_ON(end < cur);

		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
			this_bio_flag = EXTENT_BIO_COMPRESSED;

		iosize = min(extent_map_end(em) - cur, end - cur + 1);
		cur_end = min(extent_map_end(em) - 1, end);
		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
		if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
			disk_io_size = em->block_len;
			sector = em->block_start >> 9;
		} else {
			sector = (em->block_start + extent_offset) >> 9;
			disk_io_size = iosize;
		}
		bdev = em->bdev;
		block_start = em->block_start;
		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
			block_start = EXTENT_MAP_HOLE;
		free_extent_map(em);
		em = NULL;

		/* we've found a hole, just zero and go on */
		if (block_start == EXTENT_MAP_HOLE) {
			char *userpage;
			userpage = kmap_atomic(page, KM_USER0);
			memset(userpage + page_offset, 0, iosize);
			flush_dcache_page(page);
			kunmap_atomic(userpage, KM_USER0);

			set_extent_uptodate(tree, cur, cur + iosize - 1,
					    GFP_NOFS);
			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
			cur = cur + iosize;
			page_offset += iosize;
			continue;
		}
		/* the get_extent function already copied into the page */
		if (test_range_bit(tree, cur, cur_end,
				   EXTENT_UPTODATE, 1, NULL)) {
			check_page_uptodate(tree, page);
			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
			cur = cur + iosize;
			page_offset += iosize;
			continue;
		}
		/* we have an inline extent but it didn't get marked up
		 * to date. Error out
		 */
		if (block_start == EXTENT_MAP_INLINE) {
			SetPageError(page);
			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
			cur = cur + iosize;
			page_offset += iosize;
			continue;
		}

		ret = 0;
		if (tree->ops && tree->ops->readpage_io_hook) {
			ret = tree->ops->readpage_io_hook(page, cur,
							  cur + iosize - 1);
		}
		if (!ret) {
			unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
			pnr -= page->index;
			ret = submit_extent_page(READ, tree, page,
					 sector, disk_io_size, page_offset,
					 bdev, bio, pnr,
					 end_bio_extent_readpage, mirror_num,
					 *bio_flags,
					 this_bio_flag);
			nr++;
			*bio_flags = this_bio_flag;
		}
		if (ret)
			SetPageError(page);
		cur = cur + iosize;
		page_offset += iosize;
	}
	if (!nr) {
		if (!PageError(page))
			SetPageUptodate(page);
		unlock_page(page);
	}
	return 0;
}

int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
			  get_extent_t *get_extent)
{
	struct bio *bio = NULL;
	unsigned long bio_flags = 0;
	int ret;

	ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
				      &bio_flags);
	if (bio)
		submit_one_bio(READ, bio, 0, bio_flags);
	return ret;
}

static noinline void update_nr_written(struct page *page,
				      struct writeback_control *wbc,
				      unsigned long nr_written)
{
	wbc->nr_to_write -= nr_written;
	if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
	    wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
		page->mapping->writeback_index = page->index + nr_written;
}

/*
 * the writepage semantics are similar to regular writepage. extent
 * records are inserted to lock ranges in the tree, and as dirty areas
 * are found, they are marked writeback. Then the lock bits are removed
Then the lock bits are removed 2144 * and the end_io handler clears the writeback ranges 2145 */ 2146 static int __extent_writepage(struct page *page, struct writeback_control *wbc, 2147 void *data) 2148 { 2149 struct inode *inode = page->mapping->host; 2150 struct extent_page_data *epd = data; 2151 struct extent_io_tree *tree = epd->tree; 2152 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 2153 u64 delalloc_start; 2154 u64 page_end = start + PAGE_CACHE_SIZE - 1; 2155 u64 end; 2156 u64 cur = start; 2157 u64 extent_offset; 2158 u64 last_byte = i_size_read(inode); 2159 u64 block_start; 2160 u64 iosize; 2161 sector_t sector; 2162 struct extent_state *cached_state = NULL; 2163 struct extent_map *em; 2164 struct block_device *bdev; 2165 int ret; 2166 int nr = 0; 2167 size_t pg_offset = 0; 2168 size_t blocksize; 2169 loff_t i_size = i_size_read(inode); 2170 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; 2171 u64 nr_delalloc; 2172 u64 delalloc_end; 2173 int page_started; 2174 int compressed; 2175 int write_flags; 2176 unsigned long nr_written = 0; 2177 2178 if (wbc->sync_mode == WB_SYNC_ALL) 2179 write_flags = WRITE_SYNC_PLUG; 2180 else 2181 write_flags = WRITE; 2182 2183 WARN_ON(!PageLocked(page)); 2184 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2185 if (page->index > end_index || 2186 (page->index == end_index && !pg_offset)) { 2187 page->mapping->a_ops->invalidatepage(page, 0); 2188 unlock_page(page); 2189 return 0; 2190 } 2191 2192 if (page->index == end_index) { 2193 char *userpage; 2194 2195 userpage = kmap_atomic(page, KM_USER0); 2196 memset(userpage + pg_offset, 0, 2197 PAGE_CACHE_SIZE - pg_offset); 2198 kunmap_atomic(userpage, KM_USER0); 2199 flush_dcache_page(page); 2200 } 2201 pg_offset = 0; 2202 2203 set_page_extent_mapped(page); 2204 2205 delalloc_start = start; 2206 delalloc_end = 0; 2207 page_started = 0; 2208 if (!epd->extent_locked) { 2209 u64 delalloc_to_write = 0; 2210 /* 2211 * make sure the wbc mapping index is at least updated 2212 * to this page. 2213 */ 2214 update_nr_written(page, wbc, 0); 2215 2216 while (delalloc_end < page_end) { 2217 nr_delalloc = find_lock_delalloc_range(inode, tree, 2218 page, 2219 &delalloc_start, 2220 &delalloc_end, 2221 128 * 1024 * 1024); 2222 if (nr_delalloc == 0) { 2223 delalloc_start = delalloc_end + 1; 2224 continue; 2225 } 2226 tree->ops->fill_delalloc(inode, page, delalloc_start, 2227 delalloc_end, &page_started, 2228 &nr_written); 2229 /* 2230 * delalloc_end is already one less than the total 2231 * length, so we don't subtract one from 2232 * PAGE_CACHE_SIZE 2233 */ 2234 delalloc_to_write += (delalloc_end - delalloc_start + 2235 PAGE_CACHE_SIZE) >> 2236 PAGE_CACHE_SHIFT; 2237 delalloc_start = delalloc_end + 1; 2238 } 2239 if (wbc->nr_to_write < delalloc_to_write) { 2240 int thresh = 8192; 2241 2242 if (delalloc_to_write < thresh * 2) 2243 thresh = delalloc_to_write; 2244 wbc->nr_to_write = min_t(u64, delalloc_to_write, 2245 thresh); 2246 } 2247 2248 /* did the fill delalloc function already unlock and start 2249 * the IO? 2250 */ 2251 if (page_started) { 2252 ret = 0; 2253 /* 2254 * we've unlocked the page, so we can't update 2255 * the mapping's writeback index, just update 2256 * nr_to_write. 
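 * update_nr_written() is not used here because it dereferences
 * page->mapping, and we must not touch the mapping once the page
 * has been unlocked.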
2257 */ 2258 wbc->nr_to_write -= nr_written; 2259 goto done_unlocked; 2260 } 2261 } 2262 if (tree->ops && tree->ops->writepage_start_hook) { 2263 ret = tree->ops->writepage_start_hook(page, start, 2264 page_end); 2265 if (ret == -EAGAIN) { 2266 redirty_page_for_writepage(wbc, page); 2267 update_nr_written(page, wbc, nr_written); 2268 unlock_page(page); 2269 ret = 0; 2270 goto done_unlocked; 2271 } 2272 } 2273 2274 /* 2275 * we don't want to touch the inode after unlocking the page, 2276 * so we update the mapping writeback index now 2277 */ 2278 update_nr_written(page, wbc, nr_written + 1); 2279 2280 end = page_end; 2281 if (last_byte <= start) { 2282 if (tree->ops && tree->ops->writepage_end_io_hook) 2283 tree->ops->writepage_end_io_hook(page, start, 2284 page_end, NULL, 1); 2285 goto done; 2286 } 2287 2288 blocksize = inode->i_sb->s_blocksize; 2289 2290 while (cur <= end) { 2291 if (cur >= last_byte) { 2292 if (tree->ops && tree->ops->writepage_end_io_hook) 2293 tree->ops->writepage_end_io_hook(page, cur, 2294 page_end, NULL, 1); 2295 break; 2296 } 2297 em = epd->get_extent(inode, page, pg_offset, cur, 2298 end - cur + 1, 1); 2299 if (IS_ERR(em) || !em) { 2300 SetPageError(page); 2301 break; 2302 } 2303 2304 extent_offset = cur - em->start; 2305 BUG_ON(extent_map_end(em) <= cur); 2306 BUG_ON(end < cur); 2307 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2308 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); 2309 sector = (em->block_start + extent_offset) >> 9; 2310 bdev = em->bdev; 2311 block_start = em->block_start; 2312 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 2313 free_extent_map(em); 2314 em = NULL; 2315 2316 /* 2317 * compressed and inline extents are written through other 2318 * paths in the FS 2319 */ 2320 if (compressed || block_start == EXTENT_MAP_HOLE || 2321 block_start == EXTENT_MAP_INLINE) { 2322 /* 2323 * end_io notification does not happen here for 2324 * compressed extents 2325 */ 2326 if (!compressed && tree->ops && 2327 tree->ops->writepage_end_io_hook) 2328 tree->ops->writepage_end_io_hook(page, cur, 2329 cur + iosize - 1, 2330 NULL, 1); 2331 else if (compressed) { 2332 /* we don't want to end_page_writeback on 2333 * a compressed extent. 
this happens 2334 * elsewhere 2335 */ 2336 nr++; 2337 } 2338 2339 cur += iosize; 2340 pg_offset += iosize; 2341 continue; 2342 } 2343 /* leave this out until we have a page_mkwrite call */ 2344 if (0 && !test_range_bit(tree, cur, cur + iosize - 1, 2345 EXTENT_DIRTY, 0, NULL)) { 2346 cur = cur + iosize; 2347 pg_offset += iosize; 2348 continue; 2349 } 2350 2351 if (tree->ops && tree->ops->writepage_io_hook) { 2352 ret = tree->ops->writepage_io_hook(page, cur, 2353 cur + iosize - 1); 2354 } else { 2355 ret = 0; 2356 } 2357 if (ret) { 2358 SetPageError(page); 2359 } else { 2360 unsigned long max_nr = end_index + 1; 2361 2362 set_range_writeback(tree, cur, cur + iosize - 1); 2363 if (!PageWriteback(page)) { 2364 printk(KERN_ERR "btrfs warning page %lu not " 2365 "writeback, cur %llu end %llu\n", 2366 page->index, (unsigned long long)cur, 2367 (unsigned long long)end); 2368 } 2369 2370 ret = submit_extent_page(write_flags, tree, page, 2371 sector, iosize, pg_offset, 2372 bdev, &epd->bio, max_nr, 2373 end_bio_extent_writepage, 2374 0, 0, 0); 2375 if (ret) 2376 SetPageError(page); 2377 } 2378 cur = cur + iosize; 2379 pg_offset += iosize; 2380 nr++; 2381 } 2382 done: 2383 if (nr == 0) { 2384 /* make sure the mapping tag for page dirty gets cleared */ 2385 set_page_writeback(page); 2386 end_page_writeback(page); 2387 } 2388 unlock_page(page); 2389 2390 done_unlocked: 2391 2392 /* drop our reference on any cached states */ 2393 free_extent_state(cached_state); 2394 return 0; 2395 } 2396 2397 /** 2398 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. 2399 * @mapping: address space structure to write 2400 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 2401 * @writepage: function called for each page 2402 * @data: data passed to writepage function 2403 * 2404 * If a page is already under I/O, write_cache_pages() skips it, even 2405 * if it's dirty. This is desirable behaviour for memory-cleaning writeback, 2406 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() 2407 * and msync() need to guarantee that all the data which was dirty at the time 2408 * the call was made get new I/O started against them. If wbc->sync_mode is 2409 * WB_SYNC_ALL then we were called for data integrity and we must wait for 2410 * existing IO to complete. 
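 *
 * flush_fn() is called to submit any bio that has already been built up
 * before we wait on a page that is currently under writeback, so the IO
 * we are about to wait for can actually make progress.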
2411 */ 2412 static int extent_write_cache_pages(struct extent_io_tree *tree, 2413 struct address_space *mapping, 2414 struct writeback_control *wbc, 2415 writepage_t writepage, void *data, 2416 void (*flush_fn)(void *)) 2417 { 2418 int ret = 0; 2419 int done = 0; 2420 int nr_to_write_done = 0; 2421 struct pagevec pvec; 2422 int nr_pages; 2423 pgoff_t index; 2424 pgoff_t end; /* Inclusive */ 2425 int scanned = 0; 2426 2427 pagevec_init(&pvec, 0); 2428 if (wbc->range_cyclic) { 2429 index = mapping->writeback_index; /* Start from prev offset */ 2430 end = -1; 2431 } else { 2432 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2433 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2434 scanned = 1; 2435 } 2436 retry: 2437 while (!done && !nr_to_write_done && (index <= end) && 2438 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 2439 PAGECACHE_TAG_DIRTY, min(end - index, 2440 (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 2441 unsigned i; 2442 2443 scanned = 1; 2444 for (i = 0; i < nr_pages; i++) { 2445 struct page *page = pvec.pages[i]; 2446 2447 /* 2448 * At this point we hold neither mapping->tree_lock nor 2449 * lock on the page itself: the page may be truncated or 2450 * invalidated (changing page->mapping to NULL), or even 2451 * swizzled back from swapper_space to tmpfs file 2452 * mapping 2453 */ 2454 if (tree->ops && tree->ops->write_cache_pages_lock_hook) 2455 tree->ops->write_cache_pages_lock_hook(page); 2456 else 2457 lock_page(page); 2458 2459 if (unlikely(page->mapping != mapping)) { 2460 unlock_page(page); 2461 continue; 2462 } 2463 2464 if (!wbc->range_cyclic && page->index > end) { 2465 done = 1; 2466 unlock_page(page); 2467 continue; 2468 } 2469 2470 if (wbc->sync_mode != WB_SYNC_NONE) { 2471 if (PageWriteback(page)) 2472 flush_fn(data); 2473 wait_on_page_writeback(page); 2474 } 2475 2476 if (PageWriteback(page) || 2477 !clear_page_dirty_for_io(page)) { 2478 unlock_page(page); 2479 continue; 2480 } 2481 2482 ret = (*writepage)(page, wbc, data); 2483 2484 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { 2485 unlock_page(page); 2486 ret = 0; 2487 } 2488 if (ret) 2489 done = 1; 2490 2491 /* 2492 * the filesystem may choose to bump up nr_to_write. 
2493 * We have to make sure to honor the new nr_to_write 2494 * at any time 2495 */ 2496 nr_to_write_done = wbc->nr_to_write <= 0; 2497 } 2498 pagevec_release(&pvec); 2499 cond_resched(); 2500 } 2501 if (!scanned && !done) { 2502 /* 2503 * We hit the last page and there is more work to be done: wrap 2504 * back to the start of the file 2505 */ 2506 scanned = 1; 2507 index = 0; 2508 goto retry; 2509 } 2510 return ret; 2511 } 2512 2513 static void flush_epd_write_bio(struct extent_page_data *epd) 2514 { 2515 if (epd->bio) { 2516 if (epd->sync_io) 2517 submit_one_bio(WRITE_SYNC, epd->bio, 0, 0); 2518 else 2519 submit_one_bio(WRITE, epd->bio, 0, 0); 2520 epd->bio = NULL; 2521 } 2522 } 2523 2524 static noinline void flush_write_bio(void *data) 2525 { 2526 struct extent_page_data *epd = data; 2527 flush_epd_write_bio(epd); 2528 } 2529 2530 int extent_write_full_page(struct extent_io_tree *tree, struct page *page, 2531 get_extent_t *get_extent, 2532 struct writeback_control *wbc) 2533 { 2534 int ret; 2535 struct address_space *mapping = page->mapping; 2536 struct extent_page_data epd = { 2537 .bio = NULL, 2538 .tree = tree, 2539 .get_extent = get_extent, 2540 .extent_locked = 0, 2541 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 2542 }; 2543 struct writeback_control wbc_writepages = { 2544 .sync_mode = wbc->sync_mode, 2545 .older_than_this = NULL, 2546 .nr_to_write = 64, 2547 .range_start = page_offset(page) + PAGE_CACHE_SIZE, 2548 .range_end = (loff_t)-1, 2549 }; 2550 2551 ret = __extent_writepage(page, wbc, &epd); 2552 2553 extent_write_cache_pages(tree, mapping, &wbc_writepages, 2554 __extent_writepage, &epd, flush_write_bio); 2555 flush_epd_write_bio(&epd); 2556 return ret; 2557 } 2558 2559 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, 2560 u64 start, u64 end, get_extent_t *get_extent, 2561 int mode) 2562 { 2563 int ret = 0; 2564 struct address_space *mapping = inode->i_mapping; 2565 struct page *page; 2566 unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >> 2567 PAGE_CACHE_SHIFT; 2568 2569 struct extent_page_data epd = { 2570 .bio = NULL, 2571 .tree = tree, 2572 .get_extent = get_extent, 2573 .extent_locked = 1, 2574 .sync_io = mode == WB_SYNC_ALL, 2575 }; 2576 struct writeback_control wbc_writepages = { 2577 .sync_mode = mode, 2578 .older_than_this = NULL, 2579 .nr_to_write = nr_pages * 2, 2580 .range_start = start, 2581 .range_end = end + 1, 2582 }; 2583 2584 while (start <= end) { 2585 page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); 2586 if (clear_page_dirty_for_io(page)) 2587 ret = __extent_writepage(page, &wbc_writepages, &epd); 2588 else { 2589 if (tree->ops && tree->ops->writepage_end_io_hook) 2590 tree->ops->writepage_end_io_hook(page, start, 2591 start + PAGE_CACHE_SIZE - 1, 2592 NULL, 1); 2593 unlock_page(page); 2594 } 2595 page_cache_release(page); 2596 start += PAGE_CACHE_SIZE; 2597 } 2598 2599 flush_epd_write_bio(&epd); 2600 return ret; 2601 } 2602 2603 int extent_writepages(struct extent_io_tree *tree, 2604 struct address_space *mapping, 2605 get_extent_t *get_extent, 2606 struct writeback_control *wbc) 2607 { 2608 int ret = 0; 2609 struct extent_page_data epd = { 2610 .bio = NULL, 2611 .tree = tree, 2612 .get_extent = get_extent, 2613 .extent_locked = 0, 2614 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 2615 }; 2616 2617 ret = extent_write_cache_pages(tree, mapping, wbc, 2618 __extent_writepage, &epd, 2619 flush_write_bio); 2620 flush_epd_write_bio(&epd); 2621 return ret; 2622 } 2623 2624 int extent_readpages(struct extent_io_tree 
*tree, 2625 struct address_space *mapping, 2626 struct list_head *pages, unsigned nr_pages, 2627 get_extent_t get_extent) 2628 { 2629 struct bio *bio = NULL; 2630 unsigned page_idx; 2631 unsigned long bio_flags = 0; 2632 2633 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 2634 struct page *page = list_entry(pages->prev, struct page, lru); 2635 2636 prefetchw(&page->flags); 2637 list_del(&page->lru); 2638 if (!add_to_page_cache_lru(page, mapping, 2639 page->index, GFP_KERNEL)) { 2640 __extent_read_full_page(tree, page, get_extent, 2641 &bio, 0, &bio_flags); 2642 } 2643 page_cache_release(page); 2644 } 2645 BUG_ON(!list_empty(pages)); 2646 if (bio) 2647 submit_one_bio(READ, bio, 0, bio_flags); 2648 return 0; 2649 } 2650 2651 /* 2652 * basic invalidatepage code, this waits on any locked or writeback 2653 * ranges corresponding to the page, and then deletes any extent state 2654 * records from the tree 2655 */ 2656 int extent_invalidatepage(struct extent_io_tree *tree, 2657 struct page *page, unsigned long offset) 2658 { 2659 struct extent_state *cached_state = NULL; 2660 u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); 2661 u64 end = start + PAGE_CACHE_SIZE - 1; 2662 size_t blocksize = page->mapping->host->i_sb->s_blocksize; 2663 2664 start += (offset + blocksize - 1) & ~(blocksize - 1); 2665 if (start > end) 2666 return 0; 2667 2668 lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS); 2669 wait_on_page_writeback(page); 2670 clear_extent_bit(tree, start, end, 2671 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 2672 EXTENT_DO_ACCOUNTING, 2673 1, 1, &cached_state, GFP_NOFS); 2674 return 0; 2675 } 2676 2677 /* 2678 * simple commit_write call, set_range_dirty is used to mark both 2679 * the pages and the extent records as dirty 2680 */ 2681 int extent_commit_write(struct extent_io_tree *tree, 2682 struct inode *inode, struct page *page, 2683 unsigned from, unsigned to) 2684 { 2685 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; 2686 2687 set_page_extent_mapped(page); 2688 set_page_dirty(page); 2689 2690 if (pos > inode->i_size) { 2691 i_size_write(inode, pos); 2692 mark_inode_dirty(inode); 2693 } 2694 return 0; 2695 } 2696 2697 int extent_prepare_write(struct extent_io_tree *tree, 2698 struct inode *inode, struct page *page, 2699 unsigned from, unsigned to, get_extent_t *get_extent) 2700 { 2701 u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT; 2702 u64 page_end = page_start + PAGE_CACHE_SIZE - 1; 2703 u64 block_start; 2704 u64 orig_block_start; 2705 u64 block_end; 2706 u64 cur_end; 2707 struct extent_map *em; 2708 unsigned blocksize = 1 << inode->i_blkbits; 2709 size_t page_offset = 0; 2710 size_t block_off_start; 2711 size_t block_off_end; 2712 int err = 0; 2713 int iocount = 0; 2714 int ret = 0; 2715 int isnew; 2716 2717 set_page_extent_mapped(page); 2718 2719 block_start = (page_start + from) & ~((u64)blocksize - 1); 2720 block_end = (page_start + to - 1) | (blocksize - 1); 2721 orig_block_start = block_start; 2722 2723 lock_extent(tree, page_start, page_end, GFP_NOFS); 2724 while (block_start <= block_end) { 2725 em = get_extent(inode, page, page_offset, block_start, 2726 block_end - block_start + 1, 1); 2727 if (IS_ERR(em) || !em) 2728 goto err; 2729 2730 cur_end = min(block_end, extent_map_end(em) - 1); 2731 block_off_start = block_start & (PAGE_CACHE_SIZE - 1); 2732 block_off_end = block_off_start + blocksize; 2733 isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS); 2734 2735 if (!PageUptodate(page) && isnew && 2736 (block_off_end > to 
|| block_off_start < from)) { 2737 void *kaddr; 2738 2739 kaddr = kmap_atomic(page, KM_USER0); 2740 if (block_off_end > to) 2741 memset(kaddr + to, 0, block_off_end - to); 2742 if (block_off_start < from) 2743 memset(kaddr + block_off_start, 0, 2744 from - block_off_start); 2745 flush_dcache_page(page); 2746 kunmap_atomic(kaddr, KM_USER0); 2747 } 2748 if ((em->block_start != EXTENT_MAP_HOLE && 2749 em->block_start != EXTENT_MAP_INLINE) && 2750 !isnew && !PageUptodate(page) && 2751 (block_off_end > to || block_off_start < from) && 2752 !test_range_bit(tree, block_start, cur_end, 2753 EXTENT_UPTODATE, 1, NULL)) { 2754 u64 sector; 2755 u64 extent_offset = block_start - em->start; 2756 size_t iosize; 2757 sector = (em->block_start + extent_offset) >> 9; 2758 iosize = (cur_end - block_start + blocksize) & 2759 ~((u64)blocksize - 1); 2760 /* 2761 * we've already got the extent locked, but we 2762 * need to split the state such that our end_bio 2763 * handler can clear the lock. 2764 */ 2765 set_extent_bit(tree, block_start, 2766 block_start + iosize - 1, 2767 EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS); 2768 ret = submit_extent_page(READ, tree, page, 2769 sector, iosize, page_offset, em->bdev, 2770 NULL, 1, 2771 end_bio_extent_preparewrite, 0, 2772 0, 0); 2773 if (ret && !err) 2774 err = ret; 2775 iocount++; 2776 block_start = block_start + iosize; 2777 } else { 2778 set_extent_uptodate(tree, block_start, cur_end, 2779 GFP_NOFS); 2780 unlock_extent(tree, block_start, cur_end, GFP_NOFS); 2781 block_start = cur_end + 1; 2782 } 2783 page_offset = block_start & (PAGE_CACHE_SIZE - 1); 2784 free_extent_map(em); 2785 } 2786 if (iocount) { 2787 wait_extent_bit(tree, orig_block_start, 2788 block_end, EXTENT_LOCKED); 2789 } 2790 check_page_uptodate(tree, page); 2791 err: 2792 /* FIXME, zero out newly allocated blocks on error */ 2793 return err; 2794 } 2795 2796 /* 2797 * a helper for releasepage, this tests for areas of the page that 2798 * are locked or under IO and drops the related state bits if it is safe 2799 * to drop the page. 2800 */ 2801 int try_release_extent_state(struct extent_map_tree *map, 2802 struct extent_io_tree *tree, struct page *page, 2803 gfp_t mask) 2804 { 2805 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 2806 u64 end = start + PAGE_CACHE_SIZE - 1; 2807 int ret = 1; 2808 2809 if (test_range_bit(tree, start, end, 2810 EXTENT_IOBITS, 0, NULL)) 2811 ret = 0; 2812 else { 2813 if ((mask & GFP_NOFS) == GFP_NOFS) 2814 mask = GFP_NOFS; 2815 /* 2816 * at this point we can safely clear everything except the 2817 * locked bit and the nodatasum bit 2818 */ 2819 clear_extent_bit(tree, start, end, 2820 ~(EXTENT_LOCKED | EXTENT_NODATASUM), 2821 0, 0, NULL, mask); 2822 } 2823 return ret; 2824 } 2825 2826 /* 2827 * a helper for releasepage. 
As long as there are no locked extents 2828 * in the range corresponding to the page, both state records and extent 2829 * map records are removed 2830 */ 2831 int try_release_extent_mapping(struct extent_map_tree *map, 2832 struct extent_io_tree *tree, struct page *page, 2833 gfp_t mask) 2834 { 2835 struct extent_map *em; 2836 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 2837 u64 end = start + PAGE_CACHE_SIZE - 1; 2838 2839 if ((mask & __GFP_WAIT) && 2840 page->mapping->host->i_size > 16 * 1024 * 1024) { 2841 u64 len; 2842 while (start <= end) { 2843 len = end - start + 1; 2844 write_lock(&map->lock); 2845 em = lookup_extent_mapping(map, start, len); 2846 if (!em || IS_ERR(em)) { 2847 write_unlock(&map->lock); 2848 break; 2849 } 2850 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || 2851 em->start != start) { 2852 write_unlock(&map->lock); 2853 free_extent_map(em); 2854 break; 2855 } 2856 if (!test_range_bit(tree, em->start, 2857 extent_map_end(em) - 1, 2858 EXTENT_LOCKED | EXTENT_WRITEBACK, 2859 0, NULL)) { 2860 remove_extent_mapping(map, em); 2861 /* once for the rb tree */ 2862 free_extent_map(em); 2863 } 2864 start = extent_map_end(em); 2865 write_unlock(&map->lock); 2866 2867 /* once for us */ 2868 free_extent_map(em); 2869 } 2870 } 2871 return try_release_extent_state(map, tree, page, mask); 2872 } 2873 2874 sector_t extent_bmap(struct address_space *mapping, sector_t iblock, 2875 get_extent_t *get_extent) 2876 { 2877 struct inode *inode = mapping->host; 2878 struct extent_state *cached_state = NULL; 2879 u64 start = iblock << inode->i_blkbits; 2880 sector_t sector = 0; 2881 size_t blksize = (1 << inode->i_blkbits); 2882 struct extent_map *em; 2883 2884 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + blksize - 1, 2885 0, &cached_state, GFP_NOFS); 2886 em = get_extent(inode, NULL, 0, start, blksize, 0); 2887 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, 2888 start + blksize - 1, &cached_state, GFP_NOFS); 2889 if (!em || IS_ERR(em)) 2890 return 0; 2891 2892 if (em->block_start > EXTENT_MAP_LAST_BYTE) 2893 goto out; 2894 2895 sector = (em->block_start + start - em->start) >> inode->i_blkbits; 2896 out: 2897 free_extent_map(em); 2898 return sector; 2899 } 2900 2901 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 2902 __u64 start, __u64 len, get_extent_t *get_extent) 2903 { 2904 int ret = 0; 2905 u64 off = start; 2906 u64 max = start + len; 2907 u32 flags = 0; 2908 u32 found_type; 2909 u64 last; 2910 u64 disko = 0; 2911 struct btrfs_key found_key; 2912 struct extent_map *em = NULL; 2913 struct extent_state *cached_state = NULL; 2914 struct btrfs_path *path; 2915 struct btrfs_file_extent_item *item; 2916 int end = 0; 2917 u64 em_start = 0, em_len = 0; 2918 unsigned long emflags; 2919 int hole = 0; 2920 2921 if (len == 0) 2922 return -EINVAL; 2923 2924 path = btrfs_alloc_path(); 2925 if (!path) 2926 return -ENOMEM; 2927 path->leave_spinning = 1; 2928 2929 ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, 2930 path, inode->i_ino, -1, 0); 2931 if (ret < 0) { 2932 btrfs_free_path(path); 2933 return ret; 2934 } 2935 WARN_ON(!ret); 2936 path->slots[0]--; 2937 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2938 struct btrfs_file_extent_item); 2939 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 2940 found_type = btrfs_key_type(&found_key); 2941 2942 /* No extents, just return */ 2943 if (found_key.objectid != inode->i_ino || 2944 found_type != BTRFS_EXTENT_DATA_KEY) { 2945 btrfs_free_path(path); 2946 return 0; 
2947 } 2948 last = found_key.offset; 2949 btrfs_free_path(path); 2950 2951 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, 2952 &cached_state, GFP_NOFS); 2953 em = get_extent(inode, NULL, 0, off, max - off, 0); 2954 if (!em) 2955 goto out; 2956 if (IS_ERR(em)) { 2957 ret = PTR_ERR(em); 2958 goto out; 2959 } 2960 2961 while (!end) { 2962 hole = 0; 2963 off = em->start + em->len; 2964 if (off >= max) 2965 end = 1; 2966 2967 if (em->block_start == EXTENT_MAP_HOLE) { 2968 hole = 1; 2969 goto next; 2970 } 2971 2972 em_start = em->start; 2973 em_len = em->len; 2974 2975 disko = 0; 2976 flags = 0; 2977 2978 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 2979 end = 1; 2980 flags |= FIEMAP_EXTENT_LAST; 2981 } else if (em->block_start == EXTENT_MAP_INLINE) { 2982 flags |= (FIEMAP_EXTENT_DATA_INLINE | 2983 FIEMAP_EXTENT_NOT_ALIGNED); 2984 } else if (em->block_start == EXTENT_MAP_DELALLOC) { 2985 flags |= (FIEMAP_EXTENT_DELALLOC | 2986 FIEMAP_EXTENT_UNKNOWN); 2987 } else { 2988 disko = em->block_start; 2989 } 2990 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 2991 flags |= FIEMAP_EXTENT_ENCODED; 2992 2993 next: 2994 emflags = em->flags; 2995 free_extent_map(em); 2996 em = NULL; 2997 if (!end) { 2998 em = get_extent(inode, NULL, 0, off, max - off, 0); 2999 if (!em) 3000 goto out; 3001 if (IS_ERR(em)) { 3002 ret = PTR_ERR(em); 3003 goto out; 3004 } 3005 emflags = em->flags; 3006 } 3007 3008 if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) { 3009 flags |= FIEMAP_EXTENT_LAST; 3010 end = 1; 3011 } 3012 3013 if (em_start == last) { 3014 flags |= FIEMAP_EXTENT_LAST; 3015 end = 1; 3016 } 3017 3018 if (!hole) { 3019 ret = fiemap_fill_next_extent(fieinfo, em_start, disko, 3020 em_len, flags); 3021 if (ret) 3022 goto out_free; 3023 } 3024 } 3025 out_free: 3026 free_extent_map(em); 3027 out: 3028 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len, 3029 &cached_state, GFP_NOFS); 3030 return ret; 3031 } 3032 3033 static inline struct page *extent_buffer_page(struct extent_buffer *eb, 3034 unsigned long i) 3035 { 3036 struct page *p; 3037 struct address_space *mapping; 3038 3039 if (i == 0) 3040 return eb->first_page; 3041 i += eb->start >> PAGE_CACHE_SHIFT; 3042 mapping = eb->first_page->mapping; 3043 if (!mapping) 3044 return NULL; 3045 3046 /* 3047 * extent_buffer_page is only called after pinning the page 3048 * by increasing the reference count. So we know the page must 3049 * be in the radix tree. 
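 * The rcu_read_lock() below protects only the radix tree lookup itself.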
 */
	rcu_read_lock();
	p = radix_tree_lookup(&mapping->page_tree, i);
	rcu_read_unlock();

	return p;
}

static inline unsigned long num_extent_pages(u64 start, u64 len)
{
	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
		(start >> PAGE_CACHE_SHIFT);
}

static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
						   u64 start,
						   unsigned long len,
						   gfp_t mask)
{
	struct extent_buffer *eb = NULL;
#if LEAK_DEBUG
	unsigned long flags;
#endif

	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
	if (!eb)
		return NULL;
	eb->start = start;
	eb->len = len;
	spin_lock_init(&eb->lock);
	init_waitqueue_head(&eb->lock_wq);

#if LEAK_DEBUG
	spin_lock_irqsave(&leak_lock, flags);
	list_add(&eb->leak_list, &buffers);
	spin_unlock_irqrestore(&leak_lock, flags);
#endif
	atomic_set(&eb->refs, 1);

	return eb;
}

static void __free_extent_buffer(struct extent_buffer *eb)
{
#if LEAK_DEBUG
	unsigned long flags;
	spin_lock_irqsave(&leak_lock, flags);
	list_del(&eb->leak_list);
	spin_unlock_irqrestore(&leak_lock, flags);
#endif
	kmem_cache_free(extent_buffer_cache, eb);
}

/*
 * Helper for releasing extent buffer page.
 */
static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
					     unsigned long start_idx)
{
	unsigned long index;
	struct page *page;

	if (!eb->first_page)
		return;

	index = num_extent_pages(eb->start, eb->len);
	if (start_idx >= index)
		return;

	do {
		index--;
		page = extent_buffer_page(eb, index);
		if (page)
			page_cache_release(page);
	} while (index != start_idx);
}

/*
 * Helper for releasing the extent buffer.
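 * Drops the page references taken when the buffer was set up and then
 * frees the struct extent_buffer itself.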
3127 */ 3128 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) 3129 { 3130 btrfs_release_extent_buffer_page(eb, 0); 3131 __free_extent_buffer(eb); 3132 } 3133 3134 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, 3135 u64 start, unsigned long len, 3136 struct page *page0, 3137 gfp_t mask) 3138 { 3139 unsigned long num_pages = num_extent_pages(start, len); 3140 unsigned long i; 3141 unsigned long index = start >> PAGE_CACHE_SHIFT; 3142 struct extent_buffer *eb; 3143 struct extent_buffer *exists = NULL; 3144 struct page *p; 3145 struct address_space *mapping = tree->mapping; 3146 int uptodate = 1; 3147 int ret; 3148 3149 rcu_read_lock(); 3150 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); 3151 if (eb && atomic_inc_not_zero(&eb->refs)) { 3152 rcu_read_unlock(); 3153 mark_page_accessed(eb->first_page); 3154 return eb; 3155 } 3156 rcu_read_unlock(); 3157 3158 eb = __alloc_extent_buffer(tree, start, len, mask); 3159 if (!eb) 3160 return NULL; 3161 3162 if (page0) { 3163 eb->first_page = page0; 3164 i = 1; 3165 index++; 3166 page_cache_get(page0); 3167 mark_page_accessed(page0); 3168 set_page_extent_mapped(page0); 3169 set_page_extent_head(page0, len); 3170 uptodate = PageUptodate(page0); 3171 } else { 3172 i = 0; 3173 } 3174 for (; i < num_pages; i++, index++) { 3175 p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM); 3176 if (!p) { 3177 WARN_ON(1); 3178 goto free_eb; 3179 } 3180 set_page_extent_mapped(p); 3181 mark_page_accessed(p); 3182 if (i == 0) { 3183 eb->first_page = p; 3184 set_page_extent_head(p, len); 3185 } else { 3186 set_page_private(p, EXTENT_PAGE_PRIVATE); 3187 } 3188 if (!PageUptodate(p)) 3189 uptodate = 0; 3190 unlock_page(p); 3191 } 3192 if (uptodate) 3193 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3194 3195 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); 3196 if (ret) 3197 goto free_eb; 3198 3199 spin_lock(&tree->buffer_lock); 3200 ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb); 3201 if (ret == -EEXIST) { 3202 exists = radix_tree_lookup(&tree->buffer, 3203 start >> PAGE_CACHE_SHIFT); 3204 /* add one reference for the caller */ 3205 atomic_inc(&exists->refs); 3206 spin_unlock(&tree->buffer_lock); 3207 radix_tree_preload_end(); 3208 goto free_eb; 3209 } 3210 /* add one reference for the tree */ 3211 atomic_inc(&eb->refs); 3212 spin_unlock(&tree->buffer_lock); 3213 radix_tree_preload_end(); 3214 return eb; 3215 3216 free_eb: 3217 if (!atomic_dec_and_test(&eb->refs)) 3218 return exists; 3219 btrfs_release_extent_buffer(eb); 3220 return exists; 3221 } 3222 3223 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, 3224 u64 start, unsigned long len, 3225 gfp_t mask) 3226 { 3227 struct extent_buffer *eb; 3228 3229 rcu_read_lock(); 3230 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); 3231 if (eb && atomic_inc_not_zero(&eb->refs)) { 3232 rcu_read_unlock(); 3233 mark_page_accessed(eb->first_page); 3234 return eb; 3235 } 3236 rcu_read_unlock(); 3237 3238 return NULL; 3239 } 3240 3241 void free_extent_buffer(struct extent_buffer *eb) 3242 { 3243 if (!eb) 3244 return; 3245 3246 if (!atomic_dec_and_test(&eb->refs)) 3247 return; 3248 3249 WARN_ON(1); 3250 } 3251 3252 int clear_extent_buffer_dirty(struct extent_io_tree *tree, 3253 struct extent_buffer *eb) 3254 { 3255 unsigned long i; 3256 unsigned long num_pages; 3257 struct page *page; 3258 3259 num_pages = num_extent_pages(eb->start, eb->len); 3260 3261 for (i = 0; i < num_pages; i++) { 3262 page = 
extent_buffer_page(eb, i); 3263 if (!PageDirty(page)) 3264 continue; 3265 3266 lock_page(page); 3267 if (i == 0) 3268 set_page_extent_head(page, eb->len); 3269 else 3270 set_page_private(page, EXTENT_PAGE_PRIVATE); 3271 3272 clear_page_dirty_for_io(page); 3273 spin_lock_irq(&page->mapping->tree_lock); 3274 if (!PageDirty(page)) { 3275 radix_tree_tag_clear(&page->mapping->page_tree, 3276 page_index(page), 3277 PAGECACHE_TAG_DIRTY); 3278 } 3279 spin_unlock_irq(&page->mapping->tree_lock); 3280 unlock_page(page); 3281 } 3282 return 0; 3283 } 3284 3285 int wait_on_extent_buffer_writeback(struct extent_io_tree *tree, 3286 struct extent_buffer *eb) 3287 { 3288 return wait_on_extent_writeback(tree, eb->start, 3289 eb->start + eb->len - 1); 3290 } 3291 3292 int set_extent_buffer_dirty(struct extent_io_tree *tree, 3293 struct extent_buffer *eb) 3294 { 3295 unsigned long i; 3296 unsigned long num_pages; 3297 int was_dirty = 0; 3298 3299 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); 3300 num_pages = num_extent_pages(eb->start, eb->len); 3301 for (i = 0; i < num_pages; i++) 3302 __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); 3303 return was_dirty; 3304 } 3305 3306 int clear_extent_buffer_uptodate(struct extent_io_tree *tree, 3307 struct extent_buffer *eb, 3308 struct extent_state **cached_state) 3309 { 3310 unsigned long i; 3311 struct page *page; 3312 unsigned long num_pages; 3313 3314 num_pages = num_extent_pages(eb->start, eb->len); 3315 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3316 3317 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3318 cached_state, GFP_NOFS); 3319 for (i = 0; i < num_pages; i++) { 3320 page = extent_buffer_page(eb, i); 3321 if (page) 3322 ClearPageUptodate(page); 3323 } 3324 return 0; 3325 } 3326 3327 int set_extent_buffer_uptodate(struct extent_io_tree *tree, 3328 struct extent_buffer *eb) 3329 { 3330 unsigned long i; 3331 struct page *page; 3332 unsigned long num_pages; 3333 3334 num_pages = num_extent_pages(eb->start, eb->len); 3335 3336 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3337 GFP_NOFS); 3338 for (i = 0; i < num_pages; i++) { 3339 page = extent_buffer_page(eb, i); 3340 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || 3341 ((i == num_pages - 1) && 3342 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { 3343 check_page_uptodate(tree, page); 3344 continue; 3345 } 3346 SetPageUptodate(page); 3347 } 3348 return 0; 3349 } 3350 3351 int extent_range_uptodate(struct extent_io_tree *tree, 3352 u64 start, u64 end) 3353 { 3354 struct page *page; 3355 int ret; 3356 int pg_uptodate = 1; 3357 int uptodate; 3358 unsigned long index; 3359 3360 ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL); 3361 if (ret) 3362 return 1; 3363 while (start <= end) { 3364 index = start >> PAGE_CACHE_SHIFT; 3365 page = find_get_page(tree->mapping, index); 3366 uptodate = PageUptodate(page); 3367 page_cache_release(page); 3368 if (!uptodate) { 3369 pg_uptodate = 0; 3370 break; 3371 } 3372 start += PAGE_CACHE_SIZE; 3373 } 3374 return pg_uptodate; 3375 } 3376 3377 int extent_buffer_uptodate(struct extent_io_tree *tree, 3378 struct extent_buffer *eb, 3379 struct extent_state *cached_state) 3380 { 3381 int ret = 0; 3382 unsigned long num_pages; 3383 unsigned long i; 3384 struct page *page; 3385 int pg_uptodate = 1; 3386 3387 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 3388 return 1; 3389 3390 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3391 EXTENT_UPTODATE, 1, cached_state); 3392 if (ret) 
3393 return ret; 3394 3395 num_pages = num_extent_pages(eb->start, eb->len); 3396 for (i = 0; i < num_pages; i++) { 3397 page = extent_buffer_page(eb, i); 3398 if (!PageUptodate(page)) { 3399 pg_uptodate = 0; 3400 break; 3401 } 3402 } 3403 return pg_uptodate; 3404 } 3405 3406 int read_extent_buffer_pages(struct extent_io_tree *tree, 3407 struct extent_buffer *eb, 3408 u64 start, int wait, 3409 get_extent_t *get_extent, int mirror_num) 3410 { 3411 unsigned long i; 3412 unsigned long start_i; 3413 struct page *page; 3414 int err; 3415 int ret = 0; 3416 int locked_pages = 0; 3417 int all_uptodate = 1; 3418 int inc_all_pages = 0; 3419 unsigned long num_pages; 3420 struct bio *bio = NULL; 3421 unsigned long bio_flags = 0; 3422 3423 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 3424 return 0; 3425 3426 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3427 EXTENT_UPTODATE, 1, NULL)) { 3428 return 0; 3429 } 3430 3431 if (start) { 3432 WARN_ON(start < eb->start); 3433 start_i = (start >> PAGE_CACHE_SHIFT) - 3434 (eb->start >> PAGE_CACHE_SHIFT); 3435 } else { 3436 start_i = 0; 3437 } 3438 3439 num_pages = num_extent_pages(eb->start, eb->len); 3440 for (i = start_i; i < num_pages; i++) { 3441 page = extent_buffer_page(eb, i); 3442 if (!wait) { 3443 if (!trylock_page(page)) 3444 goto unlock_exit; 3445 } else { 3446 lock_page(page); 3447 } 3448 locked_pages++; 3449 if (!PageUptodate(page)) 3450 all_uptodate = 0; 3451 } 3452 if (all_uptodate) { 3453 if (start_i == 0) 3454 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3455 goto unlock_exit; 3456 } 3457 3458 for (i = start_i; i < num_pages; i++) { 3459 page = extent_buffer_page(eb, i); 3460 if (inc_all_pages) 3461 page_cache_get(page); 3462 if (!PageUptodate(page)) { 3463 if (start_i == 0) 3464 inc_all_pages = 1; 3465 ClearPageError(page); 3466 err = __extent_read_full_page(tree, page, 3467 get_extent, &bio, 3468 mirror_num, &bio_flags); 3469 if (err) 3470 ret = err; 3471 } else { 3472 unlock_page(page); 3473 } 3474 } 3475 3476 if (bio) 3477 submit_one_bio(READ, bio, mirror_num, bio_flags); 3478 3479 if (ret || !wait) 3480 return ret; 3481 3482 for (i = start_i; i < num_pages; i++) { 3483 page = extent_buffer_page(eb, i); 3484 wait_on_page_locked(page); 3485 if (!PageUptodate(page)) 3486 ret = -EIO; 3487 } 3488 3489 if (!ret) 3490 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3491 return ret; 3492 3493 unlock_exit: 3494 i = start_i; 3495 while (locked_pages > 0) { 3496 page = extent_buffer_page(eb, i); 3497 i++; 3498 unlock_page(page); 3499 locked_pages--; 3500 } 3501 return ret; 3502 } 3503 3504 void read_extent_buffer(struct extent_buffer *eb, void *dstv, 3505 unsigned long start, 3506 unsigned long len) 3507 { 3508 size_t cur; 3509 size_t offset; 3510 struct page *page; 3511 char *kaddr; 3512 char *dst = (char *)dstv; 3513 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3514 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3515 3516 WARN_ON(start > eb->len); 3517 WARN_ON(start + len > eb->start + eb->len); 3518 3519 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 3520 3521 while (len > 0) { 3522 page = extent_buffer_page(eb, i); 3523 3524 cur = min(len, (PAGE_CACHE_SIZE - offset)); 3525 kaddr = kmap_atomic(page, KM_USER1); 3526 memcpy(dst, kaddr + offset, cur); 3527 kunmap_atomic(kaddr, KM_USER1); 3528 3529 dst += cur; 3530 len -= cur; 3531 offset = 0; 3532 i++; 3533 } 3534 } 3535 3536 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, 3537 unsigned long 
min_len, char **token, char **map, 3538 unsigned long *map_start, 3539 unsigned long *map_len, int km) 3540 { 3541 size_t offset = start & (PAGE_CACHE_SIZE - 1); 3542 char *kaddr; 3543 struct page *p; 3544 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3545 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3546 unsigned long end_i = (start_offset + start + min_len - 1) >> 3547 PAGE_CACHE_SHIFT; 3548 3549 if (i != end_i) 3550 return -EINVAL; 3551 3552 if (i == 0) { 3553 offset = start_offset; 3554 *map_start = 0; 3555 } else { 3556 offset = 0; 3557 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset; 3558 } 3559 3560 if (start + min_len > eb->len) { 3561 printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " 3562 "wanted %lu %lu\n", (unsigned long long)eb->start, 3563 eb->len, start, min_len); 3564 WARN_ON(1); 3565 } 3566 3567 p = extent_buffer_page(eb, i); 3568 kaddr = kmap_atomic(p, km); 3569 *token = kaddr; 3570 *map = kaddr + offset; 3571 *map_len = PAGE_CACHE_SIZE - offset; 3572 return 0; 3573 } 3574 3575 int map_extent_buffer(struct extent_buffer *eb, unsigned long start, 3576 unsigned long min_len, 3577 char **token, char **map, 3578 unsigned long *map_start, 3579 unsigned long *map_len, int km) 3580 { 3581 int err; 3582 int save = 0; 3583 if (eb->map_token) { 3584 unmap_extent_buffer(eb, eb->map_token, km); 3585 eb->map_token = NULL; 3586 save = 1; 3587 } 3588 err = map_private_extent_buffer(eb, start, min_len, token, map, 3589 map_start, map_len, km); 3590 if (!err && save) { 3591 eb->map_token = *token; 3592 eb->kaddr = *map; 3593 eb->map_start = *map_start; 3594 eb->map_len = *map_len; 3595 } 3596 return err; 3597 } 3598 3599 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km) 3600 { 3601 kunmap_atomic(token, km); 3602 } 3603 3604 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, 3605 unsigned long start, 3606 unsigned long len) 3607 { 3608 size_t cur; 3609 size_t offset; 3610 struct page *page; 3611 char *kaddr; 3612 char *ptr = (char *)ptrv; 3613 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3614 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3615 int ret = 0; 3616 3617 WARN_ON(start > eb->len); 3618 WARN_ON(start + len > eb->start + eb->len); 3619 3620 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 3621 3622 while (len > 0) { 3623 page = extent_buffer_page(eb, i); 3624 3625 cur = min(len, (PAGE_CACHE_SIZE - offset)); 3626 3627 kaddr = kmap_atomic(page, KM_USER0); 3628 ret = memcmp(ptr, kaddr + offset, cur); 3629 kunmap_atomic(kaddr, KM_USER0); 3630 if (ret) 3631 break; 3632 3633 ptr += cur; 3634 len -= cur; 3635 offset = 0; 3636 i++; 3637 } 3638 return ret; 3639 } 3640 3641 void write_extent_buffer(struct extent_buffer *eb, const void *srcv, 3642 unsigned long start, unsigned long len) 3643 { 3644 size_t cur; 3645 size_t offset; 3646 struct page *page; 3647 char *kaddr; 3648 char *src = (char *)srcv; 3649 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3650 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3651 3652 WARN_ON(start > eb->len); 3653 WARN_ON(start + len > eb->start + eb->len); 3654 3655 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 3656 3657 while (len > 0) { 3658 page = extent_buffer_page(eb, i); 3659 WARN_ON(!PageUptodate(page)); 3660 3661 cur = min(len, PAGE_CACHE_SIZE - offset); 3662 kaddr = kmap_atomic(page, KM_USER1); 3663 memcpy(kaddr + offset, src, cur); 3664 
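		/* release the temporary mapping and advance to the next page */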
kunmap_atomic(kaddr, KM_USER1); 3665 3666 src += cur; 3667 len -= cur; 3668 offset = 0; 3669 i++; 3670 } 3671 } 3672 3673 void memset_extent_buffer(struct extent_buffer *eb, char c, 3674 unsigned long start, unsigned long len) 3675 { 3676 size_t cur; 3677 size_t offset; 3678 struct page *page; 3679 char *kaddr; 3680 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3681 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3682 3683 WARN_ON(start > eb->len); 3684 WARN_ON(start + len > eb->start + eb->len); 3685 3686 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 3687 3688 while (len > 0) { 3689 page = extent_buffer_page(eb, i); 3690 WARN_ON(!PageUptodate(page)); 3691 3692 cur = min(len, PAGE_CACHE_SIZE - offset); 3693 kaddr = kmap_atomic(page, KM_USER0); 3694 memset(kaddr + offset, c, cur); 3695 kunmap_atomic(kaddr, KM_USER0); 3696 3697 len -= cur; 3698 offset = 0; 3699 i++; 3700 } 3701 } 3702 3703 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, 3704 unsigned long dst_offset, unsigned long src_offset, 3705 unsigned long len) 3706 { 3707 u64 dst_len = dst->len; 3708 size_t cur; 3709 size_t offset; 3710 struct page *page; 3711 char *kaddr; 3712 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); 3713 unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; 3714 3715 WARN_ON(src->len != dst_len); 3716 3717 offset = (start_offset + dst_offset) & 3718 ((unsigned long)PAGE_CACHE_SIZE - 1); 3719 3720 while (len > 0) { 3721 page = extent_buffer_page(dst, i); 3722 WARN_ON(!PageUptodate(page)); 3723 3724 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); 3725 3726 kaddr = kmap_atomic(page, KM_USER0); 3727 read_extent_buffer(src, kaddr + offset, src_offset, cur); 3728 kunmap_atomic(kaddr, KM_USER0); 3729 3730 src_offset += cur; 3731 len -= cur; 3732 offset = 0; 3733 i++; 3734 } 3735 } 3736 3737 static void move_pages(struct page *dst_page, struct page *src_page, 3738 unsigned long dst_off, unsigned long src_off, 3739 unsigned long len) 3740 { 3741 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); 3742 if (dst_page == src_page) { 3743 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); 3744 } else { 3745 char *src_kaddr = kmap_atomic(src_page, KM_USER1); 3746 char *p = dst_kaddr + dst_off + len; 3747 char *s = src_kaddr + src_off + len; 3748 3749 while (len--) 3750 *--p = *--s; 3751 3752 kunmap_atomic(src_kaddr, KM_USER1); 3753 } 3754 kunmap_atomic(dst_kaddr, KM_USER0); 3755 } 3756 3757 static void copy_pages(struct page *dst_page, struct page *src_page, 3758 unsigned long dst_off, unsigned long src_off, 3759 unsigned long len) 3760 { 3761 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); 3762 char *src_kaddr; 3763 3764 if (dst_page != src_page) 3765 src_kaddr = kmap_atomic(src_page, KM_USER1); 3766 else 3767 src_kaddr = dst_kaddr; 3768 3769 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); 3770 kunmap_atomic(dst_kaddr, KM_USER0); 3771 if (dst_page != src_page) 3772 kunmap_atomic(src_kaddr, KM_USER1); 3773 } 3774 3775 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 3776 unsigned long src_offset, unsigned long len) 3777 { 3778 size_t cur; 3779 size_t dst_off_in_page; 3780 size_t src_off_in_page; 3781 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); 3782 unsigned long dst_i; 3783 unsigned long src_i; 3784 3785 if (src_offset + len > dst->len) { 3786 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " 3787 "len %lu dst len %lu\n", 
src_offset, len, dst->len); 3788 BUG_ON(1); 3789 } 3790 if (dst_offset + len > dst->len) { 3791 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " 3792 "len %lu dst len %lu\n", dst_offset, len, dst->len); 3793 BUG_ON(1); 3794 } 3795 3796 while (len > 0) { 3797 dst_off_in_page = (start_offset + dst_offset) & 3798 ((unsigned long)PAGE_CACHE_SIZE - 1); 3799 src_off_in_page = (start_offset + src_offset) & 3800 ((unsigned long)PAGE_CACHE_SIZE - 1); 3801 3802 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; 3803 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT; 3804 3805 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - 3806 src_off_in_page)); 3807 cur = min_t(unsigned long, cur, 3808 (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page)); 3809 3810 copy_pages(extent_buffer_page(dst, dst_i), 3811 extent_buffer_page(dst, src_i), 3812 dst_off_in_page, src_off_in_page, cur); 3813 3814 src_offset += cur; 3815 dst_offset += cur; 3816 len -= cur; 3817 } 3818 } 3819 3820 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 3821 unsigned long src_offset, unsigned long len) 3822 { 3823 size_t cur; 3824 size_t dst_off_in_page; 3825 size_t src_off_in_page; 3826 unsigned long dst_end = dst_offset + len - 1; 3827 unsigned long src_end = src_offset + len - 1; 3828 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); 3829 unsigned long dst_i; 3830 unsigned long src_i; 3831 3832 if (src_offset + len > dst->len) { 3833 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " 3834 "len %lu len %lu\n", src_offset, len, dst->len); 3835 BUG_ON(1); 3836 } 3837 if (dst_offset + len > dst->len) { 3838 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " 3839 "len %lu len %lu\n", dst_offset, len, dst->len); 3840 BUG_ON(1); 3841 } 3842 if (dst_offset < src_offset) { 3843 memcpy_extent_buffer(dst, dst_offset, src_offset, len); 3844 return; 3845 } 3846 while (len > 0) { 3847 dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT; 3848 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT; 3849 3850 dst_off_in_page = (start_offset + dst_end) & 3851 ((unsigned long)PAGE_CACHE_SIZE - 1); 3852 src_off_in_page = (start_offset + src_end) & 3853 ((unsigned long)PAGE_CACHE_SIZE - 1); 3854 3855 cur = min_t(unsigned long, len, src_off_in_page + 1); 3856 cur = min(cur, dst_off_in_page + 1); 3857 move_pages(extent_buffer_page(dst, dst_i), 3858 extent_buffer_page(dst, src_i), 3859 dst_off_in_page - cur + 1, 3860 src_off_in_page - cur + 1, cur); 3861 3862 dst_end -= cur; 3863 src_end -= cur; 3864 len -= cur; 3865 } 3866 } 3867 3868 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) 3869 { 3870 struct extent_buffer *eb = 3871 container_of(head, struct extent_buffer, rcu_head); 3872 3873 btrfs_release_extent_buffer(eb); 3874 } 3875 3876 int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) 3877 { 3878 u64 start = page_offset(page); 3879 struct extent_buffer *eb; 3880 int ret = 1; 3881 3882 spin_lock(&tree->buffer_lock); 3883 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); 3884 if (!eb) { 3885 spin_unlock(&tree->buffer_lock); 3886 return ret; 3887 } 3888 3889 if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3890 ret = 0; 3891 goto out; 3892 } 3893 3894 /* 3895 * set @eb->refs to 0 if it is already 1, and then release the @eb. 3896 * Or go back. 
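 * (a reference count above 1 means the buffer is still in use somewhere
 * else, so it must not be released here)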
 */
	if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) {
		ret = 0;
		goto out;
	}

	radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
out:
	spin_unlock(&tree->buffer_lock);

	/* at this point we can safely release the extent buffer */
	if (atomic_read(&eb->refs) == 0)
		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
	return ret;
}
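
/*
 * Illustrative sketch only (not part of the build): a typical way to read a
 * metadata block with the extent buffer API defined above.  The 'tree',
 * 'start', 'blocksize' and 'get_extent' names are assumptions standing in
 * for whatever io tree, byte offset, block size and get_extent_t callback
 * the caller already has; error handling is reduced to the minimum.
 *
 *	struct extent_buffer *eb;
 *	u64 data;
 *
 *	eb = alloc_extent_buffer(tree, start, blocksize, NULL, GFP_NOFS);
 *	if (!eb)
 *		return -ENOMEM;
 *	if (read_extent_buffer_pages(tree, eb, 0, 1, get_extent, 0)) {
 *		free_extent_buffer(eb);
 *		return -EIO;
 *	}
 *	read_extent_buffer(eb, &data, 0, sizeof(data));
 *	free_extent_buffer(eb);
 */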