1 #include <linux/bitops.h> 2 #include <linux/slab.h> 3 #include <linux/bio.h> 4 #include <linux/mm.h> 5 #include <linux/pagemap.h> 6 #include <linux/page-flags.h> 7 #include <linux/module.h> 8 #include <linux/spinlock.h> 9 #include <linux/blkdev.h> 10 #include <linux/swap.h> 11 #include <linux/writeback.h> 12 #include <linux/pagevec.h> 13 #include "extent_io.h" 14 #include "extent_map.h" 15 #include "compat.h" 16 #include "ctree.h" 17 #include "btrfs_inode.h" 18 19 static struct kmem_cache *extent_state_cache; 20 static struct kmem_cache *extent_buffer_cache; 21 22 static LIST_HEAD(buffers); 23 static LIST_HEAD(states); 24 25 #define LEAK_DEBUG 0 26 #if LEAK_DEBUG 27 static DEFINE_SPINLOCK(leak_lock); 28 #endif 29 30 #define BUFFER_LRU_MAX 64 31 32 struct tree_entry { 33 u64 start; 34 u64 end; 35 struct rb_node rb_node; 36 }; 37 38 struct extent_page_data { 39 struct bio *bio; 40 struct extent_io_tree *tree; 41 get_extent_t *get_extent; 42 43 /* tells writepage not to lock the state bits for this range 44 * it still does the unlocking 45 */ 46 unsigned int extent_locked:1; 47 48 /* tells the submit_bio code to use a WRITE_SYNC */ 49 unsigned int sync_io:1; 50 }; 51 52 int __init extent_io_init(void) 53 { 54 extent_state_cache = kmem_cache_create("extent_state", 55 sizeof(struct extent_state), 0, 56 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 57 if (!extent_state_cache) 58 return -ENOMEM; 59 60 extent_buffer_cache = kmem_cache_create("extent_buffers", 61 sizeof(struct extent_buffer), 0, 62 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); 63 if (!extent_buffer_cache) 64 goto free_state_cache; 65 return 0; 66 67 free_state_cache: 68 kmem_cache_destroy(extent_state_cache); 69 return -ENOMEM; 70 } 71 72 void extent_io_exit(void) 73 { 74 struct extent_state *state; 75 struct extent_buffer *eb; 76 77 while (!list_empty(&states)) { 78 state = list_entry(states.next, struct extent_state, leak_list); 79 printk(KERN_ERR "btrfs state leak: start %llu end %llu " 80 "state %lu in tree %p refs %d\n", 81 (unsigned long long)state->start, 82 (unsigned long long)state->end, 83 state->state, state->tree, atomic_read(&state->refs)); 84 list_del(&state->leak_list); 85 kmem_cache_free(extent_state_cache, state); 86 87 } 88 89 while (!list_empty(&buffers)) { 90 eb = list_entry(buffers.next, struct extent_buffer, leak_list); 91 printk(KERN_ERR "btrfs buffer leak start %llu len %lu " 92 "refs %d\n", (unsigned long long)eb->start, 93 eb->len, atomic_read(&eb->refs)); 94 list_del(&eb->leak_list); 95 kmem_cache_free(extent_buffer_cache, eb); 96 } 97 if (extent_state_cache) 98 kmem_cache_destroy(extent_state_cache); 99 if (extent_buffer_cache) 100 kmem_cache_destroy(extent_buffer_cache); 101 } 102 103 void extent_io_tree_init(struct extent_io_tree *tree, 104 struct address_space *mapping, gfp_t mask) 105 { 106 tree->state = RB_ROOT; 107 INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC); 108 tree->ops = NULL; 109 tree->dirty_bytes = 0; 110 spin_lock_init(&tree->lock); 111 spin_lock_init(&tree->buffer_lock); 112 tree->mapping = mapping; 113 } 114 115 static struct extent_state *alloc_extent_state(gfp_t mask) 116 { 117 struct extent_state *state; 118 #if LEAK_DEBUG 119 unsigned long flags; 120 #endif 121 122 state = kmem_cache_alloc(extent_state_cache, mask); 123 if (!state) 124 return state; 125 state->state = 0; 126 state->private = 0; 127 state->tree = NULL; 128 #if LEAK_DEBUG 129 spin_lock_irqsave(&leak_lock, flags); 130 list_add(&state->leak_list, &states); 131 spin_unlock_irqrestore(&leak_lock, flags); 132 #endif 133 
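	/*
	 * each new state starts with a single reference; callers drop
	 * it with free_extent_state() when they are done with it
	 */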
atomic_set(&state->refs, 1); 134 init_waitqueue_head(&state->wq); 135 return state; 136 } 137 138 void free_extent_state(struct extent_state *state) 139 { 140 if (!state) 141 return; 142 if (atomic_dec_and_test(&state->refs)) { 143 #if LEAK_DEBUG 144 unsigned long flags; 145 #endif 146 WARN_ON(state->tree); 147 #if LEAK_DEBUG 148 spin_lock_irqsave(&leak_lock, flags); 149 list_del(&state->leak_list); 150 spin_unlock_irqrestore(&leak_lock, flags); 151 #endif 152 kmem_cache_free(extent_state_cache, state); 153 } 154 } 155 156 static struct rb_node *tree_insert(struct rb_root *root, u64 offset, 157 struct rb_node *node) 158 { 159 struct rb_node **p = &root->rb_node; 160 struct rb_node *parent = NULL; 161 struct tree_entry *entry; 162 163 while (*p) { 164 parent = *p; 165 entry = rb_entry(parent, struct tree_entry, rb_node); 166 167 if (offset < entry->start) 168 p = &(*p)->rb_left; 169 else if (offset > entry->end) 170 p = &(*p)->rb_right; 171 else 172 return parent; 173 } 174 175 entry = rb_entry(node, struct tree_entry, rb_node); 176 rb_link_node(node, parent, p); 177 rb_insert_color(node, root); 178 return NULL; 179 } 180 181 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, 182 struct rb_node **prev_ret, 183 struct rb_node **next_ret) 184 { 185 struct rb_root *root = &tree->state; 186 struct rb_node *n = root->rb_node; 187 struct rb_node *prev = NULL; 188 struct rb_node *orig_prev = NULL; 189 struct tree_entry *entry; 190 struct tree_entry *prev_entry = NULL; 191 192 while (n) { 193 entry = rb_entry(n, struct tree_entry, rb_node); 194 prev = n; 195 prev_entry = entry; 196 197 if (offset < entry->start) 198 n = n->rb_left; 199 else if (offset > entry->end) 200 n = n->rb_right; 201 else 202 return n; 203 } 204 205 if (prev_ret) { 206 orig_prev = prev; 207 while (prev && offset > prev_entry->end) { 208 prev = rb_next(prev); 209 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 210 } 211 *prev_ret = prev; 212 prev = orig_prev; 213 } 214 215 if (next_ret) { 216 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 217 while (prev && offset < prev_entry->start) { 218 prev = rb_prev(prev); 219 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 220 } 221 *next_ret = prev; 222 } 223 return NULL; 224 } 225 226 static inline struct rb_node *tree_search(struct extent_io_tree *tree, 227 u64 offset) 228 { 229 struct rb_node *prev = NULL; 230 struct rb_node *ret; 231 232 ret = __etree_search(tree, offset, &prev, NULL); 233 if (!ret) 234 return prev; 235 return ret; 236 } 237 238 static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, 239 struct extent_state *other) 240 { 241 if (tree->ops && tree->ops->merge_extent_hook) 242 tree->ops->merge_extent_hook(tree->mapping->host, new, 243 other); 244 } 245 246 /* 247 * utility function to look for merge candidates inside a given range. 248 * Any extents with matching state are merged together into a single 249 * extent in the tree. Extents with EXTENT_IO in their state field 250 * are not merged because the end_io handlers need to be able to do 251 * operations on them without sleeping (or doing allocations/splits). 252 * 253 * This should be called with the tree lock held. 
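 *
 * For example, a state covering [0, 4095] and a neighbouring state
 * covering [4096, 8191] with identical bits collapse into a single
 * [0, 8191] state; the redundant struct is erased from the tree and
 * freed.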
254 */ 255 static int merge_state(struct extent_io_tree *tree, 256 struct extent_state *state) 257 { 258 struct extent_state *other; 259 struct rb_node *other_node; 260 261 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 262 return 0; 263 264 other_node = rb_prev(&state->rb_node); 265 if (other_node) { 266 other = rb_entry(other_node, struct extent_state, rb_node); 267 if (other->end == state->start - 1 && 268 other->state == state->state) { 269 merge_cb(tree, state, other); 270 state->start = other->start; 271 other->tree = NULL; 272 rb_erase(&other->rb_node, &tree->state); 273 free_extent_state(other); 274 } 275 } 276 other_node = rb_next(&state->rb_node); 277 if (other_node) { 278 other = rb_entry(other_node, struct extent_state, rb_node); 279 if (other->start == state->end + 1 && 280 other->state == state->state) { 281 merge_cb(tree, state, other); 282 other->start = state->start; 283 state->tree = NULL; 284 rb_erase(&state->rb_node, &tree->state); 285 free_extent_state(state); 286 state = NULL; 287 } 288 } 289 290 return 0; 291 } 292 293 static int set_state_cb(struct extent_io_tree *tree, 294 struct extent_state *state, int *bits) 295 { 296 if (tree->ops && tree->ops->set_bit_hook) { 297 return tree->ops->set_bit_hook(tree->mapping->host, 298 state, bits); 299 } 300 301 return 0; 302 } 303 304 static void clear_state_cb(struct extent_io_tree *tree, 305 struct extent_state *state, int *bits) 306 { 307 if (tree->ops && tree->ops->clear_bit_hook) 308 tree->ops->clear_bit_hook(tree->mapping->host, state, bits); 309 } 310 311 /* 312 * insert an extent_state struct into the tree. 'bits' are set on the 313 * struct before it is inserted. 314 * 315 * This may return -EEXIST if the extent is already there, in which case the 316 * state struct is freed. 317 * 318 * The tree lock is not taken internally. This is a utility function and 319 * probably isn't what you want to call (see set/clear_extent_bit). 320 */ 321 static int insert_state(struct extent_io_tree *tree, 322 struct extent_state *state, u64 start, u64 end, 323 int *bits) 324 { 325 struct rb_node *node; 326 int bits_to_set = *bits & ~EXTENT_CTLBITS; 327 int ret; 328 329 if (end < start) { 330 printk(KERN_ERR "btrfs end < start %llu %llu\n", 331 (unsigned long long)end, 332 (unsigned long long)start); 333 WARN_ON(1); 334 } 335 state->start = start; 336 state->end = end; 337 ret = set_state_cb(tree, state, bits); 338 if (ret) 339 return ret; 340 341 if (bits_to_set & EXTENT_DIRTY) 342 tree->dirty_bytes += end - start + 1; 343 state->state |= bits_to_set; 344 node = tree_insert(&tree->state, end, &state->rb_node); 345 if (node) { 346 struct extent_state *found; 347 found = rb_entry(node, struct extent_state, rb_node); 348 printk(KERN_ERR "btrfs found node %llu %llu on insert of " 349 "%llu %llu\n", (unsigned long long)found->start, 350 (unsigned long long)found->end, 351 (unsigned long long)start, (unsigned long long)end); 352 free_extent_state(state); 353 return -EEXIST; 354 } 355 state->tree = tree; 356 merge_state(tree, state); 357 return 0; 358 } 359 360 static int split_cb(struct extent_io_tree *tree, struct extent_state *orig, 361 u64 split) 362 { 363 if (tree->ops && tree->ops->split_extent_hook) 364 return tree->ops->split_extent_hook(tree->mapping->host, 365 orig, split); 366 return 0; 367 } 368 369 /* 370 * split a given extent state struct in two, inserting the preallocated 371 * struct 'prealloc' as the newly created second half. 'split' indicates an 372 * offset inside 'orig' where it should be split. 
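 * The second struct ('prealloc') must be supplied by the caller,
 * preallocated (or taken from GFP_ATOMIC as a fallback), because this
 * runs under the tree spinlock and must not sleep.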
373 * 374 * Before calling, 375 * the tree has 'orig' at [orig->start, orig->end]. After calling, there 376 * are two extent state structs in the tree: 377 * prealloc: [orig->start, split - 1] 378 * orig: [ split, orig->end ] 379 * 380 * The tree locks are not taken by this function. They need to be held 381 * by the caller. 382 */ 383 static int split_state(struct extent_io_tree *tree, struct extent_state *orig, 384 struct extent_state *prealloc, u64 split) 385 { 386 struct rb_node *node; 387 388 split_cb(tree, orig, split); 389 390 prealloc->start = orig->start; 391 prealloc->end = split - 1; 392 prealloc->state = orig->state; 393 orig->start = split; 394 395 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); 396 if (node) { 397 free_extent_state(prealloc); 398 return -EEXIST; 399 } 400 prealloc->tree = tree; 401 return 0; 402 } 403 404 /* 405 * utility function to clear some bits in an extent state struct. 406 * it will optionally wake up any one waiting on this state (wake == 1), or 407 * forcibly remove the state from the tree (delete == 1). 408 * 409 * If no bits are set on the state struct after clearing things, the 410 * struct is freed and removed from the tree 411 */ 412 static int clear_state_bit(struct extent_io_tree *tree, 413 struct extent_state *state, 414 int *bits, int wake) 415 { 416 int bits_to_clear = *bits & ~EXTENT_CTLBITS; 417 int ret = state->state & bits_to_clear; 418 419 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 420 u64 range = state->end - state->start + 1; 421 WARN_ON(range > tree->dirty_bytes); 422 tree->dirty_bytes -= range; 423 } 424 clear_state_cb(tree, state, bits); 425 state->state &= ~bits_to_clear; 426 if (wake) 427 wake_up(&state->wq); 428 if (state->state == 0) { 429 if (state->tree) { 430 rb_erase(&state->rb_node, &tree->state); 431 state->tree = NULL; 432 free_extent_state(state); 433 } else { 434 WARN_ON(1); 435 } 436 } else { 437 merge_state(tree, state); 438 } 439 return ret; 440 } 441 442 /* 443 * clear some bits on a range in the tree. This may require splitting 444 * or inserting elements in the tree, so the gfp mask is used to 445 * indicate which allocations or sleeping are allowed. 446 * 447 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove 448 * the given range from the tree regardless of state (ie for truncate). 449 * 450 * the range [start, end] is inclusive. 451 * 452 * This takes the tree lock, and returns < 0 on error, > 0 if any of the 453 * bits were already set, or zero if none of the bits were already set. 
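 *
 * A typical caller looks like the wrappers further down, e.g.
 * clear_extent_dirty():
 *
 *	clear_extent_bit(tree, start, end,
 *			 EXTENT_DIRTY | EXTENT_DELALLOC |
 *			 EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);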
454 */ 455 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 456 int bits, int wake, int delete, 457 struct extent_state **cached_state, 458 gfp_t mask) 459 { 460 struct extent_state *state; 461 struct extent_state *cached; 462 struct extent_state *prealloc = NULL; 463 struct rb_node *next_node; 464 struct rb_node *node; 465 u64 last_end; 466 int err; 467 int set = 0; 468 int clear = 0; 469 470 if (delete) 471 bits |= ~EXTENT_CTLBITS; 472 bits |= EXTENT_FIRST_DELALLOC; 473 474 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 475 clear = 1; 476 again: 477 if (!prealloc && (mask & __GFP_WAIT)) { 478 prealloc = alloc_extent_state(mask); 479 if (!prealloc) 480 return -ENOMEM; 481 } 482 483 spin_lock(&tree->lock); 484 if (cached_state) { 485 cached = *cached_state; 486 487 if (clear) { 488 *cached_state = NULL; 489 cached_state = NULL; 490 } 491 492 if (cached && cached->tree && cached->start == start) { 493 if (clear) 494 atomic_dec(&cached->refs); 495 state = cached; 496 goto hit_next; 497 } 498 if (clear) 499 free_extent_state(cached); 500 } 501 /* 502 * this search will find the extents that end after 503 * our range starts 504 */ 505 node = tree_search(tree, start); 506 if (!node) 507 goto out; 508 state = rb_entry(node, struct extent_state, rb_node); 509 hit_next: 510 if (state->start > end) 511 goto out; 512 WARN_ON(state->end < start); 513 last_end = state->end; 514 515 /* 516 * | ---- desired range ---- | 517 * | state | or 518 * | ------------- state -------------- | 519 * 520 * We need to split the extent we found, and may flip 521 * bits on second half. 522 * 523 * If the extent we found extends past our range, we 524 * just split and search again. It'll get split again 525 * the next time though. 526 * 527 * If the extent we found is inside our range, we clear 528 * the desired bit on it. 
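	 *
	 * For example, clearing [4096, 12287] out of a single state that
	 * covers [0, 16383] first splits off [0, 4095], then splits
	 * [4096, 16383] at 12288, and clears the bits on [4096, 12287]
	 * only.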
529 */ 530 531 if (state->start < start) { 532 if (!prealloc) 533 prealloc = alloc_extent_state(GFP_ATOMIC); 534 err = split_state(tree, state, prealloc, start); 535 BUG_ON(err == -EEXIST); 536 prealloc = NULL; 537 if (err) 538 goto out; 539 if (state->end <= end) { 540 set |= clear_state_bit(tree, state, &bits, wake); 541 if (last_end == (u64)-1) 542 goto out; 543 start = last_end + 1; 544 } 545 goto search_again; 546 } 547 /* 548 * | ---- desired range ---- | 549 * | state | 550 * We need to split the extent, and clear the bit 551 * on the first half 552 */ 553 if (state->start <= end && state->end > end) { 554 if (!prealloc) 555 prealloc = alloc_extent_state(GFP_ATOMIC); 556 err = split_state(tree, state, prealloc, end + 1); 557 BUG_ON(err == -EEXIST); 558 if (wake) 559 wake_up(&state->wq); 560 561 set |= clear_state_bit(tree, prealloc, &bits, wake); 562 563 prealloc = NULL; 564 goto out; 565 } 566 567 if (state->end < end && prealloc && !need_resched()) 568 next_node = rb_next(&state->rb_node); 569 else 570 next_node = NULL; 571 572 set |= clear_state_bit(tree, state, &bits, wake); 573 if (last_end == (u64)-1) 574 goto out; 575 start = last_end + 1; 576 if (start <= end && next_node) { 577 state = rb_entry(next_node, struct extent_state, 578 rb_node); 579 if (state->start == start) 580 goto hit_next; 581 } 582 goto search_again; 583 584 out: 585 spin_unlock(&tree->lock); 586 if (prealloc) 587 free_extent_state(prealloc); 588 589 return set; 590 591 search_again: 592 if (start > end) 593 goto out; 594 spin_unlock(&tree->lock); 595 if (mask & __GFP_WAIT) 596 cond_resched(); 597 goto again; 598 } 599 600 static int wait_on_state(struct extent_io_tree *tree, 601 struct extent_state *state) 602 __releases(tree->lock) 603 __acquires(tree->lock) 604 { 605 DEFINE_WAIT(wait); 606 prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); 607 spin_unlock(&tree->lock); 608 schedule(); 609 spin_lock(&tree->lock); 610 finish_wait(&state->wq, &wait); 611 return 0; 612 } 613 614 /* 615 * waits for one or more bits to clear on a range in the state tree. 616 * The range [start, end] is inclusive. 
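 * wait_on_extent_writeback() below, for instance, uses this to wait
 * for writeback to finish:
 *
 *	wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);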
617 * The tree lock is taken by this function 618 */ 619 int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits) 620 { 621 struct extent_state *state; 622 struct rb_node *node; 623 624 spin_lock(&tree->lock); 625 again: 626 while (1) { 627 /* 628 * this search will find all the extents that end after 629 * our range starts 630 */ 631 node = tree_search(tree, start); 632 if (!node) 633 break; 634 635 state = rb_entry(node, struct extent_state, rb_node); 636 637 if (state->start > end) 638 goto out; 639 640 if (state->state & bits) { 641 start = state->start; 642 atomic_inc(&state->refs); 643 wait_on_state(tree, state); 644 free_extent_state(state); 645 goto again; 646 } 647 start = state->end + 1; 648 649 if (start > end) 650 break; 651 652 if (need_resched()) { 653 spin_unlock(&tree->lock); 654 cond_resched(); 655 spin_lock(&tree->lock); 656 } 657 } 658 out: 659 spin_unlock(&tree->lock); 660 return 0; 661 } 662 663 static int set_state_bits(struct extent_io_tree *tree, 664 struct extent_state *state, 665 int *bits) 666 { 667 int ret; 668 int bits_to_set = *bits & ~EXTENT_CTLBITS; 669 670 ret = set_state_cb(tree, state, bits); 671 if (ret) 672 return ret; 673 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { 674 u64 range = state->end - state->start + 1; 675 tree->dirty_bytes += range; 676 } 677 state->state |= bits_to_set; 678 679 return 0; 680 } 681 682 static void cache_state(struct extent_state *state, 683 struct extent_state **cached_ptr) 684 { 685 if (cached_ptr && !(*cached_ptr)) { 686 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) { 687 *cached_ptr = state; 688 atomic_inc(&state->refs); 689 } 690 } 691 } 692 693 /* 694 * set some bits on a range in the tree. This may require allocations or 695 * sleeping, so the gfp mask is used to indicate what is allowed. 696 * 697 * If any of the exclusive bits are set, this will fail with -EEXIST if some 698 * part of the range already has the desired bits set. The start of the 699 * existing range is returned in failed_start in this case. 700 * 701 * [start, end] is inclusive This takes the tree lock. 702 */ 703 704 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 705 int bits, int exclusive_bits, u64 *failed_start, 706 struct extent_state **cached_state, gfp_t mask) 707 { 708 struct extent_state *state; 709 struct extent_state *prealloc = NULL; 710 struct rb_node *node; 711 int err = 0; 712 u64 last_start; 713 u64 last_end; 714 715 bits |= EXTENT_FIRST_DELALLOC; 716 again: 717 if (!prealloc && (mask & __GFP_WAIT)) { 718 prealloc = alloc_extent_state(mask); 719 if (!prealloc) 720 return -ENOMEM; 721 } 722 723 spin_lock(&tree->lock); 724 if (cached_state && *cached_state) { 725 state = *cached_state; 726 if (state->start == start && state->tree) { 727 node = &state->rb_node; 728 goto hit_next; 729 } 730 } 731 /* 732 * this search will find all the extents that end after 733 * our range starts. 
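	 * If nothing ends after 'start', the whole [start, end] range
	 * is simply inserted below using the preallocated state.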
734 */ 735 node = tree_search(tree, start); 736 if (!node) { 737 err = insert_state(tree, prealloc, start, end, &bits); 738 prealloc = NULL; 739 BUG_ON(err == -EEXIST); 740 goto out; 741 } 742 state = rb_entry(node, struct extent_state, rb_node); 743 hit_next: 744 last_start = state->start; 745 last_end = state->end; 746 747 /* 748 * | ---- desired range ---- | 749 * | state | 750 * 751 * Just lock what we found and keep going 752 */ 753 if (state->start == start && state->end <= end) { 754 struct rb_node *next_node; 755 if (state->state & exclusive_bits) { 756 *failed_start = state->start; 757 err = -EEXIST; 758 goto out; 759 } 760 761 err = set_state_bits(tree, state, &bits); 762 if (err) 763 goto out; 764 765 cache_state(state, cached_state); 766 merge_state(tree, state); 767 if (last_end == (u64)-1) 768 goto out; 769 770 start = last_end + 1; 771 if (start < end && prealloc && !need_resched()) { 772 next_node = rb_next(node); 773 if (next_node) { 774 state = rb_entry(next_node, struct extent_state, 775 rb_node); 776 if (state->start == start) 777 goto hit_next; 778 } 779 } 780 goto search_again; 781 } 782 783 /* 784 * | ---- desired range ---- | 785 * | state | 786 * or 787 * | ------------- state -------------- | 788 * 789 * We need to split the extent we found, and may flip bits on 790 * second half. 791 * 792 * If the extent we found extends past our 793 * range, we just split and search again. It'll get split 794 * again the next time though. 795 * 796 * If the extent we found is inside our range, we set the 797 * desired bit on it. 798 */ 799 if (state->start < start) { 800 if (state->state & exclusive_bits) { 801 *failed_start = start; 802 err = -EEXIST; 803 goto out; 804 } 805 err = split_state(tree, state, prealloc, start); 806 BUG_ON(err == -EEXIST); 807 prealloc = NULL; 808 if (err) 809 goto out; 810 if (state->end <= end) { 811 err = set_state_bits(tree, state, &bits); 812 if (err) 813 goto out; 814 cache_state(state, cached_state); 815 merge_state(tree, state); 816 if (last_end == (u64)-1) 817 goto out; 818 start = last_end + 1; 819 } 820 goto search_again; 821 } 822 /* 823 * | ---- desired range ---- | 824 * | state | or | state | 825 * 826 * There's a hole, we need to insert something in it and 827 * ignore the extent we found. 
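	 *
	 * For example, setting bits on [0, 8191] when the nearest state
	 * starts at 12288 inserts a fresh state for [0, 8191] and leaves
	 * the existing one alone.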
828 */ 829 if (state->start > start) { 830 u64 this_end; 831 if (end < last_start) 832 this_end = end; 833 else 834 this_end = last_start - 1; 835 err = insert_state(tree, prealloc, start, this_end, 836 &bits); 837 BUG_ON(err == -EEXIST); 838 if (err) { 839 prealloc = NULL; 840 goto out; 841 } 842 cache_state(prealloc, cached_state); 843 prealloc = NULL; 844 start = this_end + 1; 845 goto search_again; 846 } 847 /* 848 * | ---- desired range ---- | 849 * | state | 850 * We need to split the extent, and set the bit 851 * on the first half 852 */ 853 if (state->start <= end && state->end > end) { 854 if (state->state & exclusive_bits) { 855 *failed_start = start; 856 err = -EEXIST; 857 goto out; 858 } 859 err = split_state(tree, state, prealloc, end + 1); 860 BUG_ON(err == -EEXIST); 861 862 err = set_state_bits(tree, prealloc, &bits); 863 if (err) { 864 prealloc = NULL; 865 goto out; 866 } 867 cache_state(prealloc, cached_state); 868 merge_state(tree, prealloc); 869 prealloc = NULL; 870 goto out; 871 } 872 873 goto search_again; 874 875 out: 876 spin_unlock(&tree->lock); 877 if (prealloc) 878 free_extent_state(prealloc); 879 880 return err; 881 882 search_again: 883 if (start > end) 884 goto out; 885 spin_unlock(&tree->lock); 886 if (mask & __GFP_WAIT) 887 cond_resched(); 888 goto again; 889 } 890 891 /* wrappers around set/clear extent bit */ 892 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 893 gfp_t mask) 894 { 895 return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, 896 NULL, mask); 897 } 898 899 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 900 int bits, gfp_t mask) 901 { 902 return set_extent_bit(tree, start, end, bits, 0, NULL, 903 NULL, mask); 904 } 905 906 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 907 int bits, gfp_t mask) 908 { 909 return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); 910 } 911 912 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 913 struct extent_state **cached_state, gfp_t mask) 914 { 915 return set_extent_bit(tree, start, end, 916 EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, 917 0, NULL, cached_state, mask); 918 } 919 920 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 921 gfp_t mask) 922 { 923 return clear_extent_bit(tree, start, end, 924 EXTENT_DIRTY | EXTENT_DELALLOC | 925 EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); 926 } 927 928 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 929 gfp_t mask) 930 { 931 return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, 932 NULL, mask); 933 } 934 935 static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 936 gfp_t mask) 937 { 938 return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, 939 NULL, mask); 940 } 941 942 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 943 gfp_t mask) 944 { 945 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, 946 NULL, mask); 947 } 948 949 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, 950 u64 end, struct extent_state **cached_state, 951 gfp_t mask) 952 { 953 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, 954 cached_state, mask); 955 } 956 957 int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) 958 { 959 return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK); 960 } 961 962 /* 963 * either insert or lock state struct between start and end use mask to tell 964 * us if waiting is desired. 
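 *
 * The usual pattern, as in find_lock_delalloc_range() below, is:
 *
 *	lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS);
 *	... work on the locked range ...
 *	unlock_extent_cached(tree, start, end, &cached_state, GFP_NOFS);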
965 */ 966 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 967 int bits, struct extent_state **cached_state, gfp_t mask) 968 { 969 int err; 970 u64 failed_start; 971 while (1) { 972 err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, 973 EXTENT_LOCKED, &failed_start, 974 cached_state, mask); 975 if (err == -EEXIST && (mask & __GFP_WAIT)) { 976 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); 977 start = failed_start; 978 } else { 979 break; 980 } 981 WARN_ON(start > end); 982 } 983 return err; 984 } 985 986 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) 987 { 988 return lock_extent_bits(tree, start, end, 0, NULL, mask); 989 } 990 991 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, 992 gfp_t mask) 993 { 994 int err; 995 u64 failed_start; 996 997 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, 998 &failed_start, NULL, mask); 999 if (err == -EEXIST) { 1000 if (failed_start > start) 1001 clear_extent_bit(tree, start, failed_start - 1, 1002 EXTENT_LOCKED, 1, 0, NULL, mask); 1003 return 0; 1004 } 1005 return 1; 1006 } 1007 1008 int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, 1009 struct extent_state **cached, gfp_t mask) 1010 { 1011 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, 1012 mask); 1013 } 1014 1015 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, 1016 gfp_t mask) 1017 { 1018 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, 1019 mask); 1020 } 1021 1022 /* 1023 * helper function to set pages and extents in the tree dirty 1024 */ 1025 int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end) 1026 { 1027 unsigned long index = start >> PAGE_CACHE_SHIFT; 1028 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1029 struct page *page; 1030 1031 while (index <= end_index) { 1032 page = find_get_page(tree->mapping, index); 1033 BUG_ON(!page); 1034 __set_page_dirty_nobuffers(page); 1035 page_cache_release(page); 1036 index++; 1037 } 1038 return 0; 1039 } 1040 1041 /* 1042 * helper function to set both pages and extents in the tree writeback 1043 */ 1044 static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) 1045 { 1046 unsigned long index = start >> PAGE_CACHE_SHIFT; 1047 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1048 struct page *page; 1049 1050 while (index <= end_index) { 1051 page = find_get_page(tree->mapping, index); 1052 BUG_ON(!page); 1053 set_page_writeback(page); 1054 page_cache_release(page); 1055 index++; 1056 } 1057 return 0; 1058 } 1059 1060 /* 1061 * find the first offset in the io tree with 'bits' set. zero is 1062 * returned if we find something, and *start_ret and *end_ret are 1063 * set to reflect the state struct that was found. 1064 * 1065 * If nothing was found, 1 is returned, < 0 on error 1066 */ 1067 int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 1068 u64 *start_ret, u64 *end_ret, int bits) 1069 { 1070 struct rb_node *node; 1071 struct extent_state *state; 1072 int ret = 1; 1073 1074 spin_lock(&tree->lock); 1075 /* 1076 * this search will find all the extents that end after 1077 * our range starts. 
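	 * Note that the state found may begin before 'start';
	 * *start_ret reports the state's own start, not the offset that
	 * was passed in.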
1078 */ 1079 node = tree_search(tree, start); 1080 if (!node) 1081 goto out; 1082 1083 while (1) { 1084 state = rb_entry(node, struct extent_state, rb_node); 1085 if (state->end >= start && (state->state & bits)) { 1086 *start_ret = state->start; 1087 *end_ret = state->end; 1088 ret = 0; 1089 break; 1090 } 1091 node = rb_next(node); 1092 if (!node) 1093 break; 1094 } 1095 out: 1096 spin_unlock(&tree->lock); 1097 return ret; 1098 } 1099 1100 /* find the first state struct with 'bits' set after 'start', and 1101 * return it. tree->lock must be held. NULL will returned if 1102 * nothing was found after 'start' 1103 */ 1104 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, 1105 u64 start, int bits) 1106 { 1107 struct rb_node *node; 1108 struct extent_state *state; 1109 1110 /* 1111 * this search will find all the extents that end after 1112 * our range starts. 1113 */ 1114 node = tree_search(tree, start); 1115 if (!node) 1116 goto out; 1117 1118 while (1) { 1119 state = rb_entry(node, struct extent_state, rb_node); 1120 if (state->end >= start && (state->state & bits)) 1121 return state; 1122 1123 node = rb_next(node); 1124 if (!node) 1125 break; 1126 } 1127 out: 1128 return NULL; 1129 } 1130 1131 /* 1132 * find a contiguous range of bytes in the file marked as delalloc, not 1133 * more than 'max_bytes'. start and end are used to return the range, 1134 * 1135 * 1 is returned if we find something, 0 if nothing was in the tree 1136 */ 1137 static noinline u64 find_delalloc_range(struct extent_io_tree *tree, 1138 u64 *start, u64 *end, u64 max_bytes, 1139 struct extent_state **cached_state) 1140 { 1141 struct rb_node *node; 1142 struct extent_state *state; 1143 u64 cur_start = *start; 1144 u64 found = 0; 1145 u64 total_bytes = 0; 1146 1147 spin_lock(&tree->lock); 1148 1149 /* 1150 * this search will find all the extents that end after 1151 * our range starts. 
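	 * The walk below keeps extending the range until it hits a gap
	 * or an EXTENT_BOUNDARY state, finds a state without
	 * EXTENT_DELALLOC, or has accumulated max_bytes.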
1152 */ 1153 node = tree_search(tree, cur_start); 1154 if (!node) { 1155 if (!found) 1156 *end = (u64)-1; 1157 goto out; 1158 } 1159 1160 while (1) { 1161 state = rb_entry(node, struct extent_state, rb_node); 1162 if (found && (state->start != cur_start || 1163 (state->state & EXTENT_BOUNDARY))) { 1164 goto out; 1165 } 1166 if (!(state->state & EXTENT_DELALLOC)) { 1167 if (!found) 1168 *end = state->end; 1169 goto out; 1170 } 1171 if (!found) { 1172 *start = state->start; 1173 *cached_state = state; 1174 atomic_inc(&state->refs); 1175 } 1176 found++; 1177 *end = state->end; 1178 cur_start = state->end + 1; 1179 node = rb_next(node); 1180 if (!node) 1181 break; 1182 total_bytes += state->end - state->start + 1; 1183 if (total_bytes >= max_bytes) 1184 break; 1185 } 1186 out: 1187 spin_unlock(&tree->lock); 1188 return found; 1189 } 1190 1191 static noinline int __unlock_for_delalloc(struct inode *inode, 1192 struct page *locked_page, 1193 u64 start, u64 end) 1194 { 1195 int ret; 1196 struct page *pages[16]; 1197 unsigned long index = start >> PAGE_CACHE_SHIFT; 1198 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1199 unsigned long nr_pages = end_index - index + 1; 1200 int i; 1201 1202 if (index == locked_page->index && end_index == index) 1203 return 0; 1204 1205 while (nr_pages > 0) { 1206 ret = find_get_pages_contig(inode->i_mapping, index, 1207 min_t(unsigned long, nr_pages, 1208 ARRAY_SIZE(pages)), pages); 1209 for (i = 0; i < ret; i++) { 1210 if (pages[i] != locked_page) 1211 unlock_page(pages[i]); 1212 page_cache_release(pages[i]); 1213 } 1214 nr_pages -= ret; 1215 index += ret; 1216 cond_resched(); 1217 } 1218 return 0; 1219 } 1220 1221 static noinline int lock_delalloc_pages(struct inode *inode, 1222 struct page *locked_page, 1223 u64 delalloc_start, 1224 u64 delalloc_end) 1225 { 1226 unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT; 1227 unsigned long start_index = index; 1228 unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT; 1229 unsigned long pages_locked = 0; 1230 struct page *pages[16]; 1231 unsigned long nrpages; 1232 int ret; 1233 int i; 1234 1235 /* the caller is responsible for locking the start index */ 1236 if (index == locked_page->index && index == end_index) 1237 return 0; 1238 1239 /* skip the page at the start index */ 1240 nrpages = end_index - index + 1; 1241 while (nrpages > 0) { 1242 ret = find_get_pages_contig(inode->i_mapping, index, 1243 min_t(unsigned long, 1244 nrpages, ARRAY_SIZE(pages)), pages); 1245 if (ret == 0) { 1246 ret = -EAGAIN; 1247 goto done; 1248 } 1249 /* now we have an array of pages, lock them all */ 1250 for (i = 0; i < ret; i++) { 1251 /* 1252 * the caller is taking responsibility for 1253 * locked_page 1254 */ 1255 if (pages[i] != locked_page) { 1256 lock_page(pages[i]); 1257 if (!PageDirty(pages[i]) || 1258 pages[i]->mapping != inode->i_mapping) { 1259 ret = -EAGAIN; 1260 unlock_page(pages[i]); 1261 page_cache_release(pages[i]); 1262 goto done; 1263 } 1264 } 1265 page_cache_release(pages[i]); 1266 pages_locked++; 1267 } 1268 nrpages -= ret; 1269 index += ret; 1270 cond_resched(); 1271 } 1272 ret = 0; 1273 done: 1274 if (ret && pages_locked) { 1275 __unlock_for_delalloc(inode, locked_page, 1276 delalloc_start, 1277 ((u64)(start_index + pages_locked - 1)) << 1278 PAGE_CACHE_SHIFT); 1279 } 1280 return ret; 1281 } 1282 1283 /* 1284 * find a contiguous range of bytes in the file marked as delalloc, not 1285 * more than 'max_bytes'. 
start and end are used to return the range, 1286 * 1287 * 1 is returned if we find something, 0 if nothing was in the tree 1288 */ 1289 static noinline u64 find_lock_delalloc_range(struct inode *inode, 1290 struct extent_io_tree *tree, 1291 struct page *locked_page, 1292 u64 *start, u64 *end, 1293 u64 max_bytes) 1294 { 1295 u64 delalloc_start; 1296 u64 delalloc_end; 1297 u64 found; 1298 struct extent_state *cached_state = NULL; 1299 int ret; 1300 int loops = 0; 1301 1302 again: 1303 /* step one, find a bunch of delalloc bytes starting at start */ 1304 delalloc_start = *start; 1305 delalloc_end = 0; 1306 found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, 1307 max_bytes, &cached_state); 1308 if (!found || delalloc_end <= *start) { 1309 *start = delalloc_start; 1310 *end = delalloc_end; 1311 free_extent_state(cached_state); 1312 return found; 1313 } 1314 1315 /* 1316 * start comes from the offset of locked_page. We have to lock 1317 * pages in order, so we can't process delalloc bytes before 1318 * locked_page 1319 */ 1320 if (delalloc_start < *start) 1321 delalloc_start = *start; 1322 1323 /* 1324 * make sure to limit the number of pages we try to lock down 1325 * if we're looping. 1326 */ 1327 if (delalloc_end + 1 - delalloc_start > max_bytes && loops) 1328 delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1; 1329 1330 /* step two, lock all the pages after the page that has start */ 1331 ret = lock_delalloc_pages(inode, locked_page, 1332 delalloc_start, delalloc_end); 1333 if (ret == -EAGAIN) { 1334 /* some of the pages are gone, lets avoid looping by 1335 * shortening the size of the delalloc range we're searching 1336 */ 1337 free_extent_state(cached_state); 1338 if (!loops) { 1339 unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); 1340 max_bytes = PAGE_CACHE_SIZE - offset; 1341 loops = 1; 1342 goto again; 1343 } else { 1344 found = 0; 1345 goto out_failed; 1346 } 1347 } 1348 BUG_ON(ret); 1349 1350 /* step three, lock the state bits for the whole range */ 1351 lock_extent_bits(tree, delalloc_start, delalloc_end, 1352 0, &cached_state, GFP_NOFS); 1353 1354 /* then test to make sure it is all still delalloc */ 1355 ret = test_range_bit(tree, delalloc_start, delalloc_end, 1356 EXTENT_DELALLOC, 1, cached_state); 1357 if (!ret) { 1358 unlock_extent_cached(tree, delalloc_start, delalloc_end, 1359 &cached_state, GFP_NOFS); 1360 __unlock_for_delalloc(inode, locked_page, 1361 delalloc_start, delalloc_end); 1362 cond_resched(); 1363 goto again; 1364 } 1365 free_extent_state(cached_state); 1366 *start = delalloc_start; 1367 *end = delalloc_end; 1368 out_failed: 1369 return found; 1370 } 1371 1372 int extent_clear_unlock_delalloc(struct inode *inode, 1373 struct extent_io_tree *tree, 1374 u64 start, u64 end, struct page *locked_page, 1375 unsigned long op) 1376 { 1377 int ret; 1378 struct page *pages[16]; 1379 unsigned long index = start >> PAGE_CACHE_SHIFT; 1380 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1381 unsigned long nr_pages = end_index - index + 1; 1382 int i; 1383 int clear_bits = 0; 1384 1385 if (op & EXTENT_CLEAR_UNLOCK) 1386 clear_bits |= EXTENT_LOCKED; 1387 if (op & EXTENT_CLEAR_DIRTY) 1388 clear_bits |= EXTENT_DIRTY; 1389 1390 if (op & EXTENT_CLEAR_DELALLOC) 1391 clear_bits |= EXTENT_DELALLOC; 1392 1393 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); 1394 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | 1395 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | 1396 EXTENT_SET_PRIVATE2))) 1397 return 0; 1398 1399 while (nr_pages > 
0) { 1400 ret = find_get_pages_contig(inode->i_mapping, index, 1401 min_t(unsigned long, 1402 nr_pages, ARRAY_SIZE(pages)), pages); 1403 for (i = 0; i < ret; i++) { 1404 1405 if (op & EXTENT_SET_PRIVATE2) 1406 SetPagePrivate2(pages[i]); 1407 1408 if (pages[i] == locked_page) { 1409 page_cache_release(pages[i]); 1410 continue; 1411 } 1412 if (op & EXTENT_CLEAR_DIRTY) 1413 clear_page_dirty_for_io(pages[i]); 1414 if (op & EXTENT_SET_WRITEBACK) 1415 set_page_writeback(pages[i]); 1416 if (op & EXTENT_END_WRITEBACK) 1417 end_page_writeback(pages[i]); 1418 if (op & EXTENT_CLEAR_UNLOCK_PAGE) 1419 unlock_page(pages[i]); 1420 page_cache_release(pages[i]); 1421 } 1422 nr_pages -= ret; 1423 index += ret; 1424 cond_resched(); 1425 } 1426 return 0; 1427 } 1428 1429 /* 1430 * count the number of bytes in the tree that have a given bit(s) 1431 * set. This can be fairly slow, except for EXTENT_DIRTY which is 1432 * cached. The total number found is returned. 1433 */ 1434 u64 count_range_bits(struct extent_io_tree *tree, 1435 u64 *start, u64 search_end, u64 max_bytes, 1436 unsigned long bits, int contig) 1437 { 1438 struct rb_node *node; 1439 struct extent_state *state; 1440 u64 cur_start = *start; 1441 u64 total_bytes = 0; 1442 u64 last = 0; 1443 int found = 0; 1444 1445 if (search_end <= cur_start) { 1446 WARN_ON(1); 1447 return 0; 1448 } 1449 1450 spin_lock(&tree->lock); 1451 if (cur_start == 0 && bits == EXTENT_DIRTY) { 1452 total_bytes = tree->dirty_bytes; 1453 goto out; 1454 } 1455 /* 1456 * this search will find all the extents that end after 1457 * our range starts. 1458 */ 1459 node = tree_search(tree, cur_start); 1460 if (!node) 1461 goto out; 1462 1463 while (1) { 1464 state = rb_entry(node, struct extent_state, rb_node); 1465 if (state->start > search_end) 1466 break; 1467 if (contig && found && state->start > last + 1) 1468 break; 1469 if (state->end >= cur_start && (state->state & bits) == bits) { 1470 total_bytes += min(search_end, state->end) + 1 - 1471 max(cur_start, state->start); 1472 if (total_bytes >= max_bytes) 1473 break; 1474 if (!found) { 1475 *start = state->start; 1476 found = 1; 1477 } 1478 last = state->end; 1479 } else if (contig && found) { 1480 break; 1481 } 1482 node = rb_next(node); 1483 if (!node) 1484 break; 1485 } 1486 out: 1487 spin_unlock(&tree->lock); 1488 return total_bytes; 1489 } 1490 1491 /* 1492 * set the private field for a given byte offset in the tree. If there isn't 1493 * an extent_state there already, this does nothing. 1494 */ 1495 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) 1496 { 1497 struct rb_node *node; 1498 struct extent_state *state; 1499 int ret = 0; 1500 1501 spin_lock(&tree->lock); 1502 /* 1503 * this search will find all the extents that end after 1504 * our range starts. 1505 */ 1506 node = tree_search(tree, start); 1507 if (!node) { 1508 ret = -ENOENT; 1509 goto out; 1510 } 1511 state = rb_entry(node, struct extent_state, rb_node); 1512 if (state->start != start) { 1513 ret = -ENOENT; 1514 goto out; 1515 } 1516 state->private = private; 1517 out: 1518 spin_unlock(&tree->lock); 1519 return ret; 1520 } 1521 1522 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) 1523 { 1524 struct rb_node *node; 1525 struct extent_state *state; 1526 int ret = 0; 1527 1528 spin_lock(&tree->lock); 1529 /* 1530 * this search will find all the extents that end after 1531 * our range starts. 
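	 * The private value is only returned when a state starts
	 * exactly at 'start'; anything else yields -ENOENT.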
1532 */ 1533 node = tree_search(tree, start); 1534 if (!node) { 1535 ret = -ENOENT; 1536 goto out; 1537 } 1538 state = rb_entry(node, struct extent_state, rb_node); 1539 if (state->start != start) { 1540 ret = -ENOENT; 1541 goto out; 1542 } 1543 *private = state->private; 1544 out: 1545 spin_unlock(&tree->lock); 1546 return ret; 1547 } 1548 1549 /* 1550 * searches a range in the state tree for a given mask. 1551 * If 'filled' == 1, this returns 1 only if every extent in the tree 1552 * has the bits set. Otherwise, 1 is returned if any bit in the 1553 * range is found set. 1554 */ 1555 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 1556 int bits, int filled, struct extent_state *cached) 1557 { 1558 struct extent_state *state = NULL; 1559 struct rb_node *node; 1560 int bitset = 0; 1561 1562 spin_lock(&tree->lock); 1563 if (cached && cached->tree && cached->start == start) 1564 node = &cached->rb_node; 1565 else 1566 node = tree_search(tree, start); 1567 while (node && start <= end) { 1568 state = rb_entry(node, struct extent_state, rb_node); 1569 1570 if (filled && state->start > start) { 1571 bitset = 0; 1572 break; 1573 } 1574 1575 if (state->start > end) 1576 break; 1577 1578 if (state->state & bits) { 1579 bitset = 1; 1580 if (!filled) 1581 break; 1582 } else if (filled) { 1583 bitset = 0; 1584 break; 1585 } 1586 1587 if (state->end == (u64)-1) 1588 break; 1589 1590 start = state->end + 1; 1591 if (start > end) 1592 break; 1593 node = rb_next(node); 1594 if (!node) { 1595 if (filled) 1596 bitset = 0; 1597 break; 1598 } 1599 } 1600 spin_unlock(&tree->lock); 1601 return bitset; 1602 } 1603 1604 /* 1605 * helper function to set a given page up to date if all the 1606 * extents in the tree for that page are up to date 1607 */ 1608 static int check_page_uptodate(struct extent_io_tree *tree, 1609 struct page *page) 1610 { 1611 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1612 u64 end = start + PAGE_CACHE_SIZE - 1; 1613 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) 1614 SetPageUptodate(page); 1615 return 0; 1616 } 1617 1618 /* 1619 * helper function to unlock a page if all the extents in the tree 1620 * for that page are unlocked 1621 */ 1622 static int check_page_locked(struct extent_io_tree *tree, 1623 struct page *page) 1624 { 1625 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1626 u64 end = start + PAGE_CACHE_SIZE - 1; 1627 if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) 1628 unlock_page(page); 1629 return 0; 1630 } 1631 1632 /* 1633 * helper function to end page writeback if all the extents 1634 * in the tree for that page are done with writeback 1635 */ 1636 static int check_page_writeback(struct extent_io_tree *tree, 1637 struct page *page) 1638 { 1639 end_page_writeback(page); 1640 return 0; 1641 } 1642 1643 /* lots and lots of room for performance fixes in the end_bio funcs */ 1644 1645 /* 1646 * after a writepage IO is done, we need to: 1647 * clear the uptodate bits on error 1648 * clear the writeback bits in the extent tree for this IO 1649 * end_page_writeback if the page has no more pending IO 1650 * 1651 * Scheduling is not allowed, so the extent state tree is expected 1652 * to have one and only one object corresponding to this IO. 
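 *
 * The bio_vec array is walked from the last entry backwards; each
 * page either gets end_page_writeback() directly (whole-page IO) or
 * goes through check_page_writeback() for partial pages.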
1653 */ 1654 static void end_bio_extent_writepage(struct bio *bio, int err) 1655 { 1656 int uptodate = err == 0; 1657 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1658 struct extent_io_tree *tree; 1659 u64 start; 1660 u64 end; 1661 int whole_page; 1662 int ret; 1663 1664 do { 1665 struct page *page = bvec->bv_page; 1666 tree = &BTRFS_I(page->mapping->host)->io_tree; 1667 1668 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 1669 bvec->bv_offset; 1670 end = start + bvec->bv_len - 1; 1671 1672 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) 1673 whole_page = 1; 1674 else 1675 whole_page = 0; 1676 1677 if (--bvec >= bio->bi_io_vec) 1678 prefetchw(&bvec->bv_page->flags); 1679 if (tree->ops && tree->ops->writepage_end_io_hook) { 1680 ret = tree->ops->writepage_end_io_hook(page, start, 1681 end, NULL, uptodate); 1682 if (ret) 1683 uptodate = 0; 1684 } 1685 1686 if (!uptodate && tree->ops && 1687 tree->ops->writepage_io_failed_hook) { 1688 ret = tree->ops->writepage_io_failed_hook(bio, page, 1689 start, end, NULL); 1690 if (ret == 0) { 1691 uptodate = (err == 0); 1692 continue; 1693 } 1694 } 1695 1696 if (!uptodate) { 1697 clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS); 1698 ClearPageUptodate(page); 1699 SetPageError(page); 1700 } 1701 1702 if (whole_page) 1703 end_page_writeback(page); 1704 else 1705 check_page_writeback(tree, page); 1706 } while (bvec >= bio->bi_io_vec); 1707 1708 bio_put(bio); 1709 } 1710 1711 /* 1712 * after a readpage IO is done, we need to: 1713 * clear the uptodate bits on error 1714 * set the uptodate bits if things worked 1715 * set the page up to date if all extents in the tree are uptodate 1716 * clear the lock bit in the extent tree 1717 * unlock the page if there are no other extents locked for it 1718 * 1719 * Scheduling is not allowed, so the extent state tree is expected 1720 * to have one and only one object corresponding to this IO. 
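 *
 * Unlike the write completion above, this walks the bio_vec array
 * forwards; on success each range is marked EXTENT_UPTODATE and then
 * unlocked in the extent tree.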
1721 */ 1722 static void end_bio_extent_readpage(struct bio *bio, int err) 1723 { 1724 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1725 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; 1726 struct bio_vec *bvec = bio->bi_io_vec; 1727 struct extent_io_tree *tree; 1728 u64 start; 1729 u64 end; 1730 int whole_page; 1731 int ret; 1732 1733 if (err) 1734 uptodate = 0; 1735 1736 do { 1737 struct page *page = bvec->bv_page; 1738 tree = &BTRFS_I(page->mapping->host)->io_tree; 1739 1740 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 1741 bvec->bv_offset; 1742 end = start + bvec->bv_len - 1; 1743 1744 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) 1745 whole_page = 1; 1746 else 1747 whole_page = 0; 1748 1749 if (++bvec <= bvec_end) 1750 prefetchw(&bvec->bv_page->flags); 1751 1752 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { 1753 ret = tree->ops->readpage_end_io_hook(page, start, end, 1754 NULL); 1755 if (ret) 1756 uptodate = 0; 1757 } 1758 if (!uptodate && tree->ops && 1759 tree->ops->readpage_io_failed_hook) { 1760 ret = tree->ops->readpage_io_failed_hook(bio, page, 1761 start, end, NULL); 1762 if (ret == 0) { 1763 uptodate = 1764 test_bit(BIO_UPTODATE, &bio->bi_flags); 1765 if (err) 1766 uptodate = 0; 1767 continue; 1768 } 1769 } 1770 1771 if (uptodate) { 1772 set_extent_uptodate(tree, start, end, 1773 GFP_ATOMIC); 1774 } 1775 unlock_extent(tree, start, end, GFP_ATOMIC); 1776 1777 if (whole_page) { 1778 if (uptodate) { 1779 SetPageUptodate(page); 1780 } else { 1781 ClearPageUptodate(page); 1782 SetPageError(page); 1783 } 1784 unlock_page(page); 1785 } else { 1786 if (uptodate) { 1787 check_page_uptodate(tree, page); 1788 } else { 1789 ClearPageUptodate(page); 1790 SetPageError(page); 1791 } 1792 check_page_locked(tree, page); 1793 } 1794 } while (bvec <= bvec_end); 1795 1796 bio_put(bio); 1797 } 1798 1799 /* 1800 * IO done from prepare_write is pretty simple, we just unlock 1801 * the structs in the extent tree when done, and set the uptodate bits 1802 * as appropriate. 
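 *
 * No writeback state is involved here; on error the page is flagged
 * with PG_error, and the locked range is released either way.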
1803 */ 1804 static void end_bio_extent_preparewrite(struct bio *bio, int err) 1805 { 1806 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1807 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1808 struct extent_io_tree *tree; 1809 u64 start; 1810 u64 end; 1811 1812 do { 1813 struct page *page = bvec->bv_page; 1814 tree = &BTRFS_I(page->mapping->host)->io_tree; 1815 1816 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 1817 bvec->bv_offset; 1818 end = start + bvec->bv_len - 1; 1819 1820 if (--bvec >= bio->bi_io_vec) 1821 prefetchw(&bvec->bv_page->flags); 1822 1823 if (uptodate) { 1824 set_extent_uptodate(tree, start, end, GFP_ATOMIC); 1825 } else { 1826 ClearPageUptodate(page); 1827 SetPageError(page); 1828 } 1829 1830 unlock_extent(tree, start, end, GFP_ATOMIC); 1831 1832 } while (bvec >= bio->bi_io_vec); 1833 1834 bio_put(bio); 1835 } 1836 1837 struct bio * 1838 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 1839 gfp_t gfp_flags) 1840 { 1841 struct bio *bio; 1842 1843 bio = bio_alloc(gfp_flags, nr_vecs); 1844 1845 if (bio == NULL && (current->flags & PF_MEMALLOC)) { 1846 while (!bio && (nr_vecs /= 2)) 1847 bio = bio_alloc(gfp_flags, nr_vecs); 1848 } 1849 1850 if (bio) { 1851 bio->bi_size = 0; 1852 bio->bi_bdev = bdev; 1853 bio->bi_sector = first_sector; 1854 } 1855 return bio; 1856 } 1857 1858 static int submit_one_bio(int rw, struct bio *bio, int mirror_num, 1859 unsigned long bio_flags) 1860 { 1861 int ret = 0; 1862 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1863 struct page *page = bvec->bv_page; 1864 struct extent_io_tree *tree = bio->bi_private; 1865 u64 start; 1866 1867 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; 1868 1869 bio->bi_private = NULL; 1870 1871 bio_get(bio); 1872 1873 if (tree->ops && tree->ops->submit_bio_hook) 1874 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 1875 mirror_num, bio_flags, start); 1876 else 1877 submit_bio(rw, bio); 1878 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 1879 ret = -EOPNOTSUPP; 1880 bio_put(bio); 1881 return ret; 1882 } 1883 1884 static int submit_extent_page(int rw, struct extent_io_tree *tree, 1885 struct page *page, sector_t sector, 1886 size_t size, unsigned long offset, 1887 struct block_device *bdev, 1888 struct bio **bio_ret, 1889 unsigned long max_pages, 1890 bio_end_io_t end_io_func, 1891 int mirror_num, 1892 unsigned long prev_bio_flags, 1893 unsigned long bio_flags) 1894 { 1895 int ret = 0; 1896 struct bio *bio; 1897 int nr; 1898 int contig = 0; 1899 int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED; 1900 int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; 1901 size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE); 1902 1903 if (bio_ret && *bio_ret) { 1904 bio = *bio_ret; 1905 if (old_compressed) 1906 contig = bio->bi_sector == sector; 1907 else 1908 contig = bio->bi_sector + (bio->bi_size >> 9) == 1909 sector; 1910 1911 if (prev_bio_flags != bio_flags || !contig || 1912 (tree->ops && tree->ops->merge_bio_hook && 1913 tree->ops->merge_bio_hook(page, offset, page_size, bio, 1914 bio_flags)) || 1915 bio_add_page(bio, page, page_size, offset) < page_size) { 1916 ret = submit_one_bio(rw, bio, mirror_num, 1917 prev_bio_flags); 1918 bio = NULL; 1919 } else { 1920 return 0; 1921 } 1922 } 1923 if (this_compressed) 1924 nr = BIO_MAX_PAGES; 1925 else 1926 nr = bio_get_nr_vecs(bdev); 1927 1928 bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); 1929 if (!bio) 1930 return -ENOMEM; 1931 1932 bio_add_page(bio, page, 
page_size, offset); 1933 bio->bi_end_io = end_io_func; 1934 bio->bi_private = tree; 1935 1936 if (bio_ret) 1937 *bio_ret = bio; 1938 else 1939 ret = submit_one_bio(rw, bio, mirror_num, bio_flags); 1940 1941 return ret; 1942 } 1943 1944 void set_page_extent_mapped(struct page *page) 1945 { 1946 if (!PagePrivate(page)) { 1947 SetPagePrivate(page); 1948 page_cache_get(page); 1949 set_page_private(page, EXTENT_PAGE_PRIVATE); 1950 } 1951 } 1952 1953 static void set_page_extent_head(struct page *page, unsigned long len) 1954 { 1955 WARN_ON(!PagePrivate(page)); 1956 set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); 1957 } 1958 1959 /* 1960 * basic readpage implementation. Locked extent state structs are inserted 1961 * into the tree that are removed when the IO is done (by the end_io 1962 * handlers) 1963 */ 1964 static int __extent_read_full_page(struct extent_io_tree *tree, 1965 struct page *page, 1966 get_extent_t *get_extent, 1967 struct bio **bio, int mirror_num, 1968 unsigned long *bio_flags) 1969 { 1970 struct inode *inode = page->mapping->host; 1971 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1972 u64 page_end = start + PAGE_CACHE_SIZE - 1; 1973 u64 end; 1974 u64 cur = start; 1975 u64 extent_offset; 1976 u64 last_byte = i_size_read(inode); 1977 u64 block_start; 1978 u64 cur_end; 1979 sector_t sector; 1980 struct extent_map *em; 1981 struct block_device *bdev; 1982 struct btrfs_ordered_extent *ordered; 1983 int ret; 1984 int nr = 0; 1985 size_t page_offset = 0; 1986 size_t iosize; 1987 size_t disk_io_size; 1988 size_t blocksize = inode->i_sb->s_blocksize; 1989 unsigned long this_bio_flag = 0; 1990 1991 set_page_extent_mapped(page); 1992 1993 end = page_end; 1994 while (1) { 1995 lock_extent(tree, start, end, GFP_NOFS); 1996 ordered = btrfs_lookup_ordered_extent(inode, start); 1997 if (!ordered) 1998 break; 1999 unlock_extent(tree, start, end, GFP_NOFS); 2000 btrfs_start_ordered_extent(inode, ordered, 1); 2001 btrfs_put_ordered_extent(ordered); 2002 } 2003 2004 if (page->index == last_byte >> PAGE_CACHE_SHIFT) { 2005 char *userpage; 2006 size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1); 2007 2008 if (zero_offset) { 2009 iosize = PAGE_CACHE_SIZE - zero_offset; 2010 userpage = kmap_atomic(page, KM_USER0); 2011 memset(userpage + zero_offset, 0, iosize); 2012 flush_dcache_page(page); 2013 kunmap_atomic(userpage, KM_USER0); 2014 } 2015 } 2016 while (cur <= end) { 2017 if (cur >= last_byte) { 2018 char *userpage; 2019 iosize = PAGE_CACHE_SIZE - page_offset; 2020 userpage = kmap_atomic(page, KM_USER0); 2021 memset(userpage + page_offset, 0, iosize); 2022 flush_dcache_page(page); 2023 kunmap_atomic(userpage, KM_USER0); 2024 set_extent_uptodate(tree, cur, cur + iosize - 1, 2025 GFP_NOFS); 2026 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2027 break; 2028 } 2029 em = get_extent(inode, page, page_offset, cur, 2030 end - cur + 1, 0); 2031 if (IS_ERR(em) || !em) { 2032 SetPageError(page); 2033 unlock_extent(tree, cur, end, GFP_NOFS); 2034 break; 2035 } 2036 extent_offset = cur - em->start; 2037 BUG_ON(extent_map_end(em) <= cur); 2038 BUG_ON(end < cur); 2039 2040 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 2041 this_bio_flag = EXTENT_BIO_COMPRESSED; 2042 extent_set_compress_type(&this_bio_flag, 2043 em->compress_type); 2044 } 2045 2046 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2047 cur_end = min(extent_map_end(em) - 1, end); 2048 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); 2049 if (this_bio_flag & EXTENT_BIO_COMPRESSED) { 2050 
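			/*
			 * compressed extents are read as a unit: the IO
			 * covers the whole on-disk compressed chunk and
			 * starts at the chunk's first sector, not at
			 * block_start + extent_offset as in the plain
			 * case below
			 */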
disk_io_size = em->block_len; 2051 sector = em->block_start >> 9; 2052 } else { 2053 sector = (em->block_start + extent_offset) >> 9; 2054 disk_io_size = iosize; 2055 } 2056 bdev = em->bdev; 2057 block_start = em->block_start; 2058 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 2059 block_start = EXTENT_MAP_HOLE; 2060 free_extent_map(em); 2061 em = NULL; 2062 2063 /* we've found a hole, just zero and go on */ 2064 if (block_start == EXTENT_MAP_HOLE) { 2065 char *userpage; 2066 userpage = kmap_atomic(page, KM_USER0); 2067 memset(userpage + page_offset, 0, iosize); 2068 flush_dcache_page(page); 2069 kunmap_atomic(userpage, KM_USER0); 2070 2071 set_extent_uptodate(tree, cur, cur + iosize - 1, 2072 GFP_NOFS); 2073 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2074 cur = cur + iosize; 2075 page_offset += iosize; 2076 continue; 2077 } 2078 /* the get_extent function already copied into the page */ 2079 if (test_range_bit(tree, cur, cur_end, 2080 EXTENT_UPTODATE, 1, NULL)) { 2081 check_page_uptodate(tree, page); 2082 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2083 cur = cur + iosize; 2084 page_offset += iosize; 2085 continue; 2086 } 2087 /* we have an inline extent but it didn't get marked up 2088 * to date. Error out 2089 */ 2090 if (block_start == EXTENT_MAP_INLINE) { 2091 SetPageError(page); 2092 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2093 cur = cur + iosize; 2094 page_offset += iosize; 2095 continue; 2096 } 2097 2098 ret = 0; 2099 if (tree->ops && tree->ops->readpage_io_hook) { 2100 ret = tree->ops->readpage_io_hook(page, cur, 2101 cur + iosize - 1); 2102 } 2103 if (!ret) { 2104 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; 2105 pnr -= page->index; 2106 ret = submit_extent_page(READ, tree, page, 2107 sector, disk_io_size, page_offset, 2108 bdev, bio, pnr, 2109 end_bio_extent_readpage, mirror_num, 2110 *bio_flags, 2111 this_bio_flag); 2112 nr++; 2113 *bio_flags = this_bio_flag; 2114 } 2115 if (ret) 2116 SetPageError(page); 2117 cur = cur + iosize; 2118 page_offset += iosize; 2119 } 2120 if (!nr) { 2121 if (!PageError(page)) 2122 SetPageUptodate(page); 2123 unlock_page(page); 2124 } 2125 return 0; 2126 } 2127 2128 int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 2129 get_extent_t *get_extent) 2130 { 2131 struct bio *bio = NULL; 2132 unsigned long bio_flags = 0; 2133 int ret; 2134 2135 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, 2136 &bio_flags); 2137 if (bio) 2138 ret = submit_one_bio(READ, bio, 0, bio_flags); 2139 return ret; 2140 } 2141 2142 static noinline void update_nr_written(struct page *page, 2143 struct writeback_control *wbc, 2144 unsigned long nr_written) 2145 { 2146 wbc->nr_to_write -= nr_written; 2147 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && 2148 wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) 2149 page->mapping->writeback_index = page->index + nr_written; 2150 } 2151 2152 /* 2153 * the writepage semantics are similar to regular writepage. extent 2154 * records are inserted to lock ranges in the tree, and as dirty areas 2155 * are found, they are marked writeback. 
Then the lock bits are removed 2156 * and the end_io handler clears the writeback ranges 2157 */ 2158 static int __extent_writepage(struct page *page, struct writeback_control *wbc, 2159 void *data) 2160 { 2161 struct inode *inode = page->mapping->host; 2162 struct extent_page_data *epd = data; 2163 struct extent_io_tree *tree = epd->tree; 2164 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 2165 u64 delalloc_start; 2166 u64 page_end = start + PAGE_CACHE_SIZE - 1; 2167 u64 end; 2168 u64 cur = start; 2169 u64 extent_offset; 2170 u64 last_byte = i_size_read(inode); 2171 u64 block_start; 2172 u64 iosize; 2173 sector_t sector; 2174 struct extent_state *cached_state = NULL; 2175 struct extent_map *em; 2176 struct block_device *bdev; 2177 int ret; 2178 int nr = 0; 2179 size_t pg_offset = 0; 2180 size_t blocksize; 2181 loff_t i_size = i_size_read(inode); 2182 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; 2183 u64 nr_delalloc; 2184 u64 delalloc_end; 2185 int page_started; 2186 int compressed; 2187 int write_flags; 2188 unsigned long nr_written = 0; 2189 2190 if (wbc->sync_mode == WB_SYNC_ALL) 2191 write_flags = WRITE_SYNC_PLUG; 2192 else 2193 write_flags = WRITE; 2194 2195 WARN_ON(!PageLocked(page)); 2196 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2197 if (page->index > end_index || 2198 (page->index == end_index && !pg_offset)) { 2199 page->mapping->a_ops->invalidatepage(page, 0); 2200 unlock_page(page); 2201 return 0; 2202 } 2203 2204 if (page->index == end_index) { 2205 char *userpage; 2206 2207 userpage = kmap_atomic(page, KM_USER0); 2208 memset(userpage + pg_offset, 0, 2209 PAGE_CACHE_SIZE - pg_offset); 2210 kunmap_atomic(userpage, KM_USER0); 2211 flush_dcache_page(page); 2212 } 2213 pg_offset = 0; 2214 2215 set_page_extent_mapped(page); 2216 2217 delalloc_start = start; 2218 delalloc_end = 0; 2219 page_started = 0; 2220 if (!epd->extent_locked) { 2221 u64 delalloc_to_write = 0; 2222 /* 2223 * make sure the wbc mapping index is at least updated 2224 * to this page. 2225 */ 2226 update_nr_written(page, wbc, 0); 2227 2228 while (delalloc_end < page_end) { 2229 nr_delalloc = find_lock_delalloc_range(inode, tree, 2230 page, 2231 &delalloc_start, 2232 &delalloc_end, 2233 128 * 1024 * 1024); 2234 if (nr_delalloc == 0) { 2235 delalloc_start = delalloc_end + 1; 2236 continue; 2237 } 2238 tree->ops->fill_delalloc(inode, page, delalloc_start, 2239 delalloc_end, &page_started, 2240 &nr_written); 2241 /* 2242 * delalloc_end is already one less than the total 2243 * length, so we don't subtract one from 2244 * PAGE_CACHE_SIZE 2245 */ 2246 delalloc_to_write += (delalloc_end - delalloc_start + 2247 PAGE_CACHE_SIZE) >> 2248 PAGE_CACHE_SHIFT; 2249 delalloc_start = delalloc_end + 1; 2250 } 2251 if (wbc->nr_to_write < delalloc_to_write) { 2252 int thresh = 8192; 2253 2254 if (delalloc_to_write < thresh * 2) 2255 thresh = delalloc_to_write; 2256 wbc->nr_to_write = min_t(u64, delalloc_to_write, 2257 thresh); 2258 } 2259 2260 /* did the fill delalloc function already unlock and start 2261 * the IO? 2262 */ 2263 if (page_started) { 2264 ret = 0; 2265 /* 2266 * we've unlocked the page, so we can't update 2267 * the mapping's writeback index, just update 2268 * nr_to_write. 
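		 * (this is why update_nr_written() is not used here: it
		 * would also touch page->mapping->writeback_index)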
2269 */ 2270 wbc->nr_to_write -= nr_written; 2271 goto done_unlocked; 2272 } 2273 } 2274 if (tree->ops && tree->ops->writepage_start_hook) { 2275 ret = tree->ops->writepage_start_hook(page, start, 2276 page_end); 2277 if (ret == -EAGAIN) { 2278 redirty_page_for_writepage(wbc, page); 2279 update_nr_written(page, wbc, nr_written); 2280 unlock_page(page); 2281 ret = 0; 2282 goto done_unlocked; 2283 } 2284 } 2285 2286 /* 2287 * we don't want to touch the inode after unlocking the page, 2288 * so we update the mapping writeback index now 2289 */ 2290 update_nr_written(page, wbc, nr_written + 1); 2291 2292 end = page_end; 2293 if (last_byte <= start) { 2294 if (tree->ops && tree->ops->writepage_end_io_hook) 2295 tree->ops->writepage_end_io_hook(page, start, 2296 page_end, NULL, 1); 2297 goto done; 2298 } 2299 2300 blocksize = inode->i_sb->s_blocksize; 2301 2302 while (cur <= end) { 2303 if (cur >= last_byte) { 2304 if (tree->ops && tree->ops->writepage_end_io_hook) 2305 tree->ops->writepage_end_io_hook(page, cur, 2306 page_end, NULL, 1); 2307 break; 2308 } 2309 em = epd->get_extent(inode, page, pg_offset, cur, 2310 end - cur + 1, 1); 2311 if (IS_ERR(em) || !em) { 2312 SetPageError(page); 2313 break; 2314 } 2315 2316 extent_offset = cur - em->start; 2317 BUG_ON(extent_map_end(em) <= cur); 2318 BUG_ON(end < cur); 2319 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2320 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); 2321 sector = (em->block_start + extent_offset) >> 9; 2322 bdev = em->bdev; 2323 block_start = em->block_start; 2324 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 2325 free_extent_map(em); 2326 em = NULL; 2327 2328 /* 2329 * compressed and inline extents are written through other 2330 * paths in the FS 2331 */ 2332 if (compressed || block_start == EXTENT_MAP_HOLE || 2333 block_start == EXTENT_MAP_INLINE) { 2334 /* 2335 * end_io notification does not happen here for 2336 * compressed extents 2337 */ 2338 if (!compressed && tree->ops && 2339 tree->ops->writepage_end_io_hook) 2340 tree->ops->writepage_end_io_hook(page, cur, 2341 cur + iosize - 1, 2342 NULL, 1); 2343 else if (compressed) { 2344 /* we don't want to end_page_writeback on 2345 * a compressed extent. 
this happens 2346 * elsewhere 2347 */ 2348 nr++; 2349 } 2350 2351 cur += iosize; 2352 pg_offset += iosize; 2353 continue; 2354 } 2355 /* leave this out until we have a page_mkwrite call */ 2356 if (0 && !test_range_bit(tree, cur, cur + iosize - 1, 2357 EXTENT_DIRTY, 0, NULL)) { 2358 cur = cur + iosize; 2359 pg_offset += iosize; 2360 continue; 2361 } 2362 2363 if (tree->ops && tree->ops->writepage_io_hook) { 2364 ret = tree->ops->writepage_io_hook(page, cur, 2365 cur + iosize - 1); 2366 } else { 2367 ret = 0; 2368 } 2369 if (ret) { 2370 SetPageError(page); 2371 } else { 2372 unsigned long max_nr = end_index + 1; 2373 2374 set_range_writeback(tree, cur, cur + iosize - 1); 2375 if (!PageWriteback(page)) { 2376 printk(KERN_ERR "btrfs warning page %lu not " 2377 "writeback, cur %llu end %llu\n", 2378 page->index, (unsigned long long)cur, 2379 (unsigned long long)end); 2380 } 2381 2382 ret = submit_extent_page(write_flags, tree, page, 2383 sector, iosize, pg_offset, 2384 bdev, &epd->bio, max_nr, 2385 end_bio_extent_writepage, 2386 0, 0, 0); 2387 if (ret) 2388 SetPageError(page); 2389 } 2390 cur = cur + iosize; 2391 pg_offset += iosize; 2392 nr++; 2393 } 2394 done: 2395 if (nr == 0) { 2396 /* make sure the mapping tag for page dirty gets cleared */ 2397 set_page_writeback(page); 2398 end_page_writeback(page); 2399 } 2400 unlock_page(page); 2401 2402 done_unlocked: 2403 2404 /* drop our reference on any cached states */ 2405 free_extent_state(cached_state); 2406 return 0; 2407 } 2408 2409 /** 2410 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. 2411 * @mapping: address space structure to write 2412 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 2413 * @writepage: function called for each page 2414 * @data: data passed to writepage function 2415 * 2416 * If a page is already under I/O, write_cache_pages() skips it, even 2417 * if it's dirty. This is desirable behaviour for memory-cleaning writeback, 2418 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() 2419 * and msync() need to guarantee that all the data which was dirty at the time 2420 * the call was made get new I/O started against them. If wbc->sync_mode is 2421 * WB_SYNC_ALL then we were called for data integrity and we must wait for 2422 * existing IO to complete. 
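 * The extent variant below also takes a flush_fn: before blocking on a
 * page that is already under writeback it calls flush_fn so that any bio
 * still being built up in the extent_page_data is submitted first.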
2423 */ 2424 static int extent_write_cache_pages(struct extent_io_tree *tree, 2425 struct address_space *mapping, 2426 struct writeback_control *wbc, 2427 writepage_t writepage, void *data, 2428 void (*flush_fn)(void *)) 2429 { 2430 int ret = 0; 2431 int done = 0; 2432 int nr_to_write_done = 0; 2433 struct pagevec pvec; 2434 int nr_pages; 2435 pgoff_t index; 2436 pgoff_t end; /* Inclusive */ 2437 int scanned = 0; 2438 2439 pagevec_init(&pvec, 0); 2440 if (wbc->range_cyclic) { 2441 index = mapping->writeback_index; /* Start from prev offset */ 2442 end = -1; 2443 } else { 2444 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2445 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2446 scanned = 1; 2447 } 2448 retry: 2449 while (!done && !nr_to_write_done && (index <= end) && 2450 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 2451 PAGECACHE_TAG_DIRTY, min(end - index, 2452 (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 2453 unsigned i; 2454 2455 scanned = 1; 2456 for (i = 0; i < nr_pages; i++) { 2457 struct page *page = pvec.pages[i]; 2458 2459 /* 2460 * At this point we hold neither mapping->tree_lock nor 2461 * lock on the page itself: the page may be truncated or 2462 * invalidated (changing page->mapping to NULL), or even 2463 * swizzled back from swapper_space to tmpfs file 2464 * mapping 2465 */ 2466 if (tree->ops && tree->ops->write_cache_pages_lock_hook) 2467 tree->ops->write_cache_pages_lock_hook(page); 2468 else 2469 lock_page(page); 2470 2471 if (unlikely(page->mapping != mapping)) { 2472 unlock_page(page); 2473 continue; 2474 } 2475 2476 if (!wbc->range_cyclic && page->index > end) { 2477 done = 1; 2478 unlock_page(page); 2479 continue; 2480 } 2481 2482 if (wbc->sync_mode != WB_SYNC_NONE) { 2483 if (PageWriteback(page)) 2484 flush_fn(data); 2485 wait_on_page_writeback(page); 2486 } 2487 2488 if (PageWriteback(page) || 2489 !clear_page_dirty_for_io(page)) { 2490 unlock_page(page); 2491 continue; 2492 } 2493 2494 ret = (*writepage)(page, wbc, data); 2495 2496 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { 2497 unlock_page(page); 2498 ret = 0; 2499 } 2500 if (ret) 2501 done = 1; 2502 2503 /* 2504 * the filesystem may choose to bump up nr_to_write. 
2505 * We have to make sure to honor the new nr_to_write 2506 * at any time 2507 */ 2508 nr_to_write_done = wbc->nr_to_write <= 0; 2509 } 2510 pagevec_release(&pvec); 2511 cond_resched(); 2512 } 2513 if (!scanned && !done) { 2514 /* 2515 * We hit the last page and there is more work to be done: wrap 2516 * back to the start of the file 2517 */ 2518 scanned = 1; 2519 index = 0; 2520 goto retry; 2521 } 2522 return ret; 2523 } 2524 2525 static void flush_epd_write_bio(struct extent_page_data *epd) 2526 { 2527 if (epd->bio) { 2528 if (epd->sync_io) 2529 submit_one_bio(WRITE_SYNC, epd->bio, 0, 0); 2530 else 2531 submit_one_bio(WRITE, epd->bio, 0, 0); 2532 epd->bio = NULL; 2533 } 2534 } 2535 2536 static noinline void flush_write_bio(void *data) 2537 { 2538 struct extent_page_data *epd = data; 2539 flush_epd_write_bio(epd); 2540 } 2541 2542 int extent_write_full_page(struct extent_io_tree *tree, struct page *page, 2543 get_extent_t *get_extent, 2544 struct writeback_control *wbc) 2545 { 2546 int ret; 2547 struct address_space *mapping = page->mapping; 2548 struct extent_page_data epd = { 2549 .bio = NULL, 2550 .tree = tree, 2551 .get_extent = get_extent, 2552 .extent_locked = 0, 2553 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 2554 }; 2555 struct writeback_control wbc_writepages = { 2556 .sync_mode = wbc->sync_mode, 2557 .older_than_this = NULL, 2558 .nr_to_write = 64, 2559 .range_start = page_offset(page) + PAGE_CACHE_SIZE, 2560 .range_end = (loff_t)-1, 2561 }; 2562 2563 ret = __extent_writepage(page, wbc, &epd); 2564 2565 extent_write_cache_pages(tree, mapping, &wbc_writepages, 2566 __extent_writepage, &epd, flush_write_bio); 2567 flush_epd_write_bio(&epd); 2568 return ret; 2569 } 2570 2571 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, 2572 u64 start, u64 end, get_extent_t *get_extent, 2573 int mode) 2574 { 2575 int ret = 0; 2576 struct address_space *mapping = inode->i_mapping; 2577 struct page *page; 2578 unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >> 2579 PAGE_CACHE_SHIFT; 2580 2581 struct extent_page_data epd = { 2582 .bio = NULL, 2583 .tree = tree, 2584 .get_extent = get_extent, 2585 .extent_locked = 1, 2586 .sync_io = mode == WB_SYNC_ALL, 2587 }; 2588 struct writeback_control wbc_writepages = { 2589 .sync_mode = mode, 2590 .older_than_this = NULL, 2591 .nr_to_write = nr_pages * 2, 2592 .range_start = start, 2593 .range_end = end + 1, 2594 }; 2595 2596 while (start <= end) { 2597 page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); 2598 if (clear_page_dirty_for_io(page)) 2599 ret = __extent_writepage(page, &wbc_writepages, &epd); 2600 else { 2601 if (tree->ops && tree->ops->writepage_end_io_hook) 2602 tree->ops->writepage_end_io_hook(page, start, 2603 start + PAGE_CACHE_SIZE - 1, 2604 NULL, 1); 2605 unlock_page(page); 2606 } 2607 page_cache_release(page); 2608 start += PAGE_CACHE_SIZE; 2609 } 2610 2611 flush_epd_write_bio(&epd); 2612 return ret; 2613 } 2614 2615 int extent_writepages(struct extent_io_tree *tree, 2616 struct address_space *mapping, 2617 get_extent_t *get_extent, 2618 struct writeback_control *wbc) 2619 { 2620 int ret = 0; 2621 struct extent_page_data epd = { 2622 .bio = NULL, 2623 .tree = tree, 2624 .get_extent = get_extent, 2625 .extent_locked = 0, 2626 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 2627 }; 2628 2629 ret = extent_write_cache_pages(tree, mapping, wbc, 2630 __extent_writepage, &epd, 2631 flush_write_bio); 2632 flush_epd_write_bio(&epd); 2633 return ret; 2634 } 2635 2636 int extent_readpages(struct extent_io_tree 
*tree, 2637 struct address_space *mapping, 2638 struct list_head *pages, unsigned nr_pages, 2639 get_extent_t get_extent) 2640 { 2641 struct bio *bio = NULL; 2642 unsigned page_idx; 2643 unsigned long bio_flags = 0; 2644 2645 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 2646 struct page *page = list_entry(pages->prev, struct page, lru); 2647 2648 prefetchw(&page->flags); 2649 list_del(&page->lru); 2650 if (!add_to_page_cache_lru(page, mapping, 2651 page->index, GFP_KERNEL)) { 2652 __extent_read_full_page(tree, page, get_extent, 2653 &bio, 0, &bio_flags); 2654 } 2655 page_cache_release(page); 2656 } 2657 BUG_ON(!list_empty(pages)); 2658 if (bio) 2659 submit_one_bio(READ, bio, 0, bio_flags); 2660 return 0; 2661 } 2662 2663 /* 2664 * basic invalidatepage code, this waits on any locked or writeback 2665 * ranges corresponding to the page, and then deletes any extent state 2666 * records from the tree 2667 */ 2668 int extent_invalidatepage(struct extent_io_tree *tree, 2669 struct page *page, unsigned long offset) 2670 { 2671 struct extent_state *cached_state = NULL; 2672 u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); 2673 u64 end = start + PAGE_CACHE_SIZE - 1; 2674 size_t blocksize = page->mapping->host->i_sb->s_blocksize; 2675 2676 start += (offset + blocksize - 1) & ~(blocksize - 1); 2677 if (start > end) 2678 return 0; 2679 2680 lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS); 2681 wait_on_page_writeback(page); 2682 clear_extent_bit(tree, start, end, 2683 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 2684 EXTENT_DO_ACCOUNTING, 2685 1, 1, &cached_state, GFP_NOFS); 2686 return 0; 2687 } 2688 2689 /* 2690 * simple commit_write call, set_range_dirty is used to mark both 2691 * the pages and the extent records as dirty 2692 */ 2693 int extent_commit_write(struct extent_io_tree *tree, 2694 struct inode *inode, struct page *page, 2695 unsigned from, unsigned to) 2696 { 2697 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; 2698 2699 set_page_extent_mapped(page); 2700 set_page_dirty(page); 2701 2702 if (pos > inode->i_size) { 2703 i_size_write(inode, pos); 2704 mark_inode_dirty(inode); 2705 } 2706 return 0; 2707 } 2708 2709 int extent_prepare_write(struct extent_io_tree *tree, 2710 struct inode *inode, struct page *page, 2711 unsigned from, unsigned to, get_extent_t *get_extent) 2712 { 2713 u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT; 2714 u64 page_end = page_start + PAGE_CACHE_SIZE - 1; 2715 u64 block_start; 2716 u64 orig_block_start; 2717 u64 block_end; 2718 u64 cur_end; 2719 struct extent_map *em; 2720 unsigned blocksize = 1 << inode->i_blkbits; 2721 size_t page_offset = 0; 2722 size_t block_off_start; 2723 size_t block_off_end; 2724 int err = 0; 2725 int iocount = 0; 2726 int ret = 0; 2727 int isnew; 2728 2729 set_page_extent_mapped(page); 2730 2731 block_start = (page_start + from) & ~((u64)blocksize - 1); 2732 block_end = (page_start + to - 1) | (blocksize - 1); 2733 orig_block_start = block_start; 2734 2735 lock_extent(tree, page_start, page_end, GFP_NOFS); 2736 while (block_start <= block_end) { 2737 em = get_extent(inode, page, page_offset, block_start, 2738 block_end - block_start + 1, 1); 2739 if (IS_ERR(em) || !em) 2740 goto err; 2741 2742 cur_end = min(block_end, extent_map_end(em) - 1); 2743 block_off_start = block_start & (PAGE_CACHE_SIZE - 1); 2744 block_off_end = block_off_start + blocksize; 2745 isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS); 2746 2747 if (!PageUptodate(page) && isnew && 2748 (block_off_end > to 
|| block_off_start < from)) { 2749 void *kaddr; 2750 2751 kaddr = kmap_atomic(page, KM_USER0); 2752 if (block_off_end > to) 2753 memset(kaddr + to, 0, block_off_end - to); 2754 if (block_off_start < from) 2755 memset(kaddr + block_off_start, 0, 2756 from - block_off_start); 2757 flush_dcache_page(page); 2758 kunmap_atomic(kaddr, KM_USER0); 2759 } 2760 if ((em->block_start != EXTENT_MAP_HOLE && 2761 em->block_start != EXTENT_MAP_INLINE) && 2762 !isnew && !PageUptodate(page) && 2763 (block_off_end > to || block_off_start < from) && 2764 !test_range_bit(tree, block_start, cur_end, 2765 EXTENT_UPTODATE, 1, NULL)) { 2766 u64 sector; 2767 u64 extent_offset = block_start - em->start; 2768 size_t iosize; 2769 sector = (em->block_start + extent_offset) >> 9; 2770 iosize = (cur_end - block_start + blocksize) & 2771 ~((u64)blocksize - 1); 2772 /* 2773 * we've already got the extent locked, but we 2774 * need to split the state such that our end_bio 2775 * handler can clear the lock. 2776 */ 2777 set_extent_bit(tree, block_start, 2778 block_start + iosize - 1, 2779 EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS); 2780 ret = submit_extent_page(READ, tree, page, 2781 sector, iosize, page_offset, em->bdev, 2782 NULL, 1, 2783 end_bio_extent_preparewrite, 0, 2784 0, 0); 2785 if (ret && !err) 2786 err = ret; 2787 iocount++; 2788 block_start = block_start + iosize; 2789 } else { 2790 set_extent_uptodate(tree, block_start, cur_end, 2791 GFP_NOFS); 2792 unlock_extent(tree, block_start, cur_end, GFP_NOFS); 2793 block_start = cur_end + 1; 2794 } 2795 page_offset = block_start & (PAGE_CACHE_SIZE - 1); 2796 free_extent_map(em); 2797 } 2798 if (iocount) { 2799 wait_extent_bit(tree, orig_block_start, 2800 block_end, EXTENT_LOCKED); 2801 } 2802 check_page_uptodate(tree, page); 2803 err: 2804 /* FIXME, zero out newly allocated blocks on error */ 2805 return err; 2806 } 2807 2808 /* 2809 * a helper for releasepage, this tests for areas of the page that 2810 * are locked or under IO and drops the related state bits if it is safe 2811 * to drop the page. 2812 */ 2813 int try_release_extent_state(struct extent_map_tree *map, 2814 struct extent_io_tree *tree, struct page *page, 2815 gfp_t mask) 2816 { 2817 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 2818 u64 end = start + PAGE_CACHE_SIZE - 1; 2819 int ret = 1; 2820 2821 if (test_range_bit(tree, start, end, 2822 EXTENT_IOBITS, 0, NULL)) 2823 ret = 0; 2824 else { 2825 if ((mask & GFP_NOFS) == GFP_NOFS) 2826 mask = GFP_NOFS; 2827 /* 2828 * at this point we can safely clear everything except the 2829 * locked bit and the nodatasum bit 2830 */ 2831 ret = clear_extent_bit(tree, start, end, 2832 ~(EXTENT_LOCKED | EXTENT_NODATASUM), 2833 0, 0, NULL, mask); 2834 2835 /* if clear_extent_bit failed for enomem reasons, 2836 * we can't allow the release to continue. 2837 */ 2838 if (ret < 0) 2839 ret = 0; 2840 else 2841 ret = 1; 2842 } 2843 return ret; 2844 } 2845 2846 /* 2847 * a helper for releasepage. 
As long as there are no locked extents 2848 * in the range corresponding to the page, both state records and extent 2849 * map records are removed 2850 */ 2851 int try_release_extent_mapping(struct extent_map_tree *map, 2852 struct extent_io_tree *tree, struct page *page, 2853 gfp_t mask) 2854 { 2855 struct extent_map *em; 2856 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 2857 u64 end = start + PAGE_CACHE_SIZE - 1; 2858 2859 if ((mask & __GFP_WAIT) && 2860 page->mapping->host->i_size > 16 * 1024 * 1024) { 2861 u64 len; 2862 while (start <= end) { 2863 len = end - start + 1; 2864 write_lock(&map->lock); 2865 em = lookup_extent_mapping(map, start, len); 2866 if (!em || IS_ERR(em)) { 2867 write_unlock(&map->lock); 2868 break; 2869 } 2870 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || 2871 em->start != start) { 2872 write_unlock(&map->lock); 2873 free_extent_map(em); 2874 break; 2875 } 2876 if (!test_range_bit(tree, em->start, 2877 extent_map_end(em) - 1, 2878 EXTENT_LOCKED | EXTENT_WRITEBACK, 2879 0, NULL)) { 2880 remove_extent_mapping(map, em); 2881 /* once for the rb tree */ 2882 free_extent_map(em); 2883 } 2884 start = extent_map_end(em); 2885 write_unlock(&map->lock); 2886 2887 /* once for us */ 2888 free_extent_map(em); 2889 } 2890 } 2891 return try_release_extent_state(map, tree, page, mask); 2892 } 2893 2894 sector_t extent_bmap(struct address_space *mapping, sector_t iblock, 2895 get_extent_t *get_extent) 2896 { 2897 struct inode *inode = mapping->host; 2898 struct extent_state *cached_state = NULL; 2899 u64 start = iblock << inode->i_blkbits; 2900 sector_t sector = 0; 2901 size_t blksize = (1 << inode->i_blkbits); 2902 struct extent_map *em; 2903 2904 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + blksize - 1, 2905 0, &cached_state, GFP_NOFS); 2906 em = get_extent(inode, NULL, 0, start, blksize, 0); 2907 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, 2908 start + blksize - 1, &cached_state, GFP_NOFS); 2909 if (!em || IS_ERR(em)) 2910 return 0; 2911 2912 if (em->block_start > EXTENT_MAP_LAST_BYTE) 2913 goto out; 2914 2915 sector = (em->block_start + start - em->start) >> inode->i_blkbits; 2916 out: 2917 free_extent_map(em); 2918 return sector; 2919 } 2920 2921 /* 2922 * helper function for fiemap, which doesn't want to see any holes. 
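 * (a hole here means either an EXTENT_MAP_HOLE mapping or an extent
 * with EXTENT_FLAG_VACANCY set)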
2923 * This maps until we find something past 'last' 2924 */ 2925 static struct extent_map *get_extent_skip_holes(struct inode *inode, 2926 u64 offset, 2927 u64 last, 2928 get_extent_t *get_extent) 2929 { 2930 u64 sectorsize = BTRFS_I(inode)->root->sectorsize; 2931 struct extent_map *em; 2932 u64 len; 2933 2934 if (offset >= last) 2935 return NULL; 2936 2937 while(1) { 2938 len = last - offset; 2939 if (len == 0) 2940 break; 2941 len = (len + sectorsize - 1) & ~(sectorsize - 1); 2942 em = get_extent(inode, NULL, 0, offset, len, 0); 2943 if (!em || IS_ERR(em)) 2944 return em; 2945 2946 /* if this isn't a hole return it */ 2947 if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) && 2948 em->block_start != EXTENT_MAP_HOLE) { 2949 return em; 2950 } 2951 2952 /* this is a hole, advance to the next extent */ 2953 offset = extent_map_end(em); 2954 free_extent_map(em); 2955 if (offset >= last) 2956 break; 2957 } 2958 return NULL; 2959 } 2960 2961 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 2962 __u64 start, __u64 len, get_extent_t *get_extent) 2963 { 2964 int ret = 0; 2965 u64 off = start; 2966 u64 max = start + len; 2967 u32 flags = 0; 2968 u32 found_type; 2969 u64 last; 2970 u64 last_for_get_extent = 0; 2971 u64 disko = 0; 2972 u64 isize = i_size_read(inode); 2973 struct btrfs_key found_key; 2974 struct extent_map *em = NULL; 2975 struct extent_state *cached_state = NULL; 2976 struct btrfs_path *path; 2977 struct btrfs_file_extent_item *item; 2978 int end = 0; 2979 u64 em_start = 0; 2980 u64 em_len = 0; 2981 u64 em_end = 0; 2982 unsigned long emflags; 2983 2984 if (len == 0) 2985 return -EINVAL; 2986 2987 path = btrfs_alloc_path(); 2988 if (!path) 2989 return -ENOMEM; 2990 path->leave_spinning = 1; 2991 2992 /* 2993 * lookup the last file extent. We're not using i_size here 2994 * because there might be preallocation past i_size 2995 */ 2996 ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, 2997 path, inode->i_ino, -1, 0); 2998 if (ret < 0) { 2999 btrfs_free_path(path); 3000 return ret; 3001 } 3002 WARN_ON(!ret); 3003 path->slots[0]--; 3004 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3005 struct btrfs_file_extent_item); 3006 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 3007 found_type = btrfs_key_type(&found_key); 3008 3009 /* No extents, but there might be delalloc bits */ 3010 if (found_key.objectid != inode->i_ino || 3011 found_type != BTRFS_EXTENT_DATA_KEY) { 3012 /* have to trust i_size as the end */ 3013 last = (u64)-1; 3014 last_for_get_extent = isize; 3015 } else { 3016 /* 3017 * remember the start of the last extent. There are a 3018 * bunch of different factors that go into the length of the 3019 * extent, so its much less complex to remember where it started 3020 */ 3021 last = found_key.offset; 3022 last_for_get_extent = last + 1; 3023 } 3024 btrfs_free_path(path); 3025 3026 /* 3027 * we might have some extents allocated but more delalloc past those 3028 * extents. 
so, we trust isize unless the start of the last extent is 3029 * beyond isize 3030 */ 3031 if (last < isize) { 3032 last = (u64)-1; 3033 last_for_get_extent = isize; 3034 } 3035 3036 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, 3037 &cached_state, GFP_NOFS); 3038 3039 em = get_extent_skip_holes(inode, off, last_for_get_extent, 3040 get_extent); 3041 if (!em) 3042 goto out; 3043 if (IS_ERR(em)) { 3044 ret = PTR_ERR(em); 3045 goto out; 3046 } 3047 3048 while (!end) { 3049 u64 offset_in_extent; 3050 3051 /* break if the extent we found is outside the range */ 3052 if (em->start >= max || extent_map_end(em) < off) 3053 break; 3054 3055 /* 3056 * get_extent may return an extent that starts before our 3057 * requested range. We have to make sure the ranges 3058 * we return to fiemap always move forward and don't 3059 * overlap, so adjust the offsets here 3060 */ 3061 em_start = max(em->start, off); 3062 3063 /* 3064 * record the offset from the start of the extent 3065 * for adjusting the disk offset below 3066 */ 3067 offset_in_extent = em_start - em->start; 3068 em_end = extent_map_end(em); 3069 em_len = em_end - em_start; 3070 emflags = em->flags; 3071 disko = 0; 3072 flags = 0; 3073 3074 /* 3075 * bump off for our next call to get_extent 3076 */ 3077 off = extent_map_end(em); 3078 if (off >= max) 3079 end = 1; 3080 3081 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 3082 end = 1; 3083 flags |= FIEMAP_EXTENT_LAST; 3084 } else if (em->block_start == EXTENT_MAP_INLINE) { 3085 flags |= (FIEMAP_EXTENT_DATA_INLINE | 3086 FIEMAP_EXTENT_NOT_ALIGNED); 3087 } else if (em->block_start == EXTENT_MAP_DELALLOC) { 3088 flags |= (FIEMAP_EXTENT_DELALLOC | 3089 FIEMAP_EXTENT_UNKNOWN); 3090 } else { 3091 disko = em->block_start + offset_in_extent; 3092 } 3093 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 3094 flags |= FIEMAP_EXTENT_ENCODED; 3095 3096 free_extent_map(em); 3097 em = NULL; 3098 if ((em_start >= last) || em_len == (u64)-1 || 3099 (last == (u64)-1 && isize <= em_end)) { 3100 flags |= FIEMAP_EXTENT_LAST; 3101 end = 1; 3102 } 3103 3104 /* now scan forward to see if this is really the last extent. */ 3105 em = get_extent_skip_holes(inode, off, last_for_get_extent, 3106 get_extent); 3107 if (IS_ERR(em)) { 3108 ret = PTR_ERR(em); 3109 goto out; 3110 } 3111 if (!em) { 3112 flags |= FIEMAP_EXTENT_LAST; 3113 end = 1; 3114 } 3115 ret = fiemap_fill_next_extent(fieinfo, em_start, disko, 3116 em_len, flags); 3117 if (ret) 3118 goto out_free; 3119 } 3120 out_free: 3121 free_extent_map(em); 3122 out: 3123 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len, 3124 &cached_state, GFP_NOFS); 3125 return ret; 3126 } 3127 3128 static inline struct page *extent_buffer_page(struct extent_buffer *eb, 3129 unsigned long i) 3130 { 3131 struct page *p; 3132 struct address_space *mapping; 3133 3134 if (i == 0) 3135 return eb->first_page; 3136 i += eb->start >> PAGE_CACHE_SHIFT; 3137 mapping = eb->first_page->mapping; 3138 if (!mapping) 3139 return NULL; 3140 3141 /* 3142 * extent_buffer_page is only called after pinning the page 3143 * by increasing the reference count. So we know the page must 3144 * be in the radix tree. 
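 * An rcu_read_lock() around the radix_tree_lookup() is therefore
 * enough; no additional page reference is taken here.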
3145 */ 3146 rcu_read_lock(); 3147 p = radix_tree_lookup(&mapping->page_tree, i); 3148 rcu_read_unlock(); 3149 3150 return p; 3151 } 3152 3153 static inline unsigned long num_extent_pages(u64 start, u64 len) 3154 { 3155 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - 3156 (start >> PAGE_CACHE_SHIFT); 3157 } 3158 3159 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, 3160 u64 start, 3161 unsigned long len, 3162 gfp_t mask) 3163 { 3164 struct extent_buffer *eb = NULL; 3165 #if LEAK_DEBUG 3166 unsigned long flags; 3167 #endif 3168 3169 eb = kmem_cache_zalloc(extent_buffer_cache, mask); 3170 if (eb == NULL) 3171 return NULL; 3172 eb->start = start; 3173 eb->len = len; 3174 spin_lock_init(&eb->lock); 3175 init_waitqueue_head(&eb->lock_wq); 3176 3177 #if LEAK_DEBUG 3178 spin_lock_irqsave(&leak_lock, flags); 3179 list_add(&eb->leak_list, &buffers); 3180 spin_unlock_irqrestore(&leak_lock, flags); 3181 #endif 3182 atomic_set(&eb->refs, 1); 3183 3184 return eb; 3185 } 3186 3187 static void __free_extent_buffer(struct extent_buffer *eb) 3188 { 3189 #if LEAK_DEBUG 3190 unsigned long flags; 3191 spin_lock_irqsave(&leak_lock, flags); 3192 list_del(&eb->leak_list); 3193 spin_unlock_irqrestore(&leak_lock, flags); 3194 #endif 3195 kmem_cache_free(extent_buffer_cache, eb); 3196 } 3197 3198 /* 3199 * Helper for releasing extent buffer page. 3200 */ 3201 static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, 3202 unsigned long start_idx) 3203 { 3204 unsigned long index; 3205 struct page *page; 3206 3207 if (!eb->first_page) 3208 return; 3209 3210 index = num_extent_pages(eb->start, eb->len); 3211 if (start_idx >= index) 3212 return; 3213 3214 do { 3215 index--; 3216 page = extent_buffer_page(eb, index); 3217 if (page) 3218 page_cache_release(page); 3219 } while (index != start_idx); 3220 } 3221 3222 /* 3223 * Helper for releasing the extent buffer. 
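 * This drops the page references taken when the pages were attached
 * to the buffer and then frees the extent_buffer itself.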
3224 */ 3225 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) 3226 { 3227 btrfs_release_extent_buffer_page(eb, 0); 3228 __free_extent_buffer(eb); 3229 } 3230 3231 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, 3232 u64 start, unsigned long len, 3233 struct page *page0, 3234 gfp_t mask) 3235 { 3236 unsigned long num_pages = num_extent_pages(start, len); 3237 unsigned long i; 3238 unsigned long index = start >> PAGE_CACHE_SHIFT; 3239 struct extent_buffer *eb; 3240 struct extent_buffer *exists = NULL; 3241 struct page *p; 3242 struct address_space *mapping = tree->mapping; 3243 int uptodate = 1; 3244 int ret; 3245 3246 rcu_read_lock(); 3247 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); 3248 if (eb && atomic_inc_not_zero(&eb->refs)) { 3249 rcu_read_unlock(); 3250 mark_page_accessed(eb->first_page); 3251 return eb; 3252 } 3253 rcu_read_unlock(); 3254 3255 eb = __alloc_extent_buffer(tree, start, len, mask); 3256 if (!eb) 3257 return NULL; 3258 3259 if (page0) { 3260 eb->first_page = page0; 3261 i = 1; 3262 index++; 3263 page_cache_get(page0); 3264 mark_page_accessed(page0); 3265 set_page_extent_mapped(page0); 3266 set_page_extent_head(page0, len); 3267 uptodate = PageUptodate(page0); 3268 } else { 3269 i = 0; 3270 } 3271 for (; i < num_pages; i++, index++) { 3272 p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM); 3273 if (!p) { 3274 WARN_ON(1); 3275 goto free_eb; 3276 } 3277 set_page_extent_mapped(p); 3278 mark_page_accessed(p); 3279 if (i == 0) { 3280 eb->first_page = p; 3281 set_page_extent_head(p, len); 3282 } else { 3283 set_page_private(p, EXTENT_PAGE_PRIVATE); 3284 } 3285 if (!PageUptodate(p)) 3286 uptodate = 0; 3287 3288 /* 3289 * see below about how we avoid a nasty race with release page 3290 * and why we unlock later 3291 */ 3292 if (i != 0) 3293 unlock_page(p); 3294 } 3295 if (uptodate) 3296 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3297 3298 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); 3299 if (ret) 3300 goto free_eb; 3301 3302 spin_lock(&tree->buffer_lock); 3303 ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb); 3304 if (ret == -EEXIST) { 3305 exists = radix_tree_lookup(&tree->buffer, 3306 start >> PAGE_CACHE_SHIFT); 3307 /* add one reference for the caller */ 3308 atomic_inc(&exists->refs); 3309 spin_unlock(&tree->buffer_lock); 3310 radix_tree_preload_end(); 3311 goto free_eb; 3312 } 3313 /* add one reference for the tree */ 3314 atomic_inc(&eb->refs); 3315 spin_unlock(&tree->buffer_lock); 3316 radix_tree_preload_end(); 3317 3318 /* 3319 * there is a race where release page may have 3320 * tried to find this extent buffer in the radix 3321 * but failed. It will tell the VM it is safe to 3322 * reclaim the, and it will clear the page private bit. 
3323 * We must make sure to set the page private bit properly 3324 * after the extent buffer is in the radix tree so 3325 * it doesn't get lost 3326 */ 3327 set_page_extent_mapped(eb->first_page); 3328 set_page_extent_head(eb->first_page, eb->len); 3329 if (!page0) 3330 unlock_page(eb->first_page); 3331 return eb; 3332 3333 free_eb: 3334 if (eb->first_page && !page0) 3335 unlock_page(eb->first_page); 3336 3337 if (!atomic_dec_and_test(&eb->refs)) 3338 return exists; 3339 btrfs_release_extent_buffer(eb); 3340 return exists; 3341 } 3342 3343 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, 3344 u64 start, unsigned long len, 3345 gfp_t mask) 3346 { 3347 struct extent_buffer *eb; 3348 3349 rcu_read_lock(); 3350 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); 3351 if (eb && atomic_inc_not_zero(&eb->refs)) { 3352 rcu_read_unlock(); 3353 mark_page_accessed(eb->first_page); 3354 return eb; 3355 } 3356 rcu_read_unlock(); 3357 3358 return NULL; 3359 } 3360 3361 void free_extent_buffer(struct extent_buffer *eb) 3362 { 3363 if (!eb) 3364 return; 3365 3366 if (!atomic_dec_and_test(&eb->refs)) 3367 return; 3368 3369 WARN_ON(1); 3370 } 3371 3372 int clear_extent_buffer_dirty(struct extent_io_tree *tree, 3373 struct extent_buffer *eb) 3374 { 3375 unsigned long i; 3376 unsigned long num_pages; 3377 struct page *page; 3378 3379 num_pages = num_extent_pages(eb->start, eb->len); 3380 3381 for (i = 0; i < num_pages; i++) { 3382 page = extent_buffer_page(eb, i); 3383 if (!PageDirty(page)) 3384 continue; 3385 3386 lock_page(page); 3387 WARN_ON(!PagePrivate(page)); 3388 3389 set_page_extent_mapped(page); 3390 if (i == 0) 3391 set_page_extent_head(page, eb->len); 3392 3393 clear_page_dirty_for_io(page); 3394 spin_lock_irq(&page->mapping->tree_lock); 3395 if (!PageDirty(page)) { 3396 radix_tree_tag_clear(&page->mapping->page_tree, 3397 page_index(page), 3398 PAGECACHE_TAG_DIRTY); 3399 } 3400 spin_unlock_irq(&page->mapping->tree_lock); 3401 unlock_page(page); 3402 } 3403 return 0; 3404 } 3405 3406 int wait_on_extent_buffer_writeback(struct extent_io_tree *tree, 3407 struct extent_buffer *eb) 3408 { 3409 return wait_on_extent_writeback(tree, eb->start, 3410 eb->start + eb->len - 1); 3411 } 3412 3413 int set_extent_buffer_dirty(struct extent_io_tree *tree, 3414 struct extent_buffer *eb) 3415 { 3416 unsigned long i; 3417 unsigned long num_pages; 3418 int was_dirty = 0; 3419 3420 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); 3421 num_pages = num_extent_pages(eb->start, eb->len); 3422 for (i = 0; i < num_pages; i++) 3423 __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); 3424 return was_dirty; 3425 } 3426 3427 int clear_extent_buffer_uptodate(struct extent_io_tree *tree, 3428 struct extent_buffer *eb, 3429 struct extent_state **cached_state) 3430 { 3431 unsigned long i; 3432 struct page *page; 3433 unsigned long num_pages; 3434 3435 num_pages = num_extent_pages(eb->start, eb->len); 3436 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3437 3438 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3439 cached_state, GFP_NOFS); 3440 for (i = 0; i < num_pages; i++) { 3441 page = extent_buffer_page(eb, i); 3442 if (page) 3443 ClearPageUptodate(page); 3444 } 3445 return 0; 3446 } 3447 3448 int set_extent_buffer_uptodate(struct extent_io_tree *tree, 3449 struct extent_buffer *eb) 3450 { 3451 unsigned long i; 3452 struct page *page; 3453 unsigned long num_pages; 3454 3455 num_pages = num_extent_pages(eb->start, eb->len); 3456 3457 set_extent_uptodate(tree, 
eb->start, eb->start + eb->len - 1, 3458 GFP_NOFS); 3459 for (i = 0; i < num_pages; i++) { 3460 page = extent_buffer_page(eb, i); 3461 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || 3462 ((i == num_pages - 1) && 3463 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { 3464 check_page_uptodate(tree, page); 3465 continue; 3466 } 3467 SetPageUptodate(page); 3468 } 3469 return 0; 3470 } 3471 3472 int extent_range_uptodate(struct extent_io_tree *tree, 3473 u64 start, u64 end) 3474 { 3475 struct page *page; 3476 int ret; 3477 int pg_uptodate = 1; 3478 int uptodate; 3479 unsigned long index; 3480 3481 ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL); 3482 if (ret) 3483 return 1; 3484 while (start <= end) { 3485 index = start >> PAGE_CACHE_SHIFT; 3486 page = find_get_page(tree->mapping, index); 3487 uptodate = PageUptodate(page); 3488 page_cache_release(page); 3489 if (!uptodate) { 3490 pg_uptodate = 0; 3491 break; 3492 } 3493 start += PAGE_CACHE_SIZE; 3494 } 3495 return pg_uptodate; 3496 } 3497 3498 int extent_buffer_uptodate(struct extent_io_tree *tree, 3499 struct extent_buffer *eb, 3500 struct extent_state *cached_state) 3501 { 3502 int ret = 0; 3503 unsigned long num_pages; 3504 unsigned long i; 3505 struct page *page; 3506 int pg_uptodate = 1; 3507 3508 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 3509 return 1; 3510 3511 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3512 EXTENT_UPTODATE, 1, cached_state); 3513 if (ret) 3514 return ret; 3515 3516 num_pages = num_extent_pages(eb->start, eb->len); 3517 for (i = 0; i < num_pages; i++) { 3518 page = extent_buffer_page(eb, i); 3519 if (!PageUptodate(page)) { 3520 pg_uptodate = 0; 3521 break; 3522 } 3523 } 3524 return pg_uptodate; 3525 } 3526 3527 int read_extent_buffer_pages(struct extent_io_tree *tree, 3528 struct extent_buffer *eb, 3529 u64 start, int wait, 3530 get_extent_t *get_extent, int mirror_num) 3531 { 3532 unsigned long i; 3533 unsigned long start_i; 3534 struct page *page; 3535 int err; 3536 int ret = 0; 3537 int locked_pages = 0; 3538 int all_uptodate = 1; 3539 int inc_all_pages = 0; 3540 unsigned long num_pages; 3541 struct bio *bio = NULL; 3542 unsigned long bio_flags = 0; 3543 3544 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 3545 return 0; 3546 3547 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3548 EXTENT_UPTODATE, 1, NULL)) { 3549 return 0; 3550 } 3551 3552 if (start) { 3553 WARN_ON(start < eb->start); 3554 start_i = (start >> PAGE_CACHE_SHIFT) - 3555 (eb->start >> PAGE_CACHE_SHIFT); 3556 } else { 3557 start_i = 0; 3558 } 3559 3560 num_pages = num_extent_pages(eb->start, eb->len); 3561 for (i = start_i; i < num_pages; i++) { 3562 page = extent_buffer_page(eb, i); 3563 if (!wait) { 3564 if (!trylock_page(page)) 3565 goto unlock_exit; 3566 } else { 3567 lock_page(page); 3568 } 3569 locked_pages++; 3570 if (!PageUptodate(page)) 3571 all_uptodate = 0; 3572 } 3573 if (all_uptodate) { 3574 if (start_i == 0) 3575 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3576 goto unlock_exit; 3577 } 3578 3579 for (i = start_i; i < num_pages; i++) { 3580 page = extent_buffer_page(eb, i); 3581 3582 WARN_ON(!PagePrivate(page)); 3583 3584 set_page_extent_mapped(page); 3585 if (i == 0) 3586 set_page_extent_head(page, eb->len); 3587 3588 if (inc_all_pages) 3589 page_cache_get(page); 3590 if (!PageUptodate(page)) { 3591 if (start_i == 0) 3592 inc_all_pages = 1; 3593 ClearPageError(page); 3594 err = __extent_read_full_page(tree, page, 3595 get_extent, &bio, 3596 mirror_num, &bio_flags); 
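			/*
			 * remember the first error but keep issuing reads
			 * for the remaining pages of this buffer
			 */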
3597 if (err) 3598 ret = err; 3599 } else { 3600 unlock_page(page); 3601 } 3602 } 3603 3604 if (bio) 3605 submit_one_bio(READ, bio, mirror_num, bio_flags); 3606 3607 if (ret || !wait) 3608 return ret; 3609 3610 for (i = start_i; i < num_pages; i++) { 3611 page = extent_buffer_page(eb, i); 3612 wait_on_page_locked(page); 3613 if (!PageUptodate(page)) 3614 ret = -EIO; 3615 } 3616 3617 if (!ret) 3618 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3619 return ret; 3620 3621 unlock_exit: 3622 i = start_i; 3623 while (locked_pages > 0) { 3624 page = extent_buffer_page(eb, i); 3625 i++; 3626 unlock_page(page); 3627 locked_pages--; 3628 } 3629 return ret; 3630 } 3631 3632 void read_extent_buffer(struct extent_buffer *eb, void *dstv, 3633 unsigned long start, 3634 unsigned long len) 3635 { 3636 size_t cur; 3637 size_t offset; 3638 struct page *page; 3639 char *kaddr; 3640 char *dst = (char *)dstv; 3641 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3642 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3643 3644 WARN_ON(start > eb->len); 3645 WARN_ON(start + len > eb->start + eb->len); 3646 3647 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 3648 3649 while (len > 0) { 3650 page = extent_buffer_page(eb, i); 3651 3652 cur = min(len, (PAGE_CACHE_SIZE - offset)); 3653 kaddr = kmap_atomic(page, KM_USER1); 3654 memcpy(dst, kaddr + offset, cur); 3655 kunmap_atomic(kaddr, KM_USER1); 3656 3657 dst += cur; 3658 len -= cur; 3659 offset = 0; 3660 i++; 3661 } 3662 } 3663 3664 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, 3665 unsigned long min_len, char **token, char **map, 3666 unsigned long *map_start, 3667 unsigned long *map_len, int km) 3668 { 3669 size_t offset = start & (PAGE_CACHE_SIZE - 1); 3670 char *kaddr; 3671 struct page *p; 3672 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3673 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3674 unsigned long end_i = (start_offset + start + min_len - 1) >> 3675 PAGE_CACHE_SHIFT; 3676 3677 if (i != end_i) 3678 return -EINVAL; 3679 3680 if (i == 0) { 3681 offset = start_offset; 3682 *map_start = 0; 3683 } else { 3684 offset = 0; 3685 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset; 3686 } 3687 3688 if (start + min_len > eb->len) { 3689 printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " 3690 "wanted %lu %lu\n", (unsigned long long)eb->start, 3691 eb->len, start, min_len); 3692 WARN_ON(1); 3693 } 3694 3695 p = extent_buffer_page(eb, i); 3696 kaddr = kmap_atomic(p, km); 3697 *token = kaddr; 3698 *map = kaddr + offset; 3699 *map_len = PAGE_CACHE_SIZE - offset; 3700 return 0; 3701 } 3702 3703 int map_extent_buffer(struct extent_buffer *eb, unsigned long start, 3704 unsigned long min_len, 3705 char **token, char **map, 3706 unsigned long *map_start, 3707 unsigned long *map_len, int km) 3708 { 3709 int err; 3710 int save = 0; 3711 if (eb->map_token) { 3712 unmap_extent_buffer(eb, eb->map_token, km); 3713 eb->map_token = NULL; 3714 save = 1; 3715 } 3716 err = map_private_extent_buffer(eb, start, min_len, token, map, 3717 map_start, map_len, km); 3718 if (!err && save) { 3719 eb->map_token = *token; 3720 eb->kaddr = *map; 3721 eb->map_start = *map_start; 3722 eb->map_len = *map_len; 3723 } 3724 return err; 3725 } 3726 3727 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km) 3728 { 3729 kunmap_atomic(token, km); 3730 } 3731 3732 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, 3733 unsigned long start, 3734 
unsigned long len) 3735 { 3736 size_t cur; 3737 size_t offset; 3738 struct page *page; 3739 char *kaddr; 3740 char *ptr = (char *)ptrv; 3741 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3742 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3743 int ret = 0; 3744 3745 WARN_ON(start > eb->len); 3746 WARN_ON(start + len > eb->start + eb->len); 3747 3748 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 3749 3750 while (len > 0) { 3751 page = extent_buffer_page(eb, i); 3752 3753 cur = min(len, (PAGE_CACHE_SIZE - offset)); 3754 3755 kaddr = kmap_atomic(page, KM_USER0); 3756 ret = memcmp(ptr, kaddr + offset, cur); 3757 kunmap_atomic(kaddr, KM_USER0); 3758 if (ret) 3759 break; 3760 3761 ptr += cur; 3762 len -= cur; 3763 offset = 0; 3764 i++; 3765 } 3766 return ret; 3767 } 3768 3769 void write_extent_buffer(struct extent_buffer *eb, const void *srcv, 3770 unsigned long start, unsigned long len) 3771 { 3772 size_t cur; 3773 size_t offset; 3774 struct page *page; 3775 char *kaddr; 3776 char *src = (char *)srcv; 3777 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3778 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3779 3780 WARN_ON(start > eb->len); 3781 WARN_ON(start + len > eb->start + eb->len); 3782 3783 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 3784 3785 while (len > 0) { 3786 page = extent_buffer_page(eb, i); 3787 WARN_ON(!PageUptodate(page)); 3788 3789 cur = min(len, PAGE_CACHE_SIZE - offset); 3790 kaddr = kmap_atomic(page, KM_USER1); 3791 memcpy(kaddr + offset, src, cur); 3792 kunmap_atomic(kaddr, KM_USER1); 3793 3794 src += cur; 3795 len -= cur; 3796 offset = 0; 3797 i++; 3798 } 3799 } 3800 3801 void memset_extent_buffer(struct extent_buffer *eb, char c, 3802 unsigned long start, unsigned long len) 3803 { 3804 size_t cur; 3805 size_t offset; 3806 struct page *page; 3807 char *kaddr; 3808 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3809 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3810 3811 WARN_ON(start > eb->len); 3812 WARN_ON(start + len > eb->start + eb->len); 3813 3814 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 3815 3816 while (len > 0) { 3817 page = extent_buffer_page(eb, i); 3818 WARN_ON(!PageUptodate(page)); 3819 3820 cur = min(len, PAGE_CACHE_SIZE - offset); 3821 kaddr = kmap_atomic(page, KM_USER0); 3822 memset(kaddr + offset, c, cur); 3823 kunmap_atomic(kaddr, KM_USER0); 3824 3825 len -= cur; 3826 offset = 0; 3827 i++; 3828 } 3829 } 3830 3831 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, 3832 unsigned long dst_offset, unsigned long src_offset, 3833 unsigned long len) 3834 { 3835 u64 dst_len = dst->len; 3836 size_t cur; 3837 size_t offset; 3838 struct page *page; 3839 char *kaddr; 3840 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); 3841 unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; 3842 3843 WARN_ON(src->len != dst_len); 3844 3845 offset = (start_offset + dst_offset) & 3846 ((unsigned long)PAGE_CACHE_SIZE - 1); 3847 3848 while (len > 0) { 3849 page = extent_buffer_page(dst, i); 3850 WARN_ON(!PageUptodate(page)); 3851 3852 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); 3853 3854 kaddr = kmap_atomic(page, KM_USER0); 3855 read_extent_buffer(src, kaddr + offset, src_offset, cur); 3856 kunmap_atomic(kaddr, KM_USER0); 3857 3858 src_offset += cur; 3859 len -= cur; 3860 offset = 0; 3861 i++; 3862 } 3863 } 3864 3865 static void 
move_pages(struct page *dst_page, struct page *src_page, 3866 unsigned long dst_off, unsigned long src_off, 3867 unsigned long len) 3868 { 3869 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); 3870 if (dst_page == src_page) { 3871 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); 3872 } else { 3873 char *src_kaddr = kmap_atomic(src_page, KM_USER1); 3874 char *p = dst_kaddr + dst_off + len; 3875 char *s = src_kaddr + src_off + len; 3876 3877 while (len--) 3878 *--p = *--s; 3879 3880 kunmap_atomic(src_kaddr, KM_USER1); 3881 } 3882 kunmap_atomic(dst_kaddr, KM_USER0); 3883 } 3884 3885 static void copy_pages(struct page *dst_page, struct page *src_page, 3886 unsigned long dst_off, unsigned long src_off, 3887 unsigned long len) 3888 { 3889 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); 3890 char *src_kaddr; 3891 3892 if (dst_page != src_page) 3893 src_kaddr = kmap_atomic(src_page, KM_USER1); 3894 else 3895 src_kaddr = dst_kaddr; 3896 3897 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); 3898 kunmap_atomic(dst_kaddr, KM_USER0); 3899 if (dst_page != src_page) 3900 kunmap_atomic(src_kaddr, KM_USER1); 3901 } 3902 3903 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 3904 unsigned long src_offset, unsigned long len) 3905 { 3906 size_t cur; 3907 size_t dst_off_in_page; 3908 size_t src_off_in_page; 3909 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); 3910 unsigned long dst_i; 3911 unsigned long src_i; 3912 3913 if (src_offset + len > dst->len) { 3914 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " 3915 "len %lu dst len %lu\n", src_offset, len, dst->len); 3916 BUG_ON(1); 3917 } 3918 if (dst_offset + len > dst->len) { 3919 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " 3920 "len %lu dst len %lu\n", dst_offset, len, dst->len); 3921 BUG_ON(1); 3922 } 3923 3924 while (len > 0) { 3925 dst_off_in_page = (start_offset + dst_offset) & 3926 ((unsigned long)PAGE_CACHE_SIZE - 1); 3927 src_off_in_page = (start_offset + src_offset) & 3928 ((unsigned long)PAGE_CACHE_SIZE - 1); 3929 3930 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; 3931 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT; 3932 3933 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - 3934 src_off_in_page)); 3935 cur = min_t(unsigned long, cur, 3936 (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page)); 3937 3938 copy_pages(extent_buffer_page(dst, dst_i), 3939 extent_buffer_page(dst, src_i), 3940 dst_off_in_page, src_off_in_page, cur); 3941 3942 src_offset += cur; 3943 dst_offset += cur; 3944 len -= cur; 3945 } 3946 } 3947 3948 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 3949 unsigned long src_offset, unsigned long len) 3950 { 3951 size_t cur; 3952 size_t dst_off_in_page; 3953 size_t src_off_in_page; 3954 unsigned long dst_end = dst_offset + len - 1; 3955 unsigned long src_end = src_offset + len - 1; 3956 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); 3957 unsigned long dst_i; 3958 unsigned long src_i; 3959 3960 if (src_offset + len > dst->len) { 3961 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " 3962 "len %lu len %lu\n", src_offset, len, dst->len); 3963 BUG_ON(1); 3964 } 3965 if (dst_offset + len > dst->len) { 3966 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " 3967 "len %lu len %lu\n", dst_offset, len, dst->len); 3968 BUG_ON(1); 3969 } 3970 if (dst_offset < src_offset) { 3971 memcpy_extent_buffer(dst, dst_offset, src_offset, len); 3972 return; 3973 } 3974 while (len > 0) { 3975 
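		/*
		 * copy from the end of the range towards the start so that
		 * overlapping source and destination ranges are handled
		 * correctly; cur is clipped so each step stays inside a
		 * single source page and a single destination page
		 */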
dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT; 3976 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT; 3977 3978 dst_off_in_page = (start_offset + dst_end) & 3979 ((unsigned long)PAGE_CACHE_SIZE - 1); 3980 src_off_in_page = (start_offset + src_end) & 3981 ((unsigned long)PAGE_CACHE_SIZE - 1); 3982 3983 cur = min_t(unsigned long, len, src_off_in_page + 1); 3984 cur = min(cur, dst_off_in_page + 1); 3985 move_pages(extent_buffer_page(dst, dst_i), 3986 extent_buffer_page(dst, src_i), 3987 dst_off_in_page - cur + 1, 3988 src_off_in_page - cur + 1, cur); 3989 3990 dst_end -= cur; 3991 src_end -= cur; 3992 len -= cur; 3993 } 3994 } 3995 3996 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) 3997 { 3998 struct extent_buffer *eb = 3999 container_of(head, struct extent_buffer, rcu_head); 4000 4001 btrfs_release_extent_buffer(eb); 4002 } 4003 4004 int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) 4005 { 4006 u64 start = page_offset(page); 4007 struct extent_buffer *eb; 4008 int ret = 1; 4009 4010 spin_lock(&tree->buffer_lock); 4011 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); 4012 if (!eb) { 4013 spin_unlock(&tree->buffer_lock); 4014 return ret; 4015 } 4016 4017 if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 4018 ret = 0; 4019 goto out; 4020 } 4021 4022 /* 4023 * set @eb->refs to 0 only if it is still 1 and then release the @eb; 4024 * otherwise someone else still holds a reference and we leave it alone. 4025 */ 4026 if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) { 4027 ret = 0; 4028 goto out; 4029 } 4030 4031 radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT); 4032 out: 4033 spin_unlock(&tree->buffer_lock); 4034 4035 /* at this point we can safely release the extent buffer */ 4036 if (atomic_read(&eb->refs) == 0) 4037 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); 4038 return ret; 4039 } 4040
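/*
 * Illustrative sketch only, not part of the original file (kept inside
 * #if 0 so it is never built): one plausible way a caller could pull a
 * few bytes out of a metadata block with the extent_buffer API above.
 * The helper name, its arguments and the get_extent callback are
 * assumptions made for the example; passing wait == 1 makes
 * read_extent_buffer_pages() block until the reads finish.
 */
#if 0
static int example_read_eb_bytes(struct extent_io_tree *tree,
				 u64 eb_start, unsigned long eb_len,
				 unsigned long offset, unsigned long len,
				 get_extent_t *get_extent, void *dst)
{
	struct extent_buffer *eb;
	int ret;

	/* take a reference on the buffer, creating it if needed */
	eb = alloc_extent_buffer(tree, eb_start, eb_len, NULL, GFP_NOFS);
	if (!eb)
		return -ENOMEM;

	/* read any pages that are not uptodate yet and wait for them */
	ret = read_extent_buffer_pages(tree, eb, 0, 1, get_extent, 0);
	if (!ret)
		read_extent_buffer(eb, dst, offset, len);

	/* drop our reference; the radix tree still holds its own */
	free_extent_buffer(eb);
	return ret;
}
#endif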