#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/cleancache.h>
#include "extent_io.h"
#include "extent_map.h"
#include "ctree.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "check-integrity.h"
#include "locking.h"
#include "rcu-string.h"
#include "backref.h"
#include "transaction.h"

static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
static struct bio_set *btrfs_bioset;

static inline bool extent_state_in_tree(const struct extent_state *state)
{
	return !RB_EMPTY_NODE(&state->rb_node);
}

#ifdef CONFIG_BTRFS_DEBUG
static LIST_HEAD(buffers);
static LIST_HEAD(states);

static DEFINE_SPINLOCK(leak_lock);

static inline
void btrfs_leak_debug_add(struct list_head *new, struct list_head *head)
{
	unsigned long flags;

	spin_lock_irqsave(&leak_lock, flags);
	list_add(new, head);
	spin_unlock_irqrestore(&leak_lock, flags);
}

static inline
void btrfs_leak_debug_del(struct list_head *entry)
{
	unsigned long flags;

	spin_lock_irqsave(&leak_lock, flags);
	list_del(entry);
	spin_unlock_irqrestore(&leak_lock, flags);
}

static inline
void btrfs_leak_debug_check(void)
{
	struct extent_state *state;
	struct extent_buffer *eb;

	while (!list_empty(&states)) {
		state = list_entry(states.next, struct extent_state, leak_list);
		pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
		       state->start, state->end, state->state,
		       extent_state_in_tree(state),
		       atomic_read(&state->refs));
		list_del(&state->leak_list);
		kmem_cache_free(extent_state_cache, state);
	}

	while (!list_empty(&buffers)) {
		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
		pr_err("BTRFS: buffer leak start %llu len %lu refs %d\n",
		       eb->start, eb->len, atomic_read(&eb->refs));
		list_del(&eb->leak_list);
		kmem_cache_free(extent_buffer_cache, eb);
	}
}

#define btrfs_debug_check_extent_io_range(tree, start, end) \
	__btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
static inline void __btrfs_debug_check_extent_io_range(const char *caller,
		struct extent_io_tree *tree, u64 start, u64 end)
{
	struct inode *inode;
	u64 isize;

	if (!tree->mapping)
		return;

	inode = tree->mapping->host;
	isize = i_size_read(inode);
	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
		btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
			"%s: ino %llu isize %llu odd range [%llu,%llu]",
			caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
	}
}
#else
#define btrfs_leak_debug_add(new, head)	do {} while (0)
#define btrfs_leak_debug_del(entry)	do {} while (0)
#define btrfs_leak_debug_check()	do {} while (0)
#define btrfs_debug_check_extent_io_range(c, s, e)	do {} while (0)
#endif

#define BUFFER_LRU_MAX 64

struct tree_entry {
	u64 start;
	u64 end;
	struct rb_node rb_node;
};

struct extent_page_data {
	struct bio *bio;
	struct extent_io_tree *tree;
	get_extent_t *get_extent;
	unsigned long bio_flags;

	/* tells writepage not to lock the state bits for this
	 * range, it still does the unlocking
	 */
	unsigned int extent_locked:1;

	/* tells the submit_bio code to use REQ_SYNC */
	unsigned int sync_io:1;
};

static void add_extent_changeset(struct extent_state *state, unsigned bits,
				 struct extent_changeset *changeset,
				 int set)
{
	int ret;

	if (!changeset)
		return;
	if (set && (state->state & bits) == bits)
		return;
	if (!set && (state->state & bits) == 0)
		return;
	changeset->bytes_changed += state->end - state->start + 1;
	ret = ulist_add(&changeset->range_changed, state->start, state->end,
			GFP_ATOMIC);
	/* ENOMEM */
	BUG_ON(ret < 0);
}

static noinline void flush_write_bio(void *data);
static inline struct btrfs_fs_info *
tree_fs_info(struct extent_io_tree *tree)
{
	if (!tree->mapping)
		return NULL;
	return btrfs_sb(tree->mapping->host->i_sb);
}

int __init extent_io_init(void)
{
	extent_state_cache = kmem_cache_create("btrfs_extent_state",
			sizeof(struct extent_state), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!extent_state_cache)
		return -ENOMEM;

	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
			sizeof(struct extent_buffer), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!extent_buffer_cache)
		goto free_state_cache;

	btrfs_bioset = bioset_create(BIO_POOL_SIZE,
				     offsetof(struct btrfs_io_bio, bio));
	if (!btrfs_bioset)
		goto free_buffer_cache;

	if (bioset_integrity_create(btrfs_bioset, BIO_POOL_SIZE))
		goto free_bioset;

	return 0;

free_bioset:
	bioset_free(btrfs_bioset);
	btrfs_bioset = NULL;

free_buffer_cache:
	kmem_cache_destroy(extent_buffer_cache);
	extent_buffer_cache = NULL;

free_state_cache:
	kmem_cache_destroy(extent_state_cache);
	extent_state_cache = NULL;
	return -ENOMEM;
}

void extent_io_exit(void)
{
	btrfs_leak_debug_check();

	/*
	 * Make sure all delayed rcu free are flushed before we
	 * destroy caches.
	 */
	rcu_barrier();
	kmem_cache_destroy(extent_state_cache);
	kmem_cache_destroy(extent_buffer_cache);
	if (btrfs_bioset)
		bioset_free(btrfs_bioset);
}

void extent_io_tree_init(struct extent_io_tree *tree,
			 struct address_space *mapping)
{
	tree->state = RB_ROOT;
	tree->ops = NULL;
	tree->dirty_bytes = 0;
	spin_lock_init(&tree->lock);
	tree->mapping = mapping;
}

static struct extent_state *alloc_extent_state(gfp_t mask)
{
	struct extent_state *state;

	/*
	 * The given mask might not be appropriate for the slab allocator,
	 * drop the unsupported bits
	 */
	mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
	state = kmem_cache_alloc(extent_state_cache, mask);
	if (!state)
		return state;
	state->state = 0;
	state->failrec = NULL;
	RB_CLEAR_NODE(&state->rb_node);
	btrfs_leak_debug_add(&state->leak_list, &states);
	atomic_set(&state->refs, 1);
	init_waitqueue_head(&state->wq);
	trace_alloc_extent_state(state, mask, _RET_IP_);
	return state;
}

void free_extent_state(struct extent_state *state)
{
	if (!state)
		return;
	if (atomic_dec_and_test(&state->refs)) {
		WARN_ON(extent_state_in_tree(state));
		btrfs_leak_debug_del(&state->leak_list);
		trace_free_extent_state(state, _RET_IP_);
		kmem_cache_free(extent_state_cache, state);
	}
}

static struct rb_node *tree_insert(struct rb_root *root,
				   struct rb_node *search_start,
				   u64 offset,
				   struct rb_node *node,
				   struct rb_node ***p_in,
				   struct rb_node **parent_in)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct tree_entry *entry;

	if (p_in && parent_in) {
		p = *p_in;
		parent = *parent_in;
		goto do_insert;
	}

	p = search_start ?
&search_start : &root->rb_node; 277 while (*p) { 278 parent = *p; 279 entry = rb_entry(parent, struct tree_entry, rb_node); 280 281 if (offset < entry->start) 282 p = &(*p)->rb_left; 283 else if (offset > entry->end) 284 p = &(*p)->rb_right; 285 else 286 return parent; 287 } 288 289 do_insert: 290 rb_link_node(node, parent, p); 291 rb_insert_color(node, root); 292 return NULL; 293 } 294 295 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, 296 struct rb_node **prev_ret, 297 struct rb_node **next_ret, 298 struct rb_node ***p_ret, 299 struct rb_node **parent_ret) 300 { 301 struct rb_root *root = &tree->state; 302 struct rb_node **n = &root->rb_node; 303 struct rb_node *prev = NULL; 304 struct rb_node *orig_prev = NULL; 305 struct tree_entry *entry; 306 struct tree_entry *prev_entry = NULL; 307 308 while (*n) { 309 prev = *n; 310 entry = rb_entry(prev, struct tree_entry, rb_node); 311 prev_entry = entry; 312 313 if (offset < entry->start) 314 n = &(*n)->rb_left; 315 else if (offset > entry->end) 316 n = &(*n)->rb_right; 317 else 318 return *n; 319 } 320 321 if (p_ret) 322 *p_ret = n; 323 if (parent_ret) 324 *parent_ret = prev; 325 326 if (prev_ret) { 327 orig_prev = prev; 328 while (prev && offset > prev_entry->end) { 329 prev = rb_next(prev); 330 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 331 } 332 *prev_ret = prev; 333 prev = orig_prev; 334 } 335 336 if (next_ret) { 337 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 338 while (prev && offset < prev_entry->start) { 339 prev = rb_prev(prev); 340 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 341 } 342 *next_ret = prev; 343 } 344 return NULL; 345 } 346 347 static inline struct rb_node * 348 tree_search_for_insert(struct extent_io_tree *tree, 349 u64 offset, 350 struct rb_node ***p_ret, 351 struct rb_node **parent_ret) 352 { 353 struct rb_node *prev = NULL; 354 struct rb_node *ret; 355 356 ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret); 357 if (!ret) 358 return prev; 359 return ret; 360 } 361 362 static inline struct rb_node *tree_search(struct extent_io_tree *tree, 363 u64 offset) 364 { 365 return tree_search_for_insert(tree, offset, NULL, NULL); 366 } 367 368 static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, 369 struct extent_state *other) 370 { 371 if (tree->ops && tree->ops->merge_extent_hook) 372 tree->ops->merge_extent_hook(tree->mapping->host, new, 373 other); 374 } 375 376 /* 377 * utility function to look for merge candidates inside a given range. 378 * Any extents with matching state are merged together into a single 379 * extent in the tree. Extents with EXTENT_IO in their state field 380 * are not merged because the end_io handlers need to be able to do 381 * operations on them without sleeping (or doing allocations/splits). 382 * 383 * This should be called with the tree lock held. 
384 */ 385 static void merge_state(struct extent_io_tree *tree, 386 struct extent_state *state) 387 { 388 struct extent_state *other; 389 struct rb_node *other_node; 390 391 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 392 return; 393 394 other_node = rb_prev(&state->rb_node); 395 if (other_node) { 396 other = rb_entry(other_node, struct extent_state, rb_node); 397 if (other->end == state->start - 1 && 398 other->state == state->state) { 399 merge_cb(tree, state, other); 400 state->start = other->start; 401 rb_erase(&other->rb_node, &tree->state); 402 RB_CLEAR_NODE(&other->rb_node); 403 free_extent_state(other); 404 } 405 } 406 other_node = rb_next(&state->rb_node); 407 if (other_node) { 408 other = rb_entry(other_node, struct extent_state, rb_node); 409 if (other->start == state->end + 1 && 410 other->state == state->state) { 411 merge_cb(tree, state, other); 412 state->end = other->end; 413 rb_erase(&other->rb_node, &tree->state); 414 RB_CLEAR_NODE(&other->rb_node); 415 free_extent_state(other); 416 } 417 } 418 } 419 420 static void set_state_cb(struct extent_io_tree *tree, 421 struct extent_state *state, unsigned *bits) 422 { 423 if (tree->ops && tree->ops->set_bit_hook) 424 tree->ops->set_bit_hook(tree->mapping->host, state, bits); 425 } 426 427 static void clear_state_cb(struct extent_io_tree *tree, 428 struct extent_state *state, unsigned *bits) 429 { 430 if (tree->ops && tree->ops->clear_bit_hook) 431 tree->ops->clear_bit_hook(BTRFS_I(tree->mapping->host), 432 state, bits); 433 } 434 435 static void set_state_bits(struct extent_io_tree *tree, 436 struct extent_state *state, unsigned *bits, 437 struct extent_changeset *changeset); 438 439 /* 440 * insert an extent_state struct into the tree. 'bits' are set on the 441 * struct before it is inserted. 442 * 443 * This may return -EEXIST if the extent is already there, in which case the 444 * state struct is freed. 445 * 446 * The tree lock is not taken internally. This is a utility function and 447 * probably isn't what you want to call (see set/clear_extent_bit). 448 */ 449 static int insert_state(struct extent_io_tree *tree, 450 struct extent_state *state, u64 start, u64 end, 451 struct rb_node ***p, 452 struct rb_node **parent, 453 unsigned *bits, struct extent_changeset *changeset) 454 { 455 struct rb_node *node; 456 457 if (end < start) 458 WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n", 459 end, start); 460 state->start = start; 461 state->end = end; 462 463 set_state_bits(tree, state, bits, changeset); 464 465 node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent); 466 if (node) { 467 struct extent_state *found; 468 found = rb_entry(node, struct extent_state, rb_node); 469 pr_err("BTRFS: found node %llu %llu on insert of %llu %llu\n", 470 found->start, found->end, start, end); 471 return -EEXIST; 472 } 473 merge_state(tree, state); 474 return 0; 475 } 476 477 static void split_cb(struct extent_io_tree *tree, struct extent_state *orig, 478 u64 split) 479 { 480 if (tree->ops && tree->ops->split_extent_hook) 481 tree->ops->split_extent_hook(tree->mapping->host, orig, split); 482 } 483 484 /* 485 * split a given extent state struct in two, inserting the preallocated 486 * struct 'prealloc' as the newly created second half. 'split' indicates an 487 * offset inside 'orig' where it should be split. 488 * 489 * Before calling, 490 * the tree has 'orig' at [orig->start, orig->end]. 
After calling, there 491 * are two extent state structs in the tree: 492 * prealloc: [orig->start, split - 1] 493 * orig: [ split, orig->end ] 494 * 495 * The tree locks are not taken by this function. They need to be held 496 * by the caller. 497 */ 498 static int split_state(struct extent_io_tree *tree, struct extent_state *orig, 499 struct extent_state *prealloc, u64 split) 500 { 501 struct rb_node *node; 502 503 split_cb(tree, orig, split); 504 505 prealloc->start = orig->start; 506 prealloc->end = split - 1; 507 prealloc->state = orig->state; 508 orig->start = split; 509 510 node = tree_insert(&tree->state, &orig->rb_node, prealloc->end, 511 &prealloc->rb_node, NULL, NULL); 512 if (node) { 513 free_extent_state(prealloc); 514 return -EEXIST; 515 } 516 return 0; 517 } 518 519 static struct extent_state *next_state(struct extent_state *state) 520 { 521 struct rb_node *next = rb_next(&state->rb_node); 522 if (next) 523 return rb_entry(next, struct extent_state, rb_node); 524 else 525 return NULL; 526 } 527 528 /* 529 * utility function to clear some bits in an extent state struct. 530 * it will optionally wake up any one waiting on this state (wake == 1). 531 * 532 * If no bits are set on the state struct after clearing things, the 533 * struct is freed and removed from the tree 534 */ 535 static struct extent_state *clear_state_bit(struct extent_io_tree *tree, 536 struct extent_state *state, 537 unsigned *bits, int wake, 538 struct extent_changeset *changeset) 539 { 540 struct extent_state *next; 541 unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS; 542 543 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 544 u64 range = state->end - state->start + 1; 545 WARN_ON(range > tree->dirty_bytes); 546 tree->dirty_bytes -= range; 547 } 548 clear_state_cb(tree, state, bits); 549 add_extent_changeset(state, bits_to_clear, changeset, 0); 550 state->state &= ~bits_to_clear; 551 if (wake) 552 wake_up(&state->wq); 553 if (state->state == 0) { 554 next = next_state(state); 555 if (extent_state_in_tree(state)) { 556 rb_erase(&state->rb_node, &tree->state); 557 RB_CLEAR_NODE(&state->rb_node); 558 free_extent_state(state); 559 } else { 560 WARN_ON(1); 561 } 562 } else { 563 merge_state(tree, state); 564 next = next_state(state); 565 } 566 return next; 567 } 568 569 static struct extent_state * 570 alloc_extent_state_atomic(struct extent_state *prealloc) 571 { 572 if (!prealloc) 573 prealloc = alloc_extent_state(GFP_ATOMIC); 574 575 return prealloc; 576 } 577 578 static void extent_io_tree_panic(struct extent_io_tree *tree, int err) 579 { 580 btrfs_panic(tree_fs_info(tree), err, 581 "Locking error: Extent tree was modified by another thread while locked."); 582 } 583 584 /* 585 * clear some bits on a range in the tree. This may require splitting 586 * or inserting elements in the tree, so the gfp mask is used to 587 * indicate which allocations or sleeping are allowed. 588 * 589 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove 590 * the given range from the tree regardless of state (ie for truncate). 591 * 592 * the range [start, end] is inclusive. 593 * 594 * This takes the tree lock, and returns 0 on success and < 0 on error. 
595 */ 596 static int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 597 unsigned bits, int wake, int delete, 598 struct extent_state **cached_state, 599 gfp_t mask, struct extent_changeset *changeset) 600 { 601 struct extent_state *state; 602 struct extent_state *cached; 603 struct extent_state *prealloc = NULL; 604 struct rb_node *node; 605 u64 last_end; 606 int err; 607 int clear = 0; 608 609 btrfs_debug_check_extent_io_range(tree, start, end); 610 611 if (bits & EXTENT_DELALLOC) 612 bits |= EXTENT_NORESERVE; 613 614 if (delete) 615 bits |= ~EXTENT_CTLBITS; 616 bits |= EXTENT_FIRST_DELALLOC; 617 618 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 619 clear = 1; 620 again: 621 if (!prealloc && gfpflags_allow_blocking(mask)) { 622 /* 623 * Don't care for allocation failure here because we might end 624 * up not needing the pre-allocated extent state at all, which 625 * is the case if we only have in the tree extent states that 626 * cover our input range and don't cover too any other range. 627 * If we end up needing a new extent state we allocate it later. 628 */ 629 prealloc = alloc_extent_state(mask); 630 } 631 632 spin_lock(&tree->lock); 633 if (cached_state) { 634 cached = *cached_state; 635 636 if (clear) { 637 *cached_state = NULL; 638 cached_state = NULL; 639 } 640 641 if (cached && extent_state_in_tree(cached) && 642 cached->start <= start && cached->end > start) { 643 if (clear) 644 atomic_dec(&cached->refs); 645 state = cached; 646 goto hit_next; 647 } 648 if (clear) 649 free_extent_state(cached); 650 } 651 /* 652 * this search will find the extents that end after 653 * our range starts 654 */ 655 node = tree_search(tree, start); 656 if (!node) 657 goto out; 658 state = rb_entry(node, struct extent_state, rb_node); 659 hit_next: 660 if (state->start > end) 661 goto out; 662 WARN_ON(state->end < start); 663 last_end = state->end; 664 665 /* the state doesn't have the wanted bits, go ahead */ 666 if (!(state->state & bits)) { 667 state = next_state(state); 668 goto next; 669 } 670 671 /* 672 * | ---- desired range ---- | 673 * | state | or 674 * | ------------- state -------------- | 675 * 676 * We need to split the extent we found, and may flip 677 * bits on second half. 678 * 679 * If the extent we found extends past our range, we 680 * just split and search again. It'll get split again 681 * the next time though. 682 * 683 * If the extent we found is inside our range, we clear 684 * the desired bit on it. 
685 */ 686 687 if (state->start < start) { 688 prealloc = alloc_extent_state_atomic(prealloc); 689 BUG_ON(!prealloc); 690 err = split_state(tree, state, prealloc, start); 691 if (err) 692 extent_io_tree_panic(tree, err); 693 694 prealloc = NULL; 695 if (err) 696 goto out; 697 if (state->end <= end) { 698 state = clear_state_bit(tree, state, &bits, wake, 699 changeset); 700 goto next; 701 } 702 goto search_again; 703 } 704 /* 705 * | ---- desired range ---- | 706 * | state | 707 * We need to split the extent, and clear the bit 708 * on the first half 709 */ 710 if (state->start <= end && state->end > end) { 711 prealloc = alloc_extent_state_atomic(prealloc); 712 BUG_ON(!prealloc); 713 err = split_state(tree, state, prealloc, end + 1); 714 if (err) 715 extent_io_tree_panic(tree, err); 716 717 if (wake) 718 wake_up(&state->wq); 719 720 clear_state_bit(tree, prealloc, &bits, wake, changeset); 721 722 prealloc = NULL; 723 goto out; 724 } 725 726 state = clear_state_bit(tree, state, &bits, wake, changeset); 727 next: 728 if (last_end == (u64)-1) 729 goto out; 730 start = last_end + 1; 731 if (start <= end && state && !need_resched()) 732 goto hit_next; 733 734 search_again: 735 if (start > end) 736 goto out; 737 spin_unlock(&tree->lock); 738 if (gfpflags_allow_blocking(mask)) 739 cond_resched(); 740 goto again; 741 742 out: 743 spin_unlock(&tree->lock); 744 if (prealloc) 745 free_extent_state(prealloc); 746 747 return 0; 748 749 } 750 751 static void wait_on_state(struct extent_io_tree *tree, 752 struct extent_state *state) 753 __releases(tree->lock) 754 __acquires(tree->lock) 755 { 756 DEFINE_WAIT(wait); 757 prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); 758 spin_unlock(&tree->lock); 759 schedule(); 760 spin_lock(&tree->lock); 761 finish_wait(&state->wq, &wait); 762 } 763 764 /* 765 * waits for one or more bits to clear on a range in the state tree. 766 * The range [start, end] is inclusive. 
767 * The tree lock is taken by this function 768 */ 769 static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 770 unsigned long bits) 771 { 772 struct extent_state *state; 773 struct rb_node *node; 774 775 btrfs_debug_check_extent_io_range(tree, start, end); 776 777 spin_lock(&tree->lock); 778 again: 779 while (1) { 780 /* 781 * this search will find all the extents that end after 782 * our range starts 783 */ 784 node = tree_search(tree, start); 785 process_node: 786 if (!node) 787 break; 788 789 state = rb_entry(node, struct extent_state, rb_node); 790 791 if (state->start > end) 792 goto out; 793 794 if (state->state & bits) { 795 start = state->start; 796 atomic_inc(&state->refs); 797 wait_on_state(tree, state); 798 free_extent_state(state); 799 goto again; 800 } 801 start = state->end + 1; 802 803 if (start > end) 804 break; 805 806 if (!cond_resched_lock(&tree->lock)) { 807 node = rb_next(node); 808 goto process_node; 809 } 810 } 811 out: 812 spin_unlock(&tree->lock); 813 } 814 815 static void set_state_bits(struct extent_io_tree *tree, 816 struct extent_state *state, 817 unsigned *bits, struct extent_changeset *changeset) 818 { 819 unsigned bits_to_set = *bits & ~EXTENT_CTLBITS; 820 821 set_state_cb(tree, state, bits); 822 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { 823 u64 range = state->end - state->start + 1; 824 tree->dirty_bytes += range; 825 } 826 add_extent_changeset(state, bits_to_set, changeset, 1); 827 state->state |= bits_to_set; 828 } 829 830 static void cache_state_if_flags(struct extent_state *state, 831 struct extent_state **cached_ptr, 832 unsigned flags) 833 { 834 if (cached_ptr && !(*cached_ptr)) { 835 if (!flags || (state->state & flags)) { 836 *cached_ptr = state; 837 atomic_inc(&state->refs); 838 } 839 } 840 } 841 842 static void cache_state(struct extent_state *state, 843 struct extent_state **cached_ptr) 844 { 845 return cache_state_if_flags(state, cached_ptr, 846 EXTENT_IOBITS | EXTENT_BOUNDARY); 847 } 848 849 /* 850 * set some bits on a range in the tree. This may require allocations or 851 * sleeping, so the gfp mask is used to indicate what is allowed. 852 * 853 * If any of the exclusive bits are set, this will fail with -EEXIST if some 854 * part of the range already has the desired bits set. The start of the 855 * existing range is returned in failed_start in this case. 856 * 857 * [start, end] is inclusive This takes the tree lock. 858 */ 859 860 static int __must_check 861 __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 862 unsigned bits, unsigned exclusive_bits, 863 u64 *failed_start, struct extent_state **cached_state, 864 gfp_t mask, struct extent_changeset *changeset) 865 { 866 struct extent_state *state; 867 struct extent_state *prealloc = NULL; 868 struct rb_node *node; 869 struct rb_node **p; 870 struct rb_node *parent; 871 int err = 0; 872 u64 last_start; 873 u64 last_end; 874 875 btrfs_debug_check_extent_io_range(tree, start, end); 876 877 bits |= EXTENT_FIRST_DELALLOC; 878 again: 879 if (!prealloc && gfpflags_allow_blocking(mask)) { 880 /* 881 * Don't care for allocation failure here because we might end 882 * up not needing the pre-allocated extent state at all, which 883 * is the case if we only have in the tree extent states that 884 * cover our input range and don't cover too any other range. 885 * If we end up needing a new extent state we allocate it later. 
886 */ 887 prealloc = alloc_extent_state(mask); 888 } 889 890 spin_lock(&tree->lock); 891 if (cached_state && *cached_state) { 892 state = *cached_state; 893 if (state->start <= start && state->end > start && 894 extent_state_in_tree(state)) { 895 node = &state->rb_node; 896 goto hit_next; 897 } 898 } 899 /* 900 * this search will find all the extents that end after 901 * our range starts. 902 */ 903 node = tree_search_for_insert(tree, start, &p, &parent); 904 if (!node) { 905 prealloc = alloc_extent_state_atomic(prealloc); 906 BUG_ON(!prealloc); 907 err = insert_state(tree, prealloc, start, end, 908 &p, &parent, &bits, changeset); 909 if (err) 910 extent_io_tree_panic(tree, err); 911 912 cache_state(prealloc, cached_state); 913 prealloc = NULL; 914 goto out; 915 } 916 state = rb_entry(node, struct extent_state, rb_node); 917 hit_next: 918 last_start = state->start; 919 last_end = state->end; 920 921 /* 922 * | ---- desired range ---- | 923 * | state | 924 * 925 * Just lock what we found and keep going 926 */ 927 if (state->start == start && state->end <= end) { 928 if (state->state & exclusive_bits) { 929 *failed_start = state->start; 930 err = -EEXIST; 931 goto out; 932 } 933 934 set_state_bits(tree, state, &bits, changeset); 935 cache_state(state, cached_state); 936 merge_state(tree, state); 937 if (last_end == (u64)-1) 938 goto out; 939 start = last_end + 1; 940 state = next_state(state); 941 if (start < end && state && state->start == start && 942 !need_resched()) 943 goto hit_next; 944 goto search_again; 945 } 946 947 /* 948 * | ---- desired range ---- | 949 * | state | 950 * or 951 * | ------------- state -------------- | 952 * 953 * We need to split the extent we found, and may flip bits on 954 * second half. 955 * 956 * If the extent we found extends past our 957 * range, we just split and search again. It'll get split 958 * again the next time though. 959 * 960 * If the extent we found is inside our range, we set the 961 * desired bit on it. 962 */ 963 if (state->start < start) { 964 if (state->state & exclusive_bits) { 965 *failed_start = start; 966 err = -EEXIST; 967 goto out; 968 } 969 970 prealloc = alloc_extent_state_atomic(prealloc); 971 BUG_ON(!prealloc); 972 err = split_state(tree, state, prealloc, start); 973 if (err) 974 extent_io_tree_panic(tree, err); 975 976 prealloc = NULL; 977 if (err) 978 goto out; 979 if (state->end <= end) { 980 set_state_bits(tree, state, &bits, changeset); 981 cache_state(state, cached_state); 982 merge_state(tree, state); 983 if (last_end == (u64)-1) 984 goto out; 985 start = last_end + 1; 986 state = next_state(state); 987 if (start < end && state && state->start == start && 988 !need_resched()) 989 goto hit_next; 990 } 991 goto search_again; 992 } 993 /* 994 * | ---- desired range ---- | 995 * | state | or | state | 996 * 997 * There's a hole, we need to insert something in it and 998 * ignore the extent we found. 999 */ 1000 if (state->start > start) { 1001 u64 this_end; 1002 if (end < last_start) 1003 this_end = end; 1004 else 1005 this_end = last_start - 1; 1006 1007 prealloc = alloc_extent_state_atomic(prealloc); 1008 BUG_ON(!prealloc); 1009 1010 /* 1011 * Avoid to free 'prealloc' if it can be merged with 1012 * the later extent. 
1013 */ 1014 err = insert_state(tree, prealloc, start, this_end, 1015 NULL, NULL, &bits, changeset); 1016 if (err) 1017 extent_io_tree_panic(tree, err); 1018 1019 cache_state(prealloc, cached_state); 1020 prealloc = NULL; 1021 start = this_end + 1; 1022 goto search_again; 1023 } 1024 /* 1025 * | ---- desired range ---- | 1026 * | state | 1027 * We need to split the extent, and set the bit 1028 * on the first half 1029 */ 1030 if (state->start <= end && state->end > end) { 1031 if (state->state & exclusive_bits) { 1032 *failed_start = start; 1033 err = -EEXIST; 1034 goto out; 1035 } 1036 1037 prealloc = alloc_extent_state_atomic(prealloc); 1038 BUG_ON(!prealloc); 1039 err = split_state(tree, state, prealloc, end + 1); 1040 if (err) 1041 extent_io_tree_panic(tree, err); 1042 1043 set_state_bits(tree, prealloc, &bits, changeset); 1044 cache_state(prealloc, cached_state); 1045 merge_state(tree, prealloc); 1046 prealloc = NULL; 1047 goto out; 1048 } 1049 1050 search_again: 1051 if (start > end) 1052 goto out; 1053 spin_unlock(&tree->lock); 1054 if (gfpflags_allow_blocking(mask)) 1055 cond_resched(); 1056 goto again; 1057 1058 out: 1059 spin_unlock(&tree->lock); 1060 if (prealloc) 1061 free_extent_state(prealloc); 1062 1063 return err; 1064 1065 } 1066 1067 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 1068 unsigned bits, u64 * failed_start, 1069 struct extent_state **cached_state, gfp_t mask) 1070 { 1071 return __set_extent_bit(tree, start, end, bits, 0, failed_start, 1072 cached_state, mask, NULL); 1073 } 1074 1075 1076 /** 1077 * convert_extent_bit - convert all bits in a given range from one bit to 1078 * another 1079 * @tree: the io tree to search 1080 * @start: the start offset in bytes 1081 * @end: the end offset in bytes (inclusive) 1082 * @bits: the bits to set in this range 1083 * @clear_bits: the bits to clear in this range 1084 * @cached_state: state that we're going to cache 1085 * 1086 * This will go through and set bits for the given range. If any states exist 1087 * already in this range they are set with the given bit and cleared of the 1088 * clear_bits. This is only meant to be used by things that are mergeable, ie 1089 * converting from say DELALLOC to DIRTY. This is not meant to be used with 1090 * boundary bits like LOCK. 1091 * 1092 * All allocations are done with GFP_NOFS. 1093 */ 1094 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 1095 unsigned bits, unsigned clear_bits, 1096 struct extent_state **cached_state) 1097 { 1098 struct extent_state *state; 1099 struct extent_state *prealloc = NULL; 1100 struct rb_node *node; 1101 struct rb_node **p; 1102 struct rb_node *parent; 1103 int err = 0; 1104 u64 last_start; 1105 u64 last_end; 1106 bool first_iteration = true; 1107 1108 btrfs_debug_check_extent_io_range(tree, start, end); 1109 1110 again: 1111 if (!prealloc) { 1112 /* 1113 * Best effort, don't worry if extent state allocation fails 1114 * here for the first iteration. We might have a cached state 1115 * that matches exactly the target range, in which case no 1116 * extent state allocations are needed. We'll only know this 1117 * after locking the tree. 
1118 */ 1119 prealloc = alloc_extent_state(GFP_NOFS); 1120 if (!prealloc && !first_iteration) 1121 return -ENOMEM; 1122 } 1123 1124 spin_lock(&tree->lock); 1125 if (cached_state && *cached_state) { 1126 state = *cached_state; 1127 if (state->start <= start && state->end > start && 1128 extent_state_in_tree(state)) { 1129 node = &state->rb_node; 1130 goto hit_next; 1131 } 1132 } 1133 1134 /* 1135 * this search will find all the extents that end after 1136 * our range starts. 1137 */ 1138 node = tree_search_for_insert(tree, start, &p, &parent); 1139 if (!node) { 1140 prealloc = alloc_extent_state_atomic(prealloc); 1141 if (!prealloc) { 1142 err = -ENOMEM; 1143 goto out; 1144 } 1145 err = insert_state(tree, prealloc, start, end, 1146 &p, &parent, &bits, NULL); 1147 if (err) 1148 extent_io_tree_panic(tree, err); 1149 cache_state(prealloc, cached_state); 1150 prealloc = NULL; 1151 goto out; 1152 } 1153 state = rb_entry(node, struct extent_state, rb_node); 1154 hit_next: 1155 last_start = state->start; 1156 last_end = state->end; 1157 1158 /* 1159 * | ---- desired range ---- | 1160 * | state | 1161 * 1162 * Just lock what we found and keep going 1163 */ 1164 if (state->start == start && state->end <= end) { 1165 set_state_bits(tree, state, &bits, NULL); 1166 cache_state(state, cached_state); 1167 state = clear_state_bit(tree, state, &clear_bits, 0, NULL); 1168 if (last_end == (u64)-1) 1169 goto out; 1170 start = last_end + 1; 1171 if (start < end && state && state->start == start && 1172 !need_resched()) 1173 goto hit_next; 1174 goto search_again; 1175 } 1176 1177 /* 1178 * | ---- desired range ---- | 1179 * | state | 1180 * or 1181 * | ------------- state -------------- | 1182 * 1183 * We need to split the extent we found, and may flip bits on 1184 * second half. 1185 * 1186 * If the extent we found extends past our 1187 * range, we just split and search again. It'll get split 1188 * again the next time though. 1189 * 1190 * If the extent we found is inside our range, we set the 1191 * desired bit on it. 1192 */ 1193 if (state->start < start) { 1194 prealloc = alloc_extent_state_atomic(prealloc); 1195 if (!prealloc) { 1196 err = -ENOMEM; 1197 goto out; 1198 } 1199 err = split_state(tree, state, prealloc, start); 1200 if (err) 1201 extent_io_tree_panic(tree, err); 1202 prealloc = NULL; 1203 if (err) 1204 goto out; 1205 if (state->end <= end) { 1206 set_state_bits(tree, state, &bits, NULL); 1207 cache_state(state, cached_state); 1208 state = clear_state_bit(tree, state, &clear_bits, 0, 1209 NULL); 1210 if (last_end == (u64)-1) 1211 goto out; 1212 start = last_end + 1; 1213 if (start < end && state && state->start == start && 1214 !need_resched()) 1215 goto hit_next; 1216 } 1217 goto search_again; 1218 } 1219 /* 1220 * | ---- desired range ---- | 1221 * | state | or | state | 1222 * 1223 * There's a hole, we need to insert something in it and 1224 * ignore the extent we found. 1225 */ 1226 if (state->start > start) { 1227 u64 this_end; 1228 if (end < last_start) 1229 this_end = end; 1230 else 1231 this_end = last_start - 1; 1232 1233 prealloc = alloc_extent_state_atomic(prealloc); 1234 if (!prealloc) { 1235 err = -ENOMEM; 1236 goto out; 1237 } 1238 1239 /* 1240 * Avoid to free 'prealloc' if it can be merged with 1241 * the later extent. 
1242 */ 1243 err = insert_state(tree, prealloc, start, this_end, 1244 NULL, NULL, &bits, NULL); 1245 if (err) 1246 extent_io_tree_panic(tree, err); 1247 cache_state(prealloc, cached_state); 1248 prealloc = NULL; 1249 start = this_end + 1; 1250 goto search_again; 1251 } 1252 /* 1253 * | ---- desired range ---- | 1254 * | state | 1255 * We need to split the extent, and set the bit 1256 * on the first half 1257 */ 1258 if (state->start <= end && state->end > end) { 1259 prealloc = alloc_extent_state_atomic(prealloc); 1260 if (!prealloc) { 1261 err = -ENOMEM; 1262 goto out; 1263 } 1264 1265 err = split_state(tree, state, prealloc, end + 1); 1266 if (err) 1267 extent_io_tree_panic(tree, err); 1268 1269 set_state_bits(tree, prealloc, &bits, NULL); 1270 cache_state(prealloc, cached_state); 1271 clear_state_bit(tree, prealloc, &clear_bits, 0, NULL); 1272 prealloc = NULL; 1273 goto out; 1274 } 1275 1276 search_again: 1277 if (start > end) 1278 goto out; 1279 spin_unlock(&tree->lock); 1280 cond_resched(); 1281 first_iteration = false; 1282 goto again; 1283 1284 out: 1285 spin_unlock(&tree->lock); 1286 if (prealloc) 1287 free_extent_state(prealloc); 1288 1289 return err; 1290 } 1291 1292 /* wrappers around set/clear extent bit */ 1293 int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1294 unsigned bits, struct extent_changeset *changeset) 1295 { 1296 /* 1297 * We don't support EXTENT_LOCKED yet, as current changeset will 1298 * record any bits changed, so for EXTENT_LOCKED case, it will 1299 * either fail with -EEXIST or changeset will record the whole 1300 * range. 1301 */ 1302 BUG_ON(bits & EXTENT_LOCKED); 1303 1304 return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS, 1305 changeset); 1306 } 1307 1308 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 1309 unsigned bits, int wake, int delete, 1310 struct extent_state **cached, gfp_t mask) 1311 { 1312 return __clear_extent_bit(tree, start, end, bits, wake, delete, 1313 cached, mask, NULL); 1314 } 1315 1316 int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1317 unsigned bits, struct extent_changeset *changeset) 1318 { 1319 /* 1320 * Don't support EXTENT_LOCKED case, same reason as 1321 * set_record_extent_bits(). 1322 */ 1323 BUG_ON(bits & EXTENT_LOCKED); 1324 1325 return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS, 1326 changeset); 1327 } 1328 1329 /* 1330 * either insert or lock state struct between start and end use mask to tell 1331 * us if waiting is desired. 
1332 */ 1333 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1334 struct extent_state **cached_state) 1335 { 1336 int err; 1337 u64 failed_start; 1338 1339 while (1) { 1340 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, 1341 EXTENT_LOCKED, &failed_start, 1342 cached_state, GFP_NOFS, NULL); 1343 if (err == -EEXIST) { 1344 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); 1345 start = failed_start; 1346 } else 1347 break; 1348 WARN_ON(start > end); 1349 } 1350 return err; 1351 } 1352 1353 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) 1354 { 1355 int err; 1356 u64 failed_start; 1357 1358 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, 1359 &failed_start, NULL, GFP_NOFS, NULL); 1360 if (err == -EEXIST) { 1361 if (failed_start > start) 1362 clear_extent_bit(tree, start, failed_start - 1, 1363 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); 1364 return 0; 1365 } 1366 return 1; 1367 } 1368 1369 void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) 1370 { 1371 unsigned long index = start >> PAGE_SHIFT; 1372 unsigned long end_index = end >> PAGE_SHIFT; 1373 struct page *page; 1374 1375 while (index <= end_index) { 1376 page = find_get_page(inode->i_mapping, index); 1377 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1378 clear_page_dirty_for_io(page); 1379 put_page(page); 1380 index++; 1381 } 1382 } 1383 1384 void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) 1385 { 1386 unsigned long index = start >> PAGE_SHIFT; 1387 unsigned long end_index = end >> PAGE_SHIFT; 1388 struct page *page; 1389 1390 while (index <= end_index) { 1391 page = find_get_page(inode->i_mapping, index); 1392 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1393 __set_page_dirty_nobuffers(page); 1394 account_page_redirty(page); 1395 put_page(page); 1396 index++; 1397 } 1398 } 1399 1400 /* 1401 * helper function to set both pages and extents in the tree writeback 1402 */ 1403 static void set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) 1404 { 1405 unsigned long index = start >> PAGE_SHIFT; 1406 unsigned long end_index = end >> PAGE_SHIFT; 1407 struct page *page; 1408 1409 while (index <= end_index) { 1410 page = find_get_page(tree->mapping, index); 1411 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1412 set_page_writeback(page); 1413 put_page(page); 1414 index++; 1415 } 1416 } 1417 1418 /* find the first state struct with 'bits' set after 'start', and 1419 * return it. tree->lock must be held. NULL will returned if 1420 * nothing was found after 'start' 1421 */ 1422 static struct extent_state * 1423 find_first_extent_bit_state(struct extent_io_tree *tree, 1424 u64 start, unsigned bits) 1425 { 1426 struct rb_node *node; 1427 struct extent_state *state; 1428 1429 /* 1430 * this search will find all the extents that end after 1431 * our range starts. 1432 */ 1433 node = tree_search(tree, start); 1434 if (!node) 1435 goto out; 1436 1437 while (1) { 1438 state = rb_entry(node, struct extent_state, rb_node); 1439 if (state->end >= start && (state->state & bits)) 1440 return state; 1441 1442 node = rb_next(node); 1443 if (!node) 1444 break; 1445 } 1446 out: 1447 return NULL; 1448 } 1449 1450 /* 1451 * find the first offset in the io tree with 'bits' set. zero is 1452 * returned if we find something, and *start_ret and *end_ret are 1453 * set to reflect the state struct that was found. 1454 * 1455 * If nothing was found, 1 is returned. 
If found something, return 0. 1456 */ 1457 int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 1458 u64 *start_ret, u64 *end_ret, unsigned bits, 1459 struct extent_state **cached_state) 1460 { 1461 struct extent_state *state; 1462 struct rb_node *n; 1463 int ret = 1; 1464 1465 spin_lock(&tree->lock); 1466 if (cached_state && *cached_state) { 1467 state = *cached_state; 1468 if (state->end == start - 1 && extent_state_in_tree(state)) { 1469 n = rb_next(&state->rb_node); 1470 while (n) { 1471 state = rb_entry(n, struct extent_state, 1472 rb_node); 1473 if (state->state & bits) 1474 goto got_it; 1475 n = rb_next(n); 1476 } 1477 free_extent_state(*cached_state); 1478 *cached_state = NULL; 1479 goto out; 1480 } 1481 free_extent_state(*cached_state); 1482 *cached_state = NULL; 1483 } 1484 1485 state = find_first_extent_bit_state(tree, start, bits); 1486 got_it: 1487 if (state) { 1488 cache_state_if_flags(state, cached_state, 0); 1489 *start_ret = state->start; 1490 *end_ret = state->end; 1491 ret = 0; 1492 } 1493 out: 1494 spin_unlock(&tree->lock); 1495 return ret; 1496 } 1497 1498 /* 1499 * find a contiguous range of bytes in the file marked as delalloc, not 1500 * more than 'max_bytes'. start and end are used to return the range, 1501 * 1502 * 1 is returned if we find something, 0 if nothing was in the tree 1503 */ 1504 static noinline u64 find_delalloc_range(struct extent_io_tree *tree, 1505 u64 *start, u64 *end, u64 max_bytes, 1506 struct extent_state **cached_state) 1507 { 1508 struct rb_node *node; 1509 struct extent_state *state; 1510 u64 cur_start = *start; 1511 u64 found = 0; 1512 u64 total_bytes = 0; 1513 1514 spin_lock(&tree->lock); 1515 1516 /* 1517 * this search will find all the extents that end after 1518 * our range starts. 1519 */ 1520 node = tree_search(tree, cur_start); 1521 if (!node) { 1522 if (!found) 1523 *end = (u64)-1; 1524 goto out; 1525 } 1526 1527 while (1) { 1528 state = rb_entry(node, struct extent_state, rb_node); 1529 if (found && (state->start != cur_start || 1530 (state->state & EXTENT_BOUNDARY))) { 1531 goto out; 1532 } 1533 if (!(state->state & EXTENT_DELALLOC)) { 1534 if (!found) 1535 *end = state->end; 1536 goto out; 1537 } 1538 if (!found) { 1539 *start = state->start; 1540 *cached_state = state; 1541 atomic_inc(&state->refs); 1542 } 1543 found++; 1544 *end = state->end; 1545 cur_start = state->end + 1; 1546 node = rb_next(node); 1547 total_bytes += state->end - state->start + 1; 1548 if (total_bytes >= max_bytes) 1549 break; 1550 if (!node) 1551 break; 1552 } 1553 out: 1554 spin_unlock(&tree->lock); 1555 return found; 1556 } 1557 1558 static int __process_pages_contig(struct address_space *mapping, 1559 struct page *locked_page, 1560 pgoff_t start_index, pgoff_t end_index, 1561 unsigned long page_ops, pgoff_t *index_ret); 1562 1563 static noinline void __unlock_for_delalloc(struct inode *inode, 1564 struct page *locked_page, 1565 u64 start, u64 end) 1566 { 1567 unsigned long index = start >> PAGE_SHIFT; 1568 unsigned long end_index = end >> PAGE_SHIFT; 1569 1570 ASSERT(locked_page); 1571 if (index == locked_page->index && end_index == index) 1572 return; 1573 1574 __process_pages_contig(inode->i_mapping, locked_page, index, end_index, 1575 PAGE_UNLOCK, NULL); 1576 } 1577 1578 static noinline int lock_delalloc_pages(struct inode *inode, 1579 struct page *locked_page, 1580 u64 delalloc_start, 1581 u64 delalloc_end) 1582 { 1583 unsigned long index = delalloc_start >> PAGE_SHIFT; 1584 unsigned long index_ret = index; 1585 unsigned long end_index 
= delalloc_end >> PAGE_SHIFT; 1586 int ret; 1587 1588 ASSERT(locked_page); 1589 if (index == locked_page->index && index == end_index) 1590 return 0; 1591 1592 ret = __process_pages_contig(inode->i_mapping, locked_page, index, 1593 end_index, PAGE_LOCK, &index_ret); 1594 if (ret == -EAGAIN) 1595 __unlock_for_delalloc(inode, locked_page, delalloc_start, 1596 (u64)index_ret << PAGE_SHIFT); 1597 return ret; 1598 } 1599 1600 /* 1601 * find a contiguous range of bytes in the file marked as delalloc, not 1602 * more than 'max_bytes'. start and end are used to return the range, 1603 * 1604 * 1 is returned if we find something, 0 if nothing was in the tree 1605 */ 1606 STATIC u64 find_lock_delalloc_range(struct inode *inode, 1607 struct extent_io_tree *tree, 1608 struct page *locked_page, u64 *start, 1609 u64 *end, u64 max_bytes) 1610 { 1611 u64 delalloc_start; 1612 u64 delalloc_end; 1613 u64 found; 1614 struct extent_state *cached_state = NULL; 1615 int ret; 1616 int loops = 0; 1617 1618 again: 1619 /* step one, find a bunch of delalloc bytes starting at start */ 1620 delalloc_start = *start; 1621 delalloc_end = 0; 1622 found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, 1623 max_bytes, &cached_state); 1624 if (!found || delalloc_end <= *start) { 1625 *start = delalloc_start; 1626 *end = delalloc_end; 1627 free_extent_state(cached_state); 1628 return 0; 1629 } 1630 1631 /* 1632 * start comes from the offset of locked_page. We have to lock 1633 * pages in order, so we can't process delalloc bytes before 1634 * locked_page 1635 */ 1636 if (delalloc_start < *start) 1637 delalloc_start = *start; 1638 1639 /* 1640 * make sure to limit the number of pages we try to lock down 1641 */ 1642 if (delalloc_end + 1 - delalloc_start > max_bytes) 1643 delalloc_end = delalloc_start + max_bytes - 1; 1644 1645 /* step two, lock all the pages after the page that has start */ 1646 ret = lock_delalloc_pages(inode, locked_page, 1647 delalloc_start, delalloc_end); 1648 if (ret == -EAGAIN) { 1649 /* some of the pages are gone, lets avoid looping by 1650 * shortening the size of the delalloc range we're searching 1651 */ 1652 free_extent_state(cached_state); 1653 cached_state = NULL; 1654 if (!loops) { 1655 max_bytes = PAGE_SIZE; 1656 loops = 1; 1657 goto again; 1658 } else { 1659 found = 0; 1660 goto out_failed; 1661 } 1662 } 1663 BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */ 1664 1665 /* step three, lock the state bits for the whole range */ 1666 lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state); 1667 1668 /* then test to make sure it is all still delalloc */ 1669 ret = test_range_bit(tree, delalloc_start, delalloc_end, 1670 EXTENT_DELALLOC, 1, cached_state); 1671 if (!ret) { 1672 unlock_extent_cached(tree, delalloc_start, delalloc_end, 1673 &cached_state, GFP_NOFS); 1674 __unlock_for_delalloc(inode, locked_page, 1675 delalloc_start, delalloc_end); 1676 cond_resched(); 1677 goto again; 1678 } 1679 free_extent_state(cached_state); 1680 *start = delalloc_start; 1681 *end = delalloc_end; 1682 out_failed: 1683 return found; 1684 } 1685 1686 static int __process_pages_contig(struct address_space *mapping, 1687 struct page *locked_page, 1688 pgoff_t start_index, pgoff_t end_index, 1689 unsigned long page_ops, pgoff_t *index_ret) 1690 { 1691 unsigned long nr_pages = end_index - start_index + 1; 1692 unsigned long pages_locked = 0; 1693 pgoff_t index = start_index; 1694 struct page *pages[16]; 1695 unsigned ret; 1696 int err = 0; 1697 int i; 1698 1699 if (page_ops & PAGE_LOCK) { 1700 
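		/*
		 * PAGE_LOCK must not be combined with any other op, and the
		 * caller has to supply index_ret so that on failure we can
		 * report how far the pages actually got locked (used by the
		 * -EAGAIN handling in lock_delalloc_pages()).
		 */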
ASSERT(page_ops == PAGE_LOCK); 1701 ASSERT(index_ret && *index_ret == start_index); 1702 } 1703 1704 if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0) 1705 mapping_set_error(mapping, -EIO); 1706 1707 while (nr_pages > 0) { 1708 ret = find_get_pages_contig(mapping, index, 1709 min_t(unsigned long, 1710 nr_pages, ARRAY_SIZE(pages)), pages); 1711 if (ret == 0) { 1712 /* 1713 * Only if we're going to lock these pages, 1714 * can we find nothing at @index. 1715 */ 1716 ASSERT(page_ops & PAGE_LOCK); 1717 return ret; 1718 } 1719 1720 for (i = 0; i < ret; i++) { 1721 if (page_ops & PAGE_SET_PRIVATE2) 1722 SetPagePrivate2(pages[i]); 1723 1724 if (pages[i] == locked_page) { 1725 put_page(pages[i]); 1726 pages_locked++; 1727 continue; 1728 } 1729 if (page_ops & PAGE_CLEAR_DIRTY) 1730 clear_page_dirty_for_io(pages[i]); 1731 if (page_ops & PAGE_SET_WRITEBACK) 1732 set_page_writeback(pages[i]); 1733 if (page_ops & PAGE_SET_ERROR) 1734 SetPageError(pages[i]); 1735 if (page_ops & PAGE_END_WRITEBACK) 1736 end_page_writeback(pages[i]); 1737 if (page_ops & PAGE_UNLOCK) 1738 unlock_page(pages[i]); 1739 if (page_ops & PAGE_LOCK) { 1740 lock_page(pages[i]); 1741 if (!PageDirty(pages[i]) || 1742 pages[i]->mapping != mapping) { 1743 unlock_page(pages[i]); 1744 put_page(pages[i]); 1745 err = -EAGAIN; 1746 goto out; 1747 } 1748 } 1749 put_page(pages[i]); 1750 pages_locked++; 1751 } 1752 nr_pages -= ret; 1753 index += ret; 1754 cond_resched(); 1755 } 1756 out: 1757 if (err && index_ret) 1758 *index_ret = start_index + pages_locked - 1; 1759 return err; 1760 } 1761 1762 void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, 1763 u64 delalloc_end, struct page *locked_page, 1764 unsigned clear_bits, 1765 unsigned long page_ops) 1766 { 1767 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0, 1768 NULL, GFP_NOFS); 1769 1770 __process_pages_contig(inode->i_mapping, locked_page, 1771 start >> PAGE_SHIFT, end >> PAGE_SHIFT, 1772 page_ops, NULL); 1773 } 1774 1775 /* 1776 * count the number of bytes in the tree that have a given bit(s) 1777 * set. This can be fairly slow, except for EXTENT_DIRTY which is 1778 * cached. The total number found is returned. 1779 */ 1780 u64 count_range_bits(struct extent_io_tree *tree, 1781 u64 *start, u64 search_end, u64 max_bytes, 1782 unsigned bits, int contig) 1783 { 1784 struct rb_node *node; 1785 struct extent_state *state; 1786 u64 cur_start = *start; 1787 u64 total_bytes = 0; 1788 u64 last = 0; 1789 int found = 0; 1790 1791 if (WARN_ON(search_end <= cur_start)) 1792 return 0; 1793 1794 spin_lock(&tree->lock); 1795 if (cur_start == 0 && bits == EXTENT_DIRTY) { 1796 total_bytes = tree->dirty_bytes; 1797 goto out; 1798 } 1799 /* 1800 * this search will find all the extents that end after 1801 * our range starts. 
1802 */ 1803 node = tree_search(tree, cur_start); 1804 if (!node) 1805 goto out; 1806 1807 while (1) { 1808 state = rb_entry(node, struct extent_state, rb_node); 1809 if (state->start > search_end) 1810 break; 1811 if (contig && found && state->start > last + 1) 1812 break; 1813 if (state->end >= cur_start && (state->state & bits) == bits) { 1814 total_bytes += min(search_end, state->end) + 1 - 1815 max(cur_start, state->start); 1816 if (total_bytes >= max_bytes) 1817 break; 1818 if (!found) { 1819 *start = max(cur_start, state->start); 1820 found = 1; 1821 } 1822 last = state->end; 1823 } else if (contig && found) { 1824 break; 1825 } 1826 node = rb_next(node); 1827 if (!node) 1828 break; 1829 } 1830 out: 1831 spin_unlock(&tree->lock); 1832 return total_bytes; 1833 } 1834 1835 /* 1836 * set the private field for a given byte offset in the tree. If there isn't 1837 * an extent_state there already, this does nothing. 1838 */ 1839 static noinline int set_state_failrec(struct extent_io_tree *tree, u64 start, 1840 struct io_failure_record *failrec) 1841 { 1842 struct rb_node *node; 1843 struct extent_state *state; 1844 int ret = 0; 1845 1846 spin_lock(&tree->lock); 1847 /* 1848 * this search will find all the extents that end after 1849 * our range starts. 1850 */ 1851 node = tree_search(tree, start); 1852 if (!node) { 1853 ret = -ENOENT; 1854 goto out; 1855 } 1856 state = rb_entry(node, struct extent_state, rb_node); 1857 if (state->start != start) { 1858 ret = -ENOENT; 1859 goto out; 1860 } 1861 state->failrec = failrec; 1862 out: 1863 spin_unlock(&tree->lock); 1864 return ret; 1865 } 1866 1867 static noinline int get_state_failrec(struct extent_io_tree *tree, u64 start, 1868 struct io_failure_record **failrec) 1869 { 1870 struct rb_node *node; 1871 struct extent_state *state; 1872 int ret = 0; 1873 1874 spin_lock(&tree->lock); 1875 /* 1876 * this search will find all the extents that end after 1877 * our range starts. 1878 */ 1879 node = tree_search(tree, start); 1880 if (!node) { 1881 ret = -ENOENT; 1882 goto out; 1883 } 1884 state = rb_entry(node, struct extent_state, rb_node); 1885 if (state->start != start) { 1886 ret = -ENOENT; 1887 goto out; 1888 } 1889 *failrec = state->failrec; 1890 out: 1891 spin_unlock(&tree->lock); 1892 return ret; 1893 } 1894 1895 /* 1896 * searches a range in the state tree for a given mask. 1897 * If 'filled' == 1, this returns 1 only if every extent in the tree 1898 * has the bits set. Otherwise, 1 is returned if any bit in the 1899 * range is found set. 
1900 */ 1901 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 1902 unsigned bits, int filled, struct extent_state *cached) 1903 { 1904 struct extent_state *state = NULL; 1905 struct rb_node *node; 1906 int bitset = 0; 1907 1908 spin_lock(&tree->lock); 1909 if (cached && extent_state_in_tree(cached) && cached->start <= start && 1910 cached->end > start) 1911 node = &cached->rb_node; 1912 else 1913 node = tree_search(tree, start); 1914 while (node && start <= end) { 1915 state = rb_entry(node, struct extent_state, rb_node); 1916 1917 if (filled && state->start > start) { 1918 bitset = 0; 1919 break; 1920 } 1921 1922 if (state->start > end) 1923 break; 1924 1925 if (state->state & bits) { 1926 bitset = 1; 1927 if (!filled) 1928 break; 1929 } else if (filled) { 1930 bitset = 0; 1931 break; 1932 } 1933 1934 if (state->end == (u64)-1) 1935 break; 1936 1937 start = state->end + 1; 1938 if (start > end) 1939 break; 1940 node = rb_next(node); 1941 if (!node) { 1942 if (filled) 1943 bitset = 0; 1944 break; 1945 } 1946 } 1947 spin_unlock(&tree->lock); 1948 return bitset; 1949 } 1950 1951 /* 1952 * helper function to set a given page up to date if all the 1953 * extents in the tree for that page are up to date 1954 */ 1955 static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) 1956 { 1957 u64 start = page_offset(page); 1958 u64 end = start + PAGE_SIZE - 1; 1959 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) 1960 SetPageUptodate(page); 1961 } 1962 1963 int free_io_failure(struct btrfs_inode *inode, struct io_failure_record *rec) 1964 { 1965 int ret; 1966 int err = 0; 1967 struct extent_io_tree *failure_tree = &inode->io_failure_tree; 1968 1969 set_state_failrec(failure_tree, rec->start, NULL); 1970 ret = clear_extent_bits(failure_tree, rec->start, 1971 rec->start + rec->len - 1, 1972 EXTENT_LOCKED | EXTENT_DIRTY); 1973 if (ret) 1974 err = ret; 1975 1976 ret = clear_extent_bits(&inode->io_tree, rec->start, 1977 rec->start + rec->len - 1, 1978 EXTENT_DAMAGED); 1979 if (ret && !err) 1980 err = ret; 1981 1982 kfree(rec); 1983 return err; 1984 } 1985 1986 /* 1987 * this bypasses the standard btrfs submit functions deliberately, as 1988 * the standard behavior is to write all copies in a raid setup. here we only 1989 * want to write the one bad copy. so we do the mapping for ourselves and issue 1990 * submit_bio directly. 1991 * to avoid any synchronization issues, wait for the data after writing, which 1992 * actually prevents the read that triggered the error from finishing. 1993 * currently, there can be no more than two copies of every data bit. thus, 1994 * exactly one rewrite is required. 
 */
int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length,
		u64 logical, struct page *page,
		unsigned int pg_offset, int mirror_num)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct bio *bio;
	struct btrfs_device *dev;
	u64 map_length = 0;
	u64 sector;
	struct btrfs_bio *bbio = NULL;
	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
	int ret;

	ASSERT(!(fs_info->sb->s_flags & MS_RDONLY));
	BUG_ON(!mirror_num);

	/* we can't repair anything in raid56 yet */
	if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num))
		return 0;

	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
	if (!bio)
		return -EIO;
	bio->bi_iter.bi_size = 0;
	map_length = length;

	/*
	 * Avoid races with device replace and make sure our bbio has devices
	 * associated to its stripes that don't go away while we are doing the
	 * read repair operation.
	 */
	btrfs_bio_counter_inc_blocked(fs_info);
	ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
			      &map_length, &bbio, mirror_num);
	if (ret) {
		btrfs_bio_counter_dec(fs_info);
		bio_put(bio);
		return -EIO;
	}
	BUG_ON(mirror_num != bbio->mirror_num);
	sector = bbio->stripes[mirror_num-1].physical >> 9;
	bio->bi_iter.bi_sector = sector;
	dev = bbio->stripes[mirror_num-1].dev;
	btrfs_put_bbio(bbio);
	if (!dev || !dev->bdev || !dev->writeable) {
		btrfs_bio_counter_dec(fs_info);
		bio_put(bio);
		return -EIO;
	}
	bio->bi_bdev = dev->bdev;
	bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
	bio_add_page(bio, page, length, pg_offset);

	if (btrfsic_submit_bio_wait(bio)) {
		/* try to remap that extent elsewhere?
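		 * For now the failed write is only accounted in the device
		 * stats (BTRFS_DEV_STAT_WRITE_ERRS) and we return -EIO.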
*/ 2051 btrfs_bio_counter_dec(fs_info); 2052 bio_put(bio); 2053 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); 2054 return -EIO; 2055 } 2056 2057 btrfs_info_rl_in_rcu(fs_info, 2058 "read error corrected: ino %llu off %llu (dev %s sector %llu)", 2059 btrfs_ino(inode), start, 2060 rcu_str_deref(dev->name), sector); 2061 btrfs_bio_counter_dec(fs_info); 2062 bio_put(bio); 2063 return 0; 2064 } 2065 2066 int repair_eb_io_failure(struct btrfs_fs_info *fs_info, 2067 struct extent_buffer *eb, int mirror_num) 2068 { 2069 u64 start = eb->start; 2070 unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); 2071 int ret = 0; 2072 2073 if (fs_info->sb->s_flags & MS_RDONLY) 2074 return -EROFS; 2075 2076 for (i = 0; i < num_pages; i++) { 2077 struct page *p = eb->pages[i]; 2078 2079 ret = repair_io_failure(BTRFS_I(fs_info->btree_inode), start, 2080 PAGE_SIZE, start, p, 2081 start - page_offset(p), mirror_num); 2082 if (ret) 2083 break; 2084 start += PAGE_SIZE; 2085 } 2086 2087 return ret; 2088 } 2089 2090 /* 2091 * each time an IO finishes, we do a fast check in the IO failure tree 2092 * to see if we need to process or clean up an io_failure_record 2093 */ 2094 int clean_io_failure(struct btrfs_inode *inode, u64 start, struct page *page, 2095 unsigned int pg_offset) 2096 { 2097 u64 private; 2098 struct io_failure_record *failrec; 2099 struct btrfs_fs_info *fs_info = inode->root->fs_info; 2100 struct extent_state *state; 2101 int num_copies; 2102 int ret; 2103 2104 private = 0; 2105 ret = count_range_bits(&inode->io_failure_tree, &private, 2106 (u64)-1, 1, EXTENT_DIRTY, 0); 2107 if (!ret) 2108 return 0; 2109 2110 ret = get_state_failrec(&inode->io_failure_tree, start, 2111 &failrec); 2112 if (ret) 2113 return 0; 2114 2115 BUG_ON(!failrec->this_mirror); 2116 2117 if (failrec->in_validation) { 2118 /* there was no real error, just free the record */ 2119 btrfs_debug(fs_info, 2120 "clean_io_failure: freeing dummy error at %llu", 2121 failrec->start); 2122 goto out; 2123 } 2124 if (fs_info->sb->s_flags & MS_RDONLY) 2125 goto out; 2126 2127 spin_lock(&inode->io_tree.lock); 2128 state = find_first_extent_bit_state(&inode->io_tree, 2129 failrec->start, 2130 EXTENT_LOCKED); 2131 spin_unlock(&inode->io_tree.lock); 2132 2133 if (state && state->start <= failrec->start && 2134 state->end >= failrec->start + failrec->len - 1) { 2135 num_copies = btrfs_num_copies(fs_info, failrec->logical, 2136 failrec->len); 2137 if (num_copies > 1) { 2138 repair_io_failure(inode, start, failrec->len, 2139 failrec->logical, page, 2140 pg_offset, failrec->failed_mirror); 2141 } 2142 } 2143 2144 out: 2145 free_io_failure(inode, failrec); 2146 2147 return 0; 2148 } 2149 2150 /* 2151 * Can be called when 2152 * - hold extent lock 2153 * - under ordered extent 2154 * - the inode is freeing 2155 */ 2156 void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end) 2157 { 2158 struct extent_io_tree *failure_tree = &inode->io_failure_tree; 2159 struct io_failure_record *failrec; 2160 struct extent_state *state, *next; 2161 2162 if (RB_EMPTY_ROOT(&failure_tree->state)) 2163 return; 2164 2165 spin_lock(&failure_tree->lock); 2166 state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY); 2167 while (state) { 2168 if (state->start > end) 2169 break; 2170 2171 ASSERT(state->end <= end); 2172 2173 next = next_state(state); 2174 2175 failrec = state->failrec; 2176 free_extent_state(state); 2177 kfree(failrec); 2178 2179 state = next; 2180 } 2181 spin_unlock(&failure_tree->lock); 2182 } 2183 
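/*
 * Rough sketch of how the io failure helpers below fit together (a summary
 * of how they call each other in this file, not a complete description):
 *
 *   read endio sees an error
 *     bio_readpage_error()
 *       btrfs_get_io_failure_record()  look up or create the record
 *       btrfs_check_repairable()       decide if another mirror is worth trying
 *       btrfs_create_repair_bio()      build the retry read
 *       submit_bio_hook()              resubmit against failrec->this_mirror
 *
 *   a later read of the range completes without errors
 *     clean_io_failure()
 *       repair_io_failure()            rewrite the bad copy with the good data
 *       free_io_failure()              clear the tree bits and free the record
 */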
2184 int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, 2185 struct io_failure_record **failrec_ret) 2186 { 2187 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2188 struct io_failure_record *failrec; 2189 struct extent_map *em; 2190 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 2191 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 2192 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 2193 int ret; 2194 u64 logical; 2195 2196 ret = get_state_failrec(failure_tree, start, &failrec); 2197 if (ret) { 2198 failrec = kzalloc(sizeof(*failrec), GFP_NOFS); 2199 if (!failrec) 2200 return -ENOMEM; 2201 2202 failrec->start = start; 2203 failrec->len = end - start + 1; 2204 failrec->this_mirror = 0; 2205 failrec->bio_flags = 0; 2206 failrec->in_validation = 0; 2207 2208 read_lock(&em_tree->lock); 2209 em = lookup_extent_mapping(em_tree, start, failrec->len); 2210 if (!em) { 2211 read_unlock(&em_tree->lock); 2212 kfree(failrec); 2213 return -EIO; 2214 } 2215 2216 if (em->start > start || em->start + em->len <= start) { 2217 free_extent_map(em); 2218 em = NULL; 2219 } 2220 read_unlock(&em_tree->lock); 2221 if (!em) { 2222 kfree(failrec); 2223 return -EIO; 2224 } 2225 2226 logical = start - em->start; 2227 logical = em->block_start + logical; 2228 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 2229 logical = em->block_start; 2230 failrec->bio_flags = EXTENT_BIO_COMPRESSED; 2231 extent_set_compress_type(&failrec->bio_flags, 2232 em->compress_type); 2233 } 2234 2235 btrfs_debug(fs_info, 2236 "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu", 2237 logical, start, failrec->len); 2238 2239 failrec->logical = logical; 2240 free_extent_map(em); 2241 2242 /* set the bits in the private failure tree */ 2243 ret = set_extent_bits(failure_tree, start, end, 2244 EXTENT_LOCKED | EXTENT_DIRTY); 2245 if (ret >= 0) 2246 ret = set_state_failrec(failure_tree, start, failrec); 2247 /* set the bits in the inode's tree */ 2248 if (ret >= 0) 2249 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED); 2250 if (ret < 0) { 2251 kfree(failrec); 2252 return ret; 2253 } 2254 } else { 2255 btrfs_debug(fs_info, 2256 "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d", 2257 failrec->logical, failrec->start, failrec->len, 2258 failrec->in_validation); 2259 /* 2260 * when data can be on disk more than twice, add to failrec here 2261 * (e.g. with a list for failed_mirror) to make 2262 * clean_io_failure() clean all those errors at once. 2263 */ 2264 } 2265 2266 *failrec_ret = failrec; 2267 2268 return 0; 2269 } 2270 2271 int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, 2272 struct io_failure_record *failrec, int failed_mirror) 2273 { 2274 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2275 int num_copies; 2276 2277 num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len); 2278 if (num_copies == 1) { 2279 /* 2280 * we only have a single copy of the data, so don't bother with 2281 * all the retry and error correction code that follows. no 2282 * matter what the error is, it is very likely to persist. 
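 * (with a single copy there is no other mirror to read from and nothing
 * for repair_io_failure to rewrite)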
2283 */ 2284 btrfs_debug(fs_info, 2285 "Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d", 2286 num_copies, failrec->this_mirror, failed_mirror); 2287 return 0; 2288 } 2289 2290 /* 2291 * there are two premises: 2292 * a) deliver good data to the caller 2293 * b) correct the bad sectors on disk 2294 */ 2295 if (failed_bio->bi_vcnt > 1) { 2296 /* 2297 * to fulfill b), we need to know the exact failing sectors, as 2298 * we don't want to rewrite any more than the failed ones. thus, 2299 * we need separate read requests for the failed bio 2300 * 2301 * if the following BUG_ON triggers, our validation request got 2302 * merged. we need separate requests for our algorithm to work. 2303 */ 2304 BUG_ON(failrec->in_validation); 2305 failrec->in_validation = 1; 2306 failrec->this_mirror = failed_mirror; 2307 } else { 2308 /* 2309 * we're ready to fulfill a) and b) alongside. get a good copy 2310 * of the failed sector and if we succeed, we have setup 2311 * everything for repair_io_failure to do the rest for us. 2312 */ 2313 if (failrec->in_validation) { 2314 BUG_ON(failrec->this_mirror != failed_mirror); 2315 failrec->in_validation = 0; 2316 failrec->this_mirror = 0; 2317 } 2318 failrec->failed_mirror = failed_mirror; 2319 failrec->this_mirror++; 2320 if (failrec->this_mirror == failed_mirror) 2321 failrec->this_mirror++; 2322 } 2323 2324 if (failrec->this_mirror > num_copies) { 2325 btrfs_debug(fs_info, 2326 "Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d", 2327 num_copies, failrec->this_mirror, failed_mirror); 2328 return 0; 2329 } 2330 2331 return 1; 2332 } 2333 2334 2335 struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, 2336 struct io_failure_record *failrec, 2337 struct page *page, int pg_offset, int icsum, 2338 bio_end_io_t *endio_func, void *data) 2339 { 2340 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2341 struct bio *bio; 2342 struct btrfs_io_bio *btrfs_failed_bio; 2343 struct btrfs_io_bio *btrfs_bio; 2344 2345 bio = btrfs_io_bio_alloc(GFP_NOFS, 1); 2346 if (!bio) 2347 return NULL; 2348 2349 bio->bi_end_io = endio_func; 2350 bio->bi_iter.bi_sector = failrec->logical >> 9; 2351 bio->bi_bdev = fs_info->fs_devices->latest_bdev; 2352 bio->bi_iter.bi_size = 0; 2353 bio->bi_private = data; 2354 2355 btrfs_failed_bio = btrfs_io_bio(failed_bio); 2356 if (btrfs_failed_bio->csum) { 2357 u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); 2358 2359 btrfs_bio = btrfs_io_bio(bio); 2360 btrfs_bio->csum = btrfs_bio->csum_inline; 2361 icsum *= csum_size; 2362 memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum, 2363 csum_size); 2364 } 2365 2366 bio_add_page(bio, page, failrec->len, pg_offset); 2367 2368 return bio; 2369 } 2370 2371 /* 2372 * this is a generic handler for readpage errors (default 2373 * readpage_io_failed_hook). if other copies exist, read those and write back 2374 * good data to the failed position. 
does not investigate in remapping the 2375 * failed extent elsewhere, hoping the device will be smart enough to do this as 2376 * needed 2377 */ 2378 2379 static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, 2380 struct page *page, u64 start, u64 end, 2381 int failed_mirror) 2382 { 2383 struct io_failure_record *failrec; 2384 struct inode *inode = page->mapping->host; 2385 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 2386 struct bio *bio; 2387 int read_mode = 0; 2388 int ret; 2389 2390 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); 2391 2392 ret = btrfs_get_io_failure_record(inode, start, end, &failrec); 2393 if (ret) 2394 return ret; 2395 2396 ret = btrfs_check_repairable(inode, failed_bio, failrec, failed_mirror); 2397 if (!ret) { 2398 free_io_failure(BTRFS_I(inode), failrec); 2399 return -EIO; 2400 } 2401 2402 if (failed_bio->bi_vcnt > 1) 2403 read_mode |= REQ_FAILFAST_DEV; 2404 2405 phy_offset >>= inode->i_sb->s_blocksize_bits; 2406 bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, 2407 start - page_offset(page), 2408 (int)phy_offset, failed_bio->bi_end_io, 2409 NULL); 2410 if (!bio) { 2411 free_io_failure(BTRFS_I(inode), failrec); 2412 return -EIO; 2413 } 2414 bio_set_op_attrs(bio, REQ_OP_READ, read_mode); 2415 2416 btrfs_debug(btrfs_sb(inode->i_sb), 2417 "Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d", 2418 read_mode, failrec->this_mirror, failrec->in_validation); 2419 2420 ret = tree->ops->submit_bio_hook(inode, bio, failrec->this_mirror, 2421 failrec->bio_flags, 0); 2422 if (ret) { 2423 free_io_failure(BTRFS_I(inode), failrec); 2424 bio_put(bio); 2425 } 2426 2427 return ret; 2428 } 2429 2430 /* lots and lots of room for performance fixes in the end_bio funcs */ 2431 2432 void end_extent_writepage(struct page *page, int err, u64 start, u64 end) 2433 { 2434 int uptodate = (err == 0); 2435 struct extent_io_tree *tree; 2436 int ret = 0; 2437 2438 tree = &BTRFS_I(page->mapping->host)->io_tree; 2439 2440 if (tree->ops && tree->ops->writepage_end_io_hook) 2441 tree->ops->writepage_end_io_hook(page, start, end, NULL, 2442 uptodate); 2443 2444 if (!uptodate) { 2445 ClearPageUptodate(page); 2446 SetPageError(page); 2447 ret = ret < 0 ? ret : -EIO; 2448 mapping_set_error(page->mapping, ret); 2449 } 2450 } 2451 2452 /* 2453 * after a writepage IO is done, we need to: 2454 * clear the uptodate bits on error 2455 * clear the writeback bits in the extent tree for this IO 2456 * end_page_writeback if the page has no more pending IO 2457 * 2458 * Scheduling is not allowed, so the extent state tree is expected 2459 * to have one and only one object corresponding to this IO. 2460 */ 2461 static void end_bio_extent_writepage(struct bio *bio) 2462 { 2463 struct bio_vec *bvec; 2464 u64 start; 2465 u64 end; 2466 int i; 2467 2468 bio_for_each_segment_all(bvec, bio, i) { 2469 struct page *page = bvec->bv_page; 2470 struct inode *inode = page->mapping->host; 2471 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2472 2473 /* We always issue full-page reads, but if some block 2474 * in a page fails to read, blk_update_request() will 2475 * advance bv_offset and adjust bv_len to compensate. 2476 * Print a warning for nonzero offsets, and an error 2477 * if they don't add up to a full page. 
*/ 2478 if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) { 2479 if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE) 2480 btrfs_err(fs_info, 2481 "partial page write in btrfs with offset %u and length %u", 2482 bvec->bv_offset, bvec->bv_len); 2483 else 2484 btrfs_info(fs_info, 2485 "incomplete page write in btrfs with offset %u and length %u", 2486 bvec->bv_offset, bvec->bv_len); 2487 } 2488 2489 start = page_offset(page); 2490 end = start + bvec->bv_offset + bvec->bv_len - 1; 2491 2492 end_extent_writepage(page, bio->bi_error, start, end); 2493 end_page_writeback(page); 2494 } 2495 2496 bio_put(bio); 2497 } 2498 2499 static void 2500 endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len, 2501 int uptodate) 2502 { 2503 struct extent_state *cached = NULL; 2504 u64 end = start + len - 1; 2505 2506 if (uptodate && tree->track_uptodate) 2507 set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC); 2508 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); 2509 } 2510 2511 /* 2512 * after a readpage IO is done, we need to: 2513 * clear the uptodate bits on error 2514 * set the uptodate bits if things worked 2515 * set the page up to date if all extents in the tree are uptodate 2516 * clear the lock bit in the extent tree 2517 * unlock the page if there are no other extents locked for it 2518 * 2519 * Scheduling is not allowed, so the extent state tree is expected 2520 * to have one and only one object corresponding to this IO. 2521 */ 2522 static void end_bio_extent_readpage(struct bio *bio) 2523 { 2524 struct bio_vec *bvec; 2525 int uptodate = !bio->bi_error; 2526 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 2527 struct extent_io_tree *tree; 2528 u64 offset = 0; 2529 u64 start; 2530 u64 end; 2531 u64 len; 2532 u64 extent_start = 0; 2533 u64 extent_len = 0; 2534 int mirror; 2535 int ret; 2536 int i; 2537 2538 bio_for_each_segment_all(bvec, bio, i) { 2539 struct page *page = bvec->bv_page; 2540 struct inode *inode = page->mapping->host; 2541 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2542 2543 btrfs_debug(fs_info, 2544 "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", 2545 (u64)bio->bi_iter.bi_sector, bio->bi_error, 2546 io_bio->mirror_num); 2547 tree = &BTRFS_I(inode)->io_tree; 2548 2549 /* We always issue full-page reads, but if some block 2550 * in a page fails to read, blk_update_request() will 2551 * advance bv_offset and adjust bv_len to compensate. 2552 * Print a warning for nonzero offsets, and an error 2553 * if they don't add up to a full page. 
*/ 2554 if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) { 2555 if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE) 2556 btrfs_err(fs_info, 2557 "partial page read in btrfs with offset %u and length %u", 2558 bvec->bv_offset, bvec->bv_len); 2559 else 2560 btrfs_info(fs_info, 2561 "incomplete page read in btrfs with offset %u and length %u", 2562 bvec->bv_offset, bvec->bv_len); 2563 } 2564 2565 start = page_offset(page); 2566 end = start + bvec->bv_offset + bvec->bv_len - 1; 2567 len = bvec->bv_len; 2568 2569 mirror = io_bio->mirror_num; 2570 if (likely(uptodate && tree->ops)) { 2571 ret = tree->ops->readpage_end_io_hook(io_bio, offset, 2572 page, start, end, 2573 mirror); 2574 if (ret) 2575 uptodate = 0; 2576 else 2577 clean_io_failure(BTRFS_I(inode), start, 2578 page, 0); 2579 } 2580 2581 if (likely(uptodate)) 2582 goto readpage_ok; 2583 2584 if (tree->ops) { 2585 ret = tree->ops->readpage_io_failed_hook(page, mirror); 2586 if (!ret && !bio->bi_error) 2587 uptodate = 1; 2588 } else { 2589 /* 2590 * The generic bio_readpage_error handles errors the 2591 * following way: If possible, new read requests are 2592 * created and submitted and will end up in 2593 * end_bio_extent_readpage as well (if we're lucky, not 2594 * in the !uptodate case). In that case it returns 0 and 2595 * we just go on with the next page in our bio. If it 2596 * can't handle the error it will return -EIO and we 2597 * remain responsible for that page. 2598 */ 2599 ret = bio_readpage_error(bio, offset, page, start, end, 2600 mirror); 2601 if (ret == 0) { 2602 uptodate = !bio->bi_error; 2603 offset += len; 2604 continue; 2605 } 2606 } 2607 readpage_ok: 2608 if (likely(uptodate)) { 2609 loff_t i_size = i_size_read(inode); 2610 pgoff_t end_index = i_size >> PAGE_SHIFT; 2611 unsigned off; 2612 2613 /* Zero out the end if this page straddles i_size */ 2614 off = i_size & (PAGE_SIZE-1); 2615 if (page->index == end_index && off) 2616 zero_user_segment(page, off, PAGE_SIZE); 2617 SetPageUptodate(page); 2618 } else { 2619 ClearPageUptodate(page); 2620 SetPageError(page); 2621 } 2622 unlock_page(page); 2623 offset += len; 2624 2625 if (unlikely(!uptodate)) { 2626 if (extent_len) { 2627 endio_readpage_release_extent(tree, 2628 extent_start, 2629 extent_len, 1); 2630 extent_start = 0; 2631 extent_len = 0; 2632 } 2633 endio_readpage_release_extent(tree, start, 2634 end - start + 1, 0); 2635 } else if (!extent_len) { 2636 extent_start = start; 2637 extent_len = end + 1 - start; 2638 } else if (extent_start + extent_len == start) { 2639 extent_len += end + 1 - start; 2640 } else { 2641 endio_readpage_release_extent(tree, extent_start, 2642 extent_len, uptodate); 2643 extent_start = start; 2644 extent_len = end + 1 - start; 2645 } 2646 } 2647 2648 if (extent_len) 2649 endio_readpage_release_extent(tree, extent_start, extent_len, 2650 uptodate); 2651 if (io_bio->end_io) 2652 io_bio->end_io(io_bio, bio->bi_error); 2653 bio_put(bio); 2654 } 2655 2656 /* 2657 * this allocates from the btrfs_bioset. 
We're returning a bio right now 2658 * but you can call btrfs_io_bio for the appropriate container_of magic 2659 */ 2660 struct bio * 2661 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 2662 gfp_t gfp_flags) 2663 { 2664 struct btrfs_io_bio *btrfs_bio; 2665 struct bio *bio; 2666 2667 bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset); 2668 2669 if (bio == NULL && (current->flags & PF_MEMALLOC)) { 2670 while (!bio && (nr_vecs /= 2)) { 2671 bio = bio_alloc_bioset(gfp_flags, 2672 nr_vecs, btrfs_bioset); 2673 } 2674 } 2675 2676 if (bio) { 2677 bio->bi_bdev = bdev; 2678 bio->bi_iter.bi_sector = first_sector; 2679 btrfs_bio = btrfs_io_bio(bio); 2680 btrfs_bio->csum = NULL; 2681 btrfs_bio->csum_allocated = NULL; 2682 btrfs_bio->end_io = NULL; 2683 } 2684 return bio; 2685 } 2686 2687 struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) 2688 { 2689 struct btrfs_io_bio *btrfs_bio; 2690 struct bio *new; 2691 2692 new = bio_clone_bioset(bio, gfp_mask, btrfs_bioset); 2693 if (new) { 2694 btrfs_bio = btrfs_io_bio(new); 2695 btrfs_bio->csum = NULL; 2696 btrfs_bio->csum_allocated = NULL; 2697 btrfs_bio->end_io = NULL; 2698 } 2699 return new; 2700 } 2701 2702 /* this also allocates from the btrfs_bioset */ 2703 struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) 2704 { 2705 struct btrfs_io_bio *btrfs_bio; 2706 struct bio *bio; 2707 2708 bio = bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset); 2709 if (bio) { 2710 btrfs_bio = btrfs_io_bio(bio); 2711 btrfs_bio->csum = NULL; 2712 btrfs_bio->csum_allocated = NULL; 2713 btrfs_bio->end_io = NULL; 2714 } 2715 return bio; 2716 } 2717 2718 2719 static int __must_check submit_one_bio(struct bio *bio, int mirror_num, 2720 unsigned long bio_flags) 2721 { 2722 int ret = 0; 2723 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 2724 struct page *page = bvec->bv_page; 2725 struct extent_io_tree *tree = bio->bi_private; 2726 u64 start; 2727 2728 start = page_offset(page) + bvec->bv_offset; 2729 2730 bio->bi_private = NULL; 2731 bio_get(bio); 2732 2733 if (tree->ops) 2734 ret = tree->ops->submit_bio_hook(page->mapping->host, bio, 2735 mirror_num, bio_flags, start); 2736 else 2737 btrfsic_submit_bio(bio); 2738 2739 bio_put(bio); 2740 return ret; 2741 } 2742 2743 static int merge_bio(struct extent_io_tree *tree, struct page *page, 2744 unsigned long offset, size_t size, struct bio *bio, 2745 unsigned long bio_flags) 2746 { 2747 int ret = 0; 2748 if (tree->ops) 2749 ret = tree->ops->merge_bio_hook(page, offset, size, bio, 2750 bio_flags); 2751 return ret; 2752 2753 } 2754 2755 static int submit_extent_page(int op, int op_flags, struct extent_io_tree *tree, 2756 struct writeback_control *wbc, 2757 struct page *page, sector_t sector, 2758 size_t size, unsigned long offset, 2759 struct block_device *bdev, 2760 struct bio **bio_ret, 2761 bio_end_io_t end_io_func, 2762 int mirror_num, 2763 unsigned long prev_bio_flags, 2764 unsigned long bio_flags, 2765 bool force_bio_submit) 2766 { 2767 int ret = 0; 2768 struct bio *bio; 2769 int contig = 0; 2770 int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; 2771 size_t page_size = min_t(size_t, size, PAGE_SIZE); 2772 2773 if (bio_ret && *bio_ret) { 2774 bio = *bio_ret; 2775 if (old_compressed) 2776 contig = bio->bi_iter.bi_sector == sector; 2777 else 2778 contig = bio_end_sector(bio) == sector; 2779 2780 if (prev_bio_flags != bio_flags || !contig || 2781 force_bio_submit || 2782 merge_bio(tree, page, offset, page_size, bio, bio_flags) || 2783 bio_add_page(bio, 
page, page_size, offset) < page_size) { 2784 ret = submit_one_bio(bio, mirror_num, prev_bio_flags); 2785 if (ret < 0) { 2786 *bio_ret = NULL; 2787 return ret; 2788 } 2789 bio = NULL; 2790 } else { 2791 if (wbc) 2792 wbc_account_io(wbc, page, page_size); 2793 return 0; 2794 } 2795 } 2796 2797 bio = btrfs_bio_alloc(bdev, sector, BIO_MAX_PAGES, 2798 GFP_NOFS | __GFP_HIGH); 2799 if (!bio) 2800 return -ENOMEM; 2801 2802 bio_add_page(bio, page, page_size, offset); 2803 bio->bi_end_io = end_io_func; 2804 bio->bi_private = tree; 2805 bio_set_op_attrs(bio, op, op_flags); 2806 if (wbc) { 2807 wbc_init_bio(wbc, bio); 2808 wbc_account_io(wbc, page, page_size); 2809 } 2810 2811 if (bio_ret) 2812 *bio_ret = bio; 2813 else 2814 ret = submit_one_bio(bio, mirror_num, bio_flags); 2815 2816 return ret; 2817 } 2818 2819 static void attach_extent_buffer_page(struct extent_buffer *eb, 2820 struct page *page) 2821 { 2822 if (!PagePrivate(page)) { 2823 SetPagePrivate(page); 2824 get_page(page); 2825 set_page_private(page, (unsigned long)eb); 2826 } else { 2827 WARN_ON(page->private != (unsigned long)eb); 2828 } 2829 } 2830 2831 void set_page_extent_mapped(struct page *page) 2832 { 2833 if (!PagePrivate(page)) { 2834 SetPagePrivate(page); 2835 get_page(page); 2836 set_page_private(page, EXTENT_PAGE_PRIVATE); 2837 } 2838 } 2839 2840 static struct extent_map * 2841 __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, 2842 u64 start, u64 len, get_extent_t *get_extent, 2843 struct extent_map **em_cached) 2844 { 2845 struct extent_map *em; 2846 2847 if (em_cached && *em_cached) { 2848 em = *em_cached; 2849 if (extent_map_in_tree(em) && start >= em->start && 2850 start < extent_map_end(em)) { 2851 atomic_inc(&em->refs); 2852 return em; 2853 } 2854 2855 free_extent_map(em); 2856 *em_cached = NULL; 2857 } 2858 2859 em = get_extent(BTRFS_I(inode), page, pg_offset, start, len, 0); 2860 if (em_cached && !IS_ERR_OR_NULL(em)) { 2861 BUG_ON(*em_cached); 2862 atomic_inc(&em->refs); 2863 *em_cached = em; 2864 } 2865 return em; 2866 } 2867 /* 2868 * basic readpage implementation. 
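 * the page is walked in blocksize-aligned steps: each step maps cur to an
 * extent map, zero-fills holes and the area past i_size, and submits a
 * read for everything else.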
Locked extent state structs are inserted 2869 * into the tree that are removed when the IO is done (by the end_io 2870 * handlers) 2871 * XXX JDM: This needs looking at to ensure proper page locking 2872 * return 0 on success, otherwise return error 2873 */ 2874 static int __do_readpage(struct extent_io_tree *tree, 2875 struct page *page, 2876 get_extent_t *get_extent, 2877 struct extent_map **em_cached, 2878 struct bio **bio, int mirror_num, 2879 unsigned long *bio_flags, int read_flags, 2880 u64 *prev_em_start) 2881 { 2882 struct inode *inode = page->mapping->host; 2883 u64 start = page_offset(page); 2884 u64 page_end = start + PAGE_SIZE - 1; 2885 u64 end; 2886 u64 cur = start; 2887 u64 extent_offset; 2888 u64 last_byte = i_size_read(inode); 2889 u64 block_start; 2890 u64 cur_end; 2891 sector_t sector; 2892 struct extent_map *em; 2893 struct block_device *bdev; 2894 int ret = 0; 2895 int nr = 0; 2896 size_t pg_offset = 0; 2897 size_t iosize; 2898 size_t disk_io_size; 2899 size_t blocksize = inode->i_sb->s_blocksize; 2900 unsigned long this_bio_flag = 0; 2901 2902 set_page_extent_mapped(page); 2903 2904 end = page_end; 2905 if (!PageUptodate(page)) { 2906 if (cleancache_get_page(page) == 0) { 2907 BUG_ON(blocksize != PAGE_SIZE); 2908 unlock_extent(tree, start, end); 2909 goto out; 2910 } 2911 } 2912 2913 if (page->index == last_byte >> PAGE_SHIFT) { 2914 char *userpage; 2915 size_t zero_offset = last_byte & (PAGE_SIZE - 1); 2916 2917 if (zero_offset) { 2918 iosize = PAGE_SIZE - zero_offset; 2919 userpage = kmap_atomic(page); 2920 memset(userpage + zero_offset, 0, iosize); 2921 flush_dcache_page(page); 2922 kunmap_atomic(userpage); 2923 } 2924 } 2925 while (cur <= end) { 2926 bool force_bio_submit = false; 2927 2928 if (cur >= last_byte) { 2929 char *userpage; 2930 struct extent_state *cached = NULL; 2931 2932 iosize = PAGE_SIZE - pg_offset; 2933 userpage = kmap_atomic(page); 2934 memset(userpage + pg_offset, 0, iosize); 2935 flush_dcache_page(page); 2936 kunmap_atomic(userpage); 2937 set_extent_uptodate(tree, cur, cur + iosize - 1, 2938 &cached, GFP_NOFS); 2939 unlock_extent_cached(tree, cur, 2940 cur + iosize - 1, 2941 &cached, GFP_NOFS); 2942 break; 2943 } 2944 em = __get_extent_map(inode, page, pg_offset, cur, 2945 end - cur + 1, get_extent, em_cached); 2946 if (IS_ERR_OR_NULL(em)) { 2947 SetPageError(page); 2948 unlock_extent(tree, cur, end); 2949 break; 2950 } 2951 extent_offset = cur - em->start; 2952 BUG_ON(extent_map_end(em) <= cur); 2953 BUG_ON(end < cur); 2954 2955 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 2956 this_bio_flag |= EXTENT_BIO_COMPRESSED; 2957 extent_set_compress_type(&this_bio_flag, 2958 em->compress_type); 2959 } 2960 2961 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2962 cur_end = min(extent_map_end(em) - 1, end); 2963 iosize = ALIGN(iosize, blocksize); 2964 if (this_bio_flag & EXTENT_BIO_COMPRESSED) { 2965 disk_io_size = em->block_len; 2966 sector = em->block_start >> 9; 2967 } else { 2968 sector = (em->block_start + extent_offset) >> 9; 2969 disk_io_size = iosize; 2970 } 2971 bdev = em->bdev; 2972 block_start = em->block_start; 2973 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 2974 block_start = EXTENT_MAP_HOLE; 2975 2976 /* 2977 * If we have a file range that points to a compressed extent 2978 * and it's followed by a consecutive file range that points to 2979 * to the same compressed extent (possibly with a different 2980 * offset and/or length, so it either points to the whole extent 2981 * or only part of it), we must make sure we do 
not submit a 2982 * single bio to populate the pages for the 2 ranges because 2983 * this makes the compressed extent read zero out the pages 2984 * belonging to the 2nd range. Imagine the following scenario: 2985 * 2986 * File layout 2987 * [0 - 8K] [8K - 24K] 2988 * | | 2989 * | | 2990 * points to extent X, points to extent X, 2991 * offset 4K, length of 8K offset 0, length 16K 2992 * 2993 * [extent X, compressed length = 4K uncompressed length = 16K] 2994 * 2995 * If the bio to read the compressed extent covers both ranges, 2996 * it will decompress extent X into the pages belonging to the 2997 * first range and then it will stop, zeroing out the remaining 2998 * pages that belong to the other range that points to extent X. 2999 * So here we make sure we submit 2 bios, one for the first 3000 * range and another one for the second range. Both will target 3001 * the same physical extent from disk, but we can't currently 3002 * make the compressed bio endio callback populate the pages 3003 * for both ranges because each compressed bio is tightly 3004 * coupled with a single extent map, and each range can have 3005 * an extent map with a different offset value relative to the 3006 * uncompressed data of our extent and different lengths. This 3007 * is a corner case so we prioritize correctness over 3008 * performance (submitting 2 bios for the same extent). 3009 */ 3010 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) && 3011 prev_em_start && *prev_em_start != (u64)-1 && 3012 *prev_em_start != em->orig_start) 3013 force_bio_submit = true; 3014 3015 if (prev_em_start) 3016 *prev_em_start = em->orig_start; 3017 3018 free_extent_map(em); 3019 em = NULL; 3020 3021 /* we've found a hole, just zero and go on */ 3022 if (block_start == EXTENT_MAP_HOLE) { 3023 char *userpage; 3024 struct extent_state *cached = NULL; 3025 3026 userpage = kmap_atomic(page); 3027 memset(userpage + pg_offset, 0, iosize); 3028 flush_dcache_page(page); 3029 kunmap_atomic(userpage); 3030 3031 set_extent_uptodate(tree, cur, cur + iosize - 1, 3032 &cached, GFP_NOFS); 3033 unlock_extent_cached(tree, cur, 3034 cur + iosize - 1, 3035 &cached, GFP_NOFS); 3036 cur = cur + iosize; 3037 pg_offset += iosize; 3038 continue; 3039 } 3040 /* the get_extent function already copied into the page */ 3041 if (test_range_bit(tree, cur, cur_end, 3042 EXTENT_UPTODATE, 1, NULL)) { 3043 check_page_uptodate(tree, page); 3044 unlock_extent(tree, cur, cur + iosize - 1); 3045 cur = cur + iosize; 3046 pg_offset += iosize; 3047 continue; 3048 } 3049 /* we have an inline extent but it didn't get marked up 3050 * to date.
Error out 3051 */ 3052 if (block_start == EXTENT_MAP_INLINE) { 3053 SetPageError(page); 3054 unlock_extent(tree, cur, cur + iosize - 1); 3055 cur = cur + iosize; 3056 pg_offset += iosize; 3057 continue; 3058 } 3059 3060 ret = submit_extent_page(REQ_OP_READ, read_flags, tree, NULL, 3061 page, sector, disk_io_size, pg_offset, 3062 bdev, bio, 3063 end_bio_extent_readpage, mirror_num, 3064 *bio_flags, 3065 this_bio_flag, 3066 force_bio_submit); 3067 if (!ret) { 3068 nr++; 3069 *bio_flags = this_bio_flag; 3070 } else { 3071 SetPageError(page); 3072 unlock_extent(tree, cur, cur + iosize - 1); 3073 goto out; 3074 } 3075 cur = cur + iosize; 3076 pg_offset += iosize; 3077 } 3078 out: 3079 if (!nr) { 3080 if (!PageError(page)) 3081 SetPageUptodate(page); 3082 unlock_page(page); 3083 } 3084 return ret; 3085 } 3086 3087 static inline void __do_contiguous_readpages(struct extent_io_tree *tree, 3088 struct page *pages[], int nr_pages, 3089 u64 start, u64 end, 3090 get_extent_t *get_extent, 3091 struct extent_map **em_cached, 3092 struct bio **bio, int mirror_num, 3093 unsigned long *bio_flags, 3094 u64 *prev_em_start) 3095 { 3096 struct inode *inode; 3097 struct btrfs_ordered_extent *ordered; 3098 int index; 3099 3100 inode = pages[0]->mapping->host; 3101 while (1) { 3102 lock_extent(tree, start, end); 3103 ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, 3104 end - start + 1); 3105 if (!ordered) 3106 break; 3107 unlock_extent(tree, start, end); 3108 btrfs_start_ordered_extent(inode, ordered, 1); 3109 btrfs_put_ordered_extent(ordered); 3110 } 3111 3112 for (index = 0; index < nr_pages; index++) { 3113 __do_readpage(tree, pages[index], get_extent, em_cached, bio, 3114 mirror_num, bio_flags, 0, prev_em_start); 3115 put_page(pages[index]); 3116 } 3117 } 3118 3119 static void __extent_readpages(struct extent_io_tree *tree, 3120 struct page *pages[], 3121 int nr_pages, get_extent_t *get_extent, 3122 struct extent_map **em_cached, 3123 struct bio **bio, int mirror_num, 3124 unsigned long *bio_flags, 3125 u64 *prev_em_start) 3126 { 3127 u64 start = 0; 3128 u64 end = 0; 3129 u64 page_start; 3130 int index; 3131 int first_index = 0; 3132 3133 for (index = 0; index < nr_pages; index++) { 3134 page_start = page_offset(pages[index]); 3135 if (!end) { 3136 start = page_start; 3137 end = start + PAGE_SIZE - 1; 3138 first_index = index; 3139 } else if (end + 1 == page_start) { 3140 end += PAGE_SIZE; 3141 } else { 3142 __do_contiguous_readpages(tree, &pages[first_index], 3143 index - first_index, start, 3144 end, get_extent, em_cached, 3145 bio, mirror_num, bio_flags, 3146 prev_em_start); 3147 start = page_start; 3148 end = start + PAGE_SIZE - 1; 3149 first_index = index; 3150 } 3151 } 3152 3153 if (end) 3154 __do_contiguous_readpages(tree, &pages[first_index], 3155 index - first_index, start, 3156 end, get_extent, em_cached, bio, 3157 mirror_num, bio_flags, 3158 prev_em_start); 3159 } 3160 3161 static int __extent_read_full_page(struct extent_io_tree *tree, 3162 struct page *page, 3163 get_extent_t *get_extent, 3164 struct bio **bio, int mirror_num, 3165 unsigned long *bio_flags, int read_flags) 3166 { 3167 struct inode *inode = page->mapping->host; 3168 struct btrfs_ordered_extent *ordered; 3169 u64 start = page_offset(page); 3170 u64 end = start + PAGE_SIZE - 1; 3171 int ret; 3172 3173 while (1) { 3174 lock_extent(tree, start, end); 3175 ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, 3176 PAGE_SIZE); 3177 if (!ordered) 3178 break; 3179 unlock_extent(tree, start, end); 3180 
btrfs_start_ordered_extent(inode, ordered, 1); 3181 btrfs_put_ordered_extent(ordered); 3182 } 3183 3184 ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, 3185 bio_flags, read_flags, NULL); 3186 return ret; 3187 } 3188 3189 int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 3190 get_extent_t *get_extent, int mirror_num) 3191 { 3192 struct bio *bio = NULL; 3193 unsigned long bio_flags = 0; 3194 int ret; 3195 3196 ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, 3197 &bio_flags, 0); 3198 if (bio) 3199 ret = submit_one_bio(bio, mirror_num, bio_flags); 3200 return ret; 3201 } 3202 3203 static void update_nr_written(struct writeback_control *wbc, 3204 unsigned long nr_written) 3205 { 3206 wbc->nr_to_write -= nr_written; 3207 } 3208 3209 /* 3210 * helper for __extent_writepage, doing all of the delayed allocation setup. 3211 * 3212 * This returns 1 if our fill_delalloc function did all the work required 3213 * to write the page (copy into inline extent). In this case the IO has 3214 * been started and the page is already unlocked. 3215 * 3216 * This returns 0 if all went well (page still locked) 3217 * This returns < 0 if there were errors (page still locked) 3218 */ 3219 static noinline_for_stack int writepage_delalloc(struct inode *inode, 3220 struct page *page, struct writeback_control *wbc, 3221 struct extent_page_data *epd, 3222 u64 delalloc_start, 3223 unsigned long *nr_written) 3224 { 3225 struct extent_io_tree *tree = epd->tree; 3226 u64 page_end = delalloc_start + PAGE_SIZE - 1; 3227 u64 nr_delalloc; 3228 u64 delalloc_to_write = 0; 3229 u64 delalloc_end = 0; 3230 int ret; 3231 int page_started = 0; 3232 3233 if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc) 3234 return 0; 3235 3236 while (delalloc_end < page_end) { 3237 nr_delalloc = find_lock_delalloc_range(inode, tree, 3238 page, 3239 &delalloc_start, 3240 &delalloc_end, 3241 BTRFS_MAX_EXTENT_SIZE); 3242 if (nr_delalloc == 0) { 3243 delalloc_start = delalloc_end + 1; 3244 continue; 3245 } 3246 ret = tree->ops->fill_delalloc(inode, page, 3247 delalloc_start, 3248 delalloc_end, 3249 &page_started, 3250 nr_written); 3251 /* File system has been set read-only */ 3252 if (ret) { 3253 SetPageError(page); 3254 /* fill_delalloc should be return < 0 for error 3255 * but just in case, we use > 0 here meaning the 3256 * IO is started, so we don't want to return > 0 3257 * unless things are going well. 3258 */ 3259 ret = ret < 0 ? ret : -EIO; 3260 goto done; 3261 } 3262 /* 3263 * delalloc_end is already one less than the total length, so 3264 * we don't subtract one from PAGE_SIZE 3265 */ 3266 delalloc_to_write += (delalloc_end - delalloc_start + 3267 PAGE_SIZE) >> PAGE_SHIFT; 3268 delalloc_start = delalloc_end + 1; 3269 } 3270 if (wbc->nr_to_write < delalloc_to_write) { 3271 int thresh = 8192; 3272 3273 if (delalloc_to_write < thresh * 2) 3274 thresh = delalloc_to_write; 3275 wbc->nr_to_write = min_t(u64, delalloc_to_write, 3276 thresh); 3277 } 3278 3279 /* did the fill delalloc function already unlock and start 3280 * the IO? 3281 */ 3282 if (page_started) { 3283 /* 3284 * we've unlocked the page, so we can't update 3285 * the mapping's writeback index, just update 3286 * nr_to_write. 3287 */ 3288 wbc->nr_to_write -= *nr_written; 3289 return 1; 3290 } 3291 3292 ret = 0; 3293 3294 done: 3295 return ret; 3296 } 3297 3298 /* 3299 * helper for __extent_writepage. This calls the writepage start hooks, 3300 * and does the loop to map the page into extents and bios. 
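 * each iteration maps [cur, cur + iosize) through get_extent, skips holes
 * plus the inline and compressed extents that are written through other
 * paths in the FS, marks the rest writeback and submits it with
 * submit_extent_page().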
3301 * 3302 * We return 1 if the IO is started and the page is unlocked, 3303 * 0 if all went well (page still locked) 3304 * < 0 if there were errors (page still locked) 3305 */ 3306 static noinline_for_stack int __extent_writepage_io(struct inode *inode, 3307 struct page *page, 3308 struct writeback_control *wbc, 3309 struct extent_page_data *epd, 3310 loff_t i_size, 3311 unsigned long nr_written, 3312 int write_flags, int *nr_ret) 3313 { 3314 struct extent_io_tree *tree = epd->tree; 3315 u64 start = page_offset(page); 3316 u64 page_end = start + PAGE_SIZE - 1; 3317 u64 end; 3318 u64 cur = start; 3319 u64 extent_offset; 3320 u64 block_start; 3321 u64 iosize; 3322 sector_t sector; 3323 struct extent_map *em; 3324 struct block_device *bdev; 3325 size_t pg_offset = 0; 3326 size_t blocksize; 3327 int ret = 0; 3328 int nr = 0; 3329 bool compressed; 3330 3331 if (tree->ops && tree->ops->writepage_start_hook) { 3332 ret = tree->ops->writepage_start_hook(page, start, 3333 page_end); 3334 if (ret) { 3335 /* Fixup worker will requeue */ 3336 if (ret == -EBUSY) 3337 wbc->pages_skipped++; 3338 else 3339 redirty_page_for_writepage(wbc, page); 3340 3341 update_nr_written(wbc, nr_written); 3342 unlock_page(page); 3343 return 1; 3344 } 3345 } 3346 3347 /* 3348 * we don't want to touch the inode after unlocking the page, 3349 * so we update the mapping writeback index now 3350 */ 3351 update_nr_written(wbc, nr_written + 1); 3352 3353 end = page_end; 3354 if (i_size <= start) { 3355 if (tree->ops && tree->ops->writepage_end_io_hook) 3356 tree->ops->writepage_end_io_hook(page, start, 3357 page_end, NULL, 1); 3358 goto done; 3359 } 3360 3361 blocksize = inode->i_sb->s_blocksize; 3362 3363 while (cur <= end) { 3364 u64 em_end; 3365 3366 if (cur >= i_size) { 3367 if (tree->ops && tree->ops->writepage_end_io_hook) 3368 tree->ops->writepage_end_io_hook(page, cur, 3369 page_end, NULL, 1); 3370 break; 3371 } 3372 em = epd->get_extent(BTRFS_I(inode), page, pg_offset, cur, 3373 end - cur + 1, 1); 3374 if (IS_ERR_OR_NULL(em)) { 3375 SetPageError(page); 3376 ret = PTR_ERR_OR_ZERO(em); 3377 break; 3378 } 3379 3380 extent_offset = cur - em->start; 3381 em_end = extent_map_end(em); 3382 BUG_ON(em_end <= cur); 3383 BUG_ON(end < cur); 3384 iosize = min(em_end - cur, end - cur + 1); 3385 iosize = ALIGN(iosize, blocksize); 3386 sector = (em->block_start + extent_offset) >> 9; 3387 bdev = em->bdev; 3388 block_start = em->block_start; 3389 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 3390 free_extent_map(em); 3391 em = NULL; 3392 3393 /* 3394 * compressed and inline extents are written through other 3395 * paths in the FS 3396 */ 3397 if (compressed || block_start == EXTENT_MAP_HOLE || 3398 block_start == EXTENT_MAP_INLINE) { 3399 /* 3400 * end_io notification does not happen here for 3401 * compressed extents 3402 */ 3403 if (!compressed && tree->ops && 3404 tree->ops->writepage_end_io_hook) 3405 tree->ops->writepage_end_io_hook(page, cur, 3406 cur + iosize - 1, 3407 NULL, 1); 3408 else if (compressed) { 3409 /* we don't want to end_page_writeback on 3410 * a compressed extent. 
this happens 3411 * elsewhere 3412 */ 3413 nr++; 3414 } 3415 3416 cur += iosize; 3417 pg_offset += iosize; 3418 continue; 3419 } 3420 3421 set_range_writeback(tree, cur, cur + iosize - 1); 3422 if (!PageWriteback(page)) { 3423 btrfs_err(BTRFS_I(inode)->root->fs_info, 3424 "page %lu not writeback, cur %llu end %llu", 3425 page->index, cur, end); 3426 } 3427 3428 ret = submit_extent_page(REQ_OP_WRITE, write_flags, tree, wbc, 3429 page, sector, iosize, pg_offset, 3430 bdev, &epd->bio, 3431 end_bio_extent_writepage, 3432 0, 0, 0, false); 3433 if (ret) { 3434 SetPageError(page); 3435 if (PageWriteback(page)) 3436 end_page_writeback(page); 3437 } 3438 3439 cur = cur + iosize; 3440 pg_offset += iosize; 3441 nr++; 3442 } 3443 done: 3444 *nr_ret = nr; 3445 return ret; 3446 } 3447 3448 /* 3449 * the writepage semantics are similar to regular writepage. extent 3450 * records are inserted to lock ranges in the tree, and as dirty areas 3451 * are found, they are marked writeback. Then the lock bits are removed 3452 * and the end_io handler clears the writeback ranges 3453 */ 3454 static int __extent_writepage(struct page *page, struct writeback_control *wbc, 3455 void *data) 3456 { 3457 struct inode *inode = page->mapping->host; 3458 struct extent_page_data *epd = data; 3459 u64 start = page_offset(page); 3460 u64 page_end = start + PAGE_SIZE - 1; 3461 int ret; 3462 int nr = 0; 3463 size_t pg_offset = 0; 3464 loff_t i_size = i_size_read(inode); 3465 unsigned long end_index = i_size >> PAGE_SHIFT; 3466 int write_flags = 0; 3467 unsigned long nr_written = 0; 3468 3469 if (wbc->sync_mode == WB_SYNC_ALL) 3470 write_flags = REQ_SYNC; 3471 3472 trace___extent_writepage(page, inode, wbc); 3473 3474 WARN_ON(!PageLocked(page)); 3475 3476 ClearPageError(page); 3477 3478 pg_offset = i_size & (PAGE_SIZE - 1); 3479 if (page->index > end_index || 3480 (page->index == end_index && !pg_offset)) { 3481 page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); 3482 unlock_page(page); 3483 return 0; 3484 } 3485 3486 if (page->index == end_index) { 3487 char *userpage; 3488 3489 userpage = kmap_atomic(page); 3490 memset(userpage + pg_offset, 0, 3491 PAGE_SIZE - pg_offset); 3492 kunmap_atomic(userpage); 3493 flush_dcache_page(page); 3494 } 3495 3496 pg_offset = 0; 3497 3498 set_page_extent_mapped(page); 3499 3500 ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written); 3501 if (ret == 1) 3502 goto done_unlocked; 3503 if (ret) 3504 goto done; 3505 3506 ret = __extent_writepage_io(inode, page, wbc, epd, 3507 i_size, nr_written, write_flags, &nr); 3508 if (ret == 1) 3509 goto done_unlocked; 3510 3511 done: 3512 if (nr == 0) { 3513 /* make sure the mapping tag for page dirty gets cleared */ 3514 set_page_writeback(page); 3515 end_page_writeback(page); 3516 } 3517 if (PageError(page)) { 3518 ret = ret < 0 ? 
ret : -EIO; 3519 end_extent_writepage(page, ret, start, page_end); 3520 } 3521 unlock_page(page); 3522 return ret; 3523 3524 done_unlocked: 3525 return 0; 3526 } 3527 3528 void wait_on_extent_buffer_writeback(struct extent_buffer *eb) 3529 { 3530 wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK, 3531 TASK_UNINTERRUPTIBLE); 3532 } 3533 3534 static noinline_for_stack int 3535 lock_extent_buffer_for_io(struct extent_buffer *eb, 3536 struct btrfs_fs_info *fs_info, 3537 struct extent_page_data *epd) 3538 { 3539 unsigned long i, num_pages; 3540 int flush = 0; 3541 int ret = 0; 3542 3543 if (!btrfs_try_tree_write_lock(eb)) { 3544 flush = 1; 3545 flush_write_bio(epd); 3546 btrfs_tree_lock(eb); 3547 } 3548 3549 if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { 3550 btrfs_tree_unlock(eb); 3551 if (!epd->sync_io) 3552 return 0; 3553 if (!flush) { 3554 flush_write_bio(epd); 3555 flush = 1; 3556 } 3557 while (1) { 3558 wait_on_extent_buffer_writeback(eb); 3559 btrfs_tree_lock(eb); 3560 if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) 3561 break; 3562 btrfs_tree_unlock(eb); 3563 } 3564 } 3565 3566 /* 3567 * We need to do this to prevent races in people who check if the eb is 3568 * under IO since we can end up having no IO bits set for a short period 3569 * of time. 3570 */ 3571 spin_lock(&eb->refs_lock); 3572 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3573 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3574 spin_unlock(&eb->refs_lock); 3575 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 3576 __percpu_counter_add(&fs_info->dirty_metadata_bytes, 3577 -eb->len, 3578 fs_info->dirty_metadata_batch); 3579 ret = 1; 3580 } else { 3581 spin_unlock(&eb->refs_lock); 3582 } 3583 3584 btrfs_tree_unlock(eb); 3585 3586 if (!ret) 3587 return ret; 3588 3589 num_pages = num_extent_pages(eb->start, eb->len); 3590 for (i = 0; i < num_pages; i++) { 3591 struct page *p = eb->pages[i]; 3592 3593 if (!trylock_page(p)) { 3594 if (!flush) { 3595 flush_write_bio(epd); 3596 flush = 1; 3597 } 3598 lock_page(p); 3599 } 3600 } 3601 3602 return ret; 3603 } 3604 3605 static void end_extent_buffer_writeback(struct extent_buffer *eb) 3606 { 3607 clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3608 smp_mb__after_atomic(); 3609 wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); 3610 } 3611 3612 static void set_btree_ioerr(struct page *page) 3613 { 3614 struct extent_buffer *eb = (struct extent_buffer *)page->private; 3615 3616 SetPageError(page); 3617 if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) 3618 return; 3619 3620 /* 3621 * If writeback for a btree extent that doesn't belong to a log tree 3622 * failed, increment the counter transaction->eb_write_errors. 3623 * We do this because while the transaction is running and before it's 3624 * committing (when we call filemap_fdata[write|wait]_range against 3625 * the btree inode), we might have 3626 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it 3627 * returns an error or an error happens during writeback, when we're 3628 * committing the transaction we wouldn't know about it, since the pages 3629 * can be no longer dirty nor marked anymore for writeback (if a 3630 * subsequent modification to the extent buffer didn't happen before the 3631 * transaction commit), which makes filemap_fdata[write|wait]_range not 3632 * able to find the pages tagged with SetPageError at transaction 3633 * commit time. 
So if this happens we must abort the transaction, 3634 * otherwise we commit a super block with btree roots that point to 3635 * btree nodes/leafs whose content on disk is invalid - either garbage 3636 * or the content of some node/leaf from a past generation that got 3637 * cowed or deleted and is no longer valid. 3638 * 3639 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would 3640 * not be enough - we need to distinguish between log tree extents vs 3641 * non-log tree extents, and the next filemap_fdatawait_range() call 3642 * will catch and clear such errors in the mapping - and that call might 3643 * be from a log sync and not from a transaction commit. Also, checking 3644 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is 3645 * not done and would not be reliable - the eb might have been released 3646 * from memory and reading it back again means that flag would not be 3647 * set (since it's a runtime flag, not persisted on disk). 3648 * 3649 * Using the flags below in the btree inode also makes us achieve the 3650 * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started 3651 * writeback for all dirty pages and before filemap_fdatawait_range() 3652 * is called, the writeback for all dirty pages had already finished 3653 * with errors - because we were not using AS_EIO/AS_ENOSPC, 3654 * filemap_fdatawait_range() would return success, as it could not know 3655 * that writeback errors happened (the pages were no longer tagged for 3656 * writeback). 3657 */ 3658 switch (eb->log_index) { 3659 case -1: 3660 set_bit(BTRFS_FS_BTREE_ERR, &eb->fs_info->flags); 3661 break; 3662 case 0: 3663 set_bit(BTRFS_FS_LOG1_ERR, &eb->fs_info->flags); 3664 break; 3665 case 1: 3666 set_bit(BTRFS_FS_LOG2_ERR, &eb->fs_info->flags); 3667 break; 3668 default: 3669 BUG(); /* unexpected, logic error */ 3670 } 3671 } 3672 3673 static void end_bio_extent_buffer_writepage(struct bio *bio) 3674 { 3675 struct bio_vec *bvec; 3676 struct extent_buffer *eb; 3677 int i, done; 3678 3679 bio_for_each_segment_all(bvec, bio, i) { 3680 struct page *page = bvec->bv_page; 3681 3682 eb = (struct extent_buffer *)page->private; 3683 BUG_ON(!eb); 3684 done = atomic_dec_and_test(&eb->io_pages); 3685 3686 if (bio->bi_error || 3687 test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { 3688 ClearPageUptodate(page); 3689 set_btree_ioerr(page); 3690 } 3691 3692 end_page_writeback(page); 3693 3694 if (!done) 3695 continue; 3696 3697 end_extent_buffer_writeback(eb); 3698 } 3699 3700 bio_put(bio); 3701 } 3702 3703 static noinline_for_stack int write_one_eb(struct extent_buffer *eb, 3704 struct btrfs_fs_info *fs_info, 3705 struct writeback_control *wbc, 3706 struct extent_page_data *epd) 3707 { 3708 struct block_device *bdev = fs_info->fs_devices->latest_bdev; 3709 struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree; 3710 u64 offset = eb->start; 3711 u32 nritems; 3712 unsigned long i, num_pages; 3713 unsigned long bio_flags = 0; 3714 unsigned long start, end; 3715 int write_flags = (epd->sync_io ? REQ_SYNC : 0) | REQ_META; 3716 int ret = 0; 3717 3718 clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); 3719 num_pages = num_extent_pages(eb->start, eb->len); 3720 atomic_set(&eb->io_pages, num_pages); 3721 if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID) 3722 bio_flags = EXTENT_BIO_TREE_LOG; 3723 3724 /* set btree blocks beyond nritems with 0 to avoid stale content. 
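 * for a node that means everything past the last key pointer, for a leaf
 * the gap between the end of the item headers and the start of the item
 * data (see the layout sketch below).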
*/ 3725 nritems = btrfs_header_nritems(eb); 3726 if (btrfs_header_level(eb) > 0) { 3727 end = btrfs_node_key_ptr_offset(nritems); 3728 3729 memzero_extent_buffer(eb, end, eb->len - end); 3730 } else { 3731 /* 3732 * leaf: 3733 * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0 3734 */ 3735 start = btrfs_item_nr_offset(nritems); 3736 end = btrfs_leaf_data(eb) + leaf_data_end(fs_info, eb); 3737 memzero_extent_buffer(eb, start, end - start); 3738 } 3739 3740 for (i = 0; i < num_pages; i++) { 3741 struct page *p = eb->pages[i]; 3742 3743 clear_page_dirty_for_io(p); 3744 set_page_writeback(p); 3745 ret = submit_extent_page(REQ_OP_WRITE, write_flags, tree, wbc, 3746 p, offset >> 9, PAGE_SIZE, 0, bdev, 3747 &epd->bio, 3748 end_bio_extent_buffer_writepage, 3749 0, epd->bio_flags, bio_flags, false); 3750 epd->bio_flags = bio_flags; 3751 if (ret) { 3752 set_btree_ioerr(p); 3753 if (PageWriteback(p)) 3754 end_page_writeback(p); 3755 if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) 3756 end_extent_buffer_writeback(eb); 3757 ret = -EIO; 3758 break; 3759 } 3760 offset += PAGE_SIZE; 3761 update_nr_written(wbc, 1); 3762 unlock_page(p); 3763 } 3764 3765 if (unlikely(ret)) { 3766 for (; i < num_pages; i++) { 3767 struct page *p = eb->pages[i]; 3768 clear_page_dirty_for_io(p); 3769 unlock_page(p); 3770 } 3771 } 3772 3773 return ret; 3774 } 3775 3776 int btree_write_cache_pages(struct address_space *mapping, 3777 struct writeback_control *wbc) 3778 { 3779 struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; 3780 struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; 3781 struct extent_buffer *eb, *prev_eb = NULL; 3782 struct extent_page_data epd = { 3783 .bio = NULL, 3784 .tree = tree, 3785 .extent_locked = 0, 3786 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3787 .bio_flags = 0, 3788 }; 3789 int ret = 0; 3790 int done = 0; 3791 int nr_to_write_done = 0; 3792 struct pagevec pvec; 3793 int nr_pages; 3794 pgoff_t index; 3795 pgoff_t end; /* Inclusive */ 3796 int scanned = 0; 3797 int tag; 3798 3799 pagevec_init(&pvec, 0); 3800 if (wbc->range_cyclic) { 3801 index = mapping->writeback_index; /* Start from prev offset */ 3802 end = -1; 3803 } else { 3804 index = wbc->range_start >> PAGE_SHIFT; 3805 end = wbc->range_end >> PAGE_SHIFT; 3806 scanned = 1; 3807 } 3808 if (wbc->sync_mode == WB_SYNC_ALL) 3809 tag = PAGECACHE_TAG_TOWRITE; 3810 else 3811 tag = PAGECACHE_TAG_DIRTY; 3812 retry: 3813 if (wbc->sync_mode == WB_SYNC_ALL) 3814 tag_pages_for_writeback(mapping, index, end); 3815 while (!done && !nr_to_write_done && (index <= end) && 3816 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 3817 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 3818 unsigned i; 3819 3820 scanned = 1; 3821 for (i = 0; i < nr_pages; i++) { 3822 struct page *page = pvec.pages[i]; 3823 3824 if (!PagePrivate(page)) 3825 continue; 3826 3827 if (!wbc->range_cyclic && page->index > end) { 3828 done = 1; 3829 break; 3830 } 3831 3832 spin_lock(&mapping->private_lock); 3833 if (!PagePrivate(page)) { 3834 spin_unlock(&mapping->private_lock); 3835 continue; 3836 } 3837 3838 eb = (struct extent_buffer *)page->private; 3839 3840 /* 3841 * Shouldn't happen and normally this would be a BUG_ON 3842 * but no sense in crashing the users box for something 3843 * we can survive anyway. 
3844 */ 3845 if (WARN_ON(!eb)) { 3846 spin_unlock(&mapping->private_lock); 3847 continue; 3848 } 3849 3850 if (eb == prev_eb) { 3851 spin_unlock(&mapping->private_lock); 3852 continue; 3853 } 3854 3855 ret = atomic_inc_not_zero(&eb->refs); 3856 spin_unlock(&mapping->private_lock); 3857 if (!ret) 3858 continue; 3859 3860 prev_eb = eb; 3861 ret = lock_extent_buffer_for_io(eb, fs_info, &epd); 3862 if (!ret) { 3863 free_extent_buffer(eb); 3864 continue; 3865 } 3866 3867 ret = write_one_eb(eb, fs_info, wbc, &epd); 3868 if (ret) { 3869 done = 1; 3870 free_extent_buffer(eb); 3871 break; 3872 } 3873 free_extent_buffer(eb); 3874 3875 /* 3876 * the filesystem may choose to bump up nr_to_write. 3877 * We have to make sure to honor the new nr_to_write 3878 * at any time 3879 */ 3880 nr_to_write_done = wbc->nr_to_write <= 0; 3881 } 3882 pagevec_release(&pvec); 3883 cond_resched(); 3884 } 3885 if (!scanned && !done) { 3886 /* 3887 * We hit the last page and there is more work to be done: wrap 3888 * back to the start of the file 3889 */ 3890 scanned = 1; 3891 index = 0; 3892 goto retry; 3893 } 3894 flush_write_bio(&epd); 3895 return ret; 3896 } 3897 3898 /** 3899 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. 3900 * @mapping: address space structure to write 3901 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 3902 * @writepage: function called for each page 3903 * @data: data passed to writepage function 3904 * 3905 * If a page is already under I/O, write_cache_pages() skips it, even 3906 * if it's dirty. This is desirable behaviour for memory-cleaning writeback, 3907 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() 3908 * and msync() need to guarantee that all the data which was dirty at the time 3909 * the call was made get new I/O started against them. If wbc->sync_mode is 3910 * WB_SYNC_ALL then we were called for data integrity and we must wait for 3911 * existing IO to complete. 3912 */ 3913 static int extent_write_cache_pages(struct address_space *mapping, 3914 struct writeback_control *wbc, 3915 writepage_t writepage, void *data, 3916 void (*flush_fn)(void *)) 3917 { 3918 struct inode *inode = mapping->host; 3919 int ret = 0; 3920 int done = 0; 3921 int nr_to_write_done = 0; 3922 struct pagevec pvec; 3923 int nr_pages; 3924 pgoff_t index; 3925 pgoff_t end; /* Inclusive */ 3926 pgoff_t done_index; 3927 int range_whole = 0; 3928 int scanned = 0; 3929 int tag; 3930 3931 /* 3932 * We have to hold onto the inode so that ordered extents can do their 3933 * work when the IO finishes. The alternative to this is failing to add 3934 * an ordered extent if the igrab() fails there and that is a huge pain 3935 * to deal with, so instead just hold onto the inode throughout the 3936 * writepages operation. If it fails here we are freeing up the inode 3937 * anyway and we'd rather not waste our time writing out stuff that is 3938 * going to be truncated anyway. 
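 *
 * the reference taken by the igrab() below is dropped via the
 * btrfs_add_delayed_iput() call at the end of this function.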
3939 */ 3940 if (!igrab(inode)) 3941 return 0; 3942 3943 pagevec_init(&pvec, 0); 3944 if (wbc->range_cyclic) { 3945 index = mapping->writeback_index; /* Start from prev offset */ 3946 end = -1; 3947 } else { 3948 index = wbc->range_start >> PAGE_SHIFT; 3949 end = wbc->range_end >> PAGE_SHIFT; 3950 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 3951 range_whole = 1; 3952 scanned = 1; 3953 } 3954 if (wbc->sync_mode == WB_SYNC_ALL) 3955 tag = PAGECACHE_TAG_TOWRITE; 3956 else 3957 tag = PAGECACHE_TAG_DIRTY; 3958 retry: 3959 if (wbc->sync_mode == WB_SYNC_ALL) 3960 tag_pages_for_writeback(mapping, index, end); 3961 done_index = index; 3962 while (!done && !nr_to_write_done && (index <= end) && 3963 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 3964 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 3965 unsigned i; 3966 3967 scanned = 1; 3968 for (i = 0; i < nr_pages; i++) { 3969 struct page *page = pvec.pages[i]; 3970 3971 done_index = page->index; 3972 /* 3973 * At this point we hold neither mapping->tree_lock nor 3974 * lock on the page itself: the page may be truncated or 3975 * invalidated (changing page->mapping to NULL), or even 3976 * swizzled back from swapper_space to tmpfs file 3977 * mapping 3978 */ 3979 if (!trylock_page(page)) { 3980 flush_fn(data); 3981 lock_page(page); 3982 } 3983 3984 if (unlikely(page->mapping != mapping)) { 3985 unlock_page(page); 3986 continue; 3987 } 3988 3989 if (!wbc->range_cyclic && page->index > end) { 3990 done = 1; 3991 unlock_page(page); 3992 continue; 3993 } 3994 3995 if (wbc->sync_mode != WB_SYNC_NONE) { 3996 if (PageWriteback(page)) 3997 flush_fn(data); 3998 wait_on_page_writeback(page); 3999 } 4000 4001 if (PageWriteback(page) || 4002 !clear_page_dirty_for_io(page)) { 4003 unlock_page(page); 4004 continue; 4005 } 4006 4007 ret = (*writepage)(page, wbc, data); 4008 4009 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { 4010 unlock_page(page); 4011 ret = 0; 4012 } 4013 if (ret < 0) { 4014 /* 4015 * done_index is set past this page, 4016 * so media errors will not choke 4017 * background writeout for the entire 4018 * file. This has consequences for 4019 * range_cyclic semantics (ie. it may 4020 * not be suitable for data integrity 4021 * writeout). 4022 */ 4023 done_index = page->index + 1; 4024 done = 1; 4025 break; 4026 } 4027 4028 /* 4029 * the filesystem may choose to bump up nr_to_write. 4030 * We have to make sure to honor the new nr_to_write 4031 * at any time 4032 */ 4033 nr_to_write_done = wbc->nr_to_write <= 0; 4034 } 4035 pagevec_release(&pvec); 4036 cond_resched(); 4037 } 4038 if (!scanned && !done) { 4039 /* 4040 * We hit the last page and there is more work to be done: wrap 4041 * back to the start of the file 4042 */ 4043 scanned = 1; 4044 index = 0; 4045 goto retry; 4046 } 4047 4048 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) 4049 mapping->writeback_index = done_index; 4050 4051 btrfs_add_delayed_iput(inode); 4052 return ret; 4053 } 4054 4055 static void flush_epd_write_bio(struct extent_page_data *epd) 4056 { 4057 if (epd->bio) { 4058 int ret; 4059 4060 bio_set_op_attrs(epd->bio, REQ_OP_WRITE, 4061 epd->sync_io ? 
REQ_SYNC : 0); 4062 4063 ret = submit_one_bio(epd->bio, 0, epd->bio_flags); 4064 BUG_ON(ret < 0); /* -ENOMEM */ 4065 epd->bio = NULL; 4066 } 4067 } 4068 4069 static noinline void flush_write_bio(void *data) 4070 { 4071 struct extent_page_data *epd = data; 4072 flush_epd_write_bio(epd); 4073 } 4074 4075 int extent_write_full_page(struct extent_io_tree *tree, struct page *page, 4076 get_extent_t *get_extent, 4077 struct writeback_control *wbc) 4078 { 4079 int ret; 4080 struct extent_page_data epd = { 4081 .bio = NULL, 4082 .tree = tree, 4083 .get_extent = get_extent, 4084 .extent_locked = 0, 4085 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 4086 .bio_flags = 0, 4087 }; 4088 4089 ret = __extent_writepage(page, wbc, &epd); 4090 4091 flush_epd_write_bio(&epd); 4092 return ret; 4093 } 4094 4095 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, 4096 u64 start, u64 end, get_extent_t *get_extent, 4097 int mode) 4098 { 4099 int ret = 0; 4100 struct address_space *mapping = inode->i_mapping; 4101 struct page *page; 4102 unsigned long nr_pages = (end - start + PAGE_SIZE) >> 4103 PAGE_SHIFT; 4104 4105 struct extent_page_data epd = { 4106 .bio = NULL, 4107 .tree = tree, 4108 .get_extent = get_extent, 4109 .extent_locked = 1, 4110 .sync_io = mode == WB_SYNC_ALL, 4111 .bio_flags = 0, 4112 }; 4113 struct writeback_control wbc_writepages = { 4114 .sync_mode = mode, 4115 .nr_to_write = nr_pages * 2, 4116 .range_start = start, 4117 .range_end = end + 1, 4118 }; 4119 4120 while (start <= end) { 4121 page = find_get_page(mapping, start >> PAGE_SHIFT); 4122 if (clear_page_dirty_for_io(page)) 4123 ret = __extent_writepage(page, &wbc_writepages, &epd); 4124 else { 4125 if (tree->ops && tree->ops->writepage_end_io_hook) 4126 tree->ops->writepage_end_io_hook(page, start, 4127 start + PAGE_SIZE - 1, 4128 NULL, 1); 4129 unlock_page(page); 4130 } 4131 put_page(page); 4132 start += PAGE_SIZE; 4133 } 4134 4135 flush_epd_write_bio(&epd); 4136 return ret; 4137 } 4138 4139 int extent_writepages(struct extent_io_tree *tree, 4140 struct address_space *mapping, 4141 get_extent_t *get_extent, 4142 struct writeback_control *wbc) 4143 { 4144 int ret = 0; 4145 struct extent_page_data epd = { 4146 .bio = NULL, 4147 .tree = tree, 4148 .get_extent = get_extent, 4149 .extent_locked = 0, 4150 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 4151 .bio_flags = 0, 4152 }; 4153 4154 ret = extent_write_cache_pages(mapping, wbc, __extent_writepage, &epd, 4155 flush_write_bio); 4156 flush_epd_write_bio(&epd); 4157 return ret; 4158 } 4159 4160 int extent_readpages(struct extent_io_tree *tree, 4161 struct address_space *mapping, 4162 struct list_head *pages, unsigned nr_pages, 4163 get_extent_t get_extent) 4164 { 4165 struct bio *bio = NULL; 4166 unsigned page_idx; 4167 unsigned long bio_flags = 0; 4168 struct page *pagepool[16]; 4169 struct page *page; 4170 struct extent_map *em_cached = NULL; 4171 int nr = 0; 4172 u64 prev_em_start = (u64)-1; 4173 4174 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 4175 page = list_entry(pages->prev, struct page, lru); 4176 4177 prefetchw(&page->flags); 4178 list_del(&page->lru); 4179 if (add_to_page_cache_lru(page, mapping, 4180 page->index, 4181 readahead_gfp_mask(mapping))) { 4182 put_page(page); 4183 continue; 4184 } 4185 4186 pagepool[nr++] = page; 4187 if (nr < ARRAY_SIZE(pagepool)) 4188 continue; 4189 __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, 4190 &bio, 0, &bio_flags, &prev_em_start); 4191 nr = 0; 4192 } 4193 if (nr) 4194 __extent_readpages(tree, 
pagepool, nr, get_extent, &em_cached, 4195 &bio, 0, &bio_flags, &prev_em_start); 4196 4197 if (em_cached) 4198 free_extent_map(em_cached); 4199 4200 BUG_ON(!list_empty(pages)); 4201 if (bio) 4202 return submit_one_bio(bio, 0, bio_flags); 4203 return 0; 4204 } 4205 4206 /* 4207 * basic invalidatepage code, this waits on any locked or writeback 4208 * ranges corresponding to the page, and then deletes any extent state 4209 * records from the tree 4210 */ 4211 int extent_invalidatepage(struct extent_io_tree *tree, 4212 struct page *page, unsigned long offset) 4213 { 4214 struct extent_state *cached_state = NULL; 4215 u64 start = page_offset(page); 4216 u64 end = start + PAGE_SIZE - 1; 4217 size_t blocksize = page->mapping->host->i_sb->s_blocksize; 4218 4219 start += ALIGN(offset, blocksize); 4220 if (start > end) 4221 return 0; 4222 4223 lock_extent_bits(tree, start, end, &cached_state); 4224 wait_on_page_writeback(page); 4225 clear_extent_bit(tree, start, end, 4226 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 4227 EXTENT_DO_ACCOUNTING, 4228 1, 1, &cached_state, GFP_NOFS); 4229 return 0; 4230 } 4231 4232 /* 4233 * a helper for releasepage, this tests for areas of the page that 4234 * are locked or under IO and drops the related state bits if it is safe 4235 * to drop the page. 4236 */ 4237 static int try_release_extent_state(struct extent_map_tree *map, 4238 struct extent_io_tree *tree, 4239 struct page *page, gfp_t mask) 4240 { 4241 u64 start = page_offset(page); 4242 u64 end = start + PAGE_SIZE - 1; 4243 int ret = 1; 4244 4245 if (test_range_bit(tree, start, end, 4246 EXTENT_IOBITS, 0, NULL)) 4247 ret = 0; 4248 else { 4249 /* 4250 * at this point we can safely clear everything except the 4251 * locked bit and the nodatasum bit 4252 */ 4253 ret = clear_extent_bit(tree, start, end, 4254 ~(EXTENT_LOCKED | EXTENT_NODATASUM), 4255 0, 0, NULL, mask); 4256 4257 /* if clear_extent_bit failed for enomem reasons, 4258 * we can't allow the release to continue. 4259 */ 4260 if (ret < 0) 4261 ret = 0; 4262 else 4263 ret = 1; 4264 } 4265 return ret; 4266 } 4267 4268 /* 4269 * a helper for releasepage. As long as there are no locked extents 4270 * in the range corresponding to the page, both state records and extent 4271 * map records are removed 4272 */ 4273 int try_release_extent_mapping(struct extent_map_tree *map, 4274 struct extent_io_tree *tree, struct page *page, 4275 gfp_t mask) 4276 { 4277 struct extent_map *em; 4278 u64 start = page_offset(page); 4279 u64 end = start + PAGE_SIZE - 1; 4280 4281 if (gfpflags_allow_blocking(mask) && 4282 page->mapping->host->i_size > SZ_16M) { 4283 u64 len; 4284 while (start <= end) { 4285 len = end - start + 1; 4286 write_lock(&map->lock); 4287 em = lookup_extent_mapping(map, start, len); 4288 if (!em) { 4289 write_unlock(&map->lock); 4290 break; 4291 } 4292 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || 4293 em->start != start) { 4294 write_unlock(&map->lock); 4295 free_extent_map(em); 4296 break; 4297 } 4298 if (!test_range_bit(tree, em->start, 4299 extent_map_end(em) - 1, 4300 EXTENT_LOCKED | EXTENT_WRITEBACK, 4301 0, NULL)) { 4302 remove_extent_mapping(map, em); 4303 /* once for the rb tree */ 4304 free_extent_map(em); 4305 } 4306 start = extent_map_end(em); 4307 write_unlock(&map->lock); 4308 4309 /* once for us */ 4310 free_extent_map(em); 4311 } 4312 } 4313 return try_release_extent_state(map, tree, page, mask); 4314 } 4315 4316 /* 4317 * helper function for fiemap, which doesn't want to see any holes. 
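 * A NULL return means there is nothing (or only holes) left before 'last';
 * an ERR_PTR coming back from get_extent() is passed straight through to
 * the caller.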
4318 * This maps until we find something past 'last' 4319 */ 4320 static struct extent_map *get_extent_skip_holes(struct inode *inode, 4321 u64 offset, 4322 u64 last, 4323 get_extent_t *get_extent) 4324 { 4325 u64 sectorsize = btrfs_inode_sectorsize(inode); 4326 struct extent_map *em; 4327 u64 len; 4328 4329 if (offset >= last) 4330 return NULL; 4331 4332 while (1) { 4333 len = last - offset; 4334 if (len == 0) 4335 break; 4336 len = ALIGN(len, sectorsize); 4337 em = get_extent(BTRFS_I(inode), NULL, 0, offset, len, 0); 4338 if (IS_ERR_OR_NULL(em)) 4339 return em; 4340 4341 /* if this isn't a hole return it */ 4342 if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) && 4343 em->block_start != EXTENT_MAP_HOLE) { 4344 return em; 4345 } 4346 4347 /* this is a hole, advance to the next extent */ 4348 offset = extent_map_end(em); 4349 free_extent_map(em); 4350 if (offset >= last) 4351 break; 4352 } 4353 return NULL; 4354 } 4355 4356 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 4357 __u64 start, __u64 len, get_extent_t *get_extent) 4358 { 4359 int ret = 0; 4360 u64 off = start; 4361 u64 max = start + len; 4362 u32 flags = 0; 4363 u32 found_type; 4364 u64 last; 4365 u64 last_for_get_extent = 0; 4366 u64 disko = 0; 4367 u64 isize = i_size_read(inode); 4368 struct btrfs_key found_key; 4369 struct extent_map *em = NULL; 4370 struct extent_state *cached_state = NULL; 4371 struct btrfs_path *path; 4372 struct btrfs_root *root = BTRFS_I(inode)->root; 4373 int end = 0; 4374 u64 em_start = 0; 4375 u64 em_len = 0; 4376 u64 em_end = 0; 4377 4378 if (len == 0) 4379 return -EINVAL; 4380 4381 path = btrfs_alloc_path(); 4382 if (!path) 4383 return -ENOMEM; 4384 path->leave_spinning = 1; 4385 4386 start = round_down(start, btrfs_inode_sectorsize(inode)); 4387 len = round_up(max, btrfs_inode_sectorsize(inode)) - start; 4388 4389 /* 4390 * lookup the last file extent. We're not using i_size here 4391 * because there might be preallocation past i_size 4392 */ 4393 ret = btrfs_lookup_file_extent(NULL, root, path, 4394 btrfs_ino(BTRFS_I(inode)), -1, 0); 4395 if (ret < 0) { 4396 btrfs_free_path(path); 4397 return ret; 4398 } else { 4399 WARN_ON(!ret); 4400 if (ret == 1) 4401 ret = 0; 4402 } 4403 4404 path->slots[0]--; 4405 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 4406 found_type = found_key.type; 4407 4408 /* No extents, but there might be delalloc bits */ 4409 if (found_key.objectid != btrfs_ino(BTRFS_I(inode)) || 4410 found_type != BTRFS_EXTENT_DATA_KEY) { 4411 /* have to trust i_size as the end */ 4412 last = (u64)-1; 4413 last_for_get_extent = isize; 4414 } else { 4415 /* 4416 * remember the start of the last extent. There are a 4417 * bunch of different factors that go into the length of the 4418 * extent, so its much less complex to remember where it started 4419 */ 4420 last = found_key.offset; 4421 last_for_get_extent = last + 1; 4422 } 4423 btrfs_release_path(path); 4424 4425 /* 4426 * we might have some extents allocated but more delalloc past those 4427 * extents. 
so, we trust isize unless the start of the last extent is 4428 * beyond isize 4429 */ 4430 if (last < isize) { 4431 last = (u64)-1; 4432 last_for_get_extent = isize; 4433 } 4434 4435 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 4436 &cached_state); 4437 4438 em = get_extent_skip_holes(inode, start, last_for_get_extent, 4439 get_extent); 4440 if (!em) 4441 goto out; 4442 if (IS_ERR(em)) { 4443 ret = PTR_ERR(em); 4444 goto out; 4445 } 4446 4447 while (!end) { 4448 u64 offset_in_extent = 0; 4449 4450 /* break if the extent we found is outside the range */ 4451 if (em->start >= max || extent_map_end(em) < off) 4452 break; 4453 4454 /* 4455 * get_extent may return an extent that starts before our 4456 * requested range. We have to make sure the ranges 4457 * we return to fiemap always move forward and don't 4458 * overlap, so adjust the offsets here 4459 */ 4460 em_start = max(em->start, off); 4461 4462 /* 4463 * record the offset from the start of the extent 4464 * for adjusting the disk offset below. Only do this if the 4465 * extent isn't compressed since our in ram offset may be past 4466 * what we have actually allocated on disk. 4467 */ 4468 if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 4469 offset_in_extent = em_start - em->start; 4470 em_end = extent_map_end(em); 4471 em_len = em_end - em_start; 4472 disko = 0; 4473 flags = 0; 4474 4475 /* 4476 * bump off for our next call to get_extent 4477 */ 4478 off = extent_map_end(em); 4479 if (off >= max) 4480 end = 1; 4481 4482 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 4483 end = 1; 4484 flags |= FIEMAP_EXTENT_LAST; 4485 } else if (em->block_start == EXTENT_MAP_INLINE) { 4486 flags |= (FIEMAP_EXTENT_DATA_INLINE | 4487 FIEMAP_EXTENT_NOT_ALIGNED); 4488 } else if (em->block_start == EXTENT_MAP_DELALLOC) { 4489 flags |= (FIEMAP_EXTENT_DELALLOC | 4490 FIEMAP_EXTENT_UNKNOWN); 4491 } else if (fieinfo->fi_extents_max) { 4492 struct btrfs_trans_handle *trans; 4493 4494 u64 bytenr = em->block_start - 4495 (em->start - em->orig_start); 4496 4497 disko = em->block_start + offset_in_extent; 4498 4499 /* 4500 * We need a trans handle to get delayed refs 4501 */ 4502 trans = btrfs_join_transaction(root); 4503 /* 4504 * It's OK if we can't start a trans we can still check 4505 * from commit_root 4506 */ 4507 if (IS_ERR(trans)) 4508 trans = NULL; 4509 4510 /* 4511 * As btrfs supports shared space, this information 4512 * can be exported to userspace tools via 4513 * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0 4514 * then we're just getting a count and we can skip the 4515 * lookup stuff. 4516 */ 4517 ret = btrfs_check_shared(trans, root->fs_info, 4518 root->objectid, 4519 btrfs_ino(BTRFS_I(inode)), bytenr); 4520 if (trans) 4521 btrfs_end_transaction(trans); 4522 if (ret < 0) 4523 goto out_free; 4524 if (ret) 4525 flags |= FIEMAP_EXTENT_SHARED; 4526 ret = 0; 4527 } 4528 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 4529 flags |= FIEMAP_EXTENT_ENCODED; 4530 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4531 flags |= FIEMAP_EXTENT_UNWRITTEN; 4532 4533 free_extent_map(em); 4534 em = NULL; 4535 if ((em_start >= last) || em_len == (u64)-1 || 4536 (last == (u64)-1 && isize <= em_end)) { 4537 flags |= FIEMAP_EXTENT_LAST; 4538 end = 1; 4539 } 4540 4541 /* now scan forward to see if this is really the last extent. 
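 * If get_extent_skip_holes() comes back empty there is nothing left past
 * 'off', so the entry we are about to hand to fiemap_fill_next_extent()
 * below gets FIEMAP_EXTENT_LAST.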
*/ 4542 em = get_extent_skip_holes(inode, off, last_for_get_extent, 4543 get_extent); 4544 if (IS_ERR(em)) { 4545 ret = PTR_ERR(em); 4546 goto out; 4547 } 4548 if (!em) { 4549 flags |= FIEMAP_EXTENT_LAST; 4550 end = 1; 4551 } 4552 ret = fiemap_fill_next_extent(fieinfo, em_start, disko, 4553 em_len, flags); 4554 if (ret) { 4555 if (ret == 1) 4556 ret = 0; 4557 goto out_free; 4558 } 4559 } 4560 out_free: 4561 free_extent_map(em); 4562 out: 4563 btrfs_free_path(path); 4564 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, 4565 &cached_state, GFP_NOFS); 4566 return ret; 4567 } 4568 4569 static void __free_extent_buffer(struct extent_buffer *eb) 4570 { 4571 btrfs_leak_debug_del(&eb->leak_list); 4572 kmem_cache_free(extent_buffer_cache, eb); 4573 } 4574 4575 int extent_buffer_under_io(struct extent_buffer *eb) 4576 { 4577 return (atomic_read(&eb->io_pages) || 4578 test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || 4579 test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 4580 } 4581 4582 /* 4583 * Helper for releasing extent buffer page. 4584 */ 4585 static void btrfs_release_extent_buffer_page(struct extent_buffer *eb) 4586 { 4587 unsigned long index; 4588 struct page *page; 4589 int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); 4590 4591 BUG_ON(extent_buffer_under_io(eb)); 4592 4593 index = num_extent_pages(eb->start, eb->len); 4594 if (index == 0) 4595 return; 4596 4597 do { 4598 index--; 4599 page = eb->pages[index]; 4600 if (!page) 4601 continue; 4602 if (mapped) 4603 spin_lock(&page->mapping->private_lock); 4604 /* 4605 * We do this since we'll remove the pages after we've 4606 * removed the eb from the radix tree, so we could race 4607 * and have this page now attached to the new eb. So 4608 * only clear page_private if it's still connected to 4609 * this eb. 4610 */ 4611 if (PagePrivate(page) && 4612 page->private == (unsigned long)eb) { 4613 BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 4614 BUG_ON(PageDirty(page)); 4615 BUG_ON(PageWriteback(page)); 4616 /* 4617 * We need to make sure we haven't be attached 4618 * to a new eb. 4619 */ 4620 ClearPagePrivate(page); 4621 set_page_private(page, 0); 4622 /* One for the page private */ 4623 put_page(page); 4624 } 4625 4626 if (mapped) 4627 spin_unlock(&page->mapping->private_lock); 4628 4629 /* One for when we allocated the page */ 4630 put_page(page); 4631 } while (index != 0); 4632 } 4633 4634 /* 4635 * Helper for releasing the extent buffer. 
4636 */ 4637 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) 4638 { 4639 btrfs_release_extent_buffer_page(eb); 4640 __free_extent_buffer(eb); 4641 } 4642 4643 static struct extent_buffer * 4644 __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, 4645 unsigned long len) 4646 { 4647 struct extent_buffer *eb = NULL; 4648 4649 eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL); 4650 eb->start = start; 4651 eb->len = len; 4652 eb->fs_info = fs_info; 4653 eb->bflags = 0; 4654 rwlock_init(&eb->lock); 4655 atomic_set(&eb->write_locks, 0); 4656 atomic_set(&eb->read_locks, 0); 4657 atomic_set(&eb->blocking_readers, 0); 4658 atomic_set(&eb->blocking_writers, 0); 4659 atomic_set(&eb->spinning_readers, 0); 4660 atomic_set(&eb->spinning_writers, 0); 4661 eb->lock_nested = 0; 4662 init_waitqueue_head(&eb->write_lock_wq); 4663 init_waitqueue_head(&eb->read_lock_wq); 4664 4665 btrfs_leak_debug_add(&eb->leak_list, &buffers); 4666 4667 spin_lock_init(&eb->refs_lock); 4668 atomic_set(&eb->refs, 1); 4669 atomic_set(&eb->io_pages, 0); 4670 4671 /* 4672 * Sanity checks, currently the maximum is 64k covered by 16x 4k pages 4673 */ 4674 BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE 4675 > MAX_INLINE_EXTENT_BUFFER_SIZE); 4676 BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE); 4677 4678 return eb; 4679 } 4680 4681 struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) 4682 { 4683 unsigned long i; 4684 struct page *p; 4685 struct extent_buffer *new; 4686 unsigned long num_pages = num_extent_pages(src->start, src->len); 4687 4688 new = __alloc_extent_buffer(src->fs_info, src->start, src->len); 4689 if (new == NULL) 4690 return NULL; 4691 4692 for (i = 0; i < num_pages; i++) { 4693 p = alloc_page(GFP_NOFS); 4694 if (!p) { 4695 btrfs_release_extent_buffer(new); 4696 return NULL; 4697 } 4698 attach_extent_buffer_page(new, p); 4699 WARN_ON(PageDirty(p)); 4700 SetPageUptodate(p); 4701 new->pages[i] = p; 4702 copy_page(page_address(p), page_address(src->pages[i])); 4703 } 4704 4705 set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags); 4706 set_bit(EXTENT_BUFFER_DUMMY, &new->bflags); 4707 4708 return new; 4709 } 4710 4711 struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, 4712 u64 start, unsigned long len) 4713 { 4714 struct extent_buffer *eb; 4715 unsigned long num_pages; 4716 unsigned long i; 4717 4718 num_pages = num_extent_pages(start, len); 4719 4720 eb = __alloc_extent_buffer(fs_info, start, len); 4721 if (!eb) 4722 return NULL; 4723 4724 for (i = 0; i < num_pages; i++) { 4725 eb->pages[i] = alloc_page(GFP_NOFS); 4726 if (!eb->pages[i]) 4727 goto err; 4728 } 4729 set_extent_buffer_uptodate(eb); 4730 btrfs_set_header_nritems(eb, 0); 4731 set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); 4732 4733 return eb; 4734 err: 4735 for (; i > 0; i--) 4736 __free_page(eb->pages[i - 1]); 4737 __free_extent_buffer(eb); 4738 return NULL; 4739 } 4740 4741 struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, 4742 u64 start) 4743 { 4744 return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize); 4745 } 4746 4747 static void check_buffer_tree_ref(struct extent_buffer *eb) 4748 { 4749 int refs; 4750 /* the ref bit is tricky. We have to make sure it is set 4751 * if we have the buffer dirty. 
Otherwise the 4752 * code to free a buffer can end up dropping a dirty 4753 * page 4754 * 4755 * Once the ref bit is set, it won't go away while the 4756 * buffer is dirty or in writeback, and it also won't 4757 * go away while we have the reference count on the 4758 * eb bumped. 4759 * 4760 * We can't just set the ref bit without bumping the 4761 * ref on the eb because free_extent_buffer might 4762 * see the ref bit and try to clear it. If this happens 4763 * free_extent_buffer might end up dropping our original 4764 * ref by mistake and freeing the page before we are able 4765 * to add one more ref. 4766 * 4767 * So bump the ref count first, then set the bit. If someone 4768 * beat us to it, drop the ref we added. 4769 */ 4770 refs = atomic_read(&eb->refs); 4771 if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 4772 return; 4773 4774 spin_lock(&eb->refs_lock); 4775 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 4776 atomic_inc(&eb->refs); 4777 spin_unlock(&eb->refs_lock); 4778 } 4779 4780 static void mark_extent_buffer_accessed(struct extent_buffer *eb, 4781 struct page *accessed) 4782 { 4783 unsigned long num_pages, i; 4784 4785 check_buffer_tree_ref(eb); 4786 4787 num_pages = num_extent_pages(eb->start, eb->len); 4788 for (i = 0; i < num_pages; i++) { 4789 struct page *p = eb->pages[i]; 4790 4791 if (p != accessed) 4792 mark_page_accessed(p); 4793 } 4794 } 4795 4796 struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, 4797 u64 start) 4798 { 4799 struct extent_buffer *eb; 4800 4801 rcu_read_lock(); 4802 eb = radix_tree_lookup(&fs_info->buffer_radix, 4803 start >> PAGE_SHIFT); 4804 if (eb && atomic_inc_not_zero(&eb->refs)) { 4805 rcu_read_unlock(); 4806 /* 4807 * Lock our eb's refs_lock to avoid races with 4808 * free_extent_buffer. When we get our eb it might be flagged 4809 * with EXTENT_BUFFER_STALE and another task running 4810 * free_extent_buffer might have seen that flag set, 4811 * eb->refs == 2, that the buffer isn't under IO (dirty and 4812 * writeback flags not set) and it's still in the tree (flag 4813 * EXTENT_BUFFER_TREE_REF set), therefore being in the process 4814 * of decrementing the extent buffer's reference count twice. 4815 * So here we could race and increment the eb's reference count, 4816 * clear its stale flag, mark it as dirty and drop our reference 4817 * before the other task finishes executing free_extent_buffer, 4818 * which would later result in an attempt to free an extent 4819 * buffer that is dirty. 
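 *
 * Taking eb->refs_lock and dropping it again right away below makes us
 * wait for any such free_extent_buffer() call that is already inside its
 * refs_lock protected section before we hand the buffer to our caller.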
4820 */ 4821 if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) { 4822 spin_lock(&eb->refs_lock); 4823 spin_unlock(&eb->refs_lock); 4824 } 4825 mark_extent_buffer_accessed(eb, NULL); 4826 return eb; 4827 } 4828 rcu_read_unlock(); 4829 4830 return NULL; 4831 } 4832 4833 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 4834 struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, 4835 u64 start) 4836 { 4837 struct extent_buffer *eb, *exists = NULL; 4838 int ret; 4839 4840 eb = find_extent_buffer(fs_info, start); 4841 if (eb) 4842 return eb; 4843 eb = alloc_dummy_extent_buffer(fs_info, start); 4844 if (!eb) 4845 return NULL; 4846 eb->fs_info = fs_info; 4847 again: 4848 ret = radix_tree_preload(GFP_NOFS); 4849 if (ret) 4850 goto free_eb; 4851 spin_lock(&fs_info->buffer_lock); 4852 ret = radix_tree_insert(&fs_info->buffer_radix, 4853 start >> PAGE_SHIFT, eb); 4854 spin_unlock(&fs_info->buffer_lock); 4855 radix_tree_preload_end(); 4856 if (ret == -EEXIST) { 4857 exists = find_extent_buffer(fs_info, start); 4858 if (exists) 4859 goto free_eb; 4860 else 4861 goto again; 4862 } 4863 check_buffer_tree_ref(eb); 4864 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); 4865 4866 /* 4867 * We will free dummy extent buffer's if they come into 4868 * free_extent_buffer with a ref count of 2, but if we are using this we 4869 * want the buffers to stay in memory until we're done with them, so 4870 * bump the ref count again. 4871 */ 4872 atomic_inc(&eb->refs); 4873 return eb; 4874 free_eb: 4875 btrfs_release_extent_buffer(eb); 4876 return exists; 4877 } 4878 #endif 4879 4880 struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, 4881 u64 start) 4882 { 4883 unsigned long len = fs_info->nodesize; 4884 unsigned long num_pages = num_extent_pages(start, len); 4885 unsigned long i; 4886 unsigned long index = start >> PAGE_SHIFT; 4887 struct extent_buffer *eb; 4888 struct extent_buffer *exists = NULL; 4889 struct page *p; 4890 struct address_space *mapping = fs_info->btree_inode->i_mapping; 4891 int uptodate = 1; 4892 int ret; 4893 4894 if (!IS_ALIGNED(start, fs_info->sectorsize)) { 4895 btrfs_err(fs_info, "bad tree block start %llu", start); 4896 return ERR_PTR(-EINVAL); 4897 } 4898 4899 eb = find_extent_buffer(fs_info, start); 4900 if (eb) 4901 return eb; 4902 4903 eb = __alloc_extent_buffer(fs_info, start, len); 4904 if (!eb) 4905 return ERR_PTR(-ENOMEM); 4906 4907 for (i = 0; i < num_pages; i++, index++) { 4908 p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL); 4909 if (!p) { 4910 exists = ERR_PTR(-ENOMEM); 4911 goto free_eb; 4912 } 4913 4914 spin_lock(&mapping->private_lock); 4915 if (PagePrivate(p)) { 4916 /* 4917 * We could have already allocated an eb for this page 4918 * and attached one so lets see if we can get a ref on 4919 * the existing eb, and if we can we know it's good and 4920 * we can just return that one, else we know we can just 4921 * overwrite page->private. 4922 */ 4923 exists = (struct extent_buffer *)p->private; 4924 if (atomic_inc_not_zero(&exists->refs)) { 4925 spin_unlock(&mapping->private_lock); 4926 unlock_page(p); 4927 put_page(p); 4928 mark_extent_buffer_accessed(exists, p); 4929 goto free_eb; 4930 } 4931 exists = NULL; 4932 4933 /* 4934 * Do this so attach doesn't complain and we need to 4935 * drop the ref the old guy had. 
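 * (the put_page() right below is what drops that old reference)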
4936 */ 4937 ClearPagePrivate(p); 4938 WARN_ON(PageDirty(p)); 4939 put_page(p); 4940 } 4941 attach_extent_buffer_page(eb, p); 4942 spin_unlock(&mapping->private_lock); 4943 WARN_ON(PageDirty(p)); 4944 eb->pages[i] = p; 4945 if (!PageUptodate(p)) 4946 uptodate = 0; 4947 4948 /* 4949 * see below about how we avoid a nasty race with release page 4950 * and why we unlock later 4951 */ 4952 } 4953 if (uptodate) 4954 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 4955 again: 4956 ret = radix_tree_preload(GFP_NOFS); 4957 if (ret) { 4958 exists = ERR_PTR(ret); 4959 goto free_eb; 4960 } 4961 4962 spin_lock(&fs_info->buffer_lock); 4963 ret = radix_tree_insert(&fs_info->buffer_radix, 4964 start >> PAGE_SHIFT, eb); 4965 spin_unlock(&fs_info->buffer_lock); 4966 radix_tree_preload_end(); 4967 if (ret == -EEXIST) { 4968 exists = find_extent_buffer(fs_info, start); 4969 if (exists) 4970 goto free_eb; 4971 else 4972 goto again; 4973 } 4974 /* add one reference for the tree */ 4975 check_buffer_tree_ref(eb); 4976 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); 4977 4978 /* 4979 * there is a race where release page may have 4980 * tried to find this extent buffer in the radix 4981 * but failed. It will tell the VM it is safe to 4982 * reclaim the, and it will clear the page private bit. 4983 * We must make sure to set the page private bit properly 4984 * after the extent buffer is in the radix tree so 4985 * it doesn't get lost 4986 */ 4987 SetPageChecked(eb->pages[0]); 4988 for (i = 1; i < num_pages; i++) { 4989 p = eb->pages[i]; 4990 ClearPageChecked(p); 4991 unlock_page(p); 4992 } 4993 unlock_page(eb->pages[0]); 4994 return eb; 4995 4996 free_eb: 4997 WARN_ON(!atomic_dec_and_test(&eb->refs)); 4998 for (i = 0; i < num_pages; i++) { 4999 if (eb->pages[i]) 5000 unlock_page(eb->pages[i]); 5001 } 5002 5003 btrfs_release_extent_buffer(eb); 5004 return exists; 5005 } 5006 5007 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) 5008 { 5009 struct extent_buffer *eb = 5010 container_of(head, struct extent_buffer, rcu_head); 5011 5012 __free_extent_buffer(eb); 5013 } 5014 5015 /* Expects to have eb->eb_lock already held */ 5016 static int release_extent_buffer(struct extent_buffer *eb) 5017 { 5018 WARN_ON(atomic_read(&eb->refs) == 0); 5019 if (atomic_dec_and_test(&eb->refs)) { 5020 if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) { 5021 struct btrfs_fs_info *fs_info = eb->fs_info; 5022 5023 spin_unlock(&eb->refs_lock); 5024 5025 spin_lock(&fs_info->buffer_lock); 5026 radix_tree_delete(&fs_info->buffer_radix, 5027 eb->start >> PAGE_SHIFT); 5028 spin_unlock(&fs_info->buffer_lock); 5029 } else { 5030 spin_unlock(&eb->refs_lock); 5031 } 5032 5033 /* Should be safe to release our pages at this point */ 5034 btrfs_release_extent_buffer_page(eb); 5035 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 5036 if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))) { 5037 __free_extent_buffer(eb); 5038 return 1; 5039 } 5040 #endif 5041 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); 5042 return 1; 5043 } 5044 spin_unlock(&eb->refs_lock); 5045 5046 return 0; 5047 } 5048 5049 void free_extent_buffer(struct extent_buffer *eb) 5050 { 5051 int refs; 5052 int old; 5053 if (!eb) 5054 return; 5055 5056 while (1) { 5057 refs = atomic_read(&eb->refs); 5058 if (refs <= 3) 5059 break; 5060 old = atomic_cmpxchg(&eb->refs, refs, refs - 1); 5061 if (old == refs) 5062 return; 5063 } 5064 5065 spin_lock(&eb->refs_lock); 5066 if (atomic_read(&eb->refs) == 2 && 5067 test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) 5068 
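		/*
		 * Dummy buffers (see the comment in alloc_test_extent_buffer())
		 * take an extra reference so that they stay in memory until
		 * their user is done with them; with only that reference and
		 * ours left, drop the extra one here and let
		 * release_extent_buffer() below do the rest.
		 */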
atomic_dec(&eb->refs); 5069 5070 if (atomic_read(&eb->refs) == 2 && 5071 test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && 5072 !extent_buffer_under_io(eb) && 5073 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 5074 atomic_dec(&eb->refs); 5075 5076 /* 5077 * I know this is terrible, but it's temporary until we stop tracking 5078 * the uptodate bits and such for the extent buffers. 5079 */ 5080 release_extent_buffer(eb); 5081 } 5082 5083 void free_extent_buffer_stale(struct extent_buffer *eb) 5084 { 5085 if (!eb) 5086 return; 5087 5088 spin_lock(&eb->refs_lock); 5089 set_bit(EXTENT_BUFFER_STALE, &eb->bflags); 5090 5091 if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) && 5092 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 5093 atomic_dec(&eb->refs); 5094 release_extent_buffer(eb); 5095 } 5096 5097 void clear_extent_buffer_dirty(struct extent_buffer *eb) 5098 { 5099 unsigned long i; 5100 unsigned long num_pages; 5101 struct page *page; 5102 5103 num_pages = num_extent_pages(eb->start, eb->len); 5104 5105 for (i = 0; i < num_pages; i++) { 5106 page = eb->pages[i]; 5107 if (!PageDirty(page)) 5108 continue; 5109 5110 lock_page(page); 5111 WARN_ON(!PagePrivate(page)); 5112 5113 clear_page_dirty_for_io(page); 5114 spin_lock_irq(&page->mapping->tree_lock); 5115 if (!PageDirty(page)) { 5116 radix_tree_tag_clear(&page->mapping->page_tree, 5117 page_index(page), 5118 PAGECACHE_TAG_DIRTY); 5119 } 5120 spin_unlock_irq(&page->mapping->tree_lock); 5121 ClearPageError(page); 5122 unlock_page(page); 5123 } 5124 WARN_ON(atomic_read(&eb->refs) == 0); 5125 } 5126 5127 int set_extent_buffer_dirty(struct extent_buffer *eb) 5128 { 5129 unsigned long i; 5130 unsigned long num_pages; 5131 int was_dirty = 0; 5132 5133 check_buffer_tree_ref(eb); 5134 5135 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); 5136 5137 num_pages = num_extent_pages(eb->start, eb->len); 5138 WARN_ON(atomic_read(&eb->refs) == 0); 5139 WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); 5140 5141 for (i = 0; i < num_pages; i++) 5142 set_page_dirty(eb->pages[i]); 5143 return was_dirty; 5144 } 5145 5146 void clear_extent_buffer_uptodate(struct extent_buffer *eb) 5147 { 5148 unsigned long i; 5149 struct page *page; 5150 unsigned long num_pages; 5151 5152 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 5153 num_pages = num_extent_pages(eb->start, eb->len); 5154 for (i = 0; i < num_pages; i++) { 5155 page = eb->pages[i]; 5156 if (page) 5157 ClearPageUptodate(page); 5158 } 5159 } 5160 5161 void set_extent_buffer_uptodate(struct extent_buffer *eb) 5162 { 5163 unsigned long i; 5164 struct page *page; 5165 unsigned long num_pages; 5166 5167 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 5168 num_pages = num_extent_pages(eb->start, eb->len); 5169 for (i = 0; i < num_pages; i++) { 5170 page = eb->pages[i]; 5171 SetPageUptodate(page); 5172 } 5173 } 5174 5175 int extent_buffer_uptodate(struct extent_buffer *eb) 5176 { 5177 return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 5178 } 5179 5180 int read_extent_buffer_pages(struct extent_io_tree *tree, 5181 struct extent_buffer *eb, int wait, 5182 get_extent_t *get_extent, int mirror_num) 5183 { 5184 unsigned long i; 5185 struct page *page; 5186 int err; 5187 int ret = 0; 5188 int locked_pages = 0; 5189 int all_uptodate = 1; 5190 unsigned long num_pages; 5191 unsigned long num_reads = 0; 5192 struct bio *bio = NULL; 5193 unsigned long bio_flags = 0; 5194 5195 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 5196 return 0; 5197 5198 num_pages = 
num_extent_pages(eb->start, eb->len); 5199 for (i = 0; i < num_pages; i++) { 5200 page = eb->pages[i]; 5201 if (wait == WAIT_NONE) { 5202 if (!trylock_page(page)) 5203 goto unlock_exit; 5204 } else { 5205 lock_page(page); 5206 } 5207 locked_pages++; 5208 } 5209 /* 5210 * We need to firstly lock all pages to make sure that 5211 * the uptodate bit of our pages won't be affected by 5212 * clear_extent_buffer_uptodate(). 5213 */ 5214 for (i = 0; i < num_pages; i++) { 5215 page = eb->pages[i]; 5216 if (!PageUptodate(page)) { 5217 num_reads++; 5218 all_uptodate = 0; 5219 } 5220 } 5221 5222 if (all_uptodate) { 5223 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 5224 goto unlock_exit; 5225 } 5226 5227 clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); 5228 eb->read_mirror = 0; 5229 atomic_set(&eb->io_pages, num_reads); 5230 for (i = 0; i < num_pages; i++) { 5231 page = eb->pages[i]; 5232 5233 if (!PageUptodate(page)) { 5234 if (ret) { 5235 atomic_dec(&eb->io_pages); 5236 unlock_page(page); 5237 continue; 5238 } 5239 5240 ClearPageError(page); 5241 err = __extent_read_full_page(tree, page, 5242 get_extent, &bio, 5243 mirror_num, &bio_flags, 5244 REQ_META); 5245 if (err) { 5246 ret = err; 5247 /* 5248 * We use &bio in above __extent_read_full_page, 5249 * so we ensure that if it returns error, the 5250 * current page fails to add itself to bio and 5251 * it's been unlocked. 5252 * 5253 * We must dec io_pages by ourselves. 5254 */ 5255 atomic_dec(&eb->io_pages); 5256 } 5257 } else { 5258 unlock_page(page); 5259 } 5260 } 5261 5262 if (bio) { 5263 err = submit_one_bio(bio, mirror_num, bio_flags); 5264 if (err) 5265 return err; 5266 } 5267 5268 if (ret || wait != WAIT_COMPLETE) 5269 return ret; 5270 5271 for (i = 0; i < num_pages; i++) { 5272 page = eb->pages[i]; 5273 wait_on_page_locked(page); 5274 if (!PageUptodate(page)) 5275 ret = -EIO; 5276 } 5277 5278 return ret; 5279 5280 unlock_exit: 5281 while (locked_pages > 0) { 5282 locked_pages--; 5283 page = eb->pages[locked_pages]; 5284 unlock_page(page); 5285 } 5286 return ret; 5287 } 5288 5289 void read_extent_buffer(struct extent_buffer *eb, void *dstv, 5290 unsigned long start, 5291 unsigned long len) 5292 { 5293 size_t cur; 5294 size_t offset; 5295 struct page *page; 5296 char *kaddr; 5297 char *dst = (char *)dstv; 5298 size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); 5299 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5300 5301 WARN_ON(start > eb->len); 5302 WARN_ON(start + len > eb->start + eb->len); 5303 5304 offset = (start_offset + start) & (PAGE_SIZE - 1); 5305 5306 while (len > 0) { 5307 page = eb->pages[i]; 5308 5309 cur = min(len, (PAGE_SIZE - offset)); 5310 kaddr = page_address(page); 5311 memcpy(dst, kaddr + offset, cur); 5312 5313 dst += cur; 5314 len -= cur; 5315 offset = 0; 5316 i++; 5317 } 5318 } 5319 5320 int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv, 5321 unsigned long start, 5322 unsigned long len) 5323 { 5324 size_t cur; 5325 size_t offset; 5326 struct page *page; 5327 char *kaddr; 5328 char __user *dst = (char __user *)dstv; 5329 size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); 5330 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5331 int ret = 0; 5332 5333 WARN_ON(start > eb->len); 5334 WARN_ON(start + len > eb->start + eb->len); 5335 5336 offset = (start_offset + start) & (PAGE_SIZE - 1); 5337 5338 while (len > 0) { 5339 page = eb->pages[i]; 5340 5341 cur = min(len, (PAGE_SIZE - offset)); 5342 kaddr = page_address(page); 5343 if (copy_to_user(dst, kaddr + offset, cur)) { 
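			/*
			 * copy_to_user() returns the number of bytes it could
			 * not copy; any short copy here means the user buffer
			 * faulted, so bail out with -EFAULT.
			 */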
5344 ret = -EFAULT; 5345 break; 5346 } 5347 5348 dst += cur; 5349 len -= cur; 5350 offset = 0; 5351 i++; 5352 } 5353 5354 return ret; 5355 } 5356 5357 /* 5358 * return 0 if the item is found within a page. 5359 * return 1 if the item spans two pages. 5360 * return -EINVAL otherwise. 5361 */ 5362 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, 5363 unsigned long min_len, char **map, 5364 unsigned long *map_start, 5365 unsigned long *map_len) 5366 { 5367 size_t offset = start & (PAGE_SIZE - 1); 5368 char *kaddr; 5369 struct page *p; 5370 size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); 5371 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5372 unsigned long end_i = (start_offset + start + min_len - 1) >> 5373 PAGE_SHIFT; 5374 5375 if (i != end_i) 5376 return 1; 5377 5378 if (i == 0) { 5379 offset = start_offset; 5380 *map_start = 0; 5381 } else { 5382 offset = 0; 5383 *map_start = ((u64)i << PAGE_SHIFT) - start_offset; 5384 } 5385 5386 if (start + min_len > eb->len) { 5387 WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n", 5388 eb->start, eb->len, start, min_len); 5389 return -EINVAL; 5390 } 5391 5392 p = eb->pages[i]; 5393 kaddr = page_address(p); 5394 *map = kaddr + offset; 5395 *map_len = PAGE_SIZE - offset; 5396 return 0; 5397 } 5398 5399 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, 5400 unsigned long start, 5401 unsigned long len) 5402 { 5403 size_t cur; 5404 size_t offset; 5405 struct page *page; 5406 char *kaddr; 5407 char *ptr = (char *)ptrv; 5408 size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); 5409 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5410 int ret = 0; 5411 5412 WARN_ON(start > eb->len); 5413 WARN_ON(start + len > eb->start + eb->len); 5414 5415 offset = (start_offset + start) & (PAGE_SIZE - 1); 5416 5417 while (len > 0) { 5418 page = eb->pages[i]; 5419 5420 cur = min(len, (PAGE_SIZE - offset)); 5421 5422 kaddr = page_address(page); 5423 ret = memcmp(ptr, kaddr + offset, cur); 5424 if (ret) 5425 break; 5426 5427 ptr += cur; 5428 len -= cur; 5429 offset = 0; 5430 i++; 5431 } 5432 return ret; 5433 } 5434 5435 void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb, 5436 const void *srcv) 5437 { 5438 char *kaddr; 5439 5440 WARN_ON(!PageUptodate(eb->pages[0])); 5441 kaddr = page_address(eb->pages[0]); 5442 memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv, 5443 BTRFS_FSID_SIZE); 5444 } 5445 5446 void write_extent_buffer_fsid(struct extent_buffer *eb, const void *srcv) 5447 { 5448 char *kaddr; 5449 5450 WARN_ON(!PageUptodate(eb->pages[0])); 5451 kaddr = page_address(eb->pages[0]); 5452 memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv, 5453 BTRFS_FSID_SIZE); 5454 } 5455 5456 void write_extent_buffer(struct extent_buffer *eb, const void *srcv, 5457 unsigned long start, unsigned long len) 5458 { 5459 size_t cur; 5460 size_t offset; 5461 struct page *page; 5462 char *kaddr; 5463 char *src = (char *)srcv; 5464 size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); 5465 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5466 5467 WARN_ON(start > eb->len); 5468 WARN_ON(start + len > eb->start + eb->len); 5469 5470 offset = (start_offset + start) & (PAGE_SIZE - 1); 5471 5472 while (len > 0) { 5473 page = eb->pages[i]; 5474 WARN_ON(!PageUptodate(page)); 5475 5476 cur = min(len, PAGE_SIZE - offset); 5477 kaddr = page_address(page); 5478 memcpy(kaddr + offset, src, cur); 5479 5480 src += cur; 5481 len -= cur; 5482 offset = 0; 5483 i++; 
5484 } 5485 } 5486 5487 void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start, 5488 unsigned long len) 5489 { 5490 size_t cur; 5491 size_t offset; 5492 struct page *page; 5493 char *kaddr; 5494 size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); 5495 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5496 5497 WARN_ON(start > eb->len); 5498 WARN_ON(start + len > eb->start + eb->len); 5499 5500 offset = (start_offset + start) & (PAGE_SIZE - 1); 5501 5502 while (len > 0) { 5503 page = eb->pages[i]; 5504 WARN_ON(!PageUptodate(page)); 5505 5506 cur = min(len, PAGE_SIZE - offset); 5507 kaddr = page_address(page); 5508 memset(kaddr + offset, 0, cur); 5509 5510 len -= cur; 5511 offset = 0; 5512 i++; 5513 } 5514 } 5515 5516 void copy_extent_buffer_full(struct extent_buffer *dst, 5517 struct extent_buffer *src) 5518 { 5519 int i; 5520 unsigned num_pages; 5521 5522 ASSERT(dst->len == src->len); 5523 5524 num_pages = num_extent_pages(dst->start, dst->len); 5525 for (i = 0; i < num_pages; i++) 5526 copy_page(page_address(dst->pages[i]), 5527 page_address(src->pages[i])); 5528 } 5529 5530 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, 5531 unsigned long dst_offset, unsigned long src_offset, 5532 unsigned long len) 5533 { 5534 u64 dst_len = dst->len; 5535 size_t cur; 5536 size_t offset; 5537 struct page *page; 5538 char *kaddr; 5539 size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); 5540 unsigned long i = (start_offset + dst_offset) >> PAGE_SHIFT; 5541 5542 WARN_ON(src->len != dst_len); 5543 5544 offset = (start_offset + dst_offset) & 5545 (PAGE_SIZE - 1); 5546 5547 while (len > 0) { 5548 page = dst->pages[i]; 5549 WARN_ON(!PageUptodate(page)); 5550 5551 cur = min(len, (unsigned long)(PAGE_SIZE - offset)); 5552 5553 kaddr = page_address(page); 5554 read_extent_buffer(src, kaddr + offset, src_offset, cur); 5555 5556 src_offset += cur; 5557 len -= cur; 5558 offset = 0; 5559 i++; 5560 } 5561 } 5562 5563 void le_bitmap_set(u8 *map, unsigned int start, int len) 5564 { 5565 u8 *p = map + BIT_BYTE(start); 5566 const unsigned int size = start + len; 5567 int bits_to_set = BITS_PER_BYTE - (start % BITS_PER_BYTE); 5568 u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(start); 5569 5570 while (len - bits_to_set >= 0) { 5571 *p |= mask_to_set; 5572 len -= bits_to_set; 5573 bits_to_set = BITS_PER_BYTE; 5574 mask_to_set = ~0; 5575 p++; 5576 } 5577 if (len) { 5578 mask_to_set &= BITMAP_LAST_BYTE_MASK(size); 5579 *p |= mask_to_set; 5580 } 5581 } 5582 5583 void le_bitmap_clear(u8 *map, unsigned int start, int len) 5584 { 5585 u8 *p = map + BIT_BYTE(start); 5586 const unsigned int size = start + len; 5587 int bits_to_clear = BITS_PER_BYTE - (start % BITS_PER_BYTE); 5588 u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(start); 5589 5590 while (len - bits_to_clear >= 0) { 5591 *p &= ~mask_to_clear; 5592 len -= bits_to_clear; 5593 bits_to_clear = BITS_PER_BYTE; 5594 mask_to_clear = ~0; 5595 p++; 5596 } 5597 if (len) { 5598 mask_to_clear &= BITMAP_LAST_BYTE_MASK(size); 5599 *p &= ~mask_to_clear; 5600 } 5601 } 5602 5603 /* 5604 * eb_bitmap_offset() - calculate the page and offset of the byte containing the 5605 * given bit number 5606 * @eb: the extent buffer 5607 * @start: offset of the bitmap item in the extent buffer 5608 * @nr: bit number 5609 * @page_index: return index of the page in the extent buffer that contains the 5610 * given bit number 5611 * @page_offset: return offset into the page given by page_index 5612 * 5613 * This helper hides the ugliness of finding the 
byte in an extent buffer which 5614 * contains a given bit. 5615 */ 5616 static inline void eb_bitmap_offset(struct extent_buffer *eb, 5617 unsigned long start, unsigned long nr, 5618 unsigned long *page_index, 5619 size_t *page_offset) 5620 { 5621 size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); 5622 size_t byte_offset = BIT_BYTE(nr); 5623 size_t offset; 5624 5625 /* 5626 * The byte we want is the offset of the extent buffer + the offset of 5627 * the bitmap item in the extent buffer + the offset of the byte in the 5628 * bitmap item. 5629 */ 5630 offset = start_offset + start + byte_offset; 5631 5632 *page_index = offset >> PAGE_SHIFT; 5633 *page_offset = offset & (PAGE_SIZE - 1); 5634 } 5635 5636 /** 5637 * extent_buffer_test_bit - determine whether a bit in a bitmap item is set 5638 * @eb: the extent buffer 5639 * @start: offset of the bitmap item in the extent buffer 5640 * @nr: bit number to test 5641 */ 5642 int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start, 5643 unsigned long nr) 5644 { 5645 u8 *kaddr; 5646 struct page *page; 5647 unsigned long i; 5648 size_t offset; 5649 5650 eb_bitmap_offset(eb, start, nr, &i, &offset); 5651 page = eb->pages[i]; 5652 WARN_ON(!PageUptodate(page)); 5653 kaddr = page_address(page); 5654 return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); 5655 } 5656 5657 /** 5658 * extent_buffer_bitmap_set - set an area of a bitmap 5659 * @eb: the extent buffer 5660 * @start: offset of the bitmap item in the extent buffer 5661 * @pos: bit number of the first bit 5662 * @len: number of bits to set 5663 */ 5664 void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, 5665 unsigned long pos, unsigned long len) 5666 { 5667 u8 *kaddr; 5668 struct page *page; 5669 unsigned long i; 5670 size_t offset; 5671 const unsigned int size = pos + len; 5672 int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE); 5673 u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos); 5674 5675 eb_bitmap_offset(eb, start, pos, &i, &offset); 5676 page = eb->pages[i]; 5677 WARN_ON(!PageUptodate(page)); 5678 kaddr = page_address(page); 5679 5680 while (len >= bits_to_set) { 5681 kaddr[offset] |= mask_to_set; 5682 len -= bits_to_set; 5683 bits_to_set = BITS_PER_BYTE; 5684 mask_to_set = ~0; 5685 if (++offset >= PAGE_SIZE && len > 0) { 5686 offset = 0; 5687 page = eb->pages[++i]; 5688 WARN_ON(!PageUptodate(page)); 5689 kaddr = page_address(page); 5690 } 5691 } 5692 if (len) { 5693 mask_to_set &= BITMAP_LAST_BYTE_MASK(size); 5694 kaddr[offset] |= mask_to_set; 5695 } 5696 } 5697 5698 5699 /** 5700 * extent_buffer_bitmap_clear - clear an area of a bitmap 5701 * @eb: the extent buffer 5702 * @start: offset of the bitmap item in the extent buffer 5703 * @pos: bit number of the first bit 5704 * @len: number of bits to clear 5705 */ 5706 void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start, 5707 unsigned long pos, unsigned long len) 5708 { 5709 u8 *kaddr; 5710 struct page *page; 5711 unsigned long i; 5712 size_t offset; 5713 const unsigned int size = pos + len; 5714 int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE); 5715 u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos); 5716 5717 eb_bitmap_offset(eb, start, pos, &i, &offset); 5718 page = eb->pages[i]; 5719 WARN_ON(!PageUptodate(page)); 5720 kaddr = page_address(page); 5721 5722 while (len >= bits_to_clear) { 5723 kaddr[offset] &= ~mask_to_clear; 5724 len -= bits_to_clear; 5725 bits_to_clear = BITS_PER_BYTE; 5726 mask_to_clear = ~0; 5727 if (++offset >= PAGE_SIZE && len > 0) { 5728 
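			/*
			 * The bitmap item spills over into the next page of
			 * the extent buffer, so switch pages before clearing
			 * the remaining bits.
			 */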
offset = 0; 5729 page = eb->pages[++i]; 5730 WARN_ON(!PageUptodate(page)); 5731 kaddr = page_address(page); 5732 } 5733 } 5734 if (len) { 5735 mask_to_clear &= BITMAP_LAST_BYTE_MASK(size); 5736 kaddr[offset] &= ~mask_to_clear; 5737 } 5738 } 5739 5740 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) 5741 { 5742 unsigned long distance = (src > dst) ? src - dst : dst - src; 5743 return distance < len; 5744 } 5745 5746 static void copy_pages(struct page *dst_page, struct page *src_page, 5747 unsigned long dst_off, unsigned long src_off, 5748 unsigned long len) 5749 { 5750 char *dst_kaddr = page_address(dst_page); 5751 char *src_kaddr; 5752 int must_memmove = 0; 5753 5754 if (dst_page != src_page) { 5755 src_kaddr = page_address(src_page); 5756 } else { 5757 src_kaddr = dst_kaddr; 5758 if (areas_overlap(src_off, dst_off, len)) 5759 must_memmove = 1; 5760 } 5761 5762 if (must_memmove) 5763 memmove(dst_kaddr + dst_off, src_kaddr + src_off, len); 5764 else 5765 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); 5766 } 5767 5768 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 5769 unsigned long src_offset, unsigned long len) 5770 { 5771 struct btrfs_fs_info *fs_info = dst->fs_info; 5772 size_t cur; 5773 size_t dst_off_in_page; 5774 size_t src_off_in_page; 5775 size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); 5776 unsigned long dst_i; 5777 unsigned long src_i; 5778 5779 if (src_offset + len > dst->len) { 5780 btrfs_err(fs_info, 5781 "memmove bogus src_offset %lu move len %lu dst len %lu", 5782 src_offset, len, dst->len); 5783 BUG_ON(1); 5784 } 5785 if (dst_offset + len > dst->len) { 5786 btrfs_err(fs_info, 5787 "memmove bogus dst_offset %lu move len %lu dst len %lu", 5788 dst_offset, len, dst->len); 5789 BUG_ON(1); 5790 } 5791 5792 while (len > 0) { 5793 dst_off_in_page = (start_offset + dst_offset) & 5794 (PAGE_SIZE - 1); 5795 src_off_in_page = (start_offset + src_offset) & 5796 (PAGE_SIZE - 1); 5797 5798 dst_i = (start_offset + dst_offset) >> PAGE_SHIFT; 5799 src_i = (start_offset + src_offset) >> PAGE_SHIFT; 5800 5801 cur = min(len, (unsigned long)(PAGE_SIZE - 5802 src_off_in_page)); 5803 cur = min_t(unsigned long, cur, 5804 (unsigned long)(PAGE_SIZE - dst_off_in_page)); 5805 5806 copy_pages(dst->pages[dst_i], dst->pages[src_i], 5807 dst_off_in_page, src_off_in_page, cur); 5808 5809 src_offset += cur; 5810 dst_offset += cur; 5811 len -= cur; 5812 } 5813 } 5814 5815 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 5816 unsigned long src_offset, unsigned long len) 5817 { 5818 struct btrfs_fs_info *fs_info = dst->fs_info; 5819 size_t cur; 5820 size_t dst_off_in_page; 5821 size_t src_off_in_page; 5822 unsigned long dst_end = dst_offset + len - 1; 5823 unsigned long src_end = src_offset + len - 1; 5824 size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); 5825 unsigned long dst_i; 5826 unsigned long src_i; 5827 5828 if (src_offset + len > dst->len) { 5829 btrfs_err(fs_info, 5830 "memmove bogus src_offset %lu move len %lu len %lu", 5831 src_offset, len, dst->len); 5832 BUG_ON(1); 5833 } 5834 if (dst_offset + len > dst->len) { 5835 btrfs_err(fs_info, 5836 "memmove bogus dst_offset %lu move len %lu len %lu", 5837 dst_offset, len, dst->len); 5838 BUG_ON(1); 5839 } 5840 if (dst_offset < src_offset) { 5841 memcpy_extent_buffer(dst, dst_offset, src_offset, len); 5842 return; 5843 } 5844 while (len > 0) { 5845 dst_i = (start_offset + dst_end) >> PAGE_SHIFT; 5846 src_i = (start_offset + 
src_end) >> PAGE_SHIFT; 5847 5848 dst_off_in_page = (start_offset + dst_end) & 5849 (PAGE_SIZE - 1); 5850 src_off_in_page = (start_offset + src_end) & 5851 (PAGE_SIZE - 1); 5852 5853 cur = min_t(unsigned long, len, src_off_in_page + 1); 5854 cur = min(cur, dst_off_in_page + 1); 5855 copy_pages(dst->pages[dst_i], dst->pages[src_i], 5856 dst_off_in_page - cur + 1, 5857 src_off_in_page - cur + 1, cur); 5858 5859 dst_end -= cur; 5860 src_end -= cur; 5861 len -= cur; 5862 } 5863 } 5864 5865 int try_release_extent_buffer(struct page *page) 5866 { 5867 struct extent_buffer *eb; 5868 5869 /* 5870 * We need to make sure nobody is attaching this page to an eb right 5871 * now. 5872 */ 5873 spin_lock(&page->mapping->private_lock); 5874 if (!PagePrivate(page)) { 5875 spin_unlock(&page->mapping->private_lock); 5876 return 1; 5877 } 5878 5879 eb = (struct extent_buffer *)page->private; 5880 BUG_ON(!eb); 5881 5882 /* 5883 * This is a little awful but should be ok, we need to make sure that 5884 * the eb doesn't disappear out from under us while we're looking at 5885 * this page. 5886 */ 5887 spin_lock(&eb->refs_lock); 5888 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { 5889 spin_unlock(&eb->refs_lock); 5890 spin_unlock(&page->mapping->private_lock); 5891 return 0; 5892 } 5893 spin_unlock(&page->mapping->private_lock); 5894 5895 /* 5896 * If tree ref isn't set then we know the ref on this eb is a real ref, 5897 * so just return, this page will likely be freed soon anyway. 5898 */ 5899 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { 5900 spin_unlock(&eb->refs_lock); 5901 return 0; 5902 } 5903 5904 return release_extent_buffer(eb); 5905 } 5906
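
/*
 * A minimal usage sketch of the extent buffer bitmap helpers above, for
 * illustration only: the function below is hypothetical and not called
 * from anywhere, and it assumes 'eb' is a live, uptodate extent buffer
 * that holds a bitmap item at byte offset 'start'.
 *
 *	static void example_bitmap_usage(struct extent_buffer *eb,
 *					 unsigned long start)
 *	{
 *		// mark bits 0..63 of the bitmap item as set
 *		extent_buffer_bitmap_set(eb, start, 0, 64);
 *		WARN_ON(!extent_buffer_test_bit(eb, start, 10));
 *
 *		// clear bits 8..15 again; bit 10 now reads back as zero
 *		extent_buffer_bitmap_clear(eb, start, 8, 8);
 *		WARN_ON(extent_buffer_test_bit(eb, start, 10));
 *	}
 */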