1 // SPDX-License-Identifier: GPL-2.0 2 3 #include <linux/bitops.h> 4 #include <linux/slab.h> 5 #include <linux/bio.h> 6 #include <linux/mm.h> 7 #include <linux/pagemap.h> 8 #include <linux/page-flags.h> 9 #include <linux/spinlock.h> 10 #include <linux/blkdev.h> 11 #include <linux/swap.h> 12 #include <linux/writeback.h> 13 #include <linux/pagevec.h> 14 #include <linux/prefetch.h> 15 #include <linux/cleancache.h> 16 #include "extent_io.h" 17 #include "extent_map.h" 18 #include "ctree.h" 19 #include "btrfs_inode.h" 20 #include "volumes.h" 21 #include "check-integrity.h" 22 #include "locking.h" 23 #include "rcu-string.h" 24 #include "backref.h" 25 #include "disk-io.h" 26 27 static struct kmem_cache *extent_state_cache; 28 static struct kmem_cache *extent_buffer_cache; 29 static struct bio_set btrfs_bioset; 30 31 static inline bool extent_state_in_tree(const struct extent_state *state) 32 { 33 return !RB_EMPTY_NODE(&state->rb_node); 34 } 35 36 #ifdef CONFIG_BTRFS_DEBUG 37 static LIST_HEAD(buffers); 38 static LIST_HEAD(states); 39 40 static DEFINE_SPINLOCK(leak_lock); 41 42 static inline 43 void btrfs_leak_debug_add(struct list_head *new, struct list_head *head) 44 { 45 unsigned long flags; 46 47 spin_lock_irqsave(&leak_lock, flags); 48 list_add(new, head); 49 spin_unlock_irqrestore(&leak_lock, flags); 50 } 51 52 static inline 53 void btrfs_leak_debug_del(struct list_head *entry) 54 { 55 unsigned long flags; 56 57 spin_lock_irqsave(&leak_lock, flags); 58 list_del(entry); 59 spin_unlock_irqrestore(&leak_lock, flags); 60 } 61 62 static inline 63 void btrfs_leak_debug_check(void) 64 { 65 struct extent_state *state; 66 struct extent_buffer *eb; 67 68 while (!list_empty(&states)) { 69 state = list_entry(states.next, struct extent_state, leak_list); 70 pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n", 71 state->start, state->end, state->state, 72 extent_state_in_tree(state), 73 refcount_read(&state->refs)); 74 list_del(&state->leak_list); 75 kmem_cache_free(extent_state_cache, state); 76 } 77 78 while (!list_empty(&buffers)) { 79 eb = list_entry(buffers.next, struct extent_buffer, leak_list); 80 pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n", 81 eb->start, eb->len, atomic_read(&eb->refs), eb->bflags); 82 list_del(&eb->leak_list); 83 kmem_cache_free(extent_buffer_cache, eb); 84 } 85 } 86 87 #define btrfs_debug_check_extent_io_range(tree, start, end) \ 88 __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end)) 89 static inline void __btrfs_debug_check_extent_io_range(const char *caller, 90 struct extent_io_tree *tree, u64 start, u64 end) 91 { 92 struct inode *inode = tree->private_data; 93 u64 isize; 94 95 if (!inode || !is_data_inode(inode)) 96 return; 97 98 isize = i_size_read(inode); 99 if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { 100 btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, 101 "%s: ino %llu isize %llu odd range [%llu,%llu]", 102 caller, btrfs_ino(BTRFS_I(inode)), isize, start, end); 103 } 104 } 105 #else 106 #define btrfs_leak_debug_add(new, head) do {} while (0) 107 #define btrfs_leak_debug_del(entry) do {} while (0) 108 #define btrfs_leak_debug_check() do {} while (0) 109 #define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) 110 #endif 111 112 #define BUFFER_LRU_MAX 64 113 114 struct tree_entry { 115 u64 start; 116 u64 end; 117 struct rb_node rb_node; 118 }; 119 120 struct extent_page_data { 121 struct bio *bio; 122 struct extent_io_tree *tree; 123 /* tells writepage not to lock the state bits for 
this range 124 * it still does the unlocking 125 */ 126 unsigned int extent_locked:1; 127 128 /* tells the submit_bio code to use REQ_SYNC */ 129 unsigned int sync_io:1; 130 }; 131 132 static int add_extent_changeset(struct extent_state *state, unsigned bits, 133 struct extent_changeset *changeset, 134 int set) 135 { 136 int ret; 137 138 if (!changeset) 139 return 0; 140 if (set && (state->state & bits) == bits) 141 return 0; 142 if (!set && (state->state & bits) == 0) 143 return 0; 144 changeset->bytes_changed += state->end - state->start + 1; 145 ret = ulist_add(&changeset->range_changed, state->start, state->end, 146 GFP_ATOMIC); 147 return ret; 148 } 149 150 static int __must_check submit_one_bio(struct bio *bio, int mirror_num, 151 unsigned long bio_flags) 152 { 153 blk_status_t ret = 0; 154 struct bio_vec *bvec = bio_last_bvec_all(bio); 155 struct bio_vec bv; 156 struct extent_io_tree *tree = bio->bi_private; 157 u64 start; 158 159 mp_bvec_last_segment(bvec, &bv); 160 start = page_offset(bv.bv_page) + bv.bv_offset; 161 162 bio->bi_private = NULL; 163 164 if (tree->ops) 165 ret = tree->ops->submit_bio_hook(tree->private_data, bio, 166 mirror_num, bio_flags, start); 167 else 168 btrfsic_submit_bio(bio); 169 170 return blk_status_to_errno(ret); 171 } 172 173 static void flush_write_bio(struct extent_page_data *epd) 174 { 175 if (epd->bio) { 176 int ret; 177 178 ret = submit_one_bio(epd->bio, 0, 0); 179 BUG_ON(ret < 0); /* -ENOMEM */ 180 epd->bio = NULL; 181 } 182 } 183 184 int __init extent_io_init(void) 185 { 186 extent_state_cache = kmem_cache_create("btrfs_extent_state", 187 sizeof(struct extent_state), 0, 188 SLAB_MEM_SPREAD, NULL); 189 if (!extent_state_cache) 190 return -ENOMEM; 191 192 extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", 193 sizeof(struct extent_buffer), 0, 194 SLAB_MEM_SPREAD, NULL); 195 if (!extent_buffer_cache) 196 goto free_state_cache; 197 198 if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, 199 offsetof(struct btrfs_io_bio, bio), 200 BIOSET_NEED_BVECS)) 201 goto free_buffer_cache; 202 203 if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE)) 204 goto free_bioset; 205 206 return 0; 207 208 free_bioset: 209 bioset_exit(&btrfs_bioset); 210 211 free_buffer_cache: 212 kmem_cache_destroy(extent_buffer_cache); 213 extent_buffer_cache = NULL; 214 215 free_state_cache: 216 kmem_cache_destroy(extent_state_cache); 217 extent_state_cache = NULL; 218 return -ENOMEM; 219 } 220 221 void __cold extent_io_exit(void) 222 { 223 btrfs_leak_debug_check(); 224 225 /* 226 * Make sure all delayed rcu free are flushed before we 227 * destroy caches. 
228 */ 229 rcu_barrier(); 230 kmem_cache_destroy(extent_state_cache); 231 kmem_cache_destroy(extent_buffer_cache); 232 bioset_exit(&btrfs_bioset); 233 } 234 235 void extent_io_tree_init(struct btrfs_fs_info *fs_info, 236 struct extent_io_tree *tree, void *private_data) 237 { 238 tree->fs_info = fs_info; 239 tree->state = RB_ROOT; 240 tree->ops = NULL; 241 tree->dirty_bytes = 0; 242 spin_lock_init(&tree->lock); 243 tree->private_data = private_data; 244 } 245 246 static struct extent_state *alloc_extent_state(gfp_t mask) 247 { 248 struct extent_state *state; 249 250 /* 251 * The given mask might be not appropriate for the slab allocator, 252 * drop the unsupported bits 253 */ 254 mask &= ~(__GFP_DMA32|__GFP_HIGHMEM); 255 state = kmem_cache_alloc(extent_state_cache, mask); 256 if (!state) 257 return state; 258 state->state = 0; 259 state->failrec = NULL; 260 RB_CLEAR_NODE(&state->rb_node); 261 btrfs_leak_debug_add(&state->leak_list, &states); 262 refcount_set(&state->refs, 1); 263 init_waitqueue_head(&state->wq); 264 trace_alloc_extent_state(state, mask, _RET_IP_); 265 return state; 266 } 267 268 void free_extent_state(struct extent_state *state) 269 { 270 if (!state) 271 return; 272 if (refcount_dec_and_test(&state->refs)) { 273 WARN_ON(extent_state_in_tree(state)); 274 btrfs_leak_debug_del(&state->leak_list); 275 trace_free_extent_state(state, _RET_IP_); 276 kmem_cache_free(extent_state_cache, state); 277 } 278 } 279 280 static struct rb_node *tree_insert(struct rb_root *root, 281 struct rb_node *search_start, 282 u64 offset, 283 struct rb_node *node, 284 struct rb_node ***p_in, 285 struct rb_node **parent_in) 286 { 287 struct rb_node **p; 288 struct rb_node *parent = NULL; 289 struct tree_entry *entry; 290 291 if (p_in && parent_in) { 292 p = *p_in; 293 parent = *parent_in; 294 goto do_insert; 295 } 296 297 p = search_start ? 
&search_start : &root->rb_node; 298 while (*p) { 299 parent = *p; 300 entry = rb_entry(parent, struct tree_entry, rb_node); 301 302 if (offset < entry->start) 303 p = &(*p)->rb_left; 304 else if (offset > entry->end) 305 p = &(*p)->rb_right; 306 else 307 return parent; 308 } 309 310 do_insert: 311 rb_link_node(node, parent, p); 312 rb_insert_color(node, root); 313 return NULL; 314 } 315 316 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, 317 struct rb_node **next_ret, 318 struct rb_node **prev_ret, 319 struct rb_node ***p_ret, 320 struct rb_node **parent_ret) 321 { 322 struct rb_root *root = &tree->state; 323 struct rb_node **n = &root->rb_node; 324 struct rb_node *prev = NULL; 325 struct rb_node *orig_prev = NULL; 326 struct tree_entry *entry; 327 struct tree_entry *prev_entry = NULL; 328 329 while (*n) { 330 prev = *n; 331 entry = rb_entry(prev, struct tree_entry, rb_node); 332 prev_entry = entry; 333 334 if (offset < entry->start) 335 n = &(*n)->rb_left; 336 else if (offset > entry->end) 337 n = &(*n)->rb_right; 338 else 339 return *n; 340 } 341 342 if (p_ret) 343 *p_ret = n; 344 if (parent_ret) 345 *parent_ret = prev; 346 347 if (next_ret) { 348 orig_prev = prev; 349 while (prev && offset > prev_entry->end) { 350 prev = rb_next(prev); 351 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 352 } 353 *next_ret = prev; 354 prev = orig_prev; 355 } 356 357 if (prev_ret) { 358 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 359 while (prev && offset < prev_entry->start) { 360 prev = rb_prev(prev); 361 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 362 } 363 *prev_ret = prev; 364 } 365 return NULL; 366 } 367 368 static inline struct rb_node * 369 tree_search_for_insert(struct extent_io_tree *tree, 370 u64 offset, 371 struct rb_node ***p_ret, 372 struct rb_node **parent_ret) 373 { 374 struct rb_node *next= NULL; 375 struct rb_node *ret; 376 377 ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret); 378 if (!ret) 379 return next; 380 return ret; 381 } 382 383 static inline struct rb_node *tree_search(struct extent_io_tree *tree, 384 u64 offset) 385 { 386 return tree_search_for_insert(tree, offset, NULL, NULL); 387 } 388 389 /* 390 * utility function to look for merge candidates inside a given range. 391 * Any extents with matching state are merged together into a single 392 * extent in the tree. Extents with EXTENT_IO in their state field 393 * are not merged because the end_io handlers need to be able to do 394 * operations on them without sleeping (or doing allocations/splits). 395 * 396 * This should be called with the tree lock held. 
397 */ 398 static void merge_state(struct extent_io_tree *tree, 399 struct extent_state *state) 400 { 401 struct extent_state *other; 402 struct rb_node *other_node; 403 404 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 405 return; 406 407 other_node = rb_prev(&state->rb_node); 408 if (other_node) { 409 other = rb_entry(other_node, struct extent_state, rb_node); 410 if (other->end == state->start - 1 && 411 other->state == state->state) { 412 if (tree->private_data && 413 is_data_inode(tree->private_data)) 414 btrfs_merge_delalloc_extent(tree->private_data, 415 state, other); 416 state->start = other->start; 417 rb_erase(&other->rb_node, &tree->state); 418 RB_CLEAR_NODE(&other->rb_node); 419 free_extent_state(other); 420 } 421 } 422 other_node = rb_next(&state->rb_node); 423 if (other_node) { 424 other = rb_entry(other_node, struct extent_state, rb_node); 425 if (other->start == state->end + 1 && 426 other->state == state->state) { 427 if (tree->private_data && 428 is_data_inode(tree->private_data)) 429 btrfs_merge_delalloc_extent(tree->private_data, 430 state, other); 431 state->end = other->end; 432 rb_erase(&other->rb_node, &tree->state); 433 RB_CLEAR_NODE(&other->rb_node); 434 free_extent_state(other); 435 } 436 } 437 } 438 439 static void set_state_bits(struct extent_io_tree *tree, 440 struct extent_state *state, unsigned *bits, 441 struct extent_changeset *changeset); 442 443 /* 444 * insert an extent_state struct into the tree. 'bits' are set on the 445 * struct before it is inserted. 446 * 447 * This may return -EEXIST if the extent is already there, in which case the 448 * state struct is freed. 449 * 450 * The tree lock is not taken internally. This is a utility function and 451 * probably isn't what you want to call (see set/clear_extent_bit). 452 */ 453 static int insert_state(struct extent_io_tree *tree, 454 struct extent_state *state, u64 start, u64 end, 455 struct rb_node ***p, 456 struct rb_node **parent, 457 unsigned *bits, struct extent_changeset *changeset) 458 { 459 struct rb_node *node; 460 461 if (end < start) 462 WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n", 463 end, start); 464 state->start = start; 465 state->end = end; 466 467 set_state_bits(tree, state, bits, changeset); 468 469 node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent); 470 if (node) { 471 struct extent_state *found; 472 found = rb_entry(node, struct extent_state, rb_node); 473 pr_err("BTRFS: found node %llu %llu on insert of %llu %llu\n", 474 found->start, found->end, start, end); 475 return -EEXIST; 476 } 477 merge_state(tree, state); 478 return 0; 479 } 480 481 /* 482 * split a given extent state struct in two, inserting the preallocated 483 * struct 'prealloc' as the newly created second half. 'split' indicates an 484 * offset inside 'orig' where it should be split. 485 * 486 * Before calling, 487 * the tree has 'orig' at [orig->start, orig->end]. After calling, there 488 * are two extent state structs in the tree: 489 * prealloc: [orig->start, split - 1] 490 * orig: [ split, orig->end ] 491 * 492 * The tree locks are not taken by this function. They need to be held 493 * by the caller. 
494 */ 495 static int split_state(struct extent_io_tree *tree, struct extent_state *orig, 496 struct extent_state *prealloc, u64 split) 497 { 498 struct rb_node *node; 499 500 if (tree->private_data && is_data_inode(tree->private_data)) 501 btrfs_split_delalloc_extent(tree->private_data, orig, split); 502 503 prealloc->start = orig->start; 504 prealloc->end = split - 1; 505 prealloc->state = orig->state; 506 orig->start = split; 507 508 node = tree_insert(&tree->state, &orig->rb_node, prealloc->end, 509 &prealloc->rb_node, NULL, NULL); 510 if (node) { 511 free_extent_state(prealloc); 512 return -EEXIST; 513 } 514 return 0; 515 } 516 517 static struct extent_state *next_state(struct extent_state *state) 518 { 519 struct rb_node *next = rb_next(&state->rb_node); 520 if (next) 521 return rb_entry(next, struct extent_state, rb_node); 522 else 523 return NULL; 524 } 525 526 /* 527 * utility function to clear some bits in an extent state struct. 528 * it will optionally wake up anyone waiting on this state (wake == 1). 529 * 530 * If no bits are set on the state struct after clearing things, the 531 * struct is freed and removed from the tree 532 */ 533 static struct extent_state *clear_state_bit(struct extent_io_tree *tree, 534 struct extent_state *state, 535 unsigned *bits, int wake, 536 struct extent_changeset *changeset) 537 { 538 struct extent_state *next; 539 unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS; 540 int ret; 541 542 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 543 u64 range = state->end - state->start + 1; 544 WARN_ON(range > tree->dirty_bytes); 545 tree->dirty_bytes -= range; 546 } 547 548 if (tree->private_data && is_data_inode(tree->private_data)) 549 btrfs_clear_delalloc_extent(tree->private_data, state, bits); 550 551 ret = add_extent_changeset(state, bits_to_clear, changeset, 0); 552 BUG_ON(ret < 0); 553 state->state &= ~bits_to_clear; 554 if (wake) 555 wake_up(&state->wq); 556 if (state->state == 0) { 557 next = next_state(state); 558 if (extent_state_in_tree(state)) { 559 rb_erase(&state->rb_node, &tree->state); 560 RB_CLEAR_NODE(&state->rb_node); 561 free_extent_state(state); 562 } else { 563 WARN_ON(1); 564 } 565 } else { 566 merge_state(tree, state); 567 next = next_state(state); 568 } 569 return next; 570 } 571 572 static struct extent_state * 573 alloc_extent_state_atomic(struct extent_state *prealloc) 574 { 575 if (!prealloc) 576 prealloc = alloc_extent_state(GFP_ATOMIC); 577 578 return prealloc; 579 } 580 581 static void extent_io_tree_panic(struct extent_io_tree *tree, int err) 582 { 583 struct inode *inode = tree->private_data; 584 585 btrfs_panic(btrfs_sb(inode->i_sb), err, 586 "locking error: extent tree was modified by another thread while locked"); 587 } 588 589 /* 590 * clear some bits on a range in the tree. This may require splitting 591 * or inserting elements in the tree, so the gfp mask is used to 592 * indicate which allocations or sleeping are allowed. 593 * 594 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove 595 * the given range from the tree regardless of state (ie for truncate). 596 * 597 * the range [start, end] is inclusive. 598 * 599 * This takes the tree lock, and returns 0 on success and < 0 on error. 
600 */ 601 int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 602 unsigned bits, int wake, int delete, 603 struct extent_state **cached_state, 604 gfp_t mask, struct extent_changeset *changeset) 605 { 606 struct extent_state *state; 607 struct extent_state *cached; 608 struct extent_state *prealloc = NULL; 609 struct rb_node *node; 610 u64 last_end; 611 int err; 612 int clear = 0; 613 614 btrfs_debug_check_extent_io_range(tree, start, end); 615 616 if (bits & EXTENT_DELALLOC) 617 bits |= EXTENT_NORESERVE; 618 619 if (delete) 620 bits |= ~EXTENT_CTLBITS; 621 622 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 623 clear = 1; 624 again: 625 if (!prealloc && gfpflags_allow_blocking(mask)) { 626 /* 627 * Don't care for allocation failure here because we might end 628 * up not needing the pre-allocated extent state at all, which 629 * is the case if we only have in the tree extent states that 630 * cover our input range and don't cover too any other range. 631 * If we end up needing a new extent state we allocate it later. 632 */ 633 prealloc = alloc_extent_state(mask); 634 } 635 636 spin_lock(&tree->lock); 637 if (cached_state) { 638 cached = *cached_state; 639 640 if (clear) { 641 *cached_state = NULL; 642 cached_state = NULL; 643 } 644 645 if (cached && extent_state_in_tree(cached) && 646 cached->start <= start && cached->end > start) { 647 if (clear) 648 refcount_dec(&cached->refs); 649 state = cached; 650 goto hit_next; 651 } 652 if (clear) 653 free_extent_state(cached); 654 } 655 /* 656 * this search will find the extents that end after 657 * our range starts 658 */ 659 node = tree_search(tree, start); 660 if (!node) 661 goto out; 662 state = rb_entry(node, struct extent_state, rb_node); 663 hit_next: 664 if (state->start > end) 665 goto out; 666 WARN_ON(state->end < start); 667 last_end = state->end; 668 669 /* the state doesn't have the wanted bits, go ahead */ 670 if (!(state->state & bits)) { 671 state = next_state(state); 672 goto next; 673 } 674 675 /* 676 * | ---- desired range ---- | 677 * | state | or 678 * | ------------- state -------------- | 679 * 680 * We need to split the extent we found, and may flip 681 * bits on second half. 682 * 683 * If the extent we found extends past our range, we 684 * just split and search again. It'll get split again 685 * the next time though. 686 * 687 * If the extent we found is inside our range, we clear 688 * the desired bit on it. 
689 */ 690 691 if (state->start < start) { 692 prealloc = alloc_extent_state_atomic(prealloc); 693 BUG_ON(!prealloc); 694 err = split_state(tree, state, prealloc, start); 695 if (err) 696 extent_io_tree_panic(tree, err); 697 698 prealloc = NULL; 699 if (err) 700 goto out; 701 if (state->end <= end) { 702 state = clear_state_bit(tree, state, &bits, wake, 703 changeset); 704 goto next; 705 } 706 goto search_again; 707 } 708 /* 709 * | ---- desired range ---- | 710 * | state | 711 * We need to split the extent, and clear the bit 712 * on the first half 713 */ 714 if (state->start <= end && state->end > end) { 715 prealloc = alloc_extent_state_atomic(prealloc); 716 BUG_ON(!prealloc); 717 err = split_state(tree, state, prealloc, end + 1); 718 if (err) 719 extent_io_tree_panic(tree, err); 720 721 if (wake) 722 wake_up(&state->wq); 723 724 clear_state_bit(tree, prealloc, &bits, wake, changeset); 725 726 prealloc = NULL; 727 goto out; 728 } 729 730 state = clear_state_bit(tree, state, &bits, wake, changeset); 731 next: 732 if (last_end == (u64)-1) 733 goto out; 734 start = last_end + 1; 735 if (start <= end && state && !need_resched()) 736 goto hit_next; 737 738 search_again: 739 if (start > end) 740 goto out; 741 spin_unlock(&tree->lock); 742 if (gfpflags_allow_blocking(mask)) 743 cond_resched(); 744 goto again; 745 746 out: 747 spin_unlock(&tree->lock); 748 if (prealloc) 749 free_extent_state(prealloc); 750 751 return 0; 752 753 } 754 755 static void wait_on_state(struct extent_io_tree *tree, 756 struct extent_state *state) 757 __releases(tree->lock) 758 __acquires(tree->lock) 759 { 760 DEFINE_WAIT(wait); 761 prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); 762 spin_unlock(&tree->lock); 763 schedule(); 764 spin_lock(&tree->lock); 765 finish_wait(&state->wq, &wait); 766 } 767 768 /* 769 * waits for one or more bits to clear on a range in the state tree. 770 * The range [start, end] is inclusive. 
771 * The tree lock is taken by this function 772 */ 773 static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 774 unsigned long bits) 775 { 776 struct extent_state *state; 777 struct rb_node *node; 778 779 btrfs_debug_check_extent_io_range(tree, start, end); 780 781 spin_lock(&tree->lock); 782 again: 783 while (1) { 784 /* 785 * this search will find all the extents that end after 786 * our range starts 787 */ 788 node = tree_search(tree, start); 789 process_node: 790 if (!node) 791 break; 792 793 state = rb_entry(node, struct extent_state, rb_node); 794 795 if (state->start > end) 796 goto out; 797 798 if (state->state & bits) { 799 start = state->start; 800 refcount_inc(&state->refs); 801 wait_on_state(tree, state); 802 free_extent_state(state); 803 goto again; 804 } 805 start = state->end + 1; 806 807 if (start > end) 808 break; 809 810 if (!cond_resched_lock(&tree->lock)) { 811 node = rb_next(node); 812 goto process_node; 813 } 814 } 815 out: 816 spin_unlock(&tree->lock); 817 } 818 819 static void set_state_bits(struct extent_io_tree *tree, 820 struct extent_state *state, 821 unsigned *bits, struct extent_changeset *changeset) 822 { 823 unsigned bits_to_set = *bits & ~EXTENT_CTLBITS; 824 int ret; 825 826 if (tree->private_data && is_data_inode(tree->private_data)) 827 btrfs_set_delalloc_extent(tree->private_data, state, bits); 828 829 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { 830 u64 range = state->end - state->start + 1; 831 tree->dirty_bytes += range; 832 } 833 ret = add_extent_changeset(state, bits_to_set, changeset, 1); 834 BUG_ON(ret < 0); 835 state->state |= bits_to_set; 836 } 837 838 static void cache_state_if_flags(struct extent_state *state, 839 struct extent_state **cached_ptr, 840 unsigned flags) 841 { 842 if (cached_ptr && !(*cached_ptr)) { 843 if (!flags || (state->state & flags)) { 844 *cached_ptr = state; 845 refcount_inc(&state->refs); 846 } 847 } 848 } 849 850 static void cache_state(struct extent_state *state, 851 struct extent_state **cached_ptr) 852 { 853 return cache_state_if_flags(state, cached_ptr, 854 EXTENT_IOBITS | EXTENT_BOUNDARY); 855 } 856 857 /* 858 * set some bits on a range in the tree. This may require allocations or 859 * sleeping, so the gfp mask is used to indicate what is allowed. 860 * 861 * If any of the exclusive bits are set, this will fail with -EEXIST if some 862 * part of the range already has the desired bits set. The start of the 863 * existing range is returned in failed_start in this case. 864 * 865 * [start, end] is inclusive This takes the tree lock. 866 */ 867 868 static int __must_check 869 __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 870 unsigned bits, unsigned exclusive_bits, 871 u64 *failed_start, struct extent_state **cached_state, 872 gfp_t mask, struct extent_changeset *changeset) 873 { 874 struct extent_state *state; 875 struct extent_state *prealloc = NULL; 876 struct rb_node *node; 877 struct rb_node **p; 878 struct rb_node *parent; 879 int err = 0; 880 u64 last_start; 881 u64 last_end; 882 883 btrfs_debug_check_extent_io_range(tree, start, end); 884 885 again: 886 if (!prealloc && gfpflags_allow_blocking(mask)) { 887 /* 888 * Don't care for allocation failure here because we might end 889 * up not needing the pre-allocated extent state at all, which 890 * is the case if we only have in the tree extent states that 891 * cover our input range and don't cover too any other range. 892 * If we end up needing a new extent state we allocate it later. 
893 */ 894 prealloc = alloc_extent_state(mask); 895 } 896 897 spin_lock(&tree->lock); 898 if (cached_state && *cached_state) { 899 state = *cached_state; 900 if (state->start <= start && state->end > start && 901 extent_state_in_tree(state)) { 902 node = &state->rb_node; 903 goto hit_next; 904 } 905 } 906 /* 907 * this search will find all the extents that end after 908 * our range starts. 909 */ 910 node = tree_search_for_insert(tree, start, &p, &parent); 911 if (!node) { 912 prealloc = alloc_extent_state_atomic(prealloc); 913 BUG_ON(!prealloc); 914 err = insert_state(tree, prealloc, start, end, 915 &p, &parent, &bits, changeset); 916 if (err) 917 extent_io_tree_panic(tree, err); 918 919 cache_state(prealloc, cached_state); 920 prealloc = NULL; 921 goto out; 922 } 923 state = rb_entry(node, struct extent_state, rb_node); 924 hit_next: 925 last_start = state->start; 926 last_end = state->end; 927 928 /* 929 * | ---- desired range ---- | 930 * | state | 931 * 932 * Just lock what we found and keep going 933 */ 934 if (state->start == start && state->end <= end) { 935 if (state->state & exclusive_bits) { 936 *failed_start = state->start; 937 err = -EEXIST; 938 goto out; 939 } 940 941 set_state_bits(tree, state, &bits, changeset); 942 cache_state(state, cached_state); 943 merge_state(tree, state); 944 if (last_end == (u64)-1) 945 goto out; 946 start = last_end + 1; 947 state = next_state(state); 948 if (start < end && state && state->start == start && 949 !need_resched()) 950 goto hit_next; 951 goto search_again; 952 } 953 954 /* 955 * | ---- desired range ---- | 956 * | state | 957 * or 958 * | ------------- state -------------- | 959 * 960 * We need to split the extent we found, and may flip bits on 961 * second half. 962 * 963 * If the extent we found extends past our 964 * range, we just split and search again. It'll get split 965 * again the next time though. 966 * 967 * If the extent we found is inside our range, we set the 968 * desired bit on it. 969 */ 970 if (state->start < start) { 971 if (state->state & exclusive_bits) { 972 *failed_start = start; 973 err = -EEXIST; 974 goto out; 975 } 976 977 prealloc = alloc_extent_state_atomic(prealloc); 978 BUG_ON(!prealloc); 979 err = split_state(tree, state, prealloc, start); 980 if (err) 981 extent_io_tree_panic(tree, err); 982 983 prealloc = NULL; 984 if (err) 985 goto out; 986 if (state->end <= end) { 987 set_state_bits(tree, state, &bits, changeset); 988 cache_state(state, cached_state); 989 merge_state(tree, state); 990 if (last_end == (u64)-1) 991 goto out; 992 start = last_end + 1; 993 state = next_state(state); 994 if (start < end && state && state->start == start && 995 !need_resched()) 996 goto hit_next; 997 } 998 goto search_again; 999 } 1000 /* 1001 * | ---- desired range ---- | 1002 * | state | or | state | 1003 * 1004 * There's a hole, we need to insert something in it and 1005 * ignore the extent we found. 1006 */ 1007 if (state->start > start) { 1008 u64 this_end; 1009 if (end < last_start) 1010 this_end = end; 1011 else 1012 this_end = last_start - 1; 1013 1014 prealloc = alloc_extent_state_atomic(prealloc); 1015 BUG_ON(!prealloc); 1016 1017 /* 1018 * Avoid to free 'prealloc' if it can be merged with 1019 * the later extent. 
1020 */ 1021 err = insert_state(tree, prealloc, start, this_end, 1022 NULL, NULL, &bits, changeset); 1023 if (err) 1024 extent_io_tree_panic(tree, err); 1025 1026 cache_state(prealloc, cached_state); 1027 prealloc = NULL; 1028 start = this_end + 1; 1029 goto search_again; 1030 } 1031 /* 1032 * | ---- desired range ---- | 1033 * | state | 1034 * We need to split the extent, and set the bit 1035 * on the first half 1036 */ 1037 if (state->start <= end && state->end > end) { 1038 if (state->state & exclusive_bits) { 1039 *failed_start = start; 1040 err = -EEXIST; 1041 goto out; 1042 } 1043 1044 prealloc = alloc_extent_state_atomic(prealloc); 1045 BUG_ON(!prealloc); 1046 err = split_state(tree, state, prealloc, end + 1); 1047 if (err) 1048 extent_io_tree_panic(tree, err); 1049 1050 set_state_bits(tree, prealloc, &bits, changeset); 1051 cache_state(prealloc, cached_state); 1052 merge_state(tree, prealloc); 1053 prealloc = NULL; 1054 goto out; 1055 } 1056 1057 search_again: 1058 if (start > end) 1059 goto out; 1060 spin_unlock(&tree->lock); 1061 if (gfpflags_allow_blocking(mask)) 1062 cond_resched(); 1063 goto again; 1064 1065 out: 1066 spin_unlock(&tree->lock); 1067 if (prealloc) 1068 free_extent_state(prealloc); 1069 1070 return err; 1071 1072 } 1073 1074 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 1075 unsigned bits, u64 * failed_start, 1076 struct extent_state **cached_state, gfp_t mask) 1077 { 1078 return __set_extent_bit(tree, start, end, bits, 0, failed_start, 1079 cached_state, mask, NULL); 1080 } 1081 1082 1083 /** 1084 * convert_extent_bit - convert all bits in a given range from one bit to 1085 * another 1086 * @tree: the io tree to search 1087 * @start: the start offset in bytes 1088 * @end: the end offset in bytes (inclusive) 1089 * @bits: the bits to set in this range 1090 * @clear_bits: the bits to clear in this range 1091 * @cached_state: state that we're going to cache 1092 * 1093 * This will go through and set bits for the given range. If any states exist 1094 * already in this range they are set with the given bit and cleared of the 1095 * clear_bits. This is only meant to be used by things that are mergeable, ie 1096 * converting from say DELALLOC to DIRTY. This is not meant to be used with 1097 * boundary bits like LOCK. 1098 * 1099 * All allocations are done with GFP_NOFS. 1100 */ 1101 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 1102 unsigned bits, unsigned clear_bits, 1103 struct extent_state **cached_state) 1104 { 1105 struct extent_state *state; 1106 struct extent_state *prealloc = NULL; 1107 struct rb_node *node; 1108 struct rb_node **p; 1109 struct rb_node *parent; 1110 int err = 0; 1111 u64 last_start; 1112 u64 last_end; 1113 bool first_iteration = true; 1114 1115 btrfs_debug_check_extent_io_range(tree, start, end); 1116 1117 again: 1118 if (!prealloc) { 1119 /* 1120 * Best effort, don't worry if extent state allocation fails 1121 * here for the first iteration. We might have a cached state 1122 * that matches exactly the target range, in which case no 1123 * extent state allocations are needed. We'll only know this 1124 * after locking the tree. 
1125 */ 1126 prealloc = alloc_extent_state(GFP_NOFS); 1127 if (!prealloc && !first_iteration) 1128 return -ENOMEM; 1129 } 1130 1131 spin_lock(&tree->lock); 1132 if (cached_state && *cached_state) { 1133 state = *cached_state; 1134 if (state->start <= start && state->end > start && 1135 extent_state_in_tree(state)) { 1136 node = &state->rb_node; 1137 goto hit_next; 1138 } 1139 } 1140 1141 /* 1142 * this search will find all the extents that end after 1143 * our range starts. 1144 */ 1145 node = tree_search_for_insert(tree, start, &p, &parent); 1146 if (!node) { 1147 prealloc = alloc_extent_state_atomic(prealloc); 1148 if (!prealloc) { 1149 err = -ENOMEM; 1150 goto out; 1151 } 1152 err = insert_state(tree, prealloc, start, end, 1153 &p, &parent, &bits, NULL); 1154 if (err) 1155 extent_io_tree_panic(tree, err); 1156 cache_state(prealloc, cached_state); 1157 prealloc = NULL; 1158 goto out; 1159 } 1160 state = rb_entry(node, struct extent_state, rb_node); 1161 hit_next: 1162 last_start = state->start; 1163 last_end = state->end; 1164 1165 /* 1166 * | ---- desired range ---- | 1167 * | state | 1168 * 1169 * Just lock what we found and keep going 1170 */ 1171 if (state->start == start && state->end <= end) { 1172 set_state_bits(tree, state, &bits, NULL); 1173 cache_state(state, cached_state); 1174 state = clear_state_bit(tree, state, &clear_bits, 0, NULL); 1175 if (last_end == (u64)-1) 1176 goto out; 1177 start = last_end + 1; 1178 if (start < end && state && state->start == start && 1179 !need_resched()) 1180 goto hit_next; 1181 goto search_again; 1182 } 1183 1184 /* 1185 * | ---- desired range ---- | 1186 * | state | 1187 * or 1188 * | ------------- state -------------- | 1189 * 1190 * We need to split the extent we found, and may flip bits on 1191 * second half. 1192 * 1193 * If the extent we found extends past our 1194 * range, we just split and search again. It'll get split 1195 * again the next time though. 1196 * 1197 * If the extent we found is inside our range, we set the 1198 * desired bit on it. 1199 */ 1200 if (state->start < start) { 1201 prealloc = alloc_extent_state_atomic(prealloc); 1202 if (!prealloc) { 1203 err = -ENOMEM; 1204 goto out; 1205 } 1206 err = split_state(tree, state, prealloc, start); 1207 if (err) 1208 extent_io_tree_panic(tree, err); 1209 prealloc = NULL; 1210 if (err) 1211 goto out; 1212 if (state->end <= end) { 1213 set_state_bits(tree, state, &bits, NULL); 1214 cache_state(state, cached_state); 1215 state = clear_state_bit(tree, state, &clear_bits, 0, 1216 NULL); 1217 if (last_end == (u64)-1) 1218 goto out; 1219 start = last_end + 1; 1220 if (start < end && state && state->start == start && 1221 !need_resched()) 1222 goto hit_next; 1223 } 1224 goto search_again; 1225 } 1226 /* 1227 * | ---- desired range ---- | 1228 * | state | or | state | 1229 * 1230 * There's a hole, we need to insert something in it and 1231 * ignore the extent we found. 1232 */ 1233 if (state->start > start) { 1234 u64 this_end; 1235 if (end < last_start) 1236 this_end = end; 1237 else 1238 this_end = last_start - 1; 1239 1240 prealloc = alloc_extent_state_atomic(prealloc); 1241 if (!prealloc) { 1242 err = -ENOMEM; 1243 goto out; 1244 } 1245 1246 /* 1247 * Avoid to free 'prealloc' if it can be merged with 1248 * the later extent. 
1249 */ 1250 err = insert_state(tree, prealloc, start, this_end, 1251 NULL, NULL, &bits, NULL); 1252 if (err) 1253 extent_io_tree_panic(tree, err); 1254 cache_state(prealloc, cached_state); 1255 prealloc = NULL; 1256 start = this_end + 1; 1257 goto search_again; 1258 } 1259 /* 1260 * | ---- desired range ---- | 1261 * | state | 1262 * We need to split the extent, and set the bit 1263 * on the first half 1264 */ 1265 if (state->start <= end && state->end > end) { 1266 prealloc = alloc_extent_state_atomic(prealloc); 1267 if (!prealloc) { 1268 err = -ENOMEM; 1269 goto out; 1270 } 1271 1272 err = split_state(tree, state, prealloc, end + 1); 1273 if (err) 1274 extent_io_tree_panic(tree, err); 1275 1276 set_state_bits(tree, prealloc, &bits, NULL); 1277 cache_state(prealloc, cached_state); 1278 clear_state_bit(tree, prealloc, &clear_bits, 0, NULL); 1279 prealloc = NULL; 1280 goto out; 1281 } 1282 1283 search_again: 1284 if (start > end) 1285 goto out; 1286 spin_unlock(&tree->lock); 1287 cond_resched(); 1288 first_iteration = false; 1289 goto again; 1290 1291 out: 1292 spin_unlock(&tree->lock); 1293 if (prealloc) 1294 free_extent_state(prealloc); 1295 1296 return err; 1297 } 1298 1299 /* wrappers around set/clear extent bit */ 1300 int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1301 unsigned bits, struct extent_changeset *changeset) 1302 { 1303 /* 1304 * We don't support EXTENT_LOCKED yet, as current changeset will 1305 * record any bits changed, so for EXTENT_LOCKED case, it will 1306 * either fail with -EEXIST or changeset will record the whole 1307 * range. 1308 */ 1309 BUG_ON(bits & EXTENT_LOCKED); 1310 1311 return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS, 1312 changeset); 1313 } 1314 1315 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 1316 unsigned bits, int wake, int delete, 1317 struct extent_state **cached) 1318 { 1319 return __clear_extent_bit(tree, start, end, bits, wake, delete, 1320 cached, GFP_NOFS, NULL); 1321 } 1322 1323 int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1324 unsigned bits, struct extent_changeset *changeset) 1325 { 1326 /* 1327 * Don't support EXTENT_LOCKED case, same reason as 1328 * set_record_extent_bits(). 1329 */ 1330 BUG_ON(bits & EXTENT_LOCKED); 1331 1332 return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS, 1333 changeset); 1334 } 1335 1336 /* 1337 * either insert or lock state struct between start and end use mask to tell 1338 * us if waiting is desired. 
1339 */ 1340 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1341 struct extent_state **cached_state) 1342 { 1343 int err; 1344 u64 failed_start; 1345 1346 while (1) { 1347 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, 1348 EXTENT_LOCKED, &failed_start, 1349 cached_state, GFP_NOFS, NULL); 1350 if (err == -EEXIST) { 1351 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); 1352 start = failed_start; 1353 } else 1354 break; 1355 WARN_ON(start > end); 1356 } 1357 return err; 1358 } 1359 1360 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) 1361 { 1362 int err; 1363 u64 failed_start; 1364 1365 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, 1366 &failed_start, NULL, GFP_NOFS, NULL); 1367 if (err == -EEXIST) { 1368 if (failed_start > start) 1369 clear_extent_bit(tree, start, failed_start - 1, 1370 EXTENT_LOCKED, 1, 0, NULL); 1371 return 0; 1372 } 1373 return 1; 1374 } 1375 1376 void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) 1377 { 1378 unsigned long index = start >> PAGE_SHIFT; 1379 unsigned long end_index = end >> PAGE_SHIFT; 1380 struct page *page; 1381 1382 while (index <= end_index) { 1383 page = find_get_page(inode->i_mapping, index); 1384 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1385 clear_page_dirty_for_io(page); 1386 put_page(page); 1387 index++; 1388 } 1389 } 1390 1391 void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) 1392 { 1393 unsigned long index = start >> PAGE_SHIFT; 1394 unsigned long end_index = end >> PAGE_SHIFT; 1395 struct page *page; 1396 1397 while (index <= end_index) { 1398 page = find_get_page(inode->i_mapping, index); 1399 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1400 __set_page_dirty_nobuffers(page); 1401 account_page_redirty(page); 1402 put_page(page); 1403 index++; 1404 } 1405 } 1406 1407 /* find the first state struct with 'bits' set after 'start', and 1408 * return it. tree->lock must be held. NULL will returned if 1409 * nothing was found after 'start' 1410 */ 1411 static struct extent_state * 1412 find_first_extent_bit_state(struct extent_io_tree *tree, 1413 u64 start, unsigned bits) 1414 { 1415 struct rb_node *node; 1416 struct extent_state *state; 1417 1418 /* 1419 * this search will find all the extents that end after 1420 * our range starts. 1421 */ 1422 node = tree_search(tree, start); 1423 if (!node) 1424 goto out; 1425 1426 while (1) { 1427 state = rb_entry(node, struct extent_state, rb_node); 1428 if (state->end >= start && (state->state & bits)) 1429 return state; 1430 1431 node = rb_next(node); 1432 if (!node) 1433 break; 1434 } 1435 out: 1436 return NULL; 1437 } 1438 1439 /* 1440 * find the first offset in the io tree with 'bits' set. zero is 1441 * returned if we find something, and *start_ret and *end_ret are 1442 * set to reflect the state struct that was found. 1443 * 1444 * If nothing was found, 1 is returned. If found something, return 0. 
1445 */ 1446 int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 1447 u64 *start_ret, u64 *end_ret, unsigned bits, 1448 struct extent_state **cached_state) 1449 { 1450 struct extent_state *state; 1451 int ret = 1; 1452 1453 spin_lock(&tree->lock); 1454 if (cached_state && *cached_state) { 1455 state = *cached_state; 1456 if (state->end == start - 1 && extent_state_in_tree(state)) { 1457 while ((state = next_state(state)) != NULL) { 1458 if (state->state & bits) 1459 goto got_it; 1460 } 1461 free_extent_state(*cached_state); 1462 *cached_state = NULL; 1463 goto out; 1464 } 1465 free_extent_state(*cached_state); 1466 *cached_state = NULL; 1467 } 1468 1469 state = find_first_extent_bit_state(tree, start, bits); 1470 got_it: 1471 if (state) { 1472 cache_state_if_flags(state, cached_state, 0); 1473 *start_ret = state->start; 1474 *end_ret = state->end; 1475 ret = 0; 1476 } 1477 out: 1478 spin_unlock(&tree->lock); 1479 return ret; 1480 } 1481 1482 /* 1483 * find a contiguous range of bytes in the file marked as delalloc, not 1484 * more than 'max_bytes'. start and end are used to return the range, 1485 * 1486 * true is returned if we find something, false if nothing was in the tree 1487 */ 1488 static noinline bool find_delalloc_range(struct extent_io_tree *tree, 1489 u64 *start, u64 *end, u64 max_bytes, 1490 struct extent_state **cached_state) 1491 { 1492 struct rb_node *node; 1493 struct extent_state *state; 1494 u64 cur_start = *start; 1495 bool found = false; 1496 u64 total_bytes = 0; 1497 1498 spin_lock(&tree->lock); 1499 1500 /* 1501 * this search will find all the extents that end after 1502 * our range starts. 1503 */ 1504 node = tree_search(tree, cur_start); 1505 if (!node) { 1506 *end = (u64)-1; 1507 goto out; 1508 } 1509 1510 while (1) { 1511 state = rb_entry(node, struct extent_state, rb_node); 1512 if (found && (state->start != cur_start || 1513 (state->state & EXTENT_BOUNDARY))) { 1514 goto out; 1515 } 1516 if (!(state->state & EXTENT_DELALLOC)) { 1517 if (!found) 1518 *end = state->end; 1519 goto out; 1520 } 1521 if (!found) { 1522 *start = state->start; 1523 *cached_state = state; 1524 refcount_inc(&state->refs); 1525 } 1526 found = true; 1527 *end = state->end; 1528 cur_start = state->end + 1; 1529 node = rb_next(node); 1530 total_bytes += state->end - state->start + 1; 1531 if (total_bytes >= max_bytes) 1532 break; 1533 if (!node) 1534 break; 1535 } 1536 out: 1537 spin_unlock(&tree->lock); 1538 return found; 1539 } 1540 1541 static int __process_pages_contig(struct address_space *mapping, 1542 struct page *locked_page, 1543 pgoff_t start_index, pgoff_t end_index, 1544 unsigned long page_ops, pgoff_t *index_ret); 1545 1546 static noinline void __unlock_for_delalloc(struct inode *inode, 1547 struct page *locked_page, 1548 u64 start, u64 end) 1549 { 1550 unsigned long index = start >> PAGE_SHIFT; 1551 unsigned long end_index = end >> PAGE_SHIFT; 1552 1553 ASSERT(locked_page); 1554 if (index == locked_page->index && end_index == index) 1555 return; 1556 1557 __process_pages_contig(inode->i_mapping, locked_page, index, end_index, 1558 PAGE_UNLOCK, NULL); 1559 } 1560 1561 static noinline int lock_delalloc_pages(struct inode *inode, 1562 struct page *locked_page, 1563 u64 delalloc_start, 1564 u64 delalloc_end) 1565 { 1566 unsigned long index = delalloc_start >> PAGE_SHIFT; 1567 unsigned long index_ret = index; 1568 unsigned long end_index = delalloc_end >> PAGE_SHIFT; 1569 int ret; 1570 1571 ASSERT(locked_page); 1572 if (index == locked_page->index && index == end_index) 
1573 return 0; 1574 1575 ret = __process_pages_contig(inode->i_mapping, locked_page, index, 1576 end_index, PAGE_LOCK, &index_ret); 1577 if (ret == -EAGAIN) 1578 __unlock_for_delalloc(inode, locked_page, delalloc_start, 1579 (u64)index_ret << PAGE_SHIFT); 1580 return ret; 1581 } 1582 1583 /* 1584 * Find and lock a contiguous range of bytes in the file marked as delalloc, no 1585 * more than @max_bytes. @Start and @end are used to return the range, 1586 * 1587 * Return: true if we find something 1588 * false if nothing was in the tree 1589 */ 1590 EXPORT_FOR_TESTS 1591 noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, 1592 struct extent_io_tree *tree, 1593 struct page *locked_page, u64 *start, 1594 u64 *end) 1595 { 1596 u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; 1597 u64 delalloc_start; 1598 u64 delalloc_end; 1599 bool found; 1600 struct extent_state *cached_state = NULL; 1601 int ret; 1602 int loops = 0; 1603 1604 again: 1605 /* step one, find a bunch of delalloc bytes starting at start */ 1606 delalloc_start = *start; 1607 delalloc_end = 0; 1608 found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, 1609 max_bytes, &cached_state); 1610 if (!found || delalloc_end <= *start) { 1611 *start = delalloc_start; 1612 *end = delalloc_end; 1613 free_extent_state(cached_state); 1614 return false; 1615 } 1616 1617 /* 1618 * start comes from the offset of locked_page. We have to lock 1619 * pages in order, so we can't process delalloc bytes before 1620 * locked_page 1621 */ 1622 if (delalloc_start < *start) 1623 delalloc_start = *start; 1624 1625 /* 1626 * make sure to limit the number of pages we try to lock down 1627 */ 1628 if (delalloc_end + 1 - delalloc_start > max_bytes) 1629 delalloc_end = delalloc_start + max_bytes - 1; 1630 1631 /* step two, lock all the pages after the page that has start */ 1632 ret = lock_delalloc_pages(inode, locked_page, 1633 delalloc_start, delalloc_end); 1634 ASSERT(!ret || ret == -EAGAIN); 1635 if (ret == -EAGAIN) { 1636 /* some of the pages are gone, lets avoid looping by 1637 * shortening the size of the delalloc range we're searching 1638 */ 1639 free_extent_state(cached_state); 1640 cached_state = NULL; 1641 if (!loops) { 1642 max_bytes = PAGE_SIZE; 1643 loops = 1; 1644 goto again; 1645 } else { 1646 found = false; 1647 goto out_failed; 1648 } 1649 } 1650 1651 /* step three, lock the state bits for the whole range */ 1652 lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state); 1653 1654 /* then test to make sure it is all still delalloc */ 1655 ret = test_range_bit(tree, delalloc_start, delalloc_end, 1656 EXTENT_DELALLOC, 1, cached_state); 1657 if (!ret) { 1658 unlock_extent_cached(tree, delalloc_start, delalloc_end, 1659 &cached_state); 1660 __unlock_for_delalloc(inode, locked_page, 1661 delalloc_start, delalloc_end); 1662 cond_resched(); 1663 goto again; 1664 } 1665 free_extent_state(cached_state); 1666 *start = delalloc_start; 1667 *end = delalloc_end; 1668 out_failed: 1669 return found; 1670 } 1671 1672 static int __process_pages_contig(struct address_space *mapping, 1673 struct page *locked_page, 1674 pgoff_t start_index, pgoff_t end_index, 1675 unsigned long page_ops, pgoff_t *index_ret) 1676 { 1677 unsigned long nr_pages = end_index - start_index + 1; 1678 unsigned long pages_locked = 0; 1679 pgoff_t index = start_index; 1680 struct page *pages[16]; 1681 unsigned ret; 1682 int err = 0; 1683 int i; 1684 1685 if (page_ops & PAGE_LOCK) { 1686 ASSERT(page_ops == PAGE_LOCK); 1687 ASSERT(index_ret && *index_ret == 
start_index); 1688 } 1689 1690 if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0) 1691 mapping_set_error(mapping, -EIO); 1692 1693 while (nr_pages > 0) { 1694 ret = find_get_pages_contig(mapping, index, 1695 min_t(unsigned long, 1696 nr_pages, ARRAY_SIZE(pages)), pages); 1697 if (ret == 0) { 1698 /* 1699 * Only if we're going to lock these pages, 1700 * can we find nothing at @index. 1701 */ 1702 ASSERT(page_ops & PAGE_LOCK); 1703 err = -EAGAIN; 1704 goto out; 1705 } 1706 1707 for (i = 0; i < ret; i++) { 1708 if (page_ops & PAGE_SET_PRIVATE2) 1709 SetPagePrivate2(pages[i]); 1710 1711 if (pages[i] == locked_page) { 1712 put_page(pages[i]); 1713 pages_locked++; 1714 continue; 1715 } 1716 if (page_ops & PAGE_CLEAR_DIRTY) 1717 clear_page_dirty_for_io(pages[i]); 1718 if (page_ops & PAGE_SET_WRITEBACK) 1719 set_page_writeback(pages[i]); 1720 if (page_ops & PAGE_SET_ERROR) 1721 SetPageError(pages[i]); 1722 if (page_ops & PAGE_END_WRITEBACK) 1723 end_page_writeback(pages[i]); 1724 if (page_ops & PAGE_UNLOCK) 1725 unlock_page(pages[i]); 1726 if (page_ops & PAGE_LOCK) { 1727 lock_page(pages[i]); 1728 if (!PageDirty(pages[i]) || 1729 pages[i]->mapping != mapping) { 1730 unlock_page(pages[i]); 1731 put_page(pages[i]); 1732 err = -EAGAIN; 1733 goto out; 1734 } 1735 } 1736 put_page(pages[i]); 1737 pages_locked++; 1738 } 1739 nr_pages -= ret; 1740 index += ret; 1741 cond_resched(); 1742 } 1743 out: 1744 if (err && index_ret) 1745 *index_ret = start_index + pages_locked - 1; 1746 return err; 1747 } 1748 1749 void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, 1750 u64 delalloc_end, struct page *locked_page, 1751 unsigned clear_bits, 1752 unsigned long page_ops) 1753 { 1754 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0, 1755 NULL); 1756 1757 __process_pages_contig(inode->i_mapping, locked_page, 1758 start >> PAGE_SHIFT, end >> PAGE_SHIFT, 1759 page_ops, NULL); 1760 } 1761 1762 /* 1763 * count the number of bytes in the tree that have a given bit(s) 1764 * set. This can be fairly slow, except for EXTENT_DIRTY which is 1765 * cached. The total number found is returned. 1766 */ 1767 u64 count_range_bits(struct extent_io_tree *tree, 1768 u64 *start, u64 search_end, u64 max_bytes, 1769 unsigned bits, int contig) 1770 { 1771 struct rb_node *node; 1772 struct extent_state *state; 1773 u64 cur_start = *start; 1774 u64 total_bytes = 0; 1775 u64 last = 0; 1776 int found = 0; 1777 1778 if (WARN_ON(search_end <= cur_start)) 1779 return 0; 1780 1781 spin_lock(&tree->lock); 1782 if (cur_start == 0 && bits == EXTENT_DIRTY) { 1783 total_bytes = tree->dirty_bytes; 1784 goto out; 1785 } 1786 /* 1787 * this search will find all the extents that end after 1788 * our range starts. 
1789 */ 1790 node = tree_search(tree, cur_start); 1791 if (!node) 1792 goto out; 1793 1794 while (1) { 1795 state = rb_entry(node, struct extent_state, rb_node); 1796 if (state->start > search_end) 1797 break; 1798 if (contig && found && state->start > last + 1) 1799 break; 1800 if (state->end >= cur_start && (state->state & bits) == bits) { 1801 total_bytes += min(search_end, state->end) + 1 - 1802 max(cur_start, state->start); 1803 if (total_bytes >= max_bytes) 1804 break; 1805 if (!found) { 1806 *start = max(cur_start, state->start); 1807 found = 1; 1808 } 1809 last = state->end; 1810 } else if (contig && found) { 1811 break; 1812 } 1813 node = rb_next(node); 1814 if (!node) 1815 break; 1816 } 1817 out: 1818 spin_unlock(&tree->lock); 1819 return total_bytes; 1820 } 1821 1822 /* 1823 * set the private field for a given byte offset in the tree. If there isn't 1824 * an extent_state there already, this does nothing. 1825 */ 1826 static noinline int set_state_failrec(struct extent_io_tree *tree, u64 start, 1827 struct io_failure_record *failrec) 1828 { 1829 struct rb_node *node; 1830 struct extent_state *state; 1831 int ret = 0; 1832 1833 spin_lock(&tree->lock); 1834 /* 1835 * this search will find all the extents that end after 1836 * our range starts. 1837 */ 1838 node = tree_search(tree, start); 1839 if (!node) { 1840 ret = -ENOENT; 1841 goto out; 1842 } 1843 state = rb_entry(node, struct extent_state, rb_node); 1844 if (state->start != start) { 1845 ret = -ENOENT; 1846 goto out; 1847 } 1848 state->failrec = failrec; 1849 out: 1850 spin_unlock(&tree->lock); 1851 return ret; 1852 } 1853 1854 static noinline int get_state_failrec(struct extent_io_tree *tree, u64 start, 1855 struct io_failure_record **failrec) 1856 { 1857 struct rb_node *node; 1858 struct extent_state *state; 1859 int ret = 0; 1860 1861 spin_lock(&tree->lock); 1862 /* 1863 * this search will find all the extents that end after 1864 * our range starts. 1865 */ 1866 node = tree_search(tree, start); 1867 if (!node) { 1868 ret = -ENOENT; 1869 goto out; 1870 } 1871 state = rb_entry(node, struct extent_state, rb_node); 1872 if (state->start != start) { 1873 ret = -ENOENT; 1874 goto out; 1875 } 1876 *failrec = state->failrec; 1877 out: 1878 spin_unlock(&tree->lock); 1879 return ret; 1880 } 1881 1882 /* 1883 * searches a range in the state tree for a given mask. 1884 * If 'filled' == 1, this returns 1 only if every extent in the tree 1885 * has the bits set. Otherwise, 1 is returned if any bit in the 1886 * range is found set. 
1887 */ 1888 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 1889 unsigned bits, int filled, struct extent_state *cached) 1890 { 1891 struct extent_state *state = NULL; 1892 struct rb_node *node; 1893 int bitset = 0; 1894 1895 spin_lock(&tree->lock); 1896 if (cached && extent_state_in_tree(cached) && cached->start <= start && 1897 cached->end > start) 1898 node = &cached->rb_node; 1899 else 1900 node = tree_search(tree, start); 1901 while (node && start <= end) { 1902 state = rb_entry(node, struct extent_state, rb_node); 1903 1904 if (filled && state->start > start) { 1905 bitset = 0; 1906 break; 1907 } 1908 1909 if (state->start > end) 1910 break; 1911 1912 if (state->state & bits) { 1913 bitset = 1; 1914 if (!filled) 1915 break; 1916 } else if (filled) { 1917 bitset = 0; 1918 break; 1919 } 1920 1921 if (state->end == (u64)-1) 1922 break; 1923 1924 start = state->end + 1; 1925 if (start > end) 1926 break; 1927 node = rb_next(node); 1928 if (!node) { 1929 if (filled) 1930 bitset = 0; 1931 break; 1932 } 1933 } 1934 spin_unlock(&tree->lock); 1935 return bitset; 1936 } 1937 1938 /* 1939 * helper function to set a given page up to date if all the 1940 * extents in the tree for that page are up to date 1941 */ 1942 static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) 1943 { 1944 u64 start = page_offset(page); 1945 u64 end = start + PAGE_SIZE - 1; 1946 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) 1947 SetPageUptodate(page); 1948 } 1949 1950 int free_io_failure(struct extent_io_tree *failure_tree, 1951 struct extent_io_tree *io_tree, 1952 struct io_failure_record *rec) 1953 { 1954 int ret; 1955 int err = 0; 1956 1957 set_state_failrec(failure_tree, rec->start, NULL); 1958 ret = clear_extent_bits(failure_tree, rec->start, 1959 rec->start + rec->len - 1, 1960 EXTENT_LOCKED | EXTENT_DIRTY); 1961 if (ret) 1962 err = ret; 1963 1964 ret = clear_extent_bits(io_tree, rec->start, 1965 rec->start + rec->len - 1, 1966 EXTENT_DAMAGED); 1967 if (ret && !err) 1968 err = ret; 1969 1970 kfree(rec); 1971 return err; 1972 } 1973 1974 /* 1975 * this bypasses the standard btrfs submit functions deliberately, as 1976 * the standard behavior is to write all copies in a raid setup. here we only 1977 * want to write the one bad copy. so we do the mapping for ourselves and issue 1978 * submit_bio directly. 1979 * to avoid any synchronization issues, wait for the data after writing, which 1980 * actually prevents the read that triggered the error from finishing. 1981 * currently, there can be no more than two copies of every data bit. thus, 1982 * exactly one rewrite is required. 1983 */ 1984 int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, 1985 u64 length, u64 logical, struct page *page, 1986 unsigned int pg_offset, int mirror_num) 1987 { 1988 struct bio *bio; 1989 struct btrfs_device *dev; 1990 u64 map_length = 0; 1991 u64 sector; 1992 struct btrfs_bio *bbio = NULL; 1993 int ret; 1994 1995 ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); 1996 BUG_ON(!mirror_num); 1997 1998 bio = btrfs_io_bio_alloc(1); 1999 bio->bi_iter.bi_size = 0; 2000 map_length = length; 2001 2002 /* 2003 * Avoid races with device replace and make sure our bbio has devices 2004 * associated to its stripes that don't go away while we are doing the 2005 * read repair operation. 
2006 */ 2007 btrfs_bio_counter_inc_blocked(fs_info); 2008 if (btrfs_is_parity_mirror(fs_info, logical, length)) { 2009 /* 2010 * Note that we don't use BTRFS_MAP_WRITE because it's supposed 2011 * to update all raid stripes, but here we just want to correct 2012 * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad 2013 * stripe's dev and sector. 2014 */ 2015 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, 2016 &map_length, &bbio, 0); 2017 if (ret) { 2018 btrfs_bio_counter_dec(fs_info); 2019 bio_put(bio); 2020 return -EIO; 2021 } 2022 ASSERT(bbio->mirror_num == 1); 2023 } else { 2024 ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, 2025 &map_length, &bbio, mirror_num); 2026 if (ret) { 2027 btrfs_bio_counter_dec(fs_info); 2028 bio_put(bio); 2029 return -EIO; 2030 } 2031 BUG_ON(mirror_num != bbio->mirror_num); 2032 } 2033 2034 sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9; 2035 bio->bi_iter.bi_sector = sector; 2036 dev = bbio->stripes[bbio->mirror_num - 1].dev; 2037 btrfs_put_bbio(bbio); 2038 if (!dev || !dev->bdev || 2039 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { 2040 btrfs_bio_counter_dec(fs_info); 2041 bio_put(bio); 2042 return -EIO; 2043 } 2044 bio_set_dev(bio, dev->bdev); 2045 bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; 2046 bio_add_page(bio, page, length, pg_offset); 2047 2048 if (btrfsic_submit_bio_wait(bio)) { 2049 /* try to remap that extent elsewhere? */ 2050 btrfs_bio_counter_dec(fs_info); 2051 bio_put(bio); 2052 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); 2053 return -EIO; 2054 } 2055 2056 btrfs_info_rl_in_rcu(fs_info, 2057 "read error corrected: ino %llu off %llu (dev %s sector %llu)", 2058 ino, start, 2059 rcu_str_deref(dev->name), sector); 2060 btrfs_bio_counter_dec(fs_info); 2061 bio_put(bio); 2062 return 0; 2063 } 2064 2065 int repair_eb_io_failure(struct btrfs_fs_info *fs_info, 2066 struct extent_buffer *eb, int mirror_num) 2067 { 2068 u64 start = eb->start; 2069 int i, num_pages = num_extent_pages(eb); 2070 int ret = 0; 2071 2072 if (sb_rdonly(fs_info->sb)) 2073 return -EROFS; 2074 2075 for (i = 0; i < num_pages; i++) { 2076 struct page *p = eb->pages[i]; 2077 2078 ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p, 2079 start - page_offset(p), mirror_num); 2080 if (ret) 2081 break; 2082 start += PAGE_SIZE; 2083 } 2084 2085 return ret; 2086 } 2087 2088 /* 2089 * each time an IO finishes, we do a fast check in the IO failure tree 2090 * to see if we need to process or clean up an io_failure_record 2091 */ 2092 int clean_io_failure(struct btrfs_fs_info *fs_info, 2093 struct extent_io_tree *failure_tree, 2094 struct extent_io_tree *io_tree, u64 start, 2095 struct page *page, u64 ino, unsigned int pg_offset) 2096 { 2097 u64 private; 2098 struct io_failure_record *failrec; 2099 struct extent_state *state; 2100 int num_copies; 2101 int ret; 2102 2103 private = 0; 2104 ret = count_range_bits(failure_tree, &private, (u64)-1, 1, 2105 EXTENT_DIRTY, 0); 2106 if (!ret) 2107 return 0; 2108 2109 ret = get_state_failrec(failure_tree, start, &failrec); 2110 if (ret) 2111 return 0; 2112 2113 BUG_ON(!failrec->this_mirror); 2114 2115 if (failrec->in_validation) { 2116 /* there was no real error, just free the record */ 2117 btrfs_debug(fs_info, 2118 "clean_io_failure: freeing dummy error at %llu", 2119 failrec->start); 2120 goto out; 2121 } 2122 if (sb_rdonly(fs_info->sb)) 2123 goto out; 2124 2125 spin_lock(&io_tree->lock); 2126 state = find_first_extent_bit_state(io_tree, 2127 failrec->start, 2128 EXTENT_LOCKED); 
2129 spin_unlock(&io_tree->lock); 2130 2131 if (state && state->start <= failrec->start && 2132 state->end >= failrec->start + failrec->len - 1) { 2133 num_copies = btrfs_num_copies(fs_info, failrec->logical, 2134 failrec->len); 2135 if (num_copies > 1) { 2136 repair_io_failure(fs_info, ino, start, failrec->len, 2137 failrec->logical, page, pg_offset, 2138 failrec->failed_mirror); 2139 } 2140 } 2141 2142 out: 2143 free_io_failure(failure_tree, io_tree, failrec); 2144 2145 return 0; 2146 } 2147 2148 /* 2149 * Can be called when 2150 * - hold extent lock 2151 * - under ordered extent 2152 * - the inode is freeing 2153 */ 2154 void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end) 2155 { 2156 struct extent_io_tree *failure_tree = &inode->io_failure_tree; 2157 struct io_failure_record *failrec; 2158 struct extent_state *state, *next; 2159 2160 if (RB_EMPTY_ROOT(&failure_tree->state)) 2161 return; 2162 2163 spin_lock(&failure_tree->lock); 2164 state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY); 2165 while (state) { 2166 if (state->start > end) 2167 break; 2168 2169 ASSERT(state->end <= end); 2170 2171 next = next_state(state); 2172 2173 failrec = state->failrec; 2174 free_extent_state(state); 2175 kfree(failrec); 2176 2177 state = next; 2178 } 2179 spin_unlock(&failure_tree->lock); 2180 } 2181 2182 int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, 2183 struct io_failure_record **failrec_ret) 2184 { 2185 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2186 struct io_failure_record *failrec; 2187 struct extent_map *em; 2188 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 2189 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 2190 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 2191 int ret; 2192 u64 logical; 2193 2194 ret = get_state_failrec(failure_tree, start, &failrec); 2195 if (ret) { 2196 failrec = kzalloc(sizeof(*failrec), GFP_NOFS); 2197 if (!failrec) 2198 return -ENOMEM; 2199 2200 failrec->start = start; 2201 failrec->len = end - start + 1; 2202 failrec->this_mirror = 0; 2203 failrec->bio_flags = 0; 2204 failrec->in_validation = 0; 2205 2206 read_lock(&em_tree->lock); 2207 em = lookup_extent_mapping(em_tree, start, failrec->len); 2208 if (!em) { 2209 read_unlock(&em_tree->lock); 2210 kfree(failrec); 2211 return -EIO; 2212 } 2213 2214 if (em->start > start || em->start + em->len <= start) { 2215 free_extent_map(em); 2216 em = NULL; 2217 } 2218 read_unlock(&em_tree->lock); 2219 if (!em) { 2220 kfree(failrec); 2221 return -EIO; 2222 } 2223 2224 logical = start - em->start; 2225 logical = em->block_start + logical; 2226 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 2227 logical = em->block_start; 2228 failrec->bio_flags = EXTENT_BIO_COMPRESSED; 2229 extent_set_compress_type(&failrec->bio_flags, 2230 em->compress_type); 2231 } 2232 2233 btrfs_debug(fs_info, 2234 "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu", 2235 logical, start, failrec->len); 2236 2237 failrec->logical = logical; 2238 free_extent_map(em); 2239 2240 /* set the bits in the private failure tree */ 2241 ret = set_extent_bits(failure_tree, start, end, 2242 EXTENT_LOCKED | EXTENT_DIRTY); 2243 if (ret >= 0) 2244 ret = set_state_failrec(failure_tree, start, failrec); 2245 /* set the bits in the inode's tree */ 2246 if (ret >= 0) 2247 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED); 2248 if (ret < 0) { 2249 kfree(failrec); 2250 return ret; 2251 } 2252 } else { 2253 
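/*
 * A failure record for this range already exists, so a read of the
 * same range has failed before; it is reused as-is so that
 * btrfs_check_repairable() keeps stepping this_mirror through the
 * remaining copies instead of starting over from scratch.
 */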
btrfs_debug(fs_info, 2254 "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d", 2255 failrec->logical, failrec->start, failrec->len, 2256 failrec->in_validation); 2257 /* 2258 * when data can be on disk more than twice, add to failrec here 2259 * (e.g. with a list for failed_mirror) to make 2260 * clean_io_failure() clean all those errors at once. 2261 */ 2262 } 2263 2264 *failrec_ret = failrec; 2265 2266 return 0; 2267 } 2268 2269 bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages, 2270 struct io_failure_record *failrec, int failed_mirror) 2271 { 2272 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2273 int num_copies; 2274 2275 num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len); 2276 if (num_copies == 1) { 2277 /* 2278 * we only have a single copy of the data, so don't bother with 2279 * all the retry and error correction code that follows. no 2280 * matter what the error is, it is very likely to persist. 2281 */ 2282 btrfs_debug(fs_info, 2283 "Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d", 2284 num_copies, failrec->this_mirror, failed_mirror); 2285 return false; 2286 } 2287 2288 /* 2289 * there are two premises: 2290 * a) deliver good data to the caller 2291 * b) correct the bad sectors on disk 2292 */ 2293 if (failed_bio_pages > 1) { 2294 /* 2295 * to fulfill b), we need to know the exact failing sectors, as 2296 * we don't want to rewrite any more than the failed ones. thus, 2297 * we need separate read requests for the failed bio 2298 * 2299 * if the following BUG_ON triggers, our validation request got 2300 * merged. we need separate requests for our algorithm to work. 2301 */ 2302 BUG_ON(failrec->in_validation); 2303 failrec->in_validation = 1; 2304 failrec->this_mirror = failed_mirror; 2305 } else { 2306 /* 2307 * we're ready to fulfill a) and b) alongside. get a good copy 2308 * of the failed sector and if we succeed, we have setup 2309 * everything for repair_io_failure to do the rest for us. 
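 *
 * As a rough illustration of the mirror rotation done below (a sketch
 * only, pick_next_mirror() does not exist in the kernel; mirrors are
 * numbered 1..num_copies, the copy that just failed is skipped and 0
 * stands for "no copy left to try"):
 *
 *	static int pick_next_mirror(int this_mirror, int failed_mirror,
 *				    int num_copies)
 *	{
 *		int next = this_mirror + 1;
 *
 *		if (next == failed_mirror)
 *			next++;
 *		return next > num_copies ? 0 : next;
 *	}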
2310 */ 2311 if (failrec->in_validation) { 2312 BUG_ON(failrec->this_mirror != failed_mirror); 2313 failrec->in_validation = 0; 2314 failrec->this_mirror = 0; 2315 } 2316 failrec->failed_mirror = failed_mirror; 2317 failrec->this_mirror++; 2318 if (failrec->this_mirror == failed_mirror) 2319 failrec->this_mirror++; 2320 } 2321 2322 if (failrec->this_mirror > num_copies) { 2323 btrfs_debug(fs_info, 2324 "Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d", 2325 num_copies, failrec->this_mirror, failed_mirror); 2326 return false; 2327 } 2328 2329 return true; 2330 } 2331 2332 2333 struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, 2334 struct io_failure_record *failrec, 2335 struct page *page, int pg_offset, int icsum, 2336 bio_end_io_t *endio_func, void *data) 2337 { 2338 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2339 struct bio *bio; 2340 struct btrfs_io_bio *btrfs_failed_bio; 2341 struct btrfs_io_bio *btrfs_bio; 2342 2343 bio = btrfs_io_bio_alloc(1); 2344 bio->bi_end_io = endio_func; 2345 bio->bi_iter.bi_sector = failrec->logical >> 9; 2346 bio_set_dev(bio, fs_info->fs_devices->latest_bdev); 2347 bio->bi_iter.bi_size = 0; 2348 bio->bi_private = data; 2349 2350 btrfs_failed_bio = btrfs_io_bio(failed_bio); 2351 if (btrfs_failed_bio->csum) { 2352 u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); 2353 2354 btrfs_bio = btrfs_io_bio(bio); 2355 btrfs_bio->csum = btrfs_bio->csum_inline; 2356 icsum *= csum_size; 2357 memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum, 2358 csum_size); 2359 } 2360 2361 bio_add_page(bio, page, failrec->len, pg_offset); 2362 2363 return bio; 2364 } 2365 2366 /* 2367 * This is a generic handler for readpage errors. If other copies exist, read 2368 * those and write back good data to the failed position. 
Does not investigate 2369 * in remapping the failed extent elsewhere, hoping the device will be smart 2370 * enough to do this as needed 2371 */ 2372 static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, 2373 struct page *page, u64 start, u64 end, 2374 int failed_mirror) 2375 { 2376 struct io_failure_record *failrec; 2377 struct inode *inode = page->mapping->host; 2378 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 2379 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 2380 struct bio *bio; 2381 int read_mode = 0; 2382 blk_status_t status; 2383 int ret; 2384 unsigned failed_bio_pages = failed_bio->bi_iter.bi_size >> PAGE_SHIFT; 2385 2386 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); 2387 2388 ret = btrfs_get_io_failure_record(inode, start, end, &failrec); 2389 if (ret) 2390 return ret; 2391 2392 if (!btrfs_check_repairable(inode, failed_bio_pages, failrec, 2393 failed_mirror)) { 2394 free_io_failure(failure_tree, tree, failrec); 2395 return -EIO; 2396 } 2397 2398 if (failed_bio_pages > 1) 2399 read_mode |= REQ_FAILFAST_DEV; 2400 2401 phy_offset >>= inode->i_sb->s_blocksize_bits; 2402 bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, 2403 start - page_offset(page), 2404 (int)phy_offset, failed_bio->bi_end_io, 2405 NULL); 2406 bio->bi_opf = REQ_OP_READ | read_mode; 2407 2408 btrfs_debug(btrfs_sb(inode->i_sb), 2409 "Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d", 2410 read_mode, failrec->this_mirror, failrec->in_validation); 2411 2412 status = tree->ops->submit_bio_hook(tree->private_data, bio, failrec->this_mirror, 2413 failrec->bio_flags, 0); 2414 if (status) { 2415 free_io_failure(failure_tree, tree, failrec); 2416 bio_put(bio); 2417 ret = blk_status_to_errno(status); 2418 } 2419 2420 return ret; 2421 } 2422 2423 /* lots and lots of room for performance fixes in the end_bio funcs */ 2424 2425 void end_extent_writepage(struct page *page, int err, u64 start, u64 end) 2426 { 2427 int uptodate = (err == 0); 2428 int ret = 0; 2429 2430 btrfs_writepage_endio_finish_ordered(page, start, end, uptodate); 2431 2432 if (!uptodate) { 2433 ClearPageUptodate(page); 2434 SetPageError(page); 2435 ret = err < 0 ? err : -EIO; 2436 mapping_set_error(page->mapping, ret); 2437 } 2438 } 2439 2440 /* 2441 * after a writepage IO is done, we need to: 2442 * clear the uptodate bits on error 2443 * clear the writeback bits in the extent tree for this IO 2444 * end_page_writeback if the page has no more pending IO 2445 * 2446 * Scheduling is not allowed, so the extent state tree is expected 2447 * to have one and only one object corresponding to this IO. 2448 */ 2449 static void end_bio_extent_writepage(struct bio *bio) 2450 { 2451 int error = blk_status_to_errno(bio->bi_status); 2452 struct bio_vec *bvec; 2453 u64 start; 2454 u64 end; 2455 int i; 2456 struct bvec_iter_all iter_all; 2457 2458 ASSERT(!bio_flagged(bio, BIO_CLONED)); 2459 bio_for_each_segment_all(bvec, bio, i, iter_all) { 2460 struct page *page = bvec->bv_page; 2461 struct inode *inode = page->mapping->host; 2462 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2463 2464 /* We always issue full-page reads, but if some block 2465 * in a page fails to read, blk_update_request() will 2466 * advance bv_offset and adjust bv_len to compensate. 2467 * Print a warning for nonzero offsets, and an error 2468 * if they don't add up to a full page. 
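 * Whatever range is computed below gets handed to
 * end_extent_writepage(), which on failure clears PageUptodate, sets
 * PageError and records the error on the mapping via
 * mapping_set_error() so a later fsync() or msync() can still see it.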
*/ 2469 if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) { 2470 if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE) 2471 btrfs_err(fs_info, 2472 "partial page write in btrfs with offset %u and length %u", 2473 bvec->bv_offset, bvec->bv_len); 2474 else 2475 btrfs_info(fs_info, 2476 "incomplete page write in btrfs with offset %u and length %u", 2477 bvec->bv_offset, bvec->bv_len); 2478 } 2479 2480 start = page_offset(page); 2481 end = start + bvec->bv_offset + bvec->bv_len - 1; 2482 2483 end_extent_writepage(page, error, start, end); 2484 end_page_writeback(page); 2485 } 2486 2487 bio_put(bio); 2488 } 2489 2490 static void 2491 endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len, 2492 int uptodate) 2493 { 2494 struct extent_state *cached = NULL; 2495 u64 end = start + len - 1; 2496 2497 if (uptodate && tree->track_uptodate) 2498 set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC); 2499 unlock_extent_cached_atomic(tree, start, end, &cached); 2500 } 2501 2502 /* 2503 * after a readpage IO is done, we need to: 2504 * clear the uptodate bits on error 2505 * set the uptodate bits if things worked 2506 * set the page up to date if all extents in the tree are uptodate 2507 * clear the lock bit in the extent tree 2508 * unlock the page if there are no other extents locked for it 2509 * 2510 * Scheduling is not allowed, so the extent state tree is expected 2511 * to have one and only one object corresponding to this IO. 2512 */ 2513 static void end_bio_extent_readpage(struct bio *bio) 2514 { 2515 struct bio_vec *bvec; 2516 int uptodate = !bio->bi_status; 2517 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 2518 struct extent_io_tree *tree, *failure_tree; 2519 u64 offset = 0; 2520 u64 start; 2521 u64 end; 2522 u64 len; 2523 u64 extent_start = 0; 2524 u64 extent_len = 0; 2525 int mirror; 2526 int ret; 2527 int i; 2528 struct bvec_iter_all iter_all; 2529 2530 ASSERT(!bio_flagged(bio, BIO_CLONED)); 2531 bio_for_each_segment_all(bvec, bio, i, iter_all) { 2532 struct page *page = bvec->bv_page; 2533 struct inode *inode = page->mapping->host; 2534 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2535 bool data_inode = btrfs_ino(BTRFS_I(inode)) 2536 != BTRFS_BTREE_INODE_OBJECTID; 2537 2538 btrfs_debug(fs_info, 2539 "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", 2540 (u64)bio->bi_iter.bi_sector, bio->bi_status, 2541 io_bio->mirror_num); 2542 tree = &BTRFS_I(inode)->io_tree; 2543 failure_tree = &BTRFS_I(inode)->io_failure_tree; 2544 2545 /* We always issue full-page reads, but if some block 2546 * in a page fails to read, blk_update_request() will 2547 * advance bv_offset and adjust bv_len to compensate. 2548 * Print a warning for nonzero offsets, and an error 2549 * if they don't add up to a full page. 
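 * For example, bv_offset 1024 with bv_len 3072 still sums to a 4K
 * PAGE_SIZE, so only the informational message below fires; a
 * combination that falls short of PAGE_SIZE is reported as an error
 * instead.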
*/ 2550 if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) { 2551 if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE) 2552 btrfs_err(fs_info, 2553 "partial page read in btrfs with offset %u and length %u", 2554 bvec->bv_offset, bvec->bv_len); 2555 else 2556 btrfs_info(fs_info, 2557 "incomplete page read in btrfs with offset %u and length %u", 2558 bvec->bv_offset, bvec->bv_len); 2559 } 2560 2561 start = page_offset(page); 2562 end = start + bvec->bv_offset + bvec->bv_len - 1; 2563 len = bvec->bv_len; 2564 2565 mirror = io_bio->mirror_num; 2566 if (likely(uptodate)) { 2567 ret = tree->ops->readpage_end_io_hook(io_bio, offset, 2568 page, start, end, 2569 mirror); 2570 if (ret) 2571 uptodate = 0; 2572 else 2573 clean_io_failure(BTRFS_I(inode)->root->fs_info, 2574 failure_tree, tree, start, 2575 page, 2576 btrfs_ino(BTRFS_I(inode)), 0); 2577 } 2578 2579 if (likely(uptodate)) 2580 goto readpage_ok; 2581 2582 if (data_inode) { 2583 2584 /* 2585 * The generic bio_readpage_error handles errors the 2586 * following way: If possible, new read requests are 2587 * created and submitted and will end up in 2588 * end_bio_extent_readpage as well (if we're lucky, 2589 * not in the !uptodate case). In that case it returns 2590 * 0 and we just go on with the next page in our bio. 2591 * If it can't handle the error it will return -EIO and 2592 * we remain responsible for that page. 2593 */ 2594 ret = bio_readpage_error(bio, offset, page, start, end, 2595 mirror); 2596 if (ret == 0) { 2597 uptodate = !bio->bi_status; 2598 offset += len; 2599 continue; 2600 } 2601 } else { 2602 struct extent_buffer *eb; 2603 2604 eb = (struct extent_buffer *)page->private; 2605 set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); 2606 eb->read_mirror = mirror; 2607 atomic_dec(&eb->io_pages); 2608 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, 2609 &eb->bflags)) 2610 btree_readahead_hook(eb, -EIO); 2611 2612 ret = -EIO; 2613 } 2614 readpage_ok: 2615 if (likely(uptodate)) { 2616 loff_t i_size = i_size_read(inode); 2617 pgoff_t end_index = i_size >> PAGE_SHIFT; 2618 unsigned off; 2619 2620 /* Zero out the end if this page straddles i_size */ 2621 off = offset_in_page(i_size); 2622 if (page->index == end_index && off) 2623 zero_user_segment(page, off, PAGE_SIZE); 2624 SetPageUptodate(page); 2625 } else { 2626 ClearPageUptodate(page); 2627 SetPageError(page); 2628 } 2629 unlock_page(page); 2630 offset += len; 2631 2632 if (unlikely(!uptodate)) { 2633 if (extent_len) { 2634 endio_readpage_release_extent(tree, 2635 extent_start, 2636 extent_len, 1); 2637 extent_start = 0; 2638 extent_len = 0; 2639 } 2640 endio_readpage_release_extent(tree, start, 2641 end - start + 1, 0); 2642 } else if (!extent_len) { 2643 extent_start = start; 2644 extent_len = end + 1 - start; 2645 } else if (extent_start + extent_len == start) { 2646 extent_len += end + 1 - start; 2647 } else { 2648 endio_readpage_release_extent(tree, extent_start, 2649 extent_len, uptodate); 2650 extent_start = start; 2651 extent_len = end + 1 - start; 2652 } 2653 } 2654 2655 if (extent_len) 2656 endio_readpage_release_extent(tree, extent_start, extent_len, 2657 uptodate); 2658 btrfs_io_bio_free_csum(io_bio); 2659 bio_put(bio); 2660 } 2661 2662 /* 2663 * Initialize the members up to but not including 'bio'. Use after allocating a 2664 * new bio by bio_alloc_bioset as it does not initialize the bytes outside of 2665 * 'bio' because use of __GFP_ZERO is not supported. 
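 *
 * The idiom is the usual "zero everything declared in front of a given
 * member" trick; with a made-up wrapper it would look like:
 *
 *	struct wrapper {
 *		u64 mirror;
 *		u8 *csum;
 *		struct bio bio;
 *	};
 *
 *	memset(w, 0, offsetof(struct wrapper, bio));
 *
 * which clears mirror and csum but leaves the embedded bio, already
 * set up by the bioset allocation, untouched.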
2666 */ 2667 static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio) 2668 { 2669 memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio)); 2670 } 2671 2672 /* 2673 * The following helpers allocate a bio. As it's backed by a bioset, it'll 2674 * never fail. We're returning a bio right now but you can call btrfs_io_bio 2675 * for the appropriate container_of magic 2676 */ 2677 struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte) 2678 { 2679 struct bio *bio; 2680 2681 bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset); 2682 bio_set_dev(bio, bdev); 2683 bio->bi_iter.bi_sector = first_byte >> 9; 2684 btrfs_io_bio_init(btrfs_io_bio(bio)); 2685 return bio; 2686 } 2687 2688 struct bio *btrfs_bio_clone(struct bio *bio) 2689 { 2690 struct btrfs_io_bio *btrfs_bio; 2691 struct bio *new; 2692 2693 /* Bio allocation backed by a bioset does not fail */ 2694 new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset); 2695 btrfs_bio = btrfs_io_bio(new); 2696 btrfs_io_bio_init(btrfs_bio); 2697 btrfs_bio->iter = bio->bi_iter; 2698 return new; 2699 } 2700 2701 struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs) 2702 { 2703 struct bio *bio; 2704 2705 /* Bio allocation backed by a bioset does not fail */ 2706 bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset); 2707 btrfs_io_bio_init(btrfs_io_bio(bio)); 2708 return bio; 2709 } 2710 2711 struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) 2712 { 2713 struct bio *bio; 2714 struct btrfs_io_bio *btrfs_bio; 2715 2716 /* this will never fail when it's backed by a bioset */ 2717 bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset); 2718 ASSERT(bio); 2719 2720 btrfs_bio = btrfs_io_bio(bio); 2721 btrfs_io_bio_init(btrfs_bio); 2722 2723 bio_trim(bio, offset >> 9, size >> 9); 2724 btrfs_bio->iter = bio->bi_iter; 2725 return bio; 2726 } 2727 2728 /* 2729 * @opf: bio REQ_OP_* and REQ_* flags as one value 2730 * @tree: tree so we can call our merge_bio hook 2731 * @wbc: optional writeback control for io accounting 2732 * @page: page to add to the bio 2733 * @pg_offset: offset of the new bio or to check whether we are adding 2734 * a contiguous page to the previous one 2735 * @size: portion of page that we want to write 2736 * @offset: starting offset in the page 2737 * @bdev: attach newly created bios to this bdev 2738 * @bio_ret: must be valid pointer, newly allocated bio will be stored there 2739 * @end_io_func: end_io callback for new bio 2740 * @mirror_num: desired mirror to read/write 2741 * @prev_bio_flags: flags of previous bio to see if we can merge the current one 2742 * @bio_flags: flags of the current bio to see if we can merge them 2743 */ 2744 static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, 2745 struct writeback_control *wbc, 2746 struct page *page, u64 offset, 2747 size_t size, unsigned long pg_offset, 2748 struct block_device *bdev, 2749 struct bio **bio_ret, 2750 bio_end_io_t end_io_func, 2751 int mirror_num, 2752 unsigned long prev_bio_flags, 2753 unsigned long bio_flags, 2754 bool force_bio_submit) 2755 { 2756 int ret = 0; 2757 struct bio *bio; 2758 size_t page_size = min_t(size_t, size, PAGE_SIZE); 2759 sector_t sector = offset >> 9; 2760 2761 ASSERT(bio_ret); 2762 2763 if (*bio_ret) { 2764 bool contig; 2765 bool can_merge = true; 2766 2767 bio = *bio_ret; 2768 if (prev_bio_flags & EXTENT_BIO_COMPRESSED) 2769 contig = bio->bi_iter.bi_sector == sector; 2770 else 2771 contig = bio_end_sector(bio) == sector; 2772 2773 ASSERT(tree->ops); 2774 if 
(btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags)) 2775 can_merge = false; 2776 2777 if (prev_bio_flags != bio_flags || !contig || !can_merge || 2778 force_bio_submit || 2779 bio_add_page(bio, page, page_size, pg_offset) < page_size) { 2780 ret = submit_one_bio(bio, mirror_num, prev_bio_flags); 2781 if (ret < 0) { 2782 *bio_ret = NULL; 2783 return ret; 2784 } 2785 bio = NULL; 2786 } else { 2787 if (wbc) 2788 wbc_account_io(wbc, page, page_size); 2789 return 0; 2790 } 2791 } 2792 2793 bio = btrfs_bio_alloc(bdev, offset); 2794 bio_add_page(bio, page, page_size, pg_offset); 2795 bio->bi_end_io = end_io_func; 2796 bio->bi_private = tree; 2797 bio->bi_write_hint = page->mapping->host->i_write_hint; 2798 bio->bi_opf = opf; 2799 if (wbc) { 2800 wbc_init_bio(wbc, bio); 2801 wbc_account_io(wbc, page, page_size); 2802 } 2803 2804 *bio_ret = bio; 2805 2806 return ret; 2807 } 2808 2809 static void attach_extent_buffer_page(struct extent_buffer *eb, 2810 struct page *page) 2811 { 2812 if (!PagePrivate(page)) { 2813 SetPagePrivate(page); 2814 get_page(page); 2815 set_page_private(page, (unsigned long)eb); 2816 } else { 2817 WARN_ON(page->private != (unsigned long)eb); 2818 } 2819 } 2820 2821 void set_page_extent_mapped(struct page *page) 2822 { 2823 if (!PagePrivate(page)) { 2824 SetPagePrivate(page); 2825 get_page(page); 2826 set_page_private(page, EXTENT_PAGE_PRIVATE); 2827 } 2828 } 2829 2830 static struct extent_map * 2831 __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, 2832 u64 start, u64 len, get_extent_t *get_extent, 2833 struct extent_map **em_cached) 2834 { 2835 struct extent_map *em; 2836 2837 if (em_cached && *em_cached) { 2838 em = *em_cached; 2839 if (extent_map_in_tree(em) && start >= em->start && 2840 start < extent_map_end(em)) { 2841 refcount_inc(&em->refs); 2842 return em; 2843 } 2844 2845 free_extent_map(em); 2846 *em_cached = NULL; 2847 } 2848 2849 em = get_extent(BTRFS_I(inode), page, pg_offset, start, len, 0); 2850 if (em_cached && !IS_ERR_OR_NULL(em)) { 2851 BUG_ON(*em_cached); 2852 refcount_inc(&em->refs); 2853 *em_cached = em; 2854 } 2855 return em; 2856 } 2857 /* 2858 * basic readpage implementation. 
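 * The page is walked in blocksize-aligned chunks: holes and the area
 * past i_size are zeroed in place, chunks whose range is already
 * EXTENT_UPTODATE are skipped, an inline extent that was not copied up
 * front is treated as an error, and everything else is handed to
 * submit_extent_page() for a read.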
Locked extent state structs are inserted 2859 * into the tree that are removed when the IO is done (by the end_io 2860 * handlers) 2861 * XXX JDM: This needs looking at to ensure proper page locking 2862 * return 0 on success, otherwise return error 2863 */ 2864 static int __do_readpage(struct extent_io_tree *tree, 2865 struct page *page, 2866 get_extent_t *get_extent, 2867 struct extent_map **em_cached, 2868 struct bio **bio, int mirror_num, 2869 unsigned long *bio_flags, unsigned int read_flags, 2870 u64 *prev_em_start) 2871 { 2872 struct inode *inode = page->mapping->host; 2873 u64 start = page_offset(page); 2874 const u64 end = start + PAGE_SIZE - 1; 2875 u64 cur = start; 2876 u64 extent_offset; 2877 u64 last_byte = i_size_read(inode); 2878 u64 block_start; 2879 u64 cur_end; 2880 struct extent_map *em; 2881 struct block_device *bdev; 2882 int ret = 0; 2883 int nr = 0; 2884 size_t pg_offset = 0; 2885 size_t iosize; 2886 size_t disk_io_size; 2887 size_t blocksize = inode->i_sb->s_blocksize; 2888 unsigned long this_bio_flag = 0; 2889 2890 set_page_extent_mapped(page); 2891 2892 if (!PageUptodate(page)) { 2893 if (cleancache_get_page(page) == 0) { 2894 BUG_ON(blocksize != PAGE_SIZE); 2895 unlock_extent(tree, start, end); 2896 goto out; 2897 } 2898 } 2899 2900 if (page->index == last_byte >> PAGE_SHIFT) { 2901 char *userpage; 2902 size_t zero_offset = offset_in_page(last_byte); 2903 2904 if (zero_offset) { 2905 iosize = PAGE_SIZE - zero_offset; 2906 userpage = kmap_atomic(page); 2907 memset(userpage + zero_offset, 0, iosize); 2908 flush_dcache_page(page); 2909 kunmap_atomic(userpage); 2910 } 2911 } 2912 while (cur <= end) { 2913 bool force_bio_submit = false; 2914 u64 offset; 2915 2916 if (cur >= last_byte) { 2917 char *userpage; 2918 struct extent_state *cached = NULL; 2919 2920 iosize = PAGE_SIZE - pg_offset; 2921 userpage = kmap_atomic(page); 2922 memset(userpage + pg_offset, 0, iosize); 2923 flush_dcache_page(page); 2924 kunmap_atomic(userpage); 2925 set_extent_uptodate(tree, cur, cur + iosize - 1, 2926 &cached, GFP_NOFS); 2927 unlock_extent_cached(tree, cur, 2928 cur + iosize - 1, &cached); 2929 break; 2930 } 2931 em = __get_extent_map(inode, page, pg_offset, cur, 2932 end - cur + 1, get_extent, em_cached); 2933 if (IS_ERR_OR_NULL(em)) { 2934 SetPageError(page); 2935 unlock_extent(tree, cur, end); 2936 break; 2937 } 2938 extent_offset = cur - em->start; 2939 BUG_ON(extent_map_end(em) <= cur); 2940 BUG_ON(end < cur); 2941 2942 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 2943 this_bio_flag |= EXTENT_BIO_COMPRESSED; 2944 extent_set_compress_type(&this_bio_flag, 2945 em->compress_type); 2946 } 2947 2948 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2949 cur_end = min(extent_map_end(em) - 1, end); 2950 iosize = ALIGN(iosize, blocksize); 2951 if (this_bio_flag & EXTENT_BIO_COMPRESSED) { 2952 disk_io_size = em->block_len; 2953 offset = em->block_start; 2954 } else { 2955 offset = em->block_start + extent_offset; 2956 disk_io_size = iosize; 2957 } 2958 bdev = em->bdev; 2959 block_start = em->block_start; 2960 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 2961 block_start = EXTENT_MAP_HOLE; 2962 2963 /* 2964 * If we have a file range that points to a compressed extent 2965 * and it's followed by a consecutive file range that points to 2966 * to the same compressed extent (possibly with a different 2967 * offset and/or length, so it either points to the whole extent 2968 * or only part of it), we must make sure we do not submit a 2969 * single bio to populate the pages for the 
2 ranges because 2970 * this makes the compressed extent read zero out the pages 2971 * belonging to the 2nd range. Imagine the following scenario: 2972 * 2973 * File layout 2974 * [0 - 8K] [8K - 24K] 2975 * | | 2976 * | | 2977 * points to extent X, points to extent X, 2978 * offset 4K, length of 8K offset 0, length 16K 2979 * 2980 * [extent X, compressed length = 4K uncompressed length = 16K] 2981 * 2982 * If the bio to read the compressed extent covers both ranges, 2983 * it will decompress extent X into the pages belonging to the 2984 * first range and then it will stop, zeroing out the remaining 2985 * pages that belong to the other range that points to extent X. 2986 * So here we make sure we submit 2 bios, one for the first 2987 * range and another one for the third range. Both will target 2988 * the same physical extent from disk, but we can't currently 2989 * make the compressed bio endio callback populate the pages 2990 * for both ranges because each compressed bio is tightly 2991 * coupled with a single extent map, and each range can have 2992 * an extent map with a different offset value relative to the 2993 * uncompressed data of our extent and different lengths. This 2994 * is a corner case so we prioritize correctness over 2995 * non-optimal behavior (submitting 2 bios for the same extent). 2996 */ 2997 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) && 2998 prev_em_start && *prev_em_start != (u64)-1 && 2999 *prev_em_start != em->start) 3000 force_bio_submit = true; 3001 3002 if (prev_em_start) 3003 *prev_em_start = em->start; 3004 3005 free_extent_map(em); 3006 em = NULL; 3007 3008 /* we've found a hole, just zero and go on */ 3009 if (block_start == EXTENT_MAP_HOLE) { 3010 char *userpage; 3011 struct extent_state *cached = NULL; 3012 3013 userpage = kmap_atomic(page); 3014 memset(userpage + pg_offset, 0, iosize); 3015 flush_dcache_page(page); 3016 kunmap_atomic(userpage); 3017 3018 set_extent_uptodate(tree, cur, cur + iosize - 1, 3019 &cached, GFP_NOFS); 3020 unlock_extent_cached(tree, cur, 3021 cur + iosize - 1, &cached); 3022 cur = cur + iosize; 3023 pg_offset += iosize; 3024 continue; 3025 } 3026 /* the get_extent function already copied into the page */ 3027 if (test_range_bit(tree, cur, cur_end, 3028 EXTENT_UPTODATE, 1, NULL)) { 3029 check_page_uptodate(tree, page); 3030 unlock_extent(tree, cur, cur + iosize - 1); 3031 cur = cur + iosize; 3032 pg_offset += iosize; 3033 continue; 3034 } 3035 /* we have an inline extent but it didn't get marked up 3036 * to date. 
Error out 3037 */ 3038 if (block_start == EXTENT_MAP_INLINE) { 3039 SetPageError(page); 3040 unlock_extent(tree, cur, cur + iosize - 1); 3041 cur = cur + iosize; 3042 pg_offset += iosize; 3043 continue; 3044 } 3045 3046 ret = submit_extent_page(REQ_OP_READ | read_flags, tree, NULL, 3047 page, offset, disk_io_size, 3048 pg_offset, bdev, bio, 3049 end_bio_extent_readpage, mirror_num, 3050 *bio_flags, 3051 this_bio_flag, 3052 force_bio_submit); 3053 if (!ret) { 3054 nr++; 3055 *bio_flags = this_bio_flag; 3056 } else { 3057 SetPageError(page); 3058 unlock_extent(tree, cur, cur + iosize - 1); 3059 goto out; 3060 } 3061 cur = cur + iosize; 3062 pg_offset += iosize; 3063 } 3064 out: 3065 if (!nr) { 3066 if (!PageError(page)) 3067 SetPageUptodate(page); 3068 unlock_page(page); 3069 } 3070 return ret; 3071 } 3072 3073 static inline void __do_contiguous_readpages(struct extent_io_tree *tree, 3074 struct page *pages[], int nr_pages, 3075 u64 start, u64 end, 3076 struct extent_map **em_cached, 3077 struct bio **bio, 3078 unsigned long *bio_flags, 3079 u64 *prev_em_start) 3080 { 3081 struct inode *inode; 3082 struct btrfs_ordered_extent *ordered; 3083 int index; 3084 3085 inode = pages[0]->mapping->host; 3086 while (1) { 3087 lock_extent(tree, start, end); 3088 ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, 3089 end - start + 1); 3090 if (!ordered) 3091 break; 3092 unlock_extent(tree, start, end); 3093 btrfs_start_ordered_extent(inode, ordered, 1); 3094 btrfs_put_ordered_extent(ordered); 3095 } 3096 3097 for (index = 0; index < nr_pages; index++) { 3098 __do_readpage(tree, pages[index], btrfs_get_extent, em_cached, 3099 bio, 0, bio_flags, REQ_RAHEAD, prev_em_start); 3100 put_page(pages[index]); 3101 } 3102 } 3103 3104 static void __extent_readpages(struct extent_io_tree *tree, 3105 struct page *pages[], 3106 int nr_pages, 3107 struct extent_map **em_cached, 3108 struct bio **bio, unsigned long *bio_flags, 3109 u64 *prev_em_start) 3110 { 3111 u64 start = 0; 3112 u64 end = 0; 3113 u64 page_start; 3114 int index; 3115 int first_index = 0; 3116 3117 for (index = 0; index < nr_pages; index++) { 3118 page_start = page_offset(pages[index]); 3119 if (!end) { 3120 start = page_start; 3121 end = start + PAGE_SIZE - 1; 3122 first_index = index; 3123 } else if (end + 1 == page_start) { 3124 end += PAGE_SIZE; 3125 } else { 3126 __do_contiguous_readpages(tree, &pages[first_index], 3127 index - first_index, start, 3128 end, em_cached, 3129 bio, bio_flags, 3130 prev_em_start); 3131 start = page_start; 3132 end = start + PAGE_SIZE - 1; 3133 first_index = index; 3134 } 3135 } 3136 3137 if (end) 3138 __do_contiguous_readpages(tree, &pages[first_index], 3139 index - first_index, start, 3140 end, em_cached, bio, 3141 bio_flags, prev_em_start); 3142 } 3143 3144 static int __extent_read_full_page(struct extent_io_tree *tree, 3145 struct page *page, 3146 get_extent_t *get_extent, 3147 struct bio **bio, int mirror_num, 3148 unsigned long *bio_flags, 3149 unsigned int read_flags) 3150 { 3151 struct inode *inode = page->mapping->host; 3152 struct btrfs_ordered_extent *ordered; 3153 u64 start = page_offset(page); 3154 u64 end = start + PAGE_SIZE - 1; 3155 int ret; 3156 3157 while (1) { 3158 lock_extent(tree, start, end); 3159 ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, 3160 PAGE_SIZE); 3161 if (!ordered) 3162 break; 3163 unlock_extent(tree, start, end); 3164 btrfs_start_ordered_extent(inode, ordered, 1); 3165 btrfs_put_ordered_extent(ordered); 3166 } 3167 3168 ret = __do_readpage(tree, page, 
get_extent, NULL, bio, mirror_num, 3169 bio_flags, read_flags, NULL); 3170 return ret; 3171 } 3172 3173 int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 3174 get_extent_t *get_extent, int mirror_num) 3175 { 3176 struct bio *bio = NULL; 3177 unsigned long bio_flags = 0; 3178 int ret; 3179 3180 ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, 3181 &bio_flags, 0); 3182 if (bio) 3183 ret = submit_one_bio(bio, mirror_num, bio_flags); 3184 return ret; 3185 } 3186 3187 static void update_nr_written(struct writeback_control *wbc, 3188 unsigned long nr_written) 3189 { 3190 wbc->nr_to_write -= nr_written; 3191 } 3192 3193 /* 3194 * helper for __extent_writepage, doing all of the delayed allocation setup. 3195 * 3196 * This returns 1 if btrfs_run_delalloc_range function did all the work required 3197 * to write the page (copy into inline extent). In this case the IO has 3198 * been started and the page is already unlocked. 3199 * 3200 * This returns 0 if all went well (page still locked) 3201 * This returns < 0 if there were errors (page still locked) 3202 */ 3203 static noinline_for_stack int writepage_delalloc(struct inode *inode, 3204 struct page *page, struct writeback_control *wbc, 3205 u64 delalloc_start, unsigned long *nr_written) 3206 { 3207 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 3208 u64 page_end = delalloc_start + PAGE_SIZE - 1; 3209 bool found; 3210 u64 delalloc_to_write = 0; 3211 u64 delalloc_end = 0; 3212 int ret; 3213 int page_started = 0; 3214 3215 3216 while (delalloc_end < page_end) { 3217 found = find_lock_delalloc_range(inode, tree, 3218 page, 3219 &delalloc_start, 3220 &delalloc_end); 3221 if (!found) { 3222 delalloc_start = delalloc_end + 1; 3223 continue; 3224 } 3225 ret = btrfs_run_delalloc_range(inode, page, delalloc_start, 3226 delalloc_end, &page_started, nr_written, wbc); 3227 /* File system has been set read-only */ 3228 if (ret) { 3229 SetPageError(page); 3230 /* 3231 * btrfs_run_delalloc_range should return < 0 for error 3232 * but just in case, we use > 0 here meaning the IO is 3233 * started, so we don't want to return > 0 unless 3234 * things are going well. 3235 */ 3236 ret = ret < 0 ? ret : -EIO; 3237 goto done; 3238 } 3239 /* 3240 * delalloc_end is already one less than the total length, so 3241 * we don't subtract one from PAGE_SIZE 3242 */ 3243 delalloc_to_write += (delalloc_end - delalloc_start + 3244 PAGE_SIZE) >> PAGE_SHIFT; 3245 delalloc_start = delalloc_end + 1; 3246 } 3247 if (wbc->nr_to_write < delalloc_to_write) { 3248 int thresh = 8192; 3249 3250 if (delalloc_to_write < thresh * 2) 3251 thresh = delalloc_to_write; 3252 wbc->nr_to_write = min_t(u64, delalloc_to_write, 3253 thresh); 3254 } 3255 3256 /* did the fill delalloc function already unlock and start 3257 * the IO? 3258 */ 3259 if (page_started) { 3260 /* 3261 * we've unlocked the page, so we can't update 3262 * the mapping's writeback index, just update 3263 * nr_to_write. 3264 */ 3265 wbc->nr_to_write -= *nr_written; 3266 return 1; 3267 } 3268 3269 ret = 0; 3270 3271 done: 3272 return ret; 3273 } 3274 3275 /* 3276 * helper for __extent_writepage. This calls the writepage start hooks, 3277 * and does the loop to map the page into extents and bios. 
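 * Each pass of that loop maps [cur, end] onto one extent, clamps the
 * chunk to both the extent and the page and rounds it up to the block
 * size, roughly:
 *
 *	iosize = min(em_end - cur, end - cur + 1);
 *	iosize = ALIGN(iosize, blocksize);
 *
 * before the chunk is handed to submit_extent_page().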
3278 * 3279 * We return 1 if the IO is started and the page is unlocked, 3280 * 0 if all went well (page still locked) 3281 * < 0 if there were errors (page still locked) 3282 */ 3283 static noinline_for_stack int __extent_writepage_io(struct inode *inode, 3284 struct page *page, 3285 struct writeback_control *wbc, 3286 struct extent_page_data *epd, 3287 loff_t i_size, 3288 unsigned long nr_written, 3289 unsigned int write_flags, int *nr_ret) 3290 { 3291 struct extent_io_tree *tree = epd->tree; 3292 u64 start = page_offset(page); 3293 u64 page_end = start + PAGE_SIZE - 1; 3294 u64 end; 3295 u64 cur = start; 3296 u64 extent_offset; 3297 u64 block_start; 3298 u64 iosize; 3299 struct extent_map *em; 3300 struct block_device *bdev; 3301 size_t pg_offset = 0; 3302 size_t blocksize; 3303 int ret = 0; 3304 int nr = 0; 3305 bool compressed; 3306 3307 ret = btrfs_writepage_cow_fixup(page, start, page_end); 3308 if (ret) { 3309 /* Fixup worker will requeue */ 3310 if (ret == -EBUSY) 3311 wbc->pages_skipped++; 3312 else 3313 redirty_page_for_writepage(wbc, page); 3314 3315 update_nr_written(wbc, nr_written); 3316 unlock_page(page); 3317 return 1; 3318 } 3319 3320 /* 3321 * we don't want to touch the inode after unlocking the page, 3322 * so we update the mapping writeback index now 3323 */ 3324 update_nr_written(wbc, nr_written + 1); 3325 3326 end = page_end; 3327 if (i_size <= start) { 3328 btrfs_writepage_endio_finish_ordered(page, start, page_end, 1); 3329 goto done; 3330 } 3331 3332 blocksize = inode->i_sb->s_blocksize; 3333 3334 while (cur <= end) { 3335 u64 em_end; 3336 u64 offset; 3337 3338 if (cur >= i_size) { 3339 btrfs_writepage_endio_finish_ordered(page, cur, 3340 page_end, 1); 3341 break; 3342 } 3343 em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, cur, 3344 end - cur + 1, 1); 3345 if (IS_ERR_OR_NULL(em)) { 3346 SetPageError(page); 3347 ret = PTR_ERR_OR_ZERO(em); 3348 break; 3349 } 3350 3351 extent_offset = cur - em->start; 3352 em_end = extent_map_end(em); 3353 BUG_ON(em_end <= cur); 3354 BUG_ON(end < cur); 3355 iosize = min(em_end - cur, end - cur + 1); 3356 iosize = ALIGN(iosize, blocksize); 3357 offset = em->block_start + extent_offset; 3358 bdev = em->bdev; 3359 block_start = em->block_start; 3360 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 3361 free_extent_map(em); 3362 em = NULL; 3363 3364 /* 3365 * compressed and inline extents are written through other 3366 * paths in the FS 3367 */ 3368 if (compressed || block_start == EXTENT_MAP_HOLE || 3369 block_start == EXTENT_MAP_INLINE) { 3370 /* 3371 * end_io notification does not happen here for 3372 * compressed extents 3373 */ 3374 if (!compressed) 3375 btrfs_writepage_endio_finish_ordered(page, cur, 3376 cur + iosize - 1, 3377 1); 3378 else if (compressed) { 3379 /* we don't want to end_page_writeback on 3380 * a compressed extent. 
this happens 3381 * elsewhere 3382 */ 3383 nr++; 3384 } 3385 3386 cur += iosize; 3387 pg_offset += iosize; 3388 continue; 3389 } 3390 3391 btrfs_set_range_writeback(tree, cur, cur + iosize - 1); 3392 if (!PageWriteback(page)) { 3393 btrfs_err(BTRFS_I(inode)->root->fs_info, 3394 "page %lu not writeback, cur %llu end %llu", 3395 page->index, cur, end); 3396 } 3397 3398 ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc, 3399 page, offset, iosize, pg_offset, 3400 bdev, &epd->bio, 3401 end_bio_extent_writepage, 3402 0, 0, 0, false); 3403 if (ret) { 3404 SetPageError(page); 3405 if (PageWriteback(page)) 3406 end_page_writeback(page); 3407 } 3408 3409 cur = cur + iosize; 3410 pg_offset += iosize; 3411 nr++; 3412 } 3413 done: 3414 *nr_ret = nr; 3415 return ret; 3416 } 3417 3418 /* 3419 * the writepage semantics are similar to regular writepage. extent 3420 * records are inserted to lock ranges in the tree, and as dirty areas 3421 * are found, they are marked writeback. Then the lock bits are removed 3422 * and the end_io handler clears the writeback ranges 3423 */ 3424 static int __extent_writepage(struct page *page, struct writeback_control *wbc, 3425 struct extent_page_data *epd) 3426 { 3427 struct inode *inode = page->mapping->host; 3428 u64 start = page_offset(page); 3429 u64 page_end = start + PAGE_SIZE - 1; 3430 int ret; 3431 int nr = 0; 3432 size_t pg_offset = 0; 3433 loff_t i_size = i_size_read(inode); 3434 unsigned long end_index = i_size >> PAGE_SHIFT; 3435 unsigned int write_flags = 0; 3436 unsigned long nr_written = 0; 3437 3438 write_flags = wbc_to_write_flags(wbc); 3439 3440 trace___extent_writepage(page, inode, wbc); 3441 3442 WARN_ON(!PageLocked(page)); 3443 3444 ClearPageError(page); 3445 3446 pg_offset = offset_in_page(i_size); 3447 if (page->index > end_index || 3448 (page->index == end_index && !pg_offset)) { 3449 page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); 3450 unlock_page(page); 3451 return 0; 3452 } 3453 3454 if (page->index == end_index) { 3455 char *userpage; 3456 3457 userpage = kmap_atomic(page); 3458 memset(userpage + pg_offset, 0, 3459 PAGE_SIZE - pg_offset); 3460 kunmap_atomic(userpage); 3461 flush_dcache_page(page); 3462 } 3463 3464 pg_offset = 0; 3465 3466 set_page_extent_mapped(page); 3467 3468 if (!epd->extent_locked) { 3469 ret = writepage_delalloc(inode, page, wbc, start, &nr_written); 3470 if (ret == 1) 3471 goto done_unlocked; 3472 if (ret) 3473 goto done; 3474 } 3475 3476 ret = __extent_writepage_io(inode, page, wbc, epd, 3477 i_size, nr_written, write_flags, &nr); 3478 if (ret == 1) 3479 goto done_unlocked; 3480 3481 done: 3482 if (nr == 0) { 3483 /* make sure the mapping tag for page dirty gets cleared */ 3484 set_page_writeback(page); 3485 end_page_writeback(page); 3486 } 3487 if (PageError(page)) { 3488 ret = ret < 0 ? 
ret : -EIO; 3489 end_extent_writepage(page, ret, start, page_end); 3490 } 3491 unlock_page(page); 3492 return ret; 3493 3494 done_unlocked: 3495 return 0; 3496 } 3497 3498 void wait_on_extent_buffer_writeback(struct extent_buffer *eb) 3499 { 3500 wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK, 3501 TASK_UNINTERRUPTIBLE); 3502 } 3503 3504 static noinline_for_stack int 3505 lock_extent_buffer_for_io(struct extent_buffer *eb, 3506 struct btrfs_fs_info *fs_info, 3507 struct extent_page_data *epd) 3508 { 3509 int i, num_pages; 3510 int flush = 0; 3511 int ret = 0; 3512 3513 if (!btrfs_try_tree_write_lock(eb)) { 3514 flush = 1; 3515 flush_write_bio(epd); 3516 btrfs_tree_lock(eb); 3517 } 3518 3519 if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { 3520 btrfs_tree_unlock(eb); 3521 if (!epd->sync_io) 3522 return 0; 3523 if (!flush) { 3524 flush_write_bio(epd); 3525 flush = 1; 3526 } 3527 while (1) { 3528 wait_on_extent_buffer_writeback(eb); 3529 btrfs_tree_lock(eb); 3530 if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) 3531 break; 3532 btrfs_tree_unlock(eb); 3533 } 3534 } 3535 3536 /* 3537 * We need to do this to prevent races in people who check if the eb is 3538 * under IO since we can end up having no IO bits set for a short period 3539 * of time. 3540 */ 3541 spin_lock(&eb->refs_lock); 3542 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3543 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3544 spin_unlock(&eb->refs_lock); 3545 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 3546 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, 3547 -eb->len, 3548 fs_info->dirty_metadata_batch); 3549 ret = 1; 3550 } else { 3551 spin_unlock(&eb->refs_lock); 3552 } 3553 3554 btrfs_tree_unlock(eb); 3555 3556 if (!ret) 3557 return ret; 3558 3559 num_pages = num_extent_pages(eb); 3560 for (i = 0; i < num_pages; i++) { 3561 struct page *p = eb->pages[i]; 3562 3563 if (!trylock_page(p)) { 3564 if (!flush) { 3565 flush_write_bio(epd); 3566 flush = 1; 3567 } 3568 lock_page(p); 3569 } 3570 } 3571 3572 return ret; 3573 } 3574 3575 static void end_extent_buffer_writeback(struct extent_buffer *eb) 3576 { 3577 clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3578 smp_mb__after_atomic(); 3579 wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); 3580 } 3581 3582 static void set_btree_ioerr(struct page *page) 3583 { 3584 struct extent_buffer *eb = (struct extent_buffer *)page->private; 3585 3586 SetPageError(page); 3587 if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) 3588 return; 3589 3590 /* 3591 * If writeback for a btree extent that doesn't belong to a log tree 3592 * failed, increment the counter transaction->eb_write_errors. 3593 * We do this because while the transaction is running and before it's 3594 * committing (when we call filemap_fdata[write|wait]_range against 3595 * the btree inode), we might have 3596 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it 3597 * returns an error or an error happens during writeback, when we're 3598 * committing the transaction we wouldn't know about it, since the pages 3599 * can be no longer dirty nor marked anymore for writeback (if a 3600 * subsequent modification to the extent buffer didn't happen before the 3601 * transaction commit), which makes filemap_fdata[write|wait]_range not 3602 * able to find the pages tagged with SetPageError at transaction 3603 * commit time. 
So if this happens we must abort the transaction, 3604 * otherwise we commit a super block with btree roots that point to 3605 * btree nodes/leafs whose content on disk is invalid - either garbage 3606 * or the content of some node/leaf from a past generation that got 3607 * cowed or deleted and is no longer valid. 3608 * 3609 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would 3610 * not be enough - we need to distinguish between log tree extents vs 3611 * non-log tree extents, and the next filemap_fdatawait_range() call 3612 * will catch and clear such errors in the mapping - and that call might 3613 * be from a log sync and not from a transaction commit. Also, checking 3614 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is 3615 * not done and would not be reliable - the eb might have been released 3616 * from memory and reading it back again means that flag would not be 3617 * set (since it's a runtime flag, not persisted on disk). 3618 * 3619 * Using the flags below in the btree inode also makes us achieve the 3620 * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started 3621 * writeback for all dirty pages and before filemap_fdatawait_range() 3622 * is called, the writeback for all dirty pages had already finished 3623 * with errors - because we were not using AS_EIO/AS_ENOSPC, 3624 * filemap_fdatawait_range() would return success, as it could not know 3625 * that writeback errors happened (the pages were no longer tagged for 3626 * writeback). 3627 */ 3628 switch (eb->log_index) { 3629 case -1: 3630 set_bit(BTRFS_FS_BTREE_ERR, &eb->fs_info->flags); 3631 break; 3632 case 0: 3633 set_bit(BTRFS_FS_LOG1_ERR, &eb->fs_info->flags); 3634 break; 3635 case 1: 3636 set_bit(BTRFS_FS_LOG2_ERR, &eb->fs_info->flags); 3637 break; 3638 default: 3639 BUG(); /* unexpected, logic error */ 3640 } 3641 } 3642 3643 static void end_bio_extent_buffer_writepage(struct bio *bio) 3644 { 3645 struct bio_vec *bvec; 3646 struct extent_buffer *eb; 3647 int i, done; 3648 struct bvec_iter_all iter_all; 3649 3650 ASSERT(!bio_flagged(bio, BIO_CLONED)); 3651 bio_for_each_segment_all(bvec, bio, i, iter_all) { 3652 struct page *page = bvec->bv_page; 3653 3654 eb = (struct extent_buffer *)page->private; 3655 BUG_ON(!eb); 3656 done = atomic_dec_and_test(&eb->io_pages); 3657 3658 if (bio->bi_status || 3659 test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { 3660 ClearPageUptodate(page); 3661 set_btree_ioerr(page); 3662 } 3663 3664 end_page_writeback(page); 3665 3666 if (!done) 3667 continue; 3668 3669 end_extent_buffer_writeback(eb); 3670 } 3671 3672 bio_put(bio); 3673 } 3674 3675 static noinline_for_stack int write_one_eb(struct extent_buffer *eb, 3676 struct btrfs_fs_info *fs_info, 3677 struct writeback_control *wbc, 3678 struct extent_page_data *epd) 3679 { 3680 struct block_device *bdev = fs_info->fs_devices->latest_bdev; 3681 struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree; 3682 u64 offset = eb->start; 3683 u32 nritems; 3684 int i, num_pages; 3685 unsigned long start, end; 3686 unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META; 3687 int ret = 0; 3688 3689 clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); 3690 num_pages = num_extent_pages(eb); 3691 atomic_set(&eb->io_pages, num_pages); 3692 3693 /* set btree blocks beyond nritems with 0 to avoid stale content. 
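 * For a node that is everything past the last valid key pointer, i.e.
 * from btrfs_node_key_ptr_offset(nritems) to the end of the buffer;
 * for a leaf it is the unused middle of the block, between the end of
 * the item headers and the start of the item data.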
*/ 3694 nritems = btrfs_header_nritems(eb); 3695 if (btrfs_header_level(eb) > 0) { 3696 end = btrfs_node_key_ptr_offset(nritems); 3697 3698 memzero_extent_buffer(eb, end, eb->len - end); 3699 } else { 3700 /* 3701 * leaf: 3702 * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0 3703 */ 3704 start = btrfs_item_nr_offset(nritems); 3705 end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, eb); 3706 memzero_extent_buffer(eb, start, end - start); 3707 } 3708 3709 for (i = 0; i < num_pages; i++) { 3710 struct page *p = eb->pages[i]; 3711 3712 clear_page_dirty_for_io(p); 3713 set_page_writeback(p); 3714 ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc, 3715 p, offset, PAGE_SIZE, 0, bdev, 3716 &epd->bio, 3717 end_bio_extent_buffer_writepage, 3718 0, 0, 0, false); 3719 if (ret) { 3720 set_btree_ioerr(p); 3721 if (PageWriteback(p)) 3722 end_page_writeback(p); 3723 if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) 3724 end_extent_buffer_writeback(eb); 3725 ret = -EIO; 3726 break; 3727 } 3728 offset += PAGE_SIZE; 3729 update_nr_written(wbc, 1); 3730 unlock_page(p); 3731 } 3732 3733 if (unlikely(ret)) { 3734 for (; i < num_pages; i++) { 3735 struct page *p = eb->pages[i]; 3736 clear_page_dirty_for_io(p); 3737 unlock_page(p); 3738 } 3739 } 3740 3741 return ret; 3742 } 3743 3744 int btree_write_cache_pages(struct address_space *mapping, 3745 struct writeback_control *wbc) 3746 { 3747 struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; 3748 struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; 3749 struct extent_buffer *eb, *prev_eb = NULL; 3750 struct extent_page_data epd = { 3751 .bio = NULL, 3752 .tree = tree, 3753 .extent_locked = 0, 3754 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3755 }; 3756 int ret = 0; 3757 int done = 0; 3758 int nr_to_write_done = 0; 3759 struct pagevec pvec; 3760 int nr_pages; 3761 pgoff_t index; 3762 pgoff_t end; /* Inclusive */ 3763 int scanned = 0; 3764 xa_mark_t tag; 3765 3766 pagevec_init(&pvec); 3767 if (wbc->range_cyclic) { 3768 index = mapping->writeback_index; /* Start from prev offset */ 3769 end = -1; 3770 } else { 3771 index = wbc->range_start >> PAGE_SHIFT; 3772 end = wbc->range_end >> PAGE_SHIFT; 3773 scanned = 1; 3774 } 3775 if (wbc->sync_mode == WB_SYNC_ALL) 3776 tag = PAGECACHE_TAG_TOWRITE; 3777 else 3778 tag = PAGECACHE_TAG_DIRTY; 3779 retry: 3780 if (wbc->sync_mode == WB_SYNC_ALL) 3781 tag_pages_for_writeback(mapping, index, end); 3782 while (!done && !nr_to_write_done && (index <= end) && 3783 (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, 3784 tag))) { 3785 unsigned i; 3786 3787 scanned = 1; 3788 for (i = 0; i < nr_pages; i++) { 3789 struct page *page = pvec.pages[i]; 3790 3791 if (!PagePrivate(page)) 3792 continue; 3793 3794 spin_lock(&mapping->private_lock); 3795 if (!PagePrivate(page)) { 3796 spin_unlock(&mapping->private_lock); 3797 continue; 3798 } 3799 3800 eb = (struct extent_buffer *)page->private; 3801 3802 /* 3803 * Shouldn't happen and normally this would be a BUG_ON 3804 * but no sense in crashing the users box for something 3805 * we can survive anyway. 
3806 */ 3807 if (WARN_ON(!eb)) { 3808 spin_unlock(&mapping->private_lock); 3809 continue; 3810 } 3811 3812 if (eb == prev_eb) { 3813 spin_unlock(&mapping->private_lock); 3814 continue; 3815 } 3816 3817 ret = atomic_inc_not_zero(&eb->refs); 3818 spin_unlock(&mapping->private_lock); 3819 if (!ret) 3820 continue; 3821 3822 prev_eb = eb; 3823 ret = lock_extent_buffer_for_io(eb, fs_info, &epd); 3824 if (!ret) { 3825 free_extent_buffer(eb); 3826 continue; 3827 } 3828 3829 ret = write_one_eb(eb, fs_info, wbc, &epd); 3830 if (ret) { 3831 done = 1; 3832 free_extent_buffer(eb); 3833 break; 3834 } 3835 free_extent_buffer(eb); 3836 3837 /* 3838 * the filesystem may choose to bump up nr_to_write. 3839 * We have to make sure to honor the new nr_to_write 3840 * at any time 3841 */ 3842 nr_to_write_done = wbc->nr_to_write <= 0; 3843 } 3844 pagevec_release(&pvec); 3845 cond_resched(); 3846 } 3847 if (!scanned && !done) { 3848 /* 3849 * We hit the last page and there is more work to be done: wrap 3850 * back to the start of the file 3851 */ 3852 scanned = 1; 3853 index = 0; 3854 goto retry; 3855 } 3856 flush_write_bio(&epd); 3857 return ret; 3858 } 3859 3860 /** 3861 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. 3862 * @mapping: address space structure to write 3863 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 3864 * @data: data passed to __extent_writepage function 3865 * 3866 * If a page is already under I/O, write_cache_pages() skips it, even 3867 * if it's dirty. This is desirable behaviour for memory-cleaning writeback, 3868 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() 3869 * and msync() need to guarantee that all the data which was dirty at the time 3870 * the call was made get new I/O started against them. If wbc->sync_mode is 3871 * WB_SYNC_ALL then we were called for data integrity and we must wait for 3872 * existing IO to complete. 3873 */ 3874 static int extent_write_cache_pages(struct address_space *mapping, 3875 struct writeback_control *wbc, 3876 struct extent_page_data *epd) 3877 { 3878 struct inode *inode = mapping->host; 3879 int ret = 0; 3880 int done = 0; 3881 int nr_to_write_done = 0; 3882 struct pagevec pvec; 3883 int nr_pages; 3884 pgoff_t index; 3885 pgoff_t end; /* Inclusive */ 3886 pgoff_t done_index; 3887 int range_whole = 0; 3888 int scanned = 0; 3889 xa_mark_t tag; 3890 3891 /* 3892 * We have to hold onto the inode so that ordered extents can do their 3893 * work when the IO finishes. The alternative to this is failing to add 3894 * an ordered extent if the igrab() fails there and that is a huge pain 3895 * to deal with, so instead just hold onto the inode throughout the 3896 * writepages operation. If it fails here we are freeing up the inode 3897 * anyway and we'd rather not waste our time writing out stuff that is 3898 * going to be truncated anyway. 3899 */ 3900 if (!igrab(inode)) 3901 return 0; 3902 3903 pagevec_init(&pvec); 3904 if (wbc->range_cyclic) { 3905 index = mapping->writeback_index; /* Start from prev offset */ 3906 end = -1; 3907 } else { 3908 index = wbc->range_start >> PAGE_SHIFT; 3909 end = wbc->range_end >> PAGE_SHIFT; 3910 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 3911 range_whole = 1; 3912 scanned = 1; 3913 } 3914 3915 /* 3916 * We do the tagged writepage as long as the snapshot flush bit is set 3917 * and we are the first one who do the filemap_flush() on this inode. 
3918 * 3919 * The nr_to_write == LONG_MAX is needed to make sure other flushers do 3920 * not race in and drop the bit. 3921 */ 3922 if (range_whole && wbc->nr_to_write == LONG_MAX && 3923 test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH, 3924 &BTRFS_I(inode)->runtime_flags)) 3925 wbc->tagged_writepages = 1; 3926 3927 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 3928 tag = PAGECACHE_TAG_TOWRITE; 3929 else 3930 tag = PAGECACHE_TAG_DIRTY; 3931 retry: 3932 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 3933 tag_pages_for_writeback(mapping, index, end); 3934 done_index = index; 3935 while (!done && !nr_to_write_done && (index <= end) && 3936 (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, 3937 &index, end, tag))) { 3938 unsigned i; 3939 3940 scanned = 1; 3941 for (i = 0; i < nr_pages; i++) { 3942 struct page *page = pvec.pages[i]; 3943 3944 done_index = page->index; 3945 /* 3946 * At this point we hold neither the i_pages lock nor 3947 * the page lock: the page may be truncated or 3948 * invalidated (changing page->mapping to NULL), 3949 * or even swizzled back from swapper_space to 3950 * tmpfs file mapping 3951 */ 3952 if (!trylock_page(page)) { 3953 flush_write_bio(epd); 3954 lock_page(page); 3955 } 3956 3957 if (unlikely(page->mapping != mapping)) { 3958 unlock_page(page); 3959 continue; 3960 } 3961 3962 if (wbc->sync_mode != WB_SYNC_NONE) { 3963 if (PageWriteback(page)) 3964 flush_write_bio(epd); 3965 wait_on_page_writeback(page); 3966 } 3967 3968 if (PageWriteback(page) || 3969 !clear_page_dirty_for_io(page)) { 3970 unlock_page(page); 3971 continue; 3972 } 3973 3974 ret = __extent_writepage(page, wbc, epd); 3975 3976 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { 3977 unlock_page(page); 3978 ret = 0; 3979 } 3980 if (ret < 0) { 3981 /* 3982 * done_index is set past this page, 3983 * so media errors will not choke 3984 * background writeout for the entire 3985 * file. This has consequences for 3986 * range_cyclic semantics (ie. it may 3987 * not be suitable for data integrity 3988 * writeout). 3989 */ 3990 done_index = page->index + 1; 3991 done = 1; 3992 break; 3993 } 3994 3995 /* 3996 * the filesystem may choose to bump up nr_to_write. 
3997 * We have to make sure to honor the new nr_to_write 3998 * at any time 3999 */ 4000 nr_to_write_done = wbc->nr_to_write <= 0; 4001 } 4002 pagevec_release(&pvec); 4003 cond_resched(); 4004 } 4005 if (!scanned && !done) { 4006 /* 4007 * We hit the last page and there is more work to be done: wrap 4008 * back to the start of the file 4009 */ 4010 scanned = 1; 4011 index = 0; 4012 goto retry; 4013 } 4014 4015 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) 4016 mapping->writeback_index = done_index; 4017 4018 btrfs_add_delayed_iput(inode); 4019 return ret; 4020 } 4021 4022 int extent_write_full_page(struct page *page, struct writeback_control *wbc) 4023 { 4024 int ret; 4025 struct extent_page_data epd = { 4026 .bio = NULL, 4027 .tree = &BTRFS_I(page->mapping->host)->io_tree, 4028 .extent_locked = 0, 4029 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 4030 }; 4031 4032 ret = __extent_writepage(page, wbc, &epd); 4033 4034 flush_write_bio(&epd); 4035 return ret; 4036 } 4037 4038 int extent_write_locked_range(struct inode *inode, u64 start, u64 end, 4039 int mode) 4040 { 4041 int ret = 0; 4042 struct address_space *mapping = inode->i_mapping; 4043 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 4044 struct page *page; 4045 unsigned long nr_pages = (end - start + PAGE_SIZE) >> 4046 PAGE_SHIFT; 4047 4048 struct extent_page_data epd = { 4049 .bio = NULL, 4050 .tree = tree, 4051 .extent_locked = 1, 4052 .sync_io = mode == WB_SYNC_ALL, 4053 }; 4054 struct writeback_control wbc_writepages = { 4055 .sync_mode = mode, 4056 .nr_to_write = nr_pages * 2, 4057 .range_start = start, 4058 .range_end = end + 1, 4059 }; 4060 4061 while (start <= end) { 4062 page = find_get_page(mapping, start >> PAGE_SHIFT); 4063 if (clear_page_dirty_for_io(page)) 4064 ret = __extent_writepage(page, &wbc_writepages, &epd); 4065 else { 4066 btrfs_writepage_endio_finish_ordered(page, start, 4067 start + PAGE_SIZE - 1, 1); 4068 unlock_page(page); 4069 } 4070 put_page(page); 4071 start += PAGE_SIZE; 4072 } 4073 4074 flush_write_bio(&epd); 4075 return ret; 4076 } 4077 4078 int extent_writepages(struct address_space *mapping, 4079 struct writeback_control *wbc) 4080 { 4081 int ret = 0; 4082 struct extent_page_data epd = { 4083 .bio = NULL, 4084 .tree = &BTRFS_I(mapping->host)->io_tree, 4085 .extent_locked = 0, 4086 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 4087 }; 4088 4089 ret = extent_write_cache_pages(mapping, wbc, &epd); 4090 flush_write_bio(&epd); 4091 return ret; 4092 } 4093 4094 int extent_readpages(struct address_space *mapping, struct list_head *pages, 4095 unsigned nr_pages) 4096 { 4097 struct bio *bio = NULL; 4098 unsigned long bio_flags = 0; 4099 struct page *pagepool[16]; 4100 struct extent_map *em_cached = NULL; 4101 struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; 4102 int nr = 0; 4103 u64 prev_em_start = (u64)-1; 4104 4105 while (!list_empty(pages)) { 4106 for (nr = 0; nr < ARRAY_SIZE(pagepool) && !list_empty(pages);) { 4107 struct page *page = lru_to_page(pages); 4108 4109 prefetchw(&page->flags); 4110 list_del(&page->lru); 4111 if (add_to_page_cache_lru(page, mapping, page->index, 4112 readahead_gfp_mask(mapping))) { 4113 put_page(page); 4114 continue; 4115 } 4116 4117 pagepool[nr++] = page; 4118 } 4119 4120 __extent_readpages(tree, pagepool, nr, &em_cached, &bio, 4121 &bio_flags, &prev_em_start); 4122 } 4123 4124 if (em_cached) 4125 free_extent_map(em_cached); 4126 4127 if (bio) 4128 return submit_one_bio(bio, 0, bio_flags); 4129 return 0; 4130 } 4131 4132 /* 4133 * basic 
invalidatepage code, this waits on any locked or writeback 4134 * ranges corresponding to the page, and then deletes any extent state 4135 * records from the tree 4136 */ 4137 int extent_invalidatepage(struct extent_io_tree *tree, 4138 struct page *page, unsigned long offset) 4139 { 4140 struct extent_state *cached_state = NULL; 4141 u64 start = page_offset(page); 4142 u64 end = start + PAGE_SIZE - 1; 4143 size_t blocksize = page->mapping->host->i_sb->s_blocksize; 4144 4145 start += ALIGN(offset, blocksize); 4146 if (start > end) 4147 return 0; 4148 4149 lock_extent_bits(tree, start, end, &cached_state); 4150 wait_on_page_writeback(page); 4151 clear_extent_bit(tree, start, end, 4152 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 4153 EXTENT_DO_ACCOUNTING, 4154 1, 1, &cached_state); 4155 return 0; 4156 } 4157 4158 /* 4159 * a helper for releasepage, this tests for areas of the page that 4160 * are locked or under IO and drops the related state bits if it is safe 4161 * to drop the page. 4162 */ 4163 static int try_release_extent_state(struct extent_io_tree *tree, 4164 struct page *page, gfp_t mask) 4165 { 4166 u64 start = page_offset(page); 4167 u64 end = start + PAGE_SIZE - 1; 4168 int ret = 1; 4169 4170 if (test_range_bit(tree, start, end, 4171 EXTENT_IOBITS, 0, NULL)) 4172 ret = 0; 4173 else { 4174 /* 4175 * at this point we can safely clear everything except the 4176 * locked bit and the nodatasum bit 4177 */ 4178 ret = __clear_extent_bit(tree, start, end, 4179 ~(EXTENT_LOCKED | EXTENT_NODATASUM), 4180 0, 0, NULL, mask, NULL); 4181 4182 /* if clear_extent_bit failed for enomem reasons, 4183 * we can't allow the release to continue. 4184 */ 4185 if (ret < 0) 4186 ret = 0; 4187 else 4188 ret = 1; 4189 } 4190 return ret; 4191 } 4192 4193 /* 4194 * a helper for releasepage. As long as there are no locked extents 4195 * in the range corresponding to the page, both state records and extent 4196 * map records are removed 4197 */ 4198 int try_release_extent_mapping(struct page *page, gfp_t mask) 4199 { 4200 struct extent_map *em; 4201 u64 start = page_offset(page); 4202 u64 end = start + PAGE_SIZE - 1; 4203 struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host); 4204 struct extent_io_tree *tree = &btrfs_inode->io_tree; 4205 struct extent_map_tree *map = &btrfs_inode->extent_tree; 4206 4207 if (gfpflags_allow_blocking(mask) && 4208 page->mapping->host->i_size > SZ_16M) { 4209 u64 len; 4210 while (start <= end) { 4211 len = end - start + 1; 4212 write_lock(&map->lock); 4213 em = lookup_extent_mapping(map, start, len); 4214 if (!em) { 4215 write_unlock(&map->lock); 4216 break; 4217 } 4218 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || 4219 em->start != start) { 4220 write_unlock(&map->lock); 4221 free_extent_map(em); 4222 break; 4223 } 4224 if (!test_range_bit(tree, em->start, 4225 extent_map_end(em) - 1, 4226 EXTENT_LOCKED | EXTENT_WRITEBACK, 4227 0, NULL)) { 4228 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4229 &btrfs_inode->runtime_flags); 4230 remove_extent_mapping(map, em); 4231 /* once for the rb tree */ 4232 free_extent_map(em); 4233 } 4234 start = extent_map_end(em); 4235 write_unlock(&map->lock); 4236 4237 /* once for us */ 4238 free_extent_map(em); 4239 } 4240 } 4241 return try_release_extent_state(tree, page, mask); 4242 } 4243 4244 /* 4245 * helper function for fiemap, which doesn't want to see any holes. 
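 * (A hole here is an extent map whose block_start is EXTENT_MAP_HOLE; those are skipped by the loop below.)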
4246 * This maps until we find something past 'last' 4247 */ 4248 static struct extent_map *get_extent_skip_holes(struct inode *inode, 4249 u64 offset, u64 last) 4250 { 4251 u64 sectorsize = btrfs_inode_sectorsize(inode); 4252 struct extent_map *em; 4253 u64 len; 4254 4255 if (offset >= last) 4256 return NULL; 4257 4258 while (1) { 4259 len = last - offset; 4260 if (len == 0) 4261 break; 4262 len = ALIGN(len, sectorsize); 4263 em = btrfs_get_extent_fiemap(BTRFS_I(inode), offset, len); 4264 if (IS_ERR_OR_NULL(em)) 4265 return em; 4266 4267 /* if this isn't a hole return it */ 4268 if (em->block_start != EXTENT_MAP_HOLE) 4269 return em; 4270 4271 /* this is a hole, advance to the next extent */ 4272 offset = extent_map_end(em); 4273 free_extent_map(em); 4274 if (offset >= last) 4275 break; 4276 } 4277 return NULL; 4278 } 4279 4280 /* 4281 * To cache previous fiemap extent 4282 * 4283 * Will be used for merging fiemap extent 4284 */ 4285 struct fiemap_cache { 4286 u64 offset; 4287 u64 phys; 4288 u64 len; 4289 u32 flags; 4290 bool cached; 4291 }; 4292 4293 /* 4294 * Helper to submit fiemap extent. 4295 * 4296 * Will try to merge current fiemap extent specified by @offset, @phys, 4297 * @len and @flags with cached one. 4298 * And only when we fails to merge, cached one will be submitted as 4299 * fiemap extent. 4300 * 4301 * Return value is the same as fiemap_fill_next_extent(). 4302 */ 4303 static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, 4304 struct fiemap_cache *cache, 4305 u64 offset, u64 phys, u64 len, u32 flags) 4306 { 4307 int ret = 0; 4308 4309 if (!cache->cached) 4310 goto assign; 4311 4312 /* 4313 * Sanity check, extent_fiemap() should have ensured that new 4314 * fiemap extent won't overlap with cached one. 4315 * Not recoverable. 4316 * 4317 * NOTE: Physical address can overlap, due to compression 4318 */ 4319 if (cache->offset + cache->len > offset) { 4320 WARN_ON(1); 4321 return -EINVAL; 4322 } 4323 4324 /* 4325 * Only merges fiemap extents if 4326 * 1) Their logical addresses are continuous 4327 * 4328 * 2) Their physical addresses are continuous 4329 * So truly compressed (physical size smaller than logical size) 4330 * extents won't get merged with each other 4331 * 4332 * 3) Share same flags except FIEMAP_EXTENT_LAST 4333 * So regular extent won't get merged with prealloc extent 4334 */ 4335 if (cache->offset + cache->len == offset && 4336 cache->phys + cache->len == phys && 4337 (cache->flags & ~FIEMAP_EXTENT_LAST) == 4338 (flags & ~FIEMAP_EXTENT_LAST)) { 4339 cache->len += len; 4340 cache->flags |= flags; 4341 goto try_submit_last; 4342 } 4343 4344 /* Not mergeable, need to submit cached one */ 4345 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, 4346 cache->len, cache->flags); 4347 cache->cached = false; 4348 if (ret) 4349 return ret; 4350 assign: 4351 cache->cached = true; 4352 cache->offset = offset; 4353 cache->phys = phys; 4354 cache->len = len; 4355 cache->flags = flags; 4356 try_submit_last: 4357 if (cache->flags & FIEMAP_EXTENT_LAST) { 4358 ret = fiemap_fill_next_extent(fieinfo, cache->offset, 4359 cache->phys, cache->len, cache->flags); 4360 cache->cached = false; 4361 } 4362 return ret; 4363 } 4364 4365 /* 4366 * Emit last fiemap cache 4367 * 4368 * The last fiemap cache may still be cached in the following case: 4369 * 0 4k 8k 4370 * |<- Fiemap range ->| 4371 * |<------------ First extent ----------->| 4372 * 4373 * In this case, the first extent range will be cached but not emitted. 
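 * Nothing afterwards forces it out: the cache is only flushed when a non-mergeable extent arrives or FIEMAP_EXTENT_LAST is seen, and neither happens before the requested range ends.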
4374 * So we must emit it before ending extent_fiemap(). 4375 */ 4376 static int emit_last_fiemap_cache(struct btrfs_fs_info *fs_info, 4377 struct fiemap_extent_info *fieinfo, 4378 struct fiemap_cache *cache) 4379 { 4380 int ret; 4381 4382 if (!cache->cached) 4383 return 0; 4384 4385 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, 4386 cache->len, cache->flags); 4387 cache->cached = false; 4388 if (ret > 0) 4389 ret = 0; 4390 return ret; 4391 } 4392 4393 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 4394 __u64 start, __u64 len) 4395 { 4396 int ret = 0; 4397 u64 off = start; 4398 u64 max = start + len; 4399 u32 flags = 0; 4400 u32 found_type; 4401 u64 last; 4402 u64 last_for_get_extent = 0; 4403 u64 disko = 0; 4404 u64 isize = i_size_read(inode); 4405 struct btrfs_key found_key; 4406 struct extent_map *em = NULL; 4407 struct extent_state *cached_state = NULL; 4408 struct btrfs_path *path; 4409 struct btrfs_root *root = BTRFS_I(inode)->root; 4410 struct fiemap_cache cache = { 0 }; 4411 int end = 0; 4412 u64 em_start = 0; 4413 u64 em_len = 0; 4414 u64 em_end = 0; 4415 4416 if (len == 0) 4417 return -EINVAL; 4418 4419 path = btrfs_alloc_path(); 4420 if (!path) 4421 return -ENOMEM; 4422 path->leave_spinning = 1; 4423 4424 start = round_down(start, btrfs_inode_sectorsize(inode)); 4425 len = round_up(max, btrfs_inode_sectorsize(inode)) - start; 4426 4427 /* 4428 * lookup the last file extent. We're not using i_size here 4429 * because there might be preallocation past i_size 4430 */ 4431 ret = btrfs_lookup_file_extent(NULL, root, path, 4432 btrfs_ino(BTRFS_I(inode)), -1, 0); 4433 if (ret < 0) { 4434 btrfs_free_path(path); 4435 return ret; 4436 } else { 4437 WARN_ON(!ret); 4438 if (ret == 1) 4439 ret = 0; 4440 } 4441 4442 path->slots[0]--; 4443 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 4444 found_type = found_key.type; 4445 4446 /* No extents, but there might be delalloc bits */ 4447 if (found_key.objectid != btrfs_ino(BTRFS_I(inode)) || 4448 found_type != BTRFS_EXTENT_DATA_KEY) { 4449 /* have to trust i_size as the end */ 4450 last = (u64)-1; 4451 last_for_get_extent = isize; 4452 } else { 4453 /* 4454 * remember the start of the last extent. There are a 4455 * bunch of different factors that go into the length of the 4456 * extent, so its much less complex to remember where it started 4457 */ 4458 last = found_key.offset; 4459 last_for_get_extent = last + 1; 4460 } 4461 btrfs_release_path(path); 4462 4463 /* 4464 * we might have some extents allocated but more delalloc past those 4465 * extents. so, we trust isize unless the start of the last extent is 4466 * beyond isize 4467 */ 4468 if (last < isize) { 4469 last = (u64)-1; 4470 last_for_get_extent = isize; 4471 } 4472 4473 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 4474 &cached_state); 4475 4476 em = get_extent_skip_holes(inode, start, last_for_get_extent); 4477 if (!em) 4478 goto out; 4479 if (IS_ERR(em)) { 4480 ret = PTR_ERR(em); 4481 goto out; 4482 } 4483 4484 while (!end) { 4485 u64 offset_in_extent = 0; 4486 4487 /* break if the extent we found is outside the range */ 4488 if (em->start >= max || extent_map_end(em) < off) 4489 break; 4490 4491 /* 4492 * get_extent may return an extent that starts before our 4493 * requested range. 
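 * For example, a fiemap that starts 4K into a 16K extent beginning at file offset 0 gets an em with em->start == 0 while off == 4K, so em_start below is clamped to 4K and offset_in_extent becomes 4K.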
We have to make sure the ranges 4494 * we return to fiemap always move forward and don't 4495 * overlap, so adjust the offsets here 4496 */ 4497 em_start = max(em->start, off); 4498 4499 /* 4500 * record the offset from the start of the extent 4501 * for adjusting the disk offset below. Only do this if the 4502 * extent isn't compressed since our in ram offset may be past 4503 * what we have actually allocated on disk. 4504 */ 4505 if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 4506 offset_in_extent = em_start - em->start; 4507 em_end = extent_map_end(em); 4508 em_len = em_end - em_start; 4509 flags = 0; 4510 if (em->block_start < EXTENT_MAP_LAST_BYTE) 4511 disko = em->block_start + offset_in_extent; 4512 else 4513 disko = 0; 4514 4515 /* 4516 * bump off for our next call to get_extent 4517 */ 4518 off = extent_map_end(em); 4519 if (off >= max) 4520 end = 1; 4521 4522 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 4523 end = 1; 4524 flags |= FIEMAP_EXTENT_LAST; 4525 } else if (em->block_start == EXTENT_MAP_INLINE) { 4526 flags |= (FIEMAP_EXTENT_DATA_INLINE | 4527 FIEMAP_EXTENT_NOT_ALIGNED); 4528 } else if (em->block_start == EXTENT_MAP_DELALLOC) { 4529 flags |= (FIEMAP_EXTENT_DELALLOC | 4530 FIEMAP_EXTENT_UNKNOWN); 4531 } else if (fieinfo->fi_extents_max) { 4532 u64 bytenr = em->block_start - 4533 (em->start - em->orig_start); 4534 4535 /* 4536 * As btrfs supports shared space, this information 4537 * can be exported to userspace tools via 4538 * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0 4539 * then we're just getting a count and we can skip the 4540 * lookup stuff. 4541 */ 4542 ret = btrfs_check_shared(root, 4543 btrfs_ino(BTRFS_I(inode)), 4544 bytenr); 4545 if (ret < 0) 4546 goto out_free; 4547 if (ret) 4548 flags |= FIEMAP_EXTENT_SHARED; 4549 ret = 0; 4550 } 4551 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 4552 flags |= FIEMAP_EXTENT_ENCODED; 4553 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4554 flags |= FIEMAP_EXTENT_UNWRITTEN; 4555 4556 free_extent_map(em); 4557 em = NULL; 4558 if ((em_start >= last) || em_len == (u64)-1 || 4559 (last == (u64)-1 && isize <= em_end)) { 4560 flags |= FIEMAP_EXTENT_LAST; 4561 end = 1; 4562 } 4563 4564 /* now scan forward to see if this is really the last extent. */ 4565 em = get_extent_skip_holes(inode, off, last_for_get_extent); 4566 if (IS_ERR(em)) { 4567 ret = PTR_ERR(em); 4568 goto out; 4569 } 4570 if (!em) { 4571 flags |= FIEMAP_EXTENT_LAST; 4572 end = 1; 4573 } 4574 ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko, 4575 em_len, flags); 4576 if (ret) { 4577 if (ret == 1) 4578 ret = 0; 4579 goto out_free; 4580 } 4581 } 4582 out_free: 4583 if (!ret) 4584 ret = emit_last_fiemap_cache(root->fs_info, fieinfo, &cache); 4585 free_extent_map(em); 4586 out: 4587 btrfs_free_path(path); 4588 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, 4589 &cached_state); 4590 return ret; 4591 } 4592 4593 static void __free_extent_buffer(struct extent_buffer *eb) 4594 { 4595 btrfs_leak_debug_del(&eb->leak_list); 4596 kmem_cache_free(extent_buffer_cache, eb); 4597 } 4598 4599 int extent_buffer_under_io(struct extent_buffer *eb) 4600 { 4601 return (atomic_read(&eb->io_pages) || 4602 test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || 4603 test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 4604 } 4605 4606 /* 4607 * Release all pages attached to the extent buffer. 
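 * For a mapped buffer the page->private link to this eb is cleared under mapping->private_lock before the page references are dropped.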
4608 */ 4609 static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) 4610 { 4611 int i; 4612 int num_pages; 4613 int mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); 4614 4615 BUG_ON(extent_buffer_under_io(eb)); 4616 4617 num_pages = num_extent_pages(eb); 4618 for (i = 0; i < num_pages; i++) { 4619 struct page *page = eb->pages[i]; 4620 4621 if (!page) 4622 continue; 4623 if (mapped) 4624 spin_lock(&page->mapping->private_lock); 4625 /* 4626 * We do this since we'll remove the pages after we've 4627 * removed the eb from the radix tree, so we could race 4628 * and have this page now attached to the new eb. So 4629 * only clear page_private if it's still connected to 4630 * this eb. 4631 */ 4632 if (PagePrivate(page) && 4633 page->private == (unsigned long)eb) { 4634 BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 4635 BUG_ON(PageDirty(page)); 4636 BUG_ON(PageWriteback(page)); 4637 /* 4638 * We need to make sure we haven't be attached 4639 * to a new eb. 4640 */ 4641 ClearPagePrivate(page); 4642 set_page_private(page, 0); 4643 /* One for the page private */ 4644 put_page(page); 4645 } 4646 4647 if (mapped) 4648 spin_unlock(&page->mapping->private_lock); 4649 4650 /* One for when we allocated the page */ 4651 put_page(page); 4652 } 4653 } 4654 4655 /* 4656 * Helper for releasing the extent buffer. 4657 */ 4658 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) 4659 { 4660 btrfs_release_extent_buffer_pages(eb); 4661 __free_extent_buffer(eb); 4662 } 4663 4664 static struct extent_buffer * 4665 __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, 4666 unsigned long len) 4667 { 4668 struct extent_buffer *eb = NULL; 4669 4670 eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL); 4671 eb->start = start; 4672 eb->len = len; 4673 eb->fs_info = fs_info; 4674 eb->bflags = 0; 4675 rwlock_init(&eb->lock); 4676 atomic_set(&eb->write_locks, 0); 4677 atomic_set(&eb->read_locks, 0); 4678 atomic_set(&eb->blocking_readers, 0); 4679 atomic_set(&eb->blocking_writers, 0); 4680 atomic_set(&eb->spinning_readers, 0); 4681 atomic_set(&eb->spinning_writers, 0); 4682 eb->lock_nested = 0; 4683 init_waitqueue_head(&eb->write_lock_wq); 4684 init_waitqueue_head(&eb->read_lock_wq); 4685 4686 btrfs_leak_debug_add(&eb->leak_list, &buffers); 4687 4688 spin_lock_init(&eb->refs_lock); 4689 atomic_set(&eb->refs, 1); 4690 atomic_set(&eb->io_pages, 0); 4691 4692 /* 4693 * Sanity checks, currently the maximum is 64k covered by 16x 4k pages 4694 */ 4695 BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE 4696 > MAX_INLINE_EXTENT_BUFFER_SIZE); 4697 BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE); 4698 4699 return eb; 4700 } 4701 4702 struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) 4703 { 4704 int i; 4705 struct page *p; 4706 struct extent_buffer *new; 4707 int num_pages = num_extent_pages(src); 4708 4709 new = __alloc_extent_buffer(src->fs_info, src->start, src->len); 4710 if (new == NULL) 4711 return NULL; 4712 4713 for (i = 0; i < num_pages; i++) { 4714 p = alloc_page(GFP_NOFS); 4715 if (!p) { 4716 btrfs_release_extent_buffer(new); 4717 return NULL; 4718 } 4719 attach_extent_buffer_page(new, p); 4720 WARN_ON(PageDirty(p)); 4721 SetPageUptodate(p); 4722 new->pages[i] = p; 4723 copy_page(page_address(p), page_address(src->pages[i])); 4724 } 4725 4726 set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags); 4727 set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); 4728 4729 return new; 4730 } 4731 4732 struct extent_buffer *__alloc_dummy_extent_buffer(struct 
btrfs_fs_info *fs_info, 4733 u64 start, unsigned long len) 4734 { 4735 struct extent_buffer *eb; 4736 int num_pages; 4737 int i; 4738 4739 eb = __alloc_extent_buffer(fs_info, start, len); 4740 if (!eb) 4741 return NULL; 4742 4743 num_pages = num_extent_pages(eb); 4744 for (i = 0; i < num_pages; i++) { 4745 eb->pages[i] = alloc_page(GFP_NOFS); 4746 if (!eb->pages[i]) 4747 goto err; 4748 } 4749 set_extent_buffer_uptodate(eb); 4750 btrfs_set_header_nritems(eb, 0); 4751 set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); 4752 4753 return eb; 4754 err: 4755 for (; i > 0; i--) 4756 __free_page(eb->pages[i - 1]); 4757 __free_extent_buffer(eb); 4758 return NULL; 4759 } 4760 4761 struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, 4762 u64 start) 4763 { 4764 return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize); 4765 } 4766 4767 static void check_buffer_tree_ref(struct extent_buffer *eb) 4768 { 4769 int refs; 4770 /* the ref bit is tricky. We have to make sure it is set 4771 * if we have the buffer dirty. Otherwise the 4772 * code to free a buffer can end up dropping a dirty 4773 * page 4774 * 4775 * Once the ref bit is set, it won't go away while the 4776 * buffer is dirty or in writeback, and it also won't 4777 * go away while we have the reference count on the 4778 * eb bumped. 4779 * 4780 * We can't just set the ref bit without bumping the 4781 * ref on the eb because free_extent_buffer might 4782 * see the ref bit and try to clear it. If this happens 4783 * free_extent_buffer might end up dropping our original 4784 * ref by mistake and freeing the page before we are able 4785 * to add one more ref. 4786 * 4787 * So bump the ref count first, then set the bit. If someone 4788 * beat us to it, drop the ref we added. 4789 */ 4790 refs = atomic_read(&eb->refs); 4791 if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 4792 return; 4793 4794 spin_lock(&eb->refs_lock); 4795 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 4796 atomic_inc(&eb->refs); 4797 spin_unlock(&eb->refs_lock); 4798 } 4799 4800 static void mark_extent_buffer_accessed(struct extent_buffer *eb, 4801 struct page *accessed) 4802 { 4803 int num_pages, i; 4804 4805 check_buffer_tree_ref(eb); 4806 4807 num_pages = num_extent_pages(eb); 4808 for (i = 0; i < num_pages; i++) { 4809 struct page *p = eb->pages[i]; 4810 4811 if (p != accessed) 4812 mark_page_accessed(p); 4813 } 4814 } 4815 4816 struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, 4817 u64 start) 4818 { 4819 struct extent_buffer *eb; 4820 4821 rcu_read_lock(); 4822 eb = radix_tree_lookup(&fs_info->buffer_radix, 4823 start >> PAGE_SHIFT); 4824 if (eb && atomic_inc_not_zero(&eb->refs)) { 4825 rcu_read_unlock(); 4826 /* 4827 * Lock our eb's refs_lock to avoid races with 4828 * free_extent_buffer. When we get our eb it might be flagged 4829 * with EXTENT_BUFFER_STALE and another task running 4830 * free_extent_buffer might have seen that flag set, 4831 * eb->refs == 2, that the buffer isn't under IO (dirty and 4832 * writeback flags not set) and it's still in the tree (flag 4833 * EXTENT_BUFFER_TREE_REF set), therefore being in the process 4834 * of decrementing the extent buffer's reference count twice. 4835 * So here we could race and increment the eb's reference count, 4836 * clear its stale flag, mark it as dirty and drop our reference 4837 * before the other task finishes executing free_extent_buffer, 4838 * which would later result in an attempt to free an extent 4839 * buffer that is dirty. 
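 * Briefly taking and releasing refs_lock below (when the stale flag is set) serializes us with such a task, since free_extent_buffer() does its checks and reference drops under that same lock.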
4840 */ 4841 if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) { 4842 spin_lock(&eb->refs_lock); 4843 spin_unlock(&eb->refs_lock); 4844 } 4845 mark_extent_buffer_accessed(eb, NULL); 4846 return eb; 4847 } 4848 rcu_read_unlock(); 4849 4850 return NULL; 4851 } 4852 4853 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 4854 struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, 4855 u64 start) 4856 { 4857 struct extent_buffer *eb, *exists = NULL; 4858 int ret; 4859 4860 eb = find_extent_buffer(fs_info, start); 4861 if (eb) 4862 return eb; 4863 eb = alloc_dummy_extent_buffer(fs_info, start); 4864 if (!eb) 4865 return NULL; 4866 eb->fs_info = fs_info; 4867 again: 4868 ret = radix_tree_preload(GFP_NOFS); 4869 if (ret) 4870 goto free_eb; 4871 spin_lock(&fs_info->buffer_lock); 4872 ret = radix_tree_insert(&fs_info->buffer_radix, 4873 start >> PAGE_SHIFT, eb); 4874 spin_unlock(&fs_info->buffer_lock); 4875 radix_tree_preload_end(); 4876 if (ret == -EEXIST) { 4877 exists = find_extent_buffer(fs_info, start); 4878 if (exists) 4879 goto free_eb; 4880 else 4881 goto again; 4882 } 4883 check_buffer_tree_ref(eb); 4884 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); 4885 4886 return eb; 4887 free_eb: 4888 btrfs_release_extent_buffer(eb); 4889 return exists; 4890 } 4891 #endif 4892 4893 struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, 4894 u64 start) 4895 { 4896 unsigned long len = fs_info->nodesize; 4897 int num_pages; 4898 int i; 4899 unsigned long index = start >> PAGE_SHIFT; 4900 struct extent_buffer *eb; 4901 struct extent_buffer *exists = NULL; 4902 struct page *p; 4903 struct address_space *mapping = fs_info->btree_inode->i_mapping; 4904 int uptodate = 1; 4905 int ret; 4906 4907 if (!IS_ALIGNED(start, fs_info->sectorsize)) { 4908 btrfs_err(fs_info, "bad tree block start %llu", start); 4909 return ERR_PTR(-EINVAL); 4910 } 4911 4912 eb = find_extent_buffer(fs_info, start); 4913 if (eb) 4914 return eb; 4915 4916 eb = __alloc_extent_buffer(fs_info, start, len); 4917 if (!eb) 4918 return ERR_PTR(-ENOMEM); 4919 4920 num_pages = num_extent_pages(eb); 4921 for (i = 0; i < num_pages; i++, index++) { 4922 p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL); 4923 if (!p) { 4924 exists = ERR_PTR(-ENOMEM); 4925 goto free_eb; 4926 } 4927 4928 spin_lock(&mapping->private_lock); 4929 if (PagePrivate(p)) { 4930 /* 4931 * We could have already allocated an eb for this page 4932 * and attached one so lets see if we can get a ref on 4933 * the existing eb, and if we can we know it's good and 4934 * we can just return that one, else we know we can just 4935 * overwrite page->private. 4936 */ 4937 exists = (struct extent_buffer *)p->private; 4938 if (atomic_inc_not_zero(&exists->refs)) { 4939 spin_unlock(&mapping->private_lock); 4940 unlock_page(p); 4941 put_page(p); 4942 mark_extent_buffer_accessed(exists, p); 4943 goto free_eb; 4944 } 4945 exists = NULL; 4946 4947 /* 4948 * Do this so attach doesn't complain and we need to 4949 * drop the ref the old guy had. 
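 * (The 'old guy' is the eb that was previously attached to this page; ClearPagePrivate() plus the put_page() below drop the page reference that was taken on its behalf when it attached.)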
4950 */ 4951 ClearPagePrivate(p); 4952 WARN_ON(PageDirty(p)); 4953 put_page(p); 4954 } 4955 attach_extent_buffer_page(eb, p); 4956 spin_unlock(&mapping->private_lock); 4957 WARN_ON(PageDirty(p)); 4958 eb->pages[i] = p; 4959 if (!PageUptodate(p)) 4960 uptodate = 0; 4961 4962 /* 4963 * We can't unlock the pages just yet since the extent buffer 4964 * hasn't been properly inserted in the radix tree, this 4965 * opens a race with btree_releasepage which can free a page 4966 * while we are still filling in all pages for the buffer and 4967 * we could crash. 4968 */ 4969 } 4970 if (uptodate) 4971 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 4972 again: 4973 ret = radix_tree_preload(GFP_NOFS); 4974 if (ret) { 4975 exists = ERR_PTR(ret); 4976 goto free_eb; 4977 } 4978 4979 spin_lock(&fs_info->buffer_lock); 4980 ret = radix_tree_insert(&fs_info->buffer_radix, 4981 start >> PAGE_SHIFT, eb); 4982 spin_unlock(&fs_info->buffer_lock); 4983 radix_tree_preload_end(); 4984 if (ret == -EEXIST) { 4985 exists = find_extent_buffer(fs_info, start); 4986 if (exists) 4987 goto free_eb; 4988 else 4989 goto again; 4990 } 4991 /* add one reference for the tree */ 4992 check_buffer_tree_ref(eb); 4993 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); 4994 4995 /* 4996 * Now it's safe to unlock the pages because any calls to 4997 * btree_releasepage will correctly detect that a page belongs to a 4998 * live buffer and won't free them prematurely. 4999 */ 5000 for (i = 0; i < num_pages; i++) 5001 unlock_page(eb->pages[i]); 5002 return eb; 5003 5004 free_eb: 5005 WARN_ON(!atomic_dec_and_test(&eb->refs)); 5006 for (i = 0; i < num_pages; i++) { 5007 if (eb->pages[i]) 5008 unlock_page(eb->pages[i]); 5009 } 5010 5011 btrfs_release_extent_buffer(eb); 5012 return exists; 5013 } 5014 5015 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) 5016 { 5017 struct extent_buffer *eb = 5018 container_of(head, struct extent_buffer, rcu_head); 5019 5020 __free_extent_buffer(eb); 5021 } 5022 5023 static int release_extent_buffer(struct extent_buffer *eb) 5024 { 5025 lockdep_assert_held(&eb->refs_lock); 5026 5027 WARN_ON(atomic_read(&eb->refs) == 0); 5028 if (atomic_dec_and_test(&eb->refs)) { 5029 if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) { 5030 struct btrfs_fs_info *fs_info = eb->fs_info; 5031 5032 spin_unlock(&eb->refs_lock); 5033 5034 spin_lock(&fs_info->buffer_lock); 5035 radix_tree_delete(&fs_info->buffer_radix, 5036 eb->start >> PAGE_SHIFT); 5037 spin_unlock(&fs_info->buffer_lock); 5038 } else { 5039 spin_unlock(&eb->refs_lock); 5040 } 5041 5042 /* Should be safe to release our pages at this point */ 5043 btrfs_release_extent_buffer_pages(eb); 5044 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 5045 if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) { 5046 __free_extent_buffer(eb); 5047 return 1; 5048 } 5049 #endif 5050 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); 5051 return 1; 5052 } 5053 spin_unlock(&eb->refs_lock); 5054 5055 return 0; 5056 } 5057 5058 void free_extent_buffer(struct extent_buffer *eb) 5059 { 5060 int refs; 5061 int old; 5062 if (!eb) 5063 return; 5064 5065 while (1) { 5066 refs = atomic_read(&eb->refs); 5067 if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3) 5068 || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && 5069 refs == 1)) 5070 break; 5071 old = atomic_cmpxchg(&eb->refs, refs, refs - 1); 5072 if (old == refs) 5073 return; 5074 } 5075 5076 spin_lock(&eb->refs_lock); 5077 if (atomic_read(&eb->refs) == 2 && 5078 test_bit(EXTENT_BUFFER_STALE, 
&eb->bflags) && 5079 !extent_buffer_under_io(eb) && 5080 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 5081 atomic_dec(&eb->refs); 5082 5083 /* 5084 * I know this is terrible, but it's temporary until we stop tracking 5085 * the uptodate bits and such for the extent buffers. 5086 */ 5087 release_extent_buffer(eb); 5088 } 5089 5090 void free_extent_buffer_stale(struct extent_buffer *eb) 5091 { 5092 if (!eb) 5093 return; 5094 5095 spin_lock(&eb->refs_lock); 5096 set_bit(EXTENT_BUFFER_STALE, &eb->bflags); 5097 5098 if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) && 5099 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 5100 atomic_dec(&eb->refs); 5101 release_extent_buffer(eb); 5102 } 5103 5104 void clear_extent_buffer_dirty(struct extent_buffer *eb) 5105 { 5106 int i; 5107 int num_pages; 5108 struct page *page; 5109 5110 num_pages = num_extent_pages(eb); 5111 5112 for (i = 0; i < num_pages; i++) { 5113 page = eb->pages[i]; 5114 if (!PageDirty(page)) 5115 continue; 5116 5117 lock_page(page); 5118 WARN_ON(!PagePrivate(page)); 5119 5120 clear_page_dirty_for_io(page); 5121 xa_lock_irq(&page->mapping->i_pages); 5122 if (!PageDirty(page)) 5123 __xa_clear_mark(&page->mapping->i_pages, 5124 page_index(page), PAGECACHE_TAG_DIRTY); 5125 xa_unlock_irq(&page->mapping->i_pages); 5126 ClearPageError(page); 5127 unlock_page(page); 5128 } 5129 WARN_ON(atomic_read(&eb->refs) == 0); 5130 } 5131 5132 bool set_extent_buffer_dirty(struct extent_buffer *eb) 5133 { 5134 int i; 5135 int num_pages; 5136 bool was_dirty; 5137 5138 check_buffer_tree_ref(eb); 5139 5140 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); 5141 5142 num_pages = num_extent_pages(eb); 5143 WARN_ON(atomic_read(&eb->refs) == 0); 5144 WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); 5145 5146 if (!was_dirty) 5147 for (i = 0; i < num_pages; i++) 5148 set_page_dirty(eb->pages[i]); 5149 5150 #ifdef CONFIG_BTRFS_DEBUG 5151 for (i = 0; i < num_pages; i++) 5152 ASSERT(PageDirty(eb->pages[i])); 5153 #endif 5154 5155 return was_dirty; 5156 } 5157 5158 void clear_extent_buffer_uptodate(struct extent_buffer *eb) 5159 { 5160 int i; 5161 struct page *page; 5162 int num_pages; 5163 5164 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 5165 num_pages = num_extent_pages(eb); 5166 for (i = 0; i < num_pages; i++) { 5167 page = eb->pages[i]; 5168 if (page) 5169 ClearPageUptodate(page); 5170 } 5171 } 5172 5173 void set_extent_buffer_uptodate(struct extent_buffer *eb) 5174 { 5175 int i; 5176 struct page *page; 5177 int num_pages; 5178 5179 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 5180 num_pages = num_extent_pages(eb); 5181 for (i = 0; i < num_pages; i++) { 5182 page = eb->pages[i]; 5183 SetPageUptodate(page); 5184 } 5185 } 5186 5187 int read_extent_buffer_pages(struct extent_io_tree *tree, 5188 struct extent_buffer *eb, int wait, int mirror_num) 5189 { 5190 int i; 5191 struct page *page; 5192 int err; 5193 int ret = 0; 5194 int locked_pages = 0; 5195 int all_uptodate = 1; 5196 int num_pages; 5197 unsigned long num_reads = 0; 5198 struct bio *bio = NULL; 5199 unsigned long bio_flags = 0; 5200 5201 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 5202 return 0; 5203 5204 num_pages = num_extent_pages(eb); 5205 for (i = 0; i < num_pages; i++) { 5206 page = eb->pages[i]; 5207 if (wait == WAIT_NONE) { 5208 if (!trylock_page(page)) 5209 goto unlock_exit; 5210 } else { 5211 lock_page(page); 5212 } 5213 locked_pages++; 5214 } 5215 /* 5216 * We need to firstly lock all pages to make sure that 5217 * the uptodate bit 
of our pages won't be affected by 5218 * clear_extent_buffer_uptodate(). 5219 */ 5220 for (i = 0; i < num_pages; i++) { 5221 page = eb->pages[i]; 5222 if (!PageUptodate(page)) { 5223 num_reads++; 5224 all_uptodate = 0; 5225 } 5226 } 5227 5228 if (all_uptodate) { 5229 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 5230 goto unlock_exit; 5231 } 5232 5233 clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); 5234 eb->read_mirror = 0; 5235 atomic_set(&eb->io_pages, num_reads); 5236 for (i = 0; i < num_pages; i++) { 5237 page = eb->pages[i]; 5238 5239 if (!PageUptodate(page)) { 5240 if (ret) { 5241 atomic_dec(&eb->io_pages); 5242 unlock_page(page); 5243 continue; 5244 } 5245 5246 ClearPageError(page); 5247 err = __extent_read_full_page(tree, page, 5248 btree_get_extent, &bio, 5249 mirror_num, &bio_flags, 5250 REQ_META); 5251 if (err) { 5252 ret = err; 5253 /* 5254 * We use &bio in above __extent_read_full_page, 5255 * so we ensure that if it returns error, the 5256 * current page fails to add itself to bio and 5257 * it's been unlocked. 5258 * 5259 * We must dec io_pages by ourselves. 5260 */ 5261 atomic_dec(&eb->io_pages); 5262 } 5263 } else { 5264 unlock_page(page); 5265 } 5266 } 5267 5268 if (bio) { 5269 err = submit_one_bio(bio, mirror_num, bio_flags); 5270 if (err) 5271 return err; 5272 } 5273 5274 if (ret || wait != WAIT_COMPLETE) 5275 return ret; 5276 5277 for (i = 0; i < num_pages; i++) { 5278 page = eb->pages[i]; 5279 wait_on_page_locked(page); 5280 if (!PageUptodate(page)) 5281 ret = -EIO; 5282 } 5283 5284 return ret; 5285 5286 unlock_exit: 5287 while (locked_pages > 0) { 5288 locked_pages--; 5289 page = eb->pages[locked_pages]; 5290 unlock_page(page); 5291 } 5292 return ret; 5293 } 5294 5295 void read_extent_buffer(const struct extent_buffer *eb, void *dstv, 5296 unsigned long start, unsigned long len) 5297 { 5298 size_t cur; 5299 size_t offset; 5300 struct page *page; 5301 char *kaddr; 5302 char *dst = (char *)dstv; 5303 size_t start_offset = offset_in_page(eb->start); 5304 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5305 5306 if (start + len > eb->len) { 5307 WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n", 5308 eb->start, eb->len, start, len); 5309 memset(dst, 0, len); 5310 return; 5311 } 5312 5313 offset = offset_in_page(start_offset + start); 5314 5315 while (len > 0) { 5316 page = eb->pages[i]; 5317 5318 cur = min(len, (PAGE_SIZE - offset)); 5319 kaddr = page_address(page); 5320 memcpy(dst, kaddr + offset, cur); 5321 5322 dst += cur; 5323 len -= cur; 5324 offset = 0; 5325 i++; 5326 } 5327 } 5328 5329 int read_extent_buffer_to_user(const struct extent_buffer *eb, 5330 void __user *dstv, 5331 unsigned long start, unsigned long len) 5332 { 5333 size_t cur; 5334 size_t offset; 5335 struct page *page; 5336 char *kaddr; 5337 char __user *dst = (char __user *)dstv; 5338 size_t start_offset = offset_in_page(eb->start); 5339 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5340 int ret = 0; 5341 5342 WARN_ON(start > eb->len); 5343 WARN_ON(start + len > eb->start + eb->len); 5344 5345 offset = offset_in_page(start_offset + start); 5346 5347 while (len > 0) { 5348 page = eb->pages[i]; 5349 5350 cur = min(len, (PAGE_SIZE - offset)); 5351 kaddr = page_address(page); 5352 if (copy_to_user(dst, kaddr + offset, cur)) { 5353 ret = -EFAULT; 5354 break; 5355 } 5356 5357 dst += cur; 5358 len -= cur; 5359 offset = 0; 5360 i++; 5361 } 5362 5363 return ret; 5364 } 5365 5366 /* 5367 * return 0 if the item is found within a page. 
5368 * return 1 if the item spans two pages. 5369 * return -EINVAL otherwise. 5370 */ 5371 int map_private_extent_buffer(const struct extent_buffer *eb, 5372 unsigned long start, unsigned long min_len, 5373 char **map, unsigned long *map_start, 5374 unsigned long *map_len) 5375 { 5376 size_t offset; 5377 char *kaddr; 5378 struct page *p; 5379 size_t start_offset = offset_in_page(eb->start); 5380 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5381 unsigned long end_i = (start_offset + start + min_len - 1) >> 5382 PAGE_SHIFT; 5383 5384 if (start + min_len > eb->len) { 5385 WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n", 5386 eb->start, eb->len, start, min_len); 5387 return -EINVAL; 5388 } 5389 5390 if (i != end_i) 5391 return 1; 5392 5393 if (i == 0) { 5394 offset = start_offset; 5395 *map_start = 0; 5396 } else { 5397 offset = 0; 5398 *map_start = ((u64)i << PAGE_SHIFT) - start_offset; 5399 } 5400 5401 p = eb->pages[i]; 5402 kaddr = page_address(p); 5403 *map = kaddr + offset; 5404 *map_len = PAGE_SIZE - offset; 5405 return 0; 5406 } 5407 5408 int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, 5409 unsigned long start, unsigned long len) 5410 { 5411 size_t cur; 5412 size_t offset; 5413 struct page *page; 5414 char *kaddr; 5415 char *ptr = (char *)ptrv; 5416 size_t start_offset = offset_in_page(eb->start); 5417 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5418 int ret = 0; 5419 5420 WARN_ON(start > eb->len); 5421 WARN_ON(start + len > eb->start + eb->len); 5422 5423 offset = offset_in_page(start_offset + start); 5424 5425 while (len > 0) { 5426 page = eb->pages[i]; 5427 5428 cur = min(len, (PAGE_SIZE - offset)); 5429 5430 kaddr = page_address(page); 5431 ret = memcmp(ptr, kaddr + offset, cur); 5432 if (ret) 5433 break; 5434 5435 ptr += cur; 5436 len -= cur; 5437 offset = 0; 5438 i++; 5439 } 5440 return ret; 5441 } 5442 5443 void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb, 5444 const void *srcv) 5445 { 5446 char *kaddr; 5447 5448 WARN_ON(!PageUptodate(eb->pages[0])); 5449 kaddr = page_address(eb->pages[0]); 5450 memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv, 5451 BTRFS_FSID_SIZE); 5452 } 5453 5454 void write_extent_buffer_fsid(struct extent_buffer *eb, const void *srcv) 5455 { 5456 char *kaddr; 5457 5458 WARN_ON(!PageUptodate(eb->pages[0])); 5459 kaddr = page_address(eb->pages[0]); 5460 memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv, 5461 BTRFS_FSID_SIZE); 5462 } 5463 5464 void write_extent_buffer(struct extent_buffer *eb, const void *srcv, 5465 unsigned long start, unsigned long len) 5466 { 5467 size_t cur; 5468 size_t offset; 5469 struct page *page; 5470 char *kaddr; 5471 char *src = (char *)srcv; 5472 size_t start_offset = offset_in_page(eb->start); 5473 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5474 5475 WARN_ON(start > eb->len); 5476 WARN_ON(start + len > eb->start + eb->len); 5477 5478 offset = offset_in_page(start_offset + start); 5479 5480 while (len > 0) { 5481 page = eb->pages[i]; 5482 WARN_ON(!PageUptodate(page)); 5483 5484 cur = min(len, PAGE_SIZE - offset); 5485 kaddr = page_address(page); 5486 memcpy(kaddr + offset, src, cur); 5487 5488 src += cur; 5489 len -= cur; 5490 offset = 0; 5491 i++; 5492 } 5493 } 5494 5495 void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start, 5496 unsigned long len) 5497 { 5498 size_t cur; 5499 size_t offset; 5500 struct page *page; 5501 char *kaddr; 5502 size_t start_offset = 
offset_in_page(eb->start); 5503 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5504 5505 WARN_ON(start > eb->len); 5506 WARN_ON(start + len > eb->start + eb->len); 5507 5508 offset = offset_in_page(start_offset + start); 5509 5510 while (len > 0) { 5511 page = eb->pages[i]; 5512 WARN_ON(!PageUptodate(page)); 5513 5514 cur = min(len, PAGE_SIZE - offset); 5515 kaddr = page_address(page); 5516 memset(kaddr + offset, 0, cur); 5517 5518 len -= cur; 5519 offset = 0; 5520 i++; 5521 } 5522 } 5523 5524 void copy_extent_buffer_full(struct extent_buffer *dst, 5525 struct extent_buffer *src) 5526 { 5527 int i; 5528 int num_pages; 5529 5530 ASSERT(dst->len == src->len); 5531 5532 num_pages = num_extent_pages(dst); 5533 for (i = 0; i < num_pages; i++) 5534 copy_page(page_address(dst->pages[i]), 5535 page_address(src->pages[i])); 5536 } 5537 5538 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, 5539 unsigned long dst_offset, unsigned long src_offset, 5540 unsigned long len) 5541 { 5542 u64 dst_len = dst->len; 5543 size_t cur; 5544 size_t offset; 5545 struct page *page; 5546 char *kaddr; 5547 size_t start_offset = offset_in_page(dst->start); 5548 unsigned long i = (start_offset + dst_offset) >> PAGE_SHIFT; 5549 5550 WARN_ON(src->len != dst_len); 5551 5552 offset = offset_in_page(start_offset + dst_offset); 5553 5554 while (len > 0) { 5555 page = dst->pages[i]; 5556 WARN_ON(!PageUptodate(page)); 5557 5558 cur = min(len, (unsigned long)(PAGE_SIZE - offset)); 5559 5560 kaddr = page_address(page); 5561 read_extent_buffer(src, kaddr + offset, src_offset, cur); 5562 5563 src_offset += cur; 5564 len -= cur; 5565 offset = 0; 5566 i++; 5567 } 5568 } 5569 5570 /* 5571 * eb_bitmap_offset() - calculate the page and offset of the byte containing the 5572 * given bit number 5573 * @eb: the extent buffer 5574 * @start: offset of the bitmap item in the extent buffer 5575 * @nr: bit number 5576 * @page_index: return index of the page in the extent buffer that contains the 5577 * given bit number 5578 * @page_offset: return offset into the page given by page_index 5579 * 5580 * This helper hides the ugliness of finding the byte in an extent buffer which 5581 * contains a given bit. 5582 */ 5583 static inline void eb_bitmap_offset(struct extent_buffer *eb, 5584 unsigned long start, unsigned long nr, 5585 unsigned long *page_index, 5586 size_t *page_offset) 5587 { 5588 size_t start_offset = offset_in_page(eb->start); 5589 size_t byte_offset = BIT_BYTE(nr); 5590 size_t offset; 5591 5592 /* 5593 * The byte we want is the offset of the extent buffer + the offset of 5594 * the bitmap item in the extent buffer + the offset of the byte in the 5595 * bitmap item. 
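 * For example, with start_offset == 0, a bitmap item at byte 100 of the eb and nr == 20, byte_offset is 20 / 8 = 2, so offset is 102, which on 4K pages yields page_index 0 and page_offset 102.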
5596 */ 5597 offset = start_offset + start + byte_offset; 5598 5599 *page_index = offset >> PAGE_SHIFT; 5600 *page_offset = offset_in_page(offset); 5601 } 5602 5603 /** 5604 * extent_buffer_test_bit - determine whether a bit in a bitmap item is set 5605 * @eb: the extent buffer 5606 * @start: offset of the bitmap item in the extent buffer 5607 * @nr: bit number to test 5608 */ 5609 int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start, 5610 unsigned long nr) 5611 { 5612 u8 *kaddr; 5613 struct page *page; 5614 unsigned long i; 5615 size_t offset; 5616 5617 eb_bitmap_offset(eb, start, nr, &i, &offset); 5618 page = eb->pages[i]; 5619 WARN_ON(!PageUptodate(page)); 5620 kaddr = page_address(page); 5621 return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); 5622 } 5623 5624 /** 5625 * extent_buffer_bitmap_set - set an area of a bitmap 5626 * @eb: the extent buffer 5627 * @start: offset of the bitmap item in the extent buffer 5628 * @pos: bit number of the first bit 5629 * @len: number of bits to set 5630 */ 5631 void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, 5632 unsigned long pos, unsigned long len) 5633 { 5634 u8 *kaddr; 5635 struct page *page; 5636 unsigned long i; 5637 size_t offset; 5638 const unsigned int size = pos + len; 5639 int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE); 5640 u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos); 5641 5642 eb_bitmap_offset(eb, start, pos, &i, &offset); 5643 page = eb->pages[i]; 5644 WARN_ON(!PageUptodate(page)); 5645 kaddr = page_address(page); 5646 5647 while (len >= bits_to_set) { 5648 kaddr[offset] |= mask_to_set; 5649 len -= bits_to_set; 5650 bits_to_set = BITS_PER_BYTE; 5651 mask_to_set = ~0; 5652 if (++offset >= PAGE_SIZE && len > 0) { 5653 offset = 0; 5654 page = eb->pages[++i]; 5655 WARN_ON(!PageUptodate(page)); 5656 kaddr = page_address(page); 5657 } 5658 } 5659 if (len) { 5660 mask_to_set &= BITMAP_LAST_BYTE_MASK(size); 5661 kaddr[offset] |= mask_to_set; 5662 } 5663 } 5664 5665 5666 /** 5667 * extent_buffer_bitmap_clear - clear an area of a bitmap 5668 * @eb: the extent buffer 5669 * @start: offset of the bitmap item in the extent buffer 5670 * @pos: bit number of the first bit 5671 * @len: number of bits to clear 5672 */ 5673 void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start, 5674 unsigned long pos, unsigned long len) 5675 { 5676 u8 *kaddr; 5677 struct page *page; 5678 unsigned long i; 5679 size_t offset; 5680 const unsigned int size = pos + len; 5681 int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE); 5682 u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos); 5683 5684 eb_bitmap_offset(eb, start, pos, &i, &offset); 5685 page = eb->pages[i]; 5686 WARN_ON(!PageUptodate(page)); 5687 kaddr = page_address(page); 5688 5689 while (len >= bits_to_clear) { 5690 kaddr[offset] &= ~mask_to_clear; 5691 len -= bits_to_clear; 5692 bits_to_clear = BITS_PER_BYTE; 5693 mask_to_clear = ~0; 5694 if (++offset >= PAGE_SIZE && len > 0) { 5695 offset = 0; 5696 page = eb->pages[++i]; 5697 WARN_ON(!PageUptodate(page)); 5698 kaddr = page_address(page); 5699 } 5700 } 5701 if (len) { 5702 mask_to_clear &= BITMAP_LAST_BYTE_MASK(size); 5703 kaddr[offset] &= ~mask_to_clear; 5704 } 5705 } 5706 5707 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) 5708 { 5709 unsigned long distance = (src > dst) ? 
src - dst : dst - src; 5710 return distance < len; 5711 } 5712 5713 static void copy_pages(struct page *dst_page, struct page *src_page, 5714 unsigned long dst_off, unsigned long src_off, 5715 unsigned long len) 5716 { 5717 char *dst_kaddr = page_address(dst_page); 5718 char *src_kaddr; 5719 int must_memmove = 0; 5720 5721 if (dst_page != src_page) { 5722 src_kaddr = page_address(src_page); 5723 } else { 5724 src_kaddr = dst_kaddr; 5725 if (areas_overlap(src_off, dst_off, len)) 5726 must_memmove = 1; 5727 } 5728 5729 if (must_memmove) 5730 memmove(dst_kaddr + dst_off, src_kaddr + src_off, len); 5731 else 5732 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); 5733 } 5734 5735 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 5736 unsigned long src_offset, unsigned long len) 5737 { 5738 struct btrfs_fs_info *fs_info = dst->fs_info; 5739 size_t cur; 5740 size_t dst_off_in_page; 5741 size_t src_off_in_page; 5742 size_t start_offset = offset_in_page(dst->start); 5743 unsigned long dst_i; 5744 unsigned long src_i; 5745 5746 if (src_offset + len > dst->len) { 5747 btrfs_err(fs_info, 5748 "memmove bogus src_offset %lu move len %lu dst len %lu", 5749 src_offset, len, dst->len); 5750 BUG_ON(1); 5751 } 5752 if (dst_offset + len > dst->len) { 5753 btrfs_err(fs_info, 5754 "memmove bogus dst_offset %lu move len %lu dst len %lu", 5755 dst_offset, len, dst->len); 5756 BUG_ON(1); 5757 } 5758 5759 while (len > 0) { 5760 dst_off_in_page = offset_in_page(start_offset + dst_offset); 5761 src_off_in_page = offset_in_page(start_offset + src_offset); 5762 5763 dst_i = (start_offset + dst_offset) >> PAGE_SHIFT; 5764 src_i = (start_offset + src_offset) >> PAGE_SHIFT; 5765 5766 cur = min(len, (unsigned long)(PAGE_SIZE - 5767 src_off_in_page)); 5768 cur = min_t(unsigned long, cur, 5769 (unsigned long)(PAGE_SIZE - dst_off_in_page)); 5770 5771 copy_pages(dst->pages[dst_i], dst->pages[src_i], 5772 dst_off_in_page, src_off_in_page, cur); 5773 5774 src_offset += cur; 5775 dst_offset += cur; 5776 len -= cur; 5777 } 5778 } 5779 5780 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 5781 unsigned long src_offset, unsigned long len) 5782 { 5783 struct btrfs_fs_info *fs_info = dst->fs_info; 5784 size_t cur; 5785 size_t dst_off_in_page; 5786 size_t src_off_in_page; 5787 unsigned long dst_end = dst_offset + len - 1; 5788 unsigned long src_end = src_offset + len - 1; 5789 size_t start_offset = offset_in_page(dst->start); 5790 unsigned long dst_i; 5791 unsigned long src_i; 5792 5793 if (src_offset + len > dst->len) { 5794 btrfs_err(fs_info, 5795 "memmove bogus src_offset %lu move len %lu len %lu", 5796 src_offset, len, dst->len); 5797 BUG_ON(1); 5798 } 5799 if (dst_offset + len > dst->len) { 5800 btrfs_err(fs_info, 5801 "memmove bogus dst_offset %lu move len %lu len %lu", 5802 dst_offset, len, dst->len); 5803 BUG_ON(1); 5804 } 5805 if (dst_offset < src_offset) { 5806 memcpy_extent_buffer(dst, dst_offset, src_offset, len); 5807 return; 5808 } 5809 while (len > 0) { 5810 dst_i = (start_offset + dst_end) >> PAGE_SHIFT; 5811 src_i = (start_offset + src_end) >> PAGE_SHIFT; 5812 5813 dst_off_in_page = offset_in_page(start_offset + dst_end); 5814 src_off_in_page = offset_in_page(start_offset + src_end); 5815 5816 cur = min_t(unsigned long, len, src_off_in_page + 1); 5817 cur = min(cur, dst_off_in_page + 1); 5818 copy_pages(dst->pages[dst_i], dst->pages[src_i], 5819 dst_off_in_page - cur + 1, 5820 src_off_in_page - cur + 1, cur); 5821 5822 dst_end -= cur; 5823 src_end -= cur; 
5824 len -= cur; 5825 } 5826 } 5827 5828 int try_release_extent_buffer(struct page *page) 5829 { 5830 struct extent_buffer *eb; 5831 5832 /* 5833 * We need to make sure nobody is attaching this page to an eb right 5834 * now. 5835 */ 5836 spin_lock(&page->mapping->private_lock); 5837 if (!PagePrivate(page)) { 5838 spin_unlock(&page->mapping->private_lock); 5839 return 1; 5840 } 5841 5842 eb = (struct extent_buffer *)page->private; 5843 BUG_ON(!eb); 5844 5845 /* 5846 * This is a little awful but should be OK: we need to make sure that 5847 * the eb doesn't disappear out from under us while we're looking at 5848 * this page. 5849 */ 5850 spin_lock(&eb->refs_lock); 5851 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { 5852 spin_unlock(&eb->refs_lock); 5853 spin_unlock(&page->mapping->private_lock); 5854 return 0; 5855 } 5856 spin_unlock(&page->mapping->private_lock); 5857 5858 /* 5859 * If the tree ref isn't set then we know the ref on this eb is a real ref, 5860 * so just return; this page will likely be freed soon anyway. 5861 */ 5862 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { 5863 spin_unlock(&eb->refs_lock); 5864 return 0; 5865 } 5866 5867 return release_extent_buffer(eb); 5868 } 5869
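/*
 * A minimal usage sketch for the accessors above (illustrative only; `eb`,
 * `item_off` and `item_size` are assumed to describe an item in a leaf that
 * has already been read):
 *
 *	u8 buf[64];
 *
 *	if (item_size <= sizeof(buf))
 *		read_extent_buffer(eb, buf, item_off, item_size);
 *
 * write_extent_buffer() is the mirror operation for updating the bytes in
 * place, and memmove_extent_buffer() rather than memcpy_extent_buffer() must
 * be used when the source and destination ranges within the same buffer may
 * overlap.
 */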