1 // SPDX-License-Identifier: GPL-2.0 2 3 #include <linux/bitops.h> 4 #include <linux/slab.h> 5 #include <linux/bio.h> 6 #include <linux/mm.h> 7 #include <linux/pagemap.h> 8 #include <linux/page-flags.h> 9 #include <linux/spinlock.h> 10 #include <linux/blkdev.h> 11 #include <linux/swap.h> 12 #include <linux/writeback.h> 13 #include <linux/pagevec.h> 14 #include <linux/prefetch.h> 15 #include <linux/cleancache.h> 16 #include "extent_io.h" 17 #include "extent-io-tree.h" 18 #include "extent_map.h" 19 #include "ctree.h" 20 #include "btrfs_inode.h" 21 #include "volumes.h" 22 #include "check-integrity.h" 23 #include "locking.h" 24 #include "rcu-string.h" 25 #include "backref.h" 26 #include "disk-io.h" 27 28 static struct kmem_cache *extent_state_cache; 29 static struct kmem_cache *extent_buffer_cache; 30 static struct bio_set btrfs_bioset; 31 32 static inline bool extent_state_in_tree(const struct extent_state *state) 33 { 34 return !RB_EMPTY_NODE(&state->rb_node); 35 } 36 37 #ifdef CONFIG_BTRFS_DEBUG 38 static LIST_HEAD(buffers); 39 static LIST_HEAD(states); 40 41 static DEFINE_SPINLOCK(leak_lock); 42 43 static inline 44 void btrfs_leak_debug_add(struct list_head *new, struct list_head *head) 45 { 46 unsigned long flags; 47 48 spin_lock_irqsave(&leak_lock, flags); 49 list_add(new, head); 50 spin_unlock_irqrestore(&leak_lock, flags); 51 } 52 53 static inline 54 void btrfs_leak_debug_del(struct list_head *entry) 55 { 56 unsigned long flags; 57 58 spin_lock_irqsave(&leak_lock, flags); 59 list_del(entry); 60 spin_unlock_irqrestore(&leak_lock, flags); 61 } 62 63 static inline void btrfs_extent_buffer_leak_debug_check(void) 64 { 65 struct extent_buffer *eb; 66 67 while (!list_empty(&buffers)) { 68 eb = list_entry(buffers.next, struct extent_buffer, leak_list); 69 pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n", 70 eb->start, eb->len, atomic_read(&eb->refs), eb->bflags); 71 list_del(&eb->leak_list); 72 kmem_cache_free(extent_buffer_cache, eb); 73 } 74 } 75 76 static inline void btrfs_extent_state_leak_debug_check(void) 77 { 78 struct extent_state *state; 79 80 while (!list_empty(&states)) { 81 state = list_entry(states.next, struct extent_state, leak_list); 82 pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n", 83 state->start, state->end, state->state, 84 extent_state_in_tree(state), 85 refcount_read(&state->refs)); 86 list_del(&state->leak_list); 87 kmem_cache_free(extent_state_cache, state); 88 } 89 } 90 91 #define btrfs_debug_check_extent_io_range(tree, start, end) \ 92 __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end)) 93 static inline void __btrfs_debug_check_extent_io_range(const char *caller, 94 struct extent_io_tree *tree, u64 start, u64 end) 95 { 96 struct inode *inode = tree->private_data; 97 u64 isize; 98 99 if (!inode || !is_data_inode(inode)) 100 return; 101 102 isize = i_size_read(inode); 103 if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { 104 btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, 105 "%s: ino %llu isize %llu odd range [%llu,%llu]", 106 caller, btrfs_ino(BTRFS_I(inode)), isize, start, end); 107 } 108 } 109 #else 110 #define btrfs_leak_debug_add(new, head) do {} while (0) 111 #define btrfs_leak_debug_del(entry) do {} while (0) 112 #define btrfs_extent_buffer_leak_debug_check() do {} while (0) 113 #define btrfs_extent_state_leak_debug_check() do {} while (0) 114 #define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) 115 #endif 116 117 struct tree_entry { 118 u64 start; 119 u64 end; 120 
struct rb_node rb_node; 121 }; 122 123 struct extent_page_data { 124 struct bio *bio; 125 struct extent_io_tree *tree; 126 /* tells writepage not to lock the state bits for this range 127 * it still does the unlocking 128 */ 129 unsigned int extent_locked:1; 130 131 /* tells the submit_bio code to use REQ_SYNC */ 132 unsigned int sync_io:1; 133 }; 134 135 static int add_extent_changeset(struct extent_state *state, unsigned bits, 136 struct extent_changeset *changeset, 137 int set) 138 { 139 int ret; 140 141 if (!changeset) 142 return 0; 143 if (set && (state->state & bits) == bits) 144 return 0; 145 if (!set && (state->state & bits) == 0) 146 return 0; 147 changeset->bytes_changed += state->end - state->start + 1; 148 ret = ulist_add(&changeset->range_changed, state->start, state->end, 149 GFP_ATOMIC); 150 return ret; 151 } 152 153 static int __must_check submit_one_bio(struct bio *bio, int mirror_num, 154 unsigned long bio_flags) 155 { 156 blk_status_t ret = 0; 157 struct extent_io_tree *tree = bio->bi_private; 158 159 bio->bi_private = NULL; 160 161 if (tree->ops) 162 ret = tree->ops->submit_bio_hook(tree->private_data, bio, 163 mirror_num, bio_flags); 164 else 165 btrfsic_submit_bio(bio); 166 167 return blk_status_to_errno(ret); 168 } 169 170 /* Cleanup unsubmitted bios */ 171 static void end_write_bio(struct extent_page_data *epd, int ret) 172 { 173 if (epd->bio) { 174 epd->bio->bi_status = errno_to_blk_status(ret); 175 bio_endio(epd->bio); 176 epd->bio = NULL; 177 } 178 } 179 180 /* 181 * Submit bio from extent page data via submit_one_bio 182 * 183 * Return 0 if everything is OK. 184 * Return <0 for error. 185 */ 186 static int __must_check flush_write_bio(struct extent_page_data *epd) 187 { 188 int ret = 0; 189 190 if (epd->bio) { 191 ret = submit_one_bio(epd->bio, 0, 0); 192 /* 193 * Clean up of epd->bio is handled by its endio function. 194 * And endio is either triggered by successful bio execution 195 * or the error handler of submit bio hook. 196 * So at this point, no matter what happened, we don't need 197 * to clean up epd->bio. 198 */ 199 epd->bio = NULL; 200 } 201 return ret; 202 } 203 204 int __init extent_state_cache_init(void) 205 { 206 extent_state_cache = kmem_cache_create("btrfs_extent_state", 207 sizeof(struct extent_state), 0, 208 SLAB_MEM_SPREAD, NULL); 209 if (!extent_state_cache) 210 return -ENOMEM; 211 return 0; 212 } 213 214 int __init extent_io_init(void) 215 { 216 extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", 217 sizeof(struct extent_buffer), 0, 218 SLAB_MEM_SPREAD, NULL); 219 if (!extent_buffer_cache) 220 return -ENOMEM; 221 222 if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, 223 offsetof(struct btrfs_io_bio, bio), 224 BIOSET_NEED_BVECS)) 225 goto free_buffer_cache; 226 227 if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE)) 228 goto free_bioset; 229 230 return 0; 231 232 free_bioset: 233 bioset_exit(&btrfs_bioset); 234 235 free_buffer_cache: 236 kmem_cache_destroy(extent_buffer_cache); 237 extent_buffer_cache = NULL; 238 return -ENOMEM; 239 } 240 241 void __cold extent_state_cache_exit(void) 242 { 243 btrfs_extent_state_leak_debug_check(); 244 kmem_cache_destroy(extent_state_cache); 245 } 246 247 void __cold extent_io_exit(void) 248 { 249 btrfs_extent_buffer_leak_debug_check(); 250 251 /* 252 * Make sure all delayed rcu free are flushed before we 253 * destroy caches. 
254 */ 255 rcu_barrier(); 256 kmem_cache_destroy(extent_buffer_cache); 257 bioset_exit(&btrfs_bioset); 258 } 259 260 void extent_io_tree_init(struct btrfs_fs_info *fs_info, 261 struct extent_io_tree *tree, unsigned int owner, 262 void *private_data) 263 { 264 tree->fs_info = fs_info; 265 tree->state = RB_ROOT; 266 tree->ops = NULL; 267 tree->dirty_bytes = 0; 268 spin_lock_init(&tree->lock); 269 tree->private_data = private_data; 270 tree->owner = owner; 271 } 272 273 void extent_io_tree_release(struct extent_io_tree *tree) 274 { 275 spin_lock(&tree->lock); 276 /* 277 * Do a single barrier for the waitqueue_active check here, the state 278 * of the waitqueue should not change once extent_io_tree_release is 279 * called. 280 */ 281 smp_mb(); 282 while (!RB_EMPTY_ROOT(&tree->state)) { 283 struct rb_node *node; 284 struct extent_state *state; 285 286 node = rb_first(&tree->state); 287 state = rb_entry(node, struct extent_state, rb_node); 288 rb_erase(&state->rb_node, &tree->state); 289 RB_CLEAR_NODE(&state->rb_node); 290 /* 291 * btree io trees aren't supposed to have tasks waiting for 292 * changes in the flags of extent states ever. 293 */ 294 ASSERT(!waitqueue_active(&state->wq)); 295 free_extent_state(state); 296 297 cond_resched_lock(&tree->lock); 298 } 299 spin_unlock(&tree->lock); 300 } 301 302 static struct extent_state *alloc_extent_state(gfp_t mask) 303 { 304 struct extent_state *state; 305 306 /* 307 * The given mask might be not appropriate for the slab allocator, 308 * drop the unsupported bits 309 */ 310 mask &= ~(__GFP_DMA32|__GFP_HIGHMEM); 311 state = kmem_cache_alloc(extent_state_cache, mask); 312 if (!state) 313 return state; 314 state->state = 0; 315 state->failrec = NULL; 316 RB_CLEAR_NODE(&state->rb_node); 317 btrfs_leak_debug_add(&state->leak_list, &states); 318 refcount_set(&state->refs, 1); 319 init_waitqueue_head(&state->wq); 320 trace_alloc_extent_state(state, mask, _RET_IP_); 321 return state; 322 } 323 324 void free_extent_state(struct extent_state *state) 325 { 326 if (!state) 327 return; 328 if (refcount_dec_and_test(&state->refs)) { 329 WARN_ON(extent_state_in_tree(state)); 330 btrfs_leak_debug_del(&state->leak_list); 331 trace_free_extent_state(state, _RET_IP_); 332 kmem_cache_free(extent_state_cache, state); 333 } 334 } 335 336 static struct rb_node *tree_insert(struct rb_root *root, 337 struct rb_node *search_start, 338 u64 offset, 339 struct rb_node *node, 340 struct rb_node ***p_in, 341 struct rb_node **parent_in) 342 { 343 struct rb_node **p; 344 struct rb_node *parent = NULL; 345 struct tree_entry *entry; 346 347 if (p_in && parent_in) { 348 p = *p_in; 349 parent = *parent_in; 350 goto do_insert; 351 } 352 353 p = search_start ? &search_start : &root->rb_node; 354 while (*p) { 355 parent = *p; 356 entry = rb_entry(parent, struct tree_entry, rb_node); 357 358 if (offset < entry->start) 359 p = &(*p)->rb_left; 360 else if (offset > entry->end) 361 p = &(*p)->rb_right; 362 else 363 return parent; 364 } 365 366 do_insert: 367 rb_link_node(node, parent, p); 368 rb_insert_color(node, root); 369 return NULL; 370 } 371 372 /** 373 * __etree_search - searche @tree for an entry that contains @offset. Such 374 * entry would have entry->start <= offset && entry->end >= offset. 
375 * 376 * @tree - the tree to search 377 * @offset - offset that should fall within an entry in @tree 378 * @next_ret - pointer to the first entry whose range ends after @offset 379 * @prev - pointer to the first entry whose range begins before @offset 380 * @p_ret - pointer where new node should be anchored (used when inserting an 381 * entry in the tree) 382 * @parent_ret - points to entry which would have been the parent of the entry, 383 * containing @offset 384 * 385 * This function returns a pointer to the entry that contains @offset byte 386 * address. If no such entry exists, then NULL is returned and the other 387 * pointer arguments to the function are filled, otherwise the found entry is 388 * returned and other pointers are left untouched. 389 */ 390 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, 391 struct rb_node **next_ret, 392 struct rb_node **prev_ret, 393 struct rb_node ***p_ret, 394 struct rb_node **parent_ret) 395 { 396 struct rb_root *root = &tree->state; 397 struct rb_node **n = &root->rb_node; 398 struct rb_node *prev = NULL; 399 struct rb_node *orig_prev = NULL; 400 struct tree_entry *entry; 401 struct tree_entry *prev_entry = NULL; 402 403 while (*n) { 404 prev = *n; 405 entry = rb_entry(prev, struct tree_entry, rb_node); 406 prev_entry = entry; 407 408 if (offset < entry->start) 409 n = &(*n)->rb_left; 410 else if (offset > entry->end) 411 n = &(*n)->rb_right; 412 else 413 return *n; 414 } 415 416 if (p_ret) 417 *p_ret = n; 418 if (parent_ret) 419 *parent_ret = prev; 420 421 if (next_ret) { 422 orig_prev = prev; 423 while (prev && offset > prev_entry->end) { 424 prev = rb_next(prev); 425 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 426 } 427 *next_ret = prev; 428 prev = orig_prev; 429 } 430 431 if (prev_ret) { 432 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 433 while (prev && offset < prev_entry->start) { 434 prev = rb_prev(prev); 435 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 436 } 437 *prev_ret = prev; 438 } 439 return NULL; 440 } 441 442 static inline struct rb_node * 443 tree_search_for_insert(struct extent_io_tree *tree, 444 u64 offset, 445 struct rb_node ***p_ret, 446 struct rb_node **parent_ret) 447 { 448 struct rb_node *next= NULL; 449 struct rb_node *ret; 450 451 ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret); 452 if (!ret) 453 return next; 454 return ret; 455 } 456 457 static inline struct rb_node *tree_search(struct extent_io_tree *tree, 458 u64 offset) 459 { 460 return tree_search_for_insert(tree, offset, NULL, NULL); 461 } 462 463 /* 464 * utility function to look for merge candidates inside a given range. 465 * Any extents with matching state are merged together into a single 466 * extent in the tree. Extents with EXTENT_IO in their state field 467 * are not merged because the end_io handlers need to be able to do 468 * operations on them without sleeping (or doing allocations/splits). 469 * 470 * This should be called with the tree lock held. 
471 */ 472 static void merge_state(struct extent_io_tree *tree, 473 struct extent_state *state) 474 { 475 struct extent_state *other; 476 struct rb_node *other_node; 477 478 if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY)) 479 return; 480 481 other_node = rb_prev(&state->rb_node); 482 if (other_node) { 483 other = rb_entry(other_node, struct extent_state, rb_node); 484 if (other->end == state->start - 1 && 485 other->state == state->state) { 486 if (tree->private_data && 487 is_data_inode(tree->private_data)) 488 btrfs_merge_delalloc_extent(tree->private_data, 489 state, other); 490 state->start = other->start; 491 rb_erase(&other->rb_node, &tree->state); 492 RB_CLEAR_NODE(&other->rb_node); 493 free_extent_state(other); 494 } 495 } 496 other_node = rb_next(&state->rb_node); 497 if (other_node) { 498 other = rb_entry(other_node, struct extent_state, rb_node); 499 if (other->start == state->end + 1 && 500 other->state == state->state) { 501 if (tree->private_data && 502 is_data_inode(tree->private_data)) 503 btrfs_merge_delalloc_extent(tree->private_data, 504 state, other); 505 state->end = other->end; 506 rb_erase(&other->rb_node, &tree->state); 507 RB_CLEAR_NODE(&other->rb_node); 508 free_extent_state(other); 509 } 510 } 511 } 512 513 static void set_state_bits(struct extent_io_tree *tree, 514 struct extent_state *state, unsigned *bits, 515 struct extent_changeset *changeset); 516 517 /* 518 * insert an extent_state struct into the tree. 'bits' are set on the 519 * struct before it is inserted. 520 * 521 * This may return -EEXIST if the extent is already there, in which case the 522 * state struct is freed. 523 * 524 * The tree lock is not taken internally. This is a utility function and 525 * probably isn't what you want to call (see set/clear_extent_bit). 526 */ 527 static int insert_state(struct extent_io_tree *tree, 528 struct extent_state *state, u64 start, u64 end, 529 struct rb_node ***p, 530 struct rb_node **parent, 531 unsigned *bits, struct extent_changeset *changeset) 532 { 533 struct rb_node *node; 534 535 if (end < start) { 536 btrfs_err(tree->fs_info, 537 "insert state: end < start %llu %llu", end, start); 538 WARN_ON(1); 539 } 540 state->start = start; 541 state->end = end; 542 543 set_state_bits(tree, state, bits, changeset); 544 545 node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent); 546 if (node) { 547 struct extent_state *found; 548 found = rb_entry(node, struct extent_state, rb_node); 549 btrfs_err(tree->fs_info, 550 "found node %llu %llu on insert of %llu %llu", 551 found->start, found->end, start, end); 552 return -EEXIST; 553 } 554 merge_state(tree, state); 555 return 0; 556 } 557 558 /* 559 * split a given extent state struct in two, inserting the preallocated 560 * struct 'prealloc' as the newly created second half. 'split' indicates an 561 * offset inside 'orig' where it should be split. 562 * 563 * Before calling, 564 * the tree has 'orig' at [orig->start, orig->end]. After calling, there 565 * are two extent state structs in the tree: 566 * prealloc: [orig->start, split - 1] 567 * orig: [ split, orig->end ] 568 * 569 * The tree locks are not taken by this function. They need to be held 570 * by the caller. 
571 */ 572 static int split_state(struct extent_io_tree *tree, struct extent_state *orig, 573 struct extent_state *prealloc, u64 split) 574 { 575 struct rb_node *node; 576 577 if (tree->private_data && is_data_inode(tree->private_data)) 578 btrfs_split_delalloc_extent(tree->private_data, orig, split); 579 580 prealloc->start = orig->start; 581 prealloc->end = split - 1; 582 prealloc->state = orig->state; 583 orig->start = split; 584 585 node = tree_insert(&tree->state, &orig->rb_node, prealloc->end, 586 &prealloc->rb_node, NULL, NULL); 587 if (node) { 588 free_extent_state(prealloc); 589 return -EEXIST; 590 } 591 return 0; 592 } 593 594 static struct extent_state *next_state(struct extent_state *state) 595 { 596 struct rb_node *next = rb_next(&state->rb_node); 597 if (next) 598 return rb_entry(next, struct extent_state, rb_node); 599 else 600 return NULL; 601 } 602 603 /* 604 * utility function to clear some bits in an extent state struct. 605 * it will optionally wake up anyone waiting on this state (wake == 1). 606 * 607 * If no bits are set on the state struct after clearing things, the 608 * struct is freed and removed from the tree 609 */ 610 static struct extent_state *clear_state_bit(struct extent_io_tree *tree, 611 struct extent_state *state, 612 unsigned *bits, int wake, 613 struct extent_changeset *changeset) 614 { 615 struct extent_state *next; 616 unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS; 617 int ret; 618 619 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 620 u64 range = state->end - state->start + 1; 621 WARN_ON(range > tree->dirty_bytes); 622 tree->dirty_bytes -= range; 623 } 624 625 if (tree->private_data && is_data_inode(tree->private_data)) 626 btrfs_clear_delalloc_extent(tree->private_data, state, bits); 627 628 ret = add_extent_changeset(state, bits_to_clear, changeset, 0); 629 BUG_ON(ret < 0); 630 state->state &= ~bits_to_clear; 631 if (wake) 632 wake_up(&state->wq); 633 if (state->state == 0) { 634 next = next_state(state); 635 if (extent_state_in_tree(state)) { 636 rb_erase(&state->rb_node, &tree->state); 637 RB_CLEAR_NODE(&state->rb_node); 638 free_extent_state(state); 639 } else { 640 WARN_ON(1); 641 } 642 } else { 643 merge_state(tree, state); 644 next = next_state(state); 645 } 646 return next; 647 } 648 649 static struct extent_state * 650 alloc_extent_state_atomic(struct extent_state *prealloc) 651 { 652 if (!prealloc) 653 prealloc = alloc_extent_state(GFP_ATOMIC); 654 655 return prealloc; 656 } 657 658 static void extent_io_tree_panic(struct extent_io_tree *tree, int err) 659 { 660 struct inode *inode = tree->private_data; 661 662 btrfs_panic(btrfs_sb(inode->i_sb), err, 663 "locking error: extent tree was modified by another thread while locked"); 664 } 665 666 /* 667 * clear some bits on a range in the tree. This may require splitting 668 * or inserting elements in the tree, so the gfp mask is used to 669 * indicate which allocations or sleeping are allowed. 670 * 671 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove 672 * the given range from the tree regardless of state (ie for truncate). 673 * 674 * the range [start, end] is inclusive. 675 * 676 * This takes the tree lock, and returns 0 on success and < 0 on error. 
677 */ 678 int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 679 unsigned bits, int wake, int delete, 680 struct extent_state **cached_state, 681 gfp_t mask, struct extent_changeset *changeset) 682 { 683 struct extent_state *state; 684 struct extent_state *cached; 685 struct extent_state *prealloc = NULL; 686 struct rb_node *node; 687 u64 last_end; 688 int err; 689 int clear = 0; 690 691 btrfs_debug_check_extent_io_range(tree, start, end); 692 trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits); 693 694 if (bits & EXTENT_DELALLOC) 695 bits |= EXTENT_NORESERVE; 696 697 if (delete) 698 bits |= ~EXTENT_CTLBITS; 699 700 if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)) 701 clear = 1; 702 again: 703 if (!prealloc && gfpflags_allow_blocking(mask)) { 704 /* 705 * Don't care for allocation failure here because we might end 706 * up not needing the pre-allocated extent state at all, which 707 * is the case if we only have in the tree extent states that 708 * cover our input range and don't cover too any other range. 709 * If we end up needing a new extent state we allocate it later. 710 */ 711 prealloc = alloc_extent_state(mask); 712 } 713 714 spin_lock(&tree->lock); 715 if (cached_state) { 716 cached = *cached_state; 717 718 if (clear) { 719 *cached_state = NULL; 720 cached_state = NULL; 721 } 722 723 if (cached && extent_state_in_tree(cached) && 724 cached->start <= start && cached->end > start) { 725 if (clear) 726 refcount_dec(&cached->refs); 727 state = cached; 728 goto hit_next; 729 } 730 if (clear) 731 free_extent_state(cached); 732 } 733 /* 734 * this search will find the extents that end after 735 * our range starts 736 */ 737 node = tree_search(tree, start); 738 if (!node) 739 goto out; 740 state = rb_entry(node, struct extent_state, rb_node); 741 hit_next: 742 if (state->start > end) 743 goto out; 744 WARN_ON(state->end < start); 745 last_end = state->end; 746 747 /* the state doesn't have the wanted bits, go ahead */ 748 if (!(state->state & bits)) { 749 state = next_state(state); 750 goto next; 751 } 752 753 /* 754 * | ---- desired range ---- | 755 * | state | or 756 * | ------------- state -------------- | 757 * 758 * We need to split the extent we found, and may flip 759 * bits on second half. 760 * 761 * If the extent we found extends past our range, we 762 * just split and search again. It'll get split again 763 * the next time though. 764 * 765 * If the extent we found is inside our range, we clear 766 * the desired bit on it. 
767 */ 768 769 if (state->start < start) { 770 prealloc = alloc_extent_state_atomic(prealloc); 771 BUG_ON(!prealloc); 772 err = split_state(tree, state, prealloc, start); 773 if (err) 774 extent_io_tree_panic(tree, err); 775 776 prealloc = NULL; 777 if (err) 778 goto out; 779 if (state->end <= end) { 780 state = clear_state_bit(tree, state, &bits, wake, 781 changeset); 782 goto next; 783 } 784 goto search_again; 785 } 786 /* 787 * | ---- desired range ---- | 788 * | state | 789 * We need to split the extent, and clear the bit 790 * on the first half 791 */ 792 if (state->start <= end && state->end > end) { 793 prealloc = alloc_extent_state_atomic(prealloc); 794 BUG_ON(!prealloc); 795 err = split_state(tree, state, prealloc, end + 1); 796 if (err) 797 extent_io_tree_panic(tree, err); 798 799 if (wake) 800 wake_up(&state->wq); 801 802 clear_state_bit(tree, prealloc, &bits, wake, changeset); 803 804 prealloc = NULL; 805 goto out; 806 } 807 808 state = clear_state_bit(tree, state, &bits, wake, changeset); 809 next: 810 if (last_end == (u64)-1) 811 goto out; 812 start = last_end + 1; 813 if (start <= end && state && !need_resched()) 814 goto hit_next; 815 816 search_again: 817 if (start > end) 818 goto out; 819 spin_unlock(&tree->lock); 820 if (gfpflags_allow_blocking(mask)) 821 cond_resched(); 822 goto again; 823 824 out: 825 spin_unlock(&tree->lock); 826 if (prealloc) 827 free_extent_state(prealloc); 828 829 return 0; 830 831 } 832 833 static void wait_on_state(struct extent_io_tree *tree, 834 struct extent_state *state) 835 __releases(tree->lock) 836 __acquires(tree->lock) 837 { 838 DEFINE_WAIT(wait); 839 prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); 840 spin_unlock(&tree->lock); 841 schedule(); 842 spin_lock(&tree->lock); 843 finish_wait(&state->wq, &wait); 844 } 845 846 /* 847 * waits for one or more bits to clear on a range in the state tree. 848 * The range [start, end] is inclusive. 
849 * The tree lock is taken by this function 850 */ 851 static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 852 unsigned long bits) 853 { 854 struct extent_state *state; 855 struct rb_node *node; 856 857 btrfs_debug_check_extent_io_range(tree, start, end); 858 859 spin_lock(&tree->lock); 860 again: 861 while (1) { 862 /* 863 * this search will find all the extents that end after 864 * our range starts 865 */ 866 node = tree_search(tree, start); 867 process_node: 868 if (!node) 869 break; 870 871 state = rb_entry(node, struct extent_state, rb_node); 872 873 if (state->start > end) 874 goto out; 875 876 if (state->state & bits) { 877 start = state->start; 878 refcount_inc(&state->refs); 879 wait_on_state(tree, state); 880 free_extent_state(state); 881 goto again; 882 } 883 start = state->end + 1; 884 885 if (start > end) 886 break; 887 888 if (!cond_resched_lock(&tree->lock)) { 889 node = rb_next(node); 890 goto process_node; 891 } 892 } 893 out: 894 spin_unlock(&tree->lock); 895 } 896 897 static void set_state_bits(struct extent_io_tree *tree, 898 struct extent_state *state, 899 unsigned *bits, struct extent_changeset *changeset) 900 { 901 unsigned bits_to_set = *bits & ~EXTENT_CTLBITS; 902 int ret; 903 904 if (tree->private_data && is_data_inode(tree->private_data)) 905 btrfs_set_delalloc_extent(tree->private_data, state, bits); 906 907 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { 908 u64 range = state->end - state->start + 1; 909 tree->dirty_bytes += range; 910 } 911 ret = add_extent_changeset(state, bits_to_set, changeset, 1); 912 BUG_ON(ret < 0); 913 state->state |= bits_to_set; 914 } 915 916 static void cache_state_if_flags(struct extent_state *state, 917 struct extent_state **cached_ptr, 918 unsigned flags) 919 { 920 if (cached_ptr && !(*cached_ptr)) { 921 if (!flags || (state->state & flags)) { 922 *cached_ptr = state; 923 refcount_inc(&state->refs); 924 } 925 } 926 } 927 928 static void cache_state(struct extent_state *state, 929 struct extent_state **cached_ptr) 930 { 931 return cache_state_if_flags(state, cached_ptr, 932 EXTENT_LOCKED | EXTENT_BOUNDARY); 933 } 934 935 /* 936 * set some bits on a range in the tree. This may require allocations or 937 * sleeping, so the gfp mask is used to indicate what is allowed. 938 * 939 * If any of the exclusive bits are set, this will fail with -EEXIST if some 940 * part of the range already has the desired bits set. The start of the 941 * existing range is returned in failed_start in this case. 942 * 943 * [start, end] is inclusive This takes the tree lock. 944 */ 945 946 static int __must_check 947 __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 948 unsigned bits, unsigned exclusive_bits, 949 u64 *failed_start, struct extent_state **cached_state, 950 gfp_t mask, struct extent_changeset *changeset) 951 { 952 struct extent_state *state; 953 struct extent_state *prealloc = NULL; 954 struct rb_node *node; 955 struct rb_node **p; 956 struct rb_node *parent; 957 int err = 0; 958 u64 last_start; 959 u64 last_end; 960 961 btrfs_debug_check_extent_io_range(tree, start, end); 962 trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits); 963 964 again: 965 if (!prealloc && gfpflags_allow_blocking(mask)) { 966 /* 967 * Don't care for allocation failure here because we might end 968 * up not needing the pre-allocated extent state at all, which 969 * is the case if we only have in the tree extent states that 970 * cover our input range and don't cover too any other range. 
971 * If we end up needing a new extent state we allocate it later. 972 */ 973 prealloc = alloc_extent_state(mask); 974 } 975 976 spin_lock(&tree->lock); 977 if (cached_state && *cached_state) { 978 state = *cached_state; 979 if (state->start <= start && state->end > start && 980 extent_state_in_tree(state)) { 981 node = &state->rb_node; 982 goto hit_next; 983 } 984 } 985 /* 986 * this search will find all the extents that end after 987 * our range starts. 988 */ 989 node = tree_search_for_insert(tree, start, &p, &parent); 990 if (!node) { 991 prealloc = alloc_extent_state_atomic(prealloc); 992 BUG_ON(!prealloc); 993 err = insert_state(tree, prealloc, start, end, 994 &p, &parent, &bits, changeset); 995 if (err) 996 extent_io_tree_panic(tree, err); 997 998 cache_state(prealloc, cached_state); 999 prealloc = NULL; 1000 goto out; 1001 } 1002 state = rb_entry(node, struct extent_state, rb_node); 1003 hit_next: 1004 last_start = state->start; 1005 last_end = state->end; 1006 1007 /* 1008 * | ---- desired range ---- | 1009 * | state | 1010 * 1011 * Just lock what we found and keep going 1012 */ 1013 if (state->start == start && state->end <= end) { 1014 if (state->state & exclusive_bits) { 1015 *failed_start = state->start; 1016 err = -EEXIST; 1017 goto out; 1018 } 1019 1020 set_state_bits(tree, state, &bits, changeset); 1021 cache_state(state, cached_state); 1022 merge_state(tree, state); 1023 if (last_end == (u64)-1) 1024 goto out; 1025 start = last_end + 1; 1026 state = next_state(state); 1027 if (start < end && state && state->start == start && 1028 !need_resched()) 1029 goto hit_next; 1030 goto search_again; 1031 } 1032 1033 /* 1034 * | ---- desired range ---- | 1035 * | state | 1036 * or 1037 * | ------------- state -------------- | 1038 * 1039 * We need to split the extent we found, and may flip bits on 1040 * second half. 1041 * 1042 * If the extent we found extends past our 1043 * range, we just split and search again. It'll get split 1044 * again the next time though. 1045 * 1046 * If the extent we found is inside our range, we set the 1047 * desired bit on it. 1048 */ 1049 if (state->start < start) { 1050 if (state->state & exclusive_bits) { 1051 *failed_start = start; 1052 err = -EEXIST; 1053 goto out; 1054 } 1055 1056 prealloc = alloc_extent_state_atomic(prealloc); 1057 BUG_ON(!prealloc); 1058 err = split_state(tree, state, prealloc, start); 1059 if (err) 1060 extent_io_tree_panic(tree, err); 1061 1062 prealloc = NULL; 1063 if (err) 1064 goto out; 1065 if (state->end <= end) { 1066 set_state_bits(tree, state, &bits, changeset); 1067 cache_state(state, cached_state); 1068 merge_state(tree, state); 1069 if (last_end == (u64)-1) 1070 goto out; 1071 start = last_end + 1; 1072 state = next_state(state); 1073 if (start < end && state && state->start == start && 1074 !need_resched()) 1075 goto hit_next; 1076 } 1077 goto search_again; 1078 } 1079 /* 1080 * | ---- desired range ---- | 1081 * | state | or | state | 1082 * 1083 * There's a hole, we need to insert something in it and 1084 * ignore the extent we found. 1085 */ 1086 if (state->start > start) { 1087 u64 this_end; 1088 if (end < last_start) 1089 this_end = end; 1090 else 1091 this_end = last_start - 1; 1092 1093 prealloc = alloc_extent_state_atomic(prealloc); 1094 BUG_ON(!prealloc); 1095 1096 /* 1097 * Avoid to free 'prealloc' if it can be merged with 1098 * the later extent. 
1099 */ 1100 err = insert_state(tree, prealloc, start, this_end, 1101 NULL, NULL, &bits, changeset); 1102 if (err) 1103 extent_io_tree_panic(tree, err); 1104 1105 cache_state(prealloc, cached_state); 1106 prealloc = NULL; 1107 start = this_end + 1; 1108 goto search_again; 1109 } 1110 /* 1111 * | ---- desired range ---- | 1112 * | state | 1113 * We need to split the extent, and set the bit 1114 * on the first half 1115 */ 1116 if (state->start <= end && state->end > end) { 1117 if (state->state & exclusive_bits) { 1118 *failed_start = start; 1119 err = -EEXIST; 1120 goto out; 1121 } 1122 1123 prealloc = alloc_extent_state_atomic(prealloc); 1124 BUG_ON(!prealloc); 1125 err = split_state(tree, state, prealloc, end + 1); 1126 if (err) 1127 extent_io_tree_panic(tree, err); 1128 1129 set_state_bits(tree, prealloc, &bits, changeset); 1130 cache_state(prealloc, cached_state); 1131 merge_state(tree, prealloc); 1132 prealloc = NULL; 1133 goto out; 1134 } 1135 1136 search_again: 1137 if (start > end) 1138 goto out; 1139 spin_unlock(&tree->lock); 1140 if (gfpflags_allow_blocking(mask)) 1141 cond_resched(); 1142 goto again; 1143 1144 out: 1145 spin_unlock(&tree->lock); 1146 if (prealloc) 1147 free_extent_state(prealloc); 1148 1149 return err; 1150 1151 } 1152 1153 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 1154 unsigned bits, u64 * failed_start, 1155 struct extent_state **cached_state, gfp_t mask) 1156 { 1157 return __set_extent_bit(tree, start, end, bits, 0, failed_start, 1158 cached_state, mask, NULL); 1159 } 1160 1161 1162 /** 1163 * convert_extent_bit - convert all bits in a given range from one bit to 1164 * another 1165 * @tree: the io tree to search 1166 * @start: the start offset in bytes 1167 * @end: the end offset in bytes (inclusive) 1168 * @bits: the bits to set in this range 1169 * @clear_bits: the bits to clear in this range 1170 * @cached_state: state that we're going to cache 1171 * 1172 * This will go through and set bits for the given range. If any states exist 1173 * already in this range they are set with the given bit and cleared of the 1174 * clear_bits. This is only meant to be used by things that are mergeable, ie 1175 * converting from say DELALLOC to DIRTY. This is not meant to be used with 1176 * boundary bits like LOCK. 1177 * 1178 * All allocations are done with GFP_NOFS. 1179 */ 1180 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 1181 unsigned bits, unsigned clear_bits, 1182 struct extent_state **cached_state) 1183 { 1184 struct extent_state *state; 1185 struct extent_state *prealloc = NULL; 1186 struct rb_node *node; 1187 struct rb_node **p; 1188 struct rb_node *parent; 1189 int err = 0; 1190 u64 last_start; 1191 u64 last_end; 1192 bool first_iteration = true; 1193 1194 btrfs_debug_check_extent_io_range(tree, start, end); 1195 trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits, 1196 clear_bits); 1197 1198 again: 1199 if (!prealloc) { 1200 /* 1201 * Best effort, don't worry if extent state allocation fails 1202 * here for the first iteration. We might have a cached state 1203 * that matches exactly the target range, in which case no 1204 * extent state allocations are needed. We'll only know this 1205 * after locking the tree. 
1206 */ 1207 prealloc = alloc_extent_state(GFP_NOFS); 1208 if (!prealloc && !first_iteration) 1209 return -ENOMEM; 1210 } 1211 1212 spin_lock(&tree->lock); 1213 if (cached_state && *cached_state) { 1214 state = *cached_state; 1215 if (state->start <= start && state->end > start && 1216 extent_state_in_tree(state)) { 1217 node = &state->rb_node; 1218 goto hit_next; 1219 } 1220 } 1221 1222 /* 1223 * this search will find all the extents that end after 1224 * our range starts. 1225 */ 1226 node = tree_search_for_insert(tree, start, &p, &parent); 1227 if (!node) { 1228 prealloc = alloc_extent_state_atomic(prealloc); 1229 if (!prealloc) { 1230 err = -ENOMEM; 1231 goto out; 1232 } 1233 err = insert_state(tree, prealloc, start, end, 1234 &p, &parent, &bits, NULL); 1235 if (err) 1236 extent_io_tree_panic(tree, err); 1237 cache_state(prealloc, cached_state); 1238 prealloc = NULL; 1239 goto out; 1240 } 1241 state = rb_entry(node, struct extent_state, rb_node); 1242 hit_next: 1243 last_start = state->start; 1244 last_end = state->end; 1245 1246 /* 1247 * | ---- desired range ---- | 1248 * | state | 1249 * 1250 * Just lock what we found and keep going 1251 */ 1252 if (state->start == start && state->end <= end) { 1253 set_state_bits(tree, state, &bits, NULL); 1254 cache_state(state, cached_state); 1255 state = clear_state_bit(tree, state, &clear_bits, 0, NULL); 1256 if (last_end == (u64)-1) 1257 goto out; 1258 start = last_end + 1; 1259 if (start < end && state && state->start == start && 1260 !need_resched()) 1261 goto hit_next; 1262 goto search_again; 1263 } 1264 1265 /* 1266 * | ---- desired range ---- | 1267 * | state | 1268 * or 1269 * | ------------- state -------------- | 1270 * 1271 * We need to split the extent we found, and may flip bits on 1272 * second half. 1273 * 1274 * If the extent we found extends past our 1275 * range, we just split and search again. It'll get split 1276 * again the next time though. 1277 * 1278 * If the extent we found is inside our range, we set the 1279 * desired bit on it. 1280 */ 1281 if (state->start < start) { 1282 prealloc = alloc_extent_state_atomic(prealloc); 1283 if (!prealloc) { 1284 err = -ENOMEM; 1285 goto out; 1286 } 1287 err = split_state(tree, state, prealloc, start); 1288 if (err) 1289 extent_io_tree_panic(tree, err); 1290 prealloc = NULL; 1291 if (err) 1292 goto out; 1293 if (state->end <= end) { 1294 set_state_bits(tree, state, &bits, NULL); 1295 cache_state(state, cached_state); 1296 state = clear_state_bit(tree, state, &clear_bits, 0, 1297 NULL); 1298 if (last_end == (u64)-1) 1299 goto out; 1300 start = last_end + 1; 1301 if (start < end && state && state->start == start && 1302 !need_resched()) 1303 goto hit_next; 1304 } 1305 goto search_again; 1306 } 1307 /* 1308 * | ---- desired range ---- | 1309 * | state | or | state | 1310 * 1311 * There's a hole, we need to insert something in it and 1312 * ignore the extent we found. 1313 */ 1314 if (state->start > start) { 1315 u64 this_end; 1316 if (end < last_start) 1317 this_end = end; 1318 else 1319 this_end = last_start - 1; 1320 1321 prealloc = alloc_extent_state_atomic(prealloc); 1322 if (!prealloc) { 1323 err = -ENOMEM; 1324 goto out; 1325 } 1326 1327 /* 1328 * Avoid to free 'prealloc' if it can be merged with 1329 * the later extent. 
1330 */ 1331 err = insert_state(tree, prealloc, start, this_end, 1332 NULL, NULL, &bits, NULL); 1333 if (err) 1334 extent_io_tree_panic(tree, err); 1335 cache_state(prealloc, cached_state); 1336 prealloc = NULL; 1337 start = this_end + 1; 1338 goto search_again; 1339 } 1340 /* 1341 * | ---- desired range ---- | 1342 * | state | 1343 * We need to split the extent, and set the bit 1344 * on the first half 1345 */ 1346 if (state->start <= end && state->end > end) { 1347 prealloc = alloc_extent_state_atomic(prealloc); 1348 if (!prealloc) { 1349 err = -ENOMEM; 1350 goto out; 1351 } 1352 1353 err = split_state(tree, state, prealloc, end + 1); 1354 if (err) 1355 extent_io_tree_panic(tree, err); 1356 1357 set_state_bits(tree, prealloc, &bits, NULL); 1358 cache_state(prealloc, cached_state); 1359 clear_state_bit(tree, prealloc, &clear_bits, 0, NULL); 1360 prealloc = NULL; 1361 goto out; 1362 } 1363 1364 search_again: 1365 if (start > end) 1366 goto out; 1367 spin_unlock(&tree->lock); 1368 cond_resched(); 1369 first_iteration = false; 1370 goto again; 1371 1372 out: 1373 spin_unlock(&tree->lock); 1374 if (prealloc) 1375 free_extent_state(prealloc); 1376 1377 return err; 1378 } 1379 1380 /* wrappers around set/clear extent bit */ 1381 int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1382 unsigned bits, struct extent_changeset *changeset) 1383 { 1384 /* 1385 * We don't support EXTENT_LOCKED yet, as current changeset will 1386 * record any bits changed, so for EXTENT_LOCKED case, it will 1387 * either fail with -EEXIST or changeset will record the whole 1388 * range. 1389 */ 1390 BUG_ON(bits & EXTENT_LOCKED); 1391 1392 return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS, 1393 changeset); 1394 } 1395 1396 int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end, 1397 unsigned bits) 1398 { 1399 return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, 1400 GFP_NOWAIT, NULL); 1401 } 1402 1403 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 1404 unsigned bits, int wake, int delete, 1405 struct extent_state **cached) 1406 { 1407 return __clear_extent_bit(tree, start, end, bits, wake, delete, 1408 cached, GFP_NOFS, NULL); 1409 } 1410 1411 int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1412 unsigned bits, struct extent_changeset *changeset) 1413 { 1414 /* 1415 * Don't support EXTENT_LOCKED case, same reason as 1416 * set_record_extent_bits(). 1417 */ 1418 BUG_ON(bits & EXTENT_LOCKED); 1419 1420 return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS, 1421 changeset); 1422 } 1423 1424 /* 1425 * either insert or lock state struct between start and end use mask to tell 1426 * us if waiting is desired. 
1427 */ 1428 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1429 struct extent_state **cached_state) 1430 { 1431 int err; 1432 u64 failed_start; 1433 1434 while (1) { 1435 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, 1436 EXTENT_LOCKED, &failed_start, 1437 cached_state, GFP_NOFS, NULL); 1438 if (err == -EEXIST) { 1439 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); 1440 start = failed_start; 1441 } else 1442 break; 1443 WARN_ON(start > end); 1444 } 1445 return err; 1446 } 1447 1448 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) 1449 { 1450 int err; 1451 u64 failed_start; 1452 1453 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, 1454 &failed_start, NULL, GFP_NOFS, NULL); 1455 if (err == -EEXIST) { 1456 if (failed_start > start) 1457 clear_extent_bit(tree, start, failed_start - 1, 1458 EXTENT_LOCKED, 1, 0, NULL); 1459 return 0; 1460 } 1461 return 1; 1462 } 1463 1464 void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) 1465 { 1466 unsigned long index = start >> PAGE_SHIFT; 1467 unsigned long end_index = end >> PAGE_SHIFT; 1468 struct page *page; 1469 1470 while (index <= end_index) { 1471 page = find_get_page(inode->i_mapping, index); 1472 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1473 clear_page_dirty_for_io(page); 1474 put_page(page); 1475 index++; 1476 } 1477 } 1478 1479 void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) 1480 { 1481 unsigned long index = start >> PAGE_SHIFT; 1482 unsigned long end_index = end >> PAGE_SHIFT; 1483 struct page *page; 1484 1485 while (index <= end_index) { 1486 page = find_get_page(inode->i_mapping, index); 1487 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1488 __set_page_dirty_nobuffers(page); 1489 account_page_redirty(page); 1490 put_page(page); 1491 index++; 1492 } 1493 } 1494 1495 /* find the first state struct with 'bits' set after 'start', and 1496 * return it. tree->lock must be held. NULL will returned if 1497 * nothing was found after 'start' 1498 */ 1499 static struct extent_state * 1500 find_first_extent_bit_state(struct extent_io_tree *tree, 1501 u64 start, unsigned bits) 1502 { 1503 struct rb_node *node; 1504 struct extent_state *state; 1505 1506 /* 1507 * this search will find all the extents that end after 1508 * our range starts. 1509 */ 1510 node = tree_search(tree, start); 1511 if (!node) 1512 goto out; 1513 1514 while (1) { 1515 state = rb_entry(node, struct extent_state, rb_node); 1516 if (state->end >= start && (state->state & bits)) 1517 return state; 1518 1519 node = rb_next(node); 1520 if (!node) 1521 break; 1522 } 1523 out: 1524 return NULL; 1525 } 1526 1527 /* 1528 * find the first offset in the io tree with 'bits' set. zero is 1529 * returned if we find something, and *start_ret and *end_ret are 1530 * set to reflect the state struct that was found. 1531 * 1532 * If nothing was found, 1 is returned. If found something, return 0. 
1533 */ 1534 int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 1535 u64 *start_ret, u64 *end_ret, unsigned bits, 1536 struct extent_state **cached_state) 1537 { 1538 struct extent_state *state; 1539 int ret = 1; 1540 1541 spin_lock(&tree->lock); 1542 if (cached_state && *cached_state) { 1543 state = *cached_state; 1544 if (state->end == start - 1 && extent_state_in_tree(state)) { 1545 while ((state = next_state(state)) != NULL) { 1546 if (state->state & bits) 1547 goto got_it; 1548 } 1549 free_extent_state(*cached_state); 1550 *cached_state = NULL; 1551 goto out; 1552 } 1553 free_extent_state(*cached_state); 1554 *cached_state = NULL; 1555 } 1556 1557 state = find_first_extent_bit_state(tree, start, bits); 1558 got_it: 1559 if (state) { 1560 cache_state_if_flags(state, cached_state, 0); 1561 *start_ret = state->start; 1562 *end_ret = state->end; 1563 ret = 0; 1564 } 1565 out: 1566 spin_unlock(&tree->lock); 1567 return ret; 1568 } 1569 1570 /** 1571 * find_first_clear_extent_bit - find the first range that has @bits not set. 1572 * This range could start before @start. 1573 * 1574 * @tree - the tree to search 1575 * @start - the offset at/after which the found extent should start 1576 * @start_ret - records the beginning of the range 1577 * @end_ret - records the end of the range (inclusive) 1578 * @bits - the set of bits which must be unset 1579 * 1580 * Since unallocated range is also considered one which doesn't have the bits 1581 * set it's possible that @end_ret contains -1, this happens in case the range 1582 * spans (last_range_end, end of device]. In this case it's up to the caller to 1583 * trim @end_ret to the appropriate size. 1584 */ 1585 void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, 1586 u64 *start_ret, u64 *end_ret, unsigned bits) 1587 { 1588 struct extent_state *state; 1589 struct rb_node *node, *prev = NULL, *next; 1590 1591 spin_lock(&tree->lock); 1592 1593 /* Find first extent with bits cleared */ 1594 while (1) { 1595 node = __etree_search(tree, start, &next, &prev, NULL, NULL); 1596 if (!node && !next && !prev) { 1597 /* 1598 * Tree is completely empty, send full range and let 1599 * caller deal with it 1600 */ 1601 *start_ret = 0; 1602 *end_ret = -1; 1603 goto out; 1604 } else if (!node && !next) { 1605 /* 1606 * We are past the last allocated chunk, set start at 1607 * the end of the last extent. 
1608 */ 1609 state = rb_entry(prev, struct extent_state, rb_node); 1610 *start_ret = state->end + 1; 1611 *end_ret = -1; 1612 goto out; 1613 } else if (!node) { 1614 node = next; 1615 } 1616 /* 1617 * At this point 'node' either contains 'start' or start is 1618 * before 'node' 1619 */ 1620 state = rb_entry(node, struct extent_state, rb_node); 1621 1622 if (in_range(start, state->start, state->end - state->start + 1)) { 1623 if (state->state & bits) { 1624 /* 1625 * |--range with bits sets--| 1626 * | 1627 * start 1628 */ 1629 start = state->end + 1; 1630 } else { 1631 /* 1632 * 'start' falls within a range that doesn't 1633 * have the bits set, so take its start as 1634 * the beginning of the desired range 1635 * 1636 * |--range with bits cleared----| 1637 * | 1638 * start 1639 */ 1640 *start_ret = state->start; 1641 break; 1642 } 1643 } else { 1644 /* 1645 * |---prev range---|---hole/unset---|---node range---| 1646 * | 1647 * start 1648 * 1649 * or 1650 * 1651 * |---hole/unset--||--first node--| 1652 * 0 | 1653 * start 1654 */ 1655 if (prev) { 1656 state = rb_entry(prev, struct extent_state, 1657 rb_node); 1658 *start_ret = state->end + 1; 1659 } else { 1660 *start_ret = 0; 1661 } 1662 break; 1663 } 1664 } 1665 1666 /* 1667 * Find the longest stretch from start until an entry which has the 1668 * bits set 1669 */ 1670 while (1) { 1671 state = rb_entry(node, struct extent_state, rb_node); 1672 if (state->end >= start && !(state->state & bits)) { 1673 *end_ret = state->end; 1674 } else { 1675 *end_ret = state->start - 1; 1676 break; 1677 } 1678 1679 node = rb_next(node); 1680 if (!node) 1681 break; 1682 } 1683 out: 1684 spin_unlock(&tree->lock); 1685 } 1686 1687 /* 1688 * find a contiguous range of bytes in the file marked as delalloc, not 1689 * more than 'max_bytes'. start and end are used to return the range, 1690 * 1691 * true is returned if we find something, false if nothing was in the tree 1692 */ 1693 bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, 1694 u64 *end, u64 max_bytes, 1695 struct extent_state **cached_state) 1696 { 1697 struct rb_node *node; 1698 struct extent_state *state; 1699 u64 cur_start = *start; 1700 bool found = false; 1701 u64 total_bytes = 0; 1702 1703 spin_lock(&tree->lock); 1704 1705 /* 1706 * this search will find all the extents that end after 1707 * our range starts. 
1708 */ 1709 node = tree_search(tree, cur_start); 1710 if (!node) { 1711 *end = (u64)-1; 1712 goto out; 1713 } 1714 1715 while (1) { 1716 state = rb_entry(node, struct extent_state, rb_node); 1717 if (found && (state->start != cur_start || 1718 (state->state & EXTENT_BOUNDARY))) { 1719 goto out; 1720 } 1721 if (!(state->state & EXTENT_DELALLOC)) { 1722 if (!found) 1723 *end = state->end; 1724 goto out; 1725 } 1726 if (!found) { 1727 *start = state->start; 1728 *cached_state = state; 1729 refcount_inc(&state->refs); 1730 } 1731 found = true; 1732 *end = state->end; 1733 cur_start = state->end + 1; 1734 node = rb_next(node); 1735 total_bytes += state->end - state->start + 1; 1736 if (total_bytes >= max_bytes) 1737 break; 1738 if (!node) 1739 break; 1740 } 1741 out: 1742 spin_unlock(&tree->lock); 1743 return found; 1744 } 1745 1746 static int __process_pages_contig(struct address_space *mapping, 1747 struct page *locked_page, 1748 pgoff_t start_index, pgoff_t end_index, 1749 unsigned long page_ops, pgoff_t *index_ret); 1750 1751 static noinline void __unlock_for_delalloc(struct inode *inode, 1752 struct page *locked_page, 1753 u64 start, u64 end) 1754 { 1755 unsigned long index = start >> PAGE_SHIFT; 1756 unsigned long end_index = end >> PAGE_SHIFT; 1757 1758 ASSERT(locked_page); 1759 if (index == locked_page->index && end_index == index) 1760 return; 1761 1762 __process_pages_contig(inode->i_mapping, locked_page, index, end_index, 1763 PAGE_UNLOCK, NULL); 1764 } 1765 1766 static noinline int lock_delalloc_pages(struct inode *inode, 1767 struct page *locked_page, 1768 u64 delalloc_start, 1769 u64 delalloc_end) 1770 { 1771 unsigned long index = delalloc_start >> PAGE_SHIFT; 1772 unsigned long index_ret = index; 1773 unsigned long end_index = delalloc_end >> PAGE_SHIFT; 1774 int ret; 1775 1776 ASSERT(locked_page); 1777 if (index == locked_page->index && index == end_index) 1778 return 0; 1779 1780 ret = __process_pages_contig(inode->i_mapping, locked_page, index, 1781 end_index, PAGE_LOCK, &index_ret); 1782 if (ret == -EAGAIN) 1783 __unlock_for_delalloc(inode, locked_page, delalloc_start, 1784 (u64)index_ret << PAGE_SHIFT); 1785 return ret; 1786 } 1787 1788 /* 1789 * Find and lock a contiguous range of bytes in the file marked as delalloc, no 1790 * more than @max_bytes. @Start and @end are used to return the range, 1791 * 1792 * Return: true if we find something 1793 * false if nothing was in the tree 1794 */ 1795 EXPORT_FOR_TESTS 1796 noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, 1797 struct page *locked_page, u64 *start, 1798 u64 *end) 1799 { 1800 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 1801 u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; 1802 u64 delalloc_start; 1803 u64 delalloc_end; 1804 bool found; 1805 struct extent_state *cached_state = NULL; 1806 int ret; 1807 int loops = 0; 1808 1809 again: 1810 /* step one, find a bunch of delalloc bytes starting at start */ 1811 delalloc_start = *start; 1812 delalloc_end = 0; 1813 found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end, 1814 max_bytes, &cached_state); 1815 if (!found || delalloc_end <= *start) { 1816 *start = delalloc_start; 1817 *end = delalloc_end; 1818 free_extent_state(cached_state); 1819 return false; 1820 } 1821 1822 /* 1823 * start comes from the offset of locked_page. 
We have to lock 1824 * pages in order, so we can't process delalloc bytes before 1825 * locked_page 1826 */ 1827 if (delalloc_start < *start) 1828 delalloc_start = *start; 1829 1830 /* 1831 * make sure to limit the number of pages we try to lock down 1832 */ 1833 if (delalloc_end + 1 - delalloc_start > max_bytes) 1834 delalloc_end = delalloc_start + max_bytes - 1; 1835 1836 /* step two, lock all the pages after the page that has start */ 1837 ret = lock_delalloc_pages(inode, locked_page, 1838 delalloc_start, delalloc_end); 1839 ASSERT(!ret || ret == -EAGAIN); 1840 if (ret == -EAGAIN) { 1841 /* some of the pages are gone, lets avoid looping by 1842 * shortening the size of the delalloc range we're searching 1843 */ 1844 free_extent_state(cached_state); 1845 cached_state = NULL; 1846 if (!loops) { 1847 max_bytes = PAGE_SIZE; 1848 loops = 1; 1849 goto again; 1850 } else { 1851 found = false; 1852 goto out_failed; 1853 } 1854 } 1855 1856 /* step three, lock the state bits for the whole range */ 1857 lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state); 1858 1859 /* then test to make sure it is all still delalloc */ 1860 ret = test_range_bit(tree, delalloc_start, delalloc_end, 1861 EXTENT_DELALLOC, 1, cached_state); 1862 if (!ret) { 1863 unlock_extent_cached(tree, delalloc_start, delalloc_end, 1864 &cached_state); 1865 __unlock_for_delalloc(inode, locked_page, 1866 delalloc_start, delalloc_end); 1867 cond_resched(); 1868 goto again; 1869 } 1870 free_extent_state(cached_state); 1871 *start = delalloc_start; 1872 *end = delalloc_end; 1873 out_failed: 1874 return found; 1875 } 1876 1877 static int __process_pages_contig(struct address_space *mapping, 1878 struct page *locked_page, 1879 pgoff_t start_index, pgoff_t end_index, 1880 unsigned long page_ops, pgoff_t *index_ret) 1881 { 1882 unsigned long nr_pages = end_index - start_index + 1; 1883 unsigned long pages_locked = 0; 1884 pgoff_t index = start_index; 1885 struct page *pages[16]; 1886 unsigned ret; 1887 int err = 0; 1888 int i; 1889 1890 if (page_ops & PAGE_LOCK) { 1891 ASSERT(page_ops == PAGE_LOCK); 1892 ASSERT(index_ret && *index_ret == start_index); 1893 } 1894 1895 if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0) 1896 mapping_set_error(mapping, -EIO); 1897 1898 while (nr_pages > 0) { 1899 ret = find_get_pages_contig(mapping, index, 1900 min_t(unsigned long, 1901 nr_pages, ARRAY_SIZE(pages)), pages); 1902 if (ret == 0) { 1903 /* 1904 * Only if we're going to lock these pages, 1905 * can we find nothing at @index. 
1906 */ 1907 ASSERT(page_ops & PAGE_LOCK); 1908 err = -EAGAIN; 1909 goto out; 1910 } 1911 1912 for (i = 0; i < ret; i++) { 1913 if (page_ops & PAGE_SET_PRIVATE2) 1914 SetPagePrivate2(pages[i]); 1915 1916 if (locked_page && pages[i] == locked_page) { 1917 put_page(pages[i]); 1918 pages_locked++; 1919 continue; 1920 } 1921 if (page_ops & PAGE_CLEAR_DIRTY) 1922 clear_page_dirty_for_io(pages[i]); 1923 if (page_ops & PAGE_SET_WRITEBACK) 1924 set_page_writeback(pages[i]); 1925 if (page_ops & PAGE_SET_ERROR) 1926 SetPageError(pages[i]); 1927 if (page_ops & PAGE_END_WRITEBACK) 1928 end_page_writeback(pages[i]); 1929 if (page_ops & PAGE_UNLOCK) 1930 unlock_page(pages[i]); 1931 if (page_ops & PAGE_LOCK) { 1932 lock_page(pages[i]); 1933 if (!PageDirty(pages[i]) || 1934 pages[i]->mapping != mapping) { 1935 unlock_page(pages[i]); 1936 put_page(pages[i]); 1937 err = -EAGAIN; 1938 goto out; 1939 } 1940 } 1941 put_page(pages[i]); 1942 pages_locked++; 1943 } 1944 nr_pages -= ret; 1945 index += ret; 1946 cond_resched(); 1947 } 1948 out: 1949 if (err && index_ret) 1950 *index_ret = start_index + pages_locked - 1; 1951 return err; 1952 } 1953 1954 void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, 1955 struct page *locked_page, 1956 unsigned clear_bits, 1957 unsigned long page_ops) 1958 { 1959 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0, 1960 NULL); 1961 1962 __process_pages_contig(inode->i_mapping, locked_page, 1963 start >> PAGE_SHIFT, end >> PAGE_SHIFT, 1964 page_ops, NULL); 1965 } 1966 1967 /* 1968 * count the number of bytes in the tree that have a given bit(s) 1969 * set. This can be fairly slow, except for EXTENT_DIRTY which is 1970 * cached. The total number found is returned. 1971 */ 1972 u64 count_range_bits(struct extent_io_tree *tree, 1973 u64 *start, u64 search_end, u64 max_bytes, 1974 unsigned bits, int contig) 1975 { 1976 struct rb_node *node; 1977 struct extent_state *state; 1978 u64 cur_start = *start; 1979 u64 total_bytes = 0; 1980 u64 last = 0; 1981 int found = 0; 1982 1983 if (WARN_ON(search_end <= cur_start)) 1984 return 0; 1985 1986 spin_lock(&tree->lock); 1987 if (cur_start == 0 && bits == EXTENT_DIRTY) { 1988 total_bytes = tree->dirty_bytes; 1989 goto out; 1990 } 1991 /* 1992 * this search will find all the extents that end after 1993 * our range starts. 1994 */ 1995 node = tree_search(tree, cur_start); 1996 if (!node) 1997 goto out; 1998 1999 while (1) { 2000 state = rb_entry(node, struct extent_state, rb_node); 2001 if (state->start > search_end) 2002 break; 2003 if (contig && found && state->start > last + 1) 2004 break; 2005 if (state->end >= cur_start && (state->state & bits) == bits) { 2006 total_bytes += min(search_end, state->end) + 1 - 2007 max(cur_start, state->start); 2008 if (total_bytes >= max_bytes) 2009 break; 2010 if (!found) { 2011 *start = max(cur_start, state->start); 2012 found = 1; 2013 } 2014 last = state->end; 2015 } else if (contig && found) { 2016 break; 2017 } 2018 node = rb_next(node); 2019 if (!node) 2020 break; 2021 } 2022 out: 2023 spin_unlock(&tree->lock); 2024 return total_bytes; 2025 } 2026 2027 /* 2028 * set the private field for a given byte offset in the tree. If there isn't 2029 * an extent_state there already, this does nothing. 
2030 */ 2031 int set_state_failrec(struct extent_io_tree *tree, u64 start, 2032 struct io_failure_record *failrec) 2033 { 2034 struct rb_node *node; 2035 struct extent_state *state; 2036 int ret = 0; 2037 2038 spin_lock(&tree->lock); 2039 /* 2040 * this search will find all the extents that end after 2041 * our range starts. 2042 */ 2043 node = tree_search(tree, start); 2044 if (!node) { 2045 ret = -ENOENT; 2046 goto out; 2047 } 2048 state = rb_entry(node, struct extent_state, rb_node); 2049 if (state->start != start) { 2050 ret = -ENOENT; 2051 goto out; 2052 } 2053 state->failrec = failrec; 2054 out: 2055 spin_unlock(&tree->lock); 2056 return ret; 2057 } 2058 2059 int get_state_failrec(struct extent_io_tree *tree, u64 start, 2060 struct io_failure_record **failrec) 2061 { 2062 struct rb_node *node; 2063 struct extent_state *state; 2064 int ret = 0; 2065 2066 spin_lock(&tree->lock); 2067 /* 2068 * this search will find all the extents that end after 2069 * our range starts. 2070 */ 2071 node = tree_search(tree, start); 2072 if (!node) { 2073 ret = -ENOENT; 2074 goto out; 2075 } 2076 state = rb_entry(node, struct extent_state, rb_node); 2077 if (state->start != start) { 2078 ret = -ENOENT; 2079 goto out; 2080 } 2081 *failrec = state->failrec; 2082 out: 2083 spin_unlock(&tree->lock); 2084 return ret; 2085 } 2086 2087 /* 2088 * searches a range in the state tree for a given mask. 2089 * If 'filled' == 1, this returns 1 only if every extent in the tree 2090 * has the bits set. Otherwise, 1 is returned if any bit in the 2091 * range is found set. 2092 */ 2093 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 2094 unsigned bits, int filled, struct extent_state *cached) 2095 { 2096 struct extent_state *state = NULL; 2097 struct rb_node *node; 2098 int bitset = 0; 2099 2100 spin_lock(&tree->lock); 2101 if (cached && extent_state_in_tree(cached) && cached->start <= start && 2102 cached->end > start) 2103 node = &cached->rb_node; 2104 else 2105 node = tree_search(tree, start); 2106 while (node && start <= end) { 2107 state = rb_entry(node, struct extent_state, rb_node); 2108 2109 if (filled && state->start > start) { 2110 bitset = 0; 2111 break; 2112 } 2113 2114 if (state->start > end) 2115 break; 2116 2117 if (state->state & bits) { 2118 bitset = 1; 2119 if (!filled) 2120 break; 2121 } else if (filled) { 2122 bitset = 0; 2123 break; 2124 } 2125 2126 if (state->end == (u64)-1) 2127 break; 2128 2129 start = state->end + 1; 2130 if (start > end) 2131 break; 2132 node = rb_next(node); 2133 if (!node) { 2134 if (filled) 2135 bitset = 0; 2136 break; 2137 } 2138 } 2139 spin_unlock(&tree->lock); 2140 return bitset; 2141 } 2142 2143 /* 2144 * helper function to set a given page up to date if all the 2145 * extents in the tree for that page are up to date 2146 */ 2147 static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) 2148 { 2149 u64 start = page_offset(page); 2150 u64 end = start + PAGE_SIZE - 1; 2151 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) 2152 SetPageUptodate(page); 2153 } 2154 2155 int free_io_failure(struct extent_io_tree *failure_tree, 2156 struct extent_io_tree *io_tree, 2157 struct io_failure_record *rec) 2158 { 2159 int ret; 2160 int err = 0; 2161 2162 set_state_failrec(failure_tree, rec->start, NULL); 2163 ret = clear_extent_bits(failure_tree, rec->start, 2164 rec->start + rec->len - 1, 2165 EXTENT_LOCKED | EXTENT_DIRTY); 2166 if (ret) 2167 err = ret; 2168 2169 ret = clear_extent_bits(io_tree, rec->start, 2170 rec->start + 
rec->len - 1, 2171 EXTENT_DAMAGED); 2172 if (ret && !err) 2173 err = ret; 2174 2175 kfree(rec); 2176 return err; 2177 } 2178 2179 /* 2180 * this bypasses the standard btrfs submit functions deliberately, as 2181 * the standard behavior is to write all copies in a raid setup. here we only 2182 * want to write the one bad copy. so we do the mapping for ourselves and issue 2183 * submit_bio directly. 2184 * to avoid any synchronization issues, wait for the data after writing, which 2185 * actually prevents the read that triggered the error from finishing. 2186 * currently, there can be no more than two copies of every data bit. thus, 2187 * exactly one rewrite is required. 2188 */ 2189 int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, 2190 u64 length, u64 logical, struct page *page, 2191 unsigned int pg_offset, int mirror_num) 2192 { 2193 struct bio *bio; 2194 struct btrfs_device *dev; 2195 u64 map_length = 0; 2196 u64 sector; 2197 struct btrfs_bio *bbio = NULL; 2198 int ret; 2199 2200 ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); 2201 BUG_ON(!mirror_num); 2202 2203 bio = btrfs_io_bio_alloc(1); 2204 bio->bi_iter.bi_size = 0; 2205 map_length = length; 2206 2207 /* 2208 * Avoid races with device replace and make sure our bbio has devices 2209 * associated to its stripes that don't go away while we are doing the 2210 * read repair operation. 2211 */ 2212 btrfs_bio_counter_inc_blocked(fs_info); 2213 if (btrfs_is_parity_mirror(fs_info, logical, length)) { 2214 /* 2215 * Note that we don't use BTRFS_MAP_WRITE because it's supposed 2216 * to update all raid stripes, but here we just want to correct 2217 * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad 2218 * stripe's dev and sector. 2219 */ 2220 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, 2221 &map_length, &bbio, 0); 2222 if (ret) { 2223 btrfs_bio_counter_dec(fs_info); 2224 bio_put(bio); 2225 return -EIO; 2226 } 2227 ASSERT(bbio->mirror_num == 1); 2228 } else { 2229 ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, 2230 &map_length, &bbio, mirror_num); 2231 if (ret) { 2232 btrfs_bio_counter_dec(fs_info); 2233 bio_put(bio); 2234 return -EIO; 2235 } 2236 BUG_ON(mirror_num != bbio->mirror_num); 2237 } 2238 2239 sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9; 2240 bio->bi_iter.bi_sector = sector; 2241 dev = bbio->stripes[bbio->mirror_num - 1].dev; 2242 btrfs_put_bbio(bbio); 2243 if (!dev || !dev->bdev || 2244 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { 2245 btrfs_bio_counter_dec(fs_info); 2246 bio_put(bio); 2247 return -EIO; 2248 } 2249 bio_set_dev(bio, dev->bdev); 2250 bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; 2251 bio_add_page(bio, page, length, pg_offset); 2252 2253 if (btrfsic_submit_bio_wait(bio)) { 2254 /* try to remap that extent elsewhere? 
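 * For now we just record a write error on the device and return -EIO,
 * so the caller treats this repair attempt as failed.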
*/ 2255 btrfs_bio_counter_dec(fs_info); 2256 bio_put(bio); 2257 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); 2258 return -EIO; 2259 } 2260 2261 btrfs_info_rl_in_rcu(fs_info, 2262 "read error corrected: ino %llu off %llu (dev %s sector %llu)", 2263 ino, start, 2264 rcu_str_deref(dev->name), sector); 2265 btrfs_bio_counter_dec(fs_info); 2266 bio_put(bio); 2267 return 0; 2268 } 2269 2270 int btrfs_repair_eb_io_failure(struct extent_buffer *eb, int mirror_num) 2271 { 2272 struct btrfs_fs_info *fs_info = eb->fs_info; 2273 u64 start = eb->start; 2274 int i, num_pages = num_extent_pages(eb); 2275 int ret = 0; 2276 2277 if (sb_rdonly(fs_info->sb)) 2278 return -EROFS; 2279 2280 for (i = 0; i < num_pages; i++) { 2281 struct page *p = eb->pages[i]; 2282 2283 ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p, 2284 start - page_offset(p), mirror_num); 2285 if (ret) 2286 break; 2287 start += PAGE_SIZE; 2288 } 2289 2290 return ret; 2291 } 2292 2293 /* 2294 * each time an IO finishes, we do a fast check in the IO failure tree 2295 * to see if we need to process or clean up an io_failure_record 2296 */ 2297 int clean_io_failure(struct btrfs_fs_info *fs_info, 2298 struct extent_io_tree *failure_tree, 2299 struct extent_io_tree *io_tree, u64 start, 2300 struct page *page, u64 ino, unsigned int pg_offset) 2301 { 2302 u64 private; 2303 struct io_failure_record *failrec; 2304 struct extent_state *state; 2305 int num_copies; 2306 int ret; 2307 2308 private = 0; 2309 ret = count_range_bits(failure_tree, &private, (u64)-1, 1, 2310 EXTENT_DIRTY, 0); 2311 if (!ret) 2312 return 0; 2313 2314 ret = get_state_failrec(failure_tree, start, &failrec); 2315 if (ret) 2316 return 0; 2317 2318 BUG_ON(!failrec->this_mirror); 2319 2320 if (failrec->in_validation) { 2321 /* there was no real error, just free the record */ 2322 btrfs_debug(fs_info, 2323 "clean_io_failure: freeing dummy error at %llu", 2324 failrec->start); 2325 goto out; 2326 } 2327 if (sb_rdonly(fs_info->sb)) 2328 goto out; 2329 2330 spin_lock(&io_tree->lock); 2331 state = find_first_extent_bit_state(io_tree, 2332 failrec->start, 2333 EXTENT_LOCKED); 2334 spin_unlock(&io_tree->lock); 2335 2336 if (state && state->start <= failrec->start && 2337 state->end >= failrec->start + failrec->len - 1) { 2338 num_copies = btrfs_num_copies(fs_info, failrec->logical, 2339 failrec->len); 2340 if (num_copies > 1) { 2341 repair_io_failure(fs_info, ino, start, failrec->len, 2342 failrec->logical, page, pg_offset, 2343 failrec->failed_mirror); 2344 } 2345 } 2346 2347 out: 2348 free_io_failure(failure_tree, io_tree, failrec); 2349 2350 return 0; 2351 } 2352 2353 /* 2354 * Can be called when 2355 * - hold extent lock 2356 * - under ordered extent 2357 * - the inode is freeing 2358 */ 2359 void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end) 2360 { 2361 struct extent_io_tree *failure_tree = &inode->io_failure_tree; 2362 struct io_failure_record *failrec; 2363 struct extent_state *state, *next; 2364 2365 if (RB_EMPTY_ROOT(&failure_tree->state)) 2366 return; 2367 2368 spin_lock(&failure_tree->lock); 2369 state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY); 2370 while (state) { 2371 if (state->start > end) 2372 break; 2373 2374 ASSERT(state->end <= end); 2375 2376 next = next_state(state); 2377 2378 failrec = state->failrec; 2379 free_extent_state(state); 2380 kfree(failrec); 2381 2382 state = next; 2383 } 2384 spin_unlock(&failure_tree->lock); 2385 } 2386 2387 int btrfs_get_io_failure_record(struct inode 
*inode, u64 start, u64 end, 2388 struct io_failure_record **failrec_ret) 2389 { 2390 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2391 struct io_failure_record *failrec; 2392 struct extent_map *em; 2393 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 2394 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 2395 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 2396 int ret; 2397 u64 logical; 2398 2399 ret = get_state_failrec(failure_tree, start, &failrec); 2400 if (ret) { 2401 failrec = kzalloc(sizeof(*failrec), GFP_NOFS); 2402 if (!failrec) 2403 return -ENOMEM; 2404 2405 failrec->start = start; 2406 failrec->len = end - start + 1; 2407 failrec->this_mirror = 0; 2408 failrec->bio_flags = 0; 2409 failrec->in_validation = 0; 2410 2411 read_lock(&em_tree->lock); 2412 em = lookup_extent_mapping(em_tree, start, failrec->len); 2413 if (!em) { 2414 read_unlock(&em_tree->lock); 2415 kfree(failrec); 2416 return -EIO; 2417 } 2418 2419 if (em->start > start || em->start + em->len <= start) { 2420 free_extent_map(em); 2421 em = NULL; 2422 } 2423 read_unlock(&em_tree->lock); 2424 if (!em) { 2425 kfree(failrec); 2426 return -EIO; 2427 } 2428 2429 logical = start - em->start; 2430 logical = em->block_start + logical; 2431 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 2432 logical = em->block_start; 2433 failrec->bio_flags = EXTENT_BIO_COMPRESSED; 2434 extent_set_compress_type(&failrec->bio_flags, 2435 em->compress_type); 2436 } 2437 2438 btrfs_debug(fs_info, 2439 "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu", 2440 logical, start, failrec->len); 2441 2442 failrec->logical = logical; 2443 free_extent_map(em); 2444 2445 /* set the bits in the private failure tree */ 2446 ret = set_extent_bits(failure_tree, start, end, 2447 EXTENT_LOCKED | EXTENT_DIRTY); 2448 if (ret >= 0) 2449 ret = set_state_failrec(failure_tree, start, failrec); 2450 /* set the bits in the inode's tree */ 2451 if (ret >= 0) 2452 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED); 2453 if (ret < 0) { 2454 kfree(failrec); 2455 return ret; 2456 } 2457 } else { 2458 btrfs_debug(fs_info, 2459 "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d", 2460 failrec->logical, failrec->start, failrec->len, 2461 failrec->in_validation); 2462 /* 2463 * when data can be on disk more than twice, add to failrec here 2464 * (e.g. with a list for failed_mirror) to make 2465 * clean_io_failure() clean all those errors at once. 2466 */ 2467 } 2468 2469 *failrec_ret = failrec; 2470 2471 return 0; 2472 } 2473 2474 bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages, 2475 struct io_failure_record *failrec, int failed_mirror) 2476 { 2477 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2478 int num_copies; 2479 2480 num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len); 2481 if (num_copies == 1) { 2482 /* 2483 * we only have a single copy of the data, so don't bother with 2484 * all the retry and error correction code that follows. no 2485 * matter what the error is, it is very likely to persist. 
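 * Returning false here makes the caller (bio_readpage_error) free the
 * io_failure_record and fail the read with -EIO.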
2486 */ 2487 btrfs_debug(fs_info, 2488 "Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d", 2489 num_copies, failrec->this_mirror, failed_mirror); 2490 return false; 2491 } 2492 2493 /* 2494 * there are two premises: 2495 * a) deliver good data to the caller 2496 * b) correct the bad sectors on disk 2497 */ 2498 if (failed_bio_pages > 1) { 2499 /* 2500 * to fulfill b), we need to know the exact failing sectors, as 2501 * we don't want to rewrite any more than the failed ones. thus, 2502 * we need separate read requests for the failed bio 2503 * 2504 * if the following BUG_ON triggers, our validation request got 2505 * merged. we need separate requests for our algorithm to work. 2506 */ 2507 BUG_ON(failrec->in_validation); 2508 failrec->in_validation = 1; 2509 failrec->this_mirror = failed_mirror; 2510 } else { 2511 /* 2512 * we're ready to fulfill a) and b) alongside. get a good copy 2513 * of the failed sector and if we succeed, we have setup 2514 * everything for repair_io_failure to do the rest for us. 2515 */ 2516 if (failrec->in_validation) { 2517 BUG_ON(failrec->this_mirror != failed_mirror); 2518 failrec->in_validation = 0; 2519 failrec->this_mirror = 0; 2520 } 2521 failrec->failed_mirror = failed_mirror; 2522 failrec->this_mirror++; 2523 if (failrec->this_mirror == failed_mirror) 2524 failrec->this_mirror++; 2525 } 2526 2527 if (failrec->this_mirror > num_copies) { 2528 btrfs_debug(fs_info, 2529 "Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d", 2530 num_copies, failrec->this_mirror, failed_mirror); 2531 return false; 2532 } 2533 2534 return true; 2535 } 2536 2537 2538 struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, 2539 struct io_failure_record *failrec, 2540 struct page *page, int pg_offset, int icsum, 2541 bio_end_io_t *endio_func, void *data) 2542 { 2543 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2544 struct bio *bio; 2545 struct btrfs_io_bio *btrfs_failed_bio; 2546 struct btrfs_io_bio *btrfs_bio; 2547 2548 bio = btrfs_io_bio_alloc(1); 2549 bio->bi_end_io = endio_func; 2550 bio->bi_iter.bi_sector = failrec->logical >> 9; 2551 bio->bi_iter.bi_size = 0; 2552 bio->bi_private = data; 2553 2554 btrfs_failed_bio = btrfs_io_bio(failed_bio); 2555 if (btrfs_failed_bio->csum) { 2556 u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); 2557 2558 btrfs_bio = btrfs_io_bio(bio); 2559 btrfs_bio->csum = btrfs_bio->csum_inline; 2560 icsum *= csum_size; 2561 memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum, 2562 csum_size); 2563 } 2564 2565 bio_add_page(bio, page, failrec->len, pg_offset); 2566 2567 return bio; 2568 } 2569 2570 /* 2571 * This is a generic handler for readpage errors. If other copies exist, read 2572 * those and write back good data to the failed position. 
Does not investigate 2573 * in remapping the failed extent elsewhere, hoping the device will be smart 2574 * enough to do this as needed 2575 */ 2576 static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, 2577 struct page *page, u64 start, u64 end, 2578 int failed_mirror) 2579 { 2580 struct io_failure_record *failrec; 2581 struct inode *inode = page->mapping->host; 2582 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 2583 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 2584 struct bio *bio; 2585 int read_mode = 0; 2586 blk_status_t status; 2587 int ret; 2588 unsigned failed_bio_pages = failed_bio->bi_iter.bi_size >> PAGE_SHIFT; 2589 2590 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); 2591 2592 ret = btrfs_get_io_failure_record(inode, start, end, &failrec); 2593 if (ret) 2594 return ret; 2595 2596 if (!btrfs_check_repairable(inode, failed_bio_pages, failrec, 2597 failed_mirror)) { 2598 free_io_failure(failure_tree, tree, failrec); 2599 return -EIO; 2600 } 2601 2602 if (failed_bio_pages > 1) 2603 read_mode |= REQ_FAILFAST_DEV; 2604 2605 phy_offset >>= inode->i_sb->s_blocksize_bits; 2606 bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, 2607 start - page_offset(page), 2608 (int)phy_offset, failed_bio->bi_end_io, 2609 NULL); 2610 bio->bi_opf = REQ_OP_READ | read_mode; 2611 2612 btrfs_debug(btrfs_sb(inode->i_sb), 2613 "Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d", 2614 read_mode, failrec->this_mirror, failrec->in_validation); 2615 2616 status = tree->ops->submit_bio_hook(tree->private_data, bio, failrec->this_mirror, 2617 failrec->bio_flags); 2618 if (status) { 2619 free_io_failure(failure_tree, tree, failrec); 2620 bio_put(bio); 2621 ret = blk_status_to_errno(status); 2622 } 2623 2624 return ret; 2625 } 2626 2627 /* lots and lots of room for performance fixes in the end_bio funcs */ 2628 2629 void end_extent_writepage(struct page *page, int err, u64 start, u64 end) 2630 { 2631 int uptodate = (err == 0); 2632 int ret = 0; 2633 2634 btrfs_writepage_endio_finish_ordered(page, start, end, uptodate); 2635 2636 if (!uptodate) { 2637 ClearPageUptodate(page); 2638 SetPageError(page); 2639 ret = err < 0 ? err : -EIO; 2640 mapping_set_error(page->mapping, ret); 2641 } 2642 } 2643 2644 /* 2645 * after a writepage IO is done, we need to: 2646 * clear the uptodate bits on error 2647 * clear the writeback bits in the extent tree for this IO 2648 * end_page_writeback if the page has no more pending IO 2649 * 2650 * Scheduling is not allowed, so the extent state tree is expected 2651 * to have one and only one object corresponding to this IO. 2652 */ 2653 static void end_bio_extent_writepage(struct bio *bio) 2654 { 2655 int error = blk_status_to_errno(bio->bi_status); 2656 struct bio_vec *bvec; 2657 u64 start; 2658 u64 end; 2659 struct bvec_iter_all iter_all; 2660 2661 ASSERT(!bio_flagged(bio, BIO_CLONED)); 2662 bio_for_each_segment_all(bvec, bio, iter_all) { 2663 struct page *page = bvec->bv_page; 2664 struct inode *inode = page->mapping->host; 2665 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2666 2667 /* We always issue full-page reads, but if some block 2668 * in a page fails to read, blk_update_request() will 2669 * advance bv_offset and adjust bv_len to compensate. 2670 * Print a warning for nonzero offsets, and an error 2671 * if they don't add up to a full page. 
*/ 2672 if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) { 2673 if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE) 2674 btrfs_err(fs_info, 2675 "partial page write in btrfs with offset %u and length %u", 2676 bvec->bv_offset, bvec->bv_len); 2677 else 2678 btrfs_info(fs_info, 2679 "incomplete page write in btrfs with offset %u and length %u", 2680 bvec->bv_offset, bvec->bv_len); 2681 } 2682 2683 start = page_offset(page); 2684 end = start + bvec->bv_offset + bvec->bv_len - 1; 2685 2686 end_extent_writepage(page, error, start, end); 2687 end_page_writeback(page); 2688 } 2689 2690 bio_put(bio); 2691 } 2692 2693 static void 2694 endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len, 2695 int uptodate) 2696 { 2697 struct extent_state *cached = NULL; 2698 u64 end = start + len - 1; 2699 2700 if (uptodate && tree->track_uptodate) 2701 set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC); 2702 unlock_extent_cached_atomic(tree, start, end, &cached); 2703 } 2704 2705 /* 2706 * after a readpage IO is done, we need to: 2707 * clear the uptodate bits on error 2708 * set the uptodate bits if things worked 2709 * set the page up to date if all extents in the tree are uptodate 2710 * clear the lock bit in the extent tree 2711 * unlock the page if there are no other extents locked for it 2712 * 2713 * Scheduling is not allowed, so the extent state tree is expected 2714 * to have one and only one object corresponding to this IO. 2715 */ 2716 static void end_bio_extent_readpage(struct bio *bio) 2717 { 2718 struct bio_vec *bvec; 2719 int uptodate = !bio->bi_status; 2720 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 2721 struct extent_io_tree *tree, *failure_tree; 2722 u64 offset = 0; 2723 u64 start; 2724 u64 end; 2725 u64 len; 2726 u64 extent_start = 0; 2727 u64 extent_len = 0; 2728 int mirror; 2729 int ret; 2730 struct bvec_iter_all iter_all; 2731 2732 ASSERT(!bio_flagged(bio, BIO_CLONED)); 2733 bio_for_each_segment_all(bvec, bio, iter_all) { 2734 struct page *page = bvec->bv_page; 2735 struct inode *inode = page->mapping->host; 2736 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2737 bool data_inode = btrfs_ino(BTRFS_I(inode)) 2738 != BTRFS_BTREE_INODE_OBJECTID; 2739 2740 btrfs_debug(fs_info, 2741 "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", 2742 (u64)bio->bi_iter.bi_sector, bio->bi_status, 2743 io_bio->mirror_num); 2744 tree = &BTRFS_I(inode)->io_tree; 2745 failure_tree = &BTRFS_I(inode)->io_failure_tree; 2746 2747 /* We always issue full-page reads, but if some block 2748 * in a page fails to read, blk_update_request() will 2749 * advance bv_offset and adjust bv_len to compensate. 2750 * Print a warning for nonzero offsets, and an error 2751 * if they don't add up to a full page. 
*/ 2752 if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) { 2753 if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE) 2754 btrfs_err(fs_info, 2755 "partial page read in btrfs with offset %u and length %u", 2756 bvec->bv_offset, bvec->bv_len); 2757 else 2758 btrfs_info(fs_info, 2759 "incomplete page read in btrfs with offset %u and length %u", 2760 bvec->bv_offset, bvec->bv_len); 2761 } 2762 2763 start = page_offset(page); 2764 end = start + bvec->bv_offset + bvec->bv_len - 1; 2765 len = bvec->bv_len; 2766 2767 mirror = io_bio->mirror_num; 2768 if (likely(uptodate)) { 2769 ret = tree->ops->readpage_end_io_hook(io_bio, offset, 2770 page, start, end, 2771 mirror); 2772 if (ret) 2773 uptodate = 0; 2774 else 2775 clean_io_failure(BTRFS_I(inode)->root->fs_info, 2776 failure_tree, tree, start, 2777 page, 2778 btrfs_ino(BTRFS_I(inode)), 0); 2779 } 2780 2781 if (likely(uptodate)) 2782 goto readpage_ok; 2783 2784 if (data_inode) { 2785 2786 /* 2787 * The generic bio_readpage_error handles errors the 2788 * following way: If possible, new read requests are 2789 * created and submitted and will end up in 2790 * end_bio_extent_readpage as well (if we're lucky, 2791 * not in the !uptodate case). In that case it returns 2792 * 0 and we just go on with the next page in our bio. 2793 * If it can't handle the error it will return -EIO and 2794 * we remain responsible for that page. 2795 */ 2796 ret = bio_readpage_error(bio, offset, page, start, end, 2797 mirror); 2798 if (ret == 0) { 2799 uptodate = !bio->bi_status; 2800 offset += len; 2801 continue; 2802 } 2803 } else { 2804 struct extent_buffer *eb; 2805 2806 eb = (struct extent_buffer *)page->private; 2807 set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); 2808 eb->read_mirror = mirror; 2809 atomic_dec(&eb->io_pages); 2810 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, 2811 &eb->bflags)) 2812 btree_readahead_hook(eb, -EIO); 2813 } 2814 readpage_ok: 2815 if (likely(uptodate)) { 2816 loff_t i_size = i_size_read(inode); 2817 pgoff_t end_index = i_size >> PAGE_SHIFT; 2818 unsigned off; 2819 2820 /* Zero out the end if this page straddles i_size */ 2821 off = offset_in_page(i_size); 2822 if (page->index == end_index && off) 2823 zero_user_segment(page, off, PAGE_SIZE); 2824 SetPageUptodate(page); 2825 } else { 2826 ClearPageUptodate(page); 2827 SetPageError(page); 2828 } 2829 unlock_page(page); 2830 offset += len; 2831 2832 if (unlikely(!uptodate)) { 2833 if (extent_len) { 2834 endio_readpage_release_extent(tree, 2835 extent_start, 2836 extent_len, 1); 2837 extent_start = 0; 2838 extent_len = 0; 2839 } 2840 endio_readpage_release_extent(tree, start, 2841 end - start + 1, 0); 2842 } else if (!extent_len) { 2843 extent_start = start; 2844 extent_len = end + 1 - start; 2845 } else if (extent_start + extent_len == start) { 2846 extent_len += end + 1 - start; 2847 } else { 2848 endio_readpage_release_extent(tree, extent_start, 2849 extent_len, uptodate); 2850 extent_start = start; 2851 extent_len = end + 1 - start; 2852 } 2853 } 2854 2855 if (extent_len) 2856 endio_readpage_release_extent(tree, extent_start, extent_len, 2857 uptodate); 2858 btrfs_io_bio_free_csum(io_bio); 2859 bio_put(bio); 2860 } 2861 2862 /* 2863 * Initialize the members up to but not including 'bio'. Use after allocating a 2864 * new bio by bio_alloc_bioset as it does not initialize the bytes outside of 2865 * 'bio' because use of __GFP_ZERO is not supported. 
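 * (Only the btrfs-specific members that live before the embedded
 * struct bio are zeroed by the memset below; the bio itself is
 * initialized by the bioset allocation helpers.)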
2866 */ 2867 static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio) 2868 { 2869 memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio)); 2870 } 2871 2872 /* 2873 * The following helpers allocate a bio. As it's backed by a bioset, it'll 2874 * never fail. We're returning a bio right now but you can call btrfs_io_bio 2875 * for the appropriate container_of magic 2876 */ 2877 struct bio *btrfs_bio_alloc(u64 first_byte) 2878 { 2879 struct bio *bio; 2880 2881 bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset); 2882 bio->bi_iter.bi_sector = first_byte >> 9; 2883 btrfs_io_bio_init(btrfs_io_bio(bio)); 2884 return bio; 2885 } 2886 2887 struct bio *btrfs_bio_clone(struct bio *bio) 2888 { 2889 struct btrfs_io_bio *btrfs_bio; 2890 struct bio *new; 2891 2892 /* Bio allocation backed by a bioset does not fail */ 2893 new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset); 2894 btrfs_bio = btrfs_io_bio(new); 2895 btrfs_io_bio_init(btrfs_bio); 2896 btrfs_bio->iter = bio->bi_iter; 2897 return new; 2898 } 2899 2900 struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs) 2901 { 2902 struct bio *bio; 2903 2904 /* Bio allocation backed by a bioset does not fail */ 2905 bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset); 2906 btrfs_io_bio_init(btrfs_io_bio(bio)); 2907 return bio; 2908 } 2909 2910 struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) 2911 { 2912 struct bio *bio; 2913 struct btrfs_io_bio *btrfs_bio; 2914 2915 /* this will never fail when it's backed by a bioset */ 2916 bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset); 2917 ASSERT(bio); 2918 2919 btrfs_bio = btrfs_io_bio(bio); 2920 btrfs_io_bio_init(btrfs_bio); 2921 2922 bio_trim(bio, offset >> 9, size >> 9); 2923 btrfs_bio->iter = bio->bi_iter; 2924 return bio; 2925 } 2926 2927 /* 2928 * @opf: bio REQ_OP_* and REQ_* flags as one value 2929 * @tree: tree so we can call our merge_bio hook 2930 * @wbc: optional writeback control for io accounting 2931 * @page: page to add to the bio 2932 * @pg_offset: offset of the new bio or to check whether we are adding 2933 * a contiguous page to the previous one 2934 * @size: portion of page that we want to write 2935 * @offset: starting offset in the page 2936 * @bio_ret: must be valid pointer, newly allocated bio will be stored there 2937 * @end_io_func: end_io callback for new bio 2938 * @mirror_num: desired mirror to read/write 2939 * @prev_bio_flags: flags of previous bio to see if we can merge the current one 2940 * @bio_flags: flags of the current bio to see if we can merge them 2941 */ 2942 static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, 2943 struct writeback_control *wbc, 2944 struct page *page, u64 offset, 2945 size_t size, unsigned long pg_offset, 2946 struct bio **bio_ret, 2947 bio_end_io_t end_io_func, 2948 int mirror_num, 2949 unsigned long prev_bio_flags, 2950 unsigned long bio_flags, 2951 bool force_bio_submit) 2952 { 2953 int ret = 0; 2954 struct bio *bio; 2955 size_t page_size = min_t(size_t, size, PAGE_SIZE); 2956 sector_t sector = offset >> 9; 2957 2958 ASSERT(bio_ret); 2959 2960 if (*bio_ret) { 2961 bool contig; 2962 bool can_merge = true; 2963 2964 bio = *bio_ret; 2965 if (prev_bio_flags & EXTENT_BIO_COMPRESSED) 2966 contig = bio->bi_iter.bi_sector == sector; 2967 else 2968 contig = bio_end_sector(bio) == sector; 2969 2970 ASSERT(tree->ops); 2971 if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags)) 2972 can_merge = false; 2973 2974 if (prev_bio_flags != bio_flags || !contig || !can_merge || 2975 
force_bio_submit || 2976 bio_add_page(bio, page, page_size, pg_offset) < page_size) { 2977 ret = submit_one_bio(bio, mirror_num, prev_bio_flags); 2978 if (ret < 0) { 2979 *bio_ret = NULL; 2980 return ret; 2981 } 2982 bio = NULL; 2983 } else { 2984 if (wbc) 2985 wbc_account_cgroup_owner(wbc, page, page_size); 2986 return 0; 2987 } 2988 } 2989 2990 bio = btrfs_bio_alloc(offset); 2991 bio_add_page(bio, page, page_size, pg_offset); 2992 bio->bi_end_io = end_io_func; 2993 bio->bi_private = tree; 2994 bio->bi_write_hint = page->mapping->host->i_write_hint; 2995 bio->bi_opf = opf; 2996 if (wbc) { 2997 struct block_device *bdev; 2998 2999 bdev = BTRFS_I(page->mapping->host)->root->fs_info->fs_devices->latest_bdev; 3000 bio_set_dev(bio, bdev); 3001 wbc_init_bio(wbc, bio); 3002 wbc_account_cgroup_owner(wbc, page, page_size); 3003 } 3004 3005 *bio_ret = bio; 3006 3007 return ret; 3008 } 3009 3010 static void attach_extent_buffer_page(struct extent_buffer *eb, 3011 struct page *page) 3012 { 3013 if (!PagePrivate(page)) { 3014 SetPagePrivate(page); 3015 get_page(page); 3016 set_page_private(page, (unsigned long)eb); 3017 } else { 3018 WARN_ON(page->private != (unsigned long)eb); 3019 } 3020 } 3021 3022 void set_page_extent_mapped(struct page *page) 3023 { 3024 if (!PagePrivate(page)) { 3025 SetPagePrivate(page); 3026 get_page(page); 3027 set_page_private(page, EXTENT_PAGE_PRIVATE); 3028 } 3029 } 3030 3031 static struct extent_map * 3032 __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, 3033 u64 start, u64 len, get_extent_t *get_extent, 3034 struct extent_map **em_cached) 3035 { 3036 struct extent_map *em; 3037 3038 if (em_cached && *em_cached) { 3039 em = *em_cached; 3040 if (extent_map_in_tree(em) && start >= em->start && 3041 start < extent_map_end(em)) { 3042 refcount_inc(&em->refs); 3043 return em; 3044 } 3045 3046 free_extent_map(em); 3047 *em_cached = NULL; 3048 } 3049 3050 em = get_extent(BTRFS_I(inode), page, pg_offset, start, len); 3051 if (em_cached && !IS_ERR_OR_NULL(em)) { 3052 BUG_ON(*em_cached); 3053 refcount_inc(&em->refs); 3054 *em_cached = em; 3055 } 3056 return em; 3057 } 3058 /* 3059 * basic readpage implementation. 
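 * For each blocksize-aligned chunk of the page we look up the extent
 * map, zero holes and the area past i_size directly, and queue reads
 * for on-disk extents via submit_extent_page(); compressed extents are
 * tagged with EXTENT_BIO_COMPRESSED in the bio flags so the submission
 * path can handle decompression.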
Locked extent state structs are inserted 3060 * into the tree that are removed when the IO is done (by the end_io 3061 * handlers) 3062 * XXX JDM: This needs looking at to ensure proper page locking 3063 * return 0 on success, otherwise return error 3064 */ 3065 static int __do_readpage(struct extent_io_tree *tree, 3066 struct page *page, 3067 get_extent_t *get_extent, 3068 struct extent_map **em_cached, 3069 struct bio **bio, int mirror_num, 3070 unsigned long *bio_flags, unsigned int read_flags, 3071 u64 *prev_em_start) 3072 { 3073 struct inode *inode = page->mapping->host; 3074 u64 start = page_offset(page); 3075 const u64 end = start + PAGE_SIZE - 1; 3076 u64 cur = start; 3077 u64 extent_offset; 3078 u64 last_byte = i_size_read(inode); 3079 u64 block_start; 3080 u64 cur_end; 3081 struct extent_map *em; 3082 int ret = 0; 3083 int nr = 0; 3084 size_t pg_offset = 0; 3085 size_t iosize; 3086 size_t disk_io_size; 3087 size_t blocksize = inode->i_sb->s_blocksize; 3088 unsigned long this_bio_flag = 0; 3089 3090 set_page_extent_mapped(page); 3091 3092 if (!PageUptodate(page)) { 3093 if (cleancache_get_page(page) == 0) { 3094 BUG_ON(blocksize != PAGE_SIZE); 3095 unlock_extent(tree, start, end); 3096 goto out; 3097 } 3098 } 3099 3100 if (page->index == last_byte >> PAGE_SHIFT) { 3101 char *userpage; 3102 size_t zero_offset = offset_in_page(last_byte); 3103 3104 if (zero_offset) { 3105 iosize = PAGE_SIZE - zero_offset; 3106 userpage = kmap_atomic(page); 3107 memset(userpage + zero_offset, 0, iosize); 3108 flush_dcache_page(page); 3109 kunmap_atomic(userpage); 3110 } 3111 } 3112 while (cur <= end) { 3113 bool force_bio_submit = false; 3114 u64 offset; 3115 3116 if (cur >= last_byte) { 3117 char *userpage; 3118 struct extent_state *cached = NULL; 3119 3120 iosize = PAGE_SIZE - pg_offset; 3121 userpage = kmap_atomic(page); 3122 memset(userpage + pg_offset, 0, iosize); 3123 flush_dcache_page(page); 3124 kunmap_atomic(userpage); 3125 set_extent_uptodate(tree, cur, cur + iosize - 1, 3126 &cached, GFP_NOFS); 3127 unlock_extent_cached(tree, cur, 3128 cur + iosize - 1, &cached); 3129 break; 3130 } 3131 em = __get_extent_map(inode, page, pg_offset, cur, 3132 end - cur + 1, get_extent, em_cached); 3133 if (IS_ERR_OR_NULL(em)) { 3134 SetPageError(page); 3135 unlock_extent(tree, cur, end); 3136 break; 3137 } 3138 extent_offset = cur - em->start; 3139 BUG_ON(extent_map_end(em) <= cur); 3140 BUG_ON(end < cur); 3141 3142 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 3143 this_bio_flag |= EXTENT_BIO_COMPRESSED; 3144 extent_set_compress_type(&this_bio_flag, 3145 em->compress_type); 3146 } 3147 3148 iosize = min(extent_map_end(em) - cur, end - cur + 1); 3149 cur_end = min(extent_map_end(em) - 1, end); 3150 iosize = ALIGN(iosize, blocksize); 3151 if (this_bio_flag & EXTENT_BIO_COMPRESSED) { 3152 disk_io_size = em->block_len; 3153 offset = em->block_start; 3154 } else { 3155 offset = em->block_start + extent_offset; 3156 disk_io_size = iosize; 3157 } 3158 block_start = em->block_start; 3159 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 3160 block_start = EXTENT_MAP_HOLE; 3161 3162 /* 3163 * If we have a file range that points to a compressed extent 3164 * and it's followed by a consecutive file range that points to 3165 * to the same compressed extent (possibly with a different 3166 * offset and/or length, so it either points to the whole extent 3167 * or only part of it), we must make sure we do not submit a 3168 * single bio to populate the pages for the 2 ranges because 3169 * this makes the compressed 
extent read zero out the pages 3170 * belonging to the 2nd range. Imagine the following scenario: 3171 * 3172 * File layout 3173 * [0 - 8K] [8K - 24K] 3174 * | | 3175 * | | 3176 * points to extent X, points to extent X, 3177 * offset 4K, length of 8K offset 0, length 16K 3178 * 3179 * [extent X, compressed length = 4K uncompressed length = 16K] 3180 * 3181 * If the bio to read the compressed extent covers both ranges, 3182 * it will decompress extent X into the pages belonging to the 3183 * first range and then it will stop, zeroing out the remaining 3184 * pages that belong to the other range that points to extent X. 3185 * So here we make sure we submit 2 bios, one for the first 3186 * range and another one for the second range. Both will target 3187 * the same physical extent from disk, but we can't currently 3188 * make the compressed bio endio callback populate the pages 3189 * for both ranges because each compressed bio is tightly 3190 * coupled with a single extent map, and each range can have 3191 * an extent map with a different offset value relative to the 3192 * uncompressed data of our extent and different lengths. This 3193 * is a corner case so we prioritize correctness over 3194 * non-optimal behavior (submitting 2 bios for the same extent). 3195 */ 3196 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) && 3197 prev_em_start && *prev_em_start != (u64)-1 && 3198 *prev_em_start != em->start) 3199 force_bio_submit = true; 3200 3201 if (prev_em_start) 3202 *prev_em_start = em->start; 3203 3204 free_extent_map(em); 3205 em = NULL; 3206 3207 /* we've found a hole, just zero and go on */ 3208 if (block_start == EXTENT_MAP_HOLE) { 3209 char *userpage; 3210 struct extent_state *cached = NULL; 3211 3212 userpage = kmap_atomic(page); 3213 memset(userpage + pg_offset, 0, iosize); 3214 flush_dcache_page(page); 3215 kunmap_atomic(userpage); 3216 3217 set_extent_uptodate(tree, cur, cur + iosize - 1, 3218 &cached, GFP_NOFS); 3219 unlock_extent_cached(tree, cur, 3220 cur + iosize - 1, &cached); 3221 cur = cur + iosize; 3222 pg_offset += iosize; 3223 continue; 3224 } 3225 /* the get_extent function already copied into the page */ 3226 if (test_range_bit(tree, cur, cur_end, 3227 EXTENT_UPTODATE, 1, NULL)) { 3228 check_page_uptodate(tree, page); 3229 unlock_extent(tree, cur, cur + iosize - 1); 3230 cur = cur + iosize; 3231 pg_offset += iosize; 3232 continue; 3233 } 3234 /* we have an inline extent but it didn't get marked up 3235 * to date.
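 * (Inline extent data is normally copied into the page by the
 * get_extent callback and the range then shows up as EXTENT_UPTODATE
 * in the check above, so reaching this point means that copy did not
 * happen.)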
Error out 3236 */ 3237 if (block_start == EXTENT_MAP_INLINE) { 3238 SetPageError(page); 3239 unlock_extent(tree, cur, cur + iosize - 1); 3240 cur = cur + iosize; 3241 pg_offset += iosize; 3242 continue; 3243 } 3244 3245 ret = submit_extent_page(REQ_OP_READ | read_flags, tree, NULL, 3246 page, offset, disk_io_size, 3247 pg_offset, bio, 3248 end_bio_extent_readpage, mirror_num, 3249 *bio_flags, 3250 this_bio_flag, 3251 force_bio_submit); 3252 if (!ret) { 3253 nr++; 3254 *bio_flags = this_bio_flag; 3255 } else { 3256 SetPageError(page); 3257 unlock_extent(tree, cur, cur + iosize - 1); 3258 goto out; 3259 } 3260 cur = cur + iosize; 3261 pg_offset += iosize; 3262 } 3263 out: 3264 if (!nr) { 3265 if (!PageError(page)) 3266 SetPageUptodate(page); 3267 unlock_page(page); 3268 } 3269 return ret; 3270 } 3271 3272 static inline void contiguous_readpages(struct extent_io_tree *tree, 3273 struct page *pages[], int nr_pages, 3274 u64 start, u64 end, 3275 struct extent_map **em_cached, 3276 struct bio **bio, 3277 unsigned long *bio_flags, 3278 u64 *prev_em_start) 3279 { 3280 struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host); 3281 int index; 3282 3283 btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL); 3284 3285 for (index = 0; index < nr_pages; index++) { 3286 __do_readpage(tree, pages[index], btrfs_get_extent, em_cached, 3287 bio, 0, bio_flags, REQ_RAHEAD, prev_em_start); 3288 put_page(pages[index]); 3289 } 3290 } 3291 3292 static int __extent_read_full_page(struct extent_io_tree *tree, 3293 struct page *page, 3294 get_extent_t *get_extent, 3295 struct bio **bio, int mirror_num, 3296 unsigned long *bio_flags, 3297 unsigned int read_flags) 3298 { 3299 struct btrfs_inode *inode = BTRFS_I(page->mapping->host); 3300 u64 start = page_offset(page); 3301 u64 end = start + PAGE_SIZE - 1; 3302 int ret; 3303 3304 btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL); 3305 3306 ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, 3307 bio_flags, read_flags, NULL); 3308 return ret; 3309 } 3310 3311 int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 3312 get_extent_t *get_extent, int mirror_num) 3313 { 3314 struct bio *bio = NULL; 3315 unsigned long bio_flags = 0; 3316 int ret; 3317 3318 ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, 3319 &bio_flags, 0); 3320 if (bio) 3321 ret = submit_one_bio(bio, mirror_num, bio_flags); 3322 return ret; 3323 } 3324 3325 static void update_nr_written(struct writeback_control *wbc, 3326 unsigned long nr_written) 3327 { 3328 wbc->nr_to_write -= nr_written; 3329 } 3330 3331 /* 3332 * helper for __extent_writepage, doing all of the delayed allocation setup. 3333 * 3334 * This returns 1 if btrfs_run_delalloc_range function did all the work required 3335 * to write the page (copy into inline extent). In this case the IO has 3336 * been started and the page is already unlocked. 
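 * (That case is signalled through *page_started by
 * btrfs_run_delalloc_range, typically when the range was written as an
 * inline extent or handed off to async/compressed writeback.)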
3337 * 3338 * This returns 0 if all went well (page still locked) 3339 * This returns < 0 if there were errors (page still locked) 3340 */ 3341 static noinline_for_stack int writepage_delalloc(struct inode *inode, 3342 struct page *page, struct writeback_control *wbc, 3343 u64 delalloc_start, unsigned long *nr_written) 3344 { 3345 u64 page_end = delalloc_start + PAGE_SIZE - 1; 3346 bool found; 3347 u64 delalloc_to_write = 0; 3348 u64 delalloc_end = 0; 3349 int ret; 3350 int page_started = 0; 3351 3352 3353 while (delalloc_end < page_end) { 3354 found = find_lock_delalloc_range(inode, page, 3355 &delalloc_start, 3356 &delalloc_end); 3357 if (!found) { 3358 delalloc_start = delalloc_end + 1; 3359 continue; 3360 } 3361 ret = btrfs_run_delalloc_range(inode, page, delalloc_start, 3362 delalloc_end, &page_started, nr_written, wbc); 3363 if (ret) { 3364 SetPageError(page); 3365 /* 3366 * btrfs_run_delalloc_range should return < 0 for error 3367 * but just in case, we use > 0 here meaning the IO is 3368 * started, so we don't want to return > 0 unless 3369 * things are going well. 3370 */ 3371 ret = ret < 0 ? ret : -EIO; 3372 goto done; 3373 } 3374 /* 3375 * delalloc_end is already one less than the total length, so 3376 * we don't subtract one from PAGE_SIZE 3377 */ 3378 delalloc_to_write += (delalloc_end - delalloc_start + 3379 PAGE_SIZE) >> PAGE_SHIFT; 3380 delalloc_start = delalloc_end + 1; 3381 } 3382 if (wbc->nr_to_write < delalloc_to_write) { 3383 int thresh = 8192; 3384 3385 if (delalloc_to_write < thresh * 2) 3386 thresh = delalloc_to_write; 3387 wbc->nr_to_write = min_t(u64, delalloc_to_write, 3388 thresh); 3389 } 3390 3391 /* did the fill delalloc function already unlock and start 3392 * the IO? 3393 */ 3394 if (page_started) { 3395 /* 3396 * we've unlocked the page, so we can't update 3397 * the mapping's writeback index, just update 3398 * nr_to_write. 3399 */ 3400 wbc->nr_to_write -= *nr_written; 3401 return 1; 3402 } 3403 3404 ret = 0; 3405 3406 done: 3407 return ret; 3408 } 3409 3410 /* 3411 * helper for __extent_writepage. This calls the writepage start hooks, 3412 * and does the loop to map the page into extents and bios. 
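 * The start hook is btrfs_writepage_cow_fixup(); if it cannot fix the
 * page up immediately, the fixup worker will requeue the page and we
 * simply redirty it and return 1 without doing any IO here.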
3413 * 3414 * We return 1 if the IO is started and the page is unlocked, 3415 * 0 if all went well (page still locked) 3416 * < 0 if there were errors (page still locked) 3417 */ 3418 static noinline_for_stack int __extent_writepage_io(struct inode *inode, 3419 struct page *page, 3420 struct writeback_control *wbc, 3421 struct extent_page_data *epd, 3422 loff_t i_size, 3423 unsigned long nr_written, 3424 int *nr_ret) 3425 { 3426 struct extent_io_tree *tree = epd->tree; 3427 u64 start = page_offset(page); 3428 u64 page_end = start + PAGE_SIZE - 1; 3429 u64 end; 3430 u64 cur = start; 3431 u64 extent_offset; 3432 u64 block_start; 3433 u64 iosize; 3434 struct extent_map *em; 3435 size_t pg_offset = 0; 3436 size_t blocksize; 3437 int ret = 0; 3438 int nr = 0; 3439 const unsigned int write_flags = wbc_to_write_flags(wbc); 3440 bool compressed; 3441 3442 ret = btrfs_writepage_cow_fixup(page, start, page_end); 3443 if (ret) { 3444 /* Fixup worker will requeue */ 3445 redirty_page_for_writepage(wbc, page); 3446 update_nr_written(wbc, nr_written); 3447 unlock_page(page); 3448 return 1; 3449 } 3450 3451 /* 3452 * we don't want to touch the inode after unlocking the page, 3453 * so we update the mapping writeback index now 3454 */ 3455 update_nr_written(wbc, nr_written + 1); 3456 3457 end = page_end; 3458 blocksize = inode->i_sb->s_blocksize; 3459 3460 while (cur <= end) { 3461 u64 em_end; 3462 u64 offset; 3463 3464 if (cur >= i_size) { 3465 btrfs_writepage_endio_finish_ordered(page, cur, 3466 page_end, 1); 3467 break; 3468 } 3469 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur, 3470 end - cur + 1); 3471 if (IS_ERR_OR_NULL(em)) { 3472 SetPageError(page); 3473 ret = PTR_ERR_OR_ZERO(em); 3474 break; 3475 } 3476 3477 extent_offset = cur - em->start; 3478 em_end = extent_map_end(em); 3479 BUG_ON(em_end <= cur); 3480 BUG_ON(end < cur); 3481 iosize = min(em_end - cur, end - cur + 1); 3482 iosize = ALIGN(iosize, blocksize); 3483 offset = em->block_start + extent_offset; 3484 block_start = em->block_start; 3485 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 3486 free_extent_map(em); 3487 em = NULL; 3488 3489 /* 3490 * compressed and inline extents are written through other 3491 * paths in the FS 3492 */ 3493 if (compressed || block_start == EXTENT_MAP_HOLE || 3494 block_start == EXTENT_MAP_INLINE) { 3495 if (compressed) 3496 nr++; 3497 else 3498 btrfs_writepage_endio_finish_ordered(page, cur, 3499 cur + iosize - 1, 1); 3500 cur += iosize; 3501 pg_offset += iosize; 3502 continue; 3503 } 3504 3505 btrfs_set_range_writeback(tree, cur, cur + iosize - 1); 3506 if (!PageWriteback(page)) { 3507 btrfs_err(BTRFS_I(inode)->root->fs_info, 3508 "page %lu not writeback, cur %llu end %llu", 3509 page->index, cur, end); 3510 } 3511 3512 ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc, 3513 page, offset, iosize, pg_offset, 3514 &epd->bio, 3515 end_bio_extent_writepage, 3516 0, 0, 0, false); 3517 if (ret) { 3518 SetPageError(page); 3519 if (PageWriteback(page)) 3520 end_page_writeback(page); 3521 } 3522 3523 cur = cur + iosize; 3524 pg_offset += iosize; 3525 nr++; 3526 } 3527 *nr_ret = nr; 3528 return ret; 3529 } 3530 3531 /* 3532 * the writepage semantics are similar to regular writepage. extent 3533 * records are inserted to lock ranges in the tree, and as dirty areas 3534 * are found, they are marked writeback. Then the lock bits are removed 3535 * and the end_io handler clears the writeback ranges 3536 * 3537 * Return 0 if everything goes well. 3538 * Return <0 for error. 
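 * (A return value of 1 from the helpers above means the IO was started
 * and the page already unlocked; that is translated into a 0 return
 * here.)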
3539 */ 3540 static int __extent_writepage(struct page *page, struct writeback_control *wbc, 3541 struct extent_page_data *epd) 3542 { 3543 struct inode *inode = page->mapping->host; 3544 u64 start = page_offset(page); 3545 u64 page_end = start + PAGE_SIZE - 1; 3546 int ret; 3547 int nr = 0; 3548 size_t pg_offset; 3549 loff_t i_size = i_size_read(inode); 3550 unsigned long end_index = i_size >> PAGE_SHIFT; 3551 unsigned long nr_written = 0; 3552 3553 trace___extent_writepage(page, inode, wbc); 3554 3555 WARN_ON(!PageLocked(page)); 3556 3557 ClearPageError(page); 3558 3559 pg_offset = offset_in_page(i_size); 3560 if (page->index > end_index || 3561 (page->index == end_index && !pg_offset)) { 3562 page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); 3563 unlock_page(page); 3564 return 0; 3565 } 3566 3567 if (page->index == end_index) { 3568 char *userpage; 3569 3570 userpage = kmap_atomic(page); 3571 memset(userpage + pg_offset, 0, 3572 PAGE_SIZE - pg_offset); 3573 kunmap_atomic(userpage); 3574 flush_dcache_page(page); 3575 } 3576 3577 set_page_extent_mapped(page); 3578 3579 if (!epd->extent_locked) { 3580 ret = writepage_delalloc(inode, page, wbc, start, &nr_written); 3581 if (ret == 1) 3582 return 0; 3583 if (ret) 3584 goto done; 3585 } 3586 3587 ret = __extent_writepage_io(inode, page, wbc, epd, 3588 i_size, nr_written, &nr); 3589 if (ret == 1) 3590 return 0; 3591 3592 done: 3593 if (nr == 0) { 3594 /* make sure the mapping tag for page dirty gets cleared */ 3595 set_page_writeback(page); 3596 end_page_writeback(page); 3597 } 3598 if (PageError(page)) { 3599 ret = ret < 0 ? ret : -EIO; 3600 end_extent_writepage(page, ret, start, page_end); 3601 } 3602 unlock_page(page); 3603 ASSERT(ret <= 0); 3604 return ret; 3605 } 3606 3607 void wait_on_extent_buffer_writeback(struct extent_buffer *eb) 3608 { 3609 wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK, 3610 TASK_UNINTERRUPTIBLE); 3611 } 3612 3613 static void end_extent_buffer_writeback(struct extent_buffer *eb) 3614 { 3615 clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3616 smp_mb__after_atomic(); 3617 wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); 3618 } 3619 3620 /* 3621 * Lock eb pages and flush the bio if we can't the locks 3622 * 3623 * Return 0 if nothing went wrong 3624 * Return >0 is same as 0, except bio is not submitted 3625 * Return <0 if something went wrong, no page is locked 3626 */ 3627 static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb, 3628 struct extent_page_data *epd) 3629 { 3630 struct btrfs_fs_info *fs_info = eb->fs_info; 3631 int i, num_pages, failed_page_nr; 3632 int flush = 0; 3633 int ret = 0; 3634 3635 if (!btrfs_try_tree_write_lock(eb)) { 3636 ret = flush_write_bio(epd); 3637 if (ret < 0) 3638 return ret; 3639 flush = 1; 3640 btrfs_tree_lock(eb); 3641 } 3642 3643 if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { 3644 btrfs_tree_unlock(eb); 3645 if (!epd->sync_io) 3646 return 0; 3647 if (!flush) { 3648 ret = flush_write_bio(epd); 3649 if (ret < 0) 3650 return ret; 3651 flush = 1; 3652 } 3653 while (1) { 3654 wait_on_extent_buffer_writeback(eb); 3655 btrfs_tree_lock(eb); 3656 if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) 3657 break; 3658 btrfs_tree_unlock(eb); 3659 } 3660 } 3661 3662 /* 3663 * We need to do this to prevent races in people who check if the eb is 3664 * under IO since we can end up having no IO bits set for a short period 3665 * of time. 
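 * While holding eb->refs_lock we trade EXTENT_BUFFER_DIRTY for
 * EXTENT_BUFFER_WRITEBACK in one step, so anyone who checks the bits
 * under the same lock always sees the buffer as either dirty or under
 * writeback while IO is still owed for it.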
3666 */ 3667 spin_lock(&eb->refs_lock); 3668 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3669 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3670 spin_unlock(&eb->refs_lock); 3671 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 3672 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, 3673 -eb->len, 3674 fs_info->dirty_metadata_batch); 3675 ret = 1; 3676 } else { 3677 spin_unlock(&eb->refs_lock); 3678 } 3679 3680 btrfs_tree_unlock(eb); 3681 3682 if (!ret) 3683 return ret; 3684 3685 num_pages = num_extent_pages(eb); 3686 for (i = 0; i < num_pages; i++) { 3687 struct page *p = eb->pages[i]; 3688 3689 if (!trylock_page(p)) { 3690 if (!flush) { 3691 int err; 3692 3693 err = flush_write_bio(epd); 3694 if (err < 0) { 3695 ret = err; 3696 failed_page_nr = i; 3697 goto err_unlock; 3698 } 3699 flush = 1; 3700 } 3701 lock_page(p); 3702 } 3703 } 3704 3705 return ret; 3706 err_unlock: 3707 /* Unlock already locked pages */ 3708 for (i = 0; i < failed_page_nr; i++) 3709 unlock_page(eb->pages[i]); 3710 /* 3711 * Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it. 3712 * Also set back EXTENT_BUFFER_DIRTY so future attempts to this eb can 3713 * be made and undo everything done before. 3714 */ 3715 btrfs_tree_lock(eb); 3716 spin_lock(&eb->refs_lock); 3717 set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); 3718 end_extent_buffer_writeback(eb); 3719 spin_unlock(&eb->refs_lock); 3720 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len, 3721 fs_info->dirty_metadata_batch); 3722 btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 3723 btrfs_tree_unlock(eb); 3724 return ret; 3725 } 3726 3727 static void set_btree_ioerr(struct page *page) 3728 { 3729 struct extent_buffer *eb = (struct extent_buffer *)page->private; 3730 struct btrfs_fs_info *fs_info; 3731 3732 SetPageError(page); 3733 if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) 3734 return; 3735 3736 /* 3737 * If we error out, we should add back the dirty_metadata_bytes 3738 * to make it consistent. 3739 */ 3740 fs_info = eb->fs_info; 3741 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, 3742 eb->len, fs_info->dirty_metadata_batch); 3743 3744 /* 3745 * If writeback for a btree extent that doesn't belong to a log tree 3746 * failed, increment the counter transaction->eb_write_errors. 3747 * We do this because while the transaction is running and before it's 3748 * committing (when we call filemap_fdata[write|wait]_range against 3749 * the btree inode), we might have 3750 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it 3751 * returns an error or an error happens during writeback, when we're 3752 * committing the transaction we wouldn't know about it, since the pages 3753 * can be no longer dirty nor marked anymore for writeback (if a 3754 * subsequent modification to the extent buffer didn't happen before the 3755 * transaction commit), which makes filemap_fdata[write|wait]_range not 3756 * able to find the pages tagged with SetPageError at transaction 3757 * commit time. So if this happens we must abort the transaction, 3758 * otherwise we commit a super block with btree roots that point to 3759 * btree nodes/leafs whose content on disk is invalid - either garbage 3760 * or the content of some node/leaf from a past generation that got 3761 * cowed or deleted and is no longer valid. 
3762 * 3763 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would 3764 * not be enough - we need to distinguish between log tree extents vs 3765 * non-log tree extents, and the next filemap_fdatawait_range() call 3766 * will catch and clear such errors in the mapping - and that call might 3767 * be from a log sync and not from a transaction commit. Also, checking 3768 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is 3769 * not done and would not be reliable - the eb might have been released 3770 * from memory and reading it back again means that flag would not be 3771 * set (since it's a runtime flag, not persisted on disk). 3772 * 3773 * Using the flags below in the btree inode also makes us achieve the 3774 * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started 3775 * writeback for all dirty pages and before filemap_fdatawait_range() 3776 * is called, the writeback for all dirty pages had already finished 3777 * with errors - because we were not using AS_EIO/AS_ENOSPC, 3778 * filemap_fdatawait_range() would return success, as it could not know 3779 * that writeback errors happened (the pages were no longer tagged for 3780 * writeback). 3781 */ 3782 switch (eb->log_index) { 3783 case -1: 3784 set_bit(BTRFS_FS_BTREE_ERR, &eb->fs_info->flags); 3785 break; 3786 case 0: 3787 set_bit(BTRFS_FS_LOG1_ERR, &eb->fs_info->flags); 3788 break; 3789 case 1: 3790 set_bit(BTRFS_FS_LOG2_ERR, &eb->fs_info->flags); 3791 break; 3792 default: 3793 BUG(); /* unexpected, logic error */ 3794 } 3795 } 3796 3797 static void end_bio_extent_buffer_writepage(struct bio *bio) 3798 { 3799 struct bio_vec *bvec; 3800 struct extent_buffer *eb; 3801 int done; 3802 struct bvec_iter_all iter_all; 3803 3804 ASSERT(!bio_flagged(bio, BIO_CLONED)); 3805 bio_for_each_segment_all(bvec, bio, iter_all) { 3806 struct page *page = bvec->bv_page; 3807 3808 eb = (struct extent_buffer *)page->private; 3809 BUG_ON(!eb); 3810 done = atomic_dec_and_test(&eb->io_pages); 3811 3812 if (bio->bi_status || 3813 test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { 3814 ClearPageUptodate(page); 3815 set_btree_ioerr(page); 3816 } 3817 3818 end_page_writeback(page); 3819 3820 if (!done) 3821 continue; 3822 3823 end_extent_buffer_writeback(eb); 3824 } 3825 3826 bio_put(bio); 3827 } 3828 3829 static noinline_for_stack int write_one_eb(struct extent_buffer *eb, 3830 struct writeback_control *wbc, 3831 struct extent_page_data *epd) 3832 { 3833 struct btrfs_fs_info *fs_info = eb->fs_info; 3834 struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree; 3835 u64 offset = eb->start; 3836 u32 nritems; 3837 int i, num_pages; 3838 unsigned long start, end; 3839 unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META; 3840 int ret = 0; 3841 3842 clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); 3843 num_pages = num_extent_pages(eb); 3844 atomic_set(&eb->io_pages, num_pages); 3845 3846 /* set btree blocks beyond nritems with 0 to avoid stale content. */ 3847 nritems = btrfs_header_nritems(eb); 3848 if (btrfs_header_level(eb) > 0) { 3849 end = btrfs_node_key_ptr_offset(nritems); 3850 3851 memzero_extent_buffer(eb, end, eb->len - end); 3852 } else { 3853 /* 3854 * leaf: 3855 * header 0 1 2 .. N ... data_N .. 
data_2 data_1 data_0 3856 */ 3857 start = btrfs_item_nr_offset(nritems); 3858 end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb); 3859 memzero_extent_buffer(eb, start, end - start); 3860 } 3861 3862 for (i = 0; i < num_pages; i++) { 3863 struct page *p = eb->pages[i]; 3864 3865 clear_page_dirty_for_io(p); 3866 set_page_writeback(p); 3867 ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc, 3868 p, offset, PAGE_SIZE, 0, 3869 &epd->bio, 3870 end_bio_extent_buffer_writepage, 3871 0, 0, 0, false); 3872 if (ret) { 3873 set_btree_ioerr(p); 3874 if (PageWriteback(p)) 3875 end_page_writeback(p); 3876 if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) 3877 end_extent_buffer_writeback(eb); 3878 ret = -EIO; 3879 break; 3880 } 3881 offset += PAGE_SIZE; 3882 update_nr_written(wbc, 1); 3883 unlock_page(p); 3884 } 3885 3886 if (unlikely(ret)) { 3887 for (; i < num_pages; i++) { 3888 struct page *p = eb->pages[i]; 3889 clear_page_dirty_for_io(p); 3890 unlock_page(p); 3891 } 3892 } 3893 3894 return ret; 3895 } 3896 3897 int btree_write_cache_pages(struct address_space *mapping, 3898 struct writeback_control *wbc) 3899 { 3900 struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; 3901 struct extent_buffer *eb, *prev_eb = NULL; 3902 struct extent_page_data epd = { 3903 .bio = NULL, 3904 .tree = tree, 3905 .extent_locked = 0, 3906 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3907 }; 3908 int ret = 0; 3909 int done = 0; 3910 int nr_to_write_done = 0; 3911 struct pagevec pvec; 3912 int nr_pages; 3913 pgoff_t index; 3914 pgoff_t end; /* Inclusive */ 3915 int scanned = 0; 3916 xa_mark_t tag; 3917 3918 pagevec_init(&pvec); 3919 if (wbc->range_cyclic) { 3920 index = mapping->writeback_index; /* Start from prev offset */ 3921 end = -1; 3922 /* 3923 * Start from the beginning does not need to cycle over the 3924 * range, mark it as scanned. 3925 */ 3926 scanned = (index == 0); 3927 } else { 3928 index = wbc->range_start >> PAGE_SHIFT; 3929 end = wbc->range_end >> PAGE_SHIFT; 3930 scanned = 1; 3931 } 3932 if (wbc->sync_mode == WB_SYNC_ALL) 3933 tag = PAGECACHE_TAG_TOWRITE; 3934 else 3935 tag = PAGECACHE_TAG_DIRTY; 3936 retry: 3937 if (wbc->sync_mode == WB_SYNC_ALL) 3938 tag_pages_for_writeback(mapping, index, end); 3939 while (!done && !nr_to_write_done && (index <= end) && 3940 (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, 3941 tag))) { 3942 unsigned i; 3943 3944 for (i = 0; i < nr_pages; i++) { 3945 struct page *page = pvec.pages[i]; 3946 3947 if (!PagePrivate(page)) 3948 continue; 3949 3950 spin_lock(&mapping->private_lock); 3951 if (!PagePrivate(page)) { 3952 spin_unlock(&mapping->private_lock); 3953 continue; 3954 } 3955 3956 eb = (struct extent_buffer *)page->private; 3957 3958 /* 3959 * Shouldn't happen and normally this would be a BUG_ON 3960 * but no sense in crashing the users box for something 3961 * we can survive anyway. 
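 * (PagePrivate is checked twice above, the second time under
 * mapping->private_lock, because the buffer can be detached from the
 * page between the unlocked check and taking the lock.)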
3962 */ 3963 if (WARN_ON(!eb)) { 3964 spin_unlock(&mapping->private_lock); 3965 continue; 3966 } 3967 3968 if (eb == prev_eb) { 3969 spin_unlock(&mapping->private_lock); 3970 continue; 3971 } 3972 3973 ret = atomic_inc_not_zero(&eb->refs); 3974 spin_unlock(&mapping->private_lock); 3975 if (!ret) 3976 continue; 3977 3978 prev_eb = eb; 3979 ret = lock_extent_buffer_for_io(eb, &epd); 3980 if (!ret) { 3981 free_extent_buffer(eb); 3982 continue; 3983 } else if (ret < 0) { 3984 done = 1; 3985 free_extent_buffer(eb); 3986 break; 3987 } 3988 3989 ret = write_one_eb(eb, wbc, &epd); 3990 if (ret) { 3991 done = 1; 3992 free_extent_buffer(eb); 3993 break; 3994 } 3995 free_extent_buffer(eb); 3996 3997 /* 3998 * the filesystem may choose to bump up nr_to_write. 3999 * We have to make sure to honor the new nr_to_write 4000 * at any time 4001 */ 4002 nr_to_write_done = wbc->nr_to_write <= 0; 4003 } 4004 pagevec_release(&pvec); 4005 cond_resched(); 4006 } 4007 if (!scanned && !done) { 4008 /* 4009 * We hit the last page and there is more work to be done: wrap 4010 * back to the start of the file 4011 */ 4012 scanned = 1; 4013 index = 0; 4014 goto retry; 4015 } 4016 ASSERT(ret <= 0); 4017 if (ret < 0) { 4018 end_write_bio(&epd, ret); 4019 return ret; 4020 } 4021 ret = flush_write_bio(&epd); 4022 return ret; 4023 } 4024 4025 /** 4026 * extent_write_cache_pages - walk the list of dirty pages of the given address space and write all of them. 4027 * @mapping: address space structure to write 4028 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 4029 * @epd: extent_page_data holding the bio and write context, passed down to __extent_writepage() 4030 * 4031 * If a page is already under I/O, extent_write_cache_pages() skips it, even 4032 * if it's dirty. This is desirable behaviour for memory-cleaning writeback, 4033 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() 4034 * and msync() need to guarantee that all the data which was dirty at the time 4035 * the call was made get new I/O started against them. If wbc->sync_mode is 4036 * WB_SYNC_ALL then we were called for data integrity and we must wait for 4037 * existing IO to complete. 4038 */ 4039 static int extent_write_cache_pages(struct address_space *mapping, 4040 struct writeback_control *wbc, 4041 struct extent_page_data *epd) 4042 { 4043 struct inode *inode = mapping->host; 4044 int ret = 0; 4045 int done = 0; 4046 int nr_to_write_done = 0; 4047 struct pagevec pvec; 4048 int nr_pages; 4049 pgoff_t index; 4050 pgoff_t end; /* Inclusive */ 4051 pgoff_t done_index; 4052 int range_whole = 0; 4053 int scanned = 0; 4054 xa_mark_t tag; 4055 4056 /* 4057 * We have to hold onto the inode so that ordered extents can do their 4058 * work when the IO finishes. The alternative to this is failing to add 4059 * an ordered extent if the igrab() fails there and that is a huge pain 4060 * to deal with, so instead just hold onto the inode throughout the 4061 * writepages operation. If it fails here we are freeing up the inode 4062 * anyway and we'd rather not waste our time writing out stuff that is 4063 * going to be truncated anyway. 4064 */ 4065 if (!igrab(inode)) 4066 return 0; 4067 4068 pagevec_init(&pvec); 4069 if (wbc->range_cyclic) { 4070 index = mapping->writeback_index; /* Start from prev offset */ 4071 end = -1; 4072 /* 4073 * Start from the beginning does not need to cycle over the 4074 * range, mark it as scanned.
4075 */ 4076 scanned = (index == 0); 4077 } else { 4078 index = wbc->range_start >> PAGE_SHIFT; 4079 end = wbc->range_end >> PAGE_SHIFT; 4080 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 4081 range_whole = 1; 4082 scanned = 1; 4083 } 4084 4085 /* 4086 * We do the tagged writepage as long as the snapshot flush bit is set 4087 * and we are the first one who do the filemap_flush() on this inode. 4088 * 4089 * The nr_to_write == LONG_MAX is needed to make sure other flushers do 4090 * not race in and drop the bit. 4091 */ 4092 if (range_whole && wbc->nr_to_write == LONG_MAX && 4093 test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH, 4094 &BTRFS_I(inode)->runtime_flags)) 4095 wbc->tagged_writepages = 1; 4096 4097 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 4098 tag = PAGECACHE_TAG_TOWRITE; 4099 else 4100 tag = PAGECACHE_TAG_DIRTY; 4101 retry: 4102 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 4103 tag_pages_for_writeback(mapping, index, end); 4104 done_index = index; 4105 while (!done && !nr_to_write_done && (index <= end) && 4106 (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, 4107 &index, end, tag))) { 4108 unsigned i; 4109 4110 for (i = 0; i < nr_pages; i++) { 4111 struct page *page = pvec.pages[i]; 4112 4113 done_index = page->index + 1; 4114 /* 4115 * At this point we hold neither the i_pages lock nor 4116 * the page lock: the page may be truncated or 4117 * invalidated (changing page->mapping to NULL), 4118 * or even swizzled back from swapper_space to 4119 * tmpfs file mapping 4120 */ 4121 if (!trylock_page(page)) { 4122 ret = flush_write_bio(epd); 4123 BUG_ON(ret < 0); 4124 lock_page(page); 4125 } 4126 4127 if (unlikely(page->mapping != mapping)) { 4128 unlock_page(page); 4129 continue; 4130 } 4131 4132 if (wbc->sync_mode != WB_SYNC_NONE) { 4133 if (PageWriteback(page)) { 4134 ret = flush_write_bio(epd); 4135 BUG_ON(ret < 0); 4136 } 4137 wait_on_page_writeback(page); 4138 } 4139 4140 if (PageWriteback(page) || 4141 !clear_page_dirty_for_io(page)) { 4142 unlock_page(page); 4143 continue; 4144 } 4145 4146 ret = __extent_writepage(page, wbc, epd); 4147 if (ret < 0) { 4148 done = 1; 4149 break; 4150 } 4151 4152 /* 4153 * the filesystem may choose to bump up nr_to_write. 4154 * We have to make sure to honor the new nr_to_write 4155 * at any time 4156 */ 4157 nr_to_write_done = wbc->nr_to_write <= 0; 4158 } 4159 pagevec_release(&pvec); 4160 cond_resched(); 4161 } 4162 if (!scanned && !done) { 4163 /* 4164 * We hit the last page and there is more work to be done: wrap 4165 * back to the start of the file 4166 */ 4167 scanned = 1; 4168 index = 0; 4169 4170 /* 4171 * If we're looping we could run into a page that is locked by a 4172 * writer and that writer could be waiting on writeback for a 4173 * page in our current bio, and thus deadlock, so flush the 4174 * write bio here. 
4175 */ 4176 ret = flush_write_bio(epd); 4177 if (!ret) 4178 goto retry; 4179 } 4180 4181 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) 4182 mapping->writeback_index = done_index; 4183 4184 btrfs_add_delayed_iput(inode); 4185 return ret; 4186 } 4187 4188 int extent_write_full_page(struct page *page, struct writeback_control *wbc) 4189 { 4190 int ret; 4191 struct extent_page_data epd = { 4192 .bio = NULL, 4193 .tree = &BTRFS_I(page->mapping->host)->io_tree, 4194 .extent_locked = 0, 4195 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 4196 }; 4197 4198 ret = __extent_writepage(page, wbc, &epd); 4199 ASSERT(ret <= 0); 4200 if (ret < 0) { 4201 end_write_bio(&epd, ret); 4202 return ret; 4203 } 4204 4205 ret = flush_write_bio(&epd); 4206 ASSERT(ret <= 0); 4207 return ret; 4208 } 4209 4210 int extent_write_locked_range(struct inode *inode, u64 start, u64 end, 4211 int mode) 4212 { 4213 int ret = 0; 4214 struct address_space *mapping = inode->i_mapping; 4215 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 4216 struct page *page; 4217 unsigned long nr_pages = (end - start + PAGE_SIZE) >> 4218 PAGE_SHIFT; 4219 4220 struct extent_page_data epd = { 4221 .bio = NULL, 4222 .tree = tree, 4223 .extent_locked = 1, 4224 .sync_io = mode == WB_SYNC_ALL, 4225 }; 4226 struct writeback_control wbc_writepages = { 4227 .sync_mode = mode, 4228 .nr_to_write = nr_pages * 2, 4229 .range_start = start, 4230 .range_end = end + 1, 4231 /* We're called from an async helper function */ 4232 .punt_to_cgroup = 1, 4233 .no_cgroup_owner = 1, 4234 }; 4235 4236 wbc_attach_fdatawrite_inode(&wbc_writepages, inode); 4237 while (start <= end) { 4238 page = find_get_page(mapping, start >> PAGE_SHIFT); 4239 if (clear_page_dirty_for_io(page)) 4240 ret = __extent_writepage(page, &wbc_writepages, &epd); 4241 else { 4242 btrfs_writepage_endio_finish_ordered(page, start, 4243 start + PAGE_SIZE - 1, 1); 4244 unlock_page(page); 4245 } 4246 put_page(page); 4247 start += PAGE_SIZE; 4248 } 4249 4250 ASSERT(ret <= 0); 4251 if (ret == 0) 4252 ret = flush_write_bio(&epd); 4253 else 4254 end_write_bio(&epd, ret); 4255 4256 wbc_detach_inode(&wbc_writepages); 4257 return ret; 4258 } 4259 4260 int extent_writepages(struct address_space *mapping, 4261 struct writeback_control *wbc) 4262 { 4263 int ret = 0; 4264 struct extent_page_data epd = { 4265 .bio = NULL, 4266 .tree = &BTRFS_I(mapping->host)->io_tree, 4267 .extent_locked = 0, 4268 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 4269 }; 4270 4271 ret = extent_write_cache_pages(mapping, wbc, &epd); 4272 ASSERT(ret <= 0); 4273 if (ret < 0) { 4274 end_write_bio(&epd, ret); 4275 return ret; 4276 } 4277 ret = flush_write_bio(&epd); 4278 return ret; 4279 } 4280 4281 int extent_readpages(struct address_space *mapping, struct list_head *pages, 4282 unsigned nr_pages) 4283 { 4284 struct bio *bio = NULL; 4285 unsigned long bio_flags = 0; 4286 struct page *pagepool[16]; 4287 struct extent_map *em_cached = NULL; 4288 struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; 4289 int nr = 0; 4290 u64 prev_em_start = (u64)-1; 4291 4292 while (!list_empty(pages)) { 4293 u64 contig_end = 0; 4294 4295 for (nr = 0; nr < ARRAY_SIZE(pagepool) && !list_empty(pages);) { 4296 struct page *page = lru_to_page(pages); 4297 4298 prefetchw(&page->flags); 4299 list_del(&page->lru); 4300 if (add_to_page_cache_lru(page, mapping, page->index, 4301 readahead_gfp_mask(mapping))) { 4302 put_page(page); 4303 break; 4304 } 4305 4306 pagepool[nr++] = page; 4307 contig_end = page_offset(page) + PAGE_SIZE - 1; 4308 } 
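/*
 * Submit the batch of pages collected above in a single call;
 * em_cached, bio and prev_em_start persist across batches so that
 * extent map lookups and the bio under construction can be reused
 * for adjacent ranges.
 */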
4309 4310 if (nr) { 4311 u64 contig_start = page_offset(pagepool[0]); 4312 4313 ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end); 4314 4315 contiguous_readpages(tree, pagepool, nr, contig_start, 4316 contig_end, &em_cached, &bio, &bio_flags, 4317 &prev_em_start); 4318 } 4319 } 4320 4321 if (em_cached) 4322 free_extent_map(em_cached); 4323 4324 if (bio) 4325 return submit_one_bio(bio, 0, bio_flags); 4326 return 0; 4327 } 4328 4329 /* 4330 * basic invalidatepage code, this waits on any locked or writeback 4331 * ranges corresponding to the page, and then deletes any extent state 4332 * records from the tree 4333 */ 4334 int extent_invalidatepage(struct extent_io_tree *tree, 4335 struct page *page, unsigned long offset) 4336 { 4337 struct extent_state *cached_state = NULL; 4338 u64 start = page_offset(page); 4339 u64 end = start + PAGE_SIZE - 1; 4340 size_t blocksize = page->mapping->host->i_sb->s_blocksize; 4341 4342 start += ALIGN(offset, blocksize); 4343 if (start > end) 4344 return 0; 4345 4346 lock_extent_bits(tree, start, end, &cached_state); 4347 wait_on_page_writeback(page); 4348 clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DELALLOC | 4349 EXTENT_DO_ACCOUNTING, 1, 1, &cached_state); 4350 return 0; 4351 } 4352 4353 /* 4354 * a helper for releasepage, this tests for areas of the page that 4355 * are locked or under IO and drops the related state bits if it is safe 4356 * to drop the page. 4357 */ 4358 static int try_release_extent_state(struct extent_io_tree *tree, 4359 struct page *page, gfp_t mask) 4360 { 4361 u64 start = page_offset(page); 4362 u64 end = start + PAGE_SIZE - 1; 4363 int ret = 1; 4364 4365 if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) { 4366 ret = 0; 4367 } else { 4368 /* 4369 * at this point we can safely clear everything except the 4370 * locked bit and the nodatasum bit 4371 */ 4372 ret = __clear_extent_bit(tree, start, end, 4373 ~(EXTENT_LOCKED | EXTENT_NODATASUM), 4374 0, 0, NULL, mask, NULL); 4375 4376 /* if clear_extent_bit failed for enomem reasons, 4377 * we can't allow the release to continue. 4378 */ 4379 if (ret < 0) 4380 ret = 0; 4381 else 4382 ret = 1; 4383 } 4384 return ret; 4385 } 4386 4387 /* 4388 * a helper for releasepage. 
As long as there are no locked extents 4389 * in the range corresponding to the page, both state records and extent 4390 * map records are removed 4391 */ 4392 int try_release_extent_mapping(struct page *page, gfp_t mask) 4393 { 4394 struct extent_map *em; 4395 u64 start = page_offset(page); 4396 u64 end = start + PAGE_SIZE - 1; 4397 struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host); 4398 struct extent_io_tree *tree = &btrfs_inode->io_tree; 4399 struct extent_map_tree *map = &btrfs_inode->extent_tree; 4400 4401 if (gfpflags_allow_blocking(mask) && 4402 page->mapping->host->i_size > SZ_16M) { 4403 u64 len; 4404 while (start <= end) { 4405 len = end - start + 1; 4406 write_lock(&map->lock); 4407 em = lookup_extent_mapping(map, start, len); 4408 if (!em) { 4409 write_unlock(&map->lock); 4410 break; 4411 } 4412 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || 4413 em->start != start) { 4414 write_unlock(&map->lock); 4415 free_extent_map(em); 4416 break; 4417 } 4418 if (!test_range_bit(tree, em->start, 4419 extent_map_end(em) - 1, 4420 EXTENT_LOCKED, 0, NULL)) { 4421 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4422 &btrfs_inode->runtime_flags); 4423 remove_extent_mapping(map, em); 4424 /* once for the rb tree */ 4425 free_extent_map(em); 4426 } 4427 start = extent_map_end(em); 4428 write_unlock(&map->lock); 4429 4430 /* once for us */ 4431 free_extent_map(em); 4432 } 4433 } 4434 return try_release_extent_state(tree, page, mask); 4435 } 4436 4437 /* 4438 * helper function for fiemap, which doesn't want to see any holes. 4439 * This maps until we find something past 'last' 4440 */ 4441 static struct extent_map *get_extent_skip_holes(struct inode *inode, 4442 u64 offset, u64 last) 4443 { 4444 u64 sectorsize = btrfs_inode_sectorsize(inode); 4445 struct extent_map *em; 4446 u64 len; 4447 4448 if (offset >= last) 4449 return NULL; 4450 4451 while (1) { 4452 len = last - offset; 4453 if (len == 0) 4454 break; 4455 len = ALIGN(len, sectorsize); 4456 em = btrfs_get_extent_fiemap(BTRFS_I(inode), offset, len); 4457 if (IS_ERR_OR_NULL(em)) 4458 return em; 4459 4460 /* if this isn't a hole return it */ 4461 if (em->block_start != EXTENT_MAP_HOLE) 4462 return em; 4463 4464 /* this is a hole, advance to the next extent */ 4465 offset = extent_map_end(em); 4466 free_extent_map(em); 4467 if (offset >= last) 4468 break; 4469 } 4470 return NULL; 4471 } 4472 4473 /* 4474 * To cache the previous fiemap extent 4475 * 4476 * Will be used for merging fiemap extents 4477 */ 4478 struct fiemap_cache { 4479 u64 offset; 4480 u64 phys; 4481 u64 len; 4482 u32 flags; 4483 bool cached; 4484 }; 4485 4486 /* 4487 * Helper to submit fiemap extent. 4488 * 4489 * Will try to merge the current fiemap extent, specified by @offset, 4490 * @phys, @len and @flags, with the cached one. 4491 * Only when the merge fails is the cached one submitted as a 4492 * fiemap extent. 4493 * 4494 * Return value is the same as fiemap_fill_next_extent(). 4495 */ 4496 static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, 4497 struct fiemap_cache *cache, 4498 u64 offset, u64 phys, u64 len, u32 flags) 4499 { 4500 int ret = 0; 4501 4502 if (!cache->cached) 4503 goto assign; 4504 4505 /* 4506 * Sanity check, extent_fiemap() should have ensured that the new 4507 * fiemap extent won't overlap with the cached one. 4508 * Not recoverable.
4509 * 4510 * NOTE: Physical address can overlap, due to compression 4511 */ 4512 if (cache->offset + cache->len > offset) { 4513 WARN_ON(1); 4514 return -EINVAL; 4515 } 4516 4517 /* 4518 * Only merges fiemap extents if 4519 * 1) Their logical addresses are continuous 4520 * 4521 * 2) Their physical addresses are continuous 4522 * So truly compressed (physical size smaller than logical size) 4523 * extents won't get merged with each other 4524 * 4525 * 3) Share same flags except FIEMAP_EXTENT_LAST 4526 * So regular extent won't get merged with prealloc extent 4527 */ 4528 if (cache->offset + cache->len == offset && 4529 cache->phys + cache->len == phys && 4530 (cache->flags & ~FIEMAP_EXTENT_LAST) == 4531 (flags & ~FIEMAP_EXTENT_LAST)) { 4532 cache->len += len; 4533 cache->flags |= flags; 4534 goto try_submit_last; 4535 } 4536 4537 /* Not mergeable, need to submit cached one */ 4538 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, 4539 cache->len, cache->flags); 4540 cache->cached = false; 4541 if (ret) 4542 return ret; 4543 assign: 4544 cache->cached = true; 4545 cache->offset = offset; 4546 cache->phys = phys; 4547 cache->len = len; 4548 cache->flags = flags; 4549 try_submit_last: 4550 if (cache->flags & FIEMAP_EXTENT_LAST) { 4551 ret = fiemap_fill_next_extent(fieinfo, cache->offset, 4552 cache->phys, cache->len, cache->flags); 4553 cache->cached = false; 4554 } 4555 return ret; 4556 } 4557 4558 /* 4559 * Emit last fiemap cache 4560 * 4561 * The last fiemap cache may still be cached in the following case: 4562 * 0 4k 8k 4563 * |<- Fiemap range ->| 4564 * |<------------ First extent ----------->| 4565 * 4566 * In this case, the first extent range will be cached but not emitted. 4567 * So we must emit it before ending extent_fiemap(). 4568 */ 4569 static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo, 4570 struct fiemap_cache *cache) 4571 { 4572 int ret; 4573 4574 if (!cache->cached) 4575 return 0; 4576 4577 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, 4578 cache->len, cache->flags); 4579 cache->cached = false; 4580 if (ret > 0) 4581 ret = 0; 4582 return ret; 4583 } 4584 4585 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 4586 __u64 start, __u64 len) 4587 { 4588 int ret = 0; 4589 u64 off = start; 4590 u64 max = start + len; 4591 u32 flags = 0; 4592 u32 found_type; 4593 u64 last; 4594 u64 last_for_get_extent = 0; 4595 u64 disko = 0; 4596 u64 isize = i_size_read(inode); 4597 struct btrfs_key found_key; 4598 struct extent_map *em = NULL; 4599 struct extent_state *cached_state = NULL; 4600 struct btrfs_path *path; 4601 struct btrfs_root *root = BTRFS_I(inode)->root; 4602 struct fiemap_cache cache = { 0 }; 4603 struct ulist *roots; 4604 struct ulist *tmp_ulist; 4605 int end = 0; 4606 u64 em_start = 0; 4607 u64 em_len = 0; 4608 u64 em_end = 0; 4609 4610 if (len == 0) 4611 return -EINVAL; 4612 4613 path = btrfs_alloc_path(); 4614 if (!path) 4615 return -ENOMEM; 4616 path->leave_spinning = 1; 4617 4618 roots = ulist_alloc(GFP_KERNEL); 4619 tmp_ulist = ulist_alloc(GFP_KERNEL); 4620 if (!roots || !tmp_ulist) { 4621 ret = -ENOMEM; 4622 goto out_free_ulist; 4623 } 4624 4625 start = round_down(start, btrfs_inode_sectorsize(inode)); 4626 len = round_up(max, btrfs_inode_sectorsize(inode)) - start; 4627 4628 /* 4629 * lookup the last file extent. 
We're not using i_size here 4630 * because there might be preallocation past i_size 4631 */ 4632 ret = btrfs_lookup_file_extent(NULL, root, path, 4633 btrfs_ino(BTRFS_I(inode)), -1, 0); 4634 if (ret < 0) { 4635 goto out_free_ulist; 4636 } else { 4637 WARN_ON(!ret); 4638 if (ret == 1) 4639 ret = 0; 4640 } 4641 4642 path->slots[0]--; 4643 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 4644 found_type = found_key.type; 4645 4646 /* No extents, but there might be delalloc bits */ 4647 if (found_key.objectid != btrfs_ino(BTRFS_I(inode)) || 4648 found_type != BTRFS_EXTENT_DATA_KEY) { 4649 /* have to trust i_size as the end */ 4650 last = (u64)-1; 4651 last_for_get_extent = isize; 4652 } else { 4653 /* 4654 * remember the start of the last extent. There are a 4655 * bunch of different factors that go into the length of the 4656 * extent, so its much less complex to remember where it started 4657 */ 4658 last = found_key.offset; 4659 last_for_get_extent = last + 1; 4660 } 4661 btrfs_release_path(path); 4662 4663 /* 4664 * we might have some extents allocated but more delalloc past those 4665 * extents. so, we trust isize unless the start of the last extent is 4666 * beyond isize 4667 */ 4668 if (last < isize) { 4669 last = (u64)-1; 4670 last_for_get_extent = isize; 4671 } 4672 4673 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 4674 &cached_state); 4675 4676 em = get_extent_skip_holes(inode, start, last_for_get_extent); 4677 if (!em) 4678 goto out; 4679 if (IS_ERR(em)) { 4680 ret = PTR_ERR(em); 4681 goto out; 4682 } 4683 4684 while (!end) { 4685 u64 offset_in_extent = 0; 4686 4687 /* break if the extent we found is outside the range */ 4688 if (em->start >= max || extent_map_end(em) < off) 4689 break; 4690 4691 /* 4692 * get_extent may return an extent that starts before our 4693 * requested range. We have to make sure the ranges 4694 * we return to fiemap always move forward and don't 4695 * overlap, so adjust the offsets here 4696 */ 4697 em_start = max(em->start, off); 4698 4699 /* 4700 * record the offset from the start of the extent 4701 * for adjusting the disk offset below. Only do this if the 4702 * extent isn't compressed since our in ram offset may be past 4703 * what we have actually allocated on disk. 4704 */ 4705 if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 4706 offset_in_extent = em_start - em->start; 4707 em_end = extent_map_end(em); 4708 em_len = em_end - em_start; 4709 flags = 0; 4710 if (em->block_start < EXTENT_MAP_LAST_BYTE) 4711 disko = em->block_start + offset_in_extent; 4712 else 4713 disko = 0; 4714 4715 /* 4716 * bump off for our next call to get_extent 4717 */ 4718 off = extent_map_end(em); 4719 if (off >= max) 4720 end = 1; 4721 4722 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 4723 end = 1; 4724 flags |= FIEMAP_EXTENT_LAST; 4725 } else if (em->block_start == EXTENT_MAP_INLINE) { 4726 flags |= (FIEMAP_EXTENT_DATA_INLINE | 4727 FIEMAP_EXTENT_NOT_ALIGNED); 4728 } else if (em->block_start == EXTENT_MAP_DELALLOC) { 4729 flags |= (FIEMAP_EXTENT_DELALLOC | 4730 FIEMAP_EXTENT_UNKNOWN); 4731 } else if (fieinfo->fi_extents_max) { 4732 u64 bytenr = em->block_start - 4733 (em->start - em->orig_start); 4734 4735 /* 4736 * As btrfs supports shared space, this information 4737 * can be exported to userspace tools via 4738 * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0 4739 * then we're just getting a count and we can skip the 4740 * lookup stuff. 
4741 */ 4742 ret = btrfs_check_shared(root, 4743 btrfs_ino(BTRFS_I(inode)), 4744 bytenr, roots, tmp_ulist); 4745 if (ret < 0) 4746 goto out_free; 4747 if (ret) 4748 flags |= FIEMAP_EXTENT_SHARED; 4749 ret = 0; 4750 } 4751 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 4752 flags |= FIEMAP_EXTENT_ENCODED; 4753 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4754 flags |= FIEMAP_EXTENT_UNWRITTEN; 4755 4756 free_extent_map(em); 4757 em = NULL; 4758 if ((em_start >= last) || em_len == (u64)-1 || 4759 (last == (u64)-1 && isize <= em_end)) { 4760 flags |= FIEMAP_EXTENT_LAST; 4761 end = 1; 4762 } 4763 4764 /* now scan forward to see if this is really the last extent. */ 4765 em = get_extent_skip_holes(inode, off, last_for_get_extent); 4766 if (IS_ERR(em)) { 4767 ret = PTR_ERR(em); 4768 goto out; 4769 } 4770 if (!em) { 4771 flags |= FIEMAP_EXTENT_LAST; 4772 end = 1; 4773 } 4774 ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko, 4775 em_len, flags); 4776 if (ret) { 4777 if (ret == 1) 4778 ret = 0; 4779 goto out_free; 4780 } 4781 } 4782 out_free: 4783 if (!ret) 4784 ret = emit_last_fiemap_cache(fieinfo, &cache); 4785 free_extent_map(em); 4786 out: 4787 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, 4788 &cached_state); 4789 4790 out_free_ulist: 4791 btrfs_free_path(path); 4792 ulist_free(roots); 4793 ulist_free(tmp_ulist); 4794 return ret; 4795 } 4796 4797 static void __free_extent_buffer(struct extent_buffer *eb) 4798 { 4799 btrfs_leak_debug_del(&eb->leak_list); 4800 kmem_cache_free(extent_buffer_cache, eb); 4801 } 4802 4803 int extent_buffer_under_io(struct extent_buffer *eb) 4804 { 4805 return (atomic_read(&eb->io_pages) || 4806 test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || 4807 test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 4808 } 4809 4810 /* 4811 * Release all pages attached to the extent buffer. 4812 */ 4813 static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) 4814 { 4815 int i; 4816 int num_pages; 4817 int mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); 4818 4819 BUG_ON(extent_buffer_under_io(eb)); 4820 4821 num_pages = num_extent_pages(eb); 4822 for (i = 0; i < num_pages; i++) { 4823 struct page *page = eb->pages[i]; 4824 4825 if (!page) 4826 continue; 4827 if (mapped) 4828 spin_lock(&page->mapping->private_lock); 4829 /* 4830 * We do this since we'll remove the pages after we've 4831 * removed the eb from the radix tree, so we could race 4832 * and have this page now attached to the new eb. So 4833 * only clear page_private if it's still connected to 4834 * this eb. 4835 */ 4836 if (PagePrivate(page) && 4837 page->private == (unsigned long)eb) { 4838 BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 4839 BUG_ON(PageDirty(page)); 4840 BUG_ON(PageWriteback(page)); 4841 /* 4842 * We need to make sure we haven't be attached 4843 * to a new eb. 4844 */ 4845 ClearPagePrivate(page); 4846 set_page_private(page, 0); 4847 /* One for the page private */ 4848 put_page(page); 4849 } 4850 4851 if (mapped) 4852 spin_unlock(&page->mapping->private_lock); 4853 4854 /* One for when we allocated the page */ 4855 put_page(page); 4856 } 4857 } 4858 4859 /* 4860 * Helper for releasing the extent buffer. 
4861 */ 4862 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) 4863 { 4864 btrfs_release_extent_buffer_pages(eb); 4865 __free_extent_buffer(eb); 4866 } 4867 4868 static struct extent_buffer * 4869 __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, 4870 unsigned long len) 4871 { 4872 struct extent_buffer *eb = NULL; 4873 4874 eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL); 4875 eb->start = start; 4876 eb->len = len; 4877 eb->fs_info = fs_info; 4878 eb->bflags = 0; 4879 rwlock_init(&eb->lock); 4880 atomic_set(&eb->blocking_readers, 0); 4881 eb->blocking_writers = 0; 4882 eb->lock_nested = false; 4883 init_waitqueue_head(&eb->write_lock_wq); 4884 init_waitqueue_head(&eb->read_lock_wq); 4885 4886 btrfs_leak_debug_add(&eb->leak_list, &buffers); 4887 4888 spin_lock_init(&eb->refs_lock); 4889 atomic_set(&eb->refs, 1); 4890 atomic_set(&eb->io_pages, 0); 4891 4892 /* 4893 * Sanity checks, currently the maximum is 64k covered by 16x 4k pages 4894 */ 4895 BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE 4896 > MAX_INLINE_EXTENT_BUFFER_SIZE); 4897 BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE); 4898 4899 #ifdef CONFIG_BTRFS_DEBUG 4900 eb->spinning_writers = 0; 4901 atomic_set(&eb->spinning_readers, 0); 4902 atomic_set(&eb->read_locks, 0); 4903 eb->write_locks = 0; 4904 #endif 4905 4906 return eb; 4907 } 4908 4909 struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) 4910 { 4911 int i; 4912 struct page *p; 4913 struct extent_buffer *new; 4914 int num_pages = num_extent_pages(src); 4915 4916 new = __alloc_extent_buffer(src->fs_info, src->start, src->len); 4917 if (new == NULL) 4918 return NULL; 4919 4920 for (i = 0; i < num_pages; i++) { 4921 p = alloc_page(GFP_NOFS); 4922 if (!p) { 4923 btrfs_release_extent_buffer(new); 4924 return NULL; 4925 } 4926 attach_extent_buffer_page(new, p); 4927 WARN_ON(PageDirty(p)); 4928 SetPageUptodate(p); 4929 new->pages[i] = p; 4930 copy_page(page_address(p), page_address(src->pages[i])); 4931 } 4932 4933 set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags); 4934 set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); 4935 4936 return new; 4937 } 4938 4939 struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, 4940 u64 start, unsigned long len) 4941 { 4942 struct extent_buffer *eb; 4943 int num_pages; 4944 int i; 4945 4946 eb = __alloc_extent_buffer(fs_info, start, len); 4947 if (!eb) 4948 return NULL; 4949 4950 num_pages = num_extent_pages(eb); 4951 for (i = 0; i < num_pages; i++) { 4952 eb->pages[i] = alloc_page(GFP_NOFS); 4953 if (!eb->pages[i]) 4954 goto err; 4955 } 4956 set_extent_buffer_uptodate(eb); 4957 btrfs_set_header_nritems(eb, 0); 4958 set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); 4959 4960 return eb; 4961 err: 4962 for (; i > 0; i--) 4963 __free_page(eb->pages[i - 1]); 4964 __free_extent_buffer(eb); 4965 return NULL; 4966 } 4967 4968 struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, 4969 u64 start) 4970 { 4971 return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize); 4972 } 4973 4974 static void check_buffer_tree_ref(struct extent_buffer *eb) 4975 { 4976 int refs; 4977 /* the ref bit is tricky. We have to make sure it is set 4978 * if we have the buffer dirty. Otherwise the 4979 * code to free a buffer can end up dropping a dirty 4980 * page 4981 * 4982 * Once the ref bit is set, it won't go away while the 4983 * buffer is dirty or in writeback, and it also won't 4984 * go away while we have the reference count on the 4985 * eb bumped. 
4986 * 4987 * We can't just set the ref bit without bumping the 4988 * ref on the eb because free_extent_buffer might 4989 * see the ref bit and try to clear it. If this happens 4990 * free_extent_buffer might end up dropping our original 4991 * ref by mistake and freeing the page before we are able 4992 * to add one more ref. 4993 * 4994 * So bump the ref count first, then set the bit. If someone 4995 * beat us to it, drop the ref we added. 4996 */ 4997 refs = atomic_read(&eb->refs); 4998 if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 4999 return; 5000 5001 spin_lock(&eb->refs_lock); 5002 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 5003 atomic_inc(&eb->refs); 5004 spin_unlock(&eb->refs_lock); 5005 } 5006 5007 static void mark_extent_buffer_accessed(struct extent_buffer *eb, 5008 struct page *accessed) 5009 { 5010 int num_pages, i; 5011 5012 check_buffer_tree_ref(eb); 5013 5014 num_pages = num_extent_pages(eb); 5015 for (i = 0; i < num_pages; i++) { 5016 struct page *p = eb->pages[i]; 5017 5018 if (p != accessed) 5019 mark_page_accessed(p); 5020 } 5021 } 5022 5023 struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, 5024 u64 start) 5025 { 5026 struct extent_buffer *eb; 5027 5028 rcu_read_lock(); 5029 eb = radix_tree_lookup(&fs_info->buffer_radix, 5030 start >> PAGE_SHIFT); 5031 if (eb && atomic_inc_not_zero(&eb->refs)) { 5032 rcu_read_unlock(); 5033 /* 5034 * Lock our eb's refs_lock to avoid races with 5035 * free_extent_buffer. When we get our eb it might be flagged 5036 * with EXTENT_BUFFER_STALE and another task running 5037 * free_extent_buffer might have seen that flag set, 5038 * eb->refs == 2, that the buffer isn't under IO (dirty and 5039 * writeback flags not set) and it's still in the tree (flag 5040 * EXTENT_BUFFER_TREE_REF set), therefore being in the process 5041 * of decrementing the extent buffer's reference count twice. 5042 * So here we could race and increment the eb's reference count, 5043 * clear its stale flag, mark it as dirty and drop our reference 5044 * before the other task finishes executing free_extent_buffer, 5045 * which would later result in an attempt to free an extent 5046 * buffer that is dirty. 
5047 */ 5048 if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) { 5049 spin_lock(&eb->refs_lock); 5050 spin_unlock(&eb->refs_lock); 5051 } 5052 mark_extent_buffer_accessed(eb, NULL); 5053 return eb; 5054 } 5055 rcu_read_unlock(); 5056 5057 return NULL; 5058 } 5059 5060 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 5061 struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, 5062 u64 start) 5063 { 5064 struct extent_buffer *eb, *exists = NULL; 5065 int ret; 5066 5067 eb = find_extent_buffer(fs_info, start); 5068 if (eb) 5069 return eb; 5070 eb = alloc_dummy_extent_buffer(fs_info, start); 5071 if (!eb) 5072 return ERR_PTR(-ENOMEM); 5073 eb->fs_info = fs_info; 5074 again: 5075 ret = radix_tree_preload(GFP_NOFS); 5076 if (ret) { 5077 exists = ERR_PTR(ret); 5078 goto free_eb; 5079 } 5080 spin_lock(&fs_info->buffer_lock); 5081 ret = radix_tree_insert(&fs_info->buffer_radix, 5082 start >> PAGE_SHIFT, eb); 5083 spin_unlock(&fs_info->buffer_lock); 5084 radix_tree_preload_end(); 5085 if (ret == -EEXIST) { 5086 exists = find_extent_buffer(fs_info, start); 5087 if (exists) 5088 goto free_eb; 5089 else 5090 goto again; 5091 } 5092 check_buffer_tree_ref(eb); 5093 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); 5094 5095 return eb; 5096 free_eb: 5097 btrfs_release_extent_buffer(eb); 5098 return exists; 5099 } 5100 #endif 5101 5102 struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, 5103 u64 start) 5104 { 5105 unsigned long len = fs_info->nodesize; 5106 int num_pages; 5107 int i; 5108 unsigned long index = start >> PAGE_SHIFT; 5109 struct extent_buffer *eb; 5110 struct extent_buffer *exists = NULL; 5111 struct page *p; 5112 struct address_space *mapping = fs_info->btree_inode->i_mapping; 5113 int uptodate = 1; 5114 int ret; 5115 5116 if (!IS_ALIGNED(start, fs_info->sectorsize)) { 5117 btrfs_err(fs_info, "bad tree block start %llu", start); 5118 return ERR_PTR(-EINVAL); 5119 } 5120 5121 eb = find_extent_buffer(fs_info, start); 5122 if (eb) 5123 return eb; 5124 5125 eb = __alloc_extent_buffer(fs_info, start, len); 5126 if (!eb) 5127 return ERR_PTR(-ENOMEM); 5128 5129 num_pages = num_extent_pages(eb); 5130 for (i = 0; i < num_pages; i++, index++) { 5131 p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL); 5132 if (!p) { 5133 exists = ERR_PTR(-ENOMEM); 5134 goto free_eb; 5135 } 5136 5137 spin_lock(&mapping->private_lock); 5138 if (PagePrivate(p)) { 5139 /* 5140 * We could have already allocated an eb for this page 5141 * and attached one so lets see if we can get a ref on 5142 * the existing eb, and if we can we know it's good and 5143 * we can just return that one, else we know we can just 5144 * overwrite page->private. 5145 */ 5146 exists = (struct extent_buffer *)p->private; 5147 if (atomic_inc_not_zero(&exists->refs)) { 5148 spin_unlock(&mapping->private_lock); 5149 unlock_page(p); 5150 put_page(p); 5151 mark_extent_buffer_accessed(exists, p); 5152 goto free_eb; 5153 } 5154 exists = NULL; 5155 5156 /* 5157 * Do this so attach doesn't complain and we need to 5158 * drop the ref the old guy had. 
5159 */ 5160 ClearPagePrivate(p); 5161 WARN_ON(PageDirty(p)); 5162 put_page(p); 5163 } 5164 attach_extent_buffer_page(eb, p); 5165 spin_unlock(&mapping->private_lock); 5166 WARN_ON(PageDirty(p)); 5167 eb->pages[i] = p; 5168 if (!PageUptodate(p)) 5169 uptodate = 0; 5170 5171 /* 5172 * We can't unlock the pages just yet since the extent buffer 5173 * hasn't been properly inserted in the radix tree, this 5174 * opens a race with btree_releasepage which can free a page 5175 * while we are still filling in all pages for the buffer and 5176 * we could crash. 5177 */ 5178 } 5179 if (uptodate) 5180 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 5181 again: 5182 ret = radix_tree_preload(GFP_NOFS); 5183 if (ret) { 5184 exists = ERR_PTR(ret); 5185 goto free_eb; 5186 } 5187 5188 spin_lock(&fs_info->buffer_lock); 5189 ret = radix_tree_insert(&fs_info->buffer_radix, 5190 start >> PAGE_SHIFT, eb); 5191 spin_unlock(&fs_info->buffer_lock); 5192 radix_tree_preload_end(); 5193 if (ret == -EEXIST) { 5194 exists = find_extent_buffer(fs_info, start); 5195 if (exists) 5196 goto free_eb; 5197 else 5198 goto again; 5199 } 5200 /* add one reference for the tree */ 5201 check_buffer_tree_ref(eb); 5202 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); 5203 5204 /* 5205 * Now it's safe to unlock the pages because any calls to 5206 * btree_releasepage will correctly detect that a page belongs to a 5207 * live buffer and won't free them prematurely. 5208 */ 5209 for (i = 0; i < num_pages; i++) 5210 unlock_page(eb->pages[i]); 5211 return eb; 5212 5213 free_eb: 5214 WARN_ON(!atomic_dec_and_test(&eb->refs)); 5215 for (i = 0; i < num_pages; i++) { 5216 if (eb->pages[i]) 5217 unlock_page(eb->pages[i]); 5218 } 5219 5220 btrfs_release_extent_buffer(eb); 5221 return exists; 5222 } 5223 5224 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) 5225 { 5226 struct extent_buffer *eb = 5227 container_of(head, struct extent_buffer, rcu_head); 5228 5229 __free_extent_buffer(eb); 5230 } 5231 5232 static int release_extent_buffer(struct extent_buffer *eb) 5233 { 5234 lockdep_assert_held(&eb->refs_lock); 5235 5236 WARN_ON(atomic_read(&eb->refs) == 0); 5237 if (atomic_dec_and_test(&eb->refs)) { 5238 if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) { 5239 struct btrfs_fs_info *fs_info = eb->fs_info; 5240 5241 spin_unlock(&eb->refs_lock); 5242 5243 spin_lock(&fs_info->buffer_lock); 5244 radix_tree_delete(&fs_info->buffer_radix, 5245 eb->start >> PAGE_SHIFT); 5246 spin_unlock(&fs_info->buffer_lock); 5247 } else { 5248 spin_unlock(&eb->refs_lock); 5249 } 5250 5251 /* Should be safe to release our pages at this point */ 5252 btrfs_release_extent_buffer_pages(eb); 5253 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 5254 if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) { 5255 __free_extent_buffer(eb); 5256 return 1; 5257 } 5258 #endif 5259 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); 5260 return 1; 5261 } 5262 spin_unlock(&eb->refs_lock); 5263 5264 return 0; 5265 } 5266 5267 void free_extent_buffer(struct extent_buffer *eb) 5268 { 5269 int refs; 5270 int old; 5271 if (!eb) 5272 return; 5273 5274 while (1) { 5275 refs = atomic_read(&eb->refs); 5276 if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3) 5277 || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && 5278 refs == 1)) 5279 break; 5280 old = atomic_cmpxchg(&eb->refs, refs, refs - 1); 5281 if (old == refs) 5282 return; 5283 } 5284 5285 spin_lock(&eb->refs_lock); 5286 if (atomic_read(&eb->refs) == 2 && 5287 test_bit(EXTENT_BUFFER_STALE, 
&eb->bflags) && 5288 !extent_buffer_under_io(eb) && 5289 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 5290 atomic_dec(&eb->refs); 5291 5292 /* 5293 * I know this is terrible, but it's temporary until we stop tracking 5294 * the uptodate bits and such for the extent buffers. 5295 */ 5296 release_extent_buffer(eb); 5297 } 5298 5299 void free_extent_buffer_stale(struct extent_buffer *eb) 5300 { 5301 if (!eb) 5302 return; 5303 5304 spin_lock(&eb->refs_lock); 5305 set_bit(EXTENT_BUFFER_STALE, &eb->bflags); 5306 5307 if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) && 5308 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 5309 atomic_dec(&eb->refs); 5310 release_extent_buffer(eb); 5311 } 5312 5313 void clear_extent_buffer_dirty(struct extent_buffer *eb) 5314 { 5315 int i; 5316 int num_pages; 5317 struct page *page; 5318 5319 num_pages = num_extent_pages(eb); 5320 5321 for (i = 0; i < num_pages; i++) { 5322 page = eb->pages[i]; 5323 if (!PageDirty(page)) 5324 continue; 5325 5326 lock_page(page); 5327 WARN_ON(!PagePrivate(page)); 5328 5329 clear_page_dirty_for_io(page); 5330 xa_lock_irq(&page->mapping->i_pages); 5331 if (!PageDirty(page)) 5332 __xa_clear_mark(&page->mapping->i_pages, 5333 page_index(page), PAGECACHE_TAG_DIRTY); 5334 xa_unlock_irq(&page->mapping->i_pages); 5335 ClearPageError(page); 5336 unlock_page(page); 5337 } 5338 WARN_ON(atomic_read(&eb->refs) == 0); 5339 } 5340 5341 bool set_extent_buffer_dirty(struct extent_buffer *eb) 5342 { 5343 int i; 5344 int num_pages; 5345 bool was_dirty; 5346 5347 check_buffer_tree_ref(eb); 5348 5349 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); 5350 5351 num_pages = num_extent_pages(eb); 5352 WARN_ON(atomic_read(&eb->refs) == 0); 5353 WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); 5354 5355 if (!was_dirty) 5356 for (i = 0; i < num_pages; i++) 5357 set_page_dirty(eb->pages[i]); 5358 5359 #ifdef CONFIG_BTRFS_DEBUG 5360 for (i = 0; i < num_pages; i++) 5361 ASSERT(PageDirty(eb->pages[i])); 5362 #endif 5363 5364 return was_dirty; 5365 } 5366 5367 void clear_extent_buffer_uptodate(struct extent_buffer *eb) 5368 { 5369 int i; 5370 struct page *page; 5371 int num_pages; 5372 5373 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 5374 num_pages = num_extent_pages(eb); 5375 for (i = 0; i < num_pages; i++) { 5376 page = eb->pages[i]; 5377 if (page) 5378 ClearPageUptodate(page); 5379 } 5380 } 5381 5382 void set_extent_buffer_uptodate(struct extent_buffer *eb) 5383 { 5384 int i; 5385 struct page *page; 5386 int num_pages; 5387 5388 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 5389 num_pages = num_extent_pages(eb); 5390 for (i = 0; i < num_pages; i++) { 5391 page = eb->pages[i]; 5392 SetPageUptodate(page); 5393 } 5394 } 5395 5396 int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) 5397 { 5398 int i; 5399 struct page *page; 5400 int err; 5401 int ret = 0; 5402 int locked_pages = 0; 5403 int all_uptodate = 1; 5404 int num_pages; 5405 unsigned long num_reads = 0; 5406 struct bio *bio = NULL; 5407 unsigned long bio_flags = 0; 5408 struct extent_io_tree *tree = &BTRFS_I(eb->fs_info->btree_inode)->io_tree; 5409 5410 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 5411 return 0; 5412 5413 num_pages = num_extent_pages(eb); 5414 for (i = 0; i < num_pages; i++) { 5415 page = eb->pages[i]; 5416 if (wait == WAIT_NONE) { 5417 if (!trylock_page(page)) 5418 goto unlock_exit; 5419 } else { 5420 lock_page(page); 5421 } 5422 locked_pages++; 5423 } 5424 /* 5425 * We need to firstly lock all 
pages to make sure that 5426 * the uptodate bit of our pages won't be affected by 5427 * clear_extent_buffer_uptodate(). 5428 */ 5429 for (i = 0; i < num_pages; i++) { 5430 page = eb->pages[i]; 5431 if (!PageUptodate(page)) { 5432 num_reads++; 5433 all_uptodate = 0; 5434 } 5435 } 5436 5437 if (all_uptodate) { 5438 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 5439 goto unlock_exit; 5440 } 5441 5442 clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); 5443 eb->read_mirror = 0; 5444 atomic_set(&eb->io_pages, num_reads); 5445 for (i = 0; i < num_pages; i++) { 5446 page = eb->pages[i]; 5447 5448 if (!PageUptodate(page)) { 5449 if (ret) { 5450 atomic_dec(&eb->io_pages); 5451 unlock_page(page); 5452 continue; 5453 } 5454 5455 ClearPageError(page); 5456 err = __extent_read_full_page(tree, page, 5457 btree_get_extent, &bio, 5458 mirror_num, &bio_flags, 5459 REQ_META); 5460 if (err) { 5461 ret = err; 5462 /* 5463 * We use &bio in above __extent_read_full_page, 5464 * so we ensure that if it returns error, the 5465 * current page fails to add itself to bio and 5466 * it's been unlocked. 5467 * 5468 * We must dec io_pages by ourselves. 5469 */ 5470 atomic_dec(&eb->io_pages); 5471 } 5472 } else { 5473 unlock_page(page); 5474 } 5475 } 5476 5477 if (bio) { 5478 err = submit_one_bio(bio, mirror_num, bio_flags); 5479 if (err) 5480 return err; 5481 } 5482 5483 if (ret || wait != WAIT_COMPLETE) 5484 return ret; 5485 5486 for (i = 0; i < num_pages; i++) { 5487 page = eb->pages[i]; 5488 wait_on_page_locked(page); 5489 if (!PageUptodate(page)) 5490 ret = -EIO; 5491 } 5492 5493 return ret; 5494 5495 unlock_exit: 5496 while (locked_pages > 0) { 5497 locked_pages--; 5498 page = eb->pages[locked_pages]; 5499 unlock_page(page); 5500 } 5501 return ret; 5502 } 5503 5504 void read_extent_buffer(const struct extent_buffer *eb, void *dstv, 5505 unsigned long start, unsigned long len) 5506 { 5507 size_t cur; 5508 size_t offset; 5509 struct page *page; 5510 char *kaddr; 5511 char *dst = (char *)dstv; 5512 size_t start_offset = offset_in_page(eb->start); 5513 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5514 5515 if (start + len > eb->len) { 5516 WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n", 5517 eb->start, eb->len, start, len); 5518 memset(dst, 0, len); 5519 return; 5520 } 5521 5522 offset = offset_in_page(start_offset + start); 5523 5524 while (len > 0) { 5525 page = eb->pages[i]; 5526 5527 cur = min(len, (PAGE_SIZE - offset)); 5528 kaddr = page_address(page); 5529 memcpy(dst, kaddr + offset, cur); 5530 5531 dst += cur; 5532 len -= cur; 5533 offset = 0; 5534 i++; 5535 } 5536 } 5537 5538 int read_extent_buffer_to_user(const struct extent_buffer *eb, 5539 void __user *dstv, 5540 unsigned long start, unsigned long len) 5541 { 5542 size_t cur; 5543 size_t offset; 5544 struct page *page; 5545 char *kaddr; 5546 char __user *dst = (char __user *)dstv; 5547 size_t start_offset = offset_in_page(eb->start); 5548 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5549 int ret = 0; 5550 5551 WARN_ON(start > eb->len); 5552 WARN_ON(start + len > eb->start + eb->len); 5553 5554 offset = offset_in_page(start_offset + start); 5555 5556 while (len > 0) { 5557 page = eb->pages[i]; 5558 5559 cur = min(len, (PAGE_SIZE - offset)); 5560 kaddr = page_address(page); 5561 if (copy_to_user(dst, kaddr + offset, cur)) { 5562 ret = -EFAULT; 5563 break; 5564 } 5565 5566 dst += cur; 5567 len -= cur; 5568 offset = 0; 5569 i++; 5570 } 5571 5572 return ret; 5573 } 5574 5575 /* 5576 * return 0 if the item is found 
within a page. 5577 * return 1 if the item spans two pages. 5578 * return -EINVAL otherwise. 5579 */ 5580 int map_private_extent_buffer(const struct extent_buffer *eb, 5581 unsigned long start, unsigned long min_len, 5582 char **map, unsigned long *map_start, 5583 unsigned long *map_len) 5584 { 5585 size_t offset; 5586 char *kaddr; 5587 struct page *p; 5588 size_t start_offset = offset_in_page(eb->start); 5589 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5590 unsigned long end_i = (start_offset + start + min_len - 1) >> 5591 PAGE_SHIFT; 5592 5593 if (start + min_len > eb->len) { 5594 WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n", 5595 eb->start, eb->len, start, min_len); 5596 return -EINVAL; 5597 } 5598 5599 if (i != end_i) 5600 return 1; 5601 5602 if (i == 0) { 5603 offset = start_offset; 5604 *map_start = 0; 5605 } else { 5606 offset = 0; 5607 *map_start = ((u64)i << PAGE_SHIFT) - start_offset; 5608 } 5609 5610 p = eb->pages[i]; 5611 kaddr = page_address(p); 5612 *map = kaddr + offset; 5613 *map_len = PAGE_SIZE - offset; 5614 return 0; 5615 } 5616 5617 int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, 5618 unsigned long start, unsigned long len) 5619 { 5620 size_t cur; 5621 size_t offset; 5622 struct page *page; 5623 char *kaddr; 5624 char *ptr = (char *)ptrv; 5625 size_t start_offset = offset_in_page(eb->start); 5626 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5627 int ret = 0; 5628 5629 WARN_ON(start > eb->len); 5630 WARN_ON(start + len > eb->start + eb->len); 5631 5632 offset = offset_in_page(start_offset + start); 5633 5634 while (len > 0) { 5635 page = eb->pages[i]; 5636 5637 cur = min(len, (PAGE_SIZE - offset)); 5638 5639 kaddr = page_address(page); 5640 ret = memcmp(ptr, kaddr + offset, cur); 5641 if (ret) 5642 break; 5643 5644 ptr += cur; 5645 len -= cur; 5646 offset = 0; 5647 i++; 5648 } 5649 return ret; 5650 } 5651 5652 void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb, 5653 const void *srcv) 5654 { 5655 char *kaddr; 5656 5657 WARN_ON(!PageUptodate(eb->pages[0])); 5658 kaddr = page_address(eb->pages[0]); 5659 memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv, 5660 BTRFS_FSID_SIZE); 5661 } 5662 5663 void write_extent_buffer_fsid(struct extent_buffer *eb, const void *srcv) 5664 { 5665 char *kaddr; 5666 5667 WARN_ON(!PageUptodate(eb->pages[0])); 5668 kaddr = page_address(eb->pages[0]); 5669 memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv, 5670 BTRFS_FSID_SIZE); 5671 } 5672 5673 void write_extent_buffer(struct extent_buffer *eb, const void *srcv, 5674 unsigned long start, unsigned long len) 5675 { 5676 size_t cur; 5677 size_t offset; 5678 struct page *page; 5679 char *kaddr; 5680 char *src = (char *)srcv; 5681 size_t start_offset = offset_in_page(eb->start); 5682 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5683 5684 WARN_ON(start > eb->len); 5685 WARN_ON(start + len > eb->start + eb->len); 5686 5687 offset = offset_in_page(start_offset + start); 5688 5689 while (len > 0) { 5690 page = eb->pages[i]; 5691 WARN_ON(!PageUptodate(page)); 5692 5693 cur = min(len, PAGE_SIZE - offset); 5694 kaddr = page_address(page); 5695 memcpy(kaddr + offset, src, cur); 5696 5697 src += cur; 5698 len -= cur; 5699 offset = 0; 5700 i++; 5701 } 5702 } 5703 5704 void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start, 5705 unsigned long len) 5706 { 5707 size_t cur; 5708 size_t offset; 5709 struct page *page; 5710 char *kaddr; 5711 size_t start_offset = 
offset_in_page(eb->start); 5712 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5713 5714 WARN_ON(start > eb->len); 5715 WARN_ON(start + len > eb->start + eb->len); 5716 5717 offset = offset_in_page(start_offset + start); 5718 5719 while (len > 0) { 5720 page = eb->pages[i]; 5721 WARN_ON(!PageUptodate(page)); 5722 5723 cur = min(len, PAGE_SIZE - offset); 5724 kaddr = page_address(page); 5725 memset(kaddr + offset, 0, cur); 5726 5727 len -= cur; 5728 offset = 0; 5729 i++; 5730 } 5731 } 5732 5733 void copy_extent_buffer_full(struct extent_buffer *dst, 5734 struct extent_buffer *src) 5735 { 5736 int i; 5737 int num_pages; 5738 5739 ASSERT(dst->len == src->len); 5740 5741 num_pages = num_extent_pages(dst); 5742 for (i = 0; i < num_pages; i++) 5743 copy_page(page_address(dst->pages[i]), 5744 page_address(src->pages[i])); 5745 } 5746 5747 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, 5748 unsigned long dst_offset, unsigned long src_offset, 5749 unsigned long len) 5750 { 5751 u64 dst_len = dst->len; 5752 size_t cur; 5753 size_t offset; 5754 struct page *page; 5755 char *kaddr; 5756 size_t start_offset = offset_in_page(dst->start); 5757 unsigned long i = (start_offset + dst_offset) >> PAGE_SHIFT; 5758 5759 WARN_ON(src->len != dst_len); 5760 5761 offset = offset_in_page(start_offset + dst_offset); 5762 5763 while (len > 0) { 5764 page = dst->pages[i]; 5765 WARN_ON(!PageUptodate(page)); 5766 5767 cur = min(len, (unsigned long)(PAGE_SIZE - offset)); 5768 5769 kaddr = page_address(page); 5770 read_extent_buffer(src, kaddr + offset, src_offset, cur); 5771 5772 src_offset += cur; 5773 len -= cur; 5774 offset = 0; 5775 i++; 5776 } 5777 } 5778 5779 /* 5780 * eb_bitmap_offset() - calculate the page and offset of the byte containing the 5781 * given bit number 5782 * @eb: the extent buffer 5783 * @start: offset of the bitmap item in the extent buffer 5784 * @nr: bit number 5785 * @page_index: return index of the page in the extent buffer that contains the 5786 * given bit number 5787 * @page_offset: return offset into the page given by page_index 5788 * 5789 * This helper hides the ugliness of finding the byte in an extent buffer which 5790 * contains a given bit. 5791 */ 5792 static inline void eb_bitmap_offset(struct extent_buffer *eb, 5793 unsigned long start, unsigned long nr, 5794 unsigned long *page_index, 5795 size_t *page_offset) 5796 { 5797 size_t start_offset = offset_in_page(eb->start); 5798 size_t byte_offset = BIT_BYTE(nr); 5799 size_t offset; 5800 5801 /* 5802 * The byte we want is the offset of the extent buffer + the offset of 5803 * the bitmap item in the extent buffer + the offset of the byte in the 5804 * bitmap item. 
5805 */ 5806 offset = start_offset + start + byte_offset; 5807 5808 *page_index = offset >> PAGE_SHIFT; 5809 *page_offset = offset_in_page(offset); 5810 } 5811 5812 /** 5813 * extent_buffer_test_bit - determine whether a bit in a bitmap item is set 5814 * @eb: the extent buffer 5815 * @start: offset of the bitmap item in the extent buffer 5816 * @nr: bit number to test 5817 */ 5818 int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start, 5819 unsigned long nr) 5820 { 5821 u8 *kaddr; 5822 struct page *page; 5823 unsigned long i; 5824 size_t offset; 5825 5826 eb_bitmap_offset(eb, start, nr, &i, &offset); 5827 page = eb->pages[i]; 5828 WARN_ON(!PageUptodate(page)); 5829 kaddr = page_address(page); 5830 return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); 5831 } 5832 5833 /** 5834 * extent_buffer_bitmap_set - set an area of a bitmap 5835 * @eb: the extent buffer 5836 * @start: offset of the bitmap item in the extent buffer 5837 * @pos: bit number of the first bit 5838 * @len: number of bits to set 5839 */ 5840 void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, 5841 unsigned long pos, unsigned long len) 5842 { 5843 u8 *kaddr; 5844 struct page *page; 5845 unsigned long i; 5846 size_t offset; 5847 const unsigned int size = pos + len; 5848 int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE); 5849 u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos); 5850 5851 eb_bitmap_offset(eb, start, pos, &i, &offset); 5852 page = eb->pages[i]; 5853 WARN_ON(!PageUptodate(page)); 5854 kaddr = page_address(page); 5855 5856 while (len >= bits_to_set) { 5857 kaddr[offset] |= mask_to_set; 5858 len -= bits_to_set; 5859 bits_to_set = BITS_PER_BYTE; 5860 mask_to_set = ~0; 5861 if (++offset >= PAGE_SIZE && len > 0) { 5862 offset = 0; 5863 page = eb->pages[++i]; 5864 WARN_ON(!PageUptodate(page)); 5865 kaddr = page_address(page); 5866 } 5867 } 5868 if (len) { 5869 mask_to_set &= BITMAP_LAST_BYTE_MASK(size); 5870 kaddr[offset] |= mask_to_set; 5871 } 5872 } 5873 5874 5875 /** 5876 * extent_buffer_bitmap_clear - clear an area of a bitmap 5877 * @eb: the extent buffer 5878 * @start: offset of the bitmap item in the extent buffer 5879 * @pos: bit number of the first bit 5880 * @len: number of bits to clear 5881 */ 5882 void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start, 5883 unsigned long pos, unsigned long len) 5884 { 5885 u8 *kaddr; 5886 struct page *page; 5887 unsigned long i; 5888 size_t offset; 5889 const unsigned int size = pos + len; 5890 int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE); 5891 u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos); 5892 5893 eb_bitmap_offset(eb, start, pos, &i, &offset); 5894 page = eb->pages[i]; 5895 WARN_ON(!PageUptodate(page)); 5896 kaddr = page_address(page); 5897 5898 while (len >= bits_to_clear) { 5899 kaddr[offset] &= ~mask_to_clear; 5900 len -= bits_to_clear; 5901 bits_to_clear = BITS_PER_BYTE; 5902 mask_to_clear = ~0; 5903 if (++offset >= PAGE_SIZE && len > 0) { 5904 offset = 0; 5905 page = eb->pages[++i]; 5906 WARN_ON(!PageUptodate(page)); 5907 kaddr = page_address(page); 5908 } 5909 } 5910 if (len) { 5911 mask_to_clear &= BITMAP_LAST_BYTE_MASK(size); 5912 kaddr[offset] &= ~mask_to_clear; 5913 } 5914 } 5915 5916 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) 5917 { 5918 unsigned long distance = (src > dst) ? 
src - dst : dst - src; 5919 return distance < len; 5920 } 5921 5922 static void copy_pages(struct page *dst_page, struct page *src_page, 5923 unsigned long dst_off, unsigned long src_off, 5924 unsigned long len) 5925 { 5926 char *dst_kaddr = page_address(dst_page); 5927 char *src_kaddr; 5928 int must_memmove = 0; 5929 5930 if (dst_page != src_page) { 5931 src_kaddr = page_address(src_page); 5932 } else { 5933 src_kaddr = dst_kaddr; 5934 if (areas_overlap(src_off, dst_off, len)) 5935 must_memmove = 1; 5936 } 5937 5938 if (must_memmove) 5939 memmove(dst_kaddr + dst_off, src_kaddr + src_off, len); 5940 else 5941 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); 5942 } 5943 5944 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 5945 unsigned long src_offset, unsigned long len) 5946 { 5947 struct btrfs_fs_info *fs_info = dst->fs_info; 5948 size_t cur; 5949 size_t dst_off_in_page; 5950 size_t src_off_in_page; 5951 size_t start_offset = offset_in_page(dst->start); 5952 unsigned long dst_i; 5953 unsigned long src_i; 5954 5955 if (src_offset + len > dst->len) { 5956 btrfs_err(fs_info, 5957 "memmove bogus src_offset %lu move len %lu dst len %lu", 5958 src_offset, len, dst->len); 5959 BUG(); 5960 } 5961 if (dst_offset + len > dst->len) { 5962 btrfs_err(fs_info, 5963 "memmove bogus dst_offset %lu move len %lu dst len %lu", 5964 dst_offset, len, dst->len); 5965 BUG(); 5966 } 5967 5968 while (len > 0) { 5969 dst_off_in_page = offset_in_page(start_offset + dst_offset); 5970 src_off_in_page = offset_in_page(start_offset + src_offset); 5971 5972 dst_i = (start_offset + dst_offset) >> PAGE_SHIFT; 5973 src_i = (start_offset + src_offset) >> PAGE_SHIFT; 5974 5975 cur = min(len, (unsigned long)(PAGE_SIZE - 5976 src_off_in_page)); 5977 cur = min_t(unsigned long, cur, 5978 (unsigned long)(PAGE_SIZE - dst_off_in_page)); 5979 5980 copy_pages(dst->pages[dst_i], dst->pages[src_i], 5981 dst_off_in_page, src_off_in_page, cur); 5982 5983 src_offset += cur; 5984 dst_offset += cur; 5985 len -= cur; 5986 } 5987 } 5988 5989 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 5990 unsigned long src_offset, unsigned long len) 5991 { 5992 struct btrfs_fs_info *fs_info = dst->fs_info; 5993 size_t cur; 5994 size_t dst_off_in_page; 5995 size_t src_off_in_page; 5996 unsigned long dst_end = dst_offset + len - 1; 5997 unsigned long src_end = src_offset + len - 1; 5998 size_t start_offset = offset_in_page(dst->start); 5999 unsigned long dst_i; 6000 unsigned long src_i; 6001 6002 if (src_offset + len > dst->len) { 6003 btrfs_err(fs_info, 6004 "memmove bogus src_offset %lu move len %lu len %lu", 6005 src_offset, len, dst->len); 6006 BUG(); 6007 } 6008 if (dst_offset + len > dst->len) { 6009 btrfs_err(fs_info, 6010 "memmove bogus dst_offset %lu move len %lu len %lu", 6011 dst_offset, len, dst->len); 6012 BUG(); 6013 } 6014 if (dst_offset < src_offset) { 6015 memcpy_extent_buffer(dst, dst_offset, src_offset, len); 6016 return; 6017 } 6018 while (len > 0) { 6019 dst_i = (start_offset + dst_end) >> PAGE_SHIFT; 6020 src_i = (start_offset + src_end) >> PAGE_SHIFT; 6021 6022 dst_off_in_page = offset_in_page(start_offset + dst_end); 6023 src_off_in_page = offset_in_page(start_offset + src_end); 6024 6025 cur = min_t(unsigned long, len, src_off_in_page + 1); 6026 cur = min(cur, dst_off_in_page + 1); 6027 copy_pages(dst->pages[dst_i], dst->pages[src_i], 6028 dst_off_in_page - cur + 1, 6029 src_off_in_page - cur + 1, cur); 6030 6031 dst_end -= cur; 6032 src_end -= cur; 6033 len -= 
cur; 6034 } 6035 } 6036 6037 int try_release_extent_buffer(struct page *page) 6038 { 6039 struct extent_buffer *eb; 6040 6041 /* 6042 * We need to make sure nobody is attaching this page to an eb right 6043 * now. 6044 */ 6045 spin_lock(&page->mapping->private_lock); 6046 if (!PagePrivate(page)) { 6047 spin_unlock(&page->mapping->private_lock); 6048 return 1; 6049 } 6050 6051 eb = (struct extent_buffer *)page->private; 6052 BUG_ON(!eb); 6053 6054 /* 6055 * This is a little awful but should be ok, we need to make sure that 6056 * the eb doesn't disappear out from under us while we're looking at 6057 * this page. 6058 */ 6059 spin_lock(&eb->refs_lock); 6060 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { 6061 spin_unlock(&eb->refs_lock); 6062 spin_unlock(&page->mapping->private_lock); 6063 return 0; 6064 } 6065 spin_unlock(&page->mapping->private_lock); 6066 6067 /* 6068 * If tree ref isn't set then we know the ref on this eb is a real ref, 6069 * so just return, this page will likely be freed soon anyway. 6070 */ 6071 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { 6072 spin_unlock(&eb->refs_lock); 6073 return 0; 6074 } 6075 6076 return release_extent_buffer(eb); 6077 } 6078
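/*
 * Illustrative sketch (not part of this file): try_release_extent_buffer()
 * is meant to be called from the btree inode's releasepage hook, roughly
 * the way btree_releasepage() in disk-io.c uses it - only clean, idle
 * pages are allowed to give up their extent buffer:
 *
 *	static int btree_releasepage(struct page *page, gfp_t gfp_flags)
 *	{
 *		if (PageWriteback(page) || PageDirty(page))
 *			return 0;
 *
 *		return try_release_extent_buffer(page);
 *	}
 */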