// SPDX-License-Identifier: GPL-2.0

#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/cleancache.h>
#include "extent_io.h"
#include "extent-io-tree.h"
#include "extent_map.h"
#include "ctree.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "check-integrity.h"
#include "locking.h"
#include "rcu-string.h"
#include "backref.h"
#include "disk-io.h"

static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
static struct bio_set btrfs_bioset;

static inline bool extent_state_in_tree(const struct extent_state *state)
{
	return !RB_EMPTY_NODE(&state->rb_node);
}

#ifdef CONFIG_BTRFS_DEBUG
static LIST_HEAD(buffers);
static LIST_HEAD(states);

static DEFINE_SPINLOCK(leak_lock);

static inline
void btrfs_leak_debug_add(struct list_head *new, struct list_head *head)
{
	unsigned long flags;

	spin_lock_irqsave(&leak_lock, flags);
	list_add(new, head);
	spin_unlock_irqrestore(&leak_lock, flags);
}

static inline
void btrfs_leak_debug_del(struct list_head *entry)
{
	unsigned long flags;

	spin_lock_irqsave(&leak_lock, flags);
	list_del(entry);
	spin_unlock_irqrestore(&leak_lock, flags);
}

static inline void btrfs_extent_buffer_leak_debug_check(void)
{
	struct extent_buffer *eb;

	while (!list_empty(&buffers)) {
		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
		pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n",
		       eb->start, eb->len, atomic_read(&eb->refs), eb->bflags);
		list_del(&eb->leak_list);
		kmem_cache_free(extent_buffer_cache, eb);
	}
}

static inline void btrfs_extent_state_leak_debug_check(void)
{
	struct extent_state *state;

	while (!list_empty(&states)) {
		state = list_entry(states.next, struct extent_state, leak_list);
		pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
		       state->start, state->end, state->state,
		       extent_state_in_tree(state),
		       refcount_read(&state->refs));
		list_del(&state->leak_list);
		kmem_cache_free(extent_state_cache, state);
	}
}

#define btrfs_debug_check_extent_io_range(tree, start, end)		\
	__btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
static inline void __btrfs_debug_check_extent_io_range(const char *caller,
		struct extent_io_tree *tree, u64 start, u64 end)
{
	struct inode *inode = tree->private_data;
	u64 isize;

	if (!inode || !is_data_inode(inode))
		return;

	isize = i_size_read(inode);
	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
		btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
		    "%s: ino %llu isize %llu odd range [%llu,%llu]",
			caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
	}
}
#else
#define btrfs_leak_debug_add(new, head)	do {} while (0)
#define btrfs_leak_debug_del(entry)	do {} while (0)
#define btrfs_extent_buffer_leak_debug_check()	do {} while (0)
#define btrfs_extent_state_leak_debug_check()	do {} while (0)
#define btrfs_debug_check_extent_io_range(c, s, e)	do {} while (0)
#endif

struct tree_entry {
	u64 start;
	u64 end;
	struct rb_node rb_node;
};

struct extent_page_data {
	struct bio *bio;
	struct extent_io_tree *tree;
	/* tells writepage not to lock the state bits for this range
	 * it still does the unlocking
	 */
	unsigned int extent_locked:1;

	/* tells the submit_bio code to use REQ_SYNC */
	unsigned int sync_io:1;
};

static int add_extent_changeset(struct extent_state *state, unsigned bits,
				 struct extent_changeset *changeset,
				 int set)
{
	int ret;

	if (!changeset)
		return 0;
	if (set && (state->state & bits) == bits)
		return 0;
	if (!set && (state->state & bits) == 0)
		return 0;
	changeset->bytes_changed += state->end - state->start + 1;
	ret = ulist_add(&changeset->range_changed, state->start, state->end,
			GFP_ATOMIC);
	return ret;
}

static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
				       unsigned long bio_flags)
{
	blk_status_t ret = 0;
	struct extent_io_tree *tree = bio->bi_private;

	bio->bi_private = NULL;

	if (tree->ops)
		ret = tree->ops->submit_bio_hook(tree->private_data, bio,
						 mirror_num, bio_flags);
	else
		btrfsic_submit_bio(bio);

	return blk_status_to_errno(ret);
}

/* Cleanup unsubmitted bios */
static void end_write_bio(struct extent_page_data *epd, int ret)
{
	if (epd->bio) {
		epd->bio->bi_status = errno_to_blk_status(ret);
		bio_endio(epd->bio);
		epd->bio = NULL;
	}
}

/*
 * Submit bio from extent page data via submit_one_bio
 *
 * Return 0 if everything is OK.
 * Return <0 for error.
 */
static int __must_check flush_write_bio(struct extent_page_data *epd)
{
	int ret = 0;

	if (epd->bio) {
		ret = submit_one_bio(epd->bio, 0, 0);
		/*
		 * Clean up of epd->bio is handled by its endio function.
		 * And endio is either triggered by successful bio execution
		 * or the error handler of submit bio hook.
		 * So at this point, no matter what happened, we don't need
		 * to clean up epd->bio.
		 */
		epd->bio = NULL;
	}
	return ret;
}

int __init extent_state_cache_init(void)
{
	extent_state_cache = kmem_cache_create("btrfs_extent_state",
			sizeof(struct extent_state), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!extent_state_cache)
		return -ENOMEM;
	return 0;
}

int __init extent_io_init(void)
{
	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
			sizeof(struct extent_buffer), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!extent_buffer_cache)
		return -ENOMEM;

	if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_io_bio, bio),
			BIOSET_NEED_BVECS))
		goto free_buffer_cache;

	if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE))
		goto free_bioset;

	return 0;

free_bioset:
	bioset_exit(&btrfs_bioset);

free_buffer_cache:
	kmem_cache_destroy(extent_buffer_cache);
	extent_buffer_cache = NULL;
	return -ENOMEM;
}

void __cold extent_state_cache_exit(void)
{
	btrfs_extent_state_leak_debug_check();
	kmem_cache_destroy(extent_state_cache);
}

void __cold extent_io_exit(void)
{
	btrfs_extent_buffer_leak_debug_check();

	/*
	 * Make sure all delayed rcu free are flushed before we
	 * destroy caches.
	 */
	rcu_barrier();
	kmem_cache_destroy(extent_buffer_cache);
	bioset_exit(&btrfs_bioset);
}

void extent_io_tree_init(struct btrfs_fs_info *fs_info,
			 struct extent_io_tree *tree, unsigned int owner,
			 void *private_data)
{
	tree->fs_info = fs_info;
	tree->state = RB_ROOT;
	tree->ops = NULL;
	tree->dirty_bytes = 0;
	spin_lock_init(&tree->lock);
	tree->private_data = private_data;
	tree->owner = owner;
}

void extent_io_tree_release(struct extent_io_tree *tree)
{
	spin_lock(&tree->lock);
	/*
	 * Do a single barrier for the waitqueue_active check here, the state
	 * of the waitqueue should not change once extent_io_tree_release is
	 * called.
	 */
	smp_mb();
	while (!RB_EMPTY_ROOT(&tree->state)) {
		struct rb_node *node;
		struct extent_state *state;

		node = rb_first(&tree->state);
		state = rb_entry(node, struct extent_state, rb_node);
		rb_erase(&state->rb_node, &tree->state);
		RB_CLEAR_NODE(&state->rb_node);
		/*
		 * btree io trees aren't supposed to have tasks waiting for
		 * changes in the flags of extent states ever.
		 */
		ASSERT(!waitqueue_active(&state->wq));
		free_extent_state(state);

		cond_resched_lock(&tree->lock);
	}
	spin_unlock(&tree->lock);
}

static struct extent_state *alloc_extent_state(gfp_t mask)
{
	struct extent_state *state;

	/*
	 * The given mask might not be appropriate for the slab allocator,
	 * drop the unsupported bits
	 */
	mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
	state = kmem_cache_alloc(extent_state_cache, mask);
	if (!state)
		return state;
	state->state = 0;
	state->failrec = NULL;
	RB_CLEAR_NODE(&state->rb_node);
	btrfs_leak_debug_add(&state->leak_list, &states);
	refcount_set(&state->refs, 1);
	init_waitqueue_head(&state->wq);
	trace_alloc_extent_state(state, mask, _RET_IP_);
	return state;
}

void free_extent_state(struct extent_state *state)
{
	if (!state)
		return;
	if (refcount_dec_and_test(&state->refs)) {
		WARN_ON(extent_state_in_tree(state));
		btrfs_leak_debug_del(&state->leak_list);
		trace_free_extent_state(state, _RET_IP_);
		kmem_cache_free(extent_state_cache, state);
	}
}

static struct rb_node *tree_insert(struct rb_root *root,
				   struct rb_node *search_start,
				   u64 offset,
				   struct rb_node *node,
				   struct rb_node ***p_in,
				   struct rb_node **parent_in)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct tree_entry *entry;

	if (p_in && parent_in) {
		p = *p_in;
		parent = *parent_in;
		goto do_insert;
	}

	p = search_start ? &search_start : &root->rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct tree_entry, rb_node);

		if (offset < entry->start)
			p = &(*p)->rb_left;
		else if (offset > entry->end)
			p = &(*p)->rb_right;
		else
			return parent;
	}

do_insert:
	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

/**
 * __etree_search - search @tree for an entry that contains @offset. Such
 * entry would have entry->start <= offset && entry->end >= offset.
 *
 * @tree - the tree to search
 * @offset - offset that should fall within an entry in @tree
 * @next_ret - pointer to the first entry whose range ends after @offset
 * @prev_ret - pointer to the first entry whose range begins before @offset
 * @p_ret - pointer where new node should be anchored (used when inserting an
 *	    entry in the tree)
 * @parent_ret - points to entry which would have been the parent of the entry,
 *               containing @offset
 *
 * This function returns a pointer to the entry that contains @offset byte
 * address. If no such entry exists, then NULL is returned and the other
 * pointer arguments to the function are filled, otherwise the found entry is
 * returned and other pointers are left untouched.
 */
static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
				      struct rb_node **next_ret,
				      struct rb_node **prev_ret,
				      struct rb_node ***p_ret,
				      struct rb_node **parent_ret)
{
	struct rb_root *root = &tree->state;
	struct rb_node **n = &root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *orig_prev = NULL;
	struct tree_entry *entry;
	struct tree_entry *prev_entry = NULL;

	while (*n) {
		prev = *n;
		entry = rb_entry(prev, struct tree_entry, rb_node);
		prev_entry = entry;

		if (offset < entry->start)
			n = &(*n)->rb_left;
		else if (offset > entry->end)
			n = &(*n)->rb_right;
		else
			return *n;
	}

	if (p_ret)
		*p_ret = n;
	if (parent_ret)
		*parent_ret = prev;

	if (next_ret) {
		orig_prev = prev;
		while (prev && offset > prev_entry->end) {
			prev = rb_next(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*next_ret = prev;
		prev = orig_prev;
	}

	if (prev_ret) {
		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		while (prev && offset < prev_entry->start) {
			prev = rb_prev(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*prev_ret = prev;
	}
	return NULL;
}

static inline struct rb_node *
tree_search_for_insert(struct extent_io_tree *tree,
		       u64 offset,
		       struct rb_node ***p_ret,
		       struct rb_node **parent_ret)
{
	struct rb_node *next = NULL;
	struct rb_node *ret;

	ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret);
	if (!ret)
		return next;
	return ret;
}

static inline struct rb_node *tree_search(struct extent_io_tree *tree,
					  u64 offset)
{
	return tree_search_for_insert(tree, offset, NULL, NULL);
}

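/*
 * For example (illustrative values): with states [0, 4095] and [8192, 12287]
 * in the tree, tree_search(tree, 4096) returns the node of [8192, 12287], the
 * first entry whose range ends at or after the searched offset, while
 * tree_search(tree, 8192) returns that entry's node directly because the
 * offset falls inside it.
 */
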
/*
 * utility function to look for merge candidates inside a given range.
 * Any extents with matching state are merged together into a single
 * extent in the tree.  Extents with EXTENT_IO in their state field
 * are not merged because the end_io handlers need to be able to do
 * operations on them without sleeping (or doing allocations/splits).
 *
 * This should be called with the tree lock held.
 */
static void merge_state(struct extent_io_tree *tree,
			struct extent_state *state)
{
	struct extent_state *other;
	struct rb_node *other_node;

	if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
		return;

	other_node = rb_prev(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->end == state->start - 1 &&
		    other->state == state->state) {
			if (tree->private_data &&
			    is_data_inode(tree->private_data))
				btrfs_merge_delalloc_extent(tree->private_data,
							    state, other);
			state->start = other->start;
			rb_erase(&other->rb_node, &tree->state);
			RB_CLEAR_NODE(&other->rb_node);
			free_extent_state(other);
		}
	}
	other_node = rb_next(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->start == state->end + 1 &&
		    other->state == state->state) {
			if (tree->private_data &&
			    is_data_inode(tree->private_data))
				btrfs_merge_delalloc_extent(tree->private_data,
							    state, other);
			state->end = other->end;
			rb_erase(&other->rb_node, &tree->state);
			RB_CLEAR_NODE(&other->rb_node);
			free_extent_state(other);
		}
	}
}

static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state, unsigned *bits,
			   struct extent_changeset *changeset);

/*
 * insert an extent_state struct into the tree.  'bits' are set on the
 * struct before it is inserted.
 *
 * This may return -EEXIST if the extent is already there, in which case the
 * state struct is freed.
 *
 * The tree lock is not taken internally.  This is a utility function and
 * probably isn't what you want to call (see set/clear_extent_bit).
 */
static int insert_state(struct extent_io_tree *tree,
			struct extent_state *state, u64 start, u64 end,
			struct rb_node ***p,
			struct rb_node **parent,
			unsigned *bits, struct extent_changeset *changeset)
{
	struct rb_node *node;

	if (end < start) {
		btrfs_err(tree->fs_info,
			"insert state: end < start %llu %llu", end, start);
		WARN_ON(1);
	}
	state->start = start;
	state->end = end;

	set_state_bits(tree, state, bits, changeset);

	node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
	if (node) {
		struct extent_state *found;
		found = rb_entry(node, struct extent_state, rb_node);
		btrfs_err(tree->fs_info,
		       "found node %llu %llu on insert of %llu %llu",
		       found->start, found->end, start, end);
		return -EEXIST;
	}
	merge_state(tree, state);
	return 0;
}

/*
 * split a given extent state struct in two, inserting the preallocated
 * struct 'prealloc' as the newly created second half.  'split' indicates an
 * offset inside 'orig' where it should be split.
 *
 * Before calling,
 * the tree has 'orig' at [orig->start, orig->end].  After calling, there
 * are two extent state structs in the tree:
 * prealloc: [orig->start, split - 1]
 * orig: [ split, orig->end ]
 *
 * The tree locks are not taken by this function. They need to be held
 * by the caller.
 */
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
		       struct extent_state *prealloc, u64 split)
{
	struct rb_node *node;

	if (tree->private_data && is_data_inode(tree->private_data))
		btrfs_split_delalloc_extent(tree->private_data, orig, split);

	prealloc->start = orig->start;
	prealloc->end = split - 1;
	prealloc->state = orig->state;
	orig->start = split;

	node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
			   &prealloc->rb_node, NULL, NULL);
	if (node) {
		free_extent_state(prealloc);
		return -EEXIST;
	}
	return 0;
}

static struct extent_state *next_state(struct extent_state *state)
{
	struct rb_node *next = rb_next(&state->rb_node);
	if (next)
		return rb_entry(next, struct extent_state, rb_node);
	else
		return NULL;
}

/*
 * utility function to clear some bits in an extent state struct.
 * it will optionally wake up anyone waiting on this state (wake == 1).
 *
 * If no bits are set on the state struct after clearing things, the
 * struct is freed and removed from the tree
 */
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
					    struct extent_state *state,
					    unsigned *bits, int wake,
					    struct extent_changeset *changeset)
{
	struct extent_state *next;
	unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
	int ret;

	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		WARN_ON(range > tree->dirty_bytes);
		tree->dirty_bytes -= range;
	}

	if (tree->private_data && is_data_inode(tree->private_data))
		btrfs_clear_delalloc_extent(tree->private_data, state, bits);

	ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
	BUG_ON(ret < 0);
	state->state &= ~bits_to_clear;
	if (wake)
		wake_up(&state->wq);
	if (state->state == 0) {
		next = next_state(state);
		if (extent_state_in_tree(state)) {
			rb_erase(&state->rb_node, &tree->state);
			RB_CLEAR_NODE(&state->rb_node);
			free_extent_state(state);
		} else {
			WARN_ON(1);
		}
	} else {
		merge_state(tree, state);
		next = next_state(state);
	}
	return next;
}

static struct extent_state *
alloc_extent_state_atomic(struct extent_state *prealloc)
{
	if (!prealloc)
		prealloc = alloc_extent_state(GFP_ATOMIC);

	return prealloc;
}

static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
{
	struct inode *inode = tree->private_data;

	btrfs_panic(btrfs_sb(inode->i_sb), err,
	"locking error: extent tree was modified by another thread while locked");
}

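/*
 * For example (illustrative values): clearing EXTENT_DELALLOC on [4096, 8191]
 * while a single delalloc state covers [0, 12287] first splits off [0, 4095],
 * then [4096, 8191], clears the bit on the middle piece (freeing it if no
 * other bits remain) and leaves [0, 4095] and [8192, 12287] with the bit
 * still set.
 */
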
/*
 * clear some bits on a range in the tree.  This may require splitting
 * or inserting elements in the tree, so the gfp mask is used to
 * indicate which allocations or sleeping are allowed.
 *
 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 * the given range from the tree regardless of state (ie for truncate).
 *
 * the range [start, end] is inclusive.
 *
 * This takes the tree lock, and returns 0 on success and < 0 on error.
 */
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		       unsigned bits, int wake, int delete,
		       struct extent_state **cached_state,
		       gfp_t mask, struct extent_changeset *changeset)
{
	struct extent_state *state;
	struct extent_state *cached;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	u64 last_end;
	int err;
	int clear = 0;

	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);

	if (bits & EXTENT_DELALLOC)
		bits |= EXTENT_NORESERVE;

	if (delete)
		bits |= ~EXTENT_CTLBITS;

	if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
		clear = 1;
again:
	if (!prealloc && gfpflags_allow_blocking(mask)) {
		/*
		 * Don't care for allocation failure here because we might end
		 * up not needing the pre-allocated extent state at all, which
		 * is the case if we only have in the tree extent states that
		 * cover our input range and don't cover any other range.
		 * If we end up needing a new extent state we allocate it later.
		 */
		prealloc = alloc_extent_state(mask);
	}

	spin_lock(&tree->lock);
	if (cached_state) {
		cached = *cached_state;

		if (clear) {
			*cached_state = NULL;
			cached_state = NULL;
		}

		if (cached && extent_state_in_tree(cached) &&
		    cached->start <= start && cached->end > start) {
			if (clear)
				refcount_dec(&cached->refs);
			state = cached;
			goto hit_next;
		}
		if (clear)
			free_extent_state(cached);
	}
	/*
	 * this search will find the extents that end after
	 * our range starts
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	if (state->start > end)
		goto out;
	WARN_ON(state->end < start);
	last_end = state->end;

	/* the state doesn't have the wanted bits, go ahead */
	if (!(state->state & bits)) {
		state = next_state(state);
		goto next;
	}

	/*
	 *     | ---- desired range ---- |
	 *  | state | or
	 *  | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip
	 * bits on second half.
	 *
	 * If the extent we found extends past our range, we
	 * just split and search again.  It'll get split again
	 * the next time though.
	 *
	 * If the extent we found is inside our range, we clear
	 * the desired bit on it.
	 */

	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			state = clear_state_bit(tree, state, &bits, wake,
						changeset);
			goto next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and clear the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		if (wake)
			wake_up(&state->wq);

		clear_state_bit(tree, prealloc, &bits, wake, changeset);

		prealloc = NULL;
		goto out;
	}

	state = clear_state_bit(tree, state, &bits, wake, changeset);
next:
	if (last_end == (u64)-1)
		goto out;
	start = last_end + 1;
	if (start <= end && state && !need_resched())
		goto hit_next;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (gfpflags_allow_blocking(mask))
		cond_resched();
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return 0;

}

static void wait_on_state(struct extent_io_tree *tree,
			  struct extent_state *state)
		__releases(tree->lock)
		__acquires(tree->lock)
{
	DEFINE_WAIT(wait);
	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
	spin_unlock(&tree->lock);
	schedule();
	spin_lock(&tree->lock);
	finish_wait(&state->wq, &wait);
}

/*
 * waits for one or more bits to clear on a range in the state tree.
 * The range [start, end] is inclusive.
 * The tree lock is taken by this function
 */
static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
			    unsigned long bits)
{
	struct extent_state *state;
	struct rb_node *node;

	btrfs_debug_check_extent_io_range(tree, start, end);

	spin_lock(&tree->lock);
again:
	while (1) {
		/*
		 * this search will find all the extents that end after
		 * our range starts
		 */
		node = tree_search(tree, start);
process_node:
		if (!node)
			break;

		state = rb_entry(node, struct extent_state, rb_node);

		if (state->start > end)
			goto out;

		if (state->state & bits) {
			start = state->start;
			refcount_inc(&state->refs);
			wait_on_state(tree, state);
			free_extent_state(state);
			goto again;
		}
		start = state->end + 1;

		if (start > end)
			break;

		if (!cond_resched_lock(&tree->lock)) {
			node = rb_next(node);
			goto process_node;
		}
	}
out:
	spin_unlock(&tree->lock);
}

static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state,
			   unsigned *bits, struct extent_changeset *changeset)
{
	unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
	int ret;

	if (tree->private_data && is_data_inode(tree->private_data))
		btrfs_set_delalloc_extent(tree->private_data, state, bits);

	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		tree->dirty_bytes += range;
	}
	ret = add_extent_changeset(state, bits_to_set, changeset, 1);
	BUG_ON(ret < 0);
	state->state |= bits_to_set;
}

static void cache_state_if_flags(struct extent_state *state,
				 struct extent_state **cached_ptr,
				 unsigned flags)
{
	if (cached_ptr && !(*cached_ptr)) {
		if (!flags || (state->state & flags)) {
			*cached_ptr = state;
			refcount_inc(&state->refs);
		}
	}
}

static void cache_state(struct extent_state *state,
			struct extent_state **cached_ptr)
{
	return cache_state_if_flags(state, cached_ptr,
				    EXTENT_LOCKED | EXTENT_BOUNDARY);
}

/*
 * set some bits on a range in the tree.  This may require allocations or
 * sleeping, so the gfp mask is used to indicate what is allowed.
 *
 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 * part of the range already has the desired bits set.  The start of the
 * existing range is returned in failed_start in this case.
 *
 * [start, end] is inclusive.  This takes the tree lock.
 */

static int __must_check
__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		 unsigned bits, unsigned exclusive_bits,
		 u64 *failed_start, struct extent_state **cached_state,
		 gfp_t mask, struct extent_changeset *changeset)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	struct rb_node **p;
	struct rb_node *parent;
	int err = 0;
	u64 last_start;
	u64 last_end;

	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);

again:
	if (!prealloc && gfpflags_allow_blocking(mask)) {
		/*
		 * Don't care for allocation failure here because we might end
		 * up not needing the pre-allocated extent state at all, which
		 * is the case if we only have in the tree extent states that
		 * cover our input range and don't cover any other range.
		 * If we end up needing a new extent state we allocate it later.
		 */
		prealloc = alloc_extent_state(mask);
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start <= start && state->end > start &&
		    extent_state_in_tree(state)) {
			node = &state->rb_node;
			goto hit_next;
		}
	}
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search_for_insert(tree, start, &p, &parent);
	if (!node) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = insert_state(tree, prealloc, start, end,
				   &p, &parent, &bits, changeset);
		if (err)
			extent_io_tree_panic(tree, err);

		cache_state(prealloc, cached_state);
		prealloc = NULL;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		if (state->state & exclusive_bits) {
			*failed_start = state->start;
			err = -EEXIST;
			goto out;
		}

		set_state_bits(tree, state, &bits, changeset);
		cache_state(state, cached_state);
		merge_state(tree, state);
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		state = next_state(state);
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again.  It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, &bits, changeset);
			cache_state(state, cached_state);
			merge_state(tree, state);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			state = next_state(state);
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);

		/*
		 * Avoid freeing 'prealloc' if it can be merged with
		 * the later extent.
		 */
		err = insert_state(tree, prealloc, start, this_end,
				   NULL, NULL, &bits, changeset);
		if (err)
			extent_io_tree_panic(tree, err);

		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		set_state_bits(tree, prealloc, &bits, changeset);
		cache_state(prealloc, cached_state);
		merge_state(tree, prealloc);
		prealloc = NULL;
		goto out;
	}

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (gfpflags_allow_blocking(mask))
		cond_resched();
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;

}

int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		   unsigned bits, u64 *failed_start,
		   struct extent_state **cached_state, gfp_t mask)
{
	return __set_extent_bit(tree, start, end, bits, 0, failed_start,
				cached_state, mask, NULL);
}


/**
 * convert_extent_bit - convert all bits in a given range from one bit to
 * 			another
 * @tree:	the io tree to search
 * @start:	the start offset in bytes
 * @end:	the end offset in bytes (inclusive)
 * @bits:	the bits to set in this range
 * @clear_bits:	the bits to clear in this range
 * @cached_state:	state that we're going to cache
 *
 * This will go through and set bits for the given range.  If any states exist
 * already in this range they are set with the given bit and cleared of the
 * clear_bits.  This is only meant to be used by things that are mergeable, ie
 * converting from say DELALLOC to DIRTY.  This is not meant to be used with
 * boundary bits like LOCK.
 *
 * All allocations are done with GFP_NOFS.
 */
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		       unsigned bits, unsigned clear_bits,
		       struct extent_state **cached_state)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	struct rb_node **p;
	struct rb_node *parent;
	int err = 0;
	u64 last_start;
	u64 last_end;
	bool first_iteration = true;

	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
				       clear_bits);

again:
	if (!prealloc) {
		/*
		 * Best effort, don't worry if extent state allocation fails
		 * here for the first iteration. We might have a cached state
		 * that matches exactly the target range, in which case no
		 * extent state allocations are needed. We'll only know this
		 * after locking the tree.
		 */
		prealloc = alloc_extent_state(GFP_NOFS);
		if (!prealloc && !first_iteration)
			return -ENOMEM;
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start <= start && state->end > start &&
		    extent_state_in_tree(state)) {
			node = &state->rb_node;
			goto hit_next;
		}
	}

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search_for_insert(tree, start, &p, &parent);
	if (!node) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		err = insert_state(tree, prealloc, start, end,
				   &p, &parent, &bits, NULL);
		if (err)
			extent_io_tree_panic(tree, err);
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		set_state_bits(tree, state, &bits, NULL);
		cache_state(state, cached_state);
		state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again.  It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, &bits, NULL);
			cache_state(state, cached_state);
			state = clear_state_bit(tree, state, &clear_bits, 0,
						NULL);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		/*
		 * Avoid freeing 'prealloc' if it can be merged with
		 * the later extent.
		 */
		err = insert_state(tree, prealloc, start, this_end,
				   NULL, NULL, &bits, NULL);
		if (err)
			extent_io_tree_panic(tree, err);
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		set_state_bits(tree, prealloc, &bits, NULL);
		cache_state(prealloc, cached_state);
		clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
		prealloc = NULL;
		goto out;
	}

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	cond_resched();
	first_iteration = false;
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;
}

/* wrappers around set/clear extent bit */
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
			   unsigned bits, struct extent_changeset *changeset)
{
	/*
	 * We don't support EXTENT_LOCKED yet, as current changeset will
	 * record any bits changed, so for EXTENT_LOCKED case, it will
	 * either fail with -EEXIST or changeset will record the whole
	 * range.
	 */
	BUG_ON(bits & EXTENT_LOCKED);

	return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
				changeset);
}

int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
			   unsigned bits)
{
	return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL,
				GFP_NOWAIT, NULL);
}

int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		     unsigned bits, int wake, int delete,
		     struct extent_state **cached)
{
	return __clear_extent_bit(tree, start, end, bits, wake, delete,
				  cached, GFP_NOFS, NULL);
}

int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
			     unsigned bits, struct extent_changeset *changeset)
{
	/*
	 * Don't support EXTENT_LOCKED case, same reason as
	 * set_record_extent_bits().
	 */
	BUG_ON(bits & EXTENT_LOCKED);

	return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS,
				  changeset);
}

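/*
 * A typical caller pattern built on the helpers above and below (sketch only;
 * unlock_extent_cached() is the clearing-side helper used later in this file):
 *
 *	struct extent_state *cached = NULL;
 *
 *	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, end, &cached);
 *	... operate on the locked range ...
 *	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, end, &cached);
 */
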
/*
 * either insert or lock state struct between start and end; use mask to tell
 * us if waiting is desired.
 */
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		     struct extent_state **cached_state)
{
	int err;
	u64 failed_start;

	while (1) {
		err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
				       EXTENT_LOCKED, &failed_start,
				       cached_state, GFP_NOFS, NULL);
		if (err == -EEXIST) {
			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
			start = failed_start;
		} else
			break;
		WARN_ON(start > end);
	}
	return err;
}

int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
	int err;
	u64 failed_start;

	err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
			       &failed_start, NULL, GFP_NOFS, NULL);
	if (err == -EEXIST) {
		if (failed_start > start)
			clear_extent_bit(tree, start, failed_start - 1,
					 EXTENT_LOCKED, 1, 0, NULL);
		return 0;
	}
	return 1;
}

void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_SHIFT;
	unsigned long end_index = end >> PAGE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(inode->i_mapping, index);
		BUG_ON(!page); /* Pages should be in the extent_io_tree */
		clear_page_dirty_for_io(page);
		put_page(page);
		index++;
	}
}

void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_SHIFT;
	unsigned long end_index = end >> PAGE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(inode->i_mapping, index);
		BUG_ON(!page); /* Pages should be in the extent_io_tree */
		__set_page_dirty_nobuffers(page);
		account_page_redirty(page);
		put_page(page);
		index++;
	}
}

/* find the first state struct with 'bits' set after 'start', and
 * return it.  tree->lock must be held.  NULL will be returned if
 * nothing was found after 'start'
 */
static struct extent_state *
find_first_extent_bit_state(struct extent_io_tree *tree,
			    u64 start, unsigned bits)
{
	struct rb_node *node;
	struct extent_state *state;

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->end >= start && (state->state & bits))
			return state;

		node = rb_next(node);
		if (!node)
			break;
	}
out:
	return NULL;
}

/*
 * find the first offset in the io tree with 'bits' set. zero is
 * returned if we find something, and *start_ret and *end_ret are
 * set to reflect the state struct that was found.
 *
 * If nothing was found, 1 is returned. If found something, return 0.
 */
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
			  u64 *start_ret, u64 *end_ret, unsigned bits,
			  struct extent_state **cached_state)
{
	struct extent_state *state;
	int ret = 1;

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->end == start - 1 && extent_state_in_tree(state)) {
			while ((state = next_state(state)) != NULL) {
				if (state->state & bits)
					goto got_it;
			}
			free_extent_state(*cached_state);
			*cached_state = NULL;
			goto out;
		}
		free_extent_state(*cached_state);
		*cached_state = NULL;
	}

	state = find_first_extent_bit_state(tree, start, bits);
got_it:
	if (state) {
		cache_state_if_flags(state, cached_state, 0);
		*start_ret = state->start;
		*end_ret = state->end;
		ret = 0;
	}
out:
	spin_unlock(&tree->lock);
	return ret;
}

/**
 * find_first_clear_extent_bit - find the first range that has @bits not set.
 * This range could start before @start.
 *
 * @tree - the tree to search
 * @start - the offset at/after which the found extent should start
 * @start_ret - records the beginning of the range
 * @end_ret - records the end of the range (inclusive)
 * @bits - the set of bits which must be unset
 *
 * Since unallocated range is also considered one which doesn't have the bits
 * set it's possible that @end_ret contains -1, this happens in case the range
 * spans (last_range_end, end of device]. In this case it's up to the caller to
 * trim @end_ret to the appropriate size.
 */
void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
				 u64 *start_ret, u64 *end_ret, unsigned bits)
{
	struct extent_state *state;
	struct rb_node *node, *prev = NULL, *next;

	spin_lock(&tree->lock);

	/* Find first extent with bits cleared */
	while (1) {
		node = __etree_search(tree, start, &next, &prev, NULL, NULL);
		if (!node) {
			node = next;
			if (!node) {
				/*
				 * We are past the last allocated chunk,
				 * set start at the end of the last extent. The
				 * device alloc tree should never be empty so
				 * prev is always set.
				 */
				ASSERT(prev);
				state = rb_entry(prev, struct extent_state, rb_node);
				*start_ret = state->end + 1;
				*end_ret = -1;
				goto out;
			}
		}
		/*
		 * At this point 'node' either contains 'start' or start is
		 * before 'node'
		 */
		state = rb_entry(node, struct extent_state, rb_node);

		if (in_range(start, state->start, state->end - state->start + 1)) {
			if (state->state & bits) {
				/*
				 * |--range with bits sets--|
				 *    |
				 *    start
				 */
				start = state->end + 1;
			} else {
				/*
				 * 'start' falls within a range that doesn't
				 * have the bits set, so take its start as
				 * the beginning of the desired range
				 *
				 * |--range with bits cleared----|
				 *      |
				 *      start
				 */
				*start_ret = state->start;
				break;
			}
		} else {
			/*
			 * |---prev range---|---hole/unset---|---node range---|
			 *                          |
			 *                        start
			 *
			 * or
			 *
			 * |---hole/unset--||--first node--|
			 * 0   |
			 *     start
			 */
			if (prev) {
				state = rb_entry(prev, struct extent_state,
						 rb_node);
				*start_ret = state->end + 1;
			} else {
				*start_ret = 0;
			}
			break;
		}
	}

	/*
	 * Find the longest stretch from start until an entry which has the
	 * bits set
	 */
	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->end >= start && !(state->state & bits)) {
			*end_ret = state->end;
		} else {
			*end_ret = state->start - 1;
			break;
		}

		node = rb_next(node);
		if (!node)
			break;
	}
out:
	spin_unlock(&tree->lock);
}

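/*
 * For example (illustrative values): if @bits are set only on [0, 64K - 1] and
 * [128K, 192K - 1], a search starting at 70K reports the hole [64K, 128K - 1];
 * a search starting past the last range reports [192K, (u64)-1] and the caller
 * is expected to trim the end.
 */
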
/*
 * find a contiguous range of bytes in the file marked as delalloc, not
 * more than 'max_bytes'.  start and end are used to return the range,
 *
 * true is returned if we find something, false if nothing was in the tree
 */
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
			       u64 *end, u64 max_bytes,
			       struct extent_state **cached_state)
{
	struct rb_node *node;
	struct extent_state *state;
	u64 cur_start = *start;
	bool found = false;
	u64 total_bytes = 0;

	spin_lock(&tree->lock);

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, cur_start);
	if (!node) {
		*end = (u64)-1;
		goto out;
	}

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (found && (state->start != cur_start ||
			      (state->state & EXTENT_BOUNDARY))) {
			goto out;
		}
		if (!(state->state & EXTENT_DELALLOC)) {
			if (!found)
				*end = state->end;
			goto out;
		}
		if (!found) {
			*start = state->start;
			*cached_state = state;
			refcount_inc(&state->refs);
		}
		found = true;
		*end = state->end;
		cur_start = state->end + 1;
		node = rb_next(node);
		total_bytes += state->end - state->start + 1;
		if (total_bytes >= max_bytes)
			break;
		if (!node)
			break;
	}
out:
	spin_unlock(&tree->lock);
	return found;
}

static int __process_pages_contig(struct address_space *mapping,
				  struct page *locked_page,
				  pgoff_t start_index, pgoff_t end_index,
				  unsigned long page_ops, pgoff_t *index_ret);

static noinline void __unlock_for_delalloc(struct inode *inode,
					   struct page *locked_page,
					   u64 start, u64 end)
{
	unsigned long index = start >> PAGE_SHIFT;
	unsigned long end_index = end >> PAGE_SHIFT;

	ASSERT(locked_page);
	if (index == locked_page->index && end_index == index)
		return;

	__process_pages_contig(inode->i_mapping, locked_page, index, end_index,
			       PAGE_UNLOCK, NULL);
}

static noinline int lock_delalloc_pages(struct inode *inode,
					struct page *locked_page,
					u64 delalloc_start,
					u64 delalloc_end)
{
	unsigned long index = delalloc_start >> PAGE_SHIFT;
	unsigned long index_ret = index;
	unsigned long end_index = delalloc_end >> PAGE_SHIFT;
	int ret;

	ASSERT(locked_page);
	if (index == locked_page->index && index == end_index)
		return 0;

	ret = __process_pages_contig(inode->i_mapping, locked_page, index,
				     end_index, PAGE_LOCK, &index_ret);
	if (ret == -EAGAIN)
		__unlock_for_delalloc(inode, locked_page, delalloc_start,
				      (u64)index_ret << PAGE_SHIFT);
	return ret;
}

/*
 * Find and lock a contiguous range of bytes in the file marked as delalloc, no
 * more than @max_bytes.  @start and @end are used to return the range,
 *
 * Return: true if we find something
 *         false if nothing was in the tree
 */
EXPORT_FOR_TESTS
noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
				    struct page *locked_page, u64 *start,
				    u64 *end)
{
	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
	u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
	u64 delalloc_start;
	u64 delalloc_end;
	bool found;
	struct extent_state *cached_state = NULL;
	int ret;
	int loops = 0;

again:
	/* step one, find a bunch of delalloc bytes starting at start */
	delalloc_start = *start;
	delalloc_end = 0;
	found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
					  max_bytes, &cached_state);
	if (!found || delalloc_end <= *start) {
		*start = delalloc_start;
		*end = delalloc_end;
		free_extent_state(cached_state);
		return false;
	}

	/*
	 * start comes from the offset of locked_page.  We have to lock
	 * pages in order, so we can't process delalloc bytes before
	 * locked_page
	 */
	if (delalloc_start < *start)
		delalloc_start = *start;

	/*
	 * make sure to limit the number of pages we try to lock down
	 */
	if (delalloc_end + 1 - delalloc_start > max_bytes)
		delalloc_end = delalloc_start + max_bytes - 1;

	/* step two, lock all the pages after the page that has start */
	ret = lock_delalloc_pages(inode, locked_page,
				  delalloc_start, delalloc_end);
	ASSERT(!ret || ret == -EAGAIN);
	if (ret == -EAGAIN) {
		/* some of the pages are gone, lets avoid looping by
		 * shortening the size of the delalloc range we're searching
		 */
		free_extent_state(cached_state);
		cached_state = NULL;
		if (!loops) {
			max_bytes = PAGE_SIZE;
			loops = 1;
			goto again;
		} else {
			found = false;
			goto out_failed;
		}
	}

	/* step three, lock the state bits for the whole range */
	lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);

	/* then test to make sure it is all still delalloc */
	ret = test_range_bit(tree, delalloc_start, delalloc_end,
			     EXTENT_DELALLOC, 1, cached_state);
	if (!ret) {
		unlock_extent_cached(tree, delalloc_start, delalloc_end,
				     &cached_state);
		__unlock_for_delalloc(inode, locked_page,
				      delalloc_start, delalloc_end);
		cond_resched();
		goto again;
	}
	free_extent_state(cached_state);
	*start = delalloc_start;
	*end = delalloc_end;
out_failed:
	return found;
}

static int __process_pages_contig(struct address_space *mapping,
				  struct page *locked_page,
				  pgoff_t start_index, pgoff_t end_index,
				  unsigned long page_ops, pgoff_t *index_ret)
{
	unsigned long nr_pages = end_index - start_index + 1;
	unsigned long pages_locked = 0;
	pgoff_t index = start_index;
	struct page *pages[16];
	unsigned ret;
	int err = 0;
	int i;

	if (page_ops & PAGE_LOCK) {
		ASSERT(page_ops == PAGE_LOCK);
		ASSERT(index_ret && *index_ret == start_index);
	}

	if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
		mapping_set_error(mapping, -EIO);

	while (nr_pages > 0) {
		ret = find_get_pages_contig(mapping, index,
				     min_t(unsigned long,
				     nr_pages, ARRAY_SIZE(pages)), pages);
		if (ret == 0) {
			/*
			 * Only if we're going to lock these pages,
			 * can we find nothing at @index.
			 */
			ASSERT(page_ops & PAGE_LOCK);
			err = -EAGAIN;
			goto out;
		}

		for (i = 0; i < ret; i++) {
			if (page_ops & PAGE_SET_PRIVATE2)
				SetPagePrivate2(pages[i]);

			if (locked_page && pages[i] == locked_page) {
				put_page(pages[i]);
				pages_locked++;
				continue;
			}
			if (page_ops & PAGE_CLEAR_DIRTY)
				clear_page_dirty_for_io(pages[i]);
			if (page_ops & PAGE_SET_WRITEBACK)
				set_page_writeback(pages[i]);
			if (page_ops & PAGE_SET_ERROR)
				SetPageError(pages[i]);
			if (page_ops & PAGE_END_WRITEBACK)
				end_page_writeback(pages[i]);
			if (page_ops & PAGE_UNLOCK)
				unlock_page(pages[i]);
			if (page_ops & PAGE_LOCK) {
				lock_page(pages[i]);
				if (!PageDirty(pages[i]) ||
				    pages[i]->mapping != mapping) {
					unlock_page(pages[i]);
					put_page(pages[i]);
					err = -EAGAIN;
					goto out;
				}
			}
			put_page(pages[i]);
			pages_locked++;
		}
		nr_pages -= ret;
		index += ret;
		cond_resched();
	}
out:
	if (err && index_ret)
		*index_ret = start_index + pages_locked - 1;
	return err;
}

void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
				  struct page *locked_page,
				  unsigned clear_bits,
				  unsigned long page_ops)
{
	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0,
			 NULL);

	__process_pages_contig(inode->i_mapping, locked_page,
			       start >> PAGE_SHIFT, end >> PAGE_SHIFT,
			       page_ops, NULL);
}

/*
 * count the number of bytes in the tree that have a given bit(s)
 * set.  This can be fairly slow, except for EXTENT_DIRTY which is
 * cached.  The total number found is returned.
 */
u64 count_range_bits(struct extent_io_tree *tree,
		     u64 *start, u64 search_end, u64 max_bytes,
		     unsigned bits, int contig)
{
	struct rb_node *node;
	struct extent_state *state;
	u64 cur_start = *start;
	u64 total_bytes = 0;
	u64 last = 0;
	int found = 0;

	if (WARN_ON(search_end <= cur_start))
		return 0;

	spin_lock(&tree->lock);
	if (cur_start == 0 && bits == EXTENT_DIRTY) {
		total_bytes = tree->dirty_bytes;
		goto out;
	}
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, cur_start);
	if (!node)
		goto out;

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->start > search_end)
			break;
		if (contig && found && state->start > last + 1)
			break;
		if (state->end >= cur_start && (state->state & bits) == bits) {
			total_bytes += min(search_end, state->end) + 1 -
				       max(cur_start, state->start);
			if (total_bytes >= max_bytes)
				break;
			if (!found) {
				*start = max(cur_start, state->start);
				found = 1;
			}
			last = state->end;
		} else if (contig && found) {
			break;
		}
		node = rb_next(node);
		if (!node)
			break;
	}
out:
	spin_unlock(&tree->lock);
	return total_bytes;
}

/*
 * set the private field for a given byte offset in the tree.  If there isn't
 * an extent_state there already, this does nothing.
2026 */ 2027 int set_state_failrec(struct extent_io_tree *tree, u64 start, 2028 struct io_failure_record *failrec) 2029 { 2030 struct rb_node *node; 2031 struct extent_state *state; 2032 int ret = 0; 2033 2034 spin_lock(&tree->lock); 2035 /* 2036 * this search will find all the extents that end after 2037 * our range starts. 2038 */ 2039 node = tree_search(tree, start); 2040 if (!node) { 2041 ret = -ENOENT; 2042 goto out; 2043 } 2044 state = rb_entry(node, struct extent_state, rb_node); 2045 if (state->start != start) { 2046 ret = -ENOENT; 2047 goto out; 2048 } 2049 state->failrec = failrec; 2050 out: 2051 spin_unlock(&tree->lock); 2052 return ret; 2053 } 2054 2055 int get_state_failrec(struct extent_io_tree *tree, u64 start, 2056 struct io_failure_record **failrec) 2057 { 2058 struct rb_node *node; 2059 struct extent_state *state; 2060 int ret = 0; 2061 2062 spin_lock(&tree->lock); 2063 /* 2064 * this search will find all the extents that end after 2065 * our range starts. 2066 */ 2067 node = tree_search(tree, start); 2068 if (!node) { 2069 ret = -ENOENT; 2070 goto out; 2071 } 2072 state = rb_entry(node, struct extent_state, rb_node); 2073 if (state->start != start) { 2074 ret = -ENOENT; 2075 goto out; 2076 } 2077 *failrec = state->failrec; 2078 out: 2079 spin_unlock(&tree->lock); 2080 return ret; 2081 } 2082 2083 /* 2084 * searches a range in the state tree for a given mask. 2085 * If 'filled' == 1, this returns 1 only if every extent in the tree 2086 * has the bits set. Otherwise, 1 is returned if any bit in the 2087 * range is found set. 2088 */ 2089 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 2090 unsigned bits, int filled, struct extent_state *cached) 2091 { 2092 struct extent_state *state = NULL; 2093 struct rb_node *node; 2094 int bitset = 0; 2095 2096 spin_lock(&tree->lock); 2097 if (cached && extent_state_in_tree(cached) && cached->start <= start && 2098 cached->end > start) 2099 node = &cached->rb_node; 2100 else 2101 node = tree_search(tree, start); 2102 while (node && start <= end) { 2103 state = rb_entry(node, struct extent_state, rb_node); 2104 2105 if (filled && state->start > start) { 2106 bitset = 0; 2107 break; 2108 } 2109 2110 if (state->start > end) 2111 break; 2112 2113 if (state->state & bits) { 2114 bitset = 1; 2115 if (!filled) 2116 break; 2117 } else if (filled) { 2118 bitset = 0; 2119 break; 2120 } 2121 2122 if (state->end == (u64)-1) 2123 break; 2124 2125 start = state->end + 1; 2126 if (start > end) 2127 break; 2128 node = rb_next(node); 2129 if (!node) { 2130 if (filled) 2131 bitset = 0; 2132 break; 2133 } 2134 } 2135 spin_unlock(&tree->lock); 2136 return bitset; 2137 } 2138 2139 /* 2140 * helper function to set a given page up to date if all the 2141 * extents in the tree for that page are up to date 2142 */ 2143 static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) 2144 { 2145 u64 start = page_offset(page); 2146 u64 end = start + PAGE_SIZE - 1; 2147 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) 2148 SetPageUptodate(page); 2149 } 2150 2151 int free_io_failure(struct extent_io_tree *failure_tree, 2152 struct extent_io_tree *io_tree, 2153 struct io_failure_record *rec) 2154 { 2155 int ret; 2156 int err = 0; 2157 2158 set_state_failrec(failure_tree, rec->start, NULL); 2159 ret = clear_extent_bits(failure_tree, rec->start, 2160 rec->start + rec->len - 1, 2161 EXTENT_LOCKED | EXTENT_DIRTY); 2162 if (ret) 2163 err = ret; 2164 2165 ret = clear_extent_bits(io_tree, rec->start, 2166 rec->start + 
rec->len - 1, 2167 EXTENT_DAMAGED); 2168 if (ret && !err) 2169 err = ret; 2170 2171 kfree(rec); 2172 return err; 2173 } 2174 2175 /* 2176 * this bypasses the standard btrfs submit functions deliberately, as 2177 * the standard behavior is to write all copies in a raid setup. here we only 2178 * want to write the one bad copy. so we do the mapping for ourselves and issue 2179 * submit_bio directly. 2180 * to avoid any synchronization issues, wait for the data after writing, which 2181 * actually prevents the read that triggered the error from finishing. 2182 * currently, there can be no more than two copies of every data bit. thus, 2183 * exactly one rewrite is required. 2184 */ 2185 int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, 2186 u64 length, u64 logical, struct page *page, 2187 unsigned int pg_offset, int mirror_num) 2188 { 2189 struct bio *bio; 2190 struct btrfs_device *dev; 2191 u64 map_length = 0; 2192 u64 sector; 2193 struct btrfs_bio *bbio = NULL; 2194 int ret; 2195 2196 ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); 2197 BUG_ON(!mirror_num); 2198 2199 bio = btrfs_io_bio_alloc(1); 2200 bio->bi_iter.bi_size = 0; 2201 map_length = length; 2202 2203 /* 2204 * Avoid races with device replace and make sure our bbio has devices 2205 * associated to its stripes that don't go away while we are doing the 2206 * read repair operation. 2207 */ 2208 btrfs_bio_counter_inc_blocked(fs_info); 2209 if (btrfs_is_parity_mirror(fs_info, logical, length)) { 2210 /* 2211 * Note that we don't use BTRFS_MAP_WRITE because it's supposed 2212 * to update all raid stripes, but here we just want to correct 2213 * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad 2214 * stripe's dev and sector. 2215 */ 2216 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, 2217 &map_length, &bbio, 0); 2218 if (ret) { 2219 btrfs_bio_counter_dec(fs_info); 2220 bio_put(bio); 2221 return -EIO; 2222 } 2223 ASSERT(bbio->mirror_num == 1); 2224 } else { 2225 ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, 2226 &map_length, &bbio, mirror_num); 2227 if (ret) { 2228 btrfs_bio_counter_dec(fs_info); 2229 bio_put(bio); 2230 return -EIO; 2231 } 2232 BUG_ON(mirror_num != bbio->mirror_num); 2233 } 2234 2235 sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9; 2236 bio->bi_iter.bi_sector = sector; 2237 dev = bbio->stripes[bbio->mirror_num - 1].dev; 2238 btrfs_put_bbio(bbio); 2239 if (!dev || !dev->bdev || 2240 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { 2241 btrfs_bio_counter_dec(fs_info); 2242 bio_put(bio); 2243 return -EIO; 2244 } 2245 bio_set_dev(bio, dev->bdev); 2246 bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; 2247 bio_add_page(bio, page, length, pg_offset); 2248 2249 if (btrfsic_submit_bio_wait(bio)) { 2250 /* try to remap that extent elsewhere? 
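 * For now we don't: the code below only bumps the device write error
 * statistic via btrfs_dev_stat_inc_and_print() and returns -EIO,
 * leaving any remapping to the layers above us.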
*/ 2251 btrfs_bio_counter_dec(fs_info); 2252 bio_put(bio); 2253 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); 2254 return -EIO; 2255 } 2256 2257 btrfs_info_rl_in_rcu(fs_info, 2258 "read error corrected: ino %llu off %llu (dev %s sector %llu)", 2259 ino, start, 2260 rcu_str_deref(dev->name), sector); 2261 btrfs_bio_counter_dec(fs_info); 2262 bio_put(bio); 2263 return 0; 2264 } 2265 2266 int btrfs_repair_eb_io_failure(struct extent_buffer *eb, int mirror_num) 2267 { 2268 struct btrfs_fs_info *fs_info = eb->fs_info; 2269 u64 start = eb->start; 2270 int i, num_pages = num_extent_pages(eb); 2271 int ret = 0; 2272 2273 if (sb_rdonly(fs_info->sb)) 2274 return -EROFS; 2275 2276 for (i = 0; i < num_pages; i++) { 2277 struct page *p = eb->pages[i]; 2278 2279 ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p, 2280 start - page_offset(p), mirror_num); 2281 if (ret) 2282 break; 2283 start += PAGE_SIZE; 2284 } 2285 2286 return ret; 2287 } 2288 2289 /* 2290 * each time an IO finishes, we do a fast check in the IO failure tree 2291 * to see if we need to process or clean up an io_failure_record 2292 */ 2293 int clean_io_failure(struct btrfs_fs_info *fs_info, 2294 struct extent_io_tree *failure_tree, 2295 struct extent_io_tree *io_tree, u64 start, 2296 struct page *page, u64 ino, unsigned int pg_offset) 2297 { 2298 u64 private; 2299 struct io_failure_record *failrec; 2300 struct extent_state *state; 2301 int num_copies; 2302 int ret; 2303 2304 private = 0; 2305 ret = count_range_bits(failure_tree, &private, (u64)-1, 1, 2306 EXTENT_DIRTY, 0); 2307 if (!ret) 2308 return 0; 2309 2310 ret = get_state_failrec(failure_tree, start, &failrec); 2311 if (ret) 2312 return 0; 2313 2314 BUG_ON(!failrec->this_mirror); 2315 2316 if (failrec->in_validation) { 2317 /* there was no real error, just free the record */ 2318 btrfs_debug(fs_info, 2319 "clean_io_failure: freeing dummy error at %llu", 2320 failrec->start); 2321 goto out; 2322 } 2323 if (sb_rdonly(fs_info->sb)) 2324 goto out; 2325 2326 spin_lock(&io_tree->lock); 2327 state = find_first_extent_bit_state(io_tree, 2328 failrec->start, 2329 EXTENT_LOCKED); 2330 spin_unlock(&io_tree->lock); 2331 2332 if (state && state->start <= failrec->start && 2333 state->end >= failrec->start + failrec->len - 1) { 2334 num_copies = btrfs_num_copies(fs_info, failrec->logical, 2335 failrec->len); 2336 if (num_copies > 1) { 2337 repair_io_failure(fs_info, ino, start, failrec->len, 2338 failrec->logical, page, pg_offset, 2339 failrec->failed_mirror); 2340 } 2341 } 2342 2343 out: 2344 free_io_failure(failure_tree, io_tree, failrec); 2345 2346 return 0; 2347 } 2348 2349 /* 2350 * Can be called when 2351 * - hold extent lock 2352 * - under ordered extent 2353 * - the inode is freeing 2354 */ 2355 void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end) 2356 { 2357 struct extent_io_tree *failure_tree = &inode->io_failure_tree; 2358 struct io_failure_record *failrec; 2359 struct extent_state *state, *next; 2360 2361 if (RB_EMPTY_ROOT(&failure_tree->state)) 2362 return; 2363 2364 spin_lock(&failure_tree->lock); 2365 state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY); 2366 while (state) { 2367 if (state->start > end) 2368 break; 2369 2370 ASSERT(state->end <= end); 2371 2372 next = next_state(state); 2373 2374 failrec = state->failrec; 2375 free_extent_state(state); 2376 kfree(failrec); 2377 2378 state = next; 2379 } 2380 spin_unlock(&failure_tree->lock); 2381 } 2382 2383 int btrfs_get_io_failure_record(struct inode 
*inode, u64 start, u64 end, 2384 struct io_failure_record **failrec_ret) 2385 { 2386 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2387 struct io_failure_record *failrec; 2388 struct extent_map *em; 2389 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 2390 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 2391 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 2392 int ret; 2393 u64 logical; 2394 2395 ret = get_state_failrec(failure_tree, start, &failrec); 2396 if (ret) { 2397 failrec = kzalloc(sizeof(*failrec), GFP_NOFS); 2398 if (!failrec) 2399 return -ENOMEM; 2400 2401 failrec->start = start; 2402 failrec->len = end - start + 1; 2403 failrec->this_mirror = 0; 2404 failrec->bio_flags = 0; 2405 failrec->in_validation = 0; 2406 2407 read_lock(&em_tree->lock); 2408 em = lookup_extent_mapping(em_tree, start, failrec->len); 2409 if (!em) { 2410 read_unlock(&em_tree->lock); 2411 kfree(failrec); 2412 return -EIO; 2413 } 2414 2415 if (em->start > start || em->start + em->len <= start) { 2416 free_extent_map(em); 2417 em = NULL; 2418 } 2419 read_unlock(&em_tree->lock); 2420 if (!em) { 2421 kfree(failrec); 2422 return -EIO; 2423 } 2424 2425 logical = start - em->start; 2426 logical = em->block_start + logical; 2427 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 2428 logical = em->block_start; 2429 failrec->bio_flags = EXTENT_BIO_COMPRESSED; 2430 extent_set_compress_type(&failrec->bio_flags, 2431 em->compress_type); 2432 } 2433 2434 btrfs_debug(fs_info, 2435 "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu", 2436 logical, start, failrec->len); 2437 2438 failrec->logical = logical; 2439 free_extent_map(em); 2440 2441 /* set the bits in the private failure tree */ 2442 ret = set_extent_bits(failure_tree, start, end, 2443 EXTENT_LOCKED | EXTENT_DIRTY); 2444 if (ret >= 0) 2445 ret = set_state_failrec(failure_tree, start, failrec); 2446 /* set the bits in the inode's tree */ 2447 if (ret >= 0) 2448 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED); 2449 if (ret < 0) { 2450 kfree(failrec); 2451 return ret; 2452 } 2453 } else { 2454 btrfs_debug(fs_info, 2455 "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d", 2456 failrec->logical, failrec->start, failrec->len, 2457 failrec->in_validation); 2458 /* 2459 * when data can be on disk more than twice, add to failrec here 2460 * (e.g. with a list for failed_mirror) to make 2461 * clean_io_failure() clean all those errors at once. 2462 */ 2463 } 2464 2465 *failrec_ret = failrec; 2466 2467 return 0; 2468 } 2469 2470 bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages, 2471 struct io_failure_record *failrec, int failed_mirror) 2472 { 2473 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2474 int num_copies; 2475 2476 num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len); 2477 if (num_copies == 1) { 2478 /* 2479 * we only have a single copy of the data, so don't bother with 2480 * all the retry and error correction code that follows. no 2481 * matter what the error is, it is very likely to persist. 
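 * (Contrast with the num_copies > 1 case handled below: e.g. with
 * num_copies == 2 and failed_mirror == 1 the retry logic picks
 * this_mirror == 2, and we give up as soon as this_mirror would
 * exceed num_copies.)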
2482 */ 2483 btrfs_debug(fs_info, 2484 "Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d", 2485 num_copies, failrec->this_mirror, failed_mirror); 2486 return false; 2487 } 2488 2489 /* 2490 * there are two premises: 2491 * a) deliver good data to the caller 2492 * b) correct the bad sectors on disk 2493 */ 2494 if (failed_bio_pages > 1) { 2495 /* 2496 * to fulfill b), we need to know the exact failing sectors, as 2497 * we don't want to rewrite any more than the failed ones. thus, 2498 * we need separate read requests for the failed bio 2499 * 2500 * if the following BUG_ON triggers, our validation request got 2501 * merged. we need separate requests for our algorithm to work. 2502 */ 2503 BUG_ON(failrec->in_validation); 2504 failrec->in_validation = 1; 2505 failrec->this_mirror = failed_mirror; 2506 } else { 2507 /* 2508 * we're ready to fulfill a) and b) alongside. get a good copy 2509 * of the failed sector and if we succeed, we have setup 2510 * everything for repair_io_failure to do the rest for us. 2511 */ 2512 if (failrec->in_validation) { 2513 BUG_ON(failrec->this_mirror != failed_mirror); 2514 failrec->in_validation = 0; 2515 failrec->this_mirror = 0; 2516 } 2517 failrec->failed_mirror = failed_mirror; 2518 failrec->this_mirror++; 2519 if (failrec->this_mirror == failed_mirror) 2520 failrec->this_mirror++; 2521 } 2522 2523 if (failrec->this_mirror > num_copies) { 2524 btrfs_debug(fs_info, 2525 "Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d", 2526 num_copies, failrec->this_mirror, failed_mirror); 2527 return false; 2528 } 2529 2530 return true; 2531 } 2532 2533 2534 struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, 2535 struct io_failure_record *failrec, 2536 struct page *page, int pg_offset, int icsum, 2537 bio_end_io_t *endio_func, void *data) 2538 { 2539 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2540 struct bio *bio; 2541 struct btrfs_io_bio *btrfs_failed_bio; 2542 struct btrfs_io_bio *btrfs_bio; 2543 2544 bio = btrfs_io_bio_alloc(1); 2545 bio->bi_end_io = endio_func; 2546 bio->bi_iter.bi_sector = failrec->logical >> 9; 2547 bio->bi_iter.bi_size = 0; 2548 bio->bi_private = data; 2549 2550 btrfs_failed_bio = btrfs_io_bio(failed_bio); 2551 if (btrfs_failed_bio->csum) { 2552 u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); 2553 2554 btrfs_bio = btrfs_io_bio(bio); 2555 btrfs_bio->csum = btrfs_bio->csum_inline; 2556 icsum *= csum_size; 2557 memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum, 2558 csum_size); 2559 } 2560 2561 bio_add_page(bio, page, failrec->len, pg_offset); 2562 2563 return bio; 2564 } 2565 2566 /* 2567 * This is a generic handler for readpage errors. If other copies exist, read 2568 * those and write back good data to the failed position. 
Does not investigate 2569 * in remapping the failed extent elsewhere, hoping the device will be smart 2570 * enough to do this as needed 2571 */ 2572 static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, 2573 struct page *page, u64 start, u64 end, 2574 int failed_mirror) 2575 { 2576 struct io_failure_record *failrec; 2577 struct inode *inode = page->mapping->host; 2578 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 2579 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 2580 struct bio *bio; 2581 int read_mode = 0; 2582 blk_status_t status; 2583 int ret; 2584 unsigned failed_bio_pages = failed_bio->bi_iter.bi_size >> PAGE_SHIFT; 2585 2586 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); 2587 2588 ret = btrfs_get_io_failure_record(inode, start, end, &failrec); 2589 if (ret) 2590 return ret; 2591 2592 if (!btrfs_check_repairable(inode, failed_bio_pages, failrec, 2593 failed_mirror)) { 2594 free_io_failure(failure_tree, tree, failrec); 2595 return -EIO; 2596 } 2597 2598 if (failed_bio_pages > 1) 2599 read_mode |= REQ_FAILFAST_DEV; 2600 2601 phy_offset >>= inode->i_sb->s_blocksize_bits; 2602 bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, 2603 start - page_offset(page), 2604 (int)phy_offset, failed_bio->bi_end_io, 2605 NULL); 2606 bio->bi_opf = REQ_OP_READ | read_mode; 2607 2608 btrfs_debug(btrfs_sb(inode->i_sb), 2609 "Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d", 2610 read_mode, failrec->this_mirror, failrec->in_validation); 2611 2612 status = tree->ops->submit_bio_hook(tree->private_data, bio, failrec->this_mirror, 2613 failrec->bio_flags); 2614 if (status) { 2615 free_io_failure(failure_tree, tree, failrec); 2616 bio_put(bio); 2617 ret = blk_status_to_errno(status); 2618 } 2619 2620 return ret; 2621 } 2622 2623 /* lots and lots of room for performance fixes in the end_bio funcs */ 2624 2625 void end_extent_writepage(struct page *page, int err, u64 start, u64 end) 2626 { 2627 int uptodate = (err == 0); 2628 int ret = 0; 2629 2630 btrfs_writepage_endio_finish_ordered(page, start, end, uptodate); 2631 2632 if (!uptodate) { 2633 ClearPageUptodate(page); 2634 SetPageError(page); 2635 ret = err < 0 ? err : -EIO; 2636 mapping_set_error(page->mapping, ret); 2637 } 2638 } 2639 2640 /* 2641 * after a writepage IO is done, we need to: 2642 * clear the uptodate bits on error 2643 * clear the writeback bits in the extent tree for this IO 2644 * end_page_writeback if the page has no more pending IO 2645 * 2646 * Scheduling is not allowed, so the extent state tree is expected 2647 * to have one and only one object corresponding to this IO. 2648 */ 2649 static void end_bio_extent_writepage(struct bio *bio) 2650 { 2651 int error = blk_status_to_errno(bio->bi_status); 2652 struct bio_vec *bvec; 2653 u64 start; 2654 u64 end; 2655 struct bvec_iter_all iter_all; 2656 2657 ASSERT(!bio_flagged(bio, BIO_CLONED)); 2658 bio_for_each_segment_all(bvec, bio, iter_all) { 2659 struct page *page = bvec->bv_page; 2660 struct inode *inode = page->mapping->host; 2661 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2662 2663 /* We always issue full-page reads, but if some block 2664 * in a page fails to read, blk_update_request() will 2665 * advance bv_offset and adjust bv_len to compensate. 2666 * Print a warning for nonzero offsets, and an error 2667 * if they don't add up to a full page. 
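 * E.g. on a 4K page: bv_offset 512 with bv_len 3584 still ends at
 * PAGE_SIZE and only triggers the "incomplete page write" info message,
 * while bv_offset 0 with bv_len 2048 does not add up to a full page and
 * is reported as a "partial page write" error.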
*/ 2668 if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) { 2669 if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE) 2670 btrfs_err(fs_info, 2671 "partial page write in btrfs with offset %u and length %u", 2672 bvec->bv_offset, bvec->bv_len); 2673 else 2674 btrfs_info(fs_info, 2675 "incomplete page write in btrfs with offset %u and length %u", 2676 bvec->bv_offset, bvec->bv_len); 2677 } 2678 2679 start = page_offset(page); 2680 end = start + bvec->bv_offset + bvec->bv_len - 1; 2681 2682 end_extent_writepage(page, error, start, end); 2683 end_page_writeback(page); 2684 } 2685 2686 bio_put(bio); 2687 } 2688 2689 static void 2690 endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len, 2691 int uptodate) 2692 { 2693 struct extent_state *cached = NULL; 2694 u64 end = start + len - 1; 2695 2696 if (uptodate && tree->track_uptodate) 2697 set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC); 2698 unlock_extent_cached_atomic(tree, start, end, &cached); 2699 } 2700 2701 /* 2702 * after a readpage IO is done, we need to: 2703 * clear the uptodate bits on error 2704 * set the uptodate bits if things worked 2705 * set the page up to date if all extents in the tree are uptodate 2706 * clear the lock bit in the extent tree 2707 * unlock the page if there are no other extents locked for it 2708 * 2709 * Scheduling is not allowed, so the extent state tree is expected 2710 * to have one and only one object corresponding to this IO. 2711 */ 2712 static void end_bio_extent_readpage(struct bio *bio) 2713 { 2714 struct bio_vec *bvec; 2715 int uptodate = !bio->bi_status; 2716 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 2717 struct extent_io_tree *tree, *failure_tree; 2718 u64 offset = 0; 2719 u64 start; 2720 u64 end; 2721 u64 len; 2722 u64 extent_start = 0; 2723 u64 extent_len = 0; 2724 int mirror; 2725 int ret; 2726 struct bvec_iter_all iter_all; 2727 2728 ASSERT(!bio_flagged(bio, BIO_CLONED)); 2729 bio_for_each_segment_all(bvec, bio, iter_all) { 2730 struct page *page = bvec->bv_page; 2731 struct inode *inode = page->mapping->host; 2732 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2733 bool data_inode = btrfs_ino(BTRFS_I(inode)) 2734 != BTRFS_BTREE_INODE_OBJECTID; 2735 2736 btrfs_debug(fs_info, 2737 "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", 2738 (u64)bio->bi_iter.bi_sector, bio->bi_status, 2739 io_bio->mirror_num); 2740 tree = &BTRFS_I(inode)->io_tree; 2741 failure_tree = &BTRFS_I(inode)->io_failure_tree; 2742 2743 /* We always issue full-page reads, but if some block 2744 * in a page fails to read, blk_update_request() will 2745 * advance bv_offset and adjust bv_len to compensate. 2746 * Print a warning for nonzero offsets, and an error 2747 * if they don't add up to a full page. 
*/ 2748 if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) { 2749 if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE) 2750 btrfs_err(fs_info, 2751 "partial page read in btrfs with offset %u and length %u", 2752 bvec->bv_offset, bvec->bv_len); 2753 else 2754 btrfs_info(fs_info, 2755 "incomplete page read in btrfs with offset %u and length %u", 2756 bvec->bv_offset, bvec->bv_len); 2757 } 2758 2759 start = page_offset(page); 2760 end = start + bvec->bv_offset + bvec->bv_len - 1; 2761 len = bvec->bv_len; 2762 2763 mirror = io_bio->mirror_num; 2764 if (likely(uptodate)) { 2765 ret = tree->ops->readpage_end_io_hook(io_bio, offset, 2766 page, start, end, 2767 mirror); 2768 if (ret) 2769 uptodate = 0; 2770 else 2771 clean_io_failure(BTRFS_I(inode)->root->fs_info, 2772 failure_tree, tree, start, 2773 page, 2774 btrfs_ino(BTRFS_I(inode)), 0); 2775 } 2776 2777 if (likely(uptodate)) 2778 goto readpage_ok; 2779 2780 if (data_inode) { 2781 2782 /* 2783 * The generic bio_readpage_error handles errors the 2784 * following way: If possible, new read requests are 2785 * created and submitted and will end up in 2786 * end_bio_extent_readpage as well (if we're lucky, 2787 * not in the !uptodate case). In that case it returns 2788 * 0 and we just go on with the next page in our bio. 2789 * If it can't handle the error it will return -EIO and 2790 * we remain responsible for that page. 2791 */ 2792 ret = bio_readpage_error(bio, offset, page, start, end, 2793 mirror); 2794 if (ret == 0) { 2795 uptodate = !bio->bi_status; 2796 offset += len; 2797 continue; 2798 } 2799 } else { 2800 struct extent_buffer *eb; 2801 2802 eb = (struct extent_buffer *)page->private; 2803 set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); 2804 eb->read_mirror = mirror; 2805 atomic_dec(&eb->io_pages); 2806 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, 2807 &eb->bflags)) 2808 btree_readahead_hook(eb, -EIO); 2809 } 2810 readpage_ok: 2811 if (likely(uptodate)) { 2812 loff_t i_size = i_size_read(inode); 2813 pgoff_t end_index = i_size >> PAGE_SHIFT; 2814 unsigned off; 2815 2816 /* Zero out the end if this page straddles i_size */ 2817 off = offset_in_page(i_size); 2818 if (page->index == end_index && off) 2819 zero_user_segment(page, off, PAGE_SIZE); 2820 SetPageUptodate(page); 2821 } else { 2822 ClearPageUptodate(page); 2823 SetPageError(page); 2824 } 2825 unlock_page(page); 2826 offset += len; 2827 2828 if (unlikely(!uptodate)) { 2829 if (extent_len) { 2830 endio_readpage_release_extent(tree, 2831 extent_start, 2832 extent_len, 1); 2833 extent_start = 0; 2834 extent_len = 0; 2835 } 2836 endio_readpage_release_extent(tree, start, 2837 end - start + 1, 0); 2838 } else if (!extent_len) { 2839 extent_start = start; 2840 extent_len = end + 1 - start; 2841 } else if (extent_start + extent_len == start) { 2842 extent_len += end + 1 - start; 2843 } else { 2844 endio_readpage_release_extent(tree, extent_start, 2845 extent_len, uptodate); 2846 extent_start = start; 2847 extent_len = end + 1 - start; 2848 } 2849 } 2850 2851 if (extent_len) 2852 endio_readpage_release_extent(tree, extent_start, extent_len, 2853 uptodate); 2854 btrfs_io_bio_free_csum(io_bio); 2855 bio_put(bio); 2856 } 2857 2858 /* 2859 * Initialize the members up to but not including 'bio'. Use after allocating a 2860 * new bio by bio_alloc_bioset as it does not initialize the bytes outside of 2861 * 'bio' because use of __GFP_ZERO is not supported. 
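 * In other words the memset() below clears only the btrfs-private
 * members placed before the embedded struct bio (csum, csum_inline,
 * iter, ...), i.e. offsetof(struct btrfs_io_bio, bio) bytes; the
 * embedded bio itself is initialized by the bioset allocation.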
2862 */ 2863 static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio) 2864 { 2865 memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio)); 2866 } 2867 2868 /* 2869 * The following helpers allocate a bio. As it's backed by a bioset, it'll 2870 * never fail. We're returning a bio right now but you can call btrfs_io_bio 2871 * for the appropriate container_of magic 2872 */ 2873 struct bio *btrfs_bio_alloc(u64 first_byte) 2874 { 2875 struct bio *bio; 2876 2877 bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset); 2878 bio->bi_iter.bi_sector = first_byte >> 9; 2879 btrfs_io_bio_init(btrfs_io_bio(bio)); 2880 return bio; 2881 } 2882 2883 struct bio *btrfs_bio_clone(struct bio *bio) 2884 { 2885 struct btrfs_io_bio *btrfs_bio; 2886 struct bio *new; 2887 2888 /* Bio allocation backed by a bioset does not fail */ 2889 new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset); 2890 btrfs_bio = btrfs_io_bio(new); 2891 btrfs_io_bio_init(btrfs_bio); 2892 btrfs_bio->iter = bio->bi_iter; 2893 return new; 2894 } 2895 2896 struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs) 2897 { 2898 struct bio *bio; 2899 2900 /* Bio allocation backed by a bioset does not fail */ 2901 bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset); 2902 btrfs_io_bio_init(btrfs_io_bio(bio)); 2903 return bio; 2904 } 2905 2906 struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) 2907 { 2908 struct bio *bio; 2909 struct btrfs_io_bio *btrfs_bio; 2910 2911 /* this will never fail when it's backed by a bioset */ 2912 bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset); 2913 ASSERT(bio); 2914 2915 btrfs_bio = btrfs_io_bio(bio); 2916 btrfs_io_bio_init(btrfs_bio); 2917 2918 bio_trim(bio, offset >> 9, size >> 9); 2919 btrfs_bio->iter = bio->bi_iter; 2920 return bio; 2921 } 2922 2923 /* 2924 * @opf: bio REQ_OP_* and REQ_* flags as one value 2925 * @tree: tree so we can call our merge_bio hook 2926 * @wbc: optional writeback control for io accounting 2927 * @page: page to add to the bio 2928 * @pg_offset: offset of the new bio or to check whether we are adding 2929 * a contiguous page to the previous one 2930 * @size: portion of page that we want to write 2931 * @offset: starting offset in the page 2932 * @bio_ret: must be valid pointer, newly allocated bio will be stored there 2933 * @end_io_func: end_io callback for new bio 2934 * @mirror_num: desired mirror to read/write 2935 * @prev_bio_flags: flags of previous bio to see if we can merge the current one 2936 * @bio_flags: flags of the current bio to see if we can merge them 2937 */ 2938 static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, 2939 struct writeback_control *wbc, 2940 struct page *page, u64 offset, 2941 size_t size, unsigned long pg_offset, 2942 struct bio **bio_ret, 2943 bio_end_io_t end_io_func, 2944 int mirror_num, 2945 unsigned long prev_bio_flags, 2946 unsigned long bio_flags, 2947 bool force_bio_submit) 2948 { 2949 int ret = 0; 2950 struct bio *bio; 2951 size_t page_size = min_t(size_t, size, PAGE_SIZE); 2952 sector_t sector = offset >> 9; 2953 2954 ASSERT(bio_ret); 2955 2956 if (*bio_ret) { 2957 bool contig; 2958 bool can_merge = true; 2959 2960 bio = *bio_ret; 2961 if (prev_bio_flags & EXTENT_BIO_COMPRESSED) 2962 contig = bio->bi_iter.bi_sector == sector; 2963 else 2964 contig = bio_end_sector(bio) == sector; 2965 2966 ASSERT(tree->ops); 2967 if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags)) 2968 can_merge = false; 2969 2970 if (prev_bio_flags != bio_flags || !contig || !can_merge || 2971 
force_bio_submit || 2972 bio_add_page(bio, page, page_size, pg_offset) < page_size) { 2973 ret = submit_one_bio(bio, mirror_num, prev_bio_flags); 2974 if (ret < 0) { 2975 *bio_ret = NULL; 2976 return ret; 2977 } 2978 bio = NULL; 2979 } else { 2980 if (wbc) 2981 wbc_account_cgroup_owner(wbc, page, page_size); 2982 return 0; 2983 } 2984 } 2985 2986 bio = btrfs_bio_alloc(offset); 2987 bio_add_page(bio, page, page_size, pg_offset); 2988 bio->bi_end_io = end_io_func; 2989 bio->bi_private = tree; 2990 bio->bi_write_hint = page->mapping->host->i_write_hint; 2991 bio->bi_opf = opf; 2992 if (wbc) { 2993 struct block_device *bdev; 2994 2995 bdev = BTRFS_I(page->mapping->host)->root->fs_info->fs_devices->latest_bdev; 2996 bio_set_dev(bio, bdev); 2997 wbc_init_bio(wbc, bio); 2998 wbc_account_cgroup_owner(wbc, page, page_size); 2999 } 3000 3001 *bio_ret = bio; 3002 3003 return ret; 3004 } 3005 3006 static void attach_extent_buffer_page(struct extent_buffer *eb, 3007 struct page *page) 3008 { 3009 if (!PagePrivate(page)) { 3010 SetPagePrivate(page); 3011 get_page(page); 3012 set_page_private(page, (unsigned long)eb); 3013 } else { 3014 WARN_ON(page->private != (unsigned long)eb); 3015 } 3016 } 3017 3018 void set_page_extent_mapped(struct page *page) 3019 { 3020 if (!PagePrivate(page)) { 3021 SetPagePrivate(page); 3022 get_page(page); 3023 set_page_private(page, EXTENT_PAGE_PRIVATE); 3024 } 3025 } 3026 3027 static struct extent_map * 3028 __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, 3029 u64 start, u64 len, get_extent_t *get_extent, 3030 struct extent_map **em_cached) 3031 { 3032 struct extent_map *em; 3033 3034 if (em_cached && *em_cached) { 3035 em = *em_cached; 3036 if (extent_map_in_tree(em) && start >= em->start && 3037 start < extent_map_end(em)) { 3038 refcount_inc(&em->refs); 3039 return em; 3040 } 3041 3042 free_extent_map(em); 3043 *em_cached = NULL; 3044 } 3045 3046 em = get_extent(BTRFS_I(inode), page, pg_offset, start, len, 0); 3047 if (em_cached && !IS_ERR_OR_NULL(em)) { 3048 BUG_ON(*em_cached); 3049 refcount_inc(&em->refs); 3050 *em_cached = em; 3051 } 3052 return em; 3053 } 3054 /* 3055 * basic readpage implementation. 
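 * Roughly: for each blocksize-aligned chunk of the page we look up (or
 * reuse) an extent map and then either zero a hole, skip a range that
 * is already EXTENT_UPTODATE, error out on an inline extent, or queue a
 * read via submit_extent_page().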
Locked extent state structs are inserted 3056 * into the tree that are removed when the IO is done (by the end_io 3057 * handlers) 3058 * XXX JDM: This needs looking at to ensure proper page locking 3059 * return 0 on success, otherwise return error 3060 */ 3061 static int __do_readpage(struct extent_io_tree *tree, 3062 struct page *page, 3063 get_extent_t *get_extent, 3064 struct extent_map **em_cached, 3065 struct bio **bio, int mirror_num, 3066 unsigned long *bio_flags, unsigned int read_flags, 3067 u64 *prev_em_start) 3068 { 3069 struct inode *inode = page->mapping->host; 3070 u64 start = page_offset(page); 3071 const u64 end = start + PAGE_SIZE - 1; 3072 u64 cur = start; 3073 u64 extent_offset; 3074 u64 last_byte = i_size_read(inode); 3075 u64 block_start; 3076 u64 cur_end; 3077 struct extent_map *em; 3078 int ret = 0; 3079 int nr = 0; 3080 size_t pg_offset = 0; 3081 size_t iosize; 3082 size_t disk_io_size; 3083 size_t blocksize = inode->i_sb->s_blocksize; 3084 unsigned long this_bio_flag = 0; 3085 3086 set_page_extent_mapped(page); 3087 3088 if (!PageUptodate(page)) { 3089 if (cleancache_get_page(page) == 0) { 3090 BUG_ON(blocksize != PAGE_SIZE); 3091 unlock_extent(tree, start, end); 3092 goto out; 3093 } 3094 } 3095 3096 if (page->index == last_byte >> PAGE_SHIFT) { 3097 char *userpage; 3098 size_t zero_offset = offset_in_page(last_byte); 3099 3100 if (zero_offset) { 3101 iosize = PAGE_SIZE - zero_offset; 3102 userpage = kmap_atomic(page); 3103 memset(userpage + zero_offset, 0, iosize); 3104 flush_dcache_page(page); 3105 kunmap_atomic(userpage); 3106 } 3107 } 3108 while (cur <= end) { 3109 bool force_bio_submit = false; 3110 u64 offset; 3111 3112 if (cur >= last_byte) { 3113 char *userpage; 3114 struct extent_state *cached = NULL; 3115 3116 iosize = PAGE_SIZE - pg_offset; 3117 userpage = kmap_atomic(page); 3118 memset(userpage + pg_offset, 0, iosize); 3119 flush_dcache_page(page); 3120 kunmap_atomic(userpage); 3121 set_extent_uptodate(tree, cur, cur + iosize - 1, 3122 &cached, GFP_NOFS); 3123 unlock_extent_cached(tree, cur, 3124 cur + iosize - 1, &cached); 3125 break; 3126 } 3127 em = __get_extent_map(inode, page, pg_offset, cur, 3128 end - cur + 1, get_extent, em_cached); 3129 if (IS_ERR_OR_NULL(em)) { 3130 SetPageError(page); 3131 unlock_extent(tree, cur, end); 3132 break; 3133 } 3134 extent_offset = cur - em->start; 3135 BUG_ON(extent_map_end(em) <= cur); 3136 BUG_ON(end < cur); 3137 3138 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 3139 this_bio_flag |= EXTENT_BIO_COMPRESSED; 3140 extent_set_compress_type(&this_bio_flag, 3141 em->compress_type); 3142 } 3143 3144 iosize = min(extent_map_end(em) - cur, end - cur + 1); 3145 cur_end = min(extent_map_end(em) - 1, end); 3146 iosize = ALIGN(iosize, blocksize); 3147 if (this_bio_flag & EXTENT_BIO_COMPRESSED) { 3148 disk_io_size = em->block_len; 3149 offset = em->block_start; 3150 } else { 3151 offset = em->block_start + extent_offset; 3152 disk_io_size = iosize; 3153 } 3154 block_start = em->block_start; 3155 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 3156 block_start = EXTENT_MAP_HOLE; 3157 3158 /* 3159 * If we have a file range that points to a compressed extent 3160 * and it's followed by a consecutive file range that points to 3161 * to the same compressed extent (possibly with a different 3162 * offset and/or length, so it either points to the whole extent 3163 * or only part of it), we must make sure we do not submit a 3164 * single bio to populate the pages for the 2 ranges because 3165 * this makes the compressed 
extent read zero out the pages 3166 * belonging to the 2nd range. Imagine the following scenario: 3167 * 3168 * File layout 3169 * [0 - 8K] [8K - 24K] 3170 * | | 3171 * | | 3172 * points to extent X, points to extent X, 3173 * offset 4K, length of 8K offset 0, length 16K 3174 * 3175 * [extent X, compressed length = 4K uncompressed length = 16K] 3176 * 3177 * If the bio to read the compressed extent covers both ranges, 3178 * it will decompress extent X into the pages belonging to the 3179 * first range and then it will stop, zeroing out the remaining 3180 * pages that belong to the other range that points to extent X. 3181 * So here we make sure we submit 2 bios, one for the first 3182 * range and another one for the second range. Both will target 3183 * the same physical extent from disk, but we can't currently 3184 * make the compressed bio endio callback populate the pages 3185 * for both ranges because each compressed bio is tightly 3186 * coupled with a single extent map, and each range can have 3187 * an extent map with a different offset value relative to the 3188 * uncompressed data of our extent and different lengths. This 3189 * is a corner case so we prioritize correctness and accept the 3190 * non-optimal behavior (submitting 2 bios for the same extent). 3191 */ 3192 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) && 3193 prev_em_start && *prev_em_start != (u64)-1 && 3194 *prev_em_start != em->start) 3195 force_bio_submit = true; 3196 3197 if (prev_em_start) 3198 *prev_em_start = em->start; 3199 3200 free_extent_map(em); 3201 em = NULL; 3202 3203 /* we've found a hole, just zero and go on */ 3204 if (block_start == EXTENT_MAP_HOLE) { 3205 char *userpage; 3206 struct extent_state *cached = NULL; 3207 3208 userpage = kmap_atomic(page); 3209 memset(userpage + pg_offset, 0, iosize); 3210 flush_dcache_page(page); 3211 kunmap_atomic(userpage); 3212 3213 set_extent_uptodate(tree, cur, cur + iosize - 1, 3214 &cached, GFP_NOFS); 3215 unlock_extent_cached(tree, cur, 3216 cur + iosize - 1, &cached); 3217 cur = cur + iosize; 3218 pg_offset += iosize; 3219 continue; 3220 } 3221 /* the get_extent function already copied into the page */ 3222 if (test_range_bit(tree, cur, cur_end, 3223 EXTENT_UPTODATE, 1, NULL)) { 3224 check_page_uptodate(tree, page); 3225 unlock_extent(tree, cur, cur + iosize - 1); 3226 cur = cur + iosize; 3227 pg_offset += iosize; 3228 continue; 3229 } 3230 /* we have an inline extent but it didn't get marked up 3231 * to date.
Error out 3232 */ 3233 if (block_start == EXTENT_MAP_INLINE) { 3234 SetPageError(page); 3235 unlock_extent(tree, cur, cur + iosize - 1); 3236 cur = cur + iosize; 3237 pg_offset += iosize; 3238 continue; 3239 } 3240 3241 ret = submit_extent_page(REQ_OP_READ | read_flags, tree, NULL, 3242 page, offset, disk_io_size, 3243 pg_offset, bio, 3244 end_bio_extent_readpage, mirror_num, 3245 *bio_flags, 3246 this_bio_flag, 3247 force_bio_submit); 3248 if (!ret) { 3249 nr++; 3250 *bio_flags = this_bio_flag; 3251 } else { 3252 SetPageError(page); 3253 unlock_extent(tree, cur, cur + iosize - 1); 3254 goto out; 3255 } 3256 cur = cur + iosize; 3257 pg_offset += iosize; 3258 } 3259 out: 3260 if (!nr) { 3261 if (!PageError(page)) 3262 SetPageUptodate(page); 3263 unlock_page(page); 3264 } 3265 return ret; 3266 } 3267 3268 static inline void contiguous_readpages(struct extent_io_tree *tree, 3269 struct page *pages[], int nr_pages, 3270 u64 start, u64 end, 3271 struct extent_map **em_cached, 3272 struct bio **bio, 3273 unsigned long *bio_flags, 3274 u64 *prev_em_start) 3275 { 3276 struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host); 3277 int index; 3278 3279 btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL); 3280 3281 for (index = 0; index < nr_pages; index++) { 3282 __do_readpage(tree, pages[index], btrfs_get_extent, em_cached, 3283 bio, 0, bio_flags, REQ_RAHEAD, prev_em_start); 3284 put_page(pages[index]); 3285 } 3286 } 3287 3288 static int __extent_read_full_page(struct extent_io_tree *tree, 3289 struct page *page, 3290 get_extent_t *get_extent, 3291 struct bio **bio, int mirror_num, 3292 unsigned long *bio_flags, 3293 unsigned int read_flags) 3294 { 3295 struct btrfs_inode *inode = BTRFS_I(page->mapping->host); 3296 u64 start = page_offset(page); 3297 u64 end = start + PAGE_SIZE - 1; 3298 int ret; 3299 3300 btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL); 3301 3302 ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, 3303 bio_flags, read_flags, NULL); 3304 return ret; 3305 } 3306 3307 int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 3308 get_extent_t *get_extent, int mirror_num) 3309 { 3310 struct bio *bio = NULL; 3311 unsigned long bio_flags = 0; 3312 int ret; 3313 3314 ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, 3315 &bio_flags, 0); 3316 if (bio) 3317 ret = submit_one_bio(bio, mirror_num, bio_flags); 3318 return ret; 3319 } 3320 3321 static void update_nr_written(struct writeback_control *wbc, 3322 unsigned long nr_written) 3323 { 3324 wbc->nr_to_write -= nr_written; 3325 } 3326 3327 /* 3328 * helper for __extent_writepage, doing all of the delayed allocation setup. 3329 * 3330 * This returns 1 if btrfs_run_delalloc_range function did all the work required 3331 * to write the page (copy into inline extent). In this case the IO has 3332 * been started and the page is already unlocked. 
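 * (__extent_writepage() reacts to that 1 by jumping straight to its
 * done_unlocked label, see below.)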
3333 * 3334 * This returns 0 if all went well (page still locked) 3335 * This returns < 0 if there were errors (page still locked) 3336 */ 3337 static noinline_for_stack int writepage_delalloc(struct inode *inode, 3338 struct page *page, struct writeback_control *wbc, 3339 u64 delalloc_start, unsigned long *nr_written) 3340 { 3341 u64 page_end = delalloc_start + PAGE_SIZE - 1; 3342 bool found; 3343 u64 delalloc_to_write = 0; 3344 u64 delalloc_end = 0; 3345 int ret; 3346 int page_started = 0; 3347 3348 3349 while (delalloc_end < page_end) { 3350 found = find_lock_delalloc_range(inode, page, 3351 &delalloc_start, 3352 &delalloc_end); 3353 if (!found) { 3354 delalloc_start = delalloc_end + 1; 3355 continue; 3356 } 3357 ret = btrfs_run_delalloc_range(inode, page, delalloc_start, 3358 delalloc_end, &page_started, nr_written, wbc); 3359 if (ret) { 3360 SetPageError(page); 3361 /* 3362 * btrfs_run_delalloc_range should return < 0 for error 3363 * but just in case, we use > 0 here meaning the IO is 3364 * started, so we don't want to return > 0 unless 3365 * things are going well. 3366 */ 3367 ret = ret < 0 ? ret : -EIO; 3368 goto done; 3369 } 3370 /* 3371 * delalloc_end is already one less than the total length, so 3372 * we don't subtract one from PAGE_SIZE 3373 */ 3374 delalloc_to_write += (delalloc_end - delalloc_start + 3375 PAGE_SIZE) >> PAGE_SHIFT; 3376 delalloc_start = delalloc_end + 1; 3377 } 3378 if (wbc->nr_to_write < delalloc_to_write) { 3379 int thresh = 8192; 3380 3381 if (delalloc_to_write < thresh * 2) 3382 thresh = delalloc_to_write; 3383 wbc->nr_to_write = min_t(u64, delalloc_to_write, 3384 thresh); 3385 } 3386 3387 /* did the fill delalloc function already unlock and start 3388 * the IO? 3389 */ 3390 if (page_started) { 3391 /* 3392 * we've unlocked the page, so we can't update 3393 * the mapping's writeback index, just update 3394 * nr_to_write. 3395 */ 3396 wbc->nr_to_write -= *nr_written; 3397 return 1; 3398 } 3399 3400 ret = 0; 3401 3402 done: 3403 return ret; 3404 } 3405 3406 /* 3407 * helper for __extent_writepage. This calls the writepage start hooks, 3408 * and does the loop to map the page into extents and bios. 
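 * Per iteration the amount mapped is
 *	iosize = ALIGN(min(extent_map_end(em) - cur, end - cur + 1), blocksize)
 * e.g. with a 4K block size, an extent map ending 8K past cur and 12K of
 * page left, we write an 8K chunk and the next iteration moves on to the
 * following extent map.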
3409 * 3410 * We return 1 if the IO is started and the page is unlocked, 3411 * 0 if all went well (page still locked) 3412 * < 0 if there were errors (page still locked) 3413 */ 3414 static noinline_for_stack int __extent_writepage_io(struct inode *inode, 3415 struct page *page, 3416 struct writeback_control *wbc, 3417 struct extent_page_data *epd, 3418 loff_t i_size, 3419 unsigned long nr_written, 3420 int *nr_ret) 3421 { 3422 struct extent_io_tree *tree = epd->tree; 3423 u64 start = page_offset(page); 3424 u64 page_end = start + PAGE_SIZE - 1; 3425 u64 end; 3426 u64 cur = start; 3427 u64 extent_offset; 3428 u64 block_start; 3429 u64 iosize; 3430 struct extent_map *em; 3431 size_t pg_offset = 0; 3432 size_t blocksize; 3433 int ret = 0; 3434 int nr = 0; 3435 const unsigned int write_flags = wbc_to_write_flags(wbc); 3436 bool compressed; 3437 3438 ret = btrfs_writepage_cow_fixup(page, start, page_end); 3439 if (ret) { 3440 /* Fixup worker will requeue */ 3441 if (ret == -EBUSY) 3442 wbc->pages_skipped++; 3443 else 3444 redirty_page_for_writepage(wbc, page); 3445 3446 update_nr_written(wbc, nr_written); 3447 unlock_page(page); 3448 return 1; 3449 } 3450 3451 /* 3452 * we don't want to touch the inode after unlocking the page, 3453 * so we update the mapping writeback index now 3454 */ 3455 update_nr_written(wbc, nr_written + 1); 3456 3457 end = page_end; 3458 if (i_size <= start) { 3459 btrfs_writepage_endio_finish_ordered(page, start, page_end, 1); 3460 goto done; 3461 } 3462 3463 blocksize = inode->i_sb->s_blocksize; 3464 3465 while (cur <= end) { 3466 u64 em_end; 3467 u64 offset; 3468 3469 if (cur >= i_size) { 3470 btrfs_writepage_endio_finish_ordered(page, cur, 3471 page_end, 1); 3472 break; 3473 } 3474 em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, cur, 3475 end - cur + 1, 1); 3476 if (IS_ERR_OR_NULL(em)) { 3477 SetPageError(page); 3478 ret = PTR_ERR_OR_ZERO(em); 3479 break; 3480 } 3481 3482 extent_offset = cur - em->start; 3483 em_end = extent_map_end(em); 3484 BUG_ON(em_end <= cur); 3485 BUG_ON(end < cur); 3486 iosize = min(em_end - cur, end - cur + 1); 3487 iosize = ALIGN(iosize, blocksize); 3488 offset = em->block_start + extent_offset; 3489 block_start = em->block_start; 3490 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 3491 free_extent_map(em); 3492 em = NULL; 3493 3494 /* 3495 * compressed and inline extents are written through other 3496 * paths in the FS 3497 */ 3498 if (compressed || block_start == EXTENT_MAP_HOLE || 3499 block_start == EXTENT_MAP_INLINE) { 3500 /* 3501 * end_io notification does not happen here for 3502 * compressed extents 3503 */ 3504 if (!compressed) 3505 btrfs_writepage_endio_finish_ordered(page, cur, 3506 cur + iosize - 1, 3507 1); 3508 else if (compressed) { 3509 /* we don't want to end_page_writeback on 3510 * a compressed extent. 
this happens 3511 * elsewhere 3512 */ 3513 nr++; 3514 } 3515 3516 cur += iosize; 3517 pg_offset += iosize; 3518 continue; 3519 } 3520 3521 btrfs_set_range_writeback(tree, cur, cur + iosize - 1); 3522 if (!PageWriteback(page)) { 3523 btrfs_err(BTRFS_I(inode)->root->fs_info, 3524 "page %lu not writeback, cur %llu end %llu", 3525 page->index, cur, end); 3526 } 3527 3528 ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc, 3529 page, offset, iosize, pg_offset, 3530 &epd->bio, 3531 end_bio_extent_writepage, 3532 0, 0, 0, false); 3533 if (ret) { 3534 SetPageError(page); 3535 if (PageWriteback(page)) 3536 end_page_writeback(page); 3537 } 3538 3539 cur = cur + iosize; 3540 pg_offset += iosize; 3541 nr++; 3542 } 3543 done: 3544 *nr_ret = nr; 3545 return ret; 3546 } 3547 3548 /* 3549 * the writepage semantics are similar to regular writepage. extent 3550 * records are inserted to lock ranges in the tree, and as dirty areas 3551 * are found, they are marked writeback. Then the lock bits are removed 3552 * and the end_io handler clears the writeback ranges 3553 * 3554 * Return 0 if everything goes well. 3555 * Return <0 for error. 3556 */ 3557 static int __extent_writepage(struct page *page, struct writeback_control *wbc, 3558 struct extent_page_data *epd) 3559 { 3560 struct inode *inode = page->mapping->host; 3561 u64 start = page_offset(page); 3562 u64 page_end = start + PAGE_SIZE - 1; 3563 int ret; 3564 int nr = 0; 3565 size_t pg_offset = 0; 3566 loff_t i_size = i_size_read(inode); 3567 unsigned long end_index = i_size >> PAGE_SHIFT; 3568 unsigned long nr_written = 0; 3569 3570 trace___extent_writepage(page, inode, wbc); 3571 3572 WARN_ON(!PageLocked(page)); 3573 3574 ClearPageError(page); 3575 3576 pg_offset = offset_in_page(i_size); 3577 if (page->index > end_index || 3578 (page->index == end_index && !pg_offset)) { 3579 page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); 3580 unlock_page(page); 3581 return 0; 3582 } 3583 3584 if (page->index == end_index) { 3585 char *userpage; 3586 3587 userpage = kmap_atomic(page); 3588 memset(userpage + pg_offset, 0, 3589 PAGE_SIZE - pg_offset); 3590 kunmap_atomic(userpage); 3591 flush_dcache_page(page); 3592 } 3593 3594 pg_offset = 0; 3595 3596 set_page_extent_mapped(page); 3597 3598 if (!epd->extent_locked) { 3599 ret = writepage_delalloc(inode, page, wbc, start, &nr_written); 3600 if (ret == 1) 3601 goto done_unlocked; 3602 if (ret) 3603 goto done; 3604 } 3605 3606 ret = __extent_writepage_io(inode, page, wbc, epd, 3607 i_size, nr_written, &nr); 3608 if (ret == 1) 3609 goto done_unlocked; 3610 3611 done: 3612 if (nr == 0) { 3613 /* make sure the mapping tag for page dirty gets cleared */ 3614 set_page_writeback(page); 3615 end_page_writeback(page); 3616 } 3617 if (PageError(page)) { 3618 ret = ret < 0 ? 
ret : -EIO; 3619 end_extent_writepage(page, ret, start, page_end); 3620 } 3621 unlock_page(page); 3622 ASSERT(ret <= 0); 3623 return ret; 3624 3625 done_unlocked: 3626 return 0; 3627 } 3628 3629 void wait_on_extent_buffer_writeback(struct extent_buffer *eb) 3630 { 3631 wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK, 3632 TASK_UNINTERRUPTIBLE); 3633 } 3634 3635 static void end_extent_buffer_writeback(struct extent_buffer *eb) 3636 { 3637 clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3638 smp_mb__after_atomic(); 3639 wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); 3640 } 3641 3642 /* 3643 * Lock eb pages and flush the bio if we can't get the locks 3644 * 3645 * Return 0 if nothing went wrong 3646 * Return >0 is the same as 0, except the bio is not submitted 3647 * Return <0 if something went wrong, no page is locked 3648 */ 3649 static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb, 3650 struct extent_page_data *epd) 3651 { 3652 struct btrfs_fs_info *fs_info = eb->fs_info; 3653 int i, num_pages, failed_page_nr; 3654 int flush = 0; 3655 int ret = 0; 3656 3657 if (!btrfs_try_tree_write_lock(eb)) { 3658 ret = flush_write_bio(epd); 3659 if (ret < 0) 3660 return ret; 3661 flush = 1; 3662 btrfs_tree_lock(eb); 3663 } 3664 3665 if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { 3666 btrfs_tree_unlock(eb); 3667 if (!epd->sync_io) 3668 return 0; 3669 if (!flush) { 3670 ret = flush_write_bio(epd); 3671 if (ret < 0) 3672 return ret; 3673 flush = 1; 3674 } 3675 while (1) { 3676 wait_on_extent_buffer_writeback(eb); 3677 btrfs_tree_lock(eb); 3678 if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) 3679 break; 3680 btrfs_tree_unlock(eb); 3681 } 3682 } 3683 3684 /* 3685 * We need to do this to prevent races with anyone who checks if the eb is 3686 * under IO since we can end up having no IO bits set for a short period 3687 * of time. 3688 */ 3689 spin_lock(&eb->refs_lock); 3690 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3691 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3692 spin_unlock(&eb->refs_lock); 3693 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 3694 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, 3695 -eb->len, 3696 fs_info->dirty_metadata_batch); 3697 ret = 1; 3698 } else { 3699 spin_unlock(&eb->refs_lock); 3700 } 3701 3702 btrfs_tree_unlock(eb); 3703 3704 if (!ret) 3705 return ret; 3706 3707 num_pages = num_extent_pages(eb); 3708 for (i = 0; i < num_pages; i++) { 3709 struct page *p = eb->pages[i]; 3710 3711 if (!trylock_page(p)) { 3712 if (!flush) { 3713 int err; 3714 3715 err = flush_write_bio(epd); 3716 if (err < 0) { 3717 ret = err; 3718 failed_page_nr = i; 3719 goto err_unlock; 3720 } 3721 flush = 1; 3722 } 3723 lock_page(p); 3724 } 3725 } 3726 3727 return ret; 3728 err_unlock: 3729 /* Unlock already locked pages */ 3730 for (i = 0; i < failed_page_nr; i++) 3731 unlock_page(eb->pages[i]); 3732 /* 3733 * Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it. 3734 * Also set EXTENT_BUFFER_DIRTY back so future write attempts on this eb 3735 * can be made, undoing everything done before.
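 * (That is what the code below does: re-take the tree lock, set
 * EXTENT_BUFFER_DIRTY again, clear and wake EXTENT_BUFFER_WRITEBACK via
 * end_extent_buffer_writeback(), and add eb->len back to
 * dirty_metadata_bytes.)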
3736 */ 3737 btrfs_tree_lock(eb); 3738 spin_lock(&eb->refs_lock); 3739 set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); 3740 end_extent_buffer_writeback(eb); 3741 spin_unlock(&eb->refs_lock); 3742 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len, 3743 fs_info->dirty_metadata_batch); 3744 btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 3745 btrfs_tree_unlock(eb); 3746 return ret; 3747 } 3748 3749 static void set_btree_ioerr(struct page *page) 3750 { 3751 struct extent_buffer *eb = (struct extent_buffer *)page->private; 3752 struct btrfs_fs_info *fs_info; 3753 3754 SetPageError(page); 3755 if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) 3756 return; 3757 3758 /* 3759 * If we error out, we should add back the dirty_metadata_bytes 3760 * to make it consistent. 3761 */ 3762 fs_info = eb->fs_info; 3763 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, 3764 eb->len, fs_info->dirty_metadata_batch); 3765 3766 /* 3767 * If writeback for a btree extent that doesn't belong to a log tree 3768 * failed, increment the counter transaction->eb_write_errors. 3769 * We do this because while the transaction is running and before it's 3770 * committing (when we call filemap_fdata[write|wait]_range against 3771 * the btree inode), we might have 3772 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it 3773 * returns an error or an error happens during writeback, when we're 3774 * committing the transaction we wouldn't know about it, since the pages 3775 * can be no longer dirty nor marked anymore for writeback (if a 3776 * subsequent modification to the extent buffer didn't happen before the 3777 * transaction commit), which makes filemap_fdata[write|wait]_range not 3778 * able to find the pages tagged with SetPageError at transaction 3779 * commit time. So if this happens we must abort the transaction, 3780 * otherwise we commit a super block with btree roots that point to 3781 * btree nodes/leafs whose content on disk is invalid - either garbage 3782 * or the content of some node/leaf from a past generation that got 3783 * cowed or deleted and is no longer valid. 3784 * 3785 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would 3786 * not be enough - we need to distinguish between log tree extents vs 3787 * non-log tree extents, and the next filemap_fdatawait_range() call 3788 * will catch and clear such errors in the mapping - and that call might 3789 * be from a log sync and not from a transaction commit. Also, checking 3790 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is 3791 * not done and would not be reliable - the eb might have been released 3792 * from memory and reading it back again means that flag would not be 3793 * set (since it's a runtime flag, not persisted on disk). 3794 * 3795 * Using the flags below in the btree inode also makes us achieve the 3796 * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started 3797 * writeback for all dirty pages and before filemap_fdatawait_range() 3798 * is called, the writeback for all dirty pages had already finished 3799 * with errors - because we were not using AS_EIO/AS_ENOSPC, 3800 * filemap_fdatawait_range() would return success, as it could not know 3801 * that writeback errors happened (the pages were no longer tagged for 3802 * writeback). 
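 * In short: record the failure as a bit in fs_info->flags (one bit for
 * ordinary btree extents and one per log tree root), which is exactly
 * what the switch on eb->log_index below does.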
3803 */ 3804 switch (eb->log_index) { 3805 case -1: 3806 set_bit(BTRFS_FS_BTREE_ERR, &eb->fs_info->flags); 3807 break; 3808 case 0: 3809 set_bit(BTRFS_FS_LOG1_ERR, &eb->fs_info->flags); 3810 break; 3811 case 1: 3812 set_bit(BTRFS_FS_LOG2_ERR, &eb->fs_info->flags); 3813 break; 3814 default: 3815 BUG(); /* unexpected, logic error */ 3816 } 3817 } 3818 3819 static void end_bio_extent_buffer_writepage(struct bio *bio) 3820 { 3821 struct bio_vec *bvec; 3822 struct extent_buffer *eb; 3823 int done; 3824 struct bvec_iter_all iter_all; 3825 3826 ASSERT(!bio_flagged(bio, BIO_CLONED)); 3827 bio_for_each_segment_all(bvec, bio, iter_all) { 3828 struct page *page = bvec->bv_page; 3829 3830 eb = (struct extent_buffer *)page->private; 3831 BUG_ON(!eb); 3832 done = atomic_dec_and_test(&eb->io_pages); 3833 3834 if (bio->bi_status || 3835 test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { 3836 ClearPageUptodate(page); 3837 set_btree_ioerr(page); 3838 } 3839 3840 end_page_writeback(page); 3841 3842 if (!done) 3843 continue; 3844 3845 end_extent_buffer_writeback(eb); 3846 } 3847 3848 bio_put(bio); 3849 } 3850 3851 static noinline_for_stack int write_one_eb(struct extent_buffer *eb, 3852 struct writeback_control *wbc, 3853 struct extent_page_data *epd) 3854 { 3855 struct btrfs_fs_info *fs_info = eb->fs_info; 3856 struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree; 3857 u64 offset = eb->start; 3858 u32 nritems; 3859 int i, num_pages; 3860 unsigned long start, end; 3861 unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META; 3862 int ret = 0; 3863 3864 clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); 3865 num_pages = num_extent_pages(eb); 3866 atomic_set(&eb->io_pages, num_pages); 3867 3868 /* set btree blocks beyond nritems with 0 to avoid stale content. */ 3869 nritems = btrfs_header_nritems(eb); 3870 if (btrfs_header_level(eb) > 0) { 3871 end = btrfs_node_key_ptr_offset(nritems); 3872 3873 memzero_extent_buffer(eb, end, eb->len - end); 3874 } else { 3875 /* 3876 * leaf: 3877 * header 0 1 2 .. N ... data_N .. 
data_2 data_1 data_0 3878 */ 3879 start = btrfs_item_nr_offset(nritems); 3880 end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb); 3881 memzero_extent_buffer(eb, start, end - start); 3882 } 3883 3884 for (i = 0; i < num_pages; i++) { 3885 struct page *p = eb->pages[i]; 3886 3887 clear_page_dirty_for_io(p); 3888 set_page_writeback(p); 3889 ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc, 3890 p, offset, PAGE_SIZE, 0, 3891 &epd->bio, 3892 end_bio_extent_buffer_writepage, 3893 0, 0, 0, false); 3894 if (ret) { 3895 set_btree_ioerr(p); 3896 if (PageWriteback(p)) 3897 end_page_writeback(p); 3898 if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) 3899 end_extent_buffer_writeback(eb); 3900 ret = -EIO; 3901 break; 3902 } 3903 offset += PAGE_SIZE; 3904 update_nr_written(wbc, 1); 3905 unlock_page(p); 3906 } 3907 3908 if (unlikely(ret)) { 3909 for (; i < num_pages; i++) { 3910 struct page *p = eb->pages[i]; 3911 clear_page_dirty_for_io(p); 3912 unlock_page(p); 3913 } 3914 } 3915 3916 return ret; 3917 } 3918 3919 int btree_write_cache_pages(struct address_space *mapping, 3920 struct writeback_control *wbc) 3921 { 3922 struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; 3923 struct extent_buffer *eb, *prev_eb = NULL; 3924 struct extent_page_data epd = { 3925 .bio = NULL, 3926 .tree = tree, 3927 .extent_locked = 0, 3928 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3929 }; 3930 int ret = 0; 3931 int done = 0; 3932 int nr_to_write_done = 0; 3933 struct pagevec pvec; 3934 int nr_pages; 3935 pgoff_t index; 3936 pgoff_t end; /* Inclusive */ 3937 int scanned = 0; 3938 xa_mark_t tag; 3939 3940 pagevec_init(&pvec); 3941 if (wbc->range_cyclic) { 3942 index = mapping->writeback_index; /* Start from prev offset */ 3943 end = -1; 3944 } else { 3945 index = wbc->range_start >> PAGE_SHIFT; 3946 end = wbc->range_end >> PAGE_SHIFT; 3947 scanned = 1; 3948 } 3949 if (wbc->sync_mode == WB_SYNC_ALL) 3950 tag = PAGECACHE_TAG_TOWRITE; 3951 else 3952 tag = PAGECACHE_TAG_DIRTY; 3953 retry: 3954 if (wbc->sync_mode == WB_SYNC_ALL) 3955 tag_pages_for_writeback(mapping, index, end); 3956 while (!done && !nr_to_write_done && (index <= end) && 3957 (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, 3958 tag))) { 3959 unsigned i; 3960 3961 scanned = 1; 3962 for (i = 0; i < nr_pages; i++) { 3963 struct page *page = pvec.pages[i]; 3964 3965 if (!PagePrivate(page)) 3966 continue; 3967 3968 spin_lock(&mapping->private_lock); 3969 if (!PagePrivate(page)) { 3970 spin_unlock(&mapping->private_lock); 3971 continue; 3972 } 3973 3974 eb = (struct extent_buffer *)page->private; 3975 3976 /* 3977 * Shouldn't happen and normally this would be a BUG_ON 3978 * but no sense in crashing the users box for something 3979 * we can survive anyway. 3980 */ 3981 if (WARN_ON(!eb)) { 3982 spin_unlock(&mapping->private_lock); 3983 continue; 3984 } 3985 3986 if (eb == prev_eb) { 3987 spin_unlock(&mapping->private_lock); 3988 continue; 3989 } 3990 3991 ret = atomic_inc_not_zero(&eb->refs); 3992 spin_unlock(&mapping->private_lock); 3993 if (!ret) 3994 continue; 3995 3996 prev_eb = eb; 3997 ret = lock_extent_buffer_for_io(eb, &epd); 3998 if (!ret) { 3999 free_extent_buffer(eb); 4000 continue; 4001 } else if (ret < 0) { 4002 done = 1; 4003 free_extent_buffer(eb); 4004 break; 4005 } 4006 4007 ret = write_one_eb(eb, wbc, &epd); 4008 if (ret) { 4009 done = 1; 4010 free_extent_buffer(eb); 4011 break; 4012 } 4013 free_extent_buffer(eb); 4014 4015 /* 4016 * the filesystem may choose to bump up nr_to_write. 
4017 * We have to make sure to honor the new nr_to_write 4018 * at any time 4019 */ 4020 nr_to_write_done = wbc->nr_to_write <= 0; 4021 } 4022 pagevec_release(&pvec); 4023 cond_resched(); 4024 } 4025 if (!scanned && !done) { 4026 /* 4027 * We hit the last page and there is more work to be done: wrap 4028 * back to the start of the file 4029 */ 4030 scanned = 1; 4031 index = 0; 4032 goto retry; 4033 } 4034 ASSERT(ret <= 0); 4035 if (ret < 0) { 4036 end_write_bio(&epd, ret); 4037 return ret; 4038 } 4039 ret = flush_write_bio(&epd); 4040 return ret; 4041 } 4042 4043 /** 4044 * extent_write_cache_pages - walk the list of dirty pages of the given address space and write all of them. 4045 * @mapping: address space structure to write 4046 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 4047 * @epd: holds context for the write, namely the bio being built 4048 * 4049 * If a page is already under I/O, extent_write_cache_pages() skips it, even 4050 * if it's dirty. This is desirable behaviour for memory-cleaning writeback, 4051 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() 4052 * and msync() need to guarantee that all the data which was dirty at the time 4053 * the call was made get new I/O started against them. If wbc->sync_mode is 4054 * WB_SYNC_ALL then we were called for data integrity and we must wait for 4055 * existing IO to complete. 4056 */ 4057 static int extent_write_cache_pages(struct address_space *mapping, 4058 struct writeback_control *wbc, 4059 struct extent_page_data *epd) 4060 { 4061 struct inode *inode = mapping->host; 4062 int ret = 0; 4063 int done = 0; 4064 int nr_to_write_done = 0; 4065 struct pagevec pvec; 4066 int nr_pages; 4067 pgoff_t index; 4068 pgoff_t end; /* Inclusive */ 4069 pgoff_t done_index; 4070 int range_whole = 0; 4071 int scanned = 0; 4072 xa_mark_t tag; 4073 4074 /* 4075 * We have to hold onto the inode so that ordered extents can do their 4076 * work when the IO finishes. The alternative to this is failing to add 4077 * an ordered extent if the igrab() fails there and that is a huge pain 4078 * to deal with, so instead just hold onto the inode throughout the 4079 * writepages operation. If it fails here we are freeing up the inode 4080 * anyway and we'd rather not waste our time writing out stuff that is 4081 * going to be truncated anyway. 4082 */ 4083 if (!igrab(inode)) 4084 return 0; 4085 4086 pagevec_init(&pvec); 4087 if (wbc->range_cyclic) { 4088 index = mapping->writeback_index; /* Start from prev offset */ 4089 end = -1; 4090 } else { 4091 index = wbc->range_start >> PAGE_SHIFT; 4092 end = wbc->range_end >> PAGE_SHIFT; 4093 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 4094 range_whole = 1; 4095 scanned = 1; 4096 } 4097 4098 /* 4099 * We do the tagged writepage as long as the snapshot flush bit is set 4100 * and we are the first one to do the filemap_flush() on this inode. 4101 * 4102 * The nr_to_write == LONG_MAX is needed to make sure other flushers do 4103 * not race in and drop the bit.
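 * Tagging pages for TOWRITE below also means this pass only writes back pages that were already dirty when it started, so tasks that keep redirtying pages cannot livelock the flush.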
4104 */ 4105 if (range_whole && wbc->nr_to_write == LONG_MAX && 4106 test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH, 4107 &BTRFS_I(inode)->runtime_flags)) 4108 wbc->tagged_writepages = 1; 4109 4110 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 4111 tag = PAGECACHE_TAG_TOWRITE; 4112 else 4113 tag = PAGECACHE_TAG_DIRTY; 4114 retry: 4115 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 4116 tag_pages_for_writeback(mapping, index, end); 4117 done_index = index; 4118 while (!done && !nr_to_write_done && (index <= end) && 4119 (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, 4120 &index, end, tag))) { 4121 unsigned i; 4122 4123 scanned = 1; 4124 for (i = 0; i < nr_pages; i++) { 4125 struct page *page = pvec.pages[i]; 4126 4127 done_index = page->index + 1; 4128 /* 4129 * At this point we hold neither the i_pages lock nor 4130 * the page lock: the page may be truncated or 4131 * invalidated (changing page->mapping to NULL), 4132 * or even swizzled back from swapper_space to 4133 * tmpfs file mapping 4134 */ 4135 if (!trylock_page(page)) { 4136 ret = flush_write_bio(epd); 4137 BUG_ON(ret < 0); 4138 lock_page(page); 4139 } 4140 4141 if (unlikely(page->mapping != mapping)) { 4142 unlock_page(page); 4143 continue; 4144 } 4145 4146 if (wbc->sync_mode != WB_SYNC_NONE) { 4147 if (PageWriteback(page)) { 4148 ret = flush_write_bio(epd); 4149 BUG_ON(ret < 0); 4150 } 4151 wait_on_page_writeback(page); 4152 } 4153 4154 if (PageWriteback(page) || 4155 !clear_page_dirty_for_io(page)) { 4156 unlock_page(page); 4157 continue; 4158 } 4159 4160 ret = __extent_writepage(page, wbc, epd); 4161 if (ret < 0) { 4162 done = 1; 4163 break; 4164 } 4165 4166 /* 4167 * the filesystem may choose to bump up nr_to_write. 4168 * We have to make sure to honor the new nr_to_write 4169 * at any time 4170 */ 4171 nr_to_write_done = wbc->nr_to_write <= 0; 4172 } 4173 pagevec_release(&pvec); 4174 cond_resched(); 4175 } 4176 if (!scanned && !done) { 4177 /* 4178 * We hit the last page and there is more work to be done: wrap 4179 * back to the start of the file 4180 */ 4181 scanned = 1; 4182 index = 0; 4183 goto retry; 4184 } 4185 4186 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) 4187 mapping->writeback_index = done_index; 4188 4189 btrfs_add_delayed_iput(inode); 4190 return ret; 4191 } 4192 4193 int extent_write_full_page(struct page *page, struct writeback_control *wbc) 4194 { 4195 int ret; 4196 struct extent_page_data epd = { 4197 .bio = NULL, 4198 .tree = &BTRFS_I(page->mapping->host)->io_tree, 4199 .extent_locked = 0, 4200 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 4201 }; 4202 4203 ret = __extent_writepage(page, wbc, &epd); 4204 ASSERT(ret <= 0); 4205 if (ret < 0) { 4206 end_write_bio(&epd, ret); 4207 return ret; 4208 } 4209 4210 ret = flush_write_bio(&epd); 4211 ASSERT(ret <= 0); 4212 return ret; 4213 } 4214 4215 int extent_write_locked_range(struct inode *inode, u64 start, u64 end, 4216 int mode) 4217 { 4218 int ret = 0; 4219 struct address_space *mapping = inode->i_mapping; 4220 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 4221 struct page *page; 4222 unsigned long nr_pages = (end - start + PAGE_SIZE) >> 4223 PAGE_SHIFT; 4224 4225 struct extent_page_data epd = { 4226 .bio = NULL, 4227 .tree = tree, 4228 .extent_locked = 1, 4229 .sync_io = mode == WB_SYNC_ALL, 4230 }; 4231 struct writeback_control wbc_writepages = { 4232 .sync_mode = mode, 4233 .nr_to_write = nr_pages * 2, 4234 .range_start = start, 4235 .range_end = end + 1, 4236 /* We're called from an async helper 
function */ 4237 .punt_to_cgroup = 1, 4238 .no_cgroup_owner = 1, 4239 }; 4240 4241 wbc_attach_fdatawrite_inode(&wbc_writepages, inode); 4242 while (start <= end) { 4243 page = find_get_page(mapping, start >> PAGE_SHIFT); 4244 if (clear_page_dirty_for_io(page)) 4245 ret = __extent_writepage(page, &wbc_writepages, &epd); 4246 else { 4247 btrfs_writepage_endio_finish_ordered(page, start, 4248 start + PAGE_SIZE - 1, 1); 4249 unlock_page(page); 4250 } 4251 put_page(page); 4252 start += PAGE_SIZE; 4253 } 4254 4255 ASSERT(ret <= 0); 4256 if (ret == 0) 4257 ret = flush_write_bio(&epd); 4258 else 4259 end_write_bio(&epd, ret); 4260 4261 wbc_detach_inode(&wbc_writepages); 4262 return ret; 4263 } 4264 4265 int extent_writepages(struct address_space *mapping, 4266 struct writeback_control *wbc) 4267 { 4268 int ret = 0; 4269 struct extent_page_data epd = { 4270 .bio = NULL, 4271 .tree = &BTRFS_I(mapping->host)->io_tree, 4272 .extent_locked = 0, 4273 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 4274 }; 4275 4276 ret = extent_write_cache_pages(mapping, wbc, &epd); 4277 ASSERT(ret <= 0); 4278 if (ret < 0) { 4279 end_write_bio(&epd, ret); 4280 return ret; 4281 } 4282 ret = flush_write_bio(&epd); 4283 return ret; 4284 } 4285 4286 int extent_readpages(struct address_space *mapping, struct list_head *pages, 4287 unsigned nr_pages) 4288 { 4289 struct bio *bio = NULL; 4290 unsigned long bio_flags = 0; 4291 struct page *pagepool[16]; 4292 struct extent_map *em_cached = NULL; 4293 struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; 4294 int nr = 0; 4295 u64 prev_em_start = (u64)-1; 4296 4297 while (!list_empty(pages)) { 4298 u64 contig_end = 0; 4299 4300 for (nr = 0; nr < ARRAY_SIZE(pagepool) && !list_empty(pages);) { 4301 struct page *page = lru_to_page(pages); 4302 4303 prefetchw(&page->flags); 4304 list_del(&page->lru); 4305 if (add_to_page_cache_lru(page, mapping, page->index, 4306 readahead_gfp_mask(mapping))) { 4307 put_page(page); 4308 break; 4309 } 4310 4311 pagepool[nr++] = page; 4312 contig_end = page_offset(page) + PAGE_SIZE - 1; 4313 } 4314 4315 if (nr) { 4316 u64 contig_start = page_offset(pagepool[0]); 4317 4318 ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end); 4319 4320 contiguous_readpages(tree, pagepool, nr, contig_start, 4321 contig_end, &em_cached, &bio, &bio_flags, 4322 &prev_em_start); 4323 } 4324 } 4325 4326 if (em_cached) 4327 free_extent_map(em_cached); 4328 4329 if (bio) 4330 return submit_one_bio(bio, 0, bio_flags); 4331 return 0; 4332 } 4333 4334 /* 4335 * basic invalidatepage code, this waits on any locked or writeback 4336 * ranges corresponding to the page, and then deletes any extent state 4337 * records from the tree 4338 */ 4339 int extent_invalidatepage(struct extent_io_tree *tree, 4340 struct page *page, unsigned long offset) 4341 { 4342 struct extent_state *cached_state = NULL; 4343 u64 start = page_offset(page); 4344 u64 end = start + PAGE_SIZE - 1; 4345 size_t blocksize = page->mapping->host->i_sb->s_blocksize; 4346 4347 start += ALIGN(offset, blocksize); 4348 if (start > end) 4349 return 0; 4350 4351 lock_extent_bits(tree, start, end, &cached_state); 4352 wait_on_page_writeback(page); 4353 clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DELALLOC | 4354 EXTENT_DO_ACCOUNTING, 1, 1, &cached_state); 4355 return 0; 4356 } 4357 4358 /* 4359 * a helper for releasepage, this tests for areas of the page that 4360 * are locked or under IO and drops the related state bits if it is safe 4361 * to drop the page. 
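 * Returns 1 if the state bits could be cleared and the page may be released, 0 if the range is still locked or clearing the bits failed.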
4362 */ 4363 static int try_release_extent_state(struct extent_io_tree *tree, 4364 struct page *page, gfp_t mask) 4365 { 4366 u64 start = page_offset(page); 4367 u64 end = start + PAGE_SIZE - 1; 4368 int ret = 1; 4369 4370 if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) { 4371 ret = 0; 4372 } else { 4373 /* 4374 * at this point we can safely clear everything except the 4375 * locked bit and the nodatasum bit 4376 */ 4377 ret = __clear_extent_bit(tree, start, end, 4378 ~(EXTENT_LOCKED | EXTENT_NODATASUM), 4379 0, 0, NULL, mask, NULL); 4380 4381 /* if clear_extent_bit failed for enomem reasons, 4382 * we can't allow the release to continue. 4383 */ 4384 if (ret < 0) 4385 ret = 0; 4386 else 4387 ret = 1; 4388 } 4389 return ret; 4390 } 4391 4392 /* 4393 * a helper for releasepage. As long as there are no locked extents 4394 * in the range corresponding to the page, both state records and extent 4395 * map records are removed 4396 */ 4397 int try_release_extent_mapping(struct page *page, gfp_t mask) 4398 { 4399 struct extent_map *em; 4400 u64 start = page_offset(page); 4401 u64 end = start + PAGE_SIZE - 1; 4402 struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host); 4403 struct extent_io_tree *tree = &btrfs_inode->io_tree; 4404 struct extent_map_tree *map = &btrfs_inode->extent_tree; 4405 4406 if (gfpflags_allow_blocking(mask) && 4407 page->mapping->host->i_size > SZ_16M) { 4408 u64 len; 4409 while (start <= end) { 4410 len = end - start + 1; 4411 write_lock(&map->lock); 4412 em = lookup_extent_mapping(map, start, len); 4413 if (!em) { 4414 write_unlock(&map->lock); 4415 break; 4416 } 4417 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || 4418 em->start != start) { 4419 write_unlock(&map->lock); 4420 free_extent_map(em); 4421 break; 4422 } 4423 if (!test_range_bit(tree, em->start, 4424 extent_map_end(em) - 1, 4425 EXTENT_LOCKED, 0, NULL)) { 4426 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4427 &btrfs_inode->runtime_flags); 4428 remove_extent_mapping(map, em); 4429 /* once for the rb tree */ 4430 free_extent_map(em); 4431 } 4432 start = extent_map_end(em); 4433 write_unlock(&map->lock); 4434 4435 /* once for us */ 4436 free_extent_map(em); 4437 } 4438 } 4439 return try_release_extent_state(tree, page, mask); 4440 } 4441 4442 /* 4443 * helper function for fiemap, which doesn't want to see any holes. 4444 * This maps until we find something past 'last' 4445 */ 4446 static struct extent_map *get_extent_skip_holes(struct inode *inode, 4447 u64 offset, u64 last) 4448 { 4449 u64 sectorsize = btrfs_inode_sectorsize(inode); 4450 struct extent_map *em; 4451 u64 len; 4452 4453 if (offset >= last) 4454 return NULL; 4455 4456 while (1) { 4457 len = last - offset; 4458 if (len == 0) 4459 break; 4460 len = ALIGN(len, sectorsize); 4461 em = btrfs_get_extent_fiemap(BTRFS_I(inode), offset, len); 4462 if (IS_ERR_OR_NULL(em)) 4463 return em; 4464 4465 /* if this isn't a hole return it */ 4466 if (em->block_start != EXTENT_MAP_HOLE) 4467 return em; 4468 4469 /* this is a hole, advance to the next extent */ 4470 offset = extent_map_end(em); 4471 free_extent_map(em); 4472 if (offset >= last) 4473 break; 4474 } 4475 return NULL; 4476 } 4477 4478 /* 4479 * To cache previous fiemap extent 4480 * 4481 * Will be used for merging fiemap extent 4482 */ 4483 struct fiemap_cache { 4484 u64 offset; 4485 u64 phys; 4486 u64 len; 4487 u32 flags; 4488 bool cached; 4489 }; 4490 4491 /* 4492 * Helper to submit fiemap extent. 
4493 * 4494 * Will try to merge the current fiemap extent specified by @offset, @phys, 4495 * @len and @flags with the cached one. 4496 * Only when the merge fails is the cached extent submitted as a 4497 * fiemap extent. 4498 * 4499 * Return value is the same as fiemap_fill_next_extent(). 4500 */ 4501 static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, 4502 struct fiemap_cache *cache, 4503 u64 offset, u64 phys, u64 len, u32 flags) 4504 { 4505 int ret = 0; 4506 4507 if (!cache->cached) 4508 goto assign; 4509 4510 /* 4511 * Sanity check, extent_fiemap() should have ensured that the new 4512 * fiemap extent won't overlap with the cached one. 4513 * Not recoverable. 4514 * 4515 * NOTE: Physical address can overlap, due to compression 4516 */ 4517 if (cache->offset + cache->len > offset) { 4518 WARN_ON(1); 4519 return -EINVAL; 4520 } 4521 4522 /* 4523 * Only merges fiemap extents if 4524 * 1) Their logical addresses are contiguous 4525 * 4526 * 2) Their physical addresses are contiguous 4527 * So truly compressed (physical size smaller than logical size) 4528 * extents won't get merged with each other 4529 * 4530 * 3) Share same flags except FIEMAP_EXTENT_LAST 4531 * So a regular extent won't get merged with a prealloc extent 4532 */ 4533 if (cache->offset + cache->len == offset && 4534 cache->phys + cache->len == phys && 4535 (cache->flags & ~FIEMAP_EXTENT_LAST) == 4536 (flags & ~FIEMAP_EXTENT_LAST)) { 4537 cache->len += len; 4538 cache->flags |= flags; 4539 goto try_submit_last; 4540 } 4541 4542 /* Not mergeable, need to submit the cached one */ 4543 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, 4544 cache->len, cache->flags); 4545 cache->cached = false; 4546 if (ret) 4547 return ret; 4548 assign: 4549 cache->cached = true; 4550 cache->offset = offset; 4551 cache->phys = phys; 4552 cache->len = len; 4553 cache->flags = flags; 4554 try_submit_last: 4555 if (cache->flags & FIEMAP_EXTENT_LAST) { 4556 ret = fiemap_fill_next_extent(fieinfo, cache->offset, 4557 cache->phys, cache->len, cache->flags); 4558 cache->cached = false; 4559 } 4560 return ret; 4561 } 4562 4563 /* 4564 * Emit last fiemap cache 4565 * 4566 * The last fiemap cache may still be cached in the following case: 4567 * 0 4k 8k 4568 * |<- Fiemap range ->| 4569 * |<------------ First extent ----------->| 4570 * 4571 * In this case, the first extent range will be cached but not emitted. 4572 * So we must emit it before ending extent_fiemap().
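 * A positive return from fiemap_fill_next_extent() (user buffer full) is folded into 0 here, since the cached extent was the last thing left to report.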
4573 */ 4574 static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo, 4575 struct fiemap_cache *cache) 4576 { 4577 int ret; 4578 4579 if (!cache->cached) 4580 return 0; 4581 4582 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, 4583 cache->len, cache->flags); 4584 cache->cached = false; 4585 if (ret > 0) 4586 ret = 0; 4587 return ret; 4588 } 4589 4590 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 4591 __u64 start, __u64 len) 4592 { 4593 int ret = 0; 4594 u64 off = start; 4595 u64 max = start + len; 4596 u32 flags = 0; 4597 u32 found_type; 4598 u64 last; 4599 u64 last_for_get_extent = 0; 4600 u64 disko = 0; 4601 u64 isize = i_size_read(inode); 4602 struct btrfs_key found_key; 4603 struct extent_map *em = NULL; 4604 struct extent_state *cached_state = NULL; 4605 struct btrfs_path *path; 4606 struct btrfs_root *root = BTRFS_I(inode)->root; 4607 struct fiemap_cache cache = { 0 }; 4608 struct ulist *roots; 4609 struct ulist *tmp_ulist; 4610 int end = 0; 4611 u64 em_start = 0; 4612 u64 em_len = 0; 4613 u64 em_end = 0; 4614 4615 if (len == 0) 4616 return -EINVAL; 4617 4618 path = btrfs_alloc_path(); 4619 if (!path) 4620 return -ENOMEM; 4621 path->leave_spinning = 1; 4622 4623 roots = ulist_alloc(GFP_KERNEL); 4624 tmp_ulist = ulist_alloc(GFP_KERNEL); 4625 if (!roots || !tmp_ulist) { 4626 ret = -ENOMEM; 4627 goto out_free_ulist; 4628 } 4629 4630 start = round_down(start, btrfs_inode_sectorsize(inode)); 4631 len = round_up(max, btrfs_inode_sectorsize(inode)) - start; 4632 4633 /* 4634 * lookup the last file extent. We're not using i_size here 4635 * because there might be preallocation past i_size 4636 */ 4637 ret = btrfs_lookup_file_extent(NULL, root, path, 4638 btrfs_ino(BTRFS_I(inode)), -1, 0); 4639 if (ret < 0) { 4640 goto out_free_ulist; 4641 } else { 4642 WARN_ON(!ret); 4643 if (ret == 1) 4644 ret = 0; 4645 } 4646 4647 path->slots[0]--; 4648 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 4649 found_type = found_key.type; 4650 4651 /* No extents, but there might be delalloc bits */ 4652 if (found_key.objectid != btrfs_ino(BTRFS_I(inode)) || 4653 found_type != BTRFS_EXTENT_DATA_KEY) { 4654 /* have to trust i_size as the end */ 4655 last = (u64)-1; 4656 last_for_get_extent = isize; 4657 } else { 4658 /* 4659 * remember the start of the last extent. There are a 4660 * bunch of different factors that go into the length of the 4661 * extent, so its much less complex to remember where it started 4662 */ 4663 last = found_key.offset; 4664 last_for_get_extent = last + 1; 4665 } 4666 btrfs_release_path(path); 4667 4668 /* 4669 * we might have some extents allocated but more delalloc past those 4670 * extents. so, we trust isize unless the start of the last extent is 4671 * beyond isize 4672 */ 4673 if (last < isize) { 4674 last = (u64)-1; 4675 last_for_get_extent = isize; 4676 } 4677 4678 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 4679 &cached_state); 4680 4681 em = get_extent_skip_holes(inode, start, last_for_get_extent); 4682 if (!em) 4683 goto out; 4684 if (IS_ERR(em)) { 4685 ret = PTR_ERR(em); 4686 goto out; 4687 } 4688 4689 while (!end) { 4690 u64 offset_in_extent = 0; 4691 4692 /* break if the extent we found is outside the range */ 4693 if (em->start >= max || extent_map_end(em) < off) 4694 break; 4695 4696 /* 4697 * get_extent may return an extent that starts before our 4698 * requested range. 
We have to make sure the ranges 4699 * we return to fiemap always move forward and don't 4700 * overlap, so adjust the offsets here 4701 */ 4702 em_start = max(em->start, off); 4703 4704 /* 4705 * record the offset from the start of the extent 4706 * for adjusting the disk offset below. Only do this if the 4707 * extent isn't compressed since our in ram offset may be past 4708 * what we have actually allocated on disk. 4709 */ 4710 if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 4711 offset_in_extent = em_start - em->start; 4712 em_end = extent_map_end(em); 4713 em_len = em_end - em_start; 4714 flags = 0; 4715 if (em->block_start < EXTENT_MAP_LAST_BYTE) 4716 disko = em->block_start + offset_in_extent; 4717 else 4718 disko = 0; 4719 4720 /* 4721 * bump off for our next call to get_extent 4722 */ 4723 off = extent_map_end(em); 4724 if (off >= max) 4725 end = 1; 4726 4727 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 4728 end = 1; 4729 flags |= FIEMAP_EXTENT_LAST; 4730 } else if (em->block_start == EXTENT_MAP_INLINE) { 4731 flags |= (FIEMAP_EXTENT_DATA_INLINE | 4732 FIEMAP_EXTENT_NOT_ALIGNED); 4733 } else if (em->block_start == EXTENT_MAP_DELALLOC) { 4734 flags |= (FIEMAP_EXTENT_DELALLOC | 4735 FIEMAP_EXTENT_UNKNOWN); 4736 } else if (fieinfo->fi_extents_max) { 4737 u64 bytenr = em->block_start - 4738 (em->start - em->orig_start); 4739 4740 /* 4741 * As btrfs supports shared space, this information 4742 * can be exported to userspace tools via 4743 * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0 4744 * then we're just getting a count and we can skip the 4745 * lookup stuff. 4746 */ 4747 ret = btrfs_check_shared(root, 4748 btrfs_ino(BTRFS_I(inode)), 4749 bytenr, roots, tmp_ulist); 4750 if (ret < 0) 4751 goto out_free; 4752 if (ret) 4753 flags |= FIEMAP_EXTENT_SHARED; 4754 ret = 0; 4755 } 4756 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 4757 flags |= FIEMAP_EXTENT_ENCODED; 4758 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 4759 flags |= FIEMAP_EXTENT_UNWRITTEN; 4760 4761 free_extent_map(em); 4762 em = NULL; 4763 if ((em_start >= last) || em_len == (u64)-1 || 4764 (last == (u64)-1 && isize <= em_end)) { 4765 flags |= FIEMAP_EXTENT_LAST; 4766 end = 1; 4767 } 4768 4769 /* now scan forward to see if this is really the last extent. */ 4770 em = get_extent_skip_holes(inode, off, last_for_get_extent); 4771 if (IS_ERR(em)) { 4772 ret = PTR_ERR(em); 4773 goto out; 4774 } 4775 if (!em) { 4776 flags |= FIEMAP_EXTENT_LAST; 4777 end = 1; 4778 } 4779 ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko, 4780 em_len, flags); 4781 if (ret) { 4782 if (ret == 1) 4783 ret = 0; 4784 goto out_free; 4785 } 4786 } 4787 out_free: 4788 if (!ret) 4789 ret = emit_last_fiemap_cache(fieinfo, &cache); 4790 free_extent_map(em); 4791 out: 4792 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, 4793 &cached_state); 4794 4795 out_free_ulist: 4796 btrfs_free_path(path); 4797 ulist_free(roots); 4798 ulist_free(tmp_ulist); 4799 return ret; 4800 } 4801 4802 static void __free_extent_buffer(struct extent_buffer *eb) 4803 { 4804 btrfs_leak_debug_del(&eb->leak_list); 4805 kmem_cache_free(extent_buffer_cache, eb); 4806 } 4807 4808 int extent_buffer_under_io(struct extent_buffer *eb) 4809 { 4810 return (atomic_read(&eb->io_pages) || 4811 test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || 4812 test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 4813 } 4814 4815 /* 4816 * Release all pages attached to the extent buffer. 
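 * For mapped buffers the page->private reference is only dropped under mapping->private_lock, and only while the page still points at this eb, so we don't race with the page being attached to a newer eb.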
4817 */ 4818 static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) 4819 { 4820 int i; 4821 int num_pages; 4822 int mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); 4823 4824 BUG_ON(extent_buffer_under_io(eb)); 4825 4826 num_pages = num_extent_pages(eb); 4827 for (i = 0; i < num_pages; i++) { 4828 struct page *page = eb->pages[i]; 4829 4830 if (!page) 4831 continue; 4832 if (mapped) 4833 spin_lock(&page->mapping->private_lock); 4834 /* 4835 * We do this since we'll remove the pages after we've 4836 * removed the eb from the radix tree, so we could race 4837 * and have this page now attached to the new eb. So 4838 * only clear page_private if it's still connected to 4839 * this eb. 4840 */ 4841 if (PagePrivate(page) && 4842 page->private == (unsigned long)eb) { 4843 BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 4844 BUG_ON(PageDirty(page)); 4845 BUG_ON(PageWriteback(page)); 4846 /* 4847 * We need to make sure we haven't been attached 4848 * to a new eb. 4849 */ 4850 ClearPagePrivate(page); 4851 set_page_private(page, 0); 4852 /* One for the page private */ 4853 put_page(page); 4854 } 4855 4856 if (mapped) 4857 spin_unlock(&page->mapping->private_lock); 4858 4859 /* One for when we allocated the page */ 4860 put_page(page); 4861 } 4862 } 4863 4864 /* 4865 * Helper for releasing the extent buffer. 4866 */ 4867 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) 4868 { 4869 btrfs_release_extent_buffer_pages(eb); 4870 __free_extent_buffer(eb); 4871 } 4872 4873 static struct extent_buffer * 4874 __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, 4875 unsigned long len) 4876 { 4877 struct extent_buffer *eb = NULL; 4878 4879 eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL); 4880 eb->start = start; 4881 eb->len = len; 4882 eb->fs_info = fs_info; 4883 eb->bflags = 0; 4884 rwlock_init(&eb->lock); 4885 atomic_set(&eb->blocking_readers, 0); 4886 eb->blocking_writers = 0; 4887 eb->lock_nested = false; 4888 init_waitqueue_head(&eb->write_lock_wq); 4889 init_waitqueue_head(&eb->read_lock_wq); 4890 4891 btrfs_leak_debug_add(&eb->leak_list, &buffers); 4892 4893 spin_lock_init(&eb->refs_lock); 4894 atomic_set(&eb->refs, 1); 4895 atomic_set(&eb->io_pages, 0); 4896 4897 /* 4898 * Sanity checks, currently the maximum is 64k covered by 16x 4k pages 4899 */ 4900 BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE 4901 > MAX_INLINE_EXTENT_BUFFER_SIZE); 4902 BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE); 4903 4904 #ifdef CONFIG_BTRFS_DEBUG 4905 eb->spinning_writers = 0; 4906 atomic_set(&eb->spinning_readers, 0); 4907 atomic_set(&eb->read_locks, 0); 4908 eb->write_locks = 0; 4909 #endif 4910 4911 return eb; 4912 } 4913 4914 struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) 4915 { 4916 int i; 4917 struct page *p; 4918 struct extent_buffer *new; 4919 int num_pages = num_extent_pages(src); 4920 4921 new = __alloc_extent_buffer(src->fs_info, src->start, src->len); 4922 if (new == NULL) 4923 return NULL; 4924 4925 for (i = 0; i < num_pages; i++) { 4926 p = alloc_page(GFP_NOFS); 4927 if (!p) { 4928 btrfs_release_extent_buffer(new); 4929 return NULL; 4930 } 4931 attach_extent_buffer_page(new, p); 4932 WARN_ON(PageDirty(p)); 4933 SetPageUptodate(p); 4934 new->pages[i] = p; 4935 copy_page(page_address(p), page_address(src->pages[i])); 4936 } 4937 4938 set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags); 4939 set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); 4940 4941 return new; 4942 } 4943 4944 struct extent_buffer
*__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, 4945 u64 start, unsigned long len) 4946 { 4947 struct extent_buffer *eb; 4948 int num_pages; 4949 int i; 4950 4951 eb = __alloc_extent_buffer(fs_info, start, len); 4952 if (!eb) 4953 return NULL; 4954 4955 num_pages = num_extent_pages(eb); 4956 for (i = 0; i < num_pages; i++) { 4957 eb->pages[i] = alloc_page(GFP_NOFS); 4958 if (!eb->pages[i]) 4959 goto err; 4960 } 4961 set_extent_buffer_uptodate(eb); 4962 btrfs_set_header_nritems(eb, 0); 4963 set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); 4964 4965 return eb; 4966 err: 4967 for (; i > 0; i--) 4968 __free_page(eb->pages[i - 1]); 4969 __free_extent_buffer(eb); 4970 return NULL; 4971 } 4972 4973 struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, 4974 u64 start) 4975 { 4976 return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize); 4977 } 4978 4979 static void check_buffer_tree_ref(struct extent_buffer *eb) 4980 { 4981 int refs; 4982 /* the ref bit is tricky. We have to make sure it is set 4983 * if we have the buffer dirty. Otherwise the 4984 * code to free a buffer can end up dropping a dirty 4985 * page 4986 * 4987 * Once the ref bit is set, it won't go away while the 4988 * buffer is dirty or in writeback, and it also won't 4989 * go away while we have the reference count on the 4990 * eb bumped. 4991 * 4992 * We can't just set the ref bit without bumping the 4993 * ref on the eb because free_extent_buffer might 4994 * see the ref bit and try to clear it. If this happens 4995 * free_extent_buffer might end up dropping our original 4996 * ref by mistake and freeing the page before we are able 4997 * to add one more ref. 4998 * 4999 * So bump the ref count first, then set the bit. If someone 5000 * beat us to it, drop the ref we added. 5001 */ 5002 refs = atomic_read(&eb->refs); 5003 if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 5004 return; 5005 5006 spin_lock(&eb->refs_lock); 5007 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 5008 atomic_inc(&eb->refs); 5009 spin_unlock(&eb->refs_lock); 5010 } 5011 5012 static void mark_extent_buffer_accessed(struct extent_buffer *eb, 5013 struct page *accessed) 5014 { 5015 int num_pages, i; 5016 5017 check_buffer_tree_ref(eb); 5018 5019 num_pages = num_extent_pages(eb); 5020 for (i = 0; i < num_pages; i++) { 5021 struct page *p = eb->pages[i]; 5022 5023 if (p != accessed) 5024 mark_page_accessed(p); 5025 } 5026 } 5027 5028 struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, 5029 u64 start) 5030 { 5031 struct extent_buffer *eb; 5032 5033 rcu_read_lock(); 5034 eb = radix_tree_lookup(&fs_info->buffer_radix, 5035 start >> PAGE_SHIFT); 5036 if (eb && atomic_inc_not_zero(&eb->refs)) { 5037 rcu_read_unlock(); 5038 /* 5039 * Lock our eb's refs_lock to avoid races with 5040 * free_extent_buffer. When we get our eb it might be flagged 5041 * with EXTENT_BUFFER_STALE and another task running 5042 * free_extent_buffer might have seen that flag set, 5043 * eb->refs == 2, that the buffer isn't under IO (dirty and 5044 * writeback flags not set) and it's still in the tree (flag 5045 * EXTENT_BUFFER_TREE_REF set), therefore being in the process 5046 * of decrementing the extent buffer's reference count twice. 
5047 * So here we could race and increment the eb's reference count, 5048 * clear its stale flag, mark it as dirty and drop our reference 5049 * before the other task finishes executing free_extent_buffer, 5050 * which would later result in an attempt to free an extent 5051 * buffer that is dirty. 5052 */ 5053 if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) { 5054 spin_lock(&eb->refs_lock); 5055 spin_unlock(&eb->refs_lock); 5056 } 5057 mark_extent_buffer_accessed(eb, NULL); 5058 return eb; 5059 } 5060 rcu_read_unlock(); 5061 5062 return NULL; 5063 } 5064 5065 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 5066 struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, 5067 u64 start) 5068 { 5069 struct extent_buffer *eb, *exists = NULL; 5070 int ret; 5071 5072 eb = find_extent_buffer(fs_info, start); 5073 if (eb) 5074 return eb; 5075 eb = alloc_dummy_extent_buffer(fs_info, start); 5076 if (!eb) 5077 return ERR_PTR(-ENOMEM); 5078 eb->fs_info = fs_info; 5079 again: 5080 ret = radix_tree_preload(GFP_NOFS); 5081 if (ret) { 5082 exists = ERR_PTR(ret); 5083 goto free_eb; 5084 } 5085 spin_lock(&fs_info->buffer_lock); 5086 ret = radix_tree_insert(&fs_info->buffer_radix, 5087 start >> PAGE_SHIFT, eb); 5088 spin_unlock(&fs_info->buffer_lock); 5089 radix_tree_preload_end(); 5090 if (ret == -EEXIST) { 5091 exists = find_extent_buffer(fs_info, start); 5092 if (exists) 5093 goto free_eb; 5094 else 5095 goto again; 5096 } 5097 check_buffer_tree_ref(eb); 5098 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); 5099 5100 return eb; 5101 free_eb: 5102 btrfs_release_extent_buffer(eb); 5103 return exists; 5104 } 5105 #endif 5106 5107 struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, 5108 u64 start) 5109 { 5110 unsigned long len = fs_info->nodesize; 5111 int num_pages; 5112 int i; 5113 unsigned long index = start >> PAGE_SHIFT; 5114 struct extent_buffer *eb; 5115 struct extent_buffer *exists = NULL; 5116 struct page *p; 5117 struct address_space *mapping = fs_info->btree_inode->i_mapping; 5118 int uptodate = 1; 5119 int ret; 5120 5121 if (!IS_ALIGNED(start, fs_info->sectorsize)) { 5122 btrfs_err(fs_info, "bad tree block start %llu", start); 5123 return ERR_PTR(-EINVAL); 5124 } 5125 5126 eb = find_extent_buffer(fs_info, start); 5127 if (eb) 5128 return eb; 5129 5130 eb = __alloc_extent_buffer(fs_info, start, len); 5131 if (!eb) 5132 return ERR_PTR(-ENOMEM); 5133 5134 num_pages = num_extent_pages(eb); 5135 for (i = 0; i < num_pages; i++, index++) { 5136 p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL); 5137 if (!p) { 5138 exists = ERR_PTR(-ENOMEM); 5139 goto free_eb; 5140 } 5141 5142 spin_lock(&mapping->private_lock); 5143 if (PagePrivate(p)) { 5144 /* 5145 * We could have already allocated an eb for this page 5146 * and attached one so lets see if we can get a ref on 5147 * the existing eb, and if we can we know it's good and 5148 * we can just return that one, else we know we can just 5149 * overwrite page->private. 5150 */ 5151 exists = (struct extent_buffer *)p->private; 5152 if (atomic_inc_not_zero(&exists->refs)) { 5153 spin_unlock(&mapping->private_lock); 5154 unlock_page(p); 5155 put_page(p); 5156 mark_extent_buffer_accessed(exists, p); 5157 goto free_eb; 5158 } 5159 exists = NULL; 5160 5161 /* 5162 * Do this so attach doesn't complain and we need to 5163 * drop the ref the old guy had. 
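 * The page is still pinned by the reference find_or_create_page() took above, so the put_page() below only drops the old eb's private reference.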
5164 */ 5165 ClearPagePrivate(p); 5166 WARN_ON(PageDirty(p)); 5167 put_page(p); 5168 } 5169 attach_extent_buffer_page(eb, p); 5170 spin_unlock(&mapping->private_lock); 5171 WARN_ON(PageDirty(p)); 5172 eb->pages[i] = p; 5173 if (!PageUptodate(p)) 5174 uptodate = 0; 5175 5176 /* 5177 * We can't unlock the pages just yet since the extent buffer 5178 * hasn't been properly inserted in the radix tree, this 5179 * opens a race with btree_releasepage which can free a page 5180 * while we are still filling in all pages for the buffer and 5181 * we could crash. 5182 */ 5183 } 5184 if (uptodate) 5185 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 5186 again: 5187 ret = radix_tree_preload(GFP_NOFS); 5188 if (ret) { 5189 exists = ERR_PTR(ret); 5190 goto free_eb; 5191 } 5192 5193 spin_lock(&fs_info->buffer_lock); 5194 ret = radix_tree_insert(&fs_info->buffer_radix, 5195 start >> PAGE_SHIFT, eb); 5196 spin_unlock(&fs_info->buffer_lock); 5197 radix_tree_preload_end(); 5198 if (ret == -EEXIST) { 5199 exists = find_extent_buffer(fs_info, start); 5200 if (exists) 5201 goto free_eb; 5202 else 5203 goto again; 5204 } 5205 /* add one reference for the tree */ 5206 check_buffer_tree_ref(eb); 5207 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); 5208 5209 /* 5210 * Now it's safe to unlock the pages because any calls to 5211 * btree_releasepage will correctly detect that a page belongs to a 5212 * live buffer and won't free them prematurely. 5213 */ 5214 for (i = 0; i < num_pages; i++) 5215 unlock_page(eb->pages[i]); 5216 return eb; 5217 5218 free_eb: 5219 WARN_ON(!atomic_dec_and_test(&eb->refs)); 5220 for (i = 0; i < num_pages; i++) { 5221 if (eb->pages[i]) 5222 unlock_page(eb->pages[i]); 5223 } 5224 5225 btrfs_release_extent_buffer(eb); 5226 return exists; 5227 } 5228 5229 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) 5230 { 5231 struct extent_buffer *eb = 5232 container_of(head, struct extent_buffer, rcu_head); 5233 5234 __free_extent_buffer(eb); 5235 } 5236 5237 static int release_extent_buffer(struct extent_buffer *eb) 5238 { 5239 lockdep_assert_held(&eb->refs_lock); 5240 5241 WARN_ON(atomic_read(&eb->refs) == 0); 5242 if (atomic_dec_and_test(&eb->refs)) { 5243 if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) { 5244 struct btrfs_fs_info *fs_info = eb->fs_info; 5245 5246 spin_unlock(&eb->refs_lock); 5247 5248 spin_lock(&fs_info->buffer_lock); 5249 radix_tree_delete(&fs_info->buffer_radix, 5250 eb->start >> PAGE_SHIFT); 5251 spin_unlock(&fs_info->buffer_lock); 5252 } else { 5253 spin_unlock(&eb->refs_lock); 5254 } 5255 5256 /* Should be safe to release our pages at this point */ 5257 btrfs_release_extent_buffer_pages(eb); 5258 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 5259 if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) { 5260 __free_extent_buffer(eb); 5261 return 1; 5262 } 5263 #endif 5264 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); 5265 return 1; 5266 } 5267 spin_unlock(&eb->refs_lock); 5268 5269 return 0; 5270 } 5271 5272 void free_extent_buffer(struct extent_buffer *eb) 5273 { 5274 int refs; 5275 int old; 5276 if (!eb) 5277 return; 5278 5279 while (1) { 5280 refs = atomic_read(&eb->refs); 5281 if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3) 5282 || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && 5283 refs == 1)) 5284 break; 5285 old = atomic_cmpxchg(&eb->refs, refs, refs - 1); 5286 if (old == refs) 5287 return; 5288 } 5289 5290 spin_lock(&eb->refs_lock); 5291 if (atomic_read(&eb->refs) == 2 && 5292 test_bit(EXTENT_BUFFER_STALE, 
&eb->bflags) && 5293 !extent_buffer_under_io(eb) && 5294 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 5295 atomic_dec(&eb->refs); 5296 5297 /* 5298 * I know this is terrible, but it's temporary until we stop tracking 5299 * the uptodate bits and such for the extent buffers. 5300 */ 5301 release_extent_buffer(eb); 5302 } 5303 5304 void free_extent_buffer_stale(struct extent_buffer *eb) 5305 { 5306 if (!eb) 5307 return; 5308 5309 spin_lock(&eb->refs_lock); 5310 set_bit(EXTENT_BUFFER_STALE, &eb->bflags); 5311 5312 if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) && 5313 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 5314 atomic_dec(&eb->refs); 5315 release_extent_buffer(eb); 5316 } 5317 5318 void clear_extent_buffer_dirty(struct extent_buffer *eb) 5319 { 5320 int i; 5321 int num_pages; 5322 struct page *page; 5323 5324 num_pages = num_extent_pages(eb); 5325 5326 for (i = 0; i < num_pages; i++) { 5327 page = eb->pages[i]; 5328 if (!PageDirty(page)) 5329 continue; 5330 5331 lock_page(page); 5332 WARN_ON(!PagePrivate(page)); 5333 5334 clear_page_dirty_for_io(page); 5335 xa_lock_irq(&page->mapping->i_pages); 5336 if (!PageDirty(page)) 5337 __xa_clear_mark(&page->mapping->i_pages, 5338 page_index(page), PAGECACHE_TAG_DIRTY); 5339 xa_unlock_irq(&page->mapping->i_pages); 5340 ClearPageError(page); 5341 unlock_page(page); 5342 } 5343 WARN_ON(atomic_read(&eb->refs) == 0); 5344 } 5345 5346 bool set_extent_buffer_dirty(struct extent_buffer *eb) 5347 { 5348 int i; 5349 int num_pages; 5350 bool was_dirty; 5351 5352 check_buffer_tree_ref(eb); 5353 5354 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); 5355 5356 num_pages = num_extent_pages(eb); 5357 WARN_ON(atomic_read(&eb->refs) == 0); 5358 WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); 5359 5360 if (!was_dirty) 5361 for (i = 0; i < num_pages; i++) 5362 set_page_dirty(eb->pages[i]); 5363 5364 #ifdef CONFIG_BTRFS_DEBUG 5365 for (i = 0; i < num_pages; i++) 5366 ASSERT(PageDirty(eb->pages[i])); 5367 #endif 5368 5369 return was_dirty; 5370 } 5371 5372 void clear_extent_buffer_uptodate(struct extent_buffer *eb) 5373 { 5374 int i; 5375 struct page *page; 5376 int num_pages; 5377 5378 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 5379 num_pages = num_extent_pages(eb); 5380 for (i = 0; i < num_pages; i++) { 5381 page = eb->pages[i]; 5382 if (page) 5383 ClearPageUptodate(page); 5384 } 5385 } 5386 5387 void set_extent_buffer_uptodate(struct extent_buffer *eb) 5388 { 5389 int i; 5390 struct page *page; 5391 int num_pages; 5392 5393 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 5394 num_pages = num_extent_pages(eb); 5395 for (i = 0; i < num_pages; i++) { 5396 page = eb->pages[i]; 5397 SetPageUptodate(page); 5398 } 5399 } 5400 5401 int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) 5402 { 5403 int i; 5404 struct page *page; 5405 int err; 5406 int ret = 0; 5407 int locked_pages = 0; 5408 int all_uptodate = 1; 5409 int num_pages; 5410 unsigned long num_reads = 0; 5411 struct bio *bio = NULL; 5412 unsigned long bio_flags = 0; 5413 struct extent_io_tree *tree = &BTRFS_I(eb->fs_info->btree_inode)->io_tree; 5414 5415 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 5416 return 0; 5417 5418 num_pages = num_extent_pages(eb); 5419 for (i = 0; i < num_pages; i++) { 5420 page = eb->pages[i]; 5421 if (wait == WAIT_NONE) { 5422 if (!trylock_page(page)) 5423 goto unlock_exit; 5424 } else { 5425 lock_page(page); 5426 } 5427 locked_pages++; 5428 } 5429 /* 5430 * We need to firstly lock all 
pages to make sure that 5431 * the uptodate bit of our pages won't be affected by 5432 * clear_extent_buffer_uptodate(). 5433 */ 5434 for (i = 0; i < num_pages; i++) { 5435 page = eb->pages[i]; 5436 if (!PageUptodate(page)) { 5437 num_reads++; 5438 all_uptodate = 0; 5439 } 5440 } 5441 5442 if (all_uptodate) { 5443 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 5444 goto unlock_exit; 5445 } 5446 5447 clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); 5448 eb->read_mirror = 0; 5449 atomic_set(&eb->io_pages, num_reads); 5450 for (i = 0; i < num_pages; i++) { 5451 page = eb->pages[i]; 5452 5453 if (!PageUptodate(page)) { 5454 if (ret) { 5455 atomic_dec(&eb->io_pages); 5456 unlock_page(page); 5457 continue; 5458 } 5459 5460 ClearPageError(page); 5461 err = __extent_read_full_page(tree, page, 5462 btree_get_extent, &bio, 5463 mirror_num, &bio_flags, 5464 REQ_META); 5465 if (err) { 5466 ret = err; 5467 /* 5468 * We use &bio in above __extent_read_full_page, 5469 * so we ensure that if it returns error, the 5470 * current page fails to add itself to bio and 5471 * it's been unlocked. 5472 * 5473 * We must dec io_pages by ourselves. 5474 */ 5475 atomic_dec(&eb->io_pages); 5476 } 5477 } else { 5478 unlock_page(page); 5479 } 5480 } 5481 5482 if (bio) { 5483 err = submit_one_bio(bio, mirror_num, bio_flags); 5484 if (err) 5485 return err; 5486 } 5487 5488 if (ret || wait != WAIT_COMPLETE) 5489 return ret; 5490 5491 for (i = 0; i < num_pages; i++) { 5492 page = eb->pages[i]; 5493 wait_on_page_locked(page); 5494 if (!PageUptodate(page)) 5495 ret = -EIO; 5496 } 5497 5498 return ret; 5499 5500 unlock_exit: 5501 while (locked_pages > 0) { 5502 locked_pages--; 5503 page = eb->pages[locked_pages]; 5504 unlock_page(page); 5505 } 5506 return ret; 5507 } 5508 5509 void read_extent_buffer(const struct extent_buffer *eb, void *dstv, 5510 unsigned long start, unsigned long len) 5511 { 5512 size_t cur; 5513 size_t offset; 5514 struct page *page; 5515 char *kaddr; 5516 char *dst = (char *)dstv; 5517 size_t start_offset = offset_in_page(eb->start); 5518 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5519 5520 if (start + len > eb->len) { 5521 WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n", 5522 eb->start, eb->len, start, len); 5523 memset(dst, 0, len); 5524 return; 5525 } 5526 5527 offset = offset_in_page(start_offset + start); 5528 5529 while (len > 0) { 5530 page = eb->pages[i]; 5531 5532 cur = min(len, (PAGE_SIZE - offset)); 5533 kaddr = page_address(page); 5534 memcpy(dst, kaddr + offset, cur); 5535 5536 dst += cur; 5537 len -= cur; 5538 offset = 0; 5539 i++; 5540 } 5541 } 5542 5543 int read_extent_buffer_to_user(const struct extent_buffer *eb, 5544 void __user *dstv, 5545 unsigned long start, unsigned long len) 5546 { 5547 size_t cur; 5548 size_t offset; 5549 struct page *page; 5550 char *kaddr; 5551 char __user *dst = (char __user *)dstv; 5552 size_t start_offset = offset_in_page(eb->start); 5553 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5554 int ret = 0; 5555 5556 WARN_ON(start > eb->len); 5557 WARN_ON(start + len > eb->start + eb->len); 5558 5559 offset = offset_in_page(start_offset + start); 5560 5561 while (len > 0) { 5562 page = eb->pages[i]; 5563 5564 cur = min(len, (PAGE_SIZE - offset)); 5565 kaddr = page_address(page); 5566 if (copy_to_user(dst, kaddr + offset, cur)) { 5567 ret = -EFAULT; 5568 break; 5569 } 5570 5571 dst += cur; 5572 len -= cur; 5573 offset = 0; 5574 i++; 5575 } 5576 5577 return ret; 5578 } 5579 5580 /* 5581 * return 0 if the item is found 
within a page. 5582 * return 1 if the item spans two pages. 5583 * return -EINVAL otherwise. 5584 */ 5585 int map_private_extent_buffer(const struct extent_buffer *eb, 5586 unsigned long start, unsigned long min_len, 5587 char **map, unsigned long *map_start, 5588 unsigned long *map_len) 5589 { 5590 size_t offset; 5591 char *kaddr; 5592 struct page *p; 5593 size_t start_offset = offset_in_page(eb->start); 5594 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5595 unsigned long end_i = (start_offset + start + min_len - 1) >> 5596 PAGE_SHIFT; 5597 5598 if (start + min_len > eb->len) { 5599 WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n", 5600 eb->start, eb->len, start, min_len); 5601 return -EINVAL; 5602 } 5603 5604 if (i != end_i) 5605 return 1; 5606 5607 if (i == 0) { 5608 offset = start_offset; 5609 *map_start = 0; 5610 } else { 5611 offset = 0; 5612 *map_start = ((u64)i << PAGE_SHIFT) - start_offset; 5613 } 5614 5615 p = eb->pages[i]; 5616 kaddr = page_address(p); 5617 *map = kaddr + offset; 5618 *map_len = PAGE_SIZE - offset; 5619 return 0; 5620 } 5621 5622 int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, 5623 unsigned long start, unsigned long len) 5624 { 5625 size_t cur; 5626 size_t offset; 5627 struct page *page; 5628 char *kaddr; 5629 char *ptr = (char *)ptrv; 5630 size_t start_offset = offset_in_page(eb->start); 5631 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5632 int ret = 0; 5633 5634 WARN_ON(start > eb->len); 5635 WARN_ON(start + len > eb->start + eb->len); 5636 5637 offset = offset_in_page(start_offset + start); 5638 5639 while (len > 0) { 5640 page = eb->pages[i]; 5641 5642 cur = min(len, (PAGE_SIZE - offset)); 5643 5644 kaddr = page_address(page); 5645 ret = memcmp(ptr, kaddr + offset, cur); 5646 if (ret) 5647 break; 5648 5649 ptr += cur; 5650 len -= cur; 5651 offset = 0; 5652 i++; 5653 } 5654 return ret; 5655 } 5656 5657 void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb, 5658 const void *srcv) 5659 { 5660 char *kaddr; 5661 5662 WARN_ON(!PageUptodate(eb->pages[0])); 5663 kaddr = page_address(eb->pages[0]); 5664 memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv, 5665 BTRFS_FSID_SIZE); 5666 } 5667 5668 void write_extent_buffer_fsid(struct extent_buffer *eb, const void *srcv) 5669 { 5670 char *kaddr; 5671 5672 WARN_ON(!PageUptodate(eb->pages[0])); 5673 kaddr = page_address(eb->pages[0]); 5674 memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv, 5675 BTRFS_FSID_SIZE); 5676 } 5677 5678 void write_extent_buffer(struct extent_buffer *eb, const void *srcv, 5679 unsigned long start, unsigned long len) 5680 { 5681 size_t cur; 5682 size_t offset; 5683 struct page *page; 5684 char *kaddr; 5685 char *src = (char *)srcv; 5686 size_t start_offset = offset_in_page(eb->start); 5687 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5688 5689 WARN_ON(start > eb->len); 5690 WARN_ON(start + len > eb->start + eb->len); 5691 5692 offset = offset_in_page(start_offset + start); 5693 5694 while (len > 0) { 5695 page = eb->pages[i]; 5696 WARN_ON(!PageUptodate(page)); 5697 5698 cur = min(len, PAGE_SIZE - offset); 5699 kaddr = page_address(page); 5700 memcpy(kaddr + offset, src, cur); 5701 5702 src += cur; 5703 len -= cur; 5704 offset = 0; 5705 i++; 5706 } 5707 } 5708 5709 void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start, 5710 unsigned long len) 5711 { 5712 size_t cur; 5713 size_t offset; 5714 struct page *page; 5715 char *kaddr; 5716 size_t start_offset = 
offset_in_page(eb->start); 5717 unsigned long i = (start_offset + start) >> PAGE_SHIFT; 5718 5719 WARN_ON(start > eb->len); 5720 WARN_ON(start + len > eb->start + eb->len); 5721 5722 offset = offset_in_page(start_offset + start); 5723 5724 while (len > 0) { 5725 page = eb->pages[i]; 5726 WARN_ON(!PageUptodate(page)); 5727 5728 cur = min(len, PAGE_SIZE - offset); 5729 kaddr = page_address(page); 5730 memset(kaddr + offset, 0, cur); 5731 5732 len -= cur; 5733 offset = 0; 5734 i++; 5735 } 5736 } 5737 5738 void copy_extent_buffer_full(struct extent_buffer *dst, 5739 struct extent_buffer *src) 5740 { 5741 int i; 5742 int num_pages; 5743 5744 ASSERT(dst->len == src->len); 5745 5746 num_pages = num_extent_pages(dst); 5747 for (i = 0; i < num_pages; i++) 5748 copy_page(page_address(dst->pages[i]), 5749 page_address(src->pages[i])); 5750 } 5751 5752 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, 5753 unsigned long dst_offset, unsigned long src_offset, 5754 unsigned long len) 5755 { 5756 u64 dst_len = dst->len; 5757 size_t cur; 5758 size_t offset; 5759 struct page *page; 5760 char *kaddr; 5761 size_t start_offset = offset_in_page(dst->start); 5762 unsigned long i = (start_offset + dst_offset) >> PAGE_SHIFT; 5763 5764 WARN_ON(src->len != dst_len); 5765 5766 offset = offset_in_page(start_offset + dst_offset); 5767 5768 while (len > 0) { 5769 page = dst->pages[i]; 5770 WARN_ON(!PageUptodate(page)); 5771 5772 cur = min(len, (unsigned long)(PAGE_SIZE - offset)); 5773 5774 kaddr = page_address(page); 5775 read_extent_buffer(src, kaddr + offset, src_offset, cur); 5776 5777 src_offset += cur; 5778 len -= cur; 5779 offset = 0; 5780 i++; 5781 } 5782 } 5783 5784 /* 5785 * eb_bitmap_offset() - calculate the page and offset of the byte containing the 5786 * given bit number 5787 * @eb: the extent buffer 5788 * @start: offset of the bitmap item in the extent buffer 5789 * @nr: bit number 5790 * @page_index: return index of the page in the extent buffer that contains the 5791 * given bit number 5792 * @page_offset: return offset into the page given by page_index 5793 * 5794 * This helper hides the ugliness of finding the byte in an extent buffer which 5795 * contains a given bit. 5796 */ 5797 static inline void eb_bitmap_offset(struct extent_buffer *eb, 5798 unsigned long start, unsigned long nr, 5799 unsigned long *page_index, 5800 size_t *page_offset) 5801 { 5802 size_t start_offset = offset_in_page(eb->start); 5803 size_t byte_offset = BIT_BYTE(nr); 5804 size_t offset; 5805 5806 /* 5807 * The byte we want is the offset of the extent buffer + the offset of 5808 * the bitmap item in the extent buffer + the offset of the byte in the 5809 * bitmap item. 
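 * Worked example (illustrative numbers only): with start_offset == 1024, a bitmap item at start == 100 and nr == 20, byte_offset is 20 / 8 = 2, so offset is 1024 + 100 + 2 = 1126, i.e. page_index 0 and page_offset 1126 with 4K pages.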
5810 */ 5811 offset = start_offset + start + byte_offset; 5812 5813 *page_index = offset >> PAGE_SHIFT; 5814 *page_offset = offset_in_page(offset); 5815 } 5816 5817 /** 5818 * extent_buffer_test_bit - determine whether a bit in a bitmap item is set 5819 * @eb: the extent buffer 5820 * @start: offset of the bitmap item in the extent buffer 5821 * @nr: bit number to test 5822 */ 5823 int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start, 5824 unsigned long nr) 5825 { 5826 u8 *kaddr; 5827 struct page *page; 5828 unsigned long i; 5829 size_t offset; 5830 5831 eb_bitmap_offset(eb, start, nr, &i, &offset); 5832 page = eb->pages[i]; 5833 WARN_ON(!PageUptodate(page)); 5834 kaddr = page_address(page); 5835 return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); 5836 } 5837 5838 /** 5839 * extent_buffer_bitmap_set - set an area of a bitmap 5840 * @eb: the extent buffer 5841 * @start: offset of the bitmap item in the extent buffer 5842 * @pos: bit number of the first bit 5843 * @len: number of bits to set 5844 */ 5845 void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, 5846 unsigned long pos, unsigned long len) 5847 { 5848 u8 *kaddr; 5849 struct page *page; 5850 unsigned long i; 5851 size_t offset; 5852 const unsigned int size = pos + len; 5853 int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE); 5854 u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos); 5855 5856 eb_bitmap_offset(eb, start, pos, &i, &offset); 5857 page = eb->pages[i]; 5858 WARN_ON(!PageUptodate(page)); 5859 kaddr = page_address(page); 5860 5861 while (len >= bits_to_set) { 5862 kaddr[offset] |= mask_to_set; 5863 len -= bits_to_set; 5864 bits_to_set = BITS_PER_BYTE; 5865 mask_to_set = ~0; 5866 if (++offset >= PAGE_SIZE && len > 0) { 5867 offset = 0; 5868 page = eb->pages[++i]; 5869 WARN_ON(!PageUptodate(page)); 5870 kaddr = page_address(page); 5871 } 5872 } 5873 if (len) { 5874 mask_to_set &= BITMAP_LAST_BYTE_MASK(size); 5875 kaddr[offset] |= mask_to_set; 5876 } 5877 } 5878 5879 5880 /** 5881 * extent_buffer_bitmap_clear - clear an area of a bitmap 5882 * @eb: the extent buffer 5883 * @start: offset of the bitmap item in the extent buffer 5884 * @pos: bit number of the first bit 5885 * @len: number of bits to clear 5886 */ 5887 void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start, 5888 unsigned long pos, unsigned long len) 5889 { 5890 u8 *kaddr; 5891 struct page *page; 5892 unsigned long i; 5893 size_t offset; 5894 const unsigned int size = pos + len; 5895 int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE); 5896 u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos); 5897 5898 eb_bitmap_offset(eb, start, pos, &i, &offset); 5899 page = eb->pages[i]; 5900 WARN_ON(!PageUptodate(page)); 5901 kaddr = page_address(page); 5902 5903 while (len >= bits_to_clear) { 5904 kaddr[offset] &= ~mask_to_clear; 5905 len -= bits_to_clear; 5906 bits_to_clear = BITS_PER_BYTE; 5907 mask_to_clear = ~0; 5908 if (++offset >= PAGE_SIZE && len > 0) { 5909 offset = 0; 5910 page = eb->pages[++i]; 5911 WARN_ON(!PageUptodate(page)); 5912 kaddr = page_address(page); 5913 } 5914 } 5915 if (len) { 5916 mask_to_clear &= BITMAP_LAST_BYTE_MASK(size); 5917 kaddr[offset] &= ~mask_to_clear; 5918 } 5919 } 5920 5921 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) 5922 { 5923 unsigned long distance = (src > dst) ? 
src - dst : dst - src; 5924 return distance < len; 5925 } 5926 5927 static void copy_pages(struct page *dst_page, struct page *src_page, 5928 unsigned long dst_off, unsigned long src_off, 5929 unsigned long len) 5930 { 5931 char *dst_kaddr = page_address(dst_page); 5932 char *src_kaddr; 5933 int must_memmove = 0; 5934 5935 if (dst_page != src_page) { 5936 src_kaddr = page_address(src_page); 5937 } else { 5938 src_kaddr = dst_kaddr; 5939 if (areas_overlap(src_off, dst_off, len)) 5940 must_memmove = 1; 5941 } 5942 5943 if (must_memmove) 5944 memmove(dst_kaddr + dst_off, src_kaddr + src_off, len); 5945 else 5946 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); 5947 } 5948 5949 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 5950 unsigned long src_offset, unsigned long len) 5951 { 5952 struct btrfs_fs_info *fs_info = dst->fs_info; 5953 size_t cur; 5954 size_t dst_off_in_page; 5955 size_t src_off_in_page; 5956 size_t start_offset = offset_in_page(dst->start); 5957 unsigned long dst_i; 5958 unsigned long src_i; 5959 5960 if (src_offset + len > dst->len) { 5961 btrfs_err(fs_info, 5962 "memmove bogus src_offset %lu move len %lu dst len %lu", 5963 src_offset, len, dst->len); 5964 BUG(); 5965 } 5966 if (dst_offset + len > dst->len) { 5967 btrfs_err(fs_info, 5968 "memmove bogus dst_offset %lu move len %lu dst len %lu", 5969 dst_offset, len, dst->len); 5970 BUG(); 5971 } 5972 5973 while (len > 0) { 5974 dst_off_in_page = offset_in_page(start_offset + dst_offset); 5975 src_off_in_page = offset_in_page(start_offset + src_offset); 5976 5977 dst_i = (start_offset + dst_offset) >> PAGE_SHIFT; 5978 src_i = (start_offset + src_offset) >> PAGE_SHIFT; 5979 5980 cur = min(len, (unsigned long)(PAGE_SIZE - 5981 src_off_in_page)); 5982 cur = min_t(unsigned long, cur, 5983 (unsigned long)(PAGE_SIZE - dst_off_in_page)); 5984 5985 copy_pages(dst->pages[dst_i], dst->pages[src_i], 5986 dst_off_in_page, src_off_in_page, cur); 5987 5988 src_offset += cur; 5989 dst_offset += cur; 5990 len -= cur; 5991 } 5992 } 5993 5994 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 5995 unsigned long src_offset, unsigned long len) 5996 { 5997 struct btrfs_fs_info *fs_info = dst->fs_info; 5998 size_t cur; 5999 size_t dst_off_in_page; 6000 size_t src_off_in_page; 6001 unsigned long dst_end = dst_offset + len - 1; 6002 unsigned long src_end = src_offset + len - 1; 6003 size_t start_offset = offset_in_page(dst->start); 6004 unsigned long dst_i; 6005 unsigned long src_i; 6006 6007 if (src_offset + len > dst->len) { 6008 btrfs_err(fs_info, 6009 "memmove bogus src_offset %lu move len %lu len %lu", 6010 src_offset, len, dst->len); 6011 BUG(); 6012 } 6013 if (dst_offset + len > dst->len) { 6014 btrfs_err(fs_info, 6015 "memmove bogus dst_offset %lu move len %lu len %lu", 6016 dst_offset, len, dst->len); 6017 BUG(); 6018 } 6019 if (dst_offset < src_offset) { 6020 memcpy_extent_buffer(dst, dst_offset, src_offset, len); 6021 return; 6022 } 6023 while (len > 0) { 6024 dst_i = (start_offset + dst_end) >> PAGE_SHIFT; 6025 src_i = (start_offset + src_end) >> PAGE_SHIFT; 6026 6027 dst_off_in_page = offset_in_page(start_offset + dst_end); 6028 src_off_in_page = offset_in_page(start_offset + src_end); 6029 6030 cur = min_t(unsigned long, len, src_off_in_page + 1); 6031 cur = min(cur, dst_off_in_page + 1); 6032 copy_pages(dst->pages[dst_i], dst->pages[src_i], 6033 dst_off_in_page - cur + 1, 6034 src_off_in_page - cur + 1, cur); 6035 6036 dst_end -= cur; 6037 src_end -= cur; 6038 len -= 
cur; 6039 } 6040 } 6041 6042 int try_release_extent_buffer(struct page *page) 6043 { 6044 struct extent_buffer *eb; 6045 6046 /* 6047 * We need to make sure nobody is attaching this page to an eb right 6048 * now. 6049 */ 6050 spin_lock(&page->mapping->private_lock); 6051 if (!PagePrivate(page)) { 6052 spin_unlock(&page->mapping->private_lock); 6053 return 1; 6054 } 6055 6056 eb = (struct extent_buffer *)page->private; 6057 BUG_ON(!eb); 6058 6059 /* 6060 * This is a little awful but should be ok, we need to make sure that 6061 * the eb doesn't disappear out from under us while we're looking at 6062 * this page. 6063 */ 6064 spin_lock(&eb->refs_lock); 6065 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { 6066 spin_unlock(&eb->refs_lock); 6067 spin_unlock(&page->mapping->private_lock); 6068 return 0; 6069 } 6070 spin_unlock(&page->mapping->private_lock); 6071 6072 /* 6073 * If tree ref isn't set then we know the ref on this eb is a real ref, 6074 * so just return, this page will likely be freed soon anyway. 6075 */ 6076 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { 6077 spin_unlock(&eb->refs_lock); 6078 return 0; 6079 } 6080 6081 return release_extent_buffer(eb); 6082 } 6083