// SPDX-License-Identifier: GPL-2.0

#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/cleancache.h>
#include "misc.h"
#include "extent_io.h"
#include "extent-io-tree.h"
#include "extent_map.h"
#include "ctree.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "check-integrity.h"
#include "locking.h"
#include "rcu-string.h"
#include "backref.h"
#include "disk-io.h"
#include "subpage.h"
#include "zoned.h"
#include "block-group.h"

static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
static struct bio_set btrfs_bioset;

static inline bool extent_state_in_tree(const struct extent_state *state)
{
	return !RB_EMPTY_NODE(&state->rb_node);
}

#ifdef CONFIG_BTRFS_DEBUG
static LIST_HEAD(states);
static DEFINE_SPINLOCK(leak_lock);

static inline void btrfs_leak_debug_add(spinlock_t *lock,
					struct list_head *new,
					struct list_head *head)
{
	unsigned long flags;

	spin_lock_irqsave(lock, flags);
	list_add(new, head);
	spin_unlock_irqrestore(lock, flags);
}

static inline void btrfs_leak_debug_del(spinlock_t *lock,
					struct list_head *entry)
{
	unsigned long flags;

	spin_lock_irqsave(lock, flags);
	list_del(entry);
	spin_unlock_irqrestore(lock, flags);
}
void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
{
	struct extent_buffer *eb;
	unsigned long flags;

	/*
	 * If we didn't get into open_ctree our allocated_ebs will not be
	 * initialized, so just skip this.
	 */
	if (!fs_info->allocated_ebs.next)
		return;

	spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
	while (!list_empty(&fs_info->allocated_ebs)) {
		eb = list_first_entry(&fs_info->allocated_ebs,
				      struct extent_buffer, leak_list);
		pr_err(
	"BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n",
		       eb->start, eb->len, atomic_read(&eb->refs), eb->bflags,
		       btrfs_header_owner(eb));
		list_del(&eb->leak_list);
		kmem_cache_free(extent_buffer_cache, eb);
	}
	spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
}

static inline void btrfs_extent_state_leak_debug_check(void)
{
	struct extent_state *state;

	while (!list_empty(&states)) {
		state = list_entry(states.next, struct extent_state, leak_list);
		pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
		       state->start, state->end, state->state,
		       extent_state_in_tree(state),
		       refcount_read(&state->refs));
		list_del(&state->leak_list);
		kmem_cache_free(extent_state_cache, state);
	}
}

#define btrfs_debug_check_extent_io_range(tree, start, end)		\
	__btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
static inline void __btrfs_debug_check_extent_io_range(const char *caller,
		struct extent_io_tree *tree, u64 start, u64 end)
{
	struct inode *inode = tree->private_data;
	u64 isize;

	if (!inode || !is_data_inode(inode))
		return;

	isize = i_size_read(inode);
	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
		btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
		    "%s: ino %llu isize %llu odd range [%llu,%llu]",
			caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
	}
}
#else
#define btrfs_leak_debug_add(lock, new, head)	do {} while (0)
#define btrfs_leak_debug_del(lock, entry)	do {} while (0)
#define btrfs_extent_state_leak_debug_check()	do {} while (0)
#define btrfs_debug_check_extent_io_range(c, s, e)	do {} while (0)
#endif

struct tree_entry {
	u64 start;
	u64 end;
	struct rb_node rb_node;
};

struct extent_page_data {
	struct btrfs_bio_ctrl bio_ctrl;
	/*
	 * Tells writepage not to lock the state bits for this range; it
	 * still does the unlocking.
	 */
	unsigned int extent_locked:1;

	/* Tells the submit_bio code to use REQ_SYNC */
	unsigned int sync_io:1;
};

static int add_extent_changeset(struct extent_state *state, u32 bits,
				struct extent_changeset *changeset,
				int set)
{
	int ret;

	if (!changeset)
		return 0;
	if (set && (state->state & bits) == bits)
		return 0;
	if (!set && (state->state & bits) == 0)
		return 0;
	changeset->bytes_changed += state->end - state->start + 1;
	ret = ulist_add(&changeset->range_changed, state->start, state->end,
			GFP_ATOMIC);
	return ret;
}

int __must_check submit_one_bio(struct bio *bio, int mirror_num,
				unsigned long bio_flags)
{
	blk_status_t ret = 0;
	struct extent_io_tree *tree = bio->bi_private;

	bio->bi_private = NULL;

	if (is_data_inode(tree->private_data))
		ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num,
					    bio_flags);
	else
		ret = btrfs_submit_metadata_bio(tree->private_data, bio,
						mirror_num, bio_flags);

	return blk_status_to_errno(ret);
}
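/*
 * Illustrative sketch (not part of this file): walking a changeset filled in
 * by the *_record_extent_bits() helpers further below.  add_extent_changeset()
 * stores each changed range in a ulist (val = range start, aux = range end),
 * so a caller can iterate it roughly like this, where handle_range() is a
 * hypothetical placeholder:
 *
 *	struct extent_changeset *cs = extent_changeset_alloc();
 *	struct ulist_iterator uiter;
 *	struct ulist_node *unode;
 *
 *	set_record_extent_bits(tree, start, end, EXTENT_QGROUP_RESERVED, cs);
 *	ULIST_ITER_INIT(&uiter);
 *	while ((unode = ulist_next(&cs->range_changed, &uiter)))
 *		handle_range(unode->val, unode->aux);
 *	extent_changeset_free(cs);
 */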
/* Cleanup unsubmitted bios */
static void end_write_bio(struct extent_page_data *epd, int ret)
{
	struct bio *bio = epd->bio_ctrl.bio;

	if (bio) {
		bio->bi_status = errno_to_blk_status(ret);
		bio_endio(bio);
		epd->bio_ctrl.bio = NULL;
	}
}

/*
 * Submit the bio from extent page data via submit_one_bio
 *
 * Return 0 if everything is OK.
 * Return <0 for error.
 */
static int __must_check flush_write_bio(struct extent_page_data *epd)
{
	int ret = 0;
	struct bio *bio = epd->bio_ctrl.bio;

	if (bio) {
		ret = submit_one_bio(bio, 0, 0);
		/*
		 * Clean up of epd->bio is handled by its endio function.
		 * And endio is either triggered by successful bio execution
		 * or the error handler of the submit bio hook.
		 * So at this point, no matter what happened, we don't need
		 * to clean up epd->bio.
		 */
		epd->bio_ctrl.bio = NULL;
	}
	return ret;
}

int __init extent_state_cache_init(void)
{
	extent_state_cache = kmem_cache_create("btrfs_extent_state",
			sizeof(struct extent_state), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!extent_state_cache)
		return -ENOMEM;
	return 0;
}

int __init extent_io_init(void)
{
	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
			sizeof(struct extent_buffer), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!extent_buffer_cache)
		return -ENOMEM;

	if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_io_bio, bio),
			BIOSET_NEED_BVECS))
		goto free_buffer_cache;

	if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE))
		goto free_bioset;

	return 0;

free_bioset:
	bioset_exit(&btrfs_bioset);

free_buffer_cache:
	kmem_cache_destroy(extent_buffer_cache);
	extent_buffer_cache = NULL;
	return -ENOMEM;
}

void __cold extent_state_cache_exit(void)
{
	btrfs_extent_state_leak_debug_check();
	kmem_cache_destroy(extent_state_cache);
}

void __cold extent_io_exit(void)
{
	/*
	 * Make sure all delayed rcu free are flushed before we
	 * destroy caches.
	 */
	rcu_barrier();
	kmem_cache_destroy(extent_buffer_cache);
	bioset_exit(&btrfs_bioset);
}

/*
 * For the file_extent_tree, we want to hold the inode lock when we lookup and
 * update the disk_i_size, but lockdep will complain because for our io_tree
 * we hold the tree lock while taking the inode lock when setting delalloc.
 * These two things are unrelated, so make a class for the file_extent_tree so
 * we don't get the two locking patterns mixed up.
 */
static struct lock_class_key file_extent_tree_class;

void extent_io_tree_init(struct btrfs_fs_info *fs_info,
			 struct extent_io_tree *tree, unsigned int owner,
			 void *private_data)
{
	tree->fs_info = fs_info;
	tree->state = RB_ROOT;
	tree->dirty_bytes = 0;
	spin_lock_init(&tree->lock);
	tree->private_data = private_data;
	tree->owner = owner;
	if (owner == IO_TREE_INODE_FILE_EXTENT)
		lockdep_set_class(&tree->lock, &file_extent_tree_class);
}
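/*
 * Illustrative sketch (not part of this file): a typical initialization of io
 * trees for an inode, roughly as done from the btrfs inode setup code.  The
 * owner enum tells debugging/tracing code what the tree is used for, and
 * private_data lets the tree call back into the owning inode for delalloc
 * accounting:
 *
 *	struct btrfs_inode *bi = BTRFS_I(inode);
 *
 *	extent_io_tree_init(fs_info, &bi->io_tree, IO_TREE_INODE_IO, inode);
 *	extent_io_tree_init(fs_info, &bi->file_extent_tree,
 *			    IO_TREE_INODE_FILE_EXTENT, inode);
 */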
void extent_io_tree_release(struct extent_io_tree *tree)
{
	spin_lock(&tree->lock);
	/*
	 * Do a single barrier for the waitqueue_active check here, the state
	 * of the waitqueue should not change once extent_io_tree_release is
	 * called.
	 */
	smp_mb();
	while (!RB_EMPTY_ROOT(&tree->state)) {
		struct rb_node *node;
		struct extent_state *state;

		node = rb_first(&tree->state);
		state = rb_entry(node, struct extent_state, rb_node);
		rb_erase(&state->rb_node, &tree->state);
		RB_CLEAR_NODE(&state->rb_node);
		/*
		 * btree io trees aren't supposed to have tasks waiting for
		 * changes in the flags of extent states ever.
		 */
		ASSERT(!waitqueue_active(&state->wq));
		free_extent_state(state);

		cond_resched_lock(&tree->lock);
	}
	spin_unlock(&tree->lock);
}

static struct extent_state *alloc_extent_state(gfp_t mask)
{
	struct extent_state *state;

	/*
	 * The given mask might be not appropriate for the slab allocator,
	 * drop the unsupported bits
	 */
	mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
	state = kmem_cache_alloc(extent_state_cache, mask);
	if (!state)
		return state;
	state->state = 0;
	state->failrec = NULL;
	RB_CLEAR_NODE(&state->rb_node);
	btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states);
	refcount_set(&state->refs, 1);
	init_waitqueue_head(&state->wq);
	trace_alloc_extent_state(state, mask, _RET_IP_);
	return state;
}

void free_extent_state(struct extent_state *state)
{
	if (!state)
		return;
	if (refcount_dec_and_test(&state->refs)) {
		WARN_ON(extent_state_in_tree(state));
		btrfs_leak_debug_del(&leak_lock, &state->leak_list);
		trace_free_extent_state(state, _RET_IP_);
		kmem_cache_free(extent_state_cache, state);
	}
}

static struct rb_node *tree_insert(struct rb_root *root,
				   struct rb_node *search_start,
				   u64 offset,
				   struct rb_node *node,
				   struct rb_node ***p_in,
				   struct rb_node **parent_in)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct tree_entry *entry;

	if (p_in && parent_in) {
		p = *p_in;
		parent = *parent_in;
		goto do_insert;
	}

	p = search_start ? &search_start : &root->rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct tree_entry, rb_node);

		if (offset < entry->start)
			p = &(*p)->rb_left;
		else if (offset > entry->end)
			p = &(*p)->rb_right;
		else
			return parent;
	}

do_insert:
	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}
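/*
 * Illustrative sketch (not part of this file): the extent_state refcount
 * protocol used throughout this file.  A state starts with one reference held
 * by the tree; cache_state() and explicit refcount_inc() take extra
 * references, and every path that stops using a state drops its reference
 * through free_extent_state(), which only frees on the last put:
 *
 *	struct extent_state *cached = NULL;
 *
 *	lock_extent_bits(tree, start, end, &cached);	// cached takes a ref
 *	...					// use cached->start/end
 *	unlock_extent_cached(tree, start, end, &cached);	// drops the ref
 */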
/**
 * Search @tree for an entry that contains @offset. Such entry would have
 * entry->start <= offset && entry->end >= offset.
 *
 * @tree:       the tree to search
 * @offset:     offset that should fall within an entry in @tree
 * @next_ret:   pointer to the first entry whose range ends after @offset
 * @prev_ret:   pointer to the first entry whose range begins before @offset
 * @p_ret:      pointer where new node should be anchored (used when inserting
 *              an entry in the tree)
 * @parent_ret: points to entry which would have been the parent of the entry,
 *              containing @offset
 *
 * This function returns a pointer to the entry that contains @offset byte
 * address. If no such entry exists, then NULL is returned and the other
 * pointer arguments to the function are filled, otherwise the found entry is
 * returned and other pointers are left untouched.
 */
static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
				      struct rb_node **next_ret,
				      struct rb_node **prev_ret,
				      struct rb_node ***p_ret,
				      struct rb_node **parent_ret)
{
	struct rb_root *root = &tree->state;
	struct rb_node **n = &root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *orig_prev = NULL;
	struct tree_entry *entry;
	struct tree_entry *prev_entry = NULL;

	while (*n) {
		prev = *n;
		entry = rb_entry(prev, struct tree_entry, rb_node);
		prev_entry = entry;

		if (offset < entry->start)
			n = &(*n)->rb_left;
		else if (offset > entry->end)
			n = &(*n)->rb_right;
		else
			return *n;
	}

	if (p_ret)
		*p_ret = n;
	if (parent_ret)
		*parent_ret = prev;

	if (next_ret) {
		orig_prev = prev;
		while (prev && offset > prev_entry->end) {
			prev = rb_next(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*next_ret = prev;
		prev = orig_prev;
	}

	if (prev_ret) {
		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		while (prev && offset < prev_entry->start) {
			prev = rb_prev(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*prev_ret = prev;
	}
	return NULL;
}

static inline struct rb_node *
tree_search_for_insert(struct extent_io_tree *tree,
		       u64 offset,
		       struct rb_node ***p_ret,
		       struct rb_node **parent_ret)
{
	struct rb_node *next = NULL;
	struct rb_node *ret;

	ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret);
	if (!ret)
		return next;
	return ret;
}

static inline struct rb_node *tree_search(struct extent_io_tree *tree,
					  u64 offset)
{
	return tree_search_for_insert(tree, offset, NULL, NULL);
}
/*
 * Utility function to look for merge candidates inside a given range.  Any
 * extents with matching state are merged together into a single extent in the
 * tree.  Extents with EXTENT_IO in their state field are not merged because
 * the end_io handlers need to be able to do operations on them without
 * sleeping (or doing allocations/splits).
 *
 * This should be called with the tree lock held.
 */
static void merge_state(struct extent_io_tree *tree,
			struct extent_state *state)
{
	struct extent_state *other;
	struct rb_node *other_node;

	if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
		return;

	other_node = rb_prev(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->end == state->start - 1 &&
		    other->state == state->state) {
			if (tree->private_data &&
			    is_data_inode(tree->private_data))
				btrfs_merge_delalloc_extent(tree->private_data,
							    state, other);
			state->start = other->start;
			rb_erase(&other->rb_node, &tree->state);
			RB_CLEAR_NODE(&other->rb_node);
			free_extent_state(other);
		}
	}
	other_node = rb_next(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->start == state->end + 1 &&
		    other->state == state->state) {
			if (tree->private_data &&
			    is_data_inode(tree->private_data))
				btrfs_merge_delalloc_extent(tree->private_data,
							    state, other);
			state->end = other->end;
			rb_erase(&other->rb_node, &tree->state);
			RB_CLEAR_NODE(&other->rb_node);
			free_extent_state(other);
		}
	}
}

static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state, u32 *bits,
			   struct extent_changeset *changeset);

/*
 * Insert an extent_state struct into the tree.  'bits' are set on the struct
 * before it is inserted.
 *
 * This may return -EEXIST if the extent is already there, in which case the
 * state struct is freed.
 *
 * The tree lock is not taken internally.  This is a utility function and
 * probably isn't what you want to call (see set/clear_extent_bit).
 */
static int insert_state(struct extent_io_tree *tree,
			struct extent_state *state, u64 start, u64 end,
			struct rb_node ***p,
			struct rb_node **parent,
			u32 *bits, struct extent_changeset *changeset)
{
	struct rb_node *node;

	if (end < start) {
		btrfs_err(tree->fs_info,
			"insert state: end < start %llu %llu", end, start);
		WARN_ON(1);
	}
	state->start = start;
	state->end = end;

	set_state_bits(tree, state, bits, changeset);

	node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
	if (node) {
		struct extent_state *found;

		found = rb_entry(node, struct extent_state, rb_node);
		btrfs_err(tree->fs_info,
		       "found node %llu %llu on insert of %llu %llu",
		       found->start, found->end, start, end);
		return -EEXIST;
	}
	merge_state(tree, state);
	return 0;
}
/*
 * Split a given extent state struct in two, inserting the preallocated
 * struct 'prealloc' as the newly created second half.  'split' indicates an
 * offset inside 'orig' where it should be split.
 *
 * Before calling, the tree has 'orig' at [orig->start, orig->end].  After
 * calling, there are two extent state structs in the tree:
 * prealloc: [orig->start, split - 1]
 * orig:     [split, orig->end]
 *
 * The tree locks are not taken by this function.  They need to be held by the
 * caller.
 */
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
		       struct extent_state *prealloc, u64 split)
{
	struct rb_node *node;

	if (tree->private_data && is_data_inode(tree->private_data))
		btrfs_split_delalloc_extent(tree->private_data, orig, split);

	prealloc->start = orig->start;
	prealloc->end = split - 1;
	prealloc->state = orig->state;
	orig->start = split;

	node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
			   &prealloc->rb_node, NULL, NULL);
	if (node) {
		free_extent_state(prealloc);
		return -EEXIST;
	}
	return 0;
}

static struct extent_state *next_state(struct extent_state *state)
{
	struct rb_node *next = rb_next(&state->rb_node);

	if (next)
		return rb_entry(next, struct extent_state, rb_node);
	else
		return NULL;
}

/*
 * Utility function to clear some bits in an extent state struct.  It will
 * optionally wake up anyone waiting on this state (wake == 1).
 *
 * If no bits are set on the state struct after clearing things, the struct is
 * freed and removed from the tree.
 */
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
					    struct extent_state *state,
					    u32 *bits, int wake,
					    struct extent_changeset *changeset)
{
	struct extent_state *next;
	u32 bits_to_clear = *bits & ~EXTENT_CTLBITS;
	int ret;

	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;

		WARN_ON(range > tree->dirty_bytes);
		tree->dirty_bytes -= range;
	}

	if (tree->private_data && is_data_inode(tree->private_data))
		btrfs_clear_delalloc_extent(tree->private_data, state, bits);

	ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
	BUG_ON(ret < 0);
	state->state &= ~bits_to_clear;
	if (wake)
		wake_up(&state->wq);
	if (state->state == 0) {
		next = next_state(state);
		if (extent_state_in_tree(state)) {
			rb_erase(&state->rb_node, &tree->state);
			RB_CLEAR_NODE(&state->rb_node);
			free_extent_state(state);
		} else {
			WARN_ON(1);
		}
	} else {
		merge_state(tree, state);
		next = next_state(state);
	}
	return next;
}

static struct extent_state *
alloc_extent_state_atomic(struct extent_state *prealloc)
{
	if (!prealloc)
		prealloc = alloc_extent_state(GFP_ATOMIC);

	return prealloc;
}

static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
{
	btrfs_panic(tree->fs_info, err,
	"locking error: extent tree was modified by another thread while locked");
}
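/*
 * Illustrative sketch (not part of this file): the preallocation pattern used
 * by the set/clear/convert functions below.  An extent_state is allocated
 * speculatively outside the tree lock (where sleeping allocations are fine),
 * and alloc_extent_state_atomic() is only a GFP_ATOMIC fallback once the lock
 * is held and a split or insert turns out to be needed:
 *
 *	struct extent_state *prealloc = NULL;
 *
 *	if (gfpflags_allow_blocking(mask))
 *		prealloc = alloc_extent_state(mask);	// may sleep, may fail
 *	spin_lock(&tree->lock);
 *	...
 *	prealloc = alloc_extent_state_atomic(prealloc);	// last-chance alloc
 *	...
 *	spin_unlock(&tree->lock);
 *	if (prealloc)
 *		free_extent_state(prealloc);		// unused leftover
 */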
/*
 * Clear some bits on a range in the tree.  This may require splitting or
 * inserting elements in the tree, so the gfp mask is used to indicate which
 * allocations or sleeping are allowed.
 *
 * Pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove the
 * given range from the tree regardless of state (ie for truncate).
 *
 * The range [start, end] is inclusive.
 *
 * This takes the tree lock, and returns 0 on success and < 0 on error.
 */
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		       u32 bits, int wake, int delete,
		       struct extent_state **cached_state,
		       gfp_t mask, struct extent_changeset *changeset)
{
	struct extent_state *state;
	struct extent_state *cached;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	u64 last_end;
	int err;
	int clear = 0;

	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);

	if (bits & EXTENT_DELALLOC)
		bits |= EXTENT_NORESERVE;

	if (delete)
		bits |= ~EXTENT_CTLBITS;

	if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
		clear = 1;
again:
	if (!prealloc && gfpflags_allow_blocking(mask)) {
		/*
		 * Don't care for allocation failure here because we might end
		 * up not needing the pre-allocated extent state at all, which
		 * is the case if we only have in the tree extent states that
		 * cover our input range and don't also cover any other range.
		 * If we end up needing a new extent state we allocate it later.
		 */
		prealloc = alloc_extent_state(mask);
	}

	spin_lock(&tree->lock);
	if (cached_state) {
		cached = *cached_state;

		if (clear) {
			*cached_state = NULL;
			cached_state = NULL;
		}

		if (cached && extent_state_in_tree(cached) &&
		    cached->start <= start && cached->end > start) {
			if (clear)
				refcount_dec(&cached->refs);
			state = cached;
			goto hit_next;
		}
		if (clear)
			free_extent_state(cached);
	}
	/*
	 * This search will find the extents that end after our range starts.
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	if (state->start > end)
		goto out;
	WARN_ON(state->end < start);
	last_end = state->end;

	/* The state doesn't have the wanted bits, go ahead. */
	if (!(state->state & bits)) {
		state = next_state(state);
		goto next;
	}

	/*
	 *     | ---- desired range ---- |
	 *  | state | or
	 *  | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on the
	 * second half.
	 *
	 * If the extent we found extends past our range, we just split and
	 * search again.  It'll get split again the next time though.
	 *
	 * If the extent we found is inside our range, we clear the desired
	 * bit on it.
	 */
	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			state = clear_state_bit(tree, state, &bits, wake,
						changeset);
			goto next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 *
	 * We need to split the extent, and clear the bit on the first half.
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		if (wake)
			wake_up(&state->wq);

		clear_state_bit(tree, prealloc, &bits, wake, changeset);

		prealloc = NULL;
		goto out;
	}

	state = clear_state_bit(tree, state, &bits, wake, changeset);
next:
	if (last_end == (u64)-1)
		goto out;
	start = last_end + 1;
	if (start <= end && state && !need_resched())
		goto hit_next;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (gfpflags_allow_blocking(mask))
		cond_resched();
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return 0;
}

static void wait_on_state(struct extent_io_tree *tree,
			  struct extent_state *state)
		__releases(tree->lock)
		__acquires(tree->lock)
{
	DEFINE_WAIT(wait);

	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
	spin_unlock(&tree->lock);
	schedule();
	spin_lock(&tree->lock);
	finish_wait(&state->wq, &wait);
}
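/*
 * Illustrative sketch (not part of this file): a typical "drop everything"
 * use of the clearing code above, as done on truncate-like paths.  Passing
 * delete == 1 widens the bit mask so every state in the range is removed
 * regardless of which bits it carries, and wake == 1 kicks any waiters:
 *
 *	// Remove all state in [start, end], waking anyone blocked on it.
 *	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end,
 *			 EXTENT_LOCKED | EXTENT_DELALLOC, 1, 1, NULL);
 */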
/*
 * Waits for one or more bits to clear on a range in the state tree.
 * The range [start, end] is inclusive.
 * The tree lock is taken by this function.
 */
static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
			    u32 bits)
{
	struct extent_state *state;
	struct rb_node *node;

	btrfs_debug_check_extent_io_range(tree, start, end);

	spin_lock(&tree->lock);
again:
	while (1) {
		/*
		 * This search will find all the extents that end after our
		 * range starts.
		 */
		node = tree_search(tree, start);
process_node:
		if (!node)
			break;

		state = rb_entry(node, struct extent_state, rb_node);

		if (state->start > end)
			goto out;

		if (state->state & bits) {
			start = state->start;
			refcount_inc(&state->refs);
			wait_on_state(tree, state);
			free_extent_state(state);
			goto again;
		}
		start = state->end + 1;

		if (start > end)
			break;

		if (!cond_resched_lock(&tree->lock)) {
			node = rb_next(node);
			goto process_node;
		}
	}
out:
	spin_unlock(&tree->lock);
}

static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state,
			   u32 *bits, struct extent_changeset *changeset)
{
	u32 bits_to_set = *bits & ~EXTENT_CTLBITS;
	int ret;

	if (tree->private_data && is_data_inode(tree->private_data))
		btrfs_set_delalloc_extent(tree->private_data, state, bits);

	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;

		tree->dirty_bytes += range;
	}
	ret = add_extent_changeset(state, bits_to_set, changeset, 1);
	BUG_ON(ret < 0);
	state->state |= bits_to_set;
}

static void cache_state_if_flags(struct extent_state *state,
				 struct extent_state **cached_ptr,
				 unsigned flags)
{
	if (cached_ptr && !(*cached_ptr)) {
		if (!flags || (state->state & flags)) {
			*cached_ptr = state;
			refcount_inc(&state->refs);
		}
	}
}

static void cache_state(struct extent_state *state,
			struct extent_state **cached_ptr)
{
	return cache_state_if_flags(state, cached_ptr,
				    EXTENT_LOCKED | EXTENT_BOUNDARY);
}
/*
 * Set some bits on a range in the tree.  This may require allocations or
 * sleeping, so the gfp mask is used to indicate what is allowed.
 *
 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 * part of the range already has the desired bits set.  The start of the
 * existing range is returned in failed_start in this case.
 *
 * [start, end] is inclusive.  This takes the tree lock.
 */
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
		   u32 exclusive_bits, u64 *failed_start,
		   struct extent_state **cached_state, gfp_t mask,
		   struct extent_changeset *changeset)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	struct rb_node **p;
	struct rb_node *parent;
	int err = 0;
	u64 last_start;
	u64 last_end;

	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);

	if (exclusive_bits)
		ASSERT(failed_start);
	else
		ASSERT(failed_start == NULL);
again:
	if (!prealloc && gfpflags_allow_blocking(mask)) {
		/*
		 * Don't care for allocation failure here because we might end
		 * up not needing the pre-allocated extent state at all, which
		 * is the case if we only have in the tree extent states that
		 * cover our input range and don't also cover any other range.
		 * If we end up needing a new extent state we allocate it later.
		 */
		prealloc = alloc_extent_state(mask);
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start <= start && state->end > start &&
		    extent_state_in_tree(state)) {
			node = &state->rb_node;
			goto hit_next;
		}
	}
	/*
	 * This search will find all the extents that end after our range
	 * starts.
	 */
	node = tree_search_for_insert(tree, start, &p, &parent);
	if (!node) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = insert_state(tree, prealloc, start, end,
				   &p, &parent, &bits, changeset);
		if (err)
			extent_io_tree_panic(tree, err);

		cache_state(prealloc, cached_state);
		prealloc = NULL;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going.
	 */
	if (state->start == start && state->end <= end) {
		if (state->state & exclusive_bits) {
			*failed_start = state->start;
			err = -EEXIST;
			goto out;
		}

		set_state_bits(tree, state, &bits, changeset);
		cache_state(state, cached_state);
		merge_state(tree, state);
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		state = next_state(state);
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on the
	 * second half.
	 *
	 * If the extent we found extends past our range, we just split and
	 * search again.  It'll get split again the next time though.
	 *
	 * If the extent we found is inside our range, we set the desired bit
	 * on it.
	 */
	if (state->start < start) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		/*
		 * If this extent already has all the bits we want set, then
		 * skip it, not necessary to split it or do anything with it.
		 */
		if ((state->state & bits) == bits) {
			start = state->end + 1;
			cache_state(state, cached_state);
			goto search_again;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, &bits, changeset);
			cache_state(state, cached_state);
			merge_state(tree, state);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			state = next_state(state);
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and ignore the
	 * extent we found.
	 */
	if (state->start > start) {
		u64 this_end;

		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);

		/*
		 * Avoid to free 'prealloc' if it can be merged with the later
		 * extent.
		 */
		err = insert_state(tree, prealloc, start, this_end,
				   NULL, NULL, &bits, changeset);
		if (err)
			extent_io_tree_panic(tree, err);

		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 *
	 * We need to split the extent, and set the bit on the first half.
	 */
	if (state->start <= end && state->end > end) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		set_state_bits(tree, prealloc, &bits, changeset);
		cache_state(prealloc, cached_state);
		merge_state(tree, prealloc);
		prealloc = NULL;
		goto out;
	}

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (gfpflags_allow_blocking(mask))
		cond_resched();
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;
}
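/*
 * Illustrative sketch (not part of this file): how exclusive bits are used.
 * When exclusive_bits is non-zero and part of the range already carries one
 * of those bits, set_extent_bit() stops with -EEXIST and reports where the
 * conflict starts, which is exactly what the locking wrappers below rely on:
 *
 *	u64 failed_start;
 *	int err;
 *
 *	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
 *			     &failed_start, NULL, GFP_NOFS, NULL);
 *	if (err == -EEXIST)
 *		// [failed_start, end] is still contended, wait and retry
 *		wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
 */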
/**
 * convert_extent_bit - convert all bits in a given range from one bit to
 *			another
 * @tree:	the io tree to search
 * @start:	the start offset in bytes
 * @end:	the end offset in bytes (inclusive)
 * @bits:	the bits to set in this range
 * @clear_bits:	the bits to clear in this range
 * @cached_state:	state that we're going to cache
 *
 * This will go through and set bits for the given range.  If any states exist
 * already in this range they are set with the given bit and cleared of the
 * clear_bits.  This is only meant to be used by things that are mergeable, ie
 * converting from say DELALLOC to DIRTY.  This is not meant to be used with
 * boundary bits like LOCK.
 *
 * All allocations are done with GFP_NOFS.
 */
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		       u32 bits, u32 clear_bits,
		       struct extent_state **cached_state)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	struct rb_node **p;
	struct rb_node *parent;
	int err = 0;
	u64 last_start;
	u64 last_end;
	bool first_iteration = true;

	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
				       clear_bits);

again:
	if (!prealloc) {
		/*
		 * Best effort, don't worry if extent state allocation fails
		 * here for the first iteration. We might have a cached state
		 * that matches exactly the target range, in which case no
		 * extent state allocations are needed. We'll only know this
		 * after locking the tree.
		 */
		prealloc = alloc_extent_state(GFP_NOFS);
		if (!prealloc && !first_iteration)
			return -ENOMEM;
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start <= start && state->end > start &&
		    extent_state_in_tree(state)) {
			node = &state->rb_node;
			goto hit_next;
		}
	}

	/*
	 * This search will find all the extents that end after our range
	 * starts.
	 */
	node = tree_search_for_insert(tree, start, &p, &parent);
	if (!node) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		err = insert_state(tree, prealloc, start, end,
				   &p, &parent, &bits, NULL);
		if (err)
			extent_io_tree_panic(tree, err);
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going.
	 */
	if (state->start == start && state->end <= end) {
		set_state_bits(tree, state, &bits, NULL);
		cache_state(state, cached_state);
		state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on the
	 * second half.
	 *
	 * If the extent we found extends past our range, we just split and
	 * search again.  It'll get split again the next time though.
	 *
	 * If the extent we found is inside our range, we set the desired bit
	 * on it.
	 */
	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, &bits, NULL);
			cache_state(state, cached_state);
			state = clear_state_bit(tree, state, &clear_bits, 0,
						NULL);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and ignore the
	 * extent we found.
	 */
	if (state->start > start) {
		u64 this_end;

		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		/*
		 * Avoid to free 'prealloc' if it can be merged with the later
		 * extent.
		 */
		err = insert_state(tree, prealloc, start, this_end,
				   NULL, NULL, &bits, NULL);
		if (err)
			extent_io_tree_panic(tree, err);
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 *
	 * We need to split the extent, and set the bit on the first half.
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		set_state_bits(tree, prealloc, &bits, NULL);
		cache_state(prealloc, cached_state);
		clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
		prealloc = NULL;
		goto out;
	}

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	cond_resched();
	first_iteration = false;
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;
}
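/*
 * Illustrative sketch (not part of this file): an atomic bit conversion over
 * a range, in the spirit of how the transaction commit code moves marked
 * metadata ranges from "dirty" to "needs writeback wait".  Both masks are
 * applied per state under one pass of the tree lock, so a state is never
 * observed with neither bit set:
 *
 *	struct extent_state *cached = NULL;
 *
 *	convert_extent_bit(tree, start, end, EXTENT_NEED_WAIT, EXTENT_DIRTY,
 *			   &cached);
 *	free_extent_state(cached);
 */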
/* Wrappers around set/clear extent bit */
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
			   u32 bits, struct extent_changeset *changeset)
{
	/*
	 * We don't support EXTENT_LOCKED yet, as the current changeset will
	 * record any bits changed, so for the EXTENT_LOCKED case it will
	 * either fail with -EEXIST or the changeset will record the whole
	 * range.
	 */
	BUG_ON(bits & EXTENT_LOCKED);

	return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
			      changeset);
}

int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
			   u32 bits)
{
	return set_extent_bit(tree, start, end, bits, 0, NULL, NULL,
			      GFP_NOWAIT, NULL);
}

int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		     u32 bits, int wake, int delete,
		     struct extent_state **cached)
{
	return __clear_extent_bit(tree, start, end, bits, wake, delete,
				  cached, GFP_NOFS, NULL);
}

int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
			     u32 bits, struct extent_changeset *changeset)
{
	/*
	 * Don't support the EXTENT_LOCKED case, same reason as
	 * set_record_extent_bits().
	 */
	BUG_ON(bits & EXTENT_LOCKED);

	return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS,
				  changeset);
}

/*
 * Either insert or lock the state struct between start and end.  Waits until
 * any conflicting lock bit is cleared.
 */
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		     struct extent_state **cached_state)
{
	int err;
	u64 failed_start;

	while (1) {
		err = set_extent_bit(tree, start, end, EXTENT_LOCKED,
				     EXTENT_LOCKED, &failed_start,
				     cached_state, GFP_NOFS, NULL);
		if (err == -EEXIST) {
			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
			start = failed_start;
		} else
			break;
		WARN_ON(start > end);
	}
	return err;
}

int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
	int err;
	u64 failed_start;

	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
			     &failed_start, NULL, GFP_NOFS, NULL);
	if (err == -EEXIST) {
		if (failed_start > start)
			clear_extent_bit(tree, start, failed_start - 1,
					 EXTENT_LOCKED, 1, 0, NULL);
		return 0;
	}
	return 1;
}
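/*
 * Illustrative sketch (not part of this file): the difference between the
 * blocking and non-blocking lock above.  try_lock_extent() returns 1 on
 * success and 0 on contention (after undoing any partial lock), so a caller
 * that must not sleep can fall back to a deferred path:
 *
 *	if (!try_lock_extent(&BTRFS_I(inode)->io_tree, start, end))
 *		return -EAGAIN;		// hypothetical nowait caller
 *	...
 *	unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
 */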
void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_SHIFT;
	unsigned long end_index = end >> PAGE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(inode->i_mapping, index);
		BUG_ON(!page); /* Pages should be in the extent_io_tree */
		clear_page_dirty_for_io(page);
		put_page(page);
		index++;
	}
}

void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_SHIFT;
	unsigned long end_index = end >> PAGE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(inode->i_mapping, index);
		BUG_ON(!page); /* Pages should be in the extent_io_tree */
		__set_page_dirty_nobuffers(page);
		account_page_redirty(page);
		put_page(page);
		index++;
	}
}

/*
 * Find the first state struct with 'bits' set after 'start', and return it.
 * tree->lock must be held.  NULL will be returned if nothing was found after
 * 'start'.
 */
static struct extent_state *
find_first_extent_bit_state(struct extent_io_tree *tree, u64 start, u32 bits)
{
	struct rb_node *node;
	struct extent_state *state;

	/*
	 * This search will find all the extents that end after our range
	 * starts.
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->end >= start && (state->state & bits))
			return state;

		node = rb_next(node);
		if (!node)
			break;
	}
out:
	return NULL;
}

/*
 * Find the first offset in the io tree with one or more @bits set.
 *
 * Note: If there are multiple bits set in @bits, any of them will match.
 *
 * Return 0 if we find something, and update @start_ret and @end_ret.
 * Return 1 if we found nothing.
 */
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
			  u64 *start_ret, u64 *end_ret, u32 bits,
			  struct extent_state **cached_state)
{
	struct extent_state *state;
	int ret = 1;

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->end == start - 1 && extent_state_in_tree(state)) {
			while ((state = next_state(state)) != NULL) {
				if (state->state & bits)
					goto got_it;
			}
			free_extent_state(*cached_state);
			*cached_state = NULL;
			goto out;
		}
		free_extent_state(*cached_state);
		*cached_state = NULL;
	}

	state = find_first_extent_bit_state(tree, start, bits);
got_it:
	if (state) {
		cache_state_if_flags(state, cached_state, 0);
		*start_ret = state->start;
		*end_ret = state->end;
		ret = 0;
	}
out:
	spin_unlock(&tree->lock);
	return ret;
}

/**
 * Find a contiguous area of bits
 *
 * @tree:      io tree to check
 * @start:     offset to start the search from
 * @start_ret: the first offset we found with the bits set
 * @end_ret:   the final contiguous range of the bits that were set
 * @bits:      bits to look for
 *
 * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
 * to set bits appropriately, and then merge them again.  During this time it
 * will drop the tree->lock, so use this helper if you want to find the actual
 * contiguous area for given bits.  We will search to the first bit we find,
 * and then walk down the tree until we find a non-contiguous area.  The area
 * returned will be the full contiguous area with the bits set.
 */
int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
			       u64 *start_ret, u64 *end_ret, u32 bits)
{
	struct extent_state *state;
	int ret = 1;

	spin_lock(&tree->lock);
	state = find_first_extent_bit_state(tree, start, bits);
	if (state) {
		*start_ret = state->start;
		*end_ret = state->end;
		while ((state = next_state(state)) != NULL) {
			if (state->start > (*end_ret + 1))
				break;
			*end_ret = state->end;
		}
		ret = 0;
	}
	spin_unlock(&tree->lock);
	return ret;
}
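/*
 * Illustrative sketch (not part of this file): scanning a whole tree with
 * find_first_extent_bit().  Because @end_ret is inclusive, the next search
 * starts at *end_ret + 1; the cached state makes the walk roughly linear
 * instead of doing a fresh rbtree descent per iteration.  handle_dirty_range()
 * is a hypothetical placeholder:
 *
 *	struct extent_state *cached = NULL;
 *	u64 found_start, found_end, cur = 0;
 *
 *	while (!find_first_extent_bit(tree, cur, &found_start, &found_end,
 *				      EXTENT_DIRTY, &cached)) {
 *		handle_dirty_range(found_start, found_end);
 *		cur = found_end + 1;
 *	}
 *	free_extent_state(cached);
 */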
/**
 * Find the first range that has @bits not set.  This range could start before
 * @start.
 *
 * @tree:      the tree to search
 * @start:     offset at/after which the found extent should start
 * @start_ret: records the beginning of the range
 * @end_ret:   records the end of the range (inclusive)
 * @bits:      the set of bits which must be unset
 *
 * Since unallocated range is also considered one which doesn't have the bits
 * set it's possible that @end_ret contains -1, this happens in case the range
 * spans (last_range_end, end of device].  In this case it's up to the caller
 * to trim @end_ret to the appropriate size.
 */
void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
				 u64 *start_ret, u64 *end_ret, u32 bits)
{
	struct extent_state *state;
	struct rb_node *node, *prev = NULL, *next;

	spin_lock(&tree->lock);

	/* Find first extent with bits cleared */
	while (1) {
		node = __etree_search(tree, start, &next, &prev, NULL, NULL);
		if (!node && !next && !prev) {
			/*
			 * Tree is completely empty, send full range and let
			 * caller deal with it
			 */
			*start_ret = 0;
			*end_ret = -1;
			goto out;
		} else if (!node && !next) {
			/*
			 * We are past the last allocated chunk, set start at
			 * the end of the last extent.
			 */
			state = rb_entry(prev, struct extent_state, rb_node);
			*start_ret = state->end + 1;
			*end_ret = -1;
			goto out;
		} else if (!node) {
			node = next;
		}
		/*
		 * At this point 'node' either contains 'start' or start is
		 * before 'node'
		 */
		state = rb_entry(node, struct extent_state, rb_node);

		if (in_range(start, state->start, state->end - state->start + 1)) {
			if (state->state & bits) {
				/*
				 * |--range with bits set--|
				 *            |
				 *            start
				 */
				start = state->end + 1;
			} else {
				/*
				 * 'start' falls within a range that doesn't
				 * have the bits set, so take its start as the
				 * beginning of the desired range
				 *
				 * |--range with bits cleared----|
				 *            |
				 *            start
				 */
				*start_ret = state->start;
				break;
			}
		} else {
			/*
			 * |---prev range---|---hole/unset---|---node range---|
			 *                          |
			 *                          start
			 *
			 * or
			 *
			 * |---hole/unset--||--first node--|
			 * 0        |
			 *          start
			 */
			if (prev) {
				state = rb_entry(prev, struct extent_state,
						 rb_node);
				*start_ret = state->end + 1;
			} else {
				*start_ret = 0;
			}
			break;
		}
	}

	/*
	 * Find the longest stretch from start until an entry which has the
	 * bits set
	 */
	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->end >= start && !(state->state & bits)) {
			*end_ret = state->end;
		} else {
			*end_ret = state->start - 1;
			break;
		}

		node = rb_next(node);
		if (!node)
			break;
	}
out:
	spin_unlock(&tree->lock);
}

/*
 * Find a contiguous range of bytes in the file marked as delalloc, not more
 * than 'max_bytes'.  start and end are used to return the range.
 *
 * True is returned if we find something, false if nothing was in the tree.
 */
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
			       u64 *end, u64 max_bytes,
			       struct extent_state **cached_state)
{
	struct rb_node *node;
	struct extent_state *state;
	u64 cur_start = *start;
	bool found = false;
	u64 total_bytes = 0;

	spin_lock(&tree->lock);

	/*
	 * This search will find all the extents that end after our range
	 * starts.
	 */
	node = tree_search(tree, cur_start);
	if (!node) {
		*end = (u64)-1;
		goto out;
	}

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (found && (state->start != cur_start ||
			      (state->state & EXTENT_BOUNDARY))) {
			goto out;
		}
		if (!(state->state & EXTENT_DELALLOC)) {
			if (!found)
				*end = state->end;
			goto out;
		}
		if (!found) {
			*start = state->start;
			*cached_state = state;
			refcount_inc(&state->refs);
		}
		found = true;
		*end = state->end;
		cur_start = state->end + 1;
		node = rb_next(node);
		total_bytes += state->end - state->start + 1;
		if (total_bytes >= max_bytes)
			break;
		if (!node)
			break;
	}
out:
	spin_unlock(&tree->lock);
	return found;
}

/*
 * Process one page for __process_pages_contig().
 *
 * Return >0 if we hit @page == @locked_page.
 * Return 0 if we updated the page status.
 * Return -EAGAIN if we need to try again.
 * (For the PAGE_LOCK case when we got a dirty page or a page that does not
 *  belong to the mapping.)
 */
static int process_one_page(struct btrfs_fs_info *fs_info,
			    struct address_space *mapping,
			    struct page *page, struct page *locked_page,
			    unsigned long page_ops, u64 start, u64 end)
{
	u32 len;

	ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX);
	len = end + 1 - start;

	if (page_ops & PAGE_SET_ORDERED)
		btrfs_page_clamp_set_ordered(fs_info, page, start, len);
	if (page_ops & PAGE_SET_ERROR)
		btrfs_page_clamp_set_error(fs_info, page, start, len);
	if (page_ops & PAGE_START_WRITEBACK) {
		btrfs_page_clamp_clear_dirty(fs_info, page, start, len);
		btrfs_page_clamp_set_writeback(fs_info, page, start, len);
	}
	if (page_ops & PAGE_END_WRITEBACK)
		btrfs_page_clamp_clear_writeback(fs_info, page, start, len);

	if (page == locked_page)
		return 1;

	if (page_ops & PAGE_LOCK) {
		int ret;

		ret = btrfs_page_start_writer_lock(fs_info, page, start, len);
		if (ret)
			return ret;
		if (!PageDirty(page) || page->mapping != mapping) {
			btrfs_page_end_writer_lock(fs_info, page, start, len);
			return -EAGAIN;
		}
	}
	if (page_ops & PAGE_UNLOCK)
		btrfs_page_end_writer_lock(fs_info, page, start, len);
	return 0;
}
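/*
 * Illustrative sketch (not part of this file): page_ops are a bitmask, so one
 * pass over a contiguous range can transition all of its pages at once.  For
 * example, starting writeback on a range clears dirty and sets writeback in
 * the same call; the clamp helpers keep the bit updates within the
 * [start, start + len) byte range for subpage-sized blocks:
 *
 *	__process_pages_contig(inode->i_mapping, locked_page, start, end,
 *			       PAGE_START_WRITEBACK | PAGE_SET_ORDERED, NULL);
 */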
static int __process_pages_contig(struct address_space *mapping,
				  struct page *locked_page,
				  u64 start, u64 end, unsigned long page_ops,
				  u64 *processed_end)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb);
	pgoff_t start_index = start >> PAGE_SHIFT;
	pgoff_t end_index = end >> PAGE_SHIFT;
	pgoff_t index = start_index;
	unsigned long nr_pages = end_index - start_index + 1;
	unsigned long pages_processed = 0;
	struct page *pages[16];
	int err = 0;
	int i;

	if (page_ops & PAGE_LOCK) {
		ASSERT(page_ops == PAGE_LOCK);
		ASSERT(processed_end && *processed_end == start);
	}

	if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
		mapping_set_error(mapping, -EIO);

	while (nr_pages > 0) {
		int found_pages;

		found_pages = find_get_pages_contig(mapping, index,
				     min_t(unsigned long,
				     nr_pages, ARRAY_SIZE(pages)), pages);
		if (found_pages == 0) {
			/*
			 * Only if we're going to lock these pages can we find
			 * nothing at @index.
			 */
			ASSERT(page_ops & PAGE_LOCK);
			err = -EAGAIN;
			goto out;
		}

		for (i = 0; i < found_pages; i++) {
			int process_ret;

			process_ret = process_one_page(fs_info, mapping,
					pages[i], locked_page, page_ops,
					start, end);
			if (process_ret < 0) {
				for (; i < found_pages; i++)
					put_page(pages[i]);
				err = -EAGAIN;
				goto out;
			}
			put_page(pages[i]);
			pages_processed++;
		}
		nr_pages -= found_pages;
		index += found_pages;
		cond_resched();
	}
out:
	if (err && processed_end) {
		/*
		 * Update @processed_end.  I know this is awful since it has
		 * two different return value patterns (inclusive vs exclusive).
		 *
		 * But the exclusive pattern is necessary if @start is 0, or we
		 * underflow and the check against @processed_end won't work as
		 * expected.
		 */
		if (pages_processed)
			*processed_end = min(end,
			((u64)(start_index + pages_processed) << PAGE_SHIFT) - 1);
		else
			*processed_end = start;
	}
	return err;
}

static noinline void __unlock_for_delalloc(struct inode *inode,
					   struct page *locked_page,
					   u64 start, u64 end)
{
	unsigned long index = start >> PAGE_SHIFT;
	unsigned long end_index = end >> PAGE_SHIFT;

	ASSERT(locked_page);
	if (index == locked_page->index && end_index == index)
		return;

	__process_pages_contig(inode->i_mapping, locked_page, start, end,
			       PAGE_UNLOCK, NULL);
}

static noinline int lock_delalloc_pages(struct inode *inode,
					struct page *locked_page,
					u64 delalloc_start,
					u64 delalloc_end)
{
	unsigned long index = delalloc_start >> PAGE_SHIFT;
	unsigned long end_index = delalloc_end >> PAGE_SHIFT;
	u64 processed_end = delalloc_start;
	int ret;

	ASSERT(locked_page);
	if (index == locked_page->index && index == end_index)
		return 0;

	ret = __process_pages_contig(inode->i_mapping, locked_page, delalloc_start,
				     delalloc_end, PAGE_LOCK, &processed_end);
	if (ret == -EAGAIN && processed_end > delalloc_start)
		__unlock_for_delalloc(inode, locked_page, delalloc_start,
				      processed_end);
	return ret;
}
We have to lock 2009 * pages in order, so we can't process delalloc bytes before 2010 * locked_page 2011 */ 2012 if (delalloc_start < *start) 2013 delalloc_start = *start; 2014 2015 /* 2016 * make sure to limit the number of pages we try to lock down 2017 */ 2018 if (delalloc_end + 1 - delalloc_start > max_bytes) 2019 delalloc_end = delalloc_start + max_bytes - 1; 2020 2021 /* step two, lock all the pages after the page that has start */ 2022 ret = lock_delalloc_pages(inode, locked_page, 2023 delalloc_start, delalloc_end); 2024 ASSERT(!ret || ret == -EAGAIN); 2025 if (ret == -EAGAIN) { 2026 /* some of the pages are gone, lets avoid looping by 2027 * shortening the size of the delalloc range we're searching 2028 */ 2029 free_extent_state(cached_state); 2030 cached_state = NULL; 2031 if (!loops) { 2032 max_bytes = PAGE_SIZE; 2033 loops = 1; 2034 goto again; 2035 } else { 2036 found = false; 2037 goto out_failed; 2038 } 2039 } 2040 2041 /* step three, lock the state bits for the whole range */ 2042 lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state); 2043 2044 /* then test to make sure it is all still delalloc */ 2045 ret = test_range_bit(tree, delalloc_start, delalloc_end, 2046 EXTENT_DELALLOC, 1, cached_state); 2047 if (!ret) { 2048 unlock_extent_cached(tree, delalloc_start, delalloc_end, 2049 &cached_state); 2050 __unlock_for_delalloc(inode, locked_page, 2051 delalloc_start, delalloc_end); 2052 cond_resched(); 2053 goto again; 2054 } 2055 free_extent_state(cached_state); 2056 *start = delalloc_start; 2057 *end = delalloc_end; 2058 out_failed: 2059 return found; 2060 } 2061 2062 void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, 2063 struct page *locked_page, 2064 u32 clear_bits, unsigned long page_ops) 2065 { 2066 clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL); 2067 2068 __process_pages_contig(inode->vfs_inode.i_mapping, locked_page, 2069 start, end, page_ops, NULL); 2070 } 2071 2072 /* 2073 * count the number of bytes in the tree that have a given bit(s) 2074 * set. This can be fairly slow, except for EXTENT_DIRTY which is 2075 * cached. The total number found is returned. 2076 */ 2077 u64 count_range_bits(struct extent_io_tree *tree, 2078 u64 *start, u64 search_end, u64 max_bytes, 2079 u32 bits, int contig) 2080 { 2081 struct rb_node *node; 2082 struct extent_state *state; 2083 u64 cur_start = *start; 2084 u64 total_bytes = 0; 2085 u64 last = 0; 2086 int found = 0; 2087 2088 if (WARN_ON(search_end <= cur_start)) 2089 return 0; 2090 2091 spin_lock(&tree->lock); 2092 if (cur_start == 0 && bits == EXTENT_DIRTY) { 2093 total_bytes = tree->dirty_bytes; 2094 goto out; 2095 } 2096 /* 2097 * this search will find all the extents that end after 2098 * our range starts. 
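	 *
	 * Illustrative example (editor's note): if the tree holds the states
	 * [0, 4095] and [8192, 12287], a search with cur_start == 4096 lands
	 * on the [8192, 12287] node, the first state ending at or after 4096.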
2099 */ 2100 node = tree_search(tree, cur_start); 2101 if (!node) 2102 goto out; 2103 2104 while (1) { 2105 state = rb_entry(node, struct extent_state, rb_node); 2106 if (state->start > search_end) 2107 break; 2108 if (contig && found && state->start > last + 1) 2109 break; 2110 if (state->end >= cur_start && (state->state & bits) == bits) { 2111 total_bytes += min(search_end, state->end) + 1 - 2112 max(cur_start, state->start); 2113 if (total_bytes >= max_bytes) 2114 break; 2115 if (!found) { 2116 *start = max(cur_start, state->start); 2117 found = 1; 2118 } 2119 last = state->end; 2120 } else if (contig && found) { 2121 break; 2122 } 2123 node = rb_next(node); 2124 if (!node) 2125 break; 2126 } 2127 out: 2128 spin_unlock(&tree->lock); 2129 return total_bytes; 2130 } 2131 2132 /* 2133 * set the private field for a given byte offset in the tree. If there isn't 2134 * an extent_state there already, this does nothing. 2135 */ 2136 int set_state_failrec(struct extent_io_tree *tree, u64 start, 2137 struct io_failure_record *failrec) 2138 { 2139 struct rb_node *node; 2140 struct extent_state *state; 2141 int ret = 0; 2142 2143 spin_lock(&tree->lock); 2144 /* 2145 * this search will find all the extents that end after 2146 * our range starts. 2147 */ 2148 node = tree_search(tree, start); 2149 if (!node) { 2150 ret = -ENOENT; 2151 goto out; 2152 } 2153 state = rb_entry(node, struct extent_state, rb_node); 2154 if (state->start != start) { 2155 ret = -ENOENT; 2156 goto out; 2157 } 2158 state->failrec = failrec; 2159 out: 2160 spin_unlock(&tree->lock); 2161 return ret; 2162 } 2163 2164 struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start) 2165 { 2166 struct rb_node *node; 2167 struct extent_state *state; 2168 struct io_failure_record *failrec; 2169 2170 spin_lock(&tree->lock); 2171 /* 2172 * this search will find all the extents that end after 2173 * our range starts. 2174 */ 2175 node = tree_search(tree, start); 2176 if (!node) { 2177 failrec = ERR_PTR(-ENOENT); 2178 goto out; 2179 } 2180 state = rb_entry(node, struct extent_state, rb_node); 2181 if (state->start != start) { 2182 failrec = ERR_PTR(-ENOENT); 2183 goto out; 2184 } 2185 2186 failrec = state->failrec; 2187 out: 2188 spin_unlock(&tree->lock); 2189 return failrec; 2190 } 2191 2192 /* 2193 * searches a range in the state tree for a given mask. 2194 * If 'filled' == 1, this returns 1 only if every extent in the tree 2195 * has the bits set. Otherwise, 1 is returned if any bit in the 2196 * range is found set. 
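 *
 * Illustrative example (editor's note): if only [0, 4095] of the queried
 * range [0, 8191] has EXTENT_DELALLOC set, a call with filled == 1 returns 0,
 * while a call with filled == 0 returns 1, since at least one extent in the
 * range has the bit set.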
2197 */ 2198 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 2199 u32 bits, int filled, struct extent_state *cached) 2200 { 2201 struct extent_state *state = NULL; 2202 struct rb_node *node; 2203 int bitset = 0; 2204 2205 spin_lock(&tree->lock); 2206 if (cached && extent_state_in_tree(cached) && cached->start <= start && 2207 cached->end > start) 2208 node = &cached->rb_node; 2209 else 2210 node = tree_search(tree, start); 2211 while (node && start <= end) { 2212 state = rb_entry(node, struct extent_state, rb_node); 2213 2214 if (filled && state->start > start) { 2215 bitset = 0; 2216 break; 2217 } 2218 2219 if (state->start > end) 2220 break; 2221 2222 if (state->state & bits) { 2223 bitset = 1; 2224 if (!filled) 2225 break; 2226 } else if (filled) { 2227 bitset = 0; 2228 break; 2229 } 2230 2231 if (state->end == (u64)-1) 2232 break; 2233 2234 start = state->end + 1; 2235 if (start > end) 2236 break; 2237 node = rb_next(node); 2238 if (!node) { 2239 if (filled) 2240 bitset = 0; 2241 break; 2242 } 2243 } 2244 spin_unlock(&tree->lock); 2245 return bitset; 2246 } 2247 2248 /* 2249 * helper function to set a given page up to date if all the 2250 * extents in the tree for that page are up to date 2251 */ 2252 static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) 2253 { 2254 u64 start = page_offset(page); 2255 u64 end = start + PAGE_SIZE - 1; 2256 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) 2257 SetPageUptodate(page); 2258 } 2259 2260 int free_io_failure(struct extent_io_tree *failure_tree, 2261 struct extent_io_tree *io_tree, 2262 struct io_failure_record *rec) 2263 { 2264 int ret; 2265 int err = 0; 2266 2267 set_state_failrec(failure_tree, rec->start, NULL); 2268 ret = clear_extent_bits(failure_tree, rec->start, 2269 rec->start + rec->len - 1, 2270 EXTENT_LOCKED | EXTENT_DIRTY); 2271 if (ret) 2272 err = ret; 2273 2274 ret = clear_extent_bits(io_tree, rec->start, 2275 rec->start + rec->len - 1, 2276 EXTENT_DAMAGED); 2277 if (ret && !err) 2278 err = ret; 2279 2280 kfree(rec); 2281 return err; 2282 } 2283 2284 /* 2285 * this bypasses the standard btrfs submit functions deliberately, as 2286 * the standard behavior is to write all copies in a raid setup. here we only 2287 * want to write the one bad copy. so we do the mapping for ourselves and issue 2288 * submit_bio directly. 2289 * to avoid any synchronization issues, wait for the data after writing, which 2290 * actually prevents the read that triggered the error from finishing. 2291 * currently, there can be no more than two copies of every data bit. thus, 2292 * exactly one rewrite is required. 2293 */ 2294 int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, 2295 u64 length, u64 logical, struct page *page, 2296 unsigned int pg_offset, int mirror_num) 2297 { 2298 struct bio *bio; 2299 struct btrfs_device *dev; 2300 u64 map_length = 0; 2301 u64 sector; 2302 struct btrfs_bio *bbio = NULL; 2303 int ret; 2304 2305 ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); 2306 BUG_ON(!mirror_num); 2307 2308 if (btrfs_is_zoned(fs_info)) 2309 return btrfs_repair_one_zone(fs_info, logical); 2310 2311 bio = btrfs_io_bio_alloc(1); 2312 bio->bi_iter.bi_size = 0; 2313 map_length = length; 2314 2315 /* 2316 * Avoid races with device replace and make sure our bbio has devices 2317 * associated to its stripes that don't go away while we are doing the 2318 * read repair operation. 
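	 *
	 * (Editor's note: the counter taken here is paired with the
	 * btrfs_bio_counter_dec() calls on every exit path below, so the
	 * blocked state never outlives this function.)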
2319 */ 2320 btrfs_bio_counter_inc_blocked(fs_info); 2321 if (btrfs_is_parity_mirror(fs_info, logical, length)) { 2322 /* 2323 * Note that we don't use BTRFS_MAP_WRITE because it's supposed 2324 * to update all raid stripes, but here we just want to correct 2325 * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad 2326 * stripe's dev and sector. 2327 */ 2328 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, 2329 &map_length, &bbio, 0); 2330 if (ret) { 2331 btrfs_bio_counter_dec(fs_info); 2332 bio_put(bio); 2333 return -EIO; 2334 } 2335 ASSERT(bbio->mirror_num == 1); 2336 } else { 2337 ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, 2338 &map_length, &bbio, mirror_num); 2339 if (ret) { 2340 btrfs_bio_counter_dec(fs_info); 2341 bio_put(bio); 2342 return -EIO; 2343 } 2344 BUG_ON(mirror_num != bbio->mirror_num); 2345 } 2346 2347 sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9; 2348 bio->bi_iter.bi_sector = sector; 2349 dev = bbio->stripes[bbio->mirror_num - 1].dev; 2350 btrfs_put_bbio(bbio); 2351 if (!dev || !dev->bdev || 2352 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { 2353 btrfs_bio_counter_dec(fs_info); 2354 bio_put(bio); 2355 return -EIO; 2356 } 2357 bio_set_dev(bio, dev->bdev); 2358 bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; 2359 bio_add_page(bio, page, length, pg_offset); 2360 2361 if (btrfsic_submit_bio_wait(bio)) { 2362 /* try to remap that extent elsewhere? */ 2363 btrfs_bio_counter_dec(fs_info); 2364 bio_put(bio); 2365 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); 2366 return -EIO; 2367 } 2368 2369 btrfs_info_rl_in_rcu(fs_info, 2370 "read error corrected: ino %llu off %llu (dev %s sector %llu)", 2371 ino, start, 2372 rcu_str_deref(dev->name), sector); 2373 btrfs_bio_counter_dec(fs_info); 2374 bio_put(bio); 2375 return 0; 2376 } 2377 2378 int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) 2379 { 2380 struct btrfs_fs_info *fs_info = eb->fs_info; 2381 u64 start = eb->start; 2382 int i, num_pages = num_extent_pages(eb); 2383 int ret = 0; 2384 2385 if (sb_rdonly(fs_info->sb)) 2386 return -EROFS; 2387 2388 for (i = 0; i < num_pages; i++) { 2389 struct page *p = eb->pages[i]; 2390 2391 ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p, 2392 start - page_offset(p), mirror_num); 2393 if (ret) 2394 break; 2395 start += PAGE_SIZE; 2396 } 2397 2398 return ret; 2399 } 2400 2401 /* 2402 * each time an IO finishes, we do a fast check in the IO failure tree 2403 * to see if we need to process or clean up an io_failure_record 2404 */ 2405 int clean_io_failure(struct btrfs_fs_info *fs_info, 2406 struct extent_io_tree *failure_tree, 2407 struct extent_io_tree *io_tree, u64 start, 2408 struct page *page, u64 ino, unsigned int pg_offset) 2409 { 2410 u64 private; 2411 struct io_failure_record *failrec; 2412 struct extent_state *state; 2413 int num_copies; 2414 int ret; 2415 2416 private = 0; 2417 ret = count_range_bits(failure_tree, &private, (u64)-1, 1, 2418 EXTENT_DIRTY, 0); 2419 if (!ret) 2420 return 0; 2421 2422 failrec = get_state_failrec(failure_tree, start); 2423 if (IS_ERR(failrec)) 2424 return 0; 2425 2426 BUG_ON(!failrec->this_mirror); 2427 2428 if (sb_rdonly(fs_info->sb)) 2429 goto out; 2430 2431 spin_lock(&io_tree->lock); 2432 state = find_first_extent_bit_state(io_tree, 2433 failrec->start, 2434 EXTENT_LOCKED); 2435 spin_unlock(&io_tree->lock); 2436 2437 if (state && state->start <= failrec->start && 2438 state->end >= failrec->start + failrec->len - 1) { 2439 num_copies = 
btrfs_num_copies(fs_info, failrec->logical,
					      failrec->len);
		if (num_copies > 1) {
			repair_io_failure(fs_info, ino, start, failrec->len,
					  failrec->logical, page, pg_offset,
					  failrec->failed_mirror);
		}
	}

out:
	free_io_failure(failure_tree, io_tree, failrec);

	return 0;
}

/*
 * Can be called when:
 * - holding the extent lock
 * - under an ordered extent
 * - the inode is being freed
 */
void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
{
	struct extent_io_tree *failure_tree = &inode->io_failure_tree;
	struct io_failure_record *failrec;
	struct extent_state *state, *next;

	if (RB_EMPTY_ROOT(&failure_tree->state))
		return;

	spin_lock(&failure_tree->lock);
	state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
	while (state) {
		if (state->start > end)
			break;

		ASSERT(state->end <= end);

		next = next_state(state);

		failrec = state->failrec;
		free_extent_state(state);
		kfree(failrec);

		state = next;
	}
	spin_unlock(&failure_tree->lock);
}

static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode,
							     u64 start)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct io_failure_record *failrec;
	struct extent_map *em;
	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	const u32 sectorsize = fs_info->sectorsize;
	int ret;
	u64 logical;

	failrec = get_state_failrec(failure_tree, start);
	if (!IS_ERR(failrec)) {
		btrfs_debug(fs_info,
	"Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu",
			failrec->logical, failrec->start, failrec->len);
		/*
		 * When data can be on disk more than twice, add to failrec
		 * here (e.g. with a list for failed_mirror) to make
		 * clean_io_failure() clean all those errors at once.
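		 *
		 * (Editor's sketch, hypothetical and not implemented here:
		 * such an extension could hang a list_head of failed mirrors
		 * off the failrec, appended on each retry and walked by
		 * clean_io_failure().)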
2510 */ 2511 2512 return failrec; 2513 } 2514 2515 failrec = kzalloc(sizeof(*failrec), GFP_NOFS); 2516 if (!failrec) 2517 return ERR_PTR(-ENOMEM); 2518 2519 failrec->start = start; 2520 failrec->len = sectorsize; 2521 failrec->this_mirror = 0; 2522 failrec->bio_flags = 0; 2523 2524 read_lock(&em_tree->lock); 2525 em = lookup_extent_mapping(em_tree, start, failrec->len); 2526 if (!em) { 2527 read_unlock(&em_tree->lock); 2528 kfree(failrec); 2529 return ERR_PTR(-EIO); 2530 } 2531 2532 if (em->start > start || em->start + em->len <= start) { 2533 free_extent_map(em); 2534 em = NULL; 2535 } 2536 read_unlock(&em_tree->lock); 2537 if (!em) { 2538 kfree(failrec); 2539 return ERR_PTR(-EIO); 2540 } 2541 2542 logical = start - em->start; 2543 logical = em->block_start + logical; 2544 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 2545 logical = em->block_start; 2546 failrec->bio_flags = EXTENT_BIO_COMPRESSED; 2547 extent_set_compress_type(&failrec->bio_flags, em->compress_type); 2548 } 2549 2550 btrfs_debug(fs_info, 2551 "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu", 2552 logical, start, failrec->len); 2553 2554 failrec->logical = logical; 2555 free_extent_map(em); 2556 2557 /* Set the bits in the private failure tree */ 2558 ret = set_extent_bits(failure_tree, start, start + sectorsize - 1, 2559 EXTENT_LOCKED | EXTENT_DIRTY); 2560 if (ret >= 0) { 2561 ret = set_state_failrec(failure_tree, start, failrec); 2562 /* Set the bits in the inode's tree */ 2563 ret = set_extent_bits(tree, start, start + sectorsize - 1, 2564 EXTENT_DAMAGED); 2565 } else if (ret < 0) { 2566 kfree(failrec); 2567 return ERR_PTR(ret); 2568 } 2569 2570 return failrec; 2571 } 2572 2573 static bool btrfs_check_repairable(struct inode *inode, 2574 struct io_failure_record *failrec, 2575 int failed_mirror) 2576 { 2577 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2578 int num_copies; 2579 2580 num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len); 2581 if (num_copies == 1) { 2582 /* 2583 * we only have a single copy of the data, so don't bother with 2584 * all the retry and error correction code that follows. no 2585 * matter what the error is, it is very likely to persist. 2586 */ 2587 btrfs_debug(fs_info, 2588 "Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d", 2589 num_copies, failrec->this_mirror, failed_mirror); 2590 return false; 2591 } 2592 2593 /* The failure record should only contain one sector */ 2594 ASSERT(failrec->len == fs_info->sectorsize); 2595 2596 /* 2597 * There are two premises: 2598 * a) deliver good data to the caller 2599 * b) correct the bad sectors on disk 2600 * 2601 * Since we're only doing repair for one sector, we only need to get 2602 * a good copy of the failed sector and if we succeed, we have setup 2603 * everything for repair_io_failure to do the rest for us. 
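	 *
	 * Worked example (editor's note): with num_copies == 2 and
	 * failed_mirror == 1, this_mirror below advances from 0 to 1 and
	 * then, because it equals failed_mirror, to 2, so the retry is sent
	 * to the other copy. Once this_mirror exceeds num_copies, every copy
	 * has been tried and the sector is deemed unrepairable.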
2604 */ 2605 failrec->failed_mirror = failed_mirror; 2606 failrec->this_mirror++; 2607 if (failrec->this_mirror == failed_mirror) 2608 failrec->this_mirror++; 2609 2610 if (failrec->this_mirror > num_copies) { 2611 btrfs_debug(fs_info, 2612 "Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d", 2613 num_copies, failrec->this_mirror, failed_mirror); 2614 return false; 2615 } 2616 2617 return true; 2618 } 2619 2620 int btrfs_repair_one_sector(struct inode *inode, 2621 struct bio *failed_bio, u32 bio_offset, 2622 struct page *page, unsigned int pgoff, 2623 u64 start, int failed_mirror, 2624 submit_bio_hook_t *submit_bio_hook) 2625 { 2626 struct io_failure_record *failrec; 2627 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2628 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 2629 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 2630 struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio); 2631 const int icsum = bio_offset >> fs_info->sectorsize_bits; 2632 struct bio *repair_bio; 2633 struct btrfs_io_bio *repair_io_bio; 2634 blk_status_t status; 2635 2636 btrfs_debug(fs_info, 2637 "repair read error: read error at %llu", start); 2638 2639 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); 2640 2641 failrec = btrfs_get_io_failure_record(inode, start); 2642 if (IS_ERR(failrec)) 2643 return PTR_ERR(failrec); 2644 2645 2646 if (!btrfs_check_repairable(inode, failrec, failed_mirror)) { 2647 free_io_failure(failure_tree, tree, failrec); 2648 return -EIO; 2649 } 2650 2651 repair_bio = btrfs_io_bio_alloc(1); 2652 repair_io_bio = btrfs_io_bio(repair_bio); 2653 repair_bio->bi_opf = REQ_OP_READ; 2654 repair_bio->bi_end_io = failed_bio->bi_end_io; 2655 repair_bio->bi_iter.bi_sector = failrec->logical >> 9; 2656 repair_bio->bi_private = failed_bio->bi_private; 2657 2658 if (failed_io_bio->csum) { 2659 const u32 csum_size = fs_info->csum_size; 2660 2661 repair_io_bio->csum = repair_io_bio->csum_inline; 2662 memcpy(repair_io_bio->csum, 2663 failed_io_bio->csum + csum_size * icsum, csum_size); 2664 } 2665 2666 bio_add_page(repair_bio, page, failrec->len, pgoff); 2667 repair_io_bio->logical = failrec->start; 2668 repair_io_bio->iter = repair_bio->bi_iter; 2669 2670 btrfs_debug(btrfs_sb(inode->i_sb), 2671 "repair read error: submitting new read to mirror %d", 2672 failrec->this_mirror); 2673 2674 status = submit_bio_hook(inode, repair_bio, failrec->this_mirror, 2675 failrec->bio_flags); 2676 if (status) { 2677 free_io_failure(failure_tree, tree, failrec); 2678 bio_put(repair_bio); 2679 } 2680 return blk_status_to_errno(status); 2681 } 2682 2683 static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) 2684 { 2685 struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); 2686 2687 ASSERT(page_offset(page) <= start && 2688 start + len <= page_offset(page) + PAGE_SIZE); 2689 2690 if (uptodate) { 2691 btrfs_page_set_uptodate(fs_info, page, start, len); 2692 } else { 2693 btrfs_page_clear_uptodate(fs_info, page, start, len); 2694 btrfs_page_set_error(fs_info, page, start, len); 2695 } 2696 2697 if (fs_info->sectorsize == PAGE_SIZE) 2698 unlock_page(page); 2699 else 2700 btrfs_subpage_end_reader(fs_info, page, start, len); 2701 } 2702 2703 static blk_status_t submit_read_repair(struct inode *inode, 2704 struct bio *failed_bio, u32 bio_offset, 2705 struct page *page, unsigned int pgoff, 2706 u64 start, u64 end, int failed_mirror, 2707 unsigned int error_bitmap, 2708 submit_bio_hook_t *submit_bio_hook) 2709 { 2710 struct 
btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	const u32 sectorsize = fs_info->sectorsize;
	const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits;
	int error = 0;
	int i;

	BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);

	/* We're here because we had some read errors or csum mismatch */
	ASSERT(error_bitmap);

	/*
	 * We only get called on buffered IO, thus page must be mapped and bio
	 * must not be cloned.
	 */
	ASSERT(page->mapping && !bio_flagged(failed_bio, BIO_CLONED));

	/* Iterate through all the sectors in the range */
	for (i = 0; i < nr_bits; i++) {
		const unsigned int offset = i * sectorsize;
		struct extent_state *cached = NULL;
		bool uptodate = false;
		int ret;

		if (!(error_bitmap & (1U << i))) {
			/*
			 * This sector has no error, just end the page read
			 * and unlock the range.
			 */
			uptodate = true;
			goto next;
		}

		ret = btrfs_repair_one_sector(inode, failed_bio,
				bio_offset + offset,
				page, pgoff + offset, start + offset,
				failed_mirror, submit_bio_hook);
		if (!ret) {
			/*
			 * We have submitted the read repair, the page release
			 * will be handled by the endio function of the
			 * submitted repair bio.
			 * Thus we don't need to do anything here.
			 */
			continue;
		}
		/*
		 * Repair failed, just record the error but still continue.
		 * Otherwise the remaining sectors will not be properly
		 * unlocked.
		 */
		if (!error)
			error = ret;
next:
		end_page_read(page, uptodate, start + offset, sectorsize);
		if (uptodate)
			set_extent_uptodate(&BTRFS_I(inode)->io_tree,
					start + offset,
					start + offset + sectorsize - 1,
					&cached, GFP_ATOMIC);
		unlock_extent_cached_atomic(&BTRFS_I(inode)->io_tree,
				start + offset,
				start + offset + sectorsize - 1,
				&cached);
	}
	return errno_to_blk_status(error);
}

/* lots and lots of room for performance fixes in the end_bio funcs */

void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
{
	struct btrfs_inode *inode;
	int uptodate = (err == 0);
	int ret = 0;

	ASSERT(page && page->mapping);
	inode = BTRFS_I(page->mapping->host);
	btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate);

	if (!uptodate) {
		ClearPageUptodate(page);
		SetPageError(page);
		ret = err < 0 ? err : -EIO;
		mapping_set_error(page->mapping, ret);
	}
}

/*
 * after a writepage IO is done, we need to:
 * clear the uptodate bits on error
 * clear the writeback bits in the extent tree for this IO
 * end_page_writeback if the page has no more pending IO
 *
 * Scheduling is not allowed, so the extent state tree is expected
 * to have one and only one object corresponding to this IO.
 */
static void end_bio_extent_writepage(struct bio *bio)
{
	int error = blk_status_to_errno(bio->bi_status);
	struct bio_vec *bvec;
	u64 start;
	u64 end;
	struct bvec_iter_all iter_all;
	bool first_bvec = true;

	ASSERT(!bio_flagged(bio, BIO_CLONED));
	bio_for_each_segment_all(bvec, bio, iter_all) {
		struct page *page = bvec->bv_page;
		struct inode *inode = page->mapping->host;
		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
		const u32 sectorsize = fs_info->sectorsize;

		/* Our read/write should always be sector aligned.
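		 *
		 * E.g. (editor's note, assuming a 4K sectorsize): a bvec with
		 * bv_offset 512 trips the "partial page write" error below,
		 * while a sector-aligned offset with bv_len 2048 only logs an
		 * incomplete page write.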
		 */
		if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
			btrfs_err(fs_info,
		"partial page write in btrfs with offset %u and length %u",
				  bvec->bv_offset, bvec->bv_len);
		else if (!IS_ALIGNED(bvec->bv_len, sectorsize))
			btrfs_info(fs_info,
		"incomplete page write with offset %u and length %u",
				   bvec->bv_offset, bvec->bv_len);

		start = page_offset(page) + bvec->bv_offset;
		end = start + bvec->bv_len - 1;

		if (first_bvec) {
			btrfs_record_physical_zoned(inode, start, bio);
			first_bvec = false;
		}

		end_extent_writepage(page, error, start, end);

		btrfs_page_clear_writeback(fs_info, page, start, bvec->bv_len);
	}

	bio_put(bio);
}

/*
 * Record a previously processed extent range
 *
 * Used by endio_readpage_release_extent() to handle a full extent range,
 * reducing the number of extent io operations.
 */
struct processed_extent {
	struct btrfs_inode *inode;
	/* Start of the range in @inode */
	u64 start;
	/* End of the range in @inode */
	u64 end;
	bool uptodate;
};

/*
 * Try to release a processed extent range
 *
 * May not release the extent range right now if the current range is
 * contiguous with the processed extent.
 *
 * Will release the processed extent when @inode or @uptodate differs, or when
 * the range is no longer contiguous with the processed range.
 *
 * Passing @inode == NULL will force the processed extent to be released.
 */
static void endio_readpage_release_extent(struct processed_extent *processed,
			      struct btrfs_inode *inode, u64 start, u64 end,
			      bool uptodate)
{
	struct extent_state *cached = NULL;
	struct extent_io_tree *tree;

	/* The first extent, initialize @processed */
	if (!processed->inode)
		goto update;

	/*
	 * Contiguous with the processed extent, just update the end.
	 *
	 * Several things to notice:
	 *
	 * - bios can be merged as long as the on-disk bytenr is contiguous
	 *   This means we can have pages belonging to other inodes, thus we
	 *   need to check if the inode still matches.
	 * - bvec can contain range beyond current page for multi-page bvec
	 *   Thus we need to do processed->end + 1 >= start check
	 */
	if (processed->inode == inode && processed->uptodate == uptodate &&
	    processed->end + 1 >= start && end >= processed->end) {
		processed->end = end;
		return;
	}

	tree = &processed->inode->io_tree;
	/*
	 * Now we don't have range contiguous to the processed range, release
	 * the processed range now.
	 */
	if (processed->uptodate && tree->track_uptodate)
		set_extent_uptodate(tree, processed->start, processed->end,
				    &cached, GFP_ATOMIC);
	unlock_extent_cached_atomic(tree, processed->start, processed->end,
				    &cached);

update:
	/* Update processed to current range */
	processed->inode = inode;
	processed->start = start;
	processed->end = end;
	processed->uptodate = uptodate;
}

static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
{
	ASSERT(PageLocked(page));
	if (fs_info->sectorsize == PAGE_SIZE)
		return;

	ASSERT(PagePrivate(page));
	btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE);
}

/*
 * Find extent buffer for a given bytenr.
2933 * 2934 * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking 2935 * in endio context. 2936 */ 2937 static struct extent_buffer *find_extent_buffer_readpage( 2938 struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr) 2939 { 2940 struct extent_buffer *eb; 2941 2942 /* 2943 * For regular sectorsize, we can use page->private to grab extent 2944 * buffer 2945 */ 2946 if (fs_info->sectorsize == PAGE_SIZE) { 2947 ASSERT(PagePrivate(page) && page->private); 2948 return (struct extent_buffer *)page->private; 2949 } 2950 2951 /* For subpage case, we need to lookup buffer radix tree */ 2952 rcu_read_lock(); 2953 eb = radix_tree_lookup(&fs_info->buffer_radix, 2954 bytenr >> fs_info->sectorsize_bits); 2955 rcu_read_unlock(); 2956 ASSERT(eb); 2957 return eb; 2958 } 2959 2960 /* 2961 * after a readpage IO is done, we need to: 2962 * clear the uptodate bits on error 2963 * set the uptodate bits if things worked 2964 * set the page up to date if all extents in the tree are uptodate 2965 * clear the lock bit in the extent tree 2966 * unlock the page if there are no other extents locked for it 2967 * 2968 * Scheduling is not allowed, so the extent state tree is expected 2969 * to have one and only one object corresponding to this IO. 2970 */ 2971 static void end_bio_extent_readpage(struct bio *bio) 2972 { 2973 struct bio_vec *bvec; 2974 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 2975 struct extent_io_tree *tree, *failure_tree; 2976 struct processed_extent processed = { 0 }; 2977 /* 2978 * The offset to the beginning of a bio, since one bio can never be 2979 * larger than UINT_MAX, u32 here is enough. 2980 */ 2981 u32 bio_offset = 0; 2982 int mirror; 2983 int ret; 2984 struct bvec_iter_all iter_all; 2985 2986 ASSERT(!bio_flagged(bio, BIO_CLONED)); 2987 bio_for_each_segment_all(bvec, bio, iter_all) { 2988 bool uptodate = !bio->bi_status; 2989 struct page *page = bvec->bv_page; 2990 struct inode *inode = page->mapping->host; 2991 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2992 const u32 sectorsize = fs_info->sectorsize; 2993 unsigned int error_bitmap = (unsigned int)-1; 2994 u64 start; 2995 u64 end; 2996 u32 len; 2997 2998 btrfs_debug(fs_info, 2999 "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", 3000 bio->bi_iter.bi_sector, bio->bi_status, 3001 io_bio->mirror_num); 3002 tree = &BTRFS_I(inode)->io_tree; 3003 failure_tree = &BTRFS_I(inode)->io_failure_tree; 3004 3005 /* 3006 * We always issue full-sector reads, but if some block in a 3007 * page fails to read, blk_update_request() will advance 3008 * bv_offset and adjust bv_len to compensate. Print a warning 3009 * for unaligned offsets, and an error if they don't add up to 3010 * a full sector. 
3011 */ 3012 if (!IS_ALIGNED(bvec->bv_offset, sectorsize)) 3013 btrfs_err(fs_info, 3014 "partial page read in btrfs with offset %u and length %u", 3015 bvec->bv_offset, bvec->bv_len); 3016 else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len, 3017 sectorsize)) 3018 btrfs_info(fs_info, 3019 "incomplete page read with offset %u and length %u", 3020 bvec->bv_offset, bvec->bv_len); 3021 3022 start = page_offset(page) + bvec->bv_offset; 3023 end = start + bvec->bv_len - 1; 3024 len = bvec->bv_len; 3025 3026 mirror = io_bio->mirror_num; 3027 if (likely(uptodate)) { 3028 if (is_data_inode(inode)) { 3029 error_bitmap = btrfs_verify_data_csum(io_bio, 3030 bio_offset, page, start, end); 3031 ret = error_bitmap; 3032 } else { 3033 ret = btrfs_validate_metadata_buffer(io_bio, 3034 page, start, end, mirror); 3035 } 3036 if (ret) 3037 uptodate = false; 3038 else 3039 clean_io_failure(BTRFS_I(inode)->root->fs_info, 3040 failure_tree, tree, start, 3041 page, 3042 btrfs_ino(BTRFS_I(inode)), 0); 3043 } 3044 3045 if (likely(uptodate)) 3046 goto readpage_ok; 3047 3048 if (is_data_inode(inode)) { 3049 /* 3050 * btrfs_submit_read_repair() will handle all the good 3051 * and bad sectors, we just continue to the next bvec. 3052 */ 3053 submit_read_repair(inode, bio, bio_offset, page, 3054 start - page_offset(page), start, 3055 end, mirror, error_bitmap, 3056 btrfs_submit_data_bio); 3057 3058 ASSERT(bio_offset + len > bio_offset); 3059 bio_offset += len; 3060 continue; 3061 } else { 3062 struct extent_buffer *eb; 3063 3064 eb = find_extent_buffer_readpage(fs_info, page, start); 3065 set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); 3066 eb->read_mirror = mirror; 3067 atomic_dec(&eb->io_pages); 3068 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, 3069 &eb->bflags)) 3070 btree_readahead_hook(eb, -EIO); 3071 } 3072 readpage_ok: 3073 if (likely(uptodate)) { 3074 loff_t i_size = i_size_read(inode); 3075 pgoff_t end_index = i_size >> PAGE_SHIFT; 3076 3077 /* 3078 * Zero out the remaining part if this range straddles 3079 * i_size. 3080 * 3081 * Here we should only zero the range inside the bvec, 3082 * not touch anything else. 3083 * 3084 * NOTE: i_size is exclusive while end is inclusive. 3085 */ 3086 if (page->index == end_index && i_size <= end) { 3087 u32 zero_start = max(offset_in_page(i_size), 3088 offset_in_page(start)); 3089 3090 zero_user_segment(page, zero_start, 3091 offset_in_page(end) + 1); 3092 } 3093 } 3094 ASSERT(bio_offset + len > bio_offset); 3095 bio_offset += len; 3096 3097 /* Update page status and unlock */ 3098 end_page_read(page, uptodate, start, len); 3099 endio_readpage_release_extent(&processed, BTRFS_I(inode), 3100 start, end, uptodate); 3101 } 3102 /* Release the last extent */ 3103 endio_readpage_release_extent(&processed, NULL, 0, 0, false); 3104 btrfs_io_bio_free_csum(io_bio); 3105 bio_put(bio); 3106 } 3107 3108 /* 3109 * Initialize the members up to but not including 'bio'. Use after allocating a 3110 * new bio by bio_alloc_bioset as it does not initialize the bytes outside of 3111 * 'bio' because use of __GFP_ZERO is not supported. 3112 */ 3113 static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio) 3114 { 3115 memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio)); 3116 } 3117 3118 /* 3119 * The following helpers allocate a bio. As it's backed by a bioset, it'll 3120 * never fail. 
We're returning a plain bio right now, but you can call btrfs_io_bio()
 * for the appropriate container_of magic.
 */
struct bio *btrfs_bio_alloc(u64 first_byte)
{
	struct bio *bio;

	bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, &btrfs_bioset);
	bio->bi_iter.bi_sector = first_byte >> 9;
	btrfs_io_bio_init(btrfs_io_bio(bio));
	return bio;
}

struct bio *btrfs_bio_clone(struct bio *bio)
{
	struct btrfs_io_bio *btrfs_bio;
	struct bio *new;

	/* Bio allocation backed by a bioset does not fail */
	new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset);
	btrfs_bio = btrfs_io_bio(new);
	btrfs_io_bio_init(btrfs_bio);
	btrfs_bio->iter = bio->bi_iter;
	return new;
}

struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
{
	struct bio *bio;

	/* Bio allocation backed by a bioset does not fail */
	bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset);
	btrfs_io_bio_init(btrfs_io_bio(bio));
	return bio;
}

struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
{
	struct bio *bio;
	struct btrfs_io_bio *btrfs_bio;

	/* this will never fail when it's backed by a bioset */
	bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset);
	ASSERT(bio);

	btrfs_bio = btrfs_io_bio(bio);
	btrfs_io_bio_init(btrfs_bio);

	bio_trim(bio, offset >> 9, size >> 9);
	btrfs_bio->iter = bio->bi_iter;
	return bio;
}

/**
 * Attempt to add a page to a bio, considering stripe alignment etc.
 *
 * @bio_ctrl:	  bio control structure holding the current bio and its limits
 * @page:	  page to add to the bio
 * @disk_bytenr:  logical bytenr of the new data, used to check whether we are
 *		  adding a page contiguous to the previous one
 * @size:	  portion of page that we want to write
 * @pg_offset:	  starting offset in the page
 * @bio_flags:	  flags of the current bio, to see if we can merge them
 *
 * Return true if the page was successfully added. Otherwise, return false.
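 *
 * Illustrative usage (editor's sketch, not part of the original file):
 * callers keep one bio cached in @bio_ctrl and fall back to submitting it
 * when the page can't be merged, roughly:
 *
 *	if (!btrfs_bio_add_page(bio_ctrl, page, disk_bytenr, size,
 *				pg_offset, bio_flags)) {
 *		submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->bio_flags);
 *		bio_ctrl->bio = NULL;	/* a fresh bio is allocated next */
 *	}
 *
 * which is the pattern submit_extent_page() below follows.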
3189 */ 3190 static bool btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, 3191 struct page *page, 3192 u64 disk_bytenr, unsigned int size, 3193 unsigned int pg_offset, 3194 unsigned long bio_flags) 3195 { 3196 struct bio *bio = bio_ctrl->bio; 3197 u32 bio_size = bio->bi_iter.bi_size; 3198 const sector_t sector = disk_bytenr >> SECTOR_SHIFT; 3199 bool contig; 3200 int ret; 3201 3202 ASSERT(bio); 3203 /* The limit should be calculated when bio_ctrl->bio is allocated */ 3204 ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary); 3205 if (bio_ctrl->bio_flags != bio_flags) 3206 return false; 3207 3208 if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) 3209 contig = bio->bi_iter.bi_sector == sector; 3210 else 3211 contig = bio_end_sector(bio) == sector; 3212 if (!contig) 3213 return false; 3214 3215 if (bio_size + size > bio_ctrl->len_to_oe_boundary || 3216 bio_size + size > bio_ctrl->len_to_stripe_boundary) 3217 return false; 3218 3219 if (bio_op(bio) == REQ_OP_ZONE_APPEND) 3220 ret = bio_add_zone_append_page(bio, page, size, pg_offset); 3221 else 3222 ret = bio_add_page(bio, page, size, pg_offset); 3223 3224 return ret == size; 3225 } 3226 3227 static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, 3228 struct btrfs_inode *inode) 3229 { 3230 struct btrfs_fs_info *fs_info = inode->root->fs_info; 3231 struct btrfs_io_geometry geom; 3232 struct btrfs_ordered_extent *ordered; 3233 struct extent_map *em; 3234 u64 logical = (bio_ctrl->bio->bi_iter.bi_sector << SECTOR_SHIFT); 3235 int ret; 3236 3237 /* 3238 * Pages for compressed extent are never submitted to disk directly, 3239 * thus it has no real boundary, just set them to U32_MAX. 3240 * 3241 * The split happens for real compressed bio, which happens in 3242 * btrfs_submit_compressed_read/write(). 
 */
	if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) {
		bio_ctrl->len_to_oe_boundary = U32_MAX;
		bio_ctrl->len_to_stripe_boundary = U32_MAX;
		return 0;
	}
	em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize);
	if (IS_ERR(em))
		return PTR_ERR(em);
	ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio_ctrl->bio),
				    logical, &geom);
	free_extent_map(em);
	if (ret < 0)
		return ret;
	if (geom.len > U32_MAX)
		bio_ctrl->len_to_stripe_boundary = U32_MAX;
	else
		bio_ctrl->len_to_stripe_boundary = (u32)geom.len;

	if (!btrfs_is_zoned(fs_info) ||
	    bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) {
		bio_ctrl->len_to_oe_boundary = U32_MAX;
		return 0;
	}

	ASSERT(fs_info->max_zone_append_size > 0);
	/* Ordered extent not yet created, so we're good */
	ordered = btrfs_lookup_ordered_extent(inode, logical);
	if (!ordered) {
		bio_ctrl->len_to_oe_boundary = U32_MAX;
		return 0;
	}

	bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX,
		ordered->disk_bytenr + ordered->disk_num_bytes - logical);
	btrfs_put_ordered_extent(ordered);
	return 0;
}

/*
 * @opf:	bio REQ_OP_* and REQ_* flags as one value
 * @wbc:	optional writeback control for io accounting
 * @bio_ctrl:	must be a valid pointer, holds the current bio and its merge
 *		limits; a newly allocated bio will be stored there
 * @page:	page to add to the bio
 * @disk_bytenr: logical bytenr where the write will be
 * @size:	portion of page that we want to write to
 * @pg_offset:	starting offset in the page
 * @end_io_func: end_io callback for new bio
 * @mirror_num:	desired mirror to read/write
 * @bio_flags:	flags of the current bio, to see if we can merge them
 * @force_bio_submit: submit the current bio and allocate a new one for @page,
 *		even if @page could have been merged
 */
static int submit_extent_page(unsigned int opf,
			      struct writeback_control *wbc,
			      struct btrfs_bio_ctrl *bio_ctrl,
			      struct page *page, u64 disk_bytenr,
			      size_t size, unsigned long pg_offset,
			      bio_end_io_t end_io_func,
			      int mirror_num,
			      unsigned long bio_flags,
			      bool force_bio_submit)
{
	int ret = 0;
	struct bio *bio;
	size_t io_size = min_t(size_t, size, PAGE_SIZE);
	struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
	struct extent_io_tree *tree = &inode->io_tree;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;

	ASSERT(bio_ctrl);

	ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE &&
	       pg_offset + size <= PAGE_SIZE);
	if (bio_ctrl->bio) {
		bio = bio_ctrl->bio;
		if (force_bio_submit ||
		    !btrfs_bio_add_page(bio_ctrl, page, disk_bytenr, io_size,
					pg_offset, bio_flags)) {
			ret = submit_one_bio(bio, mirror_num, bio_ctrl->bio_flags);
			bio_ctrl->bio = NULL;
			if (ret < 0)
				return ret;
		} else {
			if (wbc)
				wbc_account_cgroup_owner(wbc, page, io_size);
			return 0;
		}
	}

	bio = btrfs_bio_alloc(disk_bytenr);
	bio_add_page(bio, page, io_size, pg_offset);
	bio->bi_end_io = end_io_func;
	bio->bi_private = tree;
	bio->bi_write_hint = page->mapping->host->i_write_hint;
	bio->bi_opf = opf;
	if (wbc) {
		struct block_device *bdev;

		bdev = fs_info->fs_devices->latest_bdev;
		bio_set_dev(bio, bdev);
		wbc_init_bio(wbc, bio);
		wbc_account_cgroup_owner(wbc, page, io_size);
	}
	if
(btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) { 3349 struct btrfs_device *device; 3350 3351 device = btrfs_zoned_get_device(fs_info, disk_bytenr, io_size); 3352 if (IS_ERR(device)) 3353 return PTR_ERR(device); 3354 3355 btrfs_io_bio(bio)->device = device; 3356 } 3357 3358 bio_ctrl->bio = bio; 3359 bio_ctrl->bio_flags = bio_flags; 3360 ret = calc_bio_boundaries(bio_ctrl, inode); 3361 3362 return ret; 3363 } 3364 3365 static int attach_extent_buffer_page(struct extent_buffer *eb, 3366 struct page *page, 3367 struct btrfs_subpage *prealloc) 3368 { 3369 struct btrfs_fs_info *fs_info = eb->fs_info; 3370 int ret = 0; 3371 3372 /* 3373 * If the page is mapped to btree inode, we should hold the private 3374 * lock to prevent race. 3375 * For cloned or dummy extent buffers, their pages are not mapped and 3376 * will not race with any other ebs. 3377 */ 3378 if (page->mapping) 3379 lockdep_assert_held(&page->mapping->private_lock); 3380 3381 if (fs_info->sectorsize == PAGE_SIZE) { 3382 if (!PagePrivate(page)) 3383 attach_page_private(page, eb); 3384 else 3385 WARN_ON(page->private != (unsigned long)eb); 3386 return 0; 3387 } 3388 3389 /* Already mapped, just free prealloc */ 3390 if (PagePrivate(page)) { 3391 btrfs_free_subpage(prealloc); 3392 return 0; 3393 } 3394 3395 if (prealloc) 3396 /* Has preallocated memory for subpage */ 3397 attach_page_private(page, prealloc); 3398 else 3399 /* Do new allocation to attach subpage */ 3400 ret = btrfs_attach_subpage(fs_info, page, 3401 BTRFS_SUBPAGE_METADATA); 3402 return ret; 3403 } 3404 3405 int set_page_extent_mapped(struct page *page) 3406 { 3407 struct btrfs_fs_info *fs_info; 3408 3409 ASSERT(page->mapping); 3410 3411 if (PagePrivate(page)) 3412 return 0; 3413 3414 fs_info = btrfs_sb(page->mapping->host->i_sb); 3415 3416 if (fs_info->sectorsize < PAGE_SIZE) 3417 return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA); 3418 3419 attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE); 3420 return 0; 3421 } 3422 3423 void clear_page_extent_mapped(struct page *page) 3424 { 3425 struct btrfs_fs_info *fs_info; 3426 3427 ASSERT(page->mapping); 3428 3429 if (!PagePrivate(page)) 3430 return; 3431 3432 fs_info = btrfs_sb(page->mapping->host->i_sb); 3433 if (fs_info->sectorsize < PAGE_SIZE) 3434 return btrfs_detach_subpage(fs_info, page); 3435 3436 detach_page_private(page); 3437 } 3438 3439 static struct extent_map * 3440 __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, 3441 u64 start, u64 len, struct extent_map **em_cached) 3442 { 3443 struct extent_map *em; 3444 3445 if (em_cached && *em_cached) { 3446 em = *em_cached; 3447 if (extent_map_in_tree(em) && start >= em->start && 3448 start < extent_map_end(em)) { 3449 refcount_inc(&em->refs); 3450 return em; 3451 } 3452 3453 free_extent_map(em); 3454 *em_cached = NULL; 3455 } 3456 3457 em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len); 3458 if (em_cached && !IS_ERR_OR_NULL(em)) { 3459 BUG_ON(*em_cached); 3460 refcount_inc(&em->refs); 3461 *em_cached = em; 3462 } 3463 return em; 3464 } 3465 /* 3466 * basic readpage implementation. 
Locked extent state structs are inserted 3467 * into the tree that are removed when the IO is done (by the end_io 3468 * handlers) 3469 * XXX JDM: This needs looking at to ensure proper page locking 3470 * return 0 on success, otherwise return error 3471 */ 3472 int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, 3473 struct btrfs_bio_ctrl *bio_ctrl, 3474 unsigned int read_flags, u64 *prev_em_start) 3475 { 3476 struct inode *inode = page->mapping->host; 3477 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3478 u64 start = page_offset(page); 3479 const u64 end = start + PAGE_SIZE - 1; 3480 u64 cur = start; 3481 u64 extent_offset; 3482 u64 last_byte = i_size_read(inode); 3483 u64 block_start; 3484 u64 cur_end; 3485 struct extent_map *em; 3486 int ret = 0; 3487 int nr = 0; 3488 size_t pg_offset = 0; 3489 size_t iosize; 3490 size_t blocksize = inode->i_sb->s_blocksize; 3491 unsigned long this_bio_flag = 0; 3492 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 3493 3494 ret = set_page_extent_mapped(page); 3495 if (ret < 0) { 3496 unlock_extent(tree, start, end); 3497 btrfs_page_set_error(fs_info, page, start, PAGE_SIZE); 3498 unlock_page(page); 3499 goto out; 3500 } 3501 3502 if (!PageUptodate(page)) { 3503 if (cleancache_get_page(page) == 0) { 3504 BUG_ON(blocksize != PAGE_SIZE); 3505 unlock_extent(tree, start, end); 3506 unlock_page(page); 3507 goto out; 3508 } 3509 } 3510 3511 if (page->index == last_byte >> PAGE_SHIFT) { 3512 size_t zero_offset = offset_in_page(last_byte); 3513 3514 if (zero_offset) { 3515 iosize = PAGE_SIZE - zero_offset; 3516 memzero_page(page, zero_offset, iosize); 3517 flush_dcache_page(page); 3518 } 3519 } 3520 begin_page_read(fs_info, page); 3521 while (cur <= end) { 3522 bool force_bio_submit = false; 3523 u64 disk_bytenr; 3524 3525 if (cur >= last_byte) { 3526 struct extent_state *cached = NULL; 3527 3528 iosize = PAGE_SIZE - pg_offset; 3529 memzero_page(page, pg_offset, iosize); 3530 flush_dcache_page(page); 3531 set_extent_uptodate(tree, cur, cur + iosize - 1, 3532 &cached, GFP_NOFS); 3533 unlock_extent_cached(tree, cur, 3534 cur + iosize - 1, &cached); 3535 end_page_read(page, true, cur, iosize); 3536 break; 3537 } 3538 em = __get_extent_map(inode, page, pg_offset, cur, 3539 end - cur + 1, em_cached); 3540 if (IS_ERR_OR_NULL(em)) { 3541 unlock_extent(tree, cur, end); 3542 end_page_read(page, false, cur, end + 1 - cur); 3543 break; 3544 } 3545 extent_offset = cur - em->start; 3546 BUG_ON(extent_map_end(em) <= cur); 3547 BUG_ON(end < cur); 3548 3549 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 3550 this_bio_flag |= EXTENT_BIO_COMPRESSED; 3551 extent_set_compress_type(&this_bio_flag, 3552 em->compress_type); 3553 } 3554 3555 iosize = min(extent_map_end(em) - cur, end - cur + 1); 3556 cur_end = min(extent_map_end(em) - 1, end); 3557 iosize = ALIGN(iosize, blocksize); 3558 if (this_bio_flag & EXTENT_BIO_COMPRESSED) 3559 disk_bytenr = em->block_start; 3560 else 3561 disk_bytenr = em->block_start + extent_offset; 3562 block_start = em->block_start; 3563 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 3564 block_start = EXTENT_MAP_HOLE; 3565 3566 /* 3567 * If we have a file range that points to a compressed extent 3568 * and it's followed by a consecutive file range that points 3569 * to the same compressed extent (possibly with a different 3570 * offset and/or length, so it either points to the whole extent 3571 * or only part of it), we must make sure we do not submit a 3572 * single bio to populate the pages for the 2 ranges 
because
		 * this makes the compressed extent read zero out the pages
		 * belonging to the 2nd range. Imagine the following scenario:
		 *
		 *  File layout
		 *  [0 - 8K]                     [8K - 24K]
		 *    |                               |
		 *    |                               |
		 * points to extent X,         points to extent X,
		 * offset 4K, length of 8K     offset 0, length 16K
		 *
		 * [extent X, compressed length = 4K uncompressed length = 16K]
		 *
		 * If the bio to read the compressed extent covers both ranges,
		 * it will decompress extent X into the pages belonging to the
		 * first range and then it will stop, zeroing out the remaining
		 * pages that belong to the other range that points to extent X.
		 * So here we make sure we submit 2 bios, one for the first
		 * range and another one for the second range. Both will target
		 * the same physical extent from disk, but we can't currently
		 * make the compressed bio endio callback populate the pages
		 * for both ranges because each compressed bio is tightly
		 * coupled with a single extent map, and each range can have
		 * an extent map with a different offset value relative to the
		 * uncompressed data of our extent and different lengths. This
		 * is a corner case so we prioritize correctness over
		 * non-optimal behavior (submitting 2 bios for the same extent).
		 */
		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
		    prev_em_start && *prev_em_start != (u64)-1 &&
		    *prev_em_start != em->start)
			force_bio_submit = true;

		if (prev_em_start)
			*prev_em_start = em->start;

		free_extent_map(em);
		em = NULL;

		/* we've found a hole, just zero and go on */
		if (block_start == EXTENT_MAP_HOLE) {
			struct extent_state *cached = NULL;

			memzero_page(page, pg_offset, iosize);
			flush_dcache_page(page);

			set_extent_uptodate(tree, cur, cur + iosize - 1,
					    &cached, GFP_NOFS);
			unlock_extent_cached(tree, cur,
					     cur + iosize - 1, &cached);
			end_page_read(page, true, cur, iosize);
			cur = cur + iosize;
			pg_offset += iosize;
			continue;
		}
		/* the get_extent function already copied into the page */
		if (test_range_bit(tree, cur, cur_end,
				   EXTENT_UPTODATE, 1, NULL)) {
			check_page_uptodate(tree, page);
			unlock_extent(tree, cur, cur + iosize - 1);
			end_page_read(page, true, cur, iosize);
			cur = cur + iosize;
			pg_offset += iosize;
			continue;
		}
		/* we have an inline extent but it didn't get marked up
		 * to date.
Error out 3639 */ 3640 if (block_start == EXTENT_MAP_INLINE) { 3641 unlock_extent(tree, cur, cur + iosize - 1); 3642 end_page_read(page, false, cur, iosize); 3643 cur = cur + iosize; 3644 pg_offset += iosize; 3645 continue; 3646 } 3647 3648 ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, 3649 bio_ctrl, page, disk_bytenr, iosize, 3650 pg_offset, 3651 end_bio_extent_readpage, 0, 3652 this_bio_flag, 3653 force_bio_submit); 3654 if (!ret) { 3655 nr++; 3656 } else { 3657 unlock_extent(tree, cur, cur + iosize - 1); 3658 end_page_read(page, false, cur, iosize); 3659 goto out; 3660 } 3661 cur = cur + iosize; 3662 pg_offset += iosize; 3663 } 3664 out: 3665 return ret; 3666 } 3667 3668 static inline void contiguous_readpages(struct page *pages[], int nr_pages, 3669 u64 start, u64 end, 3670 struct extent_map **em_cached, 3671 struct btrfs_bio_ctrl *bio_ctrl, 3672 u64 *prev_em_start) 3673 { 3674 struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host); 3675 int index; 3676 3677 btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); 3678 3679 for (index = 0; index < nr_pages; index++) { 3680 btrfs_do_readpage(pages[index], em_cached, bio_ctrl, 3681 REQ_RAHEAD, prev_em_start); 3682 put_page(pages[index]); 3683 } 3684 } 3685 3686 static void update_nr_written(struct writeback_control *wbc, 3687 unsigned long nr_written) 3688 { 3689 wbc->nr_to_write -= nr_written; 3690 } 3691 3692 /* 3693 * helper for __extent_writepage, doing all of the delayed allocation setup. 3694 * 3695 * This returns 1 if btrfs_run_delalloc_range function did all the work required 3696 * to write the page (copy into inline extent). In this case the IO has 3697 * been started and the page is already unlocked. 3698 * 3699 * This returns 0 if all went well (page still locked) 3700 * This returns < 0 if there were errors (page still locked) 3701 */ 3702 static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, 3703 struct page *page, struct writeback_control *wbc, 3704 u64 delalloc_start, unsigned long *nr_written) 3705 { 3706 u64 page_end = delalloc_start + PAGE_SIZE - 1; 3707 bool found; 3708 u64 delalloc_to_write = 0; 3709 u64 delalloc_end = 0; 3710 int ret; 3711 int page_started = 0; 3712 3713 3714 while (delalloc_end < page_end) { 3715 found = find_lock_delalloc_range(&inode->vfs_inode, page, 3716 &delalloc_start, 3717 &delalloc_end); 3718 if (!found) { 3719 delalloc_start = delalloc_end + 1; 3720 continue; 3721 } 3722 ret = btrfs_run_delalloc_range(inode, page, delalloc_start, 3723 delalloc_end, &page_started, nr_written, wbc); 3724 if (ret) { 3725 SetPageError(page); 3726 /* 3727 * btrfs_run_delalloc_range should return < 0 for error 3728 * but just in case, we use > 0 here meaning the IO is 3729 * started, so we don't want to return > 0 unless 3730 * things are going well. 3731 */ 3732 return ret < 0 ? ret : -EIO; 3733 } 3734 /* 3735 * delalloc_end is already one less than the total length, so 3736 * we don't subtract one from PAGE_SIZE 3737 */ 3738 delalloc_to_write += (delalloc_end - delalloc_start + 3739 PAGE_SIZE) >> PAGE_SHIFT; 3740 delalloc_start = delalloc_end + 1; 3741 } 3742 if (wbc->nr_to_write < delalloc_to_write) { 3743 int thresh = 8192; 3744 3745 if (delalloc_to_write < thresh * 2) 3746 thresh = delalloc_to_write; 3747 wbc->nr_to_write = min_t(u64, delalloc_to_write, 3748 thresh); 3749 } 3750 3751 /* did the fill delalloc function already unlock and start 3752 * the IO? 
3753 */ 3754 if (page_started) { 3755 /* 3756 * we've unlocked the page, so we can't update 3757 * the mapping's writeback index, just update 3758 * nr_to_write. 3759 */ 3760 wbc->nr_to_write -= *nr_written; 3761 return 1; 3762 } 3763 3764 return 0; 3765 } 3766 3767 /* 3768 * Find the first byte we need to write. 3769 * 3770 * For subpage, one page can contain several sectors, and 3771 * __extent_writepage_io() will just grab all extent maps in the page 3772 * range and try to submit all non-inline/non-compressed extents. 3773 * 3774 * This is a big problem for subpage, we shouldn't re-submit already written 3775 * data at all. 3776 * This function will lookup subpage dirty bit to find which range we really 3777 * need to submit. 3778 * 3779 * Return the next dirty range in [@start, @end). 3780 * If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE. 3781 */ 3782 static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, 3783 struct page *page, u64 *start, u64 *end) 3784 { 3785 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; 3786 u64 orig_start = *start; 3787 /* Declare as unsigned long so we can use bitmap ops */ 3788 unsigned long dirty_bitmap; 3789 unsigned long flags; 3790 int nbits = (orig_start - page_offset(page)) >> fs_info->sectorsize_bits; 3791 int range_start_bit = nbits; 3792 int range_end_bit; 3793 3794 /* 3795 * For regular sector size == page size case, since one page only 3796 * contains one sector, we return the page offset directly. 3797 */ 3798 if (fs_info->sectorsize == PAGE_SIZE) { 3799 *start = page_offset(page); 3800 *end = page_offset(page) + PAGE_SIZE; 3801 return; 3802 } 3803 3804 /* We should have the page locked, but just in case */ 3805 spin_lock_irqsave(&subpage->lock, flags); 3806 dirty_bitmap = subpage->dirty_bitmap; 3807 spin_unlock_irqrestore(&subpage->lock, flags); 3808 3809 bitmap_next_set_region(&dirty_bitmap, &range_start_bit, &range_end_bit, 3810 BTRFS_SUBPAGE_BITMAP_SIZE); 3811 *start = page_offset(page) + range_start_bit * fs_info->sectorsize; 3812 *end = page_offset(page) + range_end_bit * fs_info->sectorsize; 3813 } 3814 3815 /* 3816 * helper for __extent_writepage. This calls the writepage start hooks, 3817 * and does the loop to map the page into extents and bios. 
3818 * 3819 * We return 1 if the IO is started and the page is unlocked, 3820 * 0 if all went well (page still locked) 3821 * < 0 if there were errors (page still locked) 3822 */ 3823 static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, 3824 struct page *page, 3825 struct writeback_control *wbc, 3826 struct extent_page_data *epd, 3827 loff_t i_size, 3828 unsigned long nr_written, 3829 int *nr_ret) 3830 { 3831 struct btrfs_fs_info *fs_info = inode->root->fs_info; 3832 u64 start = page_offset(page); 3833 u64 end = start + PAGE_SIZE - 1; 3834 u64 cur = start; 3835 u64 extent_offset; 3836 u64 block_start; 3837 struct extent_map *em; 3838 int ret = 0; 3839 int nr = 0; 3840 u32 opf = REQ_OP_WRITE; 3841 const unsigned int write_flags = wbc_to_write_flags(wbc); 3842 bool compressed; 3843 3844 ret = btrfs_writepage_cow_fixup(page, start, end); 3845 if (ret) { 3846 /* Fixup worker will requeue */ 3847 redirty_page_for_writepage(wbc, page); 3848 update_nr_written(wbc, nr_written); 3849 unlock_page(page); 3850 return 1; 3851 } 3852 3853 /* 3854 * we don't want to touch the inode after unlocking the page, 3855 * so we update the mapping writeback index now 3856 */ 3857 update_nr_written(wbc, nr_written + 1); 3858 3859 while (cur <= end) { 3860 u64 disk_bytenr; 3861 u64 em_end; 3862 u64 dirty_range_start = cur; 3863 u64 dirty_range_end; 3864 u32 iosize; 3865 3866 if (cur >= i_size) { 3867 btrfs_writepage_endio_finish_ordered(inode, page, cur, 3868 end, 1); 3869 break; 3870 } 3871 3872 find_next_dirty_byte(fs_info, page, &dirty_range_start, 3873 &dirty_range_end); 3874 if (cur < dirty_range_start) { 3875 cur = dirty_range_start; 3876 continue; 3877 } 3878 3879 em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1); 3880 if (IS_ERR_OR_NULL(em)) { 3881 btrfs_page_set_error(fs_info, page, cur, end - cur + 1); 3882 ret = PTR_ERR_OR_ZERO(em); 3883 break; 3884 } 3885 3886 extent_offset = cur - em->start; 3887 em_end = extent_map_end(em); 3888 ASSERT(cur <= em_end); 3889 ASSERT(cur < end); 3890 ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize)); 3891 ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize)); 3892 block_start = em->block_start; 3893 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 3894 disk_bytenr = em->block_start + extent_offset; 3895 3896 /* 3897 * Note that em_end from extent_map_end() and dirty_range_end from 3898 * find_next_dirty_byte() are all exclusive 3899 */ 3900 iosize = min(min(em_end, end + 1), dirty_range_end) - cur; 3901 3902 if (btrfs_use_zone_append(inode, em->block_start)) 3903 opf = REQ_OP_ZONE_APPEND; 3904 3905 free_extent_map(em); 3906 em = NULL; 3907 3908 /* 3909 * compressed and inline extents are written through other 3910 * paths in the FS 3911 */ 3912 if (compressed || block_start == EXTENT_MAP_HOLE || 3913 block_start == EXTENT_MAP_INLINE) { 3914 if (compressed) 3915 nr++; 3916 else 3917 btrfs_writepage_endio_finish_ordered(inode, 3918 page, cur, cur + iosize - 1, 1); 3919 cur += iosize; 3920 continue; 3921 } 3922 3923 btrfs_set_range_writeback(inode, cur, cur + iosize - 1); 3924 if (!PageWriteback(page)) { 3925 btrfs_err(inode->root->fs_info, 3926 "page %lu not writeback, cur %llu end %llu", 3927 page->index, cur, end); 3928 } 3929 3930 /* 3931 * Although the PageDirty bit is cleared before entering this 3932 * function, subpage dirty bit is not cleared. 3933 * So clear subpage dirty bit here so next time we won't submit 3934 * page for range already written to disk. 
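		 *
		 * (Editor's note, illustrative: with a 4K sectorsize on a 64K
		 * page, a single dirty sector at page offset 8K makes
		 * find_next_dirty_byte() return the range [8K, 12K), and only
		 * that range is submitted again.)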
		btrfs_page_clear_dirty(fs_info, page, cur, iosize);

		ret = submit_extent_page(opf | write_flags, wbc,
					 &epd->bio_ctrl, page,
					 disk_bytenr, iosize,
					 cur - page_offset(page),
					 end_bio_extent_writepage,
					 0, 0, false);
		if (ret) {
			btrfs_page_set_error(fs_info, page, cur, iosize);
			if (PageWriteback(page))
				btrfs_page_clear_writeback(fs_info, page, cur,
							   iosize);
		}

		cur += iosize;
		nr++;
	}
	*nr_ret = nr;
	return ret;
}

/*
 * The writepage semantics are similar to regular writepage.  Extent
 * records are inserted to lock ranges in the tree, and as dirty areas
 * are found, they are marked writeback.  Then the lock bits are removed
 * and the end_io handler clears the writeback ranges.
 *
 * Return 0 if everything goes well.
 * Return <0 for error.
 */
static int __extent_writepage(struct page *page, struct writeback_control *wbc,
			      struct extent_page_data *epd)
{
	struct inode *inode = page->mapping->host;
	u64 start = page_offset(page);
	u64 page_end = start + PAGE_SIZE - 1;
	int ret;
	int nr = 0;
	size_t pg_offset;
	loff_t i_size = i_size_read(inode);
	unsigned long end_index = i_size >> PAGE_SHIFT;
	unsigned long nr_written = 0;

	trace___extent_writepage(page, inode, wbc);

	WARN_ON(!PageLocked(page));

	ClearPageError(page);

	pg_offset = offset_in_page(i_size);
	if (page->index > end_index ||
	    (page->index == end_index && !pg_offset)) {
		page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
		unlock_page(page);
		return 0;
	}

	if (page->index == end_index) {
		memzero_page(page, pg_offset, PAGE_SIZE - pg_offset);
		flush_dcache_page(page);
	}

	ret = set_page_extent_mapped(page);
	if (ret < 0) {
		SetPageError(page);
		goto done;
	}

	if (!epd->extent_locked) {
		ret = writepage_delalloc(BTRFS_I(inode), page, wbc, start,
					 &nr_written);
		if (ret == 1)
			return 0;
		if (ret)
			goto done;
	}

	ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size,
				    nr_written, &nr);
	if (ret == 1)
		return 0;

done:
	if (nr == 0) {
		/* Make sure the mapping tag for page dirty gets cleared */
		set_page_writeback(page);
		end_page_writeback(page);
	}
	if (PageError(page)) {
		ret = ret < 0 ? ret : -EIO;
		end_extent_writepage(page, ret, start, page_end);
	}
	unlock_page(page);
	ASSERT(ret <= 0);
	return ret;
}

void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
{
	wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
		       TASK_UNINTERRUPTIBLE);
}

static void end_extent_buffer_writeback(struct extent_buffer *eb)
{
	clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
	smp_mb__after_atomic();
	wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
}
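
/*
 * The two helpers above pair up as a standard wait_on_bit protocol: the
 * submitter sets EXTENT_BUFFER_WRITEBACK, the endio path clears it via
 * end_extent_buffer_writeback(), and waiters sleep in
 * wait_on_extent_buffer_writeback().  The smp_mb__after_atomic() in the
 * waker orders the clear_bit() against the waitqueue check done by
 * wake_up_bit(), so a waiter that just observed the bit set cannot miss
 * the wakeup.  A sketch of the waiter side, equivalent to the loop in
 * lock_extent_buffer_for_io() below:
 *
 *	while (1) {
 *		wait_on_extent_buffer_writeback(eb);
 *		btrfs_tree_lock(eb);
 *		if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
 *			break;
 *		btrfs_tree_unlock(eb);
 *	}
 */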
/*
 * Lock extent buffer status and pages for writeback.
 *
 * May try to flush the write bio if we can't get the lock.
 *
 * Return  0 if the extent buffer doesn't need to be submitted.
 *           (E.g. the extent buffer is not dirty)
 * Return >0 if the extent buffer is submitted to a bio.
 * Return <0 if something went wrong, no page is locked.
 */
static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb,
			  struct extent_page_data *epd)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	int i, num_pages, failed_page_nr;
	int flush = 0;
	int ret = 0;

	if (!btrfs_try_tree_write_lock(eb)) {
		ret = flush_write_bio(epd);
		if (ret < 0)
			return ret;
		flush = 1;
		btrfs_tree_lock(eb);
	}

	if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
		btrfs_tree_unlock(eb);
		if (!epd->sync_io)
			return 0;
		if (!flush) {
			ret = flush_write_bio(epd);
			if (ret < 0)
				return ret;
			flush = 1;
		}
		while (1) {
			wait_on_extent_buffer_writeback(eb);
			btrfs_tree_lock(eb);
			if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
				break;
			btrfs_tree_unlock(eb);
		}
	}

	/*
	 * We need to do this to prevent races in people who check if the eb is
	 * under IO since we can end up having no IO bits set for a short period
	 * of time.
	 */
	spin_lock(&eb->refs_lock);
	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
		set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
		spin_unlock(&eb->refs_lock);
		btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
		percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
					 -eb->len,
					 fs_info->dirty_metadata_batch);
		ret = 1;
	} else {
		spin_unlock(&eb->refs_lock);
	}

	btrfs_tree_unlock(eb);

	/*
	 * Either we don't need to submit any tree block, or we're submitting
	 * a subpage eb.
	 * Subpage metadata doesn't use page locking at all, so we can skip
	 * the page locking.
	 */
	if (!ret || fs_info->sectorsize < PAGE_SIZE)
		return ret;

	num_pages = num_extent_pages(eb);
	for (i = 0; i < num_pages; i++) {
		struct page *p = eb->pages[i];

		if (!trylock_page(p)) {
			if (!flush) {
				int err;

				err = flush_write_bio(epd);
				if (err < 0) {
					ret = err;
					failed_page_nr = i;
					goto err_unlock;
				}
				flush = 1;
			}
			lock_page(p);
		}
	}

	return ret;
err_unlock:
	/* Unlock already locked pages */
	for (i = 0; i < failed_page_nr; i++)
		unlock_page(eb->pages[i]);
	/*
	 * Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it.
	 * Also set back EXTENT_BUFFER_DIRTY so future attempts to this eb can
	 * be made and undo everything done before.
	 */
	btrfs_tree_lock(eb);
	spin_lock(&eb->refs_lock);
	set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
	end_extent_buffer_writeback(eb);
	spin_unlock(&eb->refs_lock);
	percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len,
				 fs_info->dirty_metadata_batch);
	btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
	btrfs_tree_unlock(eb);
	return ret;
}

static void set_btree_ioerr(struct page *page, struct extent_buffer *eb)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;

	btrfs_page_set_error(fs_info, page, eb->start, eb->len);
	if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
		return;

	/*
	 * If we error out, we should add back the dirty_metadata_bytes
	 * to make it consistent.
	 */
	percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
				 eb->len, fs_info->dirty_metadata_batch);

	/*
	 * If writeback for a btree extent that doesn't belong to a log tree
	 * failed, increment the counter transaction->eb_write_errors.
	 * We do this because while the transaction is running and before it's
	 * committing (when we call filemap_fdata[write|wait]_range against
	 * the btree inode), we might have
	 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
	 * returns an error or an error happens during writeback, when we're
	 * committing the transaction we wouldn't know about it, since the pages
	 * can be no longer dirty nor marked anymore for writeback (if a
	 * subsequent modification to the extent buffer didn't happen before the
	 * transaction commit), which makes filemap_fdata[write|wait]_range not
	 * able to find the pages tagged with SetPageError at transaction
	 * commit time.  So if this happens we must abort the transaction,
	 * otherwise we commit a super block with btree roots that point to
	 * btree nodes/leafs whose content on disk is invalid - either garbage
	 * or the content of some node/leaf from a past generation that got
	 * cowed or deleted and is no longer valid.
	 *
	 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
	 * not be enough - we need to distinguish between log tree extents vs
	 * non-log tree extents, and the next filemap_fdatawait_range() call
	 * will catch and clear such errors in the mapping - and that call might
	 * be from a log sync and not from a transaction commit.  Also, checking
	 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
	 * not done and would not be reliable - the eb might have been released
	 * from memory and reading it back again means that flag would not be
	 * set (since it's a runtime flag, not persisted on disk).
	 *
	 * Using the flags below in the btree inode also covers the case where
	 * writepages() returns success after starting writeback for all dirty
	 * pages, but the writeback finishes with errors before
	 * filemap_fdatawait_range() is called - because we were not using
	 * AS_EIO/AS_ENOSPC, filemap_fdatawait_range() would return success, as
	 * it could not know that writeback errors happened (the pages were no
	 * longer tagged for writeback).
	 */
	switch (eb->log_index) {
	case -1:
		set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags);
		break;
	case 0:
		set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
		break;
	case 1:
		set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
		break;
	default:
		BUG(); /* unexpected, logic error */
	}
}

/*
 * The endio specific version which won't touch any unsafe spinlock in endio
 * context.
 */
static struct extent_buffer *find_extent_buffer_nolock(
		struct btrfs_fs_info *fs_info, u64 start)
{
	struct extent_buffer *eb;

	rcu_read_lock();
	eb = radix_tree_lookup(&fs_info->buffer_radix,
			       start >> fs_info->sectorsize_bits);
	if (eb && atomic_inc_not_zero(&eb->refs)) {
		rcu_read_unlock();
		return eb;
	}
	rcu_read_unlock();
	return NULL;
}
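
/*
 * The lookup above is the usual RCU + refcount pattern: the radix tree is
 * walked under rcu_read_lock() and an eb is only returned if its refcount
 * could be raised from a non-zero value, so a buffer concurrently on its
 * way to being freed is never handed out.  The caller owns a reference
 * afterwards; in endio context it must be dropped with a bare
 * atomic_dec(&eb->refs) rather than free_extent_buffer(), which may take
 * eb->refs_lock.  A sketch of the expected use:
 *
 *	eb = find_extent_buffer_nolock(fs_info, bytenr);
 *	if (eb) {
 *		(use eb)
 *		atomic_dec(&eb->refs);
 *	}
 */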
/*
 * The endio function for subpage extent buffer write.
 *
 * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback()
 * after all extent buffers in the page have finished their writeback.
 */
static void end_bio_subpage_eb_writepage(struct bio *bio)
{
	struct btrfs_fs_info *fs_info;
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;

	fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb);
	ASSERT(fs_info->sectorsize < PAGE_SIZE);

	ASSERT(!bio_flagged(bio, BIO_CLONED));
	bio_for_each_segment_all(bvec, bio, iter_all) {
		struct page *page = bvec->bv_page;
		u64 bvec_start = page_offset(page) + bvec->bv_offset;
		u64 bvec_end = bvec_start + bvec->bv_len - 1;
		u64 cur_bytenr = bvec_start;

		ASSERT(IS_ALIGNED(bvec->bv_len, fs_info->nodesize));

		/* Iterate through all extent buffers in the range */
		while (cur_bytenr <= bvec_end) {
			struct extent_buffer *eb;
			int done;

			/*
			 * Here we can't use find_extent_buffer(), as it may
			 * try to lock eb->refs_lock, which is not safe in
			 * endio context.
			 */
			eb = find_extent_buffer_nolock(fs_info, cur_bytenr);
			ASSERT(eb);

			cur_bytenr = eb->start + eb->len;

			ASSERT(test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags));
			done = atomic_dec_and_test(&eb->io_pages);
			ASSERT(done);

			if (bio->bi_status ||
			    test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
				ClearPageUptodate(page);
				set_btree_ioerr(page, eb);
			}

			btrfs_subpage_clear_writeback(fs_info, page, eb->start,
						      eb->len);
			end_extent_buffer_writeback(eb);
			/*
			 * free_extent_buffer() will grab a spinlock which is
			 * not safe in endio context.  Thus we manually
			 * decrement the ref here.
			 */
			atomic_dec(&eb->refs);
		}
	}
	bio_put(bio);
}

static void end_bio_extent_buffer_writepage(struct bio *bio)
{
	struct bio_vec *bvec;
	struct extent_buffer *eb;
	int done;
	struct bvec_iter_all iter_all;

	ASSERT(!bio_flagged(bio, BIO_CLONED));
	bio_for_each_segment_all(bvec, bio, iter_all) {
		struct page *page = bvec->bv_page;

		eb = (struct extent_buffer *)page->private;
		BUG_ON(!eb);
		done = atomic_dec_and_test(&eb->io_pages);

		if (bio->bi_status ||
		    test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
			ClearPageUptodate(page);
			set_btree_ioerr(page, eb);
		}

		end_page_writeback(page);

		if (!done)
			continue;

		end_extent_buffer_writeback(eb);
	}

	bio_put(bio);
}

static void prepare_eb_write(struct extent_buffer *eb)
{
	u32 nritems;
	unsigned long start;
	unsigned long end;

	clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
	atomic_set(&eb->io_pages, num_extent_pages(eb));

	/* Set btree blocks beyond nritems with 0 to avoid stale content */
	nritems = btrfs_header_nritems(eb);
	if (btrfs_header_level(eb) > 0) {
		end = btrfs_node_key_ptr_offset(nritems);
		memzero_extent_buffer(eb, end, eb->len - end);
	} else {
		/*
		 * Leaf:
		 * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
		 */
		start = btrfs_item_nr_offset(nritems);
		end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb);
		memzero_extent_buffer(eb, start, end - start);
	}
}
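
/*
 * A worked example of the zeroing in prepare_eb_write(), assuming a leaf
 * with nritems == 2: item headers grow forward from the btrfs_header while
 * item data is filled from the end of the buffer backwards (see the diagram
 * above), so the unused middle of the leaf is
 * [btrfs_item_nr_offset(2), BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb))
 * and only that gap is zeroed; the header, the two item headers and the
 * item data at the tail are written out as is.  For a node, everything from
 * btrfs_node_key_ptr_offset(nritems) up to eb->len is zeroed.
 */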
/*
 * Unlike the work in write_one_eb(), we rely completely on extent locking.
 * Page locking is only utilized at minimum to keep the VMM code happy.
 */
static int write_one_subpage_eb(struct extent_buffer *eb,
				struct writeback_control *wbc,
				struct extent_page_data *epd)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	struct page *page = eb->pages[0];
	unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
	bool no_dirty_ebs = false;
	int ret;

	prepare_eb_write(eb);

	/* clear_page_dirty_for_io() in the subpage helper needs the page locked */
	lock_page(page);
	btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len);

	/* Check if this is the last dirty bit to update nr_written */
	no_dirty_ebs = btrfs_subpage_clear_and_test_dirty(fs_info, page,
							  eb->start, eb->len);
	if (no_dirty_ebs)
		clear_page_dirty_for_io(page);

	ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
			&epd->bio_ctrl, page, eb->start, eb->len,
			eb->start - page_offset(page),
			end_bio_subpage_eb_writepage, 0, 0, false);
	if (ret) {
		btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len);
		set_btree_ioerr(page, eb);
		unlock_page(page);

		if (atomic_dec_and_test(&eb->io_pages))
			end_extent_buffer_writeback(eb);
		return -EIO;
	}
	unlock_page(page);
	/*
	 * Submission finished without problem; if no range of the page is
	 * dirty anymore, we have submitted a page.  Update nr_written in wbc.
	 */
	if (no_dirty_ebs)
		update_nr_written(wbc, 1);
	return ret;
}

static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
			struct writeback_control *wbc,
			struct extent_page_data *epd)
{
	u64 disk_bytenr = eb->start;
	int i, num_pages;
	unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
	int ret = 0;

	prepare_eb_write(eb);

	num_pages = num_extent_pages(eb);
	for (i = 0; i < num_pages; i++) {
		struct page *p = eb->pages[i];

		clear_page_dirty_for_io(p);
		set_page_writeback(p);
		ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
					 &epd->bio_ctrl, p, disk_bytenr,
					 PAGE_SIZE, 0,
					 end_bio_extent_buffer_writepage,
					 0, 0, false);
		if (ret) {
			set_btree_ioerr(p, eb);
			if (PageWriteback(p))
				end_page_writeback(p);
			if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
				end_extent_buffer_writeback(eb);
			ret = -EIO;
			break;
		}
		disk_bytenr += PAGE_SIZE;
		update_nr_written(wbc, 1);
		unlock_page(p);
	}

	if (unlikely(ret)) {
		for (; i < num_pages; i++) {
			struct page *p = eb->pages[i];

			clear_page_dirty_for_io(p);
			unlock_page(p);
		}
	}

	return ret;
}
/*
 * Submit one subpage btree page.
 *
 * The main difference to submit_eb_page() is:
 * - Page locking
 *   For subpage, we don't rely on page locking at all.
 *
 * - Flush write bio
 *   We only flush the bio if we may be unable to fit current extent buffers
 *   into the current bio.
 *
 * Return >=0 for the number of submitted extent buffers.
 * Return <0 for fatal error.
 */
static int submit_eb_subpage(struct page *page,
			     struct writeback_control *wbc,
			     struct extent_page_data *epd)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
	int submitted = 0;
	u64 page_start = page_offset(page);
	int bit_start = 0;
	const int nbits = BTRFS_SUBPAGE_BITMAP_SIZE;
	int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
	int ret;

	/* Lock and write each dirty extent buffer in the range */
	while (bit_start < nbits) {
		struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
		struct extent_buffer *eb;
		unsigned long flags;
		u64 start;

		/*
		 * Take the private lock to ensure the subpage won't be
		 * detached in the meantime.
		 */
		spin_lock(&page->mapping->private_lock);
		if (!PagePrivate(page)) {
			spin_unlock(&page->mapping->private_lock);
			break;
		}
		spin_lock_irqsave(&subpage->lock, flags);
		if (!((1 << bit_start) & subpage->dirty_bitmap)) {
			spin_unlock_irqrestore(&subpage->lock, flags);
			spin_unlock(&page->mapping->private_lock);
			bit_start++;
			continue;
		}

		start = page_start + bit_start * fs_info->sectorsize;
		bit_start += sectors_per_node;

		/*
		 * Here we just want to grab the eb without touching extra
		 * spin locks, so call find_extent_buffer_nolock().
		 */
		eb = find_extent_buffer_nolock(fs_info, start);
		spin_unlock_irqrestore(&subpage->lock, flags);
		spin_unlock(&page->mapping->private_lock);

		/*
		 * The eb has already reached 0 refs thus find_extent_buffer()
		 * doesn't return it.  We don't need to write back such an eb
		 * anyway.
		 */
		if (!eb)
			continue;

		ret = lock_extent_buffer_for_io(eb, epd);
		if (ret == 0) {
			free_extent_buffer(eb);
			continue;
		}
		if (ret < 0) {
			free_extent_buffer(eb);
			goto cleanup;
		}
		ret = write_one_subpage_eb(eb, wbc, epd);
		free_extent_buffer(eb);
		if (ret < 0)
			goto cleanup;
		submitted++;
	}
	return submitted;

cleanup:
	/* We hit an error, end the bio for the submitted extent buffers */
	end_write_bio(epd, ret);
	return ret;
}
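
/*
 * A sketch of the caller contract shared by submit_eb_subpage() above and
 * submit_eb_page() below; this mirrors the page walk in
 * btree_write_cache_pages():
 *
 *	ret = submit_eb_page(page, wbc, epd, &eb_context);
 *	if (ret == 0)
 *		continue;	(nothing to submit for this page)
 *	if (ret < 0) {
 *		done = 1;	(fatal error, stop the walk)
 *		break;
 *	}
 *	(ret > 0: extent buffer(s) submitted, re-check wbc->nr_to_write)
 */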
/*
 * Submit all page(s) of one extent buffer.
 *
 * @page:	the page of one extent buffer
 * @eb_context:	to determine if we need to submit this page, if current page
 *		belongs to this eb, we don't need to submit
 *
 * The caller should pass each page in their bytenr order, and here we use
 * @eb_context to determine if we have submitted pages of one extent buffer.
 *
 * If we have, we just skip until we hit a new page that doesn't belong to
 * current @eb_context.
 *
 * If not, we submit all the page(s) of the extent buffer.
 *
 * Return >0 if we have submitted the extent buffer successfully.
 * Return 0 if we don't need to submit the page, as it's already submitted by
 * a previous call.
 * Return <0 for fatal error.
 */
static int submit_eb_page(struct page *page, struct writeback_control *wbc,
			  struct extent_page_data *epd,
			  struct extent_buffer **eb_context)
{
	struct address_space *mapping = page->mapping;
	struct btrfs_block_group *cache = NULL;
	struct extent_buffer *eb;
	int ret;

	if (!PagePrivate(page))
		return 0;

	if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
		return submit_eb_subpage(page, wbc, epd);

	spin_lock(&mapping->private_lock);
	if (!PagePrivate(page)) {
		spin_unlock(&mapping->private_lock);
		return 0;
	}

	eb = (struct extent_buffer *)page->private;

	/*
	 * Shouldn't happen and normally this would be a BUG_ON but no point
	 * crashing the machine for something we can survive anyway.
	 */
	if (WARN_ON(!eb)) {
		spin_unlock(&mapping->private_lock);
		return 0;
	}

	if (eb == *eb_context) {
		spin_unlock(&mapping->private_lock);
		return 0;
	}
	ret = atomic_inc_not_zero(&eb->refs);
	spin_unlock(&mapping->private_lock);
	if (!ret)
		return 0;

	if (!btrfs_check_meta_write_pointer(eb->fs_info, eb, &cache)) {
		/*
		 * If for_sync, this hole will be filled by a transaction
		 * commit.
		 */
		if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
			ret = -EAGAIN;
		else
			ret = 0;
		free_extent_buffer(eb);
		return ret;
	}

	*eb_context = eb;

	ret = lock_extent_buffer_for_io(eb, epd);
	if (ret <= 0) {
		btrfs_revert_meta_write_pointer(cache, eb);
		if (cache)
			btrfs_put_block_group(cache);
		free_extent_buffer(eb);
		return ret;
	}
	if (cache)
		btrfs_put_block_group(cache);
	ret = write_one_eb(eb, wbc, epd);
	free_extent_buffer(eb);
	if (ret < 0)
		return ret;
	return 1;
}

int btree_write_cache_pages(struct address_space *mapping,
			    struct writeback_control *wbc)
{
	struct extent_buffer *eb_context = NULL;
	struct extent_page_data epd = {
		.bio_ctrl = { 0 },
		.extent_locked = 0,
		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
	};
	struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
	int ret = 0;
	int done = 0;
	int nr_to_write_done = 0;
	struct pagevec pvec;
	int nr_pages;
	pgoff_t index;
	pgoff_t end;		/* Inclusive */
	int scanned = 0;
	xa_mark_t tag;

	pagevec_init(&pvec);
	if (wbc->range_cyclic) {
		index = mapping->writeback_index; /* Start from prev offset */
		end = -1;
		/*
		 * Start from the beginning does not need to cycle over the
		 * range, mark it as scanned.
		 */
		scanned = (index == 0);
	} else {
		index = wbc->range_start >> PAGE_SHIFT;
		end = wbc->range_end >> PAGE_SHIFT;
		scanned = 1;
	}
	if (wbc->sync_mode == WB_SYNC_ALL)
		tag = PAGECACHE_TAG_TOWRITE;
	else
		tag = PAGECACHE_TAG_DIRTY;
	btrfs_zoned_meta_io_lock(fs_info);
retry:
	if (wbc->sync_mode == WB_SYNC_ALL)
		tag_pages_for_writeback(mapping, index, end);
	while (!done && !nr_to_write_done && (index <= end) &&
	       (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
			tag))) {
		unsigned i;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			ret = submit_eb_page(page, wbc, &epd, &eb_context);
			if (ret == 0)
				continue;
			if (ret < 0) {
				done = 1;
				break;
			}

			/*
			 * The filesystem may choose to bump up nr_to_write.
			 * We have to make sure to honor the new nr_to_write
			 * at any time.
			 */
			nr_to_write_done = wbc->nr_to_write <= 0;
		}
		pagevec_release(&pvec);
		cond_resched();
	}
	if (!scanned && !done) {
		/*
		 * We hit the last page and there is more work to be done: wrap
		 * back to the start of the file.
		 */
		scanned = 1;
		index = 0;
		goto retry;
	}
	if (ret < 0) {
		end_write_bio(&epd, ret);
		goto out;
	}
	/*
	 * If something went wrong, don't allow any metadata write bio to be
	 * submitted.
	 *
	 * This would prevent use-after-free if we had dirty pages not
	 * cleaned up, which can still happen with fuzzed images.
	 *
	 * - Bad extent tree
	 *   Allowing an existing tree block to be allocated for other trees.
	 *
	 * - Log tree operations
	 *   Existing tree blocks get allocated to the log tree, bump its
	 *   generation, then get cleaned in tree re-balance.
	 *   Such a tree block will not be written back, since it's clean,
	 *   thus no WRITTEN flag is set.
	 *   And after the log writes back, this tree block is not tracked by
	 *   any dirty extent_io_tree.
	 *
	 * - Offending tree block gets re-dirtied from its original owner
	 *   Since it has a bumped generation and no WRITTEN flag, it can be
	 *   reused without COWing.  This tree block will not be tracked by
	 *   btrfs_transaction::dirty_pages.
	 *
	 * Now such a dirty tree block will not be cleaned by any dirty
	 * extent io tree.  Thus we don't want to submit such a wild eb
	 * if the fs already has errors.
	 */
	if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
		ret = flush_write_bio(&epd);
	} else {
		ret = -EROFS;
		end_write_bio(&epd, ret);
	}
out:
	btrfs_zoned_meta_io_unlock(fs_info);
	return ret;
}
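
/*
 * Note on the tag choice above: for WB_SYNC_ALL, tag_pages_for_writeback()
 * first tags every currently dirty page in [index, end] with
 * PAGECACHE_TAG_TOWRITE, and the walk then looks up TOWRITE rather than
 * DIRTY.  Pages redirtied while the walk is in progress only get the
 * DIRTY tag, so a data-integrity sync cannot livelock behind a workload
 * that keeps redirtying the pages we have already written.
 */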
/**
 * Walk the list of dirty pages of the given address space and write all of
 * them.
 *
 * @mapping: address space structure to write
 * @wbc:     subtract the number of written pages from *@wbc->nr_to_write
 * @epd:     holds context for the write, namely the bio
 *
 * If a page is already under I/O, write_cache_pages() skips it, even
 * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
 * but it is INCORRECT for data-integrity system calls such as fsync().
 * fsync() and msync() need to guarantee that all the data which was dirty at
 * the time the call was made get new I/O started against them.  If
 * wbc->sync_mode is WB_SYNC_ALL then we were called for data integrity and we
 * must wait for existing IO to complete.
 */
static int extent_write_cache_pages(struct address_space *mapping,
			     struct writeback_control *wbc,
			     struct extent_page_data *epd)
{
	struct inode *inode = mapping->host;
	int ret = 0;
	int done = 0;
	int nr_to_write_done = 0;
	struct pagevec pvec;
	int nr_pages;
	pgoff_t index;
	pgoff_t end;		/* Inclusive */
	pgoff_t done_index;
	int range_whole = 0;
	int scanned = 0;
	xa_mark_t tag;

	/*
	 * We have to hold onto the inode so that ordered extents can do their
	 * work when the IO finishes.  The alternative to this is failing to
	 * add an ordered extent if the igrab() fails there and that is a huge
	 * pain to deal with, so instead just hold onto the inode throughout
	 * the writepages operation.  If it fails here we are freeing up the
	 * inode anyway and we'd rather not waste our time writing out stuff
	 * that is going to be truncated anyway.
	 */
	if (!igrab(inode))
		return 0;

	pagevec_init(&pvec);
	if (wbc->range_cyclic) {
		index = mapping->writeback_index; /* Start from prev offset */
		end = -1;
		/*
		 * Start from the beginning does not need to cycle over the
		 * range, mark it as scanned.
		 */
		scanned = (index == 0);
	} else {
		index = wbc->range_start >> PAGE_SHIFT;
		end = wbc->range_end >> PAGE_SHIFT;
		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
			range_whole = 1;
		scanned = 1;
	}

	/*
	 * We do the tagged writepage as long as the snapshot flush bit is set
	 * and we are the first one to do the filemap_flush() on this inode.
	 *
	 * The nr_to_write == LONG_MAX is needed to make sure other flushers do
	 * not race in and drop the bit.
	 */
	if (range_whole && wbc->nr_to_write == LONG_MAX &&
	    test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
			       &BTRFS_I(inode)->runtime_flags))
		wbc->tagged_writepages = 1;

	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
		tag = PAGECACHE_TAG_TOWRITE;
	else
		tag = PAGECACHE_TAG_DIRTY;
retry:
	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
		tag_pages_for_writeback(mapping, index, end);
	done_index = index;
	while (!done && !nr_to_write_done && (index <= end) &&
			(nr_pages = pagevec_lookup_range_tag(&pvec, mapping,
						&index, end, tag))) {
		unsigned i;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			done_index = page->index + 1;
			/*
			 * At this point we hold neither the i_pages lock nor
			 * the page lock: the page may be truncated or
			 * invalidated (changing page->mapping to NULL),
			 * or even swizzled back from swapper_space to
			 * tmpfs file mapping.
			 */
			if (!trylock_page(page)) {
				ret = flush_write_bio(epd);
				BUG_ON(ret < 0);
				lock_page(page);
			}

			if (unlikely(page->mapping != mapping)) {
				unlock_page(page);
				continue;
			}

			if (wbc->sync_mode != WB_SYNC_NONE) {
				if (PageWriteback(page)) {
					ret = flush_write_bio(epd);
					BUG_ON(ret < 0);
				}
				wait_on_page_writeback(page);
			}

			if (PageWriteback(page) ||
			    !clear_page_dirty_for_io(page)) {
				unlock_page(page);
				continue;
			}

			ret = __extent_writepage(page, wbc, epd);
			if (ret < 0) {
				done = 1;
				break;
			}

			/*
			 * The filesystem may choose to bump up nr_to_write.
			 * We have to make sure to honor the new nr_to_write
			 * at any time.
			 */
			nr_to_write_done = wbc->nr_to_write <= 0;
		}
		pagevec_release(&pvec);
		cond_resched();
	}
	if (!scanned && !done) {
		/*
		 * We hit the last page and there is more work to be done: wrap
		 * back to the start of the file.
		 */
		scanned = 1;
		index = 0;

		/*
		 * If we're looping we could run into a page that is locked by
		 * a writer and that writer could be waiting on writeback for a
		 * page in our current bio, and thus deadlock, so flush the
		 * write bio here.
		 */
		ret = flush_write_bio(epd);
		if (!ret)
			goto retry;
	}

	if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
		mapping->writeback_index = done_index;

	btrfs_add_delayed_iput(inode);
	return ret;
}

int extent_write_full_page(struct page *page, struct writeback_control *wbc)
{
	int ret;
	struct extent_page_data epd = {
		.bio_ctrl = { 0 },
		.extent_locked = 0,
		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
	};

	ret = __extent_writepage(page, wbc, &epd);
	ASSERT(ret <= 0);
	if (ret < 0) {
		end_write_bio(&epd, ret);
		return ret;
	}

	ret = flush_write_bio(&epd);
	ASSERT(ret <= 0);
	return ret;
}

int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
			      int mode)
{
	int ret = 0;
	struct address_space *mapping = inode->i_mapping;
	struct page *page;
	unsigned long nr_pages = (end - start + PAGE_SIZE) >>
		PAGE_SHIFT;

	struct extent_page_data epd = {
		.bio_ctrl = { 0 },
		.extent_locked = 1,
		.sync_io = mode == WB_SYNC_ALL,
	};
	struct writeback_control wbc_writepages = {
		.sync_mode	= mode,
		.nr_to_write	= nr_pages * 2,
		.range_start	= start,
		.range_end	= end + 1,
		/* We're called from an async helper function */
		.punt_to_cgroup	= 1,
		.no_cgroup_owner = 1,
	};

	wbc_attach_fdatawrite_inode(&wbc_writepages, inode);
	while (start <= end) {
		page = find_get_page(mapping, start >> PAGE_SHIFT);
		if (clear_page_dirty_for_io(page))
			ret = __extent_writepage(page, &wbc_writepages, &epd);
		else {
			btrfs_writepage_endio_finish_ordered(BTRFS_I(inode),
					page, start, start + PAGE_SIZE - 1, 1);
			unlock_page(page);
		}
		put_page(page);
		start += PAGE_SIZE;
	}

	ASSERT(ret <= 0);
	if (ret == 0)
		ret = flush_write_bio(&epd);
	else
		end_write_bio(&epd, ret);

	wbc_detach_inode(&wbc_writepages);
	return ret;
}

int extent_writepages(struct address_space *mapping,
		      struct writeback_control *wbc)
{
	int ret = 0;
	struct extent_page_data epd = {
		.bio_ctrl = { 0 },
		.extent_locked = 0,
		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
	};

	ret = extent_write_cache_pages(mapping, wbc, &epd);
	ASSERT(ret <= 0);
	if (ret < 0) {
		end_write_bio(&epd, ret);
		return ret;
	}
	ret = flush_write_bio(&epd);
	return ret;
}
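
/*
 * The three entry points above share one contract for the embedded
 * extent_page_data: a pending bio must never be left dangling in
 * epd.bio_ctrl.  On failure it is ended via end_write_bio(), otherwise it
 * is submitted via flush_write_bio().  The minimal calling sequence looks
 * like:
 *
 *	ret = __extent_writepage(page, wbc, &epd);
 *	if (ret < 0)
 *		end_write_bio(&epd, ret);
 *	else
 *		ret = flush_write_bio(&epd);
 */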
void extent_readahead(struct readahead_control *rac)
{
	struct btrfs_bio_ctrl bio_ctrl = { 0 };
	struct page *pagepool[16];
	struct extent_map *em_cached = NULL;
	u64 prev_em_start = (u64)-1;
	int nr;

	while ((nr = readahead_page_batch(rac, pagepool))) {
		u64 contig_start = readahead_pos(rac);
		u64 contig_end = contig_start + readahead_batch_length(rac) - 1;

		contiguous_readpages(pagepool, nr, contig_start, contig_end,
				&em_cached, &bio_ctrl, &prev_em_start);
	}

	if (em_cached)
		free_extent_map(em_cached);

	if (bio_ctrl.bio) {
		if (submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags))
			return;
	}
}

/*
 * basic invalidatepage code, this waits on any locked or writeback
 * ranges corresponding to the page, and then deletes any extent state
 * records from the tree
 */
int extent_invalidatepage(struct extent_io_tree *tree,
			  struct page *page, unsigned long offset)
{
	struct extent_state *cached_state = NULL;
	u64 start = page_offset(page);
	u64 end = start + PAGE_SIZE - 1;
	size_t blocksize = page->mapping->host->i_sb->s_blocksize;

	/* This function is only called for the btree inode */
	ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO);

	start += ALIGN(offset, blocksize);
	if (start > end)
		return 0;

	lock_extent_bits(tree, start, end, &cached_state);
	wait_on_page_writeback(page);

	/*
	 * Currently for btree io tree, only EXTENT_LOCKED is utilized,
	 * so here we only need to unlock the extent range to free any
	 * existing extent state.
	 */
	unlock_extent_cached(tree, start, end, &cached_state);
	return 0;
}

/*
 * a helper for releasepage, this tests for areas of the page that
 * are locked or under IO and drops the related state bits if it is safe
 * to drop the page.
 */
static int try_release_extent_state(struct extent_io_tree *tree,
				    struct page *page, gfp_t mask)
{
	u64 start = page_offset(page);
	u64 end = start + PAGE_SIZE - 1;
	int ret = 1;

	if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
		ret = 0;
	} else {
		/*
		 * At this point we can safely clear everything except the
		 * locked bit, the nodatasum bit and the delalloc new bit.
		 * The delalloc new bit will be cleared by ordered extent
		 * completion.
		 */
		ret = __clear_extent_bit(tree, start, end,
			 ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW),
			 0, 0, NULL, mask, NULL);

		/*
		 * If clear_extent_bit failed for -ENOMEM reasons, we can't
		 * allow the release to continue.
		 */
		if (ret < 0)
			ret = 0;
		else
			ret = 1;
	}
	return ret;
}
/*
 * a helper for releasepage.  As long as there are no locked extents
 * in the range corresponding to the page, both state records and extent
 * map records are removed
 */
int try_release_extent_mapping(struct page *page, gfp_t mask)
{
	struct extent_map *em;
	u64 start = page_offset(page);
	u64 end = start + PAGE_SIZE - 1;
	struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host);
	struct extent_io_tree *tree = &btrfs_inode->io_tree;
	struct extent_map_tree *map = &btrfs_inode->extent_tree;

	if (gfpflags_allow_blocking(mask) &&
	    page->mapping->host->i_size > SZ_16M) {
		u64 len;
		while (start <= end) {
			struct btrfs_fs_info *fs_info;
			u64 cur_gen;

			len = end - start + 1;
			write_lock(&map->lock);
			em = lookup_extent_mapping(map, start, len);
			if (!em) {
				write_unlock(&map->lock);
				break;
			}
			if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
			    em->start != start) {
				write_unlock(&map->lock);
				free_extent_map(em);
				break;
			}
			if (test_range_bit(tree, em->start,
					   extent_map_end(em) - 1,
					   EXTENT_LOCKED, 0, NULL))
				goto next;
			/*
			 * If it's not in the list of modified extents, used
			 * by a fast fsync, we can remove it.  If it's being
			 * logged we can safely remove it since fsync took an
			 * extra reference on the em.
			 */
			if (list_empty(&em->list) ||
			    test_bit(EXTENT_FLAG_LOGGING, &em->flags))
				goto remove_em;
			/*
			 * If it's in the list of modified extents, remove it
			 * only if its generation is older than the current
			 * one, in which case we don't need it for a fast
			 * fsync.  Otherwise don't remove it, we could be
			 * racing with an ongoing fast fsync that could miss
			 * the new extent.
			 */
			fs_info = btrfs_inode->root->fs_info;
			spin_lock(&fs_info->trans_lock);
			cur_gen = fs_info->generation;
			spin_unlock(&fs_info->trans_lock);
			if (em->generation >= cur_gen)
				goto next;
remove_em:
			/*
			 * We only remove extent maps that are not in the list
			 * of modified extents or that are in the list but with
			 * a generation lower than the current generation, so
			 * there is no need to set the full fsync flag on the
			 * inode (it hurts the fsync performance for workloads
			 * with a data size that exceeds or is close to the
			 * system's memory).
			 */
			remove_extent_mapping(map, em);
			/* Once for the rb tree */
			free_extent_map(em);
next:
			start = extent_map_end(em);
			write_unlock(&map->lock);

			/* Once for us */
			free_extent_map(em);

			cond_resched(); /* Allow large-extent preemption. */
		}
	}
	return try_release_extent_state(tree, page, mask);
}
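
/*
 * A concrete example of the generation check above: with the filesystem at
 * generation 10, an extent map on the inode's list of modified extents
 * with em->generation == 10 may still be needed by an ongoing fast fsync
 * and is kept.  A map with generation 9 or lower was created in an earlier
 * transaction, so it is not needed for a fast fsync of the current
 * transaction and can be dropped.
 */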
/*
 * helper function for fiemap, which doesn't want to see any holes.
 * This maps until we find something past 'last'.
 */
static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode,
						u64 offset, u64 last)
{
	u64 sectorsize = btrfs_inode_sectorsize(inode);
	struct extent_map *em;
	u64 len;

	if (offset >= last)
		return NULL;

	while (1) {
		len = last - offset;
		if (len == 0)
			break;
		len = ALIGN(len, sectorsize);
		em = btrfs_get_extent_fiemap(inode, offset, len);
		if (IS_ERR_OR_NULL(em))
			return em;

		/* If this isn't a hole return it */
		if (em->block_start != EXTENT_MAP_HOLE)
			return em;

		/* This is a hole, advance to the next extent */
		offset = extent_map_end(em);
		free_extent_map(em);
		if (offset >= last)
			break;
	}
	return NULL;
}

/*
 * To cache the previous fiemap extent.
 *
 * Will be used for merging fiemap extents.
 */
struct fiemap_cache {
	u64 offset;
	u64 phys;
	u64 len;
	u32 flags;
	bool cached;
};

/*
 * Helper to submit a fiemap extent.
 *
 * Will try to merge the current fiemap extent specified by @offset, @phys,
 * @len and @flags with the cached one.
 * Only when we fail to merge will the cached one be submitted as a fiemap
 * extent.
 *
 * Return value is the same as fiemap_fill_next_extent().
 */
static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
				struct fiemap_cache *cache,
				u64 offset, u64 phys, u64 len, u32 flags)
{
	int ret = 0;

	if (!cache->cached)
		goto assign;

	/*
	 * Sanity check, extent_fiemap() should have ensured that the new
	 * fiemap extent won't overlap with the cached one.
	 * Not recoverable.
	 *
	 * NOTE: Physical address can overlap, due to compression
	 */
	if (cache->offset + cache->len > offset) {
		WARN_ON(1);
		return -EINVAL;
	}

	/*
	 * Only merges fiemap extents if
	 * 1) Their logical addresses are contiguous
	 *
	 * 2) Their physical addresses are contiguous
	 *    So truly compressed (physical size smaller than logical size)
	 *    extents won't get merged with each other
	 *
	 * 3) Share same flags except FIEMAP_EXTENT_LAST
	 *    So a regular extent won't get merged with a prealloc extent
	 */
	if (cache->offset + cache->len == offset &&
	    cache->phys + cache->len == phys &&
	    (cache->flags & ~FIEMAP_EXTENT_LAST) ==
			(flags & ~FIEMAP_EXTENT_LAST)) {
		cache->len += len;
		cache->flags |= flags;
		goto try_submit_last;
	}

	/* Not mergeable, need to submit the cached one */
	ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
				      cache->len, cache->flags);
	cache->cached = false;
	if (ret)
		return ret;
assign:
	cache->cached = true;
	cache->offset = offset;
	cache->phys = phys;
	cache->len = len;
	cache->flags = flags;
try_submit_last:
	if (cache->flags & FIEMAP_EXTENT_LAST) {
		ret = fiemap_fill_next_extent(fieinfo, cache->offset,
				cache->phys, cache->len, cache->flags);
		cache->cached = false;
	}
	return ret;
}
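
/*
 * A worked example of the merge rules above: a cached entry
 * (offset=0, phys=1M, len=4K, flags=0) followed by
 * (offset=4K, phys=1M+4K, len=4K, flags=0) merges into
 * (offset=0, phys=1M, len=8K).  If the second extent were compressed, its
 * physical address would not equal cache->phys + cache->len, so the cached
 * entry would instead be flushed to user space via
 * fiemap_fill_next_extent() and the new extent would start a fresh cache
 * entry.
 */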
/*
 * Emit the last fiemap cache
 *
 * The last fiemap cache may still be cached in the following case:
 * 0		      4k		    8k
 * |<- Fiemap range ->|
 * |<------------  First extent ----------->|
 *
 * In this case, the first extent range will be cached but not emitted.
 * So we must emit it before ending extent_fiemap().
 */
static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
				  struct fiemap_cache *cache)
{
	int ret;

	if (!cache->cached)
		return 0;

	ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
				      cache->len, cache->flags);
	cache->cached = false;
	if (ret > 0)
		ret = 0;
	return ret;
}

int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
		  u64 start, u64 len)
{
	int ret = 0;
	u64 off;
	u64 max = start + len;
	u32 flags = 0;
	u32 found_type;
	u64 last;
	u64 last_for_get_extent = 0;
	u64 disko = 0;
	u64 isize = i_size_read(&inode->vfs_inode);
	struct btrfs_key found_key;
	struct extent_map *em = NULL;
	struct extent_state *cached_state = NULL;
	struct btrfs_path *path;
	struct btrfs_root *root = inode->root;
	struct fiemap_cache cache = { 0 };
	struct ulist *roots;
	struct ulist *tmp_ulist;
	int end = 0;
	u64 em_start = 0;
	u64 em_len = 0;
	u64 em_end = 0;

	if (len == 0)
		return -EINVAL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	roots = ulist_alloc(GFP_KERNEL);
	tmp_ulist = ulist_alloc(GFP_KERNEL);
	if (!roots || !tmp_ulist) {
		ret = -ENOMEM;
		goto out_free_ulist;
	}

	/*
	 * We can't initialize that to 'start' as this could miss extents due
	 * to extent item merging.
	 */
	off = 0;
	start = round_down(start, btrfs_inode_sectorsize(inode));
	len = round_up(max, btrfs_inode_sectorsize(inode)) - start;

	/*
	 * Lookup the last file extent.  We're not using i_size here because
	 * there might be preallocation past i_size.
	 */
	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
				       0);
	if (ret < 0) {
		goto out_free_ulist;
	} else {
		WARN_ON(!ret);
		if (ret == 1)
			ret = 0;
	}

	path->slots[0]--;
	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
	found_type = found_key.type;

	/* No extents, but there might be delalloc bits */
	if (found_key.objectid != btrfs_ino(inode) ||
	    found_type != BTRFS_EXTENT_DATA_KEY) {
		/* Have to trust i_size as the end */
		last = (u64)-1;
		last_for_get_extent = isize;
	} else {
		/*
		 * Remember the start of the last extent.  There are a
		 * bunch of different factors that go into the length of the
		 * extent, so it's much less complex to remember where it
		 * started.
		 */
		last = found_key.offset;
		last_for_get_extent = last + 1;
	}
	btrfs_release_path(path);

	/*
	 * We might have some extents allocated but more delalloc past those
	 * extents.  So, we trust isize unless the start of the last extent is
	 * beyond isize.
	 */
	if (last < isize) {
		last = (u64)-1;
		last_for_get_extent = isize;
	}

	lock_extent_bits(&inode->io_tree, start, start + len - 1,
			 &cached_state);

	em = get_extent_skip_holes(inode, start, last_for_get_extent);
	if (!em)
		goto out;
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto out;
	}

	while (!end) {
		u64 offset_in_extent = 0;

		/* Break if the extent we found is outside the range */
		if (em->start >= max || extent_map_end(em) < off)
			break;

		/*
		 * get_extent may return an extent that starts before our
		 * requested range.  We have to make sure the ranges
		 * we return to fiemap always move forward and don't
		 * overlap, so adjust the offsets here.
		 */
		em_start = max(em->start, off);

		/*
		 * Record the offset from the start of the extent
		 * for adjusting the disk offset below.  Only do this if the
		 * extent isn't compressed since our in-ram offset may be past
		 * what we have actually allocated on disk.
		 */
		if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
			offset_in_extent = em_start - em->start;
		em_end = extent_map_end(em);
		em_len = em_end - em_start;
		flags = 0;
		if (em->block_start < EXTENT_MAP_LAST_BYTE)
			disko = em->block_start + offset_in_extent;
		else
			disko = 0;

		/*
		 * Bump off for our next call to get_extent.
		 */
		off = extent_map_end(em);
		if (off >= max)
			end = 1;

		if (em->block_start == EXTENT_MAP_LAST_BYTE) {
			end = 1;
			flags |= FIEMAP_EXTENT_LAST;
		} else if (em->block_start == EXTENT_MAP_INLINE) {
			flags |= (FIEMAP_EXTENT_DATA_INLINE |
				  FIEMAP_EXTENT_NOT_ALIGNED);
		} else if (em->block_start == EXTENT_MAP_DELALLOC) {
			flags |= (FIEMAP_EXTENT_DELALLOC |
				  FIEMAP_EXTENT_UNKNOWN);
		} else if (fieinfo->fi_extents_max) {
			u64 bytenr = em->block_start -
				(em->start - em->orig_start);

			/*
			 * As btrfs supports shared space, this information
			 * can be exported to userspace tools via
			 * flag FIEMAP_EXTENT_SHARED.  If fi_extents_max == 0
			 * then we're just getting a count and we can skip the
			 * lookup stuff.
			 */
			ret = btrfs_check_shared(root, btrfs_ino(inode),
						 bytenr, roots, tmp_ulist);
			if (ret < 0)
				goto out_free;
			if (ret)
				flags |= FIEMAP_EXTENT_SHARED;
			ret = 0;
		}
		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
			flags |= FIEMAP_EXTENT_ENCODED;
		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
			flags |= FIEMAP_EXTENT_UNWRITTEN;

		free_extent_map(em);
		em = NULL;
		if ((em_start >= last) || em_len == (u64)-1 ||
		   (last == (u64)-1 && isize <= em_end)) {
			flags |= FIEMAP_EXTENT_LAST;
			end = 1;
		}

		/* Now scan forward to see if this is really the last extent. */
		em = get_extent_skip_holes(inode, off, last_for_get_extent);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out;
		}
		if (!em) {
			flags |= FIEMAP_EXTENT_LAST;
			end = 1;
		}
		ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko,
					 em_len, flags);
		if (ret) {
			if (ret == 1)
				ret = 0;
			goto out_free;
		}
	}
out_free:
	if (!ret)
		ret = emit_last_fiemap_cache(fieinfo, &cache);
	free_extent_map(em);
out:
	unlock_extent_cached(&inode->io_tree, start, start + len - 1,
			     &cached_state);

out_free_ulist:
	btrfs_free_path(path);
	ulist_free(roots);
	ulist_free(tmp_ulist);
	return ret;
}

static void __free_extent_buffer(struct extent_buffer *eb)
{
	kmem_cache_free(extent_buffer_cache, eb);
}

int extent_buffer_under_io(const struct extent_buffer *eb)
{
	return (atomic_read(&eb->io_pages) ||
		test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
		test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
}

static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page)
{
	struct btrfs_subpage *subpage;

	lockdep_assert_held(&page->mapping->private_lock);

	if (PagePrivate(page)) {
		subpage = (struct btrfs_subpage *)page->private;
		if (atomic_read(&subpage->eb_refs))
			return true;
		/*
		 * Even if there are no eb refs here, we may still have
		 * end_page_read() calls relying on page::private.
		 */
		if (atomic_read(&subpage->readers))
			return true;
	}
	return false;
}

static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);

	/*
	 * For a mapped eb, we're going to change the page private, which
	 * should be done under the private_lock.
	 */
	if (mapped)
		spin_lock(&page->mapping->private_lock);

	if (!PagePrivate(page)) {
		if (mapped)
			spin_unlock(&page->mapping->private_lock);
		return;
	}

	if (fs_info->sectorsize == PAGE_SIZE) {
		/*
		 * We do this since we'll remove the pages after we've
		 * removed the eb from the radix tree, so we could race
		 * and have this page now attached to the new eb.  So
		 * only clear page_private if it's still connected to
		 * this eb.
		 */
		if (PagePrivate(page) &&
		    page->private == (unsigned long)eb) {
			BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
			BUG_ON(PageDirty(page));
			BUG_ON(PageWriteback(page));
			/*
			 * We need to make sure we haven't been attached
			 * to a new eb.
			 */
			detach_page_private(page);
		}
		if (mapped)
			spin_unlock(&page->mapping->private_lock);
		return;
	}

	/*
	 * For subpage, we can have a dummy eb with page private.  In this
	 * case, we can directly detach the private as such a page is only
	 * attached to one dummy eb, no sharing.
	 */
	if (!mapped) {
		btrfs_detach_subpage(fs_info, page);
		return;
	}

	btrfs_page_dec_eb_refs(fs_info, page);

	/*
	 * We can only detach the page private if there are no other ebs in
	 * the page range and no unfinished IO.
	 */
	if (!page_range_has_eb(fs_info, page))
		btrfs_detach_subpage(fs_info, page);

	spin_unlock(&page->mapping->private_lock);
}

/* Release all pages attached to the extent buffer */
static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
{
	int i;
	int num_pages;

	ASSERT(!extent_buffer_under_io(eb));

	num_pages = num_extent_pages(eb);
	for (i = 0; i < num_pages; i++) {
		struct page *page = eb->pages[i];

		if (!page)
			continue;

		detach_extent_buffer_page(eb, page);

		/* One for when we allocated the page */
		put_page(page);
	}
}

/*
 * Helper for releasing the extent buffer.
 */
static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
{
	btrfs_release_extent_buffer_pages(eb);
	btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
	__free_extent_buffer(eb);
}

static struct extent_buffer *
__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
		      unsigned long len)
{
	struct extent_buffer *eb = NULL;

	eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
	eb->start = start;
	eb->len = len;
	eb->fs_info = fs_info;
	eb->bflags = 0;
	init_rwsem(&eb->lock);

	btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list,
			     &fs_info->allocated_ebs);
	INIT_LIST_HEAD(&eb->release_list);

	spin_lock_init(&eb->refs_lock);
	atomic_set(&eb->refs, 1);
	atomic_set(&eb->io_pages, 0);

	ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE);

	return eb;
}

struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
{
	int i;
	struct page *p;
	struct extent_buffer *new;
	int num_pages = num_extent_pages(src);

	new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
	if (new == NULL)
		return NULL;

	/*
	 * Set UNMAPPED before calling btrfs_release_extent_buffer(), as
	 * btrfs_release_extent_buffer() has different behavior for
	 * UNMAPPED subpage extent buffers.
	 */
	set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);

	for (i = 0; i < num_pages; i++) {
		int ret;

		p = alloc_page(GFP_NOFS);
		if (!p) {
			btrfs_release_extent_buffer(new);
			return NULL;
		}
		ret = attach_extent_buffer_page(new, p, NULL);
		if (ret < 0) {
			put_page(p);
			btrfs_release_extent_buffer(new);
			return NULL;
		}
		WARN_ON(PageDirty(p));
		new->pages[i] = p;
		copy_page(page_address(p), page_address(src->pages[i]));
	}
	set_extent_buffer_uptodate(new);

	return new;
}

struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
						  u64 start, unsigned long len)
{
	struct extent_buffer *eb;
	int num_pages;
	int i;

	eb = __alloc_extent_buffer(fs_info, start, len);
	if (!eb)
		return NULL;

	num_pages = num_extent_pages(eb);
	for (i = 0; i < num_pages; i++) {
		int ret;

		eb->pages[i] = alloc_page(GFP_NOFS);
		if (!eb->pages[i])
			goto err;
		ret = attach_extent_buffer_page(eb, eb->pages[i], NULL);
		if (ret < 0)
			goto err;
	}
	set_extent_buffer_uptodate(eb);
	btrfs_set_header_nritems(eb, 0);
	set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);

	return eb;
err:
	for (; i > 0; i--) {
		detach_extent_buffer_page(eb, eb->pages[i - 1]);
		__free_page(eb->pages[i - 1]);
	}
	__free_extent_buffer(eb);
	return NULL;
}

struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
						u64 start)
{
	return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize);
}

static void check_buffer_tree_ref(struct extent_buffer *eb)
{
	int refs;
	/*
	 * The TREE_REF bit is first set when the extent_buffer is added
	 * to the radix tree.  It is also reset, if unset, when a new reference
	 * is created by find_extent_buffer.
	 *
	 * It is only cleared in two cases: freeing the last non-tree
	 * reference to the extent_buffer when its STALE bit is set or
	 * calling releasepage when the tree reference is the only reference.
	 *
	 * In both cases, care is taken to ensure that the extent_buffer's
	 * pages are not under io.  However, releasepage can be concurrently
	 * called with creating new references, which is prone to race
	 * conditions between the calls to check_buffer_tree_ref in those
	 * codepaths and clearing TREE_REF in try_release_extent_buffer.
	 *
	 * The actual lifetime of the extent_buffer in the radix tree is
	 * adequately protected by the refcount, but the TREE_REF bit and
	 * its corresponding reference are not.  To protect against this
	 * class of races, we call check_buffer_tree_ref from the codepaths
	 * which trigger io after they set eb->io_pages.  Note that once io is
	 * initiated, TREE_REF can no longer be cleared, so that is the
	 * moment at which any such race is best fixed.
	 */
	refs = atomic_read(&eb->refs);
	if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
		return;

	spin_lock(&eb->refs_lock);
	if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
		atomic_inc(&eb->refs);
	spin_unlock(&eb->refs_lock);
}

static void mark_extent_buffer_accessed(struct extent_buffer *eb,
					struct page *accessed)
{
	int num_pages, i;

	check_buffer_tree_ref(eb);

	num_pages = num_extent_pages(eb);
	for (i = 0; i < num_pages; i++) {
		struct page *p = eb->pages[i];

		if (p != accessed)
			mark_page_accessed(p);
	}
}

struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
					 u64 start)
{
	struct extent_buffer *eb;

	eb = find_extent_buffer_nolock(fs_info, start);
	if (!eb)
		return NULL;
	/*
	 * Lock our eb's refs_lock to avoid races with free_extent_buffer().
	 * When we get our eb it might be flagged with EXTENT_BUFFER_STALE and
	 * another task running free_extent_buffer() might have seen that flag
	 * set, eb->refs == 2, that the buffer isn't under IO (dirty and
	 * writeback flags not set) and it's still in the tree (flag
	 * EXTENT_BUFFER_TREE_REF set), therefore being in the process of
	 * decrementing the extent buffer's reference count twice.  So here we
	 * could race and increment the eb's reference count, clear its stale
	 * flag, mark it as dirty and drop our reference before the other task
	 * finishes executing free_extent_buffer, which would later result in
	 * an attempt to free an extent buffer that is dirty.
	 */
	if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
		spin_lock(&eb->refs_lock);
		spin_unlock(&eb->refs_lock);
	}
	mark_extent_buffer_accessed(eb, NULL);
	return eb;
}

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
					       u64 start)
{
	struct extent_buffer *eb, *exists = NULL;
	int ret;

	eb = find_extent_buffer(fs_info, start);
	if (eb)
		return eb;
	eb = alloc_dummy_extent_buffer(fs_info, start);
	if (!eb)
		return ERR_PTR(-ENOMEM);
	eb->fs_info = fs_info;
again:
	ret = radix_tree_preload(GFP_NOFS);
	if (ret) {
		exists = ERR_PTR(ret);
		goto free_eb;
	}
	spin_lock(&fs_info->buffer_lock);
	ret = radix_tree_insert(&fs_info->buffer_radix,
				start >> fs_info->sectorsize_bits, eb);
	spin_unlock(&fs_info->buffer_lock);
	radix_tree_preload_end();
	if (ret == -EEXIST) {
		exists = find_extent_buffer(fs_info, start);
		if (exists)
			goto free_eb;
		else
			goto again;
	}
	check_buffer_tree_ref(eb);
	set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);

	return eb;
free_eb:
	btrfs_release_extent_buffer(eb);
	return exists;
}
#endif
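
/*
 * alloc_test_extent_buffer() above and alloc_extent_buffer() below resolve
 * insertion races with the same preload + insert + retry pattern:
 *
 *	again:
 *		ret = radix_tree_preload(GFP_NOFS);
 *		...
 *		spin_lock(&fs_info->buffer_lock);
 *		ret = radix_tree_insert(&fs_info->buffer_radix, index, eb);
 *		spin_unlock(&fs_info->buffer_lock);
 *		radix_tree_preload_end();
 *		if (ret == -EEXIST) {
 *			exists = find_extent_buffer(fs_info, start);
 *			if (exists)
 *				goto free_eb;
 *			goto again;
 *		}
 *
 * On -EEXIST the winner is looked up and returned if a reference can be
 * grabbed on it; if the stale entry vanished in the meantime (the lookup
 * returns NULL), the insertion is simply retried.
 */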

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
					       u64 start)
{
	struct extent_buffer *eb, *exists = NULL;
	int ret;

	eb = find_extent_buffer(fs_info, start);
	if (eb)
		return eb;
	eb = alloc_dummy_extent_buffer(fs_info, start);
	if (!eb)
		return ERR_PTR(-ENOMEM);
	eb->fs_info = fs_info;
again:
	ret = radix_tree_preload(GFP_NOFS);
	if (ret) {
		exists = ERR_PTR(ret);
		goto free_eb;
	}
	spin_lock(&fs_info->buffer_lock);
	ret = radix_tree_insert(&fs_info->buffer_radix,
				start >> fs_info->sectorsize_bits, eb);
	spin_unlock(&fs_info->buffer_lock);
	radix_tree_preload_end();
	if (ret == -EEXIST) {
		exists = find_extent_buffer(fs_info, start);
		if (exists)
			goto free_eb;
		else
			goto again;
	}
	check_buffer_tree_ref(eb);
	set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);

	return eb;
free_eb:
	btrfs_release_extent_buffer(eb);
	return exists;
}
#endif

static struct extent_buffer *grab_extent_buffer(
		struct btrfs_fs_info *fs_info, struct page *page)
{
	struct extent_buffer *exists;

	/*
	 * For subpage case, we completely rely on radix tree to ensure we
	 * don't try to insert two ebs for the same bytenr. So here we always
	 * return NULL and just continue.
	 */
	if (fs_info->sectorsize < PAGE_SIZE)
		return NULL;

	/* Page not yet attached to an extent buffer */
	if (!PagePrivate(page))
		return NULL;

	/*
	 * We could have already allocated an eb for this page and attached
	 * one, so let's see if we can get a ref on the existing eb; if we
	 * can, we know it's good and we can just return that one, else we
	 * know we can just overwrite page->private.
	 */
	exists = (struct extent_buffer *)page->private;
	if (atomic_inc_not_zero(&exists->refs))
		return exists;

	WARN_ON(PageDirty(page));
	detach_page_private(page);
	return NULL;
}
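
/*
 * grab_extent_buffer() relies on the classic atomic_inc_not_zero() idiom:
 * page->private may point at an eb whose last reference is concurrently
 * being dropped, and incrementing from zero would resurrect a buffer that
 * release_extent_buffer() is already freeing. A minimal sketch of the
 * idiom as used above:
 *
 *	exists = (struct extent_buffer *)page->private;
 *	if (atomic_inc_not_zero(&exists->refs))
 *		return exists;	// got a ref, the eb is guaranteed live
 *	// refs was 0: the eb is dying, detach it and start fresh
 */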

struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
					  u64 start, u64 owner_root, int level)
{
	unsigned long len = fs_info->nodesize;
	int num_pages;
	int i;
	unsigned long index = start >> PAGE_SHIFT;
	struct extent_buffer *eb;
	struct extent_buffer *exists = NULL;
	struct page *p;
	struct address_space *mapping = fs_info->btree_inode->i_mapping;
	int uptodate = 1;
	int ret;

	if (!IS_ALIGNED(start, fs_info->sectorsize)) {
		btrfs_err(fs_info, "bad tree block start %llu", start);
		return ERR_PTR(-EINVAL);
	}

#if BITS_PER_LONG == 32
	if (start >= MAX_LFS_FILESIZE) {
		btrfs_err_rl(fs_info,
		"extent buffer %llu is beyond 32bit page cache limit", start);
		btrfs_err_32bit_limit(fs_info);
		return ERR_PTR(-EOVERFLOW);
	}
	if (start >= BTRFS_32BIT_EARLY_WARN_THRESHOLD)
		btrfs_warn_32bit_limit(fs_info);
#endif

	if (fs_info->sectorsize < PAGE_SIZE &&
	    offset_in_page(start) + len > PAGE_SIZE) {
		btrfs_err(fs_info,
		"tree block crosses page boundary, start %llu nodesize %lu",
			  start, len);
		return ERR_PTR(-EINVAL);
	}

	eb = find_extent_buffer(fs_info, start);
	if (eb)
		return eb;

	eb = __alloc_extent_buffer(fs_info, start, len);
	if (!eb)
		return ERR_PTR(-ENOMEM);
	btrfs_set_buffer_lockdep_class(owner_root, eb, level);

	num_pages = num_extent_pages(eb);
	for (i = 0; i < num_pages; i++, index++) {
		struct btrfs_subpage *prealloc = NULL;

		p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
		if (!p) {
			exists = ERR_PTR(-ENOMEM);
			goto free_eb;
		}

		/*
		 * Preallocate page->private for subpage case, so that we won't
		 * allocate memory with private_lock held. The memory will be
		 * freed by attach_extent_buffer_page() or freed manually if
		 * we exit earlier.
		 *
		 * Although we have ensured one subpage eb can only have one
		 * page, it may change in the future for 16K page size
		 * support, so we still preallocate the memory in the loop.
		 */
		ret = btrfs_alloc_subpage(fs_info, &prealloc,
					  BTRFS_SUBPAGE_METADATA);
		if (ret < 0) {
			unlock_page(p);
			put_page(p);
			exists = ERR_PTR(ret);
			goto free_eb;
		}

		spin_lock(&mapping->private_lock);
		exists = grab_extent_buffer(fs_info, p);
		if (exists) {
			spin_unlock(&mapping->private_lock);
			unlock_page(p);
			put_page(p);
			mark_extent_buffer_accessed(exists, p);
			btrfs_free_subpage(prealloc);
			goto free_eb;
		}
		/* Should not fail, as we have preallocated the memory */
		ret = attach_extent_buffer_page(eb, p, prealloc);
		ASSERT(!ret);
		/*
		 * To inform that we have an extra eb under allocation, so that
		 * detach_extent_buffer_page() won't release the page private
		 * when the eb hasn't yet been inserted into the radix tree.
		 *
		 * The ref will be decreased when the eb releases the page, in
		 * detach_extent_buffer_page(), thus it needs no special
		 * handling in the error path.
		 */
		btrfs_page_inc_eb_refs(fs_info, p);
		spin_unlock(&mapping->private_lock);

		WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len));
		eb->pages[i] = p;
		if (!PageUptodate(p))
			uptodate = 0;

		/*
		 * We can't unlock the pages just yet since the extent buffer
		 * hasn't been properly inserted in the radix tree, this
		 * opens a race with btree_releasepage which can free a page
		 * while we are still filling in all pages for the buffer and
		 * we could crash.
		 */
	}
	if (uptodate)
		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
again:
	ret = radix_tree_preload(GFP_NOFS);
	if (ret) {
		exists = ERR_PTR(ret);
		goto free_eb;
	}

	spin_lock(&fs_info->buffer_lock);
	ret = radix_tree_insert(&fs_info->buffer_radix,
				start >> fs_info->sectorsize_bits, eb);
	spin_unlock(&fs_info->buffer_lock);
	radix_tree_preload_end();
	if (ret == -EEXIST) {
		exists = find_extent_buffer(fs_info, start);
		if (exists)
			goto free_eb;
		else
			goto again;
	}
	/* add one reference for the tree */
	check_buffer_tree_ref(eb);
	set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);

	/*
	 * Now it's safe to unlock the pages because any calls to
	 * btree_releasepage will correctly detect that a page belongs to a
	 * live buffer and won't free them prematurely.
	 */
	for (i = 0; i < num_pages; i++)
		unlock_page(eb->pages[i]);
	return eb;

free_eb:
	WARN_ON(!atomic_dec_and_test(&eb->refs));
	for (i = 0; i < num_pages; i++) {
		if (eb->pages[i])
			unlock_page(eb->pages[i]);
	}

	btrfs_release_extent_buffer(eb);
	return exists;
}
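
/*
 * The ordering in alloc_extent_buffer() above is what makes it safe
 * against btree_releasepage; roughly:
 *
 *	1. lock every page via find_or_create_page()
 *	2. attach page->private / subpage metadata under private_lock
 *	3. insert the eb into buffer_radix, retrying on -EEXIST since a
 *	   racing allocator may win, in which case we return its eb
 *	4. only then unlock the pages
 *
 * Unlocking before step 3 would let releasepage see a page whose eb is not
 * yet discoverable in the radix tree and free it out from under us.
 */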

static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
{
	struct extent_buffer *eb =
			container_of(head, struct extent_buffer, rcu_head);

	__free_extent_buffer(eb);
}

static int release_extent_buffer(struct extent_buffer *eb)
	__releases(&eb->refs_lock)
{
	lockdep_assert_held(&eb->refs_lock);

	WARN_ON(atomic_read(&eb->refs) == 0);
	if (atomic_dec_and_test(&eb->refs)) {
		if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
			struct btrfs_fs_info *fs_info = eb->fs_info;

			spin_unlock(&eb->refs_lock);

			spin_lock(&fs_info->buffer_lock);
			radix_tree_delete(&fs_info->buffer_radix,
					  eb->start >> fs_info->sectorsize_bits);
			spin_unlock(&fs_info->buffer_lock);
		} else {
			spin_unlock(&eb->refs_lock);
		}

		btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
		/* Should be safe to release our pages at this point */
		btrfs_release_extent_buffer_pages(eb);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
		if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) {
			__free_extent_buffer(eb);
			return 1;
		}
#endif
		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
		return 1;
	}
	spin_unlock(&eb->refs_lock);

	return 0;
}

void free_extent_buffer(struct extent_buffer *eb)
{
	int refs;
	int old;
	if (!eb)
		return;

	while (1) {
		refs = atomic_read(&eb->refs);
		if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3)
		    || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
			refs == 1))
			break;
		old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
		if (old == refs)
			return;
	}

	spin_lock(&eb->refs_lock);
	if (atomic_read(&eb->refs) == 2 &&
	    test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
	    !extent_buffer_under_io(eb) &&
	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
		atomic_dec(&eb->refs);

	/*
	 * I know this is terrible, but it's temporary until we stop tracking
	 * the uptodate bits and such for the extent buffers.
	 */
	release_extent_buffer(eb);
}
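
/*
 * The cmpxchg loop above is a lock-free fast path. While refs is safely
 * above the thresholds the slow path cares about (an unmapped eb frees at
 * refs == 1, and the STALE handling looks at refs == 2), this drop cannot
 * be the final or STALE-resolving one, so a plain atomic decrement is
 * enough:
 *
 *	old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
 *	if (old == refs)
 *		return;		// decremented without taking refs_lock
 *
 * Only near those thresholds do we fall through to refs_lock, so that the
 * TREE_REF bookkeeping and the decrement happen as one atomic step.
 */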

void free_extent_buffer_stale(struct extent_buffer *eb)
{
	if (!eb)
		return;

	spin_lock(&eb->refs_lock);
	set_bit(EXTENT_BUFFER_STALE, &eb->bflags);

	if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
		atomic_dec(&eb->refs);
	release_extent_buffer(eb);
}

static void btree_clear_page_dirty(struct page *page)
{
	ASSERT(PageDirty(page));
	ASSERT(PageLocked(page));
	clear_page_dirty_for_io(page);
	xa_lock_irq(&page->mapping->i_pages);
	if (!PageDirty(page))
		__xa_clear_mark(&page->mapping->i_pages,
				page_index(page), PAGECACHE_TAG_DIRTY);
	xa_unlock_irq(&page->mapping->i_pages);
}

static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	struct page *page = eb->pages[0];
	bool last;

	/* btree_clear_page_dirty() needs page locked */
	lock_page(page);
	last = btrfs_subpage_clear_and_test_dirty(fs_info, page, eb->start,
						  eb->len);
	if (last)
		btree_clear_page_dirty(page);
	unlock_page(page);
	WARN_ON(atomic_read(&eb->refs) == 0);
}

void clear_extent_buffer_dirty(const struct extent_buffer *eb)
{
	int i;
	int num_pages;
	struct page *page;

	if (eb->fs_info->sectorsize < PAGE_SIZE)
		return clear_subpage_extent_buffer_dirty(eb);

	num_pages = num_extent_pages(eb);

	for (i = 0; i < num_pages; i++) {
		page = eb->pages[i];
		if (!PageDirty(page))
			continue;
		lock_page(page);
		btree_clear_page_dirty(page);
		ClearPageError(page);
		unlock_page(page);
	}
	WARN_ON(atomic_read(&eb->refs) == 0);
}

bool set_extent_buffer_dirty(struct extent_buffer *eb)
{
	int i;
	int num_pages;
	bool was_dirty;

	check_buffer_tree_ref(eb);

	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);

	num_pages = num_extent_pages(eb);
	WARN_ON(atomic_read(&eb->refs) == 0);
	WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));

	if (!was_dirty) {
		bool subpage = eb->fs_info->sectorsize < PAGE_SIZE;

		/*
		 * For subpage case, we can have other extent buffers in the
		 * same page, and in clear_subpage_extent_buffer_dirty() we
		 * have to clear page dirty without subpage lock held.
		 * This can cause race where our page gets dirty cleared after
		 * we just set it.
		 *
		 * Thankfully, clear_subpage_extent_buffer_dirty() has locked
		 * its page for other reasons, so we can use the page lock to
		 * prevent the above race.
		 */
		if (subpage)
			lock_page(eb->pages[0]);
		for (i = 0; i < num_pages; i++)
			btrfs_page_set_dirty(eb->fs_info, eb->pages[i],
					     eb->start, eb->len);
		if (subpage)
			unlock_page(eb->pages[0]);
	}
#ifdef CONFIG_BTRFS_DEBUG
	for (i = 0; i < num_pages; i++)
		ASSERT(PageDirty(eb->pages[i]));
#endif

	return was_dirty;
}
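
/*
 * A hypothetical timeline for the subpage race described above, with two
 * ebs A and B sharing one page:
 *
 *	task 1 (dirtying A)		task 2 (cleaning B)
 *					subpage bitmap: clear B, now empty
 *	subpage bitmap: set A
 *	SetPageDirty(page)
 *					btree_clear_page_dirty(page)
 *	...A is dirty in the bitmap but the page dirty tag is lost...
 *
 * Since clear_subpage_extent_buffer_dirty() already holds the page lock
 * while clearing, taking the same lock while setting serializes the two.
 */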

void clear_extent_buffer_uptodate(struct extent_buffer *eb)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	struct page *page;
	int num_pages;
	int i;

	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
	num_pages = num_extent_pages(eb);
	for (i = 0; i < num_pages; i++) {
		page = eb->pages[i];
		if (page)
			btrfs_page_clear_uptodate(fs_info, page,
						  eb->start, eb->len);
	}
}

void set_extent_buffer_uptodate(struct extent_buffer *eb)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	struct page *page;
	int num_pages;
	int i;

	set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
	num_pages = num_extent_pages(eb);
	for (i = 0; i < num_pages; i++) {
		page = eb->pages[i];
		btrfs_page_set_uptodate(fs_info, page, eb->start, eb->len);
	}
}

static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
				      int mirror_num)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	struct extent_io_tree *io_tree;
	struct page *page = eb->pages[0];
	struct btrfs_bio_ctrl bio_ctrl = { 0 };
	int ret = 0;

	ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags));
	ASSERT(PagePrivate(page));
	io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;

	if (wait == WAIT_NONE) {
		if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1))
			return -EAGAIN;
	} else {
		ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1);
		if (ret < 0)
			return ret;
	}

	ret = 0;
	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) ||
	    PageUptodate(page) ||
	    btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) {
		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
		unlock_extent(io_tree, eb->start, eb->start + eb->len - 1);
		return ret;
	}

	clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
	eb->read_mirror = 0;
	atomic_set(&eb->io_pages, 1);
	check_buffer_tree_ref(eb);
	btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len);

	btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len);
	ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, &bio_ctrl,
				 page, eb->start, eb->len,
				 eb->start - page_offset(page),
				 end_bio_extent_readpage, mirror_num, 0,
				 true);
	if (ret) {
		/*
		 * In the endio function, if we hit something wrong we will
		 * increase the io_pages, so here we need to decrease it for
		 * the error path.
		 */
		atomic_dec(&eb->io_pages);
	}
	if (bio_ctrl.bio) {
		int tmp;

		tmp = submit_one_bio(bio_ctrl.bio, mirror_num, 0);
		bio_ctrl.bio = NULL;
		if (tmp < 0)
			return tmp;
	}
	if (ret || wait != WAIT_COMPLETE)
		return ret;

	wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, EXTENT_LOCKED);
	if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
		ret = -EIO;
	return ret;
}
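
/*
 * For subpage metadata the extent io tree, not the page lock, provides
 * read exclusion, which is why read_extent_buffer_subpage() locks the
 * range [eb->start, eb->start + eb->len - 1] instead of the page. The
 * wait modes then behave roughly as:
 *
 *	WAIT_NONE     - try_lock_extent(), return -EAGAIN if contended
 *			(readahead must never block)
 *	WAIT_COMPLETE - lock, submit, wait_extent_bit(EXTENT_LOCKED) and
 *			report -EIO if the eb never became uptodate
 *	other modes   - lock and submit, but return without waiting
 */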

int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
{
	int i;
	struct page *page;
	int err;
	int ret = 0;
	int locked_pages = 0;
	int all_uptodate = 1;
	int num_pages;
	unsigned long num_reads = 0;
	struct btrfs_bio_ctrl bio_ctrl = { 0 };

	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
		return 0;

	if (eb->fs_info->sectorsize < PAGE_SIZE)
		return read_extent_buffer_subpage(eb, wait, mirror_num);

	num_pages = num_extent_pages(eb);
	for (i = 0; i < num_pages; i++) {
		page = eb->pages[i];
		if (wait == WAIT_NONE) {
			/*
			 * WAIT_NONE is only utilized by readahead. If we can't
			 * acquire the lock atomically it means either the eb
			 * is being read out or under modification.
			 * Either way the eb will be or has been cached,
			 * readahead can exit safely.
			 */
			if (!trylock_page(page))
				goto unlock_exit;
		} else {
			lock_page(page);
		}
		locked_pages++;
	}
	/*
	 * We need to lock all pages first to make sure that
	 * the uptodate bit of our pages won't be affected by
	 * clear_extent_buffer_uptodate().
	 */
	for (i = 0; i < num_pages; i++) {
		page = eb->pages[i];
		if (!PageUptodate(page)) {
			num_reads++;
			all_uptodate = 0;
		}
	}

	if (all_uptodate) {
		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
		goto unlock_exit;
	}

	clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
	eb->read_mirror = 0;
	atomic_set(&eb->io_pages, num_reads);
	/*
	 * It is possible for releasepage to clear the TREE_REF bit before we
	 * set io_pages. See check_buffer_tree_ref for a more detailed comment.
	 */
	check_buffer_tree_ref(eb);
	for (i = 0; i < num_pages; i++) {
		page = eb->pages[i];

		if (!PageUptodate(page)) {
			if (ret) {
				atomic_dec(&eb->io_pages);
				unlock_page(page);
				continue;
			}

			ClearPageError(page);
			err = submit_extent_page(REQ_OP_READ | REQ_META, NULL,
					 &bio_ctrl, page, page_offset(page),
					 PAGE_SIZE, 0, end_bio_extent_readpage,
					 mirror_num, 0, false);
			if (err) {
				/*
				 * We failed to submit the bio so it's the
				 * caller's responsibility to perform cleanup,
				 * i.e. unlock the page and set the error bit.
				 */
				ret = err;
				SetPageError(page);
				unlock_page(page);
				atomic_dec(&eb->io_pages);
			}
		} else {
			unlock_page(page);
		}
	}

	if (bio_ctrl.bio) {
		err = submit_one_bio(bio_ctrl.bio, mirror_num, bio_ctrl.bio_flags);
		bio_ctrl.bio = NULL;
		if (err)
			return err;
	}

	if (ret || wait != WAIT_COMPLETE)
		return ret;

	for (i = 0; i < num_pages; i++) {
		page = eb->pages[i];
		wait_on_page_locked(page);
		if (!PageUptodate(page))
			ret = -EIO;
	}

	return ret;

unlock_exit:
	while (locked_pages > 0) {
		locked_pages--;
		page = eb->pages[locked_pages];
		unlock_page(page);
	}
	return ret;
}
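
/*
 * read_extent_buffer_pages() above is deliberately three-phased:
 *
 *	1. lock all pages (trylock for WAIT_NONE readahead)
 *	2. with every page locked, count the non-uptodate pages, so that
 *	   io_pages can be seeded once before any bio is submitted
 *	3. submit one read per non-uptodate page, unlocking the rest
 *
 * Seeding io_pages with a partial count could, presumably, let the endio
 * handler see the counter hit zero and declare the eb uptodate while
 * later reads are still being submitted, hence the separate pass.
 */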

static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
			    unsigned long len)
{
	btrfs_warn(eb->fs_info,
		"access to eb bytenr %llu len %lu out of range start %lu len %lu",
		eb->start, eb->len, start, len);
	WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));

	return true;
}

/*
 * Check if the [start, start + len) range is valid before reading/writing
 * the eb.
 * NOTE: @start and @len are offsets inside the eb, not logical addresses.
 *
 * Caller should not touch the dst/src memory if this function returns error.
 */
static inline int check_eb_range(const struct extent_buffer *eb,
				 unsigned long start, unsigned long len)
{
	unsigned long offset;

	/* start, start + len should not go beyond eb->len nor overflow */
	if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len))
		return report_eb_range(eb, start, len);

	return false;
}

void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
			unsigned long start, unsigned long len)
{
	size_t cur;
	size_t offset;
	struct page *page;
	char *kaddr;
	char *dst = (char *)dstv;
	unsigned long i = get_eb_page_index(start);

	if (check_eb_range(eb, start, len))
		return;

	offset = get_eb_offset_in_page(eb, start);

	while (len > 0) {
		page = eb->pages[i];

		cur = min(len, (PAGE_SIZE - offset));
		kaddr = page_address(page);
		memcpy(dst, kaddr + offset, cur);

		dst += cur;
		len -= cur;
		offset = 0;
		i++;
	}
}

int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
				       void __user *dstv,
				       unsigned long start, unsigned long len)
{
	size_t cur;
	size_t offset;
	struct page *page;
	char *kaddr;
	char __user *dst = (char __user *)dstv;
	unsigned long i = get_eb_page_index(start);
	int ret = 0;

	WARN_ON(start > eb->len);
	WARN_ON(start + len > eb->start + eb->len);

	offset = get_eb_offset_in_page(eb, start);

	while (len > 0) {
		page = eb->pages[i];

		cur = min(len, (PAGE_SIZE - offset));
		kaddr = page_address(page);
		if (copy_to_user_nofault(dst, kaddr + offset, cur)) {
			ret = -EFAULT;
			break;
		}

		dst += cur;
		len -= cur;
		offset = 0;
		i++;
	}

	return ret;
}

int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
			 unsigned long start, unsigned long len)
{
	size_t cur;
	size_t offset;
	struct page *page;
	char *kaddr;
	char *ptr = (char *)ptrv;
	unsigned long i = get_eb_page_index(start);
	int ret = 0;

	if (check_eb_range(eb, start, len))
		return -EINVAL;

	offset = get_eb_offset_in_page(eb, start);

	while (len > 0) {
		page = eb->pages[i];

		cur = min(len, (PAGE_SIZE - offset));

		kaddr = page_address(page);
		ret = memcmp(ptr, kaddr + offset, cur);
		if (ret)
			break;

		ptr += cur;
		len -= cur;
		offset = 0;
		i++;
	}
	return ret;
}
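
/*
 * All of the eb access helpers above and below share the same paging
 * loop. A worked example, assuming PAGE_SIZE == 4096 and a regular
 * (non-subpage) eb: reading len == 100 bytes at start == 4090 runs as
 *
 *	i      = get_eb_page_index(4090)		-> page 0
 *	offset = get_eb_offset_in_page(eb, 4090)	-> 4090
 *	pass 1: cur = min(100, 4096 - 4090) = 6		-> copy 6 bytes, i = 1
 *	pass 2: offset = 0, cur = min(94, 4096) = 94	-> copy the rest
 *
 * For subpage filesystems get_eb_offset_in_page() folds in the eb's
 * offset inside its single page, so the same loop body works unchanged.
 */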

/*
 * Check that the extent buffer is uptodate.
 *
 * For regular sector size == PAGE_SIZE case, check if @page is uptodate.
 * For subpage case, check if the range covered by the eb has EXTENT_UPTODATE.
 */
static void assert_eb_page_uptodate(const struct extent_buffer *eb,
				    struct page *page)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;

	if (fs_info->sectorsize < PAGE_SIZE) {
		bool uptodate;

		uptodate = btrfs_subpage_test_uptodate(fs_info, page,
						       eb->start, eb->len);
		WARN_ON(!uptodate);
	} else {
		WARN_ON(!PageUptodate(page));
	}
}

void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
					 const void *srcv)
{
	char *kaddr;

	assert_eb_page_uptodate(eb, eb->pages[0]);
	kaddr = page_address(eb->pages[0]) +
		get_eb_offset_in_page(eb, offsetof(struct btrfs_header,
						   chunk_tree_uuid));
	memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
}

void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv)
{
	char *kaddr;

	assert_eb_page_uptodate(eb, eb->pages[0]);
	kaddr = page_address(eb->pages[0]) +
		get_eb_offset_in_page(eb, offsetof(struct btrfs_header, fsid));
	memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
}

void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
			 unsigned long start, unsigned long len)
{
	size_t cur;
	size_t offset;
	struct page *page;
	char *kaddr;
	char *src = (char *)srcv;
	unsigned long i = get_eb_page_index(start);

	WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags));

	if (check_eb_range(eb, start, len))
		return;

	offset = get_eb_offset_in_page(eb, start);

	while (len > 0) {
		page = eb->pages[i];
		assert_eb_page_uptodate(eb, page);

		cur = min(len, PAGE_SIZE - offset);
		kaddr = page_address(page);
		memcpy(kaddr + offset, src, cur);

		src += cur;
		len -= cur;
		offset = 0;
		i++;
	}
}

void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
			   unsigned long len)
{
	size_t cur;
	size_t offset;
	struct page *page;
	char *kaddr;
	unsigned long i = get_eb_page_index(start);

	if (check_eb_range(eb, start, len))
		return;

	offset = get_eb_offset_in_page(eb, start);

	while (len > 0) {
		page = eb->pages[i];
		assert_eb_page_uptodate(eb, page);

		cur = min(len, PAGE_SIZE - offset);
		kaddr = page_address(page);
		memset(kaddr + offset, 0, cur);

		len -= cur;
		offset = 0;
		i++;
	}
}

void copy_extent_buffer_full(const struct extent_buffer *dst,
			     const struct extent_buffer *src)
{
	int i;
	int num_pages;

	ASSERT(dst->len == src->len);

	if (dst->fs_info->sectorsize == PAGE_SIZE) {
		num_pages = num_extent_pages(dst);
		for (i = 0; i < num_pages; i++)
			copy_page(page_address(dst->pages[i]),
				  page_address(src->pages[i]));
	} else {
		size_t src_offset = get_eb_offset_in_page(src, 0);
		size_t dst_offset = get_eb_offset_in_page(dst, 0);

		ASSERT(src->fs_info->sectorsize < PAGE_SIZE);
		memcpy(page_address(dst->pages[0]) + dst_offset,
		       page_address(src->pages[0]) + src_offset,
		       src->len);
	}
}

void copy_extent_buffer(const struct extent_buffer *dst,
			const struct extent_buffer *src,
			unsigned long dst_offset, unsigned long src_offset,
			unsigned long len)
{
	u64 dst_len = dst->len;
	size_t cur;
	size_t offset;
	struct page *page;
	char *kaddr;
	unsigned long i = get_eb_page_index(dst_offset);

	if (check_eb_range(dst, dst_offset, len) ||
	    check_eb_range(src, src_offset, len))
		return;

	WARN_ON(src->len != dst_len);

	offset = get_eb_offset_in_page(dst, dst_offset);

	while (len > 0) {
		page = dst->pages[i];
		assert_eb_page_uptodate(dst, page);

		cur = min(len, (unsigned long)(PAGE_SIZE - offset));

		kaddr = page_address(page);
		read_extent_buffer(src, kaddr + offset, src_offset, cur);

		src_offset += cur;
		len -= cur;
		offset = 0;
		i++;
	}
}

/*
 * eb_bitmap_offset() - calculate the page and offset of the byte containing the
 * given bit number
 * @eb: the extent buffer
 * @start: offset of the bitmap item in the extent buffer
 * @nr: bit number
 * @page_index: return index of the page in the extent buffer that contains the
 * given bit number
 * @page_offset: return offset into the page given by page_index
 *
 * This helper hides the ugliness of finding the byte in an extent buffer which
 * contains a given bit.
 */
static inline void eb_bitmap_offset(const struct extent_buffer *eb,
				    unsigned long start, unsigned long nr,
				    unsigned long *page_index,
				    size_t *page_offset)
{
	size_t byte_offset = BIT_BYTE(nr);
	size_t offset;

	/*
	 * The byte we want is the offset of the extent buffer + the offset of
	 * the bitmap item in the extent buffer + the offset of the byte in the
	 * bitmap item.
	 */
	offset = start + offset_in_page(eb->start) + byte_offset;

	*page_index = offset >> PAGE_SHIFT;
	*page_offset = offset_in_page(offset);
}
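
/*
 * A worked example for eb_bitmap_offset(), assuming PAGE_SIZE == 4096 and
 * a page-aligned eb (offset_in_page(eb->start) == 0), with a bitmap item
 * at start == 4000 and bit nr == 800:
 *
 *	byte_offset  = BIT_BYTE(800) = 800 / 8 = 100
 *	offset       = 4000 + 0 + 100 = 4100
 *	*page_index  = 4100 >> PAGE_SHIFT = 1
 *	*page_offset = offset_in_page(4100) = 4
 *
 * i.e. the bit lives in byte 4 of the eb's second page.
 */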

/**
 * extent_buffer_test_bit - determine whether a bit in a bitmap item is set
 * @eb: the extent buffer
 * @start: offset of the bitmap item in the extent buffer
 * @nr: bit number to test
 */
int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
			   unsigned long nr)
{
	u8 *kaddr;
	struct page *page;
	unsigned long i;
	size_t offset;

	eb_bitmap_offset(eb, start, nr, &i, &offset);
	page = eb->pages[i];
	assert_eb_page_uptodate(eb, page);
	kaddr = page_address(page);
	return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
}

/**
 * extent_buffer_bitmap_set - set an area of a bitmap
 * @eb: the extent buffer
 * @start: offset of the bitmap item in the extent buffer
 * @pos: bit number of the first bit
 * @len: number of bits to set
 */
void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
			      unsigned long pos, unsigned long len)
{
	u8 *kaddr;
	struct page *page;
	unsigned long i;
	size_t offset;
	const unsigned int size = pos + len;
	int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
	u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos);

	eb_bitmap_offset(eb, start, pos, &i, &offset);
	page = eb->pages[i];
	assert_eb_page_uptodate(eb, page);
	kaddr = page_address(page);

	while (len >= bits_to_set) {
		kaddr[offset] |= mask_to_set;
		len -= bits_to_set;
		bits_to_set = BITS_PER_BYTE;
		mask_to_set = ~0;
		if (++offset >= PAGE_SIZE && len > 0) {
			offset = 0;
			page = eb->pages[++i];
			assert_eb_page_uptodate(eb, page);
			kaddr = page_address(page);
		}
	}
	if (len) {
		mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
		kaddr[offset] |= mask_to_set;
	}
}

/**
 * extent_buffer_bitmap_clear - clear an area of a bitmap
 * @eb: the extent buffer
 * @start: offset of the bitmap item in the extent buffer
 * @pos: bit number of the first bit
 * @len: number of bits to clear
 */
void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
				unsigned long start, unsigned long pos,
				unsigned long len)
{
	u8 *kaddr;
	struct page *page;
	unsigned long i;
	size_t offset;
	const unsigned int size = pos + len;
	int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
	u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos);

	eb_bitmap_offset(eb, start, pos, &i, &offset);
	page = eb->pages[i];
	assert_eb_page_uptodate(eb, page);
	kaddr = page_address(page);

	while (len >= bits_to_clear) {
		kaddr[offset] &= ~mask_to_clear;
		len -= bits_to_clear;
		bits_to_clear = BITS_PER_BYTE;
		mask_to_clear = ~0;
		if (++offset >= PAGE_SIZE && len > 0) {
			offset = 0;
			page = eb->pages[++i];
			assert_eb_page_uptodate(eb, page);
			kaddr = page_address(page);
		}
	}
	if (len) {
		mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
		kaddr[offset] &= ~mask_to_clear;
	}
}
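
/*
 * A worked example for the mask arithmetic above: setting len == 11 bits
 * at pos == 5 with extent_buffer_bitmap_set():
 *
 *	first byte: bits_to_set = 8 - (5 % 8) = 3
 *		    mask_to_set = BITMAP_FIRST_BYTE_MASK(5) = 0xe0
 *	loop 1:	    byte |= 0xe0 (bits 5..7), len = 8, mask = 0xff
 *	loop 2:	    byte |= 0xff (bits 8..15), len = 0, done
 *
 * With len == 10 instead, the loop stops after the first byte (7 < 8)
 * and the tail applies 0xff & BITMAP_LAST_BYTE_MASK(15) = 0x7f, i.e.
 * bits 8..14 of the bitmap.
 */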

static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
{
	unsigned long distance = (src > dst) ? src - dst : dst - src;

	return distance < len;
}

static void copy_pages(struct page *dst_page, struct page *src_page,
		       unsigned long dst_off, unsigned long src_off,
		       unsigned long len)
{
	char *dst_kaddr = page_address(dst_page);
	char *src_kaddr;
	int must_memmove = 0;

	if (dst_page != src_page) {
		src_kaddr = page_address(src_page);
	} else {
		src_kaddr = dst_kaddr;
		if (areas_overlap(src_off, dst_off, len))
			must_memmove = 1;
	}

	if (must_memmove)
		memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
	else
		memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
}

void memcpy_extent_buffer(const struct extent_buffer *dst,
			  unsigned long dst_offset, unsigned long src_offset,
			  unsigned long len)
{
	size_t cur;
	size_t dst_off_in_page;
	size_t src_off_in_page;
	unsigned long dst_i;
	unsigned long src_i;

	if (check_eb_range(dst, dst_offset, len) ||
	    check_eb_range(dst, src_offset, len))
		return;

	while (len > 0) {
		dst_off_in_page = get_eb_offset_in_page(dst, dst_offset);
		src_off_in_page = get_eb_offset_in_page(dst, src_offset);

		dst_i = get_eb_page_index(dst_offset);
		src_i = get_eb_page_index(src_offset);

		cur = min(len, (unsigned long)(PAGE_SIZE -
					       src_off_in_page));
		cur = min_t(unsigned long, cur,
			    (unsigned long)(PAGE_SIZE - dst_off_in_page));

		copy_pages(dst->pages[dst_i], dst->pages[src_i],
			   dst_off_in_page, src_off_in_page, cur);

		src_offset += cur;
		dst_offset += cur;
		len -= cur;
	}
}

void memmove_extent_buffer(const struct extent_buffer *dst,
			   unsigned long dst_offset, unsigned long src_offset,
			   unsigned long len)
{
	size_t cur;
	size_t dst_off_in_page;
	size_t src_off_in_page;
	unsigned long dst_end = dst_offset + len - 1;
	unsigned long src_end = src_offset + len - 1;
	unsigned long dst_i;
	unsigned long src_i;

	if (check_eb_range(dst, dst_offset, len) ||
	    check_eb_range(dst, src_offset, len))
		return;
	if (dst_offset < src_offset) {
		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
		return;
	}
	while (len > 0) {
		dst_i = get_eb_page_index(dst_end);
		src_i = get_eb_page_index(src_end);

		dst_off_in_page = get_eb_offset_in_page(dst, dst_end);
		src_off_in_page = get_eb_offset_in_page(dst, src_end);

		cur = min_t(unsigned long, len, src_off_in_page + 1);
		cur = min(cur, dst_off_in_page + 1);
		copy_pages(dst->pages[dst_i], dst->pages[src_i],
			   dst_off_in_page - cur + 1,
			   src_off_in_page - cur + 1, cur);

		dst_end -= cur;
		src_end -= cur;
		len -= cur;
	}
}
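
/*
 * memmove_extent_buffer() above handles overlapping ranges: when the
 * destination is below the source a plain forward memcpy_extent_buffer()
 * is safe, otherwise it copies backwards from the last byte (dst_end,
 * src_end) toward the start so already-copied bytes are never clobbered
 * before being read; copy_pages() additionally falls back to memmove()
 * when the two in-page ranges overlap. A hypothetical example with
 * nodesize 16K and PAGE_SIZE 4K: moving 8192 bytes from offset 2048 to
 * offset 4096 overlaps by 6144 bytes, so the walk starts at
 * dst_end == 12287 / src_end == 10239 and proceeds down in page-bounded
 * chunks.
 */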

static struct extent_buffer *get_next_extent_buffer(
		struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
{
	struct extent_buffer *gang[BTRFS_SUBPAGE_BITMAP_SIZE];
	struct extent_buffer *found = NULL;
	u64 page_start = page_offset(page);
	int ret;
	int i;

	ASSERT(in_range(bytenr, page_start, PAGE_SIZE));
	ASSERT(PAGE_SIZE / fs_info->nodesize <= BTRFS_SUBPAGE_BITMAP_SIZE);
	lockdep_assert_held(&fs_info->buffer_lock);

	ret = radix_tree_gang_lookup(&fs_info->buffer_radix, (void **)gang,
			bytenr >> fs_info->sectorsize_bits,
			PAGE_SIZE / fs_info->nodesize);
	for (i = 0; i < ret; i++) {
		/* Already beyond page end */
		if (gang[i]->start >= page_start + PAGE_SIZE)
			break;
		/* Found one */
		if (gang[i]->start >= bytenr) {
			found = gang[i];
			break;
		}
	}
	return found;
}

static int try_release_subpage_extent_buffer(struct page *page)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
	u64 cur = page_offset(page);
	const u64 end = page_offset(page) + PAGE_SIZE;
	int ret;

	while (cur < end) {
		struct extent_buffer *eb = NULL;

		/*
		 * Unlike try_release_extent_buffer() which uses page->private
		 * to grab the buffer, for subpage case we rely on the radix
		 * tree, thus we need to ensure radix tree consistency.
		 *
		 * We also want an atomic snapshot of the radix tree, thus go
		 * with spinlock rather than RCU.
		 */
		spin_lock(&fs_info->buffer_lock);
		eb = get_next_extent_buffer(fs_info, page, cur);
		if (!eb) {
			/* No more eb in the page range after or at cur */
			spin_unlock(&fs_info->buffer_lock);
			break;
		}
		cur = eb->start + eb->len;

		/*
		 * The same as try_release_extent_buffer(), to ensure the eb
		 * won't disappear out from under us.
		 */
		spin_lock(&eb->refs_lock);
		if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
			spin_unlock(&eb->refs_lock);
			spin_unlock(&fs_info->buffer_lock);
			break;
		}
		spin_unlock(&fs_info->buffer_lock);

		/*
		 * If tree ref isn't set then we know the ref on this eb is a
		 * real ref, so just return, this eb will likely be freed soon
		 * anyway.
		 */
		if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
			spin_unlock(&eb->refs_lock);
			break;
		}

		/*
		 * Here we don't care about the return value, we will always
		 * check the page private at the end. And
		 * release_extent_buffer() will release the refs_lock.
		 */
		release_extent_buffer(eb);
	}
	/*
	 * Finally check whether we have cleared the page private, as if we
	 * have released all ebs in the page, the page private should be
	 * cleared now.
	 */
	spin_lock(&page->mapping->private_lock);
	if (!PagePrivate(page))
		ret = 1;
	else
		ret = 0;
	spin_unlock(&page->mapping->private_lock);
	return ret;
}

int try_release_extent_buffer(struct page *page)
{
	struct extent_buffer *eb;

	if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
		return try_release_subpage_extent_buffer(page);

	/*
	 * We need to make sure nobody is changing page->private, as we rely on
	 * page->private as the pointer to the extent buffer.
	 */
	spin_lock(&page->mapping->private_lock);
	if (!PagePrivate(page)) {
		spin_unlock(&page->mapping->private_lock);
		return 1;
	}

	eb = (struct extent_buffer *)page->private;
	BUG_ON(!eb);

	/*
	 * This is a little awful but should be ok, we need to make sure that
	 * the eb doesn't disappear out from under us while we're looking at
	 * this page.
	 */
	spin_lock(&eb->refs_lock);
	if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
		spin_unlock(&eb->refs_lock);
		spin_unlock(&page->mapping->private_lock);
		return 0;
	}
	spin_unlock(&page->mapping->private_lock);

	/*
	 * If tree ref isn't set then we know the ref on this eb is a real ref,
	 * so just return, this page will likely be freed soon anyway.
	 */
	if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
		spin_unlock(&eb->refs_lock);
		return 0;
	}

	return release_extent_buffer(eb);
}

/*
 * btrfs_readahead_tree_block - attempt to readahead a child block
 * @fs_info:	the fs_info
 * @bytenr:	bytenr to read
 * @owner_root: objectid of the root that owns this eb
 * @gen:	generation for the uptodate check, can be 0
 * @level:	level for the eb
 *
 * Attempt to readahead a tree block at @bytenr. If @gen is 0 then we do a
 * normal uptodate check of the eb, without checking the generation. If we have
 * to read the block we will not block on anything.
 */
void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
				u64 bytenr, u64 owner_root, u64 gen, int level)
{
	struct extent_buffer *eb;
	int ret;

	eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
	if (IS_ERR(eb))
		return;

	if (btrfs_buffer_uptodate(eb, gen, 1)) {
		free_extent_buffer(eb);
		return;
	}

	ret = read_extent_buffer_pages(eb, WAIT_NONE, 0);
	if (ret < 0)
		free_extent_buffer_stale(eb);
	else
		free_extent_buffer(eb);
}

/*
 * btrfs_readahead_node_child - readahead a node's child block
 * @node:	parent node we're reading from
 * @slot:	slot in the parent node for the child we want to read
 *
 * A helper for btrfs_readahead_tree_block, we simply read the bytenr pointed
 * at by the slot in the node provided.
 */
void btrfs_readahead_node_child(struct extent_buffer *node, int slot)
{
	btrfs_readahead_tree_block(node->fs_info,
				   btrfs_node_blockptr(node, slot),
				   btrfs_header_owner(node),
				   btrfs_node_ptr_generation(node, slot),
				   btrfs_header_level(node) - 1);
}