1 // SPDX-License-Identifier: GPL-2.0 2 3 #include <linux/bitops.h> 4 #include <linux/slab.h> 5 #include <linux/bio.h> 6 #include <linux/mm.h> 7 #include <linux/pagemap.h> 8 #include <linux/page-flags.h> 9 #include <linux/sched/mm.h> 10 #include <linux/spinlock.h> 11 #include <linux/blkdev.h> 12 #include <linux/swap.h> 13 #include <linux/writeback.h> 14 #include <linux/pagevec.h> 15 #include <linux/prefetch.h> 16 #include <linux/fsverity.h> 17 #include "misc.h" 18 #include "extent_io.h" 19 #include "extent-io-tree.h" 20 #include "extent_map.h" 21 #include "ctree.h" 22 #include "btrfs_inode.h" 23 #include "volumes.h" 24 #include "check-integrity.h" 25 #include "locking.h" 26 #include "rcu-string.h" 27 #include "backref.h" 28 #include "disk-io.h" 29 #include "subpage.h" 30 #include "zoned.h" 31 #include "block-group.h" 32 #include "compression.h" 33 34 static struct kmem_cache *extent_state_cache; 35 static struct kmem_cache *extent_buffer_cache; 36 static struct bio_set btrfs_bioset; 37 38 static inline bool extent_state_in_tree(const struct extent_state *state) 39 { 40 return !RB_EMPTY_NODE(&state->rb_node); 41 } 42 43 #ifdef CONFIG_BTRFS_DEBUG 44 static LIST_HEAD(states); 45 static DEFINE_SPINLOCK(leak_lock); 46 47 static inline void btrfs_leak_debug_add(spinlock_t *lock, 48 struct list_head *new, 49 struct list_head *head) 50 { 51 unsigned long flags; 52 53 spin_lock_irqsave(lock, flags); 54 list_add(new, head); 55 spin_unlock_irqrestore(lock, flags); 56 } 57 58 static inline void btrfs_leak_debug_del(spinlock_t *lock, 59 struct list_head *entry) 60 { 61 unsigned long flags; 62 63 spin_lock_irqsave(lock, flags); 64 list_del(entry); 65 spin_unlock_irqrestore(lock, flags); 66 } 67 68 void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) 69 { 70 struct extent_buffer *eb; 71 unsigned long flags; 72 73 /* 74 * If we didn't get into open_ctree our allocated_ebs will not be 75 * initialized, so just skip this. 
76 */ 77 if (!fs_info->allocated_ebs.next) 78 return; 79 80 WARN_ON(!list_empty(&fs_info->allocated_ebs)); 81 spin_lock_irqsave(&fs_info->eb_leak_lock, flags); 82 while (!list_empty(&fs_info->allocated_ebs)) { 83 eb = list_first_entry(&fs_info->allocated_ebs, 84 struct extent_buffer, leak_list); 85 pr_err( 86 "BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n", 87 eb->start, eb->len, atomic_read(&eb->refs), eb->bflags, 88 btrfs_header_owner(eb)); 89 list_del(&eb->leak_list); 90 kmem_cache_free(extent_buffer_cache, eb); 91 } 92 spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); 93 } 94 95 static inline void btrfs_extent_state_leak_debug_check(void) 96 { 97 struct extent_state *state; 98 99 while (!list_empty(&states)) { 100 state = list_entry(states.next, struct extent_state, leak_list); 101 pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n", 102 state->start, state->end, state->state, 103 extent_state_in_tree(state), 104 refcount_read(&state->refs)); 105 list_del(&state->leak_list); 106 kmem_cache_free(extent_state_cache, state); 107 } 108 } 109 110 #define btrfs_debug_check_extent_io_range(tree, start, end) \ 111 __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end)) 112 static inline void __btrfs_debug_check_extent_io_range(const char *caller, 113 struct extent_io_tree *tree, u64 start, u64 end) 114 { 115 struct inode *inode = tree->private_data; 116 u64 isize; 117 118 if (!inode || !is_data_inode(inode)) 119 return; 120 121 isize = i_size_read(inode); 122 if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { 123 btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, 124 "%s: ino %llu isize %llu odd range [%llu,%llu]", 125 caller, btrfs_ino(BTRFS_I(inode)), isize, start, end); 126 } 127 } 128 #else 129 #define btrfs_leak_debug_add(lock, new, head) do {} while (0) 130 #define btrfs_leak_debug_del(lock, entry) do {} while (0) 131 #define btrfs_extent_state_leak_debug_check() do {} while (0) 132 #define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) 133 #endif 134 135 struct tree_entry { 136 u64 start; 137 u64 end; 138 struct rb_node rb_node; 139 }; 140 141 /* 142 * Structure to record info about the bio being assembled, and other info like 143 * how many bytes are there before stripe/ordered extent boundary. 
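 *
 * As an illustration (made-up numbers, not taken from real geometry): if
 * the bio being assembled starts 16K before a stripe boundary,
 * len_to_stripe_boundary is 16K and the page-adding helpers will not let
 * the bio grow past that total size; len_to_oe_boundary plays the same
 * role for the ordered extent covering the range, and whichever boundary
 * is closer caps the bio before it has to be submitted and a new one
 * started.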
144 */ 145 struct btrfs_bio_ctrl { 146 struct bio *bio; 147 int mirror_num; 148 enum btrfs_compression_type compress_type; 149 u32 len_to_stripe_boundary; 150 u32 len_to_oe_boundary; 151 }; 152 153 struct extent_page_data { 154 struct btrfs_bio_ctrl bio_ctrl; 155 /* tells writepage not to lock the state bits for this range 156 * it still does the unlocking 157 */ 158 unsigned int extent_locked:1; 159 160 /* tells the submit_bio code to use REQ_SYNC */ 161 unsigned int sync_io:1; 162 }; 163 164 static int add_extent_changeset(struct extent_state *state, u32 bits, 165 struct extent_changeset *changeset, 166 int set) 167 { 168 int ret; 169 170 if (!changeset) 171 return 0; 172 if (set && (state->state & bits) == bits) 173 return 0; 174 if (!set && (state->state & bits) == 0) 175 return 0; 176 changeset->bytes_changed += state->end - state->start + 1; 177 ret = ulist_add(&changeset->range_changed, state->start, state->end, 178 GFP_ATOMIC); 179 return ret; 180 } 181 182 static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) 183 { 184 struct bio *bio; 185 struct bio_vec *bv; 186 struct inode *inode; 187 int mirror_num; 188 189 if (!bio_ctrl->bio) 190 return; 191 192 bio = bio_ctrl->bio; 193 bv = bio_first_bvec_all(bio); 194 inode = bv->bv_page->mapping->host; 195 mirror_num = bio_ctrl->mirror_num; 196 197 /* Caller should ensure the bio has at least some range added */ 198 ASSERT(bio->bi_iter.bi_size); 199 200 btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset; 201 202 if (!is_data_inode(inode)) 203 btrfs_submit_metadata_bio(inode, bio, mirror_num); 204 else if (btrfs_op(bio) == BTRFS_MAP_WRITE) 205 btrfs_submit_data_write_bio(inode, bio, mirror_num); 206 else 207 btrfs_submit_data_read_bio(inode, bio, mirror_num, 208 bio_ctrl->compress_type); 209 210 /* The bio is owned by the bi_end_io handler now */ 211 bio_ctrl->bio = NULL; 212 } 213 214 /* 215 * Submit or fail the current bio in an extent_page_data structure. 
216 */ 217 static void submit_write_bio(struct extent_page_data *epd, int ret) 218 { 219 struct bio *bio = epd->bio_ctrl.bio; 220 221 if (!bio) 222 return; 223 224 if (ret) { 225 ASSERT(ret < 0); 226 bio->bi_status = errno_to_blk_status(ret); 227 bio_endio(bio); 228 /* The bio is owned by the bi_end_io handler now */ 229 epd->bio_ctrl.bio = NULL; 230 } else { 231 submit_one_bio(&epd->bio_ctrl); 232 } 233 } 234 235 int __init extent_state_cache_init(void) 236 { 237 extent_state_cache = kmem_cache_create("btrfs_extent_state", 238 sizeof(struct extent_state), 0, 239 SLAB_MEM_SPREAD, NULL); 240 if (!extent_state_cache) 241 return -ENOMEM; 242 return 0; 243 } 244 245 int __init extent_io_init(void) 246 { 247 extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", 248 sizeof(struct extent_buffer), 0, 249 SLAB_MEM_SPREAD, NULL); 250 if (!extent_buffer_cache) 251 return -ENOMEM; 252 253 if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, 254 offsetof(struct btrfs_bio, bio), 255 BIOSET_NEED_BVECS)) 256 goto free_buffer_cache; 257 258 if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE)) 259 goto free_bioset; 260 261 return 0; 262 263 free_bioset: 264 bioset_exit(&btrfs_bioset); 265 266 free_buffer_cache: 267 kmem_cache_destroy(extent_buffer_cache); 268 extent_buffer_cache = NULL; 269 return -ENOMEM; 270 } 271 272 void __cold extent_state_cache_exit(void) 273 { 274 btrfs_extent_state_leak_debug_check(); 275 kmem_cache_destroy(extent_state_cache); 276 } 277 278 void __cold extent_io_exit(void) 279 { 280 /* 281 * Make sure all delayed rcu free are flushed before we 282 * destroy caches. 283 */ 284 rcu_barrier(); 285 kmem_cache_destroy(extent_buffer_cache); 286 bioset_exit(&btrfs_bioset); 287 } 288 289 /* 290 * For the file_extent_tree, we want to hold the inode lock when we lookup and 291 * update the disk_i_size, but lockdep will complain because our io_tree we hold 292 * the tree lock and get the inode lock when setting delalloc. These two things 293 * are unrelated, so make a class for the file_extent_tree so we don't get the 294 * two locking patterns mixed up. 295 */ 296 static struct lock_class_key file_extent_tree_class; 297 298 void extent_io_tree_init(struct btrfs_fs_info *fs_info, 299 struct extent_io_tree *tree, unsigned int owner, 300 void *private_data) 301 { 302 tree->fs_info = fs_info; 303 tree->state = RB_ROOT; 304 tree->dirty_bytes = 0; 305 spin_lock_init(&tree->lock); 306 tree->private_data = private_data; 307 tree->owner = owner; 308 if (owner == IO_TREE_INODE_FILE_EXTENT) 309 lockdep_set_class(&tree->lock, &file_extent_tree_class); 310 } 311 312 void extent_io_tree_release(struct extent_io_tree *tree) 313 { 314 spin_lock(&tree->lock); 315 /* 316 * Do a single barrier for the waitqueue_active check here, the state 317 * of the waitqueue should not change once extent_io_tree_release is 318 * called. 319 */ 320 smp_mb(); 321 while (!RB_EMPTY_ROOT(&tree->state)) { 322 struct rb_node *node; 323 struct extent_state *state; 324 325 node = rb_first(&tree->state); 326 state = rb_entry(node, struct extent_state, rb_node); 327 rb_erase(&state->rb_node, &tree->state); 328 RB_CLEAR_NODE(&state->rb_node); 329 /* 330 * btree io trees aren't supposed to have tasks waiting for 331 * changes in the flags of extent states ever. 
332 */ 333 ASSERT(!waitqueue_active(&state->wq)); 334 free_extent_state(state); 335 336 cond_resched_lock(&tree->lock); 337 } 338 spin_unlock(&tree->lock); 339 } 340 341 static struct extent_state *alloc_extent_state(gfp_t mask) 342 { 343 struct extent_state *state; 344 345 /* 346 * The given mask might be not appropriate for the slab allocator, 347 * drop the unsupported bits 348 */ 349 mask &= ~(__GFP_DMA32|__GFP_HIGHMEM); 350 state = kmem_cache_alloc(extent_state_cache, mask); 351 if (!state) 352 return state; 353 state->state = 0; 354 state->failrec = NULL; 355 RB_CLEAR_NODE(&state->rb_node); 356 btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states); 357 refcount_set(&state->refs, 1); 358 init_waitqueue_head(&state->wq); 359 trace_alloc_extent_state(state, mask, _RET_IP_); 360 return state; 361 } 362 363 void free_extent_state(struct extent_state *state) 364 { 365 if (!state) 366 return; 367 if (refcount_dec_and_test(&state->refs)) { 368 WARN_ON(extent_state_in_tree(state)); 369 btrfs_leak_debug_del(&leak_lock, &state->leak_list); 370 trace_free_extent_state(state, _RET_IP_); 371 kmem_cache_free(extent_state_cache, state); 372 } 373 } 374 375 /** 376 * Search @tree for an entry that contains @offset. Such entry would have 377 * entry->start <= offset && entry->end >= offset. 378 * 379 * @tree: the tree to search 380 * @offset: offset that should fall within an entry in @tree 381 * @node_ret: pointer where new node should be anchored (used when inserting an 382 * entry in the tree) 383 * @parent_ret: points to entry which would have been the parent of the entry, 384 * containing @offset 385 * 386 * Return a pointer to the entry that contains @offset byte address and don't change 387 * @node_ret and @parent_ret. 388 * 389 * If no such entry exists, return pointer to entry that ends before @offset 390 * and fill parameters @node_ret and @parent_ret, ie. does not return NULL. 391 */ 392 static inline struct rb_node *tree_search_for_insert(struct extent_io_tree *tree, 393 u64 offset, 394 struct rb_node ***node_ret, 395 struct rb_node **parent_ret) 396 { 397 struct rb_root *root = &tree->state; 398 struct rb_node **node = &root->rb_node; 399 struct rb_node *prev = NULL; 400 struct tree_entry *entry; 401 402 while (*node) { 403 prev = *node; 404 entry = rb_entry(prev, struct tree_entry, rb_node); 405 406 if (offset < entry->start) 407 node = &(*node)->rb_left; 408 else if (offset > entry->end) 409 node = &(*node)->rb_right; 410 else 411 return *node; 412 } 413 414 if (node_ret) 415 *node_ret = node; 416 if (parent_ret) 417 *parent_ret = prev; 418 419 /* Search neighbors until we find the first one past the end */ 420 while (prev && offset > entry->end) { 421 prev = rb_next(prev); 422 entry = rb_entry(prev, struct tree_entry, rb_node); 423 } 424 425 return prev; 426 } 427 428 /* 429 * Inexact rb-tree search, return the next entry if @offset is not found 430 */ 431 static inline struct rb_node *tree_search(struct extent_io_tree *tree, u64 offset) 432 { 433 return tree_search_for_insert(tree, offset, NULL, NULL); 434 } 435 436 /** 437 * Search offset in the tree or fill neighbor rbtree node pointers. 438 * 439 * @tree: the tree to search 440 * @offset: offset that should fall within an entry in @tree 441 * @next_ret: pointer to the first entry whose range ends after @offset 442 * @prev_ret: pointer to the first entry whose range begins before @offset 443 * 444 * Return a pointer to the entry that contains @offset byte address. 
If no 445 * such entry exists, then return NULL and fill @prev_ret and @next_ret. 446 * Otherwise return the found entry and other pointers are left untouched. 447 */ 448 static struct rb_node *tree_search_prev_next(struct extent_io_tree *tree, 449 u64 offset, 450 struct rb_node **prev_ret, 451 struct rb_node **next_ret) 452 { 453 struct rb_root *root = &tree->state; 454 struct rb_node **node = &root->rb_node; 455 struct rb_node *prev = NULL; 456 struct rb_node *orig_prev = NULL; 457 struct tree_entry *entry; 458 459 ASSERT(prev_ret); 460 ASSERT(next_ret); 461 462 while (*node) { 463 prev = *node; 464 entry = rb_entry(prev, struct tree_entry, rb_node); 465 466 if (offset < entry->start) 467 node = &(*node)->rb_left; 468 else if (offset > entry->end) 469 node = &(*node)->rb_right; 470 else 471 return *node; 472 } 473 474 orig_prev = prev; 475 while (prev && offset > entry->end) { 476 prev = rb_next(prev); 477 entry = rb_entry(prev, struct tree_entry, rb_node); 478 } 479 *next_ret = prev; 480 prev = orig_prev; 481 482 entry = rb_entry(prev, struct tree_entry, rb_node); 483 while (prev && offset < entry->start) { 484 prev = rb_prev(prev); 485 entry = rb_entry(prev, struct tree_entry, rb_node); 486 } 487 *prev_ret = prev; 488 489 return NULL; 490 } 491 492 /* 493 * utility function to look for merge candidates inside a given range. 494 * Any extents with matching state are merged together into a single 495 * extent in the tree. Extents with EXTENT_IO in their state field 496 * are not merged because the end_io handlers need to be able to do 497 * operations on them without sleeping (or doing allocations/splits). 498 * 499 * This should be called with the tree lock held. 500 */ 501 static void merge_state(struct extent_io_tree *tree, 502 struct extent_state *state) 503 { 504 struct extent_state *other; 505 struct rb_node *other_node; 506 507 if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY)) 508 return; 509 510 other_node = rb_prev(&state->rb_node); 511 if (other_node) { 512 other = rb_entry(other_node, struct extent_state, rb_node); 513 if (other->end == state->start - 1 && 514 other->state == state->state) { 515 if (tree->private_data && 516 is_data_inode(tree->private_data)) 517 btrfs_merge_delalloc_extent(tree->private_data, 518 state, other); 519 state->start = other->start; 520 rb_erase(&other->rb_node, &tree->state); 521 RB_CLEAR_NODE(&other->rb_node); 522 free_extent_state(other); 523 } 524 } 525 other_node = rb_next(&state->rb_node); 526 if (other_node) { 527 other = rb_entry(other_node, struct extent_state, rb_node); 528 if (other->start == state->end + 1 && 529 other->state == state->state) { 530 if (tree->private_data && 531 is_data_inode(tree->private_data)) 532 btrfs_merge_delalloc_extent(tree->private_data, 533 state, other); 534 state->end = other->end; 535 rb_erase(&other->rb_node, &tree->state); 536 RB_CLEAR_NODE(&other->rb_node); 537 free_extent_state(other); 538 } 539 } 540 } 541 542 static void set_state_bits(struct extent_io_tree *tree, 543 struct extent_state *state, u32 bits, 544 struct extent_changeset *changeset); 545 546 /* 547 * insert an extent_state struct into the tree. 'bits' are set on the 548 * struct before it is inserted. 549 * 550 * This may return -EEXIST if the extent is already there, in which case the 551 * state struct is freed. 552 * 553 * The tree lock is not taken internally. This is a utility function and 554 * probably isn't what you want to call (see set/clear_extent_bit). 
555 */ 556 static int insert_state(struct extent_io_tree *tree, 557 struct extent_state *state, 558 u32 bits, struct extent_changeset *changeset) 559 { 560 struct rb_node **node; 561 struct rb_node *parent; 562 const u64 end = state->end; 563 564 set_state_bits(tree, state, bits, changeset); 565 566 node = &tree->state.rb_node; 567 while (*node) { 568 struct tree_entry *entry; 569 570 parent = *node; 571 entry = rb_entry(parent, struct tree_entry, rb_node); 572 573 if (end < entry->start) { 574 node = &(*node)->rb_left; 575 } else if (end > entry->end) { 576 node = &(*node)->rb_right; 577 } else { 578 btrfs_err(tree->fs_info, 579 "found node %llu %llu on insert of %llu %llu", 580 entry->start, entry->end, state->start, end); 581 return -EEXIST; 582 } 583 } 584 585 rb_link_node(&state->rb_node, parent, node); 586 rb_insert_color(&state->rb_node, &tree->state); 587 588 merge_state(tree, state); 589 return 0; 590 } 591 592 /* 593 * Insert state to @tree to the location given by @node and @parent. 594 */ 595 static void insert_state_fast(struct extent_io_tree *tree, 596 struct extent_state *state, struct rb_node **node, 597 struct rb_node *parent, unsigned bits, 598 struct extent_changeset *changeset) 599 { 600 set_state_bits(tree, state, bits, changeset); 601 rb_link_node(&state->rb_node, parent, node); 602 rb_insert_color(&state->rb_node, &tree->state); 603 merge_state(tree, state); 604 } 605 606 /* 607 * split a given extent state struct in two, inserting the preallocated 608 * struct 'prealloc' as the newly created second half. 'split' indicates an 609 * offset inside 'orig' where it should be split. 610 * 611 * Before calling, 612 * the tree has 'orig' at [orig->start, orig->end]. After calling, there 613 * are two extent state structs in the tree: 614 * prealloc: [orig->start, split - 1] 615 * orig: [ split, orig->end ] 616 * 617 * The tree locks are not taken by this function. They need to be held 618 * by the caller. 619 */ 620 static int split_state(struct extent_io_tree *tree, struct extent_state *orig, 621 struct extent_state *prealloc, u64 split) 622 { 623 struct rb_node *parent = NULL; 624 struct rb_node **node; 625 626 if (tree->private_data && is_data_inode(tree->private_data)) 627 btrfs_split_delalloc_extent(tree->private_data, orig, split); 628 629 prealloc->start = orig->start; 630 prealloc->end = split - 1; 631 prealloc->state = orig->state; 632 orig->start = split; 633 634 parent = &orig->rb_node; 635 node = &parent; 636 while (*node) { 637 struct tree_entry *entry; 638 639 parent = *node; 640 entry = rb_entry(parent, struct tree_entry, rb_node); 641 642 if (prealloc->end < entry->start) { 643 node = &(*node)->rb_left; 644 } else if (prealloc->end > entry->end) { 645 node = &(*node)->rb_right; 646 } else { 647 free_extent_state(prealloc); 648 return -EEXIST; 649 } 650 } 651 652 rb_link_node(&prealloc->rb_node, parent, node); 653 rb_insert_color(&prealloc->rb_node, &tree->state); 654 655 return 0; 656 } 657 658 static struct extent_state *next_state(struct extent_state *state) 659 { 660 struct rb_node *next = rb_next(&state->rb_node); 661 if (next) 662 return rb_entry(next, struct extent_state, rb_node); 663 else 664 return NULL; 665 } 666 667 /* 668 * utility function to clear some bits in an extent state struct. 669 * it will optionally wake up anyone waiting on this state (wake == 1). 
670 * 671 * If no bits are set on the state struct after clearing things, the 672 * struct is freed and removed from the tree 673 */ 674 static struct extent_state *clear_state_bit(struct extent_io_tree *tree, 675 struct extent_state *state, 676 u32 bits, int wake, 677 struct extent_changeset *changeset) 678 { 679 struct extent_state *next; 680 u32 bits_to_clear = bits & ~EXTENT_CTLBITS; 681 int ret; 682 683 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 684 u64 range = state->end - state->start + 1; 685 WARN_ON(range > tree->dirty_bytes); 686 tree->dirty_bytes -= range; 687 } 688 689 if (tree->private_data && is_data_inode(tree->private_data)) 690 btrfs_clear_delalloc_extent(tree->private_data, state, bits); 691 692 ret = add_extent_changeset(state, bits_to_clear, changeset, 0); 693 BUG_ON(ret < 0); 694 state->state &= ~bits_to_clear; 695 if (wake) 696 wake_up(&state->wq); 697 if (state->state == 0) { 698 next = next_state(state); 699 if (extent_state_in_tree(state)) { 700 rb_erase(&state->rb_node, &tree->state); 701 RB_CLEAR_NODE(&state->rb_node); 702 free_extent_state(state); 703 } else { 704 WARN_ON(1); 705 } 706 } else { 707 merge_state(tree, state); 708 next = next_state(state); 709 } 710 return next; 711 } 712 713 static struct extent_state * 714 alloc_extent_state_atomic(struct extent_state *prealloc) 715 { 716 if (!prealloc) 717 prealloc = alloc_extent_state(GFP_ATOMIC); 718 719 return prealloc; 720 } 721 722 static void extent_io_tree_panic(struct extent_io_tree *tree, int err) 723 { 724 btrfs_panic(tree->fs_info, err, 725 "locking error: extent tree was modified by another thread while locked"); 726 } 727 728 /* 729 * clear some bits on a range in the tree. This may require splitting 730 * or inserting elements in the tree, so the gfp mask is used to 731 * indicate which allocations or sleeping are allowed. 732 * 733 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove 734 * the given range from the tree regardless of state (ie for truncate). 735 * 736 * the range [start, end] is inclusive. 737 * 738 * This takes the tree lock, and returns 0 on success and < 0 on error. 739 */ 740 int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 741 u32 bits, int wake, int delete, 742 struct extent_state **cached_state, 743 gfp_t mask, struct extent_changeset *changeset) 744 { 745 struct extent_state *state; 746 struct extent_state *cached; 747 struct extent_state *prealloc = NULL; 748 struct rb_node *node; 749 u64 last_end; 750 int err; 751 int clear = 0; 752 753 btrfs_debug_check_extent_io_range(tree, start, end); 754 trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits); 755 756 if (bits & EXTENT_DELALLOC) 757 bits |= EXTENT_NORESERVE; 758 759 if (delete) 760 bits |= ~EXTENT_CTLBITS; 761 762 if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)) 763 clear = 1; 764 again: 765 if (!prealloc && gfpflags_allow_blocking(mask)) { 766 /* 767 * Don't care for allocation failure here because we might end 768 * up not needing the pre-allocated extent state at all, which 769 * is the case if we only have in the tree extent states that 770 * cover our input range and don't cover too any other range. 771 * If we end up needing a new extent state we allocate it later. 
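 * If one of the splits below does end up needing a state and this
 * pre-allocation failed, alloc_extent_state_atomic() retries the
 * allocation with GFP_ATOMIC as a last resort.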
772 */ 773 prealloc = alloc_extent_state(mask); 774 } 775 776 spin_lock(&tree->lock); 777 if (cached_state) { 778 cached = *cached_state; 779 780 if (clear) { 781 *cached_state = NULL; 782 cached_state = NULL; 783 } 784 785 if (cached && extent_state_in_tree(cached) && 786 cached->start <= start && cached->end > start) { 787 if (clear) 788 refcount_dec(&cached->refs); 789 state = cached; 790 goto hit_next; 791 } 792 if (clear) 793 free_extent_state(cached); 794 } 795 /* 796 * this search will find the extents that end after 797 * our range starts 798 */ 799 node = tree_search(tree, start); 800 if (!node) 801 goto out; 802 state = rb_entry(node, struct extent_state, rb_node); 803 hit_next: 804 if (state->start > end) 805 goto out; 806 WARN_ON(state->end < start); 807 last_end = state->end; 808 809 /* the state doesn't have the wanted bits, go ahead */ 810 if (!(state->state & bits)) { 811 state = next_state(state); 812 goto next; 813 } 814 815 /* 816 * | ---- desired range ---- | 817 * | state | or 818 * | ------------- state -------------- | 819 * 820 * We need to split the extent we found, and may flip 821 * bits on second half. 822 * 823 * If the extent we found extends past our range, we 824 * just split and search again. It'll get split again 825 * the next time though. 826 * 827 * If the extent we found is inside our range, we clear 828 * the desired bit on it. 829 */ 830 831 if (state->start < start) { 832 prealloc = alloc_extent_state_atomic(prealloc); 833 BUG_ON(!prealloc); 834 err = split_state(tree, state, prealloc, start); 835 if (err) 836 extent_io_tree_panic(tree, err); 837 838 prealloc = NULL; 839 if (err) 840 goto out; 841 if (state->end <= end) { 842 state = clear_state_bit(tree, state, bits, wake, changeset); 843 goto next; 844 } 845 goto search_again; 846 } 847 /* 848 * | ---- desired range ---- | 849 * | state | 850 * We need to split the extent, and clear the bit 851 * on the first half 852 */ 853 if (state->start <= end && state->end > end) { 854 prealloc = alloc_extent_state_atomic(prealloc); 855 BUG_ON(!prealloc); 856 err = split_state(tree, state, prealloc, end + 1); 857 if (err) 858 extent_io_tree_panic(tree, err); 859 860 if (wake) 861 wake_up(&state->wq); 862 863 clear_state_bit(tree, prealloc, bits, wake, changeset); 864 865 prealloc = NULL; 866 goto out; 867 } 868 869 state = clear_state_bit(tree, state, bits, wake, changeset); 870 next: 871 if (last_end == (u64)-1) 872 goto out; 873 start = last_end + 1; 874 if (start <= end && state && !need_resched()) 875 goto hit_next; 876 877 search_again: 878 if (start > end) 879 goto out; 880 spin_unlock(&tree->lock); 881 if (gfpflags_allow_blocking(mask)) 882 cond_resched(); 883 goto again; 884 885 out: 886 spin_unlock(&tree->lock); 887 if (prealloc) 888 free_extent_state(prealloc); 889 890 return 0; 891 892 } 893 894 static void wait_on_state(struct extent_io_tree *tree, 895 struct extent_state *state) 896 __releases(tree->lock) 897 __acquires(tree->lock) 898 { 899 DEFINE_WAIT(wait); 900 prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); 901 spin_unlock(&tree->lock); 902 schedule(); 903 spin_lock(&tree->lock); 904 finish_wait(&state->wq, &wait); 905 } 906 907 /* 908 * waits for one or more bits to clear on a range in the state tree. 909 * The range [start, end] is inclusive. 
910 * The tree lock is taken by this function 911 */ 912 static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 913 u32 bits) 914 { 915 struct extent_state *state; 916 struct rb_node *node; 917 918 btrfs_debug_check_extent_io_range(tree, start, end); 919 920 spin_lock(&tree->lock); 921 again: 922 while (1) { 923 /* 924 * this search will find all the extents that end after 925 * our range starts 926 */ 927 node = tree_search(tree, start); 928 process_node: 929 if (!node) 930 break; 931 932 state = rb_entry(node, struct extent_state, rb_node); 933 934 if (state->start > end) 935 goto out; 936 937 if (state->state & bits) { 938 start = state->start; 939 refcount_inc(&state->refs); 940 wait_on_state(tree, state); 941 free_extent_state(state); 942 goto again; 943 } 944 start = state->end + 1; 945 946 if (start > end) 947 break; 948 949 if (!cond_resched_lock(&tree->lock)) { 950 node = rb_next(node); 951 goto process_node; 952 } 953 } 954 out: 955 spin_unlock(&tree->lock); 956 } 957 958 static void set_state_bits(struct extent_io_tree *tree, 959 struct extent_state *state, 960 u32 bits, struct extent_changeset *changeset) 961 { 962 u32 bits_to_set = bits & ~EXTENT_CTLBITS; 963 int ret; 964 965 if (tree->private_data && is_data_inode(tree->private_data)) 966 btrfs_set_delalloc_extent(tree->private_data, state, bits); 967 968 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { 969 u64 range = state->end - state->start + 1; 970 tree->dirty_bytes += range; 971 } 972 ret = add_extent_changeset(state, bits_to_set, changeset, 1); 973 BUG_ON(ret < 0); 974 state->state |= bits_to_set; 975 } 976 977 static void cache_state_if_flags(struct extent_state *state, 978 struct extent_state **cached_ptr, 979 unsigned flags) 980 { 981 if (cached_ptr && !(*cached_ptr)) { 982 if (!flags || (state->state & flags)) { 983 *cached_ptr = state; 984 refcount_inc(&state->refs); 985 } 986 } 987 } 988 989 static void cache_state(struct extent_state *state, 990 struct extent_state **cached_ptr) 991 { 992 return cache_state_if_flags(state, cached_ptr, 993 EXTENT_LOCKED | EXTENT_BOUNDARY); 994 } 995 996 /* 997 * set some bits on a range in the tree. This may require allocations or 998 * sleeping, so the gfp mask is used to indicate what is allowed. 999 * 1000 * If any of the exclusive bits are set, this will fail with -EEXIST if some 1001 * part of the range already has the desired bits set. The start of the 1002 * existing range is returned in failed_start in this case. 1003 * 1004 * [start, end] is inclusive This takes the tree lock. 
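 *
 * A minimal calling sketch (illustrative only): a plain setter in the
 * style of the set_extent_dirty() wrapper boils down to
 *
 *	set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, NULL,
 *		       GFP_NOFS, NULL);
 *
 * while lock_extent_bits() further down passes EXTENT_LOCKED as both @bits
 * and @exclusive_bits together with a &failed_start, so that an overlapping
 * lock fails with -EEXIST and reports where the conflicting range starts.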
1005 */ 1006 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, 1007 u32 exclusive_bits, u64 *failed_start, 1008 struct extent_state **cached_state, gfp_t mask, 1009 struct extent_changeset *changeset) 1010 { 1011 struct extent_state *state; 1012 struct extent_state *prealloc = NULL; 1013 struct rb_node *node; 1014 struct rb_node **p; 1015 struct rb_node *parent; 1016 int err = 0; 1017 u64 last_start; 1018 u64 last_end; 1019 1020 btrfs_debug_check_extent_io_range(tree, start, end); 1021 trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits); 1022 1023 if (exclusive_bits) 1024 ASSERT(failed_start); 1025 else 1026 ASSERT(failed_start == NULL); 1027 again: 1028 if (!prealloc && gfpflags_allow_blocking(mask)) { 1029 /* 1030 * Don't care for allocation failure here because we might end 1031 * up not needing the pre-allocated extent state at all, which 1032 * is the case if we only have in the tree extent states that 1033 * cover our input range and don't cover too any other range. 1034 * If we end up needing a new extent state we allocate it later. 1035 */ 1036 prealloc = alloc_extent_state(mask); 1037 } 1038 1039 spin_lock(&tree->lock); 1040 if (cached_state && *cached_state) { 1041 state = *cached_state; 1042 if (state->start <= start && state->end > start && 1043 extent_state_in_tree(state)) { 1044 node = &state->rb_node; 1045 goto hit_next; 1046 } 1047 } 1048 /* 1049 * this search will find all the extents that end after 1050 * our range starts. 1051 */ 1052 node = tree_search_for_insert(tree, start, &p, &parent); 1053 if (!node) { 1054 prealloc = alloc_extent_state_atomic(prealloc); 1055 BUG_ON(!prealloc); 1056 prealloc->start = start; 1057 prealloc->end = end; 1058 insert_state_fast(tree, prealloc, p, parent, bits, changeset); 1059 cache_state(prealloc, cached_state); 1060 prealloc = NULL; 1061 goto out; 1062 } 1063 state = rb_entry(node, struct extent_state, rb_node); 1064 hit_next: 1065 last_start = state->start; 1066 last_end = state->end; 1067 1068 /* 1069 * | ---- desired range ---- | 1070 * | state | 1071 * 1072 * Just lock what we found and keep going 1073 */ 1074 if (state->start == start && state->end <= end) { 1075 if (state->state & exclusive_bits) { 1076 *failed_start = state->start; 1077 err = -EEXIST; 1078 goto out; 1079 } 1080 1081 set_state_bits(tree, state, bits, changeset); 1082 cache_state(state, cached_state); 1083 merge_state(tree, state); 1084 if (last_end == (u64)-1) 1085 goto out; 1086 start = last_end + 1; 1087 state = next_state(state); 1088 if (start < end && state && state->start == start && 1089 !need_resched()) 1090 goto hit_next; 1091 goto search_again; 1092 } 1093 1094 /* 1095 * | ---- desired range ---- | 1096 * | state | 1097 * or 1098 * | ------------- state -------------- | 1099 * 1100 * We need to split the extent we found, and may flip bits on 1101 * second half. 1102 * 1103 * If the extent we found extends past our 1104 * range, we just split and search again. It'll get split 1105 * again the next time though. 1106 * 1107 * If the extent we found is inside our range, we set the 1108 * desired bit on it. 1109 */ 1110 if (state->start < start) { 1111 if (state->state & exclusive_bits) { 1112 *failed_start = start; 1113 err = -EEXIST; 1114 goto out; 1115 } 1116 1117 /* 1118 * If this extent already has all the bits we want set, then 1119 * skip it, not necessary to split it or do anything with it. 
1120 */ 1121 if ((state->state & bits) == bits) { 1122 start = state->end + 1; 1123 cache_state(state, cached_state); 1124 goto search_again; 1125 } 1126 1127 prealloc = alloc_extent_state_atomic(prealloc); 1128 BUG_ON(!prealloc); 1129 err = split_state(tree, state, prealloc, start); 1130 if (err) 1131 extent_io_tree_panic(tree, err); 1132 1133 prealloc = NULL; 1134 if (err) 1135 goto out; 1136 if (state->end <= end) { 1137 set_state_bits(tree, state, bits, changeset); 1138 cache_state(state, cached_state); 1139 merge_state(tree, state); 1140 if (last_end == (u64)-1) 1141 goto out; 1142 start = last_end + 1; 1143 state = next_state(state); 1144 if (start < end && state && state->start == start && 1145 !need_resched()) 1146 goto hit_next; 1147 } 1148 goto search_again; 1149 } 1150 /* 1151 * | ---- desired range ---- | 1152 * | state | or | state | 1153 * 1154 * There's a hole, we need to insert something in it and 1155 * ignore the extent we found. 1156 */ 1157 if (state->start > start) { 1158 u64 this_end; 1159 if (end < last_start) 1160 this_end = end; 1161 else 1162 this_end = last_start - 1; 1163 1164 prealloc = alloc_extent_state_atomic(prealloc); 1165 BUG_ON(!prealloc); 1166 1167 /* 1168 * Avoid to free 'prealloc' if it can be merged with 1169 * the later extent. 1170 */ 1171 prealloc->start = start; 1172 prealloc->end = this_end; 1173 err = insert_state(tree, prealloc, bits, changeset); 1174 if (err) 1175 extent_io_tree_panic(tree, err); 1176 1177 cache_state(prealloc, cached_state); 1178 prealloc = NULL; 1179 start = this_end + 1; 1180 goto search_again; 1181 } 1182 /* 1183 * | ---- desired range ---- | 1184 * | state | 1185 * We need to split the extent, and set the bit 1186 * on the first half 1187 */ 1188 if (state->start <= end && state->end > end) { 1189 if (state->state & exclusive_bits) { 1190 *failed_start = start; 1191 err = -EEXIST; 1192 goto out; 1193 } 1194 1195 prealloc = alloc_extent_state_atomic(prealloc); 1196 BUG_ON(!prealloc); 1197 err = split_state(tree, state, prealloc, end + 1); 1198 if (err) 1199 extent_io_tree_panic(tree, err); 1200 1201 set_state_bits(tree, prealloc, bits, changeset); 1202 cache_state(prealloc, cached_state); 1203 merge_state(tree, prealloc); 1204 prealloc = NULL; 1205 goto out; 1206 } 1207 1208 search_again: 1209 if (start > end) 1210 goto out; 1211 spin_unlock(&tree->lock); 1212 if (gfpflags_allow_blocking(mask)) 1213 cond_resched(); 1214 goto again; 1215 1216 out: 1217 spin_unlock(&tree->lock); 1218 if (prealloc) 1219 free_extent_state(prealloc); 1220 1221 return err; 1222 1223 } 1224 1225 /** 1226 * convert_extent_bit - convert all bits in a given range from one bit to 1227 * another 1228 * @tree: the io tree to search 1229 * @start: the start offset in bytes 1230 * @end: the end offset in bytes (inclusive) 1231 * @bits: the bits to set in this range 1232 * @clear_bits: the bits to clear in this range 1233 * @cached_state: state that we're going to cache 1234 * 1235 * This will go through and set bits for the given range. If any states exist 1236 * already in this range they are set with the given bit and cleared of the 1237 * clear_bits. This is only meant to be used by things that are mergeable, ie 1238 * converting from say DELALLOC to DIRTY. This is not meant to be used with 1239 * boundary bits like LOCK. 1240 * 1241 * All allocations are done with GFP_NOFS. 
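 *
 * A hedged usage sketch (mirroring the transaction commit path, which
 * converts states in the transaction's dirty_pages tree):
 *
 *	convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
 *			   EXTENT_DIRTY, &cached_state);
 *
 * sets EXTENT_NEED_WAIT and clears EXTENT_DIRTY on every state inside
 * [start, end].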
1242 */ 1243 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 1244 u32 bits, u32 clear_bits, 1245 struct extent_state **cached_state) 1246 { 1247 struct extent_state *state; 1248 struct extent_state *prealloc = NULL; 1249 struct rb_node *node; 1250 struct rb_node **p; 1251 struct rb_node *parent; 1252 int err = 0; 1253 u64 last_start; 1254 u64 last_end; 1255 bool first_iteration = true; 1256 1257 btrfs_debug_check_extent_io_range(tree, start, end); 1258 trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits, 1259 clear_bits); 1260 1261 again: 1262 if (!prealloc) { 1263 /* 1264 * Best effort, don't worry if extent state allocation fails 1265 * here for the first iteration. We might have a cached state 1266 * that matches exactly the target range, in which case no 1267 * extent state allocations are needed. We'll only know this 1268 * after locking the tree. 1269 */ 1270 prealloc = alloc_extent_state(GFP_NOFS); 1271 if (!prealloc && !first_iteration) 1272 return -ENOMEM; 1273 } 1274 1275 spin_lock(&tree->lock); 1276 if (cached_state && *cached_state) { 1277 state = *cached_state; 1278 if (state->start <= start && state->end > start && 1279 extent_state_in_tree(state)) { 1280 node = &state->rb_node; 1281 goto hit_next; 1282 } 1283 } 1284 1285 /* 1286 * this search will find all the extents that end after 1287 * our range starts. 1288 */ 1289 node = tree_search_for_insert(tree, start, &p, &parent); 1290 if (!node) { 1291 prealloc = alloc_extent_state_atomic(prealloc); 1292 if (!prealloc) { 1293 err = -ENOMEM; 1294 goto out; 1295 } 1296 prealloc->start = start; 1297 prealloc->end = end; 1298 insert_state_fast(tree, prealloc, p, parent, bits, NULL); 1299 cache_state(prealloc, cached_state); 1300 prealloc = NULL; 1301 goto out; 1302 } 1303 state = rb_entry(node, struct extent_state, rb_node); 1304 hit_next: 1305 last_start = state->start; 1306 last_end = state->end; 1307 1308 /* 1309 * | ---- desired range ---- | 1310 * | state | 1311 * 1312 * Just lock what we found and keep going 1313 */ 1314 if (state->start == start && state->end <= end) { 1315 set_state_bits(tree, state, bits, NULL); 1316 cache_state(state, cached_state); 1317 state = clear_state_bit(tree, state, clear_bits, 0, NULL); 1318 if (last_end == (u64)-1) 1319 goto out; 1320 start = last_end + 1; 1321 if (start < end && state && state->start == start && 1322 !need_resched()) 1323 goto hit_next; 1324 goto search_again; 1325 } 1326 1327 /* 1328 * | ---- desired range ---- | 1329 * | state | 1330 * or 1331 * | ------------- state -------------- | 1332 * 1333 * We need to split the extent we found, and may flip bits on 1334 * second half. 1335 * 1336 * If the extent we found extends past our 1337 * range, we just split and search again. It'll get split 1338 * again the next time though. 1339 * 1340 * If the extent we found is inside our range, we set the 1341 * desired bit on it. 
1342 */ 1343 if (state->start < start) { 1344 prealloc = alloc_extent_state_atomic(prealloc); 1345 if (!prealloc) { 1346 err = -ENOMEM; 1347 goto out; 1348 } 1349 err = split_state(tree, state, prealloc, start); 1350 if (err) 1351 extent_io_tree_panic(tree, err); 1352 prealloc = NULL; 1353 if (err) 1354 goto out; 1355 if (state->end <= end) { 1356 set_state_bits(tree, state, bits, NULL); 1357 cache_state(state, cached_state); 1358 state = clear_state_bit(tree, state, clear_bits, 0, NULL); 1359 if (last_end == (u64)-1) 1360 goto out; 1361 start = last_end + 1; 1362 if (start < end && state && state->start == start && 1363 !need_resched()) 1364 goto hit_next; 1365 } 1366 goto search_again; 1367 } 1368 /* 1369 * | ---- desired range ---- | 1370 * | state | or | state | 1371 * 1372 * There's a hole, we need to insert something in it and 1373 * ignore the extent we found. 1374 */ 1375 if (state->start > start) { 1376 u64 this_end; 1377 if (end < last_start) 1378 this_end = end; 1379 else 1380 this_end = last_start - 1; 1381 1382 prealloc = alloc_extent_state_atomic(prealloc); 1383 if (!prealloc) { 1384 err = -ENOMEM; 1385 goto out; 1386 } 1387 1388 /* 1389 * Avoid to free 'prealloc' if it can be merged with 1390 * the later extent. 1391 */ 1392 prealloc->start = start; 1393 prealloc->end = this_end; 1394 err = insert_state(tree, prealloc, bits, NULL); 1395 if (err) 1396 extent_io_tree_panic(tree, err); 1397 cache_state(prealloc, cached_state); 1398 prealloc = NULL; 1399 start = this_end + 1; 1400 goto search_again; 1401 } 1402 /* 1403 * | ---- desired range ---- | 1404 * | state | 1405 * We need to split the extent, and set the bit 1406 * on the first half 1407 */ 1408 if (state->start <= end && state->end > end) { 1409 prealloc = alloc_extent_state_atomic(prealloc); 1410 if (!prealloc) { 1411 err = -ENOMEM; 1412 goto out; 1413 } 1414 1415 err = split_state(tree, state, prealloc, end + 1); 1416 if (err) 1417 extent_io_tree_panic(tree, err); 1418 1419 set_state_bits(tree, prealloc, bits, NULL); 1420 cache_state(prealloc, cached_state); 1421 clear_state_bit(tree, prealloc, clear_bits, 0, NULL); 1422 prealloc = NULL; 1423 goto out; 1424 } 1425 1426 search_again: 1427 if (start > end) 1428 goto out; 1429 spin_unlock(&tree->lock); 1430 cond_resched(); 1431 first_iteration = false; 1432 goto again; 1433 1434 out: 1435 spin_unlock(&tree->lock); 1436 if (prealloc) 1437 free_extent_state(prealloc); 1438 1439 return err; 1440 } 1441 1442 /* wrappers around set/clear extent bit */ 1443 int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1444 u32 bits, struct extent_changeset *changeset) 1445 { 1446 /* 1447 * We don't support EXTENT_LOCKED yet, as current changeset will 1448 * record any bits changed, so for EXTENT_LOCKED case, it will 1449 * either fail with -EEXIST or changeset will record the whole 1450 * range. 
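 *
 * A rough usage sketch (illustrative; qgroup data reservation is the
 * typical caller):
 *
 *	struct extent_changeset *changeset = extent_changeset_alloc();
 *
 *	if (!changeset)
 *		return -ENOMEM;
 *	ret = set_record_extent_bits(io_tree, start, end,
 *				     EXTENT_QGROUP_RESERVED, changeset);
 *
 * On success changeset->bytes_changed counts the newly set bytes and
 * changeset->range_changed lists the affected sub-ranges; the caller
 * releases it with extent_changeset_free().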
1451 */ 1452 BUG_ON(bits & EXTENT_LOCKED); 1453 1454 return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS, 1455 changeset); 1456 } 1457 1458 int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end, 1459 u32 bits) 1460 { 1461 return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, 1462 GFP_NOWAIT, NULL); 1463 } 1464 1465 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 1466 u32 bits, int wake, int delete, 1467 struct extent_state **cached) 1468 { 1469 return __clear_extent_bit(tree, start, end, bits, wake, delete, 1470 cached, GFP_NOFS, NULL); 1471 } 1472 1473 int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1474 u32 bits, struct extent_changeset *changeset) 1475 { 1476 /* 1477 * Don't support EXTENT_LOCKED case, same reason as 1478 * set_record_extent_bits(). 1479 */ 1480 BUG_ON(bits & EXTENT_LOCKED); 1481 1482 return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS, 1483 changeset); 1484 } 1485 1486 /* 1487 * either insert or lock state struct between start and end use mask to tell 1488 * us if waiting is desired. 1489 */ 1490 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1491 struct extent_state **cached_state) 1492 { 1493 int err; 1494 u64 failed_start; 1495 1496 while (1) { 1497 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1498 EXTENT_LOCKED, &failed_start, 1499 cached_state, GFP_NOFS, NULL); 1500 if (err == -EEXIST) { 1501 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); 1502 start = failed_start; 1503 } else 1504 break; 1505 WARN_ON(start > end); 1506 } 1507 return err; 1508 } 1509 1510 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) 1511 { 1512 int err; 1513 u64 failed_start; 1514 1515 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, 1516 &failed_start, NULL, GFP_NOFS, NULL); 1517 if (err == -EEXIST) { 1518 if (failed_start > start) 1519 clear_extent_bit(tree, start, failed_start - 1, 1520 EXTENT_LOCKED, 1, 0, NULL); 1521 return 0; 1522 } 1523 return 1; 1524 } 1525 1526 void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) 1527 { 1528 unsigned long index = start >> PAGE_SHIFT; 1529 unsigned long end_index = end >> PAGE_SHIFT; 1530 struct page *page; 1531 1532 while (index <= end_index) { 1533 page = find_get_page(inode->i_mapping, index); 1534 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1535 clear_page_dirty_for_io(page); 1536 put_page(page); 1537 index++; 1538 } 1539 } 1540 1541 void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) 1542 { 1543 struct address_space *mapping = inode->i_mapping; 1544 unsigned long index = start >> PAGE_SHIFT; 1545 unsigned long end_index = end >> PAGE_SHIFT; 1546 struct folio *folio; 1547 1548 while (index <= end_index) { 1549 folio = filemap_get_folio(mapping, index); 1550 filemap_dirty_folio(mapping, folio); 1551 folio_account_redirty(folio); 1552 index += folio_nr_pages(folio); 1553 folio_put(folio); 1554 } 1555 } 1556 1557 /* find the first state struct with 'bits' set after 'start', and 1558 * return it. tree->lock must be held. NULL will returned if 1559 * nothing was found after 'start' 1560 */ 1561 static struct extent_state * 1562 find_first_extent_bit_state(struct extent_io_tree *tree, u64 start, u32 bits) 1563 { 1564 struct rb_node *node; 1565 struct extent_state *state; 1566 1567 /* 1568 * this search will find all the extents that end after 1569 * our range starts. 
1570 */ 1571 node = tree_search(tree, start); 1572 if (!node) 1573 goto out; 1574 1575 while (1) { 1576 state = rb_entry(node, struct extent_state, rb_node); 1577 if (state->end >= start && (state->state & bits)) 1578 return state; 1579 1580 node = rb_next(node); 1581 if (!node) 1582 break; 1583 } 1584 out: 1585 return NULL; 1586 } 1587 1588 /* 1589 * Find the first offset in the io tree with one or more @bits set. 1590 * 1591 * Note: If there are multiple bits set in @bits, any of them will match. 1592 * 1593 * Return 0 if we find something, and update @start_ret and @end_ret. 1594 * Return 1 if we found nothing. 1595 */ 1596 int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 1597 u64 *start_ret, u64 *end_ret, u32 bits, 1598 struct extent_state **cached_state) 1599 { 1600 struct extent_state *state; 1601 int ret = 1; 1602 1603 spin_lock(&tree->lock); 1604 if (cached_state && *cached_state) { 1605 state = *cached_state; 1606 if (state->end == start - 1 && extent_state_in_tree(state)) { 1607 while ((state = next_state(state)) != NULL) { 1608 if (state->state & bits) 1609 goto got_it; 1610 } 1611 free_extent_state(*cached_state); 1612 *cached_state = NULL; 1613 goto out; 1614 } 1615 free_extent_state(*cached_state); 1616 *cached_state = NULL; 1617 } 1618 1619 state = find_first_extent_bit_state(tree, start, bits); 1620 got_it: 1621 if (state) { 1622 cache_state_if_flags(state, cached_state, 0); 1623 *start_ret = state->start; 1624 *end_ret = state->end; 1625 ret = 0; 1626 } 1627 out: 1628 spin_unlock(&tree->lock); 1629 return ret; 1630 } 1631 1632 /** 1633 * Find a contiguous area of bits 1634 * 1635 * @tree: io tree to check 1636 * @start: offset to start the search from 1637 * @start_ret: the first offset we found with the bits set 1638 * @end_ret: the final contiguous range of the bits that were set 1639 * @bits: bits to look for 1640 * 1641 * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges 1642 * to set bits appropriately, and then merge them again. During this time it 1643 * will drop the tree->lock, so use this helper if you want to find the actual 1644 * contiguous area for given bits. We will search to the first bit we find, and 1645 * then walk down the tree until we find a non-contiguous area. The area 1646 * returned will be the full contiguous area with the bits set. 1647 */ 1648 int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, 1649 u64 *start_ret, u64 *end_ret, u32 bits) 1650 { 1651 struct extent_state *state; 1652 int ret = 1; 1653 1654 spin_lock(&tree->lock); 1655 state = find_first_extent_bit_state(tree, start, bits); 1656 if (state) { 1657 *start_ret = state->start; 1658 *end_ret = state->end; 1659 while ((state = next_state(state)) != NULL) { 1660 if (state->start > (*end_ret + 1)) 1661 break; 1662 *end_ret = state->end; 1663 } 1664 ret = 0; 1665 } 1666 spin_unlock(&tree->lock); 1667 return ret; 1668 } 1669 1670 /** 1671 * Find the first range that has @bits not set. This range could start before 1672 * @start. 1673 * 1674 * @tree: the tree to search 1675 * @start: offset at/after which the found extent should start 1676 * @start_ret: records the beginning of the range 1677 * @end_ret: records the end of the range (inclusive) 1678 * @bits: the set of bits which must be unset 1679 * 1680 * Since unallocated range is also considered one which doesn't have the bits 1681 * set it's possible that @end_ret contains -1, this happens in case the range 1682 * spans (last_range_end, end of device]. 
In this case it's up to the caller to 1683 * trim @end_ret to the appropriate size. 1684 */ 1685 void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, 1686 u64 *start_ret, u64 *end_ret, u32 bits) 1687 { 1688 struct extent_state *state; 1689 struct rb_node *node, *prev = NULL, *next; 1690 1691 spin_lock(&tree->lock); 1692 1693 /* Find first extent with bits cleared */ 1694 while (1) { 1695 node = tree_search_prev_next(tree, start, &prev, &next); 1696 if (!node && !next && !prev) { 1697 /* 1698 * Tree is completely empty, send full range and let 1699 * caller deal with it 1700 */ 1701 *start_ret = 0; 1702 *end_ret = -1; 1703 goto out; 1704 } else if (!node && !next) { 1705 /* 1706 * We are past the last allocated chunk, set start at 1707 * the end of the last extent. 1708 */ 1709 state = rb_entry(prev, struct extent_state, rb_node); 1710 *start_ret = state->end + 1; 1711 *end_ret = -1; 1712 goto out; 1713 } else if (!node) { 1714 node = next; 1715 } 1716 /* 1717 * At this point 'node' either contains 'start' or start is 1718 * before 'node' 1719 */ 1720 state = rb_entry(node, struct extent_state, rb_node); 1721 1722 if (in_range(start, state->start, state->end - state->start + 1)) { 1723 if (state->state & bits) { 1724 /* 1725 * |--range with bits sets--| 1726 * | 1727 * start 1728 */ 1729 start = state->end + 1; 1730 } else { 1731 /* 1732 * 'start' falls within a range that doesn't 1733 * have the bits set, so take its start as 1734 * the beginning of the desired range 1735 * 1736 * |--range with bits cleared----| 1737 * | 1738 * start 1739 */ 1740 *start_ret = state->start; 1741 break; 1742 } 1743 } else { 1744 /* 1745 * |---prev range---|---hole/unset---|---node range---| 1746 * | 1747 * start 1748 * 1749 * or 1750 * 1751 * |---hole/unset--||--first node--| 1752 * 0 | 1753 * start 1754 */ 1755 if (prev) { 1756 state = rb_entry(prev, struct extent_state, 1757 rb_node); 1758 *start_ret = state->end + 1; 1759 } else { 1760 *start_ret = 0; 1761 } 1762 break; 1763 } 1764 } 1765 1766 /* 1767 * Find the longest stretch from start until an entry which has the 1768 * bits set 1769 */ 1770 while (1) { 1771 state = rb_entry(node, struct extent_state, rb_node); 1772 if (state->end >= start && !(state->state & bits)) { 1773 *end_ret = state->end; 1774 } else { 1775 *end_ret = state->start - 1; 1776 break; 1777 } 1778 1779 node = rb_next(node); 1780 if (!node) 1781 break; 1782 } 1783 out: 1784 spin_unlock(&tree->lock); 1785 } 1786 1787 /* 1788 * find a contiguous range of bytes in the file marked as delalloc, not 1789 * more than 'max_bytes'. start and end are used to return the range, 1790 * 1791 * true is returned if we find something, false if nothing was in the tree 1792 */ 1793 bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, 1794 u64 *end, u64 max_bytes, 1795 struct extent_state **cached_state) 1796 { 1797 struct rb_node *node; 1798 struct extent_state *state; 1799 u64 cur_start = *start; 1800 bool found = false; 1801 u64 total_bytes = 0; 1802 1803 spin_lock(&tree->lock); 1804 1805 /* 1806 * this search will find all the extents that end after 1807 * our range starts. 
1808 */ 1809 node = tree_search(tree, cur_start); 1810 if (!node) { 1811 *end = (u64)-1; 1812 goto out; 1813 } 1814 1815 while (1) { 1816 state = rb_entry(node, struct extent_state, rb_node); 1817 if (found && (state->start != cur_start || 1818 (state->state & EXTENT_BOUNDARY))) { 1819 goto out; 1820 } 1821 if (!(state->state & EXTENT_DELALLOC)) { 1822 if (!found) 1823 *end = state->end; 1824 goto out; 1825 } 1826 if (!found) { 1827 *start = state->start; 1828 *cached_state = state; 1829 refcount_inc(&state->refs); 1830 } 1831 found = true; 1832 *end = state->end; 1833 cur_start = state->end + 1; 1834 node = rb_next(node); 1835 total_bytes += state->end - state->start + 1; 1836 if (total_bytes >= max_bytes) 1837 break; 1838 if (!node) 1839 break; 1840 } 1841 out: 1842 spin_unlock(&tree->lock); 1843 return found; 1844 } 1845 1846 /* 1847 * Process one page for __process_pages_contig(). 1848 * 1849 * Return >0 if we hit @page == @locked_page. 1850 * Return 0 if we updated the page status. 1851 * Return -EGAIN if the we need to try again. 1852 * (For PAGE_LOCK case but got dirty page or page not belong to mapping) 1853 */ 1854 static int process_one_page(struct btrfs_fs_info *fs_info, 1855 struct address_space *mapping, 1856 struct page *page, struct page *locked_page, 1857 unsigned long page_ops, u64 start, u64 end) 1858 { 1859 u32 len; 1860 1861 ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX); 1862 len = end + 1 - start; 1863 1864 if (page_ops & PAGE_SET_ORDERED) 1865 btrfs_page_clamp_set_ordered(fs_info, page, start, len); 1866 if (page_ops & PAGE_SET_ERROR) 1867 btrfs_page_clamp_set_error(fs_info, page, start, len); 1868 if (page_ops & PAGE_START_WRITEBACK) { 1869 btrfs_page_clamp_clear_dirty(fs_info, page, start, len); 1870 btrfs_page_clamp_set_writeback(fs_info, page, start, len); 1871 } 1872 if (page_ops & PAGE_END_WRITEBACK) 1873 btrfs_page_clamp_clear_writeback(fs_info, page, start, len); 1874 1875 if (page == locked_page) 1876 return 1; 1877 1878 if (page_ops & PAGE_LOCK) { 1879 int ret; 1880 1881 ret = btrfs_page_start_writer_lock(fs_info, page, start, len); 1882 if (ret) 1883 return ret; 1884 if (!PageDirty(page) || page->mapping != mapping) { 1885 btrfs_page_end_writer_lock(fs_info, page, start, len); 1886 return -EAGAIN; 1887 } 1888 } 1889 if (page_ops & PAGE_UNLOCK) 1890 btrfs_page_end_writer_lock(fs_info, page, start, len); 1891 return 0; 1892 } 1893 1894 static int __process_pages_contig(struct address_space *mapping, 1895 struct page *locked_page, 1896 u64 start, u64 end, unsigned long page_ops, 1897 u64 *processed_end) 1898 { 1899 struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb); 1900 pgoff_t start_index = start >> PAGE_SHIFT; 1901 pgoff_t end_index = end >> PAGE_SHIFT; 1902 pgoff_t index = start_index; 1903 unsigned long pages_processed = 0; 1904 struct folio_batch fbatch; 1905 int err = 0; 1906 int i; 1907 1908 if (page_ops & PAGE_LOCK) { 1909 ASSERT(page_ops == PAGE_LOCK); 1910 ASSERT(processed_end && *processed_end == start); 1911 } 1912 1913 if ((page_ops & PAGE_SET_ERROR) && start_index <= end_index) 1914 mapping_set_error(mapping, -EIO); 1915 1916 folio_batch_init(&fbatch); 1917 while (index <= end_index) { 1918 int found_folios; 1919 1920 found_folios = filemap_get_folios_contig(mapping, &index, 1921 end_index, &fbatch); 1922 1923 if (found_folios == 0) { 1924 /* 1925 * Only if we're going to lock these pages, we can find 1926 * nothing at @index. 
1927 */ 1928 ASSERT(page_ops & PAGE_LOCK); 1929 err = -EAGAIN; 1930 goto out; 1931 } 1932 1933 for (i = 0; i < found_folios; i++) { 1934 int process_ret; 1935 struct folio *folio = fbatch.folios[i]; 1936 process_ret = process_one_page(fs_info, mapping, 1937 &folio->page, locked_page, page_ops, 1938 start, end); 1939 if (process_ret < 0) { 1940 err = -EAGAIN; 1941 folio_batch_release(&fbatch); 1942 goto out; 1943 } 1944 pages_processed += folio_nr_pages(folio); 1945 } 1946 folio_batch_release(&fbatch); 1947 cond_resched(); 1948 } 1949 out: 1950 if (err && processed_end) { 1951 /* 1952 * Update @processed_end. I know this is awful since it has 1953 * two different return value patterns (inclusive vs exclusive). 1954 * 1955 * But the exclusive pattern is necessary if @start is 0, or we 1956 * underflow and check against processed_end won't work as 1957 * expected. 1958 */ 1959 if (pages_processed) 1960 *processed_end = min(end, 1961 ((u64)(start_index + pages_processed) << PAGE_SHIFT) - 1); 1962 else 1963 *processed_end = start; 1964 } 1965 return err; 1966 } 1967 1968 static noinline void __unlock_for_delalloc(struct inode *inode, 1969 struct page *locked_page, 1970 u64 start, u64 end) 1971 { 1972 unsigned long index = start >> PAGE_SHIFT; 1973 unsigned long end_index = end >> PAGE_SHIFT; 1974 1975 ASSERT(locked_page); 1976 if (index == locked_page->index && end_index == index) 1977 return; 1978 1979 __process_pages_contig(inode->i_mapping, locked_page, start, end, 1980 PAGE_UNLOCK, NULL); 1981 } 1982 1983 static noinline int lock_delalloc_pages(struct inode *inode, 1984 struct page *locked_page, 1985 u64 delalloc_start, 1986 u64 delalloc_end) 1987 { 1988 unsigned long index = delalloc_start >> PAGE_SHIFT; 1989 unsigned long end_index = delalloc_end >> PAGE_SHIFT; 1990 u64 processed_end = delalloc_start; 1991 int ret; 1992 1993 ASSERT(locked_page); 1994 if (index == locked_page->index && index == end_index) 1995 return 0; 1996 1997 ret = __process_pages_contig(inode->i_mapping, locked_page, delalloc_start, 1998 delalloc_end, PAGE_LOCK, &processed_end); 1999 if (ret == -EAGAIN && processed_end > delalloc_start) 2000 __unlock_for_delalloc(inode, locked_page, delalloc_start, 2001 processed_end); 2002 return ret; 2003 } 2004 2005 /* 2006 * Find and lock a contiguous range of bytes in the file marked as delalloc, no 2007 * more than @max_bytes. 2008 * 2009 * @start: The original start bytenr to search. 2010 * Will store the extent range start bytenr. 2011 * @end: The original end bytenr of the search range 2012 * Will store the extent range end bytenr. 2013 * 2014 * Return true if we find a delalloc range which starts inside the original 2015 * range, and @start/@end will store the delalloc range start/end. 2016 * 2017 * Return false if we can't find any delalloc range which starts inside the 2018 * original range, and @start/@end will be the non-delalloc range start/end. 2019 */ 2020 EXPORT_FOR_TESTS 2021 noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, 2022 struct page *locked_page, u64 *start, 2023 u64 *end) 2024 { 2025 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2026 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 2027 const u64 orig_start = *start; 2028 const u64 orig_end = *end; 2029 /* The sanity tests may not set a valid fs_info. */ 2030 u64 max_bytes = fs_info ? 
fs_info->max_extent_size : BTRFS_MAX_EXTENT_SIZE; 2031 u64 delalloc_start; 2032 u64 delalloc_end; 2033 bool found; 2034 struct extent_state *cached_state = NULL; 2035 int ret; 2036 int loops = 0; 2037 2038 /* Caller should pass a valid @end to indicate the search range end */ 2039 ASSERT(orig_end > orig_start); 2040 2041 /* The range should at least cover part of the page */ 2042 ASSERT(!(orig_start >= page_offset(locked_page) + PAGE_SIZE || 2043 orig_end <= page_offset(locked_page))); 2044 again: 2045 /* step one, find a bunch of delalloc bytes starting at start */ 2046 delalloc_start = *start; 2047 delalloc_end = 0; 2048 found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end, 2049 max_bytes, &cached_state); 2050 if (!found || delalloc_end <= *start || delalloc_start > orig_end) { 2051 *start = delalloc_start; 2052 2053 /* @delalloc_end can be -1, never go beyond @orig_end */ 2054 *end = min(delalloc_end, orig_end); 2055 free_extent_state(cached_state); 2056 return false; 2057 } 2058 2059 /* 2060 * start comes from the offset of locked_page. We have to lock 2061 * pages in order, so we can't process delalloc bytes before 2062 * locked_page 2063 */ 2064 if (delalloc_start < *start) 2065 delalloc_start = *start; 2066 2067 /* 2068 * make sure to limit the number of pages we try to lock down 2069 */ 2070 if (delalloc_end + 1 - delalloc_start > max_bytes) 2071 delalloc_end = delalloc_start + max_bytes - 1; 2072 2073 /* step two, lock all the pages after the page that has start */ 2074 ret = lock_delalloc_pages(inode, locked_page, 2075 delalloc_start, delalloc_end); 2076 ASSERT(!ret || ret == -EAGAIN); 2077 if (ret == -EAGAIN) { 2078 /* some of the pages are gone, lets avoid looping by 2079 * shortening the size of the delalloc range we're searching 2080 */ 2081 free_extent_state(cached_state); 2082 cached_state = NULL; 2083 if (!loops) { 2084 max_bytes = PAGE_SIZE; 2085 loops = 1; 2086 goto again; 2087 } else { 2088 found = false; 2089 goto out_failed; 2090 } 2091 } 2092 2093 /* step three, lock the state bits for the whole range */ 2094 lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state); 2095 2096 /* then test to make sure it is all still delalloc */ 2097 ret = test_range_bit(tree, delalloc_start, delalloc_end, 2098 EXTENT_DELALLOC, 1, cached_state); 2099 if (!ret) { 2100 unlock_extent_cached(tree, delalloc_start, delalloc_end, 2101 &cached_state); 2102 __unlock_for_delalloc(inode, locked_page, 2103 delalloc_start, delalloc_end); 2104 cond_resched(); 2105 goto again; 2106 } 2107 free_extent_state(cached_state); 2108 *start = delalloc_start; 2109 *end = delalloc_end; 2110 out_failed: 2111 return found; 2112 } 2113 2114 void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, 2115 struct page *locked_page, 2116 u32 clear_bits, unsigned long page_ops) 2117 { 2118 clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL); 2119 2120 __process_pages_contig(inode->vfs_inode.i_mapping, locked_page, 2121 start, end, page_ops, NULL); 2122 } 2123 2124 /* 2125 * count the number of bytes in the tree that have a given bit(s) 2126 * set. This can be fairly slow, except for EXTENT_DIRTY which is 2127 * cached. The total number found is returned. 
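 *
 * A minimal usage sketch (hypothetical caller, not taken from this file),
 * counting the contiguous delalloc bytes that start at file offset 0, capped
 * at roughly 1MiB:
 *
 *     u64 found_start = 0;
 *     u64 nr_bytes;
 *
 *     nr_bytes = count_range_bits(tree, &found_start, (u64)-1, SZ_1M,
 *                                 EXTENT_DELALLOC, 1);
 *
 * On return @found_start holds the start of the first matching range and the
 * return value is the number of bytes counted (0 if nothing matched). With
 * @contig set, counting stops at the first gap between matching states.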
2128 */ 2129 u64 count_range_bits(struct extent_io_tree *tree, 2130 u64 *start, u64 search_end, u64 max_bytes, 2131 u32 bits, int contig) 2132 { 2133 struct rb_node *node; 2134 struct extent_state *state; 2135 u64 cur_start = *start; 2136 u64 total_bytes = 0; 2137 u64 last = 0; 2138 int found = 0; 2139 2140 if (WARN_ON(search_end <= cur_start)) 2141 return 0; 2142 2143 spin_lock(&tree->lock); 2144 if (cur_start == 0 && bits == EXTENT_DIRTY) { 2145 total_bytes = tree->dirty_bytes; 2146 goto out; 2147 } 2148 /* 2149 * this search will find all the extents that end after 2150 * our range starts. 2151 */ 2152 node = tree_search(tree, cur_start); 2153 if (!node) 2154 goto out; 2155 2156 while (1) { 2157 state = rb_entry(node, struct extent_state, rb_node); 2158 if (state->start > search_end) 2159 break; 2160 if (contig && found && state->start > last + 1) 2161 break; 2162 if (state->end >= cur_start && (state->state & bits) == bits) { 2163 total_bytes += min(search_end, state->end) + 1 - 2164 max(cur_start, state->start); 2165 if (total_bytes >= max_bytes) 2166 break; 2167 if (!found) { 2168 *start = max(cur_start, state->start); 2169 found = 1; 2170 } 2171 last = state->end; 2172 } else if (contig && found) { 2173 break; 2174 } 2175 node = rb_next(node); 2176 if (!node) 2177 break; 2178 } 2179 out: 2180 spin_unlock(&tree->lock); 2181 return total_bytes; 2182 } 2183 2184 /* 2185 * set the private field for a given byte offset in the tree. If there isn't 2186 * an extent_state there already, this does nothing. 2187 */ 2188 int set_state_failrec(struct extent_io_tree *tree, u64 start, 2189 struct io_failure_record *failrec) 2190 { 2191 struct rb_node *node; 2192 struct extent_state *state; 2193 int ret = 0; 2194 2195 spin_lock(&tree->lock); 2196 /* 2197 * this search will find all the extents that end after 2198 * our range starts. 2199 */ 2200 node = tree_search(tree, start); 2201 if (!node) { 2202 ret = -ENOENT; 2203 goto out; 2204 } 2205 state = rb_entry(node, struct extent_state, rb_node); 2206 if (state->start != start) { 2207 ret = -ENOENT; 2208 goto out; 2209 } 2210 state->failrec = failrec; 2211 out: 2212 spin_unlock(&tree->lock); 2213 return ret; 2214 } 2215 2216 struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start) 2217 { 2218 struct rb_node *node; 2219 struct extent_state *state; 2220 struct io_failure_record *failrec; 2221 2222 spin_lock(&tree->lock); 2223 /* 2224 * this search will find all the extents that end after 2225 * our range starts. 2226 */ 2227 node = tree_search(tree, start); 2228 if (!node) { 2229 failrec = ERR_PTR(-ENOENT); 2230 goto out; 2231 } 2232 state = rb_entry(node, struct extent_state, rb_node); 2233 if (state->start != start) { 2234 failrec = ERR_PTR(-ENOENT); 2235 goto out; 2236 } 2237 2238 failrec = state->failrec; 2239 out: 2240 spin_unlock(&tree->lock); 2241 return failrec; 2242 } 2243 2244 /* 2245 * searches a range in the state tree for a given mask. 2246 * If 'filled' == 1, this returns 1 only if every extent in the tree 2247 * has the bits set. Otherwise, 1 is returned if any bit in the 2248 * range is found set. 
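 *
 * For example, find_lock_delalloc_range() in this file passes filled == 1 to
 * verify that a locked range is still entirely delalloc:
 *
 *     ret = test_range_bit(tree, delalloc_start, delalloc_end,
 *                          EXTENT_DELALLOC, 1, cached_state);
 *
 * With filled == 0 the same call would return 1 as soon as any part of the
 * range had the bit set.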
2249 */ 2250 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 2251 u32 bits, int filled, struct extent_state *cached) 2252 { 2253 struct extent_state *state = NULL; 2254 struct rb_node *node; 2255 int bitset = 0; 2256 2257 spin_lock(&tree->lock); 2258 if (cached && extent_state_in_tree(cached) && cached->start <= start && 2259 cached->end > start) 2260 node = &cached->rb_node; 2261 else 2262 node = tree_search(tree, start); 2263 while (node && start <= end) { 2264 state = rb_entry(node, struct extent_state, rb_node); 2265 2266 if (filled && state->start > start) { 2267 bitset = 0; 2268 break; 2269 } 2270 2271 if (state->start > end) 2272 break; 2273 2274 if (state->state & bits) { 2275 bitset = 1; 2276 if (!filled) 2277 break; 2278 } else if (filled) { 2279 bitset = 0; 2280 break; 2281 } 2282 2283 if (state->end == (u64)-1) 2284 break; 2285 2286 start = state->end + 1; 2287 if (start > end) 2288 break; 2289 node = rb_next(node); 2290 if (!node) { 2291 if (filled) 2292 bitset = 0; 2293 break; 2294 } 2295 } 2296 spin_unlock(&tree->lock); 2297 return bitset; 2298 } 2299 2300 int free_io_failure(struct extent_io_tree *failure_tree, 2301 struct extent_io_tree *io_tree, 2302 struct io_failure_record *rec) 2303 { 2304 int ret; 2305 int err = 0; 2306 2307 set_state_failrec(failure_tree, rec->start, NULL); 2308 ret = clear_extent_bits(failure_tree, rec->start, 2309 rec->start + rec->len - 1, 2310 EXTENT_LOCKED | EXTENT_DIRTY); 2311 if (ret) 2312 err = ret; 2313 2314 ret = clear_extent_bits(io_tree, rec->start, 2315 rec->start + rec->len - 1, 2316 EXTENT_DAMAGED); 2317 if (ret && !err) 2318 err = ret; 2319 2320 kfree(rec); 2321 return err; 2322 } 2323 2324 /* 2325 * this bypasses the standard btrfs submit functions deliberately, as 2326 * the standard behavior is to write all copies in a raid setup. here we only 2327 * want to write the one bad copy. so we do the mapping for ourselves and issue 2328 * submit_bio directly. 2329 * to avoid any synchronization issues, wait for the data after writing, which 2330 * actually prevents the read that triggered the error from finishing. 2331 * currently, there can be no more than two copies of every data bit. thus, 2332 * exactly one rewrite is required. 2333 */ 2334 static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, 2335 u64 length, u64 logical, struct page *page, 2336 unsigned int pg_offset, int mirror_num) 2337 { 2338 struct btrfs_device *dev; 2339 struct bio_vec bvec; 2340 struct bio bio; 2341 u64 map_length = 0; 2342 u64 sector; 2343 struct btrfs_io_context *bioc = NULL; 2344 int ret = 0; 2345 2346 ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); 2347 BUG_ON(!mirror_num); 2348 2349 if (btrfs_repair_one_zone(fs_info, logical)) 2350 return 0; 2351 2352 map_length = length; 2353 2354 /* 2355 * Avoid races with device replace and make sure our bioc has devices 2356 * associated to its stripes that don't go away while we are doing the 2357 * read repair operation. 2358 */ 2359 btrfs_bio_counter_inc_blocked(fs_info); 2360 if (btrfs_is_parity_mirror(fs_info, logical, length)) { 2361 /* 2362 * Note that we don't use BTRFS_MAP_WRITE because it's supposed 2363 * to update all raid stripes, but here we just want to correct 2364 * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad 2365 * stripe's dev and sector. 
2366 */ 2367 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, 2368 &map_length, &bioc, 0); 2369 if (ret) 2370 goto out_counter_dec; 2371 ASSERT(bioc->mirror_num == 1); 2372 } else { 2373 ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, 2374 &map_length, &bioc, mirror_num); 2375 if (ret) 2376 goto out_counter_dec; 2377 BUG_ON(mirror_num != bioc->mirror_num); 2378 } 2379 2380 sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9; 2381 dev = bioc->stripes[bioc->mirror_num - 1].dev; 2382 btrfs_put_bioc(bioc); 2383 2384 if (!dev || !dev->bdev || 2385 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { 2386 ret = -EIO; 2387 goto out_counter_dec; 2388 } 2389 2390 bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); 2391 bio.bi_iter.bi_sector = sector; 2392 __bio_add_page(&bio, page, length, pg_offset); 2393 2394 btrfsic_check_bio(&bio); 2395 ret = submit_bio_wait(&bio); 2396 if (ret) { 2397 /* try to remap that extent elsewhere? */ 2398 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); 2399 goto out_bio_uninit; 2400 } 2401 2402 btrfs_info_rl_in_rcu(fs_info, 2403 "read error corrected: ino %llu off %llu (dev %s sector %llu)", 2404 ino, start, 2405 rcu_str_deref(dev->name), sector); 2406 ret = 0; 2407 2408 out_bio_uninit: 2409 bio_uninit(&bio); 2410 out_counter_dec: 2411 btrfs_bio_counter_dec(fs_info); 2412 return ret; 2413 } 2414 2415 int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) 2416 { 2417 struct btrfs_fs_info *fs_info = eb->fs_info; 2418 u64 start = eb->start; 2419 int i, num_pages = num_extent_pages(eb); 2420 int ret = 0; 2421 2422 if (sb_rdonly(fs_info->sb)) 2423 return -EROFS; 2424 2425 for (i = 0; i < num_pages; i++) { 2426 struct page *p = eb->pages[i]; 2427 2428 ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p, 2429 start - page_offset(p), mirror_num); 2430 if (ret) 2431 break; 2432 start += PAGE_SIZE; 2433 } 2434 2435 return ret; 2436 } 2437 2438 static int next_mirror(const struct io_failure_record *failrec, int cur_mirror) 2439 { 2440 if (cur_mirror == failrec->num_copies) 2441 return cur_mirror + 1 - failrec->num_copies; 2442 return cur_mirror + 1; 2443 } 2444 2445 static int prev_mirror(const struct io_failure_record *failrec, int cur_mirror) 2446 { 2447 if (cur_mirror == 1) 2448 return failrec->num_copies; 2449 return cur_mirror - 1; 2450 } 2451 2452 /* 2453 * each time an IO finishes, we do a fast check in the IO failure tree 2454 * to see if we need to process or clean up an io_failure_record 2455 */ 2456 int clean_io_failure(struct btrfs_fs_info *fs_info, 2457 struct extent_io_tree *failure_tree, 2458 struct extent_io_tree *io_tree, u64 start, 2459 struct page *page, u64 ino, unsigned int pg_offset) 2460 { 2461 u64 private; 2462 struct io_failure_record *failrec; 2463 struct extent_state *state; 2464 int mirror; 2465 int ret; 2466 2467 private = 0; 2468 ret = count_range_bits(failure_tree, &private, (u64)-1, 1, 2469 EXTENT_DIRTY, 0); 2470 if (!ret) 2471 return 0; 2472 2473 failrec = get_state_failrec(failure_tree, start); 2474 if (IS_ERR(failrec)) 2475 return 0; 2476 2477 BUG_ON(!failrec->this_mirror); 2478 2479 if (sb_rdonly(fs_info->sb)) 2480 goto out; 2481 2482 spin_lock(&io_tree->lock); 2483 state = find_first_extent_bit_state(io_tree, 2484 failrec->start, 2485 EXTENT_LOCKED); 2486 spin_unlock(&io_tree->lock); 2487 2488 if (!state || state->start > failrec->start || 2489 state->end < failrec->start + failrec->len - 1) 2490 goto out; 2491 2492 mirror = failrec->this_mirror; 2493 do { 
2494 mirror = prev_mirror(failrec, mirror); 2495 repair_io_failure(fs_info, ino, start, failrec->len, 2496 failrec->logical, page, pg_offset, mirror); 2497 } while (mirror != failrec->failed_mirror); 2498 2499 out: 2500 free_io_failure(failure_tree, io_tree, failrec); 2501 return 0; 2502 } 2503 2504 /* 2505 * Can be called when 2506 * - hold extent lock 2507 * - under ordered extent 2508 * - the inode is freeing 2509 */ 2510 void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end) 2511 { 2512 struct extent_io_tree *failure_tree = &inode->io_failure_tree; 2513 struct io_failure_record *failrec; 2514 struct extent_state *state, *next; 2515 2516 if (RB_EMPTY_ROOT(&failure_tree->state)) 2517 return; 2518 2519 spin_lock(&failure_tree->lock); 2520 state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY); 2521 while (state) { 2522 if (state->start > end) 2523 break; 2524 2525 ASSERT(state->end <= end); 2526 2527 next = next_state(state); 2528 2529 failrec = state->failrec; 2530 free_extent_state(state); 2531 kfree(failrec); 2532 2533 state = next; 2534 } 2535 spin_unlock(&failure_tree->lock); 2536 } 2537 2538 static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode, 2539 struct btrfs_bio *bbio, 2540 unsigned int bio_offset) 2541 { 2542 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2543 u64 start = bbio->file_offset + bio_offset; 2544 struct io_failure_record *failrec; 2545 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 2546 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 2547 const u32 sectorsize = fs_info->sectorsize; 2548 int ret; 2549 2550 failrec = get_state_failrec(failure_tree, start); 2551 if (!IS_ERR(failrec)) { 2552 btrfs_debug(fs_info, 2553 "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu", 2554 failrec->logical, failrec->start, failrec->len); 2555 /* 2556 * when data can be on disk more than twice, add to failrec here 2557 * (e.g. with a list for failed_mirror) to make 2558 * clean_io_failure() clean all those errors at once. 2559 */ 2560 ASSERT(failrec->this_mirror == bbio->mirror_num); 2561 ASSERT(failrec->len == fs_info->sectorsize); 2562 return failrec; 2563 } 2564 2565 failrec = kzalloc(sizeof(*failrec), GFP_NOFS); 2566 if (!failrec) 2567 return ERR_PTR(-ENOMEM); 2568 2569 failrec->start = start; 2570 failrec->len = sectorsize; 2571 failrec->failed_mirror = bbio->mirror_num; 2572 failrec->this_mirror = bbio->mirror_num; 2573 failrec->logical = (bbio->iter.bi_sector << SECTOR_SHIFT) + bio_offset; 2574 2575 btrfs_debug(fs_info, 2576 "new io failure record logical %llu start %llu", 2577 failrec->logical, start); 2578 2579 failrec->num_copies = btrfs_num_copies(fs_info, failrec->logical, sectorsize); 2580 if (failrec->num_copies == 1) { 2581 /* 2582 * We only have a single copy of the data, so don't bother with 2583 * all the retry and error correction code that follows. No 2584 * matter what the error is, it is very likely to persist. 
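 *
 * (For example, data in a SINGLE profile block group has num_copies == 1,
 * while DUP or RAID1 profiles provide at least one more mirror to retry.)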
2585 */ 2586 btrfs_debug(fs_info, 2587 "cannot repair logical %llu num_copies %d", 2588 failrec->logical, failrec->num_copies); 2589 kfree(failrec); 2590 return ERR_PTR(-EIO); 2591 } 2592 2593 /* Set the bits in the private failure tree */ 2594 ret = set_extent_bits(failure_tree, start, start + sectorsize - 1, 2595 EXTENT_LOCKED | EXTENT_DIRTY); 2596 if (ret >= 0) { 2597 ret = set_state_failrec(failure_tree, start, failrec); 2598 /* Set the bits in the inode's tree */ 2599 ret = set_extent_bits(tree, start, start + sectorsize - 1, 2600 EXTENT_DAMAGED); 2601 } else if (ret < 0) { 2602 kfree(failrec); 2603 return ERR_PTR(ret); 2604 } 2605 2606 return failrec; 2607 } 2608 2609 int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, 2610 u32 bio_offset, struct page *page, unsigned int pgoff, 2611 submit_bio_hook_t *submit_bio_hook) 2612 { 2613 u64 start = failed_bbio->file_offset + bio_offset; 2614 struct io_failure_record *failrec; 2615 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2616 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 2617 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 2618 struct bio *failed_bio = &failed_bbio->bio; 2619 const int icsum = bio_offset >> fs_info->sectorsize_bits; 2620 struct bio *repair_bio; 2621 struct btrfs_bio *repair_bbio; 2622 2623 btrfs_debug(fs_info, 2624 "repair read error: read error at %llu", start); 2625 2626 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); 2627 2628 failrec = btrfs_get_io_failure_record(inode, failed_bbio, bio_offset); 2629 if (IS_ERR(failrec)) 2630 return PTR_ERR(failrec); 2631 2632 /* 2633 * There are two premises: 2634 * a) deliver good data to the caller 2635 * b) correct the bad sectors on disk 2636 * 2637 * Since we're only doing repair for one sector, we only need to get 2638 * a good copy of the failed sector and if we succeed, we have setup 2639 * everything for repair_io_failure to do the rest for us. 2640 */ 2641 failrec->this_mirror = next_mirror(failrec, failrec->this_mirror); 2642 if (failrec->this_mirror == failrec->failed_mirror) { 2643 btrfs_debug(fs_info, 2644 "failed to repair num_copies %d this_mirror %d failed_mirror %d", 2645 failrec->num_copies, failrec->this_mirror, failrec->failed_mirror); 2646 free_io_failure(failure_tree, tree, failrec); 2647 return -EIO; 2648 } 2649 2650 repair_bio = btrfs_bio_alloc(1); 2651 repair_bbio = btrfs_bio(repair_bio); 2652 repair_bbio->file_offset = start; 2653 repair_bio->bi_opf = REQ_OP_READ; 2654 repair_bio->bi_end_io = failed_bio->bi_end_io; 2655 repair_bio->bi_iter.bi_sector = failrec->logical >> 9; 2656 repair_bio->bi_private = failed_bio->bi_private; 2657 2658 if (failed_bbio->csum) { 2659 const u32 csum_size = fs_info->csum_size; 2660 2661 repair_bbio->csum = repair_bbio->csum_inline; 2662 memcpy(repair_bbio->csum, 2663 failed_bbio->csum + csum_size * icsum, csum_size); 2664 } 2665 2666 bio_add_page(repair_bio, page, failrec->len, pgoff); 2667 repair_bbio->iter = repair_bio->bi_iter; 2668 2669 btrfs_debug(btrfs_sb(inode->i_sb), 2670 "repair read error: submitting new read to mirror %d", 2671 failrec->this_mirror); 2672 2673 /* 2674 * At this point we have a bio, so any errors from submit_bio_hook() 2675 * will be handled by the endio on the repair_bio, so we can't return an 2676 * error here. 
2677 */ 2678 submit_bio_hook(inode, repair_bio, failrec->this_mirror, 0); 2679 return BLK_STS_OK; 2680 } 2681 2682 static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) 2683 { 2684 struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); 2685 2686 ASSERT(page_offset(page) <= start && 2687 start + len <= page_offset(page) + PAGE_SIZE); 2688 2689 if (uptodate) { 2690 if (fsverity_active(page->mapping->host) && 2691 !PageError(page) && 2692 !PageUptodate(page) && 2693 start < i_size_read(page->mapping->host) && 2694 !fsverity_verify_page(page)) { 2695 btrfs_page_set_error(fs_info, page, start, len); 2696 } else { 2697 btrfs_page_set_uptodate(fs_info, page, start, len); 2698 } 2699 } else { 2700 btrfs_page_clear_uptodate(fs_info, page, start, len); 2701 btrfs_page_set_error(fs_info, page, start, len); 2702 } 2703 2704 if (!btrfs_is_subpage(fs_info, page)) 2705 unlock_page(page); 2706 else 2707 btrfs_subpage_end_reader(fs_info, page, start, len); 2708 } 2709 2710 static void end_sector_io(struct page *page, u64 offset, bool uptodate) 2711 { 2712 struct btrfs_inode *inode = BTRFS_I(page->mapping->host); 2713 const u32 sectorsize = inode->root->fs_info->sectorsize; 2714 struct extent_state *cached = NULL; 2715 2716 end_page_read(page, uptodate, offset, sectorsize); 2717 if (uptodate) 2718 set_extent_uptodate(&inode->io_tree, offset, 2719 offset + sectorsize - 1, &cached, GFP_ATOMIC); 2720 unlock_extent_cached_atomic(&inode->io_tree, offset, 2721 offset + sectorsize - 1, &cached); 2722 } 2723 2724 static void submit_data_read_repair(struct inode *inode, 2725 struct btrfs_bio *failed_bbio, 2726 u32 bio_offset, const struct bio_vec *bvec, 2727 unsigned int error_bitmap) 2728 { 2729 const unsigned int pgoff = bvec->bv_offset; 2730 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2731 struct page *page = bvec->bv_page; 2732 const u64 start = page_offset(bvec->bv_page) + bvec->bv_offset; 2733 const u64 end = start + bvec->bv_len - 1; 2734 const u32 sectorsize = fs_info->sectorsize; 2735 const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits; 2736 int i; 2737 2738 BUG_ON(bio_op(&failed_bbio->bio) == REQ_OP_WRITE); 2739 2740 /* This repair is only for data */ 2741 ASSERT(is_data_inode(inode)); 2742 2743 /* We're here because we had some read errors or csum mismatch */ 2744 ASSERT(error_bitmap); 2745 2746 /* 2747 * We only get called on buffered IO, thus page must be mapped and bio 2748 * must not be cloned. 2749 */ 2750 ASSERT(page->mapping && !bio_flagged(&failed_bbio->bio, BIO_CLONED)); 2751 2752 /* Iterate through all the sectors in the range */ 2753 for (i = 0; i < nr_bits; i++) { 2754 const unsigned int offset = i * sectorsize; 2755 bool uptodate = false; 2756 int ret; 2757 2758 if (!(error_bitmap & (1U << i))) { 2759 /* 2760 * This sector has no error, just end the page read 2761 * and unlock the range. 2762 */ 2763 uptodate = true; 2764 goto next; 2765 } 2766 2767 ret = btrfs_repair_one_sector(inode, failed_bbio, 2768 bio_offset + offset, page, pgoff + offset, 2769 btrfs_submit_data_read_bio); 2770 if (!ret) { 2771 /* 2772 * We have submitted the read repair, the page release 2773 * will be handled by the endio function of the 2774 * submitted repair bio. 2775 * Thus we don't need to do any thing here. 2776 */ 2777 continue; 2778 } 2779 /* 2780 * Continue on failed repair, otherwise the remaining sectors 2781 * will not be properly unlocked. 
2782 */ 2783 next: 2784 end_sector_io(page, start + offset, uptodate); 2785 } 2786 } 2787 2788 /* lots and lots of room for performance fixes in the end_bio funcs */ 2789 2790 void end_extent_writepage(struct page *page, int err, u64 start, u64 end) 2791 { 2792 struct btrfs_inode *inode; 2793 const bool uptodate = (err == 0); 2794 int ret = 0; 2795 2796 ASSERT(page && page->mapping); 2797 inode = BTRFS_I(page->mapping->host); 2798 btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate); 2799 2800 if (!uptodate) { 2801 const struct btrfs_fs_info *fs_info = inode->root->fs_info; 2802 u32 len; 2803 2804 ASSERT(end + 1 - start <= U32_MAX); 2805 len = end + 1 - start; 2806 2807 btrfs_page_clear_uptodate(fs_info, page, start, len); 2808 btrfs_page_set_error(fs_info, page, start, len); 2809 ret = err < 0 ? err : -EIO; 2810 mapping_set_error(page->mapping, ret); 2811 } 2812 } 2813 2814 /* 2815 * after a writepage IO is done, we need to: 2816 * clear the uptodate bits on error 2817 * clear the writeback bits in the extent tree for this IO 2818 * end_page_writeback if the page has no more pending IO 2819 * 2820 * Scheduling is not allowed, so the extent state tree is expected 2821 * to have one and only one object corresponding to this IO. 2822 */ 2823 static void end_bio_extent_writepage(struct bio *bio) 2824 { 2825 int error = blk_status_to_errno(bio->bi_status); 2826 struct bio_vec *bvec; 2827 u64 start; 2828 u64 end; 2829 struct bvec_iter_all iter_all; 2830 bool first_bvec = true; 2831 2832 ASSERT(!bio_flagged(bio, BIO_CLONED)); 2833 bio_for_each_segment_all(bvec, bio, iter_all) { 2834 struct page *page = bvec->bv_page; 2835 struct inode *inode = page->mapping->host; 2836 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2837 const u32 sectorsize = fs_info->sectorsize; 2838 2839 /* Our read/write should always be sector aligned. */ 2840 if (!IS_ALIGNED(bvec->bv_offset, sectorsize)) 2841 btrfs_err(fs_info, 2842 "partial page write in btrfs with offset %u and length %u", 2843 bvec->bv_offset, bvec->bv_len); 2844 else if (!IS_ALIGNED(bvec->bv_len, sectorsize)) 2845 btrfs_info(fs_info, 2846 "incomplete page write with offset %u and length %u", 2847 bvec->bv_offset, bvec->bv_len); 2848 2849 start = page_offset(page) + bvec->bv_offset; 2850 end = start + bvec->bv_len - 1; 2851 2852 if (first_bvec) { 2853 btrfs_record_physical_zoned(inode, start, bio); 2854 first_bvec = false; 2855 } 2856 2857 end_extent_writepage(page, error, start, end); 2858 2859 btrfs_page_clear_writeback(fs_info, page, start, bvec->bv_len); 2860 } 2861 2862 bio_put(bio); 2863 } 2864 2865 /* 2866 * Record previously processed extent range 2867 * 2868 * For endio_readpage_release_extent() to handle a full extent range, reducing 2869 * the extent io operations. 2870 */ 2871 struct processed_extent { 2872 struct btrfs_inode *inode; 2873 /* Start of the range in @inode */ 2874 u64 start; 2875 /* End of the range in @inode */ 2876 u64 end; 2877 bool uptodate; 2878 }; 2879 2880 /* 2881 * Try to release processed extent range 2882 * 2883 * May not release the extent range right now if the current range is 2884 * contiguous to processed extent. 2885 * 2886 * Will release processed extent when any of @inode, @uptodate, the range is 2887 * no longer contiguous to the processed range. 2888 * 2889 * Passing @inode == NULL will force processed extent to be released. 
2890 */
2891 static void endio_readpage_release_extent(struct processed_extent *processed,
2892 struct btrfs_inode *inode, u64 start, u64 end,
2893 bool uptodate)
2894 {
2895 struct extent_state *cached = NULL;
2896 struct extent_io_tree *tree;
2897
2898 /* The first extent, initialize @processed */
2899 if (!processed->inode)
2900 goto update;
2901
2902 /*
2903 * Contiguous to the processed extent, just extend the end.
2904 *
2905 * Several things to notice:
2906 *
2907 * - bio can be merged as long as on-disk bytenr is contiguous
2908 * This means we can have pages belonging to other inodes, thus we need to
2909 * check if the inode still matches.
2910 * - bvec can contain range beyond current page for multi-page bvec
2911 * Thus we need the processed->end + 1 >= start check
2912 */
2913 if (processed->inode == inode && processed->uptodate == uptodate &&
2914 processed->end + 1 >= start && end >= processed->end) {
2915 processed->end = end;
2916 return;
2917 }
2918
2919 tree = &processed->inode->io_tree;
2920 /*
2921 * Now we don't have a range contiguous to the processed range, release
2922 * the processed range now.
2923 */
2924 if (processed->uptodate && tree->track_uptodate)
2925 set_extent_uptodate(tree, processed->start, processed->end,
2926 &cached, GFP_ATOMIC);
2927 unlock_extent_cached_atomic(tree, processed->start, processed->end,
2928 &cached);
2929
2930 update:
2931 /* Update processed to current range */
2932 processed->inode = inode;
2933 processed->start = start;
2934 processed->end = end;
2935 processed->uptodate = uptodate;
2936 }
2937
2938 static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
2939 {
2940 ASSERT(PageLocked(page));
2941 if (!btrfs_is_subpage(fs_info, page))
2942 return;
2943
2944 ASSERT(PagePrivate(page));
2945 btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE);
2946 }
2947
2948 /*
2949 * Find the extent buffer for a given bytenr.
2950 *
2951 * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking
2952 * in endio context.
2953 */
2954 static struct extent_buffer *find_extent_buffer_readpage(
2955 struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
2956 {
2957 struct extent_buffer *eb;
2958
2959 /*
2960 * For regular sectorsize, we can use page->private to grab the extent
2961 * buffer
2962 */
2963 if (fs_info->nodesize >= PAGE_SIZE) {
2964 ASSERT(PagePrivate(page) && page->private);
2965 return (struct extent_buffer *)page->private;
2966 }
2967
2968 /* For the subpage case, we need to look up the buffer radix tree */
2969 rcu_read_lock();
2970 eb = radix_tree_lookup(&fs_info->buffer_radix,
2971 bytenr >> fs_info->sectorsize_bits);
2972 rcu_read_unlock();
2973 ASSERT(eb);
2974 return eb;
2975 }
2976
2977 /*
2978 * after a readpage IO is done, we need to:
2979 * clear the uptodate bits on error
2980 * set the uptodate bits if things worked
2981 * set the page up to date if all extents in the tree are uptodate
2982 * clear the lock bit in the extent tree
2983 * unlock the page if there are no other extents locked for it
2984 *
2985 * Scheduling is not allowed, so the extent state tree is expected
2986 * to have one and only one object corresponding to this IO.
2987 */ 2988 static void end_bio_extent_readpage(struct bio *bio) 2989 { 2990 struct bio_vec *bvec; 2991 struct btrfs_bio *bbio = btrfs_bio(bio); 2992 struct extent_io_tree *tree, *failure_tree; 2993 struct processed_extent processed = { 0 }; 2994 /* 2995 * The offset to the beginning of a bio, since one bio can never be 2996 * larger than UINT_MAX, u32 here is enough. 2997 */ 2998 u32 bio_offset = 0; 2999 int mirror; 3000 struct bvec_iter_all iter_all; 3001 3002 ASSERT(!bio_flagged(bio, BIO_CLONED)); 3003 bio_for_each_segment_all(bvec, bio, iter_all) { 3004 bool uptodate = !bio->bi_status; 3005 struct page *page = bvec->bv_page; 3006 struct inode *inode = page->mapping->host; 3007 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3008 const u32 sectorsize = fs_info->sectorsize; 3009 unsigned int error_bitmap = (unsigned int)-1; 3010 bool repair = false; 3011 u64 start; 3012 u64 end; 3013 u32 len; 3014 3015 btrfs_debug(fs_info, 3016 "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", 3017 bio->bi_iter.bi_sector, bio->bi_status, 3018 bbio->mirror_num); 3019 tree = &BTRFS_I(inode)->io_tree; 3020 failure_tree = &BTRFS_I(inode)->io_failure_tree; 3021 3022 /* 3023 * We always issue full-sector reads, but if some block in a 3024 * page fails to read, blk_update_request() will advance 3025 * bv_offset and adjust bv_len to compensate. Print a warning 3026 * for unaligned offsets, and an error if they don't add up to 3027 * a full sector. 3028 */ 3029 if (!IS_ALIGNED(bvec->bv_offset, sectorsize)) 3030 btrfs_err(fs_info, 3031 "partial page read in btrfs with offset %u and length %u", 3032 bvec->bv_offset, bvec->bv_len); 3033 else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len, 3034 sectorsize)) 3035 btrfs_info(fs_info, 3036 "incomplete page read with offset %u and length %u", 3037 bvec->bv_offset, bvec->bv_len); 3038 3039 start = page_offset(page) + bvec->bv_offset; 3040 end = start + bvec->bv_len - 1; 3041 len = bvec->bv_len; 3042 3043 mirror = bbio->mirror_num; 3044 if (likely(uptodate)) { 3045 if (is_data_inode(inode)) { 3046 error_bitmap = btrfs_verify_data_csum(bbio, 3047 bio_offset, page, start, end); 3048 if (error_bitmap) 3049 uptodate = false; 3050 } else { 3051 if (btrfs_validate_metadata_buffer(bbio, 3052 page, start, end, mirror)) 3053 uptodate = false; 3054 } 3055 } 3056 3057 if (likely(uptodate)) { 3058 loff_t i_size = i_size_read(inode); 3059 pgoff_t end_index = i_size >> PAGE_SHIFT; 3060 3061 clean_io_failure(BTRFS_I(inode)->root->fs_info, 3062 failure_tree, tree, start, page, 3063 btrfs_ino(BTRFS_I(inode)), 0); 3064 3065 /* 3066 * Zero out the remaining part if this range straddles 3067 * i_size. 3068 * 3069 * Here we should only zero the range inside the bvec, 3070 * not touch anything else. 3071 * 3072 * NOTE: i_size is exclusive while end is inclusive. 3073 */ 3074 if (page->index == end_index && i_size <= end) { 3075 u32 zero_start = max(offset_in_page(i_size), 3076 offset_in_page(start)); 3077 3078 zero_user_segment(page, zero_start, 3079 offset_in_page(end) + 1); 3080 } 3081 } else if (is_data_inode(inode)) { 3082 /* 3083 * Only try to repair bios that actually made it to a 3084 * device. If the bio failed to be submitted mirror 3085 * is 0 and we need to fail it without retrying. 3086 * 3087 * This also includes the high level bios for compressed 3088 * extents - these never make it to a device and repair 3089 * is already handled on the lower compressed bio. 
3090 */ 3091 if (mirror > 0) 3092 repair = true; 3093 } else { 3094 struct extent_buffer *eb; 3095 3096 eb = find_extent_buffer_readpage(fs_info, page, start); 3097 set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); 3098 eb->read_mirror = mirror; 3099 atomic_dec(&eb->io_pages); 3100 } 3101 3102 if (repair) { 3103 /* 3104 * submit_data_read_repair() will handle all the good 3105 * and bad sectors, we just continue to the next bvec. 3106 */ 3107 submit_data_read_repair(inode, bbio, bio_offset, bvec, 3108 error_bitmap); 3109 } else { 3110 /* Update page status and unlock */ 3111 end_page_read(page, uptodate, start, len); 3112 endio_readpage_release_extent(&processed, BTRFS_I(inode), 3113 start, end, PageUptodate(page)); 3114 } 3115 3116 ASSERT(bio_offset + len > bio_offset); 3117 bio_offset += len; 3118 3119 } 3120 /* Release the last extent */ 3121 endio_readpage_release_extent(&processed, NULL, 0, 0, false); 3122 btrfs_bio_free_csum(bbio); 3123 bio_put(bio); 3124 } 3125 3126 /** 3127 * Populate every free slot in a provided array with pages. 3128 * 3129 * @nr_pages: number of pages to allocate 3130 * @page_array: the array to fill with pages; any existing non-null entries in 3131 * the array will be skipped 3132 * 3133 * Return: 0 if all pages were able to be allocated; 3134 * -ENOMEM otherwise, and the caller is responsible for freeing all 3135 * non-null page pointers in the array. 3136 */ 3137 int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array) 3138 { 3139 unsigned int allocated; 3140 3141 for (allocated = 0; allocated < nr_pages;) { 3142 unsigned int last = allocated; 3143 3144 allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array); 3145 3146 if (allocated == nr_pages) 3147 return 0; 3148 3149 /* 3150 * During this iteration, no page could be allocated, even 3151 * though alloc_pages_bulk_array() falls back to alloc_page() 3152 * if it could not bulk-allocate. So we must be out of memory. 3153 */ 3154 if (allocated == last) 3155 return -ENOMEM; 3156 3157 memalloc_retry_wait(GFP_NOFS); 3158 } 3159 return 0; 3160 } 3161 3162 /* 3163 * Initialize the members up to but not including 'bio'. Use after allocating a 3164 * new bio by bio_alloc_bioset as it does not initialize the bytes outside of 3165 * 'bio' because use of __GFP_ZERO is not supported. 3166 */ 3167 static inline void btrfs_bio_init(struct btrfs_bio *bbio) 3168 { 3169 memset(bbio, 0, offsetof(struct btrfs_bio, bio)); 3170 } 3171 3172 /* 3173 * Allocate a btrfs_io_bio, with @nr_iovecs as maximum number of iovecs. 3174 * 3175 * The bio allocation is backed by bioset and does not fail. 
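 *
 * Illustrative use, mirroring btrfs_repair_one_sector() in this file:
 * allocate a single-vector bio and fill in the fields the caller owns:
 *
 *     repair_bio = btrfs_bio_alloc(1);
 *     repair_bbio = btrfs_bio(repair_bio);
 *     repair_bio->bi_opf = REQ_OP_READ;
 *     repair_bio->bi_iter.bi_sector = failrec->logical >> 9;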
3176 */ 3177 struct bio *btrfs_bio_alloc(unsigned int nr_iovecs) 3178 { 3179 struct bio *bio; 3180 3181 ASSERT(0 < nr_iovecs && nr_iovecs <= BIO_MAX_VECS); 3182 bio = bio_alloc_bioset(NULL, nr_iovecs, 0, GFP_NOFS, &btrfs_bioset); 3183 btrfs_bio_init(btrfs_bio(bio)); 3184 return bio; 3185 } 3186 3187 struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size) 3188 { 3189 struct bio *bio; 3190 struct btrfs_bio *bbio; 3191 3192 ASSERT(offset <= UINT_MAX && size <= UINT_MAX); 3193 3194 /* this will never fail when it's backed by a bioset */ 3195 bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset); 3196 ASSERT(bio); 3197 3198 bbio = btrfs_bio(bio); 3199 btrfs_bio_init(bbio); 3200 3201 bio_trim(bio, offset >> 9, size >> 9); 3202 bbio->iter = bio->bi_iter; 3203 return bio; 3204 } 3205 3206 /** 3207 * Attempt to add a page to bio 3208 * 3209 * @bio_ctrl: record both the bio, and its bio_flags 3210 * @page: page to add to the bio 3211 * @disk_bytenr: offset of the new bio or to check whether we are adding 3212 * a contiguous page to the previous one 3213 * @size: portion of page that we want to write 3214 * @pg_offset: starting offset in the page 3215 * @compress_type: compression type of the current bio to see if we can merge them 3216 * 3217 * Attempt to add a page to bio considering stripe alignment etc. 3218 * 3219 * Return >= 0 for the number of bytes added to the bio. 3220 * Can return 0 if the current bio is already at stripe/zone boundary. 3221 * Return <0 for error. 3222 */ 3223 static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, 3224 struct page *page, 3225 u64 disk_bytenr, unsigned int size, 3226 unsigned int pg_offset, 3227 enum btrfs_compression_type compress_type) 3228 { 3229 struct bio *bio = bio_ctrl->bio; 3230 u32 bio_size = bio->bi_iter.bi_size; 3231 u32 real_size; 3232 const sector_t sector = disk_bytenr >> SECTOR_SHIFT; 3233 bool contig = false; 3234 int ret; 3235 3236 ASSERT(bio); 3237 /* The limit should be calculated when bio_ctrl->bio is allocated */ 3238 ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary); 3239 if (bio_ctrl->compress_type != compress_type) 3240 return 0; 3241 3242 3243 if (bio->bi_iter.bi_size == 0) { 3244 /* We can always add a page into an empty bio. */ 3245 contig = true; 3246 } else if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE) { 3247 struct bio_vec *bvec = bio_last_bvec_all(bio); 3248 3249 /* 3250 * The contig check requires the following conditions to be met: 3251 * 1) The pages are belonging to the same inode 3252 * This is implied by the call chain. 3253 * 3254 * 2) The range has adjacent logical bytenr 3255 * 3256 * 3) The range has adjacent file offset 3257 * This is required for the usage of btrfs_bio->file_offset. 3258 */ 3259 if (bio_end_sector(bio) == sector && 3260 page_offset(bvec->bv_page) + bvec->bv_offset + 3261 bvec->bv_len == page_offset(page) + pg_offset) 3262 contig = true; 3263 } else { 3264 /* 3265 * For compression, all IO should have its logical bytenr 3266 * set to the starting bytenr of the compressed extent. 3267 */ 3268 contig = bio->bi_iter.bi_sector == sector; 3269 } 3270 3271 if (!contig) 3272 return 0; 3273 3274 real_size = min(bio_ctrl->len_to_oe_boundary, 3275 bio_ctrl->len_to_stripe_boundary) - bio_size; 3276 real_size = min(real_size, size); 3277 3278 /* 3279 * If real_size is 0, never call bio_add_*_page(), as even size is 0, 3280 * bio will still execute its endio function on the page! 
3281 */ 3282 if (real_size == 0) 3283 return 0; 3284 3285 if (bio_op(bio) == REQ_OP_ZONE_APPEND) 3286 ret = bio_add_zone_append_page(bio, page, real_size, pg_offset); 3287 else 3288 ret = bio_add_page(bio, page, real_size, pg_offset); 3289 3290 return ret; 3291 } 3292 3293 static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, 3294 struct btrfs_inode *inode, u64 file_offset) 3295 { 3296 struct btrfs_fs_info *fs_info = inode->root->fs_info; 3297 struct btrfs_io_geometry geom; 3298 struct btrfs_ordered_extent *ordered; 3299 struct extent_map *em; 3300 u64 logical = (bio_ctrl->bio->bi_iter.bi_sector << SECTOR_SHIFT); 3301 int ret; 3302 3303 /* 3304 * Pages for compressed extent are never submitted to disk directly, 3305 * thus it has no real boundary, just set them to U32_MAX. 3306 * 3307 * The split happens for real compressed bio, which happens in 3308 * btrfs_submit_compressed_read/write(). 3309 */ 3310 if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { 3311 bio_ctrl->len_to_oe_boundary = U32_MAX; 3312 bio_ctrl->len_to_stripe_boundary = U32_MAX; 3313 return 0; 3314 } 3315 em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize); 3316 if (IS_ERR(em)) 3317 return PTR_ERR(em); 3318 ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio_ctrl->bio), 3319 logical, &geom); 3320 free_extent_map(em); 3321 if (ret < 0) { 3322 return ret; 3323 } 3324 if (geom.len > U32_MAX) 3325 bio_ctrl->len_to_stripe_boundary = U32_MAX; 3326 else 3327 bio_ctrl->len_to_stripe_boundary = (u32)geom.len; 3328 3329 if (bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) { 3330 bio_ctrl->len_to_oe_boundary = U32_MAX; 3331 return 0; 3332 } 3333 3334 /* Ordered extent not yet created, so we're good */ 3335 ordered = btrfs_lookup_ordered_extent(inode, file_offset); 3336 if (!ordered) { 3337 bio_ctrl->len_to_oe_boundary = U32_MAX; 3338 return 0; 3339 } 3340 3341 bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX, 3342 ordered->disk_bytenr + ordered->disk_num_bytes - logical); 3343 btrfs_put_ordered_extent(ordered); 3344 return 0; 3345 } 3346 3347 static int alloc_new_bio(struct btrfs_inode *inode, 3348 struct btrfs_bio_ctrl *bio_ctrl, 3349 struct writeback_control *wbc, 3350 blk_opf_t opf, 3351 bio_end_io_t end_io_func, 3352 u64 disk_bytenr, u32 offset, u64 file_offset, 3353 enum btrfs_compression_type compress_type) 3354 { 3355 struct btrfs_fs_info *fs_info = inode->root->fs_info; 3356 struct bio *bio; 3357 int ret; 3358 3359 bio = btrfs_bio_alloc(BIO_MAX_VECS); 3360 /* 3361 * For compressed page range, its disk_bytenr is always @disk_bytenr 3362 * passed in, no matter if we have added any range into previous bio. 3363 */ 3364 if (compress_type != BTRFS_COMPRESS_NONE) 3365 bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; 3366 else 3367 bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT; 3368 bio_ctrl->bio = bio; 3369 bio_ctrl->compress_type = compress_type; 3370 bio->bi_end_io = end_io_func; 3371 bio->bi_opf = opf; 3372 ret = calc_bio_boundaries(bio_ctrl, inode, file_offset); 3373 if (ret < 0) 3374 goto error; 3375 3376 if (wbc) { 3377 /* 3378 * For Zone append we need the correct block_device that we are 3379 * going to write to set in the bio to be able to respect the 3380 * hardware limitation. 
Look it up here:
3381 */
3382 if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
3383 struct btrfs_device *dev;
3384
3385 dev = btrfs_zoned_get_device(fs_info, disk_bytenr,
3386 fs_info->sectorsize);
3387 if (IS_ERR(dev)) {
3388 ret = PTR_ERR(dev);
3389 goto error;
3390 }
3391
3392 bio_set_dev(bio, dev->bdev);
3393 } else {
3394 /*
3395 * Otherwise pick the last added device to support
3396 * cgroup writeback. For multi-device file systems this
3397 * means blk-cgroup policies have to always be set on the
3398 * last added/replaced device. This is a bit odd but has
3399 * been like that for a long time.
3400 */
3401 bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev);
3402 }
3403 wbc_init_bio(wbc, bio);
3404 } else {
3405 ASSERT(bio_op(bio) != REQ_OP_ZONE_APPEND);
3406 }
3407 return 0;
3408 error:
3409 bio_ctrl->bio = NULL;
3410 bio->bi_status = errno_to_blk_status(ret);
3411 bio_endio(bio);
3412 return ret;
3413 }
3414
3415 /*
3416 * @opf: bio REQ_OP_* and REQ_* flags as one value
3417 * @wbc: optional writeback control for io accounting
3418 * @bio_ctrl: the bio being assembled and the boundaries it must honor;
3419 * a newly allocated bio will be stored there
3420 * @page: page to add to the bio
3421 * @disk_bytenr: logical bytenr where the write will be
3422 * @size: portion of page that we want to write to
3423 * @pg_offset: starting offset in the page
3424 * @end_io_func: end_io callback for the new bio
3425 * @compress_type: compress type for the current bio
3426 * @force_bio_submit: submit any bio already assembled in @bio_ctrl before
3427 * adding this range, so that the range starts a new bio
3428 */
3429 static int submit_extent_page(blk_opf_t opf,
3430 struct writeback_control *wbc,
3431 struct btrfs_bio_ctrl *bio_ctrl,
3432 struct page *page, u64 disk_bytenr,
3433 size_t size, unsigned long pg_offset,
3434 bio_end_io_t end_io_func,
3435 enum btrfs_compression_type compress_type,
3436 bool force_bio_submit)
3437 {
3438 int ret = 0;
3439 struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
3440 unsigned int cur = pg_offset;
3441
3442 ASSERT(bio_ctrl);
3443
3444 ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE &&
3445 pg_offset + size <= PAGE_SIZE);
3446 if (force_bio_submit)
3447 submit_one_bio(bio_ctrl);
3448
3449 while (cur < pg_offset + size) {
3450 u32 offset = cur - pg_offset;
3451 int added;
3452
3453 /* Allocate new bio if needed */
3454 if (!bio_ctrl->bio) {
3455 ret = alloc_new_bio(inode, bio_ctrl, wbc, opf,
3456 end_io_func, disk_bytenr, offset,
3457 page_offset(page) + cur,
3458 compress_type);
3459 if (ret < 0)
3460 return ret;
3461 }
3462 /*
3463 * We must go through btrfs_bio_add_page() to ensure each
3464 * page range won't cross various boundaries (the stripe boundary and, for zone append writes, the ordered extent boundary).
3465 */ 3466 if (compress_type != BTRFS_COMPRESS_NONE) 3467 added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr, 3468 size - offset, pg_offset + offset, 3469 compress_type); 3470 else 3471 added = btrfs_bio_add_page(bio_ctrl, page, 3472 disk_bytenr + offset, size - offset, 3473 pg_offset + offset, compress_type); 3474 3475 /* Metadata page range should never be split */ 3476 if (!is_data_inode(&inode->vfs_inode)) 3477 ASSERT(added == 0 || added == size - offset); 3478 3479 /* At least we added some page, update the account */ 3480 if (wbc && added) 3481 wbc_account_cgroup_owner(wbc, page, added); 3482 3483 /* We have reached boundary, submit right now */ 3484 if (added < size - offset) { 3485 /* The bio should contain some page(s) */ 3486 ASSERT(bio_ctrl->bio->bi_iter.bi_size); 3487 submit_one_bio(bio_ctrl); 3488 } 3489 cur += added; 3490 } 3491 return 0; 3492 } 3493 3494 static int attach_extent_buffer_page(struct extent_buffer *eb, 3495 struct page *page, 3496 struct btrfs_subpage *prealloc) 3497 { 3498 struct btrfs_fs_info *fs_info = eb->fs_info; 3499 int ret = 0; 3500 3501 /* 3502 * If the page is mapped to btree inode, we should hold the private 3503 * lock to prevent race. 3504 * For cloned or dummy extent buffers, their pages are not mapped and 3505 * will not race with any other ebs. 3506 */ 3507 if (page->mapping) 3508 lockdep_assert_held(&page->mapping->private_lock); 3509 3510 if (fs_info->nodesize >= PAGE_SIZE) { 3511 if (!PagePrivate(page)) 3512 attach_page_private(page, eb); 3513 else 3514 WARN_ON(page->private != (unsigned long)eb); 3515 return 0; 3516 } 3517 3518 /* Already mapped, just free prealloc */ 3519 if (PagePrivate(page)) { 3520 btrfs_free_subpage(prealloc); 3521 return 0; 3522 } 3523 3524 if (prealloc) 3525 /* Has preallocated memory for subpage */ 3526 attach_page_private(page, prealloc); 3527 else 3528 /* Do new allocation to attach subpage */ 3529 ret = btrfs_attach_subpage(fs_info, page, 3530 BTRFS_SUBPAGE_METADATA); 3531 return ret; 3532 } 3533 3534 int set_page_extent_mapped(struct page *page) 3535 { 3536 struct btrfs_fs_info *fs_info; 3537 3538 ASSERT(page->mapping); 3539 3540 if (PagePrivate(page)) 3541 return 0; 3542 3543 fs_info = btrfs_sb(page->mapping->host->i_sb); 3544 3545 if (btrfs_is_subpage(fs_info, page)) 3546 return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA); 3547 3548 attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE); 3549 return 0; 3550 } 3551 3552 void clear_page_extent_mapped(struct page *page) 3553 { 3554 struct btrfs_fs_info *fs_info; 3555 3556 ASSERT(page->mapping); 3557 3558 if (!PagePrivate(page)) 3559 return; 3560 3561 fs_info = btrfs_sb(page->mapping->host->i_sb); 3562 if (btrfs_is_subpage(fs_info, page)) 3563 return btrfs_detach_subpage(fs_info, page); 3564 3565 detach_page_private(page); 3566 } 3567 3568 static struct extent_map * 3569 __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, 3570 u64 start, u64 len, struct extent_map **em_cached) 3571 { 3572 struct extent_map *em; 3573 3574 if (em_cached && *em_cached) { 3575 em = *em_cached; 3576 if (extent_map_in_tree(em) && start >= em->start && 3577 start < extent_map_end(em)) { 3578 refcount_inc(&em->refs); 3579 return em; 3580 } 3581 3582 free_extent_map(em); 3583 *em_cached = NULL; 3584 } 3585 3586 em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len); 3587 if (em_cached && !IS_ERR(em)) { 3588 BUG_ON(*em_cached); 3589 refcount_inc(&em->refs); 3590 *em_cached = em; 3591 } 3592 return em; 3593 } 3594 /* 3595 * basic readpage 
implementation. Locked extent state structs are inserted 3596 * into the tree that are removed when the IO is done (by the end_io 3597 * handlers) 3598 * XXX JDM: This needs looking at to ensure proper page locking 3599 * return 0 on success, otherwise return error 3600 */ 3601 static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, 3602 struct btrfs_bio_ctrl *bio_ctrl, 3603 blk_opf_t read_flags, u64 *prev_em_start) 3604 { 3605 struct inode *inode = page->mapping->host; 3606 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3607 u64 start = page_offset(page); 3608 const u64 end = start + PAGE_SIZE - 1; 3609 u64 cur = start; 3610 u64 extent_offset; 3611 u64 last_byte = i_size_read(inode); 3612 u64 block_start; 3613 u64 cur_end; 3614 struct extent_map *em; 3615 int ret = 0; 3616 size_t pg_offset = 0; 3617 size_t iosize; 3618 size_t blocksize = inode->i_sb->s_blocksize; 3619 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 3620 3621 ret = set_page_extent_mapped(page); 3622 if (ret < 0) { 3623 unlock_extent(tree, start, end); 3624 btrfs_page_set_error(fs_info, page, start, PAGE_SIZE); 3625 unlock_page(page); 3626 goto out; 3627 } 3628 3629 if (page->index == last_byte >> PAGE_SHIFT) { 3630 size_t zero_offset = offset_in_page(last_byte); 3631 3632 if (zero_offset) { 3633 iosize = PAGE_SIZE - zero_offset; 3634 memzero_page(page, zero_offset, iosize); 3635 } 3636 } 3637 begin_page_read(fs_info, page); 3638 while (cur <= end) { 3639 unsigned long this_bio_flag = 0; 3640 bool force_bio_submit = false; 3641 u64 disk_bytenr; 3642 3643 ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); 3644 if (cur >= last_byte) { 3645 struct extent_state *cached = NULL; 3646 3647 iosize = PAGE_SIZE - pg_offset; 3648 memzero_page(page, pg_offset, iosize); 3649 set_extent_uptodate(tree, cur, cur + iosize - 1, 3650 &cached, GFP_NOFS); 3651 unlock_extent_cached(tree, cur, 3652 cur + iosize - 1, &cached); 3653 end_page_read(page, true, cur, iosize); 3654 break; 3655 } 3656 em = __get_extent_map(inode, page, pg_offset, cur, 3657 end - cur + 1, em_cached); 3658 if (IS_ERR(em)) { 3659 unlock_extent(tree, cur, end); 3660 end_page_read(page, false, cur, end + 1 - cur); 3661 ret = PTR_ERR(em); 3662 break; 3663 } 3664 extent_offset = cur - em->start; 3665 BUG_ON(extent_map_end(em) <= cur); 3666 BUG_ON(end < cur); 3667 3668 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 3669 this_bio_flag = em->compress_type; 3670 3671 iosize = min(extent_map_end(em) - cur, end - cur + 1); 3672 cur_end = min(extent_map_end(em) - 1, end); 3673 iosize = ALIGN(iosize, blocksize); 3674 if (this_bio_flag != BTRFS_COMPRESS_NONE) 3675 disk_bytenr = em->block_start; 3676 else 3677 disk_bytenr = em->block_start + extent_offset; 3678 block_start = em->block_start; 3679 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 3680 block_start = EXTENT_MAP_HOLE; 3681 3682 /* 3683 * If we have a file range that points to a compressed extent 3684 * and it's followed by a consecutive file range that points 3685 * to the same compressed extent (possibly with a different 3686 * offset and/or length, so it either points to the whole extent 3687 * or only part of it), we must make sure we do not submit a 3688 * single bio to populate the pages for the 2 ranges because 3689 * this makes the compressed extent read zero out the pages 3690 * belonging to the 2nd range. 
Imagine the following scenario: 3691 * 3692 * File layout 3693 * [0 - 8K] [8K - 24K] 3694 * | | 3695 * | | 3696 * points to extent X, points to extent X, 3697 * offset 4K, length of 8K offset 0, length 16K 3698 * 3699 * [extent X, compressed length = 4K uncompressed length = 16K] 3700 * 3701 * If the bio to read the compressed extent covers both ranges, 3702 * it will decompress extent X into the pages belonging to the 3703 * first range and then it will stop, zeroing out the remaining 3704 * pages that belong to the other range that points to extent X. 3705 * So here we make sure we submit 2 bios, one for the first 3706 * range and another one for the third range. Both will target 3707 * the same physical extent from disk, but we can't currently 3708 * make the compressed bio endio callback populate the pages 3709 * for both ranges because each compressed bio is tightly 3710 * coupled with a single extent map, and each range can have 3711 * an extent map with a different offset value relative to the 3712 * uncompressed data of our extent and different lengths. This 3713 * is a corner case so we prioritize correctness over 3714 * non-optimal behavior (submitting 2 bios for the same extent). 3715 */ 3716 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) && 3717 prev_em_start && *prev_em_start != (u64)-1 && 3718 *prev_em_start != em->start) 3719 force_bio_submit = true; 3720 3721 if (prev_em_start) 3722 *prev_em_start = em->start; 3723 3724 free_extent_map(em); 3725 em = NULL; 3726 3727 /* we've found a hole, just zero and go on */ 3728 if (block_start == EXTENT_MAP_HOLE) { 3729 struct extent_state *cached = NULL; 3730 3731 memzero_page(page, pg_offset, iosize); 3732 3733 set_extent_uptodate(tree, cur, cur + iosize - 1, 3734 &cached, GFP_NOFS); 3735 unlock_extent_cached(tree, cur, 3736 cur + iosize - 1, &cached); 3737 end_page_read(page, true, cur, iosize); 3738 cur = cur + iosize; 3739 pg_offset += iosize; 3740 continue; 3741 } 3742 /* the get_extent function already copied into the page */ 3743 if (test_range_bit(tree, cur, cur_end, 3744 EXTENT_UPTODATE, 1, NULL)) { 3745 unlock_extent(tree, cur, cur + iosize - 1); 3746 end_page_read(page, true, cur, iosize); 3747 cur = cur + iosize; 3748 pg_offset += iosize; 3749 continue; 3750 } 3751 /* we have an inline extent but it didn't get marked up 3752 * to date. Error out 3753 */ 3754 if (block_start == EXTENT_MAP_INLINE) { 3755 unlock_extent(tree, cur, cur + iosize - 1); 3756 end_page_read(page, false, cur, iosize); 3757 cur = cur + iosize; 3758 pg_offset += iosize; 3759 continue; 3760 } 3761 3762 ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, 3763 bio_ctrl, page, disk_bytenr, iosize, 3764 pg_offset, end_bio_extent_readpage, 3765 this_bio_flag, force_bio_submit); 3766 if (ret) { 3767 /* 3768 * We have to unlock the remaining range, or the page 3769 * will never be unlocked. 
3770 */
3771 unlock_extent(tree, cur, end);
3772 end_page_read(page, false, cur, end + 1 - cur);
3773 goto out;
3774 }
3775 cur = cur + iosize;
3776 pg_offset += iosize;
3777 }
3778 out:
3779 return ret;
3780 }
3781
3782 int btrfs_read_folio(struct file *file, struct folio *folio)
3783 {
3784 struct page *page = &folio->page;
3785 struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
3786 u64 start = page_offset(page);
3787 u64 end = start + PAGE_SIZE - 1;
3788 struct btrfs_bio_ctrl bio_ctrl = { 0 };
3789 int ret;
3790
3791 btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
3792
3793 ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL);
3794 /*
3795 * If btrfs_do_readpage() failed we will want to submit the assembled
3796 * bio to do the cleanup.
3797 */
3798 submit_one_bio(&bio_ctrl);
3799 return ret;
3800 }
3801
3802 static inline void contiguous_readpages(struct page *pages[], int nr_pages,
3803 u64 start, u64 end,
3804 struct extent_map **em_cached,
3805 struct btrfs_bio_ctrl *bio_ctrl,
3806 u64 *prev_em_start)
3807 {
3808 struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
3809 int index;
3810
3811 btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
3812
3813 for (index = 0; index < nr_pages; index++) {
3814 btrfs_do_readpage(pages[index], em_cached, bio_ctrl,
3815 REQ_RAHEAD, prev_em_start);
3816 put_page(pages[index]);
3817 }
3818 }
3819
3820 /*
3821 * helper for __extent_writepage, doing all of the delayed allocation setup.
3822 *
3823 * This returns 1 if the btrfs_run_delalloc_range function did all the work required
3824 * to write the page (copy into inline extent). In this case the IO has
3825 * been started and the page is already unlocked.
3826 *
3827 * This returns 0 if all went well (page still locked)
3828 * This returns < 0 if there were errors (page still locked)
3829 */
3830 static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
3831 struct page *page, struct writeback_control *wbc)
3832 {
3833 const u64 page_end = page_offset(page) + PAGE_SIZE - 1;
3834 u64 delalloc_start = page_offset(page);
3835 u64 delalloc_to_write = 0;
3836 /* How many pages are started by btrfs_run_delalloc_range() */
3837 unsigned long nr_written = 0;
3838 int ret;
3839 int page_started = 0;
3840
3841 while (delalloc_start < page_end) {
3842 u64 delalloc_end = page_end;
3843 bool found;
3844
3845 found = find_lock_delalloc_range(&inode->vfs_inode, page,
3846 &delalloc_start,
3847 &delalloc_end);
3848 if (!found) {
3849 delalloc_start = delalloc_end + 1;
3850 continue;
3851 }
3852 ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
3853 delalloc_end, &page_started, &nr_written, wbc);
3854 if (ret) {
3855 btrfs_page_set_error(inode->root->fs_info, page,
3856 page_offset(page), PAGE_SIZE);
3857 return ret;
3858 }
3859 /*
3860 * delalloc_end is already one less than the total length, so
3861 * we don't subtract one from PAGE_SIZE
3862 */
3863 delalloc_to_write += (delalloc_end - delalloc_start +
3864 PAGE_SIZE) >> PAGE_SHIFT;
3865 delalloc_start = delalloc_end + 1;
3866 }
3867 if (wbc->nr_to_write < delalloc_to_write) {
3868 int thresh = 8192;
3869
3870 if (delalloc_to_write < thresh * 2)
3871 thresh = delalloc_to_write;
3872 wbc->nr_to_write = min_t(u64, delalloc_to_write,
3873 thresh);
3874 }
3875
3876 /* Did btrfs_run_delalloc_range() already unlock and start the IO? */
3877 if (page_started) {
3878 /*
3879 * We've unlocked the page, so we can't update the mapping's
3880 * writeback index, just update nr_to_write.
3881 */ 3882 wbc->nr_to_write -= nr_written; 3883 return 1; 3884 } 3885 3886 return 0; 3887 } 3888 3889 /* 3890 * Find the first byte we need to write. 3891 * 3892 * For subpage, one page can contain several sectors, and 3893 * __extent_writepage_io() will just grab all extent maps in the page 3894 * range and try to submit all non-inline/non-compressed extents. 3895 * 3896 * This is a big problem for subpage, we shouldn't re-submit already written 3897 * data at all. 3898 * This function will lookup subpage dirty bit to find which range we really 3899 * need to submit. 3900 * 3901 * Return the next dirty range in [@start, @end). 3902 * If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE. 3903 */ 3904 static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, 3905 struct page *page, u64 *start, u64 *end) 3906 { 3907 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; 3908 struct btrfs_subpage_info *spi = fs_info->subpage_info; 3909 u64 orig_start = *start; 3910 /* Declare as unsigned long so we can use bitmap ops */ 3911 unsigned long flags; 3912 int range_start_bit; 3913 int range_end_bit; 3914 3915 /* 3916 * For regular sector size == page size case, since one page only 3917 * contains one sector, we return the page offset directly. 3918 */ 3919 if (!btrfs_is_subpage(fs_info, page)) { 3920 *start = page_offset(page); 3921 *end = page_offset(page) + PAGE_SIZE; 3922 return; 3923 } 3924 3925 range_start_bit = spi->dirty_offset + 3926 (offset_in_page(orig_start) >> fs_info->sectorsize_bits); 3927 3928 /* We should have the page locked, but just in case */ 3929 spin_lock_irqsave(&subpage->lock, flags); 3930 bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit, 3931 spi->dirty_offset + spi->bitmap_nr_bits); 3932 spin_unlock_irqrestore(&subpage->lock, flags); 3933 3934 range_start_bit -= spi->dirty_offset; 3935 range_end_bit -= spi->dirty_offset; 3936 3937 *start = page_offset(page) + range_start_bit * fs_info->sectorsize; 3938 *end = page_offset(page) + range_end_bit * fs_info->sectorsize; 3939 } 3940 3941 /* 3942 * helper for __extent_writepage. This calls the writepage start hooks, 3943 * and does the loop to map the page into extents and bios. 
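 * For subpage, only the ranges still marked dirty in the subpage bitmap
 * are submitted, see find_next_dirty_byte() above.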
3944 * 3945 * We return 1 if the IO is started and the page is unlocked, 3946 * 0 if all went well (page still locked) 3947 * < 0 if there were errors (page still locked) 3948 */ 3949 static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, 3950 struct page *page, 3951 struct writeback_control *wbc, 3952 struct extent_page_data *epd, 3953 loff_t i_size, 3954 int *nr_ret) 3955 { 3956 struct btrfs_fs_info *fs_info = inode->root->fs_info; 3957 u64 cur = page_offset(page); 3958 u64 end = cur + PAGE_SIZE - 1; 3959 u64 extent_offset; 3960 u64 block_start; 3961 struct extent_map *em; 3962 int saved_ret = 0; 3963 int ret = 0; 3964 int nr = 0; 3965 enum req_op op = REQ_OP_WRITE; 3966 const blk_opf_t write_flags = wbc_to_write_flags(wbc); 3967 bool has_error = false; 3968 bool compressed; 3969 3970 ret = btrfs_writepage_cow_fixup(page); 3971 if (ret) { 3972 /* Fixup worker will requeue */ 3973 redirty_page_for_writepage(wbc, page); 3974 unlock_page(page); 3975 return 1; 3976 } 3977 3978 /* 3979 * we don't want to touch the inode after unlocking the page, 3980 * so we update the mapping writeback index now 3981 */ 3982 wbc->nr_to_write--; 3983 3984 while (cur <= end) { 3985 u64 disk_bytenr; 3986 u64 em_end; 3987 u64 dirty_range_start = cur; 3988 u64 dirty_range_end; 3989 u32 iosize; 3990 3991 if (cur >= i_size) { 3992 btrfs_writepage_endio_finish_ordered(inode, page, cur, 3993 end, true); 3994 /* 3995 * This range is beyond i_size, thus we don't need to 3996 * bother writing back. 3997 * But we still need to clear the dirty subpage bit, or 3998 * the next time the page gets dirtied, we will try to 3999 * writeback the sectors with subpage dirty bits, 4000 * causing writeback without ordered extent. 4001 */ 4002 btrfs_page_clear_dirty(fs_info, page, cur, end + 1 - cur); 4003 break; 4004 } 4005 4006 find_next_dirty_byte(fs_info, page, &dirty_range_start, 4007 &dirty_range_end); 4008 if (cur < dirty_range_start) { 4009 cur = dirty_range_start; 4010 continue; 4011 } 4012 4013 em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1); 4014 if (IS_ERR(em)) { 4015 btrfs_page_set_error(fs_info, page, cur, end - cur + 1); 4016 ret = PTR_ERR_OR_ZERO(em); 4017 has_error = true; 4018 if (!saved_ret) 4019 saved_ret = ret; 4020 break; 4021 } 4022 4023 extent_offset = cur - em->start; 4024 em_end = extent_map_end(em); 4025 ASSERT(cur <= em_end); 4026 ASSERT(cur < end); 4027 ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize)); 4028 ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize)); 4029 block_start = em->block_start; 4030 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 4031 disk_bytenr = em->block_start + extent_offset; 4032 4033 /* 4034 * Note that em_end from extent_map_end() and dirty_range_end from 4035 * find_next_dirty_byte() are all exclusive 4036 */ 4037 iosize = min(min(em_end, end + 1), dirty_range_end) - cur; 4038 4039 if (btrfs_use_zone_append(inode, em->block_start)) 4040 op = REQ_OP_ZONE_APPEND; 4041 4042 free_extent_map(em); 4043 em = NULL; 4044 4045 /* 4046 * compressed and inline extents are written through other 4047 * paths in the FS 4048 */ 4049 if (compressed || block_start == EXTENT_MAP_HOLE || 4050 block_start == EXTENT_MAP_INLINE) { 4051 if (compressed) 4052 nr++; 4053 else 4054 btrfs_writepage_endio_finish_ordered(inode, 4055 page, cur, cur + iosize - 1, true); 4056 btrfs_page_clear_dirty(fs_info, page, cur, iosize); 4057 cur += iosize; 4058 continue; 4059 } 4060 4061 btrfs_set_range_writeback(inode, cur, cur + iosize - 1); 4062 if (!PageWriteback(page)) { 4063 
btrfs_err(inode->root->fs_info, 4064 "page %lu not writeback, cur %llu end %llu", 4065 page->index, cur, end); 4066 } 4067 4068 /* 4069 * Although the PageDirty bit is cleared before entering this 4070 * function, subpage dirty bit is not cleared. 4071 * So clear subpage dirty bit here so next time we won't submit 4072 * page for range already written to disk. 4073 */ 4074 btrfs_page_clear_dirty(fs_info, page, cur, iosize); 4075 4076 ret = submit_extent_page(op | write_flags, wbc, 4077 &epd->bio_ctrl, page, 4078 disk_bytenr, iosize, 4079 cur - page_offset(page), 4080 end_bio_extent_writepage, 4081 0, false); 4082 if (ret) { 4083 has_error = true; 4084 if (!saved_ret) 4085 saved_ret = ret; 4086 4087 btrfs_page_set_error(fs_info, page, cur, iosize); 4088 if (PageWriteback(page)) 4089 btrfs_page_clear_writeback(fs_info, page, cur, 4090 iosize); 4091 } 4092 4093 cur += iosize; 4094 nr++; 4095 } 4096 /* 4097 * If we finish without problem, we should not only clear page dirty, 4098 * but also empty subpage dirty bits 4099 */ 4100 if (!has_error) 4101 btrfs_page_assert_not_dirty(fs_info, page); 4102 else 4103 ret = saved_ret; 4104 *nr_ret = nr; 4105 return ret; 4106 } 4107 4108 /* 4109 * the writepage semantics are similar to regular writepage. extent 4110 * records are inserted to lock ranges in the tree, and as dirty areas 4111 * are found, they are marked writeback. Then the lock bits are removed 4112 * and the end_io handler clears the writeback ranges 4113 * 4114 * Return 0 if everything goes well. 4115 * Return <0 for error. 4116 */ 4117 static int __extent_writepage(struct page *page, struct writeback_control *wbc, 4118 struct extent_page_data *epd) 4119 { 4120 struct folio *folio = page_folio(page); 4121 struct inode *inode = page->mapping->host; 4122 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 4123 const u64 page_start = page_offset(page); 4124 const u64 page_end = page_start + PAGE_SIZE - 1; 4125 int ret; 4126 int nr = 0; 4127 size_t pg_offset; 4128 loff_t i_size = i_size_read(inode); 4129 unsigned long end_index = i_size >> PAGE_SHIFT; 4130 4131 trace___extent_writepage(page, inode, wbc); 4132 4133 WARN_ON(!PageLocked(page)); 4134 4135 btrfs_page_clear_error(btrfs_sb(inode->i_sb), page, 4136 page_offset(page), PAGE_SIZE); 4137 4138 pg_offset = offset_in_page(i_size); 4139 if (page->index > end_index || 4140 (page->index == end_index && !pg_offset)) { 4141 folio_invalidate(folio, 0, folio_size(folio)); 4142 folio_unlock(folio); 4143 return 0; 4144 } 4145 4146 if (page->index == end_index) 4147 memzero_page(page, pg_offset, PAGE_SIZE - pg_offset); 4148 4149 ret = set_page_extent_mapped(page); 4150 if (ret < 0) { 4151 SetPageError(page); 4152 goto done; 4153 } 4154 4155 if (!epd->extent_locked) { 4156 ret = writepage_delalloc(BTRFS_I(inode), page, wbc); 4157 if (ret == 1) 4158 return 0; 4159 if (ret) 4160 goto done; 4161 } 4162 4163 ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size, 4164 &nr); 4165 if (ret == 1) 4166 return 0; 4167 4168 done: 4169 if (nr == 0) { 4170 /* make sure the mapping tag for page dirty gets cleared */ 4171 set_page_writeback(page); 4172 end_page_writeback(page); 4173 } 4174 /* 4175 * Here we used to have a check for PageError() and then set @ret and 4176 * call end_extent_writepage(). 4177 * 4178 * But in fact setting @ret here will cause different error paths 4179 * between subpage and regular sectorsize. 4180 * 4181 * For regular page size, we never submit current page, but only add 4182 * current page to current bio. 
4183 * The bio submission can only happen in next page. 4184 * Thus if we hit the PageError() branch, @ret is already set to 4185 * non-zero value and will not get updated for regular sectorsize. 4186 * 4187 * But for subpage case, it's possible we submit part of current page, 4188 * thus can get PageError() set by submitted bio of the same page, 4189 * while our @ret is still 0. 4190 * 4191 * So here we unify the behavior and don't set @ret. 4192 * Error can still be properly passed to higher layer as page will 4193 * be set error, here we just don't handle the IO failure. 4194 * 4195 * NOTE: This is just a hotfix for subpage. 4196 * The root fix will be properly ending ordered extent when we hit 4197 * an error during writeback. 4198 * 4199 * But that needs a bigger refactoring, as we not only need to grab the 4200 * submitted OE, but also need to know exactly at which bytenr we hit 4201 * the error. 4202 * Currently the full page based __extent_writepage_io() is not 4203 * capable of that. 4204 */ 4205 if (PageError(page)) 4206 end_extent_writepage(page, ret, page_start, page_end); 4207 if (epd->extent_locked) { 4208 /* 4209 * If epd->extent_locked, it's from extent_write_locked_range(), 4210 * the page can either be locked by lock_page() or 4211 * process_one_page(). 4212 * Let btrfs_page_unlock_writer() handle both cases. 4213 */ 4214 ASSERT(wbc); 4215 btrfs_page_unlock_writer(fs_info, page, wbc->range_start, 4216 wbc->range_end + 1 - wbc->range_start); 4217 } else { 4218 unlock_page(page); 4219 } 4220 ASSERT(ret <= 0); 4221 return ret; 4222 } 4223 4224 void wait_on_extent_buffer_writeback(struct extent_buffer *eb) 4225 { 4226 wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK, 4227 TASK_UNINTERRUPTIBLE); 4228 } 4229 4230 static void end_extent_buffer_writeback(struct extent_buffer *eb) 4231 { 4232 clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 4233 smp_mb__after_atomic(); 4234 wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); 4235 } 4236 4237 /* 4238 * Lock extent buffer status and pages for writeback. 4239 * 4240 * May try to flush write bio if we can't get the lock. 4241 * 4242 * Return 0 if the extent buffer doesn't need to be submitted. 4243 * (E.g. the extent buffer is not dirty) 4244 * Return >0 is the extent buffer is submitted to bio. 4245 * Return <0 if something went wrong, no page is locked. 4246 */ 4247 static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb, 4248 struct extent_page_data *epd) 4249 { 4250 struct btrfs_fs_info *fs_info = eb->fs_info; 4251 int i, num_pages; 4252 int flush = 0; 4253 int ret = 0; 4254 4255 if (!btrfs_try_tree_write_lock(eb)) { 4256 submit_write_bio(epd, 0); 4257 flush = 1; 4258 btrfs_tree_lock(eb); 4259 } 4260 4261 if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { 4262 btrfs_tree_unlock(eb); 4263 if (!epd->sync_io) 4264 return 0; 4265 if (!flush) { 4266 submit_write_bio(epd, 0); 4267 flush = 1; 4268 } 4269 while (1) { 4270 wait_on_extent_buffer_writeback(eb); 4271 btrfs_tree_lock(eb); 4272 if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) 4273 break; 4274 btrfs_tree_unlock(eb); 4275 } 4276 } 4277 4278 /* 4279 * We need to do this to prevent races in people who check if the eb is 4280 * under IO since we can end up having no IO bits set for a short period 4281 * of time. 
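 * Clearing EXTENT_BUFFER_DIRTY and setting EXTENT_BUFFER_WRITEBACK under
 * eb->refs_lock means an observer always sees at least one of the two
 * bits while the buffer is in flight.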
4282 */ 4283 spin_lock(&eb->refs_lock); 4284 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 4285 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 4286 spin_unlock(&eb->refs_lock); 4287 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 4288 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, 4289 -eb->len, 4290 fs_info->dirty_metadata_batch); 4291 ret = 1; 4292 } else { 4293 spin_unlock(&eb->refs_lock); 4294 } 4295 4296 btrfs_tree_unlock(eb); 4297 4298 /* 4299 * Either we don't need to submit any tree block, or we're submitting 4300 * subpage eb. 4301 * Subpage metadata doesn't use page locking at all, so we can skip 4302 * the page locking. 4303 */ 4304 if (!ret || fs_info->nodesize < PAGE_SIZE) 4305 return ret; 4306 4307 num_pages = num_extent_pages(eb); 4308 for (i = 0; i < num_pages; i++) { 4309 struct page *p = eb->pages[i]; 4310 4311 if (!trylock_page(p)) { 4312 if (!flush) { 4313 submit_write_bio(epd, 0); 4314 flush = 1; 4315 } 4316 lock_page(p); 4317 } 4318 } 4319 4320 return ret; 4321 } 4322 4323 static void set_btree_ioerr(struct page *page, struct extent_buffer *eb) 4324 { 4325 struct btrfs_fs_info *fs_info = eb->fs_info; 4326 4327 btrfs_page_set_error(fs_info, page, eb->start, eb->len); 4328 if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) 4329 return; 4330 4331 /* 4332 * A read may stumble upon this buffer later, make sure that it gets an 4333 * error and knows there was an error. 4334 */ 4335 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 4336 4337 /* 4338 * We need to set the mapping with the io error as well because a write 4339 * error will flip the file system readonly, and then syncfs() will 4340 * return a 0 because we are readonly if we don't modify the err seq for 4341 * the superblock. 4342 */ 4343 mapping_set_error(page->mapping, -EIO); 4344 4345 /* 4346 * If we error out, we should add back the dirty_metadata_bytes 4347 * to make it consistent. 4348 */ 4349 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, 4350 eb->len, fs_info->dirty_metadata_batch); 4351 4352 /* 4353 * If writeback for a btree extent that doesn't belong to a log tree 4354 * failed, increment the counter transaction->eb_write_errors. 4355 * We do this because while the transaction is running and before it's 4356 * committing (when we call filemap_fdata[write|wait]_range against 4357 * the btree inode), we might have 4358 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it 4359 * returns an error or an error happens during writeback, when we're 4360 * committing the transaction we wouldn't know about it, since the pages 4361 * can be no longer dirty nor marked anymore for writeback (if a 4362 * subsequent modification to the extent buffer didn't happen before the 4363 * transaction commit), which makes filemap_fdata[write|wait]_range not 4364 * able to find the pages tagged with SetPageError at transaction 4365 * commit time. So if this happens we must abort the transaction, 4366 * otherwise we commit a super block with btree roots that point to 4367 * btree nodes/leafs whose content on disk is invalid - either garbage 4368 * or the content of some node/leaf from a past generation that got 4369 * cowed or deleted and is no longer valid. 
4370 * 4371 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would 4372 * not be enough - we need to distinguish between log tree extents vs 4373 * non-log tree extents, and the next filemap_fdatawait_range() call 4374 * will catch and clear such errors in the mapping - and that call might 4375 * be from a log sync and not from a transaction commit. Also, checking 4376 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is 4377 * not done and would not be reliable - the eb might have been released 4378 * from memory and reading it back again means that flag would not be 4379 * set (since it's a runtime flag, not persisted on disk). 4380 * 4381 * Using the flags below in the btree inode also makes us achieve the 4382 * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started 4383 * writeback for all dirty pages and before filemap_fdatawait_range() 4384 * is called, the writeback for all dirty pages had already finished 4385 * with errors - because we were not using AS_EIO/AS_ENOSPC, 4386 * filemap_fdatawait_range() would return success, as it could not know 4387 * that writeback errors happened (the pages were no longer tagged for 4388 * writeback). 4389 */ 4390 switch (eb->log_index) { 4391 case -1: 4392 set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags); 4393 break; 4394 case 0: 4395 set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags); 4396 break; 4397 case 1: 4398 set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags); 4399 break; 4400 default: 4401 BUG(); /* unexpected, logic error */ 4402 } 4403 } 4404 4405 /* 4406 * The endio specific version which won't touch any unsafe spinlock in endio 4407 * context. 4408 */ 4409 static struct extent_buffer *find_extent_buffer_nolock( 4410 struct btrfs_fs_info *fs_info, u64 start) 4411 { 4412 struct extent_buffer *eb; 4413 4414 rcu_read_lock(); 4415 eb = radix_tree_lookup(&fs_info->buffer_radix, 4416 start >> fs_info->sectorsize_bits); 4417 if (eb && atomic_inc_not_zero(&eb->refs)) { 4418 rcu_read_unlock(); 4419 return eb; 4420 } 4421 rcu_read_unlock(); 4422 return NULL; 4423 } 4424 4425 /* 4426 * The endio function for subpage extent buffer write. 4427 * 4428 * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback() 4429 * after all extent buffers in the page has finished their writeback. 4430 */ 4431 static void end_bio_subpage_eb_writepage(struct bio *bio) 4432 { 4433 struct btrfs_fs_info *fs_info; 4434 struct bio_vec *bvec; 4435 struct bvec_iter_all iter_all; 4436 4437 fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb); 4438 ASSERT(fs_info->nodesize < PAGE_SIZE); 4439 4440 ASSERT(!bio_flagged(bio, BIO_CLONED)); 4441 bio_for_each_segment_all(bvec, bio, iter_all) { 4442 struct page *page = bvec->bv_page; 4443 u64 bvec_start = page_offset(page) + bvec->bv_offset; 4444 u64 bvec_end = bvec_start + bvec->bv_len - 1; 4445 u64 cur_bytenr = bvec_start; 4446 4447 ASSERT(IS_ALIGNED(bvec->bv_len, fs_info->nodesize)); 4448 4449 /* Iterate through all extent buffers in the range */ 4450 while (cur_bytenr <= bvec_end) { 4451 struct extent_buffer *eb; 4452 int done; 4453 4454 /* 4455 * Here we can't use find_extent_buffer(), as it may 4456 * try to lock eb->refs_lock, which is not safe in endio 4457 * context. 
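 * find_extent_buffer_nolock() only does an RCU radix tree lookup plus an
 * atomic_inc_not_zero() on eb->refs, so it is safe to call here.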
4458 */ 4459 eb = find_extent_buffer_nolock(fs_info, cur_bytenr); 4460 ASSERT(eb); 4461 4462 cur_bytenr = eb->start + eb->len; 4463 4464 ASSERT(test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)); 4465 done = atomic_dec_and_test(&eb->io_pages); 4466 ASSERT(done); 4467 4468 if (bio->bi_status || 4469 test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { 4470 ClearPageUptodate(page); 4471 set_btree_ioerr(page, eb); 4472 } 4473 4474 btrfs_subpage_clear_writeback(fs_info, page, eb->start, 4475 eb->len); 4476 end_extent_buffer_writeback(eb); 4477 /* 4478 * free_extent_buffer() will grab spinlock which is not 4479 * safe in endio context. Thus here we manually dec 4480 * the ref. 4481 */ 4482 atomic_dec(&eb->refs); 4483 } 4484 } 4485 bio_put(bio); 4486 } 4487 4488 static void end_bio_extent_buffer_writepage(struct bio *bio) 4489 { 4490 struct bio_vec *bvec; 4491 struct extent_buffer *eb; 4492 int done; 4493 struct bvec_iter_all iter_all; 4494 4495 ASSERT(!bio_flagged(bio, BIO_CLONED)); 4496 bio_for_each_segment_all(bvec, bio, iter_all) { 4497 struct page *page = bvec->bv_page; 4498 4499 eb = (struct extent_buffer *)page->private; 4500 BUG_ON(!eb); 4501 done = atomic_dec_and_test(&eb->io_pages); 4502 4503 if (bio->bi_status || 4504 test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { 4505 ClearPageUptodate(page); 4506 set_btree_ioerr(page, eb); 4507 } 4508 4509 end_page_writeback(page); 4510 4511 if (!done) 4512 continue; 4513 4514 end_extent_buffer_writeback(eb); 4515 } 4516 4517 bio_put(bio); 4518 } 4519 4520 static void prepare_eb_write(struct extent_buffer *eb) 4521 { 4522 u32 nritems; 4523 unsigned long start; 4524 unsigned long end; 4525 4526 clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); 4527 atomic_set(&eb->io_pages, num_extent_pages(eb)); 4528 4529 /* Set btree blocks beyond nritems with 0 to avoid stale content */ 4530 nritems = btrfs_header_nritems(eb); 4531 if (btrfs_header_level(eb) > 0) { 4532 end = btrfs_node_key_ptr_offset(nritems); 4533 memzero_extent_buffer(eb, end, eb->len - end); 4534 } else { 4535 /* 4536 * Leaf: 4537 * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0 4538 */ 4539 start = btrfs_item_nr_offset(nritems); 4540 end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb); 4541 memzero_extent_buffer(eb, start, end - start); 4542 } 4543 } 4544 4545 /* 4546 * Unlike the work in write_one_eb(), we rely completely on extent locking. 4547 * Page locking is only utilized at minimum to keep the VMM code happy. 
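 * The page lock below is only taken because clear_page_dirty_for_io()
 * and the subpage bitmap helpers expect it to be held.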
4548 */ 4549 static int write_one_subpage_eb(struct extent_buffer *eb, 4550 struct writeback_control *wbc, 4551 struct extent_page_data *epd) 4552 { 4553 struct btrfs_fs_info *fs_info = eb->fs_info; 4554 struct page *page = eb->pages[0]; 4555 blk_opf_t write_flags = wbc_to_write_flags(wbc); 4556 bool no_dirty_ebs = false; 4557 int ret; 4558 4559 prepare_eb_write(eb); 4560 4561 /* clear_page_dirty_for_io() in subpage helper needs page locked */ 4562 lock_page(page); 4563 btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len); 4564 4565 /* Check if this is the last dirty bit to update nr_written */ 4566 no_dirty_ebs = btrfs_subpage_clear_and_test_dirty(fs_info, page, 4567 eb->start, eb->len); 4568 if (no_dirty_ebs) 4569 clear_page_dirty_for_io(page); 4570 4571 ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, 4572 &epd->bio_ctrl, page, eb->start, eb->len, 4573 eb->start - page_offset(page), 4574 end_bio_subpage_eb_writepage, 0, false); 4575 if (ret) { 4576 btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len); 4577 set_btree_ioerr(page, eb); 4578 unlock_page(page); 4579 4580 if (atomic_dec_and_test(&eb->io_pages)) 4581 end_extent_buffer_writeback(eb); 4582 return -EIO; 4583 } 4584 unlock_page(page); 4585 /* 4586 * Submission finished without problem, if no range of the page is 4587 * dirty anymore, we have submitted a page. Update nr_written in wbc. 4588 */ 4589 if (no_dirty_ebs) 4590 wbc->nr_to_write--; 4591 return ret; 4592 } 4593 4594 static noinline_for_stack int write_one_eb(struct extent_buffer *eb, 4595 struct writeback_control *wbc, 4596 struct extent_page_data *epd) 4597 { 4598 u64 disk_bytenr = eb->start; 4599 int i, num_pages; 4600 blk_opf_t write_flags = wbc_to_write_flags(wbc); 4601 int ret = 0; 4602 4603 prepare_eb_write(eb); 4604 4605 num_pages = num_extent_pages(eb); 4606 for (i = 0; i < num_pages; i++) { 4607 struct page *p = eb->pages[i]; 4608 4609 clear_page_dirty_for_io(p); 4610 set_page_writeback(p); 4611 ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, 4612 &epd->bio_ctrl, p, disk_bytenr, 4613 PAGE_SIZE, 0, 4614 end_bio_extent_buffer_writepage, 4615 0, false); 4616 if (ret) { 4617 set_btree_ioerr(p, eb); 4618 if (PageWriteback(p)) 4619 end_page_writeback(p); 4620 if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) 4621 end_extent_buffer_writeback(eb); 4622 ret = -EIO; 4623 break; 4624 } 4625 disk_bytenr += PAGE_SIZE; 4626 wbc->nr_to_write--; 4627 unlock_page(p); 4628 } 4629 4630 if (unlikely(ret)) { 4631 for (; i < num_pages; i++) { 4632 struct page *p = eb->pages[i]; 4633 clear_page_dirty_for_io(p); 4634 unlock_page(p); 4635 } 4636 } 4637 4638 return ret; 4639 } 4640 4641 /* 4642 * Submit one subpage btree page. 4643 * 4644 * The main difference to submit_eb_page() is: 4645 * - Page locking 4646 * For subpage, we don't rely on page locking at all. 4647 * 4648 * - Flush write bio 4649 * We only flush bio if we may be unable to fit current extent buffers into 4650 * current bio. 4651 * 4652 * Return >=0 for the number of submitted extent buffers. 4653 * Return <0 for fatal error. 
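 *
 * E.g. with 4K sectorsize, 16K nodesize and 64K pages there are 16 dirty
 * bits to scan per page, and bit_start below advances by sectors_per_node
 * (4) each time a dirty tree block is found.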
4654 */ 4655 static int submit_eb_subpage(struct page *page, 4656 struct writeback_control *wbc, 4657 struct extent_page_data *epd) 4658 { 4659 struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); 4660 int submitted = 0; 4661 u64 page_start = page_offset(page); 4662 int bit_start = 0; 4663 int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits; 4664 int ret; 4665 4666 /* Lock and write each dirty extent buffers in the range */ 4667 while (bit_start < fs_info->subpage_info->bitmap_nr_bits) { 4668 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; 4669 struct extent_buffer *eb; 4670 unsigned long flags; 4671 u64 start; 4672 4673 /* 4674 * Take private lock to ensure the subpage won't be detached 4675 * in the meantime. 4676 */ 4677 spin_lock(&page->mapping->private_lock); 4678 if (!PagePrivate(page)) { 4679 spin_unlock(&page->mapping->private_lock); 4680 break; 4681 } 4682 spin_lock_irqsave(&subpage->lock, flags); 4683 if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset, 4684 subpage->bitmaps)) { 4685 spin_unlock_irqrestore(&subpage->lock, flags); 4686 spin_unlock(&page->mapping->private_lock); 4687 bit_start++; 4688 continue; 4689 } 4690 4691 start = page_start + bit_start * fs_info->sectorsize; 4692 bit_start += sectors_per_node; 4693 4694 /* 4695 * Here we just want to grab the eb without touching extra 4696 * spin locks, so call find_extent_buffer_nolock(). 4697 */ 4698 eb = find_extent_buffer_nolock(fs_info, start); 4699 spin_unlock_irqrestore(&subpage->lock, flags); 4700 spin_unlock(&page->mapping->private_lock); 4701 4702 /* 4703 * The eb has already reached 0 refs thus find_extent_buffer() 4704 * doesn't return it. We don't need to write back such eb 4705 * anyway. 4706 */ 4707 if (!eb) 4708 continue; 4709 4710 ret = lock_extent_buffer_for_io(eb, epd); 4711 if (ret == 0) { 4712 free_extent_buffer(eb); 4713 continue; 4714 } 4715 if (ret < 0) { 4716 free_extent_buffer(eb); 4717 goto cleanup; 4718 } 4719 ret = write_one_subpage_eb(eb, wbc, epd); 4720 free_extent_buffer(eb); 4721 if (ret < 0) 4722 goto cleanup; 4723 submitted++; 4724 } 4725 return submitted; 4726 4727 cleanup: 4728 /* We hit error, end bio for the submitted extent buffers */ 4729 submit_write_bio(epd, ret); 4730 return ret; 4731 } 4732 4733 /* 4734 * Submit all page(s) of one extent buffer. 4735 * 4736 * @page: the page of one extent buffer 4737 * @eb_context: to determine if we need to submit this page, if current page 4738 * belongs to this eb, we don't need to submit 4739 * 4740 * The caller should pass each page in their bytenr order, and here we use 4741 * @eb_context to determine if we have submitted pages of one extent buffer. 4742 * 4743 * If we have, we just skip until we hit a new page that doesn't belong to 4744 * current @eb_context. 4745 * 4746 * If not, we submit all the page(s) of the extent buffer. 4747 * 4748 * Return >0 if we have submitted the extent buffer successfully. 4749 * Return 0 if we don't need to submit the page, as it's already submitted by 4750 * previous call. 4751 * Return <0 for fatal error. 
4752 */
4753 static int submit_eb_page(struct page *page, struct writeback_control *wbc,
4754 struct extent_page_data *epd,
4755 struct extent_buffer **eb_context)
4756 {
4757 struct address_space *mapping = page->mapping;
4758 struct btrfs_block_group *cache = NULL;
4759 struct extent_buffer *eb;
4760 int ret;
4761
4762 if (!PagePrivate(page))
4763 return 0;
4764
4765 if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
4766 return submit_eb_subpage(page, wbc, epd);
4767
4768 spin_lock(&mapping->private_lock);
4769 if (!PagePrivate(page)) {
4770 spin_unlock(&mapping->private_lock);
4771 return 0;
4772 }
4773
4774 eb = (struct extent_buffer *)page->private;
4775
4776 /*
4777 * Shouldn't happen and normally this would be a BUG_ON but no point
4778 * crashing the machine for something we can survive anyway.
4779 */
4780 if (WARN_ON(!eb)) {
4781 spin_unlock(&mapping->private_lock);
4782 return 0;
4783 }
4784
4785 if (eb == *eb_context) {
4786 spin_unlock(&mapping->private_lock);
4787 return 0;
4788 }
4789 ret = atomic_inc_not_zero(&eb->refs);
4790 spin_unlock(&mapping->private_lock);
4791 if (!ret)
4792 return 0;
4793
4794 if (!btrfs_check_meta_write_pointer(eb->fs_info, eb, &cache)) {
4795 /*
4796 * If for_sync, this hole will be filled by a
4797 * transaction commit.
4798 */
4799 if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
4800 ret = -EAGAIN;
4801 else
4802 ret = 0;
4803 free_extent_buffer(eb);
4804 return ret;
4805 }
4806
4807 *eb_context = eb;
4808
4809 ret = lock_extent_buffer_for_io(eb, epd);
4810 if (ret <= 0) {
4811 btrfs_revert_meta_write_pointer(cache, eb);
4812 if (cache)
4813 btrfs_put_block_group(cache);
4814 free_extent_buffer(eb);
4815 return ret;
4816 }
4817 if (cache) {
4818 /*
4819 * Implies write in zoned mode. Mark the last eb in a block group.
4820 */
4821 btrfs_schedule_zone_finish_bg(cache, eb);
4822 btrfs_put_block_group(cache);
4823 }
4824 ret = write_one_eb(eb, wbc, epd);
4825 free_extent_buffer(eb);
4826 if (ret < 0)
4827 return ret;
4828 return 1;
4829 }
4830
4831 int btree_write_cache_pages(struct address_space *mapping,
4832 struct writeback_control *wbc)
4833 {
4834 struct extent_buffer *eb_context = NULL;
4835 struct extent_page_data epd = {
4836 .bio_ctrl = { 0 },
4837 .extent_locked = 0,
4838 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
4839 };
4840 struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
4841 int ret = 0;
4842 int done = 0;
4843 int nr_to_write_done = 0;
4844 struct pagevec pvec;
4845 int nr_pages;
4846 pgoff_t index;
4847 pgoff_t end; /* Inclusive */
4848 int scanned = 0;
4849 xa_mark_t tag;
4850
4851 pagevec_init(&pvec);
4852 if (wbc->range_cyclic) {
4853 index = mapping->writeback_index; /* Start from prev offset */
4854 end = -1;
4855 /*
4856 * Starting from the beginning does not need to cycle over the
4857 * range, mark it as scanned.
4858 */
4859 scanned = (index == 0);
4860 } else {
4861 index = wbc->range_start >> PAGE_SHIFT;
4862 end = wbc->range_end >> PAGE_SHIFT;
4863 scanned = 1;
4864 }
4865 if (wbc->sync_mode == WB_SYNC_ALL)
4866 tag = PAGECACHE_TAG_TOWRITE;
4867 else
4868 tag = PAGECACHE_TAG_DIRTY;
4869 btrfs_zoned_meta_io_lock(fs_info);
4870 retry:
4871 if (wbc->sync_mode == WB_SYNC_ALL)
4872 tag_pages_for_writeback(mapping, index, end);
4873 while (!done && !nr_to_write_done && (index <= end) &&
4874 (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
4875 tag))) {
4876 unsigned i;
4877
4878 for (i = 0; i < nr_pages; i++) {
4879 struct page *page = pvec.pages[i];
4880
4881 ret = submit_eb_page(page, wbc, &epd, &eb_context);
4882 if (ret == 0)
4883 continue;
4884 if (ret < 0) {
4885 done = 1;
4886 break;
4887 }
4888
4889 /*
4890 * the filesystem may choose to bump up nr_to_write.
4891 * We have to make sure to honor the new nr_to_write
4892 * at any time
4893 */
4894 nr_to_write_done = wbc->nr_to_write <= 0;
4895 }
4896 pagevec_release(&pvec);
4897 cond_resched();
4898 }
4899 if (!scanned && !done) {
4900 /*
4901 * We hit the last page and there is more work to be done: wrap
4902 * back to the start of the file
4903 */
4904 scanned = 1;
4905 index = 0;
4906 goto retry;
4907 }
4908 /*
4909 * If something went wrong, don't allow any metadata write bio to be
4910 * submitted.
4911 *
4912 * This would prevent use-after-free if we had dirty pages not
4913 * cleaned up, which can still happen with fuzzed images.
4914 *
4915 * - Bad extent tree
4916 * Allowing existing tree block to be allocated for other trees.
4917 *
4918 * - Log tree operations
4919 * Existing tree blocks get allocated to log tree, bumps its
4920 * generation, then get cleaned in tree re-balance.
4921 * Such tree block will not be written back, since it's clean,
4922 * thus no WRITTEN flag set.
4923 * And after log writes back, this tree block is not traced by
4924 * any dirty extent_io_tree.
4925 *
4926 * - Offending tree block gets re-dirtied from its original owner
4927 * Since it has bumped generation, no WRITTEN flag, it can be
4928 * reused without COWing. This tree block will not be traced
4929 * by btrfs_transaction::dirty_pages.
4930 *
4931 * Now such dirty tree block will not be cleaned by any dirty
4932 * extent io tree. Thus we don't want to submit such wild eb
4933 * if the fs already has errors.
4934 *
4935 * We can get ret > 0 from submit_eb_page() indicating how many ebs
4936 * were submitted. Reset it to 0 to avoid false alerts for the caller.
4937 */
4938 if (ret > 0)
4939 ret = 0;
4940 if (!ret && BTRFS_FS_ERROR(fs_info))
4941 ret = -EROFS;
4942 submit_write_bio(&epd, ret);
4943
4944 btrfs_zoned_meta_io_unlock(fs_info);
4945 return ret;
4946 }
4947
4948 /**
4949 * Walk the list of dirty pages of the given address space and write all of them.
4950 *
4951 * @mapping: address space structure to write
4952 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
4953 * @epd: holds context for the write, namely the bio
4954 *
4955 * If a page is already under I/O, write_cache_pages() skips it, even
4956 * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
4957 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
4958 * and msync() need to guarantee that all the data which was dirty at the time
4959 * the call was made get new I/O started against them. If wbc->sync_mode is
4960 * WB_SYNC_ALL then we were called for data integrity and we must wait for
4961 * existing IO to complete.
4962 */
4963 static int extent_write_cache_pages(struct address_space *mapping,
4964 struct writeback_control *wbc,
4965 struct extent_page_data *epd)
4966 {
4967 struct inode *inode = mapping->host;
4968 int ret = 0;
4969 int done = 0;
4970 int nr_to_write_done = 0;
4971 struct pagevec pvec;
4972 int nr_pages;
4973 pgoff_t index;
4974 pgoff_t end; /* Inclusive */
4975 pgoff_t done_index;
4976 int range_whole = 0;
4977 int scanned = 0;
4978 xa_mark_t tag;
4979
4980 /*
4981 * We have to hold onto the inode so that ordered extents can do their
4982 * work when the IO finishes. The alternative to this is failing to add
4983 * an ordered extent if the igrab() fails there and that is a huge pain
4984 * to deal with, so instead just hold onto the inode throughout the
4985 * writepages operation. If it fails here we are freeing up the inode
4986 * anyway and we'd rather not waste our time writing out stuff that is
4987 * going to be truncated anyway.
4988 */
4989 if (!igrab(inode))
4990 return 0;
4991
4992 pagevec_init(&pvec);
4993 if (wbc->range_cyclic) {
4994 index = mapping->writeback_index; /* Start from prev offset */
4995 end = -1;
4996 /*
4997 * Starting from the beginning does not need to cycle over the
4998 * range, mark it as scanned.
4999 */
5000 scanned = (index == 0);
5001 } else {
5002 index = wbc->range_start >> PAGE_SHIFT;
5003 end = wbc->range_end >> PAGE_SHIFT;
5004 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
5005 range_whole = 1;
5006 scanned = 1;
5007 }
5008
5009 /*
5010 * We do the tagged writepage as long as the snapshot flush bit is set
5011 * and we are the first one to do the filemap_flush() on this inode.
5012 *
5013 * The nr_to_write == LONG_MAX is needed to make sure other flushers do
5014 * not race in and drop the bit.
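 * Only a full-range flush (e.g. the filemap_flush() done for snapshot
 * creation) uses LONG_MAX, so a concurrent partial writeback will not
 * consume the bit.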
5015 */ 5016 if (range_whole && wbc->nr_to_write == LONG_MAX && 5017 test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH, 5018 &BTRFS_I(inode)->runtime_flags)) 5019 wbc->tagged_writepages = 1; 5020 5021 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 5022 tag = PAGECACHE_TAG_TOWRITE; 5023 else 5024 tag = PAGECACHE_TAG_DIRTY; 5025 retry: 5026 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 5027 tag_pages_for_writeback(mapping, index, end); 5028 done_index = index; 5029 while (!done && !nr_to_write_done && (index <= end) && 5030 (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, 5031 &index, end, tag))) { 5032 unsigned i; 5033 5034 for (i = 0; i < nr_pages; i++) { 5035 struct page *page = pvec.pages[i]; 5036 5037 done_index = page->index + 1; 5038 /* 5039 * At this point we hold neither the i_pages lock nor 5040 * the page lock: the page may be truncated or 5041 * invalidated (changing page->mapping to NULL), 5042 * or even swizzled back from swapper_space to 5043 * tmpfs file mapping 5044 */ 5045 if (!trylock_page(page)) { 5046 submit_write_bio(epd, 0); 5047 lock_page(page); 5048 } 5049 5050 if (unlikely(page->mapping != mapping)) { 5051 unlock_page(page); 5052 continue; 5053 } 5054 5055 if (wbc->sync_mode != WB_SYNC_NONE) { 5056 if (PageWriteback(page)) 5057 submit_write_bio(epd, 0); 5058 wait_on_page_writeback(page); 5059 } 5060 5061 if (PageWriteback(page) || 5062 !clear_page_dirty_for_io(page)) { 5063 unlock_page(page); 5064 continue; 5065 } 5066 5067 ret = __extent_writepage(page, wbc, epd); 5068 if (ret < 0) { 5069 done = 1; 5070 break; 5071 } 5072 5073 /* 5074 * the filesystem may choose to bump up nr_to_write. 5075 * We have to make sure to honor the new nr_to_write 5076 * at any time 5077 */ 5078 nr_to_write_done = wbc->nr_to_write <= 0; 5079 } 5080 pagevec_release(&pvec); 5081 cond_resched(); 5082 } 5083 if (!scanned && !done) { 5084 /* 5085 * We hit the last page and there is more work to be done: wrap 5086 * back to the start of the file 5087 */ 5088 scanned = 1; 5089 index = 0; 5090 5091 /* 5092 * If we're looping we could run into a page that is locked by a 5093 * writer and that writer could be waiting on writeback for a 5094 * page in our current bio, and thus deadlock, so flush the 5095 * write bio here. 5096 */ 5097 submit_write_bio(epd, 0); 5098 goto retry; 5099 } 5100 5101 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) 5102 mapping->writeback_index = done_index; 5103 5104 btrfs_add_delayed_iput(inode); 5105 return ret; 5106 } 5107 5108 /* 5109 * Submit the pages in the range to bio for call sites which delalloc range has 5110 * already been ran (aka, ordered extent inserted) and all pages are still 5111 * locked. 
5112 */ 5113 int extent_write_locked_range(struct inode *inode, u64 start, u64 end) 5114 { 5115 bool found_error = false; 5116 int first_error = 0; 5117 int ret = 0; 5118 struct address_space *mapping = inode->i_mapping; 5119 struct page *page; 5120 u64 cur = start; 5121 unsigned long nr_pages; 5122 const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize; 5123 struct extent_page_data epd = { 5124 .bio_ctrl = { 0 }, 5125 .extent_locked = 1, 5126 .sync_io = 1, 5127 }; 5128 struct writeback_control wbc_writepages = { 5129 .sync_mode = WB_SYNC_ALL, 5130 .range_start = start, 5131 .range_end = end + 1, 5132 /* We're called from an async helper function */ 5133 .punt_to_cgroup = 1, 5134 .no_cgroup_owner = 1, 5135 }; 5136 5137 ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize)); 5138 nr_pages = (round_up(end, PAGE_SIZE) - round_down(start, PAGE_SIZE)) >> 5139 PAGE_SHIFT; 5140 wbc_writepages.nr_to_write = nr_pages * 2; 5141 5142 wbc_attach_fdatawrite_inode(&wbc_writepages, inode); 5143 while (cur <= end) { 5144 u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); 5145 5146 page = find_get_page(mapping, cur >> PAGE_SHIFT); 5147 /* 5148 * All pages in the range are locked since 5149 * btrfs_run_delalloc_range(), thus there is no way to clear 5150 * the page dirty flag. 5151 */ 5152 ASSERT(PageLocked(page)); 5153 ASSERT(PageDirty(page)); 5154 clear_page_dirty_for_io(page); 5155 ret = __extent_writepage(page, &wbc_writepages, &epd); 5156 ASSERT(ret <= 0); 5157 if (ret < 0) { 5158 found_error = true; 5159 first_error = ret; 5160 } 5161 put_page(page); 5162 cur = cur_end + 1; 5163 } 5164 5165 submit_write_bio(&epd, found_error ? ret : 0); 5166 5167 wbc_detach_inode(&wbc_writepages); 5168 if (found_error) 5169 return first_error; 5170 return ret; 5171 } 5172 5173 int extent_writepages(struct address_space *mapping, 5174 struct writeback_control *wbc) 5175 { 5176 struct inode *inode = mapping->host; 5177 int ret = 0; 5178 struct extent_page_data epd = { 5179 .bio_ctrl = { 0 }, 5180 .extent_locked = 0, 5181 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 5182 }; 5183 5184 /* 5185 * Allow only a single thread to do the reloc work in zoned mode to 5186 * protect the write pointer updates. 
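 * Zoned devices require sequential writes at the zone write pointer, so
 * concurrent writepages on the relocation inode could otherwise submit
 * bios out of order.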
5187 */ 5188 btrfs_zoned_data_reloc_lock(BTRFS_I(inode)); 5189 ret = extent_write_cache_pages(mapping, wbc, &epd); 5190 submit_write_bio(&epd, ret); 5191 btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); 5192 return ret; 5193 } 5194 5195 void extent_readahead(struct readahead_control *rac) 5196 { 5197 struct btrfs_bio_ctrl bio_ctrl = { 0 }; 5198 struct page *pagepool[16]; 5199 struct extent_map *em_cached = NULL; 5200 u64 prev_em_start = (u64)-1; 5201 int nr; 5202 5203 while ((nr = readahead_page_batch(rac, pagepool))) { 5204 u64 contig_start = readahead_pos(rac); 5205 u64 contig_end = contig_start + readahead_batch_length(rac) - 1; 5206 5207 contiguous_readpages(pagepool, nr, contig_start, contig_end, 5208 &em_cached, &bio_ctrl, &prev_em_start); 5209 } 5210 5211 if (em_cached) 5212 free_extent_map(em_cached); 5213 submit_one_bio(&bio_ctrl); 5214 } 5215 5216 /* 5217 * basic invalidate_folio code, this waits on any locked or writeback 5218 * ranges corresponding to the folio, and then deletes any extent state 5219 * records from the tree 5220 */ 5221 int extent_invalidate_folio(struct extent_io_tree *tree, 5222 struct folio *folio, size_t offset) 5223 { 5224 struct extent_state *cached_state = NULL; 5225 u64 start = folio_pos(folio); 5226 u64 end = start + folio_size(folio) - 1; 5227 size_t blocksize = folio->mapping->host->i_sb->s_blocksize; 5228 5229 /* This function is only called for the btree inode */ 5230 ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO); 5231 5232 start += ALIGN(offset, blocksize); 5233 if (start > end) 5234 return 0; 5235 5236 lock_extent_bits(tree, start, end, &cached_state); 5237 folio_wait_writeback(folio); 5238 5239 /* 5240 * Currently for btree io tree, only EXTENT_LOCKED is utilized, 5241 * so here we only need to unlock the extent range to free any 5242 * existing extent state. 5243 */ 5244 unlock_extent_cached(tree, start, end, &cached_state); 5245 return 0; 5246 } 5247 5248 /* 5249 * a helper for release_folio, this tests for areas of the page that 5250 * are locked or under IO and drops the related state bits if it is safe 5251 * to drop the page. 5252 */ 5253 static int try_release_extent_state(struct extent_io_tree *tree, 5254 struct page *page, gfp_t mask) 5255 { 5256 u64 start = page_offset(page); 5257 u64 end = start + PAGE_SIZE - 1; 5258 int ret = 1; 5259 5260 if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) { 5261 ret = 0; 5262 } else { 5263 /* 5264 * At this point we can safely clear everything except the 5265 * locked bit, the nodatasum bit and the delalloc new bit. 5266 * The delalloc new bit will be cleared by ordered extent 5267 * completion. 5268 */ 5269 ret = __clear_extent_bit(tree, start, end, 5270 ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW), 5271 0, 0, NULL, mask, NULL); 5272 5273 /* if clear_extent_bit failed for enomem reasons, 5274 * we can't allow the release to continue. 5275 */ 5276 if (ret < 0) 5277 ret = 0; 5278 else 5279 ret = 1; 5280 } 5281 return ret; 5282 } 5283 5284 /* 5285 * a helper for release_folio. 
As long as there are no locked extents 5286 * in the range corresponding to the page, both state records and extent 5287 * map records are removed 5288 */ 5289 int try_release_extent_mapping(struct page *page, gfp_t mask) 5290 { 5291 struct extent_map *em; 5292 u64 start = page_offset(page); 5293 u64 end = start + PAGE_SIZE - 1; 5294 struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host); 5295 struct extent_io_tree *tree = &btrfs_inode->io_tree; 5296 struct extent_map_tree *map = &btrfs_inode->extent_tree; 5297 5298 if (gfpflags_allow_blocking(mask) && 5299 page->mapping->host->i_size > SZ_16M) { 5300 u64 len; 5301 while (start <= end) { 5302 struct btrfs_fs_info *fs_info; 5303 u64 cur_gen; 5304 5305 len = end - start + 1; 5306 write_lock(&map->lock); 5307 em = lookup_extent_mapping(map, start, len); 5308 if (!em) { 5309 write_unlock(&map->lock); 5310 break; 5311 } 5312 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || 5313 em->start != start) { 5314 write_unlock(&map->lock); 5315 free_extent_map(em); 5316 break; 5317 } 5318 if (test_range_bit(tree, em->start, 5319 extent_map_end(em) - 1, 5320 EXTENT_LOCKED, 0, NULL)) 5321 goto next; 5322 /* 5323 * If it's not in the list of modified extents, used 5324 * by a fast fsync, we can remove it. If it's being 5325 * logged we can safely remove it since fsync took an 5326 * extra reference on the em. 5327 */ 5328 if (list_empty(&em->list) || 5329 test_bit(EXTENT_FLAG_LOGGING, &em->flags)) 5330 goto remove_em; 5331 /* 5332 * If it's in the list of modified extents, remove it 5333 * only if its generation is older then the current one, 5334 * in which case we don't need it for a fast fsync. 5335 * Otherwise don't remove it, we could be racing with an 5336 * ongoing fast fsync that could miss the new extent. 5337 */ 5338 fs_info = btrfs_inode->root->fs_info; 5339 spin_lock(&fs_info->trans_lock); 5340 cur_gen = fs_info->generation; 5341 spin_unlock(&fs_info->trans_lock); 5342 if (em->generation >= cur_gen) 5343 goto next; 5344 remove_em: 5345 /* 5346 * We only remove extent maps that are not in the list of 5347 * modified extents or that are in the list but with a 5348 * generation lower then the current generation, so there 5349 * is no need to set the full fsync flag on the inode (it 5350 * hurts the fsync performance for workloads with a data 5351 * size that exceeds or is close to the system's memory). 5352 */ 5353 remove_extent_mapping(map, em); 5354 /* once for the rb tree */ 5355 free_extent_map(em); 5356 next: 5357 start = extent_map_end(em); 5358 write_unlock(&map->lock); 5359 5360 /* once for us */ 5361 free_extent_map(em); 5362 5363 cond_resched(); /* Allow large-extent preemption. */ 5364 } 5365 } 5366 return try_release_extent_state(tree, page, mask); 5367 } 5368 5369 /* 5370 * helper function for fiemap, which doesn't want to see any holes. 
5371 * This maps until we find something past 'last' 5372 */ 5373 static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode, 5374 u64 offset, u64 last) 5375 { 5376 u64 sectorsize = btrfs_inode_sectorsize(inode); 5377 struct extent_map *em; 5378 u64 len; 5379 5380 if (offset >= last) 5381 return NULL; 5382 5383 while (1) { 5384 len = last - offset; 5385 if (len == 0) 5386 break; 5387 len = ALIGN(len, sectorsize); 5388 em = btrfs_get_extent_fiemap(inode, offset, len); 5389 if (IS_ERR(em)) 5390 return em; 5391 5392 /* if this isn't a hole return it */ 5393 if (em->block_start != EXTENT_MAP_HOLE) 5394 return em; 5395 5396 /* this is a hole, advance to the next extent */ 5397 offset = extent_map_end(em); 5398 free_extent_map(em); 5399 if (offset >= last) 5400 break; 5401 } 5402 return NULL; 5403 } 5404 5405 /* 5406 * To cache previous fiemap extent 5407 * 5408 * Will be used for merging fiemap extent 5409 */ 5410 struct fiemap_cache { 5411 u64 offset; 5412 u64 phys; 5413 u64 len; 5414 u32 flags; 5415 bool cached; 5416 }; 5417 5418 /* 5419 * Helper to submit fiemap extent. 5420 * 5421 * Will try to merge current fiemap extent specified by @offset, @phys, 5422 * @len and @flags with cached one. 5423 * And only when we fails to merge, cached one will be submitted as 5424 * fiemap extent. 5425 * 5426 * Return value is the same as fiemap_fill_next_extent(). 5427 */ 5428 static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, 5429 struct fiemap_cache *cache, 5430 u64 offset, u64 phys, u64 len, u32 flags) 5431 { 5432 int ret = 0; 5433 5434 if (!cache->cached) 5435 goto assign; 5436 5437 /* 5438 * Sanity check, extent_fiemap() should have ensured that new 5439 * fiemap extent won't overlap with cached one. 5440 * Not recoverable. 5441 * 5442 * NOTE: Physical address can overlap, due to compression 5443 */ 5444 if (cache->offset + cache->len > offset) { 5445 WARN_ON(1); 5446 return -EINVAL; 5447 } 5448 5449 /* 5450 * Only merges fiemap extents if 5451 * 1) Their logical addresses are continuous 5452 * 5453 * 2) Their physical addresses are continuous 5454 * So truly compressed (physical size smaller than logical size) 5455 * extents won't get merged with each other 5456 * 5457 * 3) Share same flags except FIEMAP_EXTENT_LAST 5458 * So regular extent won't get merged with prealloc extent 5459 */ 5460 if (cache->offset + cache->len == offset && 5461 cache->phys + cache->len == phys && 5462 (cache->flags & ~FIEMAP_EXTENT_LAST) == 5463 (flags & ~FIEMAP_EXTENT_LAST)) { 5464 cache->len += len; 5465 cache->flags |= flags; 5466 goto try_submit_last; 5467 } 5468 5469 /* Not mergeable, need to submit cached one */ 5470 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, 5471 cache->len, cache->flags); 5472 cache->cached = false; 5473 if (ret) 5474 return ret; 5475 assign: 5476 cache->cached = true; 5477 cache->offset = offset; 5478 cache->phys = phys; 5479 cache->len = len; 5480 cache->flags = flags; 5481 try_submit_last: 5482 if (cache->flags & FIEMAP_EXTENT_LAST) { 5483 ret = fiemap_fill_next_extent(fieinfo, cache->offset, 5484 cache->phys, cache->len, cache->flags); 5485 cache->cached = false; 5486 } 5487 return ret; 5488 } 5489 5490 /* 5491 * Emit last fiemap cache 5492 * 5493 * The last fiemap cache may still be cached in the following case: 5494 * 0 4k 8k 5495 * |<- Fiemap range ->| 5496 * |<------------ First extent ----------->| 5497 * 5498 * In this case, the first extent range will be cached but not emitted. 5499 * So we must emit it before ending extent_fiemap(). 
5500 */ 5501 static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo, 5502 struct fiemap_cache *cache) 5503 { 5504 int ret; 5505 5506 if (!cache->cached) 5507 return 0; 5508 5509 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, 5510 cache->len, cache->flags); 5511 cache->cached = false; 5512 if (ret > 0) 5513 ret = 0; 5514 return ret; 5515 } 5516 5517 int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, 5518 u64 start, u64 len) 5519 { 5520 int ret = 0; 5521 u64 off; 5522 u64 max = start + len; 5523 u32 flags = 0; 5524 u32 found_type; 5525 u64 last; 5526 u64 last_for_get_extent = 0; 5527 u64 disko = 0; 5528 u64 isize = i_size_read(&inode->vfs_inode); 5529 struct btrfs_key found_key; 5530 struct extent_map *em = NULL; 5531 struct extent_state *cached_state = NULL; 5532 struct btrfs_path *path; 5533 struct btrfs_root *root = inode->root; 5534 struct fiemap_cache cache = { 0 }; 5535 struct ulist *roots; 5536 struct ulist *tmp_ulist; 5537 int end = 0; 5538 u64 em_start = 0; 5539 u64 em_len = 0; 5540 u64 em_end = 0; 5541 5542 if (len == 0) 5543 return -EINVAL; 5544 5545 path = btrfs_alloc_path(); 5546 if (!path) 5547 return -ENOMEM; 5548 5549 roots = ulist_alloc(GFP_KERNEL); 5550 tmp_ulist = ulist_alloc(GFP_KERNEL); 5551 if (!roots || !tmp_ulist) { 5552 ret = -ENOMEM; 5553 goto out_free_ulist; 5554 } 5555 5556 /* 5557 * We can't initialize that to 'start' as this could miss extents due 5558 * to extent item merging 5559 */ 5560 off = 0; 5561 start = round_down(start, btrfs_inode_sectorsize(inode)); 5562 len = round_up(max, btrfs_inode_sectorsize(inode)) - start; 5563 5564 /* 5565 * lookup the last file extent. We're not using i_size here 5566 * because there might be preallocation past i_size 5567 */ 5568 ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1, 5569 0); 5570 if (ret < 0) { 5571 goto out_free_ulist; 5572 } else { 5573 WARN_ON(!ret); 5574 if (ret == 1) 5575 ret = 0; 5576 } 5577 5578 path->slots[0]--; 5579 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 5580 found_type = found_key.type; 5581 5582 /* No extents, but there might be delalloc bits */ 5583 if (found_key.objectid != btrfs_ino(inode) || 5584 found_type != BTRFS_EXTENT_DATA_KEY) { 5585 /* have to trust i_size as the end */ 5586 last = (u64)-1; 5587 last_for_get_extent = isize; 5588 } else { 5589 /* 5590 * remember the start of the last extent. There are a 5591 * bunch of different factors that go into the length of the 5592 * extent, so its much less complex to remember where it started 5593 */ 5594 last = found_key.offset; 5595 last_for_get_extent = last + 1; 5596 } 5597 btrfs_release_path(path); 5598 5599 /* 5600 * we might have some extents allocated but more delalloc past those 5601 * extents. so, we trust isize unless the start of the last extent is 5602 * beyond isize 5603 */ 5604 if (last < isize) { 5605 last = (u64)-1; 5606 last_for_get_extent = isize; 5607 } 5608 5609 lock_extent_bits(&inode->io_tree, start, start + len - 1, 5610 &cached_state); 5611 5612 em = get_extent_skip_holes(inode, start, last_for_get_extent); 5613 if (!em) 5614 goto out; 5615 if (IS_ERR(em)) { 5616 ret = PTR_ERR(em); 5617 goto out; 5618 } 5619 5620 while (!end) { 5621 u64 offset_in_extent = 0; 5622 5623 /* break if the extent we found is outside the range */ 5624 if (em->start >= max || extent_map_end(em) < off) 5625 break; 5626 5627 /* 5628 * get_extent may return an extent that starts before our 5629 * requested range. 
We have to make sure the ranges 5630 * we return to fiemap always move forward and don't 5631 * overlap, so adjust the offsets here 5632 */ 5633 em_start = max(em->start, off); 5634 5635 /* 5636 * record the offset from the start of the extent 5637 * for adjusting the disk offset below. Only do this if the 5638 * extent isn't compressed since our in ram offset may be past 5639 * what we have actually allocated on disk. 5640 */ 5641 if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 5642 offset_in_extent = em_start - em->start; 5643 em_end = extent_map_end(em); 5644 em_len = em_end - em_start; 5645 flags = 0; 5646 if (em->block_start < EXTENT_MAP_LAST_BYTE) 5647 disko = em->block_start + offset_in_extent; 5648 else 5649 disko = 0; 5650 5651 /* 5652 * bump off for our next call to get_extent 5653 */ 5654 off = extent_map_end(em); 5655 if (off >= max) 5656 end = 1; 5657 5658 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 5659 end = 1; 5660 flags |= FIEMAP_EXTENT_LAST; 5661 } else if (em->block_start == EXTENT_MAP_INLINE) { 5662 flags |= (FIEMAP_EXTENT_DATA_INLINE | 5663 FIEMAP_EXTENT_NOT_ALIGNED); 5664 } else if (em->block_start == EXTENT_MAP_DELALLOC) { 5665 flags |= (FIEMAP_EXTENT_DELALLOC | 5666 FIEMAP_EXTENT_UNKNOWN); 5667 } else if (fieinfo->fi_extents_max) { 5668 u64 bytenr = em->block_start - 5669 (em->start - em->orig_start); 5670 5671 /* 5672 * As btrfs supports shared space, this information 5673 * can be exported to userspace tools via 5674 * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0 5675 * then we're just getting a count and we can skip the 5676 * lookup stuff. 5677 */ 5678 ret = btrfs_check_shared(root, btrfs_ino(inode), 5679 bytenr, roots, tmp_ulist); 5680 if (ret < 0) 5681 goto out_free; 5682 if (ret) 5683 flags |= FIEMAP_EXTENT_SHARED; 5684 ret = 0; 5685 } 5686 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 5687 flags |= FIEMAP_EXTENT_ENCODED; 5688 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 5689 flags |= FIEMAP_EXTENT_UNWRITTEN; 5690 5691 free_extent_map(em); 5692 em = NULL; 5693 if ((em_start >= last) || em_len == (u64)-1 || 5694 (last == (u64)-1 && isize <= em_end)) { 5695 flags |= FIEMAP_EXTENT_LAST; 5696 end = 1; 5697 } 5698 5699 /* now scan forward to see if this is really the last extent. 
*/ 5700 em = get_extent_skip_holes(inode, off, last_for_get_extent); 5701 if (IS_ERR(em)) { 5702 ret = PTR_ERR(em); 5703 goto out; 5704 } 5705 if (!em) { 5706 flags |= FIEMAP_EXTENT_LAST; 5707 end = 1; 5708 } 5709 ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko, 5710 em_len, flags); 5711 if (ret) { 5712 if (ret == 1) 5713 ret = 0; 5714 goto out_free; 5715 } 5716 } 5717 out_free: 5718 if (!ret) 5719 ret = emit_last_fiemap_cache(fieinfo, &cache); 5720 free_extent_map(em); 5721 out: 5722 unlock_extent_cached(&inode->io_tree, start, start + len - 1, 5723 &cached_state); 5724 5725 out_free_ulist: 5726 btrfs_free_path(path); 5727 ulist_free(roots); 5728 ulist_free(tmp_ulist); 5729 return ret; 5730 } 5731 5732 static void __free_extent_buffer(struct extent_buffer *eb) 5733 { 5734 kmem_cache_free(extent_buffer_cache, eb); 5735 } 5736 5737 int extent_buffer_under_io(const struct extent_buffer *eb) 5738 { 5739 return (atomic_read(&eb->io_pages) || 5740 test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || 5741 test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 5742 } 5743 5744 static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page) 5745 { 5746 struct btrfs_subpage *subpage; 5747 5748 lockdep_assert_held(&page->mapping->private_lock); 5749 5750 if (PagePrivate(page)) { 5751 subpage = (struct btrfs_subpage *)page->private; 5752 if (atomic_read(&subpage->eb_refs)) 5753 return true; 5754 /* 5755 * Even there is no eb refs here, we may still have 5756 * end_page_read() call relying on page::private. 5757 */ 5758 if (atomic_read(&subpage->readers)) 5759 return true; 5760 } 5761 return false; 5762 } 5763 5764 static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page) 5765 { 5766 struct btrfs_fs_info *fs_info = eb->fs_info; 5767 const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); 5768 5769 /* 5770 * For mapped eb, we're going to change the page private, which should 5771 * be done under the private_lock. 5772 */ 5773 if (mapped) 5774 spin_lock(&page->mapping->private_lock); 5775 5776 if (!PagePrivate(page)) { 5777 if (mapped) 5778 spin_unlock(&page->mapping->private_lock); 5779 return; 5780 } 5781 5782 if (fs_info->nodesize >= PAGE_SIZE) { 5783 /* 5784 * We do this since we'll remove the pages after we've 5785 * removed the eb from the radix tree, so we could race 5786 * and have this page now attached to the new eb. So 5787 * only clear page_private if it's still connected to 5788 * this eb. 5789 */ 5790 if (PagePrivate(page) && 5791 page->private == (unsigned long)eb) { 5792 BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 5793 BUG_ON(PageDirty(page)); 5794 BUG_ON(PageWriteback(page)); 5795 /* 5796 * We need to make sure we haven't be attached 5797 * to a new eb. 5798 */ 5799 detach_page_private(page); 5800 } 5801 if (mapped) 5802 spin_unlock(&page->mapping->private_lock); 5803 return; 5804 } 5805 5806 /* 5807 * For subpage, we can have dummy eb with page private. In this case, 5808 * we can directly detach the private as such page is only attached to 5809 * one dummy eb, no sharing. 5810 */ 5811 if (!mapped) { 5812 btrfs_detach_subpage(fs_info, page); 5813 return; 5814 } 5815 5816 btrfs_page_dec_eb_refs(fs_info, page); 5817 5818 /* 5819 * We can only detach the page private if there are no other ebs in the 5820 * page range and no unfinished IO. 
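 * page_range_has_eb() checks both the remaining eb refs and the subpage
 * reader count for exactly that reason.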
5821 */ 5822 if (!page_range_has_eb(fs_info, page)) 5823 btrfs_detach_subpage(fs_info, page); 5824 5825 spin_unlock(&page->mapping->private_lock); 5826 } 5827 5828 /* Release all pages attached to the extent buffer */ 5829 static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) 5830 { 5831 int i; 5832 int num_pages; 5833 5834 ASSERT(!extent_buffer_under_io(eb)); 5835 5836 num_pages = num_extent_pages(eb); 5837 for (i = 0; i < num_pages; i++) { 5838 struct page *page = eb->pages[i]; 5839 5840 if (!page) 5841 continue; 5842 5843 detach_extent_buffer_page(eb, page); 5844 5845 /* One for when we allocated the page */ 5846 put_page(page); 5847 } 5848 } 5849 5850 /* 5851 * Helper for releasing the extent buffer. 5852 */ 5853 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) 5854 { 5855 btrfs_release_extent_buffer_pages(eb); 5856 btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list); 5857 __free_extent_buffer(eb); 5858 } 5859 5860 static struct extent_buffer * 5861 __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, 5862 unsigned long len) 5863 { 5864 struct extent_buffer *eb = NULL; 5865 5866 eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL); 5867 eb->start = start; 5868 eb->len = len; 5869 eb->fs_info = fs_info; 5870 eb->bflags = 0; 5871 init_rwsem(&eb->lock); 5872 5873 btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list, 5874 &fs_info->allocated_ebs); 5875 INIT_LIST_HEAD(&eb->release_list); 5876 5877 spin_lock_init(&eb->refs_lock); 5878 atomic_set(&eb->refs, 1); 5879 atomic_set(&eb->io_pages, 0); 5880 5881 ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE); 5882 5883 return eb; 5884 } 5885 5886 struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) 5887 { 5888 int i; 5889 struct extent_buffer *new; 5890 int num_pages = num_extent_pages(src); 5891 int ret; 5892 5893 new = __alloc_extent_buffer(src->fs_info, src->start, src->len); 5894 if (new == NULL) 5895 return NULL; 5896 5897 /* 5898 * Set UNMAPPED before calling btrfs_release_extent_buffer(), as 5899 * btrfs_release_extent_buffer() have different behavior for 5900 * UNMAPPED subpage extent buffer. 
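 *
 * A rough sketch of why (paths simplified): for an UNMAPPED eb,
 * detach_extent_buffer_page() skips mapping->private_lock and, in the
 * subpage case, calls btrfs_detach_subpage() directly, since a cloned eb
 * owns its pages exclusively. Setting the bit before any error path can
 * call btrfs_release_extent_buffer() keeps that assumption valid.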
5901 */ 5902 set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); 5903 5904 memset(new->pages, 0, sizeof(*new->pages) * num_pages); 5905 ret = btrfs_alloc_page_array(num_pages, new->pages); 5906 if (ret) { 5907 btrfs_release_extent_buffer(new); 5908 return NULL; 5909 } 5910 5911 for (i = 0; i < num_pages; i++) { 5912 int ret; 5913 struct page *p = new->pages[i]; 5914 5915 ret = attach_extent_buffer_page(new, p, NULL); 5916 if (ret < 0) { 5917 btrfs_release_extent_buffer(new); 5918 return NULL; 5919 } 5920 WARN_ON(PageDirty(p)); 5921 copy_page(page_address(p), page_address(src->pages[i])); 5922 } 5923 set_extent_buffer_uptodate(new); 5924 5925 return new; 5926 } 5927 5928 struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, 5929 u64 start, unsigned long len) 5930 { 5931 struct extent_buffer *eb; 5932 int num_pages; 5933 int i; 5934 int ret; 5935 5936 eb = __alloc_extent_buffer(fs_info, start, len); 5937 if (!eb) 5938 return NULL; 5939 5940 num_pages = num_extent_pages(eb); 5941 ret = btrfs_alloc_page_array(num_pages, eb->pages); 5942 if (ret) 5943 goto err; 5944 5945 for (i = 0; i < num_pages; i++) { 5946 struct page *p = eb->pages[i]; 5947 5948 ret = attach_extent_buffer_page(eb, p, NULL); 5949 if (ret < 0) 5950 goto err; 5951 } 5952 5953 set_extent_buffer_uptodate(eb); 5954 btrfs_set_header_nritems(eb, 0); 5955 set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); 5956 5957 return eb; 5958 err: 5959 for (i = 0; i < num_pages; i++) { 5960 if (eb->pages[i]) { 5961 detach_extent_buffer_page(eb, eb->pages[i]); 5962 __free_page(eb->pages[i]); 5963 } 5964 } 5965 __free_extent_buffer(eb); 5966 return NULL; 5967 } 5968 5969 struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, 5970 u64 start) 5971 { 5972 return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize); 5973 } 5974 5975 static void check_buffer_tree_ref(struct extent_buffer *eb) 5976 { 5977 int refs; 5978 /* 5979 * The TREE_REF bit is first set when the extent_buffer is added 5980 * to the radix tree. It is also reset, if unset, when a new reference 5981 * is created by find_extent_buffer. 5982 * 5983 * It is only cleared in two cases: freeing the last non-tree 5984 * reference to the extent_buffer when its STALE bit is set or 5985 * calling release_folio when the tree reference is the only reference. 5986 * 5987 * In both cases, care is taken to ensure that the extent_buffer's 5988 * pages are not under io. However, release_folio can be concurrently 5989 * called with creating new references, which is prone to race 5990 * conditions between the calls to check_buffer_tree_ref in those 5991 * codepaths and clearing TREE_REF in try_release_extent_buffer. 5992 * 5993 * The actual lifetime of the extent_buffer in the radix tree is 5994 * adequately protected by the refcount, but the TREE_REF bit and 5995 * its corresponding reference are not. To protect against this 5996 * class of races, we call check_buffer_tree_ref from the codepaths 5997 * which trigger io after they set eb->io_pages. Note that once io is 5998 * initiated, TREE_REF can no longer be cleared, so that is the 5999 * moment at which any such race is best fixed. 
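 *
 * A concrete interleaving this guards against (task layout is purely
 * illustrative):
 *
 *   read path                          btree_release_folio
 *   ---------                          -------------------
 *   atomic_set(&eb->io_pages, num)
 *                                      extent_buffer_under_io() is true,
 *                                      so TREE_REF is left alone
 *   check_buffer_tree_ref(eb)
 *   submit the read bios
 *
 * Calling check_buffer_tree_ref() before raising io_pages would reopen
 * the window in which try_release_extent_buffer() can clear TREE_REF.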
6000 */ 6001 refs = atomic_read(&eb->refs); 6002 if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 6003 return; 6004 6005 spin_lock(&eb->refs_lock); 6006 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 6007 atomic_inc(&eb->refs); 6008 spin_unlock(&eb->refs_lock); 6009 } 6010 6011 static void mark_extent_buffer_accessed(struct extent_buffer *eb, 6012 struct page *accessed) 6013 { 6014 int num_pages, i; 6015 6016 check_buffer_tree_ref(eb); 6017 6018 num_pages = num_extent_pages(eb); 6019 for (i = 0; i < num_pages; i++) { 6020 struct page *p = eb->pages[i]; 6021 6022 if (p != accessed) 6023 mark_page_accessed(p); 6024 } 6025 } 6026 6027 struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, 6028 u64 start) 6029 { 6030 struct extent_buffer *eb; 6031 6032 eb = find_extent_buffer_nolock(fs_info, start); 6033 if (!eb) 6034 return NULL; 6035 /* 6036 * Lock our eb's refs_lock to avoid races with free_extent_buffer(). 6037 * When we get our eb it might be flagged with EXTENT_BUFFER_STALE and 6038 * another task running free_extent_buffer() might have seen that flag 6039 * set, eb->refs == 2, that the buffer isn't under IO (dirty and 6040 * writeback flags not set) and it's still in the tree (flag 6041 * EXTENT_BUFFER_TREE_REF set), therefore being in the process of 6042 * decrementing the extent buffer's reference count twice. So here we 6043 * could race and increment the eb's reference count, clear its stale 6044 * flag, mark it as dirty and drop our reference before the other task 6045 * finishes executing free_extent_buffer, which would later result in 6046 * an attempt to free an extent buffer that is dirty. 6047 */ 6048 if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) { 6049 spin_lock(&eb->refs_lock); 6050 spin_unlock(&eb->refs_lock); 6051 } 6052 mark_extent_buffer_accessed(eb, NULL); 6053 return eb; 6054 } 6055 6056 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 6057 struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, 6058 u64 start) 6059 { 6060 struct extent_buffer *eb, *exists = NULL; 6061 int ret; 6062 6063 eb = find_extent_buffer(fs_info, start); 6064 if (eb) 6065 return eb; 6066 eb = alloc_dummy_extent_buffer(fs_info, start); 6067 if (!eb) 6068 return ERR_PTR(-ENOMEM); 6069 eb->fs_info = fs_info; 6070 again: 6071 ret = radix_tree_preload(GFP_NOFS); 6072 if (ret) { 6073 exists = ERR_PTR(ret); 6074 goto free_eb; 6075 } 6076 spin_lock(&fs_info->buffer_lock); 6077 ret = radix_tree_insert(&fs_info->buffer_radix, 6078 start >> fs_info->sectorsize_bits, eb); 6079 spin_unlock(&fs_info->buffer_lock); 6080 radix_tree_preload_end(); 6081 if (ret == -EEXIST) { 6082 exists = find_extent_buffer(fs_info, start); 6083 if (exists) 6084 goto free_eb; 6085 else 6086 goto again; 6087 } 6088 check_buffer_tree_ref(eb); 6089 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); 6090 6091 return eb; 6092 free_eb: 6093 btrfs_release_extent_buffer(eb); 6094 return exists; 6095 } 6096 #endif 6097 6098 static struct extent_buffer *grab_extent_buffer( 6099 struct btrfs_fs_info *fs_info, struct page *page) 6100 { 6101 struct extent_buffer *exists; 6102 6103 /* 6104 * For subpage case, we completely rely on radix tree to ensure we 6105 * don't try to insert two ebs for the same bytenr. So here we always 6106 * return NULL and just continue. 
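 *
 * Illustrative index math (assuming a 4K sectorsize): the buffer_radix
 * key is bytenr >> sectorsize_bits, so an eb at bytenr 0x40000000 is
 * inserted at index 0x40000, and a second attempt to insert the same
 * bytenr fails with -EEXIST no matter how many ebs happen to share the
 * page.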
6107 */ 6108 if (fs_info->nodesize < PAGE_SIZE) 6109 return NULL; 6110 6111 /* Page not yet attached to an extent buffer */ 6112 if (!PagePrivate(page)) 6113 return NULL; 6114 6115 /* 6116 * We could have already allocated an eb for this page and attached one 6117 * so lets see if we can get a ref on the existing eb, and if we can we 6118 * know it's good and we can just return that one, else we know we can 6119 * just overwrite page->private. 6120 */ 6121 exists = (struct extent_buffer *)page->private; 6122 if (atomic_inc_not_zero(&exists->refs)) 6123 return exists; 6124 6125 WARN_ON(PageDirty(page)); 6126 detach_page_private(page); 6127 return NULL; 6128 } 6129 6130 static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) 6131 { 6132 if (!IS_ALIGNED(start, fs_info->sectorsize)) { 6133 btrfs_err(fs_info, "bad tree block start %llu", start); 6134 return -EINVAL; 6135 } 6136 6137 if (fs_info->nodesize < PAGE_SIZE && 6138 offset_in_page(start) + fs_info->nodesize > PAGE_SIZE) { 6139 btrfs_err(fs_info, 6140 "tree block crosses page boundary, start %llu nodesize %u", 6141 start, fs_info->nodesize); 6142 return -EINVAL; 6143 } 6144 if (fs_info->nodesize >= PAGE_SIZE && 6145 !PAGE_ALIGNED(start)) { 6146 btrfs_err(fs_info, 6147 "tree block is not page aligned, start %llu nodesize %u", 6148 start, fs_info->nodesize); 6149 return -EINVAL; 6150 } 6151 return 0; 6152 } 6153 6154 struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, 6155 u64 start, u64 owner_root, int level) 6156 { 6157 unsigned long len = fs_info->nodesize; 6158 int num_pages; 6159 int i; 6160 unsigned long index = start >> PAGE_SHIFT; 6161 struct extent_buffer *eb; 6162 struct extent_buffer *exists = NULL; 6163 struct page *p; 6164 struct address_space *mapping = fs_info->btree_inode->i_mapping; 6165 u64 lockdep_owner = owner_root; 6166 int uptodate = 1; 6167 int ret; 6168 6169 if (check_eb_alignment(fs_info, start)) 6170 return ERR_PTR(-EINVAL); 6171 6172 #if BITS_PER_LONG == 32 6173 if (start >= MAX_LFS_FILESIZE) { 6174 btrfs_err_rl(fs_info, 6175 "extent buffer %llu is beyond 32bit page cache limit", start); 6176 btrfs_err_32bit_limit(fs_info); 6177 return ERR_PTR(-EOVERFLOW); 6178 } 6179 if (start >= BTRFS_32BIT_EARLY_WARN_THRESHOLD) 6180 btrfs_warn_32bit_limit(fs_info); 6181 #endif 6182 6183 eb = find_extent_buffer(fs_info, start); 6184 if (eb) 6185 return eb; 6186 6187 eb = __alloc_extent_buffer(fs_info, start, len); 6188 if (!eb) 6189 return ERR_PTR(-ENOMEM); 6190 6191 /* 6192 * The reloc trees are just snapshots, so we need them to appear to be 6193 * just like any other fs tree WRT lockdep. 6194 */ 6195 if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID) 6196 lockdep_owner = BTRFS_FS_TREE_OBJECTID; 6197 6198 btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level); 6199 6200 num_pages = num_extent_pages(eb); 6201 for (i = 0; i < num_pages; i++, index++) { 6202 struct btrfs_subpage *prealloc = NULL; 6203 6204 p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL); 6205 if (!p) { 6206 exists = ERR_PTR(-ENOMEM); 6207 goto free_eb; 6208 } 6209 6210 /* 6211 * Preallocate page->private for subpage case, so that we won't 6212 * allocate memory with private_lock hold. The memory will be 6213 * freed by attach_extent_buffer_page() or freed manually if 6214 * we exit earlier. 6215 * 6216 * Although we have ensured one subpage eb can only have one 6217 * page, but it may change in the future for 16K page size 6218 * support, so we still preallocate the memory in the loop. 
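 *
 * The pattern, roughly (error handling trimmed, a sketch only):
 *
 *	prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
 *	spin_lock(&mapping->private_lock);
 *	attach_extent_buffer_page(eb, p, prealloc);
 *	spin_unlock(&mapping->private_lock);
 *
 * i.e. allocate before taking the spinlock, hand the memory over while
 * holding it, and btrfs_free_subpage() the preallocation if another eb
 * won the race for this page.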
6219 */ 6220 if (fs_info->nodesize < PAGE_SIZE) { 6221 prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA); 6222 if (IS_ERR(prealloc)) { 6223 ret = PTR_ERR(prealloc); 6224 unlock_page(p); 6225 put_page(p); 6226 exists = ERR_PTR(ret); 6227 goto free_eb; 6228 } 6229 } 6230 6231 spin_lock(&mapping->private_lock); 6232 exists = grab_extent_buffer(fs_info, p); 6233 if (exists) { 6234 spin_unlock(&mapping->private_lock); 6235 unlock_page(p); 6236 put_page(p); 6237 mark_extent_buffer_accessed(exists, p); 6238 btrfs_free_subpage(prealloc); 6239 goto free_eb; 6240 } 6241 /* Should not fail, as we have preallocated the memory */ 6242 ret = attach_extent_buffer_page(eb, p, prealloc); 6243 ASSERT(!ret); 6244 /* 6245 * To inform we have extra eb under allocation, so that 6246 * detach_extent_buffer_page() won't release the page private 6247 * when the eb hasn't yet been inserted into radix tree. 6248 * 6249 * The ref will be decreased when the eb released the page, in 6250 * detach_extent_buffer_page(). 6251 * Thus needs no special handling in error path. 6252 */ 6253 btrfs_page_inc_eb_refs(fs_info, p); 6254 spin_unlock(&mapping->private_lock); 6255 6256 WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len)); 6257 eb->pages[i] = p; 6258 if (!PageUptodate(p)) 6259 uptodate = 0; 6260 6261 /* 6262 * We can't unlock the pages just yet since the extent buffer 6263 * hasn't been properly inserted in the radix tree, this 6264 * opens a race with btree_release_folio which can free a page 6265 * while we are still filling in all pages for the buffer and 6266 * we could crash. 6267 */ 6268 } 6269 if (uptodate) 6270 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 6271 again: 6272 ret = radix_tree_preload(GFP_NOFS); 6273 if (ret) { 6274 exists = ERR_PTR(ret); 6275 goto free_eb; 6276 } 6277 6278 spin_lock(&fs_info->buffer_lock); 6279 ret = radix_tree_insert(&fs_info->buffer_radix, 6280 start >> fs_info->sectorsize_bits, eb); 6281 spin_unlock(&fs_info->buffer_lock); 6282 radix_tree_preload_end(); 6283 if (ret == -EEXIST) { 6284 exists = find_extent_buffer(fs_info, start); 6285 if (exists) 6286 goto free_eb; 6287 else 6288 goto again; 6289 } 6290 /* add one reference for the tree */ 6291 check_buffer_tree_ref(eb); 6292 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); 6293 6294 /* 6295 * Now it's safe to unlock the pages because any calls to 6296 * btree_release_folio will correctly detect that a page belongs to a 6297 * live buffer and won't free them prematurely. 
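 *
 * Ordering recap (a condensed view of the steps above, not new logic):
 *
 *   1. lock and attach every page, bumping subpage eb_refs
 *   2. insert the eb into buffer_radix
 *   3. take the tree ref and set EXTENT_BUFFER_IN_TREE
 *   4. only then unlock the pages
 *
 * Unlocking any earlier would let btree_release_folio() run against a
 * page whose eb is not yet visible in the radix tree.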
6298 */ 6299 for (i = 0; i < num_pages; i++) 6300 unlock_page(eb->pages[i]); 6301 return eb; 6302 6303 free_eb: 6304 WARN_ON(!atomic_dec_and_test(&eb->refs)); 6305 for (i = 0; i < num_pages; i++) { 6306 if (eb->pages[i]) 6307 unlock_page(eb->pages[i]); 6308 } 6309 6310 btrfs_release_extent_buffer(eb); 6311 return exists; 6312 } 6313 6314 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) 6315 { 6316 struct extent_buffer *eb = 6317 container_of(head, struct extent_buffer, rcu_head); 6318 6319 __free_extent_buffer(eb); 6320 } 6321 6322 static int release_extent_buffer(struct extent_buffer *eb) 6323 __releases(&eb->refs_lock) 6324 { 6325 lockdep_assert_held(&eb->refs_lock); 6326 6327 WARN_ON(atomic_read(&eb->refs) == 0); 6328 if (atomic_dec_and_test(&eb->refs)) { 6329 if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) { 6330 struct btrfs_fs_info *fs_info = eb->fs_info; 6331 6332 spin_unlock(&eb->refs_lock); 6333 6334 spin_lock(&fs_info->buffer_lock); 6335 radix_tree_delete(&fs_info->buffer_radix, 6336 eb->start >> fs_info->sectorsize_bits); 6337 spin_unlock(&fs_info->buffer_lock); 6338 } else { 6339 spin_unlock(&eb->refs_lock); 6340 } 6341 6342 btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list); 6343 /* Should be safe to release our pages at this point */ 6344 btrfs_release_extent_buffer_pages(eb); 6345 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 6346 if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) { 6347 __free_extent_buffer(eb); 6348 return 1; 6349 } 6350 #endif 6351 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); 6352 return 1; 6353 } 6354 spin_unlock(&eb->refs_lock); 6355 6356 return 0; 6357 } 6358 6359 void free_extent_buffer(struct extent_buffer *eb) 6360 { 6361 int refs; 6362 int old; 6363 if (!eb) 6364 return; 6365 6366 while (1) { 6367 refs = atomic_read(&eb->refs); 6368 if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3) 6369 || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && 6370 refs == 1)) 6371 break; 6372 old = atomic_cmpxchg(&eb->refs, refs, refs - 1); 6373 if (old == refs) 6374 return; 6375 } 6376 6377 spin_lock(&eb->refs_lock); 6378 if (atomic_read(&eb->refs) == 2 && 6379 test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && 6380 !extent_buffer_under_io(eb) && 6381 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 6382 atomic_dec(&eb->refs); 6383 6384 /* 6385 * I know this is terrible, but it's temporary until we stop tracking 6386 * the uptodate bits and such for the extent buffers. 
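 *
 * (For reference, the lockless loop above only drops a reference when it
 * is clearly not the last interesting one, e.g. a mapped eb at refs == 5
 * goes to 4 with a bare cmpxchg, while refs <= 3 falls through to this
 * refs_lock path so the STALE/TREE_REF handling stays serialized.)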
6387 */ 6388 release_extent_buffer(eb); 6389 } 6390 6391 void free_extent_buffer_stale(struct extent_buffer *eb) 6392 { 6393 if (!eb) 6394 return; 6395 6396 spin_lock(&eb->refs_lock); 6397 set_bit(EXTENT_BUFFER_STALE, &eb->bflags); 6398 6399 if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) && 6400 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 6401 atomic_dec(&eb->refs); 6402 release_extent_buffer(eb); 6403 } 6404 6405 static void btree_clear_page_dirty(struct page *page) 6406 { 6407 ASSERT(PageDirty(page)); 6408 ASSERT(PageLocked(page)); 6409 clear_page_dirty_for_io(page); 6410 xa_lock_irq(&page->mapping->i_pages); 6411 if (!PageDirty(page)) 6412 __xa_clear_mark(&page->mapping->i_pages, 6413 page_index(page), PAGECACHE_TAG_DIRTY); 6414 xa_unlock_irq(&page->mapping->i_pages); 6415 } 6416 6417 static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb) 6418 { 6419 struct btrfs_fs_info *fs_info = eb->fs_info; 6420 struct page *page = eb->pages[0]; 6421 bool last; 6422 6423 /* btree_clear_page_dirty() needs page locked */ 6424 lock_page(page); 6425 last = btrfs_subpage_clear_and_test_dirty(fs_info, page, eb->start, 6426 eb->len); 6427 if (last) 6428 btree_clear_page_dirty(page); 6429 unlock_page(page); 6430 WARN_ON(atomic_read(&eb->refs) == 0); 6431 } 6432 6433 void clear_extent_buffer_dirty(const struct extent_buffer *eb) 6434 { 6435 int i; 6436 int num_pages; 6437 struct page *page; 6438 6439 if (eb->fs_info->nodesize < PAGE_SIZE) 6440 return clear_subpage_extent_buffer_dirty(eb); 6441 6442 num_pages = num_extent_pages(eb); 6443 6444 for (i = 0; i < num_pages; i++) { 6445 page = eb->pages[i]; 6446 if (!PageDirty(page)) 6447 continue; 6448 lock_page(page); 6449 btree_clear_page_dirty(page); 6450 ClearPageError(page); 6451 unlock_page(page); 6452 } 6453 WARN_ON(atomic_read(&eb->refs) == 0); 6454 } 6455 6456 bool set_extent_buffer_dirty(struct extent_buffer *eb) 6457 { 6458 int i; 6459 int num_pages; 6460 bool was_dirty; 6461 6462 check_buffer_tree_ref(eb); 6463 6464 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); 6465 6466 num_pages = num_extent_pages(eb); 6467 WARN_ON(atomic_read(&eb->refs) == 0); 6468 WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); 6469 6470 if (!was_dirty) { 6471 bool subpage = eb->fs_info->nodesize < PAGE_SIZE; 6472 6473 /* 6474 * For subpage case, we can have other extent buffers in the 6475 * same page, and in clear_subpage_extent_buffer_dirty() we 6476 * have to clear page dirty without subpage lock held. 6477 * This can cause race where our page gets dirty cleared after 6478 * we just set it. 6479 * 6480 * Thankfully, clear_subpage_extent_buffer_dirty() has locked 6481 * its page for other reasons, we can use page lock to prevent 6482 * the above race. 
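 *
 * Illustrative timeline of the serialized case (ebs A and B share one
 * page, names made up): A takes the page lock, sets its subpage bit and
 * the page dirty flag, then unlocks; B's
 * clear_subpage_extent_buffer_dirty() then locks the page, clears only
 * B's bits, sees A's bit still set and leaves the page dirty. Without
 * the page lock, B could clear the page dirty flag right after A set it
 * and A's dirty data could be missed by writeback.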
6483 */ 6484 if (subpage) 6485 lock_page(eb->pages[0]); 6486 for (i = 0; i < num_pages; i++) 6487 btrfs_page_set_dirty(eb->fs_info, eb->pages[i], 6488 eb->start, eb->len); 6489 if (subpage) 6490 unlock_page(eb->pages[0]); 6491 } 6492 #ifdef CONFIG_BTRFS_DEBUG 6493 for (i = 0; i < num_pages; i++) 6494 ASSERT(PageDirty(eb->pages[i])); 6495 #endif 6496 6497 return was_dirty; 6498 } 6499 6500 void clear_extent_buffer_uptodate(struct extent_buffer *eb) 6501 { 6502 struct btrfs_fs_info *fs_info = eb->fs_info; 6503 struct page *page; 6504 int num_pages; 6505 int i; 6506 6507 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 6508 num_pages = num_extent_pages(eb); 6509 for (i = 0; i < num_pages; i++) { 6510 page = eb->pages[i]; 6511 if (!page) 6512 continue; 6513 6514 /* 6515 * This is special handling for metadata subpage, as regular 6516 * btrfs_is_subpage() can not handle cloned/dummy metadata. 6517 */ 6518 if (fs_info->nodesize >= PAGE_SIZE) 6519 ClearPageUptodate(page); 6520 else 6521 btrfs_subpage_clear_uptodate(fs_info, page, eb->start, 6522 eb->len); 6523 } 6524 } 6525 6526 void set_extent_buffer_uptodate(struct extent_buffer *eb) 6527 { 6528 struct btrfs_fs_info *fs_info = eb->fs_info; 6529 struct page *page; 6530 int num_pages; 6531 int i; 6532 6533 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 6534 num_pages = num_extent_pages(eb); 6535 for (i = 0; i < num_pages; i++) { 6536 page = eb->pages[i]; 6537 6538 /* 6539 * This is special handling for metadata subpage, as regular 6540 * btrfs_is_subpage() can not handle cloned/dummy metadata. 6541 */ 6542 if (fs_info->nodesize >= PAGE_SIZE) 6543 SetPageUptodate(page); 6544 else 6545 btrfs_subpage_set_uptodate(fs_info, page, eb->start, 6546 eb->len); 6547 } 6548 } 6549 6550 static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, 6551 int mirror_num) 6552 { 6553 struct btrfs_fs_info *fs_info = eb->fs_info; 6554 struct extent_io_tree *io_tree; 6555 struct page *page = eb->pages[0]; 6556 struct btrfs_bio_ctrl bio_ctrl = { 6557 .mirror_num = mirror_num, 6558 }; 6559 int ret = 0; 6560 6561 ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)); 6562 ASSERT(PagePrivate(page)); 6563 io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; 6564 6565 if (wait == WAIT_NONE) { 6566 if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1)) 6567 return -EAGAIN; 6568 } else { 6569 ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1); 6570 if (ret < 0) 6571 return ret; 6572 } 6573 6574 ret = 0; 6575 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) || 6576 PageUptodate(page) || 6577 btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) { 6578 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 6579 unlock_extent(io_tree, eb->start, eb->start + eb->len - 1); 6580 return ret; 6581 } 6582 6583 clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); 6584 eb->read_mirror = 0; 6585 atomic_set(&eb->io_pages, 1); 6586 check_buffer_tree_ref(eb); 6587 btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len); 6588 6589 btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len); 6590 ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl, 6591 page, eb->start, eb->len, 6592 eb->start - page_offset(page), 6593 end_bio_extent_readpage, 0, true); 6594 if (ret) { 6595 /* 6596 * In the endio function, if we hit something wrong we will 6597 * increase the io_pages, so here we need to decrease it for 6598 * error path. 
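 *
 * io_pages accounting here, spelled out: it is set to 1 before
 * submit_extent_page(); on a successful submit the read completion is
 * expected to drop it, and on a failed submit we drop it ourselves below
 * so the eb does not stay flagged as under IO forever.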
6599 */ 6600 atomic_dec(&eb->io_pages); 6601 } 6602 submit_one_bio(&bio_ctrl); 6603 if (ret || wait != WAIT_COMPLETE) 6604 return ret; 6605 6606 wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, EXTENT_LOCKED); 6607 if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 6608 ret = -EIO; 6609 return ret; 6610 } 6611 6612 int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) 6613 { 6614 int i; 6615 struct page *page; 6616 int err; 6617 int ret = 0; 6618 int locked_pages = 0; 6619 int all_uptodate = 1; 6620 int num_pages; 6621 unsigned long num_reads = 0; 6622 struct btrfs_bio_ctrl bio_ctrl = { 6623 .mirror_num = mirror_num, 6624 }; 6625 6626 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 6627 return 0; 6628 6629 /* 6630 * We could have had EXTENT_BUFFER_UPTODATE cleared by the write 6631 * operation, which could potentially still be in flight. In this case 6632 * we simply want to return an error. 6633 */ 6634 if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))) 6635 return -EIO; 6636 6637 if (eb->fs_info->nodesize < PAGE_SIZE) 6638 return read_extent_buffer_subpage(eb, wait, mirror_num); 6639 6640 num_pages = num_extent_pages(eb); 6641 for (i = 0; i < num_pages; i++) { 6642 page = eb->pages[i]; 6643 if (wait == WAIT_NONE) { 6644 /* 6645 * WAIT_NONE is only utilized by readahead. If we can't 6646 * acquire the lock atomically it means either the eb 6647 * is being read out or under modification. 6648 * Either way the eb will be or has been cached, 6649 * readahead can exit safely. 6650 */ 6651 if (!trylock_page(page)) 6652 goto unlock_exit; 6653 } else { 6654 lock_page(page); 6655 } 6656 locked_pages++; 6657 } 6658 /* 6659 * We need to firstly lock all pages to make sure that 6660 * the uptodate bit of our pages won't be affected by 6661 * clear_extent_buffer_uptodate(). 6662 */ 6663 for (i = 0; i < num_pages; i++) { 6664 page = eb->pages[i]; 6665 if (!PageUptodate(page)) { 6666 num_reads++; 6667 all_uptodate = 0; 6668 } 6669 } 6670 6671 if (all_uptodate) { 6672 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 6673 goto unlock_exit; 6674 } 6675 6676 clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); 6677 eb->read_mirror = 0; 6678 atomic_set(&eb->io_pages, num_reads); 6679 /* 6680 * It is possible for release_folio to clear the TREE_REF bit before we 6681 * set io_pages. See check_buffer_tree_ref for a more detailed comment. 6682 */ 6683 check_buffer_tree_ref(eb); 6684 for (i = 0; i < num_pages; i++) { 6685 page = eb->pages[i]; 6686 6687 if (!PageUptodate(page)) { 6688 if (ret) { 6689 atomic_dec(&eb->io_pages); 6690 unlock_page(page); 6691 continue; 6692 } 6693 6694 ClearPageError(page); 6695 err = submit_extent_page(REQ_OP_READ, NULL, 6696 &bio_ctrl, page, page_offset(page), 6697 PAGE_SIZE, 0, end_bio_extent_readpage, 6698 0, false); 6699 if (err) { 6700 /* 6701 * We failed to submit the bio so it's the 6702 * caller's responsibility to perform cleanup 6703 * i.e unlock page/set error bit. 
6704 */ 6705 ret = err; 6706 SetPageError(page); 6707 unlock_page(page); 6708 atomic_dec(&eb->io_pages); 6709 } 6710 } else { 6711 unlock_page(page); 6712 } 6713 } 6714 6715 submit_one_bio(&bio_ctrl); 6716 6717 if (ret || wait != WAIT_COMPLETE) 6718 return ret; 6719 6720 for (i = 0; i < num_pages; i++) { 6721 page = eb->pages[i]; 6722 wait_on_page_locked(page); 6723 if (!PageUptodate(page)) 6724 ret = -EIO; 6725 } 6726 6727 return ret; 6728 6729 unlock_exit: 6730 while (locked_pages > 0) { 6731 locked_pages--; 6732 page = eb->pages[locked_pages]; 6733 unlock_page(page); 6734 } 6735 return ret; 6736 } 6737 6738 static bool report_eb_range(const struct extent_buffer *eb, unsigned long start, 6739 unsigned long len) 6740 { 6741 btrfs_warn(eb->fs_info, 6742 "access to eb bytenr %llu len %lu out of range start %lu len %lu", 6743 eb->start, eb->len, start, len); 6744 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); 6745 6746 return true; 6747 } 6748 6749 /* 6750 * Check if the [start, start + len) range is valid before reading/writing 6751 * the eb. 6752 * NOTE: @start and @len are offset inside the eb, not logical address. 6753 * 6754 * Caller should not touch the dst/src memory if this function returns error. 6755 */ 6756 static inline int check_eb_range(const struct extent_buffer *eb, 6757 unsigned long start, unsigned long len) 6758 { 6759 unsigned long offset; 6760 6761 /* start, start + len should not go beyond eb->len nor overflow */ 6762 if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len)) 6763 return report_eb_range(eb, start, len); 6764 6765 return false; 6766 } 6767 6768 void read_extent_buffer(const struct extent_buffer *eb, void *dstv, 6769 unsigned long start, unsigned long len) 6770 { 6771 size_t cur; 6772 size_t offset; 6773 struct page *page; 6774 char *kaddr; 6775 char *dst = (char *)dstv; 6776 unsigned long i = get_eb_page_index(start); 6777 6778 if (check_eb_range(eb, start, len)) 6779 return; 6780 6781 offset = get_eb_offset_in_page(eb, start); 6782 6783 while (len > 0) { 6784 page = eb->pages[i]; 6785 6786 cur = min(len, (PAGE_SIZE - offset)); 6787 kaddr = page_address(page); 6788 memcpy(dst, kaddr + offset, cur); 6789 6790 dst += cur; 6791 len -= cur; 6792 offset = 0; 6793 i++; 6794 } 6795 } 6796 6797 int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, 6798 void __user *dstv, 6799 unsigned long start, unsigned long len) 6800 { 6801 size_t cur; 6802 size_t offset; 6803 struct page *page; 6804 char *kaddr; 6805 char __user *dst = (char __user *)dstv; 6806 unsigned long i = get_eb_page_index(start); 6807 int ret = 0; 6808 6809 WARN_ON(start > eb->len); 6810 WARN_ON(start + len > eb->start + eb->len); 6811 6812 offset = get_eb_offset_in_page(eb, start); 6813 6814 while (len > 0) { 6815 page = eb->pages[i]; 6816 6817 cur = min(len, (PAGE_SIZE - offset)); 6818 kaddr = page_address(page); 6819 if (copy_to_user_nofault(dst, kaddr + offset, cur)) { 6820 ret = -EFAULT; 6821 break; 6822 } 6823 6824 dst += cur; 6825 len -= cur; 6826 offset = 0; 6827 i++; 6828 } 6829 6830 return ret; 6831 } 6832 6833 int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, 6834 unsigned long start, unsigned long len) 6835 { 6836 size_t cur; 6837 size_t offset; 6838 struct page *page; 6839 char *kaddr; 6840 char *ptr = (char *)ptrv; 6841 unsigned long i = get_eb_page_index(start); 6842 int ret = 0; 6843 6844 if (check_eb_range(eb, start, len)) 6845 return -EINVAL; 6846 6847 offset = get_eb_offset_in_page(eb, start); 6848 6849 while (len > 0) { 6850 page 
= eb->pages[i]; 6851 6852 cur = min(len, (PAGE_SIZE - offset)); 6853 6854 kaddr = page_address(page); 6855 ret = memcmp(ptr, kaddr + offset, cur); 6856 if (ret) 6857 break; 6858 6859 ptr += cur; 6860 len -= cur; 6861 offset = 0; 6862 i++; 6863 } 6864 return ret; 6865 } 6866 6867 /* 6868 * Check that the extent buffer is uptodate. 6869 * 6870 * For regular sector size == PAGE_SIZE case, check if @page is uptodate. 6871 * For subpage case, check if the range covered by the eb has EXTENT_UPTODATE. 6872 */ 6873 static void assert_eb_page_uptodate(const struct extent_buffer *eb, 6874 struct page *page) 6875 { 6876 struct btrfs_fs_info *fs_info = eb->fs_info; 6877 6878 /* 6879 * If we are using the commit root we could potentially clear a page 6880 * Uptodate while we're using the extent buffer that we've previously 6881 * looked up. We don't want to complain in this case, as the page was 6882 * valid before, we just didn't write it out. Instead we want to catch 6883 * the case where we didn't actually read the block properly, which 6884 * would have !PageUptodate && !PageError, as we clear PageError before 6885 * reading. 6886 */ 6887 if (fs_info->nodesize < PAGE_SIZE) { 6888 bool uptodate, error; 6889 6890 uptodate = btrfs_subpage_test_uptodate(fs_info, page, 6891 eb->start, eb->len); 6892 error = btrfs_subpage_test_error(fs_info, page, eb->start, eb->len); 6893 WARN_ON(!uptodate && !error); 6894 } else { 6895 WARN_ON(!PageUptodate(page) && !PageError(page)); 6896 } 6897 } 6898 6899 void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb, 6900 const void *srcv) 6901 { 6902 char *kaddr; 6903 6904 assert_eb_page_uptodate(eb, eb->pages[0]); 6905 kaddr = page_address(eb->pages[0]) + 6906 get_eb_offset_in_page(eb, offsetof(struct btrfs_header, 6907 chunk_tree_uuid)); 6908 memcpy(kaddr, srcv, BTRFS_FSID_SIZE); 6909 } 6910 6911 void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv) 6912 { 6913 char *kaddr; 6914 6915 assert_eb_page_uptodate(eb, eb->pages[0]); 6916 kaddr = page_address(eb->pages[0]) + 6917 get_eb_offset_in_page(eb, offsetof(struct btrfs_header, fsid)); 6918 memcpy(kaddr, srcv, BTRFS_FSID_SIZE); 6919 } 6920 6921 void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, 6922 unsigned long start, unsigned long len) 6923 { 6924 size_t cur; 6925 size_t offset; 6926 struct page *page; 6927 char *kaddr; 6928 char *src = (char *)srcv; 6929 unsigned long i = get_eb_page_index(start); 6930 6931 WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)); 6932 6933 if (check_eb_range(eb, start, len)) 6934 return; 6935 6936 offset = get_eb_offset_in_page(eb, start); 6937 6938 while (len > 0) { 6939 page = eb->pages[i]; 6940 assert_eb_page_uptodate(eb, page); 6941 6942 cur = min(len, PAGE_SIZE - offset); 6943 kaddr = page_address(page); 6944 memcpy(kaddr + offset, src, cur); 6945 6946 src += cur; 6947 len -= cur; 6948 offset = 0; 6949 i++; 6950 } 6951 } 6952 6953 void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, 6954 unsigned long len) 6955 { 6956 size_t cur; 6957 size_t offset; 6958 struct page *page; 6959 char *kaddr; 6960 unsigned long i = get_eb_page_index(start); 6961 6962 if (check_eb_range(eb, start, len)) 6963 return; 6964 6965 offset = get_eb_offset_in_page(eb, start); 6966 6967 while (len > 0) { 6968 page = eb->pages[i]; 6969 assert_eb_page_uptodate(eb, page); 6970 6971 cur = min(len, PAGE_SIZE - offset); 6972 kaddr = page_address(page); 6973 memset(kaddr + offset, 0, cur); 6974 6975 len -= cur; 6976 offset 
= 0; 6977 i++; 6978 } 6979 } 6980 6981 void copy_extent_buffer_full(const struct extent_buffer *dst, 6982 const struct extent_buffer *src) 6983 { 6984 int i; 6985 int num_pages; 6986 6987 ASSERT(dst->len == src->len); 6988 6989 if (dst->fs_info->nodesize >= PAGE_SIZE) { 6990 num_pages = num_extent_pages(dst); 6991 for (i = 0; i < num_pages; i++) 6992 copy_page(page_address(dst->pages[i]), 6993 page_address(src->pages[i])); 6994 } else { 6995 size_t src_offset = get_eb_offset_in_page(src, 0); 6996 size_t dst_offset = get_eb_offset_in_page(dst, 0); 6997 6998 ASSERT(src->fs_info->nodesize < PAGE_SIZE); 6999 memcpy(page_address(dst->pages[0]) + dst_offset, 7000 page_address(src->pages[0]) + src_offset, 7001 src->len); 7002 } 7003 } 7004 7005 void copy_extent_buffer(const struct extent_buffer *dst, 7006 const struct extent_buffer *src, 7007 unsigned long dst_offset, unsigned long src_offset, 7008 unsigned long len) 7009 { 7010 u64 dst_len = dst->len; 7011 size_t cur; 7012 size_t offset; 7013 struct page *page; 7014 char *kaddr; 7015 unsigned long i = get_eb_page_index(dst_offset); 7016 7017 if (check_eb_range(dst, dst_offset, len) || 7018 check_eb_range(src, src_offset, len)) 7019 return; 7020 7021 WARN_ON(src->len != dst_len); 7022 7023 offset = get_eb_offset_in_page(dst, dst_offset); 7024 7025 while (len > 0) { 7026 page = dst->pages[i]; 7027 assert_eb_page_uptodate(dst, page); 7028 7029 cur = min(len, (unsigned long)(PAGE_SIZE - offset)); 7030 7031 kaddr = page_address(page); 7032 read_extent_buffer(src, kaddr + offset, src_offset, cur); 7033 7034 src_offset += cur; 7035 len -= cur; 7036 offset = 0; 7037 i++; 7038 } 7039 } 7040 7041 /* 7042 * eb_bitmap_offset() - calculate the page and offset of the byte containing the 7043 * given bit number 7044 * @eb: the extent buffer 7045 * @start: offset of the bitmap item in the extent buffer 7046 * @nr: bit number 7047 * @page_index: return index of the page in the extent buffer that contains the 7048 * given bit number 7049 * @page_offset: return offset into the page given by page_index 7050 * 7051 * This helper hides the ugliness of finding the byte in an extent buffer which 7052 * contains a given bit. 7053 */ 7054 static inline void eb_bitmap_offset(const struct extent_buffer *eb, 7055 unsigned long start, unsigned long nr, 7056 unsigned long *page_index, 7057 size_t *page_offset) 7058 { 7059 size_t byte_offset = BIT_BYTE(nr); 7060 size_t offset; 7061 7062 /* 7063 * The byte we want is the offset of the extent buffer + the offset of 7064 * the bitmap item in the extent buffer + the offset of the byte in the 7065 * bitmap item. 
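 *
 * Worked example with made-up numbers (4K pages and a page-aligned
 * eb->start, so offset_in_page(eb->start) == 0): for a bitmap item at
 * start == 5000 and bit nr == 100, BIT_BYTE(100) == 12, giving
 * offset = 5000 + 0 + 12 = 5012, *page_index = 5012 >> 12 = 1 and
 * *page_offset = 5012 & 4095 = 916.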
7066 */ 7067 offset = start + offset_in_page(eb->start) + byte_offset; 7068 7069 *page_index = offset >> PAGE_SHIFT; 7070 *page_offset = offset_in_page(offset); 7071 } 7072 7073 /** 7074 * extent_buffer_test_bit - determine whether a bit in a bitmap item is set 7075 * @eb: the extent buffer 7076 * @start: offset of the bitmap item in the extent buffer 7077 * @nr: bit number to test 7078 */ 7079 int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, 7080 unsigned long nr) 7081 { 7082 u8 *kaddr; 7083 struct page *page; 7084 unsigned long i; 7085 size_t offset; 7086 7087 eb_bitmap_offset(eb, start, nr, &i, &offset); 7088 page = eb->pages[i]; 7089 assert_eb_page_uptodate(eb, page); 7090 kaddr = page_address(page); 7091 return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); 7092 } 7093 7094 /** 7095 * extent_buffer_bitmap_set - set an area of a bitmap 7096 * @eb: the extent buffer 7097 * @start: offset of the bitmap item in the extent buffer 7098 * @pos: bit number of the first bit 7099 * @len: number of bits to set 7100 */ 7101 void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start, 7102 unsigned long pos, unsigned long len) 7103 { 7104 u8 *kaddr; 7105 struct page *page; 7106 unsigned long i; 7107 size_t offset; 7108 const unsigned int size = pos + len; 7109 int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE); 7110 u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos); 7111 7112 eb_bitmap_offset(eb, start, pos, &i, &offset); 7113 page = eb->pages[i]; 7114 assert_eb_page_uptodate(eb, page); 7115 kaddr = page_address(page); 7116 7117 while (len >= bits_to_set) { 7118 kaddr[offset] |= mask_to_set; 7119 len -= bits_to_set; 7120 bits_to_set = BITS_PER_BYTE; 7121 mask_to_set = ~0; 7122 if (++offset >= PAGE_SIZE && len > 0) { 7123 offset = 0; 7124 page = eb->pages[++i]; 7125 assert_eb_page_uptodate(eb, page); 7126 kaddr = page_address(page); 7127 } 7128 } 7129 if (len) { 7130 mask_to_set &= BITMAP_LAST_BYTE_MASK(size); 7131 kaddr[offset] |= mask_to_set; 7132 } 7133 } 7134 7135 7136 /** 7137 * extent_buffer_bitmap_clear - clear an area of a bitmap 7138 * @eb: the extent buffer 7139 * @start: offset of the bitmap item in the extent buffer 7140 * @pos: bit number of the first bit 7141 * @len: number of bits to clear 7142 */ 7143 void extent_buffer_bitmap_clear(const struct extent_buffer *eb, 7144 unsigned long start, unsigned long pos, 7145 unsigned long len) 7146 { 7147 u8 *kaddr; 7148 struct page *page; 7149 unsigned long i; 7150 size_t offset; 7151 const unsigned int size = pos + len; 7152 int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE); 7153 u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos); 7154 7155 eb_bitmap_offset(eb, start, pos, &i, &offset); 7156 page = eb->pages[i]; 7157 assert_eb_page_uptodate(eb, page); 7158 kaddr = page_address(page); 7159 7160 while (len >= bits_to_clear) { 7161 kaddr[offset] &= ~mask_to_clear; 7162 len -= bits_to_clear; 7163 bits_to_clear = BITS_PER_BYTE; 7164 mask_to_clear = ~0; 7165 if (++offset >= PAGE_SIZE && len > 0) { 7166 offset = 0; 7167 page = eb->pages[++i]; 7168 assert_eb_page_uptodate(eb, page); 7169 kaddr = page_address(page); 7170 } 7171 } 7172 if (len) { 7173 mask_to_clear &= BITMAP_LAST_BYTE_MASK(size); 7174 kaddr[offset] &= ~mask_to_clear; 7175 } 7176 } 7177 7178 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) 7179 { 7180 unsigned long distance = (src > dst) ? 
src - dst : dst - src; 7181 return distance < len; 7182 } 7183 7184 static void copy_pages(struct page *dst_page, struct page *src_page, 7185 unsigned long dst_off, unsigned long src_off, 7186 unsigned long len) 7187 { 7188 char *dst_kaddr = page_address(dst_page); 7189 char *src_kaddr; 7190 int must_memmove = 0; 7191 7192 if (dst_page != src_page) { 7193 src_kaddr = page_address(src_page); 7194 } else { 7195 src_kaddr = dst_kaddr; 7196 if (areas_overlap(src_off, dst_off, len)) 7197 must_memmove = 1; 7198 } 7199 7200 if (must_memmove) 7201 memmove(dst_kaddr + dst_off, src_kaddr + src_off, len); 7202 else 7203 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); 7204 } 7205 7206 void memcpy_extent_buffer(const struct extent_buffer *dst, 7207 unsigned long dst_offset, unsigned long src_offset, 7208 unsigned long len) 7209 { 7210 size_t cur; 7211 size_t dst_off_in_page; 7212 size_t src_off_in_page; 7213 unsigned long dst_i; 7214 unsigned long src_i; 7215 7216 if (check_eb_range(dst, dst_offset, len) || 7217 check_eb_range(dst, src_offset, len)) 7218 return; 7219 7220 while (len > 0) { 7221 dst_off_in_page = get_eb_offset_in_page(dst, dst_offset); 7222 src_off_in_page = get_eb_offset_in_page(dst, src_offset); 7223 7224 dst_i = get_eb_page_index(dst_offset); 7225 src_i = get_eb_page_index(src_offset); 7226 7227 cur = min(len, (unsigned long)(PAGE_SIZE - 7228 src_off_in_page)); 7229 cur = min_t(unsigned long, cur, 7230 (unsigned long)(PAGE_SIZE - dst_off_in_page)); 7231 7232 copy_pages(dst->pages[dst_i], dst->pages[src_i], 7233 dst_off_in_page, src_off_in_page, cur); 7234 7235 src_offset += cur; 7236 dst_offset += cur; 7237 len -= cur; 7238 } 7239 } 7240 7241 void memmove_extent_buffer(const struct extent_buffer *dst, 7242 unsigned long dst_offset, unsigned long src_offset, 7243 unsigned long len) 7244 { 7245 size_t cur; 7246 size_t dst_off_in_page; 7247 size_t src_off_in_page; 7248 unsigned long dst_end = dst_offset + len - 1; 7249 unsigned long src_end = src_offset + len - 1; 7250 unsigned long dst_i; 7251 unsigned long src_i; 7252 7253 if (check_eb_range(dst, dst_offset, len) || 7254 check_eb_range(dst, src_offset, len)) 7255 return; 7256 if (dst_offset < src_offset) { 7257 memcpy_extent_buffer(dst, dst_offset, src_offset, len); 7258 return; 7259 } 7260 while (len > 0) { 7261 dst_i = get_eb_page_index(dst_end); 7262 src_i = get_eb_page_index(src_end); 7263 7264 dst_off_in_page = get_eb_offset_in_page(dst, dst_end); 7265 src_off_in_page = get_eb_offset_in_page(dst, src_end); 7266 7267 cur = min_t(unsigned long, len, src_off_in_page + 1); 7268 cur = min(cur, dst_off_in_page + 1); 7269 copy_pages(dst->pages[dst_i], dst->pages[src_i], 7270 dst_off_in_page - cur + 1, 7271 src_off_in_page - cur + 1, cur); 7272 7273 dst_end -= cur; 7274 src_end -= cur; 7275 len -= cur; 7276 } 7277 } 7278 7279 #define GANG_LOOKUP_SIZE 16 7280 static struct extent_buffer *get_next_extent_buffer( 7281 struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr) 7282 { 7283 struct extent_buffer *gang[GANG_LOOKUP_SIZE]; 7284 struct extent_buffer *found = NULL; 7285 u64 page_start = page_offset(page); 7286 u64 cur = page_start; 7287 7288 ASSERT(in_range(bytenr, page_start, PAGE_SIZE)); 7289 lockdep_assert_held(&fs_info->buffer_lock); 7290 7291 while (cur < page_start + PAGE_SIZE) { 7292 int ret; 7293 int i; 7294 7295 ret = radix_tree_gang_lookup(&fs_info->buffer_radix, 7296 (void **)gang, cur >> fs_info->sectorsize_bits, 7297 min_t(unsigned int, GANG_LOOKUP_SIZE, 7298 PAGE_SIZE / fs_info->nodesize)); 7299 if (ret == 0) 
7300 goto out; 7301 for (i = 0; i < ret; i++) { 7302 /* Already beyond page end */ 7303 if (gang[i]->start >= page_start + PAGE_SIZE) 7304 goto out; 7305 /* Found one */ 7306 if (gang[i]->start >= bytenr) { 7307 found = gang[i]; 7308 goto out; 7309 } 7310 } 7311 cur = gang[ret - 1]->start + gang[ret - 1]->len; 7312 } 7313 out: 7314 return found; 7315 } 7316 7317 static int try_release_subpage_extent_buffer(struct page *page) 7318 { 7319 struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); 7320 u64 cur = page_offset(page); 7321 const u64 end = page_offset(page) + PAGE_SIZE; 7322 int ret; 7323 7324 while (cur < end) { 7325 struct extent_buffer *eb = NULL; 7326 7327 /* 7328 * Unlike try_release_extent_buffer() which uses page->private 7329 * to grab buffer, for subpage case we rely on radix tree, thus 7330 * we need to ensure radix tree consistency. 7331 * 7332 * We also want an atomic snapshot of the radix tree, thus go 7333 * with spinlock rather than RCU. 7334 */ 7335 spin_lock(&fs_info->buffer_lock); 7336 eb = get_next_extent_buffer(fs_info, page, cur); 7337 if (!eb) { 7338 /* No more eb in the page range after or at cur */ 7339 spin_unlock(&fs_info->buffer_lock); 7340 break; 7341 } 7342 cur = eb->start + eb->len; 7343 7344 /* 7345 * The same as try_release_extent_buffer(), to ensure the eb 7346 * won't disappear out from under us. 7347 */ 7348 spin_lock(&eb->refs_lock); 7349 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { 7350 spin_unlock(&eb->refs_lock); 7351 spin_unlock(&fs_info->buffer_lock); 7352 break; 7353 } 7354 spin_unlock(&fs_info->buffer_lock); 7355 7356 /* 7357 * If tree ref isn't set then we know the ref on this eb is a 7358 * real ref, so just return, this eb will likely be freed soon 7359 * anyway. 7360 */ 7361 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { 7362 spin_unlock(&eb->refs_lock); 7363 break; 7364 } 7365 7366 /* 7367 * Here we don't care about the return value, we will always 7368 * check the page private at the end. And 7369 * release_extent_buffer() will release the refs_lock. 7370 */ 7371 release_extent_buffer(eb); 7372 } 7373 /* 7374 * Finally to check if we have cleared page private, as if we have 7375 * released all ebs in the page, the page private should be cleared now. 7376 */ 7377 spin_lock(&page->mapping->private_lock); 7378 if (!PagePrivate(page)) 7379 ret = 1; 7380 else 7381 ret = 0; 7382 spin_unlock(&page->mapping->private_lock); 7383 return ret; 7384 7385 } 7386 7387 int try_release_extent_buffer(struct page *page) 7388 { 7389 struct extent_buffer *eb; 7390 7391 if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) 7392 return try_release_subpage_extent_buffer(page); 7393 7394 /* 7395 * We need to make sure nobody is changing page->private, as we rely on 7396 * page->private as the pointer to extent buffer. 7397 */ 7398 spin_lock(&page->mapping->private_lock); 7399 if (!PagePrivate(page)) { 7400 spin_unlock(&page->mapping->private_lock); 7401 return 1; 7402 } 7403 7404 eb = (struct extent_buffer *)page->private; 7405 BUG_ON(!eb); 7406 7407 /* 7408 * This is a little awful but should be ok, we need to make sure that 7409 * the eb doesn't disappear out from under us while we're looking at 7410 * this page. 
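 *
 * In other words the lock order here is mapping->private_lock first, then
 * eb->refs_lock: private_lock guarantees page->private still points at
 * this eb while we take refs_lock, and every path that can free the eb
 * has to take refs_lock itself before getting anywhere.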
7411 */ 7412 spin_lock(&eb->refs_lock); 7413 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { 7414 spin_unlock(&eb->refs_lock); 7415 spin_unlock(&page->mapping->private_lock); 7416 return 0; 7417 } 7418 spin_unlock(&page->mapping->private_lock); 7419 7420 /* 7421 * If tree ref isn't set then we know the ref on this eb is a real ref, 7422 * so just return, this page will likely be freed soon anyway. 7423 */ 7424 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { 7425 spin_unlock(&eb->refs_lock); 7426 return 0; 7427 } 7428 7429 return release_extent_buffer(eb); 7430 } 7431 7432 /* 7433 * btrfs_readahead_tree_block - attempt to readahead a child block 7434 * @fs_info: the fs_info 7435 * @bytenr: bytenr to read 7436 * @owner_root: objectid of the root that owns this eb 7437 * @gen: generation for the uptodate check, can be 0 7438 * @level: level for the eb 7439 * 7440 * Attempt to readahead a tree block at @bytenr. If @gen is 0 then we do a 7441 * normal uptodate check of the eb, without checking the generation. If we have 7442 * to read the block we will not block on anything. 7443 */ 7444 void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, 7445 u64 bytenr, u64 owner_root, u64 gen, int level) 7446 { 7447 struct extent_buffer *eb; 7448 int ret; 7449 7450 eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level); 7451 if (IS_ERR(eb)) 7452 return; 7453 7454 if (btrfs_buffer_uptodate(eb, gen, 1)) { 7455 free_extent_buffer(eb); 7456 return; 7457 } 7458 7459 ret = read_extent_buffer_pages(eb, WAIT_NONE, 0); 7460 if (ret < 0) 7461 free_extent_buffer_stale(eb); 7462 else 7463 free_extent_buffer(eb); 7464 } 7465 7466 /* 7467 * btrfs_readahead_node_child - readahead a node's child block 7468 * @node: parent node we're reading from 7469 * @slot: slot in the parent node for the child we want to read 7470 * 7471 * A helper for btrfs_readahead_tree_block, we simply read the bytenr pointed at 7472 * the slot in the node provided. 7473 */ 7474 void btrfs_readahead_node_child(struct extent_buffer *node, int slot) 7475 { 7476 btrfs_readahead_tree_block(node->fs_info, 7477 btrfs_node_blockptr(node, slot), 7478 btrfs_header_owner(node), 7479 btrfs_node_ptr_generation(node, slot), 7480 btrfs_header_level(node) - 1); 7481 } 7482
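/*
 * Example usage of the readahead helpers above (a purely illustrative
 * sketch, not taken from a real caller): code about to descend into a
 * node's children can kick off readahead for all of them first:
 *
 *	int i;
 *
 *	for (i = 0; i < btrfs_header_nritems(node); i++)
 *		btrfs_readahead_node_child(node, i);
 *
 * Each call issues a WAIT_NONE read, so nothing here blocks, and blocks
 * that are already uptodate for the expected generation are skipped.
 */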