1 // SPDX-License-Identifier: GPL-2.0 2 3 #include <linux/bitops.h> 4 #include <linux/slab.h> 5 #include <linux/bio.h> 6 #include <linux/mm.h> 7 #include <linux/pagemap.h> 8 #include <linux/page-flags.h> 9 #include <linux/sched/mm.h> 10 #include <linux/spinlock.h> 11 #include <linux/blkdev.h> 12 #include <linux/swap.h> 13 #include <linux/writeback.h> 14 #include <linux/pagevec.h> 15 #include <linux/prefetch.h> 16 #include <linux/fsverity.h> 17 #include "misc.h" 18 #include "extent_io.h" 19 #include "extent-io-tree.h" 20 #include "extent_map.h" 21 #include "ctree.h" 22 #include "btrfs_inode.h" 23 #include "volumes.h" 24 #include "check-integrity.h" 25 #include "locking.h" 26 #include "rcu-string.h" 27 #include "backref.h" 28 #include "disk-io.h" 29 #include "subpage.h" 30 #include "zoned.h" 31 #include "block-group.h" 32 #include "compression.h" 33 34 static struct kmem_cache *extent_state_cache; 35 static struct kmem_cache *extent_buffer_cache; 36 static struct bio_set btrfs_bioset; 37 38 static inline bool extent_state_in_tree(const struct extent_state *state) 39 { 40 return !RB_EMPTY_NODE(&state->rb_node); 41 } 42 43 #ifdef CONFIG_BTRFS_DEBUG 44 static LIST_HEAD(states); 45 static DEFINE_SPINLOCK(leak_lock); 46 47 static inline void btrfs_leak_debug_add(spinlock_t *lock, 48 struct list_head *new, 49 struct list_head *head) 50 { 51 unsigned long flags; 52 53 spin_lock_irqsave(lock, flags); 54 list_add(new, head); 55 spin_unlock_irqrestore(lock, flags); 56 } 57 58 static inline void btrfs_leak_debug_del(spinlock_t *lock, 59 struct list_head *entry) 60 { 61 unsigned long flags; 62 63 spin_lock_irqsave(lock, flags); 64 list_del(entry); 65 spin_unlock_irqrestore(lock, flags); 66 } 67 68 void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) 69 { 70 struct extent_buffer *eb; 71 unsigned long flags; 72 73 /* 74 * If we didn't get into open_ctree our allocated_ebs will not be 75 * initialized, so just skip this. 76 */ 77 if (!fs_info->allocated_ebs.next) 78 return; 79 80 WARN_ON(!list_empty(&fs_info->allocated_ebs)); 81 spin_lock_irqsave(&fs_info->eb_leak_lock, flags); 82 while (!list_empty(&fs_info->allocated_ebs)) { 83 eb = list_first_entry(&fs_info->allocated_ebs, 84 struct extent_buffer, leak_list); 85 pr_err( 86 "BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n", 87 eb->start, eb->len, atomic_read(&eb->refs), eb->bflags, 88 btrfs_header_owner(eb)); 89 list_del(&eb->leak_list); 90 kmem_cache_free(extent_buffer_cache, eb); 91 } 92 spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags); 93 } 94 95 static inline void btrfs_extent_state_leak_debug_check(void) 96 { 97 struct extent_state *state; 98 99 while (!list_empty(&states)) { 100 state = list_entry(states.next, struct extent_state, leak_list); 101 pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n", 102 state->start, state->end, state->state, 103 extent_state_in_tree(state), 104 refcount_read(&state->refs)); 105 list_del(&state->leak_list); 106 kmem_cache_free(extent_state_cache, state); 107 } 108 } 109 110 #define btrfs_debug_check_extent_io_range(tree, start, end) \ 111 __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end)) 112 static inline void __btrfs_debug_check_extent_io_range(const char *caller, 113 struct extent_io_tree *tree, u64 start, u64 end) 114 { 115 struct inode *inode = tree->private_data; 116 u64 isize; 117 118 if (!inode || !is_data_inode(inode)) 119 return; 120 121 isize = i_size_read(inode); 122 if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { 123 btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, 124 "%s: ino %llu isize %llu odd range [%llu,%llu]", 125 caller, btrfs_ino(BTRFS_I(inode)), isize, start, end); 126 } 127 } 128 #else 129 #define btrfs_leak_debug_add(lock, new, head) do {} while (0) 130 #define btrfs_leak_debug_del(lock, entry) do {} while (0) 131 #define btrfs_extent_state_leak_debug_check() do {} while (0) 132 #define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) 133 #endif 134 135 struct tree_entry { 136 u64 start; 137 u64 end; 138 struct rb_node rb_node; 139 }; 140 141 /* 142 * Structure to record info about the bio being assembled, and other info like 143 * how many bytes are there before stripe/ordered extent boundary. 144 */ 145 struct btrfs_bio_ctrl { 146 struct bio *bio; 147 int mirror_num; 148 enum btrfs_compression_type compress_type; 149 u32 len_to_stripe_boundary; 150 u32 len_to_oe_boundary; 151 }; 152 153 struct extent_page_data { 154 struct btrfs_bio_ctrl bio_ctrl; 155 /* tells writepage not to lock the state bits for this range 156 * it still does the unlocking 157 */ 158 unsigned int extent_locked:1; 159 160 /* tells the submit_bio code to use REQ_SYNC */ 161 unsigned int sync_io:1; 162 }; 163 164 static int add_extent_changeset(struct extent_state *state, u32 bits, 165 struct extent_changeset *changeset, 166 int set) 167 { 168 int ret; 169 170 if (!changeset) 171 return 0; 172 if (set && (state->state & bits) == bits) 173 return 0; 174 if (!set && (state->state & bits) == 0) 175 return 0; 176 changeset->bytes_changed += state->end - state->start + 1; 177 ret = ulist_add(&changeset->range_changed, state->start, state->end, 178 GFP_ATOMIC); 179 return ret; 180 } 181 182 static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) 183 { 184 struct bio *bio; 185 struct bio_vec *bv; 186 struct inode *inode; 187 int mirror_num; 188 189 if (!bio_ctrl->bio) 190 return; 191 192 bio = bio_ctrl->bio; 193 bv = bio_first_bvec_all(bio); 194 inode = bv->bv_page->mapping->host; 195 mirror_num = bio_ctrl->mirror_num; 196 197 /* Caller should ensure the bio has at least some range added */ 198 ASSERT(bio->bi_iter.bi_size); 199 200 btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset; 201 202 if (!is_data_inode(inode)) 203 btrfs_submit_metadata_bio(inode, bio, mirror_num); 204 else if (btrfs_op(bio) == BTRFS_MAP_WRITE) 205 btrfs_submit_data_write_bio(inode, bio, mirror_num); 206 else 207 btrfs_submit_data_read_bio(inode, bio, mirror_num, 208 bio_ctrl->compress_type); 209 210 /* The bio is owned by the bi_end_io handler now */ 211 bio_ctrl->bio = NULL; 212 } 213 214 /* 215 * Submit or fail the current bio in an extent_page_data structure. 216 */ 217 static void submit_write_bio(struct extent_page_data *epd, int ret) 218 { 219 struct bio *bio = epd->bio_ctrl.bio; 220 221 if (!bio) 222 return; 223 224 if (ret) { 225 ASSERT(ret < 0); 226 bio->bi_status = errno_to_blk_status(ret); 227 bio_endio(bio); 228 /* The bio is owned by the bi_end_io handler now */ 229 epd->bio_ctrl.bio = NULL; 230 } else { 231 submit_one_bio(&epd->bio_ctrl); 232 } 233 } 234 235 int __init extent_state_cache_init(void) 236 { 237 extent_state_cache = kmem_cache_create("btrfs_extent_state", 238 sizeof(struct extent_state), 0, 239 SLAB_MEM_SPREAD, NULL); 240 if (!extent_state_cache) 241 return -ENOMEM; 242 return 0; 243 } 244 245 int __init extent_io_init(void) 246 { 247 extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", 248 sizeof(struct extent_buffer), 0, 249 SLAB_MEM_SPREAD, NULL); 250 if (!extent_buffer_cache) 251 return -ENOMEM; 252 253 if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, 254 offsetof(struct btrfs_bio, bio), 255 BIOSET_NEED_BVECS)) 256 goto free_buffer_cache; 257 258 if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE)) 259 goto free_bioset; 260 261 return 0; 262 263 free_bioset: 264 bioset_exit(&btrfs_bioset); 265 266 free_buffer_cache: 267 kmem_cache_destroy(extent_buffer_cache); 268 extent_buffer_cache = NULL; 269 return -ENOMEM; 270 } 271 272 void __cold extent_state_cache_exit(void) 273 { 274 btrfs_extent_state_leak_debug_check(); 275 kmem_cache_destroy(extent_state_cache); 276 } 277 278 void __cold extent_io_exit(void) 279 { 280 /* 281 * Make sure all delayed rcu free are flushed before we 282 * destroy caches. 283 */ 284 rcu_barrier(); 285 kmem_cache_destroy(extent_buffer_cache); 286 bioset_exit(&btrfs_bioset); 287 } 288 289 /* 290 * For the file_extent_tree, we want to hold the inode lock when we lookup and 291 * update the disk_i_size, but lockdep will complain because our io_tree we hold 292 * the tree lock and get the inode lock when setting delalloc. These two things 293 * are unrelated, so make a class for the file_extent_tree so we don't get the 294 * two locking patterns mixed up. 295 */ 296 static struct lock_class_key file_extent_tree_class; 297 298 void extent_io_tree_init(struct btrfs_fs_info *fs_info, 299 struct extent_io_tree *tree, unsigned int owner, 300 void *private_data) 301 { 302 tree->fs_info = fs_info; 303 tree->state = RB_ROOT; 304 tree->dirty_bytes = 0; 305 spin_lock_init(&tree->lock); 306 tree->private_data = private_data; 307 tree->owner = owner; 308 if (owner == IO_TREE_INODE_FILE_EXTENT) 309 lockdep_set_class(&tree->lock, &file_extent_tree_class); 310 } 311 312 void extent_io_tree_release(struct extent_io_tree *tree) 313 { 314 spin_lock(&tree->lock); 315 /* 316 * Do a single barrier for the waitqueue_active check here, the state 317 * of the waitqueue should not change once extent_io_tree_release is 318 * called. 319 */ 320 smp_mb(); 321 while (!RB_EMPTY_ROOT(&tree->state)) { 322 struct rb_node *node; 323 struct extent_state *state; 324 325 node = rb_first(&tree->state); 326 state = rb_entry(node, struct extent_state, rb_node); 327 rb_erase(&state->rb_node, &tree->state); 328 RB_CLEAR_NODE(&state->rb_node); 329 /* 330 * btree io trees aren't supposed to have tasks waiting for 331 * changes in the flags of extent states ever. 332 */ 333 ASSERT(!waitqueue_active(&state->wq)); 334 free_extent_state(state); 335 336 cond_resched_lock(&tree->lock); 337 } 338 spin_unlock(&tree->lock); 339 } 340 341 static struct extent_state *alloc_extent_state(gfp_t mask) 342 { 343 struct extent_state *state; 344 345 /* 346 * The given mask might be not appropriate for the slab allocator, 347 * drop the unsupported bits 348 */ 349 mask &= ~(__GFP_DMA32|__GFP_HIGHMEM); 350 state = kmem_cache_alloc(extent_state_cache, mask); 351 if (!state) 352 return state; 353 state->state = 0; 354 state->failrec = NULL; 355 RB_CLEAR_NODE(&state->rb_node); 356 btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states); 357 refcount_set(&state->refs, 1); 358 init_waitqueue_head(&state->wq); 359 trace_alloc_extent_state(state, mask, _RET_IP_); 360 return state; 361 } 362 363 void free_extent_state(struct extent_state *state) 364 { 365 if (!state) 366 return; 367 if (refcount_dec_and_test(&state->refs)) { 368 WARN_ON(extent_state_in_tree(state)); 369 btrfs_leak_debug_del(&leak_lock, &state->leak_list); 370 trace_free_extent_state(state, _RET_IP_); 371 kmem_cache_free(extent_state_cache, state); 372 } 373 } 374 375 /** 376 * Search @tree for an entry that contains @offset. Such entry would have 377 * entry->start <= offset && entry->end >= offset. 378 * 379 * @tree: the tree to search 380 * @offset: offset that should fall within an entry in @tree 381 * @node_ret: pointer where new node should be anchored (used when inserting an 382 * entry in the tree) 383 * @parent_ret: points to entry which would have been the parent of the entry, 384 * containing @offset 385 * 386 * Return a pointer to the entry that contains @offset byte address and don't change 387 * @node_ret and @parent_ret. 388 * 389 * If no such entry exists, return pointer to entry that ends before @offset 390 * and fill parameters @node_ret and @parent_ret, ie. does not return NULL. 391 */ 392 static inline struct rb_node *tree_search_for_insert(struct extent_io_tree *tree, 393 u64 offset, 394 struct rb_node ***node_ret, 395 struct rb_node **parent_ret) 396 { 397 struct rb_root *root = &tree->state; 398 struct rb_node **node = &root->rb_node; 399 struct rb_node *prev = NULL; 400 struct tree_entry *entry; 401 402 while (*node) { 403 prev = *node; 404 entry = rb_entry(prev, struct tree_entry, rb_node); 405 406 if (offset < entry->start) 407 node = &(*node)->rb_left; 408 else if (offset > entry->end) 409 node = &(*node)->rb_right; 410 else 411 return *node; 412 } 413 414 if (node_ret) 415 *node_ret = node; 416 if (parent_ret) 417 *parent_ret = prev; 418 419 /* Search neighbors until we find the first one past the end */ 420 while (prev && offset > entry->end) { 421 prev = rb_next(prev); 422 entry = rb_entry(prev, struct tree_entry, rb_node); 423 } 424 425 return prev; 426 } 427 428 /* 429 * Inexact rb-tree search, return the next entry if @offset is not found 430 */ 431 static inline struct rb_node *tree_search(struct extent_io_tree *tree, u64 offset) 432 { 433 return tree_search_for_insert(tree, offset, NULL, NULL); 434 } 435 436 /** 437 * Search offset in the tree or fill neighbor rbtree node pointers. 438 * 439 * @tree: the tree to search 440 * @offset: offset that should fall within an entry in @tree 441 * @next_ret: pointer to the first entry whose range ends after @offset 442 * @prev_ret: pointer to the first entry whose range begins before @offset 443 * 444 * Return a pointer to the entry that contains @offset byte address. If no 445 * such entry exists, then return NULL and fill @prev_ret and @next_ret. 446 * Otherwise return the found entry and other pointers are left untouched. 447 */ 448 static struct rb_node *tree_search_prev_next(struct extent_io_tree *tree, 449 u64 offset, 450 struct rb_node **prev_ret, 451 struct rb_node **next_ret) 452 { 453 struct rb_root *root = &tree->state; 454 struct rb_node **node = &root->rb_node; 455 struct rb_node *prev = NULL; 456 struct rb_node *orig_prev = NULL; 457 struct tree_entry *entry; 458 459 ASSERT(prev_ret); 460 ASSERT(next_ret); 461 462 while (*node) { 463 prev = *node; 464 entry = rb_entry(prev, struct tree_entry, rb_node); 465 466 if (offset < entry->start) 467 node = &(*node)->rb_left; 468 else if (offset > entry->end) 469 node = &(*node)->rb_right; 470 else 471 return *node; 472 } 473 474 orig_prev = prev; 475 while (prev && offset > entry->end) { 476 prev = rb_next(prev); 477 entry = rb_entry(prev, struct tree_entry, rb_node); 478 } 479 *next_ret = prev; 480 prev = orig_prev; 481 482 entry = rb_entry(prev, struct tree_entry, rb_node); 483 while (prev && offset < entry->start) { 484 prev = rb_prev(prev); 485 entry = rb_entry(prev, struct tree_entry, rb_node); 486 } 487 *prev_ret = prev; 488 489 return NULL; 490 } 491 492 /* 493 * utility function to look for merge candidates inside a given range. 494 * Any extents with matching state are merged together into a single 495 * extent in the tree. Extents with EXTENT_IO in their state field 496 * are not merged because the end_io handlers need to be able to do 497 * operations on them without sleeping (or doing allocations/splits). 498 * 499 * This should be called with the tree lock held. 500 */ 501 static void merge_state(struct extent_io_tree *tree, 502 struct extent_state *state) 503 { 504 struct extent_state *other; 505 struct rb_node *other_node; 506 507 if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY)) 508 return; 509 510 other_node = rb_prev(&state->rb_node); 511 if (other_node) { 512 other = rb_entry(other_node, struct extent_state, rb_node); 513 if (other->end == state->start - 1 && 514 other->state == state->state) { 515 if (tree->private_data && 516 is_data_inode(tree->private_data)) 517 btrfs_merge_delalloc_extent(tree->private_data, 518 state, other); 519 state->start = other->start; 520 rb_erase(&other->rb_node, &tree->state); 521 RB_CLEAR_NODE(&other->rb_node); 522 free_extent_state(other); 523 } 524 } 525 other_node = rb_next(&state->rb_node); 526 if (other_node) { 527 other = rb_entry(other_node, struct extent_state, rb_node); 528 if (other->start == state->end + 1 && 529 other->state == state->state) { 530 if (tree->private_data && 531 is_data_inode(tree->private_data)) 532 btrfs_merge_delalloc_extent(tree->private_data, 533 state, other); 534 state->end = other->end; 535 rb_erase(&other->rb_node, &tree->state); 536 RB_CLEAR_NODE(&other->rb_node); 537 free_extent_state(other); 538 } 539 } 540 } 541 542 static void set_state_bits(struct extent_io_tree *tree, 543 struct extent_state *state, u32 bits, 544 struct extent_changeset *changeset); 545 546 /* 547 * insert an extent_state struct into the tree. 'bits' are set on the 548 * struct before it is inserted. 549 * 550 * This may return -EEXIST if the extent is already there, in which case the 551 * state struct is freed. 552 * 553 * The tree lock is not taken internally. This is a utility function and 554 * probably isn't what you want to call (see set/clear_extent_bit). 555 */ 556 static int insert_state(struct extent_io_tree *tree, 557 struct extent_state *state, 558 u32 bits, struct extent_changeset *changeset) 559 { 560 struct rb_node **node; 561 struct rb_node *parent; 562 const u64 end = state->end; 563 564 set_state_bits(tree, state, bits, changeset); 565 566 node = &tree->state.rb_node; 567 while (*node) { 568 struct tree_entry *entry; 569 570 parent = *node; 571 entry = rb_entry(parent, struct tree_entry, rb_node); 572 573 if (end < entry->start) { 574 node = &(*node)->rb_left; 575 } else if (end > entry->end) { 576 node = &(*node)->rb_right; 577 } else { 578 btrfs_err(tree->fs_info, 579 "found node %llu %llu on insert of %llu %llu", 580 entry->start, entry->end, state->start, end); 581 return -EEXIST; 582 } 583 } 584 585 rb_link_node(&state->rb_node, parent, node); 586 rb_insert_color(&state->rb_node, &tree->state); 587 588 merge_state(tree, state); 589 return 0; 590 } 591 592 /* 593 * Insert state to @tree to the location given by @node and @parent. 594 */ 595 static void insert_state_fast(struct extent_io_tree *tree, 596 struct extent_state *state, struct rb_node **node, 597 struct rb_node *parent, unsigned bits, 598 struct extent_changeset *changeset) 599 { 600 set_state_bits(tree, state, bits, changeset); 601 rb_link_node(&state->rb_node, parent, node); 602 rb_insert_color(&state->rb_node, &tree->state); 603 merge_state(tree, state); 604 } 605 606 /* 607 * split a given extent state struct in two, inserting the preallocated 608 * struct 'prealloc' as the newly created second half. 'split' indicates an 609 * offset inside 'orig' where it should be split. 610 * 611 * Before calling, 612 * the tree has 'orig' at [orig->start, orig->end]. After calling, there 613 * are two extent state structs in the tree: 614 * prealloc: [orig->start, split - 1] 615 * orig: [ split, orig->end ] 616 * 617 * The tree locks are not taken by this function. They need to be held 618 * by the caller. 619 */ 620 static int split_state(struct extent_io_tree *tree, struct extent_state *orig, 621 struct extent_state *prealloc, u64 split) 622 { 623 struct rb_node *parent = NULL; 624 struct rb_node **node; 625 626 if (tree->private_data && is_data_inode(tree->private_data)) 627 btrfs_split_delalloc_extent(tree->private_data, orig, split); 628 629 prealloc->start = orig->start; 630 prealloc->end = split - 1; 631 prealloc->state = orig->state; 632 orig->start = split; 633 634 parent = &orig->rb_node; 635 node = &parent; 636 while (*node) { 637 struct tree_entry *entry; 638 639 parent = *node; 640 entry = rb_entry(parent, struct tree_entry, rb_node); 641 642 if (prealloc->end < entry->start) { 643 node = &(*node)->rb_left; 644 } else if (prealloc->end > entry->end) { 645 node = &(*node)->rb_right; 646 } else { 647 free_extent_state(prealloc); 648 return -EEXIST; 649 } 650 } 651 652 rb_link_node(&prealloc->rb_node, parent, node); 653 rb_insert_color(&prealloc->rb_node, &tree->state); 654 655 return 0; 656 } 657 658 static struct extent_state *next_state(struct extent_state *state) 659 { 660 struct rb_node *next = rb_next(&state->rb_node); 661 if (next) 662 return rb_entry(next, struct extent_state, rb_node); 663 else 664 return NULL; 665 } 666 667 /* 668 * utility function to clear some bits in an extent state struct. 669 * it will optionally wake up anyone waiting on this state (wake == 1). 670 * 671 * If no bits are set on the state struct after clearing things, the 672 * struct is freed and removed from the tree 673 */ 674 static struct extent_state *clear_state_bit(struct extent_io_tree *tree, 675 struct extent_state *state, 676 u32 bits, int wake, 677 struct extent_changeset *changeset) 678 { 679 struct extent_state *next; 680 u32 bits_to_clear = bits & ~EXTENT_CTLBITS; 681 int ret; 682 683 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 684 u64 range = state->end - state->start + 1; 685 WARN_ON(range > tree->dirty_bytes); 686 tree->dirty_bytes -= range; 687 } 688 689 if (tree->private_data && is_data_inode(tree->private_data)) 690 btrfs_clear_delalloc_extent(tree->private_data, state, bits); 691 692 ret = add_extent_changeset(state, bits_to_clear, changeset, 0); 693 BUG_ON(ret < 0); 694 state->state &= ~bits_to_clear; 695 if (wake) 696 wake_up(&state->wq); 697 if (state->state == 0) { 698 next = next_state(state); 699 if (extent_state_in_tree(state)) { 700 rb_erase(&state->rb_node, &tree->state); 701 RB_CLEAR_NODE(&state->rb_node); 702 free_extent_state(state); 703 } else { 704 WARN_ON(1); 705 } 706 } else { 707 merge_state(tree, state); 708 next = next_state(state); 709 } 710 return next; 711 } 712 713 static struct extent_state * 714 alloc_extent_state_atomic(struct extent_state *prealloc) 715 { 716 if (!prealloc) 717 prealloc = alloc_extent_state(GFP_ATOMIC); 718 719 return prealloc; 720 } 721 722 static void extent_io_tree_panic(struct extent_io_tree *tree, int err) 723 { 724 btrfs_panic(tree->fs_info, err, 725 "locking error: extent tree was modified by another thread while locked"); 726 } 727 728 /* 729 * clear some bits on a range in the tree. This may require splitting 730 * or inserting elements in the tree, so the gfp mask is used to 731 * indicate which allocations or sleeping are allowed. 732 * 733 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove 734 * the given range from the tree regardless of state (ie for truncate). 735 * 736 * the range [start, end] is inclusive. 737 * 738 * This takes the tree lock, and returns 0 on success and < 0 on error. 739 */ 740 int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 741 u32 bits, int wake, int delete, 742 struct extent_state **cached_state, 743 gfp_t mask, struct extent_changeset *changeset) 744 { 745 struct extent_state *state; 746 struct extent_state *cached; 747 struct extent_state *prealloc = NULL; 748 struct rb_node *node; 749 u64 last_end; 750 int err; 751 int clear = 0; 752 753 btrfs_debug_check_extent_io_range(tree, start, end); 754 trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits); 755 756 if (bits & EXTENT_DELALLOC) 757 bits |= EXTENT_NORESERVE; 758 759 if (delete) 760 bits |= ~EXTENT_CTLBITS; 761 762 if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)) 763 clear = 1; 764 again: 765 if (!prealloc && gfpflags_allow_blocking(mask)) { 766 /* 767 * Don't care for allocation failure here because we might end 768 * up not needing the pre-allocated extent state at all, which 769 * is the case if we only have in the tree extent states that 770 * cover our input range and don't cover too any other range. 771 * If we end up needing a new extent state we allocate it later. 772 */ 773 prealloc = alloc_extent_state(mask); 774 } 775 776 spin_lock(&tree->lock); 777 if (cached_state) { 778 cached = *cached_state; 779 780 if (clear) { 781 *cached_state = NULL; 782 cached_state = NULL; 783 } 784 785 if (cached && extent_state_in_tree(cached) && 786 cached->start <= start && cached->end > start) { 787 if (clear) 788 refcount_dec(&cached->refs); 789 state = cached; 790 goto hit_next; 791 } 792 if (clear) 793 free_extent_state(cached); 794 } 795 /* 796 * this search will find the extents that end after 797 * our range starts 798 */ 799 node = tree_search(tree, start); 800 if (!node) 801 goto out; 802 state = rb_entry(node, struct extent_state, rb_node); 803 hit_next: 804 if (state->start > end) 805 goto out; 806 WARN_ON(state->end < start); 807 last_end = state->end; 808 809 /* the state doesn't have the wanted bits, go ahead */ 810 if (!(state->state & bits)) { 811 state = next_state(state); 812 goto next; 813 } 814 815 /* 816 * | ---- desired range ---- | 817 * | state | or 818 * | ------------- state -------------- | 819 * 820 * We need to split the extent we found, and may flip 821 * bits on second half. 822 * 823 * If the extent we found extends past our range, we 824 * just split and search again. It'll get split again 825 * the next time though. 826 * 827 * If the extent we found is inside our range, we clear 828 * the desired bit on it. 829 */ 830 831 if (state->start < start) { 832 prealloc = alloc_extent_state_atomic(prealloc); 833 BUG_ON(!prealloc); 834 err = split_state(tree, state, prealloc, start); 835 if (err) 836 extent_io_tree_panic(tree, err); 837 838 prealloc = NULL; 839 if (err) 840 goto out; 841 if (state->end <= end) { 842 state = clear_state_bit(tree, state, bits, wake, changeset); 843 goto next; 844 } 845 goto search_again; 846 } 847 /* 848 * | ---- desired range ---- | 849 * | state | 850 * We need to split the extent, and clear the bit 851 * on the first half 852 */ 853 if (state->start <= end && state->end > end) { 854 prealloc = alloc_extent_state_atomic(prealloc); 855 BUG_ON(!prealloc); 856 err = split_state(tree, state, prealloc, end + 1); 857 if (err) 858 extent_io_tree_panic(tree, err); 859 860 if (wake) 861 wake_up(&state->wq); 862 863 clear_state_bit(tree, prealloc, bits, wake, changeset); 864 865 prealloc = NULL; 866 goto out; 867 } 868 869 state = clear_state_bit(tree, state, bits, wake, changeset); 870 next: 871 if (last_end == (u64)-1) 872 goto out; 873 start = last_end + 1; 874 if (start <= end && state && !need_resched()) 875 goto hit_next; 876 877 search_again: 878 if (start > end) 879 goto out; 880 spin_unlock(&tree->lock); 881 if (gfpflags_allow_blocking(mask)) 882 cond_resched(); 883 goto again; 884 885 out: 886 spin_unlock(&tree->lock); 887 if (prealloc) 888 free_extent_state(prealloc); 889 890 return 0; 891 892 } 893 894 static void wait_on_state(struct extent_io_tree *tree, 895 struct extent_state *state) 896 __releases(tree->lock) 897 __acquires(tree->lock) 898 { 899 DEFINE_WAIT(wait); 900 prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); 901 spin_unlock(&tree->lock); 902 schedule(); 903 spin_lock(&tree->lock); 904 finish_wait(&state->wq, &wait); 905 } 906 907 /* 908 * waits for one or more bits to clear on a range in the state tree. 909 * The range [start, end] is inclusive. 910 * The tree lock is taken by this function 911 */ 912 static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 913 u32 bits) 914 { 915 struct extent_state *state; 916 struct rb_node *node; 917 918 btrfs_debug_check_extent_io_range(tree, start, end); 919 920 spin_lock(&tree->lock); 921 again: 922 while (1) { 923 /* 924 * this search will find all the extents that end after 925 * our range starts 926 */ 927 node = tree_search(tree, start); 928 process_node: 929 if (!node) 930 break; 931 932 state = rb_entry(node, struct extent_state, rb_node); 933 934 if (state->start > end) 935 goto out; 936 937 if (state->state & bits) { 938 start = state->start; 939 refcount_inc(&state->refs); 940 wait_on_state(tree, state); 941 free_extent_state(state); 942 goto again; 943 } 944 start = state->end + 1; 945 946 if (start > end) 947 break; 948 949 if (!cond_resched_lock(&tree->lock)) { 950 node = rb_next(node); 951 goto process_node; 952 } 953 } 954 out: 955 spin_unlock(&tree->lock); 956 } 957 958 static void set_state_bits(struct extent_io_tree *tree, 959 struct extent_state *state, 960 u32 bits, struct extent_changeset *changeset) 961 { 962 u32 bits_to_set = bits & ~EXTENT_CTLBITS; 963 int ret; 964 965 if (tree->private_data && is_data_inode(tree->private_data)) 966 btrfs_set_delalloc_extent(tree->private_data, state, bits); 967 968 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { 969 u64 range = state->end - state->start + 1; 970 tree->dirty_bytes += range; 971 } 972 ret = add_extent_changeset(state, bits_to_set, changeset, 1); 973 BUG_ON(ret < 0); 974 state->state |= bits_to_set; 975 } 976 977 static void cache_state_if_flags(struct extent_state *state, 978 struct extent_state **cached_ptr, 979 unsigned flags) 980 { 981 if (cached_ptr && !(*cached_ptr)) { 982 if (!flags || (state->state & flags)) { 983 *cached_ptr = state; 984 refcount_inc(&state->refs); 985 } 986 } 987 } 988 989 static void cache_state(struct extent_state *state, 990 struct extent_state **cached_ptr) 991 { 992 return cache_state_if_flags(state, cached_ptr, 993 EXTENT_LOCKED | EXTENT_BOUNDARY); 994 } 995 996 /* 997 * set some bits on a range in the tree. This may require allocations or 998 * sleeping, so the gfp mask is used to indicate what is allowed. 999 * 1000 * If any of the exclusive bits are set, this will fail with -EEXIST if some 1001 * part of the range already has the desired bits set. The start of the 1002 * existing range is returned in failed_start in this case. 1003 * 1004 * [start, end] is inclusive This takes the tree lock. 1005 */ 1006 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, 1007 u32 exclusive_bits, u64 *failed_start, 1008 struct extent_state **cached_state, gfp_t mask, 1009 struct extent_changeset *changeset) 1010 { 1011 struct extent_state *state; 1012 struct extent_state *prealloc = NULL; 1013 struct rb_node *node; 1014 struct rb_node **p; 1015 struct rb_node *parent; 1016 int err = 0; 1017 u64 last_start; 1018 u64 last_end; 1019 1020 btrfs_debug_check_extent_io_range(tree, start, end); 1021 trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits); 1022 1023 if (exclusive_bits) 1024 ASSERT(failed_start); 1025 else 1026 ASSERT(failed_start == NULL); 1027 again: 1028 if (!prealloc && gfpflags_allow_blocking(mask)) { 1029 /* 1030 * Don't care for allocation failure here because we might end 1031 * up not needing the pre-allocated extent state at all, which 1032 * is the case if we only have in the tree extent states that 1033 * cover our input range and don't cover too any other range. 1034 * If we end up needing a new extent state we allocate it later. 1035 */ 1036 prealloc = alloc_extent_state(mask); 1037 } 1038 1039 spin_lock(&tree->lock); 1040 if (cached_state && *cached_state) { 1041 state = *cached_state; 1042 if (state->start <= start && state->end > start && 1043 extent_state_in_tree(state)) { 1044 node = &state->rb_node; 1045 goto hit_next; 1046 } 1047 } 1048 /* 1049 * this search will find all the extents that end after 1050 * our range starts. 1051 */ 1052 node = tree_search_for_insert(tree, start, &p, &parent); 1053 if (!node) { 1054 prealloc = alloc_extent_state_atomic(prealloc); 1055 BUG_ON(!prealloc); 1056 prealloc->start = start; 1057 prealloc->end = end; 1058 insert_state_fast(tree, prealloc, p, parent, bits, changeset); 1059 cache_state(prealloc, cached_state); 1060 prealloc = NULL; 1061 goto out; 1062 } 1063 state = rb_entry(node, struct extent_state, rb_node); 1064 hit_next: 1065 last_start = state->start; 1066 last_end = state->end; 1067 1068 /* 1069 * | ---- desired range ---- | 1070 * | state | 1071 * 1072 * Just lock what we found and keep going 1073 */ 1074 if (state->start == start && state->end <= end) { 1075 if (state->state & exclusive_bits) { 1076 *failed_start = state->start; 1077 err = -EEXIST; 1078 goto out; 1079 } 1080 1081 set_state_bits(tree, state, bits, changeset); 1082 cache_state(state, cached_state); 1083 merge_state(tree, state); 1084 if (last_end == (u64)-1) 1085 goto out; 1086 start = last_end + 1; 1087 state = next_state(state); 1088 if (start < end && state && state->start == start && 1089 !need_resched()) 1090 goto hit_next; 1091 goto search_again; 1092 } 1093 1094 /* 1095 * | ---- desired range ---- | 1096 * | state | 1097 * or 1098 * | ------------- state -------------- | 1099 * 1100 * We need to split the extent we found, and may flip bits on 1101 * second half. 1102 * 1103 * If the extent we found extends past our 1104 * range, we just split and search again. It'll get split 1105 * again the next time though. 1106 * 1107 * If the extent we found is inside our range, we set the 1108 * desired bit on it. 1109 */ 1110 if (state->start < start) { 1111 if (state->state & exclusive_bits) { 1112 *failed_start = start; 1113 err = -EEXIST; 1114 goto out; 1115 } 1116 1117 /* 1118 * If this extent already has all the bits we want set, then 1119 * skip it, not necessary to split it or do anything with it. 1120 */ 1121 if ((state->state & bits) == bits) { 1122 start = state->end + 1; 1123 cache_state(state, cached_state); 1124 goto search_again; 1125 } 1126 1127 prealloc = alloc_extent_state_atomic(prealloc); 1128 BUG_ON(!prealloc); 1129 err = split_state(tree, state, prealloc, start); 1130 if (err) 1131 extent_io_tree_panic(tree, err); 1132 1133 prealloc = NULL; 1134 if (err) 1135 goto out; 1136 if (state->end <= end) { 1137 set_state_bits(tree, state, bits, changeset); 1138 cache_state(state, cached_state); 1139 merge_state(tree, state); 1140 if (last_end == (u64)-1) 1141 goto out; 1142 start = last_end + 1; 1143 state = next_state(state); 1144 if (start < end && state && state->start == start && 1145 !need_resched()) 1146 goto hit_next; 1147 } 1148 goto search_again; 1149 } 1150 /* 1151 * | ---- desired range ---- | 1152 * | state | or | state | 1153 * 1154 * There's a hole, we need to insert something in it and 1155 * ignore the extent we found. 1156 */ 1157 if (state->start > start) { 1158 u64 this_end; 1159 if (end < last_start) 1160 this_end = end; 1161 else 1162 this_end = last_start - 1; 1163 1164 prealloc = alloc_extent_state_atomic(prealloc); 1165 BUG_ON(!prealloc); 1166 1167 /* 1168 * Avoid to free 'prealloc' if it can be merged with 1169 * the later extent. 1170 */ 1171 prealloc->start = start; 1172 prealloc->end = this_end; 1173 err = insert_state(tree, prealloc, bits, changeset); 1174 if (err) 1175 extent_io_tree_panic(tree, err); 1176 1177 cache_state(prealloc, cached_state); 1178 prealloc = NULL; 1179 start = this_end + 1; 1180 goto search_again; 1181 } 1182 /* 1183 * | ---- desired range ---- | 1184 * | state | 1185 * We need to split the extent, and set the bit 1186 * on the first half 1187 */ 1188 if (state->start <= end && state->end > end) { 1189 if (state->state & exclusive_bits) { 1190 *failed_start = start; 1191 err = -EEXIST; 1192 goto out; 1193 } 1194 1195 prealloc = alloc_extent_state_atomic(prealloc); 1196 BUG_ON(!prealloc); 1197 err = split_state(tree, state, prealloc, end + 1); 1198 if (err) 1199 extent_io_tree_panic(tree, err); 1200 1201 set_state_bits(tree, prealloc, bits, changeset); 1202 cache_state(prealloc, cached_state); 1203 merge_state(tree, prealloc); 1204 prealloc = NULL; 1205 goto out; 1206 } 1207 1208 search_again: 1209 if (start > end) 1210 goto out; 1211 spin_unlock(&tree->lock); 1212 if (gfpflags_allow_blocking(mask)) 1213 cond_resched(); 1214 goto again; 1215 1216 out: 1217 spin_unlock(&tree->lock); 1218 if (prealloc) 1219 free_extent_state(prealloc); 1220 1221 return err; 1222 1223 } 1224 1225 /** 1226 * convert_extent_bit - convert all bits in a given range from one bit to 1227 * another 1228 * @tree: the io tree to search 1229 * @start: the start offset in bytes 1230 * @end: the end offset in bytes (inclusive) 1231 * @bits: the bits to set in this range 1232 * @clear_bits: the bits to clear in this range 1233 * @cached_state: state that we're going to cache 1234 * 1235 * This will go through and set bits for the given range. If any states exist 1236 * already in this range they are set with the given bit and cleared of the 1237 * clear_bits. This is only meant to be used by things that are mergeable, ie 1238 * converting from say DELALLOC to DIRTY. This is not meant to be used with 1239 * boundary bits like LOCK. 1240 * 1241 * All allocations are done with GFP_NOFS. 1242 */ 1243 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 1244 u32 bits, u32 clear_bits, 1245 struct extent_state **cached_state) 1246 { 1247 struct extent_state *state; 1248 struct extent_state *prealloc = NULL; 1249 struct rb_node *node; 1250 struct rb_node **p; 1251 struct rb_node *parent; 1252 int err = 0; 1253 u64 last_start; 1254 u64 last_end; 1255 bool first_iteration = true; 1256 1257 btrfs_debug_check_extent_io_range(tree, start, end); 1258 trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits, 1259 clear_bits); 1260 1261 again: 1262 if (!prealloc) { 1263 /* 1264 * Best effort, don't worry if extent state allocation fails 1265 * here for the first iteration. We might have a cached state 1266 * that matches exactly the target range, in which case no 1267 * extent state allocations are needed. We'll only know this 1268 * after locking the tree. 1269 */ 1270 prealloc = alloc_extent_state(GFP_NOFS); 1271 if (!prealloc && !first_iteration) 1272 return -ENOMEM; 1273 } 1274 1275 spin_lock(&tree->lock); 1276 if (cached_state && *cached_state) { 1277 state = *cached_state; 1278 if (state->start <= start && state->end > start && 1279 extent_state_in_tree(state)) { 1280 node = &state->rb_node; 1281 goto hit_next; 1282 } 1283 } 1284 1285 /* 1286 * this search will find all the extents that end after 1287 * our range starts. 1288 */ 1289 node = tree_search_for_insert(tree, start, &p, &parent); 1290 if (!node) { 1291 prealloc = alloc_extent_state_atomic(prealloc); 1292 if (!prealloc) { 1293 err = -ENOMEM; 1294 goto out; 1295 } 1296 prealloc->start = start; 1297 prealloc->end = end; 1298 insert_state_fast(tree, prealloc, p, parent, bits, NULL); 1299 cache_state(prealloc, cached_state); 1300 prealloc = NULL; 1301 goto out; 1302 } 1303 state = rb_entry(node, struct extent_state, rb_node); 1304 hit_next: 1305 last_start = state->start; 1306 last_end = state->end; 1307 1308 /* 1309 * | ---- desired range ---- | 1310 * | state | 1311 * 1312 * Just lock what we found and keep going 1313 */ 1314 if (state->start == start && state->end <= end) { 1315 set_state_bits(tree, state, bits, NULL); 1316 cache_state(state, cached_state); 1317 state = clear_state_bit(tree, state, clear_bits, 0, NULL); 1318 if (last_end == (u64)-1) 1319 goto out; 1320 start = last_end + 1; 1321 if (start < end && state && state->start == start && 1322 !need_resched()) 1323 goto hit_next; 1324 goto search_again; 1325 } 1326 1327 /* 1328 * | ---- desired range ---- | 1329 * | state | 1330 * or 1331 * | ------------- state -------------- | 1332 * 1333 * We need to split the extent we found, and may flip bits on 1334 * second half. 1335 * 1336 * If the extent we found extends past our 1337 * range, we just split and search again. It'll get split 1338 * again the next time though. 1339 * 1340 * If the extent we found is inside our range, we set the 1341 * desired bit on it. 1342 */ 1343 if (state->start < start) { 1344 prealloc = alloc_extent_state_atomic(prealloc); 1345 if (!prealloc) { 1346 err = -ENOMEM; 1347 goto out; 1348 } 1349 err = split_state(tree, state, prealloc, start); 1350 if (err) 1351 extent_io_tree_panic(tree, err); 1352 prealloc = NULL; 1353 if (err) 1354 goto out; 1355 if (state->end <= end) { 1356 set_state_bits(tree, state, bits, NULL); 1357 cache_state(state, cached_state); 1358 state = clear_state_bit(tree, state, clear_bits, 0, NULL); 1359 if (last_end == (u64)-1) 1360 goto out; 1361 start = last_end + 1; 1362 if (start < end && state && state->start == start && 1363 !need_resched()) 1364 goto hit_next; 1365 } 1366 goto search_again; 1367 } 1368 /* 1369 * | ---- desired range ---- | 1370 * | state | or | state | 1371 * 1372 * There's a hole, we need to insert something in it and 1373 * ignore the extent we found. 1374 */ 1375 if (state->start > start) { 1376 u64 this_end; 1377 if (end < last_start) 1378 this_end = end; 1379 else 1380 this_end = last_start - 1; 1381 1382 prealloc = alloc_extent_state_atomic(prealloc); 1383 if (!prealloc) { 1384 err = -ENOMEM; 1385 goto out; 1386 } 1387 1388 /* 1389 * Avoid to free 'prealloc' if it can be merged with 1390 * the later extent. 1391 */ 1392 prealloc->start = start; 1393 prealloc->end = this_end; 1394 err = insert_state(tree, prealloc, bits, NULL); 1395 if (err) 1396 extent_io_tree_panic(tree, err); 1397 cache_state(prealloc, cached_state); 1398 prealloc = NULL; 1399 start = this_end + 1; 1400 goto search_again; 1401 } 1402 /* 1403 * | ---- desired range ---- | 1404 * | state | 1405 * We need to split the extent, and set the bit 1406 * on the first half 1407 */ 1408 if (state->start <= end && state->end > end) { 1409 prealloc = alloc_extent_state_atomic(prealloc); 1410 if (!prealloc) { 1411 err = -ENOMEM; 1412 goto out; 1413 } 1414 1415 err = split_state(tree, state, prealloc, end + 1); 1416 if (err) 1417 extent_io_tree_panic(tree, err); 1418 1419 set_state_bits(tree, prealloc, bits, NULL); 1420 cache_state(prealloc, cached_state); 1421 clear_state_bit(tree, prealloc, clear_bits, 0, NULL); 1422 prealloc = NULL; 1423 goto out; 1424 } 1425 1426 search_again: 1427 if (start > end) 1428 goto out; 1429 spin_unlock(&tree->lock); 1430 cond_resched(); 1431 first_iteration = false; 1432 goto again; 1433 1434 out: 1435 spin_unlock(&tree->lock); 1436 if (prealloc) 1437 free_extent_state(prealloc); 1438 1439 return err; 1440 } 1441 1442 /* wrappers around set/clear extent bit */ 1443 int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1444 u32 bits, struct extent_changeset *changeset) 1445 { 1446 /* 1447 * We don't support EXTENT_LOCKED yet, as current changeset will 1448 * record any bits changed, so for EXTENT_LOCKED case, it will 1449 * either fail with -EEXIST or changeset will record the whole 1450 * range. 1451 */ 1452 BUG_ON(bits & EXTENT_LOCKED); 1453 1454 return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS, 1455 changeset); 1456 } 1457 1458 int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end, 1459 u32 bits) 1460 { 1461 return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, 1462 GFP_NOWAIT, NULL); 1463 } 1464 1465 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 1466 u32 bits, int wake, int delete, 1467 struct extent_state **cached) 1468 { 1469 return __clear_extent_bit(tree, start, end, bits, wake, delete, 1470 cached, GFP_NOFS, NULL); 1471 } 1472 1473 int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1474 u32 bits, struct extent_changeset *changeset) 1475 { 1476 /* 1477 * Don't support EXTENT_LOCKED case, same reason as 1478 * set_record_extent_bits(). 1479 */ 1480 BUG_ON(bits & EXTENT_LOCKED); 1481 1482 return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS, 1483 changeset); 1484 } 1485 1486 /* 1487 * either insert or lock state struct between start and end use mask to tell 1488 * us if waiting is desired. 1489 */ 1490 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1491 struct extent_state **cached_state) 1492 { 1493 int err; 1494 u64 failed_start; 1495 1496 while (1) { 1497 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1498 EXTENT_LOCKED, &failed_start, 1499 cached_state, GFP_NOFS, NULL); 1500 if (err == -EEXIST) { 1501 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); 1502 start = failed_start; 1503 } else 1504 break; 1505 WARN_ON(start > end); 1506 } 1507 return err; 1508 } 1509 1510 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) 1511 { 1512 int err; 1513 u64 failed_start; 1514 1515 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, 1516 &failed_start, NULL, GFP_NOFS, NULL); 1517 if (err == -EEXIST) { 1518 if (failed_start > start) 1519 clear_extent_bit(tree, start, failed_start - 1, 1520 EXTENT_LOCKED, 1, 0, NULL); 1521 return 0; 1522 } 1523 return 1; 1524 } 1525 1526 void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) 1527 { 1528 unsigned long index = start >> PAGE_SHIFT; 1529 unsigned long end_index = end >> PAGE_SHIFT; 1530 struct page *page; 1531 1532 while (index <= end_index) { 1533 page = find_get_page(inode->i_mapping, index); 1534 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1535 clear_page_dirty_for_io(page); 1536 put_page(page); 1537 index++; 1538 } 1539 } 1540 1541 void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) 1542 { 1543 struct address_space *mapping = inode->i_mapping; 1544 unsigned long index = start >> PAGE_SHIFT; 1545 unsigned long end_index = end >> PAGE_SHIFT; 1546 struct folio *folio; 1547 1548 while (index <= end_index) { 1549 folio = filemap_get_folio(mapping, index); 1550 filemap_dirty_folio(mapping, folio); 1551 folio_account_redirty(folio); 1552 index += folio_nr_pages(folio); 1553 folio_put(folio); 1554 } 1555 } 1556 1557 /* find the first state struct with 'bits' set after 'start', and 1558 * return it. tree->lock must be held. NULL will returned if 1559 * nothing was found after 'start' 1560 */ 1561 static struct extent_state * 1562 find_first_extent_bit_state(struct extent_io_tree *tree, u64 start, u32 bits) 1563 { 1564 struct rb_node *node; 1565 struct extent_state *state; 1566 1567 /* 1568 * this search will find all the extents that end after 1569 * our range starts. 1570 */ 1571 node = tree_search(tree, start); 1572 if (!node) 1573 goto out; 1574 1575 while (1) { 1576 state = rb_entry(node, struct extent_state, rb_node); 1577 if (state->end >= start && (state->state & bits)) 1578 return state; 1579 1580 node = rb_next(node); 1581 if (!node) 1582 break; 1583 } 1584 out: 1585 return NULL; 1586 } 1587 1588 /* 1589 * Find the first offset in the io tree with one or more @bits set. 1590 * 1591 * Note: If there are multiple bits set in @bits, any of them will match. 1592 * 1593 * Return 0 if we find something, and update @start_ret and @end_ret. 1594 * Return 1 if we found nothing. 1595 */ 1596 int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 1597 u64 *start_ret, u64 *end_ret, u32 bits, 1598 struct extent_state **cached_state) 1599 { 1600 struct extent_state *state; 1601 int ret = 1; 1602 1603 spin_lock(&tree->lock); 1604 if (cached_state && *cached_state) { 1605 state = *cached_state; 1606 if (state->end == start - 1 && extent_state_in_tree(state)) { 1607 while ((state = next_state(state)) != NULL) { 1608 if (state->state & bits) 1609 goto got_it; 1610 } 1611 free_extent_state(*cached_state); 1612 *cached_state = NULL; 1613 goto out; 1614 } 1615 free_extent_state(*cached_state); 1616 *cached_state = NULL; 1617 } 1618 1619 state = find_first_extent_bit_state(tree, start, bits); 1620 got_it: 1621 if (state) { 1622 cache_state_if_flags(state, cached_state, 0); 1623 *start_ret = state->start; 1624 *end_ret = state->end; 1625 ret = 0; 1626 } 1627 out: 1628 spin_unlock(&tree->lock); 1629 return ret; 1630 } 1631 1632 /** 1633 * Find a contiguous area of bits 1634 * 1635 * @tree: io tree to check 1636 * @start: offset to start the search from 1637 * @start_ret: the first offset we found with the bits set 1638 * @end_ret: the final contiguous range of the bits that were set 1639 * @bits: bits to look for 1640 * 1641 * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges 1642 * to set bits appropriately, and then merge them again. During this time it 1643 * will drop the tree->lock, so use this helper if you want to find the actual 1644 * contiguous area for given bits. We will search to the first bit we find, and 1645 * then walk down the tree until we find a non-contiguous area. The area 1646 * returned will be the full contiguous area with the bits set. 1647 */ 1648 int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, 1649 u64 *start_ret, u64 *end_ret, u32 bits) 1650 { 1651 struct extent_state *state; 1652 int ret = 1; 1653 1654 spin_lock(&tree->lock); 1655 state = find_first_extent_bit_state(tree, start, bits); 1656 if (state) { 1657 *start_ret = state->start; 1658 *end_ret = state->end; 1659 while ((state = next_state(state)) != NULL) { 1660 if (state->start > (*end_ret + 1)) 1661 break; 1662 *end_ret = state->end; 1663 } 1664 ret = 0; 1665 } 1666 spin_unlock(&tree->lock); 1667 return ret; 1668 } 1669 1670 /** 1671 * Find the first range that has @bits not set. This range could start before 1672 * @start. 1673 * 1674 * @tree: the tree to search 1675 * @start: offset at/after which the found extent should start 1676 * @start_ret: records the beginning of the range 1677 * @end_ret: records the end of the range (inclusive) 1678 * @bits: the set of bits which must be unset 1679 * 1680 * Since unallocated range is also considered one which doesn't have the bits 1681 * set it's possible that @end_ret contains -1, this happens in case the range 1682 * spans (last_range_end, end of device]. In this case it's up to the caller to 1683 * trim @end_ret to the appropriate size. 1684 */ 1685 void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, 1686 u64 *start_ret, u64 *end_ret, u32 bits) 1687 { 1688 struct extent_state *state; 1689 struct rb_node *node, *prev = NULL, *next; 1690 1691 spin_lock(&tree->lock); 1692 1693 /* Find first extent with bits cleared */ 1694 while (1) { 1695 node = tree_search_prev_next(tree, start, &prev, &next); 1696 if (!node && !next && !prev) { 1697 /* 1698 * Tree is completely empty, send full range and let 1699 * caller deal with it 1700 */ 1701 *start_ret = 0; 1702 *end_ret = -1; 1703 goto out; 1704 } else if (!node && !next) { 1705 /* 1706 * We are past the last allocated chunk, set start at 1707 * the end of the last extent. 1708 */ 1709 state = rb_entry(prev, struct extent_state, rb_node); 1710 *start_ret = state->end + 1; 1711 *end_ret = -1; 1712 goto out; 1713 } else if (!node) { 1714 node = next; 1715 } 1716 /* 1717 * At this point 'node' either contains 'start' or start is 1718 * before 'node' 1719 */ 1720 state = rb_entry(node, struct extent_state, rb_node); 1721 1722 if (in_range(start, state->start, state->end - state->start + 1)) { 1723 if (state->state & bits) { 1724 /* 1725 * |--range with bits sets--| 1726 * | 1727 * start 1728 */ 1729 start = state->end + 1; 1730 } else { 1731 /* 1732 * 'start' falls within a range that doesn't 1733 * have the bits set, so take its start as 1734 * the beginning of the desired range 1735 * 1736 * |--range with bits cleared----| 1737 * | 1738 * start 1739 */ 1740 *start_ret = state->start; 1741 break; 1742 } 1743 } else { 1744 /* 1745 * |---prev range---|---hole/unset---|---node range---| 1746 * | 1747 * start 1748 * 1749 * or 1750 * 1751 * |---hole/unset--||--first node--| 1752 * 0 | 1753 * start 1754 */ 1755 if (prev) { 1756 state = rb_entry(prev, struct extent_state, 1757 rb_node); 1758 *start_ret = state->end + 1; 1759 } else { 1760 *start_ret = 0; 1761 } 1762 break; 1763 } 1764 } 1765 1766 /* 1767 * Find the longest stretch from start until an entry which has the 1768 * bits set 1769 */ 1770 while (1) { 1771 state = rb_entry(node, struct extent_state, rb_node); 1772 if (state->end >= start && !(state->state & bits)) { 1773 *end_ret = state->end; 1774 } else { 1775 *end_ret = state->start - 1; 1776 break; 1777 } 1778 1779 node = rb_next(node); 1780 if (!node) 1781 break; 1782 } 1783 out: 1784 spin_unlock(&tree->lock); 1785 } 1786 1787 /* 1788 * find a contiguous range of bytes in the file marked as delalloc, not 1789 * more than 'max_bytes'. start and end are used to return the range, 1790 * 1791 * true is returned if we find something, false if nothing was in the tree 1792 */ 1793 bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, 1794 u64 *end, u64 max_bytes, 1795 struct extent_state **cached_state) 1796 { 1797 struct rb_node *node; 1798 struct extent_state *state; 1799 u64 cur_start = *start; 1800 bool found = false; 1801 u64 total_bytes = 0; 1802 1803 spin_lock(&tree->lock); 1804 1805 /* 1806 * this search will find all the extents that end after 1807 * our range starts. 1808 */ 1809 node = tree_search(tree, cur_start); 1810 if (!node) { 1811 *end = (u64)-1; 1812 goto out; 1813 } 1814 1815 while (1) { 1816 state = rb_entry(node, struct extent_state, rb_node); 1817 if (found && (state->start != cur_start || 1818 (state->state & EXTENT_BOUNDARY))) { 1819 goto out; 1820 } 1821 if (!(state->state & EXTENT_DELALLOC)) { 1822 if (!found) 1823 *end = state->end; 1824 goto out; 1825 } 1826 if (!found) { 1827 *start = state->start; 1828 *cached_state = state; 1829 refcount_inc(&state->refs); 1830 } 1831 found = true; 1832 *end = state->end; 1833 cur_start = state->end + 1; 1834 node = rb_next(node); 1835 total_bytes += state->end - state->start + 1; 1836 if (total_bytes >= max_bytes) 1837 break; 1838 if (!node) 1839 break; 1840 } 1841 out: 1842 spin_unlock(&tree->lock); 1843 return found; 1844 } 1845 1846 /* 1847 * Process one page for __process_pages_contig(). 1848 * 1849 * Return >0 if we hit @page == @locked_page. 1850 * Return 0 if we updated the page status. 1851 * Return -EGAIN if the we need to try again. 1852 * (For PAGE_LOCK case but got dirty page or page not belong to mapping) 1853 */ 1854 static int process_one_page(struct btrfs_fs_info *fs_info, 1855 struct address_space *mapping, 1856 struct page *page, struct page *locked_page, 1857 unsigned long page_ops, u64 start, u64 end) 1858 { 1859 u32 len; 1860 1861 ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX); 1862 len = end + 1 - start; 1863 1864 if (page_ops & PAGE_SET_ORDERED) 1865 btrfs_page_clamp_set_ordered(fs_info, page, start, len); 1866 if (page_ops & PAGE_SET_ERROR) 1867 btrfs_page_clamp_set_error(fs_info, page, start, len); 1868 if (page_ops & PAGE_START_WRITEBACK) { 1869 btrfs_page_clamp_clear_dirty(fs_info, page, start, len); 1870 btrfs_page_clamp_set_writeback(fs_info, page, start, len); 1871 } 1872 if (page_ops & PAGE_END_WRITEBACK) 1873 btrfs_page_clamp_clear_writeback(fs_info, page, start, len); 1874 1875 if (page == locked_page) 1876 return 1; 1877 1878 if (page_ops & PAGE_LOCK) { 1879 int ret; 1880 1881 ret = btrfs_page_start_writer_lock(fs_info, page, start, len); 1882 if (ret) 1883 return ret; 1884 if (!PageDirty(page) || page->mapping != mapping) { 1885 btrfs_page_end_writer_lock(fs_info, page, start, len); 1886 return -EAGAIN; 1887 } 1888 } 1889 if (page_ops & PAGE_UNLOCK) 1890 btrfs_page_end_writer_lock(fs_info, page, start, len); 1891 return 0; 1892 } 1893 1894 static int __process_pages_contig(struct address_space *mapping, 1895 struct page *locked_page, 1896 u64 start, u64 end, unsigned long page_ops, 1897 u64 *processed_end) 1898 { 1899 struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb); 1900 pgoff_t start_index = start >> PAGE_SHIFT; 1901 pgoff_t end_index = end >> PAGE_SHIFT; 1902 pgoff_t index = start_index; 1903 unsigned long nr_pages = end_index - start_index + 1; 1904 unsigned long pages_processed = 0; 1905 struct page *pages[16]; 1906 int err = 0; 1907 int i; 1908 1909 if (page_ops & PAGE_LOCK) { 1910 ASSERT(page_ops == PAGE_LOCK); 1911 ASSERT(processed_end && *processed_end == start); 1912 } 1913 1914 if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0) 1915 mapping_set_error(mapping, -EIO); 1916 1917 while (nr_pages > 0) { 1918 int found_pages; 1919 1920 found_pages = find_get_pages_contig(mapping, index, 1921 min_t(unsigned long, 1922 nr_pages, ARRAY_SIZE(pages)), pages); 1923 if (found_pages == 0) { 1924 /* 1925 * Only if we're going to lock these pages, we can find 1926 * nothing at @index. 1927 */ 1928 ASSERT(page_ops & PAGE_LOCK); 1929 err = -EAGAIN; 1930 goto out; 1931 } 1932 1933 for (i = 0; i < found_pages; i++) { 1934 int process_ret; 1935 1936 process_ret = process_one_page(fs_info, mapping, 1937 pages[i], locked_page, page_ops, 1938 start, end); 1939 if (process_ret < 0) { 1940 for (; i < found_pages; i++) 1941 put_page(pages[i]); 1942 err = -EAGAIN; 1943 goto out; 1944 } 1945 put_page(pages[i]); 1946 pages_processed++; 1947 } 1948 nr_pages -= found_pages; 1949 index += found_pages; 1950 cond_resched(); 1951 } 1952 out: 1953 if (err && processed_end) { 1954 /* 1955 * Update @processed_end. I know this is awful since it has 1956 * two different return value patterns (inclusive vs exclusive). 1957 * 1958 * But the exclusive pattern is necessary if @start is 0, or we 1959 * underflow and check against processed_end won't work as 1960 * expected. 1961 */ 1962 if (pages_processed) 1963 *processed_end = min(end, 1964 ((u64)(start_index + pages_processed) << PAGE_SHIFT) - 1); 1965 else 1966 *processed_end = start; 1967 } 1968 return err; 1969 } 1970 1971 static noinline void __unlock_for_delalloc(struct inode *inode, 1972 struct page *locked_page, 1973 u64 start, u64 end) 1974 { 1975 unsigned long index = start >> PAGE_SHIFT; 1976 unsigned long end_index = end >> PAGE_SHIFT; 1977 1978 ASSERT(locked_page); 1979 if (index == locked_page->index && end_index == index) 1980 return; 1981 1982 __process_pages_contig(inode->i_mapping, locked_page, start, end, 1983 PAGE_UNLOCK, NULL); 1984 } 1985 1986 static noinline int lock_delalloc_pages(struct inode *inode, 1987 struct page *locked_page, 1988 u64 delalloc_start, 1989 u64 delalloc_end) 1990 { 1991 unsigned long index = delalloc_start >> PAGE_SHIFT; 1992 unsigned long end_index = delalloc_end >> PAGE_SHIFT; 1993 u64 processed_end = delalloc_start; 1994 int ret; 1995 1996 ASSERT(locked_page); 1997 if (index == locked_page->index && index == end_index) 1998 return 0; 1999 2000 ret = __process_pages_contig(inode->i_mapping, locked_page, delalloc_start, 2001 delalloc_end, PAGE_LOCK, &processed_end); 2002 if (ret == -EAGAIN && processed_end > delalloc_start) 2003 __unlock_for_delalloc(inode, locked_page, delalloc_start, 2004 processed_end); 2005 return ret; 2006 } 2007 2008 /* 2009 * Find and lock a contiguous range of bytes in the file marked as delalloc, no 2010 * more than @max_bytes. 2011 * 2012 * @start: The original start bytenr to search. 2013 * Will store the extent range start bytenr. 2014 * @end: The original end bytenr of the search range 2015 * Will store the extent range end bytenr. 2016 * 2017 * Return true if we find a delalloc range which starts inside the original 2018 * range, and @start/@end will store the delalloc range start/end. 2019 * 2020 * Return false if we can't find any delalloc range which starts inside the 2021 * original range, and @start/@end will be the non-delalloc range start/end. 2022 */ 2023 EXPORT_FOR_TESTS 2024 noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, 2025 struct page *locked_page, u64 *start, 2026 u64 *end) 2027 { 2028 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2029 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 2030 const u64 orig_start = *start; 2031 const u64 orig_end = *end; 2032 /* The sanity tests may not set a valid fs_info. */ 2033 u64 max_bytes = fs_info ? fs_info->max_extent_size : BTRFS_MAX_EXTENT_SIZE; 2034 u64 delalloc_start; 2035 u64 delalloc_end; 2036 bool found; 2037 struct extent_state *cached_state = NULL; 2038 int ret; 2039 int loops = 0; 2040 2041 /* Caller should pass a valid @end to indicate the search range end */ 2042 ASSERT(orig_end > orig_start); 2043 2044 /* The range should at least cover part of the page */ 2045 ASSERT(!(orig_start >= page_offset(locked_page) + PAGE_SIZE || 2046 orig_end <= page_offset(locked_page))); 2047 again: 2048 /* step one, find a bunch of delalloc bytes starting at start */ 2049 delalloc_start = *start; 2050 delalloc_end = 0; 2051 found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end, 2052 max_bytes, &cached_state); 2053 if (!found || delalloc_end <= *start || delalloc_start > orig_end) { 2054 *start = delalloc_start; 2055 2056 /* @delalloc_end can be -1, never go beyond @orig_end */ 2057 *end = min(delalloc_end, orig_end); 2058 free_extent_state(cached_state); 2059 return false; 2060 } 2061 2062 /* 2063 * start comes from the offset of locked_page. We have to lock 2064 * pages in order, so we can't process delalloc bytes before 2065 * locked_page 2066 */ 2067 if (delalloc_start < *start) 2068 delalloc_start = *start; 2069 2070 /* 2071 * make sure to limit the number of pages we try to lock down 2072 */ 2073 if (delalloc_end + 1 - delalloc_start > max_bytes) 2074 delalloc_end = delalloc_start + max_bytes - 1; 2075 2076 /* step two, lock all the pages after the page that has start */ 2077 ret = lock_delalloc_pages(inode, locked_page, 2078 delalloc_start, delalloc_end); 2079 ASSERT(!ret || ret == -EAGAIN); 2080 if (ret == -EAGAIN) { 2081 /* some of the pages are gone, lets avoid looping by 2082 * shortening the size of the delalloc range we're searching 2083 */ 2084 free_extent_state(cached_state); 2085 cached_state = NULL; 2086 if (!loops) { 2087 max_bytes = PAGE_SIZE; 2088 loops = 1; 2089 goto again; 2090 } else { 2091 found = false; 2092 goto out_failed; 2093 } 2094 } 2095 2096 /* step three, lock the state bits for the whole range */ 2097 lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state); 2098 2099 /* then test to make sure it is all still delalloc */ 2100 ret = test_range_bit(tree, delalloc_start, delalloc_end, 2101 EXTENT_DELALLOC, 1, cached_state); 2102 if (!ret) { 2103 unlock_extent_cached(tree, delalloc_start, delalloc_end, 2104 &cached_state); 2105 __unlock_for_delalloc(inode, locked_page, 2106 delalloc_start, delalloc_end); 2107 cond_resched(); 2108 goto again; 2109 } 2110 free_extent_state(cached_state); 2111 *start = delalloc_start; 2112 *end = delalloc_end; 2113 out_failed: 2114 return found; 2115 } 2116 2117 void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, 2118 struct page *locked_page, 2119 u32 clear_bits, unsigned long page_ops) 2120 { 2121 clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL); 2122 2123 __process_pages_contig(inode->vfs_inode.i_mapping, locked_page, 2124 start, end, page_ops, NULL); 2125 } 2126 2127 /* 2128 * count the number of bytes in the tree that have a given bit(s) 2129 * set. This can be fairly slow, except for EXTENT_DIRTY which is 2130 * cached. The total number found is returned. 2131 */ 2132 u64 count_range_bits(struct extent_io_tree *tree, 2133 u64 *start, u64 search_end, u64 max_bytes, 2134 u32 bits, int contig) 2135 { 2136 struct rb_node *node; 2137 struct extent_state *state; 2138 u64 cur_start = *start; 2139 u64 total_bytes = 0; 2140 u64 last = 0; 2141 int found = 0; 2142 2143 if (WARN_ON(search_end <= cur_start)) 2144 return 0; 2145 2146 spin_lock(&tree->lock); 2147 if (cur_start == 0 && bits == EXTENT_DIRTY) { 2148 total_bytes = tree->dirty_bytes; 2149 goto out; 2150 } 2151 /* 2152 * this search will find all the extents that end after 2153 * our range starts. 2154 */ 2155 node = tree_search(tree, cur_start); 2156 if (!node) 2157 goto out; 2158 2159 while (1) { 2160 state = rb_entry(node, struct extent_state, rb_node); 2161 if (state->start > search_end) 2162 break; 2163 if (contig && found && state->start > last + 1) 2164 break; 2165 if (state->end >= cur_start && (state->state & bits) == bits) { 2166 total_bytes += min(search_end, state->end) + 1 - 2167 max(cur_start, state->start); 2168 if (total_bytes >= max_bytes) 2169 break; 2170 if (!found) { 2171 *start = max(cur_start, state->start); 2172 found = 1; 2173 } 2174 last = state->end; 2175 } else if (contig && found) { 2176 break; 2177 } 2178 node = rb_next(node); 2179 if (!node) 2180 break; 2181 } 2182 out: 2183 spin_unlock(&tree->lock); 2184 return total_bytes; 2185 } 2186 2187 /* 2188 * set the private field for a given byte offset in the tree. If there isn't 2189 * an extent_state there already, this does nothing. 2190 */ 2191 int set_state_failrec(struct extent_io_tree *tree, u64 start, 2192 struct io_failure_record *failrec) 2193 { 2194 struct rb_node *node; 2195 struct extent_state *state; 2196 int ret = 0; 2197 2198 spin_lock(&tree->lock); 2199 /* 2200 * this search will find all the extents that end after 2201 * our range starts. 2202 */ 2203 node = tree_search(tree, start); 2204 if (!node) { 2205 ret = -ENOENT; 2206 goto out; 2207 } 2208 state = rb_entry(node, struct extent_state, rb_node); 2209 if (state->start != start) { 2210 ret = -ENOENT; 2211 goto out; 2212 } 2213 state->failrec = failrec; 2214 out: 2215 spin_unlock(&tree->lock); 2216 return ret; 2217 } 2218 2219 struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start) 2220 { 2221 struct rb_node *node; 2222 struct extent_state *state; 2223 struct io_failure_record *failrec; 2224 2225 spin_lock(&tree->lock); 2226 /* 2227 * this search will find all the extents that end after 2228 * our range starts. 2229 */ 2230 node = tree_search(tree, start); 2231 if (!node) { 2232 failrec = ERR_PTR(-ENOENT); 2233 goto out; 2234 } 2235 state = rb_entry(node, struct extent_state, rb_node); 2236 if (state->start != start) { 2237 failrec = ERR_PTR(-ENOENT); 2238 goto out; 2239 } 2240 2241 failrec = state->failrec; 2242 out: 2243 spin_unlock(&tree->lock); 2244 return failrec; 2245 } 2246 2247 /* 2248 * searches a range in the state tree for a given mask. 2249 * If 'filled' == 1, this returns 1 only if every extent in the tree 2250 * has the bits set. Otherwise, 1 is returned if any bit in the 2251 * range is found set. 2252 */ 2253 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 2254 u32 bits, int filled, struct extent_state *cached) 2255 { 2256 struct extent_state *state = NULL; 2257 struct rb_node *node; 2258 int bitset = 0; 2259 2260 spin_lock(&tree->lock); 2261 if (cached && extent_state_in_tree(cached) && cached->start <= start && 2262 cached->end > start) 2263 node = &cached->rb_node; 2264 else 2265 node = tree_search(tree, start); 2266 while (node && start <= end) { 2267 state = rb_entry(node, struct extent_state, rb_node); 2268 2269 if (filled && state->start > start) { 2270 bitset = 0; 2271 break; 2272 } 2273 2274 if (state->start > end) 2275 break; 2276 2277 if (state->state & bits) { 2278 bitset = 1; 2279 if (!filled) 2280 break; 2281 } else if (filled) { 2282 bitset = 0; 2283 break; 2284 } 2285 2286 if (state->end == (u64)-1) 2287 break; 2288 2289 start = state->end + 1; 2290 if (start > end) 2291 break; 2292 node = rb_next(node); 2293 if (!node) { 2294 if (filled) 2295 bitset = 0; 2296 break; 2297 } 2298 } 2299 spin_unlock(&tree->lock); 2300 return bitset; 2301 } 2302 2303 int free_io_failure(struct extent_io_tree *failure_tree, 2304 struct extent_io_tree *io_tree, 2305 struct io_failure_record *rec) 2306 { 2307 int ret; 2308 int err = 0; 2309 2310 set_state_failrec(failure_tree, rec->start, NULL); 2311 ret = clear_extent_bits(failure_tree, rec->start, 2312 rec->start + rec->len - 1, 2313 EXTENT_LOCKED | EXTENT_DIRTY); 2314 if (ret) 2315 err = ret; 2316 2317 ret = clear_extent_bits(io_tree, rec->start, 2318 rec->start + rec->len - 1, 2319 EXTENT_DAMAGED); 2320 if (ret && !err) 2321 err = ret; 2322 2323 kfree(rec); 2324 return err; 2325 } 2326 2327 /* 2328 * this bypasses the standard btrfs submit functions deliberately, as 2329 * the standard behavior is to write all copies in a raid setup. here we only 2330 * want to write the one bad copy. so we do the mapping for ourselves and issue 2331 * submit_bio directly. 2332 * to avoid any synchronization issues, wait for the data after writing, which 2333 * actually prevents the read that triggered the error from finishing. 2334 * currently, there can be no more than two copies of every data bit. thus, 2335 * exactly one rewrite is required. 2336 */ 2337 static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, 2338 u64 length, u64 logical, struct page *page, 2339 unsigned int pg_offset, int mirror_num) 2340 { 2341 struct btrfs_device *dev; 2342 struct bio_vec bvec; 2343 struct bio bio; 2344 u64 map_length = 0; 2345 u64 sector; 2346 struct btrfs_io_context *bioc = NULL; 2347 int ret = 0; 2348 2349 ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); 2350 BUG_ON(!mirror_num); 2351 2352 if (btrfs_repair_one_zone(fs_info, logical)) 2353 return 0; 2354 2355 map_length = length; 2356 2357 /* 2358 * Avoid races with device replace and make sure our bioc has devices 2359 * associated to its stripes that don't go away while we are doing the 2360 * read repair operation. 2361 */ 2362 btrfs_bio_counter_inc_blocked(fs_info); 2363 if (btrfs_is_parity_mirror(fs_info, logical, length)) { 2364 /* 2365 * Note that we don't use BTRFS_MAP_WRITE because it's supposed 2366 * to update all raid stripes, but here we just want to correct 2367 * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad 2368 * stripe's dev and sector. 2369 */ 2370 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, 2371 &map_length, &bioc, 0); 2372 if (ret) 2373 goto out_counter_dec; 2374 ASSERT(bioc->mirror_num == 1); 2375 } else { 2376 ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, 2377 &map_length, &bioc, mirror_num); 2378 if (ret) 2379 goto out_counter_dec; 2380 BUG_ON(mirror_num != bioc->mirror_num); 2381 } 2382 2383 sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9; 2384 dev = bioc->stripes[bioc->mirror_num - 1].dev; 2385 btrfs_put_bioc(bioc); 2386 2387 if (!dev || !dev->bdev || 2388 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { 2389 ret = -EIO; 2390 goto out_counter_dec; 2391 } 2392 2393 bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); 2394 bio.bi_iter.bi_sector = sector; 2395 __bio_add_page(&bio, page, length, pg_offset); 2396 2397 btrfsic_check_bio(&bio); 2398 ret = submit_bio_wait(&bio); 2399 if (ret) { 2400 /* try to remap that extent elsewhere? */ 2401 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); 2402 goto out_bio_uninit; 2403 } 2404 2405 btrfs_info_rl_in_rcu(fs_info, 2406 "read error corrected: ino %llu off %llu (dev %s sector %llu)", 2407 ino, start, 2408 rcu_str_deref(dev->name), sector); 2409 ret = 0; 2410 2411 out_bio_uninit: 2412 bio_uninit(&bio); 2413 out_counter_dec: 2414 btrfs_bio_counter_dec(fs_info); 2415 return ret; 2416 } 2417 2418 int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) 2419 { 2420 struct btrfs_fs_info *fs_info = eb->fs_info; 2421 u64 start = eb->start; 2422 int i, num_pages = num_extent_pages(eb); 2423 int ret = 0; 2424 2425 if (sb_rdonly(fs_info->sb)) 2426 return -EROFS; 2427 2428 for (i = 0; i < num_pages; i++) { 2429 struct page *p = eb->pages[i]; 2430 2431 ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p, 2432 start - page_offset(p), mirror_num); 2433 if (ret) 2434 break; 2435 start += PAGE_SIZE; 2436 } 2437 2438 return ret; 2439 } 2440 2441 static int next_mirror(const struct io_failure_record *failrec, int cur_mirror) 2442 { 2443 if (cur_mirror == failrec->num_copies) 2444 return cur_mirror + 1 - failrec->num_copies; 2445 return cur_mirror + 1; 2446 } 2447 2448 static int prev_mirror(const struct io_failure_record *failrec, int cur_mirror) 2449 { 2450 if (cur_mirror == 1) 2451 return failrec->num_copies; 2452 return cur_mirror - 1; 2453 } 2454 2455 /* 2456 * each time an IO finishes, we do a fast check in the IO failure tree 2457 * to see if we need to process or clean up an io_failure_record 2458 */ 2459 int clean_io_failure(struct btrfs_fs_info *fs_info, 2460 struct extent_io_tree *failure_tree, 2461 struct extent_io_tree *io_tree, u64 start, 2462 struct page *page, u64 ino, unsigned int pg_offset) 2463 { 2464 u64 private; 2465 struct io_failure_record *failrec; 2466 struct extent_state *state; 2467 int mirror; 2468 int ret; 2469 2470 private = 0; 2471 ret = count_range_bits(failure_tree, &private, (u64)-1, 1, 2472 EXTENT_DIRTY, 0); 2473 if (!ret) 2474 return 0; 2475 2476 failrec = get_state_failrec(failure_tree, start); 2477 if (IS_ERR(failrec)) 2478 return 0; 2479 2480 BUG_ON(!failrec->this_mirror); 2481 2482 if (sb_rdonly(fs_info->sb)) 2483 goto out; 2484 2485 spin_lock(&io_tree->lock); 2486 state = find_first_extent_bit_state(io_tree, 2487 failrec->start, 2488 EXTENT_LOCKED); 2489 spin_unlock(&io_tree->lock); 2490 2491 if (!state || state->start > failrec->start || 2492 state->end < failrec->start + failrec->len - 1) 2493 goto out; 2494 2495 mirror = failrec->this_mirror; 2496 do { 2497 mirror = prev_mirror(failrec, mirror); 2498 repair_io_failure(fs_info, ino, start, failrec->len, 2499 failrec->logical, page, pg_offset, mirror); 2500 } while (mirror != failrec->failed_mirror); 2501 2502 out: 2503 free_io_failure(failure_tree, io_tree, failrec); 2504 return 0; 2505 } 2506 2507 /* 2508 * Can be called when 2509 * - hold extent lock 2510 * - under ordered extent 2511 * - the inode is freeing 2512 */ 2513 void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end) 2514 { 2515 struct extent_io_tree *failure_tree = &inode->io_failure_tree; 2516 struct io_failure_record *failrec; 2517 struct extent_state *state, *next; 2518 2519 if (RB_EMPTY_ROOT(&failure_tree->state)) 2520 return; 2521 2522 spin_lock(&failure_tree->lock); 2523 state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY); 2524 while (state) { 2525 if (state->start > end) 2526 break; 2527 2528 ASSERT(state->end <= end); 2529 2530 next = next_state(state); 2531 2532 failrec = state->failrec; 2533 free_extent_state(state); 2534 kfree(failrec); 2535 2536 state = next; 2537 } 2538 spin_unlock(&failure_tree->lock); 2539 } 2540 2541 static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode, 2542 struct btrfs_bio *bbio, 2543 unsigned int bio_offset) 2544 { 2545 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2546 u64 start = bbio->file_offset + bio_offset; 2547 struct io_failure_record *failrec; 2548 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 2549 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 2550 const u32 sectorsize = fs_info->sectorsize; 2551 int ret; 2552 2553 failrec = get_state_failrec(failure_tree, start); 2554 if (!IS_ERR(failrec)) { 2555 btrfs_debug(fs_info, 2556 "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu", 2557 failrec->logical, failrec->start, failrec->len); 2558 /* 2559 * when data can be on disk more than twice, add to failrec here 2560 * (e.g. with a list for failed_mirror) to make 2561 * clean_io_failure() clean all those errors at once. 2562 */ 2563 ASSERT(failrec->this_mirror == bbio->mirror_num); 2564 ASSERT(failrec->len == fs_info->sectorsize); 2565 return failrec; 2566 } 2567 2568 failrec = kzalloc(sizeof(*failrec), GFP_NOFS); 2569 if (!failrec) 2570 return ERR_PTR(-ENOMEM); 2571 2572 failrec->start = start; 2573 failrec->len = sectorsize; 2574 failrec->failed_mirror = bbio->mirror_num; 2575 failrec->this_mirror = bbio->mirror_num; 2576 failrec->logical = (bbio->iter.bi_sector << SECTOR_SHIFT) + bio_offset; 2577 2578 btrfs_debug(fs_info, 2579 "new io failure record logical %llu start %llu", 2580 failrec->logical, start); 2581 2582 failrec->num_copies = btrfs_num_copies(fs_info, failrec->logical, sectorsize); 2583 if (failrec->num_copies == 1) { 2584 /* 2585 * We only have a single copy of the data, so don't bother with 2586 * all the retry and error correction code that follows. No 2587 * matter what the error is, it is very likely to persist. 2588 */ 2589 btrfs_debug(fs_info, 2590 "cannot repair logical %llu num_copies %d", 2591 failrec->logical, failrec->num_copies); 2592 kfree(failrec); 2593 return ERR_PTR(-EIO); 2594 } 2595 2596 /* Set the bits in the private failure tree */ 2597 ret = set_extent_bits(failure_tree, start, start + sectorsize - 1, 2598 EXTENT_LOCKED | EXTENT_DIRTY); 2599 if (ret >= 0) { 2600 ret = set_state_failrec(failure_tree, start, failrec); 2601 /* Set the bits in the inode's tree */ 2602 ret = set_extent_bits(tree, start, start + sectorsize - 1, 2603 EXTENT_DAMAGED); 2604 } else if (ret < 0) { 2605 kfree(failrec); 2606 return ERR_PTR(ret); 2607 } 2608 2609 return failrec; 2610 } 2611 2612 int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, 2613 u32 bio_offset, struct page *page, unsigned int pgoff, 2614 submit_bio_hook_t *submit_bio_hook) 2615 { 2616 u64 start = failed_bbio->file_offset + bio_offset; 2617 struct io_failure_record *failrec; 2618 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2619 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 2620 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 2621 struct bio *failed_bio = &failed_bbio->bio; 2622 const int icsum = bio_offset >> fs_info->sectorsize_bits; 2623 struct bio *repair_bio; 2624 struct btrfs_bio *repair_bbio; 2625 2626 btrfs_debug(fs_info, 2627 "repair read error: read error at %llu", start); 2628 2629 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); 2630 2631 failrec = btrfs_get_io_failure_record(inode, failed_bbio, bio_offset); 2632 if (IS_ERR(failrec)) 2633 return PTR_ERR(failrec); 2634 2635 /* 2636 * There are two premises: 2637 * a) deliver good data to the caller 2638 * b) correct the bad sectors on disk 2639 * 2640 * Since we're only doing repair for one sector, we only need to get 2641 * a good copy of the failed sector and if we succeed, we have setup 2642 * everything for repair_io_failure to do the rest for us. 2643 */ 2644 failrec->this_mirror = next_mirror(failrec, failrec->this_mirror); 2645 if (failrec->this_mirror == failrec->failed_mirror) { 2646 btrfs_debug(fs_info, 2647 "failed to repair num_copies %d this_mirror %d failed_mirror %d", 2648 failrec->num_copies, failrec->this_mirror, failrec->failed_mirror); 2649 free_io_failure(failure_tree, tree, failrec); 2650 return -EIO; 2651 } 2652 2653 repair_bio = btrfs_bio_alloc(1); 2654 repair_bbio = btrfs_bio(repair_bio); 2655 repair_bbio->file_offset = start; 2656 repair_bio->bi_opf = REQ_OP_READ; 2657 repair_bio->bi_end_io = failed_bio->bi_end_io; 2658 repair_bio->bi_iter.bi_sector = failrec->logical >> 9; 2659 repair_bio->bi_private = failed_bio->bi_private; 2660 2661 if (failed_bbio->csum) { 2662 const u32 csum_size = fs_info->csum_size; 2663 2664 repair_bbio->csum = repair_bbio->csum_inline; 2665 memcpy(repair_bbio->csum, 2666 failed_bbio->csum + csum_size * icsum, csum_size); 2667 } 2668 2669 bio_add_page(repair_bio, page, failrec->len, pgoff); 2670 repair_bbio->iter = repair_bio->bi_iter; 2671 2672 btrfs_debug(btrfs_sb(inode->i_sb), 2673 "repair read error: submitting new read to mirror %d", 2674 failrec->this_mirror); 2675 2676 /* 2677 * At this point we have a bio, so any errors from submit_bio_hook() 2678 * will be handled by the endio on the repair_bio, so we can't return an 2679 * error here. 2680 */ 2681 submit_bio_hook(inode, repair_bio, failrec->this_mirror, 0); 2682 return BLK_STS_OK; 2683 } 2684 2685 static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) 2686 { 2687 struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); 2688 2689 ASSERT(page_offset(page) <= start && 2690 start + len <= page_offset(page) + PAGE_SIZE); 2691 2692 if (uptodate) { 2693 if (fsverity_active(page->mapping->host) && 2694 !PageError(page) && 2695 !PageUptodate(page) && 2696 start < i_size_read(page->mapping->host) && 2697 !fsverity_verify_page(page)) { 2698 btrfs_page_set_error(fs_info, page, start, len); 2699 } else { 2700 btrfs_page_set_uptodate(fs_info, page, start, len); 2701 } 2702 } else { 2703 btrfs_page_clear_uptodate(fs_info, page, start, len); 2704 btrfs_page_set_error(fs_info, page, start, len); 2705 } 2706 2707 if (!btrfs_is_subpage(fs_info, page)) 2708 unlock_page(page); 2709 else 2710 btrfs_subpage_end_reader(fs_info, page, start, len); 2711 } 2712 2713 static void end_sector_io(struct page *page, u64 offset, bool uptodate) 2714 { 2715 struct btrfs_inode *inode = BTRFS_I(page->mapping->host); 2716 const u32 sectorsize = inode->root->fs_info->sectorsize; 2717 struct extent_state *cached = NULL; 2718 2719 end_page_read(page, uptodate, offset, sectorsize); 2720 if (uptodate) 2721 set_extent_uptodate(&inode->io_tree, offset, 2722 offset + sectorsize - 1, &cached, GFP_ATOMIC); 2723 unlock_extent_cached_atomic(&inode->io_tree, offset, 2724 offset + sectorsize - 1, &cached); 2725 } 2726 2727 static void submit_data_read_repair(struct inode *inode, 2728 struct btrfs_bio *failed_bbio, 2729 u32 bio_offset, const struct bio_vec *bvec, 2730 unsigned int error_bitmap) 2731 { 2732 const unsigned int pgoff = bvec->bv_offset; 2733 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2734 struct page *page = bvec->bv_page; 2735 const u64 start = page_offset(bvec->bv_page) + bvec->bv_offset; 2736 const u64 end = start + bvec->bv_len - 1; 2737 const u32 sectorsize = fs_info->sectorsize; 2738 const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits; 2739 int i; 2740 2741 BUG_ON(bio_op(&failed_bbio->bio) == REQ_OP_WRITE); 2742 2743 /* This repair is only for data */ 2744 ASSERT(is_data_inode(inode)); 2745 2746 /* We're here because we had some read errors or csum mismatch */ 2747 ASSERT(error_bitmap); 2748 2749 /* 2750 * We only get called on buffered IO, thus page must be mapped and bio 2751 * must not be cloned. 2752 */ 2753 ASSERT(page->mapping && !bio_flagged(&failed_bbio->bio, BIO_CLONED)); 2754 2755 /* Iterate through all the sectors in the range */ 2756 for (i = 0; i < nr_bits; i++) { 2757 const unsigned int offset = i * sectorsize; 2758 bool uptodate = false; 2759 int ret; 2760 2761 if (!(error_bitmap & (1U << i))) { 2762 /* 2763 * This sector has no error, just end the page read 2764 * and unlock the range. 2765 */ 2766 uptodate = true; 2767 goto next; 2768 } 2769 2770 ret = btrfs_repair_one_sector(inode, failed_bbio, 2771 bio_offset + offset, page, pgoff + offset, 2772 btrfs_submit_data_read_bio); 2773 if (!ret) { 2774 /* 2775 * We have submitted the read repair, the page release 2776 * will be handled by the endio function of the 2777 * submitted repair bio. 2778 * Thus we don't need to do any thing here. 2779 */ 2780 continue; 2781 } 2782 /* 2783 * Continue on failed repair, otherwise the remaining sectors 2784 * will not be properly unlocked. 2785 */ 2786 next: 2787 end_sector_io(page, start + offset, uptodate); 2788 } 2789 } 2790 2791 /* lots and lots of room for performance fixes in the end_bio funcs */ 2792 2793 void end_extent_writepage(struct page *page, int err, u64 start, u64 end) 2794 { 2795 struct btrfs_inode *inode; 2796 const bool uptodate = (err == 0); 2797 int ret = 0; 2798 2799 ASSERT(page && page->mapping); 2800 inode = BTRFS_I(page->mapping->host); 2801 btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate); 2802 2803 if (!uptodate) { 2804 const struct btrfs_fs_info *fs_info = inode->root->fs_info; 2805 u32 len; 2806 2807 ASSERT(end + 1 - start <= U32_MAX); 2808 len = end + 1 - start; 2809 2810 btrfs_page_clear_uptodate(fs_info, page, start, len); 2811 btrfs_page_set_error(fs_info, page, start, len); 2812 ret = err < 0 ? err : -EIO; 2813 mapping_set_error(page->mapping, ret); 2814 } 2815 } 2816 2817 /* 2818 * after a writepage IO is done, we need to: 2819 * clear the uptodate bits on error 2820 * clear the writeback bits in the extent tree for this IO 2821 * end_page_writeback if the page has no more pending IO 2822 * 2823 * Scheduling is not allowed, so the extent state tree is expected 2824 * to have one and only one object corresponding to this IO. 2825 */ 2826 static void end_bio_extent_writepage(struct bio *bio) 2827 { 2828 int error = blk_status_to_errno(bio->bi_status); 2829 struct bio_vec *bvec; 2830 u64 start; 2831 u64 end; 2832 struct bvec_iter_all iter_all; 2833 bool first_bvec = true; 2834 2835 ASSERT(!bio_flagged(bio, BIO_CLONED)); 2836 bio_for_each_segment_all(bvec, bio, iter_all) { 2837 struct page *page = bvec->bv_page; 2838 struct inode *inode = page->mapping->host; 2839 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2840 const u32 sectorsize = fs_info->sectorsize; 2841 2842 /* Our read/write should always be sector aligned. */ 2843 if (!IS_ALIGNED(bvec->bv_offset, sectorsize)) 2844 btrfs_err(fs_info, 2845 "partial page write in btrfs with offset %u and length %u", 2846 bvec->bv_offset, bvec->bv_len); 2847 else if (!IS_ALIGNED(bvec->bv_len, sectorsize)) 2848 btrfs_info(fs_info, 2849 "incomplete page write with offset %u and length %u", 2850 bvec->bv_offset, bvec->bv_len); 2851 2852 start = page_offset(page) + bvec->bv_offset; 2853 end = start + bvec->bv_len - 1; 2854 2855 if (first_bvec) { 2856 btrfs_record_physical_zoned(inode, start, bio); 2857 first_bvec = false; 2858 } 2859 2860 end_extent_writepage(page, error, start, end); 2861 2862 btrfs_page_clear_writeback(fs_info, page, start, bvec->bv_len); 2863 } 2864 2865 bio_put(bio); 2866 } 2867 2868 /* 2869 * Record previously processed extent range 2870 * 2871 * For endio_readpage_release_extent() to handle a full extent range, reducing 2872 * the extent io operations. 2873 */ 2874 struct processed_extent { 2875 struct btrfs_inode *inode; 2876 /* Start of the range in @inode */ 2877 u64 start; 2878 /* End of the range in @inode */ 2879 u64 end; 2880 bool uptodate; 2881 }; 2882 2883 /* 2884 * Try to release processed extent range 2885 * 2886 * May not release the extent range right now if the current range is 2887 * contiguous to processed extent. 2888 * 2889 * Will release processed extent when any of @inode, @uptodate, the range is 2890 * no longer contiguous to the processed range. 2891 * 2892 * Passing @inode == NULL will force processed extent to be released. 2893 */ 2894 static void endio_readpage_release_extent(struct processed_extent *processed, 2895 struct btrfs_inode *inode, u64 start, u64 end, 2896 bool uptodate) 2897 { 2898 struct extent_state *cached = NULL; 2899 struct extent_io_tree *tree; 2900 2901 /* The first extent, initialize @processed */ 2902 if (!processed->inode) 2903 goto update; 2904 2905 /* 2906 * Contiguous to processed extent, just uptodate the end. 2907 * 2908 * Several things to notice: 2909 * 2910 * - bio can be merged as long as on-disk bytenr is contiguous 2911 * This means we can have page belonging to other inodes, thus need to 2912 * check if the inode still matches. 2913 * - bvec can contain range beyond current page for multi-page bvec 2914 * Thus we need to do processed->end + 1 >= start check 2915 */ 2916 if (processed->inode == inode && processed->uptodate == uptodate && 2917 processed->end + 1 >= start && end >= processed->end) { 2918 processed->end = end; 2919 return; 2920 } 2921 2922 tree = &processed->inode->io_tree; 2923 /* 2924 * Now we don't have range contiguous to the processed range, release 2925 * the processed range now. 2926 */ 2927 if (processed->uptodate && tree->track_uptodate) 2928 set_extent_uptodate(tree, processed->start, processed->end, 2929 &cached, GFP_ATOMIC); 2930 unlock_extent_cached_atomic(tree, processed->start, processed->end, 2931 &cached); 2932 2933 update: 2934 /* Update processed to current range */ 2935 processed->inode = inode; 2936 processed->start = start; 2937 processed->end = end; 2938 processed->uptodate = uptodate; 2939 } 2940 2941 static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) 2942 { 2943 ASSERT(PageLocked(page)); 2944 if (!btrfs_is_subpage(fs_info, page)) 2945 return; 2946 2947 ASSERT(PagePrivate(page)); 2948 btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE); 2949 } 2950 2951 /* 2952 * Find extent buffer for a givne bytenr. 2953 * 2954 * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking 2955 * in endio context. 2956 */ 2957 static struct extent_buffer *find_extent_buffer_readpage( 2958 struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr) 2959 { 2960 struct extent_buffer *eb; 2961 2962 /* 2963 * For regular sectorsize, we can use page->private to grab extent 2964 * buffer 2965 */ 2966 if (fs_info->nodesize >= PAGE_SIZE) { 2967 ASSERT(PagePrivate(page) && page->private); 2968 return (struct extent_buffer *)page->private; 2969 } 2970 2971 /* For subpage case, we need to lookup buffer radix tree */ 2972 rcu_read_lock(); 2973 eb = radix_tree_lookup(&fs_info->buffer_radix, 2974 bytenr >> fs_info->sectorsize_bits); 2975 rcu_read_unlock(); 2976 ASSERT(eb); 2977 return eb; 2978 } 2979 2980 /* 2981 * after a readpage IO is done, we need to: 2982 * clear the uptodate bits on error 2983 * set the uptodate bits if things worked 2984 * set the page up to date if all extents in the tree are uptodate 2985 * clear the lock bit in the extent tree 2986 * unlock the page if there are no other extents locked for it 2987 * 2988 * Scheduling is not allowed, so the extent state tree is expected 2989 * to have one and only one object corresponding to this IO. 2990 */ 2991 static void end_bio_extent_readpage(struct bio *bio) 2992 { 2993 struct bio_vec *bvec; 2994 struct btrfs_bio *bbio = btrfs_bio(bio); 2995 struct extent_io_tree *tree, *failure_tree; 2996 struct processed_extent processed = { 0 }; 2997 /* 2998 * The offset to the beginning of a bio, since one bio can never be 2999 * larger than UINT_MAX, u32 here is enough. 3000 */ 3001 u32 bio_offset = 0; 3002 int mirror; 3003 struct bvec_iter_all iter_all; 3004 3005 ASSERT(!bio_flagged(bio, BIO_CLONED)); 3006 bio_for_each_segment_all(bvec, bio, iter_all) { 3007 bool uptodate = !bio->bi_status; 3008 struct page *page = bvec->bv_page; 3009 struct inode *inode = page->mapping->host; 3010 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3011 const u32 sectorsize = fs_info->sectorsize; 3012 unsigned int error_bitmap = (unsigned int)-1; 3013 bool repair = false; 3014 u64 start; 3015 u64 end; 3016 u32 len; 3017 3018 btrfs_debug(fs_info, 3019 "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", 3020 bio->bi_iter.bi_sector, bio->bi_status, 3021 bbio->mirror_num); 3022 tree = &BTRFS_I(inode)->io_tree; 3023 failure_tree = &BTRFS_I(inode)->io_failure_tree; 3024 3025 /* 3026 * We always issue full-sector reads, but if some block in a 3027 * page fails to read, blk_update_request() will advance 3028 * bv_offset and adjust bv_len to compensate. Print a warning 3029 * for unaligned offsets, and an error if they don't add up to 3030 * a full sector. 3031 */ 3032 if (!IS_ALIGNED(bvec->bv_offset, sectorsize)) 3033 btrfs_err(fs_info, 3034 "partial page read in btrfs with offset %u and length %u", 3035 bvec->bv_offset, bvec->bv_len); 3036 else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len, 3037 sectorsize)) 3038 btrfs_info(fs_info, 3039 "incomplete page read with offset %u and length %u", 3040 bvec->bv_offset, bvec->bv_len); 3041 3042 start = page_offset(page) + bvec->bv_offset; 3043 end = start + bvec->bv_len - 1; 3044 len = bvec->bv_len; 3045 3046 mirror = bbio->mirror_num; 3047 if (likely(uptodate)) { 3048 if (is_data_inode(inode)) { 3049 error_bitmap = btrfs_verify_data_csum(bbio, 3050 bio_offset, page, start, end); 3051 if (error_bitmap) 3052 uptodate = false; 3053 } else { 3054 if (btrfs_validate_metadata_buffer(bbio, 3055 page, start, end, mirror)) 3056 uptodate = false; 3057 } 3058 } 3059 3060 if (likely(uptodate)) { 3061 loff_t i_size = i_size_read(inode); 3062 pgoff_t end_index = i_size >> PAGE_SHIFT; 3063 3064 clean_io_failure(BTRFS_I(inode)->root->fs_info, 3065 failure_tree, tree, start, page, 3066 btrfs_ino(BTRFS_I(inode)), 0); 3067 3068 /* 3069 * Zero out the remaining part if this range straddles 3070 * i_size. 3071 * 3072 * Here we should only zero the range inside the bvec, 3073 * not touch anything else. 3074 * 3075 * NOTE: i_size is exclusive while end is inclusive. 3076 */ 3077 if (page->index == end_index && i_size <= end) { 3078 u32 zero_start = max(offset_in_page(i_size), 3079 offset_in_page(start)); 3080 3081 zero_user_segment(page, zero_start, 3082 offset_in_page(end) + 1); 3083 } 3084 } else if (is_data_inode(inode)) { 3085 /* 3086 * Only try to repair bios that actually made it to a 3087 * device. If the bio failed to be submitted mirror 3088 * is 0 and we need to fail it without retrying. 3089 * 3090 * This also includes the high level bios for compressed 3091 * extents - these never make it to a device and repair 3092 * is already handled on the lower compressed bio. 3093 */ 3094 if (mirror > 0) 3095 repair = true; 3096 } else { 3097 struct extent_buffer *eb; 3098 3099 eb = find_extent_buffer_readpage(fs_info, page, start); 3100 set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); 3101 eb->read_mirror = mirror; 3102 atomic_dec(&eb->io_pages); 3103 } 3104 3105 if (repair) { 3106 /* 3107 * submit_data_read_repair() will handle all the good 3108 * and bad sectors, we just continue to the next bvec. 3109 */ 3110 submit_data_read_repair(inode, bbio, bio_offset, bvec, 3111 error_bitmap); 3112 } else { 3113 /* Update page status and unlock */ 3114 end_page_read(page, uptodate, start, len); 3115 endio_readpage_release_extent(&processed, BTRFS_I(inode), 3116 start, end, PageUptodate(page)); 3117 } 3118 3119 ASSERT(bio_offset + len > bio_offset); 3120 bio_offset += len; 3121 3122 } 3123 /* Release the last extent */ 3124 endio_readpage_release_extent(&processed, NULL, 0, 0, false); 3125 btrfs_bio_free_csum(bbio); 3126 bio_put(bio); 3127 } 3128 3129 /** 3130 * Populate every free slot in a provided array with pages. 3131 * 3132 * @nr_pages: number of pages to allocate 3133 * @page_array: the array to fill with pages; any existing non-null entries in 3134 * the array will be skipped 3135 * 3136 * Return: 0 if all pages were able to be allocated; 3137 * -ENOMEM otherwise, and the caller is responsible for freeing all 3138 * non-null page pointers in the array. 3139 */ 3140 int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array) 3141 { 3142 unsigned int allocated; 3143 3144 for (allocated = 0; allocated < nr_pages;) { 3145 unsigned int last = allocated; 3146 3147 allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array); 3148 3149 if (allocated == nr_pages) 3150 return 0; 3151 3152 /* 3153 * During this iteration, no page could be allocated, even 3154 * though alloc_pages_bulk_array() falls back to alloc_page() 3155 * if it could not bulk-allocate. So we must be out of memory. 3156 */ 3157 if (allocated == last) 3158 return -ENOMEM; 3159 3160 memalloc_retry_wait(GFP_NOFS); 3161 } 3162 return 0; 3163 } 3164 3165 /* 3166 * Initialize the members up to but not including 'bio'. Use after allocating a 3167 * new bio by bio_alloc_bioset as it does not initialize the bytes outside of 3168 * 'bio' because use of __GFP_ZERO is not supported. 3169 */ 3170 static inline void btrfs_bio_init(struct btrfs_bio *bbio) 3171 { 3172 memset(bbio, 0, offsetof(struct btrfs_bio, bio)); 3173 } 3174 3175 /* 3176 * Allocate a btrfs_io_bio, with @nr_iovecs as maximum number of iovecs. 3177 * 3178 * The bio allocation is backed by bioset and does not fail. 3179 */ 3180 struct bio *btrfs_bio_alloc(unsigned int nr_iovecs) 3181 { 3182 struct bio *bio; 3183 3184 ASSERT(0 < nr_iovecs && nr_iovecs <= BIO_MAX_VECS); 3185 bio = bio_alloc_bioset(NULL, nr_iovecs, 0, GFP_NOFS, &btrfs_bioset); 3186 btrfs_bio_init(btrfs_bio(bio)); 3187 return bio; 3188 } 3189 3190 struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size) 3191 { 3192 struct bio *bio; 3193 struct btrfs_bio *bbio; 3194 3195 ASSERT(offset <= UINT_MAX && size <= UINT_MAX); 3196 3197 /* this will never fail when it's backed by a bioset */ 3198 bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset); 3199 ASSERT(bio); 3200 3201 bbio = btrfs_bio(bio); 3202 btrfs_bio_init(bbio); 3203 3204 bio_trim(bio, offset >> 9, size >> 9); 3205 bbio->iter = bio->bi_iter; 3206 return bio; 3207 } 3208 3209 /** 3210 * Attempt to add a page to bio 3211 * 3212 * @bio_ctrl: record both the bio, and its bio_flags 3213 * @page: page to add to the bio 3214 * @disk_bytenr: offset of the new bio or to check whether we are adding 3215 * a contiguous page to the previous one 3216 * @size: portion of page that we want to write 3217 * @pg_offset: starting offset in the page 3218 * @compress_type: compression type of the current bio to see if we can merge them 3219 * 3220 * Attempt to add a page to bio considering stripe alignment etc. 3221 * 3222 * Return >= 0 for the number of bytes added to the bio. 3223 * Can return 0 if the current bio is already at stripe/zone boundary. 3224 * Return <0 for error. 3225 */ 3226 static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, 3227 struct page *page, 3228 u64 disk_bytenr, unsigned int size, 3229 unsigned int pg_offset, 3230 enum btrfs_compression_type compress_type) 3231 { 3232 struct bio *bio = bio_ctrl->bio; 3233 u32 bio_size = bio->bi_iter.bi_size; 3234 u32 real_size; 3235 const sector_t sector = disk_bytenr >> SECTOR_SHIFT; 3236 bool contig; 3237 int ret; 3238 3239 ASSERT(bio); 3240 /* The limit should be calculated when bio_ctrl->bio is allocated */ 3241 ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary); 3242 if (bio_ctrl->compress_type != compress_type) 3243 return 0; 3244 3245 if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) 3246 contig = bio->bi_iter.bi_sector == sector; 3247 else 3248 contig = bio_end_sector(bio) == sector; 3249 if (!contig) 3250 return 0; 3251 3252 real_size = min(bio_ctrl->len_to_oe_boundary, 3253 bio_ctrl->len_to_stripe_boundary) - bio_size; 3254 real_size = min(real_size, size); 3255 3256 /* 3257 * If real_size is 0, never call bio_add_*_page(), as even size is 0, 3258 * bio will still execute its endio function on the page! 3259 */ 3260 if (real_size == 0) 3261 return 0; 3262 3263 if (bio_op(bio) == REQ_OP_ZONE_APPEND) 3264 ret = bio_add_zone_append_page(bio, page, real_size, pg_offset); 3265 else 3266 ret = bio_add_page(bio, page, real_size, pg_offset); 3267 3268 return ret; 3269 } 3270 3271 static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, 3272 struct btrfs_inode *inode, u64 file_offset) 3273 { 3274 struct btrfs_fs_info *fs_info = inode->root->fs_info; 3275 struct btrfs_io_geometry geom; 3276 struct btrfs_ordered_extent *ordered; 3277 struct extent_map *em; 3278 u64 logical = (bio_ctrl->bio->bi_iter.bi_sector << SECTOR_SHIFT); 3279 int ret; 3280 3281 /* 3282 * Pages for compressed extent are never submitted to disk directly, 3283 * thus it has no real boundary, just set them to U32_MAX. 3284 * 3285 * The split happens for real compressed bio, which happens in 3286 * btrfs_submit_compressed_read/write(). 3287 */ 3288 if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { 3289 bio_ctrl->len_to_oe_boundary = U32_MAX; 3290 bio_ctrl->len_to_stripe_boundary = U32_MAX; 3291 return 0; 3292 } 3293 em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize); 3294 if (IS_ERR(em)) 3295 return PTR_ERR(em); 3296 ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio_ctrl->bio), 3297 logical, &geom); 3298 free_extent_map(em); 3299 if (ret < 0) { 3300 return ret; 3301 } 3302 if (geom.len > U32_MAX) 3303 bio_ctrl->len_to_stripe_boundary = U32_MAX; 3304 else 3305 bio_ctrl->len_to_stripe_boundary = (u32)geom.len; 3306 3307 if (bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) { 3308 bio_ctrl->len_to_oe_boundary = U32_MAX; 3309 return 0; 3310 } 3311 3312 /* Ordered extent not yet created, so we're good */ 3313 ordered = btrfs_lookup_ordered_extent(inode, file_offset); 3314 if (!ordered) { 3315 bio_ctrl->len_to_oe_boundary = U32_MAX; 3316 return 0; 3317 } 3318 3319 bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX, 3320 ordered->disk_bytenr + ordered->disk_num_bytes - logical); 3321 btrfs_put_ordered_extent(ordered); 3322 return 0; 3323 } 3324 3325 static int alloc_new_bio(struct btrfs_inode *inode, 3326 struct btrfs_bio_ctrl *bio_ctrl, 3327 struct writeback_control *wbc, 3328 blk_opf_t opf, 3329 bio_end_io_t end_io_func, 3330 u64 disk_bytenr, u32 offset, u64 file_offset, 3331 enum btrfs_compression_type compress_type) 3332 { 3333 struct btrfs_fs_info *fs_info = inode->root->fs_info; 3334 struct bio *bio; 3335 int ret; 3336 3337 bio = btrfs_bio_alloc(BIO_MAX_VECS); 3338 /* 3339 * For compressed page range, its disk_bytenr is always @disk_bytenr 3340 * passed in, no matter if we have added any range into previous bio. 3341 */ 3342 if (compress_type != BTRFS_COMPRESS_NONE) 3343 bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; 3344 else 3345 bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT; 3346 bio_ctrl->bio = bio; 3347 bio_ctrl->compress_type = compress_type; 3348 bio->bi_end_io = end_io_func; 3349 bio->bi_opf = opf; 3350 ret = calc_bio_boundaries(bio_ctrl, inode, file_offset); 3351 if (ret < 0) 3352 goto error; 3353 3354 if (wbc) { 3355 /* 3356 * For Zone append we need the correct block_device that we are 3357 * going to write to set in the bio to be able to respect the 3358 * hardware limitation. Look it up here: 3359 */ 3360 if (bio_op(bio) == REQ_OP_ZONE_APPEND) { 3361 struct btrfs_device *dev; 3362 3363 dev = btrfs_zoned_get_device(fs_info, disk_bytenr, 3364 fs_info->sectorsize); 3365 if (IS_ERR(dev)) { 3366 ret = PTR_ERR(dev); 3367 goto error; 3368 } 3369 3370 bio_set_dev(bio, dev->bdev); 3371 } else { 3372 /* 3373 * Otherwise pick the last added device to support 3374 * cgroup writeback. For multi-device file systems this 3375 * means blk-cgroup policies have to always be set on the 3376 * last added/replaced device. This is a bit odd but has 3377 * been like that for a long time. 3378 */ 3379 bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev); 3380 } 3381 wbc_init_bio(wbc, bio); 3382 } else { 3383 ASSERT(bio_op(bio) != REQ_OP_ZONE_APPEND); 3384 } 3385 return 0; 3386 error: 3387 bio_ctrl->bio = NULL; 3388 bio->bi_status = errno_to_blk_status(ret); 3389 bio_endio(bio); 3390 return ret; 3391 } 3392 3393 /* 3394 * @opf: bio REQ_OP_* and REQ_* flags as one value 3395 * @wbc: optional writeback control for io accounting 3396 * @page: page to add to the bio 3397 * @disk_bytenr: logical bytenr where the write will be 3398 * @size: portion of page that we want to write to 3399 * @pg_offset: offset of the new bio or to check whether we are adding 3400 * a contiguous page to the previous one 3401 * @bio_ret: must be valid pointer, newly allocated bio will be stored there 3402 * @end_io_func: end_io callback for new bio 3403 * @mirror_num: desired mirror to read/write 3404 * @prev_bio_flags: flags of previous bio to see if we can merge the current one 3405 * @compress_type: compress type for current bio 3406 */ 3407 static int submit_extent_page(blk_opf_t opf, 3408 struct writeback_control *wbc, 3409 struct btrfs_bio_ctrl *bio_ctrl, 3410 struct page *page, u64 disk_bytenr, 3411 size_t size, unsigned long pg_offset, 3412 bio_end_io_t end_io_func, 3413 enum btrfs_compression_type compress_type, 3414 bool force_bio_submit) 3415 { 3416 int ret = 0; 3417 struct btrfs_inode *inode = BTRFS_I(page->mapping->host); 3418 unsigned int cur = pg_offset; 3419 3420 ASSERT(bio_ctrl); 3421 3422 ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE && 3423 pg_offset + size <= PAGE_SIZE); 3424 if (force_bio_submit) 3425 submit_one_bio(bio_ctrl); 3426 3427 while (cur < pg_offset + size) { 3428 u32 offset = cur - pg_offset; 3429 int added; 3430 3431 /* Allocate new bio if needed */ 3432 if (!bio_ctrl->bio) { 3433 ret = alloc_new_bio(inode, bio_ctrl, wbc, opf, 3434 end_io_func, disk_bytenr, offset, 3435 page_offset(page) + cur, 3436 compress_type); 3437 if (ret < 0) 3438 return ret; 3439 } 3440 /* 3441 * We must go through btrfs_bio_add_page() to ensure each 3442 * page range won't cross various boundaries. 3443 */ 3444 if (compress_type != BTRFS_COMPRESS_NONE) 3445 added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr, 3446 size - offset, pg_offset + offset, 3447 compress_type); 3448 else 3449 added = btrfs_bio_add_page(bio_ctrl, page, 3450 disk_bytenr + offset, size - offset, 3451 pg_offset + offset, compress_type); 3452 3453 /* Metadata page range should never be split */ 3454 if (!is_data_inode(&inode->vfs_inode)) 3455 ASSERT(added == 0 || added == size - offset); 3456 3457 /* At least we added some page, update the account */ 3458 if (wbc && added) 3459 wbc_account_cgroup_owner(wbc, page, added); 3460 3461 /* We have reached boundary, submit right now */ 3462 if (added < size - offset) { 3463 /* The bio should contain some page(s) */ 3464 ASSERT(bio_ctrl->bio->bi_iter.bi_size); 3465 submit_one_bio(bio_ctrl); 3466 } 3467 cur += added; 3468 } 3469 return 0; 3470 } 3471 3472 static int attach_extent_buffer_page(struct extent_buffer *eb, 3473 struct page *page, 3474 struct btrfs_subpage *prealloc) 3475 { 3476 struct btrfs_fs_info *fs_info = eb->fs_info; 3477 int ret = 0; 3478 3479 /* 3480 * If the page is mapped to btree inode, we should hold the private 3481 * lock to prevent race. 3482 * For cloned or dummy extent buffers, their pages are not mapped and 3483 * will not race with any other ebs. 3484 */ 3485 if (page->mapping) 3486 lockdep_assert_held(&page->mapping->private_lock); 3487 3488 if (fs_info->nodesize >= PAGE_SIZE) { 3489 if (!PagePrivate(page)) 3490 attach_page_private(page, eb); 3491 else 3492 WARN_ON(page->private != (unsigned long)eb); 3493 return 0; 3494 } 3495 3496 /* Already mapped, just free prealloc */ 3497 if (PagePrivate(page)) { 3498 btrfs_free_subpage(prealloc); 3499 return 0; 3500 } 3501 3502 if (prealloc) 3503 /* Has preallocated memory for subpage */ 3504 attach_page_private(page, prealloc); 3505 else 3506 /* Do new allocation to attach subpage */ 3507 ret = btrfs_attach_subpage(fs_info, page, 3508 BTRFS_SUBPAGE_METADATA); 3509 return ret; 3510 } 3511 3512 int set_page_extent_mapped(struct page *page) 3513 { 3514 struct btrfs_fs_info *fs_info; 3515 3516 ASSERT(page->mapping); 3517 3518 if (PagePrivate(page)) 3519 return 0; 3520 3521 fs_info = btrfs_sb(page->mapping->host->i_sb); 3522 3523 if (btrfs_is_subpage(fs_info, page)) 3524 return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA); 3525 3526 attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE); 3527 return 0; 3528 } 3529 3530 void clear_page_extent_mapped(struct page *page) 3531 { 3532 struct btrfs_fs_info *fs_info; 3533 3534 ASSERT(page->mapping); 3535 3536 if (!PagePrivate(page)) 3537 return; 3538 3539 fs_info = btrfs_sb(page->mapping->host->i_sb); 3540 if (btrfs_is_subpage(fs_info, page)) 3541 return btrfs_detach_subpage(fs_info, page); 3542 3543 detach_page_private(page); 3544 } 3545 3546 static struct extent_map * 3547 __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, 3548 u64 start, u64 len, struct extent_map **em_cached) 3549 { 3550 struct extent_map *em; 3551 3552 if (em_cached && *em_cached) { 3553 em = *em_cached; 3554 if (extent_map_in_tree(em) && start >= em->start && 3555 start < extent_map_end(em)) { 3556 refcount_inc(&em->refs); 3557 return em; 3558 } 3559 3560 free_extent_map(em); 3561 *em_cached = NULL; 3562 } 3563 3564 em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len); 3565 if (em_cached && !IS_ERR(em)) { 3566 BUG_ON(*em_cached); 3567 refcount_inc(&em->refs); 3568 *em_cached = em; 3569 } 3570 return em; 3571 } 3572 /* 3573 * basic readpage implementation. Locked extent state structs are inserted 3574 * into the tree that are removed when the IO is done (by the end_io 3575 * handlers) 3576 * XXX JDM: This needs looking at to ensure proper page locking 3577 * return 0 on success, otherwise return error 3578 */ 3579 static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, 3580 struct btrfs_bio_ctrl *bio_ctrl, 3581 blk_opf_t read_flags, u64 *prev_em_start) 3582 { 3583 struct inode *inode = page->mapping->host; 3584 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3585 u64 start = page_offset(page); 3586 const u64 end = start + PAGE_SIZE - 1; 3587 u64 cur = start; 3588 u64 extent_offset; 3589 u64 last_byte = i_size_read(inode); 3590 u64 block_start; 3591 u64 cur_end; 3592 struct extent_map *em; 3593 int ret = 0; 3594 size_t pg_offset = 0; 3595 size_t iosize; 3596 size_t blocksize = inode->i_sb->s_blocksize; 3597 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 3598 3599 ret = set_page_extent_mapped(page); 3600 if (ret < 0) { 3601 unlock_extent(tree, start, end); 3602 btrfs_page_set_error(fs_info, page, start, PAGE_SIZE); 3603 unlock_page(page); 3604 goto out; 3605 } 3606 3607 if (page->index == last_byte >> PAGE_SHIFT) { 3608 size_t zero_offset = offset_in_page(last_byte); 3609 3610 if (zero_offset) { 3611 iosize = PAGE_SIZE - zero_offset; 3612 memzero_page(page, zero_offset, iosize); 3613 } 3614 } 3615 begin_page_read(fs_info, page); 3616 while (cur <= end) { 3617 unsigned long this_bio_flag = 0; 3618 bool force_bio_submit = false; 3619 u64 disk_bytenr; 3620 3621 ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); 3622 if (cur >= last_byte) { 3623 struct extent_state *cached = NULL; 3624 3625 iosize = PAGE_SIZE - pg_offset; 3626 memzero_page(page, pg_offset, iosize); 3627 set_extent_uptodate(tree, cur, cur + iosize - 1, 3628 &cached, GFP_NOFS); 3629 unlock_extent_cached(tree, cur, 3630 cur + iosize - 1, &cached); 3631 end_page_read(page, true, cur, iosize); 3632 break; 3633 } 3634 em = __get_extent_map(inode, page, pg_offset, cur, 3635 end - cur + 1, em_cached); 3636 if (IS_ERR(em)) { 3637 unlock_extent(tree, cur, end); 3638 end_page_read(page, false, cur, end + 1 - cur); 3639 ret = PTR_ERR(em); 3640 break; 3641 } 3642 extent_offset = cur - em->start; 3643 BUG_ON(extent_map_end(em) <= cur); 3644 BUG_ON(end < cur); 3645 3646 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 3647 this_bio_flag = em->compress_type; 3648 3649 iosize = min(extent_map_end(em) - cur, end - cur + 1); 3650 cur_end = min(extent_map_end(em) - 1, end); 3651 iosize = ALIGN(iosize, blocksize); 3652 if (this_bio_flag != BTRFS_COMPRESS_NONE) 3653 disk_bytenr = em->block_start; 3654 else 3655 disk_bytenr = em->block_start + extent_offset; 3656 block_start = em->block_start; 3657 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 3658 block_start = EXTENT_MAP_HOLE; 3659 3660 /* 3661 * If we have a file range that points to a compressed extent 3662 * and it's followed by a consecutive file range that points 3663 * to the same compressed extent (possibly with a different 3664 * offset and/or length, so it either points to the whole extent 3665 * or only part of it), we must make sure we do not submit a 3666 * single bio to populate the pages for the 2 ranges because 3667 * this makes the compressed extent read zero out the pages 3668 * belonging to the 2nd range. Imagine the following scenario: 3669 * 3670 * File layout 3671 * [0 - 8K] [8K - 24K] 3672 * | | 3673 * | | 3674 * points to extent X, points to extent X, 3675 * offset 4K, length of 8K offset 0, length 16K 3676 * 3677 * [extent X, compressed length = 4K uncompressed length = 16K] 3678 * 3679 * If the bio to read the compressed extent covers both ranges, 3680 * it will decompress extent X into the pages belonging to the 3681 * first range and then it will stop, zeroing out the remaining 3682 * pages that belong to the other range that points to extent X. 3683 * So here we make sure we submit 2 bios, one for the first 3684 * range and another one for the third range. Both will target 3685 * the same physical extent from disk, but we can't currently 3686 * make the compressed bio endio callback populate the pages 3687 * for both ranges because each compressed bio is tightly 3688 * coupled with a single extent map, and each range can have 3689 * an extent map with a different offset value relative to the 3690 * uncompressed data of our extent and different lengths. This 3691 * is a corner case so we prioritize correctness over 3692 * non-optimal behavior (submitting 2 bios for the same extent). 3693 */ 3694 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) && 3695 prev_em_start && *prev_em_start != (u64)-1 && 3696 *prev_em_start != em->start) 3697 force_bio_submit = true; 3698 3699 if (prev_em_start) 3700 *prev_em_start = em->start; 3701 3702 free_extent_map(em); 3703 em = NULL; 3704 3705 /* we've found a hole, just zero and go on */ 3706 if (block_start == EXTENT_MAP_HOLE) { 3707 struct extent_state *cached = NULL; 3708 3709 memzero_page(page, pg_offset, iosize); 3710 3711 set_extent_uptodate(tree, cur, cur + iosize - 1, 3712 &cached, GFP_NOFS); 3713 unlock_extent_cached(tree, cur, 3714 cur + iosize - 1, &cached); 3715 end_page_read(page, true, cur, iosize); 3716 cur = cur + iosize; 3717 pg_offset += iosize; 3718 continue; 3719 } 3720 /* the get_extent function already copied into the page */ 3721 if (test_range_bit(tree, cur, cur_end, 3722 EXTENT_UPTODATE, 1, NULL)) { 3723 unlock_extent(tree, cur, cur + iosize - 1); 3724 end_page_read(page, true, cur, iosize); 3725 cur = cur + iosize; 3726 pg_offset += iosize; 3727 continue; 3728 } 3729 /* we have an inline extent but it didn't get marked up 3730 * to date. Error out 3731 */ 3732 if (block_start == EXTENT_MAP_INLINE) { 3733 unlock_extent(tree, cur, cur + iosize - 1); 3734 end_page_read(page, false, cur, iosize); 3735 cur = cur + iosize; 3736 pg_offset += iosize; 3737 continue; 3738 } 3739 3740 ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, 3741 bio_ctrl, page, disk_bytenr, iosize, 3742 pg_offset, end_bio_extent_readpage, 3743 this_bio_flag, force_bio_submit); 3744 if (ret) { 3745 /* 3746 * We have to unlock the remaining range, or the page 3747 * will never be unlocked. 3748 */ 3749 unlock_extent(tree, cur, end); 3750 end_page_read(page, false, cur, end + 1 - cur); 3751 goto out; 3752 } 3753 cur = cur + iosize; 3754 pg_offset += iosize; 3755 } 3756 out: 3757 return ret; 3758 } 3759 3760 int btrfs_read_folio(struct file *file, struct folio *folio) 3761 { 3762 struct page *page = &folio->page; 3763 struct btrfs_inode *inode = BTRFS_I(page->mapping->host); 3764 u64 start = page_offset(page); 3765 u64 end = start + PAGE_SIZE - 1; 3766 struct btrfs_bio_ctrl bio_ctrl = { 0 }; 3767 int ret; 3768 3769 btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); 3770 3771 ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL); 3772 /* 3773 * If btrfs_do_readpage() failed we will want to submit the assembled 3774 * bio to do the cleanup. 3775 */ 3776 submit_one_bio(&bio_ctrl); 3777 return ret; 3778 } 3779 3780 static inline void contiguous_readpages(struct page *pages[], int nr_pages, 3781 u64 start, u64 end, 3782 struct extent_map **em_cached, 3783 struct btrfs_bio_ctrl *bio_ctrl, 3784 u64 *prev_em_start) 3785 { 3786 struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host); 3787 int index; 3788 3789 btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); 3790 3791 for (index = 0; index < nr_pages; index++) { 3792 btrfs_do_readpage(pages[index], em_cached, bio_ctrl, 3793 REQ_RAHEAD, prev_em_start); 3794 put_page(pages[index]); 3795 } 3796 } 3797 3798 /* 3799 * helper for __extent_writepage, doing all of the delayed allocation setup. 3800 * 3801 * This returns 1 if btrfs_run_delalloc_range function did all the work required 3802 * to write the page (copy into inline extent). In this case the IO has 3803 * been started and the page is already unlocked. 3804 * 3805 * This returns 0 if all went well (page still locked) 3806 * This returns < 0 if there were errors (page still locked) 3807 */ 3808 static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, 3809 struct page *page, struct writeback_control *wbc) 3810 { 3811 const u64 page_end = page_offset(page) + PAGE_SIZE - 1; 3812 u64 delalloc_start = page_offset(page); 3813 u64 delalloc_to_write = 0; 3814 /* How many pages are started by btrfs_run_delalloc_range() */ 3815 unsigned long nr_written = 0; 3816 int ret; 3817 int page_started = 0; 3818 3819 while (delalloc_start < page_end) { 3820 u64 delalloc_end = page_end; 3821 bool found; 3822 3823 found = find_lock_delalloc_range(&inode->vfs_inode, page, 3824 &delalloc_start, 3825 &delalloc_end); 3826 if (!found) { 3827 delalloc_start = delalloc_end + 1; 3828 continue; 3829 } 3830 ret = btrfs_run_delalloc_range(inode, page, delalloc_start, 3831 delalloc_end, &page_started, &nr_written, wbc); 3832 if (ret) { 3833 btrfs_page_set_error(inode->root->fs_info, page, 3834 page_offset(page), PAGE_SIZE); 3835 return ret; 3836 } 3837 /* 3838 * delalloc_end is already one less than the total length, so 3839 * we don't subtract one from PAGE_SIZE 3840 */ 3841 delalloc_to_write += (delalloc_end - delalloc_start + 3842 PAGE_SIZE) >> PAGE_SHIFT; 3843 delalloc_start = delalloc_end + 1; 3844 } 3845 if (wbc->nr_to_write < delalloc_to_write) { 3846 int thresh = 8192; 3847 3848 if (delalloc_to_write < thresh * 2) 3849 thresh = delalloc_to_write; 3850 wbc->nr_to_write = min_t(u64, delalloc_to_write, 3851 thresh); 3852 } 3853 3854 /* Did btrfs_run_dealloc_range() already unlock and start the IO? */ 3855 if (page_started) { 3856 /* 3857 * We've unlocked the page, so we can't update the mapping's 3858 * writeback index, just update nr_to_write. 3859 */ 3860 wbc->nr_to_write -= nr_written; 3861 return 1; 3862 } 3863 3864 return 0; 3865 } 3866 3867 /* 3868 * Find the first byte we need to write. 3869 * 3870 * For subpage, one page can contain several sectors, and 3871 * __extent_writepage_io() will just grab all extent maps in the page 3872 * range and try to submit all non-inline/non-compressed extents. 3873 * 3874 * This is a big problem for subpage, we shouldn't re-submit already written 3875 * data at all. 3876 * This function will lookup subpage dirty bit to find which range we really 3877 * need to submit. 3878 * 3879 * Return the next dirty range in [@start, @end). 3880 * If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE. 3881 */ 3882 static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, 3883 struct page *page, u64 *start, u64 *end) 3884 { 3885 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; 3886 struct btrfs_subpage_info *spi = fs_info->subpage_info; 3887 u64 orig_start = *start; 3888 /* Declare as unsigned long so we can use bitmap ops */ 3889 unsigned long flags; 3890 int range_start_bit; 3891 int range_end_bit; 3892 3893 /* 3894 * For regular sector size == page size case, since one page only 3895 * contains one sector, we return the page offset directly. 3896 */ 3897 if (!btrfs_is_subpage(fs_info, page)) { 3898 *start = page_offset(page); 3899 *end = page_offset(page) + PAGE_SIZE; 3900 return; 3901 } 3902 3903 range_start_bit = spi->dirty_offset + 3904 (offset_in_page(orig_start) >> fs_info->sectorsize_bits); 3905 3906 /* We should have the page locked, but just in case */ 3907 spin_lock_irqsave(&subpage->lock, flags); 3908 bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit, 3909 spi->dirty_offset + spi->bitmap_nr_bits); 3910 spin_unlock_irqrestore(&subpage->lock, flags); 3911 3912 range_start_bit -= spi->dirty_offset; 3913 range_end_bit -= spi->dirty_offset; 3914 3915 *start = page_offset(page) + range_start_bit * fs_info->sectorsize; 3916 *end = page_offset(page) + range_end_bit * fs_info->sectorsize; 3917 } 3918 3919 /* 3920 * helper for __extent_writepage. This calls the writepage start hooks, 3921 * and does the loop to map the page into extents and bios. 3922 * 3923 * We return 1 if the IO is started and the page is unlocked, 3924 * 0 if all went well (page still locked) 3925 * < 0 if there were errors (page still locked) 3926 */ 3927 static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, 3928 struct page *page, 3929 struct writeback_control *wbc, 3930 struct extent_page_data *epd, 3931 loff_t i_size, 3932 int *nr_ret) 3933 { 3934 struct btrfs_fs_info *fs_info = inode->root->fs_info; 3935 u64 cur = page_offset(page); 3936 u64 end = cur + PAGE_SIZE - 1; 3937 u64 extent_offset; 3938 u64 block_start; 3939 struct extent_map *em; 3940 int saved_ret = 0; 3941 int ret = 0; 3942 int nr = 0; 3943 enum req_op op = REQ_OP_WRITE; 3944 const blk_opf_t write_flags = wbc_to_write_flags(wbc); 3945 bool has_error = false; 3946 bool compressed; 3947 3948 ret = btrfs_writepage_cow_fixup(page); 3949 if (ret) { 3950 /* Fixup worker will requeue */ 3951 redirty_page_for_writepage(wbc, page); 3952 unlock_page(page); 3953 return 1; 3954 } 3955 3956 /* 3957 * we don't want to touch the inode after unlocking the page, 3958 * so we update the mapping writeback index now 3959 */ 3960 wbc->nr_to_write--; 3961 3962 while (cur <= end) { 3963 u64 disk_bytenr; 3964 u64 em_end; 3965 u64 dirty_range_start = cur; 3966 u64 dirty_range_end; 3967 u32 iosize; 3968 3969 if (cur >= i_size) { 3970 btrfs_writepage_endio_finish_ordered(inode, page, cur, 3971 end, true); 3972 /* 3973 * This range is beyond i_size, thus we don't need to 3974 * bother writing back. 3975 * But we still need to clear the dirty subpage bit, or 3976 * the next time the page gets dirtied, we will try to 3977 * writeback the sectors with subpage dirty bits, 3978 * causing writeback without ordered extent. 3979 */ 3980 btrfs_page_clear_dirty(fs_info, page, cur, end + 1 - cur); 3981 break; 3982 } 3983 3984 find_next_dirty_byte(fs_info, page, &dirty_range_start, 3985 &dirty_range_end); 3986 if (cur < dirty_range_start) { 3987 cur = dirty_range_start; 3988 continue; 3989 } 3990 3991 em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1); 3992 if (IS_ERR(em)) { 3993 btrfs_page_set_error(fs_info, page, cur, end - cur + 1); 3994 ret = PTR_ERR_OR_ZERO(em); 3995 has_error = true; 3996 if (!saved_ret) 3997 saved_ret = ret; 3998 break; 3999 } 4000 4001 extent_offset = cur - em->start; 4002 em_end = extent_map_end(em); 4003 ASSERT(cur <= em_end); 4004 ASSERT(cur < end); 4005 ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize)); 4006 ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize)); 4007 block_start = em->block_start; 4008 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 4009 disk_bytenr = em->block_start + extent_offset; 4010 4011 /* 4012 * Note that em_end from extent_map_end() and dirty_range_end from 4013 * find_next_dirty_byte() are all exclusive 4014 */ 4015 iosize = min(min(em_end, end + 1), dirty_range_end) - cur; 4016 4017 if (btrfs_use_zone_append(inode, em->block_start)) 4018 op = REQ_OP_ZONE_APPEND; 4019 4020 free_extent_map(em); 4021 em = NULL; 4022 4023 /* 4024 * compressed and inline extents are written through other 4025 * paths in the FS 4026 */ 4027 if (compressed || block_start == EXTENT_MAP_HOLE || 4028 block_start == EXTENT_MAP_INLINE) { 4029 if (compressed) 4030 nr++; 4031 else 4032 btrfs_writepage_endio_finish_ordered(inode, 4033 page, cur, cur + iosize - 1, true); 4034 btrfs_page_clear_dirty(fs_info, page, cur, iosize); 4035 cur += iosize; 4036 continue; 4037 } 4038 4039 btrfs_set_range_writeback(inode, cur, cur + iosize - 1); 4040 if (!PageWriteback(page)) { 4041 btrfs_err(inode->root->fs_info, 4042 "page %lu not writeback, cur %llu end %llu", 4043 page->index, cur, end); 4044 } 4045 4046 /* 4047 * Although the PageDirty bit is cleared before entering this 4048 * function, subpage dirty bit is not cleared. 4049 * So clear subpage dirty bit here so next time we won't submit 4050 * page for range already written to disk. 4051 */ 4052 btrfs_page_clear_dirty(fs_info, page, cur, iosize); 4053 4054 ret = submit_extent_page(op | write_flags, wbc, 4055 &epd->bio_ctrl, page, 4056 disk_bytenr, iosize, 4057 cur - page_offset(page), 4058 end_bio_extent_writepage, 4059 0, false); 4060 if (ret) { 4061 has_error = true; 4062 if (!saved_ret) 4063 saved_ret = ret; 4064 4065 btrfs_page_set_error(fs_info, page, cur, iosize); 4066 if (PageWriteback(page)) 4067 btrfs_page_clear_writeback(fs_info, page, cur, 4068 iosize); 4069 } 4070 4071 cur += iosize; 4072 nr++; 4073 } 4074 /* 4075 * If we finish without problem, we should not only clear page dirty, 4076 * but also empty subpage dirty bits 4077 */ 4078 if (!has_error) 4079 btrfs_page_assert_not_dirty(fs_info, page); 4080 else 4081 ret = saved_ret; 4082 *nr_ret = nr; 4083 return ret; 4084 } 4085 4086 /* 4087 * the writepage semantics are similar to regular writepage. extent 4088 * records are inserted to lock ranges in the tree, and as dirty areas 4089 * are found, they are marked writeback. Then the lock bits are removed 4090 * and the end_io handler clears the writeback ranges 4091 * 4092 * Return 0 if everything goes well. 4093 * Return <0 for error. 4094 */ 4095 static int __extent_writepage(struct page *page, struct writeback_control *wbc, 4096 struct extent_page_data *epd) 4097 { 4098 struct folio *folio = page_folio(page); 4099 struct inode *inode = page->mapping->host; 4100 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 4101 const u64 page_start = page_offset(page); 4102 const u64 page_end = page_start + PAGE_SIZE - 1; 4103 int ret; 4104 int nr = 0; 4105 size_t pg_offset; 4106 loff_t i_size = i_size_read(inode); 4107 unsigned long end_index = i_size >> PAGE_SHIFT; 4108 4109 trace___extent_writepage(page, inode, wbc); 4110 4111 WARN_ON(!PageLocked(page)); 4112 4113 btrfs_page_clear_error(btrfs_sb(inode->i_sb), page, 4114 page_offset(page), PAGE_SIZE); 4115 4116 pg_offset = offset_in_page(i_size); 4117 if (page->index > end_index || 4118 (page->index == end_index && !pg_offset)) { 4119 folio_invalidate(folio, 0, folio_size(folio)); 4120 folio_unlock(folio); 4121 return 0; 4122 } 4123 4124 if (page->index == end_index) 4125 memzero_page(page, pg_offset, PAGE_SIZE - pg_offset); 4126 4127 ret = set_page_extent_mapped(page); 4128 if (ret < 0) { 4129 SetPageError(page); 4130 goto done; 4131 } 4132 4133 if (!epd->extent_locked) { 4134 ret = writepage_delalloc(BTRFS_I(inode), page, wbc); 4135 if (ret == 1) 4136 return 0; 4137 if (ret) 4138 goto done; 4139 } 4140 4141 ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size, 4142 &nr); 4143 if (ret == 1) 4144 return 0; 4145 4146 done: 4147 if (nr == 0) { 4148 /* make sure the mapping tag for page dirty gets cleared */ 4149 set_page_writeback(page); 4150 end_page_writeback(page); 4151 } 4152 /* 4153 * Here we used to have a check for PageError() and then set @ret and 4154 * call end_extent_writepage(). 4155 * 4156 * But in fact setting @ret here will cause different error paths 4157 * between subpage and regular sectorsize. 4158 * 4159 * For regular page size, we never submit current page, but only add 4160 * current page to current bio. 4161 * The bio submission can only happen in next page. 4162 * Thus if we hit the PageError() branch, @ret is already set to 4163 * non-zero value and will not get updated for regular sectorsize. 4164 * 4165 * But for subpage case, it's possible we submit part of current page, 4166 * thus can get PageError() set by submitted bio of the same page, 4167 * while our @ret is still 0. 4168 * 4169 * So here we unify the behavior and don't set @ret. 4170 * Error can still be properly passed to higher layer as page will 4171 * be set error, here we just don't handle the IO failure. 4172 * 4173 * NOTE: This is just a hotfix for subpage. 4174 * The root fix will be properly ending ordered extent when we hit 4175 * an error during writeback. 4176 * 4177 * But that needs a bigger refactoring, as we not only need to grab the 4178 * submitted OE, but also need to know exactly at which bytenr we hit 4179 * the error. 4180 * Currently the full page based __extent_writepage_io() is not 4181 * capable of that. 4182 */ 4183 if (PageError(page)) 4184 end_extent_writepage(page, ret, page_start, page_end); 4185 if (epd->extent_locked) { 4186 /* 4187 * If epd->extent_locked, it's from extent_write_locked_range(), 4188 * the page can either be locked by lock_page() or 4189 * process_one_page(). 4190 * Let btrfs_page_unlock_writer() handle both cases. 4191 */ 4192 ASSERT(wbc); 4193 btrfs_page_unlock_writer(fs_info, page, wbc->range_start, 4194 wbc->range_end + 1 - wbc->range_start); 4195 } else { 4196 unlock_page(page); 4197 } 4198 ASSERT(ret <= 0); 4199 return ret; 4200 } 4201 4202 void wait_on_extent_buffer_writeback(struct extent_buffer *eb) 4203 { 4204 wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK, 4205 TASK_UNINTERRUPTIBLE); 4206 } 4207 4208 static void end_extent_buffer_writeback(struct extent_buffer *eb) 4209 { 4210 clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 4211 smp_mb__after_atomic(); 4212 wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); 4213 } 4214 4215 /* 4216 * Lock extent buffer status and pages for writeback. 4217 * 4218 * May try to flush write bio if we can't get the lock. 4219 * 4220 * Return 0 if the extent buffer doesn't need to be submitted. 4221 * (E.g. the extent buffer is not dirty) 4222 * Return >0 is the extent buffer is submitted to bio. 4223 * Return <0 if something went wrong, no page is locked. 4224 */ 4225 static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb, 4226 struct extent_page_data *epd) 4227 { 4228 struct btrfs_fs_info *fs_info = eb->fs_info; 4229 int i, num_pages; 4230 int flush = 0; 4231 int ret = 0; 4232 4233 if (!btrfs_try_tree_write_lock(eb)) { 4234 submit_write_bio(epd, 0); 4235 flush = 1; 4236 btrfs_tree_lock(eb); 4237 } 4238 4239 if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { 4240 btrfs_tree_unlock(eb); 4241 if (!epd->sync_io) 4242 return 0; 4243 if (!flush) { 4244 submit_write_bio(epd, 0); 4245 flush = 1; 4246 } 4247 while (1) { 4248 wait_on_extent_buffer_writeback(eb); 4249 btrfs_tree_lock(eb); 4250 if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) 4251 break; 4252 btrfs_tree_unlock(eb); 4253 } 4254 } 4255 4256 /* 4257 * We need to do this to prevent races in people who check if the eb is 4258 * under IO since we can end up having no IO bits set for a short period 4259 * of time. 4260 */ 4261 spin_lock(&eb->refs_lock); 4262 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 4263 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 4264 spin_unlock(&eb->refs_lock); 4265 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 4266 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, 4267 -eb->len, 4268 fs_info->dirty_metadata_batch); 4269 ret = 1; 4270 } else { 4271 spin_unlock(&eb->refs_lock); 4272 } 4273 4274 btrfs_tree_unlock(eb); 4275 4276 /* 4277 * Either we don't need to submit any tree block, or we're submitting 4278 * subpage eb. 4279 * Subpage metadata doesn't use page locking at all, so we can skip 4280 * the page locking. 4281 */ 4282 if (!ret || fs_info->nodesize < PAGE_SIZE) 4283 return ret; 4284 4285 num_pages = num_extent_pages(eb); 4286 for (i = 0; i < num_pages; i++) { 4287 struct page *p = eb->pages[i]; 4288 4289 if (!trylock_page(p)) { 4290 if (!flush) { 4291 submit_write_bio(epd, 0); 4292 flush = 1; 4293 } 4294 lock_page(p); 4295 } 4296 } 4297 4298 return ret; 4299 } 4300 4301 static void set_btree_ioerr(struct page *page, struct extent_buffer *eb) 4302 { 4303 struct btrfs_fs_info *fs_info = eb->fs_info; 4304 4305 btrfs_page_set_error(fs_info, page, eb->start, eb->len); 4306 if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) 4307 return; 4308 4309 /* 4310 * A read may stumble upon this buffer later, make sure that it gets an 4311 * error and knows there was an error. 4312 */ 4313 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 4314 4315 /* 4316 * We need to set the mapping with the io error as well because a write 4317 * error will flip the file system readonly, and then syncfs() will 4318 * return a 0 because we are readonly if we don't modify the err seq for 4319 * the superblock. 4320 */ 4321 mapping_set_error(page->mapping, -EIO); 4322 4323 /* 4324 * If we error out, we should add back the dirty_metadata_bytes 4325 * to make it consistent. 4326 */ 4327 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, 4328 eb->len, fs_info->dirty_metadata_batch); 4329 4330 /* 4331 * If writeback for a btree extent that doesn't belong to a log tree 4332 * failed, increment the counter transaction->eb_write_errors. 4333 * We do this because while the transaction is running and before it's 4334 * committing (when we call filemap_fdata[write|wait]_range against 4335 * the btree inode), we might have 4336 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it 4337 * returns an error or an error happens during writeback, when we're 4338 * committing the transaction we wouldn't know about it, since the pages 4339 * can be no longer dirty nor marked anymore for writeback (if a 4340 * subsequent modification to the extent buffer didn't happen before the 4341 * transaction commit), which makes filemap_fdata[write|wait]_range not 4342 * able to find the pages tagged with SetPageError at transaction 4343 * commit time. So if this happens we must abort the transaction, 4344 * otherwise we commit a super block with btree roots that point to 4345 * btree nodes/leafs whose content on disk is invalid - either garbage 4346 * or the content of some node/leaf from a past generation that got 4347 * cowed or deleted and is no longer valid. 4348 * 4349 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would 4350 * not be enough - we need to distinguish between log tree extents vs 4351 * non-log tree extents, and the next filemap_fdatawait_range() call 4352 * will catch and clear such errors in the mapping - and that call might 4353 * be from a log sync and not from a transaction commit. Also, checking 4354 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is 4355 * not done and would not be reliable - the eb might have been released 4356 * from memory and reading it back again means that flag would not be 4357 * set (since it's a runtime flag, not persisted on disk). 4358 * 4359 * Using the flags below in the btree inode also makes us achieve the 4360 * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started 4361 * writeback for all dirty pages and before filemap_fdatawait_range() 4362 * is called, the writeback for all dirty pages had already finished 4363 * with errors - because we were not using AS_EIO/AS_ENOSPC, 4364 * filemap_fdatawait_range() would return success, as it could not know 4365 * that writeback errors happened (the pages were no longer tagged for 4366 * writeback). 4367 */ 4368 switch (eb->log_index) { 4369 case -1: 4370 set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags); 4371 break; 4372 case 0: 4373 set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags); 4374 break; 4375 case 1: 4376 set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags); 4377 break; 4378 default: 4379 BUG(); /* unexpected, logic error */ 4380 } 4381 } 4382 4383 /* 4384 * The endio specific version which won't touch any unsafe spinlock in endio 4385 * context. 4386 */ 4387 static struct extent_buffer *find_extent_buffer_nolock( 4388 struct btrfs_fs_info *fs_info, u64 start) 4389 { 4390 struct extent_buffer *eb; 4391 4392 rcu_read_lock(); 4393 eb = radix_tree_lookup(&fs_info->buffer_radix, 4394 start >> fs_info->sectorsize_bits); 4395 if (eb && atomic_inc_not_zero(&eb->refs)) { 4396 rcu_read_unlock(); 4397 return eb; 4398 } 4399 rcu_read_unlock(); 4400 return NULL; 4401 } 4402 4403 /* 4404 * The endio function for subpage extent buffer write. 4405 * 4406 * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback() 4407 * after all extent buffers in the page has finished their writeback. 4408 */ 4409 static void end_bio_subpage_eb_writepage(struct bio *bio) 4410 { 4411 struct btrfs_fs_info *fs_info; 4412 struct bio_vec *bvec; 4413 struct bvec_iter_all iter_all; 4414 4415 fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb); 4416 ASSERT(fs_info->nodesize < PAGE_SIZE); 4417 4418 ASSERT(!bio_flagged(bio, BIO_CLONED)); 4419 bio_for_each_segment_all(bvec, bio, iter_all) { 4420 struct page *page = bvec->bv_page; 4421 u64 bvec_start = page_offset(page) + bvec->bv_offset; 4422 u64 bvec_end = bvec_start + bvec->bv_len - 1; 4423 u64 cur_bytenr = bvec_start; 4424 4425 ASSERT(IS_ALIGNED(bvec->bv_len, fs_info->nodesize)); 4426 4427 /* Iterate through all extent buffers in the range */ 4428 while (cur_bytenr <= bvec_end) { 4429 struct extent_buffer *eb; 4430 int done; 4431 4432 /* 4433 * Here we can't use find_extent_buffer(), as it may 4434 * try to lock eb->refs_lock, which is not safe in endio 4435 * context. 4436 */ 4437 eb = find_extent_buffer_nolock(fs_info, cur_bytenr); 4438 ASSERT(eb); 4439 4440 cur_bytenr = eb->start + eb->len; 4441 4442 ASSERT(test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)); 4443 done = atomic_dec_and_test(&eb->io_pages); 4444 ASSERT(done); 4445 4446 if (bio->bi_status || 4447 test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { 4448 ClearPageUptodate(page); 4449 set_btree_ioerr(page, eb); 4450 } 4451 4452 btrfs_subpage_clear_writeback(fs_info, page, eb->start, 4453 eb->len); 4454 end_extent_buffer_writeback(eb); 4455 /* 4456 * free_extent_buffer() will grab spinlock which is not 4457 * safe in endio context. Thus here we manually dec 4458 * the ref. 4459 */ 4460 atomic_dec(&eb->refs); 4461 } 4462 } 4463 bio_put(bio); 4464 } 4465 4466 static void end_bio_extent_buffer_writepage(struct bio *bio) 4467 { 4468 struct bio_vec *bvec; 4469 struct extent_buffer *eb; 4470 int done; 4471 struct bvec_iter_all iter_all; 4472 4473 ASSERT(!bio_flagged(bio, BIO_CLONED)); 4474 bio_for_each_segment_all(bvec, bio, iter_all) { 4475 struct page *page = bvec->bv_page; 4476 4477 eb = (struct extent_buffer *)page->private; 4478 BUG_ON(!eb); 4479 done = atomic_dec_and_test(&eb->io_pages); 4480 4481 if (bio->bi_status || 4482 test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { 4483 ClearPageUptodate(page); 4484 set_btree_ioerr(page, eb); 4485 } 4486 4487 end_page_writeback(page); 4488 4489 if (!done) 4490 continue; 4491 4492 end_extent_buffer_writeback(eb); 4493 } 4494 4495 bio_put(bio); 4496 } 4497 4498 static void prepare_eb_write(struct extent_buffer *eb) 4499 { 4500 u32 nritems; 4501 unsigned long start; 4502 unsigned long end; 4503 4504 clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); 4505 atomic_set(&eb->io_pages, num_extent_pages(eb)); 4506 4507 /* Set btree blocks beyond nritems with 0 to avoid stale content */ 4508 nritems = btrfs_header_nritems(eb); 4509 if (btrfs_header_level(eb) > 0) { 4510 end = btrfs_node_key_ptr_offset(nritems); 4511 memzero_extent_buffer(eb, end, eb->len - end); 4512 } else { 4513 /* 4514 * Leaf: 4515 * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0 4516 */ 4517 start = btrfs_item_nr_offset(nritems); 4518 end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb); 4519 memzero_extent_buffer(eb, start, end - start); 4520 } 4521 } 4522 4523 /* 4524 * Unlike the work in write_one_eb(), we rely completely on extent locking. 4525 * Page locking is only utilized at minimum to keep the VMM code happy. 4526 */ 4527 static int write_one_subpage_eb(struct extent_buffer *eb, 4528 struct writeback_control *wbc, 4529 struct extent_page_data *epd) 4530 { 4531 struct btrfs_fs_info *fs_info = eb->fs_info; 4532 struct page *page = eb->pages[0]; 4533 blk_opf_t write_flags = wbc_to_write_flags(wbc); 4534 bool no_dirty_ebs = false; 4535 int ret; 4536 4537 prepare_eb_write(eb); 4538 4539 /* clear_page_dirty_for_io() in subpage helper needs page locked */ 4540 lock_page(page); 4541 btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len); 4542 4543 /* Check if this is the last dirty bit to update nr_written */ 4544 no_dirty_ebs = btrfs_subpage_clear_and_test_dirty(fs_info, page, 4545 eb->start, eb->len); 4546 if (no_dirty_ebs) 4547 clear_page_dirty_for_io(page); 4548 4549 ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, 4550 &epd->bio_ctrl, page, eb->start, eb->len, 4551 eb->start - page_offset(page), 4552 end_bio_subpage_eb_writepage, 0, false); 4553 if (ret) { 4554 btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len); 4555 set_btree_ioerr(page, eb); 4556 unlock_page(page); 4557 4558 if (atomic_dec_and_test(&eb->io_pages)) 4559 end_extent_buffer_writeback(eb); 4560 return -EIO; 4561 } 4562 unlock_page(page); 4563 /* 4564 * Submission finished without problem, if no range of the page is 4565 * dirty anymore, we have submitted a page. Update nr_written in wbc. 4566 */ 4567 if (no_dirty_ebs) 4568 wbc->nr_to_write--; 4569 return ret; 4570 } 4571 4572 static noinline_for_stack int write_one_eb(struct extent_buffer *eb, 4573 struct writeback_control *wbc, 4574 struct extent_page_data *epd) 4575 { 4576 u64 disk_bytenr = eb->start; 4577 int i, num_pages; 4578 blk_opf_t write_flags = wbc_to_write_flags(wbc); 4579 int ret = 0; 4580 4581 prepare_eb_write(eb); 4582 4583 num_pages = num_extent_pages(eb); 4584 for (i = 0; i < num_pages; i++) { 4585 struct page *p = eb->pages[i]; 4586 4587 clear_page_dirty_for_io(p); 4588 set_page_writeback(p); 4589 ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, 4590 &epd->bio_ctrl, p, disk_bytenr, 4591 PAGE_SIZE, 0, 4592 end_bio_extent_buffer_writepage, 4593 0, false); 4594 if (ret) { 4595 set_btree_ioerr(p, eb); 4596 if (PageWriteback(p)) 4597 end_page_writeback(p); 4598 if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) 4599 end_extent_buffer_writeback(eb); 4600 ret = -EIO; 4601 break; 4602 } 4603 disk_bytenr += PAGE_SIZE; 4604 wbc->nr_to_write--; 4605 unlock_page(p); 4606 } 4607 4608 if (unlikely(ret)) { 4609 for (; i < num_pages; i++) { 4610 struct page *p = eb->pages[i]; 4611 clear_page_dirty_for_io(p); 4612 unlock_page(p); 4613 } 4614 } 4615 4616 return ret; 4617 } 4618 4619 /* 4620 * Submit one subpage btree page. 4621 * 4622 * The main difference to submit_eb_page() is: 4623 * - Page locking 4624 * For subpage, we don't rely on page locking at all. 4625 * 4626 * - Flush write bio 4627 * We only flush bio if we may be unable to fit current extent buffers into 4628 * current bio. 4629 * 4630 * Return >=0 for the number of submitted extent buffers. 4631 * Return <0 for fatal error. 4632 */ 4633 static int submit_eb_subpage(struct page *page, 4634 struct writeback_control *wbc, 4635 struct extent_page_data *epd) 4636 { 4637 struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); 4638 int submitted = 0; 4639 u64 page_start = page_offset(page); 4640 int bit_start = 0; 4641 int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits; 4642 int ret; 4643 4644 /* Lock and write each dirty extent buffers in the range */ 4645 while (bit_start < fs_info->subpage_info->bitmap_nr_bits) { 4646 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; 4647 struct extent_buffer *eb; 4648 unsigned long flags; 4649 u64 start; 4650 4651 /* 4652 * Take private lock to ensure the subpage won't be detached 4653 * in the meantime. 4654 */ 4655 spin_lock(&page->mapping->private_lock); 4656 if (!PagePrivate(page)) { 4657 spin_unlock(&page->mapping->private_lock); 4658 break; 4659 } 4660 spin_lock_irqsave(&subpage->lock, flags); 4661 if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset, 4662 subpage->bitmaps)) { 4663 spin_unlock_irqrestore(&subpage->lock, flags); 4664 spin_unlock(&page->mapping->private_lock); 4665 bit_start++; 4666 continue; 4667 } 4668 4669 start = page_start + bit_start * fs_info->sectorsize; 4670 bit_start += sectors_per_node; 4671 4672 /* 4673 * Here we just want to grab the eb without touching extra 4674 * spin locks, so call find_extent_buffer_nolock(). 4675 */ 4676 eb = find_extent_buffer_nolock(fs_info, start); 4677 spin_unlock_irqrestore(&subpage->lock, flags); 4678 spin_unlock(&page->mapping->private_lock); 4679 4680 /* 4681 * The eb has already reached 0 refs thus find_extent_buffer() 4682 * doesn't return it. We don't need to write back such eb 4683 * anyway. 4684 */ 4685 if (!eb) 4686 continue; 4687 4688 ret = lock_extent_buffer_for_io(eb, epd); 4689 if (ret == 0) { 4690 free_extent_buffer(eb); 4691 continue; 4692 } 4693 if (ret < 0) { 4694 free_extent_buffer(eb); 4695 goto cleanup; 4696 } 4697 ret = write_one_subpage_eb(eb, wbc, epd); 4698 free_extent_buffer(eb); 4699 if (ret < 0) 4700 goto cleanup; 4701 submitted++; 4702 } 4703 return submitted; 4704 4705 cleanup: 4706 /* We hit error, end bio for the submitted extent buffers */ 4707 submit_write_bio(epd, ret); 4708 return ret; 4709 } 4710 4711 /* 4712 * Submit all page(s) of one extent buffer. 4713 * 4714 * @page: the page of one extent buffer 4715 * @eb_context: to determine if we need to submit this page, if current page 4716 * belongs to this eb, we don't need to submit 4717 * 4718 * The caller should pass each page in their bytenr order, and here we use 4719 * @eb_context to determine if we have submitted pages of one extent buffer. 4720 * 4721 * If we have, we just skip until we hit a new page that doesn't belong to 4722 * current @eb_context. 4723 * 4724 * If not, we submit all the page(s) of the extent buffer. 4725 * 4726 * Return >0 if we have submitted the extent buffer successfully. 4727 * Return 0 if we don't need to submit the page, as it's already submitted by 4728 * previous call. 4729 * Return <0 for fatal error. 4730 */ 4731 static int submit_eb_page(struct page *page, struct writeback_control *wbc, 4732 struct extent_page_data *epd, 4733 struct extent_buffer **eb_context) 4734 { 4735 struct address_space *mapping = page->mapping; 4736 struct btrfs_block_group *cache = NULL; 4737 struct extent_buffer *eb; 4738 int ret; 4739 4740 if (!PagePrivate(page)) 4741 return 0; 4742 4743 if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) 4744 return submit_eb_subpage(page, wbc, epd); 4745 4746 spin_lock(&mapping->private_lock); 4747 if (!PagePrivate(page)) { 4748 spin_unlock(&mapping->private_lock); 4749 return 0; 4750 } 4751 4752 eb = (struct extent_buffer *)page->private; 4753 4754 /* 4755 * Shouldn't happen and normally this would be a BUG_ON but no point 4756 * crashing the machine for something we can survive anyway. 4757 */ 4758 if (WARN_ON(!eb)) { 4759 spin_unlock(&mapping->private_lock); 4760 return 0; 4761 } 4762 4763 if (eb == *eb_context) { 4764 spin_unlock(&mapping->private_lock); 4765 return 0; 4766 } 4767 ret = atomic_inc_not_zero(&eb->refs); 4768 spin_unlock(&mapping->private_lock); 4769 if (!ret) 4770 return 0; 4771 4772 if (!btrfs_check_meta_write_pointer(eb->fs_info, eb, &cache)) { 4773 /* 4774 * If for_sync, this hole will be filled with 4775 * trasnsaction commit. 4776 */ 4777 if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) 4778 ret = -EAGAIN; 4779 else 4780 ret = 0; 4781 free_extent_buffer(eb); 4782 return ret; 4783 } 4784 4785 *eb_context = eb; 4786 4787 ret = lock_extent_buffer_for_io(eb, epd); 4788 if (ret <= 0) { 4789 btrfs_revert_meta_write_pointer(cache, eb); 4790 if (cache) 4791 btrfs_put_block_group(cache); 4792 free_extent_buffer(eb); 4793 return ret; 4794 } 4795 if (cache) { 4796 /* 4797 * Implies write in zoned mode. Mark the last eb in a block group. 4798 */ 4799 btrfs_schedule_zone_finish_bg(cache, eb); 4800 btrfs_put_block_group(cache); 4801 } 4802 ret = write_one_eb(eb, wbc, epd); 4803 free_extent_buffer(eb); 4804 if (ret < 0) 4805 return ret; 4806 return 1; 4807 } 4808 4809 int btree_write_cache_pages(struct address_space *mapping, 4810 struct writeback_control *wbc) 4811 { 4812 struct extent_buffer *eb_context = NULL; 4813 struct extent_page_data epd = { 4814 .bio_ctrl = { 0 }, 4815 .extent_locked = 0, 4816 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 4817 }; 4818 struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; 4819 int ret = 0; 4820 int done = 0; 4821 int nr_to_write_done = 0; 4822 struct pagevec pvec; 4823 int nr_pages; 4824 pgoff_t index; 4825 pgoff_t end; /* Inclusive */ 4826 int scanned = 0; 4827 xa_mark_t tag; 4828 4829 pagevec_init(&pvec); 4830 if (wbc->range_cyclic) { 4831 index = mapping->writeback_index; /* Start from prev offset */ 4832 end = -1; 4833 /* 4834 * Start from the beginning does not need to cycle over the 4835 * range, mark it as scanned. 4836 */ 4837 scanned = (index == 0); 4838 } else { 4839 index = wbc->range_start >> PAGE_SHIFT; 4840 end = wbc->range_end >> PAGE_SHIFT; 4841 scanned = 1; 4842 } 4843 if (wbc->sync_mode == WB_SYNC_ALL) 4844 tag = PAGECACHE_TAG_TOWRITE; 4845 else 4846 tag = PAGECACHE_TAG_DIRTY; 4847 btrfs_zoned_meta_io_lock(fs_info); 4848 retry: 4849 if (wbc->sync_mode == WB_SYNC_ALL) 4850 tag_pages_for_writeback(mapping, index, end); 4851 while (!done && !nr_to_write_done && (index <= end) && 4852 (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, 4853 tag))) { 4854 unsigned i; 4855 4856 for (i = 0; i < nr_pages; i++) { 4857 struct page *page = pvec.pages[i]; 4858 4859 ret = submit_eb_page(page, wbc, &epd, &eb_context); 4860 if (ret == 0) 4861 continue; 4862 if (ret < 0) { 4863 done = 1; 4864 break; 4865 } 4866 4867 /* 4868 * the filesystem may choose to bump up nr_to_write. 4869 * We have to make sure to honor the new nr_to_write 4870 * at any time 4871 */ 4872 nr_to_write_done = wbc->nr_to_write <= 0; 4873 } 4874 pagevec_release(&pvec); 4875 cond_resched(); 4876 } 4877 if (!scanned && !done) { 4878 /* 4879 * We hit the last page and there is more work to be done: wrap 4880 * back to the start of the file 4881 */ 4882 scanned = 1; 4883 index = 0; 4884 goto retry; 4885 } 4886 /* 4887 * If something went wrong, don't allow any metadata write bio to be 4888 * submitted. 4889 * 4890 * This would prevent use-after-free if we had dirty pages not 4891 * cleaned up, which can still happen by fuzzed images. 4892 * 4893 * - Bad extent tree 4894 * Allowing existing tree block to be allocated for other trees. 4895 * 4896 * - Log tree operations 4897 * Exiting tree blocks get allocated to log tree, bumps its 4898 * generation, then get cleaned in tree re-balance. 4899 * Such tree block will not be written back, since it's clean, 4900 * thus no WRITTEN flag set. 4901 * And after log writes back, this tree block is not traced by 4902 * any dirty extent_io_tree. 4903 * 4904 * - Offending tree block gets re-dirtied from its original owner 4905 * Since it has bumped generation, no WRITTEN flag, it can be 4906 * reused without COWing. This tree block will not be traced 4907 * by btrfs_transaction::dirty_pages. 4908 * 4909 * Now such dirty tree block will not be cleaned by any dirty 4910 * extent io tree. Thus we don't want to submit such wild eb 4911 * if the fs already has error. 4912 * 4913 * We can get ret > 0 from submit_extent_page() indicating how many ebs 4914 * were submitted. Reset it to 0 to avoid false alerts for the caller. 4915 */ 4916 if (ret > 0) 4917 ret = 0; 4918 if (!ret && BTRFS_FS_ERROR(fs_info)) 4919 ret = -EROFS; 4920 submit_write_bio(&epd, ret); 4921 4922 btrfs_zoned_meta_io_unlock(fs_info); 4923 return ret; 4924 } 4925 4926 /** 4927 * Walk the list of dirty pages of the given address space and write all of them. 4928 * 4929 * @mapping: address space structure to write 4930 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 4931 * @epd: holds context for the write, namely the bio 4932 * 4933 * If a page is already under I/O, write_cache_pages() skips it, even 4934 * if it's dirty. This is desirable behaviour for memory-cleaning writeback, 4935 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() 4936 * and msync() need to guarantee that all the data which was dirty at the time 4937 * the call was made get new I/O started against them. If wbc->sync_mode is 4938 * WB_SYNC_ALL then we were called for data integrity and we must wait for 4939 * existing IO to complete. 4940 */ 4941 static int extent_write_cache_pages(struct address_space *mapping, 4942 struct writeback_control *wbc, 4943 struct extent_page_data *epd) 4944 { 4945 struct inode *inode = mapping->host; 4946 int ret = 0; 4947 int done = 0; 4948 int nr_to_write_done = 0; 4949 struct pagevec pvec; 4950 int nr_pages; 4951 pgoff_t index; 4952 pgoff_t end; /* Inclusive */ 4953 pgoff_t done_index; 4954 int range_whole = 0; 4955 int scanned = 0; 4956 xa_mark_t tag; 4957 4958 /* 4959 * We have to hold onto the inode so that ordered extents can do their 4960 * work when the IO finishes. The alternative to this is failing to add 4961 * an ordered extent if the igrab() fails there and that is a huge pain 4962 * to deal with, so instead just hold onto the inode throughout the 4963 * writepages operation. If it fails here we are freeing up the inode 4964 * anyway and we'd rather not waste our time writing out stuff that is 4965 * going to be truncated anyway. 4966 */ 4967 if (!igrab(inode)) 4968 return 0; 4969 4970 pagevec_init(&pvec); 4971 if (wbc->range_cyclic) { 4972 index = mapping->writeback_index; /* Start from prev offset */ 4973 end = -1; 4974 /* 4975 * Start from the beginning does not need to cycle over the 4976 * range, mark it as scanned. 4977 */ 4978 scanned = (index == 0); 4979 } else { 4980 index = wbc->range_start >> PAGE_SHIFT; 4981 end = wbc->range_end >> PAGE_SHIFT; 4982 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 4983 range_whole = 1; 4984 scanned = 1; 4985 } 4986 4987 /* 4988 * We do the tagged writepage as long as the snapshot flush bit is set 4989 * and we are the first one who do the filemap_flush() on this inode. 4990 * 4991 * The nr_to_write == LONG_MAX is needed to make sure other flushers do 4992 * not race in and drop the bit. 4993 */ 4994 if (range_whole && wbc->nr_to_write == LONG_MAX && 4995 test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH, 4996 &BTRFS_I(inode)->runtime_flags)) 4997 wbc->tagged_writepages = 1; 4998 4999 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 5000 tag = PAGECACHE_TAG_TOWRITE; 5001 else 5002 tag = PAGECACHE_TAG_DIRTY; 5003 retry: 5004 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 5005 tag_pages_for_writeback(mapping, index, end); 5006 done_index = index; 5007 while (!done && !nr_to_write_done && (index <= end) && 5008 (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, 5009 &index, end, tag))) { 5010 unsigned i; 5011 5012 for (i = 0; i < nr_pages; i++) { 5013 struct page *page = pvec.pages[i]; 5014 5015 done_index = page->index + 1; 5016 /* 5017 * At this point we hold neither the i_pages lock nor 5018 * the page lock: the page may be truncated or 5019 * invalidated (changing page->mapping to NULL), 5020 * or even swizzled back from swapper_space to 5021 * tmpfs file mapping 5022 */ 5023 if (!trylock_page(page)) { 5024 submit_write_bio(epd, 0); 5025 lock_page(page); 5026 } 5027 5028 if (unlikely(page->mapping != mapping)) { 5029 unlock_page(page); 5030 continue; 5031 } 5032 5033 if (wbc->sync_mode != WB_SYNC_NONE) { 5034 if (PageWriteback(page)) 5035 submit_write_bio(epd, 0); 5036 wait_on_page_writeback(page); 5037 } 5038 5039 if (PageWriteback(page) || 5040 !clear_page_dirty_for_io(page)) { 5041 unlock_page(page); 5042 continue; 5043 } 5044 5045 ret = __extent_writepage(page, wbc, epd); 5046 if (ret < 0) { 5047 done = 1; 5048 break; 5049 } 5050 5051 /* 5052 * the filesystem may choose to bump up nr_to_write. 5053 * We have to make sure to honor the new nr_to_write 5054 * at any time 5055 */ 5056 nr_to_write_done = wbc->nr_to_write <= 0; 5057 } 5058 pagevec_release(&pvec); 5059 cond_resched(); 5060 } 5061 if (!scanned && !done) { 5062 /* 5063 * We hit the last page and there is more work to be done: wrap 5064 * back to the start of the file 5065 */ 5066 scanned = 1; 5067 index = 0; 5068 5069 /* 5070 * If we're looping we could run into a page that is locked by a 5071 * writer and that writer could be waiting on writeback for a 5072 * page in our current bio, and thus deadlock, so flush the 5073 * write bio here. 5074 */ 5075 submit_write_bio(epd, 0); 5076 goto retry; 5077 } 5078 5079 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) 5080 mapping->writeback_index = done_index; 5081 5082 btrfs_add_delayed_iput(inode); 5083 return ret; 5084 } 5085 5086 /* 5087 * Submit the pages in the range to bio for call sites which delalloc range has 5088 * already been ran (aka, ordered extent inserted) and all pages are still 5089 * locked. 5090 */ 5091 int extent_write_locked_range(struct inode *inode, u64 start, u64 end) 5092 { 5093 bool found_error = false; 5094 int first_error = 0; 5095 int ret = 0; 5096 struct address_space *mapping = inode->i_mapping; 5097 struct page *page; 5098 u64 cur = start; 5099 unsigned long nr_pages; 5100 const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize; 5101 struct extent_page_data epd = { 5102 .bio_ctrl = { 0 }, 5103 .extent_locked = 1, 5104 .sync_io = 1, 5105 }; 5106 struct writeback_control wbc_writepages = { 5107 .sync_mode = WB_SYNC_ALL, 5108 .range_start = start, 5109 .range_end = end + 1, 5110 /* We're called from an async helper function */ 5111 .punt_to_cgroup = 1, 5112 .no_cgroup_owner = 1, 5113 }; 5114 5115 ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize)); 5116 nr_pages = (round_up(end, PAGE_SIZE) - round_down(start, PAGE_SIZE)) >> 5117 PAGE_SHIFT; 5118 wbc_writepages.nr_to_write = nr_pages * 2; 5119 5120 wbc_attach_fdatawrite_inode(&wbc_writepages, inode); 5121 while (cur <= end) { 5122 u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); 5123 5124 page = find_get_page(mapping, cur >> PAGE_SHIFT); 5125 /* 5126 * All pages in the range are locked since 5127 * btrfs_run_delalloc_range(), thus there is no way to clear 5128 * the page dirty flag. 5129 */ 5130 ASSERT(PageLocked(page)); 5131 ASSERT(PageDirty(page)); 5132 clear_page_dirty_for_io(page); 5133 ret = __extent_writepage(page, &wbc_writepages, &epd); 5134 ASSERT(ret <= 0); 5135 if (ret < 0) { 5136 found_error = true; 5137 first_error = ret; 5138 } 5139 put_page(page); 5140 cur = cur_end + 1; 5141 } 5142 5143 submit_write_bio(&epd, found_error ? ret : 0); 5144 5145 wbc_detach_inode(&wbc_writepages); 5146 if (found_error) 5147 return first_error; 5148 return ret; 5149 } 5150 5151 int extent_writepages(struct address_space *mapping, 5152 struct writeback_control *wbc) 5153 { 5154 struct inode *inode = mapping->host; 5155 int ret = 0; 5156 struct extent_page_data epd = { 5157 .bio_ctrl = { 0 }, 5158 .extent_locked = 0, 5159 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 5160 }; 5161 5162 /* 5163 * Allow only a single thread to do the reloc work in zoned mode to 5164 * protect the write pointer updates. 5165 */ 5166 btrfs_zoned_data_reloc_lock(BTRFS_I(inode)); 5167 ret = extent_write_cache_pages(mapping, wbc, &epd); 5168 submit_write_bio(&epd, ret); 5169 btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); 5170 return ret; 5171 } 5172 5173 void extent_readahead(struct readahead_control *rac) 5174 { 5175 struct btrfs_bio_ctrl bio_ctrl = { 0 }; 5176 struct page *pagepool[16]; 5177 struct extent_map *em_cached = NULL; 5178 u64 prev_em_start = (u64)-1; 5179 int nr; 5180 5181 while ((nr = readahead_page_batch(rac, pagepool))) { 5182 u64 contig_start = readahead_pos(rac); 5183 u64 contig_end = contig_start + readahead_batch_length(rac) - 1; 5184 5185 contiguous_readpages(pagepool, nr, contig_start, contig_end, 5186 &em_cached, &bio_ctrl, &prev_em_start); 5187 } 5188 5189 if (em_cached) 5190 free_extent_map(em_cached); 5191 submit_one_bio(&bio_ctrl); 5192 } 5193 5194 /* 5195 * basic invalidate_folio code, this waits on any locked or writeback 5196 * ranges corresponding to the folio, and then deletes any extent state 5197 * records from the tree 5198 */ 5199 int extent_invalidate_folio(struct extent_io_tree *tree, 5200 struct folio *folio, size_t offset) 5201 { 5202 struct extent_state *cached_state = NULL; 5203 u64 start = folio_pos(folio); 5204 u64 end = start + folio_size(folio) - 1; 5205 size_t blocksize = folio->mapping->host->i_sb->s_blocksize; 5206 5207 /* This function is only called for the btree inode */ 5208 ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO); 5209 5210 start += ALIGN(offset, blocksize); 5211 if (start > end) 5212 return 0; 5213 5214 lock_extent_bits(tree, start, end, &cached_state); 5215 folio_wait_writeback(folio); 5216 5217 /* 5218 * Currently for btree io tree, only EXTENT_LOCKED is utilized, 5219 * so here we only need to unlock the extent range to free any 5220 * existing extent state. 5221 */ 5222 unlock_extent_cached(tree, start, end, &cached_state); 5223 return 0; 5224 } 5225 5226 /* 5227 * a helper for release_folio, this tests for areas of the page that 5228 * are locked or under IO and drops the related state bits if it is safe 5229 * to drop the page. 5230 */ 5231 static int try_release_extent_state(struct extent_io_tree *tree, 5232 struct page *page, gfp_t mask) 5233 { 5234 u64 start = page_offset(page); 5235 u64 end = start + PAGE_SIZE - 1; 5236 int ret = 1; 5237 5238 if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) { 5239 ret = 0; 5240 } else { 5241 /* 5242 * At this point we can safely clear everything except the 5243 * locked bit, the nodatasum bit and the delalloc new bit. 5244 * The delalloc new bit will be cleared by ordered extent 5245 * completion. 5246 */ 5247 ret = __clear_extent_bit(tree, start, end, 5248 ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW), 5249 0, 0, NULL, mask, NULL); 5250 5251 /* if clear_extent_bit failed for enomem reasons, 5252 * we can't allow the release to continue. 5253 */ 5254 if (ret < 0) 5255 ret = 0; 5256 else 5257 ret = 1; 5258 } 5259 return ret; 5260 } 5261 5262 /* 5263 * a helper for release_folio. As long as there are no locked extents 5264 * in the range corresponding to the page, both state records and extent 5265 * map records are removed 5266 */ 5267 int try_release_extent_mapping(struct page *page, gfp_t mask) 5268 { 5269 struct extent_map *em; 5270 u64 start = page_offset(page); 5271 u64 end = start + PAGE_SIZE - 1; 5272 struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host); 5273 struct extent_io_tree *tree = &btrfs_inode->io_tree; 5274 struct extent_map_tree *map = &btrfs_inode->extent_tree; 5275 5276 if (gfpflags_allow_blocking(mask) && 5277 page->mapping->host->i_size > SZ_16M) { 5278 u64 len; 5279 while (start <= end) { 5280 struct btrfs_fs_info *fs_info; 5281 u64 cur_gen; 5282 5283 len = end - start + 1; 5284 write_lock(&map->lock); 5285 em = lookup_extent_mapping(map, start, len); 5286 if (!em) { 5287 write_unlock(&map->lock); 5288 break; 5289 } 5290 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || 5291 em->start != start) { 5292 write_unlock(&map->lock); 5293 free_extent_map(em); 5294 break; 5295 } 5296 if (test_range_bit(tree, em->start, 5297 extent_map_end(em) - 1, 5298 EXTENT_LOCKED, 0, NULL)) 5299 goto next; 5300 /* 5301 * If it's not in the list of modified extents, used 5302 * by a fast fsync, we can remove it. If it's being 5303 * logged we can safely remove it since fsync took an 5304 * extra reference on the em. 5305 */ 5306 if (list_empty(&em->list) || 5307 test_bit(EXTENT_FLAG_LOGGING, &em->flags)) 5308 goto remove_em; 5309 /* 5310 * If it's in the list of modified extents, remove it 5311 * only if its generation is older then the current one, 5312 * in which case we don't need it for a fast fsync. 5313 * Otherwise don't remove it, we could be racing with an 5314 * ongoing fast fsync that could miss the new extent. 5315 */ 5316 fs_info = btrfs_inode->root->fs_info; 5317 spin_lock(&fs_info->trans_lock); 5318 cur_gen = fs_info->generation; 5319 spin_unlock(&fs_info->trans_lock); 5320 if (em->generation >= cur_gen) 5321 goto next; 5322 remove_em: 5323 /* 5324 * We only remove extent maps that are not in the list of 5325 * modified extents or that are in the list but with a 5326 * generation lower then the current generation, so there 5327 * is no need to set the full fsync flag on the inode (it 5328 * hurts the fsync performance for workloads with a data 5329 * size that exceeds or is close to the system's memory). 5330 */ 5331 remove_extent_mapping(map, em); 5332 /* once for the rb tree */ 5333 free_extent_map(em); 5334 next: 5335 start = extent_map_end(em); 5336 write_unlock(&map->lock); 5337 5338 /* once for us */ 5339 free_extent_map(em); 5340 5341 cond_resched(); /* Allow large-extent preemption. */ 5342 } 5343 } 5344 return try_release_extent_state(tree, page, mask); 5345 } 5346 5347 /* 5348 * helper function for fiemap, which doesn't want to see any holes. 5349 * This maps until we find something past 'last' 5350 */ 5351 static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode, 5352 u64 offset, u64 last) 5353 { 5354 u64 sectorsize = btrfs_inode_sectorsize(inode); 5355 struct extent_map *em; 5356 u64 len; 5357 5358 if (offset >= last) 5359 return NULL; 5360 5361 while (1) { 5362 len = last - offset; 5363 if (len == 0) 5364 break; 5365 len = ALIGN(len, sectorsize); 5366 em = btrfs_get_extent_fiemap(inode, offset, len); 5367 if (IS_ERR(em)) 5368 return em; 5369 5370 /* if this isn't a hole return it */ 5371 if (em->block_start != EXTENT_MAP_HOLE) 5372 return em; 5373 5374 /* this is a hole, advance to the next extent */ 5375 offset = extent_map_end(em); 5376 free_extent_map(em); 5377 if (offset >= last) 5378 break; 5379 } 5380 return NULL; 5381 } 5382 5383 /* 5384 * To cache previous fiemap extent 5385 * 5386 * Will be used for merging fiemap extent 5387 */ 5388 struct fiemap_cache { 5389 u64 offset; 5390 u64 phys; 5391 u64 len; 5392 u32 flags; 5393 bool cached; 5394 }; 5395 5396 /* 5397 * Helper to submit fiemap extent. 5398 * 5399 * Will try to merge current fiemap extent specified by @offset, @phys, 5400 * @len and @flags with cached one. 5401 * And only when we fails to merge, cached one will be submitted as 5402 * fiemap extent. 5403 * 5404 * Return value is the same as fiemap_fill_next_extent(). 5405 */ 5406 static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, 5407 struct fiemap_cache *cache, 5408 u64 offset, u64 phys, u64 len, u32 flags) 5409 { 5410 int ret = 0; 5411 5412 if (!cache->cached) 5413 goto assign; 5414 5415 /* 5416 * Sanity check, extent_fiemap() should have ensured that new 5417 * fiemap extent won't overlap with cached one. 5418 * Not recoverable. 5419 * 5420 * NOTE: Physical address can overlap, due to compression 5421 */ 5422 if (cache->offset + cache->len > offset) { 5423 WARN_ON(1); 5424 return -EINVAL; 5425 } 5426 5427 /* 5428 * Only merges fiemap extents if 5429 * 1) Their logical addresses are continuous 5430 * 5431 * 2) Their physical addresses are continuous 5432 * So truly compressed (physical size smaller than logical size) 5433 * extents won't get merged with each other 5434 * 5435 * 3) Share same flags except FIEMAP_EXTENT_LAST 5436 * So regular extent won't get merged with prealloc extent 5437 */ 5438 if (cache->offset + cache->len == offset && 5439 cache->phys + cache->len == phys && 5440 (cache->flags & ~FIEMAP_EXTENT_LAST) == 5441 (flags & ~FIEMAP_EXTENT_LAST)) { 5442 cache->len += len; 5443 cache->flags |= flags; 5444 goto try_submit_last; 5445 } 5446 5447 /* Not mergeable, need to submit cached one */ 5448 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, 5449 cache->len, cache->flags); 5450 cache->cached = false; 5451 if (ret) 5452 return ret; 5453 assign: 5454 cache->cached = true; 5455 cache->offset = offset; 5456 cache->phys = phys; 5457 cache->len = len; 5458 cache->flags = flags; 5459 try_submit_last: 5460 if (cache->flags & FIEMAP_EXTENT_LAST) { 5461 ret = fiemap_fill_next_extent(fieinfo, cache->offset, 5462 cache->phys, cache->len, cache->flags); 5463 cache->cached = false; 5464 } 5465 return ret; 5466 } 5467 5468 /* 5469 * Emit last fiemap cache 5470 * 5471 * The last fiemap cache may still be cached in the following case: 5472 * 0 4k 8k 5473 * |<- Fiemap range ->| 5474 * |<------------ First extent ----------->| 5475 * 5476 * In this case, the first extent range will be cached but not emitted. 5477 * So we must emit it before ending extent_fiemap(). 5478 */ 5479 static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo, 5480 struct fiemap_cache *cache) 5481 { 5482 int ret; 5483 5484 if (!cache->cached) 5485 return 0; 5486 5487 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, 5488 cache->len, cache->flags); 5489 cache->cached = false; 5490 if (ret > 0) 5491 ret = 0; 5492 return ret; 5493 } 5494 5495 int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, 5496 u64 start, u64 len) 5497 { 5498 int ret = 0; 5499 u64 off; 5500 u64 max = start + len; 5501 u32 flags = 0; 5502 u32 found_type; 5503 u64 last; 5504 u64 last_for_get_extent = 0; 5505 u64 disko = 0; 5506 u64 isize = i_size_read(&inode->vfs_inode); 5507 struct btrfs_key found_key; 5508 struct extent_map *em = NULL; 5509 struct extent_state *cached_state = NULL; 5510 struct btrfs_path *path; 5511 struct btrfs_root *root = inode->root; 5512 struct fiemap_cache cache = { 0 }; 5513 struct ulist *roots; 5514 struct ulist *tmp_ulist; 5515 int end = 0; 5516 u64 em_start = 0; 5517 u64 em_len = 0; 5518 u64 em_end = 0; 5519 5520 if (len == 0) 5521 return -EINVAL; 5522 5523 path = btrfs_alloc_path(); 5524 if (!path) 5525 return -ENOMEM; 5526 5527 roots = ulist_alloc(GFP_KERNEL); 5528 tmp_ulist = ulist_alloc(GFP_KERNEL); 5529 if (!roots || !tmp_ulist) { 5530 ret = -ENOMEM; 5531 goto out_free_ulist; 5532 } 5533 5534 /* 5535 * We can't initialize that to 'start' as this could miss extents due 5536 * to extent item merging 5537 */ 5538 off = 0; 5539 start = round_down(start, btrfs_inode_sectorsize(inode)); 5540 len = round_up(max, btrfs_inode_sectorsize(inode)) - start; 5541 5542 /* 5543 * lookup the last file extent. We're not using i_size here 5544 * because there might be preallocation past i_size 5545 */ 5546 ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1, 5547 0); 5548 if (ret < 0) { 5549 goto out_free_ulist; 5550 } else { 5551 WARN_ON(!ret); 5552 if (ret == 1) 5553 ret = 0; 5554 } 5555 5556 path->slots[0]--; 5557 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 5558 found_type = found_key.type; 5559 5560 /* No extents, but there might be delalloc bits */ 5561 if (found_key.objectid != btrfs_ino(inode) || 5562 found_type != BTRFS_EXTENT_DATA_KEY) { 5563 /* have to trust i_size as the end */ 5564 last = (u64)-1; 5565 last_for_get_extent = isize; 5566 } else { 5567 /* 5568 * remember the start of the last extent. There are a 5569 * bunch of different factors that go into the length of the 5570 * extent, so its much less complex to remember where it started 5571 */ 5572 last = found_key.offset; 5573 last_for_get_extent = last + 1; 5574 } 5575 btrfs_release_path(path); 5576 5577 /* 5578 * we might have some extents allocated but more delalloc past those 5579 * extents. so, we trust isize unless the start of the last extent is 5580 * beyond isize 5581 */ 5582 if (last < isize) { 5583 last = (u64)-1; 5584 last_for_get_extent = isize; 5585 } 5586 5587 lock_extent_bits(&inode->io_tree, start, start + len - 1, 5588 &cached_state); 5589 5590 em = get_extent_skip_holes(inode, start, last_for_get_extent); 5591 if (!em) 5592 goto out; 5593 if (IS_ERR(em)) { 5594 ret = PTR_ERR(em); 5595 goto out; 5596 } 5597 5598 while (!end) { 5599 u64 offset_in_extent = 0; 5600 5601 /* break if the extent we found is outside the range */ 5602 if (em->start >= max || extent_map_end(em) < off) 5603 break; 5604 5605 /* 5606 * get_extent may return an extent that starts before our 5607 * requested range. We have to make sure the ranges 5608 * we return to fiemap always move forward and don't 5609 * overlap, so adjust the offsets here 5610 */ 5611 em_start = max(em->start, off); 5612 5613 /* 5614 * record the offset from the start of the extent 5615 * for adjusting the disk offset below. Only do this if the 5616 * extent isn't compressed since our in ram offset may be past 5617 * what we have actually allocated on disk. 5618 */ 5619 if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 5620 offset_in_extent = em_start - em->start; 5621 em_end = extent_map_end(em); 5622 em_len = em_end - em_start; 5623 flags = 0; 5624 if (em->block_start < EXTENT_MAP_LAST_BYTE) 5625 disko = em->block_start + offset_in_extent; 5626 else 5627 disko = 0; 5628 5629 /* 5630 * bump off for our next call to get_extent 5631 */ 5632 off = extent_map_end(em); 5633 if (off >= max) 5634 end = 1; 5635 5636 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 5637 end = 1; 5638 flags |= FIEMAP_EXTENT_LAST; 5639 } else if (em->block_start == EXTENT_MAP_INLINE) { 5640 flags |= (FIEMAP_EXTENT_DATA_INLINE | 5641 FIEMAP_EXTENT_NOT_ALIGNED); 5642 } else if (em->block_start == EXTENT_MAP_DELALLOC) { 5643 flags |= (FIEMAP_EXTENT_DELALLOC | 5644 FIEMAP_EXTENT_UNKNOWN); 5645 } else if (fieinfo->fi_extents_max) { 5646 u64 bytenr = em->block_start - 5647 (em->start - em->orig_start); 5648 5649 /* 5650 * As btrfs supports shared space, this information 5651 * can be exported to userspace tools via 5652 * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0 5653 * then we're just getting a count and we can skip the 5654 * lookup stuff. 5655 */ 5656 ret = btrfs_check_shared(root, btrfs_ino(inode), 5657 bytenr, roots, tmp_ulist); 5658 if (ret < 0) 5659 goto out_free; 5660 if (ret) 5661 flags |= FIEMAP_EXTENT_SHARED; 5662 ret = 0; 5663 } 5664 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 5665 flags |= FIEMAP_EXTENT_ENCODED; 5666 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 5667 flags |= FIEMAP_EXTENT_UNWRITTEN; 5668 5669 free_extent_map(em); 5670 em = NULL; 5671 if ((em_start >= last) || em_len == (u64)-1 || 5672 (last == (u64)-1 && isize <= em_end)) { 5673 flags |= FIEMAP_EXTENT_LAST; 5674 end = 1; 5675 } 5676 5677 /* now scan forward to see if this is really the last extent. */ 5678 em = get_extent_skip_holes(inode, off, last_for_get_extent); 5679 if (IS_ERR(em)) { 5680 ret = PTR_ERR(em); 5681 goto out; 5682 } 5683 if (!em) { 5684 flags |= FIEMAP_EXTENT_LAST; 5685 end = 1; 5686 } 5687 ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko, 5688 em_len, flags); 5689 if (ret) { 5690 if (ret == 1) 5691 ret = 0; 5692 goto out_free; 5693 } 5694 } 5695 out_free: 5696 if (!ret) 5697 ret = emit_last_fiemap_cache(fieinfo, &cache); 5698 free_extent_map(em); 5699 out: 5700 unlock_extent_cached(&inode->io_tree, start, start + len - 1, 5701 &cached_state); 5702 5703 out_free_ulist: 5704 btrfs_free_path(path); 5705 ulist_free(roots); 5706 ulist_free(tmp_ulist); 5707 return ret; 5708 } 5709 5710 static void __free_extent_buffer(struct extent_buffer *eb) 5711 { 5712 kmem_cache_free(extent_buffer_cache, eb); 5713 } 5714 5715 int extent_buffer_under_io(const struct extent_buffer *eb) 5716 { 5717 return (atomic_read(&eb->io_pages) || 5718 test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || 5719 test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 5720 } 5721 5722 static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page) 5723 { 5724 struct btrfs_subpage *subpage; 5725 5726 lockdep_assert_held(&page->mapping->private_lock); 5727 5728 if (PagePrivate(page)) { 5729 subpage = (struct btrfs_subpage *)page->private; 5730 if (atomic_read(&subpage->eb_refs)) 5731 return true; 5732 /* 5733 * Even there is no eb refs here, we may still have 5734 * end_page_read() call relying on page::private. 5735 */ 5736 if (atomic_read(&subpage->readers)) 5737 return true; 5738 } 5739 return false; 5740 } 5741 5742 static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page) 5743 { 5744 struct btrfs_fs_info *fs_info = eb->fs_info; 5745 const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); 5746 5747 /* 5748 * For mapped eb, we're going to change the page private, which should 5749 * be done under the private_lock. 5750 */ 5751 if (mapped) 5752 spin_lock(&page->mapping->private_lock); 5753 5754 if (!PagePrivate(page)) { 5755 if (mapped) 5756 spin_unlock(&page->mapping->private_lock); 5757 return; 5758 } 5759 5760 if (fs_info->nodesize >= PAGE_SIZE) { 5761 /* 5762 * We do this since we'll remove the pages after we've 5763 * removed the eb from the radix tree, so we could race 5764 * and have this page now attached to the new eb. So 5765 * only clear page_private if it's still connected to 5766 * this eb. 5767 */ 5768 if (PagePrivate(page) && 5769 page->private == (unsigned long)eb) { 5770 BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); 5771 BUG_ON(PageDirty(page)); 5772 BUG_ON(PageWriteback(page)); 5773 /* 5774 * We need to make sure we haven't be attached 5775 * to a new eb. 5776 */ 5777 detach_page_private(page); 5778 } 5779 if (mapped) 5780 spin_unlock(&page->mapping->private_lock); 5781 return; 5782 } 5783 5784 /* 5785 * For subpage, we can have dummy eb with page private. In this case, 5786 * we can directly detach the private as such page is only attached to 5787 * one dummy eb, no sharing. 5788 */ 5789 if (!mapped) { 5790 btrfs_detach_subpage(fs_info, page); 5791 return; 5792 } 5793 5794 btrfs_page_dec_eb_refs(fs_info, page); 5795 5796 /* 5797 * We can only detach the page private if there are no other ebs in the 5798 * page range and no unfinished IO. 5799 */ 5800 if (!page_range_has_eb(fs_info, page)) 5801 btrfs_detach_subpage(fs_info, page); 5802 5803 spin_unlock(&page->mapping->private_lock); 5804 } 5805 5806 /* Release all pages attached to the extent buffer */ 5807 static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) 5808 { 5809 int i; 5810 int num_pages; 5811 5812 ASSERT(!extent_buffer_under_io(eb)); 5813 5814 num_pages = num_extent_pages(eb); 5815 for (i = 0; i < num_pages; i++) { 5816 struct page *page = eb->pages[i]; 5817 5818 if (!page) 5819 continue; 5820 5821 detach_extent_buffer_page(eb, page); 5822 5823 /* One for when we allocated the page */ 5824 put_page(page); 5825 } 5826 } 5827 5828 /* 5829 * Helper for releasing the extent buffer. 5830 */ 5831 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) 5832 { 5833 btrfs_release_extent_buffer_pages(eb); 5834 btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list); 5835 __free_extent_buffer(eb); 5836 } 5837 5838 static struct extent_buffer * 5839 __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, 5840 unsigned long len) 5841 { 5842 struct extent_buffer *eb = NULL; 5843 5844 eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL); 5845 eb->start = start; 5846 eb->len = len; 5847 eb->fs_info = fs_info; 5848 eb->bflags = 0; 5849 init_rwsem(&eb->lock); 5850 5851 btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list, 5852 &fs_info->allocated_ebs); 5853 INIT_LIST_HEAD(&eb->release_list); 5854 5855 spin_lock_init(&eb->refs_lock); 5856 atomic_set(&eb->refs, 1); 5857 atomic_set(&eb->io_pages, 0); 5858 5859 ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE); 5860 5861 return eb; 5862 } 5863 5864 struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) 5865 { 5866 int i; 5867 struct extent_buffer *new; 5868 int num_pages = num_extent_pages(src); 5869 int ret; 5870 5871 new = __alloc_extent_buffer(src->fs_info, src->start, src->len); 5872 if (new == NULL) 5873 return NULL; 5874 5875 /* 5876 * Set UNMAPPED before calling btrfs_release_extent_buffer(), as 5877 * btrfs_release_extent_buffer() have different behavior for 5878 * UNMAPPED subpage extent buffer. 5879 */ 5880 set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); 5881 5882 memset(new->pages, 0, sizeof(*new->pages) * num_pages); 5883 ret = btrfs_alloc_page_array(num_pages, new->pages); 5884 if (ret) { 5885 btrfs_release_extent_buffer(new); 5886 return NULL; 5887 } 5888 5889 for (i = 0; i < num_pages; i++) { 5890 int ret; 5891 struct page *p = new->pages[i]; 5892 5893 ret = attach_extent_buffer_page(new, p, NULL); 5894 if (ret < 0) { 5895 btrfs_release_extent_buffer(new); 5896 return NULL; 5897 } 5898 WARN_ON(PageDirty(p)); 5899 copy_page(page_address(p), page_address(src->pages[i])); 5900 } 5901 set_extent_buffer_uptodate(new); 5902 5903 return new; 5904 } 5905 5906 struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, 5907 u64 start, unsigned long len) 5908 { 5909 struct extent_buffer *eb; 5910 int num_pages; 5911 int i; 5912 int ret; 5913 5914 eb = __alloc_extent_buffer(fs_info, start, len); 5915 if (!eb) 5916 return NULL; 5917 5918 num_pages = num_extent_pages(eb); 5919 ret = btrfs_alloc_page_array(num_pages, eb->pages); 5920 if (ret) 5921 goto err; 5922 5923 for (i = 0; i < num_pages; i++) { 5924 struct page *p = eb->pages[i]; 5925 5926 ret = attach_extent_buffer_page(eb, p, NULL); 5927 if (ret < 0) 5928 goto err; 5929 } 5930 5931 set_extent_buffer_uptodate(eb); 5932 btrfs_set_header_nritems(eb, 0); 5933 set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); 5934 5935 return eb; 5936 err: 5937 for (i = 0; i < num_pages; i++) { 5938 if (eb->pages[i]) { 5939 detach_extent_buffer_page(eb, eb->pages[i]); 5940 __free_page(eb->pages[i]); 5941 } 5942 } 5943 __free_extent_buffer(eb); 5944 return NULL; 5945 } 5946 5947 struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, 5948 u64 start) 5949 { 5950 return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize); 5951 } 5952 5953 static void check_buffer_tree_ref(struct extent_buffer *eb) 5954 { 5955 int refs; 5956 /* 5957 * The TREE_REF bit is first set when the extent_buffer is added 5958 * to the radix tree. It is also reset, if unset, when a new reference 5959 * is created by find_extent_buffer. 5960 * 5961 * It is only cleared in two cases: freeing the last non-tree 5962 * reference to the extent_buffer when its STALE bit is set or 5963 * calling release_folio when the tree reference is the only reference. 5964 * 5965 * In both cases, care is taken to ensure that the extent_buffer's 5966 * pages are not under io. However, release_folio can be concurrently 5967 * called with creating new references, which is prone to race 5968 * conditions between the calls to check_buffer_tree_ref in those 5969 * codepaths and clearing TREE_REF in try_release_extent_buffer. 5970 * 5971 * The actual lifetime of the extent_buffer in the radix tree is 5972 * adequately protected by the refcount, but the TREE_REF bit and 5973 * its corresponding reference are not. To protect against this 5974 * class of races, we call check_buffer_tree_ref from the codepaths 5975 * which trigger io after they set eb->io_pages. Note that once io is 5976 * initiated, TREE_REF can no longer be cleared, so that is the 5977 * moment at which any such race is best fixed. 5978 */ 5979 refs = atomic_read(&eb->refs); 5980 if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 5981 return; 5982 5983 spin_lock(&eb->refs_lock); 5984 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 5985 atomic_inc(&eb->refs); 5986 spin_unlock(&eb->refs_lock); 5987 } 5988 5989 static void mark_extent_buffer_accessed(struct extent_buffer *eb, 5990 struct page *accessed) 5991 { 5992 int num_pages, i; 5993 5994 check_buffer_tree_ref(eb); 5995 5996 num_pages = num_extent_pages(eb); 5997 for (i = 0; i < num_pages; i++) { 5998 struct page *p = eb->pages[i]; 5999 6000 if (p != accessed) 6001 mark_page_accessed(p); 6002 } 6003 } 6004 6005 struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, 6006 u64 start) 6007 { 6008 struct extent_buffer *eb; 6009 6010 eb = find_extent_buffer_nolock(fs_info, start); 6011 if (!eb) 6012 return NULL; 6013 /* 6014 * Lock our eb's refs_lock to avoid races with free_extent_buffer(). 6015 * When we get our eb it might be flagged with EXTENT_BUFFER_STALE and 6016 * another task running free_extent_buffer() might have seen that flag 6017 * set, eb->refs == 2, that the buffer isn't under IO (dirty and 6018 * writeback flags not set) and it's still in the tree (flag 6019 * EXTENT_BUFFER_TREE_REF set), therefore being in the process of 6020 * decrementing the extent buffer's reference count twice. So here we 6021 * could race and increment the eb's reference count, clear its stale 6022 * flag, mark it as dirty and drop our reference before the other task 6023 * finishes executing free_extent_buffer, which would later result in 6024 * an attempt to free an extent buffer that is dirty. 6025 */ 6026 if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) { 6027 spin_lock(&eb->refs_lock); 6028 spin_unlock(&eb->refs_lock); 6029 } 6030 mark_extent_buffer_accessed(eb, NULL); 6031 return eb; 6032 } 6033 6034 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 6035 struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, 6036 u64 start) 6037 { 6038 struct extent_buffer *eb, *exists = NULL; 6039 int ret; 6040 6041 eb = find_extent_buffer(fs_info, start); 6042 if (eb) 6043 return eb; 6044 eb = alloc_dummy_extent_buffer(fs_info, start); 6045 if (!eb) 6046 return ERR_PTR(-ENOMEM); 6047 eb->fs_info = fs_info; 6048 again: 6049 ret = radix_tree_preload(GFP_NOFS); 6050 if (ret) { 6051 exists = ERR_PTR(ret); 6052 goto free_eb; 6053 } 6054 spin_lock(&fs_info->buffer_lock); 6055 ret = radix_tree_insert(&fs_info->buffer_radix, 6056 start >> fs_info->sectorsize_bits, eb); 6057 spin_unlock(&fs_info->buffer_lock); 6058 radix_tree_preload_end(); 6059 if (ret == -EEXIST) { 6060 exists = find_extent_buffer(fs_info, start); 6061 if (exists) 6062 goto free_eb; 6063 else 6064 goto again; 6065 } 6066 check_buffer_tree_ref(eb); 6067 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); 6068 6069 return eb; 6070 free_eb: 6071 btrfs_release_extent_buffer(eb); 6072 return exists; 6073 } 6074 #endif 6075 6076 static struct extent_buffer *grab_extent_buffer( 6077 struct btrfs_fs_info *fs_info, struct page *page) 6078 { 6079 struct extent_buffer *exists; 6080 6081 /* 6082 * For subpage case, we completely rely on radix tree to ensure we 6083 * don't try to insert two ebs for the same bytenr. So here we always 6084 * return NULL and just continue. 6085 */ 6086 if (fs_info->nodesize < PAGE_SIZE) 6087 return NULL; 6088 6089 /* Page not yet attached to an extent buffer */ 6090 if (!PagePrivate(page)) 6091 return NULL; 6092 6093 /* 6094 * We could have already allocated an eb for this page and attached one 6095 * so lets see if we can get a ref on the existing eb, and if we can we 6096 * know it's good and we can just return that one, else we know we can 6097 * just overwrite page->private. 6098 */ 6099 exists = (struct extent_buffer *)page->private; 6100 if (atomic_inc_not_zero(&exists->refs)) 6101 return exists; 6102 6103 WARN_ON(PageDirty(page)); 6104 detach_page_private(page); 6105 return NULL; 6106 } 6107 6108 static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) 6109 { 6110 if (!IS_ALIGNED(start, fs_info->sectorsize)) { 6111 btrfs_err(fs_info, "bad tree block start %llu", start); 6112 return -EINVAL; 6113 } 6114 6115 if (fs_info->nodesize < PAGE_SIZE && 6116 offset_in_page(start) + fs_info->nodesize > PAGE_SIZE) { 6117 btrfs_err(fs_info, 6118 "tree block crosses page boundary, start %llu nodesize %u", 6119 start, fs_info->nodesize); 6120 return -EINVAL; 6121 } 6122 if (fs_info->nodesize >= PAGE_SIZE && 6123 !PAGE_ALIGNED(start)) { 6124 btrfs_err(fs_info, 6125 "tree block is not page aligned, start %llu nodesize %u", 6126 start, fs_info->nodesize); 6127 return -EINVAL; 6128 } 6129 return 0; 6130 } 6131 6132 struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, 6133 u64 start, u64 owner_root, int level) 6134 { 6135 unsigned long len = fs_info->nodesize; 6136 int num_pages; 6137 int i; 6138 unsigned long index = start >> PAGE_SHIFT; 6139 struct extent_buffer *eb; 6140 struct extent_buffer *exists = NULL; 6141 struct page *p; 6142 struct address_space *mapping = fs_info->btree_inode->i_mapping; 6143 u64 lockdep_owner = owner_root; 6144 int uptodate = 1; 6145 int ret; 6146 6147 if (check_eb_alignment(fs_info, start)) 6148 return ERR_PTR(-EINVAL); 6149 6150 #if BITS_PER_LONG == 32 6151 if (start >= MAX_LFS_FILESIZE) { 6152 btrfs_err_rl(fs_info, 6153 "extent buffer %llu is beyond 32bit page cache limit", start); 6154 btrfs_err_32bit_limit(fs_info); 6155 return ERR_PTR(-EOVERFLOW); 6156 } 6157 if (start >= BTRFS_32BIT_EARLY_WARN_THRESHOLD) 6158 btrfs_warn_32bit_limit(fs_info); 6159 #endif 6160 6161 eb = find_extent_buffer(fs_info, start); 6162 if (eb) 6163 return eb; 6164 6165 eb = __alloc_extent_buffer(fs_info, start, len); 6166 if (!eb) 6167 return ERR_PTR(-ENOMEM); 6168 6169 /* 6170 * The reloc trees are just snapshots, so we need them to appear to be 6171 * just like any other fs tree WRT lockdep. 6172 */ 6173 if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID) 6174 lockdep_owner = BTRFS_FS_TREE_OBJECTID; 6175 6176 btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level); 6177 6178 num_pages = num_extent_pages(eb); 6179 for (i = 0; i < num_pages; i++, index++) { 6180 struct btrfs_subpage *prealloc = NULL; 6181 6182 p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL); 6183 if (!p) { 6184 exists = ERR_PTR(-ENOMEM); 6185 goto free_eb; 6186 } 6187 6188 /* 6189 * Preallocate page->private for subpage case, so that we won't 6190 * allocate memory with private_lock hold. The memory will be 6191 * freed by attach_extent_buffer_page() or freed manually if 6192 * we exit earlier. 6193 * 6194 * Although we have ensured one subpage eb can only have one 6195 * page, but it may change in the future for 16K page size 6196 * support, so we still preallocate the memory in the loop. 6197 */ 6198 if (fs_info->nodesize < PAGE_SIZE) { 6199 prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA); 6200 if (IS_ERR(prealloc)) { 6201 ret = PTR_ERR(prealloc); 6202 unlock_page(p); 6203 put_page(p); 6204 exists = ERR_PTR(ret); 6205 goto free_eb; 6206 } 6207 } 6208 6209 spin_lock(&mapping->private_lock); 6210 exists = grab_extent_buffer(fs_info, p); 6211 if (exists) { 6212 spin_unlock(&mapping->private_lock); 6213 unlock_page(p); 6214 put_page(p); 6215 mark_extent_buffer_accessed(exists, p); 6216 btrfs_free_subpage(prealloc); 6217 goto free_eb; 6218 } 6219 /* Should not fail, as we have preallocated the memory */ 6220 ret = attach_extent_buffer_page(eb, p, prealloc); 6221 ASSERT(!ret); 6222 /* 6223 * To inform we have extra eb under allocation, so that 6224 * detach_extent_buffer_page() won't release the page private 6225 * when the eb hasn't yet been inserted into radix tree. 6226 * 6227 * The ref will be decreased when the eb released the page, in 6228 * detach_extent_buffer_page(). 6229 * Thus needs no special handling in error path. 6230 */ 6231 btrfs_page_inc_eb_refs(fs_info, p); 6232 spin_unlock(&mapping->private_lock); 6233 6234 WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len)); 6235 eb->pages[i] = p; 6236 if (!PageUptodate(p)) 6237 uptodate = 0; 6238 6239 /* 6240 * We can't unlock the pages just yet since the extent buffer 6241 * hasn't been properly inserted in the radix tree, this 6242 * opens a race with btree_release_folio which can free a page 6243 * while we are still filling in all pages for the buffer and 6244 * we could crash. 6245 */ 6246 } 6247 if (uptodate) 6248 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 6249 again: 6250 ret = radix_tree_preload(GFP_NOFS); 6251 if (ret) { 6252 exists = ERR_PTR(ret); 6253 goto free_eb; 6254 } 6255 6256 spin_lock(&fs_info->buffer_lock); 6257 ret = radix_tree_insert(&fs_info->buffer_radix, 6258 start >> fs_info->sectorsize_bits, eb); 6259 spin_unlock(&fs_info->buffer_lock); 6260 radix_tree_preload_end(); 6261 if (ret == -EEXIST) { 6262 exists = find_extent_buffer(fs_info, start); 6263 if (exists) 6264 goto free_eb; 6265 else 6266 goto again; 6267 } 6268 /* add one reference for the tree */ 6269 check_buffer_tree_ref(eb); 6270 set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); 6271 6272 /* 6273 * Now it's safe to unlock the pages because any calls to 6274 * btree_release_folio will correctly detect that a page belongs to a 6275 * live buffer and won't free them prematurely. 6276 */ 6277 for (i = 0; i < num_pages; i++) 6278 unlock_page(eb->pages[i]); 6279 return eb; 6280 6281 free_eb: 6282 WARN_ON(!atomic_dec_and_test(&eb->refs)); 6283 for (i = 0; i < num_pages; i++) { 6284 if (eb->pages[i]) 6285 unlock_page(eb->pages[i]); 6286 } 6287 6288 btrfs_release_extent_buffer(eb); 6289 return exists; 6290 } 6291 6292 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) 6293 { 6294 struct extent_buffer *eb = 6295 container_of(head, struct extent_buffer, rcu_head); 6296 6297 __free_extent_buffer(eb); 6298 } 6299 6300 static int release_extent_buffer(struct extent_buffer *eb) 6301 __releases(&eb->refs_lock) 6302 { 6303 lockdep_assert_held(&eb->refs_lock); 6304 6305 WARN_ON(atomic_read(&eb->refs) == 0); 6306 if (atomic_dec_and_test(&eb->refs)) { 6307 if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) { 6308 struct btrfs_fs_info *fs_info = eb->fs_info; 6309 6310 spin_unlock(&eb->refs_lock); 6311 6312 spin_lock(&fs_info->buffer_lock); 6313 radix_tree_delete(&fs_info->buffer_radix, 6314 eb->start >> fs_info->sectorsize_bits); 6315 spin_unlock(&fs_info->buffer_lock); 6316 } else { 6317 spin_unlock(&eb->refs_lock); 6318 } 6319 6320 btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list); 6321 /* Should be safe to release our pages at this point */ 6322 btrfs_release_extent_buffer_pages(eb); 6323 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 6324 if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) { 6325 __free_extent_buffer(eb); 6326 return 1; 6327 } 6328 #endif 6329 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); 6330 return 1; 6331 } 6332 spin_unlock(&eb->refs_lock); 6333 6334 return 0; 6335 } 6336 6337 void free_extent_buffer(struct extent_buffer *eb) 6338 { 6339 int refs; 6340 int old; 6341 if (!eb) 6342 return; 6343 6344 while (1) { 6345 refs = atomic_read(&eb->refs); 6346 if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3) 6347 || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && 6348 refs == 1)) 6349 break; 6350 old = atomic_cmpxchg(&eb->refs, refs, refs - 1); 6351 if (old == refs) 6352 return; 6353 } 6354 6355 spin_lock(&eb->refs_lock); 6356 if (atomic_read(&eb->refs) == 2 && 6357 test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && 6358 !extent_buffer_under_io(eb) && 6359 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 6360 atomic_dec(&eb->refs); 6361 6362 /* 6363 * I know this is terrible, but it's temporary until we stop tracking 6364 * the uptodate bits and such for the extent buffers. 6365 */ 6366 release_extent_buffer(eb); 6367 } 6368 6369 void free_extent_buffer_stale(struct extent_buffer *eb) 6370 { 6371 if (!eb) 6372 return; 6373 6374 spin_lock(&eb->refs_lock); 6375 set_bit(EXTENT_BUFFER_STALE, &eb->bflags); 6376 6377 if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) && 6378 test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 6379 atomic_dec(&eb->refs); 6380 release_extent_buffer(eb); 6381 } 6382 6383 static void btree_clear_page_dirty(struct page *page) 6384 { 6385 ASSERT(PageDirty(page)); 6386 ASSERT(PageLocked(page)); 6387 clear_page_dirty_for_io(page); 6388 xa_lock_irq(&page->mapping->i_pages); 6389 if (!PageDirty(page)) 6390 __xa_clear_mark(&page->mapping->i_pages, 6391 page_index(page), PAGECACHE_TAG_DIRTY); 6392 xa_unlock_irq(&page->mapping->i_pages); 6393 } 6394 6395 static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb) 6396 { 6397 struct btrfs_fs_info *fs_info = eb->fs_info; 6398 struct page *page = eb->pages[0]; 6399 bool last; 6400 6401 /* btree_clear_page_dirty() needs page locked */ 6402 lock_page(page); 6403 last = btrfs_subpage_clear_and_test_dirty(fs_info, page, eb->start, 6404 eb->len); 6405 if (last) 6406 btree_clear_page_dirty(page); 6407 unlock_page(page); 6408 WARN_ON(atomic_read(&eb->refs) == 0); 6409 } 6410 6411 void clear_extent_buffer_dirty(const struct extent_buffer *eb) 6412 { 6413 int i; 6414 int num_pages; 6415 struct page *page; 6416 6417 if (eb->fs_info->nodesize < PAGE_SIZE) 6418 return clear_subpage_extent_buffer_dirty(eb); 6419 6420 num_pages = num_extent_pages(eb); 6421 6422 for (i = 0; i < num_pages; i++) { 6423 page = eb->pages[i]; 6424 if (!PageDirty(page)) 6425 continue; 6426 lock_page(page); 6427 btree_clear_page_dirty(page); 6428 ClearPageError(page); 6429 unlock_page(page); 6430 } 6431 WARN_ON(atomic_read(&eb->refs) == 0); 6432 } 6433 6434 bool set_extent_buffer_dirty(struct extent_buffer *eb) 6435 { 6436 int i; 6437 int num_pages; 6438 bool was_dirty; 6439 6440 check_buffer_tree_ref(eb); 6441 6442 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); 6443 6444 num_pages = num_extent_pages(eb); 6445 WARN_ON(atomic_read(&eb->refs) == 0); 6446 WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); 6447 6448 if (!was_dirty) { 6449 bool subpage = eb->fs_info->nodesize < PAGE_SIZE; 6450 6451 /* 6452 * For subpage case, we can have other extent buffers in the 6453 * same page, and in clear_subpage_extent_buffer_dirty() we 6454 * have to clear page dirty without subpage lock held. 6455 * This can cause race where our page gets dirty cleared after 6456 * we just set it. 6457 * 6458 * Thankfully, clear_subpage_extent_buffer_dirty() has locked 6459 * its page for other reasons, we can use page lock to prevent 6460 * the above race. 6461 */ 6462 if (subpage) 6463 lock_page(eb->pages[0]); 6464 for (i = 0; i < num_pages; i++) 6465 btrfs_page_set_dirty(eb->fs_info, eb->pages[i], 6466 eb->start, eb->len); 6467 if (subpage) 6468 unlock_page(eb->pages[0]); 6469 } 6470 #ifdef CONFIG_BTRFS_DEBUG 6471 for (i = 0; i < num_pages; i++) 6472 ASSERT(PageDirty(eb->pages[i])); 6473 #endif 6474 6475 return was_dirty; 6476 } 6477 6478 void clear_extent_buffer_uptodate(struct extent_buffer *eb) 6479 { 6480 struct btrfs_fs_info *fs_info = eb->fs_info; 6481 struct page *page; 6482 int num_pages; 6483 int i; 6484 6485 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 6486 num_pages = num_extent_pages(eb); 6487 for (i = 0; i < num_pages; i++) { 6488 page = eb->pages[i]; 6489 if (!page) 6490 continue; 6491 6492 /* 6493 * This is special handling for metadata subpage, as regular 6494 * btrfs_is_subpage() can not handle cloned/dummy metadata. 6495 */ 6496 if (fs_info->nodesize >= PAGE_SIZE) 6497 ClearPageUptodate(page); 6498 else 6499 btrfs_subpage_clear_uptodate(fs_info, page, eb->start, 6500 eb->len); 6501 } 6502 } 6503 6504 void set_extent_buffer_uptodate(struct extent_buffer *eb) 6505 { 6506 struct btrfs_fs_info *fs_info = eb->fs_info; 6507 struct page *page; 6508 int num_pages; 6509 int i; 6510 6511 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 6512 num_pages = num_extent_pages(eb); 6513 for (i = 0; i < num_pages; i++) { 6514 page = eb->pages[i]; 6515 6516 /* 6517 * This is special handling for metadata subpage, as regular 6518 * btrfs_is_subpage() can not handle cloned/dummy metadata. 6519 */ 6520 if (fs_info->nodesize >= PAGE_SIZE) 6521 SetPageUptodate(page); 6522 else 6523 btrfs_subpage_set_uptodate(fs_info, page, eb->start, 6524 eb->len); 6525 } 6526 } 6527 6528 static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, 6529 int mirror_num) 6530 { 6531 struct btrfs_fs_info *fs_info = eb->fs_info; 6532 struct extent_io_tree *io_tree; 6533 struct page *page = eb->pages[0]; 6534 struct btrfs_bio_ctrl bio_ctrl = { 6535 .mirror_num = mirror_num, 6536 }; 6537 int ret = 0; 6538 6539 ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)); 6540 ASSERT(PagePrivate(page)); 6541 io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; 6542 6543 if (wait == WAIT_NONE) { 6544 if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1)) 6545 return -EAGAIN; 6546 } else { 6547 ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1); 6548 if (ret < 0) 6549 return ret; 6550 } 6551 6552 ret = 0; 6553 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) || 6554 PageUptodate(page) || 6555 btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) { 6556 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 6557 unlock_extent(io_tree, eb->start, eb->start + eb->len - 1); 6558 return ret; 6559 } 6560 6561 clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); 6562 eb->read_mirror = 0; 6563 atomic_set(&eb->io_pages, 1); 6564 check_buffer_tree_ref(eb); 6565 btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len); 6566 6567 btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len); 6568 ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl, 6569 page, eb->start, eb->len, 6570 eb->start - page_offset(page), 6571 end_bio_extent_readpage, 0, true); 6572 if (ret) { 6573 /* 6574 * In the endio function, if we hit something wrong we will 6575 * increase the io_pages, so here we need to decrease it for 6576 * error path. 6577 */ 6578 atomic_dec(&eb->io_pages); 6579 } 6580 submit_one_bio(&bio_ctrl); 6581 if (ret || wait != WAIT_COMPLETE) 6582 return ret; 6583 6584 wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, EXTENT_LOCKED); 6585 if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 6586 ret = -EIO; 6587 return ret; 6588 } 6589 6590 int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) 6591 { 6592 int i; 6593 struct page *page; 6594 int err; 6595 int ret = 0; 6596 int locked_pages = 0; 6597 int all_uptodate = 1; 6598 int num_pages; 6599 unsigned long num_reads = 0; 6600 struct btrfs_bio_ctrl bio_ctrl = { 6601 .mirror_num = mirror_num, 6602 }; 6603 6604 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 6605 return 0; 6606 6607 /* 6608 * We could have had EXTENT_BUFFER_UPTODATE cleared by the write 6609 * operation, which could potentially still be in flight. In this case 6610 * we simply want to return an error. 6611 */ 6612 if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))) 6613 return -EIO; 6614 6615 if (eb->fs_info->nodesize < PAGE_SIZE) 6616 return read_extent_buffer_subpage(eb, wait, mirror_num); 6617 6618 num_pages = num_extent_pages(eb); 6619 for (i = 0; i < num_pages; i++) { 6620 page = eb->pages[i]; 6621 if (wait == WAIT_NONE) { 6622 /* 6623 * WAIT_NONE is only utilized by readahead. If we can't 6624 * acquire the lock atomically it means either the eb 6625 * is being read out or under modification. 6626 * Either way the eb will be or has been cached, 6627 * readahead can exit safely. 6628 */ 6629 if (!trylock_page(page)) 6630 goto unlock_exit; 6631 } else { 6632 lock_page(page); 6633 } 6634 locked_pages++; 6635 } 6636 /* 6637 * We need to firstly lock all pages to make sure that 6638 * the uptodate bit of our pages won't be affected by 6639 * clear_extent_buffer_uptodate(). 6640 */ 6641 for (i = 0; i < num_pages; i++) { 6642 page = eb->pages[i]; 6643 if (!PageUptodate(page)) { 6644 num_reads++; 6645 all_uptodate = 0; 6646 } 6647 } 6648 6649 if (all_uptodate) { 6650 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 6651 goto unlock_exit; 6652 } 6653 6654 clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); 6655 eb->read_mirror = 0; 6656 atomic_set(&eb->io_pages, num_reads); 6657 /* 6658 * It is possible for release_folio to clear the TREE_REF bit before we 6659 * set io_pages. See check_buffer_tree_ref for a more detailed comment. 6660 */ 6661 check_buffer_tree_ref(eb); 6662 for (i = 0; i < num_pages; i++) { 6663 page = eb->pages[i]; 6664 6665 if (!PageUptodate(page)) { 6666 if (ret) { 6667 atomic_dec(&eb->io_pages); 6668 unlock_page(page); 6669 continue; 6670 } 6671 6672 ClearPageError(page); 6673 err = submit_extent_page(REQ_OP_READ, NULL, 6674 &bio_ctrl, page, page_offset(page), 6675 PAGE_SIZE, 0, end_bio_extent_readpage, 6676 0, false); 6677 if (err) { 6678 /* 6679 * We failed to submit the bio so it's the 6680 * caller's responsibility to perform cleanup 6681 * i.e unlock page/set error bit. 6682 */ 6683 ret = err; 6684 SetPageError(page); 6685 unlock_page(page); 6686 atomic_dec(&eb->io_pages); 6687 } 6688 } else { 6689 unlock_page(page); 6690 } 6691 } 6692 6693 submit_one_bio(&bio_ctrl); 6694 6695 if (ret || wait != WAIT_COMPLETE) 6696 return ret; 6697 6698 for (i = 0; i < num_pages; i++) { 6699 page = eb->pages[i]; 6700 wait_on_page_locked(page); 6701 if (!PageUptodate(page)) 6702 ret = -EIO; 6703 } 6704 6705 return ret; 6706 6707 unlock_exit: 6708 while (locked_pages > 0) { 6709 locked_pages--; 6710 page = eb->pages[locked_pages]; 6711 unlock_page(page); 6712 } 6713 return ret; 6714 } 6715 6716 static bool report_eb_range(const struct extent_buffer *eb, unsigned long start, 6717 unsigned long len) 6718 { 6719 btrfs_warn(eb->fs_info, 6720 "access to eb bytenr %llu len %lu out of range start %lu len %lu", 6721 eb->start, eb->len, start, len); 6722 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); 6723 6724 return true; 6725 } 6726 6727 /* 6728 * Check if the [start, start + len) range is valid before reading/writing 6729 * the eb. 6730 * NOTE: @start and @len are offset inside the eb, not logical address. 6731 * 6732 * Caller should not touch the dst/src memory if this function returns error. 6733 */ 6734 static inline int check_eb_range(const struct extent_buffer *eb, 6735 unsigned long start, unsigned long len) 6736 { 6737 unsigned long offset; 6738 6739 /* start, start + len should not go beyond eb->len nor overflow */ 6740 if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len)) 6741 return report_eb_range(eb, start, len); 6742 6743 return false; 6744 } 6745 6746 void read_extent_buffer(const struct extent_buffer *eb, void *dstv, 6747 unsigned long start, unsigned long len) 6748 { 6749 size_t cur; 6750 size_t offset; 6751 struct page *page; 6752 char *kaddr; 6753 char *dst = (char *)dstv; 6754 unsigned long i = get_eb_page_index(start); 6755 6756 if (check_eb_range(eb, start, len)) 6757 return; 6758 6759 offset = get_eb_offset_in_page(eb, start); 6760 6761 while (len > 0) { 6762 page = eb->pages[i]; 6763 6764 cur = min(len, (PAGE_SIZE - offset)); 6765 kaddr = page_address(page); 6766 memcpy(dst, kaddr + offset, cur); 6767 6768 dst += cur; 6769 len -= cur; 6770 offset = 0; 6771 i++; 6772 } 6773 } 6774 6775 int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, 6776 void __user *dstv, 6777 unsigned long start, unsigned long len) 6778 { 6779 size_t cur; 6780 size_t offset; 6781 struct page *page; 6782 char *kaddr; 6783 char __user *dst = (char __user *)dstv; 6784 unsigned long i = get_eb_page_index(start); 6785 int ret = 0; 6786 6787 WARN_ON(start > eb->len); 6788 WARN_ON(start + len > eb->start + eb->len); 6789 6790 offset = get_eb_offset_in_page(eb, start); 6791 6792 while (len > 0) { 6793 page = eb->pages[i]; 6794 6795 cur = min(len, (PAGE_SIZE - offset)); 6796 kaddr = page_address(page); 6797 if (copy_to_user_nofault(dst, kaddr + offset, cur)) { 6798 ret = -EFAULT; 6799 break; 6800 } 6801 6802 dst += cur; 6803 len -= cur; 6804 offset = 0; 6805 i++; 6806 } 6807 6808 return ret; 6809 } 6810 6811 int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, 6812 unsigned long start, unsigned long len) 6813 { 6814 size_t cur; 6815 size_t offset; 6816 struct page *page; 6817 char *kaddr; 6818 char *ptr = (char *)ptrv; 6819 unsigned long i = get_eb_page_index(start); 6820 int ret = 0; 6821 6822 if (check_eb_range(eb, start, len)) 6823 return -EINVAL; 6824 6825 offset = get_eb_offset_in_page(eb, start); 6826 6827 while (len > 0) { 6828 page = eb->pages[i]; 6829 6830 cur = min(len, (PAGE_SIZE - offset)); 6831 6832 kaddr = page_address(page); 6833 ret = memcmp(ptr, kaddr + offset, cur); 6834 if (ret) 6835 break; 6836 6837 ptr += cur; 6838 len -= cur; 6839 offset = 0; 6840 i++; 6841 } 6842 return ret; 6843 } 6844 6845 /* 6846 * Check that the extent buffer is uptodate. 6847 * 6848 * For regular sector size == PAGE_SIZE case, check if @page is uptodate. 6849 * For subpage case, check if the range covered by the eb has EXTENT_UPTODATE. 6850 */ 6851 static void assert_eb_page_uptodate(const struct extent_buffer *eb, 6852 struct page *page) 6853 { 6854 struct btrfs_fs_info *fs_info = eb->fs_info; 6855 6856 /* 6857 * If we are using the commit root we could potentially clear a page 6858 * Uptodate while we're using the extent buffer that we've previously 6859 * looked up. We don't want to complain in this case, as the page was 6860 * valid before, we just didn't write it out. Instead we want to catch 6861 * the case where we didn't actually read the block properly, which 6862 * would have !PageUptodate && !PageError, as we clear PageError before 6863 * reading. 6864 */ 6865 if (fs_info->nodesize < PAGE_SIZE) { 6866 bool uptodate, error; 6867 6868 uptodate = btrfs_subpage_test_uptodate(fs_info, page, 6869 eb->start, eb->len); 6870 error = btrfs_subpage_test_error(fs_info, page, eb->start, eb->len); 6871 WARN_ON(!uptodate && !error); 6872 } else { 6873 WARN_ON(!PageUptodate(page) && !PageError(page)); 6874 } 6875 } 6876 6877 void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb, 6878 const void *srcv) 6879 { 6880 char *kaddr; 6881 6882 assert_eb_page_uptodate(eb, eb->pages[0]); 6883 kaddr = page_address(eb->pages[0]) + 6884 get_eb_offset_in_page(eb, offsetof(struct btrfs_header, 6885 chunk_tree_uuid)); 6886 memcpy(kaddr, srcv, BTRFS_FSID_SIZE); 6887 } 6888 6889 void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv) 6890 { 6891 char *kaddr; 6892 6893 assert_eb_page_uptodate(eb, eb->pages[0]); 6894 kaddr = page_address(eb->pages[0]) + 6895 get_eb_offset_in_page(eb, offsetof(struct btrfs_header, fsid)); 6896 memcpy(kaddr, srcv, BTRFS_FSID_SIZE); 6897 } 6898 6899 void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, 6900 unsigned long start, unsigned long len) 6901 { 6902 size_t cur; 6903 size_t offset; 6904 struct page *page; 6905 char *kaddr; 6906 char *src = (char *)srcv; 6907 unsigned long i = get_eb_page_index(start); 6908 6909 WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)); 6910 6911 if (check_eb_range(eb, start, len)) 6912 return; 6913 6914 offset = get_eb_offset_in_page(eb, start); 6915 6916 while (len > 0) { 6917 page = eb->pages[i]; 6918 assert_eb_page_uptodate(eb, page); 6919 6920 cur = min(len, PAGE_SIZE - offset); 6921 kaddr = page_address(page); 6922 memcpy(kaddr + offset, src, cur); 6923 6924 src += cur; 6925 len -= cur; 6926 offset = 0; 6927 i++; 6928 } 6929 } 6930 6931 void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, 6932 unsigned long len) 6933 { 6934 size_t cur; 6935 size_t offset; 6936 struct page *page; 6937 char *kaddr; 6938 unsigned long i = get_eb_page_index(start); 6939 6940 if (check_eb_range(eb, start, len)) 6941 return; 6942 6943 offset = get_eb_offset_in_page(eb, start); 6944 6945 while (len > 0) { 6946 page = eb->pages[i]; 6947 assert_eb_page_uptodate(eb, page); 6948 6949 cur = min(len, PAGE_SIZE - offset); 6950 kaddr = page_address(page); 6951 memset(kaddr + offset, 0, cur); 6952 6953 len -= cur; 6954 offset = 0; 6955 i++; 6956 } 6957 } 6958 6959 void copy_extent_buffer_full(const struct extent_buffer *dst, 6960 const struct extent_buffer *src) 6961 { 6962 int i; 6963 int num_pages; 6964 6965 ASSERT(dst->len == src->len); 6966 6967 if (dst->fs_info->nodesize >= PAGE_SIZE) { 6968 num_pages = num_extent_pages(dst); 6969 for (i = 0; i < num_pages; i++) 6970 copy_page(page_address(dst->pages[i]), 6971 page_address(src->pages[i])); 6972 } else { 6973 size_t src_offset = get_eb_offset_in_page(src, 0); 6974 size_t dst_offset = get_eb_offset_in_page(dst, 0); 6975 6976 ASSERT(src->fs_info->nodesize < PAGE_SIZE); 6977 memcpy(page_address(dst->pages[0]) + dst_offset, 6978 page_address(src->pages[0]) + src_offset, 6979 src->len); 6980 } 6981 } 6982 6983 void copy_extent_buffer(const struct extent_buffer *dst, 6984 const struct extent_buffer *src, 6985 unsigned long dst_offset, unsigned long src_offset, 6986 unsigned long len) 6987 { 6988 u64 dst_len = dst->len; 6989 size_t cur; 6990 size_t offset; 6991 struct page *page; 6992 char *kaddr; 6993 unsigned long i = get_eb_page_index(dst_offset); 6994 6995 if (check_eb_range(dst, dst_offset, len) || 6996 check_eb_range(src, src_offset, len)) 6997 return; 6998 6999 WARN_ON(src->len != dst_len); 7000 7001 offset = get_eb_offset_in_page(dst, dst_offset); 7002 7003 while (len > 0) { 7004 page = dst->pages[i]; 7005 assert_eb_page_uptodate(dst, page); 7006 7007 cur = min(len, (unsigned long)(PAGE_SIZE - offset)); 7008 7009 kaddr = page_address(page); 7010 read_extent_buffer(src, kaddr + offset, src_offset, cur); 7011 7012 src_offset += cur; 7013 len -= cur; 7014 offset = 0; 7015 i++; 7016 } 7017 } 7018 7019 /* 7020 * eb_bitmap_offset() - calculate the page and offset of the byte containing the 7021 * given bit number 7022 * @eb: the extent buffer 7023 * @start: offset of the bitmap item in the extent buffer 7024 * @nr: bit number 7025 * @page_index: return index of the page in the extent buffer that contains the 7026 * given bit number 7027 * @page_offset: return offset into the page given by page_index 7028 * 7029 * This helper hides the ugliness of finding the byte in an extent buffer which 7030 * contains a given bit. 7031 */ 7032 static inline void eb_bitmap_offset(const struct extent_buffer *eb, 7033 unsigned long start, unsigned long nr, 7034 unsigned long *page_index, 7035 size_t *page_offset) 7036 { 7037 size_t byte_offset = BIT_BYTE(nr); 7038 size_t offset; 7039 7040 /* 7041 * The byte we want is the offset of the extent buffer + the offset of 7042 * the bitmap item in the extent buffer + the offset of the byte in the 7043 * bitmap item. 7044 */ 7045 offset = start + offset_in_page(eb->start) + byte_offset; 7046 7047 *page_index = offset >> PAGE_SHIFT; 7048 *page_offset = offset_in_page(offset); 7049 } 7050 7051 /** 7052 * extent_buffer_test_bit - determine whether a bit in a bitmap item is set 7053 * @eb: the extent buffer 7054 * @start: offset of the bitmap item in the extent buffer 7055 * @nr: bit number to test 7056 */ 7057 int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, 7058 unsigned long nr) 7059 { 7060 u8 *kaddr; 7061 struct page *page; 7062 unsigned long i; 7063 size_t offset; 7064 7065 eb_bitmap_offset(eb, start, nr, &i, &offset); 7066 page = eb->pages[i]; 7067 assert_eb_page_uptodate(eb, page); 7068 kaddr = page_address(page); 7069 return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); 7070 } 7071 7072 /** 7073 * extent_buffer_bitmap_set - set an area of a bitmap 7074 * @eb: the extent buffer 7075 * @start: offset of the bitmap item in the extent buffer 7076 * @pos: bit number of the first bit 7077 * @len: number of bits to set 7078 */ 7079 void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start, 7080 unsigned long pos, unsigned long len) 7081 { 7082 u8 *kaddr; 7083 struct page *page; 7084 unsigned long i; 7085 size_t offset; 7086 const unsigned int size = pos + len; 7087 int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE); 7088 u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos); 7089 7090 eb_bitmap_offset(eb, start, pos, &i, &offset); 7091 page = eb->pages[i]; 7092 assert_eb_page_uptodate(eb, page); 7093 kaddr = page_address(page); 7094 7095 while (len >= bits_to_set) { 7096 kaddr[offset] |= mask_to_set; 7097 len -= bits_to_set; 7098 bits_to_set = BITS_PER_BYTE; 7099 mask_to_set = ~0; 7100 if (++offset >= PAGE_SIZE && len > 0) { 7101 offset = 0; 7102 page = eb->pages[++i]; 7103 assert_eb_page_uptodate(eb, page); 7104 kaddr = page_address(page); 7105 } 7106 } 7107 if (len) { 7108 mask_to_set &= BITMAP_LAST_BYTE_MASK(size); 7109 kaddr[offset] |= mask_to_set; 7110 } 7111 } 7112 7113 7114 /** 7115 * extent_buffer_bitmap_clear - clear an area of a bitmap 7116 * @eb: the extent buffer 7117 * @start: offset of the bitmap item in the extent buffer 7118 * @pos: bit number of the first bit 7119 * @len: number of bits to clear 7120 */ 7121 void extent_buffer_bitmap_clear(const struct extent_buffer *eb, 7122 unsigned long start, unsigned long pos, 7123 unsigned long len) 7124 { 7125 u8 *kaddr; 7126 struct page *page; 7127 unsigned long i; 7128 size_t offset; 7129 const unsigned int size = pos + len; 7130 int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE); 7131 u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos); 7132 7133 eb_bitmap_offset(eb, start, pos, &i, &offset); 7134 page = eb->pages[i]; 7135 assert_eb_page_uptodate(eb, page); 7136 kaddr = page_address(page); 7137 7138 while (len >= bits_to_clear) { 7139 kaddr[offset] &= ~mask_to_clear; 7140 len -= bits_to_clear; 7141 bits_to_clear = BITS_PER_BYTE; 7142 mask_to_clear = ~0; 7143 if (++offset >= PAGE_SIZE && len > 0) { 7144 offset = 0; 7145 page = eb->pages[++i]; 7146 assert_eb_page_uptodate(eb, page); 7147 kaddr = page_address(page); 7148 } 7149 } 7150 if (len) { 7151 mask_to_clear &= BITMAP_LAST_BYTE_MASK(size); 7152 kaddr[offset] &= ~mask_to_clear; 7153 } 7154 } 7155 7156 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) 7157 { 7158 unsigned long distance = (src > dst) ? src - dst : dst - src; 7159 return distance < len; 7160 } 7161 7162 static void copy_pages(struct page *dst_page, struct page *src_page, 7163 unsigned long dst_off, unsigned long src_off, 7164 unsigned long len) 7165 { 7166 char *dst_kaddr = page_address(dst_page); 7167 char *src_kaddr; 7168 int must_memmove = 0; 7169 7170 if (dst_page != src_page) { 7171 src_kaddr = page_address(src_page); 7172 } else { 7173 src_kaddr = dst_kaddr; 7174 if (areas_overlap(src_off, dst_off, len)) 7175 must_memmove = 1; 7176 } 7177 7178 if (must_memmove) 7179 memmove(dst_kaddr + dst_off, src_kaddr + src_off, len); 7180 else 7181 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); 7182 } 7183 7184 void memcpy_extent_buffer(const struct extent_buffer *dst, 7185 unsigned long dst_offset, unsigned long src_offset, 7186 unsigned long len) 7187 { 7188 size_t cur; 7189 size_t dst_off_in_page; 7190 size_t src_off_in_page; 7191 unsigned long dst_i; 7192 unsigned long src_i; 7193 7194 if (check_eb_range(dst, dst_offset, len) || 7195 check_eb_range(dst, src_offset, len)) 7196 return; 7197 7198 while (len > 0) { 7199 dst_off_in_page = get_eb_offset_in_page(dst, dst_offset); 7200 src_off_in_page = get_eb_offset_in_page(dst, src_offset); 7201 7202 dst_i = get_eb_page_index(dst_offset); 7203 src_i = get_eb_page_index(src_offset); 7204 7205 cur = min(len, (unsigned long)(PAGE_SIZE - 7206 src_off_in_page)); 7207 cur = min_t(unsigned long, cur, 7208 (unsigned long)(PAGE_SIZE - dst_off_in_page)); 7209 7210 copy_pages(dst->pages[dst_i], dst->pages[src_i], 7211 dst_off_in_page, src_off_in_page, cur); 7212 7213 src_offset += cur; 7214 dst_offset += cur; 7215 len -= cur; 7216 } 7217 } 7218 7219 void memmove_extent_buffer(const struct extent_buffer *dst, 7220 unsigned long dst_offset, unsigned long src_offset, 7221 unsigned long len) 7222 { 7223 size_t cur; 7224 size_t dst_off_in_page; 7225 size_t src_off_in_page; 7226 unsigned long dst_end = dst_offset + len - 1; 7227 unsigned long src_end = src_offset + len - 1; 7228 unsigned long dst_i; 7229 unsigned long src_i; 7230 7231 if (check_eb_range(dst, dst_offset, len) || 7232 check_eb_range(dst, src_offset, len)) 7233 return; 7234 if (dst_offset < src_offset) { 7235 memcpy_extent_buffer(dst, dst_offset, src_offset, len); 7236 return; 7237 } 7238 while (len > 0) { 7239 dst_i = get_eb_page_index(dst_end); 7240 src_i = get_eb_page_index(src_end); 7241 7242 dst_off_in_page = get_eb_offset_in_page(dst, dst_end); 7243 src_off_in_page = get_eb_offset_in_page(dst, src_end); 7244 7245 cur = min_t(unsigned long, len, src_off_in_page + 1); 7246 cur = min(cur, dst_off_in_page + 1); 7247 copy_pages(dst->pages[dst_i], dst->pages[src_i], 7248 dst_off_in_page - cur + 1, 7249 src_off_in_page - cur + 1, cur); 7250 7251 dst_end -= cur; 7252 src_end -= cur; 7253 len -= cur; 7254 } 7255 } 7256 7257 #define GANG_LOOKUP_SIZE 16 7258 static struct extent_buffer *get_next_extent_buffer( 7259 struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr) 7260 { 7261 struct extent_buffer *gang[GANG_LOOKUP_SIZE]; 7262 struct extent_buffer *found = NULL; 7263 u64 page_start = page_offset(page); 7264 u64 cur = page_start; 7265 7266 ASSERT(in_range(bytenr, page_start, PAGE_SIZE)); 7267 lockdep_assert_held(&fs_info->buffer_lock); 7268 7269 while (cur < page_start + PAGE_SIZE) { 7270 int ret; 7271 int i; 7272 7273 ret = radix_tree_gang_lookup(&fs_info->buffer_radix, 7274 (void **)gang, cur >> fs_info->sectorsize_bits, 7275 min_t(unsigned int, GANG_LOOKUP_SIZE, 7276 PAGE_SIZE / fs_info->nodesize)); 7277 if (ret == 0) 7278 goto out; 7279 for (i = 0; i < ret; i++) { 7280 /* Already beyond page end */ 7281 if (gang[i]->start >= page_start + PAGE_SIZE) 7282 goto out; 7283 /* Found one */ 7284 if (gang[i]->start >= bytenr) { 7285 found = gang[i]; 7286 goto out; 7287 } 7288 } 7289 cur = gang[ret - 1]->start + gang[ret - 1]->len; 7290 } 7291 out: 7292 return found; 7293 } 7294 7295 static int try_release_subpage_extent_buffer(struct page *page) 7296 { 7297 struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); 7298 u64 cur = page_offset(page); 7299 const u64 end = page_offset(page) + PAGE_SIZE; 7300 int ret; 7301 7302 while (cur < end) { 7303 struct extent_buffer *eb = NULL; 7304 7305 /* 7306 * Unlike try_release_extent_buffer() which uses page->private 7307 * to grab buffer, for subpage case we rely on radix tree, thus 7308 * we need to ensure radix tree consistency. 7309 * 7310 * We also want an atomic snapshot of the radix tree, thus go 7311 * with spinlock rather than RCU. 7312 */ 7313 spin_lock(&fs_info->buffer_lock); 7314 eb = get_next_extent_buffer(fs_info, page, cur); 7315 if (!eb) { 7316 /* No more eb in the page range after or at cur */ 7317 spin_unlock(&fs_info->buffer_lock); 7318 break; 7319 } 7320 cur = eb->start + eb->len; 7321 7322 /* 7323 * The same as try_release_extent_buffer(), to ensure the eb 7324 * won't disappear out from under us. 7325 */ 7326 spin_lock(&eb->refs_lock); 7327 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { 7328 spin_unlock(&eb->refs_lock); 7329 spin_unlock(&fs_info->buffer_lock); 7330 break; 7331 } 7332 spin_unlock(&fs_info->buffer_lock); 7333 7334 /* 7335 * If tree ref isn't set then we know the ref on this eb is a 7336 * real ref, so just return, this eb will likely be freed soon 7337 * anyway. 7338 */ 7339 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { 7340 spin_unlock(&eb->refs_lock); 7341 break; 7342 } 7343 7344 /* 7345 * Here we don't care about the return value, we will always 7346 * check the page private at the end. And 7347 * release_extent_buffer() will release the refs_lock. 7348 */ 7349 release_extent_buffer(eb); 7350 } 7351 /* 7352 * Finally to check if we have cleared page private, as if we have 7353 * released all ebs in the page, the page private should be cleared now. 7354 */ 7355 spin_lock(&page->mapping->private_lock); 7356 if (!PagePrivate(page)) 7357 ret = 1; 7358 else 7359 ret = 0; 7360 spin_unlock(&page->mapping->private_lock); 7361 return ret; 7362 7363 } 7364 7365 int try_release_extent_buffer(struct page *page) 7366 { 7367 struct extent_buffer *eb; 7368 7369 if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) 7370 return try_release_subpage_extent_buffer(page); 7371 7372 /* 7373 * We need to make sure nobody is changing page->private, as we rely on 7374 * page->private as the pointer to extent buffer. 7375 */ 7376 spin_lock(&page->mapping->private_lock); 7377 if (!PagePrivate(page)) { 7378 spin_unlock(&page->mapping->private_lock); 7379 return 1; 7380 } 7381 7382 eb = (struct extent_buffer *)page->private; 7383 BUG_ON(!eb); 7384 7385 /* 7386 * This is a little awful but should be ok, we need to make sure that 7387 * the eb doesn't disappear out from under us while we're looking at 7388 * this page. 7389 */ 7390 spin_lock(&eb->refs_lock); 7391 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { 7392 spin_unlock(&eb->refs_lock); 7393 spin_unlock(&page->mapping->private_lock); 7394 return 0; 7395 } 7396 spin_unlock(&page->mapping->private_lock); 7397 7398 /* 7399 * If tree ref isn't set then we know the ref on this eb is a real ref, 7400 * so just return, this page will likely be freed soon anyway. 7401 */ 7402 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { 7403 spin_unlock(&eb->refs_lock); 7404 return 0; 7405 } 7406 7407 return release_extent_buffer(eb); 7408 } 7409 7410 /* 7411 * btrfs_readahead_tree_block - attempt to readahead a child block 7412 * @fs_info: the fs_info 7413 * @bytenr: bytenr to read 7414 * @owner_root: objectid of the root that owns this eb 7415 * @gen: generation for the uptodate check, can be 0 7416 * @level: level for the eb 7417 * 7418 * Attempt to readahead a tree block at @bytenr. If @gen is 0 then we do a 7419 * normal uptodate check of the eb, without checking the generation. If we have 7420 * to read the block we will not block on anything. 7421 */ 7422 void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, 7423 u64 bytenr, u64 owner_root, u64 gen, int level) 7424 { 7425 struct extent_buffer *eb; 7426 int ret; 7427 7428 eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level); 7429 if (IS_ERR(eb)) 7430 return; 7431 7432 if (btrfs_buffer_uptodate(eb, gen, 1)) { 7433 free_extent_buffer(eb); 7434 return; 7435 } 7436 7437 ret = read_extent_buffer_pages(eb, WAIT_NONE, 0); 7438 if (ret < 0) 7439 free_extent_buffer_stale(eb); 7440 else 7441 free_extent_buffer(eb); 7442 } 7443 7444 /* 7445 * btrfs_readahead_node_child - readahead a node's child block 7446 * @node: parent node we're reading from 7447 * @slot: slot in the parent node for the child we want to read 7448 * 7449 * A helper for btrfs_readahead_tree_block, we simply read the bytenr pointed at 7450 * the slot in the node provided. 7451 */ 7452 void btrfs_readahead_node_child(struct extent_buffer *node, int slot) 7453 { 7454 btrfs_readahead_tree_block(node->fs_info, 7455 btrfs_node_blockptr(node, slot), 7456 btrfs_header_owner(node), 7457 btrfs_node_ptr_generation(node, slot), 7458 btrfs_header_level(node) - 1); 7459 } 7460