#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/cleancache.h>
#include "extent_io.h"
#include "extent_map.h"
#include "ctree.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "check-integrity.h"
#include "locking.h"
#include "rcu-string.h"
#include "backref.h"
#include "transaction.h"

static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
static struct bio_set *btrfs_bioset;

/* Is this extent_state currently linked into an extent_io_tree's rbtree? */
static inline bool extent_state_in_tree(const struct extent_state *state)
{
	return !RB_EMPTY_NODE(&state->rb_node);
}

#ifdef CONFIG_BTRFS_DEBUG
/* Global lists of every live extent_buffer/extent_state, for leak checking */
static LIST_HEAD(buffers);
static LIST_HEAD(states);

static DEFINE_SPINLOCK(leak_lock);

/*
 * Track a newly allocated object on the given leak list.  irqsave locking
 * because allocation/free can happen from interrupt context.
 */
static inline
void btrfs_leak_debug_add(struct list_head *new, struct list_head *head)
{
	unsigned long flags;

	spin_lock_irqsave(&leak_lock, flags);
	list_add(new, head);
	spin_unlock_irqrestore(&leak_lock, flags);
}

/* Remove an object from its leak list when it is freed */
static inline
void btrfs_leak_debug_del(struct list_head *entry)
{
	unsigned long flags;

	spin_lock_irqsave(&leak_lock, flags);
	list_del(entry);
	spin_unlock_irqrestore(&leak_lock, flags);
}

/*
 * Called at module unload: complain about and free any extent states or
 * extent buffers that were never released.
 */
static inline
void btrfs_leak_debug_check(void)
{
	struct extent_state *state;
	struct extent_buffer *eb;

	while (!list_empty(&states)) {
		state = list_entry(states.next, struct extent_state, leak_list);
		pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
		       state->start, state->end, state->state,
		       extent_state_in_tree(state),
		       refcount_read(&state->refs));
		list_del(&state->leak_list);
		kmem_cache_free(extent_state_cache, state);
	}

	while (!list_empty(&buffers)) {
		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
		pr_err("BTRFS: buffer leak start %llu len %lu refs %d\n",
		       eb->start, eb->len, atomic_read(&eb->refs));
		list_del(&eb->leak_list);
		kmem_cache_free(extent_buffer_cache, eb);
	}
}

#define btrfs_debug_check_extent_io_range(tree, start, end)		\
	__btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
/*
 * Debug sanity check on a range handed to the extent io functions: warn
 * (ratelimited) about an end offset that looks odd, i.e. beyond the first
 * page, even, and not the last byte of the inode.
 */
static inline void __btrfs_debug_check_extent_io_range(const char *caller,
		struct extent_io_tree *tree, u64 start, u64 end)
{
	struct inode *inode;
	u64 isize;

	if (!tree->mapping)
		return;

	inode = tree->mapping->host;
	isize = i_size_read(inode);
	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
		btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
			"%s: ino %llu isize %llu odd range [%llu,%llu]",
			caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
	}
}
#else
#define btrfs_leak_debug_add(new, head)	do {} while (0)
#define btrfs_leak_debug_del(entry)	do {} while (0)
#define btrfs_leak_debug_check()	do {} while (0)
#define btrfs_debug_check_extent_io_range(c, s, e)	do {} while (0)
#endif

#define BUFFER_LRU_MAX 64

/* Key layout shared by all nodes in the extent state rbtree */
struct tree_entry {
	u64 start;
	u64 end;
	struct rb_node rb_node;
};

/* Per-call state threaded through the writepage/readpage machinery */
struct extent_page_data {
	struct bio *bio;
	struct extent_io_tree *tree;
	get_extent_t *get_extent;
	unsigned long bio_flags;

	/* tells writepage not to lock the state bits for this range
	 * it still does the unlocking
	 */
	unsigned int extent_locked:1;

	/* tells the submit_bio code to use REQ_SYNC */
	unsigned int sync_io:1;
};

/*
 * Record a state-bit transition of @state in @changeset, if one was
 * supplied.  @set is 1 when bits are being set and 0 when cleared; nothing
 * is recorded when the operation would not actually flip any of @bits.
 */
static void add_extent_changeset(struct extent_state *state, unsigned bits,
				 struct extent_changeset *changeset,
				 int set)
{
	int ret;

	if (!changeset)
		return;
	if (set && (state->state & bits) == bits)
		return;
	if (!set && (state->state & bits) == 0)
		return;
	changeset->bytes_changed += state->end - state->start + 1;
	ret = ulist_add(&changeset->range_changed, state->start, state->end,
			GFP_ATOMIC);
	/* ENOMEM */
	BUG_ON(ret < 0);
}

static noinline void flush_write_bio(void *data);

/* fs_info backing this tree, or NULL for trees with no mapping attached */
static inline struct btrfs_fs_info *
tree_fs_info(struct extent_io_tree *tree)
{
	if (!tree->mapping)
		return NULL;
	return btrfs_sb(tree->mapping->host->i_sb);
}

/* Module init: create the slab caches and the bioset used for btrfs bios */
int __init extent_io_init(void)
{
	extent_state_cache = kmem_cache_create("btrfs_extent_state",
			sizeof(struct extent_state), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!extent_state_cache)
		return -ENOMEM;

	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
			sizeof(struct extent_buffer), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!extent_buffer_cache)
		goto free_state_cache;

	btrfs_bioset = bioset_create(BIO_POOL_SIZE,
				     offsetof(struct btrfs_io_bio, bio));
	if (!btrfs_bioset)
		goto free_buffer_cache;

	if (bioset_integrity_create(btrfs_bioset, BIO_POOL_SIZE))
		goto free_bioset;

	return 0;

free_bioset:
	bioset_free(btrfs_bioset);
	btrfs_bioset = NULL;

free_buffer_cache:
	kmem_cache_destroy(extent_buffer_cache);
	extent_buffer_cache = NULL;

free_state_cache:
	kmem_cache_destroy(extent_state_cache);
	extent_state_cache = NULL;
	return -ENOMEM;
}

/* Module exit: report leaked objects, then tear down caches and bioset */
void extent_io_exit(void)
{
	btrfs_leak_debug_check();

	/*
	 * Make sure all delayed rcu free are flushed before we
	 * destroy caches.
	 */
	rcu_barrier();
	kmem_cache_destroy(extent_state_cache);
	kmem_cache_destroy(extent_buffer_cache);
	if (btrfs_bioset)
		bioset_free(btrfs_bioset);
}

/* Initialize an empty extent io tree attached to @mapping (may be NULL) */
void extent_io_tree_init(struct extent_io_tree *tree,
			 struct address_space *mapping)
{
	tree->state = RB_ROOT;
	tree->ops = NULL;
	tree->dirty_bytes = 0;
	spin_lock_init(&tree->lock);
	tree->mapping = mapping;
}

/* Allocate and initialize one extent_state with a single reference held */
static struct extent_state *alloc_extent_state(gfp_t mask)
{
	struct extent_state *state;

	/*
	 * The given mask might be not appropriate for the slab allocator,
	 * drop the unsupported bits
	 */
	mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
	state = kmem_cache_alloc(extent_state_cache, mask);
	if (!state)
		return state;
	state->state = 0;
	state->failrec = NULL;
	RB_CLEAR_NODE(&state->rb_node);
	btrfs_leak_debug_add(&state->leak_list, &states);
	refcount_set(&state->refs, 1);
	init_waitqueue_head(&state->wq);
	trace_alloc_extent_state(state, mask, _RET_IP_);
	return state;
}

/* Drop one reference on @state; frees it when the count reaches zero */
void free_extent_state(struct extent_state *state)
{
	if (!state)
		return;
	if (refcount_dec_and_test(&state->refs)) {
		/* a state still linked into a tree must never be freed */
		WARN_ON(extent_state_in_tree(state));
		btrfs_leak_debug_del(&state->leak_list);
		trace_free_extent_state(state, _RET_IP_);
		kmem_cache_free(extent_state_cache, state);
	}
}

/*
 * Link @node into @root, keyed by @offset (the range end).  @search_start,
 * when non-NULL, is a hint node to start the descent from instead of the
 * root.  When both @p_in and @parent_in are given, they supply a
 * precomputed link/parent pair and no search is done at all.
 *
 * Returns NULL on success, or the existing node whose range already
 * covers @offset (in which case nothing was inserted).
 */
static struct rb_node *tree_insert(struct rb_root *root,
				   struct rb_node *search_start,
				   u64 offset,
				   struct rb_node *node,
				   struct rb_node ***p_in,
				   struct rb_node **parent_in)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct tree_entry *entry;

	if (p_in && parent_in) {
		p = *p_in;
		parent = *parent_in;
		goto do_insert;
	}

	p = search_start ? &search_start : &root->rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct tree_entry, rb_node);

		if (offset < entry->start)
			p = &(*p)->rb_left;
		else if (offset > entry->end)
			p = &(*p)->rb_right;
		else
			return parent;
	}

do_insert:
	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

/*
 * Find the tree entry whose range contains @offset.  Returns the matching
 * node, or NULL on a miss.  On a miss the optional out parameters are
 * filled in: @prev_ret/@next_ret with the entries on either side of the
 * hole, and @p_ret/@parent_ret with the rbtree link/parent where a new
 * entry for @offset would be linked (for a follow-up tree_insert()).
 */
static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
				      struct rb_node **prev_ret,
				      struct rb_node **next_ret,
				      struct rb_node ***p_ret,
				      struct rb_node **parent_ret)
{
	struct rb_root *root = &tree->state;
	struct rb_node **n = &root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *orig_prev = NULL;
	struct tree_entry *entry;
	struct tree_entry *prev_entry = NULL;

	while (*n) {
		prev = *n;
		entry = rb_entry(prev, struct tree_entry, rb_node);
		prev_entry = entry;

		if (offset < entry->start)
			n = &(*n)->rb_left;
		else if (offset > entry->end)
			n = &(*n)->rb_right;
		else
			return *n;
	}

	if (p_ret)
		*p_ret = n;
	if (parent_ret)
		*parent_ret = prev;

	if (prev_ret) {
		orig_prev = prev;
		/* walk forward to the first entry ending at or after offset */
		while (prev && offset > prev_entry->end) {
			prev = rb_next(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*prev_ret = prev;
		prev = orig_prev;
	}

	if (next_ret) {
		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		/* walk backward to the last entry starting at or before offset */
		while (prev && offset < prev_entry->start) {
			prev = rb_prev(prev);
			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
		}
		*next_ret = prev;
	}
	return NULL;
}

/*
 * Like __etree_search(), but on a miss returns the next entry after
 * @offset instead of NULL, which is what insertion callers want.
 */
static inline struct rb_node *
tree_search_for_insert(struct extent_io_tree *tree,
		       u64 offset,
		       struct rb_node ***p_ret,
		       struct rb_node **parent_ret)
{
	struct rb_node *prev = NULL;
	struct rb_node *ret;

	ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret);
	if (!ret)
		return prev;
	return ret;
}

/* Find the first extent state that ends at or after @offset */
static inline struct rb_node *tree_search(struct extent_io_tree *tree,
					  u64 offset)
{
	return tree_search_for_insert(tree, offset, NULL, NULL);
}

/* Notify the tree's owner (if it cares) that two states are being merged */
static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
		     struct extent_state *other)
{
	if (tree->ops && tree->ops->merge_extent_hook)
		tree->ops->merge_extent_hook(tree->mapping->host, new,
					     other);
}

/*
 * utility function to look for merge candidates inside a given range.
 * Any extents with matching state are merged together into a single
 * extent in the tree.  Extents with EXTENT_IO in their state field
 * are not merged because the end_io handlers need to be able to do
 * operations on them without sleeping (or doing allocations/splits).
 *
 * This should be called with the tree lock held.
 */
static void merge_state(struct extent_io_tree *tree,
			struct extent_state *state)
{
	struct extent_state *other;
	struct rb_node *other_node;

	if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
		return;

	/* try to absorb the adjacent state on the left */
	other_node = rb_prev(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->end == state->start - 1 &&
		    other->state == state->state) {
			merge_cb(tree, state, other);
			state->start = other->start;
			rb_erase(&other->rb_node, &tree->state);
			RB_CLEAR_NODE(&other->rb_node);
			free_extent_state(other);
		}
	}
	/* and the adjacent state on the right */
	other_node = rb_next(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->start == state->end + 1 &&
		    other->state == state->state) {
			merge_cb(tree, state, other);
			state->end = other->end;
			rb_erase(&other->rb_node, &tree->state);
			RB_CLEAR_NODE(&other->rb_node);
			free_extent_state(other);
		}
	}
}

/* Notify the tree's owner that bits are about to be set on @state */
static void set_state_cb(struct extent_io_tree *tree,
			 struct extent_state *state, unsigned *bits)
{
	if (tree->ops && tree->ops->set_bit_hook)
		tree->ops->set_bit_hook(tree->mapping->host, state, bits);
}

/* Notify the tree's owner that bits are about to be cleared on @state */
static void clear_state_cb(struct extent_io_tree *tree,
			   struct extent_state *state, unsigned *bits)
{
	if (tree->ops && tree->ops->clear_bit_hook)
		tree->ops->clear_bit_hook(BTRFS_I(tree->mapping->host),
				state, bits);
}

static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state, unsigned *bits,
			   struct extent_changeset *changeset);

/*
 * insert an extent_state struct into the tree.  'bits' are set on the
 * struct before it is inserted.
 *
 * This may return -EEXIST if the extent is already there, in which case the
 * state struct is freed.
 *
 * The tree lock is not taken internally.  This is a utility function and
 * probably isn't what you want to call (see set/clear_extent_bit).
 */
static int insert_state(struct extent_io_tree *tree,
			struct extent_state *state, u64 start, u64 end,
			struct rb_node ***p,
			struct rb_node **parent,
			unsigned *bits, struct extent_changeset *changeset)
{
	struct rb_node *node;

	if (end < start)
		WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n",
		       end, start);
	state->start = start;
	state->end = end;

	set_state_bits(tree, state, bits, changeset);

	node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
	if (node) {
		struct extent_state *found;
		found = rb_entry(node, struct extent_state, rb_node);
		pr_err("BTRFS: found node %llu %llu on insert of %llu %llu\n",
		       found->start, found->end, start, end);
		return -EEXIST;
	}
	merge_state(tree, state);
	return 0;
}

/* Notify the tree's owner that @orig is about to be split at @split */
static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
		     u64 split)
{
	if (tree->ops && tree->ops->split_extent_hook)
		tree->ops->split_extent_hook(tree->mapping->host, orig, split);
}

/*
 * split a given extent state struct in two, inserting the preallocated
 * struct 'prealloc' as the newly created second half.  'split' indicates an
 * offset inside 'orig' where it should be split.
 *
 * Before calling,
 * the tree has 'orig' at [orig->start, orig->end].  After calling, there
 * are two extent state structs in the tree:
 * prealloc: [orig->start, split - 1]
 * orig: [ split, orig->end ]
 *
 * The tree locks are not taken by this function.  They need to be held
 * by the caller.
 */
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
		       struct extent_state *prealloc, u64 split)
{
	struct rb_node *node;

	split_cb(tree, orig, split);

	prealloc->start = orig->start;
	prealloc->end = split - 1;
	prealloc->state = orig->state;
	orig->start = split;

	/* orig's node is a valid hint: prealloc lands immediately before it */
	node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
			   &prealloc->rb_node, NULL, NULL);
	if (node) {
		free_extent_state(prealloc);
		return -EEXIST;
	}
	return 0;
}

/* Next in-tree state after @state, or NULL if it is the last one */
static struct extent_state *next_state(struct extent_state *state)
{
	struct rb_node *next = rb_next(&state->rb_node);
	if (next)
		return rb_entry(next, struct extent_state, rb_node);
	else
		return NULL;
}

/*
 * utility function to clear some bits in an extent state struct.
 * it will optionally wake up any one waiting on this state (wake == 1).
 *
 * If no bits are set on the state struct after clearing things, the
 * struct is freed and removed from the tree
 */
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
					    struct extent_state *state,
					    unsigned *bits, int wake,
					    struct extent_changeset *changeset)
{
	struct extent_state *next;
	/* control bits are masked off; they are never stored in the state */
	unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;

	/* keep the per-tree dirty byte accounting in sync */
	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		WARN_ON(range > tree->dirty_bytes);
		tree->dirty_bytes -= range;
	}
	clear_state_cb(tree, state, bits);
	add_extent_changeset(state, bits_to_clear, changeset, 0);
	state->state &= ~bits_to_clear;
	if (wake)
		wake_up(&state->wq);
	if (state->state == 0) {
		/* grab the successor before the node disappears */
		next = next_state(state);
		if (extent_state_in_tree(state)) {
			rb_erase(&state->rb_node, &tree->state);
			RB_CLEAR_NODE(&state->rb_node);
			free_extent_state(state);
		} else {
			WARN_ON(1);
		}
	} else {
		merge_state(tree, state);
		next = next_state(state);
	}
	return next;
}

/*
 * Fall back to an atomic allocation when the caller's preallocated state
 * has already been consumed.  May still return NULL under memory pressure.
 */
static struct extent_state *
alloc_extent_state_atomic(struct extent_state *prealloc)
{
	if (!prealloc)
		prealloc = alloc_extent_state(GFP_ATOMIC);

	return prealloc;
}

/* Unrecoverable tree corruption while locked: take the fs down */
static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
{
	btrfs_panic(tree_fs_info(tree), err,
		    "Locking error: Extent tree was modified by another thread while locked.");
}

/*
 * clear some bits on a range in the tree.  This may require splitting
 * or inserting elements in the tree, so the gfp mask is used to
 * indicate which allocations or sleeping are allowed.
 *
 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 * the given range from the tree regardless of state (ie for truncate).
 *
 * the range [start, end] is inclusive.
 *
 * This takes the tree lock, and returns 0 on success and < 0 on error.
 */
static int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
			      unsigned bits, int wake, int delete,
			      struct extent_state **cached_state,
			      gfp_t mask, struct extent_changeset *changeset)
{
	struct extent_state *state;
	struct extent_state *cached;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	u64 last_end;
	int err;
	int clear = 0;

	btrfs_debug_check_extent_io_range(tree, start, end);

	if (bits & EXTENT_DELALLOC)
		bits |= EXTENT_NORESERVE;

	/* delete means clear everything except the control bits */
	if (delete)
		bits |= ~EXTENT_CTLBITS;
	bits |= EXTENT_FIRST_DELALLOC;

	/* clearing lock-like bits invalidates the caller's cached state */
	if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
		clear = 1;
again:
	if (!prealloc && gfpflags_allow_blocking(mask)) {
		/*
		 * Don't care for allocation failure here because we might end
		 * up not needing the pre-allocated extent state at all, which
		 * is the case if we only have in the tree extent states that
		 * cover our input range and don't cover too any other range.
		 * If we end up needing a new extent state we allocate it later.
		 */
		prealloc = alloc_extent_state(mask);
	}

	spin_lock(&tree->lock);
	if (cached_state) {
		cached = *cached_state;

		if (clear) {
			*cached_state = NULL;
			cached_state = NULL;
		}

		if (cached && extent_state_in_tree(cached) &&
		    cached->start <= start && cached->end > start) {
			if (clear)
				refcount_dec(&cached->refs);
			state = cached;
			goto hit_next;
		}
		if (clear)
			free_extent_state(cached);
	}
	/*
	 * this search will find the extents that end after
	 * our range starts
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	if (state->start > end)
		goto out;
	WARN_ON(state->end < start);
	last_end = state->end;

	/* the state doesn't have the wanted bits, go ahead */
	if (!(state->state & bits)) {
		state = next_state(state);
		goto next;
	}

	/*
	 *     | ---- desired range ---- |
	 *  | state | or
	 *  | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip
	 * bits on second half.
	 *
	 * If the extent we found extends past our range, we
	 * just split and search again.  It'll get split again
	 * the next time though.
	 *
	 * If the extent we found is inside our range, we clear
	 * the desired bit on it.
	 */

	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			state = clear_state_bit(tree, state, &bits, wake,
						changeset);
			goto next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and clear the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		if (wake)
			wake_up(&state->wq);

		clear_state_bit(tree, prealloc, &bits, wake, changeset);

		prealloc = NULL;
		goto out;
	}

	state = clear_state_bit(tree, state, &bits, wake, changeset);
next:
	if (last_end == (u64)-1)
		goto out;
	start = last_end + 1;
	if (start <= end && state && !need_resched())
		goto hit_next;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (gfpflags_allow_blocking(mask))
		cond_resched();
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return 0;

}

/*
 * Sleep until someone clears bits on @state and wakes its waitqueue.
 * Drops and retakes the tree lock around the schedule.
 */
static void wait_on_state(struct extent_io_tree *tree,
			  struct extent_state *state)
		__releases(tree->lock)
		__acquires(tree->lock)
{
	DEFINE_WAIT(wait);
	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
	spin_unlock(&tree->lock);
	schedule();
	spin_lock(&tree->lock);
	finish_wait(&state->wq, &wait);
}

/*
 * waits for one or more bits to clear on a range in the state tree.
 * The range [start, end] is inclusive.
 * The tree lock is taken by this function
 */
static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
			    unsigned long bits)
{
	struct extent_state *state;
	struct rb_node *node;

	btrfs_debug_check_extent_io_range(tree, start, end);

	spin_lock(&tree->lock);
again:
	while (1) {
		/*
		 * this search will find all the extents that end after
		 * our range starts
		 */
		node = tree_search(tree, start);
process_node:
		if (!node)
			break;

		state = rb_entry(node, struct extent_state, rb_node);

		if (state->start > end)
			goto out;

		if (state->state & bits) {
			start = state->start;
			/* pin the state so it can't be freed while we sleep */
			refcount_inc(&state->refs);
			wait_on_state(tree, state);
			free_extent_state(state);
			goto again;
		}
		start = state->end + 1;

		if (start > end)
			break;

		/* if we dropped the lock, the tree may have changed: research */
		if (!cond_resched_lock(&tree->lock)) {
			node = rb_next(node);
			goto process_node;
		}
	}
out:
	spin_unlock(&tree->lock);
}

/*
 * Set bits on @state, running the owner hook, updating the dirty-bytes
 * accounting and recording the transition in @changeset.  Control bits
 * are masked off before being stored.
 */
static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state,
			   unsigned *bits, struct extent_changeset *changeset)
{
	unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;

	set_state_cb(tree, state, bits);
	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
		u64 range = state->end - state->start + 1;
		tree->dirty_bytes += range;
	}
	add_extent_changeset(state, bits_to_set, changeset, 1);
	state->state |= bits_to_set;
}

/*
 * Stash @state in *@cached_ptr (taking a reference) if nothing is cached
 * yet and the state carries any of @flags (or @flags is 0).
 */
static void cache_state_if_flags(struct extent_state *state,
				 struct extent_state **cached_ptr,
				 unsigned flags)
{
	if (cached_ptr && !(*cached_ptr)) {
		if (!flags || (state->state & flags)) {
			*cached_ptr = state;
			refcount_inc(&state->refs);
		}
	}
}

/* Cache only states that carry lock-like bits; others merge away quickly */
static void cache_state(struct extent_state *state,
			struct extent_state **cached_ptr)
{
	return cache_state_if_flags(state, cached_ptr,
				    EXTENT_IOBITS | EXTENT_BOUNDARY);
}

/*
 * set some bits on a range in the tree.  This may require allocations or
 * sleeping, so the gfp mask is used to indicate what is allowed.
 *
 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 * part of the range already has the desired bits set.  The start of the
 * existing range is returned in failed_start in this case.
 *
 * [start, end] is inclusive This takes the tree lock.
 */

static int __must_check
__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		 unsigned bits, unsigned exclusive_bits,
		 u64 *failed_start, struct extent_state **cached_state,
		 gfp_t mask, struct extent_changeset *changeset)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	struct rb_node **p;
	struct rb_node *parent;
	int err = 0;
	u64 last_start;
	u64 last_end;

	btrfs_debug_check_extent_io_range(tree, start, end);

	bits |= EXTENT_FIRST_DELALLOC;
again:
	if (!prealloc && gfpflags_allow_blocking(mask)) {
		/*
		 * Don't care for allocation failure here because we might end
		 * up not needing the pre-allocated extent state at all, which
		 * is the case if we only have in the tree extent states that
		 * cover our input range and don't cover too any other range.
		 * If we end up needing a new extent state we allocate it later.
		 */
		prealloc = alloc_extent_state(mask);
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start <= start && state->end > start &&
		    extent_state_in_tree(state)) {
			node = &state->rb_node;
			goto hit_next;
		}
	}
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search_for_insert(tree, start, &p, &parent);
	if (!node) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = insert_state(tree, prealloc, start, end,
				   &p, &parent, &bits, changeset);
		if (err)
			extent_io_tree_panic(tree, err);

		cache_state(prealloc, cached_state);
		prealloc = NULL;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		if (state->state & exclusive_bits) {
			*failed_start = state->start;
			err = -EEXIST;
			goto out;
		}

		set_state_bits(tree, state, &bits, changeset);
		cache_state(state, cached_state);
		merge_state(tree, state);
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		state = next_state(state);
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again.  It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, &bits, changeset);
			cache_state(state, cached_state);
			merge_state(tree, state);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			state = next_state(state);
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);

		/*
		 * Avoid to free 'prealloc' if it can be merged with
		 * the later extent.
		 */
		err = insert_state(tree, prealloc, start, this_end,
				   NULL, NULL, &bits, changeset);
		if (err)
			extent_io_tree_panic(tree, err);

		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		set_state_bits(tree, prealloc, &bits, changeset);
		cache_state(prealloc, cached_state);
		merge_state(tree, prealloc);
		prealloc = NULL;
		goto out;
	}

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (gfpflags_allow_blocking(mask))
		cond_resched();
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;

}

/* Non-exclusive wrapper around __set_extent_bit() with no changeset */
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		   unsigned bits, u64 *failed_start,
		   struct extent_state **cached_state, gfp_t mask)
{
	return __set_extent_bit(tree, start, end, bits, 0, failed_start,
				cached_state, mask, NULL);
}


/**
 * convert_extent_bit - convert all bits in a given range from one bit to
 * 			another
 * @tree:	the io tree to search
 * @start:	the start offset in bytes
 * @end:	the end offset in bytes (inclusive)
 * @bits:	the bits to set in this range
 * @clear_bits:	the bits to clear in this range
 * @cached_state:	state that we're going to cache
 *
 * This will go through and set bits for the given
 * range. If any states exist
 * already in this range they are set with the given bit and cleared of the
 * clear_bits.  This is only meant to be used by things that are mergeable, ie
 * converting from say DELALLOC to DIRTY.  This is not meant to be used with
 * boundary bits like LOCK.
 *
 * All allocations are done with GFP_NOFS.
 */
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		       unsigned bits, unsigned clear_bits,
		       struct extent_state **cached_state)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	struct rb_node **p;
	struct rb_node *parent;
	int err = 0;
	u64 last_start;
	u64 last_end;
	bool first_iteration = true;

	btrfs_debug_check_extent_io_range(tree, start, end);

again:
	if (!prealloc) {
		/*
		 * Best effort, don't worry if extent state allocation fails
		 * here for the first iteration. We might have a cached state
		 * that matches exactly the target range, in which case no
		 * extent state allocations are needed. We'll only know this
		 * after locking the tree.
		 */
		prealloc = alloc_extent_state(GFP_NOFS);
		if (!prealloc && !first_iteration)
			return -ENOMEM;
	}

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start <= start && state->end > start &&
		    extent_state_in_tree(state)) {
			node = &state->rb_node;
			goto hit_next;
		}
	}

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search_for_insert(tree, start, &p, &parent);
	if (!node) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		err = insert_state(tree, prealloc, start, end,
				   &p, &parent, &bits, NULL);
		if (err)
			extent_io_tree_panic(tree, err);
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		set_state_bits(tree, state, &bits, NULL);
		cache_state(state, cached_state);
		state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again.  It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, &bits, NULL);
			cache_state(state, cached_state);
			state = clear_state_bit(tree, state, &clear_bits, 0,
						NULL);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		/*
		 * Avoid to free 'prealloc' if it can be merged with
		 * the later extent.
		 */
		err = insert_state(tree, prealloc, start, this_end,
				   NULL, NULL, &bits, NULL);
		if (err)
			extent_io_tree_panic(tree, err);
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		set_state_bits(tree, prealloc, &bits, NULL);
		cache_state(prealloc, cached_state);
		clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
		prealloc = NULL;
		goto out;
	}

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	cond_resched();
	first_iteration = false;
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;
}

/* wrappers around set/clear extent bit */
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
			   unsigned bits, struct extent_changeset *changeset)
{
	/*
	 * We don't support EXTENT_LOCKED yet, as current changeset will
	 * record any bits changed, so for EXTENT_LOCKED case, it will
	 * either fail with -EEXIST or changeset will record the whole
	 * range.
1301 */ 1302 BUG_ON(bits & EXTENT_LOCKED); 1303 1304 return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS, 1305 changeset); 1306 } 1307 1308 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 1309 unsigned bits, int wake, int delete, 1310 struct extent_state **cached, gfp_t mask) 1311 { 1312 return __clear_extent_bit(tree, start, end, bits, wake, delete, 1313 cached, mask, NULL); 1314 } 1315 1316 int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1317 unsigned bits, struct extent_changeset *changeset) 1318 { 1319 /* 1320 * Don't support EXTENT_LOCKED case, same reason as 1321 * set_record_extent_bits(). 1322 */ 1323 BUG_ON(bits & EXTENT_LOCKED); 1324 1325 return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS, 1326 changeset); 1327 } 1328 1329 /* 1330 * either insert or lock state struct between start and end use mask to tell 1331 * us if waiting is desired. 1332 */ 1333 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1334 struct extent_state **cached_state) 1335 { 1336 int err; 1337 u64 failed_start; 1338 1339 while (1) { 1340 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, 1341 EXTENT_LOCKED, &failed_start, 1342 cached_state, GFP_NOFS, NULL); 1343 if (err == -EEXIST) { 1344 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); 1345 start = failed_start; 1346 } else 1347 break; 1348 WARN_ON(start > end); 1349 } 1350 return err; 1351 } 1352 1353 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) 1354 { 1355 int err; 1356 u64 failed_start; 1357 1358 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, 1359 &failed_start, NULL, GFP_NOFS, NULL); 1360 if (err == -EEXIST) { 1361 if (failed_start > start) 1362 clear_extent_bit(tree, start, failed_start - 1, 1363 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); 1364 return 0; 1365 } 1366 return 1; 1367 } 1368 1369 void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 
end) 1370 { 1371 unsigned long index = start >> PAGE_SHIFT; 1372 unsigned long end_index = end >> PAGE_SHIFT; 1373 struct page *page; 1374 1375 while (index <= end_index) { 1376 page = find_get_page(inode->i_mapping, index); 1377 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1378 clear_page_dirty_for_io(page); 1379 put_page(page); 1380 index++; 1381 } 1382 } 1383 1384 void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) 1385 { 1386 unsigned long index = start >> PAGE_SHIFT; 1387 unsigned long end_index = end >> PAGE_SHIFT; 1388 struct page *page; 1389 1390 while (index <= end_index) { 1391 page = find_get_page(inode->i_mapping, index); 1392 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1393 __set_page_dirty_nobuffers(page); 1394 account_page_redirty(page); 1395 put_page(page); 1396 index++; 1397 } 1398 } 1399 1400 /* 1401 * helper function to set both pages and extents in the tree writeback 1402 */ 1403 static void set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) 1404 { 1405 unsigned long index = start >> PAGE_SHIFT; 1406 unsigned long end_index = end >> PAGE_SHIFT; 1407 struct page *page; 1408 1409 while (index <= end_index) { 1410 page = find_get_page(tree->mapping, index); 1411 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1412 set_page_writeback(page); 1413 put_page(page); 1414 index++; 1415 } 1416 } 1417 1418 /* find the first state struct with 'bits' set after 'start', and 1419 * return it. tree->lock must be held. NULL will returned if 1420 * nothing was found after 'start' 1421 */ 1422 static struct extent_state * 1423 find_first_extent_bit_state(struct extent_io_tree *tree, 1424 u64 start, unsigned bits) 1425 { 1426 struct rb_node *node; 1427 struct extent_state *state; 1428 1429 /* 1430 * this search will find all the extents that end after 1431 * our range starts. 
 */
	node = tree_search(tree, start);
	if (!node)
		goto out;

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->end >= start && (state->state & bits))
			return state;

		node = rb_next(node);
		if (!node)
			break;
	}
out:
	return NULL;
}

/*
 * find the first offset in the io tree with 'bits' set. zero is
 * returned if we find something, and *start_ret and *end_ret are
 * set to reflect the state struct that was found.
 *
 * If nothing was found, 1 is returned. If found something, return 0.
 */
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
			  u64 *start_ret, u64 *end_ret, unsigned bits,
			  struct extent_state **cached_state)
{
	struct extent_state *state;
	struct rb_node *n;
	int ret = 1;

	spin_lock(&tree->lock);
	if (cached_state && *cached_state) {
		state = *cached_state;
		/*
		 * If the cached state ends right before 'start', scan forward
		 * from it instead of searching the whole tree.
		 */
		if (state->end == start - 1 && extent_state_in_tree(state)) {
			n = rb_next(&state->rb_node);
			while (n) {
				state = rb_entry(n, struct extent_state,
						 rb_node);
				if (state->state & bits)
					goto got_it;
				n = rb_next(n);
			}
			/* Nothing found after the cached state; drop the cache */
			free_extent_state(*cached_state);
			*cached_state = NULL;
			goto out;
		}
		/* Cached state is not adjacent/usable; drop our reference */
		free_extent_state(*cached_state);
		*cached_state = NULL;
	}

	state = find_first_extent_bit_state(tree, start, bits);
got_it:
	if (state) {
		cache_state_if_flags(state, cached_state, 0);
		*start_ret = state->start;
		*end_ret = state->end;
		ret = 0;
	}
out:
	spin_unlock(&tree->lock);
	return ret;
}

/*
 * find a contiguous range of bytes in the file marked as delalloc, not
 * more than 'max_bytes'.  start and end are used to return the range,
 *
 * 1 is returned if we find something, 0 if nothing was in the tree
 */
static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
					u64 *start, u64 *end, u64 max_bytes,
					struct extent_state **cached_state)
{
	struct rb_node *node;
	struct extent_state *state;
	u64 cur_start = *start;
	u64 found = 0;
	u64 total_bytes = 0;

	spin_lock(&tree->lock);

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, cur_start);
	if (!node) {
		if (!found)
			*end = (u64)-1;
		goto out;
	}

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		/* Stop at a gap or at an EXTENT_BOUNDARY once we've found something */
		if (found && (state->start != cur_start ||
			      (state->state & EXTENT_BOUNDARY))) {
			goto out;
		}
		if (!(state->state & EXTENT_DELALLOC)) {
			if (!found)
				*end = state->end;
			goto out;
		}
		if (!found) {
			*start = state->start;
			/* Hold a ref on the first state for the caller's cache */
			*cached_state = state;
			refcount_inc(&state->refs);
		}
		found++;
		*end = state->end;
		cur_start = state->end + 1;
		node = rb_next(node);
		total_bytes += state->end - state->start + 1;
		if (total_bytes >= max_bytes)
			break;
		if (!node)
			break;
	}
out:
	spin_unlock(&tree->lock);
	return found;
}

static int __process_pages_contig(struct address_space *mapping,
				  struct page *locked_page,
				  pgoff_t start_index, pgoff_t end_index,
				  unsigned long page_ops, pgoff_t *index_ret);

/*
 * Unlock the pages of [start, end] except locked_page, which the caller
 * keeps locked.
 */
static noinline void __unlock_for_delalloc(struct inode *inode,
					   struct page *locked_page,
					   u64 start, u64 end)
{
	unsigned long index = start >> PAGE_SHIFT;
	unsigned long end_index = end >> PAGE_SHIFT;

	ASSERT(locked_page);
	if (index == locked_page->index && end_index == index)
		return;

	__process_pages_contig(inode->i_mapping,
			       locked_page, index, end_index,
			       PAGE_UNLOCK, NULL);
}

/*
 * Lock every page of [delalloc_start, delalloc_end] except locked_page.
 * Returns 0 on success, -EAGAIN (after unlocking what was taken) if a page
 * disappeared or was no longer dirty.
 */
static noinline int lock_delalloc_pages(struct inode *inode,
					struct page *locked_page,
					u64 delalloc_start,
					u64 delalloc_end)
{
	unsigned long index = delalloc_start >> PAGE_SHIFT;
	unsigned long index_ret = index;
	unsigned long end_index = delalloc_end >> PAGE_SHIFT;
	int ret;

	ASSERT(locked_page);
	if (index == locked_page->index && index == end_index)
		return 0;

	ret = __process_pages_contig(inode->i_mapping, locked_page, index,
				     end_index, PAGE_LOCK, &index_ret);
	if (ret == -EAGAIN)
		/* index_ret is the last page we did lock; unlock up to it */
		__unlock_for_delalloc(inode, locked_page, delalloc_start,
				      (u64)index_ret << PAGE_SHIFT);
	return ret;
}

/*
 * find a contiguous range of bytes in the file marked as delalloc, not
 * more than 'max_bytes'. start and end are used to return the range,
 *
 * 1 is returned if we find something, 0 if nothing was in the tree
 */
STATIC u64 find_lock_delalloc_range(struct inode *inode,
				    struct extent_io_tree *tree,
				    struct page *locked_page, u64 *start,
				    u64 *end, u64 max_bytes)
{
	u64 delalloc_start;
	u64 delalloc_end;
	u64 found;
	struct extent_state *cached_state = NULL;
	int ret;
	int loops = 0;

again:
	/* step one, find a bunch of delalloc bytes starting at start */
	delalloc_start = *start;
	delalloc_end = 0;
	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
				    max_bytes, &cached_state);
	if (!found || delalloc_end <= *start) {
		*start = delalloc_start;
		*end = delalloc_end;
		free_extent_state(cached_state);
		return 0;
	}

	/*
	 * start comes from the offset of locked_page. We have to lock
	 * pages in order, so we can't process delalloc bytes before
	 * locked_page
	 */
	if (delalloc_start < *start)
		delalloc_start = *start;

	/*
	 * make sure to limit the number of pages we try to lock down
	 */
	if (delalloc_end + 1 - delalloc_start > max_bytes)
		delalloc_end = delalloc_start + max_bytes - 1;

	/* step two, lock all the pages after the page that has start */
	ret = lock_delalloc_pages(inode, locked_page,
				  delalloc_start, delalloc_end);
	if (ret == -EAGAIN) {
		/* some of the pages are gone, lets avoid looping by
		 * shortening the size of the delalloc range we're searching
		 */
		free_extent_state(cached_state);
		cached_state = NULL;
		if (!loops) {
			max_bytes = PAGE_SIZE;
			loops = 1;
			goto again;
		} else {
			found = 0;
			goto out_failed;
		}
	}
	BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */

	/* step three, lock the state bits for the whole range */
	lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);

	/* then test to make sure it is all still delalloc */
	ret = test_range_bit(tree, delalloc_start, delalloc_end,
			     EXTENT_DELALLOC, 1, cached_state);
	if (!ret) {
		/* Raced with someone clearing delalloc; drop locks and retry */
		unlock_extent_cached(tree, delalloc_start, delalloc_end,
				     &cached_state, GFP_NOFS);
		__unlock_for_delalloc(inode, locked_page,
				      delalloc_start, delalloc_end);
		cond_resched();
		goto again;
	}
	free_extent_state(cached_state);
	*start = delalloc_start;
	*end = delalloc_end;
out_failed:
	return found;
}

/*
 * Apply @page_ops to every page of [start_index, end_index] except
 * locked_page. For PAGE_LOCK, stops at the first page that is missing,
 * clean, or remapped and returns -EAGAIN with *index_ret set to the last
 * page successfully processed.
 */
static int __process_pages_contig(struct address_space *mapping,
				  struct page *locked_page,
				  pgoff_t start_index, pgoff_t end_index,
				  unsigned long page_ops, pgoff_t *index_ret)
{
	unsigned long nr_pages = end_index - start_index + 1;
	unsigned long pages_locked = 0;
	pgoff_t index = start_index;
	struct page *pages[16];
	unsigned ret;
	int err = 0;
	int i;

	if (page_ops & PAGE_LOCK) {
		/* PAGE_LOCK must not be combined with any other op */
		ASSERT(page_ops == PAGE_LOCK);
		ASSERT(index_ret && *index_ret == start_index);
	}

	if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
		mapping_set_error(mapping, -EIO);

	while (nr_pages > 0) {
		ret = find_get_pages_contig(mapping, index,
					    min_t(unsigned long,
					    nr_pages, ARRAY_SIZE(pages)), pages);
		if (ret == 0) {
			/*
			 * Only if we're going to lock these pages,
			 * can we find nothing at @index.
			 */
			ASSERT(page_ops & PAGE_LOCK);
			err = -EAGAIN;
			goto out;
		}

		for (i = 0; i < ret; i++) {
			if (page_ops & PAGE_SET_PRIVATE2)
				SetPagePrivate2(pages[i]);

			if (pages[i] == locked_page) {
				/* Caller owns this page's lock; just drop our ref */
				put_page(pages[i]);
				pages_locked++;
				continue;
			}
			if (page_ops & PAGE_CLEAR_DIRTY)
				clear_page_dirty_for_io(pages[i]);
			if (page_ops & PAGE_SET_WRITEBACK)
				set_page_writeback(pages[i]);
			if (page_ops & PAGE_SET_ERROR)
				SetPageError(pages[i]);
			if (page_ops & PAGE_END_WRITEBACK)
				end_page_writeback(pages[i]);
			if (page_ops & PAGE_UNLOCK)
				unlock_page(pages[i]);
			if (page_ops & PAGE_LOCK) {
				lock_page(pages[i]);
				/* Page must still be dirty and in this mapping */
				if (!PageDirty(pages[i]) ||
				    pages[i]->mapping != mapping) {
					unlock_page(pages[i]);
					put_page(pages[i]);
					err = -EAGAIN;
					goto out;
				}
			}
			put_page(pages[i]);
			pages_locked++;
		}
		nr_pages -= ret;
		index += ret;
		cond_resched();
	}
out:
	if (err && index_ret)
		*index_ret = start_index + pages_locked - 1;
	return err;
}

void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
				  u64 delalloc_end, struct page *locked_page,
				  unsigned clear_bits,
				  unsigned long page_ops)
{
	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0,
			 NULL, GFP_NOFS);

	__process_pages_contig(inode->i_mapping, locked_page,
			       start >> PAGE_SHIFT, end >> PAGE_SHIFT,
			       page_ops, NULL);
}

/*
 * count the number of bytes in the tree that have a given bit(s)
 * set. This can be fairly slow, except for EXTENT_DIRTY which is
 * cached. The total number found is returned.
 */
u64 count_range_bits(struct extent_io_tree *tree,
		     u64 *start, u64 search_end, u64 max_bytes,
		     unsigned bits, int contig)
{
	struct rb_node *node;
	struct extent_state *state;
	u64 cur_start = *start;
	u64 total_bytes = 0;
	u64 last = 0;
	int found = 0;

	if (WARN_ON(search_end <= cur_start))
		return 0;

	spin_lock(&tree->lock);
	/* Fast path: whole-tree EXTENT_DIRTY total is kept up to date */
	if (cur_start == 0 && bits == EXTENT_DIRTY) {
		total_bytes = tree->dirty_bytes;
		goto out;
	}
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(tree, cur_start);
	if (!node)
		goto out;

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->start > search_end)
			break;
		/* In contig mode, a gap between states ends the run */
		if (contig && found && state->start > last + 1)
			break;
		if (state->end >= cur_start && (state->state & bits) == bits) {
			/* Clip the state to [cur_start, search_end] */
			total_bytes += min(search_end, state->end) + 1 -
				       max(cur_start, state->start);
			if (total_bytes >= max_bytes)
				break;
			if (!found) {
				*start = max(cur_start, state->start);
				found = 1;
			}
			last = state->end;
		} else if (contig && found) {
			break;
		}
		node = rb_next(node);
		if (!node)
			break;
	}
out:
	spin_unlock(&tree->lock);
	return total_bytes;
}

/*
 * set the private field for a given byte offset in the tree. If there isn't
 * an extent_state there already, this does nothing.
1839 */ 1840 static noinline int set_state_failrec(struct extent_io_tree *tree, u64 start, 1841 struct io_failure_record *failrec) 1842 { 1843 struct rb_node *node; 1844 struct extent_state *state; 1845 int ret = 0; 1846 1847 spin_lock(&tree->lock); 1848 /* 1849 * this search will find all the extents that end after 1850 * our range starts. 1851 */ 1852 node = tree_search(tree, start); 1853 if (!node) { 1854 ret = -ENOENT; 1855 goto out; 1856 } 1857 state = rb_entry(node, struct extent_state, rb_node); 1858 if (state->start != start) { 1859 ret = -ENOENT; 1860 goto out; 1861 } 1862 state->failrec = failrec; 1863 out: 1864 spin_unlock(&tree->lock); 1865 return ret; 1866 } 1867 1868 static noinline int get_state_failrec(struct extent_io_tree *tree, u64 start, 1869 struct io_failure_record **failrec) 1870 { 1871 struct rb_node *node; 1872 struct extent_state *state; 1873 int ret = 0; 1874 1875 spin_lock(&tree->lock); 1876 /* 1877 * this search will find all the extents that end after 1878 * our range starts. 1879 */ 1880 node = tree_search(tree, start); 1881 if (!node) { 1882 ret = -ENOENT; 1883 goto out; 1884 } 1885 state = rb_entry(node, struct extent_state, rb_node); 1886 if (state->start != start) { 1887 ret = -ENOENT; 1888 goto out; 1889 } 1890 *failrec = state->failrec; 1891 out: 1892 spin_unlock(&tree->lock); 1893 return ret; 1894 } 1895 1896 /* 1897 * searches a range in the state tree for a given mask. 1898 * If 'filled' == 1, this returns 1 only if every extent in the tree 1899 * has the bits set. Otherwise, 1 is returned if any bit in the 1900 * range is found set. 
 */
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
		   unsigned bits, int filled, struct extent_state *cached)
{
	struct extent_state *state = NULL;
	struct rb_node *node;
	int bitset = 0;

	spin_lock(&tree->lock);
	if (cached && extent_state_in_tree(cached) && cached->start <= start &&
	    cached->end > start)
		node = &cached->rb_node;
	else
		node = tree_search(tree, start);
	while (node && start <= end) {
		state = rb_entry(node, struct extent_state, rb_node);

		/* A gap before 'start' means the range is not fully covered */
		if (filled && state->start > start) {
			bitset = 0;
			break;
		}

		if (state->start > end)
			break;

		if (state->state & bits) {
			bitset = 1;
			if (!filled)
				break;
		} else if (filled) {
			bitset = 0;
			break;
		}

		if (state->end == (u64)-1)
			break;

		start = state->end + 1;
		if (start > end)
			break;
		node = rb_next(node);
		if (!node) {
			/* Ran out of states before covering the whole range */
			if (filled)
				bitset = 0;
			break;
		}
	}
	spin_unlock(&tree->lock);
	return bitset;
}

/*
 * helper function to set a given page up to date if all the
 * extents in the tree for that page are up to date
 */
static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
{
	u64 start = page_offset(page);
	u64 end = start + PAGE_SIZE - 1;
	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
		SetPageUptodate(page);
}

/*
 * Remove a failure record's tracking bits from both trees and free it.
 * Returns the first error seen from the clear operations, if any.
 */
int free_io_failure(struct btrfs_inode *inode, struct io_failure_record *rec)
{
	int ret;
	int err = 0;
	struct extent_io_tree *failure_tree = &inode->io_failure_tree;

	set_state_failrec(failure_tree, rec->start, NULL);
	ret = clear_extent_bits(failure_tree, rec->start,
				rec->start + rec->len - 1,
				EXTENT_LOCKED | EXTENT_DIRTY);
	if (ret)
		err = ret;

	ret = clear_extent_bits(&inode->io_tree, rec->start,
				rec->start + rec->len - 1,
				EXTENT_DAMAGED);
	if (ret && !err)
		err = ret;

	kfree(rec);
	return err;
}

/*
 * this bypasses the standard btrfs submit functions deliberately, as
 * the standard behavior is to write all copies in a raid setup. here we only
 * want to write the one bad copy. so we do the mapping for ourselves and issue
 * submit_bio directly.
 * to avoid any synchronization issues, wait for the data after writing, which
 * actually prevents the read that triggered the error from finishing.
 * currently, there can be no more than two copies of every data bit. thus,
 * exactly one rewrite is required.
 */
int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length,
		      u64 logical, struct page *page,
		      unsigned int pg_offset, int mirror_num)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct bio *bio;
	struct btrfs_device *dev;
	u64 map_length = 0;
	u64 sector;
	struct btrfs_bio *bbio = NULL;
	int ret;

	ASSERT(!(fs_info->sb->s_flags & MS_RDONLY));
	BUG_ON(!mirror_num);

	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
	if (!bio)
		return -EIO;
	bio->bi_iter.bi_size = 0;
	map_length = length;

	/*
	 * Avoid races with device replace and make sure our bbio has devices
	 * associated to its stripes that don't go away while we are doing the
	 * read repair operation.
	 */
	btrfs_bio_counter_inc_blocked(fs_info);
	if (btrfs_is_parity_mirror(fs_info, logical, length, mirror_num)) {
		/*
		 * Note that we don't use BTRFS_MAP_WRITE because it's supposed
		 * to update all raid stripes, but here we just want to correct
		 * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
		 * stripe's dev and sector.
		 */
		ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
				      &map_length, &bbio, 0);
		if (ret) {
			btrfs_bio_counter_dec(fs_info);
			bio_put(bio);
			return -EIO;
		}
		ASSERT(bbio->mirror_num == 1);
	} else {
		ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
				      &map_length, &bbio, mirror_num);
		if (ret) {
			btrfs_bio_counter_dec(fs_info);
			bio_put(bio);
			return -EIO;
		}
		BUG_ON(mirror_num != bbio->mirror_num);
	}

	sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9;
	bio->bi_iter.bi_sector = sector;
	dev = bbio->stripes[bbio->mirror_num - 1].dev;
	btrfs_put_bbio(bbio);
	if (!dev || !dev->bdev || !dev->writeable) {
		btrfs_bio_counter_dec(fs_info);
		bio_put(bio);
		return -EIO;
	}
	bio->bi_bdev = dev->bdev;
	bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
	bio_add_page(bio, page, length, pg_offset);

	if (btrfsic_submit_bio_wait(bio)) {
		/* try to remap that extent elsewhere? */
		btrfs_bio_counter_dec(fs_info);
		bio_put(bio);
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
		return -EIO;
	}

	btrfs_info_rl_in_rcu(fs_info,
		"read error corrected: ino %llu off %llu (dev %s sector %llu)",
			     btrfs_ino(inode), start,
			     rcu_str_deref(dev->name), sector);
	btrfs_bio_counter_dec(fs_info);
	bio_put(bio);
	return 0;
}

/* Repair each page of a metadata extent buffer via repair_io_failure(). */
int repair_eb_io_failure(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *eb, int mirror_num)
{
	u64 start = eb->start;
	unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
	int ret = 0;

	if (fs_info->sb->s_flags & MS_RDONLY)
		return -EROFS;

	for (i = 0; i < num_pages; i++) {
		struct page *p = eb->pages[i];

		ret = repair_io_failure(BTRFS_I(fs_info->btree_inode), start,
					PAGE_SIZE, start, p,
					start - page_offset(p), mirror_num);
		if (ret)
			break;
		start += PAGE_SIZE;
	}

	return ret;
}

/*
 * each time an IO finishes, we do a fast check in the IO failure tree
 * to see if we need to process or clean up an io_failure_record
 */
int clean_io_failure(struct btrfs_inode *inode, u64 start, struct page *page,
		     unsigned int pg_offset)
{
	u64 private;
	struct io_failure_record *failrec;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_state *state;
	int num_copies;
	int ret;

	private = 0;
	/* Fast check: any failure record bits at all from 'private' onward? */
	ret = count_range_bits(&inode->io_failure_tree, &private,
			       (u64)-1, 1, EXTENT_DIRTY, 0);
	if (!ret)
		return 0;

	ret = get_state_failrec(&inode->io_failure_tree, start,
				&failrec);
	if (ret)
		return 0;

	BUG_ON(!failrec->this_mirror);

	if (failrec->in_validation) {
		/* there was no real error, just free the record */
		btrfs_debug(fs_info,
			"clean_io_failure: freeing dummy error at %llu",
			failrec->start);
		goto out;
	}
	if (fs_info->sb->s_flags & MS_RDONLY)
		goto out;

	spin_lock(&inode->io_tree.lock);
	state = find_first_extent_bit_state(&inode->io_tree,
					    failrec->start,
					    EXTENT_LOCKED);
	spin_unlock(&inode->io_tree.lock);

	/* Only repair while the range is still locked by the reader */
	if (state && state->start <= failrec->start &&
	    state->end >= failrec->start + failrec->len - 1) {
		num_copies = btrfs_num_copies(fs_info, failrec->logical,
					      failrec->len);
		if (num_copies > 1) {
			repair_io_failure(inode, start, failrec->len,
					  failrec->logical, page,
					  pg_offset, failrec->failed_mirror);
		}
	}

out:
	free_io_failure(inode, failrec);

	return 0;
}

/*
 * Can be called when
 * - hold extent lock
 * - under ordered extent
 * - the inode is freeing
 */
void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
{
	struct extent_io_tree *failure_tree = &inode->io_failure_tree;
	struct io_failure_record *failrec;
	struct extent_state *state, *next;

	if (RB_EMPTY_ROOT(&failure_tree->state))
		return;

	spin_lock(&failure_tree->lock);
	state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
	while (state) {
		if (state->start > end)
			break;

		ASSERT(state->end <= end);

		/* Grab the next state before freeing the current one */
		next = next_state(state);

		failrec = state->failrec;
		free_extent_state(state);
		kfree(failrec);

		state = next;
	}
	spin_unlock(&failure_tree->lock);
}

/*
 * Look up (or create and register) the failure record for [start, end].
 * A new record gets its logical address resolved through the extent map
 * and is tracked in both the failure tree and the inode's io tree.
 */
int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
				struct io_failure_record **failrec_ret)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct io_failure_record *failrec;
	struct extent_map *em;
	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	int ret;
	u64 logical;

	ret = get_state_failrec(failure_tree, start, &failrec);
	if (ret) {
		/* No record yet: build one for this range */
		failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
		if (!failrec)
			return -ENOMEM;

		failrec->start = start;
		failrec->len = end - start + 1;
		failrec->this_mirror = 0;
		failrec->bio_flags = 0;
		failrec->in_validation = 0;

		read_lock(&em_tree->lock);
		em = lookup_extent_mapping(em_tree, start, failrec->len);
		if (!em) {
			read_unlock(&em_tree->lock);
			kfree(failrec);
			return -EIO;
		}

		/* The mapping must actually cover 'start' */
		if (em->start > start || em->start + em->len <= start) {
			free_extent_map(em);
			em = NULL;
		}
		read_unlock(&em_tree->lock);
		if (!em) {
			kfree(failrec);
			return -EIO;
		}

		logical = start - em->start;
		logical = em->block_start + logical;
		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
			/* Compressed extents must be re-read from their start */
			logical = em->block_start;
			failrec->bio_flags = EXTENT_BIO_COMPRESSED;
			extent_set_compress_type(&failrec->bio_flags,
						 em->compress_type);
		}

		btrfs_debug(fs_info,
			"Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu",
			logical, start, failrec->len);

		failrec->logical = logical;
		free_extent_map(em);

		/* set the bits in the private failure tree */
		ret = set_extent_bits(failure_tree, start, end,
				      EXTENT_LOCKED | EXTENT_DIRTY);
		if (ret >= 0)
			ret = set_state_failrec(failure_tree, start, failrec);
		/* set the bits in the inode's tree */
		if (ret >= 0)
			ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED);
		if (ret < 0) {
			kfree(failrec);
			return ret;
		}
	} else {
		btrfs_debug(fs_info,
			"Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d",
			failrec->logical, failrec->start, failrec->len,
			failrec->in_validation);
		/*
		 * when data can be on disk more than twice, add to failrec here
		 * (e.g. with a list for failed_mirror) to make
		 * clean_io_failure() clean all those errors at once.
		 */
	}

	*failrec_ret = failrec;

	return 0;
}

/*
 * Decide whether a failed read is worth retrying from another mirror.
 * Returns 1 if a retry should be attempted (failrec updated accordingly),
 * 0 if repair is not possible.
 */
int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
			   struct io_failure_record *failrec, int failed_mirror)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	int num_copies;

	num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len);
	if (num_copies == 1) {
		/*
		 * we only have a single copy of the data, so don't bother with
		 * all the retry and error correction code that follows. no
		 * matter what the error is, it is very likely to persist.
		 */
		btrfs_debug(fs_info,
			"Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d",
			num_copies, failrec->this_mirror, failed_mirror);
		return 0;
	}

	/*
	 * there are two premises:
	 * a) deliver good data to the caller
	 * b) correct the bad sectors on disk
	 */
	if (failed_bio->bi_vcnt > 1) {
		/*
		 * to fulfill b), we need to know the exact failing sectors, as
		 * we don't want to rewrite any more than the failed ones. thus,
		 * we need separate read requests for the failed bio
		 *
		 * if the following BUG_ON triggers, our validation request got
		 * merged. we need separate requests for our algorithm to work.
		 */
		BUG_ON(failrec->in_validation);
		failrec->in_validation = 1;
		failrec->this_mirror = failed_mirror;
	} else {
		/*
		 * we're ready to fulfill a) and b) alongside. get a good copy
		 * of the failed sector and if we succeed, we have setup
		 * everything for repair_io_failure to do the rest for us.
2326 */ 2327 if (failrec->in_validation) { 2328 BUG_ON(failrec->this_mirror != failed_mirror); 2329 failrec->in_validation = 0; 2330 failrec->this_mirror = 0; 2331 } 2332 failrec->failed_mirror = failed_mirror; 2333 failrec->this_mirror++; 2334 if (failrec->this_mirror == failed_mirror) 2335 failrec->this_mirror++; 2336 } 2337 2338 if (failrec->this_mirror > num_copies) { 2339 btrfs_debug(fs_info, 2340 "Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d", 2341 num_copies, failrec->this_mirror, failed_mirror); 2342 return 0; 2343 } 2344 2345 return 1; 2346 } 2347 2348 2349 struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, 2350 struct io_failure_record *failrec, 2351 struct page *page, int pg_offset, int icsum, 2352 bio_end_io_t *endio_func, void *data) 2353 { 2354 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2355 struct bio *bio; 2356 struct btrfs_io_bio *btrfs_failed_bio; 2357 struct btrfs_io_bio *btrfs_bio; 2358 2359 bio = btrfs_io_bio_alloc(GFP_NOFS, 1); 2360 if (!bio) 2361 return NULL; 2362 2363 bio->bi_end_io = endio_func; 2364 bio->bi_iter.bi_sector = failrec->logical >> 9; 2365 bio->bi_bdev = fs_info->fs_devices->latest_bdev; 2366 bio->bi_iter.bi_size = 0; 2367 bio->bi_private = data; 2368 2369 btrfs_failed_bio = btrfs_io_bio(failed_bio); 2370 if (btrfs_failed_bio->csum) { 2371 u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); 2372 2373 btrfs_bio = btrfs_io_bio(bio); 2374 btrfs_bio->csum = btrfs_bio->csum_inline; 2375 icsum *= csum_size; 2376 memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum, 2377 csum_size); 2378 } 2379 2380 bio_add_page(bio, page, failrec->len, pg_offset); 2381 2382 return bio; 2383 } 2384 2385 /* 2386 * this is a generic handler for readpage errors (default 2387 * readpage_io_failed_hook). if other copies exist, read those and write back 2388 * good data to the failed position. 
does not investigate in remapping the
 * failed extent elsewhere, hoping the device will be smart enough to do this as
 * needed
 */

static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
			      struct page *page, u64 start, u64 end,
			      int failed_mirror)
{
	struct io_failure_record *failrec;
	struct inode *inode = page->mapping->host;
	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
	struct bio *bio;
	int read_mode = 0;
	int ret;

	/* this path only handles failed reads */
	BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);

	ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
	if (ret)
		return ret;

	ret = btrfs_check_repairable(inode, failed_bio, failrec, failed_mirror);
	if (!ret) {
		/* not repairable: drop the failure record and give up */
		free_io_failure(BTRFS_I(inode), failrec);
		return -EIO;
	}

	if (failed_bio->bi_vcnt > 1)
		read_mode |= REQ_FAILFAST_DEV;

	phy_offset >>= inode->i_sb->s_blocksize_bits;
	bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
				      start - page_offset(page),
				      (int)phy_offset, failed_bio->bi_end_io,
				      NULL);
	if (!bio) {
		free_io_failure(BTRFS_I(inode), failrec);
		return -EIO;
	}
	bio_set_op_attrs(bio, REQ_OP_READ, read_mode);

	btrfs_debug(btrfs_sb(inode->i_sb),
		"Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d",
		read_mode, failrec->this_mirror, failrec->in_validation);

	ret = tree->ops->submit_bio_hook(inode, bio, failrec->this_mirror,
					 failrec->bio_flags, 0);
	if (ret) {
		free_io_failure(BTRFS_I(inode), failrec);
		bio_put(bio);
	}

	return ret;
}

/* lots and lots of room for performance fixes in the end_bio funcs */

/*
 * Per-page completion work for a write: run the writepage_end_io hook and,
 * on error, clear uptodate, set the page error bit and record the error on
 * the mapping.
 */
void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
{
	int uptodate = (err == 0);
	struct extent_io_tree *tree;
	int ret = 0;

	tree = &BTRFS_I(page->mapping->host)->io_tree;

	if (tree->ops && tree->ops->writepage_end_io_hook)
		tree->ops->writepage_end_io_hook(page, start, end, NULL,
						 uptodate);

	if (!uptodate) {
		ClearPageUptodate(page);
		SetPageError(page);
		ret = err < 0 ? err : -EIO;
		mapping_set_error(page->mapping, ret);
	}
}

/*
 * after a writepage IO is done, we need to:
 * clear the uptodate bits on error
 * clear the writeback bits in the extent tree for this IO
 * end_page_writeback if the page has no more pending IO
 *
 * Scheduling is not allowed, so the extent state tree is expected
 * to have one and only one object corresponding to this IO.
 */
static void end_bio_extent_writepage(struct bio *bio)
{
	struct bio_vec *bvec;
	u64 start;
	u64 end;
	int i;

	bio_for_each_segment_all(bvec, bio, i) {
		struct page *page = bvec->bv_page;
		struct inode *inode = page->mapping->host;
		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

		/* We always issue full-page writes, but if some block
		 * in a page fails, blk_update_request() will
		 * advance bv_offset and adjust bv_len to compensate.
		 * Print a warning for nonzero offsets, and an error
		 * if they don't add up to a full page.
		 */
		if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) {
			if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
				btrfs_err(fs_info,
				   "partial page write in btrfs with offset %u and length %u",
					bvec->bv_offset, bvec->bv_len);
			else
				btrfs_info(fs_info,
				   "incomplete page write in btrfs with offset %u and length %u",
					bvec->bv_offset, bvec->bv_len);
		}

		start = page_offset(page);
		end = start + bvec->bv_offset + bvec->bv_len - 1;

		end_extent_writepage(page, bio->bi_error, start, end);
		end_page_writeback(page);
	}

	bio_put(bio);
}

/*
 * Unlock a completed read range in the io tree and, if the read succeeded on
 * a tree that tracks uptodate state, mark the range EXTENT_UPTODATE first.
 */
static void
endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
			      int uptodate)
{
	struct extent_state *cached = NULL;
	u64 end = start + len - 1;

	if (uptodate && tree->track_uptodate)
		set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC);
	unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
}

/*
 * after a readpage IO is done, we need to:
 * clear the uptodate bits on error
 * set the uptodate bits if things worked
 * set the page up to date if all extents in the tree are uptodate
 * clear the lock bit in the extent tree
 * unlock the page if there are no other extents locked for it
 *
 * Scheduling is not allowed, so the extent state tree is expected
 * to have one and only one object corresponding to this IO.
 */
static void end_bio_extent_readpage(struct bio *bio)
{
	struct bio_vec *bvec;
	int uptodate = !bio->bi_error;
	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
	struct extent_io_tree *tree;
	u64 offset = 0;
	u64 start;
	u64 end;
	u64 len;
	u64 extent_start = 0;
	u64 extent_len = 0;
	int mirror;
	int ret;
	int i;

	bio_for_each_segment_all(bvec, bio, i) {
		struct page *page = bvec->bv_page;
		struct inode *inode = page->mapping->host;
		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

		btrfs_debug(fs_info,
			"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
			(u64)bio->bi_iter.bi_sector, bio->bi_error,
			io_bio->mirror_num);
		tree = &BTRFS_I(inode)->io_tree;

		/* We always issue full-page reads, but if some block
		 * in a page fails to read, blk_update_request() will
		 * advance bv_offset and adjust bv_len to compensate.
		 * Print a warning for nonzero offsets, and an error
		 * if they don't add up to a full page.
		 */
		if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) {
			if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
				btrfs_err(fs_info,
					"partial page read in btrfs with offset %u and length %u",
					bvec->bv_offset, bvec->bv_len);
			else
				btrfs_info(fs_info,
					"incomplete page read in btrfs with offset %u and length %u",
					bvec->bv_offset, bvec->bv_len);
		}

		start = page_offset(page);
		end = start + bvec->bv_offset + bvec->bv_len - 1;
		len = bvec->bv_len;

		mirror = io_bio->mirror_num;
		if (likely(uptodate && tree->ops)) {
			/* csum verification; failure flips uptodate */
			ret = tree->ops->readpage_end_io_hook(io_bio, offset,
							      page, start, end,
							      mirror);
			if (ret)
				uptodate = 0;
			else
				clean_io_failure(BTRFS_I(inode), start,
						 page, 0);
		}

		if (likely(uptodate))
			goto readpage_ok;

		if (tree->ops) {
			ret = tree->ops->readpage_io_failed_hook(page, mirror);
			if (ret == -EAGAIN) {
				/*
				 * Data inode's readpage_io_failed_hook() always
				 * returns -EAGAIN.
				 *
				 * The generic bio_readpage_error handles errors
				 * the following way: If possible, new read
				 * requests are created and submitted and will
				 * end up in end_bio_extent_readpage as well (if
				 * we're lucky, not in the !uptodate case). In
				 * that case it returns 0 and we just go on with
				 * the next page in our bio. If it can't handle
				 * the error it will return -EIO and we remain
				 * responsible for that page.
				 */
				ret = bio_readpage_error(bio, offset, page,
							 start, end, mirror);
				if (ret == 0) {
					uptodate = !bio->bi_error;
					offset += len;
					continue;
				}
			}

			/*
			 * metadata's readpage_io_failed_hook() always returns
			 * -EIO and fixes nothing.  -EIO is also returned if
			 * data inode error could not be fixed.
			 */
			ASSERT(ret == -EIO);
		}
readpage_ok:
		if (likely(uptodate)) {
			loff_t i_size = i_size_read(inode);
			pgoff_t end_index = i_size >> PAGE_SHIFT;
			unsigned off;

			/* Zero out the end if this page straddles i_size */
			off = i_size & (PAGE_SIZE-1);
			if (page->index == end_index && off)
				zero_user_segment(page, off, PAGE_SIZE);
			SetPageUptodate(page);
		} else {
			ClearPageUptodate(page);
			SetPageError(page);
		}
		unlock_page(page);
		offset += len;

		/*
		 * Batch contiguous successful ranges into a single
		 * endio_readpage_release_extent() call; failed ranges are
		 * released individually (and flush any pending batch first).
		 */
		if (unlikely(!uptodate)) {
			if (extent_len) {
				endio_readpage_release_extent(tree,
							      extent_start,
							      extent_len, 1);
				extent_start = 0;
				extent_len = 0;
			}
			endio_readpage_release_extent(tree, start,
						      end - start + 1, 0);
		} else if (!extent_len) {
			/* start a new batch */
			extent_start = start;
			extent_len = end + 1 - start;
		} else if (extent_start + extent_len == start) {
			/* contiguous with the current batch: extend it */
			extent_len += end + 1 - start;
		} else {
			/* discontiguous: flush the batch and start anew */
			endio_readpage_release_extent(tree, extent_start,
						      extent_len, uptodate);
			extent_start = start;
			extent_len = end + 1 - start;
		}
	}

	if (extent_len)
		endio_readpage_release_extent(tree, extent_start, extent_len,
					      uptodate);
	if (io_bio->end_io)
		io_bio->end_io(io_bio, bio->bi_error);
	bio_put(bio);
}

/*
 * this allocates from the btrfs_bioset.
We're returning a bio right now
 * but you can call btrfs_io_bio for the appropriate container_of magic
 */
struct bio *
btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
		gfp_t gfp_flags)
{
	struct btrfs_io_bio *btrfs_bio;
	struct bio *bio;

	bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset);

	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
		/* under memory pressure, retry with progressively fewer vecs */
		while (!bio && (nr_vecs /= 2)) {
			bio = bio_alloc_bioset(gfp_flags,
					       nr_vecs, btrfs_bioset);
		}
	}

	if (bio) {
		bio->bi_bdev = bdev;
		bio->bi_iter.bi_sector = first_sector;
		btrfs_bio = btrfs_io_bio(bio);
		/* fresh bio: clear the btrfs_io_bio bookkeeping */
		btrfs_bio->csum = NULL;
		btrfs_bio->csum_allocated = NULL;
		btrfs_bio->end_io = NULL;
	}
	return bio;
}

/*
 * Clone a bio from the btrfs_bioset; the clone's btrfs_io_bio bookkeeping is
 * reset rather than copied from the original.
 */
struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask)
{
	struct btrfs_io_bio *btrfs_bio;
	struct bio *new;

	new = bio_clone_bioset(bio, gfp_mask, btrfs_bioset);
	if (new) {
		btrfs_bio = btrfs_io_bio(new);
		btrfs_bio->csum = NULL;
		btrfs_bio->csum_allocated = NULL;
		btrfs_bio->end_io = NULL;
	}
	return new;
}

/* this also allocates from the btrfs_bioset */
struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
{
	struct btrfs_io_bio *btrfs_bio;
	struct bio *bio;

	bio = bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset);
	if (bio) {
		btrfs_bio = btrfs_io_bio(bio);
		btrfs_bio->csum = NULL;
		btrfs_bio->csum_allocated = NULL;
		btrfs_bio->end_io = NULL;
	}
	return bio;
}


/*
 * Hand a fully built bio to the fs submit hook, or straight to the block
 * layer when the tree has no ops.  The bio's bi_private (the io tree) is
 * cleared before submission; a local reference is held across the hook so
 * @bio stays valid until it returns.
 */
static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
				       unsigned long bio_flags)
{
	int ret = 0;
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct page *page = bvec->bv_page;
	struct extent_io_tree *tree = bio->bi_private;
	u64 start;

	/* file offset of the last segment added to this bio */
	start = page_offset(page) + bvec->bv_offset;

	bio->bi_private = NULL;
	bio_get(bio);

	if (tree->ops)
		ret = tree->ops->submit_bio_hook(page->mapping->host, bio,
						 mirror_num, bio_flags, start);
	else
		btrfsic_submit_bio(bio);

	bio_put(bio);
	return ret;
}

/*
 * Ask the fs whether @page may be merged into @bio; a nonzero return means
 * the bio must be submitted as-is first.
 */
static int merge_bio(struct extent_io_tree *tree, struct page *page,
		     unsigned long offset, size_t size, struct bio *bio,
		     unsigned long bio_flags)
{
	int ret = 0;
	if (tree->ops)
		ret = tree->ops->merge_bio_hook(page, offset, size, bio,
						bio_flags);
	return ret;

}

/*
 * Add a page range to the bio cached in *bio_ret when it is contiguous and
 * compatible (same bio_flags, merge hook agrees, fits in the bio and
 * !force_bio_submit); otherwise submit the cached bio and start a new one,
 * either caching it in *bio_ret or submitting it immediately when no
 * bio_ret was supplied.
 *
 * Returns 0 on success, < 0 if a submission failed or the bio allocation
 * failed (-ENOMEM).
 */
static int submit_extent_page(int op, int op_flags, struct extent_io_tree *tree,
			      struct writeback_control *wbc,
			      struct page *page, sector_t sector,
			      size_t size, unsigned long offset,
			      struct block_device *bdev,
			      struct bio **bio_ret,
			      bio_end_io_t end_io_func,
			      int mirror_num,
			      unsigned long prev_bio_flags,
			      unsigned long bio_flags,
			      bool force_bio_submit)
{
	int ret = 0;
	struct bio *bio;
	int contig = 0;
	int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
	size_t page_size = min_t(size_t, size, PAGE_SIZE);

	if (bio_ret && *bio_ret) {
		bio = *bio_ret;
		if (old_compressed)
			contig = bio->bi_iter.bi_sector == sector;
		else
			contig = bio_end_sector(bio) == sector;

		if (prev_bio_flags != bio_flags || !contig ||
		    force_bio_submit ||
		    merge_bio(tree, page, offset, page_size, bio, bio_flags) ||
		    bio_add_page(bio, page, page_size, offset) < page_size) {
			/* can't extend the cached bio: submit it */
			ret = submit_one_bio(bio, mirror_num, prev_bio_flags);
			if (ret < 0) {
				*bio_ret = NULL;
				return ret;
			}
			bio = NULL;
		} else {
			if (wbc)
				wbc_account_io(wbc, page, page_size);
			return 0;
		}
	}

	bio = btrfs_bio_alloc(bdev, sector, BIO_MAX_PAGES,
			      GFP_NOFS | __GFP_HIGH);
	if (!bio)
		return -ENOMEM;

	bio_add_page(bio, page, page_size, offset);
	bio->bi_end_io = end_io_func;
	bio->bi_private = tree;
	bio_set_op_attrs(bio, op, op_flags);
	if (wbc) {
		wbc_init_bio(wbc, bio);
		wbc_account_io(wbc, page, page_size);
	}

	if (bio_ret)
		*bio_ret = bio;
	else
		ret = submit_one_bio(bio, mirror_num, bio_flags);

	return ret;
}

/*
 * Point page->private at @eb (taking a page reference) so the buffer can be
 * found from the page; warns if the page is already attached to another eb.
 */
static void attach_extent_buffer_page(struct extent_buffer *eb,
				      struct page *page)
{
	if (!PagePrivate(page)) {
		SetPagePrivate(page);
		get_page(page);
		set_page_private(page, (unsigned long)eb);
	} else {
		WARN_ON(page->private != (unsigned long)eb);
	}
}

/* mark a data page as belonging to the extent io machinery */
void set_page_extent_mapped(struct page *page)
{
	if (!PagePrivate(page)) {
		SetPagePrivate(page);
		get_page(page);
		set_page_private(page, EXTENT_PAGE_PRIVATE);
	}
}

/*
 * Look up the extent map covering @start, reusing *em_cached when it still
 * covers the range; a fresh lookup replaces the cache.  Returns the map with
 * an extra reference, or an ERR_PTR/NULL from get_extent.
 */
static struct extent_map *
__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
		 u64 start, u64 len, get_extent_t *get_extent,
		 struct extent_map **em_cached)
{
	struct extent_map *em;

	if (em_cached && *em_cached) {
		em = *em_cached;
		if (extent_map_in_tree(em) && start >= em->start &&
		    start < extent_map_end(em)) {
			refcount_inc(&em->refs);
			return em;
		}

		/* cached map doesn't cover @start: drop it */
		free_extent_map(em);
		*em_cached = NULL;
	}

	em = get_extent(BTRFS_I(inode), page, pg_offset, start, len, 0);
	if (em_cached && !IS_ERR_OR_NULL(em)) {
		BUG_ON(*em_cached);
		refcount_inc(&em->refs);
		*em_cached = em;
	}
	return em;
}
/*
 * basic readpage implementation.
Locked extent state structs are inserted
 * into the tree that are removed when the IO is done (by the end_io
 * handlers)
 * XXX JDM: This needs looking at to ensure proper page locking
 * return 0 on success, otherwise return error
 */
static int __do_readpage(struct extent_io_tree *tree,
			 struct page *page,
			 get_extent_t *get_extent,
			 struct extent_map **em_cached,
			 struct bio **bio, int mirror_num,
			 unsigned long *bio_flags, int read_flags,
			 u64 *prev_em_start)
{
	struct inode *inode = page->mapping->host;
	u64 start = page_offset(page);
	u64 page_end = start + PAGE_SIZE - 1;
	u64 end;
	u64 cur = start;
	u64 extent_offset;
	u64 last_byte = i_size_read(inode);
	u64 block_start;
	u64 cur_end;
	sector_t sector;
	struct extent_map *em;
	struct block_device *bdev;
	int ret = 0;
	int nr = 0;		/* number of bios submitted for this page */
	size_t pg_offset = 0;
	size_t iosize;
	size_t disk_io_size;
	size_t blocksize = inode->i_sb->s_blocksize;
	unsigned long this_bio_flag = 0;

	set_page_extent_mapped(page);

	end = page_end;
	if (!PageUptodate(page)) {
		/* cleancache may satisfy the read without any IO */
		if (cleancache_get_page(page) == 0) {
			BUG_ON(blocksize != PAGE_SIZE);
			unlock_extent(tree, start, end);
			goto out;
		}
	}

	if (page->index == last_byte >> PAGE_SHIFT) {
		char *userpage;
		size_t zero_offset = last_byte & (PAGE_SIZE - 1);

		/* zero the tail of a page that straddles i_size */
		if (zero_offset) {
			iosize = PAGE_SIZE - zero_offset;
			userpage = kmap_atomic(page);
			memset(userpage + zero_offset, 0, iosize);
			flush_dcache_page(page);
			kunmap_atomic(userpage);
		}
	}
	while (cur <= end) {
		bool force_bio_submit = false;

		if (cur >= last_byte) {
			/* entirely past i_size: zero-fill, no IO needed */
			char *userpage;
			struct extent_state *cached = NULL;

			iosize = PAGE_SIZE - pg_offset;
			userpage = kmap_atomic(page);
			memset(userpage + pg_offset, 0, iosize);
			flush_dcache_page(page);
			kunmap_atomic(userpage);
			set_extent_uptodate(tree, cur, cur + iosize - 1,
					    &cached, GFP_NOFS);
			unlock_extent_cached(tree, cur,
					     cur + iosize - 1,
					     &cached, GFP_NOFS);
			break;
		}
		em = __get_extent_map(inode, page, pg_offset, cur,
				      end - cur + 1, get_extent, em_cached);
		if (IS_ERR_OR_NULL(em)) {
			SetPageError(page);
			unlock_extent(tree, cur, end);
			break;
		}
		extent_offset = cur - em->start;
		BUG_ON(extent_map_end(em) <= cur);
		BUG_ON(end < cur);

		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
			this_bio_flag |= EXTENT_BIO_COMPRESSED;
			extent_set_compress_type(&this_bio_flag,
						 em->compress_type);
		}

		iosize = min(extent_map_end(em) - cur, end - cur + 1);
		cur_end = min(extent_map_end(em) - 1, end);
		iosize = ALIGN(iosize, blocksize);
		if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
			/* compressed extents are read whole from block_start */
			disk_io_size = em->block_len;
			sector = em->block_start >> 9;
		} else {
			sector = (em->block_start + extent_offset) >> 9;
			disk_io_size = iosize;
		}
		bdev = em->bdev;
		block_start = em->block_start;
		/* prealloc reads back as zeroes, same as a hole */
		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
			block_start = EXTENT_MAP_HOLE;

		/*
		 * If we have a file range that points to a compressed extent
		 * and it's followed by a consecutive file range that points to
		 * to the same compressed extent (possibly with a different
		 * offset and/or length, so it either points to the whole extent
		 * or only part of it), we must make sure we do not submit a
		 * single bio to populate the pages for the 2 ranges because
		 * this makes the compressed extent read zero out the pages
		 * belonging to the 2nd range. Imagine the following scenario:
		 *
		 *  File layout
		 *  [0 - 8K]                     [8K - 24K]
		 *    |                               |
		 *    |                               |
		 * points to extent X,         points to extent X,
		 * offset 4K, length of 8K     offset 0, length 16K
		 *
		 * [extent X, compressed length = 4K uncompressed length = 16K]
		 *
		 * If the bio to read the compressed extent covers both ranges,
		 * it will decompress extent X into the pages belonging to the
		 * first range and then it will stop, zeroing out the remaining
		 * pages that belong to the other range that points to extent X.
		 * So here we make sure we submit 2 bios, one for the first
		 * range and another one for the third range. Both will target
		 * the same physical extent from disk, but we can't currently
		 * make the compressed bio endio callback populate the pages
		 * for both ranges because each compressed bio is tightly
		 * coupled with a single extent map, and each range can have
		 * an extent map with a different offset value relative to the
		 * uncompressed data of our extent and different lengths. This
		 * is a corner case so we prioritize correctness over
		 * non-optimal behavior (submitting 2 bios for the same extent).
		 */
		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
		    prev_em_start && *prev_em_start != (u64)-1 &&
		    *prev_em_start != em->orig_start)
			force_bio_submit = true;

		if (prev_em_start)
			*prev_em_start = em->orig_start;

		free_extent_map(em);
		em = NULL;

		/* we've found a hole, just zero and go on */
		if (block_start == EXTENT_MAP_HOLE) {
			char *userpage;
			struct extent_state *cached = NULL;

			userpage = kmap_atomic(page);
			memset(userpage + pg_offset, 0, iosize);
			flush_dcache_page(page);
			kunmap_atomic(userpage);

			set_extent_uptodate(tree, cur, cur + iosize - 1,
					    &cached, GFP_NOFS);
			unlock_extent_cached(tree, cur,
					     cur + iosize - 1,
					     &cached, GFP_NOFS);
			cur = cur + iosize;
			pg_offset += iosize;
			continue;
		}
		/* the get_extent function already copied into the page */
		if (test_range_bit(tree, cur, cur_end,
				   EXTENT_UPTODATE, 1, NULL)) {
			check_page_uptodate(tree, page);
			unlock_extent(tree, cur, cur + iosize - 1);
			cur = cur + iosize;
			pg_offset += iosize;
			continue;
		}
		/* we have an inline extent but it didn't get marked up
		 * to date.  Error out
		 */
		if (block_start == EXTENT_MAP_INLINE) {
			SetPageError(page);
			unlock_extent(tree, cur, cur + iosize - 1);
			cur = cur + iosize;
			pg_offset += iosize;
			continue;
		}

		ret = submit_extent_page(REQ_OP_READ, read_flags, tree, NULL,
					 page, sector, disk_io_size, pg_offset,
					 bdev, bio,
					 end_bio_extent_readpage, mirror_num,
					 *bio_flags,
					 this_bio_flag,
					 force_bio_submit);
		if (!ret) {
			nr++;
			*bio_flags = this_bio_flag;
		} else {
			SetPageError(page);
			unlock_extent(tree, cur, cur + iosize - 1);
			goto out;
		}
		cur = cur + iosize;
		pg_offset += iosize;
	}
out:
	if (!nr) {
		/* no IO submitted: page state is final, finish it now */
		if (!PageError(page))
			SetPageUptodate(page);
		unlock_page(page);
	}
	return ret;
}

/*
 * Read a run of physically contiguous pages: wait out any ordered extents
 * over the range, lock it once, then issue __do_readpage() for each page.
 * Drops the page references taken by the caller.
 */
static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
					     struct page *pages[], int nr_pages,
					     u64 start, u64 end,
					     get_extent_t *get_extent,
					     struct extent_map **em_cached,
					     struct bio **bio, int mirror_num,
					     unsigned long *bio_flags,
					     u64 *prev_em_start)
{
	struct inode *inode;
	struct btrfs_ordered_extent *ordered;
	int index;

	inode = pages[0]->mapping->host;
	while (1) {
		lock_extent(tree, start, end);
		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
						     end - start + 1);
		if (!ordered)
			break;
		/* wait for pending ordered IO, then retry the lock */
		unlock_extent(tree, start, end);
		btrfs_start_ordered_extent(inode, ordered, 1);
		btrfs_put_ordered_extent(ordered);
	}

	for (index = 0; index < nr_pages; index++) {
		__do_readpage(tree, pages[index], get_extent, em_cached, bio,
			      mirror_num, bio_flags, 0, prev_em_start);
		put_page(pages[index]);
	}
}

/*
 * Split a batch of pages into physically contiguous runs and read each run
 * via __do_contiguous_readpages().
 */
static void __extent_readpages(struct extent_io_tree *tree,
			       struct page *pages[],
			       int nr_pages, get_extent_t *get_extent,
			       struct extent_map **em_cached,
			       struct bio **bio, int mirror_num,
unsigned long *bio_flags,
			       u64 *prev_em_start)
{
	u64 start = 0;
	u64 end = 0;
	u64 page_start;
	int index;
	int first_index = 0;

	for (index = 0; index < nr_pages; index++) {
		page_start = page_offset(pages[index]);
		if (!end) {
			/* first page: open a new contiguous run */
			start = page_start;
			end = start + PAGE_SIZE - 1;
			first_index = index;
		} else if (end + 1 == page_start) {
			/* adjacent page: extend the run */
			end += PAGE_SIZE;
		} else {
			/* gap: flush the run and start a new one */
			__do_contiguous_readpages(tree, &pages[first_index],
						  index - first_index, start,
						  end, get_extent, em_cached,
						  bio, mirror_num, bio_flags,
						  prev_em_start);
			start = page_start;
			end = start + PAGE_SIZE - 1;
			first_index = index;
		}
	}

	if (end)
		__do_contiguous_readpages(tree, &pages[first_index],
					  index - first_index, start,
					  end, get_extent, em_cached, bio,
					  mirror_num, bio_flags,
					  prev_em_start);
}

/*
 * Read a single page: wait out any ordered extent covering it, lock the
 * range and run __do_readpage().  Returns 0 on success, otherwise an error.
 */
static int __extent_read_full_page(struct extent_io_tree *tree,
				   struct page *page,
				   get_extent_t *get_extent,
				   struct bio **bio, int mirror_num,
				   unsigned long *bio_flags, int read_flags)
{
	struct inode *inode = page->mapping->host;
	struct btrfs_ordered_extent *ordered;
	u64 start = page_offset(page);
	u64 end = start + PAGE_SIZE - 1;
	int ret;

	while (1) {
		lock_extent(tree, start, end);
		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
						     PAGE_SIZE);
		if (!ordered)
			break;
		unlock_extent(tree, start, end);
		btrfs_start_ordered_extent(inode, ordered, 1);
		btrfs_put_ordered_extent(ordered);
	}

	ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num,
			    bio_flags, read_flags, NULL);
	return ret;
}

/* public single-page read entry; submits any bio left pending by the read */
int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
			  get_extent_t *get_extent, int mirror_num)
{
	struct bio *bio = NULL;
	unsigned long bio_flags = 0;
	int ret;

	ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
				      &bio_flags, 0);
	if (bio)
		ret = submit_one_bio(bio, mirror_num, bio_flags);
	return ret;
}

/* charge @nr_written pages against the writeback control's budget */
static void update_nr_written(struct writeback_control *wbc,
			      unsigned long nr_written)
{
	wbc->nr_to_write -= nr_written;
}

/*
 * helper for __extent_writepage, doing all of the delayed allocation setup.
 *
 * This returns 1 if our fill_delalloc function did all the work required
 * to write the page (copy into inline extent).  In this case the IO has
 * been started and the page is already unlocked.
 *
 * This returns 0 if all went well (page still locked)
 * This returns < 0 if there were errors (page still locked)
 */
static noinline_for_stack int writepage_delalloc(struct inode *inode,
			      struct page *page, struct writeback_control *wbc,
			      struct extent_page_data *epd,
			      u64 delalloc_start,
			      unsigned long *nr_written)
{
	struct extent_io_tree *tree = epd->tree;
	u64 page_end = delalloc_start + PAGE_SIZE - 1;
	u64 nr_delalloc;
	u64 delalloc_to_write = 0;
	u64 delalloc_end = 0;
	int ret;
	int page_started = 0;

	if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc)
		return 0;

	while (delalloc_end < page_end) {
		nr_delalloc = find_lock_delalloc_range(inode, tree,
					       page,
					       &delalloc_start,
					       &delalloc_end,
					       BTRFS_MAX_EXTENT_SIZE);
		if (nr_delalloc == 0) {
			delalloc_start = delalloc_end + 1;
			continue;
		}
		ret = tree->ops->fill_delalloc(inode, page,
					       delalloc_start,
					       delalloc_end,
					       &page_started,
					       nr_written);
		/* File system has been set read-only */
		if (ret) {
			SetPageError(page);
			/* fill_delalloc should be return < 0 for error
			 * but just in case, we use > 0 here meaning the
			 * IO is started, so we don't want to return > 0
			 * unless things are going well.
			 */
			ret = ret < 0 ? ret : -EIO;
			goto done;
		}
		/*
		 * delalloc_end is already one less than the total length, so
		 * we don't subtract one from PAGE_SIZE
		 */
		delalloc_to_write += (delalloc_end - delalloc_start +
				      PAGE_SIZE) >> PAGE_SHIFT;
		delalloc_start = delalloc_end + 1;
	}
	if (wbc->nr_to_write < delalloc_to_write) {
		/* raise the budget so the whole delalloc range gets written */
		int thresh = 8192;

		if (delalloc_to_write < thresh * 2)
			thresh = delalloc_to_write;
		wbc->nr_to_write = min_t(u64, delalloc_to_write,
					 thresh);
	}

	/* did the fill delalloc function already unlock and start
	 * the IO?
	 */
	if (page_started) {
		/*
		 * we've unlocked the page, so we can't update
		 * the mapping's writeback index, just update
		 * nr_to_write.
		 */
		wbc->nr_to_write -= *nr_written;
		return 1;
	}

	ret = 0;

done:
	return ret;
}

/*
 * helper for __extent_writepage.  This calls the writepage start hooks,
 * and does the loop to map the page into extents and bios.
 *
 * We return 1 if the IO is started and the page is unlocked,
 * 0 if all went well (page still locked)
 * < 0 if there were errors (page still locked)
 */
static noinline_for_stack int __extent_writepage_io(struct inode *inode,
				 struct page *page,
				 struct writeback_control *wbc,
				 struct extent_page_data *epd,
				 loff_t i_size,
				 unsigned long nr_written,
				 int write_flags, int *nr_ret)
{
	struct extent_io_tree *tree = epd->tree;
	u64 start = page_offset(page);
	u64 page_end = start + PAGE_SIZE - 1;
	u64 end;
	u64 cur = start;
	u64 extent_offset;
	u64 block_start;
	u64 iosize;
	sector_t sector;
	struct extent_map *em;
	struct block_device *bdev;
	size_t pg_offset = 0;
	size_t blocksize;
	int ret = 0;
	int nr = 0;		/* number of bios submitted for this page */
	bool compressed;

	if (tree->ops && tree->ops->writepage_start_hook) {
		ret = tree->ops->writepage_start_hook(page, start,
						      page_end);
		if (ret) {
			/* Fixup worker will requeue */
			if (ret == -EBUSY)
				wbc->pages_skipped++;
			else
				redirty_page_for_writepage(wbc, page);

			update_nr_written(wbc, nr_written);
			unlock_page(page);
			return 1;
		}
	}

	/*
	 * we don't want to touch the inode after unlocking the page,
	 * so we update the mapping writeback index now
	 */
	update_nr_written(wbc, nr_written + 1);

	end = page_end;
	if (i_size <= start) {
		/* page is fully beyond i_size: nothing to write */
		if (tree->ops && tree->ops->writepage_end_io_hook)
			tree->ops->writepage_end_io_hook(page, start,
							 page_end, NULL, 1);
		goto done;
	}

	blocksize = inode->i_sb->s_blocksize;

	while (cur <= end) {
		u64 em_end;

		if (cur >= i_size) {
			if (tree->ops && tree->ops->writepage_end_io_hook)
				tree->ops->writepage_end_io_hook(page, cur,
								 page_end, NULL, 1);
			break;
		}
		em = epd->get_extent(BTRFS_I(inode), page, pg_offset, cur,
				     end - cur + 1, 1);
		if (IS_ERR_OR_NULL(em)) {
			SetPageError(page);
			ret = PTR_ERR_OR_ZERO(em);
			break;
		}

		extent_offset = cur - em->start;
		em_end = extent_map_end(em);
		BUG_ON(em_end <= cur);
		BUG_ON(end < cur);
		iosize = min(em_end - cur, end - cur + 1);
		iosize = ALIGN(iosize, blocksize);
		sector = (em->block_start + extent_offset) >> 9;
		bdev = em->bdev;
		block_start = em->block_start;
		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
		free_extent_map(em);
		em = NULL;

		/*
		 * compressed and inline extents are written through other
		 * paths in the FS
		 */
		if (compressed || block_start == EXTENT_MAP_HOLE ||
		    block_start == EXTENT_MAP_INLINE) {
			/*
			 * end_io notification does not happen here for
			 * compressed extents
			 */
			if (!compressed && tree->ops &&
			    tree->ops->writepage_end_io_hook)
				tree->ops->writepage_end_io_hook(page, cur,
							 cur + iosize - 1,
							 NULL, 1);
			else if (compressed) {
				/* we don't want to end_page_writeback on
				 * a compressed extent.  this happens
				 * elsewhere
				 */
				nr++;
			}

			cur += iosize;
			pg_offset += iosize;
			continue;
		}

		set_range_writeback(tree, cur, cur + iosize - 1);
		if (!PageWriteback(page)) {
			btrfs_err(BTRFS_I(inode)->root->fs_info,
				   "page %lu not writeback, cur %llu end %llu",
			       page->index, cur, end);
		}

		ret = submit_extent_page(REQ_OP_WRITE, write_flags, tree, wbc,
					 page, sector, iosize, pg_offset,
					 bdev, &epd->bio,
					 end_bio_extent_writepage,
					 0, 0, 0, false);
		if (ret) {
			SetPageError(page);
			if (PageWriteback(page))
				end_page_writeback(page);
		}

		cur = cur + iosize;
		pg_offset += iosize;
		nr++;
	}
done:
	*nr_ret = nr;
	return ret;
}

/*
 * the writepage semantics are similar to regular writepage.
extent 3474 * records are inserted to lock ranges in the tree, and as dirty areas 3475 * are found, they are marked writeback. Then the lock bits are removed 3476 * and the end_io handler clears the writeback ranges 3477 */ 3478 static int __extent_writepage(struct page *page, struct writeback_control *wbc, 3479 void *data) 3480 { 3481 struct inode *inode = page->mapping->host; 3482 struct extent_page_data *epd = data; 3483 u64 start = page_offset(page); 3484 u64 page_end = start + PAGE_SIZE - 1; 3485 int ret; 3486 int nr = 0; 3487 size_t pg_offset = 0; 3488 loff_t i_size = i_size_read(inode); 3489 unsigned long end_index = i_size >> PAGE_SHIFT; 3490 int write_flags = 0; 3491 unsigned long nr_written = 0; 3492 3493 if (wbc->sync_mode == WB_SYNC_ALL) 3494 write_flags = REQ_SYNC; 3495 3496 trace___extent_writepage(page, inode, wbc); 3497 3498 WARN_ON(!PageLocked(page)); 3499 3500 ClearPageError(page); 3501 3502 pg_offset = i_size & (PAGE_SIZE - 1); 3503 if (page->index > end_index || 3504 (page->index == end_index && !pg_offset)) { 3505 page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); 3506 unlock_page(page); 3507 return 0; 3508 } 3509 3510 if (page->index == end_index) { 3511 char *userpage; 3512 3513 userpage = kmap_atomic(page); 3514 memset(userpage + pg_offset, 0, 3515 PAGE_SIZE - pg_offset); 3516 kunmap_atomic(userpage); 3517 flush_dcache_page(page); 3518 } 3519 3520 pg_offset = 0; 3521 3522 set_page_extent_mapped(page); 3523 3524 ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written); 3525 if (ret == 1) 3526 goto done_unlocked; 3527 if (ret) 3528 goto done; 3529 3530 ret = __extent_writepage_io(inode, page, wbc, epd, 3531 i_size, nr_written, write_flags, &nr); 3532 if (ret == 1) 3533 goto done_unlocked; 3534 3535 done: 3536 if (nr == 0) { 3537 /* make sure the mapping tag for page dirty gets cleared */ 3538 set_page_writeback(page); 3539 end_page_writeback(page); 3540 } 3541 if (PageError(page)) { 3542 ret = ret < 0 ? 
ret : -EIO; 3543 end_extent_writepage(page, ret, start, page_end); 3544 } 3545 unlock_page(page); 3546 return ret; 3547 3548 done_unlocked: 3549 return 0; 3550 } 3551 3552 void wait_on_extent_buffer_writeback(struct extent_buffer *eb) 3553 { 3554 wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK, 3555 TASK_UNINTERRUPTIBLE); 3556 } 3557 3558 static noinline_for_stack int 3559 lock_extent_buffer_for_io(struct extent_buffer *eb, 3560 struct btrfs_fs_info *fs_info, 3561 struct extent_page_data *epd) 3562 { 3563 unsigned long i, num_pages; 3564 int flush = 0; 3565 int ret = 0; 3566 3567 if (!btrfs_try_tree_write_lock(eb)) { 3568 flush = 1; 3569 flush_write_bio(epd); 3570 btrfs_tree_lock(eb); 3571 } 3572 3573 if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { 3574 btrfs_tree_unlock(eb); 3575 if (!epd->sync_io) 3576 return 0; 3577 if (!flush) { 3578 flush_write_bio(epd); 3579 flush = 1; 3580 } 3581 while (1) { 3582 wait_on_extent_buffer_writeback(eb); 3583 btrfs_tree_lock(eb); 3584 if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) 3585 break; 3586 btrfs_tree_unlock(eb); 3587 } 3588 } 3589 3590 /* 3591 * We need to do this to prevent races in people who check if the eb is 3592 * under IO since we can end up having no IO bits set for a short period 3593 * of time. 
3594 */ 3595 spin_lock(&eb->refs_lock); 3596 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3597 set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3598 spin_unlock(&eb->refs_lock); 3599 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 3600 __percpu_counter_add(&fs_info->dirty_metadata_bytes, 3601 -eb->len, 3602 fs_info->dirty_metadata_batch); 3603 ret = 1; 3604 } else { 3605 spin_unlock(&eb->refs_lock); 3606 } 3607 3608 btrfs_tree_unlock(eb); 3609 3610 if (!ret) 3611 return ret; 3612 3613 num_pages = num_extent_pages(eb->start, eb->len); 3614 for (i = 0; i < num_pages; i++) { 3615 struct page *p = eb->pages[i]; 3616 3617 if (!trylock_page(p)) { 3618 if (!flush) { 3619 flush_write_bio(epd); 3620 flush = 1; 3621 } 3622 lock_page(p); 3623 } 3624 } 3625 3626 return ret; 3627 } 3628 3629 static void end_extent_buffer_writeback(struct extent_buffer *eb) 3630 { 3631 clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); 3632 smp_mb__after_atomic(); 3633 wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); 3634 } 3635 3636 static void set_btree_ioerr(struct page *page) 3637 { 3638 struct extent_buffer *eb = (struct extent_buffer *)page->private; 3639 3640 SetPageError(page); 3641 if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) 3642 return; 3643 3644 /* 3645 * If writeback for a btree extent that doesn't belong to a log tree 3646 * failed, increment the counter transaction->eb_write_errors. 
3647 * We do this because while the transaction is running and before it's 3648 * committing (when we call filemap_fdata[write|wait]_range against 3649 * the btree inode), we might have 3650 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it 3651 * returns an error or an error happens during writeback, when we're 3652 * committing the transaction we wouldn't know about it, since the pages 3653 * can be no longer dirty nor marked anymore for writeback (if a 3654 * subsequent modification to the extent buffer didn't happen before the 3655 * transaction commit), which makes filemap_fdata[write|wait]_range not 3656 * able to find the pages tagged with SetPageError at transaction 3657 * commit time. So if this happens we must abort the transaction, 3658 * otherwise we commit a super block with btree roots that point to 3659 * btree nodes/leafs whose content on disk is invalid - either garbage 3660 * or the content of some node/leaf from a past generation that got 3661 * cowed or deleted and is no longer valid. 3662 * 3663 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would 3664 * not be enough - we need to distinguish between log tree extents vs 3665 * non-log tree extents, and the next filemap_fdatawait_range() call 3666 * will catch and clear such errors in the mapping - and that call might 3667 * be from a log sync and not from a transaction commit. Also, checking 3668 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is 3669 * not done and would not be reliable - the eb might have been released 3670 * from memory and reading it back again means that flag would not be 3671 * set (since it's a runtime flag, not persisted on disk). 
3672 * 3673 * Using the flags below in the btree inode also makes us achieve the 3674 * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started 3675 * writeback for all dirty pages and before filemap_fdatawait_range() 3676 * is called, the writeback for all dirty pages had already finished 3677 * with errors - because we were not using AS_EIO/AS_ENOSPC, 3678 * filemap_fdatawait_range() would return success, as it could not know 3679 * that writeback errors happened (the pages were no longer tagged for 3680 * writeback). 3681 */ 3682 switch (eb->log_index) { 3683 case -1: 3684 set_bit(BTRFS_FS_BTREE_ERR, &eb->fs_info->flags); 3685 break; 3686 case 0: 3687 set_bit(BTRFS_FS_LOG1_ERR, &eb->fs_info->flags); 3688 break; 3689 case 1: 3690 set_bit(BTRFS_FS_LOG2_ERR, &eb->fs_info->flags); 3691 break; 3692 default: 3693 BUG(); /* unexpected, logic error */ 3694 } 3695 } 3696 3697 static void end_bio_extent_buffer_writepage(struct bio *bio) 3698 { 3699 struct bio_vec *bvec; 3700 struct extent_buffer *eb; 3701 int i, done; 3702 3703 bio_for_each_segment_all(bvec, bio, i) { 3704 struct page *page = bvec->bv_page; 3705 3706 eb = (struct extent_buffer *)page->private; 3707 BUG_ON(!eb); 3708 done = atomic_dec_and_test(&eb->io_pages); 3709 3710 if (bio->bi_error || 3711 test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { 3712 ClearPageUptodate(page); 3713 set_btree_ioerr(page); 3714 } 3715 3716 end_page_writeback(page); 3717 3718 if (!done) 3719 continue; 3720 3721 end_extent_buffer_writeback(eb); 3722 } 3723 3724 bio_put(bio); 3725 } 3726 3727 static noinline_for_stack int write_one_eb(struct extent_buffer *eb, 3728 struct btrfs_fs_info *fs_info, 3729 struct writeback_control *wbc, 3730 struct extent_page_data *epd) 3731 { 3732 struct block_device *bdev = fs_info->fs_devices->latest_bdev; 3733 struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree; 3734 u64 offset = eb->start; 3735 u32 nritems; 3736 unsigned long i, num_pages; 3737 unsigned long bio_flags 
= 0; 3738 unsigned long start, end; 3739 int write_flags = (epd->sync_io ? REQ_SYNC : 0) | REQ_META; 3740 int ret = 0; 3741 3742 clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); 3743 num_pages = num_extent_pages(eb->start, eb->len); 3744 atomic_set(&eb->io_pages, num_pages); 3745 if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID) 3746 bio_flags = EXTENT_BIO_TREE_LOG; 3747 3748 /* set btree blocks beyond nritems with 0 to avoid stale content. */ 3749 nritems = btrfs_header_nritems(eb); 3750 if (btrfs_header_level(eb) > 0) { 3751 end = btrfs_node_key_ptr_offset(nritems); 3752 3753 memzero_extent_buffer(eb, end, eb->len - end); 3754 } else { 3755 /* 3756 * leaf: 3757 * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0 3758 */ 3759 start = btrfs_item_nr_offset(nritems); 3760 end = btrfs_leaf_data(eb) + leaf_data_end(fs_info, eb); 3761 memzero_extent_buffer(eb, start, end - start); 3762 } 3763 3764 for (i = 0; i < num_pages; i++) { 3765 struct page *p = eb->pages[i]; 3766 3767 clear_page_dirty_for_io(p); 3768 set_page_writeback(p); 3769 ret = submit_extent_page(REQ_OP_WRITE, write_flags, tree, wbc, 3770 p, offset >> 9, PAGE_SIZE, 0, bdev, 3771 &epd->bio, 3772 end_bio_extent_buffer_writepage, 3773 0, epd->bio_flags, bio_flags, false); 3774 epd->bio_flags = bio_flags; 3775 if (ret) { 3776 set_btree_ioerr(p); 3777 if (PageWriteback(p)) 3778 end_page_writeback(p); 3779 if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) 3780 end_extent_buffer_writeback(eb); 3781 ret = -EIO; 3782 break; 3783 } 3784 offset += PAGE_SIZE; 3785 update_nr_written(wbc, 1); 3786 unlock_page(p); 3787 } 3788 3789 if (unlikely(ret)) { 3790 for (; i < num_pages; i++) { 3791 struct page *p = eb->pages[i]; 3792 clear_page_dirty_for_io(p); 3793 unlock_page(p); 3794 } 3795 } 3796 3797 return ret; 3798 } 3799 3800 int btree_write_cache_pages(struct address_space *mapping, 3801 struct writeback_control *wbc) 3802 { 3803 struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; 3804 struct 
btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; 3805 struct extent_buffer *eb, *prev_eb = NULL; 3806 struct extent_page_data epd = { 3807 .bio = NULL, 3808 .tree = tree, 3809 .extent_locked = 0, 3810 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 3811 .bio_flags = 0, 3812 }; 3813 int ret = 0; 3814 int done = 0; 3815 int nr_to_write_done = 0; 3816 struct pagevec pvec; 3817 int nr_pages; 3818 pgoff_t index; 3819 pgoff_t end; /* Inclusive */ 3820 int scanned = 0; 3821 int tag; 3822 3823 pagevec_init(&pvec, 0); 3824 if (wbc->range_cyclic) { 3825 index = mapping->writeback_index; /* Start from prev offset */ 3826 end = -1; 3827 } else { 3828 index = wbc->range_start >> PAGE_SHIFT; 3829 end = wbc->range_end >> PAGE_SHIFT; 3830 scanned = 1; 3831 } 3832 if (wbc->sync_mode == WB_SYNC_ALL) 3833 tag = PAGECACHE_TAG_TOWRITE; 3834 else 3835 tag = PAGECACHE_TAG_DIRTY; 3836 retry: 3837 if (wbc->sync_mode == WB_SYNC_ALL) 3838 tag_pages_for_writeback(mapping, index, end); 3839 while (!done && !nr_to_write_done && (index <= end) && 3840 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 3841 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 3842 unsigned i; 3843 3844 scanned = 1; 3845 for (i = 0; i < nr_pages; i++) { 3846 struct page *page = pvec.pages[i]; 3847 3848 if (!PagePrivate(page)) 3849 continue; 3850 3851 if (!wbc->range_cyclic && page->index > end) { 3852 done = 1; 3853 break; 3854 } 3855 3856 spin_lock(&mapping->private_lock); 3857 if (!PagePrivate(page)) { 3858 spin_unlock(&mapping->private_lock); 3859 continue; 3860 } 3861 3862 eb = (struct extent_buffer *)page->private; 3863 3864 /* 3865 * Shouldn't happen and normally this would be a BUG_ON 3866 * but no sense in crashing the users box for something 3867 * we can survive anyway. 
3868 */ 3869 if (WARN_ON(!eb)) { 3870 spin_unlock(&mapping->private_lock); 3871 continue; 3872 } 3873 3874 if (eb == prev_eb) { 3875 spin_unlock(&mapping->private_lock); 3876 continue; 3877 } 3878 3879 ret = atomic_inc_not_zero(&eb->refs); 3880 spin_unlock(&mapping->private_lock); 3881 if (!ret) 3882 continue; 3883 3884 prev_eb = eb; 3885 ret = lock_extent_buffer_for_io(eb, fs_info, &epd); 3886 if (!ret) { 3887 free_extent_buffer(eb); 3888 continue; 3889 } 3890 3891 ret = write_one_eb(eb, fs_info, wbc, &epd); 3892 if (ret) { 3893 done = 1; 3894 free_extent_buffer(eb); 3895 break; 3896 } 3897 free_extent_buffer(eb); 3898 3899 /* 3900 * the filesystem may choose to bump up nr_to_write. 3901 * We have to make sure to honor the new nr_to_write 3902 * at any time 3903 */ 3904 nr_to_write_done = wbc->nr_to_write <= 0; 3905 } 3906 pagevec_release(&pvec); 3907 cond_resched(); 3908 } 3909 if (!scanned && !done) { 3910 /* 3911 * We hit the last page and there is more work to be done: wrap 3912 * back to the start of the file 3913 */ 3914 scanned = 1; 3915 index = 0; 3916 goto retry; 3917 } 3918 flush_write_bio(&epd); 3919 return ret; 3920 } 3921 3922 /** 3923 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. 3924 * @mapping: address space structure to write 3925 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 3926 * @writepage: function called for each page 3927 * @data: data passed to writepage function 3928 * 3929 * If a page is already under I/O, write_cache_pages() skips it, even 3930 * if it's dirty. This is desirable behaviour for memory-cleaning writeback, 3931 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() 3932 * and msync() need to guarantee that all the data which was dirty at the time 3933 * the call was made get new I/O started against them. 
If wbc->sync_mode is 3934 * WB_SYNC_ALL then we were called for data integrity and we must wait for 3935 * existing IO to complete. 3936 */ 3937 static int extent_write_cache_pages(struct address_space *mapping, 3938 struct writeback_control *wbc, 3939 writepage_t writepage, void *data, 3940 void (*flush_fn)(void *)) 3941 { 3942 struct inode *inode = mapping->host; 3943 int ret = 0; 3944 int done = 0; 3945 int nr_to_write_done = 0; 3946 struct pagevec pvec; 3947 int nr_pages; 3948 pgoff_t index; 3949 pgoff_t end; /* Inclusive */ 3950 pgoff_t done_index; 3951 int range_whole = 0; 3952 int scanned = 0; 3953 int tag; 3954 3955 /* 3956 * We have to hold onto the inode so that ordered extents can do their 3957 * work when the IO finishes. The alternative to this is failing to add 3958 * an ordered extent if the igrab() fails there and that is a huge pain 3959 * to deal with, so instead just hold onto the inode throughout the 3960 * writepages operation. If it fails here we are freeing up the inode 3961 * anyway and we'd rather not waste our time writing out stuff that is 3962 * going to be truncated anyway. 
3963 */ 3964 if (!igrab(inode)) 3965 return 0; 3966 3967 pagevec_init(&pvec, 0); 3968 if (wbc->range_cyclic) { 3969 index = mapping->writeback_index; /* Start from prev offset */ 3970 end = -1; 3971 } else { 3972 index = wbc->range_start >> PAGE_SHIFT; 3973 end = wbc->range_end >> PAGE_SHIFT; 3974 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 3975 range_whole = 1; 3976 scanned = 1; 3977 } 3978 if (wbc->sync_mode == WB_SYNC_ALL) 3979 tag = PAGECACHE_TAG_TOWRITE; 3980 else 3981 tag = PAGECACHE_TAG_DIRTY; 3982 retry: 3983 if (wbc->sync_mode == WB_SYNC_ALL) 3984 tag_pages_for_writeback(mapping, index, end); 3985 done_index = index; 3986 while (!done && !nr_to_write_done && (index <= end) && 3987 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 3988 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 3989 unsigned i; 3990 3991 scanned = 1; 3992 for (i = 0; i < nr_pages; i++) { 3993 struct page *page = pvec.pages[i]; 3994 3995 done_index = page->index; 3996 /* 3997 * At this point we hold neither mapping->tree_lock nor 3998 * lock on the page itself: the page may be truncated or 3999 * invalidated (changing page->mapping to NULL), or even 4000 * swizzled back from swapper_space to tmpfs file 4001 * mapping 4002 */ 4003 if (!trylock_page(page)) { 4004 flush_fn(data); 4005 lock_page(page); 4006 } 4007 4008 if (unlikely(page->mapping != mapping)) { 4009 unlock_page(page); 4010 continue; 4011 } 4012 4013 if (!wbc->range_cyclic && page->index > end) { 4014 done = 1; 4015 unlock_page(page); 4016 continue; 4017 } 4018 4019 if (wbc->sync_mode != WB_SYNC_NONE) { 4020 if (PageWriteback(page)) 4021 flush_fn(data); 4022 wait_on_page_writeback(page); 4023 } 4024 4025 if (PageWriteback(page) || 4026 !clear_page_dirty_for_io(page)) { 4027 unlock_page(page); 4028 continue; 4029 } 4030 4031 ret = (*writepage)(page, wbc, data); 4032 4033 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { 4034 unlock_page(page); 4035 ret = 0; 4036 } 4037 if (ret < 0) { 4038 /* 4039 * 
done_index is set past this page, 4040 * so media errors will not choke 4041 * background writeout for the entire 4042 * file. This has consequences for 4043 * range_cyclic semantics (ie. it may 4044 * not be suitable for data integrity 4045 * writeout). 4046 */ 4047 done_index = page->index + 1; 4048 done = 1; 4049 break; 4050 } 4051 4052 /* 4053 * the filesystem may choose to bump up nr_to_write. 4054 * We have to make sure to honor the new nr_to_write 4055 * at any time 4056 */ 4057 nr_to_write_done = wbc->nr_to_write <= 0; 4058 } 4059 pagevec_release(&pvec); 4060 cond_resched(); 4061 } 4062 if (!scanned && !done) { 4063 /* 4064 * We hit the last page and there is more work to be done: wrap 4065 * back to the start of the file 4066 */ 4067 scanned = 1; 4068 index = 0; 4069 goto retry; 4070 } 4071 4072 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) 4073 mapping->writeback_index = done_index; 4074 4075 btrfs_add_delayed_iput(inode); 4076 return ret; 4077 } 4078 4079 static void flush_epd_write_bio(struct extent_page_data *epd) 4080 { 4081 if (epd->bio) { 4082 int ret; 4083 4084 bio_set_op_attrs(epd->bio, REQ_OP_WRITE, 4085 epd->sync_io ? 
REQ_SYNC : 0); 4086 4087 ret = submit_one_bio(epd->bio, 0, epd->bio_flags); 4088 BUG_ON(ret < 0); /* -ENOMEM */ 4089 epd->bio = NULL; 4090 } 4091 } 4092 4093 static noinline void flush_write_bio(void *data) 4094 { 4095 struct extent_page_data *epd = data; 4096 flush_epd_write_bio(epd); 4097 } 4098 4099 int extent_write_full_page(struct extent_io_tree *tree, struct page *page, 4100 get_extent_t *get_extent, 4101 struct writeback_control *wbc) 4102 { 4103 int ret; 4104 struct extent_page_data epd = { 4105 .bio = NULL, 4106 .tree = tree, 4107 .get_extent = get_extent, 4108 .extent_locked = 0, 4109 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 4110 .bio_flags = 0, 4111 }; 4112 4113 ret = __extent_writepage(page, wbc, &epd); 4114 4115 flush_epd_write_bio(&epd); 4116 return ret; 4117 } 4118 4119 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, 4120 u64 start, u64 end, get_extent_t *get_extent, 4121 int mode) 4122 { 4123 int ret = 0; 4124 struct address_space *mapping = inode->i_mapping; 4125 struct page *page; 4126 unsigned long nr_pages = (end - start + PAGE_SIZE) >> 4127 PAGE_SHIFT; 4128 4129 struct extent_page_data epd = { 4130 .bio = NULL, 4131 .tree = tree, 4132 .get_extent = get_extent, 4133 .extent_locked = 1, 4134 .sync_io = mode == WB_SYNC_ALL, 4135 .bio_flags = 0, 4136 }; 4137 struct writeback_control wbc_writepages = { 4138 .sync_mode = mode, 4139 .nr_to_write = nr_pages * 2, 4140 .range_start = start, 4141 .range_end = end + 1, 4142 }; 4143 4144 while (start <= end) { 4145 page = find_get_page(mapping, start >> PAGE_SHIFT); 4146 if (clear_page_dirty_for_io(page)) 4147 ret = __extent_writepage(page, &wbc_writepages, &epd); 4148 else { 4149 if (tree->ops && tree->ops->writepage_end_io_hook) 4150 tree->ops->writepage_end_io_hook(page, start, 4151 start + PAGE_SIZE - 1, 4152 NULL, 1); 4153 unlock_page(page); 4154 } 4155 put_page(page); 4156 start += PAGE_SIZE; 4157 } 4158 4159 flush_epd_write_bio(&epd); 4160 return ret; 4161 } 4162 4163 
int extent_writepages(struct extent_io_tree *tree, 4164 struct address_space *mapping, 4165 get_extent_t *get_extent, 4166 struct writeback_control *wbc) 4167 { 4168 int ret = 0; 4169 struct extent_page_data epd = { 4170 .bio = NULL, 4171 .tree = tree, 4172 .get_extent = get_extent, 4173 .extent_locked = 0, 4174 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 4175 .bio_flags = 0, 4176 }; 4177 4178 ret = extent_write_cache_pages(mapping, wbc, __extent_writepage, &epd, 4179 flush_write_bio); 4180 flush_epd_write_bio(&epd); 4181 return ret; 4182 } 4183 4184 int extent_readpages(struct extent_io_tree *tree, 4185 struct address_space *mapping, 4186 struct list_head *pages, unsigned nr_pages, 4187 get_extent_t get_extent) 4188 { 4189 struct bio *bio = NULL; 4190 unsigned page_idx; 4191 unsigned long bio_flags = 0; 4192 struct page *pagepool[16]; 4193 struct page *page; 4194 struct extent_map *em_cached = NULL; 4195 int nr = 0; 4196 u64 prev_em_start = (u64)-1; 4197 4198 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 4199 page = list_entry(pages->prev, struct page, lru); 4200 4201 prefetchw(&page->flags); 4202 list_del(&page->lru); 4203 if (add_to_page_cache_lru(page, mapping, 4204 page->index, 4205 readahead_gfp_mask(mapping))) { 4206 put_page(page); 4207 continue; 4208 } 4209 4210 pagepool[nr++] = page; 4211 if (nr < ARRAY_SIZE(pagepool)) 4212 continue; 4213 __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, 4214 &bio, 0, &bio_flags, &prev_em_start); 4215 nr = 0; 4216 } 4217 if (nr) 4218 __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, 4219 &bio, 0, &bio_flags, &prev_em_start); 4220 4221 if (em_cached) 4222 free_extent_map(em_cached); 4223 4224 BUG_ON(!list_empty(pages)); 4225 if (bio) 4226 return submit_one_bio(bio, 0, bio_flags); 4227 return 0; 4228 } 4229 4230 /* 4231 * basic invalidatepage code, this waits on any locked or writeback 4232 * ranges corresponding to the page, and then deletes any extent state 4233 * records from the tree 4234 
*/ 4235 int extent_invalidatepage(struct extent_io_tree *tree, 4236 struct page *page, unsigned long offset) 4237 { 4238 struct extent_state *cached_state = NULL; 4239 u64 start = page_offset(page); 4240 u64 end = start + PAGE_SIZE - 1; 4241 size_t blocksize = page->mapping->host->i_sb->s_blocksize; 4242 4243 start += ALIGN(offset, blocksize); 4244 if (start > end) 4245 return 0; 4246 4247 lock_extent_bits(tree, start, end, &cached_state); 4248 wait_on_page_writeback(page); 4249 clear_extent_bit(tree, start, end, 4250 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 4251 EXTENT_DO_ACCOUNTING, 4252 1, 1, &cached_state, GFP_NOFS); 4253 return 0; 4254 } 4255 4256 /* 4257 * a helper for releasepage, this tests for areas of the page that 4258 * are locked or under IO and drops the related state bits if it is safe 4259 * to drop the page. 4260 */ 4261 static int try_release_extent_state(struct extent_map_tree *map, 4262 struct extent_io_tree *tree, 4263 struct page *page, gfp_t mask) 4264 { 4265 u64 start = page_offset(page); 4266 u64 end = start + PAGE_SIZE - 1; 4267 int ret = 1; 4268 4269 if (test_range_bit(tree, start, end, 4270 EXTENT_IOBITS, 0, NULL)) 4271 ret = 0; 4272 else { 4273 /* 4274 * at this point we can safely clear everything except the 4275 * locked bit and the nodatasum bit 4276 */ 4277 ret = clear_extent_bit(tree, start, end, 4278 ~(EXTENT_LOCKED | EXTENT_NODATASUM), 4279 0, 0, NULL, mask); 4280 4281 /* if clear_extent_bit failed for enomem reasons, 4282 * we can't allow the release to continue. 4283 */ 4284 if (ret < 0) 4285 ret = 0; 4286 else 4287 ret = 1; 4288 } 4289 return ret; 4290 } 4291 4292 /* 4293 * a helper for releasepage. 
As long as there are no locked extents 4294 * in the range corresponding to the page, both state records and extent 4295 * map records are removed 4296 */ 4297 int try_release_extent_mapping(struct extent_map_tree *map, 4298 struct extent_io_tree *tree, struct page *page, 4299 gfp_t mask) 4300 { 4301 struct extent_map *em; 4302 u64 start = page_offset(page); 4303 u64 end = start + PAGE_SIZE - 1; 4304 4305 if (gfpflags_allow_blocking(mask) && 4306 page->mapping->host->i_size > SZ_16M) { 4307 u64 len; 4308 while (start <= end) { 4309 len = end - start + 1; 4310 write_lock(&map->lock); 4311 em = lookup_extent_mapping(map, start, len); 4312 if (!em) { 4313 write_unlock(&map->lock); 4314 break; 4315 } 4316 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || 4317 em->start != start) { 4318 write_unlock(&map->lock); 4319 free_extent_map(em); 4320 break; 4321 } 4322 if (!test_range_bit(tree, em->start, 4323 extent_map_end(em) - 1, 4324 EXTENT_LOCKED | EXTENT_WRITEBACK, 4325 0, NULL)) { 4326 remove_extent_mapping(map, em); 4327 /* once for the rb tree */ 4328 free_extent_map(em); 4329 } 4330 start = extent_map_end(em); 4331 write_unlock(&map->lock); 4332 4333 /* once for us */ 4334 free_extent_map(em); 4335 } 4336 } 4337 return try_release_extent_state(map, tree, page, mask); 4338 } 4339 4340 /* 4341 * helper function for fiemap, which doesn't want to see any holes. 
4342 * This maps until we find something past 'last' 4343 */ 4344 static struct extent_map *get_extent_skip_holes(struct inode *inode, 4345 u64 offset, 4346 u64 last, 4347 get_extent_t *get_extent) 4348 { 4349 u64 sectorsize = btrfs_inode_sectorsize(inode); 4350 struct extent_map *em; 4351 u64 len; 4352 4353 if (offset >= last) 4354 return NULL; 4355 4356 while (1) { 4357 len = last - offset; 4358 if (len == 0) 4359 break; 4360 len = ALIGN(len, sectorsize); 4361 em = get_extent(BTRFS_I(inode), NULL, 0, offset, len, 0); 4362 if (IS_ERR_OR_NULL(em)) 4363 return em; 4364 4365 /* if this isn't a hole return it */ 4366 if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) && 4367 em->block_start != EXTENT_MAP_HOLE) { 4368 return em; 4369 } 4370 4371 /* this is a hole, advance to the next extent */ 4372 offset = extent_map_end(em); 4373 free_extent_map(em); 4374 if (offset >= last) 4375 break; 4376 } 4377 return NULL; 4378 } 4379 4380 /* 4381 * To cache previous fiemap extent 4382 * 4383 * Will be used for merging fiemap extent 4384 */ 4385 struct fiemap_cache { 4386 u64 offset; 4387 u64 phys; 4388 u64 len; 4389 u32 flags; 4390 bool cached; 4391 }; 4392 4393 /* 4394 * Helper to submit fiemap extent. 4395 * 4396 * Will try to merge current fiemap extent specified by @offset, @phys, 4397 * @len and @flags with cached one. 4398 * And only when we fails to merge, cached one will be submitted as 4399 * fiemap extent. 4400 * 4401 * Return value is the same as fiemap_fill_next_extent(). 4402 */ 4403 static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, 4404 struct fiemap_cache *cache, 4405 u64 offset, u64 phys, u64 len, u32 flags) 4406 { 4407 int ret = 0; 4408 4409 if (!cache->cached) 4410 goto assign; 4411 4412 /* 4413 * Sanity check, extent_fiemap() should have ensured that new 4414 * fiemap extent won't overlap with cahced one. 4415 * Not recoverable. 
4416 * 4417 * NOTE: Physical address can overlap, due to compression 4418 */ 4419 if (cache->offset + cache->len > offset) { 4420 WARN_ON(1); 4421 return -EINVAL; 4422 } 4423 4424 /* 4425 * Only merges fiemap extents if 4426 * 1) Their logical addresses are continuous 4427 * 4428 * 2) Their physical addresses are continuous 4429 * So truly compressed (physical size smaller than logical size) 4430 * extents won't get merged with each other 4431 * 4432 * 3) Share same flags except FIEMAP_EXTENT_LAST 4433 * So regular extent won't get merged with prealloc extent 4434 */ 4435 if (cache->offset + cache->len == offset && 4436 cache->phys + cache->len == phys && 4437 (cache->flags & ~FIEMAP_EXTENT_LAST) == 4438 (flags & ~FIEMAP_EXTENT_LAST)) { 4439 cache->len += len; 4440 cache->flags |= flags; 4441 goto try_submit_last; 4442 } 4443 4444 /* Not mergeable, need to submit cached one */ 4445 ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, 4446 cache->len, cache->flags); 4447 cache->cached = false; 4448 if (ret) 4449 return ret; 4450 assign: 4451 cache->cached = true; 4452 cache->offset = offset; 4453 cache->phys = phys; 4454 cache->len = len; 4455 cache->flags = flags; 4456 try_submit_last: 4457 if (cache->flags & FIEMAP_EXTENT_LAST) { 4458 ret = fiemap_fill_next_extent(fieinfo, cache->offset, 4459 cache->phys, cache->len, cache->flags); 4460 cache->cached = false; 4461 } 4462 return ret; 4463 } 4464 4465 /* 4466 * Sanity check for fiemap cache 4467 * 4468 * All fiemap cache should be submitted by emit_fiemap_extent() 4469 * Iteration should be terminated either by last fiemap extent or 4470 * fieinfo->fi_extents_max. 4471 * So no cached fiemap should exist. 
 */

/*
 * check_fiemap_cache - flush any extent still buffered in the fiemap cache
 * @fs_info: filesystem info, used only for the warning message
 * @fieinfo: fiemap request/result descriptor from the VFS
 * @cache:   per-call cache of the last not-yet-emitted extent
 *
 * The main fiemap loop is expected to have emitted the cached extent (with
 * FIEMAP_EXTENT_LAST set) before we get here, so a still-cached entry means
 * a case was missed above.  Recover by emitting it now, and warn so the
 * problem is visible to developers.
 *
 * Returns 0 on success (including "user buffer full"), negative errno if
 * copying the extent to user space failed.
 */
static int check_fiemap_cache(struct btrfs_fs_info *fs_info,
			      struct fiemap_extent_info *fieinfo,
			      struct fiemap_cache *cache)
{
	int ret;

	if (!cache->cached)
		return 0;

	/* Small and recoverable problem, only to info developer */
#ifdef CONFIG_BTRFS_DEBUG
	WARN_ON(1);
#endif
	btrfs_warn(fs_info,
		   "unhandled fiemap cache detected: offset=%llu phys=%llu len=%llu flags=0x%x",
		   cache->offset, cache->phys, cache->len, cache->flags);
	ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
				      cache->len, cache->flags);
	cache->cached = false;
	/* fiemap_fill_next_extent() returns 1 when the user buffer is full */
	if (ret > 0)
		ret = 0;
	return ret;
}

/*
 * extent_fiemap - implement the FIEMAP ioctl for btrfs
 * @inode:      inode whose extents are being mapped
 * @fieinfo:    fiemap request/result descriptor from the VFS
 * @start:      byte offset of the first byte to map
 * @len:        number of bytes to map
 * @get_extent: callback used to look up extent mappings
 *
 * Walks the extent mappings covering [start, start + len), translating each
 * into a fiemap extent with the appropriate flags (inline, delalloc, shared,
 * encoded, unwritten, last).  Extents pass through emit_fiemap_extent() and
 * @cache before being reported, and any leftover cached entry is flushed via
 * check_fiemap_cache() on the way out.
 *
 * Returns 0 on success, negative errno on failure.
 */
int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		  __u64 start, __u64 len, get_extent_t *get_extent)
{
	int ret = 0;
	u64 off = start;
	u64 max = start + len;
	u32 flags = 0;
	u32 found_type;
	u64 last;
	u64 last_for_get_extent = 0;
	u64 disko = 0;
	u64 isize = i_size_read(inode);
	struct btrfs_key found_key;
	struct extent_map *em = NULL;
	struct extent_state *cached_state = NULL;
	struct btrfs_path *path;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct fiemap_cache cache = { 0 };
	int end = 0;
	u64 em_start = 0;
	u64 em_len = 0;
	u64 em_end = 0;

	if (len == 0)
		return -EINVAL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->leave_spinning = 1;

	/* Align the range to the inode's sector size */
	start = round_down(start, btrfs_inode_sectorsize(inode));
	len = round_up(max, btrfs_inode_sectorsize(inode)) - start;

	/*
	 * lookup the last file extent.  We're not using i_size here
	 * because there might be preallocation past i_size
	 */
	ret = btrfs_lookup_file_extent(NULL, root, path,
			btrfs_ino(BTRFS_I(inode)), -1, 0);
	if (ret < 0) {
		btrfs_free_path(path);
		return ret;
	} else {
		/*
		 * Offset (u64)-1 can never match an existing key, so the
		 * search must land past the last extent item (ret > 0).
		 */
		WARN_ON(!ret);
		if (ret == 1)
			ret = 0;
	}

	/* Step back to the slot holding the last file extent item */
	path->slots[0]--;
	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
	found_type = found_key.type;

	/* No extents, but there might be delalloc bits */
	if (found_key.objectid != btrfs_ino(BTRFS_I(inode)) ||
	    found_type != BTRFS_EXTENT_DATA_KEY) {
		/* have to trust i_size as the end */
		last = (u64)-1;
		last_for_get_extent = isize;
	} else {
		/*
		 * remember the start of the last extent.  There are a
		 * bunch of different factors that go into the length of the
		 * extent, so its much less complex to remember where it started
		 */
		last = found_key.offset;
		last_for_get_extent = last + 1;
	}
	btrfs_release_path(path);

	/*
	 * we might have some extents allocated but more delalloc past those
	 * extents.  so, we trust isize unless the start of the last extent is
	 * beyond isize
	 */
	if (last < isize) {
		last = (u64)-1;
		last_for_get_extent = isize;
	}

	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1,
			 &cached_state);

	em = get_extent_skip_holes(inode, start, last_for_get_extent,
				   get_extent);
	if (!em)
		goto out;
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto out;
	}

	while (!end) {
		u64 offset_in_extent = 0;

		/* break if the extent we found is outside the range */
		if (em->start >= max || extent_map_end(em) < off)
			break;

		/*
		 * get_extent may return an extent that starts before our
		 * requested range.  We have to make sure the ranges
		 * we return to fiemap always move forward and don't
		 * overlap, so adjust the offsets here
		 */
		em_start = max(em->start, off);

		/*
		 * record the offset from the start of the extent
		 * for adjusting the disk offset below.  Only do this if the
		 * extent isn't compressed since our in ram offset may be past
		 * what we have actually allocated on disk.
		 */
		if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
			offset_in_extent = em_start - em->start;
		em_end = extent_map_end(em);
		em_len = em_end - em_start;
		disko = 0;
		flags = 0;

		/*
		 * bump off for our next call to get_extent
		 */
		off = extent_map_end(em);
		if (off >= max)
			end = 1;

		if (em->block_start == EXTENT_MAP_LAST_BYTE) {
			end = 1;
			flags |= FIEMAP_EXTENT_LAST;
		} else if (em->block_start == EXTENT_MAP_INLINE) {
			flags |= (FIEMAP_EXTENT_DATA_INLINE |
				  FIEMAP_EXTENT_NOT_ALIGNED);
		} else if (em->block_start == EXTENT_MAP_DELALLOC) {
			flags |= (FIEMAP_EXTENT_DELALLOC |
				  FIEMAP_EXTENT_UNKNOWN);
		} else if (fieinfo->fi_extents_max) {
			struct btrfs_trans_handle *trans;

			u64 bytenr = em->block_start -
				(em->start - em->orig_start);

			disko = em->block_start + offset_in_extent;

			/*
			 * We need a trans handle to get delayed refs
			 */
			trans = btrfs_join_transaction(root);
			/*
			 * It's OK if we can't start a trans we can still check
			 * from commit_root
			 */
			if (IS_ERR(trans))
				trans = NULL;

			/*
			 * As btrfs supports shared space, this information
			 * can be exported to userspace tools via
			 * flag FIEMAP_EXTENT_SHARED.  If fi_extents_max == 0
			 * then we're just getting a count and we can skip the
			 * lookup stuff.
			 */
			ret = btrfs_check_shared(trans, root->fs_info,
						 root->objectid,
						 btrfs_ino(BTRFS_I(inode)), bytenr);
			if (trans)
				btrfs_end_transaction(trans);
			if (ret < 0)
				goto out_free;
			if (ret)
				flags |= FIEMAP_EXTENT_SHARED;
			ret = 0;
		}
		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
			flags |= FIEMAP_EXTENT_ENCODED;
		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
			flags |= FIEMAP_EXTENT_UNWRITTEN;

		free_extent_map(em);
		em = NULL;
		if ((em_start >= last) || em_len == (u64)-1 ||
		   (last == (u64)-1 && isize <= em_end)) {
			flags |= FIEMAP_EXTENT_LAST;
			end = 1;
		}

		/* now scan forward to see if this is really the last extent. */
		em = get_extent_skip_holes(inode, off, last_for_get_extent,
					   get_extent);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out;
		}
		if (!em) {
			flags |= FIEMAP_EXTENT_LAST;
			end = 1;
		}
		ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko,
					 em_len, flags);
		if (ret) {
			/* ret == 1 means the user buffer is full; not an error */
			if (ret == 1)
				ret = 0;
			goto out_free;
		}
	}
out_free:
	if (!ret)
		ret = check_fiemap_cache(root->fs_info, fieinfo, &cache);
	free_extent_map(em);
out:
	btrfs_free_path(path);
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
			     &cached_state, GFP_NOFS);
	return ret;
}

/* Drop the eb from the leak-debug list and return it to the slab cache */
static void __free_extent_buffer(struct extent_buffer *eb)
{
	btrfs_leak_debug_del(&eb->leak_list);
	kmem_cache_free(extent_buffer_cache, eb);
}

/* Nonzero while the eb has in-flight reads or is dirty/under writeback */
int extent_buffer_under_io(struct extent_buffer *eb)
{
	return (atomic_read(&eb->io_pages) ||
		test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
		test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
}

/*
 * Helper for releasing extent buffer page.
 *
 * Walks the eb's pages in reverse order.  For each page still privately
 * attached to this eb it detaches page->private and drops the reference the
 * private attachment held, then drops the reference taken when the page was
 * allocated.  The eb must not be under I/O.
 */
static void btrfs_release_extent_buffer_page(struct extent_buffer *eb)
{
	unsigned long index;
	struct page *page;
	/* dummy (unmapped) ebs have no mapping, so skip private_lock for them */
	int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);

	BUG_ON(extent_buffer_under_io(eb));

	index = num_extent_pages(eb->start, eb->len);
	if (index == 0)
		return;

	do {
		index--;
		page = eb->pages[index];
		if (!page)
			continue;
		if (mapped)
			spin_lock(&page->mapping->private_lock);
		/*
		 * We do this since we'll remove the pages after we've
		 * removed the eb from the radix tree, so we could race
		 * and have this page now attached to the new eb.  So
		 * only clear page_private if it's still connected to
		 * this eb.
		 */
		if (PagePrivate(page) &&
		    page->private == (unsigned long)eb) {
			BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
			BUG_ON(PageDirty(page));
			BUG_ON(PageWriteback(page));
			/*
			 * We need to make sure we haven't been attached
			 * to a new eb.
			 */
			ClearPagePrivate(page);
			set_page_private(page, 0);
			/* One for the page private */
			put_page(page);
		}

		if (mapped)
			spin_unlock(&page->mapping->private_lock);

		/* One for when we allocated the page */
		put_page(page);
	} while (index != 0);
}

/*
 * Helper for releasing the extent buffer.
 * Releases all of the eb's pages and then frees the eb itself.
 */
static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
{
	btrfs_release_extent_buffer_page(eb);
	__free_extent_buffer(eb);
}

/*
 * Allocate and initialize a bare extent buffer (no pages attached yet).
 * Uses __GFP_NOFAIL, so the allocation itself cannot fail.
 */
static struct extent_buffer *
__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
		      unsigned long len)
{
	struct extent_buffer *eb = NULL;

	eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
	eb->start = start;
	eb->len = len;
	eb->fs_info = fs_info;
	eb->bflags = 0;
	rwlock_init(&eb->lock);
	atomic_set(&eb->write_locks, 0);
	atomic_set(&eb->read_locks, 0);
	atomic_set(&eb->blocking_readers, 0);
	atomic_set(&eb->blocking_writers, 0);
	atomic_set(&eb->spinning_readers, 0);
	atomic_set(&eb->spinning_writers, 0);
	eb->lock_nested = 0;
	init_waitqueue_head(&eb->write_lock_wq);
	init_waitqueue_head(&eb->read_lock_wq);

	btrfs_leak_debug_add(&eb->leak_list, &buffers);

	spin_lock_init(&eb->refs_lock);
	atomic_set(&eb->refs, 1);
	atomic_set(&eb->io_pages, 0);

	/*
	 * Sanity checks, currently the maximum is 64k covered by 16x 4k pages
	 */
	BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE
		> MAX_INLINE_EXTENT_BUFFER_SIZE);
	BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);

	return eb;
}

/*
 * Clone @src into a new unmapped (DUMMY) extent buffer backed by freshly
 * allocated pages containing a copy of src's contents.  Returns NULL if page
 * allocation fails.
 */
struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
{
	unsigned long i;
	struct page *p;
	struct extent_buffer *new;
	unsigned long num_pages = num_extent_pages(src->start, src->len);

	new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
	if (new == NULL)
		return NULL;

	for (i = 0; i < num_pages; i++) {
		p = alloc_page(GFP_NOFS);
		if (!p) {
			btrfs_release_extent_buffer(new);
			return NULL;
		}
		attach_extent_buffer_page(new, p);
		WARN_ON(PageDirty(p));
		SetPageUptodate(p);
		new->pages[i] = p;
		copy_page(page_address(p), page_address(src->pages[i]));
	}

	set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
	set_bit(EXTENT_BUFFER_DUMMY, &new->bflags);

	return new;
}

/*
 * Allocate an unmapped (DUMMY) extent buffer of @len bytes with zero nritems
 * in its header.  Returns NULL on page allocation failure.
 */
struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
						  u64 start, unsigned long len)
{
	struct extent_buffer *eb;
	unsigned long num_pages;
	unsigned long i;

	num_pages = num_extent_pages(start, len);

	eb = __alloc_extent_buffer(fs_info, start, len);
	if (!eb)
		return NULL;

	for (i = 0; i < num_pages; i++) {
		eb->pages[i] = alloc_page(GFP_NOFS);
		if (!eb->pages[i])
			goto err;
	}
	set_extent_buffer_uptodate(eb);
	btrfs_set_header_nritems(eb, 0);
	set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);

	return eb;
err:
	/* free only the pages allocated so far */
	for (; i > 0; i--)
		__free_page(eb->pages[i - 1]);
	__free_extent_buffer(eb);
	return NULL;
}

/* Convenience wrapper: dummy eb sized to the filesystem's nodesize */
struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
						u64 start)
{
	return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize);
}

static void check_buffer_tree_ref(struct extent_buffer *eb)
{
	int refs;
	/* the ref bit is tricky.  We have to make sure it is set
	 * if we have the buffer dirty.  Otherwise the
	 * code to free a buffer can end up dropping a dirty
	 * page
	 *
	 * Once the ref bit is set, it won't go away while the
	 * buffer is dirty or in writeback, and it also won't
	 * go away while we have the reference count on the
	 * eb bumped.
	 *
	 * We can't just set the ref bit without bumping the
	 * ref on the eb because free_extent_buffer might
	 * see the ref bit and try to clear it.  If this happens
	 * free_extent_buffer might end up dropping our original
	 * ref by mistake and freeing the page before we are able
	 * to add one more ref.
	 *
	 * So bump the ref count first, then set the bit.
If someone
	 * beat us to it, drop the ref we added.
	 */
	refs = atomic_read(&eb->refs);
	/* fast path: already holding a tree ref and at least one extra ref */
	if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
		return;

	spin_lock(&eb->refs_lock);
	if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
		atomic_inc(&eb->refs);
	spin_unlock(&eb->refs_lock);
}

/*
 * Mark every page of the eb accessed for page reclaim purposes, except
 * @accessed (which the caller already touched); also ensures the tree ref.
 */
static void mark_extent_buffer_accessed(struct extent_buffer *eb,
		struct page *accessed)
{
	unsigned long num_pages, i;

	check_buffer_tree_ref(eb);

	num_pages = num_extent_pages(eb->start, eb->len);
	for (i = 0; i < num_pages; i++) {
		struct page *p = eb->pages[i];

		if (p != accessed)
			mark_page_accessed(p);
	}
}

/*
 * Look up an extent buffer in the fs_info buffer radix tree under RCU and
 * take a reference on it.  Returns NULL when not found or already being
 * freed (refs hit zero).
 */
struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
					 u64 start)
{
	struct extent_buffer *eb;

	rcu_read_lock();
	eb = radix_tree_lookup(&fs_info->buffer_radix,
			       start >> PAGE_SHIFT);
	if (eb && atomic_inc_not_zero(&eb->refs)) {
		rcu_read_unlock();
		/*
		 * Lock our eb's refs_lock to avoid races with
		 * free_extent_buffer.  When we get our eb it might be flagged
		 * with EXTENT_BUFFER_STALE and another task running
		 * free_extent_buffer might have seen that flag set,
		 * eb->refs == 2, that the buffer isn't under IO (dirty and
		 * writeback flags not set) and it's still in the tree (flag
		 * EXTENT_BUFFER_TREE_REF set), therefore being in the process
		 * of decrementing the extent buffer's reference count twice.
		 * So here we could race and increment the eb's reference count,
		 * clear its stale flag, mark it as dirty and drop our reference
		 * before the other task finishes executing free_extent_buffer,
		 * which would later result in an attempt to free an extent
		 * buffer that is dirty.
 */
		if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
			/* serialize against a concurrent free_extent_buffer() */
			spin_lock(&eb->refs_lock);
			spin_unlock(&eb->refs_lock);
		}
		mark_extent_buffer_accessed(eb, NULL);
		return eb;
	}
	rcu_read_unlock();

	return NULL;
}

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
 * Test-only helper: return the extent buffer at @start, allocating a dummy
 * one and inserting it into the buffer radix tree if none exists yet.
 * Returns NULL on allocation failure, or the already-existing eb on a
 * radix-tree insertion race.
 */
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
					       u64 start)
{
	struct extent_buffer *eb, *exists = NULL;
	int ret;

	eb = find_extent_buffer(fs_info, start);
	if (eb)
		return eb;
	eb = alloc_dummy_extent_buffer(fs_info, start);
	if (!eb)
		return NULL;
	eb->fs_info = fs_info;
again:
	ret = radix_tree_preload(GFP_NOFS);
	if (ret)
		goto free_eb;
	spin_lock(&fs_info->buffer_lock);
	ret = radix_tree_insert(&fs_info->buffer_radix,
				start >> PAGE_SHIFT, eb);
	spin_unlock(&fs_info->buffer_lock);
	radix_tree_preload_end();
	if (ret == -EEXIST) {
		/* someone else inserted first; use theirs if still alive */
		exists = find_extent_buffer(fs_info, start);
		if (exists)
			goto free_eb;
		else
			goto again;
	}
	check_buffer_tree_ref(eb);
	set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);

	/*
	 * We will free dummy extent buffers if they come into
	 * free_extent_buffer with a ref count of 2, but if we are using this we
	 * want the buffers to stay in memory until we're done with them, so
	 * bump the ref count again.
 */
	atomic_inc(&eb->refs);
	return eb;
free_eb:
	btrfs_release_extent_buffer(eb);
	return exists;
}
#endif

/*
 * Return the extent buffer for the tree block at @start, creating it (and
 * its page-cache-backed pages) and inserting it into the buffer radix tree
 * if it does not exist yet.  Returns an ERR_PTR on misaligned @start,
 * allocation failure, or radix tree preload failure.
 */
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
					  u64 start)
{
	unsigned long len = fs_info->nodesize;
	unsigned long num_pages = num_extent_pages(start, len);
	unsigned long i;
	unsigned long index = start >> PAGE_SHIFT;
	struct extent_buffer *eb;
	struct extent_buffer *exists = NULL;
	struct page *p;
	struct address_space *mapping = fs_info->btree_inode->i_mapping;
	int uptodate = 1;
	int ret;

	if (!IS_ALIGNED(start, fs_info->sectorsize)) {
		btrfs_err(fs_info, "bad tree block start %llu", start);
		return ERR_PTR(-EINVAL);
	}

	eb = find_extent_buffer(fs_info, start);
	if (eb)
		return eb;

	eb = __alloc_extent_buffer(fs_info, start, len);
	if (!eb)
		return ERR_PTR(-ENOMEM);

	for (i = 0; i < num_pages; i++, index++) {
		p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
		if (!p) {
			exists = ERR_PTR(-ENOMEM);
			goto free_eb;
		}

		spin_lock(&mapping->private_lock);
		if (PagePrivate(p)) {
			/*
			 * We could have already allocated an eb for this page
			 * and attached one so lets see if we can get a ref on
			 * the existing eb, and if we can we know it's good and
			 * we can just return that one, else we know we can just
			 * overwrite page->private.
			 */
			exists = (struct extent_buffer *)p->private;
			if (atomic_inc_not_zero(&exists->refs)) {
				spin_unlock(&mapping->private_lock);
				unlock_page(p);
				put_page(p);
				mark_extent_buffer_accessed(exists, p);
				goto free_eb;
			}
			exists = NULL;

			/*
			 * Do this so attach doesn't complain and we need to
			 * drop the ref the old guy had.
 */
			ClearPagePrivate(p);
			WARN_ON(PageDirty(p));
			put_page(p);
		}
		attach_extent_buffer_page(eb, p);
		spin_unlock(&mapping->private_lock);
		WARN_ON(PageDirty(p));
		eb->pages[i] = p;
		if (!PageUptodate(p))
			uptodate = 0;

		/*
		 * see below about how we avoid a nasty race with release page
		 * and why we unlock later
		 */
	}
	if (uptodate)
		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
again:
	ret = radix_tree_preload(GFP_NOFS);
	if (ret) {
		exists = ERR_PTR(ret);
		goto free_eb;
	}

	spin_lock(&fs_info->buffer_lock);
	ret = radix_tree_insert(&fs_info->buffer_radix,
				start >> PAGE_SHIFT, eb);
	spin_unlock(&fs_info->buffer_lock);
	radix_tree_preload_end();
	if (ret == -EEXIST) {
		/* lost the race; take a ref on the winner's eb instead */
		exists = find_extent_buffer(fs_info, start);
		if (exists)
			goto free_eb;
		else
			goto again;
	}
	/* add one reference for the tree */
	check_buffer_tree_ref(eb);
	set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);

	/*
	 * there is a race where release page may have
	 * tried to find this extent buffer in the radix
	 * but failed.  It will tell the VM it is safe to
	 * reclaim the page, and it will clear the page private bit.
	 * We must make sure to set the page private bit properly
	 * after the extent buffer is in the radix tree so
	 * it doesn't get lost
	 */
	SetPageChecked(eb->pages[0]);
	for (i = 1; i < num_pages; i++) {
		p = eb->pages[i];
		ClearPageChecked(p);
		unlock_page(p);
	}
	unlock_page(eb->pages[0]);
	return eb;

free_eb:
	WARN_ON(!atomic_dec_and_test(&eb->refs));
	for (i = 0; i < num_pages; i++) {
		if (eb->pages[i])
			unlock_page(eb->pages[i]);
	}

	btrfs_release_extent_buffer(eb);
	return exists;
}

/* RCU callback: final free of an eb after its radix tree removal */
static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
{
	struct extent_buffer *eb =
			container_of(head, struct extent_buffer, rcu_head);

	__free_extent_buffer(eb);
}

/*
 * Drop one reference; if it was the last one, remove the eb from the radix
 * tree, release its pages and free it (via RCU for non-dummy ebs).
 * Returns 1 if the eb was freed, 0 otherwise.
 *
 * Expects to have eb->refs_lock already held (it is dropped here).
 */
static int release_extent_buffer(struct extent_buffer *eb)
{
	WARN_ON(atomic_read(&eb->refs) == 0);
	if (atomic_dec_and_test(&eb->refs)) {
		if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
			struct btrfs_fs_info *fs_info = eb->fs_info;

			spin_unlock(&eb->refs_lock);

			spin_lock(&fs_info->buffer_lock);
			radix_tree_delete(&fs_info->buffer_radix,
					  eb->start >> PAGE_SHIFT);
			spin_unlock(&fs_info->buffer_lock);
		} else {
			spin_unlock(&eb->refs_lock);
		}

		/* Should be safe to release our pages at this point */
		btrfs_release_extent_buffer_page(eb);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
		if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))) {
			__free_extent_buffer(eb);
			return 1;
		}
#endif
		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
		return 1;
	}
	spin_unlock(&eb->refs_lock);

	return 0;
}

/*
 * Drop a reference on @eb, possibly freeing it.  References above 3 are
 * dropped locklessly via cmpxchg; the last few are handled under refs_lock
 * so the TREE_REF/STALE/DUMMY bookkeeping stays consistent.
 */
void free_extent_buffer(struct extent_buffer *eb)
{
	int refs;
	int old;
	if (!eb)
		return;

	while (1) {
		refs = atomic_read(&eb->refs);
		if (refs <= 3)
			break;
		old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
		if (old == refs)
			return;
	}

	spin_lock(&eb->refs_lock);
	if (atomic_read(&eb->refs) == 2 &&
	    test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
		atomic_dec(&eb->refs);

	if (atomic_read(&eb->refs) == 2 &&
	    test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
	    !extent_buffer_under_io(eb) &&
	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
		atomic_dec(&eb->refs);

	/*
	 * I know this is terrible, but it's temporary until we stop tracking
	 * the uptodate bits and such for the extent buffers.
	 */
	release_extent_buffer(eb);
}

/* Mark @eb stale and drop a reference, releasing the tree ref if possible */
void free_extent_buffer_stale(struct extent_buffer *eb)
{
	if (!eb)
		return;

	spin_lock(&eb->refs_lock);
	set_bit(EXTENT_BUFFER_STALE, &eb->bflags);

	if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
		atomic_dec(&eb->refs);
	release_extent_buffer(eb);
}

/* Clear the dirty state on all of the eb's pages and their pagecache tags */
void clear_extent_buffer_dirty(struct extent_buffer *eb)
{
	unsigned long i;
	unsigned long num_pages;
	struct page *page;

	num_pages = num_extent_pages(eb->start, eb->len);

	for (i = 0; i < num_pages; i++) {
		page = eb->pages[i];
		if (!PageDirty(page))
			continue;

		lock_page(page);
		WARN_ON(!PagePrivate(page));

		clear_page_dirty_for_io(page);
		spin_lock_irq(&page->mapping->tree_lock);
		if (!PageDirty(page)) {
			radix_tree_tag_clear(&page->mapping->page_tree,
						page_index(page),
						PAGECACHE_TAG_DIRTY);
		}
		spin_unlock_irq(&page->mapping->tree_lock);
		ClearPageError(page);
		unlock_page(page);
	}
	WARN_ON(atomic_read(&eb->refs) == 0);
}

/*
 * Mark the eb and all its pages dirty.  Returns the previous value of the
 * eb's DIRTY bit (nonzero if it was already dirty).
 */
int set_extent_buffer_dirty(struct extent_buffer *eb)
{
	unsigned long i;
	unsigned long num_pages;
	int was_dirty = 0;

	check_buffer_tree_ref(eb);

	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);

	num_pages = num_extent_pages(eb->start, eb->len);
	WARN_ON(atomic_read(&eb->refs) == 0);
	WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));

	for (i = 0; i < num_pages; i++)
		set_page_dirty(eb->pages[i]);
	return was_dirty;
}

/* Clear the uptodate bit on the eb and each of its (present) pages */
void clear_extent_buffer_uptodate(struct extent_buffer *eb)
{
	unsigned long i;
	struct page *page;
	unsigned long num_pages;

	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
	num_pages = num_extent_pages(eb->start, eb->len);
	for (i = 0; i < num_pages; i++) {
		page = eb->pages[i];
		if (page)
			ClearPageUptodate(page);
	}
}

/* Set the uptodate bit on the eb and each of its pages */
void set_extent_buffer_uptodate(struct extent_buffer *eb)
{
	unsigned long i;
	struct page *page;
	unsigned long num_pages;

	set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
	num_pages = num_extent_pages(eb->start, eb->len);
	for (i = 0; i < num_pages; i++) {
		page = eb->pages[i];
		SetPageUptodate(page);
	}
}

int extent_buffer_uptodate(struct extent_buffer *eb)
{
	return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
}

/*
 * Read the eb's pages from disk as needed.  With WAIT_COMPLETE, waits for
 * the reads to finish and returns -EIO if any page failed; with WAIT_NONE,
 * bails out if any page is already locked.  Returns 0 on success.
 */
int read_extent_buffer_pages(struct extent_io_tree *tree,
			     struct extent_buffer *eb, int wait,
			     get_extent_t *get_extent, int mirror_num)
{
	unsigned long i;
	struct page *page;
	int err;
	int ret = 0;
	int locked_pages = 0;
	int all_uptodate = 1;
	unsigned long num_pages;
	unsigned long num_reads = 0;
	struct bio *bio = NULL;
	unsigned long bio_flags = 0;

	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
		return 0;

	num_pages = num_extent_pages(eb->start, eb->len);
	for (i = 0; i < num_pages; i++) {
		page = eb->pages[i];
		if (wait == WAIT_NONE) {
			if (!trylock_page(page))
				goto unlock_exit;
		} else {
			lock_page(page);
		}
		locked_pages++;
	}
	/*
	 * We need to firstly lock all pages to make sure that
	 * the uptodate bit of our pages won't be affected by
	 * clear_extent_buffer_uptodate().
	 */
	for (i = 0; i < num_pages; i++) {
		page = eb->pages[i];
		if (!PageUptodate(page)) {
			num_reads++;
			all_uptodate = 0;
		}
	}

	if (all_uptodate) {
		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
		goto unlock_exit;
	}

	clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
	eb->read_mirror = 0;
	atomic_set(&eb->io_pages, num_reads);
	for (i = 0; i < num_pages; i++) {
		page = eb->pages[i];

		if (!PageUptodate(page)) {
			if (ret) {
				/* a previous page already failed; skip I/O */
				atomic_dec(&eb->io_pages);
				unlock_page(page);
				continue;
			}

			ClearPageError(page);
			err = __extent_read_full_page(tree, page,
						      get_extent, &bio,
						      mirror_num, &bio_flags,
						      REQ_META);
			if (err) {
				ret = err;
				/*
				 * We use &bio in above __extent_read_full_page,
				 * so we ensure that if it returns error, the
				 * current page fails to add itself to bio and
				 * it's been unlocked.
				 *
				 * We must dec io_pages by ourselves.
				 */
				atomic_dec(&eb->io_pages);
			}
		} else {
			unlock_page(page);
		}
	}

	if (bio) {
		err = submit_one_bio(bio, mirror_num, bio_flags);
		if (err)
			return err;
	}

	if (ret || wait != WAIT_COMPLETE)
		return ret;

	/* WAIT_COMPLETE: block until every page read finishes */
	for (i = 0; i < num_pages; i++) {
		page = eb->pages[i];
		wait_on_page_locked(page);
		if (!PageUptodate(page))
			ret = -EIO;
	}

	return ret;

unlock_exit:
	while (locked_pages > 0) {
		locked_pages--;
		page = eb->pages[locked_pages];
		unlock_page(page);
	}
	return ret;
}

/*
 * Copy @len bytes starting at logical offset @start within the eb into the
 * kernel buffer @dstv, walking across page boundaries as needed.
 */
void read_extent_buffer(struct extent_buffer *eb, void *dstv,
			unsigned long start,
			unsigned long len)
{
	size_t cur;
	size_t offset;
	struct page *page;
	char *kaddr;
	char *dst = (char *)dstv;
	/* byte offset of the eb within its first page */
	size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
	unsigned long i = (start_offset + start) >> PAGE_SHIFT;

	WARN_ON(start > eb->len);
	WARN_ON(start + len > eb->start + eb->len);

	offset = (start_offset + start) & (PAGE_SIZE - 1);

	while (len > 0) {
		page = eb->pages[i];

		cur = min(len, (PAGE_SIZE - offset));
		kaddr = page_address(page);
		memcpy(dst, kaddr + offset, cur);

		dst += cur;
		len -= cur;
		offset = 0;
		i++;
	}
}

/*
 * Same as read_extent_buffer() but copies to a user-space buffer.
 * Returns 0 on success or -EFAULT if copy_to_user() faults.
 */
int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv,
			       unsigned long start,
			       unsigned long len)
{
	size_t cur;
	size_t offset;
	struct page *page;
	char *kaddr;
	char __user *dst = (char __user *)dstv;
	size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
	unsigned long i = (start_offset + start) >> PAGE_SHIFT;
	int ret = 0;

	WARN_ON(start > eb->len);
	WARN_ON(start + len > eb->start + eb->len);

	offset = (start_offset + start) & (PAGE_SIZE - 1);

	while (len > 0) {
		page = eb->pages[i];

		cur = min(len, (PAGE_SIZE - offset));
		kaddr = page_address(page);
		if (copy_to_user(dst, kaddr + offset, cur)) {
			ret = -EFAULT;
			break;
		}

		dst += cur;
		len -= cur;
		offset = 0;
		i++;
	}

	return ret;
}

/*
 * Map [start, start + min_len) of the eb to a direct kernel pointer.
 *
 * return 0 if the item is found within a page.
 * return 1 if the item spans two pages.
 * return -EINVAL otherwise.
 */
int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
			      unsigned long min_len, char **map,
			      unsigned long *map_start,
			      unsigned long *map_len)
{
	size_t offset = start & (PAGE_SIZE - 1);
	char *kaddr;
	struct page *p;
	size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
	unsigned long i = (start_offset + start) >> PAGE_SHIFT;
	unsigned long end_i = (start_offset + start + min_len - 1) >>
		PAGE_SHIFT;

	if (i != end_i)
		return 1;

	if (i == 0) {
		offset = start_offset;
		*map_start = 0;
	} else {
		offset = 0;
		*map_start = ((u64)i << PAGE_SHIFT) - start_offset;
	}

	if (start + min_len > eb->len) {
		WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n",
		       eb->start, eb->len, start, min_len);
		return -EINVAL;
	}

	p = eb->pages[i];
	kaddr = page_address(p);
	*map = kaddr + offset;
	*map_len = PAGE_SIZE - offset;
	return 0;
}

/*
 * memcmp() the eb contents at [start, start + len) against the kernel
 * buffer @ptrv; returns the usual <0 / 0 / >0 comparison result.
 */
int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
			 unsigned long start,
			 unsigned long len)
{
	size_t cur;
	size_t offset;
	struct page *page;
	char *kaddr;
	char *ptr = (char *)ptrv;
	size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
	unsigned long i = (start_offset + start) >> PAGE_SHIFT;
	int ret = 0;

	WARN_ON(start > eb->len);
	WARN_ON(start + len > eb->start + eb->len);

	offset = (start_offset + start) & (PAGE_SIZE - 1);

	while (len > 0) {
		page = eb->pages[i];

		cur = min(len, (PAGE_SIZE - offset));

		kaddr = page_address(page);
		ret = memcmp(ptr, kaddr + offset, cur);
		if (ret)
			break;

		ptr += cur;
		len -= cur;
		offset = 0;
		i++;
	}
	return ret;
}

/* Write the chunk tree UUID into the eb's btrfs_header (first page) */
void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb,
		const void *srcv)
{
	char *kaddr;

	WARN_ON(!PageUptodate(eb->pages[0]));
	kaddr = page_address(eb->pages[0]);
	memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv,
			BTRFS_FSID_SIZE);
}

/* Write the filesystem ID into the eb's btrfs_header (first page) */
void write_extent_buffer_fsid(struct extent_buffer *eb, const void *srcv)
{
	char *kaddr;

	WARN_ON(!PageUptodate(eb->pages[0]));
	kaddr = page_address(eb->pages[0]);
	memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv,
			BTRFS_FSID_SIZE);
}

/*
 * Copy @len bytes from the kernel buffer @srcv into the eb starting at
 * logical offset @start, crossing page boundaries as needed.
 */
void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
			 unsigned long start, unsigned long len)
{
	size_t cur;
	size_t offset;
	struct page *page;
	char *kaddr;
	char *src = (char *)srcv;
	size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
	unsigned long i = (start_offset + start) >> PAGE_SHIFT;

	WARN_ON(start > eb->len);
	WARN_ON(start + len > eb->start + eb->len);

	offset = (start_offset + start) & (PAGE_SIZE - 1);

	while (len > 0) {
		page = eb->pages[i];
		WARN_ON(!PageUptodate(page));

		cur = min(len, PAGE_SIZE - offset);
		kaddr = page_address(page);
		memcpy(kaddr + offset, src, cur);

		src += cur;
		len -= cur;
		offset = 0;
		i++;
	}
}

/* Zero @len bytes of the eb starting at logical offset @start */
void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start,
		unsigned long len)
{
	size_t cur;
	size_t offset;
	struct page *page;
	char *kaddr;
	size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
	unsigned long i = (start_offset + start) >> PAGE_SHIFT;

	WARN_ON(start > eb->len);
	WARN_ON(start + len > eb->start + eb->len);

	offset = (start_offset + start) & (PAGE_SIZE - 1);

	while (len > 0) {
		page = eb->pages[i];
		WARN_ON(!PageUptodate(page));

		cur = min(len, PAGE_SIZE - offset);
		kaddr = page_address(page);
		memset(kaddr + offset, 0, cur);

		len -= cur;
		offset = 0;
		i++;
	}
}

/* Copy the full contents of @src into @dst; both must be the same length */
void copy_extent_buffer_full(struct extent_buffer *dst,
			     struct extent_buffer *src)
{
	int i;
	unsigned num_pages;

	ASSERT(dst->len == src->len);

	num_pages = num_extent_pages(dst->start, dst->len);
	for (i = 0; i < num_pages; i++)
		copy_page(page_address(dst->pages[i]),
				page_address(src->pages[i]));
}

/*
 * Copy @len bytes from @src at @src_offset into @dst at @dst_offset,
 * handling page-boundary crossings in the destination.
 */
void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
			unsigned long dst_offset, unsigned long src_offset,
			unsigned long len)
{
	u64 dst_len = dst->len;
	size_t cur;
	size_t offset;
	struct page *page;
	char *kaddr;
	size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1);
	unsigned long i = (start_offset + dst_offset) >> PAGE_SHIFT;

	WARN_ON(src->len != dst_len);

	offset = (start_offset + dst_offset) &
		(PAGE_SIZE - 1);

	while (len > 0) {
		page = dst->pages[i];
		WARN_ON(!PageUptodate(page));

		cur = min(len, (unsigned long)(PAGE_SIZE - offset));

		kaddr = page_address(page);
		read_extent_buffer(src, kaddr + offset, src_offset, cur);

		src_offset += cur;
		len -= cur;
		offset = 0;
		i++;
	}
}

/*
 * Set @len bits starting at bit @start in a byte-granular little-endian
 * bitmap @map.
 */
void le_bitmap_set(u8 *map, unsigned int start, int len)
{
	u8 *p = map + BIT_BYTE(start);
	const unsigned int size = start + len;
	int bits_to_set = BITS_PER_BYTE - (start % BITS_PER_BYTE);
	u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(start);

	while (len - bits_to_set >= 0) {
		*p |= mask_to_set;
		len -= bits_to_set;
		bits_to_set = BITS_PER_BYTE;
		mask_to_set = ~0;
		p++;
	}
	if (len) {
		/* partial final byte */
		mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
		*p |= mask_to_set;
	}
}

/*
 * Clear @len bits starting at bit @start in a byte-granular little-endian
 * bitmap @map.
 */
void le_bitmap_clear(u8 *map, unsigned int start, int len)
{
	u8 *p = map + BIT_BYTE(start);
	const unsigned int size = start + len;
	int bits_to_clear = BITS_PER_BYTE - (start % BITS_PER_BYTE);
	u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(start);

	while (len - bits_to_clear >= 0) {
		*p &= ~mask_to_clear;
		len -= bits_to_clear;
		bits_to_clear = BITS_PER_BYTE;
		mask_to_clear = ~0;
		p++;
	}
	if (len) {
		/* partial final byte */
		mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
		*p &= ~mask_to_clear;
	}
}

/*
 * eb_bitmap_offset() - calculate the page and offset of the byte containing the
 * given bit number
 * @eb: the extent buffer
 * @start: offset of the bitmap item in the extent buffer
 * @nr: bit number
 * @page_index: return index of the page in the extent buffer that contains the
 *	given bit number
 * @page_offset: return offset into the page given by page_index
 *
 * This helper hides the ugliness of finding the byte in an extent buffer which
 * contains a given bit.
 */
static inline void eb_bitmap_offset(struct extent_buffer *eb,
				    unsigned long start, unsigned long nr,
				    unsigned long *page_index,
				    size_t *page_offset)
{
	size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
	size_t byte_offset = BIT_BYTE(nr);
	size_t offset;

	/*
	 * The byte we want is the offset of the extent buffer + the offset of
	 * the bitmap item in the extent buffer + the offset of the byte in the
	 * bitmap item.
5773 */ 5774 offset = start_offset + start + byte_offset; 5775 5776 *page_index = offset >> PAGE_SHIFT; 5777 *page_offset = offset & (PAGE_SIZE - 1); 5778 } 5779 5780 /** 5781 * extent_buffer_test_bit - determine whether a bit in a bitmap item is set 5782 * @eb: the extent buffer 5783 * @start: offset of the bitmap item in the extent buffer 5784 * @nr: bit number to test 5785 */ 5786 int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start, 5787 unsigned long nr) 5788 { 5789 u8 *kaddr; 5790 struct page *page; 5791 unsigned long i; 5792 size_t offset; 5793 5794 eb_bitmap_offset(eb, start, nr, &i, &offset); 5795 page = eb->pages[i]; 5796 WARN_ON(!PageUptodate(page)); 5797 kaddr = page_address(page); 5798 return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); 5799 } 5800 5801 /** 5802 * extent_buffer_bitmap_set - set an area of a bitmap 5803 * @eb: the extent buffer 5804 * @start: offset of the bitmap item in the extent buffer 5805 * @pos: bit number of the first bit 5806 * @len: number of bits to set 5807 */ 5808 void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, 5809 unsigned long pos, unsigned long len) 5810 { 5811 u8 *kaddr; 5812 struct page *page; 5813 unsigned long i; 5814 size_t offset; 5815 const unsigned int size = pos + len; 5816 int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE); 5817 u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos); 5818 5819 eb_bitmap_offset(eb, start, pos, &i, &offset); 5820 page = eb->pages[i]; 5821 WARN_ON(!PageUptodate(page)); 5822 kaddr = page_address(page); 5823 5824 while (len >= bits_to_set) { 5825 kaddr[offset] |= mask_to_set; 5826 len -= bits_to_set; 5827 bits_to_set = BITS_PER_BYTE; 5828 mask_to_set = ~0; 5829 if (++offset >= PAGE_SIZE && len > 0) { 5830 offset = 0; 5831 page = eb->pages[++i]; 5832 WARN_ON(!PageUptodate(page)); 5833 kaddr = page_address(page); 5834 } 5835 } 5836 if (len) { 5837 mask_to_set &= BITMAP_LAST_BYTE_MASK(size); 5838 kaddr[offset] |= mask_to_set; 5839 } 
5840 } 5841 5842 5843 /** 5844 * extent_buffer_bitmap_clear - clear an area of a bitmap 5845 * @eb: the extent buffer 5846 * @start: offset of the bitmap item in the extent buffer 5847 * @pos: bit number of the first bit 5848 * @len: number of bits to clear 5849 */ 5850 void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start, 5851 unsigned long pos, unsigned long len) 5852 { 5853 u8 *kaddr; 5854 struct page *page; 5855 unsigned long i; 5856 size_t offset; 5857 const unsigned int size = pos + len; 5858 int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE); 5859 u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos); 5860 5861 eb_bitmap_offset(eb, start, pos, &i, &offset); 5862 page = eb->pages[i]; 5863 WARN_ON(!PageUptodate(page)); 5864 kaddr = page_address(page); 5865 5866 while (len >= bits_to_clear) { 5867 kaddr[offset] &= ~mask_to_clear; 5868 len -= bits_to_clear; 5869 bits_to_clear = BITS_PER_BYTE; 5870 mask_to_clear = ~0; 5871 if (++offset >= PAGE_SIZE && len > 0) { 5872 offset = 0; 5873 page = eb->pages[++i]; 5874 WARN_ON(!PageUptodate(page)); 5875 kaddr = page_address(page); 5876 } 5877 } 5878 if (len) { 5879 mask_to_clear &= BITMAP_LAST_BYTE_MASK(size); 5880 kaddr[offset] &= ~mask_to_clear; 5881 } 5882 } 5883 5884 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) 5885 { 5886 unsigned long distance = (src > dst) ? 
src - dst : dst - src; 5887 return distance < len; 5888 } 5889 5890 static void copy_pages(struct page *dst_page, struct page *src_page, 5891 unsigned long dst_off, unsigned long src_off, 5892 unsigned long len) 5893 { 5894 char *dst_kaddr = page_address(dst_page); 5895 char *src_kaddr; 5896 int must_memmove = 0; 5897 5898 if (dst_page != src_page) { 5899 src_kaddr = page_address(src_page); 5900 } else { 5901 src_kaddr = dst_kaddr; 5902 if (areas_overlap(src_off, dst_off, len)) 5903 must_memmove = 1; 5904 } 5905 5906 if (must_memmove) 5907 memmove(dst_kaddr + dst_off, src_kaddr + src_off, len); 5908 else 5909 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); 5910 } 5911 5912 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 5913 unsigned long src_offset, unsigned long len) 5914 { 5915 struct btrfs_fs_info *fs_info = dst->fs_info; 5916 size_t cur; 5917 size_t dst_off_in_page; 5918 size_t src_off_in_page; 5919 size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); 5920 unsigned long dst_i; 5921 unsigned long src_i; 5922 5923 if (src_offset + len > dst->len) { 5924 btrfs_err(fs_info, 5925 "memmove bogus src_offset %lu move len %lu dst len %lu", 5926 src_offset, len, dst->len); 5927 BUG_ON(1); 5928 } 5929 if (dst_offset + len > dst->len) { 5930 btrfs_err(fs_info, 5931 "memmove bogus dst_offset %lu move len %lu dst len %lu", 5932 dst_offset, len, dst->len); 5933 BUG_ON(1); 5934 } 5935 5936 while (len > 0) { 5937 dst_off_in_page = (start_offset + dst_offset) & 5938 (PAGE_SIZE - 1); 5939 src_off_in_page = (start_offset + src_offset) & 5940 (PAGE_SIZE - 1); 5941 5942 dst_i = (start_offset + dst_offset) >> PAGE_SHIFT; 5943 src_i = (start_offset + src_offset) >> PAGE_SHIFT; 5944 5945 cur = min(len, (unsigned long)(PAGE_SIZE - 5946 src_off_in_page)); 5947 cur = min_t(unsigned long, cur, 5948 (unsigned long)(PAGE_SIZE - dst_off_in_page)); 5949 5950 copy_pages(dst->pages[dst_i], dst->pages[src_i], 5951 dst_off_in_page, src_off_in_page, cur); 
5952 5953 src_offset += cur; 5954 dst_offset += cur; 5955 len -= cur; 5956 } 5957 } 5958 5959 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 5960 unsigned long src_offset, unsigned long len) 5961 { 5962 struct btrfs_fs_info *fs_info = dst->fs_info; 5963 size_t cur; 5964 size_t dst_off_in_page; 5965 size_t src_off_in_page; 5966 unsigned long dst_end = dst_offset + len - 1; 5967 unsigned long src_end = src_offset + len - 1; 5968 size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); 5969 unsigned long dst_i; 5970 unsigned long src_i; 5971 5972 if (src_offset + len > dst->len) { 5973 btrfs_err(fs_info, 5974 "memmove bogus src_offset %lu move len %lu len %lu", 5975 src_offset, len, dst->len); 5976 BUG_ON(1); 5977 } 5978 if (dst_offset + len > dst->len) { 5979 btrfs_err(fs_info, 5980 "memmove bogus dst_offset %lu move len %lu len %lu", 5981 dst_offset, len, dst->len); 5982 BUG_ON(1); 5983 } 5984 if (dst_offset < src_offset) { 5985 memcpy_extent_buffer(dst, dst_offset, src_offset, len); 5986 return; 5987 } 5988 while (len > 0) { 5989 dst_i = (start_offset + dst_end) >> PAGE_SHIFT; 5990 src_i = (start_offset + src_end) >> PAGE_SHIFT; 5991 5992 dst_off_in_page = (start_offset + dst_end) & 5993 (PAGE_SIZE - 1); 5994 src_off_in_page = (start_offset + src_end) & 5995 (PAGE_SIZE - 1); 5996 5997 cur = min_t(unsigned long, len, src_off_in_page + 1); 5998 cur = min(cur, dst_off_in_page + 1); 5999 copy_pages(dst->pages[dst_i], dst->pages[src_i], 6000 dst_off_in_page - cur + 1, 6001 src_off_in_page - cur + 1, cur); 6002 6003 dst_end -= cur; 6004 src_end -= cur; 6005 len -= cur; 6006 } 6007 } 6008 6009 int try_release_extent_buffer(struct page *page) 6010 { 6011 struct extent_buffer *eb; 6012 6013 /* 6014 * We need to make sure nobody is attaching this page to an eb right 6015 * now. 
6016 */ 6017 spin_lock(&page->mapping->private_lock); 6018 if (!PagePrivate(page)) { 6019 spin_unlock(&page->mapping->private_lock); 6020 return 1; 6021 } 6022 6023 eb = (struct extent_buffer *)page->private; 6024 BUG_ON(!eb); 6025 6026 /* 6027 * This is a little awful but should be ok, we need to make sure that 6028 * the eb doesn't disappear out from under us while we're looking at 6029 * this page. 6030 */ 6031 spin_lock(&eb->refs_lock); 6032 if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { 6033 spin_unlock(&eb->refs_lock); 6034 spin_unlock(&page->mapping->private_lock); 6035 return 0; 6036 } 6037 spin_unlock(&page->mapping->private_lock); 6038 6039 /* 6040 * If tree ref isn't set then we know the ref on this eb is a real ref, 6041 * so just return, this page will likely be freed soon anyway. 6042 */ 6043 if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { 6044 spin_unlock(&eb->refs_lock); 6045 return 0; 6046 } 6047 6048 return release_extent_buffer(eb); 6049 } 6050