// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>
#include <linux/sched/mm.h>
#include "misc.h"
#include "ctree.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "extent_io.h"
#include "disk-io.h"
#include "compression.h"
#include "delalloc-space.h"

static struct kmem_cache *btrfs_ordered_extent_cache;

static u64 entry_end(struct btrfs_ordered_extent *entry)
{
	if (entry->file_offset + entry->num_bytes < entry->file_offset)
		return (u64)-1;
	return entry->file_offset + entry->num_bytes;
}

/* returns NULL if the insertion worked, or it returns the node it did find
 * in the tree
 */
static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
				   struct rb_node *node)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct btrfs_ordered_extent *entry;

	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);

		if (file_offset < entry->file_offset)
			p = &(*p)->rb_left;
		else if (file_offset >= entry_end(entry))
			p = &(*p)->rb_right;
		else
			return parent;
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

/*
 * look for a given offset in the tree, and if it can't be found return the
 * first lesser offset
 */
static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
				     struct rb_node **prev_ret)
{
	struct rb_node *n = root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *test;
	struct btrfs_ordered_extent *entry;
	struct btrfs_ordered_extent *prev_entry = NULL;

	while (n) {
		entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
		prev = n;
		prev_entry = entry;

		if (file_offset < entry->file_offset)
			n = n->rb_left;
		else if (file_offset >= entry_end(entry))
			n = n->rb_right;
		else
			return n;
	}
	if (!prev_ret)
		return NULL;

	while (prev && file_offset >= entry_end(prev_entry)) {
		test = rb_next(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		if (file_offset < entry_end(prev_entry))
			break;

		prev = test;
	}
	if (prev)
		prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
				      rb_node);
	while (prev && file_offset < entry_end(prev_entry)) {
		test = rb_prev(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		prev = test;
	}
	*prev_ret = prev;
	return NULL;
}

/*
 * helper to check if a given offset is inside a given entry
 */
static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
{
	if (file_offset < entry->file_offset ||
	    entry->file_offset + entry->num_bytes <= file_offset)
		return 0;
	return 1;
}

/* helper to check if the byte range [file_offset, file_offset + len) overlaps
 * a given entry
 */
static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
			  u64 len)
{
	if (file_offset + len <= entry->file_offset ||
	    entry->file_offset + entry->num_bytes <= file_offset)
		return 0;
	return 1;
}

/*
 * find the first ordered struct that has this offset, otherwise
 * the first one less than this offset
 */
static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
					  u64 file_offset)
{
	struct rb_root *root = &tree->tree;
	struct rb_node *prev = NULL;
	struct rb_node *ret;
	struct btrfs_ordered_extent *entry;

	if (tree->last) {
		entry = rb_entry(tree->last, struct btrfs_ordered_extent,
				 rb_node);
		if (offset_in_entry(entry, file_offset))
			return tree->last;
	}
	ret = __tree_search(root, file_offset, &prev);
	if (!ret)
		ret = prev;
	if (ret)
		tree->last = ret;
	return ret;
}

/* allocate and add a new ordered_extent into the per-inode tree.
 *
 * The tree is given a single reference on the ordered extent that was
 * inserted.
 */
static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
				      u64 disk_bytenr, u64 num_bytes,
				      u64 disk_num_bytes, int type, int dio,
				      int compress_type)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry;

	tree = &BTRFS_I(inode)->ordered_tree;
	entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
	if (!entry)
		return -ENOMEM;

	entry->file_offset = file_offset;
	entry->disk_bytenr = disk_bytenr;
	entry->num_bytes = num_bytes;
	entry->disk_num_bytes = disk_num_bytes;
	entry->bytes_left = num_bytes;
	entry->inode = igrab(inode);
	entry->compress_type = compress_type;
	entry->truncated_len = (u64)-1;
	if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
		set_bit(type, &entry->flags);

	if (dio) {
		percpu_counter_add_batch(&fs_info->dio_bytes, num_bytes,
					 fs_info->delalloc_batch);
		set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
	}

	/* one ref for the tree */
	refcount_set(&entry->refs, 1);
	init_waitqueue_head(&entry->wait);
	INIT_LIST_HEAD(&entry->list);
	INIT_LIST_HEAD(&entry->root_extent_list);
	INIT_LIST_HEAD(&entry->work_list);
	init_completion(&entry->completion);
	INIT_LIST_HEAD(&entry->log_list);
	INIT_LIST_HEAD(&entry->trans_list);

	trace_btrfs_ordered_extent_add(inode, entry);

	spin_lock_irq(&tree->lock);
	node = tree_insert(&tree->tree, file_offset,
			   &entry->rb_node);
	if (node)
		btrfs_panic(fs_info, -EEXIST,
				"inconsistency in ordered tree at offset %llu",
				file_offset);
	spin_unlock_irq(&tree->lock);

	spin_lock(&root->ordered_extent_lock);
	list_add_tail(&entry->root_extent_list,
		      &root->ordered_extents);
	root->nr_ordered_extents++;
	if (root->nr_ordered_extents == 1) {
		spin_lock(&fs_info->ordered_root_lock);
		BUG_ON(!list_empty(&root->ordered_root));
		list_add_tail(&root->ordered_root, &fs_info->ordered_roots);
		spin_unlock(&fs_info->ordered_root_lock);
	}
	spin_unlock(&root->ordered_extent_lock);

	/*
	 * We don't need the count_max_extents here, we can assume that all of
	 * that work has been done at higher layers, so this is truly the
	 * smallest the extent is going to get.
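	 *
	 * A single outstanding extent is added here and dropped again in
	 * btrfs_remove_ordered_extent().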
	 */
	spin_lock(&BTRFS_I(inode)->lock);
	btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
	spin_unlock(&BTRFS_I(inode)->lock);

	return 0;
}

int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
			     u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
			     int type)
{
	return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
					  num_bytes, disk_num_bytes, type, 0,
					  BTRFS_COMPRESS_NONE);
}

int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
				 u64 disk_bytenr, u64 num_bytes,
				 u64 disk_num_bytes, int type)
{
	return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
					  num_bytes, disk_num_bytes, type, 1,
					  BTRFS_COMPRESS_NONE);
}

int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
				      u64 disk_bytenr, u64 num_bytes,
				      u64 disk_num_bytes, int type,
				      int compress_type)
{
	return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
					  num_bytes, disk_num_bytes, type, 0,
					  compress_type);
}

/*
 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
 * when an ordered extent is finished. If the list covers more than one
 * ordered extent, it is split across multiples.
 */
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
			   struct btrfs_ordered_sum *sum)
{
	struct btrfs_ordered_inode_tree *tree;

	tree = &BTRFS_I(entry->inode)->ordered_tree;
	spin_lock_irq(&tree->lock);
	list_add_tail(&sum->list, &entry->list);
	spin_unlock_irq(&tree->lock);
}

/*
 * this is used to account for finished IO across a given range
 * of the file. The IO may span ordered extents. If
 * a given ordered_extent is completely done, 1 is returned, otherwise
 * 0.
 *
 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
 * to make sure this function only returns 1 once for a given ordered extent.
 *
 * file_offset is updated to one byte past the range that is recorded as
 * complete. This allows you to walk forward in the file.
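 *
 * If the ordered extent is completely finished and 'cached' is provided,
 * '*cached' is set to that ordered extent with an extra reference which the
 * caller must drop with btrfs_put_ordered_extent().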
 */
int btrfs_dec_test_first_ordered_pending(struct inode *inode,
				   struct btrfs_ordered_extent **cached,
				   u64 *file_offset, u64 io_size, int uptodate)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
	int ret;
	unsigned long flags;
	u64 dec_end;
	u64 dec_start;
	u64 to_dec;

	tree = &BTRFS_I(inode)->ordered_tree;
	spin_lock_irqsave(&tree->lock, flags);
	node = tree_search(tree, *file_offset);
	if (!node) {
		ret = 1;
		goto out;
	}

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	if (!offset_in_entry(entry, *file_offset)) {
		ret = 1;
		goto out;
	}

	dec_start = max(*file_offset, entry->file_offset);
	dec_end = min(*file_offset + io_size,
		      entry->file_offset + entry->num_bytes);
	*file_offset = dec_end;
	if (dec_start > dec_end) {
		btrfs_crit(fs_info, "bad ordering dec_start %llu end %llu",
			   dec_start, dec_end);
	}
	to_dec = dec_end - dec_start;
	if (to_dec > entry->bytes_left) {
		btrfs_crit(fs_info,
			   "bad ordered accounting left %llu size %llu",
			   entry->bytes_left, to_dec);
	}
	entry->bytes_left -= to_dec;
	if (!uptodate)
		set_bit(BTRFS_ORDERED_IOERR, &entry->flags);

	if (entry->bytes_left == 0) {
		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
		/* test_and_set_bit implies a barrier */
		cond_wake_up_nomb(&entry->wait);
	} else {
		ret = 1;
	}
out:
	if (!ret && cached && entry) {
		*cached = entry;
		refcount_inc(&entry->refs);
	}
	spin_unlock_irqrestore(&tree->lock, flags);
	return ret == 0;
}

/*
 * this is used to account for finished IO across a given range
 * of the file. The IO should not span ordered extents. If
 * a given ordered_extent is completely done, 1 is returned, otherwise
 * 0.
 *
 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
 * to make sure this function only returns 1 once for a given ordered extent.
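 *
 * If '*cached' already points to an ordered extent, that extent is used
 * directly instead of searching the tree.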
 */
int btrfs_dec_test_ordered_pending(struct inode *inode,
				   struct btrfs_ordered_extent **cached,
				   u64 file_offset, u64 io_size, int uptodate)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
	unsigned long flags;
	int ret;

	tree = &BTRFS_I(inode)->ordered_tree;
	spin_lock_irqsave(&tree->lock, flags);
	if (cached && *cached) {
		entry = *cached;
		goto have_entry;
	}

	node = tree_search(tree, file_offset);
	if (!node) {
		ret = 1;
		goto out;
	}

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
have_entry:
	if (!offset_in_entry(entry, file_offset)) {
		ret = 1;
		goto out;
	}

	if (io_size > entry->bytes_left) {
		btrfs_crit(BTRFS_I(inode)->root->fs_info,
			   "bad ordered accounting left %llu size %llu",
			   entry->bytes_left, io_size);
	}
	entry->bytes_left -= io_size;
	if (!uptodate)
		set_bit(BTRFS_ORDERED_IOERR, &entry->flags);

	if (entry->bytes_left == 0) {
		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
		/* test_and_set_bit implies a barrier */
		cond_wake_up_nomb(&entry->wait);
	} else {
		ret = 1;
	}
out:
	if (!ret && cached && entry) {
		*cached = entry;
		refcount_inc(&entry->refs);
	}
	spin_unlock_irqrestore(&tree->lock, flags);
	return ret == 0;
}

/*
 * used to drop a reference on an ordered extent. This will free
 * the extent if the last reference is dropped
 */
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
{
	struct list_head *cur;
	struct btrfs_ordered_sum *sum;

	trace_btrfs_ordered_extent_put(entry->inode, entry);

	if (refcount_dec_and_test(&entry->refs)) {
		ASSERT(list_empty(&entry->log_list));
		ASSERT(list_empty(&entry->trans_list));
		ASSERT(list_empty(&entry->root_extent_list));
		ASSERT(RB_EMPTY_NODE(&entry->rb_node));
		if (entry->inode)
			btrfs_add_delayed_iput(entry->inode);
		while (!list_empty(&entry->list)) {
			cur = entry->list.next;
			sum = list_entry(cur, struct btrfs_ordered_sum, list);
			list_del(&sum->list);
			kvfree(sum);
		}
		kmem_cache_free(btrfs_ordered_extent_cache, entry);
	}
}

/*
 * remove an ordered extent from the tree. No references are dropped
 * and waiters are woken up.
 */
void btrfs_remove_ordered_extent(struct inode *inode,
				 struct btrfs_ordered_extent *entry)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_ordered_inode_tree *tree;
	struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
	struct btrfs_root *root = btrfs_inode->root;
	struct rb_node *node;

	/* This is paired with btrfs_add_ordered_extent. */
	spin_lock(&btrfs_inode->lock);
	btrfs_mod_outstanding_extents(btrfs_inode, -1);
	spin_unlock(&btrfs_inode->lock);
	if (root != fs_info->tree_root)
		btrfs_delalloc_release_metadata(btrfs_inode, entry->num_bytes,
						false);

	if (test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
		percpu_counter_add_batch(&fs_info->dio_bytes, -entry->num_bytes,
					 fs_info->delalloc_batch);

	tree = &btrfs_inode->ordered_tree;
	spin_lock_irq(&tree->lock);
	node = &entry->rb_node;
	rb_erase(node, &tree->tree);
	RB_CLEAR_NODE(node);
	if (tree->last == node)
		tree->last = NULL;
	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
	spin_unlock_irq(&tree->lock);

	spin_lock(&root->ordered_extent_lock);
	list_del_init(&entry->root_extent_list);
	root->nr_ordered_extents--;

	trace_btrfs_ordered_extent_remove(inode, entry);

	if (!root->nr_ordered_extents) {
		spin_lock(&fs_info->ordered_root_lock);
		BUG_ON(list_empty(&root->ordered_root));
		list_del_init(&root->ordered_root);
		spin_unlock(&fs_info->ordered_root_lock);
	}
	spin_unlock(&root->ordered_extent_lock);
	wake_up(&entry->wait);
}

static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
{
	struct btrfs_ordered_extent *ordered;

	ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
	btrfs_start_ordered_extent(ordered->inode, ordered, 1);
	complete(&ordered->completion);
}

/*
 * wait for all the ordered extents in a root. This is done when balancing
 * space between drives.
 */
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
			       const u64 range_start, const u64 range_len)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	LIST_HEAD(splice);
	LIST_HEAD(skipped);
	LIST_HEAD(works);
	struct btrfs_ordered_extent *ordered, *next;
	u64 count = 0;
	const u64 range_end = range_start + range_len;

	mutex_lock(&root->ordered_extent_mutex);
	spin_lock(&root->ordered_extent_lock);
	list_splice_init(&root->ordered_extents, &splice);
	while (!list_empty(&splice) && nr) {
		ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
					   root_extent_list);

		if (range_end <= ordered->disk_bytenr ||
		    ordered->disk_bytenr + ordered->disk_num_bytes <= range_start) {
			list_move_tail(&ordered->root_extent_list, &skipped);
			cond_resched_lock(&root->ordered_extent_lock);
			continue;
		}

		list_move_tail(&ordered->root_extent_list,
			       &root->ordered_extents);
		refcount_inc(&ordered->refs);
		spin_unlock(&root->ordered_extent_lock);

		btrfs_init_work(&ordered->flush_work,
				btrfs_run_ordered_extent_work, NULL, NULL);
		list_add_tail(&ordered->work_list, &works);
		btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work);

		cond_resched();
		spin_lock(&root->ordered_extent_lock);
		if (nr != U64_MAX)
			nr--;
		count++;
	}
	list_splice_tail(&skipped, &root->ordered_extents);
	list_splice_tail(&splice, &root->ordered_extents);
	spin_unlock(&root->ordered_extent_lock);

	list_for_each_entry_safe(ordered, next, &works, work_list) {
		list_del_init(&ordered->work_list);
		wait_for_completion(&ordered->completion);
		btrfs_put_ordered_extent(ordered);
		cond_resched();
	}
	mutex_unlock(&root->ordered_extent_mutex);

	return count;
}

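/*
 * Wait for ordered extents in every root on the filesystem. At most 'nr'
 * ordered extents are processed in total (unless 'nr' is U64_MAX), and only
 * extents whose disk byte range overlaps [range_start, range_start + range_len)
 * are waited for.
 */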
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
			      const u64 range_start, const u64 range_len)
{
	struct btrfs_root *root;
	struct list_head splice;
	u64 done;

	INIT_LIST_HEAD(&splice);

	mutex_lock(&fs_info->ordered_operations_mutex);
	spin_lock(&fs_info->ordered_root_lock);
	list_splice_init(&fs_info->ordered_roots, &splice);
	while (!list_empty(&splice) && nr) {
		root = list_first_entry(&splice, struct btrfs_root,
					ordered_root);
		root = btrfs_grab_root(root);
		BUG_ON(!root);
		list_move_tail(&root->ordered_root,
			       &fs_info->ordered_roots);
		spin_unlock(&fs_info->ordered_root_lock);

		done = btrfs_wait_ordered_extents(root, nr,
						  range_start, range_len);
		btrfs_put_root(root);

		spin_lock(&fs_info->ordered_root_lock);
		if (nr != U64_MAX) {
			nr -= done;
		}
	}
	list_splice_tail(&splice, &fs_info->ordered_roots);
	spin_unlock(&fs_info->ordered_root_lock);
	mutex_unlock(&fs_info->ordered_operations_mutex);
}

/*
 * Used to start IO or wait for a given ordered extent to finish.
 *
 * If wait is one, this effectively waits on page writeback for all the pages
 * in the extent, and it waits on the io completion code to insert
 * metadata into the btree corresponding to the extent
 */
void btrfs_start_ordered_extent(struct inode *inode,
				struct btrfs_ordered_extent *entry,
				int wait)
{
	u64 start = entry->file_offset;
	u64 end = start + entry->num_bytes - 1;

	trace_btrfs_ordered_extent_start(inode, entry);

	/*
	 * pages in the range can be dirty, clean or writeback. We
	 * start IO on any dirty ones so the wait doesn't stall waiting
	 * for the flusher thread to find them
	 */
	if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
		filemap_fdatawrite_range(inode->i_mapping, start, end);
	if (wait) {
		wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
						 &entry->flags));
	}
}

/*
 * Used to wait on ordered extents across a large range of bytes.
 */
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
{
	int ret = 0;
	int ret_wb = 0;
	u64 end;
	u64 orig_end;
	struct btrfs_ordered_extent *ordered;

	if (start + len < start) {
		orig_end = INT_LIMIT(loff_t);
	} else {
		orig_end = start + len - 1;
		if (orig_end > INT_LIMIT(loff_t))
			orig_end = INT_LIMIT(loff_t);
	}

	/* start IO across the range first to instantiate any delalloc
	 * extents
	 */
	ret = btrfs_fdatawrite_range(inode, start, orig_end);
	if (ret)
		return ret;

	/*
	 * If we have a writeback error don't return immediately. Wait first
	 * for any ordered extents that haven't completed yet. This is to make
	 * sure no one can dirty the same page ranges and call writepages()
	 * before the ordered extents complete - to avoid failures (-EEXIST)
	 * when adding the new ordered extents to the ordered tree.
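	 *
	 * Any error from waiting on page writeback takes precedence over an
	 * ordered extent error in the return value.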
	 */
	ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end);

	end = orig_end;
	while (1) {
		ordered = btrfs_lookup_first_ordered_extent(inode, end);
		if (!ordered)
			break;
		if (ordered->file_offset > orig_end) {
			btrfs_put_ordered_extent(ordered);
			break;
		}
		if (ordered->file_offset + ordered->num_bytes <= start) {
			btrfs_put_ordered_extent(ordered);
			break;
		}
		btrfs_start_ordered_extent(inode, ordered, 1);
		end = ordered->file_offset;
		/*
		 * If the ordered extent had an error save the error but don't
		 * exit without waiting first for all other ordered extents in
		 * the range to complete.
		 */
		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
			ret = -EIO;
		btrfs_put_ordered_extent(ordered);
		if (end == 0 || end == start)
			break;
		end--;
	}
	return ret_wb ? ret_wb : ret;
}

/*
 * find an ordered extent corresponding to file_offset. return NULL if
 * nothing is found, otherwise take a reference on the extent and return it
 */
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
							 u64 file_offset)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &BTRFS_I(inode)->ordered_tree;
	spin_lock_irq(&tree->lock);
	node = tree_search(tree, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	if (!offset_in_entry(entry, file_offset))
		entry = NULL;
	if (entry)
		refcount_inc(&entry->refs);
out:
	spin_unlock_irq(&tree->lock);
	return entry;
}

/* Since the DIO code tries to lock a wide area we need to look for any ordered
 * extents that exist in the range, rather than just the start of the range.
 */
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
		struct btrfs_inode *inode, u64 file_offset, u64 len)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &inode->ordered_tree;
	spin_lock_irq(&tree->lock);
	node = tree_search(tree, file_offset);
	if (!node) {
		node = tree_search(tree, file_offset + len);
		if (!node)
			goto out;
	}

	while (1) {
		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
		if (range_overlaps(entry, file_offset, len))
			break;

		if (entry->file_offset >= file_offset + len) {
			entry = NULL;
			break;
		}
		entry = NULL;
		node = rb_next(node);
		if (!node)
			break;
	}
out:
	if (entry)
		refcount_inc(&entry->refs);
	spin_unlock_irq(&tree->lock);
	return entry;
}

/*
 * lookup and return any extent before 'file_offset'. NULL is returned
 * if none is found
 */
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &BTRFS_I(inode)->ordered_tree;
	spin_lock_irq(&tree->lock);
	node = tree_search(tree, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	refcount_inc(&entry->refs);
out:
	spin_unlock_irq(&tree->lock);
	return entry;
}

/*
 * search the ordered extents for one corresponding to 'offset' and
 * try to find a checksum. This is used because we allow pages to
 * be reclaimed before their checksum is actually put into the btree
 */
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
			   u8 *sum, int len)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_ordered_sum *ordered_sum;
	struct btrfs_ordered_extent *ordered;
	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
	unsigned long num_sectors;
	unsigned long i;
	u32 sectorsize = btrfs_inode_sectorsize(inode);
	const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
	int index = 0;

	ordered = btrfs_lookup_ordered_extent(inode, offset);
	if (!ordered)
		return 0;

	spin_lock_irq(&tree->lock);
	list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
		if (disk_bytenr >= ordered_sum->bytenr &&
		    disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
			i = (disk_bytenr - ordered_sum->bytenr) >>
			    inode->i_sb->s_blocksize_bits;
			num_sectors = ordered_sum->len >>
				      inode->i_sb->s_blocksize_bits;
			num_sectors = min_t(int, len - index, num_sectors - i);
			memcpy(sum + index, ordered_sum->sums + i * csum_size,
			       num_sectors * csum_size);

			index += (int)num_sectors * csum_size;
			if (index == len)
				goto out;
			disk_bytenr += num_sectors * sectorsize;
		}
	}
out:
	spin_unlock_irq(&tree->lock);
	btrfs_put_ordered_extent(ordered);
	return index;
}

/*
 * btrfs_lock_and_flush_ordered_range - Lock the passed range and ensure all
 * pending ordered extents in it are run to completion.
 *
 * @inode:        Inode whose ordered tree is to be searched
 * @start:        Beginning of range to flush
 * @end:          Last byte of range to lock
 * @cached_state: If passed, will return the extent state responsible for the
 *                locked range. It's the caller's responsibility to free the
 *                cached state.
 *
 * This function always returns with the given range locked, ensuring that
 * after it's called no ordered extent in the range can be pending.
 */
void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
					u64 end,
					struct extent_state **cached_state)
{
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cache = NULL;
	struct extent_state **cachedp = &cache;

	if (cached_state)
		cachedp = cached_state;

	while (1) {
		lock_extent_bits(&inode->io_tree, start, end, cachedp);
		ordered = btrfs_lookup_ordered_range(inode, start,
						     end - start + 1);
		if (!ordered) {
			/*
			 * If no external cached_state has been passed then
			 * decrement the extra ref taken for cachedp since we
			 * aren't exposing it outside of this function
			 */
			if (!cached_state)
				refcount_dec(&cache->refs);
			break;
		}
		unlock_extent_cached(&inode->io_tree, start, end, cachedp);
		btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);
		btrfs_put_ordered_extent(ordered);
	}
}

int __init ordered_data_init(void)
{
	btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
				     sizeof(struct btrfs_ordered_extent), 0,
				     SLAB_MEM_SPREAD,
				     NULL);
	if (!btrfs_ordered_extent_cache)
		return -ENOMEM;

	return 0;
}

void __cold ordered_data_exit(void)
{
	kmem_cache_destroy(btrfs_ordered_extent_cache);
}