// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>
#include <linux/sched/mm.h>
#include "misc.h"
#include "ctree.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "extent_io.h"
#include "disk-io.h"
#include "compression.h"
#include "delalloc-space.h"

static struct kmem_cache *btrfs_ordered_extent_cache;

static u64 entry_end(struct btrfs_ordered_extent *entry)
{
	if (entry->file_offset + entry->num_bytes < entry->file_offset)
		return (u64)-1;
	return entry->file_offset + entry->num_bytes;
}

/* returns NULL if the insertion worked, or it returns the node it did find
 * in the tree
 */
static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
				   struct rb_node *node)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct btrfs_ordered_extent *entry;

	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);

		if (file_offset < entry->file_offset)
			p = &(*p)->rb_left;
		else if (file_offset >= entry_end(entry))
			p = &(*p)->rb_right;
		else
			return parent;
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

/*
 * look for a given offset in the tree, and if it can't be found return the
 * first lesser offset
 */
static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
				     struct rb_node **prev_ret)
{
	struct rb_node *n = root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *test;
	struct btrfs_ordered_extent *entry;
	struct btrfs_ordered_extent *prev_entry = NULL;

	while (n) {
		entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
		prev = n;
		prev_entry = entry;

		if (file_offset < entry->file_offset)
			n = n->rb_left;
		else if (file_offset >= entry_end(entry))
			n = n->rb_right;
		else
			return n;
	}
	if (!prev_ret)
		return NULL;

	while (prev && file_offset >= entry_end(prev_entry)) {
		test = rb_next(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		if (file_offset < entry_end(prev_entry))
			break;

		prev = test;
	}
	if (prev)
		prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
				      rb_node);
	while (prev && file_offset < entry_end(prev_entry)) {
		test = rb_prev(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		prev = test;
	}
	*prev_ret = prev;
	return NULL;
}

/*
 * helper to check if a given offset is inside a given entry
 */
static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
{
	if (file_offset < entry->file_offset ||
	    entry->file_offset + entry->num_bytes <= file_offset)
		return 0;
	return 1;
}

static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
			  u64 len)
{
	if (file_offset + len <= entry->file_offset ||
	    entry->file_offset + entry->num_bytes <= file_offset)
		return 0;
	return 1;
}

/*
 * Look for the first ordered struct that has this offset, otherwise
 * the first one less than this offset
 */
static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
					  u64 file_offset)
{
	struct rb_root *root = &tree->tree;
	struct rb_node *prev = NULL;
	struct rb_node *ret;
	struct btrfs_ordered_extent *entry;

	if (tree->last) {
		entry = rb_entry(tree->last, struct btrfs_ordered_extent,
				 rb_node);
		if (offset_in_entry(entry, file_offset))
			return tree->last;
	}
	ret = __tree_search(root, file_offset, &prev);
	if (!ret)
		ret = prev;
	if (ret)
		tree->last = ret;
	return ret;
}

/* allocate and add a new ordered_extent into the per-inode tree.
 *
 * The tree is given a single reference on the ordered extent that was
 * inserted.
 */
static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
				      u64 disk_bytenr, u64 num_bytes,
				      u64 disk_num_bytes, int type, int dio,
				      int compress_type)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry;

	tree = &BTRFS_I(inode)->ordered_tree;
	entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
	if (!entry)
		return -ENOMEM;

	entry->file_offset = file_offset;
	entry->disk_bytenr = disk_bytenr;
	entry->num_bytes = num_bytes;
	entry->disk_num_bytes = disk_num_bytes;
	entry->bytes_left = num_bytes;
	entry->inode = igrab(inode);
	entry->compress_type = compress_type;
	entry->truncated_len = (u64)-1;
	if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
		set_bit(type, &entry->flags);

	if (dio) {
		percpu_counter_add_batch(&fs_info->dio_bytes, num_bytes,
					 fs_info->delalloc_batch);
		set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
	}

	/* one ref for the tree */
	refcount_set(&entry->refs, 1);
	init_waitqueue_head(&entry->wait);
	INIT_LIST_HEAD(&entry->list);
	INIT_LIST_HEAD(&entry->root_extent_list);
	INIT_LIST_HEAD(&entry->work_list);
	init_completion(&entry->completion);
	INIT_LIST_HEAD(&entry->log_list);
	INIT_LIST_HEAD(&entry->trans_list);

	trace_btrfs_ordered_extent_add(inode, entry);

	spin_lock_irq(&tree->lock);
	node = tree_insert(&tree->tree, file_offset,
			   &entry->rb_node);
	if (node)
		btrfs_panic(fs_info, -EEXIST,
			    "inconsistency in ordered tree at offset %llu",
			    file_offset);
	spin_unlock_irq(&tree->lock);

	spin_lock(&root->ordered_extent_lock);
	list_add_tail(&entry->root_extent_list,
		      &root->ordered_extents);
	root->nr_ordered_extents++;
	if (root->nr_ordered_extents == 1) {
		spin_lock(&fs_info->ordered_root_lock);
		BUG_ON(!list_empty(&root->ordered_root));
		list_add_tail(&root->ordered_root, &fs_info->ordered_roots);
		spin_unlock(&fs_info->ordered_root_lock);
	}
	spin_unlock(&root->ordered_extent_lock);

	/*
	 * We don't need the count_max_extents here, we can assume that all of
	 * that work has been done at higher layers, so this is truly the
	 * smallest the extent is going to get.
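	 *
	 * The outstanding extent taken just below is paired with the matching
	 * decrement in btrfs_remove_ordered_extent() once the ordered extent
	 * is removed from the tree.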
	 */
	spin_lock(&BTRFS_I(inode)->lock);
	btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
	spin_unlock(&BTRFS_I(inode)->lock);

	return 0;
}

int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
			     u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
			     int type)
{
	return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
					  num_bytes, disk_num_bytes, type, 0,
					  BTRFS_COMPRESS_NONE);
}

int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
				 u64 disk_bytenr, u64 num_bytes,
				 u64 disk_num_bytes, int type)
{
	return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
					  num_bytes, disk_num_bytes, type, 1,
					  BTRFS_COMPRESS_NONE);
}

int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
				      u64 disk_bytenr, u64 num_bytes,
				      u64 disk_num_bytes, int type,
				      int compress_type)
{
	return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
					  num_bytes, disk_num_bytes, type, 0,
					  compress_type);
}

/*
 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
 * when an ordered extent is finished. If the list covers more than one
 * ordered extent, it is split across multiple ordered extents.
 */
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
			   struct btrfs_ordered_sum *sum)
{
	struct btrfs_ordered_inode_tree *tree;

	tree = &BTRFS_I(entry->inode)->ordered_tree;
	spin_lock_irq(&tree->lock);
	list_add_tail(&sum->list, &entry->list);
	spin_unlock_irq(&tree->lock);
}

/*
 * this is used to account for finished IO across a given range
 * of the file. The IO may span ordered extents. If
 * a given ordered_extent is completely done, 1 is returned, otherwise
 * 0.
 *
 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
 * to make sure this function only returns 1 once for a given ordered extent.
 *
 * file_offset is updated to one byte past the range that is recorded as
 * complete. This allows you to walk forward in the file.
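 *
 * A caller completing IO over a range that may span several ordered extents
 * could, for instance, loop roughly like this (hypothetical sketch; 'start',
 * 'len' and 'uptodate' come from the caller's context, and the actual
 * completion work is left out):
 *
 *	u64 cursor = start;
 *	u64 prev;
 *
 *	while (cursor < start + len) {
 *		struct btrfs_ordered_extent *ordered = NULL;
 *
 *		prev = cursor;
 *		if (btrfs_dec_test_first_ordered_pending(inode, &ordered,
 *				&cursor, start + len - cursor, uptodate)) {
 *			... finish 'ordered' (it carries an extra reference) ...
 *			btrfs_put_ordered_extent(ordered);
 *		}
 *		if (cursor == prev)
 *			break;
 *	}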
 */
int btrfs_dec_test_first_ordered_pending(struct inode *inode,
				   struct btrfs_ordered_extent **cached,
				   u64 *file_offset, u64 io_size, int uptodate)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
	int ret;
	unsigned long flags;
	u64 dec_end;
	u64 dec_start;
	u64 to_dec;

	tree = &BTRFS_I(inode)->ordered_tree;
	spin_lock_irqsave(&tree->lock, flags);
	node = tree_search(tree, *file_offset);
	if (!node) {
		ret = 1;
		goto out;
	}

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	if (!offset_in_entry(entry, *file_offset)) {
		ret = 1;
		goto out;
	}

	dec_start = max(*file_offset, entry->file_offset);
	dec_end = min(*file_offset + io_size,
		      entry->file_offset + entry->num_bytes);
	*file_offset = dec_end;
	if (dec_start > dec_end) {
		btrfs_crit(fs_info, "bad ordering dec_start %llu end %llu",
			   dec_start, dec_end);
	}
	to_dec = dec_end - dec_start;
	if (to_dec > entry->bytes_left) {
		btrfs_crit(fs_info,
			   "bad ordered accounting left %llu size %llu",
			   entry->bytes_left, to_dec);
	}
	entry->bytes_left -= to_dec;
	if (!uptodate)
		set_bit(BTRFS_ORDERED_IOERR, &entry->flags);

	if (entry->bytes_left == 0) {
		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
		/* test_and_set_bit implies a barrier */
		cond_wake_up_nomb(&entry->wait);
	} else {
		ret = 1;
	}
out:
	if (!ret && cached && entry) {
		*cached = entry;
		refcount_inc(&entry->refs);
	}
	spin_unlock_irqrestore(&tree->lock, flags);
	return ret == 0;
}

/*
 * this is used to account for finished IO across a given range
 * of the file. The IO should not span ordered extents. If
 * a given ordered_extent is completely done, 1 is returned, otherwise
 * 0.
 *
 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
 * to make sure this function only returns 1 once for a given ordered extent.
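 *
 * If the caller already holds a reference to the ordered extent it can pass
 * it in through 'cached' to skip the tree search. When the function returns
 * 1, the entry is handed back in 'cached' with an extra reference that the
 * caller must drop with btrfs_put_ordered_extent().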
 */
int btrfs_dec_test_ordered_pending(struct inode *inode,
				   struct btrfs_ordered_extent **cached,
				   u64 file_offset, u64 io_size, int uptodate)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
	unsigned long flags;
	int ret;

	tree = &BTRFS_I(inode)->ordered_tree;
	spin_lock_irqsave(&tree->lock, flags);
	if (cached && *cached) {
		entry = *cached;
		goto have_entry;
	}

	node = tree_search(tree, file_offset);
	if (!node) {
		ret = 1;
		goto out;
	}

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
have_entry:
	if (!offset_in_entry(entry, file_offset)) {
		ret = 1;
		goto out;
	}

	if (io_size > entry->bytes_left) {
		btrfs_crit(BTRFS_I(inode)->root->fs_info,
			   "bad ordered accounting left %llu size %llu",
			   entry->bytes_left, io_size);
	}
	entry->bytes_left -= io_size;
	if (!uptodate)
		set_bit(BTRFS_ORDERED_IOERR, &entry->flags);

	if (entry->bytes_left == 0) {
		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
		/* test_and_set_bit implies a barrier */
		cond_wake_up_nomb(&entry->wait);
	} else {
		ret = 1;
	}
out:
	if (!ret && cached && entry) {
		*cached = entry;
		refcount_inc(&entry->refs);
	}
	spin_unlock_irqrestore(&tree->lock, flags);
	return ret == 0;
}

/*
 * used to drop a reference on an ordered extent. This will free
 * the extent if the last reference is dropped
 */
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
{
	struct list_head *cur;
	struct btrfs_ordered_sum *sum;

	trace_btrfs_ordered_extent_put(entry->inode, entry);

	if (refcount_dec_and_test(&entry->refs)) {
		ASSERT(list_empty(&entry->log_list));
		ASSERT(list_empty(&entry->trans_list));
		ASSERT(list_empty(&entry->root_extent_list));
		ASSERT(RB_EMPTY_NODE(&entry->rb_node));
		if (entry->inode)
			btrfs_add_delayed_iput(entry->inode);
		while (!list_empty(&entry->list)) {
			cur = entry->list.next;
			sum = list_entry(cur, struct btrfs_ordered_sum, list);
			list_del(&sum->list);
			kvfree(sum);
		}
		kmem_cache_free(btrfs_ordered_extent_cache, entry);
	}
}

/*
 * remove an ordered extent from the tree. No references are dropped, but any
 * waiters are woken up.
 */
void btrfs_remove_ordered_extent(struct inode *inode,
				 struct btrfs_ordered_extent *entry)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_ordered_inode_tree *tree;
	struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
	struct btrfs_root *root = btrfs_inode->root;
	struct rb_node *node;

	/*
	 * This is paired with btrfs_add_ordered_extent.
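	 * The outstanding extent added when the ordered extent was created is
	 * dropped here, and for inodes other than the tree root the delalloc
	 * metadata reservation for the range is released as well.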
	 */
	spin_lock(&btrfs_inode->lock);
	btrfs_mod_outstanding_extents(btrfs_inode, -1);
	spin_unlock(&btrfs_inode->lock);
	if (root != fs_info->tree_root)
		btrfs_delalloc_release_metadata(btrfs_inode, entry->num_bytes,
						false);

	if (test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
		percpu_counter_add_batch(&fs_info->dio_bytes, -entry->num_bytes,
					 fs_info->delalloc_batch);

	tree = &btrfs_inode->ordered_tree;
	spin_lock_irq(&tree->lock);
	node = &entry->rb_node;
	rb_erase(node, &tree->tree);
	RB_CLEAR_NODE(node);
	if (tree->last == node)
		tree->last = NULL;
	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
	spin_unlock_irq(&tree->lock);

	spin_lock(&root->ordered_extent_lock);
	list_del_init(&entry->root_extent_list);
	root->nr_ordered_extents--;

	trace_btrfs_ordered_extent_remove(inode, entry);

	if (!root->nr_ordered_extents) {
		spin_lock(&fs_info->ordered_root_lock);
		BUG_ON(list_empty(&root->ordered_root));
		list_del_init(&root->ordered_root);
		spin_unlock(&fs_info->ordered_root_lock);
	}
	spin_unlock(&root->ordered_extent_lock);
	wake_up(&entry->wait);
}

static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
{
	struct btrfs_ordered_extent *ordered;

	ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
	btrfs_start_ordered_extent(ordered->inode, ordered, 1);
	complete(&ordered->completion);
}

/*
 * wait for all the ordered extents in a root. This is done when balancing
 * space between drives.
 */
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
			       const u64 range_start, const u64 range_len)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	LIST_HEAD(splice);
	LIST_HEAD(skipped);
	LIST_HEAD(works);
	struct btrfs_ordered_extent *ordered, *next;
	u64 count = 0;
	const u64 range_end = range_start + range_len;

	mutex_lock(&root->ordered_extent_mutex);
	spin_lock(&root->ordered_extent_lock);
	list_splice_init(&root->ordered_extents, &splice);
	while (!list_empty(&splice) && nr) {
		ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
					   root_extent_list);

		if (range_end <= ordered->disk_bytenr ||
		    ordered->disk_bytenr + ordered->disk_num_bytes <= range_start) {
			list_move_tail(&ordered->root_extent_list, &skipped);
			cond_resched_lock(&root->ordered_extent_lock);
			continue;
		}

		list_move_tail(&ordered->root_extent_list,
			       &root->ordered_extents);
		refcount_inc(&ordered->refs);
		spin_unlock(&root->ordered_extent_lock);

		btrfs_init_work(&ordered->flush_work,
				btrfs_run_ordered_extent_work, NULL, NULL);
		list_add_tail(&ordered->work_list, &works);
		btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work);

		cond_resched();
		spin_lock(&root->ordered_extent_lock);
		if (nr != U64_MAX)
			nr--;
		count++;
	}
	list_splice_tail(&skipped, &root->ordered_extents);
	list_splice_tail(&splice, &root->ordered_extents);
	spin_unlock(&root->ordered_extent_lock);

	list_for_each_entry_safe(ordered, next, &works, work_list) {
		list_del_init(&ordered->work_list);
		wait_for_completion(&ordered->completion);
		btrfs_put_ordered_extent(ordered);
		cond_resched();
	}
	mutex_unlock(&root->ordered_extent_mutex);

	return count;
}

void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
			      const u64 range_start, const u64 range_len)
{
	struct btrfs_root *root;
	struct list_head splice;
	u64 done;

	INIT_LIST_HEAD(&splice);

	mutex_lock(&fs_info->ordered_operations_mutex);
	spin_lock(&fs_info->ordered_root_lock);
	list_splice_init(&fs_info->ordered_roots, &splice);
	while (!list_empty(&splice) && nr) {
		root = list_first_entry(&splice, struct btrfs_root,
					ordered_root);
		root = btrfs_grab_fs_root(root);
		BUG_ON(!root);
		list_move_tail(&root->ordered_root,
			       &fs_info->ordered_roots);
		spin_unlock(&fs_info->ordered_root_lock);

		done = btrfs_wait_ordered_extents(root, nr,
						  range_start, range_len);
		btrfs_put_fs_root(root);

		spin_lock(&fs_info->ordered_root_lock);
		if (nr != U64_MAX) {
			nr -= done;
		}
	}
	list_splice_tail(&splice, &fs_info->ordered_roots);
	spin_unlock(&fs_info->ordered_root_lock);
	mutex_unlock(&fs_info->ordered_operations_mutex);
}

/*
 * Used to start IO or wait for a given ordered extent to finish.
 *
 * If wait is one, this effectively waits on page writeback for all the pages
 * in the extent, and it waits on the io completion code to insert
 * metadata into the btree corresponding to the extent
 */
void btrfs_start_ordered_extent(struct inode *inode,
				struct btrfs_ordered_extent *entry,
				int wait)
{
	u64 start = entry->file_offset;
	u64 end = start + entry->num_bytes - 1;

	trace_btrfs_ordered_extent_start(inode, entry);

	/*
	 * pages in the range can be dirty, clean or writeback. We
	 * start IO on any dirty ones so the wait doesn't stall waiting
	 * for the flusher thread to find them
	 */
	if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
		filemap_fdatawrite_range(inode->i_mapping, start, end);
	if (wait) {
		wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
						 &entry->flags));
	}
}

/*
 * Used to wait on ordered extents across a large range of bytes.
 */
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
{
	int ret = 0;
	int ret_wb = 0;
	u64 end;
	u64 orig_end;
	struct btrfs_ordered_extent *ordered;

	if (start + len < start) {
		orig_end = INT_LIMIT(loff_t);
	} else {
		orig_end = start + len - 1;
		if (orig_end > INT_LIMIT(loff_t))
			orig_end = INT_LIMIT(loff_t);
	}

	/* start IO across the range first to instantiate any delalloc
	 * extents
	 */
	ret = btrfs_fdatawrite_range(inode, start, orig_end);
	if (ret)
		return ret;

	/*
	 * If we have a writeback error don't return immediately. Wait first
	 * for any ordered extents that haven't completed yet. This is to make
	 * sure no one can dirty the same page ranges and call writepages()
	 * before the ordered extents complete - to avoid failures (-EEXIST)
	 * when adding the new ordered extents to the ordered tree.
	 */
	ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end);

	end = orig_end;
	while (1) {
		ordered = btrfs_lookup_first_ordered_extent(inode, end);
		if (!ordered)
			break;
		if (ordered->file_offset > orig_end) {
			btrfs_put_ordered_extent(ordered);
			break;
		}
		if (ordered->file_offset + ordered->num_bytes <= start) {
			btrfs_put_ordered_extent(ordered);
			break;
		}
		btrfs_start_ordered_extent(inode, ordered, 1);
		end = ordered->file_offset;
		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
			ret = -EIO;
		btrfs_put_ordered_extent(ordered);
		if (ret || end == 0 || end == start)
			break;
		end--;
	}
	return ret_wb ? ret_wb : ret;
}

/*
 * find an ordered extent corresponding to file_offset. return NULL if
 * nothing is found, otherwise take a reference on the extent and return it
 */
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
							 u64 file_offset)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &BTRFS_I(inode)->ordered_tree;
	spin_lock_irq(&tree->lock);
	node = tree_search(tree, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	if (!offset_in_entry(entry, file_offset))
		entry = NULL;
	if (entry)
		refcount_inc(&entry->refs);
out:
	spin_unlock_irq(&tree->lock);
	return entry;
}

/* Since the DIO code tries to lock a wide area we need to look for any ordered
 * extents that exist in the range, rather than just the start of the range.
 */
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
		struct btrfs_inode *inode, u64 file_offset, u64 len)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &inode->ordered_tree;
	spin_lock_irq(&tree->lock);
	node = tree_search(tree, file_offset);
	if (!node) {
		node = tree_search(tree, file_offset + len);
		if (!node)
			goto out;
	}

	while (1) {
		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
		if (range_overlaps(entry, file_offset, len))
			break;

		if (entry->file_offset >= file_offset + len) {
			entry = NULL;
			break;
		}
		entry = NULL;
		node = rb_next(node);
		if (!node)
			break;
	}
out:
	if (entry)
		refcount_inc(&entry->refs);
	spin_unlock_irq(&tree->lock);
	return entry;
}

/*
 * lookup and return any extent before 'file_offset'. NULL is returned
 * if none is found
 */
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

	tree = &BTRFS_I(inode)->ordered_tree;
	spin_lock_irq(&tree->lock);
	node = tree_search(tree, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
	refcount_inc(&entry->refs);
out:
	spin_unlock_irq(&tree->lock);
	return entry;
}

/*
 * After an extent is done, call this to conditionally update the on disk
 * i_size. i_size is updated to cover any fully written part of the file.
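 *
 * For example, if ordered extents for [0, 64K) and [64K, 128K) are both in
 * flight and the second one finishes first, disk_i_size is not advanced yet
 * because the first extent is still pending; the 128K end is remembered in
 * the first extent's ->outstanding_isize and disk_i_size moves straight to
 * 128K once that extent completes (assuming i_size already covers 128K).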
 */
int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
				struct btrfs_ordered_extent *ordered)
{
	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
	u64 disk_i_size;
	u64 new_i_size;
	u64 i_size = i_size_read(inode);
	struct rb_node *node;
	struct rb_node *prev = NULL;
	struct btrfs_ordered_extent *test;
	int ret = 1;
	u64 orig_offset = offset;

	spin_lock_irq(&tree->lock);
	if (ordered) {
		offset = entry_end(ordered);
		if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags))
			offset = min(offset,
				     ordered->file_offset +
				     ordered->truncated_len);
	} else {
		offset = ALIGN(offset, btrfs_inode_sectorsize(inode));
	}
	disk_i_size = BTRFS_I(inode)->disk_i_size;

	/*
	 * truncate file.
	 * If ordered is not NULL, then this is called from endio and
	 * disk_i_size will be updated by either truncate itself or any
	 * in-flight IOs which are inside the disk_i_size.
	 *
	 * Because btrfs_setsize() may set i_size with disk_i_size if truncate
	 * fails somehow, we need to make sure we have a precise disk_i_size by
	 * updating it as usual.
	 */
	if (!ordered && disk_i_size > i_size) {
		BTRFS_I(inode)->disk_i_size = orig_offset;
		ret = 0;
		goto out;
	}

	/*
	 * if the disk i_size is already at the inode->i_size, or
	 * this ordered extent is inside the disk i_size, we're done
	 */
	if (disk_i_size == i_size)
		goto out;

	/*
	 * We still need to update disk_i_size if outstanding_isize is greater
	 * than disk_i_size.
	 */
	if (offset <= disk_i_size &&
	    (!ordered || ordered->outstanding_isize <= disk_i_size))
		goto out;

	/*
	 * walk backward from this ordered extent to disk_i_size.
	 * if we find an ordered extent then we can't update disk i_size
	 * yet
	 */
	if (ordered) {
		node = rb_prev(&ordered->rb_node);
	} else {
		prev = tree_search(tree, offset);
		/*
		 * we insert file extents without involving ordered struct,
		 * so there should be no ordered struct covering this offset
		 */
		if (prev) {
			test = rb_entry(prev, struct btrfs_ordered_extent,
					rb_node);
			BUG_ON(offset_in_entry(test, offset));
		}
		node = prev;
	}
	for (; node; node = rb_prev(node)) {
		test = rb_entry(node, struct btrfs_ordered_extent, rb_node);

		/* We treat this entry as if it doesn't exist */
		if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
			continue;

		if (entry_end(test) <= disk_i_size)
			break;
		if (test->file_offset >= i_size)
			break;

		/*
		 * We don't update disk_i_size now, so record this pending
		 * i_size update here, otherwise we would lose track of the
		 * real i_size.
		 */
		if (test->outstanding_isize < offset)
			test->outstanding_isize = offset;
		if (ordered &&
		    ordered->outstanding_isize > test->outstanding_isize)
			test->outstanding_isize = ordered->outstanding_isize;
		goto out;
	}
	new_i_size = min_t(u64, offset, i_size);

	/*
	 * Some ordered extents may have completed before the current one, and
	 * we hold the real i_size in ->outstanding_isize.
	 */
	if (ordered && ordered->outstanding_isize > new_i_size)
		new_i_size = min_t(u64, ordered->outstanding_isize, i_size);
	BTRFS_I(inode)->disk_i_size = new_i_size;
	ret = 0;
out:
	/*
	 * We need to do this because we can't remove ordered extents until
	 * after the disk_i_size has been updated and then the inode has been
	 * updated to reflect the change, so we need to tell anybody who finds
	 * this ordered extent that we've already done all the real work, we
	 * just haven't completed all the other work.
	 */
	if (ordered)
		set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags);
	spin_unlock_irq(&tree->lock);
	return ret;
}

/*
 * search the ordered extents for one corresponding to 'offset' and
 * try to find a checksum. This is used because we allow pages to
 * be reclaimed before their checksum is actually put into the btree
 */
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
			   u8 *sum, int len)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_ordered_sum *ordered_sum;
	struct btrfs_ordered_extent *ordered;
	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
	unsigned long num_sectors;
	unsigned long i;
	u32 sectorsize = btrfs_inode_sectorsize(inode);
	const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
	int index = 0;

	ordered = btrfs_lookup_ordered_extent(inode, offset);
	if (!ordered)
		return 0;

	spin_lock_irq(&tree->lock);
	list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
		if (disk_bytenr >= ordered_sum->bytenr &&
		    disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
			i = (disk_bytenr - ordered_sum->bytenr) >>
			    inode->i_sb->s_blocksize_bits;
			num_sectors = ordered_sum->len >>
				      inode->i_sb->s_blocksize_bits;
			num_sectors = min_t(int, len - index, num_sectors - i);
			memcpy(sum + index, ordered_sum->sums + i * csum_size,
			       num_sectors * csum_size);

			index += (int)num_sectors * csum_size;
			if (index == len)
				goto out;
			disk_bytenr += num_sectors * sectorsize;
		}
	}
out:
	spin_unlock_irq(&tree->lock);
	btrfs_put_ordered_extent(ordered);
	return index;
}

/*
 * btrfs_lock_and_flush_ordered_range - Lock the passed range and ensure all
 * pending ordered extents in it are run to completion.
 *
 * @tree: IO tree used for locking out other users of the range
 * @inode: Inode whose ordered tree is to be searched
 * @start: Beginning of range to flush
 * @end: Last byte of range to lock
 * @cached_state: If passed, will return the extent state responsible for the
 * locked range. It's the caller's responsibility to free the cached state.
 *
 * This function always returns with the given range locked, ensuring that
 * after it's called no ordered extent can be pending.
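 *
 * A typical caller might do something like this (illustrative sketch only;
 * 'start', 'end' and the later unlock are the caller's responsibility):
 *
 *	struct extent_state *cached = NULL;
 *
 *	btrfs_lock_and_flush_ordered_range(&inode->io_tree, inode, start, end,
 *					   &cached);
 *	... work on the now quiescent, locked range ...
 *	unlock_extent_cached(&inode->io_tree, start, end, &cached);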
 */
void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree,
					struct btrfs_inode *inode, u64 start,
					u64 end,
					struct extent_state **cached_state)
{
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cache = NULL;
	struct extent_state **cachedp = &cache;

	if (cached_state)
		cachedp = cached_state;

	while (1) {
		lock_extent_bits(tree, start, end, cachedp);
		ordered = btrfs_lookup_ordered_range(inode, start,
						     end - start + 1);
		if (!ordered) {
			/*
			 * If no external cached_state has been passed then
			 * decrement the extra ref taken for cachedp since we
			 * aren't exposing it outside of this function
			 */
			if (!cached_state)
				refcount_dec(&cache->refs);
			break;
		}
		unlock_extent_cached(tree, start, end, cachedp);
		btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);
		btrfs_put_ordered_extent(ordered);
	}
}

int __init ordered_data_init(void)
{
	btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
					sizeof(struct btrfs_ordered_extent), 0,
					SLAB_MEM_SPREAD,
					NULL);
	if (!btrfs_ordered_extent_cache)
		return -ENOMEM;

	return 0;
}

void __cold ordered_data_exit(void)
{
	kmem_cache_destroy(btrfs_ordered_extent_cache);
}