// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include "ctree.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "extent_io.h"
#include "disk-io.h"
#include "compression.h"

static struct kmem_cache *btrfs_ordered_extent_cache;

static u64 entry_end(struct btrfs_ordered_extent *entry)
{
        if (entry->file_offset + entry->len < entry->file_offset)
                return (u64)-1;
        return entry->file_offset + entry->len;
}

/* returns NULL if the insertion worked, or it returns the node it did find
 * in the tree
 */
static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
                                   struct rb_node *node)
{
        struct rb_node **p = &root->rb_node;
        struct rb_node *parent = NULL;
        struct btrfs_ordered_extent *entry;

        while (*p) {
                parent = *p;
                entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);

                if (file_offset < entry->file_offset)
                        p = &(*p)->rb_left;
                else if (file_offset >= entry_end(entry))
                        p = &(*p)->rb_right;
                else
                        return parent;
        }

        rb_link_node(node, parent, p);
        rb_insert_color(node, root);
        return NULL;
}

static void ordered_data_tree_panic(struct inode *inode, int errno,
                                    u64 offset)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        btrfs_panic(fs_info, errno,
                    "Inconsistency in ordered tree at offset %llu", offset);
}

/*
 * look for a given offset in the tree, and if it can't be found return the
 * first lesser offset
 */
static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
                                     struct rb_node **prev_ret)
{
        struct rb_node *n = root->rb_node;
        struct rb_node *prev = NULL;
        struct rb_node *test;
        struct btrfs_ordered_extent *entry;
        struct btrfs_ordered_extent *prev_entry = NULL;

        while (n) {
                entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
                prev = n;
                prev_entry = entry;

                if (file_offset < entry->file_offset)
                        n = n->rb_left;
                else if (file_offset >= entry_end(entry))
                        n = n->rb_right;
                else
                        return n;
        }
        if (!prev_ret)
                return NULL;

        while (prev && file_offset >= entry_end(prev_entry)) {
                test = rb_next(prev);
                if (!test)
                        break;
                prev_entry = rb_entry(test, struct btrfs_ordered_extent,
                                      rb_node);
                if (file_offset < entry_end(prev_entry))
                        break;

                prev = test;
        }
        if (prev)
                prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
                                      rb_node);
        while (prev && file_offset < entry_end(prev_entry)) {
                test = rb_prev(prev);
                if (!test)
                        break;
                prev_entry = rb_entry(test, struct btrfs_ordered_extent,
                                      rb_node);
                prev = test;
        }
        *prev_ret = prev;
        return NULL;
}

/*
 * helper to check if a given offset is inside a given entry
 */
static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
{
        if (file_offset < entry->file_offset ||
            entry->file_offset + entry->len <= file_offset)
                return 0;
        return 1;
}

static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
                          u64 len)
{
        if (file_offset + len <= entry->file_offset ||
            entry->file_offset + entry->len <= file_offset)
                return 0;
        return 1;
}
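
/*
 * Illustrative note (not from the original source): both helpers above treat
 * an ordered extent as the half-open byte range
 * [file_offset, file_offset + len).  For example, an entry with
 * file_offset == 4096 and len == 4096 contains offsets 4096..8191, so
 * offset_in_entry(entry, 8192) returns 0 and
 * range_overlaps(entry, 8192, 4096) returns 0 as well.
 */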

/*
 * look for the first ordered struct that has this offset, otherwise
 * the first one less than this offset
 */
static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
                                          u64 file_offset)
{
        struct rb_root *root = &tree->tree;
        struct rb_node *prev = NULL;
        struct rb_node *ret;
        struct btrfs_ordered_extent *entry;

        if (tree->last) {
                entry = rb_entry(tree->last, struct btrfs_ordered_extent,
                                 rb_node);
                if (offset_in_entry(entry, file_offset))
                        return tree->last;
        }
        ret = __tree_search(root, file_offset, &prev);
        if (!ret)
                ret = prev;
        if (ret)
                tree->last = ret;
        return ret;
}

/* allocate and add a new ordered_extent into the per-inode tree.
 * file_offset is the logical offset in the file
 *
 * start is the disk block number of an extent already reserved in the
 * extent allocation tree
 *
 * len is the length of the extent
 *
 * The tree is given a single reference on the ordered extent that was
 * inserted.
 */
static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
                                      u64 start, u64 len, u64 disk_len,
                                      int type, int dio, int compress_type)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_ordered_inode_tree *tree;
        struct rb_node *node;
        struct btrfs_ordered_extent *entry;

        tree = &BTRFS_I(inode)->ordered_tree;
        entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
        if (!entry)
                return -ENOMEM;

        entry->file_offset = file_offset;
        entry->start = start;
        entry->len = len;
        entry->disk_len = disk_len;
        entry->bytes_left = len;
        entry->inode = igrab(inode);
        entry->compress_type = compress_type;
        entry->truncated_len = (u64)-1;
        if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
                set_bit(type, &entry->flags);

        if (dio)
                set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);

        /* one ref for the tree */
        refcount_set(&entry->refs, 1);
        init_waitqueue_head(&entry->wait);
        INIT_LIST_HEAD(&entry->list);
        INIT_LIST_HEAD(&entry->root_extent_list);
        INIT_LIST_HEAD(&entry->work_list);
        init_completion(&entry->completion);
        INIT_LIST_HEAD(&entry->log_list);
        INIT_LIST_HEAD(&entry->trans_list);

        trace_btrfs_ordered_extent_add(inode, entry);

        spin_lock_irq(&tree->lock);
        node = tree_insert(&tree->tree, file_offset,
                           &entry->rb_node);
        if (node)
                ordered_data_tree_panic(inode, -EEXIST, file_offset);
        spin_unlock_irq(&tree->lock);

        spin_lock(&root->ordered_extent_lock);
        list_add_tail(&entry->root_extent_list,
                      &root->ordered_extents);
        root->nr_ordered_extents++;
        if (root->nr_ordered_extents == 1) {
                spin_lock(&fs_info->ordered_root_lock);
                BUG_ON(!list_empty(&root->ordered_root));
                list_add_tail(&root->ordered_root, &fs_info->ordered_roots);
                spin_unlock(&fs_info->ordered_root_lock);
        }
        spin_unlock(&root->ordered_extent_lock);

        /*
         * We don't need the count_max_extents here, we can assume that all of
         * that work has been done at higher layers, so this is truly the
         * smallest the extent is going to get.
         */
        spin_lock(&BTRFS_I(inode)->lock);
        btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
        spin_unlock(&BTRFS_I(inode)->lock);

        return 0;
}
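
/*
 * Illustrative sketch only (hypothetical caller, not part of this file): once
 * a delalloc range has a disk extent reserved, the writeback path records it
 * as an ordered extent before the data bios are submitted, roughly:
 *
 *	ret = btrfs_add_ordered_extent(inode, file_offset, disk_start,
 *				       num_bytes, num_bytes,
 *				       BTRFS_ORDERED_REGULAR);
 *	if (ret)
 *		goto out_error;
 *
 * "disk_start", "num_bytes" and the BTRFS_ORDERED_REGULAR type are assumed
 * placeholder details here; the real callers live in inode.c.
 */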

int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
                             u64 start, u64 len, u64 disk_len, int type)
{
        return __btrfs_add_ordered_extent(inode, file_offset, start, len,
                                          disk_len, type, 0,
                                          BTRFS_COMPRESS_NONE);
}

int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
                                 u64 start, u64 len, u64 disk_len, int type)
{
        return __btrfs_add_ordered_extent(inode, file_offset, start, len,
                                          disk_len, type, 1,
                                          BTRFS_COMPRESS_NONE);
}

int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
                                      u64 start, u64 len, u64 disk_len,
                                      int type, int compress_type)
{
        return __btrfs_add_ordered_extent(inode, file_offset, start, len,
                                          disk_len, type, 0,
                                          compress_type);
}

/*
 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
 * when an ordered extent is finished. If the list covers more than one
 * ordered extent, it is split across multiples.
 */
void btrfs_add_ordered_sum(struct inode *inode,
                           struct btrfs_ordered_extent *entry,
                           struct btrfs_ordered_sum *sum)
{
        struct btrfs_ordered_inode_tree *tree;

        tree = &BTRFS_I(inode)->ordered_tree;
        spin_lock_irq(&tree->lock);
        list_add_tail(&sum->list, &entry->list);
        spin_unlock_irq(&tree->lock);
}
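
/*
 * Note (informational): the csums queued on entry->list above are not written
 * out here.  They stay attached to the ordered extent until it completes, at
 * which point the completion path (btrfs_finish_ordered_io() in inode.c)
 * inserts them into the checksum tree; anything still on the list when the
 * last reference is dropped is simply freed in btrfs_put_ordered_extent().
 */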

/*
 * this is used to account for finished IO across a given range
 * of the file. The IO may span ordered extents. If
 * a given ordered_extent is completely done, 1 is returned, otherwise
 * 0.
 *
 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
 * to make sure this function only returns 1 once for a given ordered extent.
 *
 * file_offset is updated to one byte past the range that is recorded as
 * complete. This allows you to walk forward in the file.
 */
int btrfs_dec_test_first_ordered_pending(struct inode *inode,
                                         struct btrfs_ordered_extent **cached,
                                         u64 *file_offset, u64 io_size, int uptodate)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_ordered_inode_tree *tree;
        struct rb_node *node;
        struct btrfs_ordered_extent *entry = NULL;
        int ret;
        unsigned long flags;
        u64 dec_end;
        u64 dec_start;
        u64 to_dec;

        tree = &BTRFS_I(inode)->ordered_tree;
        spin_lock_irqsave(&tree->lock, flags);
        node = tree_search(tree, *file_offset);
        if (!node) {
                ret = 1;
                goto out;
        }

        entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
        if (!offset_in_entry(entry, *file_offset)) {
                ret = 1;
                goto out;
        }

        dec_start = max(*file_offset, entry->file_offset);
        dec_end = min(*file_offset + io_size, entry->file_offset +
                      entry->len);
        *file_offset = dec_end;
        if (dec_start > dec_end) {
                btrfs_crit(fs_info, "bad ordering dec_start %llu end %llu",
                           dec_start, dec_end);
        }
        to_dec = dec_end - dec_start;
        if (to_dec > entry->bytes_left) {
                btrfs_crit(fs_info,
                           "bad ordered accounting left %llu size %llu",
                           entry->bytes_left, to_dec);
        }
        entry->bytes_left -= to_dec;
        if (!uptodate)
                set_bit(BTRFS_ORDERED_IOERR, &entry->flags);

        if (entry->bytes_left == 0) {
                ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
                /*
                 * Implicit memory barrier after test_and_set_bit
                 */
                if (waitqueue_active(&entry->wait))
                        wake_up(&entry->wait);
        } else {
                ret = 1;
        }
out:
        if (!ret && cached && entry) {
                *cached = entry;
                refcount_inc(&entry->refs);
        }
        spin_unlock_irqrestore(&tree->lock, flags);
        return ret == 0;
}
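
/*
 * Worked example (illustrative): for an ordered extent with file_offset 0 and
 * len 8192, two bio completions of io_size 4096 each drive bytes_left from
 * 8192 to 4096 and then to 0.  Only the call that reaches bytes_left == 0 and
 * wins the test_and_set_bit() on BTRFS_ORDERED_IO_DONE returns 1; every other
 * call returns 0, and *file_offset is advanced one byte past the bytes just
 * accounted so the caller can keep walking forward through the range.
 */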

/*
 * this is used to account for finished IO across a given range
 * of the file. The IO should not span ordered extents. If
 * a given ordered_extent is completely done, 1 is returned, otherwise
 * 0.
 *
 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
 * to make sure this function only returns 1 once for a given ordered extent.
 */
int btrfs_dec_test_ordered_pending(struct inode *inode,
                                   struct btrfs_ordered_extent **cached,
                                   u64 file_offset, u64 io_size, int uptodate)
{
        struct btrfs_ordered_inode_tree *tree;
        struct rb_node *node;
        struct btrfs_ordered_extent *entry = NULL;
        unsigned long flags;
        int ret;

        tree = &BTRFS_I(inode)->ordered_tree;
        spin_lock_irqsave(&tree->lock, flags);
        if (cached && *cached) {
                entry = *cached;
                goto have_entry;
        }

        node = tree_search(tree, file_offset);
        if (!node) {
                ret = 1;
                goto out;
        }

        entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
have_entry:
        if (!offset_in_entry(entry, file_offset)) {
                ret = 1;
                goto out;
        }

        if (io_size > entry->bytes_left) {
                btrfs_crit(BTRFS_I(inode)->root->fs_info,
                           "bad ordered accounting left %llu size %llu",
                           entry->bytes_left, io_size);
        }
        entry->bytes_left -= io_size;
        if (!uptodate)
                set_bit(BTRFS_ORDERED_IOERR, &entry->flags);

        if (entry->bytes_left == 0) {
                ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
                /*
                 * Implicit memory barrier after test_and_set_bit
                 */
                if (waitqueue_active(&entry->wait))
                        wake_up(&entry->wait);
        } else {
                ret = 1;
        }
out:
        if (!ret && cached && entry) {
                *cached = entry;
                refcount_inc(&entry->refs);
        }
        spin_unlock_irqrestore(&tree->lock, flags);
        return ret == 0;
}

/* Needs to either be called under a log transaction or the log_mutex */
void btrfs_get_logged_extents(struct btrfs_inode *inode,
                              struct list_head *logged_list,
                              const loff_t start,
                              const loff_t end)
{
        struct btrfs_ordered_inode_tree *tree;
        struct btrfs_ordered_extent *ordered;
        struct rb_node *n;
        struct rb_node *prev;

        tree = &inode->ordered_tree;
        spin_lock_irq(&tree->lock);
        n = __tree_search(&tree->tree, end, &prev);
        if (!n)
                n = prev;
        for (; n; n = rb_prev(n)) {
                ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
                if (ordered->file_offset > end)
                        continue;
                if (entry_end(ordered) <= start)
                        break;
                if (test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
                        continue;
                list_add(&ordered->log_list, logged_list);
                refcount_inc(&ordered->refs);
        }
        spin_unlock_irq(&tree->lock);
}

void btrfs_put_logged_extents(struct list_head *logged_list)
{
        struct btrfs_ordered_extent *ordered;

        while (!list_empty(logged_list)) {
                ordered = list_first_entry(logged_list,
                                           struct btrfs_ordered_extent,
                                           log_list);
                list_del_init(&ordered->log_list);
                btrfs_put_ordered_extent(ordered);
        }
}

void btrfs_submit_logged_extents(struct list_head *logged_list,
                                 struct btrfs_root *log)
{
        int index = log->log_transid % 2;

        spin_lock_irq(&log->log_extents_lock[index]);
        list_splice_tail(logged_list, &log->logged_list[index]);
        spin_unlock_irq(&log->log_extents_lock[index]);
}
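
/*
 * Note (informational): the logged-extent helpers above and below keep
 * extents on one of two lists, indexed by (transid % 2), so extents logged
 * for the next log transaction can be collected while the previous one is
 * still being waited on or torn down.  btrfs_wait_logged_extents() and
 * btrfs_free_logged_extents() are presumably called with the transid that was
 * current when the extents were submitted, so they pick the matching list.
 */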

void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
                               struct btrfs_root *log, u64 transid)
{
        struct btrfs_ordered_extent *ordered;
        int index = transid % 2;

        spin_lock_irq(&log->log_extents_lock[index]);
        while (!list_empty(&log->logged_list[index])) {
                struct inode *inode;
                ordered = list_first_entry(&log->logged_list[index],
                                           struct btrfs_ordered_extent,
                                           log_list);
                list_del_init(&ordered->log_list);
                inode = ordered->inode;
                spin_unlock_irq(&log->log_extents_lock[index]);

                if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
                    !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
                        u64 start = ordered->file_offset;
                        u64 end = ordered->file_offset + ordered->len - 1;

                        WARN_ON(!inode);
                        filemap_fdatawrite_range(inode->i_mapping, start, end);
                }
                wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
                                                   &ordered->flags));

                /*
                 * In order to keep us from losing our ordered extent
                 * information when committing the transaction we have to make
                 * sure that any logged extents are completed when we go to
                 * commit the transaction. To do this we simply increase the
                 * current transaction's pending_ordered counter and decrement
                 * it when the ordered extent completes.
                 */
                if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
                        struct btrfs_ordered_inode_tree *tree;

                        tree = &BTRFS_I(inode)->ordered_tree;
                        spin_lock_irq(&tree->lock);
                        if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
                                set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
                                atomic_inc(&trans->transaction->pending_ordered);
                        }
                        spin_unlock_irq(&tree->lock);
                }
                btrfs_put_ordered_extent(ordered);
                spin_lock_irq(&log->log_extents_lock[index]);
        }
        spin_unlock_irq(&log->log_extents_lock[index]);
}

void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid)
{
        struct btrfs_ordered_extent *ordered;
        int index = transid % 2;

        spin_lock_irq(&log->log_extents_lock[index]);
        while (!list_empty(&log->logged_list[index])) {
                ordered = list_first_entry(&log->logged_list[index],
                                           struct btrfs_ordered_extent,
                                           log_list);
                list_del_init(&ordered->log_list);
                spin_unlock_irq(&log->log_extents_lock[index]);
                btrfs_put_ordered_extent(ordered);
                spin_lock_irq(&log->log_extents_lock[index]);
        }
        spin_unlock_irq(&log->log_extents_lock[index]);
}

/*
 * used to drop a reference on an ordered extent. This will free
 * the extent if the last reference is dropped
 */
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
{
        struct list_head *cur;
        struct btrfs_ordered_sum *sum;

        trace_btrfs_ordered_extent_put(entry->inode, entry);

        if (refcount_dec_and_test(&entry->refs)) {
                ASSERT(list_empty(&entry->log_list));
                ASSERT(list_empty(&entry->trans_list));
                ASSERT(list_empty(&entry->root_extent_list));
                ASSERT(RB_EMPTY_NODE(&entry->rb_node));
                if (entry->inode)
                        btrfs_add_delayed_iput(entry->inode);
                while (!list_empty(&entry->list)) {
                        cur = entry->list.next;
                        sum = list_entry(cur, struct btrfs_ordered_sum, list);
                        list_del(&sum->list);
                        kfree(sum);
                }
                kmem_cache_free(btrfs_ordered_extent_cache, entry);
        }
}
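
/*
 * Reference counting sketch (illustrative): the tree owns one reference from
 * __btrfs_add_ordered_extent(), and every lookup helper in this file that
 * returns an entry takes an extra one, so a typical consumer looks like:
 *
 *	ordered = btrfs_lookup_ordered_extent(inode, pos);
 *	if (ordered) {
 *		btrfs_start_ordered_extent(inode, ordered, 1);
 *		btrfs_put_ordered_extent(ordered);
 *	}
 *
 * The tree's own reference is expected to be dropped by the completion path
 * once btrfs_remove_ordered_extent() below has taken the entry out of the
 * rbtree.
 */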

/*
 * remove an ordered extent from the tree. No references are dropped
 * and waiters are woken up.
 */
void btrfs_remove_ordered_extent(struct inode *inode,
                                 struct btrfs_ordered_extent *entry)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_ordered_inode_tree *tree;
        struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
        struct btrfs_root *root = btrfs_inode->root;
        struct rb_node *node;
        bool dec_pending_ordered = false;

        /* This is paired with btrfs_add_ordered_extent. */
        spin_lock(&btrfs_inode->lock);
        btrfs_mod_outstanding_extents(btrfs_inode, -1);
        spin_unlock(&btrfs_inode->lock);
        if (root != fs_info->tree_root)
                btrfs_delalloc_release_metadata(btrfs_inode, entry->len, false);

        tree = &btrfs_inode->ordered_tree;
        spin_lock_irq(&tree->lock);
        node = &entry->rb_node;
        rb_erase(node, &tree->tree);
        RB_CLEAR_NODE(node);
        if (tree->last == node)
                tree->last = NULL;
        set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
        if (test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags))
                dec_pending_ordered = true;
        spin_unlock_irq(&tree->lock);

        /*
         * The current running transaction is waiting on us, we need to let it
         * know that we're complete and wake it up.
         */
        if (dec_pending_ordered) {
                struct btrfs_transaction *trans;

                /*
                 * The checks for trans are just a formality, it should be set,
                 * but if it isn't we don't want to deref/assert under the spin
                 * lock, so be nice and check if trans is set, but ASSERT() so
                 * if it isn't set a developer will notice.
                 */
                spin_lock(&fs_info->trans_lock);
                trans = fs_info->running_transaction;
                if (trans)
                        refcount_inc(&trans->use_count);
                spin_unlock(&fs_info->trans_lock);

                ASSERT(trans);
                if (trans) {
                        if (atomic_dec_and_test(&trans->pending_ordered))
                                wake_up(&trans->pending_wait);
                        btrfs_put_transaction(trans);
                }
        }

        spin_lock(&root->ordered_extent_lock);
        list_del_init(&entry->root_extent_list);
        root->nr_ordered_extents--;

        trace_btrfs_ordered_extent_remove(inode, entry);

        if (!root->nr_ordered_extents) {
                spin_lock(&fs_info->ordered_root_lock);
                BUG_ON(list_empty(&root->ordered_root));
                list_del_init(&root->ordered_root);
                spin_unlock(&fs_info->ordered_root_lock);
        }
        spin_unlock(&root->ordered_extent_lock);
        wake_up(&entry->wait);
}

static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
{
        struct btrfs_ordered_extent *ordered;

        ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
        btrfs_start_ordered_extent(ordered->inode, ordered, 1);
        complete(&ordered->completion);
}
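
/*
 * Note (informational): btrfs_run_ordered_extent_work() is the work-queue
 * callback used by btrfs_wait_ordered_extents() below.  Each queued item
 * starts IO and waits for the ordered extent to complete (wait == 1), then
 * signals ->completion so the submitter can reap it with wait_for_completion()
 * and drop its reference.
 */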

/*
 * wait for all the ordered extents in a root. This is done when balancing
 * space between drives.
 */
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
                               const u64 range_start, const u64 range_len)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        LIST_HEAD(splice);
        LIST_HEAD(skipped);
        LIST_HEAD(works);
        struct btrfs_ordered_extent *ordered, *next;
        u64 count = 0;
        const u64 range_end = range_start + range_len;

        mutex_lock(&root->ordered_extent_mutex);
        spin_lock(&root->ordered_extent_lock);
        list_splice_init(&root->ordered_extents, &splice);
        while (!list_empty(&splice) && nr) {
                ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
                                           root_extent_list);

                if (range_end <= ordered->start ||
                    ordered->start + ordered->disk_len <= range_start) {
                        list_move_tail(&ordered->root_extent_list, &skipped);
                        cond_resched_lock(&root->ordered_extent_lock);
                        continue;
                }

                list_move_tail(&ordered->root_extent_list,
                               &root->ordered_extents);
                refcount_inc(&ordered->refs);
                spin_unlock(&root->ordered_extent_lock);

                btrfs_init_work(&ordered->flush_work,
                                btrfs_flush_delalloc_helper,
                                btrfs_run_ordered_extent_work, NULL, NULL);
                list_add_tail(&ordered->work_list, &works);
                btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work);

                cond_resched();
                spin_lock(&root->ordered_extent_lock);
                if (nr != U64_MAX)
                        nr--;
                count++;
        }
        list_splice_tail(&skipped, &root->ordered_extents);
        list_splice_tail(&splice, &root->ordered_extents);
        spin_unlock(&root->ordered_extent_lock);

        list_for_each_entry_safe(ordered, next, &works, work_list) {
                list_del_init(&ordered->work_list);
                wait_for_completion(&ordered->completion);
                btrfs_put_ordered_extent(ordered);
                cond_resched();
        }
        mutex_unlock(&root->ordered_extent_mutex);

        return count;
}

u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
                             const u64 range_start, const u64 range_len)
{
        struct btrfs_root *root;
        struct list_head splice;
        u64 total_done = 0;
        u64 done;

        INIT_LIST_HEAD(&splice);

        mutex_lock(&fs_info->ordered_operations_mutex);
        spin_lock(&fs_info->ordered_root_lock);
        list_splice_init(&fs_info->ordered_roots, &splice);
        while (!list_empty(&splice) && nr) {
                root = list_first_entry(&splice, struct btrfs_root,
                                        ordered_root);
                root = btrfs_grab_fs_root(root);
                BUG_ON(!root);
                list_move_tail(&root->ordered_root,
                               &fs_info->ordered_roots);
                spin_unlock(&fs_info->ordered_root_lock);

                done = btrfs_wait_ordered_extents(root, nr,
                                                  range_start, range_len);
                btrfs_put_fs_root(root);
                total_done += done;

                spin_lock(&fs_info->ordered_root_lock);
                if (nr != U64_MAX) {
                        nr -= done;
                }
        }
        list_splice_tail(&splice, &fs_info->ordered_roots);
        spin_unlock(&fs_info->ordered_root_lock);
        mutex_unlock(&fs_info->ordered_operations_mutex);

        return total_done;
}
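
/*
 * Usage sketch (illustrative): to flush and wait for every ordered extent in
 * the filesystem, callers such as the sync path pass "no limit" and the whole
 * byte range, roughly:
 *
 *	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
 *
 * A finite nr bounds how many ordered extents are waited on, while
 * range_start/range_len restrict the work to extents overlapping that disk
 * byte range (compared against ordered->start and ordered->disk_len above).
 */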

/*
 * Used to start IO or wait for a given ordered extent to finish.
 *
 * If wait is one, this effectively waits on page writeback for all the pages
 * in the extent, and it waits on the io completion code to insert
 * metadata into the btree corresponding to the extent
 */
void btrfs_start_ordered_extent(struct inode *inode,
                                struct btrfs_ordered_extent *entry,
                                int wait)
{
        u64 start = entry->file_offset;
        u64 end = start + entry->len - 1;

        trace_btrfs_ordered_extent_start(inode, entry);

        /*
         * pages in the range can be dirty, clean or writeback. We
         * start IO on any dirty ones so the wait doesn't stall waiting
         * for the flusher thread to find them
         */
        if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
                filemap_fdatawrite_range(inode->i_mapping, start, end);
        if (wait) {
                wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
                                                 &entry->flags));
        }
}

/*
 * Used to wait on ordered extents across a large range of bytes.
 */
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
{
        int ret = 0;
        int ret_wb = 0;
        u64 end;
        u64 orig_end;
        struct btrfs_ordered_extent *ordered;

        if (start + len < start) {
                orig_end = INT_LIMIT(loff_t);
        } else {
                orig_end = start + len - 1;
                if (orig_end > INT_LIMIT(loff_t))
                        orig_end = INT_LIMIT(loff_t);
        }

        /* start IO across the range first to instantiate any delalloc
         * extents
         */
        ret = btrfs_fdatawrite_range(inode, start, orig_end);
        if (ret)
                return ret;

        /*
         * If we have a writeback error don't return immediately. Wait first
         * for any ordered extents that haven't completed yet. This is to make
         * sure no one can dirty the same page ranges and call writepages()
         * before the ordered extents complete - to avoid failures (-EEXIST)
         * when adding the new ordered extents to the ordered tree.
         */
        ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end);

        end = orig_end;
        while (1) {
                ordered = btrfs_lookup_first_ordered_extent(inode, end);
                if (!ordered)
                        break;
                if (ordered->file_offset > orig_end) {
                        btrfs_put_ordered_extent(ordered);
                        break;
                }
                if (ordered->file_offset + ordered->len <= start) {
                        btrfs_put_ordered_extent(ordered);
                        break;
                }
                btrfs_start_ordered_extent(inode, ordered, 1);
                end = ordered->file_offset;
                if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
                        ret = -EIO;
                btrfs_put_ordered_extent(ordered);
                if (ret || end == 0 || end == start)
                        break;
                end--;
        }
        return ret_wb ? ret_wb : ret;
}
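
/*
 * Usage sketch (illustrative): callers such as the fsync and truncate paths
 * use this to make a byte range stable before proceeding, e.g.
 *
 *	ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
 *	if (ret)
 *		return ret;
 *
 * A len that would run past the largest loff_t is clamped above, so passing
 * (u64)-1 waits on everything from 'start' to the end of the file.
 */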

/*
 * find an ordered extent corresponding to file_offset. return NULL if
 * nothing is found, otherwise take a reference on the extent and return it
 */
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
                                                         u64 file_offset)
{
        struct btrfs_ordered_inode_tree *tree;
        struct rb_node *node;
        struct btrfs_ordered_extent *entry = NULL;

        tree = &BTRFS_I(inode)->ordered_tree;
        spin_lock_irq(&tree->lock);
        node = tree_search(tree, file_offset);
        if (!node)
                goto out;

        entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
        if (!offset_in_entry(entry, file_offset))
                entry = NULL;
        if (entry)
                refcount_inc(&entry->refs);
out:
        spin_unlock_irq(&tree->lock);
        return entry;
}

/* Since the DIO code tries to lock a wide area we need to look for any ordered
 * extents that exist in the range, rather than just the start of the range.
 */
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
                struct btrfs_inode *inode, u64 file_offset, u64 len)
{
        struct btrfs_ordered_inode_tree *tree;
        struct rb_node *node;
        struct btrfs_ordered_extent *entry = NULL;

        tree = &inode->ordered_tree;
        spin_lock_irq(&tree->lock);
        node = tree_search(tree, file_offset);
        if (!node) {
                node = tree_search(tree, file_offset + len);
                if (!node)
                        goto out;
        }

        while (1) {
                entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
                if (range_overlaps(entry, file_offset, len))
                        break;

                if (entry->file_offset >= file_offset + len) {
                        entry = NULL;
                        break;
                }
                entry = NULL;
                node = rb_next(node);
                if (!node)
                        break;
        }
out:
        if (entry)
                refcount_inc(&entry->refs);
        spin_unlock_irq(&tree->lock);
        return entry;
}

bool btrfs_have_ordered_extents_in_range(struct inode *inode,
                                         u64 file_offset,
                                         u64 len)
{
        struct btrfs_ordered_extent *oe;

        oe = btrfs_lookup_ordered_range(BTRFS_I(inode), file_offset, len);
        if (oe) {
                btrfs_put_ordered_extent(oe);
                return true;
        }
        return false;
}

/*
 * lookup and return any extent before 'file_offset'. NULL is returned
 * if none is found
 */
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
{
        struct btrfs_ordered_inode_tree *tree;
        struct rb_node *node;
        struct btrfs_ordered_extent *entry = NULL;

        tree = &BTRFS_I(inode)->ordered_tree;
        spin_lock_irq(&tree->lock);
        node = tree_search(tree, file_offset);
        if (!node)
                goto out;

        entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
        refcount_inc(&entry->refs);
out:
        spin_unlock_irq(&tree->lock);
        return entry;
}
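
/*
 * Summary of the lookup helpers above (informational):
 * btrfs_lookup_ordered_extent() only returns an entry that actually contains
 * file_offset, btrfs_lookup_ordered_range() returns any entry overlapping
 * [file_offset, file_offset + len), and btrfs_lookup_first_ordered_extent()
 * returns whatever entry tree_search() finds near the offset even when it
 * does not contain it, which is what btrfs_wait_ordered_range() relies on
 * when it walks backwards through a range.  All three take a reference that
 * the caller must drop with btrfs_put_ordered_extent().
 */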

/*
 * After an extent is done, call this to conditionally update the on disk
 * i_size. i_size is updated to cover any fully written part of the file.
 */
int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
                                struct btrfs_ordered_extent *ordered)
{
        struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
        u64 disk_i_size;
        u64 new_i_size;
        u64 i_size = i_size_read(inode);
        struct rb_node *node;
        struct rb_node *prev = NULL;
        struct btrfs_ordered_extent *test;
        int ret = 1;
        u64 orig_offset = offset;

        spin_lock_irq(&tree->lock);
        if (ordered) {
                offset = entry_end(ordered);
                if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags))
                        offset = min(offset,
                                     ordered->file_offset +
                                     ordered->truncated_len);
        } else {
                offset = ALIGN(offset, btrfs_inode_sectorsize(inode));
        }
        disk_i_size = BTRFS_I(inode)->disk_i_size;

        /*
         * truncate file.
         * If ordered is not NULL, then this is called from endio and
         * disk_i_size will be updated by either truncate itself or any
         * in-flight IOs which are inside the disk_i_size.
         *
         * Because btrfs_setsize() may set i_size with disk_i_size if truncate
         * fails somehow, we need to make sure we have a precise disk_i_size by
         * updating it as usual.
         */
        if (!ordered && disk_i_size > i_size) {
                BTRFS_I(inode)->disk_i_size = orig_offset;
                ret = 0;
                goto out;
        }

        /*
         * if the disk i_size is already at the inode->i_size, or
         * this ordered extent is inside the disk i_size, we're done
         */
        if (disk_i_size == i_size)
                goto out;

        /*
         * We still need to update disk_i_size if outstanding_isize is greater
         * than disk_i_size.
         */
        if (offset <= disk_i_size &&
            (!ordered || ordered->outstanding_isize <= disk_i_size))
                goto out;

        /*
         * walk backward from this ordered extent to disk_i_size.
         * if we find an ordered extent then we can't update disk i_size
         * yet
         */
        if (ordered) {
                node = rb_prev(&ordered->rb_node);
        } else {
                prev = tree_search(tree, offset);
                /*
                 * we insert file extents without involving ordered struct,
                 * so there should be no ordered struct covering this offset
                 */
                if (prev) {
                        test = rb_entry(prev, struct btrfs_ordered_extent,
                                        rb_node);
                        BUG_ON(offset_in_entry(test, offset));
                }
                node = prev;
        }
        for (; node; node = rb_prev(node)) {
                test = rb_entry(node, struct btrfs_ordered_extent, rb_node);

                /* We treat this entry as if it doesn't exist */
                if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
                        continue;

                if (entry_end(test) <= disk_i_size)
                        break;
                if (test->file_offset >= i_size)
                        break;

                /*
                 * We don't update disk_i_size now, so record this undealt
                 * i_size. Or we will not know the real i_size.
                 */
                if (test->outstanding_isize < offset)
                        test->outstanding_isize = offset;
                if (ordered &&
                    ordered->outstanding_isize > test->outstanding_isize)
                        test->outstanding_isize = ordered->outstanding_isize;
                goto out;
        }
        new_i_size = min_t(u64, offset, i_size);

        /*
         * Some ordered extents may have completed before the current one, and
         * we hold the real i_size in ->outstanding_isize.
         */
        if (ordered && ordered->outstanding_isize > new_i_size)
                new_i_size = min_t(u64, ordered->outstanding_isize, i_size);
        BTRFS_I(inode)->disk_i_size = new_i_size;
        ret = 0;
out:
        /*
         * We need to do this because we can't remove ordered extents until
         * after the i_disk_size has been updated and then the inode has been
         * updated to reflect the change, so we need to tell anybody who finds
         * this ordered extent that we've already done all the real work, we
         * just haven't completed all the other work.
         */
        if (ordered)
                set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags);
        spin_unlock_irq(&tree->lock);
        return ret;
}
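
/*
 * Worked example (illustrative): with i_size 12288 and pending ordered extents
 * A=[0,4096), B=[4096,8192) and C=[8192,12288), completing C first cannot move
 * disk_i_size past the still-pending B, so C's end offset is parked in
 * B->outstanding_isize and disk_i_size stays put.  When A completes,
 * disk_i_size advances to 4096; when B finally completes, its recorded
 * outstanding_isize lets disk_i_size jump straight to 12288 even though C
 * finished long before.
 */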

/*
 * search the ordered extents for one corresponding to 'offset' and
 * try to find a checksum. This is used because we allow pages to
 * be reclaimed before their checksum is actually put into the btree
 */
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
                           u32 *sum, int len)
{
        struct btrfs_ordered_sum *ordered_sum;
        struct btrfs_ordered_extent *ordered;
        struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
        unsigned long num_sectors;
        unsigned long i;
        u32 sectorsize = btrfs_inode_sectorsize(inode);
        int index = 0;

        ordered = btrfs_lookup_ordered_extent(inode, offset);
        if (!ordered)
                return 0;

        spin_lock_irq(&tree->lock);
        list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
                if (disk_bytenr >= ordered_sum->bytenr &&
                    disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
                        i = (disk_bytenr - ordered_sum->bytenr) >>
                            inode->i_sb->s_blocksize_bits;
                        num_sectors = ordered_sum->len >>
                                      inode->i_sb->s_blocksize_bits;
                        num_sectors = min_t(int, len - index, num_sectors - i);
                        memcpy(sum + index, ordered_sum->sums + i,
                               num_sectors);

                        index += (int)num_sectors;
                        if (index == len)
                                goto out;
                        disk_bytenr += num_sectors * sectorsize;
                }
        }
out:
        spin_unlock_irq(&tree->lock);
        btrfs_put_ordered_extent(ordered);
        return index;
}

int __init ordered_data_init(void)
{
        btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
                                     sizeof(struct btrfs_ordered_extent), 0,
                                     SLAB_MEM_SPREAD,
                                     NULL);
        if (!btrfs_ordered_extent_cache)
                return -ENOMEM;

        return 0;
}

void __cold ordered_data_exit(void)
{
        kmem_cache_destroy(btrfs_ordered_extent_cache);
}