1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2011 Fujitsu. All rights reserved. 4 * Written by Miao Xie <miaox@cn.fujitsu.com> 5 */ 6 7 #include <linux/slab.h> 8 #include <linux/iversion.h> 9 #include "misc.h" 10 #include "delayed-inode.h" 11 #include "disk-io.h" 12 #include "transaction.h" 13 #include "ctree.h" 14 #include "qgroup.h" 15 #include "locking.h" 16 #include "inode-item.h" 17 18 #define BTRFS_DELAYED_WRITEBACK 512 19 #define BTRFS_DELAYED_BACKGROUND 128 20 #define BTRFS_DELAYED_BATCH 16 21 22 static struct kmem_cache *delayed_node_cache; 23 24 int __init btrfs_delayed_inode_init(void) 25 { 26 delayed_node_cache = kmem_cache_create("btrfs_delayed_node", 27 sizeof(struct btrfs_delayed_node), 28 0, 29 SLAB_MEM_SPREAD, 30 NULL); 31 if (!delayed_node_cache) 32 return -ENOMEM; 33 return 0; 34 } 35 36 void __cold btrfs_delayed_inode_exit(void) 37 { 38 kmem_cache_destroy(delayed_node_cache); 39 } 40 41 static inline void btrfs_init_delayed_node( 42 struct btrfs_delayed_node *delayed_node, 43 struct btrfs_root *root, u64 inode_id) 44 { 45 delayed_node->root = root; 46 delayed_node->inode_id = inode_id; 47 refcount_set(&delayed_node->refs, 0); 48 delayed_node->ins_root = RB_ROOT_CACHED; 49 delayed_node->del_root = RB_ROOT_CACHED; 50 mutex_init(&delayed_node->mutex); 51 INIT_LIST_HEAD(&delayed_node->n_list); 52 INIT_LIST_HEAD(&delayed_node->p_list); 53 } 54 55 static struct btrfs_delayed_node *btrfs_get_delayed_node( 56 struct btrfs_inode *btrfs_inode) 57 { 58 struct btrfs_root *root = btrfs_inode->root; 59 u64 ino = btrfs_ino(btrfs_inode); 60 struct btrfs_delayed_node *node; 61 62 node = READ_ONCE(btrfs_inode->delayed_node); 63 if (node) { 64 refcount_inc(&node->refs); 65 return node; 66 } 67 68 spin_lock(&root->inode_lock); 69 node = radix_tree_lookup(&root->delayed_nodes_tree, ino); 70 71 if (node) { 72 if (btrfs_inode->delayed_node) { 73 refcount_inc(&node->refs); /* can be accessed */ 74 BUG_ON(btrfs_inode->delayed_node != node); 75 spin_unlock(&root->inode_lock); 76 return node; 77 } 78 79 /* 80 * It's possible that we're racing into the middle of removing 81 * this node from the radix tree. In this case, the refcount 82 * was zero and it should never go back to one. Just return 83 * NULL like it was never in the radix at all; our release 84 * function is in the process of removing it. 85 * 86 * Some implementations of refcount_inc refuse to bump the 87 * refcount once it has hit zero. If we don't do this dance 88 * here, refcount_inc() may decide to just WARN_ONCE() instead 89 * of actually bumping the refcount. 90 * 91 * If this node is properly in the radix, we want to bump the 92 * refcount twice, once for the inode and once for this get 93 * operation. 94 */ 95 if (refcount_inc_not_zero(&node->refs)) { 96 refcount_inc(&node->refs); 97 btrfs_inode->delayed_node = node; 98 } else { 99 node = NULL; 100 } 101 102 spin_unlock(&root->inode_lock); 103 return node; 104 } 105 spin_unlock(&root->inode_lock); 106 107 return NULL; 108 } 109 110 /* Will return either the node or PTR_ERR(-ENOMEM) */ 111 static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( 112 struct btrfs_inode *btrfs_inode) 113 { 114 struct btrfs_delayed_node *node; 115 struct btrfs_root *root = btrfs_inode->root; 116 u64 ino = btrfs_ino(btrfs_inode); 117 int ret; 118 119 again: 120 node = btrfs_get_delayed_node(btrfs_inode); 121 if (node) 122 return node; 123 124 node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS); 125 if (!node) 126 return ERR_PTR(-ENOMEM); 127 btrfs_init_delayed_node(node, root, ino); 128 129 /* cached in the btrfs inode and can be accessed */ 130 refcount_set(&node->refs, 2); 131 132 ret = radix_tree_preload(GFP_NOFS); 133 if (ret) { 134 kmem_cache_free(delayed_node_cache, node); 135 return ERR_PTR(ret); 136 } 137 138 spin_lock(&root->inode_lock); 139 ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node); 140 if (ret == -EEXIST) { 141 spin_unlock(&root->inode_lock); 142 kmem_cache_free(delayed_node_cache, node); 143 radix_tree_preload_end(); 144 goto again; 145 } 146 btrfs_inode->delayed_node = node; 147 spin_unlock(&root->inode_lock); 148 radix_tree_preload_end(); 149 150 return node; 151 } 152 153 /* 154 * Call it when holding delayed_node->mutex 155 * 156 * If mod = 1, add this node into the prepared list. 157 */ 158 static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root, 159 struct btrfs_delayed_node *node, 160 int mod) 161 { 162 spin_lock(&root->lock); 163 if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { 164 if (!list_empty(&node->p_list)) 165 list_move_tail(&node->p_list, &root->prepare_list); 166 else if (mod) 167 list_add_tail(&node->p_list, &root->prepare_list); 168 } else { 169 list_add_tail(&node->n_list, &root->node_list); 170 list_add_tail(&node->p_list, &root->prepare_list); 171 refcount_inc(&node->refs); /* inserted into list */ 172 root->nodes++; 173 set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags); 174 } 175 spin_unlock(&root->lock); 176 } 177 178 /* Call it when holding delayed_node->mutex */ 179 static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root, 180 struct btrfs_delayed_node *node) 181 { 182 spin_lock(&root->lock); 183 if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { 184 root->nodes--; 185 refcount_dec(&node->refs); /* not in the list */ 186 list_del_init(&node->n_list); 187 if (!list_empty(&node->p_list)) 188 list_del_init(&node->p_list); 189 clear_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags); 190 } 191 spin_unlock(&root->lock); 192 } 193 194 static struct btrfs_delayed_node *btrfs_first_delayed_node( 195 struct btrfs_delayed_root *delayed_root) 196 { 197 struct list_head *p; 198 struct btrfs_delayed_node *node = NULL; 199 200 spin_lock(&delayed_root->lock); 201 if (list_empty(&delayed_root->node_list)) 202 goto out; 203 204 p = delayed_root->node_list.next; 205 node = list_entry(p, struct btrfs_delayed_node, n_list); 206 refcount_inc(&node->refs); 207 out: 208 spin_unlock(&delayed_root->lock); 209 210 return node; 211 } 212 213 static struct btrfs_delayed_node *btrfs_next_delayed_node( 214 struct btrfs_delayed_node *node) 215 { 216 struct btrfs_delayed_root *delayed_root; 217 struct list_head *p; 218 struct btrfs_delayed_node *next = NULL; 219 220 delayed_root = node->root->fs_info->delayed_root; 221 spin_lock(&delayed_root->lock); 222 if (!test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { 223 /* not in the list */ 224 if (list_empty(&delayed_root->node_list)) 225 goto out; 226 p = delayed_root->node_list.next; 227 } else if (list_is_last(&node->n_list, &delayed_root->node_list)) 228 goto out; 229 else 230 p = node->n_list.next; 231 232 next = list_entry(p, struct btrfs_delayed_node, n_list); 233 refcount_inc(&next->refs); 234 out: 235 spin_unlock(&delayed_root->lock); 236 237 return next; 238 } 239 240 static void __btrfs_release_delayed_node( 241 struct btrfs_delayed_node *delayed_node, 242 int mod) 243 { 244 struct btrfs_delayed_root *delayed_root; 245 246 if (!delayed_node) 247 return; 248 249 delayed_root = delayed_node->root->fs_info->delayed_root; 250 251 mutex_lock(&delayed_node->mutex); 252 if (delayed_node->count) 253 btrfs_queue_delayed_node(delayed_root, delayed_node, mod); 254 else 255 btrfs_dequeue_delayed_node(delayed_root, delayed_node); 256 mutex_unlock(&delayed_node->mutex); 257 258 if (refcount_dec_and_test(&delayed_node->refs)) { 259 struct btrfs_root *root = delayed_node->root; 260 261 spin_lock(&root->inode_lock); 262 /* 263 * Once our refcount goes to zero, nobody is allowed to bump it 264 * back up. We can delete it now. 265 */ 266 ASSERT(refcount_read(&delayed_node->refs) == 0); 267 radix_tree_delete(&root->delayed_nodes_tree, 268 delayed_node->inode_id); 269 spin_unlock(&root->inode_lock); 270 kmem_cache_free(delayed_node_cache, delayed_node); 271 } 272 } 273 274 static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node) 275 { 276 __btrfs_release_delayed_node(node, 0); 277 } 278 279 static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node( 280 struct btrfs_delayed_root *delayed_root) 281 { 282 struct list_head *p; 283 struct btrfs_delayed_node *node = NULL; 284 285 spin_lock(&delayed_root->lock); 286 if (list_empty(&delayed_root->prepare_list)) 287 goto out; 288 289 p = delayed_root->prepare_list.next; 290 list_del_init(p); 291 node = list_entry(p, struct btrfs_delayed_node, p_list); 292 refcount_inc(&node->refs); 293 out: 294 spin_unlock(&delayed_root->lock); 295 296 return node; 297 } 298 299 static inline void btrfs_release_prepared_delayed_node( 300 struct btrfs_delayed_node *node) 301 { 302 __btrfs_release_delayed_node(node, 1); 303 } 304 305 static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len, 306 struct btrfs_delayed_node *node, 307 enum btrfs_delayed_item_type type) 308 { 309 struct btrfs_delayed_item *item; 310 311 item = kmalloc(sizeof(*item) + data_len, GFP_NOFS); 312 if (item) { 313 item->data_len = data_len; 314 item->type = type; 315 item->bytes_reserved = 0; 316 item->delayed_node = node; 317 RB_CLEAR_NODE(&item->rb_node); 318 INIT_LIST_HEAD(&item->log_list); 319 item->logged = false; 320 refcount_set(&item->refs, 1); 321 } 322 return item; 323 } 324 325 /* 326 * __btrfs_lookup_delayed_item - look up the delayed item by key 327 * @delayed_node: pointer to the delayed node 328 * @index: the dir index value to lookup (offset of a dir index key) 329 * 330 * Note: if we don't find the right item, we will return the prev item and 331 * the next item. 332 */ 333 static struct btrfs_delayed_item *__btrfs_lookup_delayed_item( 334 struct rb_root *root, 335 u64 index) 336 { 337 struct rb_node *node = root->rb_node; 338 struct btrfs_delayed_item *delayed_item = NULL; 339 340 while (node) { 341 delayed_item = rb_entry(node, struct btrfs_delayed_item, 342 rb_node); 343 if (delayed_item->index < index) 344 node = node->rb_right; 345 else if (delayed_item->index > index) 346 node = node->rb_left; 347 else 348 return delayed_item; 349 } 350 351 return NULL; 352 } 353 354 static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, 355 struct btrfs_delayed_item *ins) 356 { 357 struct rb_node **p, *node; 358 struct rb_node *parent_node = NULL; 359 struct rb_root_cached *root; 360 struct btrfs_delayed_item *item; 361 bool leftmost = true; 362 363 if (ins->type == BTRFS_DELAYED_INSERTION_ITEM) 364 root = &delayed_node->ins_root; 365 else 366 root = &delayed_node->del_root; 367 368 p = &root->rb_root.rb_node; 369 node = &ins->rb_node; 370 371 while (*p) { 372 parent_node = *p; 373 item = rb_entry(parent_node, struct btrfs_delayed_item, 374 rb_node); 375 376 if (item->index < ins->index) { 377 p = &(*p)->rb_right; 378 leftmost = false; 379 } else if (item->index > ins->index) { 380 p = &(*p)->rb_left; 381 } else { 382 return -EEXIST; 383 } 384 } 385 386 rb_link_node(node, parent_node, p); 387 rb_insert_color_cached(node, root, leftmost); 388 389 if (ins->type == BTRFS_DELAYED_INSERTION_ITEM && 390 ins->index >= delayed_node->index_cnt) 391 delayed_node->index_cnt = ins->index + 1; 392 393 delayed_node->count++; 394 atomic_inc(&delayed_node->root->fs_info->delayed_root->items); 395 return 0; 396 } 397 398 static void finish_one_item(struct btrfs_delayed_root *delayed_root) 399 { 400 int seq = atomic_inc_return(&delayed_root->items_seq); 401 402 /* atomic_dec_return implies a barrier */ 403 if ((atomic_dec_return(&delayed_root->items) < 404 BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0)) 405 cond_wake_up_nomb(&delayed_root->wait); 406 } 407 408 static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) 409 { 410 struct rb_root_cached *root; 411 struct btrfs_delayed_root *delayed_root; 412 413 /* Not inserted, ignore it. */ 414 if (RB_EMPTY_NODE(&delayed_item->rb_node)) 415 return; 416 417 delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root; 418 419 BUG_ON(!delayed_root); 420 421 if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM) 422 root = &delayed_item->delayed_node->ins_root; 423 else 424 root = &delayed_item->delayed_node->del_root; 425 426 rb_erase_cached(&delayed_item->rb_node, root); 427 RB_CLEAR_NODE(&delayed_item->rb_node); 428 delayed_item->delayed_node->count--; 429 430 finish_one_item(delayed_root); 431 } 432 433 static void btrfs_release_delayed_item(struct btrfs_delayed_item *item) 434 { 435 if (item) { 436 __btrfs_remove_delayed_item(item); 437 if (refcount_dec_and_test(&item->refs)) 438 kfree(item); 439 } 440 } 441 442 static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item( 443 struct btrfs_delayed_node *delayed_node) 444 { 445 struct rb_node *p; 446 struct btrfs_delayed_item *item = NULL; 447 448 p = rb_first_cached(&delayed_node->ins_root); 449 if (p) 450 item = rb_entry(p, struct btrfs_delayed_item, rb_node); 451 452 return item; 453 } 454 455 static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item( 456 struct btrfs_delayed_node *delayed_node) 457 { 458 struct rb_node *p; 459 struct btrfs_delayed_item *item = NULL; 460 461 p = rb_first_cached(&delayed_node->del_root); 462 if (p) 463 item = rb_entry(p, struct btrfs_delayed_item, rb_node); 464 465 return item; 466 } 467 468 static struct btrfs_delayed_item *__btrfs_next_delayed_item( 469 struct btrfs_delayed_item *item) 470 { 471 struct rb_node *p; 472 struct btrfs_delayed_item *next = NULL; 473 474 p = rb_next(&item->rb_node); 475 if (p) 476 next = rb_entry(p, struct btrfs_delayed_item, rb_node); 477 478 return next; 479 } 480 481 static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, 482 struct btrfs_delayed_item *item) 483 { 484 struct btrfs_block_rsv *src_rsv; 485 struct btrfs_block_rsv *dst_rsv; 486 struct btrfs_fs_info *fs_info = trans->fs_info; 487 u64 num_bytes; 488 int ret; 489 490 if (!trans->bytes_reserved) 491 return 0; 492 493 src_rsv = trans->block_rsv; 494 dst_rsv = &fs_info->delayed_block_rsv; 495 496 num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1); 497 498 /* 499 * Here we migrate space rsv from transaction rsv, since have already 500 * reserved space when starting a transaction. So no need to reserve 501 * qgroup space here. 502 */ 503 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true); 504 if (!ret) { 505 trace_btrfs_space_reservation(fs_info, "delayed_item", 506 item->delayed_node->inode_id, 507 num_bytes, 1); 508 /* 509 * For insertions we track reserved metadata space by accounting 510 * for the number of leaves that will be used, based on the delayed 511 * node's index_items_size field. 512 */ 513 if (item->type == BTRFS_DELAYED_DELETION_ITEM) 514 item->bytes_reserved = num_bytes; 515 } 516 517 return ret; 518 } 519 520 static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, 521 struct btrfs_delayed_item *item) 522 { 523 struct btrfs_block_rsv *rsv; 524 struct btrfs_fs_info *fs_info = root->fs_info; 525 526 if (!item->bytes_reserved) 527 return; 528 529 rsv = &fs_info->delayed_block_rsv; 530 /* 531 * Check btrfs_delayed_item_reserve_metadata() to see why we don't need 532 * to release/reserve qgroup space. 533 */ 534 trace_btrfs_space_reservation(fs_info, "delayed_item", 535 item->delayed_node->inode_id, 536 item->bytes_reserved, 0); 537 btrfs_block_rsv_release(fs_info, rsv, item->bytes_reserved, NULL); 538 } 539 540 static void btrfs_delayed_item_release_leaves(struct btrfs_delayed_node *node, 541 unsigned int num_leaves) 542 { 543 struct btrfs_fs_info *fs_info = node->root->fs_info; 544 const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, num_leaves); 545 546 /* There are no space reservations during log replay, bail out. */ 547 if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) 548 return; 549 550 trace_btrfs_space_reservation(fs_info, "delayed_item", node->inode_id, 551 bytes, 0); 552 btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv, bytes, NULL); 553 } 554 555 static int btrfs_delayed_inode_reserve_metadata( 556 struct btrfs_trans_handle *trans, 557 struct btrfs_root *root, 558 struct btrfs_delayed_node *node) 559 { 560 struct btrfs_fs_info *fs_info = root->fs_info; 561 struct btrfs_block_rsv *src_rsv; 562 struct btrfs_block_rsv *dst_rsv; 563 u64 num_bytes; 564 int ret; 565 566 src_rsv = trans->block_rsv; 567 dst_rsv = &fs_info->delayed_block_rsv; 568 569 num_bytes = btrfs_calc_metadata_size(fs_info, 1); 570 571 /* 572 * btrfs_dirty_inode will update the inode under btrfs_join_transaction 573 * which doesn't reserve space for speed. This is a problem since we 574 * still need to reserve space for this update, so try to reserve the 575 * space. 576 * 577 * Now if src_rsv == delalloc_block_rsv we'll let it just steal since 578 * we always reserve enough to update the inode item. 579 */ 580 if (!src_rsv || (!trans->bytes_reserved && 581 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { 582 ret = btrfs_qgroup_reserve_meta(root, num_bytes, 583 BTRFS_QGROUP_RSV_META_PREALLOC, true); 584 if (ret < 0) 585 return ret; 586 ret = btrfs_block_rsv_add(fs_info, dst_rsv, num_bytes, 587 BTRFS_RESERVE_NO_FLUSH); 588 /* NO_FLUSH could only fail with -ENOSPC */ 589 ASSERT(ret == 0 || ret == -ENOSPC); 590 if (ret) 591 btrfs_qgroup_free_meta_prealloc(root, num_bytes); 592 } else { 593 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true); 594 } 595 596 if (!ret) { 597 trace_btrfs_space_reservation(fs_info, "delayed_inode", 598 node->inode_id, num_bytes, 1); 599 node->bytes_reserved = num_bytes; 600 } 601 602 return ret; 603 } 604 605 static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info, 606 struct btrfs_delayed_node *node, 607 bool qgroup_free) 608 { 609 struct btrfs_block_rsv *rsv; 610 611 if (!node->bytes_reserved) 612 return; 613 614 rsv = &fs_info->delayed_block_rsv; 615 trace_btrfs_space_reservation(fs_info, "delayed_inode", 616 node->inode_id, node->bytes_reserved, 0); 617 btrfs_block_rsv_release(fs_info, rsv, node->bytes_reserved, NULL); 618 if (qgroup_free) 619 btrfs_qgroup_free_meta_prealloc(node->root, 620 node->bytes_reserved); 621 else 622 btrfs_qgroup_convert_reserved_meta(node->root, 623 node->bytes_reserved); 624 node->bytes_reserved = 0; 625 } 626 627 /* 628 * Insert a single delayed item or a batch of delayed items, as many as possible 629 * that fit in a leaf. The delayed items (dir index keys) are sorted by their key 630 * in the rbtree, and if there's a gap between two consecutive dir index items, 631 * then it means at some point we had delayed dir indexes to add but they got 632 * removed (by btrfs_delete_delayed_dir_index()) before we attempted to flush them 633 * into the subvolume tree. Dir index keys also have their offsets coming from a 634 * monotonically increasing counter, so we can't get new keys with an offset that 635 * fits within a gap between delayed dir index items. 636 */ 637 static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, 638 struct btrfs_root *root, 639 struct btrfs_path *path, 640 struct btrfs_delayed_item *first_item) 641 { 642 struct btrfs_fs_info *fs_info = root->fs_info; 643 struct btrfs_delayed_node *node = first_item->delayed_node; 644 LIST_HEAD(item_list); 645 struct btrfs_delayed_item *curr; 646 struct btrfs_delayed_item *next; 647 const int max_size = BTRFS_LEAF_DATA_SIZE(fs_info); 648 struct btrfs_item_batch batch; 649 struct btrfs_key first_key; 650 const u32 first_data_size = first_item->data_len; 651 int total_size; 652 char *ins_data = NULL; 653 int ret; 654 bool continuous_keys_only = false; 655 656 lockdep_assert_held(&node->mutex); 657 658 /* 659 * During normal operation the delayed index offset is continuously 660 * increasing, so we can batch insert all items as there will not be any 661 * overlapping keys in the tree. 662 * 663 * The exception to this is log replay, where we may have interleaved 664 * offsets in the tree, so our batch needs to be continuous keys only in 665 * order to ensure we do not end up with out of order items in our leaf. 666 */ 667 if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) 668 continuous_keys_only = true; 669 670 /* 671 * For delayed items to insert, we track reserved metadata bytes based 672 * on the number of leaves that we will use. 673 * See btrfs_insert_delayed_dir_index() and 674 * btrfs_delayed_item_reserve_metadata()). 675 */ 676 ASSERT(first_item->bytes_reserved == 0); 677 678 list_add_tail(&first_item->tree_list, &item_list); 679 batch.total_data_size = first_data_size; 680 batch.nr = 1; 681 total_size = first_data_size + sizeof(struct btrfs_item); 682 curr = first_item; 683 684 while (true) { 685 int next_size; 686 687 next = __btrfs_next_delayed_item(curr); 688 if (!next) 689 break; 690 691 /* 692 * We cannot allow gaps in the key space if we're doing log 693 * replay. 694 */ 695 if (continuous_keys_only && (next->index != curr->index + 1)) 696 break; 697 698 ASSERT(next->bytes_reserved == 0); 699 700 next_size = next->data_len + sizeof(struct btrfs_item); 701 if (total_size + next_size > max_size) 702 break; 703 704 list_add_tail(&next->tree_list, &item_list); 705 batch.nr++; 706 total_size += next_size; 707 batch.total_data_size += next->data_len; 708 curr = next; 709 } 710 711 if (batch.nr == 1) { 712 first_key.objectid = node->inode_id; 713 first_key.type = BTRFS_DIR_INDEX_KEY; 714 first_key.offset = first_item->index; 715 batch.keys = &first_key; 716 batch.data_sizes = &first_data_size; 717 } else { 718 struct btrfs_key *ins_keys; 719 u32 *ins_sizes; 720 int i = 0; 721 722 ins_data = kmalloc(batch.nr * sizeof(u32) + 723 batch.nr * sizeof(struct btrfs_key), GFP_NOFS); 724 if (!ins_data) { 725 ret = -ENOMEM; 726 goto out; 727 } 728 ins_sizes = (u32 *)ins_data; 729 ins_keys = (struct btrfs_key *)(ins_data + batch.nr * sizeof(u32)); 730 batch.keys = ins_keys; 731 batch.data_sizes = ins_sizes; 732 list_for_each_entry(curr, &item_list, tree_list) { 733 ins_keys[i].objectid = node->inode_id; 734 ins_keys[i].type = BTRFS_DIR_INDEX_KEY; 735 ins_keys[i].offset = curr->index; 736 ins_sizes[i] = curr->data_len; 737 i++; 738 } 739 } 740 741 ret = btrfs_insert_empty_items(trans, root, path, &batch); 742 if (ret) 743 goto out; 744 745 list_for_each_entry(curr, &item_list, tree_list) { 746 char *data_ptr; 747 748 data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char); 749 write_extent_buffer(path->nodes[0], &curr->data, 750 (unsigned long)data_ptr, curr->data_len); 751 path->slots[0]++; 752 } 753 754 /* 755 * Now release our path before releasing the delayed items and their 756 * metadata reservations, so that we don't block other tasks for more 757 * time than needed. 758 */ 759 btrfs_release_path(path); 760 761 ASSERT(node->index_item_leaves > 0); 762 763 /* 764 * For normal operations we will batch an entire leaf's worth of delayed 765 * items, so if there are more items to process we can decrement 766 * index_item_leaves by 1 as we inserted 1 leaf's worth of items. 767 * 768 * However for log replay we may not have inserted an entire leaf's 769 * worth of items, we may have not had continuous items, so decrementing 770 * here would mess up the index_item_leaves accounting. For this case 771 * only clean up the accounting when there are no items left. 772 */ 773 if (next && !continuous_keys_only) { 774 /* 775 * We inserted one batch of items into a leaf a there are more 776 * items to flush in a future batch, now release one unit of 777 * metadata space from the delayed block reserve, corresponding 778 * the leaf we just flushed to. 779 */ 780 btrfs_delayed_item_release_leaves(node, 1); 781 node->index_item_leaves--; 782 } else if (!next) { 783 /* 784 * There are no more items to insert. We can have a number of 785 * reserved leaves > 1 here - this happens when many dir index 786 * items are added and then removed before they are flushed (file 787 * names with a very short life, never span a transaction). So 788 * release all remaining leaves. 789 */ 790 btrfs_delayed_item_release_leaves(node, node->index_item_leaves); 791 node->index_item_leaves = 0; 792 } 793 794 list_for_each_entry_safe(curr, next, &item_list, tree_list) { 795 list_del(&curr->tree_list); 796 btrfs_release_delayed_item(curr); 797 } 798 out: 799 kfree(ins_data); 800 return ret; 801 } 802 803 static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans, 804 struct btrfs_path *path, 805 struct btrfs_root *root, 806 struct btrfs_delayed_node *node) 807 { 808 int ret = 0; 809 810 while (ret == 0) { 811 struct btrfs_delayed_item *curr; 812 813 mutex_lock(&node->mutex); 814 curr = __btrfs_first_delayed_insertion_item(node); 815 if (!curr) { 816 mutex_unlock(&node->mutex); 817 break; 818 } 819 ret = btrfs_insert_delayed_item(trans, root, path, curr); 820 mutex_unlock(&node->mutex); 821 } 822 823 return ret; 824 } 825 826 static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans, 827 struct btrfs_root *root, 828 struct btrfs_path *path, 829 struct btrfs_delayed_item *item) 830 { 831 const u64 ino = item->delayed_node->inode_id; 832 struct btrfs_fs_info *fs_info = root->fs_info; 833 struct btrfs_delayed_item *curr, *next; 834 struct extent_buffer *leaf = path->nodes[0]; 835 LIST_HEAD(batch_list); 836 int nitems, slot, last_slot; 837 int ret; 838 u64 total_reserved_size = item->bytes_reserved; 839 840 ASSERT(leaf != NULL); 841 842 slot = path->slots[0]; 843 last_slot = btrfs_header_nritems(leaf) - 1; 844 /* 845 * Our caller always gives us a path pointing to an existing item, so 846 * this can not happen. 847 */ 848 ASSERT(slot <= last_slot); 849 if (WARN_ON(slot > last_slot)) 850 return -ENOENT; 851 852 nitems = 1; 853 curr = item; 854 list_add_tail(&curr->tree_list, &batch_list); 855 856 /* 857 * Keep checking if the next delayed item matches the next item in the 858 * leaf - if so, we can add it to the batch of items to delete from the 859 * leaf. 860 */ 861 while (slot < last_slot) { 862 struct btrfs_key key; 863 864 next = __btrfs_next_delayed_item(curr); 865 if (!next) 866 break; 867 868 slot++; 869 btrfs_item_key_to_cpu(leaf, &key, slot); 870 if (key.objectid != ino || 871 key.type != BTRFS_DIR_INDEX_KEY || 872 key.offset != next->index) 873 break; 874 nitems++; 875 curr = next; 876 list_add_tail(&curr->tree_list, &batch_list); 877 total_reserved_size += curr->bytes_reserved; 878 } 879 880 ret = btrfs_del_items(trans, root, path, path->slots[0], nitems); 881 if (ret) 882 return ret; 883 884 /* In case of BTRFS_FS_LOG_RECOVERING items won't have reserved space */ 885 if (total_reserved_size > 0) { 886 /* 887 * Check btrfs_delayed_item_reserve_metadata() to see why we 888 * don't need to release/reserve qgroup space. 889 */ 890 trace_btrfs_space_reservation(fs_info, "delayed_item", ino, 891 total_reserved_size, 0); 892 btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv, 893 total_reserved_size, NULL); 894 } 895 896 list_for_each_entry_safe(curr, next, &batch_list, tree_list) { 897 list_del(&curr->tree_list); 898 btrfs_release_delayed_item(curr); 899 } 900 901 return 0; 902 } 903 904 static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans, 905 struct btrfs_path *path, 906 struct btrfs_root *root, 907 struct btrfs_delayed_node *node) 908 { 909 struct btrfs_key key; 910 int ret = 0; 911 912 key.objectid = node->inode_id; 913 key.type = BTRFS_DIR_INDEX_KEY; 914 915 while (ret == 0) { 916 struct btrfs_delayed_item *item; 917 918 mutex_lock(&node->mutex); 919 item = __btrfs_first_delayed_deletion_item(node); 920 if (!item) { 921 mutex_unlock(&node->mutex); 922 break; 923 } 924 925 key.offset = item->index; 926 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 927 if (ret > 0) { 928 /* 929 * There's no matching item in the leaf. This means we 930 * have already deleted this item in a past run of the 931 * delayed items. We ignore errors when running delayed 932 * items from an async context, through a work queue job 933 * running btrfs_async_run_delayed_root(), and don't 934 * release delayed items that failed to complete. This 935 * is because we will retry later, and at transaction 936 * commit time we always run delayed items and will 937 * then deal with errors if they fail to run again. 938 * 939 * So just release delayed items for which we can't find 940 * an item in the tree, and move to the next item. 941 */ 942 btrfs_release_path(path); 943 btrfs_release_delayed_item(item); 944 ret = 0; 945 } else if (ret == 0) { 946 ret = btrfs_batch_delete_items(trans, root, path, item); 947 btrfs_release_path(path); 948 } 949 950 /* 951 * We unlock and relock on each iteration, this is to prevent 952 * blocking other tasks for too long while we are being run from 953 * the async context (work queue job). Those tasks are typically 954 * running system calls like creat/mkdir/rename/unlink/etc which 955 * need to add delayed items to this delayed node. 956 */ 957 mutex_unlock(&node->mutex); 958 } 959 960 return ret; 961 } 962 963 static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) 964 { 965 struct btrfs_delayed_root *delayed_root; 966 967 if (delayed_node && 968 test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { 969 BUG_ON(!delayed_node->root); 970 clear_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); 971 delayed_node->count--; 972 973 delayed_root = delayed_node->root->fs_info->delayed_root; 974 finish_one_item(delayed_root); 975 } 976 } 977 978 static void btrfs_release_delayed_iref(struct btrfs_delayed_node *delayed_node) 979 { 980 981 if (test_and_clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) { 982 struct btrfs_delayed_root *delayed_root; 983 984 ASSERT(delayed_node->root); 985 delayed_node->count--; 986 987 delayed_root = delayed_node->root->fs_info->delayed_root; 988 finish_one_item(delayed_root); 989 } 990 } 991 992 static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, 993 struct btrfs_root *root, 994 struct btrfs_path *path, 995 struct btrfs_delayed_node *node) 996 { 997 struct btrfs_fs_info *fs_info = root->fs_info; 998 struct btrfs_key key; 999 struct btrfs_inode_item *inode_item; 1000 struct extent_buffer *leaf; 1001 int mod; 1002 int ret; 1003 1004 key.objectid = node->inode_id; 1005 key.type = BTRFS_INODE_ITEM_KEY; 1006 key.offset = 0; 1007 1008 if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) 1009 mod = -1; 1010 else 1011 mod = 1; 1012 1013 ret = btrfs_lookup_inode(trans, root, path, &key, mod); 1014 if (ret > 0) 1015 ret = -ENOENT; 1016 if (ret < 0) 1017 goto out; 1018 1019 leaf = path->nodes[0]; 1020 inode_item = btrfs_item_ptr(leaf, path->slots[0], 1021 struct btrfs_inode_item); 1022 write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item, 1023 sizeof(struct btrfs_inode_item)); 1024 btrfs_mark_buffer_dirty(leaf); 1025 1026 if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) 1027 goto out; 1028 1029 path->slots[0]++; 1030 if (path->slots[0] >= btrfs_header_nritems(leaf)) 1031 goto search; 1032 again: 1033 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1034 if (key.objectid != node->inode_id) 1035 goto out; 1036 1037 if (key.type != BTRFS_INODE_REF_KEY && 1038 key.type != BTRFS_INODE_EXTREF_KEY) 1039 goto out; 1040 1041 /* 1042 * Delayed iref deletion is for the inode who has only one link, 1043 * so there is only one iref. The case that several irefs are 1044 * in the same item doesn't exist. 1045 */ 1046 btrfs_del_item(trans, root, path); 1047 out: 1048 btrfs_release_delayed_iref(node); 1049 btrfs_release_path(path); 1050 err_out: 1051 btrfs_delayed_inode_release_metadata(fs_info, node, (ret < 0)); 1052 btrfs_release_delayed_inode(node); 1053 1054 /* 1055 * If we fail to update the delayed inode we need to abort the 1056 * transaction, because we could leave the inode with the improper 1057 * counts behind. 1058 */ 1059 if (ret && ret != -ENOENT) 1060 btrfs_abort_transaction(trans, ret); 1061 1062 return ret; 1063 1064 search: 1065 btrfs_release_path(path); 1066 1067 key.type = BTRFS_INODE_EXTREF_KEY; 1068 key.offset = -1; 1069 1070 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1071 if (ret < 0) 1072 goto err_out; 1073 ASSERT(ret); 1074 1075 ret = 0; 1076 leaf = path->nodes[0]; 1077 path->slots[0]--; 1078 goto again; 1079 } 1080 1081 static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, 1082 struct btrfs_root *root, 1083 struct btrfs_path *path, 1084 struct btrfs_delayed_node *node) 1085 { 1086 int ret; 1087 1088 mutex_lock(&node->mutex); 1089 if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &node->flags)) { 1090 mutex_unlock(&node->mutex); 1091 return 0; 1092 } 1093 1094 ret = __btrfs_update_delayed_inode(trans, root, path, node); 1095 mutex_unlock(&node->mutex); 1096 return ret; 1097 } 1098 1099 static inline int 1100 __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, 1101 struct btrfs_path *path, 1102 struct btrfs_delayed_node *node) 1103 { 1104 int ret; 1105 1106 ret = btrfs_insert_delayed_items(trans, path, node->root, node); 1107 if (ret) 1108 return ret; 1109 1110 ret = btrfs_delete_delayed_items(trans, path, node->root, node); 1111 if (ret) 1112 return ret; 1113 1114 ret = btrfs_update_delayed_inode(trans, node->root, path, node); 1115 return ret; 1116 } 1117 1118 /* 1119 * Called when committing the transaction. 1120 * Returns 0 on success. 1121 * Returns < 0 on error and returns with an aborted transaction with any 1122 * outstanding delayed items cleaned up. 1123 */ 1124 static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) 1125 { 1126 struct btrfs_fs_info *fs_info = trans->fs_info; 1127 struct btrfs_delayed_root *delayed_root; 1128 struct btrfs_delayed_node *curr_node, *prev_node; 1129 struct btrfs_path *path; 1130 struct btrfs_block_rsv *block_rsv; 1131 int ret = 0; 1132 bool count = (nr > 0); 1133 1134 if (TRANS_ABORTED(trans)) 1135 return -EIO; 1136 1137 path = btrfs_alloc_path(); 1138 if (!path) 1139 return -ENOMEM; 1140 1141 block_rsv = trans->block_rsv; 1142 trans->block_rsv = &fs_info->delayed_block_rsv; 1143 1144 delayed_root = fs_info->delayed_root; 1145 1146 curr_node = btrfs_first_delayed_node(delayed_root); 1147 while (curr_node && (!count || nr--)) { 1148 ret = __btrfs_commit_inode_delayed_items(trans, path, 1149 curr_node); 1150 if (ret) { 1151 btrfs_release_delayed_node(curr_node); 1152 curr_node = NULL; 1153 btrfs_abort_transaction(trans, ret); 1154 break; 1155 } 1156 1157 prev_node = curr_node; 1158 curr_node = btrfs_next_delayed_node(curr_node); 1159 btrfs_release_delayed_node(prev_node); 1160 } 1161 1162 if (curr_node) 1163 btrfs_release_delayed_node(curr_node); 1164 btrfs_free_path(path); 1165 trans->block_rsv = block_rsv; 1166 1167 return ret; 1168 } 1169 1170 int btrfs_run_delayed_items(struct btrfs_trans_handle *trans) 1171 { 1172 return __btrfs_run_delayed_items(trans, -1); 1173 } 1174 1175 int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, int nr) 1176 { 1177 return __btrfs_run_delayed_items(trans, nr); 1178 } 1179 1180 int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, 1181 struct btrfs_inode *inode) 1182 { 1183 struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); 1184 struct btrfs_path *path; 1185 struct btrfs_block_rsv *block_rsv; 1186 int ret; 1187 1188 if (!delayed_node) 1189 return 0; 1190 1191 mutex_lock(&delayed_node->mutex); 1192 if (!delayed_node->count) { 1193 mutex_unlock(&delayed_node->mutex); 1194 btrfs_release_delayed_node(delayed_node); 1195 return 0; 1196 } 1197 mutex_unlock(&delayed_node->mutex); 1198 1199 path = btrfs_alloc_path(); 1200 if (!path) { 1201 btrfs_release_delayed_node(delayed_node); 1202 return -ENOMEM; 1203 } 1204 1205 block_rsv = trans->block_rsv; 1206 trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv; 1207 1208 ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node); 1209 1210 btrfs_release_delayed_node(delayed_node); 1211 btrfs_free_path(path); 1212 trans->block_rsv = block_rsv; 1213 1214 return ret; 1215 } 1216 1217 int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode) 1218 { 1219 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1220 struct btrfs_trans_handle *trans; 1221 struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); 1222 struct btrfs_path *path; 1223 struct btrfs_block_rsv *block_rsv; 1224 int ret; 1225 1226 if (!delayed_node) 1227 return 0; 1228 1229 mutex_lock(&delayed_node->mutex); 1230 if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { 1231 mutex_unlock(&delayed_node->mutex); 1232 btrfs_release_delayed_node(delayed_node); 1233 return 0; 1234 } 1235 mutex_unlock(&delayed_node->mutex); 1236 1237 trans = btrfs_join_transaction(delayed_node->root); 1238 if (IS_ERR(trans)) { 1239 ret = PTR_ERR(trans); 1240 goto out; 1241 } 1242 1243 path = btrfs_alloc_path(); 1244 if (!path) { 1245 ret = -ENOMEM; 1246 goto trans_out; 1247 } 1248 1249 block_rsv = trans->block_rsv; 1250 trans->block_rsv = &fs_info->delayed_block_rsv; 1251 1252 mutex_lock(&delayed_node->mutex); 1253 if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) 1254 ret = __btrfs_update_delayed_inode(trans, delayed_node->root, 1255 path, delayed_node); 1256 else 1257 ret = 0; 1258 mutex_unlock(&delayed_node->mutex); 1259 1260 btrfs_free_path(path); 1261 trans->block_rsv = block_rsv; 1262 trans_out: 1263 btrfs_end_transaction(trans); 1264 btrfs_btree_balance_dirty(fs_info); 1265 out: 1266 btrfs_release_delayed_node(delayed_node); 1267 1268 return ret; 1269 } 1270 1271 void btrfs_remove_delayed_node(struct btrfs_inode *inode) 1272 { 1273 struct btrfs_delayed_node *delayed_node; 1274 1275 delayed_node = READ_ONCE(inode->delayed_node); 1276 if (!delayed_node) 1277 return; 1278 1279 inode->delayed_node = NULL; 1280 btrfs_release_delayed_node(delayed_node); 1281 } 1282 1283 struct btrfs_async_delayed_work { 1284 struct btrfs_delayed_root *delayed_root; 1285 int nr; 1286 struct btrfs_work work; 1287 }; 1288 1289 static void btrfs_async_run_delayed_root(struct btrfs_work *work) 1290 { 1291 struct btrfs_async_delayed_work *async_work; 1292 struct btrfs_delayed_root *delayed_root; 1293 struct btrfs_trans_handle *trans; 1294 struct btrfs_path *path; 1295 struct btrfs_delayed_node *delayed_node = NULL; 1296 struct btrfs_root *root; 1297 struct btrfs_block_rsv *block_rsv; 1298 int total_done = 0; 1299 1300 async_work = container_of(work, struct btrfs_async_delayed_work, work); 1301 delayed_root = async_work->delayed_root; 1302 1303 path = btrfs_alloc_path(); 1304 if (!path) 1305 goto out; 1306 1307 do { 1308 if (atomic_read(&delayed_root->items) < 1309 BTRFS_DELAYED_BACKGROUND / 2) 1310 break; 1311 1312 delayed_node = btrfs_first_prepared_delayed_node(delayed_root); 1313 if (!delayed_node) 1314 break; 1315 1316 root = delayed_node->root; 1317 1318 trans = btrfs_join_transaction(root); 1319 if (IS_ERR(trans)) { 1320 btrfs_release_path(path); 1321 btrfs_release_prepared_delayed_node(delayed_node); 1322 total_done++; 1323 continue; 1324 } 1325 1326 block_rsv = trans->block_rsv; 1327 trans->block_rsv = &root->fs_info->delayed_block_rsv; 1328 1329 __btrfs_commit_inode_delayed_items(trans, path, delayed_node); 1330 1331 trans->block_rsv = block_rsv; 1332 btrfs_end_transaction(trans); 1333 btrfs_btree_balance_dirty_nodelay(root->fs_info); 1334 1335 btrfs_release_path(path); 1336 btrfs_release_prepared_delayed_node(delayed_node); 1337 total_done++; 1338 1339 } while ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK) 1340 || total_done < async_work->nr); 1341 1342 btrfs_free_path(path); 1343 out: 1344 wake_up(&delayed_root->wait); 1345 kfree(async_work); 1346 } 1347 1348 1349 static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, 1350 struct btrfs_fs_info *fs_info, int nr) 1351 { 1352 struct btrfs_async_delayed_work *async_work; 1353 1354 async_work = kmalloc(sizeof(*async_work), GFP_NOFS); 1355 if (!async_work) 1356 return -ENOMEM; 1357 1358 async_work->delayed_root = delayed_root; 1359 btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, NULL, 1360 NULL); 1361 async_work->nr = nr; 1362 1363 btrfs_queue_work(fs_info->delayed_workers, &async_work->work); 1364 return 0; 1365 } 1366 1367 void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info) 1368 { 1369 WARN_ON(btrfs_first_delayed_node(fs_info->delayed_root)); 1370 } 1371 1372 static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq) 1373 { 1374 int val = atomic_read(&delayed_root->items_seq); 1375 1376 if (val < seq || val >= seq + BTRFS_DELAYED_BATCH) 1377 return 1; 1378 1379 if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) 1380 return 1; 1381 1382 return 0; 1383 } 1384 1385 void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info) 1386 { 1387 struct btrfs_delayed_root *delayed_root = fs_info->delayed_root; 1388 1389 if ((atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) || 1390 btrfs_workqueue_normal_congested(fs_info->delayed_workers)) 1391 return; 1392 1393 if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) { 1394 int seq; 1395 int ret; 1396 1397 seq = atomic_read(&delayed_root->items_seq); 1398 1399 ret = btrfs_wq_run_delayed_node(delayed_root, fs_info, 0); 1400 if (ret) 1401 return; 1402 1403 wait_event_interruptible(delayed_root->wait, 1404 could_end_wait(delayed_root, seq)); 1405 return; 1406 } 1407 1408 btrfs_wq_run_delayed_node(delayed_root, fs_info, BTRFS_DELAYED_BATCH); 1409 } 1410 1411 /* Will return 0 or -ENOMEM */ 1412 int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, 1413 const char *name, int name_len, 1414 struct btrfs_inode *dir, 1415 struct btrfs_disk_key *disk_key, u8 type, 1416 u64 index) 1417 { 1418 struct btrfs_fs_info *fs_info = trans->fs_info; 1419 const unsigned int leaf_data_size = BTRFS_LEAF_DATA_SIZE(fs_info); 1420 struct btrfs_delayed_node *delayed_node; 1421 struct btrfs_delayed_item *delayed_item; 1422 struct btrfs_dir_item *dir_item; 1423 bool reserve_leaf_space; 1424 u32 data_len; 1425 int ret; 1426 1427 delayed_node = btrfs_get_or_create_delayed_node(dir); 1428 if (IS_ERR(delayed_node)) 1429 return PTR_ERR(delayed_node); 1430 1431 delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len, 1432 delayed_node, 1433 BTRFS_DELAYED_INSERTION_ITEM); 1434 if (!delayed_item) { 1435 ret = -ENOMEM; 1436 goto release_node; 1437 } 1438 1439 delayed_item->index = index; 1440 1441 dir_item = (struct btrfs_dir_item *)delayed_item->data; 1442 dir_item->location = *disk_key; 1443 btrfs_set_stack_dir_transid(dir_item, trans->transid); 1444 btrfs_set_stack_dir_data_len(dir_item, 0); 1445 btrfs_set_stack_dir_name_len(dir_item, name_len); 1446 btrfs_set_stack_dir_type(dir_item, type); 1447 memcpy((char *)(dir_item + 1), name, name_len); 1448 1449 data_len = delayed_item->data_len + sizeof(struct btrfs_item); 1450 1451 mutex_lock(&delayed_node->mutex); 1452 1453 if (delayed_node->index_item_leaves == 0 || 1454 delayed_node->curr_index_batch_size + data_len > leaf_data_size) { 1455 delayed_node->curr_index_batch_size = data_len; 1456 reserve_leaf_space = true; 1457 } else { 1458 delayed_node->curr_index_batch_size += data_len; 1459 reserve_leaf_space = false; 1460 } 1461 1462 if (reserve_leaf_space) { 1463 ret = btrfs_delayed_item_reserve_metadata(trans, delayed_item); 1464 /* 1465 * Space was reserved for a dir index item insertion when we 1466 * started the transaction, so getting a failure here should be 1467 * impossible. 1468 */ 1469 if (WARN_ON(ret)) { 1470 mutex_unlock(&delayed_node->mutex); 1471 btrfs_release_delayed_item(delayed_item); 1472 goto release_node; 1473 } 1474 1475 delayed_node->index_item_leaves++; 1476 } else if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { 1477 const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1); 1478 1479 /* 1480 * Adding the new dir index item does not require touching another 1481 * leaf, so we can release 1 unit of metadata that was previously 1482 * reserved when starting the transaction. This applies only to 1483 * the case where we had a transaction start and excludes the 1484 * transaction join case (when replaying log trees). 1485 */ 1486 trace_btrfs_space_reservation(fs_info, "transaction", 1487 trans->transid, bytes, 0); 1488 btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL); 1489 ASSERT(trans->bytes_reserved >= bytes); 1490 trans->bytes_reserved -= bytes; 1491 } 1492 1493 ret = __btrfs_add_delayed_item(delayed_node, delayed_item); 1494 if (unlikely(ret)) { 1495 btrfs_err(trans->fs_info, 1496 "err add delayed dir index item(name: %.*s) into the insertion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", 1497 name_len, name, delayed_node->root->root_key.objectid, 1498 delayed_node->inode_id, ret); 1499 BUG(); 1500 } 1501 mutex_unlock(&delayed_node->mutex); 1502 1503 release_node: 1504 btrfs_release_delayed_node(delayed_node); 1505 return ret; 1506 } 1507 1508 static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info, 1509 struct btrfs_delayed_node *node, 1510 u64 index) 1511 { 1512 struct btrfs_delayed_item *item; 1513 1514 mutex_lock(&node->mutex); 1515 item = __btrfs_lookup_delayed_item(&node->ins_root.rb_root, index); 1516 if (!item) { 1517 mutex_unlock(&node->mutex); 1518 return 1; 1519 } 1520 1521 /* 1522 * For delayed items to insert, we track reserved metadata bytes based 1523 * on the number of leaves that we will use. 1524 * See btrfs_insert_delayed_dir_index() and 1525 * btrfs_delayed_item_reserve_metadata()). 1526 */ 1527 ASSERT(item->bytes_reserved == 0); 1528 ASSERT(node->index_item_leaves > 0); 1529 1530 /* 1531 * If there's only one leaf reserved, we can decrement this item from the 1532 * current batch, otherwise we can not because we don't know which leaf 1533 * it belongs to. With the current limit on delayed items, we rarely 1534 * accumulate enough dir index items to fill more than one leaf (even 1535 * when using a leaf size of 4K). 1536 */ 1537 if (node->index_item_leaves == 1) { 1538 const u32 data_len = item->data_len + sizeof(struct btrfs_item); 1539 1540 ASSERT(node->curr_index_batch_size >= data_len); 1541 node->curr_index_batch_size -= data_len; 1542 } 1543 1544 btrfs_release_delayed_item(item); 1545 1546 /* If we now have no more dir index items, we can release all leaves. */ 1547 if (RB_EMPTY_ROOT(&node->ins_root.rb_root)) { 1548 btrfs_delayed_item_release_leaves(node, node->index_item_leaves); 1549 node->index_item_leaves = 0; 1550 } 1551 1552 mutex_unlock(&node->mutex); 1553 return 0; 1554 } 1555 1556 int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, 1557 struct btrfs_inode *dir, u64 index) 1558 { 1559 struct btrfs_delayed_node *node; 1560 struct btrfs_delayed_item *item; 1561 int ret; 1562 1563 node = btrfs_get_or_create_delayed_node(dir); 1564 if (IS_ERR(node)) 1565 return PTR_ERR(node); 1566 1567 ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node, index); 1568 if (!ret) 1569 goto end; 1570 1571 item = btrfs_alloc_delayed_item(0, node, BTRFS_DELAYED_DELETION_ITEM); 1572 if (!item) { 1573 ret = -ENOMEM; 1574 goto end; 1575 } 1576 1577 item->index = index; 1578 1579 ret = btrfs_delayed_item_reserve_metadata(trans, item); 1580 /* 1581 * we have reserved enough space when we start a new transaction, 1582 * so reserving metadata failure is impossible. 1583 */ 1584 if (ret < 0) { 1585 btrfs_err(trans->fs_info, 1586 "metadata reservation failed for delayed dir item deltiona, should have been reserved"); 1587 btrfs_release_delayed_item(item); 1588 goto end; 1589 } 1590 1591 mutex_lock(&node->mutex); 1592 ret = __btrfs_add_delayed_item(node, item); 1593 if (unlikely(ret)) { 1594 btrfs_err(trans->fs_info, 1595 "err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", 1596 index, node->root->root_key.objectid, 1597 node->inode_id, ret); 1598 btrfs_delayed_item_release_metadata(dir->root, item); 1599 btrfs_release_delayed_item(item); 1600 } 1601 mutex_unlock(&node->mutex); 1602 end: 1603 btrfs_release_delayed_node(node); 1604 return ret; 1605 } 1606 1607 int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode) 1608 { 1609 struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); 1610 1611 if (!delayed_node) 1612 return -ENOENT; 1613 1614 /* 1615 * Since we have held i_mutex of this directory, it is impossible that 1616 * a new directory index is added into the delayed node and index_cnt 1617 * is updated now. So we needn't lock the delayed node. 1618 */ 1619 if (!delayed_node->index_cnt) { 1620 btrfs_release_delayed_node(delayed_node); 1621 return -EINVAL; 1622 } 1623 1624 inode->index_cnt = delayed_node->index_cnt; 1625 btrfs_release_delayed_node(delayed_node); 1626 return 0; 1627 } 1628 1629 bool btrfs_readdir_get_delayed_items(struct inode *inode, 1630 struct list_head *ins_list, 1631 struct list_head *del_list) 1632 { 1633 struct btrfs_delayed_node *delayed_node; 1634 struct btrfs_delayed_item *item; 1635 1636 delayed_node = btrfs_get_delayed_node(BTRFS_I(inode)); 1637 if (!delayed_node) 1638 return false; 1639 1640 /* 1641 * We can only do one readdir with delayed items at a time because of 1642 * item->readdir_list. 1643 */ 1644 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); 1645 btrfs_inode_lock(inode, 0); 1646 1647 mutex_lock(&delayed_node->mutex); 1648 item = __btrfs_first_delayed_insertion_item(delayed_node); 1649 while (item) { 1650 refcount_inc(&item->refs); 1651 list_add_tail(&item->readdir_list, ins_list); 1652 item = __btrfs_next_delayed_item(item); 1653 } 1654 1655 item = __btrfs_first_delayed_deletion_item(delayed_node); 1656 while (item) { 1657 refcount_inc(&item->refs); 1658 list_add_tail(&item->readdir_list, del_list); 1659 item = __btrfs_next_delayed_item(item); 1660 } 1661 mutex_unlock(&delayed_node->mutex); 1662 /* 1663 * This delayed node is still cached in the btrfs inode, so refs 1664 * must be > 1 now, and we needn't check it is going to be freed 1665 * or not. 1666 * 1667 * Besides that, this function is used to read dir, we do not 1668 * insert/delete delayed items in this period. So we also needn't 1669 * requeue or dequeue this delayed node. 1670 */ 1671 refcount_dec(&delayed_node->refs); 1672 1673 return true; 1674 } 1675 1676 void btrfs_readdir_put_delayed_items(struct inode *inode, 1677 struct list_head *ins_list, 1678 struct list_head *del_list) 1679 { 1680 struct btrfs_delayed_item *curr, *next; 1681 1682 list_for_each_entry_safe(curr, next, ins_list, readdir_list) { 1683 list_del(&curr->readdir_list); 1684 if (refcount_dec_and_test(&curr->refs)) 1685 kfree(curr); 1686 } 1687 1688 list_for_each_entry_safe(curr, next, del_list, readdir_list) { 1689 list_del(&curr->readdir_list); 1690 if (refcount_dec_and_test(&curr->refs)) 1691 kfree(curr); 1692 } 1693 1694 /* 1695 * The VFS is going to do up_read(), so we need to downgrade back to a 1696 * read lock. 1697 */ 1698 downgrade_write(&inode->i_rwsem); 1699 } 1700 1701 int btrfs_should_delete_dir_index(struct list_head *del_list, 1702 u64 index) 1703 { 1704 struct btrfs_delayed_item *curr; 1705 int ret = 0; 1706 1707 list_for_each_entry(curr, del_list, readdir_list) { 1708 if (curr->index > index) 1709 break; 1710 if (curr->index == index) { 1711 ret = 1; 1712 break; 1713 } 1714 } 1715 return ret; 1716 } 1717 1718 /* 1719 * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree 1720 * 1721 */ 1722 int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, 1723 struct list_head *ins_list) 1724 { 1725 struct btrfs_dir_item *di; 1726 struct btrfs_delayed_item *curr, *next; 1727 struct btrfs_key location; 1728 char *name; 1729 int name_len; 1730 int over = 0; 1731 unsigned char d_type; 1732 1733 if (list_empty(ins_list)) 1734 return 0; 1735 1736 /* 1737 * Changing the data of the delayed item is impossible. So 1738 * we needn't lock them. And we have held i_mutex of the 1739 * directory, nobody can delete any directory indexes now. 1740 */ 1741 list_for_each_entry_safe(curr, next, ins_list, readdir_list) { 1742 list_del(&curr->readdir_list); 1743 1744 if (curr->index < ctx->pos) { 1745 if (refcount_dec_and_test(&curr->refs)) 1746 kfree(curr); 1747 continue; 1748 } 1749 1750 ctx->pos = curr->index; 1751 1752 di = (struct btrfs_dir_item *)curr->data; 1753 name = (char *)(di + 1); 1754 name_len = btrfs_stack_dir_name_len(di); 1755 1756 d_type = fs_ftype_to_dtype(di->type); 1757 btrfs_disk_key_to_cpu(&location, &di->location); 1758 1759 over = !dir_emit(ctx, name, name_len, 1760 location.objectid, d_type); 1761 1762 if (refcount_dec_and_test(&curr->refs)) 1763 kfree(curr); 1764 1765 if (over) 1766 return 1; 1767 ctx->pos++; 1768 } 1769 return 0; 1770 } 1771 1772 static void fill_stack_inode_item(struct btrfs_trans_handle *trans, 1773 struct btrfs_inode_item *inode_item, 1774 struct inode *inode) 1775 { 1776 u64 flags; 1777 1778 btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode)); 1779 btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode)); 1780 btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size); 1781 btrfs_set_stack_inode_mode(inode_item, inode->i_mode); 1782 btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink); 1783 btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode)); 1784 btrfs_set_stack_inode_generation(inode_item, 1785 BTRFS_I(inode)->generation); 1786 btrfs_set_stack_inode_sequence(inode_item, 1787 inode_peek_iversion(inode)); 1788 btrfs_set_stack_inode_transid(inode_item, trans->transid); 1789 btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev); 1790 flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, 1791 BTRFS_I(inode)->ro_flags); 1792 btrfs_set_stack_inode_flags(inode_item, flags); 1793 btrfs_set_stack_inode_block_group(inode_item, 0); 1794 1795 btrfs_set_stack_timespec_sec(&inode_item->atime, 1796 inode->i_atime.tv_sec); 1797 btrfs_set_stack_timespec_nsec(&inode_item->atime, 1798 inode->i_atime.tv_nsec); 1799 1800 btrfs_set_stack_timespec_sec(&inode_item->mtime, 1801 inode->i_mtime.tv_sec); 1802 btrfs_set_stack_timespec_nsec(&inode_item->mtime, 1803 inode->i_mtime.tv_nsec); 1804 1805 btrfs_set_stack_timespec_sec(&inode_item->ctime, 1806 inode->i_ctime.tv_sec); 1807 btrfs_set_stack_timespec_nsec(&inode_item->ctime, 1808 inode->i_ctime.tv_nsec); 1809 1810 btrfs_set_stack_timespec_sec(&inode_item->otime, 1811 BTRFS_I(inode)->i_otime.tv_sec); 1812 btrfs_set_stack_timespec_nsec(&inode_item->otime, 1813 BTRFS_I(inode)->i_otime.tv_nsec); 1814 } 1815 1816 int btrfs_fill_inode(struct inode *inode, u32 *rdev) 1817 { 1818 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 1819 struct btrfs_delayed_node *delayed_node; 1820 struct btrfs_inode_item *inode_item; 1821 1822 delayed_node = btrfs_get_delayed_node(BTRFS_I(inode)); 1823 if (!delayed_node) 1824 return -ENOENT; 1825 1826 mutex_lock(&delayed_node->mutex); 1827 if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { 1828 mutex_unlock(&delayed_node->mutex); 1829 btrfs_release_delayed_node(delayed_node); 1830 return -ENOENT; 1831 } 1832 1833 inode_item = &delayed_node->inode_item; 1834 1835 i_uid_write(inode, btrfs_stack_inode_uid(inode_item)); 1836 i_gid_write(inode, btrfs_stack_inode_gid(inode_item)); 1837 btrfs_i_size_write(BTRFS_I(inode), btrfs_stack_inode_size(inode_item)); 1838 btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0, 1839 round_up(i_size_read(inode), fs_info->sectorsize)); 1840 inode->i_mode = btrfs_stack_inode_mode(inode_item); 1841 set_nlink(inode, btrfs_stack_inode_nlink(inode_item)); 1842 inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); 1843 BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); 1844 BTRFS_I(inode)->last_trans = btrfs_stack_inode_transid(inode_item); 1845 1846 inode_set_iversion_queried(inode, 1847 btrfs_stack_inode_sequence(inode_item)); 1848 inode->i_rdev = 0; 1849 *rdev = btrfs_stack_inode_rdev(inode_item); 1850 btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item), 1851 &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); 1852 1853 inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime); 1854 inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime); 1855 1856 inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(&inode_item->mtime); 1857 inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->mtime); 1858 1859 inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(&inode_item->ctime); 1860 inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->ctime); 1861 1862 BTRFS_I(inode)->i_otime.tv_sec = 1863 btrfs_stack_timespec_sec(&inode_item->otime); 1864 BTRFS_I(inode)->i_otime.tv_nsec = 1865 btrfs_stack_timespec_nsec(&inode_item->otime); 1866 1867 inode->i_generation = BTRFS_I(inode)->generation; 1868 BTRFS_I(inode)->index_cnt = (u64)-1; 1869 1870 mutex_unlock(&delayed_node->mutex); 1871 btrfs_release_delayed_node(delayed_node); 1872 return 0; 1873 } 1874 1875 int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, 1876 struct btrfs_root *root, 1877 struct btrfs_inode *inode) 1878 { 1879 struct btrfs_delayed_node *delayed_node; 1880 int ret = 0; 1881 1882 delayed_node = btrfs_get_or_create_delayed_node(inode); 1883 if (IS_ERR(delayed_node)) 1884 return PTR_ERR(delayed_node); 1885 1886 mutex_lock(&delayed_node->mutex); 1887 if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { 1888 fill_stack_inode_item(trans, &delayed_node->inode_item, 1889 &inode->vfs_inode); 1890 goto release_node; 1891 } 1892 1893 ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node); 1894 if (ret) 1895 goto release_node; 1896 1897 fill_stack_inode_item(trans, &delayed_node->inode_item, &inode->vfs_inode); 1898 set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); 1899 delayed_node->count++; 1900 atomic_inc(&root->fs_info->delayed_root->items); 1901 release_node: 1902 mutex_unlock(&delayed_node->mutex); 1903 btrfs_release_delayed_node(delayed_node); 1904 return ret; 1905 } 1906 1907 int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode) 1908 { 1909 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1910 struct btrfs_delayed_node *delayed_node; 1911 1912 /* 1913 * we don't do delayed inode updates during log recovery because it 1914 * leads to enospc problems. This means we also can't do 1915 * delayed inode refs 1916 */ 1917 if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) 1918 return -EAGAIN; 1919 1920 delayed_node = btrfs_get_or_create_delayed_node(inode); 1921 if (IS_ERR(delayed_node)) 1922 return PTR_ERR(delayed_node); 1923 1924 /* 1925 * We don't reserve space for inode ref deletion is because: 1926 * - We ONLY do async inode ref deletion for the inode who has only 1927 * one link(i_nlink == 1), it means there is only one inode ref. 1928 * And in most case, the inode ref and the inode item are in the 1929 * same leaf, and we will deal with them at the same time. 1930 * Since we are sure we will reserve the space for the inode item, 1931 * it is unnecessary to reserve space for inode ref deletion. 1932 * - If the inode ref and the inode item are not in the same leaf, 1933 * We also needn't worry about enospc problem, because we reserve 1934 * much more space for the inode update than it needs. 1935 * - At the worst, we can steal some space from the global reservation. 1936 * It is very rare. 1937 */ 1938 mutex_lock(&delayed_node->mutex); 1939 if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) 1940 goto release_node; 1941 1942 set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags); 1943 delayed_node->count++; 1944 atomic_inc(&fs_info->delayed_root->items); 1945 release_node: 1946 mutex_unlock(&delayed_node->mutex); 1947 btrfs_release_delayed_node(delayed_node); 1948 return 0; 1949 } 1950 1951 static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) 1952 { 1953 struct btrfs_root *root = delayed_node->root; 1954 struct btrfs_fs_info *fs_info = root->fs_info; 1955 struct btrfs_delayed_item *curr_item, *prev_item; 1956 1957 mutex_lock(&delayed_node->mutex); 1958 curr_item = __btrfs_first_delayed_insertion_item(delayed_node); 1959 while (curr_item) { 1960 prev_item = curr_item; 1961 curr_item = __btrfs_next_delayed_item(prev_item); 1962 btrfs_release_delayed_item(prev_item); 1963 } 1964 1965 if (delayed_node->index_item_leaves > 0) { 1966 btrfs_delayed_item_release_leaves(delayed_node, 1967 delayed_node->index_item_leaves); 1968 delayed_node->index_item_leaves = 0; 1969 } 1970 1971 curr_item = __btrfs_first_delayed_deletion_item(delayed_node); 1972 while (curr_item) { 1973 btrfs_delayed_item_release_metadata(root, curr_item); 1974 prev_item = curr_item; 1975 curr_item = __btrfs_next_delayed_item(prev_item); 1976 btrfs_release_delayed_item(prev_item); 1977 } 1978 1979 btrfs_release_delayed_iref(delayed_node); 1980 1981 if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { 1982 btrfs_delayed_inode_release_metadata(fs_info, delayed_node, false); 1983 btrfs_release_delayed_inode(delayed_node); 1984 } 1985 mutex_unlock(&delayed_node->mutex); 1986 } 1987 1988 void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode) 1989 { 1990 struct btrfs_delayed_node *delayed_node; 1991 1992 delayed_node = btrfs_get_delayed_node(inode); 1993 if (!delayed_node) 1994 return; 1995 1996 __btrfs_kill_delayed_node(delayed_node); 1997 btrfs_release_delayed_node(delayed_node); 1998 } 1999 2000 void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) 2001 { 2002 u64 inode_id = 0; 2003 struct btrfs_delayed_node *delayed_nodes[8]; 2004 int i, n; 2005 2006 while (1) { 2007 spin_lock(&root->inode_lock); 2008 n = radix_tree_gang_lookup(&root->delayed_nodes_tree, 2009 (void **)delayed_nodes, inode_id, 2010 ARRAY_SIZE(delayed_nodes)); 2011 if (!n) { 2012 spin_unlock(&root->inode_lock); 2013 break; 2014 } 2015 2016 inode_id = delayed_nodes[n - 1]->inode_id + 1; 2017 for (i = 0; i < n; i++) { 2018 /* 2019 * Don't increase refs in case the node is dead and 2020 * about to be removed from the tree in the loop below 2021 */ 2022 if (!refcount_inc_not_zero(&delayed_nodes[i]->refs)) 2023 delayed_nodes[i] = NULL; 2024 } 2025 spin_unlock(&root->inode_lock); 2026 2027 for (i = 0; i < n; i++) { 2028 if (!delayed_nodes[i]) 2029 continue; 2030 __btrfs_kill_delayed_node(delayed_nodes[i]); 2031 btrfs_release_delayed_node(delayed_nodes[i]); 2032 } 2033 } 2034 } 2035 2036 void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info) 2037 { 2038 struct btrfs_delayed_node *curr_node, *prev_node; 2039 2040 curr_node = btrfs_first_delayed_node(fs_info->delayed_root); 2041 while (curr_node) { 2042 __btrfs_kill_delayed_node(curr_node); 2043 2044 prev_node = curr_node; 2045 curr_node = btrfs_next_delayed_node(curr_node); 2046 btrfs_release_delayed_node(prev_node); 2047 } 2048 } 2049 2050 void btrfs_log_get_delayed_items(struct btrfs_inode *inode, 2051 struct list_head *ins_list, 2052 struct list_head *del_list) 2053 { 2054 struct btrfs_delayed_node *node; 2055 struct btrfs_delayed_item *item; 2056 2057 node = btrfs_get_delayed_node(inode); 2058 if (!node) 2059 return; 2060 2061 mutex_lock(&node->mutex); 2062 item = __btrfs_first_delayed_insertion_item(node); 2063 while (item) { 2064 /* 2065 * It's possible that the item is already in a log list. This 2066 * can happen in case two tasks are trying to log the same 2067 * directory. For example if we have tasks A and task B: 2068 * 2069 * Task A collected the delayed items into a log list while 2070 * under the inode's log_mutex (at btrfs_log_inode()), but it 2071 * only releases the items after logging the inodes they point 2072 * to (if they are new inodes), which happens after unlocking 2073 * the log mutex; 2074 * 2075 * Task B enters btrfs_log_inode() and acquires the log_mutex 2076 * of the same directory inode, before task B releases the 2077 * delayed items. This can happen for example when logging some 2078 * inode we need to trigger logging of its parent directory, so 2079 * logging two files that have the same parent directory can 2080 * lead to this. 2081 * 2082 * If this happens, just ignore delayed items already in a log 2083 * list. All the tasks logging the directory are under a log 2084 * transaction and whichever finishes first can not sync the log 2085 * before the other completes and leaves the log transaction. 2086 */ 2087 if (!item->logged && list_empty(&item->log_list)) { 2088 refcount_inc(&item->refs); 2089 list_add_tail(&item->log_list, ins_list); 2090 } 2091 item = __btrfs_next_delayed_item(item); 2092 } 2093 2094 item = __btrfs_first_delayed_deletion_item(node); 2095 while (item) { 2096 /* It may be non-empty, for the same reason mentioned above. */ 2097 if (!item->logged && list_empty(&item->log_list)) { 2098 refcount_inc(&item->refs); 2099 list_add_tail(&item->log_list, del_list); 2100 } 2101 item = __btrfs_next_delayed_item(item); 2102 } 2103 mutex_unlock(&node->mutex); 2104 2105 /* 2106 * We are called during inode logging, which means the inode is in use 2107 * and can not be evicted before we finish logging the inode. So we never 2108 * have the last reference on the delayed inode. 2109 * Also, we don't use btrfs_release_delayed_node() because that would 2110 * requeue the delayed inode (change its order in the list of prepared 2111 * nodes) and we don't want to do such change because we don't create or 2112 * delete delayed items. 2113 */ 2114 ASSERT(refcount_read(&node->refs) > 1); 2115 refcount_dec(&node->refs); 2116 } 2117 2118 void btrfs_log_put_delayed_items(struct btrfs_inode *inode, 2119 struct list_head *ins_list, 2120 struct list_head *del_list) 2121 { 2122 struct btrfs_delayed_node *node; 2123 struct btrfs_delayed_item *item; 2124 struct btrfs_delayed_item *next; 2125 2126 node = btrfs_get_delayed_node(inode); 2127 if (!node) 2128 return; 2129 2130 mutex_lock(&node->mutex); 2131 2132 list_for_each_entry_safe(item, next, ins_list, log_list) { 2133 item->logged = true; 2134 list_del_init(&item->log_list); 2135 if (refcount_dec_and_test(&item->refs)) 2136 kfree(item); 2137 } 2138 2139 list_for_each_entry_safe(item, next, del_list, log_list) { 2140 item->logged = true; 2141 list_del_init(&item->log_list); 2142 if (refcount_dec_and_test(&item->refs)) 2143 kfree(item); 2144 } 2145 2146 mutex_unlock(&node->mutex); 2147 2148 /* 2149 * We are called during inode logging, which means the inode is in use 2150 * and can not be evicted before we finish logging the inode. So we never 2151 * have the last reference on the delayed inode. 2152 * Also, we don't use btrfs_release_delayed_node() because that would 2153 * requeue the delayed inode (change its order in the list of prepared 2154 * nodes) and we don't want to do such change because we don't create or 2155 * delete delayed items. 2156 */ 2157 ASSERT(refcount_read(&node->refs) > 1); 2158 refcount_dec(&node->refs); 2159 } 2160