// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/radix-tree.h>
#include <linux/writeback.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/migrate.h>
#include <linux/ratelimit.h>
#include <linux/uuid.h>
#include <linux/semaphore.h>
#include <linux/error-injection.h>
#include <linux/crc32c.h>
#include <linux/sched/mm.h>
#include <asm/unaligned.h>
#include <crypto/hash.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "print-tree.h"
#include "locking.h"
#include "tree-log.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "raid56.h"
#include "sysfs.h"
#include "qgroup.h"
#include "compression.h"
#include "tree-checker.h"
#include "ref-verify.h"
#include "block-group.h"
#include "discard.h"
#include "space-info.h"
#include "zoned.h"
#include "subpage.h"

#define BTRFS_SUPER_FLAG_SUPP	(BTRFS_HEADER_FLAG_WRITTEN |\
				 BTRFS_HEADER_FLAG_RELOC |\
				 BTRFS_SUPER_FLAG_ERROR |\
				 BTRFS_SUPER_FLAG_SEEDING |\
				 BTRFS_SUPER_FLAG_METADUMP |\
				 BTRFS_SUPER_FLAG_METADUMP_V2)

static void end_workqueue_fn(struct btrfs_work *work);
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
				      struct btrfs_fs_info *fs_info);
static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
					struct extent_io_tree *dirty_pages,
					int mark);
static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
				       struct extent_io_tree *pinned_extents);
static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info);
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info);

/*
 * btrfs_end_io_wq structs are used to do processing in task context when an IO
 * is complete. This is used during reads to verify checksums, and it is used
 * by writes to insert metadata for new file extents after IO is complete.
 */
struct btrfs_end_io_wq {
	struct bio *bio;
	bio_end_io_t *end_io;
	void *private;
	struct btrfs_fs_info *info;
	blk_status_t status;
	enum btrfs_wq_endio_type metadata;
	struct btrfs_work work;
};

static struct kmem_cache *btrfs_end_io_wq_cache;

int __init btrfs_end_io_wq_init(void)
{
	btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
					sizeof(struct btrfs_end_io_wq),
					0,
					SLAB_MEM_SPREAD,
					NULL);
	if (!btrfs_end_io_wq_cache)
		return -ENOMEM;
	return 0;
}

void __cold btrfs_end_io_wq_exit(void)
{
	kmem_cache_destroy(btrfs_end_io_wq_cache);
}

static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
{
	if (fs_info->csum_shash)
		crypto_free_shash(fs_info->csum_shash);
}

/*
 * async submit bios are used to offload expensive checksumming
 * onto the worker threads.  They checksum file and metadata bios
 * just before they are sent down the IO stack.
 */
struct async_submit_bio {
	struct inode *inode;
	struct bio *bio;
	extent_submit_bio_start_t *submit_bio_start;
	int mirror_num;

	/* Optional parameter for submit_bio_start used by direct io */
	u64 dio_file_offset;
	struct btrfs_work work;
	blk_status_t status;
};

/*
 * Lockdep class keys for extent_buffer->lock's in this root.  For a given
 * eb, the lockdep key is determined by the btrfs_root it belongs to and
 * the level the eb occupies in the tree.
 *
 * Different roots are used for different purposes and may nest inside each
 * other and they require separate keysets.  As lockdep keys should be
 * static, assign keysets according to the purpose of the root as indicated
 * by btrfs_root->root_key.objectid.  This ensures that all special purpose
 * roots have separate keysets.
 *
 * Lock-nesting across peer nodes is always done with the immediate parent
 * node locked thus preventing deadlock.  As lockdep doesn't know this, use
 * subclass to avoid triggering lockdep warning in such cases.
 *
 * The key is set by the readpage_end_io_hook after the buffer has passed
 * csum validation but before the pages are unlocked.  It is also set by
 * btrfs_init_new_buffer on freshly allocated blocks.
 *
 * We also add a check to make sure the highest level of the tree is the
 * same as our lockdep setup here.  If BTRFS_MAX_LEVEL changes, this code
 * needs update as well.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# if BTRFS_MAX_LEVEL != 8
#  error
# endif

#define DEFINE_LEVEL(stem, level)					\
	.names[level] = "btrfs-" stem "-0" #level,

#define DEFINE_NAME(stem)						\
	DEFINE_LEVEL(stem, 0)						\
	DEFINE_LEVEL(stem, 1)						\
	DEFINE_LEVEL(stem, 2)						\
	DEFINE_LEVEL(stem, 3)						\
	DEFINE_LEVEL(stem, 4)						\
	DEFINE_LEVEL(stem, 5)						\
	DEFINE_LEVEL(stem, 6)						\
	DEFINE_LEVEL(stem, 7)

static struct btrfs_lockdep_keyset {
	u64			id;		/* root objectid */
	/* Longest entry: btrfs-free-space-00 */
	char			names[BTRFS_MAX_LEVEL][20];
	struct lock_class_key	keys[BTRFS_MAX_LEVEL];
} btrfs_lockdep_keysets[] = {
	{ .id = BTRFS_ROOT_TREE_OBJECTID,	DEFINE_NAME("root")	},
	{ .id = BTRFS_EXTENT_TREE_OBJECTID,	DEFINE_NAME("extent")	},
	{ .id = BTRFS_CHUNK_TREE_OBJECTID,	DEFINE_NAME("chunk")	},
	{ .id = BTRFS_DEV_TREE_OBJECTID,	DEFINE_NAME("dev")	},
	{ .id = BTRFS_CSUM_TREE_OBJECTID,	DEFINE_NAME("csum")	},
	{ .id = BTRFS_QUOTA_TREE_OBJECTID,	DEFINE_NAME("quota")	},
	{ .id = BTRFS_TREE_LOG_OBJECTID,	DEFINE_NAME("log")	},
	{ .id = BTRFS_TREE_RELOC_OBJECTID,	DEFINE_NAME("treloc")	},
	{ .id = BTRFS_DATA_RELOC_TREE_OBJECTID,	DEFINE_NAME("dreloc")	},
	{ .id = BTRFS_UUID_TREE_OBJECTID,	DEFINE_NAME("uuid")	},
	{ .id = BTRFS_FREE_SPACE_TREE_OBJECTID,	DEFINE_NAME("free-space") },
	{ .id = 0,				DEFINE_NAME("tree")	},
};

#undef DEFINE_LEVEL
#undef DEFINE_NAME

void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
				    int level)
{
	struct btrfs_lockdep_keyset *ks;

	BUG_ON(level >= ARRAY_SIZE(ks->keys));

	/* find the matching keyset, id 0 is the default entry */
	for (ks = btrfs_lockdep_keysets; ks->id; ks++)
		if (ks->id == objectid)
			break;

	lockdep_set_class_and_name(&eb->lock,
				   &ks->keys[level], ks->names[level]);
}

#endif
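/*
 * For example, with the keyset table above a level 2 node of the extent tree
 * gets the lockdep class name "btrfs-extent-02", while a root without a
 * dedicated entry falls back to the terminating { .id = 0 } entry and uses
 * the "btrfs-tree-0<level>" names.
 */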

/*
 * Compute the csum of a btree block and store the result to provided buffer.
 */
static void csum_tree_block(struct extent_buffer *buf, u8 *result)
{
	struct btrfs_fs_info *fs_info = buf->fs_info;
	const int num_pages = num_extent_pages(buf);
	const int first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
	char *kaddr;
	int i;

	shash->tfm = fs_info->csum_shash;
	crypto_shash_init(shash);
	kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start);
	crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
			    first_page_part - BTRFS_CSUM_SIZE);

	for (i = 1; i < num_pages; i++) {
		kaddr = page_address(buf->pages[i]);
		crypto_shash_update(shash, kaddr, PAGE_SIZE);
	}
	memset(result, 0, BTRFS_CSUM_SIZE);
	crypto_shash_final(shash, result);
}

/*
 * we can't consider a given block up to date unless the transid of the
 * block matches the transid in the parent node's pointer.  This is how we
 * detect blocks that either didn't get written at all or got written
 * in the wrong place.
 */
static int verify_parent_transid(struct extent_io_tree *io_tree,
				 struct extent_buffer *eb, u64 parent_transid,
				 int atomic)
{
	struct extent_state *cached_state = NULL;
	int ret;

	if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
		return 0;

	if (atomic)
		return -EAGAIN;

	lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
			 &cached_state);
	if (extent_buffer_uptodate(eb) &&
	    btrfs_header_generation(eb) == parent_transid) {
		ret = 0;
		goto out;
	}
	btrfs_err_rl(eb->fs_info,
		"parent transid verify failed on %llu wanted %llu found %llu",
			eb->start,
			parent_transid, btrfs_header_generation(eb));
	ret = 1;
	clear_extent_buffer_uptodate(eb);
out:
	unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
			     &cached_state);
	return ret;
}

static bool btrfs_supported_super_csum(u16 csum_type)
{
	switch (csum_type) {
	case BTRFS_CSUM_TYPE_CRC32:
	case BTRFS_CSUM_TYPE_XXHASH:
	case BTRFS_CSUM_TYPE_SHA256:
	case BTRFS_CSUM_TYPE_BLAKE2:
		return true;
	default:
		return false;
	}
}

/*
 * Return 0 if the superblock checksum type matches the checksum value of that
 * algorithm. Pass the raw disk superblock data.
 */
static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
				  char *raw_disk_sb)
{
	struct btrfs_super_block *disk_sb =
		(struct btrfs_super_block *)raw_disk_sb;
	char result[BTRFS_CSUM_SIZE];
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);

	shash->tfm = fs_info->csum_shash;

	/*
	 * The super_block structure does not span the whole
	 * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is
	 * filled with zeros and is included in the checksum.
	 */
	crypto_shash_digest(shash, raw_disk_sb + BTRFS_CSUM_SIZE,
			    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result);

	if (memcmp(disk_sb->csum, result, fs_info->csum_size))
		return 1;

	return 0;
}

int btrfs_verify_level_key(struct extent_buffer *eb, int level,
			   struct btrfs_key *first_key, u64 parent_transid)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	int found_level;
	struct btrfs_key found_key;
	int ret;

	found_level = btrfs_header_level(eb);
	if (found_level != level) {
		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
		     KERN_ERR "BTRFS: tree level check failed\n");
		btrfs_err(fs_info,
"tree level mismatch detected, bytenr=%llu level expected=%u has=%u",
			  eb->start, level, found_level);
		return -EIO;
	}

	if (!first_key)
		return 0;

	/*
	 * For live tree blocks (new tree blocks in the current transaction),
	 * we need proper lock context to avoid races, which is impossible
	 * here. So we only check tree blocks which are read from disk, whose
	 * generation <= fs_info->last_trans_committed.
	 */
	if (btrfs_header_generation(eb) > fs_info->last_trans_committed)
		return 0;

	/* We have @first_key, so this @eb must have at least one item */
	if (btrfs_header_nritems(eb) == 0) {
		btrfs_err(fs_info,
		"invalid tree nritems, bytenr=%llu nritems=0 expect >0",
			  eb->start);
		WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
		return -EUCLEAN;
	}

	if (found_level)
		btrfs_node_key_to_cpu(eb, &found_key, 0);
	else
		btrfs_item_key_to_cpu(eb, &found_key, 0);
	ret = btrfs_comp_cpu_keys(first_key, &found_key);

	if (ret) {
		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
		     KERN_ERR "BTRFS: tree first key check failed\n");
		btrfs_err(fs_info,
"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
			  eb->start, parent_transid, first_key->objectid,
			  first_key->type, first_key->offset,
			  found_key.objectid, found_key.type,
			  found_key.offset);
	}
	return ret;
}

/*
 * helper to read a given tree block, doing retries as required when
 * the checksums don't match and we have alternate mirrors to try.
 *
 * @parent_transid:	expected transid, skip check if 0
 * @level:		expected level, mandatory check
 * @first_key:		expected key of first slot, skip check if NULL
 */
int btrfs_read_extent_buffer(struct extent_buffer *eb,
			     u64 parent_transid, int level,
			     struct btrfs_key *first_key)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	struct extent_io_tree *io_tree;
	int failed = 0;
	int ret;
	int num_copies = 0;
	int mirror_num = 0;
	int failed_mirror = 0;

	io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
	while (1) {
		clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
		ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num);
		if (!ret) {
			if (verify_parent_transid(io_tree, eb,
						   parent_transid, 0))
				ret = -EIO;
			else if (btrfs_verify_level_key(eb, level,
						first_key, parent_transid))
				ret = -EUCLEAN;
			else
				break;
		}

		num_copies = btrfs_num_copies(fs_info,
					      eb->start, eb->len);
		if (num_copies == 1)
			break;

		if (!failed_mirror) {
			failed = 1;
			failed_mirror = eb->read_mirror;
		}

		mirror_num++;
		if (mirror_num == failed_mirror)
			mirror_num++;

		if (mirror_num > num_copies)
			break;
	}

	if (failed && !ret && failed_mirror)
		btrfs_repair_eb_io_failure(eb, failed_mirror);

	return ret;
}
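/*
 * Example of the retry loop above, assuming a two-copy (RAID1-like) metadata
 * profile: the first read (mirror_num 0) fails verification, so the mirror it
 * happened to use is remembered in failed_mirror. btrfs_num_copies() reports
 * 2, so mirrors 1 and 2 are tried next, skipping the remembered bad one. If a
 * later attempt succeeds, btrfs_repair_eb_io_failure() rewrites the bad copy.
 */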
static int csum_one_extent_buffer(struct extent_buffer *eb)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	u8 result[BTRFS_CSUM_SIZE];
	int ret;

	ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
				    offsetof(struct btrfs_header, fsid),
				    BTRFS_FSID_SIZE) == 0);
	csum_tree_block(eb, result);

	if (btrfs_header_level(eb))
		ret = btrfs_check_node(eb);
	else
		ret = btrfs_check_leaf_full(eb);

	if (ret < 0)
		goto error;

	/*
	 * Also check the generation, the eb reached here must be newer than
	 * last committed. Or something seriously wrong happened.
	 */
	if (unlikely(btrfs_header_generation(eb) <= fs_info->last_trans_committed)) {
		ret = -EUCLEAN;
		btrfs_err(fs_info,
			"block=%llu bad generation, have %llu expect > %llu",
			  eb->start, btrfs_header_generation(eb),
			  fs_info->last_trans_committed);
		goto error;
	}
	write_extent_buffer(eb, result, 0, fs_info->csum_size);

	return 0;

error:
	btrfs_print_tree(eb, 0);
	btrfs_err(fs_info, "block=%llu write time tree block corruption detected",
		  eb->start);
	WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
	return ret;
}

/* Checksum all dirty extent buffers in one bio_vec */
static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info,
				      struct bio_vec *bvec)
{
	struct page *page = bvec->bv_page;
	u64 bvec_start = page_offset(page) + bvec->bv_offset;
	u64 cur;
	int ret = 0;

	for (cur = bvec_start; cur < bvec_start + bvec->bv_len;
	     cur += fs_info->nodesize) {
		struct extent_buffer *eb;
		bool uptodate;

		eb = find_extent_buffer(fs_info, cur);
		uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur,
						       fs_info->nodesize);

		/* A dirty eb shouldn't disappear from buffer_radix */
		if (WARN_ON(!eb))
			return -EUCLEAN;

		if (WARN_ON(cur != btrfs_header_bytenr(eb))) {
			free_extent_buffer(eb);
			return -EUCLEAN;
		}
		if (WARN_ON(!uptodate)) {
			free_extent_buffer(eb);
			return -EUCLEAN;
		}

		ret = csum_one_extent_buffer(eb);
		free_extent_buffer(eb);
		if (ret < 0)
			return ret;
	}
	return ret;
}

/*
 * Checksum a dirty tree block before IO. This has extra checks to make sure
 * we only fill in the checksum field in the first page of a multi-page block.
 * For subpage extent buffers we need bvec to also read the offset in the page.
 */
static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec)
{
	struct page *page = bvec->bv_page;
	u64 start = page_offset(page);
	u64 found_start;
	struct extent_buffer *eb;

	if (fs_info->nodesize < PAGE_SIZE)
		return csum_dirty_subpage_buffers(fs_info, bvec);

	eb = (struct extent_buffer *)page->private;
	if (page != eb->pages[0])
		return 0;

	found_start = btrfs_header_bytenr(eb);

	if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) {
		WARN_ON(found_start != 0);
		return 0;
	}

	/*
	 * Please do not consolidate these warnings into a single if.
	 * It is useful to know what went wrong.
	 */
	if (WARN_ON(found_start != start))
		return -EUCLEAN;
	if (WARN_ON(!PageUptodate(page)))
		return -EUCLEAN;

	return csum_one_extent_buffer(eb);
}

static int check_tree_block_fsid(struct extent_buffer *eb)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
	u8 fsid[BTRFS_FSID_SIZE];
	u8 *metadata_uuid;

	read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid),
			   BTRFS_FSID_SIZE);
	/*
	 * Checking the incompat flag is only valid for the current fs. For
	 * seed devices it's forbidden to have their uuid changed so reading
	 * ->fsid in this case is fine
	 */
	if (btrfs_fs_incompat(fs_info, METADATA_UUID))
		metadata_uuid = fs_devices->metadata_uuid;
	else
		metadata_uuid = fs_devices->fsid;

	if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE))
		return 0;

	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
		if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE))
			return 0;

	return 1;
}

/* Do basic extent buffer checks at read time */
static int validate_extent_buffer(struct extent_buffer *eb)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	u64 found_start;
	const u32 csum_size = fs_info->csum_size;
	u8 found_level;
	u8 result[BTRFS_CSUM_SIZE];
	const u8 *header_csum;
	int ret = 0;

	found_start = btrfs_header_bytenr(eb);
	if (found_start != eb->start) {
		btrfs_err_rl(fs_info, "bad tree block start, want %llu have %llu",
			     eb->start, found_start);
		ret = -EIO;
		goto out;
	}
	if (check_tree_block_fsid(eb)) {
		btrfs_err_rl(fs_info, "bad fsid on block %llu",
			     eb->start);
		ret = -EIO;
		goto out;
	}
	found_level = btrfs_header_level(eb);
	if (found_level >= BTRFS_MAX_LEVEL) {
		btrfs_err(fs_info, "bad tree block level %d on %llu",
			  (int)btrfs_header_level(eb), eb->start);
		ret = -EIO;
		goto out;
	}

	csum_tree_block(eb, result);
	header_csum = page_address(eb->pages[0]) +
		get_eb_offset_in_page(eb, offsetof(struct btrfs_header, csum));

	if (memcmp(result, header_csum, csum_size) != 0) {
		btrfs_warn_rl(fs_info,
	"checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d",
			      eb->start,
			      CSUM_FMT_VALUE(csum_size, header_csum),
			      CSUM_FMT_VALUE(csum_size, result),
			      btrfs_header_level(eb));
		ret = -EUCLEAN;
		goto out;
	}

	/*
	 * If this is a leaf block and it is corrupt, set the corrupt bit so
	 * that we don't try and read the other copies of this block, just
	 * return -EIO.
	 */
	if (found_level == 0 && btrfs_check_leaf_full(eb)) {
		set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
		ret = -EIO;
	}

	if (found_level > 0 && btrfs_check_node(eb))
		ret = -EIO;

	if (!ret)
		set_extent_buffer_uptodate(eb);
	else
		btrfs_err(fs_info,
			  "block=%llu read time tree block corruption detected",
			  eb->start);
out:
	return ret;
}

static int validate_subpage_buffer(struct page *page, u64 start, u64 end,
				   int mirror)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
	struct extent_buffer *eb;
	bool reads_done;
	int ret = 0;

	/*
	 * We don't allow bio merge for subpage metadata read, so we should
	 * only get one eb for each endio hook.
	 */
	ASSERT(end == start + fs_info->nodesize - 1);
	ASSERT(PagePrivate(page));

	eb = find_extent_buffer(fs_info, start);
	/*
	 * When we are reading one tree block, eb must have been inserted into
	 * the radix tree. If not, something is wrong.
	 */
	ASSERT(eb);

	reads_done = atomic_dec_and_test(&eb->io_pages);
	/* Subpage read must finish in page read */
	ASSERT(reads_done);

	eb->read_mirror = mirror;
	if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
		ret = -EIO;
		goto err;
	}
	ret = validate_extent_buffer(eb);
	if (ret < 0)
		goto err;

	set_extent_buffer_uptodate(eb);

	free_extent_buffer(eb);
	return ret;
err:
	/*
	 * end_bio_extent_readpage decrements io_pages in case of error,
	 * make sure it has something to decrement.
	 */
	atomic_inc(&eb->io_pages);
	clear_extent_buffer_uptodate(eb);
	free_extent_buffer(eb);
	return ret;
}

int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
				   struct page *page, u64 start, u64 end,
				   int mirror)
{
	struct extent_buffer *eb;
	int ret = 0;
	int reads_done;

	ASSERT(page->private);

	if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
		return validate_subpage_buffer(page, start, end, mirror);

	eb = (struct extent_buffer *)page->private;

	/*
	 * The pending IO might have been the only thing that kept this buffer
	 * in memory.  Make sure we have a ref for all these other checks.
	 */
	atomic_inc(&eb->refs);

	reads_done = atomic_dec_and_test(&eb->io_pages);
	if (!reads_done)
		goto err;

	eb->read_mirror = mirror;
	if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
		ret = -EIO;
		goto err;
	}
	ret = validate_extent_buffer(eb);
err:
	if (ret) {
		/*
		 * Our io error hook is going to dec the io pages
		 * again, we have to make sure it has something
		 * to decrement.
		 */
		atomic_inc(&eb->io_pages);
		clear_extent_buffer_uptodate(eb);
	}
	free_extent_buffer(eb);

	return ret;
}

static void end_workqueue_bio(struct bio *bio)
{
	struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
	struct btrfs_fs_info *fs_info;
	struct btrfs_workqueue *wq;

	fs_info = end_io_wq->info;
	end_io_wq->status = bio->bi_status;

	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
			wq = fs_info->endio_meta_write_workers;
		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
			wq = fs_info->endio_freespace_worker;
		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
			wq = fs_info->endio_raid56_workers;
		else
			wq = fs_info->endio_write_workers;
	} else {
		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
			wq = fs_info->endio_raid56_workers;
		else if (end_io_wq->metadata)
			wq = fs_info->endio_meta_workers;
		else
			wq = fs_info->endio_workers;
	}

	btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
	btrfs_queue_work(wq, &end_io_wq->work);
}

blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
			enum btrfs_wq_endio_type metadata)
{
	struct btrfs_end_io_wq *end_io_wq;

	end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
	if (!end_io_wq)
		return BLK_STS_RESOURCE;

	end_io_wq->private = bio->bi_private;
	end_io_wq->end_io = bio->bi_end_io;
	end_io_wq->info = info;
	end_io_wq->status = 0;
	end_io_wq->bio = bio;
	end_io_wq->metadata = metadata;

	bio->bi_private = end_io_wq;
	bio->bi_end_io = end_workqueue_bio;
	return 0;
}
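/*
 * Typical read-side pairing for the end-io offload above (a sketch that
 * mirrors what btrfs_submit_metadata_bio() does later in this file):
 *
 *	ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_METADATA);
 *	if (!ret)
 *		ret = btrfs_map_bio(fs_info, bio, mirror_num);
 *
 * The device completion handler then only queues end_workqueue_fn(), and the
 * original bi_end_io runs from a workqueue in task context.
 */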
static void run_one_async_start(struct btrfs_work *work)
{
	struct async_submit_bio *async;
	blk_status_t ret;

	async = container_of(work, struct async_submit_bio, work);
	ret = async->submit_bio_start(async->inode, async->bio,
				      async->dio_file_offset);
	if (ret)
		async->status = ret;
}

/*
 * In order to insert checksums into the metadata in large chunks, we wait
 * until bio submission time.  All the pages in the bio are checksummed and
 * sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record are
 * inserted into the tree.
 */
static void run_one_async_done(struct btrfs_work *work)
{
	struct async_submit_bio *async;
	struct inode *inode;
	blk_status_t ret;

	async = container_of(work, struct async_submit_bio, work);
	inode = async->inode;

	/* If an error occurred we just want to clean up the bio and move on */
	if (async->status) {
		async->bio->bi_status = async->status;
		bio_endio(async->bio);
		return;
	}

	/*
	 * All of the bios that pass through here are from async helpers.
	 * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context.
	 * This changes nothing when cgroups aren't in use.
	 */
	async->bio->bi_opf |= REQ_CGROUP_PUNT;
	ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num);
	if (ret) {
		async->bio->bi_status = ret;
		bio_endio(async->bio);
	}
}

static void run_one_async_free(struct btrfs_work *work)
{
	struct async_submit_bio *async;

	async = container_of(work, struct async_submit_bio, work);
	kfree(async);
}

blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
				 int mirror_num, u64 dio_file_offset,
				 extent_submit_bio_start_t *submit_bio_start)
{
	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
	struct async_submit_bio *async;

	async = kmalloc(sizeof(*async), GFP_NOFS);
	if (!async)
		return BLK_STS_RESOURCE;

	async->inode = inode;
	async->bio = bio;
	async->mirror_num = mirror_num;
	async->submit_bio_start = submit_bio_start;

	btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
			run_one_async_free);

	async->dio_file_offset = dio_file_offset;

	async->status = 0;

	if (op_is_sync(bio->bi_opf))
		btrfs_queue_work(fs_info->hipri_workers, &async->work);
	else
		btrfs_queue_work(fs_info->workers, &async->work);
	return 0;
}

static blk_status_t btree_csum_one_bio(struct bio *bio)
{
	struct bio_vec *bvec;
	struct btrfs_root *root;
	int ret = 0;
	struct bvec_iter_all iter_all;

	ASSERT(!bio_flagged(bio, BIO_CLONED));
	bio_for_each_segment_all(bvec, bio, iter_all) {
		root = BTRFS_I(bvec->bv_page->mapping->host)->root;
		ret = csum_dirty_buffer(root->fs_info, bvec);
		if (ret)
			break;
	}

	return errno_to_blk_status(ret);
}

static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio,
					   u64 dio_file_offset)
{
	/*
	 * when we're called for a write, we're already in the async
	 * submission context.  Just jump into btrfs_map_bio
	 */
	return btree_csum_one_bio(bio);
}

static bool should_async_write(struct btrfs_fs_info *fs_info,
			       struct btrfs_inode *bi)
{
	if (btrfs_is_zoned(fs_info))
		return false;
	if (atomic_read(&bi->sync_writers))
		return false;
	if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
		return false;
	return true;
}
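/*
 * Submission policy implemented below: metadata writes are checksummed
 * inline and mapped directly when should_async_write() returns false, i.e.
 * on zoned filesystems (to keep write ordering), when a sync writer is
 * waiting, or when the CPU has a fast checksum implementation. Otherwise the
 * bio is handed to btrfs_wq_submit_bio() so checksumming runs on the worker
 * threads. Reads are always routed through the end-io workqueue so checksum
 * validation happens in task context.
 */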
void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	blk_status_t ret;

	if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
		/*
		 * called for a read, do the setup so that checksum validation
		 * can happen in the async kernel threads
		 */
		ret = btrfs_bio_wq_end_io(fs_info, bio,
					  BTRFS_WQ_ENDIO_METADATA);
		if (!ret)
			ret = btrfs_map_bio(fs_info, bio, mirror_num);
	} else if (!should_async_write(fs_info, BTRFS_I(inode))) {
		ret = btree_csum_one_bio(bio);
		if (!ret)
			ret = btrfs_map_bio(fs_info, bio, mirror_num);
	} else {
		/*
		 * kthread helpers are used to submit writes so that
		 * checksumming can happen in parallel across all CPUs
		 */
		ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0,
					  btree_submit_bio_start);
	}

	if (ret) {
		bio->bi_status = ret;
		bio_endio(bio);
	}
}

#ifdef CONFIG_MIGRATION
static int btree_migratepage(struct address_space *mapping,
			     struct page *newpage, struct page *page,
			     enum migrate_mode mode)
{
	/*
	 * we can't safely write a btree page from here,
	 * we haven't done the locking hook
	 */
	if (PageDirty(page))
		return -EAGAIN;
	/*
	 * Buffers may be managed in a filesystem specific way.
	 * We must have no buffers or drop them.
	 */
	if (page_has_private(page) &&
	    !try_to_release_page(page, GFP_KERNEL))
		return -EAGAIN;
	return migrate_page(mapping, newpage, page, mode);
}
#endif


static int btree_writepages(struct address_space *mapping,
			    struct writeback_control *wbc)
{
	struct btrfs_fs_info *fs_info;
	int ret;

	if (wbc->sync_mode == WB_SYNC_NONE) {

		if (wbc->for_kupdate)
			return 0;

		fs_info = BTRFS_I(mapping->host)->root->fs_info;
		/* this is a bit racy, but that's ok */
		ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
					       BTRFS_DIRTY_METADATA_THRESH,
					       fs_info->dirty_metadata_batch);
		if (ret < 0)
			return 0;
	}
	return btree_write_cache_pages(mapping, wbc);
}

static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags)
{
	if (folio_test_writeback(folio) || folio_test_dirty(folio))
		return false;

	return try_release_extent_buffer(&folio->page);
}

static void btree_invalidate_folio(struct folio *folio, size_t offset,
				   size_t length)
{
	struct extent_io_tree *tree;
	tree = &BTRFS_I(folio->mapping->host)->io_tree;
	extent_invalidate_folio(tree, folio, offset);
	btree_release_folio(folio, GFP_NOFS);
	if (folio_get_private(folio)) {
		btrfs_warn(BTRFS_I(folio->mapping->host)->root->fs_info,
			   "folio private not zero on folio %llu",
			   (unsigned long long)folio_pos(folio));
		folio_detach_private(folio);
	}
}

#ifdef DEBUG
static bool btree_dirty_folio(struct address_space *mapping,
			      struct folio *folio)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb);
	struct btrfs_subpage *subpage;
	struct extent_buffer *eb;
	int cur_bit = 0;
	u64 page_start = folio_pos(folio);

	if (fs_info->sectorsize == PAGE_SIZE) {
		eb = folio_get_private(folio);
		BUG_ON(!eb);
		BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
		BUG_ON(!atomic_read(&eb->refs));
		btrfs_assert_tree_write_locked(eb);
		return filemap_dirty_folio(mapping, folio);
	}
	subpage = folio_get_private(folio);

	ASSERT(subpage->dirty_bitmap);
	while (cur_bit < BTRFS_SUBPAGE_BITMAP_SIZE) {
		unsigned long flags;
		u64 cur;
		u16 tmp = (1 << cur_bit);

		spin_lock_irqsave(&subpage->lock, flags);
		if (!(tmp & subpage->dirty_bitmap)) {
			spin_unlock_irqrestore(&subpage->lock, flags);
			cur_bit++;
			continue;
		}
		spin_unlock_irqrestore(&subpage->lock, flags);
		cur = page_start + cur_bit * fs_info->sectorsize;

		eb = find_extent_buffer(fs_info, cur);
		ASSERT(eb);
		ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
		ASSERT(atomic_read(&eb->refs));
		btrfs_assert_tree_write_locked(eb);
		free_extent_buffer(eb);

		cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits);
	}
	return filemap_dirty_folio(mapping, folio);
}
#else
#define btree_dirty_folio filemap_dirty_folio
#endif

static const struct address_space_operations btree_aops = {
	.writepages	= btree_writepages,
	.release_folio	= btree_release_folio,
	.invalidate_folio = btree_invalidate_folio,
#ifdef CONFIG_MIGRATION
	.migratepage	= btree_migratepage,
#endif
	.dirty_folio	= btree_dirty_folio,
};

struct extent_buffer *btrfs_find_create_tree_block(
						struct btrfs_fs_info *fs_info,
						u64 bytenr, u64 owner_root,
						int level)
{
	if (btrfs_is_testing(fs_info))
		return alloc_test_extent_buffer(fs_info, bytenr);
	return alloc_extent_buffer(fs_info, bytenr, owner_root, level);
}

/*
 * Read tree block at logical address @bytenr and do variant basic but critical
 * verification.
 *
 * @owner_root:		the objectid of the root owner for this block.
 * @parent_transid:	expected transid of this tree block, skip check if 0
 * @level:		expected level, mandatory check
 * @first_key:		expected key in slot 0, skip check if NULL
 */
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
				      u64 owner_root, u64 parent_transid,
				      int level, struct btrfs_key *first_key)
{
	struct extent_buffer *buf = NULL;
	int ret;

	buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
	if (IS_ERR(buf))
		return buf;

	ret = btrfs_read_extent_buffer(buf, parent_transid, level, first_key);
	if (ret) {
		free_extent_buffer_stale(buf);
		return ERR_PTR(ret);
	}
	if (btrfs_check_eb_owner(buf, owner_root)) {
		free_extent_buffer_stale(buf);
		return ERR_PTR(-EUCLEAN);
	}
	return buf;

}

void btrfs_clean_tree_block(struct extent_buffer *buf)
{
	struct btrfs_fs_info *fs_info = buf->fs_info;
	if (btrfs_header_generation(buf) ==
	    fs_info->running_transaction->transid) {
		btrfs_assert_tree_write_locked(buf);

		if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
			percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
						 -buf->len,
						 fs_info->dirty_metadata_batch);
			clear_extent_buffer_dirty(buf);
		}
	}
}

static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
			 u64 objectid)
{
	bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);

	memset(&root->root_key, 0, sizeof(root->root_key));
	memset(&root->root_item, 0, sizeof(root->root_item));
	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
	root->fs_info = fs_info;
	root->root_key.objectid = objectid;
	root->node = NULL;
	root->commit_root = NULL;
	root->state = 0;
	RB_CLEAR_NODE(&root->rb_node);

	root->last_trans = 0;
	root->free_objectid = 0;
	root->nr_delalloc_inodes = 0;
	root->nr_ordered_extents = 0;
	root->inode_tree = RB_ROOT;
	INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);

	btrfs_init_root_block_rsv(root);

	INIT_LIST_HEAD(&root->dirty_list);
	INIT_LIST_HEAD(&root->root_list);
	INIT_LIST_HEAD(&root->delalloc_inodes);
	INIT_LIST_HEAD(&root->delalloc_root);
	INIT_LIST_HEAD(&root->ordered_extents);
	INIT_LIST_HEAD(&root->ordered_root);
	INIT_LIST_HEAD(&root->reloc_dirty_list);
	INIT_LIST_HEAD(&root->logged_list[0]);
	INIT_LIST_HEAD(&root->logged_list[1]);
	spin_lock_init(&root->inode_lock);
	spin_lock_init(&root->delalloc_lock);
	spin_lock_init(&root->ordered_extent_lock);
	spin_lock_init(&root->accounting_lock);
	spin_lock_init(&root->log_extents_lock[0]);
	spin_lock_init(&root->log_extents_lock[1]);
	spin_lock_init(&root->qgroup_meta_rsv_lock);
	mutex_init(&root->objectid_mutex);
	mutex_init(&root->log_mutex);
	mutex_init(&root->ordered_extent_mutex);
	mutex_init(&root->delalloc_mutex);
	init_waitqueue_head(&root->qgroup_flush_wait);
	init_waitqueue_head(&root->log_writer_wait);
	init_waitqueue_head(&root->log_commit_wait[0]);
	init_waitqueue_head(&root->log_commit_wait[1]);
	INIT_LIST_HEAD(&root->log_ctxs[0]);
	INIT_LIST_HEAD(&root->log_ctxs[1]);
	atomic_set(&root->log_commit[0], 0);
	atomic_set(&root->log_commit[1], 0);
	atomic_set(&root->log_writers, 0);
	atomic_set(&root->log_batch, 0);
	refcount_set(&root->refs, 1);
	atomic_set(&root->snapshot_force_cow, 0);
	atomic_set(&root->nr_swapfiles, 0);
	root->log_transid = 0;
	root->log_transid_committed = -1;
	root->last_log_commit = 0;
	root->anon_dev = 0;
	if (!dummy) {
		extent_io_tree_init(fs_info, &root->dirty_log_pages,
				    IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL);
		extent_io_tree_init(fs_info, &root->log_csum_range,
				    IO_TREE_LOG_CSUM_RANGE, NULL);
	}

	spin_lock_init(&root->root_item_lock);
	btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
#ifdef CONFIG_BTRFS_DEBUG
	INIT_LIST_HEAD(&root->leak_list);
	spin_lock(&fs_info->fs_roots_radix_lock);
	list_add_tail(&root->leak_list, &fs_info->allocated_roots);
	spin_unlock(&fs_info->fs_roots_radix_lock);
#endif
}

static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
					   u64 objectid, gfp_t flags)
{
	struct btrfs_root *root = kzalloc(sizeof(*root), flags);
	if (root)
		__setup_root(root, fs_info, objectid);
	return root;
}

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/* Should only be used by the testing infrastructure */
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root;

	if (!fs_info)
		return ERR_PTR(-EINVAL);

	root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, GFP_KERNEL);
	if (!root)
		return ERR_PTR(-ENOMEM);

	/* We don't use the stripesize in selftest, set it as sectorsize */
	root->alloc_bytenr = 0;

	return root;
}
#endif

static int global_root_cmp(struct rb_node *a_node, const struct rb_node *b_node)
{
	const struct btrfs_root *a = rb_entry(a_node, struct btrfs_root, rb_node);
	const struct btrfs_root *b = rb_entry(b_node, struct btrfs_root, rb_node);

	return btrfs_comp_cpu_keys(&a->root_key, &b->root_key);
}

static int global_root_key_cmp(const void *k, const struct rb_node *node)
{
	const struct btrfs_key *key = k;
	const struct btrfs_root *root = rb_entry(node, struct btrfs_root, rb_node);

	return btrfs_comp_cpu_keys(key, &root->root_key);
}

int btrfs_global_root_insert(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *tmp;

	write_lock(&fs_info->global_root_lock);
	tmp = rb_find_add(&root->rb_node, &fs_info->global_root_tree, global_root_cmp);
	write_unlock(&fs_info->global_root_lock);
	ASSERT(!tmp);

	return tmp ? -EEXIST : 0;
}

void btrfs_global_root_delete(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	write_lock(&fs_info->global_root_lock);
	rb_erase(&root->rb_node, &fs_info->global_root_tree);
	write_unlock(&fs_info->global_root_lock);
}

struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info,
				     struct btrfs_key *key)
{
	struct rb_node *node;
	struct btrfs_root *root = NULL;

	read_lock(&fs_info->global_root_lock);
	node = rb_find(key, &fs_info->global_root_tree, global_root_key_cmp);
	if (node)
		root = container_of(node, struct btrfs_root, rb_node);
	read_unlock(&fs_info->global_root_lock);

	return root;
}

static u64 btrfs_global_root_id(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct btrfs_block_group *block_group;
	u64 ret;

	if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
		return 0;

	if (bytenr)
		block_group = btrfs_lookup_block_group(fs_info, bytenr);
	else
		block_group = btrfs_lookup_first_block_group(fs_info, bytenr);
	ASSERT(block_group);
	if (!block_group)
		return 0;
	ret = block_group->global_root_id;
	btrfs_put_block_group(block_group);

	return ret;
}

struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct btrfs_key key = {
		.objectid = BTRFS_CSUM_TREE_OBJECTID,
		.type = BTRFS_ROOT_ITEM_KEY,
		.offset = btrfs_global_root_id(fs_info, bytenr),
	};

	return btrfs_global_root(fs_info, &key);
}

struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct btrfs_key key = {
		.objectid = BTRFS_EXTENT_TREE_OBJECTID,
		.type = BTRFS_ROOT_ITEM_KEY,
		.offset = btrfs_global_root_id(fs_info, bytenr),
	};

	return btrfs_global_root(fs_info, &key);
}

struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
				     u64 objectid)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct extent_buffer *leaf;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *root;
	struct btrfs_key key;
	unsigned int nofs_flag;
	int ret = 0;

	/*
	 * We're holding a transaction handle, so use a NOFS memory allocation
	 * context to avoid deadlock if reclaim happens.
	 */
	nofs_flag = memalloc_nofs_save();
	root = btrfs_alloc_root(fs_info, objectid, GFP_KERNEL);
	memalloc_nofs_restore(nofs_flag);
	if (!root)
		return ERR_PTR(-ENOMEM);

	root->root_key.objectid = objectid;
	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
	root->root_key.offset = 0;

	leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
				      BTRFS_NESTING_NORMAL);
	if (IS_ERR(leaf)) {
		ret = PTR_ERR(leaf);
		leaf = NULL;
		goto fail_unlock;
	}

	root->node = leaf;
	btrfs_mark_buffer_dirty(leaf);

	root->commit_root = btrfs_root_node(root);
	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);

	btrfs_set_root_flags(&root->root_item, 0);
	btrfs_set_root_limit(&root->root_item, 0);
	btrfs_set_root_bytenr(&root->root_item, leaf->start);
	btrfs_set_root_generation(&root->root_item, trans->transid);
	btrfs_set_root_level(&root->root_item, 0);
	btrfs_set_root_refs(&root->root_item, 1);
	btrfs_set_root_used(&root->root_item, leaf->len);
	btrfs_set_root_last_snapshot(&root->root_item, 0);
	btrfs_set_root_dirid(&root->root_item, 0);
	if (is_fstree(objectid))
		generate_random_guid(root->root_item.uuid);
	else
		export_guid(root->root_item.uuid, &guid_null);
	btrfs_set_root_drop_level(&root->root_item, 0);

	btrfs_tree_unlock(leaf);

	key.objectid = objectid;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = 0;
	ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item);
	if (ret)
		goto fail;

	return root;

fail_unlock:
	if (leaf)
		btrfs_tree_unlock(leaf);
fail:
	btrfs_put_root(root);

	return ERR_PTR(ret);
}

static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
					 struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root;

	root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS);
	if (!root)
		return ERR_PTR(-ENOMEM);

	root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
	root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;

	return root;
}

int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root)
{
	struct extent_buffer *leaf;

	/*
	 * DON'T set SHAREABLE bit for log trees.
	 *
	 * Log trees are not exposed to user space thus can't be snapshotted,
	 * and they go away before a real commit is actually done.
	 *
	 * They do store pointers to file data extents, and those reference
	 * counts still get updated (along with back refs to the log tree).
	 */

	leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
			NULL, 0, 0, 0, BTRFS_NESTING_NORMAL);
	if (IS_ERR(leaf))
		return PTR_ERR(leaf);

	root->node = leaf;

	btrfs_mark_buffer_dirty(root->node);
	btrfs_tree_unlock(root->node);

	return 0;
}

int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *log_root;

	log_root = alloc_log_tree(trans, fs_info);
	if (IS_ERR(log_root))
		return PTR_ERR(log_root);

	if (!btrfs_is_zoned(fs_info)) {
		int ret = btrfs_alloc_log_tree_node(trans, log_root);

		if (ret) {
			btrfs_put_root(log_root);
			return ret;
		}
	}

	WARN_ON(fs_info->log_root_tree);
	fs_info->log_root_tree = log_root;
	return 0;
}

int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *log_root;
	struct btrfs_inode_item *inode_item;
	int ret;

	log_root = alloc_log_tree(trans, fs_info);
	if (IS_ERR(log_root))
		return PTR_ERR(log_root);

	ret = btrfs_alloc_log_tree_node(trans, log_root);
	if (ret) {
		btrfs_put_root(log_root);
		return ret;
	}

	log_root->last_trans = trans->transid;
	log_root->root_key.offset = root->root_key.objectid;

	inode_item = &log_root->root_item.inode;
	btrfs_set_stack_inode_generation(inode_item, 1);
	btrfs_set_stack_inode_size(inode_item, 3);
	btrfs_set_stack_inode_nlink(inode_item, 1);
	btrfs_set_stack_inode_nbytes(inode_item,
				     fs_info->nodesize);
	btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);

	btrfs_set_root_node(&log_root->root_item, log_root->node);

	WARN_ON(root->log_root);
	root->log_root = log_root;
	root->log_transid = 0;
	root->log_transid_committed = -1;
	root->last_log_commit = 0;
	return 0;
}

static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
					      struct btrfs_path *path,
					      struct btrfs_key *key)
{
	struct btrfs_root *root;
	struct btrfs_fs_info *fs_info = tree_root->fs_info;
	u64 generation;
	int ret;
	int level;

	root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS);
	if (!root)
		return ERR_PTR(-ENOMEM);

	ret = btrfs_find_root(tree_root, key, path,
			      &root->root_item, &root->root_key);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		goto fail;
	}

	generation = btrfs_root_generation(&root->root_item);
	level = btrfs_root_level(&root->root_item);
	root->node = read_tree_block(fs_info,
				     btrfs_root_bytenr(&root->root_item),
				     key->objectid, generation, level, NULL);
	if (IS_ERR(root->node)) {
		ret = PTR_ERR(root->node);
		root->node = NULL;
		goto fail;
	}
	if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
		ret = -EIO;
		goto fail;
	}

	/*
	 * For real fs, and not log/reloc trees, root owner must
	 * match its root node owner
	 */
	if (!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state) &&
	    root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
	    root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
	    root->root_key.objectid != btrfs_header_owner(root->node)) {
		btrfs_crit(fs_info,
"root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu",
expect %llu", 1572 root->root_key.objectid, root->node->start, 1573 btrfs_header_owner(root->node), 1574 root->root_key.objectid); 1575 ret = -EUCLEAN; 1576 goto fail; 1577 } 1578 root->commit_root = btrfs_root_node(root); 1579 return root; 1580 fail: 1581 btrfs_put_root(root); 1582 return ERR_PTR(ret); 1583 } 1584 1585 struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, 1586 struct btrfs_key *key) 1587 { 1588 struct btrfs_root *root; 1589 struct btrfs_path *path; 1590 1591 path = btrfs_alloc_path(); 1592 if (!path) 1593 return ERR_PTR(-ENOMEM); 1594 root = read_tree_root_path(tree_root, path, key); 1595 btrfs_free_path(path); 1596 1597 return root; 1598 } 1599 1600 /* 1601 * Initialize subvolume root in-memory structure 1602 * 1603 * @anon_dev: anonymous device to attach to the root, if zero, allocate new 1604 */ 1605 static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) 1606 { 1607 int ret; 1608 unsigned int nofs_flag; 1609 1610 /* 1611 * We might be called under a transaction (e.g. indirect backref 1612 * resolution) which could deadlock if it triggers memory reclaim 1613 */ 1614 nofs_flag = memalloc_nofs_save(); 1615 ret = btrfs_drew_lock_init(&root->snapshot_lock); 1616 memalloc_nofs_restore(nofs_flag); 1617 if (ret) 1618 goto fail; 1619 1620 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID && 1621 !btrfs_is_data_reloc_root(root)) { 1622 set_bit(BTRFS_ROOT_SHAREABLE, &root->state); 1623 btrfs_check_and_init_root_item(&root->root_item); 1624 } 1625 1626 /* 1627 * Don't assign anonymous block device to roots that are not exposed to 1628 * userspace, the id pool is limited to 1M 1629 */ 1630 if (is_fstree(root->root_key.objectid) && 1631 btrfs_root_refs(&root->root_item) > 0) { 1632 if (!anon_dev) { 1633 ret = get_anon_bdev(&root->anon_dev); 1634 if (ret) 1635 goto fail; 1636 } else { 1637 root->anon_dev = anon_dev; 1638 } 1639 } 1640 1641 mutex_lock(&root->objectid_mutex); 1642 ret = btrfs_init_root_free_objectid(root); 1643 if (ret) { 1644 mutex_unlock(&root->objectid_mutex); 1645 goto fail; 1646 } 1647 1648 ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID); 1649 1650 mutex_unlock(&root->objectid_mutex); 1651 1652 return 0; 1653 fail: 1654 /* The caller is responsible to call btrfs_free_fs_root */ 1655 return ret; 1656 } 1657 1658 static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, 1659 u64 root_id) 1660 { 1661 struct btrfs_root *root; 1662 1663 spin_lock(&fs_info->fs_roots_radix_lock); 1664 root = radix_tree_lookup(&fs_info->fs_roots_radix, 1665 (unsigned long)root_id); 1666 if (root) 1667 root = btrfs_grab_root(root); 1668 spin_unlock(&fs_info->fs_roots_radix_lock); 1669 return root; 1670 } 1671 1672 static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info, 1673 u64 objectid) 1674 { 1675 struct btrfs_key key = { 1676 .objectid = objectid, 1677 .type = BTRFS_ROOT_ITEM_KEY, 1678 .offset = 0, 1679 }; 1680 1681 if (objectid == BTRFS_ROOT_TREE_OBJECTID) 1682 return btrfs_grab_root(fs_info->tree_root); 1683 if (objectid == BTRFS_EXTENT_TREE_OBJECTID) 1684 return btrfs_grab_root(btrfs_global_root(fs_info, &key)); 1685 if (objectid == BTRFS_CHUNK_TREE_OBJECTID) 1686 return btrfs_grab_root(fs_info->chunk_root); 1687 if (objectid == BTRFS_DEV_TREE_OBJECTID) 1688 return btrfs_grab_root(fs_info->dev_root); 1689 if (objectid == BTRFS_CSUM_TREE_OBJECTID) 1690 return btrfs_grab_root(btrfs_global_root(fs_info, &key)); 1691 if (objectid == BTRFS_QUOTA_TREE_OBJECTID) 1692 return 
	if (objectid == BTRFS_UUID_TREE_OBJECTID)
		return btrfs_grab_root(fs_info->uuid_root) ?
			fs_info->uuid_root : ERR_PTR(-ENOENT);
	if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) {
		struct btrfs_root *root = btrfs_global_root(fs_info, &key);

		return btrfs_grab_root(root) ? root : ERR_PTR(-ENOENT);
	}
	return NULL;
}

int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
			 struct btrfs_root *root)
{
	int ret;

	ret = radix_tree_preload(GFP_NOFS);
	if (ret)
		return ret;

	spin_lock(&fs_info->fs_roots_radix_lock);
	ret = radix_tree_insert(&fs_info->fs_roots_radix,
				(unsigned long)root->root_key.objectid,
				root);
	if (ret == 0) {
		btrfs_grab_root(root);
		set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
	}
	spin_unlock(&fs_info->fs_roots_radix_lock);
	radix_tree_preload_end();

	return ret;
}

void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info)
{
#ifdef CONFIG_BTRFS_DEBUG
	struct btrfs_root *root;

	while (!list_empty(&fs_info->allocated_roots)) {
		char buf[BTRFS_ROOT_NAME_BUF_LEN];

		root = list_first_entry(&fs_info->allocated_roots,
					struct btrfs_root, leak_list);
		btrfs_err(fs_info, "leaked root %s refcount %d",
			  btrfs_root_name(&root->root_key, buf),
			  refcount_read(&root->refs));
		while (refcount_read(&root->refs) > 1)
			btrfs_put_root(root);
		btrfs_put_root(root);
	}
#endif
}

static void free_global_roots(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root;
	struct rb_node *node;

	while ((node = rb_first_postorder(&fs_info->global_root_tree)) != NULL) {
		root = rb_entry(node, struct btrfs_root, rb_node);
		rb_erase(&root->rb_node, &fs_info->global_root_tree);
		btrfs_put_root(root);
	}
}

void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
{
	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
	percpu_counter_destroy(&fs_info->delalloc_bytes);
	percpu_counter_destroy(&fs_info->ordered_bytes);
	percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
	btrfs_free_csum_hash(fs_info);
	btrfs_free_stripe_hash_table(fs_info);
	btrfs_free_ref_cache(fs_info);
	kfree(fs_info->balance_ctl);
	kfree(fs_info->delayed_root);
	free_global_roots(fs_info);
	btrfs_put_root(fs_info->tree_root);
	btrfs_put_root(fs_info->chunk_root);
	btrfs_put_root(fs_info->dev_root);
	btrfs_put_root(fs_info->quota_root);
	btrfs_put_root(fs_info->uuid_root);
	btrfs_put_root(fs_info->fs_root);
	btrfs_put_root(fs_info->data_reloc_root);
	btrfs_put_root(fs_info->block_group_root);
	btrfs_check_leaked_roots(fs_info);
	btrfs_extent_buffer_leak_debug_check(fs_info);
	kfree(fs_info->super_copy);
	kfree(fs_info->super_for_commit);
	kfree(fs_info->subpage_info);
	kvfree(fs_info);
}

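/*
 * Usage sketch for the root lookup helpers below (illustrative only):
 *
 *	struct btrfs_root *root;
 *
 *	root = btrfs_get_fs_root(fs_info, objectid, true);
 *	if (IS_ERR(root))
 *		return PTR_ERR(root);
 *	... use the root ...
 *	btrfs_put_root(root);
 *
 * Every successful lookup returns a referenced root that the caller must
 * drop with btrfs_put_root().
 */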
/*
 * Get an in-memory reference of a root structure.
 *
 * For essential trees like root/extent tree, we grab it from fs_info directly.
 * For subvolume trees, we check the cached filesystem roots first. If not
 * found, then read it from disk and add it to cached fs roots.
 *
 * Caller should release the root by calling btrfs_put_root() after the usage.
 *
 * NOTE: Reloc and log trees can't be read by this function as they share the
 *       same root objectid.
 *
 * @objectid:	root id
 * @anon_dev:	preallocated anonymous block device number for new roots,
 *		pass 0 for a new allocation.
 * @check_ref:	whether to check root item references, if true, return -ENOENT
 *		for orphan roots
 */
static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
					     u64 objectid, dev_t anon_dev,
					     bool check_ref)
{
	struct btrfs_root *root;
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	root = btrfs_get_global_root(fs_info, objectid);
	if (root)
		return root;
again:
	root = btrfs_lookup_fs_root(fs_info, objectid);
	if (root) {
		/* Shouldn't get preallocated anon_dev for cached roots */
		ASSERT(!anon_dev);
		if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
			btrfs_put_root(root);
			return ERR_PTR(-ENOENT);
		}
		return root;
	}

	key.objectid = objectid;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;
	root = btrfs_read_tree_root(fs_info->tree_root, &key);
	if (IS_ERR(root))
		return root;

	if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
		ret = -ENOENT;
		goto fail;
	}

	ret = btrfs_init_fs_root(root, anon_dev);
	if (ret)
		goto fail;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto fail;
	}
	key.objectid = BTRFS_ORPHAN_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = objectid;

	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
	btrfs_free_path(path);
	if (ret < 0)
		goto fail;
	if (ret == 0)
		set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);

	ret = btrfs_insert_fs_root(fs_info, root);
	if (ret) {
		if (ret == -EEXIST) {
			btrfs_put_root(root);
			goto again;
		}
		goto fail;
	}
	return root;
fail:
	/*
	 * If our caller provided us an anonymous device, then it's their
	 * responsibility to free it in case we fail. So we have to set our
	 * root's anon_dev to 0 to avoid a double free, once by btrfs_put_root()
	 * and once again by our caller.
1878 */ 1879 if (anon_dev) 1880 root->anon_dev = 0; 1881 btrfs_put_root(root); 1882 return ERR_PTR(ret); 1883 } 1884 1885 /* 1886 * Get in-memory reference of a root structure 1887 * 1888 * @objectid: tree objectid 1889 * @check_ref: if set, verify that the tree exists and the item has at least 1890 * one reference 1891 */ 1892 struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, 1893 u64 objectid, bool check_ref) 1894 { 1895 return btrfs_get_root_ref(fs_info, objectid, 0, check_ref); 1896 } 1897 1898 /* 1899 * Get in-memory reference of a root structure, created as new, optionally pass 1900 * the anonymous block device id 1901 * 1902 * @objectid: tree objectid 1903 * @anon_dev: if zero, allocate a new anonymous block device or use the 1904 * parameter value 1905 */ 1906 struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info, 1907 u64 objectid, dev_t anon_dev) 1908 { 1909 return btrfs_get_root_ref(fs_info, objectid, anon_dev, true); 1910 } 1911 1912 /* 1913 * btrfs_get_fs_root_commit_root - return a root for the given objectid 1914 * @fs_info: the fs_info 1915 * @objectid: the objectid we need to lookup 1916 * 1917 * This is exclusively used for backref walking, and exists specifically because 1918 * of how qgroups does lookups. Qgroups will do a backref lookup at delayed ref 1919 * creation time, which means we may have to read the tree_root in order to look 1920 * up a fs root that is not in memory. If the root is not in memory we will 1921 * read the tree root commit root and look up the fs root from there. This is a 1922 * temporary root, it will not be inserted into the radix tree as it doesn't 1923 * have the most uptodate information, it'll simply be discarded once the 1924 * backref code is finished using the root. 1925 */ 1926 struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info, 1927 struct btrfs_path *path, 1928 u64 objectid) 1929 { 1930 struct btrfs_root *root; 1931 struct btrfs_key key; 1932 1933 ASSERT(path->search_commit_root && path->skip_locking); 1934 1935 /* 1936 * This can return -ENOENT if we ask for a root that doesn't exist, but 1937 * since this is called via the backref walking code we won't be looking 1938 * up a root that doesn't exist, unless there's corruption. So if root 1939 * != NULL just return it. 1940 */ 1941 root = btrfs_get_global_root(fs_info, objectid); 1942 if (root) 1943 return root; 1944 1945 root = btrfs_lookup_fs_root(fs_info, objectid); 1946 if (root) 1947 return root; 1948 1949 key.objectid = objectid; 1950 key.type = BTRFS_ROOT_ITEM_KEY; 1951 key.offset = (u64)-1; 1952 root = read_tree_root_path(fs_info->tree_root, path, &key); 1953 btrfs_release_path(path); 1954 1955 return root; 1956 } 1957 1958 /* 1959 * called by the kthread helper functions to finally call the bio end_io 1960 * functions. 
This is where read checksum verification actually happens 1961 */ 1962 static void end_workqueue_fn(struct btrfs_work *work) 1963 { 1964 struct bio *bio; 1965 struct btrfs_end_io_wq *end_io_wq; 1966 1967 end_io_wq = container_of(work, struct btrfs_end_io_wq, work); 1968 bio = end_io_wq->bio; 1969 1970 bio->bi_status = end_io_wq->status; 1971 bio->bi_private = end_io_wq->private; 1972 bio->bi_end_io = end_io_wq->end_io; 1973 bio_endio(bio); 1974 kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq); 1975 } 1976 1977 static int cleaner_kthread(void *arg) 1978 { 1979 struct btrfs_fs_info *fs_info = arg; 1980 int again; 1981 1982 while (1) { 1983 again = 0; 1984 1985 set_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags); 1986 1987 /* Make the cleaner go to sleep early. */ 1988 if (btrfs_need_cleaner_sleep(fs_info)) 1989 goto sleep; 1990 1991 /* 1992 * Do not do anything if we might cause open_ctree() to block 1993 * before we have finished mounting the filesystem. 1994 */ 1995 if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) 1996 goto sleep; 1997 1998 if (!mutex_trylock(&fs_info->cleaner_mutex)) 1999 goto sleep; 2000 2001 /* 2002 * Avoid the problem that we change the status of the fs 2003 * during the above check and trylock. 2004 */ 2005 if (btrfs_need_cleaner_sleep(fs_info)) { 2006 mutex_unlock(&fs_info->cleaner_mutex); 2007 goto sleep; 2008 } 2009 2010 btrfs_run_delayed_iputs(fs_info); 2011 2012 again = btrfs_clean_one_deleted_snapshot(fs_info); 2013 mutex_unlock(&fs_info->cleaner_mutex); 2014 2015 /* 2016 * The defragger has dealt with the R/O remount and umount, 2017 * needn't do anything special here. 2018 */ 2019 btrfs_run_defrag_inodes(fs_info); 2020 2021 /* 2022 * Acquires fs_info->reclaim_bgs_lock to avoid racing 2023 * with relocation (btrfs_relocate_chunk) and relocation 2024 * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group) 2025 * after acquiring fs_info->reclaim_bgs_lock. So we 2026 * can't hold, nor need to, fs_info->cleaner_mutex when deleting 2027 * unused block groups. 2028 */ 2029 btrfs_delete_unused_bgs(fs_info); 2030 2031 /* 2032 * Reclaim block groups in the reclaim_bgs list after we deleted 2033 * all unused block_groups. This possibly gives us some more free 2034 * space. 
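		 * (Block groups get queued on the reclaim_bgs list elsewhere,
		 * e.g. based on the configurable fs_info->bg_reclaim_threshold;
		 * this call only processes entries already on the list.)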
2035 */ 2036 btrfs_reclaim_bgs(fs_info); 2037 sleep: 2038 clear_and_wake_up_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags); 2039 if (kthread_should_park()) 2040 kthread_parkme(); 2041 if (kthread_should_stop()) 2042 return 0; 2043 if (!again) { 2044 set_current_state(TASK_INTERRUPTIBLE); 2045 schedule(); 2046 __set_current_state(TASK_RUNNING); 2047 } 2048 } 2049 } 2050 2051 static int transaction_kthread(void *arg) 2052 { 2053 struct btrfs_root *root = arg; 2054 struct btrfs_fs_info *fs_info = root->fs_info; 2055 struct btrfs_trans_handle *trans; 2056 struct btrfs_transaction *cur; 2057 u64 transid; 2058 time64_t delta; 2059 unsigned long delay; 2060 bool cannot_commit; 2061 2062 do { 2063 cannot_commit = false; 2064 delay = msecs_to_jiffies(fs_info->commit_interval * 1000); 2065 mutex_lock(&fs_info->transaction_kthread_mutex); 2066 2067 spin_lock(&fs_info->trans_lock); 2068 cur = fs_info->running_transaction; 2069 if (!cur) { 2070 spin_unlock(&fs_info->trans_lock); 2071 goto sleep; 2072 } 2073 2074 delta = ktime_get_seconds() - cur->start_time; 2075 if (!test_and_clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags) && 2076 cur->state < TRANS_STATE_COMMIT_START && 2077 delta < fs_info->commit_interval) { 2078 spin_unlock(&fs_info->trans_lock); 2079 delay -= msecs_to_jiffies((delta - 1) * 1000); 2080 delay = min(delay, 2081 msecs_to_jiffies(fs_info->commit_interval * 1000)); 2082 goto sleep; 2083 } 2084 transid = cur->transid; 2085 spin_unlock(&fs_info->trans_lock); 2086 2087 /* If the file system is aborted, this will always fail. */ 2088 trans = btrfs_attach_transaction(root); 2089 if (IS_ERR(trans)) { 2090 if (PTR_ERR(trans) != -ENOENT) 2091 cannot_commit = true; 2092 goto sleep; 2093 } 2094 if (transid == trans->transid) { 2095 btrfs_commit_transaction(trans); 2096 } else { 2097 btrfs_end_transaction(trans); 2098 } 2099 sleep: 2100 wake_up_process(fs_info->cleaner_kthread); 2101 mutex_unlock(&fs_info->transaction_kthread_mutex); 2102 2103 if (BTRFS_FS_ERROR(fs_info)) 2104 btrfs_cleanup_transaction(fs_info); 2105 if (!kthread_should_stop() && 2106 (!btrfs_transaction_blocked(fs_info) || 2107 cannot_commit)) 2108 schedule_timeout_interruptible(delay); 2109 } while (!kthread_should_stop()); 2110 return 0; 2111 } 2112 2113 /* 2114 * This will find the highest generation in the array of root backups. The 2115 * index of the highest array is returned, or -EINVAL if we can't find 2116 * anything. 2117 * 2118 * We check to make sure the array is valid by comparing the 2119 * generation of the latest root in the array with the generation 2120 * in the super block. If they don't match we pitch it. 2121 */ 2122 static int find_newest_super_backup(struct btrfs_fs_info *info) 2123 { 2124 const u64 newest_gen = btrfs_super_generation(info->super_copy); 2125 u64 cur; 2126 struct btrfs_root_backup *root_backup; 2127 int i; 2128 2129 for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { 2130 root_backup = info->super_copy->super_roots + i; 2131 cur = btrfs_backup_tree_root_gen(root_backup); 2132 if (cur == newest_gen) 2133 return i; 2134 } 2135 2136 return -EINVAL; 2137 } 2138 2139 /* 2140 * copy all the root pointers into the super backup array. 
2141 * this will bump the backup pointer by one when it is 2142 * done 2143 */ 2144 static void backup_super_roots(struct btrfs_fs_info *info) 2145 { 2146 const int next_backup = info->backup_root_index; 2147 struct btrfs_root_backup *root_backup; 2148 2149 root_backup = info->super_for_commit->super_roots + next_backup; 2150 2151 /* 2152 * make sure all of our padding and empty slots get zero filled 2153 * regardless of which ones we use today 2154 */ 2155 memset(root_backup, 0, sizeof(*root_backup)); 2156 2157 info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS; 2158 2159 btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start); 2160 btrfs_set_backup_tree_root_gen(root_backup, 2161 btrfs_header_generation(info->tree_root->node)); 2162 2163 btrfs_set_backup_tree_root_level(root_backup, 2164 btrfs_header_level(info->tree_root->node)); 2165 2166 btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start); 2167 btrfs_set_backup_chunk_root_gen(root_backup, 2168 btrfs_header_generation(info->chunk_root->node)); 2169 btrfs_set_backup_chunk_root_level(root_backup, 2170 btrfs_header_level(info->chunk_root->node)); 2171 2172 if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) { 2173 btrfs_set_backup_block_group_root(root_backup, 2174 info->block_group_root->node->start); 2175 btrfs_set_backup_block_group_root_gen(root_backup, 2176 btrfs_header_generation(info->block_group_root->node)); 2177 btrfs_set_backup_block_group_root_level(root_backup, 2178 btrfs_header_level(info->block_group_root->node)); 2179 } else { 2180 struct btrfs_root *extent_root = btrfs_extent_root(info, 0); 2181 struct btrfs_root *csum_root = btrfs_csum_root(info, 0); 2182 2183 btrfs_set_backup_extent_root(root_backup, 2184 extent_root->node->start); 2185 btrfs_set_backup_extent_root_gen(root_backup, 2186 btrfs_header_generation(extent_root->node)); 2187 btrfs_set_backup_extent_root_level(root_backup, 2188 btrfs_header_level(extent_root->node)); 2189 2190 btrfs_set_backup_csum_root(root_backup, csum_root->node->start); 2191 btrfs_set_backup_csum_root_gen(root_backup, 2192 btrfs_header_generation(csum_root->node)); 2193 btrfs_set_backup_csum_root_level(root_backup, 2194 btrfs_header_level(csum_root->node)); 2195 } 2196 2197 /* 2198 * we might commit during log recovery, which happens before we set 2199 * the fs_root. Make sure it is valid before we fill it in. 
2200 */ 2201 if (info->fs_root && info->fs_root->node) { 2202 btrfs_set_backup_fs_root(root_backup, 2203 info->fs_root->node->start); 2204 btrfs_set_backup_fs_root_gen(root_backup, 2205 btrfs_header_generation(info->fs_root->node)); 2206 btrfs_set_backup_fs_root_level(root_backup, 2207 btrfs_header_level(info->fs_root->node)); 2208 } 2209 2210 btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start); 2211 btrfs_set_backup_dev_root_gen(root_backup, 2212 btrfs_header_generation(info->dev_root->node)); 2213 btrfs_set_backup_dev_root_level(root_backup, 2214 btrfs_header_level(info->dev_root->node)); 2215 2216 btrfs_set_backup_total_bytes(root_backup, 2217 btrfs_super_total_bytes(info->super_copy)); 2218 btrfs_set_backup_bytes_used(root_backup, 2219 btrfs_super_bytes_used(info->super_copy)); 2220 btrfs_set_backup_num_devices(root_backup, 2221 btrfs_super_num_devices(info->super_copy)); 2222 2223 /* 2224 * if we don't copy this out to the super_copy, it won't get remembered 2225 * for the next commit 2226 */ 2227 memcpy(&info->super_copy->super_roots, 2228 &info->super_for_commit->super_roots, 2229 sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS); 2230 } 2231 2232 /* 2233 * read_backup_root - Reads a backup root based on the passed priority. Prio 0 2234 * is the newest, prio 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots 2235 * 2236 * fs_info - filesystem whose backup roots need to be read 2237 * priority - priority of backup root required 2238 * 2239 * Returns backup root index on success and -EINVAL otherwise. 2240 */ 2241 static int read_backup_root(struct btrfs_fs_info *fs_info, u8 priority) 2242 { 2243 int backup_index = find_newest_super_backup(fs_info); 2244 struct btrfs_super_block *super = fs_info->super_copy; 2245 struct btrfs_root_backup *root_backup; 2246 2247 if (priority < BTRFS_NUM_BACKUP_ROOTS && backup_index >= 0) { 2248 if (priority == 0) 2249 return backup_index; 2250 2251 backup_index = backup_index + BTRFS_NUM_BACKUP_ROOTS - priority; 2252 backup_index %= BTRFS_NUM_BACKUP_ROOTS; 2253 } else { 2254 return -EINVAL; 2255 } 2256 2257 root_backup = super->super_roots + backup_index; 2258 2259 btrfs_set_super_generation(super, 2260 btrfs_backup_tree_root_gen(root_backup)); 2261 btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup)); 2262 btrfs_set_super_root_level(super, 2263 btrfs_backup_tree_root_level(root_backup)); 2264 btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup)); 2265 2266 /* 2267 * Fixme: the total bytes and num_devices need to match or we should 2268 * need a fsck 2269 */ 2270 btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup)); 2271 btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup)); 2272 2273 return backup_index; 2274 } 2275 2276 /* helper to cleanup workers */ 2277 static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) 2278 { 2279 btrfs_destroy_workqueue(fs_info->fixup_workers); 2280 btrfs_destroy_workqueue(fs_info->delalloc_workers); 2281 btrfs_destroy_workqueue(fs_info->hipri_workers); 2282 btrfs_destroy_workqueue(fs_info->workers); 2283 btrfs_destroy_workqueue(fs_info->endio_workers); 2284 btrfs_destroy_workqueue(fs_info->endio_raid56_workers); 2285 if (fs_info->rmw_workers) 2286 destroy_workqueue(fs_info->rmw_workers); 2287 btrfs_destroy_workqueue(fs_info->endio_write_workers); 2288 btrfs_destroy_workqueue(fs_info->endio_freespace_worker); 2289 btrfs_destroy_workqueue(fs_info->delayed_workers); 2290 btrfs_destroy_workqueue(fs_info->caching_workers); 2291 
btrfs_destroy_workqueue(fs_info->flush_workers); 2292 btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers); 2293 if (fs_info->discard_ctl.discard_workers) 2294 destroy_workqueue(fs_info->discard_ctl.discard_workers); 2295 /* 2296 * Now that all other work queues are destroyed, we can safely destroy 2297 * the queues used for metadata I/O, since tasks from those other work 2298 * queues can do metadata I/O operations. 2299 */ 2300 btrfs_destroy_workqueue(fs_info->endio_meta_workers); 2301 btrfs_destroy_workqueue(fs_info->endio_meta_write_workers); 2302 } 2303 2304 static void free_root_extent_buffers(struct btrfs_root *root) 2305 { 2306 if (root) { 2307 free_extent_buffer(root->node); 2308 free_extent_buffer(root->commit_root); 2309 root->node = NULL; 2310 root->commit_root = NULL; 2311 } 2312 } 2313 2314 static void free_global_root_pointers(struct btrfs_fs_info *fs_info) 2315 { 2316 struct btrfs_root *root, *tmp; 2317 2318 rbtree_postorder_for_each_entry_safe(root, tmp, 2319 &fs_info->global_root_tree, 2320 rb_node) 2321 free_root_extent_buffers(root); 2322 } 2323 2324 /* helper to cleanup tree roots */ 2325 static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root) 2326 { 2327 free_root_extent_buffers(info->tree_root); 2328 2329 free_global_root_pointers(info); 2330 free_root_extent_buffers(info->dev_root); 2331 free_root_extent_buffers(info->quota_root); 2332 free_root_extent_buffers(info->uuid_root); 2333 free_root_extent_buffers(info->fs_root); 2334 free_root_extent_buffers(info->data_reloc_root); 2335 free_root_extent_buffers(info->block_group_root); 2336 if (free_chunk_root) 2337 free_root_extent_buffers(info->chunk_root); 2338 } 2339 2340 void btrfs_put_root(struct btrfs_root *root) 2341 { 2342 if (!root) 2343 return; 2344 2345 if (refcount_dec_and_test(&root->refs)) { 2346 WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); 2347 WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)); 2348 if (root->anon_dev) 2349 free_anon_bdev(root->anon_dev); 2350 btrfs_drew_lock_destroy(&root->snapshot_lock); 2351 free_root_extent_buffers(root); 2352 #ifdef CONFIG_BTRFS_DEBUG 2353 spin_lock(&root->fs_info->fs_roots_radix_lock); 2354 list_del_init(&root->leak_list); 2355 spin_unlock(&root->fs_info->fs_roots_radix_lock); 2356 #endif 2357 kfree(root); 2358 } 2359 } 2360 2361 void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) 2362 { 2363 int ret; 2364 struct btrfs_root *gang[8]; 2365 int i; 2366 2367 while (!list_empty(&fs_info->dead_roots)) { 2368 gang[0] = list_entry(fs_info->dead_roots.next, 2369 struct btrfs_root, root_list); 2370 list_del(&gang[0]->root_list); 2371 2372 if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) 2373 btrfs_drop_and_free_fs_root(fs_info, gang[0]); 2374 btrfs_put_root(gang[0]); 2375 } 2376 2377 while (1) { 2378 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, 2379 (void **)gang, 0, 2380 ARRAY_SIZE(gang)); 2381 if (!ret) 2382 break; 2383 for (i = 0; i < ret; i++) 2384 btrfs_drop_and_free_fs_root(fs_info, gang[i]); 2385 } 2386 } 2387 2388 static void btrfs_init_scrub(struct btrfs_fs_info *fs_info) 2389 { 2390 mutex_init(&fs_info->scrub_lock); 2391 atomic_set(&fs_info->scrubs_running, 0); 2392 atomic_set(&fs_info->scrub_pause_req, 0); 2393 atomic_set(&fs_info->scrubs_paused, 0); 2394 atomic_set(&fs_info->scrub_cancel_req, 0); 2395 init_waitqueue_head(&fs_info->scrub_pause_wait); 2396 refcount_set(&fs_info->scrub_workers_refcnt, 0); 2397 } 2398 2399 static void btrfs_init_balance(struct btrfs_fs_info *fs_info) 2400 { 2401 
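	/*
	 * Balance (and relocation cancel) state: locking, pause/cancel
	 * request counters and the wait queue for state changes.
	 */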
spin_lock_init(&fs_info->balance_lock); 2402 mutex_init(&fs_info->balance_mutex); 2403 atomic_set(&fs_info->balance_pause_req, 0); 2404 atomic_set(&fs_info->balance_cancel_req, 0); 2405 fs_info->balance_ctl = NULL; 2406 init_waitqueue_head(&fs_info->balance_wait_q); 2407 atomic_set(&fs_info->reloc_cancel_req, 0); 2408 } 2409 2410 static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) 2411 { 2412 struct inode *inode = fs_info->btree_inode; 2413 2414 inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; 2415 set_nlink(inode, 1); 2416 /* 2417 * we set the i_size on the btree inode to the max possible int. 2418 * the real end of the address space is determined by all of 2419 * the devices in the system 2420 */ 2421 inode->i_size = OFFSET_MAX; 2422 inode->i_mapping->a_ops = &btree_aops; 2423 2424 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); 2425 extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, 2426 IO_TREE_BTREE_INODE_IO, inode); 2427 BTRFS_I(inode)->io_tree.track_uptodate = false; 2428 extent_map_tree_init(&BTRFS_I(inode)->extent_tree); 2429 2430 BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root); 2431 memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key)); 2432 set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); 2433 btrfs_insert_inode_hash(inode); 2434 } 2435 2436 static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info) 2437 { 2438 mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); 2439 init_rwsem(&fs_info->dev_replace.rwsem); 2440 init_waitqueue_head(&fs_info->dev_replace.replace_wait); 2441 } 2442 2443 static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) 2444 { 2445 spin_lock_init(&fs_info->qgroup_lock); 2446 mutex_init(&fs_info->qgroup_ioctl_lock); 2447 fs_info->qgroup_tree = RB_ROOT; 2448 INIT_LIST_HEAD(&fs_info->dirty_qgroups); 2449 fs_info->qgroup_seq = 1; 2450 fs_info->qgroup_ulist = NULL; 2451 fs_info->qgroup_rescan_running = false; 2452 mutex_init(&fs_info->qgroup_rescan_lock); 2453 } 2454 2455 static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) 2456 { 2457 u32 max_active = fs_info->thread_pool_size; 2458 unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; 2459 2460 fs_info->workers = 2461 btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16); 2462 fs_info->hipri_workers = 2463 btrfs_alloc_workqueue(fs_info, "worker-high", 2464 flags | WQ_HIGHPRI, max_active, 16); 2465 2466 fs_info->delalloc_workers = 2467 btrfs_alloc_workqueue(fs_info, "delalloc", 2468 flags, max_active, 2); 2469 2470 fs_info->flush_workers = 2471 btrfs_alloc_workqueue(fs_info, "flush_delalloc", 2472 flags, max_active, 0); 2473 2474 fs_info->caching_workers = 2475 btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0); 2476 2477 fs_info->fixup_workers = 2478 btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0); 2479 2480 /* 2481 * endios are largely parallel and should have a very 2482 * low idle thresh 2483 */ 2484 fs_info->endio_workers = 2485 btrfs_alloc_workqueue(fs_info, "endio", flags, max_active, 4); 2486 fs_info->endio_meta_workers = 2487 btrfs_alloc_workqueue(fs_info, "endio-meta", flags, 2488 max_active, 4); 2489 fs_info->endio_meta_write_workers = 2490 btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags, 2491 max_active, 2); 2492 fs_info->endio_raid56_workers = 2493 btrfs_alloc_workqueue(fs_info, "endio-raid56", flags, 2494 max_active, 4); 2495 fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active); 2496 fs_info->endio_write_workers = 2497 btrfs_alloc_workqueue(fs_info, 
"endio-write", flags, 2498 max_active, 2); 2499 fs_info->endio_freespace_worker = 2500 btrfs_alloc_workqueue(fs_info, "freespace-write", flags, 2501 max_active, 0); 2502 fs_info->delayed_workers = 2503 btrfs_alloc_workqueue(fs_info, "delayed-meta", flags, 2504 max_active, 0); 2505 fs_info->qgroup_rescan_workers = 2506 btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0); 2507 fs_info->discard_ctl.discard_workers = 2508 alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1); 2509 2510 if (!(fs_info->workers && fs_info->hipri_workers && 2511 fs_info->delalloc_workers && fs_info->flush_workers && 2512 fs_info->endio_workers && fs_info->endio_meta_workers && 2513 fs_info->endio_meta_write_workers && 2514 fs_info->endio_write_workers && fs_info->endio_raid56_workers && 2515 fs_info->endio_freespace_worker && fs_info->rmw_workers && 2516 fs_info->caching_workers && fs_info->fixup_workers && 2517 fs_info->delayed_workers && fs_info->qgroup_rescan_workers && 2518 fs_info->discard_ctl.discard_workers)) { 2519 return -ENOMEM; 2520 } 2521 2522 return 0; 2523 } 2524 2525 static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) 2526 { 2527 struct crypto_shash *csum_shash; 2528 const char *csum_driver = btrfs_super_csum_driver(csum_type); 2529 2530 csum_shash = crypto_alloc_shash(csum_driver, 0, 0); 2531 2532 if (IS_ERR(csum_shash)) { 2533 btrfs_err(fs_info, "error allocating %s hash for checksum", 2534 csum_driver); 2535 return PTR_ERR(csum_shash); 2536 } 2537 2538 fs_info->csum_shash = csum_shash; 2539 2540 return 0; 2541 } 2542 2543 static int btrfs_replay_log(struct btrfs_fs_info *fs_info, 2544 struct btrfs_fs_devices *fs_devices) 2545 { 2546 int ret; 2547 struct btrfs_root *log_tree_root; 2548 struct btrfs_super_block *disk_super = fs_info->super_copy; 2549 u64 bytenr = btrfs_super_log_root(disk_super); 2550 int level = btrfs_super_log_root_level(disk_super); 2551 2552 if (fs_devices->rw_devices == 0) { 2553 btrfs_warn(fs_info, "log replay required on RO media"); 2554 return -EIO; 2555 } 2556 2557 log_tree_root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, 2558 GFP_KERNEL); 2559 if (!log_tree_root) 2560 return -ENOMEM; 2561 2562 log_tree_root->node = read_tree_block(fs_info, bytenr, 2563 BTRFS_TREE_LOG_OBJECTID, 2564 fs_info->generation + 1, level, 2565 NULL); 2566 if (IS_ERR(log_tree_root->node)) { 2567 btrfs_warn(fs_info, "failed to read log tree"); 2568 ret = PTR_ERR(log_tree_root->node); 2569 log_tree_root->node = NULL; 2570 btrfs_put_root(log_tree_root); 2571 return ret; 2572 } 2573 if (!extent_buffer_uptodate(log_tree_root->node)) { 2574 btrfs_err(fs_info, "failed to read log tree"); 2575 btrfs_put_root(log_tree_root); 2576 return -EIO; 2577 } 2578 2579 /* returns with log_tree_root freed on success */ 2580 ret = btrfs_recover_log_trees(log_tree_root); 2581 if (ret) { 2582 btrfs_handle_fs_error(fs_info, ret, 2583 "Failed to recover log tree"); 2584 btrfs_put_root(log_tree_root); 2585 return ret; 2586 } 2587 2588 if (sb_rdonly(fs_info->sb)) { 2589 ret = btrfs_commit_super(fs_info); 2590 if (ret) 2591 return ret; 2592 } 2593 2594 return 0; 2595 } 2596 2597 static int load_global_roots_objectid(struct btrfs_root *tree_root, 2598 struct btrfs_path *path, u64 objectid, 2599 const char *name) 2600 { 2601 struct btrfs_fs_info *fs_info = tree_root->fs_info; 2602 struct btrfs_root *root; 2603 u64 max_global_id = 0; 2604 int ret; 2605 struct btrfs_key key = { 2606 .objectid = objectid, 2607 .type = BTRFS_ROOT_ITEM_KEY, 2608 .offset = 0, 2609 }; 2610 bool 
found = false; 2611 2612 /* If we have IGNOREDATACSUMS skip loading these roots. */ 2613 if (objectid == BTRFS_CSUM_TREE_OBJECTID && 2614 btrfs_test_opt(fs_info, IGNOREDATACSUMS)) { 2615 set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); 2616 return 0; 2617 } 2618 2619 while (1) { 2620 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); 2621 if (ret < 0) 2622 break; 2623 2624 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 2625 ret = btrfs_next_leaf(tree_root, path); 2626 if (ret) { 2627 if (ret > 0) 2628 ret = 0; 2629 break; 2630 } 2631 } 2632 ret = 0; 2633 2634 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2635 if (key.objectid != objectid) 2636 break; 2637 btrfs_release_path(path); 2638 2639 /* 2640 * Just worry about this for extent tree, it'll be the same for 2641 * everybody. 2642 */ 2643 if (objectid == BTRFS_EXTENT_TREE_OBJECTID) 2644 max_global_id = max(max_global_id, key.offset); 2645 2646 found = true; 2647 root = read_tree_root_path(tree_root, path, &key); 2648 if (IS_ERR(root)) { 2649 if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) 2650 ret = PTR_ERR(root); 2651 break; 2652 } 2653 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); 2654 ret = btrfs_global_root_insert(root); 2655 if (ret) { 2656 btrfs_put_root(root); 2657 break; 2658 } 2659 key.offset++; 2660 } 2661 btrfs_release_path(path); 2662 2663 if (objectid == BTRFS_EXTENT_TREE_OBJECTID) 2664 fs_info->nr_global_roots = max_global_id + 1; 2665 2666 if (!found || ret) { 2667 if (objectid == BTRFS_CSUM_TREE_OBJECTID) 2668 set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); 2669 2670 if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) 2671 ret = ret ? ret : -ENOENT; 2672 else 2673 ret = 0; 2674 btrfs_err(fs_info, "failed to load root %s", name); 2675 } 2676 return ret; 2677 } 2678 2679 static int load_global_roots(struct btrfs_root *tree_root) 2680 { 2681 struct btrfs_path *path; 2682 int ret = 0; 2683 2684 path = btrfs_alloc_path(); 2685 if (!path) 2686 return -ENOMEM; 2687 2688 ret = load_global_roots_objectid(tree_root, path, 2689 BTRFS_EXTENT_TREE_OBJECTID, "extent"); 2690 if (ret) 2691 goto out; 2692 ret = load_global_roots_objectid(tree_root, path, 2693 BTRFS_CSUM_TREE_OBJECTID, "csum"); 2694 if (ret) 2695 goto out; 2696 if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE)) 2697 goto out; 2698 ret = load_global_roots_objectid(tree_root, path, 2699 BTRFS_FREE_SPACE_TREE_OBJECTID, 2700 "free space"); 2701 out: 2702 btrfs_free_path(path); 2703 return ret; 2704 } 2705 2706 static int btrfs_read_roots(struct btrfs_fs_info *fs_info) 2707 { 2708 struct btrfs_root *tree_root = fs_info->tree_root; 2709 struct btrfs_root *root; 2710 struct btrfs_key location; 2711 int ret; 2712 2713 BUG_ON(!fs_info->tree_root); 2714 2715 ret = load_global_roots(tree_root); 2716 if (ret) 2717 return ret; 2718 2719 location.objectid = BTRFS_DEV_TREE_OBJECTID; 2720 location.type = BTRFS_ROOT_ITEM_KEY; 2721 location.offset = 0; 2722 2723 root = btrfs_read_tree_root(tree_root, &location); 2724 if (IS_ERR(root)) { 2725 if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { 2726 ret = PTR_ERR(root); 2727 goto out; 2728 } 2729 } else { 2730 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); 2731 fs_info->dev_root = root; 2732 } 2733 /* Initialize fs_info for all devices in any case */ 2734 btrfs_init_devices_late(fs_info); 2735 2736 /* 2737 * This tree can share blocks with some other fs tree during relocation 2738 * and we need a proper setup by btrfs_get_fs_root 2739 */ 2740 root = btrfs_get_fs_root(tree_root->fs_info, 2741 
BTRFS_DATA_RELOC_TREE_OBJECTID, true); 2742 if (IS_ERR(root)) { 2743 if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { 2744 ret = PTR_ERR(root); 2745 goto out; 2746 } 2747 } else { 2748 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); 2749 fs_info->data_reloc_root = root; 2750 } 2751 2752 location.objectid = BTRFS_QUOTA_TREE_OBJECTID; 2753 root = btrfs_read_tree_root(tree_root, &location); 2754 if (!IS_ERR(root)) { 2755 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); 2756 set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); 2757 fs_info->quota_root = root; 2758 } 2759 2760 location.objectid = BTRFS_UUID_TREE_OBJECTID; 2761 root = btrfs_read_tree_root(tree_root, &location); 2762 if (IS_ERR(root)) { 2763 if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { 2764 ret = PTR_ERR(root); 2765 if (ret != -ENOENT) 2766 goto out; 2767 } 2768 } else { 2769 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); 2770 fs_info->uuid_root = root; 2771 } 2772 2773 return 0; 2774 out: 2775 btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d", 2776 location.objectid, ret); 2777 return ret; 2778 } 2779 2780 /* 2781 * Real super block validation 2782 * NOTE: super csum type and incompat features will not be checked here. 2783 * 2784 * @sb: super block to check 2785 * @mirror_num: the super block number to check its bytenr: 2786 * 0 the primary (1st) sb 2787 * 1, 2 2nd and 3rd backup copy 2788 * -1 skip bytenr check 2789 */ 2790 static int validate_super(struct btrfs_fs_info *fs_info, 2791 struct btrfs_super_block *sb, int mirror_num) 2792 { 2793 u64 nodesize = btrfs_super_nodesize(sb); 2794 u64 sectorsize = btrfs_super_sectorsize(sb); 2795 int ret = 0; 2796 2797 if (btrfs_super_magic(sb) != BTRFS_MAGIC) { 2798 btrfs_err(fs_info, "no valid FS found"); 2799 ret = -EINVAL; 2800 } 2801 if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) { 2802 btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu", 2803 btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); 2804 ret = -EINVAL; 2805 } 2806 if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) { 2807 btrfs_err(fs_info, "tree_root level too big: %d >= %d", 2808 btrfs_super_root_level(sb), BTRFS_MAX_LEVEL); 2809 ret = -EINVAL; 2810 } 2811 if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) { 2812 btrfs_err(fs_info, "chunk_root level too big: %d >= %d", 2813 btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL); 2814 ret = -EINVAL; 2815 } 2816 if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) { 2817 btrfs_err(fs_info, "log_root level too big: %d >= %d", 2818 btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL); 2819 ret = -EINVAL; 2820 } 2821 2822 /* 2823 * Check sectorsize and nodesize first, other check will need it. 2824 * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here. 2825 */ 2826 if (!is_power_of_2(sectorsize) || sectorsize < 4096 || 2827 sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) { 2828 btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize); 2829 ret = -EINVAL; 2830 } 2831 2832 /* 2833 * We only support at most two sectorsizes: 4K and PAGE_SIZE. 2834 * 2835 * We can support 16K sectorsize with 64K page size without problem, 2836 * but such sectorsize/pagesize combination doesn't make much sense. 2837 * 4K will be our future standard, PAGE_SIZE is supported from the very 2838 * beginning. 
2839 */ 2840 if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE)) { 2841 btrfs_err(fs_info, 2842 "sectorsize %llu not yet supported for page size %lu", 2843 sectorsize, PAGE_SIZE); 2844 ret = -EINVAL; 2845 } 2846 2847 if (!is_power_of_2(nodesize) || nodesize < sectorsize || 2848 nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) { 2849 btrfs_err(fs_info, "invalid nodesize %llu", nodesize); 2850 ret = -EINVAL; 2851 } 2852 if (nodesize != le32_to_cpu(sb->__unused_leafsize)) { 2853 btrfs_err(fs_info, "invalid leafsize %u, should be %llu", 2854 le32_to_cpu(sb->__unused_leafsize), nodesize); 2855 ret = -EINVAL; 2856 } 2857 2858 /* Root alignment check */ 2859 if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) { 2860 btrfs_warn(fs_info, "tree_root block unaligned: %llu", 2861 btrfs_super_root(sb)); 2862 ret = -EINVAL; 2863 } 2864 if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) { 2865 btrfs_warn(fs_info, "chunk_root block unaligned: %llu", 2866 btrfs_super_chunk_root(sb)); 2867 ret = -EINVAL; 2868 } 2869 if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) { 2870 btrfs_warn(fs_info, "log_root block unaligned: %llu", 2871 btrfs_super_log_root(sb)); 2872 ret = -EINVAL; 2873 } 2874 2875 if (memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid, 2876 BTRFS_FSID_SIZE)) { 2877 btrfs_err(fs_info, 2878 "superblock fsid doesn't match fsid of fs_devices: %pU != %pU", 2879 fs_info->super_copy->fsid, fs_info->fs_devices->fsid); 2880 ret = -EINVAL; 2881 } 2882 2883 if (btrfs_fs_incompat(fs_info, METADATA_UUID) && 2884 memcmp(fs_info->fs_devices->metadata_uuid, 2885 fs_info->super_copy->metadata_uuid, BTRFS_FSID_SIZE)) { 2886 btrfs_err(fs_info, 2887 "superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU != %pU", 2888 fs_info->super_copy->metadata_uuid, 2889 fs_info->fs_devices->metadata_uuid); 2890 ret = -EINVAL; 2891 } 2892 2893 if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid, 2894 BTRFS_FSID_SIZE) != 0) { 2895 btrfs_err(fs_info, 2896 "dev_item UUID does not match metadata fsid: %pU != %pU", 2897 fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid); 2898 ret = -EINVAL; 2899 } 2900 2901 /* 2902 * Hint to catch really bogus numbers, bitflips or so, more exact checks are 2903 * done later 2904 */ 2905 if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) { 2906 btrfs_err(fs_info, "bytes_used is too small %llu", 2907 btrfs_super_bytes_used(sb)); 2908 ret = -EINVAL; 2909 } 2910 if (!is_power_of_2(btrfs_super_stripesize(sb))) { 2911 btrfs_err(fs_info, "invalid stripesize %u", 2912 btrfs_super_stripesize(sb)); 2913 ret = -EINVAL; 2914 } 2915 if (btrfs_super_num_devices(sb) > (1UL << 31)) 2916 btrfs_warn(fs_info, "suspicious number of devices: %llu", 2917 btrfs_super_num_devices(sb)); 2918 if (btrfs_super_num_devices(sb) == 0) { 2919 btrfs_err(fs_info, "number of devices is 0"); 2920 ret = -EINVAL; 2921 } 2922 2923 if (mirror_num >= 0 && 2924 btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) { 2925 btrfs_err(fs_info, "super offset mismatch %llu != %u", 2926 btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET); 2927 ret = -EINVAL; 2928 } 2929 2930 /* 2931 * Obvious sys_chunk_array corruptions, it must hold at least one key 2932 * and one chunk 2933 */ 2934 if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { 2935 btrfs_err(fs_info, "system chunk array too big %u > %u", 2936 btrfs_super_sys_array_size(sb), 2937 BTRFS_SYSTEM_CHUNK_ARRAY_SIZE); 2938 ret = -EINVAL; 2939 } 2940 if (btrfs_super_sys_array_size(sb) < 
sizeof(struct btrfs_disk_key) 2941 + sizeof(struct btrfs_chunk)) { 2942 btrfs_err(fs_info, "system chunk array too small %u < %zu", 2943 btrfs_super_sys_array_size(sb), 2944 sizeof(struct btrfs_disk_key) 2945 + sizeof(struct btrfs_chunk)); 2946 ret = -EINVAL; 2947 } 2948 2949 /* 2950 * The generation is a global counter, we'll trust it more than the others 2951 * but it's still possible that it's the one that's wrong. 2952 */ 2953 if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb)) 2954 btrfs_warn(fs_info, 2955 "suspicious: generation < chunk_root_generation: %llu < %llu", 2956 btrfs_super_generation(sb), 2957 btrfs_super_chunk_root_generation(sb)); 2958 if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb) 2959 && btrfs_super_cache_generation(sb) != (u64)-1) 2960 btrfs_warn(fs_info, 2961 "suspicious: generation < cache_generation: %llu < %llu", 2962 btrfs_super_generation(sb), 2963 btrfs_super_cache_generation(sb)); 2964 2965 return ret; 2966 } 2967 2968 /* 2969 * Validation of super block at mount time. 2970 * Some checks already done early at mount time, like csum type and incompat 2971 * flags will be skipped. 2972 */ 2973 static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info) 2974 { 2975 return validate_super(fs_info, fs_info->super_copy, 0); 2976 } 2977 2978 /* 2979 * Validation of super block at write time. 2980 * Some checks like bytenr check will be skipped as their values will be 2981 * overwritten soon. 2982 * Extra checks like csum type and incompat flags will be done here. 2983 */ 2984 static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info, 2985 struct btrfs_super_block *sb) 2986 { 2987 int ret; 2988 2989 ret = validate_super(fs_info, sb, -1); 2990 if (ret < 0) 2991 goto out; 2992 if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) { 2993 ret = -EUCLEAN; 2994 btrfs_err(fs_info, "invalid csum type, has %u want %u", 2995 btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32); 2996 goto out; 2997 } 2998 if (btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP) { 2999 ret = -EUCLEAN; 3000 btrfs_err(fs_info, 3001 "invalid incompat flags, has 0x%llx valid mask 0x%llx", 3002 btrfs_super_incompat_flags(sb), 3003 (unsigned long long)BTRFS_FEATURE_INCOMPAT_SUPP); 3004 goto out; 3005 } 3006 out: 3007 if (ret < 0) 3008 btrfs_err(fs_info, 3009 "super block corruption detected before writing it to disk"); 3010 return ret; 3011 } 3012 3013 static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int level) 3014 { 3015 int ret = 0; 3016 3017 root->node = read_tree_block(root->fs_info, bytenr, 3018 root->root_key.objectid, gen, level, NULL); 3019 if (IS_ERR(root->node)) { 3020 ret = PTR_ERR(root->node); 3021 root->node = NULL; 3022 return ret; 3023 } 3024 if (!extent_buffer_uptodate(root->node)) { 3025 free_extent_buffer(root->node); 3026 root->node = NULL; 3027 return -EIO; 3028 } 3029 3030 btrfs_set_root_node(&root->root_item, root->node); 3031 root->commit_root = btrfs_root_node(root); 3032 btrfs_set_root_refs(&root->root_item, 1); 3033 return ret; 3034 } 3035 3036 static int load_important_roots(struct btrfs_fs_info *fs_info) 3037 { 3038 struct btrfs_super_block *sb = fs_info->super_copy; 3039 u64 gen, bytenr; 3040 int level, ret; 3041 3042 bytenr = btrfs_super_root(sb); 3043 gen = btrfs_super_generation(sb); 3044 level = btrfs_super_root_level(sb); 3045 ret = load_super_root(fs_info->tree_root, bytenr, gen, level); 3046 if (ret) { 3047 btrfs_warn(fs_info, "couldn't read tree root"); 3048 return ret; 
3049 } 3050 3051 if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) 3052 return 0; 3053 3054 bytenr = btrfs_super_block_group_root(sb); 3055 gen = btrfs_super_block_group_root_generation(sb); 3056 level = btrfs_super_block_group_root_level(sb); 3057 ret = load_super_root(fs_info->block_group_root, bytenr, gen, level); 3058 if (ret) 3059 btrfs_warn(fs_info, "couldn't read block group root"); 3060 return ret; 3061 } 3062 3063 static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) 3064 { 3065 int backup_index = find_newest_super_backup(fs_info); 3066 struct btrfs_super_block *sb = fs_info->super_copy; 3067 struct btrfs_root *tree_root = fs_info->tree_root; 3068 bool handle_error = false; 3069 int ret = 0; 3070 int i; 3071 3072 if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { 3073 struct btrfs_root *root; 3074 3075 root = btrfs_alloc_root(fs_info, BTRFS_BLOCK_GROUP_TREE_OBJECTID, 3076 GFP_KERNEL); 3077 if (!root) 3078 return -ENOMEM; 3079 fs_info->block_group_root = root; 3080 } 3081 3082 for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { 3083 if (handle_error) { 3084 if (!IS_ERR(tree_root->node)) 3085 free_extent_buffer(tree_root->node); 3086 tree_root->node = NULL; 3087 3088 if (!btrfs_test_opt(fs_info, USEBACKUPROOT)) 3089 break; 3090 3091 free_root_pointers(fs_info, 0); 3092 3093 /* 3094 * Don't use the log in recovery mode, it won't be 3095 * valid 3096 */ 3097 btrfs_set_super_log_root(sb, 0); 3098 3099 /* We can't trust the free space cache either */ 3100 btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE); 3101 3102 ret = read_backup_root(fs_info, i); 3103 backup_index = ret; 3104 if (ret < 0) 3105 return ret; 3106 } 3107 3108 ret = load_important_roots(fs_info); 3109 if (ret) { 3110 handle_error = true; 3111 continue; 3112 } 3113 3114 /* 3115 * No need to hold btrfs_root::objectid_mutex since the fs 3116 * hasn't been fully initialised and we are the only user 3117 */ 3118 ret = btrfs_init_root_free_objectid(tree_root); 3119 if (ret < 0) { 3120 handle_error = true; 3121 continue; 3122 } 3123 3124 ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID); 3125 3126 ret = btrfs_read_roots(fs_info); 3127 if (ret < 0) { 3128 handle_error = true; 3129 continue; 3130 } 3131 3132 /* All successful */ 3133 fs_info->generation = btrfs_header_generation(tree_root->node); 3134 fs_info->last_trans_committed = fs_info->generation; 3135 fs_info->last_reloc_trans = 0; 3136 3137 /* Always begin writing backup roots after the one being used */ 3138 if (backup_index < 0) { 3139 fs_info->backup_root_index = 0; 3140 } else { 3141 fs_info->backup_root_index = backup_index + 1; 3142 fs_info->backup_root_index %= BTRFS_NUM_BACKUP_ROOTS; 3143 } 3144 break; 3145 } 3146 3147 return ret; 3148 } 3149 3150 void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) 3151 { 3152 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); 3153 INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); 3154 INIT_LIST_HEAD(&fs_info->trans_list); 3155 INIT_LIST_HEAD(&fs_info->dead_roots); 3156 INIT_LIST_HEAD(&fs_info->delayed_iputs); 3157 INIT_LIST_HEAD(&fs_info->delalloc_roots); 3158 INIT_LIST_HEAD(&fs_info->caching_block_groups); 3159 spin_lock_init(&fs_info->delalloc_root_lock); 3160 spin_lock_init(&fs_info->trans_lock); 3161 spin_lock_init(&fs_info->fs_roots_radix_lock); 3162 spin_lock_init(&fs_info->delayed_iput_lock); 3163 spin_lock_init(&fs_info->defrag_inodes_lock); 3164 spin_lock_init(&fs_info->super_lock); 3165 spin_lock_init(&fs_info->buffer_lock); 3166 spin_lock_init(&fs_info->unused_bgs_lock); 3167 
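	/*
	 * Locks used by zoned mode allocation: the dedicated tree-log block
	 * group, the active block group list and the data relocation block
	 * group.
	 */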
spin_lock_init(&fs_info->treelog_bg_lock); 3168 spin_lock_init(&fs_info->zone_active_bgs_lock); 3169 spin_lock_init(&fs_info->relocation_bg_lock); 3170 rwlock_init(&fs_info->tree_mod_log_lock); 3171 rwlock_init(&fs_info->global_root_lock); 3172 mutex_init(&fs_info->unused_bg_unpin_mutex); 3173 mutex_init(&fs_info->reclaim_bgs_lock); 3174 mutex_init(&fs_info->reloc_mutex); 3175 mutex_init(&fs_info->delalloc_root_mutex); 3176 mutex_init(&fs_info->zoned_meta_io_lock); 3177 mutex_init(&fs_info->zoned_data_reloc_io_lock); 3178 seqlock_init(&fs_info->profiles_lock); 3179 3180 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); 3181 INIT_LIST_HEAD(&fs_info->space_info); 3182 INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); 3183 INIT_LIST_HEAD(&fs_info->unused_bgs); 3184 INIT_LIST_HEAD(&fs_info->reclaim_bgs); 3185 INIT_LIST_HEAD(&fs_info->zone_active_bgs); 3186 #ifdef CONFIG_BTRFS_DEBUG 3187 INIT_LIST_HEAD(&fs_info->allocated_roots); 3188 INIT_LIST_HEAD(&fs_info->allocated_ebs); 3189 spin_lock_init(&fs_info->eb_leak_lock); 3190 #endif 3191 extent_map_tree_init(&fs_info->mapping_tree); 3192 btrfs_init_block_rsv(&fs_info->global_block_rsv, 3193 BTRFS_BLOCK_RSV_GLOBAL); 3194 btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); 3195 btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK); 3196 btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); 3197 btrfs_init_block_rsv(&fs_info->delayed_block_rsv, 3198 BTRFS_BLOCK_RSV_DELOPS); 3199 btrfs_init_block_rsv(&fs_info->delayed_refs_rsv, 3200 BTRFS_BLOCK_RSV_DELREFS); 3201 3202 atomic_set(&fs_info->async_delalloc_pages, 0); 3203 atomic_set(&fs_info->defrag_running, 0); 3204 atomic_set(&fs_info->nr_delayed_iputs, 0); 3205 atomic64_set(&fs_info->tree_mod_seq, 0); 3206 fs_info->global_root_tree = RB_ROOT; 3207 fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; 3208 fs_info->metadata_ratio = 0; 3209 fs_info->defrag_inodes = RB_ROOT; 3210 atomic64_set(&fs_info->free_chunk_space, 0); 3211 fs_info->tree_mod_log = RB_ROOT; 3212 fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; 3213 fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */ 3214 btrfs_init_ref_verify(fs_info); 3215 3216 fs_info->thread_pool_size = min_t(unsigned long, 3217 num_online_cpus() + 2, 8); 3218 3219 INIT_LIST_HEAD(&fs_info->ordered_roots); 3220 spin_lock_init(&fs_info->ordered_root_lock); 3221 3222 btrfs_init_scrub(fs_info); 3223 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 3224 fs_info->check_integrity_print_mask = 0; 3225 #endif 3226 btrfs_init_balance(fs_info); 3227 btrfs_init_async_reclaim_work(fs_info); 3228 3229 rwlock_init(&fs_info->block_group_cache_lock); 3230 fs_info->block_group_cache_tree = RB_ROOT_CACHED; 3231 3232 extent_io_tree_init(fs_info, &fs_info->excluded_extents, 3233 IO_TREE_FS_EXCLUDED_EXTENTS, NULL); 3234 3235 mutex_init(&fs_info->ordered_operations_mutex); 3236 mutex_init(&fs_info->tree_log_mutex); 3237 mutex_init(&fs_info->chunk_mutex); 3238 mutex_init(&fs_info->transaction_kthread_mutex); 3239 mutex_init(&fs_info->cleaner_mutex); 3240 mutex_init(&fs_info->ro_block_group_mutex); 3241 init_rwsem(&fs_info->commit_root_sem); 3242 init_rwsem(&fs_info->cleanup_work_sem); 3243 init_rwsem(&fs_info->subvol_sem); 3244 sema_init(&fs_info->uuid_tree_rescan_sem, 1); 3245 3246 btrfs_init_dev_replace_locks(fs_info); 3247 btrfs_init_qgroup(fs_info); 3248 btrfs_discard_init(fs_info); 3249 3250 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); 3251 btrfs_init_free_cluster(&fs_info->data_alloc_cluster); 3252 3253 
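	/*
	 * Wait queues used to throttle and order transaction commits and to
	 * wait for async bio submission and delayed iputs to drain.
	 */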
init_waitqueue_head(&fs_info->transaction_throttle); 3254 init_waitqueue_head(&fs_info->transaction_wait); 3255 init_waitqueue_head(&fs_info->transaction_blocked_wait); 3256 init_waitqueue_head(&fs_info->async_submit_wait); 3257 init_waitqueue_head(&fs_info->delayed_iputs_wait); 3258 3259 /* Usable values until the real ones are cached from the superblock */ 3260 fs_info->nodesize = 4096; 3261 fs_info->sectorsize = 4096; 3262 fs_info->sectorsize_bits = ilog2(4096); 3263 fs_info->stripesize = 4096; 3264 3265 spin_lock_init(&fs_info->swapfile_pins_lock); 3266 fs_info->swapfile_pins = RB_ROOT; 3267 3268 fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH; 3269 INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work); 3270 } 3271 3272 static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb) 3273 { 3274 int ret; 3275 3276 fs_info->sb = sb; 3277 sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE; 3278 sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE); 3279 3280 ret = percpu_counter_init(&fs_info->ordered_bytes, 0, GFP_KERNEL); 3281 if (ret) 3282 return ret; 3283 3284 ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); 3285 if (ret) 3286 return ret; 3287 3288 fs_info->dirty_metadata_batch = PAGE_SIZE * 3289 (1 + ilog2(nr_cpu_ids)); 3290 3291 ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL); 3292 if (ret) 3293 return ret; 3294 3295 ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0, 3296 GFP_KERNEL); 3297 if (ret) 3298 return ret; 3299 3300 fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), 3301 GFP_KERNEL); 3302 if (!fs_info->delayed_root) 3303 return -ENOMEM; 3304 btrfs_init_delayed_root(fs_info->delayed_root); 3305 3306 if (sb_rdonly(sb)) 3307 set_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state); 3308 3309 return btrfs_alloc_stripe_hash_table(fs_info); 3310 } 3311 3312 static int btrfs_uuid_rescan_kthread(void *data) 3313 { 3314 struct btrfs_fs_info *fs_info = data; 3315 int ret; 3316 3317 /* 3318 * 1st step is to iterate through the existing UUID tree and 3319 * to delete all entries that contain outdated data. 3320 * 2nd step is to add all missing entries to the UUID tree. 3321 */ 3322 ret = btrfs_uuid_tree_iterate(fs_info); 3323 if (ret < 0) { 3324 if (ret != -EINTR) 3325 btrfs_warn(fs_info, "iterating uuid_tree failed %d", 3326 ret); 3327 up(&fs_info->uuid_tree_rescan_sem); 3328 return ret; 3329 } 3330 return btrfs_uuid_scan_kthread(data); 3331 } 3332 3333 static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) 3334 { 3335 struct task_struct *task; 3336 3337 down(&fs_info->uuid_tree_rescan_sem); 3338 task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); 3339 if (IS_ERR(task)) { 3340 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 3341 btrfs_warn(fs_info, "failed to start uuid_rescan task"); 3342 up(&fs_info->uuid_tree_rescan_sem); 3343 return PTR_ERR(task); 3344 } 3345 3346 return 0; 3347 } 3348 3349 /* 3350 * Some options only have meaning at mount time and shouldn't persist across 3351 * remounts, or be displayed. Clear these at the end of mount and remount 3352 * code paths. 3353 */ 3354 void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info) 3355 { 3356 btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT); 3357 btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE); 3358 } 3359 3360 /* 3361 * Mounting logic specific to read-write file systems. Shared by open_ctree 3362 * and btrfs_remount when remounting from read-only to read-write. 
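 *
 * This resumes or finishes work that needs a writable filesystem: clearing or
 * creating the free space tree, orphan and relocation cleanup, resuming
 * balance, dev replace and qgroup rescan, and creating the UUID tree if it is
 * missing.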
 */
int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
{
	int ret;
	const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE);
	bool clear_free_space_tree = false;

	if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
	    btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
		clear_free_space_tree = true;
	} else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
		   !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
		btrfs_warn(fs_info, "free space tree is invalid");
		clear_free_space_tree = true;
	}

	if (clear_free_space_tree) {
		btrfs_info(fs_info, "clearing free space tree");
		ret = btrfs_clear_free_space_tree(fs_info);
		if (ret) {
			btrfs_warn(fs_info,
				   "failed to clear free space tree: %d", ret);
			goto out;
		}
	}

	/*
	 * btrfs_find_orphan_roots() is responsible for finding all the dead
	 * roots (with 0 refs), flagging them with BTRFS_ROOT_DEAD_TREE and
	 * loading them into the fs_info->fs_roots_radix tree. This must be done
	 * before calling btrfs_orphan_cleanup() on the tree root. If we don't
	 * do it first, then btrfs_orphan_cleanup() will delete a dead root's
	 * orphan item before the root's tree is deleted - this means that if we
	 * unmount or crash before the deletion completes, on the next mount we
	 * will not delete what remains of the tree because the orphan item does
	 * not exist anymore, which is what tells us we have a pending deletion.
	 */
	ret = btrfs_find_orphan_roots(fs_info);
	if (ret)
		goto out;

	ret = btrfs_cleanup_fs_roots(fs_info);
	if (ret)
		goto out;

	down_read(&fs_info->cleanup_work_sem);
	if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
	    (ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
		up_read(&fs_info->cleanup_work_sem);
		goto out;
	}
	up_read(&fs_info->cleanup_work_sem);

	mutex_lock(&fs_info->cleaner_mutex);
	ret = btrfs_recover_relocation(fs_info);
	mutex_unlock(&fs_info->cleaner_mutex);
	if (ret < 0) {
		btrfs_warn(fs_info, "failed to recover relocation: %d", ret);
		goto out;
	}

	if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) &&
	    !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
		btrfs_info(fs_info, "creating free space tree");
		ret = btrfs_create_free_space_tree(fs_info);
		if (ret) {
			btrfs_warn(fs_info,
				   "failed to create free space tree: %d", ret);
			goto out;
		}
	}

	if (cache_opt != btrfs_free_space_cache_v1_active(fs_info)) {
		ret = btrfs_set_free_space_cache_v1_active(fs_info, cache_opt);
		if (ret)
			goto out;
	}

	ret = btrfs_resume_balance_async(fs_info);
	if (ret)
		goto out;

	ret = btrfs_resume_dev_replace_async(fs_info);
	if (ret) {
		btrfs_warn(fs_info, "failed to resume dev_replace");
		goto out;
	}

	btrfs_qgroup_rescan_resume(fs_info);

	if (!fs_info->uuid_root) {
		btrfs_info(fs_info, "creating UUID tree");
		ret = btrfs_create_uuid_tree(fs_info);
		if (ret) {
			btrfs_warn(fs_info,
				   "failed to create the UUID tree %d", ret);
			goto out;
		}
	}

out:
	return ret;
}

int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
		      char *options)
{
	u32 sectorsize;
	u32 nodesize;
	u32 stripesize;
	u64 generation;
	u64 features;
	u16 csum_type;
	struct btrfs_super_block *disk_super;
3477 struct btrfs_fs_info *fs_info = btrfs_sb(sb); 3478 struct btrfs_root *tree_root; 3479 struct btrfs_root *chunk_root; 3480 int ret; 3481 int err = -EINVAL; 3482 int level; 3483 3484 ret = init_mount_fs_info(fs_info, sb); 3485 if (ret) { 3486 err = ret; 3487 goto fail; 3488 } 3489 3490 /* These need to be init'ed before we start creating inodes and such. */ 3491 tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, 3492 GFP_KERNEL); 3493 fs_info->tree_root = tree_root; 3494 chunk_root = btrfs_alloc_root(fs_info, BTRFS_CHUNK_TREE_OBJECTID, 3495 GFP_KERNEL); 3496 fs_info->chunk_root = chunk_root; 3497 if (!tree_root || !chunk_root) { 3498 err = -ENOMEM; 3499 goto fail; 3500 } 3501 3502 fs_info->btree_inode = new_inode(sb); 3503 if (!fs_info->btree_inode) { 3504 err = -ENOMEM; 3505 goto fail; 3506 } 3507 mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); 3508 btrfs_init_btree_inode(fs_info); 3509 3510 invalidate_bdev(fs_devices->latest_dev->bdev); 3511 3512 /* 3513 * Read super block and check the signature bytes only 3514 */ 3515 disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev); 3516 if (IS_ERR(disk_super)) { 3517 err = PTR_ERR(disk_super); 3518 goto fail_alloc; 3519 } 3520 3521 /* 3522 * Verify the type first, if that or the checksum value are 3523 * corrupted, we'll find out 3524 */ 3525 csum_type = btrfs_super_csum_type(disk_super); 3526 if (!btrfs_supported_super_csum(csum_type)) { 3527 btrfs_err(fs_info, "unsupported checksum algorithm: %u", 3528 csum_type); 3529 err = -EINVAL; 3530 btrfs_release_disk_super(disk_super); 3531 goto fail_alloc; 3532 } 3533 3534 fs_info->csum_size = btrfs_super_csum_size(disk_super); 3535 3536 ret = btrfs_init_csum_hash(fs_info, csum_type); 3537 if (ret) { 3538 err = ret; 3539 btrfs_release_disk_super(disk_super); 3540 goto fail_alloc; 3541 } 3542 3543 /* 3544 * We want to check superblock checksum, the type is stored inside. 3545 * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k). 3546 */ 3547 if (btrfs_check_super_csum(fs_info, (u8 *)disk_super)) { 3548 btrfs_err(fs_info, "superblock checksum mismatch"); 3549 err = -EINVAL; 3550 btrfs_release_disk_super(disk_super); 3551 goto fail_alloc; 3552 } 3553 3554 /* 3555 * super_copy is zeroed at allocation time and we never touch the 3556 * following bytes up to INFO_SIZE, the checksum is calculated from 3557 * the whole block of INFO_SIZE 3558 */ 3559 memcpy(fs_info->super_copy, disk_super, sizeof(*fs_info->super_copy)); 3560 btrfs_release_disk_super(disk_super); 3561 3562 disk_super = fs_info->super_copy; 3563 3564 3565 features = btrfs_super_flags(disk_super); 3566 if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) { 3567 features &= ~BTRFS_SUPER_FLAG_CHANGING_FSID_V2; 3568 btrfs_set_super_flags(disk_super, features); 3569 btrfs_info(fs_info, 3570 "found metadata UUID change in progress flag, clearing"); 3571 } 3572 3573 memcpy(fs_info->super_for_commit, fs_info->super_copy, 3574 sizeof(*fs_info->super_for_commit)); 3575 3576 ret = btrfs_validate_mount_super(fs_info); 3577 if (ret) { 3578 btrfs_err(fs_info, "superblock contains fatal errors"); 3579 err = -EINVAL; 3580 goto fail_alloc; 3581 } 3582 3583 if (!btrfs_super_root(disk_super)) 3584 goto fail_alloc; 3585 3586 /* check FS state, whether FS is broken. 
*/ 3587 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) 3588 set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); 3589 3590 /* 3591 * In the long term, we'll store the compression type in the super 3592 * block, and it'll be used for per file compression control. 3593 */ 3594 fs_info->compress_type = BTRFS_COMPRESS_ZLIB; 3595 3596 /* 3597 * Flag our filesystem as having big metadata blocks if they are bigger 3598 * than the page size. 3599 */ 3600 if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) { 3601 if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) 3602 btrfs_info(fs_info, 3603 "flagging fs with big metadata feature"); 3604 features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; 3605 } 3606 3607 /* Set up fs_info before parsing mount options */ 3608 nodesize = btrfs_super_nodesize(disk_super); 3609 sectorsize = btrfs_super_sectorsize(disk_super); 3610 stripesize = sectorsize; 3611 fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids)); 3612 fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids)); 3613 3614 fs_info->nodesize = nodesize; 3615 fs_info->sectorsize = sectorsize; 3616 fs_info->sectorsize_bits = ilog2(sectorsize); 3617 fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; 3618 fs_info->stripesize = stripesize; 3619 3620 ret = btrfs_parse_options(fs_info, options, sb->s_flags); 3621 if (ret) { 3622 err = ret; 3623 goto fail_alloc; 3624 } 3625 3626 features = btrfs_super_incompat_flags(disk_super) & 3627 ~BTRFS_FEATURE_INCOMPAT_SUPP; 3628 if (features) { 3629 btrfs_err(fs_info, 3630 "cannot mount because of unsupported optional features (0x%llx)", 3631 features); 3632 err = -EINVAL; 3633 goto fail_alloc; 3634 } 3635 3636 features = btrfs_super_incompat_flags(disk_super); 3637 features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; 3638 if (fs_info->compress_type == BTRFS_COMPRESS_LZO) 3639 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; 3640 else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD) 3641 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD; 3642 3643 if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) 3644 btrfs_info(fs_info, "has skinny extents"); 3645 3646 /* 3647 * mixed block groups end up with duplicate but slightly offset 3648 * extent buffers for the same range. It leads to corruptions 3649 */ 3650 if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && 3651 (sectorsize != nodesize)) { 3652 btrfs_err(fs_info, 3653 "unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups", 3654 nodesize, sectorsize); 3655 goto fail_alloc; 3656 } 3657 3658 /* 3659 * Needn't use the lock because there is no other task which will 3660 * update the flag. 3661 */ 3662 btrfs_set_super_incompat_flags(disk_super, features); 3663 3664 features = btrfs_super_compat_ro_flags(disk_super) & 3665 ~BTRFS_FEATURE_COMPAT_RO_SUPP; 3666 if (!sb_rdonly(sb) && features) { 3667 btrfs_err(fs_info, 3668 "cannot mount read-write because of unsupported optional features (0x%llx)", 3669 features); 3670 err = -EINVAL; 3671 goto fail_alloc; 3672 } 3673 3674 if (sectorsize < PAGE_SIZE) { 3675 struct btrfs_subpage_info *subpage_info; 3676 3677 /* 3678 * V1 space cache has some hardcoded PAGE_SIZE usage, and is 3679 * going to be deprecated. 3680 * 3681 * Force to use v2 cache for subpage case. 
3682 */ 3683 btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE); 3684 btrfs_set_and_info(fs_info, FREE_SPACE_TREE, 3685 "forcing free space tree for sector size %u with page size %lu", 3686 sectorsize, PAGE_SIZE); 3687 3688 btrfs_warn(fs_info, 3689 "read-write for sector size %u with page size %lu is experimental", 3690 sectorsize, PAGE_SIZE); 3691 subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL); 3692 if (!subpage_info) 3693 goto fail_alloc; 3694 btrfs_init_subpage_info(subpage_info, sectorsize); 3695 fs_info->subpage_info = subpage_info; 3696 } 3697 3698 ret = btrfs_init_workqueues(fs_info); 3699 if (ret) { 3700 err = ret; 3701 goto fail_sb_buffer; 3702 } 3703 3704 sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super); 3705 sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE); 3706 3707 sb->s_blocksize = sectorsize; 3708 sb->s_blocksize_bits = blksize_bits(sectorsize); 3709 memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE); 3710 3711 mutex_lock(&fs_info->chunk_mutex); 3712 ret = btrfs_read_sys_array(fs_info); 3713 mutex_unlock(&fs_info->chunk_mutex); 3714 if (ret) { 3715 btrfs_err(fs_info, "failed to read the system array: %d", ret); 3716 goto fail_sb_buffer; 3717 } 3718 3719 generation = btrfs_super_chunk_root_generation(disk_super); 3720 level = btrfs_super_chunk_root_level(disk_super); 3721 ret = load_super_root(chunk_root, btrfs_super_chunk_root(disk_super), 3722 generation, level); 3723 if (ret) { 3724 btrfs_err(fs_info, "failed to read chunk root"); 3725 goto fail_tree_roots; 3726 } 3727 3728 read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, 3729 offsetof(struct btrfs_header, chunk_tree_uuid), 3730 BTRFS_UUID_SIZE); 3731 3732 ret = btrfs_read_chunk_tree(fs_info); 3733 if (ret) { 3734 btrfs_err(fs_info, "failed to read chunk tree: %d", ret); 3735 goto fail_tree_roots; 3736 } 3737 3738 /* 3739 * At this point we know all the devices that make this filesystem, 3740 * including the seed devices but we don't know yet if the replace 3741 * target is required. So free devices that are not part of this 3742 * filesystem but skip the replace target device which is checked 3743 * below in btrfs_init_dev_replace(). 3744 */ 3745 btrfs_free_extra_devids(fs_devices); 3746 if (!fs_devices->latest_dev->bdev) { 3747 btrfs_err(fs_info, "failed to read devices"); 3748 goto fail_tree_roots; 3749 } 3750 3751 ret = init_tree_roots(fs_info); 3752 if (ret) 3753 goto fail_tree_roots; 3754 3755 /* 3756 * Get zone type information of zoned block devices. This will also 3757 * handle emulation of a zoned filesystem if a regular device has the 3758 * zoned incompat feature flag set. 3759 */ 3760 ret = btrfs_get_dev_zone_info_all_devices(fs_info); 3761 if (ret) { 3762 btrfs_err(fs_info, 3763 "zoned: failed to read device zone info: %d", 3764 ret); 3765 goto fail_block_groups; 3766 } 3767 3768 /* 3769 * If we have a uuid root and we're not being told to rescan we need to 3770 * check the generation here so we can set the 3771 * BTRFS_FS_UPDATE_UUID_TREE_GEN bit. Otherwise we could commit the 3772 * transaction during a balance or the log replay without updating the 3773 * uuid generation, and then if we crash we would rescan the uuid tree, 3774 * even though it was perfectly fine. 
3775 */ 3776 if (fs_info->uuid_root && !btrfs_test_opt(fs_info, RESCAN_UUID_TREE) && 3777 fs_info->generation == btrfs_super_uuid_tree_generation(disk_super)) 3778 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); 3779 3780 ret = btrfs_verify_dev_extents(fs_info); 3781 if (ret) { 3782 btrfs_err(fs_info, 3783 "failed to verify dev extents against chunks: %d", 3784 ret); 3785 goto fail_block_groups; 3786 } 3787 ret = btrfs_recover_balance(fs_info); 3788 if (ret) { 3789 btrfs_err(fs_info, "failed to recover balance: %d", ret); 3790 goto fail_block_groups; 3791 } 3792 3793 ret = btrfs_init_dev_stats(fs_info); 3794 if (ret) { 3795 btrfs_err(fs_info, "failed to init dev_stats: %d", ret); 3796 goto fail_block_groups; 3797 } 3798 3799 ret = btrfs_init_dev_replace(fs_info); 3800 if (ret) { 3801 btrfs_err(fs_info, "failed to init dev_replace: %d", ret); 3802 goto fail_block_groups; 3803 } 3804 3805 ret = btrfs_check_zoned_mode(fs_info); 3806 if (ret) { 3807 btrfs_err(fs_info, "failed to initialize zoned mode: %d", 3808 ret); 3809 goto fail_block_groups; 3810 } 3811 3812 ret = btrfs_sysfs_add_fsid(fs_devices); 3813 if (ret) { 3814 btrfs_err(fs_info, "failed to init sysfs fsid interface: %d", 3815 ret); 3816 goto fail_block_groups; 3817 } 3818 3819 ret = btrfs_sysfs_add_mounted(fs_info); 3820 if (ret) { 3821 btrfs_err(fs_info, "failed to init sysfs interface: %d", ret); 3822 goto fail_fsdev_sysfs; 3823 } 3824 3825 ret = btrfs_init_space_info(fs_info); 3826 if (ret) { 3827 btrfs_err(fs_info, "failed to initialize space info: %d", ret); 3828 goto fail_sysfs; 3829 } 3830 3831 ret = btrfs_read_block_groups(fs_info); 3832 if (ret) { 3833 btrfs_err(fs_info, "failed to read block groups: %d", ret); 3834 goto fail_sysfs; 3835 } 3836 3837 btrfs_free_zone_cache(fs_info); 3838 3839 if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices && 3840 !btrfs_check_rw_degradable(fs_info, NULL)) { 3841 btrfs_warn(fs_info, 3842 "writable mount is not allowed due to too many missing devices"); 3843 goto fail_sysfs; 3844 } 3845 3846 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, fs_info, 3847 "btrfs-cleaner"); 3848 if (IS_ERR(fs_info->cleaner_kthread)) 3849 goto fail_sysfs; 3850 3851 fs_info->transaction_kthread = kthread_run(transaction_kthread, 3852 tree_root, 3853 "btrfs-transaction"); 3854 if (IS_ERR(fs_info->transaction_kthread)) 3855 goto fail_cleaner; 3856 3857 if (!btrfs_test_opt(fs_info, NOSSD) && 3858 !fs_info->fs_devices->rotating) { 3859 btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations"); 3860 } 3861 3862 /* 3863 * Mount does not set all options immediately, we can do it now and do 3864 * not have to wait for transaction commit 3865 */ 3866 btrfs_apply_pending_changes(fs_info); 3867 3868 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 3869 if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) { 3870 ret = btrfsic_mount(fs_info, fs_devices, 3871 btrfs_test_opt(fs_info, 3872 CHECK_INTEGRITY_DATA) ? 
1 : 0, 3873 fs_info->check_integrity_print_mask); 3874 if (ret) 3875 btrfs_warn(fs_info, 3876 "failed to initialize integrity check module: %d", 3877 ret); 3878 } 3879 #endif 3880 ret = btrfs_read_qgroup_config(fs_info); 3881 if (ret) 3882 goto fail_trans_kthread; 3883 3884 if (btrfs_build_ref_tree(fs_info)) 3885 btrfs_err(fs_info, "couldn't build ref tree"); 3886 3887 /* do not make disk changes in broken FS or nologreplay is given */ 3888 if (btrfs_super_log_root(disk_super) != 0 && 3889 !btrfs_test_opt(fs_info, NOLOGREPLAY)) { 3890 btrfs_info(fs_info, "start tree-log replay"); 3891 ret = btrfs_replay_log(fs_info, fs_devices); 3892 if (ret) { 3893 err = ret; 3894 goto fail_qgroup; 3895 } 3896 } 3897 3898 fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true); 3899 if (IS_ERR(fs_info->fs_root)) { 3900 err = PTR_ERR(fs_info->fs_root); 3901 btrfs_warn(fs_info, "failed to read fs tree: %d", err); 3902 fs_info->fs_root = NULL; 3903 goto fail_qgroup; 3904 } 3905 3906 if (sb_rdonly(sb)) 3907 goto clear_oneshot; 3908 3909 ret = btrfs_start_pre_rw_mount(fs_info); 3910 if (ret) { 3911 close_ctree(fs_info); 3912 return ret; 3913 } 3914 btrfs_discard_resume(fs_info); 3915 3916 if (fs_info->uuid_root && 3917 (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) || 3918 fs_info->generation != btrfs_super_uuid_tree_generation(disk_super))) { 3919 btrfs_info(fs_info, "checking UUID tree"); 3920 ret = btrfs_check_uuid_tree(fs_info); 3921 if (ret) { 3922 btrfs_warn(fs_info, 3923 "failed to check the UUID tree: %d", ret); 3924 close_ctree(fs_info); 3925 return ret; 3926 } 3927 } 3928 3929 set_bit(BTRFS_FS_OPEN, &fs_info->flags); 3930 3931 /* Kick the cleaner thread so it'll start deleting snapshots. */ 3932 if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags)) 3933 wake_up_process(fs_info->cleaner_kthread); 3934 3935 clear_oneshot: 3936 btrfs_clear_oneshot_options(fs_info); 3937 return 0; 3938 3939 fail_qgroup: 3940 btrfs_free_qgroup_config(fs_info); 3941 fail_trans_kthread: 3942 kthread_stop(fs_info->transaction_kthread); 3943 btrfs_cleanup_transaction(fs_info); 3944 btrfs_free_fs_roots(fs_info); 3945 fail_cleaner: 3946 kthread_stop(fs_info->cleaner_kthread); 3947 3948 /* 3949 * make sure we're done with the btree inode before we stop our 3950 * kthreads 3951 */ 3952 filemap_write_and_wait(fs_info->btree_inode->i_mapping); 3953 3954 fail_sysfs: 3955 btrfs_sysfs_remove_mounted(fs_info); 3956 3957 fail_fsdev_sysfs: 3958 btrfs_sysfs_remove_fsid(fs_info->fs_devices); 3959 3960 fail_block_groups: 3961 btrfs_put_block_group_cache(fs_info); 3962 3963 fail_tree_roots: 3964 if (fs_info->data_reloc_root) 3965 btrfs_drop_and_free_fs_root(fs_info, fs_info->data_reloc_root); 3966 free_root_pointers(fs_info, true); 3967 invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 3968 3969 fail_sb_buffer: 3970 btrfs_stop_all_workers(fs_info); 3971 btrfs_free_block_groups(fs_info); 3972 fail_alloc: 3973 btrfs_mapping_tree_free(&fs_info->mapping_tree); 3974 3975 iput(fs_info->btree_inode); 3976 fail: 3977 btrfs_close_devices(fs_info->fs_devices); 3978 return err; 3979 } 3980 ALLOW_ERROR_INJECTION(open_ctree, ERRNO); 3981 3982 static void btrfs_end_super_write(struct bio *bio) 3983 { 3984 struct btrfs_device *device = bio->bi_private; 3985 struct bio_vec *bvec; 3986 struct bvec_iter_all iter_all; 3987 struct page *page; 3988 3989 bio_for_each_segment_all(bvec, bio, iter_all) { 3990 page = bvec->bv_page; 3991 3992 if (bio->bi_status) { 3993 btrfs_warn_rl_in_rcu(device->fs_info, 3994 "lost page write due to IO 
error on %s (%d)", 3995 rcu_str_deref(device->name), 3996 blk_status_to_errno(bio->bi_status)); 3997 ClearPageUptodate(page); 3998 SetPageError(page); 3999 btrfs_dev_stat_inc_and_print(device, 4000 BTRFS_DEV_STAT_WRITE_ERRS); 4001 } else { 4002 SetPageUptodate(page); 4003 } 4004 4005 put_page(page); 4006 unlock_page(page); 4007 } 4008 4009 bio_put(bio); 4010 } 4011 4012 struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, 4013 int copy_num) 4014 { 4015 struct btrfs_super_block *super; 4016 struct page *page; 4017 u64 bytenr, bytenr_orig; 4018 struct address_space *mapping = bdev->bd_inode->i_mapping; 4019 int ret; 4020 4021 bytenr_orig = btrfs_sb_offset(copy_num); 4022 ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr); 4023 if (ret == -ENOENT) 4024 return ERR_PTR(-EINVAL); 4025 else if (ret) 4026 return ERR_PTR(ret); 4027 4028 if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev)) 4029 return ERR_PTR(-EINVAL); 4030 4031 page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS); 4032 if (IS_ERR(page)) 4033 return ERR_CAST(page); 4034 4035 super = page_address(page); 4036 if (btrfs_super_magic(super) != BTRFS_MAGIC) { 4037 btrfs_release_disk_super(super); 4038 return ERR_PTR(-ENODATA); 4039 } 4040 4041 if (btrfs_super_bytenr(super) != bytenr_orig) { 4042 btrfs_release_disk_super(super); 4043 return ERR_PTR(-EINVAL); 4044 } 4045 4046 return super; 4047 } 4048 4049 4050 struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev) 4051 { 4052 struct btrfs_super_block *super, *latest = NULL; 4053 int i; 4054 u64 transid = 0; 4055 4056 /* we would like to check all the supers, but that would make 4057 * a btrfs mount succeed after a mkfs from a different FS. 4058 * So, we need to add a special mount option to scan for 4059 * later supers, using BTRFS_SUPER_MIRROR_MAX instead 4060 */ 4061 for (i = 0; i < 1; i++) { 4062 super = btrfs_read_dev_one_super(bdev, i); 4063 if (IS_ERR(super)) 4064 continue; 4065 4066 if (!latest || btrfs_super_generation(super) > transid) { 4067 if (latest) 4068 btrfs_release_disk_super(super); 4069 4070 latest = super; 4071 transid = btrfs_super_generation(super); 4072 } 4073 } 4074 4075 return super; 4076 } 4077 4078 /* 4079 * Write superblock @sb to the @device. Do not wait for completion, all the 4080 * pages we use for writing are locked. 4081 * 4082 * Write @max_mirrors copies of the superblock, where 0 means default that fit 4083 * the expected device size at commit time. Note that max_mirrors must be 4084 * same for write and wait phases. 4085 * 4086 * Return number of errors when page is not found or submission fails. 
4087 */ 4088 static int write_dev_supers(struct btrfs_device *device, 4089 struct btrfs_super_block *sb, int max_mirrors) 4090 { 4091 struct btrfs_fs_info *fs_info = device->fs_info; 4092 struct address_space *mapping = device->bdev->bd_inode->i_mapping; 4093 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); 4094 int i; 4095 int errors = 0; 4096 int ret; 4097 u64 bytenr, bytenr_orig; 4098 4099 if (max_mirrors == 0) 4100 max_mirrors = BTRFS_SUPER_MIRROR_MAX; 4101 4102 shash->tfm = fs_info->csum_shash; 4103 4104 for (i = 0; i < max_mirrors; i++) { 4105 struct page *page; 4106 struct bio *bio; 4107 struct btrfs_super_block *disk_super; 4108 4109 bytenr_orig = btrfs_sb_offset(i); 4110 ret = btrfs_sb_log_location(device, i, WRITE, &bytenr); 4111 if (ret == -ENOENT) { 4112 continue; 4113 } else if (ret < 0) { 4114 btrfs_err(device->fs_info, 4115 "couldn't get super block location for mirror %d", 4116 i); 4117 errors++; 4118 continue; 4119 } 4120 if (bytenr + BTRFS_SUPER_INFO_SIZE >= 4121 device->commit_total_bytes) 4122 break; 4123 4124 btrfs_set_super_bytenr(sb, bytenr_orig); 4125 4126 crypto_shash_digest(shash, (const char *)sb + BTRFS_CSUM_SIZE, 4127 BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, 4128 sb->csum); 4129 4130 page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT, 4131 GFP_NOFS); 4132 if (!page) { 4133 btrfs_err(device->fs_info, 4134 "couldn't get super block page for bytenr %llu", 4135 bytenr); 4136 errors++; 4137 continue; 4138 } 4139 4140 /* Bump the refcount for wait_dev_supers() */ 4141 get_page(page); 4142 4143 disk_super = page_address(page); 4144 memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE); 4145 4146 /* 4147 * Directly use bios here instead of relying on the page cache 4148 * to do I/O, so we don't lose the ability to do integrity 4149 * checking. 4150 */ 4151 bio = bio_alloc(device->bdev, 1, 4152 REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO, 4153 GFP_NOFS); 4154 bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT; 4155 bio->bi_private = device; 4156 bio->bi_end_io = btrfs_end_super_write; 4157 __bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE, 4158 offset_in_page(bytenr)); 4159 4160 /* 4161 * We FUA only the first super block. The others we allow to 4162 * go down lazy and there's a short window where the on-disk 4163 * copies might still contain the older version. 4164 */ 4165 if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER)) 4166 bio->bi_opf |= REQ_FUA; 4167 4168 btrfsic_check_bio(bio); 4169 submit_bio(bio); 4170 4171 if (btrfs_advance_sb_log(device, i)) 4172 errors++; 4173 } 4174 return errors < i ? 0 : -1; 4175 } 4176 4177 /* 4178 * Wait for write completion of superblocks done by write_dev_supers, 4179 * @max_mirrors same for write and wait phases. 4180 * 4181 * Return number of errors when page is not found or not marked up to 4182 * date. 
4183 */ 4184 static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) 4185 { 4186 int i; 4187 int errors = 0; 4188 bool primary_failed = false; 4189 int ret; 4190 u64 bytenr; 4191 4192 if (max_mirrors == 0) 4193 max_mirrors = BTRFS_SUPER_MIRROR_MAX; 4194 4195 for (i = 0; i < max_mirrors; i++) { 4196 struct page *page; 4197 4198 ret = btrfs_sb_log_location(device, i, READ, &bytenr); 4199 if (ret == -ENOENT) { 4200 break; 4201 } else if (ret < 0) { 4202 errors++; 4203 if (i == 0) 4204 primary_failed = true; 4205 continue; 4206 } 4207 if (bytenr + BTRFS_SUPER_INFO_SIZE >= 4208 device->commit_total_bytes) 4209 break; 4210 4211 page = find_get_page(device->bdev->bd_inode->i_mapping, 4212 bytenr >> PAGE_SHIFT); 4213 if (!page) { 4214 errors++; 4215 if (i == 0) 4216 primary_failed = true; 4217 continue; 4218 } 4219 /* Page is submitted locked and unlocked once the IO completes */ 4220 wait_on_page_locked(page); 4221 if (PageError(page)) { 4222 errors++; 4223 if (i == 0) 4224 primary_failed = true; 4225 } 4226 4227 /* Drop our reference */ 4228 put_page(page); 4229 4230 /* Drop the reference from the writing run */ 4231 put_page(page); 4232 } 4233 4234 /* log error, force error return */ 4235 if (primary_failed) { 4236 btrfs_err(device->fs_info, "error writing primary super block to device %llu", 4237 device->devid); 4238 return -1; 4239 } 4240 4241 return errors < i ? 0 : -1; 4242 } 4243 4244 /* 4245 * endio for the write_dev_flush, this will wake anyone waiting 4246 * for the barrier when it is done 4247 */ 4248 static void btrfs_end_empty_barrier(struct bio *bio) 4249 { 4250 bio_uninit(bio); 4251 complete(bio->bi_private); 4252 } 4253 4254 /* 4255 * Submit a flush request to the device if it supports it. Error handling is 4256 * done in the waiting counterpart. 4257 */ 4258 static void write_dev_flush(struct btrfs_device *device) 4259 { 4260 struct bio *bio = &device->flush_bio; 4261 4262 #ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY 4263 /* 4264 * When a disk has write caching disabled, we skip submission of a bio 4265 * with flush and sync requests before writing the superblock, since 4266 * it's not needed. However when the integrity checker is enabled, this 4267 * results in reports that there are metadata blocks referred by a 4268 * superblock that were not properly flushed. So don't skip the bio 4269 * submission only when the integrity checker is enabled for the sake 4270 * of simplicity, since this is a debug tool and not meant for use in 4271 * non-debug builds. 4272 */ 4273 if (!bdev_write_cache(device->bdev)) 4274 return; 4275 #endif 4276 4277 bio_init(bio, device->bdev, NULL, 0, 4278 REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH); 4279 bio->bi_end_io = btrfs_end_empty_barrier; 4280 init_completion(&device->flush_wait); 4281 bio->bi_private = &device->flush_wait; 4282 4283 btrfsic_check_bio(bio); 4284 submit_bio(bio); 4285 set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state); 4286 } 4287 4288 /* 4289 * If the flush bio has been submitted by write_dev_flush, wait for it. 
4290 */ 4291 static blk_status_t wait_dev_flush(struct btrfs_device *device) 4292 { 4293 struct bio *bio = &device->flush_bio; 4294 4295 if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)) 4296 return BLK_STS_OK; 4297 4298 clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state); 4299 wait_for_completion_io(&device->flush_wait); 4300 4301 return bio->bi_status; 4302 } 4303 4304 static int check_barrier_error(struct btrfs_fs_info *fs_info) 4305 { 4306 if (!btrfs_check_rw_degradable(fs_info, NULL)) 4307 return -EIO; 4308 return 0; 4309 } 4310 4311 /* 4312 * send an empty flush down to each device in parallel, 4313 * then wait for them 4314 */ 4315 static int barrier_all_devices(struct btrfs_fs_info *info) 4316 { 4317 struct list_head *head; 4318 struct btrfs_device *dev; 4319 int errors_wait = 0; 4320 blk_status_t ret; 4321 4322 lockdep_assert_held(&info->fs_devices->device_list_mutex); 4323 /* send down all the barriers */ 4324 head = &info->fs_devices->devices; 4325 list_for_each_entry(dev, head, dev_list) { 4326 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) 4327 continue; 4328 if (!dev->bdev) 4329 continue; 4330 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || 4331 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) 4332 continue; 4333 4334 write_dev_flush(dev); 4335 dev->last_flush_error = BLK_STS_OK; 4336 } 4337 4338 /* wait for all the barriers */ 4339 list_for_each_entry(dev, head, dev_list) { 4340 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) 4341 continue; 4342 if (!dev->bdev) { 4343 errors_wait++; 4344 continue; 4345 } 4346 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || 4347 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) 4348 continue; 4349 4350 ret = wait_dev_flush(dev); 4351 if (ret) { 4352 dev->last_flush_error = ret; 4353 btrfs_dev_stat_inc_and_print(dev, 4354 BTRFS_DEV_STAT_FLUSH_ERRS); 4355 errors_wait++; 4356 } 4357 } 4358 4359 if (errors_wait) { 4360 /* 4361 * At some point we need the status of all disks 4362 * to arrive at the volume status. So error checking 4363 * is being pushed to a separate loop. 4364 */ 4365 return check_barrier_error(info); 4366 } 4367 return 0; 4368 } 4369 4370 int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags) 4371 { 4372 int raid_type; 4373 int min_tolerated = INT_MAX; 4374 4375 if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 || 4376 (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE)) 4377 min_tolerated = min_t(int, min_tolerated, 4378 btrfs_raid_array[BTRFS_RAID_SINGLE]. 4379 tolerated_failures); 4380 4381 for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { 4382 if (raid_type == BTRFS_RAID_SINGLE) 4383 continue; 4384 if (!(flags & btrfs_raid_array[raid_type].bg_flag)) 4385 continue; 4386 min_tolerated = min_t(int, min_tolerated, 4387 btrfs_raid_array[raid_type]. 
4388 tolerated_failures); 4389 } 4390 4391 if (min_tolerated == INT_MAX) { 4392 pr_warn("BTRFS: unknown raid flag: %llu", flags); 4393 min_tolerated = 0; 4394 } 4395 4396 return min_tolerated; 4397 } 4398 4399 int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) 4400 { 4401 struct list_head *head; 4402 struct btrfs_device *dev; 4403 struct btrfs_super_block *sb; 4404 struct btrfs_dev_item *dev_item; 4405 int ret; 4406 int do_barriers; 4407 int max_errors; 4408 int total_errors = 0; 4409 u64 flags; 4410 4411 do_barriers = !btrfs_test_opt(fs_info, NOBARRIER); 4412 4413 /* 4414 * max_mirrors == 0 indicates we're from commit_transaction, 4415 * not from fsync where the tree roots in fs_info have not 4416 * been consistent on disk. 4417 */ 4418 if (max_mirrors == 0) 4419 backup_super_roots(fs_info); 4420 4421 sb = fs_info->super_for_commit; 4422 dev_item = &sb->dev_item; 4423 4424 mutex_lock(&fs_info->fs_devices->device_list_mutex); 4425 head = &fs_info->fs_devices->devices; 4426 max_errors = btrfs_super_num_devices(fs_info->super_copy) - 1; 4427 4428 if (do_barriers) { 4429 ret = barrier_all_devices(fs_info); 4430 if (ret) { 4431 mutex_unlock( 4432 &fs_info->fs_devices->device_list_mutex); 4433 btrfs_handle_fs_error(fs_info, ret, 4434 "errors while submitting device barriers."); 4435 return ret; 4436 } 4437 } 4438 4439 list_for_each_entry(dev, head, dev_list) { 4440 if (!dev->bdev) { 4441 total_errors++; 4442 continue; 4443 } 4444 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || 4445 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) 4446 continue; 4447 4448 btrfs_set_stack_device_generation(dev_item, 0); 4449 btrfs_set_stack_device_type(dev_item, dev->type); 4450 btrfs_set_stack_device_id(dev_item, dev->devid); 4451 btrfs_set_stack_device_total_bytes(dev_item, 4452 dev->commit_total_bytes); 4453 btrfs_set_stack_device_bytes_used(dev_item, 4454 dev->commit_bytes_used); 4455 btrfs_set_stack_device_io_align(dev_item, dev->io_align); 4456 btrfs_set_stack_device_io_width(dev_item, dev->io_width); 4457 btrfs_set_stack_device_sector_size(dev_item, dev->sector_size); 4458 memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE); 4459 memcpy(dev_item->fsid, dev->fs_devices->metadata_uuid, 4460 BTRFS_FSID_SIZE); 4461 4462 flags = btrfs_super_flags(sb); 4463 btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); 4464 4465 ret = btrfs_validate_write_super(fs_info, sb); 4466 if (ret < 0) { 4467 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 4468 btrfs_handle_fs_error(fs_info, -EUCLEAN, 4469 "unexpected superblock corruption detected"); 4470 return -EUCLEAN; 4471 } 4472 4473 ret = write_dev_supers(dev, sb, max_mirrors); 4474 if (ret) 4475 total_errors++; 4476 } 4477 if (total_errors > max_errors) { 4478 btrfs_err(fs_info, "%d errors while writing supers", 4479 total_errors); 4480 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 4481 4482 /* FUA is masked off if unsupported and can't be the reason */ 4483 btrfs_handle_fs_error(fs_info, -EIO, 4484 "%d errors while writing supers", 4485 total_errors); 4486 return -EIO; 4487 } 4488 4489 total_errors = 0; 4490 list_for_each_entry(dev, head, dev_list) { 4491 if (!dev->bdev) 4492 continue; 4493 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || 4494 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) 4495 continue; 4496 4497 ret = wait_dev_supers(dev, max_mirrors); 4498 if (ret) 4499 total_errors++; 4500 } 4501 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 4502 if (total_errors > 
max_errors) { 4503 btrfs_handle_fs_error(fs_info, -EIO, 4504 "%d errors while writing supers", 4505 total_errors); 4506 return -EIO; 4507 } 4508 return 0; 4509 } 4510 4511 /* Drop a fs root from the radix tree and free it. */ 4512 void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, 4513 struct btrfs_root *root) 4514 { 4515 bool drop_ref = false; 4516 4517 spin_lock(&fs_info->fs_roots_radix_lock); 4518 radix_tree_delete(&fs_info->fs_roots_radix, 4519 (unsigned long)root->root_key.objectid); 4520 if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state)) 4521 drop_ref = true; 4522 spin_unlock(&fs_info->fs_roots_radix_lock); 4523 4524 if (BTRFS_FS_ERROR(fs_info)) { 4525 ASSERT(root->log_root == NULL); 4526 if (root->reloc_root) { 4527 btrfs_put_root(root->reloc_root); 4528 root->reloc_root = NULL; 4529 } 4530 } 4531 4532 if (drop_ref) 4533 btrfs_put_root(root); 4534 } 4535 4536 int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) 4537 { 4538 u64 root_objectid = 0; 4539 struct btrfs_root *gang[8]; 4540 int i = 0; 4541 int err = 0; 4542 unsigned int ret = 0; 4543 4544 while (1) { 4545 spin_lock(&fs_info->fs_roots_radix_lock); 4546 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, 4547 (void **)gang, root_objectid, 4548 ARRAY_SIZE(gang)); 4549 if (!ret) { 4550 spin_unlock(&fs_info->fs_roots_radix_lock); 4551 break; 4552 } 4553 root_objectid = gang[ret - 1]->root_key.objectid + 1; 4554 4555 for (i = 0; i < ret; i++) { 4556 /* Avoid to grab roots in dead_roots */ 4557 if (btrfs_root_refs(&gang[i]->root_item) == 0) { 4558 gang[i] = NULL; 4559 continue; 4560 } 4561 /* grab all the search result for later use */ 4562 gang[i] = btrfs_grab_root(gang[i]); 4563 } 4564 spin_unlock(&fs_info->fs_roots_radix_lock); 4565 4566 for (i = 0; i < ret; i++) { 4567 if (!gang[i]) 4568 continue; 4569 root_objectid = gang[i]->root_key.objectid; 4570 err = btrfs_orphan_cleanup(gang[i]); 4571 if (err) 4572 break; 4573 btrfs_put_root(gang[i]); 4574 } 4575 root_objectid++; 4576 } 4577 4578 /* release the uncleaned roots due to error */ 4579 for (; i < ret; i++) { 4580 if (gang[i]) 4581 btrfs_put_root(gang[i]); 4582 } 4583 return err; 4584 } 4585 4586 int btrfs_commit_super(struct btrfs_fs_info *fs_info) 4587 { 4588 struct btrfs_root *root = fs_info->tree_root; 4589 struct btrfs_trans_handle *trans; 4590 4591 mutex_lock(&fs_info->cleaner_mutex); 4592 btrfs_run_delayed_iputs(fs_info); 4593 mutex_unlock(&fs_info->cleaner_mutex); 4594 wake_up_process(fs_info->cleaner_kthread); 4595 4596 /* wait until ongoing cleanup work done */ 4597 down_write(&fs_info->cleanup_work_sem); 4598 up_write(&fs_info->cleanup_work_sem); 4599 4600 trans = btrfs_join_transaction(root); 4601 if (IS_ERR(trans)) 4602 return PTR_ERR(trans); 4603 return btrfs_commit_transaction(trans); 4604 } 4605 4606 static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info) 4607 { 4608 struct btrfs_transaction *trans; 4609 struct btrfs_transaction *tmp; 4610 bool found = false; 4611 4612 if (list_empty(&fs_info->trans_list)) 4613 return; 4614 4615 /* 4616 * This function is only called at the very end of close_ctree(), 4617 * thus no other running transaction, no need to take trans_lock. 
4618 */ 4619 ASSERT(test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags)); 4620 list_for_each_entry_safe(trans, tmp, &fs_info->trans_list, list) { 4621 struct extent_state *cached = NULL; 4622 u64 dirty_bytes = 0; 4623 u64 cur = 0; 4624 u64 found_start; 4625 u64 found_end; 4626 4627 found = true; 4628 while (!find_first_extent_bit(&trans->dirty_pages, cur, 4629 &found_start, &found_end, EXTENT_DIRTY, &cached)) { 4630 dirty_bytes += found_end + 1 - found_start; 4631 cur = found_end + 1; 4632 } 4633 btrfs_warn(fs_info, 4634 "transaction %llu (with %llu dirty metadata bytes) is not committed", 4635 trans->transid, dirty_bytes); 4636 btrfs_cleanup_one_transaction(trans, fs_info); 4637 4638 if (trans == fs_info->running_transaction) 4639 fs_info->running_transaction = NULL; 4640 list_del_init(&trans->list); 4641 4642 btrfs_put_transaction(trans); 4643 trace_btrfs_transaction_commit(fs_info); 4644 } 4645 ASSERT(!found); 4646 } 4647 4648 void __cold close_ctree(struct btrfs_fs_info *fs_info) 4649 { 4650 int ret; 4651 4652 set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags); 4653 4654 /* 4655 * We may have the reclaim task running and relocating a data block group, 4656 * in which case it may create delayed iputs. So stop it before we park 4657 * the cleaner kthread otherwise we can get new delayed iputs after 4658 * parking the cleaner, and that can make the async reclaim task to hang 4659 * if it's waiting for delayed iputs to complete, since the cleaner is 4660 * parked and can not run delayed iputs - this will make us hang when 4661 * trying to stop the async reclaim task. 4662 */ 4663 cancel_work_sync(&fs_info->reclaim_bgs_work); 4664 /* 4665 * We don't want the cleaner to start new transactions, add more delayed 4666 * iputs, etc. while we're closing. We can't use kthread_stop() yet 4667 * because that frees the task_struct, and the transaction kthread might 4668 * still try to wake up the cleaner. 4669 */ 4670 kthread_park(fs_info->cleaner_kthread); 4671 4672 /* 4673 * If we had UNFINISHED_DROPS we could still be processing them, so 4674 * clear that bit and wake up relocation so it can stop. 4675 */ 4676 btrfs_wake_unfinished_drop(fs_info); 4677 4678 /* wait for the qgroup rescan worker to stop */ 4679 btrfs_qgroup_wait_for_completion(fs_info, false); 4680 4681 /* wait for the uuid_scan task to finish */ 4682 down(&fs_info->uuid_tree_rescan_sem); 4683 /* avoid complains from lockdep et al., set sem back to initial state */ 4684 up(&fs_info->uuid_tree_rescan_sem); 4685 4686 /* pause restriper - we want to resume on mount */ 4687 btrfs_pause_balance(fs_info); 4688 4689 btrfs_dev_replace_suspend_for_unmount(fs_info); 4690 4691 btrfs_scrub_cancel(fs_info); 4692 4693 /* wait for any defraggers to finish */ 4694 wait_event(fs_info->transaction_wait, 4695 (atomic_read(&fs_info->defrag_running) == 0)); 4696 4697 /* clear out the rbtree of defraggable inodes */ 4698 btrfs_cleanup_defrag_inodes(fs_info); 4699 4700 cancel_work_sync(&fs_info->async_reclaim_work); 4701 cancel_work_sync(&fs_info->async_data_reclaim_work); 4702 cancel_work_sync(&fs_info->preempt_reclaim_work); 4703 4704 /* Cancel or finish ongoing discard work */ 4705 btrfs_discard_cleanup(fs_info); 4706 4707 if (!sb_rdonly(fs_info->sb)) { 4708 /* 4709 * The cleaner kthread is stopped, so do one final pass over 4710 * unused block groups. 4711 */ 4712 btrfs_delete_unused_bgs(fs_info); 4713 4714 /* 4715 * There might be existing delayed inode workers still running 4716 * and holding an empty delayed inode item. 
We must wait for 4717 * them to complete first because they can create a transaction. 4718 * This happens when someone calls btrfs_balance_delayed_items() 4719 * and then a transaction commit runs the same delayed nodes 4720 * before any delayed worker has done something with the nodes. 4721 * We must wait for any worker here and not at transaction 4722 * commit time since that could cause a deadlock. 4723 * This is a very rare case. 4724 */ 4725 btrfs_flush_workqueue(fs_info->delayed_workers); 4726 4727 ret = btrfs_commit_super(fs_info); 4728 if (ret) 4729 btrfs_err(fs_info, "commit super ret %d", ret); 4730 } 4731 4732 if (BTRFS_FS_ERROR(fs_info)) 4733 btrfs_error_commit_super(fs_info); 4734 4735 kthread_stop(fs_info->transaction_kthread); 4736 kthread_stop(fs_info->cleaner_kthread); 4737 4738 ASSERT(list_empty(&fs_info->delayed_iputs)); 4739 set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags); 4740 4741 if (btrfs_check_quota_leak(fs_info)) { 4742 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); 4743 btrfs_err(fs_info, "qgroup reserved space leaked"); 4744 } 4745 4746 btrfs_free_qgroup_config(fs_info); 4747 ASSERT(list_empty(&fs_info->delalloc_roots)); 4748 4749 if (percpu_counter_sum(&fs_info->delalloc_bytes)) { 4750 btrfs_info(fs_info, "at unmount delalloc count %lld", 4751 percpu_counter_sum(&fs_info->delalloc_bytes)); 4752 } 4753 4754 if (percpu_counter_sum(&fs_info->ordered_bytes)) 4755 btrfs_info(fs_info, "at unmount dio bytes count %lld", 4756 percpu_counter_sum(&fs_info->ordered_bytes)); 4757 4758 btrfs_sysfs_remove_mounted(fs_info); 4759 btrfs_sysfs_remove_fsid(fs_info->fs_devices); 4760 4761 btrfs_put_block_group_cache(fs_info); 4762 4763 /* 4764 * we must make sure there is not any read request to 4765 * submit after we stopping all workers. 4766 */ 4767 invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 4768 btrfs_stop_all_workers(fs_info); 4769 4770 /* We shouldn't have any transaction open at this point */ 4771 warn_about_uncommitted_trans(fs_info); 4772 4773 clear_bit(BTRFS_FS_OPEN, &fs_info->flags); 4774 free_root_pointers(fs_info, true); 4775 btrfs_free_fs_roots(fs_info); 4776 4777 /* 4778 * We must free the block groups after dropping the fs_roots as we could 4779 * have had an IO error and have left over tree log blocks that aren't 4780 * cleaned up until the fs roots are freed. This makes the block group 4781 * accounting appear to be wrong because there's pending reserved bytes, 4782 * so make sure we do the block group cleanup afterwards. 
4783 */ 4784 btrfs_free_block_groups(fs_info); 4785 4786 iput(fs_info->btree_inode); 4787 4788 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 4789 if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) 4790 btrfsic_unmount(fs_info->fs_devices); 4791 #endif 4792 4793 btrfs_mapping_tree_free(&fs_info->mapping_tree); 4794 btrfs_close_devices(fs_info->fs_devices); 4795 } 4796 4797 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, 4798 int atomic) 4799 { 4800 int ret; 4801 struct inode *btree_inode = buf->pages[0]->mapping->host; 4802 4803 ret = extent_buffer_uptodate(buf); 4804 if (!ret) 4805 return ret; 4806 4807 ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf, 4808 parent_transid, atomic); 4809 if (ret == -EAGAIN) 4810 return ret; 4811 return !ret; 4812 } 4813 4814 void btrfs_mark_buffer_dirty(struct extent_buffer *buf) 4815 { 4816 struct btrfs_fs_info *fs_info = buf->fs_info; 4817 u64 transid = btrfs_header_generation(buf); 4818 int was_dirty; 4819 4820 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 4821 /* 4822 * This is a fast path so only do this check if we have sanity tests 4823 * enabled. Normal people shouldn't be using unmapped buffers as dirty 4824 * outside of the sanity tests. 4825 */ 4826 if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags))) 4827 return; 4828 #endif 4829 btrfs_assert_tree_write_locked(buf); 4830 if (transid != fs_info->generation) 4831 WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n", 4832 buf->start, transid, fs_info->generation); 4833 was_dirty = set_extent_buffer_dirty(buf); 4834 if (!was_dirty) 4835 percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, 4836 buf->len, 4837 fs_info->dirty_metadata_batch); 4838 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 4839 /* 4840 * Since btrfs_mark_buffer_dirty() can be called with item pointer set 4841 * but item data not updated. 4842 * So here we should only check item pointers, not item data. 
4843 */ 4844 if (btrfs_header_level(buf) == 0 && 4845 btrfs_check_leaf_relaxed(buf)) { 4846 btrfs_print_leaf(buf); 4847 ASSERT(0); 4848 } 4849 #endif 4850 } 4851 4852 static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info, 4853 int flush_delayed) 4854 { 4855 /* 4856 * looks as though older kernels can get into trouble with 4857 * this code, they end up stuck in balance_dirty_pages forever 4858 */ 4859 int ret; 4860 4861 if (current->flags & PF_MEMALLOC) 4862 return; 4863 4864 if (flush_delayed) 4865 btrfs_balance_delayed_items(fs_info); 4866 4867 ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes, 4868 BTRFS_DIRTY_METADATA_THRESH, 4869 fs_info->dirty_metadata_batch); 4870 if (ret > 0) { 4871 balance_dirty_pages_ratelimited(fs_info->btree_inode->i_mapping); 4872 } 4873 } 4874 4875 void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info) 4876 { 4877 __btrfs_btree_balance_dirty(fs_info, 1); 4878 } 4879 4880 void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info) 4881 { 4882 __btrfs_btree_balance_dirty(fs_info, 0); 4883 } 4884 4885 static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info) 4886 { 4887 /* cleanup FS via transaction */ 4888 btrfs_cleanup_transaction(fs_info); 4889 4890 mutex_lock(&fs_info->cleaner_mutex); 4891 btrfs_run_delayed_iputs(fs_info); 4892 mutex_unlock(&fs_info->cleaner_mutex); 4893 4894 down_write(&fs_info->cleanup_work_sem); 4895 up_write(&fs_info->cleanup_work_sem); 4896 } 4897 4898 static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info) 4899 { 4900 struct btrfs_root *gang[8]; 4901 u64 root_objectid = 0; 4902 int ret; 4903 4904 spin_lock(&fs_info->fs_roots_radix_lock); 4905 while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, 4906 (void **)gang, root_objectid, 4907 ARRAY_SIZE(gang))) != 0) { 4908 int i; 4909 4910 for (i = 0; i < ret; i++) 4911 gang[i] = btrfs_grab_root(gang[i]); 4912 spin_unlock(&fs_info->fs_roots_radix_lock); 4913 4914 for (i = 0; i < ret; i++) { 4915 if (!gang[i]) 4916 continue; 4917 root_objectid = gang[i]->root_key.objectid; 4918 btrfs_free_log(NULL, gang[i]); 4919 btrfs_put_root(gang[i]); 4920 } 4921 root_objectid++; 4922 spin_lock(&fs_info->fs_roots_radix_lock); 4923 } 4924 spin_unlock(&fs_info->fs_roots_radix_lock); 4925 btrfs_free_log_root_tree(NULL, fs_info); 4926 } 4927 4928 static void btrfs_destroy_ordered_extents(struct btrfs_root *root) 4929 { 4930 struct btrfs_ordered_extent *ordered; 4931 4932 spin_lock(&root->ordered_extent_lock); 4933 /* 4934 * This will just short circuit the ordered completion stuff which will 4935 * make sure the ordered extent gets properly cleaned up. 
4936 */ 4937 list_for_each_entry(ordered, &root->ordered_extents, 4938 root_extent_list) 4939 set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); 4940 spin_unlock(&root->ordered_extent_lock); 4941 } 4942 4943 static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) 4944 { 4945 struct btrfs_root *root; 4946 struct list_head splice; 4947 4948 INIT_LIST_HEAD(&splice); 4949 4950 spin_lock(&fs_info->ordered_root_lock); 4951 list_splice_init(&fs_info->ordered_roots, &splice); 4952 while (!list_empty(&splice)) { 4953 root = list_first_entry(&splice, struct btrfs_root, 4954 ordered_root); 4955 list_move_tail(&root->ordered_root, 4956 &fs_info->ordered_roots); 4957 4958 spin_unlock(&fs_info->ordered_root_lock); 4959 btrfs_destroy_ordered_extents(root); 4960 4961 cond_resched(); 4962 spin_lock(&fs_info->ordered_root_lock); 4963 } 4964 spin_unlock(&fs_info->ordered_root_lock); 4965 4966 /* 4967 * We need this here because if we've been flipped read-only we won't 4968 * get sync() from the umount, so we need to make sure any ordered 4969 * extents that haven't had their dirty pages IO start writeout yet 4970 * actually get run and error out properly. 4971 */ 4972 btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); 4973 } 4974 4975 static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, 4976 struct btrfs_fs_info *fs_info) 4977 { 4978 struct rb_node *node; 4979 struct btrfs_delayed_ref_root *delayed_refs; 4980 struct btrfs_delayed_ref_node *ref; 4981 int ret = 0; 4982 4983 delayed_refs = &trans->delayed_refs; 4984 4985 spin_lock(&delayed_refs->lock); 4986 if (atomic_read(&delayed_refs->num_entries) == 0) { 4987 spin_unlock(&delayed_refs->lock); 4988 btrfs_debug(fs_info, "delayed_refs has NO entry"); 4989 return ret; 4990 } 4991 4992 while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) { 4993 struct btrfs_delayed_ref_head *head; 4994 struct rb_node *n; 4995 bool pin_bytes = false; 4996 4997 head = rb_entry(node, struct btrfs_delayed_ref_head, 4998 href_node); 4999 if (btrfs_delayed_ref_lock(delayed_refs, head)) 5000 continue; 5001 5002 spin_lock(&head->lock); 5003 while ((n = rb_first_cached(&head->ref_tree)) != NULL) { 5004 ref = rb_entry(n, struct btrfs_delayed_ref_node, 5005 ref_node); 5006 ref->in_tree = 0; 5007 rb_erase_cached(&ref->ref_node, &head->ref_tree); 5008 RB_CLEAR_NODE(&ref->ref_node); 5009 if (!list_empty(&ref->add_list)) 5010 list_del(&ref->add_list); 5011 atomic_dec(&delayed_refs->num_entries); 5012 btrfs_put_delayed_ref(ref); 5013 } 5014 if (head->must_insert_reserved) 5015 pin_bytes = true; 5016 btrfs_free_delayed_extent_op(head->extent_op); 5017 btrfs_delete_ref_head(delayed_refs, head); 5018 spin_unlock(&head->lock); 5019 spin_unlock(&delayed_refs->lock); 5020 mutex_unlock(&head->mutex); 5021 5022 if (pin_bytes) { 5023 struct btrfs_block_group *cache; 5024 5025 cache = btrfs_lookup_block_group(fs_info, head->bytenr); 5026 BUG_ON(!cache); 5027 5028 spin_lock(&cache->space_info->lock); 5029 spin_lock(&cache->lock); 5030 cache->pinned += head->num_bytes; 5031 btrfs_space_info_update_bytes_pinned(fs_info, 5032 cache->space_info, head->num_bytes); 5033 cache->reserved -= head->num_bytes; 5034 cache->space_info->bytes_reserved -= head->num_bytes; 5035 spin_unlock(&cache->lock); 5036 spin_unlock(&cache->space_info->lock); 5037 5038 btrfs_put_block_group(cache); 5039 5040 btrfs_error_unpin_extent_range(fs_info, head->bytenr, 5041 head->bytenr + head->num_bytes - 1); 5042 } 5043 btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); 
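		/* All refs are processed and the head is off the rbtree, drop it. */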
5044 btrfs_put_delayed_ref_head(head); 5045 cond_resched(); 5046 spin_lock(&delayed_refs->lock); 5047 } 5048 btrfs_qgroup_destroy_extent_records(trans); 5049 5050 spin_unlock(&delayed_refs->lock); 5051 5052 return ret; 5053 } 5054 5055 static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) 5056 { 5057 struct btrfs_inode *btrfs_inode; 5058 struct list_head splice; 5059 5060 INIT_LIST_HEAD(&splice); 5061 5062 spin_lock(&root->delalloc_lock); 5063 list_splice_init(&root->delalloc_inodes, &splice); 5064 5065 while (!list_empty(&splice)) { 5066 struct inode *inode = NULL; 5067 btrfs_inode = list_first_entry(&splice, struct btrfs_inode, 5068 delalloc_inodes); 5069 __btrfs_del_delalloc_inode(root, btrfs_inode); 5070 spin_unlock(&root->delalloc_lock); 5071 5072 /* 5073 * Make sure we get a live inode and that it'll not disappear 5074 * meanwhile. 5075 */ 5076 inode = igrab(&btrfs_inode->vfs_inode); 5077 if (inode) { 5078 invalidate_inode_pages2(inode->i_mapping); 5079 iput(inode); 5080 } 5081 spin_lock(&root->delalloc_lock); 5082 } 5083 spin_unlock(&root->delalloc_lock); 5084 } 5085 5086 static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info) 5087 { 5088 struct btrfs_root *root; 5089 struct list_head splice; 5090 5091 INIT_LIST_HEAD(&splice); 5092 5093 spin_lock(&fs_info->delalloc_root_lock); 5094 list_splice_init(&fs_info->delalloc_roots, &splice); 5095 while (!list_empty(&splice)) { 5096 root = list_first_entry(&splice, struct btrfs_root, 5097 delalloc_root); 5098 root = btrfs_grab_root(root); 5099 BUG_ON(!root); 5100 spin_unlock(&fs_info->delalloc_root_lock); 5101 5102 btrfs_destroy_delalloc_inodes(root); 5103 btrfs_put_root(root); 5104 5105 spin_lock(&fs_info->delalloc_root_lock); 5106 } 5107 spin_unlock(&fs_info->delalloc_root_lock); 5108 } 5109 5110 static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, 5111 struct extent_io_tree *dirty_pages, 5112 int mark) 5113 { 5114 int ret; 5115 struct extent_buffer *eb; 5116 u64 start = 0; 5117 u64 end; 5118 5119 while (1) { 5120 ret = find_first_extent_bit(dirty_pages, start, &start, &end, 5121 mark, NULL); 5122 if (ret) 5123 break; 5124 5125 clear_extent_bits(dirty_pages, start, end, mark); 5126 while (start <= end) { 5127 eb = find_extent_buffer(fs_info, start); 5128 start += fs_info->nodesize; 5129 if (!eb) 5130 continue; 5131 wait_on_extent_buffer_writeback(eb); 5132 5133 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, 5134 &eb->bflags)) 5135 clear_extent_buffer_dirty(eb); 5136 free_extent_buffer_stale(eb); 5137 } 5138 } 5139 5140 return ret; 5141 } 5142 5143 static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, 5144 struct extent_io_tree *unpin) 5145 { 5146 u64 start; 5147 u64 end; 5148 int ret; 5149 5150 while (1) { 5151 struct extent_state *cached_state = NULL; 5152 5153 /* 5154 * The btrfs_finish_extent_commit() may get the same range as 5155 * ours between find_first_extent_bit and clear_extent_dirty. 5156 * Hence, hold the unused_bg_unpin_mutex to avoid double unpin 5157 * the same extent range. 
5158 */ 5159 mutex_lock(&fs_info->unused_bg_unpin_mutex); 5160 ret = find_first_extent_bit(unpin, 0, &start, &end, 5161 EXTENT_DIRTY, &cached_state); 5162 if (ret) { 5163 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 5164 break; 5165 } 5166 5167 clear_extent_dirty(unpin, start, end, &cached_state); 5168 free_extent_state(cached_state); 5169 btrfs_error_unpin_extent_range(fs_info, start, end); 5170 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 5171 cond_resched(); 5172 } 5173 5174 return 0; 5175 } 5176 5177 static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache) 5178 { 5179 struct inode *inode; 5180 5181 inode = cache->io_ctl.inode; 5182 if (inode) { 5183 invalidate_inode_pages2(inode->i_mapping); 5184 BTRFS_I(inode)->generation = 0; 5185 cache->io_ctl.inode = NULL; 5186 iput(inode); 5187 } 5188 ASSERT(cache->io_ctl.pages == NULL); 5189 btrfs_put_block_group(cache); 5190 } 5191 5192 void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans, 5193 struct btrfs_fs_info *fs_info) 5194 { 5195 struct btrfs_block_group *cache; 5196 5197 spin_lock(&cur_trans->dirty_bgs_lock); 5198 while (!list_empty(&cur_trans->dirty_bgs)) { 5199 cache = list_first_entry(&cur_trans->dirty_bgs, 5200 struct btrfs_block_group, 5201 dirty_list); 5202 5203 if (!list_empty(&cache->io_list)) { 5204 spin_unlock(&cur_trans->dirty_bgs_lock); 5205 list_del_init(&cache->io_list); 5206 btrfs_cleanup_bg_io(cache); 5207 spin_lock(&cur_trans->dirty_bgs_lock); 5208 } 5209 5210 list_del_init(&cache->dirty_list); 5211 spin_lock(&cache->lock); 5212 cache->disk_cache_state = BTRFS_DC_ERROR; 5213 spin_unlock(&cache->lock); 5214 5215 spin_unlock(&cur_trans->dirty_bgs_lock); 5216 btrfs_put_block_group(cache); 5217 btrfs_delayed_refs_rsv_release(fs_info, 1); 5218 spin_lock(&cur_trans->dirty_bgs_lock); 5219 } 5220 spin_unlock(&cur_trans->dirty_bgs_lock); 5221 5222 /* 5223 * Refer to the definition of io_bgs member for details why it's safe 5224 * to use it without any locking 5225 */ 5226 while (!list_empty(&cur_trans->io_bgs)) { 5227 cache = list_first_entry(&cur_trans->io_bgs, 5228 struct btrfs_block_group, 5229 io_list); 5230 5231 list_del_init(&cache->io_list); 5232 spin_lock(&cache->lock); 5233 cache->disk_cache_state = BTRFS_DC_ERROR; 5234 spin_unlock(&cache->lock); 5235 btrfs_cleanup_bg_io(cache); 5236 } 5237 } 5238 5239 void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, 5240 struct btrfs_fs_info *fs_info) 5241 { 5242 struct btrfs_device *dev, *tmp; 5243 5244 btrfs_cleanup_dirty_bgs(cur_trans, fs_info); 5245 ASSERT(list_empty(&cur_trans->dirty_bgs)); 5246 ASSERT(list_empty(&cur_trans->io_bgs)); 5247 5248 list_for_each_entry_safe(dev, tmp, &cur_trans->dev_update_list, 5249 post_commit_list) { 5250 list_del_init(&dev->post_commit_list); 5251 } 5252 5253 btrfs_destroy_delayed_refs(cur_trans, fs_info); 5254 5255 cur_trans->state = TRANS_STATE_COMMIT_START; 5256 wake_up(&fs_info->transaction_blocked_wait); 5257 5258 cur_trans->state = TRANS_STATE_UNBLOCKED; 5259 wake_up(&fs_info->transaction_wait); 5260 5261 btrfs_destroy_delayed_inodes(fs_info); 5262 5263 btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages, 5264 EXTENT_DIRTY); 5265 btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents); 5266 5267 btrfs_free_redirty_list(cur_trans); 5268 5269 cur_trans->state =TRANS_STATE_COMPLETED; 5270 wake_up(&cur_trans->commit_wait); 5271 } 5272 5273 static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) 5274 { 5275 struct btrfs_transaction *t; 5276 5277 
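	/*
	 * Clean up every transaction still on fs_info->trans_list. Ones that
	 * already started committing are waited for instead of being torn
	 * down here.
	 */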
mutex_lock(&fs_info->transaction_kthread_mutex); 5278 5279 spin_lock(&fs_info->trans_lock); 5280 while (!list_empty(&fs_info->trans_list)) { 5281 t = list_first_entry(&fs_info->trans_list, 5282 struct btrfs_transaction, list); 5283 if (t->state >= TRANS_STATE_COMMIT_START) { 5284 refcount_inc(&t->use_count); 5285 spin_unlock(&fs_info->trans_lock); 5286 btrfs_wait_for_commit(fs_info, t->transid); 5287 btrfs_put_transaction(t); 5288 spin_lock(&fs_info->trans_lock); 5289 continue; 5290 } 5291 if (t == fs_info->running_transaction) { 5292 t->state = TRANS_STATE_COMMIT_DOING; 5293 spin_unlock(&fs_info->trans_lock); 5294 /* 5295 * We wait for 0 num_writers since we don't hold a trans 5296 * handle open currently for this transaction. 5297 */ 5298 wait_event(t->writer_wait, 5299 atomic_read(&t->num_writers) == 0); 5300 } else { 5301 spin_unlock(&fs_info->trans_lock); 5302 } 5303 btrfs_cleanup_one_transaction(t, fs_info); 5304 5305 spin_lock(&fs_info->trans_lock); 5306 if (t == fs_info->running_transaction) 5307 fs_info->running_transaction = NULL; 5308 list_del_init(&t->list); 5309 spin_unlock(&fs_info->trans_lock); 5310 5311 btrfs_put_transaction(t); 5312 trace_btrfs_transaction_commit(fs_info); 5313 spin_lock(&fs_info->trans_lock); 5314 } 5315 spin_unlock(&fs_info->trans_lock); 5316 btrfs_destroy_all_ordered_extents(fs_info); 5317 btrfs_destroy_delayed_inodes(fs_info); 5318 btrfs_assert_delayed_root_empty(fs_info); 5319 btrfs_destroy_all_delalloc_inodes(fs_info); 5320 btrfs_drop_all_logs(fs_info); 5321 mutex_unlock(&fs_info->transaction_kthread_mutex); 5322 5323 return 0; 5324 } 5325 5326 int btrfs_init_root_free_objectid(struct btrfs_root *root) 5327 { 5328 struct btrfs_path *path; 5329 int ret; 5330 struct extent_buffer *l; 5331 struct btrfs_key search_key; 5332 struct btrfs_key found_key; 5333 int slot; 5334 5335 path = btrfs_alloc_path(); 5336 if (!path) 5337 return -ENOMEM; 5338 5339 search_key.objectid = BTRFS_LAST_FREE_OBJECTID; 5340 search_key.type = -1; 5341 search_key.offset = (u64)-1; 5342 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 5343 if (ret < 0) 5344 goto error; 5345 BUG_ON(ret == 0); /* Corruption */ 5346 if (path->slots[0] > 0) { 5347 slot = path->slots[0] - 1; 5348 l = path->nodes[0]; 5349 btrfs_item_key_to_cpu(l, &found_key, slot); 5350 root->free_objectid = max_t(u64, found_key.objectid + 1, 5351 BTRFS_FIRST_FREE_OBJECTID); 5352 } else { 5353 root->free_objectid = BTRFS_FIRST_FREE_OBJECTID; 5354 } 5355 ret = 0; 5356 error: 5357 btrfs_free_path(path); 5358 return ret; 5359 } 5360 5361 int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid) 5362 { 5363 int ret; 5364 mutex_lock(&root->objectid_mutex); 5365 5366 if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) { 5367 btrfs_warn(root->fs_info, 5368 "the objectid of root %llu reaches its highest value", 5369 root->root_key.objectid); 5370 ret = -ENOSPC; 5371 goto out; 5372 } 5373 5374 *objectid = root->free_objectid++; 5375 ret = 0; 5376 out: 5377 mutex_unlock(&root->objectid_mutex); 5378 return ret; 5379 } 5380