1 // SPDX-License-Identifier: GPL-2.0 2 3 #include <linux/bitops.h> 4 #include <linux/slab.h> 5 #include <linux/blkdev.h> 6 #include <linux/sched/mm.h> 7 #include <linux/atomic.h> 8 #include <linux/vmalloc.h> 9 #include "ctree.h" 10 #include "volumes.h" 11 #include "zoned.h" 12 #include "rcu-string.h" 13 #include "disk-io.h" 14 #include "block-group.h" 15 #include "transaction.h" 16 #include "dev-replace.h" 17 #include "space-info.h" 18 19 /* Maximum number of zones to report per blkdev_report_zones() call */ 20 #define BTRFS_REPORT_NR_ZONES 4096 21 /* Invalid allocation pointer value for missing devices */ 22 #define WP_MISSING_DEV ((u64)-1) 23 /* Pseudo write pointer value for conventional zone */ 24 #define WP_CONVENTIONAL ((u64)-2) 25 26 /* 27 * Location of the first zone of superblock logging zone pairs. 28 * 29 * - primary superblock: 0B (zone 0) 30 * - first copy: 512G (zone starting at that offset) 31 * - second copy: 4T (zone starting at that offset) 32 */ 33 #define BTRFS_SB_LOG_PRIMARY_OFFSET (0ULL) 34 #define BTRFS_SB_LOG_FIRST_OFFSET (512ULL * SZ_1G) 35 #define BTRFS_SB_LOG_SECOND_OFFSET (4096ULL * SZ_1G) 36 37 #define BTRFS_SB_LOG_FIRST_SHIFT const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET) 38 #define BTRFS_SB_LOG_SECOND_SHIFT const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET) 39 40 /* Number of superblock log zones */ 41 #define BTRFS_NR_SB_LOG_ZONES 2 42 43 /* 44 * Minimum of active zones we need: 45 * 46 * - BTRFS_SUPER_MIRROR_MAX zones for superblock mirrors 47 * - 3 zones to ensure at least one zone per SYSTEM, META and DATA block group 48 * - 1 zone for tree-log dedicated block group 49 * - 1 zone for relocation 50 */ 51 #define BTRFS_MIN_ACTIVE_ZONES (BTRFS_SUPER_MIRROR_MAX + 5) 52 53 /* 54 * Minimum / maximum supported zone size. Currently, SMR disks have a zone 55 * size of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. 56 * We do not expect the zone size to become larger than 8GiB or smaller than 57 * 4MiB in the near future. 
58 */ 59 #define BTRFS_MAX_ZONE_SIZE SZ_8G 60 #define BTRFS_MIN_ZONE_SIZE SZ_4M 61 62 #define SUPER_INFO_SECTORS ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT) 63 64 static inline bool sb_zone_is_full(const struct blk_zone *zone) 65 { 66 return (zone->cond == BLK_ZONE_COND_FULL) || 67 (zone->wp + SUPER_INFO_SECTORS > zone->start + zone->capacity); 68 } 69 70 static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data) 71 { 72 struct blk_zone *zones = data; 73 74 memcpy(&zones[idx], zone, sizeof(*zone)); 75 76 return 0; 77 } 78 79 static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, 80 u64 *wp_ret) 81 { 82 bool empty[BTRFS_NR_SB_LOG_ZONES]; 83 bool full[BTRFS_NR_SB_LOG_ZONES]; 84 sector_t sector; 85 int i; 86 87 for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) { 88 ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL); 89 empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY); 90 full[i] = sb_zone_is_full(&zones[i]); 91 } 92 93 /* 94 * Possible states of log buffer zones 95 * 96 * Empty[0] In use[0] Full[0] 97 * Empty[1] * 0 1 98 * In use[1] x x 1 99 * Full[1] 0 0 C 100 * 101 * Log position: 102 * *: Special case, no superblock is written 103 * 0: Use write pointer of zones[0] 104 * 1: Use write pointer of zones[1] 105 * C: Compare super blocks from zones[0] and zones[1], use the latest 106 * one determined by generation 107 * x: Invalid state 108 */ 109 110 if (empty[0] && empty[1]) { 111 /* Special case to distinguish no superblock to read */ 112 *wp_ret = zones[0].start << SECTOR_SHIFT; 113 return -ENOENT; 114 } else if (full[0] && full[1]) { 115 /* Compare two super blocks */ 116 struct address_space *mapping = bdev->bd_inode->i_mapping; 117 struct page *page[BTRFS_NR_SB_LOG_ZONES]; 118 struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES]; 119 int i; 120 121 for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) { 122 u64 bytenr; 123 124 bytenr = ((zones[i].start + zones[i].len) 125 << SECTOR_SHIFT) - BTRFS_SUPER_INFO_SIZE; 126 127 page[i] = read_cache_page_gfp(mapping, 128 bytenr >> PAGE_SHIFT, GFP_NOFS); 129 if (IS_ERR(page[i])) { 130 if (i == 1) 131 btrfs_release_disk_super(super[0]); 132 return PTR_ERR(page[i]); 133 } 134 super[i] = page_address(page[i]); 135 } 136 137 if (super[0]->generation > super[1]->generation) 138 sector = zones[1].start; 139 else 140 sector = zones[0].start; 141 142 for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) 143 btrfs_release_disk_super(super[i]); 144 } else if (!full[0] && (empty[1] || full[1])) { 145 sector = zones[0].wp; 146 } else if (full[0]) { 147 sector = zones[1].wp; 148 } else { 149 return -EUCLEAN; 150 } 151 *wp_ret = sector << SECTOR_SHIFT; 152 return 0; 153 } 154 155 /* 156 * Get the first zone number of the superblock mirror 157 */ 158 static inline u32 sb_zone_number(int shift, int mirror) 159 { 160 u64 zone; 161 162 ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX); 163 switch (mirror) { 164 case 0: zone = 0; break; 165 case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break; 166 case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break; 167 } 168 169 ASSERT(zone <= U32_MAX); 170 171 return (u32)zone; 172 } 173 174 static inline sector_t zone_start_sector(u32 zone_number, 175 struct block_device *bdev) 176 { 177 return (sector_t)zone_number << ilog2(bdev_zone_sectors(bdev)); 178 } 179 180 static inline u64 zone_start_physical(u32 zone_number, 181 struct btrfs_zoned_device_info *zone_info) 182 { 183 return (u64)zone_number << zone_info->zone_size_shift; 184 } 185 186 /* 187 * Emulate blkdev_report_zones() for a 
non-zoned device. It slices up the block 188 * device into fixed-size chunks and fakes a conventional zone on each of 189 * them. 190 */ 191 static int emulate_report_zones(struct btrfs_device *device, u64 pos, 192 struct blk_zone *zones, unsigned int nr_zones) 193 { 194 const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT; 195 sector_t bdev_size = bdev_nr_sectors(device->bdev); 196 unsigned int i; 197 198 pos >>= SECTOR_SHIFT; 199 for (i = 0; i < nr_zones; i++) { 200 zones[i].start = i * zone_sectors + pos; 201 zones[i].len = zone_sectors; 202 zones[i].capacity = zone_sectors; 203 zones[i].wp = zones[i].start + zone_sectors; 204 zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL; 205 zones[i].cond = BLK_ZONE_COND_NOT_WP; 206 207 if (zones[i].wp >= bdev_size) { 208 i++; 209 break; 210 } 211 } 212 213 return i; 214 } 215 216 static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, 217 struct blk_zone *zones, unsigned int *nr_zones) 218 { 219 struct btrfs_zoned_device_info *zinfo = device->zone_info; 220 u32 zno; 221 int ret; 222 223 if (!*nr_zones) 224 return 0; 225 226 if (!bdev_is_zoned(device->bdev)) { 227 ret = emulate_report_zones(device, pos, zones, *nr_zones); 228 *nr_zones = ret; 229 return 0; 230 } 231 232 /* Check cache */ 233 if (zinfo->zone_cache) { 234 unsigned int i; 235 236 ASSERT(IS_ALIGNED(pos, zinfo->zone_size)); 237 zno = pos >> zinfo->zone_size_shift; 238 /* 239 * We cannot report zones beyond the zone end. So, it is OK to 240 * cap *nr_zones at the zone end. 241 */ 242 *nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno); 243 244 for (i = 0; i < *nr_zones; i++) { 245 struct blk_zone *zone_info; 246 247 zone_info = &zinfo->zone_cache[zno + i]; 248 if (!zone_info->len) 249 break; 250 } 251 252 if (i == *nr_zones) { 253 /* Cache hit on all the zones */ 254 memcpy(zones, zinfo->zone_cache + zno, 255 sizeof(*zinfo->zone_cache) * *nr_zones); 256 return 0; 257 } 258 } 259 260 ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones, 261 copy_zone_info_cb, zones); 262 if (ret < 0) { 263 btrfs_err_in_rcu(device->fs_info, 264 "zoned: failed to read zone %llu on %s (devid %llu)", 265 pos, rcu_str_deref(device->name), 266 device->devid); 267 return ret; 268 } 269 *nr_zones = ret; 270 if (!ret) 271 return -EIO; 272 273 /* Populate cache */ 274 if (zinfo->zone_cache) 275 memcpy(zinfo->zone_cache + zno, zones, 276 sizeof(*zinfo->zone_cache) * *nr_zones); 277 278 return 0; 279 } 280 281 /* The emulated zone size is determined from the size of the first device extent */ 282 static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info) 283 { 284 struct btrfs_path *path; 285 struct btrfs_root *root = fs_info->dev_root; 286 struct btrfs_key key; 287 struct extent_buffer *leaf; 288 struct btrfs_dev_extent *dext; 289 int ret = 0; 290 291 key.objectid = 1; 292 key.type = BTRFS_DEV_EXTENT_KEY; 293 key.offset = 0; 294 295 path = btrfs_alloc_path(); 296 if (!path) 297 return -ENOMEM; 298 299 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 300 if (ret < 0) 301 goto out; 302 303 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 304 ret = btrfs_next_leaf(root, path); 305 if (ret < 0) 306 goto out; 307 /* No dev extents at all?
Not good */ 308 if (ret > 0) { 309 ret = -EUCLEAN; 310 goto out; 311 } 312 } 313 314 leaf = path->nodes[0]; 315 dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); 316 fs_info->zone_size = btrfs_dev_extent_length(leaf, dext); 317 ret = 0; 318 319 out: 320 btrfs_free_path(path); 321 322 return ret; 323 } 324 325 int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) 326 { 327 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 328 struct btrfs_device *device; 329 int ret = 0; 330 331 /* fs_info->zone_size might not be set yet. Use the incompat flag here. */ 332 if (!btrfs_fs_incompat(fs_info, ZONED)) 333 return 0; 334 335 mutex_lock(&fs_devices->device_list_mutex); 336 list_for_each_entry(device, &fs_devices->devices, dev_list) { 337 /* We can skip reading zone info for missing devices */ 338 if (!device->bdev) 339 continue; 340 341 ret = btrfs_get_dev_zone_info(device, true); 342 if (ret) 343 break; 344 } 345 mutex_unlock(&fs_devices->device_list_mutex); 346 347 return ret; 348 } 349 350 int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) 351 { 352 struct btrfs_fs_info *fs_info = device->fs_info; 353 struct btrfs_zoned_device_info *zone_info = NULL; 354 struct block_device *bdev = device->bdev; 355 unsigned int max_active_zones; 356 unsigned int nactive; 357 sector_t nr_sectors; 358 sector_t sector = 0; 359 struct blk_zone *zones = NULL; 360 unsigned int i, nreported = 0, nr_zones; 361 sector_t zone_sectors; 362 char *model, *emulated; 363 int ret; 364 365 /* 366 * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not 367 * yet be set. 368 */ 369 if (!btrfs_fs_incompat(fs_info, ZONED)) 370 return 0; 371 372 if (device->zone_info) 373 return 0; 374 375 zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL); 376 if (!zone_info) 377 return -ENOMEM; 378 379 device->zone_info = zone_info; 380 381 if (!bdev_is_zoned(bdev)) { 382 if (!fs_info->zone_size) { 383 ret = calculate_emulated_zone_size(fs_info); 384 if (ret) 385 goto out; 386 } 387 388 ASSERT(fs_info->zone_size); 389 zone_sectors = fs_info->zone_size >> SECTOR_SHIFT; 390 } else { 391 zone_sectors = bdev_zone_sectors(bdev); 392 } 393 394 /* Check if it's a power of 2 (see is_power_of_2) */ 395 ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0); 396 zone_info->zone_size = zone_sectors << SECTOR_SHIFT; 397 398 /* We reject devices with a zone size larger than 8GiB */ 399 if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) { 400 btrfs_err_in_rcu(fs_info, 401 "zoned: %s: zone size %llu larger than supported maximum %llu", 402 rcu_str_deref(device->name), 403 zone_info->zone_size, BTRFS_MAX_ZONE_SIZE); 404 ret = -EINVAL; 405 goto out; 406 } else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) { 407 btrfs_err_in_rcu(fs_info, 408 "zoned: %s: zone size %llu smaller than supported minimum %u", 409 rcu_str_deref(device->name), 410 zone_info->zone_size, BTRFS_MIN_ZONE_SIZE); 411 ret = -EINVAL; 412 goto out; 413 } 414 415 nr_sectors = bdev_nr_sectors(bdev); 416 zone_info->zone_size_shift = ilog2(zone_info->zone_size); 417 zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors); 418 /* 419 * We limit max_zone_append_size also by max_segments * 420 * PAGE_SIZE. Technically, we can have multiple pages per segment. But, 421 * since btrfs adds the pages one by one to a bio, and btrfs cannot 422 * increase the metadata reservation even if it increases the number of 423 * extents, it is safe to stick with the limit. 
424 */ 425 zone_info->max_zone_append_size = 426 min_t(u64, (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT, 427 (u64)bdev_max_segments(bdev) << PAGE_SHIFT); 428 if (!IS_ALIGNED(nr_sectors, zone_sectors)) 429 zone_info->nr_zones++; 430 431 max_active_zones = bdev_max_active_zones(bdev); 432 if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) { 433 btrfs_err_in_rcu(fs_info, 434 "zoned: %s: max active zones %u is too small, need at least %u active zones", 435 rcu_str_deref(device->name), max_active_zones, 436 BTRFS_MIN_ACTIVE_ZONES); 437 ret = -EINVAL; 438 goto out; 439 } 440 zone_info->max_active_zones = max_active_zones; 441 442 zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); 443 if (!zone_info->seq_zones) { 444 ret = -ENOMEM; 445 goto out; 446 } 447 448 zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); 449 if (!zone_info->empty_zones) { 450 ret = -ENOMEM; 451 goto out; 452 } 453 454 zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); 455 if (!zone_info->active_zones) { 456 ret = -ENOMEM; 457 goto out; 458 } 459 460 zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL); 461 if (!zones) { 462 ret = -ENOMEM; 463 goto out; 464 } 465 466 /* 467 * Enable zone cache only for a zoned device. On a non-zoned device, we 468 * fill the zone info with emulated CONVENTIONAL zones, so no need to 469 * use the cache. 470 */ 471 if (populate_cache && bdev_is_zoned(device->bdev)) { 472 zone_info->zone_cache = vzalloc(sizeof(struct blk_zone) * 473 zone_info->nr_zones); 474 if (!zone_info->zone_cache) { 475 btrfs_err_in_rcu(device->fs_info, 476 "zoned: failed to allocate zone cache for %s", 477 rcu_str_deref(device->name)); 478 ret = -ENOMEM; 479 goto out; 480 } 481 } 482 483 /* Get zones type */ 484 nactive = 0; 485 while (sector < nr_sectors) { 486 nr_zones = BTRFS_REPORT_NR_ZONES; 487 ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones, 488 &nr_zones); 489 if (ret) 490 goto out; 491 492 for (i = 0; i < nr_zones; i++) { 493 if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ) 494 __set_bit(nreported, zone_info->seq_zones); 495 switch (zones[i].cond) { 496 case BLK_ZONE_COND_EMPTY: 497 __set_bit(nreported, zone_info->empty_zones); 498 break; 499 case BLK_ZONE_COND_IMP_OPEN: 500 case BLK_ZONE_COND_EXP_OPEN: 501 case BLK_ZONE_COND_CLOSED: 502 __set_bit(nreported, zone_info->active_zones); 503 nactive++; 504 break; 505 } 506 nreported++; 507 } 508 sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len; 509 } 510 511 if (nreported != zone_info->nr_zones) { 512 btrfs_err_in_rcu(device->fs_info, 513 "inconsistent number of zones on %s (%u/%u)", 514 rcu_str_deref(device->name), nreported, 515 zone_info->nr_zones); 516 ret = -EIO; 517 goto out; 518 } 519 520 if (max_active_zones) { 521 if (nactive > max_active_zones) { 522 btrfs_err_in_rcu(device->fs_info, 523 "zoned: %u active zones on %s exceeds max_active_zones %u", 524 nactive, rcu_str_deref(device->name), 525 max_active_zones); 526 ret = -EIO; 527 goto out; 528 } 529 atomic_set(&zone_info->active_zones_left, 530 max_active_zones - nactive); 531 } 532 533 /* Validate superblock log */ 534 nr_zones = BTRFS_NR_SB_LOG_ZONES; 535 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 536 u32 sb_zone; 537 u64 sb_wp; 538 int sb_pos = BTRFS_NR_SB_LOG_ZONES * i; 539 540 sb_zone = sb_zone_number(zone_info->zone_size_shift, i); 541 if (sb_zone + 1 >= zone_info->nr_zones) 542 continue; 543 544 ret = btrfs_get_dev_zones(device, 545 
zone_start_physical(sb_zone, zone_info), 546 &zone_info->sb_zones[sb_pos], 547 &nr_zones); 548 if (ret) 549 goto out; 550 551 if (nr_zones != BTRFS_NR_SB_LOG_ZONES) { 552 btrfs_err_in_rcu(device->fs_info, 553 "zoned: failed to read super block log zone info at devid %llu zone %u", 554 device->devid, sb_zone); 555 ret = -EUCLEAN; 556 goto out; 557 } 558 559 /* 560 * If zones[0] is conventional, always use the beginning of the 561 * zone to record the superblock. No need to validate in that case. 562 */ 563 if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type == 564 BLK_ZONE_TYPE_CONVENTIONAL) 565 continue; 566 567 ret = sb_write_pointer(device->bdev, 568 &zone_info->sb_zones[sb_pos], &sb_wp); 569 if (ret != -ENOENT && ret) { 570 btrfs_err_in_rcu(device->fs_info, 571 "zoned: super block log zone corrupted devid %llu zone %u", 572 device->devid, sb_zone); 573 ret = -EUCLEAN; 574 goto out; 575 } 576 } 577 578 579 kfree(zones); 580 581 switch (bdev_zoned_model(bdev)) { 582 case BLK_ZONED_HM: 583 model = "host-managed zoned"; 584 emulated = ""; 585 break; 586 case BLK_ZONED_HA: 587 model = "host-aware zoned"; 588 emulated = ""; 589 break; 590 case BLK_ZONED_NONE: 591 model = "regular"; 592 emulated = "emulated "; 593 break; 594 default: 595 /* Just in case */ 596 btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s", 597 bdev_zoned_model(bdev), 598 rcu_str_deref(device->name)); 599 ret = -EOPNOTSUPP; 600 goto out_free_zone_info; 601 } 602 603 btrfs_info_in_rcu(fs_info, 604 "%s block device %s, %u %szones of %llu bytes", 605 model, rcu_str_deref(device->name), zone_info->nr_zones, 606 emulated, zone_info->zone_size); 607 608 return 0; 609 610 out: 611 kfree(zones); 612 out_free_zone_info: 613 btrfs_destroy_dev_zone_info(device); 614 615 return ret; 616 } 617 618 void btrfs_destroy_dev_zone_info(struct btrfs_device *device) 619 { 620 struct btrfs_zoned_device_info *zone_info = device->zone_info; 621 622 if (!zone_info) 623 return; 624 625 bitmap_free(zone_info->active_zones); 626 bitmap_free(zone_info->seq_zones); 627 bitmap_free(zone_info->empty_zones); 628 vfree(zone_info->zone_cache); 629 kfree(zone_info); 630 device->zone_info = NULL; 631 } 632 633 int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, 634 struct blk_zone *zone) 635 { 636 unsigned int nr_zones = 1; 637 int ret; 638 639 ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones); 640 if (ret != 0 || !nr_zones) 641 return ret ? ret : -EIO; 642 643 return 0; 644 } 645 646 int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) 647 { 648 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 649 struct btrfs_device *device; 650 u64 zoned_devices = 0; 651 u64 nr_devices = 0; 652 u64 zone_size = 0; 653 u64 max_zone_append_size = 0; 654 const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED); 655 int ret = 0; 656 657 /* Count zoned devices */ 658 list_for_each_entry(device, &fs_devices->devices, dev_list) { 659 enum blk_zoned_model model; 660 661 if (!device->bdev) 662 continue; 663 664 model = bdev_zoned_model(device->bdev); 665 /* 666 * A Host-Managed zoned device must be used as a zoned device. 667 * A Host-Aware zoned device and a non-zoned device can be 668 * treated as a zoned device, if the ZONED flag is enabled in the 669 * superblock. 
670 */ 671 if (model == BLK_ZONED_HM || 672 (model == BLK_ZONED_HA && incompat_zoned) || 673 (model == BLK_ZONED_NONE && incompat_zoned)) { 674 struct btrfs_zoned_device_info *zone_info; 675 676 zone_info = device->zone_info; 677 zoned_devices++; 678 if (!zone_size) { 679 zone_size = zone_info->zone_size; 680 } else if (zone_info->zone_size != zone_size) { 681 btrfs_err(fs_info, 682 "zoned: unequal block device zone sizes: have %llu found %llu", 683 device->zone_info->zone_size, 684 zone_size); 685 ret = -EINVAL; 686 goto out; 687 } 688 if (!max_zone_append_size || 689 (zone_info->max_zone_append_size && 690 zone_info->max_zone_append_size < max_zone_append_size)) 691 max_zone_append_size = 692 zone_info->max_zone_append_size; 693 } 694 nr_devices++; 695 } 696 697 if (!zoned_devices && !incompat_zoned) 698 goto out; 699 700 if (!zoned_devices && incompat_zoned) { 701 /* No zoned block device found on ZONED filesystem */ 702 btrfs_err(fs_info, 703 "zoned: no zoned devices found on a zoned filesystem"); 704 ret = -EINVAL; 705 goto out; 706 } 707 708 if (zoned_devices && !incompat_zoned) { 709 btrfs_err(fs_info, 710 "zoned: mode not enabled but zoned device found"); 711 ret = -EINVAL; 712 goto out; 713 } 714 715 if (zoned_devices != nr_devices) { 716 btrfs_err(fs_info, 717 "zoned: cannot mix zoned and regular devices"); 718 ret = -EINVAL; 719 goto out; 720 } 721 722 /* 723 * stripe_size is always aligned to BTRFS_STRIPE_LEN in 724 * btrfs_create_chunk(). Since we want stripe_len == zone_size, 725 * check the alignment here. 726 */ 727 if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) { 728 btrfs_err(fs_info, 729 "zoned: zone size %llu not aligned to stripe %u", 730 zone_size, BTRFS_STRIPE_LEN); 731 ret = -EINVAL; 732 goto out; 733 } 734 735 if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { 736 btrfs_err(fs_info, "zoned: mixed block groups not supported"); 737 ret = -EINVAL; 738 goto out; 739 } 740 741 fs_info->zone_size = zone_size; 742 fs_info->max_zone_append_size = ALIGN_DOWN(max_zone_append_size, 743 fs_info->sectorsize); 744 fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; 745 if (fs_info->max_zone_append_size < fs_info->max_extent_size) 746 fs_info->max_extent_size = fs_info->max_zone_append_size; 747 748 /* 749 * Check mount options here, because we might change fs_info->zoned 750 * from fs_info->zone_size. 751 */ 752 ret = btrfs_check_mountopts_zoned(fs_info); 753 if (ret) 754 goto out; 755 756 btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size); 757 out: 758 return ret; 759 } 760 761 int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info) 762 { 763 if (!btrfs_is_zoned(info)) 764 return 0; 765 766 /* 767 * Space cache writing is not COWed. Disable that to avoid write errors 768 * in sequential zones. 
769 */ 770 if (btrfs_test_opt(info, SPACE_CACHE)) { 771 btrfs_err(info, "zoned: space cache v1 is not supported"); 772 return -EINVAL; 773 } 774 775 if (btrfs_test_opt(info, NODATACOW)) { 776 btrfs_err(info, "zoned: NODATACOW not supported"); 777 return -EINVAL; 778 } 779 780 return 0; 781 } 782 783 static int sb_log_location(struct block_device *bdev, struct blk_zone *zones, 784 int rw, u64 *bytenr_ret) 785 { 786 u64 wp; 787 int ret; 788 789 if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) { 790 *bytenr_ret = zones[0].start << SECTOR_SHIFT; 791 return 0; 792 } 793 794 ret = sb_write_pointer(bdev, zones, &wp); 795 if (ret != -ENOENT && ret < 0) 796 return ret; 797 798 if (rw == WRITE) { 799 struct blk_zone *reset = NULL; 800 801 if (wp == zones[0].start << SECTOR_SHIFT) 802 reset = &zones[0]; 803 else if (wp == zones[1].start << SECTOR_SHIFT) 804 reset = &zones[1]; 805 806 if (reset && reset->cond != BLK_ZONE_COND_EMPTY) { 807 ASSERT(sb_zone_is_full(reset)); 808 809 ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, 810 reset->start, reset->len, 811 GFP_NOFS); 812 if (ret) 813 return ret; 814 815 reset->cond = BLK_ZONE_COND_EMPTY; 816 reset->wp = reset->start; 817 } 818 } else if (ret != -ENOENT) { 819 /* 820 * For READ, we want the previous one. Move write pointer to 821 * the end of a zone, if it is at the head of a zone. 822 */ 823 u64 zone_end = 0; 824 825 if (wp == zones[0].start << SECTOR_SHIFT) 826 zone_end = zones[1].start + zones[1].capacity; 827 else if (wp == zones[1].start << SECTOR_SHIFT) 828 zone_end = zones[0].start + zones[0].capacity; 829 if (zone_end) 830 wp = ALIGN_DOWN(zone_end << SECTOR_SHIFT, 831 BTRFS_SUPER_INFO_SIZE); 832 833 wp -= BTRFS_SUPER_INFO_SIZE; 834 } 835 836 *bytenr_ret = wp; 837 return 0; 838 839 } 840 841 int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, 842 u64 *bytenr_ret) 843 { 844 struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES]; 845 sector_t zone_sectors; 846 u32 sb_zone; 847 int ret; 848 u8 zone_sectors_shift; 849 sector_t nr_sectors; 850 u32 nr_zones; 851 852 if (!bdev_is_zoned(bdev)) { 853 *bytenr_ret = btrfs_sb_offset(mirror); 854 return 0; 855 } 856 857 ASSERT(rw == READ || rw == WRITE); 858 859 zone_sectors = bdev_zone_sectors(bdev); 860 if (!is_power_of_2(zone_sectors)) 861 return -EINVAL; 862 zone_sectors_shift = ilog2(zone_sectors); 863 nr_sectors = bdev_nr_sectors(bdev); 864 nr_zones = nr_sectors >> zone_sectors_shift; 865 866 sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror); 867 if (sb_zone + 1 >= nr_zones) 868 return -ENOENT; 869 870 ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev), 871 BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb, 872 zones); 873 if (ret < 0) 874 return ret; 875 if (ret != BTRFS_NR_SB_LOG_ZONES) 876 return -EIO; 877 878 return sb_log_location(bdev, zones, rw, bytenr_ret); 879 } 880 881 int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw, 882 u64 *bytenr_ret) 883 { 884 struct btrfs_zoned_device_info *zinfo = device->zone_info; 885 u32 zone_num; 886 887 /* 888 * For a zoned filesystem on a non-zoned block device, use the same 889 * super block locations as regular filesystem. Doing so, the super 890 * block can always be retrieved and the zoned flag of the volume 891 * detected from the super block information. 
892 */ 893 if (!bdev_is_zoned(device->bdev)) { 894 *bytenr_ret = btrfs_sb_offset(mirror); 895 return 0; 896 } 897 898 zone_num = sb_zone_number(zinfo->zone_size_shift, mirror); 899 if (zone_num + 1 >= zinfo->nr_zones) 900 return -ENOENT; 901 902 return sb_log_location(device->bdev, 903 &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror], 904 rw, bytenr_ret); 905 } 906 907 static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo, 908 int mirror) 909 { 910 u32 zone_num; 911 912 if (!zinfo) 913 return false; 914 915 zone_num = sb_zone_number(zinfo->zone_size_shift, mirror); 916 if (zone_num + 1 >= zinfo->nr_zones) 917 return false; 918 919 if (!test_bit(zone_num, zinfo->seq_zones)) 920 return false; 921 922 return true; 923 } 924 925 int btrfs_advance_sb_log(struct btrfs_device *device, int mirror) 926 { 927 struct btrfs_zoned_device_info *zinfo = device->zone_info; 928 struct blk_zone *zone; 929 int i; 930 931 if (!is_sb_log_zone(zinfo, mirror)) 932 return 0; 933 934 zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror]; 935 for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) { 936 /* Advance the next zone */ 937 if (zone->cond == BLK_ZONE_COND_FULL) { 938 zone++; 939 continue; 940 } 941 942 if (zone->cond == BLK_ZONE_COND_EMPTY) 943 zone->cond = BLK_ZONE_COND_IMP_OPEN; 944 945 zone->wp += SUPER_INFO_SECTORS; 946 947 if (sb_zone_is_full(zone)) { 948 /* 949 * No room left to write new superblock. Since 950 * superblock is written with REQ_SYNC, it is safe to 951 * finish the zone now. 952 * 953 * If the write pointer is exactly at the capacity, 954 * explicit ZONE_FINISH is not necessary. 955 */ 956 if (zone->wp != zone->start + zone->capacity) { 957 int ret; 958 959 ret = blkdev_zone_mgmt(device->bdev, 960 REQ_OP_ZONE_FINISH, zone->start, 961 zone->len, GFP_NOFS); 962 if (ret) 963 return ret; 964 } 965 966 zone->wp = zone->start + zone->len; 967 zone->cond = BLK_ZONE_COND_FULL; 968 } 969 return 0; 970 } 971 972 /* All the zones are FULL. Should not reach here. */ 973 ASSERT(0); 974 return -EIO; 975 } 976 977 int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) 978 { 979 sector_t zone_sectors; 980 sector_t nr_sectors; 981 u8 zone_sectors_shift; 982 u32 sb_zone; 983 u32 nr_zones; 984 985 zone_sectors = bdev_zone_sectors(bdev); 986 zone_sectors_shift = ilog2(zone_sectors); 987 nr_sectors = bdev_nr_sectors(bdev); 988 nr_zones = nr_sectors >> zone_sectors_shift; 989 990 sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror); 991 if (sb_zone + 1 >= nr_zones) 992 return -ENOENT; 993 994 return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, 995 zone_start_sector(sb_zone, bdev), 996 zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS); 997 } 998 999 /** 1000 * btrfs_find_allocatable_zones - find allocatable zones within a given region 1001 * 1002 * @device: the device to allocate a region on 1003 * @hole_start: the position of the hole to allocate the region 1004 * @num_bytes: size of wanted region 1005 * @hole_end: the end of the hole 1006 * @return: position of allocatable zones 1007 * 1008 * Allocatable region should not contain any superblock locations. 
1009 */ 1010 u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start, 1011 u64 hole_end, u64 num_bytes) 1012 { 1013 struct btrfs_zoned_device_info *zinfo = device->zone_info; 1014 const u8 shift = zinfo->zone_size_shift; 1015 u64 nzones = num_bytes >> shift; 1016 u64 pos = hole_start; 1017 u64 begin, end; 1018 bool have_sb; 1019 int i; 1020 1021 ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size)); 1022 ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size)); 1023 1024 while (pos < hole_end) { 1025 begin = pos >> shift; 1026 end = begin + nzones; 1027 1028 if (end > zinfo->nr_zones) 1029 return hole_end; 1030 1031 /* Check if zones in the region are all empty */ 1032 if (btrfs_dev_is_sequential(device, pos) && 1033 find_next_zero_bit(zinfo->empty_zones, end, begin) != end) { 1034 pos += zinfo->zone_size; 1035 continue; 1036 } 1037 1038 have_sb = false; 1039 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 1040 u32 sb_zone; 1041 u64 sb_pos; 1042 1043 sb_zone = sb_zone_number(shift, i); 1044 if (!(end <= sb_zone || 1045 sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) { 1046 have_sb = true; 1047 pos = zone_start_physical( 1048 sb_zone + BTRFS_NR_SB_LOG_ZONES, zinfo); 1049 break; 1050 } 1051 1052 /* We also need to exclude regular superblock positions */ 1053 sb_pos = btrfs_sb_offset(i); 1054 if (!(pos + num_bytes <= sb_pos || 1055 sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) { 1056 have_sb = true; 1057 pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE, 1058 zinfo->zone_size); 1059 break; 1060 } 1061 } 1062 if (!have_sb) 1063 break; 1064 } 1065 1066 return pos; 1067 } 1068 1069 static bool btrfs_dev_set_active_zone(struct btrfs_device *device, u64 pos) 1070 { 1071 struct btrfs_zoned_device_info *zone_info = device->zone_info; 1072 unsigned int zno = (pos >> zone_info->zone_size_shift); 1073 1074 /* We can use any number of zones */ 1075 if (zone_info->max_active_zones == 0) 1076 return true; 1077 1078 if (!test_bit(zno, zone_info->active_zones)) { 1079 /* Active zone left? 
*/ 1080 if (atomic_dec_if_positive(&zone_info->active_zones_left) < 0) 1081 return false; 1082 if (test_and_set_bit(zno, zone_info->active_zones)) { 1083 /* Someone already set the bit */ 1084 atomic_inc(&zone_info->active_zones_left); 1085 } 1086 } 1087 1088 return true; 1089 } 1090 1091 static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos) 1092 { 1093 struct btrfs_zoned_device_info *zone_info = device->zone_info; 1094 unsigned int zno = (pos >> zone_info->zone_size_shift); 1095 1096 /* We can use any number of zones */ 1097 if (zone_info->max_active_zones == 0) 1098 return; 1099 1100 if (test_and_clear_bit(zno, zone_info->active_zones)) 1101 atomic_inc(&zone_info->active_zones_left); 1102 } 1103 1104 int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, 1105 u64 length, u64 *bytes) 1106 { 1107 int ret; 1108 1109 *bytes = 0; 1110 ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET, 1111 physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT, 1112 GFP_NOFS); 1113 if (ret) 1114 return ret; 1115 1116 *bytes = length; 1117 while (length) { 1118 btrfs_dev_set_zone_empty(device, physical); 1119 btrfs_dev_clear_active_zone(device, physical); 1120 physical += device->zone_info->zone_size; 1121 length -= device->zone_info->zone_size; 1122 } 1123 1124 return 0; 1125 } 1126 1127 int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size) 1128 { 1129 struct btrfs_zoned_device_info *zinfo = device->zone_info; 1130 const u8 shift = zinfo->zone_size_shift; 1131 unsigned long begin = start >> shift; 1132 unsigned long end = (start + size) >> shift; 1133 u64 pos; 1134 int ret; 1135 1136 ASSERT(IS_ALIGNED(start, zinfo->zone_size)); 1137 ASSERT(IS_ALIGNED(size, zinfo->zone_size)); 1138 1139 if (end > zinfo->nr_zones) 1140 return -ERANGE; 1141 1142 /* All the zones are conventional (note: bitops take size, then offset) */ 1143 if (find_next_bit(zinfo->seq_zones, end, begin) == end) 1144 return 0; 1145 1146 /* All the zones are sequential and empty */ 1147 if (find_next_zero_bit(zinfo->seq_zones, end, begin) == end && 1148 find_next_zero_bit(zinfo->empty_zones, end, begin) == end) 1149 return 0; 1150 1151 for (pos = start; pos < start + size; pos += zinfo->zone_size) { 1152 u64 reset_bytes; 1153 1154 if (!btrfs_dev_is_sequential(device, pos) || 1155 btrfs_dev_is_empty_zone(device, pos)) 1156 continue; 1157 1158 /* Free regions should be empty */ 1159 btrfs_warn_in_rcu( 1160 device->fs_info, 1161 "zoned: resetting device %s (devid %llu) zone %llu for allocation", 1162 rcu_str_deref(device->name), device->devid, pos >> shift); 1163 WARN_ON_ONCE(1); 1164 1165 ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size, 1166 &reset_bytes); 1167 if (ret) 1168 return ret; 1169 } 1170 1171 return 0; 1172 } 1173 1174 /* 1175 * Calculate an allocation pointer from the extent allocation information 1176 * for a block group consisting of conventional zones. It points to the 1177 * end of the highest addressed extent in the block group as the allocation 1178 * offset. 
1179 */ 1180 static int calculate_alloc_pointer(struct btrfs_block_group *cache, 1181 u64 *offset_ret) 1182 { 1183 struct btrfs_fs_info *fs_info = cache->fs_info; 1184 struct btrfs_root *root; 1185 struct btrfs_path *path; 1186 struct btrfs_key key; 1187 struct btrfs_key found_key; 1188 int ret; 1189 u64 length; 1190 1191 path = btrfs_alloc_path(); 1192 if (!path) 1193 return -ENOMEM; 1194 1195 key.objectid = cache->start + cache->length; 1196 key.type = 0; 1197 key.offset = 0; 1198 1199 root = btrfs_extent_root(fs_info, key.objectid); 1200 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1201 /* We should not find the exact match */ 1202 if (!ret) 1203 ret = -EUCLEAN; 1204 if (ret < 0) 1205 goto out; 1206 1207 ret = btrfs_previous_extent_item(root, path, cache->start); 1208 if (ret) { 1209 if (ret == 1) { 1210 ret = 0; 1211 *offset_ret = 0; 1212 } 1213 goto out; 1214 } 1215 1216 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 1217 1218 if (found_key.type == BTRFS_EXTENT_ITEM_KEY) 1219 length = found_key.offset; 1220 else 1221 length = fs_info->nodesize; 1222 1223 if (!(found_key.objectid >= cache->start && 1224 found_key.objectid + length <= cache->start + cache->length)) { 1225 ret = -EUCLEAN; 1226 goto out; 1227 } 1228 *offset_ret = found_key.objectid + length - cache->start; 1229 ret = 0; 1230 1231 out: 1232 btrfs_free_path(path); 1233 return ret; 1234 } 1235 1236 int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) 1237 { 1238 struct btrfs_fs_info *fs_info = cache->fs_info; 1239 struct extent_map_tree *em_tree = &fs_info->mapping_tree; 1240 struct extent_map *em; 1241 struct map_lookup *map; 1242 struct btrfs_device *device; 1243 u64 logical = cache->start; 1244 u64 length = cache->length; 1245 int ret; 1246 int i; 1247 unsigned int nofs_flag; 1248 u64 *alloc_offsets = NULL; 1249 u64 *caps = NULL; 1250 u64 *physical = NULL; 1251 unsigned long *active = NULL; 1252 u64 last_alloc = 0; 1253 u32 num_sequential = 0, num_conventional = 0; 1254 1255 if (!btrfs_is_zoned(fs_info)) 1256 return 0; 1257 1258 /* Sanity check */ 1259 if (!IS_ALIGNED(length, fs_info->zone_size)) { 1260 btrfs_err(fs_info, 1261 "zoned: block group %llu len %llu unaligned to zone size %llu", 1262 logical, length, fs_info->zone_size); 1263 return -EIO; 1264 } 1265 1266 /* Get the chunk mapping */ 1267 read_lock(&em_tree->lock); 1268 em = lookup_extent_mapping(em_tree, logical, length); 1269 read_unlock(&em_tree->lock); 1270 1271 if (!em) 1272 return -EINVAL; 1273 1274 map = em->map_lookup; 1275 1276 cache->physical_map = kmemdup(map, map_lookup_size(map->num_stripes), GFP_NOFS); 1277 if (!cache->physical_map) { 1278 ret = -ENOMEM; 1279 goto out; 1280 } 1281 1282 alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS); 1283 if (!alloc_offsets) { 1284 ret = -ENOMEM; 1285 goto out; 1286 } 1287 1288 caps = kcalloc(map->num_stripes, sizeof(*caps), GFP_NOFS); 1289 if (!caps) { 1290 ret = -ENOMEM; 1291 goto out; 1292 } 1293 1294 physical = kcalloc(map->num_stripes, sizeof(*physical), GFP_NOFS); 1295 if (!physical) { 1296 ret = -ENOMEM; 1297 goto out; 1298 } 1299 1300 active = bitmap_zalloc(map->num_stripes, GFP_NOFS); 1301 if (!active) { 1302 ret = -ENOMEM; 1303 goto out; 1304 } 1305 1306 for (i = 0; i < map->num_stripes; i++) { 1307 bool is_sequential; 1308 struct blk_zone zone; 1309 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 1310 int dev_replace_is_ongoing = 0; 1311 1312 device = map->stripes[i].dev; 1313 physical[i] = 
map->stripes[i].physical; 1314 1315 if (device->bdev == NULL) { 1316 alloc_offsets[i] = WP_MISSING_DEV; 1317 continue; 1318 } 1319 1320 is_sequential = btrfs_dev_is_sequential(device, physical[i]); 1321 if (is_sequential) 1322 num_sequential++; 1323 else 1324 num_conventional++; 1325 1326 if (!is_sequential) { 1327 alloc_offsets[i] = WP_CONVENTIONAL; 1328 continue; 1329 } 1330 1331 /* 1332 * This zone will be used for allocation, so mark this zone 1333 * non-empty. 1334 */ 1335 btrfs_dev_clear_zone_empty(device, physical[i]); 1336 1337 down_read(&dev_replace->rwsem); 1338 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 1339 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) 1340 btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical[i]); 1341 up_read(&dev_replace->rwsem); 1342 1343 /* 1344 * The group is mapped to a sequential zone. Get the zone write 1345 * pointer to determine the allocation offset within the zone. 1346 */ 1347 WARN_ON(!IS_ALIGNED(physical[i], fs_info->zone_size)); 1348 nofs_flag = memalloc_nofs_save(); 1349 ret = btrfs_get_dev_zone(device, physical[i], &zone); 1350 memalloc_nofs_restore(nofs_flag); 1351 if (ret == -EIO || ret == -EOPNOTSUPP) { 1352 ret = 0; 1353 alloc_offsets[i] = WP_MISSING_DEV; 1354 continue; 1355 } else if (ret) { 1356 goto out; 1357 } 1358 1359 if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) { 1360 btrfs_err_in_rcu(fs_info, 1361 "zoned: unexpected conventional zone %llu on device %s (devid %llu)", 1362 zone.start << SECTOR_SHIFT, 1363 rcu_str_deref(device->name), device->devid); 1364 ret = -EIO; 1365 goto out; 1366 } 1367 1368 caps[i] = (zone.capacity << SECTOR_SHIFT); 1369 1370 switch (zone.cond) { 1371 case BLK_ZONE_COND_OFFLINE: 1372 case BLK_ZONE_COND_READONLY: 1373 btrfs_err(fs_info, 1374 "zoned: offline/readonly zone %llu on device %s (devid %llu)", 1375 physical[i] >> device->zone_info->zone_size_shift, 1376 rcu_str_deref(device->name), device->devid); 1377 alloc_offsets[i] = WP_MISSING_DEV; 1378 break; 1379 case BLK_ZONE_COND_EMPTY: 1380 alloc_offsets[i] = 0; 1381 break; 1382 case BLK_ZONE_COND_FULL: 1383 alloc_offsets[i] = caps[i]; 1384 break; 1385 default: 1386 /* Partially used zone */ 1387 alloc_offsets[i] = 1388 ((zone.wp - zone.start) << SECTOR_SHIFT); 1389 __set_bit(i, active); 1390 break; 1391 } 1392 1393 /* 1394 * Consider a zone as active if we can allow any number of 1395 * active zones. 1396 */ 1397 if (!device->zone_info->max_active_zones) 1398 __set_bit(i, active); 1399 } 1400 1401 if (num_sequential > 0) 1402 cache->seq_zone = true; 1403 1404 if (num_conventional > 0) { 1405 /* 1406 * Avoid calling calculate_alloc_pointer() for new BG. It 1407 * is no use for new BG. It must be always 0. 1408 * 1409 * Also, we have a lock chain of extent buffer lock -> 1410 * chunk mutex. For new BG, this function is called from 1411 * btrfs_make_block_group() which is already taking the 1412 * chunk mutex. Thus, we cannot call 1413 * calculate_alloc_pointer() which takes extent buffer 1414 * locks to avoid deadlock. 
1415 */ 1416 1417 /* Zone capacity is always zone size in emulation */ 1418 cache->zone_capacity = cache->length; 1419 if (new) { 1420 cache->alloc_offset = 0; 1421 goto out; 1422 } 1423 ret = calculate_alloc_pointer(cache, &last_alloc); 1424 if (ret || map->num_stripes == num_conventional) { 1425 if (!ret) 1426 cache->alloc_offset = last_alloc; 1427 else 1428 btrfs_err(fs_info, 1429 "zoned: failed to determine allocation offset of bg %llu", 1430 cache->start); 1431 goto out; 1432 } 1433 } 1434 1435 switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 1436 case 0: /* single */ 1437 if (alloc_offsets[0] == WP_MISSING_DEV) { 1438 btrfs_err(fs_info, 1439 "zoned: cannot recover write pointer for zone %llu", 1440 physical[0]); 1441 ret = -EIO; 1442 goto out; 1443 } 1444 cache->alloc_offset = alloc_offsets[0]; 1445 cache->zone_capacity = caps[0]; 1446 cache->zone_is_active = test_bit(0, active); 1447 break; 1448 case BTRFS_BLOCK_GROUP_DUP: 1449 if (map->type & BTRFS_BLOCK_GROUP_DATA) { 1450 btrfs_err(fs_info, "zoned: profile DUP not yet supported on data bg"); 1451 ret = -EINVAL; 1452 goto out; 1453 } 1454 if (alloc_offsets[0] == WP_MISSING_DEV) { 1455 btrfs_err(fs_info, 1456 "zoned: cannot recover write pointer for zone %llu", 1457 physical[0]); 1458 ret = -EIO; 1459 goto out; 1460 } 1461 if (alloc_offsets[1] == WP_MISSING_DEV) { 1462 btrfs_err(fs_info, 1463 "zoned: cannot recover write pointer for zone %llu", 1464 physical[1]); 1465 ret = -EIO; 1466 goto out; 1467 } 1468 if (alloc_offsets[0] != alloc_offsets[1]) { 1469 btrfs_err(fs_info, 1470 "zoned: write pointer offset mismatch of zones in DUP profile"); 1471 ret = -EIO; 1472 goto out; 1473 } 1474 if (test_bit(0, active) != test_bit(1, active)) { 1475 if (!btrfs_zone_activate(cache)) { 1476 ret = -EIO; 1477 goto out; 1478 } 1479 } else { 1480 cache->zone_is_active = test_bit(0, active); 1481 } 1482 cache->alloc_offset = alloc_offsets[0]; 1483 cache->zone_capacity = min(caps[0], caps[1]); 1484 break; 1485 case BTRFS_BLOCK_GROUP_RAID1: 1486 case BTRFS_BLOCK_GROUP_RAID0: 1487 case BTRFS_BLOCK_GROUP_RAID10: 1488 case BTRFS_BLOCK_GROUP_RAID5: 1489 case BTRFS_BLOCK_GROUP_RAID6: 1490 /* non-single profiles are not supported yet */ 1491 default: 1492 btrfs_err(fs_info, "zoned: profile %s not yet supported", 1493 btrfs_bg_type_to_raid_name(map->type)); 1494 ret = -EINVAL; 1495 goto out; 1496 } 1497 1498 if (cache->zone_is_active) { 1499 btrfs_get_block_group(cache); 1500 spin_lock(&fs_info->zone_active_bgs_lock); 1501 list_add_tail(&cache->active_bg_list, &fs_info->zone_active_bgs); 1502 spin_unlock(&fs_info->zone_active_bgs_lock); 1503 } 1504 1505 out: 1506 if (cache->alloc_offset > fs_info->zone_size) { 1507 btrfs_err(fs_info, 1508 "zoned: invalid write pointer %llu in block group %llu", 1509 cache->alloc_offset, cache->start); 1510 ret = -EIO; 1511 } 1512 1513 if (cache->alloc_offset > cache->zone_capacity) { 1514 btrfs_err(fs_info, 1515 "zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu", 1516 cache->alloc_offset, cache->zone_capacity, 1517 cache->start); 1518 ret = -EIO; 1519 } 1520 1521 /* An extent is allocated after the write pointer */ 1522 if (!ret && num_conventional && last_alloc > cache->alloc_offset) { 1523 btrfs_err(fs_info, 1524 "zoned: got wrong write pointer in BG %llu: %llu > %llu", 1525 logical, last_alloc, cache->alloc_offset); 1526 ret = -EIO; 1527 } 1528 1529 if (!ret) 1530 cache->meta_write_pointer = cache->alloc_offset + cache->start; 1531 1532 if (ret) { 1533 kfree(cache->physical_map); 
cache->physical_map = NULL; 1535 } 1536 bitmap_free(active); 1537 kfree(physical); 1538 kfree(caps); 1539 kfree(alloc_offsets); 1540 free_extent_map(em); 1541 1542 return ret; 1543 } 1544 1545 void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) 1546 { 1547 u64 unusable, free; 1548 1549 if (!btrfs_is_zoned(cache->fs_info)) 1550 return; 1551 1552 WARN_ON(cache->bytes_super != 0); 1553 unusable = (cache->alloc_offset - cache->used) + 1554 (cache->length - cache->zone_capacity); 1555 free = cache->zone_capacity - cache->alloc_offset; 1556 1557 /* We only need ->free_space in ALLOC_SEQ block groups */ 1558 cache->last_byte_to_unpin = (u64)-1; 1559 cache->cached = BTRFS_CACHE_FINISHED; 1560 cache->free_space_ctl->free_space = free; 1561 cache->zone_unusable = unusable; 1562 } 1563 1564 void btrfs_redirty_list_add(struct btrfs_transaction *trans, 1565 struct extent_buffer *eb) 1566 { 1567 struct btrfs_fs_info *fs_info = eb->fs_info; 1568 1569 if (!btrfs_is_zoned(fs_info) || 1570 btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN) || 1571 !list_empty(&eb->release_list)) 1572 return; 1573 1574 set_extent_buffer_dirty(eb); 1575 set_extent_bits_nowait(&trans->dirty_pages, eb->start, 1576 eb->start + eb->len - 1, EXTENT_DIRTY); 1577 memzero_extent_buffer(eb, 0, eb->len); 1578 set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags); 1579 1580 spin_lock(&trans->releasing_ebs_lock); 1581 list_add_tail(&eb->release_list, &trans->releasing_ebs); 1582 spin_unlock(&trans->releasing_ebs_lock); 1583 atomic_inc(&eb->refs); 1584 } 1585 1586 void btrfs_free_redirty_list(struct btrfs_transaction *trans) 1587 { 1588 spin_lock(&trans->releasing_ebs_lock); 1589 while (!list_empty(&trans->releasing_ebs)) { 1590 struct extent_buffer *eb; 1591 1592 eb = list_first_entry(&trans->releasing_ebs, 1593 struct extent_buffer, release_list); 1594 list_del_init(&eb->release_list); 1595 free_extent_buffer(eb); 1596 } 1597 spin_unlock(&trans->releasing_ebs_lock); 1598 } 1599 1600 bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) 1601 { 1602 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1603 struct btrfs_block_group *cache; 1604 bool ret = false; 1605 1606 if (!btrfs_is_zoned(fs_info)) 1607 return false; 1608 1609 if (!is_data_inode(&inode->vfs_inode)) 1610 return false; 1611 1612 /* 1613 * Using REQ_OP_ZONE_APPEND for relocation can break assumptions on the 1614 * extent layout the relocation code has. 1615 * Furthermore we have set aside our own block group from which only the 1616 * relocation "process" can allocate, and we make sure only one process at a 1617 * time can add pages to an extent that gets relocated, so it's safe to 1618 * use regular REQ_OP_WRITE for this special case. 
1619 */ 1620 if (btrfs_is_data_reloc_root(inode->root)) 1621 return false; 1622 1623 cache = btrfs_lookup_block_group(fs_info, start); 1624 ASSERT(cache); 1625 if (!cache) 1626 return false; 1627 1628 ret = cache->seq_zone; 1629 btrfs_put_block_group(cache); 1630 1631 return ret; 1632 } 1633 1634 void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset, 1635 struct bio *bio) 1636 { 1637 struct btrfs_ordered_extent *ordered; 1638 const u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; 1639 1640 if (bio_op(bio) != REQ_OP_ZONE_APPEND) 1641 return; 1642 1643 ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), file_offset); 1644 if (WARN_ON(!ordered)) 1645 return; 1646 1647 ordered->physical = physical; 1648 ordered->bdev = bio->bi_bdev; 1649 1650 btrfs_put_ordered_extent(ordered); 1651 } 1652 1653 void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered) 1654 { 1655 struct btrfs_inode *inode = BTRFS_I(ordered->inode); 1656 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1657 struct extent_map_tree *em_tree; 1658 struct extent_map *em; 1659 struct btrfs_ordered_sum *sum; 1660 u64 orig_logical = ordered->disk_bytenr; 1661 u64 *logical = NULL; 1662 int nr, stripe_len; 1663 1664 /* Zoned devices should not have partitions. So, we can assume it is 0 */ 1665 ASSERT(!bdev_is_partition(ordered->bdev)); 1666 if (WARN_ON(!ordered->bdev)) 1667 return; 1668 1669 if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, ordered->bdev, 1670 ordered->physical, &logical, &nr, 1671 &stripe_len))) 1672 goto out; 1673 1674 WARN_ON(nr != 1); 1675 1676 if (orig_logical == *logical) 1677 goto out; 1678 1679 ordered->disk_bytenr = *logical; 1680 1681 em_tree = &inode->extent_tree; 1682 write_lock(&em_tree->lock); 1683 em = search_extent_mapping(em_tree, ordered->file_offset, 1684 ordered->num_bytes); 1685 em->block_start = *logical; 1686 free_extent_map(em); 1687 write_unlock(&em_tree->lock); 1688 1689 list_for_each_entry(sum, &ordered->list, list) { 1690 if (*logical < orig_logical) 1691 sum->bytenr -= orig_logical - *logical; 1692 else 1693 sum->bytenr += *logical - orig_logical; 1694 } 1695 1696 out: 1697 kfree(logical); 1698 } 1699 1700 bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, 1701 struct extent_buffer *eb, 1702 struct btrfs_block_group **cache_ret) 1703 { 1704 struct btrfs_block_group *cache; 1705 bool ret = true; 1706 1707 if (!btrfs_is_zoned(fs_info)) 1708 return true; 1709 1710 cache = btrfs_lookup_block_group(fs_info, eb->start); 1711 if (!cache) 1712 return true; 1713 1714 if (cache->meta_write_pointer != eb->start) { 1715 btrfs_put_block_group(cache); 1716 cache = NULL; 1717 ret = false; 1718 } else { 1719 cache->meta_write_pointer = eb->start + eb->len; 1720 } 1721 1722 *cache_ret = cache; 1723 1724 return ret; 1725 } 1726 1727 void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, 1728 struct extent_buffer *eb) 1729 { 1730 if (!btrfs_is_zoned(eb->fs_info) || !cache) 1731 return; 1732 1733 ASSERT(cache->meta_write_pointer == eb->start + eb->len); 1734 cache->meta_write_pointer = eb->start; 1735 } 1736 1737 int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length) 1738 { 1739 if (!btrfs_dev_is_sequential(device, physical)) 1740 return -EOPNOTSUPP; 1741 1742 return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT, 1743 length >> SECTOR_SHIFT, GFP_NOFS, 0); 1744 } 1745 1746 static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical, 1747 struct blk_zone *zone) 1748 { 1749 struct 
btrfs_io_context *bioc = NULL; 1750 u64 mapped_length = PAGE_SIZE; 1751 unsigned int nofs_flag; 1752 int nmirrors; 1753 int i, ret; 1754 1755 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, 1756 &mapped_length, &bioc); 1757 if (ret || !bioc || mapped_length < PAGE_SIZE) { 1758 ret = -EIO; 1759 goto out_put_bioc; 1760 } 1761 1762 if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 1763 ret = -EINVAL; 1764 goto out_put_bioc; 1765 } 1766 1767 nofs_flag = memalloc_nofs_save(); 1768 nmirrors = (int)bioc->num_stripes; 1769 for (i = 0; i < nmirrors; i++) { 1770 u64 physical = bioc->stripes[i].physical; 1771 struct btrfs_device *dev = bioc->stripes[i].dev; 1772 1773 /* Missing device */ 1774 if (!dev->bdev) 1775 continue; 1776 1777 ret = btrfs_get_dev_zone(dev, physical, zone); 1778 /* Failing device */ 1779 if (ret == -EIO || ret == -EOPNOTSUPP) 1780 continue; 1781 break; 1782 } 1783 memalloc_nofs_restore(nofs_flag); 1784 out_put_bioc: 1785 btrfs_put_bioc(bioc); 1786 return ret; 1787 } 1788 1789 /* 1790 * Synchronize the write pointer in a zone at @physical_start on @tgt_dev, by 1791 * filling zeros from @physical_pos up to the write pointer of the dev-replace 1792 * source device. 1793 */ 1794 int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, 1795 u64 physical_start, u64 physical_pos) 1796 { 1797 struct btrfs_fs_info *fs_info = tgt_dev->fs_info; 1798 struct blk_zone zone; 1799 u64 length; 1800 u64 wp; 1801 int ret; 1802 1803 if (!btrfs_dev_is_sequential(tgt_dev, physical_pos)) 1804 return 0; 1805 1806 ret = read_zone_info(fs_info, logical, &zone); 1807 if (ret) 1808 return ret; 1809 1810 wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT); 1811 1812 if (physical_pos == wp) 1813 return 0; 1814 1815 if (physical_pos > wp) 1816 return -EUCLEAN; 1817 1818 length = wp - physical_pos; 1819 return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length); 1820 } 1821 1822 struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, 1823 u64 logical, u64 length) 1824 { 1825 struct btrfs_device *device; 1826 struct extent_map *em; 1827 struct map_lookup *map; 1828 1829 em = btrfs_get_chunk_map(fs_info, logical, length); 1830 if (IS_ERR(em)) 1831 return ERR_CAST(em); 1832 1833 map = em->map_lookup; 1834 /* We only support single profile for now */ 1835 device = map->stripes[0].dev; 1836 1837 free_extent_map(em); 1838 1839 return device; 1840 } 1841 1842 /** 1843 * Activate block group and underlying device zones 1844 * 1845 * @block_group: the block group to activate 1846 * 1847 * Return: true on success, false otherwise 1848 */ 1849 bool btrfs_zone_activate(struct btrfs_block_group *block_group) 1850 { 1851 struct btrfs_fs_info *fs_info = block_group->fs_info; 1852 struct btrfs_space_info *space_info = block_group->space_info; 1853 struct map_lookup *map; 1854 struct btrfs_device *device; 1855 u64 physical; 1856 bool ret; 1857 int i; 1858 1859 if (!btrfs_is_zoned(block_group->fs_info)) 1860 return true; 1861 1862 map = block_group->physical_map; 1863 1864 spin_lock(&space_info->lock); 1865 spin_lock(&block_group->lock); 1866 if (block_group->zone_is_active) { 1867 ret = true; 1868 goto out_unlock; 1869 } 1870 1871 /* No space left */ 1872 if (btrfs_zoned_bg_is_full(block_group)) { 1873 ret = false; 1874 goto out_unlock; 1875 } 1876 1877 for (i = 0; i < map->num_stripes; i++) { 1878 device = map->stripes[i].dev; 1879 physical = map->stripes[i].physical; 1880 1881 if (device->zone_info->max_active_zones == 0) 1882 continue; 1883 1884 if 
(!btrfs_dev_set_active_zone(device, physical)) { 1885 /* Cannot activate the zone */ 1886 ret = false; 1887 goto out_unlock; 1888 } 1889 } 1890 1891 /* Successfully activated all the zones */ 1892 block_group->zone_is_active = 1; 1893 space_info->active_total_bytes += block_group->length; 1894 spin_unlock(&block_group->lock); 1895 btrfs_try_granting_tickets(fs_info, space_info); 1896 spin_unlock(&space_info->lock); 1897 1898 /* For the active block group list */ 1899 btrfs_get_block_group(block_group); 1900 1901 spin_lock(&fs_info->zone_active_bgs_lock); 1902 list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs); 1903 spin_unlock(&fs_info->zone_active_bgs_lock); 1904 1905 return true; 1906 1907 out_unlock: 1908 spin_unlock(&block_group->lock); 1909 spin_unlock(&space_info->lock); 1910 return ret; 1911 } 1912 1913 static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written) 1914 { 1915 struct btrfs_fs_info *fs_info = block_group->fs_info; 1916 struct map_lookup *map; 1917 int ret = 0; 1918 int i; 1919 1920 spin_lock(&block_group->lock); 1921 if (!block_group->zone_is_active) { 1922 spin_unlock(&block_group->lock); 1923 return 0; 1924 } 1925 1926 /* Check if we have unwritten allocated space */ 1927 if ((block_group->flags & 1928 (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) && 1929 block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) { 1930 spin_unlock(&block_group->lock); 1931 return -EAGAIN; 1932 } 1933 1934 /* 1935 * If we are sure that the block group is full (= no more room left for 1936 * new allocation) and the IO for the last usable block is completed, we 1937 * don't need to wait for the other IOs. This holds because we ensure 1938 * the sequential IO submissions using the ZONE_APPEND command for data 1939 * and block_group->meta_write_pointer for metadata. 1940 */ 1941 if (!fully_written) { 1942 spin_unlock(&block_group->lock); 1943 1944 ret = btrfs_inc_block_group_ro(block_group, false); 1945 if (ret) 1946 return ret; 1947 1948 /* Ensure all writes in this block group finish */ 1949 btrfs_wait_block_group_reservations(block_group); 1950 /* No need to wait for NOCOW writers. Zoned mode does not allow that */ 1951 btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start, 1952 block_group->length); 1953 1954 spin_lock(&block_group->lock); 1955 1956 /* 1957 * Bail out if someone already deactivated the block group, or 1958 * allocated space is left in the block group. 
1959 */ 1960 if (!block_group->zone_is_active) { 1961 spin_unlock(&block_group->lock); 1962 btrfs_dec_block_group_ro(block_group); 1963 return 0; 1964 } 1965 1966 if (block_group->reserved) { 1967 spin_unlock(&block_group->lock); 1968 btrfs_dec_block_group_ro(block_group); 1969 return -EAGAIN; 1970 } 1971 } 1972 1973 block_group->zone_is_active = 0; 1974 block_group->alloc_offset = block_group->zone_capacity; 1975 block_group->free_space_ctl->free_space = 0; 1976 btrfs_clear_treelog_bg(block_group); 1977 btrfs_clear_data_reloc_bg(block_group); 1978 spin_unlock(&block_group->lock); 1979 1980 map = block_group->physical_map; 1981 for (i = 0; i < map->num_stripes; i++) { 1982 struct btrfs_device *device = map->stripes[i].dev; 1983 const u64 physical = map->stripes[i].physical; 1984 1985 if (device->zone_info->max_active_zones == 0) 1986 continue; 1987 1988 ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, 1989 physical >> SECTOR_SHIFT, 1990 device->zone_info->zone_size >> SECTOR_SHIFT, 1991 GFP_NOFS); 1992 1993 if (ret) 1994 return ret; 1995 1996 btrfs_dev_clear_active_zone(device, physical); 1997 } 1998 1999 if (!fully_written) 2000 btrfs_dec_block_group_ro(block_group); 2001 2002 spin_lock(&fs_info->zone_active_bgs_lock); 2003 ASSERT(!list_empty(&block_group->active_bg_list)); 2004 list_del_init(&block_group->active_bg_list); 2005 spin_unlock(&fs_info->zone_active_bgs_lock); 2006 2007 /* For active_bg_list */ 2008 btrfs_put_block_group(block_group); 2009 2010 clear_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags); 2011 wake_up_all(&fs_info->zone_finish_wait); 2012 2013 return 0; 2014 } 2015 2016 int btrfs_zone_finish(struct btrfs_block_group *block_group) 2017 { 2018 if (!btrfs_is_zoned(block_group->fs_info)) 2019 return 0; 2020 2021 return do_zone_finish(block_group, false); 2022 } 2023 2024 bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) 2025 { 2026 struct btrfs_fs_info *fs_info = fs_devices->fs_info; 2027 struct btrfs_device *device; 2028 bool ret = false; 2029 2030 if (!btrfs_is_zoned(fs_info)) 2031 return true; 2032 2033 /* Check if there is a device with active zones left */ 2034 mutex_lock(&fs_info->chunk_mutex); 2035 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 2036 struct btrfs_zoned_device_info *zinfo = device->zone_info; 2037 2038 if (!device->bdev) 2039 continue; 2040 2041 if (!zinfo->max_active_zones || 2042 atomic_read(&zinfo->active_zones_left)) { 2043 ret = true; 2044 break; 2045 } 2046 } 2047 mutex_unlock(&fs_info->chunk_mutex); 2048 2049 if (!ret) 2050 set_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags); 2051 2052 return ret; 2053 } 2054 2055 void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length) 2056 { 2057 struct btrfs_block_group *block_group; 2058 u64 min_alloc_bytes; 2059 2060 if (!btrfs_is_zoned(fs_info)) 2061 return; 2062 2063 block_group = btrfs_lookup_block_group(fs_info, logical); 2064 ASSERT(block_group); 2065 2066 /* No MIXED_BG on zoned btrfs. */ 2067 if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) 2068 min_alloc_bytes = fs_info->sectorsize; 2069 else 2070 min_alloc_bytes = fs_info->nodesize; 2071 2072 /* Bail out if we can allocate more data from this block group. 
        if (logical + length + min_alloc_bytes <=
            block_group->start + block_group->zone_capacity)
                goto out;

        do_zone_finish(block_group, true);

out:
        btrfs_put_block_group(block_group);
}

static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
{
        struct btrfs_block_group *bg =
                container_of(work, struct btrfs_block_group, zone_finish_work);

        wait_on_extent_buffer_writeback(bg->last_eb);
        free_extent_buffer(bg->last_eb);
        btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length);
        btrfs_put_block_group(bg);
}

void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
                                   struct extent_buffer *eb)
{
        if (!bg->seq_zone || eb->start + eb->len * 2 <= bg->start + bg->zone_capacity)
                return;

        if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) {
                btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing",
                          bg->start);
                return;
        }

        /* For the work */
        btrfs_get_block_group(bg);
        atomic_inc(&eb->refs);
        bg->last_eb = eb;
        INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn);
        queue_work(system_unbound_wq, &bg->zone_finish_work);
}

void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
{
        struct btrfs_fs_info *fs_info = bg->fs_info;

        spin_lock(&fs_info->relocation_bg_lock);
        if (fs_info->data_reloc_bg == bg->start)
                fs_info->data_reloc_bg = 0;
        spin_unlock(&fs_info->relocation_bg_lock);
}

void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
{
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        struct btrfs_device *device;

        if (!btrfs_is_zoned(fs_info))
                return;

        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                if (device->zone_info) {
                        vfree(device->zone_info->zone_cache);
                        device->zone_info->zone_cache = NULL;
                }
        }
        mutex_unlock(&fs_devices->device_list_mutex);
}

bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
{
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        struct btrfs_device *device;
        u64 used = 0;
        u64 total = 0;
        u64 factor;

        ASSERT(btrfs_is_zoned(fs_info));

        if (fs_info->bg_reclaim_threshold == 0)
                return false;

        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                if (!device->bdev)
                        continue;

                total += device->disk_total_bytes;
                used += device->bytes_used;
        }
        mutex_unlock(&fs_devices->device_list_mutex);

        factor = div64_u64(used * 100, total);
        return factor >= fs_info->bg_reclaim_threshold;
}

void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
                                       u64 length)
{
        struct btrfs_block_group *block_group;

        if (!btrfs_is_zoned(fs_info))
                return;

        block_group = btrfs_lookup_block_group(fs_info, logical);
        /* It should be called on a previous data relocation block group. */
        ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA));

        spin_lock(&block_group->lock);
        if (!block_group->zoned_data_reloc_ongoing)
                goto out;

        /* All relocation extents are written. */
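        /*
         * Allocated space ends at block_group->start + alloc_offset. When that
         * matches the end of the write that just completed, the last allocated
         * relocation extent is on disk and nothing else is in flight.
         */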
        if (block_group->start + block_group->alloc_offset == logical + length) {
                /* Now, release this block group for further allocations. */
                block_group->zoned_data_reloc_ongoing = 0;
        }

out:
        spin_unlock(&block_group->lock);
        btrfs_put_block_group(block_group);
}

int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
{
        struct btrfs_block_group *block_group;
        struct btrfs_block_group *min_bg = NULL;
        u64 min_avail = U64_MAX;
        int ret;

        spin_lock(&fs_info->zone_active_bgs_lock);
        list_for_each_entry(block_group, &fs_info->zone_active_bgs,
                            active_bg_list) {
                u64 avail;

                spin_lock(&block_group->lock);
                if (block_group->reserved ||
                    (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)) {
                        spin_unlock(&block_group->lock);
                        continue;
                }

                avail = block_group->zone_capacity - block_group->alloc_offset;
                if (min_avail > avail) {
                        if (min_bg)
                                btrfs_put_block_group(min_bg);
                        min_bg = block_group;
                        min_avail = avail;
                        btrfs_get_block_group(min_bg);
                }
                spin_unlock(&block_group->lock);
        }
        spin_unlock(&fs_info->zone_active_bgs_lock);

        if (!min_bg)
                return 0;

        ret = btrfs_zone_finish(min_bg);
        btrfs_put_block_group(min_bg);

        return ret < 0 ? ret : 1;
}

int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
                                struct btrfs_space_info *space_info,
                                bool do_finish)
{
        struct btrfs_block_group *bg;
        int index;

        if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA))
                return 0;

        /* No more block groups to activate */
        if (space_info->active_total_bytes == space_info->total_bytes)
                return 0;

        for (;;) {
                int ret;
                bool need_finish = false;

                down_read(&space_info->groups_sem);
                for (index = 0; index < BTRFS_NR_RAID_TYPES; index++) {
                        list_for_each_entry(bg, &space_info->block_groups[index],
                                            list) {
                                if (!spin_trylock(&bg->lock))
                                        continue;
                                if (btrfs_zoned_bg_is_full(bg) || bg->zone_is_active) {
                                        spin_unlock(&bg->lock);
                                        continue;
                                }
                                spin_unlock(&bg->lock);

                                if (btrfs_zone_activate(bg)) {
                                        up_read(&space_info->groups_sem);
                                        return 1;
                                }

                                need_finish = true;
                        }
                }
                up_read(&space_info->groups_sem);

                if (!do_finish || !need_finish)
                        break;

                ret = btrfs_zone_finish_one_bg(fs_info);
                if (ret == 0)
                        break;
                if (ret < 0)
                        return ret;
        }

        return 0;
}
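/*
 * Note on the retry loop above: btrfs_zoned_activate_one_bg() returns 1 once a
 * block group has been activated, 0 when nothing is left to activate (or
 * finishing made no further progress), and a negative errno on failure. When
 * @do_finish is set and every candidate block group failed to activate, it
 * calls btrfs_zone_finish_one_bg() to finish the active block group with the
 * least remaining capacity, freeing an active zone for the next attempt.
 */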