// SPDX-License-Identifier: GPL-2.0

#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/sched/mm.h>
#include <linux/atomic.h>
#include "ctree.h"
#include "volumes.h"
#include "zoned.h"
#include "rcu-string.h"
#include "disk-io.h"
#include "block-group.h"
#include "transaction.h"
#include "dev-replace.h"
#include "space-info.h"

/* Maximum number of zones to report per blkdev_report_zones() call */
#define BTRFS_REPORT_NR_ZONES		4096
/* Invalid allocation pointer value for missing devices */
#define WP_MISSING_DEV			((u64)-1)
/* Pseudo write pointer value for conventional zone */
#define WP_CONVENTIONAL			((u64)-2)

/*
 * Location of the first zone of superblock logging zone pairs.
 *
 * - primary superblock: 0B (zone 0)
 * - first copy: 512G (zone starting at that offset)
 * - second copy: 4T (zone starting at that offset)
 */
#define BTRFS_SB_LOG_PRIMARY_OFFSET	(0ULL)
#define BTRFS_SB_LOG_FIRST_OFFSET	(512ULL * SZ_1G)
#define BTRFS_SB_LOG_SECOND_OFFSET	(4096ULL * SZ_1G)

#define BTRFS_SB_LOG_FIRST_SHIFT	const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
#define BTRFS_SB_LOG_SECOND_SHIFT	const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET)

/* Number of superblock log zones */
#define BTRFS_NR_SB_LOG_ZONES		2

/*
 * Minimum number of active zones we need:
 *
 * - BTRFS_SUPER_MIRROR_MAX zones for superblock mirrors
 * - 3 zones to ensure at least one zone per SYSTEM, META and DATA block group
 * - 1 zone for tree-log dedicated block group
 * - 1 zone for relocation
 */
#define BTRFS_MIN_ACTIVE_ZONES		(BTRFS_SUPER_MIRROR_MAX + 5)
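/*
 * Example (illustrative, not part of the original source): with
 * BTRFS_SUPER_MIRROR_MAX == 3, the minimum works out to 3 + 5 = 8 active
 * zones; devices advertising fewer active zones are rejected in
 * btrfs_get_dev_zone_info() below.
 */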
/*
 * Maximum supported zone size. Currently, SMR disks have a zone size of
 * 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not
 * expect the zone size to become larger than 8GiB in the near future.
 */
#define BTRFS_MAX_ZONE_SIZE		SZ_8G

#define SUPER_INFO_SECTORS	((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT)

static inline bool sb_zone_is_full(const struct blk_zone *zone)
{
	return (zone->cond == BLK_ZONE_COND_FULL) ||
		(zone->wp + SUPER_INFO_SECTORS > zone->start + zone->capacity);
}

static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
{
	struct blk_zone *zones = data;

	memcpy(&zones[idx], zone, sizeof(*zone));

	return 0;
}

static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
			    u64 *wp_ret)
{
	bool empty[BTRFS_NR_SB_LOG_ZONES];
	bool full[BTRFS_NR_SB_LOG_ZONES];
	sector_t sector;
	int i;

	for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
		ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL);
		empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY);
		full[i] = sb_zone_is_full(&zones[i]);
	}

	/*
	 * Possible states of log buffer zones
	 *
	 *           Empty[0]  In use[0]  Full[0]
	 * Empty[1]         *          x        0
	 * In use[1]        0          x        0
	 * Full[1]          1          1        C
	 *
	 * Log position:
	 *   *: Special case, no superblock is written
	 *   0: Use write pointer of zones[0]
	 *   1: Use write pointer of zones[1]
	 *   C: Compare super blocks from zones[0] and zones[1], use the latest
	 *      one determined by generation
	 *   x: Invalid state
	 */

	if (empty[0] && empty[1]) {
		/* Special case to distinguish no superblock to read */
		*wp_ret = zones[0].start << SECTOR_SHIFT;
		return -ENOENT;
	} else if (full[0] && full[1]) {
		/* Compare two super blocks */
		struct address_space *mapping = bdev->bd_inode->i_mapping;
		struct page *page[BTRFS_NR_SB_LOG_ZONES];
		struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
		int i;

		for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
			u64 bytenr;

			bytenr = ((zones[i].start + zones[i].len)
				   << SECTOR_SHIFT) - BTRFS_SUPER_INFO_SIZE;

			page[i] = read_cache_page_gfp(mapping,
					bytenr >> PAGE_SHIFT, GFP_NOFS);
			if (IS_ERR(page[i])) {
				if (i == 1)
					btrfs_release_disk_super(super[0]);
				return PTR_ERR(page[i]);
			}
			super[i] = page_address(page[i]);
		}

		if (super[0]->generation > super[1]->generation)
			sector = zones[1].start;
		else
			sector = zones[0].start;

		for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
			btrfs_release_disk_super(super[i]);
	} else if (!full[0] && (empty[1] || full[1])) {
		sector = zones[0].wp;
	} else if (full[0]) {
		sector = zones[1].wp;
	} else {
		return -EUCLEAN;
	}
	*wp_ret = sector << SECTOR_SHIFT;
	return 0;
}

/*
 * Get the first zone number of the superblock mirror
 */
static inline u32 sb_zone_number(int shift, int mirror)
{
	u64 zone;

	ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
	switch (mirror) {
	case 0: zone = 0; break;
	case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break;
	case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break;
	}

	ASSERT(zone <= U32_MAX);

	return (u32)zone;
}

static inline sector_t zone_start_sector(u32 zone_number,
					 struct block_device *bdev)
{
	return (sector_t)zone_number << ilog2(bdev_zone_sectors(bdev));
}

static inline u64 zone_start_physical(u32 zone_number,
				      struct btrfs_zoned_device_info *zone_info)
{
	return (u64)zone_number << zone_info->zone_size_shift;
}
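/*
 * Example (illustrative): for a 256MiB zone size the shift is 28, so mirror 0
 * lives in zone 0, mirror 1 in zone 1 << (39 - 28) = 2048 (512GiB / 256MiB),
 * and mirror 2 in zone 1 << (42 - 28) = 16384 (4TiB / 256MiB).
 */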
/*
 * Emulate blkdev_report_zones() for a non-zoned device. It slices up the block
 * device into static sized chunks and fakes a conventional zone on each of
 * them.
 */
static int emulate_report_zones(struct btrfs_device *device, u64 pos,
				struct blk_zone *zones, unsigned int nr_zones)
{
	const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT;
	sector_t bdev_size = bdev_nr_sectors(device->bdev);
	unsigned int i;

	pos >>= SECTOR_SHIFT;
	for (i = 0; i < nr_zones; i++) {
		zones[i].start = i * zone_sectors + pos;
		zones[i].len = zone_sectors;
		zones[i].capacity = zone_sectors;
		zones[i].wp = zones[i].start + zone_sectors;
		zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL;
		zones[i].cond = BLK_ZONE_COND_NOT_WP;

		if (zones[i].wp >= bdev_size) {
			i++;
			break;
		}
	}

	return i;
}

static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
			       struct blk_zone *zones, unsigned int *nr_zones)
{
	int ret;

	if (!*nr_zones)
		return 0;

	if (!bdev_is_zoned(device->bdev)) {
		ret = emulate_report_zones(device, pos, zones, *nr_zones);
		*nr_zones = ret;
		return 0;
	}

	ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
				  copy_zone_info_cb, zones);
	if (ret < 0) {
		btrfs_err_in_rcu(device->fs_info,
				 "zoned: failed to read zone %llu on %s (devid %llu)",
				 pos, rcu_str_deref(device->name),
				 device->devid);
		return ret;
	}
	*nr_zones = ret;
	if (!ret)
		return -EIO;

	return 0;
}

/* The emulated zone size is determined from the size of the first device extent */
static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
{
	struct btrfs_path *path;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_dev_extent *dext;
	int ret = 0;

	key.objectid = 1;
	key.type = BTRFS_DEV_EXTENT_KEY;
	key.offset = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
		ret = btrfs_next_leaf(root, path);
		if (ret < 0)
			goto out;
		/* No dev extents at all? Not good */
		if (ret > 0) {
			ret = -EUCLEAN;
			goto out;
		}
	}

	leaf = path->nodes[0];
	dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
	fs_info->zone_size = btrfs_dev_extent_length(leaf, dext);
	ret = 0;

out:
	btrfs_free_path(path);

	return ret;
}
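/*
 * Example (illustrative): if the first device extent found above is 256MiB
 * long, fs_info->zone_size becomes 256MiB and emulate_report_zones() will
 * report conventional zones of that size on the regular device.
 */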
int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	int ret = 0;

	/* fs_info->zone_size might not be set yet. Use the incompat flag here. */
	if (!btrfs_fs_incompat(fs_info, ZONED))
		return 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		/* We can skip reading of zone info for missing devices */
		if (!device->bdev)
			continue;

		ret = btrfs_get_dev_zone_info(device);
		if (ret)
			break;
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}

int btrfs_get_dev_zone_info(struct btrfs_device *device)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_zoned_device_info *zone_info = NULL;
	struct block_device *bdev = device->bdev;
	struct request_queue *queue = bdev_get_queue(bdev);
	unsigned int max_active_zones;
	unsigned int nactive;
	sector_t nr_sectors;
	sector_t sector = 0;
	struct blk_zone *zones = NULL;
	unsigned int i, nreported = 0, nr_zones;
	sector_t zone_sectors;
	char *model, *emulated;
	int ret;

	/*
	 * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not
	 * yet be set.
	 */
	if (!btrfs_fs_incompat(fs_info, ZONED))
		return 0;

	if (device->zone_info)
		return 0;

	zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL);
	if (!zone_info)
		return -ENOMEM;

	if (!bdev_is_zoned(bdev)) {
		if (!fs_info->zone_size) {
			ret = calculate_emulated_zone_size(fs_info);
			if (ret)
				goto out;
		}

		ASSERT(fs_info->zone_size);
		zone_sectors = fs_info->zone_size >> SECTOR_SHIFT;
	} else {
		zone_sectors = bdev_zone_sectors(bdev);
	}

	/* Check if it's power of 2 (see is_power_of_2) */
	ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0);
	zone_info->zone_size = zone_sectors << SECTOR_SHIFT;

	/* We reject devices with a zone size larger than 8GB */
	if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) {
		btrfs_err_in_rcu(fs_info,
		"zoned: %s: zone size %llu larger than supported maximum %llu",
				 rcu_str_deref(device->name),
				 zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
		ret = -EINVAL;
		goto out;
	}

	nr_sectors = bdev_nr_sectors(bdev);
	zone_info->zone_size_shift = ilog2(zone_info->zone_size);
	zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
	if (!IS_ALIGNED(nr_sectors, zone_sectors))
		zone_info->nr_zones++;

	max_active_zones = queue_max_active_zones(queue);
	if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
		btrfs_err_in_rcu(fs_info,
"zoned: %s: max active zones %u is too small, need at least %u active zones",
				 rcu_str_deref(device->name), max_active_zones,
				 BTRFS_MIN_ACTIVE_ZONES);
		ret = -EINVAL;
		goto out;
	}
	zone_info->max_active_zones = max_active_zones;

	zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->seq_zones) {
		ret = -ENOMEM;
		goto out;
	}

	zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->empty_zones) {
		ret = -ENOMEM;
		goto out;
	}

	zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->active_zones) {
		ret = -ENOMEM;
		goto out;
	}

	zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
	if (!zones) {
		ret = -ENOMEM;
		goto out;
	}

	/* Get zone types */
	nactive = 0;
	while (sector < nr_sectors) {
		nr_zones = BTRFS_REPORT_NR_ZONES;
		ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
					  &nr_zones);
		if (ret)
			goto out;

		for (i = 0; i < nr_zones; i++) {
			if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
				__set_bit(nreported, zone_info->seq_zones);
			switch (zones[i].cond) {
			case BLK_ZONE_COND_EMPTY:
				__set_bit(nreported, zone_info->empty_zones);
				break;
			case BLK_ZONE_COND_IMP_OPEN:
			case BLK_ZONE_COND_EXP_OPEN:
			case BLK_ZONE_COND_CLOSED:
				__set_bit(nreported, zone_info->active_zones);
				nactive++;
				break;
			}
			nreported++;
		}
		sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
	}

	if (nreported != zone_info->nr_zones) {
		btrfs_err_in_rcu(device->fs_info,
				 "inconsistent number of zones on %s (%u/%u)",
				 rcu_str_deref(device->name), nreported,
				 zone_info->nr_zones);
		ret = -EIO;
		goto out;
	}

	if (max_active_zones) {
		if (nactive > max_active_zones) {
			btrfs_err_in_rcu(device->fs_info,
			"zoned: %u active zones on %s exceeds max_active_zones %u",
					 nactive, rcu_str_deref(device->name),
					 max_active_zones);
			ret = -EIO;
			goto out;
		}
		atomic_set(&zone_info->active_zones_left,
			   max_active_zones - nactive);
	}

	/* Validate superblock log */
	nr_zones = BTRFS_NR_SB_LOG_ZONES;
	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		u32 sb_zone;
		u64 sb_wp;
		int sb_pos = BTRFS_NR_SB_LOG_ZONES * i;

		sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
		if (sb_zone + 1 >= zone_info->nr_zones)
			continue;

		ret = btrfs_get_dev_zones(device,
					  zone_start_physical(sb_zone, zone_info),
					  &zone_info->sb_zones[sb_pos],
					  &nr_zones);
		if (ret)
			goto out;

		if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
			btrfs_err_in_rcu(device->fs_info,
	"zoned: failed to read super block log zone info at devid %llu zone %u",
					 device->devid, sb_zone);
			ret = -EUCLEAN;
			goto out;
		}

		/*
		 * If zones[0] is conventional, always use the beginning of the
		 * zone to record superblock. No need to validate in that case.
		 */
		if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type ==
		    BLK_ZONE_TYPE_CONVENTIONAL)
			continue;

		ret = sb_write_pointer(device->bdev,
				       &zone_info->sb_zones[sb_pos], &sb_wp);
		if (ret != -ENOENT && ret) {
			btrfs_err_in_rcu(device->fs_info,
			"zoned: super block log zone corrupted devid %llu zone %u",
					 device->devid, sb_zone);
			ret = -EUCLEAN;
			goto out;
		}
	}

	kfree(zones);

	device->zone_info = zone_info;

	switch (bdev_zoned_model(bdev)) {
	case BLK_ZONED_HM:
		model = "host-managed zoned";
		emulated = "";
		break;
	case BLK_ZONED_HA:
		model = "host-aware zoned";
		emulated = "";
		break;
	case BLK_ZONED_NONE:
		model = "regular";
		emulated = "emulated ";
		break;
	default:
		/* Just in case */
		btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s",
				 bdev_zoned_model(bdev),
				 rcu_str_deref(device->name));
		ret = -EOPNOTSUPP;
		goto out_free_zone_info;
	}

	btrfs_info_in_rcu(fs_info,
		"%s block device %s, %u %szones of %llu bytes",
		model, rcu_str_deref(device->name), zone_info->nr_zones,
		emulated, zone_info->zone_size);

	return 0;

out:
	kfree(zones);
out_free_zone_info:
	bitmap_free(zone_info->active_zones);
	bitmap_free(zone_info->empty_zones);
	bitmap_free(zone_info->seq_zones);
	kfree(zone_info);
	device->zone_info = NULL;

	return ret;
}

void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
{
	struct btrfs_zoned_device_info *zone_info = device->zone_info;

	if (!zone_info)
		return;

	bitmap_free(zone_info->active_zones);
	bitmap_free(zone_info->seq_zones);
	bitmap_free(zone_info->empty_zones);
	kfree(zone_info);
	device->zone_info = NULL;
}

int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
		       struct blk_zone *zone)
{
	unsigned int nr_zones = 1;
	int ret;

	ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones);
	if (ret != 0 || !nr_zones)
		return ret ? ret : -EIO;

	return 0;
}

int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	u64 zoned_devices = 0;
	u64 nr_devices = 0;
	u64 zone_size = 0;
	const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED);
	int ret = 0;

	/* Count zoned devices */
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		enum blk_zoned_model model;

		if (!device->bdev)
			continue;

		model = bdev_zoned_model(device->bdev);
		/*
		 * A host-managed zoned device must be used as a zoned device.
		 * A host-aware zoned device and a non-zoned device can be
		 * treated as a zoned device, if the ZONED flag is enabled in
		 * the superblock.
		 */
		if (model == BLK_ZONED_HM ||
		    (model == BLK_ZONED_HA && incompat_zoned) ||
		    (model == BLK_ZONED_NONE && incompat_zoned)) {
			struct btrfs_zoned_device_info *zone_info =
				device->zone_info;

			zoned_devices++;
			if (!zone_size) {
				zone_size = zone_info->zone_size;
			} else if (zone_info->zone_size != zone_size) {
				btrfs_err(fs_info,
		"zoned: unequal block device zone sizes: have %llu found %llu",
					  device->zone_info->zone_size,
					  zone_size);
				ret = -EINVAL;
				goto out;
			}
		}
		nr_devices++;
	}

	if (!zoned_devices && !incompat_zoned)
		goto out;

	if (!zoned_devices && incompat_zoned) {
		/* No zoned block device found on ZONED filesystem */
		btrfs_err(fs_info,
			  "zoned: no zoned devices found on a zoned filesystem");
		ret = -EINVAL;
		goto out;
	}

	if (zoned_devices && !incompat_zoned) {
		btrfs_err(fs_info,
			  "zoned: mode not enabled but zoned device found");
		ret = -EINVAL;
		goto out;
	}

	if (zoned_devices != nr_devices) {
		btrfs_err(fs_info,
			  "zoned: cannot mix zoned and regular devices");
		ret = -EINVAL;
		goto out;
	}

	/*
	 * stripe_size is always aligned to BTRFS_STRIPE_LEN in
	 * btrfs_create_chunk(). Since we want stripe_len == zone_size,
	 * check the alignment here.
	 */
	if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
		btrfs_err(fs_info,
			  "zoned: zone size %llu not aligned to stripe %u",
			  zone_size, BTRFS_STRIPE_LEN);
		ret = -EINVAL;
		goto out;
	}

	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		btrfs_err(fs_info, "zoned: mixed block groups not supported");
		ret = -EINVAL;
		goto out;
	}

	fs_info->zone_size = zone_size;
	fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;

	/*
	 * Check mount options here, because we might change fs_info->zoned
	 * from fs_info->zone_size.
	 */
	ret = btrfs_check_mountopts_zoned(fs_info);
	if (ret)
		goto out;

	btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
out:
	return ret;
}
692 */ 693 if (btrfs_test_opt(info, SPACE_CACHE)) { 694 btrfs_err(info, "zoned: space cache v1 is not supported"); 695 return -EINVAL; 696 } 697 698 if (btrfs_test_opt(info, NODATACOW)) { 699 btrfs_err(info, "zoned: NODATACOW not supported"); 700 return -EINVAL; 701 } 702 703 return 0; 704 } 705 706 static int sb_log_location(struct block_device *bdev, struct blk_zone *zones, 707 int rw, u64 *bytenr_ret) 708 { 709 u64 wp; 710 int ret; 711 712 if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) { 713 *bytenr_ret = zones[0].start << SECTOR_SHIFT; 714 return 0; 715 } 716 717 ret = sb_write_pointer(bdev, zones, &wp); 718 if (ret != -ENOENT && ret < 0) 719 return ret; 720 721 if (rw == WRITE) { 722 struct blk_zone *reset = NULL; 723 724 if (wp == zones[0].start << SECTOR_SHIFT) 725 reset = &zones[0]; 726 else if (wp == zones[1].start << SECTOR_SHIFT) 727 reset = &zones[1]; 728 729 if (reset && reset->cond != BLK_ZONE_COND_EMPTY) { 730 ASSERT(sb_zone_is_full(reset)); 731 732 ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, 733 reset->start, reset->len, 734 GFP_NOFS); 735 if (ret) 736 return ret; 737 738 reset->cond = BLK_ZONE_COND_EMPTY; 739 reset->wp = reset->start; 740 } 741 } else if (ret != -ENOENT) { 742 /* 743 * For READ, we want the previous one. Move write pointer to 744 * the end of a zone, if it is at the head of a zone. 745 */ 746 u64 zone_end = 0; 747 748 if (wp == zones[0].start << SECTOR_SHIFT) 749 zone_end = zones[1].start + zones[1].capacity; 750 else if (wp == zones[1].start << SECTOR_SHIFT) 751 zone_end = zones[0].start + zones[0].capacity; 752 if (zone_end) 753 wp = ALIGN_DOWN(zone_end << SECTOR_SHIFT, 754 BTRFS_SUPER_INFO_SIZE); 755 756 wp -= BTRFS_SUPER_INFO_SIZE; 757 } 758 759 *bytenr_ret = wp; 760 return 0; 761 762 } 763 764 int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, 765 u64 *bytenr_ret) 766 { 767 struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES]; 768 sector_t zone_sectors; 769 u32 sb_zone; 770 int ret; 771 u8 zone_sectors_shift; 772 sector_t nr_sectors; 773 u32 nr_zones; 774 775 if (!bdev_is_zoned(bdev)) { 776 *bytenr_ret = btrfs_sb_offset(mirror); 777 return 0; 778 } 779 780 ASSERT(rw == READ || rw == WRITE); 781 782 zone_sectors = bdev_zone_sectors(bdev); 783 if (!is_power_of_2(zone_sectors)) 784 return -EINVAL; 785 zone_sectors_shift = ilog2(zone_sectors); 786 nr_sectors = bdev_nr_sectors(bdev); 787 nr_zones = nr_sectors >> zone_sectors_shift; 788 789 sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror); 790 if (sb_zone + 1 >= nr_zones) 791 return -ENOENT; 792 793 ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev), 794 BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb, 795 zones); 796 if (ret < 0) 797 return ret; 798 if (ret != BTRFS_NR_SB_LOG_ZONES) 799 return -EIO; 800 801 return sb_log_location(bdev, zones, rw, bytenr_ret); 802 } 803 804 int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw, 805 u64 *bytenr_ret) 806 { 807 struct btrfs_zoned_device_info *zinfo = device->zone_info; 808 u32 zone_num; 809 810 /* 811 * For a zoned filesystem on a non-zoned block device, use the same 812 * super block locations as regular filesystem. Doing so, the super 813 * block can always be retrieved and the zoned flag of the volume 814 * detected from the super block information. 
815 */ 816 if (!bdev_is_zoned(device->bdev)) { 817 *bytenr_ret = btrfs_sb_offset(mirror); 818 return 0; 819 } 820 821 zone_num = sb_zone_number(zinfo->zone_size_shift, mirror); 822 if (zone_num + 1 >= zinfo->nr_zones) 823 return -ENOENT; 824 825 return sb_log_location(device->bdev, 826 &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror], 827 rw, bytenr_ret); 828 } 829 830 static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo, 831 int mirror) 832 { 833 u32 zone_num; 834 835 if (!zinfo) 836 return false; 837 838 zone_num = sb_zone_number(zinfo->zone_size_shift, mirror); 839 if (zone_num + 1 >= zinfo->nr_zones) 840 return false; 841 842 if (!test_bit(zone_num, zinfo->seq_zones)) 843 return false; 844 845 return true; 846 } 847 848 int btrfs_advance_sb_log(struct btrfs_device *device, int mirror) 849 { 850 struct btrfs_zoned_device_info *zinfo = device->zone_info; 851 struct blk_zone *zone; 852 int i; 853 854 if (!is_sb_log_zone(zinfo, mirror)) 855 return 0; 856 857 zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror]; 858 for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) { 859 /* Advance the next zone */ 860 if (zone->cond == BLK_ZONE_COND_FULL) { 861 zone++; 862 continue; 863 } 864 865 if (zone->cond == BLK_ZONE_COND_EMPTY) 866 zone->cond = BLK_ZONE_COND_IMP_OPEN; 867 868 zone->wp += SUPER_INFO_SECTORS; 869 870 if (sb_zone_is_full(zone)) { 871 /* 872 * No room left to write new superblock. Since 873 * superblock is written with REQ_SYNC, it is safe to 874 * finish the zone now. 875 * 876 * If the write pointer is exactly at the capacity, 877 * explicit ZONE_FINISH is not necessary. 878 */ 879 if (zone->wp != zone->start + zone->capacity) { 880 int ret; 881 882 ret = blkdev_zone_mgmt(device->bdev, 883 REQ_OP_ZONE_FINISH, zone->start, 884 zone->len, GFP_NOFS); 885 if (ret) 886 return ret; 887 } 888 889 zone->wp = zone->start + zone->len; 890 zone->cond = BLK_ZONE_COND_FULL; 891 } 892 return 0; 893 } 894 895 /* All the zones are FULL. Should not reach here. */ 896 ASSERT(0); 897 return -EIO; 898 } 899 900 int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) 901 { 902 sector_t zone_sectors; 903 sector_t nr_sectors; 904 u8 zone_sectors_shift; 905 u32 sb_zone; 906 u32 nr_zones; 907 908 zone_sectors = bdev_zone_sectors(bdev); 909 zone_sectors_shift = ilog2(zone_sectors); 910 nr_sectors = bdev_nr_sectors(bdev); 911 nr_zones = nr_sectors >> zone_sectors_shift; 912 913 sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror); 914 if (sb_zone + 1 >= nr_zones) 915 return -ENOENT; 916 917 return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, 918 zone_start_sector(sb_zone, bdev), 919 zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS); 920 } 921 922 /** 923 * btrfs_find_allocatable_zones - find allocatable zones within a given region 924 * 925 * @device: the device to allocate a region on 926 * @hole_start: the position of the hole to allocate the region 927 * @num_bytes: size of wanted region 928 * @hole_end: the end of the hole 929 * @return: position of allocatable zones 930 * 931 * Allocatable region should not contain any superblock locations. 
932 */ 933 u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start, 934 u64 hole_end, u64 num_bytes) 935 { 936 struct btrfs_zoned_device_info *zinfo = device->zone_info; 937 const u8 shift = zinfo->zone_size_shift; 938 u64 nzones = num_bytes >> shift; 939 u64 pos = hole_start; 940 u64 begin, end; 941 bool have_sb; 942 int i; 943 944 ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size)); 945 ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size)); 946 947 while (pos < hole_end) { 948 begin = pos >> shift; 949 end = begin + nzones; 950 951 if (end > zinfo->nr_zones) 952 return hole_end; 953 954 /* Check if zones in the region are all empty */ 955 if (btrfs_dev_is_sequential(device, pos) && 956 find_next_zero_bit(zinfo->empty_zones, end, begin) != end) { 957 pos += zinfo->zone_size; 958 continue; 959 } 960 961 have_sb = false; 962 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 963 u32 sb_zone; 964 u64 sb_pos; 965 966 sb_zone = sb_zone_number(shift, i); 967 if (!(end <= sb_zone || 968 sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) { 969 have_sb = true; 970 pos = zone_start_physical( 971 sb_zone + BTRFS_NR_SB_LOG_ZONES, zinfo); 972 break; 973 } 974 975 /* We also need to exclude regular superblock positions */ 976 sb_pos = btrfs_sb_offset(i); 977 if (!(pos + num_bytes <= sb_pos || 978 sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) { 979 have_sb = true; 980 pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE, 981 zinfo->zone_size); 982 break; 983 } 984 } 985 if (!have_sb) 986 break; 987 } 988 989 return pos; 990 } 991 992 static bool btrfs_dev_set_active_zone(struct btrfs_device *device, u64 pos) 993 { 994 struct btrfs_zoned_device_info *zone_info = device->zone_info; 995 unsigned int zno = (pos >> zone_info->zone_size_shift); 996 997 /* We can use any number of zones */ 998 if (zone_info->max_active_zones == 0) 999 return true; 1000 1001 if (!test_bit(zno, zone_info->active_zones)) { 1002 /* Active zone left? 
static bool btrfs_dev_set_active_zone(struct btrfs_device *device, u64 pos)
{
	struct btrfs_zoned_device_info *zone_info = device->zone_info;
	unsigned int zno = (pos >> zone_info->zone_size_shift);

	/* We can use any number of zones */
	if (zone_info->max_active_zones == 0)
		return true;

	if (!test_bit(zno, zone_info->active_zones)) {
		/* Active zone left? */
		if (atomic_dec_if_positive(&zone_info->active_zones_left) < 0)
			return false;
		if (test_and_set_bit(zno, zone_info->active_zones)) {
			/* Someone already set the bit */
			atomic_inc(&zone_info->active_zones_left);
		}
	}

	return true;
}

static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos)
{
	struct btrfs_zoned_device_info *zone_info = device->zone_info;
	unsigned int zno = (pos >> zone_info->zone_size_shift);

	/* We can use any number of zones */
	if (zone_info->max_active_zones == 0)
		return;

	if (test_and_clear_bit(zno, zone_info->active_zones))
		atomic_inc(&zone_info->active_zones_left);
}

int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
			    u64 length, u64 *bytes)
{
	int ret;

	*bytes = 0;
	ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
			       physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
			       GFP_NOFS);
	if (ret)
		return ret;

	*bytes = length;
	while (length) {
		btrfs_dev_set_zone_empty(device, physical);
		btrfs_dev_clear_active_zone(device, physical);
		physical += device->zone_info->zone_size;
		length -= device->zone_info->zone_size;
	}

	return 0;
}

int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	const u8 shift = zinfo->zone_size_shift;
	unsigned long begin = start >> shift;
	unsigned long end = (start + size) >> shift;
	u64 pos;
	int ret;

	ASSERT(IS_ALIGNED(start, zinfo->zone_size));
	ASSERT(IS_ALIGNED(size, zinfo->zone_size));

	if (end > zinfo->nr_zones)
		return -ERANGE;

	/* All the zones are conventional */
	if (find_next_bit(zinfo->seq_zones, end, begin) == end)
		return 0;

	/* All the zones are sequential and empty */
	if (find_next_zero_bit(zinfo->seq_zones, end, begin) == end &&
	    find_next_zero_bit(zinfo->empty_zones, end, begin) == end)
		return 0;

	for (pos = start; pos < start + size; pos += zinfo->zone_size) {
		u64 reset_bytes;

		if (!btrfs_dev_is_sequential(device, pos) ||
		    btrfs_dev_is_empty_zone(device, pos))
			continue;

		/* Free regions should be empty */
		btrfs_warn_in_rcu(
			device->fs_info,
		"zoned: resetting device %s (devid %llu) zone %llu for allocation",
			rcu_str_deref(device->name), device->devid, pos >> shift);
		WARN_ON_ONCE(1);

		ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
					      &reset_bytes);
		if (ret)
			return ret;
	}

	return 0;
}
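/*
 * Example (illustrative): resetting a 1GiB region on a device with 256MiB
 * zones rewinds the write pointers of four consecutive zones with a single
 * REQ_OP_ZONE_RESET range, then marks each of them empty and inactive in the
 * per-device bitmaps.
 */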
1102 */ 1103 static int calculate_alloc_pointer(struct btrfs_block_group *cache, 1104 u64 *offset_ret) 1105 { 1106 struct btrfs_fs_info *fs_info = cache->fs_info; 1107 struct btrfs_root *root = fs_info->extent_root; 1108 struct btrfs_path *path; 1109 struct btrfs_key key; 1110 struct btrfs_key found_key; 1111 int ret; 1112 u64 length; 1113 1114 path = btrfs_alloc_path(); 1115 if (!path) 1116 return -ENOMEM; 1117 1118 key.objectid = cache->start + cache->length; 1119 key.type = 0; 1120 key.offset = 0; 1121 1122 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1123 /* We should not find the exact match */ 1124 if (!ret) 1125 ret = -EUCLEAN; 1126 if (ret < 0) 1127 goto out; 1128 1129 ret = btrfs_previous_extent_item(root, path, cache->start); 1130 if (ret) { 1131 if (ret == 1) { 1132 ret = 0; 1133 *offset_ret = 0; 1134 } 1135 goto out; 1136 } 1137 1138 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 1139 1140 if (found_key.type == BTRFS_EXTENT_ITEM_KEY) 1141 length = found_key.offset; 1142 else 1143 length = fs_info->nodesize; 1144 1145 if (!(found_key.objectid >= cache->start && 1146 found_key.objectid + length <= cache->start + cache->length)) { 1147 ret = -EUCLEAN; 1148 goto out; 1149 } 1150 *offset_ret = found_key.objectid + length - cache->start; 1151 ret = 0; 1152 1153 out: 1154 btrfs_free_path(path); 1155 return ret; 1156 } 1157 1158 int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) 1159 { 1160 struct btrfs_fs_info *fs_info = cache->fs_info; 1161 struct extent_map_tree *em_tree = &fs_info->mapping_tree; 1162 struct extent_map *em; 1163 struct map_lookup *map; 1164 struct btrfs_device *device; 1165 u64 logical = cache->start; 1166 u64 length = cache->length; 1167 u64 physical = 0; 1168 int ret; 1169 int i; 1170 unsigned int nofs_flag; 1171 u64 *alloc_offsets = NULL; 1172 u64 *caps = NULL; 1173 unsigned long *active = NULL; 1174 u64 last_alloc = 0; 1175 u32 num_sequential = 0, num_conventional = 0; 1176 1177 if (!btrfs_is_zoned(fs_info)) 1178 return 0; 1179 1180 /* Sanity check */ 1181 if (!IS_ALIGNED(length, fs_info->zone_size)) { 1182 btrfs_err(fs_info, 1183 "zoned: block group %llu len %llu unaligned to zone size %llu", 1184 logical, length, fs_info->zone_size); 1185 return -EIO; 1186 } 1187 1188 /* Get the chunk mapping */ 1189 read_lock(&em_tree->lock); 1190 em = lookup_extent_mapping(em_tree, logical, length); 1191 read_unlock(&em_tree->lock); 1192 1193 if (!em) 1194 return -EINVAL; 1195 1196 map = em->map_lookup; 1197 1198 cache->physical_map = kmemdup(map, map_lookup_size(map->num_stripes), GFP_NOFS); 1199 if (!cache->physical_map) { 1200 ret = -ENOMEM; 1201 goto out; 1202 } 1203 1204 alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS); 1205 if (!alloc_offsets) { 1206 ret = -ENOMEM; 1207 goto out; 1208 } 1209 1210 caps = kcalloc(map->num_stripes, sizeof(*caps), GFP_NOFS); 1211 if (!caps) { 1212 ret = -ENOMEM; 1213 goto out; 1214 } 1215 1216 active = bitmap_zalloc(map->num_stripes, GFP_NOFS); 1217 if (!active) { 1218 ret = -ENOMEM; 1219 goto out; 1220 } 1221 1222 for (i = 0; i < map->num_stripes; i++) { 1223 bool is_sequential; 1224 struct blk_zone zone; 1225 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 1226 int dev_replace_is_ongoing = 0; 1227 1228 device = map->stripes[i].dev; 1229 physical = map->stripes[i].physical; 1230 1231 if (device->bdev == NULL) { 1232 alloc_offsets[i] = WP_MISSING_DEV; 1233 continue; 1234 } 1235 1236 is_sequential = btrfs_dev_is_sequential(device, physical); 
1237 if (is_sequential) 1238 num_sequential++; 1239 else 1240 num_conventional++; 1241 1242 if (!is_sequential) { 1243 alloc_offsets[i] = WP_CONVENTIONAL; 1244 continue; 1245 } 1246 1247 /* 1248 * This zone will be used for allocation, so mark this zone 1249 * non-empty. 1250 */ 1251 btrfs_dev_clear_zone_empty(device, physical); 1252 1253 down_read(&dev_replace->rwsem); 1254 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 1255 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) 1256 btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical); 1257 up_read(&dev_replace->rwsem); 1258 1259 /* 1260 * The group is mapped to a sequential zone. Get the zone write 1261 * pointer to determine the allocation offset within the zone. 1262 */ 1263 WARN_ON(!IS_ALIGNED(physical, fs_info->zone_size)); 1264 nofs_flag = memalloc_nofs_save(); 1265 ret = btrfs_get_dev_zone(device, physical, &zone); 1266 memalloc_nofs_restore(nofs_flag); 1267 if (ret == -EIO || ret == -EOPNOTSUPP) { 1268 ret = 0; 1269 alloc_offsets[i] = WP_MISSING_DEV; 1270 continue; 1271 } else if (ret) { 1272 goto out; 1273 } 1274 1275 if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) { 1276 btrfs_err_in_rcu(fs_info, 1277 "zoned: unexpected conventional zone %llu on device %s (devid %llu)", 1278 zone.start << SECTOR_SHIFT, 1279 rcu_str_deref(device->name), device->devid); 1280 ret = -EIO; 1281 goto out; 1282 } 1283 1284 caps[i] = (zone.capacity << SECTOR_SHIFT); 1285 1286 switch (zone.cond) { 1287 case BLK_ZONE_COND_OFFLINE: 1288 case BLK_ZONE_COND_READONLY: 1289 btrfs_err(fs_info, 1290 "zoned: offline/readonly zone %llu on device %s (devid %llu)", 1291 physical >> device->zone_info->zone_size_shift, 1292 rcu_str_deref(device->name), device->devid); 1293 alloc_offsets[i] = WP_MISSING_DEV; 1294 break; 1295 case BLK_ZONE_COND_EMPTY: 1296 alloc_offsets[i] = 0; 1297 break; 1298 case BLK_ZONE_COND_FULL: 1299 alloc_offsets[i] = caps[i]; 1300 break; 1301 default: 1302 /* Partially used zone */ 1303 alloc_offsets[i] = 1304 ((zone.wp - zone.start) << SECTOR_SHIFT); 1305 __set_bit(i, active); 1306 break; 1307 } 1308 1309 /* 1310 * Consider a zone as active if we can allow any number of 1311 * active zones. 1312 */ 1313 if (!device->zone_info->max_active_zones) 1314 __set_bit(i, active); 1315 } 1316 1317 if (num_sequential > 0) 1318 cache->seq_zone = true; 1319 1320 if (num_conventional > 0) { 1321 /* 1322 * Avoid calling calculate_alloc_pointer() for new BG. It 1323 * is no use for new BG. It must be always 0. 1324 * 1325 * Also, we have a lock chain of extent buffer lock -> 1326 * chunk mutex. For new BG, this function is called from 1327 * btrfs_make_block_group() which is already taking the 1328 * chunk mutex. Thus, we cannot call 1329 * calculate_alloc_pointer() which takes extent buffer 1330 * locks to avoid deadlock. 
1331 */ 1332 1333 /* Zone capacity is always zone size in emulation */ 1334 cache->zone_capacity = cache->length; 1335 if (new) { 1336 cache->alloc_offset = 0; 1337 goto out; 1338 } 1339 ret = calculate_alloc_pointer(cache, &last_alloc); 1340 if (ret || map->num_stripes == num_conventional) { 1341 if (!ret) 1342 cache->alloc_offset = last_alloc; 1343 else 1344 btrfs_err(fs_info, 1345 "zoned: failed to determine allocation offset of bg %llu", 1346 cache->start); 1347 goto out; 1348 } 1349 } 1350 1351 switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 1352 case 0: /* single */ 1353 if (alloc_offsets[0] == WP_MISSING_DEV) { 1354 btrfs_err(fs_info, 1355 "zoned: cannot recover write pointer for zone %llu", 1356 physical); 1357 ret = -EIO; 1358 goto out; 1359 } 1360 cache->alloc_offset = alloc_offsets[0]; 1361 cache->zone_capacity = caps[0]; 1362 cache->zone_is_active = test_bit(0, active); 1363 break; 1364 case BTRFS_BLOCK_GROUP_DUP: 1365 case BTRFS_BLOCK_GROUP_RAID1: 1366 case BTRFS_BLOCK_GROUP_RAID0: 1367 case BTRFS_BLOCK_GROUP_RAID10: 1368 case BTRFS_BLOCK_GROUP_RAID5: 1369 case BTRFS_BLOCK_GROUP_RAID6: 1370 /* non-single profiles are not supported yet */ 1371 default: 1372 btrfs_err(fs_info, "zoned: profile %s not yet supported", 1373 btrfs_bg_type_to_raid_name(map->type)); 1374 ret = -EINVAL; 1375 goto out; 1376 } 1377 1378 if (cache->zone_is_active) { 1379 btrfs_get_block_group(cache); 1380 spin_lock(&fs_info->zone_active_bgs_lock); 1381 list_add_tail(&cache->active_bg_list, &fs_info->zone_active_bgs); 1382 spin_unlock(&fs_info->zone_active_bgs_lock); 1383 } 1384 1385 out: 1386 if (cache->alloc_offset > fs_info->zone_size) { 1387 btrfs_err(fs_info, 1388 "zoned: invalid write pointer %llu in block group %llu", 1389 cache->alloc_offset, cache->start); 1390 ret = -EIO; 1391 } 1392 1393 if (cache->alloc_offset > cache->zone_capacity) { 1394 btrfs_err(fs_info, 1395 "zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu", 1396 cache->alloc_offset, cache->zone_capacity, 1397 cache->start); 1398 ret = -EIO; 1399 } 1400 1401 /* An extent is allocated after the write pointer */ 1402 if (!ret && num_conventional && last_alloc > cache->alloc_offset) { 1403 btrfs_err(fs_info, 1404 "zoned: got wrong write pointer in BG %llu: %llu > %llu", 1405 logical, last_alloc, cache->alloc_offset); 1406 ret = -EIO; 1407 } 1408 1409 if (!ret) 1410 cache->meta_write_pointer = cache->alloc_offset + cache->start; 1411 1412 if (ret) { 1413 kfree(cache->physical_map); 1414 cache->physical_map = NULL; 1415 } 1416 bitmap_free(active); 1417 kfree(caps); 1418 kfree(alloc_offsets); 1419 free_extent_map(em); 1420 1421 return ret; 1422 } 1423 1424 void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) 1425 { 1426 u64 unusable, free; 1427 1428 if (!btrfs_is_zoned(cache->fs_info)) 1429 return; 1430 1431 WARN_ON(cache->bytes_super != 0); 1432 unusable = (cache->alloc_offset - cache->used) + 1433 (cache->length - cache->zone_capacity); 1434 free = cache->zone_capacity - cache->alloc_offset; 1435 1436 /* We only need ->free_space in ALLOC_SEQ block groups */ 1437 cache->last_byte_to_unpin = (u64)-1; 1438 cache->cached = BTRFS_CACHE_FINISHED; 1439 cache->free_space_ctl->free_space = free; 1440 cache->zone_unusable = unusable; 1441 } 1442 1443 void btrfs_redirty_list_add(struct btrfs_transaction *trans, 1444 struct extent_buffer *eb) 1445 { 1446 struct btrfs_fs_info *fs_info = eb->fs_info; 1447 1448 if (!btrfs_is_zoned(fs_info) || 1449 btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN) || 1450 
void btrfs_redirty_list_add(struct btrfs_transaction *trans,
			    struct extent_buffer *eb)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;

	if (!btrfs_is_zoned(fs_info) ||
	    btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN) ||
	    !list_empty(&eb->release_list))
		return;

	set_extent_buffer_dirty(eb);
	set_extent_bits_nowait(&trans->dirty_pages, eb->start,
			       eb->start + eb->len - 1, EXTENT_DIRTY);
	memzero_extent_buffer(eb, 0, eb->len);
	set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags);

	spin_lock(&trans->releasing_ebs_lock);
	list_add_tail(&eb->release_list, &trans->releasing_ebs);
	spin_unlock(&trans->releasing_ebs_lock);
	atomic_inc(&eb->refs);
}

void btrfs_free_redirty_list(struct btrfs_transaction *trans)
{
	spin_lock(&trans->releasing_ebs_lock);
	while (!list_empty(&trans->releasing_ebs)) {
		struct extent_buffer *eb;

		eb = list_first_entry(&trans->releasing_ebs,
				      struct extent_buffer, release_list);
		list_del_init(&eb->release_list);
		free_extent_buffer(eb);
	}
	spin_unlock(&trans->releasing_ebs_lock);
}

bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_block_group *cache;
	bool ret = false;

	if (!btrfs_is_zoned(fs_info))
		return false;

	if (!is_data_inode(&inode->vfs_inode))
		return false;

	/*
	 * Using REQ_OP_ZONE_APPEND for relocation can break assumptions on the
	 * extent layout the relocation code has.
	 * Furthermore we have set aside our own block group from which only the
	 * relocation "process" can allocate and make sure only one process at a
	 * time can add pages to an extent that gets relocated, so it's safe to
	 * use regular REQ_OP_WRITE for this special case.
	 */
	if (btrfs_is_data_reloc_root(inode->root))
		return false;

	cache = btrfs_lookup_block_group(fs_info, start);
	ASSERT(cache);
	if (!cache)
		return false;

	ret = cache->seq_zone;
	btrfs_put_block_group(cache);

	return ret;
}

void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset,
				 struct bio *bio)
{
	struct btrfs_ordered_extent *ordered;
	const u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;

	if (bio_op(bio) != REQ_OP_ZONE_APPEND)
		return;

	ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), file_offset);
	if (WARN_ON(!ordered))
		return;

	ordered->physical = physical;
	ordered->bdev = bio->bi_bdev;

	btrfs_put_ordered_extent(ordered);
}
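/*
 * Illustrative note: with REQ_OP_ZONE_APPEND the device picks the actual
 * write location and reports it back in bio->bi_iter.bi_sector on completion.
 * The physical address captured above is translated back to a logical address
 * in btrfs_rewrite_logical_zoned() below.
 */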
void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered)
{
	struct btrfs_inode *inode = BTRFS_I(ordered->inode);
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct btrfs_ordered_sum *sum;
	u64 orig_logical = ordered->disk_bytenr;
	u64 *logical = NULL;
	int nr, stripe_len;

	/* Zoned devices should not have partitions. So, we can assume it is 0 */
	ASSERT(!bdev_is_partition(ordered->bdev));
	if (WARN_ON(!ordered->bdev))
		return;

	if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, ordered->bdev,
				     ordered->physical, &logical, &nr,
				     &stripe_len)))
		goto out;

	WARN_ON(nr != 1);

	if (orig_logical == *logical)
		goto out;

	ordered->disk_bytenr = *logical;

	em_tree = &inode->extent_tree;
	write_lock(&em_tree->lock);
	em = search_extent_mapping(em_tree, ordered->file_offset,
				   ordered->num_bytes);
	em->block_start = *logical;
	free_extent_map(em);
	write_unlock(&em_tree->lock);

	list_for_each_entry(sum, &ordered->list, list) {
		if (*logical < orig_logical)
			sum->bytenr -= orig_logical - *logical;
		else
			sum->bytenr += *logical - orig_logical;
	}

out:
	kfree(logical);
}

bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb,
				    struct btrfs_block_group **cache_ret)
{
	struct btrfs_block_group *cache;
	bool ret = true;

	if (!btrfs_is_zoned(fs_info))
		return true;

	cache = *cache_ret;

	if (cache && (eb->start < cache->start ||
		      cache->start + cache->length <= eb->start)) {
		btrfs_put_block_group(cache);
		cache = NULL;
		*cache_ret = NULL;
	}

	if (!cache)
		cache = btrfs_lookup_block_group(fs_info, eb->start);

	if (cache) {
		if (cache->meta_write_pointer != eb->start) {
			btrfs_put_block_group(cache);
			cache = NULL;
			ret = false;
		} else {
			cache->meta_write_pointer = eb->start + eb->len;
		}

		*cache_ret = cache;
	}

	return ret;
}

void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache,
				     struct extent_buffer *eb)
{
	if (!btrfs_is_zoned(eb->fs_info) || !cache)
		return;

	ASSERT(cache->meta_write_pointer == eb->start + eb->len);
	cache->meta_write_pointer = eb->start;
}

int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length)
{
	if (!btrfs_dev_is_sequential(device, physical))
		return -EOPNOTSUPP;

	return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT,
				    length >> SECTOR_SHIFT, GFP_NOFS, 0);
}

static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
			  struct blk_zone *zone)
{
	struct btrfs_io_context *bioc = NULL;
	u64 mapped_length = PAGE_SIZE;
	unsigned int nofs_flag;
	int nmirrors;
	int i, ret;

	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
			       &mapped_length, &bioc);
	if (ret || !bioc || mapped_length < PAGE_SIZE) {
		btrfs_put_bioc(bioc);
		return -EIO;
	}

	if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK)
		return -EINVAL;

	nofs_flag = memalloc_nofs_save();
	nmirrors = (int)bioc->num_stripes;
	for (i = 0; i < nmirrors; i++) {
		u64 physical = bioc->stripes[i].physical;
		struct btrfs_device *dev = bioc->stripes[i].dev;

		/* Missing device */
		if (!dev->bdev)
			continue;

		ret = btrfs_get_dev_zone(dev, physical, zone);
		/* Failing device */
		if (ret == -EIO || ret == -EOPNOTSUPP)
			continue;
		break;
	}
	memalloc_nofs_restore(nofs_flag);

	return ret;
}

/*
 * Synchronize write pointer in a zone at @physical_start on @tgt_dev, by
 * filling zeros from @physical_pos to the write pointer of the dev-replace
 * source device.
 */
int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
				  u64 physical_start, u64 physical_pos)
{
	struct btrfs_fs_info *fs_info = tgt_dev->fs_info;
	struct blk_zone zone;
	u64 length;
	u64 wp;
	int ret;

	if (!btrfs_dev_is_sequential(tgt_dev, physical_pos))
		return 0;

	ret = read_zone_info(fs_info, logical, &zone);
	if (ret)
		return ret;

	wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT);

	if (physical_pos == wp)
		return 0;

	if (physical_pos > wp)
		return -EUCLEAN;

	length = wp - physical_pos;
	return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length);
}

struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
					    u64 logical, u64 length)
{
	struct btrfs_device *device;
	struct extent_map *em;
	struct map_lookup *map;

	em = btrfs_get_chunk_map(fs_info, logical, length);
	if (IS_ERR(em))
		return ERR_CAST(em);

	map = em->map_lookup;
	/* We only support single profile for now */
	ASSERT(map->num_stripes == 1);
	device = map->stripes[0].dev;

	free_extent_map(em);

	return device;
}

/**
 * Activate block group and underlying device zones
 *
 * @block_group: the block group to activate
 *
 * Return: true on success, false otherwise
 */
bool btrfs_zone_activate(struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct map_lookup *map;
	struct btrfs_device *device;
	u64 physical;
	bool ret;

	if (!btrfs_is_zoned(block_group->fs_info))
		return true;

	map = block_group->physical_map;
	/* Currently support SINGLE profile only */
	ASSERT(map->num_stripes == 1);
	device = map->stripes[0].dev;
	physical = map->stripes[0].physical;

	if (device->zone_info->max_active_zones == 0)
		return true;

	spin_lock(&block_group->lock);

	if (block_group->zone_is_active) {
		ret = true;
		goto out_unlock;
	}

	/* No space left */
	if (block_group->alloc_offset == block_group->zone_capacity) {
		ret = false;
		goto out_unlock;
	}

	if (!btrfs_dev_set_active_zone(device, physical)) {
		/* Cannot activate the zone */
		ret = false;
		goto out_unlock;
	}

	/* Successfully activated all the zones */
	block_group->zone_is_active = 1;

	spin_unlock(&block_group->lock);

	/* For the active block group list */
	btrfs_get_block_group(block_group);

	spin_lock(&fs_info->zone_active_bgs_lock);
	ASSERT(list_empty(&block_group->active_bg_list));
	list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
	spin_unlock(&fs_info->zone_active_bgs_lock);

	return true;

out_unlock:
	spin_unlock(&block_group->lock);
	return ret;
}
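/*
 * Illustrative note: on devices advertising max_active_zones, activating a
 * block group consumes one slot from active_zones_left via
 * btrfs_dev_set_active_zone(); the slot is returned when the zone is finished
 * or reset (btrfs_dev_clear_active_zone()), which btrfs_zone_finish() and
 * btrfs_zone_finish_endio() below take care of.
 */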
int btrfs_zone_finish(struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct map_lookup *map;
	struct btrfs_device *device;
	u64 physical;
	int ret = 0;

	if (!btrfs_is_zoned(fs_info))
		return 0;

	map = block_group->physical_map;
	/* Currently support SINGLE profile only */
	ASSERT(map->num_stripes == 1);

	device = map->stripes[0].dev;
	physical = map->stripes[0].physical;

	if (device->zone_info->max_active_zones == 0)
		return 0;

	spin_lock(&block_group->lock);
	if (!block_group->zone_is_active) {
		spin_unlock(&block_group->lock);
		return 0;
	}

	/* Check if we have unwritten allocated space */
	if ((block_group->flags &
	     (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) &&
	    block_group->alloc_offset > block_group->meta_write_pointer) {
		spin_unlock(&block_group->lock);
		return -EAGAIN;
	}
	spin_unlock(&block_group->lock);

	ret = btrfs_inc_block_group_ro(block_group, false);
	if (ret)
		return ret;

	/* Ensure all writes in this block group finish */
	btrfs_wait_block_group_reservations(block_group);
	/* No need to wait for NOCOW writers. Zoned mode does not allow that. */
	btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
				 block_group->length);

	spin_lock(&block_group->lock);

	/*
	 * Bail out if someone already deactivated the block group, or
	 * allocated space is left in the block group.
	 */
	if (!block_group->zone_is_active) {
		spin_unlock(&block_group->lock);
		btrfs_dec_block_group_ro(block_group);
		return 0;
	}

	if (block_group->reserved) {
		spin_unlock(&block_group->lock);
		btrfs_dec_block_group_ro(block_group);
		return -EAGAIN;
	}

	block_group->zone_is_active = 0;
	block_group->alloc_offset = block_group->zone_capacity;
	block_group->free_space_ctl->free_space = 0;
	btrfs_clear_treelog_bg(block_group);
	spin_unlock(&block_group->lock);

	ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
			       physical >> SECTOR_SHIFT,
			       device->zone_info->zone_size >> SECTOR_SHIFT,
			       GFP_NOFS);
	btrfs_dec_block_group_ro(block_group);

	if (!ret) {
		btrfs_dev_clear_active_zone(device, physical);

		spin_lock(&fs_info->zone_active_bgs_lock);
		ASSERT(!list_empty(&block_group->active_bg_list));
		list_del_init(&block_group->active_bg_list);
		spin_unlock(&fs_info->zone_active_bgs_lock);

		/* For active_bg_list */
		btrfs_put_block_group(block_group);
	}

	return ret;
}

bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, int raid_index)
{
	struct btrfs_device *device;
	bool ret = false;

	if (!btrfs_is_zoned(fs_devices->fs_info))
		return true;

	/* Non-single profiles are not supported yet */
	if (raid_index != BTRFS_RAID_SINGLE)
		return false;

	/* Check if there is a device with active zones left */
	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		struct btrfs_zoned_device_info *zinfo = device->zone_info;

		if (!device->bdev)
			continue;

		if (!zinfo->max_active_zones ||
		    atomic_read(&zinfo->active_zones_left)) {
			ret = true;
			break;
		}
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}

void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
{
	struct btrfs_block_group *block_group;
	struct map_lookup *map;
	struct btrfs_device *device;
	u64 physical;

	if (!btrfs_is_zoned(fs_info))
		return;

	block_group = btrfs_lookup_block_group(fs_info, logical);
	ASSERT(block_group);

	if (logical + length < block_group->start + block_group->zone_capacity)
		goto out;

	spin_lock(&block_group->lock);

	if (!block_group->zone_is_active) {
		spin_unlock(&block_group->lock);
		goto out;
	}

	block_group->zone_is_active = 0;
	/* We should have consumed all the free space */
	ASSERT(block_group->alloc_offset == block_group->zone_capacity);
	ASSERT(block_group->free_space_ctl->free_space == 0);
	btrfs_clear_treelog_bg(block_group);
	spin_unlock(&block_group->lock);

	map = block_group->physical_map;
	device = map->stripes[0].dev;
	physical = map->stripes[0].physical;

	if (!device->zone_info->max_active_zones)
		goto out;

	btrfs_dev_clear_active_zone(device, physical);

	spin_lock(&fs_info->zone_active_bgs_lock);
	ASSERT(!list_empty(&block_group->active_bg_list));
	list_del_init(&block_group->active_bg_list);
	spin_unlock(&fs_info->zone_active_bgs_lock);

	btrfs_put_block_group(block_group);

out:
	btrfs_put_block_group(block_group);
}

void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;

	spin_lock(&fs_info->relocation_bg_lock);
	if (fs_info->data_reloc_bg == bg->start)
		fs_info->data_reloc_bg = 0;
	spin_unlock(&fs_info->relocation_bg_lock);
}