// SPDX-License-Identifier: GPL-2.0

#include <linux/slab.h>
#include <linux/blkdev.h>
#include "ctree.h"
#include "volumes.h"
#include "zoned.h"
#include "rcu-string.h"
#include "disk-io.h"

/* Maximum number of zones to report per blkdev_report_zones() call */
#define BTRFS_REPORT_NR_ZONES	4096

/* Number of superblock log zones */
#define BTRFS_NR_SB_LOG_ZONES	2

static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
{
	struct blk_zone *zones = data;

	memcpy(&zones[idx], zone, sizeof(*zone));

	return 0;
}

static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
			    u64 *wp_ret)
{
	bool empty[BTRFS_NR_SB_LOG_ZONES];
	bool full[BTRFS_NR_SB_LOG_ZONES];
	sector_t sector;

	ASSERT(zones[0].type != BLK_ZONE_TYPE_CONVENTIONAL &&
	       zones[1].type != BLK_ZONE_TYPE_CONVENTIONAL);

	empty[0] = (zones[0].cond == BLK_ZONE_COND_EMPTY);
	empty[1] = (zones[1].cond == BLK_ZONE_COND_EMPTY);
	full[0] = (zones[0].cond == BLK_ZONE_COND_FULL);
	full[1] = (zones[1].cond == BLK_ZONE_COND_FULL);

	/*
	 * Possible states of log buffer zones
	 *
	 *            Empty[0]  In use[0]  Full[0]
	 * Empty[1]          *          x        0
	 * In use[1]         0          x        0
	 * Full[1]           1          1        C
	 *
	 * Log position:
	 *   *: Special case, no superblock is written
	 *   0: Use write pointer of zones[0]
	 *   1: Use write pointer of zones[1]
	 *   C: Compare super blocks from zones[0] and zones[1], use the latest
	 *      one determined by generation
	 *   x: Invalid state
	 */

	if (empty[0] && empty[1]) {
		/* Special case to distinguish no superblock to read */
		*wp_ret = zones[0].start << SECTOR_SHIFT;
		return -ENOENT;
	} else if (full[0] && full[1]) {
		/* Compare two super blocks */
		struct address_space *mapping = bdev->bd_inode->i_mapping;
		struct page *page[BTRFS_NR_SB_LOG_ZONES];
		struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
		int i;

		for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
			u64 bytenr;

			bytenr = ((zones[i].start + zones[i].len)
				  << SECTOR_SHIFT) - BTRFS_SUPER_INFO_SIZE;

			page[i] = read_cache_page_gfp(mapping,
					bytenr >> PAGE_SHIFT, GFP_NOFS);
			if (IS_ERR(page[i])) {
				if (i == 1)
					btrfs_release_disk_super(super[0]);
				return PTR_ERR(page[i]);
			}
			super[i] = page_address(page[i]);
		}

		if (super[0]->generation > super[1]->generation)
			sector = zones[1].start;
		else
			sector = zones[0].start;

		for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
			btrfs_release_disk_super(super[i]);
	} else if (!full[0] && (empty[1] || full[1])) {
		sector = zones[0].wp;
	} else if (full[0]) {
		sector = zones[1].wp;
	} else {
		return -EUCLEAN;
	}
	*wp_ret = sector << SECTOR_SHIFT;
	return 0;
}
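
/*
 * Worked example for the table above (illustrative only): if zones[0]
 * is FULL and zones[1] is partially written, the "full[0]" branch picks
 * zones[1].wp, so logging continues in the second zone. Once both zones
 * are FULL, the generation of the last super block in each zone decides
 * the outcome: the start of the zone holding the *older* copy is
 * returned, as that zone is the one to reset and rewrite next.
 */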

/*
 * The following zones are reserved as the circular buffer on ZONED btrfs.
 *  - The primary superblock: zones 0 and 1
 *  - The first copy: zones 16 and 17
 *  - The second copy: zone 1024 or the zone at 256GB, whichever is
 *    smaller, and the following one
 */
static inline u32 sb_zone_number(int shift, int mirror)
{
	ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);

	switch (mirror) {
	case 0: return 0;
	case 1: return 16;
	case 2: return min_t(u64, btrfs_sb_offset(mirror) >> shift, 1024);
	}

	return 0;
}
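
/*
 * Worked example (illustrative): with 1GiB zones, zone_size_shift is
 * 30, so mirror 2 maps to min(btrfs_sb_offset(2) >> 30, 1024) =
 * min(256GiB >> 30, 1024) = zone 256, which starts exactly at 256GiB.
 * With 256MiB zones (shift 28) the formula yields min(1024, 1024) =
 * zone 1024, again starting at 256GiB. Only for zones smaller than
 * 256MiB does the 1024 cap win and move the copy below 256GiB.
 */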

static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
			       struct blk_zone *zones, unsigned int *nr_zones)
{
	int ret;

	if (!*nr_zones)
		return 0;

	ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
				  copy_zone_info_cb, zones);
	if (ret < 0) {
		btrfs_err_in_rcu(device->fs_info,
			"zoned: failed to read zone %llu on %s (devid %llu)",
			pos, rcu_str_deref(device->name),
			device->devid);
		return ret;
	}
	*nr_zones = ret;
	if (!ret)
		return -EIO;

	return 0;
}

int btrfs_get_dev_zone_info(struct btrfs_device *device)
{
	struct btrfs_zoned_device_info *zone_info = NULL;
	struct block_device *bdev = device->bdev;
	struct request_queue *queue = bdev_get_queue(bdev);
	sector_t nr_sectors;
	sector_t sector = 0;
	struct blk_zone *zones = NULL;
	unsigned int i, nreported = 0, nr_zones;
	unsigned int zone_sectors;
	int ret;

	if (!bdev_is_zoned(bdev))
		return 0;

	if (device->zone_info)
		return 0;

	zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL);
	if (!zone_info)
		return -ENOMEM;

	nr_sectors = bdev_nr_sectors(bdev);
	zone_sectors = bdev_zone_sectors(bdev);
	/* Check if it's a power of 2 (see is_power_of_2()) */
	ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0);
	zone_info->zone_size = zone_sectors << SECTOR_SHIFT;
	zone_info->zone_size_shift = ilog2(zone_info->zone_size);
	zone_info->max_zone_append_size =
		(u64)queue_max_zone_append_sectors(queue) << SECTOR_SHIFT;
	zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
	if (!IS_ALIGNED(nr_sectors, zone_sectors))
		zone_info->nr_zones++;

	zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->seq_zones) {
		ret = -ENOMEM;
		goto out;
	}

	zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->empty_zones) {
		ret = -ENOMEM;
		goto out;
	}

	zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
	if (!zones) {
		ret = -ENOMEM;
		goto out;
	}

	/* Get the zone type and condition of each zone */
	while (sector < nr_sectors) {
		nr_zones = BTRFS_REPORT_NR_ZONES;
		ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
					  &nr_zones);
		if (ret)
			goto out;

		for (i = 0; i < nr_zones; i++) {
			if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
				__set_bit(nreported, zone_info->seq_zones);
			if (zones[i].cond == BLK_ZONE_COND_EMPTY)
				__set_bit(nreported, zone_info->empty_zones);
			nreported++;
		}
		sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
	}

	if (nreported != zone_info->nr_zones) {
		btrfs_err_in_rcu(device->fs_info,
			"inconsistent number of zones on %s (%u/%u)",
			rcu_str_deref(device->name), nreported,
			zone_info->nr_zones);
		ret = -EIO;
		goto out;
	}

	/* Validate superblock log */
	nr_zones = BTRFS_NR_SB_LOG_ZONES;
	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		u32 sb_zone;
		u64 sb_wp;
		int sb_pos = BTRFS_NR_SB_LOG_ZONES * i;

		sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
		if (sb_zone + 1 >= zone_info->nr_zones)
			continue;

		sector = sb_zone << (zone_info->zone_size_shift - SECTOR_SHIFT);
		ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT,
					  &zone_info->sb_zones[sb_pos],
					  &nr_zones);
		if (ret)
			goto out;

		if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
			btrfs_err_in_rcu(device->fs_info,
	"zoned: failed to read super block log zone info at devid %llu zone %u",
				device->devid, sb_zone);
			ret = -EUCLEAN;
			goto out;
		}

		/*
		 * If zones[0] is conventional, always use the beginning of
		 * the zone to record superblock. No need to validate in
		 * that case.
		 */
		if (zone_info->sb_zones[sb_pos].type ==
		    BLK_ZONE_TYPE_CONVENTIONAL)
			continue;

		ret = sb_write_pointer(device->bdev,
				       &zone_info->sb_zones[sb_pos], &sb_wp);
		if (ret != -ENOENT && ret) {
			btrfs_err_in_rcu(device->fs_info,
			"zoned: super block log zone corrupted devid %llu zone %u",
				device->devid, sb_zone);
			ret = -EUCLEAN;
			goto out;
		}
	}

	kfree(zones);

	device->zone_info = zone_info;

	/* device->fs_info is not safe to use for printing messages */
	btrfs_info_in_rcu(NULL,
		"host-%s zoned block device %s, %u zones of %llu bytes",
		bdev_zoned_model(bdev) == BLK_ZONED_HM ? "managed" : "aware",
		rcu_str_deref(device->name), zone_info->nr_zones,
		zone_info->zone_size);

	return 0;

out:
	kfree(zones);
	bitmap_free(zone_info->empty_zones);
	bitmap_free(zone_info->seq_zones);
	kfree(zone_info);

	return ret;
}
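
/*
 * Usage sketch (hypothetical caller, for illustration only), assuming
 * @device was opened through the usual device open path:
 *
 *	ret = btrfs_get_dev_zone_info(device);
 *	if (ret)
 *		return ret;
 *	if (device->zone_info)
 *		zone_size = device->zone_info->zone_size;
 *
 * For a regular (non-zoned) block device the function is a no-op and
 * device->zone_info stays NULL. Note that nr_zones is rounded up: a
 * 10GiB device with 256MiB zones has 40 zones, while a 10.1GiB device
 * reports 41, the last zone covering only the 0.1GiB tail.
 */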

void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
{
	struct btrfs_zoned_device_info *zone_info = device->zone_info;

	if (!zone_info)
		return;

	bitmap_free(zone_info->seq_zones);
	bitmap_free(zone_info->empty_zones);
	kfree(zone_info);
	device->zone_info = NULL;
}

int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
		       struct blk_zone *zone)
{
	unsigned int nr_zones = 1;
	int ret;

	ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones);
	if (ret != 0 || !nr_zones)
		return ret ? ret : -EIO;

	return 0;
}

int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	u64 zoned_devices = 0;
	u64 nr_devices = 0;
	u64 zone_size = 0;
	u64 max_zone_append_size = 0;
	const bool incompat_zoned = btrfs_is_zoned(fs_info);
	int ret = 0;

	/* Count zoned devices */
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		enum blk_zoned_model model;

		if (!device->bdev)
			continue;

		model = bdev_zoned_model(device->bdev);
		if (model == BLK_ZONED_HM ||
		    (model == BLK_ZONED_HA && incompat_zoned)) {
			struct btrfs_zoned_device_info *zone_info;

			zone_info = device->zone_info;
			zoned_devices++;
			if (!zone_size) {
				zone_size = zone_info->zone_size;
			} else if (zone_info->zone_size != zone_size) {
				btrfs_err(fs_info,
		"zoned: unequal block device zone sizes: have %llu found %llu",
					  zone_info->zone_size,
					  zone_size);
				ret = -EINVAL;
				goto out;
			}
			if (!max_zone_append_size ||
			    (zone_info->max_zone_append_size &&
			     zone_info->max_zone_append_size < max_zone_append_size))
				max_zone_append_size =
					zone_info->max_zone_append_size;
		}
		nr_devices++;
	}

	if (!zoned_devices && !incompat_zoned)
		goto out;

	if (!zoned_devices && incompat_zoned) {
		/* No zoned block device found on ZONED filesystem */
		btrfs_err(fs_info,
			  "zoned: no zoned devices found on a zoned filesystem");
		ret = -EINVAL;
		goto out;
	}

	if (zoned_devices && !incompat_zoned) {
		btrfs_err(fs_info,
			  "zoned: mode not enabled but zoned device found");
		ret = -EINVAL;
		goto out;
	}

	if (zoned_devices != nr_devices) {
		btrfs_err(fs_info,
			  "zoned: cannot mix zoned and regular devices");
		ret = -EINVAL;
		goto out;
	}

	/*
	 * stripe_size is always aligned to BTRFS_STRIPE_LEN in
	 * __btrfs_alloc_chunk(). Since we want stripe_len == zone_size,
	 * check the alignment here.
	 */
	if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
		btrfs_err(fs_info,
			  "zoned: zone size %llu not aligned to stripe %u",
			  zone_size, BTRFS_STRIPE_LEN);
		ret = -EINVAL;
		goto out;
	}

	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		btrfs_err(fs_info, "zoned: mixed block groups not supported");
		ret = -EINVAL;
		goto out;
	}

	fs_info->zone_size = zone_size;
	fs_info->max_zone_append_size = max_zone_append_size;

	btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
out:
	return ret;
}
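
/*
 * Example (illustrative): on a two-device filesystem where both devices
 * report 256MiB zones but max_zone_append_size values of 1MiB and
 * 512KiB, the mount is accepted and the smaller 512KiB limit is
 * recorded, since every zone append must satisfy the most restrictive
 * device. A 256MiB zone size also trivially passes the
 * BTRFS_STRIPE_LEN (64KiB) alignment check above.
 */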

int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
{
	if (!btrfs_is_zoned(info))
		return 0;

	/*
	 * Writing the v1 space cache is not COWed. Disable it to avoid
	 * write errors in sequential zones.
	 */
	if (btrfs_test_opt(info, SPACE_CACHE)) {
		btrfs_err(info, "zoned: space cache v1 is not supported");
		return -EINVAL;
	}

	if (btrfs_test_opt(info, NODATACOW)) {
		btrfs_err(info, "zoned: NODATACOW not supported");
		return -EINVAL;
	}

	return 0;
}

static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
			   int rw, u64 *bytenr_ret)
{
	u64 wp;
	int ret;

	if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
		*bytenr_ret = zones[0].start << SECTOR_SHIFT;
		return 0;
	}

	ret = sb_write_pointer(bdev, zones, &wp);
	if (ret != -ENOENT && ret < 0)
		return ret;

	if (rw == WRITE) {
		struct blk_zone *reset = NULL;

		if (wp == zones[0].start << SECTOR_SHIFT)
			reset = &zones[0];
		else if (wp == zones[1].start << SECTOR_SHIFT)
			reset = &zones[1];

		if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
			ASSERT(reset->cond == BLK_ZONE_COND_FULL);

			ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
					       reset->start, reset->len,
					       GFP_NOFS);
			if (ret)
				return ret;

			reset->cond = BLK_ZONE_COND_EMPTY;
			reset->wp = reset->start;
		}
	} else if (ret != -ENOENT) {
		/* For READ, we want the previous superblock */
		if (wp == zones[0].start << SECTOR_SHIFT)
			wp = (zones[1].start + zones[1].len) << SECTOR_SHIFT;
		wp -= BTRFS_SUPER_INFO_SIZE;
	}

	*bytenr_ret = wp;
	return 0;
}

int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
			       u64 *bytenr_ret)
{
	struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES];
	unsigned int zone_sectors;
	u32 sb_zone;
	int ret;
	u8 zone_sectors_shift;
	sector_t nr_sectors;
	u32 nr_zones;

	if (!bdev_is_zoned(bdev)) {
		*bytenr_ret = btrfs_sb_offset(mirror);
		return 0;
	}

	ASSERT(rw == READ || rw == WRITE);

	zone_sectors = bdev_zone_sectors(bdev);
	if (!is_power_of_2(zone_sectors))
		return -EINVAL;
	zone_sectors_shift = ilog2(zone_sectors);
	nr_sectors = bdev_nr_sectors(bdev);
	nr_zones = nr_sectors >> zone_sectors_shift;

	sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
	if (sb_zone + 1 >= nr_zones)
		return -ENOENT;

	ret = blkdev_report_zones(bdev, sb_zone << zone_sectors_shift,
				  BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
				  zones);
	if (ret < 0)
		return ret;
	if (ret != BTRFS_NR_SB_LOG_ZONES)
		return -EIO;

	return sb_log_location(bdev, zones, rw, bytenr_ret);
}

int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
			  u64 *bytenr_ret)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	u32 zone_num;

	if (!zinfo) {
		*bytenr_ret = btrfs_sb_offset(mirror);
		return 0;
	}

	zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
	if (zone_num + 1 >= zinfo->nr_zones)
		return -ENOENT;

	return sb_log_location(device->bdev,
			       &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror],
			       rw, bytenr_ret);
}

static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
				  int mirror)
{
	u32 zone_num;

	if (!zinfo)
		return false;

	zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
	if (zone_num + 1 >= zinfo->nr_zones)
		return false;

	if (!test_bit(zone_num, zinfo->seq_zones))
		return false;

	return true;
}
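
/*
 * Usage sketch (hypothetical caller, for illustration only): reading
 * the location of the latest super block copy of mirror 0:
 *
 *	u64 bytenr;
 *	int ret;
 *
 *	ret = btrfs_sb_log_location(device, 0, READ, &bytenr);
 *	if (ret)	// -ENOENT: no zone backs this mirror
 *		return ret;
 *	// bytenr now points at the most recently written super block
 *
 * For WRITE, the returned bytenr is the current write position, and a
 * FULL zone pair has its stale zone reset as a side effect in
 * sb_log_location().
 */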

void btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	struct blk_zone *zone;

	if (!is_sb_log_zone(zinfo, mirror))
		return;

	zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
	if (zone->cond != BLK_ZONE_COND_FULL) {
		if (zone->cond == BLK_ZONE_COND_EMPTY)
			zone->cond = BLK_ZONE_COND_IMP_OPEN;

		zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);

		if (zone->wp == zone->start + zone->len)
			zone->cond = BLK_ZONE_COND_FULL;

		return;
	}

	/* The first zone is FULL, advance the write pointer in the second */
	zone++;
	ASSERT(zone->cond != BLK_ZONE_COND_FULL);
	if (zone->cond == BLK_ZONE_COND_EMPTY)
		zone->cond = BLK_ZONE_COND_IMP_OPEN;

	zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);

	if (zone->wp == zone->start + zone->len)
		zone->cond = BLK_ZONE_COND_FULL;
}

int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
{
	sector_t zone_sectors;
	sector_t nr_sectors;
	u8 zone_sectors_shift;
	u32 sb_zone;
	u32 nr_zones;

	zone_sectors = bdev_zone_sectors(bdev);
	zone_sectors_shift = ilog2(zone_sectors);
	nr_sectors = bdev_nr_sectors(bdev);
	nr_zones = nr_sectors >> zone_sectors_shift;

	sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
	if (sb_zone + 1 >= nr_zones)
		return -ENOENT;

	return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
				sb_zone << zone_sectors_shift,
				zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
}
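
/*
 * Worked example of the super block log life cycle (illustrative,
 * assuming 256MiB zones): each log zone holds 256MiB /
 * BTRFS_SUPER_INFO_SIZE (4KiB) = 65536 super block copies. Writes fill
 * zone 0 first; btrfs_advance_sb_log() then advances the write pointer
 * of zone 1. Once both zones are FULL, the next WRITE location lands at
 * the start of the zone holding the older generation, and
 * sb_log_location() resets that zone before reuse, so the pair acts as
 * a circular buffer of the most recent super blocks.
 */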