// SPDX-License-Identifier: GPL-2.0

#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/sched/mm.h>
#include "ctree.h"
#include "volumes.h"
#include "zoned.h"
#include "rcu-string.h"
#include "disk-io.h"
#include "block-group.h"
#include "transaction.h"
#include "dev-replace.h"
#include "space-info.h"

/* Maximum number of zones to report per blkdev_report_zones() call */
#define BTRFS_REPORT_NR_ZONES	4096
/* Invalid allocation pointer value for missing devices */
#define WP_MISSING_DEV ((u64)-1)
/* Pseudo write pointer value for conventional zone */
#define WP_CONVENTIONAL ((u64)-2)

/*
 * Location of the first zone of superblock logging zone pairs.
 *
 * - primary superblock: 0B (zone 0)
 * - first copy: 512G (zone starting at that offset)
 * - second copy: 4T (zone starting at that offset)
 */
#define BTRFS_SB_LOG_PRIMARY_OFFSET	(0ULL)
#define BTRFS_SB_LOG_FIRST_OFFSET	(512ULL * SZ_1G)
#define BTRFS_SB_LOG_SECOND_OFFSET	(4096ULL * SZ_1G)

#define BTRFS_SB_LOG_FIRST_SHIFT	const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
#define BTRFS_SB_LOG_SECOND_SHIFT	const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET)

/* Number of superblock log zones */
#define BTRFS_NR_SB_LOG_ZONES	2

/*
 * Maximum supported zone size. Currently, SMR disks have a zone size of
 * 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not
 * expect the zone size to become larger than 8GiB in the near future.
 */
#define BTRFS_MAX_ZONE_SIZE		SZ_8G

static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
{
	struct blk_zone *zones = data;

	memcpy(&zones[idx], zone, sizeof(*zone));

	return 0;
}

static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
			    u64 *wp_ret)
{
	bool empty[BTRFS_NR_SB_LOG_ZONES];
	bool full[BTRFS_NR_SB_LOG_ZONES];
	sector_t sector;

	ASSERT(zones[0].type != BLK_ZONE_TYPE_CONVENTIONAL &&
	       zones[1].type != BLK_ZONE_TYPE_CONVENTIONAL);

	empty[0] = (zones[0].cond == BLK_ZONE_COND_EMPTY);
	empty[1] = (zones[1].cond == BLK_ZONE_COND_EMPTY);
	full[0] = (zones[0].cond == BLK_ZONE_COND_FULL);
	full[1] = (zones[1].cond == BLK_ZONE_COND_FULL);

	/*
	 * Possible states of log buffer zones
	 *
	 *           Empty[0]  In use[0]  Full[0]
	 * Empty[1]         *          x        0
	 * In use[1]        0          x        0
	 * Full[1]          1          1        C
	 *
	 * Log position:
	 *   *: Special case, no superblock is written
	 *   0: Use write pointer of zones[0]
	 *   1: Use write pointer of zones[1]
	 *   C: Compare super blocks from zones[0] and zones[1], use the latest
	 *      one determined by generation
	 *   x: Invalid state
	 */

	if (empty[0] && empty[1]) {
		/* Special case to distinguish no superblock to read */
		*wp_ret = zones[0].start << SECTOR_SHIFT;
		return -ENOENT;
	} else if (full[0] && full[1]) {
		/* Compare two super blocks */
		struct address_space *mapping = bdev->bd_inode->i_mapping;
		struct page *page[BTRFS_NR_SB_LOG_ZONES];
		struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
		int i;

		for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
			u64 bytenr;

			bytenr = ((zones[i].start + zones[i].len)
				   << SECTOR_SHIFT) - BTRFS_SUPER_INFO_SIZE;

			page[i] = read_cache_page_gfp(mapping,
					bytenr >> PAGE_SHIFT, GFP_NOFS);
			if (IS_ERR(page[i])) {
				if (i == 1)
					btrfs_release_disk_super(super[0]);
				return PTR_ERR(page[i]);
			}
			super[i] = page_address(page[i]);
		}

		if (super[0]->generation > super[1]->generation)
			sector = zones[1].start;
		else
			sector = zones[0].start;

		for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
			btrfs_release_disk_super(super[i]);
	} else if (!full[0] && (empty[1] || full[1])) {
		sector = zones[0].wp;
	} else if (full[0]) {
		sector = zones[1].wp;
	} else {
		return -EUCLEAN;
	}
	*wp_ret = sector << SECTOR_SHIFT;
	return 0;
}

/*
 * Get the first zone number of the superblock mirror
 */
static inline u32 sb_zone_number(int shift, int mirror)
{
	u64 zone;

	ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
	switch (mirror) {
	case 0: zone = 0; break;
	case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break;
	case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break;
	}

	ASSERT(zone <= U32_MAX);

	return (u32)zone;
}
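/*
 * For example, with a 256MiB zone size (shift == 28) the mirror zone numbers
 * work out to 1 << (39 - 28) == 2048 for the first copy (512GiB offset) and
 * 1 << (42 - 28) == 16384 for the second copy (4TiB offset). With a 1GiB zone
 * size (shift == 30) the copies land in zones 512 and 4096 respectively.
 */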
/*
 * Emulate blkdev_report_zones() for a non-zoned device. It slices up the block
 * device into static sized chunks and fakes a conventional zone on each of
 * them.
 */
static int emulate_report_zones(struct btrfs_device *device, u64 pos,
				struct blk_zone *zones, unsigned int nr_zones)
{
	const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT;
	sector_t bdev_size = bdev_nr_sectors(device->bdev);
	unsigned int i;

	pos >>= SECTOR_SHIFT;
	for (i = 0; i < nr_zones; i++) {
		zones[i].start = i * zone_sectors + pos;
		zones[i].len = zone_sectors;
		zones[i].capacity = zone_sectors;
		zones[i].wp = zones[i].start + zone_sectors;
		zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL;
		zones[i].cond = BLK_ZONE_COND_NOT_WP;

		if (zones[i].wp >= bdev_size) {
			i++;
			break;
		}
	}

	return i;
}
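/*
 * As an example of the emulation above: with a 256MiB emulated zone size, a
 * 1TiB regular device is reported as 4096 conventional zones. Conventional
 * zones carry no write pointer (BLK_ZONE_COND_NOT_WP), so such a device is
 * written with regular WRITE commands rather than zone append.
 */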
static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
			       struct blk_zone *zones, unsigned int *nr_zones)
{
	int ret;

	if (!*nr_zones)
		return 0;

	if (!bdev_is_zoned(device->bdev)) {
		ret = emulate_report_zones(device, pos, zones, *nr_zones);
		*nr_zones = ret;
		return 0;
	}

	ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
				  copy_zone_info_cb, zones);
	if (ret < 0) {
		btrfs_err_in_rcu(device->fs_info,
				 "zoned: failed to read zone %llu on %s (devid %llu)",
				 pos, rcu_str_deref(device->name),
				 device->devid);
		return ret;
	}
	*nr_zones = ret;
	if (!ret)
		return -EIO;

	return 0;
}

/* The emulated zone size is determined from the size of the first device extent */
static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
{
	struct btrfs_path *path;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_dev_extent *dext;
	int ret = 0;

	key.objectid = 1;
	key.type = BTRFS_DEV_EXTENT_KEY;
	key.offset = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
		ret = btrfs_next_item(root, path);
		if (ret < 0)
			goto out;
		/* No dev extents at all? Not good */
		if (ret > 0) {
			ret = -EUCLEAN;
			goto out;
		}
	}

	leaf = path->nodes[0];
	dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
	fs_info->zone_size = btrfs_dev_extent_length(leaf, dext);
	ret = 0;

out:
	btrfs_free_path(path);

	return ret;
}

int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	int ret = 0;

	/* fs_info->zone_size might not be set yet. Use the incompat flag here. */
	if (!btrfs_fs_incompat(fs_info, ZONED))
		return 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		/* We can skip reading of zone info for missing devices */
		if (!device->bdev)
			continue;

		ret = btrfs_get_dev_zone_info(device);
		if (ret)
			break;
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}

int btrfs_get_dev_zone_info(struct btrfs_device *device)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_zoned_device_info *zone_info = NULL;
	struct block_device *bdev = device->bdev;
	struct request_queue *queue = bdev_get_queue(bdev);
	sector_t nr_sectors;
	sector_t sector = 0;
	struct blk_zone *zones = NULL;
	unsigned int i, nreported = 0, nr_zones;
	sector_t zone_sectors;
	char *model, *emulated;
	int ret;

	/*
	 * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not
	 * yet be set.
	 */
	if (!btrfs_fs_incompat(fs_info, ZONED))
		return 0;

	if (device->zone_info)
		return 0;

	zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL);
	if (!zone_info)
		return -ENOMEM;

	if (!bdev_is_zoned(bdev)) {
		if (!fs_info->zone_size) {
			ret = calculate_emulated_zone_size(fs_info);
			if (ret)
				goto out;
		}

		ASSERT(fs_info->zone_size);
		zone_sectors = fs_info->zone_size >> SECTOR_SHIFT;
	} else {
		zone_sectors = bdev_zone_sectors(bdev);
	}

	/* Check if it's a power of 2 (see is_power_of_2) */
	ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0);
	zone_info->zone_size = zone_sectors << SECTOR_SHIFT;

	/* We reject devices with a zone size larger than 8GiB */
	if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) {
		btrfs_err_in_rcu(fs_info,
		"zoned: %s: zone size %llu larger than supported maximum %llu",
				 rcu_str_deref(device->name),
				 zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
		ret = -EINVAL;
		goto out;
	}

	nr_sectors = bdev_nr_sectors(bdev);
	zone_info->zone_size_shift = ilog2(zone_info->zone_size);
	zone_info->max_zone_append_size =
		(u64)queue_max_zone_append_sectors(queue) << SECTOR_SHIFT;
	zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
	if (!IS_ALIGNED(nr_sectors, zone_sectors))
		zone_info->nr_zones++;

	zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->seq_zones) {
		ret = -ENOMEM;
		goto out;
	}

	zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->empty_zones) {
		ret = -ENOMEM;
		goto out;
	}

	zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
	if (!zones) {
		ret = -ENOMEM;
		goto out;
	}

	/* Get the zone type and mark empty zones */
	while (sector < nr_sectors) {
		nr_zones = BTRFS_REPORT_NR_ZONES;
		ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
					  &nr_zones);
		if (ret)
			goto out;

		for (i = 0; i < nr_zones; i++) {
			if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
				__set_bit(nreported, zone_info->seq_zones);
			if (zones[i].cond == BLK_ZONE_COND_EMPTY)
				__set_bit(nreported, zone_info->empty_zones);
			nreported++;
		}
		sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
	}

	if (nreported != zone_info->nr_zones) {
		btrfs_err_in_rcu(device->fs_info,
				 "inconsistent number of zones on %s (%u/%u)",
				 rcu_str_deref(device->name), nreported,
				 zone_info->nr_zones);
		ret = -EIO;
		goto out;
	}

	/* Validate superblock log */
	nr_zones = BTRFS_NR_SB_LOG_ZONES;
	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		u32 sb_zone;
		u64 sb_wp;
		int sb_pos = BTRFS_NR_SB_LOG_ZONES * i;

		sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
		if (sb_zone + 1 >= zone_info->nr_zones)
			continue;

		sector = sb_zone << (zone_info->zone_size_shift - SECTOR_SHIFT);
		ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT,
					  &zone_info->sb_zones[sb_pos],
					  &nr_zones);
		if (ret)
			goto out;

		if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
			btrfs_err_in_rcu(device->fs_info,
	"zoned: failed to read super block log zone info at devid %llu zone %u",
					 device->devid, sb_zone);
			ret = -EUCLEAN;
			goto out;
		}

		/*
		 * If zones[0] is conventional, always use the beginning of the
		 * zone to record superblock. No need to validate in that case.
		 */
		if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type ==
		    BLK_ZONE_TYPE_CONVENTIONAL)
			continue;

		ret = sb_write_pointer(device->bdev,
				       &zone_info->sb_zones[sb_pos], &sb_wp);
		if (ret != -ENOENT && ret) {
			btrfs_err_in_rcu(device->fs_info,
			"zoned: super block log zone corrupted devid %llu zone %u",
					 device->devid, sb_zone);
			ret = -EUCLEAN;
			goto out;
		}
	}

	kfree(zones);

	device->zone_info = zone_info;

	switch (bdev_zoned_model(bdev)) {
	case BLK_ZONED_HM:
		model = "host-managed zoned";
		emulated = "";
		break;
	case BLK_ZONED_HA:
		model = "host-aware zoned";
		emulated = "";
		break;
	case BLK_ZONED_NONE:
		model = "regular";
		emulated = "emulated ";
		break;
	default:
		/* Just in case */
		btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s",
				 bdev_zoned_model(bdev),
				 rcu_str_deref(device->name));
		ret = -EOPNOTSUPP;
		goto out_free_zone_info;
	}

	btrfs_info_in_rcu(fs_info,
		"%s block device %s, %u %szones of %llu bytes",
		model, rcu_str_deref(device->name), zone_info->nr_zones,
		emulated, zone_info->zone_size);

	return 0;

out:
	kfree(zones);
out_free_zone_info:
	bitmap_free(zone_info->empty_zones);
	bitmap_free(zone_info->seq_zones);
	kfree(zone_info);
	device->zone_info = NULL;

	return ret;
}
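/*
 * Note on the superblock mirror validation above: a mirror is only validated
 * (and later used) when both zones of its pair fit on the device, i.e. when
 * sb_zone + 1 < nr_zones. For example, with 256MiB zones the second copy
 * starts at zone 16384 (the 4TiB offset), so it only exists on devices larger
 * than about 4TiB.
 */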
void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
{
	struct btrfs_zoned_device_info *zone_info = device->zone_info;

	if (!zone_info)
		return;

	bitmap_free(zone_info->seq_zones);
	bitmap_free(zone_info->empty_zones);
	kfree(zone_info);
	device->zone_info = NULL;
}

int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
		       struct blk_zone *zone)
{
	unsigned int nr_zones = 1;
	int ret;

	ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones);
	if (ret != 0 || !nr_zones)
		return ret ? ret : -EIO;

	return 0;
}

int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	u64 zoned_devices = 0;
	u64 nr_devices = 0;
	u64 zone_size = 0;
	u64 max_zone_append_size = 0;
	const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED);
	int ret = 0;

	/* Count zoned devices */
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		enum blk_zoned_model model;

		if (!device->bdev)
			continue;

		model = bdev_zoned_model(device->bdev);
		/*
		 * A Host-Managed zoned device must be used as a zoned device.
		 * A Host-Aware zoned device and a non-zoned device can be
		 * treated as a zoned device, if the ZONED flag is enabled in
		 * the superblock.
		 */
		if (model == BLK_ZONED_HM ||
		    (model == BLK_ZONED_HA && incompat_zoned) ||
		    (model == BLK_ZONED_NONE && incompat_zoned)) {
			struct btrfs_zoned_device_info *zone_info =
				device->zone_info;

			zoned_devices++;
			if (!zone_size) {
				zone_size = zone_info->zone_size;
			} else if (zone_info->zone_size != zone_size) {
				btrfs_err(fs_info,
		"zoned: unequal block device zone sizes: have %llu found %llu",
					  device->zone_info->zone_size,
					  zone_size);
				ret = -EINVAL;
				goto out;
			}
			if (!max_zone_append_size ||
			    (zone_info->max_zone_append_size &&
			     zone_info->max_zone_append_size < max_zone_append_size))
				max_zone_append_size =
					zone_info->max_zone_append_size;
		}
		nr_devices++;
	}

	if (!zoned_devices && !incompat_zoned)
		goto out;

	if (!zoned_devices && incompat_zoned) {
		/* No zoned block device found on ZONED filesystem */
		btrfs_err(fs_info,
			  "zoned: no zoned devices found on a zoned filesystem");
		ret = -EINVAL;
		goto out;
	}

	if (zoned_devices && !incompat_zoned) {
		btrfs_err(fs_info,
			  "zoned: mode not enabled but zoned device found");
		ret = -EINVAL;
		goto out;
	}

	if (zoned_devices != nr_devices) {
		btrfs_err(fs_info,
			  "zoned: cannot mix zoned and regular devices");
		ret = -EINVAL;
		goto out;
	}

	/*
	 * stripe_size is always aligned to BTRFS_STRIPE_LEN in
	 * __btrfs_alloc_chunk(). Since we want stripe_len == zone_size,
	 * check the alignment here.
	 */
	if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
		btrfs_err(fs_info,
			  "zoned: zone size %llu not aligned to stripe %u",
			  zone_size, BTRFS_STRIPE_LEN);
		ret = -EINVAL;
		goto out;
	}

	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		btrfs_err(fs_info, "zoned: mixed block groups not supported");
		ret = -EINVAL;
		goto out;
	}

	fs_info->zone_size = zone_size;
	fs_info->max_zone_append_size = max_zone_append_size;
	fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;

	/*
	 * Check mount options here, because we might change fs_info->zoned
	 * from fs_info->zone_size.
	 */
	ret = btrfs_check_mountopts_zoned(fs_info);
	if (ret)
		goto out;

	btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
out:
	return ret;
}
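/*
 * Example for the alignment check above: BTRFS_STRIPE_LEN is 64KiB, so a
 * typical 256MiB zone corresponds to 4096 stripes and passes the check, while
 * any zone size that is not a multiple of 64KiB is rejected.
 */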
int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
{
	if (!btrfs_is_zoned(info))
		return 0;

	/*
	 * Space cache writing is not COWed. Disable that to avoid write errors
	 * in sequential zones.
	 */
	if (btrfs_test_opt(info, SPACE_CACHE)) {
		btrfs_err(info, "zoned: space cache v1 is not supported");
		return -EINVAL;
	}

	if (btrfs_test_opt(info, NODATACOW)) {
		btrfs_err(info, "zoned: NODATACOW not supported");
		return -EINVAL;
	}

	return 0;
}

static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
			   int rw, u64 *bytenr_ret)
{
	u64 wp;
	int ret;

	if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
		*bytenr_ret = zones[0].start << SECTOR_SHIFT;
		return 0;
	}

	ret = sb_write_pointer(bdev, zones, &wp);
	if (ret != -ENOENT && ret < 0)
		return ret;

	if (rw == WRITE) {
		struct blk_zone *reset = NULL;

		if (wp == zones[0].start << SECTOR_SHIFT)
			reset = &zones[0];
		else if (wp == zones[1].start << SECTOR_SHIFT)
			reset = &zones[1];

		if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
			ASSERT(reset->cond == BLK_ZONE_COND_FULL);

			ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
					       reset->start, reset->len,
					       GFP_NOFS);
			if (ret)
				return ret;

			reset->cond = BLK_ZONE_COND_EMPTY;
			reset->wp = reset->start;
		}
	} else if (ret != -ENOENT) {
		/* For READ, we want the previous superblock */
		if (wp == zones[0].start << SECTOR_SHIFT)
			wp = (zones[1].start + zones[1].len) << SECTOR_SHIFT;
		wp -= BTRFS_SUPER_INFO_SIZE;
	}

	*bytenr_ret = wp;
	return 0;
}
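/*
 * To illustrate the READ path above: the most recently written superblock sits
 * just below the current write pointer, so the location handed back is
 * wp - BTRFS_SUPER_INFO_SIZE. When the write pointer is at the very start of
 * zones[0], the latest copy is the last superblock at the end of zones[1]
 * instead.
 */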
int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
			       u64 *bytenr_ret)
{
	struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES];
	sector_t zone_sectors;
	u32 sb_zone;
	int ret;
	u8 zone_sectors_shift;
	sector_t nr_sectors;
	u32 nr_zones;

	if (!bdev_is_zoned(bdev)) {
		*bytenr_ret = btrfs_sb_offset(mirror);
		return 0;
	}

	ASSERT(rw == READ || rw == WRITE);

	zone_sectors = bdev_zone_sectors(bdev);
	if (!is_power_of_2(zone_sectors))
		return -EINVAL;
	zone_sectors_shift = ilog2(zone_sectors);
	nr_sectors = bdev_nr_sectors(bdev);
	nr_zones = nr_sectors >> zone_sectors_shift;

	sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
	if (sb_zone + 1 >= nr_zones)
		return -ENOENT;

	ret = blkdev_report_zones(bdev, sb_zone << zone_sectors_shift,
				  BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
				  zones);
	if (ret < 0)
		return ret;
	if (ret != BTRFS_NR_SB_LOG_ZONES)
		return -EIO;

	return sb_log_location(bdev, zones, rw, bytenr_ret);
}

int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
			  u64 *bytenr_ret)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	u32 zone_num;

	/*
	 * For a zoned filesystem on a non-zoned block device, use the same
	 * super block locations as a regular filesystem. Doing so, the super
	 * block can always be retrieved and the zoned flag of the volume
	 * detected from the super block information.
	 */
	if (!bdev_is_zoned(device->bdev)) {
		*bytenr_ret = btrfs_sb_offset(mirror);
		return 0;
	}

	zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
	if (zone_num + 1 >= zinfo->nr_zones)
		return -ENOENT;

	return sb_log_location(device->bdev,
			       &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror],
			       rw, bytenr_ret);
}

static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
				  int mirror)
{
	u32 zone_num;

	if (!zinfo)
		return false;

	zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
	if (zone_num + 1 >= zinfo->nr_zones)
		return false;

	if (!test_bit(zone_num, zinfo->seq_zones))
		return false;

	return true;
}
void btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	struct blk_zone *zone;

	if (!is_sb_log_zone(zinfo, mirror))
		return;

	zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
	if (zone->cond != BLK_ZONE_COND_FULL) {
		if (zone->cond == BLK_ZONE_COND_EMPTY)
			zone->cond = BLK_ZONE_COND_IMP_OPEN;

		zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);

		if (zone->wp == zone->start + zone->len)
			zone->cond = BLK_ZONE_COND_FULL;

		return;
	}

	zone++;
	ASSERT(zone->cond != BLK_ZONE_COND_FULL);
	if (zone->cond == BLK_ZONE_COND_EMPTY)
		zone->cond = BLK_ZONE_COND_IMP_OPEN;

	zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);

	if (zone->wp == zone->start + zone->len)
		zone->cond = BLK_ZONE_COND_FULL;
}
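/*
 * Each superblock write advances the cached write pointer by
 * BTRFS_SUPER_INFO_SIZE (4KiB, i.e. 8 sectors). For example, a 256MiB log
 * zone therefore holds 65536 superblock generations before it goes FULL and
 * writing rolls over to the other zone of the pair; once both zones are FULL,
 * the one holding the older superblock is reset on the next write (see
 * sb_log_location()).
 */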
int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
{
	sector_t zone_sectors;
	sector_t nr_sectors;
	u8 zone_sectors_shift;
	u32 sb_zone;
	u32 nr_zones;

	zone_sectors = bdev_zone_sectors(bdev);
	zone_sectors_shift = ilog2(zone_sectors);
	nr_sectors = bdev_nr_sectors(bdev);
	nr_zones = nr_sectors >> zone_sectors_shift;

	sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
	if (sb_zone + 1 >= nr_zones)
		return -ENOENT;

	return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
				sb_zone << zone_sectors_shift,
				zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
}

/**
 * btrfs_find_allocatable_zones - find allocatable zones within a given region
 *
 * @device:	the device to allocate a region on
 * @hole_start:	the position of the hole to allocate the region
 * @hole_end:	the end of the hole
 * @num_bytes:	size of wanted region
 *
 * Allocatable region should not contain any superblock locations.
 *
 * Return: position of allocatable zones
 */
u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
				 u64 hole_end, u64 num_bytes)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	const u8 shift = zinfo->zone_size_shift;
	u64 nzones = num_bytes >> shift;
	u64 pos = hole_start;
	u64 begin, end;
	bool have_sb;
	int i;

	ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
	ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));

	while (pos < hole_end) {
		begin = pos >> shift;
		end = begin + nzones;

		if (end > zinfo->nr_zones)
			return hole_end;

		/* Check if zones in the region are all empty */
		if (btrfs_dev_is_sequential(device, pos) &&
		    find_next_zero_bit(zinfo->empty_zones, end, begin) != end) {
			pos += zinfo->zone_size;
			continue;
		}

		have_sb = false;
		for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
			u32 sb_zone;
			u64 sb_pos;

			sb_zone = sb_zone_number(shift, i);
			if (!(end <= sb_zone ||
			      sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) {
				have_sb = true;
				pos = ((u64)sb_zone + BTRFS_NR_SB_LOG_ZONES) << shift;
				break;
			}

			/* We also need to exclude regular superblock positions */
			sb_pos = btrfs_sb_offset(i);
			if (!(pos + num_bytes <= sb_pos ||
			      sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) {
				have_sb = true;
				pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE,
					    zinfo->zone_size);
				break;
			}
		}
		if (!have_sb)
			break;
	}

	return pos;
}
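/*
 * Example of the superblock exclusion above: with 256MiB zones, mirror 0's
 * log pair occupies zones 0 and 1, so a hole starting at offset 0 makes the
 * search skip ahead to pos = 512MiB (zone 2) before it can succeed.
 */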
int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
			    u64 length, u64 *bytes)
{
	int ret;

	*bytes = 0;
	ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
			       physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
			       GFP_NOFS);
	if (ret)
		return ret;

	*bytes = length;
	while (length) {
		btrfs_dev_set_zone_empty(device, physical);
		physical += device->zone_info->zone_size;
		length -= device->zone_info->zone_size;
	}

	return 0;
}

int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	const u8 shift = zinfo->zone_size_shift;
	unsigned long begin = start >> shift;
	unsigned long end = (start + size) >> shift;
	u64 pos;
	int ret;

	ASSERT(IS_ALIGNED(start, zinfo->zone_size));
	ASSERT(IS_ALIGNED(size, zinfo->zone_size));

	if (end > zinfo->nr_zones)
		return -ERANGE;

	/* All the zones are conventional */
	if (find_next_bit(zinfo->seq_zones, end, begin) == end)
		return 0;

	/* All the zones are sequential and empty */
	if (find_next_zero_bit(zinfo->seq_zones, end, begin) == end &&
	    find_next_zero_bit(zinfo->empty_zones, end, begin) == end)
		return 0;

	for (pos = start; pos < start + size; pos += zinfo->zone_size) {
		u64 reset_bytes;

		if (!btrfs_dev_is_sequential(device, pos) ||
		    btrfs_dev_is_empty_zone(device, pos))
			continue;

		/* Free regions should be empty */
		btrfs_warn_in_rcu(
			device->fs_info,
		"zoned: resetting device %s (devid %llu) zone %llu for allocation",
			rcu_str_deref(device->name), device->devid, pos >> shift);
		WARN_ON_ONCE(1);

		ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
					      &reset_bytes);
		if (ret)
			return ret;
	}

	return 0;
}

/*
 * Calculate an allocation pointer from the extent allocation information
 * for a block group consisting of conventional zones. It is pointed at the
 * end of the highest addressed extent in the block group as an allocation
 * offset.
 */
static int calculate_alloc_pointer(struct btrfs_block_group *cache,
				   u64 *offset_ret)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	int ret;
	u64 length;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = cache->start + cache->length;
	key.type = 0;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	/* We should not find the exact match */
	if (!ret)
		ret = -EUCLEAN;
	if (ret < 0)
		goto out;

	ret = btrfs_previous_extent_item(root, path, cache->start);
	if (ret) {
		if (ret == 1) {
			ret = 0;
			*offset_ret = 0;
		}
		goto out;
	}

	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);

	if (found_key.type == BTRFS_EXTENT_ITEM_KEY)
		length = found_key.offset;
	else
		length = fs_info->nodesize;

	if (!(found_key.objectid >= cache->start &&
	      found_key.objectid + length <= cache->start + cache->length)) {
		ret = -EUCLEAN;
		goto out;
	}
	*offset_ret = found_key.objectid + length - cache->start;
	ret = 0;

out:
	btrfs_free_path(path);
	return ret;
}
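/*
 * Worked example for calculate_alloc_pointer(): for a block group starting at
 * logical 1GiB with length 256MiB, whose highest extent item is
 * (1GiB + 96MiB, EXTENT_ITEM, 32MiB), the resulting allocation offset is
 * 96MiB + 32MiB = 128MiB from the start of the block group. A metadata item
 * uses fs_info->nodesize as its length instead of the key offset.
 */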
int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_device *device;
	u64 logical = cache->start;
	u64 length = cache->length;
	u64 physical = 0;
	int ret;
	int i;
	unsigned int nofs_flag;
	u64 *alloc_offsets = NULL;
	u64 last_alloc = 0;
	u32 num_sequential = 0, num_conventional = 0;

	if (!btrfs_is_zoned(fs_info))
		return 0;

	/* Sanity check */
	if (!IS_ALIGNED(length, fs_info->zone_size)) {
		btrfs_err(fs_info,
		"zoned: block group %llu len %llu unaligned to zone size %llu",
			  logical, length, fs_info->zone_size);
		return -EIO;
	}

	/* Get the chunk mapping */
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, logical, length);
	read_unlock(&em_tree->lock);

	if (!em)
		return -EINVAL;

	map = em->map_lookup;

	alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS);
	if (!alloc_offsets) {
		free_extent_map(em);
		return -ENOMEM;
	}

	for (i = 0; i < map->num_stripes; i++) {
		bool is_sequential;
		struct blk_zone zone;
		struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
		int dev_replace_is_ongoing = 0;

		device = map->stripes[i].dev;
		physical = map->stripes[i].physical;

		if (device->bdev == NULL) {
			alloc_offsets[i] = WP_MISSING_DEV;
			continue;
		}

		is_sequential = btrfs_dev_is_sequential(device, physical);
		if (is_sequential)
			num_sequential++;
		else
			num_conventional++;

		if (!is_sequential) {
			alloc_offsets[i] = WP_CONVENTIONAL;
			continue;
		}

		/*
		 * This zone will be used for allocation, so mark this zone
		 * non-empty.
		 */
		btrfs_dev_clear_zone_empty(device, physical);

		down_read(&dev_replace->rwsem);
		dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
		if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
			btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical);
		up_read(&dev_replace->rwsem);

		/*
		 * The group is mapped to a sequential zone. Get the zone write
		 * pointer to determine the allocation offset within the zone.
		 */
		WARN_ON(!IS_ALIGNED(physical, fs_info->zone_size));
		nofs_flag = memalloc_nofs_save();
		ret = btrfs_get_dev_zone(device, physical, &zone);
		memalloc_nofs_restore(nofs_flag);
		if (ret == -EIO || ret == -EOPNOTSUPP) {
			ret = 0;
			alloc_offsets[i] = WP_MISSING_DEV;
			continue;
		} else if (ret) {
			goto out;
		}

		switch (zone.cond) {
		case BLK_ZONE_COND_OFFLINE:
		case BLK_ZONE_COND_READONLY:
			btrfs_err(fs_info,
		"zoned: offline/readonly zone %llu on device %s (devid %llu)",
				  physical >> device->zone_info->zone_size_shift,
				  rcu_str_deref(device->name), device->devid);
			alloc_offsets[i] = WP_MISSING_DEV;
			break;
		case BLK_ZONE_COND_EMPTY:
			alloc_offsets[i] = 0;
			break;
		case BLK_ZONE_COND_FULL:
			alloc_offsets[i] = fs_info->zone_size;
			break;
		default:
			/* Partially used zone */
			alloc_offsets[i] =
					((zone.wp - zone.start) << SECTOR_SHIFT);
			break;
		}
	}

	if (num_sequential > 0)
		cache->seq_zone = true;

	if (num_conventional > 0) {
		/*
		 * Avoid calling calculate_alloc_pointer() for a new block
		 * group. It is of no use for a new block group, where the
		 * allocation offset must always be 0.
		 *
		 * Also, we have a lock chain of extent buffer lock ->
		 * chunk mutex. For a new block group, this function is
		 * called from btrfs_make_block_group() which is already
		 * taking the chunk mutex. Thus, we cannot call
		 * calculate_alloc_pointer(), which takes extent buffer
		 * locks, without risking a deadlock.
		 */
		if (new) {
			cache->alloc_offset = 0;
			goto out;
		}
		ret = calculate_alloc_pointer(cache, &last_alloc);
		if (ret || map->num_stripes == num_conventional) {
			if (!ret)
				cache->alloc_offset = last_alloc;
			else
				btrfs_err(fs_info,
			"zoned: failed to determine allocation offset of bg %llu",
					  cache->start);
			goto out;
		}
	}

	switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
	case 0: /* single */
		cache->alloc_offset = alloc_offsets[0];
		break;
	case BTRFS_BLOCK_GROUP_DUP:
	case BTRFS_BLOCK_GROUP_RAID1:
	case BTRFS_BLOCK_GROUP_RAID0:
	case BTRFS_BLOCK_GROUP_RAID10:
	case BTRFS_BLOCK_GROUP_RAID5:
	case BTRFS_BLOCK_GROUP_RAID6:
		/* non-single profiles are not supported yet */
	default:
		btrfs_err(fs_info, "zoned: profile %s not yet supported",
			  btrfs_bg_type_to_raid_name(map->type));
		ret = -EINVAL;
		goto out;
	}

out:
	/* An extent is allocated after the write pointer */
	if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
		btrfs_err(fs_info,
			  "zoned: got wrong write pointer in BG %llu: %llu > %llu",
			  logical, last_alloc, cache->alloc_offset);
		ret = -EIO;
	}

	if (!ret)
		cache->meta_write_pointer = cache->alloc_offset + cache->start;

	kfree(alloc_offsets);
	free_extent_map(em);

	return ret;
}
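/*
 * To illustrate the write pointer handling above: for a SINGLE profile block
 * group backed by one sequential zone whose write pointer sits 64MiB past the
 * zone start, alloc_offset becomes 64MiB and new allocations continue from
 * there; an EMPTY zone yields offset 0 and a FULL zone yields the zone size.
 */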
void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
{
	u64 unusable, free;

	if (!btrfs_is_zoned(cache->fs_info))
		return;

	WARN_ON(cache->bytes_super != 0);
	unusable = cache->alloc_offset - cache->used;
	free = cache->length - cache->alloc_offset;

	/* We only need ->free_space in ALLOC_SEQ block groups */
	cache->last_byte_to_unpin = (u64)-1;
	cache->cached = BTRFS_CACHE_FINISHED;
	cache->free_space_ctl->free_space = free;
	cache->zone_unusable = unusable;

	/* Should not have any excluded extents. Just in case, though */
	btrfs_free_excluded_extents(cache);
}
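/*
 * Worked example for btrfs_calc_zone_unusable(): a 256MiB block group with
 * alloc_offset 100MiB and 60MiB of live data has 156MiB of free space ahead
 * of the write pointer and 40MiB of zone_unusable space, i.e. bytes that were
 * written but later freed and cannot be reused until the zone is reclaimed.
 */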
void btrfs_redirty_list_add(struct btrfs_transaction *trans,
			    struct extent_buffer *eb)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;

	if (!btrfs_is_zoned(fs_info) ||
	    btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN) ||
	    !list_empty(&eb->release_list))
		return;

	set_extent_buffer_dirty(eb);
	set_extent_bits_nowait(&trans->dirty_pages, eb->start,
			       eb->start + eb->len - 1, EXTENT_DIRTY);
	memzero_extent_buffer(eb, 0, eb->len);
	set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags);

	spin_lock(&trans->releasing_ebs_lock);
	list_add_tail(&eb->release_list, &trans->releasing_ebs);
	spin_unlock(&trans->releasing_ebs_lock);
	atomic_inc(&eb->refs);
}

void btrfs_free_redirty_list(struct btrfs_transaction *trans)
{
	spin_lock(&trans->releasing_ebs_lock);
	while (!list_empty(&trans->releasing_ebs)) {
		struct extent_buffer *eb;

		eb = list_first_entry(&trans->releasing_ebs,
				      struct extent_buffer, release_list);
		list_del_init(&eb->release_list);
		free_extent_buffer(eb);
	}
	spin_unlock(&trans->releasing_ebs_lock);
}

bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_block_group *cache;
	bool ret = false;

	if (!btrfs_is_zoned(fs_info))
		return false;

	if (!fs_info->max_zone_append_size)
		return false;

	if (!is_data_inode(&inode->vfs_inode))
		return false;

	cache = btrfs_lookup_block_group(fs_info, em->block_start);
	ASSERT(cache);
	if (!cache)
		return false;

	ret = cache->seq_zone;
	btrfs_put_block_group(cache);

	return ret;
}

void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset,
				 struct bio *bio)
{
	struct btrfs_ordered_extent *ordered;
	const u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;

	if (bio_op(bio) != REQ_OP_ZONE_APPEND)
		return;

	ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), file_offset);
	if (WARN_ON(!ordered))
		return;

	ordered->physical = physical;
	ordered->disk = bio->bi_bdev->bd_disk;
	ordered->partno = bio->bi_bdev->bd_partno;

	btrfs_put_ordered_extent(ordered);
}
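/*
 * With REQ_OP_ZONE_APPEND the device, not the filesystem, picks the final
 * location inside the target zone and reports it back in bi_sector at I/O
 * completion. btrfs_record_physical_zoned() above captures that location in
 * the ordered extent so that btrfs_rewrite_logical_zoned() below can adjust
 * the logical address (and checksum bytenrs) to match where the data really
 * landed.
 */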
void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered)
{
	struct btrfs_inode *inode = BTRFS_I(ordered->inode);
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct btrfs_ordered_sum *sum;
	struct block_device *bdev;
	u64 orig_logical = ordered->disk_bytenr;
	u64 *logical = NULL;
	int nr, stripe_len;

	/* Zoned devices should not have partitions. So, we can assume it is 0 */
	ASSERT(ordered->partno == 0);
	bdev = bdgrab(ordered->disk->part0);
	if (WARN_ON(!bdev))
		return;

	if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, bdev,
				     ordered->physical, &logical, &nr,
				     &stripe_len)))
		goto out;

	WARN_ON(nr != 1);

	if (orig_logical == *logical)
		goto out;

	ordered->disk_bytenr = *logical;

	em_tree = &inode->extent_tree;
	write_lock(&em_tree->lock);
	em = search_extent_mapping(em_tree, ordered->file_offset,
				   ordered->num_bytes);
	em->block_start = *logical;
	free_extent_map(em);
	write_unlock(&em_tree->lock);

	list_for_each_entry(sum, &ordered->list, list) {
		if (*logical < orig_logical)
			sum->bytenr -= orig_logical - *logical;
		else
			sum->bytenr += *logical - orig_logical;
	}

out:
	kfree(logical);
	bdput(bdev);
}

bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb,
				    struct btrfs_block_group **cache_ret)
{
	struct btrfs_block_group *cache;
	bool ret = true;

	if (!btrfs_is_zoned(fs_info))
		return true;

	cache = *cache_ret;

	if (cache && (eb->start < cache->start ||
		      cache->start + cache->length <= eb->start)) {
		btrfs_put_block_group(cache);
		cache = NULL;
		*cache_ret = NULL;
	}

	if (!cache)
		cache = btrfs_lookup_block_group(fs_info, eb->start);

	if (cache) {
		if (cache->meta_write_pointer != eb->start) {
			btrfs_put_block_group(cache);
			cache = NULL;
			ret = false;
		} else {
			cache->meta_write_pointer = eb->start + eb->len;
		}

		*cache_ret = cache;
	}

	return ret;
}

void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache,
				     struct extent_buffer *eb)
{
	if (!btrfs_is_zoned(eb->fs_info) || !cache)
		return;

	ASSERT(cache->meta_write_pointer == eb->start + eb->len);
	cache->meta_write_pointer = eb->start;
}
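/*
 * Together, btrfs_check_meta_write_pointer() and
 * btrfs_revert_meta_write_pointer() keep metadata writeback sequential within
 * a zoned block group: an extent buffer is only submitted when its start
 * matches the cached meta_write_pointer, otherwise the caller has to defer it
 * and retry once the preceding buffers have been queued.
 */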
int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length)
{
	if (!btrfs_dev_is_sequential(device, physical))
		return -EOPNOTSUPP;

	return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT,
				    length >> SECTOR_SHIFT, GFP_NOFS, 0);
}

static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
			  struct blk_zone *zone)
{
	struct btrfs_bio *bbio = NULL;
	u64 mapped_length = PAGE_SIZE;
	unsigned int nofs_flag;
	int nmirrors;
	int i, ret;

	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
			       &mapped_length, &bbio);
	if (ret || !bbio || mapped_length < PAGE_SIZE) {
		btrfs_put_bbio(bbio);
		return -EIO;
	}

	if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		btrfs_put_bbio(bbio);
		return -EINVAL;
	}

	nofs_flag = memalloc_nofs_save();
	nmirrors = (int)bbio->num_stripes;
	for (i = 0; i < nmirrors; i++) {
		u64 physical = bbio->stripes[i].physical;
		struct btrfs_device *dev = bbio->stripes[i].dev;

		/* Missing device */
		if (!dev->bdev)
			continue;

		ret = btrfs_get_dev_zone(dev, physical, zone);
		/* Failing device */
		if (ret == -EIO || ret == -EOPNOTSUPP)
			continue;
		break;
	}
	memalloc_nofs_restore(nofs_flag);
	btrfs_put_bbio(bbio);

	return ret;
}

/*
 * Synchronize write pointer in a zone at @physical_start on @tgt_dev, by
 * filling zeros from @physical_pos up to the write pointer of the dev-replace
 * source device.
 */
int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
				  u64 physical_start, u64 physical_pos)
{
	struct btrfs_fs_info *fs_info = tgt_dev->fs_info;
	struct blk_zone zone;
	u64 length;
	u64 wp;
	int ret;

	if (!btrfs_dev_is_sequential(tgt_dev, physical_pos))
		return 0;

	ret = read_zone_info(fs_info, logical, &zone);
	if (ret)
		return ret;

	wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT);

	if (physical_pos == wp)
		return 0;

	if (physical_pos > wp)
		return -EUCLEAN;

	length = wp - physical_pos;
	return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length);
}