// SPDX-License-Identifier: GPL-2.0

#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/sched/mm.h>
#include "ctree.h"
#include "volumes.h"
#include "zoned.h"
#include "rcu-string.h"
#include "disk-io.h"
#include "block-group.h"
#include "transaction.h"
#include "dev-replace.h"
#include "space-info.h"

/* Maximum number of zones to report per blkdev_report_zones() call */
#define BTRFS_REPORT_NR_ZONES		4096
/* Invalid allocation pointer value for missing devices */
#define WP_MISSING_DEV			((u64)-1)
/* Pseudo write pointer value for conventional zone */
#define WP_CONVENTIONAL			((u64)-2)

/*
 * Location of the first zone of superblock logging zone pairs.
 *
 * - primary superblock: 0B (zone 0)
 * - first copy: 512G (zone starting at that offset)
 * - second copy: 4T (zone starting at that offset)
 */
#define BTRFS_SB_LOG_PRIMARY_OFFSET	(0ULL)
#define BTRFS_SB_LOG_FIRST_OFFSET	(512ULL * SZ_1G)
#define BTRFS_SB_LOG_SECOND_OFFSET	(4096ULL * SZ_1G)

#define BTRFS_SB_LOG_FIRST_SHIFT	const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
#define BTRFS_SB_LOG_SECOND_SHIFT	const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET)

/* Number of superblock log zones */
#define BTRFS_NR_SB_LOG_ZONES		2

/*
 * Maximum supported zone size. Currently, SMR disks have a zone size of
 * 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not
 * expect the zone size to become larger than 8GiB in the near future.
 */
#define BTRFS_MAX_ZONE_SIZE		SZ_8G

static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
{
	struct blk_zone *zones = data;

	memcpy(&zones[idx], zone, sizeof(*zone));

	return 0;
}

static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
			    u64 *wp_ret)
{
	bool empty[BTRFS_NR_SB_LOG_ZONES];
	bool full[BTRFS_NR_SB_LOG_ZONES];
	sector_t sector;

	ASSERT(zones[0].type != BLK_ZONE_TYPE_CONVENTIONAL &&
	       zones[1].type != BLK_ZONE_TYPE_CONVENTIONAL);

	empty[0] = (zones[0].cond == BLK_ZONE_COND_EMPTY);
	empty[1] = (zones[1].cond == BLK_ZONE_COND_EMPTY);
	full[0] = (zones[0].cond == BLK_ZONE_COND_FULL);
	full[1] = (zones[1].cond == BLK_ZONE_COND_FULL);

	/*
	 * Possible states of log buffer zones
	 *
	 *           Empty[0]  In use[0]  Full[0]
	 * Empty[1]         *          x        0
	 * In use[1]        0          x        0
	 * Full[1]          1          1        C
	 *
	 * Log position:
	 *   *: Special case, no superblock is written
	 *   0: Use write pointer of zones[0]
	 *   1: Use write pointer of zones[1]
	 *   C: Compare super blocks from zones[0] and zones[1], use the latest
	 *      one determined by generation
	 *   x: Invalid state
	 */

	if (empty[0] && empty[1]) {
		/* Special case to distinguish no superblock to read */
		*wp_ret = zones[0].start << SECTOR_SHIFT;
		return -ENOENT;
	} else if (full[0] && full[1]) {
		/* Compare two super blocks */
		struct address_space *mapping = bdev->bd_inode->i_mapping;
		struct page *page[BTRFS_NR_SB_LOG_ZONES];
		struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
		int i;

		for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
			u64 bytenr;

			bytenr = ((zones[i].start + zones[i].len)
				   << SECTOR_SHIFT) - BTRFS_SUPER_INFO_SIZE;

			page[i] = read_cache_page_gfp(mapping,
					bytenr >> PAGE_SHIFT, GFP_NOFS);
			if (IS_ERR(page[i])) {
				if (i == 1)
					btrfs_release_disk_super(super[0]);
				return PTR_ERR(page[i]);
			}
			super[i] = page_address(page[i]);
		}
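
		/*
		 * Both zones are full. The zone whose superblock has the
		 * higher generation holds the latest copy, so return the
		 * start of the other zone: it is the one that gets reset
		 * and written next (sb_log_location() converts this into
		 * the latest copy's location for reads).
		 */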
		if (super[0]->generation > super[1]->generation)
			sector = zones[1].start;
		else
			sector = zones[0].start;

		for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
			btrfs_release_disk_super(super[i]);
	} else if (!full[0] && (empty[1] || full[1])) {
		sector = zones[0].wp;
	} else if (full[0]) {
		sector = zones[1].wp;
	} else {
		return -EUCLEAN;
	}
	*wp_ret = sector << SECTOR_SHIFT;
	return 0;
}

/*
 * Get the first zone number of the superblock mirror
 */
static inline u32 sb_zone_number(int shift, int mirror)
{
	u64 zone;

	ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
	switch (mirror) {
	case 0: zone = 0; break;
	case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break;
	case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break;
	}

	ASSERT(zone <= U32_MAX);

	return (u32)zone;
}

static inline sector_t zone_start_sector(u32 zone_number,
					 struct block_device *bdev)
{
	return (sector_t)zone_number << ilog2(bdev_zone_sectors(bdev));
}

static inline u64 zone_start_physical(u32 zone_number,
				      struct btrfs_zoned_device_info *zone_info)
{
	return (u64)zone_number << zone_info->zone_size_shift;
}

/*
 * Emulate blkdev_report_zones() for a non-zoned device. It slices up the block
 * device into static sized chunks and fakes a conventional zone on each of
 * them.
 */
static int emulate_report_zones(struct btrfs_device *device, u64 pos,
				struct blk_zone *zones, unsigned int nr_zones)
{
	const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT;
	sector_t bdev_size = bdev_nr_sectors(device->bdev);
	unsigned int i;

	pos >>= SECTOR_SHIFT;
	for (i = 0; i < nr_zones; i++) {
		zones[i].start = i * zone_sectors + pos;
		zones[i].len = zone_sectors;
		zones[i].capacity = zone_sectors;
		zones[i].wp = zones[i].start + zone_sectors;
		zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL;
		zones[i].cond = BLK_ZONE_COND_NOT_WP;

		if (zones[i].wp >= bdev_size) {
			i++;
			break;
		}
	}

	return i;
}

static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
			       struct blk_zone *zones, unsigned int *nr_zones)
{
	int ret;

	if (!*nr_zones)
		return 0;

	if (!bdev_is_zoned(device->bdev)) {
		ret = emulate_report_zones(device, pos, zones, *nr_zones);
		*nr_zones = ret;
		return 0;
	}

	ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
				  copy_zone_info_cb, zones);
	if (ret < 0) {
		btrfs_err_in_rcu(device->fs_info,
				 "zoned: failed to read zone %llu on %s (devid %llu)",
				 pos, rcu_str_deref(device->name),
				 device->devid);
		return ret;
	}
	*nr_zones = ret;
	if (!ret)
		return -EIO;

	return 0;
}

/* The emulated zone size is determined from the size of the first device extent */
static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
{
	struct btrfs_path *path;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_dev_extent *dext;
	int ret = 0;

	key.objectid = 1;
	key.type = BTRFS_DEV_EXTENT_KEY;
	key.offset = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
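
	/*
	 * btrfs_search_slot() may leave the path one past the last item of
	 * its leaf, so step to the next leaf if needed. The first device
	 * extent found determines the emulated zone size.
	 */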
	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
		ret = btrfs_next_leaf(root, path);
		if (ret < 0)
			goto out;
		/* No dev extents at all? Not good */
		if (ret > 0) {
			ret = -EUCLEAN;
			goto out;
		}
	}

	leaf = path->nodes[0];
	dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
	fs_info->zone_size = btrfs_dev_extent_length(leaf, dext);
	ret = 0;

out:
	btrfs_free_path(path);

	return ret;
}

int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	int ret = 0;

	/* fs_info->zone_size might not be set yet. Use the incompat flag here. */
	if (!btrfs_fs_incompat(fs_info, ZONED))
		return 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		/* We can skip reading of zone info for missing devices */
		if (!device->bdev)
			continue;

		ret = btrfs_get_dev_zone_info(device);
		if (ret)
			break;
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}

int btrfs_get_dev_zone_info(struct btrfs_device *device)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_zoned_device_info *zone_info = NULL;
	struct block_device *bdev = device->bdev;
	sector_t nr_sectors;
	sector_t sector = 0;
	struct blk_zone *zones = NULL;
	unsigned int i, nreported = 0, nr_zones;
	sector_t zone_sectors;
	char *model, *emulated;
	int ret;

	/*
	 * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not
	 * yet be set.
	 */
	if (!btrfs_fs_incompat(fs_info, ZONED))
		return 0;

	if (device->zone_info)
		return 0;

	zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL);
	if (!zone_info)
		return -ENOMEM;

	if (!bdev_is_zoned(bdev)) {
		if (!fs_info->zone_size) {
			ret = calculate_emulated_zone_size(fs_info);
			if (ret)
				goto out;
		}

		ASSERT(fs_info->zone_size);
		zone_sectors = fs_info->zone_size >> SECTOR_SHIFT;
	} else {
		zone_sectors = bdev_zone_sectors(bdev);
	}

	/* Check if it's power of 2 (see is_power_of_2) */
	ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0);
	zone_info->zone_size = zone_sectors << SECTOR_SHIFT;

	/* We reject devices with a zone size larger than 8GB */
	if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) {
		btrfs_err_in_rcu(fs_info,
		"zoned: %s: zone size %llu larger than supported maximum %llu",
				 rcu_str_deref(device->name),
				 zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
		ret = -EINVAL;
		goto out;
	}

	nr_sectors = bdev_nr_sectors(bdev);
	zone_info->zone_size_shift = ilog2(zone_info->zone_size);
	zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
	if (!IS_ALIGNED(nr_sectors, zone_sectors))
		zone_info->nr_zones++;

	zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->seq_zones) {
		ret = -ENOMEM;
		goto out;
	}

	zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->empty_zones) {
		ret = -ENOMEM;
		goto out;
	}

	zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
	if (!zones) {
		ret = -ENOMEM;
		goto out;
	}

	/* Get zone types */
	while (sector < nr_sectors) {
		nr_zones = BTRFS_REPORT_NR_ZONES;
		ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
					  &nr_zones);
		if (ret)
			goto out;

		for (i = 0; i < nr_zones; i++) {
			if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
				__set_bit(nreported, zone_info->seq_zones);
			if (zones[i].cond == BLK_ZONE_COND_EMPTY)
				__set_bit(nreported, zone_info->empty_zones);
			nreported++;
		}
		sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
	}

	if (nreported != zone_info->nr_zones) {
		btrfs_err_in_rcu(device->fs_info,
				 "inconsistent number of zones on %s (%u/%u)",
				 rcu_str_deref(device->name), nreported,
				 zone_info->nr_zones);
		ret = -EIO;
		goto out;
	}

	/* Validate superblock log */
	nr_zones = BTRFS_NR_SB_LOG_ZONES;
	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		u32 sb_zone;
		u64 sb_wp;
		int sb_pos = BTRFS_NR_SB_LOG_ZONES * i;

		sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
		if (sb_zone + 1 >= zone_info->nr_zones)
			continue;

		ret = btrfs_get_dev_zones(device,
					  zone_start_physical(sb_zone, zone_info),
					  &zone_info->sb_zones[sb_pos],
					  &nr_zones);
		if (ret)
			goto out;

		if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
			btrfs_err_in_rcu(device->fs_info,
	"zoned: failed to read super block log zone info at devid %llu zone %u",
					 device->devid, sb_zone);
			ret = -EUCLEAN;
			goto out;
		}

		/*
		 * If zones[0] is conventional, always use the beginning of the
		 * zone to record superblock. No need to validate in that case.
		 */
		if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type ==
		    BLK_ZONE_TYPE_CONVENTIONAL)
			continue;

		ret = sb_write_pointer(device->bdev,
				       &zone_info->sb_zones[sb_pos], &sb_wp);
		if (ret != -ENOENT && ret) {
			btrfs_err_in_rcu(device->fs_info,
			"zoned: super block log zone corrupted devid %llu zone %u",
					 device->devid, sb_zone);
			ret = -EUCLEAN;
			goto out;
		}
	}

	kfree(zones);

	device->zone_info = zone_info;

	switch (bdev_zoned_model(bdev)) {
	case BLK_ZONED_HM:
		model = "host-managed zoned";
		emulated = "";
		break;
	case BLK_ZONED_HA:
		model = "host-aware zoned";
		emulated = "";
		break;
	case BLK_ZONED_NONE:
		model = "regular";
		emulated = "emulated ";
		break;
	default:
		/* Just in case */
		btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s",
				 bdev_zoned_model(bdev),
				 rcu_str_deref(device->name));
		ret = -EOPNOTSUPP;
		goto out_free_zone_info;
	}

	btrfs_info_in_rcu(fs_info,
		"%s block device %s, %u %szones of %llu bytes",
		model, rcu_str_deref(device->name), zone_info->nr_zones,
		emulated, zone_info->zone_size);

	return 0;

out:
	kfree(zones);
out_free_zone_info:
	bitmap_free(zone_info->empty_zones);
	bitmap_free(zone_info->seq_zones);
	kfree(zone_info);
	device->zone_info = NULL;

	return ret;
}

void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
{
	struct btrfs_zoned_device_info *zone_info = device->zone_info;

	if (!zone_info)
		return;

	bitmap_free(zone_info->seq_zones);
	bitmap_free(zone_info->empty_zones);
	kfree(zone_info);
	device->zone_info = NULL;
}
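
/*
 * Report the single zone containing byte offset @pos on @device. For a
 * non-zoned device an emulated conventional zone is returned. Returns -EIO
 * if the device reports no zone at @pos.
 */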
int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
		       struct blk_zone *zone)
{
	unsigned int nr_zones = 1;
	int ret;

	ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones);
	if (ret != 0 || !nr_zones)
		return ret ? ret : -EIO;

	return 0;
}

int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	u64 zoned_devices = 0;
	u64 nr_devices = 0;
	u64 zone_size = 0;
	const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED);
	int ret = 0;

	/* Count zoned devices */
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		enum blk_zoned_model model;

		if (!device->bdev)
			continue;

		model = bdev_zoned_model(device->bdev);
		/*
		 * A host-managed zoned device must be used as a zoned device.
		 * A host-aware zoned device and a non-zoned device can be
		 * treated as a zoned device, if the ZONED flag is enabled in
		 * the superblock.
		 */
		if (model == BLK_ZONED_HM ||
		    (model == BLK_ZONED_HA && incompat_zoned) ||
		    (model == BLK_ZONED_NONE && incompat_zoned)) {
			struct btrfs_zoned_device_info *zone_info =
				device->zone_info;

			zoned_devices++;
			if (!zone_size) {
				zone_size = zone_info->zone_size;
			} else if (zone_info->zone_size != zone_size) {
				btrfs_err(fs_info,
		"zoned: unequal block device zone sizes: have %llu found %llu",
					  device->zone_info->zone_size,
					  zone_size);
				ret = -EINVAL;
				goto out;
			}
		}
		nr_devices++;
	}

	if (!zoned_devices && !incompat_zoned)
		goto out;

	if (!zoned_devices && incompat_zoned) {
		/* No zoned block device found on ZONED filesystem */
		btrfs_err(fs_info,
			  "zoned: no zoned devices found on a zoned filesystem");
		ret = -EINVAL;
		goto out;
	}

	if (zoned_devices && !incompat_zoned) {
		btrfs_err(fs_info,
			  "zoned: mode not enabled but zoned device found");
		ret = -EINVAL;
		goto out;
	}

	if (zoned_devices != nr_devices) {
		btrfs_err(fs_info,
			  "zoned: cannot mix zoned and regular devices");
		ret = -EINVAL;
		goto out;
	}

	/*
	 * stripe_size is always aligned to BTRFS_STRIPE_LEN in
	 * __btrfs_alloc_chunk(). Since we want stripe_len == zone_size,
	 * check the alignment here.
	 */
	if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
		btrfs_err(fs_info,
			  "zoned: zone size %llu not aligned to stripe %u",
			  zone_size, BTRFS_STRIPE_LEN);
		ret = -EINVAL;
		goto out;
	}

	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		btrfs_err(fs_info, "zoned: mixed block groups not supported");
		ret = -EINVAL;
		goto out;
	}

	fs_info->zone_size = zone_size;
	fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;

	/*
	 * Check mount options here, because we might change fs_info->zoned
	 * from fs_info->zone_size.
	 */
	ret = btrfs_check_mountopts_zoned(fs_info);
	if (ret)
		goto out;

	btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
out:
	return ret;
}

int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
{
	if (!btrfs_is_zoned(info))
		return 0;

	/*
	 * Space cache writing is not COWed. Disable that to avoid write errors
	 * in sequential zones.
	 */
	if (btrfs_test_opt(info, SPACE_CACHE)) {
		btrfs_err(info, "zoned: space cache v1 is not supported");
		return -EINVAL;
	}

	if (btrfs_test_opt(info, NODATACOW)) {
		btrfs_err(info, "zoned: NODATACOW not supported");
		return -EINVAL;
	}

	return 0;
}

static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
			   int rw, u64 *bytenr_ret)
{
	u64 wp;
	int ret;

	if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
		*bytenr_ret = zones[0].start << SECTOR_SHIFT;
		return 0;
	}

	ret = sb_write_pointer(bdev, zones, &wp);
	if (ret != -ENOENT && ret < 0)
		return ret;

	if (rw == WRITE) {
		struct blk_zone *reset = NULL;

		if (wp == zones[0].start << SECTOR_SHIFT)
			reset = &zones[0];
		else if (wp == zones[1].start << SECTOR_SHIFT)
			reset = &zones[1];

		if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
			ASSERT(reset->cond == BLK_ZONE_COND_FULL);

			ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
					       reset->start, reset->len,
					       GFP_NOFS);
			if (ret)
				return ret;

			reset->cond = BLK_ZONE_COND_EMPTY;
			reset->wp = reset->start;
		}
	} else if (ret != -ENOENT) {
		/* For READ, we want the previous one */
		if (wp == zones[0].start << SECTOR_SHIFT)
			wp = (zones[1].start + zones[1].len) << SECTOR_SHIFT;
		wp -= BTRFS_SUPER_INFO_SIZE;
	}

	*bytenr_ret = wp;
	return 0;
}

int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
			       u64 *bytenr_ret)
{
	struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES];
	sector_t zone_sectors;
	u32 sb_zone;
	int ret;
	u8 zone_sectors_shift;
	sector_t nr_sectors;
	u32 nr_zones;

	if (!bdev_is_zoned(bdev)) {
		*bytenr_ret = btrfs_sb_offset(mirror);
		return 0;
	}

	ASSERT(rw == READ || rw == WRITE);

	zone_sectors = bdev_zone_sectors(bdev);
	if (!is_power_of_2(zone_sectors))
		return -EINVAL;
	zone_sectors_shift = ilog2(zone_sectors);
	nr_sectors = bdev_nr_sectors(bdev);
	nr_zones = nr_sectors >> zone_sectors_shift;

	sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
	if (sb_zone + 1 >= nr_zones)
		return -ENOENT;

	ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev),
				  BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
				  zones);
	if (ret < 0)
		return ret;
	if (ret != BTRFS_NR_SB_LOG_ZONES)
		return -EIO;

	return sb_log_location(bdev, zones, rw, bytenr_ret);
}
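
/*
 * Return the superblock log location for @mirror on @device. Unlike
 * btrfs_sb_log_location_bdev() above, which re-reports the zones on every
 * call, this variant uses the zone information cached in
 * device->zone_info->sb_zones by btrfs_get_dev_zone_info().
 */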
int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
			  u64 *bytenr_ret)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	u32 zone_num;

	/*
	 * For a zoned filesystem on a non-zoned block device, use the same
	 * super block locations as a regular filesystem. Doing so, the super
	 * block can always be retrieved and the zoned flag of the volume
	 * detected from the super block information.
	 */
	if (!bdev_is_zoned(device->bdev)) {
		*bytenr_ret = btrfs_sb_offset(mirror);
		return 0;
	}

	zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
	if (zone_num + 1 >= zinfo->nr_zones)
		return -ENOENT;

	return sb_log_location(device->bdev,
			       &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror],
			       rw, bytenr_ret);
}

static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
				  int mirror)
{
	u32 zone_num;

	if (!zinfo)
		return false;

	zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
	if (zone_num + 1 >= zinfo->nr_zones)
		return false;

	if (!test_bit(zone_num, zinfo->seq_zones))
		return false;

	return true;
}

void btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	struct blk_zone *zone;

	if (!is_sb_log_zone(zinfo, mirror))
		return;

	zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
	if (zone->cond != BLK_ZONE_COND_FULL) {
		if (zone->cond == BLK_ZONE_COND_EMPTY)
			zone->cond = BLK_ZONE_COND_IMP_OPEN;

		zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);

		if (zone->wp == zone->start + zone->len)
			zone->cond = BLK_ZONE_COND_FULL;

		return;
	}

	zone++;
	ASSERT(zone->cond != BLK_ZONE_COND_FULL);
	if (zone->cond == BLK_ZONE_COND_EMPTY)
		zone->cond = BLK_ZONE_COND_IMP_OPEN;

	zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);

	if (zone->wp == zone->start + zone->len)
		zone->cond = BLK_ZONE_COND_FULL;
}

int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
{
	sector_t zone_sectors;
	sector_t nr_sectors;
	u8 zone_sectors_shift;
	u32 sb_zone;
	u32 nr_zones;

	zone_sectors = bdev_zone_sectors(bdev);
	zone_sectors_shift = ilog2(zone_sectors);
	nr_sectors = bdev_nr_sectors(bdev);
	nr_zones = nr_sectors >> zone_sectors_shift;

	sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
	if (sb_zone + 1 >= nr_zones)
		return -ENOENT;

	return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
				zone_start_sector(sb_zone, bdev),
				zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
}

/**
 * btrfs_find_allocatable_zones - find allocatable zones within a given region
 *
 * @device:	the device to allocate a region on
 * @hole_start: the position of the hole to allocate the region
 * @hole_end:	the end of the hole
 * @num_bytes:	size of wanted region
 * @return:	position of allocatable zones
 *
 * Allocatable region should not contain any superblock locations.
 */
u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
				 u64 hole_end, u64 num_bytes)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	const u8 shift = zinfo->zone_size_shift;
	u64 nzones = num_bytes >> shift;
	u64 pos = hole_start;
	u64 begin, end;
	bool have_sb;
	int i;

	ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
	ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));

	while (pos < hole_end) {
		begin = pos >> shift;
		end = begin + nzones;

		if (end > zinfo->nr_zones)
			return hole_end;

		/* Check if zones in the region are all empty */
		if (btrfs_dev_is_sequential(device, pos) &&
		    find_next_zero_bit(zinfo->empty_zones, end, begin) != end) {
			pos += zinfo->zone_size;
			continue;
		}

		have_sb = false;
		for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
			u32 sb_zone;
			u64 sb_pos;

			sb_zone = sb_zone_number(shift, i);
			if (!(end <= sb_zone ||
			      sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) {
				have_sb = true;
				pos = zone_start_physical(
					sb_zone + BTRFS_NR_SB_LOG_ZONES, zinfo);
				break;
			}

			/* We also need to exclude regular superblock positions */
			sb_pos = btrfs_sb_offset(i);
			if (!(pos + num_bytes <= sb_pos ||
			      sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) {
				have_sb = true;
				pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE,
					    zinfo->zone_size);
				break;
			}
		}
		if (!have_sb)
			break;
	}

	return pos;
}

int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
			    u64 length, u64 *bytes)
{
	int ret;

	*bytes = 0;
	ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
			       physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
			       GFP_NOFS);
	if (ret)
		return ret;

	*bytes = length;
	while (length) {
		btrfs_dev_set_zone_empty(device, physical);
		physical += device->zone_info->zone_size;
		length -= device->zone_info->zone_size;
	}

	return 0;
}

int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	const u8 shift = zinfo->zone_size_shift;
	unsigned long begin = start >> shift;
	unsigned long end = (start + size) >> shift;
	u64 pos;
	int ret;

	ASSERT(IS_ALIGNED(start, zinfo->zone_size));
	ASSERT(IS_ALIGNED(size, zinfo->zone_size));

	if (end > zinfo->nr_zones)
		return -ERANGE;

	/* All the zones in [begin, end) are conventional */
	if (find_next_bit(zinfo->seq_zones, end, begin) == end)
		return 0;

	/* All the zones in [begin, end) are sequential and empty */
	if (find_next_zero_bit(zinfo->seq_zones, end, begin) == end &&
	    find_next_zero_bit(zinfo->empty_zones, end, begin) == end)
		return 0;

	for (pos = start; pos < start + size; pos += zinfo->zone_size) {
		u64 reset_bytes;

		if (!btrfs_dev_is_sequential(device, pos) ||
		    btrfs_dev_is_empty_zone(device, pos))
			continue;

		/* Free regions should be empty */
		btrfs_warn_in_rcu(
			device->fs_info,
		"zoned: resetting device %s (devid %llu) zone %llu for allocation",
			rcu_str_deref(device->name), device->devid, pos >> shift);
		WARN_ON_ONCE(1);

		ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
					      &reset_bytes);
		if (ret)
			return ret;
	}

	return 0;
}

/*
 * Calculate an allocation pointer from the extent allocation information
 * for a block group consisting of conventional zones. It is pointed to the
 * end of the highest addressed extent in the block group as an allocation
 * offset.
 */
static int calculate_alloc_pointer(struct btrfs_block_group *cache,
				   u64 *offset_ret)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	int ret;
	u64 length;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = cache->start + cache->length;
	key.type = 0;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	/* We should not find the exact match */
	if (!ret)
		ret = -EUCLEAN;
	if (ret < 0)
		goto out;

	ret = btrfs_previous_extent_item(root, path, cache->start);
	if (ret) {
		if (ret == 1) {
			ret = 0;
			*offset_ret = 0;
		}
		goto out;
	}

	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);

	if (found_key.type == BTRFS_EXTENT_ITEM_KEY)
		length = found_key.offset;
	else
		length = fs_info->nodesize;

	if (!(found_key.objectid >= cache->start &&
	      found_key.objectid + length <= cache->start + cache->length)) {
		ret = -EUCLEAN;
		goto out;
	}
	*offset_ret = found_key.objectid + length - cache->start;
	ret = 0;

out:
	btrfs_free_path(path);
	return ret;
}
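
/*
 * For each stripe of the block group's chunk, derive an allocation offset:
 * a sequential zone uses its write pointer, a conventional zone gets
 * WP_CONVENTIONAL and is resolved via calculate_alloc_pointer() above, and
 * a missing or unreadable device gets WP_MISSING_DEV.
 */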
int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_device *device;
	u64 logical = cache->start;
	u64 length = cache->length;
	u64 physical = 0;
	int ret;
	int i;
	unsigned int nofs_flag;
	u64 *alloc_offsets = NULL;
	u64 last_alloc = 0;
	u32 num_sequential = 0, num_conventional = 0;

	if (!btrfs_is_zoned(fs_info))
		return 0;

	/* Sanity check */
	if (!IS_ALIGNED(length, fs_info->zone_size)) {
		btrfs_err(fs_info,
		"zoned: block group %llu len %llu unaligned to zone size %llu",
			  logical, length, fs_info->zone_size);
		return -EIO;
	}

	/* Get the chunk mapping */
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, logical, length);
	read_unlock(&em_tree->lock);

	if (!em)
		return -EINVAL;

	map = em->map_lookup;

	alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS);
	if (!alloc_offsets) {
		free_extent_map(em);
		return -ENOMEM;
	}

	for (i = 0; i < map->num_stripes; i++) {
		bool is_sequential;
		struct blk_zone zone;
		struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
		int dev_replace_is_ongoing = 0;

		device = map->stripes[i].dev;
		physical = map->stripes[i].physical;

		if (device->bdev == NULL) {
			alloc_offsets[i] = WP_MISSING_DEV;
			continue;
		}

		is_sequential = btrfs_dev_is_sequential(device, physical);
		if (is_sequential)
			num_sequential++;
		else
			num_conventional++;

		if (!is_sequential) {
			alloc_offsets[i] = WP_CONVENTIONAL;
			continue;
		}

		/*
		 * This zone will be used for allocation, so mark this zone
		 * non-empty.
		 */
		btrfs_dev_clear_zone_empty(device, physical);

		down_read(&dev_replace->rwsem);
		dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
		if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
			btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical);
		up_read(&dev_replace->rwsem);

		/*
		 * The group is mapped to a sequential zone. Get the zone write
		 * pointer to determine the allocation offset within the zone.
		 */
		WARN_ON(!IS_ALIGNED(physical, fs_info->zone_size));
		nofs_flag = memalloc_nofs_save();
		ret = btrfs_get_dev_zone(device, physical, &zone);
		memalloc_nofs_restore(nofs_flag);
		if (ret == -EIO || ret == -EOPNOTSUPP) {
			ret = 0;
			alloc_offsets[i] = WP_MISSING_DEV;
			continue;
		} else if (ret) {
			goto out;
		}

		if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
			btrfs_err_in_rcu(fs_info,
	"zoned: unexpected conventional zone %llu on device %s (devid %llu)",
				zone.start << SECTOR_SHIFT,
				rcu_str_deref(device->name), device->devid);
			ret = -EIO;
			goto out;
		}

		switch (zone.cond) {
		case BLK_ZONE_COND_OFFLINE:
		case BLK_ZONE_COND_READONLY:
			btrfs_err(fs_info,
		"zoned: offline/readonly zone %llu on device %s (devid %llu)",
				  physical >> device->zone_info->zone_size_shift,
				  rcu_str_deref(device->name), device->devid);
			alloc_offsets[i] = WP_MISSING_DEV;
			break;
		case BLK_ZONE_COND_EMPTY:
			alloc_offsets[i] = 0;
			break;
		case BLK_ZONE_COND_FULL:
			alloc_offsets[i] = fs_info->zone_size;
			break;
		default:
			/* Partially used zone */
			alloc_offsets[i] =
					((zone.wp - zone.start) << SECTOR_SHIFT);
			break;
		}
	}

	if (num_sequential > 0)
		cache->seq_zone = true;

	if (num_conventional > 0) {
		/*
		 * Avoid calling calculate_alloc_pointer() for a new block
		 * group: it is pointless there, as the allocation offset
		 * must always be 0.
		 *
		 * Also, we have a lock chain of extent buffer lock ->
		 * chunk mutex. For a new block group, this function is
		 * called from btrfs_make_block_group() which already holds
		 * the chunk mutex. Thus, we cannot call
		 * calculate_alloc_pointer(), which takes extent buffer
		 * locks, without risking a deadlock.
		 */
		if (new) {
			cache->alloc_offset = 0;
			goto out;
		}
		ret = calculate_alloc_pointer(cache, &last_alloc);
		if (ret || map->num_stripes == num_conventional) {
			if (!ret)
				cache->alloc_offset = last_alloc;
			else
				btrfs_err(fs_info,
			"zoned: failed to determine allocation offset of bg %llu",
					  cache->start);
			goto out;
		}
	}

	switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
	case 0: /* single */
		if (alloc_offsets[0] == WP_MISSING_DEV) {
			btrfs_err(fs_info,
			"zoned: cannot recover write pointer for zone %llu",
				  physical);
			ret = -EIO;
			goto out;
		}
		cache->alloc_offset = alloc_offsets[0];
		break;
	case BTRFS_BLOCK_GROUP_DUP:
	case BTRFS_BLOCK_GROUP_RAID1:
	case BTRFS_BLOCK_GROUP_RAID0:
	case BTRFS_BLOCK_GROUP_RAID10:
	case BTRFS_BLOCK_GROUP_RAID5:
	case BTRFS_BLOCK_GROUP_RAID6:
		/* non-single profiles are not supported yet */
	default:
		btrfs_err(fs_info, "zoned: profile %s not yet supported",
			  btrfs_bg_type_to_raid_name(map->type));
		ret = -EINVAL;
		goto out;
	}

out:
	if (cache->alloc_offset > fs_info->zone_size) {
		btrfs_err(fs_info,
			  "zoned: invalid write pointer %llu in block group %llu",
			  cache->alloc_offset, cache->start);
		ret = -EIO;
	}

	/* An extent is allocated after the write pointer */
	if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
		btrfs_err(fs_info,
			  "zoned: got wrong write pointer in BG %llu: %llu > %llu",
			  logical, last_alloc, cache->alloc_offset);
		ret = -EIO;
	}

	if (!ret)
		cache->meta_write_pointer = cache->alloc_offset + cache->start;

	kfree(alloc_offsets);
	free_extent_map(em);

	return ret;
}
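
/*
 * A worked example (numbers assumed for illustration): in a block group
 * backed by a 256MiB zone with alloc_offset = 128MiB and used = 96MiB, the
 * 32MiB between the used bytes and the write pointer becomes zone_unusable,
 * and only the 128MiB past the write pointer is reported as free space.
 */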
void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
{
	u64 unusable, free;

	if (!btrfs_is_zoned(cache->fs_info))
		return;

	WARN_ON(cache->bytes_super != 0);
	unusable = cache->alloc_offset - cache->used;
	free = cache->length - cache->alloc_offset;

	/* We only need ->free_space in ALLOC_SEQ block groups */
	cache->last_byte_to_unpin = (u64)-1;
	cache->cached = BTRFS_CACHE_FINISHED;
	cache->free_space_ctl->free_space = free;
	cache->zone_unusable = unusable;

	/* Should not have any excluded extents. Just in case, though */
	btrfs_free_excluded_extents(cache);
}

void btrfs_redirty_list_add(struct btrfs_transaction *trans,
			    struct extent_buffer *eb)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;

	if (!btrfs_is_zoned(fs_info) ||
	    btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN) ||
	    !list_empty(&eb->release_list))
		return;

	set_extent_buffer_dirty(eb);
	set_extent_bits_nowait(&trans->dirty_pages, eb->start,
			       eb->start + eb->len - 1, EXTENT_DIRTY);
	memzero_extent_buffer(eb, 0, eb->len);
	set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags);

	spin_lock(&trans->releasing_ebs_lock);
	list_add_tail(&eb->release_list, &trans->releasing_ebs);
	spin_unlock(&trans->releasing_ebs_lock);
	atomic_inc(&eb->refs);
}

void btrfs_free_redirty_list(struct btrfs_transaction *trans)
{
	spin_lock(&trans->releasing_ebs_lock);
	while (!list_empty(&trans->releasing_ebs)) {
		struct extent_buffer *eb;

		eb = list_first_entry(&trans->releasing_ebs,
				      struct extent_buffer, release_list);
		list_del_init(&eb->release_list);
		free_extent_buffer(eb);
	}
	spin_unlock(&trans->releasing_ebs_lock);
}

bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_block_group *cache;
	bool ret = false;

	if (!btrfs_is_zoned(fs_info))
		return false;

	if (!is_data_inode(&inode->vfs_inode))
		return false;

	cache = btrfs_lookup_block_group(fs_info, start);
	ASSERT(cache);
	if (!cache)
		return false;

	ret = cache->seq_zone;
	btrfs_put_block_group(cache);

	return ret;
}

void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset,
				 struct bio *bio)
{
	struct btrfs_ordered_extent *ordered;
	const u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;

	if (bio_op(bio) != REQ_OP_ZONE_APPEND)
		return;

	ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), file_offset);
	if (WARN_ON(!ordered))
		return;

	ordered->physical = physical;
	ordered->bdev = bio->bi_bdev;

	btrfs_put_ordered_extent(ordered);
}
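
/*
 * With REQ_OP_ZONE_APPEND the device chooses the final write location, so
 * the physical address recorded by btrfs_record_physical_zoned() above is
 * mapped back to a logical address here, and the ordered extent, the extent
 * map and the checksum entries are adjusted to match.
 */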
void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered)
{
	struct btrfs_inode *inode = BTRFS_I(ordered->inode);
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct btrfs_ordered_sum *sum;
	u64 orig_logical = ordered->disk_bytenr;
	u64 *logical = NULL;
	int nr, stripe_len;

	/* Zoned devices should not have partitions. So, we can assume it is 0 */
	ASSERT(!bdev_is_partition(ordered->bdev));
	if (WARN_ON(!ordered->bdev))
		return;

	if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, ordered->bdev,
				     ordered->physical, &logical, &nr,
				     &stripe_len)))
		goto out;

	WARN_ON(nr != 1);

	if (orig_logical == *logical)
		goto out;

	ordered->disk_bytenr = *logical;

	em_tree = &inode->extent_tree;
	write_lock(&em_tree->lock);
	em = search_extent_mapping(em_tree, ordered->file_offset,
				   ordered->num_bytes);
	em->block_start = *logical;
	free_extent_map(em);
	write_unlock(&em_tree->lock);

	list_for_each_entry(sum, &ordered->list, list) {
		if (*logical < orig_logical)
			sum->bytenr -= orig_logical - *logical;
		else
			sum->bytenr += *logical - orig_logical;
	}

out:
	kfree(logical);
}

bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb,
				    struct btrfs_block_group **cache_ret)
{
	struct btrfs_block_group *cache;
	bool ret = true;

	if (!btrfs_is_zoned(fs_info))
		return true;

	cache = *cache_ret;

	if (cache && (eb->start < cache->start ||
		      cache->start + cache->length <= eb->start)) {
		btrfs_put_block_group(cache);
		cache = NULL;
		*cache_ret = NULL;
	}

	if (!cache)
		cache = btrfs_lookup_block_group(fs_info, eb->start);

	if (cache) {
		if (cache->meta_write_pointer != eb->start) {
			btrfs_put_block_group(cache);
			cache = NULL;
			ret = false;
		} else {
			cache->meta_write_pointer = eb->start + eb->len;
		}

		*cache_ret = cache;
	}

	return ret;
}

void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache,
				     struct extent_buffer *eb)
{
	if (!btrfs_is_zoned(eb->fs_info) || !cache)
		return;

	ASSERT(cache->meta_write_pointer == eb->start + eb->len);
	cache->meta_write_pointer = eb->start;
}

int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length)
{
	if (!btrfs_dev_is_sequential(device, physical))
		return -EOPNOTSUPP;

	return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT,
				    length >> SECTOR_SHIFT, GFP_NOFS, 0);
}

static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
			  struct blk_zone *zone)
{
	struct btrfs_bio *bbio = NULL;
	u64 mapped_length = PAGE_SIZE;
	unsigned int nofs_flag;
	int nmirrors;
	int i, ret;

	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
			       &mapped_length, &bbio);
	if (ret || !bbio || mapped_length < PAGE_SIZE) {
		btrfs_put_bbio(bbio);
		return -EIO;
	}

	if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		/* Zoned mode does not support RAID56; drop the mapping */
		btrfs_put_bbio(bbio);
		return -EINVAL;
	}

	nofs_flag = memalloc_nofs_save();
	nmirrors = (int)bbio->num_stripes;
	for (i = 0; i < nmirrors; i++) {
		u64 physical = bbio->stripes[i].physical;
		struct btrfs_device *dev = bbio->stripes[i].dev;

		/* Missing device */
		if (!dev->bdev)
			continue;

		ret = btrfs_get_dev_zone(dev, physical, zone);
		/* Failing device */
		if (ret == -EIO || ret == -EOPNOTSUPP)
			continue;
		break;
	}
	memalloc_nofs_restore(nofs_flag);
	btrfs_put_bbio(bbio);

	return ret;
}

/*
 * Synchronize write pointer in a zone at @physical_start on @tgt_dev, by
 * filling zeros from @physical_pos up to the write pointer of the dev-replace
 * source device.
 */
int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
				  u64 physical_start, u64 physical_pos)
{
	struct btrfs_fs_info *fs_info = tgt_dev->fs_info;
	struct blk_zone zone;
	u64 length;
	u64 wp;
	int ret;

	if (!btrfs_dev_is_sequential(tgt_dev, physical_pos))
		return 0;

	ret = read_zone_info(fs_info, logical, &zone);
	if (ret)
		return ret;

	wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT);

	if (physical_pos == wp)
		return 0;

	if (physical_pos > wp)
		return -EUCLEAN;

	length = wp - physical_pos;
	return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length);
}

struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
					    u64 logical, u64 length)
{
	struct btrfs_device *device;
	struct extent_map *em;
	struct map_lookup *map;

	em = btrfs_get_chunk_map(fs_info, logical, length);
	if (IS_ERR(em))
		return ERR_CAST(em);

	map = em->map_lookup;
	/* We only support single profile for now */
	ASSERT(map->num_stripes == 1);
	device = map->stripes[0].dev;

	free_extent_map(em);

	return device;
}