// SPDX-License-Identifier: GPL-2.0

#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/sched/mm.h>
#include "ctree.h"
#include "volumes.h"
#include "zoned.h"
#include "rcu-string.h"
#include "disk-io.h"
#include "block-group.h"
#include "transaction.h"
#include "dev-replace.h"
#include "space-info.h"

/* Maximum number of zones to report per blkdev_report_zones() call */
#define BTRFS_REPORT_NR_ZONES		4096
/* Invalid allocation pointer value for missing devices */
#define WP_MISSING_DEV			((u64)-1)
/* Pseudo write pointer value for conventional zone */
#define WP_CONVENTIONAL			((u64)-2)

/*
 * Location of the first zone of superblock logging zone pairs.
 *
 * - primary superblock:    0B (zone 0)
 * - first copy:          512G (zone starting at that offset)
 * - second copy:           4T (zone starting at that offset)
 */
#define BTRFS_SB_LOG_PRIMARY_OFFSET	(0ULL)
#define BTRFS_SB_LOG_FIRST_OFFSET	(512ULL * SZ_1G)
#define BTRFS_SB_LOG_SECOND_OFFSET	(4096ULL * SZ_1G)

#define BTRFS_SB_LOG_FIRST_SHIFT	const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
#define BTRFS_SB_LOG_SECOND_SHIFT	const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET)

/* Number of superblock log zones */
#define BTRFS_NR_SB_LOG_ZONES		2

/*
 * Maximum supported zone size. Currently, SMR disks have a zone size of
 * 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not
 * expect the zone size to become larger than 8GiB in the near future.
 */
#define BTRFS_MAX_ZONE_SIZE		SZ_8G

static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
{
	struct blk_zone *zones = data;

	memcpy(&zones[idx], zone, sizeof(*zone));

	return 0;
}

static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
			    u64 *wp_ret)
{
	bool empty[BTRFS_NR_SB_LOG_ZONES];
	bool full[BTRFS_NR_SB_LOG_ZONES];
	sector_t sector;

	ASSERT(zones[0].type != BLK_ZONE_TYPE_CONVENTIONAL &&
	       zones[1].type != BLK_ZONE_TYPE_CONVENTIONAL);

	empty[0] = (zones[0].cond == BLK_ZONE_COND_EMPTY);
	empty[1] = (zones[1].cond == BLK_ZONE_COND_EMPTY);
	full[0] = (zones[0].cond == BLK_ZONE_COND_FULL);
	full[1] = (zones[1].cond == BLK_ZONE_COND_FULL);

	/*
	 * Possible states of log buffer zones
	 *
	 *           Empty[0]  In use[0]  Full[0]
	 * Empty[1]         *          x        0
	 * In use[1]        0          x        0
	 * Full[1]          1          1        C
	 *
	 * Log position:
	 *   *: Special case, no superblock is written
	 *   0: Use write pointer of zones[0]
	 *   1: Use write pointer of zones[1]
	 *   C: Compare super blocks from zones[0] and zones[1], use the latest
	 *      one determined by generation
	 *   x: Invalid state
	 */

	if (empty[0] && empty[1]) {
		/* Special case to distinguish no superblock to read */
		*wp_ret = zones[0].start << SECTOR_SHIFT;
		return -ENOENT;
	} else if (full[0] && full[1]) {
		/* Compare two super blocks */
		struct address_space *mapping = bdev->bd_inode->i_mapping;
		struct page *page[BTRFS_NR_SB_LOG_ZONES];
		struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
		int i;

		for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
			u64 bytenr;

			bytenr = ((zones[i].start + zones[i].len)
				   << SECTOR_SHIFT) - BTRFS_SUPER_INFO_SIZE;

			page[i] = read_cache_page_gfp(mapping,
					bytenr >> PAGE_SHIFT, GFP_NOFS);
			if (IS_ERR(page[i])) {
				if (i == 1)
					btrfs_release_disk_super(super[0]);
				return PTR_ERR(page[i]);
			}
			super[i] = page_address(page[i]);
		}
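
		/*
		 * Both zones are full: the zone holding the superblock with
		 * the older generation will be reset and written next, so
		 * report the start of that zone as the write pointer.
		 */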
		if (super[0]->generation > super[1]->generation)
			sector = zones[1].start;
		else
			sector = zones[0].start;

		for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
			btrfs_release_disk_super(super[i]);
	} else if (!full[0] && (empty[1] || full[1])) {
		sector = zones[0].wp;
	} else if (full[0]) {
		sector = zones[1].wp;
	} else {
		return -EUCLEAN;
	}
	*wp_ret = sector << SECTOR_SHIFT;
	return 0;
}

/*
 * Get the first zone number of the superblock mirror
 */
static inline u32 sb_zone_number(int shift, int mirror)
{
	u64 zone;

	ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
	switch (mirror) {
	case 0: zone = 0; break;
	case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break;
	case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break;
	}

	ASSERT(zone <= U32_MAX);

	return (u32)zone;
}

/*
 * Emulate blkdev_report_zones() for a non-zoned device. It slices up the block
 * device into static sized chunks and fakes a conventional zone on each of
 * them.
 */
static int emulate_report_zones(struct btrfs_device *device, u64 pos,
				struct blk_zone *zones, unsigned int nr_zones)
{
	const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT;
	sector_t bdev_size = bdev_nr_sectors(device->bdev);
	unsigned int i;

	pos >>= SECTOR_SHIFT;
	for (i = 0; i < nr_zones; i++) {
		zones[i].start = i * zone_sectors + pos;
		zones[i].len = zone_sectors;
		zones[i].capacity = zone_sectors;
		zones[i].wp = zones[i].start + zone_sectors;
		zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL;
		zones[i].cond = BLK_ZONE_COND_NOT_WP;

		if (zones[i].wp >= bdev_size) {
			i++;
			break;
		}
	}

	return i;
}
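
/*
 * Report zones of a device region starting at byte offset @pos. For a device
 * without zone support the zones are emulated. On success, *nr_zones is
 * updated to the number of zones actually reported.
 */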
static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
			       struct blk_zone *zones, unsigned int *nr_zones)
{
	int ret;

	if (!*nr_zones)
		return 0;

	if (!bdev_is_zoned(device->bdev)) {
		ret = emulate_report_zones(device, pos, zones, *nr_zones);
		*nr_zones = ret;
		return 0;
	}

	ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
				  copy_zone_info_cb, zones);
	if (ret < 0) {
		btrfs_err_in_rcu(device->fs_info,
				 "zoned: failed to read zone %llu on %s (devid %llu)",
				 pos, rcu_str_deref(device->name),
				 device->devid);
		return ret;
	}
	*nr_zones = ret;
	if (!ret)
		return -EIO;

	return 0;
}

/* The emulated zone size is determined from the size of device extent */
static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
{
	struct btrfs_path *path;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_dev_extent *dext;
	int ret = 0;

	key.objectid = 1;
	key.type = BTRFS_DEV_EXTENT_KEY;
	key.offset = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
		ret = btrfs_next_item(root, path);
		if (ret < 0)
			goto out;
		/* No dev extents at all? Not good */
		if (ret > 0) {
			ret = -EUCLEAN;
			goto out;
		}
	}

	leaf = path->nodes[0];
	dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
	fs_info->zone_size = btrfs_dev_extent_length(leaf, dext);
	ret = 0;

out:
	btrfs_free_path(path);

	return ret;
}

int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	int ret = 0;

	/* fs_info->zone_size might not be set yet. Use the incompat flag here. */
	if (!btrfs_fs_incompat(fs_info, ZONED))
		return 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		/* We can skip reading of zone info for missing devices */
		if (!device->bdev)
			continue;

		ret = btrfs_get_dev_zone_info(device);
		if (ret)
			break;
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}
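
/*
 * Populate device->zone_info: report (or emulate) all zones of the device,
 * record their type and emptiness in bitmaps, and validate the superblock
 * log zones.
 */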
int btrfs_get_dev_zone_info(struct btrfs_device *device)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_zoned_device_info *zone_info = NULL;
	struct block_device *bdev = device->bdev;
	struct request_queue *queue = bdev_get_queue(bdev);
	sector_t nr_sectors;
	sector_t sector = 0;
	struct blk_zone *zones = NULL;
	unsigned int i, nreported = 0, nr_zones;
	sector_t zone_sectors;
	char *model, *emulated;
	int ret;

	/*
	 * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not
	 * yet be set.
	 */
	if (!btrfs_fs_incompat(fs_info, ZONED))
		return 0;

	if (device->zone_info)
		return 0;

	zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL);
	if (!zone_info)
		return -ENOMEM;

	if (!bdev_is_zoned(bdev)) {
		if (!fs_info->zone_size) {
			ret = calculate_emulated_zone_size(fs_info);
			if (ret)
				goto out;
		}

		ASSERT(fs_info->zone_size);
		zone_sectors = fs_info->zone_size >> SECTOR_SHIFT;
	} else {
		zone_sectors = bdev_zone_sectors(bdev);
	}

	/* Check if it's power of 2 (see is_power_of_2) */
	ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0);
	zone_info->zone_size = zone_sectors << SECTOR_SHIFT;

	/* We reject devices with a zone size larger than 8GiB */
	if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) {
		btrfs_err_in_rcu(fs_info,
		"zoned: %s: zone size %llu larger than supported maximum %llu",
				 rcu_str_deref(device->name),
				 zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
		ret = -EINVAL;
		goto out;
	}

	nr_sectors = bdev_nr_sectors(bdev);
	zone_info->zone_size_shift = ilog2(zone_info->zone_size);
	zone_info->max_zone_append_size =
		(u64)queue_max_zone_append_sectors(queue) << SECTOR_SHIFT;
	zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
	if (!IS_ALIGNED(nr_sectors, zone_sectors))
		zone_info->nr_zones++;

	if (bdev_is_zoned(bdev) && zone_info->max_zone_append_size == 0) {
		btrfs_err(fs_info, "zoned: device %pg does not support zone append",
			  bdev);
		ret = -EINVAL;
		goto out;
	}

	zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->seq_zones) {
		ret = -ENOMEM;
		goto out;
	}

	zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->empty_zones) {
		ret = -ENOMEM;
		goto out;
	}

	zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone),
			GFP_KERNEL);
	if (!zones) {
		ret = -ENOMEM;
		goto out;
	}

	/* Get zone types */
	while (sector < nr_sectors) {
		nr_zones = BTRFS_REPORT_NR_ZONES;
		ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
					  &nr_zones);
		if (ret)
			goto out;

		for (i = 0; i < nr_zones; i++) {
			if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
				__set_bit(nreported, zone_info->seq_zones);
			if (zones[i].cond == BLK_ZONE_COND_EMPTY)
				__set_bit(nreported, zone_info->empty_zones);
			nreported++;
		}
		sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
	}

	if (nreported != zone_info->nr_zones) {
		btrfs_err_in_rcu(device->fs_info,
				 "inconsistent number of zones on %s (%u/%u)",
				 rcu_str_deref(device->name), nreported,
				 zone_info->nr_zones);
		ret = -EIO;
		goto out;
	}

	/* Validate superblock log */
	nr_zones = BTRFS_NR_SB_LOG_ZONES;
	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		u32 sb_zone;
		u64 sb_wp;
		int sb_pos = BTRFS_NR_SB_LOG_ZONES * i;

		sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
		if (sb_zone + 1 >= zone_info->nr_zones)
			continue;

		sector = sb_zone << (zone_info->zone_size_shift - SECTOR_SHIFT);
		ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT,
					  &zone_info->sb_zones[sb_pos],
					  &nr_zones);
		if (ret)
			goto out;

		if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
			btrfs_err_in_rcu(device->fs_info,
	"zoned: failed to read super block log zone info at devid %llu zone %u",
					 device->devid, sb_zone);
			ret = -EUCLEAN;
			goto out;
		}

		/*
		 * If zones[0] is conventional, always use the beginning of the
		 * zone to record superblock. No need to validate in that case.
		 */
		if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type ==
		    BLK_ZONE_TYPE_CONVENTIONAL)
			continue;

		ret = sb_write_pointer(device->bdev,
				       &zone_info->sb_zones[sb_pos], &sb_wp);
		if (ret != -ENOENT && ret) {
			btrfs_err_in_rcu(device->fs_info,
			"zoned: super block log zone corrupted devid %llu zone %u",
					 device->devid, sb_zone);
			ret = -EUCLEAN;
			goto out;
		}
	}

	kfree(zones);

	device->zone_info = zone_info;

	switch (bdev_zoned_model(bdev)) {
	case BLK_ZONED_HM:
		model = "host-managed zoned";
		emulated = "";
		break;
	case BLK_ZONED_HA:
		model = "host-aware zoned";
		emulated = "";
		break;
	case BLK_ZONED_NONE:
		model = "regular";
		emulated = "emulated ";
		break;
	default:
		/* Just in case */
		btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s",
				 bdev_zoned_model(bdev),
				 rcu_str_deref(device->name));
		ret = -EOPNOTSUPP;
		goto out_free_zone_info;
	}

	btrfs_info_in_rcu(fs_info,
		"%s block device %s, %u %szones of %llu bytes",
		model, rcu_str_deref(device->name), zone_info->nr_zones,
		emulated, zone_info->zone_size);

	return 0;

out:
	kfree(zones);
out_free_zone_info:
	bitmap_free(zone_info->empty_zones);
	bitmap_free(zone_info->seq_zones);
	kfree(zone_info);
	device->zone_info = NULL;

	return ret;
}

void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
{
	struct btrfs_zoned_device_info *zone_info = device->zone_info;

	if (!zone_info)
		return;

	bitmap_free(zone_info->seq_zones);
	bitmap_free(zone_info->empty_zones);
	kfree(zone_info);
	device->zone_info = NULL;
}

int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
		       struct blk_zone *zone)
{
	unsigned int nr_zones = 1;
	int ret;

	ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones);
	if (ret != 0 || !nr_zones)
		return ret ? ret : -EIO;

	return 0;
}
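
/*
 * Verify the zoned setup at mount time: all devices must agree on a single
 * zone size, the minimum of the per-device max_zone_append_size values is
 * recorded, and mixing zoned and regular devices is rejected.
 */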
int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	u64 zoned_devices = 0;
	u64 nr_devices = 0;
	u64 zone_size = 0;
	u64 max_zone_append_size = 0;
	const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED);
	int ret = 0;

	/* Count zoned devices */
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		enum blk_zoned_model model;

		if (!device->bdev)
			continue;

		model = bdev_zoned_model(device->bdev);
		/*
		 * A host-managed zoned device must be used as a zoned device.
		 * A host-aware zoned device and a non-zoned device can be
		 * treated as a zoned device, if the ZONED flag is enabled in
		 * the superblock.
		 */
		if (model == BLK_ZONED_HM ||
		    (model == BLK_ZONED_HA && incompat_zoned) ||
		    (model == BLK_ZONED_NONE && incompat_zoned)) {
			struct btrfs_zoned_device_info *zone_info =
				device->zone_info;

			zoned_devices++;
			if (!zone_size) {
				zone_size = zone_info->zone_size;
			} else if (zone_info->zone_size != zone_size) {
				btrfs_err(fs_info,
		"zoned: unequal block device zone sizes: have %llu found %llu",
					  device->zone_info->zone_size,
					  zone_size);
				ret = -EINVAL;
				goto out;
			}
			if (!max_zone_append_size ||
			    (zone_info->max_zone_append_size &&
			     zone_info->max_zone_append_size < max_zone_append_size))
				max_zone_append_size =
					zone_info->max_zone_append_size;
		}
		nr_devices++;
	}

	if (!zoned_devices && !incompat_zoned)
		goto out;

	if (!zoned_devices && incompat_zoned) {
		/* No zoned block device found on ZONED filesystem */
		btrfs_err(fs_info,
			  "zoned: no zoned devices found on a zoned filesystem");
		ret = -EINVAL;
		goto out;
	}

	if (zoned_devices && !incompat_zoned) {
		btrfs_err(fs_info,
			  "zoned: mode not enabled but zoned device found");
		ret = -EINVAL;
		goto out;
	}

	if (zoned_devices != nr_devices) {
		btrfs_err(fs_info,
			  "zoned: cannot mix zoned and regular devices");
		ret = -EINVAL;
		goto out;
	}

	/*
	 * stripe_size is always aligned to BTRFS_STRIPE_LEN in
	 * __btrfs_alloc_chunk(). Since we want stripe_len == zone_size,
	 * check the alignment here.
	 */
	if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
		btrfs_err(fs_info,
			  "zoned: zone size %llu not aligned to stripe %u",
			  zone_size, BTRFS_STRIPE_LEN);
		ret = -EINVAL;
		goto out;
	}

	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		btrfs_err(fs_info, "zoned: mixed block groups not supported");
		ret = -EINVAL;
		goto out;
	}

	fs_info->zone_size = zone_size;
	fs_info->max_zone_append_size = max_zone_append_size;
	fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;

	/*
	 * Check mount options here, because we might change fs_info->zoned
	 * from fs_info->zone_size.
	 */
	ret = btrfs_check_mountopts_zoned(fs_info);
	if (ret)
		goto out;

	btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
out:
	return ret;
}
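
/* Reject mount options that cannot work on a zoned filesystem. */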
int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
{
	if (!btrfs_is_zoned(info))
		return 0;

	/*
	 * Space cache writing is not COWed. Disable that to avoid write errors
	 * in sequential zones.
	 */
	if (btrfs_test_opt(info, SPACE_CACHE)) {
		btrfs_err(info, "zoned: space cache v1 is not supported");
		return -EINVAL;
	}

	if (btrfs_test_opt(info, NODATACOW)) {
		btrfs_err(info, "zoned: NODATACOW not supported");
		return -EINVAL;
	}

	return 0;
}

static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
			   int rw, u64 *bytenr_ret)
{
	u64 wp;
	int ret;

	if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
		*bytenr_ret = zones[0].start << SECTOR_SHIFT;
		return 0;
	}

	ret = sb_write_pointer(bdev, zones, &wp);
	if (ret != -ENOENT && ret < 0)
		return ret;

	if (rw == WRITE) {
		struct blk_zone *reset = NULL;

		if (wp == zones[0].start << SECTOR_SHIFT)
			reset = &zones[0];
		else if (wp == zones[1].start << SECTOR_SHIFT)
			reset = &zones[1];

		if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
			ASSERT(reset->cond == BLK_ZONE_COND_FULL);

			ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
					       reset->start, reset->len,
					       GFP_NOFS);
			if (ret)
				return ret;

			reset->cond = BLK_ZONE_COND_EMPTY;
			reset->wp = reset->start;
		}
	} else if (ret != -ENOENT) {
		/* For READ, we want the previous one */
		if (wp == zones[0].start << SECTOR_SHIFT)
			wp = (zones[1].start + zones[1].len) << SECTOR_SHIFT;
		wp -= BTRFS_SUPER_INFO_SIZE;
	}

	*bytenr_ret = wp;
	return 0;
}

int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
			       u64 *bytenr_ret)
{
	struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES];
	sector_t zone_sectors;
	u32 sb_zone;
	int ret;
	u8 zone_sectors_shift;
	sector_t nr_sectors;
	u32 nr_zones;

	if (!bdev_is_zoned(bdev)) {
		*bytenr_ret = btrfs_sb_offset(mirror);
		return 0;
	}

	ASSERT(rw == READ || rw == WRITE);

	zone_sectors = bdev_zone_sectors(bdev);
	if (!is_power_of_2(zone_sectors))
		return -EINVAL;
	zone_sectors_shift = ilog2(zone_sectors);
	nr_sectors = bdev_nr_sectors(bdev);
	nr_zones = nr_sectors >> zone_sectors_shift;

	sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
	if (sb_zone + 1 >= nr_zones)
		return -ENOENT;

	ret = blkdev_report_zones(bdev, sb_zone << zone_sectors_shift,
				  BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
				  zones);
	if (ret < 0)
		return ret;
	if (ret != BTRFS_NR_SB_LOG_ZONES)
		return -EIO;

	return sb_log_location(bdev, zones, rw, bytenr_ret);
}
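
/*
 * Return the byte offset to use for reading or writing superblock @mirror on
 * @device, using the cached superblock log zone state of the device.
 */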
int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
			  u64 *bytenr_ret)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	u32 zone_num;

	/*
	 * For a zoned filesystem on a non-zoned block device, use the same
	 * super block locations as a regular filesystem. Doing so, the super
	 * block can always be retrieved and the zoned flag of the volume
	 * detected from the super block information.
	 */
	if (!bdev_is_zoned(device->bdev)) {
		*bytenr_ret = btrfs_sb_offset(mirror);
		return 0;
	}

	zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
	if (zone_num + 1 >= zinfo->nr_zones)
		return -ENOENT;

	return sb_log_location(device->bdev,
			       &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror],
			       rw, bytenr_ret);
}

static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
				  int mirror)
{
	u32 zone_num;

	if (!zinfo)
		return false;

	zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
	if (zone_num + 1 >= zinfo->nr_zones)
		return false;

	if (!test_bit(zone_num, zinfo->seq_zones))
		return false;

	return true;
}

void btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	struct blk_zone *zone;

	if (!is_sb_log_zone(zinfo, mirror))
		return;

	zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
	if (zone->cond != BLK_ZONE_COND_FULL) {
		if (zone->cond == BLK_ZONE_COND_EMPTY)
			zone->cond = BLK_ZONE_COND_IMP_OPEN;

		zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);

		if (zone->wp == zone->start + zone->len)
			zone->cond = BLK_ZONE_COND_FULL;

		return;
	}

	zone++;
	ASSERT(zone->cond != BLK_ZONE_COND_FULL);
	if (zone->cond == BLK_ZONE_COND_EMPTY)
		zone->cond = BLK_ZONE_COND_IMP_OPEN;

	zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);

	if (zone->wp == zone->start + zone->len)
		zone->cond = BLK_ZONE_COND_FULL;
}
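
/* Reset both log zones of superblock @mirror on @bdev. */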
int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
{
	sector_t zone_sectors;
	sector_t nr_sectors;
	u8 zone_sectors_shift;
	u32 sb_zone;
	u32 nr_zones;

	zone_sectors = bdev_zone_sectors(bdev);
	zone_sectors_shift = ilog2(zone_sectors);
	nr_sectors = bdev_nr_sectors(bdev);
	nr_zones = nr_sectors >> zone_sectors_shift;

	sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
	if (sb_zone + 1 >= nr_zones)
		return -ENOENT;

	return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
				sb_zone << zone_sectors_shift,
				zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
}

/**
 * btrfs_find_allocatable_zones - find allocatable zones within a given region
 *
 * @device:	the device to allocate a region on
 * @hole_start:	the position of the hole in which to allocate the region
 * @hole_end:	the end of the hole
 * @num_bytes:	size of the wanted region
 *
 * Allocatable regions must not contain any superblock locations.
 *
 * Return: position of an allocatable region
 */
u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
				 u64 hole_end, u64 num_bytes)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	const u8 shift = zinfo->zone_size_shift;
	u64 nzones = num_bytes >> shift;
	u64 pos = hole_start;
	u64 begin, end;
	bool have_sb;
	int i;

	ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
	ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));

	while (pos < hole_end) {
		begin = pos >> shift;
		end = begin + nzones;

		if (end > zinfo->nr_zones)
			return hole_end;

		/* Check if zones in the region are all empty */
		if (btrfs_dev_is_sequential(device, pos) &&
		    find_next_zero_bit(zinfo->empty_zones, end, begin) != end) {
			pos += zinfo->zone_size;
			continue;
		}

		have_sb = false;
		for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
			u32 sb_zone;
			u64 sb_pos;

			sb_zone = sb_zone_number(shift, i);
			if (!(end <= sb_zone ||
			      sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) {
				have_sb = true;
				pos = ((u64)sb_zone + BTRFS_NR_SB_LOG_ZONES) << shift;
				break;
			}

			/* We also need to exclude regular superblock positions */
			sb_pos = btrfs_sb_offset(i);
			if (!(pos + num_bytes <= sb_pos ||
			      sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) {
				have_sb = true;
				pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE,
					    zinfo->zone_size);
				break;
			}
		}
		if (!have_sb)
			break;
	}

	return pos;
}

int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
			    u64 length, u64 *bytes)
{
	int ret;

	*bytes = 0;
	ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
			       physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
			       GFP_NOFS);
	if (ret)
		return ret;

	*bytes = length;
	while (length) {
		btrfs_dev_set_zone_empty(device, physical);
		physical += device->zone_info->zone_size;
		length -= device->zone_info->zone_size;
	}

	return 0;
}
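
/*
 * Ensure that all zones in the range [start, start + size) are empty,
 * resetting any sequential zone that unexpectedly contains data.
 */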
int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	const u8 shift = zinfo->zone_size_shift;
	unsigned long begin = start >> shift;
	unsigned long end = (start + size) >> shift;
	u64 pos;
	int ret;

	ASSERT(IS_ALIGNED(start, zinfo->zone_size));
	ASSERT(IS_ALIGNED(size, zinfo->zone_size));

	if (end > zinfo->nr_zones)
		return -ERANGE;

	/* All the zones are conventional */
	if (find_next_bit(zinfo->seq_zones, end, begin) == end)
		return 0;

	/* All the zones are sequential and empty */
	if (find_next_zero_bit(zinfo->seq_zones, end, begin) == end &&
	    find_next_zero_bit(zinfo->empty_zones, end, begin) == end)
		return 0;

	for (pos = start; pos < start + size; pos += zinfo->zone_size) {
		u64 reset_bytes;

		if (!btrfs_dev_is_sequential(device, pos) ||
		    btrfs_dev_is_empty_zone(device, pos))
			continue;

		/* Free regions should be empty */
		btrfs_warn_in_rcu(
			device->fs_info,
		"zoned: resetting device %s (devid %llu) zone %llu for allocation",
			rcu_str_deref(device->name), device->devid, pos >> shift);
		WARN_ON_ONCE(1);

		ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
					      &reset_bytes);
		if (ret)
			return ret;
	}

	return 0;
}

/*
 * Calculate an allocation pointer from the extent allocation information
 * for a block group consisting of conventional zones. The allocation offset
 * points to the end of the highest-addressed extent in the block group.
 */
static int calculate_alloc_pointer(struct btrfs_block_group *cache,
				   u64 *offset_ret)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	int ret;
	u64 length;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = cache->start + cache->length;
	key.type = 0;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	/* We should not find the exact match */
	if (!ret)
		ret = -EUCLEAN;
	if (ret < 0)
		goto out;

	ret = btrfs_previous_extent_item(root, path, cache->start);
	if (ret) {
		if (ret == 1) {
			ret = 0;
			*offset_ret = 0;
		}
		goto out;
	}

	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);

	if (found_key.type == BTRFS_EXTENT_ITEM_KEY)
		length = found_key.offset;
	else
		length = fs_info->nodesize;

	if (!(found_key.objectid >= cache->start &&
	      found_key.objectid + length <= cache->start + cache->length)) {
		ret = -EUCLEAN;
		goto out;
	}
	*offset_ret = found_key.objectid + length - cache->start;
	ret = 0;

out:
	btrfs_free_path(path);
	return ret;
}
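
/*
 * Load zone information for all stripes of a block group and derive the
 * group's allocation offset from the zone write pointers.
 */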
int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_device *device;
	u64 logical = cache->start;
	u64 length = cache->length;
	u64 physical = 0;
	int ret;
	int i;
	unsigned int nofs_flag;
	u64 *alloc_offsets = NULL;
	u64 last_alloc = 0;
	u32 num_sequential = 0, num_conventional = 0;

	if (!btrfs_is_zoned(fs_info))
		return 0;

	/* Sanity check */
	if (!IS_ALIGNED(length, fs_info->zone_size)) {
		btrfs_err(fs_info,
		"zoned: block group %llu len %llu unaligned to zone size %llu",
			  logical, length, fs_info->zone_size);
		return -EIO;
	}

	/* Get the chunk mapping */
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, logical, length);
	read_unlock(&em_tree->lock);

	if (!em)
		return -EINVAL;

	map = em->map_lookup;

	alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS);
	if (!alloc_offsets) {
		free_extent_map(em);
		return -ENOMEM;
	}

	for (i = 0; i < map->num_stripes; i++) {
		bool is_sequential;
		struct blk_zone zone;
		struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
		int dev_replace_is_ongoing = 0;

		device = map->stripes[i].dev;
		physical = map->stripes[i].physical;

		if (device->bdev == NULL) {
			alloc_offsets[i] = WP_MISSING_DEV;
			continue;
		}

		is_sequential = btrfs_dev_is_sequential(device, physical);
		if (is_sequential)
			num_sequential++;
		else
			num_conventional++;

		if (!is_sequential) {
			alloc_offsets[i] = WP_CONVENTIONAL;
			continue;
		}

		/*
		 * This zone will be used for allocation, so mark this zone
		 * non-empty.
		 */
		btrfs_dev_clear_zone_empty(device, physical);

		down_read(&dev_replace->rwsem);
		dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
		if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
			btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical);
		up_read(&dev_replace->rwsem);

		/*
		 * The group is mapped to a sequential zone. Get the zone write
		 * pointer to determine the allocation offset within the zone.
		 */
		WARN_ON(!IS_ALIGNED(physical, fs_info->zone_size));
		nofs_flag = memalloc_nofs_save();
		ret = btrfs_get_dev_zone(device, physical, &zone);
		memalloc_nofs_restore(nofs_flag);
		if (ret == -EIO || ret == -EOPNOTSUPP) {
			ret = 0;
			alloc_offsets[i] = WP_MISSING_DEV;
			continue;
		} else if (ret) {
			goto out;
		}

		if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
			ret = -EIO;
			goto out;
		}

		switch (zone.cond) {
		case BLK_ZONE_COND_OFFLINE:
		case BLK_ZONE_COND_READONLY:
			btrfs_err(fs_info,
		"zoned: offline/readonly zone %llu on device %s (devid %llu)",
				  physical >> device->zone_info->zone_size_shift,
				  rcu_str_deref(device->name), device->devid);
			alloc_offsets[i] = WP_MISSING_DEV;
			break;
		case BLK_ZONE_COND_EMPTY:
			alloc_offsets[i] = 0;
			break;
		case BLK_ZONE_COND_FULL:
			alloc_offsets[i] = fs_info->zone_size;
			break;
		default:
			/* Partially used zone */
			alloc_offsets[i] =
					((zone.wp - zone.start) << SECTOR_SHIFT);
			break;
		}
	}

	if (num_sequential > 0)
		cache->seq_zone = true;

	if (num_conventional > 0) {
		/*
		 * Do not call calculate_alloc_pointer() for a new block
		 * group: its allocation offset must always be 0.
		 *
		 * Also, we have a lock chain of extent buffer lock ->
		 * chunk mutex. For a new block group, this function is
		 * called from btrfs_make_block_group() which is already
		 * taking the chunk mutex. Thus, we cannot call
		 * calculate_alloc_pointer() which takes extent buffer
		 * locks to avoid deadlock.
		 */
		if (new) {
			cache->alloc_offset = 0;
			goto out;
		}
		ret = calculate_alloc_pointer(cache, &last_alloc);
		if (ret || map->num_stripes == num_conventional) {
			if (!ret)
				cache->alloc_offset = last_alloc;
			else
				btrfs_err(fs_info,
			"zoned: failed to determine allocation offset of bg %llu",
					  cache->start);
			goto out;
		}
	}

	switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
	case 0: /* single */
		cache->alloc_offset = alloc_offsets[0];
		break;
	case BTRFS_BLOCK_GROUP_DUP:
	case BTRFS_BLOCK_GROUP_RAID1:
	case BTRFS_BLOCK_GROUP_RAID0:
	case BTRFS_BLOCK_GROUP_RAID10:
	case BTRFS_BLOCK_GROUP_RAID5:
	case BTRFS_BLOCK_GROUP_RAID6:
		/* non-single profiles are not supported yet */
	default:
		btrfs_err(fs_info, "zoned: profile %s not yet supported",
			  btrfs_bg_type_to_raid_name(map->type));
		ret = -EINVAL;
		goto out;
	}

out:
	/* An extent is allocated after the write pointer */
	if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
		btrfs_err(fs_info,
			  "zoned: got wrong write pointer in BG %llu: %llu > %llu",
			  logical, last_alloc, cache->alloc_offset);
		ret = -EIO;
	}

	if (!ret)
		cache->meta_write_pointer = cache->alloc_offset + cache->start;

	kfree(alloc_offsets);
	free_extent_map(em);

	return ret;
}

void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
{
	u64 unusable, free;

	if (!btrfs_is_zoned(cache->fs_info))
		return;

	WARN_ON(cache->bytes_super != 0);
	unusable = cache->alloc_offset - cache->used;
	free = cache->length - cache->alloc_offset;

	/* We only need ->free_space in ALLOC_SEQ block groups */
	cache->last_byte_to_unpin = (u64)-1;
	cache->cached = BTRFS_CACHE_FINISHED;
	cache->free_space_ctl->free_space = free;
	cache->zone_unusable = unusable;

	/* Should not have any excluded extents. Just in case, though */
	btrfs_free_excluded_extents(cache);
}
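
/*
 * A tree block freed in the current transaction cannot simply be skipped on a
 * zoned filesystem: zero it, re-dirty it and keep it on the transaction's
 * release list so it is still written out and the zone write pointer stays in
 * sync with the allocation offset.
 */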
void btrfs_redirty_list_add(struct btrfs_transaction *trans,
			    struct extent_buffer *eb)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;

	if (!btrfs_is_zoned(fs_info) ||
	    btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN) ||
	    !list_empty(&eb->release_list))
		return;

	set_extent_buffer_dirty(eb);
	set_extent_bits_nowait(&trans->dirty_pages, eb->start,
			       eb->start + eb->len - 1, EXTENT_DIRTY);
	memzero_extent_buffer(eb, 0, eb->len);
	set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags);

	spin_lock(&trans->releasing_ebs_lock);
	list_add_tail(&eb->release_list, &trans->releasing_ebs);
	spin_unlock(&trans->releasing_ebs_lock);
	atomic_inc(&eb->refs);
}

void btrfs_free_redirty_list(struct btrfs_transaction *trans)
{
	spin_lock(&trans->releasing_ebs_lock);
	while (!list_empty(&trans->releasing_ebs)) {
		struct extent_buffer *eb;

		eb = list_first_entry(&trans->releasing_ebs,
				      struct extent_buffer, release_list);
		list_del_init(&eb->release_list);
		free_extent_buffer(eb);
	}
	spin_unlock(&trans->releasing_ebs_lock);
}

bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_block_group *cache;
	bool ret = false;

	if (!btrfs_is_zoned(fs_info))
		return false;

	if (!fs_info->max_zone_append_size)
		return false;

	if (!is_data_inode(&inode->vfs_inode))
		return false;

	cache = btrfs_lookup_block_group(fs_info, start);
	ASSERT(cache);
	if (!cache)
		return false;

	ret = cache->seq_zone;
	btrfs_put_block_group(cache);

	return ret;
}

void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset,
				 struct bio *bio)
{
	struct btrfs_ordered_extent *ordered;
	const u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;

	if (bio_op(bio) != REQ_OP_ZONE_APPEND)
		return;

	ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), file_offset);
	if (WARN_ON(!ordered))
		return;

	ordered->physical = physical;
	ordered->disk = bio->bi_bdev->bd_disk;
	ordered->partno = bio->bi_bdev->bd_partno;

	btrfs_put_ordered_extent(ordered);
}
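
/*
 * A zone append write only learns its final location on completion. Map the
 * physical location recorded for the ordered extent back to a logical address
 * and fix up the extent map and checksum entries accordingly.
 */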
void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered)
{
	struct btrfs_inode *inode = BTRFS_I(ordered->inode);
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct btrfs_ordered_sum *sum;
	struct block_device *bdev;
	u64 orig_logical = ordered->disk_bytenr;
	u64 *logical = NULL;
	int nr, stripe_len;

	/* Zoned devices should not have partitions. So, we can assume it is 0 */
	ASSERT(ordered->partno == 0);
	bdev = bdgrab(ordered->disk->part0);
	if (WARN_ON(!bdev))
		return;

	if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, bdev,
				     ordered->physical, &logical, &nr,
				     &stripe_len)))
		goto out;

	WARN_ON(nr != 1);

	if (orig_logical == *logical)
		goto out;

	ordered->disk_bytenr = *logical;

	em_tree = &inode->extent_tree;
	write_lock(&em_tree->lock);
	em = search_extent_mapping(em_tree, ordered->file_offset,
				   ordered->num_bytes);
	em->block_start = *logical;
	free_extent_map(em);
	write_unlock(&em_tree->lock);

	list_for_each_entry(sum, &ordered->list, list) {
		if (*logical < orig_logical)
			sum->bytenr -= orig_logical - *logical;
		else
			sum->bytenr += *logical - orig_logical;
	}

out:
	kfree(logical);
	bdput(bdev);
}

bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb,
				    struct btrfs_block_group **cache_ret)
{
	struct btrfs_block_group *cache;
	bool ret = true;

	if (!btrfs_is_zoned(fs_info))
		return true;

	cache = *cache_ret;

	if (cache && (eb->start < cache->start ||
		      cache->start + cache->length <= eb->start)) {
		btrfs_put_block_group(cache);
		cache = NULL;
		*cache_ret = NULL;
	}

	if (!cache)
		cache = btrfs_lookup_block_group(fs_info, eb->start);

	if (cache) {
		if (cache->meta_write_pointer != eb->start) {
			btrfs_put_block_group(cache);
			cache = NULL;
			ret = false;
		} else {
			cache->meta_write_pointer = eb->start + eb->len;
		}

		*cache_ret = cache;
	}

	return ret;
}

void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache,
				     struct extent_buffer *eb)
{
	if (!btrfs_is_zoned(eb->fs_info) || !cache)
		return;

	ASSERT(cache->meta_write_pointer == eb->start + eb->len);
	cache->meta_write_pointer = eb->start;
}

int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length)
{
	if (!btrfs_dev_is_sequential(device, physical))
		return -EOPNOTSUPP;

	return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT,
				    length >> SECTOR_SHIFT, GFP_NOFS, 0);
}

static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
			  struct blk_zone *zone)
{
	struct btrfs_bio *bbio = NULL;
	u64 mapped_length = PAGE_SIZE;
	unsigned int nofs_flag;
	int nmirrors;
	int i, ret;

	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
			       &mapped_length, &bbio);
	if (ret || !bbio || mapped_length < PAGE_SIZE) {
		btrfs_put_bbio(bbio);
		return -EIO;
	}

	if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		btrfs_put_bbio(bbio);
		return -EINVAL;
	}

	nofs_flag = memalloc_nofs_save();
	nmirrors = (int)bbio->num_stripes;
	for (i = 0; i < nmirrors; i++) {
		u64 physical = bbio->stripes[i].physical;
		struct btrfs_device *dev = bbio->stripes[i].dev;

		/* Missing device */
		if (!dev->bdev)
			continue;

		ret = btrfs_get_dev_zone(dev, physical, zone);
		/* Failing device */
		if (ret == -EIO || ret == -EOPNOTSUPP)
			continue;
		break;
	}
	memalloc_nofs_restore(nofs_flag);
	btrfs_put_bbio(bbio);

	return ret;
}

/*
 * Synchronize the write pointer in the zone at @physical_start on @tgt_dev by
 * filling zeros from @physical_pos up to the write pointer of the dev-replace
 * source device.
 */
int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
				  u64 physical_start, u64 physical_pos)
{
	struct btrfs_fs_info *fs_info = tgt_dev->fs_info;
	struct blk_zone zone;
	u64 length;
	u64 wp;
	int ret;

	if (!btrfs_dev_is_sequential(tgt_dev, physical_pos))
		return 0;

	ret = read_zone_info(fs_info, logical, &zone);
	if (ret)
		return ret;

	wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT);

	if (physical_pos == wp)
		return 0;

	if (physical_pos > wp)
		return -EUCLEAN;

	length = wp - physical_pos;
	return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length);
}