// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2017 Western Digital Corporation or its affiliates.
 *
 * This file is released under the GPL.
 */

#include "dm-zoned.h"

#include <linux/module.h>
#include <linux/crc32.h>
#include <linux/sched/mm.h>

#define DM_MSG_PREFIX		"zoned metadata"

/*
 * Metadata version.
 */
#define DMZ_META_VER	2

/*
 * On-disk super block magic.
 */
#define DMZ_MAGIC	((((unsigned int)('D')) << 24) | \
			 (((unsigned int)('Z')) << 16) | \
			 (((unsigned int)('B')) <<  8) | \
			 ((unsigned int)('D')))

/*
 * On disk super block.
 * This uses only 512 B but occupies a full 4 KB block on disk. This block is
 * followed on disk by the mapping table of chunks to zones and the bitmap
 * blocks indicating zone block validity.
 * The overall resulting metadata format is:
 *    (1) Super block (1 block)
 *    (2) Chunk mapping table (nr_map_blocks)
 *    (3) Bitmap blocks (nr_bitmap_blocks)
 * All metadata blocks are stored in conventional zones, starting from
 * the first conventional zone found on disk.
 */
struct dmz_super {
	/* Magic number */
	__le32		magic;			/*   4 */

	/* Metadata version number */
	__le32		version;		/*   8 */

	/* Generation number */
	__le64		gen;			/*  16 */

	/* This block number */
	__le64		sb_block;		/*  24 */

	/* The number of metadata blocks, including this super block */
	__le32		nr_meta_blocks;		/*  28 */

	/* The number of sequential zones reserved for reclaim */
	__le32		nr_reserved_seq;	/*  32 */

	/* The number of entries in the mapping table */
	__le32		nr_chunks;		/*  36 */

	/* The number of blocks used for the chunk mapping table */
	__le32		nr_map_blocks;		/*  40 */

	/* The number of blocks used for the block bitmaps */
	__le32		nr_bitmap_blocks;	/*  44 */

	/* Checksum */
	__le32		crc;			/*  48 */

	/* DM-Zoned label */
	u8		dmz_label[32];		/*  80 */

	/* DM-Zoned UUID */
	u8		dmz_uuid[16];		/*  96 */

	/* Device UUID */
	u8		dev_uuid[16];		/* 112 */

	/* Padding to full 512B sector */
	u8		reserved[400];		/* 512 */
};

/*
 * Chunk mapping entry: entries are indexed by chunk number
 * and give the zone ID (dzone_id) mapping the chunk on disk.
 * This zone may be sequential or random. If it is a sequential
 * zone, a second zone (bzone_id) used as a write buffer may
 * also be specified. This second zone will always be a randomly
 * writeable zone.
 */
struct dmz_map {
	__le32			dzone_id;
	__le32			bzone_id;
};

/*
 * Chunk mapping table metadata: 512 8-byte entries per 4 KB block.
 */
#define DMZ_MAP_ENTRIES		(DMZ_BLOCK_SIZE / sizeof(struct dmz_map))
#define DMZ_MAP_ENTRIES_SHIFT	(ilog2(DMZ_MAP_ENTRIES))
#define DMZ_MAP_ENTRIES_MASK	(DMZ_MAP_ENTRIES - 1)
#define DMZ_MAP_UNMAPPED	UINT_MAX

/*
 * Metadata block descriptor (for cached metadata blocks).
 */
struct dmz_mblock {
	struct rb_node		node;
	struct list_head	link;
	sector_t		no;
	unsigned int		ref;
	unsigned long		state;
	struct page		*page;
	void			*data;
};

/*
 * Metadata block state flags.
 */
enum {
	DMZ_META_DIRTY,
	DMZ_META_READING,
	DMZ_META_WRITING,
	DMZ_META_ERROR,
};

/*
 * Super block information (one per metadata set).
 */
struct dmz_sb {
	sector_t		block;
	struct dmz_dev		*dev;
	struct dmz_mblock	*mblk;
	struct dmz_super	*sb;
	struct dm_zone		*zone;
};
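/*
 * Worked example of the on-disk layout arithmetic (a minimal sketch,
 * assuming 256 MB zones and the 4 KB metadata block size): a zone then
 * holds 65536 blocks, so one validity bitmap needs 65536 bits = 8 KB =
 * 2 blocks per zone, and with 8-byte dmz_map entries one mapping block
 * covers 512 chunks. A 1000-chunk target would thus use
 * DIV_ROUND_UP(1000, 512) = 2 mapping blocks.
 */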
/*
 * In-memory metadata.
 */
struct dmz_metadata {
	struct dmz_dev		*dev;
	unsigned int		nr_devs;

	char			devname[BDEVNAME_SIZE];
	char			label[BDEVNAME_SIZE];
	uuid_t			uuid;

	sector_t		zone_bitmap_size;
	unsigned int		zone_nr_bitmap_blocks;
	unsigned int		zone_bits_per_mblk;

	sector_t		zone_nr_blocks;
	sector_t		zone_nr_blocks_shift;

	sector_t		zone_nr_sectors;
	sector_t		zone_nr_sectors_shift;

	unsigned int		nr_bitmap_blocks;
	unsigned int		nr_map_blocks;

	unsigned int		nr_zones;
	unsigned int		nr_useable_zones;
	unsigned int		nr_meta_blocks;
	unsigned int		nr_meta_zones;
	unsigned int		nr_data_zones;
	unsigned int		nr_cache_zones;
	unsigned int		nr_rnd_zones;
	unsigned int		nr_reserved_seq;
	unsigned int		nr_chunks;

	/* Zone information array */
	struct xarray		zones;

	struct dmz_sb		sb[2];
	unsigned int		mblk_primary;
	unsigned int		sb_version;
	u64			sb_gen;
	unsigned int		min_nr_mblks;
	unsigned int		max_nr_mblks;
	atomic_t		nr_mblks;
	struct rw_semaphore	mblk_sem;
	struct mutex		mblk_flush_lock;
	spinlock_t		mblk_lock;
	struct rb_root		mblk_rbtree;
	struct list_head	mblk_lru_list;
	struct list_head	mblk_dirty_list;
	struct shrinker		mblk_shrinker;

	/* Zone allocation management */
	struct mutex		map_lock;
	struct dmz_mblock	**map_mblk;

	unsigned int		nr_cache;
	atomic_t		unmap_nr_cache;
	struct list_head	unmap_cache_list;
	struct list_head	map_cache_list;

	atomic_t		nr_reserved_seq_zones;
	struct list_head	reserved_seq_zones_list;

	wait_queue_head_t	free_wq;
};

#define dmz_zmd_info(zmd, format, args...)	\
	DMINFO("(%s): " format, (zmd)->label, ## args)

#define dmz_zmd_err(zmd, format, args...)	\
	DMERR("(%s): " format, (zmd)->label, ## args)

#define dmz_zmd_warn(zmd, format, args...)	\
	DMWARN("(%s): " format, (zmd)->label, ## args)

#define dmz_zmd_debug(zmd, format, args...)	\
	DMDEBUG("(%s): " format, (zmd)->label, ## args)
/*
 * Various accessors
 */
static unsigned int dmz_dev_zone_id(struct dmz_metadata *zmd, struct dm_zone *zone)
{
	if (WARN_ON(!zone))
		return 0;

	return zone->id - zone->dev->zone_offset;
}

sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone)
{
	unsigned int zone_id = dmz_dev_zone_id(zmd, zone);

	return (sector_t)zone_id << zmd->zone_nr_sectors_shift;
}

sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone)
{
	unsigned int zone_id = dmz_dev_zone_id(zmd, zone);

	return (sector_t)zone_id << zmd->zone_nr_blocks_shift;
}

unsigned int dmz_zone_nr_blocks(struct dmz_metadata *zmd)
{
	return zmd->zone_nr_blocks;
}

unsigned int dmz_zone_nr_blocks_shift(struct dmz_metadata *zmd)
{
	return zmd->zone_nr_blocks_shift;
}

unsigned int dmz_zone_nr_sectors(struct dmz_metadata *zmd)
{
	return zmd->zone_nr_sectors;
}

unsigned int dmz_zone_nr_sectors_shift(struct dmz_metadata *zmd)
{
	return zmd->zone_nr_sectors_shift;
}

unsigned int dmz_nr_zones(struct dmz_metadata *zmd)
{
	return zmd->nr_zones;
}

unsigned int dmz_nr_chunks(struct dmz_metadata *zmd)
{
	return zmd->nr_chunks;
}

unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd, int idx)
{
	return zmd->dev[idx].nr_rnd;
}

unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd, int idx)
{
	return atomic_read(&zmd->dev[idx].unmap_nr_rnd);
}

unsigned int dmz_nr_cache_zones(struct dmz_metadata *zmd)
{
	return zmd->nr_cache;
}

unsigned int dmz_nr_unmap_cache_zones(struct dmz_metadata *zmd)
{
	return atomic_read(&zmd->unmap_nr_cache);
}

unsigned int dmz_nr_seq_zones(struct dmz_metadata *zmd, int idx)
{
	return zmd->dev[idx].nr_seq;
}

unsigned int dmz_nr_unmap_seq_zones(struct dmz_metadata *zmd, int idx)
{
	return atomic_read(&zmd->dev[idx].unmap_nr_seq);
}

static struct dm_zone *dmz_get(struct dmz_metadata *zmd, unsigned int zone_id)
{
	return xa_load(&zmd->zones, zone_id);
}

static struct dm_zone *dmz_insert(struct dmz_metadata *zmd,
				  unsigned int zone_id, struct dmz_dev *dev)
{
	struct dm_zone *zone = kzalloc(sizeof(struct dm_zone), GFP_KERNEL);

	if (!zone)
		return ERR_PTR(-ENOMEM);

	if (xa_insert(&zmd->zones, zone_id, zone, GFP_KERNEL)) {
		kfree(zone);
		return ERR_PTR(-EBUSY);
	}

	INIT_LIST_HEAD(&zone->link);
	atomic_set(&zone->refcount, 0);
	zone->id = zone_id;
	zone->chunk = DMZ_MAP_UNMAPPED;
	zone->dev = dev;

	return zone;
}

const char *dmz_metadata_label(struct dmz_metadata *zmd)
{
	return (const char *)zmd->label;
}

bool dmz_check_dev(struct dmz_metadata *zmd)
{
	unsigned int i;

	for (i = 0; i < zmd->nr_devs; i++) {
		if (!dmz_check_bdev(&zmd->dev[i]))
			return false;
	}
	return true;
}

bool dmz_dev_is_dying(struct dmz_metadata *zmd)
{
	unsigned int i;

	for (i = 0; i < zmd->nr_devs; i++) {
		if (dmz_bdev_is_dying(&zmd->dev[i]))
			return true;
	}
	return false;
}
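/*
 * Informal sketch of the locking hierarchy used below, as seen in the
 * flush path: mblk_sem (rw_semaphore) is taken first, then
 * mblk_flush_lock (mutex), then mblk_lock (spinlock). map_lock (mutex)
 * protects the chunk mapping and the zone lists and nests inside the
 * metadata read lock, as the unlock/relock order in
 * dmz_wait_for_free_zones() shows.
 */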
/*
 * Lock/unlock mapping table.
 * The map lock also protects all the zone lists.
 */
void dmz_lock_map(struct dmz_metadata *zmd)
{
	mutex_lock(&zmd->map_lock);
}

void dmz_unlock_map(struct dmz_metadata *zmd)
{
	mutex_unlock(&zmd->map_lock);
}

/*
 * Lock/unlock metadata access. This is a "read" lock on a semaphore
 * that prevents metadata flush from running while metadata are being
 * modified. The actual metadata write mutual exclusion is achieved with
 * the map lock and zone state management (active and reclaim state are
 * mutually exclusive).
 */
void dmz_lock_metadata(struct dmz_metadata *zmd)
{
	down_read(&zmd->mblk_sem);
}

void dmz_unlock_metadata(struct dmz_metadata *zmd)
{
	up_read(&zmd->mblk_sem);
}

/*
 * Lock/unlock flush: prevent concurrent executions
 * of dmz_flush_metadata as well as metadata modification in reclaim
 * while flush is being executed.
 */
void dmz_lock_flush(struct dmz_metadata *zmd)
{
	mutex_lock(&zmd->mblk_flush_lock);
}

void dmz_unlock_flush(struct dmz_metadata *zmd)
{
	mutex_unlock(&zmd->mblk_flush_lock);
}

/*
 * Allocate a metadata block.
 */
static struct dmz_mblock *dmz_alloc_mblock(struct dmz_metadata *zmd,
					   sector_t mblk_no)
{
	struct dmz_mblock *mblk = NULL;

	/* See if we can reuse cached blocks */
	if (zmd->max_nr_mblks && atomic_read(&zmd->nr_mblks) > zmd->max_nr_mblks) {
		spin_lock(&zmd->mblk_lock);
		mblk = list_first_entry_or_null(&zmd->mblk_lru_list,
						struct dmz_mblock, link);
		if (mblk) {
			list_del_init(&mblk->link);
			rb_erase(&mblk->node, &zmd->mblk_rbtree);
			mblk->no = mblk_no;
		}
		spin_unlock(&zmd->mblk_lock);
		if (mblk)
			return mblk;
	}

	/* Allocate a new block */
	mblk = kmalloc(sizeof(struct dmz_mblock), GFP_NOIO);
	if (!mblk)
		return NULL;

	mblk->page = alloc_page(GFP_NOIO);
	if (!mblk->page) {
		kfree(mblk);
		return NULL;
	}

	RB_CLEAR_NODE(&mblk->node);
	INIT_LIST_HEAD(&mblk->link);
	mblk->ref = 0;
	mblk->state = 0;
	mblk->no = mblk_no;
	mblk->data = page_address(mblk->page);

	atomic_inc(&zmd->nr_mblks);

	return mblk;
}

/*
 * Free a metadata block.
 */
static void dmz_free_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk)
{
	__free_pages(mblk->page, 0);
	kfree(mblk);

	atomic_dec(&zmd->nr_mblks);
}

/*
 * Insert a metadata block in the rbtree.
 */
static void dmz_insert_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk)
{
	struct rb_root *root = &zmd->mblk_rbtree;
	struct rb_node **new = &(root->rb_node), *parent = NULL;
	struct dmz_mblock *b;

	/* Figure out where to put the new node */
	while (*new) {
		b = container_of(*new, struct dmz_mblock, node);
		parent = *new;
		new = (b->no < mblk->no) ? &((*new)->rb_left) : &((*new)->rb_right);
	}

	/* Add new node and rebalance tree */
	rb_link_node(&mblk->node, parent, new);
	rb_insert_color(&mblk->node, root);
}
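/*
 * Note that the comparison above sends larger block numbers to the left,
 * so the rbtree is effectively kept in descending block number order.
 * This is harmless as long as lookups use the same convention, which
 * dmz_get_mblock_fast() below does.
 */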
/*
 * Lookup a metadata block in the rbtree. If the block is found, increment
 * its reference count.
 */
static struct dmz_mblock *dmz_get_mblock_fast(struct dmz_metadata *zmd,
					      sector_t mblk_no)
{
	struct rb_root *root = &zmd->mblk_rbtree;
	struct rb_node *node = root->rb_node;
	struct dmz_mblock *mblk;

	while (node) {
		mblk = container_of(node, struct dmz_mblock, node);
		if (mblk->no == mblk_no) {
			/*
			 * If this is the first reference to the block,
			 * remove it from the LRU list.
			 */
			mblk->ref++;
			if (mblk->ref == 1 &&
			    !test_bit(DMZ_META_DIRTY, &mblk->state))
				list_del_init(&mblk->link);
			return mblk;
		}
		node = (mblk->no < mblk_no) ? node->rb_left : node->rb_right;
	}

	return NULL;
}

/*
 * Metadata block BIO end callback.
 */
static void dmz_mblock_bio_end_io(struct bio *bio)
{
	struct dmz_mblock *mblk = bio->bi_private;
	int flag;

	if (bio->bi_status)
		set_bit(DMZ_META_ERROR, &mblk->state);

	if (bio_op(bio) == REQ_OP_WRITE)
		flag = DMZ_META_WRITING;
	else
		flag = DMZ_META_READING;

	clear_bit_unlock(flag, &mblk->state);
	smp_mb__after_atomic();
	wake_up_bit(&mblk->state, flag);

	bio_put(bio);
}

/*
 * Read an uncached metadata block from disk and add it to the cache.
 */
static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd,
					      sector_t mblk_no)
{
	struct dmz_mblock *mblk, *m;
	sector_t block = zmd->sb[zmd->mblk_primary].block + mblk_no;
	struct dmz_dev *dev = zmd->sb[zmd->mblk_primary].dev;
	struct bio *bio;

	if (dmz_bdev_is_dying(dev))
		return ERR_PTR(-EIO);

	/* Get a new block and a BIO to read it */
	mblk = dmz_alloc_mblock(zmd, mblk_no);
	if (!mblk)
		return ERR_PTR(-ENOMEM);

	bio = bio_alloc(GFP_NOIO, 1);
	if (!bio) {
		dmz_free_mblock(zmd, mblk);
		return ERR_PTR(-ENOMEM);
	}

	spin_lock(&zmd->mblk_lock);

	/*
	 * Make sure that another context did not start reading
	 * the block already.
	 */
	m = dmz_get_mblock_fast(zmd, mblk_no);
	if (m) {
		spin_unlock(&zmd->mblk_lock);
		dmz_free_mblock(zmd, mblk);
		bio_put(bio);
		return m;
	}

	mblk->ref++;
	set_bit(DMZ_META_READING, &mblk->state);
	dmz_insert_mblock(zmd, mblk);

	spin_unlock(&zmd->mblk_lock);

	/* Submit read BIO */
	bio->bi_iter.bi_sector = dmz_blk2sect(block);
	bio_set_dev(bio, dev->bdev);
	bio->bi_private = mblk;
	bio->bi_end_io = dmz_mblock_bio_end_io;
	bio_set_op_attrs(bio, REQ_OP_READ, REQ_META | REQ_PRIO);
	bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0);
	submit_bio(bio);

	return mblk;
}

/*
 * Free metadata blocks.
 */
static unsigned long dmz_shrink_mblock_cache(struct dmz_metadata *zmd,
					     unsigned long limit)
{
	struct dmz_mblock *mblk;
	unsigned long count = 0;

	if (!zmd->max_nr_mblks)
		return 0;

	while (!list_empty(&zmd->mblk_lru_list) &&
	       atomic_read(&zmd->nr_mblks) > zmd->min_nr_mblks &&
	       count < limit) {
		mblk = list_first_entry(&zmd->mblk_lru_list,
					struct dmz_mblock, link);
		list_del_init(&mblk->link);
		rb_erase(&mblk->node, &zmd->mblk_rbtree);
		dmz_free_mblock(zmd, mblk);
		count++;
	}

	return count;
}
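/*
 * The allocate-then-recheck dance in dmz_get_mblock_slow() above is a
 * classic optimistic concurrency pattern: the block and BIO are
 * allocated outside of mblk_lock (allocation may sleep or trigger the
 * shrinker), and only then is the rbtree rechecked under the lock. If
 * another context won the race, the freshly allocated block is simply
 * discarded.
 */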
/*
 * For mblock shrinker: get the number of unused metadata blocks in the cache.
 */
static unsigned long dmz_mblock_shrinker_count(struct shrinker *shrink,
					       struct shrink_control *sc)
{
	struct dmz_metadata *zmd = container_of(shrink, struct dmz_metadata,
						mblk_shrinker);

	return atomic_read(&zmd->nr_mblks);
}

/*
 * For mblock shrinker: scan unused metadata blocks and shrink the cache.
 */
static unsigned long dmz_mblock_shrinker_scan(struct shrinker *shrink,
					      struct shrink_control *sc)
{
	struct dmz_metadata *zmd = container_of(shrink, struct dmz_metadata,
						mblk_shrinker);
	unsigned long count;

	spin_lock(&zmd->mblk_lock);
	count = dmz_shrink_mblock_cache(zmd, sc->nr_to_scan);
	spin_unlock(&zmd->mblk_lock);

	return count ? count : SHRINK_STOP;
}

/*
 * Release a metadata block.
 */
static void dmz_release_mblock(struct dmz_metadata *zmd,
			       struct dmz_mblock *mblk)
{
	if (!mblk)
		return;

	spin_lock(&zmd->mblk_lock);

	mblk->ref--;
	if (mblk->ref == 0) {
		if (test_bit(DMZ_META_ERROR, &mblk->state)) {
			rb_erase(&mblk->node, &zmd->mblk_rbtree);
			dmz_free_mblock(zmd, mblk);
		} else if (!test_bit(DMZ_META_DIRTY, &mblk->state)) {
			list_add_tail(&mblk->link, &zmd->mblk_lru_list);
			dmz_shrink_mblock_cache(zmd, 1);
		}
	}

	spin_unlock(&zmd->mblk_lock);
}

/*
 * Get a metadata block from the rbtree. If the block
 * is not present, read it from disk.
 */
static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd,
					 sector_t mblk_no)
{
	struct dmz_mblock *mblk;
	struct dmz_dev *dev = zmd->sb[zmd->mblk_primary].dev;

	/* Check rbtree */
	spin_lock(&zmd->mblk_lock);
	mblk = dmz_get_mblock_fast(zmd, mblk_no);
	spin_unlock(&zmd->mblk_lock);

	if (!mblk) {
		/* Cache miss: read the block from disk */
		mblk = dmz_get_mblock_slow(zmd, mblk_no);
		if (IS_ERR(mblk))
			return mblk;
	}

	/* Wait for on-going read I/O and check for error */
	wait_on_bit_io(&mblk->state, DMZ_META_READING,
		       TASK_UNINTERRUPTIBLE);
	if (test_bit(DMZ_META_ERROR, &mblk->state)) {
		dmz_release_mblock(zmd, mblk);
		dmz_check_bdev(dev);
		return ERR_PTR(-EIO);
	}

	return mblk;
}

/*
 * Mark a metadata block dirty.
 */
static void dmz_dirty_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk)
{
	spin_lock(&zmd->mblk_lock);
	if (!test_and_set_bit(DMZ_META_DIRTY, &mblk->state))
		list_add_tail(&mblk->link, &zmd->mblk_dirty_list);
	spin_unlock(&zmd->mblk_lock);
}

/*
 * Issue a metadata block write BIO.
 */
static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
			    unsigned int set)
{
	struct dmz_dev *dev = zmd->sb[set].dev;
	sector_t block = zmd->sb[set].block + mblk->no;
	struct bio *bio;

	if (dmz_bdev_is_dying(dev))
		return -EIO;

	bio = bio_alloc(GFP_NOIO, 1);
	if (!bio) {
		set_bit(DMZ_META_ERROR, &mblk->state);
		return -ENOMEM;
	}

	set_bit(DMZ_META_WRITING, &mblk->state);

	bio->bi_iter.bi_sector = dmz_blk2sect(block);
	bio_set_dev(bio, dev->bdev);
	bio->bi_private = mblk;
	bio->bi_end_io = dmz_mblock_bio_end_io;
	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META | REQ_PRIO);
	bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0);
	submit_bio(bio);

	return 0;
}
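/*
 * Typical metadata block lifecycle (a sketch of the API above): a caller
 * does mblk = dmz_get_mblock(zmd, no), reads or modifies mblk->data,
 * calls dmz_dirty_mblock() if it wrote anything, and finally
 * dmz_release_mblock(). A block only ever sits on the LRU list while its
 * reference count is zero and it is not dirty; dirty blocks instead live
 * on mblk_dirty_list until the next flush.
 */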
/*
 * Read/write a metadata block.
 */
static int dmz_rdwr_block(struct dmz_dev *dev, int op,
			  sector_t block, struct page *page)
{
	struct bio *bio;
	int ret;

	if (WARN_ON(!dev))
		return -EIO;

	if (dmz_bdev_is_dying(dev))
		return -EIO;

	bio = bio_alloc(GFP_NOIO, 1);
	if (!bio)
		return -ENOMEM;

	bio->bi_iter.bi_sector = dmz_blk2sect(block);
	bio_set_dev(bio, dev->bdev);
	bio_set_op_attrs(bio, op, REQ_SYNC | REQ_META | REQ_PRIO);
	bio_add_page(bio, page, DMZ_BLOCK_SIZE, 0);
	ret = submit_bio_wait(bio);
	bio_put(bio);

	if (ret)
		dmz_check_bdev(dev);
	return ret;
}

/*
 * Write super block of the specified metadata set.
 */
static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set)
{
	struct dmz_mblock *mblk = zmd->sb[set].mblk;
	struct dmz_super *sb = zmd->sb[set].sb;
	struct dmz_dev *dev = zmd->sb[set].dev;
	sector_t sb_block;
	u64 sb_gen = zmd->sb_gen + 1;
	int ret;

	sb->magic = cpu_to_le32(DMZ_MAGIC);

	sb->version = cpu_to_le32(zmd->sb_version);
	if (zmd->sb_version > 1) {
		BUILD_BUG_ON(UUID_SIZE != 16);
		export_uuid(sb->dmz_uuid, &zmd->uuid);
		memcpy(sb->dmz_label, zmd->label, BDEVNAME_SIZE);
		export_uuid(sb->dev_uuid, &dev->uuid);
	}

	sb->gen = cpu_to_le64(sb_gen);

	/*
	 * The metadata always references the absolute block address,
	 * i.e. relative to the entire block range, not the per-device
	 * block address.
	 */
	sb_block = zmd->sb[set].zone->id << zmd->zone_nr_blocks_shift;
	sb->sb_block = cpu_to_le64(sb_block);
	sb->nr_meta_blocks = cpu_to_le32(zmd->nr_meta_blocks);
	sb->nr_reserved_seq = cpu_to_le32(zmd->nr_reserved_seq);
	sb->nr_chunks = cpu_to_le32(zmd->nr_chunks);

	sb->nr_map_blocks = cpu_to_le32(zmd->nr_map_blocks);
	sb->nr_bitmap_blocks = cpu_to_le32(zmd->nr_bitmap_blocks);

	sb->crc = 0;
	sb->crc = cpu_to_le32(crc32_le(sb_gen, (unsigned char *)sb, DMZ_BLOCK_SIZE));

	ret = dmz_rdwr_block(dev, REQ_OP_WRITE, zmd->sb[set].block,
			     mblk->page);
	if (ret == 0)
		ret = blkdev_issue_flush(dev->bdev);

	return ret;
}

/*
 * Write dirty metadata blocks to the specified set.
 */
static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd,
				   struct list_head *write_list,
				   unsigned int set)
{
	struct dmz_mblock *mblk;
	struct dmz_dev *dev = zmd->sb[set].dev;
	struct blk_plug plug;
	int ret = 0, nr_mblks_submitted = 0;

	/* Issue writes */
	blk_start_plug(&plug);
	list_for_each_entry(mblk, write_list, link) {
		ret = dmz_write_mblock(zmd, mblk, set);
		if (ret)
			break;
		nr_mblks_submitted++;
	}
	blk_finish_plug(&plug);

	/* Wait for completion */
	list_for_each_entry(mblk, write_list, link) {
		if (!nr_mblks_submitted)
			break;
		wait_on_bit_io(&mblk->state, DMZ_META_WRITING,
			       TASK_UNINTERRUPTIBLE);
		if (test_bit(DMZ_META_ERROR, &mblk->state)) {
			clear_bit(DMZ_META_ERROR, &mblk->state);
			dmz_check_bdev(dev);
			ret = -EIO;
		}
		nr_mblks_submitted--;
	}

	/* Flush drive cache (this will also sync data) */
	if (ret == 0)
		ret = blkdev_issue_flush(dev->bdev);

	return ret;
}
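/*
 * Note on the super block checksum computed in dmz_write_sb(): sb->crc is
 * zeroed first and the generation number is used as the crc32_le() seed,
 * so the CRC covers the whole 4 KB block and is implicitly bound to the
 * generation. dmz_check_sb() below recomputes it the same way:
 *
 *	sb->crc = 0;
 *	crc = crc32_le(gen, (unsigned char *)sb, DMZ_BLOCK_SIZE);
 */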
/*
 * Log dirty metadata blocks.
 */
static int dmz_log_dirty_mblocks(struct dmz_metadata *zmd,
				 struct list_head *write_list)
{
	unsigned int log_set = zmd->mblk_primary ^ 0x1;
	int ret;

	/* Write dirty blocks to the log */
	ret = dmz_write_dirty_mblocks(zmd, write_list, log_set);
	if (ret)
		return ret;

	/*
	 * No error so far: now validate the log by updating the
	 * log index super block generation.
	 */
	ret = dmz_write_sb(zmd, log_set);
	if (ret)
		return ret;

	return 0;
}

/*
 * Flush dirty metadata blocks.
 */
int dmz_flush_metadata(struct dmz_metadata *zmd)
{
	struct dmz_mblock *mblk;
	struct list_head write_list;
	struct dmz_dev *dev;
	int ret;

	if (WARN_ON(!zmd))
		return 0;

	INIT_LIST_HEAD(&write_list);

	/*
	 * Make sure that metadata blocks are stable before logging: take
	 * the write lock on the metadata semaphore to prevent target BIOs
	 * from modifying metadata.
	 */
	down_write(&zmd->mblk_sem);
	dev = zmd->sb[zmd->mblk_primary].dev;

	/*
	 * This is called from the target flush work and reclaim work.
	 * Concurrent execution is not allowed.
	 */
	dmz_lock_flush(zmd);

	if (dmz_bdev_is_dying(dev)) {
		ret = -EIO;
		goto out;
	}

	/* Get dirty blocks */
	spin_lock(&zmd->mblk_lock);
	list_splice_init(&zmd->mblk_dirty_list, &write_list);
	spin_unlock(&zmd->mblk_lock);

	/* If there are no dirty metadata blocks, just flush the device cache */
	if (list_empty(&write_list)) {
		ret = blkdev_issue_flush(dev->bdev);
		goto err;
	}

	/*
	 * The primary metadata set is still clean. Keep it this way until
	 * all updates are successful in the secondary set. That is, use
	 * the secondary set as a log.
	 */
	ret = dmz_log_dirty_mblocks(zmd, &write_list);
	if (ret)
		goto err;

	/*
	 * The log is on disk. It is now safe to update in place
	 * in the primary metadata set.
	 */
	ret = dmz_write_dirty_mblocks(zmd, &write_list, zmd->mblk_primary);
	if (ret)
		goto err;

	ret = dmz_write_sb(zmd, zmd->mblk_primary);
	if (ret)
		goto err;

	while (!list_empty(&write_list)) {
		mblk = list_first_entry(&write_list, struct dmz_mblock, link);
		list_del_init(&mblk->link);

		spin_lock(&zmd->mblk_lock);
		clear_bit(DMZ_META_DIRTY, &mblk->state);
		if (mblk->ref == 0)
			list_add_tail(&mblk->link, &zmd->mblk_lru_list);
		spin_unlock(&zmd->mblk_lock);
	}

	zmd->sb_gen++;
out:
	dmz_unlock_flush(zmd);
	up_write(&zmd->mblk_sem);

	return ret;

err:
	if (!list_empty(&write_list)) {
		spin_lock(&zmd->mblk_lock);
		list_splice(&write_list, &zmd->mblk_dirty_list);
		spin_unlock(&zmd->mblk_lock);
	}
	if (!dmz_check_bdev(dev))
		ret = -EIO;
	goto out;
}
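/*
 * To summarize the crash-consistency scheme above with a concrete run
 * (generation numbers are illustrative): with sb_gen = 41 and primary
 * set 0, a flush first writes the dirty blocks and a gen-42 super block
 * to set 1 (the log), then rewrites the same blocks and a gen-42 super
 * block in set 0, and finally bumps sb_gen to 42. A crash between the
 * two phases leaves set 1 with the highest valid generation, so
 * dmz_load_sb() will pick it up and recover set 0 from it.
 */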
/*
 * Check super block.
 */
static int dmz_check_sb(struct dmz_metadata *zmd, struct dmz_sb *dsb,
			bool tertiary)
{
	struct dmz_super *sb = dsb->sb;
	struct dmz_dev *dev = dsb->dev;
	unsigned int nr_meta_zones, nr_data_zones;
	u32 crc, stored_crc;
	u64 gen, sb_block;

	if (le32_to_cpu(sb->magic) != DMZ_MAGIC) {
		dmz_dev_err(dev, "Invalid meta magic (needed 0x%08x, got 0x%08x)",
			    DMZ_MAGIC, le32_to_cpu(sb->magic));
		return -ENXIO;
	}

	zmd->sb_version = le32_to_cpu(sb->version);
	if (zmd->sb_version > DMZ_META_VER) {
		dmz_dev_err(dev, "Invalid meta version (needed %d, got %d)",
			    DMZ_META_VER, zmd->sb_version);
		return -EINVAL;
	}
	if (zmd->sb_version < 2 && tertiary) {
		dmz_dev_err(dev, "Tertiary superblocks are not supported");
		return -EINVAL;
	}

	gen = le64_to_cpu(sb->gen);
	stored_crc = le32_to_cpu(sb->crc);
	sb->crc = 0;
	crc = crc32_le(gen, (unsigned char *)sb, DMZ_BLOCK_SIZE);
	if (crc != stored_crc) {
		dmz_dev_err(dev, "Invalid checksum (needed 0x%08x, got 0x%08x)",
			    crc, stored_crc);
		return -ENXIO;
	}

	sb_block = le64_to_cpu(sb->sb_block);
	if (sb_block != (u64)dsb->zone->id << zmd->zone_nr_blocks_shift) {
		dmz_dev_err(dev, "Invalid superblock position "
			    "(is %llu expected %llu)",
			    sb_block,
			    (u64)dsb->zone->id << zmd->zone_nr_blocks_shift);
		return -EINVAL;
	}
	if (zmd->sb_version > 1) {
		uuid_t sb_uuid;

		import_uuid(&sb_uuid, sb->dmz_uuid);
		if (uuid_is_null(&sb_uuid)) {
			dmz_dev_err(dev, "NULL DM-Zoned uuid");
			return -ENXIO;
		} else if (uuid_is_null(&zmd->uuid)) {
			uuid_copy(&zmd->uuid, &sb_uuid);
		} else if (!uuid_equal(&zmd->uuid, &sb_uuid)) {
			dmz_dev_err(dev, "mismatching DM-Zoned uuid, "
				    "is %pUl expected %pUl",
				    &sb_uuid, &zmd->uuid);
			return -ENXIO;
		}
		if (!strlen(zmd->label))
			memcpy(zmd->label, sb->dmz_label, BDEVNAME_SIZE);
		else if (memcmp(zmd->label, sb->dmz_label, BDEVNAME_SIZE)) {
			dmz_dev_err(dev, "mismatching DM-Zoned label, "
				    "is %s expected %s",
				    sb->dmz_label, zmd->label);
			return -ENXIO;
		}
		import_uuid(&dev->uuid, sb->dev_uuid);
		if (uuid_is_null(&dev->uuid)) {
			dmz_dev_err(dev, "NULL device uuid");
			return -ENXIO;
		}

		if (tertiary) {
			/*
			 * Generation number should be 0, but it doesn't
			 * really matter if it isn't.
			 */
			if (gen != 0)
				dmz_dev_warn(dev, "Invalid generation %llu",
					     gen);
			return 0;
		}
	}

	nr_meta_zones = (le32_to_cpu(sb->nr_meta_blocks) + zmd->zone_nr_blocks - 1)
		>> zmd->zone_nr_blocks_shift;
	if (!nr_meta_zones ||
	    (zmd->nr_devs <= 1 && nr_meta_zones >= zmd->nr_rnd_zones) ||
	    (zmd->nr_devs > 1 && nr_meta_zones >= zmd->nr_cache_zones)) {
		dmz_dev_err(dev, "Invalid number of metadata blocks");
		return -ENXIO;
	}

	if (!le32_to_cpu(sb->nr_reserved_seq) ||
	    le32_to_cpu(sb->nr_reserved_seq) >= (zmd->nr_useable_zones - nr_meta_zones)) {
		dmz_dev_err(dev, "Invalid number of reserved sequential zones");
		return -ENXIO;
	}

	nr_data_zones = zmd->nr_useable_zones -
		(nr_meta_zones * 2 + le32_to_cpu(sb->nr_reserved_seq));
	if (le32_to_cpu(sb->nr_chunks) > nr_data_zones) {
		dmz_dev_err(dev, "Invalid number of chunks %u / %u",
			    le32_to_cpu(sb->nr_chunks), nr_data_zones);
		return -ENXIO;
	}

	/* OK */
	zmd->nr_meta_blocks = le32_to_cpu(sb->nr_meta_blocks);
	zmd->nr_reserved_seq = le32_to_cpu(sb->nr_reserved_seq);
	zmd->nr_chunks = le32_to_cpu(sb->nr_chunks);
	zmd->nr_map_blocks = le32_to_cpu(sb->nr_map_blocks);
	zmd->nr_bitmap_blocks = le32_to_cpu(sb->nr_bitmap_blocks);
	zmd->nr_meta_zones = nr_meta_zones;
	zmd->nr_data_zones = nr_data_zones;

	return 0;
}

/*
 * Read the first or second super block from disk.
 */
static int dmz_read_sb(struct dmz_metadata *zmd, struct dmz_sb *sb, int set)
{
	dmz_zmd_debug(zmd, "read superblock set %d dev %s block %llu",
		      set, sb->dev->name, sb->block);

	return dmz_rdwr_block(sb->dev, REQ_OP_READ,
			      sb->block, sb->mblk->page);
}

/*
 * Determine the position of the secondary super block on disk.
 * This is used only if a corruption of the primary super block
 * is detected.
 */
static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd)
{
	unsigned int zone_nr_blocks = zmd->zone_nr_blocks;
	struct dmz_mblock *mblk;
	unsigned int zone_id = zmd->sb[0].zone->id;
	int i;

	/* Allocate a block */
	mblk = dmz_alloc_mblock(zmd, 0);
	if (!mblk)
		return -ENOMEM;

	zmd->sb[1].mblk = mblk;
	zmd->sb[1].sb = mblk->data;

	/* Bad first super block: search for the second one */
	zmd->sb[1].block = zmd->sb[0].block + zone_nr_blocks;
	zmd->sb[1].zone = dmz_get(zmd, zone_id + 1);
	zmd->sb[1].dev = zmd->sb[0].dev;
	for (i = 1; i < zmd->nr_rnd_zones; i++) {
		if (dmz_read_sb(zmd, &zmd->sb[1], 1) != 0)
			break;
		if (le32_to_cpu(zmd->sb[1].sb->magic) == DMZ_MAGIC)
			return 0;
		zmd->sb[1].block += zone_nr_blocks;
		zmd->sb[1].zone = dmz_get(zmd, zone_id + i);
	}

	dmz_free_mblock(zmd, mblk);
	zmd->sb[1].mblk = NULL;
	zmd->sb[1].zone = NULL;
	zmd->sb[1].dev = NULL;

	return -EIO;
}
/*
 * Read a super block from disk.
 */
static int dmz_get_sb(struct dmz_metadata *zmd, struct dmz_sb *sb, int set)
{
	struct dmz_mblock *mblk;
	int ret;

	/* Allocate a block */
	mblk = dmz_alloc_mblock(zmd, 0);
	if (!mblk)
		return -ENOMEM;

	sb->mblk = mblk;
	sb->sb = mblk->data;

	/* Read super block */
	ret = dmz_read_sb(zmd, sb, set);
	if (ret) {
		dmz_free_mblock(zmd, mblk);
		sb->mblk = NULL;
		return ret;
	}

	return 0;
}

/*
 * Recover a metadata set.
 */
static int dmz_recover_mblocks(struct dmz_metadata *zmd, unsigned int dst_set)
{
	unsigned int src_set = dst_set ^ 0x1;
	struct page *page;
	int i, ret;

	dmz_dev_warn(zmd->sb[dst_set].dev,
		     "Metadata set %u invalid: recovering", dst_set);

	if (dst_set == 0)
		zmd->sb[0].block = dmz_start_block(zmd, zmd->sb[0].zone);
	else
		zmd->sb[1].block = dmz_start_block(zmd, zmd->sb[1].zone);

	page = alloc_page(GFP_NOIO);
	if (!page)
		return -ENOMEM;

	/* Copy metadata blocks */
	for (i = 1; i < zmd->nr_meta_blocks; i++) {
		ret = dmz_rdwr_block(zmd->sb[src_set].dev, REQ_OP_READ,
				     zmd->sb[src_set].block + i, page);
		if (ret)
			goto out;
		ret = dmz_rdwr_block(zmd->sb[dst_set].dev, REQ_OP_WRITE,
				     zmd->sb[dst_set].block + i, page);
		if (ret)
			goto out;
	}

	/* Finalize with the super block */
	if (!zmd->sb[dst_set].mblk) {
		zmd->sb[dst_set].mblk = dmz_alloc_mblock(zmd, 0);
		if (!zmd->sb[dst_set].mblk) {
			ret = -ENOMEM;
			goto out;
		}
		zmd->sb[dst_set].sb = zmd->sb[dst_set].mblk->data;
	}

	ret = dmz_write_sb(zmd, dst_set);
out:
	__free_pages(page, 0);

	return ret;
}
/*
 * Get super block from disk.
 */
static int dmz_load_sb(struct dmz_metadata *zmd)
{
	bool sb_good[2] = {false, false};
	u64 sb_gen[2] = {0, 0};
	int ret;

	if (!zmd->sb[0].zone) {
		dmz_zmd_err(zmd, "Primary super block zone not set");
		return -ENXIO;
	}

	/* Read and check the primary super block */
	zmd->sb[0].block = dmz_start_block(zmd, zmd->sb[0].zone);
	zmd->sb[0].dev = zmd->sb[0].zone->dev;
	ret = dmz_get_sb(zmd, &zmd->sb[0], 0);
	if (ret) {
		dmz_dev_err(zmd->sb[0].dev, "Read primary super block failed");
		return ret;
	}

	ret = dmz_check_sb(zmd, &zmd->sb[0], false);

	/* Read and check secondary super block */
	if (ret == 0) {
		sb_good[0] = true;
		if (!zmd->sb[1].zone) {
			unsigned int zone_id =
				zmd->sb[0].zone->id + zmd->nr_meta_zones;

			zmd->sb[1].zone = dmz_get(zmd, zone_id);
		}
		zmd->sb[1].block = dmz_start_block(zmd, zmd->sb[1].zone);
		zmd->sb[1].dev = zmd->sb[0].dev;
		ret = dmz_get_sb(zmd, &zmd->sb[1], 1);
	} else
		ret = dmz_lookup_secondary_sb(zmd);

	if (ret) {
		dmz_dev_err(zmd->sb[1].dev, "Read secondary super block failed");
		return ret;
	}

	ret = dmz_check_sb(zmd, &zmd->sb[1], false);
	if (ret == 0)
		sb_good[1] = true;

	/* Use highest generation sb first */
	if (!sb_good[0] && !sb_good[1]) {
		dmz_zmd_err(zmd, "No valid super block found");
		return -EIO;
	}

	if (sb_good[0])
		sb_gen[0] = le64_to_cpu(zmd->sb[0].sb->gen);
	else {
		ret = dmz_recover_mblocks(zmd, 0);
		if (ret) {
			dmz_dev_err(zmd->sb[0].dev,
				    "Recovery of superblock 0 failed");
			return -EIO;
		}
	}

	if (sb_good[1])
		sb_gen[1] = le64_to_cpu(zmd->sb[1].sb->gen);
	else {
		ret = dmz_recover_mblocks(zmd, 1);

		if (ret) {
			dmz_dev_err(zmd->sb[1].dev,
				    "Recovery of superblock 1 failed");
			return -EIO;
		}
	}

	if (sb_gen[0] >= sb_gen[1]) {
		zmd->sb_gen = sb_gen[0];
		zmd->mblk_primary = 0;
	} else {
		zmd->sb_gen = sb_gen[1];
		zmd->mblk_primary = 1;
	}

	dmz_dev_debug(zmd->sb[zmd->mblk_primary].dev,
		      "Using super block %u (gen %llu)",
		      zmd->mblk_primary, zmd->sb_gen);

	if (zmd->sb_version > 1) {
		int i;
		struct dmz_sb *sb;

		sb = kzalloc(sizeof(struct dmz_sb), GFP_KERNEL);
		if (!sb)
			return -ENOMEM;
		for (i = 1; i < zmd->nr_devs; i++) {
			sb->block = 0;
			sb->zone = dmz_get(zmd, zmd->dev[i].zone_offset);
			sb->dev = &zmd->dev[i];
			if (!dmz_is_meta(sb->zone)) {
				dmz_dev_err(sb->dev,
					    "Tertiary super block zone %u not marked as metadata zone",
					    sb->zone->id);
				ret = -EINVAL;
				goto out_kfree;
			}
			ret = dmz_get_sb(zmd, sb, i + 1);
			if (ret) {
				dmz_dev_err(sb->dev,
					    "Read tertiary super block failed");
				dmz_free_mblock(zmd, sb->mblk);
				goto out_kfree;
			}
			ret = dmz_check_sb(zmd, sb, true);
			dmz_free_mblock(zmd, sb->mblk);
			if (ret == -EINVAL)
				goto out_kfree;
		}
out_kfree:
		kfree(sb);
	}
	return ret;
}
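/*
 * A short scenario for the selection logic above (numbers are
 * illustrative): if set 0 carries generation 41 and set 1 carries
 * generation 42, a flush must have completed the log phase on set 1 but
 * crashed before updating set 0, so set 1 becomes mblk_primary. If one
 * set fails its checks entirely, it is rebuilt from the other via
 * dmz_recover_mblocks() and its generation stays 0, which keeps the
 * surviving set primary.
 */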
/*
 * Initialize a zone descriptor.
 */
static int dmz_init_zone(struct blk_zone *blkz, unsigned int num, void *data)
{
	struct dmz_dev *dev = data;
	struct dmz_metadata *zmd = dev->metadata;
	int idx = num + dev->zone_offset;
	struct dm_zone *zone;

	zone = dmz_insert(zmd, idx, dev);
	if (IS_ERR(zone))
		return PTR_ERR(zone);

	if (blkz->len != zmd->zone_nr_sectors) {
		if (zmd->sb_version > 1) {
			/* Ignore a possible runt (smaller) zone */
			set_bit(DMZ_OFFLINE, &zone->flags);
			return 0;
		} else if (blkz->start + blkz->len == dev->capacity)
			return 0;
		return -ENXIO;
	}

	/*
	 * Devices that have zones with a capacity smaller than the zone size
	 * (e.g. NVMe zoned namespaces) are not supported.
	 */
	if (blkz->capacity != blkz->len)
		return -ENXIO;

	switch (blkz->type) {
	case BLK_ZONE_TYPE_CONVENTIONAL:
		set_bit(DMZ_RND, &zone->flags);
		break;
	case BLK_ZONE_TYPE_SEQWRITE_REQ:
	case BLK_ZONE_TYPE_SEQWRITE_PREF:
		set_bit(DMZ_SEQ, &zone->flags);
		break;
	default:
		return -ENXIO;
	}

	if (dmz_is_rnd(zone))
		zone->wp_block = 0;
	else
		zone->wp_block = dmz_sect2blk(blkz->wp - blkz->start);

	if (blkz->cond == BLK_ZONE_COND_OFFLINE)
		set_bit(DMZ_OFFLINE, &zone->flags);
	else if (blkz->cond == BLK_ZONE_COND_READONLY)
		set_bit(DMZ_READ_ONLY, &zone->flags);
	else {
		zmd->nr_useable_zones++;
		if (dmz_is_rnd(zone)) {
			zmd->nr_rnd_zones++;
			if (zmd->nr_devs == 1 && !zmd->sb[0].zone) {
				/* Primary super block zone */
				zmd->sb[0].zone = zone;
			}
		}
		if (zmd->nr_devs > 1 && num == 0) {
			/*
			 * Tertiary superblock zones are always at the
			 * start of the zoned devices, so mark them
			 * as metadata zones.
			 */
			set_bit(DMZ_META, &zone->flags);
		}
	}
	return 0;
}

static int dmz_emulate_zones(struct dmz_metadata *zmd, struct dmz_dev *dev)
{
	int idx;
	sector_t zone_offset = 0;

	for (idx = 0; idx < dev->nr_zones; idx++) {
		struct dm_zone *zone;

		zone = dmz_insert(zmd, idx, dev);
		if (IS_ERR(zone))
			return PTR_ERR(zone);
		set_bit(DMZ_CACHE, &zone->flags);
		zone->wp_block = 0;
		zmd->nr_cache_zones++;
		zmd->nr_useable_zones++;
		if (dev->capacity - zone_offset < zmd->zone_nr_sectors) {
			/* Disable runt zone */
			set_bit(DMZ_OFFLINE, &zone->flags);
			break;
		}
		zone_offset += zmd->zone_nr_sectors;
	}
	return 0;
}

/*
 * Free zone descriptors.
 */
static void dmz_drop_zones(struct dmz_metadata *zmd)
{
	int idx;

	for (idx = 0; idx < zmd->nr_zones; idx++) {
		struct dm_zone *zone = xa_load(&zmd->zones, idx);

		kfree(zone);
		xa_erase(&zmd->zones, idx);
	}
	xa_destroy(&zmd->zones);
}
/*
 * Allocate and initialize zone descriptors using the zone
 * information from disk.
 */
static int dmz_init_zones(struct dmz_metadata *zmd)
{
	int i, ret;
	struct dmz_dev *zoned_dev = &zmd->dev[0];

	/* Init */
	zmd->zone_nr_sectors = zmd->dev[0].zone_nr_sectors;
	zmd->zone_nr_sectors_shift = ilog2(zmd->zone_nr_sectors);
	zmd->zone_nr_blocks = dmz_sect2blk(zmd->zone_nr_sectors);
	zmd->zone_nr_blocks_shift = ilog2(zmd->zone_nr_blocks);
	zmd->zone_bitmap_size = zmd->zone_nr_blocks >> 3;
	zmd->zone_nr_bitmap_blocks =
		max_t(sector_t, 1, zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT);
	zmd->zone_bits_per_mblk = min_t(sector_t, zmd->zone_nr_blocks,
					DMZ_BLOCK_SIZE_BITS);

	/* Allocate zone array */
	zmd->nr_zones = 0;
	for (i = 0; i < zmd->nr_devs; i++) {
		struct dmz_dev *dev = &zmd->dev[i];

		dev->metadata = zmd;
		zmd->nr_zones += dev->nr_zones;

		atomic_set(&dev->unmap_nr_rnd, 0);
		INIT_LIST_HEAD(&dev->unmap_rnd_list);
		INIT_LIST_HEAD(&dev->map_rnd_list);

		atomic_set(&dev->unmap_nr_seq, 0);
		INIT_LIST_HEAD(&dev->unmap_seq_list);
		INIT_LIST_HEAD(&dev->map_seq_list);
	}

	if (!zmd->nr_zones) {
		DMERR("(%s): No zones found", zmd->devname);
		return -ENXIO;
	}
	xa_init(&zmd->zones);

	DMDEBUG("(%s): Using %zu B for zone information",
		zmd->devname, sizeof(struct dm_zone) * zmd->nr_zones);

	if (zmd->nr_devs > 1) {
		ret = dmz_emulate_zones(zmd, &zmd->dev[0]);
		if (ret < 0) {
			DMDEBUG("(%s): Failed to emulate zones, error %d",
				zmd->devname, ret);
			dmz_drop_zones(zmd);
			return ret;
		}

		/*
		 * Primary superblock zone is always at zone 0 when multiple
		 * drives are present.
		 */
		zmd->sb[0].zone = dmz_get(zmd, 0);

		for (i = 1; i < zmd->nr_devs; i++) {
			zoned_dev = &zmd->dev[i];

			ret = blkdev_report_zones(zoned_dev->bdev, 0,
						  BLK_ALL_ZONES,
						  dmz_init_zone, zoned_dev);
			if (ret < 0) {
				DMDEBUG("(%s): Failed to report zones, error %d",
					zmd->devname, ret);
				dmz_drop_zones(zmd);
				return ret;
			}
		}
		return 0;
	}

	/*
	 * Get zone information and initialize zone descriptors. At the same
	 * time, determine where the super block should be: first block of the
	 * first randomly writable zone.
	 */
	ret = blkdev_report_zones(zoned_dev->bdev, 0, BLK_ALL_ZONES,
				  dmz_init_zone, zoned_dev);
	if (ret < 0) {
		DMDEBUG("(%s): Failed to report zones, error %d",
			zmd->devname, ret);
		dmz_drop_zones(zmd);
		return ret;
	}

	return 0;
}

static int dmz_update_zone_cb(struct blk_zone *blkz, unsigned int idx,
			      void *data)
{
	struct dm_zone *zone = data;

	clear_bit(DMZ_OFFLINE, &zone->flags);
	clear_bit(DMZ_READ_ONLY, &zone->flags);
	if (blkz->cond == BLK_ZONE_COND_OFFLINE)
		set_bit(DMZ_OFFLINE, &zone->flags);
	else if (blkz->cond == BLK_ZONE_COND_READONLY)
		set_bit(DMZ_READ_ONLY, &zone->flags);

	if (dmz_is_seq(zone))
		zone->wp_block = dmz_sect2blk(blkz->wp - blkz->start);
	else
		zone->wp_block = 0;
	return 0;
}

/*
 * Update zone information.
 */
static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
{
	struct dmz_dev *dev = zone->dev;
	unsigned int noio_flag;
	int ret;

	if (dev->flags & DMZ_BDEV_REGULAR)
		return 0;

	/*
	 * Get zone information from disk. Since blkdev_report_zones() uses
	 * GFP_KERNEL by default for memory allocations, set the per-task
	 * PF_MEMALLOC_NOIO flag so that all allocations are done as if
	 * GFP_NOIO was specified.
	 */
	noio_flag = memalloc_noio_save();
	ret = blkdev_report_zones(dev->bdev, dmz_start_sect(zmd, zone), 1,
				  dmz_update_zone_cb, zone);
	memalloc_noio_restore(noio_flag);

	if (ret == 0)
		ret = -EIO;
	if (ret < 0) {
		dmz_dev_err(dev, "Get zone %u report failed",
			    zone->id);
		dmz_check_bdev(dev);
		return ret;
	}

	return 0;
}

/*
 * Check a zone write pointer position when the zone is marked
 * with the sequential write error flag.
 */
static int dmz_handle_seq_write_err(struct dmz_metadata *zmd,
				    struct dm_zone *zone)
{
	struct dmz_dev *dev = zone->dev;
	unsigned int wp = 0;
	int ret;

	wp = zone->wp_block;
	ret = dmz_update_zone(zmd, zone);
	if (ret)
		return ret;

	dmz_dev_warn(dev, "Processing zone %u write error (zone wp %u/%u)",
		     zone->id, zone->wp_block, wp);

	if (zone->wp_block < wp) {
		dmz_invalidate_blocks(zmd, zone, zone->wp_block,
				      wp - zone->wp_block);
	}

	return 0;
}

/*
 * Reset a zone write pointer.
 */
static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
{
	int ret;

	/*
	 * Ignore offline zones, read only zones,
	 * and conventional zones.
	 */
	if (dmz_is_offline(zone) ||
	    dmz_is_readonly(zone) ||
	    dmz_is_rnd(zone))
		return 0;

	if (!dmz_is_empty(zone) || dmz_seq_write_err(zone)) {
		struct dmz_dev *dev = zone->dev;

		ret = blkdev_zone_mgmt(dev->bdev, REQ_OP_ZONE_RESET,
				       dmz_start_sect(zmd, zone),
				       zmd->zone_nr_sectors, GFP_NOIO);
		if (ret) {
			dmz_dev_err(dev, "Reset zone %u failed %d",
				    zone->id, ret);
			return ret;
		}
	}

	/* Clear write error bit and rewind write pointer position */
	clear_bit(DMZ_SEQ_WRITE_ERR, &zone->flags);
	zone->wp_block = 0;

	return 0;
}

static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone);
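/*
 * Worked example for dmz_handle_seq_write_err() above (block numbers are
 * illustrative): if the cached write pointer says block 100 but the
 * device reports the zone write pointer back at block 80 after a failed
 * write, blocks 80..99 can no longer hold valid data, so
 * dmz_invalidate_blocks(zmd, zone, 80, 20) marks them invalid before the
 * write pointer is reused.
 */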
/*
 * Initialize chunk mapping.
 */
static int dmz_load_mapping(struct dmz_metadata *zmd)
{
	struct dm_zone *dzone, *bzone;
	struct dmz_mblock *dmap_mblk = NULL;
	struct dmz_map *dmap;
	unsigned int i = 0, e = 0, chunk = 0;
	unsigned int dzone_id;
	unsigned int bzone_id;

	/* Metadata block array for the chunk mapping table */
	zmd->map_mblk = kcalloc(zmd->nr_map_blocks,
				sizeof(struct dmz_mblock *), GFP_KERNEL);
	if (!zmd->map_mblk)
		return -ENOMEM;

	/* Get chunk mapping table blocks and initialize zone mapping */
	while (chunk < zmd->nr_chunks) {
		if (!dmap_mblk) {
			/* Get mapping block */
			dmap_mblk = dmz_get_mblock(zmd, i + 1);
			if (IS_ERR(dmap_mblk))
				return PTR_ERR(dmap_mblk);
			zmd->map_mblk[i] = dmap_mblk;
			dmap = (struct dmz_map *)dmap_mblk->data;
			i++;
			e = 0;
		}

		/* Check data zone */
		dzone_id = le32_to_cpu(dmap[e].dzone_id);
		if (dzone_id == DMZ_MAP_UNMAPPED)
			goto next;

		if (dzone_id >= zmd->nr_zones) {
			dmz_zmd_err(zmd, "Chunk %u mapping: invalid data zone ID %u",
				    chunk, dzone_id);
			return -EIO;
		}

		dzone = dmz_get(zmd, dzone_id);
		if (!dzone) {
			dmz_zmd_err(zmd, "Chunk %u mapping: data zone %u not present",
				    chunk, dzone_id);
			return -EIO;
		}
		set_bit(DMZ_DATA, &dzone->flags);
		dzone->chunk = chunk;
		dmz_get_zone_weight(zmd, dzone);

		if (dmz_is_cache(dzone))
			list_add_tail(&dzone->link, &zmd->map_cache_list);
		else if (dmz_is_rnd(dzone))
			list_add_tail(&dzone->link, &dzone->dev->map_rnd_list);
		else
			list_add_tail(&dzone->link, &dzone->dev->map_seq_list);

		/* Check buffer zone */
		bzone_id = le32_to_cpu(dmap[e].bzone_id);
		if (bzone_id == DMZ_MAP_UNMAPPED)
			goto next;

		if (bzone_id >= zmd->nr_zones) {
			dmz_zmd_err(zmd, "Chunk %u mapping: invalid buffer zone ID %u",
				    chunk, bzone_id);
			return -EIO;
		}

		bzone = dmz_get(zmd, bzone_id);
		if (!bzone) {
			dmz_zmd_err(zmd, "Chunk %u mapping: buffer zone %u not present",
				    chunk, bzone_id);
			return -EIO;
		}
		if (!dmz_is_rnd(bzone) && !dmz_is_cache(bzone)) {
			dmz_zmd_err(zmd, "Chunk %u mapping: invalid buffer zone %u",
				    chunk, bzone_id);
			return -EIO;
		}

		set_bit(DMZ_DATA, &bzone->flags);
		set_bit(DMZ_BUF, &bzone->flags);
		bzone->chunk = chunk;
		bzone->bzone = dzone;
		dzone->bzone = bzone;
		dmz_get_zone_weight(zmd, bzone);
		if (dmz_is_cache(bzone))
			list_add_tail(&bzone->link, &zmd->map_cache_list);
		else
			list_add_tail(&bzone->link, &bzone->dev->map_rnd_list);
next:
		chunk++;
		e++;
		if (e >= DMZ_MAP_ENTRIES)
			dmap_mblk = NULL;
	}

	/*
	 * At this point, only meta zones and mapped data zones were
	 * fully initialized. All remaining zones are unmapped data
	 * zones. Finish initializing those here.
	 */
	for (i = 0; i < zmd->nr_zones; i++) {
		dzone = dmz_get(zmd, i);
		if (!dzone)
			continue;
		if (dmz_is_meta(dzone))
			continue;
		if (dmz_is_offline(dzone))
			continue;

		if (dmz_is_cache(dzone))
			zmd->nr_cache++;
		else if (dmz_is_rnd(dzone))
			dzone->dev->nr_rnd++;
		else
			dzone->dev->nr_seq++;

		if (dmz_is_data(dzone)) {
			/* Already initialized */
			continue;
		}

		/* Unmapped data zone */
		set_bit(DMZ_DATA, &dzone->flags);
		dzone->chunk = DMZ_MAP_UNMAPPED;
		if (dmz_is_cache(dzone)) {
			list_add_tail(&dzone->link, &zmd->unmap_cache_list);
			atomic_inc(&zmd->unmap_nr_cache);
		} else if (dmz_is_rnd(dzone)) {
			list_add_tail(&dzone->link,
				      &dzone->dev->unmap_rnd_list);
			atomic_inc(&dzone->dev->unmap_nr_rnd);
		} else if (atomic_read(&zmd->nr_reserved_seq_zones) < zmd->nr_reserved_seq) {
			list_add_tail(&dzone->link, &zmd->reserved_seq_zones_list);
			set_bit(DMZ_RESERVED, &dzone->flags);
			atomic_inc(&zmd->nr_reserved_seq_zones);
			dzone->dev->nr_seq--;
		} else {
			list_add_tail(&dzone->link,
				      &dzone->dev->unmap_seq_list);
			atomic_inc(&dzone->dev->unmap_nr_seq);
		}
	}

	return 0;
}

/*
 * Set a data chunk mapping.
 */
static void dmz_set_chunk_mapping(struct dmz_metadata *zmd, unsigned int chunk,
				  unsigned int dzone_id, unsigned int bzone_id)
{
	struct dmz_mblock *dmap_mblk = zmd->map_mblk[chunk >> DMZ_MAP_ENTRIES_SHIFT];
	struct dmz_map *dmap = (struct dmz_map *)dmap_mblk->data;
	int map_idx = chunk & DMZ_MAP_ENTRIES_MASK;

	dmap[map_idx].dzone_id = cpu_to_le32(dzone_id);
	dmap[map_idx].bzone_id = cpu_to_le32(bzone_id);
	dmz_dirty_mblock(zmd, dmap_mblk);
}

/*
 * The list of mapped zones is maintained in LRU order.
 * This rotates a zone to the end of its map list.
 */
static void __dmz_lru_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
{
	if (list_empty(&zone->link))
		return;

	list_del_init(&zone->link);
	if (dmz_is_seq(zone)) {
		/* LRU rotate sequential zone */
		list_add_tail(&zone->link, &zone->dev->map_seq_list);
	} else if (dmz_is_cache(zone)) {
		/* LRU rotate cache zone */
		list_add_tail(&zone->link, &zmd->map_cache_list);
	} else {
		/* LRU rotate random zone */
		list_add_tail(&zone->link, &zone->dev->map_rnd_list);
	}
}

/*
 * The list of mapped random zones is maintained
 * in LRU order. This rotates a zone to the end of the list.
 */
static void dmz_lru_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
{
	__dmz_lru_zone(zmd, zone);
	if (zone->bzone)
		__dmz_lru_zone(zmd, zone->bzone);
}

/*
 * Wait for any zone to be freed.
 */
static void dmz_wait_for_free_zones(struct dmz_metadata *zmd)
{
	DEFINE_WAIT(wait);

	prepare_to_wait(&zmd->free_wq, &wait, TASK_UNINTERRUPTIBLE);
	dmz_unlock_map(zmd);
	dmz_unlock_metadata(zmd);

	io_schedule_timeout(HZ);

	dmz_lock_metadata(zmd);
	dmz_lock_map(zmd);
	finish_wait(&zmd->free_wq, &wait);
}
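/*
 * Index arithmetic used by dmz_set_chunk_mapping() above, with an
 * illustrative chunk number: with 512 entries per mapping block
 * (DMZ_MAP_ENTRIES), chunk 1000 lives in mapping block 1000 >> 9 = 1 at
 * entry 1000 & 511 = 488. The same chunk >> shift / chunk & mask split
 * is used by dmz_get_chunk_mapping() below.
 */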
/*
 * Lock a zone for reclaim (set the zone RECLAIM bit).
 * Returns 0 if the zone cannot be locked or if it is already locked,
 * and 1 otherwise.
 */
int dmz_lock_zone_reclaim(struct dm_zone *zone)
{
	/* Active zones cannot be reclaimed */
	if (dmz_is_active(zone))
		return 0;

	return !test_and_set_bit(DMZ_RECLAIM, &zone->flags);
}

/*
 * Clear a zone reclaim flag.
 */
void dmz_unlock_zone_reclaim(struct dm_zone *zone)
{
	WARN_ON(dmz_is_active(zone));
	WARN_ON(!dmz_in_reclaim(zone));

	clear_bit_unlock(DMZ_RECLAIM, &zone->flags);
	smp_mb__after_atomic();
	wake_up_bit(&zone->flags, DMZ_RECLAIM);
}

/*
 * Wait for a zone reclaim to complete.
 */
static void dmz_wait_for_reclaim(struct dmz_metadata *zmd, struct dm_zone *zone)
{
	dmz_unlock_map(zmd);
	dmz_unlock_metadata(zmd);
	set_bit(DMZ_RECLAIM_TERMINATE, &zone->flags);
	wait_on_bit_timeout(&zone->flags, DMZ_RECLAIM, TASK_UNINTERRUPTIBLE, HZ);
	clear_bit(DMZ_RECLAIM_TERMINATE, &zone->flags);
	dmz_lock_metadata(zmd);
	dmz_lock_map(zmd);
}

/*
 * Select a cache or random write zone for reclaim.
 */
static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd,
						    unsigned int idx, bool idle)
{
	struct dm_zone *dzone = NULL;
	struct dm_zone *zone, *maxw_z = NULL;
	struct list_head *zone_list;

	/* If we have cache zones select from the cache zone list */
	if (zmd->nr_cache) {
		zone_list = &zmd->map_cache_list;
		/* Try to reclaim random zones, too, when idle */
		if (idle && list_empty(zone_list))
			zone_list = &zmd->dev[idx].map_rnd_list;
	} else
		zone_list = &zmd->dev[idx].map_rnd_list;

	/*
	 * Find the buffer zone with the heaviest weight or the first (oldest)
	 * data zone that can be reclaimed.
	 */
	list_for_each_entry(zone, zone_list, link) {
		if (dmz_is_buf(zone)) {
			dzone = zone->bzone;
			if (dmz_is_rnd(dzone) && dzone->dev->dev_idx != idx)
				continue;
			if (!maxw_z || maxw_z->weight < dzone->weight)
				maxw_z = dzone;
		} else {
			dzone = zone;
			if (dmz_lock_zone_reclaim(dzone))
				return dzone;
		}
	}

	if (maxw_z && dmz_lock_zone_reclaim(maxw_z))
		return maxw_z;

	/*
	 * If we come here, none of the zones inspected could be locked for
	 * reclaim. Try again, being more aggressive, that is, find the
	 * first zone that can be reclaimed regardless of its weight.
	 */
	list_for_each_entry(zone, zone_list, link) {
		if (dmz_is_buf(zone)) {
			dzone = zone->bzone;
			if (dmz_is_rnd(dzone) && dzone->dev->dev_idx != idx)
				continue;
		} else
			dzone = zone;
		if (dmz_lock_zone_reclaim(dzone))
			return dzone;
	}

	return NULL;
}

/*
 * Select a buffered sequential zone for reclaim.
 */
static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd,
						    unsigned int idx)
{
	struct dm_zone *zone;

	list_for_each_entry(zone, &zmd->dev[idx].map_seq_list, link) {
		if (!zone->bzone)
			continue;
		if (dmz_lock_zone_reclaim(zone))
			return zone;
	}

	return NULL;
}
/*
 * Select a zone for reclaim.
 */
struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd,
					 unsigned int dev_idx, bool idle)
{
	struct dm_zone *zone = NULL;

	/*
	 * Search for a zone candidate to reclaim: 2 cases are possible.
	 * (1) There are no free sequential zones. Then a random data zone
	 *     cannot be reclaimed. So choose a sequential zone to reclaim so
	 *     that afterward a random zone can be reclaimed.
	 * (2) At least one free sequential zone is available, then choose
	 *     the oldest random zone (data or buffer) that can be locked.
	 */
	dmz_lock_map(zmd);
	if (list_empty(&zmd->reserved_seq_zones_list))
		zone = dmz_get_seq_zone_for_reclaim(zmd, dev_idx);
	if (!zone)
		zone = dmz_get_rnd_zone_for_reclaim(zmd, dev_idx, idle);
	dmz_unlock_map(zmd);

	return zone;
}

/*
 * Get the zone mapping a chunk, if the chunk is mapped already.
 * If no mapping exists and the operation is WRITE, a zone is
 * allocated and used to map the chunk.
 * The zone returned will be set to the active state.
 */
struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd,
				      unsigned int chunk, int op)
{
	struct dmz_mblock *dmap_mblk = zmd->map_mblk[chunk >> DMZ_MAP_ENTRIES_SHIFT];
	struct dmz_map *dmap = (struct dmz_map *)dmap_mblk->data;
	int dmap_idx = chunk & DMZ_MAP_ENTRIES_MASK;
	unsigned int dzone_id;
	struct dm_zone *dzone = NULL;
	int ret = 0;
	int alloc_flags = zmd->nr_cache ? DMZ_ALLOC_CACHE : DMZ_ALLOC_RND;

	dmz_lock_map(zmd);
again:
	/* Get the chunk mapping */
	dzone_id = le32_to_cpu(dmap[dmap_idx].dzone_id);
	if (dzone_id == DMZ_MAP_UNMAPPED) {
		/*
		 * Reads and discards in unmapped chunks are fine. But for
		 * writes, we need a mapping, so get one.
		 */
		if (op != REQ_OP_WRITE)
			goto out;

		/* Allocate a random zone */
		dzone = dmz_alloc_zone(zmd, 0, alloc_flags);
		if (!dzone) {
			if (dmz_dev_is_dying(zmd)) {
				dzone = ERR_PTR(-EIO);
				goto out;
			}
			dmz_wait_for_free_zones(zmd);
			goto again;
		}

		dmz_map_zone(zmd, dzone, chunk);

	} else {
		/* The chunk is already mapped: get the mapping zone */
		dzone = dmz_get(zmd, dzone_id);
		if (!dzone) {
			dzone = ERR_PTR(-EIO);
			goto out;
		}
		if (dzone->chunk != chunk) {
			dzone = ERR_PTR(-EIO);
			goto out;
		}

		/* Repair write pointer if the sequential dzone has error */
		if (dmz_seq_write_err(dzone)) {
			ret = dmz_handle_seq_write_err(zmd, dzone);
			if (ret) {
				dzone = ERR_PTR(-EIO);
				goto out;
			}
			clear_bit(DMZ_SEQ_WRITE_ERR, &dzone->flags);
		}
	}

	/*
	 * If the zone is being reclaimed, the chunk mapping may change
	 * to a different zone. So wait for reclaim and retry. Otherwise,
	 * activate the zone (this will prevent reclaim from touching it).
	 */
	if (dmz_in_reclaim(dzone)) {
		dmz_wait_for_reclaim(zmd, dzone);
		goto again;
	}
	dmz_activate_zone(dzone);
	dmz_lru_zone(zmd, dzone);
out:
	dmz_unlock_map(zmd);

	return dzone;
}
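/*
 * Typical caller pattern for the two functions around this point (an
 * informal sketch of the I/O path): a BIO handler calls
 * dmz_get_chunk_mapping() to obtain and activate the data zone of the
 * target chunk, performs its reads and writes, and then drops the
 * activation with dmz_put_chunk_mapping() below, which also unmaps and
 * frees zones that no longer hold any valid block.
 */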

/*
 * Write and discard change the block validity of data zones and their buffer
 * zones. Check here that valid blocks are still present. If all blocks are
 * invalid, the zones can be unmapped on the fly without waiting for reclaim
 * to do it.
 */
void dmz_put_chunk_mapping(struct dmz_metadata *zmd, struct dm_zone *dzone)
{
	struct dm_zone *bzone;

	dmz_lock_map(zmd);

	bzone = dzone->bzone;
	if (bzone) {
		if (dmz_weight(bzone))
			dmz_lru_zone(zmd, bzone);
		else {
			/* Empty buffer zone: reclaim it */
			dmz_unmap_zone(zmd, bzone);
			dmz_free_zone(zmd, bzone);
			bzone = NULL;
		}
	}

	/* Deactivate the data zone */
	dmz_deactivate_zone(dzone);
	if (dmz_is_active(dzone) || bzone || dmz_weight(dzone))
		dmz_lru_zone(zmd, dzone);
	else {
		/* Unbuffered inactive empty data zone: reclaim it */
		dmz_unmap_zone(zmd, dzone);
		dmz_free_zone(zmd, dzone);
	}

	dmz_unlock_map(zmd);
}

/*
 * Allocate and map a random zone to buffer a chunk
 * already mapped to a sequential zone.
 */
struct dm_zone *dmz_get_chunk_buffer(struct dmz_metadata *zmd,
				     struct dm_zone *dzone)
{
	struct dm_zone *bzone;
	int alloc_flags = zmd->nr_cache ? DMZ_ALLOC_CACHE : DMZ_ALLOC_RND;

	dmz_lock_map(zmd);
again:
	bzone = dzone->bzone;
	if (bzone)
		goto out;

	/* Allocate a random zone */
	bzone = dmz_alloc_zone(zmd, 0, alloc_flags);
	if (!bzone) {
		if (dmz_dev_is_dying(zmd)) {
			bzone = ERR_PTR(-EIO);
			goto out;
		}
		dmz_wait_for_free_zones(zmd);
		goto again;
	}

	/* Update the chunk mapping */
	dmz_set_chunk_mapping(zmd, dzone->chunk, dzone->id, bzone->id);

	set_bit(DMZ_BUF, &bzone->flags);
	bzone->chunk = dzone->chunk;
	bzone->bzone = dzone;
	dzone->bzone = bzone;
	if (dmz_is_cache(bzone))
		list_add_tail(&bzone->link, &zmd->map_cache_list);
	else
		list_add_tail(&bzone->link, &bzone->dev->map_rnd_list);
out:
	dmz_unlock_map(zmd);

	return bzone;
}
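
/*
 * Illustrative sketch (assumed context, not driver code): roughly how the
 * dm-zoned target pairs the two helpers above around chunk I/O. The actual
 * block I/O submission is elided.
 */
static int __maybe_unused dmz_example_chunk_write(struct dmz_metadata *zmd,
						  unsigned int chunk)
{
	struct dm_zone *dzone;

	dzone = dmz_get_chunk_mapping(zmd, chunk, REQ_OP_WRITE);
	if (IS_ERR(dzone))
		return PTR_ERR(dzone);

	/* ... write to dzone, possibly via dmz_get_chunk_buffer() ... */

	dmz_put_chunk_mapping(zmd, dzone);
	return 0;
}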

/*
 * Get an unmapped (free) zone.
 * This must be called with the mapping lock held.
 */
struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned int dev_idx,
			       unsigned long flags)
{
	struct list_head *list;
	struct dm_zone *zone;
	int i;

	/* Schedule reclaim to ensure free zones are available */
	if (!(flags & DMZ_ALLOC_RECLAIM)) {
		for (i = 0; i < zmd->nr_devs; i++)
			dmz_schedule_reclaim(zmd->dev[i].reclaim);
	}

	i = 0;
again:
	if (flags & DMZ_ALLOC_CACHE)
		list = &zmd->unmap_cache_list;
	else if (flags & DMZ_ALLOC_RND)
		list = &zmd->dev[dev_idx].unmap_rnd_list;
	else
		list = &zmd->dev[dev_idx].unmap_seq_list;

	if (list_empty(list)) {
		/*
		 * No free zone: return NULL if this is not for reclaim.
		 */
		if (!(flags & DMZ_ALLOC_RECLAIM))
			return NULL;
		/*
		 * Try to allocate from other devices
		 */
		if (i < zmd->nr_devs) {
			dev_idx = (dev_idx + 1) % zmd->nr_devs;
			i++;
			goto again;
		}

		/*
		 * Fallback to the reserved sequential zones
		 */
		zone = list_first_entry_or_null(&zmd->reserved_seq_zones_list,
						struct dm_zone, link);
		if (zone) {
			list_del_init(&zone->link);
			atomic_dec(&zmd->nr_reserved_seq_zones);
		}
		return zone;
	}

	zone = list_first_entry(list, struct dm_zone, link);
	list_del_init(&zone->link);

	if (dmz_is_cache(zone))
		atomic_dec(&zmd->unmap_nr_cache);
	else if (dmz_is_rnd(zone))
		atomic_dec(&zone->dev->unmap_nr_rnd);
	else
		atomic_dec(&zone->dev->unmap_nr_seq);

	if (dmz_is_offline(zone)) {
		dmz_zmd_warn(zmd, "Zone %u is offline", zone->id);
		zone = NULL;
		goto again;
	}
	if (dmz_is_meta(zone)) {
		dmz_zmd_warn(zmd, "Zone %u has metadata", zone->id);
		zone = NULL;
		goto again;
	}
	return zone;
}

/*
 * Free a zone.
 * This must be called with the mapping lock held.
 */
void dmz_free_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
{
	/* If this is a sequential zone, reset it */
	if (dmz_is_seq(zone))
		dmz_reset_zone(zmd, zone);

	/* Return the zone to its type unmap list */
	if (dmz_is_cache(zone)) {
		list_add_tail(&zone->link, &zmd->unmap_cache_list);
		atomic_inc(&zmd->unmap_nr_cache);
	} else if (dmz_is_rnd(zone)) {
		list_add_tail(&zone->link, &zone->dev->unmap_rnd_list);
		atomic_inc(&zone->dev->unmap_nr_rnd);
	} else if (dmz_is_reserved(zone)) {
		list_add_tail(&zone->link, &zmd->reserved_seq_zones_list);
		atomic_inc(&zmd->nr_reserved_seq_zones);
	} else {
		list_add_tail(&zone->link, &zone->dev->unmap_seq_list);
		atomic_inc(&zone->dev->unmap_nr_seq);
	}

	wake_up_all(&zmd->free_wq);
}

/*
 * Map a chunk to a zone.
 * This must be called with the mapping lock held.
 */
void dmz_map_zone(struct dmz_metadata *zmd, struct dm_zone *dzone,
		  unsigned int chunk)
{
	/* Set the chunk mapping */
	dmz_set_chunk_mapping(zmd, chunk, dzone->id,
			      DMZ_MAP_UNMAPPED);
	dzone->chunk = chunk;
	if (dmz_is_cache(dzone))
		list_add_tail(&dzone->link, &zmd->map_cache_list);
	else if (dmz_is_rnd(dzone))
		list_add_tail(&dzone->link, &dzone->dev->map_rnd_list);
	else
		list_add_tail(&dzone->link, &dzone->dev->map_seq_list);
}
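
/*
 * Allocation fallback illustration (hypothetical two-device setup): a call
 * such as dmz_alloc_zone(zmd, 0, DMZ_ALLOC_RND | DMZ_ALLOC_RECLAIM) scans
 * device 0's unmapped random list, cycles through the other devices'
 * lists, and finally falls back to the reserved sequential zones. Without
 * DMZ_ALLOC_RECLAIM, it returns NULL as soon as the selected list is empty.
 */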

/*
 * Unmap a zone.
 * This must be called with the mapping lock held.
 */
void dmz_unmap_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
{
	unsigned int chunk = zone->chunk;
	unsigned int dzone_id;

	if (chunk == DMZ_MAP_UNMAPPED) {
		/* Already unmapped */
		return;
	}

	if (test_and_clear_bit(DMZ_BUF, &zone->flags)) {
		/*
		 * Unmapping the chunk buffer zone: clear only
		 * the chunk buffer mapping
		 */
		dzone_id = zone->bzone->id;
		zone->bzone->bzone = NULL;
		zone->bzone = NULL;

	} else {
		/*
		 * Unmapping the chunk data zone: the zone must
		 * not be buffered.
		 */
		if (WARN_ON(zone->bzone)) {
			zone->bzone->bzone = NULL;
			zone->bzone = NULL;
		}
		dzone_id = DMZ_MAP_UNMAPPED;
	}

	dmz_set_chunk_mapping(zmd, chunk, dzone_id, DMZ_MAP_UNMAPPED);

	zone->chunk = DMZ_MAP_UNMAPPED;
	list_del_init(&zone->link);
}

/*
 * Set @nr_bits bits in @bitmap starting from @bit.
 * Return the number of bits changed from 0 to 1.
 */
static unsigned int dmz_set_bits(unsigned long *bitmap,
				 unsigned int bit, unsigned int nr_bits)
{
	unsigned long *addr;
	unsigned int end = bit + nr_bits;
	unsigned int n = 0;

	while (bit < end) {
		if (((bit & (BITS_PER_LONG - 1)) == 0) &&
		    ((end - bit) >= BITS_PER_LONG)) {
			/* Try to set the whole word at once */
			addr = bitmap + BIT_WORD(bit);
			if (*addr == 0) {
				*addr = ULONG_MAX;
				n += BITS_PER_LONG;
				bit += BITS_PER_LONG;
				continue;
			}
		}

		if (!test_and_set_bit(bit, bitmap))
			n++;
		bit++;
	}

	return n;
}

/*
 * Get the bitmap block storing the bit for chunk_block in zone.
 */
static struct dmz_mblock *dmz_get_bitmap(struct dmz_metadata *zmd,
					 struct dm_zone *zone,
					 sector_t chunk_block)
{
	sector_t bitmap_block = 1 + zmd->nr_map_blocks +
		(sector_t)(zone->id * zmd->zone_nr_bitmap_blocks) +
		(chunk_block >> DMZ_BLOCK_SHIFT_BITS);

	return dmz_get_mblock(zmd, bitmap_block);
}

/*
 * Copy the valid blocks bitmap of from_zone to the bitmap of to_zone.
 */
int dmz_copy_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone,
			  struct dm_zone *to_zone)
{
	struct dmz_mblock *from_mblk, *to_mblk;
	sector_t chunk_block = 0;

	/* Get the zones bitmap blocks */
	while (chunk_block < zmd->zone_nr_blocks) {
		from_mblk = dmz_get_bitmap(zmd, from_zone, chunk_block);
		if (IS_ERR(from_mblk))
			return PTR_ERR(from_mblk);
		to_mblk = dmz_get_bitmap(zmd, to_zone, chunk_block);
		if (IS_ERR(to_mblk)) {
			dmz_release_mblock(zmd, from_mblk);
			return PTR_ERR(to_mblk);
		}

		memcpy(to_mblk->data, from_mblk->data, DMZ_BLOCK_SIZE);
		dmz_dirty_mblock(zmd, to_mblk);

		dmz_release_mblock(zmd, to_mblk);
		dmz_release_mblock(zmd, from_mblk);

		chunk_block += zmd->zone_bits_per_mblk;
	}

	to_zone->weight = from_zone->weight;

	return 0;
}

/*
 * Merge the valid blocks bitmap of from_zone into the bitmap of to_zone,
 * starting from chunk_block.
 */
int dmz_merge_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone,
			   struct dm_zone *to_zone, sector_t chunk_block)
{
	unsigned int nr_blocks;
	int ret;

	/* Get the zones bitmap blocks */
	while (chunk_block < zmd->zone_nr_blocks) {
		/* Get a valid region from the source zone */
		ret = dmz_first_valid_block(zmd, from_zone, &chunk_block);
		if (ret <= 0)
			return ret;

		nr_blocks = ret;
		ret = dmz_validate_blocks(zmd, to_zone, chunk_block, nr_blocks);
		if (ret)
			return ret;

		chunk_block += nr_blocks;
	}

	return 0;
}
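
/*
 * Worked example of the bitmap block computation above (illustrative
 * numbers, assuming 4KB metadata blocks and thus 32768 bits per bitmap
 * block): with 64 chunk map blocks and 2 bitmap blocks per zone, the bit
 * for block 40000 of zone 3 lives in metadata block
 * 1 + 64 + 3 * 2 + (40000 >> 15) = 72, where the leading 1 accounts for
 * the super block.
 */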

/*
 * Validate all the blocks in the range [block..block+nr_blocks-1].
 */
int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
			sector_t chunk_block, unsigned int nr_blocks)
{
	unsigned int count, bit, nr_bits;
	unsigned int zone_nr_blocks = zmd->zone_nr_blocks;
	struct dmz_mblock *mblk;
	unsigned int n = 0;

	dmz_zmd_debug(zmd, "=> VALIDATE zone %u, block %llu, %u blocks",
		      zone->id, (unsigned long long)chunk_block,
		      nr_blocks);

	WARN_ON(chunk_block + nr_blocks > zone_nr_blocks);

	while (nr_blocks) {
		/* Get bitmap block */
		mblk = dmz_get_bitmap(zmd, zone, chunk_block);
		if (IS_ERR(mblk))
			return PTR_ERR(mblk);

		/* Set bits */
		bit = chunk_block & DMZ_BLOCK_MASK_BITS;
		nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);

		count = dmz_set_bits((unsigned long *)mblk->data, bit, nr_bits);
		if (count) {
			dmz_dirty_mblock(zmd, mblk);
			n += count;
		}
		dmz_release_mblock(zmd, mblk);

		nr_blocks -= nr_bits;
		chunk_block += nr_bits;
	}

	if (likely(zone->weight + n <= zone_nr_blocks))
		zone->weight += n;
	else {
		dmz_zmd_warn(zmd, "Zone %u: weight %u should be <= %u",
			     zone->id, zone->weight,
			     zone_nr_blocks - n);
		zone->weight = zone_nr_blocks;
	}

	return 0;
}

/*
 * Clear nr_bits bits in bitmap starting from bit.
 * Return the number of bits cleared.
 */
static int dmz_clear_bits(unsigned long *bitmap, int bit, int nr_bits)
{
	unsigned long *addr;
	int end = bit + nr_bits;
	int n = 0;

	while (bit < end) {
		if (((bit & (BITS_PER_LONG - 1)) == 0) &&
		    ((end - bit) >= BITS_PER_LONG)) {
			/* Try to clear whole word at once */
			addr = bitmap + BIT_WORD(bit);
			if (*addr == ULONG_MAX) {
				*addr = 0;
				n += BITS_PER_LONG;
				bit += BITS_PER_LONG;
				continue;
			}
		}

		if (test_and_clear_bit(bit, bitmap))
			n++;
		bit++;
	}

	return n;
}

/*
 * Invalidate all the blocks in the range [block..block+nr_blocks-1].
 */
int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
			  sector_t chunk_block, unsigned int nr_blocks)
{
	unsigned int count, bit, nr_bits;
	struct dmz_mblock *mblk;
	unsigned int n = 0;

	dmz_zmd_debug(zmd, "=> INVALIDATE zone %u, block %llu, %u blocks",
		      zone->id, (u64)chunk_block, nr_blocks);

	WARN_ON(chunk_block + nr_blocks > zmd->zone_nr_blocks);

	while (nr_blocks) {
		/* Get bitmap block */
		mblk = dmz_get_bitmap(zmd, zone, chunk_block);
		if (IS_ERR(mblk))
			return PTR_ERR(mblk);

		/* Clear bits */
		bit = chunk_block & DMZ_BLOCK_MASK_BITS;
		nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);

		count = dmz_clear_bits((unsigned long *)mblk->data,
				       bit, nr_bits);
		if (count) {
			dmz_dirty_mblock(zmd, mblk);
			n += count;
		}
		dmz_release_mblock(zmd, mblk);

		nr_blocks -= nr_bits;
		chunk_block += nr_bits;
	}

	if (zone->weight >= n)
		zone->weight -= n;
	else {
		dmz_zmd_warn(zmd, "Zone %u: weight %u should be >= %u",
			     zone->id, zone->weight, n);
		zone->weight = 0;
	}

	return 0;
}
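
/*
 * Illustrative sketch (assumed names and context, not driver code): after
 * data for a block range of a chunk has been written to its buffer zone,
 * the range is validated there and invalidated in the data zone so that
 * exactly one copy stays valid.
 */
static int __maybe_unused dmz_example_switch_validity(struct dmz_metadata *zmd,
						      struct dm_zone *bzone,
						      struct dm_zone *dzone,
						      sector_t chunk_block,
						      unsigned int nr_blocks)
{
	int ret;

	/* Mark the blocks valid in the buffer zone... */
	ret = dmz_validate_blocks(zmd, bzone, chunk_block, nr_blocks);
	if (ret)
		return ret;

	/* ...and drop the now-stale copy in the data zone */
	return dmz_invalidate_blocks(zmd, dzone, chunk_block, nr_blocks);
}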

/*
 * Get a block bit value.
 */
static int dmz_test_block(struct dmz_metadata *zmd, struct dm_zone *zone,
			  sector_t chunk_block)
{
	struct dmz_mblock *mblk;
	int ret;

	WARN_ON(chunk_block >= zmd->zone_nr_blocks);

	/* Get bitmap block */
	mblk = dmz_get_bitmap(zmd, zone, chunk_block);
	if (IS_ERR(mblk))
		return PTR_ERR(mblk);

	/* Get offset */
	ret = test_bit(chunk_block & DMZ_BLOCK_MASK_BITS,
		       (unsigned long *) mblk->data) != 0;

	dmz_release_mblock(zmd, mblk);

	return ret;
}

/*
 * Return the number of blocks from chunk_block to the first block with a bit
 * value specified by set. Search at most nr_blocks blocks from chunk_block.
 */
static int dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone,
				 sector_t chunk_block, unsigned int nr_blocks,
				 int set)
{
	struct dmz_mblock *mblk;
	unsigned int bit, set_bit, nr_bits;
	unsigned int zone_bits = zmd->zone_bits_per_mblk;
	unsigned long *bitmap;
	int n = 0;

	WARN_ON(chunk_block + nr_blocks > zmd->zone_nr_blocks);

	while (nr_blocks) {
		/* Get bitmap block */
		mblk = dmz_get_bitmap(zmd, zone, chunk_block);
		if (IS_ERR(mblk))
			return PTR_ERR(mblk);

		/* Get offset */
		bitmap = (unsigned long *) mblk->data;
		bit = chunk_block & DMZ_BLOCK_MASK_BITS;
		nr_bits = min(nr_blocks, zone_bits - bit);
		if (set)
			set_bit = find_next_bit(bitmap, zone_bits, bit);
		else
			set_bit = find_next_zero_bit(bitmap, zone_bits, bit);
		dmz_release_mblock(zmd, mblk);

		n += set_bit - bit;
		if (set_bit < zone_bits)
			break;

		nr_blocks -= nr_bits;
		chunk_block += nr_bits;
	}

	return n;
}

/*
 * Test if chunk_block is valid. If it is, the number of consecutive
 * valid blocks from chunk_block will be returned.
 */
int dmz_block_valid(struct dmz_metadata *zmd, struct dm_zone *zone,
		    sector_t chunk_block)
{
	int valid;

	valid = dmz_test_block(zmd, zone, chunk_block);
	if (valid <= 0)
		return valid;

	/* The block is valid: get the number of valid blocks from block */
	return dmz_to_next_set_block(zmd, zone, chunk_block,
				     zmd->zone_nr_blocks - chunk_block, 0);
}

/*
 * Find the first valid block from @chunk_block in @zone.
 * If such a block is found, its number is returned using
 * @chunk_block and the total number of valid blocks from @chunk_block
 * is returned.
 */
int dmz_first_valid_block(struct dmz_metadata *zmd, struct dm_zone *zone,
			  sector_t *chunk_block)
{
	sector_t start_block = *chunk_block;
	int ret;

	ret = dmz_to_next_set_block(zmd, zone, start_block,
				    zmd->zone_nr_blocks - start_block, 1);
	if (ret < 0)
		return ret;

	start_block += ret;
	*chunk_block = start_block;

	return dmz_to_next_set_block(zmd, zone, start_block,
				     zmd->zone_nr_blocks - start_block, 0);
}
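
/*
 * Example of the search semantics (illustrative bitmap contents): if the
 * first set bit of a zone's bitmap is at block 10 and the next clear bit
 * is at block 14, then calling dmz_first_valid_block() with *chunk_block
 * == 5 sets *chunk_block to 10 and returns 4, the length of the valid
 * range starting there.
 */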

/*
 * Count the number of bits set starting from bit up to bit + nr_bits - 1.
 */
static int dmz_count_bits(void *bitmap, int bit, int nr_bits)
{
	unsigned long *addr;
	int end = bit + nr_bits;
	int n = 0;

	while (bit < end) {
		if (((bit & (BITS_PER_LONG - 1)) == 0) &&
		    ((end - bit) >= BITS_PER_LONG)) {
			addr = (unsigned long *)bitmap + BIT_WORD(bit);
			if (*addr == ULONG_MAX) {
				n += BITS_PER_LONG;
				bit += BITS_PER_LONG;
				continue;
			}
		}

		if (test_bit(bit, bitmap))
			n++;
		bit++;
	}

	return n;
}

/*
 * Get a zone weight.
 */
static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone)
{
	struct dmz_mblock *mblk;
	sector_t chunk_block = 0;
	unsigned int bit, nr_bits;
	unsigned int nr_blocks = zmd->zone_nr_blocks;
	void *bitmap;
	int n = 0;

	while (nr_blocks) {
		/* Get bitmap block */
		mblk = dmz_get_bitmap(zmd, zone, chunk_block);
		if (IS_ERR(mblk)) {
			n = 0;
			break;
		}

		/* Count bits in this block */
		bitmap = mblk->data;
		bit = chunk_block & DMZ_BLOCK_MASK_BITS;
		nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);
		n += dmz_count_bits(bitmap, bit, nr_bits);

		dmz_release_mblock(zmd, mblk);

		nr_blocks -= nr_bits;
		chunk_block += nr_bits;
	}

	zone->weight = n;
}

/*
 * Cleanup the zoned metadata resources.
 */
static void dmz_cleanup_metadata(struct dmz_metadata *zmd)
{
	struct rb_root *root;
	struct dmz_mblock *mblk, *next;
	int i;

	/* Release zone mapping resources */
	if (zmd->map_mblk) {
		for (i = 0; i < zmd->nr_map_blocks; i++)
			dmz_release_mblock(zmd, zmd->map_mblk[i]);
		kfree(zmd->map_mblk);
		zmd->map_mblk = NULL;
	}

	/* Release super blocks */
	for (i = 0; i < 2; i++) {
		if (zmd->sb[i].mblk) {
			dmz_free_mblock(zmd, zmd->sb[i].mblk);
			zmd->sb[i].mblk = NULL;
		}
	}

	/* Free cached blocks */
	while (!list_empty(&zmd->mblk_dirty_list)) {
		mblk = list_first_entry(&zmd->mblk_dirty_list,
					struct dmz_mblock, link);
		dmz_zmd_warn(zmd, "mblock %llu still in dirty list (ref %u)",
			     (u64)mblk->no, mblk->ref);
		list_del_init(&mblk->link);
		rb_erase(&mblk->node, &zmd->mblk_rbtree);
		dmz_free_mblock(zmd, mblk);
	}

	while (!list_empty(&zmd->mblk_lru_list)) {
		mblk = list_first_entry(&zmd->mblk_lru_list,
					struct dmz_mblock, link);
		list_del_init(&mblk->link);
		rb_erase(&mblk->node, &zmd->mblk_rbtree);
		dmz_free_mblock(zmd, mblk);
	}

	/* Sanity checks: the mblock rbtree should now be empty */
	root = &zmd->mblk_rbtree;
	rbtree_postorder_for_each_entry_safe(mblk, next, root, node) {
		dmz_zmd_warn(zmd, "mblock %llu ref %u still in rbtree",
			     (u64)mblk->no, mblk->ref);
		mblk->ref = 0;
		dmz_free_mblock(zmd, mblk);
	}

	/* Free the zone descriptors */
	dmz_drop_zones(zmd);

	mutex_destroy(&zmd->mblk_flush_lock);
	mutex_destroy(&zmd->map_lock);
}

static void dmz_print_dev(struct dmz_metadata *zmd, int num)
{
	struct dmz_dev *dev = &zmd->dev[num];

	if (bdev_zoned_model(dev->bdev) == BLK_ZONED_NONE)
		dmz_dev_info(dev, "Regular block device");
	else
		dmz_dev_info(dev, "Host-%s zoned block device",
			     bdev_zoned_model(dev->bdev) == BLK_ZONED_HA ?
			     "aware" : "managed");
	if (zmd->sb_version > 1) {
		sector_t sector_offset =
			dev->zone_offset << zmd->zone_nr_sectors_shift;

		dmz_dev_info(dev, " %llu 512-byte logical sectors (offset %llu)",
			     (u64)dev->capacity, (u64)sector_offset);
		dmz_dev_info(dev, " %u zones of %llu 512-byte logical sectors (offset %llu)",
			     dev->nr_zones, (u64)zmd->zone_nr_sectors,
			     (u64)dev->zone_offset);
	} else {
		dmz_dev_info(dev, " %llu 512-byte logical sectors",
			     (u64)dev->capacity);
		dmz_dev_info(dev, " %u zones of %llu 512-byte logical sectors",
			     dev->nr_zones, (u64)zmd->zone_nr_sectors);
	}
}

/*
 * Initialize the zoned metadata.
 */
int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev,
		     struct dmz_metadata **metadata,
		     const char *devname)
{
	struct dmz_metadata *zmd;
	unsigned int i;
	struct dm_zone *zone;
	int ret;

	zmd = kzalloc(sizeof(struct dmz_metadata), GFP_KERNEL);
	if (!zmd)
		return -ENOMEM;

	strcpy(zmd->devname, devname);
	zmd->dev = dev;
	zmd->nr_devs = num_dev;
	zmd->mblk_rbtree = RB_ROOT;
	init_rwsem(&zmd->mblk_sem);
	mutex_init(&zmd->mblk_flush_lock);
	spin_lock_init(&zmd->mblk_lock);
	INIT_LIST_HEAD(&zmd->mblk_lru_list);
	INIT_LIST_HEAD(&zmd->mblk_dirty_list);

	mutex_init(&zmd->map_lock);

	atomic_set(&zmd->unmap_nr_cache, 0);
	INIT_LIST_HEAD(&zmd->unmap_cache_list);
	INIT_LIST_HEAD(&zmd->map_cache_list);

	atomic_set(&zmd->nr_reserved_seq_zones, 0);
	INIT_LIST_HEAD(&zmd->reserved_seq_zones_list);

	init_waitqueue_head(&zmd->free_wq);

	/* Initialize zone descriptors */
	ret = dmz_init_zones(zmd);
	if (ret)
		goto err;

	/* Get super block */
	ret = dmz_load_sb(zmd);
	if (ret)
		goto err;

	/* Set metadata zones starting from sb_zone */
	for (i = 0; i < zmd->nr_meta_zones << 1; i++) {
		zone = dmz_get(zmd, zmd->sb[0].zone->id + i);
		if (!zone) {
			dmz_zmd_err(zmd,
				    "metadata zone %u not present", i);
			ret = -ENXIO;
			goto err;
		}
		if (!dmz_is_rnd(zone) && !dmz_is_cache(zone)) {
			dmz_zmd_err(zmd,
				    "metadata zone %u is not random", i);
			ret = -ENXIO;
			goto err;
		}
		set_bit(DMZ_META, &zone->flags);
	}
	/* Load mapping table */
	ret = dmz_load_mapping(zmd);
	if (ret)
		goto err;

	/*
	 * Cache size boundaries: allow at least 2 super blocks, the chunk map
	 * blocks and enough blocks to be able to cache the bitmap blocks of
	 * up to 16 zones when idle (min_nr_mblks). Otherwise, if busy, allow
	 * the cache to add 512 more metadata blocks.
	 */
	zmd->min_nr_mblks = 2 + zmd->nr_map_blocks + zmd->zone_nr_bitmap_blocks * 16;
	zmd->max_nr_mblks = zmd->min_nr_mblks + 512;
	zmd->mblk_shrinker.count_objects = dmz_mblock_shrinker_count;
	zmd->mblk_shrinker.scan_objects = dmz_mblock_shrinker_scan;
	zmd->mblk_shrinker.seeks = DEFAULT_SEEKS;

	/* Metadata cache shrinker */
	ret = register_shrinker(&zmd->mblk_shrinker);
	if (ret) {
		dmz_zmd_err(zmd, "Register metadata cache shrinker failed");
		goto err;
	}

	dmz_zmd_info(zmd, "DM-Zoned metadata version %d", zmd->sb_version);
	for (i = 0; i < zmd->nr_devs; i++)
		dmz_print_dev(zmd, i);

	dmz_zmd_info(zmd, " %u zones of %llu 512-byte logical sectors",
		     zmd->nr_zones, (u64)zmd->zone_nr_sectors);
	dmz_zmd_debug(zmd, " %u metadata zones",
		      zmd->nr_meta_zones * 2);
	dmz_zmd_debug(zmd, " %u data zones for %u chunks",
		      zmd->nr_data_zones, zmd->nr_chunks);
	dmz_zmd_debug(zmd, " %u cache zones (%u unmapped)",
		      zmd->nr_cache, atomic_read(&zmd->unmap_nr_cache));
	for (i = 0; i < zmd->nr_devs; i++) {
		dmz_zmd_debug(zmd, " %u random zones (%u unmapped)",
			      dmz_nr_rnd_zones(zmd, i),
			      dmz_nr_unmap_rnd_zones(zmd, i));
		dmz_zmd_debug(zmd, " %u sequential zones (%u unmapped)",
			      dmz_nr_seq_zones(zmd, i),
			      dmz_nr_unmap_seq_zones(zmd, i));
	}
	dmz_zmd_debug(zmd, " %u reserved sequential data zones",
		      zmd->nr_reserved_seq);
	dmz_zmd_debug(zmd, "Format:");
	dmz_zmd_debug(zmd, "%u metadata blocks per set (%u max cache)",
		      zmd->nr_meta_blocks, zmd->max_nr_mblks);
	dmz_zmd_debug(zmd, " %u data zone mapping blocks",
		      zmd->nr_map_blocks);
	dmz_zmd_debug(zmd, " %u bitmap blocks",
		      zmd->nr_bitmap_blocks);

	*metadata = zmd;

	return 0;
err:
	dmz_cleanup_metadata(zmd);
	kfree(zmd);
	*metadata = NULL;

	return ret;
}

/*
 * Cleanup the zoned metadata resources.
 */
void dmz_dtr_metadata(struct dmz_metadata *zmd)
{
	unregister_shrinker(&zmd->mblk_shrinker);
	dmz_cleanup_metadata(zmd);
	kfree(zmd);
}

/*
 * Check zone information on resume.
 */
int dmz_resume_metadata(struct dmz_metadata *zmd)
{
	struct dm_zone *zone;
	sector_t wp_block;
	unsigned int i;
	int ret;

	/* Check zones */
	for (i = 0; i < zmd->nr_zones; i++) {
		zone = dmz_get(zmd, i);
		if (!zone) {
			dmz_zmd_err(zmd, "Unable to get zone %u", i);
			return -EIO;
		}
		wp_block = zone->wp_block;

		ret = dmz_update_zone(zmd, zone);
		if (ret) {
			dmz_zmd_err(zmd, "Broken zone %u", i);
			return ret;
		}

		if (dmz_is_offline(zone)) {
			dmz_zmd_warn(zmd, "Zone %u is offline", i);
			continue;
		}

		/* Check write pointer */
		if (!dmz_is_seq(zone))
			zone->wp_block = 0;
		else if (zone->wp_block != wp_block) {
			dmz_zmd_err(zmd, "Zone %u: Invalid wp (%llu / %llu)",
				    i, (u64)zone->wp_block, (u64)wp_block);
			zone->wp_block = wp_block;
			dmz_invalidate_blocks(zmd, zone, zone->wp_block,
					      zmd->zone_nr_blocks - zone->wp_block);
		}
	}

	return 0;
}
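
/*
 * Illustrative lifecycle sketch (not part of the original driver): how a
 * caller such as the dm-zoned target constructor might drive the routines
 * above. Device setup, chunk I/O, and error handling are elided; the
 * periodic dmz_flush_metadata() call is only indicated in a comment.
 */
static int __maybe_unused dmz_example_metadata_lifecycle(struct dmz_dev *dev,
							 int num_dev)
{
	struct dmz_metadata *zmd;
	int ret;

	/* Build the in-memory metadata from the on-disk format */
	ret = dmz_ctr_metadata(dev, num_dev, &zmd, "example");
	if (ret)
		return ret;

	/* ... perform chunk I/O, periodically flushing dirty metadata ... */

	/* Tear everything down again */
	dmz_dtr_metadata(zmd);
	return 0;
}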