// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2017 Western Digital Corporation or its affiliates.
 *
 * This file is released under the GPL.
 */

#include "dm-zoned.h"

#include <linux/module.h>
#include <linux/crc32.h>
#include <linux/sched/mm.h>

#define DM_MSG_PREFIX		"zoned metadata"

/*
 * Metadata version.
 */
#define DMZ_META_VER	2

/*
 * On-disk super block magic.
 */
#define DMZ_MAGIC	((((unsigned int)('D')) << 24) | \
			 (((unsigned int)('Z')) << 16) | \
			 (((unsigned int)('B')) <<  8) | \
			 ((unsigned int)('D')))

/*
 * On-disk super block.
 * The super block itself uses only 512 B but occupies a full 4KB block
 * on disk. This block is followed on disk by the mapping table of chunks
 * to zones and the bitmap blocks indicating zone block validity.
 * The overall resulting metadata format is:
 *    (1) Super block (1 block)
 *    (2) Chunk mapping table (nr_map_blocks)
 *    (3) Bitmap blocks (nr_bitmap_blocks)
 * All metadata blocks are stored in conventional zones, starting from
 * the first conventional zone found on disk.
 */
struct dmz_super {
	/* Magic number */
	__le32		magic;			/*   4 */

	/* Metadata version number */
	__le32		version;		/*   8 */

	/* Generation number */
	__le64		gen;			/*  16 */

	/* This block number */
	__le64		sb_block;		/*  24 */

	/* The number of metadata blocks, including this super block */
	__le32		nr_meta_blocks;		/*  28 */

	/* The number of sequential zones reserved for reclaim */
	__le32		nr_reserved_seq;	/*  32 */

	/* The number of entries in the mapping table */
	__le32		nr_chunks;		/*  36 */

	/* The number of blocks used for the chunk mapping table */
	__le32		nr_map_blocks;		/*  40 */

	/* The number of blocks used for the block bitmaps */
	__le32		nr_bitmap_blocks;	/*  44 */

	/* Checksum */
	__le32		crc;			/*  48 */

	/* DM-Zoned label */
	u8		dmz_label[32];		/*  80 */

	/* DM-Zoned UUID */
	u8		dmz_uuid[16];		/*  96 */

	/* Device UUID */
	u8		dev_uuid[16];		/* 112 */

	/* Padding to full 512B sector */
	u8		reserved[400];		/* 512 */
};

/*
 * Chunk mapping entry: entries are indexed by chunk number
 * and give the zone ID (dzone_id) mapping the chunk on disk.
 * This zone may be sequential or random. If it is a sequential
 * zone, a second zone (bzone_id) used as a write buffer may
 * also be specified. This second zone will always be a randomly
 * writeable zone.
 */
struct dmz_map {
	__le32			dzone_id;
	__le32			bzone_id;
};

/*
 * Chunk mapping table metadata: 512 8-byte entries per 4KB block.
 */
#define DMZ_MAP_ENTRIES		(DMZ_BLOCK_SIZE / sizeof(struct dmz_map))
#define DMZ_MAP_ENTRIES_SHIFT	(ilog2(DMZ_MAP_ENTRIES))
#define DMZ_MAP_ENTRIES_MASK	(DMZ_MAP_ENTRIES - 1)
#define DMZ_MAP_UNMAPPED	UINT_MAX

/*
 * Metadata block descriptor (for cached metadata blocks).
 */
struct dmz_mblock {
	struct rb_node		node;
	struct list_head	link;
	sector_t		no;
	unsigned int		ref;
	unsigned long		state;
	struct page		*page;
	void			*data;
};

/*
 * Metadata block state flags.
 */
enum {
	DMZ_META_DIRTY,
	DMZ_META_READING,
	DMZ_META_WRITING,
	DMZ_META_ERROR,
};

/*
 * Super block information (one per metadata set).
 */
struct dmz_sb {
	sector_t		block;
	struct dmz_dev		*dev;
	struct dmz_mblock	*mblk;
	struct dmz_super	*sb;
	struct dm_zone		*zone;
};
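
/*
 * Worked sizing example (illustrative only; the numbers assume 4 KB
 * metadata blocks and 256 MB zones, a common but not required
 * configuration): a zone then holds 65536 blocks, so one zone bitmap
 * needs 65536 bits = 8 KB = 2 blocks. Each 4 KB mapping block holds
 * DMZ_MAP_ENTRIES = 4096 / 8 = 512 chunk entries, so the mapping table
 * needs DIV_ROUND_UP(nr_chunks, 512) blocks.
 */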

/*
 * In-memory metadata.
 */
struct dmz_metadata {
	struct dmz_dev		*dev;
	unsigned int		nr_devs;

	char			devname[BDEVNAME_SIZE];
	char			label[BDEVNAME_SIZE];
	uuid_t			uuid;

	sector_t		zone_bitmap_size;
	unsigned int		zone_nr_bitmap_blocks;
	unsigned int		zone_bits_per_mblk;

	sector_t		zone_nr_blocks;
	sector_t		zone_nr_blocks_shift;

	sector_t		zone_nr_sectors;
	sector_t		zone_nr_sectors_shift;

	unsigned int		nr_bitmap_blocks;
	unsigned int		nr_map_blocks;

	unsigned int		nr_zones;
	unsigned int		nr_useable_zones;
	unsigned int		nr_meta_blocks;
	unsigned int		nr_meta_zones;
	unsigned int		nr_data_zones;
	unsigned int		nr_cache_zones;
	unsigned int		nr_rnd_zones;
	unsigned int		nr_reserved_seq;
	unsigned int		nr_chunks;

	/* Zone information array */
	struct xarray		zones;

	struct dmz_sb		sb[2];
	unsigned int		mblk_primary;
	unsigned int		sb_version;
	u64			sb_gen;
	unsigned int		min_nr_mblks;
	unsigned int		max_nr_mblks;
	atomic_t		nr_mblks;
	struct rw_semaphore	mblk_sem;
	struct mutex		mblk_flush_lock;
	spinlock_t		mblk_lock;
	struct rb_root		mblk_rbtree;
	struct list_head	mblk_lru_list;
	struct list_head	mblk_dirty_list;
	struct shrinker		mblk_shrinker;

	/* Zone allocation management */
	struct mutex		map_lock;
	struct dmz_mblock	**map_mblk;

	unsigned int		nr_cache;
	atomic_t		unmap_nr_cache;
	struct list_head	unmap_cache_list;
	struct list_head	map_cache_list;

	atomic_t		nr_reserved_seq_zones;
	struct list_head	reserved_seq_zones_list;

	wait_queue_head_t	free_wq;
};

#define dmz_zmd_info(zmd, format, args...)	\
	DMINFO("(%s): " format, (zmd)->label, ## args)

#define dmz_zmd_err(zmd, format, args...)	\
	DMERR("(%s): " format, (zmd)->label, ## args)

#define dmz_zmd_warn(zmd, format, args...)	\
	DMWARN("(%s): " format, (zmd)->label, ## args)

#define dmz_zmd_debug(zmd, format, args...)	\
	DMDEBUG("(%s): " format, (zmd)->label, ## args)

/*
 * Various accessors
 */
static unsigned int dmz_dev_zone_id(struct dmz_metadata *zmd, struct dm_zone *zone)
{
	if (WARN_ON(!zone))
		return 0;

	return zone->id - zone->dev->zone_offset;
}

sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone)
{
	unsigned int zone_id = dmz_dev_zone_id(zmd, zone);

	return (sector_t)zone_id << zmd->zone_nr_sectors_shift;
}

sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone)
{
	unsigned int zone_id = dmz_dev_zone_id(zmd, zone);

	return (sector_t)zone_id << zmd->zone_nr_blocks_shift;
}

unsigned int dmz_zone_nr_blocks(struct dmz_metadata *zmd)
{
	return zmd->zone_nr_blocks;
}

unsigned int dmz_zone_nr_blocks_shift(struct dmz_metadata *zmd)
{
	return zmd->zone_nr_blocks_shift;
}

unsigned int dmz_zone_nr_sectors(struct dmz_metadata *zmd)
{
	return zmd->zone_nr_sectors;
}

unsigned int dmz_zone_nr_sectors_shift(struct dmz_metadata *zmd)
{
	return zmd->zone_nr_sectors_shift;
}

unsigned int dmz_nr_zones(struct dmz_metadata *zmd)
{
	return zmd->nr_zones;
}

unsigned int dmz_nr_chunks(struct dmz_metadata *zmd)
{
	return zmd->nr_chunks;
}

unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd, int idx)
{
	return zmd->dev[idx].nr_rnd;
}

unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd, int idx)
{
	return atomic_read(&zmd->dev[idx].unmap_nr_rnd);
}

unsigned int dmz_nr_cache_zones(struct dmz_metadata *zmd)
{
	return zmd->nr_cache;
}

unsigned int dmz_nr_unmap_cache_zones(struct dmz_metadata *zmd)
{
	return atomic_read(&zmd->unmap_nr_cache);
}

unsigned int dmz_nr_seq_zones(struct dmz_metadata *zmd, int idx)
{
	return zmd->dev[idx].nr_seq;
}

unsigned int dmz_nr_unmap_seq_zones(struct dmz_metadata *zmd, int idx)
{
	return atomic_read(&zmd->dev[idx].unmap_nr_seq);
}

static struct dm_zone *dmz_get(struct dmz_metadata *zmd, unsigned int zone_id)
{
	return xa_load(&zmd->zones, zone_id);
}

static struct dm_zone *dmz_insert(struct dmz_metadata *zmd,
				  unsigned int zone_id, struct dmz_dev *dev)
{
	struct dm_zone *zone = kzalloc(sizeof(struct dm_zone), GFP_KERNEL);

	if (!zone)
		return ERR_PTR(-ENOMEM);

	if (xa_insert(&zmd->zones, zone_id, zone, GFP_KERNEL)) {
		kfree(zone);
		return ERR_PTR(-EBUSY);
	}

	INIT_LIST_HEAD(&zone->link);
	atomic_set(&zone->refcount, 0);
	zone->id = zone_id;
	zone->chunk = DMZ_MAP_UNMAPPED;
	zone->dev = dev;

	return zone;
}

const char *dmz_metadata_label(struct dmz_metadata *zmd)
{
	return (const char *)zmd->label;
}

bool dmz_check_dev(struct dmz_metadata *zmd)
{
	unsigned int i;

	for (i = 0; i < zmd->nr_devs; i++) {
		if (!dmz_check_bdev(&zmd->dev[i]))
			return false;
	}
	return true;
}

bool dmz_dev_is_dying(struct dmz_metadata *zmd)
{
	unsigned int i;

	for (i = 0; i < zmd->nr_devs; i++) {
		if (dmz_bdev_is_dying(&zmd->dev[i]))
			return true;
	}
	return false;
}
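
/*
 * Illustrative example of the accessors above (assuming 256 MB zones,
 * i.e. zone_nr_sectors = 524288 and zone_nr_sectors_shift = 19, and a
 * second device with zone_offset = 1000; all values hypothetical): for
 * the global zone id 1003, dmz_dev_zone_id() returns 3 and
 * dmz_start_sect() returns 3 << 19 = 1572864, the zone start sector
 * relative to that zone's own device.
 */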

/*
 * Lock/unlock mapping table.
 * The map lock also protects all the zone lists.
 */
void dmz_lock_map(struct dmz_metadata *zmd)
{
	mutex_lock(&zmd->map_lock);
}

void dmz_unlock_map(struct dmz_metadata *zmd)
{
	mutex_unlock(&zmd->map_lock);
}

/*
 * Lock/unlock metadata access. This is a "read" lock on a semaphore
 * that prevents metadata flush from running while metadata are being
 * modified. The actual metadata write mutual exclusion is achieved with
 * the map lock and zone state management (active and reclaim state are
 * mutually exclusive).
 */
void dmz_lock_metadata(struct dmz_metadata *zmd)
{
	down_read(&zmd->mblk_sem);
}

void dmz_unlock_metadata(struct dmz_metadata *zmd)
{
	up_read(&zmd->mblk_sem);
}

/*
 * Lock/unlock flush: prevent concurrent executions
 * of dmz_flush_metadata as well as metadata modification in reclaim
 * while flush is being executed.
 */
void dmz_lock_flush(struct dmz_metadata *zmd)
{
	mutex_lock(&zmd->mblk_flush_lock);
}

void dmz_unlock_flush(struct dmz_metadata *zmd)
{
	mutex_unlock(&zmd->mblk_flush_lock);
}

/*
 * Allocate a metadata block.
 */
static struct dmz_mblock *dmz_alloc_mblock(struct dmz_metadata *zmd,
					   sector_t mblk_no)
{
	struct dmz_mblock *mblk = NULL;

	/* See if we can reuse cached blocks */
	if (zmd->max_nr_mblks && atomic_read(&zmd->nr_mblks) > zmd->max_nr_mblks) {
		spin_lock(&zmd->mblk_lock);
		mblk = list_first_entry_or_null(&zmd->mblk_lru_list,
						struct dmz_mblock, link);
		if (mblk) {
			list_del_init(&mblk->link);
			rb_erase(&mblk->node, &zmd->mblk_rbtree);
			mblk->no = mblk_no;
		}
		spin_unlock(&zmd->mblk_lock);
		if (mblk)
			return mblk;
	}

	/* Allocate a new block */
	mblk = kmalloc(sizeof(struct dmz_mblock), GFP_NOIO);
	if (!mblk)
		return NULL;

	mblk->page = alloc_page(GFP_NOIO);
	if (!mblk->page) {
		kfree(mblk);
		return NULL;
	}

	RB_CLEAR_NODE(&mblk->node);
	INIT_LIST_HEAD(&mblk->link);
	mblk->ref = 0;
	mblk->state = 0;
	mblk->no = mblk_no;
	mblk->data = page_address(mblk->page);

	atomic_inc(&zmd->nr_mblks);

	return mblk;
}

/*
 * Free a metadata block.
 */
static void dmz_free_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk)
{
	__free_pages(mblk->page, 0);
	kfree(mblk);

	atomic_dec(&zmd->nr_mblks);
}

/*
 * Insert a metadata block in the rbtree.
 */
static void dmz_insert_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk)
{
	struct rb_root *root = &zmd->mblk_rbtree;
	struct rb_node **new = &(root->rb_node), *parent = NULL;
	struct dmz_mblock *b;

	/* Figure out where to put the new node */
	while (*new) {
		b = container_of(*new, struct dmz_mblock, node);
		parent = *new;
		new = (b->no < mblk->no) ?
			&((*new)->rb_left) : &((*new)->rb_right);
	}

	/* Add new node and rebalance tree */
	rb_link_node(&mblk->node, parent, new);
	rb_insert_color(&mblk->node, root);
}
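
/*
 * Cached metadata block lifecycle, summarized (descriptive note, not
 * new behavior): a block with ref > 0 lives only in the rbtree; when
 * its last reference is dropped it moves to the LRU list if clean, or
 * stays on the dirty list if DMZ_META_DIRTY is set. The shrinker and
 * dmz_alloc_mblock() take victims from the LRU list only, so dirty or
 * referenced blocks are never recycled.
 */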

/*
 * Lookup a metadata block in the rbtree. If the block is found, increment
 * its reference count.
 */
static struct dmz_mblock *dmz_get_mblock_fast(struct dmz_metadata *zmd,
					      sector_t mblk_no)
{
	struct rb_root *root = &zmd->mblk_rbtree;
	struct rb_node *node = root->rb_node;
	struct dmz_mblock *mblk;

	while (node) {
		mblk = container_of(node, struct dmz_mblock, node);
		if (mblk->no == mblk_no) {
			/*
			 * If this is the first reference to the block,
			 * remove it from the LRU list.
			 */
			mblk->ref++;
			if (mblk->ref == 1 &&
			    !test_bit(DMZ_META_DIRTY, &mblk->state))
				list_del_init(&mblk->link);
			return mblk;
		}
		node = (mblk->no < mblk_no) ? node->rb_left : node->rb_right;
	}

	return NULL;
}

/*
 * Metadata block BIO end callback.
 */
static void dmz_mblock_bio_end_io(struct bio *bio)
{
	struct dmz_mblock *mblk = bio->bi_private;
	int flag;

	if (bio->bi_status)
		set_bit(DMZ_META_ERROR, &mblk->state);

	if (bio_op(bio) == REQ_OP_WRITE)
		flag = DMZ_META_WRITING;
	else
		flag = DMZ_META_READING;

	clear_bit_unlock(flag, &mblk->state);
	smp_mb__after_atomic();
	wake_up_bit(&mblk->state, flag);

	bio_put(bio);
}

/*
 * Read an uncached metadata block from disk and add it to the cache.
 */
static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd,
					      sector_t mblk_no)
{
	struct dmz_mblock *mblk, *m;
	sector_t block = zmd->sb[zmd->mblk_primary].block + mblk_no;
	struct dmz_dev *dev = zmd->sb[zmd->mblk_primary].dev;
	struct bio *bio;

	if (dmz_bdev_is_dying(dev))
		return ERR_PTR(-EIO);

	/* Get a new block and a BIO to read it */
	mblk = dmz_alloc_mblock(zmd, mblk_no);
	if (!mblk)
		return ERR_PTR(-ENOMEM);

	bio = bio_alloc(GFP_NOIO, 1);
	if (!bio) {
		dmz_free_mblock(zmd, mblk);
		return ERR_PTR(-ENOMEM);
	}

	spin_lock(&zmd->mblk_lock);

	/*
	 * Make sure that another context did not start reading
	 * the block already.
	 */
	m = dmz_get_mblock_fast(zmd, mblk_no);
	if (m) {
		spin_unlock(&zmd->mblk_lock);
		dmz_free_mblock(zmd, mblk);
		bio_put(bio);
		return m;
	}

	mblk->ref++;
	set_bit(DMZ_META_READING, &mblk->state);
	dmz_insert_mblock(zmd, mblk);

	spin_unlock(&zmd->mblk_lock);

	/* Submit read BIO */
	bio->bi_iter.bi_sector = dmz_blk2sect(block);
	bio_set_dev(bio, dev->bdev);
	bio->bi_private = mblk;
	bio->bi_end_io = dmz_mblock_bio_end_io;
	bio_set_op_attrs(bio, REQ_OP_READ, REQ_META | REQ_PRIO);
	bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0);
	submit_bio(bio);

	return mblk;
}

/*
 * Free metadata blocks.
 */
static unsigned long dmz_shrink_mblock_cache(struct dmz_metadata *zmd,
					     unsigned long limit)
{
	struct dmz_mblock *mblk;
	unsigned long count = 0;

	if (!zmd->max_nr_mblks)
		return 0;

	while (!list_empty(&zmd->mblk_lru_list) &&
	       atomic_read(&zmd->nr_mblks) > zmd->min_nr_mblks &&
	       count < limit) {
		mblk = list_first_entry(&zmd->mblk_lru_list,
					struct dmz_mblock, link);
		list_del_init(&mblk->link);
		rb_erase(&mblk->node, &zmd->mblk_rbtree);
		dmz_free_mblock(zmd, mblk);
		count++;
	}

	return count;
}
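
/*
 * Note on dmz_get_mblock_slow() above: the block and BIO are allocated
 * before taking mblk_lock, then the rbtree is re-checked under the lock
 * and the speculative allocations are dropped if another context won
 * the race. This keeps GFP_NOIO allocations out of the spinlock
 * critical section at the cost of an occasional wasted allocation.
 */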

/*
 * For mblock shrinker: get the number of unused metadata blocks in the cache.
 */
static unsigned long dmz_mblock_shrinker_count(struct shrinker *shrink,
					       struct shrink_control *sc)
{
	struct dmz_metadata *zmd = container_of(shrink, struct dmz_metadata, mblk_shrinker);

	return atomic_read(&zmd->nr_mblks);
}

/*
 * For mblock shrinker: scan unused metadata blocks and shrink the cache.
 */
static unsigned long dmz_mblock_shrinker_scan(struct shrinker *shrink,
					      struct shrink_control *sc)
{
	struct dmz_metadata *zmd = container_of(shrink, struct dmz_metadata, mblk_shrinker);
	unsigned long count;

	spin_lock(&zmd->mblk_lock);
	count = dmz_shrink_mblock_cache(zmd, sc->nr_to_scan);
	spin_unlock(&zmd->mblk_lock);

	return count ? count : SHRINK_STOP;
}

/*
 * Release a metadata block.
 */
static void dmz_release_mblock(struct dmz_metadata *zmd,
			       struct dmz_mblock *mblk)
{
	if (!mblk)
		return;

	spin_lock(&zmd->mblk_lock);

	mblk->ref--;
	if (mblk->ref == 0) {
		if (test_bit(DMZ_META_ERROR, &mblk->state)) {
			rb_erase(&mblk->node, &zmd->mblk_rbtree);
			dmz_free_mblock(zmd, mblk);
		} else if (!test_bit(DMZ_META_DIRTY, &mblk->state)) {
			list_add_tail(&mblk->link, &zmd->mblk_lru_list);
			dmz_shrink_mblock_cache(zmd, 1);
		}
	}

	spin_unlock(&zmd->mblk_lock);
}

/*
 * Get a metadata block from the rbtree. If the block
 * is not present, read it from disk.
 */
static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd,
					 sector_t mblk_no)
{
	struct dmz_mblock *mblk;
	struct dmz_dev *dev = zmd->sb[zmd->mblk_primary].dev;

	/* Check rbtree */
	spin_lock(&zmd->mblk_lock);
	mblk = dmz_get_mblock_fast(zmd, mblk_no);
	spin_unlock(&zmd->mblk_lock);

	if (!mblk) {
		/* Cache miss: read the block from disk */
		mblk = dmz_get_mblock_slow(zmd, mblk_no);
		if (IS_ERR(mblk))
			return mblk;
	}

	/* Wait for on-going read I/O and check for error */
	wait_on_bit_io(&mblk->state, DMZ_META_READING,
		       TASK_UNINTERRUPTIBLE);
	if (test_bit(DMZ_META_ERROR, &mblk->state)) {
		dmz_release_mblock(zmd, mblk);
		dmz_check_bdev(dev);
		return ERR_PTR(-EIO);
	}

	return mblk;
}

/*
 * Mark a metadata block dirty.
 */
static void dmz_dirty_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk)
{
	spin_lock(&zmd->mblk_lock);
	if (!test_and_set_bit(DMZ_META_DIRTY, &mblk->state))
		list_add_tail(&mblk->link, &zmd->mblk_dirty_list);
	spin_unlock(&zmd->mblk_lock);
}

/*
 * Issue a metadata block write BIO.
 */
static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
			    unsigned int set)
{
	struct dmz_dev *dev = zmd->sb[set].dev;
	sector_t block = zmd->sb[set].block + mblk->no;
	struct bio *bio;

	if (dmz_bdev_is_dying(dev))
		return -EIO;

	bio = bio_alloc(GFP_NOIO, 1);
	if (!bio) {
		set_bit(DMZ_META_ERROR, &mblk->state);
		return -ENOMEM;
	}

	set_bit(DMZ_META_WRITING, &mblk->state);

	bio->bi_iter.bi_sector = dmz_blk2sect(block);
	bio_set_dev(bio, dev->bdev);
	bio->bi_private = mblk;
	bio->bi_end_io = dmz_mblock_bio_end_io;
	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META | REQ_PRIO);
	bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0);
	submit_bio(bio);

	return 0;
}

/*
 * Read/write a metadata block.
 */
static int dmz_rdwr_block(struct dmz_dev *dev, int op,
			  sector_t block, struct page *page)
{
	struct bio *bio;
	int ret;

	if (WARN_ON(!dev))
		return -EIO;

	if (dmz_bdev_is_dying(dev))
		return -EIO;

	bio = bio_alloc(GFP_NOIO, 1);
	if (!bio)
		return -ENOMEM;

	bio->bi_iter.bi_sector = dmz_blk2sect(block);
	bio_set_dev(bio, dev->bdev);
	bio_set_op_attrs(bio, op, REQ_SYNC | REQ_META | REQ_PRIO);
	bio_add_page(bio, page, DMZ_BLOCK_SIZE, 0);
	ret = submit_bio_wait(bio);
	bio_put(bio);

	if (ret)
		dmz_check_bdev(dev);
	return ret;
}

/*
 * Write super block of the specified metadata set.
 */
static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set)
{
	struct dmz_mblock *mblk = zmd->sb[set].mblk;
	struct dmz_super *sb = zmd->sb[set].sb;
	struct dmz_dev *dev = zmd->sb[set].dev;
	sector_t sb_block;
	u64 sb_gen = zmd->sb_gen + 1;
	int ret;

	sb->magic = cpu_to_le32(DMZ_MAGIC);

	sb->version = cpu_to_le32(zmd->sb_version);
	if (zmd->sb_version > 1) {
		BUILD_BUG_ON(UUID_SIZE != 16);
		export_uuid(sb->dmz_uuid, &zmd->uuid);
		memcpy(sb->dmz_label, zmd->label, BDEVNAME_SIZE);
		export_uuid(sb->dev_uuid, &dev->uuid);
	}

	sb->gen = cpu_to_le64(sb_gen);

	/*
	 * The metadata always references the absolute block address,
	 * i.e. relative to the entire block range, not the per-device
	 * block address.
	 */
	sb_block = zmd->sb[set].zone->id << zmd->zone_nr_blocks_shift;
	sb->sb_block = cpu_to_le64(sb_block);
	sb->nr_meta_blocks = cpu_to_le32(zmd->nr_meta_blocks);
	sb->nr_reserved_seq = cpu_to_le32(zmd->nr_reserved_seq);
	sb->nr_chunks = cpu_to_le32(zmd->nr_chunks);

	sb->nr_map_blocks = cpu_to_le32(zmd->nr_map_blocks);
	sb->nr_bitmap_blocks = cpu_to_le32(zmd->nr_bitmap_blocks);

	sb->crc = 0;
	sb->crc = cpu_to_le32(crc32_le(sb_gen, (unsigned char *)sb, DMZ_BLOCK_SIZE));

	ret = dmz_rdwr_block(dev, REQ_OP_WRITE, zmd->sb[set].block,
			     mblk->page);
	if (ret == 0)
		ret = blkdev_issue_flush(dev->bdev, GFP_NOIO);

	return ret;
}

/*
 * Write dirty metadata blocks to the specified set.
 */
static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd,
				   struct list_head *write_list,
				   unsigned int set)
{
	struct dmz_mblock *mblk;
	struct dmz_dev *dev = zmd->sb[set].dev;
	struct blk_plug plug;
	int ret = 0, nr_mblks_submitted = 0;

	/* Issue writes */
	blk_start_plug(&plug);
	list_for_each_entry(mblk, write_list, link) {
		ret = dmz_write_mblock(zmd, mblk, set);
		if (ret)
			break;
		nr_mblks_submitted++;
	}
	blk_finish_plug(&plug);

	/* Wait for completion */
	list_for_each_entry(mblk, write_list, link) {
		if (!nr_mblks_submitted)
			break;
		wait_on_bit_io(&mblk->state, DMZ_META_WRITING,
			       TASK_UNINTERRUPTIBLE);
		if (test_bit(DMZ_META_ERROR, &mblk->state)) {
			clear_bit(DMZ_META_ERROR, &mblk->state);
			dmz_check_bdev(dev);
			ret = -EIO;
		}
		nr_mblks_submitted--;
	}

	/* Flush drive cache (this will also sync data) */
	if (ret == 0)
		ret = blkdev_issue_flush(dev->bdev, GFP_NOIO);

	return ret;
}
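
/*
 * Checksum convention used by dmz_write_sb() above and dmz_check_sb()
 * below: the crc field is zeroed first and the CRC32 of the whole 4KB
 * block is computed with the generation number as the seed. A reader
 * must therefore extract gen and the stored crc, clear sb->crc, and
 * recompute crc32_le(gen, sb, DMZ_BLOCK_SIZE) to validate.
 */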

/*
 * Log dirty metadata blocks.
 */
static int dmz_log_dirty_mblocks(struct dmz_metadata *zmd,
				 struct list_head *write_list)
{
	unsigned int log_set = zmd->mblk_primary ^ 0x1;
	int ret;

	/* Write dirty blocks to the log */
	ret = dmz_write_dirty_mblocks(zmd, write_list, log_set);
	if (ret)
		return ret;

	/*
	 * No error so far: now validate the log by updating the
	 * log index super block generation.
	 */
	ret = dmz_write_sb(zmd, log_set);
	if (ret)
		return ret;

	return 0;
}

/*
 * Flush dirty metadata blocks.
 */
int dmz_flush_metadata(struct dmz_metadata *zmd)
{
	struct dmz_mblock *mblk;
	struct list_head write_list;
	struct dmz_dev *dev;
	int ret;

	if (WARN_ON(!zmd))
		return 0;

	INIT_LIST_HEAD(&write_list);

	/*
	 * Make sure that metadata blocks are stable before logging: take
	 * the write lock on the metadata semaphore to prevent target BIOs
	 * from modifying metadata.
	 */
	down_write(&zmd->mblk_sem);
	dev = zmd->sb[zmd->mblk_primary].dev;

	/*
	 * This is called from the target flush work and reclaim work.
	 * Concurrent execution is not allowed.
	 */
	dmz_lock_flush(zmd);

	if (dmz_bdev_is_dying(dev)) {
		ret = -EIO;
		goto out;
	}

	/* Get dirty blocks */
	spin_lock(&zmd->mblk_lock);
	list_splice_init(&zmd->mblk_dirty_list, &write_list);
	spin_unlock(&zmd->mblk_lock);

	/* If there are no dirty metadata blocks, just flush the device cache */
	if (list_empty(&write_list)) {
		ret = blkdev_issue_flush(dev->bdev, GFP_NOIO);
		goto err;
	}

	/*
	 * The primary metadata set is still clean. Keep it this way until
	 * all updates are successful in the secondary set. That is, use
	 * the secondary set as a log.
	 */
	ret = dmz_log_dirty_mblocks(zmd, &write_list);
	if (ret)
		goto err;

	/*
	 * The log is on disk. It is now safe to update in place
	 * in the primary metadata set.
	 */
	ret = dmz_write_dirty_mblocks(zmd, &write_list, zmd->mblk_primary);
	if (ret)
		goto err;

	ret = dmz_write_sb(zmd, zmd->mblk_primary);
	if (ret)
		goto err;

	while (!list_empty(&write_list)) {
		mblk = list_first_entry(&write_list, struct dmz_mblock, link);
		list_del_init(&mblk->link);

		spin_lock(&zmd->mblk_lock);
		clear_bit(DMZ_META_DIRTY, &mblk->state);
		if (mblk->ref == 0)
			list_add_tail(&mblk->link, &zmd->mblk_lru_list);
		spin_unlock(&zmd->mblk_lock);
	}

	zmd->sb_gen++;
out:
	dmz_unlock_flush(zmd);
	up_write(&zmd->mblk_sem);

	return ret;

err:
	if (!list_empty(&write_list)) {
		spin_lock(&zmd->mblk_lock);
		list_splice(&write_list, &zmd->mblk_dirty_list);
		spin_unlock(&zmd->mblk_lock);
	}
	if (!dmz_check_bdev(dev))
		ret = -EIO;
	goto out;
}
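
/*
 * Flush ordering, summarized: (1) dirty blocks are written to the
 * secondary (log) set, (2) the log super block is written with
 * generation sb_gen + 1, which validates the log, (3) the same blocks
 * are written in place in the primary set, (4) the primary super block
 * is written with the same new generation. A crash between (2) and (4)
 * leaves the log set with the highest generation, so dmz_load_sb()
 * will pick it as the primary and the update is not lost.
 */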

/*
 * Check super block.
 */
static int dmz_check_sb(struct dmz_metadata *zmd, struct dmz_sb *dsb,
			bool tertiary)
{
	struct dmz_super *sb = dsb->sb;
	struct dmz_dev *dev = dsb->dev;
	unsigned int nr_meta_zones, nr_data_zones;
	u32 crc, stored_crc;
	u64 gen, sb_block;

	if (le32_to_cpu(sb->magic) != DMZ_MAGIC) {
		dmz_dev_err(dev, "Invalid meta magic (needed 0x%08x, got 0x%08x)",
			    DMZ_MAGIC, le32_to_cpu(sb->magic));
		return -ENXIO;
	}

	zmd->sb_version = le32_to_cpu(sb->version);
	if (zmd->sb_version > DMZ_META_VER) {
		dmz_dev_err(dev, "Invalid meta version (needed %d, got %d)",
			    DMZ_META_VER, zmd->sb_version);
		return -EINVAL;
	}
	if (zmd->sb_version < 2 && tertiary) {
		dmz_dev_err(dev, "Tertiary superblocks are not supported");
		return -EINVAL;
	}

	gen = le64_to_cpu(sb->gen);
	stored_crc = le32_to_cpu(sb->crc);
	sb->crc = 0;
	crc = crc32_le(gen, (unsigned char *)sb, DMZ_BLOCK_SIZE);
	if (crc != stored_crc) {
		dmz_dev_err(dev, "Invalid checksum (needed 0x%08x, got 0x%08x)",
			    crc, stored_crc);
		return -ENXIO;
	}

	sb_block = le64_to_cpu(sb->sb_block);
	if (sb_block != (u64)dsb->zone->id << zmd->zone_nr_blocks_shift) {
		dmz_dev_err(dev, "Invalid superblock position "
			    "(is %llu expected %llu)",
			    sb_block,
			    (u64)dsb->zone->id << zmd->zone_nr_blocks_shift);
		return -EINVAL;
	}
	if (zmd->sb_version > 1) {
		uuid_t sb_uuid;

		import_uuid(&sb_uuid, sb->dmz_uuid);
		if (uuid_is_null(&sb_uuid)) {
			dmz_dev_err(dev, "NULL DM-Zoned uuid");
			return -ENXIO;
		} else if (uuid_is_null(&zmd->uuid)) {
			uuid_copy(&zmd->uuid, &sb_uuid);
		} else if (!uuid_equal(&zmd->uuid, &sb_uuid)) {
			dmz_dev_err(dev, "mismatching DM-Zoned uuid, "
				    "is %pUl expected %pUl",
				    &sb_uuid, &zmd->uuid);
			return -ENXIO;
		}
		if (!strlen(zmd->label))
			memcpy(zmd->label, sb->dmz_label, BDEVNAME_SIZE);
		else if (memcmp(zmd->label, sb->dmz_label, BDEVNAME_SIZE)) {
			dmz_dev_err(dev, "mismatching DM-Zoned label, "
				    "is %s expected %s",
				    sb->dmz_label, zmd->label);
			return -ENXIO;
		}
		import_uuid(&dev->uuid, sb->dev_uuid);
		if (uuid_is_null(&dev->uuid)) {
			dmz_dev_err(dev, "NULL device uuid");
			return -ENXIO;
		}

		if (tertiary) {
			/*
			 * Generation number should be 0, but it doesn't
			 * really matter if it isn't.
			 */
			if (gen != 0)
				dmz_dev_warn(dev, "Invalid generation %llu",
					     gen);
			return 0;
		}
	}

	nr_meta_zones = (le32_to_cpu(sb->nr_meta_blocks) + zmd->zone_nr_blocks - 1)
		>> zmd->zone_nr_blocks_shift;
	if (!nr_meta_zones ||
	    (zmd->nr_devs <= 1 && nr_meta_zones >= zmd->nr_rnd_zones) ||
	    (zmd->nr_devs > 1 && nr_meta_zones >= zmd->nr_cache_zones)) {
		dmz_dev_err(dev, "Invalid number of metadata blocks");
		return -ENXIO;
	}

	if (!le32_to_cpu(sb->nr_reserved_seq) ||
	    le32_to_cpu(sb->nr_reserved_seq) >= (zmd->nr_useable_zones - nr_meta_zones)) {
		dmz_dev_err(dev, "Invalid number of reserved sequential zones");
		return -ENXIO;
	}

	nr_data_zones = zmd->nr_useable_zones -
		(nr_meta_zones * 2 + le32_to_cpu(sb->nr_reserved_seq));
	if (le32_to_cpu(sb->nr_chunks) > nr_data_zones) {
		dmz_dev_err(dev, "Invalid number of chunks %u / %u",
			    le32_to_cpu(sb->nr_chunks), nr_data_zones);
		return -ENXIO;
	}

	/* OK */
	zmd->nr_meta_blocks = le32_to_cpu(sb->nr_meta_blocks);
	zmd->nr_reserved_seq = le32_to_cpu(sb->nr_reserved_seq);
	zmd->nr_chunks = le32_to_cpu(sb->nr_chunks);
	zmd->nr_map_blocks = le32_to_cpu(sb->nr_map_blocks);
	zmd->nr_bitmap_blocks = le32_to_cpu(sb->nr_bitmap_blocks);
	zmd->nr_meta_zones = nr_meta_zones;
	zmd->nr_data_zones = nr_data_zones;

	return 0;
}

/*
 * Read the first or second super block from disk.
 */
static int dmz_read_sb(struct dmz_metadata *zmd, struct dmz_sb *sb, int set)
{
	dmz_zmd_debug(zmd, "read superblock set %d dev %s block %llu",
		      set, sb->dev->name, sb->block);

	return dmz_rdwr_block(sb->dev, REQ_OP_READ,
			      sb->block, sb->mblk->page);
}

/*
 * Determine the position of the secondary super blocks on disk.
 * This is used only if a corruption of the primary super block
 * is detected.
 */
static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd)
{
	unsigned int zone_nr_blocks = zmd->zone_nr_blocks;
	struct dmz_mblock *mblk;
	unsigned int zone_id = zmd->sb[0].zone->id;
	int i;

	/* Allocate a block */
	mblk = dmz_alloc_mblock(zmd, 0);
	if (!mblk)
		return -ENOMEM;

	zmd->sb[1].mblk = mblk;
	zmd->sb[1].sb = mblk->data;

	/* Bad first super block: search for the second one */
	zmd->sb[1].block = zmd->sb[0].block + zone_nr_blocks;
	zmd->sb[1].zone = dmz_get(zmd, zone_id + 1);
	zmd->sb[1].dev = zmd->sb[0].dev;
	for (i = 1; i < zmd->nr_rnd_zones; i++) {
		if (dmz_read_sb(zmd, &zmd->sb[1], 1) != 0)
			break;
		if (le32_to_cpu(zmd->sb[1].sb->magic) == DMZ_MAGIC)
			return 0;
		zmd->sb[1].block += zone_nr_blocks;
		zmd->sb[1].zone = dmz_get(zmd, zone_id + i);
	}

	dmz_free_mblock(zmd, mblk);
	zmd->sb[1].mblk = NULL;
	zmd->sb[1].zone = NULL;
	zmd->sb[1].dev = NULL;

	return -EIO;
}

/*
 * Read a super block from disk.
 */
static int dmz_get_sb(struct dmz_metadata *zmd, struct dmz_sb *sb, int set)
{
	struct dmz_mblock *mblk;
	int ret;

	/* Allocate a block */
	mblk = dmz_alloc_mblock(zmd, 0);
	if (!mblk)
		return -ENOMEM;

	sb->mblk = mblk;
	sb->sb = mblk->data;

	/* Read super block */
	ret = dmz_read_sb(zmd, sb, set);
	if (ret) {
		dmz_free_mblock(zmd, mblk);
		sb->mblk = NULL;
		return ret;
	}

	return 0;
}

/*
 * Recover a metadata set.
 */
static int dmz_recover_mblocks(struct dmz_metadata *zmd, unsigned int dst_set)
{
	unsigned int src_set = dst_set ^ 0x1;
	struct page *page;
	int i, ret;

	dmz_dev_warn(zmd->sb[dst_set].dev,
		     "Metadata set %u invalid: recovering", dst_set);

	if (dst_set == 0)
		zmd->sb[0].block = dmz_start_block(zmd, zmd->sb[0].zone);
	else
		zmd->sb[1].block = dmz_start_block(zmd, zmd->sb[1].zone);

	page = alloc_page(GFP_NOIO);
	if (!page)
		return -ENOMEM;

	/* Copy metadata blocks */
	for (i = 1; i < zmd->nr_meta_blocks; i++) {
		ret = dmz_rdwr_block(zmd->sb[src_set].dev, REQ_OP_READ,
				     zmd->sb[src_set].block + i, page);
		if (ret)
			goto out;
		ret = dmz_rdwr_block(zmd->sb[dst_set].dev, REQ_OP_WRITE,
				     zmd->sb[dst_set].block + i, page);
		if (ret)
			goto out;
	}

	/* Finalize with the super block */
	if (!zmd->sb[dst_set].mblk) {
		zmd->sb[dst_set].mblk = dmz_alloc_mblock(zmd, 0);
		if (!zmd->sb[dst_set].mblk) {
			ret = -ENOMEM;
			goto out;
		}
		zmd->sb[dst_set].sb = zmd->sb[dst_set].mblk->data;
	}

	ret = dmz_write_sb(zmd, dst_set);
out:
	__free_pages(page, 0);

	return ret;
}

/*
 * Get super block from disk.
 */
static int dmz_load_sb(struct dmz_metadata *zmd)
{
	bool sb_good[2] = {false, false};
	u64 sb_gen[2] = {0, 0};
	int ret;

	if (!zmd->sb[0].zone) {
		dmz_zmd_err(zmd, "Primary super block zone not set");
		return -ENXIO;
	}

	/* Read and check the primary super block */
	zmd->sb[0].block = dmz_start_block(zmd, zmd->sb[0].zone);
	zmd->sb[0].dev = zmd->sb[0].zone->dev;
	ret = dmz_get_sb(zmd, &zmd->sb[0], 0);
	if (ret) {
		dmz_dev_err(zmd->sb[0].dev, "Read primary super block failed");
		return ret;
	}

	ret = dmz_check_sb(zmd, &zmd->sb[0], false);

	/* Read and check secondary super block */
	if (ret == 0) {
		sb_good[0] = true;
		if (!zmd->sb[1].zone) {
			unsigned int zone_id =
				zmd->sb[0].zone->id + zmd->nr_meta_zones;

			zmd->sb[1].zone = dmz_get(zmd, zone_id);
		}
		zmd->sb[1].block = dmz_start_block(zmd, zmd->sb[1].zone);
		zmd->sb[1].dev = zmd->sb[0].dev;
		ret = dmz_get_sb(zmd, &zmd->sb[1], 1);
	} else
		ret = dmz_lookup_secondary_sb(zmd);

	if (ret) {
		dmz_dev_err(zmd->sb[1].dev, "Read secondary super block failed");
		return ret;
	}

	ret = dmz_check_sb(zmd, &zmd->sb[1], false);
	if (ret == 0)
		sb_good[1] = true;

	/* Use highest generation sb first */
	if (!sb_good[0] && !sb_good[1]) {
		dmz_zmd_err(zmd, "No valid super block found");
		return -EIO;
	}

	if (sb_good[0])
		sb_gen[0] = le64_to_cpu(zmd->sb[0].sb->gen);
	else {
		ret = dmz_recover_mblocks(zmd, 0);
		if (ret) {
			dmz_dev_err(zmd->sb[0].dev,
				    "Recovery of superblock 0 failed");
			return -EIO;
		}
	}

	if (sb_good[1])
		sb_gen[1] = le64_to_cpu(zmd->sb[1].sb->gen);
	else {
		ret = dmz_recover_mblocks(zmd, 1);

		if (ret) {
			dmz_dev_err(zmd->sb[1].dev,
				    "Recovery of superblock 1 failed");
			return -EIO;
		}
	}

	if (sb_gen[0] >= sb_gen[1]) {
		zmd->sb_gen = sb_gen[0];
		zmd->mblk_primary = 0;
	} else {
		zmd->sb_gen = sb_gen[1];
		zmd->mblk_primary = 1;
	}

	dmz_dev_debug(zmd->sb[zmd->mblk_primary].dev,
		      "Using super block %u (gen %llu)",
		      zmd->mblk_primary, zmd->sb_gen);

	if (zmd->sb_version > 1) {
		int i;
		struct dmz_sb *sb;

		sb = kzalloc(sizeof(struct dmz_sb), GFP_KERNEL);
		if (!sb)
			return -ENOMEM;
		for (i = 1; i < zmd->nr_devs; i++) {
			sb->block = 0;
			sb->zone = dmz_get(zmd, zmd->dev[i].zone_offset);
			sb->dev = &zmd->dev[i];
			if (!dmz_is_meta(sb->zone)) {
				dmz_dev_err(sb->dev,
					    "Tertiary super block zone %u not marked as metadata zone",
					    sb->zone->id);
				ret = -EINVAL;
				goto out_kfree;
			}
			ret = dmz_get_sb(zmd, sb, i + 1);
			if (ret) {
				dmz_dev_err(sb->dev,
					    "Read tertiary super block failed");
				dmz_free_mblock(zmd, sb->mblk);
				goto out_kfree;
			}
			ret = dmz_check_sb(zmd, sb, true);
			dmz_free_mblock(zmd, sb->mblk);
			if (ret == -EINVAL)
				goto out_kfree;
		}
out_kfree:
		kfree(sb);
	}
	return ret;
}
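
/*
 * Example of the generation-based selection above (illustrative values):
 * if set 0 carries gen 7 and set 1 carries gen 8, a flush must have
 * validated set 1 as the log before a crash, so set 1 becomes the
 * primary (mblk_primary = 1) and set 0 is simply overwritten by the
 * next flush. Ties favor set 0.
 */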

/*
 * Initialize a zone descriptor.
 */
static int dmz_init_zone(struct blk_zone *blkz, unsigned int num, void *data)
{
	struct dmz_dev *dev = data;
	struct dmz_metadata *zmd = dev->metadata;
	int idx = num + dev->zone_offset;
	struct dm_zone *zone;

	zone = dmz_insert(zmd, idx, dev);
	if (IS_ERR(zone))
		return PTR_ERR(zone);

	if (blkz->len != zmd->zone_nr_sectors) {
		if (zmd->sb_version > 1) {
			/* Ignore the eventual runt (smaller) zone */
			set_bit(DMZ_OFFLINE, &zone->flags);
			return 0;
		} else if (blkz->start + blkz->len == dev->capacity)
			return 0;
		return -ENXIO;
	}

	switch (blkz->type) {
	case BLK_ZONE_TYPE_CONVENTIONAL:
		set_bit(DMZ_RND, &zone->flags);
		break;
	case BLK_ZONE_TYPE_SEQWRITE_REQ:
	case BLK_ZONE_TYPE_SEQWRITE_PREF:
		set_bit(DMZ_SEQ, &zone->flags);
		break;
	default:
		return -ENXIO;
	}

	if (dmz_is_rnd(zone))
		zone->wp_block = 0;
	else
		zone->wp_block = dmz_sect2blk(blkz->wp - blkz->start);

	if (blkz->cond == BLK_ZONE_COND_OFFLINE)
		set_bit(DMZ_OFFLINE, &zone->flags);
	else if (blkz->cond == BLK_ZONE_COND_READONLY)
		set_bit(DMZ_READ_ONLY, &zone->flags);
	else {
		zmd->nr_useable_zones++;
		if (dmz_is_rnd(zone)) {
			zmd->nr_rnd_zones++;
			if (zmd->nr_devs == 1 && !zmd->sb[0].zone) {
				/* Primary super block zone */
				zmd->sb[0].zone = zone;
			}
		}
		if (zmd->nr_devs > 1 && num == 0) {
			/*
			 * Tertiary superblock zones are always at the
			 * start of the zoned devices, so mark them
			 * as metadata zone.
			 */
			set_bit(DMZ_META, &zone->flags);
		}
	}
	return 0;
}

static int dmz_emulate_zones(struct dmz_metadata *zmd, struct dmz_dev *dev)
{
	int idx;
	sector_t zone_offset = 0;

	for (idx = 0; idx < dev->nr_zones; idx++) {
		struct dm_zone *zone;

		zone = dmz_insert(zmd, idx, dev);
		if (IS_ERR(zone))
			return PTR_ERR(zone);
		set_bit(DMZ_CACHE, &zone->flags);
		zone->wp_block = 0;
		zmd->nr_cache_zones++;
		zmd->nr_useable_zones++;
		if (dev->capacity - zone_offset < zmd->zone_nr_sectors) {
			/* Disable runt zone */
			set_bit(DMZ_OFFLINE, &zone->flags);
			break;
		}
		zone_offset += zmd->zone_nr_sectors;
	}
	return 0;
}

/*
 * Free zone descriptors.
 */
static void dmz_drop_zones(struct dmz_metadata *zmd)
{
	int idx;

	for (idx = 0; idx < zmd->nr_zones; idx++) {
		struct dm_zone *zone = xa_load(&zmd->zones, idx);

		kfree(zone);
		xa_erase(&zmd->zones, idx);
	}
	xa_destroy(&zmd->zones);
}

/*
 * Allocate and initialize zone descriptors using the zone
 * information from disk.
 */
static int dmz_init_zones(struct dmz_metadata *zmd)
{
	int i, ret;
	struct dmz_dev *zoned_dev = &zmd->dev[0];

	/* Init */
	zmd->zone_nr_sectors = zmd->dev[0].zone_nr_sectors;
	zmd->zone_nr_sectors_shift = ilog2(zmd->zone_nr_sectors);
	zmd->zone_nr_blocks = dmz_sect2blk(zmd->zone_nr_sectors);
	zmd->zone_nr_blocks_shift = ilog2(zmd->zone_nr_blocks);
	zmd->zone_bitmap_size = zmd->zone_nr_blocks >> 3;
	zmd->zone_nr_bitmap_blocks =
		max_t(sector_t, 1, zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT);
	zmd->zone_bits_per_mblk = min_t(sector_t, zmd->zone_nr_blocks,
					DMZ_BLOCK_SIZE_BITS);

	/* Allocate zone array */
	zmd->nr_zones = 0;
	for (i = 0; i < zmd->nr_devs; i++) {
		struct dmz_dev *dev = &zmd->dev[i];

		dev->metadata = zmd;
		zmd->nr_zones += dev->nr_zones;

		atomic_set(&dev->unmap_nr_rnd, 0);
		INIT_LIST_HEAD(&dev->unmap_rnd_list);
		INIT_LIST_HEAD(&dev->map_rnd_list);

		atomic_set(&dev->unmap_nr_seq, 0);
		INIT_LIST_HEAD(&dev->unmap_seq_list);
		INIT_LIST_HEAD(&dev->map_seq_list);
	}

	if (!zmd->nr_zones) {
		DMERR("(%s): No zones found", zmd->devname);
		return -ENXIO;
	}
	xa_init(&zmd->zones);

	DMDEBUG("(%s): Using %zu B for zone information",
		zmd->devname, sizeof(struct dm_zone) * zmd->nr_zones);

	if (zmd->nr_devs > 1) {
		ret = dmz_emulate_zones(zmd, &zmd->dev[0]);
		if (ret < 0) {
			DMDEBUG("(%s): Failed to emulate zones, error %d",
				zmd->devname, ret);
			dmz_drop_zones(zmd);
			return ret;
		}

		/*
		 * Primary superblock zone is always at zone 0 when multiple
		 * drives are present.
		 */
		zmd->sb[0].zone = dmz_get(zmd, 0);

		for (i = 1; i < zmd->nr_devs; i++) {
			zoned_dev = &zmd->dev[i];

			ret = blkdev_report_zones(zoned_dev->bdev, 0,
						  BLK_ALL_ZONES,
						  dmz_init_zone, zoned_dev);
			if (ret < 0) {
				DMDEBUG("(%s): Failed to report zones, error %d",
					zmd->devname, ret);
				dmz_drop_zones(zmd);
				return ret;
			}
		}
		return 0;
	}

	/*
	 * Get zone information and initialize zone descriptors. At the same
	 * time, determine where the super block should be: first block of the
	 * first randomly writable zone.
	 */
	ret = blkdev_report_zones(zoned_dev->bdev, 0, BLK_ALL_ZONES,
				  dmz_init_zone, zoned_dev);
	if (ret < 0) {
		DMDEBUG("(%s): Failed to report zones, error %d",
			zmd->devname, ret);
		dmz_drop_zones(zmd);
		return ret;
	}

	return 0;
}

static int dmz_update_zone_cb(struct blk_zone *blkz, unsigned int idx,
			      void *data)
{
	struct dm_zone *zone = data;

	clear_bit(DMZ_OFFLINE, &zone->flags);
	clear_bit(DMZ_READ_ONLY, &zone->flags);
	if (blkz->cond == BLK_ZONE_COND_OFFLINE)
		set_bit(DMZ_OFFLINE, &zone->flags);
	else if (blkz->cond == BLK_ZONE_COND_READONLY)
		set_bit(DMZ_READ_ONLY, &zone->flags);

	if (dmz_is_seq(zone))
		zone->wp_block = dmz_sect2blk(blkz->wp - blkz->start);
	else
		zone->wp_block = 0;
	return 0;
}

/*
 * Update a zone's information.
 */
static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
{
	struct dmz_dev *dev = zone->dev;
	unsigned int noio_flag;
	int ret;

	if (dev->flags & DMZ_BDEV_REGULAR)
		return 0;

	/*
	 * Get zone information from disk. Since blkdev_report_zones() uses
	 * GFP_KERNEL by default for memory allocations, set the per-task
	 * PF_MEMALLOC_NOIO flag so that all allocations are done as if
	 * GFP_NOIO was specified.
	 */
	noio_flag = memalloc_noio_save();
	ret = blkdev_report_zones(dev->bdev, dmz_start_sect(zmd, zone), 1,
				  dmz_update_zone_cb, zone);
	memalloc_noio_restore(noio_flag);

	if (ret == 0)
		ret = -EIO;
	if (ret < 0) {
		dmz_dev_err(dev, "Get zone %u report failed",
			    zone->id);
		dmz_check_bdev(dev);
		return ret;
	}

	return 0;
}

/*
 * Check a zone write pointer position when the zone is marked
 * with the sequential write error flag.
 */
static int dmz_handle_seq_write_err(struct dmz_metadata *zmd,
				    struct dm_zone *zone)
{
	struct dmz_dev *dev = zone->dev;
	unsigned int wp = 0;
	int ret;

	wp = zone->wp_block;
	ret = dmz_update_zone(zmd, zone);
	if (ret)
		return ret;

	dmz_dev_warn(dev, "Processing zone %u write error (zone wp %u/%u)",
		     zone->id, zone->wp_block, wp);

	if (zone->wp_block < wp) {
		dmz_invalidate_blocks(zmd, zone, zone->wp_block,
				      wp - zone->wp_block);
	}

	return 0;
}

/*
 * Reset a zone write pointer.
 */
static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
{
	int ret;

	/*
	 * Ignore offline zones, read only zones,
	 * and conventional zones.
	 */
	if (dmz_is_offline(zone) ||
	    dmz_is_readonly(zone) ||
	    dmz_is_rnd(zone))
		return 0;

	if (!dmz_is_empty(zone) || dmz_seq_write_err(zone)) {
		struct dmz_dev *dev = zone->dev;

		ret = blkdev_zone_mgmt(dev->bdev, REQ_OP_ZONE_RESET,
				       dmz_start_sect(zmd, zone),
				       zmd->zone_nr_sectors, GFP_NOIO);
		if (ret) {
			dmz_dev_err(dev, "Reset zone %u failed %d",
				    zone->id, ret);
			return ret;
		}
	}

	/* Clear write error bit and rewind write pointer position */
	clear_bit(DMZ_SEQ_WRITE_ERR, &zone->flags);
	zone->wp_block = 0;

	return 0;
}

static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone);
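
/*
 * Worked example for dmz_handle_seq_write_err() above (illustrative
 * numbers): if the cached write pointer was at block 100 but the device
 * reports the zone write pointer back at block 60 after a write error,
 * blocks 60..99 can no longer hold valid data and are invalidated, so
 * subsequent reads of those blocks are served elsewhere (e.g. from the
 * chunk's buffer zone).
 */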

/*
 * Initialize chunk mapping.
 */
static int dmz_load_mapping(struct dmz_metadata *zmd)
{
	struct dm_zone *dzone, *bzone;
	struct dmz_mblock *dmap_mblk = NULL;
	struct dmz_map *dmap;
	unsigned int i = 0, e = 0, chunk = 0;
	unsigned int dzone_id;
	unsigned int bzone_id;

	/* Metadata block array for the chunk mapping table */
	zmd->map_mblk = kcalloc(zmd->nr_map_blocks,
				sizeof(struct dmz_mblock *), GFP_KERNEL);
	if (!zmd->map_mblk)
		return -ENOMEM;

	/* Get chunk mapping table blocks and initialize zone mapping */
	while (chunk < zmd->nr_chunks) {
		if (!dmap_mblk) {
			/* Get mapping block */
			dmap_mblk = dmz_get_mblock(zmd, i + 1);
			if (IS_ERR(dmap_mblk))
				return PTR_ERR(dmap_mblk);
			zmd->map_mblk[i] = dmap_mblk;
			dmap = (struct dmz_map *) dmap_mblk->data;
			i++;
			e = 0;
		}

		/* Check data zone */
		dzone_id = le32_to_cpu(dmap[e].dzone_id);
		if (dzone_id == DMZ_MAP_UNMAPPED)
			goto next;

		if (dzone_id >= zmd->nr_zones) {
			dmz_zmd_err(zmd, "Chunk %u mapping: invalid data zone ID %u",
				    chunk, dzone_id);
			return -EIO;
		}

		dzone = dmz_get(zmd, dzone_id);
		if (!dzone) {
			dmz_zmd_err(zmd, "Chunk %u mapping: data zone %u not present",
				    chunk, dzone_id);
			return -EIO;
		}
		set_bit(DMZ_DATA, &dzone->flags);
		dzone->chunk = chunk;
		dmz_get_zone_weight(zmd, dzone);

		if (dmz_is_cache(dzone))
			list_add_tail(&dzone->link, &zmd->map_cache_list);
		else if (dmz_is_rnd(dzone))
			list_add_tail(&dzone->link, &dzone->dev->map_rnd_list);
		else
			list_add_tail(&dzone->link, &dzone->dev->map_seq_list);

		/* Check buffer zone */
		bzone_id = le32_to_cpu(dmap[e].bzone_id);
		if (bzone_id == DMZ_MAP_UNMAPPED)
			goto next;

		if (bzone_id >= zmd->nr_zones) {
			dmz_zmd_err(zmd, "Chunk %u mapping: invalid buffer zone ID %u",
				    chunk, bzone_id);
			return -EIO;
		}

		bzone = dmz_get(zmd, bzone_id);
		if (!bzone) {
			dmz_zmd_err(zmd, "Chunk %u mapping: buffer zone %u not present",
				    chunk, bzone_id);
			return -EIO;
		}
		if (!dmz_is_rnd(bzone) && !dmz_is_cache(bzone)) {
			dmz_zmd_err(zmd, "Chunk %u mapping: invalid buffer zone %u",
				    chunk, bzone_id);
			return -EIO;
		}

		set_bit(DMZ_DATA, &bzone->flags);
		set_bit(DMZ_BUF, &bzone->flags);
		bzone->chunk = chunk;
		bzone->bzone = dzone;
		dzone->bzone = bzone;
		dmz_get_zone_weight(zmd, bzone);
		if (dmz_is_cache(bzone))
			list_add_tail(&bzone->link, &zmd->map_cache_list);
		else
			list_add_tail(&bzone->link, &bzone->dev->map_rnd_list);
next:
		chunk++;
		e++;
		if (e >= DMZ_MAP_ENTRIES)
			dmap_mblk = NULL;
	}
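
	/*
	 * Mapping table indexing, for reference (illustrative arithmetic
	 * assuming DMZ_MAP_ENTRIES = 512): chunk 1000 lives in mapping
	 * block 1000 >> DMZ_MAP_ENTRIES_SHIFT = 1, at entry
	 * 1000 & DMZ_MAP_ENTRIES_MASK = 488; dmz_set_chunk_mapping()
	 * below performs the same computation.
	 */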

	/*
	 * At this point, only meta zones and mapped data zones were
	 * fully initialized. All remaining zones are unmapped data
	 * zones. Finish initializing those here.
	 */
	for (i = 0; i < zmd->nr_zones; i++) {
		dzone = dmz_get(zmd, i);
		if (!dzone)
			continue;
		if (dmz_is_meta(dzone))
			continue;
		if (dmz_is_offline(dzone))
			continue;

		if (dmz_is_cache(dzone))
			zmd->nr_cache++;
		else if (dmz_is_rnd(dzone))
			dzone->dev->nr_rnd++;
		else
			dzone->dev->nr_seq++;

		if (dmz_is_data(dzone)) {
			/* Already initialized */
			continue;
		}

		/* Unmapped data zone */
		set_bit(DMZ_DATA, &dzone->flags);
		dzone->chunk = DMZ_MAP_UNMAPPED;
		if (dmz_is_cache(dzone)) {
			list_add_tail(&dzone->link, &zmd->unmap_cache_list);
			atomic_inc(&zmd->unmap_nr_cache);
		} else if (dmz_is_rnd(dzone)) {
			list_add_tail(&dzone->link,
				      &dzone->dev->unmap_rnd_list);
			atomic_inc(&dzone->dev->unmap_nr_rnd);
		} else if (atomic_read(&zmd->nr_reserved_seq_zones) < zmd->nr_reserved_seq) {
			list_add_tail(&dzone->link, &zmd->reserved_seq_zones_list);
			set_bit(DMZ_RESERVED, &dzone->flags);
			atomic_inc(&zmd->nr_reserved_seq_zones);
			dzone->dev->nr_seq--;
		} else {
			list_add_tail(&dzone->link,
				      &dzone->dev->unmap_seq_list);
			atomic_inc(&dzone->dev->unmap_nr_seq);
		}
	}

	return 0;
}

/*
 * Set a data chunk mapping.
 */
static void dmz_set_chunk_mapping(struct dmz_metadata *zmd, unsigned int chunk,
				  unsigned int dzone_id, unsigned int bzone_id)
{
	struct dmz_mblock *dmap_mblk = zmd->map_mblk[chunk >> DMZ_MAP_ENTRIES_SHIFT];
	struct dmz_map *dmap = (struct dmz_map *) dmap_mblk->data;
	int map_idx = chunk & DMZ_MAP_ENTRIES_MASK;

	dmap[map_idx].dzone_id = cpu_to_le32(dzone_id);
	dmap[map_idx].bzone_id = cpu_to_le32(bzone_id);
	dmz_dirty_mblock(zmd, dmap_mblk);
}

/*
 * The list of mapped zones is maintained in LRU order.
 * This rotates a zone to the end of its map list.
 */
static void __dmz_lru_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
{
	if (list_empty(&zone->link))
		return;

	list_del_init(&zone->link);
	if (dmz_is_seq(zone)) {
		/* LRU rotate sequential zone */
		list_add_tail(&zone->link, &zone->dev->map_seq_list);
	} else if (dmz_is_cache(zone)) {
		/* LRU rotate cache zone */
		list_add_tail(&zone->link, &zmd->map_cache_list);
	} else {
		/* LRU rotate random zone */
		list_add_tail(&zone->link, &zone->dev->map_rnd_list);
	}
}

/*
 * The list of mapped random zones is maintained
 * in LRU order. This rotates a zone to the end of the list.
 */
static void dmz_lru_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
{
	__dmz_lru_zone(zmd, zone);
	if (zone->bzone)
		__dmz_lru_zone(zmd, zone->bzone);
}

/*
 * Wait for any zone to be freed.
 */
static void dmz_wait_for_free_zones(struct dmz_metadata *zmd)
{
	DEFINE_WAIT(wait);

	prepare_to_wait(&zmd->free_wq, &wait, TASK_UNINTERRUPTIBLE);
	dmz_unlock_map(zmd);
	dmz_unlock_metadata(zmd);

	io_schedule_timeout(HZ);

	dmz_lock_metadata(zmd);
	dmz_lock_map(zmd);
	finish_wait(&zmd->free_wq, &wait);
}

/*
 * Lock a zone for reclaim (set the zone RECLAIM bit).
 * Returns 0 if the zone cannot be locked or if it is already locked,
 * and 1 otherwise.
 */
int dmz_lock_zone_reclaim(struct dm_zone *zone)
{
	/* Active zones cannot be reclaimed */
	if (dmz_is_active(zone))
		return 0;

	return !test_and_set_bit(DMZ_RECLAIM, &zone->flags);
}

/*
 * Clear a zone reclaim flag.
 */
void dmz_unlock_zone_reclaim(struct dm_zone *zone)
{
	WARN_ON(dmz_is_active(zone));
	WARN_ON(!dmz_in_reclaim(zone));

	clear_bit_unlock(DMZ_RECLAIM, &zone->flags);
	smp_mb__after_atomic();
	wake_up_bit(&zone->flags, DMZ_RECLAIM);
}

/*
 * Wait for a zone reclaim to complete.
 */
static void dmz_wait_for_reclaim(struct dmz_metadata *zmd, struct dm_zone *zone)
{
	dmz_unlock_map(zmd);
	dmz_unlock_metadata(zmd);
	set_bit(DMZ_RECLAIM_TERMINATE, &zone->flags);
	wait_on_bit_timeout(&zone->flags, DMZ_RECLAIM, TASK_UNINTERRUPTIBLE, HZ);
	clear_bit(DMZ_RECLAIM_TERMINATE, &zone->flags);
	dmz_lock_metadata(zmd);
	dmz_lock_map(zmd);
}

/*
 * Select a cache or random write zone for reclaim.
 */
static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd,
						    unsigned int idx, bool idle)
{
	struct dm_zone *dzone = NULL;
	struct dm_zone *zone, *maxw_z = NULL;
	struct list_head *zone_list;

	/* If we have cache zones select from the cache zone list */
	if (zmd->nr_cache) {
		zone_list = &zmd->map_cache_list;
		/* Try to reclaim random zones, too, when idle */
		if (idle && list_empty(zone_list))
			zone_list = &zmd->dev[idx].map_rnd_list;
	} else
		zone_list = &zmd->dev[idx].map_rnd_list;

	/*
	 * Find the buffer zone with the heaviest weight or the first (oldest)
	 * data zone that can be reclaimed.
	 */
	list_for_each_entry(zone, zone_list, link) {
		if (dmz_is_buf(zone)) {
			dzone = zone->bzone;
			if (dmz_is_rnd(dzone) && dzone->dev->dev_idx != idx)
				continue;
			if (!maxw_z || maxw_z->weight < dzone->weight)
				maxw_z = dzone;
		} else {
			dzone = zone;
			if (dmz_lock_zone_reclaim(dzone))
				return dzone;
		}
	}

	if (maxw_z && dmz_lock_zone_reclaim(maxw_z))
		return maxw_z;

	/*
	 * If we come here, none of the zones inspected could be locked for
	 * reclaim. Try again, being more aggressive, that is, find the
	 * first zone that can be reclaimed regardless of its weight.
	 */
	list_for_each_entry(zone, zone_list, link) {
		if (dmz_is_buf(zone)) {
			dzone = zone->bzone;
			if (dmz_is_rnd(dzone) && dzone->dev->dev_idx != idx)
				continue;
		} else
			dzone = zone;
		if (dmz_lock_zone_reclaim(dzone))
			return dzone;
	}

	return NULL;
}

/*
 * Select a buffered sequential zone for reclaim.
 */
static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd,
						    unsigned int idx)
{
	struct dm_zone *zone;

	list_for_each_entry(zone, &zmd->dev[idx].map_seq_list, link) {
		if (!zone->bzone)
			continue;
		if (dmz_lock_zone_reclaim(zone))
			return zone;
	}

	return NULL;
}

/*
 * Select a zone for reclaim.
 */
struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd,
					 unsigned int dev_idx, bool idle)
{
	struct dm_zone *zone = NULL;

	/*
	 * Search for a zone candidate to reclaim: 2 cases are possible.
	 * (1) There are no free sequential zones. Then a random data zone
	 *     cannot be reclaimed. So choose a sequential zone to reclaim so
	 *     that afterward a random zone can be reclaimed.
	 * (2) At least one free sequential zone is available. Then choose
	 *     the oldest random zone (data or buffer) that can be locked.
	 */
	dmz_lock_map(zmd);
	if (list_empty(&zmd->reserved_seq_zones_list))
		zone = dmz_get_seq_zone_for_reclaim(zmd, dev_idx);
	if (!zone)
		zone = dmz_get_rnd_zone_for_reclaim(zmd, dev_idx, idle);
	dmz_unlock_map(zmd);

	return zone;
}

/*
 * Get the zone mapping a chunk, if the chunk is mapped already.
 * If no mapping exists and the operation is WRITE, a zone is
 * allocated and used to map the chunk.
 * The zone returned will be set to the active state.
 */
struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd, unsigned int chunk, int op)
{
	struct dmz_mblock *dmap_mblk = zmd->map_mblk[chunk >> DMZ_MAP_ENTRIES_SHIFT];
	struct dmz_map *dmap = (struct dmz_map *) dmap_mblk->data;
	int dmap_idx = chunk & DMZ_MAP_ENTRIES_MASK;
	unsigned int dzone_id;
	struct dm_zone *dzone = NULL;
	int ret = 0;
	int alloc_flags = zmd->nr_cache ? DMZ_ALLOC_CACHE : DMZ_ALLOC_RND;

	dmz_lock_map(zmd);
again:
	/* Get the chunk mapping */
	dzone_id = le32_to_cpu(dmap[dmap_idx].dzone_id);
	if (dzone_id == DMZ_MAP_UNMAPPED) {
		/*
		 * Read or discard in unmapped chunks are fine. But for
		 * writes, we need a mapping, so get one.
		 */
		if (op != REQ_OP_WRITE)
			goto out;

		/* Allocate a random zone */
		dzone = dmz_alloc_zone(zmd, 0, alloc_flags);
		if (!dzone) {
			if (dmz_dev_is_dying(zmd)) {
				dzone = ERR_PTR(-EIO);
				goto out;
			}
			dmz_wait_for_free_zones(zmd);
			goto again;
		}

		dmz_map_zone(zmd, dzone, chunk);

	} else {
		/* The chunk is already mapped: get the mapping zone */
		dzone = dmz_get(zmd, dzone_id);
		if (!dzone) {
			dzone = ERR_PTR(-EIO);
			goto out;
		}
		if (dzone->chunk != chunk) {
			dzone = ERR_PTR(-EIO);
			goto out;
		}

		/* Repair write pointer if the sequential dzone has error */
		if (dmz_seq_write_err(dzone)) {
			ret = dmz_handle_seq_write_err(zmd, dzone);
			if (ret) {
				dzone = ERR_PTR(-EIO);
				goto out;
			}
			clear_bit(DMZ_SEQ_WRITE_ERR, &dzone->flags);
		}
	}

	/*
	 * If the zone is being reclaimed, the chunk mapping may change
	 * to a different zone. So wait for reclaim and retry. Otherwise,
	 * activate the zone (this will prevent reclaim from touching it).
	 */
	if (dmz_in_reclaim(dzone)) {
		dmz_wait_for_reclaim(zmd, dzone);
		goto again;
	}
	dmz_activate_zone(dzone);
	dmz_lru_zone(zmd, dzone);
out:
	dmz_unlock_map(zmd);

	return dzone;
}

/*
 * Write and discard change the block validity of data zones and their buffer
 * zones. Check here that valid blocks are still present. If all blocks are
 * invalid, the zones can be unmapped on the fly without waiting for reclaim
 * to do it.
 */
void dmz_put_chunk_mapping(struct dmz_metadata *zmd, struct dm_zone *dzone)
{
	struct dm_zone *bzone;

	dmz_lock_map(zmd);

	bzone = dzone->bzone;
	if (bzone) {
		if (dmz_weight(bzone))
			dmz_lru_zone(zmd, bzone);
		else {
			/* Empty buffer zone: reclaim it */
			dmz_unmap_zone(zmd, bzone);
			dmz_free_zone(zmd, bzone);
			bzone = NULL;
		}
	}

	/* Deactivate the data zone */
	dmz_deactivate_zone(dzone);
	if (dmz_is_active(dzone) || bzone || dmz_weight(dzone))
		dmz_lru_zone(zmd, dzone);
	else {
		/* Unbuffered inactive empty data zone: reclaim it */
		dmz_unmap_zone(zmd, dzone);
		dmz_free_zone(zmd, dzone);
	}

	dmz_unlock_map(zmd);
}

/*
 * Allocate and map a random zone to buffer a chunk
 * already mapped to a sequential zone.
 */
struct dm_zone *dmz_get_chunk_buffer(struct dmz_metadata *zmd,
				     struct dm_zone *dzone)
{
	struct dm_zone *bzone;
	int alloc_flags = zmd->nr_cache ? DMZ_ALLOC_CACHE : DMZ_ALLOC_RND;

	dmz_lock_map(zmd);
again:
	bzone = dzone->bzone;
	if (bzone)
		goto out;

	/* Allocate a random zone */
	bzone = dmz_alloc_zone(zmd, 0, alloc_flags);
	if (!bzone) {
		if (dmz_dev_is_dying(zmd)) {
			bzone = ERR_PTR(-EIO);
			goto out;
		}
		dmz_wait_for_free_zones(zmd);
		goto again;
	}

	/* Update the chunk mapping */
	dmz_set_chunk_mapping(zmd, dzone->chunk, dzone->id, bzone->id);

	set_bit(DMZ_BUF, &bzone->flags);
	bzone->chunk = dzone->chunk;
	bzone->bzone = dzone;
	dzone->bzone = bzone;
	if (dmz_is_cache(bzone))
		list_add_tail(&bzone->link, &zmd->map_cache_list);
	else
		list_add_tail(&bzone->link, &bzone->dev->map_rnd_list);
out:
	dmz_unlock_map(zmd);

	return bzone;
}
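
/*
 * Pointer topology sketch (illustration only): after
 * dmz_get_chunk_buffer() returns, the data zone and its buffer zone
 * point at each other through the bzone field:
 *
 *   dzone->bzone == bzone   (data zone -> write buffer)
 *   bzone->bzone == dzone   (buffer zone -> buffered data zone)
 *   bzone->chunk == dzone->chunk
 *
 * dmz_unmap_zone() relies on this symmetry to clear both links when
 * the buffer is torn down.
 */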

/*
 * Get an unmapped (free) zone.
 * This must be called with the mapping lock held.
 */
struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned int dev_idx,
			       unsigned long flags)
{
	struct list_head *list;
	struct dm_zone *zone;
	int i = 0;

again:
	if (flags & DMZ_ALLOC_CACHE)
		list = &zmd->unmap_cache_list;
	else if (flags & DMZ_ALLOC_RND)
		list = &zmd->dev[dev_idx].unmap_rnd_list;
	else
		list = &zmd->dev[dev_idx].unmap_seq_list;

	if (list_empty(list)) {
		/*
		 * No free zone: return NULL if this is not for reclaim.
		 */
		if (!(flags & DMZ_ALLOC_RECLAIM))
			return NULL;
		/*
		 * Try to allocate from other devices.
		 */
		if (i < zmd->nr_devs) {
			dev_idx = (dev_idx + 1) % zmd->nr_devs;
			i++;
			goto again;
		}

		/*
		 * Fall back to the reserved sequential zones.
		 */
		zone = list_first_entry_or_null(&zmd->reserved_seq_zones_list,
						struct dm_zone, link);
		if (zone) {
			list_del_init(&zone->link);
			atomic_dec(&zmd->nr_reserved_seq_zones);
		}
		return zone;
	}

	zone = list_first_entry(list, struct dm_zone, link);
	list_del_init(&zone->link);

	if (dmz_is_cache(zone))
		atomic_dec(&zmd->unmap_nr_cache);
	else if (dmz_is_rnd(zone))
		atomic_dec(&zone->dev->unmap_nr_rnd);
	else
		atomic_dec(&zone->dev->unmap_nr_seq);

	if (dmz_is_offline(zone)) {
		dmz_zmd_warn(zmd, "Zone %u is offline", zone->id);
		zone = NULL;
		goto again;
	}
	if (dmz_is_meta(zone)) {
		dmz_zmd_warn(zmd, "Zone %u has metadata", zone->id);
		zone = NULL;
		goto again;
	}
	return zone;
}

/*
 * Free a zone.
 * This must be called with the mapping lock held.
 */
void dmz_free_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
{
	/* If this is a sequential zone, reset it */
	if (dmz_is_seq(zone))
		dmz_reset_zone(zmd, zone);

	/* Return the zone to its type unmap list */
	if (dmz_is_cache(zone)) {
		list_add_tail(&zone->link, &zmd->unmap_cache_list);
		atomic_inc(&zmd->unmap_nr_cache);
	} else if (dmz_is_rnd(zone)) {
		list_add_tail(&zone->link, &zone->dev->unmap_rnd_list);
		atomic_inc(&zone->dev->unmap_nr_rnd);
	} else if (dmz_is_reserved(zone)) {
		list_add_tail(&zone->link, &zmd->reserved_seq_zones_list);
		atomic_inc(&zmd->nr_reserved_seq_zones);
	} else {
		list_add_tail(&zone->link, &zone->dev->unmap_seq_list);
		atomic_inc(&zone->dev->unmap_nr_seq);
	}

	wake_up_all(&zmd->free_wq);
}

/*
 * Map a chunk to a zone.
 * This must be called with the mapping lock held.
 */
void dmz_map_zone(struct dmz_metadata *zmd, struct dm_zone *dzone,
		  unsigned int chunk)
{
	/* Set the chunk mapping */
	dmz_set_chunk_mapping(zmd, chunk, dzone->id,
			      DMZ_MAP_UNMAPPED);
	dzone->chunk = chunk;
	if (dmz_is_cache(dzone))
		list_add_tail(&dzone->link, &zmd->map_cache_list);
	else if (dmz_is_rnd(dzone))
		list_add_tail(&dzone->link, &dzone->dev->map_rnd_list);
	else
		list_add_tail(&dzone->link, &dzone->dev->map_seq_list);
}
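
/*
 * Caller pattern sketch (illustration only): allocation and mapping
 * happen back to back under the mapping lock, as in
 * dmz_get_chunk_mapping(). The function name and chunk argument below
 * are hypothetical.
 */
#if 0
static void example_map_new_chunk(struct dmz_metadata *zmd, unsigned int chunk)
{
	struct dm_zone *zone;

	dmz_lock_map(zmd);
	zone = dmz_alloc_zone(zmd, 0, DMZ_ALLOC_RND);	/* or DMZ_ALLOC_CACHE */
	if (zone)
		dmz_map_zone(zmd, zone, chunk);
	dmz_unlock_map(zmd);
}
#endif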

/*
 * Unmap a zone.
 * This must be called with the mapping lock held.
 */
void dmz_unmap_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
{
	unsigned int chunk = zone->chunk;
	unsigned int dzone_id;

	if (chunk == DMZ_MAP_UNMAPPED) {
		/* Already unmapped */
		return;
	}

	if (test_and_clear_bit(DMZ_BUF, &zone->flags)) {
		/*
		 * Unmapping the chunk buffer zone: clear only
		 * the chunk buffer mapping
		 */
		dzone_id = zone->bzone->id;
		zone->bzone->bzone = NULL;
		zone->bzone = NULL;

	} else {
		/*
		 * Unmapping the chunk data zone: the zone must
		 * not be buffered.
		 */
		if (WARN_ON(zone->bzone)) {
			zone->bzone->bzone = NULL;
			zone->bzone = NULL;
		}
		dzone_id = DMZ_MAP_UNMAPPED;
	}

	dmz_set_chunk_mapping(zmd, chunk, dzone_id, DMZ_MAP_UNMAPPED);

	zone->chunk = DMZ_MAP_UNMAPPED;
	list_del_init(&zone->link);
}

/*
 * Set @nr_bits bits in @bitmap starting from @bit.
 * Return the number of bits changed from 0 to 1.
 */
static unsigned int dmz_set_bits(unsigned long *bitmap,
				 unsigned int bit, unsigned int nr_bits)
{
	unsigned long *addr;
	unsigned int end = bit + nr_bits;
	unsigned int n = 0;

	while (bit < end) {
		if (((bit & (BITS_PER_LONG - 1)) == 0) &&
		    ((end - bit) >= BITS_PER_LONG)) {
			/* Try to set the whole word at once */
			addr = bitmap + BIT_WORD(bit);
			if (*addr == 0) {
				*addr = ULONG_MAX;
				n += BITS_PER_LONG;
				bit += BITS_PER_LONG;
				continue;
			}
		}

		if (!test_and_set_bit(bit, bitmap))
			n++;
		bit++;
	}

	return n;
}

/*
 * Get the bitmap block storing the bit for chunk_block in zone.
 */
static struct dmz_mblock *dmz_get_bitmap(struct dmz_metadata *zmd,
					 struct dm_zone *zone,
					 sector_t chunk_block)
{
	sector_t bitmap_block = 1 + zmd->nr_map_blocks +
		(sector_t)(zone->id * zmd->zone_nr_bitmap_blocks) +
		(chunk_block >> DMZ_BLOCK_SHIFT_BITS);

	return dmz_get_mblock(zmd, bitmap_block);
}

/*
 * Copy the valid blocks bitmap of from_zone to the bitmap of to_zone.
 */
int dmz_copy_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone,
			  struct dm_zone *to_zone)
{
	struct dmz_mblock *from_mblk, *to_mblk;
	sector_t chunk_block = 0;

	/* Get the zones' bitmap blocks */
	while (chunk_block < zmd->zone_nr_blocks) {
		from_mblk = dmz_get_bitmap(zmd, from_zone, chunk_block);
		if (IS_ERR(from_mblk))
			return PTR_ERR(from_mblk);
		to_mblk = dmz_get_bitmap(zmd, to_zone, chunk_block);
		if (IS_ERR(to_mblk)) {
			dmz_release_mblock(zmd, from_mblk);
			return PTR_ERR(to_mblk);
		}

		memcpy(to_mblk->data, from_mblk->data, DMZ_BLOCK_SIZE);
		dmz_dirty_mblock(zmd, to_mblk);

		dmz_release_mblock(zmd, to_mblk);
		dmz_release_mblock(zmd, from_mblk);

		chunk_block += zmd->zone_bits_per_mblk;
	}

	to_zone->weight = from_zone->weight;

	return 0;
}

/*
 * Merge the valid blocks bitmap of from_zone into the bitmap of to_zone,
 * starting from chunk_block.
 */
int dmz_merge_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone,
			   struct dm_zone *to_zone, sector_t chunk_block)
{
	unsigned int nr_blocks;
	int ret;

	/* Get the zones' bitmap blocks */
	while (chunk_block < zmd->zone_nr_blocks) {
		/* Get a valid region from the source zone */
		ret = dmz_first_valid_block(zmd, from_zone, &chunk_block);
		if (ret <= 0)
			return ret;

		nr_blocks = ret;
		ret = dmz_validate_blocks(zmd, to_zone, chunk_block, nr_blocks);
		if (ret)
			return ret;

		chunk_block += nr_blocks;
	}

	return 0;
}
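
/*
 * Bitmap layout sketch (illustration only): dmz_get_bitmap() above
 * relies on the metadata layout being, in block units,
 *
 *   block 0                        super block
 *   blocks 1 .. nr_map_blocks      chunk mapping table
 *   then, per zone z:
 *     1 + nr_map_blocks + z * zone_nr_bitmap_blocks   first bitmap block
 *
 * so the bitmap block for a given chunk_block is found by adding
 * chunk_block >> DMZ_BLOCK_SHIFT_BITS to that per-zone base.
 */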

/*
 * Validate all the blocks in the range [block..block+nr_blocks-1].
 */
int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
			sector_t chunk_block, unsigned int nr_blocks)
{
	unsigned int count, bit, nr_bits;
	unsigned int zone_nr_blocks = zmd->zone_nr_blocks;
	struct dmz_mblock *mblk;
	unsigned int n = 0;

	dmz_zmd_debug(zmd, "=> VALIDATE zone %u, block %llu, %u blocks",
		      zone->id, (unsigned long long)chunk_block,
		      nr_blocks);

	WARN_ON(chunk_block + nr_blocks > zone_nr_blocks);

	while (nr_blocks) {
		/* Get bitmap block */
		mblk = dmz_get_bitmap(zmd, zone, chunk_block);
		if (IS_ERR(mblk))
			return PTR_ERR(mblk);

		/* Set bits */
		bit = chunk_block & DMZ_BLOCK_MASK_BITS;
		nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);

		count = dmz_set_bits((unsigned long *)mblk->data, bit, nr_bits);
		if (count) {
			dmz_dirty_mblock(zmd, mblk);
			n += count;
		}
		dmz_release_mblock(zmd, mblk);

		nr_blocks -= nr_bits;
		chunk_block += nr_bits;
	}

	if (likely(zone->weight + n <= zone_nr_blocks))
		zone->weight += n;
	else {
		dmz_zmd_warn(zmd, "Zone %u: weight %u should be <= %u",
			     zone->id, zone->weight,
			     zone_nr_blocks - n);
		zone->weight = zone_nr_blocks;
	}

	return 0;
}

/*
 * Clear nr_bits bits in bitmap starting from bit.
 * Return the number of bits cleared.
 */
static int dmz_clear_bits(unsigned long *bitmap, int bit, int nr_bits)
{
	unsigned long *addr;
	int end = bit + nr_bits;
	int n = 0;

	while (bit < end) {
		if (((bit & (BITS_PER_LONG - 1)) == 0) &&
		    ((end - bit) >= BITS_PER_LONG)) {
			/* Try to clear the whole word at once */
			addr = bitmap + BIT_WORD(bit);
			if (*addr == ULONG_MAX) {
				*addr = 0;
				n += BITS_PER_LONG;
				bit += BITS_PER_LONG;
				continue;
			}
		}

		if (test_and_clear_bit(bit, bitmap))
			n++;
		bit++;
	}

	return n;
}

/*
 * Invalidate all the blocks in the range [block..block+nr_blocks-1].
 */
int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
			  sector_t chunk_block, unsigned int nr_blocks)
{
	unsigned int count, bit, nr_bits;
	struct dmz_mblock *mblk;
	unsigned int n = 0;

	dmz_zmd_debug(zmd, "=> INVALIDATE zone %u, block %llu, %u blocks",
		      zone->id, (u64)chunk_block, nr_blocks);

	WARN_ON(chunk_block + nr_blocks > zmd->zone_nr_blocks);

	while (nr_blocks) {
		/* Get bitmap block */
		mblk = dmz_get_bitmap(zmd, zone, chunk_block);
		if (IS_ERR(mblk))
			return PTR_ERR(mblk);

		/* Clear bits */
		bit = chunk_block & DMZ_BLOCK_MASK_BITS;
		nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);

		count = dmz_clear_bits((unsigned long *)mblk->data,
				       bit, nr_bits);
		if (count) {
			dmz_dirty_mblock(zmd, mblk);
			n += count;
		}
		dmz_release_mblock(zmd, mblk);

		nr_blocks -= nr_bits;
		chunk_block += nr_bits;
	}

	if (zone->weight >= n)
		zone->weight -= n;
	else {
		dmz_zmd_warn(zmd, "Zone %u: weight %u should be >= %u",
			     zone->id, zone->weight, n);
		zone->weight = 0;
	}

	return 0;
}
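
/*
 * Invariant sketch (illustration only): zone->weight tracks the number
 * of valid (set) bits in the zone bitmap, so validate/invalidate are
 * symmetric. Validating n previously invalid blocks adds n to the
 * weight and invalidating them again removes n, e.g.
 *
 *   dmz_validate_blocks(zmd, zone, 0, 8);    weight += 8
 *   dmz_invalidate_blocks(zmd, zone, 0, 8);  weight -= 8
 *
 * Both functions clamp the weight and warn instead of letting it wrap.
 */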

/*
 * Get a block bit value.
 */
static int dmz_test_block(struct dmz_metadata *zmd, struct dm_zone *zone,
			  sector_t chunk_block)
{
	struct dmz_mblock *mblk;
	int ret;

	WARN_ON(chunk_block >= zmd->zone_nr_blocks);

	/* Get bitmap block */
	mblk = dmz_get_bitmap(zmd, zone, chunk_block);
	if (IS_ERR(mblk))
		return PTR_ERR(mblk);

	/* Get offset */
	ret = test_bit(chunk_block & DMZ_BLOCK_MASK_BITS,
		       (unsigned long *) mblk->data) != 0;

	dmz_release_mblock(zmd, mblk);

	return ret;
}

/*
 * Return the number of blocks from chunk_block to the first block with a bit
 * value specified by set. Search at most nr_blocks blocks from chunk_block.
 */
static int dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone,
				 sector_t chunk_block, unsigned int nr_blocks,
				 int set)
{
	struct dmz_mblock *mblk;
	unsigned int bit, set_bit, nr_bits;
	unsigned int zone_bits = zmd->zone_bits_per_mblk;
	unsigned long *bitmap;
	int n = 0;

	WARN_ON(chunk_block + nr_blocks > zmd->zone_nr_blocks);

	while (nr_blocks) {
		/* Get bitmap block */
		mblk = dmz_get_bitmap(zmd, zone, chunk_block);
		if (IS_ERR(mblk))
			return PTR_ERR(mblk);

		/* Get offset */
		bitmap = (unsigned long *) mblk->data;
		bit = chunk_block & DMZ_BLOCK_MASK_BITS;
		nr_bits = min(nr_blocks, zone_bits - bit);
		if (set)
			set_bit = find_next_bit(bitmap, zone_bits, bit);
		else
			set_bit = find_next_zero_bit(bitmap, zone_bits, bit);
		dmz_release_mblock(zmd, mblk);

		n += set_bit - bit;
		if (set_bit < zone_bits)
			break;

		nr_blocks -= nr_bits;
		chunk_block += nr_bits;
	}

	return n;
}

/*
 * Test if chunk_block is valid. If it is, the number of consecutive
 * valid blocks from chunk_block will be returned.
 */
int dmz_block_valid(struct dmz_metadata *zmd, struct dm_zone *zone,
		    sector_t chunk_block)
{
	int valid;

	valid = dmz_test_block(zmd, zone, chunk_block);
	if (valid <= 0)
		return valid;

	/* The block is valid: get the number of valid blocks from block */
	return dmz_to_next_set_block(zmd, zone, chunk_block,
				     zmd->zone_nr_blocks - chunk_block, 0);
}

/*
 * Find the first valid block from @chunk_block in @zone.
 * If such a block is found, its number is returned using
 * @chunk_block and the total number of valid blocks from @chunk_block
 * is returned.
 */
int dmz_first_valid_block(struct dmz_metadata *zmd, struct dm_zone *zone,
			  sector_t *chunk_block)
{
	sector_t start_block = *chunk_block;
	int ret;

	ret = dmz_to_next_set_block(zmd, zone, start_block,
				    zmd->zone_nr_blocks - start_block, 1);
	if (ret < 0)
		return ret;

	start_block += ret;
	*chunk_block = start_block;

	return dmz_to_next_set_block(zmd, zone, start_block,
				     zmd->zone_nr_blocks - start_block, 0);
}
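
/*
 * Worked example (illustration only): with a zone bitmap whose first
 * byte is 0b00111100 (blocks 2-5 valid) and all later bits clear,
 * starting the search at block 0:
 *
 *   dmz_first_valid_block(zmd, zone, &b)  with b = 0
 *     -> sets b = 2 and returns 4 (the length of the valid run)
 *
 *   dmz_block_valid(zmd, zone, 3)
 *     -> returns 3 (blocks 3, 4 and 5 are consecutively valid)
 */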

/*
 * Count the number of bits set starting from bit up to bit + nr_bits - 1.
 */
static int dmz_count_bits(void *bitmap, int bit, int nr_bits)
{
	unsigned long *addr;
	int end = bit + nr_bits;
	int n = 0;

	while (bit < end) {
		if (((bit & (BITS_PER_LONG - 1)) == 0) &&
		    ((end - bit) >= BITS_PER_LONG)) {
			addr = (unsigned long *)bitmap + BIT_WORD(bit);
			if (*addr == ULONG_MAX) {
				n += BITS_PER_LONG;
				bit += BITS_PER_LONG;
				continue;
			}
		}

		if (test_bit(bit, bitmap))
			n++;
		bit++;
	}

	return n;
}

/*
 * Get a zone weight.
 */
static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone)
{
	struct dmz_mblock *mblk;
	sector_t chunk_block = 0;
	unsigned int bit, nr_bits;
	unsigned int nr_blocks = zmd->zone_nr_blocks;
	void *bitmap;
	int n = 0;

	while (nr_blocks) {
		/* Get bitmap block */
		mblk = dmz_get_bitmap(zmd, zone, chunk_block);
		if (IS_ERR(mblk)) {
			n = 0;
			break;
		}

		/* Count bits in this block */
		bitmap = mblk->data;
		bit = chunk_block & DMZ_BLOCK_MASK_BITS;
		nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);
		n += dmz_count_bits(bitmap, bit, nr_bits);

		dmz_release_mblock(zmd, mblk);

		nr_blocks -= nr_bits;
		chunk_block += nr_bits;
	}

	zone->weight = n;
}

/*
 * Cleanup the zoned metadata resources.
 */
static void dmz_cleanup_metadata(struct dmz_metadata *zmd)
{
	struct rb_root *root;
	struct dmz_mblock *mblk, *next;
	int i;

	/* Release zone mapping resources */
	if (zmd->map_mblk) {
		for (i = 0; i < zmd->nr_map_blocks; i++)
			dmz_release_mblock(zmd, zmd->map_mblk[i]);
		kfree(zmd->map_mblk);
		zmd->map_mblk = NULL;
	}

	/* Release super blocks */
	for (i = 0; i < 2; i++) {
		if (zmd->sb[i].mblk) {
			dmz_free_mblock(zmd, zmd->sb[i].mblk);
			zmd->sb[i].mblk = NULL;
		}
	}

	/* Free cached blocks */
	while (!list_empty(&zmd->mblk_dirty_list)) {
		mblk = list_first_entry(&zmd->mblk_dirty_list,
					struct dmz_mblock, link);
		dmz_zmd_warn(zmd, "mblock %llu still in dirty list (ref %u)",
			     (u64)mblk->no, mblk->ref);
		list_del_init(&mblk->link);
		rb_erase(&mblk->node, &zmd->mblk_rbtree);
		dmz_free_mblock(zmd, mblk);
	}

	while (!list_empty(&zmd->mblk_lru_list)) {
		mblk = list_first_entry(&zmd->mblk_lru_list,
					struct dmz_mblock, link);
		list_del_init(&mblk->link);
		rb_erase(&mblk->node, &zmd->mblk_rbtree);
		dmz_free_mblock(zmd, mblk);
	}

	/* Sanity checks: the mblock rbtree should now be empty */
	root = &zmd->mblk_rbtree;
	rbtree_postorder_for_each_entry_safe(mblk, next, root, node) {
		dmz_zmd_warn(zmd, "mblock %llu ref %u still in rbtree",
			     (u64)mblk->no, mblk->ref);
		mblk->ref = 0;
		dmz_free_mblock(zmd, mblk);
	}

	/* Free the zone descriptors */
	dmz_drop_zones(zmd);

	mutex_destroy(&zmd->mblk_flush_lock);
	mutex_destroy(&zmd->map_lock);
}
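
/*
 * Fast-path sketch (illustration only): dmz_set_bits(), dmz_clear_bits()
 * and dmz_count_bits() above share the same word-at-a-time optimization.
 * A word-aligned run of at least BITS_PER_LONG bits whose word is
 * uniform (all 0 for set, all 1 for clear/count) is handled in one step
 * instead of one test_and_*_bit() call per bit. With 64-bit longs:
 *
 *   bit = 64, end = 192, *addr == 0
 *     -> dmz_set_bits() writes ULONG_MAX and advances bit by 64
 */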
2847 "aware" : "managed"); 2848 if (zmd->sb_version > 1) { 2849 sector_t sector_offset = 2850 dev->zone_offset << zmd->zone_nr_sectors_shift; 2851 2852 dmz_dev_info(dev, " %llu 512-byte logical sectors (offset %llu)", 2853 (u64)dev->capacity, (u64)sector_offset); 2854 dmz_dev_info(dev, " %u zones of %llu 512-byte logical sectors (offset %llu)", 2855 dev->nr_zones, (u64)zmd->zone_nr_sectors, 2856 (u64)dev->zone_offset); 2857 } else { 2858 dmz_dev_info(dev, " %llu 512-byte logical sectors", 2859 (u64)dev->capacity); 2860 dmz_dev_info(dev, " %u zones of %llu 512-byte logical sectors", 2861 dev->nr_zones, (u64)zmd->zone_nr_sectors); 2862 } 2863 } 2864 2865 /* 2866 * Initialize the zoned metadata. 2867 */ 2868 int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev, 2869 struct dmz_metadata **metadata, 2870 const char *devname) 2871 { 2872 struct dmz_metadata *zmd; 2873 unsigned int i; 2874 struct dm_zone *zone; 2875 int ret; 2876 2877 zmd = kzalloc(sizeof(struct dmz_metadata), GFP_KERNEL); 2878 if (!zmd) 2879 return -ENOMEM; 2880 2881 strcpy(zmd->devname, devname); 2882 zmd->dev = dev; 2883 zmd->nr_devs = num_dev; 2884 zmd->mblk_rbtree = RB_ROOT; 2885 init_rwsem(&zmd->mblk_sem); 2886 mutex_init(&zmd->mblk_flush_lock); 2887 spin_lock_init(&zmd->mblk_lock); 2888 INIT_LIST_HEAD(&zmd->mblk_lru_list); 2889 INIT_LIST_HEAD(&zmd->mblk_dirty_list); 2890 2891 mutex_init(&zmd->map_lock); 2892 2893 atomic_set(&zmd->unmap_nr_cache, 0); 2894 INIT_LIST_HEAD(&zmd->unmap_cache_list); 2895 INIT_LIST_HEAD(&zmd->map_cache_list); 2896 2897 atomic_set(&zmd->nr_reserved_seq_zones, 0); 2898 INIT_LIST_HEAD(&zmd->reserved_seq_zones_list); 2899 2900 init_waitqueue_head(&zmd->free_wq); 2901 2902 /* Initialize zone descriptors */ 2903 ret = dmz_init_zones(zmd); 2904 if (ret) 2905 goto err; 2906 2907 /* Get super block */ 2908 ret = dmz_load_sb(zmd); 2909 if (ret) 2910 goto err; 2911 2912 /* Set metadata zones starting from sb_zone */ 2913 for (i = 0; i < zmd->nr_meta_zones << 1; i++) { 2914 zone = dmz_get(zmd, zmd->sb[0].zone->id + i); 2915 if (!zone) { 2916 dmz_zmd_err(zmd, 2917 "metadata zone %u not present", i); 2918 ret = -ENXIO; 2919 goto err; 2920 } 2921 if (!dmz_is_rnd(zone) && !dmz_is_cache(zone)) { 2922 dmz_zmd_err(zmd, 2923 "metadata zone %d is not random", i); 2924 ret = -ENXIO; 2925 goto err; 2926 } 2927 set_bit(DMZ_META, &zone->flags); 2928 } 2929 /* Load mapping table */ 2930 ret = dmz_load_mapping(zmd); 2931 if (ret) 2932 goto err; 2933 2934 /* 2935 * Cache size boundaries: allow at least 2 super blocks, the chunk map 2936 * blocks and enough blocks to be able to cache the bitmap blocks of 2937 * up to 16 zones when idle (min_nr_mblks). Otherwise, if busy, allow 2938 * the cache to add 512 more metadata blocks. 

/*
 * Initialize the zoned metadata.
 */
int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev,
		     struct dmz_metadata **metadata,
		     const char *devname)
{
	struct dmz_metadata *zmd;
	unsigned int i;
	struct dm_zone *zone;
	int ret;

	zmd = kzalloc(sizeof(struct dmz_metadata), GFP_KERNEL);
	if (!zmd)
		return -ENOMEM;

	strcpy(zmd->devname, devname);
	zmd->dev = dev;
	zmd->nr_devs = num_dev;
	zmd->mblk_rbtree = RB_ROOT;
	init_rwsem(&zmd->mblk_sem);
	mutex_init(&zmd->mblk_flush_lock);
	spin_lock_init(&zmd->mblk_lock);
	INIT_LIST_HEAD(&zmd->mblk_lru_list);
	INIT_LIST_HEAD(&zmd->mblk_dirty_list);

	mutex_init(&zmd->map_lock);

	atomic_set(&zmd->unmap_nr_cache, 0);
	INIT_LIST_HEAD(&zmd->unmap_cache_list);
	INIT_LIST_HEAD(&zmd->map_cache_list);

	atomic_set(&zmd->nr_reserved_seq_zones, 0);
	INIT_LIST_HEAD(&zmd->reserved_seq_zones_list);

	init_waitqueue_head(&zmd->free_wq);

	/* Initialize zone descriptors */
	ret = dmz_init_zones(zmd);
	if (ret)
		goto err;

	/* Get super block */
	ret = dmz_load_sb(zmd);
	if (ret)
		goto err;

	/* Set metadata zones starting from sb_zone */
	for (i = 0; i < zmd->nr_meta_zones << 1; i++) {
		zone = dmz_get(zmd, zmd->sb[0].zone->id + i);
		if (!zone) {
			dmz_zmd_err(zmd,
				    "metadata zone %u not present", i);
			ret = -ENXIO;
			goto err;
		}
		if (!dmz_is_rnd(zone) && !dmz_is_cache(zone)) {
			dmz_zmd_err(zmd,
				    "metadata zone %u is not random", i);
			ret = -ENXIO;
			goto err;
		}
		set_bit(DMZ_META, &zone->flags);
	}
	/* Load mapping table */
	ret = dmz_load_mapping(zmd);
	if (ret)
		goto err;

	/*
	 * Cache size boundaries: allow at least 2 super blocks, the chunk map
	 * blocks and enough blocks to be able to cache the bitmap blocks of
	 * up to 16 zones when idle (min_nr_mblks). Otherwise, if busy, allow
	 * the cache to add 512 more metadata blocks.
	 */
	zmd->min_nr_mblks = 2 + zmd->nr_map_blocks + zmd->zone_nr_bitmap_blocks * 16;
	zmd->max_nr_mblks = zmd->min_nr_mblks + 512;
	zmd->mblk_shrinker.count_objects = dmz_mblock_shrinker_count;
	zmd->mblk_shrinker.scan_objects = dmz_mblock_shrinker_scan;
	zmd->mblk_shrinker.seeks = DEFAULT_SEEKS;

	/* Metadata cache shrinker */
	ret = register_shrinker(&zmd->mblk_shrinker);
	if (ret) {
		dmz_zmd_err(zmd, "Register metadata cache shrinker failed");
		goto err;
	}

	dmz_zmd_info(zmd, "DM-Zoned metadata version %d", zmd->sb_version);
	for (i = 0; i < zmd->nr_devs; i++)
		dmz_print_dev(zmd, i);

	dmz_zmd_info(zmd, "  %u zones of %llu 512-byte logical sectors",
		     zmd->nr_zones, (u64)zmd->zone_nr_sectors);
	dmz_zmd_debug(zmd, "  %u metadata zones",
		      zmd->nr_meta_zones * 2);
	dmz_zmd_debug(zmd, "  %u data zones for %u chunks",
		      zmd->nr_data_zones, zmd->nr_chunks);
	dmz_zmd_debug(zmd, "  %u cache zones (%u unmapped)",
		      zmd->nr_cache, atomic_read(&zmd->unmap_nr_cache));
	for (i = 0; i < zmd->nr_devs; i++) {
		dmz_zmd_debug(zmd, "  %u random zones (%u unmapped)",
			      dmz_nr_rnd_zones(zmd, i),
			      dmz_nr_unmap_rnd_zones(zmd, i));
		dmz_zmd_debug(zmd, "  %u sequential zones (%u unmapped)",
			      dmz_nr_seq_zones(zmd, i),
			      dmz_nr_unmap_seq_zones(zmd, i));
	}
	dmz_zmd_debug(zmd, "  %u reserved sequential data zones",
		      zmd->nr_reserved_seq);
	dmz_zmd_debug(zmd, "Format:");
	dmz_zmd_debug(zmd, "%u metadata blocks per set (%u max cache)",
		      zmd->nr_meta_blocks, zmd->max_nr_mblks);
	dmz_zmd_debug(zmd, "  %u data zone mapping blocks",
		      zmd->nr_map_blocks);
	dmz_zmd_debug(zmd, "  %u bitmap blocks",
		      zmd->nr_bitmap_blocks);

	*metadata = zmd;

	return 0;
err:
	dmz_cleanup_metadata(zmd);
	kfree(zmd);
	*metadata = NULL;

	return ret;
}

/*
 * Destroy the zoned metadata and release all resources.
 */
void dmz_dtr_metadata(struct dmz_metadata *zmd)
{
	unregister_shrinker(&zmd->mblk_shrinker);
	dmz_cleanup_metadata(zmd);
	kfree(zmd);
}

/*
 * Check zone information on resume.
 */
int dmz_resume_metadata(struct dmz_metadata *zmd)
{
	struct dm_zone *zone;
	sector_t wp_block;
	unsigned int i;
	int ret;

	/* Check zones */
	for (i = 0; i < zmd->nr_zones; i++) {
		zone = dmz_get(zmd, i);
		if (!zone) {
			dmz_zmd_err(zmd, "Unable to get zone %u", i);
			return -EIO;
		}
		wp_block = zone->wp_block;

		ret = dmz_update_zone(zmd, zone);
		if (ret) {
			dmz_zmd_err(zmd, "Broken zone %u", i);
			return ret;
		}

		if (dmz_is_offline(zone)) {
			dmz_zmd_warn(zmd, "Zone %u is offline", i);
			continue;
		}

		/* Check write pointer */
		if (!dmz_is_seq(zone))
			zone->wp_block = 0;
		else if (zone->wp_block != wp_block) {
			dmz_zmd_err(zmd, "Zone %u: Invalid wp (%llu / %llu)",
				    i, (u64)zone->wp_block, (u64)wp_block);
			zone->wp_block = wp_block;
			dmz_invalidate_blocks(zmd, zone, zone->wp_block,
					      zmd->zone_nr_blocks - zone->wp_block);
		}
	}

	return 0;
}
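
/*
 * Lifecycle sketch (illustration only): a target is expected to pair
 * construction with destruction, calling dmz_resume_metadata() when the
 * device is resumed. The function name, single-device count and "dmz"
 * label below are hypothetical; error handling is abbreviated.
 */
#if 0
static int example_metadata_lifecycle(struct dmz_dev *dev)
{
	struct dmz_metadata *zmd;
	int ret;

	ret = dmz_ctr_metadata(dev, 1, &zmd, "dmz");
	if (ret)
		return ret;

	/* ... normal target operation; on resume, revalidate zones: */
	ret = dmz_resume_metadata(zmd);

	dmz_dtr_metadata(zmd);
	return ret;
}
#endif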