// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2017 Western Digital Corporation or its affiliates.
 *
 * This file is released under the GPL.
 */

#include "dm-zoned.h"

#include <linux/module.h>
#include <linux/crc32.h>
#include <linux/sched/mm.h>

#define DM_MSG_PREFIX		"zoned metadata"

/*
 * Metadata version.
 */
#define DMZ_META_VER	2

/*
 * On-disk super block magic.
 */
#define DMZ_MAGIC	((((unsigned int)('D')) << 24) | \
			 (((unsigned int)('Z')) << 16) | \
			 (((unsigned int)('B')) <<  8) | \
			 ((unsigned int)('D')))

/*
 * On-disk super block.
 * This uses only 512 B but uses on disk a full 4KB block. This block is
 * followed on disk by the mapping table of chunks to zones and the bitmap
 * blocks indicating zone block validity.
 * The overall resulting metadata format is:
 *    (1) Super block (1 block)
 *    (2) Chunk mapping table (nr_map_blocks)
 *    (3) Bitmap blocks (nr_bitmap_blocks)
 * All metadata blocks are stored in conventional zones, starting from
 * the first conventional zone found on disk.
 */
struct dmz_super {
	/* Magic number */
	__le32		magic;			/*   4 */

	/* Metadata version number */
	__le32		version;		/*   8 */

	/* Generation number */
	__le64		gen;			/*  16 */

	/* This block number */
	__le64		sb_block;		/*  24 */

	/* The number of metadata blocks, including this super block */
	__le32		nr_meta_blocks;		/*  28 */

	/* The number of sequential zones reserved for reclaim */
	__le32		nr_reserved_seq;	/*  32 */

	/* The number of entries in the mapping table */
	__le32		nr_chunks;		/*  36 */

	/* The number of blocks used for the chunk mapping table */
	__le32		nr_map_blocks;		/*  40 */

	/* The number of blocks used for the block bitmaps */
	__le32		nr_bitmap_blocks;	/*  44 */

	/* Checksum */
	__le32		crc;			/*  48 */

	/* DM-Zoned label */
	u8		dmz_label[32];		/*  80 */

	/* DM-Zoned UUID */
	u8		dmz_uuid[16];		/*  96 */

	/* Device UUID */
	u8		dev_uuid[16];		/* 112 */

	/* Padding to full 512B sector */
	u8		reserved[400];		/* 512 */
};

/*
 * Chunk mapping entry: entries are indexed by chunk number
 * and give the zone ID (dzone_id) mapping the chunk on disk.
 * This zone may be sequential or random. If it is a sequential
 * zone, a second zone (bzone_id) used as a write buffer may
 * also be specified. This second zone will always be a randomly
 * writeable zone.
 */
struct dmz_map {
	__le32			dzone_id;
	__le32			bzone_id;
};

/*
 * Chunk mapping table metadata: 512 8-byte entries per 4KB block.
 */
#define DMZ_MAP_ENTRIES		(DMZ_BLOCK_SIZE / sizeof(struct dmz_map))
#define DMZ_MAP_ENTRIES_SHIFT	(ilog2(DMZ_MAP_ENTRIES))
#define DMZ_MAP_ENTRIES_MASK	(DMZ_MAP_ENTRIES - 1)
#define DMZ_MAP_UNMAPPED	UINT_MAX

/*
 * Metadata block descriptor (for cached metadata blocks).
 */
struct dmz_mblock {
	struct rb_node		node;
	struct list_head	link;
	sector_t		no;
	unsigned int		ref;
	unsigned long		state;
	struct page		*page;
	void			*data;
};

/*
 * Metadata block state flags.
 */
enum {
	DMZ_META_DIRTY,
	DMZ_META_READING,
	DMZ_META_WRITING,
	DMZ_META_ERROR,
};

/*
 * Super block information (one per metadata set).
 */
struct dmz_sb {
	sector_t		block;
	struct dmz_dev		*dev;
	struct dmz_mblock	*mblk;
	struct dmz_super	*sb;
	struct dm_zone		*zone;
};
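/*
 * Note: two of these descriptors (zmd->sb[0] and zmd->sb[1]) describe the
 * primary and secondary metadata sets. With metadata version 2 and more
 * than one device, each additional device also carries a tertiary super
 * block in its first zone; it is validated at load time but holds no
 * mapping or bitmap data.
 */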
/*
 * In-memory metadata.
 */
struct dmz_metadata {
	struct dmz_dev		*dev;
	unsigned int		nr_devs;

	char			devname[BDEVNAME_SIZE];
	char			label[BDEVNAME_SIZE];
	uuid_t			uuid;

	sector_t		zone_bitmap_size;
	unsigned int		zone_nr_bitmap_blocks;
	unsigned int		zone_bits_per_mblk;

	sector_t		zone_nr_blocks;
	sector_t		zone_nr_blocks_shift;

	sector_t		zone_nr_sectors;
	sector_t		zone_nr_sectors_shift;

	unsigned int		nr_bitmap_blocks;
	unsigned int		nr_map_blocks;

	unsigned int		nr_zones;
	unsigned int		nr_useable_zones;
	unsigned int		nr_meta_blocks;
	unsigned int		nr_meta_zones;
	unsigned int		nr_data_zones;
	unsigned int		nr_cache_zones;
	unsigned int		nr_rnd_zones;
	unsigned int		nr_reserved_seq;
	unsigned int		nr_chunks;

	/* Zone information array */
	struct xarray		zones;

	struct dmz_sb		sb[2];
	unsigned int		mblk_primary;
	unsigned int		sb_version;
	u64			sb_gen;
	unsigned int		min_nr_mblks;
	unsigned int		max_nr_mblks;
	atomic_t		nr_mblks;
	struct rw_semaphore	mblk_sem;
	struct mutex		mblk_flush_lock;
	spinlock_t		mblk_lock;
	struct rb_root		mblk_rbtree;
	struct list_head	mblk_lru_list;
	struct list_head	mblk_dirty_list;
	struct shrinker		mblk_shrinker;

	/* Zone allocation management */
	struct mutex		map_lock;
	struct dmz_mblock	**map_mblk;

	unsigned int		nr_cache;
	atomic_t		unmap_nr_cache;
	struct list_head	unmap_cache_list;
	struct list_head	map_cache_list;

	atomic_t		nr_reserved_seq_zones;
	struct list_head	reserved_seq_zones_list;

	wait_queue_head_t	free_wq;
};

#define dmz_zmd_info(zmd, format, args...)	\
	DMINFO("(%s): " format, (zmd)->label, ## args)

#define dmz_zmd_err(zmd, format, args...)	\
	DMERR("(%s): " format, (zmd)->label, ## args)

#define dmz_zmd_warn(zmd, format, args...)	\
	DMWARN("(%s): " format, (zmd)->label, ## args)

#define dmz_zmd_debug(zmd, format, args...)	\
	DMDEBUG("(%s): " format, (zmd)->label, ## args)
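/*
 * These wrappers tag every message with the target label; for example,
 * dmz_zmd_info(zmd, "version %d", zmd->sb_version) is printed roughly as
 * "device-mapper: zoned metadata: (<label>): version 2", with the prefix
 * supplied by the DM logging macros and DM_MSG_PREFIX above.
 */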
/*
 * Various accessors
 */
static unsigned int dmz_dev_zone_id(struct dmz_metadata *zmd, struct dm_zone *zone)
{
	if (WARN_ON(!zone))
		return 0;

	return zone->id - zone->dev->zone_offset;
}

sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone)
{
	unsigned int zone_id = dmz_dev_zone_id(zmd, zone);

	return (sector_t)zone_id << zmd->zone_nr_sectors_shift;
}

sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone)
{
	unsigned int zone_id = dmz_dev_zone_id(zmd, zone);

	return (sector_t)zone_id << zmd->zone_nr_blocks_shift;
}

unsigned int dmz_zone_nr_blocks(struct dmz_metadata *zmd)
{
	return zmd->zone_nr_blocks;
}

unsigned int dmz_zone_nr_blocks_shift(struct dmz_metadata *zmd)
{
	return zmd->zone_nr_blocks_shift;
}

unsigned int dmz_zone_nr_sectors(struct dmz_metadata *zmd)
{
	return zmd->zone_nr_sectors;
}

unsigned int dmz_zone_nr_sectors_shift(struct dmz_metadata *zmd)
{
	return zmd->zone_nr_sectors_shift;
}

unsigned int dmz_nr_zones(struct dmz_metadata *zmd)
{
	return zmd->nr_zones;
}

unsigned int dmz_nr_chunks(struct dmz_metadata *zmd)
{
	return zmd->nr_chunks;
}

unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd, int idx)
{
	return zmd->dev[idx].nr_rnd;
}

unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd, int idx)
{
	return atomic_read(&zmd->dev[idx].unmap_nr_rnd);
}

unsigned int dmz_nr_cache_zones(struct dmz_metadata *zmd)
{
	return zmd->nr_cache;
}

unsigned int dmz_nr_unmap_cache_zones(struct dmz_metadata *zmd)
{
	return atomic_read(&zmd->unmap_nr_cache);
}

unsigned int dmz_nr_seq_zones(struct dmz_metadata *zmd, int idx)
{
	return zmd->dev[idx].nr_seq;
}

unsigned int dmz_nr_unmap_seq_zones(struct dmz_metadata *zmd, int idx)
{
	return atomic_read(&zmd->dev[idx].unmap_nr_seq);
}

static struct dm_zone *dmz_get(struct dmz_metadata *zmd, unsigned int zone_id)
{
	return xa_load(&zmd->zones, zone_id);
}

static struct dm_zone *dmz_insert(struct dmz_metadata *zmd,
				  unsigned int zone_id, struct dmz_dev *dev)
{
	struct dm_zone *zone = kzalloc(sizeof(struct dm_zone), GFP_KERNEL);

	if (!zone)
		return ERR_PTR(-ENOMEM);

	if (xa_insert(&zmd->zones, zone_id, zone, GFP_KERNEL)) {
		kfree(zone);
		return ERR_PTR(-EBUSY);
	}

	INIT_LIST_HEAD(&zone->link);
	atomic_set(&zone->refcount, 0);
	zone->id = zone_id;
	zone->chunk = DMZ_MAP_UNMAPPED;
	zone->dev = dev;

	return zone;
}

const char *dmz_metadata_label(struct dmz_metadata *zmd)
{
	return (const char *)zmd->label;
}

bool dmz_check_dev(struct dmz_metadata *zmd)
{
	unsigned int i;

	for (i = 0; i < zmd->nr_devs; i++) {
		if (!dmz_check_bdev(&zmd->dev[i]))
			return false;
	}
	return true;
}

bool dmz_dev_is_dying(struct dmz_metadata *zmd)
{
	unsigned int i;

	for (i = 0; i < zmd->nr_devs; i++) {
		if (dmz_bdev_is_dying(&zmd->dev[i]))
			return true;
	}
	return false;
}
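/*
 * Locking overview, as implied by the helpers below and their callers in
 * this file: zmd->mblk_sem is taken for writing only while flushing, and
 * for reading around metadata updates; the map_lock mutex nests inside it
 * (see dmz_wait_for_free_zones()), and the mblk_lock spinlock protecting
 * the mblock rbtree and lists is innermost.
 */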
/*
 * Lock/unlock mapping table.
 * The map lock also protects all the zone lists.
 */
void dmz_lock_map(struct dmz_metadata *zmd)
{
	mutex_lock(&zmd->map_lock);
}

void dmz_unlock_map(struct dmz_metadata *zmd)
{
	mutex_unlock(&zmd->map_lock);
}

/*
 * Lock/unlock metadata access. This is a "read" lock on a semaphore
 * that prevents metadata flush from running while metadata are being
 * modified. The actual metadata write mutual exclusion is achieved with
 * the map lock and zone state management (active and reclaim state are
 * mutually exclusive).
 */
void dmz_lock_metadata(struct dmz_metadata *zmd)
{
	down_read(&zmd->mblk_sem);
}

void dmz_unlock_metadata(struct dmz_metadata *zmd)
{
	up_read(&zmd->mblk_sem);
}

/*
 * Lock/unlock flush: prevent concurrent executions
 * of dmz_flush_metadata as well as metadata modification in reclaim
 * while flush is being executed.
 */
void dmz_lock_flush(struct dmz_metadata *zmd)
{
	mutex_lock(&zmd->mblk_flush_lock);
}

void dmz_unlock_flush(struct dmz_metadata *zmd)
{
	mutex_unlock(&zmd->mblk_flush_lock);
}

/*
 * Allocate a metadata block.
 */
static struct dmz_mblock *dmz_alloc_mblock(struct dmz_metadata *zmd,
					   sector_t mblk_no)
{
	struct dmz_mblock *mblk = NULL;

	/* See if we can reuse cached blocks */
	if (zmd->max_nr_mblks && atomic_read(&zmd->nr_mblks) > zmd->max_nr_mblks) {
		spin_lock(&zmd->mblk_lock);
		mblk = list_first_entry_or_null(&zmd->mblk_lru_list,
						struct dmz_mblock, link);
		if (mblk) {
			list_del_init(&mblk->link);
			rb_erase(&mblk->node, &zmd->mblk_rbtree);
			mblk->no = mblk_no;
		}
		spin_unlock(&zmd->mblk_lock);
		if (mblk)
			return mblk;
	}

	/* Allocate a new block */
	mblk = kmalloc(sizeof(struct dmz_mblock), GFP_NOIO);
	if (!mblk)
		return NULL;

	mblk->page = alloc_page(GFP_NOIO);
	if (!mblk->page) {
		kfree(mblk);
		return NULL;
	}

	RB_CLEAR_NODE(&mblk->node);
	INIT_LIST_HEAD(&mblk->link);
	mblk->ref = 0;
	mblk->state = 0;
	mblk->no = mblk_no;
	mblk->data = page_address(mblk->page);

	atomic_inc(&zmd->nr_mblks);

	return mblk;
}

/*
 * Free a metadata block.
 */
static void dmz_free_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk)
{
	__free_pages(mblk->page, 0);
	kfree(mblk);

	atomic_dec(&zmd->nr_mblks);
}

/*
 * Insert a metadata block in the rbtree.
 */
static void dmz_insert_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk)
{
	struct rb_root *root = &zmd->mblk_rbtree;
	struct rb_node **new = &(root->rb_node), *parent = NULL;
	struct dmz_mblock *b;

	/* Figure out where to put the new node */
	while (*new) {
		b = container_of(*new, struct dmz_mblock, node);
		parent = *new;
		new = (b->no < mblk->no) ? &((*new)->rb_left) : &((*new)->rb_right);
	}

	/* Add new node and rebalance tree */
	rb_link_node(&mblk->node, parent, new);
	rb_insert_color(&mblk->node, root);
}
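/*
 * Note: the comparison above deliberately sends larger block numbers to
 * the left, so the rbtree is ordered by descending block number. The
 * lookup below uses the mirrored comparison, so both sides agree.
 */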
/*
 * Lookup a metadata block in the rbtree. If the block is found, increment
 * its reference count.
 */
static struct dmz_mblock *dmz_get_mblock_fast(struct dmz_metadata *zmd,
					      sector_t mblk_no)
{
	struct rb_root *root = &zmd->mblk_rbtree;
	struct rb_node *node = root->rb_node;
	struct dmz_mblock *mblk;

	while (node) {
		mblk = container_of(node, struct dmz_mblock, node);
		if (mblk->no == mblk_no) {
			/*
			 * If this is the first reference to the block,
			 * remove it from the LRU list.
			 */
			mblk->ref++;
			if (mblk->ref == 1 &&
			    !test_bit(DMZ_META_DIRTY, &mblk->state))
				list_del_init(&mblk->link);
			return mblk;
		}
		node = (mblk->no < mblk_no) ? node->rb_left : node->rb_right;
	}

	return NULL;
}

/*
 * Metadata block BIO end callback.
 */
static void dmz_mblock_bio_end_io(struct bio *bio)
{
	struct dmz_mblock *mblk = bio->bi_private;
	int flag;

	if (bio->bi_status)
		set_bit(DMZ_META_ERROR, &mblk->state);

	if (bio_op(bio) == REQ_OP_WRITE)
		flag = DMZ_META_WRITING;
	else
		flag = DMZ_META_READING;

	clear_bit_unlock(flag, &mblk->state);
	smp_mb__after_atomic();
	wake_up_bit(&mblk->state, flag);

	bio_put(bio);
}

/*
 * Read an uncached metadata block from disk and add it to the cache.
 */
static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd,
					      sector_t mblk_no)
{
	struct dmz_mblock *mblk, *m;
	sector_t block = zmd->sb[zmd->mblk_primary].block + mblk_no;
	struct dmz_dev *dev = zmd->sb[zmd->mblk_primary].dev;
	struct bio *bio;

	if (dmz_bdev_is_dying(dev))
		return ERR_PTR(-EIO);

	/* Get a new block and a BIO to read it */
	mblk = dmz_alloc_mblock(zmd, mblk_no);
	if (!mblk)
		return ERR_PTR(-ENOMEM);

	bio = bio_alloc(GFP_NOIO, 1);
	if (!bio) {
		dmz_free_mblock(zmd, mblk);
		return ERR_PTR(-ENOMEM);
	}

	spin_lock(&zmd->mblk_lock);

	/*
	 * Make sure that another context did not start reading
	 * the block already.
	 */
	m = dmz_get_mblock_fast(zmd, mblk_no);
	if (m) {
		spin_unlock(&zmd->mblk_lock);
		dmz_free_mblock(zmd, mblk);
		bio_put(bio);
		return m;
	}

	mblk->ref++;
	set_bit(DMZ_META_READING, &mblk->state);
	dmz_insert_mblock(zmd, mblk);

	spin_unlock(&zmd->mblk_lock);

	/* Submit read BIO */
	bio->bi_iter.bi_sector = dmz_blk2sect(block);
	bio_set_dev(bio, dev->bdev);
	bio->bi_private = mblk;
	bio->bi_end_io = dmz_mblock_bio_end_io;
	bio_set_op_attrs(bio, REQ_OP_READ, REQ_META | REQ_PRIO);
	bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0);
	submit_bio(bio);

	return mblk;
}

/*
 * Free metadata blocks.
 */
static unsigned long dmz_shrink_mblock_cache(struct dmz_metadata *zmd,
					     unsigned long limit)
{
	struct dmz_mblock *mblk;
	unsigned long count = 0;

	if (!zmd->max_nr_mblks)
		return 0;

	while (!list_empty(&zmd->mblk_lru_list) &&
	       atomic_read(&zmd->nr_mblks) > zmd->min_nr_mblks &&
	       count < limit) {
		mblk = list_first_entry(&zmd->mblk_lru_list,
					struct dmz_mblock, link);
		list_del_init(&mblk->link);
		rb_erase(&mblk->node, &zmd->mblk_rbtree);
		dmz_free_mblock(zmd, mblk);
		count++;
	}

	return count;
}
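/*
 * Note: the cache is bounded on both sides: dmz_shrink_mblock_cache()
 * never frees below zmd->min_nr_mblks, while dmz_alloc_mblock() starts
 * recycling the LRU head once zmd->nr_mblks exceeds zmd->max_nr_mblks.
 */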
/*
 * For mblock shrinker: get the number of unused metadata blocks in the cache.
 */
static unsigned long dmz_mblock_shrinker_count(struct shrinker *shrink,
					       struct shrink_control *sc)
{
	struct dmz_metadata *zmd = container_of(shrink, struct dmz_metadata, mblk_shrinker);

	return atomic_read(&zmd->nr_mblks);
}

/*
 * For mblock shrinker: scan unused metadata blocks and shrink the cache.
 */
static unsigned long dmz_mblock_shrinker_scan(struct shrinker *shrink,
					      struct shrink_control *sc)
{
	struct dmz_metadata *zmd = container_of(shrink, struct dmz_metadata, mblk_shrinker);
	unsigned long count;

	spin_lock(&zmd->mblk_lock);
	count = dmz_shrink_mblock_cache(zmd, sc->nr_to_scan);
	spin_unlock(&zmd->mblk_lock);

	return count ? count : SHRINK_STOP;
}

/*
 * Release a metadata block.
 */
static void dmz_release_mblock(struct dmz_metadata *zmd,
			       struct dmz_mblock *mblk)
{
	if (!mblk)
		return;

	spin_lock(&zmd->mblk_lock);

	mblk->ref--;
	if (mblk->ref == 0) {
		if (test_bit(DMZ_META_ERROR, &mblk->state)) {
			rb_erase(&mblk->node, &zmd->mblk_rbtree);
			dmz_free_mblock(zmd, mblk);
		} else if (!test_bit(DMZ_META_DIRTY, &mblk->state)) {
			list_add_tail(&mblk->link, &zmd->mblk_lru_list);
			dmz_shrink_mblock_cache(zmd, 1);
		}
	}

	spin_unlock(&zmd->mblk_lock);
}

/*
 * Get a metadata block from the rbtree. If the block
 * is not present, read it from disk.
 */
static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd,
					 sector_t mblk_no)
{
	struct dmz_mblock *mblk;
	struct dmz_dev *dev = zmd->sb[zmd->mblk_primary].dev;

	/* Check rbtree */
	spin_lock(&zmd->mblk_lock);
	mblk = dmz_get_mblock_fast(zmd, mblk_no);
	spin_unlock(&zmd->mblk_lock);

	if (!mblk) {
		/* Cache miss: read the block from disk */
		mblk = dmz_get_mblock_slow(zmd, mblk_no);
		if (IS_ERR(mblk))
			return mblk;
	}

	/* Wait for on-going read I/O and check for error */
	wait_on_bit_io(&mblk->state, DMZ_META_READING,
		       TASK_UNINTERRUPTIBLE);
	if (test_bit(DMZ_META_ERROR, &mblk->state)) {
		dmz_release_mblock(zmd, mblk);
		dmz_check_bdev(dev);
		return ERR_PTR(-EIO);
	}

	return mblk;
}

/*
 * Mark a metadata block dirty.
 */
static void dmz_dirty_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk)
{
	spin_lock(&zmd->mblk_lock);
	if (!test_and_set_bit(DMZ_META_DIRTY, &mblk->state))
		list_add_tail(&mblk->link, &zmd->mblk_dirty_list);
	spin_unlock(&zmd->mblk_lock);
}

/*
 * Issue a metadata block write BIO.
 */
static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
			    unsigned int set)
{
	struct dmz_dev *dev = zmd->sb[set].dev;
	sector_t block = zmd->sb[set].block + mblk->no;
	struct bio *bio;

	if (dmz_bdev_is_dying(dev))
		return -EIO;

	bio = bio_alloc(GFP_NOIO, 1);
	if (!bio) {
		set_bit(DMZ_META_ERROR, &mblk->state);
		return -ENOMEM;
	}

	set_bit(DMZ_META_WRITING, &mblk->state);

	bio->bi_iter.bi_sector = dmz_blk2sect(block);
	bio_set_dev(bio, dev->bdev);
	bio->bi_private = mblk;
	bio->bi_end_io = dmz_mblock_bio_end_io;
	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META | REQ_PRIO);
	bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0);
	submit_bio(bio);

	return 0;
}
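/*
 * Note: dmz_write_mblock() is asynchronous: it only submits the BIO and
 * sets DMZ_META_WRITING, which dmz_mblock_bio_end_io() clears on
 * completion. Callers such as dmz_write_dirty_mblocks() wait on that bit.
 * dmz_rdwr_block() below is the synchronous counterpart built on
 * submit_bio_wait().
 */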
/*
 * Read/write a metadata block.
 */
static int dmz_rdwr_block(struct dmz_dev *dev, int op,
			  sector_t block, struct page *page)
{
	struct bio *bio;
	int ret;

	if (WARN_ON(!dev))
		return -EIO;

	if (dmz_bdev_is_dying(dev))
		return -EIO;

	bio = bio_alloc(GFP_NOIO, 1);
	if (!bio)
		return -ENOMEM;

	bio->bi_iter.bi_sector = dmz_blk2sect(block);
	bio_set_dev(bio, dev->bdev);
	bio_set_op_attrs(bio, op, REQ_SYNC | REQ_META | REQ_PRIO);
	bio_add_page(bio, page, DMZ_BLOCK_SIZE, 0);
	ret = submit_bio_wait(bio);
	bio_put(bio);

	if (ret)
		dmz_check_bdev(dev);
	return ret;
}

/*
 * Write super block of the specified metadata set.
 */
static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set)
{
	struct dmz_mblock *mblk = zmd->sb[set].mblk;
	struct dmz_super *sb = zmd->sb[set].sb;
	struct dmz_dev *dev = zmd->sb[set].dev;
	sector_t sb_block;
	u64 sb_gen = zmd->sb_gen + 1;
	int ret;

	sb->magic = cpu_to_le32(DMZ_MAGIC);

	sb->version = cpu_to_le32(zmd->sb_version);
	if (zmd->sb_version > 1) {
		BUILD_BUG_ON(UUID_SIZE != 16);
		export_uuid(sb->dmz_uuid, &zmd->uuid);
		memcpy(sb->dmz_label, zmd->label, BDEVNAME_SIZE);
		export_uuid(sb->dev_uuid, &dev->uuid);
	}

	sb->gen = cpu_to_le64(sb_gen);

	/*
	 * The metadata always references the absolute block address,
	 * i.e. relative to the entire block range, not the per-device
	 * block address.
	 */
	sb_block = zmd->sb[set].zone->id << zmd->zone_nr_blocks_shift;
	sb->sb_block = cpu_to_le64(sb_block);
	sb->nr_meta_blocks = cpu_to_le32(zmd->nr_meta_blocks);
	sb->nr_reserved_seq = cpu_to_le32(zmd->nr_reserved_seq);
	sb->nr_chunks = cpu_to_le32(zmd->nr_chunks);

	sb->nr_map_blocks = cpu_to_le32(zmd->nr_map_blocks);
	sb->nr_bitmap_blocks = cpu_to_le32(zmd->nr_bitmap_blocks);

	sb->crc = 0;
	sb->crc = cpu_to_le32(crc32_le(sb_gen, (unsigned char *)sb, DMZ_BLOCK_SIZE));

	ret = dmz_rdwr_block(dev, REQ_OP_WRITE, zmd->sb[set].block,
			     mblk->page);
	if (ret == 0)
		ret = blkdev_issue_flush(dev->bdev, GFP_NOIO);

	return ret;
}

/*
 * Write dirty metadata blocks to the specified set.
 */
static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd,
				   struct list_head *write_list,
				   unsigned int set)
{
	struct dmz_mblock *mblk;
	struct dmz_dev *dev = zmd->sb[set].dev;
	struct blk_plug plug;
	int ret = 0, nr_mblks_submitted = 0;

	/* Issue writes */
	blk_start_plug(&plug);
	list_for_each_entry(mblk, write_list, link) {
		ret = dmz_write_mblock(zmd, mblk, set);
		if (ret)
			break;
		nr_mblks_submitted++;
	}
	blk_finish_plug(&plug);

	/* Wait for completion */
	list_for_each_entry(mblk, write_list, link) {
		if (!nr_mblks_submitted)
			break;
		wait_on_bit_io(&mblk->state, DMZ_META_WRITING,
			       TASK_UNINTERRUPTIBLE);
		if (test_bit(DMZ_META_ERROR, &mblk->state)) {
			clear_bit(DMZ_META_ERROR, &mblk->state);
			dmz_check_bdev(dev);
			ret = -EIO;
		}
		nr_mblks_submitted--;
	}

	/* Flush drive cache (this will also sync data) */
	if (ret == 0)
		ret = blkdev_issue_flush(dev->bdev, GFP_NOIO);

	return ret;
}
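/*
 * Crash consistency: dirty blocks are always committed to the inactive
 * ("log") set first and validated by writing that set's super block with
 * the incremented generation; only then is the primary set updated in
 * place. After a crash, dmz_load_sb() simply uses whichever set carries
 * the highest valid generation.
 */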
/*
 * Log dirty metadata blocks.
 */
static int dmz_log_dirty_mblocks(struct dmz_metadata *zmd,
				 struct list_head *write_list)
{
	unsigned int log_set = zmd->mblk_primary ^ 0x1;
	int ret;

	/* Write dirty blocks to the log */
	ret = dmz_write_dirty_mblocks(zmd, write_list, log_set);
	if (ret)
		return ret;

	/*
	 * No error so far: now validate the log by updating the
	 * log index super block generation.
	 */
	ret = dmz_write_sb(zmd, log_set);
	if (ret)
		return ret;

	return 0;
}

/*
 * Flush dirty metadata blocks.
 */
int dmz_flush_metadata(struct dmz_metadata *zmd)
{
	struct dmz_mblock *mblk;
	struct list_head write_list;
	struct dmz_dev *dev;
	int ret;

	if (WARN_ON(!zmd))
		return 0;

	INIT_LIST_HEAD(&write_list);

	/*
	 * Make sure that metadata blocks are stable before logging: take
	 * the write lock on the metadata semaphore to prevent target BIOs
	 * from modifying metadata.
	 */
	down_write(&zmd->mblk_sem);
	dev = zmd->sb[zmd->mblk_primary].dev;

	/*
	 * This is called from the target flush work and reclaim work.
	 * Concurrent execution is not allowed.
	 */
	dmz_lock_flush(zmd);

	if (dmz_bdev_is_dying(dev)) {
		ret = -EIO;
		goto out;
	}

	/* Get dirty blocks */
	spin_lock(&zmd->mblk_lock);
	list_splice_init(&zmd->mblk_dirty_list, &write_list);
	spin_unlock(&zmd->mblk_lock);

	/* If there are no dirty metadata blocks, just flush the device cache */
	if (list_empty(&write_list)) {
		ret = blkdev_issue_flush(dev->bdev, GFP_NOIO);
		goto err;
	}

	/*
	 * The primary metadata set is still clean. Keep it this way until
	 * all updates are successful in the secondary set. That is, use
	 * the secondary set as a log.
	 */
	ret = dmz_log_dirty_mblocks(zmd, &write_list);
	if (ret)
		goto err;

	/*
	 * The log is on disk. It is now safe to update in place
	 * in the primary metadata set.
	 */
	ret = dmz_write_dirty_mblocks(zmd, &write_list, zmd->mblk_primary);
	if (ret)
		goto err;

	ret = dmz_write_sb(zmd, zmd->mblk_primary);
	if (ret)
		goto err;

	while (!list_empty(&write_list)) {
		mblk = list_first_entry(&write_list, struct dmz_mblock, link);
		list_del_init(&mblk->link);

		spin_lock(&zmd->mblk_lock);
		clear_bit(DMZ_META_DIRTY, &mblk->state);
		if (mblk->ref == 0)
			list_add_tail(&mblk->link, &zmd->mblk_lru_list);
		spin_unlock(&zmd->mblk_lock);
	}

	zmd->sb_gen++;
out:
	dmz_unlock_flush(zmd);
	up_write(&zmd->mblk_sem);

	return ret;

err:
	if (!list_empty(&write_list)) {
		spin_lock(&zmd->mblk_lock);
		list_splice(&write_list, &zmd->mblk_dirty_list);
		spin_unlock(&zmd->mblk_lock);
	}
	if (!dmz_check_bdev(dev))
		ret = -EIO;
	goto out;
}
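/*
 * Note: the super block CRC is computed over the whole 4 KB block with
 * the crc field zeroed and the generation number as the crc32 seed (see
 * dmz_write_sb()); dmz_check_sb() below recomputes it the same way.
 */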
/*
 * Check super block.
 */
static int dmz_check_sb(struct dmz_metadata *zmd, struct dmz_sb *dsb,
			bool tertiary)
{
	struct dmz_super *sb = dsb->sb;
	struct dmz_dev *dev = dsb->dev;
	unsigned int nr_meta_zones, nr_data_zones;
	u32 crc, stored_crc;
	u64 gen, sb_block;

	if (le32_to_cpu(sb->magic) != DMZ_MAGIC) {
		dmz_dev_err(dev, "Invalid meta magic (needed 0x%08x, got 0x%08x)",
			    DMZ_MAGIC, le32_to_cpu(sb->magic));
		return -ENXIO;
	}

	zmd->sb_version = le32_to_cpu(sb->version);
	if (zmd->sb_version > DMZ_META_VER) {
		dmz_dev_err(dev, "Invalid meta version (needed %d, got %d)",
			    DMZ_META_VER, zmd->sb_version);
		return -EINVAL;
	}
	if (zmd->sb_version < 2 && tertiary) {
		dmz_dev_err(dev, "Tertiary superblocks are not supported");
		return -EINVAL;
	}

	gen = le64_to_cpu(sb->gen);
	stored_crc = le32_to_cpu(sb->crc);
	sb->crc = 0;
	crc = crc32_le(gen, (unsigned char *)sb, DMZ_BLOCK_SIZE);
	if (crc != stored_crc) {
		dmz_dev_err(dev, "Invalid checksum (needed 0x%08x, got 0x%08x)",
			    crc, stored_crc);
		return -ENXIO;
	}

	sb_block = le64_to_cpu(sb->sb_block);
	if (sb_block != (u64)dsb->zone->id << zmd->zone_nr_blocks_shift) {
		dmz_dev_err(dev, "Invalid superblock position "
			    "(is %llu expected %llu)",
			    sb_block,
			    (u64)dsb->zone->id << zmd->zone_nr_blocks_shift);
		return -EINVAL;
	}
	if (zmd->sb_version > 1) {
		uuid_t sb_uuid;

		import_uuid(&sb_uuid, sb->dmz_uuid);
		if (uuid_is_null(&sb_uuid)) {
			dmz_dev_err(dev, "NULL DM-Zoned uuid");
			return -ENXIO;
		} else if (uuid_is_null(&zmd->uuid)) {
			uuid_copy(&zmd->uuid, &sb_uuid);
		} else if (!uuid_equal(&zmd->uuid, &sb_uuid)) {
			dmz_dev_err(dev, "mismatching DM-Zoned uuid, "
				    "is %pUl expected %pUl",
				    &sb_uuid, &zmd->uuid);
			return -ENXIO;
		}
		if (!strlen(zmd->label))
			memcpy(zmd->label, sb->dmz_label, BDEVNAME_SIZE);
		else if (memcmp(zmd->label, sb->dmz_label, BDEVNAME_SIZE)) {
			dmz_dev_err(dev, "mismatching DM-Zoned label, "
				    "is %s expected %s",
				    sb->dmz_label, zmd->label);
			return -ENXIO;
		}
		import_uuid(&dev->uuid, sb->dev_uuid);
		if (uuid_is_null(&dev->uuid)) {
			dmz_dev_err(dev, "NULL device uuid");
			return -ENXIO;
		}

		if (tertiary) {
			/*
			 * Generation number should be 0, but it doesn't
			 * really matter if it isn't.
			 */
			if (gen != 0)
				dmz_dev_warn(dev, "Invalid generation %llu",
					     gen);
			return 0;
		}
	}

	nr_meta_zones = (le32_to_cpu(sb->nr_meta_blocks) + zmd->zone_nr_blocks - 1)
		>> zmd->zone_nr_blocks_shift;
	if (!nr_meta_zones ||
	    nr_meta_zones >= zmd->nr_rnd_zones) {
		dmz_dev_err(dev, "Invalid number of metadata blocks");
		return -ENXIO;
	}

	if (!le32_to_cpu(sb->nr_reserved_seq) ||
	    le32_to_cpu(sb->nr_reserved_seq) >= (zmd->nr_useable_zones - nr_meta_zones)) {
		dmz_dev_err(dev, "Invalid number of reserved sequential zones");
		return -ENXIO;
	}

	nr_data_zones = zmd->nr_useable_zones -
		(nr_meta_zones * 2 + le32_to_cpu(sb->nr_reserved_seq));
	if (le32_to_cpu(sb->nr_chunks) > nr_data_zones) {
		dmz_dev_err(dev, "Invalid number of chunks %u / %u",
			    le32_to_cpu(sb->nr_chunks), nr_data_zones);
		return -ENXIO;
	}

	/* OK */
	zmd->nr_meta_blocks = le32_to_cpu(sb->nr_meta_blocks);
	zmd->nr_reserved_seq = le32_to_cpu(sb->nr_reserved_seq);
	zmd->nr_chunks = le32_to_cpu(sb->nr_chunks);
	zmd->nr_map_blocks = le32_to_cpu(sb->nr_map_blocks);
	zmd->nr_bitmap_blocks = le32_to_cpu(sb->nr_bitmap_blocks);
	zmd->nr_meta_zones = nr_meta_zones;
	zmd->nr_data_zones = nr_data_zones;

	return 0;
}

/*
 * Read the first or second super block from disk.
 */
static int dmz_read_sb(struct dmz_metadata *zmd, struct dmz_sb *sb, int set)
{
	dmz_zmd_debug(zmd, "read superblock set %d dev %s block %llu",
		      set, sb->dev->name, sb->block);

	return dmz_rdwr_block(sb->dev, REQ_OP_READ,
			      sb->block, sb->mblk->page);
}

/*
 * Determine the position of the secondary super blocks on disk.
 * This is used only if a corruption of the primary super block
 * is detected.
 */
static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd)
{
	unsigned int zone_nr_blocks = zmd->zone_nr_blocks;
	struct dmz_mblock *mblk;
	unsigned int zone_id = zmd->sb[0].zone->id;
	int i;

	/* Allocate a block */
	mblk = dmz_alloc_mblock(zmd, 0);
	if (!mblk)
		return -ENOMEM;

	zmd->sb[1].mblk = mblk;
	zmd->sb[1].sb = mblk->data;

	/* Bad first super block: search for the second one */
	zmd->sb[1].block = zmd->sb[0].block + zone_nr_blocks;
	zmd->sb[1].zone = dmz_get(zmd, zone_id + 1);
	zmd->sb[1].dev = zmd->sb[0].dev;
	for (i = 1; i < zmd->nr_rnd_zones; i++) {
		if (dmz_read_sb(zmd, &zmd->sb[1], 1) != 0)
			break;
		if (le32_to_cpu(zmd->sb[1].sb->magic) == DMZ_MAGIC)
			return 0;
		zmd->sb[1].block += zone_nr_blocks;
		zmd->sb[1].zone = dmz_get(zmd, zone_id + i);
	}

	dmz_free_mblock(zmd, mblk);
	zmd->sb[1].mblk = NULL;
	zmd->sb[1].zone = NULL;
	zmd->sb[1].dev = NULL;

	return -EIO;
}

/*
 * Read a super block from disk.
 */
static int dmz_get_sb(struct dmz_metadata *zmd, struct dmz_sb *sb, int set)
{
	struct dmz_mblock *mblk;
	int ret;

	/* Allocate a block */
	mblk = dmz_alloc_mblock(zmd, 0);
	if (!mblk)
		return -ENOMEM;

	sb->mblk = mblk;
	sb->sb = mblk->data;

	/* Read super block */
	ret = dmz_read_sb(zmd, sb, set);
	if (ret) {
		dmz_free_mblock(zmd, mblk);
		sb->mblk = NULL;
		return ret;
	}

	return 0;
}
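/*
 * Note on recovery ordering: dmz_recover_mblocks() below copies metadata
 * blocks 1..nr_meta_blocks-1 from the valid set first and rewrites the
 * super block (block 0) last, so an interrupted recovery leaves the stale
 * generation in place and the set is simply recovered again on the next
 * load.
 */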
/*
 * Recover a metadata set.
 */
static int dmz_recover_mblocks(struct dmz_metadata *zmd, unsigned int dst_set)
{
	unsigned int src_set = dst_set ^ 0x1;
	struct page *page;
	int i, ret;

	dmz_dev_warn(zmd->sb[dst_set].dev,
		     "Metadata set %u invalid: recovering", dst_set);

	if (dst_set == 0)
		zmd->sb[0].block = dmz_start_block(zmd, zmd->sb[0].zone);
	else
		zmd->sb[1].block = dmz_start_block(zmd, zmd->sb[1].zone);

	page = alloc_page(GFP_NOIO);
	if (!page)
		return -ENOMEM;

	/* Copy metadata blocks */
	for (i = 1; i < zmd->nr_meta_blocks; i++) {
		ret = dmz_rdwr_block(zmd->sb[src_set].dev, REQ_OP_READ,
				     zmd->sb[src_set].block + i, page);
		if (ret)
			goto out;
		ret = dmz_rdwr_block(zmd->sb[dst_set].dev, REQ_OP_WRITE,
				     zmd->sb[dst_set].block + i, page);
		if (ret)
			goto out;
	}

	/* Finalize with the super block */
	if (!zmd->sb[dst_set].mblk) {
		zmd->sb[dst_set].mblk = dmz_alloc_mblock(zmd, 0);
		if (!zmd->sb[dst_set].mblk) {
			ret = -ENOMEM;
			goto out;
		}
		zmd->sb[dst_set].sb = zmd->sb[dst_set].mblk->data;
	}

	ret = dmz_write_sb(zmd, dst_set);
out:
	__free_pages(page, 0);

	return ret;
}

/*
 * Get super block from disk.
 */
static int dmz_load_sb(struct dmz_metadata *zmd)
{
	bool sb_good[2] = {false, false};
	u64 sb_gen[2] = {0, 0};
	int ret;

	if (!zmd->sb[0].zone) {
		dmz_zmd_err(zmd, "Primary super block zone not set");
		return -ENXIO;
	}

	/* Read and check the primary super block */
	zmd->sb[0].block = dmz_start_block(zmd, zmd->sb[0].zone);
	zmd->sb[0].dev = zmd->sb[0].zone->dev;
	ret = dmz_get_sb(zmd, &zmd->sb[0], 0);
	if (ret) {
		dmz_dev_err(zmd->sb[0].dev, "Read primary super block failed");
		return ret;
	}

	ret = dmz_check_sb(zmd, &zmd->sb[0], false);

	/* Read and check secondary super block */
	if (ret == 0) {
		sb_good[0] = true;
		if (!zmd->sb[1].zone) {
			unsigned int zone_id =
				zmd->sb[0].zone->id + zmd->nr_meta_zones;

			zmd->sb[1].zone = dmz_get(zmd, zone_id);
		}
		zmd->sb[1].block = dmz_start_block(zmd, zmd->sb[1].zone);
		zmd->sb[1].dev = zmd->sb[0].dev;
		ret = dmz_get_sb(zmd, &zmd->sb[1], 1);
	} else
		ret = dmz_lookup_secondary_sb(zmd);

	if (ret) {
		dmz_dev_err(zmd->sb[1].dev, "Read secondary super block failed");
		return ret;
	}

	ret = dmz_check_sb(zmd, &zmd->sb[1], false);
	if (ret == 0)
		sb_good[1] = true;

	/* Use highest generation sb first */
	if (!sb_good[0] && !sb_good[1]) {
		dmz_zmd_err(zmd, "No valid super block found");
		return -EIO;
	}

	if (sb_good[0])
		sb_gen[0] = le64_to_cpu(zmd->sb[0].sb->gen);
	else {
		ret = dmz_recover_mblocks(zmd, 0);
		if (ret) {
			dmz_dev_err(zmd->sb[0].dev,
				    "Recovery of superblock 0 failed");
			return -EIO;
		}
	}

	if (sb_good[1])
		sb_gen[1] = le64_to_cpu(zmd->sb[1].sb->gen);
	else {
		ret = dmz_recover_mblocks(zmd, 1);
		if (ret) {
			dmz_dev_err(zmd->sb[1].dev,
				    "Recovery of superblock 1 failed");
			return -EIO;
		}
	}

	if (sb_gen[0] >= sb_gen[1]) {
		zmd->sb_gen = sb_gen[0];
		zmd->mblk_primary = 0;
	} else {
		zmd->sb_gen = sb_gen[1];
		zmd->mblk_primary = 1;
	}

	dmz_dev_debug(zmd->sb[zmd->mblk_primary].dev,
		      "Using super block %u (gen %llu)",
		      zmd->mblk_primary, zmd->sb_gen);

	if (zmd->sb_version > 1) {
		int i;
		struct dmz_sb *sb;

		sb = kzalloc(sizeof(struct dmz_sb), GFP_KERNEL);
		if (!sb)
			return -ENOMEM;
		for (i = 1; i < zmd->nr_devs; i++) {
			sb->block = 0;
			sb->zone = dmz_get(zmd, zmd->dev[i].zone_offset);
			sb->dev = &zmd->dev[i];
			if (!dmz_is_meta(sb->zone)) {
				dmz_dev_err(sb->dev,
					    "Tertiary super block zone %u not marked as metadata zone",
					    sb->zone->id);
				ret = -EINVAL;
				goto out_kfree;
			}
			ret = dmz_get_sb(zmd, sb, i + 1);
			if (ret) {
				dmz_dev_err(sb->dev,
					    "Read tertiary super block failed");
				dmz_free_mblock(zmd, sb->mblk);
				goto out_kfree;
			}
			ret = dmz_check_sb(zmd, sb, true);
			dmz_free_mblock(zmd, sb->mblk);
			if (ret == -EINVAL)
				goto out_kfree;
		}
out_kfree:
		kfree(sb);
	}
	return ret;
}

/*
 * Initialize a zone descriptor.
 */
static int dmz_init_zone(struct blk_zone *blkz, unsigned int num, void *data)
{
	struct dmz_dev *dev = data;
	struct dmz_metadata *zmd = dev->metadata;
	int idx = num + dev->zone_offset;
	struct dm_zone *zone;

	zone = dmz_insert(zmd, idx, dev);
	if (IS_ERR(zone))
		return PTR_ERR(zone);

	if (blkz->len != zmd->zone_nr_sectors) {
		if (zmd->sb_version > 1) {
			/* Ignore the eventual runt (smaller) zone */
			set_bit(DMZ_OFFLINE, &zone->flags);
			return 0;
		} else if (blkz->start + blkz->len == dev->capacity)
			return 0;
		return -ENXIO;
	}

	switch (blkz->type) {
	case BLK_ZONE_TYPE_CONVENTIONAL:
		set_bit(DMZ_RND, &zone->flags);
		break;
	case BLK_ZONE_TYPE_SEQWRITE_REQ:
	case BLK_ZONE_TYPE_SEQWRITE_PREF:
		set_bit(DMZ_SEQ, &zone->flags);
		break;
	default:
		return -ENXIO;
	}

	if (dmz_is_rnd(zone))
		zone->wp_block = 0;
	else
		zone->wp_block = dmz_sect2blk(blkz->wp - blkz->start);

	if (blkz->cond == BLK_ZONE_COND_OFFLINE)
		set_bit(DMZ_OFFLINE, &zone->flags);
	else if (blkz->cond == BLK_ZONE_COND_READONLY)
		set_bit(DMZ_READ_ONLY, &zone->flags);
	else {
		zmd->nr_useable_zones++;
		if (dmz_is_rnd(zone)) {
			zmd->nr_rnd_zones++;
			if (zmd->nr_devs == 1 && !zmd->sb[0].zone) {
				/* Primary super block zone */
				zmd->sb[0].zone = zone;
			}
		}
		if (zmd->nr_devs > 1 && num == 0) {
			/*
			 * Tertiary superblock zones are always at the
			 * start of the zoned devices, so mark them
			 * as metadata zone.
			 */
			set_bit(DMZ_META, &zone->flags);
		}
	}
	return 0;
}

static int dmz_emulate_zones(struct dmz_metadata *zmd, struct dmz_dev *dev)
{
	int idx;
	sector_t zone_offset = 0;

	for (idx = 0; idx < dev->nr_zones; idx++) {
		struct dm_zone *zone;

		zone = dmz_insert(zmd, idx, dev);
		if (IS_ERR(zone))
			return PTR_ERR(zone);
		set_bit(DMZ_CACHE, &zone->flags);
		zone->wp_block = 0;
		zmd->nr_cache_zones++;
		zmd->nr_useable_zones++;
		if (dev->capacity - zone_offset < zmd->zone_nr_sectors) {
			/* Disable runt zone */
			set_bit(DMZ_OFFLINE, &zone->flags);
			break;
		}
		zone_offset += zmd->zone_nr_sectors;
	}
	return 0;
}
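/*
 * Note: dmz_emulate_zones() runs only for the regular (non-zoned) first
 * device of a multi-device target. Every emulated zone is a cache zone
 * with its write pointer initialized to 0, and a trailing runt smaller
 * than zone_nr_sectors is simply disabled by marking it offline.
 */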
/*
 * Free zones descriptors.
 */
static void dmz_drop_zones(struct dmz_metadata *zmd)
{
	int idx;

	for (idx = 0; idx < zmd->nr_zones; idx++) {
		struct dm_zone *zone = xa_load(&zmd->zones, idx);

		kfree(zone);
		xa_erase(&zmd->zones, idx);
	}
	xa_destroy(&zmd->zones);
}

/*
 * Allocate and initialize zone descriptors using the zone
 * information from disk.
 */
static int dmz_init_zones(struct dmz_metadata *zmd)
{
	int i, ret;
	struct dmz_dev *zoned_dev = &zmd->dev[0];

	/* Init */
	zmd->zone_nr_sectors = zmd->dev[0].zone_nr_sectors;
	zmd->zone_nr_sectors_shift = ilog2(zmd->zone_nr_sectors);
	zmd->zone_nr_blocks = dmz_sect2blk(zmd->zone_nr_sectors);
	zmd->zone_nr_blocks_shift = ilog2(zmd->zone_nr_blocks);
	zmd->zone_bitmap_size = zmd->zone_nr_blocks >> 3;
	zmd->zone_nr_bitmap_blocks =
		max_t(sector_t, 1, zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT);
	zmd->zone_bits_per_mblk = min_t(sector_t, zmd->zone_nr_blocks,
					DMZ_BLOCK_SIZE_BITS);
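	/*
	 * Worked example (illustrative geometry, not read from any device):
	 * with 256 MiB zones and 4 KiB blocks, zone_nr_blocks = 65536, so
	 * each zone needs a 65536 / 8 = 8 KiB validity bitmap, i.e.
	 * zone_nr_bitmap_blocks = 2, with each bitmap block tracking
	 * DMZ_BLOCK_SIZE_BITS = 32768 blocks.
	 */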

	/* Allocate zone array */
	zmd->nr_zones = 0;
	for (i = 0; i < zmd->nr_devs; i++) {
		struct dmz_dev *dev = &zmd->dev[i];

		dev->metadata = zmd;
		zmd->nr_zones += dev->nr_zones;

		atomic_set(&dev->unmap_nr_rnd, 0);
		INIT_LIST_HEAD(&dev->unmap_rnd_list);
		INIT_LIST_HEAD(&dev->map_rnd_list);

		atomic_set(&dev->unmap_nr_seq, 0);
		INIT_LIST_HEAD(&dev->unmap_seq_list);
		INIT_LIST_HEAD(&dev->map_seq_list);
	}

	if (!zmd->nr_zones) {
		DMERR("(%s): No zones found", zmd->devname);
		return -ENXIO;
	}
	xa_init(&zmd->zones);

	DMDEBUG("(%s): Using %zu B for zone information",
		zmd->devname, sizeof(struct dm_zone) * zmd->nr_zones);

	if (zmd->nr_devs > 1) {
		ret = dmz_emulate_zones(zmd, &zmd->dev[0]);
		if (ret < 0) {
			DMDEBUG("(%s): Failed to emulate zones, error %d",
				zmd->devname, ret);
			dmz_drop_zones(zmd);
			return ret;
		}

		/*
		 * Primary superblock zone is always at zone 0 when multiple
		 * drives are present.
		 */
		zmd->sb[0].zone = dmz_get(zmd, 0);

		for (i = 1; i < zmd->nr_devs; i++) {
			zoned_dev = &zmd->dev[i];

			ret = blkdev_report_zones(zoned_dev->bdev, 0,
						  BLK_ALL_ZONES,
						  dmz_init_zone, zoned_dev);
			if (ret < 0) {
				DMDEBUG("(%s): Failed to report zones, error %d",
					zmd->devname, ret);
				dmz_drop_zones(zmd);
				return ret;
			}
		}
		return 0;
	}

	/*
	 * Get zone information and initialize zone descriptors. At the same
	 * time, determine where the super block should be: first block of the
	 * first randomly writable zone.
	 */
	ret = blkdev_report_zones(zoned_dev->bdev, 0, BLK_ALL_ZONES,
				  dmz_init_zone, zoned_dev);
	if (ret < 0) {
		DMDEBUG("(%s): Failed to report zones, error %d",
			zmd->devname, ret);
		dmz_drop_zones(zmd);
		return ret;
	}

	return 0;
}

static int dmz_update_zone_cb(struct blk_zone *blkz, unsigned int idx,
			      void *data)
{
	struct dm_zone *zone = data;

	clear_bit(DMZ_OFFLINE, &zone->flags);
	clear_bit(DMZ_READ_ONLY, &zone->flags);
	if (blkz->cond == BLK_ZONE_COND_OFFLINE)
		set_bit(DMZ_OFFLINE, &zone->flags);
	else if (blkz->cond == BLK_ZONE_COND_READONLY)
		set_bit(DMZ_READ_ONLY, &zone->flags);

	if (dmz_is_seq(zone))
		zone->wp_block = dmz_sect2blk(blkz->wp - blkz->start);
	else
		zone->wp_block = 0;
	return 0;
}

/*
 * Update a zone information.
 */
static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
{
	struct dmz_dev *dev = zone->dev;
	unsigned int noio_flag;
	int ret;

	if (dev->flags & DMZ_BDEV_REGULAR)
		return 0;

	/*
	 * Get zone information from disk. Since blkdev_report_zones() uses
	 * GFP_KERNEL by default for memory allocations, set the per-task
	 * PF_MEMALLOC_NOIO flag so that all allocations are done as if
	 * GFP_NOIO was specified.
	 */
	noio_flag = memalloc_noio_save();
	ret = blkdev_report_zones(dev->bdev, dmz_start_sect(zmd, zone), 1,
				  dmz_update_zone_cb, zone);
	memalloc_noio_restore(noio_flag);

	if (ret == 0)
		ret = -EIO;
	if (ret < 0) {
		dmz_dev_err(dev, "Get zone %u report failed",
			    zone->id);
		dmz_check_bdev(dev);
		return ret;
	}

	return 0;
}

/*
 * Check a zone write pointer position when the zone is marked
 * with the sequential write error flag.
 */
static int dmz_handle_seq_write_err(struct dmz_metadata *zmd,
				    struct dm_zone *zone)
{
	struct dmz_dev *dev = zone->dev;
	unsigned int wp = 0;
	int ret;

	wp = zone->wp_block;
	ret = dmz_update_zone(zmd, zone);
	if (ret)
		return ret;

	dmz_dev_warn(dev, "Processing zone %u write error (zone wp %u/%u)",
		     zone->id, zone->wp_block, wp);

	if (zone->wp_block < wp) {
		dmz_invalidate_blocks(zmd, zone, zone->wp_block,
				      wp - zone->wp_block);
	}

	return 0;
}

/*
 * Reset a zone write pointer.
 */
static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
{
	int ret;

	/*
	 * Ignore offline zones, read only zones,
	 * and conventional zones.
	 */
	if (dmz_is_offline(zone) ||
	    dmz_is_readonly(zone) ||
	    dmz_is_rnd(zone))
		return 0;

	if (!dmz_is_empty(zone) || dmz_seq_write_err(zone)) {
		struct dmz_dev *dev = zone->dev;

		ret = blkdev_zone_mgmt(dev->bdev, REQ_OP_ZONE_RESET,
				       dmz_start_sect(zmd, zone),
				       zmd->zone_nr_sectors, GFP_NOIO);
		if (ret) {
			dmz_dev_err(dev, "Reset zone %u failed %d",
				    zone->id, ret);
			return ret;
		}
	}

	/* Clear write error bit and rewind write pointer position */
	clear_bit(DMZ_SEQ_WRITE_ERR, &zone->flags);
	zone->wp_block = 0;

	return 0;
}

static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone);
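/*
 * Example sizing (illustrative numbers): with 4 KiB metadata blocks each
 * mapping block holds DMZ_MAP_ENTRIES = 512 entries, so a target with
 * 40000 chunks needs DIV_ROUND_UP(40000, 512) = 79 mapping blocks, read
 * below from metadata blocks 1..79 (block 0 being the super block).
 */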
/*
 * Initialize chunk mapping.
 */
static int dmz_load_mapping(struct dmz_metadata *zmd)
{
	struct dm_zone *dzone, *bzone;
	struct dmz_mblock *dmap_mblk = NULL;
	struct dmz_map *dmap;
	unsigned int i = 0, e = 0, chunk = 0;
	unsigned int dzone_id;
	unsigned int bzone_id;

	/* Metadata block array for the chunk mapping table */
	zmd->map_mblk = kcalloc(zmd->nr_map_blocks,
				sizeof(struct dmz_mblock *), GFP_KERNEL);
	if (!zmd->map_mblk)
		return -ENOMEM;

	/* Get chunk mapping table blocks and initialize zone mapping */
	while (chunk < zmd->nr_chunks) {
		if (!dmap_mblk) {
			/* Get mapping block */
			dmap_mblk = dmz_get_mblock(zmd, i + 1);
			if (IS_ERR(dmap_mblk))
				return PTR_ERR(dmap_mblk);
			zmd->map_mblk[i] = dmap_mblk;
			dmap = (struct dmz_map *) dmap_mblk->data;
			i++;
			e = 0;
		}

		/* Check data zone */
		dzone_id = le32_to_cpu(dmap[e].dzone_id);
		if (dzone_id == DMZ_MAP_UNMAPPED)
			goto next;

		if (dzone_id >= zmd->nr_zones) {
			dmz_zmd_err(zmd, "Chunk %u mapping: invalid data zone ID %u",
				    chunk, dzone_id);
			return -EIO;
		}

		dzone = dmz_get(zmd, dzone_id);
		if (!dzone) {
			dmz_zmd_err(zmd, "Chunk %u mapping: data zone %u not present",
				    chunk, dzone_id);
			return -EIO;
		}
		set_bit(DMZ_DATA, &dzone->flags);
		dzone->chunk = chunk;
		dmz_get_zone_weight(zmd, dzone);

		if (dmz_is_cache(dzone))
			list_add_tail(&dzone->link, &zmd->map_cache_list);
		else if (dmz_is_rnd(dzone))
			list_add_tail(&dzone->link, &dzone->dev->map_rnd_list);
		else
			list_add_tail(&dzone->link, &dzone->dev->map_seq_list);

		/* Check buffer zone */
		bzone_id = le32_to_cpu(dmap[e].bzone_id);
		if (bzone_id == DMZ_MAP_UNMAPPED)
			goto next;

		if (bzone_id >= zmd->nr_zones) {
			dmz_zmd_err(zmd, "Chunk %u mapping: invalid buffer zone ID %u",
				    chunk, bzone_id);
			return -EIO;
		}

		bzone = dmz_get(zmd, bzone_id);
		if (!bzone) {
			dmz_zmd_err(zmd, "Chunk %u mapping: buffer zone %u not present",
				    chunk, bzone_id);
			return -EIO;
		}
		if (!dmz_is_rnd(bzone) && !dmz_is_cache(bzone)) {
			dmz_zmd_err(zmd, "Chunk %u mapping: invalid buffer zone %u",
				    chunk, bzone_id);
			return -EIO;
		}

		set_bit(DMZ_DATA, &bzone->flags);
		set_bit(DMZ_BUF, &bzone->flags);
		bzone->chunk = chunk;
		bzone->bzone = dzone;
		dzone->bzone = bzone;
		dmz_get_zone_weight(zmd, bzone);
		if (dmz_is_cache(bzone))
			list_add_tail(&bzone->link, &zmd->map_cache_list);
		else
			list_add_tail(&bzone->link, &bzone->dev->map_rnd_list);
next:
		chunk++;
		e++;
		if (e >= DMZ_MAP_ENTRIES)
			dmap_mblk = NULL;
	}
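
	/*
	 * Note: for a buffered chunk the two zones point at each other
	 * through the same field: bzone->bzone is the buffered data zone
	 * and dzone->bzone is its buffer zone (set above).
	 */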

	/*
	 * At this point, only meta zones and mapped data zones were
	 * fully initialized. All remaining zones are unmapped data
	 * zones. Finish initializing those here.
	 */
	for (i = 0; i < zmd->nr_zones; i++) {
		dzone = dmz_get(zmd, i);
		if (!dzone)
			continue;
		if (dmz_is_meta(dzone))
			continue;
		if (dmz_is_offline(dzone))
			continue;

		if (dmz_is_cache(dzone))
			zmd->nr_cache++;
		else if (dmz_is_rnd(dzone))
			dzone->dev->nr_rnd++;
		else
			dzone->dev->nr_seq++;

		if (dmz_is_data(dzone)) {
			/* Already initialized */
			continue;
		}

		/* Unmapped data zone */
		set_bit(DMZ_DATA, &dzone->flags);
		dzone->chunk = DMZ_MAP_UNMAPPED;
		if (dmz_is_cache(dzone)) {
			list_add_tail(&dzone->link, &zmd->unmap_cache_list);
			atomic_inc(&zmd->unmap_nr_cache);
		} else if (dmz_is_rnd(dzone)) {
			list_add_tail(&dzone->link,
				      &dzone->dev->unmap_rnd_list);
			atomic_inc(&dzone->dev->unmap_nr_rnd);
		} else if (atomic_read(&zmd->nr_reserved_seq_zones) < zmd->nr_reserved_seq) {
			list_add_tail(&dzone->link, &zmd->reserved_seq_zones_list);
			set_bit(DMZ_RESERVED, &dzone->flags);
			atomic_inc(&zmd->nr_reserved_seq_zones);
			dzone->dev->nr_seq--;
		} else {
			list_add_tail(&dzone->link,
				      &dzone->dev->unmap_seq_list);
			atomic_inc(&dzone->dev->unmap_nr_seq);
		}
	}

	return 0;
}

/*
 * Set a data chunk mapping.
 */
static void dmz_set_chunk_mapping(struct dmz_metadata *zmd, unsigned int chunk,
				  unsigned int dzone_id, unsigned int bzone_id)
{
	struct dmz_mblock *dmap_mblk = zmd->map_mblk[chunk >> DMZ_MAP_ENTRIES_SHIFT];
	struct dmz_map *dmap = (struct dmz_map *) dmap_mblk->data;
	int map_idx = chunk & DMZ_MAP_ENTRIES_MASK;

	dmap[map_idx].dzone_id = cpu_to_le32(dzone_id);
	dmap[map_idx].bzone_id = cpu_to_le32(bzone_id);
	dmz_dirty_mblock(zmd, dmap_mblk);
}

/*
 * The list of mapped zones is maintained in LRU order.
 * This rotates a zone at the end of its map list.
 */
static void __dmz_lru_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
{
	if (list_empty(&zone->link))
		return;

	list_del_init(&zone->link);
	if (dmz_is_seq(zone)) {
		/* LRU rotate sequential zone */
		list_add_tail(&zone->link, &zone->dev->map_seq_list);
	} else if (dmz_is_cache(zone)) {
		/* LRU rotate cache zone */
		list_add_tail(&zone->link, &zmd->map_cache_list);
	} else {
		/* LRU rotate random zone */
		list_add_tail(&zone->link, &zone->dev->map_rnd_list);
	}
}

/*
 * The list of mapped random zones is maintained
 * in LRU order. This rotates a zone at the end of the list.
 */
static void dmz_lru_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
{
	__dmz_lru_zone(zmd, zone);
	if (zone->bzone)
		__dmz_lru_zone(zmd, zone->bzone);
}

/*
 * Wait for any zone to be freed.
 */
static void dmz_wait_for_free_zones(struct dmz_metadata *zmd)
{
	DEFINE_WAIT(wait);

	prepare_to_wait(&zmd->free_wq, &wait, TASK_UNINTERRUPTIBLE);
	dmz_unlock_map(zmd);
	dmz_unlock_metadata(zmd);

	io_schedule_timeout(HZ);

	dmz_lock_metadata(zmd);
	dmz_lock_map(zmd);
	finish_wait(&zmd->free_wq, &wait);
}
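/*
 * Note: dmz_wait_for_free_zones() drops both the map and metadata locks
 * while sleeping, so the chunk mapping may change across the wait; its
 * callers (dmz_get_chunk_mapping() and dmz_get_chunk_buffer()) therefore
 * retry from scratch with a "goto again".
 */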
/*
 * Lock a zone for reclaim (set the zone RECLAIM bit).
 * Returns 0 if the zone cannot be locked or is already locked,
 * and 1 otherwise.
 */
int dmz_lock_zone_reclaim(struct dm_zone *zone)
{
	/* Active zones cannot be reclaimed */
	if (dmz_is_active(zone))
		return 0;

	return !test_and_set_bit(DMZ_RECLAIM, &zone->flags);
}

/*
 * Clear a zone reclaim flag.
 */
void dmz_unlock_zone_reclaim(struct dm_zone *zone)
{
	WARN_ON(dmz_is_active(zone));
	WARN_ON(!dmz_in_reclaim(zone));

	clear_bit_unlock(DMZ_RECLAIM, &zone->flags);
	smp_mb__after_atomic();
	wake_up_bit(&zone->flags, DMZ_RECLAIM);
}

/*
 * Wait for a zone reclaim to complete.
 */
static void dmz_wait_for_reclaim(struct dmz_metadata *zmd, struct dm_zone *zone)
{
	dmz_unlock_map(zmd);
	dmz_unlock_metadata(zmd);
	set_bit(DMZ_RECLAIM_TERMINATE, &zone->flags);
	wait_on_bit_timeout(&zone->flags, DMZ_RECLAIM, TASK_UNINTERRUPTIBLE, HZ);
	clear_bit(DMZ_RECLAIM_TERMINATE, &zone->flags);
	dmz_lock_metadata(zmd);
	dmz_lock_map(zmd);
}

/*
 * Select a cache or random write zone for reclaim.
 */
static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd,
						    unsigned int idx, bool idle)
{
	struct dm_zone *dzone = NULL;
	struct dm_zone *zone, *last = NULL;
	struct list_head *zone_list;

	/* If we have cache zones select from the cache zone list */
	if (zmd->nr_cache) {
		zone_list = &zmd->map_cache_list;
		/* Try to reclaim random zones, too, when idle */
		if (idle && list_empty(zone_list))
			zone_list = &zmd->dev[idx].map_rnd_list;
	} else
		zone_list = &zmd->dev[idx].map_rnd_list;

	list_for_each_entry(zone, zone_list, link) {
		if (dmz_is_buf(zone)) {
			dzone = zone->bzone;
			if (dzone->dev->dev_idx != idx)
				continue;
			if (!last) {
				last = dzone;
				continue;
			}
			if (last->weight < dzone->weight)
				continue;
			dzone = last;
		} else
			dzone = zone;
		if (dmz_lock_zone_reclaim(dzone))
			return dzone;
	}

	return NULL;
}

/*
 * Select a buffered sequential zone for reclaim.
 */
static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd,
						    unsigned int idx)
{
	struct dm_zone *zone;

	list_for_each_entry(zone, &zmd->dev[idx].map_seq_list, link) {
		if (!zone->bzone)
			continue;
		if (dmz_lock_zone_reclaim(zone))
			return zone;
	}

	return NULL;
}

/*
 * Select a zone for reclaim.
 */
struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd,
					 unsigned int dev_idx, bool idle)
{
	struct dm_zone *zone;

	/*
	 * Search for a zone candidate to reclaim: 2 cases are possible.
	 * (1) There are no free sequential zones. Then a random data zone
	 *     cannot be reclaimed. So choose a sequential zone to reclaim so
	 *     that afterward a random zone can be reclaimed.
	 * (2) At least one free sequential zone is available, then choose
	 *     the oldest random zone (data or buffer) that can be locked.
	 */
	dmz_lock_map(zmd);
	if (list_empty(&zmd->reserved_seq_zones_list))
		zone = dmz_get_seq_zone_for_reclaim(zmd, dev_idx);
	else
		zone = dmz_get_rnd_zone_for_reclaim(zmd, dev_idx, idle);
	dmz_unlock_map(zmd);

	return zone;
}
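/*
 * Note: in dmz_get_rnd_zone_for_reclaim() above, a buffer zone is always
 * reclaimed through the data zone it buffers (zone->bzone), and only
 * candidates whose data zone belongs to the requested device index are
 * considered; zone weights are compared so that a heavier (more valid
 * blocks) buffered zone is not preferred over a lighter one already seen.
 */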
/*
 * Get the zone mapping a chunk, if the chunk is mapped already.
 * If no mapping exists and the operation is WRITE, a zone is
 * allocated and used to map the chunk.
 * The zone returned will be set to the active state.
 */
struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd, unsigned int chunk, int op)
{
	struct dmz_mblock *dmap_mblk = zmd->map_mblk[chunk >> DMZ_MAP_ENTRIES_SHIFT];
	struct dmz_map *dmap = (struct dmz_map *) dmap_mblk->data;
	int dmap_idx = chunk & DMZ_MAP_ENTRIES_MASK;
	unsigned int dzone_id;
	struct dm_zone *dzone = NULL;
	int ret = 0;
	int alloc_flags = zmd->nr_cache ? DMZ_ALLOC_CACHE : DMZ_ALLOC_RND;

	dmz_lock_map(zmd);
again:
	/* Get the chunk mapping */
	dzone_id = le32_to_cpu(dmap[dmap_idx].dzone_id);
	if (dzone_id == DMZ_MAP_UNMAPPED) {
		/*
		 * Read or discard in unmapped chunks are fine. But for
		 * writes, we need a mapping, so get one.
		 */
		if (op != REQ_OP_WRITE)
			goto out;

		/* Allocate a random zone */
		dzone = dmz_alloc_zone(zmd, 0, alloc_flags);
		if (!dzone) {
			if (dmz_dev_is_dying(zmd)) {
				dzone = ERR_PTR(-EIO);
				goto out;
			}
			dmz_wait_for_free_zones(zmd);
			goto again;
		}

		dmz_map_zone(zmd, dzone, chunk);

	} else {
		/* The chunk is already mapped: get the mapping zone */
		dzone = dmz_get(zmd, dzone_id);
		if (!dzone) {
			dzone = ERR_PTR(-EIO);
			goto out;
		}
		if (dzone->chunk != chunk) {
			dzone = ERR_PTR(-EIO);
			goto out;
		}

		/* Repair write pointer if the sequential dzone has error */
		if (dmz_seq_write_err(dzone)) {
			ret = dmz_handle_seq_write_err(zmd, dzone);
			if (ret) {
				dzone = ERR_PTR(-EIO);
				goto out;
			}
			clear_bit(DMZ_SEQ_WRITE_ERR, &dzone->flags);
		}
	}

	/*
	 * If the zone is being reclaimed, the chunk mapping may change
	 * to a different zone. So wait for reclaim and retry. Otherwise,
	 * activate the zone (this will prevent reclaim from touching it).
	 */
	if (dmz_in_reclaim(dzone)) {
		dmz_wait_for_reclaim(zmd, dzone);
		goto again;
	}
	dmz_activate_zone(dzone);
	dmz_lru_zone(zmd, dzone);
out:
	dmz_unlock_map(zmd);

	return dzone;
}

/*
 * Write and discard change the block validity of data zones and their buffer
 * zones. Check here that valid blocks are still present. If all blocks are
 * invalid, the zones can be unmapped on the fly without waiting for reclaim
 * to do it.
 */
void dmz_put_chunk_mapping(struct dmz_metadata *zmd, struct dm_zone *dzone)
{
	struct dm_zone *bzone;

	dmz_lock_map(zmd);

	bzone = dzone->bzone;
	if (bzone) {
		if (dmz_weight(bzone))
			dmz_lru_zone(zmd, bzone);
		else {
			/* Empty buffer zone: reclaim it */
			dmz_unmap_zone(zmd, bzone);
			dmz_free_zone(zmd, bzone);
			bzone = NULL;
		}
	}

	/* Deactivate the data zone */
	dmz_deactivate_zone(dzone);
	if (dmz_is_active(dzone) || bzone || dmz_weight(dzone))
		dmz_lru_zone(zmd, dzone);
	else {
		/* Unbuffered inactive empty data zone: reclaim it */
		dmz_unmap_zone(zmd, dzone);
		dmz_free_zone(zmd, dzone);
	}

	dmz_unlock_map(zmd);
}
/*
 * Allocate and map a random zone to buffer a chunk
 * already mapped to a sequential zone.
 */
struct dm_zone *dmz_get_chunk_buffer(struct dmz_metadata *zmd,
                                     struct dm_zone *dzone)
{
        struct dm_zone *bzone;
        int alloc_flags = zmd->nr_cache ? DMZ_ALLOC_CACHE : DMZ_ALLOC_RND;

        dmz_lock_map(zmd);
again:
        bzone = dzone->bzone;
        if (bzone)
                goto out;

        /* Allocate a random zone */
        bzone = dmz_alloc_zone(zmd, 0, alloc_flags);
        if (!bzone) {
                if (dmz_dev_is_dying(zmd)) {
                        bzone = ERR_PTR(-EIO);
                        goto out;
                }
                dmz_wait_for_free_zones(zmd);
                goto again;
        }

        /* Update the chunk mapping */
        dmz_set_chunk_mapping(zmd, dzone->chunk, dzone->id, bzone->id);

        set_bit(DMZ_BUF, &bzone->flags);
        bzone->chunk = dzone->chunk;
        bzone->bzone = dzone;
        dzone->bzone = bzone;
        if (dmz_is_cache(bzone))
                list_add_tail(&bzone->link, &zmd->map_cache_list);
        else
                list_add_tail(&bzone->link, &bzone->dev->map_rnd_list);
out:
        dmz_unlock_map(zmd);

        return bzone;
}

/*
 * Get an unmapped (free) zone.
 * This must be called with the mapping lock held.
 */
struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned int dev_idx,
                               unsigned long flags)
{
        struct list_head *list;
        struct dm_zone *zone;
        int i = 0;

again:
        if (flags & DMZ_ALLOC_CACHE)
                list = &zmd->unmap_cache_list;
        else if (flags & DMZ_ALLOC_RND)
                list = &zmd->dev[dev_idx].unmap_rnd_list;
        else
                list = &zmd->dev[dev_idx].unmap_seq_list;

        if (list_empty(list)) {
                /*
                 * No free zone: return NULL if this is not for reclaim.
                 */
                if (!(flags & DMZ_ALLOC_RECLAIM))
                        return NULL;
                /*
                 * Try to allocate from the other devices.
                 */
                if (i < zmd->nr_devs) {
                        dev_idx = (dev_idx + 1) % zmd->nr_devs;
                        i++;
                        goto again;
                }

                /*
                 * Fall back to the reserved sequential zones.
                 */
                zone = list_first_entry_or_null(&zmd->reserved_seq_zones_list,
                                                struct dm_zone, link);
                if (zone) {
                        list_del_init(&zone->link);
                        atomic_dec(&zmd->nr_reserved_seq_zones);
                }
                return zone;
        }

        zone = list_first_entry(list, struct dm_zone, link);
        list_del_init(&zone->link);

        if (dmz_is_cache(zone))
                atomic_dec(&zmd->unmap_nr_cache);
        else if (dmz_is_rnd(zone))
                atomic_dec(&zone->dev->unmap_nr_rnd);
        else
                atomic_dec(&zone->dev->unmap_nr_seq);

        if (dmz_is_offline(zone)) {
                dmz_zmd_warn(zmd, "Zone %u is offline", zone->id);
                zone = NULL;
                goto again;
        }
        if (dmz_is_meta(zone)) {
                dmz_zmd_warn(zmd, "Zone %u has metadata", zone->id);
                zone = NULL;
                goto again;
        }
        return zone;
}

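/*
 * Allocation sketch (illustrative only): callers select a free list via
 * the flags. For example, reclaim allocating a sequential destination
 * zone on device 0 could pass DMZ_ALLOC_RECLAIM so that the search above
 * falls back to the other devices and, as a last resort, to the reserved
 * sequential zones:
 *
 *      zone = dmz_alloc_zone(zmd, 0, DMZ_ALLOC_SEQ | DMZ_ALLOC_RECLAIM);
 *
 * This assumes the DMZ_ALLOC_* flags declared in dm-zoned.h; any flags
 * value without DMZ_ALLOC_CACHE or DMZ_ALLOC_RND selects the sequential
 * unmap list.
 */
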
/*
 * Free a zone.
 * This must be called with the mapping lock held.
 */
void dmz_free_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
{
        /* If this is a sequential zone, reset it */
        if (dmz_is_seq(zone))
                dmz_reset_zone(zmd, zone);

        /* Return the zone to its type unmap list */
        if (dmz_is_cache(zone)) {
                list_add_tail(&zone->link, &zmd->unmap_cache_list);
                atomic_inc(&zmd->unmap_nr_cache);
        } else if (dmz_is_rnd(zone)) {
                list_add_tail(&zone->link, &zone->dev->unmap_rnd_list);
                atomic_inc(&zone->dev->unmap_nr_rnd);
        } else if (dmz_is_reserved(zone)) {
                list_add_tail(&zone->link, &zmd->reserved_seq_zones_list);
                atomic_inc(&zmd->nr_reserved_seq_zones);
        } else {
                list_add_tail(&zone->link, &zone->dev->unmap_seq_list);
                atomic_inc(&zone->dev->unmap_nr_seq);
        }

        wake_up_all(&zmd->free_wq);
}

/*
 * Map a chunk to a zone.
 * This must be called with the mapping lock held.
 */
void dmz_map_zone(struct dmz_metadata *zmd, struct dm_zone *dzone,
                  unsigned int chunk)
{
        /* Set the chunk mapping */
        dmz_set_chunk_mapping(zmd, chunk, dzone->id,
                              DMZ_MAP_UNMAPPED);
        dzone->chunk = chunk;
        if (dmz_is_cache(dzone))
                list_add_tail(&dzone->link, &zmd->map_cache_list);
        else if (dmz_is_rnd(dzone))
                list_add_tail(&dzone->link, &dzone->dev->map_rnd_list);
        else
                list_add_tail(&dzone->link, &dzone->dev->map_seq_list);
}

/*
 * Unmap a zone.
 * This must be called with the mapping lock held.
 */
void dmz_unmap_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
{
        unsigned int chunk = zone->chunk;
        unsigned int dzone_id;

        if (chunk == DMZ_MAP_UNMAPPED) {
                /* Already unmapped */
                return;
        }

        if (test_and_clear_bit(DMZ_BUF, &zone->flags)) {
                /*
                 * Unmapping the chunk buffer zone: clear only
                 * the chunk buffer mapping
                 */
                dzone_id = zone->bzone->id;
                zone->bzone->bzone = NULL;
                zone->bzone = NULL;

        } else {
                /*
                 * Unmapping the chunk data zone: the zone must
                 * not be buffered.
                 */
                if (WARN_ON(zone->bzone)) {
                        zone->bzone->bzone = NULL;
                        zone->bzone = NULL;
                }
                dzone_id = DMZ_MAP_UNMAPPED;
        }

        dmz_set_chunk_mapping(zmd, chunk, dzone_id, DMZ_MAP_UNMAPPED);

        zone->chunk = DMZ_MAP_UNMAPPED;
        list_del_init(&zone->link);
}

/*
 * Set @nr_bits bits in @bitmap starting from @bit.
 * Return the number of bits changed from 0 to 1.
 */
static unsigned int dmz_set_bits(unsigned long *bitmap,
                                 unsigned int bit, unsigned int nr_bits)
{
        unsigned long *addr;
        unsigned int end = bit + nr_bits;
        unsigned int n = 0;

        while (bit < end) {
                if (((bit & (BITS_PER_LONG - 1)) == 0) &&
                    ((end - bit) >= BITS_PER_LONG)) {
                        /* Try to set the whole word at once */
                        addr = bitmap + BIT_WORD(bit);
                        if (*addr == 0) {
                                *addr = ULONG_MAX;
                                n += BITS_PER_LONG;
                                bit += BITS_PER_LONG;
                                continue;
                        }
                }

                if (!test_and_set_bit(bit, bitmap))
                        n++;
                bit++;
        }

        return n;
}

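/*
 * Worked example for dmz_set_bits() above, assuming BITS_PER_LONG == 64:
 * a call with bit = 60 and nr_bits = 200 sets bits 60..63 individually,
 * then takes the whole-word shortcut for bits 64..255 (three words, when
 * each word is still 0), and finishes with bits 256..259 individually.
 * The returned count includes only bits that actually flipped from 0 to
 * 1, which is what keeps the zone weight accounting in
 * dmz_validate_blocks() exact.
 */
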
/*
 * Get the bitmap block storing the bit for chunk_block in zone.
 */
static struct dmz_mblock *dmz_get_bitmap(struct dmz_metadata *zmd,
                                         struct dm_zone *zone,
                                         sector_t chunk_block)
{
        sector_t bitmap_block = 1 + zmd->nr_map_blocks +
                (sector_t)(zone->id * zmd->zone_nr_bitmap_blocks) +
                (chunk_block >> DMZ_BLOCK_SHIFT_BITS);

        return dmz_get_mblock(zmd, bitmap_block);
}

/*
 * Copy the valid blocks bitmap of from_zone to the bitmap of to_zone.
 */
int dmz_copy_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone,
                          struct dm_zone *to_zone)
{
        struct dmz_mblock *from_mblk, *to_mblk;
        sector_t chunk_block = 0;

        /* Get the zones bitmap blocks */
        while (chunk_block < zmd->zone_nr_blocks) {
                from_mblk = dmz_get_bitmap(zmd, from_zone, chunk_block);
                if (IS_ERR(from_mblk))
                        return PTR_ERR(from_mblk);
                to_mblk = dmz_get_bitmap(zmd, to_zone, chunk_block);
                if (IS_ERR(to_mblk)) {
                        dmz_release_mblock(zmd, from_mblk);
                        return PTR_ERR(to_mblk);
                }

                memcpy(to_mblk->data, from_mblk->data, DMZ_BLOCK_SIZE);
                dmz_dirty_mblock(zmd, to_mblk);

                dmz_release_mblock(zmd, to_mblk);
                dmz_release_mblock(zmd, from_mblk);

                chunk_block += zmd->zone_bits_per_mblk;
        }

        to_zone->weight = from_zone->weight;

        return 0;
}

/*
 * Merge the valid blocks bitmap of from_zone into the bitmap of to_zone,
 * starting from chunk_block.
 */
int dmz_merge_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone,
                           struct dm_zone *to_zone, sector_t chunk_block)
{
        unsigned int nr_blocks;
        int ret;

        /* Get the zones bitmap blocks */
        while (chunk_block < zmd->zone_nr_blocks) {
                /* Get a valid region from the source zone */
                ret = dmz_first_valid_block(zmd, from_zone, &chunk_block);
                if (ret <= 0)
                        return ret;

                nr_blocks = ret;
                ret = dmz_validate_blocks(zmd, to_zone, chunk_block, nr_blocks);
                if (ret)
                        return ret;

                chunk_block += nr_blocks;
        }

        return 0;
}

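/*
 * Worked example (hypothetical geometry) for dmz_get_bitmap() above:
 * with nr_map_blocks = 8, zone_nr_bitmap_blocks = 2, and
 * DMZ_BLOCK_SHIFT_BITS = 15 (32768 validity bits per 4 KB block), the
 * bitmap block holding the bit of chunk_block 40000 in zone 3 is
 *
 *      1 + 8 + (3 * 2) + (40000 >> 15) = 15 + 1 = 16
 *
 * The leading 1 skips the super block, matching the on-disk layout
 * described at the top of this file.
 */
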
/*
 * Validate all the blocks in the range [block..block+nr_blocks-1].
 */
int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
                        sector_t chunk_block, unsigned int nr_blocks)
{
        unsigned int count, bit, nr_bits;
        unsigned int zone_nr_blocks = zmd->zone_nr_blocks;
        struct dmz_mblock *mblk;
        unsigned int n = 0;

        dmz_zmd_debug(zmd, "=> VALIDATE zone %u, block %llu, %u blocks",
                      zone->id, (unsigned long long)chunk_block,
                      nr_blocks);

        WARN_ON(chunk_block + nr_blocks > zone_nr_blocks);

        while (nr_blocks) {
                /* Get bitmap block */
                mblk = dmz_get_bitmap(zmd, zone, chunk_block);
                if (IS_ERR(mblk))
                        return PTR_ERR(mblk);

                /* Set bits */
                bit = chunk_block & DMZ_BLOCK_MASK_BITS;
                nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);

                count = dmz_set_bits((unsigned long *)mblk->data, bit, nr_bits);
                if (count) {
                        dmz_dirty_mblock(zmd, mblk);
                        n += count;
                }
                dmz_release_mblock(zmd, mblk);

                nr_blocks -= nr_bits;
                chunk_block += nr_bits;
        }

        if (likely(zone->weight + n <= zone_nr_blocks))
                zone->weight += n;
        else {
                dmz_zmd_warn(zmd, "Zone %u: weight %u should be <= %u",
                             zone->id, zone->weight,
                             zone_nr_blocks - n);
                zone->weight = zone_nr_blocks;
        }

        return 0;
}

/*
 * Clear nr_bits bits in bitmap starting from bit.
 * Return the number of bits cleared.
 */
static int dmz_clear_bits(unsigned long *bitmap, int bit, int nr_bits)
{
        unsigned long *addr;
        int end = bit + nr_bits;
        int n = 0;

        while (bit < end) {
                if (((bit & (BITS_PER_LONG - 1)) == 0) &&
                    ((end - bit) >= BITS_PER_LONG)) {
                        /* Try to clear whole word at once */
                        addr = bitmap + BIT_WORD(bit);
                        if (*addr == ULONG_MAX) {
                                *addr = 0;
                                n += BITS_PER_LONG;
                                bit += BITS_PER_LONG;
                                continue;
                        }
                }

                if (test_and_clear_bit(bit, bitmap))
                        n++;
                bit++;
        }

        return n;
}

/*
 * Invalidate all the blocks in the range [block..block+nr_blocks-1].
 */
int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
                          sector_t chunk_block, unsigned int nr_blocks)
{
        unsigned int count, bit, nr_bits;
        struct dmz_mblock *mblk;
        unsigned int n = 0;

        dmz_zmd_debug(zmd, "=> INVALIDATE zone %u, block %llu, %u blocks",
                      zone->id, (u64)chunk_block, nr_blocks);

        WARN_ON(chunk_block + nr_blocks > zmd->zone_nr_blocks);

        while (nr_blocks) {
                /* Get bitmap block */
                mblk = dmz_get_bitmap(zmd, zone, chunk_block);
                if (IS_ERR(mblk))
                        return PTR_ERR(mblk);

                /* Clear bits */
                bit = chunk_block & DMZ_BLOCK_MASK_BITS;
                nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);

                count = dmz_clear_bits((unsigned long *)mblk->data,
                                       bit, nr_bits);
                if (count) {
                        dmz_dirty_mblock(zmd, mblk);
                        n += count;
                }
                dmz_release_mblock(zmd, mblk);

                nr_blocks -= nr_bits;
                chunk_block += nr_bits;
        }

        if (zone->weight >= n)
                zone->weight -= n;
        else {
                dmz_zmd_warn(zmd, "Zone %u: weight %u should be >= %u",
                             zone->id, zone->weight, n);
                zone->weight = 0;
        }

        return 0;
}

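/*
 * Worked example (assumed geometry) for the validate/invalidate loops
 * above: with zone_bits_per_mblk = 32768, a request for chunk_block =
 * 32760 and nr_blocks = 20 is split across two bitmap blocks:
 *
 *      pass 1: bit = 32760 & DMZ_BLOCK_MASK_BITS = 32760, nr_bits = 8
 *      pass 2: bit = 0, nr_bits = 12
 *
 * Only bitmap blocks whose content actually changed (count != 0) are
 * marked dirty, so revalidating an already-valid range (or invalidating
 * an already-invalid one) costs no metadata write-back.
 */
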
/*
 * Get a block bit value.
 */
static int dmz_test_block(struct dmz_metadata *zmd, struct dm_zone *zone,
                          sector_t chunk_block)
{
        struct dmz_mblock *mblk;
        int ret;

        WARN_ON(chunk_block >= zmd->zone_nr_blocks);

        /* Get bitmap block */
        mblk = dmz_get_bitmap(zmd, zone, chunk_block);
        if (IS_ERR(mblk))
                return PTR_ERR(mblk);

        /* Get offset */
        ret = test_bit(chunk_block & DMZ_BLOCK_MASK_BITS,
                       (unsigned long *) mblk->data) != 0;

        dmz_release_mblock(zmd, mblk);

        return ret;
}

/*
 * Return the number of blocks from chunk_block to the first block with a bit
 * value specified by set. Search at most nr_blocks blocks from chunk_block.
 */
static int dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone,
                                 sector_t chunk_block, unsigned int nr_blocks,
                                 int set)
{
        struct dmz_mblock *mblk;
        unsigned int bit, set_bit, nr_bits;
        unsigned int zone_bits = zmd->zone_bits_per_mblk;
        unsigned long *bitmap;
        int n = 0;

        WARN_ON(chunk_block + nr_blocks > zmd->zone_nr_blocks);

        while (nr_blocks) {
                /* Get bitmap block */
                mblk = dmz_get_bitmap(zmd, zone, chunk_block);
                if (IS_ERR(mblk))
                        return PTR_ERR(mblk);

                /* Get offset */
                bitmap = (unsigned long *) mblk->data;
                bit = chunk_block & DMZ_BLOCK_MASK_BITS;
                nr_bits = min(nr_blocks, zone_bits - bit);
                if (set)
                        set_bit = find_next_bit(bitmap, zone_bits, bit);
                else
                        set_bit = find_next_zero_bit(bitmap, zone_bits, bit);
                dmz_release_mblock(zmd, mblk);

                n += set_bit - bit;
                if (set_bit < zone_bits)
                        break;

                nr_blocks -= nr_bits;
                chunk_block += nr_bits;
        }

        return n;
}

/*
 * Test if chunk_block is valid. If it is, the number of consecutive
 * valid blocks from chunk_block will be returned.
 */
int dmz_block_valid(struct dmz_metadata *zmd, struct dm_zone *zone,
                    sector_t chunk_block)
{
        int valid;

        valid = dmz_test_block(zmd, zone, chunk_block);
        if (valid <= 0)
                return valid;

        /* The block is valid: get the number of valid blocks from block */
        return dmz_to_next_set_block(zmd, zone, chunk_block,
                                     zmd->zone_nr_blocks - chunk_block, 0);
}

/*
 * Find the first valid block from @chunk_block in @zone.
 * If such a block is found, its number is returned using
 * @chunk_block and the total number of valid blocks from @chunk_block
 * is returned.
 */
int dmz_first_valid_block(struct dmz_metadata *zmd, struct dm_zone *zone,
                          sector_t *chunk_block)
{
        sector_t start_block = *chunk_block;
        int ret;

        ret = dmz_to_next_set_block(zmd, zone, start_block,
                                    zmd->zone_nr_blocks - start_block, 1);
        if (ret < 0)
                return ret;

        start_block += ret;
        *chunk_block = start_block;

        return dmz_to_next_set_block(zmd, zone, start_block,
                                     zmd->zone_nr_blocks - start_block, 0);
}

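/*
 * Usage sketch (illustrative only): when handling a read of a buffered
 * chunk, a caller can ask how many consecutive blocks are valid starting
 * at chunk_block in the buffer zone, and fall back to the data zone for
 * the rest. dmz_block_valid() is the helper defined above;
 * dmz_submit_read() is hypothetical shorthand for the actual BIO
 * submission done by the target:
 *
 *      ret = dmz_block_valid(zmd, bzone, chunk_block);
 *      if (ret < 0)
 *              return ret;
 *      if (ret > 0)
 *              return dmz_submit_read(bzone, chunk_block, ret);
 *      return dmz_submit_read(dzone, chunk_block, nr_blocks);
 */
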
/*
 * Count the number of bits set starting from bit up to bit + nr_bits - 1.
 */
static int dmz_count_bits(void *bitmap, int bit, int nr_bits)
{
        unsigned long *addr;
        int end = bit + nr_bits;
        int n = 0;

        while (bit < end) {
                if (((bit & (BITS_PER_LONG - 1)) == 0) &&
                    ((end - bit) >= BITS_PER_LONG)) {
                        addr = (unsigned long *)bitmap + BIT_WORD(bit);
                        if (*addr == ULONG_MAX) {
                                n += BITS_PER_LONG;
                                bit += BITS_PER_LONG;
                                continue;
                        }
                }

                if (test_bit(bit, bitmap))
                        n++;
                bit++;
        }

        return n;
}

/*
 * Get a zone weight.
 */
static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone)
{
        struct dmz_mblock *mblk;
        sector_t chunk_block = 0;
        unsigned int bit, nr_bits;
        unsigned int nr_blocks = zmd->zone_nr_blocks;
        void *bitmap;
        int n = 0;

        while (nr_blocks) {
                /* Get bitmap block */
                mblk = dmz_get_bitmap(zmd, zone, chunk_block);
                if (IS_ERR(mblk)) {
                        n = 0;
                        break;
                }

                /* Count bits in this block */
                bitmap = mblk->data;
                bit = chunk_block & DMZ_BLOCK_MASK_BITS;
                nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);
                n += dmz_count_bits(bitmap, bit, nr_bits);

                dmz_release_mblock(zmd, mblk);

                nr_blocks -= nr_bits;
                chunk_block += nr_bits;
        }

        zone->weight = n;
}

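/*
 * Worked example for dmz_count_bits() and dmz_get_zone_weight() above,
 * assuming BITS_PER_LONG == 64: counting bit = 0, nr_bits = 130 over a
 * bitmap whose first two words are all ones and whose third word has
 * only bit 129 (globally) set takes two whole-word shortcuts for bits
 * 0..127, then tests bits 128 and 129 individually, returning
 * 64 + 64 + 1 = 129. A zone's weight is this count accumulated over all
 * of its bitmap blocks.
 */
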
/*
 * Cleanup the zoned metadata resources.
 */
static void dmz_cleanup_metadata(struct dmz_metadata *zmd)
{
        struct rb_root *root;
        struct dmz_mblock *mblk, *next;
        int i;

        /* Release zone mapping resources */
        if (zmd->map_mblk) {
                for (i = 0; i < zmd->nr_map_blocks; i++)
                        dmz_release_mblock(zmd, zmd->map_mblk[i]);
                kfree(zmd->map_mblk);
                zmd->map_mblk = NULL;
        }

        /* Release super blocks */
        for (i = 0; i < 2; i++) {
                if (zmd->sb[i].mblk) {
                        dmz_free_mblock(zmd, zmd->sb[i].mblk);
                        zmd->sb[i].mblk = NULL;
                }
        }

        /* Free cached blocks */
        while (!list_empty(&zmd->mblk_dirty_list)) {
                mblk = list_first_entry(&zmd->mblk_dirty_list,
                                        struct dmz_mblock, link);
                dmz_zmd_warn(zmd, "mblock %llu still in dirty list (ref %u)",
                             (u64)mblk->no, mblk->ref);
                list_del_init(&mblk->link);
                rb_erase(&mblk->node, &zmd->mblk_rbtree);
                dmz_free_mblock(zmd, mblk);
        }

        while (!list_empty(&zmd->mblk_lru_list)) {
                mblk = list_first_entry(&zmd->mblk_lru_list,
                                        struct dmz_mblock, link);
                list_del_init(&mblk->link);
                rb_erase(&mblk->node, &zmd->mblk_rbtree);
                dmz_free_mblock(zmd, mblk);
        }

        /* Sanity checks: the mblock rbtree should now be empty */
        root = &zmd->mblk_rbtree;
        rbtree_postorder_for_each_entry_safe(mblk, next, root, node) {
                dmz_zmd_warn(zmd, "mblock %llu ref %u still in rbtree",
                             (u64)mblk->no, mblk->ref);
                mblk->ref = 0;
                dmz_free_mblock(zmd, mblk);
        }

        /* Free the zone descriptors */
        dmz_drop_zones(zmd);

        mutex_destroy(&zmd->mblk_flush_lock);
        mutex_destroy(&zmd->map_lock);
}

static void dmz_print_dev(struct dmz_metadata *zmd, int num)
{
        struct dmz_dev *dev = &zmd->dev[num];

        if (bdev_zoned_model(dev->bdev) == BLK_ZONED_NONE)
                dmz_dev_info(dev, "Regular block device");
        else
                dmz_dev_info(dev, "Host-%s zoned block device",
                             bdev_zoned_model(dev->bdev) == BLK_ZONED_HA ?
                             "aware" : "managed");
        if (zmd->sb_version > 1) {
                sector_t sector_offset =
                        dev->zone_offset << zmd->zone_nr_sectors_shift;

                dmz_dev_info(dev, " %llu 512-byte logical sectors (offset %llu)",
                             (u64)dev->capacity, (u64)sector_offset);
                dmz_dev_info(dev, " %u zones of %llu 512-byte logical sectors (offset %llu)",
                             dev->nr_zones, (u64)zmd->zone_nr_sectors,
                             (u64)dev->zone_offset);
        } else {
                dmz_dev_info(dev, " %llu 512-byte logical sectors",
                             (u64)dev->capacity);
                dmz_dev_info(dev, " %u zones of %llu 512-byte logical sectors",
                             dev->nr_zones, (u64)zmd->zone_nr_sectors);
        }
}

/*
 * Initialize the zoned metadata.
 */
int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev,
                     struct dmz_metadata **metadata,
                     const char *devname)
{
        struct dmz_metadata *zmd;
        unsigned int i;
        struct dm_zone *zone;
        int ret;

        zmd = kzalloc(sizeof(struct dmz_metadata), GFP_KERNEL);
        if (!zmd)
                return -ENOMEM;

        strcpy(zmd->devname, devname);
        zmd->dev = dev;
        zmd->nr_devs = num_dev;
        zmd->mblk_rbtree = RB_ROOT;
        init_rwsem(&zmd->mblk_sem);
        mutex_init(&zmd->mblk_flush_lock);
        spin_lock_init(&zmd->mblk_lock);
        INIT_LIST_HEAD(&zmd->mblk_lru_list);
        INIT_LIST_HEAD(&zmd->mblk_dirty_list);

        mutex_init(&zmd->map_lock);

        atomic_set(&zmd->unmap_nr_cache, 0);
        INIT_LIST_HEAD(&zmd->unmap_cache_list);
        INIT_LIST_HEAD(&zmd->map_cache_list);

        atomic_set(&zmd->nr_reserved_seq_zones, 0);
        INIT_LIST_HEAD(&zmd->reserved_seq_zones_list);

        init_waitqueue_head(&zmd->free_wq);

        /* Initialize zone descriptors */
        ret = dmz_init_zones(zmd);
        if (ret)
                goto err;

        /* Get super block */
        ret = dmz_load_sb(zmd);
        if (ret)
                goto err;

        /* Set metadata zones starting from sb_zone */
        for (i = 0; i < zmd->nr_meta_zones << 1; i++) {
                zone = dmz_get(zmd, zmd->sb[0].zone->id + i);
                if (!zone) {
                        dmz_zmd_err(zmd,
                                    "metadata zone %u not present", i);
                        ret = -ENXIO;
                        goto err;
                }
                if (!dmz_is_rnd(zone) && !dmz_is_cache(zone)) {
                        dmz_zmd_err(zmd,
                                    "metadata zone %u is not random", i);
                        ret = -ENXIO;
                        goto err;
                }
                set_bit(DMZ_META, &zone->flags);
        }

        /* Load mapping table */
        ret = dmz_load_mapping(zmd);
        if (ret)
                goto err;

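        /*
         * Worked example (hypothetical geometry) for the cache bounds
         * computed below: with nr_map_blocks = 8 and
         * zone_nr_bitmap_blocks = 2, min_nr_mblks is 2 + 8 + 2 * 16 = 42
         * cached metadata blocks and max_nr_mblks is 42 + 512 = 554. The
         * shrinker registered below is expected to evict cached blocks
         * only while the cache holds more than min_nr_mblks of them.
         */
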
        /*
         * Cache size boundaries: allow at least 2 super blocks, the chunk map
         * blocks and enough blocks to be able to cache the bitmap blocks of
         * up to 16 zones when idle (min_nr_mblks). Otherwise, if busy, allow
         * the cache to add 512 more metadata blocks.
         */
        zmd->min_nr_mblks = 2 + zmd->nr_map_blocks + zmd->zone_nr_bitmap_blocks * 16;
        zmd->max_nr_mblks = zmd->min_nr_mblks + 512;
        zmd->mblk_shrinker.count_objects = dmz_mblock_shrinker_count;
        zmd->mblk_shrinker.scan_objects = dmz_mblock_shrinker_scan;
        zmd->mblk_shrinker.seeks = DEFAULT_SEEKS;

        /* Metadata cache shrinker */
        ret = register_shrinker(&zmd->mblk_shrinker);
        if (ret) {
                dmz_zmd_err(zmd, "Register metadata cache shrinker failed");
                goto err;
        }

        dmz_zmd_info(zmd, "DM-Zoned metadata version %d", zmd->sb_version);
        for (i = 0; i < zmd->nr_devs; i++)
                dmz_print_dev(zmd, i);

        dmz_zmd_info(zmd, " %u zones of %llu 512-byte logical sectors",
                     zmd->nr_zones, (u64)zmd->zone_nr_sectors);
        dmz_zmd_debug(zmd, " %u metadata zones",
                      zmd->nr_meta_zones * 2);
        dmz_zmd_debug(zmd, " %u data zones for %u chunks",
                      zmd->nr_data_zones, zmd->nr_chunks);
        dmz_zmd_debug(zmd, " %u cache zones (%u unmapped)",
                      zmd->nr_cache, atomic_read(&zmd->unmap_nr_cache));
        for (i = 0; i < zmd->nr_devs; i++) {
                dmz_zmd_debug(zmd, " %u random zones (%u unmapped)",
                              dmz_nr_rnd_zones(zmd, i),
                              dmz_nr_unmap_rnd_zones(zmd, i));
                dmz_zmd_debug(zmd, " %u sequential zones (%u unmapped)",
                              dmz_nr_seq_zones(zmd, i),
                              dmz_nr_unmap_seq_zones(zmd, i));
        }
        dmz_zmd_debug(zmd, " %u reserved sequential data zones",
                      zmd->nr_reserved_seq);
        dmz_zmd_debug(zmd, "Format:");
        dmz_zmd_debug(zmd, "%u metadata blocks per set (%u max cache)",
                      zmd->nr_meta_blocks, zmd->max_nr_mblks);
        dmz_zmd_debug(zmd, " %u data zone mapping blocks",
                      zmd->nr_map_blocks);
        dmz_zmd_debug(zmd, " %u bitmap blocks",
                      zmd->nr_bitmap_blocks);

        *metadata = zmd;

        return 0;
err:
        dmz_cleanup_metadata(zmd);
        kfree(zmd);
        *metadata = NULL;

        return ret;
}

/*
 * Cleanup the zoned metadata resources.
 */
void dmz_dtr_metadata(struct dmz_metadata *zmd)
{
        unregister_shrinker(&zmd->mblk_shrinker);
        dmz_cleanup_metadata(zmd);
        kfree(zmd);
}

/*
 * Check zone information on resume.
 */
int dmz_resume_metadata(struct dmz_metadata *zmd)
{
        struct dm_zone *zone;
        sector_t wp_block;
        unsigned int i;
        int ret;

        /* Check zones */
        for (i = 0; i < zmd->nr_zones; i++) {
                zone = dmz_get(zmd, i);
                if (!zone) {
                        dmz_zmd_err(zmd, "Unable to get zone %u", i);
                        return -EIO;
                }
                wp_block = zone->wp_block;

                ret = dmz_update_zone(zmd, zone);
                if (ret) {
                        dmz_zmd_err(zmd, "Broken zone %u", i);
                        return ret;
                }

                if (dmz_is_offline(zone)) {
                        dmz_zmd_warn(zmd, "Zone %u is offline", i);
                        continue;
                }

                /* Check write pointer */
                if (!dmz_is_seq(zone))
                        zone->wp_block = 0;
                else if (zone->wp_block != wp_block) {
                        dmz_zmd_err(zmd, "Zone %u: Invalid wp (%llu / %llu)",
                                    i, (u64)zone->wp_block, (u64)wp_block);
                        zone->wp_block = wp_block;
                        dmz_invalidate_blocks(zmd, zone, zone->wp_block,
                                              zmd->zone_nr_blocks - zone->wp_block);
                }
        }

        return 0;
}

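/*
 * Lifecycle sketch (illustrative only): a dm-zoned target ties the entry
 * points above together roughly as follows, with error handling elided:
 *
 *      ret = dmz_ctr_metadata(dev, num_dev, &zmd, devname);  (constructor)
 *      ret = dmz_resume_metadata(zmd);                       (resume hook)
 *      dmz_dtr_metadata(zmd);                                (destructor)
 *
 * dmz_ctr_metadata() cleans up after itself on failure, so callers only
 * ever destroy a metadata set that was successfully constructed.
 */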