1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 2017 Western Digital Corporation or its affiliates. 4 * 5 * This file is released under the GPL. 6 */ 7 8 #include "dm-zoned.h" 9 10 #include <linux/module.h> 11 #include <linux/crc32.h> 12 #include <linux/sched/mm.h> 13 14 #define DM_MSG_PREFIX "zoned metadata" 15 16 /* 17 * Metadata version. 18 */ 19 #define DMZ_META_VER 2 20 21 /* 22 * On-disk super block magic. 23 */ 24 #define DMZ_MAGIC ((((unsigned int)('D')) << 24) | \ 25 (((unsigned int)('Z')) << 16) | \ 26 (((unsigned int)('B')) << 8) | \ 27 ((unsigned int)('D'))) 28 29 /* 30 * On disk super block. 31 * This uses only 512 B but uses on disk a full 4KB block. This block is 32 * followed on disk by the mapping table of chunks to zones and the bitmap 33 * blocks indicating zone block validity. 34 * The overall resulting metadata format is: 35 * (1) Super block (1 block) 36 * (2) Chunk mapping table (nr_map_blocks) 37 * (3) Bitmap blocks (nr_bitmap_blocks) 38 * All metadata blocks are stored in conventional zones, starting from 39 * the first conventional zone found on disk. 40 */ 41 struct dmz_super { 42 /* Magic number */ 43 __le32 magic; /* 4 */ 44 45 /* Metadata version number */ 46 __le32 version; /* 8 */ 47 48 /* Generation number */ 49 __le64 gen; /* 16 */ 50 51 /* This block number */ 52 __le64 sb_block; /* 24 */ 53 54 /* The number of metadata blocks, including this super block */ 55 __le32 nr_meta_blocks; /* 28 */ 56 57 /* The number of sequential zones reserved for reclaim */ 58 __le32 nr_reserved_seq; /* 32 */ 59 60 /* The number of entries in the mapping table */ 61 __le32 nr_chunks; /* 36 */ 62 63 /* The number of blocks used for the chunk mapping table */ 64 __le32 nr_map_blocks; /* 40 */ 65 66 /* The number of blocks used for the block bitmaps */ 67 __le32 nr_bitmap_blocks; /* 44 */ 68 69 /* Checksum */ 70 __le32 crc; /* 48 */ 71 72 /* DM-Zoned label */ 73 u8 dmz_label[32]; /* 80 */ 74 75 /* DM-Zoned UUID */ 76 u8 dmz_uuid[16]; /* 96 */ 77 78 /* Device UUID */ 79 u8 dev_uuid[16]; /* 112 */ 80 81 /* Padding to full 512B sector */ 82 u8 reserved[400]; /* 512 */ 83 }; 84 85 /* 86 * Chunk mapping entry: entries are indexed by chunk number 87 * and give the zone ID (dzone_id) mapping the chunk on disk. 88 * This zone may be sequential or random. If it is a sequential 89 * zone, a second zone (bzone_id) used as a write buffer may 90 * also be specified. This second zone will always be a randomly 91 * writeable zone. 92 */ 93 struct dmz_map { 94 __le32 dzone_id; 95 __le32 bzone_id; 96 }; 97 98 /* 99 * Chunk mapping table metadata: 512 8-bytes entries per 4KB block. 100 */ 101 #define DMZ_MAP_ENTRIES (DMZ_BLOCK_SIZE / sizeof(struct dmz_map)) 102 #define DMZ_MAP_ENTRIES_SHIFT (ilog2(DMZ_MAP_ENTRIES)) 103 #define DMZ_MAP_ENTRIES_MASK (DMZ_MAP_ENTRIES - 1) 104 #define DMZ_MAP_UNMAPPED UINT_MAX 105 106 /* 107 * Meta data block descriptor (for cached metadata blocks). 108 */ 109 struct dmz_mblock { 110 struct rb_node node; 111 struct list_head link; 112 sector_t no; 113 unsigned int ref; 114 unsigned long state; 115 struct page *page; 116 void *data; 117 }; 118 119 /* 120 * Metadata block state flags. 121 */ 122 enum { 123 DMZ_META_DIRTY, 124 DMZ_META_READING, 125 DMZ_META_WRITING, 126 DMZ_META_ERROR, 127 }; 128 129 /* 130 * Super block information (one per metadata set). 131 */ 132 struct dmz_sb { 133 sector_t block; 134 struct dmz_dev *dev; 135 struct dmz_mblock *mblk; 136 struct dmz_super *sb; 137 struct dm_zone *zone; 138 }; 139 140 /* 141 * In-memory metadata. 142 */ 143 struct dmz_metadata { 144 struct dmz_dev *dev; 145 unsigned int nr_devs; 146 147 char devname[BDEVNAME_SIZE]; 148 char label[BDEVNAME_SIZE]; 149 uuid_t uuid; 150 151 sector_t zone_bitmap_size; 152 unsigned int zone_nr_bitmap_blocks; 153 unsigned int zone_bits_per_mblk; 154 155 sector_t zone_nr_blocks; 156 sector_t zone_nr_blocks_shift; 157 158 sector_t zone_nr_sectors; 159 sector_t zone_nr_sectors_shift; 160 161 unsigned int nr_bitmap_blocks; 162 unsigned int nr_map_blocks; 163 164 unsigned int nr_zones; 165 unsigned int nr_useable_zones; 166 unsigned int nr_meta_blocks; 167 unsigned int nr_meta_zones; 168 unsigned int nr_data_zones; 169 unsigned int nr_cache_zones; 170 unsigned int nr_rnd_zones; 171 unsigned int nr_reserved_seq; 172 unsigned int nr_chunks; 173 174 /* Zone information array */ 175 struct xarray zones; 176 177 struct dmz_sb sb[2]; 178 unsigned int mblk_primary; 179 unsigned int sb_version; 180 u64 sb_gen; 181 unsigned int min_nr_mblks; 182 unsigned int max_nr_mblks; 183 atomic_t nr_mblks; 184 struct rw_semaphore mblk_sem; 185 struct mutex mblk_flush_lock; 186 spinlock_t mblk_lock; 187 struct rb_root mblk_rbtree; 188 struct list_head mblk_lru_list; 189 struct list_head mblk_dirty_list; 190 struct shrinker mblk_shrinker; 191 192 /* Zone allocation management */ 193 struct mutex map_lock; 194 struct dmz_mblock **map_mblk; 195 196 unsigned int nr_cache; 197 atomic_t unmap_nr_cache; 198 struct list_head unmap_cache_list; 199 struct list_head map_cache_list; 200 201 atomic_t nr_reserved_seq_zones; 202 struct list_head reserved_seq_zones_list; 203 204 wait_queue_head_t free_wq; 205 }; 206 207 #define dmz_zmd_info(zmd, format, args...) \ 208 DMINFO("(%s): " format, (zmd)->label, ## args) 209 210 #define dmz_zmd_err(zmd, format, args...) \ 211 DMERR("(%s): " format, (zmd)->label, ## args) 212 213 #define dmz_zmd_warn(zmd, format, args...) \ 214 DMWARN("(%s): " format, (zmd)->label, ## args) 215 216 #define dmz_zmd_debug(zmd, format, args...) \ 217 DMDEBUG("(%s): " format, (zmd)->label, ## args) 218 /* 219 * Various accessors 220 */ 221 static unsigned int dmz_dev_zone_id(struct dmz_metadata *zmd, struct dm_zone *zone) 222 { 223 if (WARN_ON(!zone)) 224 return 0; 225 226 return zone->id - zone->dev->zone_offset; 227 } 228 229 sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone) 230 { 231 unsigned int zone_id = dmz_dev_zone_id(zmd, zone); 232 233 return (sector_t)zone_id << zmd->zone_nr_sectors_shift; 234 } 235 236 sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone) 237 { 238 unsigned int zone_id = dmz_dev_zone_id(zmd, zone); 239 240 return (sector_t)zone_id << zmd->zone_nr_blocks_shift; 241 } 242 243 unsigned int dmz_zone_nr_blocks(struct dmz_metadata *zmd) 244 { 245 return zmd->zone_nr_blocks; 246 } 247 248 unsigned int dmz_zone_nr_blocks_shift(struct dmz_metadata *zmd) 249 { 250 return zmd->zone_nr_blocks_shift; 251 } 252 253 unsigned int dmz_zone_nr_sectors(struct dmz_metadata *zmd) 254 { 255 return zmd->zone_nr_sectors; 256 } 257 258 unsigned int dmz_zone_nr_sectors_shift(struct dmz_metadata *zmd) 259 { 260 return zmd->zone_nr_sectors_shift; 261 } 262 263 unsigned int dmz_nr_zones(struct dmz_metadata *zmd) 264 { 265 return zmd->nr_zones; 266 } 267 268 unsigned int dmz_nr_chunks(struct dmz_metadata *zmd) 269 { 270 return zmd->nr_chunks; 271 } 272 273 unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd, int idx) 274 { 275 return zmd->dev[idx].nr_rnd; 276 } 277 278 unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd, int idx) 279 { 280 return atomic_read(&zmd->dev[idx].unmap_nr_rnd); 281 } 282 283 unsigned int dmz_nr_cache_zones(struct dmz_metadata *zmd) 284 { 285 return zmd->nr_cache; 286 } 287 288 unsigned int dmz_nr_unmap_cache_zones(struct dmz_metadata *zmd) 289 { 290 return atomic_read(&zmd->unmap_nr_cache); 291 } 292 293 unsigned int dmz_nr_seq_zones(struct dmz_metadata *zmd, int idx) 294 { 295 return zmd->dev[idx].nr_seq; 296 } 297 298 unsigned int dmz_nr_unmap_seq_zones(struct dmz_metadata *zmd, int idx) 299 { 300 return atomic_read(&zmd->dev[idx].unmap_nr_seq); 301 } 302 303 static struct dm_zone *dmz_get(struct dmz_metadata *zmd, unsigned int zone_id) 304 { 305 return xa_load(&zmd->zones, zone_id); 306 } 307 308 static struct dm_zone *dmz_insert(struct dmz_metadata *zmd, 309 unsigned int zone_id, struct dmz_dev *dev) 310 { 311 struct dm_zone *zone = kzalloc(sizeof(struct dm_zone), GFP_KERNEL); 312 313 if (!zone) 314 return ERR_PTR(-ENOMEM); 315 316 if (xa_insert(&zmd->zones, zone_id, zone, GFP_KERNEL)) { 317 kfree(zone); 318 return ERR_PTR(-EBUSY); 319 } 320 321 INIT_LIST_HEAD(&zone->link); 322 atomic_set(&zone->refcount, 0); 323 zone->id = zone_id; 324 zone->chunk = DMZ_MAP_UNMAPPED; 325 zone->dev = dev; 326 327 return zone; 328 } 329 330 const char *dmz_metadata_label(struct dmz_metadata *zmd) 331 { 332 return (const char *)zmd->label; 333 } 334 335 bool dmz_check_dev(struct dmz_metadata *zmd) 336 { 337 unsigned int i; 338 339 for (i = 0; i < zmd->nr_devs; i++) { 340 if (!dmz_check_bdev(&zmd->dev[i])) 341 return false; 342 } 343 return true; 344 } 345 346 bool dmz_dev_is_dying(struct dmz_metadata *zmd) 347 { 348 unsigned int i; 349 350 for (i = 0; i < zmd->nr_devs; i++) { 351 if (dmz_bdev_is_dying(&zmd->dev[i])) 352 return true; 353 } 354 return false; 355 } 356 357 /* 358 * Lock/unlock mapping table. 359 * The map lock also protects all the zone lists. 360 */ 361 void dmz_lock_map(struct dmz_metadata *zmd) 362 { 363 mutex_lock(&zmd->map_lock); 364 } 365 366 void dmz_unlock_map(struct dmz_metadata *zmd) 367 { 368 mutex_unlock(&zmd->map_lock); 369 } 370 371 /* 372 * Lock/unlock metadata access. This is a "read" lock on a semaphore 373 * that prevents metadata flush from running while metadata are being 374 * modified. The actual metadata write mutual exclusion is achieved with 375 * the map lock and zone state management (active and reclaim state are 376 * mutually exclusive). 377 */ 378 void dmz_lock_metadata(struct dmz_metadata *zmd) 379 { 380 down_read(&zmd->mblk_sem); 381 } 382 383 void dmz_unlock_metadata(struct dmz_metadata *zmd) 384 { 385 up_read(&zmd->mblk_sem); 386 } 387 388 /* 389 * Lock/unlock flush: prevent concurrent executions 390 * of dmz_flush_metadata as well as metadata modification in reclaim 391 * while flush is being executed. 392 */ 393 void dmz_lock_flush(struct dmz_metadata *zmd) 394 { 395 mutex_lock(&zmd->mblk_flush_lock); 396 } 397 398 void dmz_unlock_flush(struct dmz_metadata *zmd) 399 { 400 mutex_unlock(&zmd->mblk_flush_lock); 401 } 402 403 /* 404 * Allocate a metadata block. 405 */ 406 static struct dmz_mblock *dmz_alloc_mblock(struct dmz_metadata *zmd, 407 sector_t mblk_no) 408 { 409 struct dmz_mblock *mblk = NULL; 410 411 /* See if we can reuse cached blocks */ 412 if (zmd->max_nr_mblks && atomic_read(&zmd->nr_mblks) > zmd->max_nr_mblks) { 413 spin_lock(&zmd->mblk_lock); 414 mblk = list_first_entry_or_null(&zmd->mblk_lru_list, 415 struct dmz_mblock, link); 416 if (mblk) { 417 list_del_init(&mblk->link); 418 rb_erase(&mblk->node, &zmd->mblk_rbtree); 419 mblk->no = mblk_no; 420 } 421 spin_unlock(&zmd->mblk_lock); 422 if (mblk) 423 return mblk; 424 } 425 426 /* Allocate a new block */ 427 mblk = kmalloc(sizeof(struct dmz_mblock), GFP_NOIO); 428 if (!mblk) 429 return NULL; 430 431 mblk->page = alloc_page(GFP_NOIO); 432 if (!mblk->page) { 433 kfree(mblk); 434 return NULL; 435 } 436 437 RB_CLEAR_NODE(&mblk->node); 438 INIT_LIST_HEAD(&mblk->link); 439 mblk->ref = 0; 440 mblk->state = 0; 441 mblk->no = mblk_no; 442 mblk->data = page_address(mblk->page); 443 444 atomic_inc(&zmd->nr_mblks); 445 446 return mblk; 447 } 448 449 /* 450 * Free a metadata block. 451 */ 452 static void dmz_free_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk) 453 { 454 __free_pages(mblk->page, 0); 455 kfree(mblk); 456 457 atomic_dec(&zmd->nr_mblks); 458 } 459 460 /* 461 * Insert a metadata block in the rbtree. 462 */ 463 static void dmz_insert_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk) 464 { 465 struct rb_root *root = &zmd->mblk_rbtree; 466 struct rb_node **new = &(root->rb_node), *parent = NULL; 467 struct dmz_mblock *b; 468 469 /* Figure out where to put the new node */ 470 while (*new) { 471 b = container_of(*new, struct dmz_mblock, node); 472 parent = *new; 473 new = (b->no < mblk->no) ? &((*new)->rb_left) : &((*new)->rb_right); 474 } 475 476 /* Add new node and rebalance tree */ 477 rb_link_node(&mblk->node, parent, new); 478 rb_insert_color(&mblk->node, root); 479 } 480 481 /* 482 * Lookup a metadata block in the rbtree. If the block is found, increment 483 * its reference count. 484 */ 485 static struct dmz_mblock *dmz_get_mblock_fast(struct dmz_metadata *zmd, 486 sector_t mblk_no) 487 { 488 struct rb_root *root = &zmd->mblk_rbtree; 489 struct rb_node *node = root->rb_node; 490 struct dmz_mblock *mblk; 491 492 while (node) { 493 mblk = container_of(node, struct dmz_mblock, node); 494 if (mblk->no == mblk_no) { 495 /* 496 * If this is the first reference to the block, 497 * remove it from the LRU list. 498 */ 499 mblk->ref++; 500 if (mblk->ref == 1 && 501 !test_bit(DMZ_META_DIRTY, &mblk->state)) 502 list_del_init(&mblk->link); 503 return mblk; 504 } 505 node = (mblk->no < mblk_no) ? node->rb_left : node->rb_right; 506 } 507 508 return NULL; 509 } 510 511 /* 512 * Metadata block BIO end callback. 513 */ 514 static void dmz_mblock_bio_end_io(struct bio *bio) 515 { 516 struct dmz_mblock *mblk = bio->bi_private; 517 int flag; 518 519 if (bio->bi_status) 520 set_bit(DMZ_META_ERROR, &mblk->state); 521 522 if (bio_op(bio) == REQ_OP_WRITE) 523 flag = DMZ_META_WRITING; 524 else 525 flag = DMZ_META_READING; 526 527 clear_bit_unlock(flag, &mblk->state); 528 smp_mb__after_atomic(); 529 wake_up_bit(&mblk->state, flag); 530 531 bio_put(bio); 532 } 533 534 /* 535 * Read an uncached metadata block from disk and add it to the cache. 536 */ 537 static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd, 538 sector_t mblk_no) 539 { 540 struct dmz_mblock *mblk, *m; 541 sector_t block = zmd->sb[zmd->mblk_primary].block + mblk_no; 542 struct dmz_dev *dev = zmd->sb[zmd->mblk_primary].dev; 543 struct bio *bio; 544 545 if (dmz_bdev_is_dying(dev)) 546 return ERR_PTR(-EIO); 547 548 /* Get a new block and a BIO to read it */ 549 mblk = dmz_alloc_mblock(zmd, mblk_no); 550 if (!mblk) 551 return ERR_PTR(-ENOMEM); 552 553 bio = bio_alloc(GFP_NOIO, 1); 554 if (!bio) { 555 dmz_free_mblock(zmd, mblk); 556 return ERR_PTR(-ENOMEM); 557 } 558 559 spin_lock(&zmd->mblk_lock); 560 561 /* 562 * Make sure that another context did not start reading 563 * the block already. 564 */ 565 m = dmz_get_mblock_fast(zmd, mblk_no); 566 if (m) { 567 spin_unlock(&zmd->mblk_lock); 568 dmz_free_mblock(zmd, mblk); 569 bio_put(bio); 570 return m; 571 } 572 573 mblk->ref++; 574 set_bit(DMZ_META_READING, &mblk->state); 575 dmz_insert_mblock(zmd, mblk); 576 577 spin_unlock(&zmd->mblk_lock); 578 579 /* Submit read BIO */ 580 bio->bi_iter.bi_sector = dmz_blk2sect(block); 581 bio_set_dev(bio, dev->bdev); 582 bio->bi_private = mblk; 583 bio->bi_end_io = dmz_mblock_bio_end_io; 584 bio_set_op_attrs(bio, REQ_OP_READ, REQ_META | REQ_PRIO); 585 bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0); 586 submit_bio(bio); 587 588 return mblk; 589 } 590 591 /* 592 * Free metadata blocks. 593 */ 594 static unsigned long dmz_shrink_mblock_cache(struct dmz_metadata *zmd, 595 unsigned long limit) 596 { 597 struct dmz_mblock *mblk; 598 unsigned long count = 0; 599 600 if (!zmd->max_nr_mblks) 601 return 0; 602 603 while (!list_empty(&zmd->mblk_lru_list) && 604 atomic_read(&zmd->nr_mblks) > zmd->min_nr_mblks && 605 count < limit) { 606 mblk = list_first_entry(&zmd->mblk_lru_list, 607 struct dmz_mblock, link); 608 list_del_init(&mblk->link); 609 rb_erase(&mblk->node, &zmd->mblk_rbtree); 610 dmz_free_mblock(zmd, mblk); 611 count++; 612 } 613 614 return count; 615 } 616 617 /* 618 * For mblock shrinker: get the number of unused metadata blocks in the cache. 619 */ 620 static unsigned long dmz_mblock_shrinker_count(struct shrinker *shrink, 621 struct shrink_control *sc) 622 { 623 struct dmz_metadata *zmd = container_of(shrink, struct dmz_metadata, mblk_shrinker); 624 625 return atomic_read(&zmd->nr_mblks); 626 } 627 628 /* 629 * For mblock shrinker: scan unused metadata blocks and shrink the cache. 630 */ 631 static unsigned long dmz_mblock_shrinker_scan(struct shrinker *shrink, 632 struct shrink_control *sc) 633 { 634 struct dmz_metadata *zmd = container_of(shrink, struct dmz_metadata, mblk_shrinker); 635 unsigned long count; 636 637 spin_lock(&zmd->mblk_lock); 638 count = dmz_shrink_mblock_cache(zmd, sc->nr_to_scan); 639 spin_unlock(&zmd->mblk_lock); 640 641 return count ? count : SHRINK_STOP; 642 } 643 644 /* 645 * Release a metadata block. 646 */ 647 static void dmz_release_mblock(struct dmz_metadata *zmd, 648 struct dmz_mblock *mblk) 649 { 650 651 if (!mblk) 652 return; 653 654 spin_lock(&zmd->mblk_lock); 655 656 mblk->ref--; 657 if (mblk->ref == 0) { 658 if (test_bit(DMZ_META_ERROR, &mblk->state)) { 659 rb_erase(&mblk->node, &zmd->mblk_rbtree); 660 dmz_free_mblock(zmd, mblk); 661 } else if (!test_bit(DMZ_META_DIRTY, &mblk->state)) { 662 list_add_tail(&mblk->link, &zmd->mblk_lru_list); 663 dmz_shrink_mblock_cache(zmd, 1); 664 } 665 } 666 667 spin_unlock(&zmd->mblk_lock); 668 } 669 670 /* 671 * Get a metadata block from the rbtree. If the block 672 * is not present, read it from disk. 673 */ 674 static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd, 675 sector_t mblk_no) 676 { 677 struct dmz_mblock *mblk; 678 struct dmz_dev *dev = zmd->sb[zmd->mblk_primary].dev; 679 680 /* Check rbtree */ 681 spin_lock(&zmd->mblk_lock); 682 mblk = dmz_get_mblock_fast(zmd, mblk_no); 683 spin_unlock(&zmd->mblk_lock); 684 685 if (!mblk) { 686 /* Cache miss: read the block from disk */ 687 mblk = dmz_get_mblock_slow(zmd, mblk_no); 688 if (IS_ERR(mblk)) 689 return mblk; 690 } 691 692 /* Wait for on-going read I/O and check for error */ 693 wait_on_bit_io(&mblk->state, DMZ_META_READING, 694 TASK_UNINTERRUPTIBLE); 695 if (test_bit(DMZ_META_ERROR, &mblk->state)) { 696 dmz_release_mblock(zmd, mblk); 697 dmz_check_bdev(dev); 698 return ERR_PTR(-EIO); 699 } 700 701 return mblk; 702 } 703 704 /* 705 * Mark a metadata block dirty. 706 */ 707 static void dmz_dirty_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk) 708 { 709 spin_lock(&zmd->mblk_lock); 710 if (!test_and_set_bit(DMZ_META_DIRTY, &mblk->state)) 711 list_add_tail(&mblk->link, &zmd->mblk_dirty_list); 712 spin_unlock(&zmd->mblk_lock); 713 } 714 715 /* 716 * Issue a metadata block write BIO. 717 */ 718 static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk, 719 unsigned int set) 720 { 721 struct dmz_dev *dev = zmd->sb[set].dev; 722 sector_t block = zmd->sb[set].block + mblk->no; 723 struct bio *bio; 724 725 if (dmz_bdev_is_dying(dev)) 726 return -EIO; 727 728 bio = bio_alloc(GFP_NOIO, 1); 729 if (!bio) { 730 set_bit(DMZ_META_ERROR, &mblk->state); 731 return -ENOMEM; 732 } 733 734 set_bit(DMZ_META_WRITING, &mblk->state); 735 736 bio->bi_iter.bi_sector = dmz_blk2sect(block); 737 bio_set_dev(bio, dev->bdev); 738 bio->bi_private = mblk; 739 bio->bi_end_io = dmz_mblock_bio_end_io; 740 bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META | REQ_PRIO); 741 bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0); 742 submit_bio(bio); 743 744 return 0; 745 } 746 747 /* 748 * Read/write a metadata block. 749 */ 750 static int dmz_rdwr_block(struct dmz_dev *dev, int op, 751 sector_t block, struct page *page) 752 { 753 struct bio *bio; 754 int ret; 755 756 if (WARN_ON(!dev)) 757 return -EIO; 758 759 if (dmz_bdev_is_dying(dev)) 760 return -EIO; 761 762 bio = bio_alloc(GFP_NOIO, 1); 763 if (!bio) 764 return -ENOMEM; 765 766 bio->bi_iter.bi_sector = dmz_blk2sect(block); 767 bio_set_dev(bio, dev->bdev); 768 bio_set_op_attrs(bio, op, REQ_SYNC | REQ_META | REQ_PRIO); 769 bio_add_page(bio, page, DMZ_BLOCK_SIZE, 0); 770 ret = submit_bio_wait(bio); 771 bio_put(bio); 772 773 if (ret) 774 dmz_check_bdev(dev); 775 return ret; 776 } 777 778 /* 779 * Write super block of the specified metadata set. 780 */ 781 static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set) 782 { 783 struct dmz_mblock *mblk = zmd->sb[set].mblk; 784 struct dmz_super *sb = zmd->sb[set].sb; 785 struct dmz_dev *dev = zmd->sb[set].dev; 786 sector_t sb_block; 787 u64 sb_gen = zmd->sb_gen + 1; 788 int ret; 789 790 sb->magic = cpu_to_le32(DMZ_MAGIC); 791 792 sb->version = cpu_to_le32(zmd->sb_version); 793 if (zmd->sb_version > 1) { 794 BUILD_BUG_ON(UUID_SIZE != 16); 795 export_uuid(sb->dmz_uuid, &zmd->uuid); 796 memcpy(sb->dmz_label, zmd->label, BDEVNAME_SIZE); 797 export_uuid(sb->dev_uuid, &dev->uuid); 798 } 799 800 sb->gen = cpu_to_le64(sb_gen); 801 802 /* 803 * The metadata always references the absolute block address, 804 * ie relative to the entire block range, not the per-device 805 * block address. 806 */ 807 sb_block = zmd->sb[set].zone->id << zmd->zone_nr_blocks_shift; 808 sb->sb_block = cpu_to_le64(sb_block); 809 sb->nr_meta_blocks = cpu_to_le32(zmd->nr_meta_blocks); 810 sb->nr_reserved_seq = cpu_to_le32(zmd->nr_reserved_seq); 811 sb->nr_chunks = cpu_to_le32(zmd->nr_chunks); 812 813 sb->nr_map_blocks = cpu_to_le32(zmd->nr_map_blocks); 814 sb->nr_bitmap_blocks = cpu_to_le32(zmd->nr_bitmap_blocks); 815 816 sb->crc = 0; 817 sb->crc = cpu_to_le32(crc32_le(sb_gen, (unsigned char *)sb, DMZ_BLOCK_SIZE)); 818 819 ret = dmz_rdwr_block(dev, REQ_OP_WRITE, zmd->sb[set].block, 820 mblk->page); 821 if (ret == 0) 822 ret = blkdev_issue_flush(dev->bdev); 823 824 return ret; 825 } 826 827 /* 828 * Write dirty metadata blocks to the specified set. 829 */ 830 static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd, 831 struct list_head *write_list, 832 unsigned int set) 833 { 834 struct dmz_mblock *mblk; 835 struct dmz_dev *dev = zmd->sb[set].dev; 836 struct blk_plug plug; 837 int ret = 0, nr_mblks_submitted = 0; 838 839 /* Issue writes */ 840 blk_start_plug(&plug); 841 list_for_each_entry(mblk, write_list, link) { 842 ret = dmz_write_mblock(zmd, mblk, set); 843 if (ret) 844 break; 845 nr_mblks_submitted++; 846 } 847 blk_finish_plug(&plug); 848 849 /* Wait for completion */ 850 list_for_each_entry(mblk, write_list, link) { 851 if (!nr_mblks_submitted) 852 break; 853 wait_on_bit_io(&mblk->state, DMZ_META_WRITING, 854 TASK_UNINTERRUPTIBLE); 855 if (test_bit(DMZ_META_ERROR, &mblk->state)) { 856 clear_bit(DMZ_META_ERROR, &mblk->state); 857 dmz_check_bdev(dev); 858 ret = -EIO; 859 } 860 nr_mblks_submitted--; 861 } 862 863 /* Flush drive cache (this will also sync data) */ 864 if (ret == 0) 865 ret = blkdev_issue_flush(dev->bdev); 866 867 return ret; 868 } 869 870 /* 871 * Log dirty metadata blocks. 872 */ 873 static int dmz_log_dirty_mblocks(struct dmz_metadata *zmd, 874 struct list_head *write_list) 875 { 876 unsigned int log_set = zmd->mblk_primary ^ 0x1; 877 int ret; 878 879 /* Write dirty blocks to the log */ 880 ret = dmz_write_dirty_mblocks(zmd, write_list, log_set); 881 if (ret) 882 return ret; 883 884 /* 885 * No error so far: now validate the log by updating the 886 * log index super block generation. 887 */ 888 ret = dmz_write_sb(zmd, log_set); 889 if (ret) 890 return ret; 891 892 return 0; 893 } 894 895 /* 896 * Flush dirty metadata blocks. 897 */ 898 int dmz_flush_metadata(struct dmz_metadata *zmd) 899 { 900 struct dmz_mblock *mblk; 901 struct list_head write_list; 902 struct dmz_dev *dev; 903 int ret; 904 905 if (WARN_ON(!zmd)) 906 return 0; 907 908 INIT_LIST_HEAD(&write_list); 909 910 /* 911 * Make sure that metadata blocks are stable before logging: take 912 * the write lock on the metadata semaphore to prevent target BIOs 913 * from modifying metadata. 914 */ 915 down_write(&zmd->mblk_sem); 916 dev = zmd->sb[zmd->mblk_primary].dev; 917 918 /* 919 * This is called from the target flush work and reclaim work. 920 * Concurrent execution is not allowed. 921 */ 922 dmz_lock_flush(zmd); 923 924 if (dmz_bdev_is_dying(dev)) { 925 ret = -EIO; 926 goto out; 927 } 928 929 /* Get dirty blocks */ 930 spin_lock(&zmd->mblk_lock); 931 list_splice_init(&zmd->mblk_dirty_list, &write_list); 932 spin_unlock(&zmd->mblk_lock); 933 934 /* If there are no dirty metadata blocks, just flush the device cache */ 935 if (list_empty(&write_list)) { 936 ret = blkdev_issue_flush(dev->bdev); 937 goto err; 938 } 939 940 /* 941 * The primary metadata set is still clean. Keep it this way until 942 * all updates are successful in the secondary set. That is, use 943 * the secondary set as a log. 944 */ 945 ret = dmz_log_dirty_mblocks(zmd, &write_list); 946 if (ret) 947 goto err; 948 949 /* 950 * The log is on disk. It is now safe to update in place 951 * in the primary metadata set. 952 */ 953 ret = dmz_write_dirty_mblocks(zmd, &write_list, zmd->mblk_primary); 954 if (ret) 955 goto err; 956 957 ret = dmz_write_sb(zmd, zmd->mblk_primary); 958 if (ret) 959 goto err; 960 961 while (!list_empty(&write_list)) { 962 mblk = list_first_entry(&write_list, struct dmz_mblock, link); 963 list_del_init(&mblk->link); 964 965 spin_lock(&zmd->mblk_lock); 966 clear_bit(DMZ_META_DIRTY, &mblk->state); 967 if (mblk->ref == 0) 968 list_add_tail(&mblk->link, &zmd->mblk_lru_list); 969 spin_unlock(&zmd->mblk_lock); 970 } 971 972 zmd->sb_gen++; 973 out: 974 dmz_unlock_flush(zmd); 975 up_write(&zmd->mblk_sem); 976 977 return ret; 978 979 err: 980 if (!list_empty(&write_list)) { 981 spin_lock(&zmd->mblk_lock); 982 list_splice(&write_list, &zmd->mblk_dirty_list); 983 spin_unlock(&zmd->mblk_lock); 984 } 985 if (!dmz_check_bdev(dev)) 986 ret = -EIO; 987 goto out; 988 } 989 990 /* 991 * Check super block. 992 */ 993 static int dmz_check_sb(struct dmz_metadata *zmd, struct dmz_sb *dsb, 994 bool tertiary) 995 { 996 struct dmz_super *sb = dsb->sb; 997 struct dmz_dev *dev = dsb->dev; 998 unsigned int nr_meta_zones, nr_data_zones; 999 u32 crc, stored_crc; 1000 u64 gen, sb_block; 1001 1002 if (le32_to_cpu(sb->magic) != DMZ_MAGIC) { 1003 dmz_dev_err(dev, "Invalid meta magic (needed 0x%08x, got 0x%08x)", 1004 DMZ_MAGIC, le32_to_cpu(sb->magic)); 1005 return -ENXIO; 1006 } 1007 1008 zmd->sb_version = le32_to_cpu(sb->version); 1009 if (zmd->sb_version > DMZ_META_VER) { 1010 dmz_dev_err(dev, "Invalid meta version (needed %d, got %d)", 1011 DMZ_META_VER, zmd->sb_version); 1012 return -EINVAL; 1013 } 1014 if (zmd->sb_version < 2 && tertiary) { 1015 dmz_dev_err(dev, "Tertiary superblocks are not supported"); 1016 return -EINVAL; 1017 } 1018 1019 gen = le64_to_cpu(sb->gen); 1020 stored_crc = le32_to_cpu(sb->crc); 1021 sb->crc = 0; 1022 crc = crc32_le(gen, (unsigned char *)sb, DMZ_BLOCK_SIZE); 1023 if (crc != stored_crc) { 1024 dmz_dev_err(dev, "Invalid checksum (needed 0x%08x, got 0x%08x)", 1025 crc, stored_crc); 1026 return -ENXIO; 1027 } 1028 1029 sb_block = le64_to_cpu(sb->sb_block); 1030 if (sb_block != (u64)dsb->zone->id << zmd->zone_nr_blocks_shift ) { 1031 dmz_dev_err(dev, "Invalid superblock position " 1032 "(is %llu expected %llu)", 1033 sb_block, 1034 (u64)dsb->zone->id << zmd->zone_nr_blocks_shift); 1035 return -EINVAL; 1036 } 1037 if (zmd->sb_version > 1) { 1038 uuid_t sb_uuid; 1039 1040 import_uuid(&sb_uuid, sb->dmz_uuid); 1041 if (uuid_is_null(&sb_uuid)) { 1042 dmz_dev_err(dev, "NULL DM-Zoned uuid"); 1043 return -ENXIO; 1044 } else if (uuid_is_null(&zmd->uuid)) { 1045 uuid_copy(&zmd->uuid, &sb_uuid); 1046 } else if (!uuid_equal(&zmd->uuid, &sb_uuid)) { 1047 dmz_dev_err(dev, "mismatching DM-Zoned uuid, " 1048 "is %pUl expected %pUl", 1049 &sb_uuid, &zmd->uuid); 1050 return -ENXIO; 1051 } 1052 if (!strlen(zmd->label)) 1053 memcpy(zmd->label, sb->dmz_label, BDEVNAME_SIZE); 1054 else if (memcmp(zmd->label, sb->dmz_label, BDEVNAME_SIZE)) { 1055 dmz_dev_err(dev, "mismatching DM-Zoned label, " 1056 "is %s expected %s", 1057 sb->dmz_label, zmd->label); 1058 return -ENXIO; 1059 } 1060 import_uuid(&dev->uuid, sb->dev_uuid); 1061 if (uuid_is_null(&dev->uuid)) { 1062 dmz_dev_err(dev, "NULL device uuid"); 1063 return -ENXIO; 1064 } 1065 1066 if (tertiary) { 1067 /* 1068 * Generation number should be 0, but it doesn't 1069 * really matter if it isn't. 1070 */ 1071 if (gen != 0) 1072 dmz_dev_warn(dev, "Invalid generation %llu", 1073 gen); 1074 return 0; 1075 } 1076 } 1077 1078 nr_meta_zones = (le32_to_cpu(sb->nr_meta_blocks) + zmd->zone_nr_blocks - 1) 1079 >> zmd->zone_nr_blocks_shift; 1080 if (!nr_meta_zones || 1081 (zmd->nr_devs <= 1 && nr_meta_zones >= zmd->nr_rnd_zones) || 1082 (zmd->nr_devs > 1 && nr_meta_zones >= zmd->nr_cache_zones)) { 1083 dmz_dev_err(dev, "Invalid number of metadata blocks"); 1084 return -ENXIO; 1085 } 1086 1087 if (!le32_to_cpu(sb->nr_reserved_seq) || 1088 le32_to_cpu(sb->nr_reserved_seq) >= (zmd->nr_useable_zones - nr_meta_zones)) { 1089 dmz_dev_err(dev, "Invalid number of reserved sequential zones"); 1090 return -ENXIO; 1091 } 1092 1093 nr_data_zones = zmd->nr_useable_zones - 1094 (nr_meta_zones * 2 + le32_to_cpu(sb->nr_reserved_seq)); 1095 if (le32_to_cpu(sb->nr_chunks) > nr_data_zones) { 1096 dmz_dev_err(dev, "Invalid number of chunks %u / %u", 1097 le32_to_cpu(sb->nr_chunks), nr_data_zones); 1098 return -ENXIO; 1099 } 1100 1101 /* OK */ 1102 zmd->nr_meta_blocks = le32_to_cpu(sb->nr_meta_blocks); 1103 zmd->nr_reserved_seq = le32_to_cpu(sb->nr_reserved_seq); 1104 zmd->nr_chunks = le32_to_cpu(sb->nr_chunks); 1105 zmd->nr_map_blocks = le32_to_cpu(sb->nr_map_blocks); 1106 zmd->nr_bitmap_blocks = le32_to_cpu(sb->nr_bitmap_blocks); 1107 zmd->nr_meta_zones = nr_meta_zones; 1108 zmd->nr_data_zones = nr_data_zones; 1109 1110 return 0; 1111 } 1112 1113 /* 1114 * Read the first or second super block from disk. 1115 */ 1116 static int dmz_read_sb(struct dmz_metadata *zmd, struct dmz_sb *sb, int set) 1117 { 1118 dmz_zmd_debug(zmd, "read superblock set %d dev %s block %llu", 1119 set, sb->dev->name, sb->block); 1120 1121 return dmz_rdwr_block(sb->dev, REQ_OP_READ, 1122 sb->block, sb->mblk->page); 1123 } 1124 1125 /* 1126 * Determine the position of the secondary super blocks on disk. 1127 * This is used only if a corruption of the primary super block 1128 * is detected. 1129 */ 1130 static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd) 1131 { 1132 unsigned int zone_nr_blocks = zmd->zone_nr_blocks; 1133 struct dmz_mblock *mblk; 1134 unsigned int zone_id = zmd->sb[0].zone->id; 1135 int i; 1136 1137 /* Allocate a block */ 1138 mblk = dmz_alloc_mblock(zmd, 0); 1139 if (!mblk) 1140 return -ENOMEM; 1141 1142 zmd->sb[1].mblk = mblk; 1143 zmd->sb[1].sb = mblk->data; 1144 1145 /* Bad first super block: search for the second one */ 1146 zmd->sb[1].block = zmd->sb[0].block + zone_nr_blocks; 1147 zmd->sb[1].zone = dmz_get(zmd, zone_id + 1); 1148 zmd->sb[1].dev = zmd->sb[0].dev; 1149 for (i = 1; i < zmd->nr_rnd_zones; i++) { 1150 if (dmz_read_sb(zmd, &zmd->sb[1], 1) != 0) 1151 break; 1152 if (le32_to_cpu(zmd->sb[1].sb->magic) == DMZ_MAGIC) 1153 return 0; 1154 zmd->sb[1].block += zone_nr_blocks; 1155 zmd->sb[1].zone = dmz_get(zmd, zone_id + i); 1156 } 1157 1158 dmz_free_mblock(zmd, mblk); 1159 zmd->sb[1].mblk = NULL; 1160 zmd->sb[1].zone = NULL; 1161 zmd->sb[1].dev = NULL; 1162 1163 return -EIO; 1164 } 1165 1166 /* 1167 * Read a super block from disk. 1168 */ 1169 static int dmz_get_sb(struct dmz_metadata *zmd, struct dmz_sb *sb, int set) 1170 { 1171 struct dmz_mblock *mblk; 1172 int ret; 1173 1174 /* Allocate a block */ 1175 mblk = dmz_alloc_mblock(zmd, 0); 1176 if (!mblk) 1177 return -ENOMEM; 1178 1179 sb->mblk = mblk; 1180 sb->sb = mblk->data; 1181 1182 /* Read super block */ 1183 ret = dmz_read_sb(zmd, sb, set); 1184 if (ret) { 1185 dmz_free_mblock(zmd, mblk); 1186 sb->mblk = NULL; 1187 return ret; 1188 } 1189 1190 return 0; 1191 } 1192 1193 /* 1194 * Recover a metadata set. 1195 */ 1196 static int dmz_recover_mblocks(struct dmz_metadata *zmd, unsigned int dst_set) 1197 { 1198 unsigned int src_set = dst_set ^ 0x1; 1199 struct page *page; 1200 int i, ret; 1201 1202 dmz_dev_warn(zmd->sb[dst_set].dev, 1203 "Metadata set %u invalid: recovering", dst_set); 1204 1205 if (dst_set == 0) 1206 zmd->sb[0].block = dmz_start_block(zmd, zmd->sb[0].zone); 1207 else 1208 zmd->sb[1].block = dmz_start_block(zmd, zmd->sb[1].zone); 1209 1210 page = alloc_page(GFP_NOIO); 1211 if (!page) 1212 return -ENOMEM; 1213 1214 /* Copy metadata blocks */ 1215 for (i = 1; i < zmd->nr_meta_blocks; i++) { 1216 ret = dmz_rdwr_block(zmd->sb[src_set].dev, REQ_OP_READ, 1217 zmd->sb[src_set].block + i, page); 1218 if (ret) 1219 goto out; 1220 ret = dmz_rdwr_block(zmd->sb[dst_set].dev, REQ_OP_WRITE, 1221 zmd->sb[dst_set].block + i, page); 1222 if (ret) 1223 goto out; 1224 } 1225 1226 /* Finalize with the super block */ 1227 if (!zmd->sb[dst_set].mblk) { 1228 zmd->sb[dst_set].mblk = dmz_alloc_mblock(zmd, 0); 1229 if (!zmd->sb[dst_set].mblk) { 1230 ret = -ENOMEM; 1231 goto out; 1232 } 1233 zmd->sb[dst_set].sb = zmd->sb[dst_set].mblk->data; 1234 } 1235 1236 ret = dmz_write_sb(zmd, dst_set); 1237 out: 1238 __free_pages(page, 0); 1239 1240 return ret; 1241 } 1242 1243 /* 1244 * Get super block from disk. 1245 */ 1246 static int dmz_load_sb(struct dmz_metadata *zmd) 1247 { 1248 bool sb_good[2] = {false, false}; 1249 u64 sb_gen[2] = {0, 0}; 1250 int ret; 1251 1252 if (!zmd->sb[0].zone) { 1253 dmz_zmd_err(zmd, "Primary super block zone not set"); 1254 return -ENXIO; 1255 } 1256 1257 /* Read and check the primary super block */ 1258 zmd->sb[0].block = dmz_start_block(zmd, zmd->sb[0].zone); 1259 zmd->sb[0].dev = zmd->sb[0].zone->dev; 1260 ret = dmz_get_sb(zmd, &zmd->sb[0], 0); 1261 if (ret) { 1262 dmz_dev_err(zmd->sb[0].dev, "Read primary super block failed"); 1263 return ret; 1264 } 1265 1266 ret = dmz_check_sb(zmd, &zmd->sb[0], false); 1267 1268 /* Read and check secondary super block */ 1269 if (ret == 0) { 1270 sb_good[0] = true; 1271 if (!zmd->sb[1].zone) { 1272 unsigned int zone_id = 1273 zmd->sb[0].zone->id + zmd->nr_meta_zones; 1274 1275 zmd->sb[1].zone = dmz_get(zmd, zone_id); 1276 } 1277 zmd->sb[1].block = dmz_start_block(zmd, zmd->sb[1].zone); 1278 zmd->sb[1].dev = zmd->sb[0].dev; 1279 ret = dmz_get_sb(zmd, &zmd->sb[1], 1); 1280 } else 1281 ret = dmz_lookup_secondary_sb(zmd); 1282 1283 if (ret) { 1284 dmz_dev_err(zmd->sb[1].dev, "Read secondary super block failed"); 1285 return ret; 1286 } 1287 1288 ret = dmz_check_sb(zmd, &zmd->sb[1], false); 1289 if (ret == 0) 1290 sb_good[1] = true; 1291 1292 /* Use highest generation sb first */ 1293 if (!sb_good[0] && !sb_good[1]) { 1294 dmz_zmd_err(zmd, "No valid super block found"); 1295 return -EIO; 1296 } 1297 1298 if (sb_good[0]) 1299 sb_gen[0] = le64_to_cpu(zmd->sb[0].sb->gen); 1300 else { 1301 ret = dmz_recover_mblocks(zmd, 0); 1302 if (ret) { 1303 dmz_dev_err(zmd->sb[0].dev, 1304 "Recovery of superblock 0 failed"); 1305 return -EIO; 1306 } 1307 } 1308 1309 if (sb_good[1]) 1310 sb_gen[1] = le64_to_cpu(zmd->sb[1].sb->gen); 1311 else { 1312 ret = dmz_recover_mblocks(zmd, 1); 1313 1314 if (ret) { 1315 dmz_dev_err(zmd->sb[1].dev, 1316 "Recovery of superblock 1 failed"); 1317 return -EIO; 1318 } 1319 } 1320 1321 if (sb_gen[0] >= sb_gen[1]) { 1322 zmd->sb_gen = sb_gen[0]; 1323 zmd->mblk_primary = 0; 1324 } else { 1325 zmd->sb_gen = sb_gen[1]; 1326 zmd->mblk_primary = 1; 1327 } 1328 1329 dmz_dev_debug(zmd->sb[zmd->mblk_primary].dev, 1330 "Using super block %u (gen %llu)", 1331 zmd->mblk_primary, zmd->sb_gen); 1332 1333 if (zmd->sb_version > 1) { 1334 int i; 1335 struct dmz_sb *sb; 1336 1337 sb = kzalloc(sizeof(struct dmz_sb), GFP_KERNEL); 1338 if (!sb) 1339 return -ENOMEM; 1340 for (i = 1; i < zmd->nr_devs; i++) { 1341 sb->block = 0; 1342 sb->zone = dmz_get(zmd, zmd->dev[i].zone_offset); 1343 sb->dev = &zmd->dev[i]; 1344 if (!dmz_is_meta(sb->zone)) { 1345 dmz_dev_err(sb->dev, 1346 "Tertiary super block zone %u not marked as metadata zone", 1347 sb->zone->id); 1348 ret = -EINVAL; 1349 goto out_kfree; 1350 } 1351 ret = dmz_get_sb(zmd, sb, i + 1); 1352 if (ret) { 1353 dmz_dev_err(sb->dev, 1354 "Read tertiary super block failed"); 1355 dmz_free_mblock(zmd, sb->mblk); 1356 goto out_kfree; 1357 } 1358 ret = dmz_check_sb(zmd, sb, true); 1359 dmz_free_mblock(zmd, sb->mblk); 1360 if (ret == -EINVAL) 1361 goto out_kfree; 1362 } 1363 out_kfree: 1364 kfree(sb); 1365 } 1366 return ret; 1367 } 1368 1369 /* 1370 * Initialize a zone descriptor. 1371 */ 1372 static int dmz_init_zone(struct blk_zone *blkz, unsigned int num, void *data) 1373 { 1374 struct dmz_dev *dev = data; 1375 struct dmz_metadata *zmd = dev->metadata; 1376 int idx = num + dev->zone_offset; 1377 struct dm_zone *zone; 1378 1379 zone = dmz_insert(zmd, idx, dev); 1380 if (IS_ERR(zone)) 1381 return PTR_ERR(zone); 1382 1383 if (blkz->len != zmd->zone_nr_sectors) { 1384 if (zmd->sb_version > 1) { 1385 /* Ignore the eventual runt (smaller) zone */ 1386 set_bit(DMZ_OFFLINE, &zone->flags); 1387 return 0; 1388 } else if (blkz->start + blkz->len == dev->capacity) 1389 return 0; 1390 return -ENXIO; 1391 } 1392 1393 switch (blkz->type) { 1394 case BLK_ZONE_TYPE_CONVENTIONAL: 1395 set_bit(DMZ_RND, &zone->flags); 1396 break; 1397 case BLK_ZONE_TYPE_SEQWRITE_REQ: 1398 case BLK_ZONE_TYPE_SEQWRITE_PREF: 1399 set_bit(DMZ_SEQ, &zone->flags); 1400 break; 1401 default: 1402 return -ENXIO; 1403 } 1404 1405 if (dmz_is_rnd(zone)) 1406 zone->wp_block = 0; 1407 else 1408 zone->wp_block = dmz_sect2blk(blkz->wp - blkz->start); 1409 1410 if (blkz->cond == BLK_ZONE_COND_OFFLINE) 1411 set_bit(DMZ_OFFLINE, &zone->flags); 1412 else if (blkz->cond == BLK_ZONE_COND_READONLY) 1413 set_bit(DMZ_READ_ONLY, &zone->flags); 1414 else { 1415 zmd->nr_useable_zones++; 1416 if (dmz_is_rnd(zone)) { 1417 zmd->nr_rnd_zones++; 1418 if (zmd->nr_devs == 1 && !zmd->sb[0].zone) { 1419 /* Primary super block zone */ 1420 zmd->sb[0].zone = zone; 1421 } 1422 } 1423 if (zmd->nr_devs > 1 && num == 0) { 1424 /* 1425 * Tertiary superblock zones are always at the 1426 * start of the zoned devices, so mark them 1427 * as metadata zone. 1428 */ 1429 set_bit(DMZ_META, &zone->flags); 1430 } 1431 } 1432 return 0; 1433 } 1434 1435 static int dmz_emulate_zones(struct dmz_metadata *zmd, struct dmz_dev *dev) 1436 { 1437 int idx; 1438 sector_t zone_offset = 0; 1439 1440 for(idx = 0; idx < dev->nr_zones; idx++) { 1441 struct dm_zone *zone; 1442 1443 zone = dmz_insert(zmd, idx, dev); 1444 if (IS_ERR(zone)) 1445 return PTR_ERR(zone); 1446 set_bit(DMZ_CACHE, &zone->flags); 1447 zone->wp_block = 0; 1448 zmd->nr_cache_zones++; 1449 zmd->nr_useable_zones++; 1450 if (dev->capacity - zone_offset < zmd->zone_nr_sectors) { 1451 /* Disable runt zone */ 1452 set_bit(DMZ_OFFLINE, &zone->flags); 1453 break; 1454 } 1455 zone_offset += zmd->zone_nr_sectors; 1456 } 1457 return 0; 1458 } 1459 1460 /* 1461 * Free zones descriptors. 1462 */ 1463 static void dmz_drop_zones(struct dmz_metadata *zmd) 1464 { 1465 int idx; 1466 1467 for(idx = 0; idx < zmd->nr_zones; idx++) { 1468 struct dm_zone *zone = xa_load(&zmd->zones, idx); 1469 1470 kfree(zone); 1471 xa_erase(&zmd->zones, idx); 1472 } 1473 xa_destroy(&zmd->zones); 1474 } 1475 1476 /* 1477 * Allocate and initialize zone descriptors using the zone 1478 * information from disk. 1479 */ 1480 static int dmz_init_zones(struct dmz_metadata *zmd) 1481 { 1482 int i, ret; 1483 struct dmz_dev *zoned_dev = &zmd->dev[0]; 1484 1485 /* Init */ 1486 zmd->zone_nr_sectors = zmd->dev[0].zone_nr_sectors; 1487 zmd->zone_nr_sectors_shift = ilog2(zmd->zone_nr_sectors); 1488 zmd->zone_nr_blocks = dmz_sect2blk(zmd->zone_nr_sectors); 1489 zmd->zone_nr_blocks_shift = ilog2(zmd->zone_nr_blocks); 1490 zmd->zone_bitmap_size = zmd->zone_nr_blocks >> 3; 1491 zmd->zone_nr_bitmap_blocks = 1492 max_t(sector_t, 1, zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT); 1493 zmd->zone_bits_per_mblk = min_t(sector_t, zmd->zone_nr_blocks, 1494 DMZ_BLOCK_SIZE_BITS); 1495 1496 /* Allocate zone array */ 1497 zmd->nr_zones = 0; 1498 for (i = 0; i < zmd->nr_devs; i++) { 1499 struct dmz_dev *dev = &zmd->dev[i]; 1500 1501 dev->metadata = zmd; 1502 zmd->nr_zones += dev->nr_zones; 1503 1504 atomic_set(&dev->unmap_nr_rnd, 0); 1505 INIT_LIST_HEAD(&dev->unmap_rnd_list); 1506 INIT_LIST_HEAD(&dev->map_rnd_list); 1507 1508 atomic_set(&dev->unmap_nr_seq, 0); 1509 INIT_LIST_HEAD(&dev->unmap_seq_list); 1510 INIT_LIST_HEAD(&dev->map_seq_list); 1511 } 1512 1513 if (!zmd->nr_zones) { 1514 DMERR("(%s): No zones found", zmd->devname); 1515 return -ENXIO; 1516 } 1517 xa_init(&zmd->zones); 1518 1519 DMDEBUG("(%s): Using %zu B for zone information", 1520 zmd->devname, sizeof(struct dm_zone) * zmd->nr_zones); 1521 1522 if (zmd->nr_devs > 1) { 1523 ret = dmz_emulate_zones(zmd, &zmd->dev[0]); 1524 if (ret < 0) { 1525 DMDEBUG("(%s): Failed to emulate zones, error %d", 1526 zmd->devname, ret); 1527 dmz_drop_zones(zmd); 1528 return ret; 1529 } 1530 1531 /* 1532 * Primary superblock zone is always at zone 0 when multiple 1533 * drives are present. 1534 */ 1535 zmd->sb[0].zone = dmz_get(zmd, 0); 1536 1537 for (i = 1; i < zmd->nr_devs; i++) { 1538 zoned_dev = &zmd->dev[i]; 1539 1540 ret = blkdev_report_zones(zoned_dev->bdev, 0, 1541 BLK_ALL_ZONES, 1542 dmz_init_zone, zoned_dev); 1543 if (ret < 0) { 1544 DMDEBUG("(%s): Failed to report zones, error %d", 1545 zmd->devname, ret); 1546 dmz_drop_zones(zmd); 1547 return ret; 1548 } 1549 } 1550 return 0; 1551 } 1552 1553 /* 1554 * Get zone information and initialize zone descriptors. At the same 1555 * time, determine where the super block should be: first block of the 1556 * first randomly writable zone. 1557 */ 1558 ret = blkdev_report_zones(zoned_dev->bdev, 0, BLK_ALL_ZONES, 1559 dmz_init_zone, zoned_dev); 1560 if (ret < 0) { 1561 DMDEBUG("(%s): Failed to report zones, error %d", 1562 zmd->devname, ret); 1563 dmz_drop_zones(zmd); 1564 return ret; 1565 } 1566 1567 return 0; 1568 } 1569 1570 static int dmz_update_zone_cb(struct blk_zone *blkz, unsigned int idx, 1571 void *data) 1572 { 1573 struct dm_zone *zone = data; 1574 1575 clear_bit(DMZ_OFFLINE, &zone->flags); 1576 clear_bit(DMZ_READ_ONLY, &zone->flags); 1577 if (blkz->cond == BLK_ZONE_COND_OFFLINE) 1578 set_bit(DMZ_OFFLINE, &zone->flags); 1579 else if (blkz->cond == BLK_ZONE_COND_READONLY) 1580 set_bit(DMZ_READ_ONLY, &zone->flags); 1581 1582 if (dmz_is_seq(zone)) 1583 zone->wp_block = dmz_sect2blk(blkz->wp - blkz->start); 1584 else 1585 zone->wp_block = 0; 1586 return 0; 1587 } 1588 1589 /* 1590 * Update a zone information. 1591 */ 1592 static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone) 1593 { 1594 struct dmz_dev *dev = zone->dev; 1595 unsigned int noio_flag; 1596 int ret; 1597 1598 if (dev->flags & DMZ_BDEV_REGULAR) 1599 return 0; 1600 1601 /* 1602 * Get zone information from disk. Since blkdev_report_zones() uses 1603 * GFP_KERNEL by default for memory allocations, set the per-task 1604 * PF_MEMALLOC_NOIO flag so that all allocations are done as if 1605 * GFP_NOIO was specified. 1606 */ 1607 noio_flag = memalloc_noio_save(); 1608 ret = blkdev_report_zones(dev->bdev, dmz_start_sect(zmd, zone), 1, 1609 dmz_update_zone_cb, zone); 1610 memalloc_noio_restore(noio_flag); 1611 1612 if (ret == 0) 1613 ret = -EIO; 1614 if (ret < 0) { 1615 dmz_dev_err(dev, "Get zone %u report failed", 1616 zone->id); 1617 dmz_check_bdev(dev); 1618 return ret; 1619 } 1620 1621 return 0; 1622 } 1623 1624 /* 1625 * Check a zone write pointer position when the zone is marked 1626 * with the sequential write error flag. 1627 */ 1628 static int dmz_handle_seq_write_err(struct dmz_metadata *zmd, 1629 struct dm_zone *zone) 1630 { 1631 struct dmz_dev *dev = zone->dev; 1632 unsigned int wp = 0; 1633 int ret; 1634 1635 wp = zone->wp_block; 1636 ret = dmz_update_zone(zmd, zone); 1637 if (ret) 1638 return ret; 1639 1640 dmz_dev_warn(dev, "Processing zone %u write error (zone wp %u/%u)", 1641 zone->id, zone->wp_block, wp); 1642 1643 if (zone->wp_block < wp) { 1644 dmz_invalidate_blocks(zmd, zone, zone->wp_block, 1645 wp - zone->wp_block); 1646 } 1647 1648 return 0; 1649 } 1650 1651 /* 1652 * Reset a zone write pointer. 1653 */ 1654 static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone) 1655 { 1656 int ret; 1657 1658 /* 1659 * Ignore offline zones, read only zones, 1660 * and conventional zones. 1661 */ 1662 if (dmz_is_offline(zone) || 1663 dmz_is_readonly(zone) || 1664 dmz_is_rnd(zone)) 1665 return 0; 1666 1667 if (!dmz_is_empty(zone) || dmz_seq_write_err(zone)) { 1668 struct dmz_dev *dev = zone->dev; 1669 1670 ret = blkdev_zone_mgmt(dev->bdev, REQ_OP_ZONE_RESET, 1671 dmz_start_sect(zmd, zone), 1672 zmd->zone_nr_sectors, GFP_NOIO); 1673 if (ret) { 1674 dmz_dev_err(dev, "Reset zone %u failed %d", 1675 zone->id, ret); 1676 return ret; 1677 } 1678 } 1679 1680 /* Clear write error bit and rewind write pointer position */ 1681 clear_bit(DMZ_SEQ_WRITE_ERR, &zone->flags); 1682 zone->wp_block = 0; 1683 1684 return 0; 1685 } 1686 1687 static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone); 1688 1689 /* 1690 * Initialize chunk mapping. 1691 */ 1692 static int dmz_load_mapping(struct dmz_metadata *zmd) 1693 { 1694 struct dm_zone *dzone, *bzone; 1695 struct dmz_mblock *dmap_mblk = NULL; 1696 struct dmz_map *dmap; 1697 unsigned int i = 0, e = 0, chunk = 0; 1698 unsigned int dzone_id; 1699 unsigned int bzone_id; 1700 1701 /* Metadata block array for the chunk mapping table */ 1702 zmd->map_mblk = kcalloc(zmd->nr_map_blocks, 1703 sizeof(struct dmz_mblk *), GFP_KERNEL); 1704 if (!zmd->map_mblk) 1705 return -ENOMEM; 1706 1707 /* Get chunk mapping table blocks and initialize zone mapping */ 1708 while (chunk < zmd->nr_chunks) { 1709 if (!dmap_mblk) { 1710 /* Get mapping block */ 1711 dmap_mblk = dmz_get_mblock(zmd, i + 1); 1712 if (IS_ERR(dmap_mblk)) 1713 return PTR_ERR(dmap_mblk); 1714 zmd->map_mblk[i] = dmap_mblk; 1715 dmap = (struct dmz_map *) dmap_mblk->data; 1716 i++; 1717 e = 0; 1718 } 1719 1720 /* Check data zone */ 1721 dzone_id = le32_to_cpu(dmap[e].dzone_id); 1722 if (dzone_id == DMZ_MAP_UNMAPPED) 1723 goto next; 1724 1725 if (dzone_id >= zmd->nr_zones) { 1726 dmz_zmd_err(zmd, "Chunk %u mapping: invalid data zone ID %u", 1727 chunk, dzone_id); 1728 return -EIO; 1729 } 1730 1731 dzone = dmz_get(zmd, dzone_id); 1732 if (!dzone) { 1733 dmz_zmd_err(zmd, "Chunk %u mapping: data zone %u not present", 1734 chunk, dzone_id); 1735 return -EIO; 1736 } 1737 set_bit(DMZ_DATA, &dzone->flags); 1738 dzone->chunk = chunk; 1739 dmz_get_zone_weight(zmd, dzone); 1740 1741 if (dmz_is_cache(dzone)) 1742 list_add_tail(&dzone->link, &zmd->map_cache_list); 1743 else if (dmz_is_rnd(dzone)) 1744 list_add_tail(&dzone->link, &dzone->dev->map_rnd_list); 1745 else 1746 list_add_tail(&dzone->link, &dzone->dev->map_seq_list); 1747 1748 /* Check buffer zone */ 1749 bzone_id = le32_to_cpu(dmap[e].bzone_id); 1750 if (bzone_id == DMZ_MAP_UNMAPPED) 1751 goto next; 1752 1753 if (bzone_id >= zmd->nr_zones) { 1754 dmz_zmd_err(zmd, "Chunk %u mapping: invalid buffer zone ID %u", 1755 chunk, bzone_id); 1756 return -EIO; 1757 } 1758 1759 bzone = dmz_get(zmd, bzone_id); 1760 if (!bzone) { 1761 dmz_zmd_err(zmd, "Chunk %u mapping: buffer zone %u not present", 1762 chunk, bzone_id); 1763 return -EIO; 1764 } 1765 if (!dmz_is_rnd(bzone) && !dmz_is_cache(bzone)) { 1766 dmz_zmd_err(zmd, "Chunk %u mapping: invalid buffer zone %u", 1767 chunk, bzone_id); 1768 return -EIO; 1769 } 1770 1771 set_bit(DMZ_DATA, &bzone->flags); 1772 set_bit(DMZ_BUF, &bzone->flags); 1773 bzone->chunk = chunk; 1774 bzone->bzone = dzone; 1775 dzone->bzone = bzone; 1776 dmz_get_zone_weight(zmd, bzone); 1777 if (dmz_is_cache(bzone)) 1778 list_add_tail(&bzone->link, &zmd->map_cache_list); 1779 else 1780 list_add_tail(&bzone->link, &bzone->dev->map_rnd_list); 1781 next: 1782 chunk++; 1783 e++; 1784 if (e >= DMZ_MAP_ENTRIES) 1785 dmap_mblk = NULL; 1786 } 1787 1788 /* 1789 * At this point, only meta zones and mapped data zones were 1790 * fully initialized. All remaining zones are unmapped data 1791 * zones. Finish initializing those here. 1792 */ 1793 for (i = 0; i < zmd->nr_zones; i++) { 1794 dzone = dmz_get(zmd, i); 1795 if (!dzone) 1796 continue; 1797 if (dmz_is_meta(dzone)) 1798 continue; 1799 if (dmz_is_offline(dzone)) 1800 continue; 1801 1802 if (dmz_is_cache(dzone)) 1803 zmd->nr_cache++; 1804 else if (dmz_is_rnd(dzone)) 1805 dzone->dev->nr_rnd++; 1806 else 1807 dzone->dev->nr_seq++; 1808 1809 if (dmz_is_data(dzone)) { 1810 /* Already initialized */ 1811 continue; 1812 } 1813 1814 /* Unmapped data zone */ 1815 set_bit(DMZ_DATA, &dzone->flags); 1816 dzone->chunk = DMZ_MAP_UNMAPPED; 1817 if (dmz_is_cache(dzone)) { 1818 list_add_tail(&dzone->link, &zmd->unmap_cache_list); 1819 atomic_inc(&zmd->unmap_nr_cache); 1820 } else if (dmz_is_rnd(dzone)) { 1821 list_add_tail(&dzone->link, 1822 &dzone->dev->unmap_rnd_list); 1823 atomic_inc(&dzone->dev->unmap_nr_rnd); 1824 } else if (atomic_read(&zmd->nr_reserved_seq_zones) < zmd->nr_reserved_seq) { 1825 list_add_tail(&dzone->link, &zmd->reserved_seq_zones_list); 1826 set_bit(DMZ_RESERVED, &dzone->flags); 1827 atomic_inc(&zmd->nr_reserved_seq_zones); 1828 dzone->dev->nr_seq--; 1829 } else { 1830 list_add_tail(&dzone->link, 1831 &dzone->dev->unmap_seq_list); 1832 atomic_inc(&dzone->dev->unmap_nr_seq); 1833 } 1834 } 1835 1836 return 0; 1837 } 1838 1839 /* 1840 * Set a data chunk mapping. 1841 */ 1842 static void dmz_set_chunk_mapping(struct dmz_metadata *zmd, unsigned int chunk, 1843 unsigned int dzone_id, unsigned int bzone_id) 1844 { 1845 struct dmz_mblock *dmap_mblk = zmd->map_mblk[chunk >> DMZ_MAP_ENTRIES_SHIFT]; 1846 struct dmz_map *dmap = (struct dmz_map *) dmap_mblk->data; 1847 int map_idx = chunk & DMZ_MAP_ENTRIES_MASK; 1848 1849 dmap[map_idx].dzone_id = cpu_to_le32(dzone_id); 1850 dmap[map_idx].bzone_id = cpu_to_le32(bzone_id); 1851 dmz_dirty_mblock(zmd, dmap_mblk); 1852 } 1853 1854 /* 1855 * The list of mapped zones is maintained in LRU order. 1856 * This rotates a zone at the end of its map list. 1857 */ 1858 static void __dmz_lru_zone(struct dmz_metadata *zmd, struct dm_zone *zone) 1859 { 1860 if (list_empty(&zone->link)) 1861 return; 1862 1863 list_del_init(&zone->link); 1864 if (dmz_is_seq(zone)) { 1865 /* LRU rotate sequential zone */ 1866 list_add_tail(&zone->link, &zone->dev->map_seq_list); 1867 } else if (dmz_is_cache(zone)) { 1868 /* LRU rotate cache zone */ 1869 list_add_tail(&zone->link, &zmd->map_cache_list); 1870 } else { 1871 /* LRU rotate random zone */ 1872 list_add_tail(&zone->link, &zone->dev->map_rnd_list); 1873 } 1874 } 1875 1876 /* 1877 * The list of mapped random zones is maintained 1878 * in LRU order. This rotates a zone at the end of the list. 1879 */ 1880 static void dmz_lru_zone(struct dmz_metadata *zmd, struct dm_zone *zone) 1881 { 1882 __dmz_lru_zone(zmd, zone); 1883 if (zone->bzone) 1884 __dmz_lru_zone(zmd, zone->bzone); 1885 } 1886 1887 /* 1888 * Wait for any zone to be freed. 1889 */ 1890 static void dmz_wait_for_free_zones(struct dmz_metadata *zmd) 1891 { 1892 DEFINE_WAIT(wait); 1893 1894 prepare_to_wait(&zmd->free_wq, &wait, TASK_UNINTERRUPTIBLE); 1895 dmz_unlock_map(zmd); 1896 dmz_unlock_metadata(zmd); 1897 1898 io_schedule_timeout(HZ); 1899 1900 dmz_lock_metadata(zmd); 1901 dmz_lock_map(zmd); 1902 finish_wait(&zmd->free_wq, &wait); 1903 } 1904 1905 /* 1906 * Lock a zone for reclaim (set the zone RECLAIM bit). 1907 * Returns false if the zone cannot be locked or if it is already locked 1908 * and 1 otherwise. 1909 */ 1910 int dmz_lock_zone_reclaim(struct dm_zone *zone) 1911 { 1912 /* Active zones cannot be reclaimed */ 1913 if (dmz_is_active(zone)) 1914 return 0; 1915 1916 return !test_and_set_bit(DMZ_RECLAIM, &zone->flags); 1917 } 1918 1919 /* 1920 * Clear a zone reclaim flag. 1921 */ 1922 void dmz_unlock_zone_reclaim(struct dm_zone *zone) 1923 { 1924 WARN_ON(dmz_is_active(zone)); 1925 WARN_ON(!dmz_in_reclaim(zone)); 1926 1927 clear_bit_unlock(DMZ_RECLAIM, &zone->flags); 1928 smp_mb__after_atomic(); 1929 wake_up_bit(&zone->flags, DMZ_RECLAIM); 1930 } 1931 1932 /* 1933 * Wait for a zone reclaim to complete. 1934 */ 1935 static void dmz_wait_for_reclaim(struct dmz_metadata *zmd, struct dm_zone *zone) 1936 { 1937 dmz_unlock_map(zmd); 1938 dmz_unlock_metadata(zmd); 1939 set_bit(DMZ_RECLAIM_TERMINATE, &zone->flags); 1940 wait_on_bit_timeout(&zone->flags, DMZ_RECLAIM, TASK_UNINTERRUPTIBLE, HZ); 1941 clear_bit(DMZ_RECLAIM_TERMINATE, &zone->flags); 1942 dmz_lock_metadata(zmd); 1943 dmz_lock_map(zmd); 1944 } 1945 1946 /* 1947 * Select a cache or random write zone for reclaim. 1948 */ 1949 static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd, 1950 unsigned int idx, bool idle) 1951 { 1952 struct dm_zone *dzone = NULL; 1953 struct dm_zone *zone, *maxw_z = NULL; 1954 struct list_head *zone_list; 1955 1956 /* If we have cache zones select from the cache zone list */ 1957 if (zmd->nr_cache) { 1958 zone_list = &zmd->map_cache_list; 1959 /* Try to relaim random zones, too, when idle */ 1960 if (idle && list_empty(zone_list)) 1961 zone_list = &zmd->dev[idx].map_rnd_list; 1962 } else 1963 zone_list = &zmd->dev[idx].map_rnd_list; 1964 1965 /* 1966 * Find the buffer zone with the heaviest weight or the first (oldest) 1967 * data zone that can be reclaimed. 1968 */ 1969 list_for_each_entry(zone, zone_list, link) { 1970 if (dmz_is_buf(zone)) { 1971 dzone = zone->bzone; 1972 if (dmz_is_rnd(dzone) && dzone->dev->dev_idx != idx) 1973 continue; 1974 if (!maxw_z || maxw_z->weight < dzone->weight) 1975 maxw_z = dzone; 1976 } else { 1977 dzone = zone; 1978 if (dmz_lock_zone_reclaim(dzone)) 1979 return dzone; 1980 } 1981 } 1982 1983 if (maxw_z && dmz_lock_zone_reclaim(maxw_z)) 1984 return maxw_z; 1985 1986 /* 1987 * If we come here, none of the zones inspected could be locked for 1988 * reclaim. Try again, being more aggressive, that is, find the 1989 * first zone that can be reclaimed regardless of its weitght. 1990 */ 1991 list_for_each_entry(zone, zone_list, link) { 1992 if (dmz_is_buf(zone)) { 1993 dzone = zone->bzone; 1994 if (dmz_is_rnd(dzone) && dzone->dev->dev_idx != idx) 1995 continue; 1996 } else 1997 dzone = zone; 1998 if (dmz_lock_zone_reclaim(dzone)) 1999 return dzone; 2000 } 2001 2002 return NULL; 2003 } 2004 2005 /* 2006 * Select a buffered sequential zone for reclaim. 2007 */ 2008 static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd, 2009 unsigned int idx) 2010 { 2011 struct dm_zone *zone; 2012 2013 list_for_each_entry(zone, &zmd->dev[idx].map_seq_list, link) { 2014 if (!zone->bzone) 2015 continue; 2016 if (dmz_lock_zone_reclaim(zone)) 2017 return zone; 2018 } 2019 2020 return NULL; 2021 } 2022 2023 /* 2024 * Select a zone for reclaim. 2025 */ 2026 struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd, 2027 unsigned int dev_idx, bool idle) 2028 { 2029 struct dm_zone *zone = NULL; 2030 2031 /* 2032 * Search for a zone candidate to reclaim: 2 cases are possible. 2033 * (1) There is no free sequential zones. Then a random data zone 2034 * cannot be reclaimed. So choose a sequential zone to reclaim so 2035 * that afterward a random zone can be reclaimed. 2036 * (2) At least one free sequential zone is available, then choose 2037 * the oldest random zone (data or buffer) that can be locked. 2038 */ 2039 dmz_lock_map(zmd); 2040 if (list_empty(&zmd->reserved_seq_zones_list)) 2041 zone = dmz_get_seq_zone_for_reclaim(zmd, dev_idx); 2042 if (!zone) 2043 zone = dmz_get_rnd_zone_for_reclaim(zmd, dev_idx, idle); 2044 dmz_unlock_map(zmd); 2045 2046 return zone; 2047 } 2048 2049 /* 2050 * Get the zone mapping a chunk, if the chunk is mapped already. 2051 * If no mapping exist and the operation is WRITE, a zone is 2052 * allocated and used to map the chunk. 2053 * The zone returned will be set to the active state. 2054 */ 2055 struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd, unsigned int chunk, int op) 2056 { 2057 struct dmz_mblock *dmap_mblk = zmd->map_mblk[chunk >> DMZ_MAP_ENTRIES_SHIFT]; 2058 struct dmz_map *dmap = (struct dmz_map *) dmap_mblk->data; 2059 int dmap_idx = chunk & DMZ_MAP_ENTRIES_MASK; 2060 unsigned int dzone_id; 2061 struct dm_zone *dzone = NULL; 2062 int ret = 0; 2063 int alloc_flags = zmd->nr_cache ? DMZ_ALLOC_CACHE : DMZ_ALLOC_RND; 2064 2065 dmz_lock_map(zmd); 2066 again: 2067 /* Get the chunk mapping */ 2068 dzone_id = le32_to_cpu(dmap[dmap_idx].dzone_id); 2069 if (dzone_id == DMZ_MAP_UNMAPPED) { 2070 /* 2071 * Read or discard in unmapped chunks are fine. But for 2072 * writes, we need a mapping, so get one. 2073 */ 2074 if (op != REQ_OP_WRITE) 2075 goto out; 2076 2077 /* Allocate a random zone */ 2078 dzone = dmz_alloc_zone(zmd, 0, alloc_flags); 2079 if (!dzone) { 2080 if (dmz_dev_is_dying(zmd)) { 2081 dzone = ERR_PTR(-EIO); 2082 goto out; 2083 } 2084 dmz_wait_for_free_zones(zmd); 2085 goto again; 2086 } 2087 2088 dmz_map_zone(zmd, dzone, chunk); 2089 2090 } else { 2091 /* The chunk is already mapped: get the mapping zone */ 2092 dzone = dmz_get(zmd, dzone_id); 2093 if (!dzone) { 2094 dzone = ERR_PTR(-EIO); 2095 goto out; 2096 } 2097 if (dzone->chunk != chunk) { 2098 dzone = ERR_PTR(-EIO); 2099 goto out; 2100 } 2101 2102 /* Repair write pointer if the sequential dzone has error */ 2103 if (dmz_seq_write_err(dzone)) { 2104 ret = dmz_handle_seq_write_err(zmd, dzone); 2105 if (ret) { 2106 dzone = ERR_PTR(-EIO); 2107 goto out; 2108 } 2109 clear_bit(DMZ_SEQ_WRITE_ERR, &dzone->flags); 2110 } 2111 } 2112 2113 /* 2114 * If the zone is being reclaimed, the chunk mapping may change 2115 * to a different zone. So wait for reclaim and retry. Otherwise, 2116 * activate the zone (this will prevent reclaim from touching it). 2117 */ 2118 if (dmz_in_reclaim(dzone)) { 2119 dmz_wait_for_reclaim(zmd, dzone); 2120 goto again; 2121 } 2122 dmz_activate_zone(dzone); 2123 dmz_lru_zone(zmd, dzone); 2124 out: 2125 dmz_unlock_map(zmd); 2126 2127 return dzone; 2128 } 2129 2130 /* 2131 * Write and discard change the block validity of data zones and their buffer 2132 * zones. Check here that valid blocks are still present. If all blocks are 2133 * invalid, the zones can be unmapped on the fly without waiting for reclaim 2134 * to do it. 2135 */ 2136 void dmz_put_chunk_mapping(struct dmz_metadata *zmd, struct dm_zone *dzone) 2137 { 2138 struct dm_zone *bzone; 2139 2140 dmz_lock_map(zmd); 2141 2142 bzone = dzone->bzone; 2143 if (bzone) { 2144 if (dmz_weight(bzone)) 2145 dmz_lru_zone(zmd, bzone); 2146 else { 2147 /* Empty buffer zone: reclaim it */ 2148 dmz_unmap_zone(zmd, bzone); 2149 dmz_free_zone(zmd, bzone); 2150 bzone = NULL; 2151 } 2152 } 2153 2154 /* Deactivate the data zone */ 2155 dmz_deactivate_zone(dzone); 2156 if (dmz_is_active(dzone) || bzone || dmz_weight(dzone)) 2157 dmz_lru_zone(zmd, dzone); 2158 else { 2159 /* Unbuffered inactive empty data zone: reclaim it */ 2160 dmz_unmap_zone(zmd, dzone); 2161 dmz_free_zone(zmd, dzone); 2162 } 2163 2164 dmz_unlock_map(zmd); 2165 } 2166 2167 /* 2168 * Allocate and map a random zone to buffer a chunk 2169 * already mapped to a sequential zone. 2170 */ 2171 struct dm_zone *dmz_get_chunk_buffer(struct dmz_metadata *zmd, 2172 struct dm_zone *dzone) 2173 { 2174 struct dm_zone *bzone; 2175 int alloc_flags = zmd->nr_cache ? DMZ_ALLOC_CACHE : DMZ_ALLOC_RND; 2176 2177 dmz_lock_map(zmd); 2178 again: 2179 bzone = dzone->bzone; 2180 if (bzone) 2181 goto out; 2182 2183 /* Allocate a random zone */ 2184 bzone = dmz_alloc_zone(zmd, 0, alloc_flags); 2185 if (!bzone) { 2186 if (dmz_dev_is_dying(zmd)) { 2187 bzone = ERR_PTR(-EIO); 2188 goto out; 2189 } 2190 dmz_wait_for_free_zones(zmd); 2191 goto again; 2192 } 2193 2194 /* Update the chunk mapping */ 2195 dmz_set_chunk_mapping(zmd, dzone->chunk, dzone->id, bzone->id); 2196 2197 set_bit(DMZ_BUF, &bzone->flags); 2198 bzone->chunk = dzone->chunk; 2199 bzone->bzone = dzone; 2200 dzone->bzone = bzone; 2201 if (dmz_is_cache(bzone)) 2202 list_add_tail(&bzone->link, &zmd->map_cache_list); 2203 else 2204 list_add_tail(&bzone->link, &bzone->dev->map_rnd_list); 2205 out: 2206 dmz_unlock_map(zmd); 2207 2208 return bzone; 2209 } 2210 2211 /* 2212 * Get an unmapped (free) zone. 2213 * This must be called with the mapping lock held. 2214 */ 2215 struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned int dev_idx, 2216 unsigned long flags) 2217 { 2218 struct list_head *list; 2219 struct dm_zone *zone; 2220 int i; 2221 2222 /* Schedule reclaim to ensure free zones are available */ 2223 if (!(flags & DMZ_ALLOC_RECLAIM)) { 2224 for (i = 0; i < zmd->nr_devs; i++) 2225 dmz_schedule_reclaim(zmd->dev[i].reclaim); 2226 } 2227 2228 i = 0; 2229 again: 2230 if (flags & DMZ_ALLOC_CACHE) 2231 list = &zmd->unmap_cache_list; 2232 else if (flags & DMZ_ALLOC_RND) 2233 list = &zmd->dev[dev_idx].unmap_rnd_list; 2234 else 2235 list = &zmd->dev[dev_idx].unmap_seq_list; 2236 2237 if (list_empty(list)) { 2238 /* 2239 * No free zone: return NULL if this is for not reclaim. 2240 */ 2241 if (!(flags & DMZ_ALLOC_RECLAIM)) 2242 return NULL; 2243 /* 2244 * Try to allocate from other devices 2245 */ 2246 if (i < zmd->nr_devs) { 2247 dev_idx = (dev_idx + 1) % zmd->nr_devs; 2248 i++; 2249 goto again; 2250 } 2251 2252 /* 2253 * Fallback to the reserved sequential zones 2254 */ 2255 zone = list_first_entry_or_null(&zmd->reserved_seq_zones_list, 2256 struct dm_zone, link); 2257 if (zone) { 2258 list_del_init(&zone->link); 2259 atomic_dec(&zmd->nr_reserved_seq_zones); 2260 } 2261 return zone; 2262 } 2263 2264 zone = list_first_entry(list, struct dm_zone, link); 2265 list_del_init(&zone->link); 2266 2267 if (dmz_is_cache(zone)) 2268 atomic_dec(&zmd->unmap_nr_cache); 2269 else if (dmz_is_rnd(zone)) 2270 atomic_dec(&zone->dev->unmap_nr_rnd); 2271 else 2272 atomic_dec(&zone->dev->unmap_nr_seq); 2273 2274 if (dmz_is_offline(zone)) { 2275 dmz_zmd_warn(zmd, "Zone %u is offline", zone->id); 2276 zone = NULL; 2277 goto again; 2278 } 2279 if (dmz_is_meta(zone)) { 2280 dmz_zmd_warn(zmd, "Zone %u has metadata", zone->id); 2281 zone = NULL; 2282 goto again; 2283 } 2284 return zone; 2285 } 2286 2287 /* 2288 * Free a zone. 2289 * This must be called with the mapping lock held. 2290 */ 2291 void dmz_free_zone(struct dmz_metadata *zmd, struct dm_zone *zone) 2292 { 2293 /* If this is a sequential zone, reset it */ 2294 if (dmz_is_seq(zone)) 2295 dmz_reset_zone(zmd, zone); 2296 2297 /* Return the zone to its type unmap list */ 2298 if (dmz_is_cache(zone)) { 2299 list_add_tail(&zone->link, &zmd->unmap_cache_list); 2300 atomic_inc(&zmd->unmap_nr_cache); 2301 } else if (dmz_is_rnd(zone)) { 2302 list_add_tail(&zone->link, &zone->dev->unmap_rnd_list); 2303 atomic_inc(&zone->dev->unmap_nr_rnd); 2304 } else if (dmz_is_reserved(zone)) { 2305 list_add_tail(&zone->link, &zmd->reserved_seq_zones_list); 2306 atomic_inc(&zmd->nr_reserved_seq_zones); 2307 } else { 2308 list_add_tail(&zone->link, &zone->dev->unmap_seq_list); 2309 atomic_inc(&zone->dev->unmap_nr_seq); 2310 } 2311 2312 wake_up_all(&zmd->free_wq); 2313 } 2314 2315 /* 2316 * Map a chunk to a zone. 2317 * This must be called with the mapping lock held. 2318 */ 2319 void dmz_map_zone(struct dmz_metadata *zmd, struct dm_zone *dzone, 2320 unsigned int chunk) 2321 { 2322 /* Set the chunk mapping */ 2323 dmz_set_chunk_mapping(zmd, chunk, dzone->id, 2324 DMZ_MAP_UNMAPPED); 2325 dzone->chunk = chunk; 2326 if (dmz_is_cache(dzone)) 2327 list_add_tail(&dzone->link, &zmd->map_cache_list); 2328 else if (dmz_is_rnd(dzone)) 2329 list_add_tail(&dzone->link, &dzone->dev->map_rnd_list); 2330 else 2331 list_add_tail(&dzone->link, &dzone->dev->map_seq_list); 2332 } 2333 2334 /* 2335 * Unmap a zone. 2336 * This must be called with the mapping lock held. 2337 */ 2338 void dmz_unmap_zone(struct dmz_metadata *zmd, struct dm_zone *zone) 2339 { 2340 unsigned int chunk = zone->chunk; 2341 unsigned int dzone_id; 2342 2343 if (chunk == DMZ_MAP_UNMAPPED) { 2344 /* Already unmapped */ 2345 return; 2346 } 2347 2348 if (test_and_clear_bit(DMZ_BUF, &zone->flags)) { 2349 /* 2350 * Unmapping the chunk buffer zone: clear only 2351 * the chunk buffer mapping 2352 */ 2353 dzone_id = zone->bzone->id; 2354 zone->bzone->bzone = NULL; 2355 zone->bzone = NULL; 2356 2357 } else { 2358 /* 2359 * Unmapping the chunk data zone: the zone must 2360 * not be buffered. 2361 */ 2362 if (WARN_ON(zone->bzone)) { 2363 zone->bzone->bzone = NULL; 2364 zone->bzone = NULL; 2365 } 2366 dzone_id = DMZ_MAP_UNMAPPED; 2367 } 2368 2369 dmz_set_chunk_mapping(zmd, chunk, dzone_id, DMZ_MAP_UNMAPPED); 2370 2371 zone->chunk = DMZ_MAP_UNMAPPED; 2372 list_del_init(&zone->link); 2373 } 2374 2375 /* 2376 * Set @nr_bits bits in @bitmap starting from @bit. 2377 * Return the number of bits changed from 0 to 1. 2378 */ 2379 static unsigned int dmz_set_bits(unsigned long *bitmap, 2380 unsigned int bit, unsigned int nr_bits) 2381 { 2382 unsigned long *addr; 2383 unsigned int end = bit + nr_bits; 2384 unsigned int n = 0; 2385 2386 while (bit < end) { 2387 if (((bit & (BITS_PER_LONG - 1)) == 0) && 2388 ((end - bit) >= BITS_PER_LONG)) { 2389 /* Try to set the whole word at once */ 2390 addr = bitmap + BIT_WORD(bit); 2391 if (*addr == 0) { 2392 *addr = ULONG_MAX; 2393 n += BITS_PER_LONG; 2394 bit += BITS_PER_LONG; 2395 continue; 2396 } 2397 } 2398 2399 if (!test_and_set_bit(bit, bitmap)) 2400 n++; 2401 bit++; 2402 } 2403 2404 return n; 2405 } 2406 2407 /* 2408 * Get the bitmap block storing the bit for chunk_block in zone. 2409 */ 2410 static struct dmz_mblock *dmz_get_bitmap(struct dmz_metadata *zmd, 2411 struct dm_zone *zone, 2412 sector_t chunk_block) 2413 { 2414 sector_t bitmap_block = 1 + zmd->nr_map_blocks + 2415 (sector_t)(zone->id * zmd->zone_nr_bitmap_blocks) + 2416 (chunk_block >> DMZ_BLOCK_SHIFT_BITS); 2417 2418 return dmz_get_mblock(zmd, bitmap_block); 2419 } 2420 2421 /* 2422 * Copy the valid blocks bitmap of from_zone to the bitmap of to_zone. 2423 */ 2424 int dmz_copy_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone, 2425 struct dm_zone *to_zone) 2426 { 2427 struct dmz_mblock *from_mblk, *to_mblk; 2428 sector_t chunk_block = 0; 2429 2430 /* Get the zones bitmap blocks */ 2431 while (chunk_block < zmd->zone_nr_blocks) { 2432 from_mblk = dmz_get_bitmap(zmd, from_zone, chunk_block); 2433 if (IS_ERR(from_mblk)) 2434 return PTR_ERR(from_mblk); 2435 to_mblk = dmz_get_bitmap(zmd, to_zone, chunk_block); 2436 if (IS_ERR(to_mblk)) { 2437 dmz_release_mblock(zmd, from_mblk); 2438 return PTR_ERR(to_mblk); 2439 } 2440 2441 memcpy(to_mblk->data, from_mblk->data, DMZ_BLOCK_SIZE); 2442 dmz_dirty_mblock(zmd, to_mblk); 2443 2444 dmz_release_mblock(zmd, to_mblk); 2445 dmz_release_mblock(zmd, from_mblk); 2446 2447 chunk_block += zmd->zone_bits_per_mblk; 2448 } 2449 2450 to_zone->weight = from_zone->weight; 2451 2452 return 0; 2453 } 2454 2455 /* 2456 * Merge the valid blocks bitmap of from_zone into the bitmap of to_zone, 2457 * starting from chunk_block. 2458 */ 2459 int dmz_merge_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone, 2460 struct dm_zone *to_zone, sector_t chunk_block) 2461 { 2462 unsigned int nr_blocks; 2463 int ret; 2464 2465 /* Get the zones bitmap blocks */ 2466 while (chunk_block < zmd->zone_nr_blocks) { 2467 /* Get a valid region from the source zone */ 2468 ret = dmz_first_valid_block(zmd, from_zone, &chunk_block); 2469 if (ret <= 0) 2470 return ret; 2471 2472 nr_blocks = ret; 2473 ret = dmz_validate_blocks(zmd, to_zone, chunk_block, nr_blocks); 2474 if (ret) 2475 return ret; 2476 2477 chunk_block += nr_blocks; 2478 } 2479 2480 return 0; 2481 } 2482 2483 /* 2484 * Validate all the blocks in the range [block..block+nr_blocks-1]. 2485 */ 2486 int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, 2487 sector_t chunk_block, unsigned int nr_blocks) 2488 { 2489 unsigned int count, bit, nr_bits; 2490 unsigned int zone_nr_blocks = zmd->zone_nr_blocks; 2491 struct dmz_mblock *mblk; 2492 unsigned int n = 0; 2493 2494 dmz_zmd_debug(zmd, "=> VALIDATE zone %u, block %llu, %u blocks", 2495 zone->id, (unsigned long long)chunk_block, 2496 nr_blocks); 2497 2498 WARN_ON(chunk_block + nr_blocks > zone_nr_blocks); 2499 2500 while (nr_blocks) { 2501 /* Get bitmap block */ 2502 mblk = dmz_get_bitmap(zmd, zone, chunk_block); 2503 if (IS_ERR(mblk)) 2504 return PTR_ERR(mblk); 2505 2506 /* Set bits */ 2507 bit = chunk_block & DMZ_BLOCK_MASK_BITS; 2508 nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit); 2509 2510 count = dmz_set_bits((unsigned long *)mblk->data, bit, nr_bits); 2511 if (count) { 2512 dmz_dirty_mblock(zmd, mblk); 2513 n += count; 2514 } 2515 dmz_release_mblock(zmd, mblk); 2516 2517 nr_blocks -= nr_bits; 2518 chunk_block += nr_bits; 2519 } 2520 2521 if (likely(zone->weight + n <= zone_nr_blocks)) 2522 zone->weight += n; 2523 else { 2524 dmz_zmd_warn(zmd, "Zone %u: weight %u should be <= %u", 2525 zone->id, zone->weight, 2526 zone_nr_blocks - n); 2527 zone->weight = zone_nr_blocks; 2528 } 2529 2530 return 0; 2531 } 2532 2533 /* 2534 * Clear nr_bits bits in bitmap starting from bit. 2535 * Return the number of bits cleared. 2536 */ 2537 static int dmz_clear_bits(unsigned long *bitmap, int bit, int nr_bits) 2538 { 2539 unsigned long *addr; 2540 int end = bit + nr_bits; 2541 int n = 0; 2542 2543 while (bit < end) { 2544 if (((bit & (BITS_PER_LONG - 1)) == 0) && 2545 ((end - bit) >= BITS_PER_LONG)) { 2546 /* Try to clear whole word at once */ 2547 addr = bitmap + BIT_WORD(bit); 2548 if (*addr == ULONG_MAX) { 2549 *addr = 0; 2550 n += BITS_PER_LONG; 2551 bit += BITS_PER_LONG; 2552 continue; 2553 } 2554 } 2555 2556 if (test_and_clear_bit(bit, bitmap)) 2557 n++; 2558 bit++; 2559 } 2560 2561 return n; 2562 } 2563 2564 /* 2565 * Invalidate all the blocks in the range [block..block+nr_blocks-1]. 2566 */ 2567 int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, 2568 sector_t chunk_block, unsigned int nr_blocks) 2569 { 2570 unsigned int count, bit, nr_bits; 2571 struct dmz_mblock *mblk; 2572 unsigned int n = 0; 2573 2574 dmz_zmd_debug(zmd, "=> INVALIDATE zone %u, block %llu, %u blocks", 2575 zone->id, (u64)chunk_block, nr_blocks); 2576 2577 WARN_ON(chunk_block + nr_blocks > zmd->zone_nr_blocks); 2578 2579 while (nr_blocks) { 2580 /* Get bitmap block */ 2581 mblk = dmz_get_bitmap(zmd, zone, chunk_block); 2582 if (IS_ERR(mblk)) 2583 return PTR_ERR(mblk); 2584 2585 /* Clear bits */ 2586 bit = chunk_block & DMZ_BLOCK_MASK_BITS; 2587 nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit); 2588 2589 count = dmz_clear_bits((unsigned long *)mblk->data, 2590 bit, nr_bits); 2591 if (count) { 2592 dmz_dirty_mblock(zmd, mblk); 2593 n += count; 2594 } 2595 dmz_release_mblock(zmd, mblk); 2596 2597 nr_blocks -= nr_bits; 2598 chunk_block += nr_bits; 2599 } 2600 2601 if (zone->weight >= n) 2602 zone->weight -= n; 2603 else { 2604 dmz_zmd_warn(zmd, "Zone %u: weight %u should be >= %u", 2605 zone->id, zone->weight, n); 2606 zone->weight = 0; 2607 } 2608 2609 return 0; 2610 } 2611 2612 /* 2613 * Get a block bit value. 2614 */ 2615 static int dmz_test_block(struct dmz_metadata *zmd, struct dm_zone *zone, 2616 sector_t chunk_block) 2617 { 2618 struct dmz_mblock *mblk; 2619 int ret; 2620 2621 WARN_ON(chunk_block >= zmd->zone_nr_blocks); 2622 2623 /* Get bitmap block */ 2624 mblk = dmz_get_bitmap(zmd, zone, chunk_block); 2625 if (IS_ERR(mblk)) 2626 return PTR_ERR(mblk); 2627 2628 /* Get offset */ 2629 ret = test_bit(chunk_block & DMZ_BLOCK_MASK_BITS, 2630 (unsigned long *) mblk->data) != 0; 2631 2632 dmz_release_mblock(zmd, mblk); 2633 2634 return ret; 2635 } 2636 2637 /* 2638 * Return the number of blocks from chunk_block to the first block with a bit 2639 * value specified by set. Search at most nr_blocks blocks from chunk_block. 2640 */ 2641 static int dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone, 2642 sector_t chunk_block, unsigned int nr_blocks, 2643 int set) 2644 { 2645 struct dmz_mblock *mblk; 2646 unsigned int bit, set_bit, nr_bits; 2647 unsigned int zone_bits = zmd->zone_bits_per_mblk; 2648 unsigned long *bitmap; 2649 int n = 0; 2650 2651 WARN_ON(chunk_block + nr_blocks > zmd->zone_nr_blocks); 2652 2653 while (nr_blocks) { 2654 /* Get bitmap block */ 2655 mblk = dmz_get_bitmap(zmd, zone, chunk_block); 2656 if (IS_ERR(mblk)) 2657 return PTR_ERR(mblk); 2658 2659 /* Get offset */ 2660 bitmap = (unsigned long *) mblk->data; 2661 bit = chunk_block & DMZ_BLOCK_MASK_BITS; 2662 nr_bits = min(nr_blocks, zone_bits - bit); 2663 if (set) 2664 set_bit = find_next_bit(bitmap, zone_bits, bit); 2665 else 2666 set_bit = find_next_zero_bit(bitmap, zone_bits, bit); 2667 dmz_release_mblock(zmd, mblk); 2668 2669 n += set_bit - bit; 2670 if (set_bit < zone_bits) 2671 break; 2672 2673 nr_blocks -= nr_bits; 2674 chunk_block += nr_bits; 2675 } 2676 2677 return n; 2678 } 2679 2680 /* 2681 * Test if chunk_block is valid. If it is, the number of consecutive 2682 * valid blocks from chunk_block will be returned. 2683 */ 2684 int dmz_block_valid(struct dmz_metadata *zmd, struct dm_zone *zone, 2685 sector_t chunk_block) 2686 { 2687 int valid; 2688 2689 valid = dmz_test_block(zmd, zone, chunk_block); 2690 if (valid <= 0) 2691 return valid; 2692 2693 /* The block is valid: get the number of valid blocks from block */ 2694 return dmz_to_next_set_block(zmd, zone, chunk_block, 2695 zmd->zone_nr_blocks - chunk_block, 0); 2696 } 2697 2698 /* 2699 * Find the first valid block from @chunk_block in @zone. 2700 * If such a block is found, its number is returned using 2701 * @chunk_block and the total number of valid blocks from @chunk_block 2702 * is returned. 2703 */ 2704 int dmz_first_valid_block(struct dmz_metadata *zmd, struct dm_zone *zone, 2705 sector_t *chunk_block) 2706 { 2707 sector_t start_block = *chunk_block; 2708 int ret; 2709 2710 ret = dmz_to_next_set_block(zmd, zone, start_block, 2711 zmd->zone_nr_blocks - start_block, 1); 2712 if (ret < 0) 2713 return ret; 2714 2715 start_block += ret; 2716 *chunk_block = start_block; 2717 2718 return dmz_to_next_set_block(zmd, zone, start_block, 2719 zmd->zone_nr_blocks - start_block, 0); 2720 } 2721 2722 /* 2723 * Count the number of bits set starting from bit up to bit + nr_bits - 1. 2724 */ 2725 static int dmz_count_bits(void *bitmap, int bit, int nr_bits) 2726 { 2727 unsigned long *addr; 2728 int end = bit + nr_bits; 2729 int n = 0; 2730 2731 while (bit < end) { 2732 if (((bit & (BITS_PER_LONG - 1)) == 0) && 2733 ((end - bit) >= BITS_PER_LONG)) { 2734 addr = (unsigned long *)bitmap + BIT_WORD(bit); 2735 if (*addr == ULONG_MAX) { 2736 n += BITS_PER_LONG; 2737 bit += BITS_PER_LONG; 2738 continue; 2739 } 2740 } 2741 2742 if (test_bit(bit, bitmap)) 2743 n++; 2744 bit++; 2745 } 2746 2747 return n; 2748 } 2749 2750 /* 2751 * Get a zone weight. 2752 */ 2753 static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone) 2754 { 2755 struct dmz_mblock *mblk; 2756 sector_t chunk_block = 0; 2757 unsigned int bit, nr_bits; 2758 unsigned int nr_blocks = zmd->zone_nr_blocks; 2759 void *bitmap; 2760 int n = 0; 2761 2762 while (nr_blocks) { 2763 /* Get bitmap block */ 2764 mblk = dmz_get_bitmap(zmd, zone, chunk_block); 2765 if (IS_ERR(mblk)) { 2766 n = 0; 2767 break; 2768 } 2769 2770 /* Count bits in this block */ 2771 bitmap = mblk->data; 2772 bit = chunk_block & DMZ_BLOCK_MASK_BITS; 2773 nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit); 2774 n += dmz_count_bits(bitmap, bit, nr_bits); 2775 2776 dmz_release_mblock(zmd, mblk); 2777 2778 nr_blocks -= nr_bits; 2779 chunk_block += nr_bits; 2780 } 2781 2782 zone->weight = n; 2783 } 2784 2785 /* 2786 * Cleanup the zoned metadata resources. 2787 */ 2788 static void dmz_cleanup_metadata(struct dmz_metadata *zmd) 2789 { 2790 struct rb_root *root; 2791 struct dmz_mblock *mblk, *next; 2792 int i; 2793 2794 /* Release zone mapping resources */ 2795 if (zmd->map_mblk) { 2796 for (i = 0; i < zmd->nr_map_blocks; i++) 2797 dmz_release_mblock(zmd, zmd->map_mblk[i]); 2798 kfree(zmd->map_mblk); 2799 zmd->map_mblk = NULL; 2800 } 2801 2802 /* Release super blocks */ 2803 for (i = 0; i < 2; i++) { 2804 if (zmd->sb[i].mblk) { 2805 dmz_free_mblock(zmd, zmd->sb[i].mblk); 2806 zmd->sb[i].mblk = NULL; 2807 } 2808 } 2809 2810 /* Free cached blocks */ 2811 while (!list_empty(&zmd->mblk_dirty_list)) { 2812 mblk = list_first_entry(&zmd->mblk_dirty_list, 2813 struct dmz_mblock, link); 2814 dmz_zmd_warn(zmd, "mblock %llu still in dirty list (ref %u)", 2815 (u64)mblk->no, mblk->ref); 2816 list_del_init(&mblk->link); 2817 rb_erase(&mblk->node, &zmd->mblk_rbtree); 2818 dmz_free_mblock(zmd, mblk); 2819 } 2820 2821 while (!list_empty(&zmd->mblk_lru_list)) { 2822 mblk = list_first_entry(&zmd->mblk_lru_list, 2823 struct dmz_mblock, link); 2824 list_del_init(&mblk->link); 2825 rb_erase(&mblk->node, &zmd->mblk_rbtree); 2826 dmz_free_mblock(zmd, mblk); 2827 } 2828 2829 /* Sanity checks: the mblock rbtree should now be empty */ 2830 root = &zmd->mblk_rbtree; 2831 rbtree_postorder_for_each_entry_safe(mblk, next, root, node) { 2832 dmz_zmd_warn(zmd, "mblock %llu ref %u still in rbtree", 2833 (u64)mblk->no, mblk->ref); 2834 mblk->ref = 0; 2835 dmz_free_mblock(zmd, mblk); 2836 } 2837 2838 /* Free the zone descriptors */ 2839 dmz_drop_zones(zmd); 2840 2841 mutex_destroy(&zmd->mblk_flush_lock); 2842 mutex_destroy(&zmd->map_lock); 2843 } 2844 2845 static void dmz_print_dev(struct dmz_metadata *zmd, int num) 2846 { 2847 struct dmz_dev *dev = &zmd->dev[num]; 2848 2849 if (bdev_zoned_model(dev->bdev) == BLK_ZONED_NONE) 2850 dmz_dev_info(dev, "Regular block device"); 2851 else 2852 dmz_dev_info(dev, "Host-%s zoned block device", 2853 bdev_zoned_model(dev->bdev) == BLK_ZONED_HA ? 2854 "aware" : "managed"); 2855 if (zmd->sb_version > 1) { 2856 sector_t sector_offset = 2857 dev->zone_offset << zmd->zone_nr_sectors_shift; 2858 2859 dmz_dev_info(dev, " %llu 512-byte logical sectors (offset %llu)", 2860 (u64)dev->capacity, (u64)sector_offset); 2861 dmz_dev_info(dev, " %u zones of %llu 512-byte logical sectors (offset %llu)", 2862 dev->nr_zones, (u64)zmd->zone_nr_sectors, 2863 (u64)dev->zone_offset); 2864 } else { 2865 dmz_dev_info(dev, " %llu 512-byte logical sectors", 2866 (u64)dev->capacity); 2867 dmz_dev_info(dev, " %u zones of %llu 512-byte logical sectors", 2868 dev->nr_zones, (u64)zmd->zone_nr_sectors); 2869 } 2870 } 2871 2872 /* 2873 * Initialize the zoned metadata. 2874 */ 2875 int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev, 2876 struct dmz_metadata **metadata, 2877 const char *devname) 2878 { 2879 struct dmz_metadata *zmd; 2880 unsigned int i; 2881 struct dm_zone *zone; 2882 int ret; 2883 2884 zmd = kzalloc(sizeof(struct dmz_metadata), GFP_KERNEL); 2885 if (!zmd) 2886 return -ENOMEM; 2887 2888 strcpy(zmd->devname, devname); 2889 zmd->dev = dev; 2890 zmd->nr_devs = num_dev; 2891 zmd->mblk_rbtree = RB_ROOT; 2892 init_rwsem(&zmd->mblk_sem); 2893 mutex_init(&zmd->mblk_flush_lock); 2894 spin_lock_init(&zmd->mblk_lock); 2895 INIT_LIST_HEAD(&zmd->mblk_lru_list); 2896 INIT_LIST_HEAD(&zmd->mblk_dirty_list); 2897 2898 mutex_init(&zmd->map_lock); 2899 2900 atomic_set(&zmd->unmap_nr_cache, 0); 2901 INIT_LIST_HEAD(&zmd->unmap_cache_list); 2902 INIT_LIST_HEAD(&zmd->map_cache_list); 2903 2904 atomic_set(&zmd->nr_reserved_seq_zones, 0); 2905 INIT_LIST_HEAD(&zmd->reserved_seq_zones_list); 2906 2907 init_waitqueue_head(&zmd->free_wq); 2908 2909 /* Initialize zone descriptors */ 2910 ret = dmz_init_zones(zmd); 2911 if (ret) 2912 goto err; 2913 2914 /* Get super block */ 2915 ret = dmz_load_sb(zmd); 2916 if (ret) 2917 goto err; 2918 2919 /* Set metadata zones starting from sb_zone */ 2920 for (i = 0; i < zmd->nr_meta_zones << 1; i++) { 2921 zone = dmz_get(zmd, zmd->sb[0].zone->id + i); 2922 if (!zone) { 2923 dmz_zmd_err(zmd, 2924 "metadata zone %u not present", i); 2925 ret = -ENXIO; 2926 goto err; 2927 } 2928 if (!dmz_is_rnd(zone) && !dmz_is_cache(zone)) { 2929 dmz_zmd_err(zmd, 2930 "metadata zone %d is not random", i); 2931 ret = -ENXIO; 2932 goto err; 2933 } 2934 set_bit(DMZ_META, &zone->flags); 2935 } 2936 /* Load mapping table */ 2937 ret = dmz_load_mapping(zmd); 2938 if (ret) 2939 goto err; 2940 2941 /* 2942 * Cache size boundaries: allow at least 2 super blocks, the chunk map 2943 * blocks and enough blocks to be able to cache the bitmap blocks of 2944 * up to 16 zones when idle (min_nr_mblks). Otherwise, if busy, allow 2945 * the cache to add 512 more metadata blocks. 2946 */ 2947 zmd->min_nr_mblks = 2 + zmd->nr_map_blocks + zmd->zone_nr_bitmap_blocks * 16; 2948 zmd->max_nr_mblks = zmd->min_nr_mblks + 512; 2949 zmd->mblk_shrinker.count_objects = dmz_mblock_shrinker_count; 2950 zmd->mblk_shrinker.scan_objects = dmz_mblock_shrinker_scan; 2951 zmd->mblk_shrinker.seeks = DEFAULT_SEEKS; 2952 2953 /* Metadata cache shrinker */ 2954 ret = register_shrinker(&zmd->mblk_shrinker); 2955 if (ret) { 2956 dmz_zmd_err(zmd, "Register metadata cache shrinker failed"); 2957 goto err; 2958 } 2959 2960 dmz_zmd_info(zmd, "DM-Zoned metadata version %d", zmd->sb_version); 2961 for (i = 0; i < zmd->nr_devs; i++) 2962 dmz_print_dev(zmd, i); 2963 2964 dmz_zmd_info(zmd, " %u zones of %llu 512-byte logical sectors", 2965 zmd->nr_zones, (u64)zmd->zone_nr_sectors); 2966 dmz_zmd_debug(zmd, " %u metadata zones", 2967 zmd->nr_meta_zones * 2); 2968 dmz_zmd_debug(zmd, " %u data zones for %u chunks", 2969 zmd->nr_data_zones, zmd->nr_chunks); 2970 dmz_zmd_debug(zmd, " %u cache zones (%u unmapped)", 2971 zmd->nr_cache, atomic_read(&zmd->unmap_nr_cache)); 2972 for (i = 0; i < zmd->nr_devs; i++) { 2973 dmz_zmd_debug(zmd, " %u random zones (%u unmapped)", 2974 dmz_nr_rnd_zones(zmd, i), 2975 dmz_nr_unmap_rnd_zones(zmd, i)); 2976 dmz_zmd_debug(zmd, " %u sequential zones (%u unmapped)", 2977 dmz_nr_seq_zones(zmd, i), 2978 dmz_nr_unmap_seq_zones(zmd, i)); 2979 } 2980 dmz_zmd_debug(zmd, " %u reserved sequential data zones", 2981 zmd->nr_reserved_seq); 2982 dmz_zmd_debug(zmd, "Format:"); 2983 dmz_zmd_debug(zmd, "%u metadata blocks per set (%u max cache)", 2984 zmd->nr_meta_blocks, zmd->max_nr_mblks); 2985 dmz_zmd_debug(zmd, " %u data zone mapping blocks", 2986 zmd->nr_map_blocks); 2987 dmz_zmd_debug(zmd, " %u bitmap blocks", 2988 zmd->nr_bitmap_blocks); 2989 2990 *metadata = zmd; 2991 2992 return 0; 2993 err: 2994 dmz_cleanup_metadata(zmd); 2995 kfree(zmd); 2996 *metadata = NULL; 2997 2998 return ret; 2999 } 3000 3001 /* 3002 * Cleanup the zoned metadata resources. 3003 */ 3004 void dmz_dtr_metadata(struct dmz_metadata *zmd) 3005 { 3006 unregister_shrinker(&zmd->mblk_shrinker); 3007 dmz_cleanup_metadata(zmd); 3008 kfree(zmd); 3009 } 3010 3011 /* 3012 * Check zone information on resume. 3013 */ 3014 int dmz_resume_metadata(struct dmz_metadata *zmd) 3015 { 3016 struct dm_zone *zone; 3017 sector_t wp_block; 3018 unsigned int i; 3019 int ret; 3020 3021 /* Check zones */ 3022 for (i = 0; i < zmd->nr_zones; i++) { 3023 zone = dmz_get(zmd, i); 3024 if (!zone) { 3025 dmz_zmd_err(zmd, "Unable to get zone %u", i); 3026 return -EIO; 3027 } 3028 wp_block = zone->wp_block; 3029 3030 ret = dmz_update_zone(zmd, zone); 3031 if (ret) { 3032 dmz_zmd_err(zmd, "Broken zone %u", i); 3033 return ret; 3034 } 3035 3036 if (dmz_is_offline(zone)) { 3037 dmz_zmd_warn(zmd, "Zone %u is offline", i); 3038 continue; 3039 } 3040 3041 /* Check write pointer */ 3042 if (!dmz_is_seq(zone)) 3043 zone->wp_block = 0; 3044 else if (zone->wp_block != wp_block) { 3045 dmz_zmd_err(zmd, "Zone %u: Invalid wp (%llu / %llu)", 3046 i, (u64)zone->wp_block, (u64)wp_block); 3047 zone->wp_block = wp_block; 3048 dmz_invalidate_blocks(zmd, zone, zone->wp_block, 3049 zmd->zone_nr_blocks - zone->wp_block); 3050 } 3051 } 3052 3053 return 0; 3054 } 3055