1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 2017 Western Digital Corporation or its affiliates. 4 * 5 * This file is released under the GPL. 6 */ 7 8 #include "dm-zoned.h" 9 10 #include <linux/module.h> 11 #include <linux/crc32.h> 12 #include <linux/sched/mm.h> 13 14 #define DM_MSG_PREFIX "zoned metadata" 15 16 /* 17 * Metadata version. 18 */ 19 #define DMZ_META_VER 1 20 21 /* 22 * On-disk super block magic. 23 */ 24 #define DMZ_MAGIC ((((unsigned int)('D')) << 24) | \ 25 (((unsigned int)('Z')) << 16) | \ 26 (((unsigned int)('B')) << 8) | \ 27 ((unsigned int)('D'))) 28 29 /* 30 * On disk super block. 31 * This uses only 512 B but uses on disk a full 4KB block. This block is 32 * followed on disk by the mapping table of chunks to zones and the bitmap 33 * blocks indicating zone block validity. 34 * The overall resulting metadata format is: 35 * (1) Super block (1 block) 36 * (2) Chunk mapping table (nr_map_blocks) 37 * (3) Bitmap blocks (nr_bitmap_blocks) 38 * All metadata blocks are stored in conventional zones, starting from 39 * the first conventional zone found on disk. 40 */ 41 struct dmz_super { 42 /* Magic number */ 43 __le32 magic; /* 4 */ 44 45 /* Metadata version number */ 46 __le32 version; /* 8 */ 47 48 /* Generation number */ 49 __le64 gen; /* 16 */ 50 51 /* This block number */ 52 __le64 sb_block; /* 24 */ 53 54 /* The number of metadata blocks, including this super block */ 55 __le32 nr_meta_blocks; /* 28 */ 56 57 /* The number of sequential zones reserved for reclaim */ 58 __le32 nr_reserved_seq; /* 32 */ 59 60 /* The number of entries in the mapping table */ 61 __le32 nr_chunks; /* 36 */ 62 63 /* The number of blocks used for the chunk mapping table */ 64 __le32 nr_map_blocks; /* 40 */ 65 66 /* The number of blocks used for the block bitmaps */ 67 __le32 nr_bitmap_blocks; /* 44 */ 68 69 /* Checksum */ 70 __le32 crc; /* 48 */ 71 72 /* Padding to full 512B sector */ 73 u8 reserved[464]; /* 512 */ 74 }; 75 76 /* 77 * Chunk mapping entry: entries are indexed by chunk number 78 * and give the zone ID (dzone_id) mapping the chunk on disk. 79 * This zone may be sequential or random. If it is a sequential 80 * zone, a second zone (bzone_id) used as a write buffer may 81 * also be specified. This second zone will always be a randomly 82 * writeable zone. 83 */ 84 struct dmz_map { 85 __le32 dzone_id; 86 __le32 bzone_id; 87 }; 88 89 /* 90 * Chunk mapping table metadata: 512 8-bytes entries per 4KB block. 91 */ 92 #define DMZ_MAP_ENTRIES (DMZ_BLOCK_SIZE / sizeof(struct dmz_map)) 93 #define DMZ_MAP_ENTRIES_SHIFT (ilog2(DMZ_MAP_ENTRIES)) 94 #define DMZ_MAP_ENTRIES_MASK (DMZ_MAP_ENTRIES - 1) 95 #define DMZ_MAP_UNMAPPED UINT_MAX 96 97 /* 98 * Meta data block descriptor (for cached metadata blocks). 99 */ 100 struct dmz_mblock { 101 struct rb_node node; 102 struct list_head link; 103 sector_t no; 104 unsigned int ref; 105 unsigned long state; 106 struct page *page; 107 void *data; 108 }; 109 110 /* 111 * Metadata block state flags. 112 */ 113 enum { 114 DMZ_META_DIRTY, 115 DMZ_META_READING, 116 DMZ_META_WRITING, 117 DMZ_META_ERROR, 118 }; 119 120 /* 121 * Super block information (one per metadata set). 122 */ 123 struct dmz_sb { 124 sector_t block; 125 struct dmz_mblock *mblk; 126 struct dmz_super *sb; 127 }; 128 129 /* 130 * In-memory metadata. 
131 */ 132 struct dmz_metadata { 133 struct dmz_dev *dev; 134 135 sector_t zone_bitmap_size; 136 unsigned int zone_nr_bitmap_blocks; 137 138 unsigned int nr_bitmap_blocks; 139 unsigned int nr_map_blocks; 140 141 unsigned int nr_useable_zones; 142 unsigned int nr_meta_blocks; 143 unsigned int nr_meta_zones; 144 unsigned int nr_data_zones; 145 unsigned int nr_rnd_zones; 146 unsigned int nr_reserved_seq; 147 unsigned int nr_chunks; 148 149 /* Zone information array */ 150 struct dm_zone *zones; 151 152 struct dm_zone *sb_zone; 153 struct dmz_sb sb[2]; 154 unsigned int mblk_primary; 155 u64 sb_gen; 156 unsigned int min_nr_mblks; 157 unsigned int max_nr_mblks; 158 atomic_t nr_mblks; 159 struct rw_semaphore mblk_sem; 160 struct mutex mblk_flush_lock; 161 spinlock_t mblk_lock; 162 struct rb_root mblk_rbtree; 163 struct list_head mblk_lru_list; 164 struct list_head mblk_dirty_list; 165 struct shrinker mblk_shrinker; 166 167 /* Zone allocation management */ 168 struct mutex map_lock; 169 struct dmz_mblock **map_mblk; 170 unsigned int nr_rnd; 171 atomic_t unmap_nr_rnd; 172 struct list_head unmap_rnd_list; 173 struct list_head map_rnd_list; 174 175 unsigned int nr_seq; 176 atomic_t unmap_nr_seq; 177 struct list_head unmap_seq_list; 178 struct list_head map_seq_list; 179 180 atomic_t nr_reserved_seq_zones; 181 struct list_head reserved_seq_zones_list; 182 183 wait_queue_head_t free_wq; 184 }; 185 186 /* 187 * Various accessors 188 */ 189 unsigned int dmz_id(struct dmz_metadata *zmd, struct dm_zone *zone) 190 { 191 return ((unsigned int)(zone - zmd->zones)); 192 } 193 194 sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone) 195 { 196 return (sector_t)dmz_id(zmd, zone) << zmd->dev->zone_nr_sectors_shift; 197 } 198 199 sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone) 200 { 201 return (sector_t)dmz_id(zmd, zone) << zmd->dev->zone_nr_blocks_shift; 202 } 203 204 unsigned int dmz_nr_chunks(struct dmz_metadata *zmd) 205 { 206 return zmd->nr_chunks; 207 } 208 209 unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd) 210 { 211 return zmd->nr_rnd; 212 } 213 214 unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd) 215 { 216 return atomic_read(&zmd->unmap_nr_rnd); 217 } 218 219 /* 220 * Lock/unlock mapping table. 221 * The map lock also protects all the zone lists. 222 */ 223 void dmz_lock_map(struct dmz_metadata *zmd) 224 { 225 mutex_lock(&zmd->map_lock); 226 } 227 228 void dmz_unlock_map(struct dmz_metadata *zmd) 229 { 230 mutex_unlock(&zmd->map_lock); 231 } 232 233 /* 234 * Lock/unlock metadata access. This is a "read" lock on a semaphore 235 * that prevents metadata flush from running while metadata are being 236 * modified. The actual metadata write mutual exclusion is achieved with 237 * the map lock and zone state management (active and reclaim state are 238 * mutually exclusive). 239 */ 240 void dmz_lock_metadata(struct dmz_metadata *zmd) 241 { 242 down_read(&zmd->mblk_sem); 243 } 244 245 void dmz_unlock_metadata(struct dmz_metadata *zmd) 246 { 247 up_read(&zmd->mblk_sem); 248 } 249 250 /* 251 * Lock/unlock flush: prevent concurrent executions 252 * of dmz_flush_metadata as well as metadata modification in reclaim 253 * while flush is being executed. 254 */ 255 void dmz_lock_flush(struct dmz_metadata *zmd) 256 { 257 mutex_lock(&zmd->mblk_flush_lock); 258 } 259 260 void dmz_unlock_flush(struct dmz_metadata *zmd) 261 { 262 mutex_unlock(&zmd->mblk_flush_lock); 263 } 264 265 /* 266 * Allocate a metadata block. 
267 */ 268 static struct dmz_mblock *dmz_alloc_mblock(struct dmz_metadata *zmd, 269 sector_t mblk_no) 270 { 271 struct dmz_mblock *mblk = NULL; 272 273 /* See if we can reuse cached blocks */ 274 if (zmd->max_nr_mblks && atomic_read(&zmd->nr_mblks) > zmd->max_nr_mblks) { 275 spin_lock(&zmd->mblk_lock); 276 mblk = list_first_entry_or_null(&zmd->mblk_lru_list, 277 struct dmz_mblock, link); 278 if (mblk) { 279 list_del_init(&mblk->link); 280 rb_erase(&mblk->node, &zmd->mblk_rbtree); 281 mblk->no = mblk_no; 282 } 283 spin_unlock(&zmd->mblk_lock); 284 if (mblk) 285 return mblk; 286 } 287 288 /* Allocate a new block */ 289 mblk = kmalloc(sizeof(struct dmz_mblock), GFP_NOIO); 290 if (!mblk) 291 return NULL; 292 293 mblk->page = alloc_page(GFP_NOIO); 294 if (!mblk->page) { 295 kfree(mblk); 296 return NULL; 297 } 298 299 RB_CLEAR_NODE(&mblk->node); 300 INIT_LIST_HEAD(&mblk->link); 301 mblk->ref = 0; 302 mblk->state = 0; 303 mblk->no = mblk_no; 304 mblk->data = page_address(mblk->page); 305 306 atomic_inc(&zmd->nr_mblks); 307 308 return mblk; 309 } 310 311 /* 312 * Free a metadata block. 313 */ 314 static void dmz_free_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk) 315 { 316 __free_pages(mblk->page, 0); 317 kfree(mblk); 318 319 atomic_dec(&zmd->nr_mblks); 320 } 321 322 /* 323 * Insert a metadata block in the rbtree. 324 */ 325 static void dmz_insert_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk) 326 { 327 struct rb_root *root = &zmd->mblk_rbtree; 328 struct rb_node **new = &(root->rb_node), *parent = NULL; 329 struct dmz_mblock *b; 330 331 /* Figure out where to put the new node */ 332 while (*new) { 333 b = container_of(*new, struct dmz_mblock, node); 334 parent = *new; 335 new = (b->no < mblk->no) ? &((*new)->rb_left) : &((*new)->rb_right); 336 } 337 338 /* Add new node and rebalance tree */ 339 rb_link_node(&mblk->node, parent, new); 340 rb_insert_color(&mblk->node, root); 341 } 342 343 /* 344 * Lookup a metadata block in the rbtree. If the block is found, increment 345 * its reference count. 346 */ 347 static struct dmz_mblock *dmz_get_mblock_fast(struct dmz_metadata *zmd, 348 sector_t mblk_no) 349 { 350 struct rb_root *root = &zmd->mblk_rbtree; 351 struct rb_node *node = root->rb_node; 352 struct dmz_mblock *mblk; 353 354 while (node) { 355 mblk = container_of(node, struct dmz_mblock, node); 356 if (mblk->no == mblk_no) { 357 /* 358 * If this is the first reference to the block, 359 * remove it from the LRU list. 360 */ 361 mblk->ref++; 362 if (mblk->ref == 1 && 363 !test_bit(DMZ_META_DIRTY, &mblk->state)) 364 list_del_init(&mblk->link); 365 return mblk; 366 } 367 node = (mblk->no < mblk_no) ? node->rb_left : node->rb_right; 368 } 369 370 return NULL; 371 } 372 373 /* 374 * Metadata block BIO end callback. 375 */ 376 static void dmz_mblock_bio_end_io(struct bio *bio) 377 { 378 struct dmz_mblock *mblk = bio->bi_private; 379 int flag; 380 381 if (bio->bi_status) 382 set_bit(DMZ_META_ERROR, &mblk->state); 383 384 if (bio_op(bio) == REQ_OP_WRITE) 385 flag = DMZ_META_WRITING; 386 else 387 flag = DMZ_META_READING; 388 389 clear_bit_unlock(flag, &mblk->state); 390 smp_mb__after_atomic(); 391 wake_up_bit(&mblk->state, flag); 392 393 bio_put(bio); 394 } 395 396 /* 397 * Read an uncached metadata block from disk and add it to the cache. 
398 */ 399 static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd, 400 sector_t mblk_no) 401 { 402 struct dmz_mblock *mblk, *m; 403 sector_t block = zmd->sb[zmd->mblk_primary].block + mblk_no; 404 struct bio *bio; 405 406 if (dmz_bdev_is_dying(zmd->dev)) 407 return ERR_PTR(-EIO); 408 409 /* Get a new block and a BIO to read it */ 410 mblk = dmz_alloc_mblock(zmd, mblk_no); 411 if (!mblk) 412 return ERR_PTR(-ENOMEM); 413 414 bio = bio_alloc(GFP_NOIO, 1); 415 if (!bio) { 416 dmz_free_mblock(zmd, mblk); 417 return ERR_PTR(-ENOMEM); 418 } 419 420 spin_lock(&zmd->mblk_lock); 421 422 /* 423 * Make sure that another context did not start reading 424 * the block already. 425 */ 426 m = dmz_get_mblock_fast(zmd, mblk_no); 427 if (m) { 428 spin_unlock(&zmd->mblk_lock); 429 dmz_free_mblock(zmd, mblk); 430 bio_put(bio); 431 return m; 432 } 433 434 mblk->ref++; 435 set_bit(DMZ_META_READING, &mblk->state); 436 dmz_insert_mblock(zmd, mblk); 437 438 spin_unlock(&zmd->mblk_lock); 439 440 /* Submit read BIO */ 441 bio->bi_iter.bi_sector = dmz_blk2sect(block); 442 bio_set_dev(bio, zmd->dev->bdev); 443 bio->bi_private = mblk; 444 bio->bi_end_io = dmz_mblock_bio_end_io; 445 bio_set_op_attrs(bio, REQ_OP_READ, REQ_META | REQ_PRIO); 446 bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0); 447 submit_bio(bio); 448 449 return mblk; 450 } 451 452 /* 453 * Free metadata blocks. 454 */ 455 static unsigned long dmz_shrink_mblock_cache(struct dmz_metadata *zmd, 456 unsigned long limit) 457 { 458 struct dmz_mblock *mblk; 459 unsigned long count = 0; 460 461 if (!zmd->max_nr_mblks) 462 return 0; 463 464 while (!list_empty(&zmd->mblk_lru_list) && 465 atomic_read(&zmd->nr_mblks) > zmd->min_nr_mblks && 466 count < limit) { 467 mblk = list_first_entry(&zmd->mblk_lru_list, 468 struct dmz_mblock, link); 469 list_del_init(&mblk->link); 470 rb_erase(&mblk->node, &zmd->mblk_rbtree); 471 dmz_free_mblock(zmd, mblk); 472 count++; 473 } 474 475 return count; 476 } 477 478 /* 479 * For mblock shrinker: get the number of unused metadata blocks in the cache. 480 */ 481 static unsigned long dmz_mblock_shrinker_count(struct shrinker *shrink, 482 struct shrink_control *sc) 483 { 484 struct dmz_metadata *zmd = container_of(shrink, struct dmz_metadata, mblk_shrinker); 485 486 return atomic_read(&zmd->nr_mblks); 487 } 488 489 /* 490 * For mblock shrinker: scan unused metadata blocks and shrink the cache. 491 */ 492 static unsigned long dmz_mblock_shrinker_scan(struct shrinker *shrink, 493 struct shrink_control *sc) 494 { 495 struct dmz_metadata *zmd = container_of(shrink, struct dmz_metadata, mblk_shrinker); 496 unsigned long count; 497 498 spin_lock(&zmd->mblk_lock); 499 count = dmz_shrink_mblock_cache(zmd, sc->nr_to_scan); 500 spin_unlock(&zmd->mblk_lock); 501 502 return count ? count : SHRINK_STOP; 503 } 504 505 /* 506 * Release a metadata block. 507 */ 508 static void dmz_release_mblock(struct dmz_metadata *zmd, 509 struct dmz_mblock *mblk) 510 { 511 512 if (!mblk) 513 return; 514 515 spin_lock(&zmd->mblk_lock); 516 517 mblk->ref--; 518 if (mblk->ref == 0) { 519 if (test_bit(DMZ_META_ERROR, &mblk->state)) { 520 rb_erase(&mblk->node, &zmd->mblk_rbtree); 521 dmz_free_mblock(zmd, mblk); 522 } else if (!test_bit(DMZ_META_DIRTY, &mblk->state)) { 523 list_add_tail(&mblk->link, &zmd->mblk_lru_list); 524 dmz_shrink_mblock_cache(zmd, 1); 525 } 526 } 527 528 spin_unlock(&zmd->mblk_lock); 529 } 530 531 /* 532 * Get a metadata block from the rbtree. If the block 533 * is not present, read it from disk. 
534 */ 535 static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd, 536 sector_t mblk_no) 537 { 538 struct dmz_mblock *mblk; 539 540 /* Check rbtree */ 541 spin_lock(&zmd->mblk_lock); 542 mblk = dmz_get_mblock_fast(zmd, mblk_no); 543 spin_unlock(&zmd->mblk_lock); 544 545 if (!mblk) { 546 /* Cache miss: read the block from disk */ 547 mblk = dmz_get_mblock_slow(zmd, mblk_no); 548 if (IS_ERR(mblk)) 549 return mblk; 550 } 551 552 /* Wait for on-going read I/O and check for error */ 553 wait_on_bit_io(&mblk->state, DMZ_META_READING, 554 TASK_UNINTERRUPTIBLE); 555 if (test_bit(DMZ_META_ERROR, &mblk->state)) { 556 dmz_release_mblock(zmd, mblk); 557 return ERR_PTR(-EIO); 558 } 559 560 return mblk; 561 } 562 563 /* 564 * Mark a metadata block dirty. 565 */ 566 static void dmz_dirty_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk) 567 { 568 spin_lock(&zmd->mblk_lock); 569 if (!test_and_set_bit(DMZ_META_DIRTY, &mblk->state)) 570 list_add_tail(&mblk->link, &zmd->mblk_dirty_list); 571 spin_unlock(&zmd->mblk_lock); 572 } 573 574 /* 575 * Issue a metadata block write BIO. 576 */ 577 static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk, 578 unsigned int set) 579 { 580 sector_t block = zmd->sb[set].block + mblk->no; 581 struct bio *bio; 582 583 if (dmz_bdev_is_dying(zmd->dev)) 584 return -EIO; 585 586 bio = bio_alloc(GFP_NOIO, 1); 587 if (!bio) { 588 set_bit(DMZ_META_ERROR, &mblk->state); 589 return -ENOMEM; 590 } 591 592 set_bit(DMZ_META_WRITING, &mblk->state); 593 594 bio->bi_iter.bi_sector = dmz_blk2sect(block); 595 bio_set_dev(bio, zmd->dev->bdev); 596 bio->bi_private = mblk; 597 bio->bi_end_io = dmz_mblock_bio_end_io; 598 bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META | REQ_PRIO); 599 bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0); 600 submit_bio(bio); 601 602 return 0; 603 } 604 605 /* 606 * Read/write a metadata block. 607 */ 608 static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block, 609 struct page *page) 610 { 611 struct bio *bio; 612 int ret; 613 614 if (dmz_bdev_is_dying(zmd->dev)) 615 return -EIO; 616 617 bio = bio_alloc(GFP_NOIO, 1); 618 if (!bio) 619 return -ENOMEM; 620 621 bio->bi_iter.bi_sector = dmz_blk2sect(block); 622 bio_set_dev(bio, zmd->dev->bdev); 623 bio_set_op_attrs(bio, op, REQ_SYNC | REQ_META | REQ_PRIO); 624 bio_add_page(bio, page, DMZ_BLOCK_SIZE, 0); 625 ret = submit_bio_wait(bio); 626 bio_put(bio); 627 628 return ret; 629 } 630 631 /* 632 * Write super block of the specified metadata set. 
633 */ 634 static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set) 635 { 636 sector_t block = zmd->sb[set].block; 637 struct dmz_mblock *mblk = zmd->sb[set].mblk; 638 struct dmz_super *sb = zmd->sb[set].sb; 639 u64 sb_gen = zmd->sb_gen + 1; 640 int ret; 641 642 sb->magic = cpu_to_le32(DMZ_MAGIC); 643 sb->version = cpu_to_le32(DMZ_META_VER); 644 645 sb->gen = cpu_to_le64(sb_gen); 646 647 sb->sb_block = cpu_to_le64(block); 648 sb->nr_meta_blocks = cpu_to_le32(zmd->nr_meta_blocks); 649 sb->nr_reserved_seq = cpu_to_le32(zmd->nr_reserved_seq); 650 sb->nr_chunks = cpu_to_le32(zmd->nr_chunks); 651 652 sb->nr_map_blocks = cpu_to_le32(zmd->nr_map_blocks); 653 sb->nr_bitmap_blocks = cpu_to_le32(zmd->nr_bitmap_blocks); 654 655 sb->crc = 0; 656 sb->crc = cpu_to_le32(crc32_le(sb_gen, (unsigned char *)sb, DMZ_BLOCK_SIZE)); 657 658 ret = dmz_rdwr_block(zmd, REQ_OP_WRITE, block, mblk->page); 659 if (ret == 0) 660 ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL); 661 662 return ret; 663 } 664 665 /* 666 * Write dirty metadata blocks to the specified set. 667 */ 668 static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd, 669 struct list_head *write_list, 670 unsigned int set) 671 { 672 struct dmz_mblock *mblk; 673 struct blk_plug plug; 674 int ret = 0, nr_mblks_submitted = 0; 675 676 /* Issue writes */ 677 blk_start_plug(&plug); 678 list_for_each_entry(mblk, write_list, link) { 679 ret = dmz_write_mblock(zmd, mblk, set); 680 if (ret) 681 break; 682 nr_mblks_submitted++; 683 } 684 blk_finish_plug(&plug); 685 686 /* Wait for completion */ 687 list_for_each_entry(mblk, write_list, link) { 688 if (!nr_mblks_submitted) 689 break; 690 wait_on_bit_io(&mblk->state, DMZ_META_WRITING, 691 TASK_UNINTERRUPTIBLE); 692 if (test_bit(DMZ_META_ERROR, &mblk->state)) { 693 clear_bit(DMZ_META_ERROR, &mblk->state); 694 ret = -EIO; 695 } 696 nr_mblks_submitted--; 697 } 698 699 /* Flush drive cache (this will also sync data) */ 700 if (ret == 0) 701 ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL); 702 703 return ret; 704 } 705 706 /* 707 * Log dirty metadata blocks. 708 */ 709 static int dmz_log_dirty_mblocks(struct dmz_metadata *zmd, 710 struct list_head *write_list) 711 { 712 unsigned int log_set = zmd->mblk_primary ^ 0x1; 713 int ret; 714 715 /* Write dirty blocks to the log */ 716 ret = dmz_write_dirty_mblocks(zmd, write_list, log_set); 717 if (ret) 718 return ret; 719 720 /* 721 * No error so far: now validate the log by updating the 722 * log index super block generation. 723 */ 724 ret = dmz_write_sb(zmd, log_set); 725 if (ret) 726 return ret; 727 728 return 0; 729 } 730 731 /* 732 * Flush dirty metadata blocks. 733 */ 734 int dmz_flush_metadata(struct dmz_metadata *zmd) 735 { 736 struct dmz_mblock *mblk; 737 struct list_head write_list; 738 int ret; 739 740 if (WARN_ON(!zmd)) 741 return 0; 742 743 INIT_LIST_HEAD(&write_list); 744 745 /* 746 * Make sure that metadata blocks are stable before logging: take 747 * the write lock on the metadata semaphore to prevent target BIOs 748 * from modifying metadata. 749 */ 750 down_write(&zmd->mblk_sem); 751 752 /* 753 * This is called from the target flush work and reclaim work. 754 * Concurrent execution is not allowed. 
755 */ 756 dmz_lock_flush(zmd); 757 758 if (dmz_bdev_is_dying(zmd->dev)) { 759 ret = -EIO; 760 goto out; 761 } 762 763 /* Get dirty blocks */ 764 spin_lock(&zmd->mblk_lock); 765 list_splice_init(&zmd->mblk_dirty_list, &write_list); 766 spin_unlock(&zmd->mblk_lock); 767 768 /* If there are no dirty metadata blocks, just flush the device cache */ 769 if (list_empty(&write_list)) { 770 ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL); 771 goto out; 772 } 773 774 /* 775 * The primary metadata set is still clean. Keep it this way until 776 * all updates are successful in the secondary set. That is, use 777 * the secondary set as a log. 778 */ 779 ret = dmz_log_dirty_mblocks(zmd, &write_list); 780 if (ret) 781 goto out; 782 783 /* 784 * The log is on disk. It is now safe to update in place 785 * in the primary metadata set. 786 */ 787 ret = dmz_write_dirty_mblocks(zmd, &write_list, zmd->mblk_primary); 788 if (ret) 789 goto out; 790 791 ret = dmz_write_sb(zmd, zmd->mblk_primary); 792 if (ret) 793 goto out; 794 795 while (!list_empty(&write_list)) { 796 mblk = list_first_entry(&write_list, struct dmz_mblock, link); 797 list_del_init(&mblk->link); 798 799 spin_lock(&zmd->mblk_lock); 800 clear_bit(DMZ_META_DIRTY, &mblk->state); 801 if (mblk->ref == 0) 802 list_add_tail(&mblk->link, &zmd->mblk_lru_list); 803 spin_unlock(&zmd->mblk_lock); 804 } 805 806 zmd->sb_gen++; 807 out: 808 if (ret && !list_empty(&write_list)) { 809 spin_lock(&zmd->mblk_lock); 810 list_splice(&write_list, &zmd->mblk_dirty_list); 811 spin_unlock(&zmd->mblk_lock); 812 } 813 814 dmz_unlock_flush(zmd); 815 up_write(&zmd->mblk_sem); 816 817 return ret; 818 } 819 820 /* 821 * Check super block. 822 */ 823 static int dmz_check_sb(struct dmz_metadata *zmd, struct dmz_super *sb) 824 { 825 unsigned int nr_meta_zones, nr_data_zones; 826 struct dmz_dev *dev = zmd->dev; 827 u32 crc, stored_crc; 828 u64 gen; 829 830 gen = le64_to_cpu(sb->gen); 831 stored_crc = le32_to_cpu(sb->crc); 832 sb->crc = 0; 833 crc = crc32_le(gen, (unsigned char *)sb, DMZ_BLOCK_SIZE); 834 if (crc != stored_crc) { 835 dmz_dev_err(dev, "Invalid checksum (needed 0x%08x, got 0x%08x)", 836 crc, stored_crc); 837 return -ENXIO; 838 } 839 840 if (le32_to_cpu(sb->magic) != DMZ_MAGIC) { 841 dmz_dev_err(dev, "Invalid meta magic (needed 0x%08x, got 0x%08x)", 842 DMZ_MAGIC, le32_to_cpu(sb->magic)); 843 return -ENXIO; 844 } 845 846 if (le32_to_cpu(sb->version) != DMZ_META_VER) { 847 dmz_dev_err(dev, "Invalid meta version (needed %d, got %d)", 848 DMZ_META_VER, le32_to_cpu(sb->version)); 849 return -ENXIO; 850 } 851 852 nr_meta_zones = (le32_to_cpu(sb->nr_meta_blocks) + dev->zone_nr_blocks - 1) 853 >> dev->zone_nr_blocks_shift; 854 if (!nr_meta_zones || 855 nr_meta_zones >= zmd->nr_rnd_zones) { 856 dmz_dev_err(dev, "Invalid number of metadata blocks"); 857 return -ENXIO; 858 } 859 860 if (!le32_to_cpu(sb->nr_reserved_seq) || 861 le32_to_cpu(sb->nr_reserved_seq) >= (zmd->nr_useable_zones - nr_meta_zones)) { 862 dmz_dev_err(dev, "Invalid number of reserved sequential zones"); 863 return -ENXIO; 864 } 865 866 nr_data_zones = zmd->nr_useable_zones - 867 (nr_meta_zones * 2 + le32_to_cpu(sb->nr_reserved_seq)); 868 if (le32_to_cpu(sb->nr_chunks) > nr_data_zones) { 869 dmz_dev_err(dev, "Invalid number of chunks %u / %u", 870 le32_to_cpu(sb->nr_chunks), nr_data_zones); 871 return -ENXIO; 872 } 873 874 /* OK */ 875 zmd->nr_meta_blocks = le32_to_cpu(sb->nr_meta_blocks); 876 zmd->nr_reserved_seq = le32_to_cpu(sb->nr_reserved_seq); 877 zmd->nr_chunks = le32_to_cpu(sb->nr_chunks); 878 
zmd->nr_map_blocks = le32_to_cpu(sb->nr_map_blocks); 879 zmd->nr_bitmap_blocks = le32_to_cpu(sb->nr_bitmap_blocks); 880 zmd->nr_meta_zones = nr_meta_zones; 881 zmd->nr_data_zones = nr_data_zones; 882 883 return 0; 884 } 885 886 /* 887 * Read the first or second super block from disk. 888 */ 889 static int dmz_read_sb(struct dmz_metadata *zmd, unsigned int set) 890 { 891 return dmz_rdwr_block(zmd, REQ_OP_READ, zmd->sb[set].block, 892 zmd->sb[set].mblk->page); 893 } 894 895 /* 896 * Determine the position of the secondary super blocks on disk. 897 * This is used only if a corruption of the primary super block 898 * is detected. 899 */ 900 static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd) 901 { 902 unsigned int zone_nr_blocks = zmd->dev->zone_nr_blocks; 903 struct dmz_mblock *mblk; 904 int i; 905 906 /* Allocate a block */ 907 mblk = dmz_alloc_mblock(zmd, 0); 908 if (!mblk) 909 return -ENOMEM; 910 911 zmd->sb[1].mblk = mblk; 912 zmd->sb[1].sb = mblk->data; 913 914 /* Bad first super block: search for the second one */ 915 zmd->sb[1].block = zmd->sb[0].block + zone_nr_blocks; 916 for (i = 0; i < zmd->nr_rnd_zones - 1; i++) { 917 if (dmz_read_sb(zmd, 1) != 0) 918 break; 919 if (le32_to_cpu(zmd->sb[1].sb->magic) == DMZ_MAGIC) 920 return 0; 921 zmd->sb[1].block += zone_nr_blocks; 922 } 923 924 dmz_free_mblock(zmd, mblk); 925 zmd->sb[1].mblk = NULL; 926 927 return -EIO; 928 } 929 930 /* 931 * Read the first or second super block from disk. 932 */ 933 static int dmz_get_sb(struct dmz_metadata *zmd, unsigned int set) 934 { 935 struct dmz_mblock *mblk; 936 int ret; 937 938 /* Allocate a block */ 939 mblk = dmz_alloc_mblock(zmd, 0); 940 if (!mblk) 941 return -ENOMEM; 942 943 zmd->sb[set].mblk = mblk; 944 zmd->sb[set].sb = mblk->data; 945 946 /* Read super block */ 947 ret = dmz_read_sb(zmd, set); 948 if (ret) { 949 dmz_free_mblock(zmd, mblk); 950 zmd->sb[set].mblk = NULL; 951 return ret; 952 } 953 954 return 0; 955 } 956 957 /* 958 * Recover a metadata set. 959 */ 960 static int dmz_recover_mblocks(struct dmz_metadata *zmd, unsigned int dst_set) 961 { 962 unsigned int src_set = dst_set ^ 0x1; 963 struct page *page; 964 int i, ret; 965 966 dmz_dev_warn(zmd->dev, "Metadata set %u invalid: recovering", dst_set); 967 968 if (dst_set == 0) 969 zmd->sb[0].block = dmz_start_block(zmd, zmd->sb_zone); 970 else { 971 zmd->sb[1].block = zmd->sb[0].block + 972 (zmd->nr_meta_zones << zmd->dev->zone_nr_blocks_shift); 973 } 974 975 page = alloc_page(GFP_NOIO); 976 if (!page) 977 return -ENOMEM; 978 979 /* Copy metadata blocks */ 980 for (i = 1; i < zmd->nr_meta_blocks; i++) { 981 ret = dmz_rdwr_block(zmd, REQ_OP_READ, 982 zmd->sb[src_set].block + i, page); 983 if (ret) 984 goto out; 985 ret = dmz_rdwr_block(zmd, REQ_OP_WRITE, 986 zmd->sb[dst_set].block + i, page); 987 if (ret) 988 goto out; 989 } 990 991 /* Finalize with the super block */ 992 if (!zmd->sb[dst_set].mblk) { 993 zmd->sb[dst_set].mblk = dmz_alloc_mblock(zmd, 0); 994 if (!zmd->sb[dst_set].mblk) { 995 ret = -ENOMEM; 996 goto out; 997 } 998 zmd->sb[dst_set].sb = zmd->sb[dst_set].mblk->data; 999 } 1000 1001 ret = dmz_write_sb(zmd, dst_set); 1002 out: 1003 __free_pages(page, 0); 1004 1005 return ret; 1006 } 1007 1008 /* 1009 * Get super block from disk. 
1010 */ 1011 static int dmz_load_sb(struct dmz_metadata *zmd) 1012 { 1013 bool sb_good[2] = {false, false}; 1014 u64 sb_gen[2] = {0, 0}; 1015 int ret; 1016 1017 /* Read and check the primary super block */ 1018 zmd->sb[0].block = dmz_start_block(zmd, zmd->sb_zone); 1019 ret = dmz_get_sb(zmd, 0); 1020 if (ret) { 1021 dmz_dev_err(zmd->dev, "Read primary super block failed"); 1022 return ret; 1023 } 1024 1025 ret = dmz_check_sb(zmd, zmd->sb[0].sb); 1026 1027 /* Read and check secondary super block */ 1028 if (ret == 0) { 1029 sb_good[0] = true; 1030 zmd->sb[1].block = zmd->sb[0].block + 1031 (zmd->nr_meta_zones << zmd->dev->zone_nr_blocks_shift); 1032 ret = dmz_get_sb(zmd, 1); 1033 } else 1034 ret = dmz_lookup_secondary_sb(zmd); 1035 1036 if (ret) { 1037 dmz_dev_err(zmd->dev, "Read secondary super block failed"); 1038 return ret; 1039 } 1040 1041 ret = dmz_check_sb(zmd, zmd->sb[1].sb); 1042 if (ret == 0) 1043 sb_good[1] = true; 1044 1045 /* Use highest generation sb first */ 1046 if (!sb_good[0] && !sb_good[1]) { 1047 dmz_dev_err(zmd->dev, "No valid super block found"); 1048 return -EIO; 1049 } 1050 1051 if (sb_good[0]) 1052 sb_gen[0] = le64_to_cpu(zmd->sb[0].sb->gen); 1053 else 1054 ret = dmz_recover_mblocks(zmd, 0); 1055 1056 if (sb_good[1]) 1057 sb_gen[1] = le64_to_cpu(zmd->sb[1].sb->gen); 1058 else 1059 ret = dmz_recover_mblocks(zmd, 1); 1060 1061 if (ret) { 1062 dmz_dev_err(zmd->dev, "Recovery failed"); 1063 return -EIO; 1064 } 1065 1066 if (sb_gen[0] >= sb_gen[1]) { 1067 zmd->sb_gen = sb_gen[0]; 1068 zmd->mblk_primary = 0; 1069 } else { 1070 zmd->sb_gen = sb_gen[1]; 1071 zmd->mblk_primary = 1; 1072 } 1073 1074 dmz_dev_debug(zmd->dev, "Using super block %u (gen %llu)", 1075 zmd->mblk_primary, zmd->sb_gen); 1076 1077 return 0; 1078 } 1079 1080 /* 1081 * Initialize a zone descriptor. 1082 */ 1083 static int dmz_init_zone(struct dmz_metadata *zmd, struct dm_zone *zone, 1084 struct blk_zone *blkz) 1085 { 1086 struct dmz_dev *dev = zmd->dev; 1087 1088 /* Ignore the eventual last runt (smaller) zone */ 1089 if (blkz->len != dev->zone_nr_sectors) { 1090 if (blkz->start + blkz->len == dev->capacity) 1091 return 0; 1092 return -ENXIO; 1093 } 1094 1095 INIT_LIST_HEAD(&zone->link); 1096 atomic_set(&zone->refcount, 0); 1097 zone->chunk = DMZ_MAP_UNMAPPED; 1098 1099 if (blkz->type == BLK_ZONE_TYPE_CONVENTIONAL) { 1100 set_bit(DMZ_RND, &zone->flags); 1101 zmd->nr_rnd_zones++; 1102 } else if (blkz->type == BLK_ZONE_TYPE_SEQWRITE_REQ || 1103 blkz->type == BLK_ZONE_TYPE_SEQWRITE_PREF) { 1104 set_bit(DMZ_SEQ, &zone->flags); 1105 } else 1106 return -ENXIO; 1107 1108 if (blkz->cond == BLK_ZONE_COND_OFFLINE) 1109 set_bit(DMZ_OFFLINE, &zone->flags); 1110 else if (blkz->cond == BLK_ZONE_COND_READONLY) 1111 set_bit(DMZ_READ_ONLY, &zone->flags); 1112 1113 if (dmz_is_rnd(zone)) 1114 zone->wp_block = 0; 1115 else 1116 zone->wp_block = dmz_sect2blk(blkz->wp - blkz->start); 1117 1118 if (!dmz_is_offline(zone) && !dmz_is_readonly(zone)) { 1119 zmd->nr_useable_zones++; 1120 if (dmz_is_rnd(zone)) { 1121 zmd->nr_rnd_zones++; 1122 if (!zmd->sb_zone) { 1123 /* Super block zone */ 1124 zmd->sb_zone = zone; 1125 } 1126 } 1127 } 1128 1129 return 0; 1130 } 1131 1132 /* 1133 * Free zones descriptors. 1134 */ 1135 static void dmz_drop_zones(struct dmz_metadata *zmd) 1136 { 1137 kfree(zmd->zones); 1138 zmd->zones = NULL; 1139 } 1140 1141 /* 1142 * The size of a zone report in number of zones. 1143 * This results in 4096*64B=256KB report zones commands. 
1144 */ 1145 #define DMZ_REPORT_NR_ZONES 4096 1146 1147 /* 1148 * Allocate and initialize zone descriptors using the zone 1149 * information from disk. 1150 */ 1151 static int dmz_init_zones(struct dmz_metadata *zmd) 1152 { 1153 struct dmz_dev *dev = zmd->dev; 1154 struct dm_zone *zone; 1155 struct blk_zone *blkz; 1156 unsigned int nr_blkz; 1157 sector_t sector = 0; 1158 int i, ret = 0; 1159 1160 /* Init */ 1161 zmd->zone_bitmap_size = dev->zone_nr_blocks >> 3; 1162 zmd->zone_nr_bitmap_blocks = zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT; 1163 1164 /* Allocate zone array */ 1165 zmd->zones = kcalloc(dev->nr_zones, sizeof(struct dm_zone), GFP_KERNEL); 1166 if (!zmd->zones) 1167 return -ENOMEM; 1168 1169 dmz_dev_info(dev, "Using %zu B for zone information", 1170 sizeof(struct dm_zone) * dev->nr_zones); 1171 1172 /* Get zone information */ 1173 nr_blkz = DMZ_REPORT_NR_ZONES; 1174 blkz = kcalloc(nr_blkz, sizeof(struct blk_zone), GFP_KERNEL); 1175 if (!blkz) { 1176 ret = -ENOMEM; 1177 goto out; 1178 } 1179 1180 /* 1181 * Get zone information and initialize zone descriptors. 1182 * At the same time, determine where the super block 1183 * should be: first block of the first randomly writable 1184 * zone. 1185 */ 1186 zone = zmd->zones; 1187 while (sector < dev->capacity) { 1188 /* Get zone information */ 1189 nr_blkz = DMZ_REPORT_NR_ZONES; 1190 ret = blkdev_report_zones(dev->bdev, sector, blkz, &nr_blkz); 1191 if (ret) { 1192 dmz_dev_err(dev, "Report zones failed %d", ret); 1193 goto out; 1194 } 1195 1196 if (!nr_blkz) 1197 break; 1198 1199 /* Process report */ 1200 for (i = 0; i < nr_blkz; i++) { 1201 ret = dmz_init_zone(zmd, zone, &blkz[i]); 1202 if (ret) 1203 goto out; 1204 sector += dev->zone_nr_sectors; 1205 zone++; 1206 } 1207 } 1208 1209 /* The entire zone configuration of the disk should now be known */ 1210 if (sector < dev->capacity) { 1211 dmz_dev_err(dev, "Failed to get correct zone information"); 1212 ret = -ENXIO; 1213 } 1214 out: 1215 kfree(blkz); 1216 if (ret) 1217 dmz_drop_zones(zmd); 1218 1219 return ret; 1220 } 1221 1222 /* 1223 * Update a zone information. 1224 */ 1225 static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone) 1226 { 1227 unsigned int nr_blkz = 1; 1228 unsigned int noio_flag; 1229 struct blk_zone blkz; 1230 int ret; 1231 1232 /* 1233 * Get zone information from disk. Since blkdev_report_zones() uses 1234 * GFP_KERNEL by default for memory allocations, set the per-task 1235 * PF_MEMALLOC_NOIO flag so that all allocations are done as if 1236 * GFP_NOIO was specified. 1237 */ 1238 noio_flag = memalloc_noio_save(); 1239 ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone), 1240 &blkz, &nr_blkz); 1241 memalloc_noio_restore(noio_flag); 1242 if (!nr_blkz) 1243 ret = -EIO; 1244 if (ret) { 1245 dmz_dev_err(zmd->dev, "Get zone %u report failed", 1246 dmz_id(zmd, zone)); 1247 return ret; 1248 } 1249 1250 clear_bit(DMZ_OFFLINE, &zone->flags); 1251 clear_bit(DMZ_READ_ONLY, &zone->flags); 1252 if (blkz.cond == BLK_ZONE_COND_OFFLINE) 1253 set_bit(DMZ_OFFLINE, &zone->flags); 1254 else if (blkz.cond == BLK_ZONE_COND_READONLY) 1255 set_bit(DMZ_READ_ONLY, &zone->flags); 1256 1257 if (dmz_is_seq(zone)) 1258 zone->wp_block = dmz_sect2blk(blkz.wp - blkz.start); 1259 else 1260 zone->wp_block = 0; 1261 1262 return 0; 1263 } 1264 1265 /* 1266 * Check a zone write pointer position when the zone is marked 1267 * with the sequential write error flag. 
1268 */ 1269 static int dmz_handle_seq_write_err(struct dmz_metadata *zmd, 1270 struct dm_zone *zone) 1271 { 1272 unsigned int wp = 0; 1273 int ret; 1274 1275 wp = zone->wp_block; 1276 ret = dmz_update_zone(zmd, zone); 1277 if (ret) 1278 return ret; 1279 1280 dmz_dev_warn(zmd->dev, "Processing zone %u write error (zone wp %u/%u)", 1281 dmz_id(zmd, zone), zone->wp_block, wp); 1282 1283 if (zone->wp_block < wp) { 1284 dmz_invalidate_blocks(zmd, zone, zone->wp_block, 1285 wp - zone->wp_block); 1286 } 1287 1288 return 0; 1289 } 1290 1291 static struct dm_zone *dmz_get(struct dmz_metadata *zmd, unsigned int zone_id) 1292 { 1293 return &zmd->zones[zone_id]; 1294 } 1295 1296 /* 1297 * Reset a zone write pointer. 1298 */ 1299 static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone) 1300 { 1301 int ret; 1302 1303 /* 1304 * Ignore offline zones, read only zones, 1305 * and conventional zones. 1306 */ 1307 if (dmz_is_offline(zone) || 1308 dmz_is_readonly(zone) || 1309 dmz_is_rnd(zone)) 1310 return 0; 1311 1312 if (!dmz_is_empty(zone) || dmz_seq_write_err(zone)) { 1313 struct dmz_dev *dev = zmd->dev; 1314 1315 ret = blkdev_reset_zones(dev->bdev, 1316 dmz_start_sect(zmd, zone), 1317 dev->zone_nr_sectors, GFP_NOIO); 1318 if (ret) { 1319 dmz_dev_err(dev, "Reset zone %u failed %d", 1320 dmz_id(zmd, zone), ret); 1321 return ret; 1322 } 1323 } 1324 1325 /* Clear write error bit and rewind write pointer position */ 1326 clear_bit(DMZ_SEQ_WRITE_ERR, &zone->flags); 1327 zone->wp_block = 0; 1328 1329 return 0; 1330 } 1331 1332 static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone); 1333 1334 /* 1335 * Initialize chunk mapping. 1336 */ 1337 static int dmz_load_mapping(struct dmz_metadata *zmd) 1338 { 1339 struct dmz_dev *dev = zmd->dev; 1340 struct dm_zone *dzone, *bzone; 1341 struct dmz_mblock *dmap_mblk = NULL; 1342 struct dmz_map *dmap; 1343 unsigned int i = 0, e = 0, chunk = 0; 1344 unsigned int dzone_id; 1345 unsigned int bzone_id; 1346 1347 /* Metadata block array for the chunk mapping table */ 1348 zmd->map_mblk = kcalloc(zmd->nr_map_blocks, 1349 sizeof(struct dmz_mblk *), GFP_KERNEL); 1350 if (!zmd->map_mblk) 1351 return -ENOMEM; 1352 1353 /* Get chunk mapping table blocks and initialize zone mapping */ 1354 while (chunk < zmd->nr_chunks) { 1355 if (!dmap_mblk) { 1356 /* Get mapping block */ 1357 dmap_mblk = dmz_get_mblock(zmd, i + 1); 1358 if (IS_ERR(dmap_mblk)) 1359 return PTR_ERR(dmap_mblk); 1360 zmd->map_mblk[i] = dmap_mblk; 1361 dmap = (struct dmz_map *) dmap_mblk->data; 1362 i++; 1363 e = 0; 1364 } 1365 1366 /* Check data zone */ 1367 dzone_id = le32_to_cpu(dmap[e].dzone_id); 1368 if (dzone_id == DMZ_MAP_UNMAPPED) 1369 goto next; 1370 1371 if (dzone_id >= dev->nr_zones) { 1372 dmz_dev_err(dev, "Chunk %u mapping: invalid data zone ID %u", 1373 chunk, dzone_id); 1374 return -EIO; 1375 } 1376 1377 dzone = dmz_get(zmd, dzone_id); 1378 set_bit(DMZ_DATA, &dzone->flags); 1379 dzone->chunk = chunk; 1380 dmz_get_zone_weight(zmd, dzone); 1381 1382 if (dmz_is_rnd(dzone)) 1383 list_add_tail(&dzone->link, &zmd->map_rnd_list); 1384 else 1385 list_add_tail(&dzone->link, &zmd->map_seq_list); 1386 1387 /* Check buffer zone */ 1388 bzone_id = le32_to_cpu(dmap[e].bzone_id); 1389 if (bzone_id == DMZ_MAP_UNMAPPED) 1390 goto next; 1391 1392 if (bzone_id >= dev->nr_zones) { 1393 dmz_dev_err(dev, "Chunk %u mapping: invalid buffer zone ID %u", 1394 chunk, bzone_id); 1395 return -EIO; 1396 } 1397 1398 bzone = dmz_get(zmd, bzone_id); 1399 if (!dmz_is_rnd(bzone)) { 1400 
dmz_dev_err(dev, "Chunk %u mapping: invalid buffer zone %u", 1401 chunk, bzone_id); 1402 return -EIO; 1403 } 1404 1405 set_bit(DMZ_DATA, &bzone->flags); 1406 set_bit(DMZ_BUF, &bzone->flags); 1407 bzone->chunk = chunk; 1408 bzone->bzone = dzone; 1409 dzone->bzone = bzone; 1410 dmz_get_zone_weight(zmd, bzone); 1411 list_add_tail(&bzone->link, &zmd->map_rnd_list); 1412 next: 1413 chunk++; 1414 e++; 1415 if (e >= DMZ_MAP_ENTRIES) 1416 dmap_mblk = NULL; 1417 } 1418 1419 /* 1420 * At this point, only meta zones and mapped data zones were 1421 * fully initialized. All remaining zones are unmapped data 1422 * zones. Finish initializing those here. 1423 */ 1424 for (i = 0; i < dev->nr_zones; i++) { 1425 dzone = dmz_get(zmd, i); 1426 if (dmz_is_meta(dzone)) 1427 continue; 1428 1429 if (dmz_is_rnd(dzone)) 1430 zmd->nr_rnd++; 1431 else 1432 zmd->nr_seq++; 1433 1434 if (dmz_is_data(dzone)) { 1435 /* Already initialized */ 1436 continue; 1437 } 1438 1439 /* Unmapped data zone */ 1440 set_bit(DMZ_DATA, &dzone->flags); 1441 dzone->chunk = DMZ_MAP_UNMAPPED; 1442 if (dmz_is_rnd(dzone)) { 1443 list_add_tail(&dzone->link, &zmd->unmap_rnd_list); 1444 atomic_inc(&zmd->unmap_nr_rnd); 1445 } else if (atomic_read(&zmd->nr_reserved_seq_zones) < zmd->nr_reserved_seq) { 1446 list_add_tail(&dzone->link, &zmd->reserved_seq_zones_list); 1447 atomic_inc(&zmd->nr_reserved_seq_zones); 1448 zmd->nr_seq--; 1449 } else { 1450 list_add_tail(&dzone->link, &zmd->unmap_seq_list); 1451 atomic_inc(&zmd->unmap_nr_seq); 1452 } 1453 } 1454 1455 return 0; 1456 } 1457 1458 /* 1459 * Set a data chunk mapping. 1460 */ 1461 static void dmz_set_chunk_mapping(struct dmz_metadata *zmd, unsigned int chunk, 1462 unsigned int dzone_id, unsigned int bzone_id) 1463 { 1464 struct dmz_mblock *dmap_mblk = zmd->map_mblk[chunk >> DMZ_MAP_ENTRIES_SHIFT]; 1465 struct dmz_map *dmap = (struct dmz_map *) dmap_mblk->data; 1466 int map_idx = chunk & DMZ_MAP_ENTRIES_MASK; 1467 1468 dmap[map_idx].dzone_id = cpu_to_le32(dzone_id); 1469 dmap[map_idx].bzone_id = cpu_to_le32(bzone_id); 1470 dmz_dirty_mblock(zmd, dmap_mblk); 1471 } 1472 1473 /* 1474 * The list of mapped zones is maintained in LRU order. 1475 * This rotates a zone at the end of its map list. 1476 */ 1477 static void __dmz_lru_zone(struct dmz_metadata *zmd, struct dm_zone *zone) 1478 { 1479 if (list_empty(&zone->link)) 1480 return; 1481 1482 list_del_init(&zone->link); 1483 if (dmz_is_seq(zone)) { 1484 /* LRU rotate sequential zone */ 1485 list_add_tail(&zone->link, &zmd->map_seq_list); 1486 } else { 1487 /* LRU rotate random zone */ 1488 list_add_tail(&zone->link, &zmd->map_rnd_list); 1489 } 1490 } 1491 1492 /* 1493 * The list of mapped random zones is maintained 1494 * in LRU order. This rotates a zone at the end of the list. 1495 */ 1496 static void dmz_lru_zone(struct dmz_metadata *zmd, struct dm_zone *zone) 1497 { 1498 __dmz_lru_zone(zmd, zone); 1499 if (zone->bzone) 1500 __dmz_lru_zone(zmd, zone->bzone); 1501 } 1502 1503 /* 1504 * Wait for any zone to be freed. 1505 */ 1506 static void dmz_wait_for_free_zones(struct dmz_metadata *zmd) 1507 { 1508 DEFINE_WAIT(wait); 1509 1510 prepare_to_wait(&zmd->free_wq, &wait, TASK_UNINTERRUPTIBLE); 1511 dmz_unlock_map(zmd); 1512 dmz_unlock_metadata(zmd); 1513 1514 io_schedule_timeout(HZ); 1515 1516 dmz_lock_metadata(zmd); 1517 dmz_lock_map(zmd); 1518 finish_wait(&zmd->free_wq, &wait); 1519 } 1520 1521 /* 1522 * Lock a zone for reclaim (set the zone RECLAIM bit). 
1523 * Returns false if the zone cannot be locked or if it is already locked 1524 * and 1 otherwise. 1525 */ 1526 int dmz_lock_zone_reclaim(struct dm_zone *zone) 1527 { 1528 /* Active zones cannot be reclaimed */ 1529 if (dmz_is_active(zone)) 1530 return 0; 1531 1532 return !test_and_set_bit(DMZ_RECLAIM, &zone->flags); 1533 } 1534 1535 /* 1536 * Clear a zone reclaim flag. 1537 */ 1538 void dmz_unlock_zone_reclaim(struct dm_zone *zone) 1539 { 1540 WARN_ON(dmz_is_active(zone)); 1541 WARN_ON(!dmz_in_reclaim(zone)); 1542 1543 clear_bit_unlock(DMZ_RECLAIM, &zone->flags); 1544 smp_mb__after_atomic(); 1545 wake_up_bit(&zone->flags, DMZ_RECLAIM); 1546 } 1547 1548 /* 1549 * Wait for a zone reclaim to complete. 1550 */ 1551 static void dmz_wait_for_reclaim(struct dmz_metadata *zmd, struct dm_zone *zone) 1552 { 1553 dmz_unlock_map(zmd); 1554 dmz_unlock_metadata(zmd); 1555 wait_on_bit_timeout(&zone->flags, DMZ_RECLAIM, TASK_UNINTERRUPTIBLE, HZ); 1556 dmz_lock_metadata(zmd); 1557 dmz_lock_map(zmd); 1558 } 1559 1560 /* 1561 * Select a random write zone for reclaim. 1562 */ 1563 static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd) 1564 { 1565 struct dm_zone *dzone = NULL; 1566 struct dm_zone *zone; 1567 1568 if (list_empty(&zmd->map_rnd_list)) 1569 return ERR_PTR(-EBUSY); 1570 1571 list_for_each_entry(zone, &zmd->map_rnd_list, link) { 1572 if (dmz_is_buf(zone)) 1573 dzone = zone->bzone; 1574 else 1575 dzone = zone; 1576 if (dmz_lock_zone_reclaim(dzone)) 1577 return dzone; 1578 } 1579 1580 return ERR_PTR(-EBUSY); 1581 } 1582 1583 /* 1584 * Select a buffered sequential zone for reclaim. 1585 */ 1586 static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd) 1587 { 1588 struct dm_zone *zone; 1589 1590 if (list_empty(&zmd->map_seq_list)) 1591 return ERR_PTR(-EBUSY); 1592 1593 list_for_each_entry(zone, &zmd->map_seq_list, link) { 1594 if (!zone->bzone) 1595 continue; 1596 if (dmz_lock_zone_reclaim(zone)) 1597 return zone; 1598 } 1599 1600 return ERR_PTR(-EBUSY); 1601 } 1602 1603 /* 1604 * Select a zone for reclaim. 1605 */ 1606 struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd) 1607 { 1608 struct dm_zone *zone; 1609 1610 /* 1611 * Search for a zone candidate to reclaim: 2 cases are possible. 1612 * (1) There is no free sequential zones. Then a random data zone 1613 * cannot be reclaimed. So choose a sequential zone to reclaim so 1614 * that afterward a random zone can be reclaimed. 1615 * (2) At least one free sequential zone is available, then choose 1616 * the oldest random zone (data or buffer) that can be locked. 1617 */ 1618 dmz_lock_map(zmd); 1619 if (list_empty(&zmd->reserved_seq_zones_list)) 1620 zone = dmz_get_seq_zone_for_reclaim(zmd); 1621 else 1622 zone = dmz_get_rnd_zone_for_reclaim(zmd); 1623 dmz_unlock_map(zmd); 1624 1625 return zone; 1626 } 1627 1628 /* 1629 * Get the zone mapping a chunk, if the chunk is mapped already. 1630 * If no mapping exist and the operation is WRITE, a zone is 1631 * allocated and used to map the chunk. 1632 * The zone returned will be set to the active state. 
1633 */ 1634 struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd, unsigned int chunk, int op) 1635 { 1636 struct dmz_mblock *dmap_mblk = zmd->map_mblk[chunk >> DMZ_MAP_ENTRIES_SHIFT]; 1637 struct dmz_map *dmap = (struct dmz_map *) dmap_mblk->data; 1638 int dmap_idx = chunk & DMZ_MAP_ENTRIES_MASK; 1639 unsigned int dzone_id; 1640 struct dm_zone *dzone = NULL; 1641 int ret = 0; 1642 1643 dmz_lock_map(zmd); 1644 again: 1645 /* Get the chunk mapping */ 1646 dzone_id = le32_to_cpu(dmap[dmap_idx].dzone_id); 1647 if (dzone_id == DMZ_MAP_UNMAPPED) { 1648 /* 1649 * Read or discard in unmapped chunks are fine. But for 1650 * writes, we need a mapping, so get one. 1651 */ 1652 if (op != REQ_OP_WRITE) 1653 goto out; 1654 1655 /* Allocate a random zone */ 1656 dzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND); 1657 if (!dzone) { 1658 if (dmz_bdev_is_dying(zmd->dev)) { 1659 dzone = ERR_PTR(-EIO); 1660 goto out; 1661 } 1662 dmz_wait_for_free_zones(zmd); 1663 goto again; 1664 } 1665 1666 dmz_map_zone(zmd, dzone, chunk); 1667 1668 } else { 1669 /* The chunk is already mapped: get the mapping zone */ 1670 dzone = dmz_get(zmd, dzone_id); 1671 if (dzone->chunk != chunk) { 1672 dzone = ERR_PTR(-EIO); 1673 goto out; 1674 } 1675 1676 /* Repair write pointer if the sequential dzone has error */ 1677 if (dmz_seq_write_err(dzone)) { 1678 ret = dmz_handle_seq_write_err(zmd, dzone); 1679 if (ret) { 1680 dzone = ERR_PTR(-EIO); 1681 goto out; 1682 } 1683 clear_bit(DMZ_SEQ_WRITE_ERR, &dzone->flags); 1684 } 1685 } 1686 1687 /* 1688 * If the zone is being reclaimed, the chunk mapping may change 1689 * to a different zone. So wait for reclaim and retry. Otherwise, 1690 * activate the zone (this will prevent reclaim from touching it). 1691 */ 1692 if (dmz_in_reclaim(dzone)) { 1693 dmz_wait_for_reclaim(zmd, dzone); 1694 goto again; 1695 } 1696 dmz_activate_zone(dzone); 1697 dmz_lru_zone(zmd, dzone); 1698 out: 1699 dmz_unlock_map(zmd); 1700 1701 return dzone; 1702 } 1703 1704 /* 1705 * Write and discard change the block validity of data zones and their buffer 1706 * zones. Check here that valid blocks are still present. If all blocks are 1707 * invalid, the zones can be unmapped on the fly without waiting for reclaim 1708 * to do it. 1709 */ 1710 void dmz_put_chunk_mapping(struct dmz_metadata *zmd, struct dm_zone *dzone) 1711 { 1712 struct dm_zone *bzone; 1713 1714 dmz_lock_map(zmd); 1715 1716 bzone = dzone->bzone; 1717 if (bzone) { 1718 if (dmz_weight(bzone)) 1719 dmz_lru_zone(zmd, bzone); 1720 else { 1721 /* Empty buffer zone: reclaim it */ 1722 dmz_unmap_zone(zmd, bzone); 1723 dmz_free_zone(zmd, bzone); 1724 bzone = NULL; 1725 } 1726 } 1727 1728 /* Deactivate the data zone */ 1729 dmz_deactivate_zone(dzone); 1730 if (dmz_is_active(dzone) || bzone || dmz_weight(dzone)) 1731 dmz_lru_zone(zmd, dzone); 1732 else { 1733 /* Unbuffered inactive empty data zone: reclaim it */ 1734 dmz_unmap_zone(zmd, dzone); 1735 dmz_free_zone(zmd, dzone); 1736 } 1737 1738 dmz_unlock_map(zmd); 1739 } 1740 1741 /* 1742 * Allocate and map a random zone to buffer a chunk 1743 * already mapped to a sequential zone. 
1744 */ 1745 struct dm_zone *dmz_get_chunk_buffer(struct dmz_metadata *zmd, 1746 struct dm_zone *dzone) 1747 { 1748 struct dm_zone *bzone; 1749 1750 dmz_lock_map(zmd); 1751 again: 1752 bzone = dzone->bzone; 1753 if (bzone) 1754 goto out; 1755 1756 /* Allocate a random zone */ 1757 bzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND); 1758 if (!bzone) { 1759 if (dmz_bdev_is_dying(zmd->dev)) { 1760 bzone = ERR_PTR(-EIO); 1761 goto out; 1762 } 1763 dmz_wait_for_free_zones(zmd); 1764 goto again; 1765 } 1766 1767 /* Update the chunk mapping */ 1768 dmz_set_chunk_mapping(zmd, dzone->chunk, dmz_id(zmd, dzone), 1769 dmz_id(zmd, bzone)); 1770 1771 set_bit(DMZ_BUF, &bzone->flags); 1772 bzone->chunk = dzone->chunk; 1773 bzone->bzone = dzone; 1774 dzone->bzone = bzone; 1775 list_add_tail(&bzone->link, &zmd->map_rnd_list); 1776 out: 1777 dmz_unlock_map(zmd); 1778 1779 return bzone; 1780 } 1781 1782 /* 1783 * Get an unmapped (free) zone. 1784 * This must be called with the mapping lock held. 1785 */ 1786 struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned long flags) 1787 { 1788 struct list_head *list; 1789 struct dm_zone *zone; 1790 1791 if (flags & DMZ_ALLOC_RND) 1792 list = &zmd->unmap_rnd_list; 1793 else 1794 list = &zmd->unmap_seq_list; 1795 again: 1796 if (list_empty(list)) { 1797 /* 1798 * No free zone: if this is for reclaim, allow using the 1799 * reserved sequential zones. 1800 */ 1801 if (!(flags & DMZ_ALLOC_RECLAIM) || 1802 list_empty(&zmd->reserved_seq_zones_list)) 1803 return NULL; 1804 1805 zone = list_first_entry(&zmd->reserved_seq_zones_list, 1806 struct dm_zone, link); 1807 list_del_init(&zone->link); 1808 atomic_dec(&zmd->nr_reserved_seq_zones); 1809 return zone; 1810 } 1811 1812 zone = list_first_entry(list, struct dm_zone, link); 1813 list_del_init(&zone->link); 1814 1815 if (dmz_is_rnd(zone)) 1816 atomic_dec(&zmd->unmap_nr_rnd); 1817 else 1818 atomic_dec(&zmd->unmap_nr_seq); 1819 1820 if (dmz_is_offline(zone)) { 1821 dmz_dev_warn(zmd->dev, "Zone %u is offline", dmz_id(zmd, zone)); 1822 zone = NULL; 1823 goto again; 1824 } 1825 1826 return zone; 1827 } 1828 1829 /* 1830 * Free a zone. 1831 * This must be called with the mapping lock held. 1832 */ 1833 void dmz_free_zone(struct dmz_metadata *zmd, struct dm_zone *zone) 1834 { 1835 /* If this is a sequential zone, reset it */ 1836 if (dmz_is_seq(zone)) 1837 dmz_reset_zone(zmd, zone); 1838 1839 /* Return the zone to its type unmap list */ 1840 if (dmz_is_rnd(zone)) { 1841 list_add_tail(&zone->link, &zmd->unmap_rnd_list); 1842 atomic_inc(&zmd->unmap_nr_rnd); 1843 } else if (atomic_read(&zmd->nr_reserved_seq_zones) < 1844 zmd->nr_reserved_seq) { 1845 list_add_tail(&zone->link, &zmd->reserved_seq_zones_list); 1846 atomic_inc(&zmd->nr_reserved_seq_zones); 1847 } else { 1848 list_add_tail(&zone->link, &zmd->unmap_seq_list); 1849 atomic_inc(&zmd->unmap_nr_seq); 1850 } 1851 1852 wake_up_all(&zmd->free_wq); 1853 } 1854 1855 /* 1856 * Map a chunk to a zone. 1857 * This must be called with the mapping lock held. 1858 */ 1859 void dmz_map_zone(struct dmz_metadata *zmd, struct dm_zone *dzone, 1860 unsigned int chunk) 1861 { 1862 /* Set the chunk mapping */ 1863 dmz_set_chunk_mapping(zmd, chunk, dmz_id(zmd, dzone), 1864 DMZ_MAP_UNMAPPED); 1865 dzone->chunk = chunk; 1866 if (dmz_is_rnd(dzone)) 1867 list_add_tail(&dzone->link, &zmd->map_rnd_list); 1868 else 1869 list_add_tail(&dzone->link, &zmd->map_seq_list); 1870 } 1871 1872 /* 1873 * Unmap a zone. 1874 * This must be called with the mapping lock held. 
1875 */ 1876 void dmz_unmap_zone(struct dmz_metadata *zmd, struct dm_zone *zone) 1877 { 1878 unsigned int chunk = zone->chunk; 1879 unsigned int dzone_id; 1880 1881 if (chunk == DMZ_MAP_UNMAPPED) { 1882 /* Already unmapped */ 1883 return; 1884 } 1885 1886 if (test_and_clear_bit(DMZ_BUF, &zone->flags)) { 1887 /* 1888 * Unmapping the chunk buffer zone: clear only 1889 * the chunk buffer mapping 1890 */ 1891 dzone_id = dmz_id(zmd, zone->bzone); 1892 zone->bzone->bzone = NULL; 1893 zone->bzone = NULL; 1894 1895 } else { 1896 /* 1897 * Unmapping the chunk data zone: the zone must 1898 * not be buffered. 1899 */ 1900 if (WARN_ON(zone->bzone)) { 1901 zone->bzone->bzone = NULL; 1902 zone->bzone = NULL; 1903 } 1904 dzone_id = DMZ_MAP_UNMAPPED; 1905 } 1906 1907 dmz_set_chunk_mapping(zmd, chunk, dzone_id, DMZ_MAP_UNMAPPED); 1908 1909 zone->chunk = DMZ_MAP_UNMAPPED; 1910 list_del_init(&zone->link); 1911 } 1912 1913 /* 1914 * Set @nr_bits bits in @bitmap starting from @bit. 1915 * Return the number of bits changed from 0 to 1. 1916 */ 1917 static unsigned int dmz_set_bits(unsigned long *bitmap, 1918 unsigned int bit, unsigned int nr_bits) 1919 { 1920 unsigned long *addr; 1921 unsigned int end = bit + nr_bits; 1922 unsigned int n = 0; 1923 1924 while (bit < end) { 1925 if (((bit & (BITS_PER_LONG - 1)) == 0) && 1926 ((end - bit) >= BITS_PER_LONG)) { 1927 /* Try to set the whole word at once */ 1928 addr = bitmap + BIT_WORD(bit); 1929 if (*addr == 0) { 1930 *addr = ULONG_MAX; 1931 n += BITS_PER_LONG; 1932 bit += BITS_PER_LONG; 1933 continue; 1934 } 1935 } 1936 1937 if (!test_and_set_bit(bit, bitmap)) 1938 n++; 1939 bit++; 1940 } 1941 1942 return n; 1943 } 1944 1945 /* 1946 * Get the bitmap block storing the bit for chunk_block in zone. 1947 */ 1948 static struct dmz_mblock *dmz_get_bitmap(struct dmz_metadata *zmd, 1949 struct dm_zone *zone, 1950 sector_t chunk_block) 1951 { 1952 sector_t bitmap_block = 1 + zmd->nr_map_blocks + 1953 (sector_t)(dmz_id(zmd, zone) * zmd->zone_nr_bitmap_blocks) + 1954 (chunk_block >> DMZ_BLOCK_SHIFT_BITS); 1955 1956 return dmz_get_mblock(zmd, bitmap_block); 1957 } 1958 1959 /* 1960 * Copy the valid blocks bitmap of from_zone to the bitmap of to_zone. 1961 */ 1962 int dmz_copy_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone, 1963 struct dm_zone *to_zone) 1964 { 1965 struct dmz_mblock *from_mblk, *to_mblk; 1966 sector_t chunk_block = 0; 1967 1968 /* Get the zones bitmap blocks */ 1969 while (chunk_block < zmd->dev->zone_nr_blocks) { 1970 from_mblk = dmz_get_bitmap(zmd, from_zone, chunk_block); 1971 if (IS_ERR(from_mblk)) 1972 return PTR_ERR(from_mblk); 1973 to_mblk = dmz_get_bitmap(zmd, to_zone, chunk_block); 1974 if (IS_ERR(to_mblk)) { 1975 dmz_release_mblock(zmd, from_mblk); 1976 return PTR_ERR(to_mblk); 1977 } 1978 1979 memcpy(to_mblk->data, from_mblk->data, DMZ_BLOCK_SIZE); 1980 dmz_dirty_mblock(zmd, to_mblk); 1981 1982 dmz_release_mblock(zmd, to_mblk); 1983 dmz_release_mblock(zmd, from_mblk); 1984 1985 chunk_block += DMZ_BLOCK_SIZE_BITS; 1986 } 1987 1988 to_zone->weight = from_zone->weight; 1989 1990 return 0; 1991 } 1992 1993 /* 1994 * Merge the valid blocks bitmap of from_zone into the bitmap of to_zone, 1995 * starting from chunk_block. 
1996 */ 1997 int dmz_merge_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone, 1998 struct dm_zone *to_zone, sector_t chunk_block) 1999 { 2000 unsigned int nr_blocks; 2001 int ret; 2002 2003 /* Get the zones bitmap blocks */ 2004 while (chunk_block < zmd->dev->zone_nr_blocks) { 2005 /* Get a valid region from the source zone */ 2006 ret = dmz_first_valid_block(zmd, from_zone, &chunk_block); 2007 if (ret <= 0) 2008 return ret; 2009 2010 nr_blocks = ret; 2011 ret = dmz_validate_blocks(zmd, to_zone, chunk_block, nr_blocks); 2012 if (ret) 2013 return ret; 2014 2015 chunk_block += nr_blocks; 2016 } 2017 2018 return 0; 2019 } 2020 2021 /* 2022 * Validate all the blocks in the range [block..block+nr_blocks-1]. 2023 */ 2024 int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, 2025 sector_t chunk_block, unsigned int nr_blocks) 2026 { 2027 unsigned int count, bit, nr_bits; 2028 unsigned int zone_nr_blocks = zmd->dev->zone_nr_blocks; 2029 struct dmz_mblock *mblk; 2030 unsigned int n = 0; 2031 2032 dmz_dev_debug(zmd->dev, "=> VALIDATE zone %u, block %llu, %u blocks", 2033 dmz_id(zmd, zone), (unsigned long long)chunk_block, 2034 nr_blocks); 2035 2036 WARN_ON(chunk_block + nr_blocks > zone_nr_blocks); 2037 2038 while (nr_blocks) { 2039 /* Get bitmap block */ 2040 mblk = dmz_get_bitmap(zmd, zone, chunk_block); 2041 if (IS_ERR(mblk)) 2042 return PTR_ERR(mblk); 2043 2044 /* Set bits */ 2045 bit = chunk_block & DMZ_BLOCK_MASK_BITS; 2046 nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit); 2047 2048 count = dmz_set_bits((unsigned long *)mblk->data, bit, nr_bits); 2049 if (count) { 2050 dmz_dirty_mblock(zmd, mblk); 2051 n += count; 2052 } 2053 dmz_release_mblock(zmd, mblk); 2054 2055 nr_blocks -= nr_bits; 2056 chunk_block += nr_bits; 2057 } 2058 2059 if (likely(zone->weight + n <= zone_nr_blocks)) 2060 zone->weight += n; 2061 else { 2062 dmz_dev_warn(zmd->dev, "Zone %u: weight %u should be <= %u", 2063 dmz_id(zmd, zone), zone->weight, 2064 zone_nr_blocks - n); 2065 zone->weight = zone_nr_blocks; 2066 } 2067 2068 return 0; 2069 } 2070 2071 /* 2072 * Clear nr_bits bits in bitmap starting from bit. 2073 * Return the number of bits cleared. 2074 */ 2075 static int dmz_clear_bits(unsigned long *bitmap, int bit, int nr_bits) 2076 { 2077 unsigned long *addr; 2078 int end = bit + nr_bits; 2079 int n = 0; 2080 2081 while (bit < end) { 2082 if (((bit & (BITS_PER_LONG - 1)) == 0) && 2083 ((end - bit) >= BITS_PER_LONG)) { 2084 /* Try to clear whole word at once */ 2085 addr = bitmap + BIT_WORD(bit); 2086 if (*addr == ULONG_MAX) { 2087 *addr = 0; 2088 n += BITS_PER_LONG; 2089 bit += BITS_PER_LONG; 2090 continue; 2091 } 2092 } 2093 2094 if (test_and_clear_bit(bit, bitmap)) 2095 n++; 2096 bit++; 2097 } 2098 2099 return n; 2100 } 2101 2102 /* 2103 * Invalidate all the blocks in the range [block..block+nr_blocks-1]. 
2104 */ 2105 int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, 2106 sector_t chunk_block, unsigned int nr_blocks) 2107 { 2108 unsigned int count, bit, nr_bits; 2109 struct dmz_mblock *mblk; 2110 unsigned int n = 0; 2111 2112 dmz_dev_debug(zmd->dev, "=> INVALIDATE zone %u, block %llu, %u blocks", 2113 dmz_id(zmd, zone), (u64)chunk_block, nr_blocks); 2114 2115 WARN_ON(chunk_block + nr_blocks > zmd->dev->zone_nr_blocks); 2116 2117 while (nr_blocks) { 2118 /* Get bitmap block */ 2119 mblk = dmz_get_bitmap(zmd, zone, chunk_block); 2120 if (IS_ERR(mblk)) 2121 return PTR_ERR(mblk); 2122 2123 /* Clear bits */ 2124 bit = chunk_block & DMZ_BLOCK_MASK_BITS; 2125 nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit); 2126 2127 count = dmz_clear_bits((unsigned long *)mblk->data, 2128 bit, nr_bits); 2129 if (count) { 2130 dmz_dirty_mblock(zmd, mblk); 2131 n += count; 2132 } 2133 dmz_release_mblock(zmd, mblk); 2134 2135 nr_blocks -= nr_bits; 2136 chunk_block += nr_bits; 2137 } 2138 2139 if (zone->weight >= n) 2140 zone->weight -= n; 2141 else { 2142 dmz_dev_warn(zmd->dev, "Zone %u: weight %u should be >= %u", 2143 dmz_id(zmd, zone), zone->weight, n); 2144 zone->weight = 0; 2145 } 2146 2147 return 0; 2148 } 2149 2150 /* 2151 * Get a block bit value. 2152 */ 2153 static int dmz_test_block(struct dmz_metadata *zmd, struct dm_zone *zone, 2154 sector_t chunk_block) 2155 { 2156 struct dmz_mblock *mblk; 2157 int ret; 2158 2159 WARN_ON(chunk_block >= zmd->dev->zone_nr_blocks); 2160 2161 /* Get bitmap block */ 2162 mblk = dmz_get_bitmap(zmd, zone, chunk_block); 2163 if (IS_ERR(mblk)) 2164 return PTR_ERR(mblk); 2165 2166 /* Get offset */ 2167 ret = test_bit(chunk_block & DMZ_BLOCK_MASK_BITS, 2168 (unsigned long *) mblk->data) != 0; 2169 2170 dmz_release_mblock(zmd, mblk); 2171 2172 return ret; 2173 } 2174 2175 /* 2176 * Return the number of blocks from chunk_block to the first block with a bit 2177 * value specified by set. Search at most nr_blocks blocks from chunk_block. 2178 */ 2179 static int dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone, 2180 sector_t chunk_block, unsigned int nr_blocks, 2181 int set) 2182 { 2183 struct dmz_mblock *mblk; 2184 unsigned int bit, set_bit, nr_bits; 2185 unsigned long *bitmap; 2186 int n = 0; 2187 2188 WARN_ON(chunk_block + nr_blocks > zmd->dev->zone_nr_blocks); 2189 2190 while (nr_blocks) { 2191 /* Get bitmap block */ 2192 mblk = dmz_get_bitmap(zmd, zone, chunk_block); 2193 if (IS_ERR(mblk)) 2194 return PTR_ERR(mblk); 2195 2196 /* Get offset */ 2197 bitmap = (unsigned long *) mblk->data; 2198 bit = chunk_block & DMZ_BLOCK_MASK_BITS; 2199 nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit); 2200 if (set) 2201 set_bit = find_next_bit(bitmap, DMZ_BLOCK_SIZE_BITS, bit); 2202 else 2203 set_bit = find_next_zero_bit(bitmap, DMZ_BLOCK_SIZE_BITS, bit); 2204 dmz_release_mblock(zmd, mblk); 2205 2206 n += set_bit - bit; 2207 if (set_bit < DMZ_BLOCK_SIZE_BITS) 2208 break; 2209 2210 nr_blocks -= nr_bits; 2211 chunk_block += nr_bits; 2212 } 2213 2214 return n; 2215 } 2216 2217 /* 2218 * Test if chunk_block is valid. If it is, the number of consecutive 2219 * valid blocks from chunk_block will be returned. 

/*
 * Test if chunk_block is valid. If it is, the number of consecutive
 * valid blocks from chunk_block will be returned.
 */
int dmz_block_valid(struct dmz_metadata *zmd, struct dm_zone *zone,
		    sector_t chunk_block)
{
	int valid;

	valid = dmz_test_block(zmd, zone, chunk_block);
	if (valid <= 0)
		return valid;

	/* The block is valid: get the number of valid blocks from block */
	return dmz_to_next_set_block(zmd, zone, chunk_block,
				     zmd->dev->zone_nr_blocks - chunk_block, 0);
}

/*
 * Find the first valid block from @chunk_block in @zone.
 * If such a block is found, its number is returned using
 * @chunk_block and the total number of valid blocks from @chunk_block
 * is returned.
 */
int dmz_first_valid_block(struct dmz_metadata *zmd, struct dm_zone *zone,
			  sector_t *chunk_block)
{
	sector_t start_block = *chunk_block;
	int ret;

	ret = dmz_to_next_set_block(zmd, zone, start_block,
				    zmd->dev->zone_nr_blocks - start_block, 1);
	if (ret < 0)
		return ret;

	start_block += ret;
	*chunk_block = start_block;

	return dmz_to_next_set_block(zmd, zone, start_block,
				     zmd->dev->zone_nr_blocks - start_block, 0);
}

/*
 * Count the number of bits set starting from bit up to bit + nr_bits - 1.
 */
static int dmz_count_bits(void *bitmap, int bit, int nr_bits)
{
	unsigned long *addr;
	int end = bit + nr_bits;
	int n = 0;

	while (bit < end) {
		if (((bit & (BITS_PER_LONG - 1)) == 0) &&
		    ((end - bit) >= BITS_PER_LONG)) {
			addr = (unsigned long *)bitmap + BIT_WORD(bit);
			if (*addr == ULONG_MAX) {
				n += BITS_PER_LONG;
				bit += BITS_PER_LONG;
				continue;
			}
		}

		if (test_bit(bit, bitmap))
			n++;
		bit++;
	}

	return n;
}

/*
 * Get a zone weight.
 */
static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone)
{
	struct dmz_mblock *mblk;
	sector_t chunk_block = 0;
	unsigned int bit, nr_bits;
	unsigned int nr_blocks = zmd->dev->zone_nr_blocks;
	void *bitmap;
	int n = 0;

	while (nr_blocks) {
		/* Get bitmap block */
		mblk = dmz_get_bitmap(zmd, zone, chunk_block);
		if (IS_ERR(mblk)) {
			n = 0;
			break;
		}

		/* Count bits in this block */
		bitmap = mblk->data;
		bit = chunk_block & DMZ_BLOCK_MASK_BITS;
		nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
		n += dmz_count_bits(bitmap, bit, nr_bits);

		dmz_release_mblock(zmd, mblk);

		nr_blocks -= nr_bits;
		chunk_block += nr_bits;
	}

	zone->weight = n;
}
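
/*
 * Illustrative sketch (not called by the driver, name is hypothetical):
 * walk all valid extents of @zone with dmz_first_valid_block(), the same
 * pattern used by dmz_merge_valid_blocks(), and return the total number of
 * valid blocks found. The result should match the weight computed by
 * dmz_get_zone_weight() above.
 */
static __maybe_unused int dmz_count_valid_blocks(struct dmz_metadata *zmd,
						 struct dm_zone *zone)
{
	sector_t chunk_block = 0;
	int total = 0;
	int ret;

	while (chunk_block < zmd->dev->zone_nr_blocks) {
		/* Find the next valid extent starting at chunk_block */
		ret = dmz_first_valid_block(zmd, zone, &chunk_block);
		if (ret < 0)
			return ret;
		if (!ret)
			break;

		total += ret;
		chunk_block += ret;
	}

	return total;
}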

/*
 * Cleanup the zoned metadata resources.
 */
static void dmz_cleanup_metadata(struct dmz_metadata *zmd)
{
	struct rb_root *root;
	struct dmz_mblock *mblk, *next;
	int i;

	/* Release zone mapping resources */
	if (zmd->map_mblk) {
		for (i = 0; i < zmd->nr_map_blocks; i++)
			dmz_release_mblock(zmd, zmd->map_mblk[i]);
		kfree(zmd->map_mblk);
		zmd->map_mblk = NULL;
	}

	/* Release super blocks */
	for (i = 0; i < 2; i++) {
		if (zmd->sb[i].mblk) {
			dmz_free_mblock(zmd, zmd->sb[i].mblk);
			zmd->sb[i].mblk = NULL;
		}
	}

	/* Free cached blocks */
	while (!list_empty(&zmd->mblk_dirty_list)) {
		mblk = list_first_entry(&zmd->mblk_dirty_list,
					struct dmz_mblock, link);
		dmz_dev_warn(zmd->dev, "mblock %llu still in dirty list (ref %u)",
			     (u64)mblk->no, mblk->ref);
		list_del_init(&mblk->link);
		rb_erase(&mblk->node, &zmd->mblk_rbtree);
		dmz_free_mblock(zmd, mblk);
	}

	while (!list_empty(&zmd->mblk_lru_list)) {
		mblk = list_first_entry(&zmd->mblk_lru_list,
					struct dmz_mblock, link);
		list_del_init(&mblk->link);
		rb_erase(&mblk->node, &zmd->mblk_rbtree);
		dmz_free_mblock(zmd, mblk);
	}

	/* Sanity checks: the mblock rbtree should now be empty */
	root = &zmd->mblk_rbtree;
	rbtree_postorder_for_each_entry_safe(mblk, next, root, node) {
		dmz_dev_warn(zmd->dev, "mblock %llu ref %u still in rbtree",
			     (u64)mblk->no, mblk->ref);
		mblk->ref = 0;
		dmz_free_mblock(zmd, mblk);
	}

	/* Free the zone descriptors */
	dmz_drop_zones(zmd);

	mutex_destroy(&zmd->mblk_flush_lock);
	mutex_destroy(&zmd->map_lock);
}

/*
 * Initialize the zoned metadata.
 */
int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata)
{
	struct dmz_metadata *zmd;
	unsigned int i, zid;
	struct dm_zone *zone;
	int ret;

	zmd = kzalloc(sizeof(struct dmz_metadata), GFP_KERNEL);
	if (!zmd)
		return -ENOMEM;

	zmd->dev = dev;
	zmd->mblk_rbtree = RB_ROOT;
	init_rwsem(&zmd->mblk_sem);
	mutex_init(&zmd->mblk_flush_lock);
	spin_lock_init(&zmd->mblk_lock);
	INIT_LIST_HEAD(&zmd->mblk_lru_list);
	INIT_LIST_HEAD(&zmd->mblk_dirty_list);

	mutex_init(&zmd->map_lock);
	atomic_set(&zmd->unmap_nr_rnd, 0);
	INIT_LIST_HEAD(&zmd->unmap_rnd_list);
	INIT_LIST_HEAD(&zmd->map_rnd_list);

	atomic_set(&zmd->unmap_nr_seq, 0);
	INIT_LIST_HEAD(&zmd->unmap_seq_list);
	INIT_LIST_HEAD(&zmd->map_seq_list);

	atomic_set(&zmd->nr_reserved_seq_zones, 0);
	INIT_LIST_HEAD(&zmd->reserved_seq_zones_list);

	init_waitqueue_head(&zmd->free_wq);

	/* Initialize zone descriptors */
	ret = dmz_init_zones(zmd);
	if (ret)
		goto err;

	/* Get super block */
	ret = dmz_load_sb(zmd);
	if (ret)
		goto err;

	/* Set metadata zones starting from sb_zone */
	zid = dmz_id(zmd, zmd->sb_zone);
	for (i = 0; i < zmd->nr_meta_zones << 1; i++) {
		zone = dmz_get(zmd, zid + i);
		if (!dmz_is_rnd(zone)) {
			/* Metadata zones must be randomly writeable */
			ret = -ENXIO;
			goto err;
		}
		set_bit(DMZ_META, &zone->flags);
	}

	/* Load mapping table */
	ret = dmz_load_mapping(zmd);
	if (ret)
		goto err;

	/*
	 * Cache size boundaries: allow at least 2 super blocks, the chunk map
	 * blocks and enough blocks to be able to cache the bitmap blocks of
	 * up to 16 zones when idle (min_nr_mblks). Otherwise, if busy, allow
	 * the cache to add 512 more metadata blocks.
	 */
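
	/*
	 * Worked example (illustrative, assuming 256 MB zones and the 4 KB
	 * metadata block size): a zone then holds 65536 blocks, so its
	 * validity bitmap needs 65536 / 32768 = 2 blocks and
	 * zone_nr_bitmap_blocks is 2. min_nr_mblks comes to
	 * 2 + nr_map_blocks + 32, and max_nr_mblks to that value plus 512.
	 */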
	zmd->min_nr_mblks = 2 + zmd->nr_map_blocks + zmd->zone_nr_bitmap_blocks * 16;
	zmd->max_nr_mblks = zmd->min_nr_mblks + 512;
	zmd->mblk_shrinker.count_objects = dmz_mblock_shrinker_count;
	zmd->mblk_shrinker.scan_objects = dmz_mblock_shrinker_scan;
	zmd->mblk_shrinker.seeks = DEFAULT_SEEKS;

	/* Metadata cache shrinker */
	ret = register_shrinker(&zmd->mblk_shrinker);
	if (ret) {
		dmz_dev_err(dev, "Register metadata cache shrinker failed");
		goto err;
	}

	dmz_dev_info(dev, "Host-%s zoned block device",
		     bdev_zoned_model(dev->bdev) == BLK_ZONED_HA ?
		     "aware" : "managed");
	dmz_dev_info(dev, "  %llu 512-byte logical sectors",
		     (u64)dev->capacity);
	dmz_dev_info(dev, "  %u zones of %llu 512-byte logical sectors",
		     dev->nr_zones, (u64)dev->zone_nr_sectors);
	dmz_dev_info(dev, "  %u metadata zones",
		     zmd->nr_meta_zones * 2);
	dmz_dev_info(dev, "  %u data zones for %u chunks",
		     zmd->nr_data_zones, zmd->nr_chunks);
	dmz_dev_info(dev, "    %u random zones (%u unmapped)",
		     zmd->nr_rnd, atomic_read(&zmd->unmap_nr_rnd));
	dmz_dev_info(dev, "    %u sequential zones (%u unmapped)",
		     zmd->nr_seq, atomic_read(&zmd->unmap_nr_seq));
	dmz_dev_info(dev, "  %u reserved sequential data zones",
		     zmd->nr_reserved_seq);

	dmz_dev_debug(dev, "Format:");
	dmz_dev_debug(dev, "%u metadata blocks per set (%u max cache)",
		      zmd->nr_meta_blocks, zmd->max_nr_mblks);
	dmz_dev_debug(dev, "  %u data zone mapping blocks",
		      zmd->nr_map_blocks);
	dmz_dev_debug(dev, "  %u bitmap blocks",
		      zmd->nr_bitmap_blocks);

	*metadata = zmd;

	return 0;
err:
	dmz_cleanup_metadata(zmd);
	kfree(zmd);
	*metadata = NULL;

	return ret;
}

/*
 * Cleanup the zoned metadata resources.
 */
void dmz_dtr_metadata(struct dmz_metadata *zmd)
{
	unregister_shrinker(&zmd->mblk_shrinker);
	dmz_cleanup_metadata(zmd);
	kfree(zmd);
}

/*
 * Check zone information on resume.
 */
int dmz_resume_metadata(struct dmz_metadata *zmd)
{
	struct dmz_dev *dev = zmd->dev;
	struct dm_zone *zone;
	sector_t wp_block;
	unsigned int i;
	int ret;

	/* Check zones */
	for (i = 0; i < dev->nr_zones; i++) {
		zone = dmz_get(zmd, i);
		if (!zone) {
			dmz_dev_err(dev, "Unable to get zone %u", i);
			return -EIO;
		}

		wp_block = zone->wp_block;

		ret = dmz_update_zone(zmd, zone);
		if (ret) {
			dmz_dev_err(dev, "Broken zone %u", i);
			return ret;
		}

		if (dmz_is_offline(zone)) {
			dmz_dev_warn(dev, "Zone %u is offline", i);
			continue;
		}

		/* Check write pointer */
		if (!dmz_is_seq(zone))
			zone->wp_block = 0;
		else if (zone->wp_block != wp_block) {
			dmz_dev_err(dev, "Zone %u: Invalid wp (%llu / %llu)",
				    i, (u64)zone->wp_block, (u64)wp_block);
			zone->wp_block = wp_block;
			dmz_invalidate_blocks(zmd, zone, zone->wp_block,
					      dev->zone_nr_blocks - zone->wp_block);
		}
	}

	return 0;
}
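
/*
 * Lifecycle sketch (illustrative, simplified): the dm-zoned target
 * constructs the metadata at table load time, revalidates zone state on
 * resume, and tears everything down in the destructor:
 *
 *	struct dmz_metadata *zmd;
 *	int ret;
 *
 *	ret = dmz_ctr_metadata(dev, &zmd);
 *	if (ret)
 *		return ret;
 *	...
 *	ret = dmz_resume_metadata(zmd);
 *	...
 *	dmz_dtr_metadata(zmd);
 */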