1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 2017 Western Digital Corporation or its affiliates. 4 * 5 * This file is released under the GPL. 6 */ 7 8 #include "dm-zoned.h" 9 10 #include <linux/module.h> 11 #include <linux/crc32.h> 12 #include <linux/sched/mm.h> 13 14 #define DM_MSG_PREFIX "zoned metadata" 15 16 /* 17 * Metadata version. 18 */ 19 #define DMZ_META_VER 1 20 21 /* 22 * On-disk super block magic. 23 */ 24 #define DMZ_MAGIC ((((unsigned int)('D')) << 24) | \ 25 (((unsigned int)('Z')) << 16) | \ 26 (((unsigned int)('B')) << 8) | \ 27 ((unsigned int)('D'))) 28 29 /* 30 * On disk super block. 31 * This uses only 512 B but uses on disk a full 4KB block. This block is 32 * followed on disk by the mapping table of chunks to zones and the bitmap 33 * blocks indicating zone block validity. 34 * The overall resulting metadata format is: 35 * (1) Super block (1 block) 36 * (2) Chunk mapping table (nr_map_blocks) 37 * (3) Bitmap blocks (nr_bitmap_blocks) 38 * All metadata blocks are stored in conventional zones, starting from 39 * the first conventional zone found on disk. 40 */ 41 struct dmz_super { 42 /* Magic number */ 43 __le32 magic; /* 4 */ 44 45 /* Metadata version number */ 46 __le32 version; /* 8 */ 47 48 /* Generation number */ 49 __le64 gen; /* 16 */ 50 51 /* This block number */ 52 __le64 sb_block; /* 24 */ 53 54 /* The number of metadata blocks, including this super block */ 55 __le32 nr_meta_blocks; /* 28 */ 56 57 /* The number of sequential zones reserved for reclaim */ 58 __le32 nr_reserved_seq; /* 32 */ 59 60 /* The number of entries in the mapping table */ 61 __le32 nr_chunks; /* 36 */ 62 63 /* The number of blocks used for the chunk mapping table */ 64 __le32 nr_map_blocks; /* 40 */ 65 66 /* The number of blocks used for the block bitmaps */ 67 __le32 nr_bitmap_blocks; /* 44 */ 68 69 /* Checksum */ 70 __le32 crc; /* 48 */ 71 72 /* Padding to full 512B sector */ 73 u8 reserved[464]; /* 512 */ 74 }; 75 76 /* 77 * Chunk mapping entry: entries are indexed by chunk number 78 * and give the zone ID (dzone_id) mapping the chunk on disk. 79 * This zone may be sequential or random. If it is a sequential 80 * zone, a second zone (bzone_id) used as a write buffer may 81 * also be specified. This second zone will always be a randomly 82 * writeable zone. 83 */ 84 struct dmz_map { 85 __le32 dzone_id; 86 __le32 bzone_id; 87 }; 88 89 /* 90 * Chunk mapping table metadata: 512 8-bytes entries per 4KB block. 91 */ 92 #define DMZ_MAP_ENTRIES (DMZ_BLOCK_SIZE / sizeof(struct dmz_map)) 93 #define DMZ_MAP_ENTRIES_SHIFT (ilog2(DMZ_MAP_ENTRIES)) 94 #define DMZ_MAP_ENTRIES_MASK (DMZ_MAP_ENTRIES - 1) 95 #define DMZ_MAP_UNMAPPED UINT_MAX 96 97 /* 98 * Meta data block descriptor (for cached metadata blocks). 99 */ 100 struct dmz_mblock { 101 struct rb_node node; 102 struct list_head link; 103 sector_t no; 104 unsigned int ref; 105 unsigned long state; 106 struct page *page; 107 void *data; 108 }; 109 110 /* 111 * Metadata block state flags. 112 */ 113 enum { 114 DMZ_META_DIRTY, 115 DMZ_META_READING, 116 DMZ_META_WRITING, 117 DMZ_META_ERROR, 118 }; 119 120 /* 121 * Super block information (one per metadata set). 122 */ 123 struct dmz_sb { 124 sector_t block; 125 struct dmz_mblock *mblk; 126 struct dmz_super *sb; 127 }; 128 129 /* 130 * In-memory metadata. 131 */ 132 struct dmz_metadata { 133 struct dmz_dev *dev; 134 135 sector_t zone_bitmap_size; 136 unsigned int zone_nr_bitmap_blocks; 137 138 unsigned int nr_bitmap_blocks; 139 unsigned int nr_map_blocks; 140 141 unsigned int nr_useable_zones; 142 unsigned int nr_meta_blocks; 143 unsigned int nr_meta_zones; 144 unsigned int nr_data_zones; 145 unsigned int nr_rnd_zones; 146 unsigned int nr_reserved_seq; 147 unsigned int nr_chunks; 148 149 /* Zone information array */ 150 struct dm_zone *zones; 151 152 struct dm_zone *sb_zone; 153 struct dmz_sb sb[2]; 154 unsigned int mblk_primary; 155 u64 sb_gen; 156 unsigned int min_nr_mblks; 157 unsigned int max_nr_mblks; 158 atomic_t nr_mblks; 159 struct rw_semaphore mblk_sem; 160 struct mutex mblk_flush_lock; 161 spinlock_t mblk_lock; 162 struct rb_root mblk_rbtree; 163 struct list_head mblk_lru_list; 164 struct list_head mblk_dirty_list; 165 struct shrinker mblk_shrinker; 166 167 /* Zone allocation management */ 168 struct mutex map_lock; 169 struct dmz_mblock **map_mblk; 170 unsigned int nr_rnd; 171 atomic_t unmap_nr_rnd; 172 struct list_head unmap_rnd_list; 173 struct list_head map_rnd_list; 174 175 unsigned int nr_seq; 176 atomic_t unmap_nr_seq; 177 struct list_head unmap_seq_list; 178 struct list_head map_seq_list; 179 180 atomic_t nr_reserved_seq_zones; 181 struct list_head reserved_seq_zones_list; 182 183 wait_queue_head_t free_wq; 184 }; 185 186 /* 187 * Various accessors 188 */ 189 unsigned int dmz_id(struct dmz_metadata *zmd, struct dm_zone *zone) 190 { 191 return ((unsigned int)(zone - zmd->zones)); 192 } 193 194 sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone) 195 { 196 return (sector_t)dmz_id(zmd, zone) << zmd->dev->zone_nr_sectors_shift; 197 } 198 199 sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone) 200 { 201 return (sector_t)dmz_id(zmd, zone) << zmd->dev->zone_nr_blocks_shift; 202 } 203 204 unsigned int dmz_nr_chunks(struct dmz_metadata *zmd) 205 { 206 return zmd->nr_chunks; 207 } 208 209 unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd) 210 { 211 return zmd->nr_rnd; 212 } 213 214 unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd) 215 { 216 return atomic_read(&zmd->unmap_nr_rnd); 217 } 218 219 /* 220 * Lock/unlock mapping table. 221 * The map lock also protects all the zone lists. 222 */ 223 void dmz_lock_map(struct dmz_metadata *zmd) 224 { 225 mutex_lock(&zmd->map_lock); 226 } 227 228 void dmz_unlock_map(struct dmz_metadata *zmd) 229 { 230 mutex_unlock(&zmd->map_lock); 231 } 232 233 /* 234 * Lock/unlock metadata access. This is a "read" lock on a semaphore 235 * that prevents metadata flush from running while metadata are being 236 * modified. The actual metadata write mutual exclusion is achieved with 237 * the map lock and zone state management (active and reclaim state are 238 * mutually exclusive). 239 */ 240 void dmz_lock_metadata(struct dmz_metadata *zmd) 241 { 242 down_read(&zmd->mblk_sem); 243 } 244 245 void dmz_unlock_metadata(struct dmz_metadata *zmd) 246 { 247 up_read(&zmd->mblk_sem); 248 } 249 250 /* 251 * Lock/unlock flush: prevent concurrent executions 252 * of dmz_flush_metadata as well as metadata modification in reclaim 253 * while flush is being executed. 254 */ 255 void dmz_lock_flush(struct dmz_metadata *zmd) 256 { 257 mutex_lock(&zmd->mblk_flush_lock); 258 } 259 260 void dmz_unlock_flush(struct dmz_metadata *zmd) 261 { 262 mutex_unlock(&zmd->mblk_flush_lock); 263 } 264 265 /* 266 * Allocate a metadata block. 267 */ 268 static struct dmz_mblock *dmz_alloc_mblock(struct dmz_metadata *zmd, 269 sector_t mblk_no) 270 { 271 struct dmz_mblock *mblk = NULL; 272 273 /* See if we can reuse cached blocks */ 274 if (zmd->max_nr_mblks && atomic_read(&zmd->nr_mblks) > zmd->max_nr_mblks) { 275 spin_lock(&zmd->mblk_lock); 276 mblk = list_first_entry_or_null(&zmd->mblk_lru_list, 277 struct dmz_mblock, link); 278 if (mblk) { 279 list_del_init(&mblk->link); 280 rb_erase(&mblk->node, &zmd->mblk_rbtree); 281 mblk->no = mblk_no; 282 } 283 spin_unlock(&zmd->mblk_lock); 284 if (mblk) 285 return mblk; 286 } 287 288 /* Allocate a new block */ 289 mblk = kmalloc(sizeof(struct dmz_mblock), GFP_NOIO); 290 if (!mblk) 291 return NULL; 292 293 mblk->page = alloc_page(GFP_NOIO); 294 if (!mblk->page) { 295 kfree(mblk); 296 return NULL; 297 } 298 299 RB_CLEAR_NODE(&mblk->node); 300 INIT_LIST_HEAD(&mblk->link); 301 mblk->ref = 0; 302 mblk->state = 0; 303 mblk->no = mblk_no; 304 mblk->data = page_address(mblk->page); 305 306 atomic_inc(&zmd->nr_mblks); 307 308 return mblk; 309 } 310 311 /* 312 * Free a metadata block. 313 */ 314 static void dmz_free_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk) 315 { 316 __free_pages(mblk->page, 0); 317 kfree(mblk); 318 319 atomic_dec(&zmd->nr_mblks); 320 } 321 322 /* 323 * Insert a metadata block in the rbtree. 324 */ 325 static void dmz_insert_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk) 326 { 327 struct rb_root *root = &zmd->mblk_rbtree; 328 struct rb_node **new = &(root->rb_node), *parent = NULL; 329 struct dmz_mblock *b; 330 331 /* Figure out where to put the new node */ 332 while (*new) { 333 b = container_of(*new, struct dmz_mblock, node); 334 parent = *new; 335 new = (b->no < mblk->no) ? &((*new)->rb_left) : &((*new)->rb_right); 336 } 337 338 /* Add new node and rebalance tree */ 339 rb_link_node(&mblk->node, parent, new); 340 rb_insert_color(&mblk->node, root); 341 } 342 343 /* 344 * Lookup a metadata block in the rbtree. If the block is found, increment 345 * its reference count. 346 */ 347 static struct dmz_mblock *dmz_get_mblock_fast(struct dmz_metadata *zmd, 348 sector_t mblk_no) 349 { 350 struct rb_root *root = &zmd->mblk_rbtree; 351 struct rb_node *node = root->rb_node; 352 struct dmz_mblock *mblk; 353 354 while (node) { 355 mblk = container_of(node, struct dmz_mblock, node); 356 if (mblk->no == mblk_no) { 357 /* 358 * If this is the first reference to the block, 359 * remove it from the LRU list. 360 */ 361 mblk->ref++; 362 if (mblk->ref == 1 && 363 !test_bit(DMZ_META_DIRTY, &mblk->state)) 364 list_del_init(&mblk->link); 365 return mblk; 366 } 367 node = (mblk->no < mblk_no) ? node->rb_left : node->rb_right; 368 } 369 370 return NULL; 371 } 372 373 /* 374 * Metadata block BIO end callback. 375 */ 376 static void dmz_mblock_bio_end_io(struct bio *bio) 377 { 378 struct dmz_mblock *mblk = bio->bi_private; 379 int flag; 380 381 if (bio->bi_status) 382 set_bit(DMZ_META_ERROR, &mblk->state); 383 384 if (bio_op(bio) == REQ_OP_WRITE) 385 flag = DMZ_META_WRITING; 386 else 387 flag = DMZ_META_READING; 388 389 clear_bit_unlock(flag, &mblk->state); 390 smp_mb__after_atomic(); 391 wake_up_bit(&mblk->state, flag); 392 393 bio_put(bio); 394 } 395 396 /* 397 * Read an uncached metadata block from disk and add it to the cache. 398 */ 399 static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd, 400 sector_t mblk_no) 401 { 402 struct dmz_mblock *mblk, *m; 403 sector_t block = zmd->sb[zmd->mblk_primary].block + mblk_no; 404 struct bio *bio; 405 406 if (dmz_bdev_is_dying(zmd->dev)) 407 return ERR_PTR(-EIO); 408 409 /* Get a new block and a BIO to read it */ 410 mblk = dmz_alloc_mblock(zmd, mblk_no); 411 if (!mblk) 412 return ERR_PTR(-ENOMEM); 413 414 bio = bio_alloc(GFP_NOIO, 1); 415 if (!bio) { 416 dmz_free_mblock(zmd, mblk); 417 return ERR_PTR(-ENOMEM); 418 } 419 420 spin_lock(&zmd->mblk_lock); 421 422 /* 423 * Make sure that another context did not start reading 424 * the block already. 425 */ 426 m = dmz_get_mblock_fast(zmd, mblk_no); 427 if (m) { 428 spin_unlock(&zmd->mblk_lock); 429 dmz_free_mblock(zmd, mblk); 430 bio_put(bio); 431 return m; 432 } 433 434 mblk->ref++; 435 set_bit(DMZ_META_READING, &mblk->state); 436 dmz_insert_mblock(zmd, mblk); 437 438 spin_unlock(&zmd->mblk_lock); 439 440 /* Submit read BIO */ 441 bio->bi_iter.bi_sector = dmz_blk2sect(block); 442 bio_set_dev(bio, zmd->dev->bdev); 443 bio->bi_private = mblk; 444 bio->bi_end_io = dmz_mblock_bio_end_io; 445 bio_set_op_attrs(bio, REQ_OP_READ, REQ_META | REQ_PRIO); 446 bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0); 447 submit_bio(bio); 448 449 return mblk; 450 } 451 452 /* 453 * Free metadata blocks. 454 */ 455 static unsigned long dmz_shrink_mblock_cache(struct dmz_metadata *zmd, 456 unsigned long limit) 457 { 458 struct dmz_mblock *mblk; 459 unsigned long count = 0; 460 461 if (!zmd->max_nr_mblks) 462 return 0; 463 464 while (!list_empty(&zmd->mblk_lru_list) && 465 atomic_read(&zmd->nr_mblks) > zmd->min_nr_mblks && 466 count < limit) { 467 mblk = list_first_entry(&zmd->mblk_lru_list, 468 struct dmz_mblock, link); 469 list_del_init(&mblk->link); 470 rb_erase(&mblk->node, &zmd->mblk_rbtree); 471 dmz_free_mblock(zmd, mblk); 472 count++; 473 } 474 475 return count; 476 } 477 478 /* 479 * For mblock shrinker: get the number of unused metadata blocks in the cache. 480 */ 481 static unsigned long dmz_mblock_shrinker_count(struct shrinker *shrink, 482 struct shrink_control *sc) 483 { 484 struct dmz_metadata *zmd = container_of(shrink, struct dmz_metadata, mblk_shrinker); 485 486 return atomic_read(&zmd->nr_mblks); 487 } 488 489 /* 490 * For mblock shrinker: scan unused metadata blocks and shrink the cache. 491 */ 492 static unsigned long dmz_mblock_shrinker_scan(struct shrinker *shrink, 493 struct shrink_control *sc) 494 { 495 struct dmz_metadata *zmd = container_of(shrink, struct dmz_metadata, mblk_shrinker); 496 unsigned long count; 497 498 spin_lock(&zmd->mblk_lock); 499 count = dmz_shrink_mblock_cache(zmd, sc->nr_to_scan); 500 spin_unlock(&zmd->mblk_lock); 501 502 return count ? count : SHRINK_STOP; 503 } 504 505 /* 506 * Release a metadata block. 507 */ 508 static void dmz_release_mblock(struct dmz_metadata *zmd, 509 struct dmz_mblock *mblk) 510 { 511 512 if (!mblk) 513 return; 514 515 spin_lock(&zmd->mblk_lock); 516 517 mblk->ref--; 518 if (mblk->ref == 0) { 519 if (test_bit(DMZ_META_ERROR, &mblk->state)) { 520 rb_erase(&mblk->node, &zmd->mblk_rbtree); 521 dmz_free_mblock(zmd, mblk); 522 } else if (!test_bit(DMZ_META_DIRTY, &mblk->state)) { 523 list_add_tail(&mblk->link, &zmd->mblk_lru_list); 524 dmz_shrink_mblock_cache(zmd, 1); 525 } 526 } 527 528 spin_unlock(&zmd->mblk_lock); 529 } 530 531 /* 532 * Get a metadata block from the rbtree. If the block 533 * is not present, read it from disk. 534 */ 535 static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd, 536 sector_t mblk_no) 537 { 538 struct dmz_mblock *mblk; 539 540 /* Check rbtree */ 541 spin_lock(&zmd->mblk_lock); 542 mblk = dmz_get_mblock_fast(zmd, mblk_no); 543 spin_unlock(&zmd->mblk_lock); 544 545 if (!mblk) { 546 /* Cache miss: read the block from disk */ 547 mblk = dmz_get_mblock_slow(zmd, mblk_no); 548 if (IS_ERR(mblk)) 549 return mblk; 550 } 551 552 /* Wait for on-going read I/O and check for error */ 553 wait_on_bit_io(&mblk->state, DMZ_META_READING, 554 TASK_UNINTERRUPTIBLE); 555 if (test_bit(DMZ_META_ERROR, &mblk->state)) { 556 dmz_release_mblock(zmd, mblk); 557 dmz_check_bdev(zmd->dev); 558 return ERR_PTR(-EIO); 559 } 560 561 return mblk; 562 } 563 564 /* 565 * Mark a metadata block dirty. 566 */ 567 static void dmz_dirty_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk) 568 { 569 spin_lock(&zmd->mblk_lock); 570 if (!test_and_set_bit(DMZ_META_DIRTY, &mblk->state)) 571 list_add_tail(&mblk->link, &zmd->mblk_dirty_list); 572 spin_unlock(&zmd->mblk_lock); 573 } 574 575 /* 576 * Issue a metadata block write BIO. 577 */ 578 static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk, 579 unsigned int set) 580 { 581 sector_t block = zmd->sb[set].block + mblk->no; 582 struct bio *bio; 583 584 if (dmz_bdev_is_dying(zmd->dev)) 585 return -EIO; 586 587 bio = bio_alloc(GFP_NOIO, 1); 588 if (!bio) { 589 set_bit(DMZ_META_ERROR, &mblk->state); 590 return -ENOMEM; 591 } 592 593 set_bit(DMZ_META_WRITING, &mblk->state); 594 595 bio->bi_iter.bi_sector = dmz_blk2sect(block); 596 bio_set_dev(bio, zmd->dev->bdev); 597 bio->bi_private = mblk; 598 bio->bi_end_io = dmz_mblock_bio_end_io; 599 bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META | REQ_PRIO); 600 bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0); 601 submit_bio(bio); 602 603 return 0; 604 } 605 606 /* 607 * Read/write a metadata block. 608 */ 609 static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block, 610 struct page *page) 611 { 612 struct bio *bio; 613 int ret; 614 615 if (dmz_bdev_is_dying(zmd->dev)) 616 return -EIO; 617 618 bio = bio_alloc(GFP_NOIO, 1); 619 if (!bio) 620 return -ENOMEM; 621 622 bio->bi_iter.bi_sector = dmz_blk2sect(block); 623 bio_set_dev(bio, zmd->dev->bdev); 624 bio_set_op_attrs(bio, op, REQ_SYNC | REQ_META | REQ_PRIO); 625 bio_add_page(bio, page, DMZ_BLOCK_SIZE, 0); 626 ret = submit_bio_wait(bio); 627 bio_put(bio); 628 629 if (ret) 630 dmz_check_bdev(zmd->dev); 631 return ret; 632 } 633 634 /* 635 * Write super block of the specified metadata set. 636 */ 637 static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set) 638 { 639 sector_t block = zmd->sb[set].block; 640 struct dmz_mblock *mblk = zmd->sb[set].mblk; 641 struct dmz_super *sb = zmd->sb[set].sb; 642 u64 sb_gen = zmd->sb_gen + 1; 643 int ret; 644 645 sb->magic = cpu_to_le32(DMZ_MAGIC); 646 sb->version = cpu_to_le32(DMZ_META_VER); 647 648 sb->gen = cpu_to_le64(sb_gen); 649 650 sb->sb_block = cpu_to_le64(block); 651 sb->nr_meta_blocks = cpu_to_le32(zmd->nr_meta_blocks); 652 sb->nr_reserved_seq = cpu_to_le32(zmd->nr_reserved_seq); 653 sb->nr_chunks = cpu_to_le32(zmd->nr_chunks); 654 655 sb->nr_map_blocks = cpu_to_le32(zmd->nr_map_blocks); 656 sb->nr_bitmap_blocks = cpu_to_le32(zmd->nr_bitmap_blocks); 657 658 sb->crc = 0; 659 sb->crc = cpu_to_le32(crc32_le(sb_gen, (unsigned char *)sb, DMZ_BLOCK_SIZE)); 660 661 ret = dmz_rdwr_block(zmd, REQ_OP_WRITE, block, mblk->page); 662 if (ret == 0) 663 ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL); 664 665 return ret; 666 } 667 668 /* 669 * Write dirty metadata blocks to the specified set. 670 */ 671 static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd, 672 struct list_head *write_list, 673 unsigned int set) 674 { 675 struct dmz_mblock *mblk; 676 struct blk_plug plug; 677 int ret = 0, nr_mblks_submitted = 0; 678 679 /* Issue writes */ 680 blk_start_plug(&plug); 681 list_for_each_entry(mblk, write_list, link) { 682 ret = dmz_write_mblock(zmd, mblk, set); 683 if (ret) 684 break; 685 nr_mblks_submitted++; 686 } 687 blk_finish_plug(&plug); 688 689 /* Wait for completion */ 690 list_for_each_entry(mblk, write_list, link) { 691 if (!nr_mblks_submitted) 692 break; 693 wait_on_bit_io(&mblk->state, DMZ_META_WRITING, 694 TASK_UNINTERRUPTIBLE); 695 if (test_bit(DMZ_META_ERROR, &mblk->state)) { 696 clear_bit(DMZ_META_ERROR, &mblk->state); 697 dmz_check_bdev(zmd->dev); 698 ret = -EIO; 699 } 700 nr_mblks_submitted--; 701 } 702 703 /* Flush drive cache (this will also sync data) */ 704 if (ret == 0) 705 ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL); 706 707 return ret; 708 } 709 710 /* 711 * Log dirty metadata blocks. 712 */ 713 static int dmz_log_dirty_mblocks(struct dmz_metadata *zmd, 714 struct list_head *write_list) 715 { 716 unsigned int log_set = zmd->mblk_primary ^ 0x1; 717 int ret; 718 719 /* Write dirty blocks to the log */ 720 ret = dmz_write_dirty_mblocks(zmd, write_list, log_set); 721 if (ret) 722 return ret; 723 724 /* 725 * No error so far: now validate the log by updating the 726 * log index super block generation. 727 */ 728 ret = dmz_write_sb(zmd, log_set); 729 if (ret) 730 return ret; 731 732 return 0; 733 } 734 735 /* 736 * Flush dirty metadata blocks. 737 */ 738 int dmz_flush_metadata(struct dmz_metadata *zmd) 739 { 740 struct dmz_mblock *mblk; 741 struct list_head write_list; 742 int ret; 743 744 if (WARN_ON(!zmd)) 745 return 0; 746 747 INIT_LIST_HEAD(&write_list); 748 749 /* 750 * Make sure that metadata blocks are stable before logging: take 751 * the write lock on the metadata semaphore to prevent target BIOs 752 * from modifying metadata. 753 */ 754 down_write(&zmd->mblk_sem); 755 756 /* 757 * This is called from the target flush work and reclaim work. 758 * Concurrent execution is not allowed. 759 */ 760 dmz_lock_flush(zmd); 761 762 if (dmz_bdev_is_dying(zmd->dev)) { 763 ret = -EIO; 764 goto out; 765 } 766 767 /* Get dirty blocks */ 768 spin_lock(&zmd->mblk_lock); 769 list_splice_init(&zmd->mblk_dirty_list, &write_list); 770 spin_unlock(&zmd->mblk_lock); 771 772 /* If there are no dirty metadata blocks, just flush the device cache */ 773 if (list_empty(&write_list)) { 774 ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL); 775 goto err; 776 } 777 778 /* 779 * The primary metadata set is still clean. Keep it this way until 780 * all updates are successful in the secondary set. That is, use 781 * the secondary set as a log. 782 */ 783 ret = dmz_log_dirty_mblocks(zmd, &write_list); 784 if (ret) 785 goto err; 786 787 /* 788 * The log is on disk. It is now safe to update in place 789 * in the primary metadata set. 790 */ 791 ret = dmz_write_dirty_mblocks(zmd, &write_list, zmd->mblk_primary); 792 if (ret) 793 goto err; 794 795 ret = dmz_write_sb(zmd, zmd->mblk_primary); 796 if (ret) 797 goto err; 798 799 while (!list_empty(&write_list)) { 800 mblk = list_first_entry(&write_list, struct dmz_mblock, link); 801 list_del_init(&mblk->link); 802 803 spin_lock(&zmd->mblk_lock); 804 clear_bit(DMZ_META_DIRTY, &mblk->state); 805 if (mblk->ref == 0) 806 list_add_tail(&mblk->link, &zmd->mblk_lru_list); 807 spin_unlock(&zmd->mblk_lock); 808 } 809 810 zmd->sb_gen++; 811 out: 812 dmz_unlock_flush(zmd); 813 up_write(&zmd->mblk_sem); 814 815 return ret; 816 817 err: 818 if (!list_empty(&write_list)) { 819 spin_lock(&zmd->mblk_lock); 820 list_splice(&write_list, &zmd->mblk_dirty_list); 821 spin_unlock(&zmd->mblk_lock); 822 } 823 if (!dmz_check_bdev(zmd->dev)) 824 ret = -EIO; 825 goto out; 826 } 827 828 /* 829 * Check super block. 830 */ 831 static int dmz_check_sb(struct dmz_metadata *zmd, struct dmz_super *sb) 832 { 833 unsigned int nr_meta_zones, nr_data_zones; 834 struct dmz_dev *dev = zmd->dev; 835 u32 crc, stored_crc; 836 u64 gen; 837 838 gen = le64_to_cpu(sb->gen); 839 stored_crc = le32_to_cpu(sb->crc); 840 sb->crc = 0; 841 crc = crc32_le(gen, (unsigned char *)sb, DMZ_BLOCK_SIZE); 842 if (crc != stored_crc) { 843 dmz_dev_err(dev, "Invalid checksum (needed 0x%08x, got 0x%08x)", 844 crc, stored_crc); 845 return -ENXIO; 846 } 847 848 if (le32_to_cpu(sb->magic) != DMZ_MAGIC) { 849 dmz_dev_err(dev, "Invalid meta magic (needed 0x%08x, got 0x%08x)", 850 DMZ_MAGIC, le32_to_cpu(sb->magic)); 851 return -ENXIO; 852 } 853 854 if (le32_to_cpu(sb->version) != DMZ_META_VER) { 855 dmz_dev_err(dev, "Invalid meta version (needed %d, got %d)", 856 DMZ_META_VER, le32_to_cpu(sb->version)); 857 return -ENXIO; 858 } 859 860 nr_meta_zones = (le32_to_cpu(sb->nr_meta_blocks) + dev->zone_nr_blocks - 1) 861 >> dev->zone_nr_blocks_shift; 862 if (!nr_meta_zones || 863 nr_meta_zones >= zmd->nr_rnd_zones) { 864 dmz_dev_err(dev, "Invalid number of metadata blocks"); 865 return -ENXIO; 866 } 867 868 if (!le32_to_cpu(sb->nr_reserved_seq) || 869 le32_to_cpu(sb->nr_reserved_seq) >= (zmd->nr_useable_zones - nr_meta_zones)) { 870 dmz_dev_err(dev, "Invalid number of reserved sequential zones"); 871 return -ENXIO; 872 } 873 874 nr_data_zones = zmd->nr_useable_zones - 875 (nr_meta_zones * 2 + le32_to_cpu(sb->nr_reserved_seq)); 876 if (le32_to_cpu(sb->nr_chunks) > nr_data_zones) { 877 dmz_dev_err(dev, "Invalid number of chunks %u / %u", 878 le32_to_cpu(sb->nr_chunks), nr_data_zones); 879 return -ENXIO; 880 } 881 882 /* OK */ 883 zmd->nr_meta_blocks = le32_to_cpu(sb->nr_meta_blocks); 884 zmd->nr_reserved_seq = le32_to_cpu(sb->nr_reserved_seq); 885 zmd->nr_chunks = le32_to_cpu(sb->nr_chunks); 886 zmd->nr_map_blocks = le32_to_cpu(sb->nr_map_blocks); 887 zmd->nr_bitmap_blocks = le32_to_cpu(sb->nr_bitmap_blocks); 888 zmd->nr_meta_zones = nr_meta_zones; 889 zmd->nr_data_zones = nr_data_zones; 890 891 return 0; 892 } 893 894 /* 895 * Read the first or second super block from disk. 896 */ 897 static int dmz_read_sb(struct dmz_metadata *zmd, unsigned int set) 898 { 899 return dmz_rdwr_block(zmd, REQ_OP_READ, zmd->sb[set].block, 900 zmd->sb[set].mblk->page); 901 } 902 903 /* 904 * Determine the position of the secondary super blocks on disk. 905 * This is used only if a corruption of the primary super block 906 * is detected. 907 */ 908 static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd) 909 { 910 unsigned int zone_nr_blocks = zmd->dev->zone_nr_blocks; 911 struct dmz_mblock *mblk; 912 int i; 913 914 /* Allocate a block */ 915 mblk = dmz_alloc_mblock(zmd, 0); 916 if (!mblk) 917 return -ENOMEM; 918 919 zmd->sb[1].mblk = mblk; 920 zmd->sb[1].sb = mblk->data; 921 922 /* Bad first super block: search for the second one */ 923 zmd->sb[1].block = zmd->sb[0].block + zone_nr_blocks; 924 for (i = 0; i < zmd->nr_rnd_zones - 1; i++) { 925 if (dmz_read_sb(zmd, 1) != 0) 926 break; 927 if (le32_to_cpu(zmd->sb[1].sb->magic) == DMZ_MAGIC) 928 return 0; 929 zmd->sb[1].block += zone_nr_blocks; 930 } 931 932 dmz_free_mblock(zmd, mblk); 933 zmd->sb[1].mblk = NULL; 934 935 return -EIO; 936 } 937 938 /* 939 * Read the first or second super block from disk. 940 */ 941 static int dmz_get_sb(struct dmz_metadata *zmd, unsigned int set) 942 { 943 struct dmz_mblock *mblk; 944 int ret; 945 946 /* Allocate a block */ 947 mblk = dmz_alloc_mblock(zmd, 0); 948 if (!mblk) 949 return -ENOMEM; 950 951 zmd->sb[set].mblk = mblk; 952 zmd->sb[set].sb = mblk->data; 953 954 /* Read super block */ 955 ret = dmz_read_sb(zmd, set); 956 if (ret) { 957 dmz_free_mblock(zmd, mblk); 958 zmd->sb[set].mblk = NULL; 959 return ret; 960 } 961 962 return 0; 963 } 964 965 /* 966 * Recover a metadata set. 967 */ 968 static int dmz_recover_mblocks(struct dmz_metadata *zmd, unsigned int dst_set) 969 { 970 unsigned int src_set = dst_set ^ 0x1; 971 struct page *page; 972 int i, ret; 973 974 dmz_dev_warn(zmd->dev, "Metadata set %u invalid: recovering", dst_set); 975 976 if (dst_set == 0) 977 zmd->sb[0].block = dmz_start_block(zmd, zmd->sb_zone); 978 else { 979 zmd->sb[1].block = zmd->sb[0].block + 980 (zmd->nr_meta_zones << zmd->dev->zone_nr_blocks_shift); 981 } 982 983 page = alloc_page(GFP_NOIO); 984 if (!page) 985 return -ENOMEM; 986 987 /* Copy metadata blocks */ 988 for (i = 1; i < zmd->nr_meta_blocks; i++) { 989 ret = dmz_rdwr_block(zmd, REQ_OP_READ, 990 zmd->sb[src_set].block + i, page); 991 if (ret) 992 goto out; 993 ret = dmz_rdwr_block(zmd, REQ_OP_WRITE, 994 zmd->sb[dst_set].block + i, page); 995 if (ret) 996 goto out; 997 } 998 999 /* Finalize with the super block */ 1000 if (!zmd->sb[dst_set].mblk) { 1001 zmd->sb[dst_set].mblk = dmz_alloc_mblock(zmd, 0); 1002 if (!zmd->sb[dst_set].mblk) { 1003 ret = -ENOMEM; 1004 goto out; 1005 } 1006 zmd->sb[dst_set].sb = zmd->sb[dst_set].mblk->data; 1007 } 1008 1009 ret = dmz_write_sb(zmd, dst_set); 1010 out: 1011 __free_pages(page, 0); 1012 1013 return ret; 1014 } 1015 1016 /* 1017 * Get super block from disk. 1018 */ 1019 static int dmz_load_sb(struct dmz_metadata *zmd) 1020 { 1021 bool sb_good[2] = {false, false}; 1022 u64 sb_gen[2] = {0, 0}; 1023 int ret; 1024 1025 /* Read and check the primary super block */ 1026 zmd->sb[0].block = dmz_start_block(zmd, zmd->sb_zone); 1027 ret = dmz_get_sb(zmd, 0); 1028 if (ret) { 1029 dmz_dev_err(zmd->dev, "Read primary super block failed"); 1030 return ret; 1031 } 1032 1033 ret = dmz_check_sb(zmd, zmd->sb[0].sb); 1034 1035 /* Read and check secondary super block */ 1036 if (ret == 0) { 1037 sb_good[0] = true; 1038 zmd->sb[1].block = zmd->sb[0].block + 1039 (zmd->nr_meta_zones << zmd->dev->zone_nr_blocks_shift); 1040 ret = dmz_get_sb(zmd, 1); 1041 } else 1042 ret = dmz_lookup_secondary_sb(zmd); 1043 1044 if (ret) { 1045 dmz_dev_err(zmd->dev, "Read secondary super block failed"); 1046 return ret; 1047 } 1048 1049 ret = dmz_check_sb(zmd, zmd->sb[1].sb); 1050 if (ret == 0) 1051 sb_good[1] = true; 1052 1053 /* Use highest generation sb first */ 1054 if (!sb_good[0] && !sb_good[1]) { 1055 dmz_dev_err(zmd->dev, "No valid super block found"); 1056 return -EIO; 1057 } 1058 1059 if (sb_good[0]) 1060 sb_gen[0] = le64_to_cpu(zmd->sb[0].sb->gen); 1061 else 1062 ret = dmz_recover_mblocks(zmd, 0); 1063 1064 if (sb_good[1]) 1065 sb_gen[1] = le64_to_cpu(zmd->sb[1].sb->gen); 1066 else 1067 ret = dmz_recover_mblocks(zmd, 1); 1068 1069 if (ret) { 1070 dmz_dev_err(zmd->dev, "Recovery failed"); 1071 return -EIO; 1072 } 1073 1074 if (sb_gen[0] >= sb_gen[1]) { 1075 zmd->sb_gen = sb_gen[0]; 1076 zmd->mblk_primary = 0; 1077 } else { 1078 zmd->sb_gen = sb_gen[1]; 1079 zmd->mblk_primary = 1; 1080 } 1081 1082 dmz_dev_debug(zmd->dev, "Using super block %u (gen %llu)", 1083 zmd->mblk_primary, zmd->sb_gen); 1084 1085 return 0; 1086 } 1087 1088 /* 1089 * Initialize a zone descriptor. 1090 */ 1091 static int dmz_init_zone(struct blk_zone *blkz, unsigned int idx, void *data) 1092 { 1093 struct dmz_metadata *zmd = data; 1094 struct dm_zone *zone = &zmd->zones[idx]; 1095 struct dmz_dev *dev = zmd->dev; 1096 1097 /* Ignore the eventual last runt (smaller) zone */ 1098 if (blkz->len != dev->zone_nr_sectors) { 1099 if (blkz->start + blkz->len == dev->capacity) 1100 return 0; 1101 return -ENXIO; 1102 } 1103 1104 INIT_LIST_HEAD(&zone->link); 1105 atomic_set(&zone->refcount, 0); 1106 zone->chunk = DMZ_MAP_UNMAPPED; 1107 1108 switch (blkz->type) { 1109 case BLK_ZONE_TYPE_CONVENTIONAL: 1110 set_bit(DMZ_RND, &zone->flags); 1111 zmd->nr_rnd_zones++; 1112 break; 1113 case BLK_ZONE_TYPE_SEQWRITE_REQ: 1114 case BLK_ZONE_TYPE_SEQWRITE_PREF: 1115 set_bit(DMZ_SEQ, &zone->flags); 1116 break; 1117 default: 1118 return -ENXIO; 1119 } 1120 1121 if (dmz_is_rnd(zone)) 1122 zone->wp_block = 0; 1123 else 1124 zone->wp_block = dmz_sect2blk(blkz->wp - blkz->start); 1125 1126 if (blkz->cond == BLK_ZONE_COND_OFFLINE) 1127 set_bit(DMZ_OFFLINE, &zone->flags); 1128 else if (blkz->cond == BLK_ZONE_COND_READONLY) 1129 set_bit(DMZ_READ_ONLY, &zone->flags); 1130 else { 1131 zmd->nr_useable_zones++; 1132 if (dmz_is_rnd(zone)) { 1133 zmd->nr_rnd_zones++; 1134 if (!zmd->sb_zone) { 1135 /* Super block zone */ 1136 zmd->sb_zone = zone; 1137 } 1138 } 1139 } 1140 1141 return 0; 1142 } 1143 1144 /* 1145 * Free zones descriptors. 1146 */ 1147 static void dmz_drop_zones(struct dmz_metadata *zmd) 1148 { 1149 kfree(zmd->zones); 1150 zmd->zones = NULL; 1151 } 1152 1153 /* 1154 * Allocate and initialize zone descriptors using the zone 1155 * information from disk. 1156 */ 1157 static int dmz_init_zones(struct dmz_metadata *zmd) 1158 { 1159 struct dmz_dev *dev = zmd->dev; 1160 int ret; 1161 1162 /* Init */ 1163 zmd->zone_bitmap_size = dev->zone_nr_blocks >> 3; 1164 zmd->zone_nr_bitmap_blocks = zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT; 1165 1166 /* Allocate zone array */ 1167 zmd->zones = kcalloc(dev->nr_zones, sizeof(struct dm_zone), GFP_KERNEL); 1168 if (!zmd->zones) 1169 return -ENOMEM; 1170 1171 dmz_dev_info(dev, "Using %zu B for zone information", 1172 sizeof(struct dm_zone) * dev->nr_zones); 1173 1174 /* 1175 * Get zone information and initialize zone descriptors. At the same 1176 * time, determine where the super block should be: first block of the 1177 * first randomly writable zone. 1178 */ 1179 ret = blkdev_report_zones(dev->bdev, 0, BLK_ALL_ZONES, dmz_init_zone, 1180 zmd); 1181 if (ret < 0) { 1182 dmz_drop_zones(zmd); 1183 return ret; 1184 } 1185 1186 return 0; 1187 } 1188 1189 static int dmz_update_zone_cb(struct blk_zone *blkz, unsigned int idx, 1190 void *data) 1191 { 1192 struct dm_zone *zone = data; 1193 1194 clear_bit(DMZ_OFFLINE, &zone->flags); 1195 clear_bit(DMZ_READ_ONLY, &zone->flags); 1196 if (blkz->cond == BLK_ZONE_COND_OFFLINE) 1197 set_bit(DMZ_OFFLINE, &zone->flags); 1198 else if (blkz->cond == BLK_ZONE_COND_READONLY) 1199 set_bit(DMZ_READ_ONLY, &zone->flags); 1200 1201 if (dmz_is_seq(zone)) 1202 zone->wp_block = dmz_sect2blk(blkz->wp - blkz->start); 1203 else 1204 zone->wp_block = 0; 1205 return 0; 1206 } 1207 1208 /* 1209 * Update a zone information. 1210 */ 1211 static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone) 1212 { 1213 unsigned int noio_flag; 1214 int ret; 1215 1216 /* 1217 * Get zone information from disk. Since blkdev_report_zones() uses 1218 * GFP_KERNEL by default for memory allocations, set the per-task 1219 * PF_MEMALLOC_NOIO flag so that all allocations are done as if 1220 * GFP_NOIO was specified. 1221 */ 1222 noio_flag = memalloc_noio_save(); 1223 ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone), 1, 1224 dmz_update_zone_cb, zone); 1225 memalloc_noio_restore(noio_flag); 1226 1227 if (ret == 0) 1228 ret = -EIO; 1229 if (ret < 0) { 1230 dmz_dev_err(zmd->dev, "Get zone %u report failed", 1231 dmz_id(zmd, zone)); 1232 dmz_check_bdev(zmd->dev); 1233 return ret; 1234 } 1235 1236 return 0; 1237 } 1238 1239 /* 1240 * Check a zone write pointer position when the zone is marked 1241 * with the sequential write error flag. 1242 */ 1243 static int dmz_handle_seq_write_err(struct dmz_metadata *zmd, 1244 struct dm_zone *zone) 1245 { 1246 unsigned int wp = 0; 1247 int ret; 1248 1249 wp = zone->wp_block; 1250 ret = dmz_update_zone(zmd, zone); 1251 if (ret) 1252 return ret; 1253 1254 dmz_dev_warn(zmd->dev, "Processing zone %u write error (zone wp %u/%u)", 1255 dmz_id(zmd, zone), zone->wp_block, wp); 1256 1257 if (zone->wp_block < wp) { 1258 dmz_invalidate_blocks(zmd, zone, zone->wp_block, 1259 wp - zone->wp_block); 1260 } 1261 1262 return 0; 1263 } 1264 1265 static struct dm_zone *dmz_get(struct dmz_metadata *zmd, unsigned int zone_id) 1266 { 1267 return &zmd->zones[zone_id]; 1268 } 1269 1270 /* 1271 * Reset a zone write pointer. 1272 */ 1273 static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone) 1274 { 1275 int ret; 1276 1277 /* 1278 * Ignore offline zones, read only zones, 1279 * and conventional zones. 1280 */ 1281 if (dmz_is_offline(zone) || 1282 dmz_is_readonly(zone) || 1283 dmz_is_rnd(zone)) 1284 return 0; 1285 1286 if (!dmz_is_empty(zone) || dmz_seq_write_err(zone)) { 1287 struct dmz_dev *dev = zmd->dev; 1288 1289 ret = blkdev_zone_mgmt(dev->bdev, REQ_OP_ZONE_RESET, 1290 dmz_start_sect(zmd, zone), 1291 dev->zone_nr_sectors, GFP_NOIO); 1292 if (ret) { 1293 dmz_dev_err(dev, "Reset zone %u failed %d", 1294 dmz_id(zmd, zone), ret); 1295 return ret; 1296 } 1297 } 1298 1299 /* Clear write error bit and rewind write pointer position */ 1300 clear_bit(DMZ_SEQ_WRITE_ERR, &zone->flags); 1301 zone->wp_block = 0; 1302 1303 return 0; 1304 } 1305 1306 static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone); 1307 1308 /* 1309 * Initialize chunk mapping. 1310 */ 1311 static int dmz_load_mapping(struct dmz_metadata *zmd) 1312 { 1313 struct dmz_dev *dev = zmd->dev; 1314 struct dm_zone *dzone, *bzone; 1315 struct dmz_mblock *dmap_mblk = NULL; 1316 struct dmz_map *dmap; 1317 unsigned int i = 0, e = 0, chunk = 0; 1318 unsigned int dzone_id; 1319 unsigned int bzone_id; 1320 1321 /* Metadata block array for the chunk mapping table */ 1322 zmd->map_mblk = kcalloc(zmd->nr_map_blocks, 1323 sizeof(struct dmz_mblk *), GFP_KERNEL); 1324 if (!zmd->map_mblk) 1325 return -ENOMEM; 1326 1327 /* Get chunk mapping table blocks and initialize zone mapping */ 1328 while (chunk < zmd->nr_chunks) { 1329 if (!dmap_mblk) { 1330 /* Get mapping block */ 1331 dmap_mblk = dmz_get_mblock(zmd, i + 1); 1332 if (IS_ERR(dmap_mblk)) 1333 return PTR_ERR(dmap_mblk); 1334 zmd->map_mblk[i] = dmap_mblk; 1335 dmap = (struct dmz_map *) dmap_mblk->data; 1336 i++; 1337 e = 0; 1338 } 1339 1340 /* Check data zone */ 1341 dzone_id = le32_to_cpu(dmap[e].dzone_id); 1342 if (dzone_id == DMZ_MAP_UNMAPPED) 1343 goto next; 1344 1345 if (dzone_id >= dev->nr_zones) { 1346 dmz_dev_err(dev, "Chunk %u mapping: invalid data zone ID %u", 1347 chunk, dzone_id); 1348 return -EIO; 1349 } 1350 1351 dzone = dmz_get(zmd, dzone_id); 1352 set_bit(DMZ_DATA, &dzone->flags); 1353 dzone->chunk = chunk; 1354 dmz_get_zone_weight(zmd, dzone); 1355 1356 if (dmz_is_rnd(dzone)) 1357 list_add_tail(&dzone->link, &zmd->map_rnd_list); 1358 else 1359 list_add_tail(&dzone->link, &zmd->map_seq_list); 1360 1361 /* Check buffer zone */ 1362 bzone_id = le32_to_cpu(dmap[e].bzone_id); 1363 if (bzone_id == DMZ_MAP_UNMAPPED) 1364 goto next; 1365 1366 if (bzone_id >= dev->nr_zones) { 1367 dmz_dev_err(dev, "Chunk %u mapping: invalid buffer zone ID %u", 1368 chunk, bzone_id); 1369 return -EIO; 1370 } 1371 1372 bzone = dmz_get(zmd, bzone_id); 1373 if (!dmz_is_rnd(bzone)) { 1374 dmz_dev_err(dev, "Chunk %u mapping: invalid buffer zone %u", 1375 chunk, bzone_id); 1376 return -EIO; 1377 } 1378 1379 set_bit(DMZ_DATA, &bzone->flags); 1380 set_bit(DMZ_BUF, &bzone->flags); 1381 bzone->chunk = chunk; 1382 bzone->bzone = dzone; 1383 dzone->bzone = bzone; 1384 dmz_get_zone_weight(zmd, bzone); 1385 list_add_tail(&bzone->link, &zmd->map_rnd_list); 1386 next: 1387 chunk++; 1388 e++; 1389 if (e >= DMZ_MAP_ENTRIES) 1390 dmap_mblk = NULL; 1391 } 1392 1393 /* 1394 * At this point, only meta zones and mapped data zones were 1395 * fully initialized. All remaining zones are unmapped data 1396 * zones. Finish initializing those here. 1397 */ 1398 for (i = 0; i < dev->nr_zones; i++) { 1399 dzone = dmz_get(zmd, i); 1400 if (dmz_is_meta(dzone)) 1401 continue; 1402 1403 if (dmz_is_rnd(dzone)) 1404 zmd->nr_rnd++; 1405 else 1406 zmd->nr_seq++; 1407 1408 if (dmz_is_data(dzone)) { 1409 /* Already initialized */ 1410 continue; 1411 } 1412 1413 /* Unmapped data zone */ 1414 set_bit(DMZ_DATA, &dzone->flags); 1415 dzone->chunk = DMZ_MAP_UNMAPPED; 1416 if (dmz_is_rnd(dzone)) { 1417 list_add_tail(&dzone->link, &zmd->unmap_rnd_list); 1418 atomic_inc(&zmd->unmap_nr_rnd); 1419 } else if (atomic_read(&zmd->nr_reserved_seq_zones) < zmd->nr_reserved_seq) { 1420 list_add_tail(&dzone->link, &zmd->reserved_seq_zones_list); 1421 atomic_inc(&zmd->nr_reserved_seq_zones); 1422 zmd->nr_seq--; 1423 } else { 1424 list_add_tail(&dzone->link, &zmd->unmap_seq_list); 1425 atomic_inc(&zmd->unmap_nr_seq); 1426 } 1427 } 1428 1429 return 0; 1430 } 1431 1432 /* 1433 * Set a data chunk mapping. 1434 */ 1435 static void dmz_set_chunk_mapping(struct dmz_metadata *zmd, unsigned int chunk, 1436 unsigned int dzone_id, unsigned int bzone_id) 1437 { 1438 struct dmz_mblock *dmap_mblk = zmd->map_mblk[chunk >> DMZ_MAP_ENTRIES_SHIFT]; 1439 struct dmz_map *dmap = (struct dmz_map *) dmap_mblk->data; 1440 int map_idx = chunk & DMZ_MAP_ENTRIES_MASK; 1441 1442 dmap[map_idx].dzone_id = cpu_to_le32(dzone_id); 1443 dmap[map_idx].bzone_id = cpu_to_le32(bzone_id); 1444 dmz_dirty_mblock(zmd, dmap_mblk); 1445 } 1446 1447 /* 1448 * The list of mapped zones is maintained in LRU order. 1449 * This rotates a zone at the end of its map list. 1450 */ 1451 static void __dmz_lru_zone(struct dmz_metadata *zmd, struct dm_zone *zone) 1452 { 1453 if (list_empty(&zone->link)) 1454 return; 1455 1456 list_del_init(&zone->link); 1457 if (dmz_is_seq(zone)) { 1458 /* LRU rotate sequential zone */ 1459 list_add_tail(&zone->link, &zmd->map_seq_list); 1460 } else { 1461 /* LRU rotate random zone */ 1462 list_add_tail(&zone->link, &zmd->map_rnd_list); 1463 } 1464 } 1465 1466 /* 1467 * The list of mapped random zones is maintained 1468 * in LRU order. This rotates a zone at the end of the list. 1469 */ 1470 static void dmz_lru_zone(struct dmz_metadata *zmd, struct dm_zone *zone) 1471 { 1472 __dmz_lru_zone(zmd, zone); 1473 if (zone->bzone) 1474 __dmz_lru_zone(zmd, zone->bzone); 1475 } 1476 1477 /* 1478 * Wait for any zone to be freed. 1479 */ 1480 static void dmz_wait_for_free_zones(struct dmz_metadata *zmd) 1481 { 1482 DEFINE_WAIT(wait); 1483 1484 prepare_to_wait(&zmd->free_wq, &wait, TASK_UNINTERRUPTIBLE); 1485 dmz_unlock_map(zmd); 1486 dmz_unlock_metadata(zmd); 1487 1488 io_schedule_timeout(HZ); 1489 1490 dmz_lock_metadata(zmd); 1491 dmz_lock_map(zmd); 1492 finish_wait(&zmd->free_wq, &wait); 1493 } 1494 1495 /* 1496 * Lock a zone for reclaim (set the zone RECLAIM bit). 1497 * Returns false if the zone cannot be locked or if it is already locked 1498 * and 1 otherwise. 1499 */ 1500 int dmz_lock_zone_reclaim(struct dm_zone *zone) 1501 { 1502 /* Active zones cannot be reclaimed */ 1503 if (dmz_is_active(zone)) 1504 return 0; 1505 1506 return !test_and_set_bit(DMZ_RECLAIM, &zone->flags); 1507 } 1508 1509 /* 1510 * Clear a zone reclaim flag. 1511 */ 1512 void dmz_unlock_zone_reclaim(struct dm_zone *zone) 1513 { 1514 WARN_ON(dmz_is_active(zone)); 1515 WARN_ON(!dmz_in_reclaim(zone)); 1516 1517 clear_bit_unlock(DMZ_RECLAIM, &zone->flags); 1518 smp_mb__after_atomic(); 1519 wake_up_bit(&zone->flags, DMZ_RECLAIM); 1520 } 1521 1522 /* 1523 * Wait for a zone reclaim to complete. 1524 */ 1525 static void dmz_wait_for_reclaim(struct dmz_metadata *zmd, struct dm_zone *zone) 1526 { 1527 dmz_unlock_map(zmd); 1528 dmz_unlock_metadata(zmd); 1529 wait_on_bit_timeout(&zone->flags, DMZ_RECLAIM, TASK_UNINTERRUPTIBLE, HZ); 1530 dmz_lock_metadata(zmd); 1531 dmz_lock_map(zmd); 1532 } 1533 1534 /* 1535 * Select a random write zone for reclaim. 1536 */ 1537 static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd) 1538 { 1539 struct dm_zone *dzone = NULL; 1540 struct dm_zone *zone; 1541 1542 if (list_empty(&zmd->map_rnd_list)) 1543 return ERR_PTR(-EBUSY); 1544 1545 list_for_each_entry(zone, &zmd->map_rnd_list, link) { 1546 if (dmz_is_buf(zone)) 1547 dzone = zone->bzone; 1548 else 1549 dzone = zone; 1550 if (dmz_lock_zone_reclaim(dzone)) 1551 return dzone; 1552 } 1553 1554 return ERR_PTR(-EBUSY); 1555 } 1556 1557 /* 1558 * Select a buffered sequential zone for reclaim. 1559 */ 1560 static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd) 1561 { 1562 struct dm_zone *zone; 1563 1564 if (list_empty(&zmd->map_seq_list)) 1565 return ERR_PTR(-EBUSY); 1566 1567 list_for_each_entry(zone, &zmd->map_seq_list, link) { 1568 if (!zone->bzone) 1569 continue; 1570 if (dmz_lock_zone_reclaim(zone)) 1571 return zone; 1572 } 1573 1574 return ERR_PTR(-EBUSY); 1575 } 1576 1577 /* 1578 * Select a zone for reclaim. 1579 */ 1580 struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd) 1581 { 1582 struct dm_zone *zone; 1583 1584 /* 1585 * Search for a zone candidate to reclaim: 2 cases are possible. 1586 * (1) There is no free sequential zones. Then a random data zone 1587 * cannot be reclaimed. So choose a sequential zone to reclaim so 1588 * that afterward a random zone can be reclaimed. 1589 * (2) At least one free sequential zone is available, then choose 1590 * the oldest random zone (data or buffer) that can be locked. 1591 */ 1592 dmz_lock_map(zmd); 1593 if (list_empty(&zmd->reserved_seq_zones_list)) 1594 zone = dmz_get_seq_zone_for_reclaim(zmd); 1595 else 1596 zone = dmz_get_rnd_zone_for_reclaim(zmd); 1597 dmz_unlock_map(zmd); 1598 1599 return zone; 1600 } 1601 1602 /* 1603 * Get the zone mapping a chunk, if the chunk is mapped already. 1604 * If no mapping exist and the operation is WRITE, a zone is 1605 * allocated and used to map the chunk. 1606 * The zone returned will be set to the active state. 1607 */ 1608 struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd, unsigned int chunk, int op) 1609 { 1610 struct dmz_mblock *dmap_mblk = zmd->map_mblk[chunk >> DMZ_MAP_ENTRIES_SHIFT]; 1611 struct dmz_map *dmap = (struct dmz_map *) dmap_mblk->data; 1612 int dmap_idx = chunk & DMZ_MAP_ENTRIES_MASK; 1613 unsigned int dzone_id; 1614 struct dm_zone *dzone = NULL; 1615 int ret = 0; 1616 1617 dmz_lock_map(zmd); 1618 again: 1619 /* Get the chunk mapping */ 1620 dzone_id = le32_to_cpu(dmap[dmap_idx].dzone_id); 1621 if (dzone_id == DMZ_MAP_UNMAPPED) { 1622 /* 1623 * Read or discard in unmapped chunks are fine. But for 1624 * writes, we need a mapping, so get one. 1625 */ 1626 if (op != REQ_OP_WRITE) 1627 goto out; 1628 1629 /* Allocate a random zone */ 1630 dzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND); 1631 if (!dzone) { 1632 if (dmz_bdev_is_dying(zmd->dev)) { 1633 dzone = ERR_PTR(-EIO); 1634 goto out; 1635 } 1636 dmz_wait_for_free_zones(zmd); 1637 goto again; 1638 } 1639 1640 dmz_map_zone(zmd, dzone, chunk); 1641 1642 } else { 1643 /* The chunk is already mapped: get the mapping zone */ 1644 dzone = dmz_get(zmd, dzone_id); 1645 if (dzone->chunk != chunk) { 1646 dzone = ERR_PTR(-EIO); 1647 goto out; 1648 } 1649 1650 /* Repair write pointer if the sequential dzone has error */ 1651 if (dmz_seq_write_err(dzone)) { 1652 ret = dmz_handle_seq_write_err(zmd, dzone); 1653 if (ret) { 1654 dzone = ERR_PTR(-EIO); 1655 goto out; 1656 } 1657 clear_bit(DMZ_SEQ_WRITE_ERR, &dzone->flags); 1658 } 1659 } 1660 1661 /* 1662 * If the zone is being reclaimed, the chunk mapping may change 1663 * to a different zone. So wait for reclaim and retry. Otherwise, 1664 * activate the zone (this will prevent reclaim from touching it). 1665 */ 1666 if (dmz_in_reclaim(dzone)) { 1667 dmz_wait_for_reclaim(zmd, dzone); 1668 goto again; 1669 } 1670 dmz_activate_zone(dzone); 1671 dmz_lru_zone(zmd, dzone); 1672 out: 1673 dmz_unlock_map(zmd); 1674 1675 return dzone; 1676 } 1677 1678 /* 1679 * Write and discard change the block validity of data zones and their buffer 1680 * zones. Check here that valid blocks are still present. If all blocks are 1681 * invalid, the zones can be unmapped on the fly without waiting for reclaim 1682 * to do it. 1683 */ 1684 void dmz_put_chunk_mapping(struct dmz_metadata *zmd, struct dm_zone *dzone) 1685 { 1686 struct dm_zone *bzone; 1687 1688 dmz_lock_map(zmd); 1689 1690 bzone = dzone->bzone; 1691 if (bzone) { 1692 if (dmz_weight(bzone)) 1693 dmz_lru_zone(zmd, bzone); 1694 else { 1695 /* Empty buffer zone: reclaim it */ 1696 dmz_unmap_zone(zmd, bzone); 1697 dmz_free_zone(zmd, bzone); 1698 bzone = NULL; 1699 } 1700 } 1701 1702 /* Deactivate the data zone */ 1703 dmz_deactivate_zone(dzone); 1704 if (dmz_is_active(dzone) || bzone || dmz_weight(dzone)) 1705 dmz_lru_zone(zmd, dzone); 1706 else { 1707 /* Unbuffered inactive empty data zone: reclaim it */ 1708 dmz_unmap_zone(zmd, dzone); 1709 dmz_free_zone(zmd, dzone); 1710 } 1711 1712 dmz_unlock_map(zmd); 1713 } 1714 1715 /* 1716 * Allocate and map a random zone to buffer a chunk 1717 * already mapped to a sequential zone. 1718 */ 1719 struct dm_zone *dmz_get_chunk_buffer(struct dmz_metadata *zmd, 1720 struct dm_zone *dzone) 1721 { 1722 struct dm_zone *bzone; 1723 1724 dmz_lock_map(zmd); 1725 again: 1726 bzone = dzone->bzone; 1727 if (bzone) 1728 goto out; 1729 1730 /* Allocate a random zone */ 1731 bzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND); 1732 if (!bzone) { 1733 if (dmz_bdev_is_dying(zmd->dev)) { 1734 bzone = ERR_PTR(-EIO); 1735 goto out; 1736 } 1737 dmz_wait_for_free_zones(zmd); 1738 goto again; 1739 } 1740 1741 /* Update the chunk mapping */ 1742 dmz_set_chunk_mapping(zmd, dzone->chunk, dmz_id(zmd, dzone), 1743 dmz_id(zmd, bzone)); 1744 1745 set_bit(DMZ_BUF, &bzone->flags); 1746 bzone->chunk = dzone->chunk; 1747 bzone->bzone = dzone; 1748 dzone->bzone = bzone; 1749 list_add_tail(&bzone->link, &zmd->map_rnd_list); 1750 out: 1751 dmz_unlock_map(zmd); 1752 1753 return bzone; 1754 } 1755 1756 /* 1757 * Get an unmapped (free) zone. 1758 * This must be called with the mapping lock held. 1759 */ 1760 struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned long flags) 1761 { 1762 struct list_head *list; 1763 struct dm_zone *zone; 1764 1765 if (flags & DMZ_ALLOC_RND) 1766 list = &zmd->unmap_rnd_list; 1767 else 1768 list = &zmd->unmap_seq_list; 1769 again: 1770 if (list_empty(list)) { 1771 /* 1772 * No free zone: if this is for reclaim, allow using the 1773 * reserved sequential zones. 1774 */ 1775 if (!(flags & DMZ_ALLOC_RECLAIM) || 1776 list_empty(&zmd->reserved_seq_zones_list)) 1777 return NULL; 1778 1779 zone = list_first_entry(&zmd->reserved_seq_zones_list, 1780 struct dm_zone, link); 1781 list_del_init(&zone->link); 1782 atomic_dec(&zmd->nr_reserved_seq_zones); 1783 return zone; 1784 } 1785 1786 zone = list_first_entry(list, struct dm_zone, link); 1787 list_del_init(&zone->link); 1788 1789 if (dmz_is_rnd(zone)) 1790 atomic_dec(&zmd->unmap_nr_rnd); 1791 else 1792 atomic_dec(&zmd->unmap_nr_seq); 1793 1794 if (dmz_is_offline(zone)) { 1795 dmz_dev_warn(zmd->dev, "Zone %u is offline", dmz_id(zmd, zone)); 1796 zone = NULL; 1797 goto again; 1798 } 1799 1800 return zone; 1801 } 1802 1803 /* 1804 * Free a zone. 1805 * This must be called with the mapping lock held. 1806 */ 1807 void dmz_free_zone(struct dmz_metadata *zmd, struct dm_zone *zone) 1808 { 1809 /* If this is a sequential zone, reset it */ 1810 if (dmz_is_seq(zone)) 1811 dmz_reset_zone(zmd, zone); 1812 1813 /* Return the zone to its type unmap list */ 1814 if (dmz_is_rnd(zone)) { 1815 list_add_tail(&zone->link, &zmd->unmap_rnd_list); 1816 atomic_inc(&zmd->unmap_nr_rnd); 1817 } else if (atomic_read(&zmd->nr_reserved_seq_zones) < 1818 zmd->nr_reserved_seq) { 1819 list_add_tail(&zone->link, &zmd->reserved_seq_zones_list); 1820 atomic_inc(&zmd->nr_reserved_seq_zones); 1821 } else { 1822 list_add_tail(&zone->link, &zmd->unmap_seq_list); 1823 atomic_inc(&zmd->unmap_nr_seq); 1824 } 1825 1826 wake_up_all(&zmd->free_wq); 1827 } 1828 1829 /* 1830 * Map a chunk to a zone. 1831 * This must be called with the mapping lock held. 1832 */ 1833 void dmz_map_zone(struct dmz_metadata *zmd, struct dm_zone *dzone, 1834 unsigned int chunk) 1835 { 1836 /* Set the chunk mapping */ 1837 dmz_set_chunk_mapping(zmd, chunk, dmz_id(zmd, dzone), 1838 DMZ_MAP_UNMAPPED); 1839 dzone->chunk = chunk; 1840 if (dmz_is_rnd(dzone)) 1841 list_add_tail(&dzone->link, &zmd->map_rnd_list); 1842 else 1843 list_add_tail(&dzone->link, &zmd->map_seq_list); 1844 } 1845 1846 /* 1847 * Unmap a zone. 1848 * This must be called with the mapping lock held. 1849 */ 1850 void dmz_unmap_zone(struct dmz_metadata *zmd, struct dm_zone *zone) 1851 { 1852 unsigned int chunk = zone->chunk; 1853 unsigned int dzone_id; 1854 1855 if (chunk == DMZ_MAP_UNMAPPED) { 1856 /* Already unmapped */ 1857 return; 1858 } 1859 1860 if (test_and_clear_bit(DMZ_BUF, &zone->flags)) { 1861 /* 1862 * Unmapping the chunk buffer zone: clear only 1863 * the chunk buffer mapping 1864 */ 1865 dzone_id = dmz_id(zmd, zone->bzone); 1866 zone->bzone->bzone = NULL; 1867 zone->bzone = NULL; 1868 1869 } else { 1870 /* 1871 * Unmapping the chunk data zone: the zone must 1872 * not be buffered. 1873 */ 1874 if (WARN_ON(zone->bzone)) { 1875 zone->bzone->bzone = NULL; 1876 zone->bzone = NULL; 1877 } 1878 dzone_id = DMZ_MAP_UNMAPPED; 1879 } 1880 1881 dmz_set_chunk_mapping(zmd, chunk, dzone_id, DMZ_MAP_UNMAPPED); 1882 1883 zone->chunk = DMZ_MAP_UNMAPPED; 1884 list_del_init(&zone->link); 1885 } 1886 1887 /* 1888 * Set @nr_bits bits in @bitmap starting from @bit. 1889 * Return the number of bits changed from 0 to 1. 1890 */ 1891 static unsigned int dmz_set_bits(unsigned long *bitmap, 1892 unsigned int bit, unsigned int nr_bits) 1893 { 1894 unsigned long *addr; 1895 unsigned int end = bit + nr_bits; 1896 unsigned int n = 0; 1897 1898 while (bit < end) { 1899 if (((bit & (BITS_PER_LONG - 1)) == 0) && 1900 ((end - bit) >= BITS_PER_LONG)) { 1901 /* Try to set the whole word at once */ 1902 addr = bitmap + BIT_WORD(bit); 1903 if (*addr == 0) { 1904 *addr = ULONG_MAX; 1905 n += BITS_PER_LONG; 1906 bit += BITS_PER_LONG; 1907 continue; 1908 } 1909 } 1910 1911 if (!test_and_set_bit(bit, bitmap)) 1912 n++; 1913 bit++; 1914 } 1915 1916 return n; 1917 } 1918 1919 /* 1920 * Get the bitmap block storing the bit for chunk_block in zone. 1921 */ 1922 static struct dmz_mblock *dmz_get_bitmap(struct dmz_metadata *zmd, 1923 struct dm_zone *zone, 1924 sector_t chunk_block) 1925 { 1926 sector_t bitmap_block = 1 + zmd->nr_map_blocks + 1927 (sector_t)(dmz_id(zmd, zone) * zmd->zone_nr_bitmap_blocks) + 1928 (chunk_block >> DMZ_BLOCK_SHIFT_BITS); 1929 1930 return dmz_get_mblock(zmd, bitmap_block); 1931 } 1932 1933 /* 1934 * Copy the valid blocks bitmap of from_zone to the bitmap of to_zone. 1935 */ 1936 int dmz_copy_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone, 1937 struct dm_zone *to_zone) 1938 { 1939 struct dmz_mblock *from_mblk, *to_mblk; 1940 sector_t chunk_block = 0; 1941 1942 /* Get the zones bitmap blocks */ 1943 while (chunk_block < zmd->dev->zone_nr_blocks) { 1944 from_mblk = dmz_get_bitmap(zmd, from_zone, chunk_block); 1945 if (IS_ERR(from_mblk)) 1946 return PTR_ERR(from_mblk); 1947 to_mblk = dmz_get_bitmap(zmd, to_zone, chunk_block); 1948 if (IS_ERR(to_mblk)) { 1949 dmz_release_mblock(zmd, from_mblk); 1950 return PTR_ERR(to_mblk); 1951 } 1952 1953 memcpy(to_mblk->data, from_mblk->data, DMZ_BLOCK_SIZE); 1954 dmz_dirty_mblock(zmd, to_mblk); 1955 1956 dmz_release_mblock(zmd, to_mblk); 1957 dmz_release_mblock(zmd, from_mblk); 1958 1959 chunk_block += DMZ_BLOCK_SIZE_BITS; 1960 } 1961 1962 to_zone->weight = from_zone->weight; 1963 1964 return 0; 1965 } 1966 1967 /* 1968 * Merge the valid blocks bitmap of from_zone into the bitmap of to_zone, 1969 * starting from chunk_block. 1970 */ 1971 int dmz_merge_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone, 1972 struct dm_zone *to_zone, sector_t chunk_block) 1973 { 1974 unsigned int nr_blocks; 1975 int ret; 1976 1977 /* Get the zones bitmap blocks */ 1978 while (chunk_block < zmd->dev->zone_nr_blocks) { 1979 /* Get a valid region from the source zone */ 1980 ret = dmz_first_valid_block(zmd, from_zone, &chunk_block); 1981 if (ret <= 0) 1982 return ret; 1983 1984 nr_blocks = ret; 1985 ret = dmz_validate_blocks(zmd, to_zone, chunk_block, nr_blocks); 1986 if (ret) 1987 return ret; 1988 1989 chunk_block += nr_blocks; 1990 } 1991 1992 return 0; 1993 } 1994 1995 /* 1996 * Validate all the blocks in the range [block..block+nr_blocks-1]. 1997 */ 1998 int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, 1999 sector_t chunk_block, unsigned int nr_blocks) 2000 { 2001 unsigned int count, bit, nr_bits; 2002 unsigned int zone_nr_blocks = zmd->dev->zone_nr_blocks; 2003 struct dmz_mblock *mblk; 2004 unsigned int n = 0; 2005 2006 dmz_dev_debug(zmd->dev, "=> VALIDATE zone %u, block %llu, %u blocks", 2007 dmz_id(zmd, zone), (unsigned long long)chunk_block, 2008 nr_blocks); 2009 2010 WARN_ON(chunk_block + nr_blocks > zone_nr_blocks); 2011 2012 while (nr_blocks) { 2013 /* Get bitmap block */ 2014 mblk = dmz_get_bitmap(zmd, zone, chunk_block); 2015 if (IS_ERR(mblk)) 2016 return PTR_ERR(mblk); 2017 2018 /* Set bits */ 2019 bit = chunk_block & DMZ_BLOCK_MASK_BITS; 2020 nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit); 2021 2022 count = dmz_set_bits((unsigned long *)mblk->data, bit, nr_bits); 2023 if (count) { 2024 dmz_dirty_mblock(zmd, mblk); 2025 n += count; 2026 } 2027 dmz_release_mblock(zmd, mblk); 2028 2029 nr_blocks -= nr_bits; 2030 chunk_block += nr_bits; 2031 } 2032 2033 if (likely(zone->weight + n <= zone_nr_blocks)) 2034 zone->weight += n; 2035 else { 2036 dmz_dev_warn(zmd->dev, "Zone %u: weight %u should be <= %u", 2037 dmz_id(zmd, zone), zone->weight, 2038 zone_nr_blocks - n); 2039 zone->weight = zone_nr_blocks; 2040 } 2041 2042 return 0; 2043 } 2044 2045 /* 2046 * Clear nr_bits bits in bitmap starting from bit. 2047 * Return the number of bits cleared. 2048 */ 2049 static int dmz_clear_bits(unsigned long *bitmap, int bit, int nr_bits) 2050 { 2051 unsigned long *addr; 2052 int end = bit + nr_bits; 2053 int n = 0; 2054 2055 while (bit < end) { 2056 if (((bit & (BITS_PER_LONG - 1)) == 0) && 2057 ((end - bit) >= BITS_PER_LONG)) { 2058 /* Try to clear whole word at once */ 2059 addr = bitmap + BIT_WORD(bit); 2060 if (*addr == ULONG_MAX) { 2061 *addr = 0; 2062 n += BITS_PER_LONG; 2063 bit += BITS_PER_LONG; 2064 continue; 2065 } 2066 } 2067 2068 if (test_and_clear_bit(bit, bitmap)) 2069 n++; 2070 bit++; 2071 } 2072 2073 return n; 2074 } 2075 2076 /* 2077 * Invalidate all the blocks in the range [block..block+nr_blocks-1]. 2078 */ 2079 int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, 2080 sector_t chunk_block, unsigned int nr_blocks) 2081 { 2082 unsigned int count, bit, nr_bits; 2083 struct dmz_mblock *mblk; 2084 unsigned int n = 0; 2085 2086 dmz_dev_debug(zmd->dev, "=> INVALIDATE zone %u, block %llu, %u blocks", 2087 dmz_id(zmd, zone), (u64)chunk_block, nr_blocks); 2088 2089 WARN_ON(chunk_block + nr_blocks > zmd->dev->zone_nr_blocks); 2090 2091 while (nr_blocks) { 2092 /* Get bitmap block */ 2093 mblk = dmz_get_bitmap(zmd, zone, chunk_block); 2094 if (IS_ERR(mblk)) 2095 return PTR_ERR(mblk); 2096 2097 /* Clear bits */ 2098 bit = chunk_block & DMZ_BLOCK_MASK_BITS; 2099 nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit); 2100 2101 count = dmz_clear_bits((unsigned long *)mblk->data, 2102 bit, nr_bits); 2103 if (count) { 2104 dmz_dirty_mblock(zmd, mblk); 2105 n += count; 2106 } 2107 dmz_release_mblock(zmd, mblk); 2108 2109 nr_blocks -= nr_bits; 2110 chunk_block += nr_bits; 2111 } 2112 2113 if (zone->weight >= n) 2114 zone->weight -= n; 2115 else { 2116 dmz_dev_warn(zmd->dev, "Zone %u: weight %u should be >= %u", 2117 dmz_id(zmd, zone), zone->weight, n); 2118 zone->weight = 0; 2119 } 2120 2121 return 0; 2122 } 2123 2124 /* 2125 * Get a block bit value. 2126 */ 2127 static int dmz_test_block(struct dmz_metadata *zmd, struct dm_zone *zone, 2128 sector_t chunk_block) 2129 { 2130 struct dmz_mblock *mblk; 2131 int ret; 2132 2133 WARN_ON(chunk_block >= zmd->dev->zone_nr_blocks); 2134 2135 /* Get bitmap block */ 2136 mblk = dmz_get_bitmap(zmd, zone, chunk_block); 2137 if (IS_ERR(mblk)) 2138 return PTR_ERR(mblk); 2139 2140 /* Get offset */ 2141 ret = test_bit(chunk_block & DMZ_BLOCK_MASK_BITS, 2142 (unsigned long *) mblk->data) != 0; 2143 2144 dmz_release_mblock(zmd, mblk); 2145 2146 return ret; 2147 } 2148 2149 /* 2150 * Return the number of blocks from chunk_block to the first block with a bit 2151 * value specified by set. Search at most nr_blocks blocks from chunk_block. 2152 */ 2153 static int dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone, 2154 sector_t chunk_block, unsigned int nr_blocks, 2155 int set) 2156 { 2157 struct dmz_mblock *mblk; 2158 unsigned int bit, set_bit, nr_bits; 2159 unsigned long *bitmap; 2160 int n = 0; 2161 2162 WARN_ON(chunk_block + nr_blocks > zmd->dev->zone_nr_blocks); 2163 2164 while (nr_blocks) { 2165 /* Get bitmap block */ 2166 mblk = dmz_get_bitmap(zmd, zone, chunk_block); 2167 if (IS_ERR(mblk)) 2168 return PTR_ERR(mblk); 2169 2170 /* Get offset */ 2171 bitmap = (unsigned long *) mblk->data; 2172 bit = chunk_block & DMZ_BLOCK_MASK_BITS; 2173 nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit); 2174 if (set) 2175 set_bit = find_next_bit(bitmap, DMZ_BLOCK_SIZE_BITS, bit); 2176 else 2177 set_bit = find_next_zero_bit(bitmap, DMZ_BLOCK_SIZE_BITS, bit); 2178 dmz_release_mblock(zmd, mblk); 2179 2180 n += set_bit - bit; 2181 if (set_bit < DMZ_BLOCK_SIZE_BITS) 2182 break; 2183 2184 nr_blocks -= nr_bits; 2185 chunk_block += nr_bits; 2186 } 2187 2188 return n; 2189 } 2190 2191 /* 2192 * Test if chunk_block is valid. If it is, the number of consecutive 2193 * valid blocks from chunk_block will be returned. 2194 */ 2195 int dmz_block_valid(struct dmz_metadata *zmd, struct dm_zone *zone, 2196 sector_t chunk_block) 2197 { 2198 int valid; 2199 2200 valid = dmz_test_block(zmd, zone, chunk_block); 2201 if (valid <= 0) 2202 return valid; 2203 2204 /* The block is valid: get the number of valid blocks from block */ 2205 return dmz_to_next_set_block(zmd, zone, chunk_block, 2206 zmd->dev->zone_nr_blocks - chunk_block, 0); 2207 } 2208 2209 /* 2210 * Find the first valid block from @chunk_block in @zone. 2211 * If such a block is found, its number is returned using 2212 * @chunk_block and the total number of valid blocks from @chunk_block 2213 * is returned. 2214 */ 2215 int dmz_first_valid_block(struct dmz_metadata *zmd, struct dm_zone *zone, 2216 sector_t *chunk_block) 2217 { 2218 sector_t start_block = *chunk_block; 2219 int ret; 2220 2221 ret = dmz_to_next_set_block(zmd, zone, start_block, 2222 zmd->dev->zone_nr_blocks - start_block, 1); 2223 if (ret < 0) 2224 return ret; 2225 2226 start_block += ret; 2227 *chunk_block = start_block; 2228 2229 return dmz_to_next_set_block(zmd, zone, start_block, 2230 zmd->dev->zone_nr_blocks - start_block, 0); 2231 } 2232 2233 /* 2234 * Count the number of bits set starting from bit up to bit + nr_bits - 1. 2235 */ 2236 static int dmz_count_bits(void *bitmap, int bit, int nr_bits) 2237 { 2238 unsigned long *addr; 2239 int end = bit + nr_bits; 2240 int n = 0; 2241 2242 while (bit < end) { 2243 if (((bit & (BITS_PER_LONG - 1)) == 0) && 2244 ((end - bit) >= BITS_PER_LONG)) { 2245 addr = (unsigned long *)bitmap + BIT_WORD(bit); 2246 if (*addr == ULONG_MAX) { 2247 n += BITS_PER_LONG; 2248 bit += BITS_PER_LONG; 2249 continue; 2250 } 2251 } 2252 2253 if (test_bit(bit, bitmap)) 2254 n++; 2255 bit++; 2256 } 2257 2258 return n; 2259 } 2260 2261 /* 2262 * Get a zone weight. 2263 */ 2264 static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone) 2265 { 2266 struct dmz_mblock *mblk; 2267 sector_t chunk_block = 0; 2268 unsigned int bit, nr_bits; 2269 unsigned int nr_blocks = zmd->dev->zone_nr_blocks; 2270 void *bitmap; 2271 int n = 0; 2272 2273 while (nr_blocks) { 2274 /* Get bitmap block */ 2275 mblk = dmz_get_bitmap(zmd, zone, chunk_block); 2276 if (IS_ERR(mblk)) { 2277 n = 0; 2278 break; 2279 } 2280 2281 /* Count bits in this block */ 2282 bitmap = mblk->data; 2283 bit = chunk_block & DMZ_BLOCK_MASK_BITS; 2284 nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit); 2285 n += dmz_count_bits(bitmap, bit, nr_bits); 2286 2287 dmz_release_mblock(zmd, mblk); 2288 2289 nr_blocks -= nr_bits; 2290 chunk_block += nr_bits; 2291 } 2292 2293 zone->weight = n; 2294 } 2295 2296 /* 2297 * Cleanup the zoned metadata resources. 2298 */ 2299 static void dmz_cleanup_metadata(struct dmz_metadata *zmd) 2300 { 2301 struct rb_root *root; 2302 struct dmz_mblock *mblk, *next; 2303 int i; 2304 2305 /* Release zone mapping resources */ 2306 if (zmd->map_mblk) { 2307 for (i = 0; i < zmd->nr_map_blocks; i++) 2308 dmz_release_mblock(zmd, zmd->map_mblk[i]); 2309 kfree(zmd->map_mblk); 2310 zmd->map_mblk = NULL; 2311 } 2312 2313 /* Release super blocks */ 2314 for (i = 0; i < 2; i++) { 2315 if (zmd->sb[i].mblk) { 2316 dmz_free_mblock(zmd, zmd->sb[i].mblk); 2317 zmd->sb[i].mblk = NULL; 2318 } 2319 } 2320 2321 /* Free cached blocks */ 2322 while (!list_empty(&zmd->mblk_dirty_list)) { 2323 mblk = list_first_entry(&zmd->mblk_dirty_list, 2324 struct dmz_mblock, link); 2325 dmz_dev_warn(zmd->dev, "mblock %llu still in dirty list (ref %u)", 2326 (u64)mblk->no, mblk->ref); 2327 list_del_init(&mblk->link); 2328 rb_erase(&mblk->node, &zmd->mblk_rbtree); 2329 dmz_free_mblock(zmd, mblk); 2330 } 2331 2332 while (!list_empty(&zmd->mblk_lru_list)) { 2333 mblk = list_first_entry(&zmd->mblk_lru_list, 2334 struct dmz_mblock, link); 2335 list_del_init(&mblk->link); 2336 rb_erase(&mblk->node, &zmd->mblk_rbtree); 2337 dmz_free_mblock(zmd, mblk); 2338 } 2339 2340 /* Sanity checks: the mblock rbtree should now be empty */ 2341 root = &zmd->mblk_rbtree; 2342 rbtree_postorder_for_each_entry_safe(mblk, next, root, node) { 2343 dmz_dev_warn(zmd->dev, "mblock %llu ref %u still in rbtree", 2344 (u64)mblk->no, mblk->ref); 2345 mblk->ref = 0; 2346 dmz_free_mblock(zmd, mblk); 2347 } 2348 2349 /* Free the zone descriptors */ 2350 dmz_drop_zones(zmd); 2351 2352 mutex_destroy(&zmd->mblk_flush_lock); 2353 mutex_destroy(&zmd->map_lock); 2354 } 2355 2356 /* 2357 * Initialize the zoned metadata. 2358 */ 2359 int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata) 2360 { 2361 struct dmz_metadata *zmd; 2362 unsigned int i, zid; 2363 struct dm_zone *zone; 2364 int ret; 2365 2366 zmd = kzalloc(sizeof(struct dmz_metadata), GFP_KERNEL); 2367 if (!zmd) 2368 return -ENOMEM; 2369 2370 zmd->dev = dev; 2371 zmd->mblk_rbtree = RB_ROOT; 2372 init_rwsem(&zmd->mblk_sem); 2373 mutex_init(&zmd->mblk_flush_lock); 2374 spin_lock_init(&zmd->mblk_lock); 2375 INIT_LIST_HEAD(&zmd->mblk_lru_list); 2376 INIT_LIST_HEAD(&zmd->mblk_dirty_list); 2377 2378 mutex_init(&zmd->map_lock); 2379 atomic_set(&zmd->unmap_nr_rnd, 0); 2380 INIT_LIST_HEAD(&zmd->unmap_rnd_list); 2381 INIT_LIST_HEAD(&zmd->map_rnd_list); 2382 2383 atomic_set(&zmd->unmap_nr_seq, 0); 2384 INIT_LIST_HEAD(&zmd->unmap_seq_list); 2385 INIT_LIST_HEAD(&zmd->map_seq_list); 2386 2387 atomic_set(&zmd->nr_reserved_seq_zones, 0); 2388 INIT_LIST_HEAD(&zmd->reserved_seq_zones_list); 2389 2390 init_waitqueue_head(&zmd->free_wq); 2391 2392 /* Initialize zone descriptors */ 2393 ret = dmz_init_zones(zmd); 2394 if (ret) 2395 goto err; 2396 2397 /* Get super block */ 2398 ret = dmz_load_sb(zmd); 2399 if (ret) 2400 goto err; 2401 2402 /* Set metadata zones starting from sb_zone */ 2403 zid = dmz_id(zmd, zmd->sb_zone); 2404 for (i = 0; i < zmd->nr_meta_zones << 1; i++) { 2405 zone = dmz_get(zmd, zid + i); 2406 if (!dmz_is_rnd(zone)) 2407 goto err; 2408 set_bit(DMZ_META, &zone->flags); 2409 } 2410 2411 /* Load mapping table */ 2412 ret = dmz_load_mapping(zmd); 2413 if (ret) 2414 goto err; 2415 2416 /* 2417 * Cache size boundaries: allow at least 2 super blocks, the chunk map 2418 * blocks and enough blocks to be able to cache the bitmap blocks of 2419 * up to 16 zones when idle (min_nr_mblks). Otherwise, if busy, allow 2420 * the cache to add 512 more metadata blocks. 2421 */ 2422 zmd->min_nr_mblks = 2 + zmd->nr_map_blocks + zmd->zone_nr_bitmap_blocks * 16; 2423 zmd->max_nr_mblks = zmd->min_nr_mblks + 512; 2424 zmd->mblk_shrinker.count_objects = dmz_mblock_shrinker_count; 2425 zmd->mblk_shrinker.scan_objects = dmz_mblock_shrinker_scan; 2426 zmd->mblk_shrinker.seeks = DEFAULT_SEEKS; 2427 2428 /* Metadata cache shrinker */ 2429 ret = register_shrinker(&zmd->mblk_shrinker); 2430 if (ret) { 2431 dmz_dev_err(dev, "Register metadata cache shrinker failed"); 2432 goto err; 2433 } 2434 2435 dmz_dev_info(dev, "Host-%s zoned block device", 2436 bdev_zoned_model(dev->bdev) == BLK_ZONED_HA ? 2437 "aware" : "managed"); 2438 dmz_dev_info(dev, " %llu 512-byte logical sectors", 2439 (u64)dev->capacity); 2440 dmz_dev_info(dev, " %u zones of %llu 512-byte logical sectors", 2441 dev->nr_zones, (u64)dev->zone_nr_sectors); 2442 dmz_dev_info(dev, " %u metadata zones", 2443 zmd->nr_meta_zones * 2); 2444 dmz_dev_info(dev, " %u data zones for %u chunks", 2445 zmd->nr_data_zones, zmd->nr_chunks); 2446 dmz_dev_info(dev, " %u random zones (%u unmapped)", 2447 zmd->nr_rnd, atomic_read(&zmd->unmap_nr_rnd)); 2448 dmz_dev_info(dev, " %u sequential zones (%u unmapped)", 2449 zmd->nr_seq, atomic_read(&zmd->unmap_nr_seq)); 2450 dmz_dev_info(dev, " %u reserved sequential data zones", 2451 zmd->nr_reserved_seq); 2452 2453 dmz_dev_debug(dev, "Format:"); 2454 dmz_dev_debug(dev, "%u metadata blocks per set (%u max cache)", 2455 zmd->nr_meta_blocks, zmd->max_nr_mblks); 2456 dmz_dev_debug(dev, " %u data zone mapping blocks", 2457 zmd->nr_map_blocks); 2458 dmz_dev_debug(dev, " %u bitmap blocks", 2459 zmd->nr_bitmap_blocks); 2460 2461 *metadata = zmd; 2462 2463 return 0; 2464 err: 2465 dmz_cleanup_metadata(zmd); 2466 kfree(zmd); 2467 *metadata = NULL; 2468 2469 return ret; 2470 } 2471 2472 /* 2473 * Cleanup the zoned metadata resources. 2474 */ 2475 void dmz_dtr_metadata(struct dmz_metadata *zmd) 2476 { 2477 unregister_shrinker(&zmd->mblk_shrinker); 2478 dmz_cleanup_metadata(zmd); 2479 kfree(zmd); 2480 } 2481 2482 /* 2483 * Check zone information on resume. 2484 */ 2485 int dmz_resume_metadata(struct dmz_metadata *zmd) 2486 { 2487 struct dmz_dev *dev = zmd->dev; 2488 struct dm_zone *zone; 2489 sector_t wp_block; 2490 unsigned int i; 2491 int ret; 2492 2493 /* Check zones */ 2494 for (i = 0; i < dev->nr_zones; i++) { 2495 zone = dmz_get(zmd, i); 2496 if (!zone) { 2497 dmz_dev_err(dev, "Unable to get zone %u", i); 2498 return -EIO; 2499 } 2500 2501 wp_block = zone->wp_block; 2502 2503 ret = dmz_update_zone(zmd, zone); 2504 if (ret) { 2505 dmz_dev_err(dev, "Broken zone %u", i); 2506 return ret; 2507 } 2508 2509 if (dmz_is_offline(zone)) { 2510 dmz_dev_warn(dev, "Zone %u is offline", i); 2511 continue; 2512 } 2513 2514 /* Check write pointer */ 2515 if (!dmz_is_seq(zone)) 2516 zone->wp_block = 0; 2517 else if (zone->wp_block != wp_block) { 2518 dmz_dev_err(dev, "Zone %u: Invalid wp (%llu / %llu)", 2519 i, (u64)zone->wp_block, (u64)wp_block); 2520 zone->wp_block = wp_block; 2521 dmz_invalidate_blocks(zmd, zone, zone->wp_block, 2522 dev->zone_nr_blocks - zone->wp_block); 2523 } 2524 } 2525 2526 return 0; 2527 } 2528