// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2017 Western Digital Corporation or its affiliates.
 *
 * This file is released under the GPL.
 */

#include "dm-zoned.h"

#include <linux/module.h>

#define DM_MSG_PREFIX		"zoned"

#define DMZ_MIN_BIOS		8192

/*
 * Zone BIO context.
 */
struct dmz_bioctx {
        struct dmz_dev *dev;
        struct dm_zone *zone;
        struct bio *bio;
        refcount_t ref;
};

/*
 * Chunk work descriptor.
 */
struct dm_chunk_work {
        struct work_struct work;
        refcount_t refcount;
        struct dmz_target *target;
        unsigned int chunk;
        struct bio_list bio_list;
};

/*
 * Target descriptor.
 */
struct dmz_target {
        struct dm_dev **ddev;
        unsigned int nr_ddevs;

        unsigned int flags;

        /* Zoned block device information */
        struct dmz_dev *dev;

        /* For metadata handling */
        struct dmz_metadata *metadata;

        /* For chunk work */
        struct radix_tree_root chunk_rxtree;
        struct workqueue_struct *chunk_wq;
        struct mutex chunk_lock;

        /* For cloned BIOs to zones */
        struct bio_set bio_set;

        /* For flush */
        spinlock_t flush_lock;
        struct bio_list flush_list;
        struct delayed_work flush_work;
        struct workqueue_struct *flush_wq;
};

/*
 * Flush intervals (seconds).
 */
#define DMZ_FLUSH_PERIOD	(10 * HZ)

/*
 * Target BIO completion.
 */
static inline void dmz_bio_endio(struct bio *bio, blk_status_t status)
{
        struct dmz_bioctx *bioctx =
                dm_per_bio_data(bio, sizeof(struct dmz_bioctx));

        if (status != BLK_STS_OK && bio->bi_status == BLK_STS_OK)
                bio->bi_status = status;
        if (bioctx->dev && bio->bi_status != BLK_STS_OK)
                bioctx->dev->flags |= DMZ_CHECK_BDEV;

        if (refcount_dec_and_test(&bioctx->ref)) {
                struct dm_zone *zone = bioctx->zone;

                if (zone) {
                        if (bio->bi_status != BLK_STS_OK &&
                            bio_op(bio) == REQ_OP_WRITE &&
                            dmz_is_seq(zone))
                                set_bit(DMZ_SEQ_WRITE_ERR, &zone->flags);
                        dmz_deactivate_zone(zone);
                }
                bio_endio(bio);
        }
}

/*
 * Completion callback for an internally cloned target BIO. This terminates the
 * target BIO when there are no more references to its context.
 */
static void dmz_clone_endio(struct bio *clone)
{
        struct dmz_bioctx *bioctx = clone->bi_private;
        blk_status_t status = clone->bi_status;

        bio_put(clone);
        dmz_bio_endio(bioctx->bio, status);
}

/*
 * Issue a clone of a target BIO. The clone may only partially process the
 * original target BIO.
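 * The clone is remapped to the backing device of @zone, the target BIO is
 * advanced by the cloned size and, for a write to a sequential zone, the
 * zone write pointer position is updated.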
 */
static int dmz_submit_bio(struct dmz_target *dmz, struct dm_zone *zone,
                          struct bio *bio, sector_t chunk_block,
                          unsigned int nr_blocks)
{
        struct dmz_bioctx *bioctx =
                dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
        struct dmz_dev *dev = zone->dev;
        struct bio *clone;

        if (dev->flags & DMZ_BDEV_DYING)
                return -EIO;

        clone = bio_clone_fast(bio, GFP_NOIO, &dmz->bio_set);
        if (!clone)
                return -ENOMEM;

        bio_set_dev(clone, dev->bdev);
        bioctx->dev = dev;
        clone->bi_iter.bi_sector =
                dmz_start_sect(dmz->metadata, zone) + dmz_blk2sect(chunk_block);
        clone->bi_iter.bi_size = dmz_blk2sect(nr_blocks) << SECTOR_SHIFT;
        clone->bi_end_io = dmz_clone_endio;
        clone->bi_private = bioctx;

        bio_advance(bio, clone->bi_iter.bi_size);

        refcount_inc(&bioctx->ref);
        generic_make_request(clone);

        if (bio_op(bio) == REQ_OP_WRITE && dmz_is_seq(zone))
                zone->wp_block += nr_blocks;

        return 0;
}

/*
 * Zero out pages of discarded blocks accessed by a read BIO.
 */
static void dmz_handle_read_zero(struct dmz_target *dmz, struct bio *bio,
                                 sector_t chunk_block, unsigned int nr_blocks)
{
        unsigned int size = nr_blocks << DMZ_BLOCK_SHIFT;

        /* Clear nr_blocks */
        swap(bio->bi_iter.bi_size, size);
        zero_fill_bio(bio);
        swap(bio->bi_iter.bi_size, size);

        bio_advance(bio, size);
}

/*
 * Process a read BIO.
 */
static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone,
                           struct bio *bio)
{
        struct dmz_metadata *zmd = dmz->metadata;
        sector_t chunk_block = dmz_chunk_block(zmd, dmz_bio_block(bio));
        unsigned int nr_blocks = dmz_bio_blocks(bio);
        sector_t end_block = chunk_block + nr_blocks;
        struct dm_zone *rzone, *bzone;
        int ret;

        /* Reads into unmapped chunks need only zero the BIO buffer */
        if (!zone) {
                zero_fill_bio(bio);
                return 0;
        }

        DMDEBUG("(%s): READ chunk %llu -> %s zone %u, block %llu, %u blocks",
                dmz_metadata_label(zmd),
                (unsigned long long)dmz_bio_chunk(zmd, bio),
                (dmz_is_rnd(zone) ? "RND" :
                 (dmz_is_cache(zone) ? "CACHE" : "SEQ")),
                zone->id,
                (unsigned long long)chunk_block, nr_blocks);

        /* Check block validity to determine the read location */
        bzone = zone->bzone;
        while (chunk_block < end_block) {
                nr_blocks = 0;
                if (dmz_is_rnd(zone) || dmz_is_cache(zone) ||
                    chunk_block < zone->wp_block) {
                        /* Test block validity in the data zone */
                        ret = dmz_block_valid(zmd, zone, chunk_block);
                        if (ret < 0)
                                return ret;
                        if (ret > 0) {
                                /* Read data zone blocks */
                                nr_blocks = ret;
                                rzone = zone;
                        }
                }

                /*
                 * No valid blocks found in the data zone.
                 * Check the buffer zone, if there is one.
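                 * Unaligned writes to a sequential zone go to the buffer
                 * zone, so blocks that are not valid in the data zone may
                 * still be valid there.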
                 */
                if (!nr_blocks && bzone) {
                        ret = dmz_block_valid(zmd, bzone, chunk_block);
                        if (ret < 0)
                                return ret;
                        if (ret > 0) {
                                /* Read buffer zone blocks */
                                nr_blocks = ret;
                                rzone = bzone;
                        }
                }

                if (nr_blocks) {
                        /* Valid blocks found: read them */
                        nr_blocks = min_t(unsigned int, nr_blocks,
                                          end_block - chunk_block);
                        ret = dmz_submit_bio(dmz, rzone, bio,
                                             chunk_block, nr_blocks);
                        if (ret)
                                return ret;
                        chunk_block += nr_blocks;
                } else {
                        /* No valid block: zero out the current BIO block */
                        dmz_handle_read_zero(dmz, bio, chunk_block, 1);
                        chunk_block++;
                }
        }

        return 0;
}

/*
 * Write blocks directly in a data zone, at the write pointer.
 * If a buffer zone is assigned, invalidate the corresponding
 * blocks in the buffer zone.
 */
static int dmz_handle_direct_write(struct dmz_target *dmz,
                                   struct dm_zone *zone, struct bio *bio,
                                   sector_t chunk_block,
                                   unsigned int nr_blocks)
{
        struct dmz_metadata *zmd = dmz->metadata;
        struct dm_zone *bzone = zone->bzone;
        int ret;

        if (dmz_is_readonly(zone))
                return -EROFS;

        /* Submit write */
        ret = dmz_submit_bio(dmz, zone, bio, chunk_block, nr_blocks);
        if (ret)
                return ret;

        /*
         * Validate the blocks in the data zone and invalidate
         * in the buffer zone, if there is one.
         */
        ret = dmz_validate_blocks(zmd, zone, chunk_block, nr_blocks);
        if (ret == 0 && bzone)
                ret = dmz_invalidate_blocks(zmd, bzone, chunk_block, nr_blocks);

        return ret;
}

/*
 * Write blocks in the buffer zone of @zone.
 * If no buffer zone is assigned yet, get one.
 * Called with @zone write locked.
 */
static int dmz_handle_buffered_write(struct dmz_target *dmz,
                                     struct dm_zone *zone, struct bio *bio,
                                     sector_t chunk_block,
                                     unsigned int nr_blocks)
{
        struct dmz_metadata *zmd = dmz->metadata;
        struct dm_zone *bzone;
        int ret;

        /* Get the buffer zone. One will be allocated if needed */
        bzone = dmz_get_chunk_buffer(zmd, zone);
        if (IS_ERR(bzone))
                return PTR_ERR(bzone);

        if (dmz_is_readonly(bzone))
                return -EROFS;

        /* Submit write */
        ret = dmz_submit_bio(dmz, bzone, bio, chunk_block, nr_blocks);
        if (ret)
                return ret;

        /*
         * Validate the blocks in the buffer zone
         * and invalidate in the data zone.
         */
        ret = dmz_validate_blocks(zmd, bzone, chunk_block, nr_blocks);
        if (ret == 0 && chunk_block < zone->wp_block)
                ret = dmz_invalidate_blocks(zmd, zone, chunk_block, nr_blocks);

        return ret;
}

/*
 * Process a write BIO.
 */
static int dmz_handle_write(struct dmz_target *dmz, struct dm_zone *zone,
                            struct bio *bio)
{
        struct dmz_metadata *zmd = dmz->metadata;
        sector_t chunk_block = dmz_chunk_block(zmd, dmz_bio_block(bio));
        unsigned int nr_blocks = dmz_bio_blocks(bio);

        if (!zone)
                return -ENOSPC;

        DMDEBUG("(%s): WRITE chunk %llu -> %s zone %u, block %llu, %u blocks",
                dmz_metadata_label(zmd),
                (unsigned long long)dmz_bio_chunk(zmd, bio),
                (dmz_is_rnd(zone) ? "RND" :
                 (dmz_is_cache(zone) ? "CACHE" : "SEQ")),
                zone->id,
                (unsigned long long)chunk_block, nr_blocks);

        if (dmz_is_rnd(zone) || dmz_is_cache(zone) ||
            chunk_block == zone->wp_block) {
                /*
                 * The zone is a random or cache zone, or it is a
                 * sequential zone and the BIO is aligned to the zone
                 * write pointer: write directly into the zone.
                 */
                return dmz_handle_direct_write(dmz, zone, bio,
                                               chunk_block, nr_blocks);
        }

        /*
         * This is an unaligned write in a sequential zone:
         * use buffered write.
         */
        return dmz_handle_buffered_write(dmz, zone, bio, chunk_block, nr_blocks);
}

/*
 * Process a discard BIO.
 */
static int dmz_handle_discard(struct dmz_target *dmz, struct dm_zone *zone,
                              struct bio *bio)
{
        struct dmz_metadata *zmd = dmz->metadata;
        sector_t block = dmz_bio_block(bio);
        unsigned int nr_blocks = dmz_bio_blocks(bio);
        sector_t chunk_block = dmz_chunk_block(zmd, block);
        int ret = 0;

        /* For unmapped chunks, there is nothing to do */
        if (!zone)
                return 0;

        if (dmz_is_readonly(zone))
                return -EROFS;

        DMDEBUG("(%s): DISCARD chunk %llu -> zone %u, block %llu, %u blocks",
                dmz_metadata_label(dmz->metadata),
                (unsigned long long)dmz_bio_chunk(zmd, bio),
                zone->id,
                (unsigned long long)chunk_block, nr_blocks);

        /*
         * Invalidate blocks in the data zone and its
         * buffer zone if one is mapped.
         */
        if (dmz_is_rnd(zone) || dmz_is_cache(zone) ||
            chunk_block < zone->wp_block)
                ret = dmz_invalidate_blocks(zmd, zone, chunk_block, nr_blocks);
        if (ret == 0 && zone->bzone)
                ret = dmz_invalidate_blocks(zmd, zone->bzone,
                                            chunk_block, nr_blocks);
        return ret;
}

/*
 * Process a BIO.
 */
static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw,
                           struct bio *bio)
{
        struct dmz_bioctx *bioctx =
                dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
        struct dmz_metadata *zmd = dmz->metadata;
        struct dm_zone *zone;
        int i, ret;

        /*
         * A write may trigger a zone allocation, so make sure the
         * allocation can succeed.
         */
        if (bio_op(bio) == REQ_OP_WRITE)
                for (i = 0; i < dmz->nr_ddevs; i++)
                        dmz_schedule_reclaim(dmz->dev[i].reclaim);

        dmz_lock_metadata(zmd);

        /*
         * Get the data zone mapping the chunk. There may be no
         * mapping for read and discard. If a mapping is obtained,
         * the zone returned will be set to active state.
         */
        zone = dmz_get_chunk_mapping(zmd, dmz_bio_chunk(zmd, bio),
                                     bio_op(bio));
        if (IS_ERR(zone)) {
                ret = PTR_ERR(zone);
                goto out;
        }

        /* Process the BIO */
        if (zone) {
                dmz_activate_zone(zone);
                bioctx->zone = zone;
                dmz_reclaim_bio_acc(zone->dev->reclaim);
        }

        switch (bio_op(bio)) {
        case REQ_OP_READ:
                ret = dmz_handle_read(dmz, zone, bio);
                break;
        case REQ_OP_WRITE:
                ret = dmz_handle_write(dmz, zone, bio);
                break;
        case REQ_OP_DISCARD:
        case REQ_OP_WRITE_ZEROES:
                ret = dmz_handle_discard(dmz, zone, bio);
                break;
        default:
                DMERR("(%s): Unsupported BIO operation 0x%x",
                      dmz_metadata_label(dmz->metadata), bio_op(bio));
                ret = -EIO;
        }

        /*
         * Release the chunk mapping. This will check that the mapping
         * is still valid, that is, that the zone used still has valid blocks.
         */
        if (zone)
                dmz_put_chunk_mapping(zmd, zone);
out:
        dmz_bio_endio(bio, errno_to_blk_status(ret));

        dmz_unlock_metadata(zmd);
}

/*
 * Increment a chunk work reference counter.
 */
static inline void dmz_get_chunk_work(struct dm_chunk_work *cw)
{
        refcount_inc(&cw->refcount);
}

/*
 * Decrement a chunk work reference count and
 * free it if it becomes 0.
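 * Called with dmz->chunk_lock held, which serializes the radix tree delete
 * with the lookups and insertions done in dmz_queue_chunk_work().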
 */
static void dmz_put_chunk_work(struct dm_chunk_work *cw)
{
        if (refcount_dec_and_test(&cw->refcount)) {
                WARN_ON(!bio_list_empty(&cw->bio_list));
                radix_tree_delete(&cw->target->chunk_rxtree, cw->chunk);
                kfree(cw);
        }
}

/*
 * Chunk BIO work function.
 */
static void dmz_chunk_work(struct work_struct *work)
{
        struct dm_chunk_work *cw = container_of(work, struct dm_chunk_work, work);
        struct dmz_target *dmz = cw->target;
        struct bio *bio;

        mutex_lock(&dmz->chunk_lock);

        /* Process the chunk BIOs */
        while ((bio = bio_list_pop(&cw->bio_list))) {
                mutex_unlock(&dmz->chunk_lock);
                dmz_handle_bio(dmz, cw, bio);
                mutex_lock(&dmz->chunk_lock);
                dmz_put_chunk_work(cw);
        }

        /* Queueing the work incremented the work refcount */
        dmz_put_chunk_work(cw);

        mutex_unlock(&dmz->chunk_lock);
}

/*
 * Flush work.
 */
static void dmz_flush_work(struct work_struct *work)
{
        struct dmz_target *dmz = container_of(work, struct dmz_target, flush_work.work);
        struct bio *bio;
        int ret;

        /* Flush dirty metadata blocks */
        ret = dmz_flush_metadata(dmz->metadata);
        if (ret)
                DMDEBUG("(%s): Metadata flush failed, rc=%d",
                        dmz_metadata_label(dmz->metadata), ret);

        /* Process queued flush requests */
        while (1) {
                spin_lock(&dmz->flush_lock);
                bio = bio_list_pop(&dmz->flush_list);
                spin_unlock(&dmz->flush_lock);

                if (!bio)
                        break;

                dmz_bio_endio(bio, errno_to_blk_status(ret));
        }

        queue_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);
}

/*
 * Get a chunk work and start it to process a new BIO.
 * If the BIO chunk has no work yet, create one.
 */
static int dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
{
        unsigned int chunk = dmz_bio_chunk(dmz->metadata, bio);
        struct dm_chunk_work *cw;
        int ret = 0;

        mutex_lock(&dmz->chunk_lock);

        /* Get the BIO chunk work. If one is not active yet, create one */
        cw = radix_tree_lookup(&dmz->chunk_rxtree, chunk);
        if (cw) {
                dmz_get_chunk_work(cw);
        } else {
                /* Create a new chunk work */
                cw = kmalloc(sizeof(struct dm_chunk_work), GFP_NOIO);
                if (unlikely(!cw)) {
                        ret = -ENOMEM;
                        goto out;
                }

                INIT_WORK(&cw->work, dmz_chunk_work);
                refcount_set(&cw->refcount, 1);
                cw->target = dmz;
                cw->chunk = chunk;
                bio_list_init(&cw->bio_list);

                ret = radix_tree_insert(&dmz->chunk_rxtree, chunk, cw);
                if (unlikely(ret)) {
                        kfree(cw);
                        goto out;
                }
        }

        bio_list_add(&cw->bio_list, bio);

        if (queue_work(dmz->chunk_wq, &cw->work))
                dmz_get_chunk_work(cw);
out:
        mutex_unlock(&dmz->chunk_lock);
        return ret;
}

/*
 * Check if the backing device is being removed. If it's on the way out,
 * start failing I/O. Reclaim and metadata components also call this
 * function to cleanly abort operation in the event of such failure.
 */
bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev)
{
        if (dmz_dev->flags & DMZ_BDEV_DYING)
                return true;

        if (dmz_dev->flags & DMZ_CHECK_BDEV)
                return !dmz_check_bdev(dmz_dev);

        if (blk_queue_dying(bdev_get_queue(dmz_dev->bdev))) {
                dmz_dev_warn(dmz_dev, "Backing device queue dying");
                dmz_dev->flags |= DMZ_BDEV_DYING;
        }

        return dmz_dev->flags & DMZ_BDEV_DYING;
}

/*
 * Check the backing device availability.
 * This detects such events as the backing device going offline due to
 * errors, media removal, etc. This check is less efficient than
 * dmz_bdev_is_dying() and should only be performed as part of error
 * handling.
 */
bool dmz_check_bdev(struct dmz_dev *dmz_dev)
{
        struct gendisk *disk;

        dmz_dev->flags &= ~DMZ_CHECK_BDEV;

        if (dmz_bdev_is_dying(dmz_dev))
                return false;

        disk = dmz_dev->bdev->bd_disk;
        if (disk->fops->check_events &&
            disk->fops->check_events(disk, 0) & DISK_EVENT_MEDIA_CHANGE) {
                dmz_dev_warn(dmz_dev, "Backing device offline");
                dmz_dev->flags |= DMZ_BDEV_DYING;
        }

        return !(dmz_dev->flags & DMZ_BDEV_DYING);
}

/*
 * Process a new BIO.
 */
static int dmz_map(struct dm_target *ti, struct bio *bio)
{
        struct dmz_target *dmz = ti->private;
        struct dmz_metadata *zmd = dmz->metadata;
        struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
        sector_t sector = bio->bi_iter.bi_sector;
        unsigned int nr_sectors = bio_sectors(bio);
        sector_t chunk_sector;
        int ret;

        if (dmz_dev_is_dying(zmd))
                return DM_MAPIO_KILL;

        DMDEBUG("(%s): BIO op %d sector %llu + %u => chunk %llu, block %llu, %u blocks",
                dmz_metadata_label(zmd),
                bio_op(bio), (unsigned long long)sector, nr_sectors,
                (unsigned long long)dmz_bio_chunk(zmd, bio),
                (unsigned long long)dmz_chunk_block(zmd, dmz_bio_block(bio)),
                (unsigned int)dmz_bio_blocks(bio));

        if (!nr_sectors && bio_op(bio) != REQ_OP_WRITE)
                return DM_MAPIO_REMAPPED;

        /* The BIO should be block aligned */
        if ((nr_sectors & DMZ_BLOCK_SECTORS_MASK) || (sector & DMZ_BLOCK_SECTORS_MASK))
                return DM_MAPIO_KILL;

        /* Initialize the BIO context */
        bioctx->dev = NULL;
        bioctx->zone = NULL;
        bioctx->bio = bio;
        refcount_set(&bioctx->ref, 1);

        /* Set the BIO pending in the flush list */
        if (!nr_sectors && bio_op(bio) == REQ_OP_WRITE) {
                spin_lock(&dmz->flush_lock);
                bio_list_add(&dmz->flush_list, bio);
                spin_unlock(&dmz->flush_lock);
                mod_delayed_work(dmz->flush_wq, &dmz->flush_work, 0);
                return DM_MAPIO_SUBMITTED;
        }

        /* Split zone BIOs to fit entirely into a zone */
        chunk_sector = sector & (dmz_zone_nr_sectors(zmd) - 1);
        if (chunk_sector + nr_sectors > dmz_zone_nr_sectors(zmd))
                dm_accept_partial_bio(bio, dmz_zone_nr_sectors(zmd) - chunk_sector);

        /* Now ready to handle this BIO */
        ret = dmz_queue_chunk_work(dmz, bio);
        if (ret) {
                DMDEBUG("(%s): BIO op %d, can't process chunk %llu, err %i",
                        dmz_metadata_label(zmd),
                        bio_op(bio), (u64)dmz_bio_chunk(zmd, bio),
                        ret);
                return DM_MAPIO_REQUEUE;
        }

        return DM_MAPIO_SUBMITTED;
}

/*
 * Get zoned device information.
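 * Opens the device at @path and checks that it fits the setup: with
 * multiple devices, the first one must be a regular block device and all
 * the others must be zoned block devices; a single-device setup must use a
 * zoned block device.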
 */
static int dmz_get_zoned_device(struct dm_target *ti, char *path,
                                int idx, int nr_devs)
{
        struct dmz_target *dmz = ti->private;
        struct dm_dev *ddev;
        struct dmz_dev *dev;
        int ret;
        struct block_device *bdev;

        /* Get the target device */
        ret = dm_get_device(ti, path, dm_table_get_mode(ti->table), &ddev);
        if (ret) {
                ti->error = "Get target device failed";
                return ret;
        }

        bdev = ddev->bdev;
        if (bdev_zoned_model(bdev) == BLK_ZONED_NONE) {
                if (nr_devs == 1) {
                        ti->error = "Invalid regular device";
                        goto err;
                }
                if (idx != 0) {
                        ti->error = "First device must be a regular device";
                        goto err;
                }
                if (dmz->ddev[0]) {
                        ti->error = "Too many regular devices";
                        goto err;
                }
                dev = &dmz->dev[idx];
                dev->flags = DMZ_BDEV_REGULAR;
        } else {
                if (dmz->ddev[idx]) {
                        ti->error = "Too many zoned devices";
                        goto err;
                }
                if (nr_devs > 1 && idx == 0) {
                        ti->error = "First device must be a regular device";
                        goto err;
                }
                dev = &dmz->dev[idx];
        }
        dev->bdev = bdev;
        dev->dev_idx = idx;
        (void)bdevname(dev->bdev, dev->name);

        dev->capacity = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
        if (ti->begin) {
                ti->error = "Partial mapping is not supported";
                goto err;
        }

        dmz->ddev[idx] = ddev;

        return 0;
err:
        dm_put_device(ti, ddev);
        return -EINVAL;
}

/*
 * Cleanup zoned device information.
 */
static void dmz_put_zoned_device(struct dm_target *ti)
{
        struct dmz_target *dmz = ti->private;
        int i;

        for (i = 0; i < dmz->nr_ddevs; i++) {
                if (dmz->ddev[i]) {
                        dm_put_device(ti, dmz->ddev[i]);
                        dmz->ddev[i] = NULL;
                }
        }
}

static int dmz_fixup_devices(struct dm_target *ti)
{
        struct dmz_target *dmz = ti->private;
        struct dmz_dev *reg_dev, *zoned_dev;
        struct request_queue *q;
        sector_t zone_nr_sectors = 0;
        int i;

        /*
         * When we have more than one device, the first one must be a
         * regular block device and the others zoned block devices.
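         * All zoned devices must also use the same zone size. The regular
         * device is split into zones of that size and mapped at the start
         * of the zone address space, in front of the zoned devices.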
         */
        if (dmz->nr_ddevs > 1) {
                reg_dev = &dmz->dev[0];
                if (!(reg_dev->flags & DMZ_BDEV_REGULAR)) {
                        ti->error = "Primary disk is not a regular device";
                        return -EINVAL;
                }
                for (i = 1; i < dmz->nr_ddevs; i++) {
                        zoned_dev = &dmz->dev[i];
                        if (zoned_dev->flags & DMZ_BDEV_REGULAR) {
                                ti->error = "Secondary disk is not a zoned device";
                                return -EINVAL;
                        }
                        q = bdev_get_queue(zoned_dev->bdev);
                        if (zone_nr_sectors &&
                            zone_nr_sectors != blk_queue_zone_sectors(q)) {
                                ti->error = "Zone nr sectors mismatch";
                                return -EINVAL;
                        }
                        zone_nr_sectors = blk_queue_zone_sectors(q);
                        zoned_dev->zone_nr_sectors = zone_nr_sectors;
                        zoned_dev->nr_zones =
                                blkdev_nr_zones(zoned_dev->bdev->bd_disk);
                }
        } else {
                reg_dev = NULL;
                zoned_dev = &dmz->dev[0];
                if (zoned_dev->flags & DMZ_BDEV_REGULAR) {
                        ti->error = "Disk is not a zoned device";
                        return -EINVAL;
                }
                q = bdev_get_queue(zoned_dev->bdev);
                zoned_dev->zone_nr_sectors = blk_queue_zone_sectors(q);
                zoned_dev->nr_zones = blkdev_nr_zones(zoned_dev->bdev->bd_disk);
        }

        if (reg_dev) {
                sector_t zone_offset;

                reg_dev->zone_nr_sectors = zone_nr_sectors;
                reg_dev->nr_zones =
                        DIV_ROUND_UP_SECTOR_T(reg_dev->capacity,
                                              reg_dev->zone_nr_sectors);
                reg_dev->zone_offset = 0;
                zone_offset = reg_dev->nr_zones;
                for (i = 1; i < dmz->nr_ddevs; i++) {
                        dmz->dev[i].zone_offset = zone_offset;
                        zone_offset += dmz->dev[i].nr_zones;
                }
        }
        return 0;
}

/*
 * Setup target.
 */
static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
        struct dmz_target *dmz;
        int ret, i;

        /* Check arguments */
        if (argc < 1) {
                ti->error = "Invalid argument count";
                return -EINVAL;
        }

        /* Allocate and initialize the target descriptor */
        dmz = kzalloc(sizeof(struct dmz_target), GFP_KERNEL);
        if (!dmz) {
                ti->error = "Unable to allocate the zoned target descriptor";
                return -ENOMEM;
        }
        dmz->dev = kcalloc(argc, sizeof(struct dmz_dev), GFP_KERNEL);
        if (!dmz->dev) {
                ti->error = "Unable to allocate the zoned device descriptors";
                kfree(dmz);
                return -ENOMEM;
        }
        dmz->ddev = kcalloc(argc, sizeof(struct dm_dev *), GFP_KERNEL);
        if (!dmz->ddev) {
                ti->error = "Unable to allocate the dm device descriptors";
                ret = -ENOMEM;
                goto err;
        }
        dmz->nr_ddevs = argc;

        ti->private = dmz;

        /* Get the target block devices */
        for (i = 0; i < argc; i++) {
                ret = dmz_get_zoned_device(ti, argv[i], i, argc);
                if (ret)
                        goto err_dev;
        }
        ret = dmz_fixup_devices(ti);
        if (ret)
                goto err_dev;

        /* Initialize metadata */
        ret = dmz_ctr_metadata(dmz->dev, argc, &dmz->metadata,
                               dm_table_device_name(ti->table));
        if (ret) {
                ti->error = "Metadata initialization failed";
                goto err_dev;
        }

        /* Set target (no write same support) */
        ti->max_io_len = dmz_zone_nr_sectors(dmz->metadata);
        ti->num_flush_bios = 1;
        ti->num_discard_bios = 1;
        ti->num_write_zeroes_bios = 1;
        ti->per_io_data_size = sizeof(struct dmz_bioctx);
        ti->flush_supported = true;
        ti->discards_supported = true;

        /* The exposed capacity is the number of chunks that can be mapped */
        ti->len = (sector_t)dmz_nr_chunks(dmz->metadata) <<
                dmz_zone_nr_sectors_shift(dmz->metadata);

        /* Zone BIO */
        ret = bioset_init(&dmz->bio_set, DMZ_MIN_BIOS, 0, 0);
        if (ret) {
ti->error = "Create BIO set failed"; 909 goto err_meta; 910 } 911 912 /* Chunk BIO work */ 913 mutex_init(&dmz->chunk_lock); 914 INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_NOIO); 915 dmz->chunk_wq = alloc_workqueue("dmz_cwq_%s", 916 WQ_MEM_RECLAIM | WQ_UNBOUND, 0, 917 dmz_metadata_label(dmz->metadata)); 918 if (!dmz->chunk_wq) { 919 ti->error = "Create chunk workqueue failed"; 920 ret = -ENOMEM; 921 goto err_bio; 922 } 923 924 /* Flush work */ 925 spin_lock_init(&dmz->flush_lock); 926 bio_list_init(&dmz->flush_list); 927 INIT_DELAYED_WORK(&dmz->flush_work, dmz_flush_work); 928 dmz->flush_wq = alloc_ordered_workqueue("dmz_fwq_%s", WQ_MEM_RECLAIM, 929 dmz_metadata_label(dmz->metadata)); 930 if (!dmz->flush_wq) { 931 ti->error = "Create flush workqueue failed"; 932 ret = -ENOMEM; 933 goto err_cwq; 934 } 935 mod_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD); 936 937 /* Initialize reclaim */ 938 for (i = 0; i < dmz->nr_ddevs; i++) { 939 ret = dmz_ctr_reclaim(dmz->metadata, &dmz->dev[i].reclaim, i); 940 if (ret) { 941 ti->error = "Zone reclaim initialization failed"; 942 goto err_fwq; 943 } 944 } 945 946 DMINFO("(%s): Target device: %llu 512-byte logical sectors (%llu blocks)", 947 dmz_metadata_label(dmz->metadata), 948 (unsigned long long)ti->len, 949 (unsigned long long)dmz_sect2blk(ti->len)); 950 951 return 0; 952 err_fwq: 953 destroy_workqueue(dmz->flush_wq); 954 err_cwq: 955 destroy_workqueue(dmz->chunk_wq); 956 err_bio: 957 mutex_destroy(&dmz->chunk_lock); 958 bioset_exit(&dmz->bio_set); 959 err_meta: 960 dmz_dtr_metadata(dmz->metadata); 961 err_dev: 962 dmz_put_zoned_device(ti); 963 err: 964 kfree(dmz->dev); 965 kfree(dmz); 966 967 return ret; 968 } 969 970 /* 971 * Cleanup target. 972 */ 973 static void dmz_dtr(struct dm_target *ti) 974 { 975 struct dmz_target *dmz = ti->private; 976 int i; 977 978 flush_workqueue(dmz->chunk_wq); 979 destroy_workqueue(dmz->chunk_wq); 980 981 for (i = 0; i < dmz->nr_ddevs; i++) 982 dmz_dtr_reclaim(dmz->dev[i].reclaim); 983 984 cancel_delayed_work_sync(&dmz->flush_work); 985 destroy_workqueue(dmz->flush_wq); 986 987 (void) dmz_flush_metadata(dmz->metadata); 988 989 dmz_dtr_metadata(dmz->metadata); 990 991 bioset_exit(&dmz->bio_set); 992 993 dmz_put_zoned_device(ti); 994 995 mutex_destroy(&dmz->chunk_lock); 996 997 kfree(dmz->dev); 998 kfree(dmz); 999 } 1000 1001 /* 1002 * Setup target request queue limits. 1003 */ 1004 static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits) 1005 { 1006 struct dmz_target *dmz = ti->private; 1007 unsigned int chunk_sectors = dmz_zone_nr_sectors(dmz->metadata); 1008 1009 limits->logical_block_size = DMZ_BLOCK_SIZE; 1010 limits->physical_block_size = DMZ_BLOCK_SIZE; 1011 1012 blk_limits_io_min(limits, DMZ_BLOCK_SIZE); 1013 blk_limits_io_opt(limits, DMZ_BLOCK_SIZE); 1014 1015 limits->discard_alignment = DMZ_BLOCK_SIZE; 1016 limits->discard_granularity = DMZ_BLOCK_SIZE; 1017 limits->max_discard_sectors = chunk_sectors; 1018 limits->max_hw_discard_sectors = chunk_sectors; 1019 limits->max_write_zeroes_sectors = chunk_sectors; 1020 1021 /* FS hint to try to align to the device zone size */ 1022 limits->chunk_sectors = chunk_sectors; 1023 limits->max_sectors = chunk_sectors; 1024 1025 /* We are exposing a drive-managed zoned block device */ 1026 limits->zoned = BLK_ZONED_NONE; 1027 } 1028 1029 /* 1030 * Pass on ioctl to the backend device. 
 */
static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
{
        struct dmz_target *dmz = ti->private;
        struct dmz_dev *dev = &dmz->dev[0];

        if (!dmz_check_bdev(dev))
                return -EIO;

        *bdev = dev->bdev;

        return 0;
}

/*
 * Stop background work on suspend.
 */
static void dmz_suspend(struct dm_target *ti)
{
        struct dmz_target *dmz = ti->private;
        int i;

        flush_workqueue(dmz->chunk_wq);
        for (i = 0; i < dmz->nr_ddevs; i++)
                dmz_suspend_reclaim(dmz->dev[i].reclaim);
        cancel_delayed_work_sync(&dmz->flush_work);
}

/*
 * Restart background work on resume or if suspend failed.
 */
static void dmz_resume(struct dm_target *ti)
{
        struct dmz_target *dmz = ti->private;
        int i;

        queue_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);
        for (i = 0; i < dmz->nr_ddevs; i++)
                dmz_resume_reclaim(dmz->dev[i].reclaim);
}

static int dmz_iterate_devices(struct dm_target *ti,
                               iterate_devices_callout_fn fn, void *data)
{
        struct dmz_target *dmz = ti->private;
        unsigned int zone_nr_sectors = dmz_zone_nr_sectors(dmz->metadata);
        sector_t capacity;
        int i, r;

        for (i = 0; i < dmz->nr_ddevs; i++) {
                capacity = dmz->dev[i].capacity & ~(zone_nr_sectors - 1);
                r = fn(ti, dmz->ddev[i], 0, capacity, data);
                if (r)
                        break;
        }
        return r;
}

static void dmz_status(struct dm_target *ti, status_type_t type,
                       unsigned int status_flags, char *result,
                       unsigned int maxlen)
{
        struct dmz_target *dmz = ti->private;
        ssize_t sz = 0;
        char buf[BDEVNAME_SIZE];
        struct dmz_dev *dev;
        int i;

        switch (type) {
        case STATUSTYPE_INFO:
                DMEMIT("%u zones %u/%u cache",
                       dmz_nr_zones(dmz->metadata),
                       dmz_nr_unmap_cache_zones(dmz->metadata),
                       dmz_nr_cache_zones(dmz->metadata));
                for (i = 0; i < dmz->nr_ddevs; i++) {
                        /*
                         * For a multi-device setup the first device
                         * contains only cache zones.
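                         * No random or sequential zone counts are
                         * reported for it.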
                         */
                        if ((i == 0) &&
                            (dmz_nr_cache_zones(dmz->metadata) > 0))
                                continue;
                        DMEMIT(" %u/%u random %u/%u sequential",
                               dmz_nr_unmap_rnd_zones(dmz->metadata, i),
                               dmz_nr_rnd_zones(dmz->metadata, i),
                               dmz_nr_unmap_seq_zones(dmz->metadata, i),
                               dmz_nr_seq_zones(dmz->metadata, i));
                }
                break;
        case STATUSTYPE_TABLE:
                dev = &dmz->dev[0];
                format_dev_t(buf, dev->bdev->bd_dev);
                DMEMIT("%s", buf);
                for (i = 1; i < dmz->nr_ddevs; i++) {
                        dev = &dmz->dev[i];
                        format_dev_t(buf, dev->bdev->bd_dev);
                        DMEMIT(" %s", buf);
                }
                break;
        }
        return;
}

static int dmz_message(struct dm_target *ti, unsigned int argc, char **argv,
                       char *result, unsigned int maxlen)
{
        struct dmz_target *dmz = ti->private;
        int r = -EINVAL;

        if (!strcasecmp(argv[0], "reclaim")) {
                int i;

                for (i = 0; i < dmz->nr_ddevs; i++)
                        dmz_schedule_reclaim(dmz->dev[i].reclaim);
                r = 0;
        } else
                DMERR("unrecognized message %s", argv[0]);
        return r;
}

static struct target_type dmz_type = {
        .name            = "zoned",
        .version         = {2, 0, 0},
        .features        = DM_TARGET_SINGLETON | DM_TARGET_ZONED_HM,
        .module          = THIS_MODULE,
        .ctr             = dmz_ctr,
        .dtr             = dmz_dtr,
        .map             = dmz_map,
        .io_hints        = dmz_io_hints,
        .prepare_ioctl   = dmz_prepare_ioctl,
        .postsuspend     = dmz_suspend,
        .resume          = dmz_resume,
        .iterate_devices = dmz_iterate_devices,
        .status          = dmz_status,
        .message         = dmz_message,
};

static int __init dmz_init(void)
{
        return dm_register_target(&dmz_type);
}

static void __exit dmz_exit(void)
{
        dm_unregister_target(&dmz_type);
}

module_init(dmz_init);
module_exit(dmz_exit);

MODULE_DESCRIPTION(DM_NAME " target for zoned block devices");
MODULE_AUTHOR("Damien Le Moal <damien.lemoal@wdc.com>");
MODULE_LICENSE("GPL");
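
/*
 * Illustrative usage sketch (an assumption, not part of this driver): the
 * backing device(s) are typically first formatted with the dmzadm userspace
 * tool, after which the target can be created with a dmsetup table line of
 * the form
 *
 *     0 <num sectors> zoned <device path> [<device path>...]
 *
 * where, for a multi-device setup, the first path must refer to the regular
 * block device. Device names and sizes here are placeholders; see
 * Documentation/admin-guide/device-mapper/dm-zoned.rst for the exact
 * procedure.
 */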