// SPDX-License-Identifier: GPL-2.0
#include <linux/vmalloc.h>
#include <linux/bitmap.h>
#include "null_blk.h"

#define CREATE_TRACE_POINTS
#include "trace.h"

static inline sector_t mb_to_sects(unsigned long mb)
{
	return ((sector_t)mb * SZ_1M) >> SECTOR_SHIFT;
}

static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect)
{
	return sect >> ilog2(dev->zone_size_sects);
}

static inline void null_lock_zone_res(struct nullb_device *dev)
{
	if (dev->need_zone_res_mgmt)
		spin_lock_irq(&dev->zone_res_lock);
}

static inline void null_unlock_zone_res(struct nullb_device *dev)
{
	if (dev->need_zone_res_mgmt)
		spin_unlock_irq(&dev->zone_res_lock);
}

static inline void null_init_zone_lock(struct nullb_device *dev,
				       struct nullb_zone *zone)
{
	if (!dev->memory_backed)
		spin_lock_init(&zone->spinlock);
	else
		mutex_init(&zone->mutex);
}

static inline void null_lock_zone(struct nullb_device *dev,
				  struct nullb_zone *zone)
{
	if (!dev->memory_backed)
		spin_lock_irq(&zone->spinlock);
	else
		mutex_lock(&zone->mutex);
}

static inline void null_unlock_zone(struct nullb_device *dev,
				    struct nullb_zone *zone)
{
	if (!dev->memory_backed)
		spin_unlock_irq(&zone->spinlock);
	else
		mutex_unlock(&zone->mutex);
}

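/*
 * Set up the zoned device configuration: validate the zone size and
 * capacity, size the zone array (rounding the device capacity up to a
 * whole number of zones, e.g. a 1024 MB device with 256 MB zones gives
 * 4 zones), adjust the conventional/open/active zone limits, and
 * initialize the per-zone state.
 */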
int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q)
{
	sector_t dev_capacity_sects, zone_capacity_sects;
	struct nullb_zone *zone;
	sector_t sector = 0;
	unsigned int i;

	if (!is_power_of_2(dev->zone_size)) {
		pr_err("zone_size must be power-of-two\n");
		return -EINVAL;
	}
	if (dev->zone_size > dev->size) {
		pr_err("Zone size larger than device capacity\n");
		return -EINVAL;
	}

	if (!dev->zone_capacity)
		dev->zone_capacity = dev->zone_size;

	if (dev->zone_capacity > dev->zone_size) {
		pr_err("null_blk: zone capacity (%lu MB) larger than zone size (%lu MB)\n",
		       dev->zone_capacity, dev->zone_size);
		return -EINVAL;
	}

	zone_capacity_sects = mb_to_sects(dev->zone_capacity);
	dev_capacity_sects = mb_to_sects(dev->size);
	dev->zone_size_sects = mb_to_sects(dev->zone_size);
	dev->nr_zones = round_up(dev_capacity_sects, dev->zone_size_sects)
		>> ilog2(dev->zone_size_sects);

	dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct nullb_zone),
				    GFP_KERNEL | __GFP_ZERO);
	if (!dev->zones)
		return -ENOMEM;

	spin_lock_init(&dev->zone_res_lock);

	if (dev->zone_nr_conv >= dev->nr_zones) {
		dev->zone_nr_conv = dev->nr_zones - 1;
		pr_info("changed the number of conventional zones to %u\n",
			dev->zone_nr_conv);
	}

	/* Max active zones has to be < nbr of seq zones in order to be enforceable */
	if (dev->zone_max_active >= dev->nr_zones - dev->zone_nr_conv) {
		dev->zone_max_active = 0;
		pr_info("zone_max_active limit disabled, limit >= zone count\n");
	}

	/* Max open zones has to be <= max active zones */
	if (dev->zone_max_active && dev->zone_max_open > dev->zone_max_active) {
		dev->zone_max_open = dev->zone_max_active;
		pr_info("changed the maximum number of open zones to %u\n",
			dev->zone_max_open);
	} else if (dev->zone_max_open >= dev->nr_zones - dev->zone_nr_conv) {
		dev->zone_max_open = 0;
		pr_info("zone_max_open limit disabled, limit >= zone count\n");
	}
	dev->need_zone_res_mgmt = dev->zone_max_active || dev->zone_max_open;
	dev->imp_close_zone_no = dev->zone_nr_conv;

	for (i = 0; i < dev->zone_nr_conv; i++) {
		zone = &dev->zones[i];

		null_init_zone_lock(dev, zone);
		zone->start = sector;
		zone->len = dev->zone_size_sects;
		zone->capacity = zone->len;
		zone->wp = zone->start + zone->len;
		zone->type = BLK_ZONE_TYPE_CONVENTIONAL;
		zone->cond = BLK_ZONE_COND_NOT_WP;

		sector += dev->zone_size_sects;
	}

	for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) {
		zone = &dev->zones[i];

		null_init_zone_lock(dev, zone);
		zone->start = zone->wp = sector;
		if (zone->start + dev->zone_size_sects > dev_capacity_sects)
			zone->len = dev_capacity_sects - zone->start;
		else
			zone->len = dev->zone_size_sects;
		zone->capacity =
			min_t(sector_t, zone->len, zone_capacity_sects);
		zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ;
		zone->cond = BLK_ZONE_COND_EMPTY;

		sector += dev->zone_size_sects;
	}

	return 0;
}

int null_register_zoned_dev(struct nullb *nullb)
{
	struct nullb_device *dev = nullb->dev;
	struct request_queue *q = nullb->q;

	blk_queue_set_zoned(nullb->disk, BLK_ZONED_HM);
	blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
	blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE);

	if (queue_is_mq(q)) {
		int ret = blk_revalidate_disk_zones(nullb->disk, NULL);

		if (ret)
			return ret;
	} else {
		blk_queue_chunk_sectors(q, dev->zone_size_sects);
		q->nr_zones = blkdev_nr_zones(nullb->disk);
	}

	blk_queue_max_zone_append_sectors(q, dev->zone_size_sects);
	blk_queue_max_open_zones(q, dev->zone_max_open);
	blk_queue_max_active_zones(q, dev->zone_max_active);

	return 0;
}

void null_free_zoned_dev(struct nullb_device *dev)
{
	kvfree(dev->zones);
}

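/*
 * Report nr_zones zones starting from the zone containing @sector. Each
 * zone is snapshotted under its lock so that the callback always sees
 * consistent zone information.
 */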
int null_report_zones(struct gendisk *disk, sector_t sector,
		      unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct nullb *nullb = disk->private_data;
	struct nullb_device *dev = nullb->dev;
	unsigned int first_zone, i;
	struct nullb_zone *zone;
	struct blk_zone blkz;
	int error;

	first_zone = null_zone_no(dev, sector);
	if (first_zone >= dev->nr_zones)
		return 0;

	nr_zones = min(nr_zones, dev->nr_zones - first_zone);
	trace_nullb_report_zones(nullb, nr_zones);

	memset(&blkz, 0, sizeof(struct blk_zone));
	zone = &dev->zones[first_zone];
	for (i = 0; i < nr_zones; i++, zone++) {
		/*
		 * Stacked DM target drivers will remap the zone information by
		 * modifying the zone information passed to the report callback.
		 * So use a local copy to avoid corruption of the device zone
		 * array.
		 */
		null_lock_zone(dev, zone);
		blkz.start = zone->start;
		blkz.len = zone->len;
		blkz.wp = zone->wp;
		blkz.type = zone->type;
		blkz.cond = zone->cond;
		blkz.capacity = zone->capacity;
		null_unlock_zone(dev, zone);

		error = cb(&blkz, i, data);
		if (error)
			return error;
	}

	return nr_zones;
}

/*
 * This is called in the case of memory backing from null_process_cmd()
 * with the target zone already locked.
 */
size_t null_zone_valid_read_len(struct nullb *nullb,
				sector_t sector, unsigned int len)
{
	struct nullb_device *dev = nullb->dev;
	struct nullb_zone *zone = &dev->zones[null_zone_no(dev, sector)];
	unsigned int nr_sectors = len >> SECTOR_SHIFT;

	/* Read must be below the write pointer position */
	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL ||
	    sector + nr_sectors <= zone->wp)
		return len;

	if (sector > zone->wp)
		return 0;

	return (zone->wp - sector) << SECTOR_SHIFT;
}

static blk_status_t __null_close_zone(struct nullb_device *dev,
				      struct nullb_zone *zone)
{
	switch (zone->cond) {
	case BLK_ZONE_COND_CLOSED:
		/* close operation on closed is not an error */
		return BLK_STS_OK;
	case BLK_ZONE_COND_IMP_OPEN:
		dev->nr_zones_imp_open--;
		break;
	case BLK_ZONE_COND_EXP_OPEN:
		dev->nr_zones_exp_open--;
		break;
	case BLK_ZONE_COND_EMPTY:
	case BLK_ZONE_COND_FULL:
	default:
		return BLK_STS_IOERR;
	}

	if (zone->wp == zone->start) {
		zone->cond = BLK_ZONE_COND_EMPTY;
	} else {
		zone->cond = BLK_ZONE_COND_CLOSED;
		dev->nr_zones_closed++;
	}

	return BLK_STS_OK;
}

static void null_close_imp_open_zone(struct nullb_device *dev)
{
	struct nullb_zone *zone;
	unsigned int zno, i;

	zno = dev->imp_close_zone_no;
	if (zno >= dev->nr_zones)
		zno = dev->zone_nr_conv;

	for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) {
		zone = &dev->zones[zno];
		zno++;
		if (zno >= dev->nr_zones)
			zno = dev->zone_nr_conv;

		if (zone->cond == BLK_ZONE_COND_IMP_OPEN) {
			__null_close_zone(dev, zone);
			dev->imp_close_zone_no = zno;
			return;
		}
	}
}

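/*
 * null_check_active() and null_check_open() check whether one more zone can
 * become active or open without exceeding the configured zone_max_active and
 * zone_max_open limits. null_check_open() may implicitly close an open zone
 * to free up an open zone resource, provided the active limit allows it.
 */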
static blk_status_t null_check_active(struct nullb_device *dev)
{
	if (!dev->zone_max_active)
		return BLK_STS_OK;

	if (dev->nr_zones_exp_open + dev->nr_zones_imp_open +
	    dev->nr_zones_closed < dev->zone_max_active)
		return BLK_STS_OK;

	return BLK_STS_ZONE_ACTIVE_RESOURCE;
}

static blk_status_t null_check_open(struct nullb_device *dev)
{
	if (!dev->zone_max_open)
		return BLK_STS_OK;

	if (dev->nr_zones_exp_open + dev->nr_zones_imp_open < dev->zone_max_open)
		return BLK_STS_OK;

	if (dev->nr_zones_imp_open) {
		if (null_check_active(dev) == BLK_STS_OK) {
			null_close_imp_open_zone(dev);
			return BLK_STS_OK;
		}
	}

	return BLK_STS_ZONE_OPEN_RESOURCE;
}

/*
 * This function matches the manage open zone resources function in the ZBC standard,
 * with the addition of max active zones support (added in the ZNS standard).
 *
 * The function determines if a zone can transition to implicit open or explicit open,
 * while maintaining the max open zone (and max active zone) limit(s). It may close an
 * implicit open zone in order to make additional zone resources available.
 *
 * ZBC states that an implicit open zone shall be closed only if there is not
 * room within the open limit. However, with the addition of an active limit,
 * it is not certain that closing an implicit open zone will allow a new zone
 * to be opened, since we might already be at the active limit capacity.
 */
static blk_status_t null_check_zone_resources(struct nullb_device *dev,
					      struct nullb_zone *zone)
{
	blk_status_t ret;

	switch (zone->cond) {
	case BLK_ZONE_COND_EMPTY:
		ret = null_check_active(dev);
		if (ret != BLK_STS_OK)
			return ret;
		fallthrough;
	case BLK_ZONE_COND_CLOSED:
		return null_check_open(dev);
	default:
		/* Should never be called for other states */
		WARN_ON(1);
		return BLK_STS_IOERR;
	}
}

static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
				    unsigned int nr_sectors, bool append)
{
	struct nullb_device *dev = cmd->nq->dev;
	unsigned int zno = null_zone_no(dev, sector);
	struct nullb_zone *zone = &dev->zones[zno];
	blk_status_t ret;

	trace_nullb_zone_op(cmd, zno, zone->cond);

	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) {
		if (append)
			return BLK_STS_IOERR;
		return null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors);
	}

	null_lock_zone(dev, zone);

	if (zone->cond == BLK_ZONE_COND_FULL) {
		/* Cannot write to a full zone */
		ret = BLK_STS_IOERR;
		goto unlock;
	}

	/*
	 * Regular writes must be at the write pointer position.
	 * Zone append writes are automatically issued at the write
	 * pointer and the position returned using the request or BIO
	 * sector.
	 */
	if (append) {
		sector = zone->wp;
		if (cmd->bio)
			cmd->bio->bi_iter.bi_sector = sector;
		else
			cmd->rq->__sector = sector;
	} else if (sector != zone->wp) {
		ret = BLK_STS_IOERR;
		goto unlock;
	}

	if (zone->wp + nr_sectors > zone->start + zone->capacity) {
		ret = BLK_STS_IOERR;
		goto unlock;
	}

	if (zone->cond == BLK_ZONE_COND_CLOSED ||
	    zone->cond == BLK_ZONE_COND_EMPTY) {
		null_lock_zone_res(dev);

		ret = null_check_zone_resources(dev, zone);
		if (ret != BLK_STS_OK) {
			null_unlock_zone_res(dev);
			goto unlock;
		}
		if (zone->cond == BLK_ZONE_COND_CLOSED) {
			dev->nr_zones_closed--;
			dev->nr_zones_imp_open++;
		} else if (zone->cond == BLK_ZONE_COND_EMPTY) {
			dev->nr_zones_imp_open++;
		}

		if (zone->cond != BLK_ZONE_COND_EXP_OPEN)
			zone->cond = BLK_ZONE_COND_IMP_OPEN;

		null_unlock_zone_res(dev);
	}

	ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors);
	if (ret != BLK_STS_OK)
		goto unlock;

	zone->wp += nr_sectors;
	if (zone->wp == zone->start + zone->capacity) {
		null_lock_zone_res(dev);
		if (zone->cond == BLK_ZONE_COND_EXP_OPEN)
			dev->nr_zones_exp_open--;
		else if (zone->cond == BLK_ZONE_COND_IMP_OPEN)
			dev->nr_zones_imp_open--;
		zone->cond = BLK_ZONE_COND_FULL;
		null_unlock_zone_res(dev);
	}

	ret = BLK_STS_OK;

unlock:
	null_unlock_zone(dev, zone);

	return ret;
}

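/*
 * Explicitly open a zone (REQ_OP_ZONE_OPEN). Opening an empty or closed zone
 * consumes an open zone resource, so the open/active limits are checked first.
 */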
static blk_status_t null_open_zone(struct nullb_device *dev,
				   struct nullb_zone *zone)
{
	blk_status_t ret = BLK_STS_OK;

	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
		return BLK_STS_IOERR;

	null_lock_zone_res(dev);

	switch (zone->cond) {
	case BLK_ZONE_COND_EXP_OPEN:
		/* open operation on exp open is not an error */
		goto unlock;
	case BLK_ZONE_COND_EMPTY:
		ret = null_check_zone_resources(dev, zone);
		if (ret != BLK_STS_OK)
			goto unlock;
		break;
	case BLK_ZONE_COND_IMP_OPEN:
		dev->nr_zones_imp_open--;
		break;
	case BLK_ZONE_COND_CLOSED:
		ret = null_check_zone_resources(dev, zone);
		if (ret != BLK_STS_OK)
			goto unlock;
		dev->nr_zones_closed--;
		break;
	case BLK_ZONE_COND_FULL:
	default:
		ret = BLK_STS_IOERR;
		goto unlock;
	}

	zone->cond = BLK_ZONE_COND_EXP_OPEN;
	dev->nr_zones_exp_open++;

unlock:
	null_unlock_zone_res(dev);

	return ret;
}

static blk_status_t null_close_zone(struct nullb_device *dev,
				    struct nullb_zone *zone)
{
	blk_status_t ret;

	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
		return BLK_STS_IOERR;

	null_lock_zone_res(dev);
	ret = __null_close_zone(dev, zone);
	null_unlock_zone_res(dev);

	return ret;
}

static blk_status_t null_finish_zone(struct nullb_device *dev,
				     struct nullb_zone *zone)
{
	blk_status_t ret = BLK_STS_OK;

	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
		return BLK_STS_IOERR;

	null_lock_zone_res(dev);

	switch (zone->cond) {
	case BLK_ZONE_COND_FULL:
		/* finish operation on full is not an error */
		goto unlock;
	case BLK_ZONE_COND_EMPTY:
		ret = null_check_zone_resources(dev, zone);
		if (ret != BLK_STS_OK)
			goto unlock;
		break;
	case BLK_ZONE_COND_IMP_OPEN:
		dev->nr_zones_imp_open--;
		break;
	case BLK_ZONE_COND_EXP_OPEN:
		dev->nr_zones_exp_open--;
		break;
	case BLK_ZONE_COND_CLOSED:
		ret = null_check_zone_resources(dev, zone);
		if (ret != BLK_STS_OK)
			goto unlock;
		dev->nr_zones_closed--;
		break;
	default:
		ret = BLK_STS_IOERR;
		goto unlock;
	}

	zone->cond = BLK_ZONE_COND_FULL;
	zone->wp = zone->start + zone->len;

unlock:
	null_unlock_zone_res(dev);

	return ret;
}

static blk_status_t null_reset_zone(struct nullb_device *dev,
				    struct nullb_zone *zone)
{
	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
		return BLK_STS_IOERR;

	null_lock_zone_res(dev);

	switch (zone->cond) {
	case BLK_ZONE_COND_EMPTY:
		/* reset operation on empty is not an error */
		null_unlock_zone_res(dev);
		return BLK_STS_OK;
	case BLK_ZONE_COND_IMP_OPEN:
		dev->nr_zones_imp_open--;
		break;
	case BLK_ZONE_COND_EXP_OPEN:
		dev->nr_zones_exp_open--;
		break;
	case BLK_ZONE_COND_CLOSED:
		dev->nr_zones_closed--;
		break;
	case BLK_ZONE_COND_FULL:
		break;
	default:
		null_unlock_zone_res(dev);
		return BLK_STS_IOERR;
	}

	zone->cond = BLK_ZONE_COND_EMPTY;
	zone->wp = zone->start;

	null_unlock_zone_res(dev);

	if (dev->memory_backed)
		return null_handle_discard(dev, zone->start, zone->len);

	return BLK_STS_OK;
}

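/*
 * Handle zone management operations. REQ_OP_ZONE_RESET_ALL walks all
 * sequential zones; the other operations act on the single zone containing
 * @sector. The target zone is locked across the state change.
 */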
static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op,
				   sector_t sector)
{
	struct nullb_device *dev = cmd->nq->dev;
	unsigned int zone_no;
	struct nullb_zone *zone;
	blk_status_t ret;
	size_t i;

	if (op == REQ_OP_ZONE_RESET_ALL) {
		for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) {
			zone = &dev->zones[i];
			null_lock_zone(dev, zone);
			if (zone->cond != BLK_ZONE_COND_EMPTY) {
				null_reset_zone(dev, zone);
				trace_nullb_zone_op(cmd, i, zone->cond);
			}
			null_unlock_zone(dev, zone);
		}
		return BLK_STS_OK;
	}

	zone_no = null_zone_no(dev, sector);
	zone = &dev->zones[zone_no];

	null_lock_zone(dev, zone);

	switch (op) {
	case REQ_OP_ZONE_RESET:
		ret = null_reset_zone(dev, zone);
		break;
	case REQ_OP_ZONE_OPEN:
		ret = null_open_zone(dev, zone);
		break;
	case REQ_OP_ZONE_CLOSE:
		ret = null_close_zone(dev, zone);
		break;
	case REQ_OP_ZONE_FINISH:
		ret = null_finish_zone(dev, zone);
		break;
	default:
		ret = BLK_STS_NOTSUPP;
		break;
	}

	if (ret == BLK_STS_OK)
		trace_nullb_zone_op(cmd, zone_no, zone->cond);

	null_unlock_zone(dev, zone);

	return ret;
}

blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_opf op,
				    sector_t sector, sector_t nr_sectors)
{
	struct nullb_device *dev;
	struct nullb_zone *zone;
	blk_status_t sts;

	switch (op) {
	case REQ_OP_WRITE:
		return null_zone_write(cmd, sector, nr_sectors, false);
	case REQ_OP_ZONE_APPEND:
		return null_zone_write(cmd, sector, nr_sectors, true);
	case REQ_OP_ZONE_RESET:
	case REQ_OP_ZONE_RESET_ALL:
	case REQ_OP_ZONE_OPEN:
	case REQ_OP_ZONE_CLOSE:
	case REQ_OP_ZONE_FINISH:
		return null_zone_mgmt(cmd, op, sector);
	default:
		dev = cmd->nq->dev;
		zone = &dev->zones[null_zone_no(dev, sector)];

		null_lock_zone(dev, zone);
		sts = null_process_cmd(cmd, op, sector, nr_sectors);
		null_unlock_zone(dev, zone);
		return sts;
	}
}