// SPDX-License-Identifier: GPL-2.0
#include <linux/vmalloc.h>
#include <linux/bitmap.h>
#include "null_blk.h"

#define CREATE_TRACE_POINTS
#include "trace.h"

#define MB_TO_SECTS(mb) (((sector_t)mb * SZ_1M) >> SECTOR_SHIFT)

static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect)
{
	return sect >> ilog2(dev->zone_size_sects);
}

static inline void null_lock_zone_res(struct nullb_device *dev)
{
	if (dev->need_zone_res_mgmt)
		spin_lock_irq(&dev->zone_res_lock);
}

static inline void null_unlock_zone_res(struct nullb_device *dev)
{
	if (dev->need_zone_res_mgmt)
		spin_unlock_irq(&dev->zone_res_lock);
}

static inline void null_init_zone_lock(struct nullb_device *dev,
				       struct nullb_zone *zone)
{
	if (!dev->memory_backed)
		spin_lock_init(&zone->spinlock);
	else
		mutex_init(&zone->mutex);
}

static inline void null_lock_zone(struct nullb_device *dev,
				  struct nullb_zone *zone)
{
	if (!dev->memory_backed)
		spin_lock_irq(&zone->spinlock);
	else
		mutex_lock(&zone->mutex);
}

static inline void null_unlock_zone(struct nullb_device *dev,
				    struct nullb_zone *zone)
{
	if (!dev->memory_backed)
		spin_unlock_irq(&zone->spinlock);
	else
		mutex_unlock(&zone->mutex);
}

int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q)
{
	sector_t dev_capacity_sects, zone_capacity_sects;
	struct nullb_zone *zone;
	sector_t sector = 0;
	unsigned int i;

	if (!is_power_of_2(dev->zone_size)) {
		pr_err("zone_size must be power-of-two\n");
		return -EINVAL;
	}
	if (dev->zone_size > dev->size) {
		pr_err("Zone size larger than device capacity\n");
		return -EINVAL;
	}

	if (!dev->zone_capacity)
		dev->zone_capacity = dev->zone_size;

	if (dev->zone_capacity > dev->zone_size) {
		pr_err("null_blk: zone capacity (%lu MB) larger than zone size (%lu MB)\n",
		       dev->zone_capacity, dev->zone_size);
		return -EINVAL;
	}

	zone_capacity_sects = MB_TO_SECTS(dev->zone_capacity);
	dev_capacity_sects = MB_TO_SECTS(dev->size);
	dev->zone_size_sects = MB_TO_SECTS(dev->zone_size);
	dev->nr_zones = dev_capacity_sects >> ilog2(dev->zone_size_sects);
	if (dev_capacity_sects & (dev->zone_size_sects - 1))
		dev->nr_zones++;
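
	/*
	 * Illustrative example (hypothetical values, not defaults): with
	 * size = 2048 (MB) and zone_size = 256 (MB), MB_TO_SECTS() gives
	 * zone_size_sects = 524288 and dev_capacity_sects = 4194304, so
	 * nr_zones = 8. With size = 2050, the trailing 2 MB does not fill
	 * a whole zone, so nr_zones becomes 9 and the last (runt) zone is
	 * shortened to 4096 sectors by the sequential zone setup below.
	 */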

	dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct nullb_zone),
				    GFP_KERNEL | __GFP_ZERO);
	if (!dev->zones)
		return -ENOMEM;

	spin_lock_init(&dev->zone_res_lock);

	if (dev->zone_nr_conv >= dev->nr_zones) {
		dev->zone_nr_conv = dev->nr_zones - 1;
		pr_info("changed the number of conventional zones to %u\n",
			dev->zone_nr_conv);
	}

	/* Max active zones has to be < nbr of seq zones in order to be enforceable */
	if (dev->zone_max_active >= dev->nr_zones - dev->zone_nr_conv) {
		dev->zone_max_active = 0;
		pr_info("zone_max_active limit disabled, limit >= zone count\n");
	}

	/* Max open zones has to be <= max active zones */
	if (dev->zone_max_active && dev->zone_max_open > dev->zone_max_active) {
		dev->zone_max_open = dev->zone_max_active;
		pr_info("changed the maximum number of open zones to %u\n",
			dev->zone_max_open);
	} else if (dev->zone_max_open >= dev->nr_zones - dev->zone_nr_conv) {
		dev->zone_max_open = 0;
		pr_info("zone_max_open limit disabled, limit >= zone count\n");
	}
	dev->need_zone_res_mgmt = dev->zone_max_active || dev->zone_max_open;
	dev->imp_close_zone_no = dev->zone_nr_conv;

	for (i = 0; i < dev->zone_nr_conv; i++) {
		zone = &dev->zones[i];

		null_init_zone_lock(dev, zone);
		zone->start = sector;
		zone->len = dev->zone_size_sects;
		zone->capacity = zone->len;
		zone->wp = zone->start + zone->len;
		zone->type = BLK_ZONE_TYPE_CONVENTIONAL;
		zone->cond = BLK_ZONE_COND_NOT_WP;

		sector += dev->zone_size_sects;
	}

	for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) {
		zone = &dev->zones[i];

		null_init_zone_lock(dev, zone);
		zone->start = zone->wp = sector;
		if (zone->start + dev->zone_size_sects > dev_capacity_sects)
			zone->len = dev_capacity_sects - zone->start;
		else
			zone->len = dev->zone_size_sects;
		zone->capacity =
			min_t(sector_t, zone->len, zone_capacity_sects);
		zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ;
		zone->cond = BLK_ZONE_COND_EMPTY;

		sector += dev->zone_size_sects;
	}

	q->limits.zoned = BLK_ZONED_HM;
	blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
	blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE);

	return 0;
}

int null_register_zoned_dev(struct nullb *nullb)
{
	struct nullb_device *dev = nullb->dev;
	struct request_queue *q = nullb->q;

	if (queue_is_mq(q)) {
		int ret = blk_revalidate_disk_zones(nullb->disk, NULL);

		if (ret)
			return ret;
	} else {
		blk_queue_chunk_sectors(q, dev->zone_size_sects);
		q->nr_zones = blkdev_nr_zones(nullb->disk);
	}

	blk_queue_max_zone_append_sectors(q, dev->zone_size_sects);
	blk_queue_max_open_zones(q, dev->zone_max_open);
	blk_queue_max_active_zones(q, dev->zone_max_active);

	return 0;
}

void null_free_zoned_dev(struct nullb_device *dev)
{
	kvfree(dev->zones);
}

int null_report_zones(struct gendisk *disk, sector_t sector,
		      unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct nullb *nullb = disk->private_data;
	struct nullb_device *dev = nullb->dev;
	unsigned int first_zone, i;
	struct nullb_zone *zone;
	struct blk_zone blkz;
	int error;

	first_zone = null_zone_no(dev, sector);
	if (first_zone >= dev->nr_zones)
		return 0;

	nr_zones = min(nr_zones, dev->nr_zones - first_zone);
	trace_nullb_report_zones(nullb, nr_zones);

	memset(&blkz, 0, sizeof(struct blk_zone));
	zone = &dev->zones[first_zone];
	for (i = 0; i < nr_zones; i++, zone++) {
		/*
		 * Stacked DM target drivers will remap the zone information by
		 * modifying the zone information passed to the report callback.
		 * So use a local copy to avoid corruption of the device zone
		 * array.
		 */
		null_lock_zone(dev, zone);
		blkz.start = zone->start;
		blkz.len = zone->len;
		blkz.wp = zone->wp;
		blkz.type = zone->type;
		blkz.cond = zone->cond;
		blkz.capacity = zone->capacity;
		null_unlock_zone(dev, zone);

		error = cb(&blkz, i, data);
		if (error)
			return error;
	}

	return nr_zones;
}

/*
 * This is called in the case of memory backing from null_process_cmd()
 * with the target zone already locked.
 */
size_t null_zone_valid_read_len(struct nullb *nullb,
				sector_t sector, unsigned int len)
{
	struct nullb_device *dev = nullb->dev;
	struct nullb_zone *zone = &dev->zones[null_zone_no(dev, sector)];
	unsigned int nr_sectors = len >> SECTOR_SHIFT;

	/* Read must be below the write pointer position */
	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL ||
	    sector + nr_sectors <= zone->wp)
		return len;

	if (sector > zone->wp)
		return 0;

	return (zone->wp - sector) << SECTOR_SHIFT;
}

static blk_status_t __null_close_zone(struct nullb_device *dev,
				      struct nullb_zone *zone)
{
	switch (zone->cond) {
	case BLK_ZONE_COND_CLOSED:
		/* close operation on closed is not an error */
		return BLK_STS_OK;
	case BLK_ZONE_COND_IMP_OPEN:
		dev->nr_zones_imp_open--;
		break;
	case BLK_ZONE_COND_EXP_OPEN:
		dev->nr_zones_exp_open--;
		break;
	case BLK_ZONE_COND_EMPTY:
	case BLK_ZONE_COND_FULL:
	default:
		return BLK_STS_IOERR;
	}

	if (zone->wp == zone->start) {
		zone->cond = BLK_ZONE_COND_EMPTY;
	} else {
		zone->cond = BLK_ZONE_COND_CLOSED;
		dev->nr_zones_closed++;
	}

	return BLK_STS_OK;
}

static void null_close_imp_open_zone(struct nullb_device *dev)
{
	struct nullb_zone *zone;
	unsigned int zno, i;

	zno = dev->imp_close_zone_no;
	if (zno >= dev->nr_zones)
		zno = dev->zone_nr_conv;

	for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) {
		zone = &dev->zones[zno];
		zno++;
		if (zno >= dev->nr_zones)
			zno = dev->zone_nr_conv;

		if (zone->cond == BLK_ZONE_COND_IMP_OPEN) {
			__null_close_zone(dev, zone);
			dev->imp_close_zone_no = zno;
			return;
		}
	}
}

static blk_status_t null_check_active(struct nullb_device *dev)
{
	if (!dev->zone_max_active)
		return BLK_STS_OK;

	if (dev->nr_zones_exp_open + dev->nr_zones_imp_open +
			dev->nr_zones_closed < dev->zone_max_active)
		return BLK_STS_OK;

	return BLK_STS_ZONE_ACTIVE_RESOURCE;
}

static blk_status_t null_check_open(struct nullb_device *dev)
{
	if (!dev->zone_max_open)
		return BLK_STS_OK;

	if (dev->nr_zones_exp_open + dev->nr_zones_imp_open < dev->zone_max_open)
		return BLK_STS_OK;

	if (dev->nr_zones_imp_open) {
		if (null_check_active(dev) == BLK_STS_OK) {
			null_close_imp_open_zone(dev);
			return BLK_STS_OK;
		}
	}

	return BLK_STS_ZONE_OPEN_RESOURCE;
}
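
/*
 * Illustrative accounting example for the helpers above (hypothetical
 * limits): with zone_max_open = 2, zone_max_active = 3 and two zones
 * already implicitly open, a write to an empty zone passes
 * null_check_active() (2 active < 3) but hits the open limit in
 * null_check_open(). Since an implicitly open zone exists and the active
 * limit still has room, null_close_imp_open_zone() closes one of them
 * and the new zone is then allowed to become implicitly open.
 */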

/*
 * This function matches the manage open zone resources function in the ZBC standard,
 * with the addition of max active zones support (added in the ZNS standard).
 *
 * The function determines if a zone can transition to implicit open or explicit open,
 * while maintaining the max open zone (and max active zone) limit(s). It may close an
 * implicit open zone in order to make additional zone resources available.
 *
 * ZBC states that an implicit open zone shall be closed only if there is not
 * room within the open limit. However, with the addition of an active limit,
 * it is not certain that closing an implicit open zone will allow a new zone
 * to be opened, since we might already be at the active limit capacity.
 */
static blk_status_t null_check_zone_resources(struct nullb_device *dev,
					      struct nullb_zone *zone)
{
	blk_status_t ret;

	switch (zone->cond) {
	case BLK_ZONE_COND_EMPTY:
		ret = null_check_active(dev);
		if (ret != BLK_STS_OK)
			return ret;
		fallthrough;
	case BLK_ZONE_COND_CLOSED:
		return null_check_open(dev);
	default:
		/* Should never be called for other states */
		WARN_ON(1);
		return BLK_STS_IOERR;
	}
}

static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
				    unsigned int nr_sectors, bool append)
{
	struct nullb_device *dev = cmd->nq->dev;
	unsigned int zno = null_zone_no(dev, sector);
	struct nullb_zone *zone = &dev->zones[zno];
	blk_status_t ret;

	trace_nullb_zone_op(cmd, zno, zone->cond);

	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) {
		if (append)
			return BLK_STS_IOERR;
		return null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors);
	}

	null_lock_zone(dev, zone);

	if (zone->cond == BLK_ZONE_COND_FULL) {
		/* Cannot write to a full zone */
		ret = BLK_STS_IOERR;
		goto unlock;
	}

	/*
	 * Regular writes must be at the write pointer position.
	 * Zone append writes are automatically issued at the write
	 * pointer and the position returned using the request or BIO
	 * sector.
	 */
	if (append) {
		sector = zone->wp;
		if (cmd->bio)
			cmd->bio->bi_iter.bi_sector = sector;
		else
			cmd->rq->__sector = sector;
	} else if (sector != zone->wp) {
		ret = BLK_STS_IOERR;
		goto unlock;
	}

	if (zone->wp + nr_sectors > zone->start + zone->capacity) {
		ret = BLK_STS_IOERR;
		goto unlock;
	}

	if (zone->cond == BLK_ZONE_COND_CLOSED ||
	    zone->cond == BLK_ZONE_COND_EMPTY) {
		null_lock_zone_res(dev);

		ret = null_check_zone_resources(dev, zone);
		if (ret != BLK_STS_OK) {
			null_unlock_zone_res(dev);
			goto unlock;
		}
		if (zone->cond == BLK_ZONE_COND_CLOSED) {
			dev->nr_zones_closed--;
			dev->nr_zones_imp_open++;
		} else if (zone->cond == BLK_ZONE_COND_EMPTY) {
			dev->nr_zones_imp_open++;
		}

		if (zone->cond != BLK_ZONE_COND_EXP_OPEN)
			zone->cond = BLK_ZONE_COND_IMP_OPEN;

		null_unlock_zone_res(dev);
	}

	ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors);
	if (ret != BLK_STS_OK)
		goto unlock;

	zone->wp += nr_sectors;
	if (zone->wp == zone->start + zone->capacity) {
		null_lock_zone_res(dev);
		if (zone->cond == BLK_ZONE_COND_EXP_OPEN)
			dev->nr_zones_exp_open--;
		else if (zone->cond == BLK_ZONE_COND_IMP_OPEN)
			dev->nr_zones_imp_open--;
		zone->cond = BLK_ZONE_COND_FULL;
		null_unlock_zone_res(dev);
	}

	ret = BLK_STS_OK;

unlock:
	null_unlock_zone(dev, zone);

	return ret;
}
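
/*
 * Zone append example for null_zone_write() above (illustrative numbers):
 * for a sequential zone with start = 524288 and wp = 524544, a
 * REQ_OP_ZONE_APPEND of 8 sectors is redirected to sector 524544, that
 * sector is returned to the caller through the BIO/request sector field,
 * and the write pointer advances to 524552 once the data is processed.
 */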

static blk_status_t null_open_zone(struct nullb_device *dev,
				   struct nullb_zone *zone)
{
	blk_status_t ret = BLK_STS_OK;

	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
		return BLK_STS_IOERR;

	null_lock_zone_res(dev);

	switch (zone->cond) {
	case BLK_ZONE_COND_EXP_OPEN:
		/* open operation on exp open is not an error */
		goto unlock;
	case BLK_ZONE_COND_EMPTY:
		ret = null_check_zone_resources(dev, zone);
		if (ret != BLK_STS_OK)
			goto unlock;
		break;
	case BLK_ZONE_COND_IMP_OPEN:
		dev->nr_zones_imp_open--;
		break;
	case BLK_ZONE_COND_CLOSED:
		ret = null_check_zone_resources(dev, zone);
		if (ret != BLK_STS_OK)
			goto unlock;
		dev->nr_zones_closed--;
		break;
	case BLK_ZONE_COND_FULL:
	default:
		ret = BLK_STS_IOERR;
		goto unlock;
	}

	zone->cond = BLK_ZONE_COND_EXP_OPEN;
	dev->nr_zones_exp_open++;

unlock:
	null_unlock_zone_res(dev);

	return ret;
}

static blk_status_t null_close_zone(struct nullb_device *dev,
				    struct nullb_zone *zone)
{
	blk_status_t ret;

	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
		return BLK_STS_IOERR;

	null_lock_zone_res(dev);
	ret = __null_close_zone(dev, zone);
	null_unlock_zone_res(dev);

	return ret;
}

static blk_status_t null_finish_zone(struct nullb_device *dev,
				     struct nullb_zone *zone)
{
	blk_status_t ret = BLK_STS_OK;

	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
		return BLK_STS_IOERR;

	null_lock_zone_res(dev);

	switch (zone->cond) {
	case BLK_ZONE_COND_FULL:
		/* finish operation on full is not an error */
		goto unlock;
	case BLK_ZONE_COND_EMPTY:
		ret = null_check_zone_resources(dev, zone);
		if (ret != BLK_STS_OK)
			goto unlock;
		break;
	case BLK_ZONE_COND_IMP_OPEN:
		dev->nr_zones_imp_open--;
		break;
	case BLK_ZONE_COND_EXP_OPEN:
		dev->nr_zones_exp_open--;
		break;
	case BLK_ZONE_COND_CLOSED:
		ret = null_check_zone_resources(dev, zone);
		if (ret != BLK_STS_OK)
			goto unlock;
		dev->nr_zones_closed--;
		break;
	default:
		ret = BLK_STS_IOERR;
		goto unlock;
	}

	zone->cond = BLK_ZONE_COND_FULL;
	zone->wp = zone->start + zone->len;

unlock:
	null_unlock_zone_res(dev);

	return ret;
}

static blk_status_t null_reset_zone(struct nullb_device *dev,
				    struct nullb_zone *zone)
{
	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
		return BLK_STS_IOERR;

	null_lock_zone_res(dev);

	switch (zone->cond) {
	case BLK_ZONE_COND_EMPTY:
		/* reset operation on empty is not an error */
		null_unlock_zone_res(dev);
		return BLK_STS_OK;
	case BLK_ZONE_COND_IMP_OPEN:
		dev->nr_zones_imp_open--;
		break;
	case BLK_ZONE_COND_EXP_OPEN:
		dev->nr_zones_exp_open--;
		break;
	case BLK_ZONE_COND_CLOSED:
		dev->nr_zones_closed--;
		break;
	case BLK_ZONE_COND_FULL:
		break;
	default:
		null_unlock_zone_res(dev);
		return BLK_STS_IOERR;
	}

	zone->cond = BLK_ZONE_COND_EMPTY;
	zone->wp = zone->start;

	null_unlock_zone_res(dev);

	if (dev->memory_backed)
		return null_handle_discard(dev, zone->start, zone->len);

	return BLK_STS_OK;
}
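
/*
 * Summary of the zone condition transitions implemented by the management
 * helpers above (conventional zones reject all of them with BLK_STS_IOERR,
 * and an operation that finds the zone already in its target condition is
 * a no-op):
 *
 *   OPEN:   EMPTY / IMP_OPEN / CLOSED -> EXP_OPEN (FULL -> error),
 *           subject to null_check_zone_resources() for EMPTY and CLOSED
 *   CLOSE:  IMP_OPEN / EXP_OPEN -> CLOSED, or EMPTY if wp == start
 *   FINISH: EMPTY / IMP_OPEN / EXP_OPEN / CLOSED -> FULL
 *   RESET:  IMP_OPEN / EXP_OPEN / CLOSED / FULL -> EMPTY
 */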

static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op,
				   sector_t sector)
{
	struct nullb_device *dev = cmd->nq->dev;
	unsigned int zone_no;
	struct nullb_zone *zone;
	blk_status_t ret;
	size_t i;

	if (op == REQ_OP_ZONE_RESET_ALL) {
		for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) {
			zone = &dev->zones[i];
			null_lock_zone(dev, zone);
			if (zone->cond != BLK_ZONE_COND_EMPTY) {
				null_reset_zone(dev, zone);
				trace_nullb_zone_op(cmd, i, zone->cond);
			}
			null_unlock_zone(dev, zone);
		}
		return BLK_STS_OK;
	}

	zone_no = null_zone_no(dev, sector);
	zone = &dev->zones[zone_no];

	null_lock_zone(dev, zone);

	switch (op) {
	case REQ_OP_ZONE_RESET:
		ret = null_reset_zone(dev, zone);
		break;
	case REQ_OP_ZONE_OPEN:
		ret = null_open_zone(dev, zone);
		break;
	case REQ_OP_ZONE_CLOSE:
		ret = null_close_zone(dev, zone);
		break;
	case REQ_OP_ZONE_FINISH:
		ret = null_finish_zone(dev, zone);
		break;
	default:
		ret = BLK_STS_NOTSUPP;
		break;
	}

	if (ret == BLK_STS_OK)
		trace_nullb_zone_op(cmd, zone_no, zone->cond);

	null_unlock_zone(dev, zone);

	return ret;
}

blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_opf op,
				    sector_t sector, sector_t nr_sectors)
{
	struct nullb_device *dev;
	struct nullb_zone *zone;
	blk_status_t sts;

	switch (op) {
	case REQ_OP_WRITE:
		return null_zone_write(cmd, sector, nr_sectors, false);
	case REQ_OP_ZONE_APPEND:
		return null_zone_write(cmd, sector, nr_sectors, true);
	case REQ_OP_ZONE_RESET:
	case REQ_OP_ZONE_RESET_ALL:
	case REQ_OP_ZONE_OPEN:
	case REQ_OP_ZONE_CLOSE:
	case REQ_OP_ZONE_FINISH:
		return null_zone_mgmt(cmd, op, sector);
	default:
		dev = cmd->nq->dev;
		zone = &dev->zones[null_zone_no(dev, sector)];

		null_lock_zone(dev, zone);
		sts = null_process_cmd(cmd, op, sector, nr_sectors);
		null_unlock_zone(dev, zone);
		return sts;
	}
}
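
/*
 * Usage sketch (not part of the driver logic): assuming the null_blk
 * module parameters that mirror the dev->zone_* fields used above, a
 * zoned test device can be created with e.g.
 *
 *   modprobe null_blk zoned=1 zone_size=256 zone_nr_conv=4 \
 *           zone_max_open=8 zone_max_active=12
 *
 * which exposes /dev/nullb0 as a host-managed zoned block device with
 * 256 MB zones, four conventional zones and the given open/active limits.
 */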