1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Zoned block device handling 4 * 5 * Copyright (c) 2015, Hannes Reinecke 6 * Copyright (c) 2015, SUSE Linux GmbH 7 * 8 * Copyright (c) 2016, Damien Le Moal 9 * Copyright (c) 2016, Western Digital 10 */ 11 12 #include <linux/kernel.h> 13 #include <linux/module.h> 14 #include <linux/rbtree.h> 15 #include <linux/blkdev.h> 16 #include <linux/blk-mq.h> 17 #include <linux/mm.h> 18 #include <linux/vmalloc.h> 19 #include <linux/sched/mm.h> 20 21 #include "blk.h" 22 23 #define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name 24 static const char *const zone_cond_name[] = { 25 ZONE_COND_NAME(NOT_WP), 26 ZONE_COND_NAME(EMPTY), 27 ZONE_COND_NAME(IMP_OPEN), 28 ZONE_COND_NAME(EXP_OPEN), 29 ZONE_COND_NAME(CLOSED), 30 ZONE_COND_NAME(READONLY), 31 ZONE_COND_NAME(FULL), 32 ZONE_COND_NAME(OFFLINE), 33 }; 34 #undef ZONE_COND_NAME 35 36 /** 37 * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX. 38 * @zone_cond: BLK_ZONE_COND_XXX. 39 * 40 * Description: Centralize block layer function to convert BLK_ZONE_COND_XXX 41 * into string format. Useful in the debugging and tracing zone conditions. For 42 * invalid BLK_ZONE_COND_XXX it returns string "UNKNOWN". 43 */ 44 const char *blk_zone_cond_str(enum blk_zone_cond zone_cond) 45 { 46 static const char *zone_cond_str = "UNKNOWN"; 47 48 if (zone_cond < ARRAY_SIZE(zone_cond_name) && zone_cond_name[zone_cond]) 49 zone_cond_str = zone_cond_name[zone_cond]; 50 51 return zone_cond_str; 52 } 53 EXPORT_SYMBOL_GPL(blk_zone_cond_str); 54 55 static inline sector_t blk_zone_start(struct request_queue *q, 56 sector_t sector) 57 { 58 sector_t zone_mask = blk_queue_zone_sectors(q) - 1; 59 60 return sector & ~zone_mask; 61 } 62 63 /* 64 * Return true if a request is a write requests that needs zone write locking. 65 */ 66 bool blk_req_needs_zone_write_lock(struct request *rq) 67 { 68 if (!rq->q->seq_zones_wlock) 69 return false; 70 71 if (blk_rq_is_passthrough(rq)) 72 return false; 73 74 switch (req_op(rq)) { 75 case REQ_OP_WRITE_ZEROES: 76 case REQ_OP_WRITE_SAME: 77 case REQ_OP_WRITE: 78 return blk_rq_zone_is_seq(rq); 79 default: 80 return false; 81 } 82 } 83 EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock); 84 85 void __blk_req_zone_write_lock(struct request *rq) 86 { 87 if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq), 88 rq->q->seq_zones_wlock))) 89 return; 90 91 WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED); 92 rq->rq_flags |= RQF_ZONE_WRITE_LOCKED; 93 } 94 EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock); 95 96 void __blk_req_zone_write_unlock(struct request *rq) 97 { 98 rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED; 99 if (rq->q->seq_zones_wlock) 100 WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq), 101 rq->q->seq_zones_wlock)); 102 } 103 EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock); 104 105 /** 106 * blkdev_nr_zones - Get number of zones 107 * @disk: Target gendisk 108 * 109 * Return the total number of zones of a zoned block device. For a block 110 * device without zone capabilities, the number of zones is always 0. 111 */ 112 unsigned int blkdev_nr_zones(struct gendisk *disk) 113 { 114 sector_t zone_sectors = blk_queue_zone_sectors(disk->queue); 115 116 if (!blk_queue_is_zoned(disk->queue)) 117 return 0; 118 return (get_capacity(disk) + zone_sectors - 1) >> ilog2(zone_sectors); 119 } 120 EXPORT_SYMBOL_GPL(blkdev_nr_zones); 121 122 /** 123 * blkdev_report_zones - Get zones information 124 * @bdev: Target block device 125 * @sector: Sector from which to report zones 126 * @nr_zones: Maximum number of zones to report 127 * @cb: Callback function called for each reported zone 128 * @data: Private data for the callback 129 * 130 * Description: 131 * Get zone information starting from the zone containing @sector for at most 132 * @nr_zones, and call @cb for each zone reported by the device. 133 * To report all zones in a device starting from @sector, the BLK_ALL_ZONES 134 * constant can be passed to @nr_zones. 135 * Returns the number of zones reported by the device, or a negative errno 136 * value in case of failure. 137 * 138 * Note: The caller must use memalloc_noXX_save/restore() calls to control 139 * memory allocations done within this function. 140 */ 141 int blkdev_report_zones(struct block_device *bdev, sector_t sector, 142 unsigned int nr_zones, report_zones_cb cb, void *data) 143 { 144 struct gendisk *disk = bdev->bd_disk; 145 sector_t capacity = get_capacity(disk); 146 147 if (!blk_queue_is_zoned(bdev_get_queue(bdev)) || 148 WARN_ON_ONCE(!disk->fops->report_zones)) 149 return -EOPNOTSUPP; 150 151 if (!nr_zones || sector >= capacity) 152 return 0; 153 154 return disk->fops->report_zones(disk, sector, nr_zones, cb, data); 155 } 156 EXPORT_SYMBOL_GPL(blkdev_report_zones); 157 158 static inline bool blkdev_allow_reset_all_zones(struct block_device *bdev, 159 sector_t sector, 160 sector_t nr_sectors) 161 { 162 if (!blk_queue_zone_resetall(bdev_get_queue(bdev))) 163 return false; 164 165 /* 166 * REQ_OP_ZONE_RESET_ALL can be executed only if the number of sectors 167 * of the applicable zone range is the entire disk. 168 */ 169 return !sector && nr_sectors == get_capacity(bdev->bd_disk); 170 } 171 172 /** 173 * blkdev_zone_mgmt - Execute a zone management operation on a range of zones 174 * @bdev: Target block device 175 * @op: Operation to be performed on the zones 176 * @sector: Start sector of the first zone to operate on 177 * @nr_sectors: Number of sectors, should be at least the length of one zone and 178 * must be zone size aligned. 179 * @gfp_mask: Memory allocation flags (for bio_alloc) 180 * 181 * Description: 182 * Perform the specified operation on the range of zones specified by 183 * @sector..@sector+@nr_sectors. Specifying the entire disk sector range 184 * is valid, but the specified range should not contain conventional zones. 185 * The operation to execute on each zone can be a zone reset, open, close 186 * or finish request. 187 */ 188 int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op, 189 sector_t sector, sector_t nr_sectors, 190 gfp_t gfp_mask) 191 { 192 struct request_queue *q = bdev_get_queue(bdev); 193 sector_t zone_sectors = blk_queue_zone_sectors(q); 194 sector_t capacity = get_capacity(bdev->bd_disk); 195 sector_t end_sector = sector + nr_sectors; 196 struct bio *bio = NULL; 197 int ret; 198 199 if (!blk_queue_is_zoned(q)) 200 return -EOPNOTSUPP; 201 202 if (bdev_read_only(bdev)) 203 return -EPERM; 204 205 if (!op_is_zone_mgmt(op)) 206 return -EOPNOTSUPP; 207 208 if (end_sector <= sector || end_sector > capacity) 209 /* Out of range */ 210 return -EINVAL; 211 212 /* Check alignment (handle eventual smaller last zone) */ 213 if (sector & (zone_sectors - 1)) 214 return -EINVAL; 215 216 if ((nr_sectors & (zone_sectors - 1)) && end_sector != capacity) 217 return -EINVAL; 218 219 while (sector < end_sector) { 220 bio = blk_next_bio(bio, 0, gfp_mask); 221 bio_set_dev(bio, bdev); 222 223 /* 224 * Special case for the zone reset operation that reset all 225 * zones, this is useful for applications like mkfs. 226 */ 227 if (op == REQ_OP_ZONE_RESET && 228 blkdev_allow_reset_all_zones(bdev, sector, nr_sectors)) { 229 bio->bi_opf = REQ_OP_ZONE_RESET_ALL; 230 break; 231 } 232 233 bio->bi_opf = op | REQ_SYNC; 234 bio->bi_iter.bi_sector = sector; 235 sector += zone_sectors; 236 237 /* This may take a while, so be nice to others */ 238 cond_resched(); 239 } 240 241 ret = submit_bio_wait(bio); 242 bio_put(bio); 243 244 return ret; 245 } 246 EXPORT_SYMBOL_GPL(blkdev_zone_mgmt); 247 248 struct zone_report_args { 249 struct blk_zone __user *zones; 250 }; 251 252 static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx, 253 void *data) 254 { 255 struct zone_report_args *args = data; 256 257 if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone))) 258 return -EFAULT; 259 return 0; 260 } 261 262 /* 263 * BLKREPORTZONE ioctl processing. 264 * Called from blkdev_ioctl. 265 */ 266 int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, 267 unsigned int cmd, unsigned long arg) 268 { 269 void __user *argp = (void __user *)arg; 270 struct zone_report_args args; 271 struct request_queue *q; 272 struct blk_zone_report rep; 273 int ret; 274 275 if (!argp) 276 return -EINVAL; 277 278 q = bdev_get_queue(bdev); 279 if (!q) 280 return -ENXIO; 281 282 if (!blk_queue_is_zoned(q)) 283 return -ENOTTY; 284 285 if (!capable(CAP_SYS_ADMIN)) 286 return -EACCES; 287 288 if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report))) 289 return -EFAULT; 290 291 if (!rep.nr_zones) 292 return -EINVAL; 293 294 args.zones = argp + sizeof(struct blk_zone_report); 295 ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones, 296 blkdev_copy_zone_to_user, &args); 297 if (ret < 0) 298 return ret; 299 300 rep.nr_zones = ret; 301 if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report))) 302 return -EFAULT; 303 return 0; 304 } 305 306 /* 307 * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing. 308 * Called from blkdev_ioctl. 309 */ 310 int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, 311 unsigned int cmd, unsigned long arg) 312 { 313 void __user *argp = (void __user *)arg; 314 struct request_queue *q; 315 struct blk_zone_range zrange; 316 enum req_opf op; 317 318 if (!argp) 319 return -EINVAL; 320 321 q = bdev_get_queue(bdev); 322 if (!q) 323 return -ENXIO; 324 325 if (!blk_queue_is_zoned(q)) 326 return -ENOTTY; 327 328 if (!capable(CAP_SYS_ADMIN)) 329 return -EACCES; 330 331 if (!(mode & FMODE_WRITE)) 332 return -EBADF; 333 334 if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range))) 335 return -EFAULT; 336 337 switch (cmd) { 338 case BLKRESETZONE: 339 op = REQ_OP_ZONE_RESET; 340 break; 341 case BLKOPENZONE: 342 op = REQ_OP_ZONE_OPEN; 343 break; 344 case BLKCLOSEZONE: 345 op = REQ_OP_ZONE_CLOSE; 346 break; 347 case BLKFINISHZONE: 348 op = REQ_OP_ZONE_FINISH; 349 break; 350 default: 351 return -ENOTTY; 352 } 353 354 return blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors, 355 GFP_KERNEL); 356 } 357 358 static inline unsigned long *blk_alloc_zone_bitmap(int node, 359 unsigned int nr_zones) 360 { 361 return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long), 362 GFP_NOIO, node); 363 } 364 365 void blk_queue_free_zone_bitmaps(struct request_queue *q) 366 { 367 kfree(q->conv_zones_bitmap); 368 q->conv_zones_bitmap = NULL; 369 kfree(q->seq_zones_wlock); 370 q->seq_zones_wlock = NULL; 371 } 372 373 struct blk_revalidate_zone_args { 374 struct gendisk *disk; 375 unsigned long *conv_zones_bitmap; 376 unsigned long *seq_zones_wlock; 377 unsigned int nr_zones; 378 sector_t zone_sectors; 379 sector_t sector; 380 }; 381 382 /* 383 * Helper function to check the validity of zones of a zoned block device. 384 */ 385 static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, 386 void *data) 387 { 388 struct blk_revalidate_zone_args *args = data; 389 struct gendisk *disk = args->disk; 390 struct request_queue *q = disk->queue; 391 sector_t capacity = get_capacity(disk); 392 393 /* 394 * All zones must have the same size, with the exception on an eventual 395 * smaller last zone. 396 */ 397 if (zone->start == 0) { 398 if (zone->len == 0 || !is_power_of_2(zone->len)) { 399 pr_warn("%s: Invalid zoned device with non power of two zone size (%llu)\n", 400 disk->disk_name, zone->len); 401 return -ENODEV; 402 } 403 404 args->zone_sectors = zone->len; 405 args->nr_zones = (capacity + zone->len - 1) >> ilog2(zone->len); 406 } else if (zone->start + args->zone_sectors < capacity) { 407 if (zone->len != args->zone_sectors) { 408 pr_warn("%s: Invalid zoned device with non constant zone size\n", 409 disk->disk_name); 410 return -ENODEV; 411 } 412 } else { 413 if (zone->len > args->zone_sectors) { 414 pr_warn("%s: Invalid zoned device with larger last zone size\n", 415 disk->disk_name); 416 return -ENODEV; 417 } 418 } 419 420 /* Check for holes in the zone report */ 421 if (zone->start != args->sector) { 422 pr_warn("%s: Zone gap at sectors %llu..%llu\n", 423 disk->disk_name, args->sector, zone->start); 424 return -ENODEV; 425 } 426 427 /* Check zone type */ 428 switch (zone->type) { 429 case BLK_ZONE_TYPE_CONVENTIONAL: 430 if (!args->conv_zones_bitmap) { 431 args->conv_zones_bitmap = 432 blk_alloc_zone_bitmap(q->node, args->nr_zones); 433 if (!args->conv_zones_bitmap) 434 return -ENOMEM; 435 } 436 set_bit(idx, args->conv_zones_bitmap); 437 break; 438 case BLK_ZONE_TYPE_SEQWRITE_REQ: 439 case BLK_ZONE_TYPE_SEQWRITE_PREF: 440 if (!args->seq_zones_wlock) { 441 args->seq_zones_wlock = 442 blk_alloc_zone_bitmap(q->node, args->nr_zones); 443 if (!args->seq_zones_wlock) 444 return -ENOMEM; 445 } 446 break; 447 default: 448 pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n", 449 disk->disk_name, (int)zone->type, zone->start); 450 return -ENODEV; 451 } 452 453 args->sector += zone->len; 454 return 0; 455 } 456 457 /** 458 * blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps 459 * @disk: Target disk 460 * 461 * Helper function for low-level device drivers to (re) allocate and initialize 462 * a disk request queue zone bitmaps. This functions should normally be called 463 * within the disk ->revalidate method for blk-mq based drivers. For BIO based 464 * drivers only q->nr_zones needs to be updated so that the sysfs exposed value 465 * is correct. 466 */ 467 int blk_revalidate_disk_zones(struct gendisk *disk) 468 { 469 struct request_queue *q = disk->queue; 470 struct blk_revalidate_zone_args args = { 471 .disk = disk, 472 }; 473 unsigned int noio_flag; 474 int ret; 475 476 if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) 477 return -EIO; 478 if (WARN_ON_ONCE(!queue_is_mq(q))) 479 return -EIO; 480 481 /* 482 * Ensure that all memory allocations in this context are done as if 483 * GFP_NOIO was specified. 484 */ 485 noio_flag = memalloc_noio_save(); 486 ret = disk->fops->report_zones(disk, 0, UINT_MAX, 487 blk_revalidate_zone_cb, &args); 488 memalloc_noio_restore(noio_flag); 489 490 /* 491 * Install the new bitmaps and update nr_zones only once the queue is 492 * stopped and all I/Os are completed (i.e. a scheduler is not 493 * referencing the bitmaps). 494 */ 495 blk_mq_freeze_queue(q); 496 if (ret >= 0) { 497 blk_queue_chunk_sectors(q, args.zone_sectors); 498 q->nr_zones = args.nr_zones; 499 swap(q->seq_zones_wlock, args.seq_zones_wlock); 500 swap(q->conv_zones_bitmap, args.conv_zones_bitmap); 501 ret = 0; 502 } else { 503 pr_warn("%s: failed to revalidate zones\n", disk->disk_name); 504 blk_queue_free_zone_bitmaps(q); 505 } 506 blk_mq_unfreeze_queue(q); 507 508 kfree(args.seq_zones_wlock); 509 kfree(args.conv_zones_bitmap); 510 return ret; 511 } 512 EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); 513