1 /* 2 * Zoned block device handling 3 * 4 * Copyright (c) 2015, Hannes Reinecke 5 * Copyright (c) 2015, SUSE Linux GmbH 6 * 7 * Copyright (c) 2016, Damien Le Moal 8 * Copyright (c) 2016, Western Digital 9 */ 10 11 #include <linux/kernel.h> 12 #include <linux/module.h> 13 #include <linux/rbtree.h> 14 #include <linux/blkdev.h> 15 #include <linux/blk-mq.h> 16 17 #include "blk.h" 18 19 static inline sector_t blk_zone_start(struct request_queue *q, 20 sector_t sector) 21 { 22 sector_t zone_mask = blk_queue_zone_sectors(q) - 1; 23 24 return sector & ~zone_mask; 25 } 26 27 /* 28 * Return true if a request is a write requests that needs zone write locking. 29 */ 30 bool blk_req_needs_zone_write_lock(struct request *rq) 31 { 32 if (!rq->q->seq_zones_wlock) 33 return false; 34 35 if (blk_rq_is_passthrough(rq)) 36 return false; 37 38 switch (req_op(rq)) { 39 case REQ_OP_WRITE_ZEROES: 40 case REQ_OP_WRITE_SAME: 41 case REQ_OP_WRITE: 42 return blk_rq_zone_is_seq(rq); 43 default: 44 return false; 45 } 46 } 47 EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock); 48 49 void __blk_req_zone_write_lock(struct request *rq) 50 { 51 if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq), 52 rq->q->seq_zones_wlock))) 53 return; 54 55 WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED); 56 rq->rq_flags |= RQF_ZONE_WRITE_LOCKED; 57 } 58 EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock); 59 60 void __blk_req_zone_write_unlock(struct request *rq) 61 { 62 rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED; 63 if (rq->q->seq_zones_wlock) 64 WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq), 65 rq->q->seq_zones_wlock)); 66 } 67 EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock); 68 69 static inline unsigned int __blkdev_nr_zones(struct request_queue *q, 70 sector_t nr_sectors) 71 { 72 unsigned long zone_sectors = blk_queue_zone_sectors(q); 73 74 return (nr_sectors + zone_sectors - 1) >> ilog2(zone_sectors); 75 } 76 77 /** 78 * blkdev_nr_zones - Get number of zones 79 * @bdev: Target block device 80 * 81 * Description: 82 * Return the total number of zones of a zoned block device. 83 * For a regular block device, the number of zones is always 0. 84 */ 85 unsigned int blkdev_nr_zones(struct block_device *bdev) 86 { 87 struct request_queue *q = bdev_get_queue(bdev); 88 89 if (!blk_queue_is_zoned(q)) 90 return 0; 91 92 return __blkdev_nr_zones(q, bdev->bd_part->nr_sects); 93 } 94 EXPORT_SYMBOL_GPL(blkdev_nr_zones); 95 96 /* 97 * Check that a zone report belongs to this partition, and if yes, fix its start 98 * sector and write pointer and return true. Return false otherwise. 99 */ 100 static bool blkdev_report_zone(struct block_device *bdev, struct blk_zone *rep) 101 { 102 sector_t offset = get_start_sect(bdev); 103 104 if (rep->start < offset) 105 return false; 106 107 rep->start -= offset; 108 if (rep->start + rep->len > bdev->bd_part->nr_sects) 109 return false; 110 111 if (rep->type == BLK_ZONE_TYPE_CONVENTIONAL) 112 rep->wp = rep->start + rep->len; 113 else 114 rep->wp -= offset; 115 return true; 116 } 117 118 static int blk_report_zones(struct gendisk *disk, sector_t sector, 119 struct blk_zone *zones, unsigned int *nr_zones, 120 gfp_t gfp_mask) 121 { 122 struct request_queue *q = disk->queue; 123 unsigned int z = 0, n, nrz = *nr_zones; 124 sector_t capacity = get_capacity(disk); 125 int ret; 126 127 while (z < nrz && sector < capacity) { 128 n = nrz - z; 129 ret = disk->fops->report_zones(disk, sector, &zones[z], &n, 130 gfp_mask); 131 if (ret) 132 return ret; 133 if (!n) 134 break; 135 sector += blk_queue_zone_sectors(q) * n; 136 z += n; 137 } 138 139 WARN_ON(z > *nr_zones); 140 *nr_zones = z; 141 142 return 0; 143 } 144 145 /** 146 * blkdev_report_zones - Get zones information 147 * @bdev: Target block device 148 * @sector: Sector from which to report zones 149 * @zones: Array of zone structures where to return the zones information 150 * @nr_zones: Number of zone structures in the zone array 151 * @gfp_mask: Memory allocation flags (for bio_alloc) 152 * 153 * Description: 154 * Get zone information starting from the zone containing @sector. 155 * The number of zone information reported may be less than the number 156 * requested by @nr_zones. The number of zones actually reported is 157 * returned in @nr_zones. 158 */ 159 int blkdev_report_zones(struct block_device *bdev, sector_t sector, 160 struct blk_zone *zones, unsigned int *nr_zones, 161 gfp_t gfp_mask) 162 { 163 struct request_queue *q = bdev_get_queue(bdev); 164 unsigned int i, nrz; 165 int ret; 166 167 if (!blk_queue_is_zoned(q)) 168 return -EOPNOTSUPP; 169 170 /* 171 * A block device that advertized itself as zoned must have a 172 * report_zones method. If it does not have one defined, the device 173 * driver has a bug. So warn about that. 174 */ 175 if (WARN_ON_ONCE(!bdev->bd_disk->fops->report_zones)) 176 return -EOPNOTSUPP; 177 178 if (!*nr_zones || sector >= bdev->bd_part->nr_sects) { 179 *nr_zones = 0; 180 return 0; 181 } 182 183 nrz = min(*nr_zones, 184 __blkdev_nr_zones(q, bdev->bd_part->nr_sects - sector)); 185 ret = blk_report_zones(bdev->bd_disk, get_start_sect(bdev) + sector, 186 zones, &nrz, gfp_mask); 187 if (ret) 188 return ret; 189 190 for (i = 0; i < nrz; i++) { 191 if (!blkdev_report_zone(bdev, zones)) 192 break; 193 zones++; 194 } 195 196 *nr_zones = i; 197 198 return 0; 199 } 200 EXPORT_SYMBOL_GPL(blkdev_report_zones); 201 202 /** 203 * blkdev_reset_zones - Reset zones write pointer 204 * @bdev: Target block device 205 * @sector: Start sector of the first zone to reset 206 * @nr_sectors: Number of sectors, at least the length of one zone 207 * @gfp_mask: Memory allocation flags (for bio_alloc) 208 * 209 * Description: 210 * Reset the write pointer of the zones contained in the range 211 * @sector..@sector+@nr_sectors. Specifying the entire disk sector range 212 * is valid, but the specified range should not contain conventional zones. 213 */ 214 int blkdev_reset_zones(struct block_device *bdev, 215 sector_t sector, sector_t nr_sectors, 216 gfp_t gfp_mask) 217 { 218 struct request_queue *q = bdev_get_queue(bdev); 219 sector_t zone_sectors; 220 sector_t end_sector = sector + nr_sectors; 221 struct bio *bio = NULL; 222 struct blk_plug plug; 223 int ret; 224 225 if (!blk_queue_is_zoned(q)) 226 return -EOPNOTSUPP; 227 228 if (bdev_read_only(bdev)) 229 return -EPERM; 230 231 if (!nr_sectors || end_sector > bdev->bd_part->nr_sects) 232 /* Out of range */ 233 return -EINVAL; 234 235 /* Check alignment (handle eventual smaller last zone) */ 236 zone_sectors = blk_queue_zone_sectors(q); 237 if (sector & (zone_sectors - 1)) 238 return -EINVAL; 239 240 if ((nr_sectors & (zone_sectors - 1)) && 241 end_sector != bdev->bd_part->nr_sects) 242 return -EINVAL; 243 244 blk_start_plug(&plug); 245 while (sector < end_sector) { 246 247 bio = blk_next_bio(bio, 0, gfp_mask); 248 bio->bi_iter.bi_sector = sector; 249 bio_set_dev(bio, bdev); 250 bio_set_op_attrs(bio, REQ_OP_ZONE_RESET, 0); 251 252 sector += zone_sectors; 253 254 /* This may take a while, so be nice to others */ 255 cond_resched(); 256 257 } 258 259 ret = submit_bio_wait(bio); 260 bio_put(bio); 261 262 blk_finish_plug(&plug); 263 264 return ret; 265 } 266 EXPORT_SYMBOL_GPL(blkdev_reset_zones); 267 268 /* 269 * BLKREPORTZONE ioctl processing. 270 * Called from blkdev_ioctl. 271 */ 272 int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, 273 unsigned int cmd, unsigned long arg) 274 { 275 void __user *argp = (void __user *)arg; 276 struct request_queue *q; 277 struct blk_zone_report rep; 278 struct blk_zone *zones; 279 int ret; 280 281 if (!argp) 282 return -EINVAL; 283 284 q = bdev_get_queue(bdev); 285 if (!q) 286 return -ENXIO; 287 288 if (!blk_queue_is_zoned(q)) 289 return -ENOTTY; 290 291 if (!capable(CAP_SYS_ADMIN)) 292 return -EACCES; 293 294 if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report))) 295 return -EFAULT; 296 297 if (!rep.nr_zones) 298 return -EINVAL; 299 300 rep.nr_zones = min(blkdev_nr_zones(bdev), rep.nr_zones); 301 302 zones = kvmalloc_array(rep.nr_zones, sizeof(struct blk_zone), 303 GFP_KERNEL | __GFP_ZERO); 304 if (!zones) 305 return -ENOMEM; 306 307 ret = blkdev_report_zones(bdev, rep.sector, 308 zones, &rep.nr_zones, 309 GFP_KERNEL); 310 if (ret) 311 goto out; 312 313 if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report))) { 314 ret = -EFAULT; 315 goto out; 316 } 317 318 if (rep.nr_zones) { 319 if (copy_to_user(argp + sizeof(struct blk_zone_report), zones, 320 sizeof(struct blk_zone) * rep.nr_zones)) 321 ret = -EFAULT; 322 } 323 324 out: 325 kvfree(zones); 326 327 return ret; 328 } 329 330 /* 331 * BLKRESETZONE ioctl processing. 332 * Called from blkdev_ioctl. 333 */ 334 int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode, 335 unsigned int cmd, unsigned long arg) 336 { 337 void __user *argp = (void __user *)arg; 338 struct request_queue *q; 339 struct blk_zone_range zrange; 340 341 if (!argp) 342 return -EINVAL; 343 344 q = bdev_get_queue(bdev); 345 if (!q) 346 return -ENXIO; 347 348 if (!blk_queue_is_zoned(q)) 349 return -ENOTTY; 350 351 if (!capable(CAP_SYS_ADMIN)) 352 return -EACCES; 353 354 if (!(mode & FMODE_WRITE)) 355 return -EBADF; 356 357 if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range))) 358 return -EFAULT; 359 360 return blkdev_reset_zones(bdev, zrange.sector, zrange.nr_sectors, 361 GFP_KERNEL); 362 } 363 364 static inline unsigned long *blk_alloc_zone_bitmap(int node, 365 unsigned int nr_zones) 366 { 367 return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long), 368 GFP_NOIO, node); 369 } 370 371 /* 372 * Allocate an array of struct blk_zone to get nr_zones zone information. 373 * The allocated array may be smaller than nr_zones. 374 */ 375 static struct blk_zone *blk_alloc_zones(int node, unsigned int *nr_zones) 376 { 377 size_t size = *nr_zones * sizeof(struct blk_zone); 378 struct page *page; 379 int order; 380 381 for (order = get_order(size); order > 0; order--) { 382 page = alloc_pages_node(node, GFP_NOIO | __GFP_ZERO, order); 383 if (page) { 384 *nr_zones = min_t(unsigned int, *nr_zones, 385 (PAGE_SIZE << order) / sizeof(struct blk_zone)); 386 return page_address(page); 387 } 388 } 389 390 return NULL; 391 } 392 393 void blk_queue_free_zone_bitmaps(struct request_queue *q) 394 { 395 kfree(q->seq_zones_bitmap); 396 q->seq_zones_bitmap = NULL; 397 kfree(q->seq_zones_wlock); 398 q->seq_zones_wlock = NULL; 399 } 400 401 /** 402 * blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps 403 * @disk: Target disk 404 * 405 * Helper function for low-level device drivers to (re) allocate and initialize 406 * a disk request queue zone bitmaps. This functions should normally be called 407 * within the disk ->revalidate method. For BIO based queues, no zone bitmap 408 * is allocated. 409 */ 410 int blk_revalidate_disk_zones(struct gendisk *disk) 411 { 412 struct request_queue *q = disk->queue; 413 unsigned int nr_zones = __blkdev_nr_zones(q, get_capacity(disk)); 414 unsigned long *seq_zones_wlock = NULL, *seq_zones_bitmap = NULL; 415 unsigned int i, rep_nr_zones = 0, z = 0, nrz; 416 struct blk_zone *zones = NULL; 417 sector_t sector = 0; 418 int ret = 0; 419 420 /* 421 * BIO based queues do not use a scheduler so only q->nr_zones 422 * needs to be updated so that the sysfs exposed value is correct. 423 */ 424 if (!queue_is_rq_based(q)) { 425 q->nr_zones = nr_zones; 426 return 0; 427 } 428 429 if (!blk_queue_is_zoned(q) || !nr_zones) { 430 nr_zones = 0; 431 goto update; 432 } 433 434 /* Allocate bitmaps */ 435 ret = -ENOMEM; 436 seq_zones_wlock = blk_alloc_zone_bitmap(q->node, nr_zones); 437 if (!seq_zones_wlock) 438 goto out; 439 seq_zones_bitmap = blk_alloc_zone_bitmap(q->node, nr_zones); 440 if (!seq_zones_bitmap) 441 goto out; 442 443 /* Get zone information and initialize seq_zones_bitmap */ 444 rep_nr_zones = nr_zones; 445 zones = blk_alloc_zones(q->node, &rep_nr_zones); 446 if (!zones) 447 goto out; 448 449 while (z < nr_zones) { 450 nrz = min(nr_zones - z, rep_nr_zones); 451 ret = blk_report_zones(disk, sector, zones, &nrz, GFP_NOIO); 452 if (ret) 453 goto out; 454 if (!nrz) 455 break; 456 for (i = 0; i < nrz; i++) { 457 if (zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL) 458 set_bit(z, seq_zones_bitmap); 459 z++; 460 } 461 sector += nrz * blk_queue_zone_sectors(q); 462 } 463 464 if (WARN_ON(z != nr_zones)) { 465 ret = -EIO; 466 goto out; 467 } 468 469 update: 470 /* 471 * Install the new bitmaps, making sure the queue is stopped and 472 * all I/Os are completed (i.e. a scheduler is not referencing the 473 * bitmaps). 474 */ 475 blk_mq_freeze_queue(q); 476 q->nr_zones = nr_zones; 477 swap(q->seq_zones_wlock, seq_zones_wlock); 478 swap(q->seq_zones_bitmap, seq_zones_bitmap); 479 blk_mq_unfreeze_queue(q); 480 481 out: 482 free_pages((unsigned long)zones, 483 get_order(rep_nr_zones * sizeof(struct blk_zone))); 484 kfree(seq_zones_wlock); 485 kfree(seq_zones_bitmap); 486 487 if (ret) { 488 pr_warn("%s: failed to revalidate zones\n", disk->disk_name); 489 blk_mq_freeze_queue(q); 490 blk_queue_free_zone_bitmaps(q); 491 blk_mq_unfreeze_queue(q); 492 } 493 494 return ret; 495 } 496 EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); 497 498