1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Add configfs and memory store: Kyungchan Koh <kkc6196@fb.com> and 4 * Shaohua Li <shli@fb.com> 5 */ 6 #include <linux/module.h> 7 8 #include <linux/moduleparam.h> 9 #include <linux/sched.h> 10 #include <linux/fs.h> 11 #include <linux/init.h> 12 #include "null_blk.h" 13 14 #undef pr_fmt 15 #define pr_fmt(fmt) "null_blk: " fmt 16 17 #define FREE_BATCH 16 18 19 #define TICKS_PER_SEC 50ULL 20 #define TIMER_INTERVAL (NSEC_PER_SEC / TICKS_PER_SEC) 21 22 #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION 23 static DECLARE_FAULT_ATTR(null_timeout_attr); 24 static DECLARE_FAULT_ATTR(null_requeue_attr); 25 static DECLARE_FAULT_ATTR(null_init_hctx_attr); 26 #endif 27 28 static inline u64 mb_per_tick(int mbps) 29 { 30 return (1 << 20) / TICKS_PER_SEC * ((u64) mbps); 31 } 32 33 /* 34 * Status flags for nullb_device. 35 * 36 * CONFIGURED: Device has been configured and turned on. Cannot reconfigure. 37 * UP: Device is currently on and visible in userspace. 38 * THROTTLED: Device is being throttled. 39 * CACHE: Device is using a write-back cache. 40 */ 41 enum nullb_device_flags { 42 NULLB_DEV_FL_CONFIGURED = 0, 43 NULLB_DEV_FL_UP = 1, 44 NULLB_DEV_FL_THROTTLED = 2, 45 NULLB_DEV_FL_CACHE = 3, 46 }; 47 48 #define MAP_SZ ((PAGE_SIZE >> SECTOR_SHIFT) + 2) 49 /* 50 * nullb_page is a page in memory for nullb devices. 51 * 52 * @page: The page holding the data. 53 * @bitmap: The bitmap represents which sector in the page has data. 54 * Each bit represents one block size. For example, sector 8 55 * will use the 7th bit 56 * The highest 2 bits of bitmap are for special purpose. LOCK means the cache 57 * page is being flushing to storage. FREE means the cache page is freed and 58 * should be skipped from flushing to storage. Please see 59 * null_make_cache_space 60 */ 61 struct nullb_page { 62 struct page *page; 63 DECLARE_BITMAP(bitmap, MAP_SZ); 64 }; 65 #define NULLB_PAGE_LOCK (MAP_SZ - 1) 66 #define NULLB_PAGE_FREE (MAP_SZ - 2) 67 68 static LIST_HEAD(nullb_list); 69 static struct mutex lock; 70 static int null_major; 71 static DEFINE_IDA(nullb_indexes); 72 static struct blk_mq_tag_set tag_set; 73 74 enum { 75 NULL_IRQ_NONE = 0, 76 NULL_IRQ_SOFTIRQ = 1, 77 NULL_IRQ_TIMER = 2, 78 }; 79 80 static bool g_virt_boundary = false; 81 module_param_named(virt_boundary, g_virt_boundary, bool, 0444); 82 MODULE_PARM_DESC(virt_boundary, "Require a virtual boundary for the device. Default: False"); 83 84 static int g_no_sched; 85 module_param_named(no_sched, g_no_sched, int, 0444); 86 MODULE_PARM_DESC(no_sched, "No io scheduler"); 87 88 static int g_submit_queues = 1; 89 module_param_named(submit_queues, g_submit_queues, int, 0444); 90 MODULE_PARM_DESC(submit_queues, "Number of submission queues"); 91 92 static int g_poll_queues = 1; 93 module_param_named(poll_queues, g_poll_queues, int, 0444); 94 MODULE_PARM_DESC(poll_queues, "Number of IOPOLL submission queues"); 95 96 static int g_home_node = NUMA_NO_NODE; 97 module_param_named(home_node, g_home_node, int, 0444); 98 MODULE_PARM_DESC(home_node, "Home node for the device"); 99 100 #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION 101 /* 102 * For more details about fault injection, please refer to 103 * Documentation/fault-injection/fault-injection.rst. 104 */ 105 static char g_timeout_str[80]; 106 module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), 0444); 107 MODULE_PARM_DESC(timeout, "Fault injection. 
timeout=<interval>,<probability>,<space>,<times>"); 108 109 static char g_requeue_str[80]; 110 module_param_string(requeue, g_requeue_str, sizeof(g_requeue_str), 0444); 111 MODULE_PARM_DESC(requeue, "Fault injection. requeue=<interval>,<probability>,<space>,<times>"); 112 113 static char g_init_hctx_str[80]; 114 module_param_string(init_hctx, g_init_hctx_str, sizeof(g_init_hctx_str), 0444); 115 MODULE_PARM_DESC(init_hctx, "Fault injection to fail hctx init. init_hctx=<interval>,<probability>,<space>,<times>"); 116 #endif 117 118 static int g_queue_mode = NULL_Q_MQ; 119 120 static int null_param_store_val(const char *str, int *val, int min, int max) 121 { 122 int ret, new_val; 123 124 ret = kstrtoint(str, 10, &new_val); 125 if (ret) 126 return -EINVAL; 127 128 if (new_val < min || new_val > max) 129 return -EINVAL; 130 131 *val = new_val; 132 return 0; 133 } 134 135 static int null_set_queue_mode(const char *str, const struct kernel_param *kp) 136 { 137 return null_param_store_val(str, &g_queue_mode, NULL_Q_BIO, NULL_Q_MQ); 138 } 139 140 static const struct kernel_param_ops null_queue_mode_param_ops = { 141 .set = null_set_queue_mode, 142 .get = param_get_int, 143 }; 144 145 device_param_cb(queue_mode, &null_queue_mode_param_ops, &g_queue_mode, 0444); 146 MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)"); 147 148 static int g_gb = 250; 149 module_param_named(gb, g_gb, int, 0444); 150 MODULE_PARM_DESC(gb, "Size in GB"); 151 152 static int g_bs = 512; 153 module_param_named(bs, g_bs, int, 0444); 154 MODULE_PARM_DESC(bs, "Block size (in bytes)"); 155 156 static int g_max_sectors; 157 module_param_named(max_sectors, g_max_sectors, int, 0444); 158 MODULE_PARM_DESC(max_sectors, "Maximum size of a command (in 512B sectors)"); 159 160 static unsigned int nr_devices = 1; 161 module_param(nr_devices, uint, 0444); 162 MODULE_PARM_DESC(nr_devices, "Number of devices to register"); 163 164 static bool g_blocking; 165 module_param_named(blocking, g_blocking, bool, 0444); 166 MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device"); 167 168 static bool shared_tags; 169 module_param(shared_tags, bool, 0444); 170 MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq"); 171 172 static bool g_shared_tag_bitmap; 173 module_param_named(shared_tag_bitmap, g_shared_tag_bitmap, bool, 0444); 174 MODULE_PARM_DESC(shared_tag_bitmap, "Use shared tag bitmap for all submission queues for blk-mq"); 175 176 static int g_irqmode = NULL_IRQ_SOFTIRQ; 177 178 static int null_set_irqmode(const char *str, const struct kernel_param *kp) 179 { 180 return null_param_store_val(str, &g_irqmode, NULL_IRQ_NONE, 181 NULL_IRQ_TIMER); 182 } 183 184 static const struct kernel_param_ops null_irqmode_param_ops = { 185 .set = null_set_irqmode, 186 .get = param_get_int, 187 }; 188 189 device_param_cb(irqmode, &null_irqmode_param_ops, &g_irqmode, 0444); 190 MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer"); 191 192 static unsigned long g_completion_nsec = 10000; 193 module_param_named(completion_nsec, g_completion_nsec, ulong, 0444); 194 MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns"); 195 196 static int g_hw_queue_depth = 64; 197 module_param_named(hw_queue_depth, g_hw_queue_depth, int, 0444); 198 MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. 
Default: 64"); 199 200 static bool g_use_per_node_hctx; 201 module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444); 202 MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false"); 203 204 static bool g_memory_backed; 205 module_param_named(memory_backed, g_memory_backed, bool, 0444); 206 MODULE_PARM_DESC(memory_backed, "Create a memory-backed block device. Default: false"); 207 208 static bool g_discard; 209 module_param_named(discard, g_discard, bool, 0444); 210 MODULE_PARM_DESC(discard, "Support discard operations (requires memory-backed null_blk device). Default: false"); 211 212 static unsigned long g_cache_size; 213 module_param_named(cache_size, g_cache_size, ulong, 0444); 214 MODULE_PARM_DESC(mbps, "Cache size in MiB for memory-backed device. Default: 0 (none)"); 215 216 static unsigned int g_mbps; 217 module_param_named(mbps, g_mbps, uint, 0444); 218 MODULE_PARM_DESC(mbps, "Limit maximum bandwidth (in MiB/s). Default: 0 (no limit)"); 219 220 static bool g_zoned; 221 module_param_named(zoned, g_zoned, bool, S_IRUGO); 222 MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Default: false"); 223 224 static unsigned long g_zone_size = 256; 225 module_param_named(zone_size, g_zone_size, ulong, S_IRUGO); 226 MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256"); 227 228 static unsigned long g_zone_capacity; 229 module_param_named(zone_capacity, g_zone_capacity, ulong, 0444); 230 MODULE_PARM_DESC(zone_capacity, "Zone capacity in MB when block device is zoned. Can be less than or equal to zone size. Default: Zone size"); 231 232 static unsigned int g_zone_nr_conv; 233 module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444); 234 MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block device is zoned. Default: 0"); 235 236 static unsigned int g_zone_max_open; 237 module_param_named(zone_max_open, g_zone_max_open, uint, 0444); 238 MODULE_PARM_DESC(zone_max_open, "Maximum number of open zones when block device is zoned. Default: 0 (no limit)"); 239 240 static unsigned int g_zone_max_active; 241 module_param_named(zone_max_active, g_zone_max_active, uint, 0444); 242 MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block device is zoned. Default: 0 (no limit)"); 243 244 static struct nullb_device *null_alloc_dev(void); 245 static void null_free_dev(struct nullb_device *dev); 246 static void null_del_dev(struct nullb *nullb); 247 static int null_add_dev(struct nullb_device *dev); 248 static struct nullb *null_find_dev_by_name(const char *name); 249 static void null_free_device_storage(struct nullb_device *dev, bool is_cache); 250 251 static inline struct nullb_device *to_nullb_device(struct config_item *item) 252 { 253 return item ? 
container_of(item, struct nullb_device, item) : NULL; 254 } 255 256 static inline ssize_t nullb_device_uint_attr_show(unsigned int val, char *page) 257 { 258 return snprintf(page, PAGE_SIZE, "%u\n", val); 259 } 260 261 static inline ssize_t nullb_device_ulong_attr_show(unsigned long val, 262 char *page) 263 { 264 return snprintf(page, PAGE_SIZE, "%lu\n", val); 265 } 266 267 static inline ssize_t nullb_device_bool_attr_show(bool val, char *page) 268 { 269 return snprintf(page, PAGE_SIZE, "%u\n", val); 270 } 271 272 static ssize_t nullb_device_uint_attr_store(unsigned int *val, 273 const char *page, size_t count) 274 { 275 unsigned int tmp; 276 int result; 277 278 result = kstrtouint(page, 0, &tmp); 279 if (result < 0) 280 return result; 281 282 *val = tmp; 283 return count; 284 } 285 286 static ssize_t nullb_device_ulong_attr_store(unsigned long *val, 287 const char *page, size_t count) 288 { 289 int result; 290 unsigned long tmp; 291 292 result = kstrtoul(page, 0, &tmp); 293 if (result < 0) 294 return result; 295 296 *val = tmp; 297 return count; 298 } 299 300 static ssize_t nullb_device_bool_attr_store(bool *val, const char *page, 301 size_t count) 302 { 303 bool tmp; 304 int result; 305 306 result = kstrtobool(page, &tmp); 307 if (result < 0) 308 return result; 309 310 *val = tmp; 311 return count; 312 } 313 314 /* The following macro should only be used with TYPE = {uint, ulong, bool}. */ 315 #define NULLB_DEVICE_ATTR(NAME, TYPE, APPLY) \ 316 static ssize_t \ 317 nullb_device_##NAME##_show(struct config_item *item, char *page) \ 318 { \ 319 return nullb_device_##TYPE##_attr_show( \ 320 to_nullb_device(item)->NAME, page); \ 321 } \ 322 static ssize_t \ 323 nullb_device_##NAME##_store(struct config_item *item, const char *page, \ 324 size_t count) \ 325 { \ 326 int (*apply_fn)(struct nullb_device *dev, TYPE new_value) = APPLY;\ 327 struct nullb_device *dev = to_nullb_device(item); \ 328 TYPE new_value = 0; \ 329 int ret; \ 330 \ 331 ret = nullb_device_##TYPE##_attr_store(&new_value, page, count);\ 332 if (ret < 0) \ 333 return ret; \ 334 if (apply_fn) \ 335 ret = apply_fn(dev, new_value); \ 336 else if (test_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags)) \ 337 ret = -EBUSY; \ 338 if (ret < 0) \ 339 return ret; \ 340 dev->NAME = new_value; \ 341 return count; \ 342 } \ 343 CONFIGFS_ATTR(nullb_device_, NAME); 344 345 static int nullb_update_nr_hw_queues(struct nullb_device *dev, 346 unsigned int submit_queues, 347 unsigned int poll_queues) 348 349 { 350 struct blk_mq_tag_set *set; 351 int ret, nr_hw_queues; 352 353 if (!dev->nullb) 354 return 0; 355 356 /* 357 * Make sure at least one submit queue exists. 358 */ 359 if (!submit_queues) 360 return -EINVAL; 361 362 /* 363 * Make sure that null_init_hctx() does not access nullb->queues[] past 364 * the end of that array. 365 */ 366 if (submit_queues > nr_cpu_ids || poll_queues > g_poll_queues) 367 return -EINVAL; 368 369 /* 370 * Keep previous and new queue numbers in nullb_device for reference in 371 * the call back function null_map_queues(). 372 */ 373 dev->prev_submit_queues = dev->submit_queues; 374 dev->prev_poll_queues = dev->poll_queues; 375 dev->submit_queues = submit_queues; 376 dev->poll_queues = poll_queues; 377 378 set = dev->nullb->tag_set; 379 nr_hw_queues = submit_queues + poll_queues; 380 blk_mq_update_nr_hw_queues(set, nr_hw_queues); 381 ret = set->nr_hw_queues == nr_hw_queues ? 
0 : -ENOMEM; 382 383 if (ret) { 384 /* on error, revert the queue numbers */ 385 dev->submit_queues = dev->prev_submit_queues; 386 dev->poll_queues = dev->prev_poll_queues; 387 } 388 389 return ret; 390 } 391 392 static int nullb_apply_submit_queues(struct nullb_device *dev, 393 unsigned int submit_queues) 394 { 395 return nullb_update_nr_hw_queues(dev, submit_queues, dev->poll_queues); 396 } 397 398 static int nullb_apply_poll_queues(struct nullb_device *dev, 399 unsigned int poll_queues) 400 { 401 return nullb_update_nr_hw_queues(dev, dev->submit_queues, poll_queues); 402 } 403 404 NULLB_DEVICE_ATTR(size, ulong, NULL); 405 NULLB_DEVICE_ATTR(completion_nsec, ulong, NULL); 406 NULLB_DEVICE_ATTR(submit_queues, uint, nullb_apply_submit_queues); 407 NULLB_DEVICE_ATTR(poll_queues, uint, nullb_apply_poll_queues); 408 NULLB_DEVICE_ATTR(home_node, uint, NULL); 409 NULLB_DEVICE_ATTR(queue_mode, uint, NULL); 410 NULLB_DEVICE_ATTR(blocksize, uint, NULL); 411 NULLB_DEVICE_ATTR(max_sectors, uint, NULL); 412 NULLB_DEVICE_ATTR(irqmode, uint, NULL); 413 NULLB_DEVICE_ATTR(hw_queue_depth, uint, NULL); 414 NULLB_DEVICE_ATTR(index, uint, NULL); 415 NULLB_DEVICE_ATTR(blocking, bool, NULL); 416 NULLB_DEVICE_ATTR(use_per_node_hctx, bool, NULL); 417 NULLB_DEVICE_ATTR(memory_backed, bool, NULL); 418 NULLB_DEVICE_ATTR(discard, bool, NULL); 419 NULLB_DEVICE_ATTR(mbps, uint, NULL); 420 NULLB_DEVICE_ATTR(cache_size, ulong, NULL); 421 NULLB_DEVICE_ATTR(zoned, bool, NULL); 422 NULLB_DEVICE_ATTR(zone_size, ulong, NULL); 423 NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL); 424 NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL); 425 NULLB_DEVICE_ATTR(zone_max_open, uint, NULL); 426 NULLB_DEVICE_ATTR(zone_max_active, uint, NULL); 427 NULLB_DEVICE_ATTR(virt_boundary, bool, NULL); 428 NULLB_DEVICE_ATTR(no_sched, bool, NULL); 429 NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL); 430 431 static ssize_t nullb_device_power_show(struct config_item *item, char *page) 432 { 433 return nullb_device_bool_attr_show(to_nullb_device(item)->power, page); 434 } 435 436 static ssize_t nullb_device_power_store(struct config_item *item, 437 const char *page, size_t count) 438 { 439 struct nullb_device *dev = to_nullb_device(item); 440 bool newp = false; 441 ssize_t ret; 442 443 ret = nullb_device_bool_attr_store(&newp, page, count); 444 if (ret < 0) 445 return ret; 446 447 if (!dev->power && newp) { 448 if (test_and_set_bit(NULLB_DEV_FL_UP, &dev->flags)) 449 return count; 450 ret = null_add_dev(dev); 451 if (ret) { 452 clear_bit(NULLB_DEV_FL_UP, &dev->flags); 453 return ret; 454 } 455 456 set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); 457 dev->power = newp; 458 } else if (dev->power && !newp) { 459 if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) { 460 mutex_lock(&lock); 461 dev->power = newp; 462 null_del_dev(dev->nullb); 463 mutex_unlock(&lock); 464 } 465 clear_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); 466 } 467 468 return count; 469 } 470 471 CONFIGFS_ATTR(nullb_device_, power); 472 473 static ssize_t nullb_device_badblocks_show(struct config_item *item, char *page) 474 { 475 struct nullb_device *t_dev = to_nullb_device(item); 476 477 return badblocks_show(&t_dev->badblocks, page, 0); 478 } 479 480 static ssize_t nullb_device_badblocks_store(struct config_item *item, 481 const char *page, size_t count) 482 { 483 struct nullb_device *t_dev = to_nullb_device(item); 484 char *orig, *buf, *tmp; 485 u64 start, end; 486 int ret; 487 488 orig = kstrndup(page, count, GFP_KERNEL); 489 if (!orig) 490 return -ENOMEM; 491 492 buf = strstrip(orig); 493 
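	/*
	 * Accepted syntax: "+<start>-<end>" marks the sector range as bad,
	 * "-<start>-<end>" clears it again; both bounds are inclusive.
	 */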
494 ret = -EINVAL; 495 if (buf[0] != '+' && buf[0] != '-') 496 goto out; 497 tmp = strchr(&buf[1], '-'); 498 if (!tmp) 499 goto out; 500 *tmp = '\0'; 501 ret = kstrtoull(buf + 1, 0, &start); 502 if (ret) 503 goto out; 504 ret = kstrtoull(tmp + 1, 0, &end); 505 if (ret) 506 goto out; 507 ret = -EINVAL; 508 if (start > end) 509 goto out; 510 /* enable badblocks */ 511 cmpxchg(&t_dev->badblocks.shift, -1, 0); 512 if (buf[0] == '+') 513 ret = badblocks_set(&t_dev->badblocks, start, 514 end - start + 1, 1); 515 else 516 ret = badblocks_clear(&t_dev->badblocks, start, 517 end - start + 1); 518 if (ret == 0) 519 ret = count; 520 out: 521 kfree(orig); 522 return ret; 523 } 524 CONFIGFS_ATTR(nullb_device_, badblocks); 525 526 static struct configfs_attribute *nullb_device_attrs[] = { 527 &nullb_device_attr_size, 528 &nullb_device_attr_completion_nsec, 529 &nullb_device_attr_submit_queues, 530 &nullb_device_attr_poll_queues, 531 &nullb_device_attr_home_node, 532 &nullb_device_attr_queue_mode, 533 &nullb_device_attr_blocksize, 534 &nullb_device_attr_max_sectors, 535 &nullb_device_attr_irqmode, 536 &nullb_device_attr_hw_queue_depth, 537 &nullb_device_attr_index, 538 &nullb_device_attr_blocking, 539 &nullb_device_attr_use_per_node_hctx, 540 &nullb_device_attr_power, 541 &nullb_device_attr_memory_backed, 542 &nullb_device_attr_discard, 543 &nullb_device_attr_mbps, 544 &nullb_device_attr_cache_size, 545 &nullb_device_attr_badblocks, 546 &nullb_device_attr_zoned, 547 &nullb_device_attr_zone_size, 548 &nullb_device_attr_zone_capacity, 549 &nullb_device_attr_zone_nr_conv, 550 &nullb_device_attr_zone_max_open, 551 &nullb_device_attr_zone_max_active, 552 &nullb_device_attr_virt_boundary, 553 &nullb_device_attr_no_sched, 554 &nullb_device_attr_shared_tag_bitmap, 555 NULL, 556 }; 557 558 static void nullb_device_release(struct config_item *item) 559 { 560 struct nullb_device *dev = to_nullb_device(item); 561 562 null_free_device_storage(dev, false); 563 null_free_dev(dev); 564 } 565 566 static struct configfs_item_operations nullb_device_ops = { 567 .release = nullb_device_release, 568 }; 569 570 static const struct config_item_type nullb_device_type = { 571 .ct_item_ops = &nullb_device_ops, 572 .ct_attrs = nullb_device_attrs, 573 .ct_owner = THIS_MODULE, 574 }; 575 576 static struct 577 config_item *nullb_group_make_item(struct config_group *group, const char *name) 578 { 579 struct nullb_device *dev; 580 581 if (null_find_dev_by_name(name)) 582 return ERR_PTR(-EEXIST); 583 584 dev = null_alloc_dev(); 585 if (!dev) 586 return ERR_PTR(-ENOMEM); 587 588 config_item_init_type_name(&dev->item, name, &nullb_device_type); 589 590 return &dev->item; 591 } 592 593 static void 594 nullb_group_drop_item(struct config_group *group, struct config_item *item) 595 { 596 struct nullb_device *dev = to_nullb_device(item); 597 598 if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) { 599 mutex_lock(&lock); 600 dev->power = false; 601 null_del_dev(dev->nullb); 602 mutex_unlock(&lock); 603 } 604 605 config_item_put(item); 606 } 607 608 static ssize_t memb_group_features_show(struct config_item *item, char *page) 609 { 610 return snprintf(page, PAGE_SIZE, 611 "badblocks,blocking,blocksize,cache_size," 612 "completion_nsec,discard,home_node,hw_queue_depth," 613 "irqmode,max_sectors,mbps,memory_backed,no_sched," 614 "poll_queues,power,queue_mode,shared_tag_bitmap,size," 615 "submit_queues,use_per_node_hctx,virt_boundary,zoned," 616 "zone_capacity,zone_max_active,zone_max_open," 617 "zone_nr_conv,zone_size\n"); 618 } 619 620 
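/*
 * The read-only "features" attribute lists every per-device configfs knob
 * supported by this build, so user space can detect which options are
 * available before configuring a device.
 */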
CONFIGFS_ATTR_RO(memb_group_, features); 621 622 static struct configfs_attribute *nullb_group_attrs[] = { 623 &memb_group_attr_features, 624 NULL, 625 }; 626 627 static struct configfs_group_operations nullb_group_ops = { 628 .make_item = nullb_group_make_item, 629 .drop_item = nullb_group_drop_item, 630 }; 631 632 static const struct config_item_type nullb_group_type = { 633 .ct_group_ops = &nullb_group_ops, 634 .ct_attrs = nullb_group_attrs, 635 .ct_owner = THIS_MODULE, 636 }; 637 638 static struct configfs_subsystem nullb_subsys = { 639 .su_group = { 640 .cg_item = { 641 .ci_namebuf = "nullb", 642 .ci_type = &nullb_group_type, 643 }, 644 }, 645 }; 646 647 static inline int null_cache_active(struct nullb *nullb) 648 { 649 return test_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); 650 } 651 652 static struct nullb_device *null_alloc_dev(void) 653 { 654 struct nullb_device *dev; 655 656 dev = kzalloc(sizeof(*dev), GFP_KERNEL); 657 if (!dev) 658 return NULL; 659 INIT_RADIX_TREE(&dev->data, GFP_ATOMIC); 660 INIT_RADIX_TREE(&dev->cache, GFP_ATOMIC); 661 if (badblocks_init(&dev->badblocks, 0)) { 662 kfree(dev); 663 return NULL; 664 } 665 666 dev->size = g_gb * 1024; 667 dev->completion_nsec = g_completion_nsec; 668 dev->submit_queues = g_submit_queues; 669 dev->prev_submit_queues = g_submit_queues; 670 dev->poll_queues = g_poll_queues; 671 dev->prev_poll_queues = g_poll_queues; 672 dev->home_node = g_home_node; 673 dev->queue_mode = g_queue_mode; 674 dev->blocksize = g_bs; 675 dev->max_sectors = g_max_sectors; 676 dev->irqmode = g_irqmode; 677 dev->hw_queue_depth = g_hw_queue_depth; 678 dev->blocking = g_blocking; 679 dev->memory_backed = g_memory_backed; 680 dev->discard = g_discard; 681 dev->cache_size = g_cache_size; 682 dev->mbps = g_mbps; 683 dev->use_per_node_hctx = g_use_per_node_hctx; 684 dev->zoned = g_zoned; 685 dev->zone_size = g_zone_size; 686 dev->zone_capacity = g_zone_capacity; 687 dev->zone_nr_conv = g_zone_nr_conv; 688 dev->zone_max_open = g_zone_max_open; 689 dev->zone_max_active = g_zone_max_active; 690 dev->virt_boundary = g_virt_boundary; 691 dev->no_sched = g_no_sched; 692 dev->shared_tag_bitmap = g_shared_tag_bitmap; 693 return dev; 694 } 695 696 static void null_free_dev(struct nullb_device *dev) 697 { 698 if (!dev) 699 return; 700 701 null_free_zoned_dev(dev); 702 badblocks_exit(&dev->badblocks); 703 kfree(dev); 704 } 705 706 static void put_tag(struct nullb_queue *nq, unsigned int tag) 707 { 708 clear_bit_unlock(tag, nq->tag_map); 709 710 if (waitqueue_active(&nq->wait)) 711 wake_up(&nq->wait); 712 } 713 714 static unsigned int get_tag(struct nullb_queue *nq) 715 { 716 unsigned int tag; 717 718 do { 719 tag = find_first_zero_bit(nq->tag_map, nq->queue_depth); 720 if (tag >= nq->queue_depth) 721 return -1U; 722 } while (test_and_set_bit_lock(tag, nq->tag_map)); 723 724 return tag; 725 } 726 727 static void free_cmd(struct nullb_cmd *cmd) 728 { 729 put_tag(cmd->nq, cmd->tag); 730 } 731 732 static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer); 733 734 static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq) 735 { 736 struct nullb_cmd *cmd; 737 unsigned int tag; 738 739 tag = get_tag(nq); 740 if (tag != -1U) { 741 cmd = &nq->cmds[tag]; 742 cmd->tag = tag; 743 cmd->error = BLK_STS_OK; 744 cmd->nq = nq; 745 if (nq->dev->irqmode == NULL_IRQ_TIMER) { 746 hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, 747 HRTIMER_MODE_REL); 748 cmd->timer.function = null_cmd_timer_expired; 749 } 750 return cmd; 751 } 752 753 return NULL; 754 } 755 756 static struct 
nullb_cmd *alloc_cmd(struct nullb_queue *nq, struct bio *bio) 757 { 758 struct nullb_cmd *cmd; 759 DEFINE_WAIT(wait); 760 761 do { 762 /* 763 * This avoids multiple return statements, multiple calls to 764 * __alloc_cmd() and a fast path call to prepare_to_wait(). 765 */ 766 cmd = __alloc_cmd(nq); 767 if (cmd) { 768 cmd->bio = bio; 769 return cmd; 770 } 771 prepare_to_wait(&nq->wait, &wait, TASK_UNINTERRUPTIBLE); 772 io_schedule(); 773 finish_wait(&nq->wait, &wait); 774 } while (1); 775 } 776 777 static void end_cmd(struct nullb_cmd *cmd) 778 { 779 int queue_mode = cmd->nq->dev->queue_mode; 780 781 switch (queue_mode) { 782 case NULL_Q_MQ: 783 blk_mq_end_request(cmd->rq, cmd->error); 784 return; 785 case NULL_Q_BIO: 786 cmd->bio->bi_status = cmd->error; 787 bio_endio(cmd->bio); 788 break; 789 } 790 791 free_cmd(cmd); 792 } 793 794 static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer) 795 { 796 end_cmd(container_of(timer, struct nullb_cmd, timer)); 797 798 return HRTIMER_NORESTART; 799 } 800 801 static void null_cmd_end_timer(struct nullb_cmd *cmd) 802 { 803 ktime_t kt = cmd->nq->dev->completion_nsec; 804 805 hrtimer_start(&cmd->timer, kt, HRTIMER_MODE_REL); 806 } 807 808 static void null_complete_rq(struct request *rq) 809 { 810 end_cmd(blk_mq_rq_to_pdu(rq)); 811 } 812 813 static struct nullb_page *null_alloc_page(void) 814 { 815 struct nullb_page *t_page; 816 817 t_page = kmalloc(sizeof(struct nullb_page), GFP_NOIO); 818 if (!t_page) 819 return NULL; 820 821 t_page->page = alloc_pages(GFP_NOIO, 0); 822 if (!t_page->page) { 823 kfree(t_page); 824 return NULL; 825 } 826 827 memset(t_page->bitmap, 0, sizeof(t_page->bitmap)); 828 return t_page; 829 } 830 831 static void null_free_page(struct nullb_page *t_page) 832 { 833 __set_bit(NULLB_PAGE_FREE, t_page->bitmap); 834 if (test_bit(NULLB_PAGE_LOCK, t_page->bitmap)) 835 return; 836 __free_page(t_page->page); 837 kfree(t_page); 838 } 839 840 static bool null_page_empty(struct nullb_page *page) 841 { 842 int size = MAP_SZ - 2; 843 844 return find_first_bit(page->bitmap, size) == size; 845 } 846 847 static void null_free_sector(struct nullb *nullb, sector_t sector, 848 bool is_cache) 849 { 850 unsigned int sector_bit; 851 u64 idx; 852 struct nullb_page *t_page, *ret; 853 struct radix_tree_root *root; 854 855 root = is_cache ? &nullb->dev->cache : &nullb->dev->data; 856 idx = sector >> PAGE_SECTORS_SHIFT; 857 sector_bit = (sector & SECTOR_MASK); 858 859 t_page = radix_tree_lookup(root, idx); 860 if (t_page) { 861 __clear_bit(sector_bit, t_page->bitmap); 862 863 if (null_page_empty(t_page)) { 864 ret = radix_tree_delete_item(root, idx, t_page); 865 WARN_ON(ret != t_page); 866 null_free_page(ret); 867 if (is_cache) 868 nullb->dev->curr_cache -= PAGE_SIZE; 869 } 870 } 871 } 872 873 static struct nullb_page *null_radix_tree_insert(struct nullb *nullb, u64 idx, 874 struct nullb_page *t_page, bool is_cache) 875 { 876 struct radix_tree_root *root; 877 878 root = is_cache ? &nullb->dev->cache : &nullb->dev->data; 879 880 if (radix_tree_insert(root, idx, t_page)) { 881 null_free_page(t_page); 882 t_page = radix_tree_lookup(root, idx); 883 WARN_ON(!t_page || t_page->page->index != idx); 884 } else if (is_cache) 885 nullb->dev->curr_cache += PAGE_SIZE; 886 887 return t_page; 888 } 889 890 static void null_free_device_storage(struct nullb_device *dev, bool is_cache) 891 { 892 unsigned long pos = 0; 893 int nr_pages; 894 struct nullb_page *ret, *t_pages[FREE_BATCH]; 895 struct radix_tree_root *root; 896 897 root = is_cache ? 
&dev->cache : &dev->data; 898 899 do { 900 int i; 901 902 nr_pages = radix_tree_gang_lookup(root, 903 (void **)t_pages, pos, FREE_BATCH); 904 905 for (i = 0; i < nr_pages; i++) { 906 pos = t_pages[i]->page->index; 907 ret = radix_tree_delete_item(root, pos, t_pages[i]); 908 WARN_ON(ret != t_pages[i]); 909 null_free_page(ret); 910 } 911 912 pos++; 913 } while (nr_pages == FREE_BATCH); 914 915 if (is_cache) 916 dev->curr_cache = 0; 917 } 918 919 static struct nullb_page *__null_lookup_page(struct nullb *nullb, 920 sector_t sector, bool for_write, bool is_cache) 921 { 922 unsigned int sector_bit; 923 u64 idx; 924 struct nullb_page *t_page; 925 struct radix_tree_root *root; 926 927 idx = sector >> PAGE_SECTORS_SHIFT; 928 sector_bit = (sector & SECTOR_MASK); 929 930 root = is_cache ? &nullb->dev->cache : &nullb->dev->data; 931 t_page = radix_tree_lookup(root, idx); 932 WARN_ON(t_page && t_page->page->index != idx); 933 934 if (t_page && (for_write || test_bit(sector_bit, t_page->bitmap))) 935 return t_page; 936 937 return NULL; 938 } 939 940 static struct nullb_page *null_lookup_page(struct nullb *nullb, 941 sector_t sector, bool for_write, bool ignore_cache) 942 { 943 struct nullb_page *page = NULL; 944 945 if (!ignore_cache) 946 page = __null_lookup_page(nullb, sector, for_write, true); 947 if (page) 948 return page; 949 return __null_lookup_page(nullb, sector, for_write, false); 950 } 951 952 static struct nullb_page *null_insert_page(struct nullb *nullb, 953 sector_t sector, bool ignore_cache) 954 __releases(&nullb->lock) 955 __acquires(&nullb->lock) 956 { 957 u64 idx; 958 struct nullb_page *t_page; 959 960 t_page = null_lookup_page(nullb, sector, true, ignore_cache); 961 if (t_page) 962 return t_page; 963 964 spin_unlock_irq(&nullb->lock); 965 966 t_page = null_alloc_page(); 967 if (!t_page) 968 goto out_lock; 969 970 if (radix_tree_preload(GFP_NOIO)) 971 goto out_freepage; 972 973 spin_lock_irq(&nullb->lock); 974 idx = sector >> PAGE_SECTORS_SHIFT; 975 t_page->page->index = idx; 976 t_page = null_radix_tree_insert(nullb, idx, t_page, !ignore_cache); 977 radix_tree_preload_end(); 978 979 return t_page; 980 out_freepage: 981 null_free_page(t_page); 982 out_lock: 983 spin_lock_irq(&nullb->lock); 984 return null_lookup_page(nullb, sector, true, ignore_cache); 985 } 986 987 static int null_flush_cache_page(struct nullb *nullb, struct nullb_page *c_page) 988 { 989 int i; 990 unsigned int offset; 991 u64 idx; 992 struct nullb_page *t_page, *ret; 993 void *dst, *src; 994 995 idx = c_page->page->index; 996 997 t_page = null_insert_page(nullb, idx << PAGE_SECTORS_SHIFT, true); 998 999 __clear_bit(NULLB_PAGE_LOCK, c_page->bitmap); 1000 if (test_bit(NULLB_PAGE_FREE, c_page->bitmap)) { 1001 null_free_page(c_page); 1002 if (t_page && null_page_empty(t_page)) { 1003 ret = radix_tree_delete_item(&nullb->dev->data, 1004 idx, t_page); 1005 null_free_page(t_page); 1006 } 1007 return 0; 1008 } 1009 1010 if (!t_page) 1011 return -ENOMEM; 1012 1013 src = kmap_atomic(c_page->page); 1014 dst = kmap_atomic(t_page->page); 1015 1016 for (i = 0; i < PAGE_SECTORS; 1017 i += (nullb->dev->blocksize >> SECTOR_SHIFT)) { 1018 if (test_bit(i, c_page->bitmap)) { 1019 offset = (i << SECTOR_SHIFT); 1020 memcpy(dst + offset, src + offset, 1021 nullb->dev->blocksize); 1022 __set_bit(i, t_page->bitmap); 1023 } 1024 } 1025 1026 kunmap_atomic(dst); 1027 kunmap_atomic(src); 1028 1029 ret = radix_tree_delete_item(&nullb->dev->cache, idx, c_page); 1030 null_free_page(ret); 1031 nullb->dev->curr_cache -= PAGE_SIZE; 1032 1033 return 0; 
1034 } 1035 1036 static int null_make_cache_space(struct nullb *nullb, unsigned long n) 1037 { 1038 int i, err, nr_pages; 1039 struct nullb_page *c_pages[FREE_BATCH]; 1040 unsigned long flushed = 0, one_round; 1041 1042 again: 1043 if ((nullb->dev->cache_size * 1024 * 1024) > 1044 nullb->dev->curr_cache + n || nullb->dev->curr_cache == 0) 1045 return 0; 1046 1047 nr_pages = radix_tree_gang_lookup(&nullb->dev->cache, 1048 (void **)c_pages, nullb->cache_flush_pos, FREE_BATCH); 1049 /* 1050 * nullb_flush_cache_page could unlock before using the c_pages. To 1051 * avoid race, we don't allow page free 1052 */ 1053 for (i = 0; i < nr_pages; i++) { 1054 nullb->cache_flush_pos = c_pages[i]->page->index; 1055 /* 1056 * We found the page which is being flushed to disk by other 1057 * threads 1058 */ 1059 if (test_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap)) 1060 c_pages[i] = NULL; 1061 else 1062 __set_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap); 1063 } 1064 1065 one_round = 0; 1066 for (i = 0; i < nr_pages; i++) { 1067 if (c_pages[i] == NULL) 1068 continue; 1069 err = null_flush_cache_page(nullb, c_pages[i]); 1070 if (err) 1071 return err; 1072 one_round++; 1073 } 1074 flushed += one_round << PAGE_SHIFT; 1075 1076 if (n > flushed) { 1077 if (nr_pages == 0) 1078 nullb->cache_flush_pos = 0; 1079 if (one_round == 0) { 1080 /* give other threads a chance */ 1081 spin_unlock_irq(&nullb->lock); 1082 spin_lock_irq(&nullb->lock); 1083 } 1084 goto again; 1085 } 1086 return 0; 1087 } 1088 1089 static int copy_to_nullb(struct nullb *nullb, struct page *source, 1090 unsigned int off, sector_t sector, size_t n, bool is_fua) 1091 { 1092 size_t temp, count = 0; 1093 unsigned int offset; 1094 struct nullb_page *t_page; 1095 void *dst, *src; 1096 1097 while (count < n) { 1098 temp = min_t(size_t, nullb->dev->blocksize, n - count); 1099 1100 if (null_cache_active(nullb) && !is_fua) 1101 null_make_cache_space(nullb, PAGE_SIZE); 1102 1103 offset = (sector & SECTOR_MASK) << SECTOR_SHIFT; 1104 t_page = null_insert_page(nullb, sector, 1105 !null_cache_active(nullb) || is_fua); 1106 if (!t_page) 1107 return -ENOSPC; 1108 1109 src = kmap_atomic(source); 1110 dst = kmap_atomic(t_page->page); 1111 memcpy(dst + offset, src + off + count, temp); 1112 kunmap_atomic(dst); 1113 kunmap_atomic(src); 1114 1115 __set_bit(sector & SECTOR_MASK, t_page->bitmap); 1116 1117 if (is_fua) 1118 null_free_sector(nullb, sector, true); 1119 1120 count += temp; 1121 sector += temp >> SECTOR_SHIFT; 1122 } 1123 return 0; 1124 } 1125 1126 static int copy_from_nullb(struct nullb *nullb, struct page *dest, 1127 unsigned int off, sector_t sector, size_t n) 1128 { 1129 size_t temp, count = 0; 1130 unsigned int offset; 1131 struct nullb_page *t_page; 1132 void *dst, *src; 1133 1134 while (count < n) { 1135 temp = min_t(size_t, nullb->dev->blocksize, n - count); 1136 1137 offset = (sector & SECTOR_MASK) << SECTOR_SHIFT; 1138 t_page = null_lookup_page(nullb, sector, false, 1139 !null_cache_active(nullb)); 1140 1141 dst = kmap_atomic(dest); 1142 if (!t_page) { 1143 memset(dst + off + count, 0, temp); 1144 goto next; 1145 } 1146 src = kmap_atomic(t_page->page); 1147 memcpy(dst + off + count, src + offset, temp); 1148 kunmap_atomic(src); 1149 next: 1150 kunmap_atomic(dst); 1151 1152 count += temp; 1153 sector += temp >> SECTOR_SHIFT; 1154 } 1155 return 0; 1156 } 1157 1158 static void nullb_fill_pattern(struct nullb *nullb, struct page *page, 1159 unsigned int len, unsigned int off) 1160 { 1161 void *dst; 1162 1163 dst = kmap_atomic(page); 1164 memset(dst + off, 
0xFF, len); 1165 kunmap_atomic(dst); 1166 } 1167 1168 blk_status_t null_handle_discard(struct nullb_device *dev, 1169 sector_t sector, sector_t nr_sectors) 1170 { 1171 struct nullb *nullb = dev->nullb; 1172 size_t n = nr_sectors << SECTOR_SHIFT; 1173 size_t temp; 1174 1175 spin_lock_irq(&nullb->lock); 1176 while (n > 0) { 1177 temp = min_t(size_t, n, dev->blocksize); 1178 null_free_sector(nullb, sector, false); 1179 if (null_cache_active(nullb)) 1180 null_free_sector(nullb, sector, true); 1181 sector += temp >> SECTOR_SHIFT; 1182 n -= temp; 1183 } 1184 spin_unlock_irq(&nullb->lock); 1185 1186 return BLK_STS_OK; 1187 } 1188 1189 static int null_handle_flush(struct nullb *nullb) 1190 { 1191 int err; 1192 1193 if (!null_cache_active(nullb)) 1194 return 0; 1195 1196 spin_lock_irq(&nullb->lock); 1197 while (true) { 1198 err = null_make_cache_space(nullb, 1199 nullb->dev->cache_size * 1024 * 1024); 1200 if (err || nullb->dev->curr_cache == 0) 1201 break; 1202 } 1203 1204 WARN_ON(!radix_tree_empty(&nullb->dev->cache)); 1205 spin_unlock_irq(&nullb->lock); 1206 return err; 1207 } 1208 1209 static int null_transfer(struct nullb *nullb, struct page *page, 1210 unsigned int len, unsigned int off, bool is_write, sector_t sector, 1211 bool is_fua) 1212 { 1213 struct nullb_device *dev = nullb->dev; 1214 unsigned int valid_len = len; 1215 int err = 0; 1216 1217 if (!is_write) { 1218 if (dev->zoned) 1219 valid_len = null_zone_valid_read_len(nullb, 1220 sector, len); 1221 1222 if (valid_len) { 1223 err = copy_from_nullb(nullb, page, off, 1224 sector, valid_len); 1225 off += valid_len; 1226 len -= valid_len; 1227 } 1228 1229 if (len) 1230 nullb_fill_pattern(nullb, page, len, off); 1231 flush_dcache_page(page); 1232 } else { 1233 flush_dcache_page(page); 1234 err = copy_to_nullb(nullb, page, off, sector, len, is_fua); 1235 } 1236 1237 return err; 1238 } 1239 1240 static int null_handle_rq(struct nullb_cmd *cmd) 1241 { 1242 struct request *rq = cmd->rq; 1243 struct nullb *nullb = cmd->nq->dev->nullb; 1244 int err; 1245 unsigned int len; 1246 sector_t sector = blk_rq_pos(rq); 1247 struct req_iterator iter; 1248 struct bio_vec bvec; 1249 1250 spin_lock_irq(&nullb->lock); 1251 rq_for_each_segment(bvec, rq, iter) { 1252 len = bvec.bv_len; 1253 err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, 1254 op_is_write(req_op(rq)), sector, 1255 rq->cmd_flags & REQ_FUA); 1256 if (err) { 1257 spin_unlock_irq(&nullb->lock); 1258 return err; 1259 } 1260 sector += len >> SECTOR_SHIFT; 1261 } 1262 spin_unlock_irq(&nullb->lock); 1263 1264 return 0; 1265 } 1266 1267 static int null_handle_bio(struct nullb_cmd *cmd) 1268 { 1269 struct bio *bio = cmd->bio; 1270 struct nullb *nullb = cmd->nq->dev->nullb; 1271 int err; 1272 unsigned int len; 1273 sector_t sector = bio->bi_iter.bi_sector; 1274 struct bio_vec bvec; 1275 struct bvec_iter iter; 1276 1277 spin_lock_irq(&nullb->lock); 1278 bio_for_each_segment(bvec, bio, iter) { 1279 len = bvec.bv_len; 1280 err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, 1281 op_is_write(bio_op(bio)), sector, 1282 bio->bi_opf & REQ_FUA); 1283 if (err) { 1284 spin_unlock_irq(&nullb->lock); 1285 return err; 1286 } 1287 sector += len >> SECTOR_SHIFT; 1288 } 1289 spin_unlock_irq(&nullb->lock); 1290 return 0; 1291 } 1292 1293 static void null_stop_queue(struct nullb *nullb) 1294 { 1295 struct request_queue *q = nullb->q; 1296 1297 if (nullb->dev->queue_mode == NULL_Q_MQ) 1298 blk_mq_stop_hw_queues(q); 1299 } 1300 1301 static void null_restart_queue_async(struct nullb *nullb) 1302 { 1303 
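	/*
	 * Restart the stopped hardware queues asynchronously (via kblockd) so
	 * that this helper remains safe to call from the hrtimer callback
	 * nullb_bwtimer_fn().
	 */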
struct request_queue *q = nullb->q; 1304 1305 if (nullb->dev->queue_mode == NULL_Q_MQ) 1306 blk_mq_start_stopped_hw_queues(q, true); 1307 } 1308 1309 static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd) 1310 { 1311 struct nullb_device *dev = cmd->nq->dev; 1312 struct nullb *nullb = dev->nullb; 1313 blk_status_t sts = BLK_STS_OK; 1314 struct request *rq = cmd->rq; 1315 1316 if (!hrtimer_active(&nullb->bw_timer)) 1317 hrtimer_restart(&nullb->bw_timer); 1318 1319 if (atomic_long_sub_return(blk_rq_bytes(rq), &nullb->cur_bytes) < 0) { 1320 null_stop_queue(nullb); 1321 /* race with timer */ 1322 if (atomic_long_read(&nullb->cur_bytes) > 0) 1323 null_restart_queue_async(nullb); 1324 /* requeue request */ 1325 sts = BLK_STS_DEV_RESOURCE; 1326 } 1327 return sts; 1328 } 1329 1330 static inline blk_status_t null_handle_badblocks(struct nullb_cmd *cmd, 1331 sector_t sector, 1332 sector_t nr_sectors) 1333 { 1334 struct badblocks *bb = &cmd->nq->dev->badblocks; 1335 sector_t first_bad; 1336 int bad_sectors; 1337 1338 if (badblocks_check(bb, sector, nr_sectors, &first_bad, &bad_sectors)) 1339 return BLK_STS_IOERR; 1340 1341 return BLK_STS_OK; 1342 } 1343 1344 static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, 1345 enum req_op op, 1346 sector_t sector, 1347 sector_t nr_sectors) 1348 { 1349 struct nullb_device *dev = cmd->nq->dev; 1350 int err; 1351 1352 if (op == REQ_OP_DISCARD) 1353 return null_handle_discard(dev, sector, nr_sectors); 1354 1355 if (dev->queue_mode == NULL_Q_BIO) 1356 err = null_handle_bio(cmd); 1357 else 1358 err = null_handle_rq(cmd); 1359 1360 return errno_to_blk_status(err); 1361 } 1362 1363 static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd) 1364 { 1365 struct nullb_device *dev = cmd->nq->dev; 1366 struct bio *bio; 1367 1368 if (dev->memory_backed) 1369 return; 1370 1371 if (dev->queue_mode == NULL_Q_BIO && bio_op(cmd->bio) == REQ_OP_READ) { 1372 zero_fill_bio(cmd->bio); 1373 } else if (req_op(cmd->rq) == REQ_OP_READ) { 1374 __rq_for_each_bio(bio, cmd->rq) 1375 zero_fill_bio(bio); 1376 } 1377 } 1378 1379 static inline void nullb_complete_cmd(struct nullb_cmd *cmd) 1380 { 1381 /* 1382 * Since root privileges are required to configure the null_blk 1383 * driver, it is fine that this driver does not initialize the 1384 * data buffers of read commands. Zero-initialize these buffers 1385 * anyway if KMSAN is enabled to prevent that KMSAN complains 1386 * about null_blk not initializing read data buffers. 1387 */ 1388 if (IS_ENABLED(CONFIG_KMSAN)) 1389 nullb_zero_read_cmd_buffer(cmd); 1390 1391 /* Complete IO by inline, softirq or timer */ 1392 switch (cmd->nq->dev->irqmode) { 1393 case NULL_IRQ_SOFTIRQ: 1394 switch (cmd->nq->dev->queue_mode) { 1395 case NULL_Q_MQ: 1396 if (likely(!blk_should_fake_timeout(cmd->rq->q))) 1397 blk_mq_complete_request(cmd->rq); 1398 break; 1399 case NULL_Q_BIO: 1400 /* 1401 * XXX: no proper submitting cpu information available. 
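			 * Complete the bio inline instead of deferring to a
			 * softirq on the submitting CPU.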
1402 */ 1403 end_cmd(cmd); 1404 break; 1405 } 1406 break; 1407 case NULL_IRQ_NONE: 1408 end_cmd(cmd); 1409 break; 1410 case NULL_IRQ_TIMER: 1411 null_cmd_end_timer(cmd); 1412 break; 1413 } 1414 } 1415 1416 blk_status_t null_process_cmd(struct nullb_cmd *cmd, enum req_op op, 1417 sector_t sector, unsigned int nr_sectors) 1418 { 1419 struct nullb_device *dev = cmd->nq->dev; 1420 blk_status_t ret; 1421 1422 if (dev->badblocks.shift != -1) { 1423 ret = null_handle_badblocks(cmd, sector, nr_sectors); 1424 if (ret != BLK_STS_OK) 1425 return ret; 1426 } 1427 1428 if (dev->memory_backed) 1429 return null_handle_memory_backed(cmd, op, sector, nr_sectors); 1430 1431 return BLK_STS_OK; 1432 } 1433 1434 static blk_status_t null_handle_cmd(struct nullb_cmd *cmd, sector_t sector, 1435 sector_t nr_sectors, enum req_op op) 1436 { 1437 struct nullb_device *dev = cmd->nq->dev; 1438 struct nullb *nullb = dev->nullb; 1439 blk_status_t sts; 1440 1441 if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) { 1442 sts = null_handle_throttled(cmd); 1443 if (sts != BLK_STS_OK) 1444 return sts; 1445 } 1446 1447 if (op == REQ_OP_FLUSH) { 1448 cmd->error = errno_to_blk_status(null_handle_flush(nullb)); 1449 goto out; 1450 } 1451 1452 if (dev->zoned) 1453 sts = null_process_zoned_cmd(cmd, op, sector, nr_sectors); 1454 else 1455 sts = null_process_cmd(cmd, op, sector, nr_sectors); 1456 1457 /* Do not overwrite errors (e.g. timeout errors) */ 1458 if (cmd->error == BLK_STS_OK) 1459 cmd->error = sts; 1460 1461 out: 1462 nullb_complete_cmd(cmd); 1463 return BLK_STS_OK; 1464 } 1465 1466 static enum hrtimer_restart nullb_bwtimer_fn(struct hrtimer *timer) 1467 { 1468 struct nullb *nullb = container_of(timer, struct nullb, bw_timer); 1469 ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL); 1470 unsigned int mbps = nullb->dev->mbps; 1471 1472 if (atomic_long_read(&nullb->cur_bytes) == mb_per_tick(mbps)) 1473 return HRTIMER_NORESTART; 1474 1475 atomic_long_set(&nullb->cur_bytes, mb_per_tick(mbps)); 1476 null_restart_queue_async(nullb); 1477 1478 hrtimer_forward_now(&nullb->bw_timer, timer_interval); 1479 1480 return HRTIMER_RESTART; 1481 } 1482 1483 static void nullb_setup_bwtimer(struct nullb *nullb) 1484 { 1485 ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL); 1486 1487 hrtimer_init(&nullb->bw_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1488 nullb->bw_timer.function = nullb_bwtimer_fn; 1489 atomic_long_set(&nullb->cur_bytes, mb_per_tick(nullb->dev->mbps)); 1490 hrtimer_start(&nullb->bw_timer, timer_interval, HRTIMER_MODE_REL); 1491 } 1492 1493 static struct nullb_queue *nullb_to_queue(struct nullb *nullb) 1494 { 1495 int index = 0; 1496 1497 if (nullb->nr_queues != 1) 1498 index = raw_smp_processor_id() / ((nr_cpu_ids + nullb->nr_queues - 1) / nullb->nr_queues); 1499 1500 return &nullb->queues[index]; 1501 } 1502 1503 static void null_submit_bio(struct bio *bio) 1504 { 1505 sector_t sector = bio->bi_iter.bi_sector; 1506 sector_t nr_sectors = bio_sectors(bio); 1507 struct nullb *nullb = bio->bi_bdev->bd_disk->private_data; 1508 struct nullb_queue *nq = nullb_to_queue(nullb); 1509 1510 null_handle_cmd(alloc_cmd(nq, bio), sector, nr_sectors, bio_op(bio)); 1511 } 1512 1513 static bool should_timeout_request(struct request *rq) 1514 { 1515 #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION 1516 if (g_timeout_str[0]) 1517 return should_fail(&null_timeout_attr, 1); 1518 #endif 1519 return false; 1520 } 1521 1522 static bool should_requeue_request(struct request *rq) 1523 { 1524 #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION 1525 if 
(g_requeue_str[0]) 1526 return should_fail(&null_requeue_attr, 1); 1527 #endif 1528 return false; 1529 } 1530 1531 static int null_map_queues(struct blk_mq_tag_set *set) 1532 { 1533 struct nullb *nullb = set->driver_data; 1534 int i, qoff; 1535 unsigned int submit_queues = g_submit_queues; 1536 unsigned int poll_queues = g_poll_queues; 1537 1538 if (nullb) { 1539 struct nullb_device *dev = nullb->dev; 1540 1541 /* 1542 * Refer nr_hw_queues of the tag set to check if the expected 1543 * number of hardware queues are prepared. If block layer failed 1544 * to prepare them, use previous numbers of submit queues and 1545 * poll queues to map queues. 1546 */ 1547 if (set->nr_hw_queues == 1548 dev->submit_queues + dev->poll_queues) { 1549 submit_queues = dev->submit_queues; 1550 poll_queues = dev->poll_queues; 1551 } else if (set->nr_hw_queues == 1552 dev->prev_submit_queues + dev->prev_poll_queues) { 1553 submit_queues = dev->prev_submit_queues; 1554 poll_queues = dev->prev_poll_queues; 1555 } else { 1556 pr_warn("tag set has unexpected nr_hw_queues: %d\n", 1557 set->nr_hw_queues); 1558 return -EINVAL; 1559 } 1560 } 1561 1562 for (i = 0, qoff = 0; i < set->nr_maps; i++) { 1563 struct blk_mq_queue_map *map = &set->map[i]; 1564 1565 switch (i) { 1566 case HCTX_TYPE_DEFAULT: 1567 map->nr_queues = submit_queues; 1568 break; 1569 case HCTX_TYPE_READ: 1570 map->nr_queues = 0; 1571 continue; 1572 case HCTX_TYPE_POLL: 1573 map->nr_queues = poll_queues; 1574 break; 1575 } 1576 map->queue_offset = qoff; 1577 qoff += map->nr_queues; 1578 blk_mq_map_queues(map); 1579 } 1580 1581 return 0; 1582 } 1583 1584 static int null_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) 1585 { 1586 struct nullb_queue *nq = hctx->driver_data; 1587 LIST_HEAD(list); 1588 int nr = 0; 1589 1590 spin_lock(&nq->poll_lock); 1591 list_splice_init(&nq->poll_list, &list); 1592 spin_unlock(&nq->poll_lock); 1593 1594 while (!list_empty(&list)) { 1595 struct nullb_cmd *cmd; 1596 struct request *req; 1597 1598 req = list_first_entry(&list, struct request, queuelist); 1599 list_del_init(&req->queuelist); 1600 cmd = blk_mq_rq_to_pdu(req); 1601 cmd->error = null_process_cmd(cmd, req_op(req), blk_rq_pos(req), 1602 blk_rq_sectors(req)); 1603 if (!blk_mq_add_to_batch(req, iob, (__force int) cmd->error, 1604 blk_mq_end_request_batch)) 1605 end_cmd(cmd); 1606 nr++; 1607 } 1608 1609 return nr; 1610 } 1611 1612 static enum blk_eh_timer_return null_timeout_rq(struct request *rq) 1613 { 1614 struct blk_mq_hw_ctx *hctx = rq->mq_hctx; 1615 struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); 1616 1617 pr_info("rq %p timed out\n", rq); 1618 1619 if (hctx->type == HCTX_TYPE_POLL) { 1620 struct nullb_queue *nq = hctx->driver_data; 1621 1622 spin_lock(&nq->poll_lock); 1623 list_del_init(&rq->queuelist); 1624 spin_unlock(&nq->poll_lock); 1625 } 1626 1627 /* 1628 * If the device is marked as blocking (i.e. memory backed or zoned 1629 * device), the submission path may be blocked waiting for resources 1630 * and cause real timeouts. For these real timeouts, the submission 1631 * path will complete the request using blk_mq_complete_request(). 1632 * Only fake timeouts need to execute blk_mq_complete_request() here. 
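	 * Polled requests were already removed from the poll list above and
	 * are therefore completed here as well.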
1633 */ 1634 cmd->error = BLK_STS_TIMEOUT; 1635 if (cmd->fake_timeout || hctx->type == HCTX_TYPE_POLL) 1636 blk_mq_complete_request(rq); 1637 return BLK_EH_DONE; 1638 } 1639 1640 static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, 1641 const struct blk_mq_queue_data *bd) 1642 { 1643 struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); 1644 struct nullb_queue *nq = hctx->driver_data; 1645 sector_t nr_sectors = blk_rq_sectors(bd->rq); 1646 sector_t sector = blk_rq_pos(bd->rq); 1647 const bool is_poll = hctx->type == HCTX_TYPE_POLL; 1648 1649 might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); 1650 1651 if (!is_poll && nq->dev->irqmode == NULL_IRQ_TIMER) { 1652 hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1653 cmd->timer.function = null_cmd_timer_expired; 1654 } 1655 cmd->rq = bd->rq; 1656 cmd->error = BLK_STS_OK; 1657 cmd->nq = nq; 1658 cmd->fake_timeout = should_timeout_request(bd->rq); 1659 1660 blk_mq_start_request(bd->rq); 1661 1662 if (should_requeue_request(bd->rq)) { 1663 /* 1664 * Alternate between hitting the core BUSY path, and the 1665 * driver driven requeue path 1666 */ 1667 nq->requeue_selection++; 1668 if (nq->requeue_selection & 1) 1669 return BLK_STS_RESOURCE; 1670 else { 1671 blk_mq_requeue_request(bd->rq, true); 1672 return BLK_STS_OK; 1673 } 1674 } 1675 1676 if (is_poll) { 1677 spin_lock(&nq->poll_lock); 1678 list_add_tail(&bd->rq->queuelist, &nq->poll_list); 1679 spin_unlock(&nq->poll_lock); 1680 return BLK_STS_OK; 1681 } 1682 if (cmd->fake_timeout) 1683 return BLK_STS_OK; 1684 1685 return null_handle_cmd(cmd, sector, nr_sectors, req_op(bd->rq)); 1686 } 1687 1688 static void cleanup_queue(struct nullb_queue *nq) 1689 { 1690 bitmap_free(nq->tag_map); 1691 kfree(nq->cmds); 1692 } 1693 1694 static void cleanup_queues(struct nullb *nullb) 1695 { 1696 int i; 1697 1698 for (i = 0; i < nullb->nr_queues; i++) 1699 cleanup_queue(&nullb->queues[i]); 1700 1701 kfree(nullb->queues); 1702 } 1703 1704 static void null_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) 1705 { 1706 struct nullb_queue *nq = hctx->driver_data; 1707 struct nullb *nullb = nq->dev->nullb; 1708 1709 nullb->nr_queues--; 1710 } 1711 1712 static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq) 1713 { 1714 init_waitqueue_head(&nq->wait); 1715 nq->queue_depth = nullb->queue_depth; 1716 nq->dev = nullb->dev; 1717 INIT_LIST_HEAD(&nq->poll_list); 1718 spin_lock_init(&nq->poll_lock); 1719 } 1720 1721 static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, 1722 unsigned int hctx_idx) 1723 { 1724 struct nullb *nullb = hctx->queue->queuedata; 1725 struct nullb_queue *nq; 1726 1727 #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION 1728 if (g_init_hctx_str[0] && should_fail(&null_init_hctx_attr, 1)) 1729 return -EFAULT; 1730 #endif 1731 1732 nq = &nullb->queues[hctx_idx]; 1733 hctx->driver_data = nq; 1734 null_init_queue(nullb, nq); 1735 nullb->nr_queues++; 1736 1737 return 0; 1738 } 1739 1740 static const struct blk_mq_ops null_mq_ops = { 1741 .queue_rq = null_queue_rq, 1742 .complete = null_complete_rq, 1743 .timeout = null_timeout_rq, 1744 .poll = null_poll, 1745 .map_queues = null_map_queues, 1746 .init_hctx = null_init_hctx, 1747 .exit_hctx = null_exit_hctx, 1748 }; 1749 1750 static void null_del_dev(struct nullb *nullb) 1751 { 1752 struct nullb_device *dev; 1753 1754 if (!nullb) 1755 return; 1756 1757 dev = nullb->dev; 1758 1759 ida_simple_remove(&nullb_indexes, nullb->index); 1760 1761 list_del_init(&nullb->list); 1762 1763 del_gendisk(nullb->disk); 1764 
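	/*
	 * Cancel the bandwidth-throttling timer and release any requests that
	 * were held back waiting for budget.
	 */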
1765 if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) { 1766 hrtimer_cancel(&nullb->bw_timer); 1767 atomic_long_set(&nullb->cur_bytes, LONG_MAX); 1768 null_restart_queue_async(nullb); 1769 } 1770 1771 put_disk(nullb->disk); 1772 if (dev->queue_mode == NULL_Q_MQ && 1773 nullb->tag_set == &nullb->__tag_set) 1774 blk_mq_free_tag_set(nullb->tag_set); 1775 cleanup_queues(nullb); 1776 if (null_cache_active(nullb)) 1777 null_free_device_storage(nullb->dev, true); 1778 kfree(nullb); 1779 dev->nullb = NULL; 1780 } 1781 1782 static void null_config_discard(struct nullb *nullb) 1783 { 1784 if (nullb->dev->discard == false) 1785 return; 1786 1787 if (!nullb->dev->memory_backed) { 1788 nullb->dev->discard = false; 1789 pr_info("discard option is ignored without memory backing\n"); 1790 return; 1791 } 1792 1793 if (nullb->dev->zoned) { 1794 nullb->dev->discard = false; 1795 pr_info("discard option is ignored in zoned mode\n"); 1796 return; 1797 } 1798 1799 nullb->q->limits.discard_granularity = nullb->dev->blocksize; 1800 blk_queue_max_discard_sectors(nullb->q, UINT_MAX >> 9); 1801 } 1802 1803 static const struct block_device_operations null_bio_ops = { 1804 .owner = THIS_MODULE, 1805 .submit_bio = null_submit_bio, 1806 .report_zones = null_report_zones, 1807 }; 1808 1809 static const struct block_device_operations null_rq_ops = { 1810 .owner = THIS_MODULE, 1811 .report_zones = null_report_zones, 1812 }; 1813 1814 static int setup_commands(struct nullb_queue *nq) 1815 { 1816 struct nullb_cmd *cmd; 1817 int i; 1818 1819 nq->cmds = kcalloc(nq->queue_depth, sizeof(*cmd), GFP_KERNEL); 1820 if (!nq->cmds) 1821 return -ENOMEM; 1822 1823 nq->tag_map = bitmap_zalloc(nq->queue_depth, GFP_KERNEL); 1824 if (!nq->tag_map) { 1825 kfree(nq->cmds); 1826 return -ENOMEM; 1827 } 1828 1829 for (i = 0; i < nq->queue_depth; i++) { 1830 cmd = &nq->cmds[i]; 1831 cmd->tag = -1U; 1832 } 1833 1834 return 0; 1835 } 1836 1837 static int setup_queues(struct nullb *nullb) 1838 { 1839 int nqueues = nr_cpu_ids; 1840 1841 if (g_poll_queues) 1842 nqueues += g_poll_queues; 1843 1844 nullb->queues = kcalloc(nqueues, sizeof(struct nullb_queue), 1845 GFP_KERNEL); 1846 if (!nullb->queues) 1847 return -ENOMEM; 1848 1849 nullb->queue_depth = nullb->dev->hw_queue_depth; 1850 return 0; 1851 } 1852 1853 static int init_driver_queues(struct nullb *nullb) 1854 { 1855 struct nullb_queue *nq; 1856 int i, ret = 0; 1857 1858 for (i = 0; i < nullb->dev->submit_queues; i++) { 1859 nq = &nullb->queues[i]; 1860 1861 null_init_queue(nullb, nq); 1862 1863 ret = setup_commands(nq); 1864 if (ret) 1865 return ret; 1866 nullb->nr_queues++; 1867 } 1868 return 0; 1869 } 1870 1871 static int null_gendisk_register(struct nullb *nullb) 1872 { 1873 sector_t size = ((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT; 1874 struct gendisk *disk = nullb->disk; 1875 1876 set_capacity(disk, size); 1877 1878 disk->major = null_major; 1879 disk->first_minor = nullb->index; 1880 disk->minors = 1; 1881 if (queue_is_mq(nullb->q)) 1882 disk->fops = &null_rq_ops; 1883 else 1884 disk->fops = &null_bio_ops; 1885 disk->private_data = nullb; 1886 strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN); 1887 1888 if (nullb->dev->zoned) { 1889 int ret = null_register_zoned_dev(nullb); 1890 1891 if (ret) 1892 return ret; 1893 } 1894 1895 return add_disk(disk); 1896 } 1897 1898 static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set) 1899 { 1900 unsigned int flags = BLK_MQ_F_SHOULD_MERGE; 1901 int hw_queues, numa_node; 1902 unsigned int queue_depth; 1903 
int poll_queues; 1904 1905 if (nullb) { 1906 hw_queues = nullb->dev->submit_queues; 1907 poll_queues = nullb->dev->poll_queues; 1908 queue_depth = nullb->dev->hw_queue_depth; 1909 numa_node = nullb->dev->home_node; 1910 if (nullb->dev->no_sched) 1911 flags |= BLK_MQ_F_NO_SCHED; 1912 if (nullb->dev->shared_tag_bitmap) 1913 flags |= BLK_MQ_F_TAG_HCTX_SHARED; 1914 if (nullb->dev->blocking) 1915 flags |= BLK_MQ_F_BLOCKING; 1916 } else { 1917 hw_queues = g_submit_queues; 1918 poll_queues = g_poll_queues; 1919 queue_depth = g_hw_queue_depth; 1920 numa_node = g_home_node; 1921 if (g_no_sched) 1922 flags |= BLK_MQ_F_NO_SCHED; 1923 if (g_shared_tag_bitmap) 1924 flags |= BLK_MQ_F_TAG_HCTX_SHARED; 1925 if (g_blocking) 1926 flags |= BLK_MQ_F_BLOCKING; 1927 } 1928 1929 set->ops = &null_mq_ops; 1930 set->cmd_size = sizeof(struct nullb_cmd); 1931 set->flags = flags; 1932 set->driver_data = nullb; 1933 set->nr_hw_queues = hw_queues; 1934 set->queue_depth = queue_depth; 1935 set->numa_node = numa_node; 1936 if (poll_queues) { 1937 set->nr_hw_queues += poll_queues; 1938 set->nr_maps = 3; 1939 } else { 1940 set->nr_maps = 1; 1941 } 1942 1943 return blk_mq_alloc_tag_set(set); 1944 } 1945 1946 static int null_validate_conf(struct nullb_device *dev) 1947 { 1948 dev->blocksize = round_down(dev->blocksize, 512); 1949 dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096); 1950 1951 if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) { 1952 if (dev->submit_queues != nr_online_nodes) 1953 dev->submit_queues = nr_online_nodes; 1954 } else if (dev->submit_queues > nr_cpu_ids) 1955 dev->submit_queues = nr_cpu_ids; 1956 else if (dev->submit_queues == 0) 1957 dev->submit_queues = 1; 1958 dev->prev_submit_queues = dev->submit_queues; 1959 1960 if (dev->poll_queues > g_poll_queues) 1961 dev->poll_queues = g_poll_queues; 1962 dev->prev_poll_queues = dev->poll_queues; 1963 1964 dev->queue_mode = min_t(unsigned int, dev->queue_mode, NULL_Q_MQ); 1965 dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER); 1966 1967 /* Do memory allocation, so set blocking */ 1968 if (dev->memory_backed) 1969 dev->blocking = true; 1970 else /* cache is meaningless */ 1971 dev->cache_size = 0; 1972 dev->cache_size = min_t(unsigned long, ULONG_MAX / 1024 / 1024, 1973 dev->cache_size); 1974 dev->mbps = min_t(unsigned int, 1024 * 40, dev->mbps); 1975 /* can not stop a queue */ 1976 if (dev->queue_mode == NULL_Q_BIO) 1977 dev->mbps = 0; 1978 1979 if (dev->zoned && 1980 (!dev->zone_size || !is_power_of_2(dev->zone_size))) { 1981 pr_err("zone_size must be power-of-two\n"); 1982 return -EINVAL; 1983 } 1984 1985 return 0; 1986 } 1987 1988 #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION 1989 static bool __null_setup_fault(struct fault_attr *attr, char *str) 1990 { 1991 if (!str[0]) 1992 return true; 1993 1994 if (!setup_fault_attr(attr, str)) 1995 return false; 1996 1997 attr->verbose = 0; 1998 return true; 1999 } 2000 #endif 2001 2002 static bool null_setup_fault(void) 2003 { 2004 #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION 2005 if (!__null_setup_fault(&null_timeout_attr, g_timeout_str)) 2006 return false; 2007 if (!__null_setup_fault(&null_requeue_attr, g_requeue_str)) 2008 return false; 2009 if (!__null_setup_fault(&null_init_hctx_attr, g_init_hctx_str)) 2010 return false; 2011 #endif 2012 return true; 2013 } 2014 2015 static int null_add_dev(struct nullb_device *dev) 2016 { 2017 struct nullb *nullb; 2018 int rv; 2019 2020 rv = null_validate_conf(dev); 2021 if (rv) 2022 return rv; 2023 2024 nullb = 
static int null_add_dev(struct nullb_device *dev)
{
	struct nullb *nullb;
	int rv;

	rv = null_validate_conf(dev);
	if (rv)
		return rv;

	nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, dev->home_node);
	if (!nullb) {
		rv = -ENOMEM;
		goto out;
	}
	nullb->dev = dev;
	dev->nullb = nullb;

	spin_lock_init(&nullb->lock);

	rv = setup_queues(nullb);
	if (rv)
		goto out_free_nullb;

	if (dev->queue_mode == NULL_Q_MQ) {
		if (shared_tags) {
			nullb->tag_set = &tag_set;
			rv = 0;
		} else {
			nullb->tag_set = &nullb->__tag_set;
			rv = null_init_tag_set(nullb, nullb->tag_set);
		}

		if (rv)
			goto out_cleanup_queues;

		if (!null_setup_fault())
			goto out_cleanup_tags;

		nullb->tag_set->timeout = 5 * HZ;
		nullb->disk = blk_mq_alloc_disk(nullb->tag_set, nullb);
		if (IS_ERR(nullb->disk)) {
			rv = PTR_ERR(nullb->disk);
			goto out_cleanup_tags;
		}
		nullb->q = nullb->disk->queue;
	} else if (dev->queue_mode == NULL_Q_BIO) {
		rv = -ENOMEM;
		nullb->disk = blk_alloc_disk(nullb->dev->home_node);
		if (!nullb->disk)
			goto out_cleanup_queues;

		nullb->q = nullb->disk->queue;
		rv = init_driver_queues(nullb);
		if (rv)
			goto out_cleanup_disk;
	}

	if (dev->mbps) {
		set_bit(NULLB_DEV_FL_THROTTLED, &dev->flags);
		nullb_setup_bwtimer(nullb);
	}

	if (dev->cache_size > 0) {
		set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags);
		blk_queue_write_cache(nullb->q, true, true);
	}

	if (dev->zoned) {
		rv = null_init_zoned_dev(dev, nullb->q);
		if (rv)
			goto out_cleanup_disk;
	}

	nullb->q->queuedata = nullb;
	blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q);
	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q);

	mutex_lock(&lock);
	rv = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL);
	if (rv < 0) {
		mutex_unlock(&lock);
		goto out_cleanup_zone;
	}
	nullb->index = rv;
	dev->index = rv;
	mutex_unlock(&lock);

	blk_queue_logical_block_size(nullb->q, dev->blocksize);
	blk_queue_physical_block_size(nullb->q, dev->blocksize);
	if (!dev->max_sectors)
		dev->max_sectors = queue_max_hw_sectors(nullb->q);
	dev->max_sectors = min_t(unsigned int, dev->max_sectors,
				 BLK_DEF_MAX_SECTORS);
	blk_queue_max_hw_sectors(nullb->q, dev->max_sectors);

	if (dev->virt_boundary)
		blk_queue_virt_boundary(nullb->q, PAGE_SIZE - 1);

	null_config_discard(nullb);

	if (config_item_name(&dev->item)) {
		/* Use configfs dir name as the device name */
		snprintf(nullb->disk_name, sizeof(nullb->disk_name),
			 "%s", config_item_name(&dev->item));
	} else {
		sprintf(nullb->disk_name, "nullb%d", nullb->index);
	}

	rv = null_gendisk_register(nullb);
	if (rv)
		goto out_ida_free;

	mutex_lock(&lock);
	list_add_tail(&nullb->list, &nullb_list);
	mutex_unlock(&lock);

	pr_info("disk %s created\n", nullb->disk_name);

	return 0;

out_ida_free:
	ida_free(&nullb_indexes, nullb->index);
out_cleanup_zone:
	null_free_zoned_dev(dev);
out_cleanup_disk:
	put_disk(nullb->disk);
out_cleanup_tags:
	if (dev->queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set)
		blk_mq_free_tag_set(nullb->tag_set);
out_cleanup_queues:
	cleanup_queues(nullb);
out_free_nullb:
	kfree(nullb);
	dev->nullb = NULL;
out:
	return rv;
}
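
/*
 * Look up a live nullb device by disk name. Walks nullb_list under the
 * global lock and returns the matching device, or NULL if none is
 * currently registered under that name.
 */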
static struct nullb *null_find_dev_by_name(const char *name)
{
	struct nullb *nullb = NULL, *nb;

	mutex_lock(&lock);
	list_for_each_entry(nb, &nullb_list, list) {
		if (strcmp(nb->disk_name, name) == 0) {
			nullb = nb;
			break;
		}
	}
	mutex_unlock(&lock);

	return nullb;
}

static int null_create_dev(void)
{
	struct nullb_device *dev;
	int ret;

	dev = null_alloc_dev();
	if (!dev)
		return -ENOMEM;

	ret = null_add_dev(dev);
	if (ret) {
		null_free_dev(dev);
		return ret;
	}

	return 0;
}

static void null_destroy_dev(struct nullb *nullb)
{
	struct nullb_device *dev = nullb->dev;

	null_del_dev(nullb);
	null_free_dev(dev);
}

/*
 * Module load: sanity-check the global parameters, set up the shared tag
 * set if requested, register the configfs subsystem and the "nullb" block
 * major, then create the initial nr_devices devices. Failures tear down
 * in reverse order.
 */
static int __init null_init(void)
{
	int ret = 0;
	unsigned int i;
	struct nullb *nullb;

	if (g_bs > PAGE_SIZE) {
		pr_warn("invalid block size\n");
		pr_warn("defaults block size to %lu\n", PAGE_SIZE);
		g_bs = PAGE_SIZE;
	}

	if (g_max_sectors > BLK_DEF_MAX_SECTORS) {
		pr_warn("invalid max sectors\n");
		pr_warn("defaults max sectors to %u\n", BLK_DEF_MAX_SECTORS);
		g_max_sectors = BLK_DEF_MAX_SECTORS;
	}

	if (g_home_node != NUMA_NO_NODE && g_home_node >= nr_online_nodes) {
		pr_err("invalid home_node value\n");
		g_home_node = NUMA_NO_NODE;
	}

	if (g_queue_mode == NULL_Q_RQ) {
		pr_err("legacy IO path is no longer available\n");
		return -EINVAL;
	}

	if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) {
		if (g_submit_queues != nr_online_nodes) {
			pr_warn("submit_queues param is set to %u.\n",
				nr_online_nodes);
			g_submit_queues = nr_online_nodes;
		}
	} else if (g_submit_queues > nr_cpu_ids) {
		g_submit_queues = nr_cpu_ids;
	} else if (g_submit_queues <= 0) {
		g_submit_queues = 1;
	}

	if (g_queue_mode == NULL_Q_MQ && shared_tags) {
		ret = null_init_tag_set(NULL, &tag_set);
		if (ret)
			return ret;
	}

	config_group_init(&nullb_subsys.su_group);
	mutex_init(&nullb_subsys.su_mutex);

	ret = configfs_register_subsystem(&nullb_subsys);
	if (ret)
		goto err_tagset;

	mutex_init(&lock);

	null_major = register_blkdev(0, "nullb");
	if (null_major < 0) {
		ret = null_major;
		goto err_conf;
	}

	for (i = 0; i < nr_devices; i++) {
		ret = null_create_dev();
		if (ret)
			goto err_dev;
	}

	pr_info("module loaded\n");
	return 0;

err_dev:
	while (!list_empty(&nullb_list)) {
		nullb = list_entry(nullb_list.next, struct nullb, list);
		null_destroy_dev(nullb);
	}
	unregister_blkdev(null_major, "nullb");
err_conf:
	configfs_unregister_subsystem(&nullb_subsys);
err_tagset:
	if (g_queue_mode == NULL_Q_MQ && shared_tags)
		blk_mq_free_tag_set(&tag_set);
	return ret;
}

static void __exit null_exit(void)
{
	struct nullb *nullb;

	configfs_unregister_subsystem(&nullb_subsys);

	unregister_blkdev(null_major, "nullb");

	mutex_lock(&lock);
	while (!list_empty(&nullb_list)) {
		nullb = list_entry(nullb_list.next, struct nullb, list);
		null_destroy_dev(nullb);
	}
	mutex_unlock(&lock);

	if (g_queue_mode == NULL_Q_MQ && shared_tags)
		blk_mq_free_tag_set(&tag_set);
}

module_init(null_init);
module_exit(null_exit);

MODULE_AUTHOR("Jens Axboe <axboe@kernel.dk>");
MODULE_LICENSE("GPL");