1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Network block device - make block devices work over TCP 4 * 5 * Note that you can not swap over this thing, yet. Seems to work but 6 * deadlocks sometimes - you can not swap over TCP in general. 7 * 8 * Copyright 1997-2000, 2008 Pavel Machek <pavel@ucw.cz> 9 * Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com> 10 * 11 * (part of code stolen from loop.c) 12 */ 13 14 #include <linux/major.h> 15 16 #include <linux/blkdev.h> 17 #include <linux/module.h> 18 #include <linux/init.h> 19 #include <linux/sched.h> 20 #include <linux/sched/mm.h> 21 #include <linux/fs.h> 22 #include <linux/bio.h> 23 #include <linux/stat.h> 24 #include <linux/errno.h> 25 #include <linux/file.h> 26 #include <linux/ioctl.h> 27 #include <linux/mutex.h> 28 #include <linux/compiler.h> 29 #include <linux/completion.h> 30 #include <linux/err.h> 31 #include <linux/kernel.h> 32 #include <linux/slab.h> 33 #include <net/sock.h> 34 #include <linux/net.h> 35 #include <linux/kthread.h> 36 #include <linux/types.h> 37 #include <linux/debugfs.h> 38 #include <linux/blk-mq.h> 39 40 #include <linux/uaccess.h> 41 #include <asm/types.h> 42 43 #include <linux/nbd.h> 44 #include <linux/nbd-netlink.h> 45 #include <net/genetlink.h> 46 47 #define CREATE_TRACE_POINTS 48 #include <trace/events/nbd.h> 49 50 static DEFINE_IDR(nbd_index_idr); 51 static DEFINE_MUTEX(nbd_index_mutex); 52 static int nbd_total_devices = 0; 53 54 struct nbd_sock { 55 struct socket *sock; 56 struct mutex tx_lock; 57 struct request *pending; 58 int sent; 59 bool dead; 60 int fallback_index; 61 int cookie; 62 }; 63 64 struct recv_thread_args { 65 struct work_struct work; 66 struct nbd_device *nbd; 67 int index; 68 }; 69 70 struct link_dead_args { 71 struct work_struct work; 72 int index; 73 }; 74 75 #define NBD_RT_TIMEDOUT 0 76 #define NBD_RT_DISCONNECT_REQUESTED 1 77 #define NBD_RT_DISCONNECTED 2 78 #define NBD_RT_HAS_PID_FILE 3 79 #define NBD_RT_HAS_CONFIG_REF 4 80 #define NBD_RT_BOUND 5 81 #define NBD_RT_DISCONNECT_ON_CLOSE 6 82 83 #define NBD_DESTROY_ON_DISCONNECT 0 84 #define NBD_DISCONNECT_REQUESTED 1 85 86 struct nbd_config { 87 u32 flags; 88 unsigned long runtime_flags; 89 u64 dead_conn_timeout; 90 91 struct nbd_sock **socks; 92 int num_connections; 93 atomic_t live_connections; 94 wait_queue_head_t conn_wait; 95 96 atomic_t recv_threads; 97 wait_queue_head_t recv_wq; 98 loff_t blksize; 99 loff_t bytesize; 100 #if IS_ENABLED(CONFIG_DEBUG_FS) 101 struct dentry *dbg_dir; 102 #endif 103 }; 104 105 struct nbd_device { 106 struct blk_mq_tag_set tag_set; 107 108 int index; 109 refcount_t config_refs; 110 refcount_t refs; 111 struct nbd_config *config; 112 struct mutex config_lock; 113 struct gendisk *disk; 114 struct workqueue_struct *recv_workq; 115 116 struct list_head list; 117 struct task_struct *task_recv; 118 struct task_struct *task_setup; 119 120 struct completion *destroy_complete; 121 unsigned long flags; 122 }; 123 124 #define NBD_CMD_REQUEUED 1 125 126 struct nbd_cmd { 127 struct nbd_device *nbd; 128 struct mutex lock; 129 int index; 130 int cookie; 131 int retries; 132 blk_status_t status; 133 unsigned long flags; 134 u32 cmd_cookie; 135 }; 136 137 #if IS_ENABLED(CONFIG_DEBUG_FS) 138 static struct dentry *nbd_dbg_dir; 139 #endif 140 141 #define nbd_name(nbd) ((nbd)->disk->disk_name) 142 143 #define NBD_MAGIC 0x68797548 144 145 #define NBD_DEF_BLKSIZE 1024 146 147 static unsigned int nbds_max = 16; 148 static int max_part = 16; 149 static int part_shift; 150 151 static int nbd_dev_dbg_init(struct nbd_device *nbd); 152 static void nbd_dev_dbg_close(struct nbd_device *nbd); 153 static void nbd_config_put(struct nbd_device *nbd); 154 static void nbd_connect_reply(struct genl_info *info, int index); 155 static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info); 156 static void nbd_dead_link_work(struct work_struct *work); 157 static void nbd_disconnect_and_put(struct nbd_device *nbd); 158 159 static inline struct device *nbd_to_dev(struct nbd_device *nbd) 160 { 161 return disk_to_dev(nbd->disk); 162 } 163 164 static void nbd_requeue_cmd(struct nbd_cmd *cmd) 165 { 166 struct request *req = blk_mq_rq_from_pdu(cmd); 167 168 if (!test_and_set_bit(NBD_CMD_REQUEUED, &cmd->flags)) 169 blk_mq_requeue_request(req, true); 170 } 171 172 #define NBD_COOKIE_BITS 32 173 174 static u64 nbd_cmd_handle(struct nbd_cmd *cmd) 175 { 176 struct request *req = blk_mq_rq_from_pdu(cmd); 177 u32 tag = blk_mq_unique_tag(req); 178 u64 cookie = cmd->cmd_cookie; 179 180 return (cookie << NBD_COOKIE_BITS) | tag; 181 } 182 183 static u32 nbd_handle_to_tag(u64 handle) 184 { 185 return (u32)handle; 186 } 187 188 static u32 nbd_handle_to_cookie(u64 handle) 189 { 190 return (u32)(handle >> NBD_COOKIE_BITS); 191 } 192 193 static const char *nbdcmd_to_ascii(int cmd) 194 { 195 switch (cmd) { 196 case NBD_CMD_READ: return "read"; 197 case NBD_CMD_WRITE: return "write"; 198 case NBD_CMD_DISC: return "disconnect"; 199 case NBD_CMD_FLUSH: return "flush"; 200 case NBD_CMD_TRIM: return "trim/discard"; 201 } 202 return "invalid"; 203 } 204 205 static ssize_t pid_show(struct device *dev, 206 struct device_attribute *attr, char *buf) 207 { 208 struct gendisk *disk = dev_to_disk(dev); 209 struct nbd_device *nbd = (struct nbd_device *)disk->private_data; 210 211 return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv)); 212 } 213 214 static const struct device_attribute pid_attr = { 215 .attr = { .name = "pid", .mode = 0444}, 216 .show = pid_show, 217 }; 218 219 static void nbd_dev_remove(struct nbd_device *nbd) 220 { 221 struct gendisk *disk = nbd->disk; 222 223 if (disk) { 224 del_gendisk(disk); 225 blk_mq_free_tag_set(&nbd->tag_set); 226 blk_cleanup_disk(disk); 227 } 228 229 /* 230 * Place this in the last just before the nbd is freed to 231 * make sure that the disk and the related kobject are also 232 * totally removed to avoid duplicate creation of the same 233 * one. 234 */ 235 if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && nbd->destroy_complete) 236 complete(nbd->destroy_complete); 237 238 kfree(nbd); 239 } 240 241 static void nbd_put(struct nbd_device *nbd) 242 { 243 if (refcount_dec_and_mutex_lock(&nbd->refs, 244 &nbd_index_mutex)) { 245 idr_remove(&nbd_index_idr, nbd->index); 246 nbd_dev_remove(nbd); 247 mutex_unlock(&nbd_index_mutex); 248 } 249 } 250 251 static int nbd_disconnected(struct nbd_config *config) 252 { 253 return test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags) || 254 test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags); 255 } 256 257 static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock, 258 int notify) 259 { 260 if (!nsock->dead && notify && !nbd_disconnected(nbd->config)) { 261 struct link_dead_args *args; 262 args = kmalloc(sizeof(struct link_dead_args), GFP_NOIO); 263 if (args) { 264 INIT_WORK(&args->work, nbd_dead_link_work); 265 args->index = nbd->index; 266 queue_work(system_wq, &args->work); 267 } 268 } 269 if (!nsock->dead) { 270 kernel_sock_shutdown(nsock->sock, SHUT_RDWR); 271 if (atomic_dec_return(&nbd->config->live_connections) == 0) { 272 if (test_and_clear_bit(NBD_RT_DISCONNECT_REQUESTED, 273 &nbd->config->runtime_flags)) { 274 set_bit(NBD_RT_DISCONNECTED, 275 &nbd->config->runtime_flags); 276 dev_info(nbd_to_dev(nbd), 277 "Disconnected due to user request.\n"); 278 } 279 } 280 } 281 nsock->dead = true; 282 nsock->pending = NULL; 283 nsock->sent = 0; 284 } 285 286 static void nbd_size_clear(struct nbd_device *nbd) 287 { 288 if (nbd->config->bytesize) { 289 set_capacity(nbd->disk, 0); 290 kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE); 291 } 292 } 293 294 static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize, 295 loff_t blksize) 296 { 297 if (!blksize) 298 blksize = NBD_DEF_BLKSIZE; 299 if (blksize < 512 || blksize > PAGE_SIZE || !is_power_of_2(blksize)) 300 return -EINVAL; 301 302 nbd->config->bytesize = bytesize; 303 nbd->config->blksize = blksize; 304 305 if (!nbd->task_recv) 306 return 0; 307 308 if (nbd->config->flags & NBD_FLAG_SEND_TRIM) { 309 nbd->disk->queue->limits.discard_granularity = blksize; 310 nbd->disk->queue->limits.discard_alignment = blksize; 311 blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX); 312 } 313 blk_queue_logical_block_size(nbd->disk->queue, blksize); 314 blk_queue_physical_block_size(nbd->disk->queue, blksize); 315 316 if (max_part) 317 set_bit(GD_NEED_PART_SCAN, &nbd->disk->state); 318 if (!set_capacity_and_notify(nbd->disk, bytesize >> 9)) 319 kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE); 320 return 0; 321 } 322 323 static void nbd_complete_rq(struct request *req) 324 { 325 struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req); 326 327 dev_dbg(nbd_to_dev(cmd->nbd), "request %p: %s\n", req, 328 cmd->status ? "failed" : "done"); 329 330 blk_mq_end_request(req, cmd->status); 331 } 332 333 /* 334 * Forcibly shutdown the socket causing all listeners to error 335 */ 336 static void sock_shutdown(struct nbd_device *nbd) 337 { 338 struct nbd_config *config = nbd->config; 339 int i; 340 341 if (config->num_connections == 0) 342 return; 343 if (test_and_set_bit(NBD_RT_DISCONNECTED, &config->runtime_flags)) 344 return; 345 346 for (i = 0; i < config->num_connections; i++) { 347 struct nbd_sock *nsock = config->socks[i]; 348 mutex_lock(&nsock->tx_lock); 349 nbd_mark_nsock_dead(nbd, nsock, 0); 350 mutex_unlock(&nsock->tx_lock); 351 } 352 dev_warn(disk_to_dev(nbd->disk), "shutting down sockets\n"); 353 } 354 355 static u32 req_to_nbd_cmd_type(struct request *req) 356 { 357 switch (req_op(req)) { 358 case REQ_OP_DISCARD: 359 return NBD_CMD_TRIM; 360 case REQ_OP_FLUSH: 361 return NBD_CMD_FLUSH; 362 case REQ_OP_WRITE: 363 return NBD_CMD_WRITE; 364 case REQ_OP_READ: 365 return NBD_CMD_READ; 366 default: 367 return U32_MAX; 368 } 369 } 370 371 static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, 372 bool reserved) 373 { 374 struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req); 375 struct nbd_device *nbd = cmd->nbd; 376 struct nbd_config *config; 377 378 if (!mutex_trylock(&cmd->lock)) 379 return BLK_EH_RESET_TIMER; 380 381 if (!refcount_inc_not_zero(&nbd->config_refs)) { 382 cmd->status = BLK_STS_TIMEOUT; 383 mutex_unlock(&cmd->lock); 384 goto done; 385 } 386 config = nbd->config; 387 388 if (config->num_connections > 1 || 389 (config->num_connections == 1 && nbd->tag_set.timeout)) { 390 dev_err_ratelimited(nbd_to_dev(nbd), 391 "Connection timed out, retrying (%d/%d alive)\n", 392 atomic_read(&config->live_connections), 393 config->num_connections); 394 /* 395 * Hooray we have more connections, requeue this IO, the submit 396 * path will put it on a real connection. Or if only one 397 * connection is configured, the submit path will wait util 398 * a new connection is reconfigured or util dead timeout. 399 */ 400 if (config->socks) { 401 if (cmd->index < config->num_connections) { 402 struct nbd_sock *nsock = 403 config->socks[cmd->index]; 404 mutex_lock(&nsock->tx_lock); 405 /* We can have multiple outstanding requests, so 406 * we don't want to mark the nsock dead if we've 407 * already reconnected with a new socket, so 408 * only mark it dead if its the same socket we 409 * were sent out on. 410 */ 411 if (cmd->cookie == nsock->cookie) 412 nbd_mark_nsock_dead(nbd, nsock, 1); 413 mutex_unlock(&nsock->tx_lock); 414 } 415 mutex_unlock(&cmd->lock); 416 nbd_requeue_cmd(cmd); 417 nbd_config_put(nbd); 418 return BLK_EH_DONE; 419 } 420 } 421 422 if (!nbd->tag_set.timeout) { 423 /* 424 * Userspace sets timeout=0 to disable socket disconnection, 425 * so just warn and reset the timer. 426 */ 427 struct nbd_sock *nsock = config->socks[cmd->index]; 428 cmd->retries++; 429 dev_info(nbd_to_dev(nbd), "Possible stuck request %p: control (%s@%llu,%uB). Runtime %u seconds\n", 430 req, nbdcmd_to_ascii(req_to_nbd_cmd_type(req)), 431 (unsigned long long)blk_rq_pos(req) << 9, 432 blk_rq_bytes(req), (req->timeout / HZ) * cmd->retries); 433 434 mutex_lock(&nsock->tx_lock); 435 if (cmd->cookie != nsock->cookie) { 436 nbd_requeue_cmd(cmd); 437 mutex_unlock(&nsock->tx_lock); 438 mutex_unlock(&cmd->lock); 439 nbd_config_put(nbd); 440 return BLK_EH_DONE; 441 } 442 mutex_unlock(&nsock->tx_lock); 443 mutex_unlock(&cmd->lock); 444 nbd_config_put(nbd); 445 return BLK_EH_RESET_TIMER; 446 } 447 448 dev_err_ratelimited(nbd_to_dev(nbd), "Connection timed out\n"); 449 set_bit(NBD_RT_TIMEDOUT, &config->runtime_flags); 450 cmd->status = BLK_STS_IOERR; 451 mutex_unlock(&cmd->lock); 452 sock_shutdown(nbd); 453 nbd_config_put(nbd); 454 done: 455 blk_mq_complete_request(req); 456 return BLK_EH_DONE; 457 } 458 459 /* 460 * Send or receive packet. 461 */ 462 static int sock_xmit(struct nbd_device *nbd, int index, int send, 463 struct iov_iter *iter, int msg_flags, int *sent) 464 { 465 struct nbd_config *config = nbd->config; 466 struct socket *sock = config->socks[index]->sock; 467 int result; 468 struct msghdr msg; 469 unsigned int noreclaim_flag; 470 471 if (unlikely(!sock)) { 472 dev_err_ratelimited(disk_to_dev(nbd->disk), 473 "Attempted %s on closed socket in sock_xmit\n", 474 (send ? "send" : "recv")); 475 return -EINVAL; 476 } 477 478 msg.msg_iter = *iter; 479 480 noreclaim_flag = memalloc_noreclaim_save(); 481 do { 482 sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC; 483 msg.msg_name = NULL; 484 msg.msg_namelen = 0; 485 msg.msg_control = NULL; 486 msg.msg_controllen = 0; 487 msg.msg_flags = msg_flags | MSG_NOSIGNAL; 488 489 if (send) 490 result = sock_sendmsg(sock, &msg); 491 else 492 result = sock_recvmsg(sock, &msg, msg.msg_flags); 493 494 if (result <= 0) { 495 if (result == 0) 496 result = -EPIPE; /* short read */ 497 break; 498 } 499 if (sent) 500 *sent += result; 501 } while (msg_data_left(&msg)); 502 503 memalloc_noreclaim_restore(noreclaim_flag); 504 505 return result; 506 } 507 508 /* 509 * Different settings for sk->sk_sndtimeo can result in different return values 510 * if there is a signal pending when we enter sendmsg, because reasons? 511 */ 512 static inline int was_interrupted(int result) 513 { 514 return result == -ERESTARTSYS || result == -EINTR; 515 } 516 517 /* always call with the tx_lock held */ 518 static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) 519 { 520 struct request *req = blk_mq_rq_from_pdu(cmd); 521 struct nbd_config *config = nbd->config; 522 struct nbd_sock *nsock = config->socks[index]; 523 int result; 524 struct nbd_request request = {.magic = htonl(NBD_REQUEST_MAGIC)}; 525 struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)}; 526 struct iov_iter from; 527 unsigned long size = blk_rq_bytes(req); 528 struct bio *bio; 529 u64 handle; 530 u32 type; 531 u32 nbd_cmd_flags = 0; 532 int sent = nsock->sent, skip = 0; 533 534 iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request)); 535 536 type = req_to_nbd_cmd_type(req); 537 if (type == U32_MAX) 538 return -EIO; 539 540 if (rq_data_dir(req) == WRITE && 541 (config->flags & NBD_FLAG_READ_ONLY)) { 542 dev_err_ratelimited(disk_to_dev(nbd->disk), 543 "Write on read-only\n"); 544 return -EIO; 545 } 546 547 if (req->cmd_flags & REQ_FUA) 548 nbd_cmd_flags |= NBD_CMD_FLAG_FUA; 549 550 /* We did a partial send previously, and we at least sent the whole 551 * request struct, so just go and send the rest of the pages in the 552 * request. 553 */ 554 if (sent) { 555 if (sent >= sizeof(request)) { 556 skip = sent - sizeof(request); 557 558 /* initialize handle for tracing purposes */ 559 handle = nbd_cmd_handle(cmd); 560 561 goto send_pages; 562 } 563 iov_iter_advance(&from, sent); 564 } else { 565 cmd->cmd_cookie++; 566 } 567 cmd->index = index; 568 cmd->cookie = nsock->cookie; 569 cmd->retries = 0; 570 request.type = htonl(type | nbd_cmd_flags); 571 if (type != NBD_CMD_FLUSH) { 572 request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9); 573 request.len = htonl(size); 574 } 575 handle = nbd_cmd_handle(cmd); 576 memcpy(request.handle, &handle, sizeof(handle)); 577 578 trace_nbd_send_request(&request, nbd->index, blk_mq_rq_from_pdu(cmd)); 579 580 dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n", 581 req, nbdcmd_to_ascii(type), 582 (unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req)); 583 result = sock_xmit(nbd, index, 1, &from, 584 (type == NBD_CMD_WRITE) ? MSG_MORE : 0, &sent); 585 trace_nbd_header_sent(req, handle); 586 if (result <= 0) { 587 if (was_interrupted(result)) { 588 /* If we havne't sent anything we can just return BUSY, 589 * however if we have sent something we need to make 590 * sure we only allow this req to be sent until we are 591 * completely done. 592 */ 593 if (sent) { 594 nsock->pending = req; 595 nsock->sent = sent; 596 } 597 set_bit(NBD_CMD_REQUEUED, &cmd->flags); 598 return BLK_STS_RESOURCE; 599 } 600 dev_err_ratelimited(disk_to_dev(nbd->disk), 601 "Send control failed (result %d)\n", result); 602 return -EAGAIN; 603 } 604 send_pages: 605 if (type != NBD_CMD_WRITE) 606 goto out; 607 608 bio = req->bio; 609 while (bio) { 610 struct bio *next = bio->bi_next; 611 struct bvec_iter iter; 612 struct bio_vec bvec; 613 614 bio_for_each_segment(bvec, bio, iter) { 615 bool is_last = !next && bio_iter_last(bvec, iter); 616 int flags = is_last ? 0 : MSG_MORE; 617 618 dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n", 619 req, bvec.bv_len); 620 iov_iter_bvec(&from, WRITE, &bvec, 1, bvec.bv_len); 621 if (skip) { 622 if (skip >= iov_iter_count(&from)) { 623 skip -= iov_iter_count(&from); 624 continue; 625 } 626 iov_iter_advance(&from, skip); 627 skip = 0; 628 } 629 result = sock_xmit(nbd, index, 1, &from, flags, &sent); 630 if (result <= 0) { 631 if (was_interrupted(result)) { 632 /* We've already sent the header, we 633 * have no choice but to set pending and 634 * return BUSY. 635 */ 636 nsock->pending = req; 637 nsock->sent = sent; 638 set_bit(NBD_CMD_REQUEUED, &cmd->flags); 639 return BLK_STS_RESOURCE; 640 } 641 dev_err(disk_to_dev(nbd->disk), 642 "Send data failed (result %d)\n", 643 result); 644 return -EAGAIN; 645 } 646 /* 647 * The completion might already have come in, 648 * so break for the last one instead of letting 649 * the iterator do it. This prevents use-after-free 650 * of the bio. 651 */ 652 if (is_last) 653 break; 654 } 655 bio = next; 656 } 657 out: 658 trace_nbd_payload_sent(req, handle); 659 nsock->pending = NULL; 660 nsock->sent = 0; 661 return 0; 662 } 663 664 /* NULL returned = something went wrong, inform userspace */ 665 static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index) 666 { 667 struct nbd_config *config = nbd->config; 668 int result; 669 struct nbd_reply reply; 670 struct nbd_cmd *cmd; 671 struct request *req = NULL; 672 u64 handle; 673 u16 hwq; 674 u32 tag; 675 struct kvec iov = {.iov_base = &reply, .iov_len = sizeof(reply)}; 676 struct iov_iter to; 677 int ret = 0; 678 679 reply.magic = 0; 680 iov_iter_kvec(&to, READ, &iov, 1, sizeof(reply)); 681 result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL); 682 if (result <= 0) { 683 if (!nbd_disconnected(config)) 684 dev_err(disk_to_dev(nbd->disk), 685 "Receive control failed (result %d)\n", result); 686 return ERR_PTR(result); 687 } 688 689 if (ntohl(reply.magic) != NBD_REPLY_MAGIC) { 690 dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n", 691 (unsigned long)ntohl(reply.magic)); 692 return ERR_PTR(-EPROTO); 693 } 694 695 memcpy(&handle, reply.handle, sizeof(handle)); 696 tag = nbd_handle_to_tag(handle); 697 hwq = blk_mq_unique_tag_to_hwq(tag); 698 if (hwq < nbd->tag_set.nr_hw_queues) 699 req = blk_mq_tag_to_rq(nbd->tag_set.tags[hwq], 700 blk_mq_unique_tag_to_tag(tag)); 701 if (!req || !blk_mq_request_started(req)) { 702 dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%d) %p\n", 703 tag, req); 704 return ERR_PTR(-ENOENT); 705 } 706 trace_nbd_header_received(req, handle); 707 cmd = blk_mq_rq_to_pdu(req); 708 709 mutex_lock(&cmd->lock); 710 if (cmd->cmd_cookie != nbd_handle_to_cookie(handle)) { 711 dev_err(disk_to_dev(nbd->disk), "Double reply on req %p, cmd_cookie %u, handle cookie %u\n", 712 req, cmd->cmd_cookie, nbd_handle_to_cookie(handle)); 713 ret = -ENOENT; 714 goto out; 715 } 716 if (cmd->status != BLK_STS_OK) { 717 dev_err(disk_to_dev(nbd->disk), "Command already handled %p\n", 718 req); 719 ret = -ENOENT; 720 goto out; 721 } 722 if (test_bit(NBD_CMD_REQUEUED, &cmd->flags)) { 723 dev_err(disk_to_dev(nbd->disk), "Raced with timeout on req %p\n", 724 req); 725 ret = -ENOENT; 726 goto out; 727 } 728 if (ntohl(reply.error)) { 729 dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n", 730 ntohl(reply.error)); 731 cmd->status = BLK_STS_IOERR; 732 goto out; 733 } 734 735 dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", req); 736 if (rq_data_dir(req) != WRITE) { 737 struct req_iterator iter; 738 struct bio_vec bvec; 739 740 rq_for_each_segment(bvec, req, iter) { 741 iov_iter_bvec(&to, READ, &bvec, 1, bvec.bv_len); 742 result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL); 743 if (result <= 0) { 744 dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n", 745 result); 746 /* 747 * If we've disconnected, we need to make sure we 748 * complete this request, otherwise error out 749 * and let the timeout stuff handle resubmitting 750 * this request onto another connection. 751 */ 752 if (nbd_disconnected(config)) { 753 cmd->status = BLK_STS_IOERR; 754 goto out; 755 } 756 ret = -EIO; 757 goto out; 758 } 759 dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n", 760 req, bvec.bv_len); 761 } 762 } 763 out: 764 trace_nbd_payload_received(req, handle); 765 mutex_unlock(&cmd->lock); 766 return ret ? ERR_PTR(ret) : cmd; 767 } 768 769 static void recv_work(struct work_struct *work) 770 { 771 struct recv_thread_args *args = container_of(work, 772 struct recv_thread_args, 773 work); 774 struct nbd_device *nbd = args->nbd; 775 struct nbd_config *config = nbd->config; 776 struct nbd_cmd *cmd; 777 struct request *rq; 778 779 while (1) { 780 cmd = nbd_read_stat(nbd, args->index); 781 if (IS_ERR(cmd)) { 782 struct nbd_sock *nsock = config->socks[args->index]; 783 784 mutex_lock(&nsock->tx_lock); 785 nbd_mark_nsock_dead(nbd, nsock, 1); 786 mutex_unlock(&nsock->tx_lock); 787 break; 788 } 789 790 rq = blk_mq_rq_from_pdu(cmd); 791 if (likely(!blk_should_fake_timeout(rq->q))) 792 blk_mq_complete_request(rq); 793 } 794 nbd_config_put(nbd); 795 atomic_dec(&config->recv_threads); 796 wake_up(&config->recv_wq); 797 kfree(args); 798 } 799 800 static bool nbd_clear_req(struct request *req, void *data, bool reserved) 801 { 802 struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req); 803 804 mutex_lock(&cmd->lock); 805 cmd->status = BLK_STS_IOERR; 806 mutex_unlock(&cmd->lock); 807 808 blk_mq_complete_request(req); 809 return true; 810 } 811 812 static void nbd_clear_que(struct nbd_device *nbd) 813 { 814 blk_mq_quiesce_queue(nbd->disk->queue); 815 blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL); 816 blk_mq_unquiesce_queue(nbd->disk->queue); 817 dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n"); 818 } 819 820 static int find_fallback(struct nbd_device *nbd, int index) 821 { 822 struct nbd_config *config = nbd->config; 823 int new_index = -1; 824 struct nbd_sock *nsock = config->socks[index]; 825 int fallback = nsock->fallback_index; 826 827 if (test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags)) 828 return new_index; 829 830 if (config->num_connections <= 1) { 831 dev_err_ratelimited(disk_to_dev(nbd->disk), 832 "Dead connection, failed to find a fallback\n"); 833 return new_index; 834 } 835 836 if (fallback >= 0 && fallback < config->num_connections && 837 !config->socks[fallback]->dead) 838 return fallback; 839 840 if (nsock->fallback_index < 0 || 841 nsock->fallback_index >= config->num_connections || 842 config->socks[nsock->fallback_index]->dead) { 843 int i; 844 for (i = 0; i < config->num_connections; i++) { 845 if (i == index) 846 continue; 847 if (!config->socks[i]->dead) { 848 new_index = i; 849 break; 850 } 851 } 852 nsock->fallback_index = new_index; 853 if (new_index < 0) { 854 dev_err_ratelimited(disk_to_dev(nbd->disk), 855 "Dead connection, failed to find a fallback\n"); 856 return new_index; 857 } 858 } 859 new_index = nsock->fallback_index; 860 return new_index; 861 } 862 863 static int wait_for_reconnect(struct nbd_device *nbd) 864 { 865 struct nbd_config *config = nbd->config; 866 if (!config->dead_conn_timeout) 867 return 0; 868 if (test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags)) 869 return 0; 870 return wait_event_timeout(config->conn_wait, 871 atomic_read(&config->live_connections) > 0, 872 config->dead_conn_timeout) > 0; 873 } 874 875 static int nbd_handle_cmd(struct nbd_cmd *cmd, int index) 876 { 877 struct request *req = blk_mq_rq_from_pdu(cmd); 878 struct nbd_device *nbd = cmd->nbd; 879 struct nbd_config *config; 880 struct nbd_sock *nsock; 881 int ret; 882 883 if (!refcount_inc_not_zero(&nbd->config_refs)) { 884 dev_err_ratelimited(disk_to_dev(nbd->disk), 885 "Socks array is empty\n"); 886 blk_mq_start_request(req); 887 return -EINVAL; 888 } 889 config = nbd->config; 890 891 if (index >= config->num_connections) { 892 dev_err_ratelimited(disk_to_dev(nbd->disk), 893 "Attempted send on invalid socket\n"); 894 nbd_config_put(nbd); 895 blk_mq_start_request(req); 896 return -EINVAL; 897 } 898 cmd->status = BLK_STS_OK; 899 again: 900 nsock = config->socks[index]; 901 mutex_lock(&nsock->tx_lock); 902 if (nsock->dead) { 903 int old_index = index; 904 index = find_fallback(nbd, index); 905 mutex_unlock(&nsock->tx_lock); 906 if (index < 0) { 907 if (wait_for_reconnect(nbd)) { 908 index = old_index; 909 goto again; 910 } 911 /* All the sockets should already be down at this point, 912 * we just want to make sure that DISCONNECTED is set so 913 * any requests that come in that were queue'ed waiting 914 * for the reconnect timer don't trigger the timer again 915 * and instead just error out. 916 */ 917 sock_shutdown(nbd); 918 nbd_config_put(nbd); 919 blk_mq_start_request(req); 920 return -EIO; 921 } 922 goto again; 923 } 924 925 /* Handle the case that we have a pending request that was partially 926 * transmitted that _has_ to be serviced first. We need to call requeue 927 * here so that it gets put _after_ the request that is already on the 928 * dispatch list. 929 */ 930 blk_mq_start_request(req); 931 if (unlikely(nsock->pending && nsock->pending != req)) { 932 nbd_requeue_cmd(cmd); 933 ret = 0; 934 goto out; 935 } 936 /* 937 * Some failures are related to the link going down, so anything that 938 * returns EAGAIN can be retried on a different socket. 939 */ 940 ret = nbd_send_cmd(nbd, cmd, index); 941 if (ret == -EAGAIN) { 942 dev_err_ratelimited(disk_to_dev(nbd->disk), 943 "Request send failed, requeueing\n"); 944 nbd_mark_nsock_dead(nbd, nsock, 1); 945 nbd_requeue_cmd(cmd); 946 ret = 0; 947 } 948 out: 949 mutex_unlock(&nsock->tx_lock); 950 nbd_config_put(nbd); 951 return ret; 952 } 953 954 static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx, 955 const struct blk_mq_queue_data *bd) 956 { 957 struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); 958 int ret; 959 960 /* 961 * Since we look at the bio's to send the request over the network we 962 * need to make sure the completion work doesn't mark this request done 963 * before we are done doing our send. This keeps us from dereferencing 964 * freed data if we have particularly fast completions (ie we get the 965 * completion before we exit sock_xmit on the last bvec) or in the case 966 * that the server is misbehaving (or there was an error) before we're 967 * done sending everything over the wire. 968 */ 969 mutex_lock(&cmd->lock); 970 clear_bit(NBD_CMD_REQUEUED, &cmd->flags); 971 972 /* We can be called directly from the user space process, which means we 973 * could possibly have signals pending so our sendmsg will fail. In 974 * this case we need to return that we are busy, otherwise error out as 975 * appropriate. 976 */ 977 ret = nbd_handle_cmd(cmd, hctx->queue_num); 978 if (ret < 0) 979 ret = BLK_STS_IOERR; 980 else if (!ret) 981 ret = BLK_STS_OK; 982 mutex_unlock(&cmd->lock); 983 984 return ret; 985 } 986 987 static struct socket *nbd_get_socket(struct nbd_device *nbd, unsigned long fd, 988 int *err) 989 { 990 struct socket *sock; 991 992 *err = 0; 993 sock = sockfd_lookup(fd, err); 994 if (!sock) 995 return NULL; 996 997 if (sock->ops->shutdown == sock_no_shutdown) { 998 dev_err(disk_to_dev(nbd->disk), "Unsupported socket: shutdown callout must be supported.\n"); 999 *err = -EINVAL; 1000 sockfd_put(sock); 1001 return NULL; 1002 } 1003 1004 return sock; 1005 } 1006 1007 static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg, 1008 bool netlink) 1009 { 1010 struct nbd_config *config = nbd->config; 1011 struct socket *sock; 1012 struct nbd_sock **socks; 1013 struct nbd_sock *nsock; 1014 int err; 1015 1016 sock = nbd_get_socket(nbd, arg, &err); 1017 if (!sock) 1018 return err; 1019 1020 /* 1021 * We need to make sure we don't get any errant requests while we're 1022 * reallocating the ->socks array. 1023 */ 1024 blk_mq_freeze_queue(nbd->disk->queue); 1025 1026 if (!netlink && !nbd->task_setup && 1027 !test_bit(NBD_RT_BOUND, &config->runtime_flags)) 1028 nbd->task_setup = current; 1029 1030 if (!netlink && 1031 (nbd->task_setup != current || 1032 test_bit(NBD_RT_BOUND, &config->runtime_flags))) { 1033 dev_err(disk_to_dev(nbd->disk), 1034 "Device being setup by another task"); 1035 err = -EBUSY; 1036 goto put_socket; 1037 } 1038 1039 nsock = kzalloc(sizeof(*nsock), GFP_KERNEL); 1040 if (!nsock) { 1041 err = -ENOMEM; 1042 goto put_socket; 1043 } 1044 1045 socks = krealloc(config->socks, (config->num_connections + 1) * 1046 sizeof(struct nbd_sock *), GFP_KERNEL); 1047 if (!socks) { 1048 kfree(nsock); 1049 err = -ENOMEM; 1050 goto put_socket; 1051 } 1052 1053 config->socks = socks; 1054 1055 nsock->fallback_index = -1; 1056 nsock->dead = false; 1057 mutex_init(&nsock->tx_lock); 1058 nsock->sock = sock; 1059 nsock->pending = NULL; 1060 nsock->sent = 0; 1061 nsock->cookie = 0; 1062 socks[config->num_connections++] = nsock; 1063 atomic_inc(&config->live_connections); 1064 blk_mq_unfreeze_queue(nbd->disk->queue); 1065 1066 return 0; 1067 1068 put_socket: 1069 blk_mq_unfreeze_queue(nbd->disk->queue); 1070 sockfd_put(sock); 1071 return err; 1072 } 1073 1074 static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg) 1075 { 1076 struct nbd_config *config = nbd->config; 1077 struct socket *sock, *old; 1078 struct recv_thread_args *args; 1079 int i; 1080 int err; 1081 1082 sock = nbd_get_socket(nbd, arg, &err); 1083 if (!sock) 1084 return err; 1085 1086 args = kzalloc(sizeof(*args), GFP_KERNEL); 1087 if (!args) { 1088 sockfd_put(sock); 1089 return -ENOMEM; 1090 } 1091 1092 for (i = 0; i < config->num_connections; i++) { 1093 struct nbd_sock *nsock = config->socks[i]; 1094 1095 if (!nsock->dead) 1096 continue; 1097 1098 mutex_lock(&nsock->tx_lock); 1099 if (!nsock->dead) { 1100 mutex_unlock(&nsock->tx_lock); 1101 continue; 1102 } 1103 sk_set_memalloc(sock->sk); 1104 if (nbd->tag_set.timeout) 1105 sock->sk->sk_sndtimeo = nbd->tag_set.timeout; 1106 atomic_inc(&config->recv_threads); 1107 refcount_inc(&nbd->config_refs); 1108 old = nsock->sock; 1109 nsock->fallback_index = -1; 1110 nsock->sock = sock; 1111 nsock->dead = false; 1112 INIT_WORK(&args->work, recv_work); 1113 args->index = i; 1114 args->nbd = nbd; 1115 nsock->cookie++; 1116 mutex_unlock(&nsock->tx_lock); 1117 sockfd_put(old); 1118 1119 clear_bit(NBD_RT_DISCONNECTED, &config->runtime_flags); 1120 1121 /* We take the tx_mutex in an error path in the recv_work, so we 1122 * need to queue_work outside of the tx_mutex. 1123 */ 1124 queue_work(nbd->recv_workq, &args->work); 1125 1126 atomic_inc(&config->live_connections); 1127 wake_up(&config->conn_wait); 1128 return 0; 1129 } 1130 sockfd_put(sock); 1131 kfree(args); 1132 return -ENOSPC; 1133 } 1134 1135 static void nbd_bdev_reset(struct block_device *bdev) 1136 { 1137 if (bdev->bd_openers > 1) 1138 return; 1139 set_capacity(bdev->bd_disk, 0); 1140 } 1141 1142 static void nbd_parse_flags(struct nbd_device *nbd) 1143 { 1144 struct nbd_config *config = nbd->config; 1145 if (config->flags & NBD_FLAG_READ_ONLY) 1146 set_disk_ro(nbd->disk, true); 1147 else 1148 set_disk_ro(nbd->disk, false); 1149 if (config->flags & NBD_FLAG_SEND_TRIM) 1150 blk_queue_flag_set(QUEUE_FLAG_DISCARD, nbd->disk->queue); 1151 if (config->flags & NBD_FLAG_SEND_FLUSH) { 1152 if (config->flags & NBD_FLAG_SEND_FUA) 1153 blk_queue_write_cache(nbd->disk->queue, true, true); 1154 else 1155 blk_queue_write_cache(nbd->disk->queue, true, false); 1156 } 1157 else 1158 blk_queue_write_cache(nbd->disk->queue, false, false); 1159 } 1160 1161 static void send_disconnects(struct nbd_device *nbd) 1162 { 1163 struct nbd_config *config = nbd->config; 1164 struct nbd_request request = { 1165 .magic = htonl(NBD_REQUEST_MAGIC), 1166 .type = htonl(NBD_CMD_DISC), 1167 }; 1168 struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)}; 1169 struct iov_iter from; 1170 int i, ret; 1171 1172 for (i = 0; i < config->num_connections; i++) { 1173 struct nbd_sock *nsock = config->socks[i]; 1174 1175 iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request)); 1176 mutex_lock(&nsock->tx_lock); 1177 ret = sock_xmit(nbd, i, 1, &from, 0, NULL); 1178 if (ret <= 0) 1179 dev_err(disk_to_dev(nbd->disk), 1180 "Send disconnect failed %d\n", ret); 1181 mutex_unlock(&nsock->tx_lock); 1182 } 1183 } 1184 1185 static int nbd_disconnect(struct nbd_device *nbd) 1186 { 1187 struct nbd_config *config = nbd->config; 1188 1189 dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n"); 1190 set_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags); 1191 set_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags); 1192 send_disconnects(nbd); 1193 return 0; 1194 } 1195 1196 static void nbd_clear_sock(struct nbd_device *nbd) 1197 { 1198 sock_shutdown(nbd); 1199 nbd_clear_que(nbd); 1200 nbd->task_setup = NULL; 1201 } 1202 1203 static void nbd_config_put(struct nbd_device *nbd) 1204 { 1205 if (refcount_dec_and_mutex_lock(&nbd->config_refs, 1206 &nbd->config_lock)) { 1207 struct nbd_config *config = nbd->config; 1208 nbd_dev_dbg_close(nbd); 1209 nbd_size_clear(nbd); 1210 if (test_and_clear_bit(NBD_RT_HAS_PID_FILE, 1211 &config->runtime_flags)) 1212 device_remove_file(disk_to_dev(nbd->disk), &pid_attr); 1213 nbd->task_recv = NULL; 1214 nbd_clear_sock(nbd); 1215 if (config->num_connections) { 1216 int i; 1217 for (i = 0; i < config->num_connections; i++) { 1218 sockfd_put(config->socks[i]->sock); 1219 kfree(config->socks[i]); 1220 } 1221 kfree(config->socks); 1222 } 1223 kfree(nbd->config); 1224 nbd->config = NULL; 1225 1226 if (nbd->recv_workq) 1227 destroy_workqueue(nbd->recv_workq); 1228 nbd->recv_workq = NULL; 1229 1230 nbd->tag_set.timeout = 0; 1231 nbd->disk->queue->limits.discard_granularity = 0; 1232 nbd->disk->queue->limits.discard_alignment = 0; 1233 blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX); 1234 blk_queue_flag_clear(QUEUE_FLAG_DISCARD, nbd->disk->queue); 1235 1236 mutex_unlock(&nbd->config_lock); 1237 nbd_put(nbd); 1238 module_put(THIS_MODULE); 1239 } 1240 } 1241 1242 static int nbd_start_device(struct nbd_device *nbd) 1243 { 1244 struct nbd_config *config = nbd->config; 1245 int num_connections = config->num_connections; 1246 int error = 0, i; 1247 1248 if (nbd->task_recv) 1249 return -EBUSY; 1250 if (!config->socks) 1251 return -EINVAL; 1252 if (num_connections > 1 && 1253 !(config->flags & NBD_FLAG_CAN_MULTI_CONN)) { 1254 dev_err(disk_to_dev(nbd->disk), "server does not support multiple connections per device.\n"); 1255 return -EINVAL; 1256 } 1257 1258 nbd->recv_workq = alloc_workqueue("knbd%d-recv", 1259 WQ_MEM_RECLAIM | WQ_HIGHPRI | 1260 WQ_UNBOUND, 0, nbd->index); 1261 if (!nbd->recv_workq) { 1262 dev_err(disk_to_dev(nbd->disk), "Could not allocate knbd recv work queue.\n"); 1263 return -ENOMEM; 1264 } 1265 1266 blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections); 1267 nbd->task_recv = current; 1268 1269 nbd_parse_flags(nbd); 1270 1271 error = device_create_file(disk_to_dev(nbd->disk), &pid_attr); 1272 if (error) { 1273 dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n"); 1274 return error; 1275 } 1276 set_bit(NBD_RT_HAS_PID_FILE, &config->runtime_flags); 1277 1278 nbd_dev_dbg_init(nbd); 1279 for (i = 0; i < num_connections; i++) { 1280 struct recv_thread_args *args; 1281 1282 args = kzalloc(sizeof(*args), GFP_KERNEL); 1283 if (!args) { 1284 sock_shutdown(nbd); 1285 /* 1286 * If num_connections is m (2 < m), 1287 * and NO.1 ~ NO.n(1 < n < m) kzallocs are successful. 1288 * But NO.(n + 1) failed. We still have n recv threads. 1289 * So, add flush_workqueue here to prevent recv threads 1290 * dropping the last config_refs and trying to destroy 1291 * the workqueue from inside the workqueue. 1292 */ 1293 if (i) 1294 flush_workqueue(nbd->recv_workq); 1295 return -ENOMEM; 1296 } 1297 sk_set_memalloc(config->socks[i]->sock->sk); 1298 if (nbd->tag_set.timeout) 1299 config->socks[i]->sock->sk->sk_sndtimeo = 1300 nbd->tag_set.timeout; 1301 atomic_inc(&config->recv_threads); 1302 refcount_inc(&nbd->config_refs); 1303 INIT_WORK(&args->work, recv_work); 1304 args->nbd = nbd; 1305 args->index = i; 1306 queue_work(nbd->recv_workq, &args->work); 1307 } 1308 return nbd_set_size(nbd, config->bytesize, config->blksize); 1309 } 1310 1311 static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *bdev) 1312 { 1313 struct nbd_config *config = nbd->config; 1314 int ret; 1315 1316 ret = nbd_start_device(nbd); 1317 if (ret) 1318 return ret; 1319 1320 if (max_part) 1321 set_bit(GD_NEED_PART_SCAN, &nbd->disk->state); 1322 mutex_unlock(&nbd->config_lock); 1323 ret = wait_event_interruptible(config->recv_wq, 1324 atomic_read(&config->recv_threads) == 0); 1325 if (ret) 1326 sock_shutdown(nbd); 1327 flush_workqueue(nbd->recv_workq); 1328 1329 mutex_lock(&nbd->config_lock); 1330 nbd_bdev_reset(bdev); 1331 /* user requested, ignore socket errors */ 1332 if (test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags)) 1333 ret = 0; 1334 if (test_bit(NBD_RT_TIMEDOUT, &config->runtime_flags)) 1335 ret = -ETIMEDOUT; 1336 return ret; 1337 } 1338 1339 static void nbd_clear_sock_ioctl(struct nbd_device *nbd, 1340 struct block_device *bdev) 1341 { 1342 sock_shutdown(nbd); 1343 __invalidate_device(bdev, true); 1344 nbd_bdev_reset(bdev); 1345 if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF, 1346 &nbd->config->runtime_flags)) 1347 nbd_config_put(nbd); 1348 } 1349 1350 static void nbd_set_cmd_timeout(struct nbd_device *nbd, u64 timeout) 1351 { 1352 nbd->tag_set.timeout = timeout * HZ; 1353 if (timeout) 1354 blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ); 1355 else 1356 blk_queue_rq_timeout(nbd->disk->queue, 30 * HZ); 1357 } 1358 1359 /* Must be called with config_lock held */ 1360 static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, 1361 unsigned int cmd, unsigned long arg) 1362 { 1363 struct nbd_config *config = nbd->config; 1364 1365 switch (cmd) { 1366 case NBD_DISCONNECT: 1367 return nbd_disconnect(nbd); 1368 case NBD_CLEAR_SOCK: 1369 nbd_clear_sock_ioctl(nbd, bdev); 1370 return 0; 1371 case NBD_SET_SOCK: 1372 return nbd_add_socket(nbd, arg, false); 1373 case NBD_SET_BLKSIZE: 1374 return nbd_set_size(nbd, config->bytesize, arg); 1375 case NBD_SET_SIZE: 1376 return nbd_set_size(nbd, arg, config->blksize); 1377 case NBD_SET_SIZE_BLOCKS: 1378 return nbd_set_size(nbd, arg * config->blksize, 1379 config->blksize); 1380 case NBD_SET_TIMEOUT: 1381 nbd_set_cmd_timeout(nbd, arg); 1382 return 0; 1383 1384 case NBD_SET_FLAGS: 1385 config->flags = arg; 1386 return 0; 1387 case NBD_DO_IT: 1388 return nbd_start_device_ioctl(nbd, bdev); 1389 case NBD_CLEAR_QUE: 1390 /* 1391 * This is for compatibility only. The queue is always cleared 1392 * by NBD_DO_IT or NBD_CLEAR_SOCK. 1393 */ 1394 return 0; 1395 case NBD_PRINT_DEBUG: 1396 /* 1397 * For compatibility only, we no longer keep a list of 1398 * outstanding requests. 1399 */ 1400 return 0; 1401 } 1402 return -ENOTTY; 1403 } 1404 1405 static int nbd_ioctl(struct block_device *bdev, fmode_t mode, 1406 unsigned int cmd, unsigned long arg) 1407 { 1408 struct nbd_device *nbd = bdev->bd_disk->private_data; 1409 struct nbd_config *config = nbd->config; 1410 int error = -EINVAL; 1411 1412 if (!capable(CAP_SYS_ADMIN)) 1413 return -EPERM; 1414 1415 /* The block layer will pass back some non-nbd ioctls in case we have 1416 * special handling for them, but we don't so just return an error. 1417 */ 1418 if (_IOC_TYPE(cmd) != 0xab) 1419 return -EINVAL; 1420 1421 mutex_lock(&nbd->config_lock); 1422 1423 /* Don't allow ioctl operations on a nbd device that was created with 1424 * netlink, unless it's DISCONNECT or CLEAR_SOCK, which are fine. 1425 */ 1426 if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) || 1427 (cmd == NBD_DISCONNECT || cmd == NBD_CLEAR_SOCK)) 1428 error = __nbd_ioctl(bdev, nbd, cmd, arg); 1429 else 1430 dev_err(nbd_to_dev(nbd), "Cannot use ioctl interface on a netlink controlled device.\n"); 1431 mutex_unlock(&nbd->config_lock); 1432 return error; 1433 } 1434 1435 static struct nbd_config *nbd_alloc_config(void) 1436 { 1437 struct nbd_config *config; 1438 1439 config = kzalloc(sizeof(struct nbd_config), GFP_NOFS); 1440 if (!config) 1441 return NULL; 1442 atomic_set(&config->recv_threads, 0); 1443 init_waitqueue_head(&config->recv_wq); 1444 init_waitqueue_head(&config->conn_wait); 1445 config->blksize = NBD_DEF_BLKSIZE; 1446 atomic_set(&config->live_connections, 0); 1447 try_module_get(THIS_MODULE); 1448 return config; 1449 } 1450 1451 static int nbd_open(struct block_device *bdev, fmode_t mode) 1452 { 1453 struct nbd_device *nbd; 1454 int ret = 0; 1455 1456 mutex_lock(&nbd_index_mutex); 1457 nbd = bdev->bd_disk->private_data; 1458 if (!nbd) { 1459 ret = -ENXIO; 1460 goto out; 1461 } 1462 if (!refcount_inc_not_zero(&nbd->refs)) { 1463 ret = -ENXIO; 1464 goto out; 1465 } 1466 if (!refcount_inc_not_zero(&nbd->config_refs)) { 1467 struct nbd_config *config; 1468 1469 mutex_lock(&nbd->config_lock); 1470 if (refcount_inc_not_zero(&nbd->config_refs)) { 1471 mutex_unlock(&nbd->config_lock); 1472 goto out; 1473 } 1474 config = nbd->config = nbd_alloc_config(); 1475 if (!config) { 1476 ret = -ENOMEM; 1477 mutex_unlock(&nbd->config_lock); 1478 goto out; 1479 } 1480 refcount_set(&nbd->config_refs, 1); 1481 refcount_inc(&nbd->refs); 1482 mutex_unlock(&nbd->config_lock); 1483 if (max_part) 1484 set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); 1485 } else if (nbd_disconnected(nbd->config)) { 1486 if (max_part) 1487 set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); 1488 } 1489 out: 1490 mutex_unlock(&nbd_index_mutex); 1491 return ret; 1492 } 1493 1494 static void nbd_release(struct gendisk *disk, fmode_t mode) 1495 { 1496 struct nbd_device *nbd = disk->private_data; 1497 1498 if (test_bit(NBD_RT_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) && 1499 disk->part0->bd_openers == 0) 1500 nbd_disconnect_and_put(nbd); 1501 1502 nbd_config_put(nbd); 1503 nbd_put(nbd); 1504 } 1505 1506 static const struct block_device_operations nbd_fops = 1507 { 1508 .owner = THIS_MODULE, 1509 .open = nbd_open, 1510 .release = nbd_release, 1511 .ioctl = nbd_ioctl, 1512 .compat_ioctl = nbd_ioctl, 1513 }; 1514 1515 #if IS_ENABLED(CONFIG_DEBUG_FS) 1516 1517 static int nbd_dbg_tasks_show(struct seq_file *s, void *unused) 1518 { 1519 struct nbd_device *nbd = s->private; 1520 1521 if (nbd->task_recv) 1522 seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv)); 1523 1524 return 0; 1525 } 1526 1527 DEFINE_SHOW_ATTRIBUTE(nbd_dbg_tasks); 1528 1529 static int nbd_dbg_flags_show(struct seq_file *s, void *unused) 1530 { 1531 struct nbd_device *nbd = s->private; 1532 u32 flags = nbd->config->flags; 1533 1534 seq_printf(s, "Hex: 0x%08x\n\n", flags); 1535 1536 seq_puts(s, "Known flags:\n"); 1537 1538 if (flags & NBD_FLAG_HAS_FLAGS) 1539 seq_puts(s, "NBD_FLAG_HAS_FLAGS\n"); 1540 if (flags & NBD_FLAG_READ_ONLY) 1541 seq_puts(s, "NBD_FLAG_READ_ONLY\n"); 1542 if (flags & NBD_FLAG_SEND_FLUSH) 1543 seq_puts(s, "NBD_FLAG_SEND_FLUSH\n"); 1544 if (flags & NBD_FLAG_SEND_FUA) 1545 seq_puts(s, "NBD_FLAG_SEND_FUA\n"); 1546 if (flags & NBD_FLAG_SEND_TRIM) 1547 seq_puts(s, "NBD_FLAG_SEND_TRIM\n"); 1548 1549 return 0; 1550 } 1551 1552 DEFINE_SHOW_ATTRIBUTE(nbd_dbg_flags); 1553 1554 static int nbd_dev_dbg_init(struct nbd_device *nbd) 1555 { 1556 struct dentry *dir; 1557 struct nbd_config *config = nbd->config; 1558 1559 if (!nbd_dbg_dir) 1560 return -EIO; 1561 1562 dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir); 1563 if (!dir) { 1564 dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s'\n", 1565 nbd_name(nbd)); 1566 return -EIO; 1567 } 1568 config->dbg_dir = dir; 1569 1570 debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_fops); 1571 debugfs_create_u64("size_bytes", 0444, dir, &config->bytesize); 1572 debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout); 1573 debugfs_create_u64("blocksize", 0444, dir, &config->blksize); 1574 debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_fops); 1575 1576 return 0; 1577 } 1578 1579 static void nbd_dev_dbg_close(struct nbd_device *nbd) 1580 { 1581 debugfs_remove_recursive(nbd->config->dbg_dir); 1582 } 1583 1584 static int nbd_dbg_init(void) 1585 { 1586 struct dentry *dbg_dir; 1587 1588 dbg_dir = debugfs_create_dir("nbd", NULL); 1589 if (!dbg_dir) 1590 return -EIO; 1591 1592 nbd_dbg_dir = dbg_dir; 1593 1594 return 0; 1595 } 1596 1597 static void nbd_dbg_close(void) 1598 { 1599 debugfs_remove_recursive(nbd_dbg_dir); 1600 } 1601 1602 #else /* IS_ENABLED(CONFIG_DEBUG_FS) */ 1603 1604 static int nbd_dev_dbg_init(struct nbd_device *nbd) 1605 { 1606 return 0; 1607 } 1608 1609 static void nbd_dev_dbg_close(struct nbd_device *nbd) 1610 { 1611 } 1612 1613 static int nbd_dbg_init(void) 1614 { 1615 return 0; 1616 } 1617 1618 static void nbd_dbg_close(void) 1619 { 1620 } 1621 1622 #endif 1623 1624 static int nbd_init_request(struct blk_mq_tag_set *set, struct request *rq, 1625 unsigned int hctx_idx, unsigned int numa_node) 1626 { 1627 struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq); 1628 cmd->nbd = set->driver_data; 1629 cmd->flags = 0; 1630 mutex_init(&cmd->lock); 1631 return 0; 1632 } 1633 1634 static const struct blk_mq_ops nbd_mq_ops = { 1635 .queue_rq = nbd_queue_rq, 1636 .complete = nbd_complete_rq, 1637 .init_request = nbd_init_request, 1638 .timeout = nbd_xmit_timeout, 1639 }; 1640 1641 static int nbd_dev_add(int index) 1642 { 1643 struct nbd_device *nbd; 1644 struct gendisk *disk; 1645 int err = -ENOMEM; 1646 1647 nbd = kzalloc(sizeof(struct nbd_device), GFP_KERNEL); 1648 if (!nbd) 1649 goto out; 1650 1651 nbd->tag_set.ops = &nbd_mq_ops; 1652 nbd->tag_set.nr_hw_queues = 1; 1653 nbd->tag_set.queue_depth = 128; 1654 nbd->tag_set.numa_node = NUMA_NO_NODE; 1655 nbd->tag_set.cmd_size = sizeof(struct nbd_cmd); 1656 nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | 1657 BLK_MQ_F_BLOCKING; 1658 nbd->tag_set.driver_data = nbd; 1659 nbd->destroy_complete = NULL; 1660 1661 err = blk_mq_alloc_tag_set(&nbd->tag_set); 1662 if (err) 1663 goto out_free_nbd; 1664 1665 if (index >= 0) { 1666 err = idr_alloc(&nbd_index_idr, nbd, index, index + 1, 1667 GFP_KERNEL); 1668 if (err == -ENOSPC) 1669 err = -EEXIST; 1670 } else { 1671 err = idr_alloc(&nbd_index_idr, nbd, 0, 0, GFP_KERNEL); 1672 if (err >= 0) 1673 index = err; 1674 } 1675 if (err < 0) 1676 goto out_free_tags; 1677 nbd->index = index; 1678 1679 disk = blk_mq_alloc_disk(&nbd->tag_set, NULL); 1680 if (IS_ERR(disk)) { 1681 err = PTR_ERR(disk); 1682 goto out_free_idr; 1683 } 1684 nbd->disk = disk; 1685 1686 /* 1687 * Tell the block layer that we are not a rotational device 1688 */ 1689 blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); 1690 blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue); 1691 disk->queue->limits.discard_granularity = 0; 1692 disk->queue->limits.discard_alignment = 0; 1693 blk_queue_max_discard_sectors(disk->queue, 0); 1694 blk_queue_max_segment_size(disk->queue, UINT_MAX); 1695 blk_queue_max_segments(disk->queue, USHRT_MAX); 1696 blk_queue_max_hw_sectors(disk->queue, 65536); 1697 disk->queue->limits.max_sectors = 256; 1698 1699 mutex_init(&nbd->config_lock); 1700 refcount_set(&nbd->config_refs, 0); 1701 refcount_set(&nbd->refs, 1); 1702 INIT_LIST_HEAD(&nbd->list); 1703 disk->major = NBD_MAJOR; 1704 disk->first_minor = index << part_shift; 1705 disk->minors = 1 << part_shift; 1706 disk->fops = &nbd_fops; 1707 disk->private_data = nbd; 1708 sprintf(disk->disk_name, "nbd%d", index); 1709 add_disk(disk); 1710 nbd_total_devices++; 1711 return index; 1712 1713 out_free_idr: 1714 idr_remove(&nbd_index_idr, index); 1715 out_free_tags: 1716 blk_mq_free_tag_set(&nbd->tag_set); 1717 out_free_nbd: 1718 kfree(nbd); 1719 out: 1720 return err; 1721 } 1722 1723 static int find_free_cb(int id, void *ptr, void *data) 1724 { 1725 struct nbd_device *nbd = ptr; 1726 struct nbd_device **found = data; 1727 1728 if (!refcount_read(&nbd->config_refs)) { 1729 *found = nbd; 1730 return 1; 1731 } 1732 return 0; 1733 } 1734 1735 /* Netlink interface. */ 1736 static const struct nla_policy nbd_attr_policy[NBD_ATTR_MAX + 1] = { 1737 [NBD_ATTR_INDEX] = { .type = NLA_U32 }, 1738 [NBD_ATTR_SIZE_BYTES] = { .type = NLA_U64 }, 1739 [NBD_ATTR_BLOCK_SIZE_BYTES] = { .type = NLA_U64 }, 1740 [NBD_ATTR_TIMEOUT] = { .type = NLA_U64 }, 1741 [NBD_ATTR_SERVER_FLAGS] = { .type = NLA_U64 }, 1742 [NBD_ATTR_CLIENT_FLAGS] = { .type = NLA_U64 }, 1743 [NBD_ATTR_SOCKETS] = { .type = NLA_NESTED}, 1744 [NBD_ATTR_DEAD_CONN_TIMEOUT] = { .type = NLA_U64 }, 1745 [NBD_ATTR_DEVICE_LIST] = { .type = NLA_NESTED}, 1746 }; 1747 1748 static const struct nla_policy nbd_sock_policy[NBD_SOCK_MAX + 1] = { 1749 [NBD_SOCK_FD] = { .type = NLA_U32 }, 1750 }; 1751 1752 /* We don't use this right now since we don't parse the incoming list, but we 1753 * still want it here so userspace knows what to expect. 1754 */ 1755 static const struct nla_policy __attribute__((unused)) 1756 nbd_device_policy[NBD_DEVICE_ATTR_MAX + 1] = { 1757 [NBD_DEVICE_INDEX] = { .type = NLA_U32 }, 1758 [NBD_DEVICE_CONNECTED] = { .type = NLA_U8 }, 1759 }; 1760 1761 static int nbd_genl_size_set(struct genl_info *info, struct nbd_device *nbd) 1762 { 1763 struct nbd_config *config = nbd->config; 1764 u64 bsize = config->blksize; 1765 u64 bytes = config->bytesize; 1766 1767 if (info->attrs[NBD_ATTR_SIZE_BYTES]) 1768 bytes = nla_get_u64(info->attrs[NBD_ATTR_SIZE_BYTES]); 1769 1770 if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]) 1771 bsize = nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]); 1772 1773 if (bytes != config->bytesize || bsize != config->blksize) 1774 return nbd_set_size(nbd, bytes, bsize); 1775 return 0; 1776 } 1777 1778 static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) 1779 { 1780 DECLARE_COMPLETION_ONSTACK(destroy_complete); 1781 struct nbd_device *nbd = NULL; 1782 struct nbd_config *config; 1783 int index = -1; 1784 int ret; 1785 bool put_dev = false; 1786 1787 if (!netlink_capable(skb, CAP_SYS_ADMIN)) 1788 return -EPERM; 1789 1790 if (info->attrs[NBD_ATTR_INDEX]) 1791 index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]); 1792 if (!info->attrs[NBD_ATTR_SOCKETS]) { 1793 printk(KERN_ERR "nbd: must specify at least one socket\n"); 1794 return -EINVAL; 1795 } 1796 if (!info->attrs[NBD_ATTR_SIZE_BYTES]) { 1797 printk(KERN_ERR "nbd: must specify a size in bytes for the device\n"); 1798 return -EINVAL; 1799 } 1800 again: 1801 mutex_lock(&nbd_index_mutex); 1802 if (index == -1) { 1803 ret = idr_for_each(&nbd_index_idr, &find_free_cb, &nbd); 1804 if (ret == 0) { 1805 int new_index; 1806 new_index = nbd_dev_add(-1); 1807 if (new_index < 0) { 1808 mutex_unlock(&nbd_index_mutex); 1809 printk(KERN_ERR "nbd: failed to add new device\n"); 1810 return new_index; 1811 } 1812 nbd = idr_find(&nbd_index_idr, new_index); 1813 } 1814 } else { 1815 nbd = idr_find(&nbd_index_idr, index); 1816 if (!nbd) { 1817 ret = nbd_dev_add(index); 1818 if (ret < 0) { 1819 mutex_unlock(&nbd_index_mutex); 1820 printk(KERN_ERR "nbd: failed to add new device\n"); 1821 return ret; 1822 } 1823 nbd = idr_find(&nbd_index_idr, index); 1824 } 1825 } 1826 if (!nbd) { 1827 printk(KERN_ERR "nbd: couldn't find device at index %d\n", 1828 index); 1829 mutex_unlock(&nbd_index_mutex); 1830 return -EINVAL; 1831 } 1832 1833 if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && 1834 test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) { 1835 nbd->destroy_complete = &destroy_complete; 1836 mutex_unlock(&nbd_index_mutex); 1837 1838 /* Wait untill the the nbd stuff is totally destroyed */ 1839 wait_for_completion(&destroy_complete); 1840 goto again; 1841 } 1842 1843 if (!refcount_inc_not_zero(&nbd->refs)) { 1844 mutex_unlock(&nbd_index_mutex); 1845 if (index == -1) 1846 goto again; 1847 printk(KERN_ERR "nbd: device at index %d is going down\n", 1848 index); 1849 return -EINVAL; 1850 } 1851 mutex_unlock(&nbd_index_mutex); 1852 1853 mutex_lock(&nbd->config_lock); 1854 if (refcount_read(&nbd->config_refs)) { 1855 mutex_unlock(&nbd->config_lock); 1856 nbd_put(nbd); 1857 if (index == -1) 1858 goto again; 1859 printk(KERN_ERR "nbd: nbd%d already in use\n", index); 1860 return -EBUSY; 1861 } 1862 if (WARN_ON(nbd->config)) { 1863 mutex_unlock(&nbd->config_lock); 1864 nbd_put(nbd); 1865 return -EINVAL; 1866 } 1867 config = nbd->config = nbd_alloc_config(); 1868 if (!nbd->config) { 1869 mutex_unlock(&nbd->config_lock); 1870 nbd_put(nbd); 1871 printk(KERN_ERR "nbd: couldn't allocate config\n"); 1872 return -ENOMEM; 1873 } 1874 refcount_set(&nbd->config_refs, 1); 1875 set_bit(NBD_RT_BOUND, &config->runtime_flags); 1876 1877 ret = nbd_genl_size_set(info, nbd); 1878 if (ret) 1879 goto out; 1880 1881 if (info->attrs[NBD_ATTR_TIMEOUT]) 1882 nbd_set_cmd_timeout(nbd, 1883 nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT])); 1884 if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) { 1885 config->dead_conn_timeout = 1886 nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]); 1887 config->dead_conn_timeout *= HZ; 1888 } 1889 if (info->attrs[NBD_ATTR_SERVER_FLAGS]) 1890 config->flags = 1891 nla_get_u64(info->attrs[NBD_ATTR_SERVER_FLAGS]); 1892 if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) { 1893 u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]); 1894 if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) { 1895 /* 1896 * We have 1 ref to keep the device around, and then 1 1897 * ref for our current operation here, which will be 1898 * inherited by the config. If we already have 1899 * DESTROY_ON_DISCONNECT set then we know we don't have 1900 * that extra ref already held so we don't need the 1901 * put_dev. 1902 */ 1903 if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT, 1904 &nbd->flags)) 1905 put_dev = true; 1906 } else { 1907 if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT, 1908 &nbd->flags)) 1909 refcount_inc(&nbd->refs); 1910 } 1911 if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) { 1912 set_bit(NBD_RT_DISCONNECT_ON_CLOSE, 1913 &config->runtime_flags); 1914 } 1915 } 1916 1917 if (info->attrs[NBD_ATTR_SOCKETS]) { 1918 struct nlattr *attr; 1919 int rem, fd; 1920 1921 nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS], 1922 rem) { 1923 struct nlattr *socks[NBD_SOCK_MAX+1]; 1924 1925 if (nla_type(attr) != NBD_SOCK_ITEM) { 1926 printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n"); 1927 ret = -EINVAL; 1928 goto out; 1929 } 1930 ret = nla_parse_nested_deprecated(socks, NBD_SOCK_MAX, 1931 attr, 1932 nbd_sock_policy, 1933 info->extack); 1934 if (ret != 0) { 1935 printk(KERN_ERR "nbd: error processing sock list\n"); 1936 ret = -EINVAL; 1937 goto out; 1938 } 1939 if (!socks[NBD_SOCK_FD]) 1940 continue; 1941 fd = (int)nla_get_u32(socks[NBD_SOCK_FD]); 1942 ret = nbd_add_socket(nbd, fd, true); 1943 if (ret) 1944 goto out; 1945 } 1946 } 1947 ret = nbd_start_device(nbd); 1948 out: 1949 mutex_unlock(&nbd->config_lock); 1950 if (!ret) { 1951 set_bit(NBD_RT_HAS_CONFIG_REF, &config->runtime_flags); 1952 refcount_inc(&nbd->config_refs); 1953 nbd_connect_reply(info, nbd->index); 1954 } 1955 nbd_config_put(nbd); 1956 if (put_dev) 1957 nbd_put(nbd); 1958 return ret; 1959 } 1960 1961 static void nbd_disconnect_and_put(struct nbd_device *nbd) 1962 { 1963 mutex_lock(&nbd->config_lock); 1964 nbd_disconnect(nbd); 1965 nbd_clear_sock(nbd); 1966 mutex_unlock(&nbd->config_lock); 1967 /* 1968 * Make sure recv thread has finished, so it does not drop the last 1969 * config ref and try to destroy the workqueue from inside the work 1970 * queue. 1971 */ 1972 if (nbd->recv_workq) 1973 flush_workqueue(nbd->recv_workq); 1974 if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF, 1975 &nbd->config->runtime_flags)) 1976 nbd_config_put(nbd); 1977 } 1978 1979 static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info) 1980 { 1981 struct nbd_device *nbd; 1982 int index; 1983 1984 if (!netlink_capable(skb, CAP_SYS_ADMIN)) 1985 return -EPERM; 1986 1987 if (!info->attrs[NBD_ATTR_INDEX]) { 1988 printk(KERN_ERR "nbd: must specify an index to disconnect\n"); 1989 return -EINVAL; 1990 } 1991 index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]); 1992 mutex_lock(&nbd_index_mutex); 1993 nbd = idr_find(&nbd_index_idr, index); 1994 if (!nbd) { 1995 mutex_unlock(&nbd_index_mutex); 1996 printk(KERN_ERR "nbd: couldn't find device at index %d\n", 1997 index); 1998 return -EINVAL; 1999 } 2000 if (!refcount_inc_not_zero(&nbd->refs)) { 2001 mutex_unlock(&nbd_index_mutex); 2002 printk(KERN_ERR "nbd: device at index %d is going down\n", 2003 index); 2004 return -EINVAL; 2005 } 2006 mutex_unlock(&nbd_index_mutex); 2007 if (!refcount_inc_not_zero(&nbd->config_refs)) 2008 goto put_nbd; 2009 nbd_disconnect_and_put(nbd); 2010 nbd_config_put(nbd); 2011 put_nbd: 2012 nbd_put(nbd); 2013 return 0; 2014 } 2015 2016 static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) 2017 { 2018 struct nbd_device *nbd = NULL; 2019 struct nbd_config *config; 2020 int index; 2021 int ret = 0; 2022 bool put_dev = false; 2023 2024 if (!netlink_capable(skb, CAP_SYS_ADMIN)) 2025 return -EPERM; 2026 2027 if (!info->attrs[NBD_ATTR_INDEX]) { 2028 printk(KERN_ERR "nbd: must specify a device to reconfigure\n"); 2029 return -EINVAL; 2030 } 2031 index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]); 2032 mutex_lock(&nbd_index_mutex); 2033 nbd = idr_find(&nbd_index_idr, index); 2034 if (!nbd) { 2035 mutex_unlock(&nbd_index_mutex); 2036 printk(KERN_ERR "nbd: couldn't find a device at index %d\n", 2037 index); 2038 return -EINVAL; 2039 } 2040 if (!refcount_inc_not_zero(&nbd->refs)) { 2041 mutex_unlock(&nbd_index_mutex); 2042 printk(KERN_ERR "nbd: device at index %d is going down\n", 2043 index); 2044 return -EINVAL; 2045 } 2046 mutex_unlock(&nbd_index_mutex); 2047 2048 if (!refcount_inc_not_zero(&nbd->config_refs)) { 2049 dev_err(nbd_to_dev(nbd), 2050 "not configured, cannot reconfigure\n"); 2051 nbd_put(nbd); 2052 return -EINVAL; 2053 } 2054 2055 mutex_lock(&nbd->config_lock); 2056 config = nbd->config; 2057 if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) || 2058 !nbd->task_recv) { 2059 dev_err(nbd_to_dev(nbd), 2060 "not configured, cannot reconfigure\n"); 2061 ret = -EINVAL; 2062 goto out; 2063 } 2064 2065 ret = nbd_genl_size_set(info, nbd); 2066 if (ret) 2067 goto out; 2068 2069 if (info->attrs[NBD_ATTR_TIMEOUT]) 2070 nbd_set_cmd_timeout(nbd, 2071 nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT])); 2072 if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) { 2073 config->dead_conn_timeout = 2074 nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]); 2075 config->dead_conn_timeout *= HZ; 2076 } 2077 if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) { 2078 u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]); 2079 if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) { 2080 if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT, 2081 &nbd->flags)) 2082 put_dev = true; 2083 } else { 2084 if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT, 2085 &nbd->flags)) 2086 refcount_inc(&nbd->refs); 2087 } 2088 2089 if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) { 2090 set_bit(NBD_RT_DISCONNECT_ON_CLOSE, 2091 &config->runtime_flags); 2092 } else { 2093 clear_bit(NBD_RT_DISCONNECT_ON_CLOSE, 2094 &config->runtime_flags); 2095 } 2096 } 2097 2098 if (info->attrs[NBD_ATTR_SOCKETS]) { 2099 struct nlattr *attr; 2100 int rem, fd; 2101 2102 nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS], 2103 rem) { 2104 struct nlattr *socks[NBD_SOCK_MAX+1]; 2105 2106 if (nla_type(attr) != NBD_SOCK_ITEM) { 2107 printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n"); 2108 ret = -EINVAL; 2109 goto out; 2110 } 2111 ret = nla_parse_nested_deprecated(socks, NBD_SOCK_MAX, 2112 attr, 2113 nbd_sock_policy, 2114 info->extack); 2115 if (ret != 0) { 2116 printk(KERN_ERR "nbd: error processing sock list\n"); 2117 ret = -EINVAL; 2118 goto out; 2119 } 2120 if (!socks[NBD_SOCK_FD]) 2121 continue; 2122 fd = (int)nla_get_u32(socks[NBD_SOCK_FD]); 2123 ret = nbd_reconnect_socket(nbd, fd); 2124 if (ret) { 2125 if (ret == -ENOSPC) 2126 ret = 0; 2127 goto out; 2128 } 2129 dev_info(nbd_to_dev(nbd), "reconnected socket\n"); 2130 } 2131 } 2132 out: 2133 mutex_unlock(&nbd->config_lock); 2134 nbd_config_put(nbd); 2135 nbd_put(nbd); 2136 if (put_dev) 2137 nbd_put(nbd); 2138 return ret; 2139 } 2140 2141 static const struct genl_small_ops nbd_connect_genl_ops[] = { 2142 { 2143 .cmd = NBD_CMD_CONNECT, 2144 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 2145 .doit = nbd_genl_connect, 2146 }, 2147 { 2148 .cmd = NBD_CMD_DISCONNECT, 2149 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 2150 .doit = nbd_genl_disconnect, 2151 }, 2152 { 2153 .cmd = NBD_CMD_RECONFIGURE, 2154 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 2155 .doit = nbd_genl_reconfigure, 2156 }, 2157 { 2158 .cmd = NBD_CMD_STATUS, 2159 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 2160 .doit = nbd_genl_status, 2161 }, 2162 }; 2163 2164 static const struct genl_multicast_group nbd_mcast_grps[] = { 2165 { .name = NBD_GENL_MCAST_GROUP_NAME, }, 2166 }; 2167 2168 static struct genl_family nbd_genl_family __ro_after_init = { 2169 .hdrsize = 0, 2170 .name = NBD_GENL_FAMILY_NAME, 2171 .version = NBD_GENL_VERSION, 2172 .module = THIS_MODULE, 2173 .small_ops = nbd_connect_genl_ops, 2174 .n_small_ops = ARRAY_SIZE(nbd_connect_genl_ops), 2175 .maxattr = NBD_ATTR_MAX, 2176 .policy = nbd_attr_policy, 2177 .mcgrps = nbd_mcast_grps, 2178 .n_mcgrps = ARRAY_SIZE(nbd_mcast_grps), 2179 }; 2180 2181 static int populate_nbd_status(struct nbd_device *nbd, struct sk_buff *reply) 2182 { 2183 struct nlattr *dev_opt; 2184 u8 connected = 0; 2185 int ret; 2186 2187 /* This is a little racey, but for status it's ok. The 2188 * reason we don't take a ref here is because we can't 2189 * take a ref in the index == -1 case as we would need 2190 * to put under the nbd_index_mutex, which could 2191 * deadlock if we are configured to remove ourselves 2192 * once we're disconnected. 2193 */ 2194 if (refcount_read(&nbd->config_refs)) 2195 connected = 1; 2196 dev_opt = nla_nest_start_noflag(reply, NBD_DEVICE_ITEM); 2197 if (!dev_opt) 2198 return -EMSGSIZE; 2199 ret = nla_put_u32(reply, NBD_DEVICE_INDEX, nbd->index); 2200 if (ret) 2201 return -EMSGSIZE; 2202 ret = nla_put_u8(reply, NBD_DEVICE_CONNECTED, 2203 connected); 2204 if (ret) 2205 return -EMSGSIZE; 2206 nla_nest_end(reply, dev_opt); 2207 return 0; 2208 } 2209 2210 static int status_cb(int id, void *ptr, void *data) 2211 { 2212 struct nbd_device *nbd = ptr; 2213 return populate_nbd_status(nbd, (struct sk_buff *)data); 2214 } 2215 2216 static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info) 2217 { 2218 struct nlattr *dev_list; 2219 struct sk_buff *reply; 2220 void *reply_head; 2221 size_t msg_size; 2222 int index = -1; 2223 int ret = -ENOMEM; 2224 2225 if (info->attrs[NBD_ATTR_INDEX]) 2226 index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]); 2227 2228 mutex_lock(&nbd_index_mutex); 2229 2230 msg_size = nla_total_size(nla_attr_size(sizeof(u32)) + 2231 nla_attr_size(sizeof(u8))); 2232 msg_size *= (index == -1) ? nbd_total_devices : 1; 2233 2234 reply = genlmsg_new(msg_size, GFP_KERNEL); 2235 if (!reply) 2236 goto out; 2237 reply_head = genlmsg_put_reply(reply, info, &nbd_genl_family, 0, 2238 NBD_CMD_STATUS); 2239 if (!reply_head) { 2240 nlmsg_free(reply); 2241 goto out; 2242 } 2243 2244 dev_list = nla_nest_start_noflag(reply, NBD_ATTR_DEVICE_LIST); 2245 if (index == -1) { 2246 ret = idr_for_each(&nbd_index_idr, &status_cb, reply); 2247 if (ret) { 2248 nlmsg_free(reply); 2249 goto out; 2250 } 2251 } else { 2252 struct nbd_device *nbd; 2253 nbd = idr_find(&nbd_index_idr, index); 2254 if (nbd) { 2255 ret = populate_nbd_status(nbd, reply); 2256 if (ret) { 2257 nlmsg_free(reply); 2258 goto out; 2259 } 2260 } 2261 } 2262 nla_nest_end(reply, dev_list); 2263 genlmsg_end(reply, reply_head); 2264 ret = genlmsg_reply(reply, info); 2265 out: 2266 mutex_unlock(&nbd_index_mutex); 2267 return ret; 2268 } 2269 2270 static void nbd_connect_reply(struct genl_info *info, int index) 2271 { 2272 struct sk_buff *skb; 2273 void *msg_head; 2274 int ret; 2275 2276 skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL); 2277 if (!skb) 2278 return; 2279 msg_head = genlmsg_put_reply(skb, info, &nbd_genl_family, 0, 2280 NBD_CMD_CONNECT); 2281 if (!msg_head) { 2282 nlmsg_free(skb); 2283 return; 2284 } 2285 ret = nla_put_u32(skb, NBD_ATTR_INDEX, index); 2286 if (ret) { 2287 nlmsg_free(skb); 2288 return; 2289 } 2290 genlmsg_end(skb, msg_head); 2291 genlmsg_reply(skb, info); 2292 } 2293 2294 static void nbd_mcast_index(int index) 2295 { 2296 struct sk_buff *skb; 2297 void *msg_head; 2298 int ret; 2299 2300 skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL); 2301 if (!skb) 2302 return; 2303 msg_head = genlmsg_put(skb, 0, 0, &nbd_genl_family, 0, 2304 NBD_CMD_LINK_DEAD); 2305 if (!msg_head) { 2306 nlmsg_free(skb); 2307 return; 2308 } 2309 ret = nla_put_u32(skb, NBD_ATTR_INDEX, index); 2310 if (ret) { 2311 nlmsg_free(skb); 2312 return; 2313 } 2314 genlmsg_end(skb, msg_head); 2315 genlmsg_multicast(&nbd_genl_family, skb, 0, 0, GFP_KERNEL); 2316 } 2317 2318 static void nbd_dead_link_work(struct work_struct *work) 2319 { 2320 struct link_dead_args *args = container_of(work, struct link_dead_args, 2321 work); 2322 nbd_mcast_index(args->index); 2323 kfree(args); 2324 } 2325 2326 static int __init nbd_init(void) 2327 { 2328 int i; 2329 2330 BUILD_BUG_ON(sizeof(struct nbd_request) != 28); 2331 2332 if (max_part < 0) { 2333 printk(KERN_ERR "nbd: max_part must be >= 0\n"); 2334 return -EINVAL; 2335 } 2336 2337 part_shift = 0; 2338 if (max_part > 0) { 2339 part_shift = fls(max_part); 2340 2341 /* 2342 * Adjust max_part according to part_shift as it is exported 2343 * to user space so that user can know the max number of 2344 * partition kernel should be able to manage. 2345 * 2346 * Note that -1 is required because partition 0 is reserved 2347 * for the whole disk. 2348 */ 2349 max_part = (1UL << part_shift) - 1; 2350 } 2351 2352 if ((1UL << part_shift) > DISK_MAX_PARTS) 2353 return -EINVAL; 2354 2355 if (nbds_max > 1UL << (MINORBITS - part_shift)) 2356 return -EINVAL; 2357 2358 if (register_blkdev(NBD_MAJOR, "nbd")) 2359 return -EIO; 2360 2361 if (genl_register_family(&nbd_genl_family)) { 2362 unregister_blkdev(NBD_MAJOR, "nbd"); 2363 return -EINVAL; 2364 } 2365 nbd_dbg_init(); 2366 2367 mutex_lock(&nbd_index_mutex); 2368 for (i = 0; i < nbds_max; i++) 2369 nbd_dev_add(i); 2370 mutex_unlock(&nbd_index_mutex); 2371 return 0; 2372 } 2373 2374 static int nbd_exit_cb(int id, void *ptr, void *data) 2375 { 2376 struct list_head *list = (struct list_head *)data; 2377 struct nbd_device *nbd = ptr; 2378 2379 list_add_tail(&nbd->list, list); 2380 return 0; 2381 } 2382 2383 static void __exit nbd_cleanup(void) 2384 { 2385 struct nbd_device *nbd; 2386 LIST_HEAD(del_list); 2387 2388 nbd_dbg_close(); 2389 2390 mutex_lock(&nbd_index_mutex); 2391 idr_for_each(&nbd_index_idr, &nbd_exit_cb, &del_list); 2392 mutex_unlock(&nbd_index_mutex); 2393 2394 while (!list_empty(&del_list)) { 2395 nbd = list_first_entry(&del_list, struct nbd_device, list); 2396 list_del_init(&nbd->list); 2397 if (refcount_read(&nbd->refs) != 1) 2398 printk(KERN_ERR "nbd: possibly leaking a device\n"); 2399 nbd_put(nbd); 2400 } 2401 2402 idr_destroy(&nbd_index_idr); 2403 genl_unregister_family(&nbd_genl_family); 2404 unregister_blkdev(NBD_MAJOR, "nbd"); 2405 } 2406 2407 module_init(nbd_init); 2408 module_exit(nbd_cleanup); 2409 2410 MODULE_DESCRIPTION("Network Block Device"); 2411 MODULE_LICENSE("GPL"); 2412 2413 module_param(nbds_max, int, 0444); 2414 MODULE_PARM_DESC(nbds_max, "number of network block devices to initialize (default: 16)"); 2415 module_param(max_part, int, 0444); 2416 MODULE_PARM_DESC(max_part, "number of partitions per device (default: 16)"); 2417