1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Network block device - make block devices work over TCP
4  *
5  * Note that you can not swap over this thing, yet. Seems to work but
6  * deadlocks sometimes - you can not swap over TCP in general.
7  *
8  * Copyright 1997-2000, 2008 Pavel Machek <pavel@ucw.cz>
9  * Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
10  *
11  * (part of code stolen from loop.c)
12  */
13 
14 #include <linux/major.h>
15 
16 #include <linux/blkdev.h>
17 #include <linux/module.h>
18 #include <linux/init.h>
19 #include <linux/sched.h>
20 #include <linux/sched/mm.h>
21 #include <linux/fs.h>
22 #include <linux/bio.h>
23 #include <linux/stat.h>
24 #include <linux/errno.h>
25 #include <linux/file.h>
26 #include <linux/ioctl.h>
27 #include <linux/mutex.h>
28 #include <linux/compiler.h>
29 #include <linux/err.h>
30 #include <linux/kernel.h>
31 #include <linux/slab.h>
32 #include <net/sock.h>
33 #include <linux/net.h>
34 #include <linux/kthread.h>
35 #include <linux/types.h>
36 #include <linux/debugfs.h>
37 #include <linux/blk-mq.h>
38 
39 #include <linux/uaccess.h>
40 #include <asm/types.h>
41 
42 #include <linux/nbd.h>
43 #include <linux/nbd-netlink.h>
44 #include <net/genetlink.h>
45 
46 #define CREATE_TRACE_POINTS
47 #include <trace/events/nbd.h>
48 
49 static DEFINE_IDR(nbd_index_idr);
50 static DEFINE_MUTEX(nbd_index_mutex);
51 static int nbd_total_devices = 0;
52 
53 struct nbd_sock {
54 	struct socket *sock;
55 	struct mutex tx_lock;
56 	struct request *pending;
57 	int sent;
58 	bool dead;
59 	int fallback_index;
60 	int cookie;
61 };
62 
63 struct recv_thread_args {
64 	struct work_struct work;
65 	struct nbd_device *nbd;
66 	int index;
67 };
68 
69 struct link_dead_args {
70 	struct work_struct work;
71 	int index;
72 };
73 
74 #define NBD_TIMEDOUT			0
75 #define NBD_DISCONNECT_REQUESTED	1
76 #define NBD_DISCONNECTED		2
77 #define NBD_HAS_PID_FILE		3
78 #define NBD_HAS_CONFIG_REF		4
79 #define NBD_BOUND			5
80 #define NBD_DESTROY_ON_DISCONNECT	6
81 #define NBD_DISCONNECT_ON_CLOSE	7
82 
83 struct nbd_config {
84 	u32 flags;
85 	unsigned long runtime_flags;
86 	u64 dead_conn_timeout;
87 
88 	struct nbd_sock **socks;
89 	int num_connections;
90 	atomic_t live_connections;
91 	wait_queue_head_t conn_wait;
92 
93 	atomic_t recv_threads;
94 	wait_queue_head_t recv_wq;
95 	loff_t blksize;
96 	loff_t bytesize;
97 #if IS_ENABLED(CONFIG_DEBUG_FS)
98 	struct dentry *dbg_dir;
99 #endif
100 };
101 
102 struct nbd_device {
103 	struct blk_mq_tag_set tag_set;
104 
105 	int index;
106 	refcount_t config_refs;
107 	refcount_t refs;
108 	struct nbd_config *config;
109 	struct mutex config_lock;
110 	struct gendisk *disk;
111 	struct workqueue_struct *recv_workq;
112 
113 	struct list_head list;
114 	struct task_struct *task_recv;
115 	struct task_struct *task_setup;
116 };
117 
118 #define NBD_CMD_REQUEUED	1
119 
120 struct nbd_cmd {
121 	struct nbd_device *nbd;
122 	struct mutex lock;
123 	int index;
124 	int cookie;
125 	int retries;
126 	blk_status_t status;
127 	unsigned long flags;
128 	u32 cmd_cookie;
129 };
130 
131 #if IS_ENABLED(CONFIG_DEBUG_FS)
132 static struct dentry *nbd_dbg_dir;
133 #endif
134 
135 #define nbd_name(nbd) ((nbd)->disk->disk_name)
136 
137 #define NBD_MAGIC 0x68797548
138 
139 #define NBD_DEF_BLKSIZE 1024
140 
141 static unsigned int nbds_max = 16;
142 static int max_part = 16;
143 static int part_shift;
144 
145 static int nbd_dev_dbg_init(struct nbd_device *nbd);
146 static void nbd_dev_dbg_close(struct nbd_device *nbd);
147 static void nbd_config_put(struct nbd_device *nbd);
148 static void nbd_connect_reply(struct genl_info *info, int index);
149 static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info);
150 static void nbd_dead_link_work(struct work_struct *work);
151 static void nbd_disconnect_and_put(struct nbd_device *nbd);
152 
153 static inline struct device *nbd_to_dev(struct nbd_device *nbd)
154 {
155 	return disk_to_dev(nbd->disk);
156 }
157 
158 static void nbd_requeue_cmd(struct nbd_cmd *cmd)
159 {
160 	struct request *req = blk_mq_rq_from_pdu(cmd);
161 
162 	if (!test_and_set_bit(NBD_CMD_REQUEUED, &cmd->flags))
163 		blk_mq_requeue_request(req, true);
164 }
165 
166 #define NBD_COOKIE_BITS 32
167 
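/*
 * The 64-bit handle sent on the wire packs the per-send cmd_cookie in
 * the upper 32 bits and the unique blk-mq tag in the lower 32 bits, so
 * replies that belong to an earlier incarnation of a requeued command
 * can be detected and rejected.
 */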
168 static u64 nbd_cmd_handle(struct nbd_cmd *cmd)
169 {
170 	struct request *req = blk_mq_rq_from_pdu(cmd);
171 	u32 tag = blk_mq_unique_tag(req);
172 	u64 cookie = cmd->cmd_cookie;
173 
174 	return (cookie << NBD_COOKIE_BITS) | tag;
175 }
176 
177 static u32 nbd_handle_to_tag(u64 handle)
178 {
179 	return (u32)handle;
180 }
181 
182 static u32 nbd_handle_to_cookie(u64 handle)
183 {
184 	return (u32)(handle >> NBD_COOKIE_BITS);
185 }
186 
187 static const char *nbdcmd_to_ascii(int cmd)
188 {
189 	switch (cmd) {
190 	case  NBD_CMD_READ: return "read";
191 	case NBD_CMD_WRITE: return "write";
192 	case  NBD_CMD_DISC: return "disconnect";
193 	case NBD_CMD_FLUSH: return "flush";
194 	case  NBD_CMD_TRIM: return "trim/discard";
195 	}
196 	return "invalid";
197 }
198 
199 static ssize_t pid_show(struct device *dev,
200 			struct device_attribute *attr, char *buf)
201 {
202 	struct gendisk *disk = dev_to_disk(dev);
203 	struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
204 
205 	return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv));
206 }
207 
208 static const struct device_attribute pid_attr = {
209 	.attr = { .name = "pid", .mode = 0444},
210 	.show = pid_show,
211 };
212 
213 static void nbd_dev_remove(struct nbd_device *nbd)
214 {
215 	struct gendisk *disk = nbd->disk;
216 	struct request_queue *q;
217 
218 	if (disk) {
219 		q = disk->queue;
220 		del_gendisk(disk);
221 		blk_cleanup_queue(q);
222 		blk_mq_free_tag_set(&nbd->tag_set);
223 		disk->private_data = NULL;
224 		put_disk(disk);
225 	}
226 	kfree(nbd);
227 }
228 
229 static void nbd_put(struct nbd_device *nbd)
230 {
231 	if (refcount_dec_and_mutex_lock(&nbd->refs,
232 					&nbd_index_mutex)) {
233 		idr_remove(&nbd_index_idr, nbd->index);
234 		mutex_unlock(&nbd_index_mutex);
235 		nbd_dev_remove(nbd);
236 	}
237 }
238 
239 static int nbd_disconnected(struct nbd_config *config)
240 {
241 	return test_bit(NBD_DISCONNECTED, &config->runtime_flags) ||
242 		test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags);
243 }
244 
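/*
 * Mark a socket dead and shut it down.  Must be called with the
 * nsock's tx_lock held.  If @notify is set and the device has not been
 * asked to disconnect, queue work to notify userspace that the link
 * died.
 */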
245 static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
246 				int notify)
247 {
248 	if (!nsock->dead && notify && !nbd_disconnected(nbd->config)) {
249 		struct link_dead_args *args;
250 		args = kmalloc(sizeof(struct link_dead_args), GFP_NOIO);
251 		if (args) {
252 			INIT_WORK(&args->work, nbd_dead_link_work);
253 			args->index = nbd->index;
254 			queue_work(system_wq, &args->work);
255 		}
256 	}
257 	if (!nsock->dead) {
258 		kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
259 		if (atomic_dec_return(&nbd->config->live_connections) == 0) {
260 			if (test_and_clear_bit(NBD_DISCONNECT_REQUESTED,
261 					       &nbd->config->runtime_flags)) {
262 				set_bit(NBD_DISCONNECTED,
263 					&nbd->config->runtime_flags);
264 				dev_info(nbd_to_dev(nbd),
265 					"Disconnected due to user request.\n");
266 			}
267 		}
268 	}
269 	nsock->dead = true;
270 	nsock->pending = NULL;
271 	nsock->sent = 0;
272 }
273 
274 static void nbd_size_clear(struct nbd_device *nbd)
275 {
276 	if (nbd->config->bytesize) {
277 		set_capacity(nbd->disk, 0);
278 		kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
279 	}
280 }
281 
282 static void nbd_size_update(struct nbd_device *nbd)
283 {
284 	struct nbd_config *config = nbd->config;
285 	struct block_device *bdev = bdget_disk(nbd->disk, 0);
286 
287 	if (config->flags & NBD_FLAG_SEND_TRIM) {
288 		nbd->disk->queue->limits.discard_granularity = config->blksize;
289 		nbd->disk->queue->limits.discard_alignment = config->blksize;
290 		blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX);
291 	}
292 	blk_queue_logical_block_size(nbd->disk->queue, config->blksize);
293 	blk_queue_physical_block_size(nbd->disk->queue, config->blksize);
294 	set_capacity(nbd->disk, config->bytesize >> 9);
295 	if (bdev) {
296 		if (bdev->bd_disk) {
297 			bd_set_size(bdev, config->bytesize);
298 			set_blocksize(bdev, config->blksize);
299 		} else
300 			bdev->bd_invalidated = 1;
301 		bdput(bdev);
302 	}
303 	kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
304 }
305 
306 static void nbd_size_set(struct nbd_device *nbd, loff_t blocksize,
307 			 loff_t nr_blocks)
308 {
309 	struct nbd_config *config = nbd->config;
310 	config->blksize = blocksize;
311 	config->bytesize = blocksize * nr_blocks;
312 	if (nbd->task_recv != NULL)
313 		nbd_size_update(nbd);
314 }
315 
316 static void nbd_complete_rq(struct request *req)
317 {
318 	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
319 
320 	dev_dbg(nbd_to_dev(cmd->nbd), "request %p: %s\n", req,
321 		cmd->status ? "failed" : "done");
322 
323 	blk_mq_end_request(req, cmd->status);
324 }
325 
326 /*
327  * Forcibly shutdown the socket causing all listeners to error
328  */
329 static void sock_shutdown(struct nbd_device *nbd)
330 {
331 	struct nbd_config *config = nbd->config;
332 	int i;
333 
334 	if (config->num_connections == 0)
335 		return;
336 	if (test_and_set_bit(NBD_DISCONNECTED, &config->runtime_flags))
337 		return;
338 
339 	for (i = 0; i < config->num_connections; i++) {
340 		struct nbd_sock *nsock = config->socks[i];
341 		mutex_lock(&nsock->tx_lock);
342 		nbd_mark_nsock_dead(nbd, nsock, 0);
343 		mutex_unlock(&nsock->tx_lock);
344 	}
345 	dev_warn(disk_to_dev(nbd->disk), "shutting down sockets\n");
346 }
347 
348 static u32 req_to_nbd_cmd_type(struct request *req)
349 {
350 	switch (req_op(req)) {
351 	case REQ_OP_DISCARD:
352 		return NBD_CMD_TRIM;
353 	case REQ_OP_FLUSH:
354 		return NBD_CMD_FLUSH;
355 	case REQ_OP_WRITE:
356 		return NBD_CMD_WRITE;
357 	case REQ_OP_READ:
358 		return NBD_CMD_READ;
359 	default:
360 		return U32_MAX;
361 	}
362 }
363 
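/*
 * blk-mq timeout handler.  With more than one connection the command
 * is retried on another socket; with timeouts disabled (timeout == 0)
 * the timer is simply reset; otherwise the sockets are shut down and
 * the request is failed with BLK_STS_IOERR.
 */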
364 static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
365 						 bool reserved)
366 {
367 	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
368 	struct nbd_device *nbd = cmd->nbd;
369 	struct nbd_config *config;
370 
371 	if (!refcount_inc_not_zero(&nbd->config_refs)) {
372 		cmd->status = BLK_STS_TIMEOUT;
373 		goto done;
374 	}
375 	config = nbd->config;
376 
377 	if (!mutex_trylock(&cmd->lock)) {
378 		nbd_config_put(nbd);
379 		return BLK_EH_RESET_TIMER;
380 	}
381 
382 	if (config->num_connections > 1) {
383 		dev_err_ratelimited(nbd_to_dev(nbd),
384 				    "Connection timed out, retrying (%d/%d alive)\n",
385 				    atomic_read(&config->live_connections),
386 				    config->num_connections);
387 		/*
388 		 * Hooray we have more connections, requeue this IO, the submit
389 		 * path will put it on a real connection.
390 		 */
391 		if (config->socks && config->num_connections > 1) {
392 			if (cmd->index < config->num_connections) {
393 				struct nbd_sock *nsock =
394 					config->socks[cmd->index];
395 				mutex_lock(&nsock->tx_lock);
396 				/* We can have multiple outstanding requests, so
397 				 * we don't want to mark the nsock dead if we've
398 				 * already reconnected with a new socket; only
399 				 * mark it dead if it's the same socket the
400 				 * request was sent out on.
401 				 */
402 				if (cmd->cookie == nsock->cookie)
403 					nbd_mark_nsock_dead(nbd, nsock, 1);
404 				mutex_unlock(&nsock->tx_lock);
405 			}
406 			mutex_unlock(&cmd->lock);
407 			nbd_requeue_cmd(cmd);
408 			nbd_config_put(nbd);
409 			return BLK_EH_DONE;
410 		}
411 	}
412 
413 	if (!nbd->tag_set.timeout) {
414 		/*
415 		 * Userspace sets timeout=0 to disable socket disconnection,
416 		 * so just warn and reset the timer.
417 		 */
418 		cmd->retries++;
419 		dev_info(nbd_to_dev(nbd), "Possible stuck request %p: control (%s@%llu,%uB). Runtime %u seconds\n",
420 			req, nbdcmd_to_ascii(req_to_nbd_cmd_type(req)),
421 			(unsigned long long)blk_rq_pos(req) << 9,
422 			blk_rq_bytes(req), (req->timeout / HZ) * cmd->retries);
423 
424 		mutex_unlock(&cmd->lock);
425 		nbd_config_put(nbd);
426 		return BLK_EH_RESET_TIMER;
427 	}
428 
429 	dev_err_ratelimited(nbd_to_dev(nbd), "Connection timed out\n");
430 	set_bit(NBD_TIMEDOUT, &config->runtime_flags);
431 	cmd->status = BLK_STS_IOERR;
432 	mutex_unlock(&cmd->lock);
433 	sock_shutdown(nbd);
434 	nbd_config_put(nbd);
435 done:
436 	blk_mq_complete_request(req);
437 	return BLK_EH_DONE;
438 }
439 
440 /*
441  *  Send or receive packet.
442  */
443 static int sock_xmit(struct nbd_device *nbd, int index, int send,
444 		     struct iov_iter *iter, int msg_flags, int *sent)
445 {
446 	struct nbd_config *config = nbd->config;
447 	struct socket *sock = config->socks[index]->sock;
448 	int result;
449 	struct msghdr msg;
450 	unsigned int noreclaim_flag;
451 
452 	if (unlikely(!sock)) {
453 		dev_err_ratelimited(disk_to_dev(nbd->disk),
454 			"Attempted %s on closed socket in sock_xmit\n",
455 			(send ? "send" : "recv"));
456 		return -EINVAL;
457 	}
458 
459 	msg.msg_iter = *iter;
460 
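	/*
	 * Run the transfer with memory reclaim disabled and MEMALLOC
	 * socket allocations so that nothing on this path can recurse
	 * into block I/O against this device under memory pressure.
	 */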
461 	noreclaim_flag = memalloc_noreclaim_save();
462 	do {
463 		sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
464 		msg.msg_name = NULL;
465 		msg.msg_namelen = 0;
466 		msg.msg_control = NULL;
467 		msg.msg_controllen = 0;
468 		msg.msg_flags = msg_flags | MSG_NOSIGNAL;
469 
470 		if (send)
471 			result = sock_sendmsg(sock, &msg);
472 		else
473 			result = sock_recvmsg(sock, &msg, msg.msg_flags);
474 
475 		if (result <= 0) {
476 			if (result == 0)
477 				result = -EPIPE; /* short read */
478 			break;
479 		}
480 		if (sent)
481 			*sent += result;
482 	} while (msg_data_left(&msg));
483 
484 	memalloc_noreclaim_restore(noreclaim_flag);
485 
486 	return result;
487 }
488 
489 /*
490  * Different settings for sk->sk_sndtimeo can result in different return values
491  * (-EINTR vs -ERESTARTSYS) if there is a signal pending when we enter sendmsg.
492  */
493 static inline int was_interrupted(int result)
494 {
495 	return result == -ERESTARTSYS || result == -EINTR;
496 }
497 
498 /* always call with the tx_lock held */
499 static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
500 {
501 	struct request *req = blk_mq_rq_from_pdu(cmd);
502 	struct nbd_config *config = nbd->config;
503 	struct nbd_sock *nsock = config->socks[index];
504 	int result;
505 	struct nbd_request request = {.magic = htonl(NBD_REQUEST_MAGIC)};
506 	struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
507 	struct iov_iter from;
508 	unsigned long size = blk_rq_bytes(req);
509 	struct bio *bio;
510 	u64 handle;
511 	u32 type;
512 	u32 nbd_cmd_flags = 0;
513 	int sent = nsock->sent, skip = 0;
514 
515 	iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request));
516 
517 	type = req_to_nbd_cmd_type(req);
518 	if (type == U32_MAX)
519 		return -EIO;
520 
521 	if (rq_data_dir(req) == WRITE &&
522 	    (config->flags & NBD_FLAG_READ_ONLY)) {
523 		dev_err_ratelimited(disk_to_dev(nbd->disk),
524 				    "Write on read-only\n");
525 		return -EIO;
526 	}
527 
528 	if (req->cmd_flags & REQ_FUA)
529 		nbd_cmd_flags |= NBD_CMD_FLAG_FUA;
530 
531 	/* We did a partial send previously.  If at least the whole request
532 	 * struct went out, skip straight to sending the remaining payload
533 	 * pages; otherwise resume the header from where we left off.
534 	 */
535 	if (sent) {
536 		if (sent >= sizeof(request)) {
537 			skip = sent - sizeof(request);
538 
539 			/* initialize handle for tracing purposes */
540 			handle = nbd_cmd_handle(cmd);
541 
542 			goto send_pages;
543 		}
544 		iov_iter_advance(&from, sent);
545 	} else {
546 		cmd->cmd_cookie++;
547 	}
548 	cmd->index = index;
549 	cmd->cookie = nsock->cookie;
550 	cmd->retries = 0;
551 	request.type = htonl(type | nbd_cmd_flags);
552 	if (type != NBD_CMD_FLUSH) {
553 		request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
554 		request.len = htonl(size);
555 	}
556 	handle = nbd_cmd_handle(cmd);
557 	memcpy(request.handle, &handle, sizeof(handle));
558 
559 	trace_nbd_send_request(&request, nbd->index, blk_mq_rq_from_pdu(cmd));
560 
561 	dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n",
562 		req, nbdcmd_to_ascii(type),
563 		(unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
564 	result = sock_xmit(nbd, index, 1, &from,
565 			(type == NBD_CMD_WRITE) ? MSG_MORE : 0, &sent);
566 	trace_nbd_header_sent(req, handle);
567 	if (result <= 0) {
568 		if (was_interrupted(result)) {
569 			/* If we haven't sent anything we can just return BUSY,
570 			 * however if we have sent something we need to make
571 			 * sure we only allow this req to be sent until we are
572 			 * completely done.
573 			 */
574 			if (sent) {
575 				nsock->pending = req;
576 				nsock->sent = sent;
577 			}
578 			set_bit(NBD_CMD_REQUEUED, &cmd->flags);
579 			return BLK_STS_RESOURCE;
580 		}
581 		dev_err_ratelimited(disk_to_dev(nbd->disk),
582 			"Send control failed (result %d)\n", result);
583 		return -EAGAIN;
584 	}
585 send_pages:
586 	if (type != NBD_CMD_WRITE)
587 		goto out;
588 
589 	bio = req->bio;
590 	while (bio) {
591 		struct bio *next = bio->bi_next;
592 		struct bvec_iter iter;
593 		struct bio_vec bvec;
594 
595 		bio_for_each_segment(bvec, bio, iter) {
596 			bool is_last = !next && bio_iter_last(bvec, iter);
597 			int flags = is_last ? 0 : MSG_MORE;
598 
599 			dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n",
600 				req, bvec.bv_len);
601 			iov_iter_bvec(&from, WRITE, &bvec, 1, bvec.bv_len);
602 			if (skip) {
603 				if (skip >= iov_iter_count(&from)) {
604 					skip -= iov_iter_count(&from);
605 					continue;
606 				}
607 				iov_iter_advance(&from, skip);
608 				skip = 0;
609 			}
610 			result = sock_xmit(nbd, index, 1, &from, flags, &sent);
611 			if (result <= 0) {
612 				if (was_interrupted(result)) {
613 					/* We've already sent the header, we
614 					 * have no choice but to set pending and
615 					 * return BUSY.
616 					 */
617 					nsock->pending = req;
618 					nsock->sent = sent;
619 					set_bit(NBD_CMD_REQUEUED, &cmd->flags);
620 					return BLK_STS_RESOURCE;
621 				}
622 				dev_err(disk_to_dev(nbd->disk),
623 					"Send data failed (result %d)\n",
624 					result);
625 				return -EAGAIN;
626 			}
627 			/*
628 			 * The completion might already have come in,
629 			 * so break for the last one instead of letting
630 			 * the iterator do it. This prevents use-after-free
631 			 * of the bio.
632 			 */
633 			if (is_last)
634 				break;
635 		}
636 		bio = next;
637 	}
638 out:
639 	trace_nbd_payload_sent(req, handle);
640 	nsock->pending = NULL;
641 	nsock->sent = 0;
642 	return 0;
643 }
644 
645 /* An ERR_PTR return means something went wrong; inform userspace */
646 static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
647 {
648 	struct nbd_config *config = nbd->config;
649 	int result;
650 	struct nbd_reply reply;
651 	struct nbd_cmd *cmd;
652 	struct request *req = NULL;
653 	u64 handle;
654 	u16 hwq;
655 	u32 tag;
656 	struct kvec iov = {.iov_base = &reply, .iov_len = sizeof(reply)};
657 	struct iov_iter to;
658 	int ret = 0;
659 
660 	reply.magic = 0;
661 	iov_iter_kvec(&to, READ, &iov, 1, sizeof(reply));
662 	result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
663 	if (result <= 0) {
664 		if (!nbd_disconnected(config))
665 			dev_err(disk_to_dev(nbd->disk),
666 				"Receive control failed (result %d)\n", result);
667 		return ERR_PTR(result);
668 	}
669 
670 	if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
671 		dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
672 				(unsigned long)ntohl(reply.magic));
673 		return ERR_PTR(-EPROTO);
674 	}
675 
676 	memcpy(&handle, reply.handle, sizeof(handle));
677 	tag = nbd_handle_to_tag(handle);
678 	hwq = blk_mq_unique_tag_to_hwq(tag);
679 	if (hwq < nbd->tag_set.nr_hw_queues)
680 		req = blk_mq_tag_to_rq(nbd->tag_set.tags[hwq],
681 				       blk_mq_unique_tag_to_tag(tag));
682 	if (!req || !blk_mq_request_started(req)) {
683 		dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%d) %p\n",
684 			tag, req);
685 		return ERR_PTR(-ENOENT);
686 	}
687 	trace_nbd_header_received(req, handle);
688 	cmd = blk_mq_rq_to_pdu(req);
689 
690 	mutex_lock(&cmd->lock);
691 	if (cmd->cmd_cookie != nbd_handle_to_cookie(handle)) {
692 		dev_err(disk_to_dev(nbd->disk), "Double reply on req %p, cmd_cookie %u, handle cookie %u\n",
693 			req, cmd->cmd_cookie, nbd_handle_to_cookie(handle));
694 		ret = -ENOENT;
695 		goto out;
696 	}
697 	if (test_bit(NBD_CMD_REQUEUED, &cmd->flags)) {
698 		dev_err(disk_to_dev(nbd->disk), "Raced with timeout on req %p\n",
699 			req);
700 		ret = -ENOENT;
701 		goto out;
702 	}
703 	if (ntohl(reply.error)) {
704 		dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
705 			ntohl(reply.error));
706 		cmd->status = BLK_STS_IOERR;
707 		goto out;
708 	}
709 
710 	dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", req);
711 	if (rq_data_dir(req) != WRITE) {
712 		struct req_iterator iter;
713 		struct bio_vec bvec;
714 
715 		rq_for_each_segment(bvec, req, iter) {
716 			iov_iter_bvec(&to, READ, &bvec, 1, bvec.bv_len);
717 			result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
718 			if (result <= 0) {
719 				dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
720 					result);
721 				/*
722 				 * If we've disconnected or we only have 1
723 				 * connection then we need to make sure we
724 				 * complete this request, otherwise error out
725 				 * and let the timeout stuff handle resubmitting
726 				 * this request onto another connection.
727 				 */
728 				if (nbd_disconnected(config) ||
729 				    config->num_connections <= 1) {
730 					cmd->status = BLK_STS_IOERR;
731 					goto out;
732 				}
733 				ret = -EIO;
734 				goto out;
735 			}
736 			dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n",
737 				req, bvec.bv_len);
738 		}
739 	}
740 out:
741 	trace_nbd_payload_received(req, handle);
742 	mutex_unlock(&cmd->lock);
743 	return ret ? ERR_PTR(ret) : cmd;
744 }
745 
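/*
 * Per-connection receive worker: keeps completing replies until the
 * socket errors out, then marks that connection dead.
 */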
746 static void recv_work(struct work_struct *work)
747 {
748 	struct recv_thread_args *args = container_of(work,
749 						     struct recv_thread_args,
750 						     work);
751 	struct nbd_device *nbd = args->nbd;
752 	struct nbd_config *config = nbd->config;
753 	struct nbd_cmd *cmd;
754 
755 	while (1) {
756 		cmd = nbd_read_stat(nbd, args->index);
757 		if (IS_ERR(cmd)) {
758 			struct nbd_sock *nsock = config->socks[args->index];
759 
760 			mutex_lock(&nsock->tx_lock);
761 			nbd_mark_nsock_dead(nbd, nsock, 1);
762 			mutex_unlock(&nsock->tx_lock);
763 			break;
764 		}
765 
766 		blk_mq_complete_request(blk_mq_rq_from_pdu(cmd));
767 	}
768 	atomic_dec(&config->recv_threads);
769 	wake_up(&config->recv_wq);
770 	nbd_config_put(nbd);
771 	kfree(args);
772 }
773 
774 static bool nbd_clear_req(struct request *req, void *data, bool reserved)
775 {
776 	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
777 
778 	cmd->status = BLK_STS_IOERR;
779 	blk_mq_complete_request(req);
780 	return true;
781 }
782 
783 static void nbd_clear_que(struct nbd_device *nbd)
784 {
785 	blk_mq_quiesce_queue(nbd->disk->queue);
786 	blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL);
787 	blk_mq_unquiesce_queue(nbd->disk->queue);
788 	dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
789 }
790 
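/*
 * Find a live socket to fall back to when the one a command was queued
 * on has died.  Returns the replacement index, or -1 if there is
 * nothing to fall back to.
 */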
791 static int find_fallback(struct nbd_device *nbd, int index)
792 {
793 	struct nbd_config *config = nbd->config;
794 	int new_index = -1;
795 	struct nbd_sock *nsock = config->socks[index];
796 	int fallback = nsock->fallback_index;
797 
798 	if (test_bit(NBD_DISCONNECTED, &config->runtime_flags))
799 		return new_index;
800 
801 	if (config->num_connections <= 1) {
802 		dev_err_ratelimited(disk_to_dev(nbd->disk),
803 				    "Attempted send on invalid socket\n");
804 		return new_index;
805 	}
806 
807 	if (fallback >= 0 && fallback < config->num_connections &&
808 	    !config->socks[fallback]->dead)
809 		return fallback;
810 
811 	if (nsock->fallback_index < 0 ||
812 	    nsock->fallback_index >= config->num_connections ||
813 	    config->socks[nsock->fallback_index]->dead) {
814 		int i;
815 		for (i = 0; i < config->num_connections; i++) {
816 			if (i == index)
817 				continue;
818 			if (!config->socks[i]->dead) {
819 				new_index = i;
820 				break;
821 			}
822 		}
823 		nsock->fallback_index = new_index;
824 		if (new_index < 0) {
825 			dev_err_ratelimited(disk_to_dev(nbd->disk),
826 					    "Dead connection, failed to find a fallback\n");
827 			return new_index;
828 		}
829 	}
830 	new_index = nsock->fallback_index;
831 	return new_index;
832 }
833 
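/*
 * Wait up to dead_conn_timeout for a connection to come back.
 * Returns non-zero if at least one live connection appeared in time.
 */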
834 static int wait_for_reconnect(struct nbd_device *nbd)
835 {
836 	struct nbd_config *config = nbd->config;
837 	if (!config->dead_conn_timeout)
838 		return 0;
839 	if (test_bit(NBD_DISCONNECTED, &config->runtime_flags))
840 		return 0;
841 	return wait_event_timeout(config->conn_wait,
842 				  atomic_read(&config->live_connections) > 0,
843 				  config->dead_conn_timeout) > 0;
844 }
845 
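/*
 * Pick (or fall back to) a live socket and transmit the command on it.
 * Returns 0 on success or after requeueing, a negative errno on
 * permanent failure, or BLK_STS_RESOURCE if the send was interrupted
 * by a pending signal.
 */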
846 static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
847 {
848 	struct request *req = blk_mq_rq_from_pdu(cmd);
849 	struct nbd_device *nbd = cmd->nbd;
850 	struct nbd_config *config;
851 	struct nbd_sock *nsock;
852 	int ret;
853 
854 	if (!refcount_inc_not_zero(&nbd->config_refs)) {
855 		dev_err_ratelimited(disk_to_dev(nbd->disk),
856 				    "Socks array is empty\n");
857 		blk_mq_start_request(req);
858 		return -EINVAL;
859 	}
860 	config = nbd->config;
861 
862 	if (index >= config->num_connections) {
863 		dev_err_ratelimited(disk_to_dev(nbd->disk),
864 				    "Attempted send on invalid socket\n");
865 		nbd_config_put(nbd);
866 		blk_mq_start_request(req);
867 		return -EINVAL;
868 	}
869 	cmd->status = BLK_STS_OK;
870 again:
871 	nsock = config->socks[index];
872 	mutex_lock(&nsock->tx_lock);
873 	if (nsock->dead) {
874 		int old_index = index;
875 		index = find_fallback(nbd, index);
876 		mutex_unlock(&nsock->tx_lock);
877 		if (index < 0) {
878 			if (wait_for_reconnect(nbd)) {
879 				index = old_index;
880 				goto again;
881 			}
882 			/* All the sockets should already be down at this point,
883 			/* All the sockets should already be down at this point;
884 			 * we just want to make sure that DISCONNECTED is set so
885 			 * any requests that come in that were queued waiting
886 			 * and instead just error out.
887 			 */
888 			sock_shutdown(nbd);
889 			nbd_config_put(nbd);
890 			blk_mq_start_request(req);
891 			return -EIO;
892 		}
893 		goto again;
894 	}
895 
896 	/* Handle the case that we have a pending request that was partially
897 	 * transmitted that _has_ to be serviced first.  We need to call requeue
898 	 * here so that it gets put _after_ the request that is already on the
899 	 * dispatch list.
900 	 */
901 	blk_mq_start_request(req);
902 	if (unlikely(nsock->pending && nsock->pending != req)) {
903 		nbd_requeue_cmd(cmd);
904 		ret = 0;
905 		goto out;
906 	}
907 	/*
908 	 * Some failures are related to the link going down, so anything that
909 	 * returns EAGAIN can be retried on a different socket.
910 	 */
911 	ret = nbd_send_cmd(nbd, cmd, index);
912 	if (ret == -EAGAIN) {
913 		dev_err_ratelimited(disk_to_dev(nbd->disk),
914 				    "Request send failed, requeueing\n");
915 		nbd_mark_nsock_dead(nbd, nsock, 1);
916 		nbd_requeue_cmd(cmd);
917 		ret = 0;
918 	}
919 out:
920 	mutex_unlock(&nsock->tx_lock);
921 	nbd_config_put(nbd);
922 	return ret;
923 }
924 
925 static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
926 			const struct blk_mq_queue_data *bd)
927 {
928 	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
929 	int ret;
930 
931 	/*
932 	 * Since we look at the bios to send the request over the network we
933 	 * need to make sure the completion work doesn't mark this request done
934 	 * before we are done doing our send.  This keeps us from dereferencing
935 	 * freed data if we have particularly fast completions (ie we get the
936 	 * completion before we exit sock_xmit on the last bvec) or in the case
937 	 * that the server is misbehaving (or there was an error) before we're
938 	 * done sending everything over the wire.
939 	 */
940 	mutex_lock(&cmd->lock);
941 	clear_bit(NBD_CMD_REQUEUED, &cmd->flags);
942 
943 	/* We can be called directly from the user space process, which means we
944 	 * could possibly have signals pending so our sendmsg will fail.  In
945 	 * this case we need to return that we are busy, otherwise error out as
946 	 * appropriate.
947 	 */
948 	ret = nbd_handle_cmd(cmd, hctx->queue_num);
949 	if (ret < 0)
950 		ret = BLK_STS_IOERR;
951 	else if (!ret)
952 		ret = BLK_STS_OK;
953 	mutex_unlock(&cmd->lock);
954 
955 	return ret;
956 }
957 
958 static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
959 			  bool netlink)
960 {
961 	struct nbd_config *config = nbd->config;
962 	struct socket *sock;
963 	struct nbd_sock **socks;
964 	struct nbd_sock *nsock;
965 	int err;
966 
967 	sock = sockfd_lookup(arg, &err);
968 	if (!sock)
969 		return err;
970 
971 	if (!netlink && !nbd->task_setup &&
972 	    !test_bit(NBD_BOUND, &config->runtime_flags))
973 		nbd->task_setup = current;
974 
975 	if (!netlink &&
976 	    (nbd->task_setup != current ||
977 	     test_bit(NBD_BOUND, &config->runtime_flags))) {
978 		dev_err(disk_to_dev(nbd->disk),
979 			"Device being setup by another task");
980 		sockfd_put(sock);
981 		return -EBUSY;
982 	}
983 
984 	socks = krealloc(config->socks, (config->num_connections + 1) *
985 			 sizeof(struct nbd_sock *), GFP_KERNEL);
986 	if (!socks) {
987 		sockfd_put(sock);
988 		return -ENOMEM;
989 	}
990 	config->socks = socks;
991 
992 	nsock = kzalloc(sizeof(struct nbd_sock), GFP_KERNEL);
993 	if (!nsock) {
994 		sockfd_put(sock);
995 		return -ENOMEM;
996 	}
997 
998 	nsock->fallback_index = -1;
999 	nsock->dead = false;
1000 	mutex_init(&nsock->tx_lock);
1001 	nsock->sock = sock;
1002 	nsock->pending = NULL;
1003 	nsock->sent = 0;
1004 	nsock->cookie = 0;
1005 	socks[config->num_connections++] = nsock;
1006 	atomic_inc(&config->live_connections);
1007 
1008 	return 0;
1009 }
1010 
1011 static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
1012 {
1013 	struct nbd_config *config = nbd->config;
1014 	struct socket *sock, *old;
1015 	struct recv_thread_args *args;
1016 	int i;
1017 	int err;
1018 
1019 	sock = sockfd_lookup(arg, &err);
1020 	if (!sock)
1021 		return err;
1022 
1023 	args = kzalloc(sizeof(*args), GFP_KERNEL);
1024 	if (!args) {
1025 		sockfd_put(sock);
1026 		return -ENOMEM;
1027 	}
1028 
1029 	for (i = 0; i < config->num_connections; i++) {
1030 		struct nbd_sock *nsock = config->socks[i];
1031 
1032 		if (!nsock->dead)
1033 			continue;
1034 
1035 		mutex_lock(&nsock->tx_lock);
1036 		if (!nsock->dead) {
1037 			mutex_unlock(&nsock->tx_lock);
1038 			continue;
1039 		}
1040 		sk_set_memalloc(sock->sk);
1041 		if (nbd->tag_set.timeout)
1042 			sock->sk->sk_sndtimeo = nbd->tag_set.timeout;
1043 		atomic_inc(&config->recv_threads);
1044 		refcount_inc(&nbd->config_refs);
1045 		old = nsock->sock;
1046 		nsock->fallback_index = -1;
1047 		nsock->sock = sock;
1048 		nsock->dead = false;
1049 		INIT_WORK(&args->work, recv_work);
1050 		args->index = i;
1051 		args->nbd = nbd;
1052 		nsock->cookie++;
1053 		mutex_unlock(&nsock->tx_lock);
1054 		sockfd_put(old);
1055 
1056 		clear_bit(NBD_DISCONNECTED, &config->runtime_flags);
1057 
1058 		/* recv_work() takes the tx_lock in an error path, so we
1059 		 * need to queue_work() outside of the tx_lock.
1060 		 */
1061 		queue_work(nbd->recv_workq, &args->work);
1062 
1063 		atomic_inc(&config->live_connections);
1064 		wake_up(&config->conn_wait);
1065 		return 0;
1066 	}
1067 	sockfd_put(sock);
1068 	kfree(args);
1069 	return -ENOSPC;
1070 }
1071 
1072 static void nbd_bdev_reset(struct block_device *bdev)
1073 {
1074 	if (bdev->bd_openers > 1)
1075 		return;
1076 	bd_set_size(bdev, 0);
1077 }
1078 
1079 static void nbd_parse_flags(struct nbd_device *nbd)
1080 {
1081 	struct nbd_config *config = nbd->config;
1082 	if (config->flags & NBD_FLAG_READ_ONLY)
1083 		set_disk_ro(nbd->disk, true);
1084 	else
1085 		set_disk_ro(nbd->disk, false);
1086 	if (config->flags & NBD_FLAG_SEND_TRIM)
1087 		blk_queue_flag_set(QUEUE_FLAG_DISCARD, nbd->disk->queue);
1088 	if (config->flags & NBD_FLAG_SEND_FLUSH) {
1089 		if (config->flags & NBD_FLAG_SEND_FUA)
1090 			blk_queue_write_cache(nbd->disk->queue, true, true);
1091 		else
1092 			blk_queue_write_cache(nbd->disk->queue, true, false);
1093 	} else
1095 		blk_queue_write_cache(nbd->disk->queue, false, false);
1096 }
1097 
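/*
 * Send NBD_CMD_DISC on every connection so the server closes its side.
 */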
1098 static void send_disconnects(struct nbd_device *nbd)
1099 {
1100 	struct nbd_config *config = nbd->config;
1101 	struct nbd_request request = {
1102 		.magic = htonl(NBD_REQUEST_MAGIC),
1103 		.type = htonl(NBD_CMD_DISC),
1104 	};
1105 	struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
1106 	struct iov_iter from;
1107 	int i, ret;
1108 
1109 	for (i = 0; i < config->num_connections; i++) {
1110 		struct nbd_sock *nsock = config->socks[i];
1111 
1112 		iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request));
1113 		mutex_lock(&nsock->tx_lock);
1114 		ret = sock_xmit(nbd, i, 1, &from, 0, NULL);
1115 		if (ret <= 0)
1116 			dev_err(disk_to_dev(nbd->disk),
1117 				"Send disconnect failed %d\n", ret);
1118 		mutex_unlock(&nsock->tx_lock);
1119 	}
1120 }
1121 
1122 static int nbd_disconnect(struct nbd_device *nbd)
1123 {
1124 	struct nbd_config *config = nbd->config;
1125 
1126 	dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
1127 	set_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags);
1128 	send_disconnects(nbd);
1129 	return 0;
1130 }
1131 
1132 static void nbd_clear_sock(struct nbd_device *nbd)
1133 {
1134 	sock_shutdown(nbd);
1135 	nbd_clear_que(nbd);
1136 	nbd->task_setup = NULL;
1137 }
1138 
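/*
 * Drop a reference on the configuration.  The final put tears down the
 * sockets, debugfs entries, pid attribute and receive workqueue.
 */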
1139 static void nbd_config_put(struct nbd_device *nbd)
1140 {
1141 	if (refcount_dec_and_mutex_lock(&nbd->config_refs,
1142 					&nbd->config_lock)) {
1143 		struct nbd_config *config = nbd->config;
1144 		nbd_dev_dbg_close(nbd);
1145 		nbd_size_clear(nbd);
1146 		if (test_and_clear_bit(NBD_HAS_PID_FILE,
1147 				       &config->runtime_flags))
1148 			device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
1149 		nbd->task_recv = NULL;
1150 		nbd_clear_sock(nbd);
1151 		if (config->num_connections) {
1152 			int i;
1153 			for (i = 0; i < config->num_connections; i++) {
1154 				sockfd_put(config->socks[i]->sock);
1155 				kfree(config->socks[i]);
1156 			}
1157 			kfree(config->socks);
1158 		}
1159 		kfree(nbd->config);
1160 		nbd->config = NULL;
1161 
1162 		if (nbd->recv_workq)
1163 			destroy_workqueue(nbd->recv_workq);
1164 		nbd->recv_workq = NULL;
1165 
1166 		nbd->tag_set.timeout = 0;
1167 		nbd->disk->queue->limits.discard_granularity = 0;
1168 		nbd->disk->queue->limits.discard_alignment = 0;
1169 		blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX);
1170 		blk_queue_flag_clear(QUEUE_FLAG_DISCARD, nbd->disk->queue);
1171 
1172 		mutex_unlock(&nbd->config_lock);
1173 		nbd_put(nbd);
1174 		module_put(THIS_MODULE);
1175 	}
1176 }
1177 
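/*
 * Bring the device up: allocate the receive workqueue, spread the
 * hardware queues across the configured connections and start one
 * receive worker per socket.
 */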
1178 static int nbd_start_device(struct nbd_device *nbd)
1179 {
1180 	struct nbd_config *config = nbd->config;
1181 	int num_connections = config->num_connections;
1182 	int error = 0, i;
1183 
1184 	if (nbd->task_recv)
1185 		return -EBUSY;
1186 	if (!config->socks)
1187 		return -EINVAL;
1188 	if (num_connections > 1 &&
1189 	    !(config->flags & NBD_FLAG_CAN_MULTI_CONN)) {
1190 		dev_err(disk_to_dev(nbd->disk), "server does not support multiple connections per device.\n");
1191 		return -EINVAL;
1192 	}
1193 
1194 	nbd->recv_workq = alloc_workqueue("knbd%d-recv",
1195 					  WQ_MEM_RECLAIM | WQ_HIGHPRI |
1196 					  WQ_UNBOUND, 0, nbd->index);
1197 	if (!nbd->recv_workq) {
1198 		dev_err(disk_to_dev(nbd->disk), "Could not allocate knbd recv work queue.\n");
1199 		return -ENOMEM;
1200 	}
1201 
1202 	blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections);
1203 	nbd->task_recv = current;
1204 
1205 	nbd_parse_flags(nbd);
1206 
1207 	error = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
1208 	if (error) {
1209 		dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
1210 		return error;
1211 	}
1212 	set_bit(NBD_HAS_PID_FILE, &config->runtime_flags);
1213 
1214 	nbd_dev_dbg_init(nbd);
1215 	for (i = 0; i < num_connections; i++) {
1216 		struct recv_thread_args *args;
1217 
1218 		args = kzalloc(sizeof(*args), GFP_KERNEL);
1219 		if (!args) {
1220 			sock_shutdown(nbd);
1221 			return -ENOMEM;
1222 		}
1223 		sk_set_memalloc(config->socks[i]->sock->sk);
1224 		if (nbd->tag_set.timeout)
1225 			config->socks[i]->sock->sk->sk_sndtimeo =
1226 				nbd->tag_set.timeout;
1227 		atomic_inc(&config->recv_threads);
1228 		refcount_inc(&nbd->config_refs);
1229 		INIT_WORK(&args->work, recv_work);
1230 		args->nbd = nbd;
1231 		args->index = i;
1232 		queue_work(nbd->recv_workq, &args->work);
1233 	}
1234 	nbd_size_update(nbd);
1235 	return error;
1236 }
1237 
1238 static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *bdev)
1239 {
1240 	struct nbd_config *config = nbd->config;
1241 	int ret;
1242 
1243 	ret = nbd_start_device(nbd);
1244 	if (ret)
1245 		return ret;
1246 
1247 	if (max_part)
1248 		bdev->bd_invalidated = 1;
1249 	mutex_unlock(&nbd->config_lock);
1250 	ret = wait_event_interruptible(config->recv_wq,
1251 					 atomic_read(&config->recv_threads) == 0);
1252 	if (ret) {
1253 		sock_shutdown(nbd);
1254 		flush_workqueue(nbd->recv_workq);
1255 	}
1256 	mutex_lock(&nbd->config_lock);
1257 	nbd_bdev_reset(bdev);
1258 	/* user requested, ignore socket errors */
1259 	if (test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags))
1260 		ret = 0;
1261 	if (test_bit(NBD_TIMEDOUT, &config->runtime_flags))
1262 		ret = -ETIMEDOUT;
1263 	return ret;
1264 }
1265 
1266 static void nbd_clear_sock_ioctl(struct nbd_device *nbd,
1267 				 struct block_device *bdev)
1268 {
1269 	sock_shutdown(nbd);
1270 	__invalidate_device(bdev, true);
1271 	nbd_bdev_reset(bdev);
1272 	if (test_and_clear_bit(NBD_HAS_CONFIG_REF,
1273 			       &nbd->config->runtime_flags))
1274 		nbd_config_put(nbd);
1275 }
1276 
1277 static bool nbd_is_valid_blksize(unsigned long blksize)
1278 {
1279 	if (!blksize || !is_power_of_2(blksize) || blksize < 512 ||
1280 	    blksize > PAGE_SIZE)
1281 		return false;
1282 	return true;
1283 }
1284 
1285 static void nbd_set_cmd_timeout(struct nbd_device *nbd, u64 timeout)
1286 {
1287 	nbd->tag_set.timeout = timeout * HZ;
1288 	if (timeout)
1289 		blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
1290 }
1291 
1292 /* Must be called with config_lock held */
1293 static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
1294 		       unsigned int cmd, unsigned long arg)
1295 {
1296 	struct nbd_config *config = nbd->config;
1297 
1298 	switch (cmd) {
1299 	case NBD_DISCONNECT:
1300 		return nbd_disconnect(nbd);
1301 	case NBD_CLEAR_SOCK:
1302 		nbd_clear_sock_ioctl(nbd, bdev);
1303 		return 0;
1304 	case NBD_SET_SOCK:
1305 		return nbd_add_socket(nbd, arg, false);
1306 	case NBD_SET_BLKSIZE:
1307 		if (!arg)
1308 			arg = NBD_DEF_BLKSIZE;
1309 		if (!nbd_is_valid_blksize(arg))
1310 			return -EINVAL;
1311 		nbd_size_set(nbd, arg,
1312 			     div_s64(config->bytesize, arg));
1313 		return 0;
1314 	case NBD_SET_SIZE:
1315 		nbd_size_set(nbd, config->blksize,
1316 			     div_s64(arg, config->blksize));
1317 		return 0;
1318 	case NBD_SET_SIZE_BLOCKS:
1319 		nbd_size_set(nbd, config->blksize, arg);
1320 		return 0;
1321 	case NBD_SET_TIMEOUT:
1322 		nbd_set_cmd_timeout(nbd, arg);
1323 		return 0;
1324 
1325 	case NBD_SET_FLAGS:
1326 		config->flags = arg;
1327 		return 0;
1328 	case NBD_DO_IT:
1329 		return nbd_start_device_ioctl(nbd, bdev);
1330 	case NBD_CLEAR_QUE:
1331 		/*
1332 		 * This is for compatibility only.  The queue is always cleared
1333 		 * by NBD_DO_IT or NBD_CLEAR_SOCK.
1334 		 */
1335 		return 0;
1336 	case NBD_PRINT_DEBUG:
1337 		/*
1338 		 * For compatibility only, we no longer keep a list of
1339 		 * outstanding requests.
1340 		 */
1341 		return 0;
1342 	}
1343 	return -ENOTTY;
1344 }
1345 
1346 static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
1347 		     unsigned int cmd, unsigned long arg)
1348 {
1349 	struct nbd_device *nbd = bdev->bd_disk->private_data;
1350 	struct nbd_config *config = nbd->config;
1351 	int error = -EINVAL;
1352 
1353 	if (!capable(CAP_SYS_ADMIN))
1354 		return -EPERM;
1355 
1356 	/* The block layer will pass back some non-nbd ioctls in case we have
1357 	 * special handling for them, but we don't, so just return an error.
1358 	 */
1359 	if (_IOC_TYPE(cmd) != 0xab)
1360 		return -EINVAL;
1361 
1362 	mutex_lock(&nbd->config_lock);
1363 
1364 	/* Don't allow ioctl operations on an nbd device that was created with
1365 	 * netlink, unless it's DISCONNECT or CLEAR_SOCK, which are fine.
1366 	 */
1367 	if (!test_bit(NBD_BOUND, &config->runtime_flags) ||
1368 	    (cmd == NBD_DISCONNECT || cmd == NBD_CLEAR_SOCK))
1369 		error = __nbd_ioctl(bdev, nbd, cmd, arg);
1370 	else
1371 		dev_err(nbd_to_dev(nbd), "Cannot use ioctl interface on a netlink controlled device.\n");
1372 	mutex_unlock(&nbd->config_lock);
1373 	return error;
1374 }
1375 
1376 static struct nbd_config *nbd_alloc_config(void)
1377 {
1378 	struct nbd_config *config;
1379 
1380 	config = kzalloc(sizeof(struct nbd_config), GFP_NOFS);
1381 	if (!config)
1382 		return NULL;
1383 	atomic_set(&config->recv_threads, 0);
1384 	init_waitqueue_head(&config->recv_wq);
1385 	init_waitqueue_head(&config->conn_wait);
1386 	config->blksize = NBD_DEF_BLKSIZE;
1387 	atomic_set(&config->live_connections, 0);
1388 	try_module_get(THIS_MODULE);
1389 	return config;
1390 }
1391 
1392 static int nbd_open(struct block_device *bdev, fmode_t mode)
1393 {
1394 	struct nbd_device *nbd;
1395 	int ret = 0;
1396 
1397 	mutex_lock(&nbd_index_mutex);
1398 	nbd = bdev->bd_disk->private_data;
1399 	if (!nbd) {
1400 		ret = -ENXIO;
1401 		goto out;
1402 	}
1403 	if (!refcount_inc_not_zero(&nbd->refs)) {
1404 		ret = -ENXIO;
1405 		goto out;
1406 	}
1407 	if (!refcount_inc_not_zero(&nbd->config_refs)) {
1408 		struct nbd_config *config;
1409 
1410 		mutex_lock(&nbd->config_lock);
1411 		if (refcount_inc_not_zero(&nbd->config_refs)) {
1412 			mutex_unlock(&nbd->config_lock);
1413 			goto out;
1414 		}
1415 		config = nbd->config = nbd_alloc_config();
1416 		if (!config) {
1417 			ret = -ENOMEM;
1418 			mutex_unlock(&nbd->config_lock);
1419 			goto out;
1420 		}
1421 		refcount_set(&nbd->config_refs, 1);
1422 		refcount_inc(&nbd->refs);
1423 		mutex_unlock(&nbd->config_lock);
1424 		bdev->bd_invalidated = 1;
1425 	} else if (nbd_disconnected(nbd->config)) {
1426 		bdev->bd_invalidated = 1;
1427 	}
1428 out:
1429 	mutex_unlock(&nbd_index_mutex);
1430 	return ret;
1431 }
1432 
1433 static void nbd_release(struct gendisk *disk, fmode_t mode)
1434 {
1435 	struct nbd_device *nbd = disk->private_data;
1436 	struct block_device *bdev = bdget_disk(disk, 0);
1437 
1438 	if (test_bit(NBD_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) &&
1439 			bdev->bd_openers == 0)
1440 		nbd_disconnect_and_put(nbd);
1441 
1442 	nbd_config_put(nbd);
1443 	nbd_put(nbd);
1444 }
1445 
1446 static const struct block_device_operations nbd_fops =
1447 {
1448 	.owner =	THIS_MODULE,
1449 	.open =		nbd_open,
1450 	.release =	nbd_release,
1451 	.ioctl =	nbd_ioctl,
1452 	.compat_ioctl =	nbd_ioctl,
1453 };
1454 
1455 #if IS_ENABLED(CONFIG_DEBUG_FS)
1456 
1457 static int nbd_dbg_tasks_show(struct seq_file *s, void *unused)
1458 {
1459 	struct nbd_device *nbd = s->private;
1460 
1461 	if (nbd->task_recv)
1462 		seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv));
1463 
1464 	return 0;
1465 }
1466 
1467 static int nbd_dbg_tasks_open(struct inode *inode, struct file *file)
1468 {
1469 	return single_open(file, nbd_dbg_tasks_show, inode->i_private);
1470 }
1471 
1472 static const struct file_operations nbd_dbg_tasks_ops = {
1473 	.open = nbd_dbg_tasks_open,
1474 	.read = seq_read,
1475 	.llseek = seq_lseek,
1476 	.release = single_release,
1477 };
1478 
1479 static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
1480 {
1481 	struct nbd_device *nbd = s->private;
1482 	u32 flags = nbd->config->flags;
1483 
1484 	seq_printf(s, "Hex: 0x%08x\n\n", flags);
1485 
1486 	seq_puts(s, "Known flags:\n");
1487 
1488 	if (flags & NBD_FLAG_HAS_FLAGS)
1489 		seq_puts(s, "NBD_FLAG_HAS_FLAGS\n");
1490 	if (flags & NBD_FLAG_READ_ONLY)
1491 		seq_puts(s, "NBD_FLAG_READ_ONLY\n");
1492 	if (flags & NBD_FLAG_SEND_FLUSH)
1493 		seq_puts(s, "NBD_FLAG_SEND_FLUSH\n");
1494 	if (flags & NBD_FLAG_SEND_FUA)
1495 		seq_puts(s, "NBD_FLAG_SEND_FUA\n");
1496 	if (flags & NBD_FLAG_SEND_TRIM)
1497 		seq_puts(s, "NBD_FLAG_SEND_TRIM\n");
1498 
1499 	return 0;
1500 }
1501 
1502 static int nbd_dbg_flags_open(struct inode *inode, struct file *file)
1503 {
1504 	return single_open(file, nbd_dbg_flags_show, inode->i_private);
1505 }
1506 
1507 static const struct file_operations nbd_dbg_flags_ops = {
1508 	.open = nbd_dbg_flags_open,
1509 	.read = seq_read,
1510 	.llseek = seq_lseek,
1511 	.release = single_release,
1512 };
1513 
1514 static int nbd_dev_dbg_init(struct nbd_device *nbd)
1515 {
1516 	struct dentry *dir;
1517 	struct nbd_config *config = nbd->config;
1518 
1519 	if (!nbd_dbg_dir)
1520 		return -EIO;
1521 
1522 	dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir);
1523 	if (!dir) {
1524 		dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s'\n",
1525 			nbd_name(nbd));
1526 		return -EIO;
1527 	}
1528 	config->dbg_dir = dir;
1529 
1530 	debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops);
1531 	debugfs_create_u64("size_bytes", 0444, dir, &config->bytesize);
1532 	debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout);
1533 	debugfs_create_u64("blocksize", 0444, dir, &config->blksize);
1534 	debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops);
1535 
1536 	return 0;
1537 }
1538 
1539 static void nbd_dev_dbg_close(struct nbd_device *nbd)
1540 {
1541 	debugfs_remove_recursive(nbd->config->dbg_dir);
1542 }
1543 
1544 static int nbd_dbg_init(void)
1545 {
1546 	struct dentry *dbg_dir;
1547 
1548 	dbg_dir = debugfs_create_dir("nbd", NULL);
1549 	if (!dbg_dir)
1550 		return -EIO;
1551 
1552 	nbd_dbg_dir = dbg_dir;
1553 
1554 	return 0;
1555 }
1556 
1557 static void nbd_dbg_close(void)
1558 {
1559 	debugfs_remove_recursive(nbd_dbg_dir);
1560 }
1561 
1562 #else  /* IS_ENABLED(CONFIG_DEBUG_FS) */
1563 
1564 static int nbd_dev_dbg_init(struct nbd_device *nbd)
1565 {
1566 	return 0;
1567 }
1568 
1569 static void nbd_dev_dbg_close(struct nbd_device *nbd)
1570 {
1571 }
1572 
1573 static int nbd_dbg_init(void)
1574 {
1575 	return 0;
1576 }
1577 
1578 static void nbd_dbg_close(void)
1579 {
1580 }
1581 
1582 #endif
1583 
1584 static int nbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
1585 			    unsigned int hctx_idx, unsigned int numa_node)
1586 {
1587 	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq);
1588 	cmd->nbd = set->driver_data;
1589 	cmd->flags = 0;
1590 	mutex_init(&cmd->lock);
1591 	return 0;
1592 }
1593 
1594 static const struct blk_mq_ops nbd_mq_ops = {
1595 	.queue_rq	= nbd_queue_rq,
1596 	.complete	= nbd_complete_rq,
1597 	.init_request	= nbd_init_request,
1598 	.timeout	= nbd_xmit_timeout,
1599 };
1600 
1601 static int nbd_dev_add(int index)
1602 {
1603 	struct nbd_device *nbd;
1604 	struct gendisk *disk;
1605 	struct request_queue *q;
1606 	int err = -ENOMEM;
1607 
1608 	nbd = kzalloc(sizeof(struct nbd_device), GFP_KERNEL);
1609 	if (!nbd)
1610 		goto out;
1611 
1612 	disk = alloc_disk(1 << part_shift);
1613 	if (!disk)
1614 		goto out_free_nbd;
1615 
1616 	if (index >= 0) {
1617 		err = idr_alloc(&nbd_index_idr, nbd, index, index + 1,
1618 				GFP_KERNEL);
1619 		if (err == -ENOSPC)
1620 			err = -EEXIST;
1621 	} else {
1622 		err = idr_alloc(&nbd_index_idr, nbd, 0, 0, GFP_KERNEL);
1623 		if (err >= 0)
1624 			index = err;
1625 	}
1626 	if (err < 0)
1627 		goto out_free_disk;
1628 
1629 	nbd->index = index;
1630 	nbd->disk = disk;
1631 	nbd->tag_set.ops = &nbd_mq_ops;
1632 	nbd->tag_set.nr_hw_queues = 1;
1633 	nbd->tag_set.queue_depth = 128;
1634 	nbd->tag_set.numa_node = NUMA_NO_NODE;
1635 	nbd->tag_set.cmd_size = sizeof(struct nbd_cmd);
1636 	nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
1637 		BLK_MQ_F_BLOCKING;
1638 	nbd->tag_set.driver_data = nbd;
1639 
1640 	err = blk_mq_alloc_tag_set(&nbd->tag_set);
1641 	if (err)
1642 		goto out_free_idr;
1643 
1644 	q = blk_mq_init_queue(&nbd->tag_set);
1645 	if (IS_ERR(q)) {
1646 		err = PTR_ERR(q);
1647 		goto out_free_tags;
1648 	}
1649 	disk->queue = q;
1650 
1651 	/*
1652 	 * Tell the block layer that we are not a rotational device
1653 	 */
1654 	blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
1655 	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
1656 	disk->queue->limits.discard_granularity = 0;
1657 	disk->queue->limits.discard_alignment = 0;
1658 	blk_queue_max_discard_sectors(disk->queue, 0);
1659 	blk_queue_max_segment_size(disk->queue, UINT_MAX);
1660 	blk_queue_max_segments(disk->queue, USHRT_MAX);
1661 	blk_queue_max_hw_sectors(disk->queue, 65536);
1662 	disk->queue->limits.max_sectors = 256;
1663 
1664 	mutex_init(&nbd->config_lock);
1665 	refcount_set(&nbd->config_refs, 0);
1666 	refcount_set(&nbd->refs, 1);
1667 	INIT_LIST_HEAD(&nbd->list);
1668 	disk->major = NBD_MAJOR;
1669 	disk->first_minor = index << part_shift;
1670 	disk->fops = &nbd_fops;
1671 	disk->private_data = nbd;
1672 	sprintf(disk->disk_name, "nbd%d", index);
1673 	add_disk(disk);
1674 	nbd_total_devices++;
1675 	return index;
1676 
1677 out_free_tags:
1678 	blk_mq_free_tag_set(&nbd->tag_set);
1679 out_free_idr:
1680 	idr_remove(&nbd_index_idr, index);
1681 out_free_disk:
1682 	put_disk(disk);
1683 out_free_nbd:
1684 	kfree(nbd);
1685 out:
1686 	return err;
1687 }
1688 
1689 static int find_free_cb(int id, void *ptr, void *data)
1690 {
1691 	struct nbd_device *nbd = ptr;
1692 	struct nbd_device **found = data;
1693 
1694 	if (!refcount_read(&nbd->config_refs)) {
1695 		*found = nbd;
1696 		return 1;
1697 	}
1698 	return 0;
1699 }
1700 
1701 /* Netlink interface. */
1702 static const struct nla_policy nbd_attr_policy[NBD_ATTR_MAX + 1] = {
1703 	[NBD_ATTR_INDEX]		=	{ .type = NLA_U32 },
1704 	[NBD_ATTR_SIZE_BYTES]		=	{ .type = NLA_U64 },
1705 	[NBD_ATTR_BLOCK_SIZE_BYTES]	=	{ .type = NLA_U64 },
1706 	[NBD_ATTR_TIMEOUT]		=	{ .type = NLA_U64 },
1707 	[NBD_ATTR_SERVER_FLAGS]		=	{ .type = NLA_U64 },
1708 	[NBD_ATTR_CLIENT_FLAGS]		=	{ .type = NLA_U64 },
1709 	[NBD_ATTR_SOCKETS]		=	{ .type = NLA_NESTED},
1710 	[NBD_ATTR_DEAD_CONN_TIMEOUT]	=	{ .type = NLA_U64 },
1711 	[NBD_ATTR_DEVICE_LIST]		=	{ .type = NLA_NESTED},
1712 };
1713 
1714 static const struct nla_policy nbd_sock_policy[NBD_SOCK_MAX + 1] = {
1715 	[NBD_SOCK_FD]			=	{ .type = NLA_U32 },
1716 };
1717 
1718 /* We don't use this right now since we don't parse the incoming list, but we
1719  * still want it here so userspace knows what to expect.
1720  */
1721 static const struct nla_policy __attribute__((unused))
1722 nbd_device_policy[NBD_DEVICE_ATTR_MAX + 1] = {
1723 	[NBD_DEVICE_INDEX]		=	{ .type = NLA_U32 },
1724 	[NBD_DEVICE_CONNECTED]		=	{ .type = NLA_U8 },
1725 };
1726 
1727 static int nbd_genl_size_set(struct genl_info *info, struct nbd_device *nbd)
1728 {
1729 	struct nbd_config *config = nbd->config;
1730 	u64 bsize = config->blksize;
1731 	u64 bytes = config->bytesize;
1732 
1733 	if (info->attrs[NBD_ATTR_SIZE_BYTES])
1734 		bytes = nla_get_u64(info->attrs[NBD_ATTR_SIZE_BYTES]);
1735 
1736 	if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]) {
1737 		bsize = nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]);
1738 		if (!bsize)
1739 			bsize = NBD_DEF_BLKSIZE;
1740 		if (!nbd_is_valid_blksize(bsize)) {
1741 			printk(KERN_ERR "Invalid block size %llu\n", bsize);
1742 			return -EINVAL;
1743 		}
1744 	}
1745 
1746 	if (bytes != config->bytesize || bsize != config->blksize)
1747 		nbd_size_set(nbd, bsize, div64_u64(bytes, bsize));
1748 	return 0;
1749 }
1750 
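/*
 * NBD_CMD_CONNECT handler: find or create the requested device,
 * allocate its config, apply the netlink attributes and start it.
 */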
1751 static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
1752 {
1753 	struct nbd_device *nbd = NULL;
1754 	struct nbd_config *config;
1755 	int index = -1;
1756 	int ret;
1757 	bool put_dev = false;
1758 
1759 	if (!netlink_capable(skb, CAP_SYS_ADMIN))
1760 		return -EPERM;
1761 
1762 	if (info->attrs[NBD_ATTR_INDEX])
1763 		index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
1764 	if (!info->attrs[NBD_ATTR_SOCKETS]) {
1765 		printk(KERN_ERR "nbd: must specify at least one socket\n");
1766 		return -EINVAL;
1767 	}
1768 	if (!info->attrs[NBD_ATTR_SIZE_BYTES]) {
1769 		printk(KERN_ERR "nbd: must specify a size in bytes for the device\n");
1770 		return -EINVAL;
1771 	}
1772 again:
1773 	mutex_lock(&nbd_index_mutex);
1774 	if (index == -1) {
1775 		ret = idr_for_each(&nbd_index_idr, &find_free_cb, &nbd);
1776 		if (ret == 0) {
1777 			int new_index;
1778 			new_index = nbd_dev_add(-1);
1779 			if (new_index < 0) {
1780 				mutex_unlock(&nbd_index_mutex);
1781 				printk(KERN_ERR "nbd: failed to add new device\n");
1782 				return new_index;
1783 			}
1784 			nbd = idr_find(&nbd_index_idr, new_index);
1785 		}
1786 	} else {
1787 		nbd = idr_find(&nbd_index_idr, index);
1788 		if (!nbd) {
1789 			ret = nbd_dev_add(index);
1790 			if (ret < 0) {
1791 				mutex_unlock(&nbd_index_mutex);
1792 				printk(KERN_ERR "nbd: failed to add new device\n");
1793 				return ret;
1794 			}
1795 			nbd = idr_find(&nbd_index_idr, index);
1796 		}
1797 	}
1798 	if (!nbd) {
1799 		printk(KERN_ERR "nbd: couldn't find device at index %d\n",
1800 		       index);
1801 		mutex_unlock(&nbd_index_mutex);
1802 		return -EINVAL;
1803 	}
1804 	if (!refcount_inc_not_zero(&nbd->refs)) {
1805 		mutex_unlock(&nbd_index_mutex);
1806 		if (index == -1)
1807 			goto again;
1808 		printk(KERN_ERR "nbd: device at index %d is going down\n",
1809 		       index);
1810 		return -EINVAL;
1811 	}
1812 	mutex_unlock(&nbd_index_mutex);
1813 
1814 	mutex_lock(&nbd->config_lock);
1815 	if (refcount_read(&nbd->config_refs)) {
1816 		mutex_unlock(&nbd->config_lock);
1817 		nbd_put(nbd);
1818 		if (index == -1)
1819 			goto again;
1820 		printk(KERN_ERR "nbd: nbd%d already in use\n", index);
1821 		return -EBUSY;
1822 	}
1823 	if (WARN_ON(nbd->config)) {
1824 		mutex_unlock(&nbd->config_lock);
1825 		nbd_put(nbd);
1826 		return -EINVAL;
1827 	}
1828 	config = nbd->config = nbd_alloc_config();
1829 	if (!nbd->config) {
1830 		mutex_unlock(&nbd->config_lock);
1831 		nbd_put(nbd);
1832 		printk(KERN_ERR "nbd: couldn't allocate config\n");
1833 		return -ENOMEM;
1834 	}
1835 	refcount_set(&nbd->config_refs, 1);
1836 	set_bit(NBD_BOUND, &config->runtime_flags);
1837 
1838 	ret = nbd_genl_size_set(info, nbd);
1839 	if (ret)
1840 		goto out;
1841 
1842 	if (info->attrs[NBD_ATTR_TIMEOUT])
1843 		nbd_set_cmd_timeout(nbd,
1844 				    nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]));
1845 	if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
1846 		config->dead_conn_timeout =
1847 			nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
1848 		config->dead_conn_timeout *= HZ;
1849 	}
1850 	if (info->attrs[NBD_ATTR_SERVER_FLAGS])
1851 		config->flags =
1852 			nla_get_u64(info->attrs[NBD_ATTR_SERVER_FLAGS]);
1853 	if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
1854 		u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
1855 		if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
1856 			set_bit(NBD_DESTROY_ON_DISCONNECT,
1857 				&config->runtime_flags);
1858 			put_dev = true;
1859 		}
1860 		if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
1861 			set_bit(NBD_DISCONNECT_ON_CLOSE,
1862 				&config->runtime_flags);
1863 		}
1864 	}
1865 
1866 	if (info->attrs[NBD_ATTR_SOCKETS]) {
1867 		struct nlattr *attr;
1868 		int rem, fd;
1869 
1870 		nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS],
1871 				    rem) {
1872 			struct nlattr *socks[NBD_SOCK_MAX+1];
1873 
1874 			if (nla_type(attr) != NBD_SOCK_ITEM) {
1875 				printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n");
1876 				ret = -EINVAL;
1877 				goto out;
1878 			}
1879 			ret = nla_parse_nested_deprecated(socks, NBD_SOCK_MAX,
1880 							  attr,
1881 							  nbd_sock_policy,
1882 							  info->extack);
1883 			if (ret != 0) {
1884 				printk(KERN_ERR "nbd: error processing sock list\n");
1885 				ret = -EINVAL;
1886 				goto out;
1887 			}
1888 			if (!socks[NBD_SOCK_FD])
1889 				continue;
1890 			fd = (int)nla_get_u32(socks[NBD_SOCK_FD]);
1891 			ret = nbd_add_socket(nbd, fd, true);
1892 			if (ret)
1893 				goto out;
1894 		}
1895 	}
1896 	ret = nbd_start_device(nbd);
1897 out:
1898 	mutex_unlock(&nbd->config_lock);
1899 	if (!ret) {
1900 		set_bit(NBD_HAS_CONFIG_REF, &config->runtime_flags);
1901 		refcount_inc(&nbd->config_refs);
1902 		nbd_connect_reply(info, nbd->index);
1903 	}
1904 	nbd_config_put(nbd);
1905 	if (put_dev)
1906 		nbd_put(nbd);
1907 	return ret;
1908 }
1909 
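/*
 * Tear down an active configuration: send the disconnect request, drop the
 * sockets, then flush the receive workqueue before releasing the config
 * reference that was taken when the device was connected over netlink.
 */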
1910 static void nbd_disconnect_and_put(struct nbd_device *nbd)
1911 {
1912 	mutex_lock(&nbd->config_lock);
1913 	nbd_disconnect(nbd);
1914 	nbd_clear_sock(nbd);
1915 	mutex_unlock(&nbd->config_lock);
1916 	/*
1917 	 * Make sure the recv thread has finished, so that it does not drop
1918 	 * the last config ref and try to destroy the workqueue from inside
1919 	 * the workqueue itself.
1920 	 */
1921 	flush_workqueue(nbd->recv_workq);
1922 	if (test_and_clear_bit(NBD_HAS_CONFIG_REF,
1923 			       &nbd->config->runtime_flags))
1924 		nbd_config_put(nbd);
1925 }
1926 
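/*
 * NBD_CMD_DISCONNECT handler.  Requires CAP_SYS_ADMIN and an explicit
 * NBD_ATTR_INDEX; if the device still has a live configuration it is torn
 * down via nbd_disconnect_and_put(), otherwise the request is a no-op.
 */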
1927 static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info)
1928 {
1929 	struct nbd_device *nbd;
1930 	int index;
1931 
1932 	if (!netlink_capable(skb, CAP_SYS_ADMIN))
1933 		return -EPERM;
1934 
1935 	if (!info->attrs[NBD_ATTR_INDEX]) {
1936 		printk(KERN_ERR "nbd: must specify an index to disconnect\n");
1937 		return -EINVAL;
1938 	}
1939 	index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
1940 	mutex_lock(&nbd_index_mutex);
1941 	nbd = idr_find(&nbd_index_idr, index);
1942 	if (!nbd) {
1943 		mutex_unlock(&nbd_index_mutex);
1944 		printk(KERN_ERR "nbd: couldn't find device at index %d\n",
1945 		       index);
1946 		return -EINVAL;
1947 	}
1948 	if (!refcount_inc_not_zero(&nbd->refs)) {
1949 		mutex_unlock(&nbd_index_mutex);
1950 		printk(KERN_ERR "nbd: device at index %d is going down\n",
1951 		       index);
1952 		return -EINVAL;
1953 	}
1954 	mutex_unlock(&nbd_index_mutex);
1955 	if (!refcount_inc_not_zero(&nbd->config_refs)) {
1956 		nbd_put(nbd);
1957 		return 0;
1958 	}
1959 	nbd_disconnect_and_put(nbd);
1960 	nbd_config_put(nbd);
1961 	nbd_put(nbd);
1962 	return 0;
1963 }
1964 
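/*
 * NBD_CMD_RECONFIGURE handler.  Only valid for a device that is already
 * bound and has a receive task; it may update the size and timeout
 * attributes, toggle the destroy-on-disconnect / disconnect-on-close
 * behaviour, and hand in replacement sockets for dead connections.
 */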
1965 static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
1966 {
1967 	struct nbd_device *nbd = NULL;
1968 	struct nbd_config *config;
1969 	int index;
1970 	int ret = 0;
1971 	bool put_dev = false;
1972 
1973 	if (!netlink_capable(skb, CAP_SYS_ADMIN))
1974 		return -EPERM;
1975 
1976 	if (!info->attrs[NBD_ATTR_INDEX]) {
1977 		printk(KERN_ERR "nbd: must specify a device to reconfigure\n");
1978 		return -EINVAL;
1979 	}
1980 	index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
1981 	mutex_lock(&nbd_index_mutex);
1982 	nbd = idr_find(&nbd_index_idr, index);
1983 	if (!nbd) {
1984 		mutex_unlock(&nbd_index_mutex);
1985 		printk(KERN_ERR "nbd: couldn't find a device at index %d\n",
1986 		       index);
1987 		return -EINVAL;
1988 	}
1989 	if (!refcount_inc_not_zero(&nbd->refs)) {
1990 		mutex_unlock(&nbd_index_mutex);
1991 		printk(KERN_ERR "nbd: device at index %d is going down\n",
1992 		       index);
1993 		return -EINVAL;
1994 	}
1995 	mutex_unlock(&nbd_index_mutex);
1996 
1997 	if (!refcount_inc_not_zero(&nbd->config_refs)) {
1998 		dev_err(nbd_to_dev(nbd),
1999 			"not configured, cannot reconfigure\n");
2000 		nbd_put(nbd);
2001 		return -EINVAL;
2002 	}
2003 
2004 	mutex_lock(&nbd->config_lock);
2005 	config = nbd->config;
2006 	if (!test_bit(NBD_BOUND, &config->runtime_flags) ||
2007 	    !nbd->task_recv) {
2008 		dev_err(nbd_to_dev(nbd),
2009 			"not configured, cannot reconfigure\n");
2010 		ret = -EINVAL;
2011 		goto out;
2012 	}
2013 
2014 	ret = nbd_genl_size_set(info, nbd);
2015 	if (ret)
2016 		goto out;
2017 
2018 	if (info->attrs[NBD_ATTR_TIMEOUT])
2019 		nbd_set_cmd_timeout(nbd,
2020 				    nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]));
2021 	if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
2022 		config->dead_conn_timeout =
2023 			nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
2024 		config->dead_conn_timeout *= HZ;
2025 	}
2026 	if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
2027 		u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
2028 		if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
2029 			if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT,
2030 					      &config->runtime_flags))
2031 				put_dev = true;
2032 		} else {
2033 			if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT,
2034 					       &config->runtime_flags))
2035 				refcount_inc(&nbd->refs);
2036 		}
2037 
2038 		if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
2039 			set_bit(NBD_DISCONNECT_ON_CLOSE,
2040 					&config->runtime_flags);
2041 		} else {
2042 			clear_bit(NBD_DISCONNECT_ON_CLOSE,
2043 					&config->runtime_flags);
2044 		}
2045 	}
2046 
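	/*
	 * Replacement sockets use the same nested NBD_SOCK_ITEM/NBD_SOCK_FD
	 * layout as NBD_CMD_CONNECT, but are fed to nbd_reconnect_socket();
	 * a -ENOSPC result from it is treated as non-fatal here.
	 */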
2047 	if (info->attrs[NBD_ATTR_SOCKETS]) {
2048 		struct nlattr *attr;
2049 		int rem, fd;
2050 
2051 		nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS],
2052 				    rem) {
2053 			struct nlattr *socks[NBD_SOCK_MAX+1];
2054 
2055 			if (nla_type(attr) != NBD_SOCK_ITEM) {
2056 				printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n");
2057 				ret = -EINVAL;
2058 				goto out;
2059 			}
2060 			ret = nla_parse_nested_deprecated(socks, NBD_SOCK_MAX,
2061 							  attr,
2062 							  nbd_sock_policy,
2063 							  info->extack);
2064 			if (ret != 0) {
2065 				printk(KERN_ERR "nbd: error processing sock list\n");
2066 				ret = -EINVAL;
2067 				goto out;
2068 			}
2069 			if (!socks[NBD_SOCK_FD])
2070 				continue;
2071 			fd = (int)nla_get_u32(socks[NBD_SOCK_FD]);
2072 			ret = nbd_reconnect_socket(nbd, fd);
2073 			if (ret) {
2074 				if (ret == -ENOSPC)
2075 					ret = 0;
2076 				goto out;
2077 			}
2078 			dev_info(nbd_to_dev(nbd), "reconnected socket\n");
2079 		}
2080 	}
2081 out:
2082 	mutex_unlock(&nbd->config_lock);
2083 	nbd_config_put(nbd);
2084 	nbd_put(nbd);
2085 	if (put_dev)
2086 		nbd_put(nbd);
2087 	return ret;
2088 }
2089 
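/*
 * Generic netlink command table: each NBD_CMD_* is dispatched to its
 * handler above, with per-command strict validation relaxed via the
 * GENL_DONT_VALIDATE_* flags.
 */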
2090 static const struct genl_ops nbd_connect_genl_ops[] = {
2091 	{
2092 		.cmd	= NBD_CMD_CONNECT,
2093 		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
2094 		.doit	= nbd_genl_connect,
2095 	},
2096 	{
2097 		.cmd	= NBD_CMD_DISCONNECT,
2098 		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
2099 		.doit	= nbd_genl_disconnect,
2100 	},
2101 	{
2102 		.cmd	= NBD_CMD_RECONFIGURE,
2103 		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
2104 		.doit	= nbd_genl_reconfigure,
2105 	},
2106 	{
2107 		.cmd	= NBD_CMD_STATUS,
2108 		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
2109 		.doit	= nbd_genl_status,
2110 	},
2111 };
2112 
2113 static const struct genl_multicast_group nbd_mcast_grps[] = {
2114 	{ .name = NBD_GENL_MCAST_GROUP_NAME, },
2115 };
2116 
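/*
 * The NBD generic netlink family (NBD_GENL_FAMILY_NAME) ties together the
 * command table, the attribute policy and the multicast group used for
 * link-dead notifications.
 */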
2117 static struct genl_family nbd_genl_family __ro_after_init = {
2118 	.hdrsize	= 0,
2119 	.name		= NBD_GENL_FAMILY_NAME,
2120 	.version	= NBD_GENL_VERSION,
2121 	.module		= THIS_MODULE,
2122 	.ops		= nbd_connect_genl_ops,
2123 	.n_ops		= ARRAY_SIZE(nbd_connect_genl_ops),
2124 	.maxattr	= NBD_ATTR_MAX,
2125 	.policy = nbd_attr_policy,
2126 	.mcgrps		= nbd_mcast_grps,
2127 	.n_mcgrps	= ARRAY_SIZE(nbd_mcast_grps),
2128 };
2129 
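/*
 * Append one NBD_DEVICE_ITEM nest (index + connected flag) for @nbd to a
 * status reply.  "connected" is derived from a lockless read of
 * config_refs; see the comment below on why no reference is taken.
 */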
2130 static int populate_nbd_status(struct nbd_device *nbd, struct sk_buff *reply)
2131 {
2132 	struct nlattr *dev_opt;
2133 	u8 connected = 0;
2134 	int ret;
2135 
2136 	/* This is a little racy, but that is acceptable for a status
2137 	 * query.  We don't take a config ref here because we can't take
2138 	 * one in the index == -1 case: we would then have to drop it
2139 	 * while holding nbd_index_mutex, which could deadlock if the
2140 	 * device is configured to remove itself once it is
2141 	 * disconnected.
2142 	 */
2143 	if (refcount_read(&nbd->config_refs))
2144 		connected = 1;
2145 	dev_opt = nla_nest_start_noflag(reply, NBD_DEVICE_ITEM);
2146 	if (!dev_opt)
2147 		return -EMSGSIZE;
2148 	ret = nla_put_u32(reply, NBD_DEVICE_INDEX, nbd->index);
2149 	if (ret)
2150 		return -EMSGSIZE;
2151 	ret = nla_put_u8(reply, NBD_DEVICE_CONNECTED,
2152 			 connected);
2153 	if (ret)
2154 		return -EMSGSIZE;
2155 	nla_nest_end(reply, dev_opt);
2156 	return 0;
2157 }
2158 
2159 static int status_cb(int id, void *ptr, void *data)
2160 {
2161 	struct nbd_device *nbd = ptr;
2162 	return populate_nbd_status(nbd, (struct sk_buff *)data);
2163 }
2164 
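/*
 * NBD_CMD_STATUS handler.  Builds an NBD_ATTR_DEVICE_LIST reply covering
 * either a single device (when NBD_ATTR_INDEX is supplied) or every
 * registered device, all while holding nbd_index_mutex.
 */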
2165 static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info)
2166 {
2167 	struct nlattr *dev_list;
2168 	struct sk_buff *reply;
2169 	void *reply_head;
2170 	size_t msg_size;
2171 	int index = -1;
2172 	int ret = -ENOMEM;
2173 
2174 	if (info->attrs[NBD_ATTR_INDEX])
2175 		index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
2176 
2177 	mutex_lock(&nbd_index_mutex);
2178 
2179 	msg_size = nla_total_size(nla_attr_size(sizeof(u32)) +
2180 				  nla_attr_size(sizeof(u8)));
2181 	msg_size *= (index == -1) ? nbd_total_devices : 1;
2182 
2183 	reply = genlmsg_new(msg_size, GFP_KERNEL);
2184 	if (!reply)
2185 		goto out;
2186 	reply_head = genlmsg_put_reply(reply, info, &nbd_genl_family, 0,
2187 				       NBD_CMD_STATUS);
2188 	if (!reply_head) {
2189 		nlmsg_free(reply);
2190 		goto out;
2191 	}
2192 
2193 	dev_list = nla_nest_start_noflag(reply, NBD_ATTR_DEVICE_LIST);
2194 	if (index == -1) {
2195 		ret = idr_for_each(&nbd_index_idr, &status_cb, reply);
2196 		if (ret) {
2197 			nlmsg_free(reply);
2198 			goto out;
2199 		}
2200 	} else {
2201 		struct nbd_device *nbd;
2202 		nbd = idr_find(&nbd_index_idr, index);
2203 		if (nbd) {
2204 			ret = populate_nbd_status(nbd, reply);
2205 			if (ret) {
2206 				nlmsg_free(reply);
2207 				goto out;
2208 			}
2209 		}
2210 	}
2211 	nla_nest_end(reply, dev_list);
2212 	genlmsg_end(reply, reply_head);
2213 	ret = genlmsg_reply(reply, info);
2214 out:
2215 	mutex_unlock(&nbd_index_mutex);
2216 	return ret;
2217 }
2218 
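/*
 * Unicast reply to a successful NBD_CMD_CONNECT, carrying the index that
 * was actually used; this matters when userspace asked the kernel to pick
 * a free device (index == -1).
 */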
2219 static void nbd_connect_reply(struct genl_info *info, int index)
2220 {
2221 	struct sk_buff *skb;
2222 	void *msg_head;
2223 	int ret;
2224 
2225 	skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
2226 	if (!skb)
2227 		return;
2228 	msg_head = genlmsg_put_reply(skb, info, &nbd_genl_family, 0,
2229 				     NBD_CMD_CONNECT);
2230 	if (!msg_head) {
2231 		nlmsg_free(skb);
2232 		return;
2233 	}
2234 	ret = nla_put_u32(skb, NBD_ATTR_INDEX, index);
2235 	if (ret) {
2236 		nlmsg_free(skb);
2237 		return;
2238 	}
2239 	genlmsg_end(skb, msg_head);
2240 	genlmsg_reply(skb, info);
2241 }
2242 
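/*
 * Multicast an NBD_CMD_LINK_DEAD notification with the device index on the
 * family's multicast group so that listeners can notice a dead connection.
 */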
2243 static void nbd_mcast_index(int index)
2244 {
2245 	struct sk_buff *skb;
2246 	void *msg_head;
2247 	int ret;
2248 
2249 	skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
2250 	if (!skb)
2251 		return;
2252 	msg_head = genlmsg_put(skb, 0, 0, &nbd_genl_family, 0,
2253 				     NBD_CMD_LINK_DEAD);
2254 	if (!msg_head) {
2255 		nlmsg_free(skb);
2256 		return;
2257 	}
2258 	ret = nla_put_u32(skb, NBD_ATTR_INDEX, index);
2259 	if (ret) {
2260 		nlmsg_free(skb);
2261 		return;
2262 	}
2263 	genlmsg_end(skb, msg_head);
2264 	genlmsg_multicast(&nbd_genl_family, skb, 0, 0, GFP_KERNEL);
2265 }
2266 
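/* Deferred work: emit the link-dead multicast and free the argument block. */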
2267 static void nbd_dead_link_work(struct work_struct *work)
2268 {
2269 	struct link_dead_args *args = container_of(work, struct link_dead_args,
2270 						   work);
2271 	nbd_mcast_index(args->index);
2272 	kfree(args);
2273 }
2274 
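/*
 * Module init: validate the max_part/nbds_max parameters, register the NBD
 * block major, the generic netlink family and debugfs, then pre-create
 * nbds_max devices.
 */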
2275 static int __init nbd_init(void)
2276 {
2277 	int i;
2278 
2279 	BUILD_BUG_ON(sizeof(struct nbd_request) != 28);
2280 
2281 	if (max_part < 0) {
2282 		printk(KERN_ERR "nbd: max_part must be >= 0\n");
2283 		return -EINVAL;
2284 	}
2285 
2286 	part_shift = 0;
2287 	if (max_part > 0) {
2288 		part_shift = fls(max_part);
2289 
2290 		/*
2291 		 * Adjust max_part according to part_shift, as it is exported
2292 		 * to user space so that users know the maximum number of
2293 		 * partitions the kernel should be able to manage per device.
2294 		 *
2295 		 * Note that -1 is required because partition 0 is reserved
2296 		 * for the whole disk.
2297 		 */
2298 		max_part = (1UL << part_shift) - 1;
2299 	}
2300 
2301 	if ((1UL << part_shift) > DISK_MAX_PARTS)
2302 		return -EINVAL;
2303 
2304 	if (nbds_max > 1UL << (MINORBITS - part_shift))
2305 		return -EINVAL;
2306 
2307 	if (register_blkdev(NBD_MAJOR, "nbd"))
2308 		return -EIO;
2309 
2310 	if (genl_register_family(&nbd_genl_family)) {
2311 		unregister_blkdev(NBD_MAJOR, "nbd");
2312 		return -EINVAL;
2313 	}
2314 	nbd_dbg_init();
2315 
2316 	mutex_lock(&nbd_index_mutex);
2317 	for (i = 0; i < nbds_max; i++)
2318 		nbd_dev_add(i);
2319 	mutex_unlock(&nbd_index_mutex);
2320 	return 0;
2321 }
2322 
2323 static int nbd_exit_cb(int id, void *ptr, void *data)
2324 {
2325 	struct list_head *list = (struct list_head *)data;
2326 	struct nbd_device *nbd = ptr;
2327 
2328 	list_add_tail(&nbd->list, list);
2329 	return 0;
2330 }
2331 
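/*
 * Module exit: collect every device from the IDR under nbd_index_mutex,
 * drop the initial reference on each (warning if anything else still holds
 * one), then unregister the IDR, the netlink family and the block major.
 */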
2332 static void __exit nbd_cleanup(void)
2333 {
2334 	struct nbd_device *nbd;
2335 	LIST_HEAD(del_list);
2336 
2337 	nbd_dbg_close();
2338 
2339 	mutex_lock(&nbd_index_mutex);
2340 	idr_for_each(&nbd_index_idr, &nbd_exit_cb, &del_list);
2341 	mutex_unlock(&nbd_index_mutex);
2342 
2343 	while (!list_empty(&del_list)) {
2344 		nbd = list_first_entry(&del_list, struct nbd_device, list);
2345 		list_del_init(&nbd->list);
2346 		if (refcount_read(&nbd->refs) != 1)
2347 			printk(KERN_ERR "nbd: possibly leaking a device\n");
2348 		nbd_put(nbd);
2349 	}
2350 
2351 	idr_destroy(&nbd_index_idr);
2352 	genl_unregister_family(&nbd_genl_family);
2353 	unregister_blkdev(NBD_MAJOR, "nbd");
2354 }
2355 
2356 module_init(nbd_init);
2357 module_exit(nbd_cleanup);
2358 
2359 MODULE_DESCRIPTION("Network Block Device");
2360 MODULE_LICENSE("GPL");
2361 
2362 module_param(nbds_max, int, 0444);
2363 MODULE_PARM_DESC(nbds_max, "number of network block devices to initialize (default: 16)");
2364 module_param(max_part, int, 0444);
2365 MODULE_PARM_DESC(max_part, "number of partitions per device (default: 16)");
2366