xref: /openbmc/linux/drivers/block/nbd.c (revision 050e9baa)
1 /*
2  * Network block device - make block devices work over TCP
3  *
4  * Note that you cannot swap over this device yet: it seems to work, but
5  * it deadlocks sometimes - you cannot swap over TCP in general.
6  *
7  * Copyright 1997-2000, 2008 Pavel Machek <pavel@ucw.cz>
8  * Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
9  *
10  * This file is released under GPLv2 or later.
11  *
12  * (part of code stolen from loop.c)
13  */
14 
15 #include <linux/major.h>
16 
17 #include <linux/blkdev.h>
18 #include <linux/module.h>
19 #include <linux/init.h>
20 #include <linux/sched.h>
21 #include <linux/sched/mm.h>
22 #include <linux/fs.h>
23 #include <linux/bio.h>
24 #include <linux/stat.h>
25 #include <linux/errno.h>
26 #include <linux/file.h>
27 #include <linux/ioctl.h>
28 #include <linux/mutex.h>
29 #include <linux/compiler.h>
30 #include <linux/err.h>
31 #include <linux/kernel.h>
32 #include <linux/slab.h>
33 #include <net/sock.h>
34 #include <linux/net.h>
35 #include <linux/kthread.h>
36 #include <linux/types.h>
37 #include <linux/debugfs.h>
38 #include <linux/blk-mq.h>
39 
40 #include <linux/uaccess.h>
41 #include <asm/types.h>
42 
43 #include <linux/nbd.h>
44 #include <linux/nbd-netlink.h>
45 #include <net/genetlink.h>
46 
47 static DEFINE_IDR(nbd_index_idr);
48 static DEFINE_MUTEX(nbd_index_mutex);
49 static int nbd_total_devices = 0;
50 
51 struct nbd_sock {
52 	struct socket *sock;
53 	struct mutex tx_lock;
54 	struct request *pending;
55 	int sent;
56 	bool dead;
57 	int fallback_index;
58 	int cookie;
59 };
60 
61 struct recv_thread_args {
62 	struct work_struct work;
63 	struct nbd_device *nbd;
64 	int index;
65 };
66 
67 struct link_dead_args {
68 	struct work_struct work;
69 	int index;
70 };
71 
72 #define NBD_TIMEDOUT			0
73 #define NBD_DISCONNECT_REQUESTED	1
74 #define NBD_DISCONNECTED		2
75 #define NBD_HAS_PID_FILE		3
76 #define NBD_HAS_CONFIG_REF		4
77 #define NBD_BOUND			5
78 #define NBD_DESTROY_ON_DISCONNECT	6
79 
80 struct nbd_config {
81 	u32 flags;
82 	unsigned long runtime_flags;
83 	u64 dead_conn_timeout;
84 
85 	struct nbd_sock **socks;
86 	int num_connections;
87 	atomic_t live_connections;
88 	wait_queue_head_t conn_wait;
89 
90 	atomic_t recv_threads;
91 	wait_queue_head_t recv_wq;
92 	loff_t blksize;
93 	loff_t bytesize;
94 #if IS_ENABLED(CONFIG_DEBUG_FS)
95 	struct dentry *dbg_dir;
96 #endif
97 };
98 
99 struct nbd_device {
100 	struct blk_mq_tag_set tag_set;
101 
102 	int index;
103 	refcount_t config_refs;
104 	refcount_t refs;
105 	struct nbd_config *config;
106 	struct mutex config_lock;
107 	struct gendisk *disk;
108 
109 	struct list_head list;
110 	struct task_struct *task_recv;
111 	struct task_struct *task_setup;
112 };
113 
114 struct nbd_cmd {
115 	struct nbd_device *nbd;
116 	int index;
117 	int cookie;
118 	struct completion send_complete;
119 	blk_status_t status;
120 };
121 
122 #if IS_ENABLED(CONFIG_DEBUG_FS)
123 static struct dentry *nbd_dbg_dir;
124 #endif
125 
126 #define nbd_name(nbd) ((nbd)->disk->disk_name)
127 
128 #define NBD_MAGIC 0x68797548
129 
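/*
 * Defaults for the number of nbd devices and partitions per device;
 * part_shift (set up at module init) controls how many minor numbers
 * each disk gets.
 */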
130 static unsigned int nbds_max = 16;
131 static int max_part = 16;
132 static struct workqueue_struct *recv_workqueue;
133 static int part_shift;
134 
135 static int nbd_dev_dbg_init(struct nbd_device *nbd);
136 static void nbd_dev_dbg_close(struct nbd_device *nbd);
137 static void nbd_config_put(struct nbd_device *nbd);
138 static void nbd_connect_reply(struct genl_info *info, int index);
139 static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info);
140 static void nbd_dead_link_work(struct work_struct *work);
141 
142 static inline struct device *nbd_to_dev(struct nbd_device *nbd)
143 {
144 	return disk_to_dev(nbd->disk);
145 }
146 
147 static const char *nbdcmd_to_ascii(int cmd)
148 {
149 	switch (cmd) {
150 	case  NBD_CMD_READ: return "read";
151 	case NBD_CMD_WRITE: return "write";
152 	case  NBD_CMD_DISC: return "disconnect";
153 	case NBD_CMD_FLUSH: return "flush";
154 	case  NBD_CMD_TRIM: return "trim/discard";
155 	}
156 	return "invalid";
157 }
158 
159 static ssize_t pid_show(struct device *dev,
160 			struct device_attribute *attr, char *buf)
161 {
162 	struct gendisk *disk = dev_to_disk(dev);
163 	struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
164 
165 	return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv));
166 }
167 
168 static const struct device_attribute pid_attr = {
169 	.attr = { .name = "pid", .mode = 0444},
170 	.show = pid_show,
171 };
172 
173 static void nbd_dev_remove(struct nbd_device *nbd)
174 {
175 	struct gendisk *disk = nbd->disk;
176 	struct request_queue *q;
177 
178 	if (disk) {
179 		q = disk->queue;
180 		del_gendisk(disk);
181 		blk_cleanup_queue(q);
182 		blk_mq_free_tag_set(&nbd->tag_set);
183 		disk->private_data = NULL;
184 		put_disk(disk);
185 	}
186 	kfree(nbd);
187 }
188 
189 static void nbd_put(struct nbd_device *nbd)
190 {
191 	if (refcount_dec_and_mutex_lock(&nbd->refs,
192 					&nbd_index_mutex)) {
193 		idr_remove(&nbd_index_idr, nbd->index);
194 		mutex_unlock(&nbd_index_mutex);
195 		nbd_dev_remove(nbd);
196 	}
197 }
198 
199 static int nbd_disconnected(struct nbd_config *config)
200 {
201 	return test_bit(NBD_DISCONNECTED, &config->runtime_flags) ||
202 		test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags);
203 }
204 
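/*
 * Mark one connection as dead: shut the socket down, drop any partially
 * sent request, and optionally schedule the dead-link work.  If this was
 * the last live connection and userspace asked to disconnect, flag the
 * whole device as disconnected.
 */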
205 static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
206 				int notify)
207 {
208 	if (!nsock->dead && notify && !nbd_disconnected(nbd->config)) {
209 		struct link_dead_args *args;
210 		args = kmalloc(sizeof(struct link_dead_args), GFP_NOIO);
211 		if (args) {
212 			INIT_WORK(&args->work, nbd_dead_link_work);
213 			args->index = nbd->index;
214 			queue_work(system_wq, &args->work);
215 		}
216 	}
217 	if (!nsock->dead) {
218 		kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
219 		if (atomic_dec_return(&nbd->config->live_connections) == 0) {
220 			if (test_and_clear_bit(NBD_DISCONNECT_REQUESTED,
221 					       &nbd->config->runtime_flags)) {
222 				set_bit(NBD_DISCONNECTED,
223 					&nbd->config->runtime_flags);
224 				dev_info(nbd_to_dev(nbd),
225 					"Disconnected due to user request.\n");
226 			}
227 		}
228 	}
229 	nsock->dead = true;
230 	nsock->pending = NULL;
231 	nsock->sent = 0;
232 }
233 
234 static void nbd_size_clear(struct nbd_device *nbd)
235 {
236 	if (nbd->config->bytesize) {
237 		set_capacity(nbd->disk, 0);
238 		kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
239 	}
240 }
241 
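/*
 * Push the configured block size and capacity into the request queue and
 * the backing block_device, then emit a uevent so userspace notices the
 * resize.
 */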
242 static void nbd_size_update(struct nbd_device *nbd)
243 {
244 	struct nbd_config *config = nbd->config;
245 	struct block_device *bdev = bdget_disk(nbd->disk, 0);
246 
247 	if (config->flags & NBD_FLAG_SEND_TRIM) {
248 		nbd->disk->queue->limits.discard_granularity = config->blksize;
249 		nbd->disk->queue->limits.discard_alignment = config->blksize;
250 		blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX);
251 	}
252 	blk_queue_logical_block_size(nbd->disk->queue, config->blksize);
253 	blk_queue_physical_block_size(nbd->disk->queue, config->blksize);
254 	set_capacity(nbd->disk, config->bytesize >> 9);
255 	if (bdev) {
256 		if (bdev->bd_disk)
257 			bd_set_size(bdev, config->bytesize);
258 		else
259 			bdev->bd_invalidated = 1;
260 		bdput(bdev);
261 	}
262 	kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
263 }
264 
265 static void nbd_size_set(struct nbd_device *nbd, loff_t blocksize,
266 			 loff_t nr_blocks)
267 {
268 	struct nbd_config *config = nbd->config;
269 	config->blksize = blocksize;
270 	config->bytesize = blocksize * nr_blocks;
271 	if (nbd->task_recv != NULL)
272 		nbd_size_update(nbd);
273 }
274 
275 static void nbd_complete_rq(struct request *req)
276 {
277 	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
278 
279 	dev_dbg(nbd_to_dev(cmd->nbd), "request %p: %s\n", req,
280 		cmd->status ? "failed" : "done");
281 
282 	blk_mq_end_request(req, cmd->status);
283 }
284 
285 /*
286  * Forcibly shut down the sockets, causing all in-flight receivers to error out
287  */
288 static void sock_shutdown(struct nbd_device *nbd)
289 {
290 	struct nbd_config *config = nbd->config;
291 	int i;
292 
293 	if (config->num_connections == 0)
294 		return;
295 	if (test_and_set_bit(NBD_DISCONNECTED, &config->runtime_flags))
296 		return;
297 
298 	for (i = 0; i < config->num_connections; i++) {
299 		struct nbd_sock *nsock = config->socks[i];
300 		mutex_lock(&nsock->tx_lock);
301 		nbd_mark_nsock_dead(nbd, nsock, 0);
302 		mutex_unlock(&nsock->tx_lock);
303 	}
304 	dev_warn(disk_to_dev(nbd->disk), "shutting down sockets\n");
305 }
306 
307 static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
308 						 bool reserved)
309 {
310 	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
311 	struct nbd_device *nbd = cmd->nbd;
312 	struct nbd_config *config;
313 
314 	if (!refcount_inc_not_zero(&nbd->config_refs)) {
315 		cmd->status = BLK_STS_TIMEOUT;
316 		goto done;
317 	}
318 	config = nbd->config;
319 
320 	if (config->num_connections > 1) {
321 		dev_err_ratelimited(nbd_to_dev(nbd),
322 				    "Connection timed out, retrying (%d/%d alive)\n",
323 				    atomic_read(&config->live_connections),
324 				    config->num_connections);
325 		/*
326 		 * We have other connections, so requeue this I/O; the submit
327 		 * path will put it on a live connection.
328 		 */
329 		if (config->socks && config->num_connections > 1) {
330 			if (cmd->index < config->num_connections) {
331 				struct nbd_sock *nsock =
332 					config->socks[cmd->index];
333 				mutex_lock(&nsock->tx_lock);
334 				/* We can have multiple outstanding requests, so
335 				 * we don't want to mark the nsock dead if we've
336 				 * already reconnected with a new socket; only
337 				 * mark it dead if it is the same socket this
338 				 * request was sent out on.
339 				 */
340 				if (cmd->cookie == nsock->cookie)
341 					nbd_mark_nsock_dead(nbd, nsock, 1);
342 				mutex_unlock(&nsock->tx_lock);
343 			}
344 			blk_mq_requeue_request(req, true);
345 			nbd_config_put(nbd);
346 			return BLK_EH_DONE;
347 		}
348 	} else {
349 		dev_err_ratelimited(nbd_to_dev(nbd),
350 				    "Connection timed out\n");
351 	}
352 	set_bit(NBD_TIMEDOUT, &config->runtime_flags);
353 	cmd->status = BLK_STS_IOERR;
354 	sock_shutdown(nbd);
355 	nbd_config_put(nbd);
356 done:
357 	blk_mq_complete_request(req);
358 	return BLK_EH_DONE;
359 }
360 
361 /*
362  *  Send or receive packet.
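 *
 *  Loops until the whole iov has been transferred or an error occurs.
 *  Runs under PF_MEMALLOC (memalloc_noreclaim_save()) so allocations in
 *  the networking path don't recurse into reclaim and deadlock against
 *  this device's own I/O.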
363  */
364 static int sock_xmit(struct nbd_device *nbd, int index, int send,
365 		     struct iov_iter *iter, int msg_flags, int *sent)
366 {
367 	struct nbd_config *config = nbd->config;
368 	struct socket *sock = config->socks[index]->sock;
369 	int result;
370 	struct msghdr msg;
371 	unsigned int noreclaim_flag;
372 
373 	if (unlikely(!sock)) {
374 		dev_err_ratelimited(disk_to_dev(nbd->disk),
375 			"Attempted %s on closed socket in sock_xmit\n",
376 			(send ? "send" : "recv"));
377 		return -EINVAL;
378 	}
379 
380 	msg.msg_iter = *iter;
381 
382 	noreclaim_flag = memalloc_noreclaim_save();
383 	do {
384 		sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
385 		msg.msg_name = NULL;
386 		msg.msg_namelen = 0;
387 		msg.msg_control = NULL;
388 		msg.msg_controllen = 0;
389 		msg.msg_flags = msg_flags | MSG_NOSIGNAL;
390 
391 		if (send)
392 			result = sock_sendmsg(sock, &msg);
393 		else
394 			result = sock_recvmsg(sock, &msg, msg.msg_flags);
395 
396 		if (result <= 0) {
397 			if (result == 0)
398 				result = -EPIPE; /* short read */
399 			break;
400 		}
401 		if (sent)
402 			*sent += result;
403 	} while (msg_data_left(&msg));
404 
405 	memalloc_noreclaim_restore(noreclaim_flag);
406 
407 	return result;
408 }
409 
410 /*
411  * Different settings for sk->sk_sndtimeo can result in different return values
412  * (-ERESTARTSYS or -EINTR) if there is a signal pending when we enter sendmsg.
413  */
414 static inline int was_interrupted(int result)
415 {
416 	return result == -ERESTARTSYS || result == -EINTR;
417 }
418 
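/*
 * Build the NBD request header for @req and transmit it, followed by the
 * data pages for writes.  Partial progress is saved in nsock->pending and
 * nsock->sent so an interrupted send can be resumed later.
 */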
419 /* always call with the tx_lock held */
420 static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
421 {
422 	struct request *req = blk_mq_rq_from_pdu(cmd);
423 	struct nbd_config *config = nbd->config;
424 	struct nbd_sock *nsock = config->socks[index];
425 	int result;
426 	struct nbd_request request = {.magic = htonl(NBD_REQUEST_MAGIC)};
427 	struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
428 	struct iov_iter from;
429 	unsigned long size = blk_rq_bytes(req);
430 	struct bio *bio;
431 	u32 type;
432 	u32 nbd_cmd_flags = 0;
433 	u32 tag = blk_mq_unique_tag(req);
434 	int sent = nsock->sent, skip = 0;
435 
436 	iov_iter_kvec(&from, WRITE | ITER_KVEC, &iov, 1, sizeof(request));
437 
438 	switch (req_op(req)) {
439 	case REQ_OP_DISCARD:
440 		type = NBD_CMD_TRIM;
441 		break;
442 	case REQ_OP_FLUSH:
443 		type = NBD_CMD_FLUSH;
444 		break;
445 	case REQ_OP_WRITE:
446 		type = NBD_CMD_WRITE;
447 		break;
448 	case REQ_OP_READ:
449 		type = NBD_CMD_READ;
450 		break;
451 	default:
452 		return -EIO;
453 	}
454 
455 	if (rq_data_dir(req) == WRITE &&
456 	    (config->flags & NBD_FLAG_READ_ONLY)) {
457 		dev_err_ratelimited(disk_to_dev(nbd->disk),
458 				    "Write on read-only\n");
459 		return -EIO;
460 	}
461 
462 	if (req->cmd_flags & REQ_FUA)
463 		nbd_cmd_flags |= NBD_CMD_FLAG_FUA;
464 
465 	/* We did a partial send previously; if we at least sent the whole
466 	 * request struct, just go and send the rest of the pages in the
467 	 * request, otherwise resume sending the header where we left off.
468 	 */
469 	if (sent) {
470 		if (sent >= sizeof(request)) {
471 			skip = sent - sizeof(request);
472 			goto send_pages;
473 		}
474 		iov_iter_advance(&from, sent);
475 	}
476 	cmd->index = index;
477 	cmd->cookie = nsock->cookie;
478 	request.type = htonl(type | nbd_cmd_flags);
479 	if (type != NBD_CMD_FLUSH) {
480 		request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
481 		request.len = htonl(size);
482 	}
483 	memcpy(request.handle, &tag, sizeof(tag));
484 
485 	dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n",
486 		req, nbdcmd_to_ascii(type),
487 		(unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
488 	result = sock_xmit(nbd, index, 1, &from,
489 			(type == NBD_CMD_WRITE) ? MSG_MORE : 0, &sent);
490 	if (result <= 0) {
491 		if (was_interrupted(result)) {
492 			/* If we haven't sent anything we can just return BUSY;
493 			 * however, if we have sent something we need to make
494 			 * sure only this request is sent on this socket until
495 			 * we are completely done with it.
496 			 */
497 			if (sent) {
498 				nsock->pending = req;
499 				nsock->sent = sent;
500 			}
501 			return BLK_STS_RESOURCE;
502 		}
503 		dev_err_ratelimited(disk_to_dev(nbd->disk),
504 			"Send control failed (result %d)\n", result);
505 		return -EAGAIN;
506 	}
507 send_pages:
508 	if (type != NBD_CMD_WRITE)
509 		goto out;
510 
511 	bio = req->bio;
512 	while (bio) {
513 		struct bio *next = bio->bi_next;
514 		struct bvec_iter iter;
515 		struct bio_vec bvec;
516 
517 		bio_for_each_segment(bvec, bio, iter) {
518 			bool is_last = !next && bio_iter_last(bvec, iter);
519 			int flags = is_last ? 0 : MSG_MORE;
520 
521 			dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n",
522 				req, bvec.bv_len);
523 			iov_iter_bvec(&from, ITER_BVEC | WRITE,
524 				      &bvec, 1, bvec.bv_len);
525 			if (skip) {
526 				if (skip >= iov_iter_count(&from)) {
527 					skip -= iov_iter_count(&from);
528 					continue;
529 				}
530 				iov_iter_advance(&from, skip);
531 				skip = 0;
532 			}
533 			result = sock_xmit(nbd, index, 1, &from, flags, &sent);
534 			if (result <= 0) {
535 				if (was_interrupted(result)) {
536 					/* We've already sent the header, we
537 					 * have no choice but to set pending and
538 					 * return BUSY.
539 					 */
540 					nsock->pending = req;
541 					nsock->sent = sent;
542 					return BLK_STS_RESOURCE;
543 				}
544 				dev_err(disk_to_dev(nbd->disk),
545 					"Send data failed (result %d)\n",
546 					result);
547 				return -EAGAIN;
548 			}
549 			/*
550 			 * The completion might already have come in,
551 			 * so break for the last one instead of letting
552 			 * the iterator do it. This prevents use-after-free
553 			 * of the bio.
554 			 */
555 			if (is_last)
556 				break;
557 		}
558 		bio = next;
559 	}
560 out:
561 	nsock->pending = NULL;
562 	nsock->sent = 0;
563 	return 0;
564 }
565 
566 /* An ERR_PTR return means something went wrong; inform userspace */
567 static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
568 {
569 	struct nbd_config *config = nbd->config;
570 	int result;
571 	struct nbd_reply reply;
572 	struct nbd_cmd *cmd;
573 	struct request *req = NULL;
574 	u16 hwq;
575 	u32 tag;
576 	struct kvec iov = {.iov_base = &reply, .iov_len = sizeof(reply)};
577 	struct iov_iter to;
578 
579 	reply.magic = 0;
580 	iov_iter_kvec(&to, READ | ITER_KVEC, &iov, 1, sizeof(reply));
581 	result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
582 	if (result <= 0) {
583 		if (!nbd_disconnected(config))
584 			dev_err(disk_to_dev(nbd->disk),
585 				"Receive control failed (result %d)\n", result);
586 		return ERR_PTR(result);
587 	}
588 
589 	if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
590 		dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
591 				(unsigned long)ntohl(reply.magic));
592 		return ERR_PTR(-EPROTO);
593 	}
594 
595 	memcpy(&tag, reply.handle, sizeof(u32));
596 
597 	hwq = blk_mq_unique_tag_to_hwq(tag);
598 	if (hwq < nbd->tag_set.nr_hw_queues)
599 		req = blk_mq_tag_to_rq(nbd->tag_set.tags[hwq],
600 				       blk_mq_unique_tag_to_tag(tag));
601 	if (!req || !blk_mq_request_started(req)) {
602 		dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%d) %p\n",
603 			tag, req);
604 		return ERR_PTR(-ENOENT);
605 	}
606 	cmd = blk_mq_rq_to_pdu(req);
607 	if (ntohl(reply.error)) {
608 		dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
609 			ntohl(reply.error));
610 		cmd->status = BLK_STS_IOERR;
611 		return cmd;
612 	}
613 
614 	dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", req);
615 	if (rq_data_dir(req) != WRITE) {
616 		struct req_iterator iter;
617 		struct bio_vec bvec;
618 
619 		rq_for_each_segment(bvec, req, iter) {
620 			iov_iter_bvec(&to, ITER_BVEC | READ,
621 				      &bvec, 1, bvec.bv_len);
622 			result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
623 			if (result <= 0) {
624 				dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
625 					result);
626 				/*
627 				 * If we've disconnected or we only have 1
628 				 * connection then we need to make sure we
629 				 * complete this request, otherwise error out
630 				 * and let the timeout stuff handle resubmitting
631 				 * this request onto another connection.
632 				 */
633 				if (nbd_disconnected(config) ||
634 				    config->num_connections <= 1) {
635 					cmd->status = BLK_STS_IOERR;
636 					return cmd;
637 				}
638 				return ERR_PTR(-EIO);
639 			}
640 			dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n",
641 				req, bvec.bv_len);
642 		}
643 	} else {
644 		/* See the comment in nbd_queue_rq. */
645 		wait_for_completion(&cmd->send_complete);
646 	}
647 	return cmd;
648 }
649 
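/*
 * Per-connection receive worker: read replies off the socket and complete
 * the matching requests until the connection dies, then mark the socket
 * dead and drop our config reference.
 */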
650 static void recv_work(struct work_struct *work)
651 {
652 	struct recv_thread_args *args = container_of(work,
653 						     struct recv_thread_args,
654 						     work);
655 	struct nbd_device *nbd = args->nbd;
656 	struct nbd_config *config = nbd->config;
657 	struct nbd_cmd *cmd;
658 
659 	while (1) {
660 		cmd = nbd_read_stat(nbd, args->index);
661 		if (IS_ERR(cmd)) {
662 			struct nbd_sock *nsock = config->socks[args->index];
663 
664 			mutex_lock(&nsock->tx_lock);
665 			nbd_mark_nsock_dead(nbd, nsock, 1);
666 			mutex_unlock(&nsock->tx_lock);
667 			break;
668 		}
669 
670 		blk_mq_complete_request(blk_mq_rq_from_pdu(cmd));
671 	}
672 	atomic_dec(&config->recv_threads);
673 	wake_up(&config->recv_wq);
674 	nbd_config_put(nbd);
675 	kfree(args);
676 }
677 
678 static void nbd_clear_req(struct request *req, void *data, bool reserved)
679 {
680 	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
681 
682 	cmd->status = BLK_STS_IOERR;
683 	blk_mq_complete_request(req);
684 }
685 
686 static void nbd_clear_que(struct nbd_device *nbd)
687 {
688 	blk_mq_quiesce_queue(nbd->disk->queue);
689 	blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL);
690 	blk_mq_unquiesce_queue(nbd->disk->queue);
691 	dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
692 }
693 
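/*
 * The socket at @index has died; pick another live connection to retry on.
 * Returns the new index, or -1 if the device is disconnected or no live
 * connection is left.
 */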
694 static int find_fallback(struct nbd_device *nbd, int index)
695 {
696 	struct nbd_config *config = nbd->config;
697 	int new_index = -1;
698 	struct nbd_sock *nsock = config->socks[index];
699 	int fallback = nsock->fallback_index;
700 
701 	if (test_bit(NBD_DISCONNECTED, &config->runtime_flags))
702 		return new_index;
703 
704 	if (config->num_connections <= 1) {
705 		dev_err_ratelimited(disk_to_dev(nbd->disk),
706 				    "Attempted send on invalid socket\n");
707 		return new_index;
708 	}
709 
710 	if (fallback >= 0 && fallback < config->num_connections &&
711 	    !config->socks[fallback]->dead)
712 		return fallback;
713 
714 	if (nsock->fallback_index < 0 ||
715 	    nsock->fallback_index >= config->num_connections ||
716 	    config->socks[nsock->fallback_index]->dead) {
717 		int i;
718 		for (i = 0; i < config->num_connections; i++) {
719 			if (i == index)
720 				continue;
721 			if (!config->socks[i]->dead) {
722 				new_index = i;
723 				break;
724 			}
725 		}
726 		nsock->fallback_index = new_index;
727 		if (new_index < 0) {
728 			dev_err_ratelimited(disk_to_dev(nbd->disk),
729 					    "Dead connection, failed to find a fallback\n");
730 			return new_index;
731 		}
732 	}
733 	new_index = nsock->fallback_index;
734 	return new_index;
735 }
736 
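/*
 * Wait up to dead_conn_timeout for any connection to come back.  Returns
 * nonzero if one did, 0 if the timeout expired, no timeout is configured,
 * or the device has been disconnected.
 */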
737 static int wait_for_reconnect(struct nbd_device *nbd)
738 {
739 	struct nbd_config *config = nbd->config;
740 	if (!config->dead_conn_timeout)
741 		return 0;
742 	if (test_bit(NBD_DISCONNECTED, &config->runtime_flags))
743 		return 0;
744 	return wait_event_timeout(config->conn_wait,
745 				  atomic_read(&config->live_connections) > 0,
746 				  config->dead_conn_timeout) > 0;
747 }
748 
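/*
 * Submission path: pick a live socket (falling back or waiting for a
 * reconnect if the chosen one is dead) and hand the command to
 * nbd_send_cmd(); transient send failures get the request requeued.
 */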
749 static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
750 {
751 	struct request *req = blk_mq_rq_from_pdu(cmd);
752 	struct nbd_device *nbd = cmd->nbd;
753 	struct nbd_config *config;
754 	struct nbd_sock *nsock;
755 	int ret;
756 
757 	if (!refcount_inc_not_zero(&nbd->config_refs)) {
758 		dev_err_ratelimited(disk_to_dev(nbd->disk),
759 				    "Socks array is empty\n");
760 		blk_mq_start_request(req);
761 		return -EINVAL;
762 	}
763 	config = nbd->config;
764 
765 	if (index >= config->num_connections) {
766 		dev_err_ratelimited(disk_to_dev(nbd->disk),
767 				    "Attempted send on invalid socket\n");
768 		nbd_config_put(nbd);
769 		blk_mq_start_request(req);
770 		return -EINVAL;
771 	}
772 	cmd->status = BLK_STS_OK;
773 again:
774 	nsock = config->socks[index];
775 	mutex_lock(&nsock->tx_lock);
776 	if (nsock->dead) {
777 		int old_index = index;
778 		index = find_fallback(nbd, index);
779 		mutex_unlock(&nsock->tx_lock);
780 		if (index < 0) {
781 			if (wait_for_reconnect(nbd)) {
782 				index = old_index;
783 				goto again;
784 			}
785 			/* All the sockets should already be down at this point;
786 			 * we just want to make sure that DISCONNECTED is set so
787 			 * any requests that come in that were queued waiting
788 			 * for the reconnect timer don't trigger the timer again
789 			 * and instead just error out.
790 			 */
791 			sock_shutdown(nbd);
792 			nbd_config_put(nbd);
793 			blk_mq_start_request(req);
794 			return -EIO;
795 		}
796 		goto again;
797 	}
798 
799 	/* Handle the case where we have a pending request that was partially
800 	 * transmitted and _has_ to be serviced first.  We need to call requeue
801 	 * here so that it gets put _after_ the request that is already on the
802 	 * dispatch list.
803 	 */
804 	blk_mq_start_request(req);
805 	if (unlikely(nsock->pending && nsock->pending != req)) {
806 		blk_mq_requeue_request(req, true);
807 		ret = 0;
808 		goto out;
809 	}
810 	/*
811 	 * Some failures are related to the link going down, so anything that
812 	 * returns EAGAIN can be retried on a different socket.
813 	 */
814 	ret = nbd_send_cmd(nbd, cmd, index);
815 	if (ret == -EAGAIN) {
816 		dev_err_ratelimited(disk_to_dev(nbd->disk),
817 				    "Request send failed, requeueing\n");
818 		nbd_mark_nsock_dead(nbd, nsock, 1);
819 		blk_mq_requeue_request(req, true);
820 		ret = 0;
821 	}
822 out:
823 	mutex_unlock(&nsock->tx_lock);
824 	nbd_config_put(nbd);
825 	return ret;
826 }
827 
828 static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
829 			const struct blk_mq_queue_data *bd)
830 {
831 	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
832 	int ret;
833 
834 	/*
835 	 * Since we look at the bios to send the request over the network we
836 	 * need to make sure the completion work doesn't mark this request done
837 	 * before we are done doing our send.  This keeps us from dereferencing
838 	 * freed data if we have particularly fast completions (i.e. we get the
839 	 * completion before we exit sock_xmit on the last bvec) or in the case
840 	 * that the server is misbehaving (or there was an error) before we're
841 	 * done sending everything over the wire.
842 	 */
843 	init_completion(&cmd->send_complete);
844 
845 	/* We can be called directly from the user space process, which means we
846 	 * may have signals pending, so our sendmsg can fail.  In that
847 	 * case we need to return that we are busy; otherwise error out as
848 	 * appropriate.
849 	 */
850 	ret = nbd_handle_cmd(cmd, hctx->queue_num);
851 	if (ret < 0)
852 		ret = BLK_STS_IOERR;
853 	else if (!ret)
854 		ret = BLK_STS_OK;
855 	complete(&cmd->send_complete);
856 
857 	return ret;
858 }
859 
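/*
 * Attach the socket referred to by the file descriptor in @arg as a new
 * connection on this device.  Used by both the NBD_SET_SOCK ioctl and the
 * netlink connect path.
 */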
860 static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
861 			  bool netlink)
862 {
863 	struct nbd_config *config = nbd->config;
864 	struct socket *sock;
865 	struct nbd_sock **socks;
866 	struct nbd_sock *nsock;
867 	int err;
868 
869 	sock = sockfd_lookup(arg, &err);
870 	if (!sock)
871 		return err;
872 
873 	if (!netlink && !nbd->task_setup &&
874 	    !test_bit(NBD_BOUND, &config->runtime_flags))
875 		nbd->task_setup = current;
876 
877 	if (!netlink &&
878 	    (nbd->task_setup != current ||
879 	     test_bit(NBD_BOUND, &config->runtime_flags))) {
880 		dev_err(disk_to_dev(nbd->disk),
881 			"Device being setup by another task");
882 		sockfd_put(sock);
883 		return -EBUSY;
884 	}
885 
886 	socks = krealloc(config->socks, (config->num_connections + 1) *
887 			 sizeof(struct nbd_sock *), GFP_KERNEL);
888 	if (!socks) {
889 		sockfd_put(sock);
890 		return -ENOMEM;
891 	}
892 	nsock = kzalloc(sizeof(struct nbd_sock), GFP_KERNEL);
893 	if (!nsock) {
894 		sockfd_put(sock);
895 		return -ENOMEM;
896 	}
897 
898 	config->socks = socks;
899 
900 	nsock->fallback_index = -1;
901 	nsock->dead = false;
902 	mutex_init(&nsock->tx_lock);
903 	nsock->sock = sock;
904 	nsock->pending = NULL;
905 	nsock->sent = 0;
906 	nsock->cookie = 0;
907 	socks[config->num_connections++] = nsock;
908 	atomic_inc(&config->live_connections);
909 
910 	return 0;
911 }
912 
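/*
 * Replace the first dead connection with the newly supplied socket and
 * kick off a fresh receive worker for it.
 */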
913 static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
914 {
915 	struct nbd_config *config = nbd->config;
916 	struct socket *sock, *old;
917 	struct recv_thread_args *args;
918 	int i;
919 	int err;
920 
921 	sock = sockfd_lookup(arg, &err);
922 	if (!sock)
923 		return err;
924 
925 	args = kzalloc(sizeof(*args), GFP_KERNEL);
926 	if (!args) {
927 		sockfd_put(sock);
928 		return -ENOMEM;
929 	}
930 
931 	for (i = 0; i < config->num_connections; i++) {
932 		struct nbd_sock *nsock = config->socks[i];
933 
934 		if (!nsock->dead)
935 			continue;
936 
937 		mutex_lock(&nsock->tx_lock);
938 		if (!nsock->dead) {
939 			mutex_unlock(&nsock->tx_lock);
940 			continue;
941 		}
942 		sk_set_memalloc(sock->sk);
943 		if (nbd->tag_set.timeout)
944 			sock->sk->sk_sndtimeo = nbd->tag_set.timeout;
945 		atomic_inc(&config->recv_threads);
946 		refcount_inc(&nbd->config_refs);
947 		old = nsock->sock;
948 		nsock->fallback_index = -1;
949 		nsock->sock = sock;
950 		nsock->dead = false;
951 		INIT_WORK(&args->work, recv_work);
952 		args->index = i;
953 		args->nbd = nbd;
954 		nsock->cookie++;
955 		mutex_unlock(&nsock->tx_lock);
956 		sockfd_put(old);
957 
958 		clear_bit(NBD_DISCONNECTED, &config->runtime_flags);
959 
960 		/* We take the tx_lock in an error path in recv_work, so we
961 		 * need to queue_work() outside of the tx_lock.
962 		 */
963 		queue_work(recv_workqueue, &args->work);
964 
965 		atomic_inc(&config->live_connections);
966 		wake_up(&config->conn_wait);
967 		return 0;
968 	}
969 	sockfd_put(sock);
970 	kfree(args);
971 	return -ENOSPC;
972 }
973 
974 static void nbd_bdev_reset(struct block_device *bdev)
975 {
976 	if (bdev->bd_openers > 1)
977 		return;
978 	bd_set_size(bdev, 0);
979 }
980 
981 static void nbd_parse_flags(struct nbd_device *nbd)
982 {
983 	struct nbd_config *config = nbd->config;
984 	if (config->flags & NBD_FLAG_READ_ONLY)
985 		set_disk_ro(nbd->disk, true);
986 	else
987 		set_disk_ro(nbd->disk, false);
988 	if (config->flags & NBD_FLAG_SEND_TRIM)
989 		blk_queue_flag_set(QUEUE_FLAG_DISCARD, nbd->disk->queue);
990 	if (config->flags & NBD_FLAG_SEND_FLUSH) {
991 		if (config->flags & NBD_FLAG_SEND_FUA)
992 			blk_queue_write_cache(nbd->disk->queue, true, true);
993 		else
994 			blk_queue_write_cache(nbd->disk->queue, true, false);
995 	}
996 	else
997 		blk_queue_write_cache(nbd->disk->queue, false, false);
998 }
999 
1000 static void send_disconnects(struct nbd_device *nbd)
1001 {
1002 	struct nbd_config *config = nbd->config;
1003 	struct nbd_request request = {
1004 		.magic = htonl(NBD_REQUEST_MAGIC),
1005 		.type = htonl(NBD_CMD_DISC),
1006 	};
1007 	struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
1008 	struct iov_iter from;
1009 	int i, ret;
1010 
1011 	for (i = 0; i < config->num_connections; i++) {
1012 		struct nbd_sock *nsock = config->socks[i];
1013 
1014 		iov_iter_kvec(&from, WRITE | ITER_KVEC, &iov, 1, sizeof(request));
1015 		mutex_lock(&nsock->tx_lock);
1016 		ret = sock_xmit(nbd, i, 1, &from, 0, NULL);
1017 		if (ret <= 0)
1018 			dev_err(disk_to_dev(nbd->disk),
1019 				"Send disconnect failed %d\n", ret);
1020 		mutex_unlock(&nsock->tx_lock);
1021 	}
1022 }
1023 
1024 static int nbd_disconnect(struct nbd_device *nbd)
1025 {
1026 	struct nbd_config *config = nbd->config;
1027 
1028 	dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
1029 	set_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags);
1030 	send_disconnects(nbd);
1031 	return 0;
1032 }
1033 
1034 static void nbd_clear_sock(struct nbd_device *nbd)
1035 {
1036 	sock_shutdown(nbd);
1037 	nbd_clear_que(nbd);
1038 	nbd->task_setup = NULL;
1039 }
1040 
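/*
 * Drop a reference on the device configuration.  On the final put this
 * tears down debugfs, the pid attribute and all sockets, frees the config
 * and releases the module and device references taken when it was set up.
 */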
1041 static void nbd_config_put(struct nbd_device *nbd)
1042 {
1043 	if (refcount_dec_and_mutex_lock(&nbd->config_refs,
1044 					&nbd->config_lock)) {
1045 		struct nbd_config *config = nbd->config;
1046 		nbd_dev_dbg_close(nbd);
1047 		nbd_size_clear(nbd);
1048 		if (test_and_clear_bit(NBD_HAS_PID_FILE,
1049 				       &config->runtime_flags))
1050 			device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
1051 		nbd->task_recv = NULL;
1052 		nbd_clear_sock(nbd);
1053 		if (config->num_connections) {
1054 			int i;
1055 			for (i = 0; i < config->num_connections; i++) {
1056 				sockfd_put(config->socks[i]->sock);
1057 				kfree(config->socks[i]);
1058 			}
1059 			kfree(config->socks);
1060 		}
1061 		kfree(nbd->config);
1062 		nbd->config = NULL;
1063 
1064 		nbd->tag_set.timeout = 0;
1065 		nbd->disk->queue->limits.discard_granularity = 0;
1066 		nbd->disk->queue->limits.discard_alignment = 0;
1067 		blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX);
1068 		blk_queue_flag_clear(QUEUE_FLAG_DISCARD, nbd->disk->queue);
1069 
1070 		mutex_unlock(&nbd->config_lock);
1071 		nbd_put(nbd);
1072 		module_put(THIS_MODULE);
1073 	}
1074 }
1075 
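/*
 * Bring the device up: spread the hardware queues across the configured
 * connections, expose the receiver pid in sysfs, create the debugfs
 * entries and start one receive worker per connection.
 */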
1076 static int nbd_start_device(struct nbd_device *nbd)
1077 {
1078 	struct nbd_config *config = nbd->config;
1079 	int num_connections = config->num_connections;
1080 	int error = 0, i;
1081 
1082 	if (nbd->task_recv)
1083 		return -EBUSY;
1084 	if (!config->socks)
1085 		return -EINVAL;
1086 	if (num_connections > 1 &&
1087 	    !(config->flags & NBD_FLAG_CAN_MULTI_CONN)) {
1088 		dev_err(disk_to_dev(nbd->disk), "server does not support multiple connections per device.\n");
1089 		return -EINVAL;
1090 	}
1091 
1092 	blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections);
1093 	nbd->task_recv = current;
1094 
1095 	nbd_parse_flags(nbd);
1096 
1097 	error = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
1098 	if (error) {
1099 		dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
1100 		return error;
1101 	}
1102 	set_bit(NBD_HAS_PID_FILE, &config->runtime_flags);
1103 
1104 	nbd_dev_dbg_init(nbd);
1105 	for (i = 0; i < num_connections; i++) {
1106 		struct recv_thread_args *args;
1107 
1108 		args = kzalloc(sizeof(*args), GFP_KERNEL);
1109 		if (!args) {
1110 			sock_shutdown(nbd);
1111 			return -ENOMEM;
1112 		}
1113 		sk_set_memalloc(config->socks[i]->sock->sk);
1114 		if (nbd->tag_set.timeout)
1115 			config->socks[i]->sock->sk->sk_sndtimeo =
1116 				nbd->tag_set.timeout;
1117 		atomic_inc(&config->recv_threads);
1118 		refcount_inc(&nbd->config_refs);
1119 		INIT_WORK(&args->work, recv_work);
1120 		args->nbd = nbd;
1121 		args->index = i;
1122 		queue_work(recv_workqueue, &args->work);
1123 	}
1124 	nbd_size_update(nbd);
1125 	return error;
1126 }
1127 
1128 static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *bdev)
1129 {
1130 	struct nbd_config *config = nbd->config;
1131 	int ret;
1132 
1133 	ret = nbd_start_device(nbd);
1134 	if (ret)
1135 		return ret;
1136 
1137 	if (max_part)
1138 		bdev->bd_invalidated = 1;
1139 	mutex_unlock(&nbd->config_lock);
1140 	ret = wait_event_interruptible(config->recv_wq,
1141 					 atomic_read(&config->recv_threads) == 0);
1142 	if (ret)
1143 		sock_shutdown(nbd);
1144 	mutex_lock(&nbd->config_lock);
1145 	nbd_bdev_reset(bdev);
1146 	/* user requested, ignore socket errors */
1147 	if (test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags))
1148 		ret = 0;
1149 	if (test_bit(NBD_TIMEDOUT, &config->runtime_flags))
1150 		ret = -ETIMEDOUT;
1151 	return ret;
1152 }
1153 
1154 static void nbd_clear_sock_ioctl(struct nbd_device *nbd,
1155 				 struct block_device *bdev)
1156 {
1157 	sock_shutdown(nbd);
1158 	kill_bdev(bdev);
1159 	nbd_bdev_reset(bdev);
1160 	if (test_and_clear_bit(NBD_HAS_CONFIG_REF,
1161 			       &nbd->config->runtime_flags))
1162 		nbd_config_put(nbd);
1163 }
1164 
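/*
 * A classic ioctl-driven setup (e.g. from an nbd-client style tool) goes
 * roughly: NBD_SET_SOCK, then NBD_SET_BLKSIZE / NBD_SET_SIZE (or
 * NBD_SET_SIZE_BLOCKS), optionally NBD_SET_TIMEOUT and NBD_SET_FLAGS, and
 * finally NBD_DO_IT, which blocks until the device is disconnected.
 */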
1165 /* Must be called with config_lock held */
1166 static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
1167 		       unsigned int cmd, unsigned long arg)
1168 {
1169 	struct nbd_config *config = nbd->config;
1170 
1171 	switch (cmd) {
1172 	case NBD_DISCONNECT:
1173 		return nbd_disconnect(nbd);
1174 	case NBD_CLEAR_SOCK:
1175 		nbd_clear_sock_ioctl(nbd, bdev);
1176 		return 0;
1177 	case NBD_SET_SOCK:
1178 		return nbd_add_socket(nbd, arg, false);
1179 	case NBD_SET_BLKSIZE:
1180 		nbd_size_set(nbd, arg,
1181 			     div_s64(config->bytesize, arg));
1182 		return 0;
1183 	case NBD_SET_SIZE:
1184 		nbd_size_set(nbd, config->blksize,
1185 			     div_s64(arg, config->blksize));
1186 		return 0;
1187 	case NBD_SET_SIZE_BLOCKS:
1188 		nbd_size_set(nbd, config->blksize, arg);
1189 		return 0;
1190 	case NBD_SET_TIMEOUT:
1191 		if (arg) {
1192 			nbd->tag_set.timeout = arg * HZ;
1193 			blk_queue_rq_timeout(nbd->disk->queue, arg * HZ);
1194 		}
1195 		return 0;
1196 
1197 	case NBD_SET_FLAGS:
1198 		config->flags = arg;
1199 		return 0;
1200 	case NBD_DO_IT:
1201 		return nbd_start_device_ioctl(nbd, bdev);
1202 	case NBD_CLEAR_QUE:
1203 		/*
1204 		 * This is for compatibility only.  The queue is always cleared
1205 		 * by NBD_DO_IT or NBD_CLEAR_SOCK.
1206 		 */
1207 		return 0;
1208 	case NBD_PRINT_DEBUG:
1209 		/*
1210 		 * For compatibility only, we no longer keep a list of
1211 		 * outstanding requests.
1212 		 */
1213 		return 0;
1214 	}
1215 	return -ENOTTY;
1216 }
1217 
1218 static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
1219 		     unsigned int cmd, unsigned long arg)
1220 {
1221 	struct nbd_device *nbd = bdev->bd_disk->private_data;
1222 	struct nbd_config *config = nbd->config;
1223 	int error = -EINVAL;
1224 
1225 	if (!capable(CAP_SYS_ADMIN))
1226 		return -EPERM;
1227 
1228 	/* The block layer will pass back some non-nbd ioctls in case we have
1229 	 * special handling for them, but we don't, so just return an error.
1230 	 */
1231 	if (_IOC_TYPE(cmd) != 0xab)
1232 		return -EINVAL;
1233 
1234 	mutex_lock(&nbd->config_lock);
1235 
1236 	/* Don't allow ioctl operations on an nbd device that was created with
1237 	 * netlink, unless it's DISCONNECT or CLEAR_SOCK, which are fine.
1238 	 */
1239 	if (!test_bit(NBD_BOUND, &config->runtime_flags) ||
1240 	    (cmd == NBD_DISCONNECT || cmd == NBD_CLEAR_SOCK))
1241 		error = __nbd_ioctl(bdev, nbd, cmd, arg);
1242 	else
1243 		dev_err(nbd_to_dev(nbd), "Cannot use ioctl interface on a netlink controlled device.\n");
1244 	mutex_unlock(&nbd->config_lock);
1245 	return error;
1246 }
1247 
1248 static struct nbd_config *nbd_alloc_config(void)
1249 {
1250 	struct nbd_config *config;
1251 
1252 	config = kzalloc(sizeof(struct nbd_config), GFP_NOFS);
1253 	if (!config)
1254 		return NULL;
1255 	atomic_set(&config->recv_threads, 0);
1256 	init_waitqueue_head(&config->recv_wq);
1257 	init_waitqueue_head(&config->conn_wait);
1258 	config->blksize = 1024;
1259 	atomic_set(&config->live_connections, 0);
1260 	try_module_get(THIS_MODULE);
1261 	return config;
1262 }
1263 
1264 static int nbd_open(struct block_device *bdev, fmode_t mode)
1265 {
1266 	struct nbd_device *nbd;
1267 	int ret = 0;
1268 
1269 	mutex_lock(&nbd_index_mutex);
1270 	nbd = bdev->bd_disk->private_data;
1271 	if (!nbd) {
1272 		ret = -ENXIO;
1273 		goto out;
1274 	}
1275 	if (!refcount_inc_not_zero(&nbd->refs)) {
1276 		ret = -ENXIO;
1277 		goto out;
1278 	}
1279 	if (!refcount_inc_not_zero(&nbd->config_refs)) {
1280 		struct nbd_config *config;
1281 
1282 		mutex_lock(&nbd->config_lock);
1283 		if (refcount_inc_not_zero(&nbd->config_refs)) {
1284 			mutex_unlock(&nbd->config_lock);
1285 			goto out;
1286 		}
1287 		config = nbd->config = nbd_alloc_config();
1288 		if (!config) {
1289 			ret = -ENOMEM;
1290 			mutex_unlock(&nbd->config_lock);
1291 			goto out;
1292 		}
1293 		refcount_set(&nbd->config_refs, 1);
1294 		refcount_inc(&nbd->refs);
1295 		mutex_unlock(&nbd->config_lock);
1296 		bdev->bd_invalidated = 1;
1297 	} else if (nbd_disconnected(nbd->config)) {
1298 		bdev->bd_invalidated = 1;
1299 	}
1300 out:
1301 	mutex_unlock(&nbd_index_mutex);
1302 	return ret;
1303 }
1304 
1305 static void nbd_release(struct gendisk *disk, fmode_t mode)
1306 {
1307 	struct nbd_device *nbd = disk->private_data;
1308 	nbd_config_put(nbd);
1309 	nbd_put(nbd);
1310 }
1311 
1312 static const struct block_device_operations nbd_fops =
1313 {
1314 	.owner =	THIS_MODULE,
1315 	.open =		nbd_open,
1316 	.release =	nbd_release,
1317 	.ioctl =	nbd_ioctl,
1318 	.compat_ioctl =	nbd_ioctl,
1319 };
1320 
1321 #if IS_ENABLED(CONFIG_DEBUG_FS)
1322 
1323 static int nbd_dbg_tasks_show(struct seq_file *s, void *unused)
1324 {
1325 	struct nbd_device *nbd = s->private;
1326 
1327 	if (nbd->task_recv)
1328 		seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv));
1329 
1330 	return 0;
1331 }
1332 
1333 static int nbd_dbg_tasks_open(struct inode *inode, struct file *file)
1334 {
1335 	return single_open(file, nbd_dbg_tasks_show, inode->i_private);
1336 }
1337 
1338 static const struct file_operations nbd_dbg_tasks_ops = {
1339 	.open = nbd_dbg_tasks_open,
1340 	.read = seq_read,
1341 	.llseek = seq_lseek,
1342 	.release = single_release,
1343 };
1344 
1345 static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
1346 {
1347 	struct nbd_device *nbd = s->private;
1348 	u32 flags = nbd->config->flags;
1349 
1350 	seq_printf(s, "Hex: 0x%08x\n\n", flags);
1351 
1352 	seq_puts(s, "Known flags:\n");
1353 
1354 	if (flags & NBD_FLAG_HAS_FLAGS)
1355 		seq_puts(s, "NBD_FLAG_HAS_FLAGS\n");
1356 	if (flags & NBD_FLAG_READ_ONLY)
1357 		seq_puts(s, "NBD_FLAG_READ_ONLY\n");
1358 	if (flags & NBD_FLAG_SEND_FLUSH)
1359 		seq_puts(s, "NBD_FLAG_SEND_FLUSH\n");
1360 	if (flags & NBD_FLAG_SEND_FUA)
1361 		seq_puts(s, "NBD_FLAG_SEND_FUA\n");
1362 	if (flags & NBD_FLAG_SEND_TRIM)
1363 		seq_puts(s, "NBD_FLAG_SEND_TRIM\n");
1364 
1365 	return 0;
1366 }
1367 
1368 static int nbd_dbg_flags_open(struct inode *inode, struct file *file)
1369 {
1370 	return single_open(file, nbd_dbg_flags_show, inode->i_private);
1371 }
1372 
1373 static const struct file_operations nbd_dbg_flags_ops = {
1374 	.open = nbd_dbg_flags_open,
1375 	.read = seq_read,
1376 	.llseek = seq_lseek,
1377 	.release = single_release,
1378 };
1379 
1380 static int nbd_dev_dbg_init(struct nbd_device *nbd)
1381 {
1382 	struct dentry *dir;
1383 	struct nbd_config *config = nbd->config;
1384 
1385 	if (!nbd_dbg_dir)
1386 		return -EIO;
1387 
1388 	dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir);
1389 	if (!dir) {
1390 		dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s'\n",
1391 			nbd_name(nbd));
1392 		return -EIO;
1393 	}
1394 	config->dbg_dir = dir;
1395 
1396 	debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops);
1397 	debugfs_create_u64("size_bytes", 0444, dir, &config->bytesize);
1398 	debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout);
1399 	debugfs_create_u64("blocksize", 0444, dir, &config->blksize);
1400 	debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops);
1401 
1402 	return 0;
1403 }
1404 
1405 static void nbd_dev_dbg_close(struct nbd_device *nbd)
1406 {
1407 	debugfs_remove_recursive(nbd->config->dbg_dir);
1408 }
1409 
1410 static int nbd_dbg_init(void)
1411 {
1412 	struct dentry *dbg_dir;
1413 
1414 	dbg_dir = debugfs_create_dir("nbd", NULL);
1415 	if (!dbg_dir)
1416 		return -EIO;
1417 
1418 	nbd_dbg_dir = dbg_dir;
1419 
1420 	return 0;
1421 }
1422 
1423 static void nbd_dbg_close(void)
1424 {
1425 	debugfs_remove_recursive(nbd_dbg_dir);
1426 }
1427 
1428 #else  /* IS_ENABLED(CONFIG_DEBUG_FS) */
1429 
1430 static int nbd_dev_dbg_init(struct nbd_device *nbd)
1431 {
1432 	return 0;
1433 }
1434 
1435 static void nbd_dev_dbg_close(struct nbd_device *nbd)
1436 {
1437 }
1438 
1439 static int nbd_dbg_init(void)
1440 {
1441 	return 0;
1442 }
1443 
1444 static void nbd_dbg_close(void)
1445 {
1446 }
1447 
1448 #endif
1449 
1450 static int nbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
1451 			    unsigned int hctx_idx, unsigned int numa_node)
1452 {
1453 	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq);
1454 	cmd->nbd = set->driver_data;
1455 	return 0;
1456 }
1457 
1458 static const struct blk_mq_ops nbd_mq_ops = {
1459 	.queue_rq	= nbd_queue_rq,
1460 	.complete	= nbd_complete_rq,
1461 	.init_request	= nbd_init_request,
1462 	.timeout	= nbd_xmit_timeout,
1463 };
1464 
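/*
 * Allocate and register a new nbd device (gendisk, blk-mq tag set and
 * queue).  A negative @index means "pick the first free slot".  Returns
 * the index used on success or a negative errno.
 */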
1465 static int nbd_dev_add(int index)
1466 {
1467 	struct nbd_device *nbd;
1468 	struct gendisk *disk;
1469 	struct request_queue *q;
1470 	int err = -ENOMEM;
1471 
1472 	nbd = kzalloc(sizeof(struct nbd_device), GFP_KERNEL);
1473 	if (!nbd)
1474 		goto out;
1475 
1476 	disk = alloc_disk(1 << part_shift);
1477 	if (!disk)
1478 		goto out_free_nbd;
1479 
1480 	if (index >= 0) {
1481 		err = idr_alloc(&nbd_index_idr, nbd, index, index + 1,
1482 				GFP_KERNEL);
1483 		if (err == -ENOSPC)
1484 			err = -EEXIST;
1485 	} else {
1486 		err = idr_alloc(&nbd_index_idr, nbd, 0, 0, GFP_KERNEL);
1487 		if (err >= 0)
1488 			index = err;
1489 	}
1490 	if (err < 0)
1491 		goto out_free_disk;
1492 
1493 	nbd->index = index;
1494 	nbd->disk = disk;
1495 	nbd->tag_set.ops = &nbd_mq_ops;
1496 	nbd->tag_set.nr_hw_queues = 1;
1497 	nbd->tag_set.queue_depth = 128;
1498 	nbd->tag_set.numa_node = NUMA_NO_NODE;
1499 	nbd->tag_set.cmd_size = sizeof(struct nbd_cmd);
1500 	nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
1501 		BLK_MQ_F_SG_MERGE | BLK_MQ_F_BLOCKING;
1502 	nbd->tag_set.driver_data = nbd;
1503 
1504 	err = blk_mq_alloc_tag_set(&nbd->tag_set);
1505 	if (err)
1506 		goto out_free_idr;
1507 
1508 	q = blk_mq_init_queue(&nbd->tag_set);
1509 	if (IS_ERR(q)) {
1510 		err = PTR_ERR(q);
1511 		goto out_free_tags;
1512 	}
1513 	disk->queue = q;
1514 
1515 	/*
1516 	 * Tell the block layer that we are not a rotational device
1517 	 */
1518 	blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
1519 	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
1520 	disk->queue->limits.discard_granularity = 0;
1521 	disk->queue->limits.discard_alignment = 0;
1522 	blk_queue_max_discard_sectors(disk->queue, 0);
1523 	blk_queue_max_segment_size(disk->queue, UINT_MAX);
1524 	blk_queue_max_segments(disk->queue, USHRT_MAX);
1525 	blk_queue_max_hw_sectors(disk->queue, 65536);
1526 	disk->queue->limits.max_sectors = 256;
1527 
1528 	mutex_init(&nbd->config_lock);
1529 	refcount_set(&nbd->config_refs, 0);
1530 	refcount_set(&nbd->refs, 1);
1531 	INIT_LIST_HEAD(&nbd->list);
1532 	disk->major = NBD_MAJOR;
1533 	disk->first_minor = index << part_shift;
1534 	disk->fops = &nbd_fops;
1535 	disk->private_data = nbd;
1536 	sprintf(disk->disk_name, "nbd%d", index);
1537 	add_disk(disk);
1538 	nbd_total_devices++;
1539 	return index;
1540 
1541 out_free_tags:
1542 	blk_mq_free_tag_set(&nbd->tag_set);
1543 out_free_idr:
1544 	idr_remove(&nbd_index_idr, index);
1545 out_free_disk:
1546 	put_disk(disk);
1547 out_free_nbd:
1548 	kfree(nbd);
1549 out:
1550 	return err;
1551 }
1552 
1553 static int find_free_cb(int id, void *ptr, void *data)
1554 {
1555 	struct nbd_device *nbd = ptr;
1556 	struct nbd_device **found = data;
1557 
1558 	if (!refcount_read(&nbd->config_refs)) {
1559 		*found = nbd;
1560 		return 1;
1561 	}
1562 	return 0;
1563 }
1564 
1565 /* Netlink interface. */
1566 static struct nla_policy nbd_attr_policy[NBD_ATTR_MAX + 1] = {
1567 	[NBD_ATTR_INDEX]		=	{ .type = NLA_U32 },
1568 	[NBD_ATTR_SIZE_BYTES]		=	{ .type = NLA_U64 },
1569 	[NBD_ATTR_BLOCK_SIZE_BYTES]	=	{ .type = NLA_U64 },
1570 	[NBD_ATTR_TIMEOUT]		=	{ .type = NLA_U64 },
1571 	[NBD_ATTR_SERVER_FLAGS]		=	{ .type = NLA_U64 },
1572 	[NBD_ATTR_CLIENT_FLAGS]		=	{ .type = NLA_U64 },
1573 	[NBD_ATTR_SOCKETS]		=	{ .type = NLA_NESTED},
1574 	[NBD_ATTR_DEAD_CONN_TIMEOUT]	=	{ .type = NLA_U64 },
1575 	[NBD_ATTR_DEVICE_LIST]		=	{ .type = NLA_NESTED},
1576 };
1577 
1578 static struct nla_policy nbd_sock_policy[NBD_SOCK_MAX + 1] = {
1579 	[NBD_SOCK_FD]			=	{ .type = NLA_U32 },
1580 };
1581 
1582 /* We don't use this right now since we don't parse the incoming list, but we
1583  * still want it here so userspace knows what to expect.
1584  */
1585 static struct nla_policy __attribute__((unused))
1586 nbd_device_policy[NBD_DEVICE_ATTR_MAX + 1] = {
1587 	[NBD_DEVICE_INDEX]		=	{ .type = NLA_U32 },
1588 	[NBD_DEVICE_CONNECTED]		=	{ .type = NLA_U8 },
1589 };
1590 
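/*
 * NBD_CMD_CONNECT: find (or create) the requested device, allocate a
 * fresh config, apply the size/timeout/flag attributes, add the supplied
 * sockets and start the device.
 */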
1591 static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
1592 {
1593 	struct nbd_device *nbd = NULL;
1594 	struct nbd_config *config;
1595 	int index = -1;
1596 	int ret;
1597 	bool put_dev = false;
1598 
1599 	if (!netlink_capable(skb, CAP_SYS_ADMIN))
1600 		return -EPERM;
1601 
1602 	if (info->attrs[NBD_ATTR_INDEX])
1603 		index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
1604 	if (!info->attrs[NBD_ATTR_SOCKETS]) {
1605 		printk(KERN_ERR "nbd: must specify at least one socket\n");
1606 		return -EINVAL;
1607 	}
1608 	if (!info->attrs[NBD_ATTR_SIZE_BYTES]) {
1609 		printk(KERN_ERR "nbd: must specify a size in bytes for the device\n");
1610 		return -EINVAL;
1611 	}
1612 again:
1613 	mutex_lock(&nbd_index_mutex);
1614 	if (index == -1) {
1615 		ret = idr_for_each(&nbd_index_idr, &find_free_cb, &nbd);
1616 		if (ret == 0) {
1617 			int new_index;
1618 			new_index = nbd_dev_add(-1);
1619 			if (new_index < 0) {
1620 				mutex_unlock(&nbd_index_mutex);
1621 				printk(KERN_ERR "nbd: failed to add new device\n");
1622 				return new_index;
1623 			}
1624 			nbd = idr_find(&nbd_index_idr, new_index);
1625 		}
1626 	} else {
1627 		nbd = idr_find(&nbd_index_idr, index);
1628 		if (!nbd) {
1629 			ret = nbd_dev_add(index);
1630 			if (ret < 0) {
1631 				mutex_unlock(&nbd_index_mutex);
1632 				printk(KERN_ERR "nbd: failed to add new device\n");
1633 				return ret;
1634 			}
1635 			nbd = idr_find(&nbd_index_idr, index);
1636 		}
1637 	}
1638 	if (!nbd) {
1639 		printk(KERN_ERR "nbd: couldn't find device at index %d\n",
1640 		       index);
1641 		mutex_unlock(&nbd_index_mutex);
1642 		return -EINVAL;
1643 	}
1644 	if (!refcount_inc_not_zero(&nbd->refs)) {
1645 		mutex_unlock(&nbd_index_mutex);
1646 		if (index == -1)
1647 			goto again;
1648 		printk(KERN_ERR "nbd: device at index %d is going down\n",
1649 		       index);
1650 		return -EINVAL;
1651 	}
1652 	mutex_unlock(&nbd_index_mutex);
1653 
1654 	mutex_lock(&nbd->config_lock);
1655 	if (refcount_read(&nbd->config_refs)) {
1656 		mutex_unlock(&nbd->config_lock);
1657 		nbd_put(nbd);
1658 		if (index == -1)
1659 			goto again;
1660 		printk(KERN_ERR "nbd: nbd%d already in use\n", index);
1661 		return -EBUSY;
1662 	}
1663 	if (WARN_ON(nbd->config)) {
1664 		mutex_unlock(&nbd->config_lock);
1665 		nbd_put(nbd);
1666 		return -EINVAL;
1667 	}
1668 	config = nbd->config = nbd_alloc_config();
1669 	if (!nbd->config) {
1670 		mutex_unlock(&nbd->config_lock);
1671 		nbd_put(nbd);
1672 		printk(KERN_ERR "nbd: couldn't allocate config\n");
1673 		return -ENOMEM;
1674 	}
1675 	refcount_set(&nbd->config_refs, 1);
1676 	set_bit(NBD_BOUND, &config->runtime_flags);
1677 
1678 	if (info->attrs[NBD_ATTR_SIZE_BYTES]) {
1679 		u64 bytes = nla_get_u64(info->attrs[NBD_ATTR_SIZE_BYTES]);
1680 		nbd_size_set(nbd, config->blksize,
1681 			     div64_u64(bytes, config->blksize));
1682 	}
1683 	if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]) {
1684 		u64 bsize =
1685 			nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]);
1686 		nbd_size_set(nbd, bsize, div64_u64(config->bytesize, bsize));
1687 	}
1688 	if (info->attrs[NBD_ATTR_TIMEOUT]) {
1689 		u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]);
1690 		nbd->tag_set.timeout = timeout * HZ;
1691 		blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
1692 	}
1693 	if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
1694 		config->dead_conn_timeout =
1695 			nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
1696 		config->dead_conn_timeout *= HZ;
1697 	}
1698 	if (info->attrs[NBD_ATTR_SERVER_FLAGS])
1699 		config->flags =
1700 			nla_get_u64(info->attrs[NBD_ATTR_SERVER_FLAGS]);
1701 	if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
1702 		u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
1703 		if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
1704 			set_bit(NBD_DESTROY_ON_DISCONNECT,
1705 				&config->runtime_flags);
1706 			put_dev = true;
1707 		}
1708 	}
1709 
1710 	if (info->attrs[NBD_ATTR_SOCKETS]) {
1711 		struct nlattr *attr;
1712 		int rem, fd;
1713 
1714 		nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS],
1715 				    rem) {
1716 			struct nlattr *socks[NBD_SOCK_MAX+1];
1717 
1718 			if (nla_type(attr) != NBD_SOCK_ITEM) {
1719 				printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n");
1720 				ret = -EINVAL;
1721 				goto out;
1722 			}
1723 			ret = nla_parse_nested(socks, NBD_SOCK_MAX, attr,
1724 					       nbd_sock_policy, info->extack);
1725 			if (ret != 0) {
1726 				printk(KERN_ERR "nbd: error processing sock list\n");
1727 				ret = -EINVAL;
1728 				goto out;
1729 			}
1730 			if (!socks[NBD_SOCK_FD])
1731 				continue;
1732 			fd = (int)nla_get_u32(socks[NBD_SOCK_FD]);
1733 			ret = nbd_add_socket(nbd, fd, true);
1734 			if (ret)
1735 				goto out;
1736 		}
1737 	}
1738 	ret = nbd_start_device(nbd);
1739 out:
1740 	mutex_unlock(&nbd->config_lock);
1741 	if (!ret) {
1742 		set_bit(NBD_HAS_CONFIG_REF, &config->runtime_flags);
1743 		refcount_inc(&nbd->config_refs);
1744 		nbd_connect_reply(info, nbd->index);
1745 	}
1746 	nbd_config_put(nbd);
1747 	if (put_dev)
1748 		nbd_put(nbd);
1749 	return ret;
1750 }
1751 
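/*
 * NBD_CMD_DISCONNECT: ask the server to disconnect and tear down all
 * sockets for the given device index.
 */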
1752 static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info)
1753 {
1754 	struct nbd_device *nbd;
1755 	int index;
1756 
1757 	if (!netlink_capable(skb, CAP_SYS_ADMIN))
1758 		return -EPERM;
1759 
1760 	if (!info->attrs[NBD_ATTR_INDEX]) {
1761 		printk(KERN_ERR "nbd: must specify an index to disconnect\n");
1762 		return -EINVAL;
1763 	}
1764 	index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
1765 	mutex_lock(&nbd_index_mutex);
1766 	nbd = idr_find(&nbd_index_idr, index);
1767 	if (!nbd) {
1768 		mutex_unlock(&nbd_index_mutex);
1769 		printk(KERN_ERR "nbd: couldn't find device at index %d\n",
1770 		       index);
1771 		return -EINVAL;
1772 	}
1773 	if (!refcount_inc_not_zero(&nbd->refs)) {
1774 		mutex_unlock(&nbd_index_mutex);
1775 		printk(KERN_ERR "nbd: device at index %d is going down\n",
1776 		       index);
1777 		return -EINVAL;
1778 	}
1779 	mutex_unlock(&nbd_index_mutex);
1780 	if (!refcount_inc_not_zero(&nbd->config_refs)) {
1781 		nbd_put(nbd);
1782 		return 0;
1783 	}
1784 	mutex_lock(&nbd->config_lock);
1785 	nbd_disconnect(nbd);
1786 	nbd_clear_sock(nbd);
1787 	mutex_unlock(&nbd->config_lock);
1788 	if (test_and_clear_bit(NBD_HAS_CONFIG_REF,
1789 			       &nbd->config->runtime_flags))
1790 		nbd_config_put(nbd);
1791 	nbd_config_put(nbd);
1792 	nbd_put(nbd);
1793 	return 0;
1794 }
1795 
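/*
 * NBD_CMD_RECONFIGURE: adjust timeouts and flags and plug replacement
 * sockets into dead connections on an already running device.
 */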
1796 static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
1797 {
1798 	struct nbd_device *nbd = NULL;
1799 	struct nbd_config *config;
1800 	int index;
1801 	int ret = -EINVAL;
1802 	bool put_dev = false;
1803 
1804 	if (!netlink_capable(skb, CAP_SYS_ADMIN))
1805 		return -EPERM;
1806 
1807 	if (!info->attrs[NBD_ATTR_INDEX]) {
1808 		printk(KERN_ERR "nbd: must specify a device to reconfigure\n");
1809 		return -EINVAL;
1810 	}
1811 	index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
1812 	mutex_lock(&nbd_index_mutex);
1813 	nbd = idr_find(&nbd_index_idr, index);
1814 	if (!nbd) {
1815 		mutex_unlock(&nbd_index_mutex);
1816 		printk(KERN_ERR "nbd: couldn't find a device at index %d\n",
1817 		       index);
1818 		return -EINVAL;
1819 	}
1820 	if (!refcount_inc_not_zero(&nbd->refs)) {
1821 		mutex_unlock(&nbd_index_mutex);
1822 		printk(KERN_ERR "nbd: device at index %d is going down\n",
1823 		       index);
1824 		return -EINVAL;
1825 	}
1826 	mutex_unlock(&nbd_index_mutex);
1827 
1828 	if (!refcount_inc_not_zero(&nbd->config_refs)) {
1829 		dev_err(nbd_to_dev(nbd),
1830 			"not configured, cannot reconfigure\n");
1831 		nbd_put(nbd);
1832 		return -EINVAL;
1833 	}
1834 
1835 	mutex_lock(&nbd->config_lock);
1836 	config = nbd->config;
1837 	if (!test_bit(NBD_BOUND, &config->runtime_flags) ||
1838 	    !nbd->task_recv) {
1839 		dev_err(nbd_to_dev(nbd),
1840 			"not configured, cannot reconfigure\n");
1841 		goto out;
1842 	}
1843 
1844 	if (info->attrs[NBD_ATTR_TIMEOUT]) {
1845 		u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]);
1846 		nbd->tag_set.timeout = timeout * HZ;
1847 		blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
1848 	}
1849 	if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
1850 		config->dead_conn_timeout =
1851 			nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
1852 		config->dead_conn_timeout *= HZ;
1853 	}
1854 	if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
1855 		u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
1856 		if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
1857 			if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT,
1858 					      &config->runtime_flags))
1859 				put_dev = true;
1860 		} else {
1861 			if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT,
1862 					       &config->runtime_flags))
1863 				refcount_inc(&nbd->refs);
1864 		}
1865 	}
1866 
1867 	if (info->attrs[NBD_ATTR_SOCKETS]) {
1868 		struct nlattr *attr;
1869 		int rem, fd;
1870 
1871 		nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS],
1872 				    rem) {
1873 			struct nlattr *socks[NBD_SOCK_MAX+1];
1874 
1875 			if (nla_type(attr) != NBD_SOCK_ITEM) {
1876 				printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n");
1877 				ret = -EINVAL;
1878 				goto out;
1879 			}
1880 			ret = nla_parse_nested(socks, NBD_SOCK_MAX, attr,
1881 					       nbd_sock_policy, info->extack);
1882 			if (ret != 0) {
1883 				printk(KERN_ERR "nbd: error processing sock list\n");
1884 				ret = -EINVAL;
1885 				goto out;
1886 			}
1887 			if (!socks[NBD_SOCK_FD])
1888 				continue;
1889 			fd = (int)nla_get_u32(socks[NBD_SOCK_FD]);
1890 			ret = nbd_reconnect_socket(nbd, fd);
1891 			if (ret) {
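				/*
				 * -ENOSPC here means there was no dead
				 * connection slot to replace; don't treat
				 * that as a fatal error.
				 */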
1892 				if (ret == -ENOSPC)
1893 					ret = 0;
1894 				goto out;
1895 			}
1896 			dev_info(nbd_to_dev(nbd), "reconnected socket\n");
1897 		}
1898 	}
1899 out:
1900 	mutex_unlock(&nbd->config_lock);
1901 	nbd_config_put(nbd);
1902 	nbd_put(nbd);
1903 	if (put_dev)
1904 		nbd_put(nbd);
1905 	return ret;
1906 }
1907 
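/* Generic netlink commands implemented by this driver. */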
1908 static const struct genl_ops nbd_connect_genl_ops[] = {
1909 	{
1910 		.cmd	= NBD_CMD_CONNECT,
1911 		.policy	= nbd_attr_policy,
1912 		.doit	= nbd_genl_connect,
1913 	},
1914 	{
1915 		.cmd	= NBD_CMD_DISCONNECT,
1916 		.policy	= nbd_attr_policy,
1917 		.doit	= nbd_genl_disconnect,
1918 	},
1919 	{
1920 		.cmd	= NBD_CMD_RECONFIGURE,
1921 		.policy	= nbd_attr_policy,
1922 		.doit	= nbd_genl_reconfigure,
1923 	},
1924 	{
1925 		.cmd	= NBD_CMD_STATUS,
1926 		.policy	= nbd_attr_policy,
1927 		.doit	= nbd_genl_status,
1928 	},
1929 };
1930 
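/* Multicast group used for NBD_CMD_LINK_DEAD notifications. */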
1931 static const struct genl_multicast_group nbd_mcast_grps[] = {
1932 	{ .name = NBD_GENL_MCAST_GROUP_NAME, },
1933 };
1934 
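/* The nbd generic netlink family; registered in nbd_init(). */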
1935 static struct genl_family nbd_genl_family __ro_after_init = {
1936 	.hdrsize	= 0,
1937 	.name		= NBD_GENL_FAMILY_NAME,
1938 	.version	= NBD_GENL_VERSION,
1939 	.module		= THIS_MODULE,
1940 	.ops		= nbd_connect_genl_ops,
1941 	.n_ops		= ARRAY_SIZE(nbd_connect_genl_ops),
1942 	.maxattr	= NBD_ATTR_MAX,
1943 	.mcgrps		= nbd_mcast_grps,
1944 	.n_mcgrps	= ARRAY_SIZE(nbd_mcast_grps),
1945 };
1946 
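/*
 * Append one NBD_DEVICE_ITEM nest (device index + connected flag) for
 * @nbd to the status @reply being built.
 */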
1947 static int populate_nbd_status(struct nbd_device *nbd, struct sk_buff *reply)
1948 {
1949 	struct nlattr *dev_opt;
1950 	u8 connected = 0;
1951 	int ret;
1952 
1953 	/* This is a little racy, but for status it's ok.  The
1954 	 * reason we don't take a ref here is that we can't take
1955 	 * one in the index == -1 case, as we would need to put it
1956 	 * while holding nbd_index_mutex, which could deadlock if
1957 	 * we are configured to remove ourselves once we're
1958 	 * disconnected.
1959 	 */
1960 	if (refcount_read(&nbd->config_refs))
1961 		connected = 1;
1962 	dev_opt = nla_nest_start(reply, NBD_DEVICE_ITEM);
1963 	if (!dev_opt)
1964 		return -EMSGSIZE;
1965 	ret = nla_put_u32(reply, NBD_DEVICE_INDEX, nbd->index);
1966 	if (ret)
1967 		return -EMSGSIZE;
1968 	ret = nla_put_u8(reply, NBD_DEVICE_CONNECTED,
1969 			 connected);
1970 	if (ret)
1971 		return -EMSGSIZE;
1972 	nla_nest_end(reply, dev_opt);
1973 	return 0;
1974 }
1975 
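/* idr_for_each() callback: add the status of each device to the reply. */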
1976 static int status_cb(int id, void *ptr, void *data)
1977 {
1978 	struct nbd_device *nbd = ptr;
1979 	return populate_nbd_status(nbd, (struct sk_buff *)data);
1980 }
1981 
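/*
 * NBD_CMD_STATUS handler: reply with a device list describing either the
 * device at NBD_ATTR_INDEX or, when no index is supplied, all devices.
 */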
1982 static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info)
1983 {
1984 	struct nlattr *dev_list;
1985 	struct sk_buff *reply;
1986 	void *reply_head;
1987 	size_t msg_size;
1988 	int index = -1;
1989 	int ret = -ENOMEM;
1990 
1991 	if (info->attrs[NBD_ATTR_INDEX])
1992 		index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
1993 
1994 	mutex_lock(&nbd_index_mutex);
1995 
1996 	msg_size = nla_total_size(nla_attr_size(sizeof(u32)) +
1997 				  nla_attr_size(sizeof(u8)));
1998 	msg_size *= (index == -1) ? nbd_total_devices : 1;
1999 
2000 	reply = genlmsg_new(msg_size, GFP_KERNEL);
2001 	if (!reply)
2002 		goto out;
2003 	reply_head = genlmsg_put_reply(reply, info, &nbd_genl_family, 0,
2004 				       NBD_CMD_STATUS);
2005 	if (!reply_head) {
2006 		nlmsg_free(reply);
2007 		goto out;
2008 	}
2009 
2010 	dev_list = nla_nest_start(reply, NBD_ATTR_DEVICE_LIST);
2011 	if (index == -1) {
2012 		ret = idr_for_each(&nbd_index_idr, &status_cb, reply);
2013 		if (ret) {
2014 			nlmsg_free(reply);
2015 			goto out;
2016 		}
2017 	} else {
2018 		struct nbd_device *nbd;
2019 		nbd = idr_find(&nbd_index_idr, index);
2020 		if (nbd) {
2021 			ret = populate_nbd_status(nbd, reply);
2022 			if (ret) {
2023 				nlmsg_free(reply);
2024 				goto out;
2025 			}
2026 		}
2027 	}
2028 	nla_nest_end(reply, dev_list);
2029 	genlmsg_end(reply, reply_head);
2030 	genlmsg_reply(reply, info);
2031 	ret = 0;
2032 out:
2033 	mutex_unlock(&nbd_index_mutex);
2034 	return ret;
2035 }
2036 
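/* Send the configured device's index back to the NBD_CMD_CONNECT caller. */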
2037 static void nbd_connect_reply(struct genl_info *info, int index)
2038 {
2039 	struct sk_buff *skb;
2040 	void *msg_head;
2041 	int ret;
2042 
2043 	skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
2044 	if (!skb)
2045 		return;
2046 	msg_head = genlmsg_put_reply(skb, info, &nbd_genl_family, 0,
2047 				     NBD_CMD_CONNECT);
2048 	if (!msg_head) {
2049 		nlmsg_free(skb);
2050 		return;
2051 	}
2052 	ret = nla_put_u32(skb, NBD_ATTR_INDEX, index);
2053 	if (ret) {
2054 		nlmsg_free(skb);
2055 		return;
2056 	}
2057 	genlmsg_end(skb, msg_head);
2058 	genlmsg_reply(skb, info);
2059 }
2060 
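/*
 * Broadcast an NBD_CMD_LINK_DEAD notification carrying the index of the
 * device whose connection died.
 */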
2061 static void nbd_mcast_index(int index)
2062 {
2063 	struct sk_buff *skb;
2064 	void *msg_head;
2065 	int ret;
2066 
2067 	skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
2068 	if (!skb)
2069 		return;
2070 	msg_head = genlmsg_put(skb, 0, 0, &nbd_genl_family, 0,
2071 				     NBD_CMD_LINK_DEAD);
2072 	if (!msg_head) {
2073 		nlmsg_free(skb);
2074 		return;
2075 	}
2076 	ret = nla_put_u32(skb, NBD_ATTR_INDEX, index);
2077 	if (ret) {
2078 		nlmsg_free(skb);
2079 		return;
2080 	}
2081 	genlmsg_end(skb, msg_head);
2082 	genlmsg_multicast(&nbd_genl_family, skb, 0, 0, GFP_KERNEL);
2083 }
2084 
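/* Work item: send the dead-link notification and free the arguments. */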
2085 static void nbd_dead_link_work(struct work_struct *work)
2086 {
2087 	struct link_dead_args *args = container_of(work, struct link_dead_args,
2088 						   work);
2089 	nbd_mcast_index(args->index);
2090 	kfree(args);
2091 }
2092 
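/*
 * Module init: validate max_part and nbds_max, create the receive
 * workqueue, register the block major and the generic netlink family,
 * then create nbds_max devices up front.
 */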
2093 static int __init nbd_init(void)
2094 {
2095 	int i;
2096 
2097 	BUILD_BUG_ON(sizeof(struct nbd_request) != 28);
2098 
2099 	if (max_part < 0) {
2100 		printk(KERN_ERR "nbd: max_part must be >= 0\n");
2101 		return -EINVAL;
2102 	}
2103 
2104 	part_shift = 0;
2105 	if (max_part > 0) {
2106 		part_shift = fls(max_part);
2107 
2108 		/*
2109 		 * Adjust max_part according to part_shift as it is exported
2110 		 * to user space, so that users can know the maximum number
2111 		 * of partitions the kernel is able to manage per device.
2112 		 *
2113 		 * Note that -1 is required because partition 0 is reserved
2114 		 * for the whole disk.
2115 		 */
2116 		max_part = (1UL << part_shift) - 1;
2117 	}
2118 
2119 	if ((1UL << part_shift) > DISK_MAX_PARTS)
2120 		return -EINVAL;
2121 
2122 	if (nbds_max > 1UL << (MINORBITS - part_shift))
2123 		return -EINVAL;
2124 	recv_workqueue = alloc_workqueue("knbd-recv",
2125 					 WQ_MEM_RECLAIM | WQ_HIGHPRI |
2126 					 WQ_UNBOUND, 0);
2127 	if (!recv_workqueue)
2128 		return -ENOMEM;
2129 
2130 	if (register_blkdev(NBD_MAJOR, "nbd")) {
2131 		destroy_workqueue(recv_workqueue);
2132 		return -EIO;
2133 	}
2134 
2135 	if (genl_register_family(&nbd_genl_family)) {
2136 		unregister_blkdev(NBD_MAJOR, "nbd");
2137 		destroy_workqueue(recv_workqueue);
2138 		return -EINVAL;
2139 	}
2140 	nbd_dbg_init();
2141 
2142 	mutex_lock(&nbd_index_mutex);
2143 	for (i = 0; i < nbds_max; i++)
2144 		nbd_dev_add(i);
2145 	mutex_unlock(&nbd_index_mutex);
2146 	return 0;
2147 }
2148 
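/* idr_for_each() callback: collect every device on a list for teardown. */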
2149 static int nbd_exit_cb(int id, void *ptr, void *data)
2150 {
2151 	struct list_head *list = (struct list_head *)data;
2152 	struct nbd_device *nbd = ptr;
2153 
2154 	list_add_tail(&nbd->list, list);
2155 	return 0;
2156 }
2157 
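/*
 * Module exit: drop the final reference on each device (warning if
 * something else still holds one), then tear down the idr, the netlink
 * family, the receive workqueue and the block major.
 */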
2158 static void __exit nbd_cleanup(void)
2159 {
2160 	struct nbd_device *nbd;
2161 	LIST_HEAD(del_list);
2162 
2163 	nbd_dbg_close();
2164 
2165 	mutex_lock(&nbd_index_mutex);
2166 	idr_for_each(&nbd_index_idr, &nbd_exit_cb, &del_list);
2167 	mutex_unlock(&nbd_index_mutex);
2168 
2169 	while (!list_empty(&del_list)) {
2170 		nbd = list_first_entry(&del_list, struct nbd_device, list);
2171 		list_del_init(&nbd->list);
2172 		if (refcount_read(&nbd->refs) != 1)
2173 			printk(KERN_ERR "nbd: possibly leaking a device\n");
2174 		nbd_put(nbd);
2175 	}
2176 
2177 	idr_destroy(&nbd_index_idr);
2178 	genl_unregister_family(&nbd_genl_family);
2179 	destroy_workqueue(recv_workqueue);
2180 	unregister_blkdev(NBD_MAJOR, "nbd");
2181 }
2182 
2183 module_init(nbd_init);
2184 module_exit(nbd_cleanup);
2185 
2186 MODULE_DESCRIPTION("Network Block Device");
2187 MODULE_LICENSE("GPL");
2188 
2189 module_param(nbds_max, int, 0444);
2190 MODULE_PARM_DESC(nbds_max, "number of network block devices to initialize (default: 16)");
2191 module_param(max_part, int, 0444);
2192 MODULE_PARM_DESC(max_part, "number of partitions per device (default: 16)");
2193