xref: /openbmc/linux/drivers/block/nbd.c (revision e983940270f10fe8551baf0098be76ea478294a3)
1 /*
2  * Network block device - make block devices work over TCP
3  *
4  * Note that you can not swap over this thing, yet. Seems to work but
5  * deadlocks sometimes - you can not swap over TCP in general.
6  *
7  * Copyright 1997-2000, 2008 Pavel Machek <pavel@ucw.cz>
8  * Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
9  *
10  * This file is released under GPLv2 or later.
11  *
12  * (part of code stolen from loop.c)
13  */
14 
15 #include <linux/major.h>
16 
17 #include <linux/blkdev.h>
18 #include <linux/module.h>
19 #include <linux/init.h>
20 #include <linux/sched.h>
21 #include <linux/fs.h>
22 #include <linux/bio.h>
23 #include <linux/stat.h>
24 #include <linux/errno.h>
25 #include <linux/file.h>
26 #include <linux/ioctl.h>
27 #include <linux/mutex.h>
28 #include <linux/compiler.h>
29 #include <linux/err.h>
30 #include <linux/kernel.h>
31 #include <linux/slab.h>
32 #include <net/sock.h>
33 #include <linux/net.h>
34 #include <linux/kthread.h>
35 #include <linux/types.h>
36 #include <linux/debugfs.h>
37 #include <linux/blk-mq.h>
38 
39 #include <asm/uaccess.h>
40 #include <asm/types.h>
41 
42 #include <linux/nbd.h>
43 
/*
 * Bit numbers within nbd_device.runtime_flags (used with set_bit()/test_bit()):
 * NBD_TIMEDOUT is set by the request timeout handler, and
 * NBD_DISCONNECT_REQUESTED by the NBD_DISCONNECT ioctl.
 */
#define NBD_TIMEDOUT			0
#define NBD_DISCONNECT_REQUESTED	1
46 
/*
 * State of a single network block device; one instance backs each "nbd%d"
 * gendisk created at module init.
 */
struct nbd_device {
	u32 flags;			/* NBD_FLAG_* bits stored by NBD_SET_FLAGS */
	unsigned long runtime_flags;	/* NBD_TIMEDOUT / NBD_DISCONNECT_REQUESTED bits */
	struct socket * sock;	/* If == NULL, device is not ready, yet	*/
	int magic;			/* NBD_MAGIC once initialised; checked with BUG_ON() */

	struct blk_mq_tag_set tag_set;	/* blk-mq tag set backing disk->queue */

	struct mutex tx_lock;		/* held around ioctl handling and command transmission */
	struct gendisk *disk;
	int blksize;			/* block size in bytes (reset to 1024) */
	loff_t bytesize;		/* device size in bytes */

	/* protects initialization and shutdown of the socket */
	spinlock_t sock_lock;
	struct task_struct *task_recv;	/* task currently running NBD_DO_IT, or NULL */
	struct task_struct *task_send;	/* task currently transmitting a request, or NULL */

#if IS_ENABLED(CONFIG_DEBUG_FS)
	struct dentry *dbg_dir;		/* per-device debugfs directory */
#endif
};
69 
/*
 * Per-request driver data (the blk-mq PDU; tag_set.cmd_size is
 * sizeof(struct nbd_cmd)).  Initialised once per request by
 * nbd_init_request().
 */
struct nbd_cmd {
	struct nbd_device *nbd;		/* owning device */
	struct list_head list;		/* initialised empty; not otherwise used in this file */
};
74 
#if IS_ENABLED(CONFIG_DEBUG_FS)
/* Top-level "nbd" debugfs directory; per-device directories live below it. */
static struct dentry *nbd_dbg_dir;
#endif

/* Name of the gendisk backing an nbd device ("nbd%d"). */
#define nbd_name(nbd) ((nbd)->disk->disk_name)

/* Sanity value stored in nbd_device.magic while the device is registered. */
#define NBD_MAGIC 0x68797548

/* Module parameter: number of devices created at init. */
static unsigned int nbds_max = 16;
/* Array of nbds_max devices, allocated in nbd_init(). */
static struct nbd_device *nbd_dev;
/* Module parameter: partitions per device (0 disables partition support). */
static int max_part;
86 
87 static inline struct device *nbd_to_dev(struct nbd_device *nbd)
88 {
89 	return disk_to_dev(nbd->disk);
90 }
91 
92 static bool nbd_is_connected(struct nbd_device *nbd)
93 {
94 	return !!nbd->task_recv;
95 }
96 
97 static const char *nbdcmd_to_ascii(int cmd)
98 {
99 	switch (cmd) {
100 	case  NBD_CMD_READ: return "read";
101 	case NBD_CMD_WRITE: return "write";
102 	case  NBD_CMD_DISC: return "disconnect";
103 	case NBD_CMD_FLUSH: return "flush";
104 	case  NBD_CMD_TRIM: return "trim/discard";
105 	}
106 	return "invalid";
107 }
108 
109 static int nbd_size_clear(struct nbd_device *nbd, struct block_device *bdev)
110 {
111 	bdev->bd_inode->i_size = 0;
112 	set_capacity(nbd->disk, 0);
113 	kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
114 
115 	return 0;
116 }
117 
118 static void nbd_size_update(struct nbd_device *nbd, struct block_device *bdev)
119 {
120 	if (!nbd_is_connected(nbd))
121 		return;
122 
123 	bdev->bd_inode->i_size = nbd->bytesize;
124 	set_capacity(nbd->disk, nbd->bytesize >> 9);
125 	kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
126 }
127 
128 static int nbd_size_set(struct nbd_device *nbd, struct block_device *bdev,
129 			int blocksize, int nr_blocks)
130 {
131 	int ret;
132 
133 	ret = set_blocksize(bdev, blocksize);
134 	if (ret)
135 		return ret;
136 
137 	nbd->blksize = blocksize;
138 	nbd->bytesize = (loff_t)blocksize * (loff_t)nr_blocks;
139 
140 	nbd_size_update(nbd, bdev);
141 
142 	return 0;
143 }
144 
145 static void nbd_end_request(struct nbd_cmd *cmd)
146 {
147 	struct nbd_device *nbd = cmd->nbd;
148 	struct request *req = blk_mq_rq_from_pdu(cmd);
149 	int error = req->errors ? -EIO : 0;
150 
151 	dev_dbg(nbd_to_dev(nbd), "request %p: %s\n", cmd,
152 		error ? "failed" : "done");
153 
154 	blk_mq_complete_request(req, error);
155 }
156 
157 /*
158  * Forcibly shutdown the socket causing all listeners to error
159  */
160 static void sock_shutdown(struct nbd_device *nbd)
161 {
162 	struct socket *sock;
163 
164 	spin_lock(&nbd->sock_lock);
165 
166 	if (!nbd->sock) {
167 		spin_unlock_irq(&nbd->sock_lock);
168 		return;
169 	}
170 
171 	sock = nbd->sock;
172 	dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n");
173 	nbd->sock = NULL;
174 	spin_unlock(&nbd->sock_lock);
175 
176 	kernel_sock_shutdown(sock, SHUT_RDWR);
177 	sockfd_put(sock);
178 }
179 
/*
 * blk-mq timeout handler (nbd_mq_ops.timeout).
 *
 * Marks the device as timed out, shuts down the socket (if one is still
 * attached) so that pending socket operations fail, and records an error
 * on the request.  Always returns BLK_EH_HANDLED.
 */
static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
						 bool reserved)
{
	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
	struct nbd_device *nbd = cmd->nbd;
	struct socket *sock = NULL;

	spin_lock(&nbd->sock_lock);

	set_bit(NBD_TIMEDOUT, &nbd->runtime_flags);

	if (nbd->sock) {
		sock = nbd->sock;
		/* Take our own file reference so the socket can be used after unlocking. */
		get_file(sock->file);
	}

	spin_unlock(&nbd->sock_lock);
	if (sock) {
		kernel_sock_shutdown(sock, SHUT_RDWR);
		/* Drops the reference taken above; nbd->sock keeps its own. */
		sockfd_put(sock);
	}

	req->errors++;
	dev_err(nbd_to_dev(nbd), "Connection timed out, shutting down connection\n");
	return BLK_EH_HANDLED;
}
206 
/*
 *  Send or receive packet.
 *
 *  @nbd:       device whose socket is used
 *  @send:      non-zero to send @buf, zero to receive into @buf
 *  @buf:       kernel buffer
 *  @size:      number of bytes to transfer
 *  @msg_flags: MSG_* flags; MSG_NOSIGNAL is always added
 *
 *  Loops until @size bytes have been transferred or a call fails.
 *  Returns the result of the last kernel_sendmsg()/kernel_recvmsg() call
 *  (positive on success), a negative errno on failure, -EPIPE if the
 *  socket call returned 0, or -EINVAL if no socket is attached.
 */
static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
		int msg_flags)
{
	struct socket *sock = nbd->sock;
	int result;
	struct msghdr msg;
	struct kvec iov;
	/* Saved so that PF_MEMALLOC can be restored to its previous state below. */
	unsigned long pflags = current->flags;

	if (unlikely(!sock)) {
		dev_err(disk_to_dev(nbd->disk),
			"Attempted %s on closed socket in sock_xmit\n",
			(send ? "send" : "recv"));
		return -EINVAL;
	}

	current->flags |= PF_MEMALLOC;
	do {
		sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
		iov.iov_base = buf;
		iov.iov_len = size;
		msg.msg_name = NULL;
		msg.msg_namelen = 0;
		msg.msg_control = NULL;
		msg.msg_controllen = 0;
		msg.msg_flags = msg_flags | MSG_NOSIGNAL;

		if (send)
			result = kernel_sendmsg(sock, &msg, &iov, 1, size);
		else
			result = kernel_recvmsg(sock, &msg, &iov, 1, size,
						msg.msg_flags);

		if (result <= 0) {
			if (result == 0)
				result = -EPIPE; /* short read */
			break;
		}
		/* Partial transfer: advance past what was handled and retry. */
		size -= result;
		buf += result;
	} while (size > 0);

	tsk_restore_flags(current, pflags, PF_MEMALLOC);

	return result;
}
256 
257 static inline int sock_send_bvec(struct nbd_device *nbd, struct bio_vec *bvec,
258 		int flags)
259 {
260 	int result;
261 	void *kaddr = kmap(bvec->bv_page);
262 	result = sock_xmit(nbd, 1, kaddr + bvec->bv_offset,
263 			   bvec->bv_len, flags);
264 	kunmap(bvec->bv_page);
265 	return result;
266 }
267 
/*
 * Transmit one request to the server: an nbd_request header followed, for
 * writes, by the request's data segments.
 *
 * Returns 0 on success or -EIO if sending the header or any data segment
 * fails.
 *
 * always call with the tx_lock held
 */
static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd)
{
	struct request *req = blk_mq_rq_from_pdu(cmd);
	int result, flags;
	struct nbd_request request;
	unsigned long size = blk_rq_bytes(req);
	u32 type;

	/* Driver-private requests are only used to carry the disconnect command. */
	if (req->cmd_type == REQ_TYPE_DRV_PRIV)
		type = NBD_CMD_DISC;
	else if (req_op(req) == REQ_OP_DISCARD)
		type = NBD_CMD_TRIM;
	else if (req_op(req) == REQ_OP_FLUSH)
		type = NBD_CMD_FLUSH;
	else if (rq_data_dir(req) == WRITE)
		type = NBD_CMD_WRITE;
	else
		type = NBD_CMD_READ;

	memset(&request, 0, sizeof(request));
	request.magic = htonl(NBD_REQUEST_MAGIC);
	request.type = htonl(type);
	/* Flush and disconnect carry no offset/length; the fields stay zero. */
	if (type != NBD_CMD_FLUSH && type != NBD_CMD_DISC) {
		request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
		request.len = htonl(size);
	}
	/* The handle carries the request tag; nbd_read_stat() maps it back. */
	memcpy(request.handle, &req->tag, sizeof(req->tag));

	dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n",
		cmd, nbdcmd_to_ascii(type),
		(unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
	/* For writes, data follows immediately, so hint MSG_MORE on the header. */
	result = sock_xmit(nbd, 1, &request, sizeof(request),
			(type == NBD_CMD_WRITE) ? MSG_MORE : 0);
	if (result <= 0) {
		dev_err(disk_to_dev(nbd->disk),
			"Send control failed (result %d)\n", result);
		return -EIO;
	}

	if (type == NBD_CMD_WRITE) {
		struct req_iterator iter;
		struct bio_vec bvec;
		/*
		 * we are really probing at internals to determine
		 * whether to set MSG_MORE or not...
		 */
		rq_for_each_segment(bvec, req, iter) {
			flags = 0;
			if (!rq_iter_last(bvec, iter))
				flags = MSG_MORE;
			dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n",
				cmd, bvec.bv_len);
			result = sock_send_bvec(nbd, &bvec, flags);
			if (result <= 0) {
				dev_err(disk_to_dev(nbd->disk),
					"Send data failed (result %d)\n",
					result);
				return -EIO;
			}
		}
	}
	return 0;
}
332 
333 static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec)
334 {
335 	int result;
336 	void *kaddr = kmap(bvec->bv_page);
337 	result = sock_xmit(nbd, 0, kaddr + bvec->bv_offset, bvec->bv_len,
338 			MSG_WAITALL);
339 	kunmap(bvec->bv_page);
340 	return result;
341 }
342 
/*
 * Read one reply from the server and locate the request it belongs to.
 *
 * Returns the nbd_cmd of the matching request (with req->errors bumped if
 * the server reported an error or receiving the read data failed), or an
 * ERR_PTR() value when the reply header cannot be received (the sock_xmit()
 * error), has a bad magic (-EPROTO), or does not match a started request
 * (-ENOENT).  It never returns NULL; callers must test with IS_ERR().
 */
static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd)
{
	int result;
	struct nbd_reply reply;
	struct nbd_cmd *cmd;
	struct request *req = NULL;
	u16 hwq;
	int tag;

	reply.magic = 0;
	result = sock_xmit(nbd, 0, &reply, sizeof(reply), MSG_WAITALL);
	if (result <= 0) {
		dev_err(disk_to_dev(nbd->disk),
			"Receive control failed (result %d)\n", result);
		return ERR_PTR(result);
	}

	if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
		dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
				(unsigned long)ntohl(reply.magic));
		return ERR_PTR(-EPROTO);
	}

	/* The handle echoes the tag that nbd_send_cmd() stored in the request. */
	memcpy(&tag, reply.handle, sizeof(int));

	hwq = blk_mq_unique_tag_to_hwq(tag);
	/* Ignore replies naming a hardware queue we do not have. */
	if (hwq < nbd->tag_set.nr_hw_queues)
		req = blk_mq_tag_to_rq(nbd->tag_set.tags[hwq],
				       blk_mq_unique_tag_to_tag(tag));
	if (!req || !blk_mq_request_started(req)) {
		dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%d) %p\n",
			tag, req);
		return ERR_PTR(-ENOENT);
	}
	cmd = blk_mq_rq_to_pdu(req);

	if (ntohl(reply.error)) {
		dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
			ntohl(reply.error));
		req->errors++;
		return cmd;
	}

	dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", cmd);
	/* Non-write requests are followed by payload for each segment. */
	if (rq_data_dir(req) != WRITE) {
		struct req_iterator iter;
		struct bio_vec bvec;

		rq_for_each_segment(bvec, req, iter) {
			result = sock_recv_bvec(nbd, &bvec);
			if (result <= 0) {
				dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
					result);
				req->errors++;
				return cmd;
			}
			dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n",
				cmd, bvec.bv_len);
		}
	}
	return cmd;
}
406 
407 static ssize_t pid_show(struct device *dev,
408 			struct device_attribute *attr, char *buf)
409 {
410 	struct gendisk *disk = dev_to_disk(dev);
411 	struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
412 
413 	return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv));
414 }
415 
/*
 * Read-only "pid" device attribute; created and removed by
 * nbd_thread_recv() around the receive loop.
 */
static struct device_attribute pid_attr = {
	.attr = { .name = "pid", .mode = S_IRUGO},
	.show = pid_show,
};
420 
421 static int nbd_thread_recv(struct nbd_device *nbd, struct block_device *bdev)
422 {
423 	struct nbd_cmd *cmd;
424 	int ret;
425 
426 	BUG_ON(nbd->magic != NBD_MAGIC);
427 
428 	sk_set_memalloc(nbd->sock->sk);
429 
430 	ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
431 	if (ret) {
432 		dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
433 		return ret;
434 	}
435 
436 	nbd_size_update(nbd, bdev);
437 
438 	while (1) {
439 		cmd = nbd_read_stat(nbd);
440 		if (IS_ERR(cmd)) {
441 			ret = PTR_ERR(cmd);
442 			break;
443 		}
444 
445 		nbd_end_request(cmd);
446 	}
447 
448 	nbd_size_clear(nbd, bdev);
449 
450 	device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
451 	return ret;
452 }
453 
454 static void nbd_clear_req(struct request *req, void *data, bool reserved)
455 {
456 	struct nbd_cmd *cmd;
457 
458 	if (!blk_mq_request_started(req))
459 		return;
460 	cmd = blk_mq_rq_to_pdu(req);
461 	req->errors++;
462 	nbd_end_request(cmd);
463 }
464 
/*
 * Fail every started request of the device by running nbd_clear_req()
 * over all busy tags.  The socket must already have been detached.
 */
static void nbd_clear_que(struct nbd_device *nbd)
{
	BUG_ON(nbd->magic != NBD_MAGIC);

	/*
	 * Because we have set nbd->sock to NULL under the tx_lock, all
	 * modifications to the list must have completed by now.
	 */
	BUG_ON(nbd->sock);

	blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL);
	dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
}
478 
479 
480 static void nbd_handle_cmd(struct nbd_cmd *cmd)
481 {
482 	struct request *req = blk_mq_rq_from_pdu(cmd);
483 	struct nbd_device *nbd = cmd->nbd;
484 
485 	if (req->cmd_type != REQ_TYPE_FS)
486 		goto error_out;
487 
488 	if (rq_data_dir(req) == WRITE &&
489 	    (nbd->flags & NBD_FLAG_READ_ONLY)) {
490 		dev_err(disk_to_dev(nbd->disk),
491 			"Write on read-only\n");
492 		goto error_out;
493 	}
494 
495 	req->errors = 0;
496 
497 	mutex_lock(&nbd->tx_lock);
498 	nbd->task_send = current;
499 	if (unlikely(!nbd->sock)) {
500 		mutex_unlock(&nbd->tx_lock);
501 		dev_err(disk_to_dev(nbd->disk),
502 			"Attempted send on closed socket\n");
503 		goto error_out;
504 	}
505 
506 	if (nbd_send_cmd(nbd, cmd) != 0) {
507 		dev_err(disk_to_dev(nbd->disk), "Request send failed\n");
508 		req->errors++;
509 		nbd_end_request(cmd);
510 	}
511 
512 	nbd->task_send = NULL;
513 	mutex_unlock(&nbd->tx_lock);
514 
515 	return;
516 
517 error_out:
518 	req->errors++;
519 	nbd_end_request(cmd);
520 }
521 
522 static int nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
523 			const struct blk_mq_queue_data *bd)
524 {
525 	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
526 
527 	blk_mq_start_request(bd->rq);
528 	nbd_handle_cmd(cmd);
529 	return BLK_MQ_RQ_QUEUE_OK;
530 }
531 
532 static int nbd_set_socket(struct nbd_device *nbd, struct socket *sock)
533 {
534 	int ret = 0;
535 
536 	spin_lock_irq(&nbd->sock_lock);
537 
538 	if (nbd->sock) {
539 		ret = -EBUSY;
540 		goto out;
541 	}
542 
543 	nbd->sock = sock;
544 
545 out:
546 	spin_unlock_irq(&nbd->sock_lock);
547 
548 	return ret;
549 }
550 
551 /* Reset all properties of an NBD device */
552 static void nbd_reset(struct nbd_device *nbd)
553 {
554 	nbd->runtime_flags = 0;
555 	nbd->blksize = 1024;
556 	nbd->bytesize = 0;
557 	set_capacity(nbd->disk, 0);
558 	nbd->flags = 0;
559 	nbd->tag_set.timeout = 0;
560 	queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
561 }
562 
563 static void nbd_bdev_reset(struct block_device *bdev)
564 {
565 	set_device_ro(bdev, false);
566 	bdev->bd_inode->i_size = 0;
567 	if (max_part > 0) {
568 		blkdev_reread_part(bdev);
569 		bdev->bd_invalidated = 1;
570 	}
571 }
572 
573 static void nbd_parse_flags(struct nbd_device *nbd, struct block_device *bdev)
574 {
575 	if (nbd->flags & NBD_FLAG_READ_ONLY)
576 		set_device_ro(bdev, true);
577 	if (nbd->flags & NBD_FLAG_SEND_TRIM)
578 		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
579 	if (nbd->flags & NBD_FLAG_SEND_FLUSH)
580 		blk_queue_write_cache(nbd->disk->queue, true, false);
581 	else
582 		blk_queue_write_cache(nbd->disk->queue, false, false);
583 }
584 
/*
 * Per-device debugfs setup/teardown; defined further down, with empty
 * variants when CONFIG_DEBUG_FS is disabled.
 */
static int nbd_dev_dbg_init(struct nbd_device *nbd);
static void nbd_dev_dbg_close(struct nbd_device *nbd);
587 
588 /* Must be called with tx_lock held */
589 
590 static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
591 		       unsigned int cmd, unsigned long arg)
592 {
593 	switch (cmd) {
594 	case NBD_DISCONNECT: {
595 		struct request *sreq;
596 
597 		dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
598 		if (!nbd->sock)
599 			return -EINVAL;
600 
601 		sreq = blk_mq_alloc_request(bdev_get_queue(bdev), WRITE, 0);
602 		if (!sreq)
603 			return -ENOMEM;
604 
605 		mutex_unlock(&nbd->tx_lock);
606 		fsync_bdev(bdev);
607 		mutex_lock(&nbd->tx_lock);
608 		sreq->cmd_type = REQ_TYPE_DRV_PRIV;
609 
610 		/* Check again after getting mutex back.  */
611 		if (!nbd->sock) {
612 			blk_mq_free_request(sreq);
613 			return -EINVAL;
614 		}
615 
616 		set_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags);
617 
618 		nbd_send_cmd(nbd, blk_mq_rq_to_pdu(sreq));
619 		blk_mq_free_request(sreq);
620 		return 0;
621 	}
622 
623 	case NBD_CLEAR_SOCK:
624 		sock_shutdown(nbd);
625 		nbd_clear_que(nbd);
626 		kill_bdev(bdev);
627 		return 0;
628 
629 	case NBD_SET_SOCK: {
630 		int err;
631 		struct socket *sock = sockfd_lookup(arg, &err);
632 
633 		if (!sock)
634 			return err;
635 
636 		err = nbd_set_socket(nbd, sock);
637 		if (!err && max_part)
638 			bdev->bd_invalidated = 1;
639 
640 		return err;
641 	}
642 
643 	case NBD_SET_BLKSIZE: {
644 		loff_t bsize = div_s64(nbd->bytesize, arg);
645 
646 		return nbd_size_set(nbd, bdev, arg, bsize);
647 	}
648 
649 	case NBD_SET_SIZE:
650 		return nbd_size_set(nbd, bdev, nbd->blksize,
651 				    arg / nbd->blksize);
652 
653 	case NBD_SET_SIZE_BLOCKS:
654 		return nbd_size_set(nbd, bdev, nbd->blksize, arg);
655 
656 	case NBD_SET_TIMEOUT:
657 		nbd->tag_set.timeout = arg * HZ;
658 		return 0;
659 
660 	case NBD_SET_FLAGS:
661 		nbd->flags = arg;
662 		return 0;
663 
664 	case NBD_DO_IT: {
665 		int error;
666 
667 		if (nbd->task_recv)
668 			return -EBUSY;
669 		if (!nbd->sock)
670 			return -EINVAL;
671 
672 		/* We have to claim the device under the lock */
673 		nbd->task_recv = current;
674 		mutex_unlock(&nbd->tx_lock);
675 
676 		nbd_parse_flags(nbd, bdev);
677 
678 		nbd_dev_dbg_init(nbd);
679 		error = nbd_thread_recv(nbd, bdev);
680 		nbd_dev_dbg_close(nbd);
681 
682 		mutex_lock(&nbd->tx_lock);
683 		nbd->task_recv = NULL;
684 
685 		sock_shutdown(nbd);
686 		nbd_clear_que(nbd);
687 		kill_bdev(bdev);
688 		nbd_bdev_reset(bdev);
689 
690 		/* user requested, ignore socket errors */
691 		if (test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags))
692 			error = 0;
693 		if (test_bit(NBD_TIMEDOUT, &nbd->runtime_flags))
694 			error = -ETIMEDOUT;
695 
696 		nbd_reset(nbd);
697 
698 		return error;
699 	}
700 
701 	case NBD_CLEAR_QUE:
702 		/*
703 		 * This is for compatibility only.  The queue is always cleared
704 		 * by NBD_DO_IT or NBD_CLEAR_SOCK.
705 		 */
706 		return 0;
707 
708 	case NBD_PRINT_DEBUG:
709 		/*
710 		 * For compatibility only, we no longer keep a list of
711 		 * outstanding requests.
712 		 */
713 		return 0;
714 	}
715 	return -ENOTTY;
716 }
717 
718 static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
719 		     unsigned int cmd, unsigned long arg)
720 {
721 	struct nbd_device *nbd = bdev->bd_disk->private_data;
722 	int error;
723 
724 	if (!capable(CAP_SYS_ADMIN))
725 		return -EPERM;
726 
727 	BUG_ON(nbd->magic != NBD_MAGIC);
728 
729 	mutex_lock(&nbd->tx_lock);
730 	error = __nbd_ioctl(bdev, nbd, cmd, arg);
731 	mutex_unlock(&nbd->tx_lock);
732 
733 	return error;
734 }
735 
/*
 * Block device operations for nbd disks.  The same handler serves native
 * and compat ioctls.
 */
static const struct block_device_operations nbd_fops =
{
	.owner =	THIS_MODULE,
	.ioctl =	nbd_ioctl,
	.compat_ioctl =	nbd_ioctl,
};
742 
743 #if IS_ENABLED(CONFIG_DEBUG_FS)
744 
745 static int nbd_dbg_tasks_show(struct seq_file *s, void *unused)
746 {
747 	struct nbd_device *nbd = s->private;
748 
749 	if (nbd->task_recv)
750 		seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv));
751 	if (nbd->task_send)
752 		seq_printf(s, "send: %d\n", task_pid_nr(nbd->task_send));
753 
754 	return 0;
755 }
756 
757 static int nbd_dbg_tasks_open(struct inode *inode, struct file *file)
758 {
759 	return single_open(file, nbd_dbg_tasks_show, inode->i_private);
760 }
761 
/* File operations for the per-device debugfs "tasks" file. */
static const struct file_operations nbd_dbg_tasks_ops = {
	.open = nbd_dbg_tasks_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
768 
769 static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
770 {
771 	struct nbd_device *nbd = s->private;
772 	u32 flags = nbd->flags;
773 
774 	seq_printf(s, "Hex: 0x%08x\n\n", flags);
775 
776 	seq_puts(s, "Known flags:\n");
777 
778 	if (flags & NBD_FLAG_HAS_FLAGS)
779 		seq_puts(s, "NBD_FLAG_HAS_FLAGS\n");
780 	if (flags & NBD_FLAG_READ_ONLY)
781 		seq_puts(s, "NBD_FLAG_READ_ONLY\n");
782 	if (flags & NBD_FLAG_SEND_FLUSH)
783 		seq_puts(s, "NBD_FLAG_SEND_FLUSH\n");
784 	if (flags & NBD_FLAG_SEND_TRIM)
785 		seq_puts(s, "NBD_FLAG_SEND_TRIM\n");
786 
787 	return 0;
788 }
789 
790 static int nbd_dbg_flags_open(struct inode *inode, struct file *file)
791 {
792 	return single_open(file, nbd_dbg_flags_show, inode->i_private);
793 }
794 
/* File operations for the per-device debugfs "flags" file. */
static const struct file_operations nbd_dbg_flags_ops = {
	.open = nbd_dbg_flags_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
801 
802 static int nbd_dev_dbg_init(struct nbd_device *nbd)
803 {
804 	struct dentry *dir;
805 
806 	if (!nbd_dbg_dir)
807 		return -EIO;
808 
809 	dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir);
810 	if (!dir) {
811 		dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s'\n",
812 			nbd_name(nbd));
813 		return -EIO;
814 	}
815 	nbd->dbg_dir = dir;
816 
817 	debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops);
818 	debugfs_create_u64("size_bytes", 0444, dir, &nbd->bytesize);
819 	debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout);
820 	debugfs_create_u32("blocksize", 0444, dir, &nbd->blksize);
821 	debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops);
822 
823 	return 0;
824 }
825 
826 static void nbd_dev_dbg_close(struct nbd_device *nbd)
827 {
828 	debugfs_remove_recursive(nbd->dbg_dir);
829 }
830 
831 static int nbd_dbg_init(void)
832 {
833 	struct dentry *dbg_dir;
834 
835 	dbg_dir = debugfs_create_dir("nbd", NULL);
836 	if (!dbg_dir)
837 		return -EIO;
838 
839 	nbd_dbg_dir = dbg_dir;
840 
841 	return 0;
842 }
843 
844 static void nbd_dbg_close(void)
845 {
846 	debugfs_remove_recursive(nbd_dbg_dir);
847 }
848 
849 #else  /* IS_ENABLED(CONFIG_DEBUG_FS) */
850 
/*
 * Variants used when CONFIG_DEBUG_FS is disabled: the init functions
 * report success and the close functions do nothing.
 */
static int nbd_dev_dbg_init(struct nbd_device *nbd)
{
	return 0;
}

static void nbd_dev_dbg_close(struct nbd_device *nbd)
{
}

static int nbd_dbg_init(void)
{
	return 0;
}

static void nbd_dbg_close(void)
{
}
868 
869 #endif
870 
871 static int nbd_init_request(void *data, struct request *rq,
872 			    unsigned int hctx_idx, unsigned int request_idx,
873 			    unsigned int numa_node)
874 {
875 	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq);
876 
877 	cmd->nbd = data;
878 	INIT_LIST_HEAD(&cmd->list);
879 	return 0;
880 }
881 
/* blk-mq operations installed in every device's tag set by nbd_init(). */
static struct blk_mq_ops nbd_mq_ops = {
	.queue_rq	= nbd_queue_rq,
	.init_request	= nbd_init_request,
	.timeout	= nbd_xmit_timeout,
};
887 
888 /*
889  * And here should be modules and kernel interface
890  *  (Just smiley confuses emacs :-)
891  */
892 
893 static int __init nbd_init(void)
894 {
895 	int err = -ENOMEM;
896 	int i;
897 	int part_shift;
898 
899 	BUILD_BUG_ON(sizeof(struct nbd_request) != 28);
900 
901 	if (max_part < 0) {
902 		printk(KERN_ERR "nbd: max_part must be >= 0\n");
903 		return -EINVAL;
904 	}
905 
906 	part_shift = 0;
907 	if (max_part > 0) {
908 		part_shift = fls(max_part);
909 
910 		/*
911 		 * Adjust max_part according to part_shift as it is exported
912 		 * to user space so that user can know the max number of
913 		 * partition kernel should be able to manage.
914 		 *
915 		 * Note that -1 is required because partition 0 is reserved
916 		 * for the whole disk.
917 		 */
918 		max_part = (1UL << part_shift) - 1;
919 	}
920 
921 	if ((1UL << part_shift) > DISK_MAX_PARTS)
922 		return -EINVAL;
923 
924 	if (nbds_max > 1UL << (MINORBITS - part_shift))
925 		return -EINVAL;
926 
927 	nbd_dev = kcalloc(nbds_max, sizeof(*nbd_dev), GFP_KERNEL);
928 	if (!nbd_dev)
929 		return -ENOMEM;
930 
931 	for (i = 0; i < nbds_max; i++) {
932 		struct gendisk *disk = alloc_disk(1 << part_shift);
933 		if (!disk)
934 			goto out;
935 		nbd_dev[i].disk = disk;
936 
937 		nbd_dev[i].tag_set.ops = &nbd_mq_ops;
938 		nbd_dev[i].tag_set.nr_hw_queues = 1;
939 		nbd_dev[i].tag_set.queue_depth = 128;
940 		nbd_dev[i].tag_set.numa_node = NUMA_NO_NODE;
941 		nbd_dev[i].tag_set.cmd_size = sizeof(struct nbd_cmd);
942 		nbd_dev[i].tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
943 			BLK_MQ_F_SG_MERGE | BLK_MQ_F_BLOCKING;
944 		nbd_dev[i].tag_set.driver_data = &nbd_dev[i];
945 
946 		err = blk_mq_alloc_tag_set(&nbd_dev[i].tag_set);
947 		if (err) {
948 			put_disk(disk);
949 			goto out;
950 		}
951 
952 		/*
953 		 * The new linux 2.5 block layer implementation requires
954 		 * every gendisk to have its very own request_queue struct.
955 		 * These structs are big so we dynamically allocate them.
956 		 */
957 		disk->queue = blk_mq_init_queue(&nbd_dev[i].tag_set);
958 		if (!disk->queue) {
959 			blk_mq_free_tag_set(&nbd_dev[i].tag_set);
960 			put_disk(disk);
961 			goto out;
962 		}
963 
964 		/*
965 		 * Tell the block layer that we are not a rotational device
966 		 */
967 		queue_flag_set_unlocked(QUEUE_FLAG_NONROT, disk->queue);
968 		queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, disk->queue);
969 		disk->queue->limits.discard_granularity = 512;
970 		blk_queue_max_discard_sectors(disk->queue, UINT_MAX);
971 		disk->queue->limits.discard_zeroes_data = 0;
972 		blk_queue_max_hw_sectors(disk->queue, 65536);
973 		disk->queue->limits.max_sectors = 256;
974 	}
975 
976 	if (register_blkdev(NBD_MAJOR, "nbd")) {
977 		err = -EIO;
978 		goto out;
979 	}
980 
981 	printk(KERN_INFO "nbd: registered device at major %d\n", NBD_MAJOR);
982 
983 	nbd_dbg_init();
984 
985 	for (i = 0; i < nbds_max; i++) {
986 		struct gendisk *disk = nbd_dev[i].disk;
987 		nbd_dev[i].magic = NBD_MAGIC;
988 		spin_lock_init(&nbd_dev[i].sock_lock);
989 		mutex_init(&nbd_dev[i].tx_lock);
990 		disk->major = NBD_MAJOR;
991 		disk->first_minor = i << part_shift;
992 		disk->fops = &nbd_fops;
993 		disk->private_data = &nbd_dev[i];
994 		sprintf(disk->disk_name, "nbd%d", i);
995 		nbd_reset(&nbd_dev[i]);
996 		add_disk(disk);
997 	}
998 
999 	return 0;
1000 out:
1001 	while (i--) {
1002 		blk_mq_free_tag_set(&nbd_dev[i].tag_set);
1003 		blk_cleanup_queue(nbd_dev[i].disk->queue);
1004 		put_disk(nbd_dev[i].disk);
1005 	}
1006 	kfree(nbd_dev);
1007 	return err;
1008 }
1009 
1010 static void __exit nbd_cleanup(void)
1011 {
1012 	int i;
1013 
1014 	nbd_dbg_close();
1015 
1016 	for (i = 0; i < nbds_max; i++) {
1017 		struct gendisk *disk = nbd_dev[i].disk;
1018 		nbd_dev[i].magic = 0;
1019 		if (disk) {
1020 			del_gendisk(disk);
1021 			blk_cleanup_queue(disk->queue);
1022 			blk_mq_free_tag_set(&nbd_dev[i].tag_set);
1023 			put_disk(disk);
1024 		}
1025 	}
1026 	unregister_blkdev(NBD_MAJOR, "nbd");
1027 	kfree(nbd_dev);
1028 	printk(KERN_INFO "nbd: unregistered device at major %d\n", NBD_MAJOR);
1029 }
1030 
/* Module entry and exit points. */
module_init(nbd_init);
module_exit(nbd_cleanup);

MODULE_DESCRIPTION("Network Block Device");
MODULE_LICENSE("GPL");

/*
 * Load-time parameters, readable through sysfs (mode 0444).
 * NOTE(review): nbds_max is declared "unsigned int" but registered with
 * parameter type "int" -- confirm whether "uint" was intended.
 */
module_param(nbds_max, int, 0444);
MODULE_PARM_DESC(nbds_max, "number of network block devices to initialize (default: 16)");
module_param(max_part, int, 0444);
MODULE_PARM_DESC(max_part, "number of partitions per device (default: 0)");
1041