xref: /openbmc/linux/drivers/block/ublk_drv.c (revision 8ebc80a25f9d9bf7a8e368b266d5b740c485c362)
171f28f31SMing Lei // SPDX-License-Identifier: GPL-2.0-or-later
271f28f31SMing Lei /*
371f28f31SMing Lei  * Userspace block device - block device which IO is handled from userspace
471f28f31SMing Lei  *
571f28f31SMing Lei  * Take full use of io_uring passthrough command for communicating with
671f28f31SMing Lei  * ublk userspace daemon(ublksrvd) for handling basic IO request.
771f28f31SMing Lei  *
871f28f31SMing Lei  * Copyright 2022 Ming Lei <ming.lei@redhat.com>
971f28f31SMing Lei  *
1071f28f31SMing Lei  * (part of code stolen from loop.c)
1171f28f31SMing Lei  */
1271f28f31SMing Lei #include <linux/module.h>
1371f28f31SMing Lei #include <linux/moduleparam.h>
1471f28f31SMing Lei #include <linux/sched.h>
1571f28f31SMing Lei #include <linux/fs.h>
1671f28f31SMing Lei #include <linux/pagemap.h>
1771f28f31SMing Lei #include <linux/file.h>
1871f28f31SMing Lei #include <linux/stat.h>
1971f28f31SMing Lei #include <linux/errno.h>
2071f28f31SMing Lei #include <linux/major.h>
2171f28f31SMing Lei #include <linux/wait.h>
2271f28f31SMing Lei #include <linux/blkdev.h>
2371f28f31SMing Lei #include <linux/init.h>
2471f28f31SMing Lei #include <linux/swap.h>
2571f28f31SMing Lei #include <linux/slab.h>
2671f28f31SMing Lei #include <linux/compat.h>
2771f28f31SMing Lei #include <linux/mutex.h>
2871f28f31SMing Lei #include <linux/writeback.h>
2971f28f31SMing Lei #include <linux/completion.h>
3071f28f31SMing Lei #include <linux/highmem.h>
3171f28f31SMing Lei #include <linux/sysfs.h>
3271f28f31SMing Lei #include <linux/miscdevice.h>
3371f28f31SMing Lei #include <linux/falloc.h>
3471f28f31SMing Lei #include <linux/uio.h>
3571f28f31SMing Lei #include <linux/ioprio.h>
3671f28f31SMing Lei #include <linux/sched/mm.h>
3771f28f31SMing Lei #include <linux/uaccess.h>
3871f28f31SMing Lei #include <linux/cdev.h>
3971f28f31SMing Lei #include <linux/io_uring.h>
4071f28f31SMing Lei #include <linux/blk-mq.h>
4171f28f31SMing Lei #include <linux/delay.h>
4271f28f31SMing Lei #include <linux/mm.h>
4371f28f31SMing Lei #include <asm/page.h>
440edb3696SMing Lei #include <linux/task_work.h>
454093cb5aSMing Lei #include <linux/namei.h>
4682840669SMing Lei #include <linux/kref.h>
4771f28f31SMing Lei #include <uapi/linux/ublk_cmd.h>
4871f28f31SMing Lei 
4971f28f31SMing Lei #define UBLK_MINORS		(1U << MINORBITS)
5071f28f31SMing Lei 
516d8c5afcSMing Lei /* All UBLK_F_* have to be included into UBLK_F_ALL */
52c86019ffSZiyangZhang #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
53c86019ffSZiyangZhang 		| UBLK_F_URING_CMD_COMP_IN_TASK \
5477a440e2SZiyangZhang 		| UBLK_F_NEED_GET_DATA \
55a0d41dc1SZiyangZhang 		| UBLK_F_USER_RECOVERY \
564093cb5aSMing Lei 		| UBLK_F_USER_RECOVERY_REISSUE \
572d786e66SMing Lei 		| UBLK_F_UNPRIVILEGED_DEV \
581172d5b8SMing Lei 		| UBLK_F_CMD_IOCTL_ENCODE \
5929802d7cSAndreas Hindborg 		| UBLK_F_USER_COPY \
6029802d7cSAndreas Hindborg 		| UBLK_F_ZONED)
616d8c5afcSMing Lei 
620aa73170SMing Lei /* All UBLK_PARAM_TYPE_* should be included here */
6329802d7cSAndreas Hindborg #define UBLK_PARAM_TYPE_ALL                                \
6429802d7cSAndreas Hindborg 	(UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
6529802d7cSAndreas Hindborg 	 UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED)
660aa73170SMing Lei 
670edb3696SMing Lei struct ublk_rq_data {
683ab6e94cSMing Lei 	struct llist_node node;
6982840669SMing Lei 
7082840669SMing Lei 	struct kref ref;
710edb3696SMing Lei };
720edb3696SMing Lei 
7371f28f31SMing Lei struct ublk_uring_cmd_pdu {
743ab6e94cSMing Lei 	struct ublk_queue *ubq;
7571f28f31SMing Lei };
7671f28f31SMing Lei 
7771f28f31SMing Lei /*
7871f28f31SMing Lei  * io command is active: sqe cmd is received, and its cqe isn't done
7971f28f31SMing Lei  *
8071f28f31SMing Lei  * If the flag is set, the io command is owned by ublk driver, and waited
8171f28f31SMing Lei  * for incoming blk-mq request from the ublk block device.
8271f28f31SMing Lei  *
8371f28f31SMing Lei  * If the flag is cleared, the io command will be completed, and owned by
8471f28f31SMing Lei  * ublk server.
8571f28f31SMing Lei  */
8671f28f31SMing Lei #define UBLK_IO_FLAG_ACTIVE	0x01
8771f28f31SMing Lei 
8871f28f31SMing Lei /*
8971f28f31SMing Lei  * IO command is completed via cqe, and it is being handled by ublksrv, and
9071f28f31SMing Lei  * not committed yet
9171f28f31SMing Lei  *
9271f28f31SMing Lei  * Basically exclusively with UBLK_IO_FLAG_ACTIVE, so can be served for
9371f28f31SMing Lei  * cross verification
9471f28f31SMing Lei  */
9571f28f31SMing Lei #define UBLK_IO_FLAG_OWNED_BY_SRV 0x02
9671f28f31SMing Lei 
9771f28f31SMing Lei /*
9871f28f31SMing Lei  * IO command is aborted, so this flag is set in case of
9971f28f31SMing Lei  * !UBLK_IO_FLAG_ACTIVE.
10071f28f31SMing Lei  *
10171f28f31SMing Lei  * After this flag is observed, any pending or new incoming request
10271f28f31SMing Lei  * associated with this io command will be failed immediately
10371f28f31SMing Lei  */
10471f28f31SMing Lei #define UBLK_IO_FLAG_ABORTED 0x04
10571f28f31SMing Lei 
106c86019ffSZiyangZhang /*
107c86019ffSZiyangZhang  * UBLK_IO_FLAG_NEED_GET_DATA is set because IO command requires
108c86019ffSZiyangZhang  * get data buffer address from ublksrv.
109c86019ffSZiyangZhang  *
110c86019ffSZiyangZhang  * Then, bio data could be copied into this data buffer for a WRITE request
111c86019ffSZiyangZhang  * after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset.
112c86019ffSZiyangZhang  */
113c86019ffSZiyangZhang #define UBLK_IO_FLAG_NEED_GET_DATA 0x08
114c86019ffSZiyangZhang 
1158cb8ef0cSMing Lei /* atomic RW with ubq->cancel_lock */
1168cb8ef0cSMing Lei #define UBLK_IO_FLAG_CANCELED	0x80000000
1178cb8ef0cSMing Lei 
11871f28f31SMing Lei struct ublk_io {
11971f28f31SMing Lei 	/* userspace buffer address from io cmd */
12071f28f31SMing Lei 	__u64	addr;
12171f28f31SMing Lei 	unsigned int flags;
12271f28f31SMing Lei 	int res;
12371f28f31SMing Lei 
12471f28f31SMing Lei 	struct io_uring_cmd *cmd;
12571f28f31SMing Lei };
12671f28f31SMing Lei 
12771f28f31SMing Lei struct ublk_queue {
12871f28f31SMing Lei 	int q_id;
12971f28f31SMing Lei 	int q_depth;
13071f28f31SMing Lei 
1310edb3696SMing Lei 	unsigned long flags;
13271f28f31SMing Lei 	struct task_struct	*ubq_daemon;
13371f28f31SMing Lei 	char *io_cmd_buf;
13471f28f31SMing Lei 
1353ab6e94cSMing Lei 	struct llist_head	io_cmds;
1363ab6e94cSMing Lei 
13771f28f31SMing Lei 	unsigned long io_addr;	/* mapped vm address */
13871f28f31SMing Lei 	unsigned int max_io_sz;
139bbae8d1fSZiyangZhang 	bool force_abort;
140c0b79b0fSMing Lei 	bool timeout;
14171f28f31SMing Lei 	unsigned short nr_io_ready;	/* how many ios setup */
1428cb8ef0cSMing Lei 	spinlock_t		cancel_lock;
14371f28f31SMing Lei 	struct ublk_device *dev;
14472495b5aSYushan Zhou 	struct ublk_io ios[];
14571f28f31SMing Lei };
14671f28f31SMing Lei 
14771f28f31SMing Lei #define UBLK_DAEMON_MONITOR_PERIOD	(5 * HZ)
14871f28f31SMing Lei 
14971f28f31SMing Lei struct ublk_device {
15071f28f31SMing Lei 	struct gendisk		*ub_disk;
15171f28f31SMing Lei 
15271f28f31SMing Lei 	char	*__queues;
15371f28f31SMing Lei 
15429baef78SLiu Xiaodong 	unsigned int	queue_size;
15571f28f31SMing Lei 	struct ublksrv_ctrl_dev_info	dev_info;
15671f28f31SMing Lei 
15771f28f31SMing Lei 	struct blk_mq_tag_set	tag_set;
15871f28f31SMing Lei 
15971f28f31SMing Lei 	struct cdev		cdev;
16071f28f31SMing Lei 	struct device		cdev_dev;
16171f28f31SMing Lei 
1628d9fdb60SDan Carpenter #define UB_STATE_OPEN		0
1638d9fdb60SDan Carpenter #define UB_STATE_USED		1
1640abe39deSMing Lei #define UB_STATE_DELETED	2
165fa362045SChristoph Hellwig 	unsigned long		state;
16671f28f31SMing Lei 	int			ub_number;
16771f28f31SMing Lei 
16871f28f31SMing Lei 	struct mutex		mutex;
16971f28f31SMing Lei 
170e94eb459SMing Lei 	spinlock_t		mm_lock;
17171f28f31SMing Lei 	struct mm_struct	*mm;
17271f28f31SMing Lei 
1730aa73170SMing Lei 	struct ublk_params	params;
1740aa73170SMing Lei 
17571f28f31SMing Lei 	struct completion	completion;
17671f28f31SMing Lei 	unsigned int		nr_queues_ready;
17773a166d9SMing Lei 	unsigned int		nr_privileged_daemon;
17871f28f31SMing Lei 
17971f28f31SMing Lei 	/*
18071f28f31SMing Lei 	 * Our ubq->daemon may be killed without any notification, so
18171f28f31SMing Lei 	 * monitor each queue's daemon periodically
18271f28f31SMing Lei 	 */
18371f28f31SMing Lei 	struct delayed_work	monitor_work;
184bbae8d1fSZiyangZhang 	struct work_struct	quiesce_work;
18571f28f31SMing Lei 	struct work_struct	stop_work;
18671f28f31SMing Lei };
18771f28f31SMing Lei 
1880aa73170SMing Lei /* header of ublk_params */
1890aa73170SMing Lei struct ublk_params_header {
1900aa73170SMing Lei 	__u32	len;
1910aa73170SMing Lei 	__u32	types;
1920aa73170SMing Lei };
1930aa73170SMing Lei 
19429802d7cSAndreas Hindborg static inline unsigned int ublk_req_build_flags(struct request *req);
19529802d7cSAndreas Hindborg static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
19629802d7cSAndreas Hindborg 						   int tag);
19729802d7cSAndreas Hindborg 
ublk_dev_is_user_copy(const struct ublk_device * ub)1989d4ed6d4SAndreas Hindborg static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub)
1999d4ed6d4SAndreas Hindborg {
2009d4ed6d4SAndreas Hindborg 	return ub->dev_info.flags & UBLK_F_USER_COPY;
2019d4ed6d4SAndreas Hindborg }
2029d4ed6d4SAndreas Hindborg 
ublk_dev_is_zoned(const struct ublk_device * ub)20329802d7cSAndreas Hindborg static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
20429802d7cSAndreas Hindborg {
20529802d7cSAndreas Hindborg 	return ub->dev_info.flags & UBLK_F_ZONED;
20629802d7cSAndreas Hindborg }
20729802d7cSAndreas Hindborg 
ublk_queue_is_zoned(struct ublk_queue * ubq)20829802d7cSAndreas Hindborg static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq)
20929802d7cSAndreas Hindborg {
21029802d7cSAndreas Hindborg 	return ubq->flags & UBLK_F_ZONED;
21129802d7cSAndreas Hindborg }
21229802d7cSAndreas Hindborg 
21329802d7cSAndreas Hindborg #ifdef CONFIG_BLK_DEV_ZONED
21429802d7cSAndreas Hindborg 
21575a5e590SMing Lei struct ublk_zoned_report_desc {
21675a5e590SMing Lei 	__u64 sector;
21775a5e590SMing Lei 	__u32 operation;
21875a5e590SMing Lei 	__u32 nr_zones;
21975a5e590SMing Lei };
22075a5e590SMing Lei 
22175a5e590SMing Lei static DEFINE_XARRAY(ublk_zoned_report_descs);
22275a5e590SMing Lei 
ublk_zoned_insert_report_desc(const struct request * req,struct ublk_zoned_report_desc * desc)22375a5e590SMing Lei static int ublk_zoned_insert_report_desc(const struct request *req,
22475a5e590SMing Lei 		struct ublk_zoned_report_desc *desc)
22575a5e590SMing Lei {
22675a5e590SMing Lei 	return xa_insert(&ublk_zoned_report_descs, (unsigned long)req,
22775a5e590SMing Lei 			    desc, GFP_KERNEL);
22875a5e590SMing Lei }
22975a5e590SMing Lei 
ublk_zoned_erase_report_desc(const struct request * req)23075a5e590SMing Lei static struct ublk_zoned_report_desc *ublk_zoned_erase_report_desc(
23175a5e590SMing Lei 		const struct request *req)
23275a5e590SMing Lei {
23375a5e590SMing Lei 	return xa_erase(&ublk_zoned_report_descs, (unsigned long)req);
23475a5e590SMing Lei }
23575a5e590SMing Lei 
ublk_zoned_get_report_desc(const struct request * req)23675a5e590SMing Lei static struct ublk_zoned_report_desc *ublk_zoned_get_report_desc(
23775a5e590SMing Lei 		const struct request *req)
23875a5e590SMing Lei {
23975a5e590SMing Lei 	return xa_load(&ublk_zoned_report_descs, (unsigned long)req);
24075a5e590SMing Lei }
24175a5e590SMing Lei 
ublk_get_nr_zones(const struct ublk_device * ub)24229802d7cSAndreas Hindborg static int ublk_get_nr_zones(const struct ublk_device *ub)
24329802d7cSAndreas Hindborg {
24429802d7cSAndreas Hindborg 	const struct ublk_param_basic *p = &ub->params.basic;
24529802d7cSAndreas Hindborg 
24629802d7cSAndreas Hindborg 	/* Zone size is a power of 2 */
24729802d7cSAndreas Hindborg 	return p->dev_sectors >> ilog2(p->chunk_sectors);
24829802d7cSAndreas Hindborg }
24929802d7cSAndreas Hindborg 
ublk_revalidate_disk_zones(struct ublk_device * ub)25029802d7cSAndreas Hindborg static int ublk_revalidate_disk_zones(struct ublk_device *ub)
25129802d7cSAndreas Hindborg {
25229802d7cSAndreas Hindborg 	return blk_revalidate_disk_zones(ub->ub_disk, NULL);
25329802d7cSAndreas Hindborg }
25429802d7cSAndreas Hindborg 
ublk_dev_param_zoned_validate(const struct ublk_device * ub)25529802d7cSAndreas Hindborg static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
25629802d7cSAndreas Hindborg {
25729802d7cSAndreas Hindborg 	const struct ublk_param_zoned *p = &ub->params.zoned;
25829802d7cSAndreas Hindborg 	int nr_zones;
25929802d7cSAndreas Hindborg 
26029802d7cSAndreas Hindborg 	if (!ublk_dev_is_zoned(ub))
26129802d7cSAndreas Hindborg 		return -EINVAL;
26229802d7cSAndreas Hindborg 
26329802d7cSAndreas Hindborg 	if (!p->max_zone_append_sectors)
26429802d7cSAndreas Hindborg 		return -EINVAL;
26529802d7cSAndreas Hindborg 
26629802d7cSAndreas Hindborg 	nr_zones = ublk_get_nr_zones(ub);
26729802d7cSAndreas Hindborg 
26829802d7cSAndreas Hindborg 	if (p->max_active_zones > nr_zones)
26929802d7cSAndreas Hindborg 		return -EINVAL;
27029802d7cSAndreas Hindborg 
27129802d7cSAndreas Hindborg 	if (p->max_open_zones > nr_zones)
27229802d7cSAndreas Hindborg 		return -EINVAL;
27329802d7cSAndreas Hindborg 
27429802d7cSAndreas Hindborg 	return 0;
27529802d7cSAndreas Hindborg }
27629802d7cSAndreas Hindborg 
ublk_dev_param_zoned_apply(struct ublk_device * ub)27729802d7cSAndreas Hindborg static int ublk_dev_param_zoned_apply(struct ublk_device *ub)
27829802d7cSAndreas Hindborg {
27929802d7cSAndreas Hindborg 	const struct ublk_param_zoned *p = &ub->params.zoned;
28029802d7cSAndreas Hindborg 
28129802d7cSAndreas Hindborg 	disk_set_zoned(ub->ub_disk, BLK_ZONED_HM);
282851e0629SMing Lei 	blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ub->ub_disk->queue);
28329802d7cSAndreas Hindborg 	blk_queue_required_elevator_features(ub->ub_disk->queue,
28429802d7cSAndreas Hindborg 					     ELEVATOR_F_ZBD_SEQ_WRITE);
28529802d7cSAndreas Hindborg 	disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
28629802d7cSAndreas Hindborg 	disk_set_max_open_zones(ub->ub_disk, p->max_open_zones);
28729802d7cSAndreas Hindborg 	blk_queue_max_zone_append_sectors(ub->ub_disk->queue, p->max_zone_append_sectors);
28829802d7cSAndreas Hindborg 
28929802d7cSAndreas Hindborg 	ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
29029802d7cSAndreas Hindborg 
29129802d7cSAndreas Hindborg 	return 0;
29229802d7cSAndreas Hindborg }
29329802d7cSAndreas Hindborg 
29429802d7cSAndreas Hindborg /* Based on virtblk_alloc_report_buffer */
ublk_alloc_report_buffer(struct ublk_device * ublk,unsigned int nr_zones,size_t * buflen)29529802d7cSAndreas Hindborg static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
29629802d7cSAndreas Hindborg 				      unsigned int nr_zones, size_t *buflen)
29729802d7cSAndreas Hindborg {
29829802d7cSAndreas Hindborg 	struct request_queue *q = ublk->ub_disk->queue;
29929802d7cSAndreas Hindborg 	size_t bufsize;
30029802d7cSAndreas Hindborg 	void *buf;
30129802d7cSAndreas Hindborg 
30229802d7cSAndreas Hindborg 	nr_zones = min_t(unsigned int, nr_zones,
30329802d7cSAndreas Hindborg 			 ublk->ub_disk->nr_zones);
30429802d7cSAndreas Hindborg 
30529802d7cSAndreas Hindborg 	bufsize = nr_zones * sizeof(struct blk_zone);
30629802d7cSAndreas Hindborg 	bufsize =
30729802d7cSAndreas Hindborg 		min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);
30829802d7cSAndreas Hindborg 
30929802d7cSAndreas Hindborg 	while (bufsize >= sizeof(struct blk_zone)) {
31029802d7cSAndreas Hindborg 		buf = kvmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
31129802d7cSAndreas Hindborg 		if (buf) {
31229802d7cSAndreas Hindborg 			*buflen = bufsize;
31329802d7cSAndreas Hindborg 			return buf;
31429802d7cSAndreas Hindborg 		}
31529802d7cSAndreas Hindborg 		bufsize >>= 1;
31629802d7cSAndreas Hindborg 	}
31729802d7cSAndreas Hindborg 
31829802d7cSAndreas Hindborg 	*buflen = 0;
31929802d7cSAndreas Hindborg 	return NULL;
32029802d7cSAndreas Hindborg }
32129802d7cSAndreas Hindborg 
ublk_report_zones(struct gendisk * disk,sector_t sector,unsigned int nr_zones,report_zones_cb cb,void * data)32229802d7cSAndreas Hindborg static int ublk_report_zones(struct gendisk *disk, sector_t sector,
32329802d7cSAndreas Hindborg 		      unsigned int nr_zones, report_zones_cb cb, void *data)
32429802d7cSAndreas Hindborg {
32529802d7cSAndreas Hindborg 	struct ublk_device *ub = disk->private_data;
32629802d7cSAndreas Hindborg 	unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
32729802d7cSAndreas Hindborg 	unsigned int first_zone = sector >> ilog2(zone_size_sectors);
32829802d7cSAndreas Hindborg 	unsigned int done_zones = 0;
32929802d7cSAndreas Hindborg 	unsigned int max_zones_per_request;
33029802d7cSAndreas Hindborg 	int ret;
33129802d7cSAndreas Hindborg 	struct blk_zone *buffer;
33229802d7cSAndreas Hindborg 	size_t buffer_length;
33329802d7cSAndreas Hindborg 
33429802d7cSAndreas Hindborg 	nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
33529802d7cSAndreas Hindborg 			 nr_zones);
33629802d7cSAndreas Hindborg 
33729802d7cSAndreas Hindborg 	buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length);
33829802d7cSAndreas Hindborg 	if (!buffer)
33929802d7cSAndreas Hindborg 		return -ENOMEM;
34029802d7cSAndreas Hindborg 
34129802d7cSAndreas Hindborg 	max_zones_per_request = buffer_length / sizeof(struct blk_zone);
34229802d7cSAndreas Hindborg 
34329802d7cSAndreas Hindborg 	while (done_zones < nr_zones) {
34429802d7cSAndreas Hindborg 		unsigned int remaining_zones = nr_zones - done_zones;
34529802d7cSAndreas Hindborg 		unsigned int zones_in_request =
34629802d7cSAndreas Hindborg 			min_t(unsigned int, remaining_zones, max_zones_per_request);
34729802d7cSAndreas Hindborg 		struct request *req;
34875a5e590SMing Lei 		struct ublk_zoned_report_desc desc;
34929802d7cSAndreas Hindborg 		blk_status_t status;
35029802d7cSAndreas Hindborg 
35129802d7cSAndreas Hindborg 		memset(buffer, 0, buffer_length);
35229802d7cSAndreas Hindborg 
35329802d7cSAndreas Hindborg 		req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
35429802d7cSAndreas Hindborg 		if (IS_ERR(req)) {
35529802d7cSAndreas Hindborg 			ret = PTR_ERR(req);
35629802d7cSAndreas Hindborg 			goto out;
35729802d7cSAndreas Hindborg 		}
35829802d7cSAndreas Hindborg 
35975a5e590SMing Lei 		desc.operation = UBLK_IO_OP_REPORT_ZONES;
36075a5e590SMing Lei 		desc.sector = sector;
36175a5e590SMing Lei 		desc.nr_zones = zones_in_request;
36275a5e590SMing Lei 		ret = ublk_zoned_insert_report_desc(req, &desc);
36375a5e590SMing Lei 		if (ret)
36475a5e590SMing Lei 			goto free_req;
36529802d7cSAndreas Hindborg 
36629802d7cSAndreas Hindborg 		ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length,
36729802d7cSAndreas Hindborg 					GFP_KERNEL);
36875a5e590SMing Lei 		if (ret)
36975a5e590SMing Lei 			goto erase_desc;
37029802d7cSAndreas Hindborg 
37129802d7cSAndreas Hindborg 		status = blk_execute_rq(req, 0);
37229802d7cSAndreas Hindborg 		ret = blk_status_to_errno(status);
37375a5e590SMing Lei erase_desc:
37475a5e590SMing Lei 		ublk_zoned_erase_report_desc(req);
37575a5e590SMing Lei free_req:
37629802d7cSAndreas Hindborg 		blk_mq_free_request(req);
37729802d7cSAndreas Hindborg 		if (ret)
37829802d7cSAndreas Hindborg 			goto out;
37929802d7cSAndreas Hindborg 
38029802d7cSAndreas Hindborg 		for (unsigned int i = 0; i < zones_in_request; i++) {
38129802d7cSAndreas Hindborg 			struct blk_zone *zone = buffer + i;
38229802d7cSAndreas Hindborg 
38329802d7cSAndreas Hindborg 			/* A zero length zone means no more zones in this response */
38429802d7cSAndreas Hindborg 			if (!zone->len)
38529802d7cSAndreas Hindborg 				break;
38629802d7cSAndreas Hindborg 
38729802d7cSAndreas Hindborg 			ret = cb(zone, i, data);
38829802d7cSAndreas Hindborg 			if (ret)
38929802d7cSAndreas Hindborg 				goto out;
39029802d7cSAndreas Hindborg 
39129802d7cSAndreas Hindborg 			done_zones++;
39229802d7cSAndreas Hindborg 			sector += zone_size_sectors;
39329802d7cSAndreas Hindborg 
39429802d7cSAndreas Hindborg 		}
39529802d7cSAndreas Hindborg 	}
39629802d7cSAndreas Hindborg 
39729802d7cSAndreas Hindborg 	ret = done_zones;
39829802d7cSAndreas Hindborg 
39929802d7cSAndreas Hindborg out:
40029802d7cSAndreas Hindborg 	kvfree(buffer);
40129802d7cSAndreas Hindborg 	return ret;
40229802d7cSAndreas Hindborg }
40329802d7cSAndreas Hindborg 
ublk_setup_iod_zoned(struct ublk_queue * ubq,struct request * req)40429802d7cSAndreas Hindborg static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
40529802d7cSAndreas Hindborg 					 struct request *req)
40629802d7cSAndreas Hindborg {
40729802d7cSAndreas Hindborg 	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
40829802d7cSAndreas Hindborg 	struct ublk_io *io = &ubq->ios[req->tag];
40975a5e590SMing Lei 	struct ublk_zoned_report_desc *desc;
41029802d7cSAndreas Hindborg 	u32 ublk_op;
41129802d7cSAndreas Hindborg 
41229802d7cSAndreas Hindborg 	switch (req_op(req)) {
41329802d7cSAndreas Hindborg 	case REQ_OP_ZONE_OPEN:
41429802d7cSAndreas Hindborg 		ublk_op = UBLK_IO_OP_ZONE_OPEN;
41529802d7cSAndreas Hindborg 		break;
41629802d7cSAndreas Hindborg 	case REQ_OP_ZONE_CLOSE:
41729802d7cSAndreas Hindborg 		ublk_op = UBLK_IO_OP_ZONE_CLOSE;
41829802d7cSAndreas Hindborg 		break;
41929802d7cSAndreas Hindborg 	case REQ_OP_ZONE_FINISH:
42029802d7cSAndreas Hindborg 		ublk_op = UBLK_IO_OP_ZONE_FINISH;
42129802d7cSAndreas Hindborg 		break;
42229802d7cSAndreas Hindborg 	case REQ_OP_ZONE_RESET:
42329802d7cSAndreas Hindborg 		ublk_op = UBLK_IO_OP_ZONE_RESET;
42429802d7cSAndreas Hindborg 		break;
42529802d7cSAndreas Hindborg 	case REQ_OP_ZONE_APPEND:
42629802d7cSAndreas Hindborg 		ublk_op = UBLK_IO_OP_ZONE_APPEND;
42729802d7cSAndreas Hindborg 		break;
428851e0629SMing Lei 	case REQ_OP_ZONE_RESET_ALL:
429851e0629SMing Lei 		ublk_op = UBLK_IO_OP_ZONE_RESET_ALL;
430851e0629SMing Lei 		break;
43129802d7cSAndreas Hindborg 	case REQ_OP_DRV_IN:
43275a5e590SMing Lei 		desc = ublk_zoned_get_report_desc(req);
43375a5e590SMing Lei 		if (!desc)
43475a5e590SMing Lei 			return BLK_STS_IOERR;
43575a5e590SMing Lei 		ublk_op = desc->operation;
43629802d7cSAndreas Hindborg 		switch (ublk_op) {
43729802d7cSAndreas Hindborg 		case UBLK_IO_OP_REPORT_ZONES:
43829802d7cSAndreas Hindborg 			iod->op_flags = ublk_op | ublk_req_build_flags(req);
43975a5e590SMing Lei 			iod->nr_zones = desc->nr_zones;
44075a5e590SMing Lei 			iod->start_sector = desc->sector;
44129802d7cSAndreas Hindborg 			return BLK_STS_OK;
44229802d7cSAndreas Hindborg 		default:
44329802d7cSAndreas Hindborg 			return BLK_STS_IOERR;
44429802d7cSAndreas Hindborg 		}
44529802d7cSAndreas Hindborg 	case REQ_OP_DRV_OUT:
446851e0629SMing Lei 		/* We do not support drv_out */
44729802d7cSAndreas Hindborg 		return BLK_STS_NOTSUPP;
44829802d7cSAndreas Hindborg 	default:
44929802d7cSAndreas Hindborg 		return BLK_STS_IOERR;
45029802d7cSAndreas Hindborg 	}
45129802d7cSAndreas Hindborg 
45229802d7cSAndreas Hindborg 	iod->op_flags = ublk_op | ublk_req_build_flags(req);
45329802d7cSAndreas Hindborg 	iod->nr_sectors = blk_rq_sectors(req);
45429802d7cSAndreas Hindborg 	iod->start_sector = blk_rq_pos(req);
45529802d7cSAndreas Hindborg 	iod->addr = io->addr;
45629802d7cSAndreas Hindborg 
45729802d7cSAndreas Hindborg 	return BLK_STS_OK;
45829802d7cSAndreas Hindborg }
45929802d7cSAndreas Hindborg 
46029802d7cSAndreas Hindborg #else
46129802d7cSAndreas Hindborg 
46229802d7cSAndreas Hindborg #define ublk_report_zones (NULL)
46329802d7cSAndreas Hindborg 
ublk_dev_param_zoned_validate(const struct ublk_device * ub)46429802d7cSAndreas Hindborg static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
46529802d7cSAndreas Hindborg {
46629802d7cSAndreas Hindborg 	return -EOPNOTSUPP;
46729802d7cSAndreas Hindborg }
46829802d7cSAndreas Hindborg 
ublk_dev_param_zoned_apply(struct ublk_device * ub)46929802d7cSAndreas Hindborg static int ublk_dev_param_zoned_apply(struct ublk_device *ub)
47029802d7cSAndreas Hindborg {
47129802d7cSAndreas Hindborg 	return -EOPNOTSUPP;
47229802d7cSAndreas Hindborg }
47329802d7cSAndreas Hindborg 
ublk_revalidate_disk_zones(struct ublk_device * ub)47429802d7cSAndreas Hindborg static int ublk_revalidate_disk_zones(struct ublk_device *ub)
47529802d7cSAndreas Hindborg {
47629802d7cSAndreas Hindborg 	return 0;
47729802d7cSAndreas Hindborg }
47829802d7cSAndreas Hindborg 
ublk_setup_iod_zoned(struct ublk_queue * ubq,struct request * req)47929802d7cSAndreas Hindborg static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
48029802d7cSAndreas Hindborg 					 struct request *req)
48129802d7cSAndreas Hindborg {
482c8659bbbSLi Zetao 	return BLK_STS_NOTSUPP;
48329802d7cSAndreas Hindborg }
48429802d7cSAndreas Hindborg 
48529802d7cSAndreas Hindborg #endif
48629802d7cSAndreas Hindborg 
48782840669SMing Lei static inline void __ublk_complete_rq(struct request *req);
48882840669SMing Lei static void ublk_complete_rq(struct kref *ref);
48982840669SMing Lei 
49071f28f31SMing Lei static dev_t ublk_chr_devt;
4912eefd399SIvan Orlov static const struct class ublk_chr_class = {
4922eefd399SIvan Orlov 	.name = "ublk-char",
4932eefd399SIvan Orlov };
49471f28f31SMing Lei 
49571f28f31SMing Lei static DEFINE_IDR(ublk_index_idr);
49671f28f31SMing Lei static DEFINE_SPINLOCK(ublk_idr_lock);
49771f28f31SMing Lei static wait_queue_head_t ublk_idr_wq;	/* wait until one idr is freed */
49871f28f31SMing Lei 
49971f28f31SMing Lei static DEFINE_MUTEX(ublk_ctl_mutex);
50071f28f31SMing Lei 
501403ebc87SMing Lei /*
502403ebc87SMing Lei  * Max ublk devices allowed to add
503403ebc87SMing Lei  *
504403ebc87SMing Lei  * It can be extended to one per-user limit in future or even controlled
505403ebc87SMing Lei  * by cgroup.
506403ebc87SMing Lei  */
507403ebc87SMing Lei static unsigned int ublks_max = 64;
508403ebc87SMing Lei static unsigned int ublks_added;	/* protected by ublk_ctl_mutex */
509403ebc87SMing Lei 
51071f28f31SMing Lei static struct miscdevice ublk_misc;
51171f28f31SMing Lei 
ublk_pos_to_hwq(loff_t pos)51262fe99ceSMing Lei static inline unsigned ublk_pos_to_hwq(loff_t pos)
51362fe99ceSMing Lei {
51462fe99ceSMing Lei 	return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_QID_OFF) &
51562fe99ceSMing Lei 		UBLK_QID_BITS_MASK;
51662fe99ceSMing Lei }
51762fe99ceSMing Lei 
ublk_pos_to_buf_off(loff_t pos)51862fe99ceSMing Lei static inline unsigned ublk_pos_to_buf_off(loff_t pos)
51962fe99ceSMing Lei {
52062fe99ceSMing Lei 	return (pos - UBLKSRV_IO_BUF_OFFSET) & UBLK_IO_BUF_BITS_MASK;
52162fe99ceSMing Lei }
52262fe99ceSMing Lei 
ublk_pos_to_tag(loff_t pos)52362fe99ceSMing Lei static inline unsigned ublk_pos_to_tag(loff_t pos)
52462fe99ceSMing Lei {
52562fe99ceSMing Lei 	return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_TAG_OFF) &
52662fe99ceSMing Lei 		UBLK_TAG_BITS_MASK;
52762fe99ceSMing Lei }
52862fe99ceSMing Lei 
ublk_dev_param_basic_apply(struct ublk_device * ub)5290aa73170SMing Lei static void ublk_dev_param_basic_apply(struct ublk_device *ub)
5300aa73170SMing Lei {
5310aa73170SMing Lei 	struct request_queue *q = ub->ub_disk->queue;
5320aa73170SMing Lei 	const struct ublk_param_basic *p = &ub->params.basic;
5330aa73170SMing Lei 
5340aa73170SMing Lei 	blk_queue_logical_block_size(q, 1 << p->logical_bs_shift);
5350aa73170SMing Lei 	blk_queue_physical_block_size(q, 1 << p->physical_bs_shift);
5360aa73170SMing Lei 	blk_queue_io_min(q, 1 << p->io_min_shift);
5370aa73170SMing Lei 	blk_queue_io_opt(q, 1 << p->io_opt_shift);
5380aa73170SMing Lei 
5390aa73170SMing Lei 	blk_queue_write_cache(q, p->attrs & UBLK_ATTR_VOLATILE_CACHE,
5400aa73170SMing Lei 			p->attrs & UBLK_ATTR_FUA);
5410aa73170SMing Lei 	if (p->attrs & UBLK_ATTR_ROTATIONAL)
5420aa73170SMing Lei 		blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
5430aa73170SMing Lei 	else
5440aa73170SMing Lei 		blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
5450aa73170SMing Lei 
5460aa73170SMing Lei 	blk_queue_max_hw_sectors(q, p->max_sectors);
5470aa73170SMing Lei 	blk_queue_chunk_sectors(q, p->chunk_sectors);
5480aa73170SMing Lei 	blk_queue_virt_boundary(q, p->virt_boundary_mask);
5490aa73170SMing Lei 
5500aa73170SMing Lei 	if (p->attrs & UBLK_ATTR_READ_ONLY)
5510aa73170SMing Lei 		set_disk_ro(ub->ub_disk, true);
5520aa73170SMing Lei 
5530aa73170SMing Lei 	set_capacity(ub->ub_disk, p->dev_sectors);
5540aa73170SMing Lei }
5550aa73170SMing Lei 
ublk_dev_param_discard_apply(struct ublk_device * ub)5560aa73170SMing Lei static void ublk_dev_param_discard_apply(struct ublk_device *ub)
5570aa73170SMing Lei {
5580aa73170SMing Lei 	struct request_queue *q = ub->ub_disk->queue;
5590aa73170SMing Lei 	const struct ublk_param_discard *p = &ub->params.discard;
5600aa73170SMing Lei 
5610aa73170SMing Lei 	q->limits.discard_alignment = p->discard_alignment;
5620aa73170SMing Lei 	q->limits.discard_granularity = p->discard_granularity;
5630aa73170SMing Lei 	blk_queue_max_discard_sectors(q, p->max_discard_sectors);
5640aa73170SMing Lei 	blk_queue_max_write_zeroes_sectors(q,
5650aa73170SMing Lei 			p->max_write_zeroes_sectors);
5660aa73170SMing Lei 	blk_queue_max_discard_segments(q, p->max_discard_segments);
5670aa73170SMing Lei }
5680aa73170SMing Lei 
ublk_validate_params(const struct ublk_device * ub)5690aa73170SMing Lei static int ublk_validate_params(const struct ublk_device *ub)
5700aa73170SMing Lei {
5710aa73170SMing Lei 	/* basic param is the only one which must be set */
5720aa73170SMing Lei 	if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
5730aa73170SMing Lei 		const struct ublk_param_basic *p = &ub->params.basic;
5740aa73170SMing Lei 
5751d166527SMing Lei 		if (p->logical_bs_shift > PAGE_SHIFT || p->logical_bs_shift < 9)
5760aa73170SMing Lei 			return -EINVAL;
5770aa73170SMing Lei 
5780aa73170SMing Lei 		if (p->logical_bs_shift > p->physical_bs_shift)
5790aa73170SMing Lei 			return -EINVAL;
5800aa73170SMing Lei 
5814bf9cbf3SMing Lei 		if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
5820aa73170SMing Lei 			return -EINVAL;
58329802d7cSAndreas Hindborg 
58429802d7cSAndreas Hindborg 		if (ublk_dev_is_zoned(ub) && !p->chunk_sectors)
58529802d7cSAndreas Hindborg 			return -EINVAL;
5860aa73170SMing Lei 	} else
5870aa73170SMing Lei 		return -EINVAL;
5880aa73170SMing Lei 
5890aa73170SMing Lei 	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
5900aa73170SMing Lei 		const struct ublk_param_discard *p = &ub->params.discard;
5910aa73170SMing Lei 
5920aa73170SMing Lei 		/* So far, only support single segment discard */
5930aa73170SMing Lei 		if (p->max_discard_sectors && p->max_discard_segments != 1)
5940aa73170SMing Lei 			return -EINVAL;
5950aa73170SMing Lei 
5960aa73170SMing Lei 		if (!p->discard_granularity)
5970aa73170SMing Lei 			return -EINVAL;
5980aa73170SMing Lei 	}
5990aa73170SMing Lei 
600abb864d3SMing Lei 	/* dev_t is read-only */
601abb864d3SMing Lei 	if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
602abb864d3SMing Lei 		return -EINVAL;
603abb864d3SMing Lei 
60429802d7cSAndreas Hindborg 	if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
60529802d7cSAndreas Hindborg 		return ublk_dev_param_zoned_validate(ub);
60629802d7cSAndreas Hindborg 	else if (ublk_dev_is_zoned(ub))
60729802d7cSAndreas Hindborg 		return -EINVAL;
60829802d7cSAndreas Hindborg 
6090aa73170SMing Lei 	return 0;
6100aa73170SMing Lei }
6110aa73170SMing Lei 
ublk_apply_params(struct ublk_device * ub)6120aa73170SMing Lei static int ublk_apply_params(struct ublk_device *ub)
6130aa73170SMing Lei {
6140aa73170SMing Lei 	if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
6150aa73170SMing Lei 		return -EINVAL;
6160aa73170SMing Lei 
6170aa73170SMing Lei 	ublk_dev_param_basic_apply(ub);
6180aa73170SMing Lei 
6190aa73170SMing Lei 	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD)
6200aa73170SMing Lei 		ublk_dev_param_discard_apply(ub);
6210aa73170SMing Lei 
62229802d7cSAndreas Hindborg 	if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
62329802d7cSAndreas Hindborg 		return ublk_dev_param_zoned_apply(ub);
62429802d7cSAndreas Hindborg 
6250aa73170SMing Lei 	return 0;
6260aa73170SMing Lei }
6270aa73170SMing Lei 
ublk_support_user_copy(const struct ublk_queue * ubq)6281172d5b8SMing Lei static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
6290edb3696SMing Lei {
6301172d5b8SMing Lei 	return ubq->flags & UBLK_F_USER_COPY;
6311172d5b8SMing Lei }
6321172d5b8SMing Lei 
ublk_need_req_ref(const struct ublk_queue * ubq)63382840669SMing Lei static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
63482840669SMing Lei {
6351172d5b8SMing Lei 	/*
6361172d5b8SMing Lei 	 * read()/write() is involved in user copy, so request reference
6371172d5b8SMing Lei 	 * has to be grabbed
6381172d5b8SMing Lei 	 */
6391172d5b8SMing Lei 	return ublk_support_user_copy(ubq);
64082840669SMing Lei }
64182840669SMing Lei 
ublk_init_req_ref(const struct ublk_queue * ubq,struct request * req)64282840669SMing Lei static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
64382840669SMing Lei 		struct request *req)
64482840669SMing Lei {
64582840669SMing Lei 	if (ublk_need_req_ref(ubq)) {
64682840669SMing Lei 		struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
64782840669SMing Lei 
64882840669SMing Lei 		kref_init(&data->ref);
64982840669SMing Lei 	}
65082840669SMing Lei }
65182840669SMing Lei 
ublk_get_req_ref(const struct ublk_queue * ubq,struct request * req)65282840669SMing Lei static inline bool ublk_get_req_ref(const struct ublk_queue *ubq,
65382840669SMing Lei 		struct request *req)
65482840669SMing Lei {
65582840669SMing Lei 	if (ublk_need_req_ref(ubq)) {
65682840669SMing Lei 		struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
65782840669SMing Lei 
65882840669SMing Lei 		return kref_get_unless_zero(&data->ref);
65982840669SMing Lei 	}
66082840669SMing Lei 
6610edb3696SMing Lei 	return true;
66282840669SMing Lei }
66382840669SMing Lei 
ublk_put_req_ref(const struct ublk_queue * ubq,struct request * req)66482840669SMing Lei static inline void ublk_put_req_ref(const struct ublk_queue *ubq,
66582840669SMing Lei 		struct request *req)
66682840669SMing Lei {
66782840669SMing Lei 	if (ublk_need_req_ref(ubq)) {
66882840669SMing Lei 		struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
66982840669SMing Lei 
67082840669SMing Lei 		kref_put(&data->ref, ublk_complete_rq);
67182840669SMing Lei 	} else {
67282840669SMing Lei 		__ublk_complete_rq(req);
67382840669SMing Lei 	}
6740edb3696SMing Lei }
6750edb3696SMing Lei 
ublk_need_get_data(const struct ublk_queue * ubq)676c86019ffSZiyangZhang static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
677c86019ffSZiyangZhang {
67896cf2f54SMing Lei 	return ubq->flags & UBLK_F_NEED_GET_DATA;
679c86019ffSZiyangZhang }
680c86019ffSZiyangZhang 
ublk_get_device(struct ublk_device * ub)68171f28f31SMing Lei static struct ublk_device *ublk_get_device(struct ublk_device *ub)
68271f28f31SMing Lei {
68371f28f31SMing Lei 	if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
68471f28f31SMing Lei 		return ub;
68571f28f31SMing Lei 	return NULL;
68671f28f31SMing Lei }
68771f28f31SMing Lei 
ublk_put_device(struct ublk_device * ub)68871f28f31SMing Lei static void ublk_put_device(struct ublk_device *ub)
68971f28f31SMing Lei {
69071f28f31SMing Lei 	put_device(&ub->cdev_dev);
69171f28f31SMing Lei }
69271f28f31SMing Lei 
ublk_get_queue(struct ublk_device * dev,int qid)69371f28f31SMing Lei static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
69471f28f31SMing Lei 		int qid)
69571f28f31SMing Lei {
69671f28f31SMing Lei        return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]);
69771f28f31SMing Lei }
69871f28f31SMing Lei 
ublk_rq_has_data(const struct request * rq)69971f28f31SMing Lei static inline bool ublk_rq_has_data(const struct request *rq)
70071f28f31SMing Lei {
701731e208dSZiyang Zhang 	return bio_has_data(rq->bio);
70271f28f31SMing Lei }
70371f28f31SMing Lei 
ublk_get_iod(struct ublk_queue * ubq,int tag)70471f28f31SMing Lei static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
70571f28f31SMing Lei 		int tag)
70671f28f31SMing Lei {
70771f28f31SMing Lei 	return (struct ublksrv_io_desc *)
70871f28f31SMing Lei 		&(ubq->io_cmd_buf[tag * sizeof(struct ublksrv_io_desc)]);
70971f28f31SMing Lei }
71071f28f31SMing Lei 
ublk_queue_cmd_buf(struct ublk_device * ub,int q_id)71171f28f31SMing Lei static inline char *ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
71271f28f31SMing Lei {
71371f28f31SMing Lei 	return ublk_get_queue(ub, q_id)->io_cmd_buf;
71471f28f31SMing Lei }
71571f28f31SMing Lei 
__ublk_queue_cmd_buf_size(int depth)71680f25003SMing Lei static inline int __ublk_queue_cmd_buf_size(int depth)
71780f25003SMing Lei {
71880f25003SMing Lei 	return round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE);
71980f25003SMing Lei }
72080f25003SMing Lei 
ublk_queue_cmd_buf_size(struct ublk_device * ub,int q_id)72171f28f31SMing Lei static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id)
72271f28f31SMing Lei {
72371f28f31SMing Lei 	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
72471f28f31SMing Lei 
72580f25003SMing Lei 	return __ublk_queue_cmd_buf_size(ubq->q_depth);
72680f25003SMing Lei }
72780f25003SMing Lei 
ublk_max_cmd_buf_size(void)72880f25003SMing Lei static int ublk_max_cmd_buf_size(void)
72980f25003SMing Lei {
73080f25003SMing Lei 	return __ublk_queue_cmd_buf_size(UBLK_MAX_QUEUE_DEPTH);
73171f28f31SMing Lei }
73271f28f31SMing Lei 
ublk_queue_can_use_recovery_reissue(struct ublk_queue * ubq)733a0d41dc1SZiyangZhang static inline bool ublk_queue_can_use_recovery_reissue(
734a0d41dc1SZiyangZhang 		struct ublk_queue *ubq)
735a0d41dc1SZiyangZhang {
73696cf2f54SMing Lei 	return (ubq->flags & UBLK_F_USER_RECOVERY) &&
73796cf2f54SMing Lei 			(ubq->flags & UBLK_F_USER_RECOVERY_REISSUE);
738a0d41dc1SZiyangZhang }
739a0d41dc1SZiyangZhang 
ublk_queue_can_use_recovery(struct ublk_queue * ubq)74077a440e2SZiyangZhang static inline bool ublk_queue_can_use_recovery(
74177a440e2SZiyangZhang 		struct ublk_queue *ubq)
74277a440e2SZiyangZhang {
74396cf2f54SMing Lei 	return ubq->flags & UBLK_F_USER_RECOVERY;
74477a440e2SZiyangZhang }
74577a440e2SZiyangZhang 
ublk_can_use_recovery(struct ublk_device * ub)74677a440e2SZiyangZhang static inline bool ublk_can_use_recovery(struct ublk_device *ub)
74777a440e2SZiyangZhang {
74896cf2f54SMing Lei 	return ub->dev_info.flags & UBLK_F_USER_RECOVERY;
74977a440e2SZiyangZhang }
75077a440e2SZiyangZhang 
ublk_free_disk(struct gendisk * disk)7516d9e6dfdSChristoph Hellwig static void ublk_free_disk(struct gendisk *disk)
7526d9e6dfdSChristoph Hellwig {
7536d9e6dfdSChristoph Hellwig 	struct ublk_device *ub = disk->private_data;
7546d9e6dfdSChristoph Hellwig 
7556d9e6dfdSChristoph Hellwig 	clear_bit(UB_STATE_USED, &ub->state);
7566d9e6dfdSChristoph Hellwig 	put_device(&ub->cdev_dev);
7576d9e6dfdSChristoph Hellwig }
7586d9e6dfdSChristoph Hellwig 
ublk_store_owner_uid_gid(unsigned int * owner_uid,unsigned int * owner_gid)75948a90519SMing Lei static void ublk_store_owner_uid_gid(unsigned int *owner_uid,
76048a90519SMing Lei 		unsigned int *owner_gid)
76148a90519SMing Lei {
76248a90519SMing Lei 	kuid_t uid;
76348a90519SMing Lei 	kgid_t gid;
76448a90519SMing Lei 
76548a90519SMing Lei 	current_uid_gid(&uid, &gid);
76648a90519SMing Lei 
76748a90519SMing Lei 	*owner_uid = from_kuid(&init_user_ns, uid);
76848a90519SMing Lei 	*owner_gid = from_kgid(&init_user_ns, gid);
76948a90519SMing Lei }
77048a90519SMing Lei 
ublk_open(struct gendisk * disk,blk_mode_t mode)77105bdb996SChristoph Hellwig static int ublk_open(struct gendisk *disk, blk_mode_t mode)
77248a90519SMing Lei {
773d32e2bf8SChristoph Hellwig 	struct ublk_device *ub = disk->private_data;
77448a90519SMing Lei 
77548a90519SMing Lei 	if (capable(CAP_SYS_ADMIN))
77648a90519SMing Lei 		return 0;
77748a90519SMing Lei 
77848a90519SMing Lei 	/*
77948a90519SMing Lei 	 * If it is one unprivileged device, only owner can open
78048a90519SMing Lei 	 * the disk. Otherwise it could be one trap made by one
78148a90519SMing Lei 	 * evil user who grants this disk's privileges to other
78248a90519SMing Lei 	 * users deliberately.
78348a90519SMing Lei 	 *
78448a90519SMing Lei 	 * This way is reasonable too given anyone can create
78548a90519SMing Lei 	 * unprivileged device, and no need other's grant.
78648a90519SMing Lei 	 */
78748a90519SMing Lei 	if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) {
78848a90519SMing Lei 		unsigned int curr_uid, curr_gid;
78948a90519SMing Lei 
79048a90519SMing Lei 		ublk_store_owner_uid_gid(&curr_uid, &curr_gid);
79148a90519SMing Lei 
79248a90519SMing Lei 		if (curr_uid != ub->dev_info.owner_uid || curr_gid !=
79348a90519SMing Lei 				ub->dev_info.owner_gid)
79448a90519SMing Lei 			return -EPERM;
79548a90519SMing Lei 	}
79648a90519SMing Lei 
79748a90519SMing Lei 	return 0;
79848a90519SMing Lei }
79948a90519SMing Lei 
80071f28f31SMing Lei static const struct block_device_operations ub_fops = {
80171f28f31SMing Lei 	.owner =	THIS_MODULE,
80248a90519SMing Lei 	.open =		ublk_open,
8036d9e6dfdSChristoph Hellwig 	.free_disk =	ublk_free_disk,
80429802d7cSAndreas Hindborg 	.report_zones =	ublk_report_zones,
80571f28f31SMing Lei };
80671f28f31SMing Lei 
80771f28f31SMing Lei #define UBLK_MAX_PIN_PAGES	32
80871f28f31SMing Lei 
80971f28f31SMing Lei struct ublk_io_iter {
81071f28f31SMing Lei 	struct page *pages[UBLK_MAX_PIN_PAGES];
81171f28f31SMing Lei 	struct bio *bio;
81271f28f31SMing Lei 	struct bvec_iter iter;
81371f28f31SMing Lei };
81471f28f31SMing Lei 
815981f95a5SMing Lei /* return how many pages are copied */
ublk_copy_io_pages(struct ublk_io_iter * data,size_t total,size_t pg_off,int dir)816981f95a5SMing Lei static void ublk_copy_io_pages(struct ublk_io_iter *data,
817981f95a5SMing Lei 		size_t total, size_t pg_off, int dir)
81871f28f31SMing Lei {
81971f28f31SMing Lei 	unsigned done = 0;
82071f28f31SMing Lei 	unsigned pg_idx = 0;
82171f28f31SMing Lei 
82271f28f31SMing Lei 	while (done < total) {
82371f28f31SMing Lei 		struct bio_vec bv = bio_iter_iovec(data->bio, data->iter);
824981f95a5SMing Lei 		unsigned int bytes = min3(bv.bv_len, (unsigned)total - done,
825981f95a5SMing Lei 				(unsigned)(PAGE_SIZE - pg_off));
82671f28f31SMing Lei 		void *bv_buf = bvec_kmap_local(&bv);
82771f28f31SMing Lei 		void *pg_buf = kmap_local_page(data->pages[pg_idx]);
82871f28f31SMing Lei 
829981f95a5SMing Lei 		if (dir == ITER_DEST)
830981f95a5SMing Lei 			memcpy(pg_buf + pg_off, bv_buf, bytes);
83171f28f31SMing Lei 		else
832981f95a5SMing Lei 			memcpy(bv_buf, pg_buf + pg_off, bytes);
83371f28f31SMing Lei 
83471f28f31SMing Lei 		kunmap_local(pg_buf);
83571f28f31SMing Lei 		kunmap_local(bv_buf);
83671f28f31SMing Lei 
83771f28f31SMing Lei 		/* advance page array */
838981f95a5SMing Lei 		pg_off += bytes;
839981f95a5SMing Lei 		if (pg_off == PAGE_SIZE) {
84071f28f31SMing Lei 			pg_idx += 1;
841981f95a5SMing Lei 			pg_off = 0;
84271f28f31SMing Lei 		}
84371f28f31SMing Lei 
84471f28f31SMing Lei 		done += bytes;
84571f28f31SMing Lei 
84671f28f31SMing Lei 		/* advance bio */
84771f28f31SMing Lei 		bio_advance_iter_single(data->bio, &data->iter, bytes);
84871f28f31SMing Lei 		if (!data->iter.bi_size) {
84971f28f31SMing Lei 			data->bio = data->bio->bi_next;
85071f28f31SMing Lei 			if (data->bio == NULL)
85171f28f31SMing Lei 				break;
85271f28f31SMing Lei 			data->iter = data->bio->bi_iter;
85371f28f31SMing Lei 		}
85471f28f31SMing Lei 	}
85571f28f31SMing Lei }
85671f28f31SMing Lei 
ublk_advance_io_iter(const struct request * req,struct ublk_io_iter * iter,unsigned int offset)85738f2dd34SMing Lei static bool ublk_advance_io_iter(const struct request *req,
85838f2dd34SMing Lei 		struct ublk_io_iter *iter, unsigned int offset)
85971f28f31SMing Lei {
86038f2dd34SMing Lei 	struct bio *bio = req->bio;
86171f28f31SMing Lei 
86238f2dd34SMing Lei 	for_each_bio(bio) {
86338f2dd34SMing Lei 		if (bio->bi_iter.bi_size > offset) {
86438f2dd34SMing Lei 			iter->bio = bio;
86538f2dd34SMing Lei 			iter->iter = bio->bi_iter;
86638f2dd34SMing Lei 			bio_advance_iter(iter->bio, &iter->iter, offset);
86738f2dd34SMing Lei 			return true;
86838f2dd34SMing Lei 		}
86938f2dd34SMing Lei 		offset -= bio->bi_iter.bi_size;
87038f2dd34SMing Lei 	}
87138f2dd34SMing Lei 	return false;
87238f2dd34SMing Lei }
87371f28f31SMing Lei 
874981f95a5SMing Lei /*
875981f95a5SMing Lei  * Copy data between request pages and io_iter, and 'offset'
876981f95a5SMing Lei  * is the start point of linear offset of request.
877981f95a5SMing Lei  */
ublk_copy_user_pages(const struct request * req,unsigned offset,struct iov_iter * uiter,int dir)878981f95a5SMing Lei static size_t ublk_copy_user_pages(const struct request *req,
87938f2dd34SMing Lei 		unsigned offset, struct iov_iter *uiter, int dir)
88071f28f31SMing Lei {
88138f2dd34SMing Lei 	struct ublk_io_iter iter;
882981f95a5SMing Lei 	size_t done = 0;
88371f28f31SMing Lei 
88438f2dd34SMing Lei 	if (!ublk_advance_io_iter(req, &iter, offset))
88538f2dd34SMing Lei 		return 0;
88638f2dd34SMing Lei 
887981f95a5SMing Lei 	while (iov_iter_count(uiter) && iter.bio) {
888981f95a5SMing Lei 		unsigned nr_pages;
889b8b637d7SMing Lei 		ssize_t len;
890b8b637d7SMing Lei 		size_t off;
891981f95a5SMing Lei 		int i;
89271f28f31SMing Lei 
893981f95a5SMing Lei 		len = iov_iter_get_pages2(uiter, iter.pages,
894981f95a5SMing Lei 				iov_iter_count(uiter),
895981f95a5SMing Lei 				UBLK_MAX_PIN_PAGES, &off);
896981f95a5SMing Lei 		if (len <= 0)
897981f95a5SMing Lei 			return done;
898981f95a5SMing Lei 
899981f95a5SMing Lei 		ublk_copy_io_pages(&iter, len, off, dir);
900981f95a5SMing Lei 		nr_pages = DIV_ROUND_UP(len + off, PAGE_SIZE);
901981f95a5SMing Lei 		for (i = 0; i < nr_pages; i++) {
902981f95a5SMing Lei 			if (dir == ITER_DEST)
90371f28f31SMing Lei 				set_page_dirty(iter.pages[i]);
90471f28f31SMing Lei 			put_page(iter.pages[i]);
90571f28f31SMing Lei 		}
906981f95a5SMing Lei 		done += len;
90771f28f31SMing Lei 	}
90871f28f31SMing Lei 
90971f28f31SMing Lei 	return done;
91071f28f31SMing Lei }
91171f28f31SMing Lei 
ublk_need_map_req(const struct request * req)9122f3af723SMing Lei static inline bool ublk_need_map_req(const struct request *req)
9132f3af723SMing Lei {
9142f3af723SMing Lei 	return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE;
9152f3af723SMing Lei }
9162f3af723SMing Lei 
ublk_need_unmap_req(const struct request * req)9172f3af723SMing Lei static inline bool ublk_need_unmap_req(const struct request *req)
9182f3af723SMing Lei {
91929802d7cSAndreas Hindborg 	return ublk_rq_has_data(req) &&
92029802d7cSAndreas Hindborg 	       (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
9212f3af723SMing Lei }
9222f3af723SMing Lei 
ublk_map_io(const struct ublk_queue * ubq,const struct request * req,struct ublk_io * io)92371f28f31SMing Lei static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
92471f28f31SMing Lei 		struct ublk_io *io)
92571f28f31SMing Lei {
92671f28f31SMing Lei 	const unsigned int rq_bytes = blk_rq_bytes(req);
92723ef8220SMing Lei 
9281172d5b8SMing Lei 	if (ublk_support_user_copy(ubq))
9291172d5b8SMing Lei 		return rq_bytes;
9301172d5b8SMing Lei 
93171f28f31SMing Lei 	/*
93271f28f31SMing Lei 	 * no zero copy, we delay copy WRITE request data into ublksrv
93371f28f31SMing Lei 	 * context and the big benefit is that pinning pages in current
93471f28f31SMing Lei 	 * context is pretty fast, see ublk_pin_user_pages
93571f28f31SMing Lei 	 */
9362f3af723SMing Lei 	if (ublk_need_map_req(req)) {
937981f95a5SMing Lei 		struct iov_iter iter;
938981f95a5SMing Lei 		struct iovec iov;
939981f95a5SMing Lei 		const int dir = ITER_DEST;
94071f28f31SMing Lei 
941981f95a5SMing Lei 		import_single_range(dir, u64_to_user_ptr(io->addr), rq_bytes,
942981f95a5SMing Lei 				&iov, &iter);
94371f28f31SMing Lei 
94438f2dd34SMing Lei 		return ublk_copy_user_pages(req, 0, &iter, dir);
94571f28f31SMing Lei 	}
94671f28f31SMing Lei 	return rq_bytes;
94771f28f31SMing Lei }
94871f28f31SMing Lei 
ublk_unmap_io(const struct ublk_queue * ubq,const struct request * req,struct ublk_io * io)94971f28f31SMing Lei static int ublk_unmap_io(const struct ublk_queue *ubq,
95071f28f31SMing Lei 		const struct request *req,
95171f28f31SMing Lei 		struct ublk_io *io)
95271f28f31SMing Lei {
95371f28f31SMing Lei 	const unsigned int rq_bytes = blk_rq_bytes(req);
95471f28f31SMing Lei 
9551172d5b8SMing Lei 	if (ublk_support_user_copy(ubq))
9561172d5b8SMing Lei 		return rq_bytes;
9571172d5b8SMing Lei 
9582f3af723SMing Lei 	if (ublk_need_unmap_req(req)) {
959981f95a5SMing Lei 		struct iov_iter iter;
960981f95a5SMing Lei 		struct iovec iov;
961981f95a5SMing Lei 		const int dir = ITER_SOURCE;
96271f28f31SMing Lei 
96371f28f31SMing Lei 		WARN_ON_ONCE(io->res > rq_bytes);
96471f28f31SMing Lei 
965981f95a5SMing Lei 		import_single_range(dir, u64_to_user_ptr(io->addr), io->res,
966981f95a5SMing Lei 				&iov, &iter);
96738f2dd34SMing Lei 		return ublk_copy_user_pages(req, 0, &iter, dir);
96871f28f31SMing Lei 	}
96971f28f31SMing Lei 	return rq_bytes;
97071f28f31SMing Lei }
97171f28f31SMing Lei 
ublk_req_build_flags(struct request * req)97271f28f31SMing Lei static inline unsigned int ublk_req_build_flags(struct request *req)
97371f28f31SMing Lei {
97471f28f31SMing Lei 	unsigned flags = 0;
97571f28f31SMing Lei 
97671f28f31SMing Lei 	if (req->cmd_flags & REQ_FAILFAST_DEV)
97771f28f31SMing Lei 		flags |= UBLK_IO_F_FAILFAST_DEV;
97871f28f31SMing Lei 
97971f28f31SMing Lei 	if (req->cmd_flags & REQ_FAILFAST_TRANSPORT)
98071f28f31SMing Lei 		flags |= UBLK_IO_F_FAILFAST_TRANSPORT;
98171f28f31SMing Lei 
98271f28f31SMing Lei 	if (req->cmd_flags & REQ_FAILFAST_DRIVER)
98371f28f31SMing Lei 		flags |= UBLK_IO_F_FAILFAST_DRIVER;
98471f28f31SMing Lei 
98571f28f31SMing Lei 	if (req->cmd_flags & REQ_META)
98671f28f31SMing Lei 		flags |= UBLK_IO_F_META;
98771f28f31SMing Lei 
98871f28f31SMing Lei 	if (req->cmd_flags & REQ_FUA)
98971f28f31SMing Lei 		flags |= UBLK_IO_F_FUA;
99071f28f31SMing Lei 
99171f28f31SMing Lei 	if (req->cmd_flags & REQ_NOUNMAP)
99271f28f31SMing Lei 		flags |= UBLK_IO_F_NOUNMAP;
99371f28f31SMing Lei 
99471f28f31SMing Lei 	if (req->cmd_flags & REQ_SWAP)
99571f28f31SMing Lei 		flags |= UBLK_IO_F_SWAP;
99671f28f31SMing Lei 
99771f28f31SMing Lei 	return flags;
99871f28f31SMing Lei }
99971f28f31SMing Lei 
ublk_setup_iod(struct ublk_queue * ubq,struct request * req)1000f2450f8aSMing Lei static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
100171f28f31SMing Lei {
100271f28f31SMing Lei 	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
100371f28f31SMing Lei 	struct ublk_io *io = &ubq->ios[req->tag];
100429802d7cSAndreas Hindborg 	enum req_op op = req_op(req);
100571f28f31SMing Lei 	u32 ublk_op;
100671f28f31SMing Lei 
100729802d7cSAndreas Hindborg 	if (!ublk_queue_is_zoned(ubq) &&
100829802d7cSAndreas Hindborg 	    (op_is_zone_mgmt(op) || op == REQ_OP_ZONE_APPEND))
1009c8659bbbSLi Zetao 		return BLK_STS_IOERR;
101029802d7cSAndreas Hindborg 
101171f28f31SMing Lei 	switch (req_op(req)) {
101271f28f31SMing Lei 	case REQ_OP_READ:
101371f28f31SMing Lei 		ublk_op = UBLK_IO_OP_READ;
101471f28f31SMing Lei 		break;
101571f28f31SMing Lei 	case REQ_OP_WRITE:
101671f28f31SMing Lei 		ublk_op = UBLK_IO_OP_WRITE;
101771f28f31SMing Lei 		break;
101871f28f31SMing Lei 	case REQ_OP_FLUSH:
101971f28f31SMing Lei 		ublk_op = UBLK_IO_OP_FLUSH;
102071f28f31SMing Lei 		break;
102171f28f31SMing Lei 	case REQ_OP_DISCARD:
102271f28f31SMing Lei 		ublk_op = UBLK_IO_OP_DISCARD;
102371f28f31SMing Lei 		break;
102471f28f31SMing Lei 	case REQ_OP_WRITE_ZEROES:
102571f28f31SMing Lei 		ublk_op = UBLK_IO_OP_WRITE_ZEROES;
102671f28f31SMing Lei 		break;
102771f28f31SMing Lei 	default:
102829802d7cSAndreas Hindborg 		if (ublk_queue_is_zoned(ubq))
102929802d7cSAndreas Hindborg 			return ublk_setup_iod_zoned(ubq, req);
103071f28f31SMing Lei 		return BLK_STS_IOERR;
103171f28f31SMing Lei 	}
103271f28f31SMing Lei 
103371f28f31SMing Lei 	/* need to translate since kernel may change */
103471f28f31SMing Lei 	iod->op_flags = ublk_op | ublk_req_build_flags(req);
103571f28f31SMing Lei 	iod->nr_sectors = blk_rq_sectors(req);
103671f28f31SMing Lei 	iod->start_sector = blk_rq_pos(req);
103771f28f31SMing Lei 	iod->addr = io->addr;
103871f28f31SMing Lei 
103971f28f31SMing Lei 	return BLK_STS_OK;
104071f28f31SMing Lei }
104171f28f31SMing Lei 
ublk_get_uring_cmd_pdu(struct io_uring_cmd * ioucmd)104271f28f31SMing Lei static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
104371f28f31SMing Lei 		struct io_uring_cmd *ioucmd)
104471f28f31SMing Lei {
104571f28f31SMing Lei 	return (struct ublk_uring_cmd_pdu *)&ioucmd->pdu;
104671f28f31SMing Lei }
104771f28f31SMing Lei 
ubq_daemon_is_dying(struct ublk_queue * ubq)1048966120b5SZiyangZhang static inline bool ubq_daemon_is_dying(struct ublk_queue *ubq)
104971f28f31SMing Lei {
105071f28f31SMing Lei 	return ubq->ubq_daemon->flags & PF_EXITING;
105171f28f31SMing Lei }
105271f28f31SMing Lei 
105371f28f31SMing Lei /* todo: handle partial completion */
__ublk_complete_rq(struct request * req)105482840669SMing Lei static inline void __ublk_complete_rq(struct request *req)
105571f28f31SMing Lei {
105671f28f31SMing Lei 	struct ublk_queue *ubq = req->mq_hctx->driver_data;
105771f28f31SMing Lei 	struct ublk_io *io = &ubq->ios[req->tag];
105871f28f31SMing Lei 	unsigned int unmapped_bytes;
1059903f8aeeSMing Lei 	blk_status_t res = BLK_STS_OK;
106071f28f31SMing Lei 
106182840669SMing Lei 	/* called from ublk_abort_queue() code path */
106282840669SMing Lei 	if (io->flags & UBLK_IO_FLAG_ABORTED) {
106382840669SMing Lei 		res = BLK_STS_IOERR;
106482840669SMing Lei 		goto exit;
106582840669SMing Lei 	}
106682840669SMing Lei 
106771f28f31SMing Lei 	/* failed read IO if nothing is read */
106871f28f31SMing Lei 	if (!io->res && req_op(req) == REQ_OP_READ)
106971f28f31SMing Lei 		io->res = -EIO;
107071f28f31SMing Lei 
107171f28f31SMing Lei 	if (io->res < 0) {
1072903f8aeeSMing Lei 		res = errno_to_blk_status(io->res);
1073903f8aeeSMing Lei 		goto exit;
107471f28f31SMing Lei 	}
107571f28f31SMing Lei 
107671f28f31SMing Lei 	/*
1077b352389eSZiyang Zhang 	 * FLUSH, DISCARD or WRITE_ZEROES usually won't return bytes returned, so end them
107871f28f31SMing Lei 	 * directly.
107971f28f31SMing Lei 	 *
108071f28f31SMing Lei 	 * Both the two needn't unmap.
108171f28f31SMing Lei 	 */
108229802d7cSAndreas Hindborg 	if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
108329802d7cSAndreas Hindborg 	    req_op(req) != REQ_OP_DRV_IN)
1084903f8aeeSMing Lei 		goto exit;
108571f28f31SMing Lei 
108671f28f31SMing Lei 	/* for READ request, writing data in iod->addr to rq buffers */
108771f28f31SMing Lei 	unmapped_bytes = ublk_unmap_io(ubq, req, io);
108871f28f31SMing Lei 
108971f28f31SMing Lei 	/*
109071f28f31SMing Lei 	 * Extremely impossible since we got data filled in just before
109171f28f31SMing Lei 	 *
109271f28f31SMing Lei 	 * Re-read simply for this unlikely case.
109371f28f31SMing Lei 	 */
109471f28f31SMing Lei 	if (unlikely(unmapped_bytes < io->res))
109571f28f31SMing Lei 		io->res = unmapped_bytes;
109671f28f31SMing Lei 
109771f28f31SMing Lei 	if (blk_update_request(req, BLK_STS_OK, io->res))
109871f28f31SMing Lei 		blk_mq_requeue_request(req, true);
109971f28f31SMing Lei 	else
110071f28f31SMing Lei 		__blk_mq_end_request(req, BLK_STS_OK);
1101903f8aeeSMing Lei 
1102903f8aeeSMing Lei 	return;
1103903f8aeeSMing Lei exit:
1104903f8aeeSMing Lei 	blk_mq_end_request(req, res);
110571f28f31SMing Lei }
110671f28f31SMing Lei 
ublk_complete_rq(struct kref * ref)110782840669SMing Lei static void ublk_complete_rq(struct kref *ref)
110882840669SMing Lei {
110982840669SMing Lei 	struct ublk_rq_data *data = container_of(ref, struct ublk_rq_data,
111082840669SMing Lei 			ref);
111182840669SMing Lei 	struct request *req = blk_mq_rq_from_pdu(data);
111282840669SMing Lei 
111382840669SMing Lei 	__ublk_complete_rq(req);
111482840669SMing Lei }
111582840669SMing Lei 
111671f28f31SMing Lei /*
1117bb241747SZiyangZhang  * Since __ublk_rq_task_work always fails requests immediately during
1118bb241747SZiyangZhang  * exiting, __ublk_fail_req() is only called from abort context during
1119bb241747SZiyangZhang  * exiting. So lock is unnecessary.
112071f28f31SMing Lei  *
112171f28f31SMing Lei  * Also aborting may not be started yet, keep in mind that one failed
112271f28f31SMing Lei  * request may be issued by block layer again.
112371f28f31SMing Lei  */
__ublk_fail_req(struct ublk_queue * ubq,struct ublk_io * io,struct request * req)1124a0d41dc1SZiyangZhang static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
1125a0d41dc1SZiyangZhang 		struct request *req)
112671f28f31SMing Lei {
112771f28f31SMing Lei 	WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
112871f28f31SMing Lei 
112971f28f31SMing Lei 	if (!(io->flags & UBLK_IO_FLAG_ABORTED)) {
113071f28f31SMing Lei 		io->flags |= UBLK_IO_FLAG_ABORTED;
1131a0d41dc1SZiyangZhang 		if (ublk_queue_can_use_recovery_reissue(ubq))
1132a0d41dc1SZiyangZhang 			blk_mq_requeue_request(req, false);
1133a0d41dc1SZiyangZhang 		else
113482840669SMing Lei 			ublk_put_req_ref(ubq, req);
113571f28f31SMing Lei 	}
113671f28f31SMing Lei }
113771f28f31SMing Lei 
ubq_complete_io_cmd(struct ublk_io * io,int res,unsigned issue_flags)11389d2789acSJens Axboe static void ubq_complete_io_cmd(struct ublk_io *io, int res,
11399d2789acSJens Axboe 				unsigned issue_flags)
1140c86019ffSZiyangZhang {
1141c86019ffSZiyangZhang 	/* mark this cmd owned by ublksrv */
1142c86019ffSZiyangZhang 	io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
1143c86019ffSZiyangZhang 
1144c86019ffSZiyangZhang 	/*
1145c86019ffSZiyangZhang 	 * clear ACTIVE since we are done with this sqe/cmd slot
1146c86019ffSZiyangZhang 	 * We can only accept io cmd in case of being not active.
1147c86019ffSZiyangZhang 	 */
1148c86019ffSZiyangZhang 	io->flags &= ~UBLK_IO_FLAG_ACTIVE;
1149c86019ffSZiyangZhang 
1150c86019ffSZiyangZhang 	/* tell ublksrv one io request is coming */
11519d2789acSJens Axboe 	io_uring_cmd_done(io->cmd, res, 0, issue_flags);
1152c86019ffSZiyangZhang }
1153c86019ffSZiyangZhang 
115471f28f31SMing Lei #define UBLK_REQUEUE_DELAY_MS	3
115571f28f31SMing Lei 
__ublk_abort_rq(struct ublk_queue * ubq,struct request * rq)115642cf5fc5SZiyangZhang static inline void __ublk_abort_rq(struct ublk_queue *ubq,
115742cf5fc5SZiyangZhang 		struct request *rq)
115842cf5fc5SZiyangZhang {
115942cf5fc5SZiyangZhang 	/* We cannot process this rq so just requeue it. */
116042cf5fc5SZiyangZhang 	if (ublk_queue_can_use_recovery(ubq))
116142cf5fc5SZiyangZhang 		blk_mq_requeue_request(rq, false);
116242cf5fc5SZiyangZhang 	else
116342cf5fc5SZiyangZhang 		blk_mq_end_request(rq, BLK_STS_IOERR);
116442cf5fc5SZiyangZhang 
116542cf5fc5SZiyangZhang 	mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0);
116642cf5fc5SZiyangZhang }
116742cf5fc5SZiyangZhang 
__ublk_rq_task_work(struct request * req,unsigned issue_flags)11689d2789acSJens Axboe static inline void __ublk_rq_task_work(struct request *req,
11699d2789acSJens Axboe 				       unsigned issue_flags)
117071f28f31SMing Lei {
117171f28f31SMing Lei 	struct ublk_queue *ubq = req->mq_hctx->driver_data;
117271f28f31SMing Lei 	int tag = req->tag;
117371f28f31SMing Lei 	struct ublk_io *io = &ubq->ios[tag];
117471f28f31SMing Lei 	unsigned int mapped_bytes;
117571f28f31SMing Lei 
117671f28f31SMing Lei 	pr_devel("%s: complete: op %d, qid %d tag %d io_flags %x addr %llx\n",
117771f28f31SMing Lei 			__func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
117871f28f31SMing Lei 			ublk_get_iod(ubq, req->tag)->addr);
117971f28f31SMing Lei 
1180ae3f7193SZiyangZhang 	/*
1181ae3f7193SZiyangZhang 	 * Task is exiting if either:
1182ae3f7193SZiyangZhang 	 *
1183ae3f7193SZiyangZhang 	 * (1) current != ubq_daemon.
1184ae3f7193SZiyangZhang 	 * io_uring_cmd_complete_in_task() tries to run task_work
1185ae3f7193SZiyangZhang 	 * in a workqueue if ubq_daemon(cmd's task) is PF_EXITING.
1186ae3f7193SZiyangZhang 	 *
1187ae3f7193SZiyangZhang 	 * (2) current->flags & PF_EXITING.
1188ae3f7193SZiyangZhang 	 */
1189ae3f7193SZiyangZhang 	if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING)) {
119042cf5fc5SZiyangZhang 		__ublk_abort_rq(ubq, req);
119171f28f31SMing Lei 		return;
119271f28f31SMing Lei 	}
119371f28f31SMing Lei 
11942f3af723SMing Lei 	if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) {
1195c86019ffSZiyangZhang 		/*
1196c86019ffSZiyangZhang 		 * We have not handled UBLK_IO_NEED_GET_DATA command yet,
1197c86019ffSZiyangZhang 		 * so immepdately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
1198c86019ffSZiyangZhang 		 * and notify it.
1199c86019ffSZiyangZhang 		 */
1200c86019ffSZiyangZhang 		if (!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) {
1201c86019ffSZiyangZhang 			io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
1202c86019ffSZiyangZhang 			pr_devel("%s: need get data. op %d, qid %d tag %d io_flags %x\n",
1203c86019ffSZiyangZhang 					__func__, io->cmd->cmd_op, ubq->q_id,
1204c86019ffSZiyangZhang 					req->tag, io->flags);
12059d2789acSJens Axboe 			ubq_complete_io_cmd(io, UBLK_IO_RES_NEED_GET_DATA, issue_flags);
1206c86019ffSZiyangZhang 			return;
1207c86019ffSZiyangZhang 		}
1208c86019ffSZiyangZhang 		/*
1209c86019ffSZiyangZhang 		 * We have handled UBLK_IO_NEED_GET_DATA command,
1210c86019ffSZiyangZhang 		 * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
1211c86019ffSZiyangZhang 		 * do the copy work.
1212c86019ffSZiyangZhang 		 */
1213c86019ffSZiyangZhang 		io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
121492cb6e2eSZiyangZhang 		/* update iod->addr because ublksrv may have passed a new io buffer */
121592cb6e2eSZiyangZhang 		ublk_get_iod(ubq, req->tag)->addr = io->addr;
121692cb6e2eSZiyangZhang 		pr_devel("%s: update iod->addr: op %d, qid %d tag %d io_flags %x addr %llx\n",
121792cb6e2eSZiyangZhang 				__func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
121892cb6e2eSZiyangZhang 				ublk_get_iod(ubq, req->tag)->addr);
1219c86019ffSZiyangZhang 	}
1220c86019ffSZiyangZhang 
122171f28f31SMing Lei 	mapped_bytes = ublk_map_io(ubq, req, io);
122271f28f31SMing Lei 
122371f28f31SMing Lei 	/* partially mapped, update io descriptor */
122471f28f31SMing Lei 	if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
122571f28f31SMing Lei 		/*
122671f28f31SMing Lei 		 * Nothing mapped, retry until we succeed.
122771f28f31SMing Lei 		 *
122871f28f31SMing Lei 		 * We may never succeed in mapping any bytes here because
122971f28f31SMing Lei 		 * of OOM. TODO: reserve one buffer with single page pinned
123071f28f31SMing Lei 		 * for providing forward progress guarantee.
123171f28f31SMing Lei 		 */
123271f28f31SMing Lei 		if (unlikely(!mapped_bytes)) {
123371f28f31SMing Lei 			blk_mq_requeue_request(req, false);
123471f28f31SMing Lei 			blk_mq_delay_kick_requeue_list(req->q,
123571f28f31SMing Lei 					UBLK_REQUEUE_DELAY_MS);
123671f28f31SMing Lei 			return;
123771f28f31SMing Lei 		}
123871f28f31SMing Lei 
123971f28f31SMing Lei 		ublk_get_iod(ubq, req->tag)->nr_sectors =
124071f28f31SMing Lei 			mapped_bytes >> 9;
124171f28f31SMing Lei 	}
124271f28f31SMing Lei 
124382840669SMing Lei 	ublk_init_req_ref(ubq, req);
12449d2789acSJens Axboe 	ubq_complete_io_cmd(io, UBLK_IO_RES_OK, issue_flags);
124571f28f31SMing Lei }
124671f28f31SMing Lei 
ublk_forward_io_cmds(struct ublk_queue * ubq,unsigned issue_flags)12479d2789acSJens Axboe static inline void ublk_forward_io_cmds(struct ublk_queue *ubq,
12489d2789acSJens Axboe 					unsigned issue_flags)
12497d4a9317SMing Lei {
12507d4a9317SMing Lei 	struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds);
12517d4a9317SMing Lei 	struct ublk_rq_data *data, *tmp;
12527d4a9317SMing Lei 
12537d4a9317SMing Lei 	io_cmds = llist_reverse_order(io_cmds);
12547d4a9317SMing Lei 	llist_for_each_entry_safe(data, tmp, io_cmds, node)
12559d2789acSJens Axboe 		__ublk_rq_task_work(blk_mq_rq_from_pdu(data), issue_flags);
12567d4a9317SMing Lei }
12577d4a9317SMing Lei 
ublk_abort_io_cmds(struct ublk_queue * ubq)12587d4a9317SMing Lei static inline void ublk_abort_io_cmds(struct ublk_queue *ubq)
12597d4a9317SMing Lei {
12607d4a9317SMing Lei 	struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds);
12617d4a9317SMing Lei 	struct ublk_rq_data *data, *tmp;
12627d4a9317SMing Lei 
12637d4a9317SMing Lei 	llist_for_each_entry_safe(data, tmp, io_cmds, node)
12647d4a9317SMing Lei 		__ublk_abort_rq(ubq, blk_mq_rq_from_pdu(data));
12657d4a9317SMing Lei }
12667d4a9317SMing Lei 
ublk_rq_task_work_cb(struct io_uring_cmd * cmd,unsigned issue_flags)12679d2789acSJens Axboe static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd, unsigned issue_flags)
12680edb3696SMing Lei {
12690edb3696SMing Lei 	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
12703ab6e94cSMing Lei 	struct ublk_queue *ubq = pdu->ubq;
12710edb3696SMing Lei 
12729d2789acSJens Axboe 	ublk_forward_io_cmds(ubq, issue_flags);
12730edb3696SMing Lei }
12740edb3696SMing Lei 
ublk_queue_cmd(struct ublk_queue * ubq,struct request * rq)12757d4a9317SMing Lei static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
12763ab6e94cSMing Lei {
12777d4a9317SMing Lei 	struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
12787d4a9317SMing Lei 	struct ublk_io *io;
12793ab6e94cSMing Lei 
12807d4a9317SMing Lei 	if (!llist_add(&data->node, &ubq->io_cmds))
12817d4a9317SMing Lei 		return;
12827d4a9317SMing Lei 
12837d4a9317SMing Lei 	io = &ubq->ios[rq->tag];
12843ab6e94cSMing Lei 	/*
12853ab6e94cSMing Lei 	 * If the check pass, we know that this is a re-issued request aborted
12863ab6e94cSMing Lei 	 * previously in monitor_work because the ubq_daemon(cmd's task) is
12873ab6e94cSMing Lei 	 * PF_EXITING. We cannot call io_uring_cmd_complete_in_task() anymore
12883ab6e94cSMing Lei 	 * because this ioucmd's io_uring context may be freed now if no inflight
12893ab6e94cSMing Lei 	 * ioucmd exists. Otherwise we may cause null-deref in ctx->fallback_work.
12903ab6e94cSMing Lei 	 *
12913ab6e94cSMing Lei 	 * Note: monitor_work sets UBLK_IO_FLAG_ABORTED and ends this request(releasing
12923ab6e94cSMing Lei 	 * the tag). Then the request is re-started(allocating the tag) and we are here.
12933ab6e94cSMing Lei 	 * Since releasing/allocating a tag implies smp_mb(), finding UBLK_IO_FLAG_ABORTED
12943ab6e94cSMing Lei 	 * guarantees that here is a re-issued request aborted previously.
12953ab6e94cSMing Lei 	 */
12963ab6e94cSMing Lei 	if (unlikely(io->flags & UBLK_IO_FLAG_ABORTED)) {
12977d4a9317SMing Lei 		ublk_abort_io_cmds(ubq);
12983ab6e94cSMing Lei 	} else {
12993ab6e94cSMing Lei 		struct io_uring_cmd *cmd = io->cmd;
13003ab6e94cSMing Lei 		struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
13013ab6e94cSMing Lei 
13023ab6e94cSMing Lei 		pdu->ubq = ubq;
13033ab6e94cSMing Lei 		io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb);
13043ab6e94cSMing Lei 	}
13053ab6e94cSMing Lei }
13063ab6e94cSMing Lei 
ublk_timeout(struct request * rq)1307c0b79b0fSMing Lei static enum blk_eh_timer_return ublk_timeout(struct request *rq)
1308c0b79b0fSMing Lei {
1309c0b79b0fSMing Lei 	struct ublk_queue *ubq = rq->mq_hctx->driver_data;
1310c0b79b0fSMing Lei 
1311c0b79b0fSMing Lei 	if (ubq->flags & UBLK_F_UNPRIVILEGED_DEV) {
1312c0b79b0fSMing Lei 		if (!ubq->timeout) {
1313c0b79b0fSMing Lei 			send_sig(SIGKILL, ubq->ubq_daemon, 0);
1314c0b79b0fSMing Lei 			ubq->timeout = true;
1315c0b79b0fSMing Lei 		}
1316c0b79b0fSMing Lei 
1317c0b79b0fSMing Lei 		return BLK_EH_DONE;
1318c0b79b0fSMing Lei 	}
1319c0b79b0fSMing Lei 
1320c0b79b0fSMing Lei 	return BLK_EH_RESET_TIMER;
1321c0b79b0fSMing Lei }
1322c0b79b0fSMing Lei 
ublk_queue_rq(struct blk_mq_hw_ctx * hctx,const struct blk_mq_queue_data * bd)132371f28f31SMing Lei static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
132471f28f31SMing Lei 		const struct blk_mq_queue_data *bd)
132571f28f31SMing Lei {
132671f28f31SMing Lei 	struct ublk_queue *ubq = hctx->driver_data;
132771f28f31SMing Lei 	struct request *rq = bd->rq;
132871f28f31SMing Lei 	blk_status_t res;
132971f28f31SMing Lei 
133071f28f31SMing Lei 	/* fill iod to slot in io cmd buffer */
133171f28f31SMing Lei 	res = ublk_setup_iod(ubq, rq);
133271f28f31SMing Lei 	if (unlikely(res != BLK_STS_OK))
133371f28f31SMing Lei 		return BLK_STS_IOERR;
13343ab6e94cSMing Lei 
1335bbae8d1fSZiyangZhang 	/* With recovery feature enabled, force_abort is set in
1336bbae8d1fSZiyangZhang 	 * ublk_stop_dev() before calling del_gendisk(). We have to
1337bbae8d1fSZiyangZhang 	 * abort all requeued and new rqs here to let del_gendisk()
1338bbae8d1fSZiyangZhang 	 * move on. Besides, we cannot not call io_uring_cmd_complete_in_task()
1339bbae8d1fSZiyangZhang 	 * to avoid UAF on io_uring ctx.
1340bbae8d1fSZiyangZhang 	 *
1341bbae8d1fSZiyangZhang 	 * Note: force_abort is guaranteed to be seen because it is set
1342bbae8d1fSZiyangZhang 	 * before request queue is unqiuesced.
1343bbae8d1fSZiyangZhang 	 */
1344bbae8d1fSZiyangZhang 	if (ublk_queue_can_use_recovery(ubq) && unlikely(ubq->force_abort))
1345bbae8d1fSZiyangZhang 		return BLK_STS_IOERR;
134671f28f31SMing Lei 
134771f28f31SMing Lei 	blk_mq_start_request(bd->rq);
134871f28f31SMing Lei 
134971f28f31SMing Lei 	if (unlikely(ubq_daemon_is_dying(ubq))) {
135042cf5fc5SZiyangZhang 		__ublk_abort_rq(ubq, rq);
135142cf5fc5SZiyangZhang 		return BLK_STS_OK;
135271f28f31SMing Lei 	}
135371f28f31SMing Lei 
13547d4a9317SMing Lei 	ublk_queue_cmd(ubq, rq);
13550edb3696SMing Lei 
135671f28f31SMing Lei 	return BLK_STS_OK;
135771f28f31SMing Lei }
135871f28f31SMing Lei 
ublk_init_hctx(struct blk_mq_hw_ctx * hctx,void * driver_data,unsigned int hctx_idx)135971f28f31SMing Lei static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
136071f28f31SMing Lei 		unsigned int hctx_idx)
136171f28f31SMing Lei {
1362cebbe577SMing Lei 	struct ublk_device *ub = driver_data;
136371f28f31SMing Lei 	struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num);
136471f28f31SMing Lei 
136571f28f31SMing Lei 	hctx->driver_data = ubq;
136671f28f31SMing Lei 	return 0;
136771f28f31SMing Lei }
136871f28f31SMing Lei 
136971f28f31SMing Lei static const struct blk_mq_ops ublk_mq_ops = {
137071f28f31SMing Lei 	.queue_rq       = ublk_queue_rq,
137171f28f31SMing Lei 	.init_hctx	= ublk_init_hctx,
1372c0b79b0fSMing Lei 	.timeout	= ublk_timeout,
137371f28f31SMing Lei };
137471f28f31SMing Lei 
ublk_ch_open(struct inode * inode,struct file * filp)137571f28f31SMing Lei static int ublk_ch_open(struct inode *inode, struct file *filp)
137671f28f31SMing Lei {
137771f28f31SMing Lei 	struct ublk_device *ub = container_of(inode->i_cdev,
137871f28f31SMing Lei 			struct ublk_device, cdev);
137971f28f31SMing Lei 
1380fa362045SChristoph Hellwig 	if (test_and_set_bit(UB_STATE_OPEN, &ub->state))
1381fa362045SChristoph Hellwig 		return -EBUSY;
138271f28f31SMing Lei 	filp->private_data = ub;
138371f28f31SMing Lei 	return 0;
138471f28f31SMing Lei }
138571f28f31SMing Lei 
ublk_ch_release(struct inode * inode,struct file * filp)138671f28f31SMing Lei static int ublk_ch_release(struct inode *inode, struct file *filp)
138771f28f31SMing Lei {
138871f28f31SMing Lei 	struct ublk_device *ub = filp->private_data;
138971f28f31SMing Lei 
1390fa362045SChristoph Hellwig 	clear_bit(UB_STATE_OPEN, &ub->state);
139171f28f31SMing Lei 	return 0;
139271f28f31SMing Lei }
139371f28f31SMing Lei 
139471f28f31SMing Lei /* map pre-allocated per-queue cmd buffer to ublksrv daemon */
ublk_ch_mmap(struct file * filp,struct vm_area_struct * vma)139571f28f31SMing Lei static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
139671f28f31SMing Lei {
139771f28f31SMing Lei 	struct ublk_device *ub = filp->private_data;
139871f28f31SMing Lei 	size_t sz = vma->vm_end - vma->vm_start;
139980f25003SMing Lei 	unsigned max_sz = ublk_max_cmd_buf_size();
140071f28f31SMing Lei 	unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
140171f28f31SMing Lei 	int q_id, ret = 0;
140271f28f31SMing Lei 
1403e94eb459SMing Lei 	spin_lock(&ub->mm_lock);
140471f28f31SMing Lei 	if (!ub->mm)
140571f28f31SMing Lei 		ub->mm = current->mm;
140671f28f31SMing Lei 	if (current->mm != ub->mm)
140771f28f31SMing Lei 		ret = -EINVAL;
1408e94eb459SMing Lei 	spin_unlock(&ub->mm_lock);
140971f28f31SMing Lei 
141071f28f31SMing Lei 	if (ret)
141171f28f31SMing Lei 		return ret;
141271f28f31SMing Lei 
141371f28f31SMing Lei 	if (vma->vm_flags & VM_WRITE)
141471f28f31SMing Lei 		return -EPERM;
141571f28f31SMing Lei 
141671f28f31SMing Lei 	end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz;
141771f28f31SMing Lei 	if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end)
141871f28f31SMing Lei 		return -EINVAL;
141971f28f31SMing Lei 
142071f28f31SMing Lei 	q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz;
142171f28f31SMing Lei 	pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n",
142271f28f31SMing Lei 			__func__, q_id, current->pid, vma->vm_start,
142371f28f31SMing Lei 			phys_off, (unsigned long)sz);
142471f28f31SMing Lei 
142571f28f31SMing Lei 	if (sz != ublk_queue_cmd_buf_size(ub, q_id))
142671f28f31SMing Lei 		return -EINVAL;
142771f28f31SMing Lei 
142871f28f31SMing Lei 	pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
142971f28f31SMing Lei 	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
143071f28f31SMing Lei }
143171f28f31SMing Lei 
ublk_commit_completion(struct ublk_device * ub,const struct ublksrv_io_cmd * ub_cmd)143271f28f31SMing Lei static void ublk_commit_completion(struct ublk_device *ub,
1433fd9b8547SBreno Leitao 		const struct ublksrv_io_cmd *ub_cmd)
143471f28f31SMing Lei {
143571f28f31SMing Lei 	u32 qid = ub_cmd->q_id, tag = ub_cmd->tag;
143671f28f31SMing Lei 	struct ublk_queue *ubq = ublk_get_queue(ub, qid);
143771f28f31SMing Lei 	struct ublk_io *io = &ubq->ios[tag];
143871f28f31SMing Lei 	struct request *req;
143971f28f31SMing Lei 
144071f28f31SMing Lei 	/* now this cmd slot is owned by nbd driver */
144171f28f31SMing Lei 	io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
144271f28f31SMing Lei 	io->res = ub_cmd->result;
144371f28f31SMing Lei 
144471f28f31SMing Lei 	/* find the io request and complete */
144571f28f31SMing Lei 	req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
1446e24721e4SMing Lei 	if (WARN_ON_ONCE(unlikely(!req)))
1447e24721e4SMing Lei 		return;
144871f28f31SMing Lei 
144929802d7cSAndreas Hindborg 	if (req_op(req) == REQ_OP_ZONE_APPEND)
145029802d7cSAndreas Hindborg 		req->__sector = ub_cmd->zone_append_lba;
145129802d7cSAndreas Hindborg 
1452e24721e4SMing Lei 	if (likely(!blk_should_fake_timeout(req->q)))
145382840669SMing Lei 		ublk_put_req_ref(ubq, req);
145471f28f31SMing Lei }
145571f28f31SMing Lei 
145671f28f31SMing Lei /*
145771f28f31SMing Lei  * When ->ubq_daemon is exiting, either new request is ended immediately,
145871f28f31SMing Lei  * or any queued io command is drained, so it is safe to abort queue
145971f28f31SMing Lei  * lockless
146071f28f31SMing Lei  */
ublk_abort_queue(struct ublk_device * ub,struct ublk_queue * ubq)146171f28f31SMing Lei static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
146271f28f31SMing Lei {
146371f28f31SMing Lei 	int i;
146471f28f31SMing Lei 
146571f28f31SMing Lei 	if (!ublk_get_device(ub))
146671f28f31SMing Lei 		return;
146771f28f31SMing Lei 
146871f28f31SMing Lei 	for (i = 0; i < ubq->q_depth; i++) {
146971f28f31SMing Lei 		struct ublk_io *io = &ubq->ios[i];
147071f28f31SMing Lei 
147171f28f31SMing Lei 		if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) {
147271f28f31SMing Lei 			struct request *rq;
147371f28f31SMing Lei 
147471f28f31SMing Lei 			/*
147571f28f31SMing Lei 			 * Either we fail the request or ublk_rq_task_work_fn
147671f28f31SMing Lei 			 * will do it
147771f28f31SMing Lei 			 */
147871f28f31SMing Lei 			rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i);
147971f28f31SMing Lei 			if (rq)
1480a0d41dc1SZiyangZhang 				__ublk_fail_req(ubq, io, rq);
148171f28f31SMing Lei 		}
148271f28f31SMing Lei 	}
148371f28f31SMing Lei 	ublk_put_device(ub);
148471f28f31SMing Lei }
148571f28f31SMing Lei 
ublk_daemon_monitor_work(struct work_struct * work)148671f28f31SMing Lei static void ublk_daemon_monitor_work(struct work_struct *work)
148771f28f31SMing Lei {
148871f28f31SMing Lei 	struct ublk_device *ub =
148971f28f31SMing Lei 		container_of(work, struct ublk_device, monitor_work.work);
149071f28f31SMing Lei 	int i;
149171f28f31SMing Lei 
149271f28f31SMing Lei 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
149371f28f31SMing Lei 		struct ublk_queue *ubq = ublk_get_queue(ub, i);
149471f28f31SMing Lei 
149571f28f31SMing Lei 		if (ubq_daemon_is_dying(ubq)) {
1496bbae8d1fSZiyangZhang 			if (ublk_queue_can_use_recovery(ubq))
1497bbae8d1fSZiyangZhang 				schedule_work(&ub->quiesce_work);
1498bbae8d1fSZiyangZhang 			else
149971f28f31SMing Lei 				schedule_work(&ub->stop_work);
150071f28f31SMing Lei 
150171f28f31SMing Lei 			/* abort queue is for making forward progress */
150271f28f31SMing Lei 			ublk_abort_queue(ub, ubq);
150371f28f31SMing Lei 		}
150471f28f31SMing Lei 	}
150571f28f31SMing Lei 
150671f28f31SMing Lei 	/*
1507bbae8d1fSZiyangZhang 	 * We can't schedule monitor work after ub's state is not UBLK_S_DEV_LIVE.
1508bbae8d1fSZiyangZhang 	 * after ublk_remove() or __ublk_quiesce_dev() is started.
150971f28f31SMing Lei 	 *
151071f28f31SMing Lei 	 * No need ub->mutex, monitor work are canceled after state is marked
1511bbae8d1fSZiyangZhang 	 * as not LIVE, so new state is observed reliably.
151271f28f31SMing Lei 	 */
1513bbae8d1fSZiyangZhang 	if (ub->dev_info.state == UBLK_S_DEV_LIVE)
151471f28f31SMing Lei 		schedule_delayed_work(&ub->monitor_work,
151571f28f31SMing Lei 				UBLK_DAEMON_MONITOR_PERIOD);
151671f28f31SMing Lei }
151771f28f31SMing Lei 
ublk_queue_ready(struct ublk_queue * ubq)1518a8ce5f52SMing Lei static inline bool ublk_queue_ready(struct ublk_queue *ubq)
1519a8ce5f52SMing Lei {
1520a8ce5f52SMing Lei 	return ubq->nr_io_ready == ubq->q_depth;
1521a8ce5f52SMing Lei }
1522a8ce5f52SMing Lei 
ublk_cancel_queue(struct ublk_queue * ubq)152371f28f31SMing Lei static void ublk_cancel_queue(struct ublk_queue *ubq)
152471f28f31SMing Lei {
152571f28f31SMing Lei 	int i;
152671f28f31SMing Lei 
152771f28f31SMing Lei 	for (i = 0; i < ubq->q_depth; i++) {
152871f28f31SMing Lei 		struct ublk_io *io = &ubq->ios[i];
152971f28f31SMing Lei 
15308cb8ef0cSMing Lei 		if (io->flags & UBLK_IO_FLAG_ACTIVE) {
15318cb8ef0cSMing Lei 			bool done;
1532a8ce5f52SMing Lei 
15338cb8ef0cSMing Lei 			spin_lock(&ubq->cancel_lock);
15348cb8ef0cSMing Lei 			done = !!(io->flags & UBLK_IO_FLAG_CANCELED);
15358cb8ef0cSMing Lei 			if (!done)
15368cb8ef0cSMing Lei 				io->flags |= UBLK_IO_FLAG_CANCELED;
15378cb8ef0cSMing Lei 			spin_unlock(&ubq->cancel_lock);
15388cb8ef0cSMing Lei 
15398cb8ef0cSMing Lei 			if (!done)
15408cb8ef0cSMing Lei 				io_uring_cmd_done(io->cmd,
15418cb8ef0cSMing Lei 						UBLK_IO_RES_ABORT, 0,
15428cb8ef0cSMing Lei 						IO_URING_F_UNLOCKED);
15438cb8ef0cSMing Lei 		}
15448cb8ef0cSMing Lei 	}
154571f28f31SMing Lei }
154671f28f31SMing Lei 
154771f28f31SMing Lei /* Cancel all pending commands, must be called after del_gendisk() returns */
ublk_cancel_dev(struct ublk_device * ub)154871f28f31SMing Lei static void ublk_cancel_dev(struct ublk_device *ub)
154971f28f31SMing Lei {
155071f28f31SMing Lei 	int i;
155171f28f31SMing Lei 
155271f28f31SMing Lei 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
155371f28f31SMing Lei 		ublk_cancel_queue(ublk_get_queue(ub, i));
155471f28f31SMing Lei }
155571f28f31SMing Lei 
ublk_check_inflight_rq(struct request * rq,void * data)1556bbae8d1fSZiyangZhang static bool ublk_check_inflight_rq(struct request *rq, void *data)
155771f28f31SMing Lei {
1558bbae8d1fSZiyangZhang 	bool *idle = data;
1559bbae8d1fSZiyangZhang 
1560bbae8d1fSZiyangZhang 	if (blk_mq_request_started(rq)) {
1561bbae8d1fSZiyangZhang 		*idle = false;
1562bbae8d1fSZiyangZhang 		return false;
1563bbae8d1fSZiyangZhang 	}
1564bbae8d1fSZiyangZhang 	return true;
1565bbae8d1fSZiyangZhang }
1566bbae8d1fSZiyangZhang 
ublk_wait_tagset_rqs_idle(struct ublk_device * ub)1567bbae8d1fSZiyangZhang static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
1568bbae8d1fSZiyangZhang {
1569bbae8d1fSZiyangZhang 	bool idle;
1570bbae8d1fSZiyangZhang 
1571bbae8d1fSZiyangZhang 	WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
1572bbae8d1fSZiyangZhang 	while (true) {
1573bbae8d1fSZiyangZhang 		idle = true;
1574bbae8d1fSZiyangZhang 		blk_mq_tagset_busy_iter(&ub->tag_set,
1575bbae8d1fSZiyangZhang 				ublk_check_inflight_rq, &idle);
1576bbae8d1fSZiyangZhang 		if (idle)
1577bbae8d1fSZiyangZhang 			break;
1578bbae8d1fSZiyangZhang 		msleep(UBLK_REQUEUE_DELAY_MS);
1579bbae8d1fSZiyangZhang 	}
1580bbae8d1fSZiyangZhang }
1581bbae8d1fSZiyangZhang 
__ublk_quiesce_dev(struct ublk_device * ub)1582bbae8d1fSZiyangZhang static void __ublk_quiesce_dev(struct ublk_device *ub)
1583bbae8d1fSZiyangZhang {
1584bbae8d1fSZiyangZhang 	pr_devel("%s: quiesce ub: dev_id %d state %s\n",
1585bbae8d1fSZiyangZhang 			__func__, ub->dev_info.dev_id,
1586bbae8d1fSZiyangZhang 			ub->dev_info.state == UBLK_S_DEV_LIVE ?
1587bbae8d1fSZiyangZhang 			"LIVE" : "QUIESCED");
1588bbae8d1fSZiyangZhang 	blk_mq_quiesce_queue(ub->ub_disk->queue);
1589bbae8d1fSZiyangZhang 	ublk_wait_tagset_rqs_idle(ub);
1590bbae8d1fSZiyangZhang 	ub->dev_info.state = UBLK_S_DEV_QUIESCED;
1591bbae8d1fSZiyangZhang 	/* we are going to release task_struct of ubq_daemon and resets
1592bbae8d1fSZiyangZhang 	 * ->ubq_daemon to NULL. So in monitor_work, check on ubq_daemon causes UAF.
1593bbae8d1fSZiyangZhang 	 * Besides, monitor_work is not necessary in QUIESCED state since we have
1594bbae8d1fSZiyangZhang 	 * already scheduled quiesce_work and quiesced all ubqs.
1595bbae8d1fSZiyangZhang 	 *
1596bbae8d1fSZiyangZhang 	 * Do not let monitor_work schedule itself if state it QUIESCED. And we cancel
1597bbae8d1fSZiyangZhang 	 * it here and re-schedule it in END_USER_RECOVERY to avoid UAF.
1598bbae8d1fSZiyangZhang 	 */
1599bbae8d1fSZiyangZhang 	cancel_delayed_work_sync(&ub->monitor_work);
1600bbae8d1fSZiyangZhang }
1601bbae8d1fSZiyangZhang 
ublk_quiesce_work_fn(struct work_struct * work)1602bbae8d1fSZiyangZhang static void ublk_quiesce_work_fn(struct work_struct *work)
1603bbae8d1fSZiyangZhang {
1604bbae8d1fSZiyangZhang 	struct ublk_device *ub =
1605bbae8d1fSZiyangZhang 		container_of(work, struct ublk_device, quiesce_work);
1606bbae8d1fSZiyangZhang 
160771f28f31SMing Lei 	mutex_lock(&ub->mutex);
16086d9e6dfdSChristoph Hellwig 	if (ub->dev_info.state != UBLK_S_DEV_LIVE)
160971f28f31SMing Lei 		goto unlock;
1610bbae8d1fSZiyangZhang 	__ublk_quiesce_dev(ub);
1611bbae8d1fSZiyangZhang  unlock:
1612bbae8d1fSZiyangZhang 	mutex_unlock(&ub->mutex);
16138cb8ef0cSMing Lei 	ublk_cancel_dev(ub);
1614bbae8d1fSZiyangZhang }
161571f28f31SMing Lei 
ublk_unquiesce_dev(struct ublk_device * ub)1616bbae8d1fSZiyangZhang static void ublk_unquiesce_dev(struct ublk_device *ub)
1617bbae8d1fSZiyangZhang {
1618bbae8d1fSZiyangZhang 	int i;
1619bbae8d1fSZiyangZhang 
1620bbae8d1fSZiyangZhang 	pr_devel("%s: unquiesce ub: dev_id %d state %s\n",
1621bbae8d1fSZiyangZhang 			__func__, ub->dev_info.dev_id,
1622bbae8d1fSZiyangZhang 			ub->dev_info.state == UBLK_S_DEV_LIVE ?
1623bbae8d1fSZiyangZhang 			"LIVE" : "QUIESCED");
1624bbae8d1fSZiyangZhang 	/* quiesce_work has run. We let requeued rqs be aborted
1625bbae8d1fSZiyangZhang 	 * before running fallback_wq. "force_abort" must be seen
1626bbae8d1fSZiyangZhang 	 * after request queue is unqiuesced. Then del_gendisk()
1627bbae8d1fSZiyangZhang 	 * can move on.
1628bbae8d1fSZiyangZhang 	 */
1629bbae8d1fSZiyangZhang 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1630bbae8d1fSZiyangZhang 		ublk_get_queue(ub, i)->force_abort = true;
1631bbae8d1fSZiyangZhang 
1632bbae8d1fSZiyangZhang 	blk_mq_unquiesce_queue(ub->ub_disk->queue);
1633bbae8d1fSZiyangZhang 	/* We may have requeued some rqs in ublk_quiesce_queue() */
1634bbae8d1fSZiyangZhang 	blk_mq_kick_requeue_list(ub->ub_disk->queue);
1635bbae8d1fSZiyangZhang }
1636bbae8d1fSZiyangZhang 
ublk_stop_dev(struct ublk_device * ub)1637bbae8d1fSZiyangZhang static void ublk_stop_dev(struct ublk_device *ub)
1638bbae8d1fSZiyangZhang {
1639bbae8d1fSZiyangZhang 	mutex_lock(&ub->mutex);
1640bbae8d1fSZiyangZhang 	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
1641bbae8d1fSZiyangZhang 		goto unlock;
1642bbae8d1fSZiyangZhang 	if (ublk_can_use_recovery(ub)) {
1643bbae8d1fSZiyangZhang 		if (ub->dev_info.state == UBLK_S_DEV_LIVE)
1644bbae8d1fSZiyangZhang 			__ublk_quiesce_dev(ub);
1645bbae8d1fSZiyangZhang 		ublk_unquiesce_dev(ub);
1646bbae8d1fSZiyangZhang 	}
164771f28f31SMing Lei 	del_gendisk(ub->ub_disk);
164871f28f31SMing Lei 	ub->dev_info.state = UBLK_S_DEV_DEAD;
164971f28f31SMing Lei 	ub->dev_info.ublksrv_pid = -1;
16506d9e6dfdSChristoph Hellwig 	put_disk(ub->ub_disk);
16516d9e6dfdSChristoph Hellwig 	ub->ub_disk = NULL;
165271f28f31SMing Lei  unlock:
165371f28f31SMing Lei 	mutex_unlock(&ub->mutex);
16548cb8ef0cSMing Lei 	ublk_cancel_dev(ub);
165571f28f31SMing Lei 	cancel_delayed_work_sync(&ub->monitor_work);
165671f28f31SMing Lei }
165771f28f31SMing Lei 
165871f28f31SMing Lei /* device can only be started after all IOs are ready */
ublk_mark_io_ready(struct ublk_device * ub,struct ublk_queue * ubq)165971f28f31SMing Lei static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
166071f28f31SMing Lei {
166171f28f31SMing Lei 	mutex_lock(&ub->mutex);
166271f28f31SMing Lei 	ubq->nr_io_ready++;
166371f28f31SMing Lei 	if (ublk_queue_ready(ubq)) {
166471f28f31SMing Lei 		ubq->ubq_daemon = current;
166571f28f31SMing Lei 		get_task_struct(ubq->ubq_daemon);
166671f28f31SMing Lei 		ub->nr_queues_ready++;
166773a166d9SMing Lei 
166873a166d9SMing Lei 		if (capable(CAP_SYS_ADMIN))
166973a166d9SMing Lei 			ub->nr_privileged_daemon++;
167071f28f31SMing Lei 	}
167171f28f31SMing Lei 	if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues)
167271f28f31SMing Lei 		complete_all(&ub->completion);
167371f28f31SMing Lei 	mutex_unlock(&ub->mutex);
167471f28f31SMing Lei }
167571f28f31SMing Lei 
ublk_handle_need_get_data(struct ublk_device * ub,int q_id,int tag)1676c86019ffSZiyangZhang static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id,
1677fee32f31SMing Lei 		int tag)
1678c86019ffSZiyangZhang {
1679c86019ffSZiyangZhang 	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
1680c86019ffSZiyangZhang 	struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
1681c86019ffSZiyangZhang 
16827d4a9317SMing Lei 	ublk_queue_cmd(ubq, req);
1683c86019ffSZiyangZhang }
1684c86019ffSZiyangZhang 
ublk_check_cmd_op(u32 cmd_op)16852d786e66SMing Lei static inline int ublk_check_cmd_op(u32 cmd_op)
16862d786e66SMing Lei {
16872d786e66SMing Lei 	u32 ioc_type = _IOC_TYPE(cmd_op);
16882d786e66SMing Lei 
1689e485bd9eSMing Lei 	if (!IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES) && ioc_type != 'u')
16902d786e66SMing Lei 		return -EOPNOTSUPP;
16912d786e66SMing Lei 
16922d786e66SMing Lei 	if (ioc_type != 'u' && ioc_type != 0)
16932d786e66SMing Lei 		return -EOPNOTSUPP;
16942d786e66SMing Lei 
16952d786e66SMing Lei 	return 0;
16962d786e66SMing Lei }
16972d786e66SMing Lei 
ublk_fill_io_cmd(struct ublk_io * io,struct io_uring_cmd * cmd,unsigned long buf_addr)1698f236a214SMing Lei static inline void ublk_fill_io_cmd(struct ublk_io *io,
1699f236a214SMing Lei 		struct io_uring_cmd *cmd, unsigned long buf_addr)
1700f236a214SMing Lei {
1701f236a214SMing Lei 	io->cmd = cmd;
1702f236a214SMing Lei 	io->flags |= UBLK_IO_FLAG_ACTIVE;
1703f236a214SMing Lei 	io->addr = buf_addr;
1704f236a214SMing Lei }
1705f236a214SMing Lei 
__ublk_ch_uring_cmd(struct io_uring_cmd * cmd,unsigned int issue_flags,const struct ublksrv_io_cmd * ub_cmd)17068c68ae3bSJens Axboe static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
17078c68ae3bSJens Axboe 			       unsigned int issue_flags,
170803e5cb7bSLinus Torvalds 			       const struct ublksrv_io_cmd *ub_cmd)
170971f28f31SMing Lei {
171071f28f31SMing Lei 	struct ublk_device *ub = cmd->file->private_data;
171171f28f31SMing Lei 	struct ublk_queue *ubq;
171271f28f31SMing Lei 	struct ublk_io *io;
171371f28f31SMing Lei 	u32 cmd_op = cmd->cmd_op;
171471f28f31SMing Lei 	unsigned tag = ub_cmd->tag;
171571f28f31SMing Lei 	int ret = -EINVAL;
17162f1e07ddSLiu Xiaodong 	struct request *req;
171771f28f31SMing Lei 
171871f28f31SMing Lei 	pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
171971f28f31SMing Lei 			__func__, cmd->cmd_op, ub_cmd->q_id, tag,
172071f28f31SMing Lei 			ub_cmd->result);
172171f28f31SMing Lei 
172271f28f31SMing Lei 	if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues)
172371f28f31SMing Lei 		goto out;
172471f28f31SMing Lei 
172571f28f31SMing Lei 	ubq = ublk_get_queue(ub, ub_cmd->q_id);
172671f28f31SMing Lei 	if (!ubq || ub_cmd->q_id != ubq->q_id)
172771f28f31SMing Lei 		goto out;
172871f28f31SMing Lei 
172971f28f31SMing Lei 	if (ubq->ubq_daemon && ubq->ubq_daemon != current)
173071f28f31SMing Lei 		goto out;
173171f28f31SMing Lei 
173271f28f31SMing Lei 	if (tag >= ubq->q_depth)
173371f28f31SMing Lei 		goto out;
173471f28f31SMing Lei 
173571f28f31SMing Lei 	io = &ubq->ios[tag];
173671f28f31SMing Lei 
173771f28f31SMing Lei 	/* there is pending io cmd, something must be wrong */
173871f28f31SMing Lei 	if (io->flags & UBLK_IO_FLAG_ACTIVE) {
173971f28f31SMing Lei 		ret = -EBUSY;
174071f28f31SMing Lei 		goto out;
174171f28f31SMing Lei 	}
174271f28f31SMing Lei 
1743c86019ffSZiyangZhang 	/*
1744c86019ffSZiyangZhang 	 * ensure that the user issues UBLK_IO_NEED_GET_DATA
1745c86019ffSZiyangZhang 	 * iff the driver have set the UBLK_IO_FLAG_NEED_GET_DATA.
1746c86019ffSZiyangZhang 	 */
1747c86019ffSZiyangZhang 	if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
17482d786e66SMing Lei 			^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
1749c86019ffSZiyangZhang 		goto out;
1750c86019ffSZiyangZhang 
17512d786e66SMing Lei 	ret = ublk_check_cmd_op(cmd_op);
17522d786e66SMing Lei 	if (ret)
17532d786e66SMing Lei 		goto out;
17542d786e66SMing Lei 
17557c75661cSMing Lei 	ret = -EINVAL;
17562d786e66SMing Lei 	switch (_IOC_NR(cmd_op)) {
175771f28f31SMing Lei 	case UBLK_IO_FETCH_REQ:
175871f28f31SMing Lei 		/* UBLK_IO_FETCH_REQ is only allowed before queue is setup */
175971f28f31SMing Lei 		if (ublk_queue_ready(ubq)) {
176071f28f31SMing Lei 			ret = -EBUSY;
176171f28f31SMing Lei 			goto out;
176271f28f31SMing Lei 		}
176371f28f31SMing Lei 		/*
176471f28f31SMing Lei 		 * The io is being handled by server, so COMMIT_RQ is expected
176571f28f31SMing Lei 		 * instead of FETCH_REQ
176671f28f31SMing Lei 		 */
176771f28f31SMing Lei 		if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
176871f28f31SMing Lei 			goto out;
17691172d5b8SMing Lei 
17701172d5b8SMing Lei 		if (!ublk_support_user_copy(ubq)) {
17711172d5b8SMing Lei 			/*
17721172d5b8SMing Lei 			 * FETCH_RQ has to provide IO buffer if NEED GET
17731172d5b8SMing Lei 			 * DATA is not enabled
17741172d5b8SMing Lei 			 */
17752f1e07ddSLiu Xiaodong 			if (!ub_cmd->addr && !ublk_need_get_data(ubq))
177671f28f31SMing Lei 				goto out;
17771a6e88b9SAndreas Hindborg 		} else if (ub_cmd->addr) {
17781a6e88b9SAndreas Hindborg 			/* User copy requires addr to be unset */
17791a6e88b9SAndreas Hindborg 			ret = -EINVAL;
17801a6e88b9SAndreas Hindborg 			goto out;
17811172d5b8SMing Lei 		}
178271f28f31SMing Lei 
1783f236a214SMing Lei 		ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
178471f28f31SMing Lei 		ublk_mark_io_ready(ub, ubq);
178571f28f31SMing Lei 		break;
178671f28f31SMing Lei 	case UBLK_IO_COMMIT_AND_FETCH_REQ:
17872f1e07ddSLiu Xiaodong 		req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag);
17881172d5b8SMing Lei 
178971f28f31SMing Lei 		if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
179071f28f31SMing Lei 			goto out;
17911172d5b8SMing Lei 
17921172d5b8SMing Lei 		if (!ublk_support_user_copy(ubq)) {
17931172d5b8SMing Lei 			/*
17941172d5b8SMing Lei 			 * COMMIT_AND_FETCH_REQ has to provide IO buffer if
17951172d5b8SMing Lei 			 * NEED GET DATA is not enabled or it is Read IO.
17961172d5b8SMing Lei 			 */
17971172d5b8SMing Lei 			if (!ub_cmd->addr && (!ublk_need_get_data(ubq) ||
17981172d5b8SMing Lei 						req_op(req) == REQ_OP_READ))
17991172d5b8SMing Lei 				goto out;
180029802d7cSAndreas Hindborg 		} else if (req_op(req) != REQ_OP_ZONE_APPEND && ub_cmd->addr) {
180129802d7cSAndreas Hindborg 			/*
180229802d7cSAndreas Hindborg 			 * User copy requires addr to be unset when command is
180329802d7cSAndreas Hindborg 			 * not zone append
180429802d7cSAndreas Hindborg 			 */
18051a6e88b9SAndreas Hindborg 			ret = -EINVAL;
18061a6e88b9SAndreas Hindborg 			goto out;
18071172d5b8SMing Lei 		}
18081a6e88b9SAndreas Hindborg 
1809f236a214SMing Lei 		ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
181071f28f31SMing Lei 		ublk_commit_completion(ub, ub_cmd);
181171f28f31SMing Lei 		break;
1812c86019ffSZiyangZhang 	case UBLK_IO_NEED_GET_DATA:
1813c86019ffSZiyangZhang 		if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
1814c86019ffSZiyangZhang 			goto out;
1815f236a214SMing Lei 		ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
1816fee32f31SMing Lei 		ublk_handle_need_get_data(ub, ub_cmd->q_id, ub_cmd->tag);
1817c86019ffSZiyangZhang 		break;
181871f28f31SMing Lei 	default:
181971f28f31SMing Lei 		goto out;
182071f28f31SMing Lei 	}
182171f28f31SMing Lei 	return -EIOCBQUEUED;
182271f28f31SMing Lei 
182371f28f31SMing Lei  out:
18249d2789acSJens Axboe 	io_uring_cmd_done(cmd, ret, 0, issue_flags);
182571f28f31SMing Lei 	pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
182671f28f31SMing Lei 			__func__, cmd_op, tag, ret, io->flags);
182771f28f31SMing Lei 	return -EIOCBQUEUED;
182871f28f31SMing Lei }
182971f28f31SMing Lei 
__ublk_check_and_get_req(struct ublk_device * ub,struct ublk_queue * ubq,int tag,size_t offset)183062fe99ceSMing Lei static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
183162fe99ceSMing Lei 		struct ublk_queue *ubq, int tag, size_t offset)
183262fe99ceSMing Lei {
183362fe99ceSMing Lei 	struct request *req;
183462fe99ceSMing Lei 
183562fe99ceSMing Lei 	if (!ublk_need_req_ref(ubq))
183662fe99ceSMing Lei 		return NULL;
183762fe99ceSMing Lei 
183862fe99ceSMing Lei 	req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
183962fe99ceSMing Lei 	if (!req)
184062fe99ceSMing Lei 		return NULL;
184162fe99ceSMing Lei 
184262fe99ceSMing Lei 	if (!ublk_get_req_ref(ubq, req))
184362fe99ceSMing Lei 		return NULL;
184462fe99ceSMing Lei 
184562fe99ceSMing Lei 	if (unlikely(!blk_mq_request_started(req) || req->tag != tag))
184662fe99ceSMing Lei 		goto fail_put;
184762fe99ceSMing Lei 
184862fe99ceSMing Lei 	if (!ublk_rq_has_data(req))
184962fe99ceSMing Lei 		goto fail_put;
185062fe99ceSMing Lei 
185162fe99ceSMing Lei 	if (offset > blk_rq_bytes(req))
185262fe99ceSMing Lei 		goto fail_put;
185362fe99ceSMing Lei 
185462fe99ceSMing Lei 	return req;
185562fe99ceSMing Lei fail_put:
185662fe99ceSMing Lei 	ublk_put_req_ref(ubq, req);
185762fe99ceSMing Lei 	return NULL;
185862fe99ceSMing Lei }
185962fe99ceSMing Lei 
ublk_ch_uring_cmd(struct io_uring_cmd * cmd,unsigned int issue_flags)18608c68ae3bSJens Axboe static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
18618c68ae3bSJens Axboe {
18628c68ae3bSJens Axboe 	/*
18638c68ae3bSJens Axboe 	 * Not necessary for async retry, but let's keep it simple and always
18648c68ae3bSJens Axboe 	 * copy the values to avoid any potential reuse.
18658c68ae3bSJens Axboe 	 */
186603e5cb7bSLinus Torvalds 	const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe);
186703e5cb7bSLinus Torvalds 	const struct ublksrv_io_cmd ub_cmd = {
186803e5cb7bSLinus Torvalds 		.q_id = READ_ONCE(ub_src->q_id),
186903e5cb7bSLinus Torvalds 		.tag = READ_ONCE(ub_src->tag),
187003e5cb7bSLinus Torvalds 		.result = READ_ONCE(ub_src->result),
187103e5cb7bSLinus Torvalds 		.addr = READ_ONCE(ub_src->addr)
187203e5cb7bSLinus Torvalds 	};
18738c68ae3bSJens Axboe 
18748c68ae3bSJens Axboe 	return __ublk_ch_uring_cmd(cmd, issue_flags, &ub_cmd);
18758c68ae3bSJens Axboe }
18768c68ae3bSJens Axboe 
ublk_check_ubuf_dir(const struct request * req,int ubuf_dir)187762fe99ceSMing Lei static inline bool ublk_check_ubuf_dir(const struct request *req,
187862fe99ceSMing Lei 		int ubuf_dir)
187962fe99ceSMing Lei {
188062fe99ceSMing Lei 	/* copy ubuf to request pages */
188129802d7cSAndreas Hindborg 	if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) &&
188229802d7cSAndreas Hindborg 	    ubuf_dir == ITER_SOURCE)
188362fe99ceSMing Lei 		return true;
188462fe99ceSMing Lei 
188562fe99ceSMing Lei 	/* copy request pages to ubuf */
188629802d7cSAndreas Hindborg 	if ((req_op(req) == REQ_OP_WRITE ||
188729802d7cSAndreas Hindborg 	     req_op(req) == REQ_OP_ZONE_APPEND) &&
188829802d7cSAndreas Hindborg 	    ubuf_dir == ITER_DEST)
188962fe99ceSMing Lei 		return true;
189062fe99ceSMing Lei 
189162fe99ceSMing Lei 	return false;
189262fe99ceSMing Lei }
189362fe99ceSMing Lei 
ublk_check_and_get_req(struct kiocb * iocb,struct iov_iter * iter,size_t * off,int dir)189462fe99ceSMing Lei static struct request *ublk_check_and_get_req(struct kiocb *iocb,
189562fe99ceSMing Lei 		struct iov_iter *iter, size_t *off, int dir)
189662fe99ceSMing Lei {
189762fe99ceSMing Lei 	struct ublk_device *ub = iocb->ki_filp->private_data;
189862fe99ceSMing Lei 	struct ublk_queue *ubq;
189962fe99ceSMing Lei 	struct request *req;
190062fe99ceSMing Lei 	size_t buf_off;
190162fe99ceSMing Lei 	u16 tag, q_id;
190262fe99ceSMing Lei 
190362fe99ceSMing Lei 	if (!ub)
190462fe99ceSMing Lei 		return ERR_PTR(-EACCES);
190562fe99ceSMing Lei 
190662fe99ceSMing Lei 	if (!user_backed_iter(iter))
190762fe99ceSMing Lei 		return ERR_PTR(-EACCES);
190862fe99ceSMing Lei 
190962fe99ceSMing Lei 	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
191062fe99ceSMing Lei 		return ERR_PTR(-EACCES);
191162fe99ceSMing Lei 
191262fe99ceSMing Lei 	tag = ublk_pos_to_tag(iocb->ki_pos);
191362fe99ceSMing Lei 	q_id = ublk_pos_to_hwq(iocb->ki_pos);
191462fe99ceSMing Lei 	buf_off = ublk_pos_to_buf_off(iocb->ki_pos);
191562fe99ceSMing Lei 
191662fe99ceSMing Lei 	if (q_id >= ub->dev_info.nr_hw_queues)
191762fe99ceSMing Lei 		return ERR_PTR(-EINVAL);
191862fe99ceSMing Lei 
191962fe99ceSMing Lei 	ubq = ublk_get_queue(ub, q_id);
192062fe99ceSMing Lei 	if (!ubq)
192162fe99ceSMing Lei 		return ERR_PTR(-EINVAL);
192262fe99ceSMing Lei 
192362fe99ceSMing Lei 	if (tag >= ubq->q_depth)
192462fe99ceSMing Lei 		return ERR_PTR(-EINVAL);
192562fe99ceSMing Lei 
192662fe99ceSMing Lei 	req = __ublk_check_and_get_req(ub, ubq, tag, buf_off);
192762fe99ceSMing Lei 	if (!req)
192862fe99ceSMing Lei 		return ERR_PTR(-EINVAL);
192962fe99ceSMing Lei 
193062fe99ceSMing Lei 	if (!req->mq_hctx || !req->mq_hctx->driver_data)
193162fe99ceSMing Lei 		goto fail;
193262fe99ceSMing Lei 
193362fe99ceSMing Lei 	if (!ublk_check_ubuf_dir(req, dir))
193462fe99ceSMing Lei 		goto fail;
193562fe99ceSMing Lei 
193662fe99ceSMing Lei 	*off = buf_off;
193762fe99ceSMing Lei 	return req;
193862fe99ceSMing Lei fail:
193962fe99ceSMing Lei 	ublk_put_req_ref(ubq, req);
194062fe99ceSMing Lei 	return ERR_PTR(-EACCES);
194162fe99ceSMing Lei }
194262fe99ceSMing Lei 
ublk_ch_read_iter(struct kiocb * iocb,struct iov_iter * to)194362fe99ceSMing Lei static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to)
194462fe99ceSMing Lei {
194562fe99ceSMing Lei 	struct ublk_queue *ubq;
194662fe99ceSMing Lei 	struct request *req;
194762fe99ceSMing Lei 	size_t buf_off;
194862fe99ceSMing Lei 	size_t ret;
194962fe99ceSMing Lei 
195062fe99ceSMing Lei 	req = ublk_check_and_get_req(iocb, to, &buf_off, ITER_DEST);
195162fe99ceSMing Lei 	if (IS_ERR(req))
195262fe99ceSMing Lei 		return PTR_ERR(req);
195362fe99ceSMing Lei 
195462fe99ceSMing Lei 	ret = ublk_copy_user_pages(req, buf_off, to, ITER_DEST);
195562fe99ceSMing Lei 	ubq = req->mq_hctx->driver_data;
195662fe99ceSMing Lei 	ublk_put_req_ref(ubq, req);
195762fe99ceSMing Lei 
195862fe99ceSMing Lei 	return ret;
195962fe99ceSMing Lei }
196062fe99ceSMing Lei 
ublk_ch_write_iter(struct kiocb * iocb,struct iov_iter * from)196162fe99ceSMing Lei static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from)
196262fe99ceSMing Lei {
196362fe99ceSMing Lei 	struct ublk_queue *ubq;
196462fe99ceSMing Lei 	struct request *req;
196562fe99ceSMing Lei 	size_t buf_off;
196662fe99ceSMing Lei 	size_t ret;
196762fe99ceSMing Lei 
196862fe99ceSMing Lei 	req = ublk_check_and_get_req(iocb, from, &buf_off, ITER_SOURCE);
196962fe99ceSMing Lei 	if (IS_ERR(req))
197062fe99ceSMing Lei 		return PTR_ERR(req);
197162fe99ceSMing Lei 
197262fe99ceSMing Lei 	ret = ublk_copy_user_pages(req, buf_off, from, ITER_SOURCE);
197362fe99ceSMing Lei 	ubq = req->mq_hctx->driver_data;
197462fe99ceSMing Lei 	ublk_put_req_ref(ubq, req);
197562fe99ceSMing Lei 
197662fe99ceSMing Lei 	return ret;
197762fe99ceSMing Lei }
197862fe99ceSMing Lei 
197971f28f31SMing Lei static const struct file_operations ublk_ch_fops = {
198071f28f31SMing Lei 	.owner = THIS_MODULE,
198171f28f31SMing Lei 	.open = ublk_ch_open,
198271f28f31SMing Lei 	.release = ublk_ch_release,
198371f28f31SMing Lei 	.llseek = no_llseek,
198462fe99ceSMing Lei 	.read_iter = ublk_ch_read_iter,
198562fe99ceSMing Lei 	.write_iter = ublk_ch_write_iter,
198671f28f31SMing Lei 	.uring_cmd = ublk_ch_uring_cmd,
198771f28f31SMing Lei 	.mmap = ublk_ch_mmap,
198871f28f31SMing Lei };
198971f28f31SMing Lei 
ublk_deinit_queue(struct ublk_device * ub,int q_id)199071f28f31SMing Lei static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
199171f28f31SMing Lei {
199271f28f31SMing Lei 	int size = ublk_queue_cmd_buf_size(ub, q_id);
199371f28f31SMing Lei 	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
199471f28f31SMing Lei 
199571f28f31SMing Lei 	if (ubq->ubq_daemon)
199671f28f31SMing Lei 		put_task_struct(ubq->ubq_daemon);
199771f28f31SMing Lei 	if (ubq->io_cmd_buf)
199871f28f31SMing Lei 		free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));
199971f28f31SMing Lei }
200071f28f31SMing Lei 
ublk_init_queue(struct ublk_device * ub,int q_id)200171f28f31SMing Lei static int ublk_init_queue(struct ublk_device *ub, int q_id)
200271f28f31SMing Lei {
200371f28f31SMing Lei 	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
200471f28f31SMing Lei 	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
200571f28f31SMing Lei 	void *ptr;
200671f28f31SMing Lei 	int size;
200771f28f31SMing Lei 
20088cb8ef0cSMing Lei 	spin_lock_init(&ubq->cancel_lock);
20096d8c5afcSMing Lei 	ubq->flags = ub->dev_info.flags;
201071f28f31SMing Lei 	ubq->q_id = q_id;
201171f28f31SMing Lei 	ubq->q_depth = ub->dev_info.queue_depth;
201271f28f31SMing Lei 	size = ublk_queue_cmd_buf_size(ub, q_id);
201371f28f31SMing Lei 
201471f28f31SMing Lei 	ptr = (void *) __get_free_pages(gfp_flags, get_order(size));
201571f28f31SMing Lei 	if (!ptr)
201671f28f31SMing Lei 		return -ENOMEM;
201771f28f31SMing Lei 
201871f28f31SMing Lei 	ubq->io_cmd_buf = ptr;
201971f28f31SMing Lei 	ubq->dev = ub;
202071f28f31SMing Lei 	return 0;
202171f28f31SMing Lei }
202271f28f31SMing Lei 
ublk_deinit_queues(struct ublk_device * ub)202371f28f31SMing Lei static void ublk_deinit_queues(struct ublk_device *ub)
202471f28f31SMing Lei {
202571f28f31SMing Lei 	int nr_queues = ub->dev_info.nr_hw_queues;
202671f28f31SMing Lei 	int i;
202771f28f31SMing Lei 
202871f28f31SMing Lei 	if (!ub->__queues)
202971f28f31SMing Lei 		return;
203071f28f31SMing Lei 
203171f28f31SMing Lei 	for (i = 0; i < nr_queues; i++)
203271f28f31SMing Lei 		ublk_deinit_queue(ub, i);
203371f28f31SMing Lei 	kfree(ub->__queues);
203471f28f31SMing Lei }
203571f28f31SMing Lei 
ublk_init_queues(struct ublk_device * ub)203671f28f31SMing Lei static int ublk_init_queues(struct ublk_device *ub)
203771f28f31SMing Lei {
203871f28f31SMing Lei 	int nr_queues = ub->dev_info.nr_hw_queues;
203971f28f31SMing Lei 	int depth = ub->dev_info.queue_depth;
204071f28f31SMing Lei 	int ubq_size = sizeof(struct ublk_queue) + depth * sizeof(struct ublk_io);
204171f28f31SMing Lei 	int i, ret = -ENOMEM;
204271f28f31SMing Lei 
204371f28f31SMing Lei 	ub->queue_size = ubq_size;
204471f28f31SMing Lei 	ub->__queues = kcalloc(nr_queues, ubq_size, GFP_KERNEL);
204571f28f31SMing Lei 	if (!ub->__queues)
204671f28f31SMing Lei 		return ret;
204771f28f31SMing Lei 
204871f28f31SMing Lei 	for (i = 0; i < nr_queues; i++) {
204971f28f31SMing Lei 		if (ublk_init_queue(ub, i))
205071f28f31SMing Lei 			goto fail;
205171f28f31SMing Lei 	}
205271f28f31SMing Lei 
205371f28f31SMing Lei 	init_completion(&ub->completion);
205471f28f31SMing Lei 	return 0;
205571f28f31SMing Lei 
205671f28f31SMing Lei  fail:
205771f28f31SMing Lei 	ublk_deinit_queues(ub);
205871f28f31SMing Lei 	return ret;
205971f28f31SMing Lei }
206071f28f31SMing Lei 
ublk_alloc_dev_number(struct ublk_device * ub,int idx)2061fa9482e0SChristoph Hellwig static int ublk_alloc_dev_number(struct ublk_device *ub, int idx)
206271f28f31SMing Lei {
206371f28f31SMing Lei 	int i = idx;
206471f28f31SMing Lei 	int err;
206571f28f31SMing Lei 
206671f28f31SMing Lei 	spin_lock(&ublk_idr_lock);
206771f28f31SMing Lei 	/* allocate id, if @id >= 0, we're requesting that specific id */
206871f28f31SMing Lei 	if (i >= 0) {
206971f28f31SMing Lei 		err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT);
207071f28f31SMing Lei 		if (err == -ENOSPC)
207171f28f31SMing Lei 			err = -EEXIST;
207271f28f31SMing Lei 	} else {
207371f28f31SMing Lei 		err = idr_alloc(&ublk_index_idr, ub, 0, 0, GFP_NOWAIT);
207471f28f31SMing Lei 	}
207571f28f31SMing Lei 	spin_unlock(&ublk_idr_lock);
207671f28f31SMing Lei 
207771f28f31SMing Lei 	if (err >= 0)
207871f28f31SMing Lei 		ub->ub_number = err;
207971f28f31SMing Lei 
208071f28f31SMing Lei 	return err;
208171f28f31SMing Lei }
208271f28f31SMing Lei 
ublk_free_dev_number(struct ublk_device * ub)2083fa9482e0SChristoph Hellwig static void ublk_free_dev_number(struct ublk_device *ub)
208471f28f31SMing Lei {
208571f28f31SMing Lei 	spin_lock(&ublk_idr_lock);
208671f28f31SMing Lei 	idr_remove(&ublk_index_idr, ub->ub_number);
208771f28f31SMing Lei 	wake_up_all(&ublk_idr_wq);
208871f28f31SMing Lei 	spin_unlock(&ublk_idr_lock);
208971f28f31SMing Lei }
209071f28f31SMing Lei 
ublk_cdev_rel(struct device * dev)209171f28f31SMing Lei static void ublk_cdev_rel(struct device *dev)
209271f28f31SMing Lei {
209371f28f31SMing Lei 	struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);
209471f28f31SMing Lei 
209571f28f31SMing Lei 	blk_mq_free_tag_set(&ub->tag_set);
209671f28f31SMing Lei 	ublk_deinit_queues(ub);
2097fa9482e0SChristoph Hellwig 	ublk_free_dev_number(ub);
2098fa9482e0SChristoph Hellwig 	mutex_destroy(&ub->mutex);
2099fa9482e0SChristoph Hellwig 	kfree(ub);
210071f28f31SMing Lei }
210171f28f31SMing Lei 
ublk_add_chdev(struct ublk_device * ub)210271f28f31SMing Lei static int ublk_add_chdev(struct ublk_device *ub)
210371f28f31SMing Lei {
210471f28f31SMing Lei 	struct device *dev = &ub->cdev_dev;
210571f28f31SMing Lei 	int minor = ub->ub_number;
210671f28f31SMing Lei 	int ret;
210771f28f31SMing Lei 
210871f28f31SMing Lei 	dev->parent = ublk_misc.this_device;
210971f28f31SMing Lei 	dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor);
21102eefd399SIvan Orlov 	dev->class = &ublk_chr_class;
211171f28f31SMing Lei 	dev->release = ublk_cdev_rel;
211271f28f31SMing Lei 	device_initialize(dev);
211371f28f31SMing Lei 
211471f28f31SMing Lei 	ret = dev_set_name(dev, "ublkc%d", minor);
211571f28f31SMing Lei 	if (ret)
211671f28f31SMing Lei 		goto fail;
211771f28f31SMing Lei 
211871f28f31SMing Lei 	cdev_init(&ub->cdev, &ublk_ch_fops);
211971f28f31SMing Lei 	ret = cdev_device_add(&ub->cdev, dev);
212071f28f31SMing Lei 	if (ret)
212171f28f31SMing Lei 		goto fail;
2122403ebc87SMing Lei 
2123403ebc87SMing Lei 	ublks_added++;
212471f28f31SMing Lei 	return 0;
212571f28f31SMing Lei  fail:
212671f28f31SMing Lei 	put_device(dev);
212771f28f31SMing Lei 	return ret;
212871f28f31SMing Lei }
212971f28f31SMing Lei 
ublk_stop_work_fn(struct work_struct * work)213071f28f31SMing Lei static void ublk_stop_work_fn(struct work_struct *work)
213171f28f31SMing Lei {
213271f28f31SMing Lei 	struct ublk_device *ub =
213371f28f31SMing Lei 		container_of(work, struct ublk_device, stop_work);
213471f28f31SMing Lei 
213571f28f31SMing Lei 	ublk_stop_dev(ub);
213671f28f31SMing Lei }
213771f28f31SMing Lei 
21384bf9cbf3SMing Lei /* align max io buffer size with PAGE_SIZE */
ublk_align_max_io_size(struct ublk_device * ub)21396d9e6dfdSChristoph Hellwig static void ublk_align_max_io_size(struct ublk_device *ub)
214071f28f31SMing Lei {
21414bf9cbf3SMing Lei 	unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes;
214271f28f31SMing Lei 
21434bf9cbf3SMing Lei 	ub->dev_info.max_io_buf_bytes =
21444bf9cbf3SMing Lei 		round_down(max_io_bytes, PAGE_SIZE);
214571f28f31SMing Lei }
214671f28f31SMing Lei 
ublk_add_tag_set(struct ublk_device * ub)2147fa9482e0SChristoph Hellwig static int ublk_add_tag_set(struct ublk_device *ub)
214871f28f31SMing Lei {
214971f28f31SMing Lei 	ub->tag_set.ops = &ublk_mq_ops;
215071f28f31SMing Lei 	ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
215171f28f31SMing Lei 	ub->tag_set.queue_depth = ub->dev_info.queue_depth;
215271f28f31SMing Lei 	ub->tag_set.numa_node = NUMA_NO_NODE;
21530edb3696SMing Lei 	ub->tag_set.cmd_size = sizeof(struct ublk_rq_data);
215471f28f31SMing Lei 	ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
215571f28f31SMing Lei 	ub->tag_set.driver_data = ub;
2156fa9482e0SChristoph Hellwig 	return blk_mq_alloc_tag_set(&ub->tag_set);
215771f28f31SMing Lei }
215871f28f31SMing Lei 
ublk_remove(struct ublk_device * ub)215971f28f31SMing Lei static void ublk_remove(struct ublk_device *ub)
216071f28f31SMing Lei {
216134d8f2beSChristoph Hellwig 	ublk_stop_dev(ub);
216234d8f2beSChristoph Hellwig 	cancel_work_sync(&ub->stop_work);
2163bbae8d1fSZiyangZhang 	cancel_work_sync(&ub->quiesce_work);
216471f28f31SMing Lei 	cdev_device_del(&ub->cdev, &ub->cdev_dev);
216571f28f31SMing Lei 	put_device(&ub->cdev_dev);
2166403ebc87SMing Lei 	ublks_added--;
216771f28f31SMing Lei }
216871f28f31SMing Lei 
ublk_get_device_from_id(int idx)216971f28f31SMing Lei static struct ublk_device *ublk_get_device_from_id(int idx)
217071f28f31SMing Lei {
217171f28f31SMing Lei 	struct ublk_device *ub = NULL;
217271f28f31SMing Lei 
217371f28f31SMing Lei 	if (idx < 0)
217471f28f31SMing Lei 		return NULL;
217571f28f31SMing Lei 
217671f28f31SMing Lei 	spin_lock(&ublk_idr_lock);
217771f28f31SMing Lei 	ub = idr_find(&ublk_index_idr, idx);
217871f28f31SMing Lei 	if (ub)
217971f28f31SMing Lei 		ub = ublk_get_device(ub);
218071f28f31SMing Lei 	spin_unlock(&ublk_idr_lock);
218171f28f31SMing Lei 
218271f28f31SMing Lei 	return ub;
218371f28f31SMing Lei }
218471f28f31SMing Lei 
ublk_ctrl_start_dev(struct ublk_device * ub,struct io_uring_cmd * cmd)2185bfbcef03SMing Lei static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
218671f28f31SMing Lei {
2187fd9b8547SBreno Leitao 	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
218871f28f31SMing Lei 	int ublksrv_pid = (int)header->data[0];
21896d9e6dfdSChristoph Hellwig 	struct gendisk *disk;
219034d8f2beSChristoph Hellwig 	int ret = -EINVAL;
219171f28f31SMing Lei 
219271f28f31SMing Lei 	if (ublksrv_pid <= 0)
219334d8f2beSChristoph Hellwig 		return -EINVAL;
219434d8f2beSChristoph Hellwig 
219553e7d08fSMing Lei 	if (wait_for_completion_interruptible(&ub->completion) != 0)
219653e7d08fSMing Lei 		return -EINTR;
219771f28f31SMing Lei 
219871f28f31SMing Lei 	schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD);
219971f28f31SMing Lei 
220071f28f31SMing Lei 	mutex_lock(&ub->mutex);
22016d9e6dfdSChristoph Hellwig 	if (ub->dev_info.state == UBLK_S_DEV_LIVE ||
22026d9e6dfdSChristoph Hellwig 	    test_bit(UB_STATE_USED, &ub->state)) {
220334d8f2beSChristoph Hellwig 		ret = -EEXIST;
220434d8f2beSChristoph Hellwig 		goto out_unlock;
220534d8f2beSChristoph Hellwig 	}
220634d8f2beSChristoph Hellwig 
22071972d038SZiyang Zhang 	disk = blk_mq_alloc_disk(&ub->tag_set, NULL);
22086d9e6dfdSChristoph Hellwig 	if (IS_ERR(disk)) {
22096d9e6dfdSChristoph Hellwig 		ret = PTR_ERR(disk);
22106d9e6dfdSChristoph Hellwig 		goto out_unlock;
22116d9e6dfdSChristoph Hellwig 	}
22126d9e6dfdSChristoph Hellwig 	sprintf(disk->disk_name, "ublkb%d", ub->ub_number);
22136d9e6dfdSChristoph Hellwig 	disk->fops = &ub_fops;
22146d9e6dfdSChristoph Hellwig 	disk->private_data = ub;
22156d9e6dfdSChristoph Hellwig 
22166d9e6dfdSChristoph Hellwig 	ub->dev_info.ublksrv_pid = ublksrv_pid;
22176d9e6dfdSChristoph Hellwig 	ub->ub_disk = disk;
22180aa73170SMing Lei 
22190aa73170SMing Lei 	ret = ublk_apply_params(ub);
22200aa73170SMing Lei 	if (ret)
22210aa73170SMing Lei 		goto out_put_disk;
22220aa73170SMing Lei 
222373a166d9SMing Lei 	/* don't probe partitions if any one ubq daemon is un-trusted */
222473a166d9SMing Lei 	if (ub->nr_privileged_daemon != ub->nr_queues_ready)
222573a166d9SMing Lei 		set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
222673a166d9SMing Lei 
22276d9e6dfdSChristoph Hellwig 	get_device(&ub->cdev_dev);
22284985e7b2SMing Lei 	ub->dev_info.state = UBLK_S_DEV_LIVE;
222929802d7cSAndreas Hindborg 
223029802d7cSAndreas Hindborg 	if (ublk_dev_is_zoned(ub)) {
223129802d7cSAndreas Hindborg 		ret = ublk_revalidate_disk_zones(ub);
223229802d7cSAndreas Hindborg 		if (ret)
223329802d7cSAndreas Hindborg 			goto out_put_cdev;
223429802d7cSAndreas Hindborg 	}
223529802d7cSAndreas Hindborg 
22366d9e6dfdSChristoph Hellwig 	ret = add_disk(disk);
223729802d7cSAndreas Hindborg 	if (ret)
223829802d7cSAndreas Hindborg 		goto out_put_cdev;
223929802d7cSAndreas Hindborg 
224029802d7cSAndreas Hindborg 	set_bit(UB_STATE_USED, &ub->state);
224129802d7cSAndreas Hindborg 
224229802d7cSAndreas Hindborg out_put_cdev:
22436d9e6dfdSChristoph Hellwig 	if (ret) {
22444985e7b2SMing Lei 		ub->dev_info.state = UBLK_S_DEV_DEAD;
224593d71ec8SMing Lei 		ublk_put_device(ub);
22466d9e6dfdSChristoph Hellwig 	}
22470aa73170SMing Lei out_put_disk:
22480aa73170SMing Lei 	if (ret)
22490aa73170SMing Lei 		put_disk(disk);
225034d8f2beSChristoph Hellwig out_unlock:
225134d8f2beSChristoph Hellwig 	mutex_unlock(&ub->mutex);
225271f28f31SMing Lei 	return ret;
225371f28f31SMing Lei }
225471f28f31SMing Lei 
ublk_ctrl_get_queue_affinity(struct ublk_device * ub,struct io_uring_cmd * cmd)2255bfbcef03SMing Lei static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub,
2256bfbcef03SMing Lei 		struct io_uring_cmd *cmd)
225771f28f31SMing Lei {
2258fd9b8547SBreno Leitao 	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
225971f28f31SMing Lei 	void __user *argp = (void __user *)(unsigned long)header->addr;
2260c50061f0SChristoph Hellwig 	cpumask_var_t cpumask;
226171f28f31SMing Lei 	unsigned long queue;
226271f28f31SMing Lei 	unsigned int retlen;
2263c50061f0SChristoph Hellwig 	unsigned int i;
2264bfbcef03SMing Lei 	int ret;
226571f28f31SMing Lei 
226634d8f2beSChristoph Hellwig 	if (header->len * BITS_PER_BYTE < nr_cpu_ids)
226734d8f2beSChristoph Hellwig 		return -EINVAL;
226834d8f2beSChristoph Hellwig 	if (header->len & (sizeof(unsigned long)-1))
226934d8f2beSChristoph Hellwig 		return -EINVAL;
227034d8f2beSChristoph Hellwig 	if (!header->addr)
227134d8f2beSChristoph Hellwig 		return -EINVAL;
227234d8f2beSChristoph Hellwig 
227371f28f31SMing Lei 	queue = header->data[0];
227471f28f31SMing Lei 	if (queue >= ub->dev_info.nr_hw_queues)
2275bfbcef03SMing Lei 		return -EINVAL;
227671f28f31SMing Lei 
2277c50061f0SChristoph Hellwig 	if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
2278bfbcef03SMing Lei 		return -ENOMEM;
2279c50061f0SChristoph Hellwig 
2280c50061f0SChristoph Hellwig 	for_each_possible_cpu(i) {
2281c50061f0SChristoph Hellwig 		if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue)
2282c50061f0SChristoph Hellwig 			cpumask_set_cpu(i, cpumask);
2283c50061f0SChristoph Hellwig 	}
2284c50061f0SChristoph Hellwig 
2285c50061f0SChristoph Hellwig 	ret = -EFAULT;
228671f28f31SMing Lei 	retlen = min_t(unsigned short, header->len, cpumask_size());
2287c50061f0SChristoph Hellwig 	if (copy_to_user(argp, cpumask, retlen))
2288c50061f0SChristoph Hellwig 		goto out_free_cpumask;
2289c50061f0SChristoph Hellwig 	if (retlen != header->len &&
2290c50061f0SChristoph Hellwig 	    clear_user(argp + retlen, header->len - retlen))
2291c50061f0SChristoph Hellwig 		goto out_free_cpumask;
2292c50061f0SChristoph Hellwig 
229371f28f31SMing Lei 	ret = 0;
2294c50061f0SChristoph Hellwig out_free_cpumask:
2295c50061f0SChristoph Hellwig 	free_cpumask_var(cpumask);
229671f28f31SMing Lei 	return ret;
229771f28f31SMing Lei }
229871f28f31SMing Lei 
ublk_dump_dev_info(struct ublksrv_ctrl_dev_info * info)229934d8f2beSChristoph Hellwig static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
230071f28f31SMing Lei {
230134d8f2beSChristoph Hellwig 	pr_devel("%s: dev id %d flags %llx\n", __func__,
23026d8c5afcSMing Lei 			info->dev_id, info->flags);
23034bf9cbf3SMing Lei 	pr_devel("\t nr_hw_queues %d queue_depth %d\n",
23044bf9cbf3SMing Lei 			info->nr_hw_queues, info->queue_depth);
230534d8f2beSChristoph Hellwig }
230634d8f2beSChristoph Hellwig 
ublk_ctrl_add_dev(struct io_uring_cmd * cmd)230734d8f2beSChristoph Hellwig static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
230834d8f2beSChristoph Hellwig {
2309fd9b8547SBreno Leitao 	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
231034d8f2beSChristoph Hellwig 	void __user *argp = (void __user *)(unsigned long)header->addr;
231134d8f2beSChristoph Hellwig 	struct ublksrv_ctrl_dev_info info;
231271f28f31SMing Lei 	struct ublk_device *ub;
231334d8f2beSChristoph Hellwig 	int ret = -EINVAL;
231434d8f2beSChristoph Hellwig 
231534d8f2beSChristoph Hellwig 	if (header->len < sizeof(info) || !header->addr)
231634d8f2beSChristoph Hellwig 		return -EINVAL;
231734d8f2beSChristoph Hellwig 	if (header->queue_id != (u16)-1) {
231834d8f2beSChristoph Hellwig 		pr_warn("%s: queue_id is wrong %x\n",
231934d8f2beSChristoph Hellwig 			__func__, header->queue_id);
232034d8f2beSChristoph Hellwig 		return -EINVAL;
232134d8f2beSChristoph Hellwig 	}
23224093cb5aSMing Lei 
232334d8f2beSChristoph Hellwig 	if (copy_from_user(&info, argp, sizeof(info)))
232434d8f2beSChristoph Hellwig 		return -EFAULT;
23254093cb5aSMing Lei 
23264093cb5aSMing Lei 	if (capable(CAP_SYS_ADMIN))
23274093cb5aSMing Lei 		info.flags &= ~UBLK_F_UNPRIVILEGED_DEV;
23284093cb5aSMing Lei 	else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV))
23294093cb5aSMing Lei 		return -EPERM;
23304093cb5aSMing Lei 
2331c0b79b0fSMing Lei 	/*
2332c0b79b0fSMing Lei 	 * unprivileged device can't be trusted, but RECOVERY and
2333c0b79b0fSMing Lei 	 * RECOVERY_REISSUE still may hang error handling, so can't
2334c0b79b0fSMing Lei 	 * support recovery features for unprivileged ublk now
2335c0b79b0fSMing Lei 	 *
2336c0b79b0fSMing Lei 	 * TODO: provide forward progress for RECOVERY handler, so that
2337c0b79b0fSMing Lei 	 * unprivileged device can benefit from it
2338c0b79b0fSMing Lei 	 */
23396414ab5cSMing Lei 	if (info.flags & UBLK_F_UNPRIVILEGED_DEV) {
2340c0b79b0fSMing Lei 		info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE |
2341c0b79b0fSMing Lei 				UBLK_F_USER_RECOVERY);
2342c0b79b0fSMing Lei 
23436414ab5cSMing Lei 		/*
23446414ab5cSMing Lei 		 * For USER_COPY, we depends on userspace to fill request
23456414ab5cSMing Lei 		 * buffer by pwrite() to ublk char device, which can't be
23466414ab5cSMing Lei 		 * used for unprivileged device
23476414ab5cSMing Lei 		 */
23486414ab5cSMing Lei 		if (info.flags & UBLK_F_USER_COPY)
23496414ab5cSMing Lei 			return -EINVAL;
23506414ab5cSMing Lei 	}
23516414ab5cSMing Lei 
23524093cb5aSMing Lei 	/* the created device is always owned by current user */
235348a90519SMing Lei 	ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid);
23544093cb5aSMing Lei 
235534d8f2beSChristoph Hellwig 	if (header->dev_id != info.dev_id) {
235634d8f2beSChristoph Hellwig 		pr_warn("%s: dev id not match %u %u\n",
235734d8f2beSChristoph Hellwig 			__func__, header->dev_id, info.dev_id);
235834d8f2beSChristoph Hellwig 		return -EINVAL;
235934d8f2beSChristoph Hellwig 	}
236071f28f31SMing Lei 
23614093cb5aSMing Lei 	ublk_dump_dev_info(&info);
23624093cb5aSMing Lei 
236371f28f31SMing Lei 	ret = mutex_lock_killable(&ublk_ctl_mutex);
236471f28f31SMing Lei 	if (ret)
236571f28f31SMing Lei 		return ret;
236671f28f31SMing Lei 
2367403ebc87SMing Lei 	ret = -EACCES;
2368403ebc87SMing Lei 	if (ublks_added >= ublks_max)
2369403ebc87SMing Lei 		goto out_unlock;
2370403ebc87SMing Lei 
2371cfee7e4dSChristoph Hellwig 	ret = -ENOMEM;
2372cfee7e4dSChristoph Hellwig 	ub = kzalloc(sizeof(*ub), GFP_KERNEL);
2373cfee7e4dSChristoph Hellwig 	if (!ub)
2374cfee7e4dSChristoph Hellwig 		goto out_unlock;
2375fa9482e0SChristoph Hellwig 	mutex_init(&ub->mutex);
2376fa9482e0SChristoph Hellwig 	spin_lock_init(&ub->mm_lock);
2377bbae8d1fSZiyangZhang 	INIT_WORK(&ub->quiesce_work, ublk_quiesce_work_fn);
2378fa9482e0SChristoph Hellwig 	INIT_WORK(&ub->stop_work, ublk_stop_work_fn);
2379fa9482e0SChristoph Hellwig 	INIT_DELAYED_WORK(&ub->monitor_work, ublk_daemon_monitor_work);
2380cfee7e4dSChristoph Hellwig 
2381fa9482e0SChristoph Hellwig 	ret = ublk_alloc_dev_number(ub, header->dev_id);
2382fa9482e0SChristoph Hellwig 	if (ret < 0)
2383fa9482e0SChristoph Hellwig 		goto out_free_ub;
238434d8f2beSChristoph Hellwig 
238534d8f2beSChristoph Hellwig 	memcpy(&ub->dev_info, &info, sizeof(info));
238671f28f31SMing Lei 
238771f28f31SMing Lei 	/* update device id */
238871f28f31SMing Lei 	ub->dev_info.dev_id = ub->ub_number;
238971f28f31SMing Lei 
23906d8c5afcSMing Lei 	/*
23916d8c5afcSMing Lei 	 * 64bit flags will be copied back to userspace as feature
23926d8c5afcSMing Lei 	 * negotiation result, so have to clear flags which driver
23936d8c5afcSMing Lei 	 * doesn't support yet, then userspace can get correct flags
23946d8c5afcSMing Lei 	 * (features) to handle.
23956d8c5afcSMing Lei 	 */
23966d8c5afcSMing Lei 	ub->dev_info.flags &= UBLK_F_ALL;
23976d8c5afcSMing Lei 
239829dc5d06SMing Lei 	ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE |
239929dc5d06SMing Lei 		UBLK_F_URING_CMD_COMP_IN_TASK;
2400224e858fSMing Lei 
24011172d5b8SMing Lei 	/* GET_DATA isn't needed any more with USER_COPY */
24029d4ed6d4SAndreas Hindborg 	if (ublk_dev_is_user_copy(ub))
24031172d5b8SMing Lei 		ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
24042d786e66SMing Lei 
240529802d7cSAndreas Hindborg 	/* Zoned storage support requires user copy feature */
240629802d7cSAndreas Hindborg 	if (ublk_dev_is_zoned(ub) &&
240729802d7cSAndreas Hindborg 	    (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !ublk_dev_is_user_copy(ub))) {
240829802d7cSAndreas Hindborg 		ret = -EINVAL;
240929802d7cSAndreas Hindborg 		goto out_free_dev_number;
241029802d7cSAndreas Hindborg 	}
241129802d7cSAndreas Hindborg 
2412fa9482e0SChristoph Hellwig 	/* We are not ready to support zero copy */
24136d8c5afcSMing Lei 	ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
2414fa9482e0SChristoph Hellwig 
2415fa9482e0SChristoph Hellwig 	ub->dev_info.nr_hw_queues = min_t(unsigned int,
2416fa9482e0SChristoph Hellwig 			ub->dev_info.nr_hw_queues, nr_cpu_ids);
2417fa9482e0SChristoph Hellwig 	ublk_align_max_io_size(ub);
2418fa9482e0SChristoph Hellwig 
2419fa9482e0SChristoph Hellwig 	ret = ublk_init_queues(ub);
242034d8f2beSChristoph Hellwig 	if (ret)
2421fa9482e0SChristoph Hellwig 		goto out_free_dev_number;
2422fa9482e0SChristoph Hellwig 
2423fa9482e0SChristoph Hellwig 	ret = ublk_add_tag_set(ub);
2424fa9482e0SChristoph Hellwig 	if (ret)
2425fa9482e0SChristoph Hellwig 		goto out_deinit_queues;
2426fa9482e0SChristoph Hellwig 
2427fa9482e0SChristoph Hellwig 	ret = -EFAULT;
2428fa9482e0SChristoph Hellwig 	if (copy_to_user(argp, &ub->dev_info, sizeof(info)))
2429fa9482e0SChristoph Hellwig 		goto out_free_tag_set;
2430fa9482e0SChristoph Hellwig 
2431fa9482e0SChristoph Hellwig 	/*
2432fa9482e0SChristoph Hellwig 	 * Add the char dev so that ublksrv daemon can be setup.
2433fa9482e0SChristoph Hellwig 	 * ublk_add_chdev() will cleanup everything if it fails.
2434fa9482e0SChristoph Hellwig 	 */
2435fa9482e0SChristoph Hellwig 	ret = ublk_add_chdev(ub);
243634d8f2beSChristoph Hellwig 	goto out_unlock;
243734d8f2beSChristoph Hellwig 
2438fa9482e0SChristoph Hellwig out_free_tag_set:
2439fa9482e0SChristoph Hellwig 	blk_mq_free_tag_set(&ub->tag_set);
2440fa9482e0SChristoph Hellwig out_deinit_queues:
2441fa9482e0SChristoph Hellwig 	ublk_deinit_queues(ub);
2442fa9482e0SChristoph Hellwig out_free_dev_number:
2443fa9482e0SChristoph Hellwig 	ublk_free_dev_number(ub);
2444fa9482e0SChristoph Hellwig out_free_ub:
2445fa9482e0SChristoph Hellwig 	mutex_destroy(&ub->mutex);
2446fa9482e0SChristoph Hellwig 	kfree(ub);
244734d8f2beSChristoph Hellwig out_unlock:
244871f28f31SMing Lei 	mutex_unlock(&ublk_ctl_mutex);
244971f28f31SMing Lei 	return ret;
245071f28f31SMing Lei }
245171f28f31SMing Lei 
ublk_idr_freed(int id)245271f28f31SMing Lei static inline bool ublk_idr_freed(int id)
245371f28f31SMing Lei {
245471f28f31SMing Lei 	void *ptr;
245571f28f31SMing Lei 
245671f28f31SMing Lei 	spin_lock(&ublk_idr_lock);
245771f28f31SMing Lei 	ptr = idr_find(&ublk_index_idr, id);
245871f28f31SMing Lei 	spin_unlock(&ublk_idr_lock);
245971f28f31SMing Lei 
246071f28f31SMing Lei 	return ptr == NULL;
246171f28f31SMing Lei }
246271f28f31SMing Lei 
ublk_ctrl_del_dev(struct ublk_device ** p_ub)2463bfbcef03SMing Lei static int ublk_ctrl_del_dev(struct ublk_device **p_ub)
246471f28f31SMing Lei {
2465bfbcef03SMing Lei 	struct ublk_device *ub = *p_ub;
2466bfbcef03SMing Lei 	int idx = ub->ub_number;
246771f28f31SMing Lei 	int ret;
246871f28f31SMing Lei 
246971f28f31SMing Lei 	ret = mutex_lock_killable(&ublk_ctl_mutex);
247071f28f31SMing Lei 	if (ret)
247171f28f31SMing Lei 		return ret;
247271f28f31SMing Lei 
24730abe39deSMing Lei 	if (!test_bit(UB_STATE_DELETED, &ub->state)) {
247471f28f31SMing Lei 		ublk_remove(ub);
24750abe39deSMing Lei 		set_bit(UB_STATE_DELETED, &ub->state);
247671f28f31SMing Lei 	}
247771f28f31SMing Lei 
2478bfbcef03SMing Lei 	/* Mark the reference as consumed */
2479bfbcef03SMing Lei 	*p_ub = NULL;
248071f28f31SMing Lei 	ublk_put_device(ub);
24810abe39deSMing Lei 	mutex_unlock(&ublk_ctl_mutex);
248271f28f31SMing Lei 
248371f28f31SMing Lei 	/*
248471f28f31SMing Lei 	 * Wait until the idr is removed, then it can be reused after
248571f28f31SMing Lei 	 * DEL_DEV command is returned.
24860abe39deSMing Lei 	 *
24870abe39deSMing Lei 	 * If we returns because of user interrupt, future delete command
24880abe39deSMing Lei 	 * may come:
24890abe39deSMing Lei 	 *
24900abe39deSMing Lei 	 * - the device number isn't freed, this device won't or needn't
24910abe39deSMing Lei 	 *   be deleted again, since UB_STATE_DELETED is set, and device
24920abe39deSMing Lei 	 *   will be released after the last reference is dropped
24930abe39deSMing Lei 	 *
24940abe39deSMing Lei 	 * - the device number is freed already, we will not find this
24950abe39deSMing Lei 	 *   device via ublk_get_device_from_id()
249671f28f31SMing Lei 	 */
24973e9dce80SMing Lei 	if (wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx)))
24983e9dce80SMing Lei 		return -EINTR;
24990abe39deSMing Lei 	return 0;
250071f28f31SMing Lei }
250171f28f31SMing Lei 
ublk_ctrl_cmd_dump(struct io_uring_cmd * cmd)250271f28f31SMing Lei static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd)
250371f28f31SMing Lei {
2504fd9b8547SBreno Leitao 	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
250571f28f31SMing Lei 
250671f28f31SMing Lei 	pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n",
250771f28f31SMing Lei 			__func__, cmd->cmd_op, header->dev_id, header->queue_id,
250871f28f31SMing Lei 			header->data[0], header->addr, header->len);
250971f28f31SMing Lei }
251071f28f31SMing Lei 
ublk_ctrl_stop_dev(struct ublk_device * ub)2511bfbcef03SMing Lei static int ublk_ctrl_stop_dev(struct ublk_device *ub)
251271f28f31SMing Lei {
251334d8f2beSChristoph Hellwig 	ublk_stop_dev(ub);
251434d8f2beSChristoph Hellwig 	cancel_work_sync(&ub->stop_work);
2515bbae8d1fSZiyangZhang 	cancel_work_sync(&ub->quiesce_work);
251671f28f31SMing Lei 
251771f28f31SMing Lei 	return 0;
251871f28f31SMing Lei }
251971f28f31SMing Lei 
ublk_ctrl_get_dev_info(struct ublk_device * ub,struct io_uring_cmd * cmd)2520bfbcef03SMing Lei static int ublk_ctrl_get_dev_info(struct ublk_device *ub,
2521bfbcef03SMing Lei 		struct io_uring_cmd *cmd)
252234d8f2beSChristoph Hellwig {
2523fd9b8547SBreno Leitao 	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
252434d8f2beSChristoph Hellwig 	void __user *argp = (void __user *)(unsigned long)header->addr;
252534d8f2beSChristoph Hellwig 
252634d8f2beSChristoph Hellwig 	if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr)
252734d8f2beSChristoph Hellwig 		return -EINVAL;
252834d8f2beSChristoph Hellwig 
252934d8f2beSChristoph Hellwig 	if (copy_to_user(argp, &ub->dev_info, sizeof(ub->dev_info)))
2530bfbcef03SMing Lei 		return -EFAULT;
253134d8f2beSChristoph Hellwig 
2532bfbcef03SMing Lei 	return 0;
253334d8f2beSChristoph Hellwig }
253434d8f2beSChristoph Hellwig 
2535abb864d3SMing Lei /* TYPE_DEVT is readonly, so fill it up before returning to userspace */
ublk_ctrl_fill_params_devt(struct ublk_device * ub)2536abb864d3SMing Lei static void ublk_ctrl_fill_params_devt(struct ublk_device *ub)
2537abb864d3SMing Lei {
2538abb864d3SMing Lei 	ub->params.devt.char_major = MAJOR(ub->cdev_dev.devt);
2539abb864d3SMing Lei 	ub->params.devt.char_minor = MINOR(ub->cdev_dev.devt);
2540abb864d3SMing Lei 
2541abb864d3SMing Lei 	if (ub->ub_disk) {
2542abb864d3SMing Lei 		ub->params.devt.disk_major = MAJOR(disk_devt(ub->ub_disk));
2543abb864d3SMing Lei 		ub->params.devt.disk_minor = MINOR(disk_devt(ub->ub_disk));
2544abb864d3SMing Lei 	} else {
2545abb864d3SMing Lei 		ub->params.devt.disk_major = 0;
2546abb864d3SMing Lei 		ub->params.devt.disk_minor = 0;
2547abb864d3SMing Lei 	}
2548abb864d3SMing Lei 	ub->params.types |= UBLK_PARAM_TYPE_DEVT;
2549abb864d3SMing Lei }
2550abb864d3SMing Lei 
ublk_ctrl_get_params(struct ublk_device * ub,struct io_uring_cmd * cmd)2551bfbcef03SMing Lei static int ublk_ctrl_get_params(struct ublk_device *ub,
2552bfbcef03SMing Lei 		struct io_uring_cmd *cmd)
25530aa73170SMing Lei {
2554fd9b8547SBreno Leitao 	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
25550aa73170SMing Lei 	void __user *argp = (void __user *)(unsigned long)header->addr;
25560aa73170SMing Lei 	struct ublk_params_header ph;
25570aa73170SMing Lei 	int ret;
25580aa73170SMing Lei 
25590aa73170SMing Lei 	if (header->len <= sizeof(ph) || !header->addr)
25600aa73170SMing Lei 		return -EINVAL;
25610aa73170SMing Lei 
25620aa73170SMing Lei 	if (copy_from_user(&ph, argp, sizeof(ph)))
25630aa73170SMing Lei 		return -EFAULT;
25640aa73170SMing Lei 
25650aa73170SMing Lei 	if (ph.len > header->len || !ph.len)
25660aa73170SMing Lei 		return -EINVAL;
25670aa73170SMing Lei 
25680aa73170SMing Lei 	if (ph.len > sizeof(struct ublk_params))
25690aa73170SMing Lei 		ph.len = sizeof(struct ublk_params);
25700aa73170SMing Lei 
25710aa73170SMing Lei 	mutex_lock(&ub->mutex);
2572abb864d3SMing Lei 	ublk_ctrl_fill_params_devt(ub);
25730aa73170SMing Lei 	if (copy_to_user(argp, &ub->params, ph.len))
25740aa73170SMing Lei 		ret = -EFAULT;
25750aa73170SMing Lei 	else
25760aa73170SMing Lei 		ret = 0;
25770aa73170SMing Lei 	mutex_unlock(&ub->mutex);
25780aa73170SMing Lei 
25790aa73170SMing Lei 	return ret;
25800aa73170SMing Lei }
25810aa73170SMing Lei 
ublk_ctrl_set_params(struct ublk_device * ub,struct io_uring_cmd * cmd)2582bfbcef03SMing Lei static int ublk_ctrl_set_params(struct ublk_device *ub,
2583bfbcef03SMing Lei 		struct io_uring_cmd *cmd)
25840aa73170SMing Lei {
2585fd9b8547SBreno Leitao 	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
25860aa73170SMing Lei 	void __user *argp = (void __user *)(unsigned long)header->addr;
25870aa73170SMing Lei 	struct ublk_params_header ph;
25880aa73170SMing Lei 	int ret = -EFAULT;
25890aa73170SMing Lei 
25900aa73170SMing Lei 	if (header->len <= sizeof(ph) || !header->addr)
25910aa73170SMing Lei 		return -EINVAL;
25920aa73170SMing Lei 
25930aa73170SMing Lei 	if (copy_from_user(&ph, argp, sizeof(ph)))
25940aa73170SMing Lei 		return -EFAULT;
25950aa73170SMing Lei 
25960aa73170SMing Lei 	if (ph.len > header->len || !ph.len || !ph.types)
25970aa73170SMing Lei 		return -EINVAL;
25980aa73170SMing Lei 
25990aa73170SMing Lei 	if (ph.len > sizeof(struct ublk_params))
26000aa73170SMing Lei 		ph.len = sizeof(struct ublk_params);
26010aa73170SMing Lei 
26020aa73170SMing Lei 	mutex_lock(&ub->mutex);
2603*c6292a2aSUday Shankar 	if (test_bit(UB_STATE_USED, &ub->state)) {
2604*c6292a2aSUday Shankar 		/*
2605*c6292a2aSUday Shankar 		 * Parameters can only be changed when device hasn't
2606*c6292a2aSUday Shankar 		 * been started yet
2607*c6292a2aSUday Shankar 		 */
26080aa73170SMing Lei 		ret = -EACCES;
26090aa73170SMing Lei 	} else if (copy_from_user(&ub->params, argp, ph.len)) {
26100aa73170SMing Lei 		ret = -EFAULT;
26110aa73170SMing Lei 	} else {
26120aa73170SMing Lei 		/* clear all we don't support yet */
26130aa73170SMing Lei 		ub->params.types &= UBLK_PARAM_TYPE_ALL;
26140aa73170SMing Lei 		ret = ublk_validate_params(ub);
26151d166527SMing Lei 		if (ret)
26161d166527SMing Lei 			ub->params.types = 0;
26170aa73170SMing Lei 	}
26180aa73170SMing Lei 	mutex_unlock(&ub->mutex);
26190aa73170SMing Lei 
26200aa73170SMing Lei 	return ret;
26210aa73170SMing Lei }
26220aa73170SMing Lei 
ublk_queue_reinit(struct ublk_device * ub,struct ublk_queue * ubq)2623c732a852SZiyangZhang static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
2624c732a852SZiyangZhang {
2625c732a852SZiyangZhang 	int i;
2626c732a852SZiyangZhang 
2627c732a852SZiyangZhang 	WARN_ON_ONCE(!(ubq->ubq_daemon && ubq_daemon_is_dying(ubq)));
26288cb8ef0cSMing Lei 
2629c732a852SZiyangZhang 	/* All old ioucmds have to be completed */
26308cb8ef0cSMing Lei 	ubq->nr_io_ready = 0;
2631c732a852SZiyangZhang 	/* old daemon is PF_EXITING, put it now */
2632c732a852SZiyangZhang 	put_task_struct(ubq->ubq_daemon);
2633c732a852SZiyangZhang 	/* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */
2634c732a852SZiyangZhang 	ubq->ubq_daemon = NULL;
2635c0b79b0fSMing Lei 	ubq->timeout = false;
2636c732a852SZiyangZhang 
2637c732a852SZiyangZhang 	for (i = 0; i < ubq->q_depth; i++) {
2638c732a852SZiyangZhang 		struct ublk_io *io = &ubq->ios[i];
2639c732a852SZiyangZhang 
2640c732a852SZiyangZhang 		/* forget everything now and be ready for new FETCH_REQ */
2641c732a852SZiyangZhang 		io->flags = 0;
2642c732a852SZiyangZhang 		io->cmd = NULL;
2643c732a852SZiyangZhang 		io->addr = 0;
2644c732a852SZiyangZhang 	}
2645c732a852SZiyangZhang }
2646c732a852SZiyangZhang 
ublk_ctrl_start_recovery(struct ublk_device * ub,struct io_uring_cmd * cmd)2647bfbcef03SMing Lei static int ublk_ctrl_start_recovery(struct ublk_device *ub,
2648bfbcef03SMing Lei 		struct io_uring_cmd *cmd)
2649c732a852SZiyangZhang {
2650fd9b8547SBreno Leitao 	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
2651c732a852SZiyangZhang 	int ret = -EINVAL;
2652c732a852SZiyangZhang 	int i;
2653c732a852SZiyangZhang 
2654c732a852SZiyangZhang 	mutex_lock(&ub->mutex);
2655c732a852SZiyangZhang 	if (!ublk_can_use_recovery(ub))
2656c732a852SZiyangZhang 		goto out_unlock;
2657136a29d8SLi Nan 	if (!ub->nr_queues_ready)
2658136a29d8SLi Nan 		goto out_unlock;
2659c732a852SZiyangZhang 	/*
2660c732a852SZiyangZhang 	 * START_RECOVERY is only allowd after:
2661c732a852SZiyangZhang 	 *
2662c732a852SZiyangZhang 	 * (1) UB_STATE_OPEN is not set, which means the dying process is exited
2663c732a852SZiyangZhang 	 *     and related io_uring ctx is freed so file struct of /dev/ublkcX is
2664c732a852SZiyangZhang 	 *     released.
2665c732a852SZiyangZhang 	 *
2666c732a852SZiyangZhang 	 * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work:
2667c732a852SZiyangZhang 	 *     (a)has quiesced request queue
2668c732a852SZiyangZhang 	 *     (b)has requeued every inflight rqs whose io_flags is ACTIVE
2669c732a852SZiyangZhang 	 *     (c)has requeued/aborted every inflight rqs whose io_flags is NOT ACTIVE
2670c732a852SZiyangZhang 	 *     (d)has completed/camceled all ioucmds owned by ther dying process
2671c732a852SZiyangZhang 	 */
2672c732a852SZiyangZhang 	if (test_bit(UB_STATE_OPEN, &ub->state) ||
2673c732a852SZiyangZhang 			ub->dev_info.state != UBLK_S_DEV_QUIESCED) {
2674c732a852SZiyangZhang 		ret = -EBUSY;
2675c732a852SZiyangZhang 		goto out_unlock;
2676c732a852SZiyangZhang 	}
2677c732a852SZiyangZhang 	pr_devel("%s: start recovery for dev id %d.\n", __func__, header->dev_id);
2678c732a852SZiyangZhang 	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2679c732a852SZiyangZhang 		ublk_queue_reinit(ub, ublk_get_queue(ub, i));
2680c732a852SZiyangZhang 	/* set to NULL, otherwise new ubq_daemon cannot mmap the io_cmd_buf */
2681c732a852SZiyangZhang 	ub->mm = NULL;
2682c732a852SZiyangZhang 	ub->nr_queues_ready = 0;
268373a166d9SMing Lei 	ub->nr_privileged_daemon = 0;
2684c732a852SZiyangZhang 	init_completion(&ub->completion);
2685c732a852SZiyangZhang 	ret = 0;
2686c732a852SZiyangZhang  out_unlock:
2687c732a852SZiyangZhang 	mutex_unlock(&ub->mutex);
2688c732a852SZiyangZhang 	return ret;
2689c732a852SZiyangZhang }
2690c732a852SZiyangZhang 
ublk_ctrl_end_recovery(struct ublk_device * ub,struct io_uring_cmd * cmd)2691bfbcef03SMing Lei static int ublk_ctrl_end_recovery(struct ublk_device *ub,
2692bfbcef03SMing Lei 		struct io_uring_cmd *cmd)
2693c732a852SZiyangZhang {
2694fd9b8547SBreno Leitao 	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
2695c732a852SZiyangZhang 	int ublksrv_pid = (int)header->data[0];
2696c732a852SZiyangZhang 	int ret = -EINVAL;
2697c732a852SZiyangZhang 
2698c732a852SZiyangZhang 	pr_devel("%s: Waiting for new ubq_daemons(nr: %d) are ready, dev id %d...\n",
2699c732a852SZiyangZhang 			__func__, ub->dev_info.nr_hw_queues, header->dev_id);
2700c732a852SZiyangZhang 	/* wait until new ubq_daemon sending all FETCH_REQ */
27010c0cbd4eSMing Lei 	if (wait_for_completion_interruptible(&ub->completion))
27020c0cbd4eSMing Lei 		return -EINTR;
27030c0cbd4eSMing Lei 
2704c732a852SZiyangZhang 	pr_devel("%s: All new ubq_daemons(nr: %d) are ready, dev id %d\n",
2705c732a852SZiyangZhang 			__func__, ub->dev_info.nr_hw_queues, header->dev_id);
2706c732a852SZiyangZhang 
2707c732a852SZiyangZhang 	mutex_lock(&ub->mutex);
2708c732a852SZiyangZhang 	if (!ublk_can_use_recovery(ub))
2709c732a852SZiyangZhang 		goto out_unlock;
2710c732a852SZiyangZhang 
2711c732a852SZiyangZhang 	if (ub->dev_info.state != UBLK_S_DEV_QUIESCED) {
2712c732a852SZiyangZhang 		ret = -EBUSY;
2713c732a852SZiyangZhang 		goto out_unlock;
2714c732a852SZiyangZhang 	}
2715c732a852SZiyangZhang 	ub->dev_info.ublksrv_pid = ublksrv_pid;
2716c732a852SZiyangZhang 	pr_devel("%s: new ublksrv_pid %d, dev id %d\n",
2717c732a852SZiyangZhang 			__func__, ublksrv_pid, header->dev_id);
2718c732a852SZiyangZhang 	blk_mq_unquiesce_queue(ub->ub_disk->queue);
2719c732a852SZiyangZhang 	pr_devel("%s: queue unquiesced, dev id %d.\n",
2720c732a852SZiyangZhang 			__func__, header->dev_id);
2721c732a852SZiyangZhang 	blk_mq_kick_requeue_list(ub->ub_disk->queue);
2722c732a852SZiyangZhang 	ub->dev_info.state = UBLK_S_DEV_LIVE;
2723c732a852SZiyangZhang 	schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD);
2724c732a852SZiyangZhang 	ret = 0;
2725c732a852SZiyangZhang  out_unlock:
2726c732a852SZiyangZhang 	mutex_unlock(&ub->mutex);
2727c732a852SZiyangZhang 	return ret;
2728c732a852SZiyangZhang }
2729c732a852SZiyangZhang 
ublk_ctrl_get_features(struct io_uring_cmd * cmd)2730b5bbc52fSMing Lei static int ublk_ctrl_get_features(struct io_uring_cmd *cmd)
2731b5bbc52fSMing Lei {
2732b5bbc52fSMing Lei 	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
2733b5bbc52fSMing Lei 	void __user *argp = (void __user *)(unsigned long)header->addr;
2734b5bbc52fSMing Lei 	u64 features = UBLK_F_ALL & ~UBLK_F_SUPPORT_ZERO_COPY;
2735b5bbc52fSMing Lei 
2736b5bbc52fSMing Lei 	if (header->len != UBLK_FEATURES_LEN || !header->addr)
2737b5bbc52fSMing Lei 		return -EINVAL;
2738b5bbc52fSMing Lei 
2739b5bbc52fSMing Lei 	if (copy_to_user(argp, &features, UBLK_FEATURES_LEN))
2740b5bbc52fSMing Lei 		return -EFAULT;
2741b5bbc52fSMing Lei 
2742b5bbc52fSMing Lei 	return 0;
2743b5bbc52fSMing Lei }
2744b5bbc52fSMing Lei 
27454093cb5aSMing Lei /*
27464093cb5aSMing Lei  * All control commands are sent via /dev/ublk-control, so we have to check
27474093cb5aSMing Lei  * the destination device's permission
27484093cb5aSMing Lei  */
ublk_char_dev_permission(struct ublk_device * ub,const char * dev_path,int mask)27494093cb5aSMing Lei static int ublk_char_dev_permission(struct ublk_device *ub,
27504093cb5aSMing Lei 		const char *dev_path, int mask)
27514093cb5aSMing Lei {
27524093cb5aSMing Lei 	int err;
27534093cb5aSMing Lei 	struct path path;
27544093cb5aSMing Lei 	struct kstat stat;
27554093cb5aSMing Lei 
27564093cb5aSMing Lei 	err = kern_path(dev_path, LOOKUP_FOLLOW, &path);
27574093cb5aSMing Lei 	if (err)
27584093cb5aSMing Lei 		return err;
27594093cb5aSMing Lei 
27604093cb5aSMing Lei 	err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT);
27614093cb5aSMing Lei 	if (err)
27624093cb5aSMing Lei 		goto exit;
27634093cb5aSMing Lei 
27644093cb5aSMing Lei 	err = -EPERM;
27654093cb5aSMing Lei 	if (stat.rdev != ub->cdev_dev.devt || !S_ISCHR(stat.mode))
27664093cb5aSMing Lei 		goto exit;
27674093cb5aSMing Lei 
27685b0ed596SLinus Torvalds 	err = inode_permission(&nop_mnt_idmap,
27694093cb5aSMing Lei 			d_backing_inode(path.dentry), mask);
27704093cb5aSMing Lei exit:
27714093cb5aSMing Lei 	path_put(&path);
27724093cb5aSMing Lei 	return err;
27734093cb5aSMing Lei }
27744093cb5aSMing Lei 
ublk_ctrl_uring_cmd_permission(struct ublk_device * ub,struct io_uring_cmd * cmd)27754093cb5aSMing Lei static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
27764093cb5aSMing Lei 		struct io_uring_cmd *cmd)
27774093cb5aSMing Lei {
2778fd9b8547SBreno Leitao 	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)io_uring_sqe_cmd(cmd->sqe);
27794093cb5aSMing Lei 	bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
27804093cb5aSMing Lei 	void __user *argp = (void __user *)(unsigned long)header->addr;
27814093cb5aSMing Lei 	char *dev_path = NULL;
27824093cb5aSMing Lei 	int ret = 0;
27834093cb5aSMing Lei 	int mask;
27844093cb5aSMing Lei 
27854093cb5aSMing Lei 	if (!unprivileged) {
27864093cb5aSMing Lei 		if (!capable(CAP_SYS_ADMIN))
27874093cb5aSMing Lei 			return -EPERM;
27884093cb5aSMing Lei 		/*
27894093cb5aSMing Lei 		 * The new added command of UBLK_CMD_GET_DEV_INFO2 includes
27904093cb5aSMing Lei 		 * char_dev_path in payload too, since userspace may not
27914093cb5aSMing Lei 		 * know if the specified device is created as unprivileged
27924093cb5aSMing Lei 		 * mode.
27934093cb5aSMing Lei 		 */
27942d786e66SMing Lei 		if (_IOC_NR(cmd->cmd_op) != UBLK_CMD_GET_DEV_INFO2)
27954093cb5aSMing Lei 			return 0;
27964093cb5aSMing Lei 	}
27974093cb5aSMing Lei 
27984093cb5aSMing Lei 	/*
27994093cb5aSMing Lei 	 * User has to provide the char device path for unprivileged ublk
28004093cb5aSMing Lei 	 *
28014093cb5aSMing Lei 	 * header->addr always points to the dev path buffer, and
28024093cb5aSMing Lei 	 * header->dev_path_len records length of dev path buffer.
28034093cb5aSMing Lei 	 */
28044093cb5aSMing Lei 	if (!header->dev_path_len || header->dev_path_len > PATH_MAX)
28054093cb5aSMing Lei 		return -EINVAL;
28064093cb5aSMing Lei 
28074093cb5aSMing Lei 	if (header->len < header->dev_path_len)
28084093cb5aSMing Lei 		return -EINVAL;
28094093cb5aSMing Lei 
281066a6a5d0SRuan Jinjie 	dev_path = memdup_user_nul(argp, header->dev_path_len);
281166a6a5d0SRuan Jinjie 	if (IS_ERR(dev_path))
281266a6a5d0SRuan Jinjie 		return PTR_ERR(dev_path);
28134093cb5aSMing Lei 
28144093cb5aSMing Lei 	ret = -EINVAL;
28152d786e66SMing Lei 	switch (_IOC_NR(cmd->cmd_op)) {
28164093cb5aSMing Lei 	case UBLK_CMD_GET_DEV_INFO:
28174093cb5aSMing Lei 	case UBLK_CMD_GET_DEV_INFO2:
28184093cb5aSMing Lei 	case UBLK_CMD_GET_QUEUE_AFFINITY:
28194093cb5aSMing Lei 	case UBLK_CMD_GET_PARAMS:
2820b5bbc52fSMing Lei 	case (_IOC_NR(UBLK_U_CMD_GET_FEATURES)):
28214093cb5aSMing Lei 		mask = MAY_READ;
28224093cb5aSMing Lei 		break;
28234093cb5aSMing Lei 	case UBLK_CMD_START_DEV:
28244093cb5aSMing Lei 	case UBLK_CMD_STOP_DEV:
28254093cb5aSMing Lei 	case UBLK_CMD_ADD_DEV:
28264093cb5aSMing Lei 	case UBLK_CMD_DEL_DEV:
28274093cb5aSMing Lei 	case UBLK_CMD_SET_PARAMS:
28284093cb5aSMing Lei 	case UBLK_CMD_START_USER_RECOVERY:
28294093cb5aSMing Lei 	case UBLK_CMD_END_USER_RECOVERY:
28304093cb5aSMing Lei 		mask = MAY_READ | MAY_WRITE;
28314093cb5aSMing Lei 		break;
28324093cb5aSMing Lei 	default:
28334093cb5aSMing Lei 		goto exit;
28344093cb5aSMing Lei 	}
28354093cb5aSMing Lei 
28364093cb5aSMing Lei 	ret = ublk_char_dev_permission(ub, dev_path, mask);
28374093cb5aSMing Lei 	if (!ret) {
28384093cb5aSMing Lei 		header->len -= header->dev_path_len;
28394093cb5aSMing Lei 		header->addr += header->dev_path_len;
28404093cb5aSMing Lei 	}
28414093cb5aSMing Lei 	pr_devel("%s: dev id %d cmd_op %x uid %d gid %d path %s ret %d\n",
28424093cb5aSMing Lei 			__func__, ub->ub_number, cmd->cmd_op,
28434093cb5aSMing Lei 			ub->dev_info.owner_uid, ub->dev_info.owner_gid,
28444093cb5aSMing Lei 			dev_path, ret);
28454093cb5aSMing Lei exit:
28464093cb5aSMing Lei 	kfree(dev_path);
284771f28f31SMing Lei 	return ret;
284871f28f31SMing Lei }
284971f28f31SMing Lei 
ublk_ctrl_uring_cmd(struct io_uring_cmd * cmd,unsigned int issue_flags)285071f28f31SMing Lei static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
285171f28f31SMing Lei 		unsigned int issue_flags)
285271f28f31SMing Lei {
2853fd9b8547SBreno Leitao 	const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
2854bfbcef03SMing Lei 	struct ublk_device *ub = NULL;
28552d786e66SMing Lei 	u32 cmd_op = cmd->cmd_op;
285671f28f31SMing Lei 	int ret = -EINVAL;
285771f28f31SMing Lei 
2858fa8e442eSMing Lei 	if (issue_flags & IO_URING_F_NONBLOCK)
2859fa8e442eSMing Lei 		return -EAGAIN;
2860fa8e442eSMing Lei 
286171f28f31SMing Lei 	ublk_ctrl_cmd_dump(cmd);
286271f28f31SMing Lei 
286371f28f31SMing Lei 	if (!(issue_flags & IO_URING_F_SQE128))
286471f28f31SMing Lei 		goto out;
286571f28f31SMing Lei 
28662d786e66SMing Lei 	ret = ublk_check_cmd_op(cmd_op);
28672d786e66SMing Lei 	if (ret)
28682d786e66SMing Lei 		goto out;
28692d786e66SMing Lei 
2870b5bbc52fSMing Lei 	if (cmd_op == UBLK_U_CMD_GET_FEATURES) {
2871b5bbc52fSMing Lei 		ret = ublk_ctrl_get_features(cmd);
2872b5bbc52fSMing Lei 		goto out;
2873b5bbc52fSMing Lei 	}
2874b5bbc52fSMing Lei 
28752d786e66SMing Lei 	if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) {
287671f28f31SMing Lei 		ret = -ENODEV;
2877bfbcef03SMing Lei 		ub = ublk_get_device_from_id(header->dev_id);
2878bfbcef03SMing Lei 		if (!ub)
287971f28f31SMing Lei 			goto out;
288071f28f31SMing Lei 
28814093cb5aSMing Lei 		ret = ublk_ctrl_uring_cmd_permission(ub, cmd);
28824093cb5aSMing Lei 		if (ret)
28834093cb5aSMing Lei 			goto put_dev;
28842d786e66SMing Lei 	}
28854093cb5aSMing Lei 
28862d786e66SMing Lei 	switch (_IOC_NR(cmd_op)) {
288771f28f31SMing Lei 	case UBLK_CMD_START_DEV:
2888bfbcef03SMing Lei 		ret = ublk_ctrl_start_dev(ub, cmd);
288971f28f31SMing Lei 		break;
289071f28f31SMing Lei 	case UBLK_CMD_STOP_DEV:
2891bfbcef03SMing Lei 		ret = ublk_ctrl_stop_dev(ub);
289271f28f31SMing Lei 		break;
289371f28f31SMing Lei 	case UBLK_CMD_GET_DEV_INFO:
28944093cb5aSMing Lei 	case UBLK_CMD_GET_DEV_INFO2:
2895bfbcef03SMing Lei 		ret = ublk_ctrl_get_dev_info(ub, cmd);
289671f28f31SMing Lei 		break;
289771f28f31SMing Lei 	case UBLK_CMD_ADD_DEV:
289834d8f2beSChristoph Hellwig 		ret = ublk_ctrl_add_dev(cmd);
289971f28f31SMing Lei 		break;
290071f28f31SMing Lei 	case UBLK_CMD_DEL_DEV:
2901bfbcef03SMing Lei 		ret = ublk_ctrl_del_dev(&ub);
290271f28f31SMing Lei 		break;
290371f28f31SMing Lei 	case UBLK_CMD_GET_QUEUE_AFFINITY:
2904bfbcef03SMing Lei 		ret = ublk_ctrl_get_queue_affinity(ub, cmd);
290571f28f31SMing Lei 		break;
29060aa73170SMing Lei 	case UBLK_CMD_GET_PARAMS:
2907bfbcef03SMing Lei 		ret = ublk_ctrl_get_params(ub, cmd);
29080aa73170SMing Lei 		break;
29090aa73170SMing Lei 	case UBLK_CMD_SET_PARAMS:
2910bfbcef03SMing Lei 		ret = ublk_ctrl_set_params(ub, cmd);
29110aa73170SMing Lei 		break;
2912c732a852SZiyangZhang 	case UBLK_CMD_START_USER_RECOVERY:
2913bfbcef03SMing Lei 		ret = ublk_ctrl_start_recovery(ub, cmd);
2914c732a852SZiyangZhang 		break;
2915c732a852SZiyangZhang 	case UBLK_CMD_END_USER_RECOVERY:
2916bfbcef03SMing Lei 		ret = ublk_ctrl_end_recovery(ub, cmd);
2917c732a852SZiyangZhang 		break;
291871f28f31SMing Lei 	default:
29199e68fcffSMing Lei 		ret = -EOPNOTSUPP;
292071f28f31SMing Lei 		break;
29216b1439d2SYang Li 	}
29224093cb5aSMing Lei 
29234093cb5aSMing Lei  put_dev:
2924bfbcef03SMing Lei 	if (ub)
2925bfbcef03SMing Lei 		ublk_put_device(ub);
292671f28f31SMing Lei  out:
29279d2789acSJens Axboe 	io_uring_cmd_done(cmd, ret, 0, issue_flags);
292871f28f31SMing Lei 	pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n",
292971f28f31SMing Lei 			__func__, ret, cmd->cmd_op, header->dev_id, header->queue_id);
293071f28f31SMing Lei 	return -EIOCBQUEUED;
293171f28f31SMing Lei }
293271f28f31SMing Lei 
293371f28f31SMing Lei static const struct file_operations ublk_ctl_fops = {
293471f28f31SMing Lei 	.open		= nonseekable_open,
293571f28f31SMing Lei 	.uring_cmd      = ublk_ctrl_uring_cmd,
293671f28f31SMing Lei 	.owner		= THIS_MODULE,
293771f28f31SMing Lei 	.llseek		= noop_llseek,
293871f28f31SMing Lei };
293971f28f31SMing Lei 
294071f28f31SMing Lei static struct miscdevice ublk_misc = {
294171f28f31SMing Lei 	.minor		= MISC_DYNAMIC_MINOR,
294271f28f31SMing Lei 	.name		= "ublk-control",
294371f28f31SMing Lei 	.fops		= &ublk_ctl_fops,
294471f28f31SMing Lei };
294571f28f31SMing Lei 
ublk_init(void)294671f28f31SMing Lei static int __init ublk_init(void)
294771f28f31SMing Lei {
294871f28f31SMing Lei 	int ret;
294971f28f31SMing Lei 
295062fe99ceSMing Lei 	BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
295162fe99ceSMing Lei 			UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);
295262fe99ceSMing Lei 
295371f28f31SMing Lei 	init_waitqueue_head(&ublk_idr_wq);
295471f28f31SMing Lei 
295571f28f31SMing Lei 	ret = misc_register(&ublk_misc);
295671f28f31SMing Lei 	if (ret)
295771f28f31SMing Lei 		return ret;
295871f28f31SMing Lei 
295971f28f31SMing Lei 	ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char");
296071f28f31SMing Lei 	if (ret)
296171f28f31SMing Lei 		goto unregister_mis;
296271f28f31SMing Lei 
29632eefd399SIvan Orlov 	ret = class_register(&ublk_chr_class);
29642eefd399SIvan Orlov 	if (ret)
296571f28f31SMing Lei 		goto free_chrdev_region;
29662eefd399SIvan Orlov 
296771f28f31SMing Lei 	return 0;
296871f28f31SMing Lei 
296971f28f31SMing Lei free_chrdev_region:
297071f28f31SMing Lei 	unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
297171f28f31SMing Lei unregister_mis:
297271f28f31SMing Lei 	misc_deregister(&ublk_misc);
297371f28f31SMing Lei 	return ret;
297471f28f31SMing Lei }
297571f28f31SMing Lei 
ublk_exit(void)297671f28f31SMing Lei static void __exit ublk_exit(void)
297771f28f31SMing Lei {
297871f28f31SMing Lei 	struct ublk_device *ub;
297971f28f31SMing Lei 	int id;
298071f28f31SMing Lei 
298171f28f31SMing Lei 	idr_for_each_entry(&ublk_index_idr, ub, id)
298271f28f31SMing Lei 		ublk_remove(ub);
298371f28f31SMing Lei 
29842eefd399SIvan Orlov 	class_unregister(&ublk_chr_class);
29858e4ff684SMing Lei 	misc_deregister(&ublk_misc);
29868e4ff684SMing Lei 
298771f28f31SMing Lei 	idr_destroy(&ublk_index_idr);
298871f28f31SMing Lei 	unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
298971f28f31SMing Lei }
299071f28f31SMing Lei 
299171f28f31SMing Lei module_init(ublk_init);
299271f28f31SMing Lei module_exit(ublk_exit);
299371f28f31SMing Lei 
2994403ebc87SMing Lei module_param(ublks_max, int, 0444);
2995403ebc87SMing Lei MODULE_PARM_DESC(ublks_max, "max number of ublk devices allowed to add(default: 64)");
2996403ebc87SMing Lei 
299771f28f31SMing Lei MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>");
299871f28f31SMing Lei MODULE_LICENSE("GPL");
2999