xref: /openbmc/linux/drivers/nvme/host/ioctl.c (revision 130b1207)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2011-2014, Intel Corporation.
4  * Copyright (c) 2017-2021 Christoph Hellwig.
5  */
6 #include <linux/ptrace.h>	/* for force_successful_syscall_return */
7 #include <linux/nvme_ioctl.h>
8 #include <linux/io_uring.h>
9 #include "nvme.h"
10 
11 static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c,
12 		fmode_t mode)
13 {
14 	u32 effects;
15 
16 	if (capable(CAP_SYS_ADMIN))
17 		return true;
18 
19 	/*
20 	 * Do not allow unprivileged processes to send vendor specific or fabrics
21 	 * commands as we can't be sure about their effects.
22 	 */
23 	if (c->common.opcode >= nvme_cmd_vendor_start ||
24 	    c->common.opcode == nvme_fabrics_command)
25 		return false;
26 
27 	/*
28 	 * Do not allow unprivileged passthrough of admin commands except
29 	 * for a subset of identify commands that contain information required
30 	 * to form proper I/O commands in userspace and do not expose any
31 	 * potentially sensitive information.
32 	 */
33 	if (!ns) {
34 		if (c->common.opcode == nvme_admin_identify) {
35 			switch (c->identify.cns) {
36 			case NVME_ID_CNS_NS:
37 			case NVME_ID_CNS_CS_NS:
38 			case NVME_ID_CNS_NS_CS_INDEP:
39 			case NVME_ID_CNS_CS_CTRL:
40 			case NVME_ID_CNS_CTRL:
41 				return true;
42 			}
43 		}
44 		return false;
45 	}
46 
47 	/*
48 	 * Check if the controller provides a Commands Supported and Effects log
49 	 * and marks this command as supported.  If not reject unprivileged
50 	 * passthrough.
51 	 */
52 	effects = nvme_command_effects(ns->ctrl, ns, c->common.opcode);
53 	if (!(effects & NVME_CMD_EFFECTS_CSUPP))
54 		return false;
55 
56 	/*
57 	 * Don't allow passthrough for command that have intrusive (or unknown)
58 	 * effects.
59 	 */
60 	if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC |
61 			NVME_CMD_EFFECTS_UUID_SEL |
62 			NVME_CMD_EFFECTS_SCOPE_MASK))
63 		return false;
64 
65 	/*
66 	 * Only allow I/O commands that transfer data to the controller or that
67 	 * change the logical block contents if the file descriptor is open for
68 	 * writing.
69 	 */
70 	if (nvme_is_write(c) || (effects & NVME_CMD_EFFECTS_LBCC))
71 		return mode & FMODE_WRITE;
72 	return true;
73 }
74 
75 /*
76  * Convert integer values from ioctl structures to user pointers, silently
77  * ignoring the upper bits in the compat case to match behaviour of 32-bit
78  * kernels.
79  */
80 static void __user *nvme_to_user_ptr(uintptr_t ptrval)
81 {
82 	if (in_compat_syscall())
83 		ptrval = (compat_uptr_t)ptrval;
84 	return (void __user *)ptrval;
85 }
86 
87 static void *nvme_add_user_metadata(struct request *req, void __user *ubuf,
88 		unsigned len, u32 seed)
89 {
90 	struct bio_integrity_payload *bip;
91 	int ret = -ENOMEM;
92 	void *buf;
93 	struct bio *bio = req->bio;
94 
95 	buf = kmalloc(len, GFP_KERNEL);
96 	if (!buf)
97 		goto out;
98 
99 	ret = -EFAULT;
100 	if ((req_op(req) == REQ_OP_DRV_OUT) && copy_from_user(buf, ubuf, len))
101 		goto out_free_meta;
102 
103 	bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
104 	if (IS_ERR(bip)) {
105 		ret = PTR_ERR(bip);
106 		goto out_free_meta;
107 	}
108 
109 	bip->bip_iter.bi_size = len;
110 	bip->bip_iter.bi_sector = seed;
111 	ret = bio_integrity_add_page(bio, virt_to_page(buf), len,
112 			offset_in_page(buf));
113 	if (ret != len) {
114 		ret = -ENOMEM;
115 		goto out_free_meta;
116 	}
117 
118 	req->cmd_flags |= REQ_INTEGRITY;
119 	return buf;
120 out_free_meta:
121 	kfree(buf);
122 out:
123 	return ERR_PTR(ret);
124 }
125 
126 static int nvme_finish_user_metadata(struct request *req, void __user *ubuf,
127 		void *meta, unsigned len, int ret)
128 {
129 	if (!ret && req_op(req) == REQ_OP_DRV_IN &&
130 	    copy_to_user(ubuf, meta, len))
131 		ret = -EFAULT;
132 	kfree(meta);
133 	return ret;
134 }
135 
136 static struct request *nvme_alloc_user_request(struct request_queue *q,
137 		struct nvme_command *cmd, blk_opf_t rq_flags,
138 		blk_mq_req_flags_t blk_flags)
139 {
140 	struct request *req;
141 
142 	req = blk_mq_alloc_request(q, nvme_req_op(cmd) | rq_flags, blk_flags);
143 	if (IS_ERR(req))
144 		return req;
145 	nvme_init_request(req, cmd);
146 	nvme_req(req)->flags |= NVME_REQ_USERCMD;
147 	return req;
148 }
149 
150 static int nvme_map_user_request(struct request *req, u64 ubuffer,
151 		unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
152 		u32 meta_seed, void **metap, struct io_uring_cmd *ioucmd,
153 		bool vec)
154 {
155 	struct request_queue *q = req->q;
156 	struct nvme_ns *ns = q->queuedata;
157 	struct block_device *bdev = ns ? ns->disk->part0 : NULL;
158 	struct bio *bio = NULL;
159 	void *meta = NULL;
160 	int ret;
161 
162 	if (ioucmd && (ioucmd->flags & IORING_URING_CMD_FIXED)) {
163 		struct iov_iter iter;
164 
165 		/* fixedbufs is only for non-vectored io */
166 		if (WARN_ON_ONCE(vec))
167 			return -EINVAL;
168 		ret = io_uring_cmd_import_fixed(ubuffer, bufflen,
169 				rq_data_dir(req), &iter, ioucmd);
170 		if (ret < 0)
171 			goto out;
172 		ret = blk_rq_map_user_iov(q, req, NULL, &iter, GFP_KERNEL);
173 	} else {
174 		ret = blk_rq_map_user_io(req, NULL, nvme_to_user_ptr(ubuffer),
175 				bufflen, GFP_KERNEL, vec, 0, 0,
176 				rq_data_dir(req));
177 	}
178 
179 	if (ret)
180 		goto out;
181 	bio = req->bio;
182 	if (bdev)
183 		bio_set_dev(bio, bdev);
184 
185 	if (bdev && meta_buffer && meta_len) {
186 		meta = nvme_add_user_metadata(req, meta_buffer, meta_len,
187 				meta_seed);
188 		if (IS_ERR(meta)) {
189 			ret = PTR_ERR(meta);
190 			goto out_unmap;
191 		}
192 		*metap = meta;
193 	}
194 
195 	return ret;
196 
197 out_unmap:
198 	if (bio)
199 		blk_rq_unmap_user(bio);
200 out:
201 	blk_mq_free_request(req);
202 	return ret;
203 }
204 
205 static int nvme_submit_user_cmd(struct request_queue *q,
206 		struct nvme_command *cmd, u64 ubuffer,
207 		unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
208 		u32 meta_seed, u64 *result, unsigned timeout, bool vec)
209 {
210 	struct nvme_ctrl *ctrl;
211 	struct request *req;
212 	void *meta = NULL;
213 	struct bio *bio;
214 	u32 effects;
215 	int ret;
216 
217 	req = nvme_alloc_user_request(q, cmd, 0, 0);
218 	if (IS_ERR(req))
219 		return PTR_ERR(req);
220 
221 	req->timeout = timeout;
222 	if (ubuffer && bufflen) {
223 		ret = nvme_map_user_request(req, ubuffer, bufflen, meta_buffer,
224 				meta_len, meta_seed, &meta, NULL, vec);
225 		if (ret)
226 			return ret;
227 	}
228 
229 	bio = req->bio;
230 	ctrl = nvme_req(req)->ctrl;
231 
232 	ret = nvme_execute_passthru_rq(req, &effects);
233 
234 	if (result)
235 		*result = le64_to_cpu(nvme_req(req)->result.u64);
236 	if (meta)
237 		ret = nvme_finish_user_metadata(req, meta_buffer, meta,
238 						meta_len, ret);
239 	if (bio)
240 		blk_rq_unmap_user(bio);
241 	blk_mq_free_request(req);
242 
243 	if (effects)
244 		nvme_passthru_end(ctrl, effects, cmd, ret);
245 
246 	return ret;
247 }
248 
249 static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
250 {
251 	struct nvme_user_io io;
252 	struct nvme_command c;
253 	unsigned length, meta_len;
254 	void __user *metadata;
255 
256 	if (copy_from_user(&io, uio, sizeof(io)))
257 		return -EFAULT;
258 	if (io.flags)
259 		return -EINVAL;
260 
261 	switch (io.opcode) {
262 	case nvme_cmd_write:
263 	case nvme_cmd_read:
264 	case nvme_cmd_compare:
265 		break;
266 	default:
267 		return -EINVAL;
268 	}
269 
270 	length = (io.nblocks + 1) << ns->lba_shift;
271 
272 	if ((io.control & NVME_RW_PRINFO_PRACT) &&
273 	    ns->ms == sizeof(struct t10_pi_tuple)) {
274 		/*
275 		 * Protection information is stripped/inserted by the
276 		 * controller.
277 		 */
278 		if (nvme_to_user_ptr(io.metadata))
279 			return -EINVAL;
280 		meta_len = 0;
281 		metadata = NULL;
282 	} else {
283 		meta_len = (io.nblocks + 1) * ns->ms;
284 		metadata = nvme_to_user_ptr(io.metadata);
285 	}
286 
287 	if (ns->features & NVME_NS_EXT_LBAS) {
288 		length += meta_len;
289 		meta_len = 0;
290 	} else if (meta_len) {
291 		if ((io.metadata & 3) || !io.metadata)
292 			return -EINVAL;
293 	}
294 
295 	memset(&c, 0, sizeof(c));
296 	c.rw.opcode = io.opcode;
297 	c.rw.flags = io.flags;
298 	c.rw.nsid = cpu_to_le32(ns->head->ns_id);
299 	c.rw.slba = cpu_to_le64(io.slba);
300 	c.rw.length = cpu_to_le16(io.nblocks);
301 	c.rw.control = cpu_to_le16(io.control);
302 	c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
303 	c.rw.reftag = cpu_to_le32(io.reftag);
304 	c.rw.apptag = cpu_to_le16(io.apptag);
305 	c.rw.appmask = cpu_to_le16(io.appmask);
306 
307 	return nvme_submit_user_cmd(ns->queue, &c,
308 			io.addr, length,
309 			metadata, meta_len, lower_32_bits(io.slba), NULL, 0,
310 			false);
311 }
312 
313 static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl,
314 					struct nvme_ns *ns, __u32 nsid)
315 {
316 	if (ns && nsid != ns->head->ns_id) {
317 		dev_err(ctrl->device,
318 			"%s: nsid (%u) in cmd does not match nsid (%u)"
319 			"of namespace\n",
320 			current->comm, nsid, ns->head->ns_id);
321 		return false;
322 	}
323 
324 	return true;
325 }
326 
327 static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
328 			struct nvme_passthru_cmd __user *ucmd, fmode_t mode)
329 {
330 	struct nvme_passthru_cmd cmd;
331 	struct nvme_command c;
332 	unsigned timeout = 0;
333 	u64 result;
334 	int status;
335 
336 	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
337 		return -EFAULT;
338 	if (cmd.flags)
339 		return -EINVAL;
340 	if (!nvme_validate_passthru_nsid(ctrl, ns, cmd.nsid))
341 		return -EINVAL;
342 
343 	memset(&c, 0, sizeof(c));
344 	c.common.opcode = cmd.opcode;
345 	c.common.flags = cmd.flags;
346 	c.common.nsid = cpu_to_le32(cmd.nsid);
347 	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
348 	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
349 	c.common.cdw10 = cpu_to_le32(cmd.cdw10);
350 	c.common.cdw11 = cpu_to_le32(cmd.cdw11);
351 	c.common.cdw12 = cpu_to_le32(cmd.cdw12);
352 	c.common.cdw13 = cpu_to_le32(cmd.cdw13);
353 	c.common.cdw14 = cpu_to_le32(cmd.cdw14);
354 	c.common.cdw15 = cpu_to_le32(cmd.cdw15);
355 
356 	if (!nvme_cmd_allowed(ns, &c, mode))
357 		return -EACCES;
358 
359 	if (cmd.timeout_ms)
360 		timeout = msecs_to_jiffies(cmd.timeout_ms);
361 
362 	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
363 			cmd.addr, cmd.data_len,
364 			nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
365 			0, &result, timeout, false);
366 
367 	if (status >= 0) {
368 		if (put_user(result, &ucmd->result))
369 			return -EFAULT;
370 	}
371 
372 	return status;
373 }
374 
375 static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
376 			struct nvme_passthru_cmd64 __user *ucmd, bool vec,
377 			fmode_t mode)
378 {
379 	struct nvme_passthru_cmd64 cmd;
380 	struct nvme_command c;
381 	unsigned timeout = 0;
382 	int status;
383 
384 	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
385 		return -EFAULT;
386 	if (cmd.flags)
387 		return -EINVAL;
388 	if (!nvme_validate_passthru_nsid(ctrl, ns, cmd.nsid))
389 		return -EINVAL;
390 
391 	memset(&c, 0, sizeof(c));
392 	c.common.opcode = cmd.opcode;
393 	c.common.flags = cmd.flags;
394 	c.common.nsid = cpu_to_le32(cmd.nsid);
395 	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
396 	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
397 	c.common.cdw10 = cpu_to_le32(cmd.cdw10);
398 	c.common.cdw11 = cpu_to_le32(cmd.cdw11);
399 	c.common.cdw12 = cpu_to_le32(cmd.cdw12);
400 	c.common.cdw13 = cpu_to_le32(cmd.cdw13);
401 	c.common.cdw14 = cpu_to_le32(cmd.cdw14);
402 	c.common.cdw15 = cpu_to_le32(cmd.cdw15);
403 
404 	if (!nvme_cmd_allowed(ns, &c, mode))
405 		return -EACCES;
406 
407 	if (cmd.timeout_ms)
408 		timeout = msecs_to_jiffies(cmd.timeout_ms);
409 
410 	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
411 			cmd.addr, cmd.data_len,
412 			nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
413 			0, &cmd.result, timeout, vec);
414 
415 	if (status >= 0) {
416 		if (put_user(cmd.result, &ucmd->result))
417 			return -EFAULT;
418 	}
419 
420 	return status;
421 }
422 
423 struct nvme_uring_data {
424 	__u64	metadata;
425 	__u64	addr;
426 	__u32	data_len;
427 	__u32	metadata_len;
428 	__u32	timeout_ms;
429 };
430 
431 /*
432  * This overlays struct io_uring_cmd pdu.
433  * Expect build errors if this grows larger than that.
434  */
435 struct nvme_uring_cmd_pdu {
436 	union {
437 		struct bio *bio;
438 		struct request *req;
439 	};
440 	u32 meta_len;
441 	u32 nvme_status;
442 	union {
443 		struct {
444 			void *meta; /* kernel-resident buffer */
445 			void __user *meta_buffer;
446 		};
447 		u64 result;
448 	} u;
449 };
450 
451 static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(
452 		struct io_uring_cmd *ioucmd)
453 {
454 	return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu;
455 }
456 
457 static void nvme_uring_task_meta_cb(struct io_uring_cmd *ioucmd)
458 {
459 	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
460 	struct request *req = pdu->req;
461 	int status;
462 	u64 result;
463 
464 	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
465 		status = -EINTR;
466 	else
467 		status = nvme_req(req)->status;
468 
469 	result = le64_to_cpu(nvme_req(req)->result.u64);
470 
471 	if (pdu->meta_len)
472 		status = nvme_finish_user_metadata(req, pdu->u.meta_buffer,
473 					pdu->u.meta, pdu->meta_len, status);
474 	if (req->bio)
475 		blk_rq_unmap_user(req->bio);
476 	blk_mq_free_request(req);
477 
478 	io_uring_cmd_done(ioucmd, status, result);
479 }
480 
481 static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd)
482 {
483 	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
484 
485 	if (pdu->bio)
486 		blk_rq_unmap_user(pdu->bio);
487 
488 	io_uring_cmd_done(ioucmd, pdu->nvme_status, pdu->u.result);
489 }
490 
491 static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
492 						blk_status_t err)
493 {
494 	struct io_uring_cmd *ioucmd = req->end_io_data;
495 	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
496 	void *cookie = READ_ONCE(ioucmd->cookie);
497 
498 	req->bio = pdu->bio;
499 	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
500 		pdu->nvme_status = -EINTR;
501 	else
502 		pdu->nvme_status = nvme_req(req)->status;
503 	pdu->u.result = le64_to_cpu(nvme_req(req)->result.u64);
504 
505 	/*
506 	 * For iopoll, complete it directly.
507 	 * Otherwise, move the completion to task work.
508 	 */
509 	if (cookie != NULL && blk_rq_is_poll(req))
510 		nvme_uring_task_cb(ioucmd);
511 	else
512 		io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_cb);
513 
514 	return RQ_END_IO_FREE;
515 }
516 
517 static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req,
518 						     blk_status_t err)
519 {
520 	struct io_uring_cmd *ioucmd = req->end_io_data;
521 	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
522 	void *cookie = READ_ONCE(ioucmd->cookie);
523 
524 	req->bio = pdu->bio;
525 	pdu->req = req;
526 
527 	/*
528 	 * For iopoll, complete it directly.
529 	 * Otherwise, move the completion to task work.
530 	 */
531 	if (cookie != NULL && blk_rq_is_poll(req))
532 		nvme_uring_task_meta_cb(ioucmd);
533 	else
534 		io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_meta_cb);
535 
536 	return RQ_END_IO_NONE;
537 }
538 
539 static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
540 		struct io_uring_cmd *ioucmd, unsigned int issue_flags, bool vec)
541 {
542 	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
543 	const struct nvme_uring_cmd *cmd = ioucmd->cmd;
544 	struct request_queue *q = ns ? ns->queue : ctrl->admin_q;
545 	struct nvme_uring_data d;
546 	struct nvme_command c;
547 	struct request *req;
548 	blk_opf_t rq_flags = 0;
549 	blk_mq_req_flags_t blk_flags = 0;
550 	void *meta = NULL;
551 	int ret;
552 
553 	c.common.opcode = READ_ONCE(cmd->opcode);
554 	c.common.flags = READ_ONCE(cmd->flags);
555 	if (c.common.flags)
556 		return -EINVAL;
557 
558 	c.common.command_id = 0;
559 	c.common.nsid = cpu_to_le32(cmd->nsid);
560 	if (!nvme_validate_passthru_nsid(ctrl, ns, le32_to_cpu(c.common.nsid)))
561 		return -EINVAL;
562 
563 	c.common.cdw2[0] = cpu_to_le32(READ_ONCE(cmd->cdw2));
564 	c.common.cdw2[1] = cpu_to_le32(READ_ONCE(cmd->cdw3));
565 	c.common.metadata = 0;
566 	c.common.dptr.prp1 = c.common.dptr.prp2 = 0;
567 	c.common.cdw10 = cpu_to_le32(READ_ONCE(cmd->cdw10));
568 	c.common.cdw11 = cpu_to_le32(READ_ONCE(cmd->cdw11));
569 	c.common.cdw12 = cpu_to_le32(READ_ONCE(cmd->cdw12));
570 	c.common.cdw13 = cpu_to_le32(READ_ONCE(cmd->cdw13));
571 	c.common.cdw14 = cpu_to_le32(READ_ONCE(cmd->cdw14));
572 	c.common.cdw15 = cpu_to_le32(READ_ONCE(cmd->cdw15));
573 
574 	if (!nvme_cmd_allowed(ns, &c, ioucmd->file->f_mode))
575 		return -EACCES;
576 
577 	d.metadata = READ_ONCE(cmd->metadata);
578 	d.addr = READ_ONCE(cmd->addr);
579 	d.data_len = READ_ONCE(cmd->data_len);
580 	d.metadata_len = READ_ONCE(cmd->metadata_len);
581 	d.timeout_ms = READ_ONCE(cmd->timeout_ms);
582 
583 	if (issue_flags & IO_URING_F_NONBLOCK) {
584 		rq_flags = REQ_NOWAIT;
585 		blk_flags = BLK_MQ_REQ_NOWAIT;
586 	}
587 	if (issue_flags & IO_URING_F_IOPOLL)
588 		rq_flags |= REQ_POLLED;
589 
590 retry:
591 	req = nvme_alloc_user_request(q, &c, rq_flags, blk_flags);
592 	if (IS_ERR(req))
593 		return PTR_ERR(req);
594 	req->timeout = d.timeout_ms ? msecs_to_jiffies(d.timeout_ms) : 0;
595 
596 	if (d.addr && d.data_len) {
597 		ret = nvme_map_user_request(req, d.addr,
598 			d.data_len, nvme_to_user_ptr(d.metadata),
599 			d.metadata_len, 0, &meta, ioucmd, vec);
600 		if (ret)
601 			return ret;
602 	}
603 
604 	if (issue_flags & IO_URING_F_IOPOLL && rq_flags & REQ_POLLED) {
605 		if (unlikely(!req->bio)) {
606 			/* we can't poll this, so alloc regular req instead */
607 			blk_mq_free_request(req);
608 			rq_flags &= ~REQ_POLLED;
609 			goto retry;
610 		} else {
611 			WRITE_ONCE(ioucmd->cookie, req->bio);
612 			req->bio->bi_opf |= REQ_POLLED;
613 		}
614 	}
615 	/* to free bio on completion, as req->bio will be null at that time */
616 	pdu->bio = req->bio;
617 	pdu->meta_len = d.metadata_len;
618 	req->end_io_data = ioucmd;
619 	if (pdu->meta_len) {
620 		pdu->u.meta = meta;
621 		pdu->u.meta_buffer = nvme_to_user_ptr(d.metadata);
622 		req->end_io = nvme_uring_cmd_end_io_meta;
623 	} else {
624 		req->end_io = nvme_uring_cmd_end_io;
625 	}
626 	blk_execute_rq_nowait(req, false);
627 	return -EIOCBQUEUED;
628 }
629 
630 static bool is_ctrl_ioctl(unsigned int cmd)
631 {
632 	if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD)
633 		return true;
634 	if (is_sed_ioctl(cmd))
635 		return true;
636 	return false;
637 }
638 
639 static int nvme_ctrl_ioctl(struct nvme_ctrl *ctrl, unsigned int cmd,
640 		void __user *argp, fmode_t mode)
641 {
642 	switch (cmd) {
643 	case NVME_IOCTL_ADMIN_CMD:
644 		return nvme_user_cmd(ctrl, NULL, argp, mode);
645 	case NVME_IOCTL_ADMIN64_CMD:
646 		return nvme_user_cmd64(ctrl, NULL, argp, false, mode);
647 	default:
648 		return sed_ioctl(ctrl->opal_dev, cmd, argp);
649 	}
650 }
651 
#ifdef COMPAT_FOR_U64_ALIGNMENT
/*
 * Packed variant of the NVME_IOCTL_SUBMIT_IO argument.  It only exists to
 * derive NVME_IOCTL_SUBMIT_IO32, an ioctl number encoding the size of this
 * packed layout, which nvme_ns_ioctl() accepts as an alias of
 * NVME_IOCTL_SUBMIT_IO.
 */
struct nvme_user_io32 {
	__u8	opcode;
	__u8	flags;
	__u16	control;
	__u16	nblocks;
	__u16	rsvd;
	__u64	metadata;
	__u64	addr;
	__u64	slba;
	__u32	dsmgmt;
	__u32	reftag;
	__u16	apptag;
	__u16	appmask;
} __attribute__((__packed__));
#define NVME_IOCTL_SUBMIT_IO32	_IOW('N', 0x42, struct nvme_user_io32)
#endif /* COMPAT_FOR_U64_ALIGNMENT */
669 
670 static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd,
671 		void __user *argp, fmode_t mode)
672 {
673 	switch (cmd) {
674 	case NVME_IOCTL_ID:
675 		force_successful_syscall_return();
676 		return ns->head->ns_id;
677 	case NVME_IOCTL_IO_CMD:
678 		return nvme_user_cmd(ns->ctrl, ns, argp, mode);
679 	/*
680 	 * struct nvme_user_io can have different padding on some 32-bit ABIs.
681 	 * Just accept the compat version as all fields that are used are the
682 	 * same size and at the same offset.
683 	 */
684 #ifdef COMPAT_FOR_U64_ALIGNMENT
685 	case NVME_IOCTL_SUBMIT_IO32:
686 #endif
687 	case NVME_IOCTL_SUBMIT_IO:
688 		return nvme_submit_io(ns, argp);
689 	case NVME_IOCTL_IO64_CMD:
690 		return nvme_user_cmd64(ns->ctrl, ns, argp, false, mode);
691 	case NVME_IOCTL_IO64_CMD_VEC:
692 		return nvme_user_cmd64(ns->ctrl, ns, argp, true, mode);
693 	default:
694 		return -ENOTTY;
695 	}
696 }
697 
698 static int __nvme_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *arg,
699 			fmode_t mode)
700 {
701 	if (is_ctrl_ioctl(cmd))
702 		return nvme_ctrl_ioctl(ns->ctrl, cmd, arg, mode);
703 	return nvme_ns_ioctl(ns, cmd, arg, mode);
704 }
705 
706 int nvme_ioctl(struct block_device *bdev, fmode_t mode,
707 		unsigned int cmd, unsigned long arg)
708 {
709 	struct nvme_ns *ns = bdev->bd_disk->private_data;
710 
711 	return __nvme_ioctl(ns, cmd, (void __user *)arg, mode);
712 }
713 
714 long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
715 {
716 	struct nvme_ns *ns =
717 		container_of(file_inode(file)->i_cdev, struct nvme_ns, cdev);
718 
719 	return __nvme_ioctl(ns, cmd, (void __user *)arg, file->f_mode);
720 }
721 
722 static int nvme_uring_cmd_checks(unsigned int issue_flags)
723 {
724 
725 	/* NVMe passthrough requires big SQE/CQE support */
726 	if ((issue_flags & (IO_URING_F_SQE128|IO_URING_F_CQE32)) !=
727 	    (IO_URING_F_SQE128|IO_URING_F_CQE32))
728 		return -EOPNOTSUPP;
729 	return 0;
730 }
731 
732 static int nvme_ns_uring_cmd(struct nvme_ns *ns, struct io_uring_cmd *ioucmd,
733 			     unsigned int issue_flags)
734 {
735 	struct nvme_ctrl *ctrl = ns->ctrl;
736 	int ret;
737 
738 	BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu));
739 
740 	ret = nvme_uring_cmd_checks(issue_flags);
741 	if (ret)
742 		return ret;
743 
744 	switch (ioucmd->cmd_op) {
745 	case NVME_URING_CMD_IO:
746 		ret = nvme_uring_cmd_io(ctrl, ns, ioucmd, issue_flags, false);
747 		break;
748 	case NVME_URING_CMD_IO_VEC:
749 		ret = nvme_uring_cmd_io(ctrl, ns, ioucmd, issue_flags, true);
750 		break;
751 	default:
752 		ret = -ENOTTY;
753 	}
754 
755 	return ret;
756 }
757 
758 int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
759 {
760 	struct nvme_ns *ns = container_of(file_inode(ioucmd->file)->i_cdev,
761 			struct nvme_ns, cdev);
762 
763 	return nvme_ns_uring_cmd(ns, ioucmd, issue_flags);
764 }
765 
766 int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
767 				 struct io_comp_batch *iob,
768 				 unsigned int poll_flags)
769 {
770 	struct bio *bio;
771 	int ret = 0;
772 	struct nvme_ns *ns;
773 	struct request_queue *q;
774 
775 	rcu_read_lock();
776 	bio = READ_ONCE(ioucmd->cookie);
777 	ns = container_of(file_inode(ioucmd->file)->i_cdev,
778 			struct nvme_ns, cdev);
779 	q = ns->queue;
780 	if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio && bio->bi_bdev)
781 		ret = bio_poll(bio, iob, poll_flags);
782 	rcu_read_unlock();
783 	return ret;
784 }
785 #ifdef CONFIG_NVME_MULTIPATH
786 static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
787 		void __user *argp, struct nvme_ns_head *head, int srcu_idx,
788 		fmode_t mode)
789 	__releases(&head->srcu)
790 {
791 	struct nvme_ctrl *ctrl = ns->ctrl;
792 	int ret;
793 
794 	nvme_get_ctrl(ns->ctrl);
795 	srcu_read_unlock(&head->srcu, srcu_idx);
796 	ret = nvme_ctrl_ioctl(ns->ctrl, cmd, argp, mode);
797 
798 	nvme_put_ctrl(ctrl);
799 	return ret;
800 }
801 
802 int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode,
803 		unsigned int cmd, unsigned long arg)
804 {
805 	struct nvme_ns_head *head = bdev->bd_disk->private_data;
806 	void __user *argp = (void __user *)arg;
807 	struct nvme_ns *ns;
808 	int srcu_idx, ret = -EWOULDBLOCK;
809 
810 	srcu_idx = srcu_read_lock(&head->srcu);
811 	ns = nvme_find_path(head);
812 	if (!ns)
813 		goto out_unlock;
814 
815 	/*
816 	 * Handle ioctls that apply to the controller instead of the namespace
817 	 * seperately and drop the ns SRCU reference early.  This avoids a
818 	 * deadlock when deleting namespaces using the passthrough interface.
819 	 */
820 	if (is_ctrl_ioctl(cmd))
821 		return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx,
822 					mode);
823 
824 	ret = nvme_ns_ioctl(ns, cmd, argp, mode);
825 out_unlock:
826 	srcu_read_unlock(&head->srcu, srcu_idx);
827 	return ret;
828 }
829 
830 long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd,
831 		unsigned long arg)
832 {
833 	struct cdev *cdev = file_inode(file)->i_cdev;
834 	struct nvme_ns_head *head =
835 		container_of(cdev, struct nvme_ns_head, cdev);
836 	void __user *argp = (void __user *)arg;
837 	struct nvme_ns *ns;
838 	int srcu_idx, ret = -EWOULDBLOCK;
839 
840 	srcu_idx = srcu_read_lock(&head->srcu);
841 	ns = nvme_find_path(head);
842 	if (!ns)
843 		goto out_unlock;
844 
845 	if (is_ctrl_ioctl(cmd))
846 		return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx,
847 				file->f_mode);
848 
849 	ret = nvme_ns_ioctl(ns, cmd, argp, file->f_mode);
850 out_unlock:
851 	srcu_read_unlock(&head->srcu, srcu_idx);
852 	return ret;
853 }
854 
855 int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
856 		unsigned int issue_flags)
857 {
858 	struct cdev *cdev = file_inode(ioucmd->file)->i_cdev;
859 	struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev);
860 	int srcu_idx = srcu_read_lock(&head->srcu);
861 	struct nvme_ns *ns = nvme_find_path(head);
862 	int ret = -EINVAL;
863 
864 	if (ns)
865 		ret = nvme_ns_uring_cmd(ns, ioucmd, issue_flags);
866 	srcu_read_unlock(&head->srcu, srcu_idx);
867 	return ret;
868 }
869 
870 int nvme_ns_head_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
871 				      struct io_comp_batch *iob,
872 				      unsigned int poll_flags)
873 {
874 	struct cdev *cdev = file_inode(ioucmd->file)->i_cdev;
875 	struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev);
876 	int srcu_idx = srcu_read_lock(&head->srcu);
877 	struct nvme_ns *ns = nvme_find_path(head);
878 	struct bio *bio;
879 	int ret = 0;
880 	struct request_queue *q;
881 
882 	if (ns) {
883 		rcu_read_lock();
884 		bio = READ_ONCE(ioucmd->cookie);
885 		q = ns->queue;
886 		if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio
887 				&& bio->bi_bdev)
888 			ret = bio_poll(bio, iob, poll_flags);
889 		rcu_read_unlock();
890 	}
891 	srcu_read_unlock(&head->srcu, srcu_idx);
892 	return ret;
893 }
894 #endif /* CONFIG_NVME_MULTIPATH */
895 
896 int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
897 {
898 	struct nvme_ctrl *ctrl = ioucmd->file->private_data;
899 	int ret;
900 
901 	/* IOPOLL not supported yet */
902 	if (issue_flags & IO_URING_F_IOPOLL)
903 		return -EOPNOTSUPP;
904 
905 	ret = nvme_uring_cmd_checks(issue_flags);
906 	if (ret)
907 		return ret;
908 
909 	switch (ioucmd->cmd_op) {
910 	case NVME_URING_CMD_ADMIN:
911 		ret = nvme_uring_cmd_io(ctrl, NULL, ioucmd, issue_flags, false);
912 		break;
913 	case NVME_URING_CMD_ADMIN_VEC:
914 		ret = nvme_uring_cmd_io(ctrl, NULL, ioucmd, issue_flags, true);
915 		break;
916 	default:
917 		ret = -ENOTTY;
918 	}
919 
920 	return ret;
921 }
922 
923 static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp,
924 		fmode_t mode)
925 {
926 	struct nvme_ns *ns;
927 	int ret;
928 
929 	down_read(&ctrl->namespaces_rwsem);
930 	if (list_empty(&ctrl->namespaces)) {
931 		ret = -ENOTTY;
932 		goto out_unlock;
933 	}
934 
935 	ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
936 	if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) {
937 		dev_warn(ctrl->device,
938 			"NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n");
939 		ret = -EINVAL;
940 		goto out_unlock;
941 	}
942 
943 	dev_warn(ctrl->device,
944 		"using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n");
945 	kref_get(&ns->kref);
946 	up_read(&ctrl->namespaces_rwsem);
947 
948 	ret = nvme_user_cmd(ctrl, ns, argp, mode);
949 	nvme_put_ns(ns);
950 	return ret;
951 
952 out_unlock:
953 	up_read(&ctrl->namespaces_rwsem);
954 	return ret;
955 }
956 
957 long nvme_dev_ioctl(struct file *file, unsigned int cmd,
958 		unsigned long arg)
959 {
960 	struct nvme_ctrl *ctrl = file->private_data;
961 	void __user *argp = (void __user *)arg;
962 
963 	switch (cmd) {
964 	case NVME_IOCTL_ADMIN_CMD:
965 		return nvme_user_cmd(ctrl, NULL, argp, file->f_mode);
966 	case NVME_IOCTL_ADMIN64_CMD:
967 		return nvme_user_cmd64(ctrl, NULL, argp, false, file->f_mode);
968 	case NVME_IOCTL_IO_CMD:
969 		return nvme_dev_user_cmd(ctrl, argp, file->f_mode);
970 	case NVME_IOCTL_RESET:
971 		if (!capable(CAP_SYS_ADMIN))
972 			return -EACCES;
973 		dev_warn(ctrl->device, "resetting controller\n");
974 		return nvme_reset_ctrl_sync(ctrl);
975 	case NVME_IOCTL_SUBSYS_RESET:
976 		if (!capable(CAP_SYS_ADMIN))
977 			return -EACCES;
978 		return nvme_reset_subsystem(ctrl);
979 	case NVME_IOCTL_RESCAN:
980 		if (!capable(CAP_SYS_ADMIN))
981 			return -EACCES;
982 		nvme_queue_scan(ctrl);
983 		return 0;
984 	default:
985 		return -ENOTTY;
986 	}
987 }
988