1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2011-2014, Intel Corporation. 4 * Copyright (c) 2017-2021 Christoph Hellwig. 5 */ 6 #include <linux/ptrace.h> /* for force_successful_syscall_return */ 7 #include <linux/nvme_ioctl.h> 8 #include <linux/io_uring.h> 9 #include "nvme.h" 10 11 enum { 12 NVME_IOCTL_VEC = (1 << 0), 13 NVME_IOCTL_PARTITION = (1 << 1), 14 }; 15 16 static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c, 17 unsigned int flags, bool open_for_write) 18 { 19 u32 effects; 20 21 if (capable(CAP_SYS_ADMIN)) 22 return true; 23 24 /* 25 * Do not allow unprivileged passthrough on partitions, as that allows an 26 * escape from the containment of the partition. 27 */ 28 if (flags & NVME_IOCTL_PARTITION) 29 return false; 30 31 /* 32 * Do not allow unprivileged processes to send vendor specific or fabrics 33 * commands as we can't be sure about their effects. 34 */ 35 if (c->common.opcode >= nvme_cmd_vendor_start || 36 c->common.opcode == nvme_fabrics_command) 37 return false; 38 39 /* 40 * Do not allow unprivileged passthrough of admin commands except 41 * for a subset of identify commands that contain information required 42 * to form proper I/O commands in userspace and do not expose any 43 * potentially sensitive information. 44 */ 45 if (!ns) { 46 if (c->common.opcode == nvme_admin_identify) { 47 switch (c->identify.cns) { 48 case NVME_ID_CNS_NS: 49 case NVME_ID_CNS_CS_NS: 50 case NVME_ID_CNS_NS_CS_INDEP: 51 case NVME_ID_CNS_CS_CTRL: 52 case NVME_ID_CNS_CTRL: 53 return true; 54 } 55 } 56 return false; 57 } 58 59 /* 60 * Check if the controller provides a Commands Supported and Effects log 61 * and marks this command as supported. If not reject unprivileged 62 * passthrough. 63 */ 64 effects = nvme_command_effects(ns->ctrl, ns, c->common.opcode); 65 if (!(effects & NVME_CMD_EFFECTS_CSUPP)) 66 return false; 67 68 /* 69 * Don't allow passthrough for command that have intrusive (or unknown) 70 * effects. 71 */ 72 if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC | 73 NVME_CMD_EFFECTS_UUID_SEL | 74 NVME_CMD_EFFECTS_SCOPE_MASK)) 75 return false; 76 77 /* 78 * Only allow I/O commands that transfer data to the controller or that 79 * change the logical block contents if the file descriptor is open for 80 * writing. 81 */ 82 if (nvme_is_write(c) || (effects & NVME_CMD_EFFECTS_LBCC)) 83 return open_for_write; 84 return true; 85 } 86 87 /* 88 * Convert integer values from ioctl structures to user pointers, silently 89 * ignoring the upper bits in the compat case to match behaviour of 32-bit 90 * kernels. 91 */ 92 static void __user *nvme_to_user_ptr(uintptr_t ptrval) 93 { 94 if (in_compat_syscall()) 95 ptrval = (compat_uptr_t)ptrval; 96 return (void __user *)ptrval; 97 } 98 99 static void *nvme_add_user_metadata(struct request *req, void __user *ubuf, 100 unsigned len, u32 seed) 101 { 102 struct bio_integrity_payload *bip; 103 int ret = -ENOMEM; 104 void *buf; 105 struct bio *bio = req->bio; 106 107 buf = kmalloc(len, GFP_KERNEL); 108 if (!buf) 109 goto out; 110 111 ret = -EFAULT; 112 if ((req_op(req) == REQ_OP_DRV_OUT) && copy_from_user(buf, ubuf, len)) 113 goto out_free_meta; 114 115 bip = bio_integrity_alloc(bio, GFP_KERNEL, 1); 116 if (IS_ERR(bip)) { 117 ret = PTR_ERR(bip); 118 goto out_free_meta; 119 } 120 121 bip->bip_iter.bi_size = len; 122 bip->bip_iter.bi_sector = seed; 123 ret = bio_integrity_add_page(bio, virt_to_page(buf), len, 124 offset_in_page(buf)); 125 if (ret != len) { 126 ret = -ENOMEM; 127 goto out_free_meta; 128 } 129 130 req->cmd_flags |= REQ_INTEGRITY; 131 return buf; 132 out_free_meta: 133 kfree(buf); 134 out: 135 return ERR_PTR(ret); 136 } 137 138 static int nvme_finish_user_metadata(struct request *req, void __user *ubuf, 139 void *meta, unsigned len, int ret) 140 { 141 if (!ret && req_op(req) == REQ_OP_DRV_IN && 142 copy_to_user(ubuf, meta, len)) 143 ret = -EFAULT; 144 kfree(meta); 145 return ret; 146 } 147 148 static struct request *nvme_alloc_user_request(struct request_queue *q, 149 struct nvme_command *cmd, blk_opf_t rq_flags, 150 blk_mq_req_flags_t blk_flags) 151 { 152 struct request *req; 153 154 req = blk_mq_alloc_request(q, nvme_req_op(cmd) | rq_flags, blk_flags); 155 if (IS_ERR(req)) 156 return req; 157 nvme_init_request(req, cmd); 158 nvme_req(req)->flags |= NVME_REQ_USERCMD; 159 return req; 160 } 161 162 static int nvme_map_user_request(struct request *req, u64 ubuffer, 163 unsigned bufflen, void __user *meta_buffer, unsigned meta_len, 164 u32 meta_seed, void **metap, struct io_uring_cmd *ioucmd, 165 unsigned int flags) 166 { 167 struct request_queue *q = req->q; 168 struct nvme_ns *ns = q->queuedata; 169 struct block_device *bdev = ns ? ns->disk->part0 : NULL; 170 struct bio *bio = NULL; 171 void *meta = NULL; 172 int ret; 173 174 if (ioucmd && (ioucmd->flags & IORING_URING_CMD_FIXED)) { 175 struct iov_iter iter; 176 177 /* fixedbufs is only for non-vectored io */ 178 if (WARN_ON_ONCE(flags & NVME_IOCTL_VEC)) 179 return -EINVAL; 180 ret = io_uring_cmd_import_fixed(ubuffer, bufflen, 181 rq_data_dir(req), &iter, ioucmd); 182 if (ret < 0) 183 goto out; 184 ret = blk_rq_map_user_iov(q, req, NULL, &iter, GFP_KERNEL); 185 } else { 186 ret = blk_rq_map_user_io(req, NULL, nvme_to_user_ptr(ubuffer), 187 bufflen, GFP_KERNEL, flags & NVME_IOCTL_VEC, 0, 188 0, rq_data_dir(req)); 189 } 190 191 if (ret) 192 goto out; 193 bio = req->bio; 194 if (bdev) 195 bio_set_dev(bio, bdev); 196 197 if (bdev && meta_buffer && meta_len) { 198 meta = nvme_add_user_metadata(req, meta_buffer, meta_len, 199 meta_seed); 200 if (IS_ERR(meta)) { 201 ret = PTR_ERR(meta); 202 goto out_unmap; 203 } 204 *metap = meta; 205 } 206 207 return ret; 208 209 out_unmap: 210 if (bio) 211 blk_rq_unmap_user(bio); 212 out: 213 blk_mq_free_request(req); 214 return ret; 215 } 216 217 static int nvme_submit_user_cmd(struct request_queue *q, 218 struct nvme_command *cmd, u64 ubuffer, unsigned bufflen, 219 void __user *meta_buffer, unsigned meta_len, u32 meta_seed, 220 u64 *result, unsigned timeout, unsigned int flags) 221 { 222 struct nvme_ns *ns = q->queuedata; 223 struct nvme_ctrl *ctrl; 224 struct request *req; 225 void *meta = NULL; 226 struct bio *bio; 227 u32 effects; 228 int ret; 229 230 req = nvme_alloc_user_request(q, cmd, 0, 0); 231 if (IS_ERR(req)) 232 return PTR_ERR(req); 233 234 req->timeout = timeout; 235 if (ubuffer && bufflen) { 236 ret = nvme_map_user_request(req, ubuffer, bufflen, meta_buffer, 237 meta_len, meta_seed, &meta, NULL, flags); 238 if (ret) 239 return ret; 240 } 241 242 bio = req->bio; 243 ctrl = nvme_req(req)->ctrl; 244 245 effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode); 246 ret = nvme_execute_rq(req, false); 247 if (result) 248 *result = le64_to_cpu(nvme_req(req)->result.u64); 249 if (meta) 250 ret = nvme_finish_user_metadata(req, meta_buffer, meta, 251 meta_len, ret); 252 if (bio) 253 blk_rq_unmap_user(bio); 254 blk_mq_free_request(req); 255 256 if (effects) 257 nvme_passthru_end(ctrl, ns, effects, cmd, ret); 258 259 return ret; 260 } 261 262 static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) 263 { 264 struct nvme_user_io io; 265 struct nvme_command c; 266 unsigned length, meta_len; 267 void __user *metadata; 268 269 if (copy_from_user(&io, uio, sizeof(io))) 270 return -EFAULT; 271 if (io.flags) 272 return -EINVAL; 273 274 switch (io.opcode) { 275 case nvme_cmd_write: 276 case nvme_cmd_read: 277 case nvme_cmd_compare: 278 break; 279 default: 280 return -EINVAL; 281 } 282 283 length = (io.nblocks + 1) << ns->lba_shift; 284 285 if ((io.control & NVME_RW_PRINFO_PRACT) && 286 ns->ms == sizeof(struct t10_pi_tuple)) { 287 /* 288 * Protection information is stripped/inserted by the 289 * controller. 290 */ 291 if (nvme_to_user_ptr(io.metadata)) 292 return -EINVAL; 293 meta_len = 0; 294 metadata = NULL; 295 } else { 296 meta_len = (io.nblocks + 1) * ns->ms; 297 metadata = nvme_to_user_ptr(io.metadata); 298 } 299 300 if (ns->features & NVME_NS_EXT_LBAS) { 301 length += meta_len; 302 meta_len = 0; 303 } else if (meta_len) { 304 if ((io.metadata & 3) || !io.metadata) 305 return -EINVAL; 306 } 307 308 memset(&c, 0, sizeof(c)); 309 c.rw.opcode = io.opcode; 310 c.rw.flags = io.flags; 311 c.rw.nsid = cpu_to_le32(ns->head->ns_id); 312 c.rw.slba = cpu_to_le64(io.slba); 313 c.rw.length = cpu_to_le16(io.nblocks); 314 c.rw.control = cpu_to_le16(io.control); 315 c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); 316 c.rw.reftag = cpu_to_le32(io.reftag); 317 c.rw.apptag = cpu_to_le16(io.apptag); 318 c.rw.appmask = cpu_to_le16(io.appmask); 319 320 return nvme_submit_user_cmd(ns->queue, &c, io.addr, length, metadata, 321 meta_len, lower_32_bits(io.slba), NULL, 0, 0); 322 } 323 324 static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl, 325 struct nvme_ns *ns, __u32 nsid) 326 { 327 if (ns && nsid != ns->head->ns_id) { 328 dev_err(ctrl->device, 329 "%s: nsid (%u) in cmd does not match nsid (%u)" 330 "of namespace\n", 331 current->comm, nsid, ns->head->ns_id); 332 return false; 333 } 334 335 return true; 336 } 337 338 static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, 339 struct nvme_passthru_cmd __user *ucmd, unsigned int flags, 340 bool open_for_write) 341 { 342 struct nvme_passthru_cmd cmd; 343 struct nvme_command c; 344 unsigned timeout = 0; 345 u64 result; 346 int status; 347 348 if (copy_from_user(&cmd, ucmd, sizeof(cmd))) 349 return -EFAULT; 350 if (cmd.flags) 351 return -EINVAL; 352 if (!nvme_validate_passthru_nsid(ctrl, ns, cmd.nsid)) 353 return -EINVAL; 354 355 memset(&c, 0, sizeof(c)); 356 c.common.opcode = cmd.opcode; 357 c.common.flags = cmd.flags; 358 c.common.nsid = cpu_to_le32(cmd.nsid); 359 c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); 360 c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); 361 c.common.cdw10 = cpu_to_le32(cmd.cdw10); 362 c.common.cdw11 = cpu_to_le32(cmd.cdw11); 363 c.common.cdw12 = cpu_to_le32(cmd.cdw12); 364 c.common.cdw13 = cpu_to_le32(cmd.cdw13); 365 c.common.cdw14 = cpu_to_le32(cmd.cdw14); 366 c.common.cdw15 = cpu_to_le32(cmd.cdw15); 367 368 if (!nvme_cmd_allowed(ns, &c, 0, open_for_write)) 369 return -EACCES; 370 371 if (cmd.timeout_ms) 372 timeout = msecs_to_jiffies(cmd.timeout_ms); 373 374 status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, 375 cmd.addr, cmd.data_len, nvme_to_user_ptr(cmd.metadata), 376 cmd.metadata_len, 0, &result, timeout, 0); 377 378 if (status >= 0) { 379 if (put_user(result, &ucmd->result)) 380 return -EFAULT; 381 } 382 383 return status; 384 } 385 386 static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, 387 struct nvme_passthru_cmd64 __user *ucmd, unsigned int flags, 388 bool open_for_write) 389 { 390 struct nvme_passthru_cmd64 cmd; 391 struct nvme_command c; 392 unsigned timeout = 0; 393 int status; 394 395 if (copy_from_user(&cmd, ucmd, sizeof(cmd))) 396 return -EFAULT; 397 if (cmd.flags) 398 return -EINVAL; 399 if (!nvme_validate_passthru_nsid(ctrl, ns, cmd.nsid)) 400 return -EINVAL; 401 402 memset(&c, 0, sizeof(c)); 403 c.common.opcode = cmd.opcode; 404 c.common.flags = cmd.flags; 405 c.common.nsid = cpu_to_le32(cmd.nsid); 406 c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); 407 c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); 408 c.common.cdw10 = cpu_to_le32(cmd.cdw10); 409 c.common.cdw11 = cpu_to_le32(cmd.cdw11); 410 c.common.cdw12 = cpu_to_le32(cmd.cdw12); 411 c.common.cdw13 = cpu_to_le32(cmd.cdw13); 412 c.common.cdw14 = cpu_to_le32(cmd.cdw14); 413 c.common.cdw15 = cpu_to_le32(cmd.cdw15); 414 415 if (!nvme_cmd_allowed(ns, &c, flags, open_for_write)) 416 return -EACCES; 417 418 if (cmd.timeout_ms) 419 timeout = msecs_to_jiffies(cmd.timeout_ms); 420 421 status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, 422 cmd.addr, cmd.data_len, nvme_to_user_ptr(cmd.metadata), 423 cmd.metadata_len, 0, &cmd.result, timeout, flags); 424 425 if (status >= 0) { 426 if (put_user(cmd.result, &ucmd->result)) 427 return -EFAULT; 428 } 429 430 return status; 431 } 432 433 struct nvme_uring_data { 434 __u64 metadata; 435 __u64 addr; 436 __u32 data_len; 437 __u32 metadata_len; 438 __u32 timeout_ms; 439 }; 440 441 /* 442 * This overlays struct io_uring_cmd pdu. 443 * Expect build errors if this grows larger than that. 444 */ 445 struct nvme_uring_cmd_pdu { 446 union { 447 struct bio *bio; 448 struct request *req; 449 }; 450 u32 meta_len; 451 u32 nvme_status; 452 union { 453 struct { 454 void *meta; /* kernel-resident buffer */ 455 void __user *meta_buffer; 456 }; 457 u64 result; 458 } u; 459 }; 460 461 static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu( 462 struct io_uring_cmd *ioucmd) 463 { 464 return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu; 465 } 466 467 static void nvme_uring_task_meta_cb(struct io_uring_cmd *ioucmd, 468 unsigned issue_flags) 469 { 470 struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); 471 struct request *req = pdu->req; 472 int status; 473 u64 result; 474 475 if (nvme_req(req)->flags & NVME_REQ_CANCELLED) 476 status = -EINTR; 477 else 478 status = nvme_req(req)->status; 479 480 result = le64_to_cpu(nvme_req(req)->result.u64); 481 482 if (pdu->meta_len) 483 status = nvme_finish_user_metadata(req, pdu->u.meta_buffer, 484 pdu->u.meta, pdu->meta_len, status); 485 if (req->bio) 486 blk_rq_unmap_user(req->bio); 487 blk_mq_free_request(req); 488 489 io_uring_cmd_done(ioucmd, status, result, issue_flags); 490 } 491 492 static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd, 493 unsigned issue_flags) 494 { 495 struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); 496 497 if (pdu->bio) 498 blk_rq_unmap_user(pdu->bio); 499 500 io_uring_cmd_done(ioucmd, pdu->nvme_status, pdu->u.result, issue_flags); 501 } 502 503 static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, 504 blk_status_t err) 505 { 506 struct io_uring_cmd *ioucmd = req->end_io_data; 507 struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); 508 void *cookie = READ_ONCE(ioucmd->cookie); 509 510 req->bio = pdu->bio; 511 if (nvme_req(req)->flags & NVME_REQ_CANCELLED) 512 pdu->nvme_status = -EINTR; 513 else 514 pdu->nvme_status = nvme_req(req)->status; 515 pdu->u.result = le64_to_cpu(nvme_req(req)->result.u64); 516 517 /* 518 * For iopoll, complete it directly. 519 * Otherwise, move the completion to task work. 520 */ 521 if (cookie != NULL && blk_rq_is_poll(req)) 522 nvme_uring_task_cb(ioucmd, IO_URING_F_UNLOCKED); 523 else 524 io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb); 525 526 return RQ_END_IO_FREE; 527 } 528 529 static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req, 530 blk_status_t err) 531 { 532 struct io_uring_cmd *ioucmd = req->end_io_data; 533 struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); 534 void *cookie = READ_ONCE(ioucmd->cookie); 535 536 req->bio = pdu->bio; 537 pdu->req = req; 538 539 /* 540 * For iopoll, complete it directly. 541 * Otherwise, move the completion to task work. 542 */ 543 if (cookie != NULL && blk_rq_is_poll(req)) 544 nvme_uring_task_meta_cb(ioucmd, IO_URING_F_UNLOCKED); 545 else 546 io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_meta_cb); 547 548 return RQ_END_IO_NONE; 549 } 550 551 static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, 552 struct io_uring_cmd *ioucmd, unsigned int issue_flags, bool vec) 553 { 554 struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); 555 const struct nvme_uring_cmd *cmd = io_uring_sqe_cmd(ioucmd->sqe); 556 struct request_queue *q = ns ? ns->queue : ctrl->admin_q; 557 struct nvme_uring_data d; 558 struct nvme_command c; 559 struct request *req; 560 blk_opf_t rq_flags = REQ_ALLOC_CACHE; 561 blk_mq_req_flags_t blk_flags = 0; 562 void *meta = NULL; 563 int ret; 564 565 c.common.opcode = READ_ONCE(cmd->opcode); 566 c.common.flags = READ_ONCE(cmd->flags); 567 if (c.common.flags) 568 return -EINVAL; 569 570 c.common.command_id = 0; 571 c.common.nsid = cpu_to_le32(cmd->nsid); 572 if (!nvme_validate_passthru_nsid(ctrl, ns, le32_to_cpu(c.common.nsid))) 573 return -EINVAL; 574 575 c.common.cdw2[0] = cpu_to_le32(READ_ONCE(cmd->cdw2)); 576 c.common.cdw2[1] = cpu_to_le32(READ_ONCE(cmd->cdw3)); 577 c.common.metadata = 0; 578 c.common.dptr.prp1 = c.common.dptr.prp2 = 0; 579 c.common.cdw10 = cpu_to_le32(READ_ONCE(cmd->cdw10)); 580 c.common.cdw11 = cpu_to_le32(READ_ONCE(cmd->cdw11)); 581 c.common.cdw12 = cpu_to_le32(READ_ONCE(cmd->cdw12)); 582 c.common.cdw13 = cpu_to_le32(READ_ONCE(cmd->cdw13)); 583 c.common.cdw14 = cpu_to_le32(READ_ONCE(cmd->cdw14)); 584 c.common.cdw15 = cpu_to_le32(READ_ONCE(cmd->cdw15)); 585 586 if (!nvme_cmd_allowed(ns, &c, 0, ioucmd->file->f_mode & FMODE_WRITE)) 587 return -EACCES; 588 589 d.metadata = READ_ONCE(cmd->metadata); 590 d.addr = READ_ONCE(cmd->addr); 591 d.data_len = READ_ONCE(cmd->data_len); 592 d.metadata_len = READ_ONCE(cmd->metadata_len); 593 d.timeout_ms = READ_ONCE(cmd->timeout_ms); 594 595 if (issue_flags & IO_URING_F_NONBLOCK) { 596 rq_flags |= REQ_NOWAIT; 597 blk_flags = BLK_MQ_REQ_NOWAIT; 598 } 599 if (issue_flags & IO_URING_F_IOPOLL) 600 rq_flags |= REQ_POLLED; 601 602 retry: 603 req = nvme_alloc_user_request(q, &c, rq_flags, blk_flags); 604 if (IS_ERR(req)) 605 return PTR_ERR(req); 606 req->timeout = d.timeout_ms ? msecs_to_jiffies(d.timeout_ms) : 0; 607 608 if (d.addr && d.data_len) { 609 ret = nvme_map_user_request(req, d.addr, 610 d.data_len, nvme_to_user_ptr(d.metadata), 611 d.metadata_len, 0, &meta, ioucmd, vec); 612 if (ret) 613 return ret; 614 } 615 616 if (issue_flags & IO_URING_F_IOPOLL && rq_flags & REQ_POLLED) { 617 if (unlikely(!req->bio)) { 618 /* we can't poll this, so alloc regular req instead */ 619 blk_mq_free_request(req); 620 rq_flags &= ~REQ_POLLED; 621 goto retry; 622 } else { 623 WRITE_ONCE(ioucmd->cookie, req->bio); 624 req->bio->bi_opf |= REQ_POLLED; 625 } 626 } 627 /* to free bio on completion, as req->bio will be null at that time */ 628 pdu->bio = req->bio; 629 pdu->meta_len = d.metadata_len; 630 req->end_io_data = ioucmd; 631 if (pdu->meta_len) { 632 pdu->u.meta = meta; 633 pdu->u.meta_buffer = nvme_to_user_ptr(d.metadata); 634 req->end_io = nvme_uring_cmd_end_io_meta; 635 } else { 636 req->end_io = nvme_uring_cmd_end_io; 637 } 638 blk_execute_rq_nowait(req, false); 639 return -EIOCBQUEUED; 640 } 641 642 static bool is_ctrl_ioctl(unsigned int cmd) 643 { 644 if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD) 645 return true; 646 if (is_sed_ioctl(cmd)) 647 return true; 648 return false; 649 } 650 651 static int nvme_ctrl_ioctl(struct nvme_ctrl *ctrl, unsigned int cmd, 652 void __user *argp, bool open_for_write) 653 { 654 switch (cmd) { 655 case NVME_IOCTL_ADMIN_CMD: 656 return nvme_user_cmd(ctrl, NULL, argp, 0, open_for_write); 657 case NVME_IOCTL_ADMIN64_CMD: 658 return nvme_user_cmd64(ctrl, NULL, argp, 0, open_for_write); 659 default: 660 return sed_ioctl(ctrl->opal_dev, cmd, argp); 661 } 662 } 663 664 #ifdef COMPAT_FOR_U64_ALIGNMENT 665 struct nvme_user_io32 { 666 __u8 opcode; 667 __u8 flags; 668 __u16 control; 669 __u16 nblocks; 670 __u16 rsvd; 671 __u64 metadata; 672 __u64 addr; 673 __u64 slba; 674 __u32 dsmgmt; 675 __u32 reftag; 676 __u16 apptag; 677 __u16 appmask; 678 } __attribute__((__packed__)); 679 #define NVME_IOCTL_SUBMIT_IO32 _IOW('N', 0x42, struct nvme_user_io32) 680 #endif /* COMPAT_FOR_U64_ALIGNMENT */ 681 682 static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd, 683 void __user *argp, unsigned int flags, bool open_for_write) 684 { 685 switch (cmd) { 686 case NVME_IOCTL_ID: 687 force_successful_syscall_return(); 688 return ns->head->ns_id; 689 case NVME_IOCTL_IO_CMD: 690 return nvme_user_cmd(ns->ctrl, ns, argp, flags, open_for_write); 691 /* 692 * struct nvme_user_io can have different padding on some 32-bit ABIs. 693 * Just accept the compat version as all fields that are used are the 694 * same size and at the same offset. 695 */ 696 #ifdef COMPAT_FOR_U64_ALIGNMENT 697 case NVME_IOCTL_SUBMIT_IO32: 698 #endif 699 case NVME_IOCTL_SUBMIT_IO: 700 return nvme_submit_io(ns, argp); 701 case NVME_IOCTL_IO64_CMD_VEC: 702 flags |= NVME_IOCTL_VEC; 703 fallthrough; 704 case NVME_IOCTL_IO64_CMD: 705 return nvme_user_cmd64(ns->ctrl, ns, argp, flags, 706 open_for_write); 707 default: 708 return -ENOTTY; 709 } 710 } 711 712 int nvme_ioctl(struct block_device *bdev, blk_mode_t mode, 713 unsigned int cmd, unsigned long arg) 714 { 715 struct nvme_ns *ns = bdev->bd_disk->private_data; 716 bool open_for_write = mode & BLK_OPEN_WRITE; 717 void __user *argp = (void __user *)arg; 718 unsigned int flags = 0; 719 720 if (bdev_is_partition(bdev)) 721 flags |= NVME_IOCTL_PARTITION; 722 723 if (is_ctrl_ioctl(cmd)) 724 return nvme_ctrl_ioctl(ns->ctrl, cmd, argp, open_for_write); 725 return nvme_ns_ioctl(ns, cmd, argp, flags, open_for_write); 726 } 727 728 long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 729 { 730 struct nvme_ns *ns = 731 container_of(file_inode(file)->i_cdev, struct nvme_ns, cdev); 732 bool open_for_write = file->f_mode & FMODE_WRITE; 733 void __user *argp = (void __user *)arg; 734 735 if (is_ctrl_ioctl(cmd)) 736 return nvme_ctrl_ioctl(ns->ctrl, cmd, argp, open_for_write); 737 return nvme_ns_ioctl(ns, cmd, argp, 0, open_for_write); 738 } 739 740 static int nvme_uring_cmd_checks(unsigned int issue_flags) 741 { 742 743 /* NVMe passthrough requires big SQE/CQE support */ 744 if ((issue_flags & (IO_URING_F_SQE128|IO_URING_F_CQE32)) != 745 (IO_URING_F_SQE128|IO_URING_F_CQE32)) 746 return -EOPNOTSUPP; 747 return 0; 748 } 749 750 static int nvme_ns_uring_cmd(struct nvme_ns *ns, struct io_uring_cmd *ioucmd, 751 unsigned int issue_flags) 752 { 753 struct nvme_ctrl *ctrl = ns->ctrl; 754 int ret; 755 756 BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu)); 757 758 ret = nvme_uring_cmd_checks(issue_flags); 759 if (ret) 760 return ret; 761 762 switch (ioucmd->cmd_op) { 763 case NVME_URING_CMD_IO: 764 ret = nvme_uring_cmd_io(ctrl, ns, ioucmd, issue_flags, false); 765 break; 766 case NVME_URING_CMD_IO_VEC: 767 ret = nvme_uring_cmd_io(ctrl, ns, ioucmd, issue_flags, true); 768 break; 769 default: 770 ret = -ENOTTY; 771 } 772 773 return ret; 774 } 775 776 int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags) 777 { 778 struct nvme_ns *ns = container_of(file_inode(ioucmd->file)->i_cdev, 779 struct nvme_ns, cdev); 780 781 return nvme_ns_uring_cmd(ns, ioucmd, issue_flags); 782 } 783 784 int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd, 785 struct io_comp_batch *iob, 786 unsigned int poll_flags) 787 { 788 struct bio *bio; 789 int ret = 0; 790 struct nvme_ns *ns; 791 struct request_queue *q; 792 793 rcu_read_lock(); 794 bio = READ_ONCE(ioucmd->cookie); 795 ns = container_of(file_inode(ioucmd->file)->i_cdev, 796 struct nvme_ns, cdev); 797 q = ns->queue; 798 if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio && bio->bi_bdev) 799 ret = bio_poll(bio, iob, poll_flags); 800 rcu_read_unlock(); 801 return ret; 802 } 803 #ifdef CONFIG_NVME_MULTIPATH 804 static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, 805 void __user *argp, struct nvme_ns_head *head, int srcu_idx, 806 bool open_for_write) 807 __releases(&head->srcu) 808 { 809 struct nvme_ctrl *ctrl = ns->ctrl; 810 int ret; 811 812 nvme_get_ctrl(ns->ctrl); 813 srcu_read_unlock(&head->srcu, srcu_idx); 814 ret = nvme_ctrl_ioctl(ns->ctrl, cmd, argp, open_for_write); 815 816 nvme_put_ctrl(ctrl); 817 return ret; 818 } 819 820 int nvme_ns_head_ioctl(struct block_device *bdev, blk_mode_t mode, 821 unsigned int cmd, unsigned long arg) 822 { 823 struct nvme_ns_head *head = bdev->bd_disk->private_data; 824 bool open_for_write = mode & BLK_OPEN_WRITE; 825 void __user *argp = (void __user *)arg; 826 struct nvme_ns *ns; 827 int srcu_idx, ret = -EWOULDBLOCK; 828 unsigned int flags = 0; 829 830 if (bdev_is_partition(bdev)) 831 flags |= NVME_IOCTL_PARTITION; 832 833 srcu_idx = srcu_read_lock(&head->srcu); 834 ns = nvme_find_path(head); 835 if (!ns) 836 goto out_unlock; 837 838 /* 839 * Handle ioctls that apply to the controller instead of the namespace 840 * seperately and drop the ns SRCU reference early. This avoids a 841 * deadlock when deleting namespaces using the passthrough interface. 842 */ 843 if (is_ctrl_ioctl(cmd)) 844 return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx, 845 open_for_write); 846 847 ret = nvme_ns_ioctl(ns, cmd, argp, flags, open_for_write); 848 out_unlock: 849 srcu_read_unlock(&head->srcu, srcu_idx); 850 return ret; 851 } 852 853 long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd, 854 unsigned long arg) 855 { 856 bool open_for_write = file->f_mode & FMODE_WRITE; 857 struct cdev *cdev = file_inode(file)->i_cdev; 858 struct nvme_ns_head *head = 859 container_of(cdev, struct nvme_ns_head, cdev); 860 void __user *argp = (void __user *)arg; 861 struct nvme_ns *ns; 862 int srcu_idx, ret = -EWOULDBLOCK; 863 864 srcu_idx = srcu_read_lock(&head->srcu); 865 ns = nvme_find_path(head); 866 if (!ns) 867 goto out_unlock; 868 869 if (is_ctrl_ioctl(cmd)) 870 return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx, 871 open_for_write); 872 873 ret = nvme_ns_ioctl(ns, cmd, argp, 0, open_for_write); 874 out_unlock: 875 srcu_read_unlock(&head->srcu, srcu_idx); 876 return ret; 877 } 878 879 int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd, 880 unsigned int issue_flags) 881 { 882 struct cdev *cdev = file_inode(ioucmd->file)->i_cdev; 883 struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev); 884 int srcu_idx = srcu_read_lock(&head->srcu); 885 struct nvme_ns *ns = nvme_find_path(head); 886 int ret = -EINVAL; 887 888 if (ns) 889 ret = nvme_ns_uring_cmd(ns, ioucmd, issue_flags); 890 srcu_read_unlock(&head->srcu, srcu_idx); 891 return ret; 892 } 893 894 int nvme_ns_head_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd, 895 struct io_comp_batch *iob, 896 unsigned int poll_flags) 897 { 898 struct cdev *cdev = file_inode(ioucmd->file)->i_cdev; 899 struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev); 900 int srcu_idx = srcu_read_lock(&head->srcu); 901 struct nvme_ns *ns = nvme_find_path(head); 902 struct bio *bio; 903 int ret = 0; 904 struct request_queue *q; 905 906 if (ns) { 907 rcu_read_lock(); 908 bio = READ_ONCE(ioucmd->cookie); 909 q = ns->queue; 910 if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio 911 && bio->bi_bdev) 912 ret = bio_poll(bio, iob, poll_flags); 913 rcu_read_unlock(); 914 } 915 srcu_read_unlock(&head->srcu, srcu_idx); 916 return ret; 917 } 918 #endif /* CONFIG_NVME_MULTIPATH */ 919 920 int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags) 921 { 922 struct nvme_ctrl *ctrl = ioucmd->file->private_data; 923 int ret; 924 925 /* IOPOLL not supported yet */ 926 if (issue_flags & IO_URING_F_IOPOLL) 927 return -EOPNOTSUPP; 928 929 ret = nvme_uring_cmd_checks(issue_flags); 930 if (ret) 931 return ret; 932 933 switch (ioucmd->cmd_op) { 934 case NVME_URING_CMD_ADMIN: 935 ret = nvme_uring_cmd_io(ctrl, NULL, ioucmd, issue_flags, false); 936 break; 937 case NVME_URING_CMD_ADMIN_VEC: 938 ret = nvme_uring_cmd_io(ctrl, NULL, ioucmd, issue_flags, true); 939 break; 940 default: 941 ret = -ENOTTY; 942 } 943 944 return ret; 945 } 946 947 static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp, 948 bool open_for_write) 949 { 950 struct nvme_ns *ns; 951 int ret; 952 953 down_read(&ctrl->namespaces_rwsem); 954 if (list_empty(&ctrl->namespaces)) { 955 ret = -ENOTTY; 956 goto out_unlock; 957 } 958 959 ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list); 960 if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) { 961 dev_warn(ctrl->device, 962 "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n"); 963 ret = -EINVAL; 964 goto out_unlock; 965 } 966 967 dev_warn(ctrl->device, 968 "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n"); 969 kref_get(&ns->kref); 970 up_read(&ctrl->namespaces_rwsem); 971 972 ret = nvme_user_cmd(ctrl, ns, argp, 0, open_for_write); 973 nvme_put_ns(ns); 974 return ret; 975 976 out_unlock: 977 up_read(&ctrl->namespaces_rwsem); 978 return ret; 979 } 980 981 long nvme_dev_ioctl(struct file *file, unsigned int cmd, 982 unsigned long arg) 983 { 984 bool open_for_write = file->f_mode & FMODE_WRITE; 985 struct nvme_ctrl *ctrl = file->private_data; 986 void __user *argp = (void __user *)arg; 987 988 switch (cmd) { 989 case NVME_IOCTL_ADMIN_CMD: 990 return nvme_user_cmd(ctrl, NULL, argp, 0, open_for_write); 991 case NVME_IOCTL_ADMIN64_CMD: 992 return nvme_user_cmd64(ctrl, NULL, argp, 0, open_for_write); 993 case NVME_IOCTL_IO_CMD: 994 return nvme_dev_user_cmd(ctrl, argp, open_for_write); 995 case NVME_IOCTL_RESET: 996 if (!capable(CAP_SYS_ADMIN)) 997 return -EACCES; 998 dev_warn(ctrl->device, "resetting controller\n"); 999 return nvme_reset_ctrl_sync(ctrl); 1000 case NVME_IOCTL_SUBSYS_RESET: 1001 if (!capable(CAP_SYS_ADMIN)) 1002 return -EACCES; 1003 return nvme_reset_subsystem(ctrl); 1004 case NVME_IOCTL_RESCAN: 1005 if (!capable(CAP_SYS_ADMIN)) 1006 return -EACCES; 1007 nvme_queue_scan(ctrl); 1008 return 0; 1009 default: 1010 return -ENOTTY; 1011 } 1012 } 1013