1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2011-2014, Intel Corporation. 4 * Copyright (c) 2017-2021 Christoph Hellwig. 5 */ 6 #include <linux/ptrace.h> /* for force_successful_syscall_return */ 7 #include <linux/nvme_ioctl.h> 8 #include <linux/io_uring.h> 9 #include "nvme.h" 10 11 enum { 12 NVME_IOCTL_VEC = (1 << 0), 13 NVME_IOCTL_PARTITION = (1 << 1), 14 }; 15 16 static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c, 17 unsigned int flags, bool open_for_write) 18 { 19 u32 effects; 20 21 /* 22 * Do not allow unprivileged passthrough on partitions, as that allows an 23 * escape from the containment of the partition. 24 */ 25 if (flags & NVME_IOCTL_PARTITION) 26 goto admin; 27 28 /* 29 * Do not allow unprivileged processes to send vendor specific or fabrics 30 * commands as we can't be sure about their effects. 31 */ 32 if (c->common.opcode >= nvme_cmd_vendor_start || 33 c->common.opcode == nvme_fabrics_command) 34 goto admin; 35 36 /* 37 * Do not allow unprivileged passthrough of admin commands except 38 * for a subset of identify commands that contain information required 39 * to form proper I/O commands in userspace and do not expose any 40 * potentially sensitive information. 41 */ 42 if (!ns) { 43 if (c->common.opcode == nvme_admin_identify) { 44 switch (c->identify.cns) { 45 case NVME_ID_CNS_NS: 46 case NVME_ID_CNS_CS_NS: 47 case NVME_ID_CNS_NS_CS_INDEP: 48 case NVME_ID_CNS_CS_CTRL: 49 case NVME_ID_CNS_CTRL: 50 return true; 51 } 52 } 53 goto admin; 54 } 55 56 /* 57 * Check if the controller provides a Commands Supported and Effects log 58 * and marks this command as supported. If not reject unprivileged 59 * passthrough. 60 */ 61 effects = nvme_command_effects(ns->ctrl, ns, c->common.opcode); 62 if (!(effects & NVME_CMD_EFFECTS_CSUPP)) 63 goto admin; 64 65 /* 66 * Don't allow passthrough for command that have intrusive (or unknown) 67 * effects. 68 */ 69 if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC | 70 NVME_CMD_EFFECTS_UUID_SEL | 71 NVME_CMD_EFFECTS_SCOPE_MASK)) 72 goto admin; 73 74 /* 75 * Only allow I/O commands that transfer data to the controller or that 76 * change the logical block contents if the file descriptor is open for 77 * writing. 78 */ 79 if ((nvme_is_write(c) || (effects & NVME_CMD_EFFECTS_LBCC)) && 80 !open_for_write) 81 goto admin; 82 83 return true; 84 admin: 85 return capable(CAP_SYS_ADMIN); 86 } 87 88 /* 89 * Convert integer values from ioctl structures to user pointers, silently 90 * ignoring the upper bits in the compat case to match behaviour of 32-bit 91 * kernels. 92 */ 93 static void __user *nvme_to_user_ptr(uintptr_t ptrval) 94 { 95 if (in_compat_syscall()) 96 ptrval = (compat_uptr_t)ptrval; 97 return (void __user *)ptrval; 98 } 99 100 static void *nvme_add_user_metadata(struct request *req, void __user *ubuf, 101 unsigned len, u32 seed) 102 { 103 struct bio_integrity_payload *bip; 104 int ret = -ENOMEM; 105 void *buf; 106 struct bio *bio = req->bio; 107 108 buf = kmalloc(len, GFP_KERNEL); 109 if (!buf) 110 goto out; 111 112 if (req_op(req) == REQ_OP_DRV_OUT) { 113 ret = -EFAULT; 114 if (copy_from_user(buf, ubuf, len)) 115 goto out_free_meta; 116 } else { 117 memset(buf, 0, len); 118 } 119 120 bip = bio_integrity_alloc(bio, GFP_KERNEL, 1); 121 if (IS_ERR(bip)) { 122 ret = PTR_ERR(bip); 123 goto out_free_meta; 124 } 125 126 bip->bip_iter.bi_sector = seed; 127 ret = bio_integrity_add_page(bio, virt_to_page(buf), len, 128 offset_in_page(buf)); 129 if (ret != len) { 130 ret = -ENOMEM; 131 goto out_free_meta; 132 } 133 134 req->cmd_flags |= REQ_INTEGRITY; 135 return buf; 136 out_free_meta: 137 kfree(buf); 138 out: 139 return ERR_PTR(ret); 140 } 141 142 static int nvme_finish_user_metadata(struct request *req, void __user *ubuf, 143 void *meta, unsigned len, int ret) 144 { 145 if (!ret && req_op(req) == REQ_OP_DRV_IN && 146 copy_to_user(ubuf, meta, len)) 147 ret = -EFAULT; 148 kfree(meta); 149 return ret; 150 } 151 152 static struct request *nvme_alloc_user_request(struct request_queue *q, 153 struct nvme_command *cmd, blk_opf_t rq_flags, 154 blk_mq_req_flags_t blk_flags) 155 { 156 struct request *req; 157 158 req = blk_mq_alloc_request(q, nvme_req_op(cmd) | rq_flags, blk_flags); 159 if (IS_ERR(req)) 160 return req; 161 nvme_init_request(req, cmd); 162 nvme_req(req)->flags |= NVME_REQ_USERCMD; 163 return req; 164 } 165 166 static int nvme_map_user_request(struct request *req, u64 ubuffer, 167 unsigned bufflen, void __user *meta_buffer, unsigned meta_len, 168 u32 meta_seed, void **metap, struct io_uring_cmd *ioucmd, 169 unsigned int flags) 170 { 171 struct request_queue *q = req->q; 172 struct nvme_ns *ns = q->queuedata; 173 struct block_device *bdev = ns ? ns->disk->part0 : NULL; 174 struct bio *bio = NULL; 175 void *meta = NULL; 176 int ret; 177 178 if (ioucmd && (ioucmd->flags & IORING_URING_CMD_FIXED)) { 179 struct iov_iter iter; 180 181 /* fixedbufs is only for non-vectored io */ 182 if (WARN_ON_ONCE(flags & NVME_IOCTL_VEC)) 183 return -EINVAL; 184 ret = io_uring_cmd_import_fixed(ubuffer, bufflen, 185 rq_data_dir(req), &iter, ioucmd); 186 if (ret < 0) 187 goto out; 188 ret = blk_rq_map_user_iov(q, req, NULL, &iter, GFP_KERNEL); 189 } else { 190 ret = blk_rq_map_user_io(req, NULL, nvme_to_user_ptr(ubuffer), 191 bufflen, GFP_KERNEL, flags & NVME_IOCTL_VEC, 0, 192 0, rq_data_dir(req)); 193 } 194 195 if (ret) 196 goto out; 197 bio = req->bio; 198 if (bdev) 199 bio_set_dev(bio, bdev); 200 201 if (bdev && meta_buffer && meta_len) { 202 meta = nvme_add_user_metadata(req, meta_buffer, meta_len, 203 meta_seed); 204 if (IS_ERR(meta)) { 205 ret = PTR_ERR(meta); 206 goto out_unmap; 207 } 208 *metap = meta; 209 } 210 211 return ret; 212 213 out_unmap: 214 if (bio) 215 blk_rq_unmap_user(bio); 216 out: 217 blk_mq_free_request(req); 218 return ret; 219 } 220 221 static int nvme_submit_user_cmd(struct request_queue *q, 222 struct nvme_command *cmd, u64 ubuffer, unsigned bufflen, 223 void __user *meta_buffer, unsigned meta_len, u32 meta_seed, 224 u64 *result, unsigned timeout, unsigned int flags) 225 { 226 struct nvme_ns *ns = q->queuedata; 227 struct nvme_ctrl *ctrl; 228 struct request *req; 229 void *meta = NULL; 230 struct bio *bio; 231 u32 effects; 232 int ret; 233 234 req = nvme_alloc_user_request(q, cmd, 0, 0); 235 if (IS_ERR(req)) 236 return PTR_ERR(req); 237 238 req->timeout = timeout; 239 if (ubuffer && bufflen) { 240 ret = nvme_map_user_request(req, ubuffer, bufflen, meta_buffer, 241 meta_len, meta_seed, &meta, NULL, flags); 242 if (ret) 243 return ret; 244 } 245 246 bio = req->bio; 247 ctrl = nvme_req(req)->ctrl; 248 249 effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode); 250 ret = nvme_execute_rq(req, false); 251 if (result) 252 *result = le64_to_cpu(nvme_req(req)->result.u64); 253 if (meta) 254 ret = nvme_finish_user_metadata(req, meta_buffer, meta, 255 meta_len, ret); 256 if (bio) 257 blk_rq_unmap_user(bio); 258 blk_mq_free_request(req); 259 260 if (effects) 261 nvme_passthru_end(ctrl, ns, effects, cmd, ret); 262 263 return ret; 264 } 265 266 static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) 267 { 268 struct nvme_user_io io; 269 struct nvme_command c; 270 unsigned length, meta_len; 271 void __user *metadata; 272 273 if (copy_from_user(&io, uio, sizeof(io))) 274 return -EFAULT; 275 if (io.flags) 276 return -EINVAL; 277 278 switch (io.opcode) { 279 case nvme_cmd_write: 280 case nvme_cmd_read: 281 case nvme_cmd_compare: 282 break; 283 default: 284 return -EINVAL; 285 } 286 287 length = (io.nblocks + 1) << ns->lba_shift; 288 289 if ((io.control & NVME_RW_PRINFO_PRACT) && 290 ns->ms == sizeof(struct t10_pi_tuple)) { 291 /* 292 * Protection information is stripped/inserted by the 293 * controller. 294 */ 295 if (nvme_to_user_ptr(io.metadata)) 296 return -EINVAL; 297 meta_len = 0; 298 metadata = NULL; 299 } else { 300 meta_len = (io.nblocks + 1) * ns->ms; 301 metadata = nvme_to_user_ptr(io.metadata); 302 } 303 304 if (ns->features & NVME_NS_EXT_LBAS) { 305 length += meta_len; 306 meta_len = 0; 307 } else if (meta_len) { 308 if ((io.metadata & 3) || !io.metadata) 309 return -EINVAL; 310 } 311 312 memset(&c, 0, sizeof(c)); 313 c.rw.opcode = io.opcode; 314 c.rw.flags = io.flags; 315 c.rw.nsid = cpu_to_le32(ns->head->ns_id); 316 c.rw.slba = cpu_to_le64(io.slba); 317 c.rw.length = cpu_to_le16(io.nblocks); 318 c.rw.control = cpu_to_le16(io.control); 319 c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); 320 c.rw.reftag = cpu_to_le32(io.reftag); 321 c.rw.apptag = cpu_to_le16(io.apptag); 322 c.rw.appmask = cpu_to_le16(io.appmask); 323 324 return nvme_submit_user_cmd(ns->queue, &c, io.addr, length, metadata, 325 meta_len, lower_32_bits(io.slba), NULL, 0, 0); 326 } 327 328 static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl, 329 struct nvme_ns *ns, __u32 nsid) 330 { 331 if (ns && nsid != ns->head->ns_id) { 332 dev_err(ctrl->device, 333 "%s: nsid (%u) in cmd does not match nsid (%u)" 334 "of namespace\n", 335 current->comm, nsid, ns->head->ns_id); 336 return false; 337 } 338 339 return true; 340 } 341 342 static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, 343 struct nvme_passthru_cmd __user *ucmd, unsigned int flags, 344 bool open_for_write) 345 { 346 struct nvme_passthru_cmd cmd; 347 struct nvme_command c; 348 unsigned timeout = 0; 349 u64 result; 350 int status; 351 352 if (copy_from_user(&cmd, ucmd, sizeof(cmd))) 353 return -EFAULT; 354 if (cmd.flags) 355 return -EINVAL; 356 if (!nvme_validate_passthru_nsid(ctrl, ns, cmd.nsid)) 357 return -EINVAL; 358 359 memset(&c, 0, sizeof(c)); 360 c.common.opcode = cmd.opcode; 361 c.common.flags = cmd.flags; 362 c.common.nsid = cpu_to_le32(cmd.nsid); 363 c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); 364 c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); 365 c.common.cdw10 = cpu_to_le32(cmd.cdw10); 366 c.common.cdw11 = cpu_to_le32(cmd.cdw11); 367 c.common.cdw12 = cpu_to_le32(cmd.cdw12); 368 c.common.cdw13 = cpu_to_le32(cmd.cdw13); 369 c.common.cdw14 = cpu_to_le32(cmd.cdw14); 370 c.common.cdw15 = cpu_to_le32(cmd.cdw15); 371 372 if (!nvme_cmd_allowed(ns, &c, 0, open_for_write)) 373 return -EACCES; 374 375 if (cmd.timeout_ms) 376 timeout = msecs_to_jiffies(cmd.timeout_ms); 377 378 status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, 379 cmd.addr, cmd.data_len, nvme_to_user_ptr(cmd.metadata), 380 cmd.metadata_len, 0, &result, timeout, 0); 381 382 if (status >= 0) { 383 if (put_user(result, &ucmd->result)) 384 return -EFAULT; 385 } 386 387 return status; 388 } 389 390 static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, 391 struct nvme_passthru_cmd64 __user *ucmd, unsigned int flags, 392 bool open_for_write) 393 { 394 struct nvme_passthru_cmd64 cmd; 395 struct nvme_command c; 396 unsigned timeout = 0; 397 int status; 398 399 if (copy_from_user(&cmd, ucmd, sizeof(cmd))) 400 return -EFAULT; 401 if (cmd.flags) 402 return -EINVAL; 403 if (!nvme_validate_passthru_nsid(ctrl, ns, cmd.nsid)) 404 return -EINVAL; 405 406 memset(&c, 0, sizeof(c)); 407 c.common.opcode = cmd.opcode; 408 c.common.flags = cmd.flags; 409 c.common.nsid = cpu_to_le32(cmd.nsid); 410 c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); 411 c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); 412 c.common.cdw10 = cpu_to_le32(cmd.cdw10); 413 c.common.cdw11 = cpu_to_le32(cmd.cdw11); 414 c.common.cdw12 = cpu_to_le32(cmd.cdw12); 415 c.common.cdw13 = cpu_to_le32(cmd.cdw13); 416 c.common.cdw14 = cpu_to_le32(cmd.cdw14); 417 c.common.cdw15 = cpu_to_le32(cmd.cdw15); 418 419 if (!nvme_cmd_allowed(ns, &c, flags, open_for_write)) 420 return -EACCES; 421 422 if (cmd.timeout_ms) 423 timeout = msecs_to_jiffies(cmd.timeout_ms); 424 425 status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, 426 cmd.addr, cmd.data_len, nvme_to_user_ptr(cmd.metadata), 427 cmd.metadata_len, 0, &cmd.result, timeout, flags); 428 429 if (status >= 0) { 430 if (put_user(cmd.result, &ucmd->result)) 431 return -EFAULT; 432 } 433 434 return status; 435 } 436 437 struct nvme_uring_data { 438 __u64 metadata; 439 __u64 addr; 440 __u32 data_len; 441 __u32 metadata_len; 442 __u32 timeout_ms; 443 }; 444 445 /* 446 * This overlays struct io_uring_cmd pdu. 447 * Expect build errors if this grows larger than that. 448 */ 449 struct nvme_uring_cmd_pdu { 450 union { 451 struct bio *bio; 452 struct request *req; 453 }; 454 u32 meta_len; 455 u32 nvme_status; 456 union { 457 struct { 458 void *meta; /* kernel-resident buffer */ 459 void __user *meta_buffer; 460 }; 461 u64 result; 462 } u; 463 }; 464 465 static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu( 466 struct io_uring_cmd *ioucmd) 467 { 468 return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu; 469 } 470 471 static void nvme_uring_task_meta_cb(struct io_uring_cmd *ioucmd, 472 unsigned issue_flags) 473 { 474 struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); 475 struct request *req = pdu->req; 476 int status; 477 u64 result; 478 479 if (nvme_req(req)->flags & NVME_REQ_CANCELLED) 480 status = -EINTR; 481 else 482 status = nvme_req(req)->status; 483 484 result = le64_to_cpu(nvme_req(req)->result.u64); 485 486 if (pdu->meta_len) 487 status = nvme_finish_user_metadata(req, pdu->u.meta_buffer, 488 pdu->u.meta, pdu->meta_len, status); 489 if (req->bio) 490 blk_rq_unmap_user(req->bio); 491 blk_mq_free_request(req); 492 493 io_uring_cmd_done(ioucmd, status, result, issue_flags); 494 } 495 496 static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd, 497 unsigned issue_flags) 498 { 499 struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); 500 501 if (pdu->bio) 502 blk_rq_unmap_user(pdu->bio); 503 504 io_uring_cmd_done(ioucmd, pdu->nvme_status, pdu->u.result, issue_flags); 505 } 506 507 static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, 508 blk_status_t err) 509 { 510 struct io_uring_cmd *ioucmd = req->end_io_data; 511 struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); 512 513 req->bio = pdu->bio; 514 if (nvme_req(req)->flags & NVME_REQ_CANCELLED) { 515 pdu->nvme_status = -EINTR; 516 } else { 517 pdu->nvme_status = nvme_req(req)->status; 518 if (!pdu->nvme_status) 519 pdu->nvme_status = blk_status_to_errno(err); 520 } 521 pdu->u.result = le64_to_cpu(nvme_req(req)->result.u64); 522 523 /* 524 * For iopoll, complete it directly. 525 * Otherwise, move the completion to task work. 526 */ 527 if (blk_rq_is_poll(req)) { 528 WRITE_ONCE(ioucmd->cookie, NULL); 529 nvme_uring_task_cb(ioucmd, IO_URING_F_UNLOCKED); 530 } else { 531 io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb); 532 } 533 534 return RQ_END_IO_FREE; 535 } 536 537 static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req, 538 blk_status_t err) 539 { 540 struct io_uring_cmd *ioucmd = req->end_io_data; 541 struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); 542 543 req->bio = pdu->bio; 544 pdu->req = req; 545 546 /* 547 * For iopoll, complete it directly. 548 * Otherwise, move the completion to task work. 549 */ 550 if (blk_rq_is_poll(req)) { 551 WRITE_ONCE(ioucmd->cookie, NULL); 552 nvme_uring_task_meta_cb(ioucmd, IO_URING_F_UNLOCKED); 553 } else { 554 io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_meta_cb); 555 } 556 557 return RQ_END_IO_NONE; 558 } 559 560 static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, 561 struct io_uring_cmd *ioucmd, unsigned int issue_flags, bool vec) 562 { 563 struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); 564 const struct nvme_uring_cmd *cmd = io_uring_sqe_cmd(ioucmd->sqe); 565 struct request_queue *q = ns ? ns->queue : ctrl->admin_q; 566 struct nvme_uring_data d; 567 struct nvme_command c; 568 struct request *req; 569 blk_opf_t rq_flags = REQ_ALLOC_CACHE; 570 blk_mq_req_flags_t blk_flags = 0; 571 void *meta = NULL; 572 int ret; 573 574 c.common.opcode = READ_ONCE(cmd->opcode); 575 c.common.flags = READ_ONCE(cmd->flags); 576 if (c.common.flags) 577 return -EINVAL; 578 579 c.common.command_id = 0; 580 c.common.nsid = cpu_to_le32(cmd->nsid); 581 if (!nvme_validate_passthru_nsid(ctrl, ns, le32_to_cpu(c.common.nsid))) 582 return -EINVAL; 583 584 c.common.cdw2[0] = cpu_to_le32(READ_ONCE(cmd->cdw2)); 585 c.common.cdw2[1] = cpu_to_le32(READ_ONCE(cmd->cdw3)); 586 c.common.metadata = 0; 587 c.common.dptr.prp1 = c.common.dptr.prp2 = 0; 588 c.common.cdw10 = cpu_to_le32(READ_ONCE(cmd->cdw10)); 589 c.common.cdw11 = cpu_to_le32(READ_ONCE(cmd->cdw11)); 590 c.common.cdw12 = cpu_to_le32(READ_ONCE(cmd->cdw12)); 591 c.common.cdw13 = cpu_to_le32(READ_ONCE(cmd->cdw13)); 592 c.common.cdw14 = cpu_to_le32(READ_ONCE(cmd->cdw14)); 593 c.common.cdw15 = cpu_to_le32(READ_ONCE(cmd->cdw15)); 594 595 if (!nvme_cmd_allowed(ns, &c, 0, ioucmd->file->f_mode & FMODE_WRITE)) 596 return -EACCES; 597 598 d.metadata = READ_ONCE(cmd->metadata); 599 d.addr = READ_ONCE(cmd->addr); 600 d.data_len = READ_ONCE(cmd->data_len); 601 d.metadata_len = READ_ONCE(cmd->metadata_len); 602 d.timeout_ms = READ_ONCE(cmd->timeout_ms); 603 604 if (issue_flags & IO_URING_F_NONBLOCK) { 605 rq_flags |= REQ_NOWAIT; 606 blk_flags = BLK_MQ_REQ_NOWAIT; 607 } 608 if (issue_flags & IO_URING_F_IOPOLL) 609 rq_flags |= REQ_POLLED; 610 611 req = nvme_alloc_user_request(q, &c, rq_flags, blk_flags); 612 if (IS_ERR(req)) 613 return PTR_ERR(req); 614 req->timeout = d.timeout_ms ? msecs_to_jiffies(d.timeout_ms) : 0; 615 616 if (d.addr && d.data_len) { 617 ret = nvme_map_user_request(req, d.addr, 618 d.data_len, nvme_to_user_ptr(d.metadata), 619 d.metadata_len, 0, &meta, ioucmd, vec); 620 if (ret) 621 return ret; 622 } 623 624 if (blk_rq_is_poll(req)) { 625 ioucmd->flags |= IORING_URING_CMD_POLLED; 626 WRITE_ONCE(ioucmd->cookie, req); 627 } 628 629 /* to free bio on completion, as req->bio will be null at that time */ 630 pdu->bio = req->bio; 631 pdu->meta_len = d.metadata_len; 632 req->end_io_data = ioucmd; 633 if (pdu->meta_len) { 634 pdu->u.meta = meta; 635 pdu->u.meta_buffer = nvme_to_user_ptr(d.metadata); 636 req->end_io = nvme_uring_cmd_end_io_meta; 637 } else { 638 req->end_io = nvme_uring_cmd_end_io; 639 } 640 blk_execute_rq_nowait(req, false); 641 return -EIOCBQUEUED; 642 } 643 644 static bool is_ctrl_ioctl(unsigned int cmd) 645 { 646 if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD) 647 return true; 648 if (is_sed_ioctl(cmd)) 649 return true; 650 return false; 651 } 652 653 static int nvme_ctrl_ioctl(struct nvme_ctrl *ctrl, unsigned int cmd, 654 void __user *argp, bool open_for_write) 655 { 656 switch (cmd) { 657 case NVME_IOCTL_ADMIN_CMD: 658 return nvme_user_cmd(ctrl, NULL, argp, 0, open_for_write); 659 case NVME_IOCTL_ADMIN64_CMD: 660 return nvme_user_cmd64(ctrl, NULL, argp, 0, open_for_write); 661 default: 662 return sed_ioctl(ctrl->opal_dev, cmd, argp); 663 } 664 } 665 666 #ifdef COMPAT_FOR_U64_ALIGNMENT 667 struct nvme_user_io32 { 668 __u8 opcode; 669 __u8 flags; 670 __u16 control; 671 __u16 nblocks; 672 __u16 rsvd; 673 __u64 metadata; 674 __u64 addr; 675 __u64 slba; 676 __u32 dsmgmt; 677 __u32 reftag; 678 __u16 apptag; 679 __u16 appmask; 680 } __attribute__((__packed__)); 681 #define NVME_IOCTL_SUBMIT_IO32 _IOW('N', 0x42, struct nvme_user_io32) 682 #endif /* COMPAT_FOR_U64_ALIGNMENT */ 683 684 static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd, 685 void __user *argp, unsigned int flags, bool open_for_write) 686 { 687 switch (cmd) { 688 case NVME_IOCTL_ID: 689 force_successful_syscall_return(); 690 return ns->head->ns_id; 691 case NVME_IOCTL_IO_CMD: 692 return nvme_user_cmd(ns->ctrl, ns, argp, flags, open_for_write); 693 /* 694 * struct nvme_user_io can have different padding on some 32-bit ABIs. 695 * Just accept the compat version as all fields that are used are the 696 * same size and at the same offset. 697 */ 698 #ifdef COMPAT_FOR_U64_ALIGNMENT 699 case NVME_IOCTL_SUBMIT_IO32: 700 #endif 701 case NVME_IOCTL_SUBMIT_IO: 702 return nvme_submit_io(ns, argp); 703 case NVME_IOCTL_IO64_CMD_VEC: 704 flags |= NVME_IOCTL_VEC; 705 fallthrough; 706 case NVME_IOCTL_IO64_CMD: 707 return nvme_user_cmd64(ns->ctrl, ns, argp, flags, 708 open_for_write); 709 default: 710 return -ENOTTY; 711 } 712 } 713 714 int nvme_ioctl(struct block_device *bdev, blk_mode_t mode, 715 unsigned int cmd, unsigned long arg) 716 { 717 struct nvme_ns *ns = bdev->bd_disk->private_data; 718 bool open_for_write = mode & BLK_OPEN_WRITE; 719 void __user *argp = (void __user *)arg; 720 unsigned int flags = 0; 721 722 if (bdev_is_partition(bdev)) 723 flags |= NVME_IOCTL_PARTITION; 724 725 if (is_ctrl_ioctl(cmd)) 726 return nvme_ctrl_ioctl(ns->ctrl, cmd, argp, open_for_write); 727 return nvme_ns_ioctl(ns, cmd, argp, flags, open_for_write); 728 } 729 730 long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 731 { 732 struct nvme_ns *ns = 733 container_of(file_inode(file)->i_cdev, struct nvme_ns, cdev); 734 bool open_for_write = file->f_mode & FMODE_WRITE; 735 void __user *argp = (void __user *)arg; 736 737 if (is_ctrl_ioctl(cmd)) 738 return nvme_ctrl_ioctl(ns->ctrl, cmd, argp, open_for_write); 739 return nvme_ns_ioctl(ns, cmd, argp, 0, open_for_write); 740 } 741 742 static int nvme_uring_cmd_checks(unsigned int issue_flags) 743 { 744 745 /* NVMe passthrough requires big SQE/CQE support */ 746 if ((issue_flags & (IO_URING_F_SQE128|IO_URING_F_CQE32)) != 747 (IO_URING_F_SQE128|IO_URING_F_CQE32)) 748 return -EOPNOTSUPP; 749 return 0; 750 } 751 752 static int nvme_ns_uring_cmd(struct nvme_ns *ns, struct io_uring_cmd *ioucmd, 753 unsigned int issue_flags) 754 { 755 struct nvme_ctrl *ctrl = ns->ctrl; 756 int ret; 757 758 BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu)); 759 760 ret = nvme_uring_cmd_checks(issue_flags); 761 if (ret) 762 return ret; 763 764 switch (ioucmd->cmd_op) { 765 case NVME_URING_CMD_IO: 766 ret = nvme_uring_cmd_io(ctrl, ns, ioucmd, issue_flags, false); 767 break; 768 case NVME_URING_CMD_IO_VEC: 769 ret = nvme_uring_cmd_io(ctrl, ns, ioucmd, issue_flags, true); 770 break; 771 default: 772 ret = -ENOTTY; 773 } 774 775 return ret; 776 } 777 778 int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags) 779 { 780 struct nvme_ns *ns = container_of(file_inode(ioucmd->file)->i_cdev, 781 struct nvme_ns, cdev); 782 783 return nvme_ns_uring_cmd(ns, ioucmd, issue_flags); 784 } 785 786 int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd, 787 struct io_comp_batch *iob, 788 unsigned int poll_flags) 789 { 790 struct request *req; 791 int ret = 0; 792 793 if (!(ioucmd->flags & IORING_URING_CMD_POLLED)) 794 return 0; 795 796 req = READ_ONCE(ioucmd->cookie); 797 if (req && blk_rq_is_poll(req)) 798 ret = blk_rq_poll(req, iob, poll_flags); 799 return ret; 800 } 801 #ifdef CONFIG_NVME_MULTIPATH 802 static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, 803 void __user *argp, struct nvme_ns_head *head, int srcu_idx, 804 bool open_for_write) 805 __releases(&head->srcu) 806 { 807 struct nvme_ctrl *ctrl = ns->ctrl; 808 int ret; 809 810 nvme_get_ctrl(ns->ctrl); 811 srcu_read_unlock(&head->srcu, srcu_idx); 812 ret = nvme_ctrl_ioctl(ns->ctrl, cmd, argp, open_for_write); 813 814 nvme_put_ctrl(ctrl); 815 return ret; 816 } 817 818 int nvme_ns_head_ioctl(struct block_device *bdev, blk_mode_t mode, 819 unsigned int cmd, unsigned long arg) 820 { 821 struct nvme_ns_head *head = bdev->bd_disk->private_data; 822 bool open_for_write = mode & BLK_OPEN_WRITE; 823 void __user *argp = (void __user *)arg; 824 struct nvme_ns *ns; 825 int srcu_idx, ret = -EWOULDBLOCK; 826 unsigned int flags = 0; 827 828 if (bdev_is_partition(bdev)) 829 flags |= NVME_IOCTL_PARTITION; 830 831 srcu_idx = srcu_read_lock(&head->srcu); 832 ns = nvme_find_path(head); 833 if (!ns) 834 goto out_unlock; 835 836 /* 837 * Handle ioctls that apply to the controller instead of the namespace 838 * seperately and drop the ns SRCU reference early. This avoids a 839 * deadlock when deleting namespaces using the passthrough interface. 840 */ 841 if (is_ctrl_ioctl(cmd)) 842 return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx, 843 open_for_write); 844 845 ret = nvme_ns_ioctl(ns, cmd, argp, flags, open_for_write); 846 out_unlock: 847 srcu_read_unlock(&head->srcu, srcu_idx); 848 return ret; 849 } 850 851 long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd, 852 unsigned long arg) 853 { 854 bool open_for_write = file->f_mode & FMODE_WRITE; 855 struct cdev *cdev = file_inode(file)->i_cdev; 856 struct nvme_ns_head *head = 857 container_of(cdev, struct nvme_ns_head, cdev); 858 void __user *argp = (void __user *)arg; 859 struct nvme_ns *ns; 860 int srcu_idx, ret = -EWOULDBLOCK; 861 862 srcu_idx = srcu_read_lock(&head->srcu); 863 ns = nvme_find_path(head); 864 if (!ns) 865 goto out_unlock; 866 867 if (is_ctrl_ioctl(cmd)) 868 return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx, 869 open_for_write); 870 871 ret = nvme_ns_ioctl(ns, cmd, argp, 0, open_for_write); 872 out_unlock: 873 srcu_read_unlock(&head->srcu, srcu_idx); 874 return ret; 875 } 876 877 int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd, 878 unsigned int issue_flags) 879 { 880 struct cdev *cdev = file_inode(ioucmd->file)->i_cdev; 881 struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev); 882 int srcu_idx = srcu_read_lock(&head->srcu); 883 struct nvme_ns *ns = nvme_find_path(head); 884 int ret = -EINVAL; 885 886 if (ns) 887 ret = nvme_ns_uring_cmd(ns, ioucmd, issue_flags); 888 srcu_read_unlock(&head->srcu, srcu_idx); 889 return ret; 890 } 891 #endif /* CONFIG_NVME_MULTIPATH */ 892 893 int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags) 894 { 895 struct nvme_ctrl *ctrl = ioucmd->file->private_data; 896 int ret; 897 898 /* IOPOLL not supported yet */ 899 if (issue_flags & IO_URING_F_IOPOLL) 900 return -EOPNOTSUPP; 901 902 ret = nvme_uring_cmd_checks(issue_flags); 903 if (ret) 904 return ret; 905 906 switch (ioucmd->cmd_op) { 907 case NVME_URING_CMD_ADMIN: 908 ret = nvme_uring_cmd_io(ctrl, NULL, ioucmd, issue_flags, false); 909 break; 910 case NVME_URING_CMD_ADMIN_VEC: 911 ret = nvme_uring_cmd_io(ctrl, NULL, ioucmd, issue_flags, true); 912 break; 913 default: 914 ret = -ENOTTY; 915 } 916 917 return ret; 918 } 919 920 static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp, 921 bool open_for_write) 922 { 923 struct nvme_ns *ns; 924 int ret, srcu_idx; 925 926 srcu_idx = srcu_read_lock(&ctrl->srcu); 927 if (list_empty(&ctrl->namespaces)) { 928 ret = -ENOTTY; 929 goto out_unlock; 930 } 931 932 ns = list_first_or_null_rcu(&ctrl->namespaces, struct nvme_ns, list); 933 if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) { 934 dev_warn(ctrl->device, 935 "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n"); 936 ret = -EINVAL; 937 goto out_unlock; 938 } 939 940 dev_warn(ctrl->device, 941 "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n"); 942 if (!nvme_get_ns(ns)) { 943 ret = -ENXIO; 944 goto out_unlock; 945 } 946 srcu_read_unlock(&ctrl->srcu, srcu_idx); 947 948 ret = nvme_user_cmd(ctrl, ns, argp, 0, open_for_write); 949 nvme_put_ns(ns); 950 return ret; 951 952 out_unlock: 953 srcu_read_unlock(&ctrl->srcu, srcu_idx); 954 return ret; 955 } 956 957 long nvme_dev_ioctl(struct file *file, unsigned int cmd, 958 unsigned long arg) 959 { 960 bool open_for_write = file->f_mode & FMODE_WRITE; 961 struct nvme_ctrl *ctrl = file->private_data; 962 void __user *argp = (void __user *)arg; 963 964 switch (cmd) { 965 case NVME_IOCTL_ADMIN_CMD: 966 return nvme_user_cmd(ctrl, NULL, argp, 0, open_for_write); 967 case NVME_IOCTL_ADMIN64_CMD: 968 return nvme_user_cmd64(ctrl, NULL, argp, 0, open_for_write); 969 case NVME_IOCTL_IO_CMD: 970 return nvme_dev_user_cmd(ctrl, argp, open_for_write); 971 case NVME_IOCTL_RESET: 972 if (!capable(CAP_SYS_ADMIN)) 973 return -EACCES; 974 dev_warn(ctrl->device, "resetting controller\n"); 975 return nvme_reset_ctrl_sync(ctrl); 976 case NVME_IOCTL_SUBSYS_RESET: 977 if (!capable(CAP_SYS_ADMIN)) 978 return -EACCES; 979 return nvme_reset_subsystem(ctrl); 980 case NVME_IOCTL_RESCAN: 981 if (!capable(CAP_SYS_ADMIN)) 982 return -EACCES; 983 nvme_queue_scan(ctrl); 984 return 0; 985 default: 986 return -ENOTTY; 987 } 988 } 989