1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2011-2014, Intel Corporation. 4 * Copyright (c) 2017-2021 Christoph Hellwig. 5 */ 6 #include <linux/blk-integrity.h> 7 #include <linux/ptrace.h> /* for force_successful_syscall_return */ 8 #include <linux/nvme_ioctl.h> 9 #include <linux/io_uring.h> 10 #include "nvme.h" 11 12 enum { 13 NVME_IOCTL_VEC = (1 << 0), 14 NVME_IOCTL_PARTITION = (1 << 1), 15 }; 16 17 static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c, 18 unsigned int flags, bool open_for_write) 19 { 20 u32 effects; 21 22 /* 23 * Do not allow unprivileged passthrough on partitions, as that allows an 24 * escape from the containment of the partition. 25 */ 26 if (flags & NVME_IOCTL_PARTITION) 27 goto admin; 28 29 /* 30 * Do not allow unprivileged processes to send vendor specific or fabrics 31 * commands as we can't be sure about their effects. 32 */ 33 if (c->common.opcode >= nvme_cmd_vendor_start || 34 c->common.opcode == nvme_fabrics_command) 35 goto admin; 36 37 /* 38 * Do not allow unprivileged passthrough of admin commands except 39 * for a subset of identify commands that contain information required 40 * to form proper I/O commands in userspace and do not expose any 41 * potentially sensitive information. 42 */ 43 if (!ns) { 44 if (c->common.opcode == nvme_admin_identify) { 45 switch (c->identify.cns) { 46 case NVME_ID_CNS_NS: 47 case NVME_ID_CNS_CS_NS: 48 case NVME_ID_CNS_NS_CS_INDEP: 49 case NVME_ID_CNS_CS_CTRL: 50 case NVME_ID_CNS_CTRL: 51 return true; 52 } 53 } 54 goto admin; 55 } 56 57 /* 58 * Check if the controller provides a Commands Supported and Effects log 59 * and marks this command as supported. If not reject unprivileged 60 * passthrough. 61 */ 62 effects = nvme_command_effects(ns->ctrl, ns, c->common.opcode); 63 if (!(effects & NVME_CMD_EFFECTS_CSUPP)) 64 goto admin; 65 66 /* 67 * Don't allow passthrough for command that have intrusive (or unknown) 68 * effects. 69 */ 70 if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC | 71 NVME_CMD_EFFECTS_UUID_SEL | 72 NVME_CMD_EFFECTS_SCOPE_MASK)) 73 goto admin; 74 75 /* 76 * Only allow I/O commands that transfer data to the controller or that 77 * change the logical block contents if the file descriptor is open for 78 * writing. 79 */ 80 if ((nvme_is_write(c) || (effects & NVME_CMD_EFFECTS_LBCC)) && 81 !open_for_write) 82 goto admin; 83 84 return true; 85 admin: 86 return capable(CAP_SYS_ADMIN); 87 } 88 89 /* 90 * Convert integer values from ioctl structures to user pointers, silently 91 * ignoring the upper bits in the compat case to match behaviour of 32-bit 92 * kernels. 93 */ 94 static void __user *nvme_to_user_ptr(uintptr_t ptrval) 95 { 96 if (in_compat_syscall()) 97 ptrval = (compat_uptr_t)ptrval; 98 return (void __user *)ptrval; 99 } 100 101 static void *nvme_add_user_metadata(struct request *req, void __user *ubuf, 102 unsigned len, u32 seed) 103 { 104 struct bio_integrity_payload *bip; 105 int ret = -ENOMEM; 106 void *buf; 107 struct bio *bio = req->bio; 108 109 buf = kmalloc(len, GFP_KERNEL); 110 if (!buf) 111 goto out; 112 113 if (req_op(req) == REQ_OP_DRV_OUT) { 114 ret = -EFAULT; 115 if (copy_from_user(buf, ubuf, len)) 116 goto out_free_meta; 117 } else { 118 memset(buf, 0, len); 119 } 120 121 bip = bio_integrity_alloc(bio, GFP_KERNEL, 1); 122 if (IS_ERR(bip)) { 123 ret = PTR_ERR(bip); 124 goto out_free_meta; 125 } 126 127 bip->bip_iter.bi_sector = seed; 128 ret = bio_integrity_add_page(bio, virt_to_page(buf), len, 129 offset_in_page(buf)); 130 if (ret != len) { 131 ret = -ENOMEM; 132 goto out_free_meta; 133 } 134 135 req->cmd_flags |= REQ_INTEGRITY; 136 return buf; 137 out_free_meta: 138 kfree(buf); 139 out: 140 return ERR_PTR(ret); 141 } 142 143 static int nvme_finish_user_metadata(struct request *req, void __user *ubuf, 144 void *meta, unsigned len, int ret) 145 { 146 if (!ret && req_op(req) == REQ_OP_DRV_IN && 147 copy_to_user(ubuf, meta, len)) 148 ret = -EFAULT; 149 kfree(meta); 150 return ret; 151 } 152 153 static struct request *nvme_alloc_user_request(struct request_queue *q, 154 struct nvme_command *cmd, blk_opf_t rq_flags, 155 blk_mq_req_flags_t blk_flags) 156 { 157 struct request *req; 158 159 req = blk_mq_alloc_request(q, nvme_req_op(cmd) | rq_flags, blk_flags); 160 if (IS_ERR(req)) 161 return req; 162 nvme_init_request(req, cmd); 163 nvme_req(req)->flags |= NVME_REQ_USERCMD; 164 return req; 165 } 166 167 static int nvme_map_user_request(struct request *req, u64 ubuffer, 168 unsigned bufflen, void __user *meta_buffer, unsigned meta_len, 169 u32 meta_seed, void **metap, struct io_uring_cmd *ioucmd, 170 unsigned int flags) 171 { 172 struct request_queue *q = req->q; 173 struct nvme_ns *ns = q->queuedata; 174 struct block_device *bdev = ns ? ns->disk->part0 : NULL; 175 bool supports_metadata = bdev && blk_get_integrity(bdev->bd_disk); 176 bool has_metadata = meta_buffer && meta_len; 177 struct bio *bio = NULL; 178 void *meta = NULL; 179 int ret; 180 181 if (has_metadata && !supports_metadata) 182 return -EINVAL; 183 184 if (ioucmd && (ioucmd->flags & IORING_URING_CMD_FIXED)) { 185 struct iov_iter iter; 186 187 /* fixedbufs is only for non-vectored io */ 188 if (WARN_ON_ONCE(flags & NVME_IOCTL_VEC)) 189 return -EINVAL; 190 ret = io_uring_cmd_import_fixed(ubuffer, bufflen, 191 rq_data_dir(req), &iter, ioucmd); 192 if (ret < 0) 193 goto out; 194 ret = blk_rq_map_user_iov(q, req, NULL, &iter, GFP_KERNEL); 195 } else { 196 ret = blk_rq_map_user_io(req, NULL, nvme_to_user_ptr(ubuffer), 197 bufflen, GFP_KERNEL, flags & NVME_IOCTL_VEC, 0, 198 0, rq_data_dir(req)); 199 } 200 201 if (ret) 202 goto out; 203 bio = req->bio; 204 if (bdev) 205 bio_set_dev(bio, bdev); 206 207 if (has_metadata) { 208 meta = nvme_add_user_metadata(req, meta_buffer, meta_len, 209 meta_seed); 210 if (IS_ERR(meta)) { 211 ret = PTR_ERR(meta); 212 goto out_unmap; 213 } 214 *metap = meta; 215 } 216 217 return ret; 218 219 out_unmap: 220 if (bio) 221 blk_rq_unmap_user(bio); 222 out: 223 blk_mq_free_request(req); 224 return ret; 225 } 226 227 static int nvme_submit_user_cmd(struct request_queue *q, 228 struct nvme_command *cmd, u64 ubuffer, unsigned bufflen, 229 void __user *meta_buffer, unsigned meta_len, u32 meta_seed, 230 u64 *result, unsigned timeout, unsigned int flags) 231 { 232 struct nvme_ns *ns = q->queuedata; 233 struct nvme_ctrl *ctrl; 234 struct request *req; 235 void *meta = NULL; 236 struct bio *bio; 237 u32 effects; 238 int ret; 239 240 req = nvme_alloc_user_request(q, cmd, 0, 0); 241 if (IS_ERR(req)) 242 return PTR_ERR(req); 243 244 req->timeout = timeout; 245 if (ubuffer && bufflen) { 246 ret = nvme_map_user_request(req, ubuffer, bufflen, meta_buffer, 247 meta_len, meta_seed, &meta, NULL, flags); 248 if (ret) 249 return ret; 250 } 251 252 bio = req->bio; 253 ctrl = nvme_req(req)->ctrl; 254 255 effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode); 256 ret = nvme_execute_rq(req, false); 257 if (result) 258 *result = le64_to_cpu(nvme_req(req)->result.u64); 259 if (meta) 260 ret = nvme_finish_user_metadata(req, meta_buffer, meta, 261 meta_len, ret); 262 if (bio) 263 blk_rq_unmap_user(bio); 264 blk_mq_free_request(req); 265 266 if (effects) 267 nvme_passthru_end(ctrl, ns, effects, cmd, ret); 268 269 return ret; 270 } 271 272 static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) 273 { 274 struct nvme_user_io io; 275 struct nvme_command c; 276 unsigned length, meta_len; 277 void __user *metadata; 278 279 if (copy_from_user(&io, uio, sizeof(io))) 280 return -EFAULT; 281 if (io.flags) 282 return -EINVAL; 283 284 switch (io.opcode) { 285 case nvme_cmd_write: 286 case nvme_cmd_read: 287 case nvme_cmd_compare: 288 break; 289 default: 290 return -EINVAL; 291 } 292 293 length = (io.nblocks + 1) << ns->lba_shift; 294 295 if ((io.control & NVME_RW_PRINFO_PRACT) && 296 ns->ms == sizeof(struct t10_pi_tuple)) { 297 /* 298 * Protection information is stripped/inserted by the 299 * controller. 300 */ 301 if (nvme_to_user_ptr(io.metadata)) 302 return -EINVAL; 303 meta_len = 0; 304 metadata = NULL; 305 } else { 306 meta_len = (io.nblocks + 1) * ns->ms; 307 metadata = nvme_to_user_ptr(io.metadata); 308 } 309 310 if (ns->features & NVME_NS_EXT_LBAS) { 311 length += meta_len; 312 meta_len = 0; 313 } else if (meta_len) { 314 if ((io.metadata & 3) || !io.metadata) 315 return -EINVAL; 316 } 317 318 memset(&c, 0, sizeof(c)); 319 c.rw.opcode = io.opcode; 320 c.rw.flags = io.flags; 321 c.rw.nsid = cpu_to_le32(ns->head->ns_id); 322 c.rw.slba = cpu_to_le64(io.slba); 323 c.rw.length = cpu_to_le16(io.nblocks); 324 c.rw.control = cpu_to_le16(io.control); 325 c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); 326 c.rw.reftag = cpu_to_le32(io.reftag); 327 c.rw.apptag = cpu_to_le16(io.apptag); 328 c.rw.appmask = cpu_to_le16(io.appmask); 329 330 return nvme_submit_user_cmd(ns->queue, &c, io.addr, length, metadata, 331 meta_len, lower_32_bits(io.slba), NULL, 0, 0); 332 } 333 334 static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl, 335 struct nvme_ns *ns, __u32 nsid) 336 { 337 if (ns && nsid != ns->head->ns_id) { 338 dev_err(ctrl->device, 339 "%s: nsid (%u) in cmd does not match nsid (%u)" 340 "of namespace\n", 341 current->comm, nsid, ns->head->ns_id); 342 return false; 343 } 344 345 return true; 346 } 347 348 static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, 349 struct nvme_passthru_cmd __user *ucmd, unsigned int flags, 350 bool open_for_write) 351 { 352 struct nvme_passthru_cmd cmd; 353 struct nvme_command c; 354 unsigned timeout = 0; 355 u64 result; 356 int status; 357 358 if (copy_from_user(&cmd, ucmd, sizeof(cmd))) 359 return -EFAULT; 360 if (cmd.flags) 361 return -EINVAL; 362 if (!nvme_validate_passthru_nsid(ctrl, ns, cmd.nsid)) 363 return -EINVAL; 364 365 memset(&c, 0, sizeof(c)); 366 c.common.opcode = cmd.opcode; 367 c.common.flags = cmd.flags; 368 c.common.nsid = cpu_to_le32(cmd.nsid); 369 c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); 370 c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); 371 c.common.cdw10 = cpu_to_le32(cmd.cdw10); 372 c.common.cdw11 = cpu_to_le32(cmd.cdw11); 373 c.common.cdw12 = cpu_to_le32(cmd.cdw12); 374 c.common.cdw13 = cpu_to_le32(cmd.cdw13); 375 c.common.cdw14 = cpu_to_le32(cmd.cdw14); 376 c.common.cdw15 = cpu_to_le32(cmd.cdw15); 377 378 if (!nvme_cmd_allowed(ns, &c, 0, open_for_write)) 379 return -EACCES; 380 381 if (cmd.timeout_ms) 382 timeout = msecs_to_jiffies(cmd.timeout_ms); 383 384 status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, 385 cmd.addr, cmd.data_len, nvme_to_user_ptr(cmd.metadata), 386 cmd.metadata_len, 0, &result, timeout, 0); 387 388 if (status >= 0) { 389 if (put_user(result, &ucmd->result)) 390 return -EFAULT; 391 } 392 393 return status; 394 } 395 396 static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, 397 struct nvme_passthru_cmd64 __user *ucmd, unsigned int flags, 398 bool open_for_write) 399 { 400 struct nvme_passthru_cmd64 cmd; 401 struct nvme_command c; 402 unsigned timeout = 0; 403 int status; 404 405 if (copy_from_user(&cmd, ucmd, sizeof(cmd))) 406 return -EFAULT; 407 if (cmd.flags) 408 return -EINVAL; 409 if (!nvme_validate_passthru_nsid(ctrl, ns, cmd.nsid)) 410 return -EINVAL; 411 412 memset(&c, 0, sizeof(c)); 413 c.common.opcode = cmd.opcode; 414 c.common.flags = cmd.flags; 415 c.common.nsid = cpu_to_le32(cmd.nsid); 416 c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); 417 c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); 418 c.common.cdw10 = cpu_to_le32(cmd.cdw10); 419 c.common.cdw11 = cpu_to_le32(cmd.cdw11); 420 c.common.cdw12 = cpu_to_le32(cmd.cdw12); 421 c.common.cdw13 = cpu_to_le32(cmd.cdw13); 422 c.common.cdw14 = cpu_to_le32(cmd.cdw14); 423 c.common.cdw15 = cpu_to_le32(cmd.cdw15); 424 425 if (!nvme_cmd_allowed(ns, &c, flags, open_for_write)) 426 return -EACCES; 427 428 if (cmd.timeout_ms) 429 timeout = msecs_to_jiffies(cmd.timeout_ms); 430 431 status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, 432 cmd.addr, cmd.data_len, nvme_to_user_ptr(cmd.metadata), 433 cmd.metadata_len, 0, &cmd.result, timeout, flags); 434 435 if (status >= 0) { 436 if (put_user(cmd.result, &ucmd->result)) 437 return -EFAULT; 438 } 439 440 return status; 441 } 442 443 struct nvme_uring_data { 444 __u64 metadata; 445 __u64 addr; 446 __u32 data_len; 447 __u32 metadata_len; 448 __u32 timeout_ms; 449 }; 450 451 /* 452 * This overlays struct io_uring_cmd pdu. 453 * Expect build errors if this grows larger than that. 454 */ 455 struct nvme_uring_cmd_pdu { 456 union { 457 struct bio *bio; 458 struct request *req; 459 }; 460 u32 meta_len; 461 u32 nvme_status; 462 union { 463 struct { 464 void *meta; /* kernel-resident buffer */ 465 void __user *meta_buffer; 466 }; 467 u64 result; 468 } u; 469 }; 470 471 static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu( 472 struct io_uring_cmd *ioucmd) 473 { 474 return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu; 475 } 476 477 static void nvme_uring_task_meta_cb(struct io_uring_cmd *ioucmd, 478 unsigned issue_flags) 479 { 480 struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); 481 struct request *req = pdu->req; 482 int status; 483 u64 result; 484 485 if (nvme_req(req)->flags & NVME_REQ_CANCELLED) 486 status = -EINTR; 487 else 488 status = nvme_req(req)->status; 489 490 result = le64_to_cpu(nvme_req(req)->result.u64); 491 492 if (pdu->meta_len) 493 status = nvme_finish_user_metadata(req, pdu->u.meta_buffer, 494 pdu->u.meta, pdu->meta_len, status); 495 if (req->bio) 496 blk_rq_unmap_user(req->bio); 497 blk_mq_free_request(req); 498 499 io_uring_cmd_done(ioucmd, status, result, issue_flags); 500 } 501 502 static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd, 503 unsigned issue_flags) 504 { 505 struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); 506 507 if (pdu->bio) 508 blk_rq_unmap_user(pdu->bio); 509 510 io_uring_cmd_done(ioucmd, pdu->nvme_status, pdu->u.result, issue_flags); 511 } 512 513 static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, 514 blk_status_t err) 515 { 516 struct io_uring_cmd *ioucmd = req->end_io_data; 517 struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); 518 519 req->bio = pdu->bio; 520 if (nvme_req(req)->flags & NVME_REQ_CANCELLED) { 521 pdu->nvme_status = -EINTR; 522 } else { 523 pdu->nvme_status = nvme_req(req)->status; 524 if (!pdu->nvme_status) 525 pdu->nvme_status = blk_status_to_errno(err); 526 } 527 pdu->u.result = le64_to_cpu(nvme_req(req)->result.u64); 528 529 /* 530 * For iopoll, complete it directly. 531 * Otherwise, move the completion to task work. 532 */ 533 if (blk_rq_is_poll(req)) { 534 WRITE_ONCE(ioucmd->cookie, NULL); 535 nvme_uring_task_cb(ioucmd, IO_URING_F_UNLOCKED); 536 } else { 537 io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb); 538 } 539 540 return RQ_END_IO_FREE; 541 } 542 543 static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req, 544 blk_status_t err) 545 { 546 struct io_uring_cmd *ioucmd = req->end_io_data; 547 struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); 548 549 req->bio = pdu->bio; 550 pdu->req = req; 551 552 /* 553 * For iopoll, complete it directly. 554 * Otherwise, move the completion to task work. 555 */ 556 if (blk_rq_is_poll(req)) { 557 WRITE_ONCE(ioucmd->cookie, NULL); 558 nvme_uring_task_meta_cb(ioucmd, IO_URING_F_UNLOCKED); 559 } else { 560 io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_meta_cb); 561 } 562 563 return RQ_END_IO_NONE; 564 } 565 566 static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, 567 struct io_uring_cmd *ioucmd, unsigned int issue_flags, bool vec) 568 { 569 struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); 570 const struct nvme_uring_cmd *cmd = io_uring_sqe_cmd(ioucmd->sqe); 571 struct request_queue *q = ns ? ns->queue : ctrl->admin_q; 572 struct nvme_uring_data d; 573 struct nvme_command c; 574 struct request *req; 575 blk_opf_t rq_flags = REQ_ALLOC_CACHE; 576 blk_mq_req_flags_t blk_flags = 0; 577 void *meta = NULL; 578 int ret; 579 580 c.common.opcode = READ_ONCE(cmd->opcode); 581 c.common.flags = READ_ONCE(cmd->flags); 582 if (c.common.flags) 583 return -EINVAL; 584 585 c.common.command_id = 0; 586 c.common.nsid = cpu_to_le32(cmd->nsid); 587 if (!nvme_validate_passthru_nsid(ctrl, ns, le32_to_cpu(c.common.nsid))) 588 return -EINVAL; 589 590 c.common.cdw2[0] = cpu_to_le32(READ_ONCE(cmd->cdw2)); 591 c.common.cdw2[1] = cpu_to_le32(READ_ONCE(cmd->cdw3)); 592 c.common.metadata = 0; 593 c.common.dptr.prp1 = c.common.dptr.prp2 = 0; 594 c.common.cdw10 = cpu_to_le32(READ_ONCE(cmd->cdw10)); 595 c.common.cdw11 = cpu_to_le32(READ_ONCE(cmd->cdw11)); 596 c.common.cdw12 = cpu_to_le32(READ_ONCE(cmd->cdw12)); 597 c.common.cdw13 = cpu_to_le32(READ_ONCE(cmd->cdw13)); 598 c.common.cdw14 = cpu_to_le32(READ_ONCE(cmd->cdw14)); 599 c.common.cdw15 = cpu_to_le32(READ_ONCE(cmd->cdw15)); 600 601 if (!nvme_cmd_allowed(ns, &c, 0, ioucmd->file->f_mode & FMODE_WRITE)) 602 return -EACCES; 603 604 d.metadata = READ_ONCE(cmd->metadata); 605 d.addr = READ_ONCE(cmd->addr); 606 d.data_len = READ_ONCE(cmd->data_len); 607 d.metadata_len = READ_ONCE(cmd->metadata_len); 608 d.timeout_ms = READ_ONCE(cmd->timeout_ms); 609 610 if (issue_flags & IO_URING_F_NONBLOCK) { 611 rq_flags |= REQ_NOWAIT; 612 blk_flags = BLK_MQ_REQ_NOWAIT; 613 } 614 if (issue_flags & IO_URING_F_IOPOLL) 615 rq_flags |= REQ_POLLED; 616 617 req = nvme_alloc_user_request(q, &c, rq_flags, blk_flags); 618 if (IS_ERR(req)) 619 return PTR_ERR(req); 620 req->timeout = d.timeout_ms ? msecs_to_jiffies(d.timeout_ms) : 0; 621 622 if (d.addr && d.data_len) { 623 ret = nvme_map_user_request(req, d.addr, 624 d.data_len, nvme_to_user_ptr(d.metadata), 625 d.metadata_len, 0, &meta, ioucmd, vec); 626 if (ret) 627 return ret; 628 } 629 630 if (blk_rq_is_poll(req)) { 631 ioucmd->flags |= IORING_URING_CMD_POLLED; 632 WRITE_ONCE(ioucmd->cookie, req); 633 } 634 635 /* to free bio on completion, as req->bio will be null at that time */ 636 pdu->bio = req->bio; 637 pdu->meta_len = d.metadata_len; 638 req->end_io_data = ioucmd; 639 if (pdu->meta_len) { 640 pdu->u.meta = meta; 641 pdu->u.meta_buffer = nvme_to_user_ptr(d.metadata); 642 req->end_io = nvme_uring_cmd_end_io_meta; 643 } else { 644 req->end_io = nvme_uring_cmd_end_io; 645 } 646 blk_execute_rq_nowait(req, false); 647 return -EIOCBQUEUED; 648 } 649 650 static bool is_ctrl_ioctl(unsigned int cmd) 651 { 652 if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD) 653 return true; 654 if (is_sed_ioctl(cmd)) 655 return true; 656 return false; 657 } 658 659 static int nvme_ctrl_ioctl(struct nvme_ctrl *ctrl, unsigned int cmd, 660 void __user *argp, bool open_for_write) 661 { 662 switch (cmd) { 663 case NVME_IOCTL_ADMIN_CMD: 664 return nvme_user_cmd(ctrl, NULL, argp, 0, open_for_write); 665 case NVME_IOCTL_ADMIN64_CMD: 666 return nvme_user_cmd64(ctrl, NULL, argp, 0, open_for_write); 667 default: 668 return sed_ioctl(ctrl->opal_dev, cmd, argp); 669 } 670 } 671 672 #ifdef COMPAT_FOR_U64_ALIGNMENT 673 struct nvme_user_io32 { 674 __u8 opcode; 675 __u8 flags; 676 __u16 control; 677 __u16 nblocks; 678 __u16 rsvd; 679 __u64 metadata; 680 __u64 addr; 681 __u64 slba; 682 __u32 dsmgmt; 683 __u32 reftag; 684 __u16 apptag; 685 __u16 appmask; 686 } __attribute__((__packed__)); 687 #define NVME_IOCTL_SUBMIT_IO32 _IOW('N', 0x42, struct nvme_user_io32) 688 #endif /* COMPAT_FOR_U64_ALIGNMENT */ 689 690 static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd, 691 void __user *argp, unsigned int flags, bool open_for_write) 692 { 693 switch (cmd) { 694 case NVME_IOCTL_ID: 695 force_successful_syscall_return(); 696 return ns->head->ns_id; 697 case NVME_IOCTL_IO_CMD: 698 return nvme_user_cmd(ns->ctrl, ns, argp, flags, open_for_write); 699 /* 700 * struct nvme_user_io can have different padding on some 32-bit ABIs. 701 * Just accept the compat version as all fields that are used are the 702 * same size and at the same offset. 703 */ 704 #ifdef COMPAT_FOR_U64_ALIGNMENT 705 case NVME_IOCTL_SUBMIT_IO32: 706 #endif 707 case NVME_IOCTL_SUBMIT_IO: 708 return nvme_submit_io(ns, argp); 709 case NVME_IOCTL_IO64_CMD_VEC: 710 flags |= NVME_IOCTL_VEC; 711 fallthrough; 712 case NVME_IOCTL_IO64_CMD: 713 return nvme_user_cmd64(ns->ctrl, ns, argp, flags, 714 open_for_write); 715 default: 716 return -ENOTTY; 717 } 718 } 719 720 int nvme_ioctl(struct block_device *bdev, blk_mode_t mode, 721 unsigned int cmd, unsigned long arg) 722 { 723 struct nvme_ns *ns = bdev->bd_disk->private_data; 724 bool open_for_write = mode & BLK_OPEN_WRITE; 725 void __user *argp = (void __user *)arg; 726 unsigned int flags = 0; 727 728 if (bdev_is_partition(bdev)) 729 flags |= NVME_IOCTL_PARTITION; 730 731 if (is_ctrl_ioctl(cmd)) 732 return nvme_ctrl_ioctl(ns->ctrl, cmd, argp, open_for_write); 733 return nvme_ns_ioctl(ns, cmd, argp, flags, open_for_write); 734 } 735 736 long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 737 { 738 struct nvme_ns *ns = 739 container_of(file_inode(file)->i_cdev, struct nvme_ns, cdev); 740 bool open_for_write = file->f_mode & FMODE_WRITE; 741 void __user *argp = (void __user *)arg; 742 743 if (is_ctrl_ioctl(cmd)) 744 return nvme_ctrl_ioctl(ns->ctrl, cmd, argp, open_for_write); 745 return nvme_ns_ioctl(ns, cmd, argp, 0, open_for_write); 746 } 747 748 static int nvme_uring_cmd_checks(unsigned int issue_flags) 749 { 750 751 /* NVMe passthrough requires big SQE/CQE support */ 752 if ((issue_flags & (IO_URING_F_SQE128|IO_URING_F_CQE32)) != 753 (IO_URING_F_SQE128|IO_URING_F_CQE32)) 754 return -EOPNOTSUPP; 755 return 0; 756 } 757 758 static int nvme_ns_uring_cmd(struct nvme_ns *ns, struct io_uring_cmd *ioucmd, 759 unsigned int issue_flags) 760 { 761 struct nvme_ctrl *ctrl = ns->ctrl; 762 int ret; 763 764 BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu)); 765 766 ret = nvme_uring_cmd_checks(issue_flags); 767 if (ret) 768 return ret; 769 770 switch (ioucmd->cmd_op) { 771 case NVME_URING_CMD_IO: 772 ret = nvme_uring_cmd_io(ctrl, ns, ioucmd, issue_flags, false); 773 break; 774 case NVME_URING_CMD_IO_VEC: 775 ret = nvme_uring_cmd_io(ctrl, ns, ioucmd, issue_flags, true); 776 break; 777 default: 778 ret = -ENOTTY; 779 } 780 781 return ret; 782 } 783 784 int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags) 785 { 786 struct nvme_ns *ns = container_of(file_inode(ioucmd->file)->i_cdev, 787 struct nvme_ns, cdev); 788 789 return nvme_ns_uring_cmd(ns, ioucmd, issue_flags); 790 } 791 792 int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd, 793 struct io_comp_batch *iob, 794 unsigned int poll_flags) 795 { 796 struct request *req; 797 int ret = 0; 798 799 if (!(ioucmd->flags & IORING_URING_CMD_POLLED)) 800 return 0; 801 802 req = READ_ONCE(ioucmd->cookie); 803 if (req && blk_rq_is_poll(req)) 804 ret = blk_rq_poll(req, iob, poll_flags); 805 return ret; 806 } 807 #ifdef CONFIG_NVME_MULTIPATH 808 static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, 809 void __user *argp, struct nvme_ns_head *head, int srcu_idx, 810 bool open_for_write) 811 __releases(&head->srcu) 812 { 813 struct nvme_ctrl *ctrl = ns->ctrl; 814 int ret; 815 816 nvme_get_ctrl(ns->ctrl); 817 srcu_read_unlock(&head->srcu, srcu_idx); 818 ret = nvme_ctrl_ioctl(ns->ctrl, cmd, argp, open_for_write); 819 820 nvme_put_ctrl(ctrl); 821 return ret; 822 } 823 824 int nvme_ns_head_ioctl(struct block_device *bdev, blk_mode_t mode, 825 unsigned int cmd, unsigned long arg) 826 { 827 struct nvme_ns_head *head = bdev->bd_disk->private_data; 828 bool open_for_write = mode & BLK_OPEN_WRITE; 829 void __user *argp = (void __user *)arg; 830 struct nvme_ns *ns; 831 int srcu_idx, ret = -EWOULDBLOCK; 832 unsigned int flags = 0; 833 834 if (bdev_is_partition(bdev)) 835 flags |= NVME_IOCTL_PARTITION; 836 837 srcu_idx = srcu_read_lock(&head->srcu); 838 ns = nvme_find_path(head); 839 if (!ns) 840 goto out_unlock; 841 842 /* 843 * Handle ioctls that apply to the controller instead of the namespace 844 * seperately and drop the ns SRCU reference early. This avoids a 845 * deadlock when deleting namespaces using the passthrough interface. 846 */ 847 if (is_ctrl_ioctl(cmd)) 848 return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx, 849 open_for_write); 850 851 ret = nvme_ns_ioctl(ns, cmd, argp, flags, open_for_write); 852 out_unlock: 853 srcu_read_unlock(&head->srcu, srcu_idx); 854 return ret; 855 } 856 857 long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd, 858 unsigned long arg) 859 { 860 bool open_for_write = file->f_mode & FMODE_WRITE; 861 struct cdev *cdev = file_inode(file)->i_cdev; 862 struct nvme_ns_head *head = 863 container_of(cdev, struct nvme_ns_head, cdev); 864 void __user *argp = (void __user *)arg; 865 struct nvme_ns *ns; 866 int srcu_idx, ret = -EWOULDBLOCK; 867 868 srcu_idx = srcu_read_lock(&head->srcu); 869 ns = nvme_find_path(head); 870 if (!ns) 871 goto out_unlock; 872 873 if (is_ctrl_ioctl(cmd)) 874 return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx, 875 open_for_write); 876 877 ret = nvme_ns_ioctl(ns, cmd, argp, 0, open_for_write); 878 out_unlock: 879 srcu_read_unlock(&head->srcu, srcu_idx); 880 return ret; 881 } 882 883 int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd, 884 unsigned int issue_flags) 885 { 886 struct cdev *cdev = file_inode(ioucmd->file)->i_cdev; 887 struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev); 888 int srcu_idx = srcu_read_lock(&head->srcu); 889 struct nvme_ns *ns = nvme_find_path(head); 890 int ret = -EINVAL; 891 892 if (ns) 893 ret = nvme_ns_uring_cmd(ns, ioucmd, issue_flags); 894 srcu_read_unlock(&head->srcu, srcu_idx); 895 return ret; 896 } 897 #endif /* CONFIG_NVME_MULTIPATH */ 898 899 int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags) 900 { 901 struct nvme_ctrl *ctrl = ioucmd->file->private_data; 902 int ret; 903 904 /* IOPOLL not supported yet */ 905 if (issue_flags & IO_URING_F_IOPOLL) 906 return -EOPNOTSUPP; 907 908 ret = nvme_uring_cmd_checks(issue_flags); 909 if (ret) 910 return ret; 911 912 switch (ioucmd->cmd_op) { 913 case NVME_URING_CMD_ADMIN: 914 ret = nvme_uring_cmd_io(ctrl, NULL, ioucmd, issue_flags, false); 915 break; 916 case NVME_URING_CMD_ADMIN_VEC: 917 ret = nvme_uring_cmd_io(ctrl, NULL, ioucmd, issue_flags, true); 918 break; 919 default: 920 ret = -ENOTTY; 921 } 922 923 return ret; 924 } 925 926 static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp, 927 bool open_for_write) 928 { 929 struct nvme_ns *ns; 930 int ret, srcu_idx; 931 932 srcu_idx = srcu_read_lock(&ctrl->srcu); 933 if (list_empty(&ctrl->namespaces)) { 934 ret = -ENOTTY; 935 goto out_unlock; 936 } 937 938 ns = list_first_or_null_rcu(&ctrl->namespaces, struct nvme_ns, list); 939 if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) { 940 dev_warn(ctrl->device, 941 "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n"); 942 ret = -EINVAL; 943 goto out_unlock; 944 } 945 946 dev_warn(ctrl->device, 947 "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n"); 948 if (!nvme_get_ns(ns)) { 949 ret = -ENXIO; 950 goto out_unlock; 951 } 952 srcu_read_unlock(&ctrl->srcu, srcu_idx); 953 954 ret = nvme_user_cmd(ctrl, ns, argp, 0, open_for_write); 955 nvme_put_ns(ns); 956 return ret; 957 958 out_unlock: 959 srcu_read_unlock(&ctrl->srcu, srcu_idx); 960 return ret; 961 } 962 963 long nvme_dev_ioctl(struct file *file, unsigned int cmd, 964 unsigned long arg) 965 { 966 bool open_for_write = file->f_mode & FMODE_WRITE; 967 struct nvme_ctrl *ctrl = file->private_data; 968 void __user *argp = (void __user *)arg; 969 970 switch (cmd) { 971 case NVME_IOCTL_ADMIN_CMD: 972 return nvme_user_cmd(ctrl, NULL, argp, 0, open_for_write); 973 case NVME_IOCTL_ADMIN64_CMD: 974 return nvme_user_cmd64(ctrl, NULL, argp, 0, open_for_write); 975 case NVME_IOCTL_IO_CMD: 976 return nvme_dev_user_cmd(ctrl, argp, open_for_write); 977 case NVME_IOCTL_RESET: 978 if (!capable(CAP_SYS_ADMIN)) 979 return -EACCES; 980 dev_warn(ctrl->device, "resetting controller\n"); 981 return nvme_reset_ctrl_sync(ctrl); 982 case NVME_IOCTL_SUBSYS_RESET: 983 if (!capable(CAP_SYS_ADMIN)) 984 return -EACCES; 985 return nvme_reset_subsystem(ctrl); 986 case NVME_IOCTL_RESCAN: 987 if (!capable(CAP_SYS_ADMIN)) 988 return -EACCES; 989 nvme_queue_scan(ctrl); 990 return 0; 991 default: 992 return -ENOTTY; 993 } 994 } 995