// SPDX-License-Identifier: GPL-2.0
/*
 * NVMe over Fabrics TCP target.
 * Copyright (c) 2018 Lightbits Labs. All rights reserved.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/nvme-tcp.h>
#include <net/sock.h>
#include <net/tcp.h>
#include <linux/inet.h>
#include <linux/llist.h>
#include <crypto/hash.h>

#include "nvmet.h"

#define NVMET_TCP_DEF_INLINE_DATA_SIZE	(4 * PAGE_SIZE)

/* Define the socket priority to use for connections where it is desirable
 * that the NIC consider performing optimized packet processing or filtering.
 * A non-zero value being sufficient to indicate general consideration of any
 * possible optimization. Making it a module param allows for alternative
 * values that may be unique for some NIC implementations.
 */
static int so_priority;
module_param(so_priority, int, 0644);
MODULE_PARM_DESC(so_priority, "nvmet tcp socket optimize priority");

#define NVMET_TCP_RECV_BUDGET		8
#define NVMET_TCP_SEND_BUDGET		8
#define NVMET_TCP_IO_WORK_BUDGET	64

enum nvmet_tcp_send_state {
	NVMET_TCP_SEND_DATA_PDU,
	NVMET_TCP_SEND_DATA,
	NVMET_TCP_SEND_R2T,
	NVMET_TCP_SEND_DDGST,
	NVMET_TCP_SEND_RESPONSE
};

enum nvmet_tcp_recv_state {
	NVMET_TCP_RECV_PDU,
	NVMET_TCP_RECV_DATA,
	NVMET_TCP_RECV_DDGST,
	NVMET_TCP_RECV_ERR,
};

enum {
	NVMET_TCP_F_INIT_FAILED = (1 << 0),
};

struct nvmet_tcp_cmd {
	struct nvmet_tcp_queue		*queue;
	struct nvmet_req		req;

	struct nvme_tcp_cmd_pdu		*cmd_pdu;
	struct nvme_tcp_rsp_pdu		*rsp_pdu;
	struct nvme_tcp_data_pdu	*data_pdu;
	struct nvme_tcp_r2t_pdu		*r2t_pdu;

	u32				rbytes_done;
	u32				wbytes_done;

	u32				pdu_len;
	u32				pdu_recv;
	int				sg_idx;
	int				nr_mapped;
	struct msghdr			recv_msg;
	struct kvec			*iov;
	u32				flags;

	struct list_head		entry;
	struct llist_node		lentry;

	/* send state */
	u32				offset;
	struct scatterlist		*cur_sg;
	enum nvmet_tcp_send_state	state;

	__le32				exp_ddgst;
	__le32				recv_ddgst;
};

enum nvmet_tcp_queue_state {
	NVMET_TCP_Q_CONNECTING,
	NVMET_TCP_Q_LIVE,
	NVMET_TCP_Q_DISCONNECTING,
};

struct nvmet_tcp_queue {
	struct socket		*sock;
	struct nvmet_tcp_port	*port;
	struct work_struct	io_work;
	int			cpu;
	struct nvmet_cq		nvme_cq;
	struct nvmet_sq		nvme_sq;

	/* send state */
	struct nvmet_tcp_cmd	*cmds;
	unsigned int		nr_cmds;
	struct list_head	free_list;
	struct llist_head	resp_list;
	struct list_head	resp_send_list;
	int			send_list_len;
	struct nvmet_tcp_cmd	*snd_cmd;

	/* recv state */
	int			offset;
	int			left;
	enum nvmet_tcp_recv_state rcv_state;
	struct nvmet_tcp_cmd	*cmd;
	union nvme_tcp_pdu	pdu;

	/* digest state */
	bool			hdr_digest;
	bool			data_digest;
	struct ahash_request	*snd_hash;
	struct ahash_request	*rcv_hash;

	spinlock_t		state_lock;
	enum nvmet_tcp_queue_state state;

	struct sockaddr_storage	sockaddr;
	struct sockaddr_storage	sockaddr_peer;
	struct work_struct	release_work;

	int			idx;
	struct list_head	queue_list;

	struct nvmet_tcp_cmd	connect;

	struct page_frag_cache	pf_cache;

	void (*data_ready)(struct sock *);
	void (*state_change)(struct sock *);
	void (*write_space)(struct sock *);
};
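
/*
 * Per-listening-port context: the listening socket, the work item that
 * accepts new connections, and the original sk_data_ready callback that is
 * restored when the port is removed. last_cpu is used to spread new queues
 * round-robin across online CPUs.
 */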
struct nvmet_tcp_port {
	struct socket		*sock;
	struct work_struct	accept_work;
	struct nvmet_port	*nport;
	struct sockaddr_storage addr;
	int			last_cpu;
	void (*data_ready)(struct sock *);
};

static DEFINE_IDA(nvmet_tcp_queue_ida);
static LIST_HEAD(nvmet_tcp_queue_list);
static DEFINE_MUTEX(nvmet_tcp_queue_mutex);

static struct workqueue_struct *nvmet_tcp_wq;
static const struct nvmet_fabrics_ops nvmet_tcp_ops;
static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c);
static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd);

static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
		struct nvmet_tcp_cmd *cmd)
{
	return cmd - queue->cmds;
}

static inline bool nvmet_tcp_has_data_in(struct nvmet_tcp_cmd *cmd)
{
	return nvme_is_write(cmd->req.cmd) &&
		cmd->rbytes_done < cmd->req.transfer_len;
}

static inline bool nvmet_tcp_need_data_in(struct nvmet_tcp_cmd *cmd)
{
	return nvmet_tcp_has_data_in(cmd) && !cmd->req.cqe->status;
}

static inline bool nvmet_tcp_need_data_out(struct nvmet_tcp_cmd *cmd)
{
	return !nvme_is_write(cmd->req.cmd) &&
		cmd->req.transfer_len > 0 &&
		!cmd->req.cqe->status;
}

static inline bool nvmet_tcp_has_inline_data(struct nvmet_tcp_cmd *cmd)
{
	return nvme_is_write(cmd->req.cmd) && cmd->pdu_len &&
		!cmd->rbytes_done;
}

static inline struct nvmet_tcp_cmd *
nvmet_tcp_get_cmd(struct nvmet_tcp_queue *queue)
{
	struct nvmet_tcp_cmd *cmd;

	cmd = list_first_entry_or_null(&queue->free_list,
				struct nvmet_tcp_cmd, entry);
	if (!cmd)
		return NULL;
	list_del_init(&cmd->entry);

	cmd->rbytes_done = cmd->wbytes_done = 0;
	cmd->pdu_len = 0;
	cmd->pdu_recv = 0;
	cmd->iov = NULL;
	cmd->flags = 0;
	return cmd;
}

static inline void nvmet_tcp_put_cmd(struct nvmet_tcp_cmd *cmd)
{
	if (unlikely(cmd == &cmd->queue->connect))
		return;

	list_add_tail(&cmd->entry, &cmd->queue->free_list);
}

static inline u8 nvmet_tcp_hdgst_len(struct nvmet_tcp_queue *queue)
{
	return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
}

static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue)
{
	return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
}
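
/*
 * Header digest handling: nvmet_tcp_hdgst() computes a CRC32C digest over
 * the PDU header and writes it directly after the header bytes (pdu + len).
 * nvmet_tcp_verify_hdgst() recomputes the digest over the received header
 * and compares it against the value the host placed at hdr->hlen.
 */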
static inline void nvmet_tcp_hdgst(struct ahash_request *hash,
		void *pdu, size_t len)
{
	struct scatterlist sg;

	sg_init_one(&sg, pdu, len);
	ahash_request_set_crypt(hash, &sg, pdu + len, len);
	crypto_ahash_digest(hash);
}

static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue,
	void *pdu, size_t len)
{
	struct nvme_tcp_hdr *hdr = pdu;
	__le32 recv_digest;
	__le32 exp_digest;

	if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
		pr_err("queue %d: header digest enabled but no header digest\n",
			queue->idx);
		return -EPROTO;
	}

	recv_digest = *(__le32 *)(pdu + hdr->hlen);
	nvmet_tcp_hdgst(queue->rcv_hash, pdu, len);
	exp_digest = *(__le32 *)(pdu + hdr->hlen);
	if (recv_digest != exp_digest) {
		pr_err("queue %d: header digest error: recv %#x expected %#x\n",
			queue->idx, le32_to_cpu(recv_digest),
			le32_to_cpu(exp_digest));
		return -EPROTO;
	}

	return 0;
}

static int nvmet_tcp_check_ddgst(struct nvmet_tcp_queue *queue, void *pdu)
{
	struct nvme_tcp_hdr *hdr = pdu;
	u8 digest_len = nvmet_tcp_hdgst_len(queue);
	u32 len;

	len = le32_to_cpu(hdr->plen) - hdr->hlen -
		(hdr->flags & NVME_TCP_F_HDGST ? digest_len : 0);

	if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
		pr_err("queue %d: data digest flag is cleared\n", queue->idx);
		return -EPROTO;
	}

	return 0;
}

static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd)
{
	struct scatterlist *sg;
	int i;

	sg = &cmd->req.sg[cmd->sg_idx];

	for (i = 0; i < cmd->nr_mapped; i++)
		kunmap(sg_page(&sg[i]));
}

static void nvmet_tcp_map_pdu_iovec(struct nvmet_tcp_cmd *cmd)
{
	struct kvec *iov = cmd->iov;
	struct scatterlist *sg;
	u32 length, offset, sg_offset;

	length = cmd->pdu_len;
	cmd->nr_mapped = DIV_ROUND_UP(length, PAGE_SIZE);
	offset = cmd->rbytes_done;
	cmd->sg_idx = DIV_ROUND_UP(offset, PAGE_SIZE);
	sg_offset = offset % PAGE_SIZE;
	sg = &cmd->req.sg[cmd->sg_idx];

	while (length) {
		u32 iov_len = min_t(u32, length, sg->length - sg_offset);

		iov->iov_base = kmap(sg_page(sg)) + sg->offset + sg_offset;
		iov->iov_len = iov_len;

		length -= iov_len;
		sg = sg_next(sg);
		iov++;
	}

	iov_iter_kvec(&cmd->recv_msg.msg_iter, READ, cmd->iov,
		cmd->nr_mapped, cmd->pdu_len);
}

static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue)
{
	queue->rcv_state = NVMET_TCP_RECV_ERR;
	if (queue->nvme_sq.ctrl)
		nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
	else
		kernel_sock_shutdown(queue->sock, SHUT_RDWR);
}

static void nvmet_tcp_socket_error(struct nvmet_tcp_queue *queue, int status)
{
	if (status == -EPIPE || status == -ECONNRESET)
		kernel_sock_shutdown(queue->sock, SHUT_RDWR);
	else
		nvmet_tcp_fatal_error(queue);
}
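
/*
 * Set up the data buffer described by the command SGL. An in-capsule data
 * descriptor (offset SGL) is only accepted for writes and must fit within
 * the port's inline_data_size. The scatterlist is allocated here, and for
 * host-to-controller data an iovec array is allocated as well so the
 * payload can be received directly into the sg pages.
 */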
static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd)
{
	struct nvme_sgl_desc *sgl = &cmd->req.cmd->common.dptr.sgl;
	u32 len = le32_to_cpu(sgl->length);

	if (!len)
		return 0;

	if (sgl->type == ((NVME_SGL_FMT_DATA_DESC << 4) |
			  NVME_SGL_FMT_OFFSET)) {
		if (!nvme_is_write(cmd->req.cmd))
			return NVME_SC_INVALID_FIELD | NVME_SC_DNR;

		if (len > cmd->req.port->inline_data_size)
			return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
		cmd->pdu_len = len;
	}
	cmd->req.transfer_len += len;

	cmd->req.sg = sgl_alloc(len, GFP_KERNEL, &cmd->req.sg_cnt);
	if (!cmd->req.sg)
		return NVME_SC_INTERNAL;
	cmd->cur_sg = cmd->req.sg;

	if (nvmet_tcp_has_data_in(cmd)) {
		cmd->iov = kmalloc_array(cmd->req.sg_cnt,
				sizeof(*cmd->iov), GFP_KERNEL);
		if (!cmd->iov)
			goto err;
	}

	return 0;
err:
	sgl_free(cmd->req.sg);
	return NVME_SC_INTERNAL;
}

static void nvmet_tcp_ddgst(struct ahash_request *hash,
		struct nvmet_tcp_cmd *cmd)
{
	ahash_request_set_crypt(hash, cmd->req.sg,
		(void *)&cmd->exp_ddgst, cmd->req.transfer_len);
	crypto_ahash_digest(hash);
}

static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd)
{
	struct nvme_tcp_data_pdu *pdu = cmd->data_pdu;
	struct nvmet_tcp_queue *queue = cmd->queue;
	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
	u8 ddgst = nvmet_tcp_ddgst_len(cmd->queue);

	cmd->offset = 0;
	cmd->state = NVMET_TCP_SEND_DATA_PDU;

	pdu->hdr.type = nvme_tcp_c2h_data;
	pdu->hdr.flags = NVME_TCP_F_DATA_LAST | (queue->nvme_sq.sqhd_disabled ?
						NVME_TCP_F_DATA_SUCCESS : 0);
	pdu->hdr.hlen = sizeof(*pdu);
	pdu->hdr.pdo = pdu->hdr.hlen + hdgst;
	pdu->hdr.plen =
		cpu_to_le32(pdu->hdr.hlen + hdgst +
				cmd->req.transfer_len + ddgst);
	pdu->command_id = cmd->req.cqe->command_id;
	pdu->data_length = cpu_to_le32(cmd->req.transfer_len);
	pdu->data_offset = cpu_to_le32(cmd->wbytes_done);

	if (queue->data_digest) {
		pdu->hdr.flags |= NVME_TCP_F_DDGST;
		nvmet_tcp_ddgst(queue->snd_hash, cmd);
	}

	if (cmd->queue->hdr_digest) {
		pdu->hdr.flags |= NVME_TCP_F_HDGST;
		nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
	}
}

static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd)
{
	struct nvme_tcp_r2t_pdu *pdu = cmd->r2t_pdu;
	struct nvmet_tcp_queue *queue = cmd->queue;
	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);

	cmd->offset = 0;
	cmd->state = NVMET_TCP_SEND_R2T;

	pdu->hdr.type = nvme_tcp_r2t;
	pdu->hdr.flags = 0;
	pdu->hdr.hlen = sizeof(*pdu);
	pdu->hdr.pdo = 0;
	pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);

	pdu->command_id = cmd->req.cmd->common.command_id;
	pdu->ttag = nvmet_tcp_cmd_tag(cmd->queue, cmd);
	pdu->r2t_length = cpu_to_le32(cmd->req.transfer_len - cmd->rbytes_done);
	pdu->r2t_offset = cpu_to_le32(cmd->rbytes_done);
	if (cmd->queue->hdr_digest) {
		pdu->hdr.flags |= NVME_TCP_F_HDGST;
		nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
	}
}

static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd)
{
	struct nvme_tcp_rsp_pdu *pdu = cmd->rsp_pdu;
	struct nvmet_tcp_queue *queue = cmd->queue;
	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);

	cmd->offset = 0;
	cmd->state = NVMET_TCP_SEND_RESPONSE;

	pdu->hdr.type = nvme_tcp_rsp;
	pdu->hdr.flags = 0;
	pdu->hdr.hlen = sizeof(*pdu);
	pdu->hdr.pdo = 0;
	pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
	if (cmd->queue->hdr_digest) {
		pdu->hdr.flags |= NVME_TCP_F_HDGST;
		nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
	}
}

static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue)
{
	struct llist_node *node;
	struct nvmet_tcp_cmd *cmd;

	for (node = llist_del_all(&queue->resp_list); node; node = node->next) {
		cmd = llist_entry(node, struct nvmet_tcp_cmd, lentry);
		list_add(&cmd->entry, &queue->resp_send_list);
		queue->send_list_len++;
	}
}
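
/*
 * Pick the next command to transmit. Completions are queued lock-free on
 * resp_list (see nvmet_tcp_queue_response()); when the ordered send list
 * runs dry it is replenished from resp_list, and the chosen command has
 * its first PDU (C2H data, R2T or response capsule) prepared here.
 */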
static struct nvmet_tcp_cmd *nvmet_tcp_fetch_cmd(struct nvmet_tcp_queue *queue)
{
	queue->snd_cmd = list_first_entry_or_null(&queue->resp_send_list,
				struct nvmet_tcp_cmd, entry);
	if (!queue->snd_cmd) {
		nvmet_tcp_process_resp_list(queue);
		queue->snd_cmd =
			list_first_entry_or_null(&queue->resp_send_list,
					struct nvmet_tcp_cmd, entry);
		if (unlikely(!queue->snd_cmd))
			return NULL;
	}

	list_del_init(&queue->snd_cmd->entry);
	queue->send_list_len--;

	if (nvmet_tcp_need_data_out(queue->snd_cmd))
		nvmet_setup_c2h_data_pdu(queue->snd_cmd);
	else if (nvmet_tcp_need_data_in(queue->snd_cmd))
		nvmet_setup_r2t_pdu(queue->snd_cmd);
	else
		nvmet_setup_response_pdu(queue->snd_cmd);

	return queue->snd_cmd;
}

static void nvmet_tcp_queue_response(struct nvmet_req *req)
{
	struct nvmet_tcp_cmd *cmd =
		container_of(req, struct nvmet_tcp_cmd, req);
	struct nvmet_tcp_queue *queue = cmd->queue;

	llist_add(&cmd->lentry, &queue->resp_list);
	queue_work_on(cmd->queue->cpu, nvmet_tcp_wq, &cmd->queue->io_work);
}

static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd)
{
	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
	int left = sizeof(*cmd->data_pdu) - cmd->offset + hdgst;
	int ret;

	ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->data_pdu),
			offset_in_page(cmd->data_pdu) + cmd->offset,
			left, MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
	if (ret <= 0)
		return ret;

	cmd->offset += ret;
	left -= ret;

	if (left)
		return -EAGAIN;

	cmd->state = NVMET_TCP_SEND_DATA;
	cmd->offset = 0;
	return 1;
}

static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
{
	struct nvmet_tcp_queue *queue = cmd->queue;
	int ret;

	while (cmd->cur_sg) {
		struct page *page = sg_page(cmd->cur_sg);
		u32 left = cmd->cur_sg->length - cmd->offset;
		int flags = MSG_DONTWAIT;

		if ((!last_in_batch && cmd->queue->send_list_len) ||
		    cmd->wbytes_done + left < cmd->req.transfer_len ||
		    queue->data_digest || !queue->nvme_sq.sqhd_disabled)
			flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;

		ret = kernel_sendpage(cmd->queue->sock, page, cmd->offset,
					left, flags);
		if (ret <= 0)
			return ret;

		cmd->offset += ret;
		cmd->wbytes_done += ret;

		/* Done with sg? */
		if (cmd->offset == cmd->cur_sg->length) {
			cmd->cur_sg = sg_next(cmd->cur_sg);
			cmd->offset = 0;
		}
	}

	if (queue->data_digest) {
		cmd->state = NVMET_TCP_SEND_DDGST;
		cmd->offset = 0;
	} else {
		if (queue->nvme_sq.sqhd_disabled) {
			cmd->queue->snd_cmd = NULL;
			nvmet_tcp_put_cmd(cmd);
		} else {
			nvmet_setup_response_pdu(cmd);
		}
	}

	if (queue->nvme_sq.sqhd_disabled) {
		kfree(cmd->iov);
		sgl_free(cmd->req.sg);
	}

	return 1;

}

static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,
		bool last_in_batch)
{
	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
	int left = sizeof(*cmd->rsp_pdu) - cmd->offset + hdgst;
	int flags = MSG_DONTWAIT;
	int ret;

	if (!last_in_batch && cmd->queue->send_list_len)
		flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
	else
		flags |= MSG_EOR;

	ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->rsp_pdu),
		offset_in_page(cmd->rsp_pdu) + cmd->offset, left, flags);
	if (ret <= 0)
		return ret;
	cmd->offset += ret;
	left -= ret;

	if (left)
		return -EAGAIN;

	kfree(cmd->iov);
	sgl_free(cmd->req.sg);
	cmd->queue->snd_cmd = NULL;
	nvmet_tcp_put_cmd(cmd);
	return 1;
}

static int nvmet_try_send_r2t(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
{
	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
	int left = sizeof(*cmd->r2t_pdu) - cmd->offset + hdgst;
	int flags = MSG_DONTWAIT;
	int ret;

	if (!last_in_batch && cmd->queue->send_list_len)
		flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
	else
		flags |= MSG_EOR;

	ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->r2t_pdu),
		offset_in_page(cmd->r2t_pdu) + cmd->offset, left, flags);
	if (ret <= 0)
		return ret;
	cmd->offset += ret;
	left -= ret;

	if (left)
		return -EAGAIN;

	cmd->queue->snd_cmd = NULL;
	return 1;
}

static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
{
	struct nvmet_tcp_queue *queue = cmd->queue;
	struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
	struct kvec iov = {
		.iov_base = &cmd->exp_ddgst + cmd->offset,
		.iov_len = NVME_TCP_DIGEST_LENGTH - cmd->offset
	};
	int ret;

	if (!last_in_batch && cmd->queue->send_list_len)
		msg.msg_flags |= MSG_MORE;
	else
		msg.msg_flags |= MSG_EOR;

	ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
	if (unlikely(ret <= 0))
		return ret;

	cmd->offset += ret;

	if (queue->nvme_sq.sqhd_disabled) {
		cmd->queue->snd_cmd = NULL;
		nvmet_tcp_put_cmd(cmd);
	} else {
		nvmet_setup_response_pdu(cmd);
	}
	return 1;
}

static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue,
		bool last_in_batch)
{
	struct nvmet_tcp_cmd *cmd = queue->snd_cmd;
	int ret = 0;

	if (!cmd || queue->state == NVMET_TCP_Q_DISCONNECTING) {
		cmd = nvmet_tcp_fetch_cmd(queue);
		if (unlikely(!cmd))
			return 0;
	}

	if (cmd->state == NVMET_TCP_SEND_DATA_PDU) {
		ret = nvmet_try_send_data_pdu(cmd);
		if (ret <= 0)
			goto done_send;
	}

	if (cmd->state == NVMET_TCP_SEND_DATA) {
		ret = nvmet_try_send_data(cmd, last_in_batch);
		if (ret <= 0)
			goto done_send;
	}

	if (cmd->state == NVMET_TCP_SEND_DDGST) {
		ret = nvmet_try_send_ddgst(cmd, last_in_batch);
		if (ret <= 0)
			goto done_send;
	}

	if (cmd->state == NVMET_TCP_SEND_R2T) {
		ret = nvmet_try_send_r2t(cmd, last_in_batch);
		if (ret <= 0)
			goto done_send;
	}

	if (cmd->state == NVMET_TCP_SEND_RESPONSE)
		ret = nvmet_try_send_response(cmd, last_in_batch);

done_send:
	if (ret < 0) {
		if (ret == -EAGAIN)
			return 0;
		return ret;
	}

	return 1;
}

static int nvmet_tcp_try_send(struct nvmet_tcp_queue *queue,
		int budget, int *sends)
{
	int i, ret = 0;

	for (i = 0; i < budget; i++) {
		ret = nvmet_tcp_try_send_one(queue, i == budget - 1);
		if (unlikely(ret < 0)) {
			nvmet_tcp_socket_error(queue, ret);
			goto done;
		} else if (ret == 0) {
			break;
		}
		(*sends)++;
	}
done:
	return ret;
}

static void nvmet_prepare_receive_pdu(struct nvmet_tcp_queue *queue)
{
	queue->offset = 0;
	queue->left = sizeof(struct nvme_tcp_hdr);
	queue->cmd = NULL;
	queue->rcv_state = NVMET_TCP_RECV_PDU;
}
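
/*
 * Header and data digests are CRC32C, computed through the crypto ahash
 * API. A single "crc32c" transform is shared by two requests, one for the
 * send path and one for the receive path, allocated only when the host
 * negotiated a digest in the ICReq.
 */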
static void nvmet_tcp_free_crypto(struct nvmet_tcp_queue *queue)
{
	struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);

	ahash_request_free(queue->rcv_hash);
	ahash_request_free(queue->snd_hash);
	crypto_free_ahash(tfm);
}

static int nvmet_tcp_alloc_crypto(struct nvmet_tcp_queue *queue)
{
	struct crypto_ahash *tfm;

	tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
	if (!queue->snd_hash)
		goto free_tfm;
	ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);

	queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
	if (!queue->rcv_hash)
		goto free_snd_hash;
	ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);

	return 0;
free_snd_hash:
	ahash_request_free(queue->snd_hash);
free_tfm:
	crypto_free_ahash(tfm);
	return -ENOMEM;
}


static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue)
{
	struct nvme_tcp_icreq_pdu *icreq = &queue->pdu.icreq;
	struct nvme_tcp_icresp_pdu *icresp = &queue->pdu.icresp;
	struct msghdr msg = {};
	struct kvec iov;
	int ret;

	if (le32_to_cpu(icreq->hdr.plen) != sizeof(struct nvme_tcp_icreq_pdu)) {
		pr_err("bad nvme-tcp pdu length (%d)\n",
			le32_to_cpu(icreq->hdr.plen));
		nvmet_tcp_fatal_error(queue);
		/* don't keep processing the ICReq after a fatal error */
		return -EPROTO;
	}

	if (icreq->pfv != NVME_TCP_PFV_1_0) {
		pr_err("queue %d: bad pfv %d\n", queue->idx, icreq->pfv);
		return -EPROTO;
	}

	if (icreq->hpda != 0) {
		pr_err("queue %d: unsupported hpda %d\n", queue->idx,
			icreq->hpda);
		return -EPROTO;
	}

	queue->hdr_digest = !!(icreq->digest & NVME_TCP_HDR_DIGEST_ENABLE);
	queue->data_digest = !!(icreq->digest & NVME_TCP_DATA_DIGEST_ENABLE);
	if (queue->hdr_digest || queue->data_digest) {
		ret = nvmet_tcp_alloc_crypto(queue);
		if (ret)
			return ret;
	}

	memset(icresp, 0, sizeof(*icresp));
	icresp->hdr.type = nvme_tcp_icresp;
	icresp->hdr.hlen = sizeof(*icresp);
	icresp->hdr.pdo = 0;
	icresp->hdr.plen = cpu_to_le32(icresp->hdr.hlen);
	icresp->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
	icresp->maxdata = cpu_to_le32(0x400000); /* 16M arbitrary limit */
	icresp->cpda = 0;
	if (queue->hdr_digest)
		icresp->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
	if (queue->data_digest)
		icresp->digest |= NVME_TCP_DATA_DIGEST_ENABLE;

	iov.iov_base = icresp;
	iov.iov_len = sizeof(*icresp);
	ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
	if (ret < 0)
		goto free_crypto;

	queue->state = NVMET_TCP_Q_LIVE;
	nvmet_prepare_receive_pdu(queue);
	return 0;
free_crypto:
	if (queue->hdr_digest || queue->data_digest)
		nvmet_tcp_free_crypto(queue);
	return ret;
}

static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue,
		struct nvmet_tcp_cmd *cmd, struct nvmet_req *req)
{
	size_t data_len = le32_to_cpu(req->cmd->common.dptr.sgl.length);
	int ret;

	if (!nvme_is_write(cmd->req.cmd) ||
	    data_len > cmd->req.port->inline_data_size) {
		nvmet_prepare_receive_pdu(queue);
		return;
	}

	ret = nvmet_tcp_map_data(cmd);
	if (unlikely(ret)) {
		pr_err("queue %d: failed to map data\n", queue->idx);
		nvmet_tcp_fatal_error(queue);
		return;
	}

	queue->rcv_state = NVMET_TCP_RECV_DATA;
	nvmet_tcp_map_pdu_iovec(cmd);
	cmd->flags |= NVMET_TCP_F_INIT_FAILED;
}
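
/*
 * An H2CData PDU carries the transfer tag (ttag) we handed out in the R2T,
 * which indexes straight into queue->cmds. The data offset must match how
 * many bytes we have already received for that command; otherwise the
 * request is failed and the PDU is treated as a protocol error.
 */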
static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue)
{
	struct nvme_tcp_data_pdu *data = &queue->pdu.data;
	struct nvmet_tcp_cmd *cmd;

	cmd = &queue->cmds[data->ttag];

	if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) {
		pr_err("ttag %u unexpected data offset %u (expected %u)\n",
			data->ttag, le32_to_cpu(data->data_offset),
			cmd->rbytes_done);
		/* FIXME: use path and transport errors */
		nvmet_req_complete(&cmd->req,
			NVME_SC_INVALID_FIELD | NVME_SC_DNR);
		return -EPROTO;
	}

	cmd->pdu_len = le32_to_cpu(data->data_length);
	cmd->pdu_recv = 0;
	nvmet_tcp_map_pdu_iovec(cmd);
	queue->cmd = cmd;
	queue->rcv_state = NVMET_TCP_RECV_DATA;

	return 0;
}

static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue)
{
	struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
	struct nvme_command *nvme_cmd = &queue->pdu.cmd.cmd;
	struct nvmet_req *req;
	int ret;

	if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
		if (hdr->type != nvme_tcp_icreq) {
			pr_err("unexpected pdu type (%d) before icreq\n",
				hdr->type);
			nvmet_tcp_fatal_error(queue);
			return -EPROTO;
		}
		return nvmet_tcp_handle_icreq(queue);
	}

	if (hdr->type == nvme_tcp_h2c_data) {
		ret = nvmet_tcp_handle_h2c_data_pdu(queue);
		if (unlikely(ret))
			return ret;
		return 0;
	}

	queue->cmd = nvmet_tcp_get_cmd(queue);
	if (unlikely(!queue->cmd)) {
		/* This should never happen */
		pr_err("queue %d: out of commands (%d) send_list_len: %d, opcode: %d",
			queue->idx, queue->nr_cmds, queue->send_list_len,
			nvme_cmd->common.opcode);
		nvmet_tcp_fatal_error(queue);
		return -ENOMEM;
	}

	req = &queue->cmd->req;
	memcpy(req->cmd, nvme_cmd, sizeof(*nvme_cmd));

	if (unlikely(!nvmet_req_init(req, &queue->nvme_cq,
			&queue->nvme_sq, &nvmet_tcp_ops))) {
		pr_err("failed cmd %p id %d opcode %d, data_len: %d\n",
			req->cmd, req->cmd->common.command_id,
			req->cmd->common.opcode,
			le32_to_cpu(req->cmd->common.dptr.sgl.length));

		nvmet_tcp_handle_req_failure(queue, queue->cmd, req);
		return -EAGAIN;
	}

	ret = nvmet_tcp_map_data(queue->cmd);
	if (unlikely(ret)) {
		pr_err("queue %d: failed to map data\n", queue->idx);
		if (nvmet_tcp_has_inline_data(queue->cmd))
			nvmet_tcp_fatal_error(queue);
		else
			nvmet_req_complete(req, ret);
		ret = -EAGAIN;
		goto out;
	}

	if (nvmet_tcp_need_data_in(queue->cmd)) {
		if (nvmet_tcp_has_inline_data(queue->cmd)) {
			queue->rcv_state = NVMET_TCP_RECV_DATA;
			nvmet_tcp_map_pdu_iovec(queue->cmd);
			return 0;
		}
		/* send back R2T */
		nvmet_tcp_queue_response(&queue->cmd->req);
		goto out;
	}

	queue->cmd->req.execute(&queue->cmd->req);
out:
	nvmet_prepare_receive_pdu(queue);
	return ret;
}
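
/*
 * Only host-to-controller PDU types are ever expected here: ICReq, command
 * capsules and H2CData. Anything else fails the validation below and the
 * receive path treats it as a fatal protocol error.
 */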
static const u8 nvme_tcp_pdu_sizes[] = {
	[nvme_tcp_icreq]	= sizeof(struct nvme_tcp_icreq_pdu),
	[nvme_tcp_cmd]		= sizeof(struct nvme_tcp_cmd_pdu),
	[nvme_tcp_h2c_data]	= sizeof(struct nvme_tcp_data_pdu),
};

static inline u8 nvmet_tcp_pdu_size(u8 type)
{
	size_t idx = type;

	return (idx < ARRAY_SIZE(nvme_tcp_pdu_sizes) &&
		nvme_tcp_pdu_sizes[idx]) ?
			nvme_tcp_pdu_sizes[idx] : 0;
}

static inline bool nvmet_tcp_pdu_valid(u8 type)
{
	switch (type) {
	case nvme_tcp_icreq:
	case nvme_tcp_cmd:
	case nvme_tcp_h2c_data:
		/* fallthru */
		return true;
	}

	return false;
}

static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue)
{
	struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
	int len;
	struct kvec iov;
	struct msghdr msg = { .msg_flags = MSG_DONTWAIT };

recv:
	iov.iov_base = (void *)&queue->pdu + queue->offset;
	iov.iov_len = queue->left;
	len = kernel_recvmsg(queue->sock, &msg, &iov, 1,
			iov.iov_len, msg.msg_flags);
	if (unlikely(len < 0))
		return len;

	queue->offset += len;
	queue->left -= len;
	if (queue->left)
		return -EAGAIN;

	if (queue->offset == sizeof(struct nvme_tcp_hdr)) {
		u8 hdgst = nvmet_tcp_hdgst_len(queue);

		if (unlikely(!nvmet_tcp_pdu_valid(hdr->type))) {
			pr_err("unexpected pdu type %d\n", hdr->type);
			nvmet_tcp_fatal_error(queue);
			return -EIO;
		}

		if (unlikely(hdr->hlen != nvmet_tcp_pdu_size(hdr->type))) {
			pr_err("pdu %d bad hlen %d\n", hdr->type, hdr->hlen);
			return -EIO;
		}

		queue->left = hdr->hlen - queue->offset + hdgst;
		goto recv;
	}

	/* verify the digest over the header only, not the trailing digest */
	if (queue->hdr_digest &&
	    nvmet_tcp_verify_hdgst(queue, &queue->pdu, hdr->hlen)) {
		nvmet_tcp_fatal_error(queue); /* fatal */
		return -EPROTO;
	}

	if (queue->data_digest &&
	    nvmet_tcp_check_ddgst(queue, &queue->pdu)) {
		nvmet_tcp_fatal_error(queue); /* fatal */
		return -EPROTO;
	}

	return nvmet_tcp_done_recv_pdu(queue);
}

static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd)
{
	struct nvmet_tcp_queue *queue = cmd->queue;

	nvmet_tcp_ddgst(queue->rcv_hash, cmd);
	queue->offset = 0;
	queue->left = NVME_TCP_DIGEST_LENGTH;
	queue->rcv_state = NVMET_TCP_RECV_DDGST;
}

static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue)
{
	struct nvmet_tcp_cmd *cmd = queue->cmd;
	int ret;

	while (msg_data_left(&cmd->recv_msg)) {
		ret = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg,
			cmd->recv_msg.msg_flags);
		if (ret <= 0)
			return ret;

		cmd->pdu_recv += ret;
		cmd->rbytes_done += ret;
	}

	nvmet_tcp_unmap_pdu_iovec(cmd);

	if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) &&
	    cmd->rbytes_done == cmd->req.transfer_len) {
		if (queue->data_digest) {
			nvmet_tcp_prep_recv_ddgst(cmd);
			return 0;
		}
		cmd->req.execute(&cmd->req);
	}

	nvmet_prepare_receive_pdu(queue);
	return 0;
}
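
/*
 * The expected data digest was computed over the command's scatterlist
 * when receive of the payload completed (nvmet_tcp_prep_recv_ddgst());
 * here the digest sent by the host is read from the wire and the two are
 * compared before the request is allowed to execute.
 */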
static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue)
{
	struct nvmet_tcp_cmd *cmd = queue->cmd;
	int ret;
	struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
	struct kvec iov = {
		.iov_base = (void *)&cmd->recv_ddgst + queue->offset,
		.iov_len = queue->left
	};

	ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
			iov.iov_len, msg.msg_flags);
	if (unlikely(ret < 0))
		return ret;

	queue->offset += ret;
	queue->left -= ret;
	if (queue->left)
		return -EAGAIN;

	if (queue->data_digest && cmd->exp_ddgst != cmd->recv_ddgst) {
		pr_err("queue %d: cmd %d pdu (%d) data digest error: recv %#x expected %#x\n",
			queue->idx, cmd->req.cmd->common.command_id,
			queue->pdu.cmd.hdr.type, le32_to_cpu(cmd->recv_ddgst),
			le32_to_cpu(cmd->exp_ddgst));
		nvmet_tcp_finish_cmd(cmd);
		nvmet_tcp_fatal_error(queue);
		ret = -EPROTO;
		goto out;
	}

	if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) &&
	    cmd->rbytes_done == cmd->req.transfer_len)
		cmd->req.execute(&cmd->req);
	ret = 0;
out:
	nvmet_prepare_receive_pdu(queue);
	return ret;
}

static int nvmet_tcp_try_recv_one(struct nvmet_tcp_queue *queue)
{
	int result = 0;

	if (unlikely(queue->rcv_state == NVMET_TCP_RECV_ERR))
		return 0;

	if (queue->rcv_state == NVMET_TCP_RECV_PDU) {
		result = nvmet_tcp_try_recv_pdu(queue);
		if (result != 0)
			goto done_recv;
	}

	if (queue->rcv_state == NVMET_TCP_RECV_DATA) {
		result = nvmet_tcp_try_recv_data(queue);
		if (result != 0)
			goto done_recv;
	}

	if (queue->rcv_state == NVMET_TCP_RECV_DDGST) {
		result = nvmet_tcp_try_recv_ddgst(queue);
		if (result != 0)
			goto done_recv;
	}

done_recv:
	if (result < 0) {
		if (result == -EAGAIN)
			return 0;
		return result;
	}
	return 1;
}

static int nvmet_tcp_try_recv(struct nvmet_tcp_queue *queue,
		int budget, int *recvs)
{
	int i, ret = 0;

	for (i = 0; i < budget; i++) {
		ret = nvmet_tcp_try_recv_one(queue);
		if (unlikely(ret < 0)) {
			nvmet_tcp_socket_error(queue, ret);
			goto done;
		} else if (ret == 0) {
			break;
		}
		(*recvs)++;
	}
done:
	return ret;
}

static void nvmet_tcp_schedule_release_queue(struct nvmet_tcp_queue *queue)
{
	spin_lock(&queue->state_lock);
	if (queue->state != NVMET_TCP_Q_DISCONNECTING) {
		queue->state = NVMET_TCP_Q_DISCONNECTING;
		schedule_work(&queue->release_work);
	}
	spin_unlock(&queue->state_lock);
}

static void nvmet_tcp_io_work(struct work_struct *w)
{
	struct nvmet_tcp_queue *queue =
		container_of(w, struct nvmet_tcp_queue, io_work);
	bool pending;
	int ret, ops = 0;

	do {
		pending = false;

		ret = nvmet_tcp_try_recv(queue, NVMET_TCP_RECV_BUDGET, &ops);
		if (ret > 0)
			pending = true;
		else if (ret < 0)
			return;

		ret = nvmet_tcp_try_send(queue, NVMET_TCP_SEND_BUDGET, &ops);
		if (ret > 0)
			pending = true;
		else if (ret < 0)
			return;

	} while (pending && ops < NVMET_TCP_IO_WORK_BUDGET);

	/*
	 * We exhausted our budget, requeue ourselves
	 */
	if (pending)
		queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
}
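
/*
 * Each command owns its four PDU buffers (command, response, C2H data and
 * R2T), carved out of the queue's page_frag cache and sized to leave room
 * for an optional header digest.
 */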
static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue,
		struct nvmet_tcp_cmd *c)
{
	u8 hdgst = nvmet_tcp_hdgst_len(queue);

	c->queue = queue;
	c->req.port = queue->port->nport;

	c->cmd_pdu = page_frag_alloc(&queue->pf_cache,
			sizeof(*c->cmd_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
	if (!c->cmd_pdu)
		return -ENOMEM;
	c->req.cmd = &c->cmd_pdu->cmd;

	c->rsp_pdu = page_frag_alloc(&queue->pf_cache,
			sizeof(*c->rsp_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
	if (!c->rsp_pdu)
		goto out_free_cmd;
	c->req.cqe = &c->rsp_pdu->cqe;

	c->data_pdu = page_frag_alloc(&queue->pf_cache,
			sizeof(*c->data_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
	if (!c->data_pdu)
		goto out_free_rsp;

	c->r2t_pdu = page_frag_alloc(&queue->pf_cache,
			sizeof(*c->r2t_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
	if (!c->r2t_pdu)
		goto out_free_data;

	c->recv_msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;

	list_add_tail(&c->entry, &queue->free_list);

	return 0;
out_free_data:
	page_frag_free(c->data_pdu);
out_free_rsp:
	page_frag_free(c->rsp_pdu);
out_free_cmd:
	page_frag_free(c->cmd_pdu);
	return -ENOMEM;
}

static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c)
{
	page_frag_free(c->r2t_pdu);
	page_frag_free(c->data_pdu);
	page_frag_free(c->rsp_pdu);
	page_frag_free(c->cmd_pdu);
}

static int nvmet_tcp_alloc_cmds(struct nvmet_tcp_queue *queue)
{
	struct nvmet_tcp_cmd *cmds;
	int i, ret = -EINVAL, nr_cmds = queue->nr_cmds;

	cmds = kcalloc(nr_cmds, sizeof(struct nvmet_tcp_cmd), GFP_KERNEL);
	if (!cmds)
		goto out;

	for (i = 0; i < nr_cmds; i++) {
		ret = nvmet_tcp_alloc_cmd(queue, cmds + i);
		if (ret)
			goto out_free;
	}

	queue->cmds = cmds;

	return 0;
out_free:
	while (--i >= 0)
		nvmet_tcp_free_cmd(cmds + i);
	kfree(cmds);
out:
	return ret;
}

static void nvmet_tcp_free_cmds(struct nvmet_tcp_queue *queue)
{
	struct nvmet_tcp_cmd *cmds = queue->cmds;
	int i;

	for (i = 0; i < queue->nr_cmds; i++)
		nvmet_tcp_free_cmd(cmds + i);

	nvmet_tcp_free_cmd(&queue->connect);
	kfree(cmds);
}

static void nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue)
{
	struct socket *sock = queue->sock;

	write_lock_bh(&sock->sk->sk_callback_lock);
	sock->sk->sk_data_ready = queue->data_ready;
	sock->sk->sk_state_change = queue->state_change;
	sock->sk->sk_write_space = queue->write_space;
	sock->sk->sk_user_data = NULL;
	write_unlock_bh(&sock->sk->sk_callback_lock);
}

static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd)
{
	nvmet_req_uninit(&cmd->req);
	nvmet_tcp_unmap_pdu_iovec(cmd);
	kfree(cmd->iov);
	sgl_free(cmd->req.sg);
}

static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue)
{
	struct nvmet_tcp_cmd *cmd = queue->cmds;
	int i;

	for (i = 0; i < queue->nr_cmds; i++, cmd++) {
		if (nvmet_tcp_need_data_in(cmd))
			nvmet_tcp_finish_cmd(cmd);
	}

	if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect)) {
		/* failed in connect */
		nvmet_tcp_finish_cmd(&queue->connect);
	}
}

static void nvmet_tcp_release_queue_work(struct work_struct *w)
{
	struct nvmet_tcp_queue *queue =
		container_of(w, struct nvmet_tcp_queue, release_work);

	mutex_lock(&nvmet_tcp_queue_mutex);
	list_del_init(&queue->queue_list);
	mutex_unlock(&nvmet_tcp_queue_mutex);

	nvmet_tcp_restore_socket_callbacks(queue);
	flush_work(&queue->io_work);

	nvmet_tcp_uninit_data_in_cmds(queue);
	nvmet_sq_destroy(&queue->nvme_sq);
	cancel_work_sync(&queue->io_work);
	sock_release(queue->sock);
	nvmet_tcp_free_cmds(queue);
	if (queue->hdr_digest || queue->data_digest)
		nvmet_tcp_free_crypto(queue);
	ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);

	kfree(queue);
}
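
/*
 * Socket callbacks installed on an accepted connection. data_ready kicks
 * io_work on the queue's CPU; write_space defers to the original callback
 * while the queue is still connecting and otherwise re-arms io_work once
 * the socket is writeable again; state_change schedules queue release when
 * the peer starts tearing the connection down.
 */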
static void nvmet_tcp_data_ready(struct sock *sk)
{
	struct nvmet_tcp_queue *queue;

	read_lock_bh(&sk->sk_callback_lock);
	queue = sk->sk_user_data;
	if (likely(queue))
		queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
	read_unlock_bh(&sk->sk_callback_lock);
}

static void nvmet_tcp_write_space(struct sock *sk)
{
	struct nvmet_tcp_queue *queue;

	read_lock_bh(&sk->sk_callback_lock);
	queue = sk->sk_user_data;
	if (unlikely(!queue))
		goto out;

	if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
		queue->write_space(sk);
		goto out;
	}

	if (sk_stream_is_writeable(sk)) {
		clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
	}
out:
	read_unlock_bh(&sk->sk_callback_lock);
}

static void nvmet_tcp_state_change(struct sock *sk)
{
	struct nvmet_tcp_queue *queue;

	write_lock_bh(&sk->sk_callback_lock);
	queue = sk->sk_user_data;
	if (!queue)
		goto done;

	switch (sk->sk_state) {
	case TCP_FIN_WAIT1:
	case TCP_CLOSE_WAIT:
	case TCP_CLOSE:
		/* FALLTHRU */
		sk->sk_user_data = NULL;
		nvmet_tcp_schedule_release_queue(queue);
		break;
	default:
		pr_warn("queue %d unhandled state %d\n",
			queue->idx, sk->sk_state);
	}
done:
	write_unlock_bh(&sk->sk_callback_lock);
}
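
/*
 * Finalize the accepted socket: record the local and peer addresses,
 * disable lingering, apply the optional priority and TOS, and take over
 * the sk callbacks under sk_callback_lock, saving the originals so they
 * can be restored when the queue is released.
 */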
static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
{
	struct socket *sock = queue->sock;
	struct inet_sock *inet = inet_sk(sock->sk);
	int ret;

	ret = kernel_getsockname(sock,
		(struct sockaddr *)&queue->sockaddr);
	if (ret < 0)
		return ret;

	ret = kernel_getpeername(sock,
		(struct sockaddr *)&queue->sockaddr_peer);
	if (ret < 0)
		return ret;

	/*
	 * Cleanup whatever is sitting in the TCP transmit queue on socket
	 * close. This is done to prevent stale data from being sent should
	 * the network connection be restored before TCP times out.
	 */
	sock_no_linger(sock->sk);

	if (so_priority > 0)
		sock_set_priority(sock->sk, so_priority);

	/* Set socket type of service */
	if (inet->rcv_tos > 0)
		ip_sock_set_tos(sock->sk, inet->rcv_tos);

	write_lock_bh(&sock->sk->sk_callback_lock);
	sock->sk->sk_user_data = queue;
	queue->data_ready = sock->sk->sk_data_ready;
	sock->sk->sk_data_ready = nvmet_tcp_data_ready;
	queue->state_change = sock->sk->sk_state_change;
	sock->sk->sk_state_change = nvmet_tcp_state_change;
	queue->write_space = sock->sk->sk_write_space;
	sock->sk->sk_write_space = nvmet_tcp_write_space;
	write_unlock_bh(&sock->sk->sk_callback_lock);

	return 0;
}

static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
		struct socket *newsock)
{
	struct nvmet_tcp_queue *queue;
	int ret;

	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
	if (!queue)
		return -ENOMEM;

	INIT_WORK(&queue->release_work, nvmet_tcp_release_queue_work);
	INIT_WORK(&queue->io_work, nvmet_tcp_io_work);
	queue->sock = newsock;
	queue->port = port;
	queue->nr_cmds = 0;
	spin_lock_init(&queue->state_lock);
	queue->state = NVMET_TCP_Q_CONNECTING;
	INIT_LIST_HEAD(&queue->free_list);
	init_llist_head(&queue->resp_list);
	INIT_LIST_HEAD(&queue->resp_send_list);

	queue->idx = ida_simple_get(&nvmet_tcp_queue_ida, 0, 0, GFP_KERNEL);
	if (queue->idx < 0) {
		ret = queue->idx;
		goto out_free_queue;
	}

	ret = nvmet_tcp_alloc_cmd(queue, &queue->connect);
	if (ret)
		goto out_ida_remove;

	ret = nvmet_sq_init(&queue->nvme_sq);
	if (ret)
		goto out_free_connect;

	port->last_cpu = cpumask_next_wrap(port->last_cpu,
				cpu_online_mask, -1, false);
	queue->cpu = port->last_cpu;
	nvmet_prepare_receive_pdu(queue);

	mutex_lock(&nvmet_tcp_queue_mutex);
	list_add_tail(&queue->queue_list, &nvmet_tcp_queue_list);
	mutex_unlock(&nvmet_tcp_queue_mutex);

	ret = nvmet_tcp_set_queue_sock(queue);
	if (ret)
		goto out_destroy_sq;

	queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);

	return 0;
out_destroy_sq:
	mutex_lock(&nvmet_tcp_queue_mutex);
	list_del_init(&queue->queue_list);
	mutex_unlock(&nvmet_tcp_queue_mutex);
	nvmet_sq_destroy(&queue->nvme_sq);
out_free_connect:
	nvmet_tcp_free_cmd(&queue->connect);
out_ida_remove:
	ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);
out_free_queue:
	kfree(queue);
	return ret;
}
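
/*
 * Accept work for a listening port: drain kernel_accept() until it returns
 * -EAGAIN, allocating a queue for every new connection. It is scheduled
 * from the listening socket's data_ready callback.
 */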
static void nvmet_tcp_accept_work(struct work_struct *w)
{
	struct nvmet_tcp_port *port =
		container_of(w, struct nvmet_tcp_port, accept_work);
	struct socket *newsock;
	int ret;

	while (true) {
		ret = kernel_accept(port->sock, &newsock, O_NONBLOCK);
		if (ret < 0) {
			if (ret != -EAGAIN)
				pr_warn("failed to accept err=%d\n", ret);
			return;
		}
		ret = nvmet_tcp_alloc_queue(port, newsock);
		if (ret) {
			pr_err("failed to allocate queue\n");
			sock_release(newsock);
		}
	}
}

static void nvmet_tcp_listen_data_ready(struct sock *sk)
{
	struct nvmet_tcp_port *port;

	read_lock_bh(&sk->sk_callback_lock);
	port = sk->sk_user_data;
	if (!port)
		goto out;

	if (sk->sk_state == TCP_LISTEN)
		schedule_work(&port->accept_work);
out:
	read_unlock_bh(&sk->sk_callback_lock);
}

static int nvmet_tcp_add_port(struct nvmet_port *nport)
{
	struct nvmet_tcp_port *port;
	__kernel_sa_family_t af;
	int ret;

	port = kzalloc(sizeof(*port), GFP_KERNEL);
	if (!port)
		return -ENOMEM;

	switch (nport->disc_addr.adrfam) {
	case NVMF_ADDR_FAMILY_IP4:
		af = AF_INET;
		break;
	case NVMF_ADDR_FAMILY_IP6:
		af = AF_INET6;
		break;
	default:
		pr_err("address family %d not supported\n",
				nport->disc_addr.adrfam);
		ret = -EINVAL;
		goto err_port;
	}

	ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr,
			nport->disc_addr.trsvcid, &port->addr);
	if (ret) {
		pr_err("malformed ip/port passed: %s:%s\n",
			nport->disc_addr.traddr, nport->disc_addr.trsvcid);
		goto err_port;
	}

	port->nport = nport;
	port->last_cpu = -1;
	INIT_WORK(&port->accept_work, nvmet_tcp_accept_work);
	if (port->nport->inline_data_size < 0)
		port->nport->inline_data_size = NVMET_TCP_DEF_INLINE_DATA_SIZE;

	ret = sock_create(port->addr.ss_family, SOCK_STREAM,
				IPPROTO_TCP, &port->sock);
	if (ret) {
		pr_err("failed to create a socket\n");
		goto err_port;
	}

	port->sock->sk->sk_user_data = port;
	port->data_ready = port->sock->sk->sk_data_ready;
	port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready;
	sock_set_reuseaddr(port->sock->sk);
	tcp_sock_set_nodelay(port->sock->sk);
	if (so_priority > 0)
		sock_set_priority(port->sock->sk, so_priority);

	ret = kernel_bind(port->sock, (struct sockaddr *)&port->addr,
			sizeof(port->addr));
	if (ret) {
		pr_err("failed to bind port socket %d\n", ret);
		goto err_sock;
	}

	ret = kernel_listen(port->sock, 128);
	if (ret) {
		pr_err("failed to listen %d on port sock\n", ret);
		goto err_sock;
	}

	nport->priv = port;
	pr_info("enabling port %d (%pISpc)\n",
		le16_to_cpu(nport->disc_addr.portid), &port->addr);

	return 0;

err_sock:
	sock_release(port->sock);
err_port:
	kfree(port);
	return ret;
}

static void nvmet_tcp_remove_port(struct nvmet_port *nport)
{
	struct nvmet_tcp_port *port = nport->priv;

	write_lock_bh(&port->sock->sk->sk_callback_lock);
	port->sock->sk->sk_data_ready = port->data_ready;
	port->sock->sk->sk_user_data = NULL;
	write_unlock_bh(&port->sock->sk->sk_callback_lock);
	cancel_work_sync(&port->accept_work);

	sock_release(port->sock);
	kfree(port);
}

static void nvmet_tcp_delete_ctrl(struct nvmet_ctrl *ctrl)
{
	struct nvmet_tcp_queue *queue;

	mutex_lock(&nvmet_tcp_queue_mutex);
	list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
		if (queue->nvme_sq.ctrl == ctrl)
			kernel_sock_shutdown(queue->sock, SHUT_RDWR);
	mutex_unlock(&nvmet_tcp_queue_mutex);
}

static u16 nvmet_tcp_install_queue(struct nvmet_sq *sq)
{
	struct nvmet_tcp_queue *queue =
		container_of(sq, struct nvmet_tcp_queue, nvme_sq);

	if (sq->qid == 0) {
		/* Let inflight controller teardown complete */
		flush_scheduled_work();
	}

	queue->nr_cmds = sq->size * 2;
	if (nvmet_tcp_alloc_cmds(queue))
		return NVME_SC_INTERNAL;
	return 0;
}

static void nvmet_tcp_disc_port_addr(struct nvmet_req *req,
		struct nvmet_port *nport, char *traddr)
{
	struct nvmet_tcp_port *port = nport->priv;

	if (inet_addr_is_any((struct sockaddr *)&port->addr)) {
		struct nvmet_tcp_cmd *cmd =
			container_of(req, struct nvmet_tcp_cmd, req);
		struct nvmet_tcp_queue *queue = cmd->queue;

		sprintf(traddr, "%pISc", (struct sockaddr *)&queue->sockaddr);
	} else {
		memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE);
	}
}

static const struct nvmet_fabrics_ops nvmet_tcp_ops = {
	.owner			= THIS_MODULE,
	.type			= NVMF_TRTYPE_TCP,
	.msdbd			= 1,
	.add_port		= nvmet_tcp_add_port,
	.remove_port		= nvmet_tcp_remove_port,
	.queue_response		= nvmet_tcp_queue_response,
	.delete_ctrl		= nvmet_tcp_delete_ctrl,
	.install_queue		= nvmet_tcp_install_queue,
	.disc_traddr		= nvmet_tcp_disc_port_addr,
};

static int __init nvmet_tcp_init(void)
{
	int ret;

	nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq", WQ_HIGHPRI, 0);
	if (!nvmet_tcp_wq)
		return -ENOMEM;

	ret = nvmet_register_transport(&nvmet_tcp_ops);
	if (ret)
		goto err;

	return 0;
err:
	destroy_workqueue(nvmet_tcp_wq);
	return ret;
}

static void __exit nvmet_tcp_exit(void)
{
	struct nvmet_tcp_queue *queue;

	nvmet_unregister_transport(&nvmet_tcp_ops);

	flush_scheduled_work();
	mutex_lock(&nvmet_tcp_queue_mutex);
	list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
		kernel_sock_shutdown(queue->sock, SHUT_RDWR);
	mutex_unlock(&nvmet_tcp_queue_mutex);
	flush_scheduled_work();

	destroy_workqueue(nvmet_tcp_wq);
}

module_init(nvmet_tcp_init);
module_exit(nvmet_tcp_exit);

MODULE_LICENSE("GPL v2");
MODULE_ALIAS("nvmet-transport-3"); /* 3 == NVMF_TRTYPE_TCP */