1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * NVMe over Fabrics TCP target. 4 * Copyright (c) 2018 Lightbits Labs. All rights reserved. 5 */ 6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 7 #include <linux/module.h> 8 #include <linux/init.h> 9 #include <linux/slab.h> 10 #include <linux/err.h> 11 #include <linux/nvme-tcp.h> 12 #include <net/sock.h> 13 #include <net/tcp.h> 14 #include <linux/inet.h> 15 #include <linux/llist.h> 16 #include <crypto/hash.h> 17 18 #include "nvmet.h" 19 20 #define NVMET_TCP_DEF_INLINE_DATA_SIZE (4 * PAGE_SIZE) 21 22 /* Define the socket priority to use for connections were it is desirable 23 * that the NIC consider performing optimized packet processing or filtering. 24 * A non-zero value being sufficient to indicate general consideration of any 25 * possible optimization. Making it a module param allows for alternative 26 * values that may be unique for some NIC implementations. 27 */ 28 static int so_priority; 29 module_param(so_priority, int, 0644); 30 MODULE_PARM_DESC(so_priority, "nvmet tcp socket optimize priority"); 31 32 #define NVMET_TCP_RECV_BUDGET 8 33 #define NVMET_TCP_SEND_BUDGET 8 34 #define NVMET_TCP_IO_WORK_BUDGET 64 35 36 enum nvmet_tcp_send_state { 37 NVMET_TCP_SEND_DATA_PDU, 38 NVMET_TCP_SEND_DATA, 39 NVMET_TCP_SEND_R2T, 40 NVMET_TCP_SEND_DDGST, 41 NVMET_TCP_SEND_RESPONSE 42 }; 43 44 enum nvmet_tcp_recv_state { 45 NVMET_TCP_RECV_PDU, 46 NVMET_TCP_RECV_DATA, 47 NVMET_TCP_RECV_DDGST, 48 NVMET_TCP_RECV_ERR, 49 }; 50 51 enum { 52 NVMET_TCP_F_INIT_FAILED = (1 << 0), 53 }; 54 55 struct nvmet_tcp_cmd { 56 struct nvmet_tcp_queue *queue; 57 struct nvmet_req req; 58 59 struct nvme_tcp_cmd_pdu *cmd_pdu; 60 struct nvme_tcp_rsp_pdu *rsp_pdu; 61 struct nvme_tcp_data_pdu *data_pdu; 62 struct nvme_tcp_r2t_pdu *r2t_pdu; 63 64 u32 rbytes_done; 65 u32 wbytes_done; 66 67 u32 pdu_len; 68 u32 pdu_recv; 69 int sg_idx; 70 int nr_mapped; 71 struct msghdr recv_msg; 72 struct kvec *iov; 73 u32 flags; 74 75 struct list_head entry; 76 struct llist_node lentry; 77 78 /* send state */ 79 u32 offset; 80 struct scatterlist *cur_sg; 81 enum nvmet_tcp_send_state state; 82 83 __le32 exp_ddgst; 84 __le32 recv_ddgst; 85 }; 86 87 enum nvmet_tcp_queue_state { 88 NVMET_TCP_Q_CONNECTING, 89 NVMET_TCP_Q_LIVE, 90 NVMET_TCP_Q_DISCONNECTING, 91 }; 92 93 struct nvmet_tcp_queue { 94 struct socket *sock; 95 struct nvmet_tcp_port *port; 96 struct work_struct io_work; 97 struct nvmet_cq nvme_cq; 98 struct nvmet_sq nvme_sq; 99 100 /* send state */ 101 struct nvmet_tcp_cmd *cmds; 102 unsigned int nr_cmds; 103 struct list_head free_list; 104 struct llist_head resp_list; 105 struct list_head resp_send_list; 106 int send_list_len; 107 struct nvmet_tcp_cmd *snd_cmd; 108 109 /* recv state */ 110 int offset; 111 int left; 112 enum nvmet_tcp_recv_state rcv_state; 113 struct nvmet_tcp_cmd *cmd; 114 union nvme_tcp_pdu pdu; 115 116 /* digest state */ 117 bool hdr_digest; 118 bool data_digest; 119 struct ahash_request *snd_hash; 120 struct ahash_request *rcv_hash; 121 122 spinlock_t state_lock; 123 enum nvmet_tcp_queue_state state; 124 125 struct sockaddr_storage sockaddr; 126 struct sockaddr_storage sockaddr_peer; 127 struct work_struct release_work; 128 129 int idx; 130 struct list_head queue_list; 131 132 struct nvmet_tcp_cmd connect; 133 134 struct page_frag_cache pf_cache; 135 136 void (*data_ready)(struct sock *); 137 void (*state_change)(struct sock *); 138 void (*write_space)(struct sock *); 139 }; 140 141 struct nvmet_tcp_port { 142 struct socket *sock; 143 struct work_struct accept_work; 144 struct nvmet_port *nport; 145 struct sockaddr_storage addr; 146 void (*data_ready)(struct sock *); 147 }; 148 149 static DEFINE_IDA(nvmet_tcp_queue_ida); 150 static LIST_HEAD(nvmet_tcp_queue_list); 151 static DEFINE_MUTEX(nvmet_tcp_queue_mutex); 152 153 static struct workqueue_struct *nvmet_tcp_wq; 154 static const struct nvmet_fabrics_ops nvmet_tcp_ops; 155 static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c); 156 static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd); 157 158 static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue, 159 struct nvmet_tcp_cmd *cmd) 160 { 161 if (unlikely(!queue->nr_cmds)) { 162 /* We didn't allocate cmds yet, send 0xffff */ 163 return USHRT_MAX; 164 } 165 166 return cmd - queue->cmds; 167 } 168 169 static inline bool nvmet_tcp_has_data_in(struct nvmet_tcp_cmd *cmd) 170 { 171 return nvme_is_write(cmd->req.cmd) && 172 cmd->rbytes_done < cmd->req.transfer_len; 173 } 174 175 static inline bool nvmet_tcp_need_data_in(struct nvmet_tcp_cmd *cmd) 176 { 177 return nvmet_tcp_has_data_in(cmd) && !cmd->req.cqe->status; 178 } 179 180 static inline bool nvmet_tcp_need_data_out(struct nvmet_tcp_cmd *cmd) 181 { 182 return !nvme_is_write(cmd->req.cmd) && 183 cmd->req.transfer_len > 0 && 184 !cmd->req.cqe->status; 185 } 186 187 static inline bool nvmet_tcp_has_inline_data(struct nvmet_tcp_cmd *cmd) 188 { 189 return nvme_is_write(cmd->req.cmd) && cmd->pdu_len && 190 !cmd->rbytes_done; 191 } 192 193 static inline struct nvmet_tcp_cmd * 194 nvmet_tcp_get_cmd(struct nvmet_tcp_queue *queue) 195 { 196 struct nvmet_tcp_cmd *cmd; 197 198 cmd = list_first_entry_or_null(&queue->free_list, 199 struct nvmet_tcp_cmd, entry); 200 if (!cmd) 201 return NULL; 202 list_del_init(&cmd->entry); 203 204 cmd->rbytes_done = cmd->wbytes_done = 0; 205 cmd->pdu_len = 0; 206 cmd->pdu_recv = 0; 207 cmd->iov = NULL; 208 cmd->flags = 0; 209 return cmd; 210 } 211 212 static inline void nvmet_tcp_put_cmd(struct nvmet_tcp_cmd *cmd) 213 { 214 if (unlikely(cmd == &cmd->queue->connect)) 215 return; 216 217 list_add_tail(&cmd->entry, &cmd->queue->free_list); 218 } 219 220 static inline int queue_cpu(struct nvmet_tcp_queue *queue) 221 { 222 return queue->sock->sk->sk_incoming_cpu; 223 } 224 225 static inline u8 nvmet_tcp_hdgst_len(struct nvmet_tcp_queue *queue) 226 { 227 return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0; 228 } 229 230 static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue) 231 { 232 return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0; 233 } 234 235 static inline void nvmet_tcp_hdgst(struct ahash_request *hash, 236 void *pdu, size_t len) 237 { 238 struct scatterlist sg; 239 240 sg_init_one(&sg, pdu, len); 241 ahash_request_set_crypt(hash, &sg, pdu + len, len); 242 crypto_ahash_digest(hash); 243 } 244 245 static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue, 246 void *pdu, size_t len) 247 { 248 struct nvme_tcp_hdr *hdr = pdu; 249 __le32 recv_digest; 250 __le32 exp_digest; 251 252 if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) { 253 pr_err("queue %d: header digest enabled but no header digest\n", 254 queue->idx); 255 return -EPROTO; 256 } 257 258 recv_digest = *(__le32 *)(pdu + hdr->hlen); 259 nvmet_tcp_hdgst(queue->rcv_hash, pdu, len); 260 exp_digest = *(__le32 *)(pdu + hdr->hlen); 261 if (recv_digest != exp_digest) { 262 pr_err("queue %d: header digest error: recv %#x expected %#x\n", 263 queue->idx, le32_to_cpu(recv_digest), 264 le32_to_cpu(exp_digest)); 265 return -EPROTO; 266 } 267 268 return 0; 269 } 270 271 static int nvmet_tcp_check_ddgst(struct nvmet_tcp_queue *queue, void *pdu) 272 { 273 struct nvme_tcp_hdr *hdr = pdu; 274 u8 digest_len = nvmet_tcp_hdgst_len(queue); 275 u32 len; 276 277 len = le32_to_cpu(hdr->plen) - hdr->hlen - 278 (hdr->flags & NVME_TCP_F_HDGST ? digest_len : 0); 279 280 if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) { 281 pr_err("queue %d: data digest flag is cleared\n", queue->idx); 282 return -EPROTO; 283 } 284 285 return 0; 286 } 287 288 static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd) 289 { 290 struct scatterlist *sg; 291 int i; 292 293 sg = &cmd->req.sg[cmd->sg_idx]; 294 295 for (i = 0; i < cmd->nr_mapped; i++) 296 kunmap(sg_page(&sg[i])); 297 } 298 299 static void nvmet_tcp_map_pdu_iovec(struct nvmet_tcp_cmd *cmd) 300 { 301 struct kvec *iov = cmd->iov; 302 struct scatterlist *sg; 303 u32 length, offset, sg_offset; 304 305 length = cmd->pdu_len; 306 cmd->nr_mapped = DIV_ROUND_UP(length, PAGE_SIZE); 307 offset = cmd->rbytes_done; 308 cmd->sg_idx = offset / PAGE_SIZE; 309 sg_offset = offset % PAGE_SIZE; 310 sg = &cmd->req.sg[cmd->sg_idx]; 311 312 while (length) { 313 u32 iov_len = min_t(u32, length, sg->length - sg_offset); 314 315 iov->iov_base = kmap(sg_page(sg)) + sg->offset + sg_offset; 316 iov->iov_len = iov_len; 317 318 length -= iov_len; 319 sg = sg_next(sg); 320 iov++; 321 sg_offset = 0; 322 } 323 324 iov_iter_kvec(&cmd->recv_msg.msg_iter, READ, cmd->iov, 325 cmd->nr_mapped, cmd->pdu_len); 326 } 327 328 static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue) 329 { 330 queue->rcv_state = NVMET_TCP_RECV_ERR; 331 if (queue->nvme_sq.ctrl) 332 nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl); 333 else 334 kernel_sock_shutdown(queue->sock, SHUT_RDWR); 335 } 336 337 static void nvmet_tcp_socket_error(struct nvmet_tcp_queue *queue, int status) 338 { 339 if (status == -EPIPE || status == -ECONNRESET) 340 kernel_sock_shutdown(queue->sock, SHUT_RDWR); 341 else 342 nvmet_tcp_fatal_error(queue); 343 } 344 345 static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd) 346 { 347 struct nvme_sgl_desc *sgl = &cmd->req.cmd->common.dptr.sgl; 348 u32 len = le32_to_cpu(sgl->length); 349 350 if (!len) 351 return 0; 352 353 if (sgl->type == ((NVME_SGL_FMT_DATA_DESC << 4) | 354 NVME_SGL_FMT_OFFSET)) { 355 if (!nvme_is_write(cmd->req.cmd)) 356 return NVME_SC_INVALID_FIELD | NVME_SC_DNR; 357 358 if (len > cmd->req.port->inline_data_size) 359 return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR; 360 cmd->pdu_len = len; 361 } 362 cmd->req.transfer_len += len; 363 364 cmd->req.sg = sgl_alloc(len, GFP_KERNEL, &cmd->req.sg_cnt); 365 if (!cmd->req.sg) 366 return NVME_SC_INTERNAL; 367 cmd->cur_sg = cmd->req.sg; 368 369 if (nvmet_tcp_has_data_in(cmd)) { 370 cmd->iov = kmalloc_array(cmd->req.sg_cnt, 371 sizeof(*cmd->iov), GFP_KERNEL); 372 if (!cmd->iov) 373 goto err; 374 } 375 376 return 0; 377 err: 378 sgl_free(cmd->req.sg); 379 return NVME_SC_INTERNAL; 380 } 381 382 static void nvmet_tcp_send_ddgst(struct ahash_request *hash, 383 struct nvmet_tcp_cmd *cmd) 384 { 385 ahash_request_set_crypt(hash, cmd->req.sg, 386 (void *)&cmd->exp_ddgst, cmd->req.transfer_len); 387 crypto_ahash_digest(hash); 388 } 389 390 static void nvmet_tcp_recv_ddgst(struct ahash_request *hash, 391 struct nvmet_tcp_cmd *cmd) 392 { 393 struct scatterlist sg; 394 struct kvec *iov; 395 int i; 396 397 crypto_ahash_init(hash); 398 for (i = 0, iov = cmd->iov; i < cmd->nr_mapped; i++, iov++) { 399 sg_init_one(&sg, iov->iov_base, iov->iov_len); 400 ahash_request_set_crypt(hash, &sg, NULL, iov->iov_len); 401 crypto_ahash_update(hash); 402 } 403 ahash_request_set_crypt(hash, NULL, (void *)&cmd->exp_ddgst, 0); 404 crypto_ahash_final(hash); 405 } 406 407 static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd) 408 { 409 struct nvme_tcp_data_pdu *pdu = cmd->data_pdu; 410 struct nvmet_tcp_queue *queue = cmd->queue; 411 u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); 412 u8 ddgst = nvmet_tcp_ddgst_len(cmd->queue); 413 414 cmd->offset = 0; 415 cmd->state = NVMET_TCP_SEND_DATA_PDU; 416 417 pdu->hdr.type = nvme_tcp_c2h_data; 418 pdu->hdr.flags = NVME_TCP_F_DATA_LAST | (queue->nvme_sq.sqhd_disabled ? 419 NVME_TCP_F_DATA_SUCCESS : 0); 420 pdu->hdr.hlen = sizeof(*pdu); 421 pdu->hdr.pdo = pdu->hdr.hlen + hdgst; 422 pdu->hdr.plen = 423 cpu_to_le32(pdu->hdr.hlen + hdgst + 424 cmd->req.transfer_len + ddgst); 425 pdu->command_id = cmd->req.cqe->command_id; 426 pdu->data_length = cpu_to_le32(cmd->req.transfer_len); 427 pdu->data_offset = cpu_to_le32(cmd->wbytes_done); 428 429 if (queue->data_digest) { 430 pdu->hdr.flags |= NVME_TCP_F_DDGST; 431 nvmet_tcp_send_ddgst(queue->snd_hash, cmd); 432 } 433 434 if (cmd->queue->hdr_digest) { 435 pdu->hdr.flags |= NVME_TCP_F_HDGST; 436 nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); 437 } 438 } 439 440 static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd) 441 { 442 struct nvme_tcp_r2t_pdu *pdu = cmd->r2t_pdu; 443 struct nvmet_tcp_queue *queue = cmd->queue; 444 u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); 445 446 cmd->offset = 0; 447 cmd->state = NVMET_TCP_SEND_R2T; 448 449 pdu->hdr.type = nvme_tcp_r2t; 450 pdu->hdr.flags = 0; 451 pdu->hdr.hlen = sizeof(*pdu); 452 pdu->hdr.pdo = 0; 453 pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst); 454 455 pdu->command_id = cmd->req.cmd->common.command_id; 456 pdu->ttag = nvmet_tcp_cmd_tag(cmd->queue, cmd); 457 pdu->r2t_length = cpu_to_le32(cmd->req.transfer_len - cmd->rbytes_done); 458 pdu->r2t_offset = cpu_to_le32(cmd->rbytes_done); 459 if (cmd->queue->hdr_digest) { 460 pdu->hdr.flags |= NVME_TCP_F_HDGST; 461 nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); 462 } 463 } 464 465 static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd) 466 { 467 struct nvme_tcp_rsp_pdu *pdu = cmd->rsp_pdu; 468 struct nvmet_tcp_queue *queue = cmd->queue; 469 u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); 470 471 cmd->offset = 0; 472 cmd->state = NVMET_TCP_SEND_RESPONSE; 473 474 pdu->hdr.type = nvme_tcp_rsp; 475 pdu->hdr.flags = 0; 476 pdu->hdr.hlen = sizeof(*pdu); 477 pdu->hdr.pdo = 0; 478 pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst); 479 if (cmd->queue->hdr_digest) { 480 pdu->hdr.flags |= NVME_TCP_F_HDGST; 481 nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); 482 } 483 } 484 485 static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue) 486 { 487 struct llist_node *node; 488 struct nvmet_tcp_cmd *cmd; 489 490 for (node = llist_del_all(&queue->resp_list); node; node = node->next) { 491 cmd = llist_entry(node, struct nvmet_tcp_cmd, lentry); 492 list_add(&cmd->entry, &queue->resp_send_list); 493 queue->send_list_len++; 494 } 495 } 496 497 static struct nvmet_tcp_cmd *nvmet_tcp_fetch_cmd(struct nvmet_tcp_queue *queue) 498 { 499 queue->snd_cmd = list_first_entry_or_null(&queue->resp_send_list, 500 struct nvmet_tcp_cmd, entry); 501 if (!queue->snd_cmd) { 502 nvmet_tcp_process_resp_list(queue); 503 queue->snd_cmd = 504 list_first_entry_or_null(&queue->resp_send_list, 505 struct nvmet_tcp_cmd, entry); 506 if (unlikely(!queue->snd_cmd)) 507 return NULL; 508 } 509 510 list_del_init(&queue->snd_cmd->entry); 511 queue->send_list_len--; 512 513 if (nvmet_tcp_need_data_out(queue->snd_cmd)) 514 nvmet_setup_c2h_data_pdu(queue->snd_cmd); 515 else if (nvmet_tcp_need_data_in(queue->snd_cmd)) 516 nvmet_setup_r2t_pdu(queue->snd_cmd); 517 else 518 nvmet_setup_response_pdu(queue->snd_cmd); 519 520 return queue->snd_cmd; 521 } 522 523 static void nvmet_tcp_queue_response(struct nvmet_req *req) 524 { 525 struct nvmet_tcp_cmd *cmd = 526 container_of(req, struct nvmet_tcp_cmd, req); 527 struct nvmet_tcp_queue *queue = cmd->queue; 528 529 llist_add(&cmd->lentry, &queue->resp_list); 530 queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &cmd->queue->io_work); 531 } 532 533 static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd) 534 { 535 u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); 536 int left = sizeof(*cmd->data_pdu) - cmd->offset + hdgst; 537 int ret; 538 539 ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->data_pdu), 540 offset_in_page(cmd->data_pdu) + cmd->offset, 541 left, MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST); 542 if (ret <= 0) 543 return ret; 544 545 cmd->offset += ret; 546 left -= ret; 547 548 if (left) 549 return -EAGAIN; 550 551 cmd->state = NVMET_TCP_SEND_DATA; 552 cmd->offset = 0; 553 return 1; 554 } 555 556 static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd, bool last_in_batch) 557 { 558 struct nvmet_tcp_queue *queue = cmd->queue; 559 int ret; 560 561 while (cmd->cur_sg) { 562 struct page *page = sg_page(cmd->cur_sg); 563 u32 left = cmd->cur_sg->length - cmd->offset; 564 int flags = MSG_DONTWAIT; 565 566 if ((!last_in_batch && cmd->queue->send_list_len) || 567 cmd->wbytes_done + left < cmd->req.transfer_len || 568 queue->data_digest || !queue->nvme_sq.sqhd_disabled) 569 flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST; 570 571 ret = kernel_sendpage(cmd->queue->sock, page, cmd->offset, 572 left, flags); 573 if (ret <= 0) 574 return ret; 575 576 cmd->offset += ret; 577 cmd->wbytes_done += ret; 578 579 /* Done with sg?*/ 580 if (cmd->offset == cmd->cur_sg->length) { 581 cmd->cur_sg = sg_next(cmd->cur_sg); 582 cmd->offset = 0; 583 } 584 } 585 586 if (queue->data_digest) { 587 cmd->state = NVMET_TCP_SEND_DDGST; 588 cmd->offset = 0; 589 } else { 590 if (queue->nvme_sq.sqhd_disabled) { 591 cmd->queue->snd_cmd = NULL; 592 nvmet_tcp_put_cmd(cmd); 593 } else { 594 nvmet_setup_response_pdu(cmd); 595 } 596 } 597 598 if (queue->nvme_sq.sqhd_disabled) { 599 kfree(cmd->iov); 600 sgl_free(cmd->req.sg); 601 } 602 603 return 1; 604 605 } 606 607 static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd, 608 bool last_in_batch) 609 { 610 u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); 611 int left = sizeof(*cmd->rsp_pdu) - cmd->offset + hdgst; 612 int flags = MSG_DONTWAIT; 613 int ret; 614 615 if (!last_in_batch && cmd->queue->send_list_len) 616 flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST; 617 else 618 flags |= MSG_EOR; 619 620 ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->rsp_pdu), 621 offset_in_page(cmd->rsp_pdu) + cmd->offset, left, flags); 622 if (ret <= 0) 623 return ret; 624 cmd->offset += ret; 625 left -= ret; 626 627 if (left) 628 return -EAGAIN; 629 630 kfree(cmd->iov); 631 sgl_free(cmd->req.sg); 632 cmd->queue->snd_cmd = NULL; 633 nvmet_tcp_put_cmd(cmd); 634 return 1; 635 } 636 637 static int nvmet_try_send_r2t(struct nvmet_tcp_cmd *cmd, bool last_in_batch) 638 { 639 u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); 640 int left = sizeof(*cmd->r2t_pdu) - cmd->offset + hdgst; 641 int flags = MSG_DONTWAIT; 642 int ret; 643 644 if (!last_in_batch && cmd->queue->send_list_len) 645 flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST; 646 else 647 flags |= MSG_EOR; 648 649 ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->r2t_pdu), 650 offset_in_page(cmd->r2t_pdu) + cmd->offset, left, flags); 651 if (ret <= 0) 652 return ret; 653 cmd->offset += ret; 654 left -= ret; 655 656 if (left) 657 return -EAGAIN; 658 659 cmd->queue->snd_cmd = NULL; 660 return 1; 661 } 662 663 static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd, bool last_in_batch) 664 { 665 struct nvmet_tcp_queue *queue = cmd->queue; 666 struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; 667 struct kvec iov = { 668 .iov_base = &cmd->exp_ddgst + cmd->offset, 669 .iov_len = NVME_TCP_DIGEST_LENGTH - cmd->offset 670 }; 671 int ret; 672 673 if (!last_in_batch && cmd->queue->send_list_len) 674 msg.msg_flags |= MSG_MORE; 675 else 676 msg.msg_flags |= MSG_EOR; 677 678 ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); 679 if (unlikely(ret <= 0)) 680 return ret; 681 682 cmd->offset += ret; 683 684 if (queue->nvme_sq.sqhd_disabled) { 685 cmd->queue->snd_cmd = NULL; 686 nvmet_tcp_put_cmd(cmd); 687 } else { 688 nvmet_setup_response_pdu(cmd); 689 } 690 return 1; 691 } 692 693 static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue, 694 bool last_in_batch) 695 { 696 struct nvmet_tcp_cmd *cmd = queue->snd_cmd; 697 int ret = 0; 698 699 if (!cmd || queue->state == NVMET_TCP_Q_DISCONNECTING) { 700 cmd = nvmet_tcp_fetch_cmd(queue); 701 if (unlikely(!cmd)) 702 return 0; 703 } 704 705 if (cmd->state == NVMET_TCP_SEND_DATA_PDU) { 706 ret = nvmet_try_send_data_pdu(cmd); 707 if (ret <= 0) 708 goto done_send; 709 } 710 711 if (cmd->state == NVMET_TCP_SEND_DATA) { 712 ret = nvmet_try_send_data(cmd, last_in_batch); 713 if (ret <= 0) 714 goto done_send; 715 } 716 717 if (cmd->state == NVMET_TCP_SEND_DDGST) { 718 ret = nvmet_try_send_ddgst(cmd, last_in_batch); 719 if (ret <= 0) 720 goto done_send; 721 } 722 723 if (cmd->state == NVMET_TCP_SEND_R2T) { 724 ret = nvmet_try_send_r2t(cmd, last_in_batch); 725 if (ret <= 0) 726 goto done_send; 727 } 728 729 if (cmd->state == NVMET_TCP_SEND_RESPONSE) 730 ret = nvmet_try_send_response(cmd, last_in_batch); 731 732 done_send: 733 if (ret < 0) { 734 if (ret == -EAGAIN) 735 return 0; 736 return ret; 737 } 738 739 return 1; 740 } 741 742 static int nvmet_tcp_try_send(struct nvmet_tcp_queue *queue, 743 int budget, int *sends) 744 { 745 int i, ret = 0; 746 747 for (i = 0; i < budget; i++) { 748 ret = nvmet_tcp_try_send_one(queue, i == budget - 1); 749 if (unlikely(ret < 0)) { 750 nvmet_tcp_socket_error(queue, ret); 751 goto done; 752 } else if (ret == 0) { 753 break; 754 } 755 (*sends)++; 756 } 757 done: 758 return ret; 759 } 760 761 static void nvmet_prepare_receive_pdu(struct nvmet_tcp_queue *queue) 762 { 763 queue->offset = 0; 764 queue->left = sizeof(struct nvme_tcp_hdr); 765 queue->cmd = NULL; 766 queue->rcv_state = NVMET_TCP_RECV_PDU; 767 } 768 769 static void nvmet_tcp_free_crypto(struct nvmet_tcp_queue *queue) 770 { 771 struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash); 772 773 ahash_request_free(queue->rcv_hash); 774 ahash_request_free(queue->snd_hash); 775 crypto_free_ahash(tfm); 776 } 777 778 static int nvmet_tcp_alloc_crypto(struct nvmet_tcp_queue *queue) 779 { 780 struct crypto_ahash *tfm; 781 782 tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC); 783 if (IS_ERR(tfm)) 784 return PTR_ERR(tfm); 785 786 queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL); 787 if (!queue->snd_hash) 788 goto free_tfm; 789 ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL); 790 791 queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL); 792 if (!queue->rcv_hash) 793 goto free_snd_hash; 794 ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL); 795 796 return 0; 797 free_snd_hash: 798 ahash_request_free(queue->snd_hash); 799 free_tfm: 800 crypto_free_ahash(tfm); 801 return -ENOMEM; 802 } 803 804 805 static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue) 806 { 807 struct nvme_tcp_icreq_pdu *icreq = &queue->pdu.icreq; 808 struct nvme_tcp_icresp_pdu *icresp = &queue->pdu.icresp; 809 struct msghdr msg = {}; 810 struct kvec iov; 811 int ret; 812 813 if (le32_to_cpu(icreq->hdr.plen) != sizeof(struct nvme_tcp_icreq_pdu)) { 814 pr_err("bad nvme-tcp pdu length (%d)\n", 815 le32_to_cpu(icreq->hdr.plen)); 816 nvmet_tcp_fatal_error(queue); 817 } 818 819 if (icreq->pfv != NVME_TCP_PFV_1_0) { 820 pr_err("queue %d: bad pfv %d\n", queue->idx, icreq->pfv); 821 return -EPROTO; 822 } 823 824 if (icreq->hpda != 0) { 825 pr_err("queue %d: unsupported hpda %d\n", queue->idx, 826 icreq->hpda); 827 return -EPROTO; 828 } 829 830 queue->hdr_digest = !!(icreq->digest & NVME_TCP_HDR_DIGEST_ENABLE); 831 queue->data_digest = !!(icreq->digest & NVME_TCP_DATA_DIGEST_ENABLE); 832 if (queue->hdr_digest || queue->data_digest) { 833 ret = nvmet_tcp_alloc_crypto(queue); 834 if (ret) 835 return ret; 836 } 837 838 memset(icresp, 0, sizeof(*icresp)); 839 icresp->hdr.type = nvme_tcp_icresp; 840 icresp->hdr.hlen = sizeof(*icresp); 841 icresp->hdr.pdo = 0; 842 icresp->hdr.plen = cpu_to_le32(icresp->hdr.hlen); 843 icresp->pfv = cpu_to_le16(NVME_TCP_PFV_1_0); 844 icresp->maxdata = cpu_to_le32(0x400000); /* 16M arbitrary limit */ 845 icresp->cpda = 0; 846 if (queue->hdr_digest) 847 icresp->digest |= NVME_TCP_HDR_DIGEST_ENABLE; 848 if (queue->data_digest) 849 icresp->digest |= NVME_TCP_DATA_DIGEST_ENABLE; 850 851 iov.iov_base = icresp; 852 iov.iov_len = sizeof(*icresp); 853 ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); 854 if (ret < 0) 855 goto free_crypto; 856 857 queue->state = NVMET_TCP_Q_LIVE; 858 nvmet_prepare_receive_pdu(queue); 859 return 0; 860 free_crypto: 861 if (queue->hdr_digest || queue->data_digest) 862 nvmet_tcp_free_crypto(queue); 863 return ret; 864 } 865 866 static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue, 867 struct nvmet_tcp_cmd *cmd, struct nvmet_req *req) 868 { 869 size_t data_len = le32_to_cpu(req->cmd->common.dptr.sgl.length); 870 int ret; 871 872 if (!nvme_is_write(cmd->req.cmd) || 873 data_len > cmd->req.port->inline_data_size) { 874 nvmet_prepare_receive_pdu(queue); 875 return; 876 } 877 878 ret = nvmet_tcp_map_data(cmd); 879 if (unlikely(ret)) { 880 pr_err("queue %d: failed to map data\n", queue->idx); 881 nvmet_tcp_fatal_error(queue); 882 return; 883 } 884 885 queue->rcv_state = NVMET_TCP_RECV_DATA; 886 nvmet_tcp_map_pdu_iovec(cmd); 887 cmd->flags |= NVMET_TCP_F_INIT_FAILED; 888 } 889 890 static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue) 891 { 892 struct nvme_tcp_data_pdu *data = &queue->pdu.data; 893 struct nvmet_tcp_cmd *cmd; 894 895 if (likely(queue->nr_cmds)) 896 cmd = &queue->cmds[data->ttag]; 897 else 898 cmd = &queue->connect; 899 900 if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) { 901 pr_err("ttag %u unexpected data offset %u (expected %u)\n", 902 data->ttag, le32_to_cpu(data->data_offset), 903 cmd->rbytes_done); 904 /* FIXME: use path and transport errors */ 905 nvmet_req_complete(&cmd->req, 906 NVME_SC_INVALID_FIELD | NVME_SC_DNR); 907 return -EPROTO; 908 } 909 910 cmd->pdu_len = le32_to_cpu(data->data_length); 911 cmd->pdu_recv = 0; 912 nvmet_tcp_map_pdu_iovec(cmd); 913 queue->cmd = cmd; 914 queue->rcv_state = NVMET_TCP_RECV_DATA; 915 916 return 0; 917 } 918 919 static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue) 920 { 921 struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr; 922 struct nvme_command *nvme_cmd = &queue->pdu.cmd.cmd; 923 struct nvmet_req *req; 924 int ret; 925 926 if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) { 927 if (hdr->type != nvme_tcp_icreq) { 928 pr_err("unexpected pdu type (%d) before icreq\n", 929 hdr->type); 930 nvmet_tcp_fatal_error(queue); 931 return -EPROTO; 932 } 933 return nvmet_tcp_handle_icreq(queue); 934 } 935 936 if (hdr->type == nvme_tcp_h2c_data) { 937 ret = nvmet_tcp_handle_h2c_data_pdu(queue); 938 if (unlikely(ret)) 939 return ret; 940 return 0; 941 } 942 943 queue->cmd = nvmet_tcp_get_cmd(queue); 944 if (unlikely(!queue->cmd)) { 945 /* This should never happen */ 946 pr_err("queue %d: out of commands (%d) send_list_len: %d, opcode: %d", 947 queue->idx, queue->nr_cmds, queue->send_list_len, 948 nvme_cmd->common.opcode); 949 nvmet_tcp_fatal_error(queue); 950 return -ENOMEM; 951 } 952 953 req = &queue->cmd->req; 954 memcpy(req->cmd, nvme_cmd, sizeof(*nvme_cmd)); 955 956 if (unlikely(!nvmet_req_init(req, &queue->nvme_cq, 957 &queue->nvme_sq, &nvmet_tcp_ops))) { 958 pr_err("failed cmd %p id %d opcode %d, data_len: %d\n", 959 req->cmd, req->cmd->common.command_id, 960 req->cmd->common.opcode, 961 le32_to_cpu(req->cmd->common.dptr.sgl.length)); 962 963 nvmet_tcp_handle_req_failure(queue, queue->cmd, req); 964 return -EAGAIN; 965 } 966 967 ret = nvmet_tcp_map_data(queue->cmd); 968 if (unlikely(ret)) { 969 pr_err("queue %d: failed to map data\n", queue->idx); 970 if (nvmet_tcp_has_inline_data(queue->cmd)) 971 nvmet_tcp_fatal_error(queue); 972 else 973 nvmet_req_complete(req, ret); 974 ret = -EAGAIN; 975 goto out; 976 } 977 978 if (nvmet_tcp_need_data_in(queue->cmd)) { 979 if (nvmet_tcp_has_inline_data(queue->cmd)) { 980 queue->rcv_state = NVMET_TCP_RECV_DATA; 981 nvmet_tcp_map_pdu_iovec(queue->cmd); 982 return 0; 983 } 984 /* send back R2T */ 985 nvmet_tcp_queue_response(&queue->cmd->req); 986 goto out; 987 } 988 989 queue->cmd->req.execute(&queue->cmd->req); 990 out: 991 nvmet_prepare_receive_pdu(queue); 992 return ret; 993 } 994 995 static const u8 nvme_tcp_pdu_sizes[] = { 996 [nvme_tcp_icreq] = sizeof(struct nvme_tcp_icreq_pdu), 997 [nvme_tcp_cmd] = sizeof(struct nvme_tcp_cmd_pdu), 998 [nvme_tcp_h2c_data] = sizeof(struct nvme_tcp_data_pdu), 999 }; 1000 1001 static inline u8 nvmet_tcp_pdu_size(u8 type) 1002 { 1003 size_t idx = type; 1004 1005 return (idx < ARRAY_SIZE(nvme_tcp_pdu_sizes) && 1006 nvme_tcp_pdu_sizes[idx]) ? 1007 nvme_tcp_pdu_sizes[idx] : 0; 1008 } 1009 1010 static inline bool nvmet_tcp_pdu_valid(u8 type) 1011 { 1012 switch (type) { 1013 case nvme_tcp_icreq: 1014 case nvme_tcp_cmd: 1015 case nvme_tcp_h2c_data: 1016 /* fallthru */ 1017 return true; 1018 } 1019 1020 return false; 1021 } 1022 1023 static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue) 1024 { 1025 struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr; 1026 int len; 1027 struct kvec iov; 1028 struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; 1029 1030 recv: 1031 iov.iov_base = (void *)&queue->pdu + queue->offset; 1032 iov.iov_len = queue->left; 1033 len = kernel_recvmsg(queue->sock, &msg, &iov, 1, 1034 iov.iov_len, msg.msg_flags); 1035 if (unlikely(len < 0)) 1036 return len; 1037 1038 queue->offset += len; 1039 queue->left -= len; 1040 if (queue->left) 1041 return -EAGAIN; 1042 1043 if (queue->offset == sizeof(struct nvme_tcp_hdr)) { 1044 u8 hdgst = nvmet_tcp_hdgst_len(queue); 1045 1046 if (unlikely(!nvmet_tcp_pdu_valid(hdr->type))) { 1047 pr_err("unexpected pdu type %d\n", hdr->type); 1048 nvmet_tcp_fatal_error(queue); 1049 return -EIO; 1050 } 1051 1052 if (unlikely(hdr->hlen != nvmet_tcp_pdu_size(hdr->type))) { 1053 pr_err("pdu %d bad hlen %d\n", hdr->type, hdr->hlen); 1054 return -EIO; 1055 } 1056 1057 queue->left = hdr->hlen - queue->offset + hdgst; 1058 goto recv; 1059 } 1060 1061 if (queue->hdr_digest && 1062 nvmet_tcp_verify_hdgst(queue, &queue->pdu, queue->offset)) { 1063 nvmet_tcp_fatal_error(queue); /* fatal */ 1064 return -EPROTO; 1065 } 1066 1067 if (queue->data_digest && 1068 nvmet_tcp_check_ddgst(queue, &queue->pdu)) { 1069 nvmet_tcp_fatal_error(queue); /* fatal */ 1070 return -EPROTO; 1071 } 1072 1073 return nvmet_tcp_done_recv_pdu(queue); 1074 } 1075 1076 static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd) 1077 { 1078 struct nvmet_tcp_queue *queue = cmd->queue; 1079 1080 nvmet_tcp_recv_ddgst(queue->rcv_hash, cmd); 1081 queue->offset = 0; 1082 queue->left = NVME_TCP_DIGEST_LENGTH; 1083 queue->rcv_state = NVMET_TCP_RECV_DDGST; 1084 } 1085 1086 static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue) 1087 { 1088 struct nvmet_tcp_cmd *cmd = queue->cmd; 1089 int ret; 1090 1091 while (msg_data_left(&cmd->recv_msg)) { 1092 ret = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg, 1093 cmd->recv_msg.msg_flags); 1094 if (ret <= 0) 1095 return ret; 1096 1097 cmd->pdu_recv += ret; 1098 cmd->rbytes_done += ret; 1099 } 1100 1101 nvmet_tcp_unmap_pdu_iovec(cmd); 1102 if (queue->data_digest) { 1103 nvmet_tcp_prep_recv_ddgst(cmd); 1104 return 0; 1105 } 1106 1107 if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) && 1108 cmd->rbytes_done == cmd->req.transfer_len) { 1109 cmd->req.execute(&cmd->req); 1110 } 1111 1112 nvmet_prepare_receive_pdu(queue); 1113 return 0; 1114 } 1115 1116 static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue) 1117 { 1118 struct nvmet_tcp_cmd *cmd = queue->cmd; 1119 int ret; 1120 struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; 1121 struct kvec iov = { 1122 .iov_base = (void *)&cmd->recv_ddgst + queue->offset, 1123 .iov_len = queue->left 1124 }; 1125 1126 ret = kernel_recvmsg(queue->sock, &msg, &iov, 1, 1127 iov.iov_len, msg.msg_flags); 1128 if (unlikely(ret < 0)) 1129 return ret; 1130 1131 queue->offset += ret; 1132 queue->left -= ret; 1133 if (queue->left) 1134 return -EAGAIN; 1135 1136 if (queue->data_digest && cmd->exp_ddgst != cmd->recv_ddgst) { 1137 pr_err("queue %d: cmd %d pdu (%d) data digest error: recv %#x expected %#x\n", 1138 queue->idx, cmd->req.cmd->common.command_id, 1139 queue->pdu.cmd.hdr.type, le32_to_cpu(cmd->recv_ddgst), 1140 le32_to_cpu(cmd->exp_ddgst)); 1141 nvmet_tcp_finish_cmd(cmd); 1142 nvmet_tcp_fatal_error(queue); 1143 ret = -EPROTO; 1144 goto out; 1145 } 1146 1147 if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) && 1148 cmd->rbytes_done == cmd->req.transfer_len) 1149 cmd->req.execute(&cmd->req); 1150 ret = 0; 1151 out: 1152 nvmet_prepare_receive_pdu(queue); 1153 return ret; 1154 } 1155 1156 static int nvmet_tcp_try_recv_one(struct nvmet_tcp_queue *queue) 1157 { 1158 int result = 0; 1159 1160 if (unlikely(queue->rcv_state == NVMET_TCP_RECV_ERR)) 1161 return 0; 1162 1163 if (queue->rcv_state == NVMET_TCP_RECV_PDU) { 1164 result = nvmet_tcp_try_recv_pdu(queue); 1165 if (result != 0) 1166 goto done_recv; 1167 } 1168 1169 if (queue->rcv_state == NVMET_TCP_RECV_DATA) { 1170 result = nvmet_tcp_try_recv_data(queue); 1171 if (result != 0) 1172 goto done_recv; 1173 } 1174 1175 if (queue->rcv_state == NVMET_TCP_RECV_DDGST) { 1176 result = nvmet_tcp_try_recv_ddgst(queue); 1177 if (result != 0) 1178 goto done_recv; 1179 } 1180 1181 done_recv: 1182 if (result < 0) { 1183 if (result == -EAGAIN) 1184 return 0; 1185 return result; 1186 } 1187 return 1; 1188 } 1189 1190 static int nvmet_tcp_try_recv(struct nvmet_tcp_queue *queue, 1191 int budget, int *recvs) 1192 { 1193 int i, ret = 0; 1194 1195 for (i = 0; i < budget; i++) { 1196 ret = nvmet_tcp_try_recv_one(queue); 1197 if (unlikely(ret < 0)) { 1198 nvmet_tcp_socket_error(queue, ret); 1199 goto done; 1200 } else if (ret == 0) { 1201 break; 1202 } 1203 (*recvs)++; 1204 } 1205 done: 1206 return ret; 1207 } 1208 1209 static void nvmet_tcp_schedule_release_queue(struct nvmet_tcp_queue *queue) 1210 { 1211 spin_lock(&queue->state_lock); 1212 if (queue->state != NVMET_TCP_Q_DISCONNECTING) { 1213 queue->state = NVMET_TCP_Q_DISCONNECTING; 1214 schedule_work(&queue->release_work); 1215 } 1216 spin_unlock(&queue->state_lock); 1217 } 1218 1219 static void nvmet_tcp_io_work(struct work_struct *w) 1220 { 1221 struct nvmet_tcp_queue *queue = 1222 container_of(w, struct nvmet_tcp_queue, io_work); 1223 bool pending; 1224 int ret, ops = 0; 1225 1226 do { 1227 pending = false; 1228 1229 ret = nvmet_tcp_try_recv(queue, NVMET_TCP_RECV_BUDGET, &ops); 1230 if (ret > 0) 1231 pending = true; 1232 else if (ret < 0) 1233 return; 1234 1235 ret = nvmet_tcp_try_send(queue, NVMET_TCP_SEND_BUDGET, &ops); 1236 if (ret > 0) 1237 pending = true; 1238 else if (ret < 0) 1239 return; 1240 1241 } while (pending && ops < NVMET_TCP_IO_WORK_BUDGET); 1242 1243 /* 1244 * We exahusted our budget, requeue our selves 1245 */ 1246 if (pending) 1247 queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work); 1248 } 1249 1250 static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue, 1251 struct nvmet_tcp_cmd *c) 1252 { 1253 u8 hdgst = nvmet_tcp_hdgst_len(queue); 1254 1255 c->queue = queue; 1256 c->req.port = queue->port->nport; 1257 1258 c->cmd_pdu = page_frag_alloc(&queue->pf_cache, 1259 sizeof(*c->cmd_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO); 1260 if (!c->cmd_pdu) 1261 return -ENOMEM; 1262 c->req.cmd = &c->cmd_pdu->cmd; 1263 1264 c->rsp_pdu = page_frag_alloc(&queue->pf_cache, 1265 sizeof(*c->rsp_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO); 1266 if (!c->rsp_pdu) 1267 goto out_free_cmd; 1268 c->req.cqe = &c->rsp_pdu->cqe; 1269 1270 c->data_pdu = page_frag_alloc(&queue->pf_cache, 1271 sizeof(*c->data_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO); 1272 if (!c->data_pdu) 1273 goto out_free_rsp; 1274 1275 c->r2t_pdu = page_frag_alloc(&queue->pf_cache, 1276 sizeof(*c->r2t_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO); 1277 if (!c->r2t_pdu) 1278 goto out_free_data; 1279 1280 c->recv_msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; 1281 1282 list_add_tail(&c->entry, &queue->free_list); 1283 1284 return 0; 1285 out_free_data: 1286 page_frag_free(c->data_pdu); 1287 out_free_rsp: 1288 page_frag_free(c->rsp_pdu); 1289 out_free_cmd: 1290 page_frag_free(c->cmd_pdu); 1291 return -ENOMEM; 1292 } 1293 1294 static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c) 1295 { 1296 page_frag_free(c->r2t_pdu); 1297 page_frag_free(c->data_pdu); 1298 page_frag_free(c->rsp_pdu); 1299 page_frag_free(c->cmd_pdu); 1300 } 1301 1302 static int nvmet_tcp_alloc_cmds(struct nvmet_tcp_queue *queue) 1303 { 1304 struct nvmet_tcp_cmd *cmds; 1305 int i, ret = -EINVAL, nr_cmds = queue->nr_cmds; 1306 1307 cmds = kcalloc(nr_cmds, sizeof(struct nvmet_tcp_cmd), GFP_KERNEL); 1308 if (!cmds) 1309 goto out; 1310 1311 for (i = 0; i < nr_cmds; i++) { 1312 ret = nvmet_tcp_alloc_cmd(queue, cmds + i); 1313 if (ret) 1314 goto out_free; 1315 } 1316 1317 queue->cmds = cmds; 1318 1319 return 0; 1320 out_free: 1321 while (--i >= 0) 1322 nvmet_tcp_free_cmd(cmds + i); 1323 kfree(cmds); 1324 out: 1325 return ret; 1326 } 1327 1328 static void nvmet_tcp_free_cmds(struct nvmet_tcp_queue *queue) 1329 { 1330 struct nvmet_tcp_cmd *cmds = queue->cmds; 1331 int i; 1332 1333 for (i = 0; i < queue->nr_cmds; i++) 1334 nvmet_tcp_free_cmd(cmds + i); 1335 1336 nvmet_tcp_free_cmd(&queue->connect); 1337 kfree(cmds); 1338 } 1339 1340 static void nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue) 1341 { 1342 struct socket *sock = queue->sock; 1343 1344 write_lock_bh(&sock->sk->sk_callback_lock); 1345 sock->sk->sk_data_ready = queue->data_ready; 1346 sock->sk->sk_state_change = queue->state_change; 1347 sock->sk->sk_write_space = queue->write_space; 1348 sock->sk->sk_user_data = NULL; 1349 write_unlock_bh(&sock->sk->sk_callback_lock); 1350 } 1351 1352 static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd) 1353 { 1354 nvmet_req_uninit(&cmd->req); 1355 nvmet_tcp_unmap_pdu_iovec(cmd); 1356 kfree(cmd->iov); 1357 sgl_free(cmd->req.sg); 1358 } 1359 1360 static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue) 1361 { 1362 struct nvmet_tcp_cmd *cmd = queue->cmds; 1363 int i; 1364 1365 for (i = 0; i < queue->nr_cmds; i++, cmd++) { 1366 if (nvmet_tcp_need_data_in(cmd)) 1367 nvmet_tcp_finish_cmd(cmd); 1368 } 1369 1370 if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect)) { 1371 /* failed in connect */ 1372 nvmet_tcp_finish_cmd(&queue->connect); 1373 } 1374 } 1375 1376 static void nvmet_tcp_release_queue_work(struct work_struct *w) 1377 { 1378 struct nvmet_tcp_queue *queue = 1379 container_of(w, struct nvmet_tcp_queue, release_work); 1380 1381 mutex_lock(&nvmet_tcp_queue_mutex); 1382 list_del_init(&queue->queue_list); 1383 mutex_unlock(&nvmet_tcp_queue_mutex); 1384 1385 nvmet_tcp_restore_socket_callbacks(queue); 1386 flush_work(&queue->io_work); 1387 1388 nvmet_tcp_uninit_data_in_cmds(queue); 1389 nvmet_sq_destroy(&queue->nvme_sq); 1390 cancel_work_sync(&queue->io_work); 1391 sock_release(queue->sock); 1392 nvmet_tcp_free_cmds(queue); 1393 if (queue->hdr_digest || queue->data_digest) 1394 nvmet_tcp_free_crypto(queue); 1395 ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx); 1396 1397 kfree(queue); 1398 } 1399 1400 static void nvmet_tcp_data_ready(struct sock *sk) 1401 { 1402 struct nvmet_tcp_queue *queue; 1403 1404 read_lock_bh(&sk->sk_callback_lock); 1405 queue = sk->sk_user_data; 1406 if (likely(queue)) 1407 queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work); 1408 read_unlock_bh(&sk->sk_callback_lock); 1409 } 1410 1411 static void nvmet_tcp_write_space(struct sock *sk) 1412 { 1413 struct nvmet_tcp_queue *queue; 1414 1415 read_lock_bh(&sk->sk_callback_lock); 1416 queue = sk->sk_user_data; 1417 if (unlikely(!queue)) 1418 goto out; 1419 1420 if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) { 1421 queue->write_space(sk); 1422 goto out; 1423 } 1424 1425 if (sk_stream_is_writeable(sk)) { 1426 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 1427 queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work); 1428 } 1429 out: 1430 read_unlock_bh(&sk->sk_callback_lock); 1431 } 1432 1433 static void nvmet_tcp_state_change(struct sock *sk) 1434 { 1435 struct nvmet_tcp_queue *queue; 1436 1437 write_lock_bh(&sk->sk_callback_lock); 1438 queue = sk->sk_user_data; 1439 if (!queue) 1440 goto done; 1441 1442 switch (sk->sk_state) { 1443 case TCP_FIN_WAIT1: 1444 case TCP_CLOSE_WAIT: 1445 case TCP_CLOSE: 1446 /* FALLTHRU */ 1447 sk->sk_user_data = NULL; 1448 nvmet_tcp_schedule_release_queue(queue); 1449 break; 1450 default: 1451 pr_warn("queue %d unhandled state %d\n", 1452 queue->idx, sk->sk_state); 1453 } 1454 done: 1455 write_unlock_bh(&sk->sk_callback_lock); 1456 } 1457 1458 static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) 1459 { 1460 struct socket *sock = queue->sock; 1461 struct inet_sock *inet = inet_sk(sock->sk); 1462 int ret; 1463 1464 ret = kernel_getsockname(sock, 1465 (struct sockaddr *)&queue->sockaddr); 1466 if (ret < 0) 1467 return ret; 1468 1469 ret = kernel_getpeername(sock, 1470 (struct sockaddr *)&queue->sockaddr_peer); 1471 if (ret < 0) 1472 return ret; 1473 1474 /* 1475 * Cleanup whatever is sitting in the TCP transmit queue on socket 1476 * close. This is done to prevent stale data from being sent should 1477 * the network connection be restored before TCP times out. 1478 */ 1479 sock_no_linger(sock->sk); 1480 1481 if (so_priority > 0) 1482 sock_set_priority(sock->sk, so_priority); 1483 1484 /* Set socket type of service */ 1485 if (inet->rcv_tos > 0) 1486 ip_sock_set_tos(sock->sk, inet->rcv_tos); 1487 1488 ret = 0; 1489 write_lock_bh(&sock->sk->sk_callback_lock); 1490 if (sock->sk->sk_state != TCP_ESTABLISHED) { 1491 /* 1492 * If the socket is already closing, don't even start 1493 * consuming it 1494 */ 1495 ret = -ENOTCONN; 1496 } else { 1497 sock->sk->sk_user_data = queue; 1498 queue->data_ready = sock->sk->sk_data_ready; 1499 sock->sk->sk_data_ready = nvmet_tcp_data_ready; 1500 queue->state_change = sock->sk->sk_state_change; 1501 sock->sk->sk_state_change = nvmet_tcp_state_change; 1502 queue->write_space = sock->sk->sk_write_space; 1503 sock->sk->sk_write_space = nvmet_tcp_write_space; 1504 queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work); 1505 } 1506 write_unlock_bh(&sock->sk->sk_callback_lock); 1507 1508 return ret; 1509 } 1510 1511 static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port, 1512 struct socket *newsock) 1513 { 1514 struct nvmet_tcp_queue *queue; 1515 int ret; 1516 1517 queue = kzalloc(sizeof(*queue), GFP_KERNEL); 1518 if (!queue) 1519 return -ENOMEM; 1520 1521 INIT_WORK(&queue->release_work, nvmet_tcp_release_queue_work); 1522 INIT_WORK(&queue->io_work, nvmet_tcp_io_work); 1523 queue->sock = newsock; 1524 queue->port = port; 1525 queue->nr_cmds = 0; 1526 spin_lock_init(&queue->state_lock); 1527 queue->state = NVMET_TCP_Q_CONNECTING; 1528 INIT_LIST_HEAD(&queue->free_list); 1529 init_llist_head(&queue->resp_list); 1530 INIT_LIST_HEAD(&queue->resp_send_list); 1531 1532 queue->idx = ida_simple_get(&nvmet_tcp_queue_ida, 0, 0, GFP_KERNEL); 1533 if (queue->idx < 0) { 1534 ret = queue->idx; 1535 goto out_free_queue; 1536 } 1537 1538 ret = nvmet_tcp_alloc_cmd(queue, &queue->connect); 1539 if (ret) 1540 goto out_ida_remove; 1541 1542 ret = nvmet_sq_init(&queue->nvme_sq); 1543 if (ret) 1544 goto out_free_connect; 1545 1546 nvmet_prepare_receive_pdu(queue); 1547 1548 mutex_lock(&nvmet_tcp_queue_mutex); 1549 list_add_tail(&queue->queue_list, &nvmet_tcp_queue_list); 1550 mutex_unlock(&nvmet_tcp_queue_mutex); 1551 1552 ret = nvmet_tcp_set_queue_sock(queue); 1553 if (ret) 1554 goto out_destroy_sq; 1555 1556 return 0; 1557 out_destroy_sq: 1558 mutex_lock(&nvmet_tcp_queue_mutex); 1559 list_del_init(&queue->queue_list); 1560 mutex_unlock(&nvmet_tcp_queue_mutex); 1561 nvmet_sq_destroy(&queue->nvme_sq); 1562 out_free_connect: 1563 nvmet_tcp_free_cmd(&queue->connect); 1564 out_ida_remove: 1565 ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx); 1566 out_free_queue: 1567 kfree(queue); 1568 return ret; 1569 } 1570 1571 static void nvmet_tcp_accept_work(struct work_struct *w) 1572 { 1573 struct nvmet_tcp_port *port = 1574 container_of(w, struct nvmet_tcp_port, accept_work); 1575 struct socket *newsock; 1576 int ret; 1577 1578 while (true) { 1579 ret = kernel_accept(port->sock, &newsock, O_NONBLOCK); 1580 if (ret < 0) { 1581 if (ret != -EAGAIN) 1582 pr_warn("failed to accept err=%d\n", ret); 1583 return; 1584 } 1585 ret = nvmet_tcp_alloc_queue(port, newsock); 1586 if (ret) { 1587 pr_err("failed to allocate queue\n"); 1588 sock_release(newsock); 1589 } 1590 } 1591 } 1592 1593 static void nvmet_tcp_listen_data_ready(struct sock *sk) 1594 { 1595 struct nvmet_tcp_port *port; 1596 1597 read_lock_bh(&sk->sk_callback_lock); 1598 port = sk->sk_user_data; 1599 if (!port) 1600 goto out; 1601 1602 if (sk->sk_state == TCP_LISTEN) 1603 schedule_work(&port->accept_work); 1604 out: 1605 read_unlock_bh(&sk->sk_callback_lock); 1606 } 1607 1608 static int nvmet_tcp_add_port(struct nvmet_port *nport) 1609 { 1610 struct nvmet_tcp_port *port; 1611 __kernel_sa_family_t af; 1612 int ret; 1613 1614 port = kzalloc(sizeof(*port), GFP_KERNEL); 1615 if (!port) 1616 return -ENOMEM; 1617 1618 switch (nport->disc_addr.adrfam) { 1619 case NVMF_ADDR_FAMILY_IP4: 1620 af = AF_INET; 1621 break; 1622 case NVMF_ADDR_FAMILY_IP6: 1623 af = AF_INET6; 1624 break; 1625 default: 1626 pr_err("address family %d not supported\n", 1627 nport->disc_addr.adrfam); 1628 ret = -EINVAL; 1629 goto err_port; 1630 } 1631 1632 ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr, 1633 nport->disc_addr.trsvcid, &port->addr); 1634 if (ret) { 1635 pr_err("malformed ip/port passed: %s:%s\n", 1636 nport->disc_addr.traddr, nport->disc_addr.trsvcid); 1637 goto err_port; 1638 } 1639 1640 port->nport = nport; 1641 INIT_WORK(&port->accept_work, nvmet_tcp_accept_work); 1642 if (port->nport->inline_data_size < 0) 1643 port->nport->inline_data_size = NVMET_TCP_DEF_INLINE_DATA_SIZE; 1644 1645 ret = sock_create(port->addr.ss_family, SOCK_STREAM, 1646 IPPROTO_TCP, &port->sock); 1647 if (ret) { 1648 pr_err("failed to create a socket\n"); 1649 goto err_port; 1650 } 1651 1652 port->sock->sk->sk_user_data = port; 1653 port->data_ready = port->sock->sk->sk_data_ready; 1654 port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready; 1655 sock_set_reuseaddr(port->sock->sk); 1656 tcp_sock_set_nodelay(port->sock->sk); 1657 if (so_priority > 0) 1658 sock_set_priority(port->sock->sk, so_priority); 1659 1660 ret = kernel_bind(port->sock, (struct sockaddr *)&port->addr, 1661 sizeof(port->addr)); 1662 if (ret) { 1663 pr_err("failed to bind port socket %d\n", ret); 1664 goto err_sock; 1665 } 1666 1667 ret = kernel_listen(port->sock, 128); 1668 if (ret) { 1669 pr_err("failed to listen %d on port sock\n", ret); 1670 goto err_sock; 1671 } 1672 1673 nport->priv = port; 1674 pr_info("enabling port %d (%pISpc)\n", 1675 le16_to_cpu(nport->disc_addr.portid), &port->addr); 1676 1677 return 0; 1678 1679 err_sock: 1680 sock_release(port->sock); 1681 err_port: 1682 kfree(port); 1683 return ret; 1684 } 1685 1686 static void nvmet_tcp_remove_port(struct nvmet_port *nport) 1687 { 1688 struct nvmet_tcp_port *port = nport->priv; 1689 1690 write_lock_bh(&port->sock->sk->sk_callback_lock); 1691 port->sock->sk->sk_data_ready = port->data_ready; 1692 port->sock->sk->sk_user_data = NULL; 1693 write_unlock_bh(&port->sock->sk->sk_callback_lock); 1694 cancel_work_sync(&port->accept_work); 1695 1696 sock_release(port->sock); 1697 kfree(port); 1698 } 1699 1700 static void nvmet_tcp_delete_ctrl(struct nvmet_ctrl *ctrl) 1701 { 1702 struct nvmet_tcp_queue *queue; 1703 1704 mutex_lock(&nvmet_tcp_queue_mutex); 1705 list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list) 1706 if (queue->nvme_sq.ctrl == ctrl) 1707 kernel_sock_shutdown(queue->sock, SHUT_RDWR); 1708 mutex_unlock(&nvmet_tcp_queue_mutex); 1709 } 1710 1711 static u16 nvmet_tcp_install_queue(struct nvmet_sq *sq) 1712 { 1713 struct nvmet_tcp_queue *queue = 1714 container_of(sq, struct nvmet_tcp_queue, nvme_sq); 1715 1716 if (sq->qid == 0) { 1717 /* Let inflight controller teardown complete */ 1718 flush_scheduled_work(); 1719 } 1720 1721 queue->nr_cmds = sq->size * 2; 1722 if (nvmet_tcp_alloc_cmds(queue)) 1723 return NVME_SC_INTERNAL; 1724 return 0; 1725 } 1726 1727 static void nvmet_tcp_disc_port_addr(struct nvmet_req *req, 1728 struct nvmet_port *nport, char *traddr) 1729 { 1730 struct nvmet_tcp_port *port = nport->priv; 1731 1732 if (inet_addr_is_any((struct sockaddr *)&port->addr)) { 1733 struct nvmet_tcp_cmd *cmd = 1734 container_of(req, struct nvmet_tcp_cmd, req); 1735 struct nvmet_tcp_queue *queue = cmd->queue; 1736 1737 sprintf(traddr, "%pISc", (struct sockaddr *)&queue->sockaddr); 1738 } else { 1739 memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE); 1740 } 1741 } 1742 1743 static const struct nvmet_fabrics_ops nvmet_tcp_ops = { 1744 .owner = THIS_MODULE, 1745 .type = NVMF_TRTYPE_TCP, 1746 .msdbd = 1, 1747 .add_port = nvmet_tcp_add_port, 1748 .remove_port = nvmet_tcp_remove_port, 1749 .queue_response = nvmet_tcp_queue_response, 1750 .delete_ctrl = nvmet_tcp_delete_ctrl, 1751 .install_queue = nvmet_tcp_install_queue, 1752 .disc_traddr = nvmet_tcp_disc_port_addr, 1753 }; 1754 1755 static int __init nvmet_tcp_init(void) 1756 { 1757 int ret; 1758 1759 nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq", WQ_HIGHPRI, 0); 1760 if (!nvmet_tcp_wq) 1761 return -ENOMEM; 1762 1763 ret = nvmet_register_transport(&nvmet_tcp_ops); 1764 if (ret) 1765 goto err; 1766 1767 return 0; 1768 err: 1769 destroy_workqueue(nvmet_tcp_wq); 1770 return ret; 1771 } 1772 1773 static void __exit nvmet_tcp_exit(void) 1774 { 1775 struct nvmet_tcp_queue *queue; 1776 1777 nvmet_unregister_transport(&nvmet_tcp_ops); 1778 1779 flush_scheduled_work(); 1780 mutex_lock(&nvmet_tcp_queue_mutex); 1781 list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list) 1782 kernel_sock_shutdown(queue->sock, SHUT_RDWR); 1783 mutex_unlock(&nvmet_tcp_queue_mutex); 1784 flush_scheduled_work(); 1785 1786 destroy_workqueue(nvmet_tcp_wq); 1787 } 1788 1789 module_init(nvmet_tcp_init); 1790 module_exit(nvmet_tcp_exit); 1791 1792 MODULE_LICENSE("GPL v2"); 1793 MODULE_ALIAS("nvmet-transport-3"); /* 3 == NVMF_TRTYPE_TCP */ 1794