1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * NVMe over Fabrics TCP target. 4 * Copyright (c) 2018 Lightbits Labs. All rights reserved. 5 */ 6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 7 #include <linux/module.h> 8 #include <linux/init.h> 9 #include <linux/slab.h> 10 #include <linux/err.h> 11 #include <linux/nvme-tcp.h> 12 #include <net/sock.h> 13 #include <net/tcp.h> 14 #include <linux/inet.h> 15 #include <linux/llist.h> 16 #include <crypto/hash.h> 17 #include <trace/events/sock.h> 18 19 #include "nvmet.h" 20 21 #define NVMET_TCP_DEF_INLINE_DATA_SIZE (4 * PAGE_SIZE) 22 23 /* Define the socket priority to use for connections were it is desirable 24 * that the NIC consider performing optimized packet processing or filtering. 25 * A non-zero value being sufficient to indicate general consideration of any 26 * possible optimization. Making it a module param allows for alternative 27 * values that may be unique for some NIC implementations. 28 */ 29 static int so_priority; 30 module_param(so_priority, int, 0644); 31 MODULE_PARM_DESC(so_priority, "nvmet tcp socket optimize priority"); 32 33 /* Define a time period (in usecs) that io_work() shall sample an activated 34 * queue before determining it to be idle. This optional module behavior 35 * can enable NIC solutions that support socket optimized packet processing 36 * using advanced interrupt moderation techniques. 37 */ 38 static int idle_poll_period_usecs; 39 module_param(idle_poll_period_usecs, int, 0644); 40 MODULE_PARM_DESC(idle_poll_period_usecs, 41 "nvmet tcp io_work poll till idle time period in usecs"); 42 43 #define NVMET_TCP_RECV_BUDGET 8 44 #define NVMET_TCP_SEND_BUDGET 8 45 #define NVMET_TCP_IO_WORK_BUDGET 64 46 47 enum nvmet_tcp_send_state { 48 NVMET_TCP_SEND_DATA_PDU, 49 NVMET_TCP_SEND_DATA, 50 NVMET_TCP_SEND_R2T, 51 NVMET_TCP_SEND_DDGST, 52 NVMET_TCP_SEND_RESPONSE 53 }; 54 55 enum nvmet_tcp_recv_state { 56 NVMET_TCP_RECV_PDU, 57 NVMET_TCP_RECV_DATA, 58 NVMET_TCP_RECV_DDGST, 59 NVMET_TCP_RECV_ERR, 60 }; 61 62 enum { 63 NVMET_TCP_F_INIT_FAILED = (1 << 0), 64 }; 65 66 struct nvmet_tcp_cmd { 67 struct nvmet_tcp_queue *queue; 68 struct nvmet_req req; 69 70 struct nvme_tcp_cmd_pdu *cmd_pdu; 71 struct nvme_tcp_rsp_pdu *rsp_pdu; 72 struct nvme_tcp_data_pdu *data_pdu; 73 struct nvme_tcp_r2t_pdu *r2t_pdu; 74 75 u32 rbytes_done; 76 u32 wbytes_done; 77 78 u32 pdu_len; 79 u32 pdu_recv; 80 int sg_idx; 81 struct msghdr recv_msg; 82 struct bio_vec *iov; 83 u32 flags; 84 85 struct list_head entry; 86 struct llist_node lentry; 87 88 /* send state */ 89 u32 offset; 90 struct scatterlist *cur_sg; 91 enum nvmet_tcp_send_state state; 92 93 __le32 exp_ddgst; 94 __le32 recv_ddgst; 95 }; 96 97 enum nvmet_tcp_queue_state { 98 NVMET_TCP_Q_CONNECTING, 99 NVMET_TCP_Q_LIVE, 100 NVMET_TCP_Q_DISCONNECTING, 101 }; 102 103 struct nvmet_tcp_queue { 104 struct socket *sock; 105 struct nvmet_tcp_port *port; 106 struct work_struct io_work; 107 struct nvmet_cq nvme_cq; 108 struct nvmet_sq nvme_sq; 109 110 /* send state */ 111 struct nvmet_tcp_cmd *cmds; 112 unsigned int nr_cmds; 113 struct list_head free_list; 114 struct llist_head resp_list; 115 struct list_head resp_send_list; 116 int send_list_len; 117 struct nvmet_tcp_cmd *snd_cmd; 118 119 /* recv state */ 120 int offset; 121 int left; 122 enum nvmet_tcp_recv_state rcv_state; 123 struct nvmet_tcp_cmd *cmd; 124 union nvme_tcp_pdu pdu; 125 126 /* digest state */ 127 bool hdr_digest; 128 bool data_digest; 129 struct ahash_request *snd_hash; 130 struct ahash_request *rcv_hash; 131 132 unsigned long poll_end; 133 134 spinlock_t state_lock; 135 enum nvmet_tcp_queue_state state; 136 137 struct sockaddr_storage sockaddr; 138 struct sockaddr_storage sockaddr_peer; 139 struct work_struct release_work; 140 141 int idx; 142 struct list_head queue_list; 143 144 struct nvmet_tcp_cmd connect; 145 146 struct page_frag_cache pf_cache; 147 148 void (*data_ready)(struct sock *); 149 void (*state_change)(struct sock *); 150 void (*write_space)(struct sock *); 151 }; 152 153 struct nvmet_tcp_port { 154 struct socket *sock; 155 struct work_struct accept_work; 156 struct nvmet_port *nport; 157 struct sockaddr_storage addr; 158 void (*data_ready)(struct sock *); 159 }; 160 161 static DEFINE_IDA(nvmet_tcp_queue_ida); 162 static LIST_HEAD(nvmet_tcp_queue_list); 163 static DEFINE_MUTEX(nvmet_tcp_queue_mutex); 164 165 static struct workqueue_struct *nvmet_tcp_wq; 166 static const struct nvmet_fabrics_ops nvmet_tcp_ops; 167 static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c); 168 static void nvmet_tcp_free_cmd_buffers(struct nvmet_tcp_cmd *cmd); 169 170 static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue, 171 struct nvmet_tcp_cmd *cmd) 172 { 173 if (unlikely(!queue->nr_cmds)) { 174 /* We didn't allocate cmds yet, send 0xffff */ 175 return USHRT_MAX; 176 } 177 178 return cmd - queue->cmds; 179 } 180 181 static inline bool nvmet_tcp_has_data_in(struct nvmet_tcp_cmd *cmd) 182 { 183 return nvme_is_write(cmd->req.cmd) && 184 cmd->rbytes_done < cmd->req.transfer_len; 185 } 186 187 static inline bool nvmet_tcp_need_data_in(struct nvmet_tcp_cmd *cmd) 188 { 189 return nvmet_tcp_has_data_in(cmd) && !cmd->req.cqe->status; 190 } 191 192 static inline bool nvmet_tcp_need_data_out(struct nvmet_tcp_cmd *cmd) 193 { 194 return !nvme_is_write(cmd->req.cmd) && 195 cmd->req.transfer_len > 0 && 196 !cmd->req.cqe->status; 197 } 198 199 static inline bool nvmet_tcp_has_inline_data(struct nvmet_tcp_cmd *cmd) 200 { 201 return nvme_is_write(cmd->req.cmd) && cmd->pdu_len && 202 !cmd->rbytes_done; 203 } 204 205 static inline struct nvmet_tcp_cmd * 206 nvmet_tcp_get_cmd(struct nvmet_tcp_queue *queue) 207 { 208 struct nvmet_tcp_cmd *cmd; 209 210 cmd = list_first_entry_or_null(&queue->free_list, 211 struct nvmet_tcp_cmd, entry); 212 if (!cmd) 213 return NULL; 214 list_del_init(&cmd->entry); 215 216 cmd->rbytes_done = cmd->wbytes_done = 0; 217 cmd->pdu_len = 0; 218 cmd->pdu_recv = 0; 219 cmd->iov = NULL; 220 cmd->flags = 0; 221 return cmd; 222 } 223 224 static inline void nvmet_tcp_put_cmd(struct nvmet_tcp_cmd *cmd) 225 { 226 if (unlikely(cmd == &cmd->queue->connect)) 227 return; 228 229 list_add_tail(&cmd->entry, &cmd->queue->free_list); 230 } 231 232 static inline int queue_cpu(struct nvmet_tcp_queue *queue) 233 { 234 return queue->sock->sk->sk_incoming_cpu; 235 } 236 237 static inline u8 nvmet_tcp_hdgst_len(struct nvmet_tcp_queue *queue) 238 { 239 return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0; 240 } 241 242 static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue) 243 { 244 return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0; 245 } 246 247 static inline void nvmet_tcp_hdgst(struct ahash_request *hash, 248 void *pdu, size_t len) 249 { 250 struct scatterlist sg; 251 252 sg_init_one(&sg, pdu, len); 253 ahash_request_set_crypt(hash, &sg, pdu + len, len); 254 crypto_ahash_digest(hash); 255 } 256 257 static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue, 258 void *pdu, size_t len) 259 { 260 struct nvme_tcp_hdr *hdr = pdu; 261 __le32 recv_digest; 262 __le32 exp_digest; 263 264 if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) { 265 pr_err("queue %d: header digest enabled but no header digest\n", 266 queue->idx); 267 return -EPROTO; 268 } 269 270 recv_digest = *(__le32 *)(pdu + hdr->hlen); 271 nvmet_tcp_hdgst(queue->rcv_hash, pdu, len); 272 exp_digest = *(__le32 *)(pdu + hdr->hlen); 273 if (recv_digest != exp_digest) { 274 pr_err("queue %d: header digest error: recv %#x expected %#x\n", 275 queue->idx, le32_to_cpu(recv_digest), 276 le32_to_cpu(exp_digest)); 277 return -EPROTO; 278 } 279 280 return 0; 281 } 282 283 static int nvmet_tcp_check_ddgst(struct nvmet_tcp_queue *queue, void *pdu) 284 { 285 struct nvme_tcp_hdr *hdr = pdu; 286 u8 digest_len = nvmet_tcp_hdgst_len(queue); 287 u32 len; 288 289 len = le32_to_cpu(hdr->plen) - hdr->hlen - 290 (hdr->flags & NVME_TCP_F_HDGST ? digest_len : 0); 291 292 if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) { 293 pr_err("queue %d: data digest flag is cleared\n", queue->idx); 294 return -EPROTO; 295 } 296 297 return 0; 298 } 299 300 static void nvmet_tcp_free_cmd_buffers(struct nvmet_tcp_cmd *cmd) 301 { 302 kfree(cmd->iov); 303 sgl_free(cmd->req.sg); 304 cmd->iov = NULL; 305 cmd->req.sg = NULL; 306 } 307 308 static void nvmet_tcp_build_pdu_iovec(struct nvmet_tcp_cmd *cmd) 309 { 310 struct bio_vec *iov = cmd->iov; 311 struct scatterlist *sg; 312 u32 length, offset, sg_offset; 313 int nr_pages; 314 315 length = cmd->pdu_len; 316 nr_pages = DIV_ROUND_UP(length, PAGE_SIZE); 317 offset = cmd->rbytes_done; 318 cmd->sg_idx = offset / PAGE_SIZE; 319 sg_offset = offset % PAGE_SIZE; 320 sg = &cmd->req.sg[cmd->sg_idx]; 321 322 while (length) { 323 u32 iov_len = min_t(u32, length, sg->length - sg_offset); 324 325 iov->bv_page = sg_page(sg); 326 iov->bv_len = sg->length; 327 iov->bv_offset = sg->offset + sg_offset; 328 329 length -= iov_len; 330 sg = sg_next(sg); 331 iov++; 332 sg_offset = 0; 333 } 334 335 iov_iter_bvec(&cmd->recv_msg.msg_iter, ITER_DEST, cmd->iov, 336 nr_pages, cmd->pdu_len); 337 } 338 339 static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue) 340 { 341 queue->rcv_state = NVMET_TCP_RECV_ERR; 342 if (queue->nvme_sq.ctrl) 343 nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl); 344 else 345 kernel_sock_shutdown(queue->sock, SHUT_RDWR); 346 } 347 348 static void nvmet_tcp_socket_error(struct nvmet_tcp_queue *queue, int status) 349 { 350 if (status == -EPIPE || status == -ECONNRESET) 351 kernel_sock_shutdown(queue->sock, SHUT_RDWR); 352 else 353 nvmet_tcp_fatal_error(queue); 354 } 355 356 static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd) 357 { 358 struct nvme_sgl_desc *sgl = &cmd->req.cmd->common.dptr.sgl; 359 u32 len = le32_to_cpu(sgl->length); 360 361 if (!len) 362 return 0; 363 364 if (sgl->type == ((NVME_SGL_FMT_DATA_DESC << 4) | 365 NVME_SGL_FMT_OFFSET)) { 366 if (!nvme_is_write(cmd->req.cmd)) 367 return NVME_SC_INVALID_FIELD | NVME_SC_DNR; 368 369 if (len > cmd->req.port->inline_data_size) 370 return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR; 371 cmd->pdu_len = len; 372 } 373 cmd->req.transfer_len += len; 374 375 cmd->req.sg = sgl_alloc(len, GFP_KERNEL, &cmd->req.sg_cnt); 376 if (!cmd->req.sg) 377 return NVME_SC_INTERNAL; 378 cmd->cur_sg = cmd->req.sg; 379 380 if (nvmet_tcp_has_data_in(cmd)) { 381 cmd->iov = kmalloc_array(cmd->req.sg_cnt, 382 sizeof(*cmd->iov), GFP_KERNEL); 383 if (!cmd->iov) 384 goto err; 385 } 386 387 return 0; 388 err: 389 nvmet_tcp_free_cmd_buffers(cmd); 390 return NVME_SC_INTERNAL; 391 } 392 393 static void nvmet_tcp_calc_ddgst(struct ahash_request *hash, 394 struct nvmet_tcp_cmd *cmd) 395 { 396 ahash_request_set_crypt(hash, cmd->req.sg, 397 (void *)&cmd->exp_ddgst, cmd->req.transfer_len); 398 crypto_ahash_digest(hash); 399 } 400 401 static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd) 402 { 403 struct nvme_tcp_data_pdu *pdu = cmd->data_pdu; 404 struct nvmet_tcp_queue *queue = cmd->queue; 405 u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); 406 u8 ddgst = nvmet_tcp_ddgst_len(cmd->queue); 407 408 cmd->offset = 0; 409 cmd->state = NVMET_TCP_SEND_DATA_PDU; 410 411 pdu->hdr.type = nvme_tcp_c2h_data; 412 pdu->hdr.flags = NVME_TCP_F_DATA_LAST | (queue->nvme_sq.sqhd_disabled ? 413 NVME_TCP_F_DATA_SUCCESS : 0); 414 pdu->hdr.hlen = sizeof(*pdu); 415 pdu->hdr.pdo = pdu->hdr.hlen + hdgst; 416 pdu->hdr.plen = 417 cpu_to_le32(pdu->hdr.hlen + hdgst + 418 cmd->req.transfer_len + ddgst); 419 pdu->command_id = cmd->req.cqe->command_id; 420 pdu->data_length = cpu_to_le32(cmd->req.transfer_len); 421 pdu->data_offset = cpu_to_le32(cmd->wbytes_done); 422 423 if (queue->data_digest) { 424 pdu->hdr.flags |= NVME_TCP_F_DDGST; 425 nvmet_tcp_calc_ddgst(queue->snd_hash, cmd); 426 } 427 428 if (cmd->queue->hdr_digest) { 429 pdu->hdr.flags |= NVME_TCP_F_HDGST; 430 nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); 431 } 432 } 433 434 static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd) 435 { 436 struct nvme_tcp_r2t_pdu *pdu = cmd->r2t_pdu; 437 struct nvmet_tcp_queue *queue = cmd->queue; 438 u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); 439 440 cmd->offset = 0; 441 cmd->state = NVMET_TCP_SEND_R2T; 442 443 pdu->hdr.type = nvme_tcp_r2t; 444 pdu->hdr.flags = 0; 445 pdu->hdr.hlen = sizeof(*pdu); 446 pdu->hdr.pdo = 0; 447 pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst); 448 449 pdu->command_id = cmd->req.cmd->common.command_id; 450 pdu->ttag = nvmet_tcp_cmd_tag(cmd->queue, cmd); 451 pdu->r2t_length = cpu_to_le32(cmd->req.transfer_len - cmd->rbytes_done); 452 pdu->r2t_offset = cpu_to_le32(cmd->rbytes_done); 453 if (cmd->queue->hdr_digest) { 454 pdu->hdr.flags |= NVME_TCP_F_HDGST; 455 nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); 456 } 457 } 458 459 static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd) 460 { 461 struct nvme_tcp_rsp_pdu *pdu = cmd->rsp_pdu; 462 struct nvmet_tcp_queue *queue = cmd->queue; 463 u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); 464 465 cmd->offset = 0; 466 cmd->state = NVMET_TCP_SEND_RESPONSE; 467 468 pdu->hdr.type = nvme_tcp_rsp; 469 pdu->hdr.flags = 0; 470 pdu->hdr.hlen = sizeof(*pdu); 471 pdu->hdr.pdo = 0; 472 pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst); 473 if (cmd->queue->hdr_digest) { 474 pdu->hdr.flags |= NVME_TCP_F_HDGST; 475 nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); 476 } 477 } 478 479 static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue) 480 { 481 struct llist_node *node; 482 struct nvmet_tcp_cmd *cmd; 483 484 for (node = llist_del_all(&queue->resp_list); node; node = node->next) { 485 cmd = llist_entry(node, struct nvmet_tcp_cmd, lentry); 486 list_add(&cmd->entry, &queue->resp_send_list); 487 queue->send_list_len++; 488 } 489 } 490 491 static struct nvmet_tcp_cmd *nvmet_tcp_fetch_cmd(struct nvmet_tcp_queue *queue) 492 { 493 queue->snd_cmd = list_first_entry_or_null(&queue->resp_send_list, 494 struct nvmet_tcp_cmd, entry); 495 if (!queue->snd_cmd) { 496 nvmet_tcp_process_resp_list(queue); 497 queue->snd_cmd = 498 list_first_entry_or_null(&queue->resp_send_list, 499 struct nvmet_tcp_cmd, entry); 500 if (unlikely(!queue->snd_cmd)) 501 return NULL; 502 } 503 504 list_del_init(&queue->snd_cmd->entry); 505 queue->send_list_len--; 506 507 if (nvmet_tcp_need_data_out(queue->snd_cmd)) 508 nvmet_setup_c2h_data_pdu(queue->snd_cmd); 509 else if (nvmet_tcp_need_data_in(queue->snd_cmd)) 510 nvmet_setup_r2t_pdu(queue->snd_cmd); 511 else 512 nvmet_setup_response_pdu(queue->snd_cmd); 513 514 return queue->snd_cmd; 515 } 516 517 static void nvmet_tcp_queue_response(struct nvmet_req *req) 518 { 519 struct nvmet_tcp_cmd *cmd = 520 container_of(req, struct nvmet_tcp_cmd, req); 521 struct nvmet_tcp_queue *queue = cmd->queue; 522 struct nvme_sgl_desc *sgl; 523 u32 len; 524 525 if (unlikely(cmd == queue->cmd)) { 526 sgl = &cmd->req.cmd->common.dptr.sgl; 527 len = le32_to_cpu(sgl->length); 528 529 /* 530 * Wait for inline data before processing the response. 531 * Avoid using helpers, this might happen before 532 * nvmet_req_init is completed. 533 */ 534 if (queue->rcv_state == NVMET_TCP_RECV_PDU && 535 len && len <= cmd->req.port->inline_data_size && 536 nvme_is_write(cmd->req.cmd)) 537 return; 538 } 539 540 llist_add(&cmd->lentry, &queue->resp_list); 541 queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &cmd->queue->io_work); 542 } 543 544 static void nvmet_tcp_execute_request(struct nvmet_tcp_cmd *cmd) 545 { 546 if (unlikely(cmd->flags & NVMET_TCP_F_INIT_FAILED)) 547 nvmet_tcp_queue_response(&cmd->req); 548 else 549 cmd->req.execute(&cmd->req); 550 } 551 552 static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd) 553 { 554 u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); 555 int left = sizeof(*cmd->data_pdu) - cmd->offset + hdgst; 556 int ret; 557 558 ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->data_pdu), 559 offset_in_page(cmd->data_pdu) + cmd->offset, 560 left, MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST); 561 if (ret <= 0) 562 return ret; 563 564 cmd->offset += ret; 565 left -= ret; 566 567 if (left) 568 return -EAGAIN; 569 570 cmd->state = NVMET_TCP_SEND_DATA; 571 cmd->offset = 0; 572 return 1; 573 } 574 575 static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd, bool last_in_batch) 576 { 577 struct nvmet_tcp_queue *queue = cmd->queue; 578 int ret; 579 580 while (cmd->cur_sg) { 581 struct page *page = sg_page(cmd->cur_sg); 582 u32 left = cmd->cur_sg->length - cmd->offset; 583 int flags = MSG_DONTWAIT; 584 585 if ((!last_in_batch && cmd->queue->send_list_len) || 586 cmd->wbytes_done + left < cmd->req.transfer_len || 587 queue->data_digest || !queue->nvme_sq.sqhd_disabled) 588 flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST; 589 590 ret = kernel_sendpage(cmd->queue->sock, page, cmd->offset, 591 left, flags); 592 if (ret <= 0) 593 return ret; 594 595 cmd->offset += ret; 596 cmd->wbytes_done += ret; 597 598 /* Done with sg?*/ 599 if (cmd->offset == cmd->cur_sg->length) { 600 cmd->cur_sg = sg_next(cmd->cur_sg); 601 cmd->offset = 0; 602 } 603 } 604 605 if (queue->data_digest) { 606 cmd->state = NVMET_TCP_SEND_DDGST; 607 cmd->offset = 0; 608 } else { 609 if (queue->nvme_sq.sqhd_disabled) { 610 cmd->queue->snd_cmd = NULL; 611 nvmet_tcp_put_cmd(cmd); 612 } else { 613 nvmet_setup_response_pdu(cmd); 614 } 615 } 616 617 if (queue->nvme_sq.sqhd_disabled) 618 nvmet_tcp_free_cmd_buffers(cmd); 619 620 return 1; 621 622 } 623 624 static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd, 625 bool last_in_batch) 626 { 627 u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); 628 int left = sizeof(*cmd->rsp_pdu) - cmd->offset + hdgst; 629 int flags = MSG_DONTWAIT; 630 int ret; 631 632 if (!last_in_batch && cmd->queue->send_list_len) 633 flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST; 634 else 635 flags |= MSG_EOR; 636 637 ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->rsp_pdu), 638 offset_in_page(cmd->rsp_pdu) + cmd->offset, left, flags); 639 if (ret <= 0) 640 return ret; 641 cmd->offset += ret; 642 left -= ret; 643 644 if (left) 645 return -EAGAIN; 646 647 nvmet_tcp_free_cmd_buffers(cmd); 648 cmd->queue->snd_cmd = NULL; 649 nvmet_tcp_put_cmd(cmd); 650 return 1; 651 } 652 653 static int nvmet_try_send_r2t(struct nvmet_tcp_cmd *cmd, bool last_in_batch) 654 { 655 u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); 656 int left = sizeof(*cmd->r2t_pdu) - cmd->offset + hdgst; 657 int flags = MSG_DONTWAIT; 658 int ret; 659 660 if (!last_in_batch && cmd->queue->send_list_len) 661 flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST; 662 else 663 flags |= MSG_EOR; 664 665 ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->r2t_pdu), 666 offset_in_page(cmd->r2t_pdu) + cmd->offset, left, flags); 667 if (ret <= 0) 668 return ret; 669 cmd->offset += ret; 670 left -= ret; 671 672 if (left) 673 return -EAGAIN; 674 675 cmd->queue->snd_cmd = NULL; 676 return 1; 677 } 678 679 static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd, bool last_in_batch) 680 { 681 struct nvmet_tcp_queue *queue = cmd->queue; 682 int left = NVME_TCP_DIGEST_LENGTH - cmd->offset; 683 struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; 684 struct kvec iov = { 685 .iov_base = (u8 *)&cmd->exp_ddgst + cmd->offset, 686 .iov_len = left 687 }; 688 int ret; 689 690 if (!last_in_batch && cmd->queue->send_list_len) 691 msg.msg_flags |= MSG_MORE; 692 else 693 msg.msg_flags |= MSG_EOR; 694 695 ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); 696 if (unlikely(ret <= 0)) 697 return ret; 698 699 cmd->offset += ret; 700 left -= ret; 701 702 if (left) 703 return -EAGAIN; 704 705 if (queue->nvme_sq.sqhd_disabled) { 706 cmd->queue->snd_cmd = NULL; 707 nvmet_tcp_put_cmd(cmd); 708 } else { 709 nvmet_setup_response_pdu(cmd); 710 } 711 return 1; 712 } 713 714 static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue, 715 bool last_in_batch) 716 { 717 struct nvmet_tcp_cmd *cmd = queue->snd_cmd; 718 int ret = 0; 719 720 if (!cmd || queue->state == NVMET_TCP_Q_DISCONNECTING) { 721 cmd = nvmet_tcp_fetch_cmd(queue); 722 if (unlikely(!cmd)) 723 return 0; 724 } 725 726 if (cmd->state == NVMET_TCP_SEND_DATA_PDU) { 727 ret = nvmet_try_send_data_pdu(cmd); 728 if (ret <= 0) 729 goto done_send; 730 } 731 732 if (cmd->state == NVMET_TCP_SEND_DATA) { 733 ret = nvmet_try_send_data(cmd, last_in_batch); 734 if (ret <= 0) 735 goto done_send; 736 } 737 738 if (cmd->state == NVMET_TCP_SEND_DDGST) { 739 ret = nvmet_try_send_ddgst(cmd, last_in_batch); 740 if (ret <= 0) 741 goto done_send; 742 } 743 744 if (cmd->state == NVMET_TCP_SEND_R2T) { 745 ret = nvmet_try_send_r2t(cmd, last_in_batch); 746 if (ret <= 0) 747 goto done_send; 748 } 749 750 if (cmd->state == NVMET_TCP_SEND_RESPONSE) 751 ret = nvmet_try_send_response(cmd, last_in_batch); 752 753 done_send: 754 if (ret < 0) { 755 if (ret == -EAGAIN) 756 return 0; 757 return ret; 758 } 759 760 return 1; 761 } 762 763 static int nvmet_tcp_try_send(struct nvmet_tcp_queue *queue, 764 int budget, int *sends) 765 { 766 int i, ret = 0; 767 768 for (i = 0; i < budget; i++) { 769 ret = nvmet_tcp_try_send_one(queue, i == budget - 1); 770 if (unlikely(ret < 0)) { 771 nvmet_tcp_socket_error(queue, ret); 772 goto done; 773 } else if (ret == 0) { 774 break; 775 } 776 (*sends)++; 777 } 778 done: 779 return ret; 780 } 781 782 static void nvmet_prepare_receive_pdu(struct nvmet_tcp_queue *queue) 783 { 784 queue->offset = 0; 785 queue->left = sizeof(struct nvme_tcp_hdr); 786 queue->cmd = NULL; 787 queue->rcv_state = NVMET_TCP_RECV_PDU; 788 } 789 790 static void nvmet_tcp_free_crypto(struct nvmet_tcp_queue *queue) 791 { 792 struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash); 793 794 ahash_request_free(queue->rcv_hash); 795 ahash_request_free(queue->snd_hash); 796 crypto_free_ahash(tfm); 797 } 798 799 static int nvmet_tcp_alloc_crypto(struct nvmet_tcp_queue *queue) 800 { 801 struct crypto_ahash *tfm; 802 803 tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC); 804 if (IS_ERR(tfm)) 805 return PTR_ERR(tfm); 806 807 queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL); 808 if (!queue->snd_hash) 809 goto free_tfm; 810 ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL); 811 812 queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL); 813 if (!queue->rcv_hash) 814 goto free_snd_hash; 815 ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL); 816 817 return 0; 818 free_snd_hash: 819 ahash_request_free(queue->snd_hash); 820 free_tfm: 821 crypto_free_ahash(tfm); 822 return -ENOMEM; 823 } 824 825 826 static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue) 827 { 828 struct nvme_tcp_icreq_pdu *icreq = &queue->pdu.icreq; 829 struct nvme_tcp_icresp_pdu *icresp = &queue->pdu.icresp; 830 struct msghdr msg = {}; 831 struct kvec iov; 832 int ret; 833 834 if (le32_to_cpu(icreq->hdr.plen) != sizeof(struct nvme_tcp_icreq_pdu)) { 835 pr_err("bad nvme-tcp pdu length (%d)\n", 836 le32_to_cpu(icreq->hdr.plen)); 837 nvmet_tcp_fatal_error(queue); 838 } 839 840 if (icreq->pfv != NVME_TCP_PFV_1_0) { 841 pr_err("queue %d: bad pfv %d\n", queue->idx, icreq->pfv); 842 return -EPROTO; 843 } 844 845 if (icreq->hpda != 0) { 846 pr_err("queue %d: unsupported hpda %d\n", queue->idx, 847 icreq->hpda); 848 return -EPROTO; 849 } 850 851 queue->hdr_digest = !!(icreq->digest & NVME_TCP_HDR_DIGEST_ENABLE); 852 queue->data_digest = !!(icreq->digest & NVME_TCP_DATA_DIGEST_ENABLE); 853 if (queue->hdr_digest || queue->data_digest) { 854 ret = nvmet_tcp_alloc_crypto(queue); 855 if (ret) 856 return ret; 857 } 858 859 memset(icresp, 0, sizeof(*icresp)); 860 icresp->hdr.type = nvme_tcp_icresp; 861 icresp->hdr.hlen = sizeof(*icresp); 862 icresp->hdr.pdo = 0; 863 icresp->hdr.plen = cpu_to_le32(icresp->hdr.hlen); 864 icresp->pfv = cpu_to_le16(NVME_TCP_PFV_1_0); 865 icresp->maxdata = cpu_to_le32(0x400000); /* 16M arbitrary limit */ 866 icresp->cpda = 0; 867 if (queue->hdr_digest) 868 icresp->digest |= NVME_TCP_HDR_DIGEST_ENABLE; 869 if (queue->data_digest) 870 icresp->digest |= NVME_TCP_DATA_DIGEST_ENABLE; 871 872 iov.iov_base = icresp; 873 iov.iov_len = sizeof(*icresp); 874 ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); 875 if (ret < 0) 876 goto free_crypto; 877 878 queue->state = NVMET_TCP_Q_LIVE; 879 nvmet_prepare_receive_pdu(queue); 880 return 0; 881 free_crypto: 882 if (queue->hdr_digest || queue->data_digest) 883 nvmet_tcp_free_crypto(queue); 884 return ret; 885 } 886 887 static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue, 888 struct nvmet_tcp_cmd *cmd, struct nvmet_req *req) 889 { 890 size_t data_len = le32_to_cpu(req->cmd->common.dptr.sgl.length); 891 int ret; 892 893 /* 894 * This command has not been processed yet, hence we are trying to 895 * figure out if there is still pending data left to receive. If 896 * we don't, we can simply prepare for the next pdu and bail out, 897 * otherwise we will need to prepare a buffer and receive the 898 * stale data before continuing forward. 899 */ 900 if (!nvme_is_write(cmd->req.cmd) || !data_len || 901 data_len > cmd->req.port->inline_data_size) { 902 nvmet_prepare_receive_pdu(queue); 903 return; 904 } 905 906 ret = nvmet_tcp_map_data(cmd); 907 if (unlikely(ret)) { 908 pr_err("queue %d: failed to map data\n", queue->idx); 909 nvmet_tcp_fatal_error(queue); 910 return; 911 } 912 913 queue->rcv_state = NVMET_TCP_RECV_DATA; 914 nvmet_tcp_build_pdu_iovec(cmd); 915 cmd->flags |= NVMET_TCP_F_INIT_FAILED; 916 } 917 918 static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue) 919 { 920 struct nvme_tcp_data_pdu *data = &queue->pdu.data; 921 struct nvmet_tcp_cmd *cmd; 922 923 if (likely(queue->nr_cmds)) { 924 if (unlikely(data->ttag >= queue->nr_cmds)) { 925 pr_err("queue %d: received out of bound ttag %u, nr_cmds %u\n", 926 queue->idx, data->ttag, queue->nr_cmds); 927 nvmet_tcp_fatal_error(queue); 928 return -EPROTO; 929 } 930 cmd = &queue->cmds[data->ttag]; 931 } else { 932 cmd = &queue->connect; 933 } 934 935 if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) { 936 pr_err("ttag %u unexpected data offset %u (expected %u)\n", 937 data->ttag, le32_to_cpu(data->data_offset), 938 cmd->rbytes_done); 939 /* FIXME: use path and transport errors */ 940 nvmet_req_complete(&cmd->req, 941 NVME_SC_INVALID_FIELD | NVME_SC_DNR); 942 return -EPROTO; 943 } 944 945 cmd->pdu_len = le32_to_cpu(data->data_length); 946 cmd->pdu_recv = 0; 947 nvmet_tcp_build_pdu_iovec(cmd); 948 queue->cmd = cmd; 949 queue->rcv_state = NVMET_TCP_RECV_DATA; 950 951 return 0; 952 } 953 954 static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue) 955 { 956 struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr; 957 struct nvme_command *nvme_cmd = &queue->pdu.cmd.cmd; 958 struct nvmet_req *req; 959 int ret; 960 961 if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) { 962 if (hdr->type != nvme_tcp_icreq) { 963 pr_err("unexpected pdu type (%d) before icreq\n", 964 hdr->type); 965 nvmet_tcp_fatal_error(queue); 966 return -EPROTO; 967 } 968 return nvmet_tcp_handle_icreq(queue); 969 } 970 971 if (unlikely(hdr->type == nvme_tcp_icreq)) { 972 pr_err("queue %d: received icreq pdu in state %d\n", 973 queue->idx, queue->state); 974 nvmet_tcp_fatal_error(queue); 975 return -EPROTO; 976 } 977 978 if (hdr->type == nvme_tcp_h2c_data) { 979 ret = nvmet_tcp_handle_h2c_data_pdu(queue); 980 if (unlikely(ret)) 981 return ret; 982 return 0; 983 } 984 985 queue->cmd = nvmet_tcp_get_cmd(queue); 986 if (unlikely(!queue->cmd)) { 987 /* This should never happen */ 988 pr_err("queue %d: out of commands (%d) send_list_len: %d, opcode: %d", 989 queue->idx, queue->nr_cmds, queue->send_list_len, 990 nvme_cmd->common.opcode); 991 nvmet_tcp_fatal_error(queue); 992 return -ENOMEM; 993 } 994 995 req = &queue->cmd->req; 996 memcpy(req->cmd, nvme_cmd, sizeof(*nvme_cmd)); 997 998 if (unlikely(!nvmet_req_init(req, &queue->nvme_cq, 999 &queue->nvme_sq, &nvmet_tcp_ops))) { 1000 pr_err("failed cmd %p id %d opcode %d, data_len: %d\n", 1001 req->cmd, req->cmd->common.command_id, 1002 req->cmd->common.opcode, 1003 le32_to_cpu(req->cmd->common.dptr.sgl.length)); 1004 1005 nvmet_tcp_handle_req_failure(queue, queue->cmd, req); 1006 return 0; 1007 } 1008 1009 ret = nvmet_tcp_map_data(queue->cmd); 1010 if (unlikely(ret)) { 1011 pr_err("queue %d: failed to map data\n", queue->idx); 1012 if (nvmet_tcp_has_inline_data(queue->cmd)) 1013 nvmet_tcp_fatal_error(queue); 1014 else 1015 nvmet_req_complete(req, ret); 1016 ret = -EAGAIN; 1017 goto out; 1018 } 1019 1020 if (nvmet_tcp_need_data_in(queue->cmd)) { 1021 if (nvmet_tcp_has_inline_data(queue->cmd)) { 1022 queue->rcv_state = NVMET_TCP_RECV_DATA; 1023 nvmet_tcp_build_pdu_iovec(queue->cmd); 1024 return 0; 1025 } 1026 /* send back R2T */ 1027 nvmet_tcp_queue_response(&queue->cmd->req); 1028 goto out; 1029 } 1030 1031 queue->cmd->req.execute(&queue->cmd->req); 1032 out: 1033 nvmet_prepare_receive_pdu(queue); 1034 return ret; 1035 } 1036 1037 static const u8 nvme_tcp_pdu_sizes[] = { 1038 [nvme_tcp_icreq] = sizeof(struct nvme_tcp_icreq_pdu), 1039 [nvme_tcp_cmd] = sizeof(struct nvme_tcp_cmd_pdu), 1040 [nvme_tcp_h2c_data] = sizeof(struct nvme_tcp_data_pdu), 1041 }; 1042 1043 static inline u8 nvmet_tcp_pdu_size(u8 type) 1044 { 1045 size_t idx = type; 1046 1047 return (idx < ARRAY_SIZE(nvme_tcp_pdu_sizes) && 1048 nvme_tcp_pdu_sizes[idx]) ? 1049 nvme_tcp_pdu_sizes[idx] : 0; 1050 } 1051 1052 static inline bool nvmet_tcp_pdu_valid(u8 type) 1053 { 1054 switch (type) { 1055 case nvme_tcp_icreq: 1056 case nvme_tcp_cmd: 1057 case nvme_tcp_h2c_data: 1058 /* fallthru */ 1059 return true; 1060 } 1061 1062 return false; 1063 } 1064 1065 static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue) 1066 { 1067 struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr; 1068 int len; 1069 struct kvec iov; 1070 struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; 1071 1072 recv: 1073 iov.iov_base = (void *)&queue->pdu + queue->offset; 1074 iov.iov_len = queue->left; 1075 len = kernel_recvmsg(queue->sock, &msg, &iov, 1, 1076 iov.iov_len, msg.msg_flags); 1077 if (unlikely(len < 0)) 1078 return len; 1079 1080 queue->offset += len; 1081 queue->left -= len; 1082 if (queue->left) 1083 return -EAGAIN; 1084 1085 if (queue->offset == sizeof(struct nvme_tcp_hdr)) { 1086 u8 hdgst = nvmet_tcp_hdgst_len(queue); 1087 1088 if (unlikely(!nvmet_tcp_pdu_valid(hdr->type))) { 1089 pr_err("unexpected pdu type %d\n", hdr->type); 1090 nvmet_tcp_fatal_error(queue); 1091 return -EIO; 1092 } 1093 1094 if (unlikely(hdr->hlen != nvmet_tcp_pdu_size(hdr->type))) { 1095 pr_err("pdu %d bad hlen %d\n", hdr->type, hdr->hlen); 1096 return -EIO; 1097 } 1098 1099 queue->left = hdr->hlen - queue->offset + hdgst; 1100 goto recv; 1101 } 1102 1103 if (queue->hdr_digest && 1104 nvmet_tcp_verify_hdgst(queue, &queue->pdu, hdr->hlen)) { 1105 nvmet_tcp_fatal_error(queue); /* fatal */ 1106 return -EPROTO; 1107 } 1108 1109 if (queue->data_digest && 1110 nvmet_tcp_check_ddgst(queue, &queue->pdu)) { 1111 nvmet_tcp_fatal_error(queue); /* fatal */ 1112 return -EPROTO; 1113 } 1114 1115 return nvmet_tcp_done_recv_pdu(queue); 1116 } 1117 1118 static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd) 1119 { 1120 struct nvmet_tcp_queue *queue = cmd->queue; 1121 1122 nvmet_tcp_calc_ddgst(queue->rcv_hash, cmd); 1123 queue->offset = 0; 1124 queue->left = NVME_TCP_DIGEST_LENGTH; 1125 queue->rcv_state = NVMET_TCP_RECV_DDGST; 1126 } 1127 1128 static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue) 1129 { 1130 struct nvmet_tcp_cmd *cmd = queue->cmd; 1131 int ret; 1132 1133 while (msg_data_left(&cmd->recv_msg)) { 1134 ret = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg, 1135 cmd->recv_msg.msg_flags); 1136 if (ret <= 0) 1137 return ret; 1138 1139 cmd->pdu_recv += ret; 1140 cmd->rbytes_done += ret; 1141 } 1142 1143 if (queue->data_digest) { 1144 nvmet_tcp_prep_recv_ddgst(cmd); 1145 return 0; 1146 } 1147 1148 if (cmd->rbytes_done == cmd->req.transfer_len) 1149 nvmet_tcp_execute_request(cmd); 1150 1151 nvmet_prepare_receive_pdu(queue); 1152 return 0; 1153 } 1154 1155 static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue) 1156 { 1157 struct nvmet_tcp_cmd *cmd = queue->cmd; 1158 int ret; 1159 struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; 1160 struct kvec iov = { 1161 .iov_base = (void *)&cmd->recv_ddgst + queue->offset, 1162 .iov_len = queue->left 1163 }; 1164 1165 ret = kernel_recvmsg(queue->sock, &msg, &iov, 1, 1166 iov.iov_len, msg.msg_flags); 1167 if (unlikely(ret < 0)) 1168 return ret; 1169 1170 queue->offset += ret; 1171 queue->left -= ret; 1172 if (queue->left) 1173 return -EAGAIN; 1174 1175 if (queue->data_digest && cmd->exp_ddgst != cmd->recv_ddgst) { 1176 pr_err("queue %d: cmd %d pdu (%d) data digest error: recv %#x expected %#x\n", 1177 queue->idx, cmd->req.cmd->common.command_id, 1178 queue->pdu.cmd.hdr.type, le32_to_cpu(cmd->recv_ddgst), 1179 le32_to_cpu(cmd->exp_ddgst)); 1180 nvmet_req_uninit(&cmd->req); 1181 nvmet_tcp_free_cmd_buffers(cmd); 1182 nvmet_tcp_fatal_error(queue); 1183 ret = -EPROTO; 1184 goto out; 1185 } 1186 1187 if (cmd->rbytes_done == cmd->req.transfer_len) 1188 nvmet_tcp_execute_request(cmd); 1189 1190 ret = 0; 1191 out: 1192 nvmet_prepare_receive_pdu(queue); 1193 return ret; 1194 } 1195 1196 static int nvmet_tcp_try_recv_one(struct nvmet_tcp_queue *queue) 1197 { 1198 int result = 0; 1199 1200 if (unlikely(queue->rcv_state == NVMET_TCP_RECV_ERR)) 1201 return 0; 1202 1203 if (queue->rcv_state == NVMET_TCP_RECV_PDU) { 1204 result = nvmet_tcp_try_recv_pdu(queue); 1205 if (result != 0) 1206 goto done_recv; 1207 } 1208 1209 if (queue->rcv_state == NVMET_TCP_RECV_DATA) { 1210 result = nvmet_tcp_try_recv_data(queue); 1211 if (result != 0) 1212 goto done_recv; 1213 } 1214 1215 if (queue->rcv_state == NVMET_TCP_RECV_DDGST) { 1216 result = nvmet_tcp_try_recv_ddgst(queue); 1217 if (result != 0) 1218 goto done_recv; 1219 } 1220 1221 done_recv: 1222 if (result < 0) { 1223 if (result == -EAGAIN) 1224 return 0; 1225 return result; 1226 } 1227 return 1; 1228 } 1229 1230 static int nvmet_tcp_try_recv(struct nvmet_tcp_queue *queue, 1231 int budget, int *recvs) 1232 { 1233 int i, ret = 0; 1234 1235 for (i = 0; i < budget; i++) { 1236 ret = nvmet_tcp_try_recv_one(queue); 1237 if (unlikely(ret < 0)) { 1238 nvmet_tcp_socket_error(queue, ret); 1239 goto done; 1240 } else if (ret == 0) { 1241 break; 1242 } 1243 (*recvs)++; 1244 } 1245 done: 1246 return ret; 1247 } 1248 1249 static void nvmet_tcp_schedule_release_queue(struct nvmet_tcp_queue *queue) 1250 { 1251 spin_lock(&queue->state_lock); 1252 if (queue->state != NVMET_TCP_Q_DISCONNECTING) { 1253 queue->state = NVMET_TCP_Q_DISCONNECTING; 1254 queue_work(nvmet_wq, &queue->release_work); 1255 } 1256 spin_unlock(&queue->state_lock); 1257 } 1258 1259 static inline void nvmet_tcp_arm_queue_deadline(struct nvmet_tcp_queue *queue) 1260 { 1261 queue->poll_end = jiffies + usecs_to_jiffies(idle_poll_period_usecs); 1262 } 1263 1264 static bool nvmet_tcp_check_queue_deadline(struct nvmet_tcp_queue *queue, 1265 int ops) 1266 { 1267 if (!idle_poll_period_usecs) 1268 return false; 1269 1270 if (ops) 1271 nvmet_tcp_arm_queue_deadline(queue); 1272 1273 return !time_after(jiffies, queue->poll_end); 1274 } 1275 1276 static void nvmet_tcp_io_work(struct work_struct *w) 1277 { 1278 struct nvmet_tcp_queue *queue = 1279 container_of(w, struct nvmet_tcp_queue, io_work); 1280 bool pending; 1281 int ret, ops = 0; 1282 1283 do { 1284 pending = false; 1285 1286 ret = nvmet_tcp_try_recv(queue, NVMET_TCP_RECV_BUDGET, &ops); 1287 if (ret > 0) 1288 pending = true; 1289 else if (ret < 0) 1290 return; 1291 1292 ret = nvmet_tcp_try_send(queue, NVMET_TCP_SEND_BUDGET, &ops); 1293 if (ret > 0) 1294 pending = true; 1295 else if (ret < 0) 1296 return; 1297 1298 } while (pending && ops < NVMET_TCP_IO_WORK_BUDGET); 1299 1300 /* 1301 * Requeue the worker if idle deadline period is in progress or any 1302 * ops activity was recorded during the do-while loop above. 1303 */ 1304 if (nvmet_tcp_check_queue_deadline(queue, ops) || pending) 1305 queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work); 1306 } 1307 1308 static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue, 1309 struct nvmet_tcp_cmd *c) 1310 { 1311 u8 hdgst = nvmet_tcp_hdgst_len(queue); 1312 1313 c->queue = queue; 1314 c->req.port = queue->port->nport; 1315 1316 c->cmd_pdu = page_frag_alloc(&queue->pf_cache, 1317 sizeof(*c->cmd_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO); 1318 if (!c->cmd_pdu) 1319 return -ENOMEM; 1320 c->req.cmd = &c->cmd_pdu->cmd; 1321 1322 c->rsp_pdu = page_frag_alloc(&queue->pf_cache, 1323 sizeof(*c->rsp_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO); 1324 if (!c->rsp_pdu) 1325 goto out_free_cmd; 1326 c->req.cqe = &c->rsp_pdu->cqe; 1327 1328 c->data_pdu = page_frag_alloc(&queue->pf_cache, 1329 sizeof(*c->data_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO); 1330 if (!c->data_pdu) 1331 goto out_free_rsp; 1332 1333 c->r2t_pdu = page_frag_alloc(&queue->pf_cache, 1334 sizeof(*c->r2t_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO); 1335 if (!c->r2t_pdu) 1336 goto out_free_data; 1337 1338 c->recv_msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; 1339 1340 list_add_tail(&c->entry, &queue->free_list); 1341 1342 return 0; 1343 out_free_data: 1344 page_frag_free(c->data_pdu); 1345 out_free_rsp: 1346 page_frag_free(c->rsp_pdu); 1347 out_free_cmd: 1348 page_frag_free(c->cmd_pdu); 1349 return -ENOMEM; 1350 } 1351 1352 static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c) 1353 { 1354 page_frag_free(c->r2t_pdu); 1355 page_frag_free(c->data_pdu); 1356 page_frag_free(c->rsp_pdu); 1357 page_frag_free(c->cmd_pdu); 1358 } 1359 1360 static int nvmet_tcp_alloc_cmds(struct nvmet_tcp_queue *queue) 1361 { 1362 struct nvmet_tcp_cmd *cmds; 1363 int i, ret = -EINVAL, nr_cmds = queue->nr_cmds; 1364 1365 cmds = kcalloc(nr_cmds, sizeof(struct nvmet_tcp_cmd), GFP_KERNEL); 1366 if (!cmds) 1367 goto out; 1368 1369 for (i = 0; i < nr_cmds; i++) { 1370 ret = nvmet_tcp_alloc_cmd(queue, cmds + i); 1371 if (ret) 1372 goto out_free; 1373 } 1374 1375 queue->cmds = cmds; 1376 1377 return 0; 1378 out_free: 1379 while (--i >= 0) 1380 nvmet_tcp_free_cmd(cmds + i); 1381 kfree(cmds); 1382 out: 1383 return ret; 1384 } 1385 1386 static void nvmet_tcp_free_cmds(struct nvmet_tcp_queue *queue) 1387 { 1388 struct nvmet_tcp_cmd *cmds = queue->cmds; 1389 int i; 1390 1391 for (i = 0; i < queue->nr_cmds; i++) 1392 nvmet_tcp_free_cmd(cmds + i); 1393 1394 nvmet_tcp_free_cmd(&queue->connect); 1395 kfree(cmds); 1396 } 1397 1398 static void nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue) 1399 { 1400 struct socket *sock = queue->sock; 1401 1402 write_lock_bh(&sock->sk->sk_callback_lock); 1403 sock->sk->sk_data_ready = queue->data_ready; 1404 sock->sk->sk_state_change = queue->state_change; 1405 sock->sk->sk_write_space = queue->write_space; 1406 sock->sk->sk_user_data = NULL; 1407 write_unlock_bh(&sock->sk->sk_callback_lock); 1408 } 1409 1410 static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue) 1411 { 1412 struct nvmet_tcp_cmd *cmd = queue->cmds; 1413 int i; 1414 1415 for (i = 0; i < queue->nr_cmds; i++, cmd++) { 1416 if (nvmet_tcp_need_data_in(cmd)) 1417 nvmet_req_uninit(&cmd->req); 1418 } 1419 1420 if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect)) { 1421 /* failed in connect */ 1422 nvmet_req_uninit(&queue->connect.req); 1423 } 1424 } 1425 1426 static void nvmet_tcp_free_cmd_data_in_buffers(struct nvmet_tcp_queue *queue) 1427 { 1428 struct nvmet_tcp_cmd *cmd = queue->cmds; 1429 int i; 1430 1431 for (i = 0; i < queue->nr_cmds; i++, cmd++) { 1432 if (nvmet_tcp_need_data_in(cmd)) 1433 nvmet_tcp_free_cmd_buffers(cmd); 1434 } 1435 1436 if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect)) 1437 nvmet_tcp_free_cmd_buffers(&queue->connect); 1438 } 1439 1440 static void nvmet_tcp_release_queue_work(struct work_struct *w) 1441 { 1442 struct page *page; 1443 struct nvmet_tcp_queue *queue = 1444 container_of(w, struct nvmet_tcp_queue, release_work); 1445 1446 mutex_lock(&nvmet_tcp_queue_mutex); 1447 list_del_init(&queue->queue_list); 1448 mutex_unlock(&nvmet_tcp_queue_mutex); 1449 1450 nvmet_tcp_restore_socket_callbacks(queue); 1451 cancel_work_sync(&queue->io_work); 1452 /* stop accepting incoming data */ 1453 queue->rcv_state = NVMET_TCP_RECV_ERR; 1454 1455 nvmet_tcp_uninit_data_in_cmds(queue); 1456 nvmet_sq_destroy(&queue->nvme_sq); 1457 cancel_work_sync(&queue->io_work); 1458 nvmet_tcp_free_cmd_data_in_buffers(queue); 1459 sock_release(queue->sock); 1460 nvmet_tcp_free_cmds(queue); 1461 if (queue->hdr_digest || queue->data_digest) 1462 nvmet_tcp_free_crypto(queue); 1463 ida_free(&nvmet_tcp_queue_ida, queue->idx); 1464 1465 page = virt_to_head_page(queue->pf_cache.va); 1466 __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias); 1467 kfree(queue); 1468 } 1469 1470 static void nvmet_tcp_data_ready(struct sock *sk) 1471 { 1472 struct nvmet_tcp_queue *queue; 1473 1474 trace_sk_data_ready(sk); 1475 1476 read_lock_bh(&sk->sk_callback_lock); 1477 queue = sk->sk_user_data; 1478 if (likely(queue)) 1479 queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work); 1480 read_unlock_bh(&sk->sk_callback_lock); 1481 } 1482 1483 static void nvmet_tcp_write_space(struct sock *sk) 1484 { 1485 struct nvmet_tcp_queue *queue; 1486 1487 read_lock_bh(&sk->sk_callback_lock); 1488 queue = sk->sk_user_data; 1489 if (unlikely(!queue)) 1490 goto out; 1491 1492 if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) { 1493 queue->write_space(sk); 1494 goto out; 1495 } 1496 1497 if (sk_stream_is_writeable(sk)) { 1498 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 1499 queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work); 1500 } 1501 out: 1502 read_unlock_bh(&sk->sk_callback_lock); 1503 } 1504 1505 static void nvmet_tcp_state_change(struct sock *sk) 1506 { 1507 struct nvmet_tcp_queue *queue; 1508 1509 read_lock_bh(&sk->sk_callback_lock); 1510 queue = sk->sk_user_data; 1511 if (!queue) 1512 goto done; 1513 1514 switch (sk->sk_state) { 1515 case TCP_FIN_WAIT2: 1516 case TCP_LAST_ACK: 1517 break; 1518 case TCP_FIN_WAIT1: 1519 case TCP_CLOSE_WAIT: 1520 case TCP_CLOSE: 1521 /* FALLTHRU */ 1522 nvmet_tcp_schedule_release_queue(queue); 1523 break; 1524 default: 1525 pr_warn("queue %d unhandled state %d\n", 1526 queue->idx, sk->sk_state); 1527 } 1528 done: 1529 read_unlock_bh(&sk->sk_callback_lock); 1530 } 1531 1532 static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) 1533 { 1534 struct socket *sock = queue->sock; 1535 struct inet_sock *inet = inet_sk(sock->sk); 1536 int ret; 1537 1538 ret = kernel_getsockname(sock, 1539 (struct sockaddr *)&queue->sockaddr); 1540 if (ret < 0) 1541 return ret; 1542 1543 ret = kernel_getpeername(sock, 1544 (struct sockaddr *)&queue->sockaddr_peer); 1545 if (ret < 0) 1546 return ret; 1547 1548 /* 1549 * Cleanup whatever is sitting in the TCP transmit queue on socket 1550 * close. This is done to prevent stale data from being sent should 1551 * the network connection be restored before TCP times out. 1552 */ 1553 sock_no_linger(sock->sk); 1554 1555 if (so_priority > 0) 1556 sock_set_priority(sock->sk, so_priority); 1557 1558 /* Set socket type of service */ 1559 if (inet->rcv_tos > 0) 1560 ip_sock_set_tos(sock->sk, inet->rcv_tos); 1561 1562 ret = 0; 1563 write_lock_bh(&sock->sk->sk_callback_lock); 1564 if (sock->sk->sk_state != TCP_ESTABLISHED) { 1565 /* 1566 * If the socket is already closing, don't even start 1567 * consuming it 1568 */ 1569 ret = -ENOTCONN; 1570 } else { 1571 sock->sk->sk_user_data = queue; 1572 queue->data_ready = sock->sk->sk_data_ready; 1573 sock->sk->sk_data_ready = nvmet_tcp_data_ready; 1574 queue->state_change = sock->sk->sk_state_change; 1575 sock->sk->sk_state_change = nvmet_tcp_state_change; 1576 queue->write_space = sock->sk->sk_write_space; 1577 sock->sk->sk_write_space = nvmet_tcp_write_space; 1578 if (idle_poll_period_usecs) 1579 nvmet_tcp_arm_queue_deadline(queue); 1580 queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work); 1581 } 1582 write_unlock_bh(&sock->sk->sk_callback_lock); 1583 1584 return ret; 1585 } 1586 1587 static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port, 1588 struct socket *newsock) 1589 { 1590 struct nvmet_tcp_queue *queue; 1591 int ret; 1592 1593 queue = kzalloc(sizeof(*queue), GFP_KERNEL); 1594 if (!queue) 1595 return -ENOMEM; 1596 1597 INIT_WORK(&queue->release_work, nvmet_tcp_release_queue_work); 1598 INIT_WORK(&queue->io_work, nvmet_tcp_io_work); 1599 queue->sock = newsock; 1600 queue->port = port; 1601 queue->nr_cmds = 0; 1602 spin_lock_init(&queue->state_lock); 1603 queue->state = NVMET_TCP_Q_CONNECTING; 1604 INIT_LIST_HEAD(&queue->free_list); 1605 init_llist_head(&queue->resp_list); 1606 INIT_LIST_HEAD(&queue->resp_send_list); 1607 1608 queue->idx = ida_alloc(&nvmet_tcp_queue_ida, GFP_KERNEL); 1609 if (queue->idx < 0) { 1610 ret = queue->idx; 1611 goto out_free_queue; 1612 } 1613 1614 ret = nvmet_tcp_alloc_cmd(queue, &queue->connect); 1615 if (ret) 1616 goto out_ida_remove; 1617 1618 ret = nvmet_sq_init(&queue->nvme_sq); 1619 if (ret) 1620 goto out_free_connect; 1621 1622 nvmet_prepare_receive_pdu(queue); 1623 1624 mutex_lock(&nvmet_tcp_queue_mutex); 1625 list_add_tail(&queue->queue_list, &nvmet_tcp_queue_list); 1626 mutex_unlock(&nvmet_tcp_queue_mutex); 1627 1628 ret = nvmet_tcp_set_queue_sock(queue); 1629 if (ret) 1630 goto out_destroy_sq; 1631 1632 return 0; 1633 out_destroy_sq: 1634 mutex_lock(&nvmet_tcp_queue_mutex); 1635 list_del_init(&queue->queue_list); 1636 mutex_unlock(&nvmet_tcp_queue_mutex); 1637 nvmet_sq_destroy(&queue->nvme_sq); 1638 out_free_connect: 1639 nvmet_tcp_free_cmd(&queue->connect); 1640 out_ida_remove: 1641 ida_free(&nvmet_tcp_queue_ida, queue->idx); 1642 out_free_queue: 1643 kfree(queue); 1644 return ret; 1645 } 1646 1647 static void nvmet_tcp_accept_work(struct work_struct *w) 1648 { 1649 struct nvmet_tcp_port *port = 1650 container_of(w, struct nvmet_tcp_port, accept_work); 1651 struct socket *newsock; 1652 int ret; 1653 1654 while (true) { 1655 ret = kernel_accept(port->sock, &newsock, O_NONBLOCK); 1656 if (ret < 0) { 1657 if (ret != -EAGAIN) 1658 pr_warn("failed to accept err=%d\n", ret); 1659 return; 1660 } 1661 ret = nvmet_tcp_alloc_queue(port, newsock); 1662 if (ret) { 1663 pr_err("failed to allocate queue\n"); 1664 sock_release(newsock); 1665 } 1666 } 1667 } 1668 1669 static void nvmet_tcp_listen_data_ready(struct sock *sk) 1670 { 1671 struct nvmet_tcp_port *port; 1672 1673 trace_sk_data_ready(sk); 1674 1675 read_lock_bh(&sk->sk_callback_lock); 1676 port = sk->sk_user_data; 1677 if (!port) 1678 goto out; 1679 1680 if (sk->sk_state == TCP_LISTEN) 1681 queue_work(nvmet_wq, &port->accept_work); 1682 out: 1683 read_unlock_bh(&sk->sk_callback_lock); 1684 } 1685 1686 static int nvmet_tcp_add_port(struct nvmet_port *nport) 1687 { 1688 struct nvmet_tcp_port *port; 1689 __kernel_sa_family_t af; 1690 int ret; 1691 1692 port = kzalloc(sizeof(*port), GFP_KERNEL); 1693 if (!port) 1694 return -ENOMEM; 1695 1696 switch (nport->disc_addr.adrfam) { 1697 case NVMF_ADDR_FAMILY_IP4: 1698 af = AF_INET; 1699 break; 1700 case NVMF_ADDR_FAMILY_IP6: 1701 af = AF_INET6; 1702 break; 1703 default: 1704 pr_err("address family %d not supported\n", 1705 nport->disc_addr.adrfam); 1706 ret = -EINVAL; 1707 goto err_port; 1708 } 1709 1710 ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr, 1711 nport->disc_addr.trsvcid, &port->addr); 1712 if (ret) { 1713 pr_err("malformed ip/port passed: %s:%s\n", 1714 nport->disc_addr.traddr, nport->disc_addr.trsvcid); 1715 goto err_port; 1716 } 1717 1718 port->nport = nport; 1719 INIT_WORK(&port->accept_work, nvmet_tcp_accept_work); 1720 if (port->nport->inline_data_size < 0) 1721 port->nport->inline_data_size = NVMET_TCP_DEF_INLINE_DATA_SIZE; 1722 1723 ret = sock_create(port->addr.ss_family, SOCK_STREAM, 1724 IPPROTO_TCP, &port->sock); 1725 if (ret) { 1726 pr_err("failed to create a socket\n"); 1727 goto err_port; 1728 } 1729 1730 port->sock->sk->sk_user_data = port; 1731 port->data_ready = port->sock->sk->sk_data_ready; 1732 port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready; 1733 sock_set_reuseaddr(port->sock->sk); 1734 tcp_sock_set_nodelay(port->sock->sk); 1735 if (so_priority > 0) 1736 sock_set_priority(port->sock->sk, so_priority); 1737 1738 ret = kernel_bind(port->sock, (struct sockaddr *)&port->addr, 1739 sizeof(port->addr)); 1740 if (ret) { 1741 pr_err("failed to bind port socket %d\n", ret); 1742 goto err_sock; 1743 } 1744 1745 ret = kernel_listen(port->sock, 128); 1746 if (ret) { 1747 pr_err("failed to listen %d on port sock\n", ret); 1748 goto err_sock; 1749 } 1750 1751 nport->priv = port; 1752 pr_info("enabling port %d (%pISpc)\n", 1753 le16_to_cpu(nport->disc_addr.portid), &port->addr); 1754 1755 return 0; 1756 1757 err_sock: 1758 sock_release(port->sock); 1759 err_port: 1760 kfree(port); 1761 return ret; 1762 } 1763 1764 static void nvmet_tcp_destroy_port_queues(struct nvmet_tcp_port *port) 1765 { 1766 struct nvmet_tcp_queue *queue; 1767 1768 mutex_lock(&nvmet_tcp_queue_mutex); 1769 list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list) 1770 if (queue->port == port) 1771 kernel_sock_shutdown(queue->sock, SHUT_RDWR); 1772 mutex_unlock(&nvmet_tcp_queue_mutex); 1773 } 1774 1775 static void nvmet_tcp_remove_port(struct nvmet_port *nport) 1776 { 1777 struct nvmet_tcp_port *port = nport->priv; 1778 1779 write_lock_bh(&port->sock->sk->sk_callback_lock); 1780 port->sock->sk->sk_data_ready = port->data_ready; 1781 port->sock->sk->sk_user_data = NULL; 1782 write_unlock_bh(&port->sock->sk->sk_callback_lock); 1783 cancel_work_sync(&port->accept_work); 1784 /* 1785 * Destroy the remaining queues, which are not belong to any 1786 * controller yet. 1787 */ 1788 nvmet_tcp_destroy_port_queues(port); 1789 1790 sock_release(port->sock); 1791 kfree(port); 1792 } 1793 1794 static void nvmet_tcp_delete_ctrl(struct nvmet_ctrl *ctrl) 1795 { 1796 struct nvmet_tcp_queue *queue; 1797 1798 mutex_lock(&nvmet_tcp_queue_mutex); 1799 list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list) 1800 if (queue->nvme_sq.ctrl == ctrl) 1801 kernel_sock_shutdown(queue->sock, SHUT_RDWR); 1802 mutex_unlock(&nvmet_tcp_queue_mutex); 1803 } 1804 1805 static u16 nvmet_tcp_install_queue(struct nvmet_sq *sq) 1806 { 1807 struct nvmet_tcp_queue *queue = 1808 container_of(sq, struct nvmet_tcp_queue, nvme_sq); 1809 1810 if (sq->qid == 0) { 1811 /* Let inflight controller teardown complete */ 1812 flush_workqueue(nvmet_wq); 1813 } 1814 1815 queue->nr_cmds = sq->size * 2; 1816 if (nvmet_tcp_alloc_cmds(queue)) 1817 return NVME_SC_INTERNAL; 1818 return 0; 1819 } 1820 1821 static void nvmet_tcp_disc_port_addr(struct nvmet_req *req, 1822 struct nvmet_port *nport, char *traddr) 1823 { 1824 struct nvmet_tcp_port *port = nport->priv; 1825 1826 if (inet_addr_is_any((struct sockaddr *)&port->addr)) { 1827 struct nvmet_tcp_cmd *cmd = 1828 container_of(req, struct nvmet_tcp_cmd, req); 1829 struct nvmet_tcp_queue *queue = cmd->queue; 1830 1831 sprintf(traddr, "%pISc", (struct sockaddr *)&queue->sockaddr); 1832 } else { 1833 memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE); 1834 } 1835 } 1836 1837 static const struct nvmet_fabrics_ops nvmet_tcp_ops = { 1838 .owner = THIS_MODULE, 1839 .type = NVMF_TRTYPE_TCP, 1840 .msdbd = 1, 1841 .add_port = nvmet_tcp_add_port, 1842 .remove_port = nvmet_tcp_remove_port, 1843 .queue_response = nvmet_tcp_queue_response, 1844 .delete_ctrl = nvmet_tcp_delete_ctrl, 1845 .install_queue = nvmet_tcp_install_queue, 1846 .disc_traddr = nvmet_tcp_disc_port_addr, 1847 }; 1848 1849 static int __init nvmet_tcp_init(void) 1850 { 1851 int ret; 1852 1853 nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq", 1854 WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); 1855 if (!nvmet_tcp_wq) 1856 return -ENOMEM; 1857 1858 ret = nvmet_register_transport(&nvmet_tcp_ops); 1859 if (ret) 1860 goto err; 1861 1862 return 0; 1863 err: 1864 destroy_workqueue(nvmet_tcp_wq); 1865 return ret; 1866 } 1867 1868 static void __exit nvmet_tcp_exit(void) 1869 { 1870 struct nvmet_tcp_queue *queue; 1871 1872 nvmet_unregister_transport(&nvmet_tcp_ops); 1873 1874 flush_workqueue(nvmet_wq); 1875 mutex_lock(&nvmet_tcp_queue_mutex); 1876 list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list) 1877 kernel_sock_shutdown(queue->sock, SHUT_RDWR); 1878 mutex_unlock(&nvmet_tcp_queue_mutex); 1879 flush_workqueue(nvmet_wq); 1880 1881 destroy_workqueue(nvmet_tcp_wq); 1882 } 1883 1884 module_init(nvmet_tcp_init); 1885 module_exit(nvmet_tcp_exit); 1886 1887 MODULE_LICENSE("GPL v2"); 1888 MODULE_ALIAS("nvmet-transport-3"); /* 3 == NVMF_TRTYPE_TCP */ 1889