1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * NVMe over Fabrics TCP host. 4 * Copyright (c) 2018 Lightbits Labs. All rights reserved. 5 */ 6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 7 #include <linux/module.h> 8 #include <linux/init.h> 9 #include <linux/slab.h> 10 #include <linux/err.h> 11 #include <linux/nvme-tcp.h> 12 #include <net/sock.h> 13 #include <net/tcp.h> 14 #include <linux/blk-mq.h> 15 #include <crypto/hash.h> 16 #include <net/busy_poll.h> 17 18 #include "nvme.h" 19 #include "fabrics.h" 20 21 struct nvme_tcp_queue; 22 23 enum nvme_tcp_send_state { 24 NVME_TCP_SEND_CMD_PDU = 0, 25 NVME_TCP_SEND_H2C_PDU, 26 NVME_TCP_SEND_DATA, 27 NVME_TCP_SEND_DDGST, 28 }; 29 30 struct nvme_tcp_request { 31 struct nvme_request req; 32 void *pdu; 33 struct nvme_tcp_queue *queue; 34 u32 data_len; 35 u32 pdu_len; 36 u32 pdu_sent; 37 u16 ttag; 38 struct list_head entry; 39 __le32 ddgst; 40 41 struct bio *curr_bio; 42 struct iov_iter iter; 43 44 /* send state */ 45 size_t offset; 46 size_t data_sent; 47 enum nvme_tcp_send_state state; 48 }; 49 50 enum nvme_tcp_queue_flags { 51 NVME_TCP_Q_ALLOCATED = 0, 52 NVME_TCP_Q_LIVE = 1, 53 }; 54 55 enum nvme_tcp_recv_state { 56 NVME_TCP_RECV_PDU = 0, 57 NVME_TCP_RECV_DATA, 58 NVME_TCP_RECV_DDGST, 59 }; 60 61 struct nvme_tcp_ctrl; 62 struct nvme_tcp_queue { 63 struct socket *sock; 64 struct work_struct io_work; 65 int io_cpu; 66 67 spinlock_t lock; 68 struct list_head send_list; 69 70 /* recv state */ 71 void *pdu; 72 int pdu_remaining; 73 int pdu_offset; 74 size_t data_remaining; 75 size_t ddgst_remaining; 76 unsigned int nr_cqe; 77 78 /* send state */ 79 struct nvme_tcp_request *request; 80 81 int queue_size; 82 size_t cmnd_capsule_len; 83 struct nvme_tcp_ctrl *ctrl; 84 unsigned long flags; 85 bool rd_enabled; 86 87 bool hdr_digest; 88 bool data_digest; 89 struct ahash_request *rcv_hash; 90 struct ahash_request *snd_hash; 91 __le32 exp_ddgst; 92 __le32 recv_ddgst; 93 94 struct page_frag_cache pf_cache; 95 96 void (*state_change)(struct sock *); 97 void (*data_ready)(struct sock *); 98 void (*write_space)(struct sock *); 99 }; 100 101 struct nvme_tcp_ctrl { 102 /* read only in the hot path */ 103 struct nvme_tcp_queue *queues; 104 struct blk_mq_tag_set tag_set; 105 106 /* other member variables */ 107 struct list_head list; 108 struct blk_mq_tag_set admin_tag_set; 109 struct sockaddr_storage addr; 110 struct sockaddr_storage src_addr; 111 struct nvme_ctrl ctrl; 112 113 struct work_struct err_work; 114 struct delayed_work connect_work; 115 struct nvme_tcp_request async_req; 116 u32 io_queues[HCTX_MAX_TYPES]; 117 }; 118 119 static LIST_HEAD(nvme_tcp_ctrl_list); 120 static DEFINE_MUTEX(nvme_tcp_ctrl_mutex); 121 static struct workqueue_struct *nvme_tcp_wq; 122 static struct blk_mq_ops nvme_tcp_mq_ops; 123 static struct blk_mq_ops nvme_tcp_admin_mq_ops; 124 125 static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl) 126 { 127 return container_of(ctrl, struct nvme_tcp_ctrl, ctrl); 128 } 129 130 static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue) 131 { 132 return queue - queue->ctrl->queues; 133 } 134 135 static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue) 136 { 137 u32 queue_idx = nvme_tcp_queue_id(queue); 138 139 if (queue_idx == 0) 140 return queue->ctrl->admin_tag_set.tags[queue_idx]; 141 return queue->ctrl->tag_set.tags[queue_idx - 1]; 142 } 143 144 static inline u8 nvme_tcp_hdgst_len(struct nvme_tcp_queue *queue) 145 { 146 return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0; 147 } 148 149 static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue) 150 { 151 return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0; 152 } 153 154 static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_queue *queue) 155 { 156 return queue->cmnd_capsule_len - sizeof(struct nvme_command); 157 } 158 159 static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req) 160 { 161 return req == &req->queue->ctrl->async_req; 162 } 163 164 static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req) 165 { 166 struct request *rq; 167 unsigned int bytes; 168 169 if (unlikely(nvme_tcp_async_req(req))) 170 return false; /* async events don't have a request */ 171 172 rq = blk_mq_rq_from_pdu(req); 173 bytes = blk_rq_payload_bytes(rq); 174 175 return rq_data_dir(rq) == WRITE && bytes && 176 bytes <= nvme_tcp_inline_data_size(req->queue); 177 } 178 179 static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req) 180 { 181 return req->iter.bvec->bv_page; 182 } 183 184 static inline size_t nvme_tcp_req_cur_offset(struct nvme_tcp_request *req) 185 { 186 return req->iter.bvec->bv_offset + req->iter.iov_offset; 187 } 188 189 static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req) 190 { 191 return min_t(size_t, req->iter.bvec->bv_len - req->iter.iov_offset, 192 req->pdu_len - req->pdu_sent); 193 } 194 195 static inline size_t nvme_tcp_req_offset(struct nvme_tcp_request *req) 196 { 197 return req->iter.iov_offset; 198 } 199 200 static inline size_t nvme_tcp_pdu_data_left(struct nvme_tcp_request *req) 201 { 202 return rq_data_dir(blk_mq_rq_from_pdu(req)) == WRITE ? 203 req->pdu_len - req->pdu_sent : 0; 204 } 205 206 static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req, 207 int len) 208 { 209 return nvme_tcp_pdu_data_left(req) <= len; 210 } 211 212 static void nvme_tcp_init_iter(struct nvme_tcp_request *req, 213 unsigned int dir) 214 { 215 struct request *rq = blk_mq_rq_from_pdu(req); 216 struct bio_vec *vec; 217 unsigned int size; 218 int nsegs; 219 size_t offset; 220 221 if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) { 222 vec = &rq->special_vec; 223 nsegs = 1; 224 size = blk_rq_payload_bytes(rq); 225 offset = 0; 226 } else { 227 struct bio *bio = req->curr_bio; 228 229 vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); 230 nsegs = bio_segments(bio); 231 size = bio->bi_iter.bi_size; 232 offset = bio->bi_iter.bi_bvec_done; 233 } 234 235 iov_iter_bvec(&req->iter, dir, vec, nsegs, size); 236 req->iter.iov_offset = offset; 237 } 238 239 static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req, 240 int len) 241 { 242 req->data_sent += len; 243 req->pdu_sent += len; 244 iov_iter_advance(&req->iter, len); 245 if (!iov_iter_count(&req->iter) && 246 req->data_sent < req->data_len) { 247 req->curr_bio = req->curr_bio->bi_next; 248 nvme_tcp_init_iter(req, WRITE); 249 } 250 } 251 252 static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req) 253 { 254 struct nvme_tcp_queue *queue = req->queue; 255 256 spin_lock(&queue->lock); 257 list_add_tail(&req->entry, &queue->send_list); 258 spin_unlock(&queue->lock); 259 260 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); 261 } 262 263 static inline struct nvme_tcp_request * 264 nvme_tcp_fetch_request(struct nvme_tcp_queue *queue) 265 { 266 struct nvme_tcp_request *req; 267 268 spin_lock(&queue->lock); 269 req = list_first_entry_or_null(&queue->send_list, 270 struct nvme_tcp_request, entry); 271 if (req) 272 list_del(&req->entry); 273 spin_unlock(&queue->lock); 274 275 return req; 276 } 277 278 static inline void nvme_tcp_ddgst_final(struct ahash_request *hash, 279 __le32 *dgst) 280 { 281 ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0); 282 crypto_ahash_final(hash); 283 } 284 285 static inline void nvme_tcp_ddgst_update(struct ahash_request *hash, 286 struct page *page, off_t off, size_t len) 287 { 288 struct scatterlist sg; 289 290 sg_init_marker(&sg, 1); 291 sg_set_page(&sg, page, len, off); 292 ahash_request_set_crypt(hash, &sg, NULL, len); 293 crypto_ahash_update(hash); 294 } 295 296 static inline void nvme_tcp_hdgst(struct ahash_request *hash, 297 void *pdu, size_t len) 298 { 299 struct scatterlist sg; 300 301 sg_init_one(&sg, pdu, len); 302 ahash_request_set_crypt(hash, &sg, pdu + len, len); 303 crypto_ahash_digest(hash); 304 } 305 306 static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue, 307 void *pdu, size_t pdu_len) 308 { 309 struct nvme_tcp_hdr *hdr = pdu; 310 __le32 recv_digest; 311 __le32 exp_digest; 312 313 if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) { 314 dev_err(queue->ctrl->ctrl.device, 315 "queue %d: header digest flag is cleared\n", 316 nvme_tcp_queue_id(queue)); 317 return -EPROTO; 318 } 319 320 recv_digest = *(__le32 *)(pdu + hdr->hlen); 321 nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len); 322 exp_digest = *(__le32 *)(pdu + hdr->hlen); 323 if (recv_digest != exp_digest) { 324 dev_err(queue->ctrl->ctrl.device, 325 "header digest error: recv %#x expected %#x\n", 326 le32_to_cpu(recv_digest), le32_to_cpu(exp_digest)); 327 return -EIO; 328 } 329 330 return 0; 331 } 332 333 static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu) 334 { 335 struct nvme_tcp_hdr *hdr = pdu; 336 u8 digest_len = nvme_tcp_hdgst_len(queue); 337 u32 len; 338 339 len = le32_to_cpu(hdr->plen) - hdr->hlen - 340 ((hdr->flags & NVME_TCP_F_HDGST) ? digest_len : 0); 341 342 if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) { 343 dev_err(queue->ctrl->ctrl.device, 344 "queue %d: data digest flag is cleared\n", 345 nvme_tcp_queue_id(queue)); 346 return -EPROTO; 347 } 348 crypto_ahash_init(queue->rcv_hash); 349 350 return 0; 351 } 352 353 static void nvme_tcp_exit_request(struct blk_mq_tag_set *set, 354 struct request *rq, unsigned int hctx_idx) 355 { 356 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); 357 358 page_frag_free(req->pdu); 359 } 360 361 static int nvme_tcp_init_request(struct blk_mq_tag_set *set, 362 struct request *rq, unsigned int hctx_idx, 363 unsigned int numa_node) 364 { 365 struct nvme_tcp_ctrl *ctrl = set->driver_data; 366 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); 367 int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0; 368 struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx]; 369 u8 hdgst = nvme_tcp_hdgst_len(queue); 370 371 req->pdu = page_frag_alloc(&queue->pf_cache, 372 sizeof(struct nvme_tcp_cmd_pdu) + hdgst, 373 GFP_KERNEL | __GFP_ZERO); 374 if (!req->pdu) 375 return -ENOMEM; 376 377 req->queue = queue; 378 nvme_req(rq)->ctrl = &ctrl->ctrl; 379 380 return 0; 381 } 382 383 static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, 384 unsigned int hctx_idx) 385 { 386 struct nvme_tcp_ctrl *ctrl = data; 387 struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1]; 388 389 hctx->driver_data = queue; 390 return 0; 391 } 392 393 static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, 394 unsigned int hctx_idx) 395 { 396 struct nvme_tcp_ctrl *ctrl = data; 397 struct nvme_tcp_queue *queue = &ctrl->queues[0]; 398 399 hctx->driver_data = queue; 400 return 0; 401 } 402 403 static enum nvme_tcp_recv_state 404 nvme_tcp_recv_state(struct nvme_tcp_queue *queue) 405 { 406 return (queue->pdu_remaining) ? NVME_TCP_RECV_PDU : 407 (queue->ddgst_remaining) ? NVME_TCP_RECV_DDGST : 408 NVME_TCP_RECV_DATA; 409 } 410 411 static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue) 412 { 413 queue->pdu_remaining = sizeof(struct nvme_tcp_rsp_pdu) + 414 nvme_tcp_hdgst_len(queue); 415 queue->pdu_offset = 0; 416 queue->data_remaining = -1; 417 queue->ddgst_remaining = 0; 418 } 419 420 static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl) 421 { 422 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) 423 return; 424 425 queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work); 426 } 427 428 static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue, 429 struct nvme_completion *cqe) 430 { 431 struct request *rq; 432 433 rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id); 434 if (!rq) { 435 dev_err(queue->ctrl->ctrl.device, 436 "queue %d tag 0x%x not found\n", 437 nvme_tcp_queue_id(queue), cqe->command_id); 438 nvme_tcp_error_recovery(&queue->ctrl->ctrl); 439 return -EINVAL; 440 } 441 442 nvme_end_request(rq, cqe->status, cqe->result); 443 queue->nr_cqe++; 444 445 return 0; 446 } 447 448 static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue, 449 struct nvme_tcp_data_pdu *pdu) 450 { 451 struct request *rq; 452 453 rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id); 454 if (!rq) { 455 dev_err(queue->ctrl->ctrl.device, 456 "queue %d tag %#x not found\n", 457 nvme_tcp_queue_id(queue), pdu->command_id); 458 return -ENOENT; 459 } 460 461 if (!blk_rq_payload_bytes(rq)) { 462 dev_err(queue->ctrl->ctrl.device, 463 "queue %d tag %#x unexpected data\n", 464 nvme_tcp_queue_id(queue), rq->tag); 465 return -EIO; 466 } 467 468 queue->data_remaining = le32_to_cpu(pdu->data_length); 469 470 if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS && 471 unlikely(!(pdu->hdr.flags & NVME_TCP_F_DATA_LAST))) { 472 dev_err(queue->ctrl->ctrl.device, 473 "queue %d tag %#x SUCCESS set but not last PDU\n", 474 nvme_tcp_queue_id(queue), rq->tag); 475 nvme_tcp_error_recovery(&queue->ctrl->ctrl); 476 return -EPROTO; 477 } 478 479 return 0; 480 } 481 482 static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue, 483 struct nvme_tcp_rsp_pdu *pdu) 484 { 485 struct nvme_completion *cqe = &pdu->cqe; 486 int ret = 0; 487 488 /* 489 * AEN requests are special as they don't time out and can 490 * survive any kind of queue freeze and often don't respond to 491 * aborts. We don't even bother to allocate a struct request 492 * for them but rather special case them here. 493 */ 494 if (unlikely(nvme_is_aen_req(nvme_tcp_queue_id(queue), 495 cqe->command_id))) 496 nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status, 497 &cqe->result); 498 else 499 ret = nvme_tcp_process_nvme_cqe(queue, cqe); 500 501 return ret; 502 } 503 504 static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req, 505 struct nvme_tcp_r2t_pdu *pdu) 506 { 507 struct nvme_tcp_data_pdu *data = req->pdu; 508 struct nvme_tcp_queue *queue = req->queue; 509 struct request *rq = blk_mq_rq_from_pdu(req); 510 u8 hdgst = nvme_tcp_hdgst_len(queue); 511 u8 ddgst = nvme_tcp_ddgst_len(queue); 512 513 req->pdu_len = le32_to_cpu(pdu->r2t_length); 514 req->pdu_sent = 0; 515 516 if (unlikely(req->data_sent + req->pdu_len > req->data_len)) { 517 dev_err(queue->ctrl->ctrl.device, 518 "req %d r2t len %u exceeded data len %u (%zu sent)\n", 519 rq->tag, req->pdu_len, req->data_len, 520 req->data_sent); 521 return -EPROTO; 522 } 523 524 if (unlikely(le32_to_cpu(pdu->r2t_offset) < req->data_sent)) { 525 dev_err(queue->ctrl->ctrl.device, 526 "req %d unexpected r2t offset %u (expected %zu)\n", 527 rq->tag, le32_to_cpu(pdu->r2t_offset), 528 req->data_sent); 529 return -EPROTO; 530 } 531 532 memset(data, 0, sizeof(*data)); 533 data->hdr.type = nvme_tcp_h2c_data; 534 data->hdr.flags = NVME_TCP_F_DATA_LAST; 535 if (queue->hdr_digest) 536 data->hdr.flags |= NVME_TCP_F_HDGST; 537 if (queue->data_digest) 538 data->hdr.flags |= NVME_TCP_F_DDGST; 539 data->hdr.hlen = sizeof(*data); 540 data->hdr.pdo = data->hdr.hlen + hdgst; 541 data->hdr.plen = 542 cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst); 543 data->ttag = pdu->ttag; 544 data->command_id = rq->tag; 545 data->data_offset = cpu_to_le32(req->data_sent); 546 data->data_length = cpu_to_le32(req->pdu_len); 547 return 0; 548 } 549 550 static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue, 551 struct nvme_tcp_r2t_pdu *pdu) 552 { 553 struct nvme_tcp_request *req; 554 struct request *rq; 555 int ret; 556 557 rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id); 558 if (!rq) { 559 dev_err(queue->ctrl->ctrl.device, 560 "queue %d tag %#x not found\n", 561 nvme_tcp_queue_id(queue), pdu->command_id); 562 return -ENOENT; 563 } 564 req = blk_mq_rq_to_pdu(rq); 565 566 ret = nvme_tcp_setup_h2c_data_pdu(req, pdu); 567 if (unlikely(ret)) 568 return ret; 569 570 req->state = NVME_TCP_SEND_H2C_PDU; 571 req->offset = 0; 572 573 nvme_tcp_queue_request(req); 574 575 return 0; 576 } 577 578 static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb, 579 unsigned int *offset, size_t *len) 580 { 581 struct nvme_tcp_hdr *hdr; 582 char *pdu = queue->pdu; 583 size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining); 584 int ret; 585 586 ret = skb_copy_bits(skb, *offset, 587 &pdu[queue->pdu_offset], rcv_len); 588 if (unlikely(ret)) 589 return ret; 590 591 queue->pdu_remaining -= rcv_len; 592 queue->pdu_offset += rcv_len; 593 *offset += rcv_len; 594 *len -= rcv_len; 595 if (queue->pdu_remaining) 596 return 0; 597 598 hdr = queue->pdu; 599 if (queue->hdr_digest) { 600 ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen); 601 if (unlikely(ret)) 602 return ret; 603 } 604 605 606 if (queue->data_digest) { 607 ret = nvme_tcp_check_ddgst(queue, queue->pdu); 608 if (unlikely(ret)) 609 return ret; 610 } 611 612 switch (hdr->type) { 613 case nvme_tcp_c2h_data: 614 return nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu); 615 case nvme_tcp_rsp: 616 nvme_tcp_init_recv_ctx(queue); 617 return nvme_tcp_handle_comp(queue, (void *)queue->pdu); 618 case nvme_tcp_r2t: 619 nvme_tcp_init_recv_ctx(queue); 620 return nvme_tcp_handle_r2t(queue, (void *)queue->pdu); 621 default: 622 dev_err(queue->ctrl->ctrl.device, 623 "unsupported pdu type (%d)\n", hdr->type); 624 return -EINVAL; 625 } 626 } 627 628 static inline void nvme_tcp_end_request(struct request *rq, u16 status) 629 { 630 union nvme_result res = {}; 631 632 nvme_end_request(rq, cpu_to_le16(status << 1), res); 633 } 634 635 static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, 636 unsigned int *offset, size_t *len) 637 { 638 struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu; 639 struct nvme_tcp_request *req; 640 struct request *rq; 641 642 rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id); 643 if (!rq) { 644 dev_err(queue->ctrl->ctrl.device, 645 "queue %d tag %#x not found\n", 646 nvme_tcp_queue_id(queue), pdu->command_id); 647 return -ENOENT; 648 } 649 req = blk_mq_rq_to_pdu(rq); 650 651 while (true) { 652 int recv_len, ret; 653 654 recv_len = min_t(size_t, *len, queue->data_remaining); 655 if (!recv_len) 656 break; 657 658 if (!iov_iter_count(&req->iter)) { 659 req->curr_bio = req->curr_bio->bi_next; 660 661 /* 662 * If we don`t have any bios it means that controller 663 * sent more data than we requested, hence error 664 */ 665 if (!req->curr_bio) { 666 dev_err(queue->ctrl->ctrl.device, 667 "queue %d no space in request %#x", 668 nvme_tcp_queue_id(queue), rq->tag); 669 nvme_tcp_init_recv_ctx(queue); 670 return -EIO; 671 } 672 nvme_tcp_init_iter(req, READ); 673 } 674 675 /* we can read only from what is left in this bio */ 676 recv_len = min_t(size_t, recv_len, 677 iov_iter_count(&req->iter)); 678 679 if (queue->data_digest) 680 ret = skb_copy_and_hash_datagram_iter(skb, *offset, 681 &req->iter, recv_len, queue->rcv_hash); 682 else 683 ret = skb_copy_datagram_iter(skb, *offset, 684 &req->iter, recv_len); 685 if (ret) { 686 dev_err(queue->ctrl->ctrl.device, 687 "queue %d failed to copy request %#x data", 688 nvme_tcp_queue_id(queue), rq->tag); 689 return ret; 690 } 691 692 *len -= recv_len; 693 *offset += recv_len; 694 queue->data_remaining -= recv_len; 695 } 696 697 if (!queue->data_remaining) { 698 if (queue->data_digest) { 699 nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst); 700 queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH; 701 } else { 702 if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) { 703 nvme_tcp_end_request(rq, NVME_SC_SUCCESS); 704 queue->nr_cqe++; 705 } 706 nvme_tcp_init_recv_ctx(queue); 707 } 708 } 709 710 return 0; 711 } 712 713 static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue, 714 struct sk_buff *skb, unsigned int *offset, size_t *len) 715 { 716 struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu; 717 char *ddgst = (char *)&queue->recv_ddgst; 718 size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining); 719 off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining; 720 int ret; 721 722 ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len); 723 if (unlikely(ret)) 724 return ret; 725 726 queue->ddgst_remaining -= recv_len; 727 *offset += recv_len; 728 *len -= recv_len; 729 if (queue->ddgst_remaining) 730 return 0; 731 732 if (queue->recv_ddgst != queue->exp_ddgst) { 733 dev_err(queue->ctrl->ctrl.device, 734 "data digest error: recv %#x expected %#x\n", 735 le32_to_cpu(queue->recv_ddgst), 736 le32_to_cpu(queue->exp_ddgst)); 737 return -EIO; 738 } 739 740 if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) { 741 struct request *rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), 742 pdu->command_id); 743 744 nvme_tcp_end_request(rq, NVME_SC_SUCCESS); 745 queue->nr_cqe++; 746 } 747 748 nvme_tcp_init_recv_ctx(queue); 749 return 0; 750 } 751 752 static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb, 753 unsigned int offset, size_t len) 754 { 755 struct nvme_tcp_queue *queue = desc->arg.data; 756 size_t consumed = len; 757 int result; 758 759 while (len) { 760 switch (nvme_tcp_recv_state(queue)) { 761 case NVME_TCP_RECV_PDU: 762 result = nvme_tcp_recv_pdu(queue, skb, &offset, &len); 763 break; 764 case NVME_TCP_RECV_DATA: 765 result = nvme_tcp_recv_data(queue, skb, &offset, &len); 766 break; 767 case NVME_TCP_RECV_DDGST: 768 result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len); 769 break; 770 default: 771 result = -EFAULT; 772 } 773 if (result) { 774 dev_err(queue->ctrl->ctrl.device, 775 "receive failed: %d\n", result); 776 queue->rd_enabled = false; 777 nvme_tcp_error_recovery(&queue->ctrl->ctrl); 778 return result; 779 } 780 } 781 782 return consumed; 783 } 784 785 static void nvme_tcp_data_ready(struct sock *sk) 786 { 787 struct nvme_tcp_queue *queue; 788 789 read_lock(&sk->sk_callback_lock); 790 queue = sk->sk_user_data; 791 if (likely(queue && queue->rd_enabled)) 792 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); 793 read_unlock(&sk->sk_callback_lock); 794 } 795 796 static void nvme_tcp_write_space(struct sock *sk) 797 { 798 struct nvme_tcp_queue *queue; 799 800 read_lock_bh(&sk->sk_callback_lock); 801 queue = sk->sk_user_data; 802 if (likely(queue && sk_stream_is_writeable(sk))) { 803 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 804 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); 805 } 806 read_unlock_bh(&sk->sk_callback_lock); 807 } 808 809 static void nvme_tcp_state_change(struct sock *sk) 810 { 811 struct nvme_tcp_queue *queue; 812 813 read_lock(&sk->sk_callback_lock); 814 queue = sk->sk_user_data; 815 if (!queue) 816 goto done; 817 818 switch (sk->sk_state) { 819 case TCP_CLOSE: 820 case TCP_CLOSE_WAIT: 821 case TCP_LAST_ACK: 822 case TCP_FIN_WAIT1: 823 case TCP_FIN_WAIT2: 824 /* fallthrough */ 825 nvme_tcp_error_recovery(&queue->ctrl->ctrl); 826 break; 827 default: 828 dev_info(queue->ctrl->ctrl.device, 829 "queue %d socket state %d\n", 830 nvme_tcp_queue_id(queue), sk->sk_state); 831 } 832 833 queue->state_change(sk); 834 done: 835 read_unlock(&sk->sk_callback_lock); 836 } 837 838 static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue) 839 { 840 queue->request = NULL; 841 } 842 843 static void nvme_tcp_fail_request(struct nvme_tcp_request *req) 844 { 845 nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_HOST_PATH_ERROR); 846 } 847 848 static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) 849 { 850 struct nvme_tcp_queue *queue = req->queue; 851 852 while (true) { 853 struct page *page = nvme_tcp_req_cur_page(req); 854 size_t offset = nvme_tcp_req_cur_offset(req); 855 size_t len = nvme_tcp_req_cur_length(req); 856 bool last = nvme_tcp_pdu_last_send(req, len); 857 int ret, flags = MSG_DONTWAIT; 858 859 if (last && !queue->data_digest) 860 flags |= MSG_EOR; 861 else 862 flags |= MSG_MORE; 863 864 /* can't zcopy slab pages */ 865 if (unlikely(PageSlab(page))) { 866 ret = sock_no_sendpage(queue->sock, page, offset, len, 867 flags); 868 } else { 869 ret = kernel_sendpage(queue->sock, page, offset, len, 870 flags); 871 } 872 if (ret <= 0) 873 return ret; 874 875 nvme_tcp_advance_req(req, ret); 876 if (queue->data_digest) 877 nvme_tcp_ddgst_update(queue->snd_hash, page, 878 offset, ret); 879 880 /* fully successful last write*/ 881 if (last && ret == len) { 882 if (queue->data_digest) { 883 nvme_tcp_ddgst_final(queue->snd_hash, 884 &req->ddgst); 885 req->state = NVME_TCP_SEND_DDGST; 886 req->offset = 0; 887 } else { 888 nvme_tcp_done_send_req(queue); 889 } 890 return 1; 891 } 892 } 893 return -EAGAIN; 894 } 895 896 static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req) 897 { 898 struct nvme_tcp_queue *queue = req->queue; 899 struct nvme_tcp_cmd_pdu *pdu = req->pdu; 900 bool inline_data = nvme_tcp_has_inline_data(req); 901 int flags = MSG_DONTWAIT | (inline_data ? MSG_MORE : MSG_EOR); 902 u8 hdgst = nvme_tcp_hdgst_len(queue); 903 int len = sizeof(*pdu) + hdgst - req->offset; 904 int ret; 905 906 if (queue->hdr_digest && !req->offset) 907 nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); 908 909 ret = kernel_sendpage(queue->sock, virt_to_page(pdu), 910 offset_in_page(pdu) + req->offset, len, flags); 911 if (unlikely(ret <= 0)) 912 return ret; 913 914 len -= ret; 915 if (!len) { 916 if (inline_data) { 917 req->state = NVME_TCP_SEND_DATA; 918 if (queue->data_digest) 919 crypto_ahash_init(queue->snd_hash); 920 nvme_tcp_init_iter(req, WRITE); 921 } else { 922 nvme_tcp_done_send_req(queue); 923 } 924 return 1; 925 } 926 req->offset += ret; 927 928 return -EAGAIN; 929 } 930 931 static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req) 932 { 933 struct nvme_tcp_queue *queue = req->queue; 934 struct nvme_tcp_data_pdu *pdu = req->pdu; 935 u8 hdgst = nvme_tcp_hdgst_len(queue); 936 int len = sizeof(*pdu) - req->offset + hdgst; 937 int ret; 938 939 if (queue->hdr_digest && !req->offset) 940 nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); 941 942 ret = kernel_sendpage(queue->sock, virt_to_page(pdu), 943 offset_in_page(pdu) + req->offset, len, 944 MSG_DONTWAIT | MSG_MORE); 945 if (unlikely(ret <= 0)) 946 return ret; 947 948 len -= ret; 949 if (!len) { 950 req->state = NVME_TCP_SEND_DATA; 951 if (queue->data_digest) 952 crypto_ahash_init(queue->snd_hash); 953 if (!req->data_sent) 954 nvme_tcp_init_iter(req, WRITE); 955 return 1; 956 } 957 req->offset += ret; 958 959 return -EAGAIN; 960 } 961 962 static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req) 963 { 964 struct nvme_tcp_queue *queue = req->queue; 965 int ret; 966 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR }; 967 struct kvec iov = { 968 .iov_base = &req->ddgst + req->offset, 969 .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset 970 }; 971 972 ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); 973 if (unlikely(ret <= 0)) 974 return ret; 975 976 if (req->offset + ret == NVME_TCP_DIGEST_LENGTH) { 977 nvme_tcp_done_send_req(queue); 978 return 1; 979 } 980 981 req->offset += ret; 982 return -EAGAIN; 983 } 984 985 static int nvme_tcp_try_send(struct nvme_tcp_queue *queue) 986 { 987 struct nvme_tcp_request *req; 988 int ret = 1; 989 990 if (!queue->request) { 991 queue->request = nvme_tcp_fetch_request(queue); 992 if (!queue->request) 993 return 0; 994 } 995 req = queue->request; 996 997 if (req->state == NVME_TCP_SEND_CMD_PDU) { 998 ret = nvme_tcp_try_send_cmd_pdu(req); 999 if (ret <= 0) 1000 goto done; 1001 if (!nvme_tcp_has_inline_data(req)) 1002 return ret; 1003 } 1004 1005 if (req->state == NVME_TCP_SEND_H2C_PDU) { 1006 ret = nvme_tcp_try_send_data_pdu(req); 1007 if (ret <= 0) 1008 goto done; 1009 } 1010 1011 if (req->state == NVME_TCP_SEND_DATA) { 1012 ret = nvme_tcp_try_send_data(req); 1013 if (ret <= 0) 1014 goto done; 1015 } 1016 1017 if (req->state == NVME_TCP_SEND_DDGST) 1018 ret = nvme_tcp_try_send_ddgst(req); 1019 done: 1020 if (ret == -EAGAIN) 1021 ret = 0; 1022 return ret; 1023 } 1024 1025 static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue) 1026 { 1027 struct socket *sock = queue->sock; 1028 struct sock *sk = sock->sk; 1029 read_descriptor_t rd_desc; 1030 int consumed; 1031 1032 rd_desc.arg.data = queue; 1033 rd_desc.count = 1; 1034 lock_sock(sk); 1035 queue->nr_cqe = 0; 1036 consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb); 1037 release_sock(sk); 1038 return consumed; 1039 } 1040 1041 static void nvme_tcp_io_work(struct work_struct *w) 1042 { 1043 struct nvme_tcp_queue *queue = 1044 container_of(w, struct nvme_tcp_queue, io_work); 1045 unsigned long deadline = jiffies + msecs_to_jiffies(1); 1046 1047 do { 1048 bool pending = false; 1049 int result; 1050 1051 result = nvme_tcp_try_send(queue); 1052 if (result > 0) { 1053 pending = true; 1054 } else if (unlikely(result < 0)) { 1055 dev_err(queue->ctrl->ctrl.device, 1056 "failed to send request %d\n", result); 1057 1058 /* 1059 * Fail the request unless peer closed the connection, 1060 * in which case error recovery flow will complete all. 1061 */ 1062 if ((result != -EPIPE) && (result != -ECONNRESET)) 1063 nvme_tcp_fail_request(queue->request); 1064 nvme_tcp_done_send_req(queue); 1065 return; 1066 } 1067 1068 result = nvme_tcp_try_recv(queue); 1069 if (result > 0) 1070 pending = true; 1071 1072 if (!pending) 1073 return; 1074 1075 } while (!time_after(jiffies, deadline)); /* quota is exhausted */ 1076 1077 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); 1078 } 1079 1080 static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue) 1081 { 1082 struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash); 1083 1084 ahash_request_free(queue->rcv_hash); 1085 ahash_request_free(queue->snd_hash); 1086 crypto_free_ahash(tfm); 1087 } 1088 1089 static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue) 1090 { 1091 struct crypto_ahash *tfm; 1092 1093 tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC); 1094 if (IS_ERR(tfm)) 1095 return PTR_ERR(tfm); 1096 1097 queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL); 1098 if (!queue->snd_hash) 1099 goto free_tfm; 1100 ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL); 1101 1102 queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL); 1103 if (!queue->rcv_hash) 1104 goto free_snd_hash; 1105 ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL); 1106 1107 return 0; 1108 free_snd_hash: 1109 ahash_request_free(queue->snd_hash); 1110 free_tfm: 1111 crypto_free_ahash(tfm); 1112 return -ENOMEM; 1113 } 1114 1115 static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl) 1116 { 1117 struct nvme_tcp_request *async = &ctrl->async_req; 1118 1119 page_frag_free(async->pdu); 1120 } 1121 1122 static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl) 1123 { 1124 struct nvme_tcp_queue *queue = &ctrl->queues[0]; 1125 struct nvme_tcp_request *async = &ctrl->async_req; 1126 u8 hdgst = nvme_tcp_hdgst_len(queue); 1127 1128 async->pdu = page_frag_alloc(&queue->pf_cache, 1129 sizeof(struct nvme_tcp_cmd_pdu) + hdgst, 1130 GFP_KERNEL | __GFP_ZERO); 1131 if (!async->pdu) 1132 return -ENOMEM; 1133 1134 async->queue = &ctrl->queues[0]; 1135 return 0; 1136 } 1137 1138 static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid) 1139 { 1140 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); 1141 struct nvme_tcp_queue *queue = &ctrl->queues[qid]; 1142 1143 if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags)) 1144 return; 1145 1146 if (queue->hdr_digest || queue->data_digest) 1147 nvme_tcp_free_crypto(queue); 1148 1149 sock_release(queue->sock); 1150 kfree(queue->pdu); 1151 } 1152 1153 static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue) 1154 { 1155 struct nvme_tcp_icreq_pdu *icreq; 1156 struct nvme_tcp_icresp_pdu *icresp; 1157 struct msghdr msg = {}; 1158 struct kvec iov; 1159 bool ctrl_hdgst, ctrl_ddgst; 1160 int ret; 1161 1162 icreq = kzalloc(sizeof(*icreq), GFP_KERNEL); 1163 if (!icreq) 1164 return -ENOMEM; 1165 1166 icresp = kzalloc(sizeof(*icresp), GFP_KERNEL); 1167 if (!icresp) { 1168 ret = -ENOMEM; 1169 goto free_icreq; 1170 } 1171 1172 icreq->hdr.type = nvme_tcp_icreq; 1173 icreq->hdr.hlen = sizeof(*icreq); 1174 icreq->hdr.pdo = 0; 1175 icreq->hdr.plen = cpu_to_le32(icreq->hdr.hlen); 1176 icreq->pfv = cpu_to_le16(NVME_TCP_PFV_1_0); 1177 icreq->maxr2t = 0; /* single inflight r2t supported */ 1178 icreq->hpda = 0; /* no alignment constraint */ 1179 if (queue->hdr_digest) 1180 icreq->digest |= NVME_TCP_HDR_DIGEST_ENABLE; 1181 if (queue->data_digest) 1182 icreq->digest |= NVME_TCP_DATA_DIGEST_ENABLE; 1183 1184 iov.iov_base = icreq; 1185 iov.iov_len = sizeof(*icreq); 1186 ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); 1187 if (ret < 0) 1188 goto free_icresp; 1189 1190 memset(&msg, 0, sizeof(msg)); 1191 iov.iov_base = icresp; 1192 iov.iov_len = sizeof(*icresp); 1193 ret = kernel_recvmsg(queue->sock, &msg, &iov, 1, 1194 iov.iov_len, msg.msg_flags); 1195 if (ret < 0) 1196 goto free_icresp; 1197 1198 ret = -EINVAL; 1199 if (icresp->hdr.type != nvme_tcp_icresp) { 1200 pr_err("queue %d: bad type returned %d\n", 1201 nvme_tcp_queue_id(queue), icresp->hdr.type); 1202 goto free_icresp; 1203 } 1204 1205 if (le32_to_cpu(icresp->hdr.plen) != sizeof(*icresp)) { 1206 pr_err("queue %d: bad pdu length returned %d\n", 1207 nvme_tcp_queue_id(queue), icresp->hdr.plen); 1208 goto free_icresp; 1209 } 1210 1211 if (icresp->pfv != NVME_TCP_PFV_1_0) { 1212 pr_err("queue %d: bad pfv returned %d\n", 1213 nvme_tcp_queue_id(queue), icresp->pfv); 1214 goto free_icresp; 1215 } 1216 1217 ctrl_ddgst = !!(icresp->digest & NVME_TCP_DATA_DIGEST_ENABLE); 1218 if ((queue->data_digest && !ctrl_ddgst) || 1219 (!queue->data_digest && ctrl_ddgst)) { 1220 pr_err("queue %d: data digest mismatch host: %s ctrl: %s\n", 1221 nvme_tcp_queue_id(queue), 1222 queue->data_digest ? "enabled" : "disabled", 1223 ctrl_ddgst ? "enabled" : "disabled"); 1224 goto free_icresp; 1225 } 1226 1227 ctrl_hdgst = !!(icresp->digest & NVME_TCP_HDR_DIGEST_ENABLE); 1228 if ((queue->hdr_digest && !ctrl_hdgst) || 1229 (!queue->hdr_digest && ctrl_hdgst)) { 1230 pr_err("queue %d: header digest mismatch host: %s ctrl: %s\n", 1231 nvme_tcp_queue_id(queue), 1232 queue->hdr_digest ? "enabled" : "disabled", 1233 ctrl_hdgst ? "enabled" : "disabled"); 1234 goto free_icresp; 1235 } 1236 1237 if (icresp->cpda != 0) { 1238 pr_err("queue %d: unsupported cpda returned %d\n", 1239 nvme_tcp_queue_id(queue), icresp->cpda); 1240 goto free_icresp; 1241 } 1242 1243 ret = 0; 1244 free_icresp: 1245 kfree(icresp); 1246 free_icreq: 1247 kfree(icreq); 1248 return ret; 1249 } 1250 1251 static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, 1252 int qid, size_t queue_size) 1253 { 1254 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); 1255 struct nvme_tcp_queue *queue = &ctrl->queues[qid]; 1256 struct linger sol = { .l_onoff = 1, .l_linger = 0 }; 1257 int ret, opt, rcv_pdu_size, n; 1258 1259 queue->ctrl = ctrl; 1260 INIT_LIST_HEAD(&queue->send_list); 1261 spin_lock_init(&queue->lock); 1262 INIT_WORK(&queue->io_work, nvme_tcp_io_work); 1263 queue->queue_size = queue_size; 1264 1265 if (qid > 0) 1266 queue->cmnd_capsule_len = nctrl->ioccsz * 16; 1267 else 1268 queue->cmnd_capsule_len = sizeof(struct nvme_command) + 1269 NVME_TCP_ADMIN_CCSZ; 1270 1271 ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM, 1272 IPPROTO_TCP, &queue->sock); 1273 if (ret) { 1274 dev_err(nctrl->device, 1275 "failed to create socket: %d\n", ret); 1276 return ret; 1277 } 1278 1279 /* Single syn retry */ 1280 opt = 1; 1281 ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_SYNCNT, 1282 (char *)&opt, sizeof(opt)); 1283 if (ret) { 1284 dev_err(nctrl->device, 1285 "failed to set TCP_SYNCNT sock opt %d\n", ret); 1286 goto err_sock; 1287 } 1288 1289 /* Set TCP no delay */ 1290 opt = 1; 1291 ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, 1292 TCP_NODELAY, (char *)&opt, sizeof(opt)); 1293 if (ret) { 1294 dev_err(nctrl->device, 1295 "failed to set TCP_NODELAY sock opt %d\n", ret); 1296 goto err_sock; 1297 } 1298 1299 /* 1300 * Cleanup whatever is sitting in the TCP transmit queue on socket 1301 * close. This is done to prevent stale data from being sent should 1302 * the network connection be restored before TCP times out. 1303 */ 1304 ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_LINGER, 1305 (char *)&sol, sizeof(sol)); 1306 if (ret) { 1307 dev_err(nctrl->device, 1308 "failed to set SO_LINGER sock opt %d\n", ret); 1309 goto err_sock; 1310 } 1311 1312 /* Set socket type of service */ 1313 if (nctrl->opts->tos >= 0) { 1314 opt = nctrl->opts->tos; 1315 ret = kernel_setsockopt(queue->sock, SOL_IP, IP_TOS, 1316 (char *)&opt, sizeof(opt)); 1317 if (ret) { 1318 dev_err(nctrl->device, 1319 "failed to set IP_TOS sock opt %d\n", ret); 1320 goto err_sock; 1321 } 1322 } 1323 1324 queue->sock->sk->sk_allocation = GFP_ATOMIC; 1325 if (!qid) 1326 n = 0; 1327 else 1328 n = (qid - 1) % num_online_cpus(); 1329 queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false); 1330 queue->request = NULL; 1331 queue->data_remaining = 0; 1332 queue->ddgst_remaining = 0; 1333 queue->pdu_remaining = 0; 1334 queue->pdu_offset = 0; 1335 sk_set_memalloc(queue->sock->sk); 1336 1337 if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) { 1338 ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr, 1339 sizeof(ctrl->src_addr)); 1340 if (ret) { 1341 dev_err(nctrl->device, 1342 "failed to bind queue %d socket %d\n", 1343 qid, ret); 1344 goto err_sock; 1345 } 1346 } 1347 1348 queue->hdr_digest = nctrl->opts->hdr_digest; 1349 queue->data_digest = nctrl->opts->data_digest; 1350 if (queue->hdr_digest || queue->data_digest) { 1351 ret = nvme_tcp_alloc_crypto(queue); 1352 if (ret) { 1353 dev_err(nctrl->device, 1354 "failed to allocate queue %d crypto\n", qid); 1355 goto err_sock; 1356 } 1357 } 1358 1359 rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) + 1360 nvme_tcp_hdgst_len(queue); 1361 queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL); 1362 if (!queue->pdu) { 1363 ret = -ENOMEM; 1364 goto err_crypto; 1365 } 1366 1367 dev_dbg(nctrl->device, "connecting queue %d\n", 1368 nvme_tcp_queue_id(queue)); 1369 1370 ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr, 1371 sizeof(ctrl->addr), 0); 1372 if (ret) { 1373 dev_err(nctrl->device, 1374 "failed to connect socket: %d\n", ret); 1375 goto err_rcv_pdu; 1376 } 1377 1378 ret = nvme_tcp_init_connection(queue); 1379 if (ret) 1380 goto err_init_connect; 1381 1382 queue->rd_enabled = true; 1383 set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags); 1384 nvme_tcp_init_recv_ctx(queue); 1385 1386 write_lock_bh(&queue->sock->sk->sk_callback_lock); 1387 queue->sock->sk->sk_user_data = queue; 1388 queue->state_change = queue->sock->sk->sk_state_change; 1389 queue->data_ready = queue->sock->sk->sk_data_ready; 1390 queue->write_space = queue->sock->sk->sk_write_space; 1391 queue->sock->sk->sk_data_ready = nvme_tcp_data_ready; 1392 queue->sock->sk->sk_state_change = nvme_tcp_state_change; 1393 queue->sock->sk->sk_write_space = nvme_tcp_write_space; 1394 #ifdef CONFIG_NET_RX_BUSY_POLL 1395 queue->sock->sk->sk_ll_usec = 1; 1396 #endif 1397 write_unlock_bh(&queue->sock->sk->sk_callback_lock); 1398 1399 return 0; 1400 1401 err_init_connect: 1402 kernel_sock_shutdown(queue->sock, SHUT_RDWR); 1403 err_rcv_pdu: 1404 kfree(queue->pdu); 1405 err_crypto: 1406 if (queue->hdr_digest || queue->data_digest) 1407 nvme_tcp_free_crypto(queue); 1408 err_sock: 1409 sock_release(queue->sock); 1410 queue->sock = NULL; 1411 return ret; 1412 } 1413 1414 static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue) 1415 { 1416 struct socket *sock = queue->sock; 1417 1418 write_lock_bh(&sock->sk->sk_callback_lock); 1419 sock->sk->sk_user_data = NULL; 1420 sock->sk->sk_data_ready = queue->data_ready; 1421 sock->sk->sk_state_change = queue->state_change; 1422 sock->sk->sk_write_space = queue->write_space; 1423 write_unlock_bh(&sock->sk->sk_callback_lock); 1424 } 1425 1426 static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue) 1427 { 1428 kernel_sock_shutdown(queue->sock, SHUT_RDWR); 1429 nvme_tcp_restore_sock_calls(queue); 1430 cancel_work_sync(&queue->io_work); 1431 } 1432 1433 static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid) 1434 { 1435 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); 1436 struct nvme_tcp_queue *queue = &ctrl->queues[qid]; 1437 1438 if (!test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags)) 1439 return; 1440 1441 __nvme_tcp_stop_queue(queue); 1442 } 1443 1444 static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx) 1445 { 1446 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); 1447 int ret; 1448 1449 if (idx) 1450 ret = nvmf_connect_io_queue(nctrl, idx, false); 1451 else 1452 ret = nvmf_connect_admin_queue(nctrl); 1453 1454 if (!ret) { 1455 set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags); 1456 } else { 1457 if (test_bit(NVME_TCP_Q_ALLOCATED, &ctrl->queues[idx].flags)) 1458 __nvme_tcp_stop_queue(&ctrl->queues[idx]); 1459 dev_err(nctrl->device, 1460 "failed to connect queue: %d ret=%d\n", idx, ret); 1461 } 1462 return ret; 1463 } 1464 1465 static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl, 1466 bool admin) 1467 { 1468 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); 1469 struct blk_mq_tag_set *set; 1470 int ret; 1471 1472 if (admin) { 1473 set = &ctrl->admin_tag_set; 1474 memset(set, 0, sizeof(*set)); 1475 set->ops = &nvme_tcp_admin_mq_ops; 1476 set->queue_depth = NVME_AQ_MQ_TAG_DEPTH; 1477 set->reserved_tags = 2; /* connect + keep-alive */ 1478 set->numa_node = NUMA_NO_NODE; 1479 set->cmd_size = sizeof(struct nvme_tcp_request); 1480 set->driver_data = ctrl; 1481 set->nr_hw_queues = 1; 1482 set->timeout = ADMIN_TIMEOUT; 1483 } else { 1484 set = &ctrl->tag_set; 1485 memset(set, 0, sizeof(*set)); 1486 set->ops = &nvme_tcp_mq_ops; 1487 set->queue_depth = nctrl->sqsize + 1; 1488 set->reserved_tags = 1; /* fabric connect */ 1489 set->numa_node = NUMA_NO_NODE; 1490 set->flags = BLK_MQ_F_SHOULD_MERGE; 1491 set->cmd_size = sizeof(struct nvme_tcp_request); 1492 set->driver_data = ctrl; 1493 set->nr_hw_queues = nctrl->queue_count - 1; 1494 set->timeout = NVME_IO_TIMEOUT; 1495 set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2; 1496 } 1497 1498 ret = blk_mq_alloc_tag_set(set); 1499 if (ret) 1500 return ERR_PTR(ret); 1501 1502 return set; 1503 } 1504 1505 static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl) 1506 { 1507 if (to_tcp_ctrl(ctrl)->async_req.pdu) { 1508 nvme_tcp_free_async_req(to_tcp_ctrl(ctrl)); 1509 to_tcp_ctrl(ctrl)->async_req.pdu = NULL; 1510 } 1511 1512 nvme_tcp_free_queue(ctrl, 0); 1513 } 1514 1515 static void nvme_tcp_free_io_queues(struct nvme_ctrl *ctrl) 1516 { 1517 int i; 1518 1519 for (i = 1; i < ctrl->queue_count; i++) 1520 nvme_tcp_free_queue(ctrl, i); 1521 } 1522 1523 static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl) 1524 { 1525 int i; 1526 1527 for (i = 1; i < ctrl->queue_count; i++) 1528 nvme_tcp_stop_queue(ctrl, i); 1529 } 1530 1531 static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl) 1532 { 1533 int i, ret = 0; 1534 1535 for (i = 1; i < ctrl->queue_count; i++) { 1536 ret = nvme_tcp_start_queue(ctrl, i); 1537 if (ret) 1538 goto out_stop_queues; 1539 } 1540 1541 return 0; 1542 1543 out_stop_queues: 1544 for (i--; i >= 1; i--) 1545 nvme_tcp_stop_queue(ctrl, i); 1546 return ret; 1547 } 1548 1549 static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl) 1550 { 1551 int ret; 1552 1553 ret = nvme_tcp_alloc_queue(ctrl, 0, NVME_AQ_DEPTH); 1554 if (ret) 1555 return ret; 1556 1557 ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl)); 1558 if (ret) 1559 goto out_free_queue; 1560 1561 return 0; 1562 1563 out_free_queue: 1564 nvme_tcp_free_queue(ctrl, 0); 1565 return ret; 1566 } 1567 1568 static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) 1569 { 1570 int i, ret; 1571 1572 for (i = 1; i < ctrl->queue_count; i++) { 1573 ret = nvme_tcp_alloc_queue(ctrl, i, 1574 ctrl->sqsize + 1); 1575 if (ret) 1576 goto out_free_queues; 1577 } 1578 1579 return 0; 1580 1581 out_free_queues: 1582 for (i--; i >= 1; i--) 1583 nvme_tcp_free_queue(ctrl, i); 1584 1585 return ret; 1586 } 1587 1588 static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl) 1589 { 1590 unsigned int nr_io_queues; 1591 1592 nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus()); 1593 nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus()); 1594 nr_io_queues += min(ctrl->opts->nr_poll_queues, num_online_cpus()); 1595 1596 return nr_io_queues; 1597 } 1598 1599 static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl, 1600 unsigned int nr_io_queues) 1601 { 1602 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); 1603 struct nvmf_ctrl_options *opts = nctrl->opts; 1604 1605 if (opts->nr_write_queues && opts->nr_io_queues < nr_io_queues) { 1606 /* 1607 * separate read/write queues 1608 * hand out dedicated default queues only after we have 1609 * sufficient read queues. 1610 */ 1611 ctrl->io_queues[HCTX_TYPE_READ] = opts->nr_io_queues; 1612 nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ]; 1613 ctrl->io_queues[HCTX_TYPE_DEFAULT] = 1614 min(opts->nr_write_queues, nr_io_queues); 1615 nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT]; 1616 } else { 1617 /* 1618 * shared read/write queues 1619 * either no write queues were requested, or we don't have 1620 * sufficient queue count to have dedicated default queues. 1621 */ 1622 ctrl->io_queues[HCTX_TYPE_DEFAULT] = 1623 min(opts->nr_io_queues, nr_io_queues); 1624 nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT]; 1625 } 1626 1627 if (opts->nr_poll_queues && nr_io_queues) { 1628 /* map dedicated poll queues only if we have queues left */ 1629 ctrl->io_queues[HCTX_TYPE_POLL] = 1630 min(opts->nr_poll_queues, nr_io_queues); 1631 } 1632 } 1633 1634 static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) 1635 { 1636 unsigned int nr_io_queues; 1637 int ret; 1638 1639 nr_io_queues = nvme_tcp_nr_io_queues(ctrl); 1640 ret = nvme_set_queue_count(ctrl, &nr_io_queues); 1641 if (ret) 1642 return ret; 1643 1644 ctrl->queue_count = nr_io_queues + 1; 1645 if (ctrl->queue_count < 2) 1646 return 0; 1647 1648 dev_info(ctrl->device, 1649 "creating %d I/O queues.\n", nr_io_queues); 1650 1651 nvme_tcp_set_io_queues(ctrl, nr_io_queues); 1652 1653 return __nvme_tcp_alloc_io_queues(ctrl); 1654 } 1655 1656 static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove) 1657 { 1658 nvme_tcp_stop_io_queues(ctrl); 1659 if (remove) { 1660 blk_cleanup_queue(ctrl->connect_q); 1661 blk_mq_free_tag_set(ctrl->tagset); 1662 } 1663 nvme_tcp_free_io_queues(ctrl); 1664 } 1665 1666 static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) 1667 { 1668 int ret; 1669 1670 ret = nvme_tcp_alloc_io_queues(ctrl); 1671 if (ret) 1672 return ret; 1673 1674 if (new) { 1675 ctrl->tagset = nvme_tcp_alloc_tagset(ctrl, false); 1676 if (IS_ERR(ctrl->tagset)) { 1677 ret = PTR_ERR(ctrl->tagset); 1678 goto out_free_io_queues; 1679 } 1680 1681 ctrl->connect_q = blk_mq_init_queue(ctrl->tagset); 1682 if (IS_ERR(ctrl->connect_q)) { 1683 ret = PTR_ERR(ctrl->connect_q); 1684 goto out_free_tag_set; 1685 } 1686 } else { 1687 blk_mq_update_nr_hw_queues(ctrl->tagset, 1688 ctrl->queue_count - 1); 1689 } 1690 1691 ret = nvme_tcp_start_io_queues(ctrl); 1692 if (ret) 1693 goto out_cleanup_connect_q; 1694 1695 return 0; 1696 1697 out_cleanup_connect_q: 1698 if (new) 1699 blk_cleanup_queue(ctrl->connect_q); 1700 out_free_tag_set: 1701 if (new) 1702 blk_mq_free_tag_set(ctrl->tagset); 1703 out_free_io_queues: 1704 nvme_tcp_free_io_queues(ctrl); 1705 return ret; 1706 } 1707 1708 static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove) 1709 { 1710 nvme_tcp_stop_queue(ctrl, 0); 1711 if (remove) { 1712 blk_cleanup_queue(ctrl->admin_q); 1713 blk_cleanup_queue(ctrl->fabrics_q); 1714 blk_mq_free_tag_set(ctrl->admin_tagset); 1715 } 1716 nvme_tcp_free_admin_queue(ctrl); 1717 } 1718 1719 static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) 1720 { 1721 int error; 1722 1723 error = nvme_tcp_alloc_admin_queue(ctrl); 1724 if (error) 1725 return error; 1726 1727 if (new) { 1728 ctrl->admin_tagset = nvme_tcp_alloc_tagset(ctrl, true); 1729 if (IS_ERR(ctrl->admin_tagset)) { 1730 error = PTR_ERR(ctrl->admin_tagset); 1731 goto out_free_queue; 1732 } 1733 1734 ctrl->fabrics_q = blk_mq_init_queue(ctrl->admin_tagset); 1735 if (IS_ERR(ctrl->fabrics_q)) { 1736 error = PTR_ERR(ctrl->fabrics_q); 1737 goto out_free_tagset; 1738 } 1739 1740 ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset); 1741 if (IS_ERR(ctrl->admin_q)) { 1742 error = PTR_ERR(ctrl->admin_q); 1743 goto out_cleanup_fabrics_q; 1744 } 1745 } 1746 1747 error = nvme_tcp_start_queue(ctrl, 0); 1748 if (error) 1749 goto out_cleanup_queue; 1750 1751 error = nvme_enable_ctrl(ctrl); 1752 if (error) 1753 goto out_stop_queue; 1754 1755 blk_mq_unquiesce_queue(ctrl->admin_q); 1756 1757 error = nvme_init_identify(ctrl); 1758 if (error) 1759 goto out_stop_queue; 1760 1761 return 0; 1762 1763 out_stop_queue: 1764 nvme_tcp_stop_queue(ctrl, 0); 1765 out_cleanup_queue: 1766 if (new) 1767 blk_cleanup_queue(ctrl->admin_q); 1768 out_cleanup_fabrics_q: 1769 if (new) 1770 blk_cleanup_queue(ctrl->fabrics_q); 1771 out_free_tagset: 1772 if (new) 1773 blk_mq_free_tag_set(ctrl->admin_tagset); 1774 out_free_queue: 1775 nvme_tcp_free_admin_queue(ctrl); 1776 return error; 1777 } 1778 1779 static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl, 1780 bool remove) 1781 { 1782 blk_mq_quiesce_queue(ctrl->admin_q); 1783 nvme_tcp_stop_queue(ctrl, 0); 1784 if (ctrl->admin_tagset) { 1785 blk_mq_tagset_busy_iter(ctrl->admin_tagset, 1786 nvme_cancel_request, ctrl); 1787 blk_mq_tagset_wait_completed_request(ctrl->admin_tagset); 1788 } 1789 if (remove) 1790 blk_mq_unquiesce_queue(ctrl->admin_q); 1791 nvme_tcp_destroy_admin_queue(ctrl, remove); 1792 } 1793 1794 static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, 1795 bool remove) 1796 { 1797 if (ctrl->queue_count <= 1) 1798 return; 1799 nvme_stop_queues(ctrl); 1800 nvme_tcp_stop_io_queues(ctrl); 1801 if (ctrl->tagset) { 1802 blk_mq_tagset_busy_iter(ctrl->tagset, 1803 nvme_cancel_request, ctrl); 1804 blk_mq_tagset_wait_completed_request(ctrl->tagset); 1805 } 1806 if (remove) 1807 nvme_start_queues(ctrl); 1808 nvme_tcp_destroy_io_queues(ctrl, remove); 1809 } 1810 1811 static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl) 1812 { 1813 /* If we are resetting/deleting then do nothing */ 1814 if (ctrl->state != NVME_CTRL_CONNECTING) { 1815 WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW || 1816 ctrl->state == NVME_CTRL_LIVE); 1817 return; 1818 } 1819 1820 if (nvmf_should_reconnect(ctrl)) { 1821 dev_info(ctrl->device, "Reconnecting in %d seconds...\n", 1822 ctrl->opts->reconnect_delay); 1823 queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work, 1824 ctrl->opts->reconnect_delay * HZ); 1825 } else { 1826 dev_info(ctrl->device, "Removing controller...\n"); 1827 nvme_delete_ctrl(ctrl); 1828 } 1829 } 1830 1831 static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) 1832 { 1833 struct nvmf_ctrl_options *opts = ctrl->opts; 1834 int ret; 1835 1836 ret = nvme_tcp_configure_admin_queue(ctrl, new); 1837 if (ret) 1838 return ret; 1839 1840 if (ctrl->icdoff) { 1841 dev_err(ctrl->device, "icdoff is not supported!\n"); 1842 goto destroy_admin; 1843 } 1844 1845 if (opts->queue_size > ctrl->sqsize + 1) 1846 dev_warn(ctrl->device, 1847 "queue_size %zu > ctrl sqsize %u, clamping down\n", 1848 opts->queue_size, ctrl->sqsize + 1); 1849 1850 if (ctrl->sqsize + 1 > ctrl->maxcmd) { 1851 dev_warn(ctrl->device, 1852 "sqsize %u > ctrl maxcmd %u, clamping down\n", 1853 ctrl->sqsize + 1, ctrl->maxcmd); 1854 ctrl->sqsize = ctrl->maxcmd - 1; 1855 } 1856 1857 if (ctrl->queue_count > 1) { 1858 ret = nvme_tcp_configure_io_queues(ctrl, new); 1859 if (ret) 1860 goto destroy_admin; 1861 } 1862 1863 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) { 1864 /* state change failure is ok if we're in DELETING state */ 1865 WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING); 1866 ret = -EINVAL; 1867 goto destroy_io; 1868 } 1869 1870 nvme_start_ctrl(ctrl); 1871 return 0; 1872 1873 destroy_io: 1874 if (ctrl->queue_count > 1) 1875 nvme_tcp_destroy_io_queues(ctrl, new); 1876 destroy_admin: 1877 nvme_tcp_stop_queue(ctrl, 0); 1878 nvme_tcp_destroy_admin_queue(ctrl, new); 1879 return ret; 1880 } 1881 1882 static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work) 1883 { 1884 struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work), 1885 struct nvme_tcp_ctrl, connect_work); 1886 struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl; 1887 1888 ++ctrl->nr_reconnects; 1889 1890 if (nvme_tcp_setup_ctrl(ctrl, false)) 1891 goto requeue; 1892 1893 dev_info(ctrl->device, "Successfully reconnected (%d attempt)\n", 1894 ctrl->nr_reconnects); 1895 1896 ctrl->nr_reconnects = 0; 1897 1898 return; 1899 1900 requeue: 1901 dev_info(ctrl->device, "Failed reconnect attempt %d\n", 1902 ctrl->nr_reconnects); 1903 nvme_tcp_reconnect_or_remove(ctrl); 1904 } 1905 1906 static void nvme_tcp_error_recovery_work(struct work_struct *work) 1907 { 1908 struct nvme_tcp_ctrl *tcp_ctrl = container_of(work, 1909 struct nvme_tcp_ctrl, err_work); 1910 struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl; 1911 1912 nvme_stop_keep_alive(ctrl); 1913 nvme_tcp_teardown_io_queues(ctrl, false); 1914 /* unquiesce to fail fast pending requests */ 1915 nvme_start_queues(ctrl); 1916 nvme_tcp_teardown_admin_queue(ctrl, false); 1917 blk_mq_unquiesce_queue(ctrl->admin_q); 1918 1919 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { 1920 /* state change failure is ok if we're in DELETING state */ 1921 WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING); 1922 return; 1923 } 1924 1925 nvme_tcp_reconnect_or_remove(ctrl); 1926 } 1927 1928 static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown) 1929 { 1930 cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work); 1931 cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work); 1932 1933 nvme_tcp_teardown_io_queues(ctrl, shutdown); 1934 blk_mq_quiesce_queue(ctrl->admin_q); 1935 if (shutdown) 1936 nvme_shutdown_ctrl(ctrl); 1937 else 1938 nvme_disable_ctrl(ctrl); 1939 nvme_tcp_teardown_admin_queue(ctrl, shutdown); 1940 } 1941 1942 static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl) 1943 { 1944 nvme_tcp_teardown_ctrl(ctrl, true); 1945 } 1946 1947 static void nvme_reset_ctrl_work(struct work_struct *work) 1948 { 1949 struct nvme_ctrl *ctrl = 1950 container_of(work, struct nvme_ctrl, reset_work); 1951 1952 nvme_stop_ctrl(ctrl); 1953 nvme_tcp_teardown_ctrl(ctrl, false); 1954 1955 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { 1956 /* state change failure is ok if we're in DELETING state */ 1957 WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING); 1958 return; 1959 } 1960 1961 if (nvme_tcp_setup_ctrl(ctrl, false)) 1962 goto out_fail; 1963 1964 return; 1965 1966 out_fail: 1967 ++ctrl->nr_reconnects; 1968 nvme_tcp_reconnect_or_remove(ctrl); 1969 } 1970 1971 static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl) 1972 { 1973 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); 1974 1975 if (list_empty(&ctrl->list)) 1976 goto free_ctrl; 1977 1978 mutex_lock(&nvme_tcp_ctrl_mutex); 1979 list_del(&ctrl->list); 1980 mutex_unlock(&nvme_tcp_ctrl_mutex); 1981 1982 nvmf_free_options(nctrl->opts); 1983 free_ctrl: 1984 kfree(ctrl->queues); 1985 kfree(ctrl); 1986 } 1987 1988 static void nvme_tcp_set_sg_null(struct nvme_command *c) 1989 { 1990 struct nvme_sgl_desc *sg = &c->common.dptr.sgl; 1991 1992 sg->addr = 0; 1993 sg->length = 0; 1994 sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) | 1995 NVME_SGL_FMT_TRANSPORT_A; 1996 } 1997 1998 static void nvme_tcp_set_sg_inline(struct nvme_tcp_queue *queue, 1999 struct nvme_command *c, u32 data_len) 2000 { 2001 struct nvme_sgl_desc *sg = &c->common.dptr.sgl; 2002 2003 sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff); 2004 sg->length = cpu_to_le32(data_len); 2005 sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET; 2006 } 2007 2008 static void nvme_tcp_set_sg_host_data(struct nvme_command *c, 2009 u32 data_len) 2010 { 2011 struct nvme_sgl_desc *sg = &c->common.dptr.sgl; 2012 2013 sg->addr = 0; 2014 sg->length = cpu_to_le32(data_len); 2015 sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) | 2016 NVME_SGL_FMT_TRANSPORT_A; 2017 } 2018 2019 static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg) 2020 { 2021 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(arg); 2022 struct nvme_tcp_queue *queue = &ctrl->queues[0]; 2023 struct nvme_tcp_cmd_pdu *pdu = ctrl->async_req.pdu; 2024 struct nvme_command *cmd = &pdu->cmd; 2025 u8 hdgst = nvme_tcp_hdgst_len(queue); 2026 2027 memset(pdu, 0, sizeof(*pdu)); 2028 pdu->hdr.type = nvme_tcp_cmd; 2029 if (queue->hdr_digest) 2030 pdu->hdr.flags |= NVME_TCP_F_HDGST; 2031 pdu->hdr.hlen = sizeof(*pdu); 2032 pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst); 2033 2034 cmd->common.opcode = nvme_admin_async_event; 2035 cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH; 2036 cmd->common.flags |= NVME_CMD_SGL_METABUF; 2037 nvme_tcp_set_sg_null(cmd); 2038 2039 ctrl->async_req.state = NVME_TCP_SEND_CMD_PDU; 2040 ctrl->async_req.offset = 0; 2041 ctrl->async_req.curr_bio = NULL; 2042 ctrl->async_req.data_len = 0; 2043 2044 nvme_tcp_queue_request(&ctrl->async_req); 2045 } 2046 2047 static enum blk_eh_timer_return 2048 nvme_tcp_timeout(struct request *rq, bool reserved) 2049 { 2050 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); 2051 struct nvme_tcp_ctrl *ctrl = req->queue->ctrl; 2052 struct nvme_tcp_cmd_pdu *pdu = req->pdu; 2053 2054 /* 2055 * Restart the timer if a controller reset is already scheduled. Any 2056 * timed out commands would be handled before entering the connecting 2057 * state. 2058 */ 2059 if (ctrl->ctrl.state == NVME_CTRL_RESETTING) 2060 return BLK_EH_RESET_TIMER; 2061 2062 dev_warn(ctrl->ctrl.device, 2063 "queue %d: timeout request %#x type %d\n", 2064 nvme_tcp_queue_id(req->queue), rq->tag, pdu->hdr.type); 2065 2066 if (ctrl->ctrl.state != NVME_CTRL_LIVE) { 2067 /* 2068 * Teardown immediately if controller times out while starting 2069 * or we are already started error recovery. all outstanding 2070 * requests are completed on shutdown, so we return BLK_EH_DONE. 2071 */ 2072 flush_work(&ctrl->err_work); 2073 nvme_tcp_teardown_io_queues(&ctrl->ctrl, false); 2074 nvme_tcp_teardown_admin_queue(&ctrl->ctrl, false); 2075 return BLK_EH_DONE; 2076 } 2077 2078 dev_warn(ctrl->ctrl.device, "starting error recovery\n"); 2079 nvme_tcp_error_recovery(&ctrl->ctrl); 2080 2081 return BLK_EH_RESET_TIMER; 2082 } 2083 2084 static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue, 2085 struct request *rq) 2086 { 2087 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); 2088 struct nvme_tcp_cmd_pdu *pdu = req->pdu; 2089 struct nvme_command *c = &pdu->cmd; 2090 2091 c->common.flags |= NVME_CMD_SGL_METABUF; 2092 2093 if (rq_data_dir(rq) == WRITE && req->data_len && 2094 req->data_len <= nvme_tcp_inline_data_size(queue)) 2095 nvme_tcp_set_sg_inline(queue, c, req->data_len); 2096 else 2097 nvme_tcp_set_sg_host_data(c, req->data_len); 2098 2099 return 0; 2100 } 2101 2102 static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns, 2103 struct request *rq) 2104 { 2105 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); 2106 struct nvme_tcp_cmd_pdu *pdu = req->pdu; 2107 struct nvme_tcp_queue *queue = req->queue; 2108 u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0; 2109 blk_status_t ret; 2110 2111 ret = nvme_setup_cmd(ns, rq, &pdu->cmd); 2112 if (ret) 2113 return ret; 2114 2115 req->state = NVME_TCP_SEND_CMD_PDU; 2116 req->offset = 0; 2117 req->data_sent = 0; 2118 req->pdu_len = 0; 2119 req->pdu_sent = 0; 2120 req->data_len = blk_rq_payload_bytes(rq); 2121 req->curr_bio = rq->bio; 2122 2123 if (rq_data_dir(rq) == WRITE && 2124 req->data_len <= nvme_tcp_inline_data_size(queue)) 2125 req->pdu_len = req->data_len; 2126 else if (req->curr_bio) 2127 nvme_tcp_init_iter(req, READ); 2128 2129 pdu->hdr.type = nvme_tcp_cmd; 2130 pdu->hdr.flags = 0; 2131 if (queue->hdr_digest) 2132 pdu->hdr.flags |= NVME_TCP_F_HDGST; 2133 if (queue->data_digest && req->pdu_len) { 2134 pdu->hdr.flags |= NVME_TCP_F_DDGST; 2135 ddgst = nvme_tcp_ddgst_len(queue); 2136 } 2137 pdu->hdr.hlen = sizeof(*pdu); 2138 pdu->hdr.pdo = req->pdu_len ? pdu->hdr.hlen + hdgst : 0; 2139 pdu->hdr.plen = 2140 cpu_to_le32(pdu->hdr.hlen + hdgst + req->pdu_len + ddgst); 2141 2142 ret = nvme_tcp_map_data(queue, rq); 2143 if (unlikely(ret)) { 2144 nvme_cleanup_cmd(rq); 2145 dev_err(queue->ctrl->ctrl.device, 2146 "Failed to map data (%d)\n", ret); 2147 return ret; 2148 } 2149 2150 return 0; 2151 } 2152 2153 static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx, 2154 const struct blk_mq_queue_data *bd) 2155 { 2156 struct nvme_ns *ns = hctx->queue->queuedata; 2157 struct nvme_tcp_queue *queue = hctx->driver_data; 2158 struct request *rq = bd->rq; 2159 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); 2160 bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags); 2161 blk_status_t ret; 2162 2163 if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready)) 2164 return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq); 2165 2166 ret = nvme_tcp_setup_cmd_pdu(ns, rq); 2167 if (unlikely(ret)) 2168 return ret; 2169 2170 blk_mq_start_request(rq); 2171 2172 nvme_tcp_queue_request(req); 2173 2174 return BLK_STS_OK; 2175 } 2176 2177 static int nvme_tcp_map_queues(struct blk_mq_tag_set *set) 2178 { 2179 struct nvme_tcp_ctrl *ctrl = set->driver_data; 2180 struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; 2181 2182 if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) { 2183 /* separate read/write queues */ 2184 set->map[HCTX_TYPE_DEFAULT].nr_queues = 2185 ctrl->io_queues[HCTX_TYPE_DEFAULT]; 2186 set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; 2187 set->map[HCTX_TYPE_READ].nr_queues = 2188 ctrl->io_queues[HCTX_TYPE_READ]; 2189 set->map[HCTX_TYPE_READ].queue_offset = 2190 ctrl->io_queues[HCTX_TYPE_DEFAULT]; 2191 } else { 2192 /* shared read/write queues */ 2193 set->map[HCTX_TYPE_DEFAULT].nr_queues = 2194 ctrl->io_queues[HCTX_TYPE_DEFAULT]; 2195 set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; 2196 set->map[HCTX_TYPE_READ].nr_queues = 2197 ctrl->io_queues[HCTX_TYPE_DEFAULT]; 2198 set->map[HCTX_TYPE_READ].queue_offset = 0; 2199 } 2200 blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); 2201 blk_mq_map_queues(&set->map[HCTX_TYPE_READ]); 2202 2203 if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) { 2204 /* map dedicated poll queues only if we have queues left */ 2205 set->map[HCTX_TYPE_POLL].nr_queues = 2206 ctrl->io_queues[HCTX_TYPE_POLL]; 2207 set->map[HCTX_TYPE_POLL].queue_offset = 2208 ctrl->io_queues[HCTX_TYPE_DEFAULT] + 2209 ctrl->io_queues[HCTX_TYPE_READ]; 2210 blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]); 2211 } 2212 2213 dev_info(ctrl->ctrl.device, 2214 "mapped %d/%d/%d default/read/poll queues.\n", 2215 ctrl->io_queues[HCTX_TYPE_DEFAULT], 2216 ctrl->io_queues[HCTX_TYPE_READ], 2217 ctrl->io_queues[HCTX_TYPE_POLL]); 2218 2219 return 0; 2220 } 2221 2222 static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx) 2223 { 2224 struct nvme_tcp_queue *queue = hctx->driver_data; 2225 struct sock *sk = queue->sock->sk; 2226 2227 if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue)) 2228 sk_busy_loop(sk, true); 2229 nvme_tcp_try_recv(queue); 2230 return queue->nr_cqe; 2231 } 2232 2233 static struct blk_mq_ops nvme_tcp_mq_ops = { 2234 .queue_rq = nvme_tcp_queue_rq, 2235 .complete = nvme_complete_rq, 2236 .init_request = nvme_tcp_init_request, 2237 .exit_request = nvme_tcp_exit_request, 2238 .init_hctx = nvme_tcp_init_hctx, 2239 .timeout = nvme_tcp_timeout, 2240 .map_queues = nvme_tcp_map_queues, 2241 .poll = nvme_tcp_poll, 2242 }; 2243 2244 static struct blk_mq_ops nvme_tcp_admin_mq_ops = { 2245 .queue_rq = nvme_tcp_queue_rq, 2246 .complete = nvme_complete_rq, 2247 .init_request = nvme_tcp_init_request, 2248 .exit_request = nvme_tcp_exit_request, 2249 .init_hctx = nvme_tcp_init_admin_hctx, 2250 .timeout = nvme_tcp_timeout, 2251 }; 2252 2253 static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = { 2254 .name = "tcp", 2255 .module = THIS_MODULE, 2256 .flags = NVME_F_FABRICS, 2257 .reg_read32 = nvmf_reg_read32, 2258 .reg_read64 = nvmf_reg_read64, 2259 .reg_write32 = nvmf_reg_write32, 2260 .free_ctrl = nvme_tcp_free_ctrl, 2261 .submit_async_event = nvme_tcp_submit_async_event, 2262 .delete_ctrl = nvme_tcp_delete_ctrl, 2263 .get_address = nvmf_get_address, 2264 }; 2265 2266 static bool 2267 nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts) 2268 { 2269 struct nvme_tcp_ctrl *ctrl; 2270 bool found = false; 2271 2272 mutex_lock(&nvme_tcp_ctrl_mutex); 2273 list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) { 2274 found = nvmf_ip_options_match(&ctrl->ctrl, opts); 2275 if (found) 2276 break; 2277 } 2278 mutex_unlock(&nvme_tcp_ctrl_mutex); 2279 2280 return found; 2281 } 2282 2283 static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, 2284 struct nvmf_ctrl_options *opts) 2285 { 2286 struct nvme_tcp_ctrl *ctrl; 2287 int ret; 2288 2289 ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); 2290 if (!ctrl) 2291 return ERR_PTR(-ENOMEM); 2292 2293 INIT_LIST_HEAD(&ctrl->list); 2294 ctrl->ctrl.opts = opts; 2295 ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + 2296 opts->nr_poll_queues + 1; 2297 ctrl->ctrl.sqsize = opts->queue_size - 1; 2298 ctrl->ctrl.kato = opts->kato; 2299 2300 INIT_DELAYED_WORK(&ctrl->connect_work, 2301 nvme_tcp_reconnect_ctrl_work); 2302 INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work); 2303 INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work); 2304 2305 if (!(opts->mask & NVMF_OPT_TRSVCID)) { 2306 opts->trsvcid = 2307 kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL); 2308 if (!opts->trsvcid) { 2309 ret = -ENOMEM; 2310 goto out_free_ctrl; 2311 } 2312 opts->mask |= NVMF_OPT_TRSVCID; 2313 } 2314 2315 ret = inet_pton_with_scope(&init_net, AF_UNSPEC, 2316 opts->traddr, opts->trsvcid, &ctrl->addr); 2317 if (ret) { 2318 pr_err("malformed address passed: %s:%s\n", 2319 opts->traddr, opts->trsvcid); 2320 goto out_free_ctrl; 2321 } 2322 2323 if (opts->mask & NVMF_OPT_HOST_TRADDR) { 2324 ret = inet_pton_with_scope(&init_net, AF_UNSPEC, 2325 opts->host_traddr, NULL, &ctrl->src_addr); 2326 if (ret) { 2327 pr_err("malformed src address passed: %s\n", 2328 opts->host_traddr); 2329 goto out_free_ctrl; 2330 } 2331 } 2332 2333 if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) { 2334 ret = -EALREADY; 2335 goto out_free_ctrl; 2336 } 2337 2338 ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues), 2339 GFP_KERNEL); 2340 if (!ctrl->queues) { 2341 ret = -ENOMEM; 2342 goto out_free_ctrl; 2343 } 2344 2345 ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_tcp_ctrl_ops, 0); 2346 if (ret) 2347 goto out_kfree_queues; 2348 2349 if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { 2350 WARN_ON_ONCE(1); 2351 ret = -EINTR; 2352 goto out_uninit_ctrl; 2353 } 2354 2355 ret = nvme_tcp_setup_ctrl(&ctrl->ctrl, true); 2356 if (ret) 2357 goto out_uninit_ctrl; 2358 2359 dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n", 2360 ctrl->ctrl.opts->subsysnqn, &ctrl->addr); 2361 2362 nvme_get_ctrl(&ctrl->ctrl); 2363 2364 mutex_lock(&nvme_tcp_ctrl_mutex); 2365 list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list); 2366 mutex_unlock(&nvme_tcp_ctrl_mutex); 2367 2368 return &ctrl->ctrl; 2369 2370 out_uninit_ctrl: 2371 nvme_uninit_ctrl(&ctrl->ctrl); 2372 nvme_put_ctrl(&ctrl->ctrl); 2373 if (ret > 0) 2374 ret = -EIO; 2375 return ERR_PTR(ret); 2376 out_kfree_queues: 2377 kfree(ctrl->queues); 2378 out_free_ctrl: 2379 kfree(ctrl); 2380 return ERR_PTR(ret); 2381 } 2382 2383 static struct nvmf_transport_ops nvme_tcp_transport = { 2384 .name = "tcp", 2385 .module = THIS_MODULE, 2386 .required_opts = NVMF_OPT_TRADDR, 2387 .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY | 2388 NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO | 2389 NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST | 2390 NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES | 2391 NVMF_OPT_TOS, 2392 .create_ctrl = nvme_tcp_create_ctrl, 2393 }; 2394 2395 static int __init nvme_tcp_init_module(void) 2396 { 2397 nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq", 2398 WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); 2399 if (!nvme_tcp_wq) 2400 return -ENOMEM; 2401 2402 nvmf_register_transport(&nvme_tcp_transport); 2403 return 0; 2404 } 2405 2406 static void __exit nvme_tcp_cleanup_module(void) 2407 { 2408 struct nvme_tcp_ctrl *ctrl; 2409 2410 nvmf_unregister_transport(&nvme_tcp_transport); 2411 2412 mutex_lock(&nvme_tcp_ctrl_mutex); 2413 list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) 2414 nvme_delete_ctrl(&ctrl->ctrl); 2415 mutex_unlock(&nvme_tcp_ctrl_mutex); 2416 flush_workqueue(nvme_delete_wq); 2417 2418 destroy_workqueue(nvme_tcp_wq); 2419 } 2420 2421 module_init(nvme_tcp_init_module); 2422 module_exit(nvme_tcp_cleanup_module); 2423 2424 MODULE_LICENSE("GPL v2"); 2425