/*
 * NVMe over Fabrics RDMA target.
 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/delay.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/nvme.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/wait.h>
#include <linux/inet.h>
#include <asm/unaligned.h>

#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
#include <rdma/rw.h>

#include <linux/nvme-rdma.h>
#include "nvmet.h"

/*
 * We allow at least 1 page, up to 4 SGEs, and up to 16KB of inline data
 */
#define NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE	PAGE_SIZE
#define NVMET_RDMA_MAX_INLINE_SGE		4
#define NVMET_RDMA_MAX_INLINE_DATA_SIZE		max_t(int, SZ_16K, PAGE_SIZE)

struct nvmet_rdma_cmd {
	struct ib_sge sge[NVMET_RDMA_MAX_INLINE_SGE + 1];
	struct ib_cqe cqe;
	struct ib_recv_wr wr;
	struct scatterlist inline_sg[NVMET_RDMA_MAX_INLINE_SGE];
	struct nvme_command *nvme_cmd;
	struct nvmet_rdma_queue *queue;
};

enum {
	NVMET_RDMA_REQ_INLINE_DATA	= (1 << 0),
	NVMET_RDMA_REQ_INVALIDATE_RKEY	= (1 << 1),
};

struct nvmet_rdma_rsp {
	struct ib_sge send_sge;
	struct ib_cqe send_cqe;
	struct ib_send_wr send_wr;

	struct nvmet_rdma_cmd *cmd;
	struct nvmet_rdma_queue *queue;

	struct ib_cqe read_cqe;
	struct rdma_rw_ctx rw;

	struct nvmet_req req;

	bool allocated;
	u8 n_rdma;
	u32 flags;
	u32 invalidate_rkey;

	struct list_head wait_list;
	struct list_head free_list;
};

enum nvmet_rdma_queue_state {
	NVMET_RDMA_Q_CONNECTING,
	NVMET_RDMA_Q_LIVE,
	NVMET_RDMA_Q_DISCONNECTING,
};

struct nvmet_rdma_queue {
	struct rdma_cm_id *cm_id;
	struct nvmet_port *port;
	struct ib_cq *cq;
	atomic_t sq_wr_avail;
	struct nvmet_rdma_device *dev;
	spinlock_t state_lock;
	enum nvmet_rdma_queue_state state;
	struct nvmet_cq nvme_cq;
	struct nvmet_sq nvme_sq;

	struct nvmet_rdma_rsp *rsps;
	struct list_head free_rsps;
	spinlock_t rsps_lock;
	struct nvmet_rdma_cmd *cmds;

	struct work_struct release_work;
	struct list_head rsp_wait_list;
	struct list_head rsp_wr_wait_list;
	spinlock_t rsp_wr_wait_lock;

	int idx;
	int host_qid;
	int recv_queue_size;
	int send_queue_size;

	struct list_head queue_list;
};

struct nvmet_rdma_device {
	struct ib_device *device;
	struct ib_pd *pd;
	struct ib_srq *srq;
	struct nvmet_rdma_cmd *srq_cmds;
	size_t srq_size;
	struct kref ref;
	struct list_head entry;
	int inline_data_size;
	int inline_page_count;
};

static struct workqueue_struct *nvmet_rdma_delete_wq;
static bool nvmet_rdma_use_srq;
module_param_named(use_srq, nvmet_rdma_use_srq, bool, 0444);
MODULE_PARM_DESC(use_srq, "Use shared receive queue.");

static DEFINE_IDA(nvmet_rdma_queue_ida);
static LIST_HEAD(nvmet_rdma_queue_list);
static DEFINE_MUTEX(nvmet_rdma_queue_mutex);
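/*
 * RDMA devices are shared between queues that land on the same IB device:
 * nvmet_rdma_find_get_device() looks them up on device_list (under
 * device_list_mutex) and takes a reference; the last kref_put frees them.
 */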
static LIST_HEAD(device_list);
static DEFINE_MUTEX(device_list_mutex);

static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp);
static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_qp_event(struct ib_event *event, void *priv);
static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue);

static const struct nvmet_fabrics_ops nvmet_rdma_ops;

static int num_pages(int len)
{
	return 1 + (((len - 1) & PAGE_MASK) >> PAGE_SHIFT);
}

/* XXX: really should move to a generic header sooner or later.. */
static inline u32 get_unaligned_le24(const u8 *p)
{
	return (u32)p[0] | (u32)p[1] << 8 | (u32)p[2] << 16;
}

static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp)
{
	return nvme_is_write(rsp->req.cmd) &&
		rsp->req.transfer_len &&
		!(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
}

static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp)
{
	return !nvme_is_write(rsp->req.cmd) &&
		rsp->req.transfer_len &&
		!rsp->req.rsp->status &&
		!(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
}

static inline struct nvmet_rdma_rsp *
nvmet_rdma_get_rsp(struct nvmet_rdma_queue *queue)
{
	struct nvmet_rdma_rsp *rsp;
	unsigned long flags;

	spin_lock_irqsave(&queue->rsps_lock, flags);
	rsp = list_first_entry_or_null(&queue->free_rsps,
				struct nvmet_rdma_rsp, free_list);
	if (likely(rsp))
		list_del(&rsp->free_list);
	spin_unlock_irqrestore(&queue->rsps_lock, flags);

	if (unlikely(!rsp)) {
		rsp = kmalloc(sizeof(*rsp), GFP_KERNEL);
		if (unlikely(!rsp))
			return NULL;
		rsp->allocated = true;
	}

	return rsp;
}

static inline void
nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
{
	unsigned long flags;

	if (rsp->allocated) {
		kfree(rsp);
		return;
	}

	spin_lock_irqsave(&rsp->queue->rsps_lock, flags);
	list_add_tail(&rsp->free_list, &rsp->queue->free_rsps);
	spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags);
}

static void nvmet_rdma_free_inline_pages(struct nvmet_rdma_device *ndev,
				struct nvmet_rdma_cmd *c)
{
	struct scatterlist *sg;
	struct ib_sge *sge;
	int i;

	if (!ndev->inline_data_size)
		return;

	sg = c->inline_sg;
	sge = &c->sge[1];

	for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
		if (sge->length)
			ib_dma_unmap_page(ndev->device, sge->addr,
					sge->length, DMA_FROM_DEVICE);
		if (sg_page(sg))
			__free_page(sg_page(sg));
	}
}
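/*
 * Allocate and DMA-map the pages used to receive in-capsule (inline) data
 * for one command: sge[0] always holds the NVMe command itself, the inline
 * data pages are hung off sge[1..inline_page_count].
 */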
static int nvmet_rdma_alloc_inline_pages(struct nvmet_rdma_device *ndev,
				struct nvmet_rdma_cmd *c)
{
	struct scatterlist *sg;
	struct ib_sge *sge;
	struct page *pg;
	int len;
	int i;

	if (!ndev->inline_data_size)
		return 0;

	sg = c->inline_sg;
	sg_init_table(sg, ndev->inline_page_count);
	sge = &c->sge[1];
	len = ndev->inline_data_size;

	for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
		pg = alloc_page(GFP_KERNEL);
		if (!pg)
			goto out_err;
		sg_assign_page(sg, pg);
		sge->addr = ib_dma_map_page(ndev->device,
			pg, 0, PAGE_SIZE, DMA_FROM_DEVICE);
		if (ib_dma_mapping_error(ndev->device, sge->addr))
			goto out_err;
		sge->length = min_t(int, len, PAGE_SIZE);
		sge->lkey = ndev->pd->local_dma_lkey;
		len -= sge->length;
	}

	return 0;
out_err:
	for (; i >= 0; i--, sg--, sge--) {
		if (sge->length)
			ib_dma_unmap_page(ndev->device, sge->addr,
					sge->length, DMA_FROM_DEVICE);
		if (sg_page(sg))
			__free_page(sg_page(sg));
	}
	return -ENOMEM;
}

static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev,
			struct nvmet_rdma_cmd *c, bool admin)
{
	/* NVMe command / RDMA RECV */
	c->nvme_cmd = kmalloc(sizeof(*c->nvme_cmd), GFP_KERNEL);
	if (!c->nvme_cmd)
		goto out;

	c->sge[0].addr = ib_dma_map_single(ndev->device, c->nvme_cmd,
			sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
	if (ib_dma_mapping_error(ndev->device, c->sge[0].addr))
		goto out_free_cmd;

	c->sge[0].length = sizeof(*c->nvme_cmd);
	c->sge[0].lkey = ndev->pd->local_dma_lkey;

	if (!admin && nvmet_rdma_alloc_inline_pages(ndev, c))
		goto out_unmap_cmd;

	c->cqe.done = nvmet_rdma_recv_done;

	c->wr.wr_cqe = &c->cqe;
	c->wr.sg_list = c->sge;
	c->wr.num_sge = admin ? 1 : ndev->inline_page_count + 1;

	return 0;

out_unmap_cmd:
	ib_dma_unmap_single(ndev->device, c->sge[0].addr,
			sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
out_free_cmd:
	kfree(c->nvme_cmd);

out:
	return -ENOMEM;
}

static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_cmd *c, bool admin)
{
	if (!admin)
		nvmet_rdma_free_inline_pages(ndev, c);
	ib_dma_unmap_single(ndev->device, c->sge[0].addr,
				sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
	kfree(c->nvme_cmd);
}

static struct nvmet_rdma_cmd *
nvmet_rdma_alloc_cmds(struct nvmet_rdma_device *ndev,
		int nr_cmds, bool admin)
{
	struct nvmet_rdma_cmd *cmds;
	int ret = -EINVAL, i;

	cmds = kcalloc(nr_cmds, sizeof(struct nvmet_rdma_cmd), GFP_KERNEL);
	if (!cmds)
		goto out;

	for (i = 0; i < nr_cmds; i++) {
		ret = nvmet_rdma_alloc_cmd(ndev, cmds + i, admin);
		if (ret)
			goto out_free;
	}

	return cmds;

out_free:
	while (--i >= 0)
		nvmet_rdma_free_cmd(ndev, cmds + i, admin);
	kfree(cmds);
out:
	return ERR_PTR(ret);
}

static void nvmet_rdma_free_cmds(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_cmd *cmds, int nr_cmds, bool admin)
{
	int i;

	for (i = 0; i < nr_cmds; i++)
		nvmet_rdma_free_cmd(ndev, cmds + i, admin);
	kfree(cmds);
}

static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_rsp *r)
{
	/* NVMe CQE / RDMA SEND */
	r->req.rsp = kmalloc(sizeof(*r->req.rsp), GFP_KERNEL);
	if (!r->req.rsp)
		goto out;

	r->send_sge.addr = ib_dma_map_single(ndev->device, r->req.rsp,
			sizeof(*r->req.rsp), DMA_TO_DEVICE);
	if (ib_dma_mapping_error(ndev->device, r->send_sge.addr))
		goto out_free_rsp;

	r->send_sge.length = sizeof(*r->req.rsp);
	r->send_sge.lkey = ndev->pd->local_dma_lkey;

	r->send_cqe.done = nvmet_rdma_send_done;

	r->send_wr.wr_cqe = &r->send_cqe;
	r->send_wr.sg_list = &r->send_sge;
	r->send_wr.num_sge = 1;
	r->send_wr.send_flags = IB_SEND_SIGNALED;

	/* Data In / RDMA READ */
	r->read_cqe.done = nvmet_rdma_read_data_done;
	return 0;

out_free_rsp:
	kfree(r->req.rsp);
out:
	return -ENOMEM;
}

static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_rsp *r)
{
	ib_dma_unmap_single(ndev->device, r->send_sge.addr,
				sizeof(*r->req.rsp), DMA_TO_DEVICE);
	kfree(r->req.rsp);
}
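/*
 * Each queue keeps a pool of pre-allocated response contexts, twice the RECV
 * queue depth; nvmet_rdma_get_rsp() falls back to kmalloc() if the pool runs
 * dry under load.
 */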
static int
nvmet_rdma_alloc_rsps(struct nvmet_rdma_queue *queue)
{
	struct nvmet_rdma_device *ndev = queue->dev;
	int nr_rsps = queue->recv_queue_size * 2;
	int ret = -EINVAL, i;

	queue->rsps = kcalloc(nr_rsps, sizeof(struct nvmet_rdma_rsp),
			GFP_KERNEL);
	if (!queue->rsps)
		goto out;

	for (i = 0; i < nr_rsps; i++) {
		struct nvmet_rdma_rsp *rsp = &queue->rsps[i];

		ret = nvmet_rdma_alloc_rsp(ndev, rsp);
		if (ret)
			goto out_free;

		list_add_tail(&rsp->free_list, &queue->free_rsps);
	}

	return 0;

out_free:
	while (--i >= 0) {
		struct nvmet_rdma_rsp *rsp = &queue->rsps[i];

		list_del(&rsp->free_list);
		nvmet_rdma_free_rsp(ndev, rsp);
	}
	kfree(queue->rsps);
out:
	return ret;
}

static void nvmet_rdma_free_rsps(struct nvmet_rdma_queue *queue)
{
	struct nvmet_rdma_device *ndev = queue->dev;
	int i, nr_rsps = queue->recv_queue_size * 2;

	for (i = 0; i < nr_rsps; i++) {
		struct nvmet_rdma_rsp *rsp = &queue->rsps[i];

		list_del(&rsp->free_list);
		nvmet_rdma_free_rsp(ndev, rsp);
	}
	kfree(queue->rsps);
}

static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_cmd *cmd)
{
	int ret;

	ib_dma_sync_single_for_device(ndev->device,
		cmd->sge[0].addr, cmd->sge[0].length,
		DMA_FROM_DEVICE);

	if (ndev->srq)
		ret = ib_post_srq_recv(ndev->srq, &cmd->wr, NULL);
	else
		ret = ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, NULL);

	if (unlikely(ret))
		pr_err("post_recv cmd failed\n");

	return ret;
}

static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue)
{
	spin_lock(&queue->rsp_wr_wait_lock);
	while (!list_empty(&queue->rsp_wr_wait_list)) {
		struct nvmet_rdma_rsp *rsp;
		bool ret;

		rsp = list_entry(queue->rsp_wr_wait_list.next,
				struct nvmet_rdma_rsp, wait_list);
		list_del(&rsp->wait_list);

		spin_unlock(&queue->rsp_wr_wait_lock);
		ret = nvmet_rdma_execute_command(rsp);
		spin_lock(&queue->rsp_wr_wait_lock);

		if (!ret) {
			list_add(&rsp->wait_list, &queue->rsp_wr_wait_list);
			break;
		}
	}
	spin_unlock(&queue->rsp_wr_wait_lock);
}

static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
{
	struct nvmet_rdma_queue *queue = rsp->queue;

	atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);

	if (rsp->n_rdma) {
		rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
				queue->cm_id->port_num, rsp->req.sg,
				rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
	}

	if (rsp->req.sg != rsp->cmd->inline_sg)
		nvmet_req_free_sgl(&rsp->req);

	if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list)))
		nvmet_rdma_process_wr_wait_list(queue);

	nvmet_rdma_put_rsp(rsp);
}

static void nvmet_rdma_error_comp(struct nvmet_rdma_queue *queue)
{
	if (queue->nvme_sq.ctrl) {
		nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
	} else {
		/*
		 * We didn't set up the controller yet in case of an admin
		 * connect error, so just disconnect and clean up the queue.
		 */
		nvmet_rdma_queue_disconnect(queue);
	}
}
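/*
 * SEND completion: the command is fully done on the wire, so the response
 * context and the send queue slots it consumed can be returned to the queue.
 */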
static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct nvmet_rdma_rsp *rsp =
		container_of(wc->wr_cqe, struct nvmet_rdma_rsp, send_cqe);
	struct nvmet_rdma_queue *queue = rsp->queue;

	/*
	 * Grab the queue before releasing the rsp: the release may kfree() a
	 * dynamically allocated rsp, so rsp must not be touched afterwards.
	 */
	nvmet_rdma_release_rsp(rsp);

	if (unlikely(wc->status != IB_WC_SUCCESS &&
	    wc->status != IB_WC_WR_FLUSH_ERR)) {
		pr_err("SEND for CQE 0x%p failed with status %s (%d).\n",
			wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
		nvmet_rdma_error_comp(queue);
	}
}

static void nvmet_rdma_queue_response(struct nvmet_req *req)
{
	struct nvmet_rdma_rsp *rsp =
		container_of(req, struct nvmet_rdma_rsp, req);
	struct rdma_cm_id *cm_id = rsp->queue->cm_id;
	struct ib_send_wr *first_wr;

	if (rsp->flags & NVMET_RDMA_REQ_INVALIDATE_RKEY) {
		rsp->send_wr.opcode = IB_WR_SEND_WITH_INV;
		rsp->send_wr.ex.invalidate_rkey = rsp->invalidate_rkey;
	} else {
		rsp->send_wr.opcode = IB_WR_SEND;
	}

	if (nvmet_rdma_need_data_out(rsp))
		first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp,
				cm_id->port_num, NULL, &rsp->send_wr);
	else
		first_wr = &rsp->send_wr;

	nvmet_rdma_post_recv(rsp->queue->dev, rsp->cmd);

	ib_dma_sync_single_for_device(rsp->queue->dev->device,
		rsp->send_sge.addr, rsp->send_sge.length,
		DMA_TO_DEVICE);

	if (unlikely(ib_post_send(cm_id->qp, first_wr, NULL))) {
		pr_err("sending cmd response failed\n");
		nvmet_rdma_release_rsp(rsp);
	}
}

static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct nvmet_rdma_rsp *rsp =
		container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe);
	struct nvmet_rdma_queue *queue = cq->cq_context;

	WARN_ON(rsp->n_rdma <= 0);
	atomic_add(rsp->n_rdma, &queue->sq_wr_avail);
	rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
			queue->cm_id->port_num, rsp->req.sg,
			rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
	rsp->n_rdma = 0;

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		nvmet_req_uninit(&rsp->req);
		nvmet_rdma_release_rsp(rsp);
		if (wc->status != IB_WC_WR_FLUSH_ERR) {
			pr_info("RDMA READ for CQE 0x%p failed with status %s (%d).\n",
				wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
			nvmet_rdma_error_comp(queue);
		}
		return;
	}

	nvmet_req_execute(&rsp->req);
}

static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len,
		u64 off)
{
	int sg_count = num_pages(len);
	struct scatterlist *sg;
	int i;

	sg = rsp->cmd->inline_sg;
	for (i = 0; i < sg_count; i++, sg++) {
		if (i < sg_count - 1)
			sg_unmark_end(sg);
		else
			sg_mark_end(sg);
		sg->offset = off;
		sg->length = min_t(int, len, PAGE_SIZE - off);
		len -= sg->length;
		if (!i)
			off = 0;
	}

	rsp->req.sg = rsp->cmd->inline_sg;
	rsp->req.sg_cnt = sg_count;
}

static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp)
{
	struct nvme_sgl_desc *sgl = &rsp->req.cmd->common.dptr.sgl;
	u64 off = le64_to_cpu(sgl->addr);
	u32 len = le32_to_cpu(sgl->length);

	if (!nvme_is_write(rsp->req.cmd))
		return NVME_SC_INVALID_FIELD | NVME_SC_DNR;

	if (off + len > rsp->queue->dev->inline_data_size) {
		pr_err("invalid inline data offset!\n");
		return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
	}

	/* no data command? */
	if (!len)
		return 0;

	nvmet_rdma_use_inline_sg(rsp, len, off);
	rsp->flags |= NVMET_RDMA_REQ_INLINE_DATA;
	rsp->req.transfer_len += len;
	return 0;
}
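/*
 * Keyed SGL: the host described a remote buffer by (address, rkey, length).
 * Allocate a local SG list and set up an rdma_rw context so the payload can
 * be moved with RDMA READ/WRITE, optionally invalidating the rkey on SEND.
 */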
static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
		struct nvme_keyed_sgl_desc *sgl, bool invalidate)
{
	struct rdma_cm_id *cm_id = rsp->queue->cm_id;
	u64 addr = le64_to_cpu(sgl->addr);
	u32 key = get_unaligned_le32(sgl->key);
	int ret;

	rsp->req.transfer_len = get_unaligned_le24(sgl->length);

	/* no data command? */
	if (!rsp->req.transfer_len)
		return 0;

	ret = nvmet_req_alloc_sgl(&rsp->req);
	if (ret < 0)
		goto error_out;

	ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num,
			rsp->req.sg, rsp->req.sg_cnt, 0, addr, key,
			nvmet_data_dir(&rsp->req));
	if (ret < 0)
		goto error_out;
	rsp->n_rdma += ret;

	if (invalidate) {
		rsp->invalidate_rkey = key;
		rsp->flags |= NVMET_RDMA_REQ_INVALIDATE_RKEY;
	}

	return 0;

error_out:
	rsp->req.transfer_len = 0;
	return NVME_SC_INTERNAL;
}

static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp)
{
	struct nvme_keyed_sgl_desc *sgl = &rsp->req.cmd->common.dptr.ksgl;

	switch (sgl->type >> 4) {
	case NVME_SGL_FMT_DATA_DESC:
		switch (sgl->type & 0xf) {
		case NVME_SGL_FMT_OFFSET:
			return nvmet_rdma_map_sgl_inline(rsp);
		default:
			pr_err("invalid SGL subtype: %#x\n", sgl->type);
			return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
		}
	case NVME_KEY_SGL_FMT_DATA_DESC:
		switch (sgl->type & 0xf) {
		case NVME_SGL_FMT_ADDRESS | NVME_SGL_FMT_INVALIDATE:
			return nvmet_rdma_map_sgl_keyed(rsp, sgl, true);
		case NVME_SGL_FMT_ADDRESS:
			return nvmet_rdma_map_sgl_keyed(rsp, sgl, false);
		default:
			pr_err("invalid SGL subtype: %#x\n", sgl->type);
			return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
		}
	default:
		pr_err("invalid SGL type: %#x\n", sgl->type);
		return NVME_SC_SGL_INVALID_TYPE | NVME_SC_DNR;
	}
}

static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp)
{
	struct nvmet_rdma_queue *queue = rsp->queue;

	if (unlikely(atomic_sub_return(1 + rsp->n_rdma,
			&queue->sq_wr_avail) < 0)) {
		pr_debug("IB send queue full (needed %d): queue %u cntlid %u\n",
				1 + rsp->n_rdma, queue->idx,
				queue->nvme_sq.ctrl->cntlid);
		atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);
		return false;
	}

	if (nvmet_rdma_need_data_in(rsp)) {
		if (rdma_rw_ctx_post(&rsp->rw, queue->cm_id->qp,
				queue->cm_id->port_num, &rsp->read_cqe, NULL))
			nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR);
	} else {
		nvmet_req_execute(&rsp->req);
	}

	return true;
}

static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue,
		struct nvmet_rdma_rsp *cmd)
{
	u16 status;

	ib_dma_sync_single_for_cpu(queue->dev->device,
		cmd->cmd->sge[0].addr, cmd->cmd->sge[0].length,
		DMA_FROM_DEVICE);
	ib_dma_sync_single_for_cpu(queue->dev->device,
		cmd->send_sge.addr, cmd->send_sge.length,
		DMA_TO_DEVICE);

	cmd->req.p2p_client = &queue->dev->device->dev;

	if (!nvmet_req_init(&cmd->req, &queue->nvme_cq,
			&queue->nvme_sq, &nvmet_rdma_ops))
		return;

	status = nvmet_rdma_map_sgl(cmd);
	if (status)
		goto out_err;

	if (unlikely(!nvmet_rdma_execute_command(cmd))) {
		spin_lock(&queue->rsp_wr_wait_lock);
		list_add_tail(&cmd->wait_list, &queue->rsp_wr_wait_list);
		spin_unlock(&queue->rsp_wr_wait_lock);
	}

	return;

out_err:
	nvmet_req_complete(&cmd->req, status);
}
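/*
 * RECV completion: a new command capsule arrived.  Pair it with a response
 * context and handle it, or park it on rsp_wait_list while the queue is
 * still connecting.
 */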
static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct nvmet_rdma_cmd *cmd =
		container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe);
	struct nvmet_rdma_queue *queue = cq->cq_context;
	struct nvmet_rdma_rsp *rsp;

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		if (wc->status != IB_WC_WR_FLUSH_ERR) {
			pr_err("RECV for CQE 0x%p failed with status %s (%d)\n",
				wc->wr_cqe, ib_wc_status_msg(wc->status),
				wc->status);
			nvmet_rdma_error_comp(queue);
		}
		return;
	}

	if (unlikely(wc->byte_len < sizeof(struct nvme_command))) {
		pr_err("Ctrl Fatal Error: capsule size less than 64 bytes\n");
		nvmet_rdma_error_comp(queue);
		return;
	}

	cmd->queue = queue;
	rsp = nvmet_rdma_get_rsp(queue);
	if (unlikely(!rsp)) {
		/*
		 * we get here only under memory pressure,
		 * silently drop and have the host retry
		 * as we can't even fail it.
		 */
		nvmet_rdma_post_recv(queue->dev, cmd);
		return;
	}
	rsp->queue = queue;
	rsp->cmd = cmd;
	rsp->flags = 0;
	rsp->req.cmd = cmd->nvme_cmd;
	rsp->req.port = queue->port;
	rsp->n_rdma = 0;

	if (unlikely(queue->state != NVMET_RDMA_Q_LIVE)) {
		unsigned long flags;

		spin_lock_irqsave(&queue->state_lock, flags);
		if (queue->state == NVMET_RDMA_Q_CONNECTING)
			list_add_tail(&rsp->wait_list, &queue->rsp_wait_list);
		else
			nvmet_rdma_put_rsp(rsp);
		spin_unlock_irqrestore(&queue->state_lock, flags);
		return;
	}

	nvmet_rdma_handle_command(queue, rsp);
}

static void nvmet_rdma_destroy_srq(struct nvmet_rdma_device *ndev)
{
	if (!ndev->srq)
		return;

	nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
	ib_destroy_srq(ndev->srq);
}
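/*
 * Optionally use one shared receive queue (and one set of command buffers)
 * per device instead of per-queue RECV buffers.  If the HCA cannot create an
 * SRQ we fall back to per-queue receives.
 */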
static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev)
{
	struct ib_srq_init_attr srq_attr = { NULL, };
	struct ib_srq *srq;
	size_t srq_size;
	int ret, i;

	srq_size = 4095;	/* XXX: tune */

	srq_attr.attr.max_wr = srq_size;
	srq_attr.attr.max_sge = 1 + ndev->inline_page_count;
	srq_attr.attr.srq_limit = 0;
	srq_attr.srq_type = IB_SRQT_BASIC;
	srq = ib_create_srq(ndev->pd, &srq_attr);
	if (IS_ERR(srq)) {
		/*
		 * If SRQs aren't supported we just go ahead and use normal
		 * non-shared receive queues.
		 */
		pr_info("SRQ requested but not supported.\n");
		return 0;
	}

	ndev->srq_cmds = nvmet_rdma_alloc_cmds(ndev, srq_size, false);
	if (IS_ERR(ndev->srq_cmds)) {
		ret = PTR_ERR(ndev->srq_cmds);
		goto out_destroy_srq;
	}

	ndev->srq = srq;
	ndev->srq_size = srq_size;

	for (i = 0; i < srq_size; i++) {
		ret = nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]);
		if (ret)
			goto out_free_cmds;
	}

	return 0;

out_free_cmds:
	nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
out_destroy_srq:
	ib_destroy_srq(srq);
	return ret;
}

static void nvmet_rdma_free_dev(struct kref *ref)
{
	struct nvmet_rdma_device *ndev =
		container_of(ref, struct nvmet_rdma_device, ref);

	mutex_lock(&device_list_mutex);
	list_del(&ndev->entry);
	mutex_unlock(&device_list_mutex);

	nvmet_rdma_destroy_srq(ndev);
	ib_dealloc_pd(ndev->pd);

	kfree(ndev);
}

static struct nvmet_rdma_device *
nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id)
{
	struct nvmet_port *port = cm_id->context;
	struct nvmet_rdma_device *ndev;
	int inline_page_count;
	int inline_sge_count;
	int ret;

	mutex_lock(&device_list_mutex);
	list_for_each_entry(ndev, &device_list, entry) {
		if (ndev->device->node_guid == cm_id->device->node_guid &&
		    kref_get_unless_zero(&ndev->ref))
			goto out_unlock;
	}

	ndev = kzalloc(sizeof(*ndev), GFP_KERNEL);
	if (!ndev)
		goto out_err;

	inline_page_count = num_pages(port->inline_data_size);
	inline_sge_count = max(cm_id->device->attrs.max_sge_rd,
				cm_id->device->attrs.max_recv_sge) - 1;
	if (inline_page_count > inline_sge_count) {
		pr_warn("inline_data_size %d cannot be supported by device %s. Reducing to %lu.\n",
			port->inline_data_size, cm_id->device->name,
			inline_sge_count * PAGE_SIZE);
		port->inline_data_size = inline_sge_count * PAGE_SIZE;
		inline_page_count = inline_sge_count;
	}
	ndev->inline_data_size = port->inline_data_size;
	ndev->inline_page_count = inline_page_count;
	ndev->device = cm_id->device;
	kref_init(&ndev->ref);

	ndev->pd = ib_alloc_pd(ndev->device, 0);
	if (IS_ERR(ndev->pd))
		goto out_free_dev;

	if (nvmet_rdma_use_srq) {
		ret = nvmet_rdma_init_srq(ndev);
		if (ret)
			goto out_free_pd;
	}

	list_add(&ndev->entry, &device_list);
out_unlock:
	mutex_unlock(&device_list_mutex);
	pr_debug("added %s.\n", ndev->device->name);
	return ndev;

out_free_pd:
	ib_dealloc_pd(ndev->pd);
out_free_dev:
	kfree(ndev);
out_err:
	mutex_unlock(&device_list_mutex);
	return NULL;
}
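/*
 * Create the completion queue and RC queue pair backing an nvmet queue and
 * post the initial RECV buffers (unless the device-wide SRQ is used).
 */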
static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
{
	struct ib_qp_init_attr qp_attr;
	struct nvmet_rdma_device *ndev = queue->dev;
	int comp_vector, nr_cqe, ret, i;

	/*
	 * Spread the io queues across completion vectors,
	 * but still keep all admin queues on vector 0.
	 */
	comp_vector = !queue->host_qid ? 0 :
		queue->idx % ndev->device->num_comp_vectors;

	/*
	 * Reserve CQ slots for RECV + RDMA_READ/RDMA_WRITE + RDMA_SEND.
	 */
	nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size;

	queue->cq = ib_alloc_cq(ndev->device, queue,
			nr_cqe + 1, comp_vector,
			IB_POLL_WORKQUEUE);
	if (IS_ERR(queue->cq)) {
		ret = PTR_ERR(queue->cq);
		pr_err("failed to create CQ cqe= %d ret= %d\n",
		       nr_cqe + 1, ret);
		goto out;
	}

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_context = queue;
	qp_attr.event_handler = nvmet_rdma_qp_event;
	qp_attr.send_cq = queue->cq;
	qp_attr.recv_cq = queue->cq;
	qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	qp_attr.qp_type = IB_QPT_RC;
	/* +1 for drain */
	qp_attr.cap.max_send_wr = queue->send_queue_size + 1;
	qp_attr.cap.max_rdma_ctxs = queue->send_queue_size;
	qp_attr.cap.max_send_sge = max(ndev->device->attrs.max_sge_rd,
					ndev->device->attrs.max_send_sge);

	if (ndev->srq) {
		qp_attr.srq = ndev->srq;
	} else {
		/* +1 for drain */
		qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size;
		qp_attr.cap.max_recv_sge = 1 + ndev->inline_page_count;
	}

	ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr);
	if (ret) {
		pr_err("failed to create_qp ret= %d\n", ret);
		goto err_destroy_cq;
	}

	atomic_set(&queue->sq_wr_avail, qp_attr.cap.max_send_wr);

	pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d cm_id= %p\n",
		 __func__, queue->cq->cqe, qp_attr.cap.max_send_sge,
		 qp_attr.cap.max_send_wr, queue->cm_id);

	if (!ndev->srq) {
		for (i = 0; i < queue->recv_queue_size; i++) {
			queue->cmds[i].queue = queue;
			ret = nvmet_rdma_post_recv(ndev, &queue->cmds[i]);
			if (ret)
				goto err_destroy_qp;
		}
	}

out:
	return ret;

err_destroy_qp:
	rdma_destroy_qp(queue->cm_id);
err_destroy_cq:
	ib_free_cq(queue->cq);
	goto out;
}

static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
{
	struct ib_qp *qp = queue->cm_id->qp;

	ib_drain_qp(qp);
	rdma_destroy_id(queue->cm_id);
	ib_destroy_qp(qp);
	ib_free_cq(queue->cq);
}

static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
{
	pr_debug("freeing queue %d\n", queue->idx);

	nvmet_sq_destroy(&queue->nvme_sq);

	nvmet_rdma_destroy_queue_ib(queue);
	if (!queue->dev->srq) {
		nvmet_rdma_free_cmds(queue->dev, queue->cmds,
				queue->recv_queue_size,
				!queue->host_qid);
	}
	nvmet_rdma_free_rsps(queue);
	ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
	kfree(queue);
}

static void nvmet_rdma_release_queue_work(struct work_struct *w)
{
	struct nvmet_rdma_queue *queue =
		container_of(w, struct nvmet_rdma_queue, release_work);
	struct nvmet_rdma_device *dev = queue->dev;

	nvmet_rdma_free_queue(queue);

	kref_put(&dev->ref, nvmet_rdma_free_dev);
}
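/*
 * The private data of the RDMA CM connect request carries the NVMe/RDMA
 * connect format: queue ID plus the host's submission/receive queue sizes,
 * which determine our RECV and SEND queue depths.
 */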
static int
nvmet_rdma_parse_cm_connect_req(struct rdma_conn_param *conn,
				struct nvmet_rdma_queue *queue)
{
	struct nvme_rdma_cm_req *req;

	req = (struct nvme_rdma_cm_req *)conn->private_data;
	if (!req || conn->private_data_len == 0)
		return NVME_RDMA_CM_INVALID_LEN;

	if (le16_to_cpu(req->recfmt) != NVME_RDMA_CM_FMT_1_0)
		return NVME_RDMA_CM_INVALID_RECFMT;

	queue->host_qid = le16_to_cpu(req->qid);

	/*
	 * req->hsqsize corresponds to our recv queue size plus 1
	 * req->hrqsize corresponds to our send queue size
	 */
	queue->recv_queue_size = le16_to_cpu(req->hsqsize) + 1;
	queue->send_queue_size = le16_to_cpu(req->hrqsize);

	if (!queue->host_qid && queue->recv_queue_size > NVME_AQ_DEPTH)
		return NVME_RDMA_CM_INVALID_HSQSIZE;

	/* XXX: Should we enforce some kind of max for IO queues? */

	return 0;
}

static int nvmet_rdma_cm_reject(struct rdma_cm_id *cm_id,
				enum nvme_rdma_cm_status status)
{
	struct nvme_rdma_cm_rej rej;

	pr_debug("rejecting connect request: status %d (%s)\n",
		 status, nvme_rdma_cm_msg(status));

	rej.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
	rej.sts = cpu_to_le16(status);

	return rdma_reject(cm_id, (void *)&rej, sizeof(rej));
}

static struct nvmet_rdma_queue *
nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev,
		struct rdma_cm_id *cm_id,
		struct rdma_cm_event *event)
{
	struct nvmet_rdma_queue *queue;
	int ret;

	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
	if (!queue) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_reject;
	}

	ret = nvmet_sq_init(&queue->nvme_sq);
	if (ret) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_free_queue;
	}

	ret = nvmet_rdma_parse_cm_connect_req(&event->param.conn, queue);
	if (ret)
		goto out_destroy_sq;

	/*
	 * Schedules the actual release because calling rdma_destroy_id from
	 * inside a CM callback would trigger a deadlock. (great API design..)
	 */
	INIT_WORK(&queue->release_work, nvmet_rdma_release_queue_work);
	queue->dev = ndev;
	queue->cm_id = cm_id;

	spin_lock_init(&queue->state_lock);
	queue->state = NVMET_RDMA_Q_CONNECTING;
	INIT_LIST_HEAD(&queue->rsp_wait_list);
	INIT_LIST_HEAD(&queue->rsp_wr_wait_list);
	spin_lock_init(&queue->rsp_wr_wait_lock);
	INIT_LIST_HEAD(&queue->free_rsps);
	spin_lock_init(&queue->rsps_lock);
	INIT_LIST_HEAD(&queue->queue_list);

	queue->idx = ida_simple_get(&nvmet_rdma_queue_ida, 0, 0, GFP_KERNEL);
	if (queue->idx < 0) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_destroy_sq;
	}

	ret = nvmet_rdma_alloc_rsps(queue);
	if (ret) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_ida_remove;
	}

	if (!ndev->srq) {
		queue->cmds = nvmet_rdma_alloc_cmds(ndev,
				queue->recv_queue_size,
				!queue->host_qid);
		if (IS_ERR(queue->cmds)) {
			ret = NVME_RDMA_CM_NO_RSC;
			goto out_free_responses;
		}
	}

	ret = nvmet_rdma_create_queue_ib(queue);
	if (ret) {
		pr_err("%s: creating RDMA queue failed (%d).\n",
			__func__, ret);
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_free_cmds;
	}

	return queue;

out_free_cmds:
	if (!ndev->srq) {
		nvmet_rdma_free_cmds(queue->dev, queue->cmds,
				queue->recv_queue_size,
				!queue->host_qid);
	}
out_free_responses:
	nvmet_rdma_free_rsps(queue);
out_ida_remove:
	ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
out_destroy_sq:
	nvmet_sq_destroy(&queue->nvme_sq);
out_free_queue:
	kfree(queue);
out_reject:
	nvmet_rdma_cm_reject(cm_id, ret);
	return NULL;
}

static void nvmet_rdma_qp_event(struct ib_event *event, void *priv)
{
	struct nvmet_rdma_queue *queue = priv;

	switch (event->event) {
	case IB_EVENT_COMM_EST:
		rdma_notify(queue->cm_id, event->event);
		break;
	default:
		pr_err("received IB QP event: %s (%d)\n",
		       ib_event_msg(event->event), event->event);
		break;
	}
}
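/*
 * Accept the connection, replying with record format 1.0 and our RECV queue
 * depth (crqsize) in the private data.
 */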
static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id,
		struct nvmet_rdma_queue *queue,
		struct rdma_conn_param *p)
{
	struct rdma_conn_param param = { };
	struct nvme_rdma_cm_rep priv = { };
	int ret = -ENOMEM;

	param.rnr_retry_count = 7;
	param.flow_control = 1;
	param.initiator_depth = min_t(u8, p->initiator_depth,
		queue->dev->device->attrs.max_qp_init_rd_atom);
	param.private_data = &priv;
	param.private_data_len = sizeof(priv);
	priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
	priv.crqsize = cpu_to_le16(queue->recv_queue_size);

	ret = rdma_accept(cm_id, &param);
	if (ret)
		pr_err("rdma_accept failed (error code = %d)\n", ret);

	return ret;
}

static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
		struct rdma_cm_event *event)
{
	struct nvmet_rdma_device *ndev;
	struct nvmet_rdma_queue *queue;
	int ret = -EINVAL;

	ndev = nvmet_rdma_find_get_device(cm_id);
	if (!ndev) {
		nvmet_rdma_cm_reject(cm_id, NVME_RDMA_CM_NO_RSC);
		return -ECONNREFUSED;
	}

	queue = nvmet_rdma_alloc_queue(ndev, cm_id, event);
	if (!queue) {
		ret = -ENOMEM;
		goto put_device;
	}
	queue->port = cm_id->context;

	if (queue->host_qid == 0) {
		/* Let inflight controller teardown complete */
		flush_workqueue(nvmet_rdma_delete_wq);
	}

	ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn);
	if (ret) {
		queue_work(nvmet_rdma_delete_wq, &queue->release_work);
		/* Destroying rdma_cm id is not needed here */
		return 0;
	}

	mutex_lock(&nvmet_rdma_queue_mutex);
	list_add_tail(&queue->queue_list, &nvmet_rdma_queue_list);
	mutex_unlock(&nvmet_rdma_queue_mutex);

	return 0;

put_device:
	kref_put(&ndev->ref, nvmet_rdma_free_dev);

	return ret;
}

static void nvmet_rdma_queue_established(struct nvmet_rdma_queue *queue)
{
	unsigned long flags;

	spin_lock_irqsave(&queue->state_lock, flags);
	if (queue->state != NVMET_RDMA_Q_CONNECTING) {
		pr_warn("trying to establish a connected queue\n");
		goto out_unlock;
	}
	queue->state = NVMET_RDMA_Q_LIVE;

	while (!list_empty(&queue->rsp_wait_list)) {
		struct nvmet_rdma_rsp *cmd;

		cmd = list_first_entry(&queue->rsp_wait_list,
					struct nvmet_rdma_rsp, wait_list);
		list_del(&cmd->wait_list);

		spin_unlock_irqrestore(&queue->state_lock, flags);
		nvmet_rdma_handle_command(queue, cmd);
		spin_lock_irqsave(&queue->state_lock, flags);
	}

out_unlock:
	spin_unlock_irqrestore(&queue->state_lock, flags);
}

static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
{
	bool disconnect = false;
	unsigned long flags;

	pr_debug("cm_id= %p queue->state= %d\n", queue->cm_id, queue->state);

	spin_lock_irqsave(&queue->state_lock, flags);
	switch (queue->state) {
	case NVMET_RDMA_Q_CONNECTING:
	case NVMET_RDMA_Q_LIVE:
		queue->state = NVMET_RDMA_Q_DISCONNECTING;
		disconnect = true;
		break;
	case NVMET_RDMA_Q_DISCONNECTING:
		break;
	}
	spin_unlock_irqrestore(&queue->state_lock, flags);

	if (disconnect) {
		rdma_disconnect(queue->cm_id);
		queue_work(nvmet_rdma_delete_wq, &queue->release_work);
	}
}
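/*
 * Take the queue off the global queue list first so we cannot race with
 * delete_ctrl or device removal, then tear the connection down.
 */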
static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
{
	bool disconnect = false;

	mutex_lock(&nvmet_rdma_queue_mutex);
	if (!list_empty(&queue->queue_list)) {
		list_del_init(&queue->queue_list);
		disconnect = true;
	}
	mutex_unlock(&nvmet_rdma_queue_mutex);

	if (disconnect)
		__nvmet_rdma_queue_disconnect(queue);
}

static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id,
		struct nvmet_rdma_queue *queue)
{
	WARN_ON_ONCE(queue->state != NVMET_RDMA_Q_CONNECTING);

	mutex_lock(&nvmet_rdma_queue_mutex);
	if (!list_empty(&queue->queue_list))
		list_del_init(&queue->queue_list);
	mutex_unlock(&nvmet_rdma_queue_mutex);

	pr_err("failed to connect queue %d\n", queue->idx);
	queue_work(nvmet_rdma_delete_wq, &queue->release_work);
}

/**
 * nvmet_rdma_device_removal() - Handle RDMA device removal
 * @cm_id:	rdma_cm id, used for nvmet port
 * @queue:	nvmet rdma queue (cm id qp_context)
 *
 * DEVICE_REMOVAL event notifies us that the RDMA device is about
 * to unplug. Note that this event can be generated on a normal
 * queue cm_id and/or a device-bound listener cm_id (in which case
 * queue will be NULL).
 *
 * We registered an ib_client to handle device removal for queues,
 * so we only need to handle the listening port cm_ids. In this case
 * we nullify the priv to prevent double cm_id destruction and destroy
 * the cm_id implicitly by returning a non-zero rc to the callout.
 */
static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id,
		struct nvmet_rdma_queue *queue)
{
	struct nvmet_port *port;

	if (queue) {
		/*
		 * This is a queue cm_id. We have registered
		 * an ib_client to handle queue removal,
		 * so don't interfere and just return.
		 */
		return 0;
	}

	port = cm_id->context;

	/*
	 * This is a listener cm_id. Make sure that
	 * future remove_port won't invoke a double
	 * cm_id destroy. Use atomic xchg to make sure
	 * we don't compete with remove_port.
	 */
	if (xchg(&port->priv, NULL) != cm_id)
		return 0;

	/*
	 * We need to return 1 so that the core will destroy
	 * its own ID. What a great API design..
	 */
	return 1;
}
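/*
 * Single RDMA CM event handler for both the listener cm_id and per-queue
 * cm_ids.  A non-zero return value tells the CM core to destroy the cm_id.
 */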
static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id,
		struct rdma_cm_event *event)
{
	struct nvmet_rdma_queue *queue = NULL;
	int ret = 0;

	if (cm_id->qp)
		queue = cm_id->qp->qp_context;

	pr_debug("%s (%d): status %d id %p\n",
		rdma_event_msg(event->event), event->event,
		event->status, cm_id);

	switch (event->event) {
	case RDMA_CM_EVENT_CONNECT_REQUEST:
		ret = nvmet_rdma_queue_connect(cm_id, event);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		nvmet_rdma_queue_established(queue);
		break;
	case RDMA_CM_EVENT_ADDR_CHANGE:
	case RDMA_CM_EVENT_DISCONNECTED:
	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
		nvmet_rdma_queue_disconnect(queue);
		break;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		ret = nvmet_rdma_device_removal(cm_id, queue);
		break;
	case RDMA_CM_EVENT_REJECTED:
		pr_debug("Connection rejected: %s\n",
			 rdma_reject_msg(cm_id, event->status));
		/* FALLTHROUGH */
	case RDMA_CM_EVENT_UNREACHABLE:
	case RDMA_CM_EVENT_CONNECT_ERROR:
		nvmet_rdma_queue_connect_fail(cm_id, queue);
		break;
	default:
		pr_err("received unrecognized RDMA CM event %d\n",
			event->event);
		break;
	}

	return ret;
}

static void nvmet_rdma_delete_ctrl(struct nvmet_ctrl *ctrl)
{
	struct nvmet_rdma_queue *queue;

restart:
	mutex_lock(&nvmet_rdma_queue_mutex);
	list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) {
		if (queue->nvme_sq.ctrl == ctrl) {
			list_del_init(&queue->queue_list);
			mutex_unlock(&nvmet_rdma_queue_mutex);

			__nvmet_rdma_queue_disconnect(queue);
			goto restart;
		}
	}
	mutex_unlock(&nvmet_rdma_queue_mutex);
}
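/*
 * Bind a listening RDMA CM ID to the configured transport address and start
 * accepting NVMe/RDMA connect requests for this port.
 */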
static int nvmet_rdma_add_port(struct nvmet_port *port)
{
	struct rdma_cm_id *cm_id;
	struct sockaddr_storage addr = { };
	__kernel_sa_family_t af;
	int ret;

	switch (port->disc_addr.adrfam) {
	case NVMF_ADDR_FAMILY_IP4:
		af = AF_INET;
		break;
	case NVMF_ADDR_FAMILY_IP6:
		af = AF_INET6;
		break;
	default:
		pr_err("address family %d not supported\n",
				port->disc_addr.adrfam);
		return -EINVAL;
	}

	if (port->inline_data_size < 0) {
		port->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE;
	} else if (port->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) {
		pr_warn("inline_data_size %u is too large, reducing to %u\n",
			port->inline_data_size,
			NVMET_RDMA_MAX_INLINE_DATA_SIZE);
		port->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE;
	}

	ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr,
			port->disc_addr.trsvcid, &addr);
	if (ret) {
		pr_err("malformed ip/port passed: %s:%s\n",
			port->disc_addr.traddr, port->disc_addr.trsvcid);
		return ret;
	}

	cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port,
			RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(cm_id)) {
		pr_err("CM ID creation failed\n");
		return PTR_ERR(cm_id);
	}

	/*
	 * Allow both IPv4 and IPv6 sockets to bind a single port
	 * at the same time.
	 */
	ret = rdma_set_afonly(cm_id, 1);
	if (ret) {
		pr_err("rdma_set_afonly failed (%d)\n", ret);
		goto out_destroy_id;
	}

	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr);
	if (ret) {
		pr_err("binding CM ID to %pISpcs failed (%d)\n",
			(struct sockaddr *)&addr, ret);
		goto out_destroy_id;
	}

	ret = rdma_listen(cm_id, 128);
	if (ret) {
		pr_err("listening to %pISpcs failed (%d)\n",
			(struct sockaddr *)&addr, ret);
		goto out_destroy_id;
	}

	pr_info("enabling port %d (%pISpcs)\n",
		le16_to_cpu(port->disc_addr.portid), (struct sockaddr *)&addr);
	port->priv = cm_id;
	return 0;

out_destroy_id:
	rdma_destroy_id(cm_id);
	return ret;
}

static void nvmet_rdma_remove_port(struct nvmet_port *port)
{
	struct rdma_cm_id *cm_id = xchg(&port->priv, NULL);

	if (cm_id)
		rdma_destroy_id(cm_id);
}

static void nvmet_rdma_disc_port_addr(struct nvmet_req *req,
		struct nvmet_port *port, char *traddr)
{
	struct rdma_cm_id *cm_id = port->priv;

	if (inet_addr_is_any((struct sockaddr *)&cm_id->route.addr.src_addr)) {
		struct nvmet_rdma_rsp *rsp =
			container_of(req, struct nvmet_rdma_rsp, req);
		struct rdma_cm_id *req_cm_id = rsp->queue->cm_id;
		struct sockaddr *addr = (void *)&req_cm_id->route.addr.src_addr;

		sprintf(traddr, "%pISc", addr);
	} else {
		memcpy(traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE);
	}
}
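/* Transport ops registered with the nvmet core for NVMF_TRTYPE_RDMA. */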
static const struct nvmet_fabrics_ops nvmet_rdma_ops = {
	.owner			= THIS_MODULE,
	.type			= NVMF_TRTYPE_RDMA,
	.msdbd			= 1,
	.has_keyed_sgls		= 1,
	.add_port		= nvmet_rdma_add_port,
	.remove_port		= nvmet_rdma_remove_port,
	.queue_response		= nvmet_rdma_queue_response,
	.delete_ctrl		= nvmet_rdma_delete_ctrl,
	.disc_traddr		= nvmet_rdma_disc_port_addr,
};

static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data)
{
	struct nvmet_rdma_queue *queue, *tmp;
	struct nvmet_rdma_device *ndev;
	bool found = false;

	mutex_lock(&device_list_mutex);
	list_for_each_entry(ndev, &device_list, entry) {
		if (ndev->device == ib_device) {
			found = true;
			break;
		}
	}
	mutex_unlock(&device_list_mutex);

	if (!found)
		return;

	/*
	 * IB Device that is used by nvmet controllers is being removed,
	 * delete all queues using this device.
	 */
	mutex_lock(&nvmet_rdma_queue_mutex);
	list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list,
				 queue_list) {
		if (queue->dev->device != ib_device)
			continue;

		pr_info("Removing queue %d\n", queue->idx);
		list_del_init(&queue->queue_list);
		__nvmet_rdma_queue_disconnect(queue);
	}
	mutex_unlock(&nvmet_rdma_queue_mutex);

	flush_scheduled_work();
}

static struct ib_client nvmet_rdma_ib_client = {
	.name	= "nvmet_rdma",
	.remove = nvmet_rdma_remove_one
};

static int __init nvmet_rdma_init(void)
{
	int ret;

	ret = ib_register_client(&nvmet_rdma_ib_client);
	if (ret)
		return ret;

	ret = nvmet_register_transport(&nvmet_rdma_ops);
	if (ret)
		goto err_ib_client;

	nvmet_rdma_delete_wq = alloc_workqueue("nvmet-rdma-delete-wq",
			WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
	if (!nvmet_rdma_delete_wq) {
		ret = -ENOMEM;
		goto err_unreg_transport;
	}

	return 0;

err_unreg_transport:
	nvmet_unregister_transport(&nvmet_rdma_ops);
err_ib_client:
	ib_unregister_client(&nvmet_rdma_ib_client);
	return ret;
}

static void __exit nvmet_rdma_exit(void)
{
	destroy_workqueue(nvmet_rdma_delete_wq);
	nvmet_unregister_transport(&nvmet_rdma_ops);
	ib_unregister_client(&nvmet_rdma_ib_client);
	WARN_ON_ONCE(!list_empty(&nvmet_rdma_queue_list));
	ida_destroy(&nvmet_rdma_queue_ida);
}

module_init(nvmet_rdma_init);
module_exit(nvmet_rdma_exit);

MODULE_LICENSE("GPL v2");
MODULE_ALIAS("nvmet-transport-1"); /* 1 == NVMF_TRTYPE_RDMA */