/*
 * NVMe over Fabrics RDMA target.
 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/delay.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/nvme.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/wait.h>
#include <linux/inet.h>
#include <asm/unaligned.h>

#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
#include <rdma/rw.h>

#include <linux/nvme-rdma.h>
#include "nvmet.h"

/*
 * We allow at least 1 page, up to 4 SGEs, and up to 16KB of inline data
 */
#define NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE	PAGE_SIZE
#define NVMET_RDMA_MAX_INLINE_SGE		4
#define NVMET_RDMA_MAX_INLINE_DATA_SIZE		max_t(int, SZ_16K, PAGE_SIZE)

struct nvmet_rdma_cmd {
	struct ib_sge sge[NVMET_RDMA_MAX_INLINE_SGE + 1];
	struct ib_cqe cqe;
	struct ib_recv_wr wr;
	struct scatterlist inline_sg[NVMET_RDMA_MAX_INLINE_SGE];
	struct nvme_command *nvme_cmd;
	struct nvmet_rdma_queue *queue;
};

enum {
	NVMET_RDMA_REQ_INLINE_DATA	= (1 << 0),
	NVMET_RDMA_REQ_INVALIDATE_RKEY	= (1 << 1),
};

struct nvmet_rdma_rsp {
	struct ib_sge send_sge;
	struct ib_cqe send_cqe;
	struct ib_send_wr send_wr;

	struct nvmet_rdma_cmd *cmd;
	struct nvmet_rdma_queue *queue;

	struct ib_cqe read_cqe;
	struct rdma_rw_ctx rw;

	struct nvmet_req req;

	bool allocated;
	u8 n_rdma;
	u32 flags;
	u32 invalidate_rkey;

	struct list_head wait_list;
	struct list_head free_list;
};

enum nvmet_rdma_queue_state {
	NVMET_RDMA_Q_CONNECTING,
	NVMET_RDMA_Q_LIVE,
	NVMET_RDMA_Q_DISCONNECTING,
};

struct nvmet_rdma_queue {
	struct rdma_cm_id *cm_id;
	struct nvmet_port *port;
	struct ib_cq *cq;
	atomic_t sq_wr_avail;
	struct nvmet_rdma_device *dev;
	spinlock_t state_lock;
	enum nvmet_rdma_queue_state state;
	struct nvmet_cq nvme_cq;
	struct nvmet_sq nvme_sq;

	struct nvmet_rdma_rsp *rsps;
	struct list_head free_rsps;
	spinlock_t rsps_lock;
	struct nvmet_rdma_cmd *cmds;

	struct work_struct release_work;
	struct list_head rsp_wait_list;
	struct list_head rsp_wr_wait_list;
	spinlock_t rsp_wr_wait_lock;

	int idx;
	int host_qid;
	int recv_queue_size;
	int send_queue_size;

	struct list_head queue_list;
};

struct nvmet_rdma_device {
	struct ib_device *device;
	struct ib_pd *pd;
	struct ib_srq *srq;
	struct nvmet_rdma_cmd *srq_cmds;
	size_t srq_size;
	struct kref ref;
	struct list_head entry;
	int inline_data_size;
	int inline_page_count;
};

static bool nvmet_rdma_use_srq;
module_param_named(use_srq, nvmet_rdma_use_srq, bool, 0444);
MODULE_PARM_DESC(use_srq, "Use shared receive queue.");

static DEFINE_IDA(nvmet_rdma_queue_ida);
static LIST_HEAD(nvmet_rdma_queue_list);
static DEFINE_MUTEX(nvmet_rdma_queue_mutex);
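
/*
 * One nvmet_rdma_device is cached per IB device (matched by node GUID) and
 * shared by every queue and port using that device.  Entries are reference
 * counted and protected by device_list_mutex.
 */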
static LIST_HEAD(device_list);
static DEFINE_MUTEX(device_list_mutex);

static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp);
static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_qp_event(struct ib_event *event, void *priv);
static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue);
static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_rsp *r);
static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_rsp *r);

static const struct nvmet_fabrics_ops nvmet_rdma_ops;

static int num_pages(int len)
{
	return 1 + (((len - 1) & PAGE_MASK) >> PAGE_SHIFT);
}

/* XXX: really should move to a generic header sooner or later.. */
static inline u32 get_unaligned_le24(const u8 *p)
{
	return (u32)p[0] | (u32)p[1] << 8 | (u32)p[2] << 16;
}

static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp)
{
	return nvme_is_write(rsp->req.cmd) &&
		rsp->req.transfer_len &&
		!(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
}

static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp)
{
	return !nvme_is_write(rsp->req.cmd) &&
		rsp->req.transfer_len &&
		!rsp->req.rsp->status &&
		!(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
}

static inline struct nvmet_rdma_rsp *
nvmet_rdma_get_rsp(struct nvmet_rdma_queue *queue)
{
	struct nvmet_rdma_rsp *rsp;
	unsigned long flags;

	spin_lock_irqsave(&queue->rsps_lock, flags);
	rsp = list_first_entry_or_null(&queue->free_rsps,
			struct nvmet_rdma_rsp, free_list);
	if (likely(rsp))
		list_del(&rsp->free_list);
	spin_unlock_irqrestore(&queue->rsps_lock, flags);

	if (unlikely(!rsp)) {
		int ret;

		rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
		if (unlikely(!rsp))
			return NULL;
		ret = nvmet_rdma_alloc_rsp(queue->dev, rsp);
		if (unlikely(ret)) {
			kfree(rsp);
			return NULL;
		}

		rsp->allocated = true;
	}

	return rsp;
}

static inline void
nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
{
	unsigned long flags;

	if (unlikely(rsp->allocated)) {
		nvmet_rdma_free_rsp(rsp->queue->dev, rsp);
		kfree(rsp);
		return;
	}

	spin_lock_irqsave(&rsp->queue->rsps_lock, flags);
	list_add_tail(&rsp->free_list, &rsp->queue->free_rsps);
	spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags);
}

static void nvmet_rdma_free_inline_pages(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_cmd *c)
{
	struct scatterlist *sg;
	struct ib_sge *sge;
	int i;

	if (!ndev->inline_data_size)
		return;

	sg = c->inline_sg;
	sge = &c->sge[1];

	for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
		if (sge->length)
			ib_dma_unmap_page(ndev->device, sge->addr,
					sge->length, DMA_FROM_DEVICE);
		if (sg_page(sg))
			__free_page(sg_page(sg));
	}
}
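
/*
 * Allocate and DMA-map the per-command inline data pages.  The pages are
 * mapped DMA_FROM_DEVICE and described by c->sge[1..inline_page_count] so
 * that a single RECV can carry the command capsule plus in-capsule data.
 */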
static int nvmet_rdma_alloc_inline_pages(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_cmd *c)
{
	struct scatterlist *sg;
	struct ib_sge *sge;
	struct page *pg;
	int len;
	int i;

	if (!ndev->inline_data_size)
		return 0;

	sg = c->inline_sg;
	sg_init_table(sg, ndev->inline_page_count);
	sge = &c->sge[1];
	len = ndev->inline_data_size;

	for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
		pg = alloc_page(GFP_KERNEL);
		if (!pg)
			goto out_err;
		sg_assign_page(sg, pg);
		sge->addr = ib_dma_map_page(ndev->device,
			pg, 0, PAGE_SIZE, DMA_FROM_DEVICE);
		if (ib_dma_mapping_error(ndev->device, sge->addr))
			goto out_err;
		sge->length = min_t(int, len, PAGE_SIZE);
		sge->lkey = ndev->pd->local_dma_lkey;
		len -= sge->length;
	}

	return 0;
out_err:
	for (; i >= 0; i--, sg--, sge--) {
		if (sge->length)
			ib_dma_unmap_page(ndev->device, sge->addr,
					sge->length, DMA_FROM_DEVICE);
		if (sg_page(sg))
			__free_page(sg_page(sg));
	}
	return -ENOMEM;
}

static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev,
			struct nvmet_rdma_cmd *c, bool admin)
{
	/* NVMe command / RDMA RECV */
	c->nvme_cmd = kmalloc(sizeof(*c->nvme_cmd), GFP_KERNEL);
	if (!c->nvme_cmd)
		goto out;

	c->sge[0].addr = ib_dma_map_single(ndev->device, c->nvme_cmd,
			sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
	if (ib_dma_mapping_error(ndev->device, c->sge[0].addr))
		goto out_free_cmd;

	c->sge[0].length = sizeof(*c->nvme_cmd);
	c->sge[0].lkey = ndev->pd->local_dma_lkey;

	if (!admin && nvmet_rdma_alloc_inline_pages(ndev, c))
		goto out_unmap_cmd;

	c->cqe.done = nvmet_rdma_recv_done;

	c->wr.wr_cqe = &c->cqe;
	c->wr.sg_list = c->sge;
	c->wr.num_sge = admin ? 1 : ndev->inline_page_count + 1;

	return 0;

out_unmap_cmd:
	ib_dma_unmap_single(ndev->device, c->sge[0].addr,
			sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
out_free_cmd:
	kfree(c->nvme_cmd);

out:
	return -ENOMEM;
}

static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_cmd *c, bool admin)
{
	if (!admin)
		nvmet_rdma_free_inline_pages(ndev, c);
	ib_dma_unmap_single(ndev->device, c->sge[0].addr,
				sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
	kfree(c->nvme_cmd);
}

static struct nvmet_rdma_cmd *
nvmet_rdma_alloc_cmds(struct nvmet_rdma_device *ndev,
		int nr_cmds, bool admin)
{
	struct nvmet_rdma_cmd *cmds;
	int ret = -EINVAL, i;

	cmds = kcalloc(nr_cmds, sizeof(struct nvmet_rdma_cmd), GFP_KERNEL);
	if (!cmds)
		goto out;

	for (i = 0; i < nr_cmds; i++) {
		ret = nvmet_rdma_alloc_cmd(ndev, cmds + i, admin);
		if (ret)
			goto out_free;
	}

	return cmds;

out_free:
	while (--i >= 0)
		nvmet_rdma_free_cmd(ndev, cmds + i, admin);
	kfree(cmds);
out:
	return ERR_PTR(ret);
}

static void nvmet_rdma_free_cmds(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_cmd *cmds, int nr_cmds, bool admin)
{
	int i;

	for (i = 0; i < nr_cmds; i++)
		nvmet_rdma_free_cmd(ndev, cmds + i, admin);
	kfree(cmds);
}
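
/*
 * Each response owns a DMA-mapped NVMe completion entry used for the RDMA
 * SEND, plus a separate completion handler (read_cqe) that signals the end
 * of RDMA READs issued for host-to-target data transfers.
 */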
static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_rsp *r)
{
	/* NVMe CQE / RDMA SEND */
	r->req.rsp = kmalloc(sizeof(*r->req.rsp), GFP_KERNEL);
	if (!r->req.rsp)
		goto out;

	r->send_sge.addr = ib_dma_map_single(ndev->device, r->req.rsp,
			sizeof(*r->req.rsp), DMA_TO_DEVICE);
	if (ib_dma_mapping_error(ndev->device, r->send_sge.addr))
		goto out_free_rsp;

	r->send_sge.length = sizeof(*r->req.rsp);
	r->send_sge.lkey = ndev->pd->local_dma_lkey;

	r->send_cqe.done = nvmet_rdma_send_done;

	r->send_wr.wr_cqe = &r->send_cqe;
	r->send_wr.sg_list = &r->send_sge;
	r->send_wr.num_sge = 1;
	r->send_wr.send_flags = IB_SEND_SIGNALED;

	/* Data In / RDMA READ */
	r->read_cqe.done = nvmet_rdma_read_data_done;
	return 0;

out_free_rsp:
	kfree(r->req.rsp);
out:
	return -ENOMEM;
}

static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_rsp *r)
{
	ib_dma_unmap_single(ndev->device, r->send_sge.addr,
				sizeof(*r->req.rsp), DMA_TO_DEVICE);
	kfree(r->req.rsp);
}

static int
nvmet_rdma_alloc_rsps(struct nvmet_rdma_queue *queue)
{
	struct nvmet_rdma_device *ndev = queue->dev;
	int nr_rsps = queue->recv_queue_size * 2;
	int ret = -EINVAL, i;

	queue->rsps = kcalloc(nr_rsps, sizeof(struct nvmet_rdma_rsp),
			GFP_KERNEL);
	if (!queue->rsps)
		goto out;

	for (i = 0; i < nr_rsps; i++) {
		struct nvmet_rdma_rsp *rsp = &queue->rsps[i];

		ret = nvmet_rdma_alloc_rsp(ndev, rsp);
		if (ret)
			goto out_free;

		list_add_tail(&rsp->free_list, &queue->free_rsps);
	}

	return 0;

out_free:
	while (--i >= 0) {
		struct nvmet_rdma_rsp *rsp = &queue->rsps[i];

		list_del(&rsp->free_list);
		nvmet_rdma_free_rsp(ndev, rsp);
	}
	kfree(queue->rsps);
out:
	return ret;
}

static void nvmet_rdma_free_rsps(struct nvmet_rdma_queue *queue)
{
	struct nvmet_rdma_device *ndev = queue->dev;
	int i, nr_rsps = queue->recv_queue_size * 2;

	for (i = 0; i < nr_rsps; i++) {
		struct nvmet_rdma_rsp *rsp = &queue->rsps[i];

		list_del(&rsp->free_list);
		nvmet_rdma_free_rsp(ndev, rsp);
	}
	kfree(queue->rsps);
}

static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_cmd *cmd)
{
	int ret;

	ib_dma_sync_single_for_device(ndev->device,
		cmd->sge[0].addr, cmd->sge[0].length,
		DMA_FROM_DEVICE);

	if (ndev->srq)
		ret = ib_post_srq_recv(ndev->srq, &cmd->wr, NULL);
	else
		ret = ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, NULL);

	if (unlikely(ret))
		pr_err("post_recv cmd failed\n");

	return ret;
}

static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue)
{
	spin_lock(&queue->rsp_wr_wait_lock);
	while (!list_empty(&queue->rsp_wr_wait_list)) {
		struct nvmet_rdma_rsp *rsp;
		bool ret;

		rsp = list_entry(queue->rsp_wr_wait_list.next,
				struct nvmet_rdma_rsp, wait_list);
		list_del(&rsp->wait_list);

		spin_unlock(&queue->rsp_wr_wait_lock);
		ret = nvmet_rdma_execute_command(rsp);
		spin_lock(&queue->rsp_wr_wait_lock);

		if (!ret) {
			list_add(&rsp->wait_list, &queue->rsp_wr_wait_list);
			break;
		}
	}
	spin_unlock(&queue->rsp_wr_wait_lock);
}


static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
{
	struct nvmet_rdma_queue *queue = rsp->queue;

	atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);

	if (rsp->n_rdma) {
		rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
				queue->cm_id->port_num, rsp->req.sg,
				rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
	}

	if (rsp->req.sg != rsp->cmd->inline_sg)
		nvmet_req_free_sgl(&rsp->req);

	if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list)))
		nvmet_rdma_process_wr_wait_list(queue);

	nvmet_rdma_put_rsp(rsp);
}

static void nvmet_rdma_error_comp(struct nvmet_rdma_queue *queue)
{
	if (queue->nvme_sq.ctrl) {
		nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
	} else {
		/*
		 * We didn't set up the controller yet in case
		 * of an admin connect error, so just disconnect
		 * and clean up the queue.
		 */
		nvmet_rdma_queue_disconnect(queue);
	}
}

static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct nvmet_rdma_rsp *rsp =
		container_of(wc->wr_cqe, struct nvmet_rdma_rsp, send_cqe);
	struct nvmet_rdma_queue *queue = cq->cq_context;

	nvmet_rdma_release_rsp(rsp);

	if (unlikely(wc->status != IB_WC_SUCCESS &&
		     wc->status != IB_WC_WR_FLUSH_ERR)) {
		pr_err("SEND for CQE 0x%p failed with status %s (%d).\n",
			wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
		nvmet_rdma_error_comp(queue);
	}
}

static void nvmet_rdma_queue_response(struct nvmet_req *req)
{
	struct nvmet_rdma_rsp *rsp =
		container_of(req, struct nvmet_rdma_rsp, req);
	struct rdma_cm_id *cm_id = rsp->queue->cm_id;
	struct ib_send_wr *first_wr;

	if (rsp->flags & NVMET_RDMA_REQ_INVALIDATE_RKEY) {
		rsp->send_wr.opcode = IB_WR_SEND_WITH_INV;
		rsp->send_wr.ex.invalidate_rkey = rsp->invalidate_rkey;
	} else {
		rsp->send_wr.opcode = IB_WR_SEND;
	}

	if (nvmet_rdma_need_data_out(rsp))
		first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp,
				cm_id->port_num, NULL, &rsp->send_wr);
	else
		first_wr = &rsp->send_wr;

	nvmet_rdma_post_recv(rsp->queue->dev, rsp->cmd);

	ib_dma_sync_single_for_device(rsp->queue->dev->device,
		rsp->send_sge.addr, rsp->send_sge.length,
		DMA_TO_DEVICE);

	if (unlikely(ib_post_send(cm_id->qp, first_wr, NULL))) {
		pr_err("sending cmd response failed\n");
		nvmet_rdma_release_rsp(rsp);
	}
}

static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct nvmet_rdma_rsp *rsp =
		container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe);
	struct nvmet_rdma_queue *queue = cq->cq_context;

	WARN_ON(rsp->n_rdma <= 0);
	atomic_add(rsp->n_rdma, &queue->sq_wr_avail);
	rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
			queue->cm_id->port_num, rsp->req.sg,
			rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
	rsp->n_rdma = 0;

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		nvmet_req_uninit(&rsp->req);
		nvmet_rdma_release_rsp(rsp);
		if (wc->status != IB_WC_WR_FLUSH_ERR) {
			pr_info("RDMA READ for CQE 0x%p failed with status %s (%d).\n",
				wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
			nvmet_rdma_error_comp(queue);
		}
		return;
	}

	nvmet_req_execute(&rsp->req);
}

static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len,
		u64 off)
{
	int sg_count = num_pages(len);
	struct scatterlist *sg;
	int i;

	sg = rsp->cmd->inline_sg;
	for (i = 0; i < sg_count; i++, sg++) {
		if (i < sg_count - 1)
			sg_unmark_end(sg);
		else
			sg_mark_end(sg);
		sg->offset = off;
		sg->length = min_t(int, len, PAGE_SIZE - off);
		len -= sg->length;
		if (!i)
			off = 0;
	}

	rsp->req.sg = rsp->cmd->inline_sg;
	rsp->req.sg_cnt = sg_count;
}
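
/*
 * Map an in-capsule data SGL: the data already arrived together with the
 * command, so the request can point straight at the pre-allocated inline
 * pages.  The offset/length must fit within the advertised
 * inline_data_size, and in-capsule data is only valid for writes.
 */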
static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp)
{
	struct nvme_sgl_desc *sgl = &rsp->req.cmd->common.dptr.sgl;
	u64 off = le64_to_cpu(sgl->addr);
	u32 len = le32_to_cpu(sgl->length);

	if (!nvme_is_write(rsp->req.cmd)) {
		rsp->req.error_loc =
			offsetof(struct nvme_common_command, opcode);
		return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
	}

	if (off + len > rsp->queue->dev->inline_data_size) {
		pr_err("invalid inline data offset!\n");
		return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
	}

	/* no data command? */
	if (!len)
		return 0;

	nvmet_rdma_use_inline_sg(rsp, len, off);
	rsp->flags |= NVMET_RDMA_REQ_INLINE_DATA;
	rsp->req.transfer_len += len;
	return 0;
}

static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
		struct nvme_keyed_sgl_desc *sgl, bool invalidate)
{
	struct rdma_cm_id *cm_id = rsp->queue->cm_id;
	u64 addr = le64_to_cpu(sgl->addr);
	u32 key = get_unaligned_le32(sgl->key);
	int ret;

	rsp->req.transfer_len = get_unaligned_le24(sgl->length);

	/* no data command? */
	if (!rsp->req.transfer_len)
		return 0;

	ret = nvmet_req_alloc_sgl(&rsp->req);
	if (ret < 0)
		goto error_out;

	ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num,
			rsp->req.sg, rsp->req.sg_cnt, 0, addr, key,
			nvmet_data_dir(&rsp->req));
	if (ret < 0)
		goto error_out;
	rsp->n_rdma += ret;

	if (invalidate) {
		rsp->invalidate_rkey = key;
		rsp->flags |= NVMET_RDMA_REQ_INVALIDATE_RKEY;
	}

	return 0;

error_out:
	rsp->req.transfer_len = 0;
	return NVME_SC_INTERNAL;
}

static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp)
{
	struct nvme_keyed_sgl_desc *sgl = &rsp->req.cmd->common.dptr.ksgl;

	switch (sgl->type >> 4) {
	case NVME_SGL_FMT_DATA_DESC:
		switch (sgl->type & 0xf) {
		case NVME_SGL_FMT_OFFSET:
			return nvmet_rdma_map_sgl_inline(rsp);
		default:
			pr_err("invalid SGL subtype: %#x\n", sgl->type);
			rsp->req.error_loc =
				offsetof(struct nvme_common_command, dptr);
			return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
		}
	case NVME_KEY_SGL_FMT_DATA_DESC:
		switch (sgl->type & 0xf) {
		case NVME_SGL_FMT_ADDRESS | NVME_SGL_FMT_INVALIDATE:
			return nvmet_rdma_map_sgl_keyed(rsp, sgl, true);
		case NVME_SGL_FMT_ADDRESS:
			return nvmet_rdma_map_sgl_keyed(rsp, sgl, false);
		default:
			pr_err("invalid SGL subtype: %#x\n", sgl->type);
			rsp->req.error_loc =
				offsetof(struct nvme_common_command, dptr);
			return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
		}
	default:
		pr_err("invalid SGL type: %#x\n", sgl->type);
		rsp->req.error_loc = offsetof(struct nvme_common_command, dptr);
		return NVME_SC_SGL_INVALID_TYPE | NVME_SC_DNR;
	}
}

static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp)
{
	struct nvmet_rdma_queue *queue = rsp->queue;

	if (unlikely(atomic_sub_return(1 + rsp->n_rdma,
			&queue->sq_wr_avail) < 0)) {
		pr_debug("IB send queue full (needed %d): queue %u cntlid %u\n",
				1 + rsp->n_rdma, queue->idx,
				queue->nvme_sq.ctrl->cntlid);
		atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);
		return false;
	}

	if (nvmet_rdma_need_data_in(rsp)) {
		if (rdma_rw_ctx_post(&rsp->rw, queue->cm_id->qp,
				queue->cm_id->port_num, &rsp->read_cqe, NULL))
			nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR);
	} else {
		nvmet_req_execute(&rsp->req);
	}

	return true;
}
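
/*
 * Per-command entry point from the RECV completion path: initialize the
 * nvmet request, map its SGL and either execute it immediately or, if the
 * send queue has no room for the required WRs, park it on
 * rsp_wr_wait_list until completions free up slots.
 */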
static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue,
		struct nvmet_rdma_rsp *cmd)
{
	u16 status;

	ib_dma_sync_single_for_cpu(queue->dev->device,
		cmd->cmd->sge[0].addr, cmd->cmd->sge[0].length,
		DMA_FROM_DEVICE);
	ib_dma_sync_single_for_cpu(queue->dev->device,
		cmd->send_sge.addr, cmd->send_sge.length,
		DMA_TO_DEVICE);

	cmd->req.p2p_client = &queue->dev->device->dev;

	if (!nvmet_req_init(&cmd->req, &queue->nvme_cq,
			&queue->nvme_sq, &nvmet_rdma_ops))
		return;

	status = nvmet_rdma_map_sgl(cmd);
	if (status)
		goto out_err;

	if (unlikely(!nvmet_rdma_execute_command(cmd))) {
		spin_lock(&queue->rsp_wr_wait_lock);
		list_add_tail(&cmd->wait_list, &queue->rsp_wr_wait_list);
		spin_unlock(&queue->rsp_wr_wait_lock);
	}

	return;

out_err:
	nvmet_req_complete(&cmd->req, status);
}

static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct nvmet_rdma_cmd *cmd =
		container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe);
	struct nvmet_rdma_queue *queue = cq->cq_context;
	struct nvmet_rdma_rsp *rsp;

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		if (wc->status != IB_WC_WR_FLUSH_ERR) {
			pr_err("RECV for CQE 0x%p failed with status %s (%d)\n",
				wc->wr_cqe, ib_wc_status_msg(wc->status),
				wc->status);
			nvmet_rdma_error_comp(queue);
		}
		return;
	}

	if (unlikely(wc->byte_len < sizeof(struct nvme_command))) {
		pr_err("Ctrl Fatal Error: capsule size less than 64 bytes\n");
		nvmet_rdma_error_comp(queue);
		return;
	}

	cmd->queue = queue;
	rsp = nvmet_rdma_get_rsp(queue);
	if (unlikely(!rsp)) {
		/*
		 * we get here only under memory pressure,
		 * silently drop and have the host retry
		 * as we can't even fail it.
		 */
		nvmet_rdma_post_recv(queue->dev, cmd);
		return;
	}
	rsp->queue = queue;
	rsp->cmd = cmd;
	rsp->flags = 0;
	rsp->req.cmd = cmd->nvme_cmd;
	rsp->req.port = queue->port;
	rsp->n_rdma = 0;

	if (unlikely(queue->state != NVMET_RDMA_Q_LIVE)) {
		unsigned long flags;

		spin_lock_irqsave(&queue->state_lock, flags);
		if (queue->state == NVMET_RDMA_Q_CONNECTING)
			list_add_tail(&rsp->wait_list, &queue->rsp_wait_list);
		else
			nvmet_rdma_put_rsp(rsp);
		spin_unlock_irqrestore(&queue->state_lock, flags);
		return;
	}

	nvmet_rdma_handle_command(queue, rsp);
}

static void nvmet_rdma_destroy_srq(struct nvmet_rdma_device *ndev)
{
	if (!ndev->srq)
		return;

	nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
	ib_destroy_srq(ndev->srq);
}
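
/*
 * In SRQ mode a single shared receive queue (and one set of receive
 * commands) serves every queue on the device.  If the device cannot
 * create an SRQ we silently fall back to per-queue receive queues.
 */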
static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev)
{
	struct ib_srq_init_attr srq_attr = { NULL, };
	struct ib_srq *srq;
	size_t srq_size;
	int ret, i;

	srq_size = 4095;	/* XXX: tune */

	srq_attr.attr.max_wr = srq_size;
	srq_attr.attr.max_sge = 1 + ndev->inline_page_count;
	srq_attr.attr.srq_limit = 0;
	srq_attr.srq_type = IB_SRQT_BASIC;
	srq = ib_create_srq(ndev->pd, &srq_attr);
	if (IS_ERR(srq)) {
		/*
		 * If SRQs aren't supported we just go ahead and use normal
		 * non-shared receive queues.
		 */
		pr_info("SRQ requested but not supported.\n");
		return 0;
	}

	ndev->srq_cmds = nvmet_rdma_alloc_cmds(ndev, srq_size, false);
	if (IS_ERR(ndev->srq_cmds)) {
		ret = PTR_ERR(ndev->srq_cmds);
		goto out_destroy_srq;
	}

	ndev->srq = srq;
	ndev->srq_size = srq_size;

	for (i = 0; i < srq_size; i++) {
		ret = nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]);
		if (ret)
			goto out_free_cmds;
	}

	return 0;

out_free_cmds:
	nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
out_destroy_srq:
	ib_destroy_srq(srq);
	return ret;
}

static void nvmet_rdma_free_dev(struct kref *ref)
{
	struct nvmet_rdma_device *ndev =
		container_of(ref, struct nvmet_rdma_device, ref);

	mutex_lock(&device_list_mutex);
	list_del(&ndev->entry);
	mutex_unlock(&device_list_mutex);

	nvmet_rdma_destroy_srq(ndev);
	ib_dealloc_pd(ndev->pd);

	kfree(ndev);
}

static struct nvmet_rdma_device *
nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id)
{
	struct nvmet_port *port = cm_id->context;
	struct nvmet_rdma_device *ndev;
	int inline_page_count;
	int inline_sge_count;
	int ret;

	mutex_lock(&device_list_mutex);
	list_for_each_entry(ndev, &device_list, entry) {
		if (ndev->device->node_guid == cm_id->device->node_guid &&
		    kref_get_unless_zero(&ndev->ref))
			goto out_unlock;
	}

	ndev = kzalloc(sizeof(*ndev), GFP_KERNEL);
	if (!ndev)
		goto out_err;

	inline_page_count = num_pages(port->inline_data_size);
	inline_sge_count = max(cm_id->device->attrs.max_sge_rd,
				cm_id->device->attrs.max_recv_sge) - 1;
	if (inline_page_count > inline_sge_count) {
		pr_warn("inline_data_size %d cannot be supported by device %s. Reducing to %lu.\n",
			port->inline_data_size, cm_id->device->name,
			inline_sge_count * PAGE_SIZE);
		port->inline_data_size = inline_sge_count * PAGE_SIZE;
		inline_page_count = inline_sge_count;
	}
	ndev->inline_data_size = port->inline_data_size;
	ndev->inline_page_count = inline_page_count;
	ndev->device = cm_id->device;
	kref_init(&ndev->ref);

	ndev->pd = ib_alloc_pd(ndev->device, 0);
	if (IS_ERR(ndev->pd))
		goto out_free_dev;

	if (nvmet_rdma_use_srq) {
		ret = nvmet_rdma_init_srq(ndev);
		if (ret)
			goto out_free_pd;
	}

	list_add(&ndev->entry, &device_list);
out_unlock:
	mutex_unlock(&device_list_mutex);
	pr_debug("added %s.\n", ndev->device->name);
	return ndev;

out_free_pd:
	ib_dealloc_pd(ndev->pd);
out_free_dev:
	kfree(ndev);
out_err:
	mutex_unlock(&device_list_mutex);
	return NULL;
}
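
/*
 * Create the per-queue completion queue and RC QP.  The CQ must have room
 * for all RECVs plus, for each command slot, one RDMA READ/WRITE and one
 * SEND completion; sq_wr_avail tracks the remaining send work requests so
 * commands can be throttled before the send queue overflows.
 */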
static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
{
	struct ib_qp_init_attr qp_attr;
	struct nvmet_rdma_device *ndev = queue->dev;
	int comp_vector, nr_cqe, ret, i;

	/*
	 * Spread the io queues across completion vectors,
	 * but still keep all admin queues on vector 0.
	 */
	comp_vector = !queue->host_qid ? 0 :
		queue->idx % ndev->device->num_comp_vectors;

	/*
	 * Reserve CQ slots for RECV + RDMA_READ/RDMA_WRITE + RDMA_SEND.
	 */
	nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size;

	queue->cq = ib_alloc_cq(ndev->device, queue,
			nr_cqe + 1, comp_vector,
			IB_POLL_WORKQUEUE);
	if (IS_ERR(queue->cq)) {
		ret = PTR_ERR(queue->cq);
		pr_err("failed to create CQ cqe= %d ret= %d\n",
		       nr_cqe + 1, ret);
		goto out;
	}

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_context = queue;
	qp_attr.event_handler = nvmet_rdma_qp_event;
	qp_attr.send_cq = queue->cq;
	qp_attr.recv_cq = queue->cq;
	qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	qp_attr.qp_type = IB_QPT_RC;
	/* +1 for drain */
	qp_attr.cap.max_send_wr = queue->send_queue_size + 1;
	qp_attr.cap.max_rdma_ctxs = queue->send_queue_size;
	qp_attr.cap.max_send_sge = max(ndev->device->attrs.max_sge_rd,
					ndev->device->attrs.max_send_sge);

	if (ndev->srq) {
		qp_attr.srq = ndev->srq;
	} else {
		/* +1 for drain */
		qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size;
		qp_attr.cap.max_recv_sge = 1 + ndev->inline_page_count;
	}

	ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr);
	if (ret) {
		pr_err("failed to create_qp ret= %d\n", ret);
		goto err_destroy_cq;
	}

	atomic_set(&queue->sq_wr_avail, qp_attr.cap.max_send_wr);

	pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d cm_id= %p\n",
		 __func__, queue->cq->cqe, qp_attr.cap.max_send_sge,
		 qp_attr.cap.max_send_wr, queue->cm_id);

	if (!ndev->srq) {
		for (i = 0; i < queue->recv_queue_size; i++) {
			queue->cmds[i].queue = queue;
			ret = nvmet_rdma_post_recv(ndev, &queue->cmds[i]);
			if (ret)
				goto err_destroy_qp;
		}
	}

out:
	return ret;

err_destroy_qp:
	rdma_destroy_qp(queue->cm_id);
err_destroy_cq:
	ib_free_cq(queue->cq);
	goto out;
}

static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
{
	struct ib_qp *qp = queue->cm_id->qp;

	ib_drain_qp(qp);
	rdma_destroy_id(queue->cm_id);
	ib_destroy_qp(qp);
	ib_free_cq(queue->cq);
}

static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
{
	pr_debug("freeing queue %d\n", queue->idx);

	nvmet_sq_destroy(&queue->nvme_sq);

	nvmet_rdma_destroy_queue_ib(queue);
	if (!queue->dev->srq) {
		nvmet_rdma_free_cmds(queue->dev, queue->cmds,
				queue->recv_queue_size,
				!queue->host_qid);
	}
	nvmet_rdma_free_rsps(queue);
	ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
	kfree(queue);
}

static void nvmet_rdma_release_queue_work(struct work_struct *w)
{
	struct nvmet_rdma_queue *queue =
		container_of(w, struct nvmet_rdma_queue, release_work);
	struct nvmet_rdma_device *dev = queue->dev;

	nvmet_rdma_free_queue(queue);

	kref_put(&dev->ref, nvmet_rdma_free_dev);
}
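
/*
 * Validate the NVMe/RDMA CM connect private data: only record format 1.0
 * is accepted, and the admin queue is limited to NVME_AQ_DEPTH entries.
 */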
static int
nvmet_rdma_parse_cm_connect_req(struct rdma_conn_param *conn,
				struct nvmet_rdma_queue *queue)
{
	struct nvme_rdma_cm_req *req;

	req = (struct nvme_rdma_cm_req *)conn->private_data;
	if (!req || conn->private_data_len == 0)
		return NVME_RDMA_CM_INVALID_LEN;

	if (le16_to_cpu(req->recfmt) != NVME_RDMA_CM_FMT_1_0)
		return NVME_RDMA_CM_INVALID_RECFMT;

	queue->host_qid = le16_to_cpu(req->qid);

	/*
	 * req->hsqsize corresponds to our recv queue size plus 1
	 * req->hrqsize corresponds to our send queue size
	 */
	queue->recv_queue_size = le16_to_cpu(req->hsqsize) + 1;
	queue->send_queue_size = le16_to_cpu(req->hrqsize);

	if (!queue->host_qid && queue->recv_queue_size > NVME_AQ_DEPTH)
		return NVME_RDMA_CM_INVALID_HSQSIZE;

	/* XXX: Should we enforce some kind of max for IO queues? */

	return 0;
}

static int nvmet_rdma_cm_reject(struct rdma_cm_id *cm_id,
				enum nvme_rdma_cm_status status)
{
	struct nvme_rdma_cm_rej rej;

	pr_debug("rejecting connect request: status %d (%s)\n",
		 status, nvme_rdma_cm_msg(status));

	rej.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
	rej.sts = cpu_to_le16(status);

	return rdma_reject(cm_id, (void *)&rej, sizeof(rej));
}

static struct nvmet_rdma_queue *
nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev,
		struct rdma_cm_id *cm_id,
		struct rdma_cm_event *event)
{
	struct nvmet_rdma_queue *queue;
	int ret;

	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
	if (!queue) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_reject;
	}

	ret = nvmet_sq_init(&queue->nvme_sq);
	if (ret) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_free_queue;
	}

	ret = nvmet_rdma_parse_cm_connect_req(&event->param.conn, queue);
	if (ret)
		goto out_destroy_sq;

	/*
	 * Schedules the actual release because calling rdma_destroy_id from
	 * inside a CM callback would trigger a deadlock. (great API design..)
	 */
	INIT_WORK(&queue->release_work, nvmet_rdma_release_queue_work);
	queue->dev = ndev;
	queue->cm_id = cm_id;

	spin_lock_init(&queue->state_lock);
	queue->state = NVMET_RDMA_Q_CONNECTING;
	INIT_LIST_HEAD(&queue->rsp_wait_list);
	INIT_LIST_HEAD(&queue->rsp_wr_wait_list);
	spin_lock_init(&queue->rsp_wr_wait_lock);
	INIT_LIST_HEAD(&queue->free_rsps);
	spin_lock_init(&queue->rsps_lock);
	INIT_LIST_HEAD(&queue->queue_list);

	queue->idx = ida_simple_get(&nvmet_rdma_queue_ida, 0, 0, GFP_KERNEL);
	if (queue->idx < 0) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_destroy_sq;
	}

	ret = nvmet_rdma_alloc_rsps(queue);
	if (ret) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_ida_remove;
	}

	if (!ndev->srq) {
		queue->cmds = nvmet_rdma_alloc_cmds(ndev,
				queue->recv_queue_size,
				!queue->host_qid);
		if (IS_ERR(queue->cmds)) {
			ret = NVME_RDMA_CM_NO_RSC;
			goto out_free_responses;
		}
	}

	ret = nvmet_rdma_create_queue_ib(queue);
	if (ret) {
		pr_err("%s: creating RDMA queue failed (%d).\n",
			__func__, ret);
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_free_cmds;
	}

	return queue;

out_free_cmds:
	if (!ndev->srq) {
		nvmet_rdma_free_cmds(queue->dev, queue->cmds,
				queue->recv_queue_size,
				!queue->host_qid);
	}
out_free_responses:
	nvmet_rdma_free_rsps(queue);
out_ida_remove:
	ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
out_destroy_sq:
	nvmet_sq_destroy(&queue->nvme_sq);
out_free_queue:
	kfree(queue);
out_reject:
	nvmet_rdma_cm_reject(cm_id, ret);
	return NULL;
}
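
/*
 * QP async event handler: IB_EVENT_COMM_EST is forwarded to the CM via
 * rdma_notify() so connection establishment completes even when the first
 * message from the host arrives before the RTU; other events are only
 * logged.
 */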
static void nvmet_rdma_qp_event(struct ib_event *event, void *priv)
{
	struct nvmet_rdma_queue *queue = priv;

	switch (event->event) {
	case IB_EVENT_COMM_EST:
		rdma_notify(queue->cm_id, event->event);
		break;
	default:
		pr_err("received IB QP event: %s (%d)\n",
		       ib_event_msg(event->event), event->event);
		break;
	}
}

static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id,
		struct nvmet_rdma_queue *queue,
		struct rdma_conn_param *p)
{
	struct rdma_conn_param param = { };
	struct nvme_rdma_cm_rep priv = { };
	int ret = -ENOMEM;

	param.rnr_retry_count = 7;
	param.flow_control = 1;
	param.initiator_depth = min_t(u8, p->initiator_depth,
		queue->dev->device->attrs.max_qp_init_rd_atom);
	param.private_data = &priv;
	param.private_data_len = sizeof(priv);
	priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
	priv.crqsize = cpu_to_le16(queue->recv_queue_size);

	ret = rdma_accept(cm_id, &param);
	if (ret)
		pr_err("rdma_accept failed (error code = %d)\n", ret);

	return ret;
}

static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
		struct rdma_cm_event *event)
{
	struct nvmet_rdma_device *ndev;
	struct nvmet_rdma_queue *queue;
	int ret = -EINVAL;

	ndev = nvmet_rdma_find_get_device(cm_id);
	if (!ndev) {
		nvmet_rdma_cm_reject(cm_id, NVME_RDMA_CM_NO_RSC);
		return -ECONNREFUSED;
	}

	queue = nvmet_rdma_alloc_queue(ndev, cm_id, event);
	if (!queue) {
		ret = -ENOMEM;
		goto put_device;
	}
	queue->port = cm_id->context;

	if (queue->host_qid == 0) {
		/* Let inflight controller teardown complete */
		flush_scheduled_work();
	}

	ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn);
	if (ret) {
		schedule_work(&queue->release_work);
		/* Destroying rdma_cm id is not needed here */
		return 0;
	}

	mutex_lock(&nvmet_rdma_queue_mutex);
	list_add_tail(&queue->queue_list, &nvmet_rdma_queue_list);
	mutex_unlock(&nvmet_rdma_queue_mutex);

	return 0;

put_device:
	kref_put(&ndev->ref, nvmet_rdma_free_dev);

	return ret;
}

static void nvmet_rdma_queue_established(struct nvmet_rdma_queue *queue)
{
	unsigned long flags;

	spin_lock_irqsave(&queue->state_lock, flags);
	if (queue->state != NVMET_RDMA_Q_CONNECTING) {
		pr_warn("trying to establish a connected queue\n");
		goto out_unlock;
	}
	queue->state = NVMET_RDMA_Q_LIVE;

	while (!list_empty(&queue->rsp_wait_list)) {
		struct nvmet_rdma_rsp *cmd;

		cmd = list_first_entry(&queue->rsp_wait_list,
					struct nvmet_rdma_rsp, wait_list);
		list_del(&cmd->wait_list);

		spin_unlock_irqrestore(&queue->state_lock, flags);
		nvmet_rdma_handle_command(queue, cmd);
		spin_lock_irqsave(&queue->state_lock, flags);
	}

out_unlock:
	spin_unlock_irqrestore(&queue->state_lock, flags);
}

static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
{
	bool disconnect = false;
	unsigned long flags;

	pr_debug("cm_id= %p queue->state= %d\n", queue->cm_id, queue->state);

	spin_lock_irqsave(&queue->state_lock, flags);
	switch (queue->state) {
	case NVMET_RDMA_Q_CONNECTING:
	case NVMET_RDMA_Q_LIVE:
		queue->state = NVMET_RDMA_Q_DISCONNECTING;
		disconnect = true;
		break;
	case NVMET_RDMA_Q_DISCONNECTING:
		break;
	}
	spin_unlock_irqrestore(&queue->state_lock, flags);

	if (disconnect) {
		rdma_disconnect(queue->cm_id);
		schedule_work(&queue->release_work);
	}
}
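
/*
 * Take the queue off the global queue list before disconnecting so that
 * device removal and controller deletion cannot schedule a second
 * teardown for the same queue.
 */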
static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
{
	bool disconnect = false;

	mutex_lock(&nvmet_rdma_queue_mutex);
	if (!list_empty(&queue->queue_list)) {
		list_del_init(&queue->queue_list);
		disconnect = true;
	}
	mutex_unlock(&nvmet_rdma_queue_mutex);

	if (disconnect)
		__nvmet_rdma_queue_disconnect(queue);
}

static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id,
		struct nvmet_rdma_queue *queue)
{
	WARN_ON_ONCE(queue->state != NVMET_RDMA_Q_CONNECTING);

	mutex_lock(&nvmet_rdma_queue_mutex);
	if (!list_empty(&queue->queue_list))
		list_del_init(&queue->queue_list);
	mutex_unlock(&nvmet_rdma_queue_mutex);

	pr_err("failed to connect queue %d\n", queue->idx);
	schedule_work(&queue->release_work);
}

/**
 * nvmet_rdma_device_removal() - Handle RDMA device removal
 * @cm_id:	rdma_cm id, used for nvmet port
 * @queue:	nvmet rdma queue (cm id qp_context)
 *
 * DEVICE_REMOVAL event notifies us that the RDMA device is about
 * to unplug. Note that this event can be generated on a normal
 * queue cm_id and/or a device bound listener cm_id (where in this
 * case queue will be null).
 *
 * We registered an ib_client to handle device removal for queues,
 * so we only need to handle the listening port cm_ids. In this case
 * we nullify the priv to prevent double cm_id destruction and destroying
 * the cm_id implicitly by returning a non-zero rc to the callout.
 */
static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id,
		struct nvmet_rdma_queue *queue)
{
	struct nvmet_port *port;

	if (queue) {
		/*
		 * This is a queue cm_id. We have registered
		 * an ib_client to handle queue removal,
		 * so don't interfere and just return.
		 */
		return 0;
	}

	port = cm_id->context;

	/*
	 * This is a listener cm_id. Make sure that
	 * future remove_port won't invoke a double
	 * cm_id destroy. Use atomic xchg to make sure
	 * we don't compete with remove_port.
	 */
	if (xchg(&port->priv, NULL) != cm_id)
		return 0;

	/*
	 * We need to return 1 so that the core will destroy
	 * its own ID. What a great API design..
	 */
	return 1;
}
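
/*
 * Single RDMA CM callback shared by the listening cm_id and all per-queue
 * cm_ids; queue is only non-NULL once a QP has been created for the cm_id.
 */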
static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id,
		struct rdma_cm_event *event)
{
	struct nvmet_rdma_queue *queue = NULL;
	int ret = 0;

	if (cm_id->qp)
		queue = cm_id->qp->qp_context;

	pr_debug("%s (%d): status %d id %p\n",
		rdma_event_msg(event->event), event->event,
		event->status, cm_id);

	switch (event->event) {
	case RDMA_CM_EVENT_CONNECT_REQUEST:
		ret = nvmet_rdma_queue_connect(cm_id, event);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		nvmet_rdma_queue_established(queue);
		break;
	case RDMA_CM_EVENT_ADDR_CHANGE:
	case RDMA_CM_EVENT_DISCONNECTED:
	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
		nvmet_rdma_queue_disconnect(queue);
		break;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		ret = nvmet_rdma_device_removal(cm_id, queue);
		break;
	case RDMA_CM_EVENT_REJECTED:
		pr_debug("Connection rejected: %s\n",
			 rdma_reject_msg(cm_id, event->status));
		/* FALLTHROUGH */
	case RDMA_CM_EVENT_UNREACHABLE:
	case RDMA_CM_EVENT_CONNECT_ERROR:
		nvmet_rdma_queue_connect_fail(cm_id, queue);
		break;
	default:
		pr_err("received unrecognized RDMA CM event %d\n",
			event->event);
		break;
	}

	return ret;
}

static void nvmet_rdma_delete_ctrl(struct nvmet_ctrl *ctrl)
{
	struct nvmet_rdma_queue *queue;

restart:
	mutex_lock(&nvmet_rdma_queue_mutex);
	list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) {
		if (queue->nvme_sq.ctrl == ctrl) {
			list_del_init(&queue->queue_list);
			mutex_unlock(&nvmet_rdma_queue_mutex);

			__nvmet_rdma_queue_disconnect(queue);
			goto restart;
		}
	}
	mutex_unlock(&nvmet_rdma_queue_mutex);
}

static int nvmet_rdma_add_port(struct nvmet_port *port)
{
	struct rdma_cm_id *cm_id;
	struct sockaddr_storage addr = { };
	__kernel_sa_family_t af;
	int ret;

	switch (port->disc_addr.adrfam) {
	case NVMF_ADDR_FAMILY_IP4:
		af = AF_INET;
		break;
	case NVMF_ADDR_FAMILY_IP6:
		af = AF_INET6;
		break;
	default:
		pr_err("address family %d not supported\n",
				port->disc_addr.adrfam);
		return -EINVAL;
	}

	if (port->inline_data_size < 0) {
		port->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE;
	} else if (port->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) {
		pr_warn("inline_data_size %u is too large, reducing to %u\n",
			port->inline_data_size,
			NVMET_RDMA_MAX_INLINE_DATA_SIZE);
		port->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE;
	}

	ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr,
			port->disc_addr.trsvcid, &addr);
	if (ret) {
		pr_err("malformed ip/port passed: %s:%s\n",
			port->disc_addr.traddr, port->disc_addr.trsvcid);
		return ret;
	}

	cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port,
			RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(cm_id)) {
		pr_err("CM ID creation failed\n");
		return PTR_ERR(cm_id);
	}

	/*
	 * Allow both IPv4 and IPv6 sockets to bind a single port
	 * at the same time.
	 */
	ret = rdma_set_afonly(cm_id, 1);
	if (ret) {
		pr_err("rdma_set_afonly failed (%d)\n", ret);
		goto out_destroy_id;
	}

	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr);
	if (ret) {
		pr_err("binding CM ID to %pISpcs failed (%d)\n",
			(struct sockaddr *)&addr, ret);
		goto out_destroy_id;
	}

	ret = rdma_listen(cm_id, 128);
	if (ret) {
		pr_err("listening to %pISpcs failed (%d)\n",
			(struct sockaddr *)&addr, ret);
		goto out_destroy_id;
	}

	pr_info("enabling port %d (%pISpcs)\n",
		le16_to_cpu(port->disc_addr.portid), (struct sockaddr *)&addr);
	port->priv = cm_id;
	return 0;

out_destroy_id:
	rdma_destroy_id(cm_id);
	return ret;
}

static void nvmet_rdma_remove_port(struct nvmet_port *port)
{
	struct rdma_cm_id *cm_id = xchg(&port->priv, NULL);

	if (cm_id)
		rdma_destroy_id(cm_id);
}

static void nvmet_rdma_disc_port_addr(struct nvmet_req *req,
		struct nvmet_port *port, char *traddr)
{
	struct rdma_cm_id *cm_id = port->priv;

	if (inet_addr_is_any((struct sockaddr *)&cm_id->route.addr.src_addr)) {
		struct nvmet_rdma_rsp *rsp =
			container_of(req, struct nvmet_rdma_rsp, req);
		struct rdma_cm_id *req_cm_id = rsp->queue->cm_id;
		struct sockaddr *addr = (void *)&req_cm_id->route.addr.src_addr;

		sprintf(traddr, "%pISc", addr);
	} else {
		memcpy(traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE);
	}
}

static const struct nvmet_fabrics_ops nvmet_rdma_ops = {
	.owner			= THIS_MODULE,
	.type			= NVMF_TRTYPE_RDMA,
	.msdbd			= 1,
	.has_keyed_sgls		= 1,
	.add_port		= nvmet_rdma_add_port,
	.remove_port		= nvmet_rdma_remove_port,
	.queue_response		= nvmet_rdma_queue_response,
	.delete_ctrl		= nvmet_rdma_delete_ctrl,
	.disc_traddr		= nvmet_rdma_disc_port_addr,
};
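
/*
 * ib_client removal callback: disconnect every queue that is still using
 * the departing device and flush the scheduled release work so all queue
 * resources are freed before the device goes away.
 */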
static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data)
{
	struct nvmet_rdma_queue *queue, *tmp;
	struct nvmet_rdma_device *ndev;
	bool found = false;

	mutex_lock(&device_list_mutex);
	list_for_each_entry(ndev, &device_list, entry) {
		if (ndev->device == ib_device) {
			found = true;
			break;
		}
	}
	mutex_unlock(&device_list_mutex);

	if (!found)
		return;

	/*
	 * IB Device that is used by nvmet controllers is being removed,
	 * delete all queues using this device.
	 */
	mutex_lock(&nvmet_rdma_queue_mutex);
	list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list,
				 queue_list) {
		if (queue->dev->device != ib_device)
			continue;

		pr_info("Removing queue %d\n", queue->idx);
		list_del_init(&queue->queue_list);
		__nvmet_rdma_queue_disconnect(queue);
	}
	mutex_unlock(&nvmet_rdma_queue_mutex);

	flush_scheduled_work();
}

static struct ib_client nvmet_rdma_ib_client = {
	.name   = "nvmet_rdma",
	.remove = nvmet_rdma_remove_one
};

static int __init nvmet_rdma_init(void)
{
	int ret;

	ret = ib_register_client(&nvmet_rdma_ib_client);
	if (ret)
		return ret;

	ret = nvmet_register_transport(&nvmet_rdma_ops);
	if (ret)
		goto err_ib_client;

	return 0;

err_ib_client:
	ib_unregister_client(&nvmet_rdma_ib_client);
	return ret;
}

static void __exit nvmet_rdma_exit(void)
{
	nvmet_unregister_transport(&nvmet_rdma_ops);
	ib_unregister_client(&nvmet_rdma_ib_client);
	WARN_ON_ONCE(!list_empty(&nvmet_rdma_queue_list));
	ida_destroy(&nvmet_rdma_queue_ida);
}

module_init(nvmet_rdma_init);
module_exit(nvmet_rdma_exit);

MODULE_LICENSE("GPL v2");
MODULE_ALIAS("nvmet-transport-1"); /* 1 == NVMF_TRTYPE_RDMA */