// SPDX-License-Identifier: GPL-2.0
/*
 * NVMe over Fabrics RDMA target.
 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
 */
#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/delay.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/nvme.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/wait.h>
#include <linux/inet.h>
#include <asm/unaligned.h>

#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
#include <rdma/rw.h>

#include <linux/nvme-rdma.h>
#include "nvmet.h"

/*
 * We allow at least 1 page, up to 4 SGEs, and up to 16KB of inline data
 */
#define NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE	PAGE_SIZE
#define NVMET_RDMA_MAX_INLINE_SGE		4
#define NVMET_RDMA_MAX_INLINE_DATA_SIZE		max_t(int, SZ_16K, PAGE_SIZE)

struct nvmet_rdma_cmd {
	struct ib_sge sge[NVMET_RDMA_MAX_INLINE_SGE + 1];
	struct ib_cqe cqe;
	struct ib_recv_wr wr;
	struct scatterlist inline_sg[NVMET_RDMA_MAX_INLINE_SGE];
	struct nvme_command *nvme_cmd;
	struct nvmet_rdma_queue *queue;
};

enum {
	NVMET_RDMA_REQ_INLINE_DATA	= (1 << 0),
	NVMET_RDMA_REQ_INVALIDATE_RKEY	= (1 << 1),
};

struct nvmet_rdma_rsp {
	struct ib_sge send_sge;
	struct ib_cqe send_cqe;
	struct ib_send_wr send_wr;

	struct nvmet_rdma_cmd *cmd;
	struct nvmet_rdma_queue *queue;

	struct ib_cqe read_cqe;
	struct rdma_rw_ctx rw;

	struct nvmet_req req;

	bool allocated;
	u8 n_rdma;
	u32 flags;
	u32 invalidate_rkey;

	struct list_head wait_list;
	struct list_head free_list;
};

enum nvmet_rdma_queue_state {
	NVMET_RDMA_Q_CONNECTING,
	NVMET_RDMA_Q_LIVE,
	NVMET_RDMA_Q_DISCONNECTING,
};

struct nvmet_rdma_queue {
	struct rdma_cm_id *cm_id;
	struct nvmet_port *port;
	struct ib_cq *cq;
	atomic_t sq_wr_avail;
	struct nvmet_rdma_device *dev;
	spinlock_t state_lock;
	enum nvmet_rdma_queue_state state;
	struct nvmet_cq nvme_cq;
	struct nvmet_sq nvme_sq;

	struct nvmet_rdma_rsp *rsps;
	struct list_head free_rsps;
	spinlock_t rsps_lock;
	struct nvmet_rdma_cmd *cmds;

	struct work_struct release_work;
	struct list_head rsp_wait_list;
	struct list_head rsp_wr_wait_list;
	spinlock_t rsp_wr_wait_lock;

	int idx;
	int host_qid;
	int recv_queue_size;
	int send_queue_size;

	struct list_head queue_list;
};

struct nvmet_rdma_device {
	struct ib_device *device;
	struct ib_pd *pd;
	struct ib_srq *srq;
	struct nvmet_rdma_cmd *srq_cmds;
	size_t srq_size;
	struct kref ref;
	struct list_head entry;
	int inline_data_size;
	int inline_page_count;
};

static bool nvmet_rdma_use_srq;
module_param_named(use_srq, nvmet_rdma_use_srq, bool, 0444);
MODULE_PARM_DESC(use_srq, "Use shared receive queue.");

static DEFINE_IDA(nvmet_rdma_queue_ida);
static LIST_HEAD(nvmet_rdma_queue_list);
static DEFINE_MUTEX(nvmet_rdma_queue_mutex);

static LIST_HEAD(device_list);
static DEFINE_MUTEX(device_list_mutex);

static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp);
static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_qp_event(struct ib_event *event, void *priv);
static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue);
static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev,
				struct nvmet_rdma_rsp *r);
static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
				struct nvmet_rdma_rsp *r);

static const struct nvmet_fabrics_ops nvmet_rdma_ops;

static int num_pages(int len)
{
	return 1 + (((len - 1) & PAGE_MASK) >> PAGE_SHIFT);
}

/* XXX: really should move to a generic header sooner or later.. */
static inline u32 get_unaligned_le24(const u8 *p)
{
	return (u32)p[0] | (u32)p[1] << 8 | (u32)p[2] << 16;
}

static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp)
{
	return nvme_is_write(rsp->req.cmd) &&
		rsp->req.transfer_len &&
		!(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
}

static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp)
{
	return !nvme_is_write(rsp->req.cmd) &&
		rsp->req.transfer_len &&
		!rsp->req.cqe->status &&
		!(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
}

static inline struct nvmet_rdma_rsp *
nvmet_rdma_get_rsp(struct nvmet_rdma_queue *queue)
{
	struct nvmet_rdma_rsp *rsp;
	unsigned long flags;

	spin_lock_irqsave(&queue->rsps_lock, flags);
	rsp = list_first_entry_or_null(&queue->free_rsps,
				struct nvmet_rdma_rsp, free_list);
	if (likely(rsp))
		list_del(&rsp->free_list);
	spin_unlock_irqrestore(&queue->rsps_lock, flags);

	if (unlikely(!rsp)) {
		int ret;

		rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
		if (unlikely(!rsp))
			return NULL;
		ret = nvmet_rdma_alloc_rsp(queue->dev, rsp);
		if (unlikely(ret)) {
			kfree(rsp);
			return NULL;
		}

		rsp->allocated = true;
	}

	return rsp;
}

static inline void
nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
{
	unsigned long flags;

	if (unlikely(rsp->allocated)) {
		nvmet_rdma_free_rsp(rsp->queue->dev, rsp);
		kfree(rsp);
		return;
	}

	spin_lock_irqsave(&rsp->queue->rsps_lock, flags);
	list_add_tail(&rsp->free_list, &rsp->queue->free_rsps);
	spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags);
}

static void nvmet_rdma_free_inline_pages(struct nvmet_rdma_device *ndev,
				struct nvmet_rdma_cmd *c)
{
	struct scatterlist *sg;
	struct ib_sge *sge;
	int i;

	if (!ndev->inline_data_size)
		return;

	sg = c->inline_sg;
	sge = &c->sge[1];

	for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
		if (sge->length)
			ib_dma_unmap_page(ndev->device, sge->addr,
					sge->length, DMA_FROM_DEVICE);
		if (sg_page(sg))
			__free_page(sg_page(sg));
	}
}

static int nvmet_rdma_alloc_inline_pages(struct nvmet_rdma_device *ndev,
				struct nvmet_rdma_cmd *c)
{
	struct scatterlist *sg;
	struct ib_sge *sge;
	struct page *pg;
	int len;
	int i;

	if (!ndev->inline_data_size)
		return 0;

	sg = c->inline_sg;
	sg_init_table(sg, ndev->inline_page_count);
	sge = &c->sge[1];
	len = ndev->inline_data_size;

	for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
		pg = alloc_page(GFP_KERNEL);
		if (!pg)
			goto out_err;
		sg_assign_page(sg, pg);
		sge->addr = ib_dma_map_page(ndev->device,
			pg, 0, PAGE_SIZE, DMA_FROM_DEVICE);
		if (ib_dma_mapping_error(ndev->device, sge->addr))
			goto out_err;
		sge->length = min_t(int, len, PAGE_SIZE);
		sge->lkey = ndev->pd->local_dma_lkey;
		len -= sge->length;
	}

	return 0;
out_err:
	for (; i >= 0; i--, sg--, sge--) {
		if (sge->length)
			ib_dma_unmap_page(ndev->device, sge->addr,
					sge->length, DMA_FROM_DEVICE);
		if (sg_page(sg))
			__free_page(sg_page(sg));
	}
	return -ENOMEM;
}

static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev,
			struct nvmet_rdma_cmd *c, bool admin)
{
	/* NVMe command / RDMA RECV */
	c->nvme_cmd = kmalloc(sizeof(*c->nvme_cmd), GFP_KERNEL);
	if (!c->nvme_cmd)
		goto out;

	c->sge[0].addr = ib_dma_map_single(ndev->device, c->nvme_cmd,
			sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
	if (ib_dma_mapping_error(ndev->device, c->sge[0].addr))
		goto out_free_cmd;

	c->sge[0].length = sizeof(*c->nvme_cmd);
	c->sge[0].lkey = ndev->pd->local_dma_lkey;

	if (!admin && nvmet_rdma_alloc_inline_pages(ndev, c))
		goto out_unmap_cmd;

	c->cqe.done = nvmet_rdma_recv_done;

	c->wr.wr_cqe = &c->cqe;
	c->wr.sg_list = c->sge;
	c->wr.num_sge = admin ? 1 : ndev->inline_page_count + 1;

	return 0;

out_unmap_cmd:
	ib_dma_unmap_single(ndev->device, c->sge[0].addr,
			sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
out_free_cmd:
	kfree(c->nvme_cmd);

out:
	return -ENOMEM;
}

static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_cmd *c, bool admin)
{
	if (!admin)
		nvmet_rdma_free_inline_pages(ndev, c);
	ib_dma_unmap_single(ndev->device, c->sge[0].addr,
				sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
	kfree(c->nvme_cmd);
}

static struct nvmet_rdma_cmd *
nvmet_rdma_alloc_cmds(struct nvmet_rdma_device *ndev,
		int nr_cmds, bool admin)
{
	struct nvmet_rdma_cmd *cmds;
	int ret = -EINVAL, i;

	cmds = kcalloc(nr_cmds, sizeof(struct nvmet_rdma_cmd), GFP_KERNEL);
	if (!cmds)
		goto out;

	for (i = 0; i < nr_cmds; i++) {
		ret = nvmet_rdma_alloc_cmd(ndev, cmds + i, admin);
		if (ret)
			goto out_free;
	}

	return cmds;

out_free:
	while (--i >= 0)
		nvmet_rdma_free_cmd(ndev, cmds + i, admin);
	kfree(cmds);
out:
	return ERR_PTR(ret);
}

static void nvmet_rdma_free_cmds(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_cmd *cmds, int nr_cmds, bool admin)
{
	int i;

	for (i = 0; i < nr_cmds; i++)
		nvmet_rdma_free_cmd(ndev, cmds + i, admin);
	kfree(cmds);
}

static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_rsp *r)
{
	/* NVMe CQE / RDMA SEND */
	r->req.cqe = kmalloc(sizeof(*r->req.cqe), GFP_KERNEL);
	if (!r->req.cqe)
		goto out;

	r->send_sge.addr = ib_dma_map_single(ndev->device, r->req.cqe,
			sizeof(*r->req.cqe), DMA_TO_DEVICE);
	if (ib_dma_mapping_error(ndev->device, r->send_sge.addr))
		goto out_free_rsp;

	r->req.p2p_client = &ndev->device->dev;
	r->send_sge.length = sizeof(*r->req.cqe);
	r->send_sge.lkey = ndev->pd->local_dma_lkey;

	r->send_cqe.done = nvmet_rdma_send_done;

	r->send_wr.wr_cqe = &r->send_cqe;
	r->send_wr.sg_list = &r->send_sge;
	r->send_wr.num_sge = 1;
	r->send_wr.send_flags = IB_SEND_SIGNALED;

	/* Data In / RDMA READ */
	r->read_cqe.done = nvmet_rdma_read_data_done;
	return 0;

out_free_rsp:
	kfree(r->req.cqe);
out:
	return -ENOMEM;
}

static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_rsp *r)
{
	ib_dma_unmap_single(ndev->device, r->send_sge.addr,
				sizeof(*r->req.cqe), DMA_TO_DEVICE);
	kfree(r->req.cqe);
}

static int
nvmet_rdma_alloc_rsps(struct nvmet_rdma_queue *queue)
{
	struct nvmet_rdma_device *ndev = queue->dev;
	int nr_rsps = queue->recv_queue_size * 2;
	int ret = -EINVAL, i;

	queue->rsps = kcalloc(nr_rsps, sizeof(struct nvmet_rdma_rsp),
			GFP_KERNEL);
	if (!queue->rsps)
		goto out;

	for (i = 0; i < nr_rsps; i++) {
		struct nvmet_rdma_rsp *rsp = &queue->rsps[i];

		ret = nvmet_rdma_alloc_rsp(ndev, rsp);
		if (ret)
			goto out_free;

		list_add_tail(&rsp->free_list, &queue->free_rsps);
	}

	return 0;

out_free:
	while (--i >= 0) {
		struct nvmet_rdma_rsp *rsp = &queue->rsps[i];

		list_del(&rsp->free_list);
		nvmet_rdma_free_rsp(ndev, rsp);
	}
	kfree(queue->rsps);
out:
	return ret;
}

static void nvmet_rdma_free_rsps(struct nvmet_rdma_queue *queue)
{
	struct nvmet_rdma_device *ndev = queue->dev;
	int i, nr_rsps = queue->recv_queue_size * 2;

	for (i = 0; i < nr_rsps; i++) {
		struct nvmet_rdma_rsp *rsp = &queue->rsps[i];

		list_del(&rsp->free_list);
		nvmet_rdma_free_rsp(ndev, rsp);
	}
	kfree(queue->rsps);
}

static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_cmd *cmd)
{
	int ret;

	ib_dma_sync_single_for_device(ndev->device,
		cmd->sge[0].addr, cmd->sge[0].length,
		DMA_FROM_DEVICE);

	if (ndev->srq)
		ret = ib_post_srq_recv(ndev->srq, &cmd->wr, NULL);
	else
		ret = ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, NULL);

	if (unlikely(ret))
		pr_err("post_recv cmd failed\n");

	return ret;
}

static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue)
{
	spin_lock(&queue->rsp_wr_wait_lock);
	while (!list_empty(&queue->rsp_wr_wait_list)) {
		struct nvmet_rdma_rsp *rsp;
		bool ret;

		rsp = list_entry(queue->rsp_wr_wait_list.next,
				struct nvmet_rdma_rsp, wait_list);
		list_del(&rsp->wait_list);

		spin_unlock(&queue->rsp_wr_wait_lock);
		ret = nvmet_rdma_execute_command(rsp);
		spin_lock(&queue->rsp_wr_wait_lock);

		if (!ret) {
			list_add(&rsp->wait_list, &queue->rsp_wr_wait_list);
			break;
		}
	}
	spin_unlock(&queue->rsp_wr_wait_lock);
}


static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
{
	struct nvmet_rdma_queue *queue = rsp->queue;

	atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);

	if (rsp->n_rdma) {
		rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
				queue->cm_id->port_num, rsp->req.sg,
				rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
	}

	if (rsp->req.sg != rsp->cmd->inline_sg)
		nvmet_req_free_sgl(&rsp->req);

	if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list)))
		nvmet_rdma_process_wr_wait_list(queue);

	nvmet_rdma_put_rsp(rsp);
}

static void nvmet_rdma_error_comp(struct nvmet_rdma_queue *queue)
{
	if (queue->nvme_sq.ctrl) {
		nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
	} else {
		/*
		 * we didn't set up the controller yet in case
		 * of admin connect error, just disconnect and
		 * clean up the queue
		 */
		nvmet_rdma_queue_disconnect(queue);
	}
}

static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct nvmet_rdma_rsp *rsp =
		container_of(wc->wr_cqe, struct nvmet_rdma_rsp, send_cqe);
	struct nvmet_rdma_queue *queue = cq->cq_context;

	nvmet_rdma_release_rsp(rsp);

	if (unlikely(wc->status != IB_WC_SUCCESS &&
		     wc->status != IB_WC_WR_FLUSH_ERR)) {
		pr_err("SEND for CQE 0x%p failed with status %s (%d).\n",
			wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
		nvmet_rdma_error_comp(queue);
	}
}

static void nvmet_rdma_queue_response(struct nvmet_req *req)
{
	struct nvmet_rdma_rsp *rsp =
		container_of(req, struct nvmet_rdma_rsp, req);
	struct rdma_cm_id *cm_id = rsp->queue->cm_id;
	struct ib_send_wr *first_wr;

	if (rsp->flags & NVMET_RDMA_REQ_INVALIDATE_RKEY) {
		rsp->send_wr.opcode = IB_WR_SEND_WITH_INV;
		rsp->send_wr.ex.invalidate_rkey = rsp->invalidate_rkey;
	} else {
		rsp->send_wr.opcode = IB_WR_SEND;
	}

	if (nvmet_rdma_need_data_out(rsp))
		first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp,
				cm_id->port_num, NULL, &rsp->send_wr);
	else
		first_wr = &rsp->send_wr;

	nvmet_rdma_post_recv(rsp->queue->dev, rsp->cmd);

	ib_dma_sync_single_for_device(rsp->queue->dev->device,
		rsp->send_sge.addr, rsp->send_sge.length,
		DMA_TO_DEVICE);

	if (unlikely(ib_post_send(cm_id->qp, first_wr, NULL))) {
		pr_err("sending cmd response failed\n");
		nvmet_rdma_release_rsp(rsp);
	}
}

static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct nvmet_rdma_rsp *rsp =
		container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe);
	struct nvmet_rdma_queue *queue = cq->cq_context;

	WARN_ON(rsp->n_rdma <= 0);
	atomic_add(rsp->n_rdma, &queue->sq_wr_avail);
	rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
			queue->cm_id->port_num, rsp->req.sg,
			rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
	rsp->n_rdma = 0;

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		nvmet_req_uninit(&rsp->req);
		nvmet_rdma_release_rsp(rsp);
		if (wc->status != IB_WC_WR_FLUSH_ERR) {
			pr_info("RDMA READ for CQE 0x%p failed with status %s (%d).\n",
				wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
			nvmet_rdma_error_comp(queue);
		}
		return;
	}

	nvmet_req_execute(&rsp->req);
}

static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len,
		u64 off)
{
	int sg_count = num_pages(len);
	struct scatterlist *sg;
	int i;

	sg = rsp->cmd->inline_sg;
	for (i = 0; i < sg_count; i++, sg++) {
		if (i < sg_count - 1)
			sg_unmark_end(sg);
		else
			sg_mark_end(sg);
		sg->offset = off;
		sg->length = min_t(int, len, PAGE_SIZE - off);
		len -= sg->length;
		if (!i)
			off = 0;
	}

	rsp->req.sg = rsp->cmd->inline_sg;
	rsp->req.sg_cnt = sg_count;
}

static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp)
{
	struct nvme_sgl_desc *sgl = &rsp->req.cmd->common.dptr.sgl;
	u64 off = le64_to_cpu(sgl->addr);
	u32 len = le32_to_cpu(sgl->length);

	if (!nvme_is_write(rsp->req.cmd)) {
		rsp->req.error_loc =
			offsetof(struct nvme_common_command, opcode);
		return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
	}

	if (off + len > rsp->queue->dev->inline_data_size) {
		pr_err("invalid inline data offset!\n");
		return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
	}

	/* no data command? */
	if (!len)
		return 0;

	nvmet_rdma_use_inline_sg(rsp, len, off);
	rsp->flags |= NVMET_RDMA_REQ_INLINE_DATA;
	rsp->req.transfer_len += len;
	return 0;
}

static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
		struct nvme_keyed_sgl_desc *sgl, bool invalidate)
{
	struct rdma_cm_id *cm_id = rsp->queue->cm_id;
	u64 addr = le64_to_cpu(sgl->addr);
	u32 key = get_unaligned_le32(sgl->key);
	int ret;

	rsp->req.transfer_len = get_unaligned_le24(sgl->length);

	/* no data command? */
	if (!rsp->req.transfer_len)
		return 0;

	ret = nvmet_req_alloc_sgl(&rsp->req);
	if (ret < 0)
		goto error_out;

	ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num,
			rsp->req.sg, rsp->req.sg_cnt, 0, addr, key,
			nvmet_data_dir(&rsp->req));
	if (ret < 0)
		goto error_out;
	rsp->n_rdma += ret;

	if (invalidate) {
		rsp->invalidate_rkey = key;
		rsp->flags |= NVMET_RDMA_REQ_INVALIDATE_RKEY;
	}

	return 0;

error_out:
	rsp->req.transfer_len = 0;
	return NVME_SC_INTERNAL;
}

static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp)
{
	struct nvme_keyed_sgl_desc *sgl = &rsp->req.cmd->common.dptr.ksgl;

	switch (sgl->type >> 4) {
	case NVME_SGL_FMT_DATA_DESC:
		switch (sgl->type & 0xf) {
		case NVME_SGL_FMT_OFFSET:
			return nvmet_rdma_map_sgl_inline(rsp);
		default:
			pr_err("invalid SGL subtype: %#x\n", sgl->type);
			rsp->req.error_loc =
				offsetof(struct nvme_common_command, dptr);
			return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
		}
	case NVME_KEY_SGL_FMT_DATA_DESC:
		switch (sgl->type & 0xf) {
		case NVME_SGL_FMT_ADDRESS | NVME_SGL_FMT_INVALIDATE:
			return nvmet_rdma_map_sgl_keyed(rsp, sgl, true);
		case NVME_SGL_FMT_ADDRESS:
			return nvmet_rdma_map_sgl_keyed(rsp, sgl, false);
		default:
			pr_err("invalid SGL subtype: %#x\n", sgl->type);
			rsp->req.error_loc =
				offsetof(struct nvme_common_command, dptr);
			return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
		}
	default:
		pr_err("invalid SGL type: %#x\n", sgl->type);
		rsp->req.error_loc = offsetof(struct nvme_common_command, dptr);
		return NVME_SC_SGL_INVALID_TYPE | NVME_SC_DNR;
	}
}

static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp)
{
	struct nvmet_rdma_queue *queue = rsp->queue;

	if (unlikely(atomic_sub_return(1 + rsp->n_rdma,
			&queue->sq_wr_avail) < 0)) {
		pr_debug("IB send queue full (needed %d): queue %u cntlid %u\n",
				1 + rsp->n_rdma, queue->idx,
				queue->nvme_sq.ctrl->cntlid);
		atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);
		return false;
	}

	if (nvmet_rdma_need_data_in(rsp)) {
		if (rdma_rw_ctx_post(&rsp->rw, queue->cm_id->qp,
				queue->cm_id->port_num, &rsp->read_cqe, NULL))
			nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR);
	} else {
		nvmet_req_execute(&rsp->req);
	}

	return true;
}

static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue,
		struct nvmet_rdma_rsp *cmd)
{
	u16 status;

	ib_dma_sync_single_for_cpu(queue->dev->device,
		cmd->cmd->sge[0].addr, cmd->cmd->sge[0].length,
		DMA_FROM_DEVICE);
	ib_dma_sync_single_for_cpu(queue->dev->device,
		cmd->send_sge.addr, cmd->send_sge.length,
		DMA_TO_DEVICE);

	if (!nvmet_req_init(&cmd->req, &queue->nvme_cq,
			&queue->nvme_sq, &nvmet_rdma_ops))
		return;

	status = nvmet_rdma_map_sgl(cmd);
	if (status)
		goto out_err;

	if (unlikely(!nvmet_rdma_execute_command(cmd))) {
		spin_lock(&queue->rsp_wr_wait_lock);
		list_add_tail(&cmd->wait_list, &queue->rsp_wr_wait_list);
		spin_unlock(&queue->rsp_wr_wait_lock);
	}

	return;

out_err:
	nvmet_req_complete(&cmd->req, status);
}

static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct nvmet_rdma_cmd *cmd =
		container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe);
	struct nvmet_rdma_queue *queue = cq->cq_context;
	struct nvmet_rdma_rsp *rsp;

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		if (wc->status != IB_WC_WR_FLUSH_ERR) {
			pr_err("RECV for CQE 0x%p failed with status %s (%d)\n",
				wc->wr_cqe, ib_wc_status_msg(wc->status),
				wc->status);
			nvmet_rdma_error_comp(queue);
		}
		return;
	}

	if (unlikely(wc->byte_len < sizeof(struct nvme_command))) {
		pr_err("Ctrl Fatal Error: capsule size less than 64 bytes\n");
		nvmet_rdma_error_comp(queue);
		return;
	}

	cmd->queue = queue;
	rsp = nvmet_rdma_get_rsp(queue);
	if (unlikely(!rsp)) {
		/*
		 * we get here only under memory pressure,
		 * silently drop and have the host retry
		 * as we can't even fail it.
		 */
		nvmet_rdma_post_recv(queue->dev, cmd);
		return;
	}
	rsp->queue = queue;
	rsp->cmd = cmd;
	rsp->flags = 0;
	rsp->req.cmd = cmd->nvme_cmd;
	rsp->req.port = queue->port;
	rsp->n_rdma = 0;

	if (unlikely(queue->state != NVMET_RDMA_Q_LIVE)) {
		unsigned long flags;

		spin_lock_irqsave(&queue->state_lock, flags);
		if (queue->state == NVMET_RDMA_Q_CONNECTING)
			list_add_tail(&rsp->wait_list, &queue->rsp_wait_list);
		else
			nvmet_rdma_put_rsp(rsp);
		spin_unlock_irqrestore(&queue->state_lock, flags);
		return;
	}

	nvmet_rdma_handle_command(queue, rsp);
}

static void nvmet_rdma_destroy_srq(struct nvmet_rdma_device *ndev)
{
	if (!ndev->srq)
		return;

	nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
	ib_destroy_srq(ndev->srq);
}

static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev)
{
	struct ib_srq_init_attr srq_attr = { NULL, };
	struct ib_srq *srq;
	size_t srq_size;
	int ret, i;

	srq_size = 4095;	/* XXX: tune */

	srq_attr.attr.max_wr = srq_size;
	srq_attr.attr.max_sge = 1 + ndev->inline_page_count;
	srq_attr.attr.srq_limit = 0;
	srq_attr.srq_type = IB_SRQT_BASIC;
	srq = ib_create_srq(ndev->pd, &srq_attr);
	if (IS_ERR(srq)) {
		/*
		 * If SRQs aren't supported we just go ahead and use normal
		 * non-shared receive queues.
		 */
		pr_info("SRQ requested but not supported.\n");
		return 0;
	}

	ndev->srq_cmds = nvmet_rdma_alloc_cmds(ndev, srq_size, false);
	if (IS_ERR(ndev->srq_cmds)) {
		ret = PTR_ERR(ndev->srq_cmds);
		goto out_destroy_srq;
	}

	ndev->srq = srq;
	ndev->srq_size = srq_size;

	for (i = 0; i < srq_size; i++) {
		ret = nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]);
		if (ret)
			goto out_free_cmds;
	}

	return 0;

out_free_cmds:
	nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
out_destroy_srq:
	ib_destroy_srq(srq);
	return ret;
}

static void nvmet_rdma_free_dev(struct kref *ref)
{
	struct nvmet_rdma_device *ndev =
		container_of(ref, struct nvmet_rdma_device, ref);

	mutex_lock(&device_list_mutex);
	list_del(&ndev->entry);
	mutex_unlock(&device_list_mutex);

	nvmet_rdma_destroy_srq(ndev);
	ib_dealloc_pd(ndev->pd);

	kfree(ndev);
}

static struct nvmet_rdma_device *
nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id)
{
	struct nvmet_port *port = cm_id->context;
	struct nvmet_rdma_device *ndev;
	int inline_page_count;
	int inline_sge_count;
	int ret;

	mutex_lock(&device_list_mutex);
	list_for_each_entry(ndev, &device_list, entry) {
		if (ndev->device->node_guid == cm_id->device->node_guid &&
		    kref_get_unless_zero(&ndev->ref))
			goto out_unlock;
	}

	ndev = kzalloc(sizeof(*ndev), GFP_KERNEL);
	if (!ndev)
		goto out_err;

	inline_page_count = num_pages(port->inline_data_size);
	inline_sge_count = max(cm_id->device->attrs.max_sge_rd,
				cm_id->device->attrs.max_recv_sge) - 1;
	if (inline_page_count > inline_sge_count) {
		pr_warn("inline_data_size %d cannot be supported by device %s. Reducing to %lu.\n",
			port->inline_data_size, cm_id->device->name,
			inline_sge_count * PAGE_SIZE);
		port->inline_data_size = inline_sge_count * PAGE_SIZE;
		inline_page_count = inline_sge_count;
	}
	ndev->inline_data_size = port->inline_data_size;
	ndev->inline_page_count = inline_page_count;
	ndev->device = cm_id->device;
	kref_init(&ndev->ref);

	ndev->pd = ib_alloc_pd(ndev->device, 0);
	if (IS_ERR(ndev->pd))
		goto out_free_dev;

	if (nvmet_rdma_use_srq) {
		ret = nvmet_rdma_init_srq(ndev);
		if (ret)
			goto out_free_pd;
	}

	list_add(&ndev->entry, &device_list);
out_unlock:
	mutex_unlock(&device_list_mutex);
	pr_debug("added %s.\n", ndev->device->name);
	return ndev;

out_free_pd:
	ib_dealloc_pd(ndev->pd);
out_free_dev:
	kfree(ndev);
out_err:
	mutex_unlock(&device_list_mutex);
	return NULL;
}

static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
{
	struct ib_qp_init_attr qp_attr;
	struct nvmet_rdma_device *ndev = queue->dev;
	int comp_vector, nr_cqe, ret, i;

	/*
	 * Spread the io queues across completion vectors,
	 * but still keep all admin queues on vector 0.
	 */
	comp_vector = !queue->host_qid ? 0 :
		queue->idx % ndev->device->num_comp_vectors;

	/*
	 * Reserve CQ slots for RECV + RDMA_READ/RDMA_WRITE + RDMA_SEND.
	 */
	nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size;

	queue->cq = ib_alloc_cq(ndev->device, queue,
			nr_cqe + 1, comp_vector,
			IB_POLL_WORKQUEUE);
	if (IS_ERR(queue->cq)) {
		ret = PTR_ERR(queue->cq);
		pr_err("failed to create CQ cqe= %d ret= %d\n",
		       nr_cqe + 1, ret);
		goto out;
	}

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_context = queue;
	qp_attr.event_handler = nvmet_rdma_qp_event;
	qp_attr.send_cq = queue->cq;
	qp_attr.recv_cq = queue->cq;
	qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	qp_attr.qp_type = IB_QPT_RC;
	/* +1 for drain */
	qp_attr.cap.max_send_wr = queue->send_queue_size + 1;
	qp_attr.cap.max_rdma_ctxs = queue->send_queue_size;
	qp_attr.cap.max_send_sge = max(ndev->device->attrs.max_sge_rd,
					ndev->device->attrs.max_send_sge);

	if (ndev->srq) {
		qp_attr.srq = ndev->srq;
	} else {
		/* +1 for drain */
		qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size;
		qp_attr.cap.max_recv_sge = 1 + ndev->inline_page_count;
	}

	ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr);
	if (ret) {
		pr_err("failed to create_qp ret= %d\n", ret);
		goto err_destroy_cq;
	}

	atomic_set(&queue->sq_wr_avail, qp_attr.cap.max_send_wr);

	pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d cm_id= %p\n",
		 __func__, queue->cq->cqe, qp_attr.cap.max_send_sge,
		 qp_attr.cap.max_send_wr, queue->cm_id);

	if (!ndev->srq) {
		for (i = 0; i < queue->recv_queue_size; i++) {
			queue->cmds[i].queue = queue;
			ret = nvmet_rdma_post_recv(ndev, &queue->cmds[i]);
			if (ret)
				goto err_destroy_qp;
		}
	}

out:
	return ret;

err_destroy_qp:
	rdma_destroy_qp(queue->cm_id);
err_destroy_cq:
	ib_free_cq(queue->cq);
	goto out;
}

static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
{
	struct ib_qp *qp = queue->cm_id->qp;

	ib_drain_qp(qp);
	rdma_destroy_id(queue->cm_id);
	ib_destroy_qp(qp);
	ib_free_cq(queue->cq);
}

static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
{
	pr_debug("freeing queue %d\n", queue->idx);

	nvmet_sq_destroy(&queue->nvme_sq);

	nvmet_rdma_destroy_queue_ib(queue);
	if (!queue->dev->srq) {
		nvmet_rdma_free_cmds(queue->dev, queue->cmds,
				queue->recv_queue_size,
				!queue->host_qid);
	}
	nvmet_rdma_free_rsps(queue);
	ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
	kfree(queue);
}

static void nvmet_rdma_release_queue_work(struct work_struct *w)
{
	struct nvmet_rdma_queue *queue =
		container_of(w, struct nvmet_rdma_queue, release_work);
	struct nvmet_rdma_device *dev = queue->dev;

	nvmet_rdma_free_queue(queue);

	kref_put(&dev->ref, nvmet_rdma_free_dev);
}

static int
nvmet_rdma_parse_cm_connect_req(struct rdma_conn_param *conn,
				struct nvmet_rdma_queue *queue)
{
	struct nvme_rdma_cm_req *req;

	req = (struct nvme_rdma_cm_req *)conn->private_data;
	if (!req || conn->private_data_len == 0)
		return NVME_RDMA_CM_INVALID_LEN;

	if (le16_to_cpu(req->recfmt) != NVME_RDMA_CM_FMT_1_0)
		return NVME_RDMA_CM_INVALID_RECFMT;

	queue->host_qid = le16_to_cpu(req->qid);

	/*
	 * req->hsqsize corresponds to our recv queue size plus 1
	 * req->hrqsize corresponds to our send queue size
	 */
	queue->recv_queue_size = le16_to_cpu(req->hsqsize) + 1;
	queue->send_queue_size = le16_to_cpu(req->hrqsize);

	if (!queue->host_qid && queue->recv_queue_size > NVME_AQ_DEPTH)
		return NVME_RDMA_CM_INVALID_HSQSIZE;

	/* XXX: Should we enforce some kind of max for IO queues? */

	return 0;
}

static int nvmet_rdma_cm_reject(struct rdma_cm_id *cm_id,
				enum nvme_rdma_cm_status status)
{
	struct nvme_rdma_cm_rej rej;

	pr_debug("rejecting connect request: status %d (%s)\n",
		 status, nvme_rdma_cm_msg(status));

	rej.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
	rej.sts = cpu_to_le16(status);

	return rdma_reject(cm_id, (void *)&rej, sizeof(rej));
}

static struct nvmet_rdma_queue *
nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev,
		struct rdma_cm_id *cm_id,
		struct rdma_cm_event *event)
{
	struct nvmet_rdma_queue *queue;
	int ret;

	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
	if (!queue) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_reject;
	}

	ret = nvmet_sq_init(&queue->nvme_sq);
	if (ret) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_free_queue;
	}

	ret = nvmet_rdma_parse_cm_connect_req(&event->param.conn, queue);
	if (ret)
		goto out_destroy_sq;

	/*
	 * Schedules the actual release because calling rdma_destroy_id from
	 * inside a CM callback would trigger a deadlock. (great API design..)
	 */
	INIT_WORK(&queue->release_work, nvmet_rdma_release_queue_work);
	queue->dev = ndev;
	queue->cm_id = cm_id;

	spin_lock_init(&queue->state_lock);
	queue->state = NVMET_RDMA_Q_CONNECTING;
	INIT_LIST_HEAD(&queue->rsp_wait_list);
	INIT_LIST_HEAD(&queue->rsp_wr_wait_list);
	spin_lock_init(&queue->rsp_wr_wait_lock);
	INIT_LIST_HEAD(&queue->free_rsps);
	spin_lock_init(&queue->rsps_lock);
	INIT_LIST_HEAD(&queue->queue_list);

	queue->idx = ida_simple_get(&nvmet_rdma_queue_ida, 0, 0, GFP_KERNEL);
	if (queue->idx < 0) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_destroy_sq;
	}

	ret = nvmet_rdma_alloc_rsps(queue);
	if (ret) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_ida_remove;
	}

	if (!ndev->srq) {
		queue->cmds = nvmet_rdma_alloc_cmds(ndev,
				queue->recv_queue_size,
				!queue->host_qid);
		if (IS_ERR(queue->cmds)) {
			ret = NVME_RDMA_CM_NO_RSC;
			goto out_free_responses;
		}
	}

	ret = nvmet_rdma_create_queue_ib(queue);
	if (ret) {
		pr_err("%s: creating RDMA queue failed (%d).\n",
			__func__, ret);
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_free_cmds;
	}

	return queue;

out_free_cmds:
	if (!ndev->srq) {
		nvmet_rdma_free_cmds(queue->dev, queue->cmds,
				queue->recv_queue_size,
				!queue->host_qid);
	}
out_free_responses:
	nvmet_rdma_free_rsps(queue);
out_ida_remove:
	ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
out_destroy_sq:
	nvmet_sq_destroy(&queue->nvme_sq);
out_free_queue:
	kfree(queue);
out_reject:
	nvmet_rdma_cm_reject(cm_id, ret);
	return NULL;
}

static void nvmet_rdma_qp_event(struct ib_event *event, void *priv)
{
	struct nvmet_rdma_queue *queue = priv;

	switch (event->event) {
	case IB_EVENT_COMM_EST:
		rdma_notify(queue->cm_id, event->event);
		break;
	default:
		pr_err("received IB QP event: %s (%d)\n",
		       ib_event_msg(event->event), event->event);
		break;
	}
}

static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id,
		struct nvmet_rdma_queue *queue,
		struct rdma_conn_param *p)
{
	struct rdma_conn_param param = { };
	struct nvme_rdma_cm_rep priv = { };
	int ret = -ENOMEM;

	param.rnr_retry_count = 7;
	param.flow_control = 1;
	param.initiator_depth = min_t(u8, p->initiator_depth,
		queue->dev->device->attrs.max_qp_init_rd_atom);
	param.private_data = &priv;
	param.private_data_len = sizeof(priv);
	priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
	priv.crqsize = cpu_to_le16(queue->recv_queue_size);

	ret = rdma_accept(cm_id, &param);
	if (ret)
		pr_err("rdma_accept failed (error code = %d)\n", ret);

	return ret;
}

static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
		struct rdma_cm_event *event)
{
	struct nvmet_rdma_device *ndev;
	struct nvmet_rdma_queue *queue;
	int ret = -EINVAL;

	ndev = nvmet_rdma_find_get_device(cm_id);
	if (!ndev) {
		nvmet_rdma_cm_reject(cm_id, NVME_RDMA_CM_NO_RSC);
		return -ECONNREFUSED;
	}

	queue = nvmet_rdma_alloc_queue(ndev, cm_id, event);
	if (!queue) {
		ret = -ENOMEM;
		goto put_device;
	}
	queue->port = cm_id->context;

	if (queue->host_qid == 0) {
		/* Let inflight controller teardown complete */
		flush_scheduled_work();
	}

	ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn);
	if (ret) {
		schedule_work(&queue->release_work);
		/* Destroying rdma_cm id is not needed here */
		return 0;
	}

	mutex_lock(&nvmet_rdma_queue_mutex);
	list_add_tail(&queue->queue_list, &nvmet_rdma_queue_list);
	mutex_unlock(&nvmet_rdma_queue_mutex);

	return 0;

put_device:
	kref_put(&ndev->ref, nvmet_rdma_free_dev);

	return ret;
}

static void nvmet_rdma_queue_established(struct nvmet_rdma_queue *queue)
{
	unsigned long flags;

	spin_lock_irqsave(&queue->state_lock, flags);
	if (queue->state != NVMET_RDMA_Q_CONNECTING) {
		pr_warn("trying to establish a connected queue\n");
		goto out_unlock;
	}
	queue->state = NVMET_RDMA_Q_LIVE;

	while (!list_empty(&queue->rsp_wait_list)) {
		struct nvmet_rdma_rsp *cmd;

		cmd = list_first_entry(&queue->rsp_wait_list,
					struct nvmet_rdma_rsp, wait_list);
		list_del(&cmd->wait_list);

		spin_unlock_irqrestore(&queue->state_lock, flags);
		nvmet_rdma_handle_command(queue, cmd);
		spin_lock_irqsave(&queue->state_lock, flags);
	}

out_unlock:
	spin_unlock_irqrestore(&queue->state_lock, flags);
}

static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
{
	bool disconnect = false;
	unsigned long flags;

	pr_debug("cm_id= %p queue->state= %d\n", queue->cm_id, queue->state);

	spin_lock_irqsave(&queue->state_lock, flags);
	switch (queue->state) {
	case NVMET_RDMA_Q_CONNECTING:
	case NVMET_RDMA_Q_LIVE:
		queue->state = NVMET_RDMA_Q_DISCONNECTING;
		disconnect = true;
		break;
	case NVMET_RDMA_Q_DISCONNECTING:
		break;
	}
	spin_unlock_irqrestore(&queue->state_lock, flags);

	if (disconnect) {
		rdma_disconnect(queue->cm_id);
		schedule_work(&queue->release_work);
	}
}

static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
{
	bool disconnect = false;

	mutex_lock(&nvmet_rdma_queue_mutex);
	if (!list_empty(&queue->queue_list)) {
		list_del_init(&queue->queue_list);
		disconnect = true;
	}
	mutex_unlock(&nvmet_rdma_queue_mutex);

	if (disconnect)
		__nvmet_rdma_queue_disconnect(queue);
}

static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id,
		struct nvmet_rdma_queue *queue)
{
	WARN_ON_ONCE(queue->state != NVMET_RDMA_Q_CONNECTING);

	mutex_lock(&nvmet_rdma_queue_mutex);
	if (!list_empty(&queue->queue_list))
		list_del_init(&queue->queue_list);
	mutex_unlock(&nvmet_rdma_queue_mutex);

	pr_err("failed to connect queue %d\n", queue->idx);
	schedule_work(&queue->release_work);
}

/**
 * nvmet_rdma_device_removal() - Handle RDMA device removal
 * @cm_id:	rdma_cm id, used for nvmet port
 * @queue:	nvmet rdma queue (cm id qp_context)
 *
 * DEVICE_REMOVAL event notifies us that the RDMA device is about
 * to unplug. Note that this event can be generated on a normal
 * queue cm_id and/or a device bound listener cm_id (where in this
 * case queue will be null).
 *
 * We registered an ib_client to handle device removal for queues,
 * so we only need to handle the listening port cm_ids. In this case
 * we nullify the priv to prevent double cm_id destruction and destroying
 * the cm_id implicitly by returning a non-zero rc to the callout.
 */
static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id,
		struct nvmet_rdma_queue *queue)
{
	struct nvmet_port *port;

	if (queue) {
		/*
		 * This is a queue cm_id. we have registered
		 * an ib_client to handle queues removal
		 * so don't interfere and just return.
		 */
		return 0;
	}

	port = cm_id->context;

	/*
	 * This is a listener cm_id. Make sure that
	 * future remove_port won't invoke a double
	 * cm_id destroy. use atomic xchg to make sure
	 * we don't compete with remove_port.
	 */
	if (xchg(&port->priv, NULL) != cm_id)
		return 0;

	/*
	 * We need to return 1 so that the core will destroy
	 * its own ID. What a great API design..
	 */
	return 1;
}

static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id,
		struct rdma_cm_event *event)
{
	struct nvmet_rdma_queue *queue = NULL;
	int ret = 0;

	if (cm_id->qp)
		queue = cm_id->qp->qp_context;

	pr_debug("%s (%d): status %d id %p\n",
		rdma_event_msg(event->event), event->event,
		event->status, cm_id);

	switch (event->event) {
	case RDMA_CM_EVENT_CONNECT_REQUEST:
		ret = nvmet_rdma_queue_connect(cm_id, event);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		nvmet_rdma_queue_established(queue);
		break;
	case RDMA_CM_EVENT_ADDR_CHANGE:
	case RDMA_CM_EVENT_DISCONNECTED:
	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
		nvmet_rdma_queue_disconnect(queue);
		break;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		ret = nvmet_rdma_device_removal(cm_id, queue);
		break;
	case RDMA_CM_EVENT_REJECTED:
		pr_debug("Connection rejected: %s\n",
			 rdma_reject_msg(cm_id, event->status));
		/* FALLTHROUGH */
	case RDMA_CM_EVENT_UNREACHABLE:
	case RDMA_CM_EVENT_CONNECT_ERROR:
		nvmet_rdma_queue_connect_fail(cm_id, queue);
		break;
	default:
		pr_err("received unrecognized RDMA CM event %d\n",
			event->event);
		break;
	}

	return ret;
}

static void nvmet_rdma_delete_ctrl(struct nvmet_ctrl *ctrl)
{
	struct nvmet_rdma_queue *queue;

restart:
	mutex_lock(&nvmet_rdma_queue_mutex);
	list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) {
		if (queue->nvme_sq.ctrl == ctrl) {
			list_del_init(&queue->queue_list);
			mutex_unlock(&nvmet_rdma_queue_mutex);

			__nvmet_rdma_queue_disconnect(queue);
			goto restart;
		}
	}
	mutex_unlock(&nvmet_rdma_queue_mutex);
}

static int nvmet_rdma_add_port(struct nvmet_port *port)
{
	struct rdma_cm_id *cm_id;
	struct sockaddr_storage addr = { };
	__kernel_sa_family_t af;
	int ret;

	switch (port->disc_addr.adrfam) {
	case NVMF_ADDR_FAMILY_IP4:
		af = AF_INET;
		break;
	case NVMF_ADDR_FAMILY_IP6:
		af = AF_INET6;
		break;
	default:
		pr_err("address family %d not supported\n",
				port->disc_addr.adrfam);
		return -EINVAL;
	}

	if (port->inline_data_size < 0) {
		port->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE;
	} else if (port->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) {
		pr_warn("inline_data_size %u is too large, reducing to %u\n",
			port->inline_data_size,
			NVMET_RDMA_MAX_INLINE_DATA_SIZE);
		port->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE;
	}

	ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr,
			port->disc_addr.trsvcid, &addr);
	if (ret) {
		pr_err("malformed ip/port passed: %s:%s\n",
			port->disc_addr.traddr, port->disc_addr.trsvcid);
		return ret;
	}

	cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port,
			RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(cm_id)) {
		pr_err("CM ID creation failed\n");
		return PTR_ERR(cm_id);
	}

	/*
	 * Allow both IPv4 and IPv6 sockets to bind a single port
	 * at the same time.
	 */
	ret = rdma_set_afonly(cm_id, 1);
	if (ret) {
		pr_err("rdma_set_afonly failed (%d)\n", ret);
		goto out_destroy_id;
	}

	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr);
	if (ret) {
		pr_err("binding CM ID to %pISpcs failed (%d)\n",
			(struct sockaddr *)&addr, ret);
		goto out_destroy_id;
	}

	ret = rdma_listen(cm_id, 128);
	if (ret) {
		pr_err("listening to %pISpcs failed (%d)\n",
			(struct sockaddr *)&addr, ret);
		goto out_destroy_id;
	}

	pr_info("enabling port %d (%pISpcs)\n",
		le16_to_cpu(port->disc_addr.portid), (struct sockaddr *)&addr);
	port->priv = cm_id;
	return 0;

out_destroy_id:
	rdma_destroy_id(cm_id);
	return ret;
}

static void nvmet_rdma_remove_port(struct nvmet_port *port)
{
	struct rdma_cm_id *cm_id = xchg(&port->priv, NULL);

	if (cm_id)
		rdma_destroy_id(cm_id);
}

static void nvmet_rdma_disc_port_addr(struct nvmet_req *req,
		struct nvmet_port *port, char *traddr)
{
	struct rdma_cm_id *cm_id = port->priv;

	if (inet_addr_is_any((struct sockaddr *)&cm_id->route.addr.src_addr)) {
		struct nvmet_rdma_rsp *rsp =
			container_of(req, struct nvmet_rdma_rsp, req);
		struct rdma_cm_id *req_cm_id = rsp->queue->cm_id;
		struct sockaddr *addr = (void *)&req_cm_id->route.addr.src_addr;

		sprintf(traddr, "%pISc", addr);
	} else {
		memcpy(traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE);
	}
}

static const struct nvmet_fabrics_ops nvmet_rdma_ops = {
	.owner = THIS_MODULE,
	.type = NVMF_TRTYPE_RDMA,
	.msdbd = 1,
	.has_keyed_sgls = 1,
	.add_port = nvmet_rdma_add_port,
	.remove_port = nvmet_rdma_remove_port,
	.queue_response = nvmet_rdma_queue_response,
	.delete_ctrl = nvmet_rdma_delete_ctrl,
	.disc_traddr = nvmet_rdma_disc_port_addr,
};

static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data)
{
	struct nvmet_rdma_queue *queue, *tmp;
	struct nvmet_rdma_device *ndev;
	bool found = false;

	mutex_lock(&device_list_mutex);
	list_for_each_entry(ndev, &device_list, entry) {
		if (ndev->device == ib_device) {
			found = true;
			break;
		}
	}
	mutex_unlock(&device_list_mutex);

	if (!found)
		return;

	/*
	 * IB Device that is used by nvmet controllers is being removed,
	 * delete all queues using this device.
	 */
	mutex_lock(&nvmet_rdma_queue_mutex);
	list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list,
				 queue_list) {
		if (queue->dev->device != ib_device)
			continue;

		pr_info("Removing queue %d\n", queue->idx);
		list_del_init(&queue->queue_list);
		__nvmet_rdma_queue_disconnect(queue);
	}
	mutex_unlock(&nvmet_rdma_queue_mutex);

	flush_scheduled_work();
}

static struct ib_client nvmet_rdma_ib_client = {
	.name = "nvmet_rdma",
	.remove = nvmet_rdma_remove_one
};

static int __init nvmet_rdma_init(void)
{
	int ret;

	ret = ib_register_client(&nvmet_rdma_ib_client);
	if (ret)
		return ret;

	ret = nvmet_register_transport(&nvmet_rdma_ops);
	if (ret)
		goto err_ib_client;

	return 0;

err_ib_client:
	ib_unregister_client(&nvmet_rdma_ib_client);
	return ret;
}

static void __exit nvmet_rdma_exit(void)
{
	nvmet_unregister_transport(&nvmet_rdma_ops);
	ib_unregister_client(&nvmet_rdma_ib_client);
	WARN_ON_ONCE(!list_empty(&nvmet_rdma_queue_list));
	ida_destroy(&nvmet_rdma_queue_ida);
}

module_init(nvmet_rdma_init);
module_exit(nvmet_rdma_exit);

MODULE_LICENSE("GPL v2");
MODULE_ALIAS("nvmet-transport-1"); /* 1 == NVMF_TRTYPE_RDMA */