/*
 * NVMe over Fabrics RDMA target.
 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */
#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/delay.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/nvme.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/wait.h>
#include <linux/inet.h>
#include <asm/unaligned.h>

#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
#include <rdma/rw.h>

#include <linux/nvme-rdma.h>
#include "nvmet.h"

/*
 * We allow up to a page of inline data to go with the SQE
 */
#define NVMET_RDMA_INLINE_DATA_SIZE	PAGE_SIZE

struct nvmet_rdma_cmd {
	struct ib_sge sge[2];
	struct ib_cqe cqe;
	struct ib_recv_wr wr;
	struct scatterlist inline_sg;
	struct page *inline_page;
	struct nvme_command *nvme_cmd;
	struct nvmet_rdma_queue *queue;
};

enum {
	NVMET_RDMA_REQ_INLINE_DATA	= (1 << 0),
	NVMET_RDMA_REQ_INVALIDATE_RKEY	= (1 << 1),
};

struct nvmet_rdma_rsp {
	struct ib_sge send_sge;
	struct ib_cqe send_cqe;
	struct ib_send_wr send_wr;

	struct nvmet_rdma_cmd *cmd;
	struct nvmet_rdma_queue *queue;

	struct ib_cqe read_cqe;
	struct rdma_rw_ctx rw;

	struct nvmet_req req;

	u8 n_rdma;
	u32 flags;
	u32 invalidate_rkey;

	struct list_head wait_list;
	struct list_head free_list;
};

enum nvmet_rdma_queue_state {
	NVMET_RDMA_Q_CONNECTING,
	NVMET_RDMA_Q_LIVE,
	NVMET_RDMA_Q_DISCONNECTING,
};

struct nvmet_rdma_queue {
	struct rdma_cm_id *cm_id;
	struct nvmet_port *port;
	struct ib_cq *cq;
	atomic_t sq_wr_avail;
	struct nvmet_rdma_device *dev;
	spinlock_t state_lock;
	enum nvmet_rdma_queue_state state;
	struct nvmet_cq nvme_cq;
	struct nvmet_sq nvme_sq;

	struct nvmet_rdma_rsp *rsps;
	struct list_head free_rsps;
	spinlock_t rsps_lock;
	struct nvmet_rdma_cmd *cmds;

	struct work_struct release_work;
	struct list_head rsp_wait_list;
	struct list_head rsp_wr_wait_list;
	spinlock_t rsp_wr_wait_lock;

	int idx;
	int host_qid;
	int recv_queue_size;
	int send_queue_size;

	struct list_head queue_list;
};

struct nvmet_rdma_device {
	struct ib_device *device;
	struct ib_pd *pd;
	struct ib_srq *srq;
	struct nvmet_rdma_cmd *srq_cmds;
	size_t srq_size;
	struct kref ref;
	struct list_head entry;
};

static bool nvmet_rdma_use_srq;
module_param_named(use_srq, nvmet_rdma_use_srq, bool, 0444);
MODULE_PARM_DESC(use_srq, "Use shared receive queue.");

static DEFINE_IDA(nvmet_rdma_queue_ida);
static LIST_HEAD(nvmet_rdma_queue_list);
static DEFINE_MUTEX(nvmet_rdma_queue_mutex);

static LIST_HEAD(device_list);
static DEFINE_MUTEX(device_list_mutex);

static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp);
static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_qp_event(struct ib_event *event, void *priv);
static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue);

static const struct nvmet_fabrics_ops nvmet_rdma_ops;

/* XXX: really should move to a generic header sooner or later.. */
static inline u32 get_unaligned_le24(const u8 *p)
{
	return (u32)p[0] | (u32)p[1] << 8 | (u32)p[2] << 16;
}

static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp)
{
	return nvme_is_write(rsp->req.cmd) &&
		rsp->req.transfer_len &&
		!(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
}

static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp)
{
	return !nvme_is_write(rsp->req.cmd) &&
		rsp->req.transfer_len &&
		!rsp->req.rsp->status &&
		!(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
}

static inline struct nvmet_rdma_rsp *
nvmet_rdma_get_rsp(struct nvmet_rdma_queue *queue)
{
	struct nvmet_rdma_rsp *rsp;
	unsigned long flags;

	spin_lock_irqsave(&queue->rsps_lock, flags);
	rsp = list_first_entry(&queue->free_rsps,
			struct nvmet_rdma_rsp, free_list);
	list_del(&rsp->free_list);
	spin_unlock_irqrestore(&queue->rsps_lock, flags);

	return rsp;
}

static inline void
nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
{
	unsigned long flags;

	spin_lock_irqsave(&rsp->queue->rsps_lock, flags);
	list_add_tail(&rsp->free_list, &rsp->queue->free_rsps);
	spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags);
}

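/*
 * Each command buffer posted for RDMA RECV carries up to two SGEs:
 * sge[0] always maps the 64-byte NVMe command capsule, and, for I/O
 * queues only, sge[1] maps a page that can hold in-capsule (inline)
 * write data.  Admin queue commands never carry inline data here.
 */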
static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_cmd *c, bool admin)
{
	/* NVMe command / RDMA RECV */
	c->nvme_cmd = kmalloc(sizeof(*c->nvme_cmd), GFP_KERNEL);
	if (!c->nvme_cmd)
		goto out;

	c->sge[0].addr = ib_dma_map_single(ndev->device, c->nvme_cmd,
			sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
	if (ib_dma_mapping_error(ndev->device, c->sge[0].addr))
		goto out_free_cmd;

	c->sge[0].length = sizeof(*c->nvme_cmd);
	c->sge[0].lkey = ndev->pd->local_dma_lkey;

	if (!admin) {
		c->inline_page = alloc_pages(GFP_KERNEL,
				get_order(NVMET_RDMA_INLINE_DATA_SIZE));
		if (!c->inline_page)
			goto out_unmap_cmd;
		c->sge[1].addr = ib_dma_map_page(ndev->device,
				c->inline_page, 0, NVMET_RDMA_INLINE_DATA_SIZE,
				DMA_FROM_DEVICE);
		if (ib_dma_mapping_error(ndev->device, c->sge[1].addr))
			goto out_free_inline_page;
		c->sge[1].length = NVMET_RDMA_INLINE_DATA_SIZE;
		c->sge[1].lkey = ndev->pd->local_dma_lkey;
	}

	c->cqe.done = nvmet_rdma_recv_done;

	c->wr.wr_cqe = &c->cqe;
	c->wr.sg_list = c->sge;
	c->wr.num_sge = admin ? 1 : 2;

	return 0;

out_free_inline_page:
	if (!admin) {
		__free_pages(c->inline_page,
				get_order(NVMET_RDMA_INLINE_DATA_SIZE));
	}
out_unmap_cmd:
	ib_dma_unmap_single(ndev->device, c->sge[0].addr,
			sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
out_free_cmd:
	kfree(c->nvme_cmd);

out:
	return -ENOMEM;
}

static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_cmd *c, bool admin)
{
	if (!admin) {
		ib_dma_unmap_page(ndev->device, c->sge[1].addr,
				NVMET_RDMA_INLINE_DATA_SIZE, DMA_FROM_DEVICE);
		__free_pages(c->inline_page,
				get_order(NVMET_RDMA_INLINE_DATA_SIZE));
	}
	ib_dma_unmap_single(ndev->device, c->sge[0].addr,
				sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
	kfree(c->nvme_cmd);
}

static struct nvmet_rdma_cmd *
nvmet_rdma_alloc_cmds(struct nvmet_rdma_device *ndev,
		int nr_cmds, bool admin)
{
	struct nvmet_rdma_cmd *cmds;
	int ret = -EINVAL, i;

	cmds = kcalloc(nr_cmds, sizeof(struct nvmet_rdma_cmd), GFP_KERNEL);
	if (!cmds)
		goto out;

	for (i = 0; i < nr_cmds; i++) {
		ret = nvmet_rdma_alloc_cmd(ndev, cmds + i, admin);
		if (ret)
			goto out_free;
	}

	return cmds;

out_free:
	while (--i >= 0)
		nvmet_rdma_free_cmd(ndev, cmds + i, admin);
	kfree(cmds);
out:
	return ERR_PTR(ret);
}

static void nvmet_rdma_free_cmds(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_cmd *cmds, int nr_cmds, bool admin)
{
	int i;

	for (i = 0; i < nr_cmds; i++)
		nvmet_rdma_free_cmd(ndev, cmds + i, admin);
	kfree(cmds);
}

static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_rsp *r)
{
	/* NVMe CQE / RDMA SEND */
	r->req.rsp = kmalloc(sizeof(*r->req.rsp), GFP_KERNEL);
	if (!r->req.rsp)
		goto out;

	r->send_sge.addr = ib_dma_map_single(ndev->device, r->req.rsp,
			sizeof(*r->req.rsp), DMA_TO_DEVICE);
	if (ib_dma_mapping_error(ndev->device, r->send_sge.addr))
		goto out_free_rsp;

	r->send_sge.length = sizeof(*r->req.rsp);
	r->send_sge.lkey = ndev->pd->local_dma_lkey;

	r->send_cqe.done = nvmet_rdma_send_done;

	r->send_wr.wr_cqe = &r->send_cqe;
	r->send_wr.sg_list = &r->send_sge;
	r->send_wr.num_sge = 1;
	r->send_wr.send_flags = IB_SEND_SIGNALED;

	/* Data In / RDMA READ */
	r->read_cqe.done = nvmet_rdma_read_data_done;
	return 0;

out_free_rsp:
	kfree(r->req.rsp);
out:
	return -ENOMEM;
}

static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_rsp *r)
{
	ib_dma_unmap_single(ndev->device, r->send_sge.addr,
				sizeof(*r->req.rsp), DMA_TO_DEVICE);
	kfree(r->req.rsp);
}

static int
nvmet_rdma_alloc_rsps(struct nvmet_rdma_queue *queue)
{
	struct nvmet_rdma_device *ndev = queue->dev;
	int nr_rsps = queue->recv_queue_size * 2;
	int ret = -EINVAL, i;

	queue->rsps = kcalloc(nr_rsps, sizeof(struct nvmet_rdma_rsp),
			GFP_KERNEL);
	if (!queue->rsps)
		goto out;

	for (i = 0; i < nr_rsps; i++) {
		struct nvmet_rdma_rsp *rsp = &queue->rsps[i];

		ret = nvmet_rdma_alloc_rsp(ndev, rsp);
		if (ret)
			goto out_free;

		list_add_tail(&rsp->free_list, &queue->free_rsps);
	}

	return 0;

out_free:
	while (--i >= 0) {
		struct nvmet_rdma_rsp *rsp = &queue->rsps[i];

		list_del(&rsp->free_list);
		nvmet_rdma_free_rsp(ndev, rsp);
	}
	kfree(queue->rsps);
out:
	return ret;
}

static void nvmet_rdma_free_rsps(struct nvmet_rdma_queue *queue)
{
	struct nvmet_rdma_device *ndev = queue->dev;
	int i, nr_rsps = queue->recv_queue_size * 2;

	for (i = 0; i < nr_rsps; i++) {
		struct nvmet_rdma_rsp *rsp = &queue->rsps[i];

		list_del(&rsp->free_list);
		nvmet_rdma_free_rsp(ndev, rsp);
	}
	kfree(queue->rsps);
}

static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_cmd *cmd)
{
	struct ib_recv_wr *bad_wr;

	ib_dma_sync_single_for_device(ndev->device,
		cmd->sge[0].addr, cmd->sge[0].length,
		DMA_FROM_DEVICE);

	if (ndev->srq)
		return ib_post_srq_recv(ndev->srq, &cmd->wr, &bad_wr);
	return ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, &bad_wr);
}

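/*
 * Commands that could not be started because the send queue had no
 * free work requests are parked on rsp_wr_wait_list.  Once a response
 * completes and returns its work requests, retry them in order until
 * one fails to get resources again.
 */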
static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue)
{
	spin_lock(&queue->rsp_wr_wait_lock);
	while (!list_empty(&queue->rsp_wr_wait_list)) {
		struct nvmet_rdma_rsp *rsp;
		bool ret;

		rsp = list_entry(queue->rsp_wr_wait_list.next,
				struct nvmet_rdma_rsp, wait_list);
		list_del(&rsp->wait_list);

		spin_unlock(&queue->rsp_wr_wait_lock);
		ret = nvmet_rdma_execute_command(rsp);
		spin_lock(&queue->rsp_wr_wait_lock);

		if (!ret) {
			list_add(&rsp->wait_list, &queue->rsp_wr_wait_list);
			break;
		}
	}
	spin_unlock(&queue->rsp_wr_wait_lock);
}

static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
{
	struct nvmet_rdma_queue *queue = rsp->queue;

	atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);

	if (rsp->n_rdma) {
		rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
				queue->cm_id->port_num, rsp->req.sg,
				rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
	}

	if (rsp->req.sg != &rsp->cmd->inline_sg)
		sgl_free(rsp->req.sg);

	if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list)))
		nvmet_rdma_process_wr_wait_list(queue);

	nvmet_rdma_put_rsp(rsp);
}

static void nvmet_rdma_error_comp(struct nvmet_rdma_queue *queue)
{
	if (queue->nvme_sq.ctrl) {
		nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
	} else {
		/*
		 * We didn't set up the controller yet in case of an
		 * admin connect error, so just disconnect and clean up
		 * the queue.
		 */
		nvmet_rdma_queue_disconnect(queue);
	}
}

static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct nvmet_rdma_rsp *rsp =
		container_of(wc->wr_cqe, struct nvmet_rdma_rsp, send_cqe);

	nvmet_rdma_release_rsp(rsp);

	if (unlikely(wc->status != IB_WC_SUCCESS &&
		     wc->status != IB_WC_WR_FLUSH_ERR)) {
		pr_err("SEND for CQE 0x%p failed with status %s (%d).\n",
			wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
		nvmet_rdma_error_comp(rsp->queue);
	}
}

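/*
 * Completion path: if the command has data to return to the host, the
 * RDMA WRITE work requests built by the rdma_rw API are chained in
 * front of the SEND that carries the NVMe completion, so a single
 * ib_post_send() pushes both.  The receive buffer is re-posted before
 * sending the response so the host can immediately submit a new command.
 */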
static void nvmet_rdma_queue_response(struct nvmet_req *req)
{
	struct nvmet_rdma_rsp *rsp =
		container_of(req, struct nvmet_rdma_rsp, req);
	struct rdma_cm_id *cm_id = rsp->queue->cm_id;
	struct ib_send_wr *first_wr, *bad_wr;

	if (rsp->flags & NVMET_RDMA_REQ_INVALIDATE_RKEY) {
		rsp->send_wr.opcode = IB_WR_SEND_WITH_INV;
		rsp->send_wr.ex.invalidate_rkey = rsp->invalidate_rkey;
	} else {
		rsp->send_wr.opcode = IB_WR_SEND;
	}

	if (nvmet_rdma_need_data_out(rsp))
		first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp,
				cm_id->port_num, NULL, &rsp->send_wr);
	else
		first_wr = &rsp->send_wr;

	nvmet_rdma_post_recv(rsp->queue->dev, rsp->cmd);

	ib_dma_sync_single_for_device(rsp->queue->dev->device,
		rsp->send_sge.addr, rsp->send_sge.length,
		DMA_TO_DEVICE);

	if (ib_post_send(cm_id->qp, first_wr, &bad_wr)) {
		pr_err("sending cmd response failed\n");
		nvmet_rdma_release_rsp(rsp);
	}
}

static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct nvmet_rdma_rsp *rsp =
		container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe);
	struct nvmet_rdma_queue *queue = cq->cq_context;

	WARN_ON(rsp->n_rdma <= 0);
	atomic_add(rsp->n_rdma, &queue->sq_wr_avail);
	rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
			queue->cm_id->port_num, rsp->req.sg,
			rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
	rsp->n_rdma = 0;

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		nvmet_req_uninit(&rsp->req);
		nvmet_rdma_release_rsp(rsp);
		if (wc->status != IB_WC_WR_FLUSH_ERR) {
			pr_info("RDMA READ for CQE 0x%p failed with status %s (%d).\n",
				wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
			nvmet_rdma_error_comp(queue);
		}
		return;
	}

	nvmet_req_execute(&rsp->req);
}

static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len,
		u64 off)
{
	sg_init_table(&rsp->cmd->inline_sg, 1);
	sg_set_page(&rsp->cmd->inline_sg, rsp->cmd->inline_page, len, off);
	rsp->req.sg = &rsp->cmd->inline_sg;
	rsp->req.sg_cnt = 1;
}

static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp)
{
	struct nvme_sgl_desc *sgl = &rsp->req.cmd->common.dptr.sgl;
	u64 off = le64_to_cpu(sgl->addr);
	u32 len = le32_to_cpu(sgl->length);

	if (!nvme_is_write(rsp->req.cmd))
		return NVME_SC_INVALID_FIELD | NVME_SC_DNR;

	if (off + len > NVMET_RDMA_INLINE_DATA_SIZE) {
		pr_err("invalid inline data offset!\n");
		return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
	}

	/* no data command? */
	if (!len)
		return 0;

	nvmet_rdma_use_inline_sg(rsp, len, off);
	rsp->flags |= NVMET_RDMA_REQ_INLINE_DATA;
	rsp->req.transfer_len += len;
	return 0;
}

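/*
 * Keyed SGL: the host passed an (address, length, rkey) triple that
 * describes a buffer in host memory.  Allocate a local scatterlist and
 * set up an rdma_rw context for the RDMA READ (host-to-controller) or
 * WRITE (controller-to-host) transfer; remember the rkey if the host
 * asked for remote invalidation with the response.
 */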
static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
		struct nvme_keyed_sgl_desc *sgl, bool invalidate)
{
	struct rdma_cm_id *cm_id = rsp->queue->cm_id;
	u64 addr = le64_to_cpu(sgl->addr);
	u32 len = get_unaligned_le24(sgl->length);
	u32 key = get_unaligned_le32(sgl->key);
	int ret;

	/* no data command? */
	if (!len)
		return 0;

	rsp->req.sg = sgl_alloc(len, GFP_KERNEL, &rsp->req.sg_cnt);
	if (!rsp->req.sg)
		return NVME_SC_INTERNAL;

	ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num,
			rsp->req.sg, rsp->req.sg_cnt, 0, addr, key,
			nvmet_data_dir(&rsp->req));
	if (ret < 0)
		return NVME_SC_INTERNAL;
	rsp->req.transfer_len += len;
	rsp->n_rdma += ret;

	if (invalidate) {
		rsp->invalidate_rkey = key;
		rsp->flags |= NVMET_RDMA_REQ_INVALIDATE_RKEY;
	}

	return 0;
}

static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp)
{
	struct nvme_keyed_sgl_desc *sgl = &rsp->req.cmd->common.dptr.ksgl;

	switch (sgl->type >> 4) {
	case NVME_SGL_FMT_DATA_DESC:
		switch (sgl->type & 0xf) {
		case NVME_SGL_FMT_OFFSET:
			return nvmet_rdma_map_sgl_inline(rsp);
		default:
			pr_err("invalid SGL subtype: %#x\n", sgl->type);
			return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
		}
	case NVME_KEY_SGL_FMT_DATA_DESC:
		switch (sgl->type & 0xf) {
		case NVME_SGL_FMT_ADDRESS | NVME_SGL_FMT_INVALIDATE:
			return nvmet_rdma_map_sgl_keyed(rsp, sgl, true);
		case NVME_SGL_FMT_ADDRESS:
			return nvmet_rdma_map_sgl_keyed(rsp, sgl, false);
		default:
			pr_err("invalid SGL subtype: %#x\n", sgl->type);
			return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
		}
	default:
		pr_err("invalid SGL type: %#x\n", sgl->type);
		return NVME_SC_SGL_INVALID_TYPE | NVME_SC_DNR;
	}
}

static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp)
{
	struct nvmet_rdma_queue *queue = rsp->queue;

	if (unlikely(atomic_sub_return(1 + rsp->n_rdma,
			&queue->sq_wr_avail) < 0)) {
		pr_debug("IB send queue full (needed %d): queue %u cntlid %u\n",
				1 + rsp->n_rdma, queue->idx,
				queue->nvme_sq.ctrl->cntlid);
		atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);
		return false;
	}

	if (nvmet_rdma_need_data_in(rsp)) {
		if (rdma_rw_ctx_post(&rsp->rw, queue->cm_id->qp,
				queue->cm_id->port_num, &rsp->read_cqe, NULL))
			nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR);
	} else {
		nvmet_req_execute(&rsp->req);
	}

	return true;
}

static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue,
		struct nvmet_rdma_rsp *cmd)
{
	u16 status;

	ib_dma_sync_single_for_cpu(queue->dev->device,
		cmd->cmd->sge[0].addr, cmd->cmd->sge[0].length,
		DMA_FROM_DEVICE);
	ib_dma_sync_single_for_cpu(queue->dev->device,
		cmd->send_sge.addr, cmd->send_sge.length,
		DMA_TO_DEVICE);

	if (!nvmet_req_init(&cmd->req, &queue->nvme_cq,
			&queue->nvme_sq, &nvmet_rdma_ops))
		return;

	status = nvmet_rdma_map_sgl(cmd);
	if (status)
		goto out_err;

	if (unlikely(!nvmet_rdma_execute_command(cmd))) {
		spin_lock(&queue->rsp_wr_wait_lock);
		list_add_tail(&cmd->wait_list, &queue->rsp_wr_wait_list);
		spin_unlock(&queue->rsp_wr_wait_lock);
	}

	return;

out_err:
	nvmet_req_complete(&cmd->req, status);
}

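/*
 * RECV completion: pair the incoming command capsule with a free
 * response context.  While the queue is still CONNECTING, commands are
 * parked on rsp_wait_list and replayed once the RDMA CM ESTABLISHED
 * event moves the queue to LIVE.
 */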
static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct nvmet_rdma_cmd *cmd =
		container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe);
	struct nvmet_rdma_queue *queue = cq->cq_context;
	struct nvmet_rdma_rsp *rsp;

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		if (wc->status != IB_WC_WR_FLUSH_ERR) {
			pr_err("RECV for CQE 0x%p failed with status %s (%d)\n",
				wc->wr_cqe, ib_wc_status_msg(wc->status),
				wc->status);
			nvmet_rdma_error_comp(queue);
		}
		return;
	}

	if (unlikely(wc->byte_len < sizeof(struct nvme_command))) {
		pr_err("Ctrl Fatal Error: capsule size less than 64 bytes\n");
		nvmet_rdma_error_comp(queue);
		return;
	}

	cmd->queue = queue;
	rsp = nvmet_rdma_get_rsp(queue);
	rsp->queue = queue;
	rsp->cmd = cmd;
	rsp->flags = 0;
	rsp->req.cmd = cmd->nvme_cmd;
	rsp->req.port = queue->port;
	rsp->n_rdma = 0;

	if (unlikely(queue->state != NVMET_RDMA_Q_LIVE)) {
		unsigned long flags;

		spin_lock_irqsave(&queue->state_lock, flags);
		if (queue->state == NVMET_RDMA_Q_CONNECTING)
			list_add_tail(&rsp->wait_list, &queue->rsp_wait_list);
		else
			nvmet_rdma_put_rsp(rsp);
		spin_unlock_irqrestore(&queue->state_lock, flags);
		return;
	}

	nvmet_rdma_handle_command(queue, rsp);
}

static void nvmet_rdma_destroy_srq(struct nvmet_rdma_device *ndev)
{
	if (!ndev->srq)
		return;

	nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
	ib_destroy_srq(ndev->srq);
}

static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev)
{
	struct ib_srq_init_attr srq_attr = { NULL, };
	struct ib_srq *srq;
	size_t srq_size;
	int ret, i;

	srq_size = 4095;	/* XXX: tune */

	srq_attr.attr.max_wr = srq_size;
	srq_attr.attr.max_sge = 2;
	srq_attr.attr.srq_limit = 0;
	srq_attr.srq_type = IB_SRQT_BASIC;
	srq = ib_create_srq(ndev->pd, &srq_attr);
	if (IS_ERR(srq)) {
		/*
		 * If SRQs aren't supported we just go ahead and use normal
		 * non-shared receive queues.
		 */
		pr_info("SRQ requested but not supported.\n");
		return 0;
	}

	ndev->srq_cmds = nvmet_rdma_alloc_cmds(ndev, srq_size, false);
	if (IS_ERR(ndev->srq_cmds)) {
		ret = PTR_ERR(ndev->srq_cmds);
		goto out_destroy_srq;
	}

	ndev->srq = srq;
	ndev->srq_size = srq_size;

	for (i = 0; i < srq_size; i++)
		nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]);

	return 0;

out_destroy_srq:
	ib_destroy_srq(srq);
	return ret;
}

static void nvmet_rdma_free_dev(struct kref *ref)
{
	struct nvmet_rdma_device *ndev =
		container_of(ref, struct nvmet_rdma_device, ref);

	mutex_lock(&device_list_mutex);
	list_del(&ndev->entry);
	mutex_unlock(&device_list_mutex);

	nvmet_rdma_destroy_srq(ndev);
	ib_dealloc_pd(ndev->pd);

	kfree(ndev);
}

static struct nvmet_rdma_device *
nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id)
{
	struct nvmet_rdma_device *ndev;
	int ret;

	mutex_lock(&device_list_mutex);
	list_for_each_entry(ndev, &device_list, entry) {
		if (ndev->device->node_guid == cm_id->device->node_guid &&
		    kref_get_unless_zero(&ndev->ref))
			goto out_unlock;
	}

	ndev = kzalloc(sizeof(*ndev), GFP_KERNEL);
	if (!ndev)
		goto out_err;

	ndev->device = cm_id->device;
	kref_init(&ndev->ref);

	ndev->pd = ib_alloc_pd(ndev->device, 0);
	if (IS_ERR(ndev->pd))
		goto out_free_dev;

	if (nvmet_rdma_use_srq) {
		ret = nvmet_rdma_init_srq(ndev);
		if (ret)
			goto out_free_pd;
	}

	list_add(&ndev->entry, &device_list);
out_unlock:
	mutex_unlock(&device_list_mutex);
	pr_debug("added %s.\n", ndev->device->name);
	return ndev;

out_free_pd:
	ib_dealloc_pd(ndev->pd);
out_free_dev:
	kfree(ndev);
out_err:
	mutex_unlock(&device_list_mutex);
	return NULL;
}

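/*
 * Create the completion queue and RC queue pair for a queue.  The CQ is
 * sized for RECV, RDMA READ/WRITE and SEND completions and uses
 * IB_POLL_WORKQUEUE, so completion handlers run in workqueue context
 * rather than from the interrupt path.
 */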
static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
{
	struct ib_qp_init_attr qp_attr;
	struct nvmet_rdma_device *ndev = queue->dev;
	int comp_vector, nr_cqe, ret, i;

	/*
	 * Spread the io queues across completion vectors,
	 * but still keep all admin queues on vector 0.
	 */
	comp_vector = !queue->host_qid ? 0 :
		queue->idx % ndev->device->num_comp_vectors;

	/*
	 * Reserve CQ slots for RECV + RDMA_READ/RDMA_WRITE + RDMA_SEND.
	 */
	nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size;

	queue->cq = ib_alloc_cq(ndev->device, queue,
			nr_cqe + 1, comp_vector,
			IB_POLL_WORKQUEUE);
	if (IS_ERR(queue->cq)) {
		ret = PTR_ERR(queue->cq);
		pr_err("failed to create CQ cqe= %d ret= %d\n",
		       nr_cqe + 1, ret);
		goto out;
	}

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_context = queue;
	qp_attr.event_handler = nvmet_rdma_qp_event;
	qp_attr.send_cq = queue->cq;
	qp_attr.recv_cq = queue->cq;
	qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	qp_attr.qp_type = IB_QPT_RC;
	/* +1 for drain */
	qp_attr.cap.max_send_wr = queue->send_queue_size + 1;
	qp_attr.cap.max_rdma_ctxs = queue->send_queue_size;
	qp_attr.cap.max_send_sge = max(ndev->device->attrs.max_sge_rd,
					ndev->device->attrs.max_sge);

	if (ndev->srq) {
		qp_attr.srq = ndev->srq;
	} else {
		/* +1 for drain */
		qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size;
		qp_attr.cap.max_recv_sge = 2;
	}

	ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr);
	if (ret) {
		pr_err("failed to create_qp ret= %d\n", ret);
		goto err_destroy_cq;
	}

	atomic_set(&queue->sq_wr_avail, qp_attr.cap.max_send_wr);

	pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d cm_id= %p\n",
		 __func__, queue->cq->cqe, qp_attr.cap.max_send_sge,
		 qp_attr.cap.max_send_wr, queue->cm_id);

	if (!ndev->srq) {
		for (i = 0; i < queue->recv_queue_size; i++) {
			queue->cmds[i].queue = queue;
			nvmet_rdma_post_recv(ndev, &queue->cmds[i]);
		}
	}

out:
	return ret;

err_destroy_cq:
	ib_free_cq(queue->cq);
	goto out;
}

static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
{
	struct ib_qp *qp = queue->cm_id->qp;

	ib_drain_qp(qp);
	rdma_destroy_id(queue->cm_id);
	ib_destroy_qp(qp);
	ib_free_cq(queue->cq);
}

static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
{
	pr_debug("freeing queue %d\n", queue->idx);

	nvmet_sq_destroy(&queue->nvme_sq);

	nvmet_rdma_destroy_queue_ib(queue);
	if (!queue->dev->srq) {
		nvmet_rdma_free_cmds(queue->dev, queue->cmds,
				queue->recv_queue_size,
				!queue->host_qid);
	}
	nvmet_rdma_free_rsps(queue);
	ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
	kfree(queue);
}

static void nvmet_rdma_release_queue_work(struct work_struct *w)
{
	struct nvmet_rdma_queue *queue =
		container_of(w, struct nvmet_rdma_queue, release_work);
	struct nvmet_rdma_device *dev = queue->dev;

	nvmet_rdma_free_queue(queue);

	kref_put(&dev->ref, nvmet_rdma_free_dev);
}

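/*
 * The NVMe/RDMA connect private data carries the record format, the
 * host-chosen queue ID and the host's submission/receive queue sizes,
 * which determine how many commands and responses this queue needs.
 */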
static int
nvmet_rdma_parse_cm_connect_req(struct rdma_conn_param *conn,
				struct nvmet_rdma_queue *queue)
{
	struct nvme_rdma_cm_req *req;

	req = (struct nvme_rdma_cm_req *)conn->private_data;
	if (!req || conn->private_data_len == 0)
		return NVME_RDMA_CM_INVALID_LEN;

	if (le16_to_cpu(req->recfmt) != NVME_RDMA_CM_FMT_1_0)
		return NVME_RDMA_CM_INVALID_RECFMT;

	queue->host_qid = le16_to_cpu(req->qid);

	/*
	 * req->hsqsize corresponds to our recv queue size plus 1
	 * req->hrqsize corresponds to our send queue size
	 */
	queue->recv_queue_size = le16_to_cpu(req->hsqsize) + 1;
	queue->send_queue_size = le16_to_cpu(req->hrqsize);

	if (!queue->host_qid && queue->recv_queue_size > NVME_AQ_DEPTH)
		return NVME_RDMA_CM_INVALID_HSQSIZE;

	/* XXX: Should we enforce some kind of max for IO queues? */

	return 0;
}

static int nvmet_rdma_cm_reject(struct rdma_cm_id *cm_id,
				enum nvme_rdma_cm_status status)
{
	struct nvme_rdma_cm_rej rej;

	pr_debug("rejecting connect request: status %d (%s)\n",
		 status, nvme_rdma_cm_msg(status));

	rej.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
	rej.sts = cpu_to_le16(status);

	return rdma_reject(cm_id, (void *)&rej, sizeof(rej));
}

static struct nvmet_rdma_queue *
nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev,
		struct rdma_cm_id *cm_id,
		struct rdma_cm_event *event)
{
	struct nvmet_rdma_queue *queue;
	int ret;

	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
	if (!queue) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_reject;
	}

	ret = nvmet_sq_init(&queue->nvme_sq);
	if (ret) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_free_queue;
	}

	ret = nvmet_rdma_parse_cm_connect_req(&event->param.conn, queue);
	if (ret)
		goto out_destroy_sq;

	/*
	 * Schedules the actual release because calling rdma_destroy_id from
	 * inside a CM callback would trigger a deadlock. (great API design..)
	 */
	INIT_WORK(&queue->release_work, nvmet_rdma_release_queue_work);
	queue->dev = ndev;
	queue->cm_id = cm_id;

	spin_lock_init(&queue->state_lock);
	queue->state = NVMET_RDMA_Q_CONNECTING;
	INIT_LIST_HEAD(&queue->rsp_wait_list);
	INIT_LIST_HEAD(&queue->rsp_wr_wait_list);
	spin_lock_init(&queue->rsp_wr_wait_lock);
	INIT_LIST_HEAD(&queue->free_rsps);
	spin_lock_init(&queue->rsps_lock);
	INIT_LIST_HEAD(&queue->queue_list);

	queue->idx = ida_simple_get(&nvmet_rdma_queue_ida, 0, 0, GFP_KERNEL);
	if (queue->idx < 0) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_destroy_sq;
	}

	ret = nvmet_rdma_alloc_rsps(queue);
	if (ret) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_ida_remove;
	}

	if (!ndev->srq) {
		queue->cmds = nvmet_rdma_alloc_cmds(ndev,
				queue->recv_queue_size,
				!queue->host_qid);
		if (IS_ERR(queue->cmds)) {
			ret = NVME_RDMA_CM_NO_RSC;
			goto out_free_responses;
		}
	}

	ret = nvmet_rdma_create_queue_ib(queue);
	if (ret) {
		pr_err("%s: creating RDMA queue failed (%d).\n",
			__func__, ret);
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_free_cmds;
	}

	return queue;

out_free_cmds:
	if (!ndev->srq) {
		nvmet_rdma_free_cmds(queue->dev, queue->cmds,
				queue->recv_queue_size,
				!queue->host_qid);
	}
out_free_responses:
	nvmet_rdma_free_rsps(queue);
out_ida_remove:
	ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
out_destroy_sq:
	nvmet_sq_destroy(&queue->nvme_sq);
out_free_queue:
	kfree(queue);
out_reject:
	nvmet_rdma_cm_reject(cm_id, ret);
	return NULL;
}

static void nvmet_rdma_qp_event(struct ib_event *event, void *priv)
{
	struct nvmet_rdma_queue *queue = priv;

	switch (event->event) {
	case IB_EVENT_COMM_EST:
		rdma_notify(queue->cm_id, event->event);
		break;
	default:
		pr_err("received IB QP event: %s (%d)\n",
		       ib_event_msg(event->event), event->event);
		break;
	}
}

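/*
 * Accept the connection, advertising our receive queue depth (crqsize)
 * back to the host and capping the initiator depth at what the device
 * supports for incoming RDMA READ/atomic operations.
 */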
static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id,
		struct nvmet_rdma_queue *queue,
		struct rdma_conn_param *p)
{
	struct rdma_conn_param  param = { };
	struct nvme_rdma_cm_rep priv = { };
	int ret = -ENOMEM;

	param.rnr_retry_count = 7;
	param.flow_control = 1;
	param.initiator_depth = min_t(u8, p->initiator_depth,
		queue->dev->device->attrs.max_qp_init_rd_atom);
	param.private_data = &priv;
	param.private_data_len = sizeof(priv);
	priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
	priv.crqsize = cpu_to_le16(queue->recv_queue_size);

	ret = rdma_accept(cm_id, &param);
	if (ret)
		pr_err("rdma_accept failed (error code = %d)\n", ret);

	return ret;
}

static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
		struct rdma_cm_event *event)
{
	struct nvmet_rdma_device *ndev;
	struct nvmet_rdma_queue *queue;
	int ret = -EINVAL;

	ndev = nvmet_rdma_find_get_device(cm_id);
	if (!ndev) {
		nvmet_rdma_cm_reject(cm_id, NVME_RDMA_CM_NO_RSC);
		return -ECONNREFUSED;
	}

	queue = nvmet_rdma_alloc_queue(ndev, cm_id, event);
	if (!queue) {
		ret = -ENOMEM;
		goto put_device;
	}
	queue->port = cm_id->context;

	if (queue->host_qid == 0) {
		/* Let inflight controller teardown complete */
		flush_scheduled_work();
	}

	ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn);
	if (ret) {
		schedule_work(&queue->release_work);
		/* Destroying rdma_cm id is not needed here */
		return 0;
	}

	mutex_lock(&nvmet_rdma_queue_mutex);
	list_add_tail(&queue->queue_list, &nvmet_rdma_queue_list);
	mutex_unlock(&nvmet_rdma_queue_mutex);

	return 0;

put_device:
	kref_put(&ndev->ref, nvmet_rdma_free_dev);

	return ret;
}

static void nvmet_rdma_queue_established(struct nvmet_rdma_queue *queue)
{
	unsigned long flags;

	spin_lock_irqsave(&queue->state_lock, flags);
	if (queue->state != NVMET_RDMA_Q_CONNECTING) {
		pr_warn("trying to establish a connected queue\n");
		goto out_unlock;
	}
	queue->state = NVMET_RDMA_Q_LIVE;

	while (!list_empty(&queue->rsp_wait_list)) {
		struct nvmet_rdma_rsp *cmd;

		cmd = list_first_entry(&queue->rsp_wait_list,
					struct nvmet_rdma_rsp, wait_list);
		list_del(&cmd->wait_list);

		spin_unlock_irqrestore(&queue->state_lock, flags);
		nvmet_rdma_handle_command(queue, cmd);
		spin_lock_irqsave(&queue->state_lock, flags);
	}

out_unlock:
	spin_unlock_irqrestore(&queue->state_lock, flags);
}

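/*
 * __nvmet_rdma_queue_disconnect() expects the queue to already be off
 * nvmet_rdma_queue_list (or never to have been on it); callers that
 * still hold a list entry go through nvmet_rdma_queue_disconnect(),
 * which removes it first.  The state machine guarantees that
 * rdma_disconnect() and the release work are only issued once.
 */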
static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
{
	bool disconnect = false;
	unsigned long flags;

	pr_debug("cm_id= %p queue->state= %d\n", queue->cm_id, queue->state);

	spin_lock_irqsave(&queue->state_lock, flags);
	switch (queue->state) {
	case NVMET_RDMA_Q_CONNECTING:
	case NVMET_RDMA_Q_LIVE:
		queue->state = NVMET_RDMA_Q_DISCONNECTING;
		disconnect = true;
		break;
	case NVMET_RDMA_Q_DISCONNECTING:
		break;
	}
	spin_unlock_irqrestore(&queue->state_lock, flags);

	if (disconnect) {
		rdma_disconnect(queue->cm_id);
		schedule_work(&queue->release_work);
	}
}

static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
{
	bool disconnect = false;

	mutex_lock(&nvmet_rdma_queue_mutex);
	if (!list_empty(&queue->queue_list)) {
		list_del_init(&queue->queue_list);
		disconnect = true;
	}
	mutex_unlock(&nvmet_rdma_queue_mutex);

	if (disconnect)
		__nvmet_rdma_queue_disconnect(queue);
}

static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id,
		struct nvmet_rdma_queue *queue)
{
	WARN_ON_ONCE(queue->state != NVMET_RDMA_Q_CONNECTING);

	mutex_lock(&nvmet_rdma_queue_mutex);
	if (!list_empty(&queue->queue_list))
		list_del_init(&queue->queue_list);
	mutex_unlock(&nvmet_rdma_queue_mutex);

	pr_err("failed to connect queue %d\n", queue->idx);
	schedule_work(&queue->release_work);
}

/**
 * nvmet_rdma_device_removal() - Handle RDMA device removal
 * @cm_id:	rdma_cm id, used for nvmet port
 * @queue:	nvmet rdma queue (cm id qp_context)
 *
 * DEVICE_REMOVAL event notifies us that the RDMA device is about
 * to unplug. Note that this event can be generated on a normal
 * queue cm_id and/or a device bound listener cm_id (where in this
 * case queue will be null).
 *
 * We registered an ib_client to handle device removal for queues,
 * so we only need to handle the listening port cm_ids. In this case
 * we nullify the priv to prevent double cm_id destruction and destroying
 * the cm_id implicitly by returning a non-zero rc to the callout.
 */
static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id,
		struct nvmet_rdma_queue *queue)
{
	struct nvmet_port *port;

	if (queue) {
		/*
		 * This is a queue cm_id. We have registered
		 * an ib_client to handle queue removal,
		 * so don't interfere and just return.
		 */
		return 0;
	}

	port = cm_id->context;

	/*
	 * This is a listener cm_id. Make sure that
	 * future remove_port won't invoke a double
	 * cm_id destroy. Use atomic xchg to make sure
	 * we don't compete with remove_port.
	 */
	if (xchg(&port->priv, NULL) != cm_id)
		return 0;

	/*
	 * We need to return 1 so that the core will destroy
	 * its own ID. What a great API design..
	 */
	return 1;
}

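/*
 * RDMA CM event dispatcher.  A non-zero return value tells the RDMA CM
 * core to destroy the cm_id itself, which is only used for
 * DEVICE_REMOVAL on a listener cm_id (see nvmet_rdma_device_removal()).
 */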
static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id,
		struct rdma_cm_event *event)
{
	struct nvmet_rdma_queue *queue = NULL;
	int ret = 0;

	if (cm_id->qp)
		queue = cm_id->qp->qp_context;

	pr_debug("%s (%d): status %d id %p\n",
		rdma_event_msg(event->event), event->event,
		event->status, cm_id);

	switch (event->event) {
	case RDMA_CM_EVENT_CONNECT_REQUEST:
		ret = nvmet_rdma_queue_connect(cm_id, event);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		nvmet_rdma_queue_established(queue);
		break;
	case RDMA_CM_EVENT_ADDR_CHANGE:
	case RDMA_CM_EVENT_DISCONNECTED:
	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
		nvmet_rdma_queue_disconnect(queue);
		break;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		ret = nvmet_rdma_device_removal(cm_id, queue);
		break;
	case RDMA_CM_EVENT_REJECTED:
		pr_debug("Connection rejected: %s\n",
			 rdma_reject_msg(cm_id, event->status));
		/* FALLTHROUGH */
	case RDMA_CM_EVENT_UNREACHABLE:
	case RDMA_CM_EVENT_CONNECT_ERROR:
		nvmet_rdma_queue_connect_fail(cm_id, queue);
		break;
	default:
		pr_err("received unrecognized RDMA CM event %d\n",
			event->event);
		break;
	}

	return ret;
}

static void nvmet_rdma_delete_ctrl(struct nvmet_ctrl *ctrl)
{
	struct nvmet_rdma_queue *queue;

restart:
	mutex_lock(&nvmet_rdma_queue_mutex);
	list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) {
		if (queue->nvme_sq.ctrl == ctrl) {
			list_del_init(&queue->queue_list);
			mutex_unlock(&nvmet_rdma_queue_mutex);

			__nvmet_rdma_queue_disconnect(queue);
			goto restart;
		}
	}
	mutex_unlock(&nvmet_rdma_queue_mutex);
}

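/*
 * Bring up a listener for a configured port: resolve the configured
 * traddr/trsvcid into a sockaddr, create an RDMA CM ID in the TCP port
 * space, then bind and listen.  The cm_id is stashed in port->priv so
 * remove_port (or device removal) can tear it down later.
 */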
static int nvmet_rdma_add_port(struct nvmet_port *port)
{
	struct rdma_cm_id *cm_id;
	struct sockaddr_storage addr = { };
	__kernel_sa_family_t af;
	int ret;

	switch (port->disc_addr.adrfam) {
	case NVMF_ADDR_FAMILY_IP4:
		af = AF_INET;
		break;
	case NVMF_ADDR_FAMILY_IP6:
		af = AF_INET6;
		break;
	default:
		pr_err("address family %d not supported\n",
				port->disc_addr.adrfam);
		return -EINVAL;
	}

	ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr,
			port->disc_addr.trsvcid, &addr);
	if (ret) {
		pr_err("malformed ip/port passed: %s:%s\n",
			port->disc_addr.traddr, port->disc_addr.trsvcid);
		return ret;
	}

	cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port,
			RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(cm_id)) {
		pr_err("CM ID creation failed\n");
		return PTR_ERR(cm_id);
	}

	/*
	 * Allow both IPv4 and IPv6 sockets to bind a single port
	 * at the same time.
	 */
	ret = rdma_set_afonly(cm_id, 1);
	if (ret) {
		pr_err("rdma_set_afonly failed (%d)\n", ret);
		goto out_destroy_id;
	}

	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr);
	if (ret) {
		pr_err("binding CM ID to %pISpcs failed (%d)\n",
			(struct sockaddr *)&addr, ret);
		goto out_destroy_id;
	}

	ret = rdma_listen(cm_id, 128);
	if (ret) {
		pr_err("listening to %pISpcs failed (%d)\n",
			(struct sockaddr *)&addr, ret);
		goto out_destroy_id;
	}

	pr_info("enabling port %d (%pISpcs)\n",
		le16_to_cpu(port->disc_addr.portid), (struct sockaddr *)&addr);
	port->priv = cm_id;
	return 0;

out_destroy_id:
	rdma_destroy_id(cm_id);
	return ret;
}

static void nvmet_rdma_remove_port(struct nvmet_port *port)
{
	struct rdma_cm_id *cm_id = xchg(&port->priv, NULL);

	if (cm_id)
		rdma_destroy_id(cm_id);
}

static void nvmet_rdma_disc_port_addr(struct nvmet_req *req,
		struct nvmet_port *port, char *traddr)
{
	struct rdma_cm_id *cm_id = port->priv;

	if (inet_addr_is_any((struct sockaddr *)&cm_id->route.addr.src_addr)) {
		struct nvmet_rdma_rsp *rsp =
			container_of(req, struct nvmet_rdma_rsp, req);
		struct rdma_cm_id *req_cm_id = rsp->queue->cm_id;
		struct sockaddr *addr = (void *)&req_cm_id->route.addr.src_addr;

		sprintf(traddr, "%pISc", addr);
	} else {
		memcpy(traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE);
	}
}

static const struct nvmet_fabrics_ops nvmet_rdma_ops = {
	.owner			= THIS_MODULE,
	.type			= NVMF_TRTYPE_RDMA,
	.sqe_inline_size	= NVMET_RDMA_INLINE_DATA_SIZE,
	.msdbd			= 1,
	.has_keyed_sgls		= 1,
	.add_port		= nvmet_rdma_add_port,
	.remove_port		= nvmet_rdma_remove_port,
	.queue_response		= nvmet_rdma_queue_response,
	.delete_ctrl		= nvmet_rdma_delete_ctrl,
	.disc_traddr		= nvmet_rdma_disc_port_addr,
};

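/*
 * ib_client removal callback: disconnect every queue that was created
 * on the departing device and wait for the scheduled release work to
 * finish before the device goes away.
 */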
static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data)
{
	struct nvmet_rdma_queue *queue, *tmp;
	struct nvmet_rdma_device *ndev;
	bool found = false;

	mutex_lock(&device_list_mutex);
	list_for_each_entry(ndev, &device_list, entry) {
		if (ndev->device == ib_device) {
			found = true;
			break;
		}
	}
	mutex_unlock(&device_list_mutex);

	if (!found)
		return;

	/*
	 * IB Device that is used by nvmet controllers is being removed,
	 * delete all queues using this device.
	 */
	mutex_lock(&nvmet_rdma_queue_mutex);
	list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list,
				 queue_list) {
		if (queue->dev->device != ib_device)
			continue;

		pr_info("Removing queue %d\n", queue->idx);
		list_del_init(&queue->queue_list);
		__nvmet_rdma_queue_disconnect(queue);
	}
	mutex_unlock(&nvmet_rdma_queue_mutex);

	flush_scheduled_work();
}

static struct ib_client nvmet_rdma_ib_client = {
	.name	= "nvmet_rdma",
	.remove = nvmet_rdma_remove_one
};

static int __init nvmet_rdma_init(void)
{
	int ret;

	ret = ib_register_client(&nvmet_rdma_ib_client);
	if (ret)
		return ret;

	ret = nvmet_register_transport(&nvmet_rdma_ops);
	if (ret)
		goto err_ib_client;

	return 0;

err_ib_client:
	ib_unregister_client(&nvmet_rdma_ib_client);
	return ret;
}

static void __exit nvmet_rdma_exit(void)
{
	nvmet_unregister_transport(&nvmet_rdma_ops);
	ib_unregister_client(&nvmet_rdma_ib_client);
	WARN_ON_ONCE(!list_empty(&nvmet_rdma_queue_list));
	ida_destroy(&nvmet_rdma_queue_ida);
}

module_init(nvmet_rdma_init);
module_exit(nvmet_rdma_exit);

MODULE_LICENSE("GPL v2");
MODULE_ALIAS("nvmet-transport-1"); /* 1 == NVMF_TRTYPE_RDMA */